From c799ba6eab7a1bf02792e2ced933be3ae5f8daa2 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 11 Dec 2015 15:31:10 +0100
Subject: cfg80211: remove CFG80211_REG_DEBUG

Instead of having this Kconfig option, which just *floods* the
kernel log,
 * remove the per-channel prints that are fairly useless anyway
 * convert the conditional printing to pr_debug()

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/Kconfig |  13 ------
 net/wireless/reg.c   | 122 ++++++++++++++-------------------------------------
 2 files changed, 34 insertions(+), 101 deletions(-)

(limited to 'net')

diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig
index da72ed32f143..ec3bf30dd526 100644
--- a/net/wireless/Kconfig
+++ b/net/wireless/Kconfig
@@ -61,19 +61,6 @@ config CFG80211_DEVELOPER_WARNINGS
 	  on it (or mac80211).
 
 
-config CFG80211_REG_DEBUG
-	bool "cfg80211 regulatory debugging"
-	depends on CFG80211
-	default n
-	---help---
-	  You can enable this if you want to debug regulatory changes.
-	  For more information on cfg80211 regulatory refer to the wireless
-	  wiki:
-
-	  http://wireless.kernel.org/en/developers/Regulatory
-
-	  If unsure, say N.
-
 config CFG80211_CERTIFICATION_ONUS
 	bool "cfg80211 certification onus"
 	depends on CFG80211 && EXPERT
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 3b0ce1c484a3..bc76b281ed3a 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -60,13 +60,6 @@
 #include "regdb.h"
 #include "nl80211.h"
 
-#ifdef CONFIG_CFG80211_REG_DEBUG
-#define REG_DBG_PRINT(format, args...)			\
-	printk(KERN_DEBUG pr_fmt(format), ##args)
-#else
-#define REG_DBG_PRINT(args...)
-#endif
-
 /*
  * Grace period we give before making sure all current interfaces reside on
  * channels allowed by the current regulatory domain.
@@ -178,12 +171,10 @@ enum nl80211_dfs_regions reg_get_dfs_region(struct wiphy *wiphy)
 	if (wiphy_regd->dfs_region == regd->dfs_region)
 		goto out;
 
-	REG_DBG_PRINT("%s: device specific dfs_region "
-		      "(%s) disagrees with cfg80211's "
-		      "central dfs_region (%s)\n",
-		      dev_name(&wiphy->dev),
-		      reg_dfs_region_str(wiphy_regd->dfs_region),
-		      reg_dfs_region_str(regd->dfs_region));
+	pr_debug("%s: device specific dfs_region (%s) disagrees with cfg80211's central dfs_region (%s)\n",
+		 dev_name(&wiphy->dev),
+		 reg_dfs_region_str(wiphy_regd->dfs_region),
+		 reg_dfs_region_str(regd->dfs_region));
 
 out:
 	return regd->dfs_region;
@@ -541,7 +532,7 @@ static DECLARE_DELAYED_WORK(crda_timeout, crda_timeout_work);
 
 static void crda_timeout_work(struct work_struct *work)
 {
-	REG_DBG_PRINT("Timeout while waiting for CRDA to reply, restoring regulatory settings\n");
+	pr_debug("Timeout while waiting for CRDA to reply, restoring regulatory settings\n");
 	rtnl_lock();
 	reg_crda_timeouts++;
 	restore_regulatory_settings(true);
@@ -583,7 +574,7 @@ static int call_crda(const char *alpha2)
 
 	if (!is_world_regdom((char *) alpha2))
 		pr_debug("Calling CRDA for country: %c%c\n",
-			alpha2[0], alpha2[1]);
+			 alpha2[0], alpha2[1]);
 	else
 		pr_debug("Calling CRDA to update world regulatory domain\n");
 
@@ -1130,42 +1121,6 @@ const char *reg_initiator_name(enum nl80211_reg_initiator initiator)
 }
 EXPORT_SYMBOL(reg_initiator_name);
 
-static void chan_reg_rule_print_dbg(const struct ieee80211_regdomain *regd,
-				    struct ieee80211_channel *chan,
-				    const struct ieee80211_reg_rule *reg_rule)
-{
-#ifdef CONFIG_CFG80211_REG_DEBUG
-	const struct ieee80211_power_rule *power_rule;
-	const struct ieee80211_freq_range *freq_range;
-	char max_antenna_gain[32], bw[32];
-
-	power_rule = &reg_rule->power_rule;
-	freq_range = &reg_rule->freq_range;
-
-	if (!power_rule->max_antenna_gain)
-		snprintf(max_antenna_gain, sizeof(max_antenna_gain), "N/A");
-	else
-		snprintf(max_antenna_gain, sizeof(max_antenna_gain), "%d mBi",
-			 power_rule->max_antenna_gain);
-
-	if (reg_rule->flags & NL80211_RRF_AUTO_BW)
-		snprintf(bw, sizeof(bw), "%d KHz, %d KHz AUTO",
-			 freq_range->max_bandwidth_khz,
-			 reg_get_max_bandwidth(regd, reg_rule));
-	else
-		snprintf(bw, sizeof(bw), "%d KHz",
-			 freq_range->max_bandwidth_khz);
-
-	REG_DBG_PRINT("Updating information on frequency %d MHz with regulatory rule:\n",
-		      chan->center_freq);
-
-	REG_DBG_PRINT("(%d KHz - %d KHz @ %s), (%s, %d mBm)\n",
-		      freq_range->start_freq_khz, freq_range->end_freq_khz,
-		      bw, max_antenna_gain,
-		      power_rule->max_eirp);
-#endif
-}
-
 static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd,
 					  const struct ieee80211_reg_rule *reg_rule,
 					  const struct ieee80211_channel *chan)
@@ -1240,20 +1195,19 @@ static void handle_channel(struct wiphy *wiphy,
 		if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER &&
 		    request_wiphy && request_wiphy == wiphy &&
 		    request_wiphy->regulatory_flags & REGULATORY_STRICT_REG) {
-			REG_DBG_PRINT("Disabling freq %d MHz for good\n",
-				      chan->center_freq);
+			pr_debug("Disabling freq %d MHz for good\n",
+				 chan->center_freq);
 			chan->orig_flags |= IEEE80211_CHAN_DISABLED;
 			chan->flags = chan->orig_flags;
 		} else {
-			REG_DBG_PRINT("Disabling freq %d MHz\n",
-				      chan->center_freq);
+			pr_debug("Disabling freq %d MHz\n",
+				 chan->center_freq);
 			chan->flags |= IEEE80211_CHAN_DISABLED;
 		}
 		return;
 	}
 
 	regd = reg_get_regdomain(wiphy);
-	chan_reg_rule_print_dbg(regd, chan, reg_rule);
 
 	power_rule = &reg_rule->power_rule;
 	bw_flags = reg_rule_to_chan_bw_flags(regd, reg_rule, chan);
@@ -1391,18 +1345,15 @@ static bool ignore_reg_update(struct wiphy *wiphy,
 		return true;
 
 	if (!lr) {
-		REG_DBG_PRINT("Ignoring regulatory request set by %s "
-			      "since last_request is not set\n",
-			      reg_initiator_name(initiator));
+		pr_debug("Ignoring regulatory request set by %s since last_request is not set\n",
+			 reg_initiator_name(initiator));
 		return true;
 	}
 
 	if (initiator == NL80211_REGDOM_SET_BY_CORE &&
 	    wiphy->regulatory_flags & REGULATORY_CUSTOM_REG) {
-		REG_DBG_PRINT("Ignoring regulatory request set by %s "
-			      "since the driver uses its own custom "
-			      "regulatory domain\n",
-			      reg_initiator_name(initiator));
+		pr_debug("Ignoring regulatory request set by %s since the driver uses its own custom regulatory domain\n",
+			 reg_initiator_name(initiator));
 		return true;
 	}
 
@@ -1413,10 +1364,8 @@ static bool ignore_reg_update(struct wiphy *wiphy,
 	if (wiphy_strict_alpha2_regd(wiphy) && !wiphy->regd &&
 	    initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE &&
 	    !is_world_regdom(lr->alpha2)) {
-		REG_DBG_PRINT("Ignoring regulatory request set by %s "
-			      "since the driver requires its own regulatory "
-			      "domain to be set first\n",
-			      reg_initiator_name(initiator));
+		pr_debug("Ignoring regulatory request set by %s since the driver requires its own regulatory domain to be set first\n",
+			 reg_initiator_name(initiator));
 		return true;
 	}
 
@@ -1697,7 +1646,7 @@ static void reg_check_chans_work(struct work_struct *work)
 {
 	struct cfg80211_registered_device *rdev;
 
-	REG_DBG_PRINT("Verifying active interfaces after reg change\n");
+	pr_debug("Verifying active interfaces after reg change\n");
 	rtnl_lock();
 
 	list_for_each_entry(rdev, &cfg80211_rdev_list, list)
@@ -1779,8 +1728,8 @@ static void handle_channel_custom(struct wiphy *wiphy,
 	}
 
 	if (IS_ERR(reg_rule)) {
-		REG_DBG_PRINT("Disabling freq %d MHz as custom regd has no rule that fits it\n",
-			      chan->center_freq);
+		pr_debug("Disabling freq %d MHz as custom regd has no rule that fits it\n",
+			 chan->center_freq);
 		if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) {
 			chan->flags |= IEEE80211_CHAN_DISABLED;
 		} else {
@@ -1790,8 +1739,6 @@ static void handle_channel_custom(struct wiphy *wiphy,
 		return;
 	}
 
-	chan_reg_rule_print_dbg(regd, chan, reg_rule);
-
 	power_rule = &reg_rule->power_rule;
 	bw_flags = reg_rule_to_chan_bw_flags(regd, reg_rule, chan);
 
@@ -2522,7 +2469,7 @@ static void restore_alpha2(char *alpha2, bool reset_user)
 	if (is_user_regdom_saved()) {
 		/* Unless we're asked to ignore it and reset it */
 		if (reset_user) {
-			REG_DBG_PRINT("Restoring regulatory settings including user preference\n");
+			pr_debug("Restoring regulatory settings including user preference\n");
 			user_alpha2[0] = '9';
 			user_alpha2[1] = '7';
 
@@ -2532,24 +2479,24 @@ static void restore_alpha2(char *alpha2, bool reset_user)
 			 * back as they were for a full restore.
 			 */
 			if (!is_world_regdom(ieee80211_regdom)) {
-				REG_DBG_PRINT("Keeping preference on module parameter ieee80211_regdom: %c%c\n",
-					      ieee80211_regdom[0], ieee80211_regdom[1]);
+				pr_debug("Keeping preference on module parameter ieee80211_regdom: %c%c\n",
+					 ieee80211_regdom[0], ieee80211_regdom[1]);
 				alpha2[0] = ieee80211_regdom[0];
 				alpha2[1] = ieee80211_regdom[1];
 			}
 		} else {
-			REG_DBG_PRINT("Restoring regulatory settings while preserving user preference for: %c%c\n",
-				      user_alpha2[0], user_alpha2[1]);
+			pr_debug("Restoring regulatory settings while preserving user preference for: %c%c\n",
+				 user_alpha2[0], user_alpha2[1]);
 			alpha2[0] = user_alpha2[0];
 			alpha2[1] = user_alpha2[1];
 		}
 	} else if (!is_world_regdom(ieee80211_regdom)) {
-		REG_DBG_PRINT("Keeping preference on module parameter ieee80211_regdom: %c%c\n",
-			      ieee80211_regdom[0], ieee80211_regdom[1]);
+		pr_debug("Keeping preference on module parameter ieee80211_regdom: %c%c\n",
+			 ieee80211_regdom[0], ieee80211_regdom[1]);
 		alpha2[0] = ieee80211_regdom[0];
 		alpha2[1] = ieee80211_regdom[1];
 	} else
-		REG_DBG_PRINT("Restoring regulatory settings\n");
+		pr_debug("Restoring regulatory settings\n");
 }
 
 static void restore_custom_reg_settings(struct wiphy *wiphy)
@@ -2661,14 +2608,14 @@ static void restore_regulatory_settings(bool reset_user)
 	list_splice_tail_init(&tmp_reg_req_list, &reg_requests_list);
 	spin_unlock(&reg_requests_lock);
 
-	REG_DBG_PRINT("Kicking the queue\n");
+	pr_debug("Kicking the queue\n");
 
 	schedule_work(&reg_work);
 }
 
 void regulatory_hint_disconnect(void)
 {
-	REG_DBG_PRINT("All devices are disconnected, going to restore regulatory settings\n");
+	pr_debug("All devices are disconnected, going to restore regulatory settings\n");
 	restore_regulatory_settings(false);
 }
 
@@ -2716,10 +2663,10 @@ int regulatory_hint_found_beacon(struct wiphy *wiphy,
 	if (!reg_beacon)
 		return -ENOMEM;
 
-	REG_DBG_PRINT("Found new beacon on frequency: %d MHz (Ch %d) on %s\n",
-		      beacon_chan->center_freq,
-		      ieee80211_frequency_to_channel(beacon_chan->center_freq),
-		      wiphy_name(wiphy));
+	pr_debug("Found new beacon on frequency: %d MHz (Ch %d) on %s\n",
+		 beacon_chan->center_freq,
+		 ieee80211_frequency_to_channel(beacon_chan->center_freq),
+		 wiphy_name(wiphy));
 
 	memcpy(&reg_beacon->chan, beacon_chan,
 	       sizeof(struct ieee80211_channel));
@@ -2798,8 +2745,7 @@ bool reg_supported_dfs_region(enum nl80211_dfs_regions dfs_region)
 	case NL80211_DFS_JP:
 		return true;
 	default:
-		REG_DBG_PRINT("Ignoring uknown DFS master region: %d\n",
-			      dfs_region);
+		pr_debug("Ignoring uknown DFS master region: %d\n", dfs_region);
 		return false;
 	}
 }
-- 
cgit v1.2.3-59-g8ed1b


From 23a1f8d44c0bca48f04fc2a2f1edafd826ce6133 Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Tue, 8 Dec 2015 16:04:31 +0200
Subject: mac80211: process and save VHT MU-MIMO group frame

The Group ID Management frame is an Action frame of
category VHT. It is transmitted by the AP to assign
or change the user position of a STA for one or more
group IDs.
Process and save the group membership data. Notify
underlying driver of changes.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h  |  7 +++++++
 include/net/mac80211.h     | 17 +++++++++++++++++
 net/mac80211/ieee80211_i.h |  2 ++
 net/mac80211/iface.c       | 10 ++++++++++
 net/mac80211/mlme.c        |  7 +++++++
 net/mac80211/rx.c          |  5 +++++
 net/mac80211/util.c        |  3 +++
 net/mac80211/vht.c         | 25 +++++++++++++++++++++++++
 8 files changed, 76 insertions(+)

(limited to 'net')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 452c0b0d2f32..d9ddb89533a7 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -843,6 +843,8 @@ enum ieee80211_vht_opmode_bits {
 };
 
 #define WLAN_SA_QUERY_TR_ID_LEN 2
+#define WLAN_MEMBERSHIP_LEN 8
+#define WLAN_USER_POSITION_LEN 16
 
 /**
  * struct ieee80211_tpc_report_ie
@@ -989,6 +991,11 @@ struct ieee80211_mgmt {
 					u8 action_code;
 					u8 operating_mode;
 				} __packed vht_opmode_notif;
+				struct {
+					u8 action_code;
+					u8 membership[WLAN_MEMBERSHIP_LEN];
+					u8 position[WLAN_USER_POSITION_LEN];
+				} __packed vht_group_notif;
 				struct {
 					u8 action_code;
 					u8 dialog_token;
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 7c30faff245f..8da483b2c067 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -298,6 +298,7 @@ struct ieee80211_vif_chanctx_switch {
  *	note that this is only called when it changes after the channel
  *	context had been assigned.
  * @BSS_CHANGED_OCB: OCB join status changed
+ * @BSS_CHANGED_MU_GROUPS: VHT MU-MIMO group id or user position changed
  */
 enum ieee80211_bss_change {
 	BSS_CHANGED_ASSOC		= 1<<0,
@@ -323,6 +324,7 @@ enum ieee80211_bss_change {
 	BSS_CHANGED_BEACON_INFO		= 1<<20,
 	BSS_CHANGED_BANDWIDTH		= 1<<21,
 	BSS_CHANGED_OCB                 = 1<<22,
+	BSS_CHANGED_MU_GROUPS		= 1<<23,
 
 	/* when adding here, make sure to change ieee80211_reconfig */
 };
@@ -435,6 +437,19 @@ struct ieee80211_event {
 	} u;
 };
 
+/**
+ * struct ieee80211_mu_group_data - STA's VHT MU-MIMO group data
+ *
+ * This structure describes the group id data of VHT MU-MIMO
+ *
+ * @membership: 64 bits array - a bit is set if station is member of the group
+ * @position: 2 bits per group id indicating the position in the group
+ */
+struct ieee80211_mu_group_data {
+	u8 membership[WLAN_MEMBERSHIP_LEN];
+	u8 position[WLAN_USER_POSITION_LEN];
+};
+
 /**
  * struct ieee80211_bss_conf - holds the BSS's changing parameters
  *
@@ -477,6 +492,7 @@ struct ieee80211_event {
  * @enable_beacon: whether beaconing should be enabled or not
  * @chandef: Channel definition for this BSS -- the hardware might be
  *	configured a higher bandwidth than this BSS uses, for example.
+ * @mu_group: VHT MU-MIMO group membership data
  * @ht_operation_mode: HT operation mode like in &struct ieee80211_ht_operation.
  *	This field is only valid when the channel is a wide HT/VHT channel.
  *	Note that with TDLS this can be the case (channel is HT, protection must
@@ -535,6 +551,7 @@ struct ieee80211_bss_conf {
 	s32 cqm_rssi_thold;
 	u32 cqm_rssi_hyst;
 	struct cfg80211_chan_def chandef;
+	struct ieee80211_mu_group_data mu_group;
 	__be32 arp_addr_list[IEEE80211_BSS_ARP_ADDR_LIST_LEN];
 	int arp_addr_cnt;
 	bool qos;
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index b84f6aa32c08..747402d8c7a9 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1714,6 +1714,8 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
 enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta);
 enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta);
 void ieee80211_sta_set_rx_nss(struct sta_info *sta);
+void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata,
+				 struct ieee80211_mgmt *mgmt);
 u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
                                   struct sta_info *sta, u8 opmode,
 				  enum ieee80211_band band);
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index c9e325d2e120..33ae3c81bfc5 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1271,6 +1271,16 @@ static void ieee80211_iface_work(struct work_struct *work)
 				}
 			}
 			mutex_unlock(&local->sta_mtx);
+		} else if (ieee80211_is_action(mgmt->frame_control) &&
+			   mgmt->u.action.category == WLAN_CATEGORY_VHT) {
+			switch (mgmt->u.action.u.vht_group_notif.action_code) {
+			case WLAN_VHT_ACTION_GROUPID_MGMT:
+				ieee80211_process_mu_groups(sdata, mgmt);
+				break;
+			default:
+				WARN_ON(1);
+				break;
+			}
 		} else if (ieee80211_is_data_qos(mgmt->frame_control)) {
 			struct ieee80211_hdr *hdr = (void *)mgmt;
 			/*
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 1c342e2592c4..31d5881b31fa 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -2079,6 +2079,13 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 	memset(&ifmgd->ht_capa_mask, 0, sizeof(ifmgd->ht_capa_mask));
 	memset(&ifmgd->vht_capa, 0, sizeof(ifmgd->vht_capa));
 	memset(&ifmgd->vht_capa_mask, 0, sizeof(ifmgd->vht_capa_mask));
+
+	/* reset MU-MIMO ownership and group data */
+	memset(sdata->vif.bss_conf.mu_group.membership, 0,
+	       sizeof(sdata->vif.bss_conf.mu_group.membership));
+	memset(sdata->vif.bss_conf.mu_group.position, 0,
+	       sizeof(sdata->vif.bss_conf.mu_group.position));
+	changed |= BSS_CHANGED_MU_GROUPS;
 	sdata->flags &= ~IEEE80211_SDATA_MU_MIMO_OWNER;
 
 	sdata->ap_power_level = IEEE80211_UNSET_POWER_LEVEL;
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index bc081850ac0e..a5668b54015f 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -2738,6 +2738,11 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
 						    opmode, status->band);
 			goto handled;
 		}
+		case WLAN_VHT_ACTION_GROUPID_MGMT: {
+			if (len < IEEE80211_MIN_ACTION_SIZE + 25)
+				goto invalid;
+			goto queue;
+		}
 		default:
 			break;
 		}
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 3943d4bf289c..f4b2c04e7d81 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -1928,6 +1928,9 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 			  BSS_CHANGED_IDLE |
 			  BSS_CHANGED_TXPOWER;
 
+		if (sdata->flags & IEEE80211_SDATA_MU_MIMO_OWNER)
+			changed |= BSS_CHANGED_MU_GROUPS;
+
 		switch (sdata->vif.type) {
 		case NL80211_IFTYPE_STATION:
 			changed |= BSS_CHANGED_ASSOC |
diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index c38b2f07a919..050de08bf82e 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -1,6 +1,9 @@
 /*
  * VHT handling
  *
+ * Portions of this file
+ * Copyright(c) 2015 Intel Deutschland GmbH
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
@@ -425,6 +428,28 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
 	return changed;
 }
 
+void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata,
+				 struct ieee80211_mgmt *mgmt)
+{
+	struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf;
+
+	if (!(sdata->flags & IEEE80211_SDATA_MU_MIMO_OWNER))
+		return;
+
+	if (!memcmp(mgmt->u.action.u.vht_group_notif.position,
+		    bss_conf->mu_group.position, WLAN_USER_POSITION_LEN) &&
+	    !memcmp(mgmt->u.action.u.vht_group_notif.membership,
+		    bss_conf->mu_group.membership, WLAN_MEMBERSHIP_LEN))
+		return;
+
+	memcpy(mgmt->u.action.u.vht_group_notif.membership,
+	       bss_conf->mu_group.membership, WLAN_MEMBERSHIP_LEN);
+	memcpy(mgmt->u.action.u.vht_group_notif.position,
+	       bss_conf->mu_group.position, WLAN_USER_POSITION_LEN);
+
+	ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_MU_GROUPS);
+}
+
 void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
 				 struct sta_info *sta, u8 opmode,
 				 enum ieee80211_band band)
-- 
cgit v1.2.3-59-g8ed1b


From f9cfa5f354b11e56cd8f019c12e14a42585586cd Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Tue, 8 Dec 2015 16:04:33 +0200
Subject: mac80211: add flag for duplication check

Add an option for driver to check for packet duplication
by itself.
This is needed for example by the iwlwifi driver which
parallelizes the RX path and does the duplication check
per queue.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 2 +-
 net/mac80211/rx.c      | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 8da483b2c067..ecab934dc8d9 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1063,7 +1063,7 @@ enum mac80211_rx_flags {
 	RX_FLAG_HT_GF			= BIT(13),
 	RX_FLAG_AMPDU_DETAILS		= BIT(14),
 	RX_FLAG_PN_VALIDATED		= BIT(15),
-	/* bit 16 free */
+	RX_FLAG_DUP_VALIDATED		= BIT(16),
 	RX_FLAG_AMPDU_LAST_KNOWN	= BIT(17),
 	RX_FLAG_AMPDU_IS_LAST		= BIT(18),
 	RX_FLAG_AMPDU_DELIM_CRC_ERROR	= BIT(19),
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index a5668b54015f..fe675d76f29c 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1099,6 +1099,9 @@ ieee80211_rx_h_check_dup(struct ieee80211_rx_data *rx)
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
 	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
 
+	if (status->flag & RX_FLAG_DUP_VALIDATED)
+		return RX_CONTINUE;
+
 	/*
 	 * Drop duplicate 802.11 retransmissions
 	 * (IEEE 802.11-2012: 9.3.2.10 "Duplicate detection and recovery")
-- 
cgit v1.2.3-59-g8ed1b


From fad471860c097844432c7cf5d3ae6a0a059c2bdc Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Tue, 8 Dec 2015 16:04:34 +0200
Subject: mac80211: pass RX aggregation window size to driver

Currently mac80211 does not inform the driver of the window
size when starting an RX aggregation session.
To enable managing the reorder buffer in the driver or hardware
the window size is needed.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 8 +++++---
 net/mac80211/agg-rx.c  | 2 +-
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index ecab934dc8d9..a990338a766e 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -3047,9 +3047,11 @@ enum ieee80211_reconfig_type {
  * 	ieee80211_ampdu_mlme_action. Starting sequence number (@ssn)
  * 	is the first frame we expect to perform the action on. Notice
  * 	that TX/RX_STOP can pass NULL for this parameter.
- *	The @buf_size parameter is only valid when the action is set to
- *	%IEEE80211_AMPDU_TX_OPERATIONAL and indicates the peer's reorder
- *	buffer size (number of subframes) for this session -- the driver
+ *	The @buf_size parameter is valid only when the action is set to
+ *	%IEEE80211_AMPDU_RX_START or %IEEE80211_AMPDU_TX_OPERATIONAL and
+ *	indicates the reorder buffer size (number of subframes) for this
+ *	session.
+ *	When the action is set to %IEEE80211_AMPDU_TX_OPERATIONAL the driver
  *	may neither send aggregates containing more subframes than this
  *	nor send aggregates in a way that lost frames would exceed the
  *	buffer size. If just limiting the aggregate size, this would be
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index 10ad4ac1fa0b..78672737fe3e 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -323,7 +323,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
 		__skb_queue_head_init(&tid_agg_rx->reorder_buf[i]);
 
 	ret = drv_ampdu_action(local, sta->sdata, IEEE80211_AMPDU_RX_START,
-			       &sta->sta, tid, &start_seq_num, 0, false);
+			       &sta->sta, tid, &start_seq_num, buf_size, false);
 	ht_dbg(sta->sdata, "Rx A-MPDU request on %pM tid %d result %d\n",
 	       sta->sta.addr, tid, ret);
 	if (ret) {
-- 
cgit v1.2.3-59-g8ed1b


From a85a7e28f45f7217b9a2efc3ba323de5c0e5b056 Mon Sep 17 00:00:00 2001
From: Geliang Tang <geliangtang@163.com>
Date: Fri, 1 Jan 2016 23:48:52 +0800
Subject: cfg80211/mac80211: use to_delayed_work

Use to_delayed_work() instead of open-coding it.

Signed-off-by: Geliang Tang <geliangtang@163.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/mlme.c | 3 +--
 net/wireless/mlme.c | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 31d5881b31fa..4af9b2bcc020 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1638,8 +1638,7 @@ void ieee80211_dynamic_ps_timer(unsigned long data)
 
 void ieee80211_dfs_cac_timer_work(struct work_struct *work)
 {
-	struct delayed_work *delayed_work =
-		container_of(work, struct delayed_work, work);
+	struct delayed_work *delayed_work = to_delayed_work(work);
 	struct ieee80211_sub_if_data *sdata =
 		container_of(delayed_work, struct ieee80211_sub_if_data,
 			     dfs_cac_timer_work);
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index fb44fa3bf4ef..ff328250bc44 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -711,7 +711,7 @@ EXPORT_SYMBOL(cfg80211_rx_mgmt);
 
 void cfg80211_dfs_channels_update_work(struct work_struct *work)
 {
-	struct delayed_work *delayed_work;
+	struct delayed_work *delayed_work = to_delayed_work(work);
 	struct cfg80211_registered_device *rdev;
 	struct cfg80211_chan_def chandef;
 	struct ieee80211_supported_band *sband;
@@ -721,7 +721,6 @@ void cfg80211_dfs_channels_update_work(struct work_struct *work)
 	unsigned long timeout, next_time = 0;
 	int bandid, i;
 
-	delayed_work = container_of(work, struct delayed_work, work);
 	rdev = container_of(delayed_work, struct cfg80211_registered_device,
 			    dfs_update_channels_wk);
 	wiphy = &rdev->wiphy;
-- 
cgit v1.2.3-59-g8ed1b


From 50ea05efaf3bed7dd34bcc2635a8b3f53bd0ccc1 Mon Sep 17 00:00:00 2001
From: Sara Sharon <sarasharon1@gmail.com>
Date: Wed, 30 Dec 2015 16:06:04 +0200
Subject: mac80211: pass block ack session timeout to to driver

Currently mac80211 does not inform the driver of the session
block ack timeout when starting a rx aggregation session.
Drivers that manage the reorder buffer need to know this
parameter.
Seeing that there are now too many arguments for the
drv_ampdu_action() function, wrap them inside a structure.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath10k/mac.c              |  7 +--
 drivers/net/wireless/ath/ath9k/htc_drv_main.c      |  7 +--
 drivers/net/wireless/ath/ath9k/main.c              |  8 ++--
 drivers/net/wireless/ath/carl9170/main.c           |  8 ++--
 drivers/net/wireless/ath/wcn36xx/main.c            |  8 ++--
 .../broadcom/brcm80211/brcmsmac/mac80211_if.c      |  8 ++--
 drivers/net/wireless/intel/iwlegacy/4965-mac.c     |  8 ++--
 drivers/net/wireless/intel/iwlegacy/4965.h         |  4 +-
 drivers/net/wireless/intel/iwlwifi/dvm/mac80211.c  |  9 ++--
 drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c  |  9 ++--
 drivers/net/wireless/mac80211_hwsim.c              |  8 ++--
 drivers/net/wireless/marvell/mwl8k.c               | 10 ++--
 drivers/net/wireless/mediatek/mt7601u/main.c       |  8 ++--
 drivers/net/wireless/ralink/rt2x00/rt2800lib.c     |  7 +--
 drivers/net/wireless/ralink/rt2x00/rt2800lib.h     |  4 +-
 drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu.c   |  6 +--
 drivers/net/wireless/realtek/rtlwifi/core.c        |  8 ++--
 drivers/net/wireless/rsi/rsi_91x_mac80211.c        | 19 +++-----
 drivers/net/wireless/st/cw1200/sta.c               |  4 +-
 drivers/net/wireless/st/cw1200/sta.h               |  4 +-
 drivers/net/wireless/ti/wlcore/main.c              |  8 ++--
 include/net/mac80211.h                             | 44 ++++++++++++------
 net/mac80211/agg-rx.c                              | 25 ++++++++--
 net/mac80211/agg-tx.c                              | 53 ++++++++++++++--------
 net/mac80211/driver-ops.c                          | 10 ++--
 net/mac80211/driver-ops.h                          |  4 +-
 net/mac80211/trace.h                               | 43 ++++++++++--------
 27 files changed, 202 insertions(+), 139 deletions(-)

(limited to 'net')

diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c
index 6146a293601a..368de5e5a04f 100644
--- a/drivers/net/wireless/ath/ath10k/mac.c
+++ b/drivers/net/wireless/ath/ath10k/mac.c
@@ -6366,12 +6366,13 @@ static u64 ath10k_get_tsf(struct ieee80211_hw *hw, struct ieee80211_vif *vif)
 
 static int ath10k_ampdu_action(struct ieee80211_hw *hw,
 			       struct ieee80211_vif *vif,
-			       enum ieee80211_ampdu_mlme_action action,
-			       struct ieee80211_sta *sta, u16 tid, u16 *ssn,
-			       u8 buf_size, bool amsdu)
+			       struct ieee80211_ampdu_params *params)
 {
 	struct ath10k *ar = hw->priv;
 	struct ath10k_vif *arvif = ath10k_vif_to_arvif(vif);
+	struct ieee80211_sta *sta = params->sta;
+	enum ieee80211_ampdu_mlme_action action = params->action;
+	u16 tid = params->tid;
 
 	ath10k_dbg(ar, ATH10K_DBG_MAC, "mac ampdu vdev_id %i sta %pM tid %hu action %d\n",
 		   arvif->vdev_id, sta->addr, tid, action);
diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_main.c b/drivers/net/wireless/ath/ath9k/htc_drv_main.c
index fe1fd1a5ae15..639294a9e34d 100644
--- a/drivers/net/wireless/ath/ath9k/htc_drv_main.c
+++ b/drivers/net/wireless/ath/ath9k/htc_drv_main.c
@@ -1657,13 +1657,14 @@ static void ath9k_htc_reset_tsf(struct ieee80211_hw *hw,
 
 static int ath9k_htc_ampdu_action(struct ieee80211_hw *hw,
 				  struct ieee80211_vif *vif,
-				  enum ieee80211_ampdu_mlme_action action,
-				  struct ieee80211_sta *sta,
-				  u16 tid, u16 *ssn, u8 buf_size, bool amsdu)
+				  struct ieee80211_ampdu_params *params)
 {
 	struct ath9k_htc_priv *priv = hw->priv;
 	struct ath9k_htc_sta *ista;
 	int ret = 0;
+	struct ieee80211_sta *sta = params->sta;
+	enum ieee80211_ampdu_mlme_action action = params->action;
+	u16 tid = params->tid;
 
 	mutex_lock(&priv->mutex);
 	ath9k_htc_ps_wakeup(priv);
diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c
index c1b33fdcca08..cf58a304e9f0 100644
--- a/drivers/net/wireless/ath/ath9k/main.c
+++ b/drivers/net/wireless/ath/ath9k/main.c
@@ -1864,14 +1864,16 @@ static void ath9k_reset_tsf(struct ieee80211_hw *hw, struct ieee80211_vif *vif)
 
 static int ath9k_ampdu_action(struct ieee80211_hw *hw,
 			      struct ieee80211_vif *vif,
-			      enum ieee80211_ampdu_mlme_action action,
-			      struct ieee80211_sta *sta,
-			      u16 tid, u16 *ssn, u8 buf_size, bool amsdu)
+			      struct ieee80211_ampdu_params *params)
 {
 	struct ath_softc *sc = hw->priv;
 	struct ath_common *common = ath9k_hw_common(sc->sc_ah);
 	bool flush = false;
 	int ret = 0;
+	struct ieee80211_sta *sta = params->sta;
+	enum ieee80211_ampdu_mlme_action action = params->action;
+	u16 tid = params->tid;
+	u16 *ssn = &params->ssn;
 
 	mutex_lock(&sc->mutex);
 
diff --git a/drivers/net/wireless/ath/carl9170/main.c b/drivers/net/wireless/ath/carl9170/main.c
index 19d3d64416bf..4d1527a2e292 100644
--- a/drivers/net/wireless/ath/carl9170/main.c
+++ b/drivers/net/wireless/ath/carl9170/main.c
@@ -1413,10 +1413,12 @@ static void carl9170_ampdu_work(struct work_struct *work)
 
 static int carl9170_op_ampdu_action(struct ieee80211_hw *hw,
 				    struct ieee80211_vif *vif,
-				    enum ieee80211_ampdu_mlme_action action,
-				    struct ieee80211_sta *sta,
-				    u16 tid, u16 *ssn, u8 buf_size, bool amsdu)
+				    struct ieee80211_ampdu_params *params)
 {
+	struct ieee80211_sta *sta = params->sta;
+	enum ieee80211_ampdu_mlme_action action = params->action;
+	u16 tid = params->tid;
+	u16 *ssn = &params->ssn;
 	struct ar9170 *ar = hw->priv;
 	struct carl9170_sta_info *sta_info = (void *) sta->drv_priv;
 	struct carl9170_sta_tid *tid_info;
diff --git a/drivers/net/wireless/ath/wcn36xx/main.c b/drivers/net/wireless/ath/wcn36xx/main.c
index 7c169abdbafe..a27279c2c695 100644
--- a/drivers/net/wireless/ath/wcn36xx/main.c
+++ b/drivers/net/wireless/ath/wcn36xx/main.c
@@ -857,12 +857,14 @@ static int wcn36xx_resume(struct ieee80211_hw *hw)
 
 static int wcn36xx_ampdu_action(struct ieee80211_hw *hw,
 		    struct ieee80211_vif *vif,
-		    enum ieee80211_ampdu_mlme_action action,
-		    struct ieee80211_sta *sta, u16 tid, u16 *ssn,
-		    u8 buf_size, bool amsdu)
+		    struct ieee80211_ampdu_params *params)
 {
 	struct wcn36xx *wcn = hw->priv;
 	struct wcn36xx_sta *sta_priv = NULL;
+	struct ieee80211_sta *sta = params->sta;
+	enum ieee80211_ampdu_mlme_action action = params->action;
+	u16 tid = params->tid;
+	u16 *ssn = &params->ssn;
 
 	wcn36xx_dbg(WCN36XX_DBG_MAC, "mac ampdu action action %d tid %d\n",
 		    action, tid);
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c
index bec2dc1ca2e4..61ae2768132a 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c
@@ -818,13 +818,15 @@ brcms_ops_sta_add(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 static int
 brcms_ops_ampdu_action(struct ieee80211_hw *hw,
 		    struct ieee80211_vif *vif,
-		    enum ieee80211_ampdu_mlme_action action,
-		    struct ieee80211_sta *sta, u16 tid, u16 *ssn,
-		    u8 buf_size, bool amsdu)
+		    struct ieee80211_ampdu_params *params)
 {
 	struct brcms_info *wl = hw->priv;
 	struct scb *scb = &wl->wlc->pri_scb;
 	int status;
+	struct ieee80211_sta *sta = params->sta;
+	enum ieee80211_ampdu_mlme_action action = params->action;
+	u16 tid = params->tid;
+	u8 buf_size = params->buf_size;
 
 	if (WARN_ON(scb->magic != SCB_MAGIC))
 		return -EIDRM;
diff --git a/drivers/net/wireless/intel/iwlegacy/4965-mac.c b/drivers/net/wireless/intel/iwlegacy/4965-mac.c
index fd38aa0763e4..b75f4ef3cdc7 100644
--- a/drivers/net/wireless/intel/iwlegacy/4965-mac.c
+++ b/drivers/net/wireless/intel/iwlegacy/4965-mac.c
@@ -5982,12 +5982,14 @@ il4965_mac_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd,
 
 int
 il4965_mac_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-			enum ieee80211_ampdu_mlme_action action,
-			struct ieee80211_sta *sta, u16 tid, u16 * ssn,
-			u8 buf_size, bool amsdu)
+			struct ieee80211_ampdu_params *params)
 {
 	struct il_priv *il = hw->priv;
 	int ret = -EINVAL;
+	struct ieee80211_sta *sta = params->sta;
+	enum ieee80211_ampdu_mlme_action action = params->action;
+	u16 tid = params->tid;
+	u16 *ssn = &params->ssn;
 
 	D_HT("A-MPDU action on addr %pM tid %d\n", sta->addr, tid);
 
diff --git a/drivers/net/wireless/intel/iwlegacy/4965.h b/drivers/net/wireless/intel/iwlegacy/4965.h
index 8ab8706f9422..e432715e02d8 100644
--- a/drivers/net/wireless/intel/iwlegacy/4965.h
+++ b/drivers/net/wireless/intel/iwlegacy/4965.h
@@ -182,9 +182,7 @@ void il4965_mac_update_tkip_key(struct ieee80211_hw *hw,
 				struct ieee80211_sta *sta, u32 iv32,
 				u16 *phase1key);
 int il4965_mac_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-			    enum ieee80211_ampdu_mlme_action action,
-			    struct ieee80211_sta *sta, u16 tid, u16 * ssn,
-			    u8 buf_size, bool amsdu);
+			    struct ieee80211_ampdu_params *params);
 int il4965_mac_sta_add(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 		       struct ieee80211_sta *sta);
 void
diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/dvm/mac80211.c
index 29ea1c6705b4..151721e4040c 100644
--- a/drivers/net/wireless/intel/iwlwifi/dvm/mac80211.c
+++ b/drivers/net/wireless/intel/iwlwifi/dvm/mac80211.c
@@ -732,12 +732,15 @@ static inline bool iwl_enable_tx_ampdu(const struct iwl_cfg *cfg)
 
 static int iwlagn_mac_ampdu_action(struct ieee80211_hw *hw,
 				   struct ieee80211_vif *vif,
-				   enum ieee80211_ampdu_mlme_action action,
-				   struct ieee80211_sta *sta, u16 tid, u16 *ssn,
-				   u8 buf_size, bool amsdu)
+				   struct ieee80211_ampdu_params *params)
 {
 	struct iwl_priv *priv = IWL_MAC80211_GET_DVM(hw);
 	int ret = -EINVAL;
+	struct ieee80211_sta *sta = params->sta;
+	enum ieee80211_ampdu_mlme_action action = params->action;
+	u16 tid = params->tid;
+	u16 *ssn = &params->ssn;
+	u8 buf_size = params->buf_size;
 	struct iwl_station_priv *sta_priv = (void *) sta->drv_priv;
 
 	IWL_DEBUG_HT(priv, "A-MPDU action on addr %pM tid %d\n",
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
index d70a1716f3e0..1bd3f0b700d3 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
@@ -837,13 +837,16 @@ iwl_mvm_ampdu_check_trigger(struct iwl_mvm *mvm, struct ieee80211_vif *vif,
 
 static int iwl_mvm_mac_ampdu_action(struct ieee80211_hw *hw,
 				    struct ieee80211_vif *vif,
-				    enum ieee80211_ampdu_mlme_action action,
-				    struct ieee80211_sta *sta, u16 tid,
-				    u16 *ssn, u8 buf_size, bool amsdu)
+				    struct ieee80211_ampdu_params *params)
 {
 	struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw);
 	int ret;
 	bool tx_agg_ref = false;
+	struct ieee80211_sta *sta = params->sta;
+	enum ieee80211_ampdu_mlme_action action = params->action;
+	u16 tid = params->tid;
+	u16 *ssn = &params->ssn;
+	u8 buf_size = params->buf_size;
 
 	IWL_DEBUG_HT(mvm, "A-MPDU action on addr %pM tid %d: action %d\n",
 		     sta->addr, tid, action);
diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index c32889a1e39c..e31a94fd6135 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -1845,10 +1845,12 @@ static int mac80211_hwsim_testmode_cmd(struct ieee80211_hw *hw,
 
 static int mac80211_hwsim_ampdu_action(struct ieee80211_hw *hw,
 				       struct ieee80211_vif *vif,
-				       enum ieee80211_ampdu_mlme_action action,
-				       struct ieee80211_sta *sta, u16 tid, u16 *ssn,
-				       u8 buf_size, bool amsdu)
+				       struct ieee80211_ampdu_params *params)
 {
+	struct ieee80211_sta *sta = params->sta;
+	enum ieee80211_ampdu_mlme_action action = params->action;
+	u16 tid = params->tid;
+
 	switch (action) {
 	case IEEE80211_AMPDU_TX_START:
 		ieee80211_start_tx_ba_cb_irqsafe(vif, sta->addr, tid);
diff --git a/drivers/net/wireless/marvell/mwl8k.c b/drivers/net/wireless/marvell/mwl8k.c
index 30e3aaae32e2..088429d0a634 100644
--- a/drivers/net/wireless/marvell/mwl8k.c
+++ b/drivers/net/wireless/marvell/mwl8k.c
@@ -5421,11 +5421,13 @@ static int mwl8k_get_survey(struct ieee80211_hw *hw, int idx,
 
 static int
 mwl8k_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-		   enum ieee80211_ampdu_mlme_action action,
-		   struct ieee80211_sta *sta, u16 tid, u16 *ssn,
-		   u8 buf_size, bool amsdu)
+		   struct ieee80211_ampdu_params *params)
 {
-
+	struct ieee80211_sta *sta = params->sta;
+	enum ieee80211_ampdu_mlme_action action = params->action;
+	u16 tid = params->tid;
+	u16 *ssn = &params->ssn;
+	u8 buf_size = params->buf_size;
 	int i, rc = 0;
 	struct mwl8k_priv *priv = hw->priv;
 	struct mwl8k_ampdu_stream *stream;
diff --git a/drivers/net/wireless/mediatek/mt7601u/main.c b/drivers/net/wireless/mediatek/mt7601u/main.c
index f715eee39851..e70dd9523911 100644
--- a/drivers/net/wireless/mediatek/mt7601u/main.c
+++ b/drivers/net/wireless/mediatek/mt7601u/main.c
@@ -334,11 +334,13 @@ static int mt7601u_set_rts_threshold(struct ieee80211_hw *hw, u32 value)
 
 static int
 mt76_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-		  enum ieee80211_ampdu_mlme_action action,
-		  struct ieee80211_sta *sta, u16 tid, u16 *ssn, u8 buf_size,
-		  bool amsdu)
+		  struct ieee80211_ampdu_params *params)
 {
 	struct mt7601u_dev *dev = hw->priv;
+	struct ieee80211_sta *sta = params->sta;
+	enum ieee80211_ampdu_mlme_action action = params->action;
+	u16 tid = params->tid;
+	u16 *ssn = &params->ssn;
 	struct mt76_sta *msta = (struct mt76_sta *) sta->drv_priv;
 
 	WARN_ON(msta->wcid.idx > GROUP_WCID(0));
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800lib.c b/drivers/net/wireless/ralink/rt2x00/rt2800lib.c
index 9733b31a780d..69c1c09687a3 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2800lib.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2800lib.c
@@ -7935,10 +7935,11 @@ u64 rt2800_get_tsf(struct ieee80211_hw *hw, struct ieee80211_vif *vif)
 EXPORT_SYMBOL_GPL(rt2800_get_tsf);
 
 int rt2800_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-			enum ieee80211_ampdu_mlme_action action,
-			struct ieee80211_sta *sta, u16 tid, u16 *ssn,
-			u8 buf_size, bool amsdu)
+			struct ieee80211_ampdu_params *params)
 {
+	struct ieee80211_sta *sta = params->sta;
+	enum ieee80211_ampdu_mlme_action action = params->action;
+	u16 tid = params->tid;
 	struct rt2x00_sta *sta_priv = (struct rt2x00_sta *)sta->drv_priv;
 	int ret = 0;
 
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800lib.h b/drivers/net/wireless/ralink/rt2x00/rt2800lib.h
index 440790b92b19..83f1a44fb9b4 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2800lib.h
+++ b/drivers/net/wireless/ralink/rt2x00/rt2800lib.h
@@ -218,9 +218,7 @@ int rt2800_conf_tx(struct ieee80211_hw *hw,
 		   const struct ieee80211_tx_queue_params *params);
 u64 rt2800_get_tsf(struct ieee80211_hw *hw, struct ieee80211_vif *vif);
 int rt2800_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-			enum ieee80211_ampdu_mlme_action action,
-			struct ieee80211_sta *sta, u16 tid, u16 *ssn,
-			u8 buf_size, bool amsdu);
+			struct ieee80211_ampdu_params *params);
 int rt2800_get_survey(struct ieee80211_hw *hw, int idx,
 		      struct survey_info *survey);
 void rt2800_disable_wpdma(struct rt2x00_dev *rt2x00dev);
diff --git a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu.c b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu.c
index 6aed923a709a..7d820c395375 100644
--- a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu.c
+++ b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu.c
@@ -5375,13 +5375,13 @@ static int rtl8xxxu_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd,
 
 static int
 rtl8xxxu_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-		      enum ieee80211_ampdu_mlme_action action,
-		      struct ieee80211_sta *sta, u16 tid, u16 *ssn, u8 buf_size,
-		      bool amsdu)
+		      struct ieee80211_ampdu_params *params)
 {
 	struct rtl8xxxu_priv *priv = hw->priv;
 	struct device *dev = &priv->udev->dev;
 	u8 ampdu_factor, ampdu_density;
+	struct ieee80211_sta *sta = params->sta;
+	enum ieee80211_ampdu_mlme_action action = params->action;
 
 	switch (action) {
 	case IEEE80211_AMPDU_TX_START:
diff --git a/drivers/net/wireless/realtek/rtlwifi/core.c b/drivers/net/wireless/realtek/rtlwifi/core.c
index 4ae421ef30d9..f2507610314b 100644
--- a/drivers/net/wireless/realtek/rtlwifi/core.c
+++ b/drivers/net/wireless/realtek/rtlwifi/core.c
@@ -1371,11 +1371,13 @@ static void rtl_op_sta_notify(struct ieee80211_hw *hw,
 
 static int rtl_op_ampdu_action(struct ieee80211_hw *hw,
 			       struct ieee80211_vif *vif,
-			       enum ieee80211_ampdu_mlme_action action,
-			       struct ieee80211_sta *sta, u16 tid, u16 *ssn,
-			       u8 buf_size, bool amsdu)
+			       struct ieee80211_ampdu_params *params)
 {
 	struct rtl_priv *rtlpriv = rtl_priv(hw);
+	struct ieee80211_sta *sta = params->sta;
+	enum ieee80211_ampdu_mlme_action action = params->action;
+	u16 tid = params->tid;
+	u16 *ssn = &params->ssn;
 
 	switch (action) {
 	case IEEE80211_AMPDU_TX_START:
diff --git a/drivers/net/wireless/rsi/rsi_91x_mac80211.c b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
index b5bcc933a2a6..4df992de7d07 100644
--- a/drivers/net/wireless/rsi/rsi_91x_mac80211.c
+++ b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
@@ -659,29 +659,24 @@ static int rsi_mac80211_set_key(struct ieee80211_hw *hw,
  *				 informs the f/w regarding this.
  * @hw: Pointer to the ieee80211_hw structure.
  * @vif: Pointer to the ieee80211_vif structure.
- * @action: ieee80211_ampdu_mlme_action enum.
- * @sta: Pointer to the ieee80211_sta structure.
- * @tid: Traffic identifier.
- * @ssn: Pointer to ssn value.
- * @buf_size: Buffer size (for kernel version > 2.6.38).
- * @amsdu: is AMSDU in AMPDU allowed
+ * @params: Pointer to A-MPDU action parameters
  *
  * Return: status: 0 on success, negative error code on failure.
  */
 static int rsi_mac80211_ampdu_action(struct ieee80211_hw *hw,
 				     struct ieee80211_vif *vif,
-				     enum ieee80211_ampdu_mlme_action action,
-				     struct ieee80211_sta *sta,
-				     unsigned short tid,
-				     unsigned short *ssn,
-				     unsigned char buf_size,
-				     bool amsdu)
+				     struct ieee80211_ampdu_params *params)
 {
 	int status = -EOPNOTSUPP;
 	struct rsi_hw *adapter = hw->priv;
 	struct rsi_common *common = adapter->priv;
 	u16 seq_no = 0;
 	u8 ii = 0;
+	struct ieee80211_sta *sta = params->sta;
+	enum ieee80211_ampdu_mlme_action action = params->action;
+	u16 tid = params->tid;
+	u16 *ssn = &params->ssn;
+	u8 buf_size = params->buf_size;
 
 	for (ii = 0; ii < RSI_MAX_VIFS; ii++) {
 		if (vif == adapter->vifs[ii])
diff --git a/drivers/net/wireless/st/cw1200/sta.c b/drivers/net/wireless/st/cw1200/sta.c
index 06321c799c90..d0ddcde6c695 100644
--- a/drivers/net/wireless/st/cw1200/sta.c
+++ b/drivers/net/wireless/st/cw1200/sta.c
@@ -2129,9 +2129,7 @@ void cw1200_mcast_timeout(unsigned long arg)
 
 int cw1200_ampdu_action(struct ieee80211_hw *hw,
 			struct ieee80211_vif *vif,
-			enum ieee80211_ampdu_mlme_action action,
-			struct ieee80211_sta *sta, u16 tid, u16 *ssn,
-			u8 buf_size, bool amsdu)
+			struct ieee80211_ampdu_params *params)
 {
 	/* Aggregation is implemented fully in firmware,
 	 * including block ack negotiation. Do not allow
diff --git a/drivers/net/wireless/st/cw1200/sta.h b/drivers/net/wireless/st/cw1200/sta.h
index bebb3379017f..a0bacaa39b31 100644
--- a/drivers/net/wireless/st/cw1200/sta.h
+++ b/drivers/net/wireless/st/cw1200/sta.h
@@ -109,9 +109,7 @@ void cw1200_bss_info_changed(struct ieee80211_hw *dev,
 			     u32 changed);
 int cw1200_ampdu_action(struct ieee80211_hw *hw,
 			struct ieee80211_vif *vif,
-			enum ieee80211_ampdu_mlme_action action,
-			struct ieee80211_sta *sta, u16 tid, u16 *ssn,
-			u8 buf_size, bool amsdu);
+			struct ieee80211_ampdu_params *params);
 
 void cw1200_suspend_resume(struct cw1200_common *priv,
 			  struct wsm_suspend_resume *arg);
diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c
index d1109c4f0f0d..45662cf3169f 100644
--- a/drivers/net/wireless/ti/wlcore/main.c
+++ b/drivers/net/wireless/ti/wlcore/main.c
@@ -5187,14 +5187,16 @@ out:
 
 static int wl1271_op_ampdu_action(struct ieee80211_hw *hw,
 				  struct ieee80211_vif *vif,
-				  enum ieee80211_ampdu_mlme_action action,
-				  struct ieee80211_sta *sta, u16 tid, u16 *ssn,
-				  u8 buf_size, bool amsdu)
+				  struct ieee80211_ampdu_params *params)
 {
 	struct wl1271 *wl = hw->priv;
 	struct wl12xx_vif *wlvif = wl12xx_vif_to_data(vif);
 	int ret;
 	u8 hlid, *ba_bitmap;
+	struct ieee80211_sta *sta = params->sta;
+	enum ieee80211_ampdu_mlme_action action = params->action;
+	u16 tid = params->tid;
+	u16 *ssn = &params->ssn;
 
 	wl1271_debug(DEBUG_MAC80211, "mac80211 ampdu action %d tid %d", action,
 		     tid);
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index bdee1cc19c7e..6c9c559394b0 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -2702,6 +2702,33 @@ enum ieee80211_ampdu_mlme_action {
 	IEEE80211_AMPDU_TX_OPERATIONAL,
 };
 
+/**
+ * struct ieee80211_ampdu_params - AMPDU action parameters
+ *
+ * @action: the ampdu action, value from %ieee80211_ampdu_mlme_action.
+ * @sta: peer of this AMPDU session
+ * @tid: tid of the BA session
+ * @ssn: start sequence number of the session. TX/RX_STOP can pass 0. When
+ *	action is set to %IEEE80211_AMPDU_RX_START the driver passes back the
+ *	actual ssn value used to start the session and writes the value here.
+ * @buf_size: reorder buffer size  (number of subframes). Valid only when the
+ *	action is set to %IEEE80211_AMPDU_RX_START or
+ *	%IEEE80211_AMPDU_TX_OPERATIONAL
+ * @amsdu: indicates the peer's ability to receive A-MSDU within A-MPDU.
+ *	valid when the action is set to %IEEE80211_AMPDU_TX_OPERATIONAL
+ * @timeout: BA session timeout. Valid only when the action is set to
+ *	%IEEE80211_AMPDU_RX_START
+ */
+struct ieee80211_ampdu_params {
+	enum ieee80211_ampdu_mlme_action action;
+	struct ieee80211_sta *sta;
+	u16 tid;
+	u16 ssn;
+	u8 buf_size;
+	bool amsdu;
+	u16 timeout;
+};
+
 /**
  * enum ieee80211_frame_release_type - frame release reason
  * @IEEE80211_FRAME_RELEASE_PSPOLL: frame released for PS-Poll
@@ -3046,15 +3073,9 @@ enum ieee80211_reconfig_type {
  * @ampdu_action: Perform a certain A-MPDU action
  * 	The RA/TID combination determines the destination and TID we want
  * 	the ampdu action to be performed for. The action is defined through
- * 	ieee80211_ampdu_mlme_action. Starting sequence number (@ssn)
- * 	is the first frame we expect to perform the action on. Notice
- * 	that TX/RX_STOP can pass NULL for this parameter.
- *	The @buf_size parameter is valid only when the action is set to
- *	%IEEE80211_AMPDU_RX_START or %IEEE80211_AMPDU_TX_OPERATIONAL and
- *	indicates the reorder buffer size (number of subframes) for this
- *	session.
+ *	ieee80211_ampdu_mlme_action.
  *	When the action is set to %IEEE80211_AMPDU_TX_OPERATIONAL the driver
- *	may neither send aggregates containing more subframes than this
+ *	may neither send aggregates containing more subframes than @buf_size
  *	nor send aggregates in a way that lost frames would exceed the
  *	buffer size. If just limiting the aggregate size, this would be
  *	possible with a buf_size of 8:
@@ -3065,9 +3086,6 @@ enum ieee80211_reconfig_type {
  *	buffer size of 8. Correct ways to retransmit #1 would be:
  *	 - TX:       1 or 18 or 81
  *	Even "189" would be wrong since 1 could be lost again.
- *	The @amsdu parameter is valid when the action is set to
- *	%IEEE80211_AMPDU_TX_OPERATIONAL and indicates the peer's ability
- *	to receive A-MSDU within A-MPDU.
  *
  *	Returns a negative error code on failure.
  *	The callback can sleep.
@@ -3409,9 +3427,7 @@ struct ieee80211_ops {
 	int (*tx_last_beacon)(struct ieee80211_hw *hw);
 	int (*ampdu_action)(struct ieee80211_hw *hw,
 			    struct ieee80211_vif *vif,
-			    enum ieee80211_ampdu_mlme_action action,
-			    struct ieee80211_sta *sta, u16 tid, u16 *ssn,
-			    u8 buf_size, bool amsdu);
+			    struct ieee80211_ampdu_params *params);
 	int (*get_survey)(struct ieee80211_hw *hw, int idx,
 		struct survey_info *survey);
 	void (*rfkill_poll)(struct ieee80211_hw *hw);
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index 78672737fe3e..ec80db7c955c 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -7,6 +7,7 @@
  * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
  * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
  * Copyright 2007-2010, Intel Corporation
+ * Copyright(c) 2015 Intel Deutschland GmbH
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -61,6 +62,14 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
 {
 	struct ieee80211_local *local = sta->local;
 	struct tid_ampdu_rx *tid_rx;
+	struct ieee80211_ampdu_params params = {
+		.sta = &sta->sta,
+		.action = IEEE80211_AMPDU_RX_STOP,
+		.tid = tid,
+		.amsdu = false,
+		.timeout = 0,
+		.ssn = 0,
+	};
 
 	lockdep_assert_held(&sta->ampdu_mlme.mtx);
 
@@ -78,8 +87,7 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
 	       initiator == WLAN_BACK_RECIPIENT ? "recipient" : "inititator",
 	       (int)reason);
 
-	if (drv_ampdu_action(local, sta->sdata, IEEE80211_AMPDU_RX_STOP,
-			     &sta->sta, tid, NULL, 0, false))
+	if (drv_ampdu_action(local, sta->sdata, &params))
 		sdata_info(sta->sdata,
 			   "HW problem - can not stop rx aggregation for %pM tid %d\n",
 			   sta->sta.addr, tid);
@@ -237,6 +245,15 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
 {
 	struct ieee80211_local *local = sta->sdata->local;
 	struct tid_ampdu_rx *tid_agg_rx;
+	struct ieee80211_ampdu_params params = {
+		.sta = &sta->sta,
+		.action = IEEE80211_AMPDU_RX_START,
+		.tid = tid,
+		.amsdu = false,
+		.timeout = timeout,
+		.ssn = start_seq_num,
+	};
+
 	int i, ret = -EOPNOTSUPP;
 	u16 status = WLAN_STATUS_REQUEST_DECLINED;
 
@@ -275,6 +292,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
 	/* make sure the size doesn't exceed the maximum supported by the hw */
 	if (buf_size > local->hw.max_rx_aggregation_subframes)
 		buf_size = local->hw.max_rx_aggregation_subframes;
+	params.buf_size = buf_size;
 
 	/* examine state machine */
 	mutex_lock(&sta->ampdu_mlme.mtx);
@@ -322,8 +340,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
 	for (i = 0; i < buf_size; i++)
 		__skb_queue_head_init(&tid_agg_rx->reorder_buf[i]);
 
-	ret = drv_ampdu_action(local, sta->sdata, IEEE80211_AMPDU_RX_START,
-			       &sta->sta, tid, &start_seq_num, buf_size, false);
+	ret = drv_ampdu_action(local, sta->sdata, &params);
 	ht_dbg(sta->sdata, "Rx A-MPDU request on %pM tid %d result %d\n",
 	       sta->sta.addr, tid, ret);
 	if (ret) {
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index ff757181b0a8..4932e9f243a2 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -7,6 +7,7 @@
  * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
  * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
  * Copyright 2007-2010, Intel Corporation
+ * Copyright(c) 2015 Intel Deutschland GmbH
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -295,7 +296,14 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
 {
 	struct ieee80211_local *local = sta->local;
 	struct tid_ampdu_tx *tid_tx;
-	enum ieee80211_ampdu_mlme_action action;
+	struct ieee80211_ampdu_params params = {
+		.sta = &sta->sta,
+		.tid = tid,
+		.buf_size = 0,
+		.amsdu = false,
+		.timeout = 0,
+		.ssn = 0,
+	};
 	int ret;
 
 	lockdep_assert_held(&sta->ampdu_mlme.mtx);
@@ -304,10 +312,10 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
 	case AGG_STOP_DECLINED:
 	case AGG_STOP_LOCAL_REQUEST:
 	case AGG_STOP_PEER_REQUEST:
-		action = IEEE80211_AMPDU_TX_STOP_CONT;
+		params.action = IEEE80211_AMPDU_TX_STOP_CONT;
 		break;
 	case AGG_STOP_DESTROY_STA:
-		action = IEEE80211_AMPDU_TX_STOP_FLUSH;
+		params.action = IEEE80211_AMPDU_TX_STOP_FLUSH;
 		break;
 	default:
 		WARN_ON_ONCE(1);
@@ -330,9 +338,8 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
 		spin_unlock_bh(&sta->lock);
 		if (reason != AGG_STOP_DESTROY_STA)
 			return -EALREADY;
-		ret = drv_ampdu_action(local, sta->sdata,
-				       IEEE80211_AMPDU_TX_STOP_FLUSH_CONT,
-				       &sta->sta, tid, NULL, 0, false);
+		params.action = IEEE80211_AMPDU_TX_STOP_FLUSH_CONT;
+		ret = drv_ampdu_action(local, sta->sdata, &params);
 		WARN_ON_ONCE(ret);
 		return 0;
 	}
@@ -381,8 +388,7 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
 					WLAN_BACK_INITIATOR;
 	tid_tx->tx_stop = reason == AGG_STOP_LOCAL_REQUEST;
 
-	ret = drv_ampdu_action(local, sta->sdata, action,
-			       &sta->sta, tid, NULL, 0, false);
+	ret = drv_ampdu_action(local, sta->sdata, &params);
 
 	/* HW shall not deny going back to legacy */
 	if (WARN_ON(ret)) {
@@ -445,7 +451,14 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
 	struct tid_ampdu_tx *tid_tx;
 	struct ieee80211_local *local = sta->local;
 	struct ieee80211_sub_if_data *sdata = sta->sdata;
-	u16 start_seq_num;
+	struct ieee80211_ampdu_params params = {
+		.sta = &sta->sta,
+		.action = IEEE80211_AMPDU_TX_START,
+		.tid = tid,
+		.buf_size = 0,
+		.amsdu = false,
+		.timeout = 0,
+	};
 	int ret;
 
 	tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
@@ -467,10 +480,8 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
 	 */
 	synchronize_net();
 
-	start_seq_num = sta->tid_seq[tid] >> 4;
-
-	ret = drv_ampdu_action(local, sdata, IEEE80211_AMPDU_TX_START,
-			       &sta->sta, tid, &start_seq_num, 0, false);
+	params.ssn = sta->tid_seq[tid] >> 4;
+	ret = drv_ampdu_action(local, sdata, &params);
 	if (ret) {
 		ht_dbg(sdata,
 		       "BA request denied - HW unavailable for %pM tid %d\n",
@@ -499,7 +510,7 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
 
 	/* send AddBA request */
 	ieee80211_send_addba_request(sdata, sta->sta.addr, tid,
-				     tid_tx->dialog_token, start_seq_num,
+				     tid_tx->dialog_token, params.ssn,
 				     IEEE80211_MAX_AMPDU_BUF,
 				     tid_tx->timeout);
 }
@@ -684,18 +695,24 @@ static void ieee80211_agg_tx_operational(struct ieee80211_local *local,
 					 struct sta_info *sta, u16 tid)
 {
 	struct tid_ampdu_tx *tid_tx;
+	struct ieee80211_ampdu_params params = {
+		.sta = &sta->sta,
+		.action = IEEE80211_AMPDU_TX_OPERATIONAL,
+		.tid = tid,
+		.timeout = 0,
+		.ssn = 0,
+	};
 
 	lockdep_assert_held(&sta->ampdu_mlme.mtx);
 
 	tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
+	params.buf_size = tid_tx->buf_size;
+	params.amsdu = tid_tx->amsdu;
 
 	ht_dbg(sta->sdata, "Aggregation is on for %pM tid %d\n",
 	       sta->sta.addr, tid);
 
-	drv_ampdu_action(local, sta->sdata,
-			 IEEE80211_AMPDU_TX_OPERATIONAL,
-			 &sta->sta, tid, NULL, tid_tx->buf_size,
-			 tid_tx->amsdu);
+	drv_ampdu_action(local, sta->sdata, &params);
 
 	/*
 	 * synchronize with TX path, while splicing the TX path
diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c
index ca1fe5576103..c258f1041d33 100644
--- a/net/mac80211/driver-ops.c
+++ b/net/mac80211/driver-ops.c
@@ -284,9 +284,7 @@ int drv_switch_vif_chanctx(struct ieee80211_local *local,
 
 int drv_ampdu_action(struct ieee80211_local *local,
 		     struct ieee80211_sub_if_data *sdata,
-		     enum ieee80211_ampdu_mlme_action action,
-		     struct ieee80211_sta *sta, u16 tid,
-		     u16 *ssn, u8 buf_size, bool amsdu)
+		     struct ieee80211_ampdu_params *params)
 {
 	int ret = -EOPNOTSUPP;
 
@@ -296,12 +294,10 @@ int drv_ampdu_action(struct ieee80211_local *local,
 	if (!check_sdata_in_driver(sdata))
 		return -EIO;
 
-	trace_drv_ampdu_action(local, sdata, action, sta, tid,
-			       ssn, buf_size, amsdu);
+	trace_drv_ampdu_action(local, sdata, params);
 
 	if (local->ops->ampdu_action)
-		ret = local->ops->ampdu_action(&local->hw, &sdata->vif, action,
-					       sta, tid, ssn, buf_size, amsdu);
+		ret = local->ops->ampdu_action(&local->hw, &sdata->vif, params);
 
 	trace_drv_return_int(local, ret);
 
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 154ce4b13406..18b0d65baff0 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -585,9 +585,7 @@ static inline int drv_tx_last_beacon(struct ieee80211_local *local)
 
 int drv_ampdu_action(struct ieee80211_local *local,
 		     struct ieee80211_sub_if_data *sdata,
-		     enum ieee80211_ampdu_mlme_action action,
-		     struct ieee80211_sta *sta, u16 tid,
-		     u16 *ssn, u8 buf_size, bool amsdu);
+		     struct ieee80211_ampdu_params *params);
 
 static inline int drv_get_survey(struct ieee80211_local *local, int idx,
 				struct survey_info *survey)
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index a6b4442776a0..2b0a17ee907a 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -80,7 +80,23 @@
 #define KEY_PR_FMT	" cipher:0x%x, flags=%#x, keyidx=%d, hw_key_idx=%d"
 #define KEY_PR_ARG	__entry->cipher, __entry->flags, __entry->keyidx, __entry->hw_key_idx
 
-
+#define AMPDU_ACTION_ENTRY	__field(enum ieee80211_ampdu_mlme_action,		\
+					ieee80211_ampdu_mlme_action)			\
+				STA_ENTRY						\
+				__field(u16, tid)					\
+				__field(u16, ssn)					\
+				__field(u8, buf_size)					\
+				__field(bool, amsdu)					\
+				__field(u16, timeout)
+#define AMPDU_ACTION_ASSIGN	STA_NAMED_ASSIGN(params->sta);				\
+				__entry->tid = params->tid;				\
+				__entry->ssn = params->ssn;				\
+				__entry->buf_size = params->buf_size;			\
+				__entry->amsdu = params->amsdu;				\
+				__entry->timeout = params->timeout;
+#define AMPDU_ACTION_PR_FMT	STA_PR_FMT " tid %d, ssn %d, buf_size %u, amsdu %d, timeout %d"
+#define AMPDU_ACTION_PR_ARG	STA_PR_ARG, __entry->tid, __entry->ssn,			\
+				__entry->buf_size, __entry->amsdu, __entry->timeout
 
 /*
  * Tracing for driver callbacks.
@@ -970,38 +986,25 @@ DEFINE_EVENT(local_only_evt, drv_tx_last_beacon,
 TRACE_EVENT(drv_ampdu_action,
 	TP_PROTO(struct ieee80211_local *local,
 		 struct ieee80211_sub_if_data *sdata,
-		 enum ieee80211_ampdu_mlme_action action,
-		 struct ieee80211_sta *sta, u16 tid,
-		 u16 *ssn, u8 buf_size, bool amsdu),
+		 struct ieee80211_ampdu_params *params),
 
-	TP_ARGS(local, sdata, action, sta, tid, ssn, buf_size, amsdu),
+	TP_ARGS(local, sdata, params),
 
 	TP_STRUCT__entry(
 		LOCAL_ENTRY
-		STA_ENTRY
-		__field(u32, action)
-		__field(u16, tid)
-		__field(u16, ssn)
-		__field(u8, buf_size)
-		__field(bool, amsdu)
 		VIF_ENTRY
+		AMPDU_ACTION_ENTRY
 	),
 
 	TP_fast_assign(
 		LOCAL_ASSIGN;
 		VIF_ASSIGN;
-		STA_ASSIGN;
-		__entry->action = action;
-		__entry->tid = tid;
-		__entry->ssn = ssn ? *ssn : 0;
-		__entry->buf_size = buf_size;
-		__entry->amsdu = amsdu;
+		AMPDU_ACTION_ASSIGN;
 	),
 
 	TP_printk(
-		LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " action:%d tid:%d buf:%d amsdu:%d",
-		LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->action,
-		__entry->tid, __entry->buf_size, __entry->amsdu
+		LOCAL_PR_FMT VIF_PR_FMT AMPDU_ACTION_PR_FMT,
+		LOCAL_PR_ARG, VIF_PR_ARG, AMPDU_ACTION_PR_ARG
 	)
 );
 
-- 
cgit v1.2.3-59-g8ed1b


From 3b5cf20cf439c3e8963c2d2a3f225434d31827e3 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sun, 24 Jan 2016 21:17:59 +0800
Subject: sunrpc: Use skcipher and ahash/shash

This patch replaces uses of blkcipher with skcipher and the long
obsolete hash interface with either shash (for non-SG users) and
ahash.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/sunrpc/gss_krb5.h       |  32 ++--
 net/sunrpc/auth_gss/gss_krb5_crypto.c | 350 +++++++++++++++++++++-------------
 net/sunrpc/auth_gss/gss_krb5_keys.c   |  12 +-
 net/sunrpc/auth_gss/gss_krb5_mech.c   |  89 ++++-----
 net/sunrpc/auth_gss/gss_krb5_seqnum.c |  22 +--
 net/sunrpc/auth_gss/gss_krb5_wrap.c   |  24 +--
 6 files changed, 306 insertions(+), 223 deletions(-)

(limited to 'net')

diff --git a/include/linux/sunrpc/gss_krb5.h b/include/linux/sunrpc/gss_krb5.h
index df02a4188487..7df625d41e35 100644
--- a/include/linux/sunrpc/gss_krb5.h
+++ b/include/linux/sunrpc/gss_krb5.h
@@ -36,7 +36,7 @@
  *
  */
 
-#include <linux/crypto.h>
+#include <crypto/skcipher.h>
 #include <linux/sunrpc/auth_gss.h>
 #include <linux/sunrpc/gss_err.h>
 #include <linux/sunrpc/gss_asn1.h>
@@ -71,10 +71,10 @@ struct gss_krb5_enctype {
 	const u32		keyed_cksum;	/* is it a keyed cksum? */
 	const u32		keybytes;	/* raw key len, in bytes */
 	const u32		keylength;	/* final key len, in bytes */
-	u32 (*encrypt) (struct crypto_blkcipher *tfm,
+	u32 (*encrypt) (struct crypto_skcipher *tfm,
 			void *iv, void *in, void *out,
 			int length);		/* encryption function */
-	u32 (*decrypt) (struct crypto_blkcipher *tfm,
+	u32 (*decrypt) (struct crypto_skcipher *tfm,
 			void *iv, void *in, void *out,
 			int length);		/* decryption function */
 	u32 (*mk_key) (const struct gss_krb5_enctype *gk5e,
@@ -98,12 +98,12 @@ struct krb5_ctx {
 	u32			enctype;
 	u32			flags;
 	const struct gss_krb5_enctype *gk5e; /* enctype-specific info */
-	struct crypto_blkcipher	*enc;
-	struct crypto_blkcipher	*seq;
-	struct crypto_blkcipher *acceptor_enc;
-	struct crypto_blkcipher *initiator_enc;
-	struct crypto_blkcipher *acceptor_enc_aux;
-	struct crypto_blkcipher *initiator_enc_aux;
+	struct crypto_skcipher	*enc;
+	struct crypto_skcipher	*seq;
+	struct crypto_skcipher *acceptor_enc;
+	struct crypto_skcipher *initiator_enc;
+	struct crypto_skcipher *acceptor_enc_aux;
+	struct crypto_skcipher *initiator_enc_aux;
 	u8			Ksess[GSS_KRB5_MAX_KEYLEN]; /* session key */
 	u8			cksum[GSS_KRB5_MAX_KEYLEN];
 	s32			endtime;
@@ -262,24 +262,24 @@ gss_unwrap_kerberos(struct gss_ctx *ctx_id, int offset,
 
 
 u32
-krb5_encrypt(struct crypto_blkcipher *key,
+krb5_encrypt(struct crypto_skcipher *key,
 	     void *iv, void *in, void *out, int length);
 
 u32
-krb5_decrypt(struct crypto_blkcipher *key,
+krb5_decrypt(struct crypto_skcipher *key,
 	     void *iv, void *in, void *out, int length); 
 
 int
-gss_encrypt_xdr_buf(struct crypto_blkcipher *tfm, struct xdr_buf *outbuf,
+gss_encrypt_xdr_buf(struct crypto_skcipher *tfm, struct xdr_buf *outbuf,
 		    int offset, struct page **pages);
 
 int
-gss_decrypt_xdr_buf(struct crypto_blkcipher *tfm, struct xdr_buf *inbuf,
+gss_decrypt_xdr_buf(struct crypto_skcipher *tfm, struct xdr_buf *inbuf,
 		    int offset);
 
 s32
 krb5_make_seq_num(struct krb5_ctx *kctx,
-		struct crypto_blkcipher *key,
+		struct crypto_skcipher *key,
 		int direction,
 		u32 seqnum, unsigned char *cksum, unsigned char *buf);
 
@@ -320,12 +320,12 @@ gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset,
 
 int
 krb5_rc4_setup_seq_key(struct krb5_ctx *kctx,
-		       struct crypto_blkcipher *cipher,
+		       struct crypto_skcipher *cipher,
 		       unsigned char *cksum);
 
 int
 krb5_rc4_setup_enc_key(struct krb5_ctx *kctx,
-		       struct crypto_blkcipher *cipher,
+		       struct crypto_skcipher *cipher,
 		       s32 seqnum);
 void
 gss_krb5_make_confounder(char *p, u32 conflen);
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index fee3c15a4b52..d94a8e1e9f05 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -34,11 +34,12 @@
  * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
  */
 
+#include <crypto/hash.h>
+#include <crypto/skcipher.h>
 #include <linux/err.h>
 #include <linux/types.h>
 #include <linux/mm.h>
 #include <linux/scatterlist.h>
-#include <linux/crypto.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <linux/random.h>
@@ -51,7 +52,7 @@
 
 u32
 krb5_encrypt(
-	struct crypto_blkcipher *tfm,
+	struct crypto_skcipher *tfm,
 	void * iv,
 	void * in,
 	void * out,
@@ -60,24 +61,28 @@ krb5_encrypt(
 	u32 ret = -EINVAL;
 	struct scatterlist sg[1];
 	u8 local_iv[GSS_KRB5_MAX_BLOCKSIZE] = {0};
-	struct blkcipher_desc desc = { .tfm = tfm, .info = local_iv };
+	SKCIPHER_REQUEST_ON_STACK(req, tfm);
 
-	if (length % crypto_blkcipher_blocksize(tfm) != 0)
+	if (length % crypto_skcipher_blocksize(tfm) != 0)
 		goto out;
 
-	if (crypto_blkcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) {
+	if (crypto_skcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) {
 		dprintk("RPC:       gss_k5encrypt: tfm iv size too large %d\n",
-			crypto_blkcipher_ivsize(tfm));
+			crypto_skcipher_ivsize(tfm));
 		goto out;
 	}
 
 	if (iv)
-		memcpy(local_iv, iv, crypto_blkcipher_ivsize(tfm));
+		memcpy(local_iv, iv, crypto_skcipher_ivsize(tfm));
 
 	memcpy(out, in, length);
 	sg_init_one(sg, out, length);
 
-	ret = crypto_blkcipher_encrypt_iv(&desc, sg, sg, length);
+	skcipher_request_set_callback(req, 0, NULL, NULL);
+	skcipher_request_set_crypt(req, sg, sg, length, local_iv);
+
+	ret = crypto_skcipher_encrypt(req);
+	skcipher_request_zero(req);
 out:
 	dprintk("RPC:       krb5_encrypt returns %d\n", ret);
 	return ret;
@@ -85,7 +90,7 @@ out:
 
 u32
 krb5_decrypt(
-     struct crypto_blkcipher *tfm,
+     struct crypto_skcipher *tfm,
      void * iv,
      void * in,
      void * out,
@@ -94,23 +99,27 @@ krb5_decrypt(
 	u32 ret = -EINVAL;
 	struct scatterlist sg[1];
 	u8 local_iv[GSS_KRB5_MAX_BLOCKSIZE] = {0};
-	struct blkcipher_desc desc = { .tfm = tfm, .info = local_iv };
+	SKCIPHER_REQUEST_ON_STACK(req, tfm);
 
-	if (length % crypto_blkcipher_blocksize(tfm) != 0)
+	if (length % crypto_skcipher_blocksize(tfm) != 0)
 		goto out;
 
-	if (crypto_blkcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) {
+	if (crypto_skcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) {
 		dprintk("RPC:       gss_k5decrypt: tfm iv size too large %d\n",
-			crypto_blkcipher_ivsize(tfm));
+			crypto_skcipher_ivsize(tfm));
 		goto out;
 	}
 	if (iv)
-		memcpy(local_iv,iv, crypto_blkcipher_ivsize(tfm));
+		memcpy(local_iv,iv, crypto_skcipher_ivsize(tfm));
 
 	memcpy(out, in, length);
 	sg_init_one(sg, out, length);
 
-	ret = crypto_blkcipher_decrypt_iv(&desc, sg, sg, length);
+	skcipher_request_set_callback(req, 0, NULL, NULL);
+	skcipher_request_set_crypt(req, sg, sg, length, local_iv);
+
+	ret = crypto_skcipher_decrypt(req);
+	skcipher_request_zero(req);
 out:
 	dprintk("RPC:       gss_k5decrypt returns %d\n",ret);
 	return ret;
@@ -119,9 +128,11 @@ out:
 static int
 checksummer(struct scatterlist *sg, void *data)
 {
-	struct hash_desc *desc = data;
+	struct ahash_request *req = data;
+
+	ahash_request_set_crypt(req, sg, NULL, sg->length);
 
-	return crypto_hash_update(desc, sg, sg->length);
+	return crypto_ahash_update(req);
 }
 
 static int
@@ -152,13 +163,13 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen,
 		       struct xdr_buf *body, int body_offset, u8 *cksumkey,
 		       unsigned int usage, struct xdr_netobj *cksumout)
 {
-	struct hash_desc                desc;
 	struct scatterlist              sg[1];
 	int err;
 	u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN];
 	u8 rc4salt[4];
-	struct crypto_hash *md5;
-	struct crypto_hash *hmac_md5;
+	struct crypto_ahash *md5;
+	struct crypto_ahash *hmac_md5;
+	struct ahash_request *req;
 
 	if (cksumkey == NULL)
 		return GSS_S_FAILURE;
@@ -174,61 +185,79 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen,
 		return GSS_S_FAILURE;
 	}
 
-	md5 = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
+	md5 = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
 	if (IS_ERR(md5))
 		return GSS_S_FAILURE;
 
-	hmac_md5 = crypto_alloc_hash(kctx->gk5e->cksum_name, 0,
-				     CRYPTO_ALG_ASYNC);
+	hmac_md5 = crypto_alloc_ahash(kctx->gk5e->cksum_name, 0,
+				      CRYPTO_ALG_ASYNC);
 	if (IS_ERR(hmac_md5)) {
-		crypto_free_hash(md5);
+		crypto_free_ahash(md5);
+		return GSS_S_FAILURE;
+	}
+
+	req = ahash_request_alloc(md5, GFP_KERNEL);
+	if (!req) {
+		crypto_free_ahash(hmac_md5);
+		crypto_free_ahash(md5);
 		return GSS_S_FAILURE;
 	}
 
-	desc.tfm = md5;
-	desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+	ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
 
-	err = crypto_hash_init(&desc);
+	err = crypto_ahash_init(req);
 	if (err)
 		goto out;
 	sg_init_one(sg, rc4salt, 4);
-	err = crypto_hash_update(&desc, sg, 4);
+	ahash_request_set_crypt(req, sg, NULL, 4);
+	err = crypto_ahash_update(req);
 	if (err)
 		goto out;
 
 	sg_init_one(sg, header, hdrlen);
-	err = crypto_hash_update(&desc, sg, hdrlen);
+	ahash_request_set_crypt(req, sg, NULL, hdrlen);
+	err = crypto_ahash_update(req);
 	if (err)
 		goto out;
 	err = xdr_process_buf(body, body_offset, body->len - body_offset,
-			      checksummer, &desc);
+			      checksummer, req);
 	if (err)
 		goto out;
-	err = crypto_hash_final(&desc, checksumdata);
+	ahash_request_set_crypt(req, NULL, checksumdata, 0);
+	err = crypto_ahash_final(req);
 	if (err)
 		goto out;
 
-	desc.tfm = hmac_md5;
-	desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+	ahash_request_free(req);
+	req = ahash_request_alloc(hmac_md5, GFP_KERNEL);
+	if (!req) {
+		crypto_free_ahash(hmac_md5);
+		crypto_free_ahash(md5);
+		return GSS_S_FAILURE;
+	}
+
+	ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
 
-	err = crypto_hash_init(&desc);
+	err = crypto_ahash_init(req);
 	if (err)
 		goto out;
-	err = crypto_hash_setkey(hmac_md5, cksumkey, kctx->gk5e->keylength);
+	err = crypto_ahash_setkey(hmac_md5, cksumkey, kctx->gk5e->keylength);
 	if (err)
 		goto out;
 
-	sg_init_one(sg, checksumdata, crypto_hash_digestsize(md5));
-	err = crypto_hash_digest(&desc, sg, crypto_hash_digestsize(md5),
-				 checksumdata);
+	sg_init_one(sg, checksumdata, crypto_ahash_digestsize(md5));
+	ahash_request_set_crypt(req, sg, checksumdata,
+				crypto_ahash_digestsize(md5));
+	err = crypto_ahash_digest(req);
 	if (err)
 		goto out;
 
 	memcpy(cksumout->data, checksumdata, kctx->gk5e->cksumlength);
 	cksumout->len = kctx->gk5e->cksumlength;
 out:
-	crypto_free_hash(md5);
-	crypto_free_hash(hmac_md5);
+	ahash_request_free(req);
+	crypto_free_ahash(md5);
+	crypto_free_ahash(hmac_md5);
 	return err ? GSS_S_FAILURE : 0;
 }
 
@@ -242,7 +271,8 @@ make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen,
 	      struct xdr_buf *body, int body_offset, u8 *cksumkey,
 	      unsigned int usage, struct xdr_netobj *cksumout)
 {
-	struct hash_desc                desc;
+	struct crypto_ahash *tfm;
+	struct ahash_request *req;
 	struct scatterlist              sg[1];
 	int err;
 	u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN];
@@ -259,32 +289,41 @@ make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen,
 		return GSS_S_FAILURE;
 	}
 
-	desc.tfm = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC);
-	if (IS_ERR(desc.tfm))
+	tfm = crypto_alloc_ahash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(tfm))
 		return GSS_S_FAILURE;
-	desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
 
-	checksumlen = crypto_hash_digestsize(desc.tfm);
+	req = ahash_request_alloc(tfm, GFP_KERNEL);
+	if (!req) {
+		crypto_free_ahash(tfm);
+		return GSS_S_FAILURE;
+	}
+
+	ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
+
+	checksumlen = crypto_ahash_digestsize(tfm);
 
 	if (cksumkey != NULL) {
-		err = crypto_hash_setkey(desc.tfm, cksumkey,
-					 kctx->gk5e->keylength);
+		err = crypto_ahash_setkey(tfm, cksumkey,
+					  kctx->gk5e->keylength);
 		if (err)
 			goto out;
 	}
 
-	err = crypto_hash_init(&desc);
+	err = crypto_ahash_init(req);
 	if (err)
 		goto out;
 	sg_init_one(sg, header, hdrlen);
-	err = crypto_hash_update(&desc, sg, hdrlen);
+	ahash_request_set_crypt(req, sg, NULL, hdrlen);
+	err = crypto_ahash_update(req);
 	if (err)
 		goto out;
 	err = xdr_process_buf(body, body_offset, body->len - body_offset,
-			      checksummer, &desc);
+			      checksummer, req);
 	if (err)
 		goto out;
-	err = crypto_hash_final(&desc, checksumdata);
+	ahash_request_set_crypt(req, NULL, checksumdata, 0);
+	err = crypto_ahash_final(req);
 	if (err)
 		goto out;
 
@@ -307,7 +346,8 @@ make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen,
 	}
 	cksumout->len = kctx->gk5e->cksumlength;
 out:
-	crypto_free_hash(desc.tfm);
+	ahash_request_free(req);
+	crypto_free_ahash(tfm);
 	return err ? GSS_S_FAILURE : 0;
 }
 
@@ -323,7 +363,8 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen,
 		 struct xdr_buf *body, int body_offset, u8 *cksumkey,
 		 unsigned int usage, struct xdr_netobj *cksumout)
 {
-	struct hash_desc desc;
+	struct crypto_ahash *tfm;
+	struct ahash_request *req;
 	struct scatterlist sg[1];
 	int err;
 	u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN];
@@ -340,31 +381,39 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen,
 		return GSS_S_FAILURE;
 	}
 
-	desc.tfm = crypto_alloc_hash(kctx->gk5e->cksum_name, 0,
-							CRYPTO_ALG_ASYNC);
-	if (IS_ERR(desc.tfm))
+	tfm = crypto_alloc_ahash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(tfm))
 		return GSS_S_FAILURE;
-	checksumlen = crypto_hash_digestsize(desc.tfm);
-	desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+	checksumlen = crypto_ahash_digestsize(tfm);
+
+	req = ahash_request_alloc(tfm, GFP_KERNEL);
+	if (!req) {
+		crypto_free_ahash(tfm);
+		return GSS_S_FAILURE;
+	}
 
-	err = crypto_hash_setkey(desc.tfm, cksumkey, kctx->gk5e->keylength);
+	ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
+
+	err = crypto_ahash_setkey(tfm, cksumkey, kctx->gk5e->keylength);
 	if (err)
 		goto out;
 
-	err = crypto_hash_init(&desc);
+	err = crypto_ahash_init(req);
 	if (err)
 		goto out;
 	err = xdr_process_buf(body, body_offset, body->len - body_offset,
-			      checksummer, &desc);
+			      checksummer, req);
 	if (err)
 		goto out;
 	if (header != NULL) {
 		sg_init_one(sg, header, hdrlen);
-		err = crypto_hash_update(&desc, sg, hdrlen);
+		ahash_request_set_crypt(req, sg, NULL, hdrlen);
+		err = crypto_ahash_update(req);
 		if (err)
 			goto out;
 	}
-	err = crypto_hash_final(&desc, checksumdata);
+	ahash_request_set_crypt(req, NULL, checksumdata, 0);
+	err = crypto_ahash_final(req);
 	if (err)
 		goto out;
 
@@ -381,13 +430,14 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen,
 		break;
 	}
 out:
-	crypto_free_hash(desc.tfm);
+	ahash_request_free(req);
+	crypto_free_ahash(tfm);
 	return err ? GSS_S_FAILURE : 0;
 }
 
 struct encryptor_desc {
 	u8 iv[GSS_KRB5_MAX_BLOCKSIZE];
-	struct blkcipher_desc desc;
+	struct skcipher_request *req;
 	int pos;
 	struct xdr_buf *outbuf;
 	struct page **pages;
@@ -402,6 +452,7 @@ encryptor(struct scatterlist *sg, void *data)
 {
 	struct encryptor_desc *desc = data;
 	struct xdr_buf *outbuf = desc->outbuf;
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(desc->req);
 	struct page *in_page;
 	int thislen = desc->fraglen + sg->length;
 	int fraglen, ret;
@@ -427,7 +478,7 @@ encryptor(struct scatterlist *sg, void *data)
 	desc->fraglen += sg->length;
 	desc->pos += sg->length;
 
-	fraglen = thislen & (crypto_blkcipher_blocksize(desc->desc.tfm) - 1);
+	fraglen = thislen & (crypto_skcipher_blocksize(tfm) - 1);
 	thislen -= fraglen;
 
 	if (thislen == 0)
@@ -436,8 +487,10 @@ encryptor(struct scatterlist *sg, void *data)
 	sg_mark_end(&desc->infrags[desc->fragno - 1]);
 	sg_mark_end(&desc->outfrags[desc->fragno - 1]);
 
-	ret = crypto_blkcipher_encrypt_iv(&desc->desc, desc->outfrags,
-					  desc->infrags, thislen);
+	skcipher_request_set_crypt(desc->req, desc->infrags, desc->outfrags,
+				   thislen, desc->iv);
+
+	ret = crypto_skcipher_encrypt(desc->req);
 	if (ret)
 		return ret;
 
@@ -459,18 +512,20 @@ encryptor(struct scatterlist *sg, void *data)
 }
 
 int
-gss_encrypt_xdr_buf(struct crypto_blkcipher *tfm, struct xdr_buf *buf,
+gss_encrypt_xdr_buf(struct crypto_skcipher *tfm, struct xdr_buf *buf,
 		    int offset, struct page **pages)
 {
 	int ret;
 	struct encryptor_desc desc;
+	SKCIPHER_REQUEST_ON_STACK(req, tfm);
+
+	BUG_ON((buf->len - offset) % crypto_skcipher_blocksize(tfm) != 0);
 
-	BUG_ON((buf->len - offset) % crypto_blkcipher_blocksize(tfm) != 0);
+	skcipher_request_set_tfm(req, tfm);
+	skcipher_request_set_callback(req, 0, NULL, NULL);
 
 	memset(desc.iv, 0, sizeof(desc.iv));
-	desc.desc.tfm = tfm;
-	desc.desc.info = desc.iv;
-	desc.desc.flags = 0;
+	desc.req = req;
 	desc.pos = offset;
 	desc.outbuf = buf;
 	desc.pages = pages;
@@ -481,12 +536,13 @@ gss_encrypt_xdr_buf(struct crypto_blkcipher *tfm, struct xdr_buf *buf,
 	sg_init_table(desc.outfrags, 4);
 
 	ret = xdr_process_buf(buf, offset, buf->len - offset, encryptor, &desc);
+	skcipher_request_zero(req);
 	return ret;
 }
 
 struct decryptor_desc {
 	u8 iv[GSS_KRB5_MAX_BLOCKSIZE];
-	struct blkcipher_desc desc;
+	struct skcipher_request *req;
 	struct scatterlist frags[4];
 	int fragno;
 	int fraglen;
@@ -497,6 +553,7 @@ decryptor(struct scatterlist *sg, void *data)
 {
 	struct decryptor_desc *desc = data;
 	int thislen = desc->fraglen + sg->length;
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(desc->req);
 	int fraglen, ret;
 
 	/* Worst case is 4 fragments: head, end of page 1, start
@@ -507,7 +564,7 @@ decryptor(struct scatterlist *sg, void *data)
 	desc->fragno++;
 	desc->fraglen += sg->length;
 
-	fraglen = thislen & (crypto_blkcipher_blocksize(desc->desc.tfm) - 1);
+	fraglen = thislen & (crypto_skcipher_blocksize(tfm) - 1);
 	thislen -= fraglen;
 
 	if (thislen == 0)
@@ -515,8 +572,10 @@ decryptor(struct scatterlist *sg, void *data)
 
 	sg_mark_end(&desc->frags[desc->fragno - 1]);
 
-	ret = crypto_blkcipher_decrypt_iv(&desc->desc, desc->frags,
-					  desc->frags, thislen);
+	skcipher_request_set_crypt(desc->req, desc->frags, desc->frags,
+				   thislen, desc->iv);
+
+	ret = crypto_skcipher_decrypt(desc->req);
 	if (ret)
 		return ret;
 
@@ -535,24 +594,29 @@ decryptor(struct scatterlist *sg, void *data)
 }
 
 int
-gss_decrypt_xdr_buf(struct crypto_blkcipher *tfm, struct xdr_buf *buf,
+gss_decrypt_xdr_buf(struct crypto_skcipher *tfm, struct xdr_buf *buf,
 		    int offset)
 {
+	int ret;
 	struct decryptor_desc desc;
+	SKCIPHER_REQUEST_ON_STACK(req, tfm);
 
 	/* XXXJBF: */
-	BUG_ON((buf->len - offset) % crypto_blkcipher_blocksize(tfm) != 0);
+	BUG_ON((buf->len - offset) % crypto_skcipher_blocksize(tfm) != 0);
+
+	skcipher_request_set_tfm(req, tfm);
+	skcipher_request_set_callback(req, 0, NULL, NULL);
 
 	memset(desc.iv, 0, sizeof(desc.iv));
-	desc.desc.tfm = tfm;
-	desc.desc.info = desc.iv;
-	desc.desc.flags = 0;
+	desc.req = req;
 	desc.fragno = 0;
 	desc.fraglen = 0;
 
 	sg_init_table(desc.frags, 4);
 
-	return xdr_process_buf(buf, offset, buf->len - offset, decryptor, &desc);
+	ret = xdr_process_buf(buf, offset, buf->len - offset, decryptor, &desc);
+	skcipher_request_zero(req);
+	return ret;
 }
 
 /*
@@ -594,12 +658,12 @@ xdr_extend_head(struct xdr_buf *buf, unsigned int base, unsigned int shiftlen)
 }
 
 static u32
-gss_krb5_cts_crypt(struct crypto_blkcipher *cipher, struct xdr_buf *buf,
+gss_krb5_cts_crypt(struct crypto_skcipher *cipher, struct xdr_buf *buf,
 		   u32 offset, u8 *iv, struct page **pages, int encrypt)
 {
 	u32 ret;
 	struct scatterlist sg[1];
-	struct blkcipher_desc desc = { .tfm = cipher, .info = iv };
+	SKCIPHER_REQUEST_ON_STACK(req, cipher);
 	u8 data[GSS_KRB5_MAX_BLOCKSIZE * 2];
 	struct page **save_pages;
 	u32 len = buf->len - offset;
@@ -625,10 +689,16 @@ gss_krb5_cts_crypt(struct crypto_blkcipher *cipher, struct xdr_buf *buf,
 
 	sg_init_one(sg, data, len);
 
+	skcipher_request_set_tfm(req, cipher);
+	skcipher_request_set_callback(req, 0, NULL, NULL);
+	skcipher_request_set_crypt(req, sg, sg, len, iv);
+
 	if (encrypt)
-		ret = crypto_blkcipher_encrypt_iv(&desc, sg, sg, len);
+		ret = crypto_skcipher_encrypt(req);
 	else
-		ret = crypto_blkcipher_decrypt_iv(&desc, sg, sg, len);
+		ret = crypto_skcipher_decrypt(req);
+
+	skcipher_request_zero(req);
 
 	if (ret)
 		goto out;
@@ -647,7 +717,7 @@ gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset,
 	struct xdr_netobj hmac;
 	u8 *cksumkey;
 	u8 *ecptr;
-	struct crypto_blkcipher *cipher, *aux_cipher;
+	struct crypto_skcipher *cipher, *aux_cipher;
 	int blocksize;
 	struct page **save_pages;
 	int nblocks, nbytes;
@@ -666,7 +736,7 @@ gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset,
 		cksumkey = kctx->acceptor_integ;
 		usage = KG_USAGE_ACCEPTOR_SEAL;
 	}
-	blocksize = crypto_blkcipher_blocksize(cipher);
+	blocksize = crypto_skcipher_blocksize(cipher);
 
 	/* hide the gss token header and insert the confounder */
 	offset += GSS_KRB5_TOK_HDR_LEN;
@@ -719,20 +789,24 @@ gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset,
 	memset(desc.iv, 0, sizeof(desc.iv));
 
 	if (cbcbytes) {
+		SKCIPHER_REQUEST_ON_STACK(req, aux_cipher);
+
 		desc.pos = offset + GSS_KRB5_TOK_HDR_LEN;
 		desc.fragno = 0;
 		desc.fraglen = 0;
 		desc.pages = pages;
 		desc.outbuf = buf;
-		desc.desc.info = desc.iv;
-		desc.desc.flags = 0;
-		desc.desc.tfm = aux_cipher;
+		desc.req = req;
+
+		skcipher_request_set_tfm(req, aux_cipher);
+		skcipher_request_set_callback(req, 0, NULL, NULL);
 
 		sg_init_table(desc.infrags, 4);
 		sg_init_table(desc.outfrags, 4);
 
 		err = xdr_process_buf(buf, offset + GSS_KRB5_TOK_HDR_LEN,
 				      cbcbytes, encryptor, &desc);
+		skcipher_request_zero(req);
 		if (err)
 			goto out_err;
 	}
@@ -763,7 +837,7 @@ gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf,
 	struct xdr_buf subbuf;
 	u32 ret = 0;
 	u8 *cksum_key;
-	struct crypto_blkcipher *cipher, *aux_cipher;
+	struct crypto_skcipher *cipher, *aux_cipher;
 	struct xdr_netobj our_hmac_obj;
 	u8 our_hmac[GSS_KRB5_MAX_CKSUM_LEN];
 	u8 pkt_hmac[GSS_KRB5_MAX_CKSUM_LEN];
@@ -782,7 +856,7 @@ gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf,
 		cksum_key = kctx->initiator_integ;
 		usage = KG_USAGE_INITIATOR_SEAL;
 	}
-	blocksize = crypto_blkcipher_blocksize(cipher);
+	blocksize = crypto_skcipher_blocksize(cipher);
 
 
 	/* create a segment skipping the header and leaving out the checksum */
@@ -799,15 +873,19 @@ gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf,
 	memset(desc.iv, 0, sizeof(desc.iv));
 
 	if (cbcbytes) {
+		SKCIPHER_REQUEST_ON_STACK(req, aux_cipher);
+
 		desc.fragno = 0;
 		desc.fraglen = 0;
-		desc.desc.info = desc.iv;
-		desc.desc.flags = 0;
-		desc.desc.tfm = aux_cipher;
+		desc.req = req;
+
+		skcipher_request_set_tfm(req, aux_cipher);
+		skcipher_request_set_callback(req, 0, NULL, NULL);
 
 		sg_init_table(desc.frags, 4);
 
 		ret = xdr_process_buf(&subbuf, 0, cbcbytes, decryptor, &desc);
+		skcipher_request_zero(req);
 		if (ret)
 			goto out_err;
 	}
@@ -850,61 +928,62 @@ out_err:
  * Set the key of the given cipher.
  */
 int
-krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, struct crypto_blkcipher *cipher,
+krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, struct crypto_skcipher *cipher,
 		       unsigned char *cksum)
 {
-	struct crypto_hash *hmac;
-	struct hash_desc desc;
-	struct scatterlist sg[1];
+	struct crypto_shash *hmac;
+	struct shash_desc *desc;
 	u8 Kseq[GSS_KRB5_MAX_KEYLEN];
 	u32 zeroconstant = 0;
 	int err;
 
 	dprintk("%s: entered\n", __func__);
 
-	hmac = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC);
+	hmac = crypto_alloc_shash(kctx->gk5e->cksum_name, 0, 0);
 	if (IS_ERR(hmac)) {
 		dprintk("%s: error %ld, allocating hash '%s'\n",
 			__func__, PTR_ERR(hmac), kctx->gk5e->cksum_name);
 		return PTR_ERR(hmac);
 	}
 
-	desc.tfm = hmac;
-	desc.flags = 0;
+	desc = kmalloc(sizeof(*desc), GFP_KERNEL);
+	if (!desc) {
+		dprintk("%s: failed to allocate shash descriptor for '%s'\n",
+			__func__, kctx->gk5e->cksum_name);
+		crypto_free_shash(hmac);
+		return -ENOMEM;
+	}
 
-	err = crypto_hash_init(&desc);
-	if (err)
-		goto out_err;
+	desc->tfm = hmac;
+	desc->flags = 0;
 
 	/* Compute intermediate Kseq from session key */
-	err = crypto_hash_setkey(hmac, kctx->Ksess, kctx->gk5e->keylength);
+	err = crypto_shash_setkey(hmac, kctx->Ksess, kctx->gk5e->keylength);
 	if (err)
 		goto out_err;
 
-	sg_init_one(sg, &zeroconstant, 4);
-	err = crypto_hash_digest(&desc, sg, 4, Kseq);
+	err = crypto_shash_digest(desc, (u8 *)&zeroconstant, 4, Kseq);
 	if (err)
 		goto out_err;
 
 	/* Compute final Kseq from the checksum and intermediate Kseq */
-	err = crypto_hash_setkey(hmac, Kseq, kctx->gk5e->keylength);
+	err = crypto_shash_setkey(hmac, Kseq, kctx->gk5e->keylength);
 	if (err)
 		goto out_err;
 
-	sg_set_buf(sg, cksum, 8);
-
-	err = crypto_hash_digest(&desc, sg, 8, Kseq);
+	err = crypto_shash_digest(desc, cksum, 8, Kseq);
 	if (err)
 		goto out_err;
 
-	err = crypto_blkcipher_setkey(cipher, Kseq, kctx->gk5e->keylength);
+	err = crypto_skcipher_setkey(cipher, Kseq, kctx->gk5e->keylength);
 	if (err)
 		goto out_err;
 
 	err = 0;
 
 out_err:
-	crypto_free_hash(hmac);
+	kzfree(desc);
+	crypto_free_shash(hmac);
 	dprintk("%s: returning %d\n", __func__, err);
 	return err;
 }
@@ -914,12 +993,11 @@ out_err:
  * Set the key of cipher kctx->enc.
  */
 int
-krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_blkcipher *cipher,
+krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_skcipher *cipher,
 		       s32 seqnum)
 {
-	struct crypto_hash *hmac;
-	struct hash_desc desc;
-	struct scatterlist sg[1];
+	struct crypto_shash *hmac;
+	struct shash_desc *desc;
 	u8 Kcrypt[GSS_KRB5_MAX_KEYLEN];
 	u8 zeroconstant[4] = {0};
 	u8 seqnumarray[4];
@@ -927,35 +1005,38 @@ krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_blkcipher *cipher,
 
 	dprintk("%s: entered, seqnum %u\n", __func__, seqnum);
 
-	hmac = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC);
+	hmac = crypto_alloc_shash(kctx->gk5e->cksum_name, 0, 0);
 	if (IS_ERR(hmac)) {
 		dprintk("%s: error %ld, allocating hash '%s'\n",
 			__func__, PTR_ERR(hmac), kctx->gk5e->cksum_name);
 		return PTR_ERR(hmac);
 	}
 
-	desc.tfm = hmac;
-	desc.flags = 0;
+	desc = kmalloc(sizeof(*desc), GFP_KERNEL);
+	if (!desc) {
+		dprintk("%s: failed to allocate shash descriptor for '%s'\n",
+			__func__, kctx->gk5e->cksum_name);
+		crypto_free_shash(hmac);
+		return -ENOMEM;
+	}
 
-	err = crypto_hash_init(&desc);
-	if (err)
-		goto out_err;
+	desc->tfm = hmac;
+	desc->flags = 0;
 
 	/* Compute intermediate Kcrypt from session key */
 	for (i = 0; i < kctx->gk5e->keylength; i++)
 		Kcrypt[i] = kctx->Ksess[i] ^ 0xf0;
 
-	err = crypto_hash_setkey(hmac, Kcrypt, kctx->gk5e->keylength);
+	err = crypto_shash_setkey(hmac, Kcrypt, kctx->gk5e->keylength);
 	if (err)
 		goto out_err;
 
-	sg_init_one(sg, zeroconstant, 4);
-	err = crypto_hash_digest(&desc, sg, 4, Kcrypt);
+	err = crypto_shash_digest(desc, zeroconstant, 4, Kcrypt);
 	if (err)
 		goto out_err;
 
 	/* Compute final Kcrypt from the seqnum and intermediate Kcrypt */
-	err = crypto_hash_setkey(hmac, Kcrypt, kctx->gk5e->keylength);
+	err = crypto_shash_setkey(hmac, Kcrypt, kctx->gk5e->keylength);
 	if (err)
 		goto out_err;
 
@@ -964,20 +1045,19 @@ krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_blkcipher *cipher,
 	seqnumarray[2] = (unsigned char) ((seqnum >> 8) & 0xff);
 	seqnumarray[3] = (unsigned char) ((seqnum >> 0) & 0xff);
 
-	sg_set_buf(sg, seqnumarray, 4);
-
-	err = crypto_hash_digest(&desc, sg, 4, Kcrypt);
+	err = crypto_shash_digest(desc, seqnumarray, 4, Kcrypt);
 	if (err)
 		goto out_err;
 
-	err = crypto_blkcipher_setkey(cipher, Kcrypt, kctx->gk5e->keylength);
+	err = crypto_skcipher_setkey(cipher, Kcrypt, kctx->gk5e->keylength);
 	if (err)
 		goto out_err;
 
 	err = 0;
 
 out_err:
-	crypto_free_hash(hmac);
+	kzfree(desc);
+	crypto_free_shash(hmac);
 	dprintk("%s: returning %d\n", __func__, err);
 	return err;
 }
diff --git a/net/sunrpc/auth_gss/gss_krb5_keys.c b/net/sunrpc/auth_gss/gss_krb5_keys.c
index 234fa8d0fd9b..870133146026 100644
--- a/net/sunrpc/auth_gss/gss_krb5_keys.c
+++ b/net/sunrpc/auth_gss/gss_krb5_keys.c
@@ -54,9 +54,9 @@
  * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
  */
 
+#include <crypto/skcipher.h>
 #include <linux/err.h>
 #include <linux/types.h>
-#include <linux/crypto.h>
 #include <linux/sunrpc/gss_krb5.h>
 #include <linux/sunrpc/xdr.h>
 #include <linux/lcm.h>
@@ -147,7 +147,7 @@ u32 krb5_derive_key(const struct gss_krb5_enctype *gk5e,
 	size_t blocksize, keybytes, keylength, n;
 	unsigned char *inblockdata, *outblockdata, *rawkey;
 	struct xdr_netobj inblock, outblock;
-	struct crypto_blkcipher *cipher;
+	struct crypto_skcipher *cipher;
 	u32 ret = EINVAL;
 
 	blocksize = gk5e->blocksize;
@@ -157,11 +157,11 @@ u32 krb5_derive_key(const struct gss_krb5_enctype *gk5e,
 	if ((inkey->len != keylength) || (outkey->len != keylength))
 		goto err_return;
 
-	cipher = crypto_alloc_blkcipher(gk5e->encrypt_name, 0,
-					CRYPTO_ALG_ASYNC);
+	cipher = crypto_alloc_skcipher(gk5e->encrypt_name, 0,
+				       CRYPTO_ALG_ASYNC);
 	if (IS_ERR(cipher))
 		goto err_return;
-	if (crypto_blkcipher_setkey(cipher, inkey->data, inkey->len))
+	if (crypto_skcipher_setkey(cipher, inkey->data, inkey->len))
 		goto err_return;
 
 	/* allocate and set up buffers */
@@ -238,7 +238,7 @@ err_free_in:
 	memset(inblockdata, 0, blocksize);
 	kfree(inblockdata);
 err_free_cipher:
-	crypto_free_blkcipher(cipher);
+	crypto_free_skcipher(cipher);
 err_return:
 	return ret;
 }
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index 28db442a0034..71341ccb9890 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -34,6 +34,8 @@
  *
  */
 
+#include <crypto/hash.h>
+#include <crypto/skcipher.h>
 #include <linux/err.h>
 #include <linux/module.h>
 #include <linux/init.h>
@@ -42,7 +44,6 @@
 #include <linux/sunrpc/auth.h>
 #include <linux/sunrpc/gss_krb5.h>
 #include <linux/sunrpc/xdr.h>
-#include <linux/crypto.h>
 #include <linux/sunrpc/gss_krb5_enctypes.h>
 
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
@@ -217,7 +218,7 @@ simple_get_netobj(const void *p, const void *end, struct xdr_netobj *res)
 
 static inline const void *
 get_key(const void *p, const void *end,
-	struct krb5_ctx *ctx, struct crypto_blkcipher **res)
+	struct krb5_ctx *ctx, struct crypto_skcipher **res)
 {
 	struct xdr_netobj	key;
 	int			alg;
@@ -245,7 +246,7 @@ get_key(const void *p, const void *end,
 	if (IS_ERR(p))
 		goto out_err;
 
-	*res = crypto_alloc_blkcipher(ctx->gk5e->encrypt_name, 0,
+	*res = crypto_alloc_skcipher(ctx->gk5e->encrypt_name, 0,
 							CRYPTO_ALG_ASYNC);
 	if (IS_ERR(*res)) {
 		printk(KERN_WARNING "gss_kerberos_mech: unable to initialize "
@@ -253,7 +254,7 @@ get_key(const void *p, const void *end,
 		*res = NULL;
 		goto out_err_free_key;
 	}
-	if (crypto_blkcipher_setkey(*res, key.data, key.len)) {
+	if (crypto_skcipher_setkey(*res, key.data, key.len)) {
 		printk(KERN_WARNING "gss_kerberos_mech: error setting key for "
 			"crypto algorithm %s\n", ctx->gk5e->encrypt_name);
 		goto out_err_free_tfm;
@@ -263,7 +264,7 @@ get_key(const void *p, const void *end,
 	return p;
 
 out_err_free_tfm:
-	crypto_free_blkcipher(*res);
+	crypto_free_skcipher(*res);
 out_err_free_key:
 	kfree(key.data);
 	p = ERR_PTR(-EINVAL);
@@ -335,30 +336,30 @@ gss_import_v1_context(const void *p, const void *end, struct krb5_ctx *ctx)
 	return 0;
 
 out_err_free_key2:
-	crypto_free_blkcipher(ctx->seq);
+	crypto_free_skcipher(ctx->seq);
 out_err_free_key1:
-	crypto_free_blkcipher(ctx->enc);
+	crypto_free_skcipher(ctx->enc);
 out_err_free_mech:
 	kfree(ctx->mech_used.data);
 out_err:
 	return PTR_ERR(p);
 }
 
-static struct crypto_blkcipher *
+static struct crypto_skcipher *
 context_v2_alloc_cipher(struct krb5_ctx *ctx, const char *cname, u8 *key)
 {
-	struct crypto_blkcipher *cp;
+	struct crypto_skcipher *cp;
 
-	cp = crypto_alloc_blkcipher(cname, 0, CRYPTO_ALG_ASYNC);
+	cp = crypto_alloc_skcipher(cname, 0, CRYPTO_ALG_ASYNC);
 	if (IS_ERR(cp)) {
 		dprintk("gss_kerberos_mech: unable to initialize "
 			"crypto algorithm %s\n", cname);
 		return NULL;
 	}
-	if (crypto_blkcipher_setkey(cp, key, ctx->gk5e->keylength)) {
+	if (crypto_skcipher_setkey(cp, key, ctx->gk5e->keylength)) {
 		dprintk("gss_kerberos_mech: error setting key for "
 			"crypto algorithm %s\n", cname);
-		crypto_free_blkcipher(cp);
+		crypto_free_skcipher(cp);
 		return NULL;
 	}
 	return cp;
@@ -412,9 +413,9 @@ context_derive_keys_des3(struct krb5_ctx *ctx, gfp_t gfp_mask)
 	return 0;
 
 out_free_enc:
-	crypto_free_blkcipher(ctx->enc);
+	crypto_free_skcipher(ctx->enc);
 out_free_seq:
-	crypto_free_blkcipher(ctx->seq);
+	crypto_free_skcipher(ctx->seq);
 out_err:
 	return -EINVAL;
 }
@@ -427,18 +428,17 @@ out_err:
 static int
 context_derive_keys_rc4(struct krb5_ctx *ctx)
 {
-	struct crypto_hash *hmac;
+	struct crypto_shash *hmac;
 	char sigkeyconstant[] = "signaturekey";
 	int slen = strlen(sigkeyconstant) + 1;	/* include null terminator */
-	struct hash_desc desc;
-	struct scatterlist sg[1];
+	struct shash_desc *desc;
 	int err;
 
 	dprintk("RPC:       %s: entered\n", __func__);
 	/*
 	 * derive cksum (aka Ksign) key
 	 */
-	hmac = crypto_alloc_hash(ctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC);
+	hmac = crypto_alloc_shash(ctx->gk5e->cksum_name, 0, 0);
 	if (IS_ERR(hmac)) {
 		dprintk("%s: error %ld allocating hash '%s'\n",
 			__func__, PTR_ERR(hmac), ctx->gk5e->cksum_name);
@@ -446,37 +446,40 @@ context_derive_keys_rc4(struct krb5_ctx *ctx)
 		goto out_err;
 	}
 
-	err = crypto_hash_setkey(hmac, ctx->Ksess, ctx->gk5e->keylength);
+	err = crypto_shash_setkey(hmac, ctx->Ksess, ctx->gk5e->keylength);
 	if (err)
 		goto out_err_free_hmac;
 
-	sg_init_table(sg, 1);
-	sg_set_buf(sg, sigkeyconstant, slen);
 
-	desc.tfm = hmac;
-	desc.flags = 0;
-
-	err = crypto_hash_init(&desc);
-	if (err)
+	desc = kmalloc(sizeof(*desc), GFP_KERNEL);
+	if (!desc) {
+		dprintk("%s: failed to allocate hash descriptor for '%s'\n",
+			__func__, ctx->gk5e->cksum_name);
+		err = -ENOMEM;
 		goto out_err_free_hmac;
+	}
+
+	desc->tfm = hmac;
+	desc->flags = 0;
 
-	err = crypto_hash_digest(&desc, sg, slen, ctx->cksum);
+	err = crypto_shash_digest(desc, sigkeyconstant, slen, ctx->cksum);
+	kzfree(desc);
 	if (err)
 		goto out_err_free_hmac;
 	/*
-	 * allocate hash, and blkciphers for data and seqnum encryption
+	 * allocate hash, and skciphers for data and seqnum encryption
 	 */
-	ctx->enc = crypto_alloc_blkcipher(ctx->gk5e->encrypt_name, 0,
-					  CRYPTO_ALG_ASYNC);
+	ctx->enc = crypto_alloc_skcipher(ctx->gk5e->encrypt_name, 0,
+					 CRYPTO_ALG_ASYNC);
 	if (IS_ERR(ctx->enc)) {
 		err = PTR_ERR(ctx->enc);
 		goto out_err_free_hmac;
 	}
 
-	ctx->seq = crypto_alloc_blkcipher(ctx->gk5e->encrypt_name, 0,
-					  CRYPTO_ALG_ASYNC);
+	ctx->seq = crypto_alloc_skcipher(ctx->gk5e->encrypt_name, 0,
+					 CRYPTO_ALG_ASYNC);
 	if (IS_ERR(ctx->seq)) {
-		crypto_free_blkcipher(ctx->enc);
+		crypto_free_skcipher(ctx->enc);
 		err = PTR_ERR(ctx->seq);
 		goto out_err_free_hmac;
 	}
@@ -486,7 +489,7 @@ context_derive_keys_rc4(struct krb5_ctx *ctx)
 	err = 0;
 
 out_err_free_hmac:
-	crypto_free_hash(hmac);
+	crypto_free_shash(hmac);
 out_err:
 	dprintk("RPC:       %s: returning %d\n", __func__, err);
 	return err;
@@ -588,7 +591,7 @@ context_derive_keys_new(struct krb5_ctx *ctx, gfp_t gfp_mask)
 			context_v2_alloc_cipher(ctx, "cbc(aes)",
 						ctx->acceptor_seal);
 		if (ctx->acceptor_enc_aux == NULL) {
-			crypto_free_blkcipher(ctx->initiator_enc_aux);
+			crypto_free_skcipher(ctx->initiator_enc_aux);
 			goto out_free_acceptor_enc;
 		}
 	}
@@ -596,9 +599,9 @@ context_derive_keys_new(struct krb5_ctx *ctx, gfp_t gfp_mask)
 	return 0;
 
 out_free_acceptor_enc:
-	crypto_free_blkcipher(ctx->acceptor_enc);
+	crypto_free_skcipher(ctx->acceptor_enc);
 out_free_initiator_enc:
-	crypto_free_blkcipher(ctx->initiator_enc);
+	crypto_free_skcipher(ctx->initiator_enc);
 out_err:
 	return -EINVAL;
 }
@@ -710,12 +713,12 @@ static void
 gss_delete_sec_context_kerberos(void *internal_ctx) {
 	struct krb5_ctx *kctx = internal_ctx;
 
-	crypto_free_blkcipher(kctx->seq);
-	crypto_free_blkcipher(kctx->enc);
-	crypto_free_blkcipher(kctx->acceptor_enc);
-	crypto_free_blkcipher(kctx->initiator_enc);
-	crypto_free_blkcipher(kctx->acceptor_enc_aux);
-	crypto_free_blkcipher(kctx->initiator_enc_aux);
+	crypto_free_skcipher(kctx->seq);
+	crypto_free_skcipher(kctx->enc);
+	crypto_free_skcipher(kctx->acceptor_enc);
+	crypto_free_skcipher(kctx->initiator_enc);
+	crypto_free_skcipher(kctx->acceptor_enc_aux);
+	crypto_free_skcipher(kctx->initiator_enc_aux);
 	kfree(kctx->mech_used.data);
 	kfree(kctx);
 }
diff --git a/net/sunrpc/auth_gss/gss_krb5_seqnum.c b/net/sunrpc/auth_gss/gss_krb5_seqnum.c
index 20d55c793eb6..c8b9082f4a9d 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seqnum.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seqnum.c
@@ -31,9 +31,9 @@
  * PERFORMANCE OF THIS SOFTWARE.
  */
 
+#include <crypto/skcipher.h>
 #include <linux/types.h>
 #include <linux/sunrpc/gss_krb5.h>
-#include <linux/crypto.h>
 
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 # define RPCDBG_FACILITY        RPCDBG_AUTH
@@ -43,13 +43,13 @@ static s32
 krb5_make_rc4_seq_num(struct krb5_ctx *kctx, int direction, s32 seqnum,
 		      unsigned char *cksum, unsigned char *buf)
 {
-	struct crypto_blkcipher *cipher;
+	struct crypto_skcipher *cipher;
 	unsigned char plain[8];
 	s32 code;
 
 	dprintk("RPC:       %s:\n", __func__);
-	cipher = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0,
-					CRYPTO_ALG_ASYNC);
+	cipher = crypto_alloc_skcipher(kctx->gk5e->encrypt_name, 0,
+				       CRYPTO_ALG_ASYNC);
 	if (IS_ERR(cipher))
 		return PTR_ERR(cipher);
 
@@ -68,12 +68,12 @@ krb5_make_rc4_seq_num(struct krb5_ctx *kctx, int direction, s32 seqnum,
 
 	code = krb5_encrypt(cipher, cksum, plain, buf, 8);
 out:
-	crypto_free_blkcipher(cipher);
+	crypto_free_skcipher(cipher);
 	return code;
 }
 s32
 krb5_make_seq_num(struct krb5_ctx *kctx,
-		struct crypto_blkcipher *key,
+		struct crypto_skcipher *key,
 		int direction,
 		u32 seqnum,
 		unsigned char *cksum, unsigned char *buf)
@@ -101,13 +101,13 @@ static s32
 krb5_get_rc4_seq_num(struct krb5_ctx *kctx, unsigned char *cksum,
 		     unsigned char *buf, int *direction, s32 *seqnum)
 {
-	struct crypto_blkcipher *cipher;
+	struct crypto_skcipher *cipher;
 	unsigned char plain[8];
 	s32 code;
 
 	dprintk("RPC:       %s:\n", __func__);
-	cipher = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0,
-					CRYPTO_ALG_ASYNC);
+	cipher = crypto_alloc_skcipher(kctx->gk5e->encrypt_name, 0,
+				       CRYPTO_ALG_ASYNC);
 	if (IS_ERR(cipher))
 		return PTR_ERR(cipher);
 
@@ -130,7 +130,7 @@ krb5_get_rc4_seq_num(struct krb5_ctx *kctx, unsigned char *cksum,
 	*seqnum = ((plain[0] << 24) | (plain[1] << 16) |
 					(plain[2] << 8) | (plain[3]));
 out:
-	crypto_free_blkcipher(cipher);
+	crypto_free_skcipher(cipher);
 	return code;
 }
 
@@ -142,7 +142,7 @@ krb5_get_seq_num(struct krb5_ctx *kctx,
 {
 	s32 code;
 	unsigned char plain[8];
-	struct crypto_blkcipher *key = kctx->seq;
+	struct crypto_skcipher *key = kctx->seq;
 
 	dprintk("RPC:       krb5_get_seq_num:\n");
 
diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c
index ca7e92a32f84..765088e4ad84 100644
--- a/net/sunrpc/auth_gss/gss_krb5_wrap.c
+++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c
@@ -28,12 +28,12 @@
  * SUCH DAMAGES.
  */
 
+#include <crypto/skcipher.h>
 #include <linux/types.h>
 #include <linux/jiffies.h>
 #include <linux/sunrpc/gss_krb5.h>
 #include <linux/random.h>
 #include <linux/pagemap.h>
-#include <linux/crypto.h>
 
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 # define RPCDBG_FACILITY	RPCDBG_AUTH
@@ -174,7 +174,7 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset,
 
 	now = get_seconds();
 
-	blocksize = crypto_blkcipher_blocksize(kctx->enc);
+	blocksize = crypto_skcipher_blocksize(kctx->enc);
 	gss_krb5_add_padding(buf, offset, blocksize);
 	BUG_ON((buf->len - offset) % blocksize);
 	plainlen = conflen + buf->len - offset;
@@ -239,10 +239,10 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset,
 		return GSS_S_FAILURE;
 
 	if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) {
-		struct crypto_blkcipher *cipher;
+		struct crypto_skcipher *cipher;
 		int err;
-		cipher = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0,
-						CRYPTO_ALG_ASYNC);
+		cipher = crypto_alloc_skcipher(kctx->gk5e->encrypt_name, 0,
+					       CRYPTO_ALG_ASYNC);
 		if (IS_ERR(cipher))
 			return GSS_S_FAILURE;
 
@@ -250,7 +250,7 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset,
 
 		err = gss_encrypt_xdr_buf(cipher, buf,
 					  offset + headlen - conflen, pages);
-		crypto_free_blkcipher(cipher);
+		crypto_free_skcipher(cipher);
 		if (err)
 			return GSS_S_FAILURE;
 	} else {
@@ -327,18 +327,18 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)
 		return GSS_S_BAD_SIG;
 
 	if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) {
-		struct crypto_blkcipher *cipher;
+		struct crypto_skcipher *cipher;
 		int err;
 
-		cipher = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0,
-						CRYPTO_ALG_ASYNC);
+		cipher = crypto_alloc_skcipher(kctx->gk5e->encrypt_name, 0,
+					       CRYPTO_ALG_ASYNC);
 		if (IS_ERR(cipher))
 			return GSS_S_FAILURE;
 
 		krb5_rc4_setup_enc_key(kctx, cipher, seqnum);
 
 		err = gss_decrypt_xdr_buf(cipher, buf, crypt_offset);
-		crypto_free_blkcipher(cipher);
+		crypto_free_skcipher(cipher);
 		if (err)
 			return GSS_S_DEFECTIVE_TOKEN;
 	} else {
@@ -371,7 +371,7 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)
 	/* Copy the data back to the right position.  XXX: Would probably be
 	 * better to copy and encrypt at the same time. */
 
-	blocksize = crypto_blkcipher_blocksize(kctx->enc);
+	blocksize = crypto_skcipher_blocksize(kctx->enc);
 	data_start = ptr + (GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength) +
 					conflen;
 	orig_start = buf->head[0].iov_base + offset;
@@ -473,7 +473,7 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset,
 	*ptr++ = 0xff;
 	be16ptr = (__be16 *)ptr;
 
-	blocksize = crypto_blkcipher_blocksize(kctx->acceptor_enc);
+	blocksize = crypto_skcipher_blocksize(kctx->acceptor_enc);
 	*be16ptr++ = 0;
 	/* "inner" token header always uses 0 for RRC */
 	*be16ptr++ = 0;
-- 
cgit v1.2.3-59-g8ed1b


From 608fb34cff8b11ebe9e4a951e92fad500cab4fe8 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sun, 24 Jan 2016 21:18:09 +0800
Subject: lib80211: Use skcipher and ahash

This patch replaces uses of blkcipher with skcipher and the long
obsolete hash interface with ahash.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 net/wireless/lib80211_crypt_tkip.c | 99 +++++++++++++++++++++-----------------
 net/wireless/lib80211_crypt_wep.c  | 46 ++++++++++--------
 2 files changed, 81 insertions(+), 64 deletions(-)

(limited to 'net')

diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c
index 3cd819539241..71447cf86306 100644
--- a/net/wireless/lib80211_crypt_tkip.c
+++ b/net/wireless/lib80211_crypt_tkip.c
@@ -29,7 +29,8 @@
 #include <linux/ieee80211.h>
 #include <net/iw_handler.h>
 
-#include <linux/crypto.h>
+#include <crypto/hash.h>
+#include <crypto/skcipher.h>
 #include <linux/crc32.h>
 
 #include <net/lib80211.h>
@@ -63,10 +64,10 @@ struct lib80211_tkip_data {
 
 	int key_idx;
 
-	struct crypto_blkcipher *rx_tfm_arc4;
-	struct crypto_hash *rx_tfm_michael;
-	struct crypto_blkcipher *tx_tfm_arc4;
-	struct crypto_hash *tx_tfm_michael;
+	struct crypto_skcipher *rx_tfm_arc4;
+	struct crypto_ahash *rx_tfm_michael;
+	struct crypto_skcipher *tx_tfm_arc4;
+	struct crypto_ahash *tx_tfm_michael;
 
 	/* scratch buffers for virt_to_page() (crypto API) */
 	u8 rx_hdr[16], tx_hdr[16];
@@ -98,29 +99,29 @@ static void *lib80211_tkip_init(int key_idx)
 
 	priv->key_idx = key_idx;
 
-	priv->tx_tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0,
-						CRYPTO_ALG_ASYNC);
+	priv->tx_tfm_arc4 = crypto_alloc_skcipher("ecb(arc4)", 0,
+						  CRYPTO_ALG_ASYNC);
 	if (IS_ERR(priv->tx_tfm_arc4)) {
 		priv->tx_tfm_arc4 = NULL;
 		goto fail;
 	}
 
-	priv->tx_tfm_michael = crypto_alloc_hash("michael_mic", 0,
-						 CRYPTO_ALG_ASYNC);
+	priv->tx_tfm_michael = crypto_alloc_ahash("michael_mic", 0,
+						  CRYPTO_ALG_ASYNC);
 	if (IS_ERR(priv->tx_tfm_michael)) {
 		priv->tx_tfm_michael = NULL;
 		goto fail;
 	}
 
-	priv->rx_tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0,
-						CRYPTO_ALG_ASYNC);
+	priv->rx_tfm_arc4 = crypto_alloc_skcipher("ecb(arc4)", 0,
+						  CRYPTO_ALG_ASYNC);
 	if (IS_ERR(priv->rx_tfm_arc4)) {
 		priv->rx_tfm_arc4 = NULL;
 		goto fail;
 	}
 
-	priv->rx_tfm_michael = crypto_alloc_hash("michael_mic", 0,
-						 CRYPTO_ALG_ASYNC);
+	priv->rx_tfm_michael = crypto_alloc_ahash("michael_mic", 0,
+						  CRYPTO_ALG_ASYNC);
 	if (IS_ERR(priv->rx_tfm_michael)) {
 		priv->rx_tfm_michael = NULL;
 		goto fail;
@@ -130,14 +131,10 @@ static void *lib80211_tkip_init(int key_idx)
 
       fail:
 	if (priv) {
-		if (priv->tx_tfm_michael)
-			crypto_free_hash(priv->tx_tfm_michael);
-		if (priv->tx_tfm_arc4)
-			crypto_free_blkcipher(priv->tx_tfm_arc4);
-		if (priv->rx_tfm_michael)
-			crypto_free_hash(priv->rx_tfm_michael);
-		if (priv->rx_tfm_arc4)
-			crypto_free_blkcipher(priv->rx_tfm_arc4);
+		crypto_free_ahash(priv->tx_tfm_michael);
+		crypto_free_skcipher(priv->tx_tfm_arc4);
+		crypto_free_ahash(priv->rx_tfm_michael);
+		crypto_free_skcipher(priv->rx_tfm_arc4);
 		kfree(priv);
 	}
 
@@ -148,14 +145,10 @@ static void lib80211_tkip_deinit(void *priv)
 {
 	struct lib80211_tkip_data *_priv = priv;
 	if (_priv) {
-		if (_priv->tx_tfm_michael)
-			crypto_free_hash(_priv->tx_tfm_michael);
-		if (_priv->tx_tfm_arc4)
-			crypto_free_blkcipher(_priv->tx_tfm_arc4);
-		if (_priv->rx_tfm_michael)
-			crypto_free_hash(_priv->rx_tfm_michael);
-		if (_priv->rx_tfm_arc4)
-			crypto_free_blkcipher(_priv->rx_tfm_arc4);
+		crypto_free_ahash(_priv->tx_tfm_michael);
+		crypto_free_skcipher(_priv->tx_tfm_arc4);
+		crypto_free_ahash(_priv->rx_tfm_michael);
+		crypto_free_skcipher(_priv->rx_tfm_arc4);
 	}
 	kfree(priv);
 }
@@ -353,11 +346,12 @@ static int lib80211_tkip_hdr(struct sk_buff *skb, int hdr_len,
 static int lib80211_tkip_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
 {
 	struct lib80211_tkip_data *tkey = priv;
-	struct blkcipher_desc desc = { .tfm = tkey->tx_tfm_arc4 };
+	SKCIPHER_REQUEST_ON_STACK(req, tkey->tx_tfm_arc4);
 	int len;
 	u8 rc4key[16], *pos, *icv;
 	u32 crc;
 	struct scatterlist sg;
+	int err;
 
 	if (tkey->flags & IEEE80211_CRYPTO_TKIP_COUNTERMEASURES) {
 		struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
@@ -382,9 +376,14 @@ static int lib80211_tkip_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
 	icv[2] = crc >> 16;
 	icv[3] = crc >> 24;
 
-	crypto_blkcipher_setkey(tkey->tx_tfm_arc4, rc4key, 16);
+	crypto_skcipher_setkey(tkey->tx_tfm_arc4, rc4key, 16);
 	sg_init_one(&sg, pos, len + 4);
-	return crypto_blkcipher_encrypt(&desc, &sg, &sg, len + 4);
+	skcipher_request_set_tfm(req, tkey->tx_tfm_arc4);
+	skcipher_request_set_callback(req, 0, NULL, NULL);
+	skcipher_request_set_crypt(req, &sg, &sg, len + 4, NULL);
+	err = crypto_skcipher_encrypt(req);
+	skcipher_request_zero(req);
+	return err;
 }
 
 /*
@@ -403,7 +402,7 @@ static inline int tkip_replay_check(u32 iv32_n, u16 iv16_n,
 static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
 {
 	struct lib80211_tkip_data *tkey = priv;
-	struct blkcipher_desc desc = { .tfm = tkey->rx_tfm_arc4 };
+	SKCIPHER_REQUEST_ON_STACK(req, tkey->rx_tfm_arc4);
 	u8 rc4key[16];
 	u8 keyidx, *pos;
 	u32 iv32;
@@ -413,6 +412,7 @@ static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
 	u32 crc;
 	struct scatterlist sg;
 	int plen;
+	int err;
 
 	hdr = (struct ieee80211_hdr *)skb->data;
 
@@ -465,9 +465,14 @@ static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
 
 	plen = skb->len - hdr_len - 12;
 
-	crypto_blkcipher_setkey(tkey->rx_tfm_arc4, rc4key, 16);
+	crypto_skcipher_setkey(tkey->rx_tfm_arc4, rc4key, 16);
 	sg_init_one(&sg, pos, plen + 4);
-	if (crypto_blkcipher_decrypt(&desc, &sg, &sg, plen + 4)) {
+	skcipher_request_set_tfm(req, tkey->rx_tfm_arc4);
+	skcipher_request_set_callback(req, 0, NULL, NULL);
+	skcipher_request_set_crypt(req, &sg, &sg, plen + 4, NULL);
+	err = crypto_skcipher_decrypt(req);
+	skcipher_request_zero(req);
+	if (err) {
 		net_dbg_ratelimited("TKIP: failed to decrypt received packet from %pM\n",
 				    hdr->addr2);
 		return -7;
@@ -505,11 +510,12 @@ static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
 	return keyidx;
 }
 
-static int michael_mic(struct crypto_hash *tfm_michael, u8 * key, u8 * hdr,
+static int michael_mic(struct crypto_ahash *tfm_michael, u8 * key, u8 * hdr,
 		       u8 * data, size_t data_len, u8 * mic)
 {
-	struct hash_desc desc;
+	AHASH_REQUEST_ON_STACK(req, tfm_michael);
 	struct scatterlist sg[2];
+	int err;
 
 	if (tfm_michael == NULL) {
 		pr_warn("%s(): tfm_michael == NULL\n", __func__);
@@ -519,12 +525,15 @@ static int michael_mic(struct crypto_hash *tfm_michael, u8 * key, u8 * hdr,
 	sg_set_buf(&sg[0], hdr, 16);
 	sg_set_buf(&sg[1], data, data_len);
 
-	if (crypto_hash_setkey(tfm_michael, key, 8))
+	if (crypto_ahash_setkey(tfm_michael, key, 8))
 		return -1;
 
-	desc.tfm = tfm_michael;
-	desc.flags = 0;
-	return crypto_hash_digest(&desc, sg, data_len + 16, mic);
+	ahash_request_set_tfm(req, tfm_michael);
+	ahash_request_set_callback(req, 0, NULL, NULL);
+	ahash_request_set_crypt(req, sg, mic, data_len + 16);
+	err = crypto_ahash_digest(req);
+	ahash_request_zero(req);
+	return err;
 }
 
 static void michael_mic_hdr(struct sk_buff *skb, u8 * hdr)
@@ -645,10 +654,10 @@ static int lib80211_tkip_set_key(void *key, int len, u8 * seq, void *priv)
 {
 	struct lib80211_tkip_data *tkey = priv;
 	int keyidx;
-	struct crypto_hash *tfm = tkey->tx_tfm_michael;
-	struct crypto_blkcipher *tfm2 = tkey->tx_tfm_arc4;
-	struct crypto_hash *tfm3 = tkey->rx_tfm_michael;
-	struct crypto_blkcipher *tfm4 = tkey->rx_tfm_arc4;
+	struct crypto_ahash *tfm = tkey->tx_tfm_michael;
+	struct crypto_skcipher *tfm2 = tkey->tx_tfm_arc4;
+	struct crypto_ahash *tfm3 = tkey->rx_tfm_michael;
+	struct crypto_skcipher *tfm4 = tkey->rx_tfm_arc4;
 
 	keyidx = tkey->key_idx;
 	memset(tkey, 0, sizeof(*tkey));
diff --git a/net/wireless/lib80211_crypt_wep.c b/net/wireless/lib80211_crypt_wep.c
index 1c292e4ea7b6..d05f58b0fd04 100644
--- a/net/wireless/lib80211_crypt_wep.c
+++ b/net/wireless/lib80211_crypt_wep.c
@@ -22,7 +22,7 @@
 
 #include <net/lib80211.h>
 
-#include <linux/crypto.h>
+#include <crypto/skcipher.h>
 #include <linux/crc32.h>
 
 MODULE_AUTHOR("Jouni Malinen");
@@ -35,8 +35,8 @@ struct lib80211_wep_data {
 	u8 key[WEP_KEY_LEN + 1];
 	u8 key_len;
 	u8 key_idx;
-	struct crypto_blkcipher *tx_tfm;
-	struct crypto_blkcipher *rx_tfm;
+	struct crypto_skcipher *tx_tfm;
+	struct crypto_skcipher *rx_tfm;
 };
 
 static void *lib80211_wep_init(int keyidx)
@@ -48,13 +48,13 @@ static void *lib80211_wep_init(int keyidx)
 		goto fail;
 	priv->key_idx = keyidx;
 
-	priv->tx_tfm = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
+	priv->tx_tfm = crypto_alloc_skcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
 	if (IS_ERR(priv->tx_tfm)) {
 		priv->tx_tfm = NULL;
 		goto fail;
 	}
 
-	priv->rx_tfm = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
+	priv->rx_tfm = crypto_alloc_skcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
 	if (IS_ERR(priv->rx_tfm)) {
 		priv->rx_tfm = NULL;
 		goto fail;
@@ -66,10 +66,8 @@ static void *lib80211_wep_init(int keyidx)
 
       fail:
 	if (priv) {
-		if (priv->tx_tfm)
-			crypto_free_blkcipher(priv->tx_tfm);
-		if (priv->rx_tfm)
-			crypto_free_blkcipher(priv->rx_tfm);
+		crypto_free_skcipher(priv->tx_tfm);
+		crypto_free_skcipher(priv->rx_tfm);
 		kfree(priv);
 	}
 	return NULL;
@@ -79,10 +77,8 @@ static void lib80211_wep_deinit(void *priv)
 {
 	struct lib80211_wep_data *_priv = priv;
 	if (_priv) {
-		if (_priv->tx_tfm)
-			crypto_free_blkcipher(_priv->tx_tfm);
-		if (_priv->rx_tfm)
-			crypto_free_blkcipher(_priv->rx_tfm);
+		crypto_free_skcipher(_priv->tx_tfm);
+		crypto_free_skcipher(_priv->rx_tfm);
 	}
 	kfree(priv);
 }
@@ -133,11 +129,12 @@ static int lib80211_wep_build_iv(struct sk_buff *skb, int hdr_len,
 static int lib80211_wep_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
 {
 	struct lib80211_wep_data *wep = priv;
-	struct blkcipher_desc desc = { .tfm = wep->tx_tfm };
+	SKCIPHER_REQUEST_ON_STACK(req, wep->tx_tfm);
 	u32 crc, klen, len;
 	u8 *pos, *icv;
 	struct scatterlist sg;
 	u8 key[WEP_KEY_LEN + 3];
+	int err;
 
 	/* other checks are in lib80211_wep_build_iv */
 	if (skb_tailroom(skb) < 4)
@@ -165,9 +162,14 @@ static int lib80211_wep_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
 	icv[2] = crc >> 16;
 	icv[3] = crc >> 24;
 
-	crypto_blkcipher_setkey(wep->tx_tfm, key, klen);
+	crypto_skcipher_setkey(wep->tx_tfm, key, klen);
 	sg_init_one(&sg, pos, len + 4);
-	return crypto_blkcipher_encrypt(&desc, &sg, &sg, len + 4);
+	skcipher_request_set_tfm(req, wep->tx_tfm);
+	skcipher_request_set_callback(req, 0, NULL, NULL);
+	skcipher_request_set_crypt(req, &sg, &sg, len + 4, NULL);
+	err = crypto_skcipher_encrypt(req);
+	skcipher_request_zero(req);
+	return err;
 }
 
 /* Perform WEP decryption on given buffer. Buffer includes whole WEP part of
@@ -180,11 +182,12 @@ static int lib80211_wep_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
 static int lib80211_wep_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
 {
 	struct lib80211_wep_data *wep = priv;
-	struct blkcipher_desc desc = { .tfm = wep->rx_tfm };
+	SKCIPHER_REQUEST_ON_STACK(req, wep->rx_tfm);
 	u32 crc, klen, plen;
 	u8 key[WEP_KEY_LEN + 3];
 	u8 keyidx, *pos, icv[4];
 	struct scatterlist sg;
+	int err;
 
 	if (skb->len < hdr_len + 8)
 		return -1;
@@ -205,9 +208,14 @@ static int lib80211_wep_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
 	/* Apply RC4 to data and compute CRC32 over decrypted data */
 	plen = skb->len - hdr_len - 8;
 
-	crypto_blkcipher_setkey(wep->rx_tfm, key, klen);
+	crypto_skcipher_setkey(wep->rx_tfm, key, klen);
 	sg_init_one(&sg, pos, plen + 4);
-	if (crypto_blkcipher_decrypt(&desc, &sg, &sg, plen + 4))
+	skcipher_request_set_tfm(req, wep->rx_tfm);
+	skcipher_request_set_callback(req, 0, NULL, NULL);
+	skcipher_request_set_crypt(req, &sg, &sg, plen + 4, NULL);
+	err = crypto_skcipher_decrypt(req);
+	skcipher_request_zero(req);
+	if (err)
 		return -7;
 
 	crc = ~crc32_le(~0, pos, plen);
-- 
cgit v1.2.3-59-g8ed1b


From 71af2f6bb22a4bf42663e10f1d8913d4967ed07f Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sun, 24 Jan 2016 21:18:30 +0800
Subject: Bluetooth: Use skcipher and hash

This patch replaces uses of blkcipher with skcipher and the long
obsolete hash interface with shash.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Acked-by: Marcel Holtmann <marcel@holtmann.org>
---
 net/bluetooth/smp.c | 135 ++++++++++++++++++++++++----------------------------
 1 file changed, 63 insertions(+), 72 deletions(-)

(limited to 'net')

diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index ffed8a1d4f27..78ad7d691fbe 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -21,9 +21,10 @@
 */
 
 #include <linux/debugfs.h>
-#include <linux/crypto.h>
 #include <linux/scatterlist.h>
 #include <crypto/b128ops.h>
+#include <crypto/hash.h>
+#include <crypto/skcipher.h>
 
 #include <net/bluetooth/bluetooth.h>
 #include <net/bluetooth/hci_core.h>
@@ -87,8 +88,8 @@ struct smp_dev {
 	u8			min_key_size;
 	u8			max_key_size;
 
-	struct crypto_blkcipher	*tfm_aes;
-	struct crypto_hash	*tfm_cmac;
+	struct crypto_skcipher	*tfm_aes;
+	struct crypto_shash	*tfm_cmac;
 };
 
 struct smp_chan {
@@ -126,8 +127,8 @@ struct smp_chan {
 	u8			dhkey[32];
 	u8			mackey[16];
 
-	struct crypto_blkcipher	*tfm_aes;
-	struct crypto_hash	*tfm_cmac;
+	struct crypto_skcipher	*tfm_aes;
+	struct crypto_shash	*tfm_cmac;
 };
 
 /* These debug key values are defined in the SMP section of the core
@@ -165,12 +166,11 @@ static inline void swap_buf(const u8 *src, u8 *dst, size_t len)
  * AES-CMAC, f4, f5, f6, g2 and h6.
  */
 
-static int aes_cmac(struct crypto_hash *tfm, const u8 k[16], const u8 *m,
+static int aes_cmac(struct crypto_shash *tfm, const u8 k[16], const u8 *m,
 		    size_t len, u8 mac[16])
 {
 	uint8_t tmp[16], mac_msb[16], msg_msb[CMAC_MSG_MAX];
-	struct hash_desc desc;
-	struct scatterlist sg;
+	SHASH_DESC_ON_STACK(desc, tfm);
 	int err;
 
 	if (len > CMAC_MSG_MAX)
@@ -181,10 +181,8 @@ static int aes_cmac(struct crypto_hash *tfm, const u8 k[16], const u8 *m,
 		return -EINVAL;
 	}
 
-	desc.tfm = tfm;
-	desc.flags = 0;
-
-	crypto_hash_init(&desc);
+	desc->tfm = tfm;
+	desc->flags = 0;
 
 	/* Swap key and message from LSB to MSB */
 	swap_buf(k, tmp, 16);
@@ -193,23 +191,16 @@ static int aes_cmac(struct crypto_hash *tfm, const u8 k[16], const u8 *m,
 	SMP_DBG("msg (len %zu) %*phN", len, (int) len, m);
 	SMP_DBG("key %16phN", k);
 
-	err = crypto_hash_setkey(tfm, tmp, 16);
+	err = crypto_shash_setkey(tfm, tmp, 16);
 	if (err) {
 		BT_ERR("cipher setkey failed: %d", err);
 		return err;
 	}
 
-	sg_init_one(&sg, msg_msb, len);
-
-	err = crypto_hash_update(&desc, &sg, len);
+	err = crypto_shash_digest(desc, msg_msb, len, mac_msb);
+	shash_desc_zero(desc);
 	if (err) {
-		BT_ERR("Hash update error %d", err);
-		return err;
-	}
-
-	err = crypto_hash_final(&desc, mac_msb);
-	if (err) {
-		BT_ERR("Hash final error %d", err);
+		BT_ERR("Hash computation error %d", err);
 		return err;
 	}
 
@@ -220,8 +211,8 @@ static int aes_cmac(struct crypto_hash *tfm, const u8 k[16], const u8 *m,
 	return 0;
 }
 
-static int smp_f4(struct crypto_hash *tfm_cmac, const u8 u[32], const u8 v[32],
-		  const u8 x[16], u8 z, u8 res[16])
+static int smp_f4(struct crypto_shash *tfm_cmac, const u8 u[32],
+		  const u8 v[32], const u8 x[16], u8 z, u8 res[16])
 {
 	u8 m[65];
 	int err;
@@ -243,7 +234,7 @@ static int smp_f4(struct crypto_hash *tfm_cmac, const u8 u[32], const u8 v[32],
 	return err;
 }
 
-static int smp_f5(struct crypto_hash *tfm_cmac, const u8 w[32],
+static int smp_f5(struct crypto_shash *tfm_cmac, const u8 w[32],
 		  const u8 n1[16], const u8 n2[16], const u8 a1[7],
 		  const u8 a2[7], u8 mackey[16], u8 ltk[16])
 {
@@ -296,7 +287,7 @@ static int smp_f5(struct crypto_hash *tfm_cmac, const u8 w[32],
 	return 0;
 }
 
-static int smp_f6(struct crypto_hash *tfm_cmac, const u8 w[16],
+static int smp_f6(struct crypto_shash *tfm_cmac, const u8 w[16],
 		  const u8 n1[16], const u8 n2[16], const u8 r[16],
 		  const u8 io_cap[3], const u8 a1[7], const u8 a2[7],
 		  u8 res[16])
@@ -324,7 +315,7 @@ static int smp_f6(struct crypto_hash *tfm_cmac, const u8 w[16],
 	return err;
 }
 
-static int smp_g2(struct crypto_hash *tfm_cmac, const u8 u[32], const u8 v[32],
+static int smp_g2(struct crypto_shash *tfm_cmac, const u8 u[32], const u8 v[32],
 		  const u8 x[16], const u8 y[16], u32 *val)
 {
 	u8 m[80], tmp[16];
@@ -350,7 +341,7 @@ static int smp_g2(struct crypto_hash *tfm_cmac, const u8 u[32], const u8 v[32],
 	return 0;
 }
 
-static int smp_h6(struct crypto_hash *tfm_cmac, const u8 w[16],
+static int smp_h6(struct crypto_shash *tfm_cmac, const u8 w[16],
 		  const u8 key_id[4], u8 res[16])
 {
 	int err;
@@ -370,9 +361,9 @@ static int smp_h6(struct crypto_hash *tfm_cmac, const u8 w[16],
  * s1 and ah.
  */
 
-static int smp_e(struct crypto_blkcipher *tfm, const u8 *k, u8 *r)
+static int smp_e(struct crypto_skcipher *tfm, const u8 *k, u8 *r)
 {
-	struct blkcipher_desc desc;
+	SKCIPHER_REQUEST_ON_STACK(req, tfm);
 	struct scatterlist sg;
 	uint8_t tmp[16], data[16];
 	int err;
@@ -384,13 +375,10 @@ static int smp_e(struct crypto_blkcipher *tfm, const u8 *k, u8 *r)
 		return -EINVAL;
 	}
 
-	desc.tfm = tfm;
-	desc.flags = 0;
-
 	/* The most significant octet of key corresponds to k[0] */
 	swap_buf(k, tmp, 16);
 
-	err = crypto_blkcipher_setkey(tfm, tmp, 16);
+	err = crypto_skcipher_setkey(tfm, tmp, 16);
 	if (err) {
 		BT_ERR("cipher setkey failed: %d", err);
 		return err;
@@ -401,7 +389,12 @@ static int smp_e(struct crypto_blkcipher *tfm, const u8 *k, u8 *r)
 
 	sg_init_one(&sg, data, 16);
 
-	err = crypto_blkcipher_encrypt(&desc, &sg, &sg, 16);
+	skcipher_request_set_tfm(req, tfm);
+	skcipher_request_set_callback(req, 0, NULL, NULL);
+	skcipher_request_set_crypt(req, &sg, &sg, 16, NULL);
+
+	err = crypto_skcipher_encrypt(req);
+	skcipher_request_zero(req);
 	if (err)
 		BT_ERR("Encrypt data error %d", err);
 
@@ -413,7 +406,7 @@ static int smp_e(struct crypto_blkcipher *tfm, const u8 *k, u8 *r)
 	return err;
 }
 
-static int smp_c1(struct crypto_blkcipher *tfm_aes, const u8 k[16],
+static int smp_c1(struct crypto_skcipher *tfm_aes, const u8 k[16],
 		  const u8 r[16], const u8 preq[7], const u8 pres[7], u8 _iat,
 		  const bdaddr_t *ia, u8 _rat, const bdaddr_t *ra, u8 res[16])
 {
@@ -462,7 +455,7 @@ static int smp_c1(struct crypto_blkcipher *tfm_aes, const u8 k[16],
 	return err;
 }
 
-static int smp_s1(struct crypto_blkcipher *tfm_aes, const u8 k[16],
+static int smp_s1(struct crypto_skcipher *tfm_aes, const u8 k[16],
 		  const u8 r1[16], const u8 r2[16], u8 _r[16])
 {
 	int err;
@@ -478,7 +471,7 @@ static int smp_s1(struct crypto_blkcipher *tfm_aes, const u8 k[16],
 	return err;
 }
 
-static int smp_ah(struct crypto_blkcipher *tfm, const u8 irk[16],
+static int smp_ah(struct crypto_skcipher *tfm, const u8 irk[16],
 		  const u8 r[3], u8 res[3])
 {
 	u8 _res[16];
@@ -766,8 +759,8 @@ static void smp_chan_destroy(struct l2cap_conn *conn)
 	kzfree(smp->slave_csrk);
 	kzfree(smp->link_key);
 
-	crypto_free_blkcipher(smp->tfm_aes);
-	crypto_free_hash(smp->tfm_cmac);
+	crypto_free_skcipher(smp->tfm_aes);
+	crypto_free_shash(smp->tfm_cmac);
 
 	/* Ensure that we don't leave any debug key around if debug key
 	 * support hasn't been explicitly enabled.
@@ -1382,17 +1375,17 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn)
 	if (!smp)
 		return NULL;
 
-	smp->tfm_aes = crypto_alloc_blkcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC);
+	smp->tfm_aes = crypto_alloc_skcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC);
 	if (IS_ERR(smp->tfm_aes)) {
 		BT_ERR("Unable to create ECB crypto context");
 		kzfree(smp);
 		return NULL;
 	}
 
-	smp->tfm_cmac = crypto_alloc_hash("cmac(aes)", 0, CRYPTO_ALG_ASYNC);
+	smp->tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0);
 	if (IS_ERR(smp->tfm_cmac)) {
 		BT_ERR("Unable to create CMAC crypto context");
-		crypto_free_blkcipher(smp->tfm_aes);
+		crypto_free_skcipher(smp->tfm_aes);
 		kzfree(smp);
 		return NULL;
 	}
@@ -3143,8 +3136,8 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid)
 {
 	struct l2cap_chan *chan;
 	struct smp_dev *smp;
-	struct crypto_blkcipher *tfm_aes;
-	struct crypto_hash *tfm_cmac;
+	struct crypto_skcipher *tfm_aes;
+	struct crypto_shash *tfm_cmac;
 
 	if (cid == L2CAP_CID_SMP_BREDR) {
 		smp = NULL;
@@ -3155,17 +3148,17 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid)
 	if (!smp)
 		return ERR_PTR(-ENOMEM);
 
-	tfm_aes = crypto_alloc_blkcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC);
+	tfm_aes = crypto_alloc_skcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC);
 	if (IS_ERR(tfm_aes)) {
 		BT_ERR("Unable to create ECB crypto context");
 		kzfree(smp);
 		return ERR_CAST(tfm_aes);
 	}
 
-	tfm_cmac = crypto_alloc_hash("cmac(aes)", 0, CRYPTO_ALG_ASYNC);
+	tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0);
 	if (IS_ERR(tfm_cmac)) {
 		BT_ERR("Unable to create CMAC crypto context");
-		crypto_free_blkcipher(tfm_aes);
+		crypto_free_skcipher(tfm_aes);
 		kzfree(smp);
 		return ERR_CAST(tfm_cmac);
 	}
@@ -3179,8 +3172,8 @@ create_chan:
 	chan = l2cap_chan_create();
 	if (!chan) {
 		if (smp) {
-			crypto_free_blkcipher(smp->tfm_aes);
-			crypto_free_hash(smp->tfm_cmac);
+			crypto_free_skcipher(smp->tfm_aes);
+			crypto_free_shash(smp->tfm_cmac);
 			kzfree(smp);
 		}
 		return ERR_PTR(-ENOMEM);
@@ -3226,10 +3219,8 @@ static void smp_del_chan(struct l2cap_chan *chan)
 	smp = chan->data;
 	if (smp) {
 		chan->data = NULL;
-		if (smp->tfm_aes)
-			crypto_free_blkcipher(smp->tfm_aes);
-		if (smp->tfm_cmac)
-			crypto_free_hash(smp->tfm_cmac);
+		crypto_free_skcipher(smp->tfm_aes);
+		crypto_free_shash(smp->tfm_cmac);
 		kzfree(smp);
 	}
 
@@ -3465,7 +3456,7 @@ void smp_unregister(struct hci_dev *hdev)
 
 #if IS_ENABLED(CONFIG_BT_SELFTEST_SMP)
 
-static int __init test_ah(struct crypto_blkcipher *tfm_aes)
+static int __init test_ah(struct crypto_skcipher *tfm_aes)
 {
 	const u8 irk[16] = {
 			0x9b, 0x7d, 0x39, 0x0a, 0xa6, 0x10, 0x10, 0x34,
@@ -3485,7 +3476,7 @@ static int __init test_ah(struct crypto_blkcipher *tfm_aes)
 	return 0;
 }
 
-static int __init test_c1(struct crypto_blkcipher *tfm_aes)
+static int __init test_c1(struct crypto_skcipher *tfm_aes)
 {
 	const u8 k[16] = {
 			0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -3515,7 +3506,7 @@ static int __init test_c1(struct crypto_blkcipher *tfm_aes)
 	return 0;
 }
 
-static int __init test_s1(struct crypto_blkcipher *tfm_aes)
+static int __init test_s1(struct crypto_skcipher *tfm_aes)
 {
 	const u8 k[16] = {
 			0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -3540,7 +3531,7 @@ static int __init test_s1(struct crypto_blkcipher *tfm_aes)
 	return 0;
 }
 
-static int __init test_f4(struct crypto_hash *tfm_cmac)
+static int __init test_f4(struct crypto_shash *tfm_cmac)
 {
 	const u8 u[32] = {
 			0xe6, 0x9d, 0x35, 0x0e, 0x48, 0x01, 0x03, 0xcc,
@@ -3572,7 +3563,7 @@ static int __init test_f4(struct crypto_hash *tfm_cmac)
 	return 0;
 }
 
-static int __init test_f5(struct crypto_hash *tfm_cmac)
+static int __init test_f5(struct crypto_shash *tfm_cmac)
 {
 	const u8 w[32] = {
 			0x98, 0xa6, 0xbf, 0x73, 0xf3, 0x34, 0x8d, 0x86,
@@ -3609,7 +3600,7 @@ static int __init test_f5(struct crypto_hash *tfm_cmac)
 	return 0;
 }
 
-static int __init test_f6(struct crypto_hash *tfm_cmac)
+static int __init test_f6(struct crypto_shash *tfm_cmac)
 {
 	const u8 w[16] = {
 			0x20, 0x6e, 0x63, 0xce, 0x20, 0x6a, 0x3f, 0xfd,
@@ -3642,7 +3633,7 @@ static int __init test_f6(struct crypto_hash *tfm_cmac)
 	return 0;
 }
 
-static int __init test_g2(struct crypto_hash *tfm_cmac)
+static int __init test_g2(struct crypto_shash *tfm_cmac)
 {
 	const u8 u[32] = {
 			0xe6, 0x9d, 0x35, 0x0e, 0x48, 0x01, 0x03, 0xcc,
@@ -3674,7 +3665,7 @@ static int __init test_g2(struct crypto_hash *tfm_cmac)
 	return 0;
 }
 
-static int __init test_h6(struct crypto_hash *tfm_cmac)
+static int __init test_h6(struct crypto_shash *tfm_cmac)
 {
 	const u8 w[16] = {
 			0x9b, 0x7d, 0x39, 0x0a, 0xa6, 0x10, 0x10, 0x34,
@@ -3711,8 +3702,8 @@ static const struct file_operations test_smp_fops = {
 	.llseek		= default_llseek,
 };
 
-static int __init run_selftests(struct crypto_blkcipher *tfm_aes,
-				struct crypto_hash *tfm_cmac)
+static int __init run_selftests(struct crypto_skcipher *tfm_aes,
+				struct crypto_shash *tfm_cmac)
 {
 	ktime_t calltime, delta, rettime;
 	unsigned long long duration;
@@ -3789,27 +3780,27 @@ done:
 
 int __init bt_selftest_smp(void)
 {
-	struct crypto_blkcipher *tfm_aes;
-	struct crypto_hash *tfm_cmac;
+	struct crypto_skcipher *tfm_aes;
+	struct crypto_shash *tfm_cmac;
 	int err;
 
-	tfm_aes = crypto_alloc_blkcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC);
+	tfm_aes = crypto_alloc_skcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC);
 	if (IS_ERR(tfm_aes)) {
 		BT_ERR("Unable to create ECB crypto context");
 		return PTR_ERR(tfm_aes);
 	}
 
-	tfm_cmac = crypto_alloc_hash("cmac(aes)", 0, CRYPTO_ALG_ASYNC);
+	tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, CRYPTO_ALG_ASYNC);
 	if (IS_ERR(tfm_cmac)) {
 		BT_ERR("Unable to create CMAC crypto context");
-		crypto_free_blkcipher(tfm_aes);
+		crypto_free_skcipher(tfm_aes);
 		return PTR_ERR(tfm_cmac);
 	}
 
 	err = run_selftests(tfm_aes, tfm_cmac);
 
-	crypto_free_hash(tfm_cmac);
-	crypto_free_blkcipher(tfm_aes);
+	crypto_free_shash(tfm_cmac);
+	crypto_free_skcipher(tfm_aes);
 
 	return err;
 }
-- 
cgit v1.2.3-59-g8ed1b


From e59dd982d355a40f08a666ce3ee3feea2af86959 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sun, 24 Jan 2016 21:18:40 +0800
Subject: libceph: Use skcipher

This patch replaces uses of blkcipher with skcipher.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 net/ceph/crypto.c | 97 ++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 56 insertions(+), 41 deletions(-)

(limited to 'net')

diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
index 42e8649c6e79..fb9cb2bb0894 100644
--- a/net/ceph/crypto.c
+++ b/net/ceph/crypto.c
@@ -4,7 +4,8 @@
 #include <linux/err.h>
 #include <linux/scatterlist.h>
 #include <linux/slab.h>
-#include <crypto/hash.h>
+#include <crypto/aes.h>
+#include <crypto/skcipher.h>
 #include <linux/key-type.h>
 
 #include <keys/ceph-type.h>
@@ -79,9 +80,9 @@ int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
 	return 0;
 }
 
-static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
+static struct crypto_skcipher *ceph_crypto_alloc_cipher(void)
 {
-	return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
+	return crypto_alloc_skcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
 }
 
 static const u8 *aes_iv = (u8 *)CEPH_AES_IV;
@@ -162,11 +163,11 @@ static int ceph_aes_encrypt(const void *key, int key_len,
 {
 	struct scatterlist sg_in[2], prealloc_sg;
 	struct sg_table sg_out;
-	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
-	struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
+	struct crypto_skcipher *tfm = ceph_crypto_alloc_cipher();
+	SKCIPHER_REQUEST_ON_STACK(req, tfm);
 	int ret;
-	void *iv;
-	int ivsize;
+	int ivsize = AES_BLOCK_SIZE;
+	char iv[ivsize];
 	size_t zero_padding = (0x10 - (src_len & 0x0f));
 	char pad[16];
 
@@ -184,11 +185,14 @@ static int ceph_aes_encrypt(const void *key, int key_len,
 	if (ret)
 		goto out_tfm;
 
-	crypto_blkcipher_setkey((void *)tfm, key, key_len);
-	iv = crypto_blkcipher_crt(tfm)->iv;
-	ivsize = crypto_blkcipher_ivsize(tfm);
+	crypto_skcipher_setkey((void *)tfm, key, key_len);
 	memcpy(iv, aes_iv, ivsize);
 
+	skcipher_request_set_tfm(req, tfm);
+	skcipher_request_set_callback(req, 0, NULL, NULL);
+	skcipher_request_set_crypt(req, sg_in, sg_out.sgl,
+				   src_len + zero_padding, iv);
+
 	/*
 	print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
 		       key, key_len, 1);
@@ -197,8 +201,8 @@ static int ceph_aes_encrypt(const void *key, int key_len,
 	print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
 			pad, zero_padding, 1);
 	*/
-	ret = crypto_blkcipher_encrypt(&desc, sg_out.sgl, sg_in,
-				     src_len + zero_padding);
+	ret = crypto_skcipher_encrypt(req);
+	skcipher_request_zero(req);
 	if (ret < 0) {
 		pr_err("ceph_aes_crypt failed %d\n", ret);
 		goto out_sg;
@@ -211,7 +215,7 @@ static int ceph_aes_encrypt(const void *key, int key_len,
 out_sg:
 	teardown_sgtable(&sg_out);
 out_tfm:
-	crypto_free_blkcipher(tfm);
+	crypto_free_skcipher(tfm);
 	return ret;
 }
 
@@ -222,11 +226,11 @@ static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
 {
 	struct scatterlist sg_in[3], prealloc_sg;
 	struct sg_table sg_out;
-	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
-	struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
+	struct crypto_skcipher *tfm = ceph_crypto_alloc_cipher();
+	SKCIPHER_REQUEST_ON_STACK(req, tfm);
 	int ret;
-	void *iv;
-	int ivsize;
+	int ivsize = AES_BLOCK_SIZE;
+	char iv[ivsize];
 	size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
 	char pad[16];
 
@@ -245,11 +249,14 @@ static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
 	if (ret)
 		goto out_tfm;
 
-	crypto_blkcipher_setkey((void *)tfm, key, key_len);
-	iv = crypto_blkcipher_crt(tfm)->iv;
-	ivsize = crypto_blkcipher_ivsize(tfm);
+	crypto_skcipher_setkey((void *)tfm, key, key_len);
 	memcpy(iv, aes_iv, ivsize);
 
+	skcipher_request_set_tfm(req, tfm);
+	skcipher_request_set_callback(req, 0, NULL, NULL);
+	skcipher_request_set_crypt(req, sg_in, sg_out.sgl,
+				   src1_len + src2_len + zero_padding, iv);
+
 	/*
 	print_hex_dump(KERN_ERR, "enc  key: ", DUMP_PREFIX_NONE, 16, 1,
 		       key, key_len, 1);
@@ -260,8 +267,8 @@ static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
 	print_hex_dump(KERN_ERR, "enc  pad: ", DUMP_PREFIX_NONE, 16, 1,
 			pad, zero_padding, 1);
 	*/
-	ret = crypto_blkcipher_encrypt(&desc, sg_out.sgl, sg_in,
-				     src1_len + src2_len + zero_padding);
+	ret = crypto_skcipher_encrypt(req);
+	skcipher_request_zero(req);
 	if (ret < 0) {
 		pr_err("ceph_aes_crypt2 failed %d\n", ret);
 		goto out_sg;
@@ -274,7 +281,7 @@ static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
 out_sg:
 	teardown_sgtable(&sg_out);
 out_tfm:
-	crypto_free_blkcipher(tfm);
+	crypto_free_skcipher(tfm);
 	return ret;
 }
 
@@ -284,11 +291,11 @@ static int ceph_aes_decrypt(const void *key, int key_len,
 {
 	struct sg_table sg_in;
 	struct scatterlist sg_out[2], prealloc_sg;
-	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
-	struct blkcipher_desc desc = { .tfm = tfm };
+	struct crypto_skcipher *tfm = ceph_crypto_alloc_cipher();
+	SKCIPHER_REQUEST_ON_STACK(req, tfm);
 	char pad[16];
-	void *iv;
-	int ivsize;
+	int ivsize = AES_BLOCK_SIZE;
+	char iv[16];
 	int ret;
 	int last_byte;
 
@@ -302,18 +309,22 @@ static int ceph_aes_decrypt(const void *key, int key_len,
 	if (ret)
 		goto out_tfm;
 
-	crypto_blkcipher_setkey((void *)tfm, key, key_len);
-	iv = crypto_blkcipher_crt(tfm)->iv;
-	ivsize = crypto_blkcipher_ivsize(tfm);
+	crypto_skcipher_setkey((void *)tfm, key, key_len);
 	memcpy(iv, aes_iv, ivsize);
 
+	skcipher_request_set_tfm(req, tfm);
+	skcipher_request_set_callback(req, 0, NULL, NULL);
+	skcipher_request_set_crypt(req, sg_in.sgl, sg_out,
+				   src_len, iv);
+
 	/*
 	print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
 		       key, key_len, 1);
 	print_hex_dump(KERN_ERR, "dec  in: ", DUMP_PREFIX_NONE, 16, 1,
 		       src, src_len, 1);
 	*/
-	ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in.sgl, src_len);
+	ret = crypto_skcipher_decrypt(req);
+	skcipher_request_zero(req);
 	if (ret < 0) {
 		pr_err("ceph_aes_decrypt failed %d\n", ret);
 		goto out_sg;
@@ -338,7 +349,7 @@ static int ceph_aes_decrypt(const void *key, int key_len,
 out_sg:
 	teardown_sgtable(&sg_in);
 out_tfm:
-	crypto_free_blkcipher(tfm);
+	crypto_free_skcipher(tfm);
 	return ret;
 }
 
@@ -349,11 +360,11 @@ static int ceph_aes_decrypt2(const void *key, int key_len,
 {
 	struct sg_table sg_in;
 	struct scatterlist sg_out[3], prealloc_sg;
-	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
-	struct blkcipher_desc desc = { .tfm = tfm };
+	struct crypto_skcipher *tfm = ceph_crypto_alloc_cipher();
+	SKCIPHER_REQUEST_ON_STACK(req, tfm);
 	char pad[16];
-	void *iv;
-	int ivsize;
+	int ivsize = AES_BLOCK_SIZE;
+	char iv[ivsize];
 	int ret;
 	int last_byte;
 
@@ -368,18 +379,22 @@ static int ceph_aes_decrypt2(const void *key, int key_len,
 	if (ret)
 		goto out_tfm;
 
-	crypto_blkcipher_setkey((void *)tfm, key, key_len);
-	iv = crypto_blkcipher_crt(tfm)->iv;
-	ivsize = crypto_blkcipher_ivsize(tfm);
+	crypto_skcipher_setkey((void *)tfm, key, key_len);
 	memcpy(iv, aes_iv, ivsize);
 
+	skcipher_request_set_tfm(req, tfm);
+	skcipher_request_set_callback(req, 0, NULL, NULL);
+	skcipher_request_set_crypt(req, sg_in.sgl, sg_out,
+				   src_len, iv);
+
 	/*
 	print_hex_dump(KERN_ERR, "dec  key: ", DUMP_PREFIX_NONE, 16, 1,
 		       key, key_len, 1);
 	print_hex_dump(KERN_ERR, "dec   in: ", DUMP_PREFIX_NONE, 16, 1,
 		       src, src_len, 1);
 	*/
-	ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in.sgl, src_len);
+	ret = crypto_skcipher_decrypt(req);
+	skcipher_request_zero(req);
 	if (ret < 0) {
 		pr_err("ceph_aes_decrypt failed %d\n", ret);
 		goto out_sg;
@@ -415,7 +430,7 @@ static int ceph_aes_decrypt2(const void *key, int key_len,
 out_sg:
 	teardown_sgtable(&sg_in);
 out_tfm:
-	crypto_free_blkcipher(tfm);
+	crypto_free_skcipher(tfm);
 	return ret;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 96953718623eb7ee8839ae667dfecad7b257d400 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sun, 24 Jan 2016 21:18:50 +0800
Subject: mac802154: Use skcipher

This patch replaces uses of blkcipher with skcipher.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Reviewed-by: Stefan Schmidt <stefan@osg.samsung.com>
---
 net/mac802154/llsec.c | 41 +++++++++++++++++++++++------------------
 net/mac802154/llsec.h |  3 +--
 2 files changed, 24 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/net/mac802154/llsec.c b/net/mac802154/llsec.c
index a13d02b7cee4..6a3e1c2181d3 100644
--- a/net/mac802154/llsec.c
+++ b/net/mac802154/llsec.c
@@ -17,9 +17,9 @@
 #include <linux/err.h>
 #include <linux/bug.h>
 #include <linux/completion.h>
-#include <linux/crypto.h>
 #include <linux/ieee802154.h>
 #include <crypto/aead.h>
+#include <crypto/skcipher.h>
 
 #include "ieee802154_i.h"
 #include "llsec.h"
@@ -144,18 +144,18 @@ llsec_key_alloc(const struct ieee802154_llsec_key *template)
 			goto err_tfm;
 	}
 
-	key->tfm0 = crypto_alloc_blkcipher("ctr(aes)", 0, CRYPTO_ALG_ASYNC);
+	key->tfm0 = crypto_alloc_skcipher("ctr(aes)", 0, CRYPTO_ALG_ASYNC);
 	if (IS_ERR(key->tfm0))
 		goto err_tfm;
 
-	if (crypto_blkcipher_setkey(key->tfm0, template->key,
-				    IEEE802154_LLSEC_KEY_SIZE))
+	if (crypto_skcipher_setkey(key->tfm0, template->key,
+				   IEEE802154_LLSEC_KEY_SIZE))
 		goto err_tfm0;
 
 	return key;
 
 err_tfm0:
-	crypto_free_blkcipher(key->tfm0);
+	crypto_free_skcipher(key->tfm0);
 err_tfm:
 	for (i = 0; i < ARRAY_SIZE(key->tfm); i++)
 		if (key->tfm[i])
@@ -175,7 +175,7 @@ static void llsec_key_release(struct kref *ref)
 	for (i = 0; i < ARRAY_SIZE(key->tfm); i++)
 		crypto_free_aead(key->tfm[i]);
 
-	crypto_free_blkcipher(key->tfm0);
+	crypto_free_skcipher(key->tfm0);
 	kzfree(key);
 }
 
@@ -620,15 +620,17 @@ llsec_do_encrypt_unauth(struct sk_buff *skb, const struct mac802154_llsec *sec,
 {
 	u8 iv[16];
 	struct scatterlist src;
-	struct blkcipher_desc req = {
-		.tfm = key->tfm0,
-		.info = iv,
-		.flags = 0,
-	};
+	SKCIPHER_REQUEST_ON_STACK(req, key->tfm0);
+	int err;
 
 	llsec_geniv(iv, sec->params.hwaddr, &hdr->sec);
 	sg_init_one(&src, skb->data, skb->len);
-	return crypto_blkcipher_encrypt_iv(&req, &src, &src, skb->len);
+	skcipher_request_set_tfm(req, key->tfm0);
+	skcipher_request_set_callback(req, 0, NULL, NULL);
+	skcipher_request_set_crypt(req, &src, &src, skb->len, iv);
+	err = crypto_skcipher_encrypt(req);
+	skcipher_request_zero(req);
+	return err;
 }
 
 static struct crypto_aead*
@@ -830,11 +832,8 @@ llsec_do_decrypt_unauth(struct sk_buff *skb, const struct mac802154_llsec *sec,
 	unsigned char *data;
 	int datalen;
 	struct scatterlist src;
-	struct blkcipher_desc req = {
-		.tfm = key->tfm0,
-		.info = iv,
-		.flags = 0,
-	};
+	SKCIPHER_REQUEST_ON_STACK(req, key->tfm0);
+	int err;
 
 	llsec_geniv(iv, dev_addr, &hdr->sec);
 	data = skb_mac_header(skb) + skb->mac_len;
@@ -842,7 +841,13 @@ llsec_do_decrypt_unauth(struct sk_buff *skb, const struct mac802154_llsec *sec,
 
 	sg_init_one(&src, data, datalen);
 
-	return crypto_blkcipher_decrypt_iv(&req, &src, &src, datalen);
+	skcipher_request_set_tfm(req, key->tfm0);
+	skcipher_request_set_callback(req, 0, NULL, NULL);
+	skcipher_request_set_crypt(req, &src, &src, datalen, iv);
+
+	err = crypto_skcipher_decrypt(req);
+	skcipher_request_zero(req);
+	return err;
 }
 
 static int
diff --git a/net/mac802154/llsec.h b/net/mac802154/llsec.h
index 950578e1d7be..6f3b658e3279 100644
--- a/net/mac802154/llsec.h
+++ b/net/mac802154/llsec.h
@@ -19,7 +19,6 @@
 
 #include <linux/slab.h>
 #include <linux/hashtable.h>
-#include <linux/crypto.h>
 #include <linux/kref.h>
 #include <linux/spinlock.h>
 #include <net/af_ieee802154.h>
@@ -30,7 +29,7 @@ struct mac802154_llsec_key {
 
 	/* one tfm for each authsize (4/8/16) */
 	struct crypto_aead *tfm[3];
-	struct crypto_blkcipher *tfm0;
+	struct crypto_skcipher *tfm0;
 
 	struct kref ref;
 };
-- 
cgit v1.2.3-59-g8ed1b


From 1afe593b423918ffd5e449cec858b1591cd31781 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sun, 24 Jan 2016 21:19:01 +0800
Subject: rxrpc: Use skcipher

This patch replaces uses of blkcipher with skcipher.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 net/rxrpc/ar-internal.h |   2 +-
 net/rxrpc/ar-key.c      |  12 ++--
 net/rxrpc/rxkad.c       | 172 ++++++++++++++++++++++++++++++------------------
 3 files changed, 114 insertions(+), 72 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 2934a73a5981..71598f5b11b7 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -252,7 +252,7 @@ struct rxrpc_connection {
 	struct rxrpc_security	*security;	/* applied security module */
 	struct key		*key;		/* security for this connection (client) */
 	struct key		*server_key;	/* security for this service */
-	struct crypto_blkcipher	*cipher;	/* encryption handle */
+	struct crypto_skcipher	*cipher;	/* encryption handle */
 	struct rxrpc_crypt	csum_iv;	/* packet checksum base */
 	unsigned long		events;
 #define RXRPC_CONN_CHALLENGE	0		/* send challenge packet */
diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c
index 3f6571651d32..3fb492eedeb9 100644
--- a/net/rxrpc/ar-key.c
+++ b/net/rxrpc/ar-key.c
@@ -12,11 +12,11 @@
  *	"afs@CAMBRIDGE.REDHAT.COM>
  */
 
+#include <crypto/skcipher.h>
 #include <linux/module.h>
 #include <linux/net.h>
 #include <linux/skbuff.h>
 #include <linux/key-type.h>
-#include <linux/crypto.h>
 #include <linux/ctype.h>
 #include <linux/slab.h>
 #include <net/sock.h>
@@ -824,7 +824,7 @@ static void rxrpc_free_preparse(struct key_preparsed_payload *prep)
  */
 static int rxrpc_preparse_s(struct key_preparsed_payload *prep)
 {
-	struct crypto_blkcipher *ci;
+	struct crypto_skcipher *ci;
 
 	_enter("%zu", prep->datalen);
 
@@ -833,13 +833,13 @@ static int rxrpc_preparse_s(struct key_preparsed_payload *prep)
 
 	memcpy(&prep->payload.data[2], prep->data, 8);
 
-	ci = crypto_alloc_blkcipher("pcbc(des)", 0, CRYPTO_ALG_ASYNC);
+	ci = crypto_alloc_skcipher("pcbc(des)", 0, CRYPTO_ALG_ASYNC);
 	if (IS_ERR(ci)) {
 		_leave(" = %ld", PTR_ERR(ci));
 		return PTR_ERR(ci);
 	}
 
-	if (crypto_blkcipher_setkey(ci, prep->data, 8) < 0)
+	if (crypto_skcipher_setkey(ci, prep->data, 8) < 0)
 		BUG();
 
 	prep->payload.data[0] = ci;
@@ -853,7 +853,7 @@ static int rxrpc_preparse_s(struct key_preparsed_payload *prep)
 static void rxrpc_free_preparse_s(struct key_preparsed_payload *prep)
 {
 	if (prep->payload.data[0])
-		crypto_free_blkcipher(prep->payload.data[0]);
+		crypto_free_skcipher(prep->payload.data[0]);
 }
 
 /*
@@ -870,7 +870,7 @@ static void rxrpc_destroy(struct key *key)
 static void rxrpc_destroy_s(struct key *key)
 {
 	if (key->payload.data[0]) {
-		crypto_free_blkcipher(key->payload.data[0]);
+		crypto_free_skcipher(key->payload.data[0]);
 		key->payload.data[0] = NULL;
 	}
 }
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index d7a9ab5a9d9c..0d96b48a6492 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -9,11 +9,11 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#include <crypto/skcipher.h>
 #include <linux/module.h>
 #include <linux/net.h>
 #include <linux/skbuff.h>
 #include <linux/udp.h>
-#include <linux/crypto.h>
 #include <linux/scatterlist.h>
 #include <linux/ctype.h>
 #include <linux/slab.h>
@@ -53,7 +53,7 @@ MODULE_LICENSE("GPL");
  * alloc routine, but since we have it to hand, we use it to decrypt RESPONSE
  * packets
  */
-static struct crypto_blkcipher *rxkad_ci;
+static struct crypto_skcipher *rxkad_ci;
 static DEFINE_MUTEX(rxkad_ci_mutex);
 
 /*
@@ -61,7 +61,7 @@ static DEFINE_MUTEX(rxkad_ci_mutex);
  */
 static int rxkad_init_connection_security(struct rxrpc_connection *conn)
 {
-	struct crypto_blkcipher *ci;
+	struct crypto_skcipher *ci;
 	struct rxrpc_key_token *token;
 	int ret;
 
@@ -70,15 +70,15 @@ static int rxkad_init_connection_security(struct rxrpc_connection *conn)
 	token = conn->key->payload.data[0];
 	conn->security_ix = token->security_index;
 
-	ci = crypto_alloc_blkcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC);
+	ci = crypto_alloc_skcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC);
 	if (IS_ERR(ci)) {
 		_debug("no cipher");
 		ret = PTR_ERR(ci);
 		goto error;
 	}
 
-	if (crypto_blkcipher_setkey(ci, token->kad->session_key,
-				    sizeof(token->kad->session_key)) < 0)
+	if (crypto_skcipher_setkey(ci, token->kad->session_key,
+				   sizeof(token->kad->session_key)) < 0)
 		BUG();
 
 	switch (conn->security_level) {
@@ -113,7 +113,7 @@ error:
 static void rxkad_prime_packet_security(struct rxrpc_connection *conn)
 {
 	struct rxrpc_key_token *token;
-	struct blkcipher_desc desc;
+	SKCIPHER_REQUEST_ON_STACK(req, conn->cipher);
 	struct scatterlist sg[2];
 	struct rxrpc_crypt iv;
 	struct {
@@ -128,10 +128,6 @@ static void rxkad_prime_packet_security(struct rxrpc_connection *conn)
 	token = conn->key->payload.data[0];
 	memcpy(&iv, token->kad->session_key, sizeof(iv));
 
-	desc.tfm = conn->cipher;
-	desc.info = iv.x;
-	desc.flags = 0;
-
 	tmpbuf.x[0] = conn->epoch;
 	tmpbuf.x[1] = conn->cid;
 	tmpbuf.x[2] = 0;
@@ -139,7 +135,13 @@ static void rxkad_prime_packet_security(struct rxrpc_connection *conn)
 
 	sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf));
 	sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf));
-	crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(tmpbuf));
+
+	skcipher_request_set_tfm(req, conn->cipher);
+	skcipher_request_set_callback(req, 0, NULL, NULL);
+	skcipher_request_set_crypt(req, &sg[1], &sg[0], sizeof(tmpbuf), iv.x);
+
+	crypto_skcipher_encrypt(req);
+	skcipher_request_zero(req);
 
 	memcpy(&conn->csum_iv, &tmpbuf.x[2], sizeof(conn->csum_iv));
 	ASSERTCMP(conn->csum_iv.n[0], ==, tmpbuf.x[2]);
@@ -156,7 +158,7 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
 				    void *sechdr)
 {
 	struct rxrpc_skb_priv *sp;
-	struct blkcipher_desc desc;
+	SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
 	struct rxrpc_crypt iv;
 	struct scatterlist sg[2];
 	struct {
@@ -177,13 +179,16 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
 
 	/* start the encryption afresh */
 	memset(&iv, 0, sizeof(iv));
-	desc.tfm = call->conn->cipher;
-	desc.info = iv.x;
-	desc.flags = 0;
 
 	sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf));
 	sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf));
-	crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(tmpbuf));
+
+	skcipher_request_set_tfm(req, call->conn->cipher);
+	skcipher_request_set_callback(req, 0, NULL, NULL);
+	skcipher_request_set_crypt(req, &sg[1], &sg[0], sizeof(tmpbuf), iv.x);
+
+	crypto_skcipher_encrypt(req);
+	skcipher_request_zero(req);
 
 	memcpy(sechdr, &tmpbuf, sizeof(tmpbuf));
 
@@ -203,13 +208,14 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
 	struct rxkad_level2_hdr rxkhdr
 		__attribute__((aligned(8))); /* must be all on one page */
 	struct rxrpc_skb_priv *sp;
-	struct blkcipher_desc desc;
+	SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
 	struct rxrpc_crypt iv;
 	struct scatterlist sg[16];
 	struct sk_buff *trailer;
 	unsigned int len;
 	u16 check;
 	int nsg;
+	int err;
 
 	sp = rxrpc_skb(skb);
 
@@ -223,28 +229,38 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
 	/* encrypt from the session key */
 	token = call->conn->key->payload.data[0];
 	memcpy(&iv, token->kad->session_key, sizeof(iv));
-	desc.tfm = call->conn->cipher;
-	desc.info = iv.x;
-	desc.flags = 0;
 
 	sg_init_one(&sg[0], sechdr, sizeof(rxkhdr));
 	sg_init_one(&sg[1], &rxkhdr, sizeof(rxkhdr));
-	crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(rxkhdr));
+
+	skcipher_request_set_tfm(req, call->conn->cipher);
+	skcipher_request_set_callback(req, 0, NULL, NULL);
+	skcipher_request_set_crypt(req, &sg[1], &sg[0], sizeof(rxkhdr), iv.x);
+
+	crypto_skcipher_encrypt(req);
 
 	/* we want to encrypt the skbuff in-place */
 	nsg = skb_cow_data(skb, 0, &trailer);
+	err = -ENOMEM;
 	if (nsg < 0 || nsg > 16)
-		return -ENOMEM;
+		goto out;
 
 	len = data_size + call->conn->size_align - 1;
 	len &= ~(call->conn->size_align - 1);
 
 	sg_init_table(sg, nsg);
 	skb_to_sgvec(skb, sg, 0, len);
-	crypto_blkcipher_encrypt_iv(&desc, sg, sg, len);
+
+	skcipher_request_set_crypt(req, sg, sg, len, iv.x);
+
+	crypto_skcipher_encrypt(req);
 
 	_leave(" = 0");
-	return 0;
+	err = 0;
+
+out:
+	skcipher_request_zero(req);
+	return err;
 }
 
 /*
@@ -256,7 +272,7 @@ static int rxkad_secure_packet(const struct rxrpc_call *call,
 				void *sechdr)
 {
 	struct rxrpc_skb_priv *sp;
-	struct blkcipher_desc desc;
+	SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
 	struct rxrpc_crypt iv;
 	struct scatterlist sg[2];
 	struct {
@@ -281,9 +297,6 @@ static int rxkad_secure_packet(const struct rxrpc_call *call,
 
 	/* continue encrypting from where we left off */
 	memcpy(&iv, call->conn->csum_iv.x, sizeof(iv));
-	desc.tfm = call->conn->cipher;
-	desc.info = iv.x;
-	desc.flags = 0;
 
 	/* calculate the security checksum */
 	x = htonl(call->channel << (32 - RXRPC_CIDSHIFT));
@@ -293,7 +306,13 @@ static int rxkad_secure_packet(const struct rxrpc_call *call,
 
 	sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf));
 	sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf));
-	crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(tmpbuf));
+
+	skcipher_request_set_tfm(req, call->conn->cipher);
+	skcipher_request_set_callback(req, 0, NULL, NULL);
+	skcipher_request_set_crypt(req, &sg[1], &sg[0], sizeof(tmpbuf), iv.x);
+
+	crypto_skcipher_encrypt(req);
+	skcipher_request_zero(req);
 
 	y = ntohl(tmpbuf.x[1]);
 	y = (y >> 16) & 0xffff;
@@ -330,7 +349,7 @@ static int rxkad_verify_packet_auth(const struct rxrpc_call *call,
 {
 	struct rxkad_level1_hdr sechdr;
 	struct rxrpc_skb_priv *sp;
-	struct blkcipher_desc desc;
+	SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
 	struct rxrpc_crypt iv;
 	struct scatterlist sg[16];
 	struct sk_buff *trailer;
@@ -352,11 +371,13 @@ static int rxkad_verify_packet_auth(const struct rxrpc_call *call,
 
 	/* start the decryption afresh */
 	memset(&iv, 0, sizeof(iv));
-	desc.tfm = call->conn->cipher;
-	desc.info = iv.x;
-	desc.flags = 0;
 
-	crypto_blkcipher_decrypt_iv(&desc, sg, sg, 8);
+	skcipher_request_set_tfm(req, call->conn->cipher);
+	skcipher_request_set_callback(req, 0, NULL, NULL);
+	skcipher_request_set_crypt(req, sg, sg, 8, iv.x);
+
+	crypto_skcipher_decrypt(req);
+	skcipher_request_zero(req);
 
 	/* remove the decrypted packet length */
 	if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0)
@@ -405,7 +426,7 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call,
 	const struct rxrpc_key_token *token;
 	struct rxkad_level2_hdr sechdr;
 	struct rxrpc_skb_priv *sp;
-	struct blkcipher_desc desc;
+	SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
 	struct rxrpc_crypt iv;
 	struct scatterlist _sg[4], *sg;
 	struct sk_buff *trailer;
@@ -435,11 +456,13 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call,
 	/* decrypt from the session key */
 	token = call->conn->key->payload.data[0];
 	memcpy(&iv, token->kad->session_key, sizeof(iv));
-	desc.tfm = call->conn->cipher;
-	desc.info = iv.x;
-	desc.flags = 0;
 
-	crypto_blkcipher_decrypt_iv(&desc, sg, sg, skb->len);
+	skcipher_request_set_tfm(req, call->conn->cipher);
+	skcipher_request_set_callback(req, 0, NULL, NULL);
+	skcipher_request_set_crypt(req, sg, sg, skb->len, iv.x);
+
+	crypto_skcipher_decrypt(req);
+	skcipher_request_zero(req);
 	if (sg != _sg)
 		kfree(sg);
 
@@ -487,7 +510,7 @@ static int rxkad_verify_packet(const struct rxrpc_call *call,
 			       struct sk_buff *skb,
 			       u32 *_abort_code)
 {
-	struct blkcipher_desc desc;
+	SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
 	struct rxrpc_skb_priv *sp;
 	struct rxrpc_crypt iv;
 	struct scatterlist sg[2];
@@ -516,9 +539,6 @@ static int rxkad_verify_packet(const struct rxrpc_call *call,
 
 	/* continue encrypting from where we left off */
 	memcpy(&iv, call->conn->csum_iv.x, sizeof(iv));
-	desc.tfm = call->conn->cipher;
-	desc.info = iv.x;
-	desc.flags = 0;
 
 	/* validate the security checksum */
 	x = htonl(call->channel << (32 - RXRPC_CIDSHIFT));
@@ -528,7 +548,13 @@ static int rxkad_verify_packet(const struct rxrpc_call *call,
 
 	sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf));
 	sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf));
-	crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(tmpbuf));
+
+	skcipher_request_set_tfm(req, call->conn->cipher);
+	skcipher_request_set_callback(req, 0, NULL, NULL);
+	skcipher_request_set_crypt(req, &sg[1], &sg[0], sizeof(tmpbuf), iv.x);
+
+	crypto_skcipher_encrypt(req);
+	skcipher_request_zero(req);
 
 	y = ntohl(tmpbuf.x[1]);
 	y = (y >> 16) & 0xffff;
@@ -718,18 +744,21 @@ static void rxkad_encrypt_response(struct rxrpc_connection *conn,
 				   struct rxkad_response *resp,
 				   const struct rxkad_key *s2)
 {
-	struct blkcipher_desc desc;
+	SKCIPHER_REQUEST_ON_STACK(req, conn->cipher);
 	struct rxrpc_crypt iv;
 	struct scatterlist sg[2];
 
 	/* continue encrypting from where we left off */
 	memcpy(&iv, s2->session_key, sizeof(iv));
-	desc.tfm = conn->cipher;
-	desc.info = iv.x;
-	desc.flags = 0;
 
 	rxkad_sg_set_buf2(sg, &resp->encrypted, sizeof(resp->encrypted));
-	crypto_blkcipher_encrypt_iv(&desc, sg, sg, sizeof(resp->encrypted));
+
+	skcipher_request_set_tfm(req, conn->cipher);
+	skcipher_request_set_callback(req, 0, NULL, NULL);
+	skcipher_request_set_crypt(req, sg, sg, sizeof(resp->encrypted), iv.x);
+
+	crypto_skcipher_encrypt(req);
+	skcipher_request_zero(req);
 }
 
 /*
@@ -822,7 +851,7 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn,
 				time_t *_expiry,
 				u32 *_abort_code)
 {
-	struct blkcipher_desc desc;
+	struct skcipher_request *req;
 	struct rxrpc_crypt iv, key;
 	struct scatterlist sg[1];
 	struct in_addr addr;
@@ -853,12 +882,21 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn,
 
 	memcpy(&iv, &conn->server_key->payload.data[2], sizeof(iv));
 
-	desc.tfm = conn->server_key->payload.data[0];
-	desc.info = iv.x;
-	desc.flags = 0;
+	req = skcipher_request_alloc(conn->server_key->payload.data[0],
+				     GFP_NOFS);
+	if (!req) {
+		*_abort_code = RXKADNOAUTH;
+		ret = -ENOMEM;
+		goto error;
+	}
 
 	sg_init_one(&sg[0], ticket, ticket_len);
-	crypto_blkcipher_decrypt_iv(&desc, sg, sg, ticket_len);
+
+	skcipher_request_set_callback(req, 0, NULL, NULL);
+	skcipher_request_set_crypt(req, sg, sg, ticket_len, iv.x);
+
+	crypto_skcipher_decrypt(req);
+	skcipher_request_free(req);
 
 	p = ticket;
 	end = p + ticket_len;
@@ -966,7 +1004,7 @@ static void rxkad_decrypt_response(struct rxrpc_connection *conn,
 				   struct rxkad_response *resp,
 				   const struct rxrpc_crypt *session_key)
 {
-	struct blkcipher_desc desc;
+	SKCIPHER_REQUEST_ON_STACK(req, rxkad_ci);
 	struct scatterlist sg[2];
 	struct rxrpc_crypt iv;
 
@@ -976,17 +1014,21 @@ static void rxkad_decrypt_response(struct rxrpc_connection *conn,
 	ASSERT(rxkad_ci != NULL);
 
 	mutex_lock(&rxkad_ci_mutex);
-	if (crypto_blkcipher_setkey(rxkad_ci, session_key->x,
-				    sizeof(*session_key)) < 0)
+	if (crypto_skcipher_setkey(rxkad_ci, session_key->x,
+				   sizeof(*session_key)) < 0)
 		BUG();
 
 	memcpy(&iv, session_key, sizeof(iv));
-	desc.tfm = rxkad_ci;
-	desc.info = iv.x;
-	desc.flags = 0;
 
 	rxkad_sg_set_buf2(sg, &resp->encrypted, sizeof(resp->encrypted));
-	crypto_blkcipher_decrypt_iv(&desc, sg, sg, sizeof(resp->encrypted));
+
+	skcipher_request_set_tfm(req, rxkad_ci);
+	skcipher_request_set_callback(req, 0, NULL, NULL);
+	skcipher_request_set_crypt(req, sg, sg, sizeof(resp->encrypted), iv.x);
+
+	crypto_skcipher_decrypt(req);
+	skcipher_request_zero(req);
+
 	mutex_unlock(&rxkad_ci_mutex);
 
 	_leave("");
@@ -1115,7 +1157,7 @@ static void rxkad_clear(struct rxrpc_connection *conn)
 	_enter("");
 
 	if (conn->cipher)
-		crypto_free_blkcipher(conn->cipher);
+		crypto_free_skcipher(conn->cipher);
 }
 
 /*
@@ -1141,7 +1183,7 @@ static __init int rxkad_init(void)
 
 	/* pin the cipher we need so that the crypto layer doesn't invoke
 	 * keventd to go get it */
-	rxkad_ci = crypto_alloc_blkcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC);
+	rxkad_ci = crypto_alloc_skcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC);
 	if (IS_ERR(rxkad_ci))
 		return PTR_ERR(rxkad_ci);
 
@@ -1155,7 +1197,7 @@ static __exit void rxkad_exit(void)
 	_enter("");
 
 	rxrpc_unregister_security(&rxkad);
-	crypto_free_blkcipher(rxkad_ci);
+	crypto_free_skcipher(rxkad_ci);
 }
 
 module_exit(rxkad_exit);
-- 
cgit v1.2.3-59-g8ed1b


From 17bc197022212a8b439b213b2f23a8603a784e56 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sun, 24 Jan 2016 21:19:11 +0800
Subject: ipsec: Use skcipher and ahash when probing algorithms

This patch removes the last reference to hash and ablkcipher from
IPsec and replaces them with ahash and skcipher respectively.  For
skcipher there is currently no difference at all, while for ahash
the current code is actually buggy and would prevent asynchronous
algorithms from being discovered.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Acked-by: David S. Miller <davem@davemloft.net>
---
 net/xfrm/xfrm_algo.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c
index f07224d8b88f..250e567ba3d6 100644
--- a/net/xfrm/xfrm_algo.c
+++ b/net/xfrm/xfrm_algo.c
@@ -9,6 +9,8 @@
  * any later version.
  */
 
+#include <crypto/hash.h>
+#include <crypto/skcipher.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/pfkeyv2.h>
@@ -782,14 +784,13 @@ void xfrm_probe_algs(void)
 	BUG_ON(in_softirq());
 
 	for (i = 0; i < aalg_entries(); i++) {
-		status = crypto_has_hash(aalg_list[i].name, 0,
-					 CRYPTO_ALG_ASYNC);
+		status = crypto_has_ahash(aalg_list[i].name, 0, 0);
 		if (aalg_list[i].available != status)
 			aalg_list[i].available = status;
 	}
 
 	for (i = 0; i < ealg_entries(); i++) {
-		status = crypto_has_ablkcipher(ealg_list[i].name, 0, 0);
+		status = crypto_has_skcipher(ealg_list[i].name, 0, 0);
 		if (ealg_list[i].available != status)
 			ealg_list[i].available = status;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 5821c769706561da81e9fcec4a6ca6dbbb2f30cb Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sun, 24 Jan 2016 21:20:12 +0800
Subject: sctp: Use shash

This patch replaces uses of the long obsolete hash interface with
shash.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Acked-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/auth.h    |  4 ++--
 include/net/sctp/structs.h |  6 +++---
 net/sctp/auth.c            | 36 ++++++++++++++++----------------
 net/sctp/endpointola.c     |  1 -
 net/sctp/sm_make_chunk.c   | 51 ++++++++++++++++++++++++++--------------------
 net/sctp/socket.c          |  8 ++++----
 6 files changed, 57 insertions(+), 49 deletions(-)

(limited to 'net')

diff --git a/include/net/sctp/auth.h b/include/net/sctp/auth.h
index f2d58aa37a6f..9b9fb122b31f 100644
--- a/include/net/sctp/auth.h
+++ b/include/net/sctp/auth.h
@@ -31,12 +31,12 @@
 #define __sctp_auth_h__
 
 #include <linux/list.h>
-#include <linux/crypto.h>
 
 struct sctp_endpoint;
 struct sctp_association;
 struct sctp_authkey;
 struct sctp_hmacalgo;
+struct crypto_shash;
 
 /*
  * Define a generic struct that will hold all the info
@@ -90,7 +90,7 @@ int sctp_auth_asoc_copy_shkeys(const struct sctp_endpoint *ep,
 				struct sctp_association *asoc,
 				gfp_t gfp);
 int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp);
-void sctp_auth_destroy_hmacs(struct crypto_hash *auth_hmacs[]);
+void sctp_auth_destroy_hmacs(struct crypto_shash *auth_hmacs[]);
 struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id);
 struct sctp_hmac *sctp_auth_asoc_get_hmac(const struct sctp_association *asoc);
 void sctp_auth_asoc_set_default_hmac(struct sctp_association *asoc,
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 20e72129be1c..6b1e3838667e 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -82,7 +82,7 @@ struct sctp_bind_addr;
 struct sctp_ulpq;
 struct sctp_ep_common;
 struct sctp_ssnmap;
-struct crypto_hash;
+struct crypto_shash;
 
 
 #include <net/sctp/tsnmap.h>
@@ -166,7 +166,7 @@ struct sctp_sock {
 	struct sctp_pf *pf;
 
 	/* Access to HMAC transform. */
-	struct crypto_hash *hmac;
+	struct crypto_shash *hmac;
 	char *sctp_hmac_alg;
 
 	/* What is our base endpointer? */
@@ -1235,7 +1235,7 @@ struct sctp_endpoint {
 	/* SCTP AUTH: array of the HMACs that will be allocated
 	 * we need this per association so that we don't serialize
 	 */
-	struct crypto_hash **auth_hmacs;
+	struct crypto_shash **auth_hmacs;
 
 	/* SCTP-AUTH: hmacs for the endpoint encoded into parameter */
 	 struct sctp_hmac_algo_param *auth_hmacs_list;
diff --git a/net/sctp/auth.c b/net/sctp/auth.c
index 1543e39f47c3..912eb1685a5d 100644
--- a/net/sctp/auth.c
+++ b/net/sctp/auth.c
@@ -27,9 +27,9 @@
  *   Vlad Yasevich     <vladislav.yasevich@hp.com>
  */
 
+#include <crypto/hash.h>
 #include <linux/slab.h>
 #include <linux/types.h>
-#include <linux/crypto.h>
 #include <linux/scatterlist.h>
 #include <net/sctp/sctp.h>
 #include <net/sctp/auth.h>
@@ -448,7 +448,7 @@ struct sctp_shared_key *sctp_auth_get_shkey(
  */
 int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp)
 {
-	struct crypto_hash *tfm = NULL;
+	struct crypto_shash *tfm = NULL;
 	__u16   id;
 
 	/* If AUTH extension is disabled, we are done */
@@ -462,9 +462,8 @@ int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp)
 		return 0;
 
 	/* Allocated the array of pointers to transorms */
-	ep->auth_hmacs = kzalloc(
-			    sizeof(struct crypto_hash *) * SCTP_AUTH_NUM_HMACS,
-			    gfp);
+	ep->auth_hmacs = kzalloc(sizeof(struct crypto_shash *) *
+				 SCTP_AUTH_NUM_HMACS, gfp);
 	if (!ep->auth_hmacs)
 		return -ENOMEM;
 
@@ -483,8 +482,7 @@ int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp)
 			continue;
 
 		/* Allocate the ID */
-		tfm = crypto_alloc_hash(sctp_hmac_list[id].hmac_name, 0,
-					CRYPTO_ALG_ASYNC);
+		tfm = crypto_alloc_shash(sctp_hmac_list[id].hmac_name, 0, 0);
 		if (IS_ERR(tfm))
 			goto out_err;
 
@@ -500,7 +498,7 @@ out_err:
 }
 
 /* Destroy the hmac tfm array */
-void sctp_auth_destroy_hmacs(struct crypto_hash *auth_hmacs[])
+void sctp_auth_destroy_hmacs(struct crypto_shash *auth_hmacs[])
 {
 	int i;
 
@@ -508,8 +506,7 @@ void sctp_auth_destroy_hmacs(struct crypto_hash *auth_hmacs[])
 		return;
 
 	for (i = 0; i < SCTP_AUTH_NUM_HMACS; i++) {
-		if (auth_hmacs[i])
-			crypto_free_hash(auth_hmacs[i]);
+		crypto_free_shash(auth_hmacs[i]);
 	}
 	kfree(auth_hmacs);
 }
@@ -709,8 +706,7 @@ void sctp_auth_calculate_hmac(const struct sctp_association *asoc,
 			      struct sctp_auth_chunk *auth,
 			      gfp_t gfp)
 {
-	struct scatterlist sg;
-	struct hash_desc desc;
+	struct crypto_shash *tfm;
 	struct sctp_auth_bytes *asoc_key;
 	__u16 key_id, hmac_id;
 	__u8 *digest;
@@ -742,16 +738,22 @@ void sctp_auth_calculate_hmac(const struct sctp_association *asoc,
 
 	/* set up scatter list */
 	end = skb_tail_pointer(skb);
-	sg_init_one(&sg, auth, end - (unsigned char *)auth);
 
-	desc.tfm = asoc->ep->auth_hmacs[hmac_id];
-	desc.flags = 0;
+	tfm = asoc->ep->auth_hmacs[hmac_id];
 
 	digest = auth->auth_hdr.hmac;
-	if (crypto_hash_setkey(desc.tfm, &asoc_key->data[0], asoc_key->len))
+	if (crypto_shash_setkey(tfm, &asoc_key->data[0], asoc_key->len))
 		goto free;
 
-	crypto_hash_digest(&desc, &sg, sg.length, digest);
+	{
+		SHASH_DESC_ON_STACK(desc, tfm);
+
+		desc->tfm = tfm;
+		desc->flags = 0;
+		crypto_shash_digest(desc, (u8 *)auth,
+				    end - (unsigned char *)auth, digest);
+		shash_desc_zero(desc);
+	}
 
 free:
 	if (free_key)
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 2522a6175291..9d494e35e7f9 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -42,7 +42,6 @@
 #include <linux/slab.h>
 #include <linux/in.h>
 #include <linux/random.h>	/* get_random_bytes() */
-#include <linux/crypto.h>
 #include <net/sock.h>
 #include <net/ipv6.h>
 #include <net/sctp/sctp.h>
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 5d6a03fad378..1296e555fe29 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -45,6 +45,7 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <crypto/hash.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/ip.h>
@@ -52,7 +53,6 @@
 #include <linux/net.h>
 #include <linux/inet.h>
 #include <linux/scatterlist.h>
-#include <linux/crypto.h>
 #include <linux/slab.h>
 #include <net/sock.h>
 
@@ -1606,7 +1606,6 @@ static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep,
 {
 	sctp_cookie_param_t *retval;
 	struct sctp_signed_cookie *cookie;
-	struct scatterlist sg;
 	int headersize, bodysize;
 
 	/* Header size is static data prior to the actual cookie, including
@@ -1663,16 +1662,19 @@ static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep,
 	       ntohs(init_chunk->chunk_hdr->length), raw_addrs, addrs_len);
 
 	if (sctp_sk(ep->base.sk)->hmac) {
-		struct hash_desc desc;
+		SHASH_DESC_ON_STACK(desc, sctp_sk(ep->base.sk)->hmac);
+		int err;
 
 		/* Sign the message.  */
-		sg_init_one(&sg, &cookie->c, bodysize);
-		desc.tfm = sctp_sk(ep->base.sk)->hmac;
-		desc.flags = 0;
-
-		if (crypto_hash_setkey(desc.tfm, ep->secret_key,
-				       sizeof(ep->secret_key)) ||
-		    crypto_hash_digest(&desc, &sg, bodysize, cookie->signature))
+		desc->tfm = sctp_sk(ep->base.sk)->hmac;
+		desc->flags = 0;
+
+		err = crypto_shash_setkey(desc->tfm, ep->secret_key,
+					  sizeof(ep->secret_key)) ?:
+		      crypto_shash_digest(desc, (u8 *)&cookie->c, bodysize,
+					  cookie->signature);
+		shash_desc_zero(desc);
+		if (err)
 			goto free_cookie;
 	}
 
@@ -1697,12 +1699,10 @@ struct sctp_association *sctp_unpack_cookie(
 	struct sctp_cookie *bear_cookie;
 	int headersize, bodysize, fixed_size;
 	__u8 *digest = ep->digest;
-	struct scatterlist sg;
 	unsigned int len;
 	sctp_scope_t scope;
 	struct sk_buff *skb = chunk->skb;
 	ktime_t kt;
-	struct hash_desc desc;
 
 	/* Header size is static data prior to the actual cookie, including
 	 * any padding.
@@ -1733,16 +1733,23 @@ struct sctp_association *sctp_unpack_cookie(
 		goto no_hmac;
 
 	/* Check the signature.  */
-	sg_init_one(&sg, bear_cookie, bodysize);
-	desc.tfm = sctp_sk(ep->base.sk)->hmac;
-	desc.flags = 0;
-
-	memset(digest, 0x00, SCTP_SIGNATURE_SIZE);
-	if (crypto_hash_setkey(desc.tfm, ep->secret_key,
-			       sizeof(ep->secret_key)) ||
-	    crypto_hash_digest(&desc, &sg, bodysize, digest)) {
-		*error = -SCTP_IERROR_NOMEM;
-		goto fail;
+	{
+		SHASH_DESC_ON_STACK(desc, sctp_sk(ep->base.sk)->hmac);
+		int err;
+
+		desc->tfm = sctp_sk(ep->base.sk)->hmac;
+		desc->flags = 0;
+
+		err = crypto_shash_setkey(desc->tfm, ep->secret_key,
+					  sizeof(ep->secret_key)) ?:
+		      crypto_shash_digest(desc, (u8 *)bear_cookie, bodysize,
+					  digest);
+		shash_desc_zero(desc);
+
+		if (err) {
+			*error = -SCTP_IERROR_NOMEM;
+			goto fail;
+		}
 	}
 
 	if (memcmp(digest, cookie->signature, SCTP_SIGNATURE_SIZE)) {
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 9bb80ec4c08f..4101c5b653d0 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -52,6 +52,7 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <crypto/hash.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/wait.h>
@@ -61,7 +62,6 @@
 #include <linux/fcntl.h>
 #include <linux/poll.h>
 #include <linux/init.h>
-#include <linux/crypto.h>
 #include <linux/slab.h>
 #include <linux/file.h>
 #include <linux/compat.h>
@@ -4160,7 +4160,7 @@ static void sctp_destruct_sock(struct sock *sk)
 	struct sctp_sock *sp = sctp_sk(sk);
 
 	/* Free up the HMAC transform. */
-	crypto_free_hash(sp->hmac);
+	crypto_free_shash(sp->hmac);
 
 	inet_sock_destruct(sk);
 }
@@ -6299,13 +6299,13 @@ static int sctp_listen_start(struct sock *sk, int backlog)
 {
 	struct sctp_sock *sp = sctp_sk(sk);
 	struct sctp_endpoint *ep = sp->ep;
-	struct crypto_hash *tfm = NULL;
+	struct crypto_shash *tfm = NULL;
 	char alg[32];
 
 	/* Allocate HMAC for generating cookie. */
 	if (!sp->hmac && sp->sctp_hmac_alg) {
 		sprintf(alg, "hmac(%s)", sp->sctp_hmac_alg);
-		tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC);
+		tfm = crypto_alloc_shash(alg, 0, 0);
 		if (IS_ERR(tfm)) {
 			net_info_ratelimited("failed to load transform for %s: %ld\n",
 					     sp->sctp_hmac_alg, PTR_ERR(tfm));
-- 
cgit v1.2.3-59-g8ed1b


From cf80e0e47e0e7a8994dfadefec0e1395c622817a Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sun, 24 Jan 2016 21:20:23 +0800
Subject: tcp: Use ahash

This patch replaces uses of the long obsolete hash interface with
ahash.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Acked-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h       |  6 +-----
 net/ipv4/tcp.c          | 41 ++++++++++++++++++++++++++---------------
 net/ipv4/tcp_fastopen.c |  1 +
 net/ipv4/tcp_ipv4.c     | 23 +++++++++++++----------
 net/ipv6/tcp_ipv6.c     | 23 +++++++++++++----------
 5 files changed, 54 insertions(+), 40 deletions(-)

(limited to 'net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 8ea19977ea53..2a5b3b8daee8 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -27,7 +27,6 @@
 #include <linux/cache.h>
 #include <linux/percpu.h>
 #include <linux/skbuff.h>
-#include <linux/crypto.h>
 #include <linux/cryptohash.h>
 #include <linux/kref.h>
 #include <linux/ktime.h>
@@ -1325,9 +1324,6 @@ static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp)
 	tp->retransmit_skb_hint = NULL;
 }
 
-/* MD5 Signature */
-struct crypto_hash;
-
 union tcp_md5_addr {
 	struct in_addr  a4;
 #if IS_ENABLED(CONFIG_IPV6)
@@ -1376,7 +1372,7 @@ union tcp_md5sum_block {
 
 /* - pool: digest algorithm, hash description and scratch buffer */
 struct tcp_md5sig_pool {
-	struct hash_desc	md5_desc;
+	struct ahash_request	*md5_req;
 	union tcp_md5sum_block	md5_blk;
 };
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index fd17eec93525..91ffef3a55d2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -247,6 +247,7 @@
 
 #define pr_fmt(fmt) "TCP: " fmt
 
+#include <crypto/hash.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/types.h>
@@ -266,7 +267,6 @@
 #include <linux/swap.h>
 #include <linux/cache.h>
 #include <linux/err.h>
-#include <linux/crypto.h>
 #include <linux/time.h>
 #include <linux/slab.h>
 
@@ -2939,17 +2939,26 @@ static bool tcp_md5sig_pool_populated = false;
 
 static void __tcp_alloc_md5sig_pool(void)
 {
+	struct crypto_ahash *hash;
 	int cpu;
 
+	hash = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR_OR_NULL(hash))
+		return;
+
 	for_each_possible_cpu(cpu) {
-		if (!per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm) {
-			struct crypto_hash *hash;
+		struct ahash_request *req;
 
-			hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
-			if (IS_ERR_OR_NULL(hash))
-				return;
-			per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm = hash;
-		}
+		if (per_cpu(tcp_md5sig_pool, cpu).md5_req)
+			continue;
+
+		req = ahash_request_alloc(hash, GFP_KERNEL);
+		if (!req)
+			return;
+
+		ahash_request_set_callback(req, 0, NULL, NULL);
+
+		per_cpu(tcp_md5sig_pool, cpu).md5_req = req;
 	}
 	/* before setting tcp_md5sig_pool_populated, we must commit all writes
 	 * to memory. See smp_rmb() in tcp_get_md5sig_pool()
@@ -2999,7 +3008,6 @@ int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
 {
 	struct scatterlist sg;
 	struct tcphdr hdr;
-	int err;
 
 	/* We are not allowed to change tcphdr, make a local copy */
 	memcpy(&hdr, th, sizeof(hdr));
@@ -3007,8 +3015,8 @@ int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
 
 	/* options aren't included in the hash */
 	sg_init_one(&sg, &hdr, sizeof(hdr));
-	err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(hdr));
-	return err;
+	ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(hdr));
+	return crypto_ahash_update(hp->md5_req);
 }
 EXPORT_SYMBOL(tcp_md5_hash_header);
 
@@ -3017,7 +3025,7 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
 {
 	struct scatterlist sg;
 	const struct tcphdr *tp = tcp_hdr(skb);
-	struct hash_desc *desc = &hp->md5_desc;
+	struct ahash_request *req = hp->md5_req;
 	unsigned int i;
 	const unsigned int head_data_len = skb_headlen(skb) > header_len ?
 					   skb_headlen(skb) - header_len : 0;
@@ -3027,7 +3035,8 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
 	sg_init_table(&sg, 1);
 
 	sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
-	if (crypto_hash_update(desc, &sg, head_data_len))
+	ahash_request_set_crypt(req, &sg, NULL, head_data_len);
+	if (crypto_ahash_update(req))
 		return 1;
 
 	for (i = 0; i < shi->nr_frags; ++i) {
@@ -3037,7 +3046,8 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
 
 		sg_set_page(&sg, page, skb_frag_size(f),
 			    offset_in_page(offset));
-		if (crypto_hash_update(desc, &sg, skb_frag_size(f)))
+		ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f));
+		if (crypto_ahash_update(req))
 			return 1;
 	}
 
@@ -3054,7 +3064,8 @@ int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *ke
 	struct scatterlist sg;
 
 	sg_init_one(&sg, key->key, key->keylen);
-	return crypto_hash_update(&hp->md5_desc, &sg, key->keylen);
+	ahash_request_set_crypt(hp->md5_req, &sg, NULL, key->keylen);
+	return crypto_ahash_update(hp->md5_req);
 }
 EXPORT_SYMBOL(tcp_md5_hash_key);
 
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 55be6ac70cff..4c65ca1a86d1 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -1,3 +1,4 @@
+#include <linux/crypto.h>
 #include <linux/err.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 5ced3e4013e3..85854db3e094 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -81,7 +81,7 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 
-#include <linux/crypto.h>
+#include <crypto/hash.h>
 #include <linux/scatterlist.h>
 
 int sysctl_tcp_tw_reuse __read_mostly;
@@ -1031,21 +1031,22 @@ static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
 	bp->len = cpu_to_be16(nbytes);
 
 	sg_init_one(&sg, bp, sizeof(*bp));
-	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
+	ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(*bp));
+	return crypto_ahash_update(hp->md5_req);
 }
 
 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
 {
 	struct tcp_md5sig_pool *hp;
-	struct hash_desc *desc;
+	struct ahash_request *req;
 
 	hp = tcp_get_md5sig_pool();
 	if (!hp)
 		goto clear_hash_noput;
-	desc = &hp->md5_desc;
+	req = hp->md5_req;
 
-	if (crypto_hash_init(desc))
+	if (crypto_ahash_init(req))
 		goto clear_hash;
 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
 		goto clear_hash;
@@ -1053,7 +1054,8 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
 		goto clear_hash;
 	if (tcp_md5_hash_key(hp, key))
 		goto clear_hash;
-	if (crypto_hash_final(desc, md5_hash))
+	ahash_request_set_crypt(req, NULL, md5_hash, 0);
+	if (crypto_ahash_final(req))
 		goto clear_hash;
 
 	tcp_put_md5sig_pool();
@@ -1071,7 +1073,7 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
 			const struct sk_buff *skb)
 {
 	struct tcp_md5sig_pool *hp;
-	struct hash_desc *desc;
+	struct ahash_request *req;
 	const struct tcphdr *th = tcp_hdr(skb);
 	__be32 saddr, daddr;
 
@@ -1087,9 +1089,9 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
 	hp = tcp_get_md5sig_pool();
 	if (!hp)
 		goto clear_hash_noput;
-	desc = &hp->md5_desc;
+	req = hp->md5_req;
 
-	if (crypto_hash_init(desc))
+	if (crypto_ahash_init(req))
 		goto clear_hash;
 
 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
@@ -1100,7 +1102,8 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
 		goto clear_hash;
 	if (tcp_md5_hash_key(hp, key))
 		goto clear_hash;
-	if (crypto_hash_final(desc, md5_hash))
+	ahash_request_set_crypt(req, NULL, md5_hash, 0);
+	if (crypto_ahash_final(req))
 		goto clear_hash;
 
 	tcp_put_md5sig_pool();
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 006396e31cb0..bd5597227b18 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -66,7 +66,7 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 
-#include <linux/crypto.h>
+#include <crypto/hash.h>
 #include <linux/scatterlist.h>
 
 static void	tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb);
@@ -540,7 +540,8 @@ static int tcp_v6_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
 	bp->len = cpu_to_be32(nbytes);
 
 	sg_init_one(&sg, bp, sizeof(*bp));
-	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
+	ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(*bp));
+	return crypto_ahash_update(hp->md5_req);
 }
 
 static int tcp_v6_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
@@ -548,14 +549,14 @@ static int tcp_v6_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
 			       const struct tcphdr *th)
 {
 	struct tcp_md5sig_pool *hp;
-	struct hash_desc *desc;
+	struct ahash_request *req;
 
 	hp = tcp_get_md5sig_pool();
 	if (!hp)
 		goto clear_hash_noput;
-	desc = &hp->md5_desc;
+	req = hp->md5_req;
 
-	if (crypto_hash_init(desc))
+	if (crypto_ahash_init(req))
 		goto clear_hash;
 	if (tcp_v6_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
 		goto clear_hash;
@@ -563,7 +564,8 @@ static int tcp_v6_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
 		goto clear_hash;
 	if (tcp_md5_hash_key(hp, key))
 		goto clear_hash;
-	if (crypto_hash_final(desc, md5_hash))
+	ahash_request_set_crypt(req, NULL, md5_hash, 0);
+	if (crypto_ahash_final(req))
 		goto clear_hash;
 
 	tcp_put_md5sig_pool();
@@ -583,7 +585,7 @@ static int tcp_v6_md5_hash_skb(char *md5_hash,
 {
 	const struct in6_addr *saddr, *daddr;
 	struct tcp_md5sig_pool *hp;
-	struct hash_desc *desc;
+	struct ahash_request *req;
 	const struct tcphdr *th = tcp_hdr(skb);
 
 	if (sk) { /* valid for establish/request sockets */
@@ -598,9 +600,9 @@ static int tcp_v6_md5_hash_skb(char *md5_hash,
 	hp = tcp_get_md5sig_pool();
 	if (!hp)
 		goto clear_hash_noput;
-	desc = &hp->md5_desc;
+	req = hp->md5_req;
 
-	if (crypto_hash_init(desc))
+	if (crypto_ahash_init(req))
 		goto clear_hash;
 
 	if (tcp_v6_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
@@ -611,7 +613,8 @@ static int tcp_v6_md5_hash_skb(char *md5_hash,
 		goto clear_hash;
 	if (tcp_md5_hash_key(hp, key))
 		goto clear_hash;
-	if (crypto_hash_final(desc, md5_hash))
+	ahash_request_set_crypt(req, NULL, md5_hash, 0);
+	if (crypto_ahash_final(req))
 		goto clear_hash;
 
 	tcp_put_md5sig_pool();
-- 
cgit v1.2.3-59-g8ed1b


From e9f6452e0e55ceea004f5bc0802fb14778d16c42 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Tue, 26 Jan 2016 18:54:55 +0800
Subject: libceph: Remove unnecessary ivsize variables

This patch removes the unnecessary ivsize variabls as they always
have the value of AES_BLOCK_SIZE.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 net/ceph/crypto.c | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
index fb9cb2bb0894..db2847ac5f12 100644
--- a/net/ceph/crypto.c
+++ b/net/ceph/crypto.c
@@ -166,8 +166,7 @@ static int ceph_aes_encrypt(const void *key, int key_len,
 	struct crypto_skcipher *tfm = ceph_crypto_alloc_cipher();
 	SKCIPHER_REQUEST_ON_STACK(req, tfm);
 	int ret;
-	int ivsize = AES_BLOCK_SIZE;
-	char iv[ivsize];
+	char iv[AES_BLOCK_SIZE];
 	size_t zero_padding = (0x10 - (src_len & 0x0f));
 	char pad[16];
 
@@ -186,7 +185,7 @@ static int ceph_aes_encrypt(const void *key, int key_len,
 		goto out_tfm;
 
 	crypto_skcipher_setkey((void *)tfm, key, key_len);
-	memcpy(iv, aes_iv, ivsize);
+	memcpy(iv, aes_iv, AES_BLOCK_SIZE);
 
 	skcipher_request_set_tfm(req, tfm);
 	skcipher_request_set_callback(req, 0, NULL, NULL);
@@ -229,8 +228,7 @@ static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
 	struct crypto_skcipher *tfm = ceph_crypto_alloc_cipher();
 	SKCIPHER_REQUEST_ON_STACK(req, tfm);
 	int ret;
-	int ivsize = AES_BLOCK_SIZE;
-	char iv[ivsize];
+	char iv[AES_BLOCK_SIZE];
 	size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
 	char pad[16];
 
@@ -250,7 +248,7 @@ static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
 		goto out_tfm;
 
 	crypto_skcipher_setkey((void *)tfm, key, key_len);
-	memcpy(iv, aes_iv, ivsize);
+	memcpy(iv, aes_iv, AES_BLOCK_SIZE);
 
 	skcipher_request_set_tfm(req, tfm);
 	skcipher_request_set_callback(req, 0, NULL, NULL);
@@ -294,8 +292,7 @@ static int ceph_aes_decrypt(const void *key, int key_len,
 	struct crypto_skcipher *tfm = ceph_crypto_alloc_cipher();
 	SKCIPHER_REQUEST_ON_STACK(req, tfm);
 	char pad[16];
-	int ivsize = AES_BLOCK_SIZE;
-	char iv[16];
+	char iv[AES_BLOCK_SIZE];
 	int ret;
 	int last_byte;
 
@@ -310,7 +307,7 @@ static int ceph_aes_decrypt(const void *key, int key_len,
 		goto out_tfm;
 
 	crypto_skcipher_setkey((void *)tfm, key, key_len);
-	memcpy(iv, aes_iv, ivsize);
+	memcpy(iv, aes_iv, AES_BLOCK_SIZE);
 
 	skcipher_request_set_tfm(req, tfm);
 	skcipher_request_set_callback(req, 0, NULL, NULL);
@@ -363,8 +360,7 @@ static int ceph_aes_decrypt2(const void *key, int key_len,
 	struct crypto_skcipher *tfm = ceph_crypto_alloc_cipher();
 	SKCIPHER_REQUEST_ON_STACK(req, tfm);
 	char pad[16];
-	int ivsize = AES_BLOCK_SIZE;
-	char iv[ivsize];
+	char iv[AES_BLOCK_SIZE];
 	int ret;
 	int last_byte;
 
@@ -380,7 +376,7 @@ static int ceph_aes_decrypt2(const void *key, int key_len,
 		goto out_tfm;
 
 	crypto_skcipher_setkey((void *)tfm, key, key_len);
-	memcpy(iv, aes_iv, ivsize);
+	memcpy(iv, aes_iv, AES_BLOCK_SIZE);
 
 	skcipher_request_set_tfm(req, tfm);
 	skcipher_request_set_callback(req, 0, NULL, NULL);
-- 
cgit v1.2.3-59-g8ed1b


From 582e20a03b9bcbddaeaffb38800323ddcc4a9173 Mon Sep 17 00:00:00 2001
From: Peter Hurley <peter@hurleysoftware.com>
Date: Sun, 10 Jan 2016 22:40:53 -0800
Subject: NFC: nci: Remove dead code

The N_NCI ldisc does not define a flush_buffer() ldisc method, so
the check when opening the ldisc is always false.

Signed-off-by: Peter Hurley <peter@hurleysoftware.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 net/nfc/nci/uart.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/nfc/nci/uart.c b/net/nfc/nci/uart.c
index 21d8875673a4..c468eabd6943 100644
--- a/net/nfc/nci/uart.c
+++ b/net/nfc/nci/uart.c
@@ -171,14 +171,7 @@ static int nci_uart_tty_open(struct tty_struct *tty)
 	tty->disc_data = NULL;
 	tty->receive_room = 65536;
 
-	/* Flush any pending characters in the driver and line discipline. */
-
-	/* FIXME: why is this needed. Note don't use ldisc_ref here as the
-	 * open path is before the ldisc is referencable.
-	 */
-
-	if (tty->ldisc->ops->flush_buffer)
-		tty->ldisc->ops->flush_buffer(tty);
+	/* Flush any pending characters in the driver */
 	tty_driver_flush_buffer(tty);
 
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From 9db276f8f02145068d8c04614bc28c2a4532a8c7 Mon Sep 17 00:00:00 2001
From: Peter Hurley <peter@hurleysoftware.com>
Date: Sun, 10 Jan 2016 20:36:15 -0800
Subject: tty: Use termios c_*flag macros

Expressions of the form "tty->termios.c_*flag & FLAG"
are more clearly expressed with the termios flags macros,
I_FLAG(), C_FLAG(), O_FLAG(), and L_FLAG().

Convert treewide.

Signed-off-by: Peter Hurley <peter@hurleysoftware.com>
Acked-by: Johan Hovold <johan@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/char/pcmcia/synclink_cs.c    | 21 ++++++++-------------
 drivers/mmc/card/sdio_uart.c         | 14 +++++++-------
 drivers/staging/dgap/dgap.c          |  2 +-
 drivers/staging/dgnc/dgnc_tty.c      |  2 +-
 drivers/tty/amiserial.c              | 22 ++++++++--------------
 drivers/tty/cyclades.c               | 14 ++++++--------
 drivers/tty/isicom.c                 |  3 +--
 drivers/tty/mxser.c                  | 10 ++++------
 drivers/tty/n_gsm.c                  |  6 +++---
 drivers/tty/pty.c                    |  3 +--
 drivers/tty/rocket.c                 |  8 ++++----
 drivers/tty/serial/68328serial.c     |  4 +---
 drivers/tty/serial/crisv10.c         | 18 +++++++-----------
 drivers/tty/serial/jsm/jsm_tty.c     |  3 +--
 drivers/tty/serial/serial_core.c     | 20 +++++++++-----------
 drivers/tty/synclink.c               | 23 +++++++++--------------
 drivers/tty/synclink_gt.c            | 19 +++++++------------
 drivers/tty/synclinkmp.c             | 19 +++++++------------
 drivers/tty/tty_ioctl.c              | 12 ++++++------
 drivers/tty/tty_port.c               |  2 +-
 drivers/usb/serial/cypress_m8.c      |  3 +--
 drivers/usb/serial/digi_acceleport.c |  8 ++++----
 drivers/usb/serial/io_edgeport.c     |  4 ++--
 drivers/usb/serial/mct_u232.c        |  2 +-
 drivers/usb/serial/mos7720.c         |  4 ++--
 drivers/usb/serial/mos7840.c         |  4 ++--
 net/irda/ircomm/ircomm_tty.c         | 11 +++++------
 net/irda/ircomm/ircomm_tty_ioctl.c   | 13 ++++---------
 28 files changed, 113 insertions(+), 161 deletions(-)

(limited to 'net')

diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c
index 45df4bf914f8..22c27652e46a 100644
--- a/drivers/char/pcmcia/synclink_cs.c
+++ b/drivers/char/pcmcia/synclink_cs.c
@@ -1349,7 +1349,7 @@ static void shutdown(MGSLPC_INFO * info, struct tty_struct *tty)
 	/* TODO:disable interrupts instead of reset to preserve signal states */
 	reset_device(info);
 
-	if (!tty || tty->termios.c_cflag & HUPCL) {
+	if (!tty || C_HUPCL(tty)) {
 		info->serial_signals &= ~(SerialSignal_RTS | SerialSignal_DTR);
 		set_signals(info);
 	}
@@ -1390,7 +1390,7 @@ static void mgslpc_program_hw(MGSLPC_INFO *info, struct tty_struct *tty)
 	port_irq_enable(info, (unsigned char) PVR_DSR | PVR_RI);
 	get_signals(info);
 
-	if (info->netcount || (tty && (tty->termios.c_cflag & CREAD)))
+	if (info->netcount || (tty && C_CREAD(tty)))
 		rx_start(info);
 
 	spin_unlock_irqrestore(&info->lock, flags);
@@ -1733,7 +1733,7 @@ static void mgslpc_throttle(struct tty_struct * tty)
 	if (I_IXOFF(tty))
 		mgslpc_send_xchar(tty, STOP_CHAR(tty));
 
-	if (tty->termios.c_cflag & CRTSCTS) {
+	if (C_CRTSCTS(tty)) {
 		spin_lock_irqsave(&info->lock, flags);
 		info->serial_signals &= ~SerialSignal_RTS;
 		set_signals(info);
@@ -1762,7 +1762,7 @@ static void mgslpc_unthrottle(struct tty_struct * tty)
 			mgslpc_send_xchar(tty, START_CHAR(tty));
 	}
 
-	if (tty->termios.c_cflag & CRTSCTS) {
+	if (C_CRTSCTS(tty)) {
 		spin_lock_irqsave(&info->lock, flags);
 		info->serial_signals |= SerialSignal_RTS;
 		set_signals(info);
@@ -2306,8 +2306,7 @@ static void mgslpc_set_termios(struct tty_struct *tty, struct ktermios *old_term
 	mgslpc_change_params(info, tty);
 
 	/* Handle transition to B0 status */
-	if (old_termios->c_cflag & CBAUD &&
-	    !(tty->termios.c_cflag & CBAUD)) {
+	if ((old_termios->c_cflag & CBAUD) && !C_BAUD(tty)) {
 		info->serial_signals &= ~(SerialSignal_RTS | SerialSignal_DTR);
 		spin_lock_irqsave(&info->lock, flags);
 		set_signals(info);
@@ -2315,21 +2314,17 @@ static void mgslpc_set_termios(struct tty_struct *tty, struct ktermios *old_term
 	}
 
 	/* Handle transition away from B0 status */
-	if (!(old_termios->c_cflag & CBAUD) &&
-	    tty->termios.c_cflag & CBAUD) {
+	if (!(old_termios->c_cflag & CBAUD) && C_BAUD(tty)) {
 		info->serial_signals |= SerialSignal_DTR;
-		if (!(tty->termios.c_cflag & CRTSCTS) ||
-		    !test_bit(TTY_THROTTLED, &tty->flags)) {
+		if (!C_CRTSCTS(tty) || !test_bit(TTY_THROTTLED, &tty->flags))
 			info->serial_signals |= SerialSignal_RTS;
-		}
 		spin_lock_irqsave(&info->lock, flags);
 		set_signals(info);
 		spin_unlock_irqrestore(&info->lock, flags);
 	}
 
 	/* Handle turning off CRTSCTS */
-	if (old_termios->c_cflag & CRTSCTS &&
-	    !(tty->termios.c_cflag & CRTSCTS)) {
+	if (old_termios->c_cflag & CRTSCTS && !C_CRTSCTS(tty)) {
 		tty->hw_stopped = 0;
 		tx_release(tty);
 	}
diff --git a/drivers/mmc/card/sdio_uart.c b/drivers/mmc/card/sdio_uart.c
index d2de5925b73e..5415056f9aa5 100644
--- a/drivers/mmc/card/sdio_uart.c
+++ b/drivers/mmc/card/sdio_uart.c
@@ -493,7 +493,7 @@ static void sdio_uart_check_modem_status(struct sdio_uart_port *port)
 	if (status & UART_MSR_DCTS) {
 		port->icount.cts++;
 		tty = tty_port_tty_get(&port->port);
-		if (tty && (tty->termios.c_cflag & CRTSCTS)) {
+		if (tty && C_CRTSCTS(tty)) {
 			int cts = (status & UART_MSR_CTS);
 			if (tty->hw_stopped) {
 				if (cts) {
@@ -648,10 +648,10 @@ static int sdio_uart_activate(struct tty_port *tport, struct tty_struct *tty)
 
 	sdio_uart_change_speed(port, &tty->termios, NULL);
 
-	if (tty->termios.c_cflag & CBAUD)
+	if (C_BAUD(tty))
 		sdio_uart_set_mctrl(port, TIOCM_RTS | TIOCM_DTR);
 
-	if (tty->termios.c_cflag & CRTSCTS)
+	if (C_CRTSCTS(tty))
 		if (!(sdio_uart_get_mctrl(port) & TIOCM_CTS))
 			tty->hw_stopped = 1;
 
@@ -833,7 +833,7 @@ static void sdio_uart_throttle(struct tty_struct *tty)
 {
 	struct sdio_uart_port *port = tty->driver_data;
 
-	if (!I_IXOFF(tty) && !(tty->termios.c_cflag & CRTSCTS))
+	if (!I_IXOFF(tty) && !C_CRTSCTS(tty))
 		return;
 
 	if (sdio_uart_claim_func(port) != 0)
@@ -844,7 +844,7 @@ static void sdio_uart_throttle(struct tty_struct *tty)
 		sdio_uart_start_tx(port);
 	}
 
-	if (tty->termios.c_cflag & CRTSCTS)
+	if (C_CRTSCTS(tty))
 		sdio_uart_clear_mctrl(port, TIOCM_RTS);
 
 	sdio_uart_irq(port->func);
@@ -855,7 +855,7 @@ static void sdio_uart_unthrottle(struct tty_struct *tty)
 {
 	struct sdio_uart_port *port = tty->driver_data;
 
-	if (!I_IXOFF(tty) && !(tty->termios.c_cflag & CRTSCTS))
+	if (!I_IXOFF(tty) && !C_CRTSCTS(tty))
 		return;
 
 	if (sdio_uart_claim_func(port) != 0)
@@ -870,7 +870,7 @@ static void sdio_uart_unthrottle(struct tty_struct *tty)
 		}
 	}
 
-	if (tty->termios.c_cflag & CRTSCTS)
+	if (C_CRTSCTS(tty))
 		sdio_uart_set_mctrl(port, TIOCM_RTS);
 
 	sdio_uart_irq(port->func);
diff --git a/drivers/staging/dgap/dgap.c b/drivers/staging/dgap/dgap.c
index 5413ed8c29ff..294c1c83aa4d 100644
--- a/drivers/staging/dgap/dgap.c
+++ b/drivers/staging/dgap/dgap.c
@@ -1530,7 +1530,7 @@ static void dgap_input(struct channel_t *ch)
 	if ((bd->state != BOARD_READY) || !tp  ||
 	    (tp->magic != TTY_MAGIC) ||
 	    !(ch->ch_tun.un_flags & UN_ISOPEN) ||
-	    !(tp->termios.c_cflag & CREAD) ||
+	    !C_CREAD(tp) ||
 	    (ch->ch_tun.un_flags & UN_CLOSING)) {
 		writew(head, &bs->rx_tail);
 		writeb(1, &bs->idata);
diff --git a/drivers/staging/dgnc/dgnc_tty.c b/drivers/staging/dgnc/dgnc_tty.c
index 4a3d2c6f7eac..8b1ba65a6984 100644
--- a/drivers/staging/dgnc/dgnc_tty.c
+++ b/drivers/staging/dgnc/dgnc_tty.c
@@ -541,7 +541,7 @@ void dgnc_input(struct channel_t *ch)
 	 */
 	if (!tp || (tp->magic != TTY_MAGIC) ||
 	    !(ch->ch_tun.un_flags & UN_ISOPEN) ||
-	    !(tp->termios.c_cflag & CREAD) ||
+	    !C_CREAD(tp) ||
 	    (ch->ch_tun.un_flags & UN_CLOSING)) {
 		ch->ch_r_head = tail;
 
diff --git a/drivers/tty/amiserial.c b/drivers/tty/amiserial.c
index 6ba5681b6385..eacf4c9f3b29 100644
--- a/drivers/tty/amiserial.c
+++ b/drivers/tty/amiserial.c
@@ -639,7 +639,7 @@ static void shutdown(struct tty_struct *tty, struct serial_state *info)
 	custom.adkcon = AC_UARTBRK;
 	mb();
 
-	if (tty->termios.c_cflag & HUPCL)
+	if (C_HUPCL(tty))
 		info->MCR &= ~(SER_DTR|SER_RTS);
 	rtsdtr_ctrl(info->MCR);
 
@@ -974,7 +974,7 @@ static void rs_throttle(struct tty_struct * tty)
 	if (I_IXOFF(tty))
 		rs_send_xchar(tty, STOP_CHAR(tty));
 
-	if (tty->termios.c_cflag & CRTSCTS)
+	if (C_CRTSCTS(tty))
 		info->MCR &= ~SER_RTS;
 
 	local_irq_save(flags);
@@ -999,7 +999,7 @@ static void rs_unthrottle(struct tty_struct * tty)
 		else
 			rs_send_xchar(tty, START_CHAR(tty));
 	}
-	if (tty->termios.c_cflag & CRTSCTS)
+	if (C_CRTSCTS(tty))
 		info->MCR |= SER_RTS;
 	local_irq_save(flags);
 	rtsdtr_ctrl(info->MCR);
@@ -1332,8 +1332,7 @@ static void rs_set_termios(struct tty_struct *tty, struct ktermios *old_termios)
 	change_speed(tty, info, old_termios);
 
 	/* Handle transition to B0 status */
-	if ((old_termios->c_cflag & CBAUD) &&
-	    !(cflag & CBAUD)) {
+	if ((old_termios->c_cflag & CBAUD) && !(cflag & CBAUD)) {
 		info->MCR &= ~(SER_DTR|SER_RTS);
 		local_irq_save(flags);
 		rtsdtr_ctrl(info->MCR);
@@ -1341,21 +1340,17 @@ static void rs_set_termios(struct tty_struct *tty, struct ktermios *old_termios)
 	}
 
 	/* Handle transition away from B0 status */
-	if (!(old_termios->c_cflag & CBAUD) &&
-	    (cflag & CBAUD)) {
+	if (!(old_termios->c_cflag & CBAUD) && (cflag & CBAUD)) {
 		info->MCR |= SER_DTR;
-		if (!(tty->termios.c_cflag & CRTSCTS) || 
-		    !test_bit(TTY_THROTTLED, &tty->flags)) {
+		if (!C_CRTSCTS(tty) || !test_bit(TTY_THROTTLED, &tty->flags))
 			info->MCR |= SER_RTS;
-		}
 		local_irq_save(flags);
 		rtsdtr_ctrl(info->MCR);
 		local_irq_restore(flags);
 	}
 
 	/* Handle turning off CRTSCTS */
-	if ((old_termios->c_cflag & CRTSCTS) &&
-	    !(tty->termios.c_cflag & CRTSCTS)) {
+	if ((old_termios->c_cflag & CRTSCTS) && !C_CRTSCTS(tty)) {
 		tty->hw_stopped = 0;
 		rs_start(tty);
 	}
@@ -1367,8 +1362,7 @@ static void rs_set_termios(struct tty_struct *tty, struct ktermios *old_termios)
 	 * XXX  It's not clear whether the current behavior is correct
 	 * or not.  Hence, this may change.....
 	 */
-	if (!(old_termios->c_cflag & CLOCAL) &&
-	    (tty->termios.c_cflag & CLOCAL))
+	if (!(old_termios->c_cflag & CLOCAL) && C_CLOCAL(tty))
 		wake_up_interruptible(&info->open_wait);
 #endif
 }
diff --git a/drivers/tty/cyclades.c b/drivers/tty/cyclades.c
index a48e7e66b970..d67e542bab1c 100644
--- a/drivers/tty/cyclades.c
+++ b/drivers/tty/cyclades.c
@@ -1440,7 +1440,7 @@ static void cy_shutdown(struct cyclades_port *info, struct tty_struct *tty)
 			info->port.xmit_buf = NULL;
 			free_page((unsigned long)temp);
 		}
-		if (tty->termios.c_cflag & HUPCL)
+		if (C_HUPCL(tty))
 			cyy_change_rts_dtr(info, 0, TIOCM_RTS | TIOCM_DTR);
 
 		cyy_issue_cmd(info, CyCHAN_CTL | CyDIS_RCVR);
@@ -1469,7 +1469,7 @@ static void cy_shutdown(struct cyclades_port *info, struct tty_struct *tty)
 			free_page((unsigned long)temp);
 		}
 
-		if (tty->termios.c_cflag & HUPCL)
+		if (C_HUPCL(tty))
 			tty_port_lower_dtr_rts(&info->port);
 
 		set_bit(TTY_IO_ERROR, &tty->flags);
@@ -2795,8 +2795,7 @@ static void cy_set_termios(struct tty_struct *tty, struct ktermios *old_termios)
 
 	cy_set_line_char(info, tty);
 
-	if ((old_termios->c_cflag & CRTSCTS) &&
-			!(tty->termios.c_cflag & CRTSCTS)) {
+	if ((old_termios->c_cflag & CRTSCTS) && !C_CRTSCTS(tty)) {
 		tty->hw_stopped = 0;
 		cy_start(tty);
 	}
@@ -2807,8 +2806,7 @@ static void cy_set_termios(struct tty_struct *tty, struct ktermios *old_termios)
 	 * XXX  It's not clear whether the current behavior is correct
 	 * or not.  Hence, this may change.....
 	 */
-	if (!(old_termios->c_cflag & CLOCAL) &&
-	    (tty->termios.c_cflag & CLOCAL))
+	if (!(old_termios->c_cflag & CLOCAL) && C_CLOCAL(tty))
 		wake_up_interruptible(&info->port.open_wait);
 #endif
 }				/* cy_set_termios */
@@ -2868,7 +2866,7 @@ static void cy_throttle(struct tty_struct *tty)
 			info->throttle = 1;
 	}
 
-	if (tty->termios.c_cflag & CRTSCTS) {
+	if (C_CRTSCTS(tty)) {
 		if (!cy_is_Z(card)) {
 			spin_lock_irqsave(&card->card_lock, flags);
 			cyy_change_rts_dtr(info, 0, TIOCM_RTS);
@@ -2905,7 +2903,7 @@ static void cy_unthrottle(struct tty_struct *tty)
 			cy_send_xchar(tty, START_CHAR(tty));
 	}
 
-	if (tty->termios.c_cflag & CRTSCTS) {
+	if (C_CRTSCTS(tty)) {
 		card = info->card;
 		if (!cy_is_Z(card)) {
 			spin_lock_irqsave(&card->card_lock, flags);
diff --git a/drivers/tty/isicom.c b/drivers/tty/isicom.c
index 99875949bfb7..8bf67630018b 100644
--- a/drivers/tty/isicom.c
+++ b/drivers/tty/isicom.c
@@ -1204,8 +1204,7 @@ static void isicom_set_termios(struct tty_struct *tty,
 	isicom_config_port(tty);
 	spin_unlock_irqrestore(&port->card->card_lock, flags);
 
-	if ((old_termios->c_cflag & CRTSCTS) &&
-			!(tty->termios.c_cflag & CRTSCTS)) {
+	if ((old_termios->c_cflag & CRTSCTS) && !C_CRTSCTS(tty)) {
 		tty->hw_stopped = 0;
 		isicom_start(tty);
 	}
diff --git a/drivers/tty/mxser.c b/drivers/tty/mxser.c
index 4c4a23674569..e9600cece8da 100644
--- a/drivers/tty/mxser.c
+++ b/drivers/tty/mxser.c
@@ -1864,7 +1864,7 @@ static void mxser_stoprx(struct tty_struct *tty)
 		}
 	}
 
-	if (tty->termios.c_cflag & CRTSCTS) {
+	if (C_CRTSCTS(tty)) {
 		info->MCR &= ~UART_MCR_RTS;
 		outb(info->MCR, info->ioaddr + UART_MCR);
 	}
@@ -1901,7 +1901,7 @@ static void mxser_unthrottle(struct tty_struct *tty)
 		}
 	}
 
-	if (tty->termios.c_cflag & CRTSCTS) {
+	if (C_CRTSCTS(tty)) {
 		info->MCR |= UART_MCR_RTS;
 		outb(info->MCR, info->ioaddr + UART_MCR);
 	}
@@ -1949,15 +1949,13 @@ static void mxser_set_termios(struct tty_struct *tty, struct ktermios *old_termi
 	mxser_change_speed(tty, old_termios);
 	spin_unlock_irqrestore(&info->slock, flags);
 
-	if ((old_termios->c_cflag & CRTSCTS) &&
-			!(tty->termios.c_cflag & CRTSCTS)) {
+	if ((old_termios->c_cflag & CRTSCTS) && !C_CRTSCTS(tty)) {
 		tty->hw_stopped = 0;
 		mxser_start(tty);
 	}
 
 	/* Handle sw stopped */
-	if ((old_termios->c_iflag & IXON) &&
-			!(tty->termios.c_iflag & IXON)) {
+	if ((old_termios->c_iflag & IXON) && !I_IXON(tty)) {
 		tty->stopped = 0;
 
 		if (info->board->chip_flag) {
diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c
index e3cc27749344..c01620780f5b 100644
--- a/drivers/tty/n_gsm.c
+++ b/drivers/tty/n_gsm.c
@@ -1066,7 +1066,7 @@ static void gsm_process_modem(struct tty_struct *tty, struct gsm_dlci *dlci,
 	/* Carrier drop -> hangup */
 	if (tty) {
 		if ((mlines & TIOCM_CD) == 0 && (dlci->modem_rx & TIOCM_CD))
-			if (!(tty->termios.c_cflag & CLOCAL))
+			if (!C_CLOCAL(tty))
 				tty_hangup(tty);
 	}
 	if (brk & 0x01)
@@ -3116,7 +3116,7 @@ static void gsmtty_throttle(struct tty_struct *tty)
 	struct gsm_dlci *dlci = tty->driver_data;
 	if (dlci->state == DLCI_CLOSED)
 		return;
-	if (tty->termios.c_cflag & CRTSCTS)
+	if (C_CRTSCTS(tty))
 		dlci->modem_tx &= ~TIOCM_DTR;
 	dlci->throttled = 1;
 	/* Send an MSC with DTR cleared */
@@ -3128,7 +3128,7 @@ static void gsmtty_unthrottle(struct tty_struct *tty)
 	struct gsm_dlci *dlci = tty->driver_data;
 	if (dlci->state == DLCI_CLOSED)
 		return;
-	if (tty->termios.c_cflag & CRTSCTS)
+	if (C_CRTSCTS(tty))
 		dlci->modem_tx |= TIOCM_DTR;
 	dlci->throttled = 0;
 	/* Send an MSC with DTR set */
diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c
index be5020d567ae..78170e7aa3a3 100644
--- a/drivers/tty/pty.c
+++ b/drivers/tty/pty.c
@@ -263,8 +263,7 @@ static void pty_set_termios(struct tty_struct *tty,
 {
 	/* See if packet mode change of state. */
 	if (tty->link && tty->link->packet) {
-		int extproc = (old_termios->c_lflag & EXTPROC) |
-				(tty->termios.c_lflag & EXTPROC);
+		int extproc = (old_termios->c_lflag & EXTPROC) | L_EXTPROC(tty);
 		int old_flow = ((old_termios->c_iflag & IXON) &&
 				(old_termios->c_cc[VSTOP] == '\023') &&
 				(old_termios->c_cc[VSTART] == '\021'));
diff --git a/drivers/tty/rocket.c b/drivers/tty/rocket.c
index 9dd02daf45ca..2ab3b6fdb675 100644
--- a/drivers/tty/rocket.c
+++ b/drivers/tty/rocket.c
@@ -959,7 +959,7 @@ static int rp_open(struct tty_struct *tty, struct file *filp)
 			tty->alt_speed = 460800;
 
 		configure_r_port(tty, info, NULL);
-		if (tty->termios.c_cflag & CBAUD) {
+		if (C_BAUD(tty)) {
 			sSetDTR(cp);
 			sSetRTS(cp);
 		}
@@ -1084,18 +1084,18 @@ static void rp_set_termios(struct tty_struct *tty,
 	cp = &info->channel;
 
 	/* Handle transition to B0 status */
-	if ((old_termios->c_cflag & CBAUD) && !(tty->termios.c_cflag & CBAUD)) {
+	if ((old_termios->c_cflag & CBAUD) && !C_BAUD(tty)) {
 		sClrDTR(cp);
 		sClrRTS(cp);
 	}
 
 	/* Handle transition away from B0 status */
-	if (!(old_termios->c_cflag & CBAUD) && (tty->termios.c_cflag & CBAUD)) {
+	if (!(old_termios->c_cflag & CBAUD) && C_BAUD(tty)) {
 		sSetRTS(cp);
 		sSetDTR(cp);
 	}
 
-	if ((old_termios->c_cflag & CRTSCTS) && !(tty->termios.c_cflag & CRTSCTS))
+	if ((old_termios->c_cflag & CRTSCTS) && !C_CRTSCTS(tty))
 		rp_start(tty);
 }
 
diff --git a/drivers/tty/serial/68328serial.c b/drivers/tty/serial/68328serial.c
index 90639b590a10..4931212e5cf6 100644
--- a/drivers/tty/serial/68328serial.c
+++ b/drivers/tty/serial/68328serial.c
@@ -976,10 +976,8 @@ static void rs_set_termios(struct tty_struct *tty, struct ktermios *old_termios)
 
 	change_speed(info, tty);
 
-	if ((old_termios->c_cflag & CRTSCTS) &&
-	    !(tty->termios.c_cflag & CRTSCTS))
+	if ((old_termios->c_cflag & CRTSCTS) && !C_CRTSCTS(tty))
 		rs_start(tty);
-	
 }
 
 /*
diff --git a/drivers/tty/serial/crisv10.c b/drivers/tty/serial/crisv10.c
index e98aef797065..37f10c4a3faf 100644
--- a/drivers/tty/serial/crisv10.c
+++ b/drivers/tty/serial/crisv10.c
@@ -1413,9 +1413,8 @@ rs_stop(struct tty_struct *tty)
 		xoff = IO_FIELD(R_SERIAL0_XOFF, xoff_char,
 				STOP_CHAR(info->port.tty));
 		xoff |= IO_STATE(R_SERIAL0_XOFF, tx_stop, stop);
-		if (tty->termios.c_iflag & IXON ) {
+		if (I_IXON(tty))
 			xoff |= IO_STATE(R_SERIAL0_XOFF, auto_xoff, enable);
-		}
 
 		*((unsigned long *)&info->ioport[REG_XOFF]) = xoff;
 		local_irq_restore(flags);
@@ -1436,9 +1435,8 @@ rs_start(struct tty_struct *tty)
 					 info->xmit.tail,SERIAL_XMIT_SIZE)));
 		xoff = IO_FIELD(R_SERIAL0_XOFF, xoff_char, STOP_CHAR(tty));
 		xoff |= IO_STATE(R_SERIAL0_XOFF, tx_stop, enable);
-		if (tty->termios.c_iflag & IXON ) {
+		if (I_IXON(tty))
 			xoff |= IO_STATE(R_SERIAL0_XOFF, auto_xoff, enable);
-		}
 
 		*((unsigned long *)&info->ioport[REG_XOFF]) = xoff;
 		if (!info->uses_dma_out &&
@@ -3166,7 +3164,7 @@ rs_throttle(struct tty_struct * tty)
 	DFLOW(DEBUG_LOG(info->line,"rs_throttle\n"));
 
 	/* Do RTS before XOFF since XOFF might take some time */
-	if (tty->termios.c_cflag & CRTSCTS) {
+	if (C_CRTSCTS(tty)) {
 		/* Turn off RTS line */
 		e100_rts(info, 0);
 	}
@@ -3185,7 +3183,7 @@ rs_unthrottle(struct tty_struct * tty)
 	DFLOW(DEBUG_LOG(info->line,"rs_unthrottle ldisc\n"));
 	DFLOW(DEBUG_LOG(info->line,"rs_unthrottle flip.count: %i\n", tty->flip.count));
 	/* Do RTS before XOFF since XOFF might take some time */
-	if (tty->termios.c_cflag & CRTSCTS) {
+	if (C_CRTSCTS(tty)) {
 		/* Assert RTS line  */
 		e100_rts(info, 1);
 	}
@@ -3553,8 +3551,7 @@ rs_set_termios(struct tty_struct *tty, struct ktermios *old_termios)
 	change_speed(info);
 
 	/* Handle turning off CRTSCTS */
-	if ((old_termios->c_cflag & CRTSCTS) &&
-	    !(tty->termios.c_cflag & CRTSCTS))
+	if ((old_termios->c_cflag & CRTSCTS) && !C_CRTSCTS(tty))
 		rs_start(tty);
 
 }
@@ -3765,9 +3762,8 @@ block_til_ready(struct tty_struct *tty, struct file * filp,
 		return 0;
 	}
 
-	if (tty->termios.c_cflag & CLOCAL) {
-			do_clocal = 1;
-	}
+	if (C_CLOCAL(tty))
+		do_clocal = 1;
 
 	/*
 	 * Block waiting for the carrier detect and the line to become
diff --git a/drivers/tty/serial/jsm/jsm_tty.c b/drivers/tty/serial/jsm/jsm_tty.c
index 524e86ab3cae..00cac10ae75a 100644
--- a/drivers/tty/serial/jsm/jsm_tty.c
+++ b/drivers/tty/serial/jsm/jsm_tty.c
@@ -569,8 +569,7 @@ void jsm_input(struct jsm_channel *ch)
 	 *If the device is not open, or CREAD is off, flush
 	 *input data and return immediately.
 	 */
-	if (!tp ||
-		!(tp->termios.c_cflag & CREAD) ) {
+	if (!tp || !C_CREAD(tp)) {
 
 		jsm_dbg(READ, &ch->ch_bd->pci_dev,
 			"input. dropping %d bytes on port %d...\n",
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index b1f54ab1818c..1efdc2b476ea 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -171,14 +171,12 @@ static int uart_port_startup(struct tty_struct *tty, struct uart_state *state,
 		 */
 		uart_change_speed(tty, state, NULL);
 
-		if (init_hw) {
-			/*
-			 * Setup the RTS and DTR signals once the
-			 * port is open and ready to respond.
-			 */
-			if (tty->termios.c_cflag & CBAUD)
-				uart_set_mctrl(uport, TIOCM_RTS | TIOCM_DTR);
-		}
+		/*
+		 * Setup the RTS and DTR signals once the
+		 * port is open and ready to respond.
+		 */
+		if (init_hw && C_BAUD(tty))
+			uart_set_mctrl(uport, TIOCM_RTS | TIOCM_DTR);
 	}
 
 	/*
@@ -240,7 +238,7 @@ static void uart_shutdown(struct tty_struct *tty, struct uart_state *state)
 		if (uart_console(uport) && tty)
 			uport->cons->cflag = tty->termios.c_cflag;
 
-		if (!tty || (tty->termios.c_cflag & HUPCL))
+		if (!tty || C_HUPCL(tty))
 			uart_clear_mctrl(uport, TIOCM_DTR | TIOCM_RTS);
 
 		uart_port_shutdown(port);
@@ -639,7 +637,7 @@ static void uart_throttle(struct tty_struct *tty)
 
 	if (I_IXOFF(tty))
 		mask |= UPSTAT_AUTOXOFF;
-	if (tty->termios.c_cflag & CRTSCTS)
+	if (C_CRTSCTS(tty))
 		mask |= UPSTAT_AUTORTS;
 
 	if (port->status & mask) {
@@ -662,7 +660,7 @@ static void uart_unthrottle(struct tty_struct *tty)
 
 	if (I_IXOFF(tty))
 		mask |= UPSTAT_AUTOXOFF;
-	if (tty->termios.c_cflag & CRTSCTS)
+	if (C_CRTSCTS(tty))
 		mask |= UPSTAT_AUTORTS;
 
 	if (port->status & mask) {
diff --git a/drivers/tty/synclink.c b/drivers/tty/synclink.c
index 6188059fd523..f5476e270734 100644
--- a/drivers/tty/synclink.c
+++ b/drivers/tty/synclink.c
@@ -2363,7 +2363,7 @@ static void mgsl_throttle(struct tty_struct * tty)
 	if (I_IXOFF(tty))
 		mgsl_send_xchar(tty, STOP_CHAR(tty));
 
-	if (tty->termios.c_cflag & CRTSCTS) {
+	if (C_CRTSCTS(tty)) {
 		spin_lock_irqsave(&info->irq_spinlock,flags);
 		info->serial_signals &= ~SerialSignal_RTS;
 	 	usc_set_serial_signals(info);
@@ -2397,7 +2397,7 @@ static void mgsl_unthrottle(struct tty_struct * tty)
 			mgsl_send_xchar(tty, START_CHAR(tty));
 	}
 
-	if (tty->termios.c_cflag & CRTSCTS) {
+	if (C_CRTSCTS(tty)) {
 		spin_lock_irqsave(&info->irq_spinlock,flags);
 		info->serial_signals |= SerialSignal_RTS;
 	 	usc_set_serial_signals(info);
@@ -3039,30 +3039,25 @@ static void mgsl_set_termios(struct tty_struct *tty, struct ktermios *old_termio
 	mgsl_change_params(info);
 
 	/* Handle transition to B0 status */
-	if (old_termios->c_cflag & CBAUD &&
-	    !(tty->termios.c_cflag & CBAUD)) {
+	if ((old_termios->c_cflag & CBAUD) && !C_BAUD(tty)) {
 		info->serial_signals &= ~(SerialSignal_RTS | SerialSignal_DTR);
 		spin_lock_irqsave(&info->irq_spinlock,flags);
 	 	usc_set_serial_signals(info);
 		spin_unlock_irqrestore(&info->irq_spinlock,flags);
 	}
-	
+
 	/* Handle transition away from B0 status */
-	if (!(old_termios->c_cflag & CBAUD) &&
-	    tty->termios.c_cflag & CBAUD) {
+	if (!(old_termios->c_cflag & CBAUD) && C_BAUD(tty)) {
 		info->serial_signals |= SerialSignal_DTR;
- 		if (!(tty->termios.c_cflag & CRTSCTS) || 
- 		    !test_bit(TTY_THROTTLED, &tty->flags)) {
+		if (!C_CRTSCTS(tty) || !test_bit(TTY_THROTTLED, &tty->flags))
 			info->serial_signals |= SerialSignal_RTS;
- 		}
 		spin_lock_irqsave(&info->irq_spinlock,flags);
 	 	usc_set_serial_signals(info);
 		spin_unlock_irqrestore(&info->irq_spinlock,flags);
 	}
-	
+
 	/* Handle turning off CRTSCTS */
-	if (old_termios->c_cflag & CRTSCTS &&
-	    !(tty->termios.c_cflag & CRTSCTS)) {
+	if (old_termios->c_cflag & CRTSCTS && !C_CRTSCTS(tty)) {
 		tty->hw_stopped = 0;
 		mgsl_start(tty);
 	}
@@ -3281,7 +3276,7 @@ static int block_til_ready(struct tty_struct *tty, struct file * filp,
 		return 0;
 	}
 
-	if (tty->termios.c_cflag & CLOCAL)
+	if (C_CLOCAL(tty))
 		do_clocal = true;
 
 	/* Wait for carrier detect and the line to become
diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c
index 5505ea842179..c0a2f5a1b1c2 100644
--- a/drivers/tty/synclink_gt.c
+++ b/drivers/tty/synclink_gt.c
@@ -774,8 +774,7 @@ static void set_termios(struct tty_struct *tty, struct ktermios *old_termios)
 	change_params(info);
 
 	/* Handle transition to B0 status */
-	if (old_termios->c_cflag & CBAUD &&
-	    !(tty->termios.c_cflag & CBAUD)) {
+	if ((old_termios->c_cflag & CBAUD) && !C_BAUD(tty)) {
 		info->signals &= ~(SerialSignal_RTS | SerialSignal_DTR);
 		spin_lock_irqsave(&info->lock,flags);
 		set_signals(info);
@@ -783,21 +782,17 @@ static void set_termios(struct tty_struct *tty, struct ktermios *old_termios)
 	}
 
 	/* Handle transition away from B0 status */
-	if (!(old_termios->c_cflag & CBAUD) &&
-	    tty->termios.c_cflag & CBAUD) {
+	if (!(old_termios->c_cflag & CBAUD) && C_BAUD(tty)) {
 		info->signals |= SerialSignal_DTR;
- 		if (!(tty->termios.c_cflag & CRTSCTS) ||
- 		    !test_bit(TTY_THROTTLED, &tty->flags)) {
+ 		if (!C_CRTSCTS(tty) || !test_bit(TTY_THROTTLED, &tty->flags))
 			info->signals |= SerialSignal_RTS;
- 		}
 		spin_lock_irqsave(&info->lock,flags);
 	 	set_signals(info);
 		spin_unlock_irqrestore(&info->lock,flags);
 	}
 
 	/* Handle turning off CRTSCTS */
-	if (old_termios->c_cflag & CRTSCTS &&
-	    !(tty->termios.c_cflag & CRTSCTS)) {
+	if ((old_termios->c_cflag & CRTSCTS) && !C_CRTSCTS(tty)) {
 		tty->hw_stopped = 0;
 		tx_release(tty);
 	}
@@ -1362,7 +1357,7 @@ static void throttle(struct tty_struct * tty)
 	DBGINFO(("%s throttle\n", info->device_name));
 	if (I_IXOFF(tty))
 		send_xchar(tty, STOP_CHAR(tty));
- 	if (tty->termios.c_cflag & CRTSCTS) {
+ 	if (C_CRTSCTS(tty)) {
 		spin_lock_irqsave(&info->lock,flags);
 		info->signals &= ~SerialSignal_RTS;
 	 	set_signals(info);
@@ -1387,7 +1382,7 @@ static void unthrottle(struct tty_struct * tty)
 		else
 			send_xchar(tty, START_CHAR(tty));
 	}
- 	if (tty->termios.c_cflag & CRTSCTS) {
+ 	if (C_CRTSCTS(tty)) {
 		spin_lock_irqsave(&info->lock,flags);
 		info->signals |= SerialSignal_RTS;
 	 	set_signals(info);
@@ -3280,7 +3275,7 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 		return 0;
 	}
 
-	if (tty->termios.c_cflag & CLOCAL)
+	if (C_CLOCAL(tty))
 		do_clocal = true;
 
 	/* Wait for carrier detect and the line to become
diff --git a/drivers/tty/synclinkmp.c b/drivers/tty/synclinkmp.c
index fb00a06dfa4b..4b314b63a2ed 100644
--- a/drivers/tty/synclinkmp.c
+++ b/drivers/tty/synclinkmp.c
@@ -871,8 +871,7 @@ static void set_termios(struct tty_struct *tty, struct ktermios *old_termios)
 	change_params(info);
 
 	/* Handle transition to B0 status */
-	if (old_termios->c_cflag & CBAUD &&
-	    !(tty->termios.c_cflag & CBAUD)) {
+	if ((old_termios->c_cflag & CBAUD) && !C_BAUD(tty)) {
 		info->serial_signals &= ~(SerialSignal_RTS | SerialSignal_DTR);
 		spin_lock_irqsave(&info->lock,flags);
 	 	set_signals(info);
@@ -880,21 +879,17 @@ static void set_termios(struct tty_struct *tty, struct ktermios *old_termios)
 	}
 
 	/* Handle transition away from B0 status */
-	if (!(old_termios->c_cflag & CBAUD) &&
-	    tty->termios.c_cflag & CBAUD) {
+	if (!(old_termios->c_cflag & CBAUD) && C_BAUD(tty)) {
 		info->serial_signals |= SerialSignal_DTR;
- 		if (!(tty->termios.c_cflag & CRTSCTS) ||
- 		    !test_bit(TTY_THROTTLED, &tty->flags)) {
+ 		if (!C_CRTSCTS(tty) || !test_bit(TTY_THROTTLED, &tty->flags))
 			info->serial_signals |= SerialSignal_RTS;
- 		}
 		spin_lock_irqsave(&info->lock,flags);
 	 	set_signals(info);
 		spin_unlock_irqrestore(&info->lock,flags);
 	}
 
 	/* Handle turning off CRTSCTS */
-	if (old_termios->c_cflag & CRTSCTS &&
-	    !(tty->termios.c_cflag & CRTSCTS)) {
+	if (old_termios->c_cflag & CRTSCTS && !C_CRTSCTS(tty)) {
 		tty->hw_stopped = 0;
 		tx_release(tty);
 	}
@@ -1472,7 +1467,7 @@ static void throttle(struct tty_struct * tty)
 	if (I_IXOFF(tty))
 		send_xchar(tty, STOP_CHAR(tty));
 
- 	if (tty->termios.c_cflag & CRTSCTS) {
+ 	if (C_CRTSCTS(tty)) {
 		spin_lock_irqsave(&info->lock,flags);
 		info->serial_signals &= ~SerialSignal_RTS;
 	 	set_signals(info);
@@ -1501,7 +1496,7 @@ static void unthrottle(struct tty_struct * tty)
 			send_xchar(tty, START_CHAR(tty));
 	}
 
- 	if (tty->termios.c_cflag & CRTSCTS) {
+ 	if (C_CRTSCTS(tty)) {
 		spin_lock_irqsave(&info->lock,flags);
 		info->serial_signals |= SerialSignal_RTS;
 	 	set_signals(info);
@@ -3297,7 +3292,7 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 		return 0;
 	}
 
-	if (tty->termios.c_cflag & CLOCAL)
+	if (C_CLOCAL(tty))
 		do_clocal = true;
 
 	/* Wait for carrier detect and the line to become
diff --git a/drivers/tty/tty_ioctl.c b/drivers/tty/tty_ioctl.c
index 0ea351388724..23bf5bb1d8bf 100644
--- a/drivers/tty/tty_ioctl.c
+++ b/drivers/tty/tty_ioctl.c
@@ -719,16 +719,16 @@ static int get_sgflags(struct tty_struct *tty)
 {
 	int flags = 0;
 
-	if (!(tty->termios.c_lflag & ICANON)) {
-		if (tty->termios.c_lflag & ISIG)
+	if (!L_ICANON(tty)) {
+		if (L_ISIG(tty))
 			flags |= 0x02;		/* cbreak */
 		else
 			flags |= 0x20;		/* raw */
 	}
-	if (tty->termios.c_lflag & ECHO)
+	if (L_ECHO(tty))
 		flags |= 0x08;			/* echo */
-	if (tty->termios.c_oflag & OPOST)
-		if (tty->termios.c_oflag & ONLCR)
+	if (O_OPOST(tty))
+		if (O_ONLCR(tty))
 			flags |= 0x10;		/* crmod */
 	return flags;
 }
@@ -908,7 +908,7 @@ static int tty_change_softcar(struct tty_struct *tty, int arg)
 	tty->termios.c_cflag |= bit;
 	if (tty->ops->set_termios)
 		tty->ops->set_termios(tty, &old);
-	if ((tty->termios.c_cflag & CLOCAL) != bit)
+	if (C_CLOCAL(tty) != bit)
 		ret = -EINVAL;
 	up_write(&tty->termios_rwsem);
 	return ret;
diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c
index a76aec2ca480..0473fc7543f7 100644
--- a/drivers/tty/tty_port.c
+++ b/drivers/tty/tty_port.c
@@ -370,7 +370,7 @@ int tty_port_block_til_ready(struct tty_port *port,
 	}
 	if (filp->f_flags & O_NONBLOCK) {
 		/* Indicate we are open */
-		if (tty->termios.c_cflag & CBAUD)
+		if (C_BAUD(tty))
 			tty_port_raise_dtr_rts(port);
 		port->flags |= ASYNC_NORMAL_ACTIVE;
 		return 0;
diff --git a/drivers/usb/serial/cypress_m8.c b/drivers/usb/serial/cypress_m8.c
index 01bf53392819..b283eb8b86d6 100644
--- a/drivers/usb/serial/cypress_m8.c
+++ b/drivers/usb/serial/cypress_m8.c
@@ -1165,8 +1165,7 @@ static void cypress_read_int_callback(struct urb *urb)
 
 	/* hangup, as defined in acm.c... this might be a bad place for it
 	 * though */
-	if (tty && !(tty->termios.c_cflag & CLOCAL) &&
-			!(priv->current_status & UART_CD)) {
+	if (tty && !C_CLOCAL(tty) && !(priv->current_status & UART_CD)) {
 		dev_dbg(dev, "%s - calling hangup\n", __func__);
 		tty_hangup(tty);
 		goto continue_read;
diff --git a/drivers/usb/serial/digi_acceleport.c b/drivers/usb/serial/digi_acceleport.c
index 12b0e67473ba..010a42a92688 100644
--- a/drivers/usb/serial/digi_acceleport.c
+++ b/drivers/usb/serial/digi_acceleport.c
@@ -695,11 +695,11 @@ static void digi_set_termios(struct tty_struct *tty,
 		arg = -1;
 
 		/* reassert DTR and (maybe) RTS on transition from B0 */
-		if ((old_cflag&CBAUD) == B0) {
+		if ((old_cflag & CBAUD) == B0) {
 			/* don't set RTS if using hardware flow control */
 			/* and throttling input */
 			modem_signals = TIOCM_DTR;
-			if (!(tty->termios.c_cflag & CRTSCTS) ||
+			if (!C_CRTSCTS(tty) ||
 			    !test_bit(TTY_THROTTLED, &tty->flags))
 				modem_signals |= TIOCM_RTS;
 			digi_set_modem_signals(port, modem_signals, 1);
@@ -1491,8 +1491,8 @@ static int digi_read_oob_callback(struct urb *urb)
 
 		rts = 0;
 		if (tty)
-			rts = tty->termios.c_cflag & CRTSCTS;
-		
+			rts = C_CRTSCTS(tty);
+
 		if (tty && opcode == DIGI_CMD_READ_INPUT_SIGNALS) {
 			spin_lock(&priv->dp_port_lock);
 			/* convert from digi flags to termiox flags */
diff --git a/drivers/usb/serial/io_edgeport.c b/drivers/usb/serial/io_edgeport.c
index f49327d20ee8..f3007ecdd1b4 100644
--- a/drivers/usb/serial/io_edgeport.c
+++ b/drivers/usb/serial/io_edgeport.c
@@ -1398,7 +1398,7 @@ static void edge_throttle(struct tty_struct *tty)
 	}
 
 	/* if we are implementing RTS/CTS, toggle that line */
-	if (tty->termios.c_cflag & CRTSCTS) {
+	if (C_CRTSCTS(tty)) {
 		edge_port->shadowMCR &= ~MCR_RTS;
 		status = send_cmd_write_uart_register(edge_port, MCR,
 							edge_port->shadowMCR);
@@ -1435,7 +1435,7 @@ static void edge_unthrottle(struct tty_struct *tty)
 			return;
 	}
 	/* if we are implementing RTS/CTS, toggle that line */
-	if (tty->termios.c_cflag & CRTSCTS) {
+	if (C_CRTSCTS(tty)) {
 		edge_port->shadowMCR |= MCR_RTS;
 		send_cmd_write_uart_register(edge_port, MCR,
 						edge_port->shadowMCR);
diff --git a/drivers/usb/serial/mct_u232.c b/drivers/usb/serial/mct_u232.c
index fd707d6a10e2..4446b8d70ac2 100644
--- a/drivers/usb/serial/mct_u232.c
+++ b/drivers/usb/serial/mct_u232.c
@@ -428,7 +428,7 @@ static int  mct_u232_open(struct tty_struct *tty, struct usb_serial_port *port)
 	 * either.
 	 */
 	spin_lock_irqsave(&priv->lock, flags);
-	if (tty && (tty->termios.c_cflag & CBAUD))
+	if (tty && C_BAUD(tty))
 		priv->control_state = TIOCM_DTR | TIOCM_RTS;
 	else
 		priv->control_state = 0;
diff --git a/drivers/usb/serial/mos7720.c b/drivers/usb/serial/mos7720.c
index 78b4f64c6b00..2eddbe538cda 100644
--- a/drivers/usb/serial/mos7720.c
+++ b/drivers/usb/serial/mos7720.c
@@ -1308,7 +1308,7 @@ static void mos7720_throttle(struct tty_struct *tty)
 	}
 
 	/* if we are implementing RTS/CTS, toggle that line */
-	if (tty->termios.c_cflag & CRTSCTS) {
+	if (C_CRTSCTS(tty)) {
 		mos7720_port->shadowMCR &= ~UART_MCR_RTS;
 		write_mos_reg(port->serial, port->port_number, MOS7720_MCR,
 			      mos7720_port->shadowMCR);
@@ -1338,7 +1338,7 @@ static void mos7720_unthrottle(struct tty_struct *tty)
 	}
 
 	/* if we are implementing RTS/CTS, toggle that line */
-	if (tty->termios.c_cflag & CRTSCTS) {
+	if (C_CRTSCTS(tty)) {
 		mos7720_port->shadowMCR |= UART_MCR_RTS;
 		write_mos_reg(port->serial, port->port_number, MOS7720_MCR,
 			      mos7720_port->shadowMCR);
diff --git a/drivers/usb/serial/mos7840.c b/drivers/usb/serial/mos7840.c
index 2c69bfcdacc6..02ea975754f5 100644
--- a/drivers/usb/serial/mos7840.c
+++ b/drivers/usb/serial/mos7840.c
@@ -1425,7 +1425,7 @@ static void mos7840_throttle(struct tty_struct *tty)
 			return;
 	}
 	/* if we are implementing RTS/CTS, toggle that line */
-	if (tty->termios.c_cflag & CRTSCTS) {
+	if (C_CRTSCTS(tty)) {
 		mos7840_port->shadowMCR &= ~MCR_RTS;
 		status = mos7840_set_uart_reg(port, MODEM_CONTROL_REGISTER,
 					 mos7840_port->shadowMCR);
@@ -1466,7 +1466,7 @@ static void mos7840_unthrottle(struct tty_struct *tty)
 	}
 
 	/* if we are implementing RTS/CTS, toggle that line */
-	if (tty->termios.c_cflag & CRTSCTS) {
+	if (C_CRTSCTS(tty)) {
 		mos7840_port->shadowMCR |= MCR_RTS;
 		status = mos7840_set_uart_reg(port, MODEM_CONTROL_REGISTER,
 					 mos7840_port->shadowMCR);
diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c
index a4237707f79d..1776fe5715e5 100644
--- a/net/irda/ircomm/ircomm_tty.c
+++ b/net/irda/ircomm/ircomm_tty.c
@@ -287,14 +287,14 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self,
 
 	if (filp->f_flags & O_NONBLOCK) {
 		/* nonblock mode is set */
-		if (tty->termios.c_cflag & CBAUD)
+		if (C_BAUD(tty))
 			tty_port_raise_dtr_rts(port);
 		port->flags |= ASYNC_NORMAL_ACTIVE;
 		pr_debug("%s(), O_NONBLOCK requested!\n", __func__);
 		return 0;
 	}
 
-	if (tty->termios.c_cflag & CLOCAL) {
+	if (C_CLOCAL(tty)) {
 		pr_debug("%s(), doing CLOCAL!\n", __func__);
 		do_clocal = 1;
 	}
@@ -806,7 +806,7 @@ static void ircomm_tty_throttle(struct tty_struct *tty)
 		ircomm_tty_send_xchar(tty, STOP_CHAR(tty));
 
 	/* Hardware flow control? */
-	if (tty->termios.c_cflag & CRTSCTS) {
+	if (C_CRTSCTS(tty)) {
 		self->settings.dte &= ~IRCOMM_RTS;
 		self->settings.dte |= IRCOMM_DELTA_RTS;
 
@@ -831,12 +831,11 @@ static void ircomm_tty_unthrottle(struct tty_struct *tty)
 	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
 
 	/* Using software flow control? */
-	if (I_IXOFF(tty)) {
+	if (I_IXOFF(tty))
 		ircomm_tty_send_xchar(tty, START_CHAR(tty));
-	}
 
 	/* Using hardware flow control? */
-	if (tty->termios.c_cflag & CRTSCTS) {
+	if (C_CRTSCTS(tty)) {
 		self->settings.dte |= (IRCOMM_RTS|IRCOMM_DELTA_RTS);
 
 		ircomm_param_request(self, IRCOMM_DTE, TRUE);
diff --git a/net/irda/ircomm/ircomm_tty_ioctl.c b/net/irda/ircomm/ircomm_tty_ioctl.c
index 75ccdbd0728e..d3687aaa23de 100644
--- a/net/irda/ircomm/ircomm_tty_ioctl.c
+++ b/net/irda/ircomm/ircomm_tty_ioctl.c
@@ -158,26 +158,21 @@ void ircomm_tty_set_termios(struct tty_struct *tty,
 	ircomm_tty_change_speed(self, tty);
 
 	/* Handle transition to B0 status */
-	if ((old_termios->c_cflag & CBAUD) &&
-	    !(cflag & CBAUD)) {
+	if ((old_termios->c_cflag & CBAUD) && !(cflag & CBAUD)) {
 		self->settings.dte &= ~(IRCOMM_DTR|IRCOMM_RTS);
 		ircomm_param_request(self, IRCOMM_DTE, TRUE);
 	}
 
 	/* Handle transition away from B0 status */
-	if (!(old_termios->c_cflag & CBAUD) &&
-	    (cflag & CBAUD)) {
+	if (!(old_termios->c_cflag & CBAUD) && (cflag & CBAUD)) {
 		self->settings.dte |= IRCOMM_DTR;
-		if (!(tty->termios.c_cflag & CRTSCTS) ||
-		    !test_bit(TTY_THROTTLED, &tty->flags)) {
+		if (!C_CRTSCTS(tty) || !test_bit(TTY_THROTTLED, &tty->flags))
 			self->settings.dte |= IRCOMM_RTS;
-		}
 		ircomm_param_request(self, IRCOMM_DTE, TRUE);
 	}
 
 	/* Handle turning off CRTSCTS */
-	if ((old_termios->c_cflag & CRTSCTS) &&
-	    !(tty->termios.c_cflag & CRTSCTS))
+	if ((old_termios->c_cflag & CRTSCTS) && !C_CRTSCTS(tty))
 	{
 		tty->hw_stopped = 0;
 		ircomm_tty_start(tty);
-- 
cgit v1.2.3-59-g8ed1b


From afc5ab096581e1ad6e7e7d1533a6bbb1d2b12455 Mon Sep 17 00:00:00 2001
From: Peter Hurley <peter@hurleysoftware.com>
Date: Sun, 10 Jan 2016 14:51:40 -0800
Subject: tty: Remove ASYNC_CLOSING

The tty core no longer provides nor uses ASYNC_CLOSING; remove from
tty_port_close_start() and tty_port_close_end() as well as tty drivers
which open-code these state changes. Unfortunately, even though the
bit is masked from userspace, its inclusion in a uapi header precludes
removing the macro.

Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: linux-s390@vger.kernel.org
Cc: Mikael Starvik <starvik@axis.com>
Cc: Jesper Nilsson <jesper.nilsson@axis.com>
Cc: linux-cris-kernel@axis.com
Cc: Samuel Ortiz <samuel@sortiz.org>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Peter Hurley <peter@hurleysoftware.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/s390/char/con3215.c      | 3 +--
 drivers/tty/rocket.c             | 2 +-
 drivers/tty/serial/68328serial.c | 3 +--
 drivers/tty/serial/crisv10.c     | 3 +--
 drivers/tty/serial/serial_core.c | 1 -
 drivers/tty/tty_port.c           | 3 +--
 net/irda/ircomm/ircomm_tty.c     | 4 ----
 7 files changed, 5 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/drivers/s390/char/con3215.c b/drivers/s390/char/con3215.c
index 7d82bbcb12df..e7e078b3c7e6 100644
--- a/drivers/s390/char/con3215.c
+++ b/drivers/s390/char/con3215.c
@@ -643,7 +643,6 @@ static void raw3215_shutdown(struct raw3215_info *raw)
 	if ((raw->flags & RAW3215_WORKING) ||
 	    raw->queued_write != NULL ||
 	    raw->queued_read != NULL) {
-		raw->port.flags |= ASYNC_CLOSING;
 		add_wait_queue(&raw->empty_wait, &wait);
 		set_current_state(TASK_INTERRUPTIBLE);
 		spin_unlock_irqrestore(get_ccwdev_lock(raw->cdev), flags);
@@ -651,7 +650,7 @@ static void raw3215_shutdown(struct raw3215_info *raw)
 		spin_lock_irqsave(get_ccwdev_lock(raw->cdev), flags);
 		remove_wait_queue(&raw->empty_wait, &wait);
 		set_current_state(TASK_RUNNING);
-		raw->port.flags &= ~(ASYNC_INITIALIZED | ASYNC_CLOSING);
+		raw->port.flags &= ~ASYNC_INITIALIZED;
 	}
 	spin_unlock_irqrestore(get_ccwdev_lock(raw->cdev), flags);
 }
diff --git a/drivers/tty/rocket.c b/drivers/tty/rocket.c
index 2ab3b6fdb675..0b802cdd70d0 100644
--- a/drivers/tty/rocket.c
+++ b/drivers/tty/rocket.c
@@ -1042,7 +1042,7 @@ static void rp_close(struct tty_struct *tty, struct file *filp)
 		}
 	}
 	spin_lock_irq(&port->lock);
-	info->port.flags &= ~(ASYNC_INITIALIZED | ASYNC_CLOSING | ASYNC_NORMAL_ACTIVE);
+	info->port.flags &= ~(ASYNC_INITIALIZED | ASYNC_NORMAL_ACTIVE);
 	tty->closing = 0;
 	spin_unlock_irq(&port->lock);
 	mutex_unlock(&port->mutex);
diff --git a/drivers/tty/serial/68328serial.c b/drivers/tty/serial/68328serial.c
index 4931212e5cf6..9587ed1242fe 100644
--- a/drivers/tty/serial/68328serial.c
+++ b/drivers/tty/serial/68328serial.c
@@ -1028,7 +1028,6 @@ static void rs_close(struct tty_struct *tty, struct file *filp)
 		local_irq_restore(flags);
 		return;
 	}
-	port->flags |= ASYNC_CLOSING;
 	/*
 	 * Now we wait for the transmit buffer to clear; and we notify 
 	 * the line discipline to only process XON/XOFF characters.
@@ -1058,7 +1057,7 @@ static void rs_close(struct tty_struct *tty, struct file *filp)
 			msleep_interruptible(jiffies_to_msecs(port->close_delay));
 		wake_up_interruptible(&port->open_wait);
 	}
-	port->flags &= ~(ASYNC_NORMAL_ACTIVE|ASYNC_CLOSING);
+	port->flags &= ~ASYNC_NORMAL_ACTIVE;
 	local_irq_restore(flags);
 }
 
diff --git a/drivers/tty/serial/crisv10.c b/drivers/tty/serial/crisv10.c
index 37f10c4a3faf..c0172bf54a9b 100644
--- a/drivers/tty/serial/crisv10.c
+++ b/drivers/tty/serial/crisv10.c
@@ -3610,7 +3610,6 @@ rs_close(struct tty_struct *tty, struct file * filp)
 		local_irq_restore(flags);
 		return;
 	}
-	info->port.flags |= ASYNC_CLOSING;
 	/*
 	 * Now we wait for the transmit buffer to clear; and we notify
 	 * the line discipline to only process XON/XOFF characters.
@@ -3649,7 +3648,7 @@ rs_close(struct tty_struct *tty, struct file * filp)
 			schedule_timeout_interruptible(info->port.close_delay);
 		wake_up_interruptible(&info->port.open_wait);
 	}
-	info->port.flags &= ~(ASYNC_NORMAL_ACTIVE|ASYNC_CLOSING);
+	info->port.flags &= ~ASYNC_NORMAL_ACTIVE;
 	local_irq_restore(flags);
 
 	/* port closed */
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index be9359fdf5c7..85829f8568e7 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -1423,7 +1423,6 @@ static void uart_close(struct tty_struct *tty, struct file *filp)
 	 * Wake up anyone trying to open this port.
 	 */
 	clear_bit(ASYNCB_NORMAL_ACTIVE, &port->flags);
-	clear_bit(ASYNCB_CLOSING, &port->flags);
 	spin_unlock_irq(&port->lock);
 	wake_up_interruptible(&port->open_wait);
 
diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c
index 0473fc7543f7..dbcca30a54b1 100644
--- a/drivers/tty/tty_port.c
+++ b/drivers/tty/tty_port.c
@@ -476,7 +476,6 @@ int tty_port_close_start(struct tty_port *port,
 		spin_unlock_irqrestore(&port->lock, flags);
 		return 0;
 	}
-	set_bit(ASYNCB_CLOSING, &port->flags);
 	spin_unlock_irqrestore(&port->lock, flags);
 
 	tty->closing = 1;
@@ -515,7 +514,7 @@ void tty_port_close_end(struct tty_port *port, struct tty_struct *tty)
 		spin_lock_irqsave(&port->lock, flags);
 		wake_up_interruptible(&port->open_wait);
 	}
-	port->flags &= ~(ASYNC_NORMAL_ACTIVE | ASYNC_CLOSING);
+	port->flags &= ~ASYNC_NORMAL_ACTIVE;
 	spin_unlock_irqrestore(&port->lock, flags);
 }
 EXPORT_SYMBOL(tty_port_close_end);
diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c
index 1776fe5715e5..da126ee6d218 100644
--- a/net/irda/ircomm/ircomm_tty.c
+++ b/net/irda/ircomm/ircomm_tty.c
@@ -1267,10 +1267,6 @@ static void ircomm_tty_line_info(struct ircomm_tty_cb *self, struct seq_file *m)
 		seq_printf(m, "%cASYNC_LOW_LATENCY", sep);
 		sep = '|';
 	}
-	if (self->port.flags & ASYNC_CLOSING) {
-		seq_printf(m, "%cASYNC_CLOSING", sep);
-		sep = '|';
-	}
 	if (self->port.flags & ASYNC_NORMAL_ACTIVE) {
 		seq_printf(m, "%cASYNC_NORMAL_ACTIVE", sep);
 		sep = '|';
-- 
cgit v1.2.3-59-g8ed1b


From 6f68b002715460c7552e8688b6f289659d5f8046 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 6 Sep 2015 21:38:49 +0200
Subject: batman-adv: Fix kerneldoc parsing of structs/enums

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/multicast.h | 2 +-
 net/batman-adv/packet.h    | 8 ++++----
 net/batman-adv/types.h     | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h
index 8f3cb04b9f13..685219f3fec9 100644
--- a/net/batman-adv/multicast.h
+++ b/net/batman-adv/multicast.h
@@ -23,7 +23,7 @@
 struct sk_buff;
 
 /**
- * batadv_forw_mode - the way a packet should be forwarded as
+ * enum batadv_forw_mode - the way a packet should be forwarded as
  * @BATADV_FORW_ALL: forward the packet to all nodes (currently via classic
  *  flooding)
  * @BATADV_FORW_SINGLE: forward the packet to a single node (currently via the
diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h
index 0558e3237e0e..e80a8fc9f49e 100644
--- a/net/batman-adv/packet.h
+++ b/net/batman-adv/packet.h
@@ -158,7 +158,7 @@ enum batadv_tt_client_flags {
 };
 
 /**
- * batadv_vlan_flags - flags for the four MSB of any vlan ID field
+ * enum batadv_vlan_flags - flags for the four MSB of any vlan ID field
  * @BATADV_VLAN_HAS_TAG: whether the field contains a valid vlan tag or not
  */
 enum batadv_vlan_flags {
@@ -230,7 +230,7 @@ struct batadv_ogm_packet {
 #define BATADV_OGM_HLEN sizeof(struct batadv_ogm_packet)
 
 /**
- * batadv_icmp_header - common members among all the ICMP packets
+ * struct batadv_icmp_header - common members among all the ICMP packets
  * @packet_type: batman-adv packet type, part of the general header
  * @version: batman-adv protocol version, part of the genereal header
  * @ttl: time to live for this packet, part of the genereal header
@@ -256,7 +256,7 @@ struct batadv_icmp_header {
 };
 
 /**
- * batadv_icmp_packet - ICMP packet
+ * struct batadv_icmp_packet - ICMP packet
  * @packet_type: batman-adv packet type, part of the general header
  * @version: batman-adv protocol version, part of the genereal header
  * @ttl: time to live for this packet, part of the genereal header
@@ -282,7 +282,7 @@ struct batadv_icmp_packet {
 #define BATADV_RR_LEN 16
 
 /**
- * batadv_icmp_packet_rr - ICMP RouteRecord packet
+ * struct batadv_icmp_packet_rr - ICMP RouteRecord packet
  * @packet_type: batman-adv packet type, part of the general header
  * @version: batman-adv protocol version, part of the genereal header
  * @ttl: time to live for this packet, part of the genereal header
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 3437b667a2cd..8c3dda36d9ec 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -346,7 +346,7 @@ struct batadv_gw_node {
 };
 
 /**
- * batadv_hardif_neigh_node - unique neighbor per hard interface
+ * struct batadv_hardif_neigh_node - unique neighbor per hard interface
  * @list: list node for batadv_hard_iface::neigh_list
  * @addr: the MAC address of the neighboring interface
  * @if_incoming: pointer to incoming hard interface
-- 
cgit v1.2.3-59-g8ed1b


From 42ea22fe2c5da2ebd7123518a1b2cd7597ebffd7 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 6 Sep 2015 21:38:50 +0200
Subject: batman-adv: Remove kerneldoc for missing parameters

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/fragmentation.c     | 1 -
 net/batman-adv/packet.h            | 1 -
 net/batman-adv/routing.c           | 1 -
 net/batman-adv/translation-table.c | 4 ----
 4 files changed, 7 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index 20d9282f895b..fedddbe0cd28 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -242,7 +242,6 @@ err:
 /**
  * batadv_frag_merge_packets - merge a chain of fragments
  * @chain: head of chain with fragments
- * @skb: packet with total size of skb after merging
  *
  * Expand the first skb in the chain and copy the content of the remaining
  * skb's into the expanded one. After doing so, clear the chain.
diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h
index e80a8fc9f49e..6d5b17841ff5 100644
--- a/net/batman-adv/packet.h
+++ b/net/batman-adv/packet.h
@@ -413,7 +413,6 @@ struct batadv_bcast_packet {
  * @packet_type: batman-adv packet type, part of the general header
  * @version: batman-adv protocol version, part of the genereal header
  * @ttl: time to live for this packet, part of the genereal header
- * @reserved: Align following fields to 2-byte boundaries
  * @first_source: original source of first included packet
  * @first_orig_dest: original destinal of first included packet
  * @first_crc: checksum of first included packet
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index e4f2646d9246..54d660806c2a 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -904,7 +904,6 @@ rx_success:
  * batadv_recv_unicast_tvlv - receive and process unicast tvlv packets
  * @skb: unicast tvlv packet to process
  * @recv_if: pointer to interface this packet was received on
- * @dst_addr: the payload destination
  *
  * Returns NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP
  * otherwise.
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index cdfc85fa2743..3ccede59c842 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -217,7 +217,6 @@ batadv_tt_global_entry_free_ref(struct batadv_tt_global_entry *tt_global_entry)
 
 /**
  * batadv_tt_global_hash_count - count the number of orig entries
- * @hash: hash table containing the tt entries
  * @addr: the mac address of the client to count entries for
  * @vid: VLAN identifier
  *
@@ -288,7 +287,6 @@ static void batadv_tt_local_size_dec(struct batadv_priv *bat_priv,
 /**
  * batadv_tt_global_size_mod - change the size by v of the local table
  *  identified by vid
- * @bat_priv: the bat priv with all the soft interface information
  * @vid: the VLAN identifier
  * @v: the amount to sum to the global table size
  */
@@ -2406,7 +2404,6 @@ static void batadv_tt_tvlv_generate(struct batadv_priv *bat_priv,
  * @orig_node: originator for which the CRCs have to be checked
  * @tt_vlan: pointer to the first tvlv VLAN entry
  * @num_vlan: number of tvlv VLAN entries
- * @create: if true, create VLAN objects if not found
  *
  * Return true if all the received CRCs match the locally stored ones, false
  * otherwise
@@ -3375,7 +3372,6 @@ out:
  * @tt_change: pointer to the first entry in the TT buffer
  * @tt_num_changes: number of tt changes inside the tt buffer
  * @ttvn: translation table version number of this changeset
- * @tt_crc: crc32 checksum of orig node's translation table
  */
 static void batadv_tt_update_orig(struct batadv_priv *bat_priv,
 				  struct batadv_orig_node *orig_node,
-- 
cgit v1.2.3-59-g8ed1b


From e51f0397ded5bcbc2dd1694bc3d6f24cb2d42fa5 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 6 Sep 2015 21:38:51 +0200
Subject: batman-adv: Fix names in kerneldoc of functions

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/originator.c        | 4 ++--
 net/batman-adv/packet.h            | 2 +-
 net/batman-adv/send.c              | 2 +-
 net/batman-adv/soft-interface.c    | 2 +-
 net/batman-adv/translation-table.c | 4 ++--
 5 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index fe578f75c391..dfeebc89ed9a 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -375,7 +375,7 @@ out:
 
 /**
  * batadv_neigh_ifinfo_get - find the ifinfo from an neigh_node
- * @neigh_node: the neigh node to be queried
+ * @neigh: the neigh node to be queried
  * @if_outgoing: the interface for which the ifinfo should be acquired
  *
  * The object is returned with refcounter increased by 1.
@@ -408,7 +408,7 @@ batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh,
 
 /**
  * batadv_neigh_ifinfo_new - search and possibly create an neigh_ifinfo object
- * @neigh_node: the neigh node to be queried
+ * @neigh: the neigh node to be queried
  * @if_outgoing: the interface for which the ifinfo should be acquired
  *
  * Returns NULL in case of failure or the neigh_ifinfo object for the
diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h
index 6d5b17841ff5..72fd5b0a4183 100644
--- a/net/batman-adv/packet.h
+++ b/net/batman-adv/packet.h
@@ -494,7 +494,7 @@ struct batadv_tvlv_gateway_data {
  * struct batadv_tvlv_tt_data - tt data propagated through the tt tvlv container
  * @flags: translation table flags (see batadv_tt_data_flags)
  * @ttvn: translation table version number
- * @vlan_num: number of announced VLANs. In the TVLV this struct is followed by
+ * @num_vlan: number of announced VLANs. In the TVLV this struct is followed by
  *  one batadv_tvlv_tt_vlan_data object per announced vlan
  */
 struct batadv_tvlv_tt_data {
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index 782fa33ec296..3fbf7910c4d1 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -211,7 +211,7 @@ static bool batadv_send_skb_prepare_unicast(struct sk_buff *skb,
  *  unicast 4addr header
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: the skb containing the payload to encapsulate
- * @orig_node: the destination node
+ * @orig: the destination node
  * @packet_subtype: the unicast 4addr packet subtype to use
  *
  * Returns false if the payload could not be encapsulated or true otherwise.
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index ac4d08de5df4..c923e3c7b6ac 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -480,7 +480,7 @@ out:
 /**
  * batadv_softif_vlan_free_ref - decrease the vlan object refcounter and
  *  possibly free it
- * @softif_vlan: the vlan object to release
+ * @vlan: the vlan object to release
  */
 void batadv_softif_vlan_free_ref(struct batadv_softif_vlan *vlan)
 {
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 3ccede59c842..7d8fc90c9e38 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -3366,8 +3366,8 @@ out:
  * batadv_tt_update_orig - update global translation table with new tt
  *  information received via ogms
  * @bat_priv: the bat priv with all the soft interface information
- * @orig: the orig_node of the ogm
- * @tt_vlan: pointer to the first tvlv VLAN entry
+ * @orig_node: the orig_node of the ogm
+ * @tt_buff: pointer to the first tvlv VLAN entry
  * @tt_num_vlan: number of tvlv VLAN entries
  * @tt_change: pointer to the first entry in the TT buffer
  * @tt_num_changes: number of tt changes inside the tt buffer
-- 
cgit v1.2.3-59-g8ed1b


From f34ac9d4ec1fcb02fc70aad8d1ce0c0641a90b12 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 6 Sep 2015 21:38:53 +0200
Subject: batman-adv: Drop invalid kerneldoc for variable batadv_vlan_attrs

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/sysfs.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index fe87777fda8a..33772d9bd613 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -620,9 +620,7 @@ static struct batadv_attribute *batadv_mesh_attrs[] = {
 
 BATADV_ATTR_VLAN_BOOL(ap_isolation, S_IRUGO | S_IWUSR, NULL);
 
-/**
- * batadv_vlan_attrs - array of vlan specific sysfs attributes
- */
+/* array of vlan specific sysfs attributes */
 static struct batadv_attribute *batadv_vlan_attrs[] = {
 	&batadv_attr_vlan_ap_isolation,
 	NULL,
-- 
cgit v1.2.3-59-g8ed1b


From d1f6825067b9c167b3853c9af67e68d5412c2c63 Mon Sep 17 00:00:00 2001
From: Marek Lindner <mareklindner@neomailbox.ch>
Date: Sat, 15 Aug 2015 18:04:59 +0800
Subject: batman-adv: kernel doc readability updates

Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/types.h | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 8c3dda36d9ec..7f7a0f1eaf75 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -73,7 +73,7 @@ enum batadv_dhcp_recipient {
 #define BATADV_TT_SYNC_MASK	0x00F0
 
 /**
- * struct batadv_hard_iface_bat_iv - per hard interface B.A.T.M.A.N. IV data
+ * struct batadv_hard_iface_bat_iv - per hard-interface B.A.T.M.A.N. IV data
  * @ogm_buff: buffer holding the OGM packet
  * @ogm_buff_len: length of the OGM packet buffer
  * @ogm_seqno: OGM sequence number - used to identify each OGM
@@ -97,8 +97,8 @@ struct batadv_hard_iface_bat_iv {
  *  batman-adv for this interface
  * @soft_iface: the batman-adv interface which uses this network interface
  * @rcu: struct used for freeing in an RCU-safe manner
- * @bat_iv: BATMAN IV specific per hard interface data
- * @cleanup_work: work queue callback item for hard interface deinit
+ * @bat_iv: per hard-interface B.A.T.M.A.N. IV data
+ * @cleanup_work: work queue callback item for hard-interface deinit
  * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs
  * @neigh_list: list of unique single hop neighbors via this interface
  * @neigh_list_lock: lock protecting neigh_list
@@ -125,7 +125,7 @@ struct batadv_hard_iface {
 /**
  * struct batadv_orig_ifinfo - originator info per outgoing interface
  * @list: list node for orig_node::ifinfo_list
- * @if_outgoing: pointer to outgoing hard interface
+ * @if_outgoing: pointer to outgoing hard-interface
  * @router: router that should be used to reach this originator
  * @last_real_seqno: last and best known sequence number
  * @last_ttl: ttl of last received packet
@@ -202,7 +202,7 @@ struct batadv_orig_node_vlan {
 
 /**
  * struct batadv_orig_bat_iv - B.A.T.M.A.N. IV private orig_node members
- * @bcast_own: set of bitfields (one per hard interface) where each one counts
+ * @bcast_own: set of bitfields (one per hard-interface) where each one counts
  * the number of our OGMs this orig_node rebroadcasted "back" to us  (relative
  * to last_real_seqno). Every bitfield is BATADV_TQ_LOCAL_WINDOW_SIZE bits long.
  * @bcast_own_sum: sum of bcast_own
@@ -346,10 +346,11 @@ struct batadv_gw_node {
 };
 
 /**
- * struct batadv_hardif_neigh_node - unique neighbor per hard interface
+ * struct batadv_hardif_neigh_node - unique neighbor per hard-interface
  * @list: list node for batadv_hard_iface::neigh_list
  * @addr: the MAC address of the neighboring interface
- * @if_incoming: pointer to incoming hard interface
+ * @if_incoming: pointer to incoming hard-interface
+ * @last_seen: when last packet via this neighbor was received
  * @refcount: number of contexts the object is used
  * @rcu: struct used for freeing in a RCU-safe manner
  */
@@ -369,7 +370,7 @@ struct batadv_hardif_neigh_node {
  * @addr: the MAC address of the neighboring interface
  * @ifinfo_list: list for routing metrics per outgoing interface
  * @ifinfo_lock: lock protecting private ifinfo members and list
- * @if_incoming: pointer to incoming hard interface
+ * @if_incoming: pointer to incoming hard-interface
  * @last_seen: when last packet via this neighbor was received
  * @refcount: number of contexts the object is used
  * @rcu: struct used for freeing in an RCU-safe manner
@@ -388,7 +389,7 @@ struct batadv_neigh_node {
 
 /**
  * struct batadv_neigh_ifinfo_bat_iv - neighbor information per outgoing
- *  interface for BATMAN IV
+ *  interface for B.A.T.M.A.N. IV
  * @tq_recv: ring buffer of received TQ values from this neigh node
  * @tq_index: ring buffer index
  * @tq_avg: averaged tq of all tq values in the ring buffer (tq_recv)
@@ -407,7 +408,7 @@ struct batadv_neigh_ifinfo_bat_iv {
 /**
  * struct batadv_neigh_ifinfo - neighbor information per outgoing interface
  * @list: list node for batadv_neigh_node::ifinfo_list
- * @if_outgoing: pointer to outgoing hard interface
+ * @if_outgoing: pointer to outgoing hard-interface
  * @bat_iv: B.A.T.M.A.N. IV private structure
  * @last_ttl: last received ttl from this neigh node
  * @refcount: number of contexts the object is used
@@ -783,8 +784,8 @@ struct batadv_softif_vlan {
  * @forw_bat_list_lock: lock protecting forw_bat_list
  * @forw_bcast_list_lock: lock protecting forw_bcast_list
  * @orig_work: work queue callback item for orig node purging
- * @cleanup_work: work queue callback item for soft interface deinit
- * @primary_if: one of the hard interfaces assigned to this mesh interface
+ * @cleanup_work: work queue callback item for soft-interface deinit
+ * @primary_if: one of the hard-interfaces assigned to this mesh interface
  *  becomes the primary interface
  * @bat_algo_ops: routing algorithm used by this mesh interface
  * @softif_vlan_list: a list of softif_vlan structs, one per VLAN created on top
-- 
cgit v1.2.3-59-g8ed1b


From 62fe710f6e2fdf4c337763bc0774cb3232fa2139 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Tue, 15 Sep 2015 19:00:48 +0200
Subject: batman-adv: Fix kerneldoc parsing of return description

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/bat_iv_ogm.c            | 18 +++----
 net/batman-adv/bitarray.c              |  2 +-
 net/batman-adv/bitarray.h              | 11 +++--
 net/batman-adv/bridge_loop_avoidance.c | 68 ++++++++++++++++++--------
 net/batman-adv/distributed-arp-table.c | 37 +++++++-------
 net/batman-adv/fragmentation.c         | 17 ++++---
 net/batman-adv/fragmentation.h         |  2 +-
 net/batman-adv/gateway_client.c        | 14 +++---
 net/batman-adv/gateway_common.c        |  2 +-
 net/batman-adv/hard-interface.c        |  7 +--
 net/batman-adv/hash.h                  | 22 +++++----
 net/batman-adv/main.c                  | 26 +++++-----
 net/batman-adv/main.h                  | 13 +++--
 net/batman-adv/multicast.c             | 35 +++++++-------
 net/batman-adv/network-coding.c        | 42 ++++++++--------
 net/batman-adv/originator.c            | 48 +++++++++++--------
 net/batman-adv/routing.c               | 30 +++++++-----
 net/batman-adv/send.c                  | 24 +++++-----
 net/batman-adv/send.h                  |  4 +-
 net/batman-adv/soft-interface.c        | 14 +++---
 net/batman-adv/sysfs.c                 | 10 ++--
 net/batman-adv/translation-table.c     | 88 ++++++++++++++++++++--------------
 22 files changed, 305 insertions(+), 229 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index df625de55ef2..a6d389b0563c 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -88,7 +88,7 @@ static void batadv_ring_buffer_set(u8 lq_recv[], u8 *lq_index, u8 value)
  * in the given ring buffer
  * @lq_recv: pointer to the ring buffer
  *
- * Returns computed average value.
+ * Return: computed average value.
  */
 static u8 batadv_ring_buffer_avg(const u8 lq_recv[])
 {
@@ -132,7 +132,7 @@ static void batadv_iv_ogm_orig_free(struct batadv_orig_node *orig_node)
  * @orig_node: the orig_node that has to be changed
  * @max_if_num: the current amount of interfaces
  *
- * Returns 0 on success, a negative error code otherwise.
+ * Return: 0 on success, a negative error code otherwise.
  */
 static int batadv_iv_ogm_orig_add_if(struct batadv_orig_node *orig_node,
 				     int max_if_num)
@@ -180,7 +180,7 @@ unlock:
  * @max_if_num: the current amount of interfaces
  * @del_if_num: the index of the interface being removed
  *
- * Returns 0 on success, a negative error code otherwise.
+ * Return: 0 on success, a negative error code otherwise.
  */
 static int batadv_iv_ogm_orig_del_if(struct batadv_orig_node *orig_node,
 				     int max_if_num, int del_if_num)
@@ -246,7 +246,7 @@ unlock:
  * @bat_priv: the bat priv with all the soft interface information
  * @addr: mac address of the originator
  *
- * Returns the originator object corresponding to the passed mac address or NULL
+ * Return: the originator object corresponding to the passed mac address or NULL
  * on failure.
  * If the object does not exists it is created an initialised.
  */
@@ -522,7 +522,7 @@ out:
  * @if_outgoing: interface for which the retransmission should be considered
  * @forw_packet: the forwarded packet which should be checked
  *
- * Returns true if new_packet can be aggregated with forw_packet
+ * Return: true if new_packet can be aggregated with forw_packet
  */
 static bool
 batadv_iv_ogm_can_aggregate(const struct batadv_ogm_packet *new_bat_ogm_packet,
@@ -1125,7 +1125,7 @@ out:
  * @if_incoming: interface where the packet was received
  * @if_outgoing: interface for which the retransmission should be considered
  *
- * Returns 1 if the link can be considered bidirectional, 0 otherwise
+ * Return: 1 if the link can be considered bidirectional, 0 otherwise
  */
 static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
 				 struct batadv_orig_node *orig_neigh_node,
@@ -1269,7 +1269,7 @@ out:
  * @if_incoming: interface on which the OGM packet was received
  * @if_outgoing: interface for which the retransmission should be considered
  *
- * Returns duplicate status as enum batadv_dup_status
+ * Return: duplicate status as enum batadv_dup_status
  */
 static enum batadv_dup_status
 batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr,
@@ -1929,7 +1929,7 @@ static void batadv_iv_neigh_print(struct batadv_priv *bat_priv,
  * @neigh2: the second neighbor object of the comparison
  * @if_outgoing2: outgoing interface for the second neighbor
  *
- * Returns a value less, equal to or greater than 0 if the metric via neigh1 is
+ * Return: a value less, equal to or greater than 0 if the metric via neigh1 is
  * lower, the same as or higher than the metric via neigh2
  */
 static int batadv_iv_ogm_neigh_cmp(struct batadv_neigh_node *neigh1,
@@ -1970,7 +1970,7 @@ out:
  * @neigh2: the second neighbor object of the comparison
  * @if_outgoing2: outgoing interface for the second neighbor
  *
- * Returns true if the metric via neigh1 is equally good or better than
+ * Return: true if the metric via neigh1 is equally good or better than
  * the metric via neigh2, false otherwise.
  */
 static bool
diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c
index 25cbc36e997a..06bab5179bf9 100644
--- a/net/batman-adv/bitarray.c
+++ b/net/batman-adv/bitarray.c
@@ -31,7 +31,7 @@ static void batadv_bitmap_shift_left(unsigned long *seq_bits, s32 n)
 
 /* receive and process one packet within the sequence number window.
  *
- * returns:
+ * Return:
  *  1 if the window was moved (either new or very old)
  *  0 if the window was not moved/shifted.
  */
diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h
index 0226b220fe5b..cf2aeb0831a4 100644
--- a/net/batman-adv/bitarray.h
+++ b/net/batman-adv/bitarray.h
@@ -24,7 +24,10 @@
 #include <linux/compiler.h>
 #include <linux/types.h>
 
-/* Returns 1 if the corresponding bit in the given seq_bits indicates true
+/**
+ * batadv_test_bit
+ *
+ * Return: 1 if the corresponding bit in the given seq_bits indicates true
  * and curr_seqno is within range of last_seqno. Otherwise returns 0.
  */
 static inline int batadv_test_bit(const unsigned long *seq_bits,
@@ -48,8 +51,10 @@ static inline void batadv_set_bit(unsigned long *seq_bits, s32 n)
 	set_bit(n, seq_bits); /* turn the position on */
 }
 
-/* receive and process one packet, returns 1 if received seq_num is considered
- * new, 0 if old
+/**
+ * batadv_bit_get_packet - receive and process one packet
+ *
+ * Return: 1 if received seq_num is considered new, 0 if old
  */
 int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff,
 			  int set_mark);
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index c24c481b666f..90c79948979c 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -58,7 +58,11 @@ static void
 batadv_bla_send_announce(struct batadv_priv *bat_priv,
 			 struct batadv_bla_backbone_gw *backbone_gw);
 
-/* return the index of the claim */
+/**
+ * batadv_choose_claim
+ *
+ * Return: the index of the claim
+ */
 static inline u32 batadv_choose_claim(const void *data, u32 size)
 {
 	struct batadv_bla_claim *claim = (struct batadv_bla_claim *)data;
@@ -70,7 +74,11 @@ static inline u32 batadv_choose_claim(const void *data, u32 size)
 	return hash % size;
 }
 
-/* return the index of the backbone gateway */
+/**
+ * batadv_choose_backbone_gw
+ *
+ * Return: the index of the backbone gateway
+ */
 static inline u32 batadv_choose_backbone_gw(const void *data, u32 size)
 {
 	const struct batadv_bla_claim *claim = (struct batadv_bla_claim *)data;
@@ -145,8 +153,9 @@ static void batadv_claim_free_ref(struct batadv_bla_claim *claim)
  * @bat_priv: the bat priv with all the soft interface information
  * @data: search data (may be local/static data)
  *
- * looks for a claim in the hash, and returns it if found
- * or NULL otherwise.
+ * looks for a claim in the hash
+ *
+ * Return: claim if found or NULL otherwise.
  */
 static struct batadv_bla_claim
 *batadv_claim_hash_find(struct batadv_priv *bat_priv,
@@ -186,7 +195,7 @@ static struct batadv_bla_claim
  * @addr: the address of the originator
  * @vid: the VLAN ID
  *
- * Returns claim if found or NULL otherwise.
+ * Return: claim if found or NULL otherwise.
  */
 static struct batadv_bla_backbone_gw *
 batadv_backbone_hash_find(struct batadv_priv *bat_priv, u8 *addr,
@@ -666,7 +675,11 @@ static void batadv_bla_del_claim(struct batadv_priv *bat_priv,
 	batadv_claim_free_ref(claim);
 }
 
-/* check for ANNOUNCE frame, return 1 if handled */
+/**
+ * batadv_handle_announce - check for ANNOUNCE frame
+ *
+ * Return: 1 if handled
+ */
 static int batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr,
 				  u8 *backbone_addr, unsigned short vid)
 {
@@ -716,7 +729,11 @@ static int batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr,
 	return 1;
 }
 
-/* check for REQUEST frame, return 1 if handled */
+/**
+ * batadv_handle_request - check for REQUEST frame
+ *
+ * Return: 1 if handled
+ */
 static int batadv_handle_request(struct batadv_priv *bat_priv,
 				 struct batadv_hard_iface *primary_if,
 				 u8 *backbone_addr, struct ethhdr *ethhdr,
@@ -740,7 +757,11 @@ static int batadv_handle_request(struct batadv_priv *bat_priv,
 	return 1;
 }
 
-/* check for UNCLAIM frame, return 1 if handled */
+/**
+ * batadv_handle_unclaim - check for UNCLAIM frame
+ *
+ * Return: 1 if handled
+ */
 static int batadv_handle_unclaim(struct batadv_priv *bat_priv,
 				 struct batadv_hard_iface *primary_if,
 				 u8 *backbone_addr, u8 *claim_addr,
@@ -769,7 +790,11 @@ static int batadv_handle_unclaim(struct batadv_priv *bat_priv,
 	return 1;
 }
 
-/* check for CLAIM frame, return 1 if handled */
+/**
+ * batadv_handle_claim - check for CLAIM frame
+ *
+ * Return: 1 if handled
+ */
 static int batadv_handle_claim(struct batadv_priv *bat_priv,
 			       struct batadv_hard_iface *primary_if,
 			       u8 *backbone_addr, u8 *claim_addr,
@@ -809,7 +834,7 @@ static int batadv_handle_claim(struct batadv_priv *bat_priv,
  * This function also applies the group ID of the sender
  * if it is in the same mesh.
  *
- * returns:
+ * Return:
  *	2  - if it is a claim packet and on the same group
  *	1  - if is a claim packet from another group
  *	0  - if it is not a claim packet
@@ -880,7 +905,7 @@ static int batadv_check_claim_group(struct batadv_priv *bat_priv,
  *
  * Check if this is a claim frame, and process it accordingly.
  *
- * returns 1 if it was a claim frame, otherwise return 0 to
+ * Return: 1 if it was a claim frame, otherwise return 0 to
  * tell the callee that it can use the frame on its own.
  */
 static int batadv_bla_process_claim(struct batadv_priv *bat_priv,
@@ -1397,7 +1422,7 @@ out:
  *
  * Check if the originator is a gateway for the VLAN identified by vid.
  *
- * Returns true if orig is a backbone for this vid, false otherwise.
+ * Return: true if orig is a backbone for this vid, false otherwise.
  */
 bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig,
 				    unsigned short vid)
@@ -1436,9 +1461,10 @@ bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig,
  * @orig_node: the orig_node of the frame
  * @hdr_size: maximum length of the frame
  *
- * bla_is_backbone_gw inspects the skb for the VLAN ID and returns 1
- * if the orig_node is also a gateway on the soft interface, otherwise it
- * returns 0.
+ * bla_is_backbone_gw inspects the skb for the VLAN ID
+ *
+ * Return: 1 if the orig_node is also a gateway on the soft interface, otherwise
+ * it returns 0.
  */
 int batadv_bla_is_backbone_gw(struct sk_buff *skb,
 			      struct batadv_orig_node *orig_node, int hdr_size)
@@ -1498,8 +1524,9 @@ void batadv_bla_free(struct batadv_priv *bat_priv)
  *  * we have to race for a claim
  *  * if the frame is allowed on the LAN
  *
- * in these cases, the skb is further handled by this function and
- * returns 1, otherwise it returns 0 and the caller shall further
+ * in these cases, the skb is further handled by this function
+ *
+ * Return: 1 if handled, otherwise it returns 0 and the caller shall further
  * process the skb.
  */
 int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb,
@@ -1592,11 +1619,12 @@ out:
  *  * a claim was received which has to be processed
  *  * the frame is allowed on the mesh
  *
- * in these cases, the skb is further handled by this function and
- * returns 1, otherwise it returns 0 and the caller shall further
- * process the skb.
+ * in these cases, the skb is further handled by this function.
  *
  * This call might reallocate skb data.
+ *
+ * Return: 1 if handled, otherwise it returns 0 and the caller shall further
+ * process the skb.
  */
 int batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb,
 		  unsigned short vid)
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index a49c705fb86b..2f13429ce5d3 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -76,7 +76,7 @@ static void batadv_dat_entry_free_ref(struct batadv_dat_entry *dat_entry)
  * batadv_dat_to_purge - check whether a dat_entry has to be purged or not
  * @dat_entry: the entry to check
  *
- * Returns true if the entry has to be purged now, false otherwise.
+ * Return: true if the entry has to be purged now, false otherwise.
  */
 static bool batadv_dat_to_purge(struct batadv_dat_entry *dat_entry)
 {
@@ -151,7 +151,7 @@ static void batadv_dat_purge(struct work_struct *work)
  * @node: node in the local table
  * @data2: second object to compare the node to
  *
- * Returns 1 if the two entries are the same, 0 otherwise.
+ * Return: 1 if the two entries are the same, 0 otherwise.
  */
 static int batadv_compare_dat(const struct hlist_node *node, const void *data2)
 {
@@ -166,7 +166,7 @@ static int batadv_compare_dat(const struct hlist_node *node, const void *data2)
  * @skb: ARP packet
  * @hdr_size: size of the possible header before the ARP packet
  *
- * Returns the value of the hw_src field in the ARP packet.
+ * Return: the value of the hw_src field in the ARP packet.
  */
 static u8 *batadv_arp_hw_src(struct sk_buff *skb, int hdr_size)
 {
@@ -183,7 +183,7 @@ static u8 *batadv_arp_hw_src(struct sk_buff *skb, int hdr_size)
  * @skb: ARP packet
  * @hdr_size: size of the possible header before the ARP packet
  *
- * Returns the value of the ip_src field in the ARP packet.
+ * Return: the value of the ip_src field in the ARP packet.
  */
 static __be32 batadv_arp_ip_src(struct sk_buff *skb, int hdr_size)
 {
@@ -195,7 +195,7 @@ static __be32 batadv_arp_ip_src(struct sk_buff *skb, int hdr_size)
  * @skb: ARP packet
  * @hdr_size: size of the possible header before the ARP packet
  *
- * Returns the value of the hw_dst field in the ARP packet.
+ * Return: the value of the hw_dst field in the ARP packet.
  */
 static u8 *batadv_arp_hw_dst(struct sk_buff *skb, int hdr_size)
 {
@@ -207,7 +207,7 @@ static u8 *batadv_arp_hw_dst(struct sk_buff *skb, int hdr_size)
  * @skb: ARP packet
  * @hdr_size: size of the possible header before the ARP packet
  *
- * Returns the value of the ip_dst field in the ARP packet.
+ * Return: the value of the ip_dst field in the ARP packet.
  */
 static __be32 batadv_arp_ip_dst(struct sk_buff *skb, int hdr_size)
 {
@@ -219,7 +219,7 @@ static __be32 batadv_arp_ip_dst(struct sk_buff *skb, int hdr_size)
  * @data: data to hash
  * @size: size of the hash table
  *
- * Returns the selected index in the hash table for the given data.
+ * Return: the selected index in the hash table for the given data.
  */
 static u32 batadv_hash_dat(const void *data, u32 size)
 {
@@ -256,7 +256,7 @@ static u32 batadv_hash_dat(const void *data, u32 size)
  * @ip: search key
  * @vid: VLAN identifier
  *
- * Returns the dat_entry if found, NULL otherwise.
+ * Return: the dat_entry if found, NULL otherwise.
  */
 static struct batadv_dat_entry *
 batadv_dat_entry_hash_find(struct batadv_priv *bat_priv, __be32 ip,
@@ -440,7 +440,7 @@ static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb,
  * @candidate: orig_node under evaluation
  * @max_orig_node: last selected candidate
  *
- * Returns true if the node has been elected as next candidate or false
+ * Return: true if the node has been elected as next candidate or false
  * otherwise.
  */
 static bool batadv_is_orig_node_eligible(struct batadv_dat_candidate *res,
@@ -558,7 +558,7 @@ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv,
  * closest values (from the LEFT, with wrap around if needed) then the hash
  * value of the key. ip_dst is the key.
  *
- * Returns the candidate array of size BATADV_DAT_CANDIDATE_NUM.
+ * Return: the candidate array of size BATADV_DAT_CANDIDATE_NUM.
  */
 static struct batadv_dat_candidate *
 batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst)
@@ -602,7 +602,7 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst)
  * This function copies the skb with pskb_copy() and is sent as unicast packet
  * to each of the selected candidates.
  *
- * Returns true if the packet is sent to at least one candidate, false
+ * Return: true if the packet is sent to at least one candidate, false
  * otherwise.
  */
 static bool batadv_dat_send_data(struct batadv_priv *bat_priv,
@@ -831,7 +831,7 @@ out:
  * @skb: packet to analyse
  * @hdr_size: size of the possible header before the ARP packet in the skb
  *
- * Returns the ARP type if the skb contains a valid ARP packet, 0 otherwise.
+ * Return: the ARP type if the skb contains a valid ARP packet, 0 otherwise.
  */
 static u16 batadv_arp_get_type(struct batadv_priv *bat_priv,
 			       struct sk_buff *skb, int hdr_size)
@@ -904,8 +904,9 @@ out:
  * @skb: the buffer containing the packet to extract the VID from
  * @hdr_size: the size of the batman-adv header encapsulating the packet
  *
- * If the packet embedded in the skb is vlan tagged this function returns the
- * VID with the BATADV_VLAN_HAS_TAG flag. Otherwise BATADV_NO_FLAGS is returned.
+ * Return: If the packet embedded in the skb is vlan tagged this function
+ * returns the VID with the BATADV_VLAN_HAS_TAG flag. Otherwise BATADV_NO_FLAGS
+ * is returned.
  */
 static unsigned short batadv_dat_get_vid(struct sk_buff *skb, int *hdr_size)
 {
@@ -930,7 +931,7 @@ static unsigned short batadv_dat_get_vid(struct sk_buff *skb, int *hdr_size)
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: packet to check
  *
- * Returns true if the message has been sent to the dht candidates, false
+ * Return: true if the message has been sent to the dht candidates, false
  * otherwise. In case of a positive return value the message has to be enqueued
  * to permit the fallback.
  */
@@ -1020,7 +1021,7 @@ out:
  * @skb: packet to check
  * @hdr_size: size of the encapsulation header
  *
- * Returns true if the request has been answered, false otherwise.
+ * Return: true if the request has been answered, false otherwise.
  */
 bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv,
 					   struct sk_buff *skb, int hdr_size)
@@ -1143,7 +1144,7 @@ void batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv,
  * @skb: packet to check
  * @hdr_size: size of the encapsulation header
  *
- * Returns true if the packet was snooped and consumed by DAT. False if the
+ * Return: true if the packet was snooped and consumed by DAT. False if the
  * packet has to be delivered to the interface
  */
 bool batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv,
@@ -1200,7 +1201,7 @@ out:
  * @bat_priv: the bat priv with all the soft interface information
  * @forw_packet: the broadcast packet
  *
- * Returns true if the node can drop the packet, false otherwise.
+ * Return: true if the node can drop the packet, false otherwise.
  */
 bool batadv_dat_drop_broadcast_packet(struct batadv_priv *bat_priv,
 				      struct batadv_forw_packet *forw_packet)
diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index fedddbe0cd28..80eddf44a827 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -85,7 +85,7 @@ void batadv_frag_purge_orig(struct batadv_orig_node *orig_node,
 /**
  * batadv_frag_size_limit - maximum possible size of packet to be fragmented
  *
- * Returns the maximum size of payload that can be fragmented.
+ * Return: the maximum size of payload that can be fragmented.
  */
 static int batadv_frag_size_limit(void)
 {
@@ -107,7 +107,7 @@ static int batadv_frag_size_limit(void)
  *
  * Caller must hold chain->lock.
  *
- * Returns true if chain is empty and caller can just insert the new fragment
+ * Return: true if chain is empty and caller can just insert the new fragment
  * without searching for the right position.
  */
 static bool batadv_frag_init_chain(struct batadv_frag_table_entry *chain,
@@ -136,7 +136,7 @@ static bool batadv_frag_init_chain(struct batadv_frag_table_entry *chain,
  * Insert a new fragment into the reverse ordered chain in the right table
  * entry. The hash table entry is cleared if "old" fragments exist in it.
  *
- * Returns true if skb is buffered, false on error. If the chain has all the
+ * Return: true if skb is buffered, false on error. If the chain has all the
  * fragments needed to merge the packet, the chain is moved to the passed head
  * to avoid locking the chain in the table.
  */
@@ -246,7 +246,7 @@ err:
  * Expand the first skb in the chain and copy the content of the remaining
  * skb's into the expanded one. After doing so, clear the chain.
  *
- * Returns the merged skb or NULL on error.
+ * Return: the merged skb or NULL on error.
  */
 static struct sk_buff *
 batadv_frag_merge_packets(struct hlist_head *chain)
@@ -306,6 +306,9 @@ free:
  * There are three possible outcomes: 1) Packet is merged: Return true and
  * set *skb to merged packet; 2) Packet is buffered: Return true and set *skb
  * to NULL; 3) Error: Return false and leave skb as is.
+ *
+ * Return: true when packet is merged or buffered, false when skb is not not
+ * used.
  */
 bool batadv_frag_skb_buffer(struct sk_buff **skb,
 			    struct batadv_orig_node *orig_node_src)
@@ -343,7 +346,7 @@ out_err:
  * will exceed the MTU towards the next-hop. If so, the fragment is forwarded
  * without merging it.
  *
- * Returns true if the fragment is consumed/forwarded, false otherwise.
+ * Return: true if the fragment is consumed/forwarded, false otherwise.
  */
 bool batadv_frag_skb_fwd(struct sk_buff *skb,
 			 struct batadv_hard_iface *recv_if,
@@ -398,7 +401,7 @@ out:
  * passed mtu and the old one with the rest. The new skb contains data from the
  * tail of the old skb.
  *
- * Returns the new fragment, NULL on error.
+ * Return: the new fragment, NULL on error.
  */
 static struct sk_buff *batadv_frag_create(struct sk_buff *skb,
 					  struct batadv_frag_packet *frag_head,
@@ -432,7 +435,7 @@ err:
  * @orig_node: final destination of the created fragments
  * @neigh_node: next-hop of the created fragments
  *
- * Returns true on success, false otherwise.
+ * Return: true on success, false otherwise.
  */
 bool batadv_frag_send_packet(struct sk_buff *skb,
 			     struct batadv_orig_node *orig_node,
diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h
index 8b9877e70b95..c5476fe79dc6 100644
--- a/net/batman-adv/fragmentation.h
+++ b/net/batman-adv/fragmentation.h
@@ -42,7 +42,7 @@ bool batadv_frag_send_packet(struct sk_buff *skb,
  * batadv_frag_check_entry - check if a list of fragments has timed out
  * @frags_entry: table entry to check
  *
- * Returns true if the frags entry has timed out, false otherwise.
+ * Return: true if the frags entry has timed out, false otherwise.
  */
 static inline bool
 batadv_frag_check_entry(struct batadv_frag_table_entry *frags_entry)
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index e6c8382c79ba..83507750bb66 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -456,7 +456,7 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv,
  * @bat_priv: the bat priv with all the soft interface information
  * @orig_node: originator announcing gateway capabilities
  *
- * Returns gateway node if found or NULL otherwise.
+ * Return: gateway node if found or NULL otherwise.
  */
 static struct batadv_gw_node *
 batadv_gw_node_get(struct batadv_priv *bat_priv,
@@ -655,13 +655,13 @@ out:
  * @chaddr: buffer where the client address will be stored. Valid
  *  only if the function returns BATADV_DHCP_TO_CLIENT
  *
- * Returns:
+ * This function may re-allocate the data buffer of the skb passed as argument.
+ *
+ * Return:
  * - BATADV_DHCP_NO if the packet is not a dhcp message or if there was an error
  *   while parsing it
  * - BATADV_DHCP_TO_SERVER if this is a message going to the DHCP server
  * - BATADV_DHCP_TO_CLIENT if this is a message going to a DHCP client
- *
- * This function may re-allocate the data buffer of the skb passed as argument.
  */
 enum batadv_dhcp_recipient
 batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len,
@@ -776,11 +776,11 @@ batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len,
  * server. Due to topology changes it may be the case that the GW server
  * previously selected is not the best one anymore.
  *
- * Returns true if the packet destination is unicast and it is not the best gw,
- * false otherwise.
- *
  * This call might reallocate skb data.
  * Must be invoked only when the DHCP packet is going TO a DHCP SERVER.
+ *
+ * Return: true if the packet destination is unicast and it is not the best gw,
+ * false otherwise.
  */
 bool batadv_gw_out_of_range(struct batadv_priv *bat_priv,
 			    struct sk_buff *skb)
diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c
index b51bface8bdd..b287448b9921 100644
--- a/net/batman-adv/gateway_common.c
+++ b/net/batman-adv/gateway_common.c
@@ -38,7 +38,7 @@
  * @description: text shown when throughput string cannot be parsed
  * @throughput: pointer holding the returned throughput information
  *
- * Returns false on parse error and true otherwise.
+ * Return: false on parse error and true otherwise.
  */
 static bool batadv_parse_throughput(struct net_device *net_dev, char *buff,
 				    const char *description, u32 *throughput)
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 01acccc4d218..49e05d238667 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -85,7 +85,7 @@ out:
  * This function recursively checks all the fathers of the device passed as
  * argument looking for a batman-adv soft interface.
  *
- * Returns true if the device is descendant of a batman-adv mesh interface (or
+ * Return: true if the device is descendant of a batman-adv mesh interface (or
  * if it is a batman-adv interface itself), false otherwise
  */
 static bool batadv_is_on_batman_iface(const struct net_device *net_dev)
@@ -136,7 +136,7 @@ static int batadv_is_valid_iface(const struct net_device *net_dev)
  *  interface
  * @net_device: the device to check
  *
- * Returns true if the net device is a 802.11 wireless device, false otherwise.
+ * Return: true if the net device is a 802.11 wireless device, false otherwise.
  */
 bool batadv_is_wifi_netdev(struct net_device *net_device)
 {
@@ -401,7 +401,8 @@ batadv_hardif_deactivate_interface(struct batadv_hard_iface *hard_iface)
  *
  * Invoke ndo_del_slave on master passing slave as argument. In this way slave
  * is free'd and master can correctly change its internal state.
- * Return 0 on success, a negative value representing the error otherwise
+ *
+ * Return: 0 on success, a negative value representing the error otherwise
  */
 static int batadv_master_del_slave(struct batadv_hard_iface *slave,
 				   struct net_device *master)
diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h
index 377626250ac7..071a87ea4f2f 100644
--- a/net/batman-adv/hash.h
+++ b/net/batman-adv/hash.h
@@ -30,14 +30,17 @@
 struct lock_class_key;
 
 /* callback to a compare function.  should compare 2 element datas for their
- * keys, return 0 if same and not 0 if not same
+ * keys
+ *
+ * Return: 0 if same and not 0 if not same
  */
 typedef int (*batadv_hashdata_compare_cb)(const struct hlist_node *,
 					  const void *);
 
-/* the hashfunction, should return an index
- * based on the key in the data of the first
- * argument and the size the second
+/* the hashfunction
+ *
+ * Return: an index based on the key in the data of the first argument and the
+ * size the second
  */
 typedef u32 (*batadv_hashdata_choose_cb)(const void *, u32);
 typedef void (*batadv_hashdata_free_cb)(struct hlist_node *, void *);
@@ -96,7 +99,7 @@ static inline void batadv_hash_delete(struct batadv_hashtable *hash,
  *	@data: data passed to the aforementioned callbacks as argument
  *	@data_node: to be added element
  *
- *	Returns 0 on success, 1 if the element already is in the hash
+ *	Return: 0 on success, 1 if the element already is in the hash
  *	and -1 on error.
  */
 static inline int batadv_hash_add(struct batadv_hashtable *hash,
@@ -139,10 +142,11 @@ out:
 	return ret;
 }
 
-/* removes data from hash, if found. returns pointer do data on success, so you
- * can remove the used structure yourself, or NULL on error .  data could be the
- * structure you use with just the key filled, we just need the key for
- * comparing.
+/* removes data from hash, if found. data could be the structure you use with
+ * just the key filled, we just need the key for comparing.
+ *
+ * Return: returns pointer do data on success, so you can remove the used
+ * structure yourself, or NULL on error
  */
 static inline void *batadv_hash_remove(struct batadv_hashtable *hash,
 				       batadv_hashdata_compare_cb compare,
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 4b5d61fbadb1..70907f6f37ed 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -233,7 +233,7 @@ void batadv_mesh_free(struct net_device *soft_iface)
  * @bat_priv: the bat priv with all the soft interface information
  * @addr: the address to check
  *
- * Returns 'true' if the mac address was found, false otherwise.
+ * Return: 'true' if the mac address was found, false otherwise.
  */
 bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr)
 {
@@ -262,7 +262,7 @@ bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr)
  *  function that requires the primary interface
  * @seq: debugfs table seq_file struct
  *
- * Returns primary interface if found or NULL otherwise.
+ * Return: primary interface if found or NULL otherwise.
  */
 struct batadv_hard_iface *
 batadv_seq_print_text_primary_if_get(struct seq_file *seq)
@@ -297,7 +297,7 @@ out:
  * batadv_max_header_len - calculate maximum encapsulation overhead for a
  *  payload packet
  *
- * Return the maximum encapsulation overhead in bytes.
+ * Return: the maximum encapsulation overhead in bytes.
  */
 int batadv_max_header_len(void)
 {
@@ -640,7 +640,7 @@ batadv_tvlv_handler_free_ref(struct batadv_tvlv_handler *tvlv_handler)
  * @type: tvlv handler type to look for
  * @version: tvlv handler version to look for
  *
- * Returns tvlv handler if found or NULL otherwise.
+ * Return: tvlv handler if found or NULL otherwise.
  */
 static struct batadv_tvlv_handler
 *batadv_tvlv_handler_get(struct batadv_priv *bat_priv, u8 type, u8 version)
@@ -688,7 +688,7 @@ static void batadv_tvlv_container_free_ref(struct batadv_tvlv_container *tvlv)
  * Has to be called with the appropriate locks being acquired
  * (tvlv.container_list_lock).
  *
- * Returns tvlv container if found or NULL otherwise.
+ * Return: tvlv container if found or NULL otherwise.
  */
 static struct batadv_tvlv_container
 *batadv_tvlv_container_get(struct batadv_priv *bat_priv, u8 type, u8 version)
@@ -720,7 +720,7 @@ static struct batadv_tvlv_container
  * Has to be called with the appropriate locks being acquired
  * (tvlv.container_list_lock).
  *
- * Returns size of all currently registered tvlv containers in bytes.
+ * Return: size of all currently registered tvlv containers in bytes.
  */
 static u16 batadv_tvlv_container_list_size(struct batadv_priv *bat_priv)
 {
@@ -826,7 +826,7 @@ void batadv_tvlv_container_register(struct batadv_priv *bat_priv,
  * @additional_packet_len: requested additional packet size on top of minimum
  *  size
  *
- * Returns true of the packet buffer could be changed to the requested size,
+ * Return: true of the packet buffer could be changed to the requested size,
  * false otherwise.
  */
 static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff,
@@ -862,7 +862,7 @@ static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff,
  * The ogm packet might be enlarged or shrunk depending on the current size
  * and the size of the to-be-appended tvlv containers.
  *
- * Returns size of all appended tvlv containers in bytes.
+ * Return: size of all appended tvlv containers in bytes.
  */
 u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv,
 				     unsigned char **packet_buff,
@@ -915,7 +915,7 @@ end:
  * @tvlv_value: tvlv content
  * @tvlv_value_len: tvlv content length
  *
- * Returns success if handler was not found or the return value of the handler
+ * Return: success if handler was not found or the return value of the handler
  * callback.
  */
 static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv,
@@ -968,7 +968,7 @@ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv,
  * @tvlv_value: tvlv content
  * @tvlv_value_len: tvlv content length
  *
- * Returns success when processing an OGM or the return value of all called
+ * Return: success when processing an OGM or the return value of all called
  * handler callbacks.
  */
 int batadv_tvlv_containers_process(struct batadv_priv *bat_priv,
@@ -1190,8 +1190,8 @@ out:
  * @skb: the buffer containing the packet
  * @header_len: length of the batman header preceding the ethernet header
  *
- * If the packet embedded in the skb is vlan tagged this function returns the
- * VID with the BATADV_VLAN_HAS_TAG flag. Otherwise BATADV_NO_FLAGS is returned.
+ * Return: VID with the BATADV_VLAN_HAS_TAG flag when the packet embedded in the
+ * skb is vlan tagged. Otherwise BATADV_NO_FLAGS.
  */
 unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len)
 {
@@ -1218,7 +1218,7 @@ unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len)
  * @vid: the VLAN identifier for which the AP isolation attributed as to be
  *  looked up
  *
- * Returns true if AP isolation is on for the VLAN idenfied by vid, false
+ * Return: true if AP isolation is on for the VLAN idenfied by vid, false
  * otherwise
  */
 bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid)
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 9dbd9107e7e1..8a48d4a40828 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -273,9 +273,12 @@ static inline void _batadv_dbg(int type __always_unused,
 		pr_err("%s: " fmt, _netdev->name, ## arg);		\
 	} while (0)
 
-/* returns 1 if they are the same ethernet addr
+/**
+ * batadv_compare_eth - Compare two not u16 aligned Ethernet addresses
  *
  * note: can't use ether_addr_equal() as it requires aligned memory
+ *
+ * Return: 1 if they are the same ethernet addr
  */
 static inline bool batadv_compare_eth(const void *data1, const void *data2)
 {
@@ -287,7 +290,7 @@ static inline bool batadv_compare_eth(const void *data1, const void *data2)
  * @timestamp:		base value to compare with (in jiffies)
  * @timeout:		added to base value before comparing (in milliseconds)
  *
- * Returns true if current time is after timestamp + timeout
+ * Return: true if current time is after timestamp + timeout
  */
 static inline bool batadv_has_timed_out(unsigned long timestamp,
 					unsigned int timeout)
@@ -326,7 +329,11 @@ static inline void batadv_add_counter(struct batadv_priv *bat_priv, size_t idx,
 
 #define batadv_inc_counter(b, i) batadv_add_counter(b, i, 1)
 
-/* Sum and return the cpu-local counters for index 'idx' */
+/**
+ * batadv_sum_counter - Sum the cpu-local counters for index 'idx'
+ *
+ * Return: sum of all cpu-local counters
+ */
 static inline u64 batadv_sum_counter(struct batadv_priv *bat_priv,  size_t idx)
 {
 	u64 *counters, sum = 0;
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index 75fa5013af72..d984eee1776c 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -55,7 +55,7 @@
  * Collect multicast addresses of the local multicast listeners
  * on the given soft interface, dev, in the given mcast_list.
  *
- * Returns -ENOMEM on memory allocation error or the number of
+ * Return: -ENOMEM on memory allocation error or the number of
  * items added to the mcast_list otherwise.
  */
 static int batadv_mcast_mla_softif_get(struct net_device *dev,
@@ -87,7 +87,7 @@ static int batadv_mcast_mla_softif_get(struct net_device *dev,
  * @mcast_addr: the multicast address to check
  * @mcast_list: the list with multicast addresses to search in
  *
- * Returns true if the given address is already in the given list.
+ * Return: true if the given address is already in the given list.
  * Otherwise returns false.
  */
 static bool batadv_mcast_mla_is_duplicate(u8 *mcast_addr,
@@ -195,8 +195,9 @@ static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv,
  * batadv_mcast_has_bridge - check whether the soft-iface is bridged
  * @bat_priv: the bat priv with all the soft interface information
  *
- * Checks whether there is a bridge on top of our soft interface. Returns
- * true if so, false otherwise.
+ * Checks whether there is a bridge on top of our soft interface.
+ *
+ * Return: true if there is a bridge, false otherwise.
  */
 static bool batadv_mcast_has_bridge(struct batadv_priv *bat_priv)
 {
@@ -218,7 +219,7 @@ static bool batadv_mcast_has_bridge(struct batadv_priv *bat_priv)
  * Updates the own multicast tvlv with our current multicast related settings,
  * capabilities and inabilities.
  *
- * Returns true if the tvlv container is registered afterwards. Otherwise
+ * Return: true if the tvlv container is registered afterwards. Otherwise
  * returns false.
  */
 static bool batadv_mcast_mla_tvlv_update(struct batadv_priv *bat_priv)
@@ -289,8 +290,8 @@ out:
  * Checks whether the given IPv4 packet has the potential to be forwarded with a
  * mode more optimal than classic flooding.
  *
- * If so then returns 0. Otherwise -EINVAL is returned or -ENOMEM in case of
- * memory allocation failure.
+ * Return: If so then 0. Otherwise -EINVAL or -ENOMEM in case of memory
+ * allocation failure.
  */
 static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv,
 					     struct sk_buff *skb,
@@ -327,8 +328,7 @@ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv,
  * Checks whether the given IPv6 packet has the potential to be forwarded with a
  * mode more optimal than classic flooding.
  *
- * If so then returns 0. Otherwise -EINVAL is returned or -ENOMEM if we are out
- * of memory.
+ * Return: If so then 0. Otherwise -EINVAL is or -ENOMEM if we are out of memory
  */
 static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv,
 					     struct sk_buff *skb,
@@ -366,8 +366,7 @@ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv,
  * Checks whether the given multicast ethernet frame has the potential to be
  * forwarded with a mode more optimal than classic flooding.
  *
- * If so then returns 0. Otherwise -EINVAL is returned or -ENOMEM if we are out
- * of memory.
+ * Return: If so then 0. Otherwise -EINVAL is or -ENOMEM if we are out of memory
  */
 static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv,
 					struct sk_buff *skb,
@@ -398,7 +397,7 @@ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv,
  * @bat_priv: the bat priv with all the soft interface information
  * @ethhdr: ethernet header of a packet
  *
- * Returns the number of nodes which want all IPv4 multicast traffic if the
+ * Return: the number of nodes which want all IPv4 multicast traffic if the
  * given ethhdr is from an IPv4 packet or the number of nodes which want all
  * IPv6 traffic if it matches an IPv6 packet.
  */
@@ -421,7 +420,7 @@ static int batadv_mcast_forw_want_all_ip_count(struct batadv_priv *bat_priv,
  * @bat_priv: the bat priv with all the soft interface information
  * @ethhdr: the ether header containing the multicast destination
  *
- * Returns an orig_node matching the multicast address provided by ethhdr
+ * Return: an orig_node matching the multicast address provided by ethhdr
  * via a translation table lookup. This increases the returned nodes refcount.
  */
 static struct batadv_orig_node *
@@ -436,7 +435,7 @@ batadv_mcast_forw_tt_node_get(struct batadv_priv *bat_priv,
  * batadv_mcast_want_forw_ipv4_node_get - get a node with an ipv4 flag
  * @bat_priv: the bat priv with all the soft interface information
  *
- * Returns an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 flag set and
+ * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 flag set and
  * increases its refcount.
  */
 static struct batadv_orig_node *
@@ -463,7 +462,7 @@ batadv_mcast_forw_ipv4_node_get(struct batadv_priv *bat_priv)
  * batadv_mcast_want_forw_ipv6_node_get - get a node with an ipv6 flag
  * @bat_priv: the bat priv with all the soft interface information
  *
- * Returns an orig_node which has the BATADV_MCAST_WANT_ALL_IPV6 flag set
+ * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV6 flag set
  * and increases its refcount.
  */
 static struct batadv_orig_node *
@@ -491,7 +490,7 @@ batadv_mcast_forw_ipv6_node_get(struct batadv_priv *bat_priv)
  * @bat_priv: the bat priv with all the soft interface information
  * @ethhdr: an ethernet header to determine the protocol family from
  *
- * Returns an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 or
+ * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 or
  * BATADV_MCAST_WANT_ALL_IPV6 flag, depending on the provided ethhdr, set and
  * increases its refcount.
  */
@@ -514,7 +513,7 @@ batadv_mcast_forw_ip_node_get(struct batadv_priv *bat_priv,
  * batadv_mcast_want_forw_unsnoop_node_get - get a node with an unsnoopable flag
  * @bat_priv: the bat priv with all the soft interface information
  *
- * Returns an orig_node which has the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag
+ * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag
  * set and increases its refcount.
  */
 static struct batadv_orig_node *
@@ -543,7 +542,7 @@ batadv_mcast_forw_unsnoop_node_get(struct batadv_priv *bat_priv)
  * @skb: The multicast packet to check
  * @orig: an originator to be set to forward the skb to
  *
- * Returns the forwarding mode as enum batadv_forw_mode and in case of
+ * Return: the forwarding mode as enum batadv_forw_mode and in case of
  * BATADV_FORW_SINGLE set the orig to the single originator the skb
  * should be forwarded to.
  */
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index cc63b44f0d2e..1a75d288f8a7 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -251,7 +251,7 @@ static void batadv_nc_packet_free(struct batadv_nc_packet *nc_packet)
  * @bat_priv: the bat priv with all the soft interface information
  * @nc_node: the nc node to check
  *
- * Returns true if the entry has to be purged now, false otherwise
+ * Return: true if the entry has to be purged now, false otherwise
  */
 static bool batadv_nc_to_purge_nc_node(struct batadv_priv *bat_priv,
 				       struct batadv_nc_node *nc_node)
@@ -267,7 +267,7 @@ static bool batadv_nc_to_purge_nc_node(struct batadv_priv *bat_priv,
  * @bat_priv: the bat priv with all the soft interface information
  * @nc_path: the nc path to check
  *
- * Returns true if the entry has to be purged now, false otherwise
+ * Return: true if the entry has to be purged now, false otherwise
  */
 static bool batadv_nc_to_purge_nc_path_coding(struct batadv_priv *bat_priv,
 					      struct batadv_nc_path *nc_path)
@@ -287,7 +287,7 @@ static bool batadv_nc_to_purge_nc_path_coding(struct batadv_priv *bat_priv,
  * @bat_priv: the bat priv with all the soft interface information
  * @nc_path: the nc path to check
  *
- * Returns true if the entry has to be purged now, false otherwise
+ * Return: true if the entry has to be purged now, false otherwise
  */
 static bool batadv_nc_to_purge_nc_path_decoding(struct batadv_priv *bat_priv,
 						struct batadv_nc_path *nc_path)
@@ -470,7 +470,7 @@ static void batadv_nc_hash_key_gen(struct batadv_nc_path *key, const char *src,
  * @data: data to hash
  * @size: size of the hash table
  *
- * Returns the selected index in the hash table for the given data.
+ * Return: the selected index in the hash table for the given data.
  */
 static u32 batadv_nc_hash_choose(const void *data, u32 size)
 {
@@ -489,7 +489,7 @@ static u32 batadv_nc_hash_choose(const void *data, u32 size)
  * @node: node in the local table
  * @data2: second object to compare the node to
  *
- * Returns 1 if the two entry are the same, 0 otherwise
+ * Return: 1 if the two entry are the same, 0 otherwise
  */
 static int batadv_nc_hash_compare(const struct hlist_node *node,
 				  const void *data2)
@@ -516,7 +516,7 @@ static int batadv_nc_hash_compare(const struct hlist_node *node,
  * @hash: hash table containing the nc path
  * @data: search key
  *
- * Returns the nc_path if found, NULL otherwise.
+ * Return: the nc_path if found, NULL otherwise.
  */
 static struct batadv_nc_path *
 batadv_nc_hash_find(struct batadv_hashtable *hash,
@@ -571,7 +571,7 @@ static void batadv_nc_send_packet(struct batadv_nc_packet *nc_packet)
  * timeout. If so, the packet is no longer kept and the entry deleted from the
  * queue. Has to be called with the appropriate locks.
  *
- * Returns false as soon as the entry in the fifo queue has not been timed out
+ * Return: false as soon as the entry in the fifo queue has not been timed out
  * yet and true otherwise.
  */
 static bool batadv_nc_sniffed_purge(struct batadv_priv *bat_priv,
@@ -610,7 +610,7 @@ out:
  * packet is no longer delayed, immediately sent and the entry deleted from the
  * queue. Has to be called with the appropriate locks.
  *
- * Returns false as soon as the entry in the fifo queue has not been timed out
+ * Return: false as soon as the entry in the fifo queue has not been timed out
  * yet and true otherwise.
  */
 static bool batadv_nc_fwd_flush(struct batadv_priv *bat_priv,
@@ -731,7 +731,7 @@ static void batadv_nc_worker(struct work_struct *work)
  * @orig_node: neighboring orig node which may be used as nc candidate
  * @ogm_packet: incoming ogm packet also used for the checks
  *
- * Returns true if:
+ * Return: true if:
  *  1) The OGM must have the most recent sequence number.
  *  2) The TTL must be decremented by one and only one.
  *  3) The OGM must be received from the first hop from orig_node.
@@ -772,7 +772,7 @@ static bool batadv_can_nc_with_orig(struct batadv_priv *bat_priv,
  *  (can be equal to orig_node)
  * @in_coding: traverse incoming or outgoing network coding list
  *
- * Returns the nc_node if found, NULL otherwise.
+ * Return: the nc_node if found, NULL otherwise.
  */
 static struct batadv_nc_node
 *batadv_nc_find_nc_node(struct batadv_orig_node *orig_node,
@@ -814,7 +814,7 @@ static struct batadv_nc_node
  *  (can be equal to orig_node)
  * @in_coding: traverse incoming or outgoing network coding list
  *
- * Returns the nc_node if found or created, NULL in case of an error.
+ * Return: the nc_node if found or created, NULL in case of an error.
  */
 static struct batadv_nc_node
 *batadv_nc_get_nc_node(struct batadv_priv *bat_priv,
@@ -932,7 +932,7 @@ out:
  * @src: ethernet source address - first half of the nc path search key
  * @dst: ethernet destination address - second half of the nc path search key
  *
- * Returns pointer to nc_path if the path was found or created, returns NULL
+ * Return: pointer to nc_path if the path was found or created, returns NULL
  * on error.
  */
 static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv,
@@ -1029,7 +1029,7 @@ static void batadv_nc_memxor(char *dst, const char *src, unsigned int len)
  * @nc_packet: structure containing the packet to the skb can be coded with
  * @neigh_node: next hop to forward packet to
  *
- * Returns true if both packets are consumed, false otherwise.
+ * Return: true if both packets are consumed, false otherwise.
  */
 static bool batadv_nc_code_packets(struct batadv_priv *bat_priv,
 				   struct sk_buff *skb,
@@ -1228,7 +1228,7 @@ out:
  * Since the source encoded the packet we can be certain it has all necessary
  * decode information.
  *
- * Returns true if coding of a decoded packet is allowed.
+ * Return: true if coding of a decoded packet is allowed.
  */
 static bool batadv_nc_skb_coding_possible(struct sk_buff *skb, u8 *dst, u8 *src)
 {
@@ -1246,7 +1246,7 @@ static bool batadv_nc_skb_coding_possible(struct sk_buff *skb, u8 *dst, u8 *src)
  * @skb: data skb to forward
  * @eth_dst: next hop mac address of skb
  *
- * Returns true if coding of a decoded skb is allowed.
+ * Return: true if coding of a decoded skb is allowed.
  */
 static struct batadv_nc_packet *
 batadv_nc_path_search(struct batadv_priv *bat_priv,
@@ -1314,7 +1314,7 @@ batadv_nc_path_search(struct batadv_priv *bat_priv,
  * @eth_src: source mac address of skb
  * @in_nc_node: pointer to skb next hop's neighbor nc node
  *
- * Returns an nc packet if a suitable coding packet was found, NULL otherwise.
+ * Return: an nc packet if a suitable coding packet was found, NULL otherwise.
  */
 static struct batadv_nc_packet *
 batadv_nc_skb_src_search(struct batadv_priv *bat_priv,
@@ -1397,7 +1397,7 @@ static void batadv_nc_skb_store_before_coding(struct batadv_priv *bat_priv,
  * next hop that potentially sent a packet which our next hop also received
  * (overheard) and has stored for later decoding.
  *
- * Returns true if the skb was consumed (encoded packet sent) or false otherwise
+ * Return: true if the skb was consumed (encoded packet sent) or false otherwise
  */
 static bool batadv_nc_skb_dst_search(struct sk_buff *skb,
 				     struct batadv_neigh_node *neigh_node,
@@ -1451,7 +1451,7 @@ static bool batadv_nc_skb_dst_search(struct sk_buff *skb,
  * @neigh_node: next hop to forward packet to
  * @packet_id: checksum to identify packet
  *
- * Returns true if the packet was buffered or false in case of an error.
+ * Return: true if the packet was buffered or false in case of an error.
  */
 static bool batadv_nc_skb_add_to_path(struct sk_buff *skb,
 				      struct batadv_nc_path *nc_path,
@@ -1485,7 +1485,7 @@ static bool batadv_nc_skb_add_to_path(struct sk_buff *skb,
  * @skb: data skb to forward
  * @neigh_node: next hop to forward packet to
  *
- * Returns true if the skb was consumed (encoded packet sent) or false otherwise
+ * Return: true if the skb was consumed (encoded packet sent) or false otherwise
  */
 bool batadv_nc_skb_forward(struct sk_buff *skb,
 			   struct batadv_neigh_node *neigh_node)
@@ -1624,7 +1624,7 @@ void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv,
  * @skb: unicast skb to decode
  * @nc_packet: decode data needed to decode the skb
  *
- * Returns pointer to decoded unicast packet if the packet was decoded or NULL
+ * Return: pointer to decoded unicast packet if the packet was decoded or NULL
  * in case of an error.
  */
 static struct batadv_unicast_packet *
@@ -1718,7 +1718,7 @@ batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb,
  * @ethhdr: pointer to the ethernet header inside the coded packet
  * @coded: coded packet we try to find decode data for
  *
- * Returns pointer to nc packet if the needed data was found or NULL otherwise.
+ * Return: pointer to nc packet if the needed data was found or NULL otherwise.
  */
 static struct batadv_nc_packet *
 batadv_nc_find_decoding_packet(struct batadv_priv *bat_priv,
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index dfeebc89ed9a..ed1d6d7b717e 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -47,7 +47,11 @@ static struct lock_class_key batadv_orig_hash_lock_class_key;
 
 static void batadv_purge_orig(struct work_struct *work);
 
-/* returns 1 if they are the same originator */
+/**
+ * batadv_compare_orig
+ *
+ * Return: 1 if they are the same originator
+ */
 int batadv_compare_orig(const struct hlist_node *node, const void *data2)
 {
 	const void *data1 = container_of(node, struct batadv_orig_node,
@@ -61,7 +65,7 @@ int batadv_compare_orig(const struct hlist_node *node, const void *data2)
  * @orig_node: the originator serving the VLAN
  * @vid: the VLAN identifier
  *
- * Returns the vlan object identified by vid and belonging to orig_node or NULL
+ * Return: the vlan object identified by vid and belonging to orig_node or NULL
  * if it does not exist.
  */
 struct batadv_orig_node_vlan *
@@ -93,7 +97,7 @@ batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node,
  * @orig_node: the originator serving the VLAN
  * @vid: the VLAN identifier
  *
- * Returns NULL in case of failure or the vlan object identified by vid and
+ * Return: NULL in case of failure or the vlan object identified by vid and
  * belonging to orig_node otherwise. The object is created and added to the list
  * if it does not exist.
  *
@@ -266,7 +270,7 @@ void batadv_neigh_node_free_ref(struct batadv_neigh_node *neigh_node)
  * @if_outgoing: the interface where the payload packet has been received or
  *  the OGM should be sent to
  *
- * Returns the neighbor which should be router for this orig_node/iface.
+ * Return: the neighbor which should be router for this orig_node/iface.
  *
  * The object is returned with refcounter increased by 1.
  */
@@ -298,7 +302,7 @@ batadv_orig_router_get(struct batadv_orig_node *orig_node,
  * @orig_node: the orig node to be queried
  * @if_outgoing: the interface for which the ifinfo should be acquired
  *
- * Returns the requested orig_ifinfo or NULL if not found.
+ * Return: the requested orig_ifinfo or NULL if not found.
  *
  * The object is returned with refcounter increased by 1.
  */
@@ -330,7 +334,7 @@ batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node,
  * @orig_node: the orig node to be queried
  * @if_outgoing: the interface for which the ifinfo should be acquired
  *
- * Returns NULL in case of failure or the orig_ifinfo object for the if_outgoing
+ * Return: NULL in case of failure or the orig_ifinfo object for the if_outgoing
  * interface otherwise. The object is created and added to the list
  * if it does not exist.
  *
@@ -380,7 +384,7 @@ out:
  *
  * The object is returned with refcounter increased by 1.
  *
- * Returns the requested neigh_ifinfo or NULL if not found
+ * Return: the requested neigh_ifinfo or NULL if not found
  */
 struct batadv_neigh_ifinfo *
 batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh,
@@ -411,7 +415,7 @@ batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh,
  * @neigh: the neigh node to be queried
  * @if_outgoing: the interface for which the ifinfo should be acquired
  *
- * Returns NULL in case of failure or the neigh_ifinfo object for the
+ * Return: NULL in case of failure or the neigh_ifinfo object for the
  * if_outgoing interface otherwise. The object is created and added to the list
  * if it does not exist.
  *
@@ -459,7 +463,8 @@ out:
  *
  * Looks for and possibly returns a neighbour belonging to this originator list
  * which is connected through the provided hard interface.
- * Returns NULL if the neighbour is not found.
+ *
+ * Return: neighbor when found. Othwerwise NULL
  */
 static struct batadv_neigh_node *
 batadv_neigh_node_get(const struct batadv_orig_node *orig_node,
@@ -492,7 +497,7 @@ batadv_neigh_node_get(const struct batadv_orig_node *orig_node,
  * @hard_iface: the interface this neighbour is connected to
  * @neigh_addr: the interface address of the neighbour to retrieve
  *
- * Returns the hardif neighbour node if found or created or NULL otherwise.
+ * Return: the hardif neighbour node if found or created or NULL otherwise.
  */
 static struct batadv_hardif_neigh_node *
 batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface,
@@ -540,7 +545,7 @@ out:
  * @hard_iface: the interface this neighbour is connected to
  * @neigh_addr: the interface address of the neighbour to retrieve
  *
- * Returns the hardif neighbour node if found or created or NULL otherwise.
+ * Return: the hardif neighbour node if found or created or NULL otherwise.
  */
 static struct batadv_hardif_neigh_node *
 batadv_hardif_neigh_get_or_create(struct batadv_hard_iface *hard_iface,
@@ -562,7 +567,8 @@ batadv_hardif_neigh_get_or_create(struct batadv_hard_iface *hard_iface,
  * @neigh_addr: the address of the neighbour
  *
  * Looks for and possibly returns a neighbour belonging to this hard interface.
- * Returns NULL if the neighbour is not found.
+ *
+ * Return: neighbor when found. Othwerwise NULL
  */
 struct batadv_hardif_neigh_node *
 batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface,
@@ -594,7 +600,8 @@ batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface,
  * @neigh_addr: the mac address of the neighbour interface
  *
  * Allocates a new neigh_node object and initialises all the generic fields.
- * Returns the new object or NULL on failure.
+ *
+ * Return: neighbor when found. Othwerwise NULL
  */
 struct batadv_neigh_node *
 batadv_neigh_node_new(struct batadv_orig_node *orig_node,
@@ -656,7 +663,7 @@ out:
  * @seq: neighbour table seq_file struct
  * @offset: not used
  *
- * Always returns 0.
+ * Return: always 0
  */
 int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset)
 {
@@ -820,7 +827,8 @@ void batadv_originator_free(struct batadv_priv *bat_priv)
  *
  * Creates a new originator object and initialise all the generic fields.
  * The new object is not added to the originator list.
- * Returns the newly created object or NULL on failure.
+ *
+ * Return: the newly created object or NULL on failure.
  */
 struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv,
 					      const u8 *addr)
@@ -937,7 +945,7 @@ batadv_purge_neigh_ifinfo(struct batadv_priv *bat_priv,
  * @bat_priv: the bat priv with all the soft interface information
  * @orig_node: orig node which is to be checked
  *
- * Returns true if any ifinfo entry was purged, false otherwise.
+ * Return: true if any ifinfo entry was purged, false otherwise.
  */
 static bool
 batadv_purge_orig_ifinfo(struct batadv_priv *bat_priv,
@@ -989,7 +997,7 @@ batadv_purge_orig_ifinfo(struct batadv_priv *bat_priv,
  * @bat_priv: the bat priv with all the soft interface information
  * @orig_node: orig node which is to be checked
  *
- * Returns true if any neighbor was purged, false otherwise
+ * Return: true if any neighbor was purged, false otherwise
  */
 static bool
 batadv_purge_orig_neighbors(struct batadv_priv *bat_priv,
@@ -1048,7 +1056,7 @@ batadv_purge_orig_neighbors(struct batadv_priv *bat_priv,
  * @orig_node: orig node which is to be checked
  * @if_outgoing: the interface for which the metric should be compared
  *
- * Returns the current best neighbor, with refcount increased.
+ * Return: the current best neighbor, with refcount increased.
  */
 static struct batadv_neigh_node *
 batadv_find_best_neighbor(struct batadv_priv *bat_priv,
@@ -1085,7 +1093,7 @@ batadv_find_best_neighbor(struct batadv_priv *bat_priv,
  * This function checks if the orig_node or substructures of it have become
  * obsolete, and purges this information if that's the case.
  *
- * Returns true if the orig_node is to be removed, false otherwise.
+ * Return: true if the orig_node is to be removed, false otherwise.
  */
 static bool batadv_purge_orig_node(struct batadv_priv *bat_priv,
 				   struct batadv_orig_node *orig_node)
@@ -1230,7 +1238,7 @@ int batadv_orig_seq_print_text(struct seq_file *seq, void *offset)
  * @seq: debugfs table seq_file struct
  * @offset: not used
  *
- * Returns 0
+ * Return: 0
  */
 int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset)
 {
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index 54d660806c2a..bcaa7870038b 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -140,9 +140,12 @@ out:
 		batadv_neigh_node_free_ref(router);
 }
 
-/* checks whether the host restarted and is in the protection time.
- * returns:
- *  0 if the packet is to be accepted
+/**
+ * batadv_window_protected checks whether the host restarted and is in the
+ *  protection time.
+ *
+ * Return:
+ *  0 if the packet is to be accepted.
  *  1 if the packet is to be ignored.
  */
 int batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff,
@@ -198,7 +201,7 @@ bool batadv_check_management_packet(struct sk_buff *skb,
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: icmp packet to process
  *
- * Returns NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP
+ * Return: NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP
  * otherwise.
  */
 static int batadv_recv_my_icmp_packet(struct batadv_priv *bat_priv,
@@ -398,10 +401,11 @@ out:
  * @skb: packet to check
  * @hdr_size: size of header to pull
  *
- * Check for short header and bad addresses in given packet. Returns negative
- * value when check fails and 0 otherwise. The negative value depends on the
- * reason: -ENODATA for bad header, -EBADR for broadcast destination or source,
- * and -EREMOTE for non-local (other host) destination.
+ * Check for short header and bad addresses in given packet.
+ *
+ * Return: negative value when check fails and 0 otherwise. The negative value
+ * depends on the reason: -ENODATA for bad header, -EBADR for broadcast
+ * destination or source, and -EREMOTE for non-local (other host) destination.
  */
 static int batadv_check_unicast_packet(struct batadv_priv *bat_priv,
 				       struct sk_buff *skb, int hdr_size)
@@ -435,7 +439,7 @@ static int batadv_check_unicast_packet(struct batadv_priv *bat_priv,
  * @orig_node: the destination node
  * @recv_if: pointer to interface this packet was received on
  *
- * Returns the router which should be used for this orig_node on
+ * Return: the router which should be used for this orig_node on
  * this interface, or NULL if not available.
  */
 struct batadv_neigh_node *
@@ -648,7 +652,7 @@ out:
  * the new corresponding information (originator address where the destination
  * client currently is and its known TTVN)
  *
- * Returns true if the packet header has been updated, false otherwise
+ * Return: true if the packet header has been updated, false otherwise
  */
 static bool
 batadv_reroute_unicast_packet(struct batadv_priv *bat_priv,
@@ -805,7 +809,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv,
  * @skb: unicast tvlv packet to process
  * @recv_if: pointer to interface this packet was received on
  *
- * Returns NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP
+ * Return: NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP
  * otherwise.
  */
 int batadv_recv_unhandled_unicast_packet(struct sk_buff *skb,
@@ -905,7 +909,7 @@ rx_success:
  * @skb: unicast tvlv packet to process
  * @recv_if: pointer to interface this packet was received on
  *
- * Returns NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP
+ * Return: NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP
  * otherwise.
  */
 int batadv_recv_unicast_tvlv(struct sk_buff *skb,
@@ -959,7 +963,7 @@ int batadv_recv_unicast_tvlv(struct sk_buff *skb,
  * the assembled packet will exceed our MTU; 2) Buffer fragment, if we till
  * lack further fragments; 3) Merge fragments, if we have all needed parts.
  *
- * Return NET_RX_DROP if the skb is not consumed, NET_RX_SUCCESS otherwise.
+ * Return: NET_RX_DROP if the skb is not consumed, NET_RX_SUCCESS otherwise.
  */
 int batadv_recv_frag_packet(struct sk_buff *skb,
 			    struct batadv_hard_iface *recv_if)
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index 3fbf7910c4d1..d9b93c567bbc 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -111,7 +111,7 @@ send_skb_err:
  * host, NULL can be passed as recv_if and no interface alternating is
  * attempted.
  *
- * Returns NET_XMIT_SUCCESS on success, NET_XMIT_DROP on failure, or
+ * Return: NET_XMIT_SUCCESS on success, NET_XMIT_DROP on failure, or
  * NET_XMIT_POLICED if the skb is buffered for later transmit.
  */
 int batadv_send_skb_to_orig(struct sk_buff *skb,
@@ -165,7 +165,7 @@ out:
  * @hdr_size: amount of bytes to push at the beginning of the skb
  * @orig_node: the destination node
  *
- * Returns false if the buffer extension was not possible or true otherwise.
+ * Return: false if the buffer extension was not possible or true otherwise.
  */
 static bool
 batadv_send_skb_push_fill_unicast(struct sk_buff *skb, int hdr_size,
@@ -196,7 +196,7 @@ batadv_send_skb_push_fill_unicast(struct sk_buff *skb, int hdr_size,
  * @skb: the skb containing the payload to encapsulate
  * @orig_node: the destination node
  *
- * Returns false if the payload could not be encapsulated or true otherwise.
+ * Return: false if the payload could not be encapsulated or true otherwise.
  */
 static bool batadv_send_skb_prepare_unicast(struct sk_buff *skb,
 					    struct batadv_orig_node *orig_node)
@@ -214,7 +214,7 @@ static bool batadv_send_skb_prepare_unicast(struct sk_buff *skb,
  * @orig: the destination node
  * @packet_subtype: the unicast 4addr packet subtype to use
  *
- * Returns false if the payload could not be encapsulated or true otherwise.
+ * Return: false if the payload could not be encapsulated or true otherwise.
  */
 bool batadv_send_skb_prepare_unicast_4addr(struct batadv_priv *bat_priv,
 					   struct sk_buff *skb,
@@ -265,7 +265,7 @@ out:
  * as packet_type. Then send this frame to the given orig_node and release a
  * reference to this orig_node.
  *
- * Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
+ * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
  */
 int batadv_send_skb_unicast(struct batadv_priv *bat_priv,
 			    struct sk_buff *skb, int packet_type,
@@ -339,7 +339,7 @@ out:
  * BATADV_UNICAST_4ADDR was supplied as packet_type. Then send this frame
  * to the according destination node.
  *
- * Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
+ * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
  */
 int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv,
 				   struct sk_buff *skb, int packet_type,
@@ -373,7 +373,7 @@ int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv,
  * Look up the currently selected gateway. Wrap the given skb into a batman-adv
  * unicast header and send this frame to this gateway node.
  *
- * Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
+ * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
  */
 int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb,
 			   unsigned short vid)
@@ -430,14 +430,16 @@ _batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv,
 			   send_time);
 }
 
-/* add a broadcast packet to the queue and setup timers. broadcast packets
- * are sent multiple times to increase probability for being received.
+/**
+ * batadv_add_bcast_packet_to_list
  *
- * This function returns NETDEV_TX_OK on success and NETDEV_TX_BUSY on
- * errors.
+ * add a broadcast packet to the queue and setup timers. broadcast packets
+ * are sent multiple times to increase probability for being received.
  *
  * The skb is not consumed, so the caller should make sure that the
  * skb is freed.
+ *
+ * Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors.
  */
 int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv,
 				    const struct sk_buff *skb,
diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h
index 82059f259e46..3adc11fe1586 100644
--- a/net/batman-adv/send.h
+++ b/net/batman-adv/send.h
@@ -69,7 +69,7 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb,
  * header via the translation table. Wrap the given skb into a batman-adv
  * unicast header. Then send this frame to the according destination node.
  *
- * Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
+ * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
  */
 static inline int batadv_send_skb_via_tt(struct batadv_priv *bat_priv,
 					 struct sk_buff *skb, u8 *dst_hint,
@@ -92,7 +92,7 @@ static inline int batadv_send_skb_via_tt(struct batadv_priv *bat_priv,
  * unicast-4addr header. Then send this frame to the according destination
  * node.
  *
- * Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
+ * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
  */
 static inline int batadv_send_skb_via_tt_4addr(struct batadv_priv *bat_priv,
 					       struct sk_buff *skb,
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index c923e3c7b6ac..5ee794b62392 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -501,7 +501,7 @@ void batadv_softif_vlan_free_ref(struct batadv_softif_vlan *vlan)
  * @bat_priv: the bat priv with all the soft interface information
  * @vid: the identifier of the vlan object to retrieve
  *
- * Returns the private data of the vlan matching the vid passed as argument or
+ * Return: the private data of the vlan matching the vid passed as argument or
  * NULL otherwise. The refcounter of the returned object is incremented by 1.
  */
 struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv,
@@ -530,7 +530,7 @@ struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv,
  * @bat_priv: the bat priv with all the soft interface information
  * @vid: the VLAN identifier
  *
- * Returns 0 on success, a negative error otherwise.
+ * Return: 0 on success, a negative error otherwise.
  */
 int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid)
 {
@@ -599,7 +599,7 @@ static void batadv_softif_destroy_vlan(struct batadv_priv *bat_priv,
  * Set up all the internal structures for handling the new vlan on top of the
  * mesh interface
  *
- * Returns 0 on success or a negative error code in case of failure.
+ * Return: 0 on success or a negative error code in case of failure.
  */
 static int batadv_interface_add_vid(struct net_device *dev, __be16 proto,
 				    unsigned short vid)
@@ -656,7 +656,7 @@ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto,
  * Destroy all the internal structures used to handle the vlan identified by vid
  * on top of the mesh interface
  *
- * Returns 0 on success, -EINVAL if the specified prototype is not ETH_P_8021Q
+ * Return: 0 on success, -EINVAL if the specified prototype is not ETH_P_8021Q
  * or -ENOENT if the specified vlan id wasn't registered.
  */
 static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto,
@@ -745,7 +745,7 @@ static void batadv_softif_destroy_finish(struct work_struct *work)
  * batadv_softif_init_late - late stage initialization of soft interface
  * @dev: registered network device to modify
  *
- * Returns error code on failures
+ * Return: error code on failures
  */
 static int batadv_softif_init_late(struct net_device *dev)
 {
@@ -847,7 +847,7 @@ free_bat_counters:
  * @dev: batadv_soft_interface used as master interface
  * @slave_dev: net_device which should become the slave interface
  *
- * Return 0 if successful or error otherwise.
+ * Return: 0 if successful or error otherwise.
  */
 static int batadv_softif_slave_add(struct net_device *dev,
 				   struct net_device *slave_dev)
@@ -872,7 +872,7 @@ out:
  * @dev: batadv_soft_interface used as master interface
  * @slave_dev: net_device which should be removed from the master interface
  *
- * Return 0 if successful or error otherwise.
+ * Return: 0 if successful or error otherwise.
  */
 static int batadv_softif_slave_del(struct net_device *dev,
 				   struct net_device *slave_dev)
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index 33772d9bd613..7658e6ea0596 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -64,7 +64,7 @@ static struct batadv_priv *batadv_kobj_to_batpriv(struct kobject *obj)
  * batadv_vlan_kobj_to_batpriv - convert a vlan kobj in the associated batpriv
  * @obj: kobject to covert
  *
- * Returns the associated batadv_priv struct.
+ * Return: the associated batadv_priv struct.
  */
 static struct batadv_priv *batadv_vlan_kobj_to_batpriv(struct kobject *obj)
 {
@@ -84,7 +84,7 @@ static struct batadv_priv *batadv_vlan_kobj_to_batpriv(struct kobject *obj)
  * batadv_kobj_to_vlan - convert a kobj in the associated softif_vlan struct
  * @obj: kobject to covert
  *
- * Returns the associated softif_vlan struct if found, NULL otherwise.
+ * Return: the associated softif_vlan struct if found, NULL otherwise.
  */
 static struct batadv_softif_vlan *
 batadv_kobj_to_vlan(struct batadv_priv *bat_priv, struct kobject *obj)
@@ -491,7 +491,7 @@ static ssize_t batadv_store_gw_bwidth(struct kobject *kobj,
  * @attr: the batman-adv attribute the user is interacting with
  * @buff: the buffer that will contain the data to send back to the user
  *
- * Returns the number of bytes written into 'buff' on success or a negative
+ * Return: the number of bytes written into 'buff' on success or a negative
  * error code in case of failure
  */
 static ssize_t batadv_show_isolation_mark(struct kobject *kobj,
@@ -511,7 +511,7 @@ static ssize_t batadv_show_isolation_mark(struct kobject *kobj,
  * @buff: the buffer containing the user data
  * @count: number of bytes in the buffer
  *
- * Returns 'count' on success or a negative error code in case of failure
+ * Return: 'count' on success or a negative error code in case of failure
  */
 static ssize_t batadv_store_isolation_mark(struct kobject *kobj,
 					   struct attribute *attr, char *buff,
@@ -681,7 +681,7 @@ void batadv_sysfs_del_meshif(struct net_device *dev)
  * @dev: netdev of the mesh interface
  * @vlan: private data of the newly added VLAN interface
  *
- * Returns 0 on success and -ENOMEM if any of the structure allocations fails.
+ * Return: 0 on success and -ENOMEM if any of the structure allocations fails.
  */
 int batadv_sysfs_add_vlan(struct net_device *dev,
 			  struct batadv_softif_vlan *vlan)
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 7d8fc90c9e38..af1d24ce420f 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -68,7 +68,11 @@ static void batadv_tt_global_del(struct batadv_priv *bat_priv,
 				 unsigned short vid, const char *message,
 				 bool roaming);
 
-/* returns 1 if they are the same mac addr and vid */
+/**
+ * batadv_compare_tt
+ *
+ * Return: 1 if they are the same mac addr and vid
+ */
 static int batadv_compare_tt(const struct hlist_node *node, const void *data2)
 {
 	const void *data1 = container_of(node, struct batadv_tt_common_entry,
@@ -84,7 +88,7 @@ static int batadv_compare_tt(const struct hlist_node *node, const void *data2)
  * @data: pointer to the tt_common_entry object to map
  * @size: the size of the hash table
  *
- * Returns the hash index where the object represented by 'data' should be
+ * Return: the hash index where the object represented by 'data' should be
  * stored at.
  */
 static inline u32 batadv_choose_tt(const void *data, u32 size)
@@ -105,7 +109,7 @@ static inline u32 batadv_choose_tt(const void *data, u32 size)
  * @addr: the mac address of the client to look for
  * @vid: VLAN identifier
  *
- * Returns a pointer to the tt_common struct belonging to the searched client if
+ * Return: a pointer to the tt_common struct belonging to the searched client if
  * found, NULL otherwise.
  */
 static struct batadv_tt_common_entry *
@@ -150,7 +154,7 @@ batadv_tt_hash_find(struct batadv_hashtable *hash, const u8 *addr,
  * @addr: the mac address of the client to look for
  * @vid: VLAN identifier
  *
- * Returns a pointer to the corresponding tt_local_entry struct if the client is
+ * Return: a pointer to the corresponding tt_local_entry struct if the client is
  * found, NULL otherwise.
  */
 static struct batadv_tt_local_entry *
@@ -175,7 +179,7 @@ batadv_tt_local_hash_find(struct batadv_priv *bat_priv, const u8 *addr,
  * @addr: the mac address of the client to look for
  * @vid: VLAN identifier
  *
- * Returns a pointer to the corresponding tt_global_entry struct if the client
+ * Return: a pointer to the corresponding tt_global_entry struct if the client
  * is found, NULL otherwise.
  */
 static struct batadv_tt_global_entry *
@@ -220,7 +224,7 @@ batadv_tt_global_entry_free_ref(struct batadv_tt_global_entry *tt_global_entry)
  * @addr: the mac address of the client to count entries for
  * @vid: VLAN identifier
  *
- * Return the number of originators advertising the given address/data
+ * Return: the number of originators advertising the given address/data
  * (excluding ourself).
  */
 int batadv_tt_global_hash_count(struct batadv_priv *bat_priv,
@@ -433,7 +437,7 @@ unlock:
  * batadv_tt_len - compute length in bytes of given number of tt changes
  * @changes_num: number of tt changes
  *
- * Returns computed length in bytes.
+ * Return: computed length in bytes.
  */
 static int batadv_tt_len(int changes_num)
 {
@@ -444,7 +448,7 @@ static int batadv_tt_len(int changes_num)
  * batadv_tt_entries - compute the number of entries fitting in tt_len bytes
  * @tt_len: available space
  *
- * Returns the number of entries.
+ * Return: the number of entries.
  */
 static u16 batadv_tt_entries(u16 tt_len)
 {
@@ -456,7 +460,7 @@ static u16 batadv_tt_entries(u16 tt_len)
  *  size when transmitted over the air
  * @bat_priv: the bat priv with all the soft interface information
  *
- * Returns local translation table size in bytes.
+ * Return: local translation table size in bytes.
  */
 static int batadv_tt_local_table_transmit_size(struct batadv_priv *bat_priv)
 {
@@ -522,7 +526,7 @@ static void batadv_tt_global_free(struct batadv_priv *bat_priv,
  * @mark: the value contained in the skb->mark field of the received packet (if
  *  any)
  *
- * Returns true if the client was successfully added, false otherwise.
+ * Return: true if the client was successfully added, false otherwise.
  */
 bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
 			 unsigned short vid, int ifindex, u32 mark)
@@ -722,7 +726,7 @@ out:
  * header made up by one tvlv_tt_data object and a series of tvlv_tt_vlan_data
  * objects, one per active VLAN served by the originator node.
  *
- * Return the size of the allocated buffer or 0 in case of failure.
+ * Return: the size of the allocated buffer or 0 in case of failure.
  */
 static u16
 batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node,
@@ -796,7 +800,7 @@ out:
  * header made up by one tvlv_tt_data object and a series of tvlv_tt_vlan_data
  * objects, one per active VLAN.
  *
- * Return the size of the allocated buffer or 0 in case of failure.
+ * Return: the size of the allocated buffer or 0 in case of failure.
  */
 static u16
 batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv,
@@ -1038,7 +1042,7 @@ batadv_tt_local_set_pending(struct batadv_priv *bat_priv,
  * @message: message to append to the log on deletion
  * @roaming: true if the deletion is due to a roaming event
  *
- * Returns the flags assigned to the local entry before being deleted
+ * Return: the flags assigned to the local entry before being deleted
  */
 u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr,
 			   unsigned short vid, const char *message,
@@ -1238,10 +1242,13 @@ static void batadv_tt_changes_list_free(struct batadv_priv *bat_priv)
 	spin_unlock_bh(&bat_priv->tt.changes_list_lock);
 }
 
-/* retrieves the orig_tt_list_entry belonging to orig_node from the
+/**
+ * batadv_tt_global_orig_entry_find
+ *
+ * retrieves the orig_tt_list_entry belonging to orig_node from the
  * batadv_tt_global_entry list
  *
- * returns it with an increased refcounter, NULL if not found
+ * Return: it with an increased refcounter, NULL if not found
  */
 static struct batadv_tt_orig_list_entry *
 batadv_tt_global_orig_entry_find(const struct batadv_tt_global_entry *entry,
@@ -1266,8 +1273,12 @@ batadv_tt_global_orig_entry_find(const struct batadv_tt_global_entry *entry,
 	return orig_entry;
 }
 
-/* find out if an orig_node is already in the list of a tt_global_entry.
- * returns true if found, false otherwise
+/**
+ * batadv_tt_global_entry_has_orig
+ *
+ * find out if an orig_node is already in the list of a tt_global_entry.
+ *
+ * Return: true if found, false otherwise
  */
 static bool
 batadv_tt_global_entry_has_orig(const struct batadv_tt_global_entry *entry,
@@ -1339,7 +1350,7 @@ out:
  *
  * The caller must hold orig_node refcount.
  *
- * Return true if the new entry has been added, false otherwise
+ * Return: true if the new entry has been added, false otherwise
  */
 static bool batadv_tt_global_add(struct batadv_priv *bat_priv,
 				 struct batadv_orig_node *orig_node,
@@ -1497,7 +1508,7 @@ out:
  * @tt_global_entry: global translation table entry to be analyzed
  *
  * This functon assumes the caller holds rcu_read_lock().
- * Returns best originator list entry or NULL on errors.
+ * Return: best originator list entry or NULL on errors.
  */
 static struct batadv_tt_orig_list_entry *
 batadv_transtable_best_orig(struct batadv_priv *bat_priv,
@@ -2027,7 +2038,7 @@ _batadv_is_ap_isolated(struct batadv_tt_local_entry *tt_local_entry,
  * @addr: mac address of the destination client
  * @vid: VLAN identifier
  *
- * Returns a pointer to the originator that was selected as destination in the
+ * Return: a pointer to the originator that was selected as destination in the
  * mesh for contacting the client 'addr', NULL otherwise.
  * In case of multiple originators serving the same client, the function returns
  * the best one (best in terms of metric towards the destination node).
@@ -2102,7 +2113,7 @@ out:
  * because the XOR operation can combine them all while trying to reduce the
  * noise as much as possible.
  *
- * Returns the checksum of the global table of a given originator.
+ * Return: the checksum of the global table of a given originator.
  */
 static u32 batadv_tt_global_crc(struct batadv_priv *bat_priv,
 				struct batadv_orig_node *orig_node,
@@ -2179,7 +2190,7 @@ static u32 batadv_tt_global_crc(struct batadv_priv *bat_priv,
  * For details about the computation, please refer to the documentation for
  * batadv_tt_global_crc().
  *
- * Returns the checksum of the local table
+ * Return: the checksum of the local table
  */
 static u32 batadv_tt_local_crc(struct batadv_priv *bat_priv,
 			       unsigned short vid)
@@ -2285,7 +2296,7 @@ static void batadv_tt_req_purge(struct batadv_priv *bat_priv)
  * @bat_priv: the bat priv with all the soft interface information
  * @orig_node: orig node this request is being issued for
  *
- * Returns the pointer to the new tt_req_node struct if no request
+ * Return: the pointer to the new tt_req_node struct if no request
  * has already been issued for this orig_node, NULL otherwise.
  */
 static struct batadv_tt_req_node *
@@ -2320,7 +2331,7 @@ unlock:
  * @entry_ptr: to be checked local tt entry
  * @data_ptr: not used but definition required to satisfy the callback prototype
  *
- * Returns 1 if the entry is a valid, 0 otherwise.
+ * Return: 1 if the entry is a valid, 0 otherwise.
  */
 static int batadv_tt_local_valid(const void *entry_ptr, const void *data_ptr)
 {
@@ -2405,7 +2416,7 @@ static void batadv_tt_tvlv_generate(struct batadv_priv *bat_priv,
  * @tt_vlan: pointer to the first tvlv VLAN entry
  * @num_vlan: number of tvlv VLAN entries
  *
- * Return true if all the received CRCs match the locally stored ones, false
+ * Return: true if all the received CRCs match the locally stored ones, false
  * otherwise
  */
 static bool batadv_tt_global_check_crc(struct batadv_orig_node *orig_node,
@@ -2588,7 +2599,7 @@ out:
  * @req_src: mac address of tt request sender
  * @req_dst: mac address of tt request recipient
  *
- * Returns true if tt request reply was sent, false otherwise.
+ * Return: true if tt request reply was sent, false otherwise.
  */
 static bool batadv_send_other_tt_response(struct batadv_priv *bat_priv,
 					  struct batadv_tvlv_tt_data *tt_data,
@@ -2720,7 +2731,7 @@ out:
  * @tt_data: tt data containing the tt request information
  * @req_src: mac address of tt request sender
  *
- * Returns true if tt request reply was sent, false otherwise.
+ * Return: true if tt request reply was sent, false otherwise.
  */
 static bool batadv_send_my_tt_response(struct batadv_priv *bat_priv,
 				       struct batadv_tvlv_tt_data *tt_data,
@@ -2838,7 +2849,7 @@ out:
  * @req_src: mac address of tt request sender
  * @req_dst: mac address of tt request recipient
  *
- * Returns true if tt request reply was sent, false otherwise.
+ * Return: true if tt request reply was sent, false otherwise.
  */
 static bool batadv_send_tt_response(struct batadv_priv *bat_priv,
 				    struct batadv_tvlv_tt_data *tt_data,
@@ -2933,7 +2944,7 @@ static void batadv_tt_update_changes(struct batadv_priv *bat_priv,
  * @addr: the mac address of the client to check
  * @vid: VLAN identifier
  *
- * Returns true if the client is served by this node, false otherwise.
+ * Return: true if the client is served by this node, false otherwise.
  */
 bool batadv_is_my_client(struct batadv_priv *bat_priv, const u8 *addr,
 			 unsigned short vid)
@@ -3050,11 +3061,14 @@ static void batadv_tt_roam_purge(struct batadv_priv *bat_priv)
 	spin_unlock_bh(&bat_priv->tt.roam_list_lock);
 }
 
-/* This function checks whether the client already reached the
+/**
+ * batadv_tt_check_roam_count
+ *
+ * This function checks whether the client already reached the
  * maximum number of possible roaming phases. In this case the ROAMING_ADV
  * will not be sent.
  *
- * returns true if the ROAMING_ADV can be sent, false otherwise
+ * Return: true if the ROAMING_ADV can be sent, false otherwise
  */
 static bool batadv_tt_check_roam_count(struct batadv_priv *bat_priv, u8 *client)
 {
@@ -3453,7 +3467,7 @@ request_table:
  * @addr: the mac address of the client to check
  * @vid: VLAN identifier
  *
- * Returns true if we know that the client has moved from its old originator
+ * Return: true if we know that the client has moved from its old originator
  * to another one. This entry is still kept for consistency purposes and will be
  * deleted later by a DEL or because of timeout
  */
@@ -3479,7 +3493,7 @@ out:
  * @addr: the mac address of the local client to query
  * @vid: VLAN identifier
  *
- * Returns true if the local client is known to be roaming (it is not served by
+ * Return: true if the local client is known to be roaming (it is not served by
  * this node anymore) or not. If yes, the client is still present in the table
  * to keep the latter consistent with the node TTVN
  */
@@ -3608,7 +3622,7 @@ static void batadv_tt_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
  * @tvlv_value: tvlv buffer containing the tt data
  * @tvlv_value_len: tvlv buffer length
  *
- * Returns NET_RX_DROP if the tt tvlv is to be re-routed, NET_RX_SUCCESS
+ * Return: NET_RX_DROP if the tt tvlv is to be re-routed, NET_RX_SUCCESS
  * otherwise.
  */
 static int batadv_tt_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv,
@@ -3689,7 +3703,7 @@ static int batadv_tt_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv,
  * @tvlv_value: tvlv buffer containing the tt data
  * @tvlv_value_len: tvlv buffer length
  *
- * Returns NET_RX_DROP if the tt roam tvlv is to be re-routed, NET_RX_SUCCESS
+ * Return: NET_RX_DROP if the tt roam tvlv is to be re-routed, NET_RX_SUCCESS
  * otherwise.
  */
 static int batadv_roam_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv,
@@ -3735,7 +3749,7 @@ out:
  * batadv_tt_init - initialise the translation table internals
  * @bat_priv: the bat priv with all the soft interface information
  *
- * Return 0 on success or negative error number in case of failure.
+ * Return: 0 on success or negative error number in case of failure.
  */
 int batadv_tt_init(struct batadv_priv *bat_priv)
 {
@@ -3773,7 +3787,7 @@ int batadv_tt_init(struct batadv_priv *bat_priv)
  * @addr: the mac address of the client
  * @vid: the identifier of the VLAN where this client is connected
  *
- * Returns true if the client is marked with the TT_CLIENT_ISOLA flag, false
+ * Return: true if the client is marked with the TT_CLIENT_ISOLA flag, false
  * otherwise
  */
 bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv,
-- 
cgit v1.2.3-59-g8ed1b


From 7afcbbef6471130a1eb586fea0f5c06609b8341f Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sat, 31 Oct 2015 12:29:29 +0100
Subject: batman-adv: Fix kerneldoc of main functions

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/bitarray.c       | 12 +++++++++---
 net/batman-adv/bitarray.h       | 11 +++++------
 net/batman-adv/debugfs.c        |  6 ++++++
 net/batman-adv/main.c           |  2 ++
 net/batman-adv/main.h           |  4 ++++
 net/batman-adv/originator.c     |  4 +++-
 net/batman-adv/packet.h         |  6 ++++++
 net/batman-adv/routing.c        |  7 ++++++-
 net/batman-adv/send.c           |  5 ++++-
 net/batman-adv/soft-interface.c |  2 ++
 net/batman-adv/sysfs.c          |  1 +
 11 files changed, 48 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c
index 06bab5179bf9..3404195f2561 100644
--- a/net/batman-adv/bitarray.c
+++ b/net/batman-adv/bitarray.c
@@ -29,10 +29,16 @@ static void batadv_bitmap_shift_left(unsigned long *seq_bits, s32 n)
 	bitmap_shift_left(seq_bits, seq_bits, n, BATADV_TQ_LOCAL_WINDOW_SIZE);
 }
 
-/* receive and process one packet within the sequence number window.
+/**
+ * batadv_bit_get_packet - receive and process one packet within the sequence
+ *  number window
+ * @priv: the bat priv with all the soft interface information
+ * @seq_bits: pointer to the sequence number receive packet
+ * @seq_num_diff: difference between the current/received sequence number and
+ *  the last sequence number
+ * @set_mark: whether this packet should be marked in seq_bits
  *
- * Return:
- *  1 if the window was moved (either new or very old)
+ * Return: 1 if the window was moved (either new or very old),
  *  0 if the window was not moved/shifted.
  */
 int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff,
diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h
index cf2aeb0831a4..2b64d7a1bc9b 100644
--- a/net/batman-adv/bitarray.h
+++ b/net/batman-adv/bitarray.h
@@ -25,7 +25,11 @@
 #include <linux/types.h>
 
 /**
- * batadv_test_bit
+ * batadv_test_bit - check if bit is set in the current window
+ *
+ * @seq_bits: pointer to the sequence number receive packet
+ * @last_seqno: latest sequence number in seq_bits
+ * @curr_seqno: sequence number to test for
  *
  * Return: 1 if the corresponding bit in the given seq_bits indicates true
  * and curr_seqno is within range of last_seqno. Otherwise returns 0.
@@ -51,11 +55,6 @@ static inline void batadv_set_bit(unsigned long *seq_bits, s32 n)
 	set_bit(n, seq_bits); /* turn the position on */
 }
 
-/**
- * batadv_bit_get_packet - receive and process one packet
- *
- * Return: 1 if received seq_num is considered new, 0 if old
- */
 int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff,
 			  int set_mark);
 
diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
index 037ad0a5f485..0fc9df52f3d9 100644
--- a/net/batman-adv/debugfs.c
+++ b/net/batman-adv/debugfs.c
@@ -281,6 +281,8 @@ static int batadv_originators_open(struct inode *inode, struct file *file)
  *  originator table of an hard interface
  * @inode: inode pointer to debugfs file
  * @file: pointer to the seq_file
+ *
+ * Return: 0 on success or negative error number in case of failure
  */
 static int batadv_originators_hardif_open(struct inode *inode,
 					  struct file *file)
@@ -329,6 +331,8 @@ static int batadv_bla_backbone_table_open(struct inode *inode,
  * batadv_dat_cache_open - Prepare file handler for reads from dat_chache
  * @inode: inode which was opened
  * @file: file handle to be initialized
+ *
+ * Return: 0 on success or negative error number in case of failure
  */
 static int batadv_dat_cache_open(struct inode *inode, struct file *file)
 {
@@ -483,6 +487,8 @@ void batadv_debugfs_destroy(void)
  * batadv_debugfs_add_hardif - creates the base directory for a hard interface
  *  in debugfs.
  * @hard_iface: hard interface which should be added.
+ *
+ * Return: 0 on success or negative error number in case of failure
  */
 int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface)
 {
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 70907f6f37ed..9d34be628304 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -599,6 +599,8 @@ int batadv_algo_seq_print_text(struct seq_file *seq, void *offset)
  *
  * payload_ptr must always point to an address in the skb head buffer and not to
  * a fragment.
+ *
+ * Return: big endian crc32c of the checksummed data
  */
 __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr)
 {
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 8a48d4a40828..34f56efa2e4e 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -275,6 +275,8 @@ static inline void _batadv_dbg(int type __always_unused,
 
 /**
  * batadv_compare_eth - Compare two not u16 aligned Ethernet addresses
+ * @data1: Pointer to a six-byte array containing the Ethernet address
+ * @data2: Pointer other six-byte array containing the Ethernet address
  *
  * note: can't use ether_addr_equal() as it requires aligned memory
  *
@@ -331,6 +333,8 @@ static inline void batadv_add_counter(struct batadv_priv *bat_priv, size_t idx,
 
 /**
  * batadv_sum_counter - Sum the cpu-local counters for index 'idx'
+ * @bat_priv: the bat priv with all the soft interface information
+ * @idx: index of counter to sum up
  *
  * Return: sum of all cpu-local counters
  */
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index ed1d6d7b717e..f6ca4e509729 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -48,7 +48,9 @@ static struct lock_class_key batadv_orig_hash_lock_class_key;
 static void batadv_purge_orig(struct work_struct *work);
 
 /**
- * batadv_compare_orig
+ * batadv_compare_orig - comparing function used in the originator hash table
+ * @node: node in the local table
+ * @data2: second object to compare the node to
  *
  * Return: 1 if they are the same originator
  */
diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h
index 72fd5b0a4183..fdb01637ad1f 100644
--- a/net/batman-adv/packet.h
+++ b/net/batman-adv/packet.h
@@ -209,6 +209,11 @@ struct batadv_bla_claim_dst {
  * @version: batman-adv protocol version, part of the genereal header
  * @ttl: time to live for this packet, part of the genereal header
  * @flags: contains routing relevant flags - see enum batadv_iv_flags
+ * @seqno: sequence identification
+ * @orig: address of the source node
+ * @prev_sender: address of the previous sender
+ * @reserved: reserved byte for alignment
+ * @tq: transmission quality
  * @tvlv_len: length of tvlv data following the ogm header
  */
 struct batadv_ogm_packet {
@@ -345,6 +350,7 @@ struct batadv_unicast_packet {
  * @u: common unicast packet header
  * @src: address of the source
  * @subtype: packet subtype
+ * @reserved: reserved byte for alignment
  */
 struct batadv_unicast_4addr_packet {
 	struct batadv_unicast_packet u;
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index bcaa7870038b..1fb1be31bf3a 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -141,8 +141,13 @@ out:
 }
 
 /**
- * batadv_window_protected checks whether the host restarted and is in the
+ * batadv_window_protected - checks whether the host restarted and is in the
  *  protection time.
+ * @bat_priv: the bat priv with all the soft interface information
+ * @seq_num_diff: difference between the current/received sequence number and
+ *  the last sequence number
+ * @last_reset: jiffies timestamp of the last reset, will be updated when reset
+ *  is detected
  *
  * Return:
  *  0 if the packet is to be accepted.
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index d9b93c567bbc..c188f4660981 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -431,7 +431,10 @@ _batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_add_bcast_packet_to_list
+ * batadv_add_bcast_packet_to_list - queue broadcast packet for multiple sends
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: broadcast packet to add
+ * @delay: number of jiffies to wait before sending
  *
  * add a broadcast packet to the queue and setup timers. broadcast packets
  * are sent multiple times to increase probability for being received.
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 5ee794b62392..6c65de97126c 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -594,6 +594,7 @@ static void batadv_softif_destroy_vlan(struct batadv_priv *bat_priv,
 /**
  * batadv_interface_add_vid - ndo_add_vid API implementation
  * @dev: the netdev of the mesh interface
+ * @proto: protocol of the the vlan id
  * @vid: identifier of the new vlan
  *
  * Set up all the internal structures for handling the new vlan on top of the
@@ -651,6 +652,7 @@ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto,
 /**
  * batadv_interface_kill_vid - ndo_kill_vid API implementation
  * @dev: the netdev of the mesh interface
+ * @proto: protocol of the the vlan id
  * @vid: identifier of the deleted vlan
  *
  * Destroy all the internal structures used to handle the vlan identified by vid
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index 7658e6ea0596..f38d7b75b0a5 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -82,6 +82,7 @@ static struct batadv_priv *batadv_vlan_kobj_to_batpriv(struct kobject *obj)
 
 /**
  * batadv_kobj_to_vlan - convert a kobj in the associated softif_vlan struct
+ * @bat_priv: the bat priv with all the soft interface information
  * @obj: kobject to covert
  *
  * Return: the associated softif_vlan struct if found, NULL otherwise.
-- 
cgit v1.2.3-59-g8ed1b


From 672e797850422bdbbcd3c781a07fe777042ee068 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sat, 31 Oct 2015 12:29:30 +0100
Subject: batman-adv: Fix kerneldoc of network-coding functions

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/network-coding.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'net')

diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index 1a75d288f8a7..3a752bea55fa 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -64,6 +64,8 @@ static int batadv_nc_recv_coded_packet(struct sk_buff *skb,
 
 /**
  * batadv_nc_init - one-time initialization for network coding
+ *
+ * Return: 0 on success or negative error number in case of failure
  */
 int __init batadv_nc_init(void)
 {
@@ -142,6 +144,8 @@ static void batadv_nc_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
 /**
  * batadv_nc_mesh_init - initialise coding hash table and start house keeping
  * @bat_priv: the bat priv with all the soft interface information
+ *
+ * Return: 0 on success or negative error number in case of failure
  */
 int batadv_nc_mesh_init(struct batadv_priv *bat_priv)
 {
@@ -989,6 +993,8 @@ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv,
  * batadv_nc_random_weight_tq - scale the receivers TQ-value to avoid unfair
  *  selection of a receiver with slightly lower TQ than the other
  * @tq: to be weighted tq value
+ *
+ * Return: scaled tq value
  */
 static u8 batadv_nc_random_weight_tq(u8 tq)
 {
@@ -1781,6 +1787,9 @@ batadv_nc_find_decoding_packet(struct batadv_priv *bat_priv,
  *  resulting unicast packet
  * @skb: incoming coded packet
  * @recv_if: pointer to interface this packet was received on
+ *
+ * Return: NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP
+ * otherwise.
  */
 static int batadv_nc_recv_coded_packet(struct sk_buff *skb,
 				       struct batadv_hard_iface *recv_if)
@@ -1865,6 +1874,8 @@ void batadv_nc_mesh_free(struct batadv_priv *bat_priv)
  * batadv_nc_nodes_seq_print_text - print the nc node information
  * @seq: seq file to print on
  * @offset: not used
+ *
+ * Return: always 0
  */
 int batadv_nc_nodes_seq_print_text(struct seq_file *seq, void *offset)
 {
@@ -1927,6 +1938,8 @@ out:
 /**
  * batadv_nc_init_debugfs - create nc folder and related files in debugfs
  * @bat_priv: the bat priv with all the soft interface information
+ *
+ * Return: 0 on success or negative error number in case of failure
  */
 int batadv_nc_init_debugfs(struct batadv_priv *bat_priv)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 04e14be65bdb09f8ed87f75e0b5463ced6706949 Mon Sep 17 00:00:00 2001
From: Simon Wunderlich <sw@simonwunderlich.de>
Date: Fri, 6 Nov 2015 10:45:19 +0100
Subject: batman-adv: Update/repair bridge loop avoidance kerneldoc

Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/bridge_loop_avoidance.c | 178 ++++++++++++++++++++++++---------
 1 file changed, 130 insertions(+), 48 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index 90c79948979c..131aca049907 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -59,9 +59,11 @@ batadv_bla_send_announce(struct batadv_priv *bat_priv,
 			 struct batadv_bla_backbone_gw *backbone_gw);
 
 /**
- * batadv_choose_claim
+ * batadv_choose_claim - choose the right bucket for a claim.
+ * @data: data to hash
+ * @size: size of the hash table
  *
- * Return: the index of the claim
+ * Return: the hash index of the claim
  */
 static inline u32 batadv_choose_claim(const void *data, u32 size)
 {
@@ -75,9 +77,11 @@ static inline u32 batadv_choose_claim(const void *data, u32 size)
 }
 
 /**
- * batadv_choose_backbone_gw
+ * batadv_choose_backbone_gw - choose the right bucket for a backbone gateway.
+ * @data: data to hash
+ * @size: size of the hash table
  *
- * Return: the index of the backbone gateway
+ * Return: the hash index of the backbone gateway
  */
 static inline u32 batadv_choose_backbone_gw(const void *data, u32 size)
 {
@@ -90,7 +94,13 @@ static inline u32 batadv_choose_backbone_gw(const void *data, u32 size)
 	return hash % size;
 }
 
-/* compares address and vid of two backbone gws */
+/**
+ * batadv_compare_backbone_gw - compare address and vid of two backbone gws
+ * @node: list node of the first entry to compare
+ * @data2: pointer to the second backbone gateway
+ *
+ * Return: 1 if the backbones have the same data, 0 otherwise
+ */
 static int batadv_compare_backbone_gw(const struct hlist_node *node,
 				      const void *data2)
 {
@@ -108,7 +118,13 @@ static int batadv_compare_backbone_gw(const struct hlist_node *node,
 	return 1;
 }
 
-/* compares address and vid of two claims */
+/**
+ * batadv_compare_backbone_gw - compare address and vid of two claims
+ * @node: list node of the first entry to compare
+ * @data2: pointer to the second claims
+ *
+ * Return: 1 if the claim have the same data, 0 otherwise
+ */
 static int batadv_compare_claim(const struct hlist_node *node,
 				const void *data2)
 {
@@ -126,7 +142,10 @@ static int batadv_compare_claim(const struct hlist_node *node,
 	return 1;
 }
 
-/* free a backbone gw */
+/**
+ * batadv_compare_backbone_gw - free backbone gw
+ * @backbone_gw: backbone gateway to be free'd
+ */
 static void
 batadv_backbone_gw_free_ref(struct batadv_bla_backbone_gw *backbone_gw)
 {
@@ -134,14 +153,21 @@ batadv_backbone_gw_free_ref(struct batadv_bla_backbone_gw *backbone_gw)
 		kfree_rcu(backbone_gw, rcu);
 }
 
-/* finally deinitialize the claim */
+/**
+ * batadv_claim_release - release claim from lists and queue for free after rcu
+ *  grace period
+ * @ref: kref pointer of the claim
+ */
 static void batadv_claim_release(struct batadv_bla_claim *claim)
 {
 	batadv_backbone_gw_free_ref(claim->backbone_gw);
 	kfree_rcu(claim, rcu);
 }
 
-/* free a claim, call claim_free_rcu if its the last reference */
+/**
+ * batadv_claim_free_rcu - free a claim
+ * @claim: claim to be free'd
+ */
 static void batadv_claim_free_ref(struct batadv_bla_claim *claim)
 {
 	if (atomic_dec_and_test(&claim->refcount))
@@ -149,12 +175,10 @@ static void batadv_claim_free_ref(struct batadv_bla_claim *claim)
 }
 
 /**
- * batadv_claim_hash_find
+ * batadv_claim_hash_find - looks for a claim in the claim hash
  * @bat_priv: the bat priv with all the soft interface information
  * @data: search data (may be local/static data)
  *
- * looks for a claim in the hash
- *
  * Return: claim if found or NULL otherwise.
  */
 static struct batadv_bla_claim
@@ -190,12 +214,12 @@ static struct batadv_bla_claim
 }
 
 /**
- * batadv_backbone_hash_find - looks for a claim in the hash
+ * batadv_backbone_hash_find - looks for a backbone gateway in the hash
  * @bat_priv: the bat priv with all the soft interface information
  * @addr: the address of the originator
  * @vid: the VLAN ID
  *
- * Return: claim if found or NULL otherwise.
+ * Return: backbone gateway if found or NULL otherwise
  */
 static struct batadv_bla_backbone_gw *
 batadv_backbone_hash_find(struct batadv_priv *bat_priv, u8 *addr,
@@ -233,7 +257,10 @@ batadv_backbone_hash_find(struct batadv_priv *bat_priv, u8 *addr,
 	return backbone_gw_tmp;
 }
 
-/* delete all claims for a backbone */
+/**
+ * batadv_bla_del_backbone_claims - delete all claims for a backbone
+ * @backbone_gw: backbone gateway where the claims should be removed
+ */
 static void
 batadv_bla_del_backbone_claims(struct batadv_bla_backbone_gw *backbone_gw)
 {
@@ -381,14 +408,13 @@ out:
 }
 
 /**
- * batadv_bla_get_backbone_gw
+ * batadv_bla_get_backbone_gw - finds or creates a backbone gateway
  * @bat_priv: the bat priv with all the soft interface information
  * @orig: the mac address of the originator
  * @vid: the VLAN ID
  * @own_backbone: set if the requested backbone is local
  *
- * searches for the backbone gw or creates a new one if it could not
- * be found.
+ * Return: the (possibly created) backbone gateway or NULL on error
  */
 static struct batadv_bla_backbone_gw *
 batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig,
@@ -454,7 +480,13 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig,
 	return entry;
 }
 
-/* update or add the own backbone gw to make sure we announce
+/**
+ * batadv_bla_update_own_backbone_gw - updates the own backbone gw for a VLAN
+ * @bat_priv: the bat priv with all the soft interface information
+ * @primary_if: the selected primary interface
+ * @vid: VLAN identifier
+ *
+ * update or add the own backbone gw to make sure we announce
  * where we receive other backbone gws
  */
 static void
@@ -551,12 +583,9 @@ static void batadv_bla_send_request(struct batadv_bla_backbone_gw *backbone_gw)
 }
 
 /**
- * batadv_bla_send_announce
+ * batadv_bla_send_announce - Send an announcement frame
  * @bat_priv: the bat priv with all the soft interface information
  * @backbone_gw: our backbone gateway which should be announced
- *
- * This function sends an announcement. It is called from multiple
- * places.
  */
 static void batadv_bla_send_announce(struct batadv_priv *bat_priv,
 				     struct batadv_bla_backbone_gw *backbone_gw)
@@ -646,8 +675,11 @@ claim_free_ref:
 	batadv_claim_free_ref(claim);
 }
 
-/* Delete a claim from the claim hash which has the
- * given mac address and vid.
+/**
+ * batadv_bla_del_claim - delete a claim from the claim hash
+ * @bat_priv: the bat priv with all the soft interface information
+ * @mac: mac address of the claim to be removed
+ * @vid: VLAN id for the claim to be removed
  */
 static void batadv_bla_del_claim(struct batadv_priv *bat_priv,
 				 const u8 *mac, const unsigned short vid)
@@ -677,6 +709,10 @@ static void batadv_bla_del_claim(struct batadv_priv *bat_priv,
 
 /**
  * batadv_handle_announce - check for ANNOUNCE frame
+ * @bat_priv: the bat priv with all the soft interface information
+ * @an_addr: announcement mac address (ARP Sender HW address)
+ * @backbone_addr: originator address of the sender (Ethernet source MAC)
+ * @vid: the VLAN ID of the frame
  *
  * Return: 1 if handled
  */
@@ -731,6 +767,11 @@ static int batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr,
 
 /**
  * batadv_handle_request - check for REQUEST frame
+ * @bat_priv: the bat priv with all the soft interface information
+ * @primary_if: the primary hard interface of this batman soft interface
+ * @backbone_addr: backbone address to be requested (ARP sender HW MAC)
+ * @ethhdr: ethernet header of a packet
+ * @vid: the VLAN ID of the frame
  *
  * Return: 1 if handled
  */
@@ -759,6 +800,11 @@ static int batadv_handle_request(struct batadv_priv *bat_priv,
 
 /**
  * batadv_handle_unclaim - check for UNCLAIM frame
+ * @bat_priv: the bat priv with all the soft interface information
+ * @primary_if: the primary hard interface of this batman soft interface
+ * @backbone_addr: originator address of the backbone (Ethernet source)
+ * @claim_addr: Client to be unclaimed (ARP sender HW MAC)
+ * @vid: the VLAN ID of the frame
  *
  * Return: 1 if handled
  */
@@ -792,6 +838,11 @@ static int batadv_handle_unclaim(struct batadv_priv *bat_priv,
 
 /**
  * batadv_handle_claim - check for CLAIM frame
+ * @bat_priv: the bat priv with all the soft interface information
+ * @primary_if: the primary hard interface of this batman soft interface
+ * @backbone_addr: originator address of the backbone (Ethernet Source)
+ * @claim_addr: client mac address to be claimed (ARP sender HW MAC)
+ * @vid: the VLAN ID of the frame
  *
  * Return: 1 if handled
  */
@@ -823,7 +874,7 @@ static int batadv_handle_claim(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_check_claim_group
+ * batadv_check_claim_group - check for claim group membership
  * @bat_priv: the bat priv with all the soft interface information
  * @primary_if: the primary interface of this batman interface
  * @hw_src: the Hardware source in the ARP Header
@@ -898,13 +949,11 @@ static int batadv_check_claim_group(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_bla_process_claim
+ * batadv_bla_process_claim - Check if this is a claim frame, and process it
  * @bat_priv: the bat priv with all the soft interface information
  * @primary_if: the primary hard interface of this batman soft interface
  * @skb: the frame to be checked
  *
- * Check if this is a claim frame, and process it accordingly.
- *
  * Return: 1 if it was a claim frame, otherwise return 0 to
  * tell the callee that it can use the frame on its own.
  */
@@ -1036,7 +1085,13 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv,
 	return 1;
 }
 
-/* Check when we last heard from other nodes, and remove them in case of
+/**
+ * batadv_bla_purge_backbone_gw - Remove backbone gateways after a timeout or
+ *  immediately
+ * @bat_priv: the bat priv with all the soft interface information
+ * @now: whether the whole hash shall be wiped now
+ *
+ * Check when we last heard from other nodes, and remove them in case of
  * a time out, or clean all backbone gws if now is set.
  */
 static void batadv_bla_purge_backbone_gw(struct batadv_priv *bat_priv, int now)
@@ -1084,7 +1139,7 @@ purge_now:
 }
 
 /**
- * batadv_bla_purge_claims
+ * batadv_bla_purge_claims - Remove claims after a timeout or immediately
  * @bat_priv: the bat priv with all the soft interface information
  * @primary_if: the selected primary interface, may be NULL if now is set
  * @now: whether the whole hash shall be wiped now
@@ -1133,12 +1188,11 @@ purge_now:
 }
 
 /**
- * batadv_bla_update_orig_address
+ * batadv_bla_update_orig_address - Update the backbone gateways when the own
+ *  originator address changes
  * @bat_priv: the bat priv with all the soft interface information
  * @primary_if: the new selected primary_if
  * @oldif: the old primary interface, may be NULL
- *
- * Update the backbone gateways when the own orig address changes.
  */
 void batadv_bla_update_orig_address(struct batadv_priv *bat_priv,
 				    struct batadv_hard_iface *primary_if,
@@ -1209,7 +1263,11 @@ void batadv_bla_status_update(struct net_device *net_dev)
 	batadv_hardif_free_ref(primary_if);
 }
 
-/* periodic work to do:
+/**
+ * batadv_bla_periodic_work - performs periodic bla work
+ * @work: kernel work struct
+ *
+ * periodic work to do:
  *  * purge structures when they are too old
  *  * send announcements
  */
@@ -1290,7 +1348,12 @@ out:
 static struct lock_class_key batadv_claim_hash_lock_class_key;
 static struct lock_class_key batadv_backbone_hash_lock_class_key;
 
-/* initialize all bla structures */
+/**
+ * batadv_bla_init - initialize all bla structures
+ * @bat_priv: the bat priv with all the soft interface information
+ *
+ * Return: 0 on success, < 0 on error.
+ */
 int batadv_bla_init(struct batadv_priv *bat_priv)
 {
 	int i;
@@ -1345,7 +1408,7 @@ int batadv_bla_init(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_bla_check_bcast_duplist
+ * batadv_bla_check_bcast_duplist - Check if a frame is in the broadcast dup.
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: contains the bcast_packet to be checked
  *
@@ -1357,6 +1420,8 @@ int batadv_bla_init(struct batadv_priv *bat_priv)
  * with a good chance that it is the same packet. If it is furthermore
  * sent by another host, drop it. We allow equal packets from
  * the same host however as this might be intended.
+ *
+ * Return: 1 if a packet is in the duplicate list, 0 otherwise.
  */
 int batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv,
 				   struct sk_buff *skb)
@@ -1415,13 +1480,12 @@ out:
 }
 
 /**
- * batadv_bla_is_backbone_gw_orig
+ * batadv_bla_is_backbone_gw_orig - Check if the originator is a gateway for
+ *  the VLAN identified by vid.
  * @bat_priv: the bat priv with all the soft interface information
  * @orig: originator mac address
  * @vid: VLAN identifier
  *
- * Check if the originator is a gateway for the VLAN identified by vid.
- *
  * Return: true if orig is a backbone for this vid, false otherwise.
  */
 bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig,
@@ -1456,13 +1520,11 @@ bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig,
 }
 
 /**
- * batadv_bla_is_backbone_gw
+ * batadv_bla_is_backbone_gw - check if originator is a backbone gw for a VLAN.
  * @skb: the frame to be checked
  * @orig_node: the orig_node of the frame
  * @hdr_size: maximum length of the frame
  *
- * bla_is_backbone_gw inspects the skb for the VLAN ID
- *
  * Return: 1 if the orig_node is also a gateway on the soft interface, otherwise
  * it returns 0.
  */
@@ -1491,7 +1553,12 @@ int batadv_bla_is_backbone_gw(struct sk_buff *skb,
 	return 1;
 }
 
-/* free all bla structures (for softinterface free or module unload) */
+/**
+ * batadv_bla_init - free all bla structures
+ * @bat_priv: the bat priv with all the soft interface information
+ *
+ * for softinterface free or module unload
+ */
 void batadv_bla_free(struct batadv_priv *bat_priv)
 {
 	struct batadv_hard_iface *primary_if;
@@ -1514,13 +1581,13 @@ void batadv_bla_free(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_bla_rx
+ * batadv_bla_rx - check packets coming from the mesh.
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: the frame to be checked
  * @vid: the VLAN ID of the frame
  * @is_bcast: the packet came in a broadcast packet type.
  *
- * bla_rx avoidance checks if:
+ * batadv_bla_rx avoidance checks if:
  *  * we have to race for a claim
  *  * if the frame is allowed on the LAN
  *
@@ -1610,12 +1677,12 @@ out:
 }
 
 /**
- * batadv_bla_tx
+ * batadv_bla_tx - check packets going into the mesh
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: the frame to be checked
  * @vid: the VLAN ID of the frame
  *
- * bla_tx checks if:
+ * batadv_bla_tx checks if:
  *  * a claim was received which has to be processed
  *  * the frame is allowed on the mesh
  *
@@ -1698,6 +1765,13 @@ out:
 	return ret;
 }
 
+/**
+ * batadv_bla_claim_table_seq_print_text - print the claim table in a seq file
+ * @seq: seq file to print on
+ * @offset: not used
+ *
+ * Return: always 0
+ */
 int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset)
 {
 	struct net_device *net_dev = (struct net_device *)seq->private;
@@ -1747,6 +1821,14 @@ out:
 	return 0;
 }
 
+/**
+ * batadv_bla_backbone_table_seq_print_text - print the backbone table in a seq
+ *  file
+ * @seq: seq file to print on
+ * @offset: not used
+ *
+ * Return: always 0
+ */
 int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset)
 {
 	struct net_device *net_dev = (struct net_device *)seq->private;
-- 
cgit v1.2.3-59-g8ed1b


From fe13c2aadf208e6210ee125f550dcf725c48721b Mon Sep 17 00:00:00 2001
From: Antonio Quartulli <a@unstable.cc>
Date: Tue, 17 Nov 2015 16:40:51 +0800
Subject: batman-adv: fix kerneldoc for DAT functions

Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/distributed-arp-table.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index 2f13429ce5d3..60df82385495 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -741,6 +741,8 @@ static void batadv_dat_hash_free(struct batadv_priv *bat_priv)
 /**
  * batadv_dat_init - initialise the DAT internals
  * @bat_priv: the bat priv with all the soft interface information
+ *
+ * Return: 0 in case of success, a negative error code otherwise
  */
 int batadv_dat_init(struct batadv_priv *bat_priv)
 {
@@ -779,6 +781,8 @@ void batadv_dat_free(struct batadv_priv *bat_priv)
  * batadv_dat_cache_seq_print_text - print the local DAT hash table
  * @seq: seq file to print on
  * @offset: not used
+ *
+ * Return: always 0
  */
 int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset)
 {
-- 
cgit v1.2.3-59-g8ed1b


From d15cd6221c74027af005d5acbefa43f67ff3df37 Mon Sep 17 00:00:00 2001
From: Antonio Quartulli <a@unstable.cc>
Date: Tue, 17 Nov 2015 16:40:52 +0800
Subject: batman-adv: fix kerneldoc for TT functions

Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/translation-table.c | 33 ++++++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index af1d24ce420f..9fcf6bf9fa11 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -69,9 +69,13 @@ static void batadv_tt_global_del(struct batadv_priv *bat_priv,
 				 bool roaming);
 
 /**
- * batadv_compare_tt
+ * batadv_compare_tt - check if two TT entries are the same
+ * @node: the list element pointer of the first TT entry
+ * @data2: pointer to the tt_common_entry of the second TT entry
  *
- * Return: 1 if they are the same mac addr and vid
+ * Compare the MAC address and the VLAN ID of the two TT entries and check if
+ * they are the same TT client.
+ * Return: 1 if the two TT clients are the same, 0 otherwise
  */
 static int batadv_compare_tt(const struct hlist_node *node, const void *data2)
 {
@@ -221,6 +225,7 @@ batadv_tt_global_entry_free_ref(struct batadv_tt_global_entry *tt_global_entry)
 
 /**
  * batadv_tt_global_hash_count - count the number of orig entries
+ * @bat_priv: the bat priv with all the soft interface information
  * @addr: the mac address of the client to count entries for
  * @vid: VLAN identifier
  *
@@ -289,8 +294,9 @@ static void batadv_tt_local_size_dec(struct batadv_priv *bat_priv,
 }
 
 /**
- * batadv_tt_global_size_mod - change the size by v of the local table
- *  identified by vid
+ * batadv_tt_global_size_mod - change the size by v of the global table
+ *  for orig_node identified by vid
+ * @orig_node: the originator for which the table has to be modified
  * @vid: the VLAN identifier
  * @v: the amount to sum to the global table size
  */
@@ -721,7 +727,6 @@ out:
  *  function reserves the amount of space needed to send the entire global TT
  *  table. In case of success the value is updated with the real amount of
  *  reserved bytes
-
  * Allocate the needed amount of memory for the entire TT TVLV and write its
  * header made up by one tvlv_tt_data object and a series of tvlv_tt_vlan_data
  * objects, one per active VLAN served by the originator node.
@@ -1243,9 +1248,12 @@ static void batadv_tt_changes_list_free(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_tt_global_orig_entry_find
+ * batadv_tt_global_orig_entry_find - find a TT orig_list_entry
+ * @entry: the TT global entry where the orig_list_entry has to be
+ *  extracted from
+ * @orig_node: the originator for which the orig_list_entry has to be found
  *
- * retrieves the orig_tt_list_entry belonging to orig_node from the
+ * retrieve the orig_tt_list_entry belonging to orig_node from the
  * batadv_tt_global_entry list
  *
  * Return: it with an increased refcounter, NULL if not found
@@ -1274,7 +1282,10 @@ batadv_tt_global_orig_entry_find(const struct batadv_tt_global_entry *entry,
 }
 
 /**
- * batadv_tt_global_entry_has_orig
+ * batadv_tt_global_entry_has_orig - check if a TT global entry is also handled
+ *  by a given originator
+ * @entry: the TT global entry to check
+ * @orig_node: the originator to search in the list
  *
  * find out if an orig_node is already in the list of a tt_global_entry.
  *
@@ -2519,6 +2530,8 @@ static void batadv_tt_global_update_crc(struct batadv_priv *bat_priv,
  * @num_vlan: number of tvlv VLAN entries
  * @full_table: ask for the entire translation table if true, while only for the
  *  last TT diff otherwise
+ *
+ * Return: true if the TT Request was sent, false otherwise
  */
 static int batadv_send_tt_request(struct batadv_priv *bat_priv,
 				  struct batadv_orig_node *dst_orig_node,
@@ -3062,7 +3075,9 @@ static void batadv_tt_roam_purge(struct batadv_priv *bat_priv)
 }
 
 /**
- * batadv_tt_check_roam_count
+ * batadv_tt_check_roam_count - check if a client has roamed too frequently
+ * @bat_priv: the bat priv with all the soft interface information
+ * @client: mac address of the roaming client
  *
  * This function checks whether the client already reached the
  * maximum number of possible roaming phases. In this case the ROAMING_ADV
-- 
cgit v1.2.3-59-g8ed1b


From a14c131d8c056397b34bfde63b398b712637d60f Mon Sep 17 00:00:00 2001
From: Antonio Quartulli <a@unstable.cc>
Date: Tue, 17 Nov 2015 16:40:53 +0800
Subject: batman-adv: add kernel doc for AP isolation attributes in bat_priv

Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/types.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 7f7a0f1eaf75..5e8c8df4c4ac 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -772,6 +772,9 @@ struct batadv_softif_vlan {
  * @orig_interval: OGM broadcast interval in milliseconds
  * @hop_penalty: penalty which will be applied to an OGM's tq-field on every hop
  * @log_level: configured log level (see batadv_dbg_level)
+ * @isolation_mark: the skb->mark value used to match packets for AP isolation
+ * @isolation_mark_mask: bitmask identifying the bits in skb->mark to be used
+ *  for the isolation mark
  * @bcast_seqno: last sent broadcast packet sequence number
  * @bcast_queue_left: number of remaining buffered broadcast packet slots
  * @batman_queue_left: number of remaining OGM packet slots
-- 
cgit v1.2.3-59-g8ed1b


From 600184b705ae040f5ff7442466d0ec2bbae5560c Mon Sep 17 00:00:00 2001
From: Simon Wunderlich <simon@open-mesh.com>
Date: Mon, 23 Nov 2015 19:57:21 +0100
Subject: batman-adv: add kerneldoc for batadv_iv_ogm_aggr_packet

Signed-off-by: Simon Wunderlich <simon@open-mesh.com>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/bat_iv_ogm.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index a6d389b0563c..7377313d3e42 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -396,7 +396,14 @@ static u8 batadv_hop_penalty(u8 tq, const struct batadv_priv *bat_priv)
 	return new_tq;
 }
 
-/* is there another aggregated packet here? */
+/**
+ * batadv_iv_ogm_aggr_packet - checks if there is another OGM attached
+ * @buff_pos: current position in the skb
+ * @packet_len: total length of the skb
+ * @tvlv_len: tvlv length of the previously considered OGM
+ *
+ * Return: true if there is enough space for another OGM, false otherwise.
+ */
 static bool batadv_iv_ogm_aggr_packet(int buff_pos, int packet_len,
 				      __be16 tvlv_len)
 {
-- 
cgit v1.2.3-59-g8ed1b


From ec9b83ca5fc1b39ed6e710b24484a272f9bcc3ae Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Tue, 5 Jan 2016 12:06:16 +0100
Subject: batman-adv: Fix kernel-doc for batadv_claim_free_ref

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/bridge_loop_avoidance.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index 131aca049907..b3a72e29a167 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -165,7 +165,8 @@ static void batadv_claim_release(struct batadv_bla_claim *claim)
 }
 
 /**
- * batadv_claim_free_rcu - free a claim
+ * batadv_claim_free_ref - decrement the claim refcounter and possibly
+ *  release it
  * @claim: claim to be free'd
  */
 static void batadv_claim_free_ref(struct batadv_bla_claim *claim)
-- 
cgit v1.2.3-59-g8ed1b


From 0046b0402ac284a31940e3354614ac88c64881a3 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Fri, 1 Jan 2016 00:01:03 +0100
Subject: batman-adv: update copyright years for 2016

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/Makefile                | 2 +-
 net/batman-adv/bat_algo.h              | 2 +-
 net/batman-adv/bat_iv_ogm.c            | 2 +-
 net/batman-adv/bitarray.c              | 2 +-
 net/batman-adv/bitarray.h              | 2 +-
 net/batman-adv/bridge_loop_avoidance.c | 2 +-
 net/batman-adv/bridge_loop_avoidance.h | 2 +-
 net/batman-adv/debugfs.c               | 2 +-
 net/batman-adv/debugfs.h               | 2 +-
 net/batman-adv/distributed-arp-table.c | 2 +-
 net/batman-adv/distributed-arp-table.h | 2 +-
 net/batman-adv/fragmentation.c         | 2 +-
 net/batman-adv/fragmentation.h         | 2 +-
 net/batman-adv/gateway_client.c        | 2 +-
 net/batman-adv/gateway_client.h        | 2 +-
 net/batman-adv/gateway_common.c        | 2 +-
 net/batman-adv/gateway_common.h        | 2 +-
 net/batman-adv/hard-interface.c        | 2 +-
 net/batman-adv/hard-interface.h        | 2 +-
 net/batman-adv/hash.c                  | 2 +-
 net/batman-adv/hash.h                  | 2 +-
 net/batman-adv/icmp_socket.c           | 2 +-
 net/batman-adv/icmp_socket.h           | 2 +-
 net/batman-adv/main.c                  | 2 +-
 net/batman-adv/main.h                  | 2 +-
 net/batman-adv/multicast.c             | 2 +-
 net/batman-adv/multicast.h             | 2 +-
 net/batman-adv/network-coding.c        | 2 +-
 net/batman-adv/network-coding.h        | 2 +-
 net/batman-adv/originator.c            | 2 +-
 net/batman-adv/originator.h            | 2 +-
 net/batman-adv/packet.h                | 2 +-
 net/batman-adv/routing.c               | 2 +-
 net/batman-adv/routing.h               | 2 +-
 net/batman-adv/send.c                  | 2 +-
 net/batman-adv/send.h                  | 2 +-
 net/batman-adv/soft-interface.c        | 2 +-
 net/batman-adv/soft-interface.h        | 2 +-
 net/batman-adv/sysfs.c                 | 2 +-
 net/batman-adv/sysfs.h                 | 2 +-
 net/batman-adv/translation-table.c     | 2 +-
 net/batman-adv/translation-table.h     | 2 +-
 net/batman-adv/types.h                 | 2 +-
 43 files changed, 43 insertions(+), 43 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile
index 21434ab79d2c..207e2af316c7 100644
--- a/net/batman-adv/Makefile
+++ b/net/batman-adv/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+# Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
 #
 # Marek Lindner, Simon Wunderlich
 #
diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h
index 4e59cf3eb079..a7485d676088 100644
--- a/net/batman-adv/bat_algo.h
+++ b/net/batman-adv/bat_algo.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2016  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 7377313d3e42..3266bcb5bb06 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c
index 3404195f2561..b56bb000a0ab 100644
--- a/net/batman-adv/bitarray.c
+++ b/net/batman-adv/bitarray.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2016  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich, Marek Lindner
  *
diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h
index 2b64d7a1bc9b..3e41bb80eb81 100644
--- a/net/batman-adv/bitarray.h
+++ b/net/batman-adv/bitarray.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2016  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich, Marek Lindner
  *
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index b3a72e29a167..779160934844 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2016  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich
  *
diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h
index 7ea199b8b5ab..579f0fa6fe6a 100644
--- a/net/batman-adv/bridge_loop_avoidance.h
+++ b/net/batman-adv/bridge_loop_avoidance.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2016  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich
  *
diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
index 0fc9df52f3d9..48253cf8341b 100644
--- a/net/batman-adv/debugfs.c
+++ b/net/batman-adv/debugfs.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2010-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2016  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h
index 80ab8d6f0ab3..1ab4e2e63afc 100644
--- a/net/batman-adv/debugfs.h
+++ b/net/batman-adv/debugfs.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2010-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2016  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index 60df82385495..017fffe9a5b8 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2016  B.A.T.M.A.N. contributors:
  *
  * Antonio Quartulli
  *
diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h
index 26d4a525a798..813ecea96cf9 100644
--- a/net/batman-adv/distributed-arp-table.h
+++ b/net/batman-adv/distributed-arp-table.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2016  B.A.T.M.A.N. contributors:
  *
  * Antonio Quartulli
  *
diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index 80eddf44a827..55656e84bc7e 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2013-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2016  B.A.T.M.A.N. contributors:
  *
  * Martin Hundebøll <martin@hundeboll.net>
  *
diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h
index c5476fe79dc6..9ff77c7ef7c7 100644
--- a/net/batman-adv/fragmentation.h
+++ b/net/batman-adv/fragmentation.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2013-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2016  B.A.T.M.A.N. contributors:
  *
  * Martin Hundebøll <martin@hundeboll.net>
  *
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index 83507750bb66..5950974de7b1 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2016  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h
index fa9527785ed3..582dd8c413c8 100644
--- a/net/batman-adv/gateway_client.h
+++ b/net/batman-adv/gateway_client.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2016  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c
index b287448b9921..5ee04f7140af 100644
--- a/net/batman-adv/gateway_common.c
+++ b/net/batman-adv/gateway_common.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2016  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h
index ab893e318229..b58346350024 100644
--- a/net/batman-adv/gateway_common.h
+++ b/net/batman-adv/gateway_common.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2016  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 49e05d238667..db90022c00a4 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h
index 7b12ea8ea29d..4d6b5e12331f 100644
--- a/net/batman-adv/hard-interface.h
+++ b/net/batman-adv/hard-interface.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c
index 2ea6a18d793f..a0a0fdb85805 100644
--- a/net/batman-adv/hash.c
+++ b/net/batman-adv/hash.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2016  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich, Marek Lindner
  *
diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h
index 071a87ea4f2f..9bb57b87447c 100644
--- a/net/batman-adv/hash.h
+++ b/net/batman-adv/hash.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2016  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich, Marek Lindner
  *
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
index bcabb5e3f4d3..a69da37bbad5 100644
--- a/net/batman-adv/icmp_socket.c
+++ b/net/batman-adv/icmp_socket.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h
index e937143f0b10..618d5de06f20 100644
--- a/net/batman-adv/icmp_socket.h
+++ b/net/batman-adv/icmp_socket.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 9d34be628304..5f319fd6ecd7 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 34f56efa2e4e..a7dc41a2709b 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index d984eee1776c..155565e0fecc 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2014-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2014-2016  B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing
  *
diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h
index 685219f3fec9..80bceec55592 100644
--- a/net/batman-adv/multicast.h
+++ b/net/batman-adv/multicast.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2014-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2014-2016  B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing
  *
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index 3a752bea55fa..0b30c15eee5f 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2012-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2012-2016  B.A.T.M.A.N. contributors:
  *
  * Martin Hundebøll, Jeppe Ledet-Pedersen
  *
diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h
index 8f6d4ad8778a..d6d7fb4ec5d5 100644
--- a/net/batman-adv/network-coding.h
+++ b/net/batman-adv/network-coding.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2012-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2012-2016  B.A.T.M.A.N. contributors:
  *
  * Martin Hundebøll, Jeppe Ledet-Pedersen
  *
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index f6ca4e509729..d4a30db0158a 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2016  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index cf0730414ed2..745b4e4fcdc4 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h
index fdb01637ad1f..e7f915181aba 100644
--- a/net/batman-adv/packet.h
+++ b/net/batman-adv/packet.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index 1fb1be31bf3a..f4b60b1fb50e 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h
index 204bbe4952a6..c776e9655b9b 100644
--- a/net/batman-adv/routing.h
+++ b/net/batman-adv/routing.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index c188f4660981..d8b03fd604e0 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h
index 3adc11fe1586..7ff95cada2e7 100644
--- a/net/batman-adv/send.h
+++ b/net/batman-adv/send.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 6c65de97126c..4bf35b8c3d23 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h
index 8e82176f40b1..d17cfbacf809 100644
--- a/net/batman-adv/soft-interface.h
+++ b/net/batman-adv/soft-interface.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index f38d7b75b0a5..964fc5986b2c 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2010-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2016  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h
index 61974428a7af..c76021b4e198 100644
--- a/net/batman-adv/sysfs.h
+++ b/net/batman-adv/sysfs.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2010-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2016  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 9fcf6bf9fa11..0dc8a5ca33bf 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich, Antonio Quartulli
  *
diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h
index abd8e116e5fb..7c7e2c006bfe 100644
--- a/net/batman-adv/translation-table.h
+++ b/net/batman-adv/translation-table.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich, Antonio Quartulli
  *
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 5e8c8df4c4ac..8974bc0dc15c 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
-- 
cgit v1.2.3-59-g8ed1b


From 7b5e73961915438d623cb80eea97868b3a66cc48 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 6 Dec 2015 16:15:07 +0100
Subject: batman-adv: Switch to HTTPS version of links

open-mesh.org and its subdomains can only be accessed via HTTPS. HTTP-only
requests are currently redirected automatically to HTTPS but references in
the source code should be only https.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 Documentation/networking/batman-adv.txt | 2 +-
 MAINTAINERS                             | 2 +-
 net/batman-adv/Kconfig                  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/Documentation/networking/batman-adv.txt b/Documentation/networking/batman-adv.txt
index ff23b755f5e4..1b5e7a7f2185 100644
--- a/Documentation/networking/batman-adv.txt
+++ b/Documentation/networking/batman-adv.txt
@@ -187,7 +187,7 @@ interfaces to the kernel module settings.
 
 For more information, please see the manpage (man batctl).
 
-batctl is available on http://www.open-mesh.org/
+batctl is available on https://www.open-mesh.org/
 
 
 CONTACT
diff --git a/MAINTAINERS b/MAINTAINERS
index f678c37107f5..1e216aa4d25c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2147,7 +2147,7 @@ M:	Marek Lindner <mareklindner@neomailbox.ch>
 M:	Simon Wunderlich <sw@simonwunderlich.de>
 M:	Antonio Quartulli <a@unstable.cc>
 L:	b.a.t.m.a.n@lists.open-mesh.org
-W:	http://www.open-mesh.org/
+W:	https://www.open-mesh.org/
 S:	Maintained
 F:	net/batman-adv/
 
diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
index c6fc8f756c9a..2dd40e5ea030 100644
--- a/net/batman-adv/Kconfig
+++ b/net/batman-adv/Kconfig
@@ -12,7 +12,7 @@ config BATMAN_ADV
           B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is
           a routing protocol for multi-hop ad-hoc mesh networks. The
           networks may be wired or wireless. See
-          http://www.open-mesh.org/ for more information and user space
+          https://www.open-mesh.org/ for more information and user space
           tools.
 
 config BATMAN_ADV_BLA
-- 
cgit v1.2.3-59-g8ed1b


From d01332f1acacc0cb43a61f4244dd2b846d4cd585 Mon Sep 17 00:00:00 2001
From: Richard Alpe <richard.alpe@ericsson.com>
Date: Mon, 1 Feb 2016 08:19:56 +0100
Subject: tipc: fix link attribute propagation bug

Changing certain link attributes (link tolerance and link priority)
from the TIPC management tool is supposed to automatically take
effect at both endpoints of the affected link.

Currently the media address is not instantiated for the link and is
used uninstantiated when crafting protocol messages designated for the
peer endpoint. This means that changing a link property currently
results in the property being changed on the local machine but the
protocol message designated for the peer gets lost. Resulting in
property discrepancy between the endpoints.

In this patch we resolve this by using the media address from the
link entry and using the bearer transmit function to send it. Hence,
we can now eliminate the redundant function tipc_link_prot_xmit() and
the redundant field tipc_link::media_addr.

Fixes: 2af5ae372a4b (tipc: clean up unused code and structures)
Reviewed-by: Jon Maloy <jon.maloy@ericsson.com>
Reported-by: Jason Hu <huzhijiang@gmail.com>
Signed-off-by: Richard Alpe <richard.alpe@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/link.c | 31 ++++++-------------------------
 net/tipc/link.h |  6 ++++--
 net/tipc/node.c |  9 ++++++---
 3 files changed, 16 insertions(+), 30 deletions(-)

(limited to 'net')

diff --git a/net/tipc/link.c b/net/tipc/link.c
index 0c2944fb9ae0..f15635391576 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -123,7 +123,6 @@ struct tipc_stats {
 struct tipc_link {
 	u32 addr;
 	char name[TIPC_MAX_LINK_NAME];
-	struct tipc_media_addr *media_addr;
 	struct net *net;
 
 	/* Management and link supervision data */
@@ -1261,26 +1260,6 @@ drop:
 	return rc;
 }
 
-/*
- * Send protocol message to the other endpoint.
- */
-static void tipc_link_proto_xmit(struct tipc_link *l, u32 msg_typ,
-				 int probe_msg, u32 gap, u32 tolerance,
-				 u32 priority)
-{
-	struct sk_buff *skb = NULL;
-	struct sk_buff_head xmitq;
-
-	__skb_queue_head_init(&xmitq);
-	tipc_link_build_proto_msg(l, msg_typ, probe_msg, gap,
-				  tolerance, priority, &xmitq);
-	skb = __skb_dequeue(&xmitq);
-	if (!skb)
-		return;
-	tipc_bearer_xmit_skb(l->net, l->bearer_id, skb, l->media_addr);
-	l->rcv_unacked = 0;
-}
-
 static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
 				      u16 rcvgap, int tolerance, int priority,
 				      struct sk_buff_head *xmitq)
@@ -2021,16 +2000,18 @@ msg_full:
 	return -EMSGSIZE;
 }
 
-void tipc_link_set_tolerance(struct tipc_link *l, u32 tol)
+void tipc_link_set_tolerance(struct tipc_link *l, u32 tol,
+			     struct sk_buff_head *xmitq)
 {
 	l->tolerance = tol;
-	tipc_link_proto_xmit(l, STATE_MSG, 0, 0, tol, 0);
+	tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, tol, 0, xmitq);
 }
 
-void tipc_link_set_prio(struct tipc_link *l, u32 prio)
+void tipc_link_set_prio(struct tipc_link *l, u32 prio,
+			struct sk_buff_head *xmitq)
 {
 	l->priority = prio;
-	tipc_link_proto_xmit(l, STATE_MSG, 0, 0, 0, prio);
+	tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, prio, xmitq);
 }
 
 void tipc_link_set_abort_limit(struct tipc_link *l, u32 limit)
diff --git a/net/tipc/link.h b/net/tipc/link.h
index b2ae0f4276af..b4ee9d6e181d 100644
--- a/net/tipc/link.h
+++ b/net/tipc/link.h
@@ -112,8 +112,10 @@ char tipc_link_plane(struct tipc_link *l);
 int tipc_link_prio(struct tipc_link *l);
 int tipc_link_window(struct tipc_link *l);
 unsigned long tipc_link_tolerance(struct tipc_link *l);
-void tipc_link_set_tolerance(struct tipc_link *l, u32 tol);
-void tipc_link_set_prio(struct tipc_link *l, u32 prio);
+void tipc_link_set_tolerance(struct tipc_link *l, u32 tol,
+			     struct sk_buff_head *xmitq);
+void tipc_link_set_prio(struct tipc_link *l, u32 prio,
+			struct sk_buff_head *xmitq);
 void tipc_link_set_abort_limit(struct tipc_link *l, u32 limit);
 void tipc_link_set_queue_limits(struct tipc_link *l, u32 window);
 int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg,
diff --git a/net/tipc/node.c b/net/tipc/node.c
index fa97d9649a28..f8a8255a7182 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -1637,9 +1637,12 @@ int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info)
 	char *name;
 	struct tipc_link *link;
 	struct tipc_node *node;
+	struct sk_buff_head xmitq;
 	struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1];
 	struct net *net = sock_net(skb->sk);
 
+	__skb_queue_head_init(&xmitq);
+
 	if (!info->attrs[TIPC_NLA_LINK])
 		return -EINVAL;
 
@@ -1683,13 +1686,13 @@ int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info)
 			u32 tol;
 
 			tol = nla_get_u32(props[TIPC_NLA_PROP_TOL]);
-			tipc_link_set_tolerance(link, tol);
+			tipc_link_set_tolerance(link, tol, &xmitq);
 		}
 		if (props[TIPC_NLA_PROP_PRIO]) {
 			u32 prio;
 
 			prio = nla_get_u32(props[TIPC_NLA_PROP_PRIO]);
-			tipc_link_set_prio(link, prio);
+			tipc_link_set_prio(link, prio, &xmitq);
 		}
 		if (props[TIPC_NLA_PROP_WIN]) {
 			u32 win;
@@ -1701,7 +1704,7 @@ int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info)
 
 out:
 	tipc_node_read_unlock(node);
-
+	tipc_bearer_xmit(net, bearer_id, &xmitq, &node->links[bearer_id].maddr);
 	return res;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 817298102b0bc936b08dfcc5fbcc2213157050f2 Mon Sep 17 00:00:00 2001
From: Richard Alpe <richard.alpe@ericsson.com>
Date: Mon, 1 Feb 2016 08:19:57 +0100
Subject: tipc: fix link priority propagation

Currently link priority changes isn't handled for active links. In
this patch we resolve this by changing our priority if the peer passes
a valid priority in a state message.

Reviewed-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Richard Alpe <richard.alpe@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/link.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/tipc/link.c b/net/tipc/link.c
index f15635391576..6f4a6d9b0149 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -1458,6 +1458,12 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
 		if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL))
 			l->tolerance = peers_tol;
 
+		if (peers_prio && in_range(peers_prio, TIPC_MIN_LINK_PRI,
+					   TIPC_MAX_LINK_PRI)) {
+			l->priority = peers_prio;
+			rc = tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
+		}
+
 		l->silent_intv_cnt = 0;
 		l->stats.recv_states++;
 		if (msg_probe(hdr))
-- 
cgit v1.2.3-59-g8ed1b


From 9256645af09807bc52fa8b2e66ecd28ab25318c4 Mon Sep 17 00:00:00 2001
From: Jarod Wilson <jarod@redhat.com>
Date: Mon, 1 Feb 2016 18:51:04 -0500
Subject: net/core: relax BUILD_BUG_ON in netdev_stats_to_stats64

The netdev_stats_to_stats64 function copies the deprecated
net_device_stats format stats into rtnl_link_stats64 for legacy support
purposes, but with the BUILD_BUG_ON as it was, it wasn't possible to
extend rtnl_link_stats64 without also extending net_device_stats. Relax
the BUILD_BUG_ON to only require that rtnl_link_stats64 is larger, and
zero out all the stat counters that aren't present in net_device_stats.

CC: Eric Dumazet <edumazet@google.com>
CC: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 8cba3d852f25..65863e512227 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7253,24 +7253,31 @@ void netdev_run_todo(void)
 	}
 }
 
-/* Convert net_device_stats to rtnl_link_stats64.  They have the same
- * fields in the same order, with only the type differing.
+/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
+ * all the same fields in the same order as net_device_stats, with only
+ * the type differing, but rtnl_link_stats64 may have additional fields
+ * at the end for newer counters.
  */
 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
 			     const struct net_device_stats *netdev_stats)
 {
 #if BITS_PER_LONG == 64
-	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
+	BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
 	memcpy(stats64, netdev_stats, sizeof(*stats64));
+	/* zero out counters that only exist in rtnl_link_stats64 */
+	memset((char *)stats64 + sizeof(*netdev_stats), 0,
+	       sizeof(*stats64) - sizeof(*netdev_stats));
 #else
-	size_t i, n = sizeof(*stats64) / sizeof(u64);
+	size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
 	const unsigned long *src = (const unsigned long *)netdev_stats;
 	u64 *dst = (u64 *)stats64;
 
-	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
-		     sizeof(*stats64) / sizeof(u64));
+	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
 	for (i = 0; i < n; i++)
 		dst[i] = src[i];
+	/* zero out counters that only exist in rtnl_link_stats64 */
+	memset((char *)stats64 + n * sizeof(u64), 0,
+	       sizeof(*stats64) - n * sizeof(u64));
 #endif
 }
 EXPORT_SYMBOL(netdev_stats_to_stats64);
-- 
cgit v1.2.3-59-g8ed1b


From 6e7333d315a768170a59ac771297ee0551bdddbf Mon Sep 17 00:00:00 2001
From: Jarod Wilson <jarod@redhat.com>
Date: Mon, 1 Feb 2016 18:51:05 -0500
Subject: net: add rx_nohandler stat counter

This adds an rx_nohandler stat counter, along with a sysfs statistics
node, and copies the counter out via netlink as well.

CC: "David S. Miller" <davem@davemloft.net>
CC: Eric Dumazet <edumazet@google.com>
CC: Jiri Pirko <jiri@mellanox.com>
CC: Daniel Borkmann <daniel@iogearbox.net>
CC: Tom Herbert <tom@herbertland.com>
CC: Jay Vosburgh <j.vosburgh@gmail.com>
CC: Veaceslav Falico <vfalico@gmail.com>
CC: Andy Gospodarek <gospo@cumulusnetworks.com>
CC: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h    | 3 +++
 include/uapi/linux/if_link.h | 4 ++++
 net/core/dev.c               | 6 +++++-
 net/core/net-sysfs.c         | 2 ++
 net/core/rtnetlink.c         | 2 ++
 5 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 289c2314d766..78a20cec2a0a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1397,6 +1397,8 @@ enum netdev_priv_flags {
  *			do not use this in drivers
  *	@tx_dropped:	Dropped packets by core network,
  *			do not use this in drivers
+ *	@rx_nohandler:	nohandler dropped packets by core network on
+ *			inactive devices, do not use this in drivers
  *
  *	@wireless_handlers:	List of functions to handle Wireless Extensions,
  *				instead of ioctl,
@@ -1611,6 +1613,7 @@ struct net_device {
 
 	atomic_long_t		rx_dropped;
 	atomic_long_t		tx_dropped;
+	atomic_long_t		rx_nohandler;
 
 #ifdef CONFIG_WIRELESS_EXT
 	const struct iw_handler_def *	wireless_handlers;
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index a30b78090594..d3e90b91e07e 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -35,6 +35,8 @@ struct rtnl_link_stats {
 	/* for cslip etc */
 	__u32	rx_compressed;
 	__u32	tx_compressed;
+
+	__u32	rx_nohandler;		/* dropped, no handler found	*/
 };
 
 /* The main device statistics structure */
@@ -68,6 +70,8 @@ struct rtnl_link_stats64 {
 	/* for cslip etc */
 	__u64	rx_compressed;
 	__u64	tx_compressed;
+
+	__u64	rx_nohandler;		/* dropped, no handler found	*/
 };
 
 /* The struct should be in sync with struct ifmap */
diff --git a/net/core/dev.c b/net/core/dev.c
index 65863e512227..f1284835b8c9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4154,7 +4154,10 @@ ncls:
 			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 	} else {
 drop:
-		atomic_long_inc(&skb->dev->rx_dropped);
+		if (!deliver_exact)
+			atomic_long_inc(&skb->dev->rx_dropped);
+		else
+			atomic_long_inc(&skb->dev->rx_nohandler);
 		kfree_skb(skb);
 		/* Jamal, now you will not able to escape explaining
 		 * me how you were going to use this. :-)
@@ -7307,6 +7310,7 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
 	}
 	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
 	storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
+	storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
 	return storage;
 }
 EXPORT_SYMBOL(dev_get_stats);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index b6c8a6629b39..da7dbc237a5f 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -574,6 +574,7 @@ NETSTAT_ENTRY(tx_heartbeat_errors);
 NETSTAT_ENTRY(tx_window_errors);
 NETSTAT_ENTRY(rx_compressed);
 NETSTAT_ENTRY(tx_compressed);
+NETSTAT_ENTRY(rx_nohandler);
 
 static struct attribute *netstat_attrs[] = {
 	&dev_attr_rx_packets.attr,
@@ -599,6 +600,7 @@ static struct attribute *netstat_attrs[] = {
 	&dev_attr_tx_window_errors.attr,
 	&dev_attr_rx_compressed.attr,
 	&dev_attr_tx_compressed.attr,
+	&dev_attr_rx_nohandler.attr,
 	NULL
 };
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index d735e854f916..20d71358c143 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -804,6 +804,8 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
 
 	a->rx_compressed = b->rx_compressed;
 	a->tx_compressed = b->tx_compressed;
+
+	a->rx_nohandler = b->rx_nohandler;
 }
 
 static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b)
-- 
cgit v1.2.3-59-g8ed1b


From 61d2bcae99f66a640b3dd9632180209143fb5512 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 1 Feb 2016 21:03:07 -0800
Subject: tcp: fastopen: accept data/FIN present in SYNACK message

RFC 7413 (TCP Fast Open) 4.2.2 states that the SYNACK message
MAY include data and/or FIN

This patch adds support for the client side :

If we receive a SYNACK with payload or FIN, queue the skb instead
of ignoring it.

Since we already support the same for SYN, we refactor the existing
code and reuse it. Note we need to clone the skb, so this operation
might fail under memory pressure.

Sara Dickinson pointed out FreeBSD server Fast Open implementation
was planned to generate such SYNACK in the future.

The server side might be implemented on linux later.

Reported-by: Sara Dickinson <sara@sinodun.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h       |  1 +
 net/ipv4/tcp_fastopen.c | 64 ++++++++++++++++++++++++++-----------------------
 net/ipv4/tcp_input.c    |  3 +++
 3 files changed, 38 insertions(+), 30 deletions(-)

(limited to 'net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index f6f8f032c73e..27f4c733116d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1437,6 +1437,7 @@ void tcp_free_fastopen_req(struct tcp_sock *tp);
 
 extern struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
 int tcp_fastopen_reset_cipher(void *key, unsigned int len);
+void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb);
 struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
 			      struct request_sock *req,
 			      struct tcp_fastopen_cookie *foc,
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 55be6ac70cff..467d3e985411 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -124,6 +124,35 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req,
 	return false;
 }
 
+
+/* If an incoming SYN or SYNACK frame contains a payload and/or FIN,
+ * queue this additional data / FIN.
+ */
+void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt)
+		return;
+
+	skb = skb_clone(skb, GFP_ATOMIC);
+	if (!skb)
+		return;
+
+	skb_dst_drop(skb);
+	__skb_pull(skb, tcp_hdrlen(skb));
+	skb_set_owner_r(skb, sk);
+
+	tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+	__skb_queue_tail(&sk->sk_receive_queue, skb);
+	tp->syn_data_acked = 1;
+
+	/* u64_stats_update_begin(&tp->syncp) not needed here,
+	 * as we certainly are not changing upper 32bit value (0)
+	 */
+	tp->bytes_received = skb->len;
+}
+
 static struct sock *tcp_fastopen_create_child(struct sock *sk,
 					      struct sk_buff *skb,
 					      struct dst_entry *dst,
@@ -132,7 +161,6 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
 	struct tcp_sock *tp;
 	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
 	struct sock *child;
-	u32 end_seq;
 	bool own_req;
 
 	req->num_retrans = 0;
@@ -178,35 +206,11 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
 	tcp_init_metrics(child);
 	tcp_init_buffer_space(child);
 
-	/* Queue the data carried in the SYN packet.
-	 * We used to play tricky games with skb_get().
-	 * With lockless listener, it is a dead end.
-	 * Do not think about it.
-	 *
-	 * XXX (TFO) - we honor a zero-payload TFO request for now,
-	 * (any reason not to?) but no need to queue the skb since
-	 * there is no data. How about SYN+FIN?
-	 */
-	end_seq = TCP_SKB_CB(skb)->end_seq;
-	if (end_seq != TCP_SKB_CB(skb)->seq + 1) {
-		struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
-
-		if (likely(skb2)) {
-			skb_dst_drop(skb2);
-			__skb_pull(skb2, tcp_hdrlen(skb));
-			skb_set_owner_r(skb2, child);
-			__skb_queue_tail(&child->sk_receive_queue, skb2);
-			tp->syn_data_acked = 1;
-
-			/* u64_stats_update_begin(&tp->syncp) not needed here,
-			 * as we certainly are not changing upper 32bit value (0)
-			 */
-			tp->bytes_received = end_seq - TCP_SKB_CB(skb)->seq - 1;
-		} else {
-			end_seq = TCP_SKB_CB(skb)->seq + 1;
-		}
-	}
-	tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = end_seq;
+	tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+
+	tcp_fastopen_add_skb(child, skb);
+
+	tcp_rsk(req)->rcv_nxt = tp->rcv_nxt;
 	/* tcp_conn_request() is sending the SYNACK,
 	 * and queues the child into listener accept queue.
 	 */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 1c2a73406261..4add3eb40e58 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5509,6 +5509,9 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 	tp->syn_data_acked = tp->syn_data;
 	if (tp->syn_data_acked)
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
+
+	tcp_fastopen_add_skb(sk, synack);
+
 	return false;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 9d691539eea2d977e3eb86766c389a19a9c13146 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 1 Feb 2016 21:03:08 -0800
Subject: tcp: do not enqueue skb with SYN flag

If we remove the SYN flag from the skbs that tcp_fastopen_add_skb()
places in socket receive queue, then we can remove the test that
tcp_recvmsg() has to perform in fast path.

All we have to do is to adjust SEQ in the slow path.

For the moment, we place an unlikely() and output a message
if we find an skb having SYN flag set.
Goal would be to get rid of the test completely.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c          | 8 ++++++--
 net/ipv4/tcp_fastopen.c | 3 +++
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 19746b3fcbbe..c5075779e017 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1466,8 +1466,10 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
 
 	while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
 		offset = seq - TCP_SKB_CB(skb)->seq;
-		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
+		if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
+			pr_err_once("%s: found a SYN, please report !\n", __func__);
 			offset--;
+		}
 		if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
 			*off = offset;
 			return skb;
@@ -1657,8 +1659,10 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 				break;
 
 			offset = *seq - TCP_SKB_CB(skb)->seq;
-			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
+			if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
+				pr_err_once("%s: found a SYN, please report !\n", __func__);
 				offset--;
+			}
 			if (offset < skb->len)
 				goto found_ok_skb;
 			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 467d3e985411..6a6e11e54bae 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -143,6 +143,9 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb)
 	__skb_pull(skb, tcp_hdrlen(skb));
 	skb_set_owner_r(skb, sk);
 
+	TCP_SKB_CB(skb)->seq++;
+	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN;
+
 	tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 	__skb_queue_tail(&sk->sk_receive_queue, skb);
 	tp->syn_data_acked = 1;
-- 
cgit v1.2.3-59-g8ed1b


From ba905f5e2f63d86ed4cfbd3d9096fb28d156f1ee Mon Sep 17 00:00:00 2001
From: Kim Jones <kim-marie.jones@intel.com>
Date: Tue, 2 Feb 2016 03:51:16 +0000
Subject: ethtool: Declare netdev_rss_key as __read_mostly.

netdev_rss_key is written to once and thereafter is read by
drivers when they are initialising. The fact that it is mostly
read and not written to makes it a candidate for a __read_mostly
declaration.

Signed-off-by: Kim Jones <kim-marie.jones@intel.com>
Signed-off-by: Alan Carey <alan.carey@intel.com>
Acked-by: Rami Rosen <rami.rosen@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 2 +-
 net/core/ethtool.c        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 78a20cec2a0a..219f53c30cb3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3744,7 +3744,7 @@ void netdev_lower_state_changed(struct net_device *lower_dev,
 
 /* RSS keys are 40 or 52 bytes long */
 #define NETDEV_RSS_KEY_LEN 52
-extern u8 netdev_rss_key[NETDEV_RSS_KEY_LEN];
+extern u8 netdev_rss_key[NETDEV_RSS_KEY_LEN] __read_mostly;
 void netdev_rss_key_fill(void *buffer, size_t len);
 
 int dev_get_nest_level(struct net_device *dev,
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index daf04709dd3c..453c803f1c87 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -632,7 +632,7 @@ static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr,
 	return 0;
 }
 
-u8 netdev_rss_key[NETDEV_RSS_KEY_LEN];
+u8 netdev_rss_key[NETDEV_RSS_KEY_LEN] __read_mostly;
 
 void netdev_rss_key_fill(void *buffer, size_t len)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 4f61d4ef7088581997ad95ad37db2d8a459844e4 Mon Sep 17 00:00:00 2001
From: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Date: Tue, 2 Feb 2016 10:52:08 +0100
Subject: tipc: remove incorrect check for subscription timeout value

Until now, during subscription creation we set sub->timeout by
converting the timeout request value in milliseconds to jiffies.
This is followed by setting the timeout value in the timer if
sub->timeout != TIPC_WAIT_FOREVER.

For a subscription create request with a timeout value of
TIPC_WAIT_FOREVER, msecs_to_jiffies(TIPC_WAIT_FOREVER)
returns MAX_JIFFY_OFFSET (0xfffffffe). This is not equal to
TIPC_WAIT_FOREVER (0xffffffff).

In this commit, we remove this check.

Acked-by: Ying Xue <ying.xue@windriver.com>
Reviewed-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/subscr.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index 69ee2eeef968..7e61a907d860 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -270,8 +270,7 @@ static int tipc_subscrp_create(struct net *net, struct tipc_subscr *s,
 	memcpy(&sub->evt.s, s, sizeof(*s));
 	atomic_inc(&tn->subscription_count);
 	setup_timer(&sub->timer, tipc_subscrp_timeout, (unsigned long)sub);
-	if (sub->timeout != TIPC_WAIT_FOREVER)
-		sub->timeout += jiffies;
+	sub->timeout += jiffies;
 	if (!mod_timer(&sub->timer, sub->timeout))
 		tipc_subscrb_get(subscriber);
 	*sub_p = sub;
-- 
cgit v1.2.3-59-g8ed1b


From 3086523149ef4c15f5e75bb2ed9f43a8f3f95015 Mon Sep 17 00:00:00 2001
From: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Date: Tue, 2 Feb 2016 10:52:09 +0100
Subject: tipc: remove filter and timeout elements from struct
 tipc_subscription

Until now, struct tipc_subscription has duplicate timeout and filter
attributes present:
1. directly as members of struct tipc_subscription
2. in struct tipc_subscr, which is contained in struct tipc_event

In this commit, we remove the references to these elements as
members of struct tipc_subscription and replace them with elements
from struct tipc_subscr.

Acked-by: Ying Xue <ying.xue@windriver.com>
Reviewed-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/subscr.c | 15 ++++++++-------
 net/tipc/subscr.h |  5 -----
 2 files changed, 8 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index 7e61a907d860..cef0b6b967d3 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -110,7 +110,8 @@ void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower,
 {
 	if (!tipc_subscrp_check_overlap(sub, found_lower, found_upper))
 		return;
-	if (!must && !(sub->filter & TIPC_SUB_PORTS))
+	if (!must &&
+	    !(htohl(sub->evt.s.filter, sub->swap) & TIPC_SUB_PORTS))
 		return;
 
 	tipc_subscrp_send_event(sub, found_lower, found_upper, event, port_ref,
@@ -222,6 +223,7 @@ static int tipc_subscrp_create(struct net *net, struct tipc_subscr *s,
 {
 	struct tipc_net *tn = net_generic(net, tipc_net_id);
 	struct tipc_subscription *sub;
+	u32 timeout, filter;
 	int swap;
 
 	/* Determine subscriber's endianness */
@@ -253,10 +255,8 @@ static int tipc_subscrp_create(struct net *net, struct tipc_subscr *s,
 	sub->seq.type = htohl(s->seq.type, swap);
 	sub->seq.lower = htohl(s->seq.lower, swap);
 	sub->seq.upper = htohl(s->seq.upper, swap);
-	sub->timeout = msecs_to_jiffies(htohl(s->timeout, swap));
-	sub->filter = htohl(s->filter, swap);
-	if ((!(sub->filter & TIPC_SUB_PORTS) ==
-	     !(sub->filter & TIPC_SUB_SERVICE)) ||
+	filter = htohl(s->filter, swap);
+	if (((filter & TIPC_SUB_PORTS) && (filter & TIPC_SUB_SERVICE)) ||
 	    (sub->seq.lower > sub->seq.upper)) {
 		pr_warn("Subscription rejected, illegal request\n");
 		kfree(sub);
@@ -265,13 +265,14 @@ static int tipc_subscrp_create(struct net *net, struct tipc_subscr *s,
 	spin_lock_bh(&subscriber->lock);
 	list_add(&sub->subscrp_list, &subscriber->subscrp_list);
 	spin_unlock_bh(&subscriber->lock);
+
 	sub->subscriber = subscriber;
 	sub->swap = swap;
 	memcpy(&sub->evt.s, s, sizeof(*s));
 	atomic_inc(&tn->subscription_count);
 	setup_timer(&sub->timer, tipc_subscrp_timeout, (unsigned long)sub);
-	sub->timeout += jiffies;
-	if (!mod_timer(&sub->timer, sub->timeout))
+	timeout = htohl(sub->evt.s.timeout, swap);
+	if (!mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout)))
 		tipc_subscrb_get(subscriber);
 	*sub_p = sub;
 	return 0;
diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h
index 92ee18cc5fe6..9e69dbf05626 100644
--- a/net/tipc/subscr.h
+++ b/net/tipc/subscr.h
@@ -50,12 +50,9 @@ struct tipc_subscriber;
  * @subscriber: pointer to its subscriber
  * @seq: name sequence associated with subscription
  * @net: point to network namespace
- * @timeout: duration of subscription (in ms)
- * @filter: event filtering to be done for subscription
  * @timer: timer governing subscription duration (optional)
  * @nameseq_list: adjacent subscriptions in name sequence's subscription list
  * @subscrp_list: adjacent subscriptions in subscriber's subscription list
- * @server_ref: object reference of server port associated with subscription
  * @swap: indicates if subscriber uses opposite endianness in its messages
  * @evt: template for events generated by subscription
  */
@@ -63,8 +60,6 @@ struct tipc_subscription {
 	struct tipc_subscriber *subscriber;
 	struct tipc_name_seq seq;
 	struct net *net;
-	unsigned long timeout;
-	u32 filter;
 	struct timer_list timer;
 	struct list_head nameseq_list;
 	struct list_head subscrp_list;
-- 
cgit v1.2.3-59-g8ed1b


From a4273c73ebf06515eab6b5a84817a175ad8acdc4 Mon Sep 17 00:00:00 2001
From: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Date: Tue, 2 Feb 2016 10:52:10 +0100
Subject: tipc: remove struct tipc_name_seq from struct tipc_subscription

Until now, struct tipc_subscriber has duplicate fields for
type, upper and lower (as member of struct tipc_name_seq) at:
1. as member seq in struct tipc_subscription
2. as member seq in struct tipc_subscr, which is contained
   in struct tipc_event
The former structure contains the type, upper and lower
values in network byte order and the later contains the
intact copy of the request.
The struct tipc_subscription contains a field swap to
determine if request needs network byte order conversion.
Thus by using swap, we can convert the request when
required instead of duplicating it.

In this commit,
1. we remove the references to these elements as members of
   struct tipc_subscription and replace them with elements
   from struct tipc_subscr.
2. provide new functions to convert the user request into
   network byte order.

Acked-by: Ying Xue <ying.xue@windriver.com>
Reviewed-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/name_table.c | 14 ++++++++++----
 net/tipc/subscr.c     | 33 +++++++++++++++++++++++----------
 net/tipc/subscr.h     |  6 ++++--
 3 files changed, 37 insertions(+), 16 deletions(-)

(limited to 'net')

diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index 91fce70291a8..777b979b8463 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -418,6 +418,9 @@ static void tipc_nameseq_subscribe(struct name_seq *nseq,
 				   struct tipc_subscription *s)
 {
 	struct sub_seq *sseq = nseq->sseqs;
+	struct tipc_name_seq ns;
+
+	tipc_subscrp_convert_seq(&s->evt.s.seq, s->swap, &ns);
 
 	list_add(&s->nameseq_list, &nseq->subscriptions);
 
@@ -425,7 +428,7 @@ static void tipc_nameseq_subscribe(struct name_seq *nseq,
 		return;
 
 	while (sseq != &nseq->sseqs[nseq->first_free]) {
-		if (tipc_subscrp_check_overlap(s, sseq->lower, sseq->upper)) {
+		if (tipc_subscrp_check_overlap(&ns, sseq->lower, sseq->upper)) {
 			struct publication *crs;
 			struct name_info *info = sseq->info;
 			int must_report = 1;
@@ -722,9 +725,10 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 ref,
 void tipc_nametbl_subscribe(struct tipc_subscription *s)
 {
 	struct tipc_net *tn = net_generic(s->net, tipc_net_id);
-	u32 type = s->seq.type;
+	u32 type = tipc_subscrp_convert_seq_type(s->evt.s.seq.type, s->swap);
 	int index = hash(type);
 	struct name_seq *seq;
+	struct tipc_name_seq ns;
 
 	spin_lock_bh(&tn->nametbl_lock);
 	seq = nametbl_find_seq(s->net, type);
@@ -735,8 +739,9 @@ void tipc_nametbl_subscribe(struct tipc_subscription *s)
 		tipc_nameseq_subscribe(seq, s);
 		spin_unlock_bh(&seq->lock);
 	} else {
+		tipc_subscrp_convert_seq(&s->evt.s.seq, s->swap, &ns);
 		pr_warn("Failed to create subscription for {%u,%u,%u}\n",
-			s->seq.type, s->seq.lower, s->seq.upper);
+			ns.type, ns.lower, ns.upper);
 	}
 	spin_unlock_bh(&tn->nametbl_lock);
 }
@@ -748,9 +753,10 @@ void tipc_nametbl_unsubscribe(struct tipc_subscription *s)
 {
 	struct tipc_net *tn = net_generic(s->net, tipc_net_id);
 	struct name_seq *seq;
+	u32 type = tipc_subscrp_convert_seq_type(s->evt.s.seq.type, s->swap);
 
 	spin_lock_bh(&tn->nametbl_lock);
-	seq = nametbl_find_seq(s->net, s->seq.type);
+	seq = nametbl_find_seq(s->net, type);
 	if (seq != NULL) {
 		spin_lock_bh(&seq->lock);
 		list_del_init(&s->nameseq_list);
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index cef0b6b967d3..5f3100e964cf 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -92,23 +92,39 @@ static void tipc_subscrp_send_event(struct tipc_subscription *sub,
  *
  * Returns 1 if there is overlap, otherwise 0.
  */
-int tipc_subscrp_check_overlap(struct tipc_subscription *sub, u32 found_lower,
+int tipc_subscrp_check_overlap(struct tipc_name_seq *seq, u32 found_lower,
 			       u32 found_upper)
 {
-	if (found_lower < sub->seq.lower)
-		found_lower = sub->seq.lower;
-	if (found_upper > sub->seq.upper)
-		found_upper = sub->seq.upper;
+	if (found_lower < seq->lower)
+		found_lower = seq->lower;
+	if (found_upper > seq->upper)
+		found_upper = seq->upper;
 	if (found_lower > found_upper)
 		return 0;
 	return 1;
 }
 
+u32 tipc_subscrp_convert_seq_type(u32 type, int swap)
+{
+	return htohl(type, swap);
+}
+
+void tipc_subscrp_convert_seq(struct tipc_name_seq *in, int swap,
+			      struct tipc_name_seq *out)
+{
+	out->type = htohl(in->type, swap);
+	out->lower = htohl(in->lower, swap);
+	out->upper = htohl(in->upper, swap);
+}
+
 void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower,
 				 u32 found_upper, u32 event, u32 port_ref,
 				 u32 node, int must)
 {
-	if (!tipc_subscrp_check_overlap(sub, found_lower, found_upper))
+	struct tipc_name_seq seq;
+
+	tipc_subscrp_convert_seq(&sub->evt.s.seq, sub->swap, &seq);
+	if (!tipc_subscrp_check_overlap(&seq, found_lower, found_upper))
 		return;
 	if (!must &&
 	    !(htohl(sub->evt.s.filter, sub->swap) & TIPC_SUB_PORTS))
@@ -252,12 +268,9 @@ static int tipc_subscrp_create(struct net *net, struct tipc_subscr *s,
 
 	/* Initialize subscription object */
 	sub->net = net;
-	sub->seq.type = htohl(s->seq.type, swap);
-	sub->seq.lower = htohl(s->seq.lower, swap);
-	sub->seq.upper = htohl(s->seq.upper, swap);
 	filter = htohl(s->filter, swap);
 	if (((filter & TIPC_SUB_PORTS) && (filter & TIPC_SUB_SERVICE)) ||
-	    (sub->seq.lower > sub->seq.upper)) {
+	    (htohl(s->seq.lower, swap) > htohl(s->seq.upper, swap))) {
 		pr_warn("Subscription rejected, illegal request\n");
 		kfree(sub);
 		return -EINVAL;
diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h
index 9e69dbf05626..be60103082c9 100644
--- a/net/tipc/subscr.h
+++ b/net/tipc/subscr.h
@@ -58,7 +58,6 @@ struct tipc_subscriber;
  */
 struct tipc_subscription {
 	struct tipc_subscriber *subscriber;
-	struct tipc_name_seq seq;
 	struct net *net;
 	struct timer_list timer;
 	struct list_head nameseq_list;
@@ -67,11 +66,14 @@ struct tipc_subscription {
 	struct tipc_event evt;
 };
 
-int tipc_subscrp_check_overlap(struct tipc_subscription *sub, u32 found_lower,
+int tipc_subscrp_check_overlap(struct tipc_name_seq *seq, u32 found_lower,
 			       u32 found_upper);
 void tipc_subscrp_report_overlap(struct tipc_subscription *sub,
 				 u32 found_lower, u32 found_upper, u32 event,
 				 u32 port_ref, u32 node, int must);
+void tipc_subscrp_convert_seq(struct tipc_name_seq *in, int swap,
+			      struct tipc_name_seq *out);
+u32 tipc_subscrp_convert_seq_type(u32 type, int swap);
 int tipc_topsrv_start(struct net *net);
 void tipc_topsrv_stop(struct net *net);
 
-- 
cgit v1.2.3-59-g8ed1b


From 7c13c6224123a6424bd3bc60ef982759754501e9 Mon Sep 17 00:00:00 2001
From: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Date: Tue, 2 Feb 2016 10:52:11 +0100
Subject: tipc: introduce tipc_subscrb_subscribe() routine

In this commit, we split tipc_subscrp_create() into two:
1. tipc_subscrp_create() creates a subscription
2. A new function tipc_subscrp_subscribe() adds the
   subscription to the subscriber subscription list,
   activates the subscription timer and subscribes to
   the nametable updates.

In future commits, the purpose of tipc_subscrb_rcv_cb() will
be to either subscribe or cancel a subscription.

There is no functional change in this commit.

Acked-by: Ying Xue <ying.xue@windriver.com>
Reviewed-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/subscr.c | 54 +++++++++++++++++++++++++++++++-----------------------
 1 file changed, 31 insertions(+), 23 deletions(-)

(limited to 'net')

diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index 5f3100e964cf..022a2f21be04 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -233,13 +233,13 @@ static void tipc_subscrp_cancel(struct tipc_subscr *s,
 	spin_unlock_bh(&subscriber->lock);
 }
 
-static int tipc_subscrp_create(struct net *net, struct tipc_subscr *s,
-			       struct tipc_subscriber *subscriber,
-			       struct tipc_subscription **sub_p)
+static struct tipc_subscription *tipc_subscrp_create(struct net *net,
+						     struct tipc_subscr *s,
+						     struct tipc_subscriber *subscriber)
 {
 	struct tipc_net *tn = net_generic(net, tipc_net_id);
 	struct tipc_subscription *sub;
-	u32 timeout, filter;
+	u32 filter;
 	int swap;
 
 	/* Determine subscriber's endianness */
@@ -249,21 +249,21 @@ static int tipc_subscrp_create(struct net *net, struct tipc_subscr *s,
 	if (s->filter & htohl(TIPC_SUB_CANCEL, swap)) {
 		s->filter &= ~htohl(TIPC_SUB_CANCEL, swap);
 		tipc_subscrp_cancel(s, subscriber);
-		return 0;
+		return NULL;
 	}
 
 	/* Refuse subscription if global limit exceeded */
 	if (atomic_read(&tn->subscription_count) >= TIPC_MAX_SUBSCRIPTIONS) {
 		pr_warn("Subscription rejected, limit reached (%u)\n",
 			TIPC_MAX_SUBSCRIPTIONS);
-		return -EINVAL;
+		return NULL;
 	}
 
 	/* Allocate subscription object */
 	sub = kmalloc(sizeof(*sub), GFP_ATOMIC);
 	if (!sub) {
 		pr_warn("Subscription rejected, no memory\n");
-		return -ENOMEM;
+		return NULL;
 	}
 
 	/* Initialize subscription object */
@@ -273,22 +273,36 @@ static int tipc_subscrp_create(struct net *net, struct tipc_subscr *s,
 	    (htohl(s->seq.lower, swap) > htohl(s->seq.upper, swap))) {
 		pr_warn("Subscription rejected, illegal request\n");
 		kfree(sub);
-		return -EINVAL;
+		return NULL;
 	}
-	spin_lock_bh(&subscriber->lock);
-	list_add(&sub->subscrp_list, &subscriber->subscrp_list);
-	spin_unlock_bh(&subscriber->lock);
 
-	sub->subscriber = subscriber;
 	sub->swap = swap;
 	memcpy(&sub->evt.s, s, sizeof(*s));
 	atomic_inc(&tn->subscription_count);
 	setup_timer(&sub->timer, tipc_subscrp_timeout, (unsigned long)sub);
-	timeout = htohl(sub->evt.s.timeout, swap);
+	return sub;
+}
+
+static void tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s,
+				   struct tipc_subscriber *subscriber)
+{
+	struct tipc_net *tn = net_generic(net, tipc_net_id);
+	struct tipc_subscription *sub = NULL;
+	u32 timeout;
+
+	sub = tipc_subscrp_create(net, s, subscriber);
+	if (!sub)
+		return tipc_conn_terminate(tn->topsrv, subscriber->conid);
+
+	spin_lock_bh(&subscriber->lock);
+	list_add(&sub->subscrp_list, &subscriber->subscrp_list);
+	spin_unlock_bh(&subscriber->lock);
+
+	sub->subscriber = subscriber;
+	timeout = htohl(sub->evt.s.timeout, sub->swap);
 	if (!mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout)))
 		tipc_subscrb_get(subscriber);
-	*sub_p = sub;
-	return 0;
+	tipc_nametbl_subscribe(sub);
 }
 
 /* Handle one termination request for the subscriber */
@@ -302,14 +316,8 @@ static void tipc_subscrb_rcv_cb(struct net *net, int conid,
 				struct sockaddr_tipc *addr, void *usr_data,
 				void *buf, size_t len)
 {
-	struct tipc_subscriber *subscrb = usr_data;
-	struct tipc_subscription *sub = NULL;
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
-
-	if (tipc_subscrp_create(net, (struct tipc_subscr *)buf, subscrb, &sub))
-		return tipc_conn_terminate(tn->topsrv, subscrb->conid);
-
-	tipc_nametbl_subscribe(sub);
+	tipc_subscrp_subscribe(net, (struct tipc_subscr *)buf,
+			       (struct tipc_subscriber *)usr_data);
 }
 
 /* Handle one request to establish a new subscriber */
-- 
cgit v1.2.3-59-g8ed1b


From c8beccc67c833db4a01c621cbc6eb0a577286806 Mon Sep 17 00:00:00 2001
From: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Date: Tue, 2 Feb 2016 10:52:12 +0100
Subject: tipc: fix connection abort during subscription cancellation

In 'commit 7fe8097cef5f ("tipc: fix nullpointer bug when subscribing
to events")', we terminate the connection if the subscription
creation fails.
In the same commit, the subscription creation result was based on
the value of subscription pointer (set in the function) instead of
the return code.

Unfortunately, the same function also handles subscription
cancellation request. For a subscription cancellation request,
the subscription pointer cannot be set. Thus the connection is
terminated during cancellation request.

In this commit, we move the subcription cancel check outside
of tipc_subscrp_create(). Hence,
- tipc_subscrp_create() will create a subscripton
- tipc_subscrb_rcv_cb() will subscribe or cancel a subscription.

Fixes: 'commit 7fe8097cef5f ("tipc: fix nullpointer bug when subscribing to events")'

Acked-by: Ying Xue <ying.xue@windriver.com>
Reviewed-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/subscr.c | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index 022a2f21be04..531227208ae2 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -235,22 +235,11 @@ static void tipc_subscrp_cancel(struct tipc_subscr *s,
 
 static struct tipc_subscription *tipc_subscrp_create(struct net *net,
 						     struct tipc_subscr *s,
-						     struct tipc_subscriber *subscriber)
+						     int swap)
 {
 	struct tipc_net *tn = net_generic(net, tipc_net_id);
 	struct tipc_subscription *sub;
-	u32 filter;
-	int swap;
-
-	/* Determine subscriber's endianness */
-	swap = !(s->filter & (TIPC_SUB_PORTS | TIPC_SUB_SERVICE));
-
-	/* Detect & process a subscription cancellation request */
-	if (s->filter & htohl(TIPC_SUB_CANCEL, swap)) {
-		s->filter &= ~htohl(TIPC_SUB_CANCEL, swap);
-		tipc_subscrp_cancel(s, subscriber);
-		return NULL;
-	}
+	u32 filter = htohl(s->filter, swap);
 
 	/* Refuse subscription if global limit exceeded */
 	if (atomic_read(&tn->subscription_count) >= TIPC_MAX_SUBSCRIPTIONS) {
@@ -268,7 +257,6 @@ static struct tipc_subscription *tipc_subscrp_create(struct net *net,
 
 	/* Initialize subscription object */
 	sub->net = net;
-	filter = htohl(s->filter, swap);
 	if (((filter & TIPC_SUB_PORTS) && (filter & TIPC_SUB_SERVICE)) ||
 	    (htohl(s->seq.lower, swap) > htohl(s->seq.upper, swap))) {
 		pr_warn("Subscription rejected, illegal request\n");
@@ -284,13 +272,13 @@ static struct tipc_subscription *tipc_subscrp_create(struct net *net,
 }
 
 static void tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s,
-				   struct tipc_subscriber *subscriber)
+				   struct tipc_subscriber *subscriber, int swap)
 {
 	struct tipc_net *tn = net_generic(net, tipc_net_id);
 	struct tipc_subscription *sub = NULL;
 	u32 timeout;
 
-	sub = tipc_subscrp_create(net, s, subscriber);
+	sub = tipc_subscrp_create(net, s, swap);
 	if (!sub)
 		return tipc_conn_terminate(tn->topsrv, subscriber->conid);
 
@@ -299,7 +287,7 @@ static void tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s,
 	spin_unlock_bh(&subscriber->lock);
 
 	sub->subscriber = subscriber;
-	timeout = htohl(sub->evt.s.timeout, sub->swap);
+	timeout = htohl(sub->evt.s.timeout, swap);
 	if (!mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout)))
 		tipc_subscrb_get(subscriber);
 	tipc_nametbl_subscribe(sub);
@@ -316,8 +304,20 @@ static void tipc_subscrb_rcv_cb(struct net *net, int conid,
 				struct sockaddr_tipc *addr, void *usr_data,
 				void *buf, size_t len)
 {
-	tipc_subscrp_subscribe(net, (struct tipc_subscr *)buf,
-			       (struct tipc_subscriber *)usr_data);
+	struct tipc_subscriber *subscriber = usr_data;
+	struct tipc_subscr *s = (struct tipc_subscr *)buf;
+	int swap;
+
+	/* Determine subscriber's endianness */
+	swap = !(s->filter & (TIPC_SUB_PORTS | TIPC_SUB_SERVICE));
+
+	/* Detect & process a subscription cancellation request */
+	if (s->filter & htohl(TIPC_SUB_CANCEL, swap)) {
+		s->filter &= ~htohl(TIPC_SUB_CANCEL, swap);
+		return tipc_subscrp_cancel(s, subscriber);
+	}
+
+	tipc_subscrp_subscribe(net, s, subscriber, swap);
 }
 
 /* Handle one request to establish a new subscriber */
-- 
cgit v1.2.3-59-g8ed1b


From cb01c7c8701a35866479753fe78d04bd9826dd1b Mon Sep 17 00:00:00 2001
From: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Date: Tue, 2 Feb 2016 10:52:13 +0100
Subject: tipc: fix connection abort when receiving invalid cancel request

Until now, the subscribers endianness for a subscription
create/cancel request is determined as:
    swap = !(s->filter & (TIPC_SUB_PORTS | TIPC_SUB_SERVICE))
The checks are performed only for port/service subscriptions.

The swap calculation is incorrect if the filter in the subscription
cancellation request is set to TIPC_SUB_CANCEL (it's a malformed
cancel request, as the corresponding subscription create filter
is missing).
Thus, the check if the request is for cancellation fails and the
request is treated as a subscription create request. The
subscription creation fails as the request is illegal, which
terminates this connection.

In this commit we determine the endianness by including
TIPC_SUB_CANCEL, which will set swap correctly and the
request is processed as a cancellation request.

Acked-by: Ying Xue <ying.xue@windriver.com>
Reviewed-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/subscr.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index 531227208ae2..24d2c8128bac 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -309,7 +309,8 @@ static void tipc_subscrb_rcv_cb(struct net *net, int conid,
 	int swap;
 
 	/* Determine subscriber's endianness */
-	swap = !(s->filter & (TIPC_SUB_PORTS | TIPC_SUB_SERVICE));
+	swap = !(s->filter & (TIPC_SUB_PORTS | TIPC_SUB_SERVICE |
+			      TIPC_SUB_CANCEL));
 
 	/* Detect & process a subscription cancellation request */
 	if (s->filter & htohl(TIPC_SUB_CANCEL, swap)) {
-- 
cgit v1.2.3-59-g8ed1b


From d4091899c9bbfd6695449c6b09517ceb45bb379d Mon Sep 17 00:00:00 2001
From: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Date: Tue, 2 Feb 2016 10:52:14 +0100
Subject: tipc: hold subscriber->lock for tipc_nametbl_subscribe()

Until now, while creating a subscription the subscriber lock
protects only the subscribers subscription list and not the
nametable. The call to tipc_nametbl_subscribe() is outside
the lock. However, at subscription timeout and cancel both
the subscribers subscription list and the nametable are
protected by the subscriber lock.

This asymmetric locking mechanism leads to the following problem:
In a SMP system, the timer can be fire on another core before
the create request is complete.
When the timer thread calls tipc_nametbl_unsubscribe() before create
thread calls tipc_nametbl_subscribe(), we get a nullptr exception.

This can be simulated by creating subscription with timeout=0 and
sometimes the timeout occurs before the create request is complete.

The following is the oops:
[57.569661] BUG: unable to handle kernel NULL pointer dereference at (null)
[57.577498] IP: [<ffffffffa02135aa>] tipc_nametbl_unsubscribe+0x8a/0x120 [tipc]
[57.584820] PGD 0
[57.586834] Oops: 0002 [#1] SMP
[57.685506] CPU: 14 PID: 10077 Comm: kworker/u40:1 Tainted: P OENX 3.12.48-52.27.1.     9688.1.PTF-default #1
[57.703637] Workqueue: tipc_rcv tipc_recv_work [tipc]
[57.708697] task: ffff88064c7f00c0 ti: ffff880629ef4000 task.ti: ffff880629ef4000
[57.716181] RIP: 0010:[<ffffffffa02135aa>]  [<ffffffffa02135aa>] tipc_nametbl_unsubscribe+0x8a/   0x120 [tipc]
[...]
[57.812327] Call Trace:
[57.814806]  [<ffffffffa0211c77>] tipc_subscrp_delete+0x37/0x90 [tipc]
[57.821357]  [<ffffffffa0211e2f>] tipc_subscrp_timeout+0x3f/0x70 [tipc]
[57.827982]  [<ffffffff810618c1>] call_timer_fn+0x31/0x100
[57.833490]  [<ffffffff81062709>] run_timer_softirq+0x1f9/0x2b0
[57.839414]  [<ffffffff8105a795>] __do_softirq+0xe5/0x230
[57.844827]  [<ffffffff81520d1c>] call_softirq+0x1c/0x30
[57.850150]  [<ffffffff81004665>] do_softirq+0x55/0x90
[57.855285]  [<ffffffff8105aa35>] irq_exit+0x95/0xa0
[57.860290]  [<ffffffff815215b5>] smp_apic_timer_interrupt+0x45/0x60
[57.866644]  [<ffffffff8152005d>] apic_timer_interrupt+0x6d/0x80
[57.872686]  [<ffffffffa02121c5>] tipc_subscrb_rcv_cb+0x2a5/0x3f0 [tipc]
[57.879425]  [<ffffffffa021c65f>] tipc_receive_from_sock+0x9f/0x100 [tipc]
[57.886324]  [<ffffffffa021c826>] tipc_recv_work+0x26/0x60 [tipc]
[57.892463]  [<ffffffff8106fb22>] process_one_work+0x172/0x420
[57.898309]  [<ffffffff8107079a>] worker_thread+0x11a/0x3c0
[57.903871]  [<ffffffff81077114>] kthread+0xb4/0xc0
[57.908751]  [<ffffffff8151f318>] ret_from_fork+0x58/0x90

In this commit, we do the following at subscription creation:
1. set the subscription's subscriber pointer before performing
   tipc_nametbl_subscribe(), as this value is required further in
   the call chain ex: by tipc_subscrp_send_event().
2. move tipc_nametbl_subscribe() under the scope of subscriber lock

Acked-by: Ying Xue <ying.xue@windriver.com>
Reviewed-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/subscr.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index 24d2c8128bac..e4ebbc161e42 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -284,13 +284,13 @@ static void tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s,
 
 	spin_lock_bh(&subscriber->lock);
 	list_add(&sub->subscrp_list, &subscriber->subscrp_list);
+	sub->subscriber = subscriber;
+	tipc_nametbl_subscribe(sub);
 	spin_unlock_bh(&subscriber->lock);
 
-	sub->subscriber = subscriber;
 	timeout = htohl(sub->evt.s.timeout, swap);
 	if (!mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout)))
 		tipc_subscrb_get(subscriber);
-	tipc_nametbl_subscribe(sub);
 }
 
 /* Handle one termination request for the subscriber */
-- 
cgit v1.2.3-59-g8ed1b


From f3ad288c56d21e3b8ec77e13ab9aa06dbbfa3577 Mon Sep 17 00:00:00 2001
From: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Date: Tue, 2 Feb 2016 10:52:15 +0100
Subject: tipc: protect tipc_subscrb_get() with subscriber spin lock

Until now, during subscription creation the mod_time() &
tipc_subscrb_get() are called after releasing the subscriber
spin lock.

In a SMP system when performing a subscription creation, if the
subscription timeout occurs simultaneously (the timer is
scheduled to run on another CPU) then the timer thread
might decrement the subscribers refcount before the create
thread increments the refcount.

This can be simulated by creating subscription with timeout=0 and
sometimes the timeout occurs before the create request is complete.
This leads to the following message:
[30.702949] BUG: spinlock bad magic on CPU#1, kworker/u8:3/87
[30.703834] general protection fault: 0000 [#1] SMP
[30.704826] CPU: 1 PID: 87 Comm: kworker/u8:3 Not tainted 4.4.0-rc8+ #18
[30.704826] Workqueue: tipc_rcv tipc_recv_work [tipc]
[30.704826] task: ffff88003f878600 ti: ffff88003fae0000 task.ti: ffff88003fae0000
[30.704826] RIP: 0010:[<ffffffff8109196c>]  [<ffffffff8109196c>] spin_dump+0x5c/0xe0
[...]
[30.704826] Call Trace:
[30.704826]  [<ffffffff81091a16>] spin_bug+0x26/0x30
[30.704826]  [<ffffffff81091b75>] do_raw_spin_lock+0xe5/0x120
[30.704826]  [<ffffffff81684439>] _raw_spin_lock_bh+0x19/0x20
[30.704826]  [<ffffffffa0096f10>] tipc_subscrb_rcv_cb+0x1d0/0x330 [tipc]
[30.704826]  [<ffffffffa00a37b1>] tipc_receive_from_sock+0xc1/0x150 [tipc]
[30.704826]  [<ffffffffa00a31df>] tipc_recv_work+0x3f/0x80 [tipc]
[30.704826]  [<ffffffff8106a739>] process_one_work+0x149/0x3c0
[30.704826]  [<ffffffff8106aa16>] worker_thread+0x66/0x460
[30.704826]  [<ffffffff8106a9b0>] ? process_one_work+0x3c0/0x3c0
[30.704826]  [<ffffffff8106a9b0>] ? process_one_work+0x3c0/0x3c0
[30.704826]  [<ffffffff8107029d>] kthread+0xed/0x110
[30.704826]  [<ffffffff810701b0>] ? kthread_create_on_node+0x190/0x190
[30.704826]  [<ffffffff81684bdf>] ret_from_fork+0x3f/0x70

In this commit,
1. we remove the check for the return code for mod_timer()
2. we protect tipc_subscrb_get() using the subscriber spin lock.
   We increment the subscriber's refcount as soon as we add the
   subscription to subscriber's subscription list.

Acked-by: Ying Xue <ying.xue@windriver.com>
Reviewed-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/subscr.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index e4ebbc161e42..7d226ecb0490 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -284,13 +284,13 @@ static void tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s,
 
 	spin_lock_bh(&subscriber->lock);
 	list_add(&sub->subscrp_list, &subscriber->subscrp_list);
+	tipc_subscrb_get(subscriber);
 	sub->subscriber = subscriber;
 	tipc_nametbl_subscribe(sub);
 	spin_unlock_bh(&subscriber->lock);
 
 	timeout = htohl(sub->evt.s.timeout, swap);
-	if (!mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout)))
-		tipc_subscrb_get(subscriber);
+	mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout));
 }
 
 /* Handle one termination request for the subscriber */
-- 
cgit v1.2.3-59-g8ed1b


From ae245557f87fffe2e1c39ba07524024e650e822b Mon Sep 17 00:00:00 2001
From: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Date: Tue, 2 Feb 2016 10:52:16 +0100
Subject: tipc: donot create timers if subscription timeout = TIPC_WAIT_FOREVER

Until now, we create timers even for the subscription requests
with timeout = TIPC_WAIT_FOREVER.
This can be improved by avoiding timer creation when the timeout
is set to TIPC_WAIT_FOREVER.

In this commit, we introduce a check to creates timers only
when timeout != TIPC_WAIT_FOREVER.

Acked-by: Ying Xue <ying.xue@windriver.com>
Reviewed-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/subscr.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index 7d226ecb0490..22963cafd5ed 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -188,12 +188,14 @@ static struct tipc_subscriber *tipc_subscrb_create(int conid)
 static void tipc_subscrb_delete(struct tipc_subscriber *subscriber)
 {
 	struct tipc_subscription *sub, *temp;
+	u32 timeout;
 
 	spin_lock_bh(&subscriber->lock);
 	/* Destroy any existing subscriptions for subscriber */
 	list_for_each_entry_safe(sub, temp, &subscriber->subscrp_list,
 				 subscrp_list) {
-		if (del_timer(&sub->timer)) {
+		timeout = htohl(sub->evt.s.timeout, sub->swap);
+		if ((timeout == TIPC_WAIT_FOREVER) || del_timer(&sub->timer)) {
 			tipc_subscrp_delete(sub);
 			tipc_subscrb_put(subscriber);
 		}
@@ -217,13 +219,16 @@ static void tipc_subscrp_cancel(struct tipc_subscr *s,
 				struct tipc_subscriber *subscriber)
 {
 	struct tipc_subscription *sub, *temp;
+	u32 timeout;
 
 	spin_lock_bh(&subscriber->lock);
 	/* Find first matching subscription, exit if not found */
 	list_for_each_entry_safe(sub, temp, &subscriber->subscrp_list,
 				 subscrp_list) {
 		if (!memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr))) {
-			if (del_timer(&sub->timer)) {
+			timeout = htohl(sub->evt.s.timeout, sub->swap);
+			if ((timeout == TIPC_WAIT_FOREVER) ||
+			    del_timer(&sub->timer)) {
 				tipc_subscrp_delete(sub);
 				tipc_subscrb_put(subscriber);
 			}
@@ -267,7 +272,6 @@ static struct tipc_subscription *tipc_subscrp_create(struct net *net,
 	sub->swap = swap;
 	memcpy(&sub->evt.s, s, sizeof(*s));
 	atomic_inc(&tn->subscription_count);
-	setup_timer(&sub->timer, tipc_subscrp_timeout, (unsigned long)sub);
 	return sub;
 }
 
@@ -290,6 +294,10 @@ static void tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s,
 	spin_unlock_bh(&subscriber->lock);
 
 	timeout = htohl(sub->evt.s.timeout, swap);
+	if (timeout == TIPC_WAIT_FOREVER)
+		return;
+
+	setup_timer(&sub->timer, tipc_subscrp_timeout, (unsigned long)sub);
 	mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout));
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 06c8581f85e99bbf69723f76ad2a40fa8a8c8cdd Mon Sep 17 00:00:00 2001
From: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Date: Tue, 2 Feb 2016 10:52:17 +0100
Subject: tipc: use alloc_ordered_workqueue() instead of WQ_UNBOUND w/
 max_active = 1

Until now, tipc_rcv and tipc_send workqueues in server are allocated
with parameters WQ_UNBOUND & max_active = 1.
This parameters passed to this function makes it equivalent to
alloc_ordered_workqueue(). The later form is more explicit and
can inherit future ordered_workqueue changes.

In this commit we replace alloc_workqueue() with more readable
alloc_ordered_workqueue().

Acked-by: Ying Xue <ying.xue@windriver.com>
Reviewed-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/server.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/tipc/server.c b/net/tipc/server.c
index 922e04a43396..2446bfbaa309 100644
--- a/net/tipc/server.c
+++ b/net/tipc/server.c
@@ -571,13 +571,13 @@ static void tipc_work_stop(struct tipc_server *s)
 
 static int tipc_work_start(struct tipc_server *s)
 {
-	s->rcv_wq = alloc_workqueue("tipc_rcv", WQ_UNBOUND, 1);
+	s->rcv_wq = alloc_ordered_workqueue("tipc_rcv", 0);
 	if (!s->rcv_wq) {
 		pr_err("can't start tipc receive workqueue\n");
 		return -ENOMEM;
 	}
 
-	s->send_wq = alloc_workqueue("tipc_send", WQ_UNBOUND, 1);
+	s->send_wq = alloc_ordered_workqueue("tipc_send", 0);
 	if (!s->send_wq) {
 		pr_err("can't start tipc send workqueue\n");
 		destroy_workqueue(s->rcv_wq);
-- 
cgit v1.2.3-59-g8ed1b


From e3e17b773bfe45462b7f3fae20c550025975cb13 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 6 Feb 2016 11:16:28 -0800
Subject: tcp: fastopen: call tcp_fin() if FIN present in SYNACK

When we acknowledge a FIN, it is not enough to ack the sequence number
and queue the skb into receive queue. We also have to call tcp_fin()
to properly update socket state and send proper poll() notifications.

It seems we also had the problem if we received a SYN packet with the
FIN flag set, but it does not seem an urgent issue, as no known
implementation can do that.

Fixes: 61d2bcae99f6 ("tcp: fastopen: accept data/FIN present in SYNACK message")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h       | 1 +
 net/ipv4/tcp_fastopen.c | 3 +++
 net/ipv4/tcp_input.c    | 2 +-
 3 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 27f4c733116d..479d535609fd 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -568,6 +568,7 @@ void tcp_rearm_rto(struct sock *sk);
 void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
 void tcp_reset(struct sock *sk);
 void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb);
+void tcp_fin(struct sock *sk);
 
 /* tcp_timer.c */
 void tcp_init_xmit_timers(struct sock *);
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 6a6e11e54bae..fdb286ddba04 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -154,6 +154,9 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb)
 	 * as we certainly are not changing upper 32bit value (0)
 	 */
 	tp->bytes_received = skb->len;
+
+	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+		tcp_fin(sk);
 }
 
 static struct sock *tcp_fastopen_create_child(struct sock *sk,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4add3eb40e58..8194a250a01e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3995,7 +3995,7 @@ void tcp_reset(struct sock *sk)
  *
  *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
  */
-static void tcp_fin(struct sock *sk)
+void tcp_fin(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-- 
cgit v1.2.3-59-g8ed1b


From 3575dbf2cbbc8e598f17ec441aed526dbea0e1bd Mon Sep 17 00:00:00 2001
From: David Herrmann <dh.herrmann@gmail.com>
Date: Tue, 2 Feb 2016 18:17:54 +0100
Subject: net: drop write-only stack variable

Remove a write-only stack variable from unix_attach_fds(). This is a
left-over from the security fix in:

    commit 712f4aad406bb1ed67f3f98d04c044191f0ff593
    Author: willy tarreau <w@1wt.eu>
    Date:   Sun Jan 10 07:54:56 2016 +0100

        unix: properly account for FDs passed over unix sockets

Signed-off-by: David Herrmann <dh.herrmann@gmail.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 49d5093eb055..b3745557fc89 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1534,7 +1534,6 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
 {
 	int i;
 	unsigned char max_level = 0;
-	int unix_sock_count = 0;
 
 	if (too_many_unix_fds(current))
 		return -ETOOMANYREFS;
@@ -1542,11 +1541,9 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
 	for (i = scm->fp->count - 1; i >= 0; i--) {
 		struct sock *sk = unix_get_socket(scm->fp->fp[i]);
 
-		if (sk) {
-			unix_sock_count++;
+		if (sk)
 			max_level = max(max_level,
 					unix_sk(sk)->recursion_level);
-		}
 	}
 	if (unlikely(max_level > MAX_RECURSION_LEVEL))
 		return -ETOOMANYREFS;
-- 
cgit v1.2.3-59-g8ed1b


From e662ca40de846e0a2be6326a7c4668326ddb194c Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Tue, 2 Feb 2016 10:33:04 -0800
Subject: tcp: retransmit after recovery processing and congestion control

The retransmission and F-RTO transmission currently happen inside
recovery state processing (tcp_fastretrans_alert) but before
congestion control.  This refactoring moves the logic after both
s.t. we can determine how much to send (cwnd) before deciding what to
send.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 58 +++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 46 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 8194a250a01e..84a4ab9c05d1 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -126,6 +126,10 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
 #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
 #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
 
+#define REXMIT_NONE	0 /* no loss recovery to do */
+#define REXMIT_LOST	1 /* retransmit packets marked lost */
+#define REXMIT_NEW	2 /* FRTO-style transmit of unsent/new packets */
+
 /* Adapt the MSS value used to make delayed ack decision to the
  * real world.
  */
@@ -2662,7 +2666,8 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
  * recovered or spurious. Otherwise retransmits more on partial ACKs.
  */
-static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
+static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
+			     int *rexmit)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	bool recovered = !before(tp->snd_una, tp->high_seq);
@@ -2684,10 +2689,15 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
 				tp->frto = 0; /* Step 3.a. loss was real */
 		} else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
 			tp->high_seq = tp->snd_nxt;
-			__tcp_push_pending_frames(sk, tcp_current_mss(sk),
-						  TCP_NAGLE_OFF);
-			if (after(tp->snd_nxt, tp->high_seq))
-				return; /* Step 2.b */
+			/* Step 2.b. Try send new data (but deferred until cwnd
+			 * is updated in tcp_ack()). Otherwise fall back to
+			 * the conventional recovery.
+			 */
+			if (tcp_send_head(sk) &&
+			    after(tcp_wnd_end(tp), tp->snd_nxt)) {
+				*rexmit = REXMIT_NEW;
+				return;
+			}
 			tp->frto = 0;
 		}
 	}
@@ -2706,7 +2716,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
 		else if (flag & FLAG_SND_UNA_ADVANCED)
 			tcp_reset_reno_sack(tp);
 	}
-	tcp_xmit_retransmit_queue(sk);
+	*rexmit = REXMIT_LOST;
 }
 
 /* Undo during fast recovery after partial ACK. */
@@ -2756,7 +2766,7 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked,
  */
 static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 				  const int prior_unsacked,
-				  bool is_dupack, int flag)
+				  bool is_dupack, int flag, int *rexmit)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -2831,7 +2841,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 		}
 		break;
 	case TCP_CA_Loss:
-		tcp_process_loss(sk, flag, is_dupack);
+		tcp_process_loss(sk, flag, is_dupack, rexmit);
 		if (icsk->icsk_ca_state != TCP_CA_Open &&
 		    !(flag & FLAG_LOST_RETRANS))
 			return;
@@ -2871,7 +2881,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 	if (do_lost)
 		tcp_update_scoreboard(sk, fast_rexmit);
 	tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit, flag);
-	tcp_xmit_retransmit_queue(sk);
+	*rexmit = REXMIT_LOST;
 }
 
 /* Kathleen Nichols' algorithm for tracking the minimum value of
@@ -3506,6 +3516,27 @@ static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
 		icsk->icsk_ca_ops->in_ack_event(sk, flags);
 }
 
+/* Congestion control has updated the cwnd already. So if we're in
+ * loss recovery then now we do any new sends (for FRTO) or
+ * retransmits (for CA_Loss or CA_recovery) that make sense.
+ */
+static void tcp_xmit_recovery(struct sock *sk, int rexmit)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (rexmit == REXMIT_NONE)
+		return;
+
+	if (unlikely(rexmit == 2)) {
+		__tcp_push_pending_frames(sk, tcp_current_mss(sk),
+					  TCP_NAGLE_OFF);
+		if (after(tp->snd_nxt, tp->high_seq))
+			return;
+		tp->frto = 0;
+	}
+	tcp_xmit_retransmit_queue(sk);
+}
+
 /* This routine deals with incoming acks, but not outgoing ones. */
 static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 {
@@ -3520,6 +3551,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	int prior_packets = tp->packets_out;
 	const int prior_unsacked = tp->packets_out - tp->sacked_out;
 	int acked = 0; /* Number of packets newly acked */
+	int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
 
 	sack_state.first_sackt.v64 = 0;
 
@@ -3616,7 +3648,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	if (tcp_ack_is_dubious(sk, flag)) {
 		is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
 		tcp_fastretrans_alert(sk, acked, prior_unsacked,
-				      is_dupack, flag);
+				      is_dupack, flag, &rexmit);
 	}
 	if (tp->tlp_high_seq)
 		tcp_process_tlp_ack(sk, ack, flag);
@@ -3634,13 +3666,14 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS)
 		tcp_schedule_loss_probe(sk);
 	tcp_update_pacing_rate(sk);
+	tcp_xmit_recovery(sk, rexmit);
 	return 1;
 
 no_queue:
 	/* If data was DSACKed, see if we can undo a cwnd reduction. */
 	if (flag & FLAG_DSACKING_ACK)
 		tcp_fastretrans_alert(sk, acked, prior_unsacked,
-				      is_dupack, flag);
+				      is_dupack, flag, &rexmit);
 	/* If this ack opens up a zero window, clear backoff.  It was
 	 * being used to time the probes, and is probably far higher than
 	 * it needs to be for normal retransmission.
@@ -3664,7 +3697,8 @@ old_ack:
 		flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
 						&sack_state);
 		tcp_fastretrans_alert(sk, acked, prior_unsacked,
-				      is_dupack, flag);
+				      is_dupack, flag, &rexmit);
+		tcp_xmit_recovery(sk, rexmit);
 	}
 
 	SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
-- 
cgit v1.2.3-59-g8ed1b


From 31ba0c10723e9eba378f96de1d1a9426129949e1 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Tue, 2 Feb 2016 10:33:05 -0800
Subject: tcp: move cwnd reduction after recovery state procesing

Currently the cwnd is reduced and increased in various different
places. The reduction happens in various places in the recovery
state processing (tcp_fastretrans_alert) while the increase
happens afterward.

A better sequence is to identify lost packets and update
the congestion control state (icsk_ca_state) first. Then base
on the new state, up/down the cwnd in one central place. It's
more clear to reason cwnd changes.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 60 ++++++++++++++++++++++++----------------------------
 1 file changed, 28 insertions(+), 32 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 84a4ab9c05d1..dc810df53e90 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2471,14 +2471,12 @@ static void tcp_init_cwnd_reduction(struct sock *sk)
 	tcp_ecn_queue_cwr(tp);
 }
 
-static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
-			       int fast_rexmit, int flag)
+static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked,
+			       int flag)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int sndcnt = 0;
 	int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
-	int newly_acked_sacked = prior_unsacked -
-				 (tp->packets_out - tp->sacked_out);
 
 	if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
 		return;
@@ -2496,7 +2494,8 @@ static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
 	} else {
 		sndcnt = min(delta, newly_acked_sacked);
 	}
-	sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
+	/* Force a fast retransmit upon entering fast recovery */
+	sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1));
 	tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
 }
 
@@ -2541,7 +2540,7 @@ static void tcp_try_keep_open(struct sock *sk)
 	}
 }
 
-static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
+static void tcp_try_to_open(struct sock *sk, int flag)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -2555,8 +2554,6 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
 
 	if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
 		tcp_try_keep_open(sk);
-	} else {
-		tcp_cwnd_reduction(sk, prior_unsacked, 0, flag);
 	}
 }
 
@@ -2720,8 +2717,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
 }
 
 /* Undo during fast recovery after partial ACK. */
-static bool tcp_try_undo_partial(struct sock *sk, const int acked,
-				 const int prior_unsacked, int flag)
+static bool tcp_try_undo_partial(struct sock *sk, const int acked)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -2736,10 +2732,8 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked,
 		 * can undo. Otherwise we clock out new packets but do not
 		 * mark more packets lost or retransmit more.
 		 */
-		if (tp->retrans_out) {
-			tcp_cwnd_reduction(sk, prior_unsacked, 0, flag);
+		if (tp->retrans_out)
 			return true;
-		}
 
 		if (!tcp_any_retrans_done(sk))
 			tp->retrans_stamp = 0;
@@ -2758,21 +2752,21 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked,
  * taking into account both packets sitting in receiver's buffer and
  * packets lost by network.
  *
- * Besides that it does CWND reduction, when packet loss is detected
- * and changes state of machine.
+ * Besides that it updates the congestion state when packet loss or ECN
+ * is detected. But it does not reduce the cwnd, it is done by the
+ * congestion control later.
  *
  * It does _not_ decide what to send, it is made in function
  * tcp_xmit_retransmit_queue().
  */
 static void tcp_fastretrans_alert(struct sock *sk, const int acked,
-				  const int prior_unsacked,
-				  bool is_dupack, int flag, int *rexmit)
+				  bool is_dupack, int *ack_flag, int *rexmit)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
+	int fast_rexmit = 0, flag = *ack_flag;
 	bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
 				    (tcp_fackets_out(tp) > tp->reordering));
-	int fast_rexmit = 0;
 
 	if (WARN_ON(!tp->packets_out && tp->sacked_out))
 		tp->sacked_out = 0;
@@ -2819,8 +2813,10 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 
 	/* Use RACK to detect loss */
 	if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS &&
-	    tcp_rack_mark_lost(sk))
+	    tcp_rack_mark_lost(sk)) {
 		flag |= FLAG_LOST_RETRANS;
+		*ack_flag |= FLAG_LOST_RETRANS;
+	}
 
 	/* E. Process state. */
 	switch (icsk->icsk_ca_state) {
@@ -2829,7 +2825,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 			if (tcp_is_reno(tp) && is_dupack)
 				tcp_add_reno_sack(sk);
 		} else {
-			if (tcp_try_undo_partial(sk, acked, prior_unsacked, flag))
+			if (tcp_try_undo_partial(sk, acked))
 				return;
 			/* Partial ACK arrived. Force fast retransmit. */
 			do_lost = tcp_is_reno(tp) ||
@@ -2858,7 +2854,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 			tcp_try_undo_dsack(sk);
 
 		if (!tcp_time_to_recover(sk, flag)) {
-			tcp_try_to_open(sk, flag, prior_unsacked);
+			tcp_try_to_open(sk, flag);
 			return;
 		}
 
@@ -2880,7 +2876,6 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 
 	if (do_lost)
 		tcp_update_scoreboard(sk, fast_rexmit);
-	tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit, flag);
 	*rexmit = REXMIT_LOST;
 }
 
@@ -3306,9 +3301,6 @@ static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
 /* Decide wheather to run the increase function of congestion control. */
 static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
 {
-	if (tcp_in_cwnd_reduction(sk))
-		return false;
-
 	/* If reordering is high then always grow cwnd whenever data is
 	 * delivered regardless of its ordering. Otherwise stay conservative
 	 * and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/
@@ -3551,6 +3543,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	int prior_packets = tp->packets_out;
 	const int prior_unsacked = tp->packets_out - tp->sacked_out;
 	int acked = 0; /* Number of packets newly acked */
+	int acked_sacked; /* Number of packets newly acked or sacked */
 	int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
 
 	sack_state.first_sackt.v64 = 0;
@@ -3647,15 +3640,20 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 	if (tcp_ack_is_dubious(sk, flag)) {
 		is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
-		tcp_fastretrans_alert(sk, acked, prior_unsacked,
-				      is_dupack, flag, &rexmit);
+		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
 	}
 	if (tp->tlp_high_seq)
 		tcp_process_tlp_ack(sk, ack, flag);
 
+	acked_sacked = prior_unsacked - (tp->packets_out - tp->sacked_out);
 	/* Advance cwnd if state allows */
-	if (tcp_may_raise_cwnd(sk, flag))
+	if (tcp_in_cwnd_reduction(sk)) {
+		/* Reduce cwnd if state mandates */
+		tcp_cwnd_reduction(sk, acked_sacked, flag);
+	} else if (tcp_may_raise_cwnd(sk, flag)) {
+		/* Advance cwnd if state allows */
 		tcp_cong_avoid(sk, ack, acked);
+	}
 
 	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
 		struct dst_entry *dst = __sk_dst_get(sk);
@@ -3672,8 +3670,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 no_queue:
 	/* If data was DSACKed, see if we can undo a cwnd reduction. */
 	if (flag & FLAG_DSACKING_ACK)
-		tcp_fastretrans_alert(sk, acked, prior_unsacked,
-				      is_dupack, flag, &rexmit);
+		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
 	/* If this ack opens up a zero window, clear backoff.  It was
 	 * being used to time the probes, and is probably far higher than
 	 * it needs to be for normal retransmission.
@@ -3696,8 +3693,7 @@ old_ack:
 	if (TCP_SKB_CB(skb)->sacked) {
 		flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
 						&sack_state);
-		tcp_fastretrans_alert(sk, acked, prior_unsacked,
-				      is_dupack, flag, &rexmit);
+		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
 		tcp_xmit_recovery(sk, rexmit);
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From ddf1af6fa00e772fdb67a7d22cb83fac2b8968a8 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Tue, 2 Feb 2016 10:33:06 -0800
Subject: tcp: new delivery accounting

This patch changes the accounting of how many packets are
newly acked or sacked when the sender receives an ACK.

The current approach basically computes

   newly_acked_sacked = (prior_packets - prior_sacked) -
                        (tp->packets_out - tp->sacked_out)

   where prior_packets and prior_sacked out are snapshot
   at the beginning of the ACK processing.

The new approach tracks the delivery information via a new
TCP state variable "delivered" which monotically increases
as new packets are delivered in order or out-of-order.

The reason for this change is that the current approach is
brittle that produces negative or inaccurate estimate.

   1) For non-SACK connections, an ACK that advances the SND.UNA
   could reset the DUPACK counters (tp->sacked_out) in
   tcp_process_loss() or tcp_fastretrans_alert(). This inflates
   the inflight suddenly and causes under-estimate or even
   negative estimate. Here is a real example:

                   before   after (processing ACK)
   packets_out     75       73
   sacked_out      23        0
   ca state        Loss     Open

   The old approach computes (75-23) - (73 - 0) = -21 delivered
   while the new approach computes 1 delivered since it
   considers the 2nd-24th packets are delivered OOO.

   2) MSS change would re-count packets_out and sacked_out so
   the estimate is in-accurate and can even become negative.
   E.g., the inflight is doubled when MSS is halved.

   3) Spurious retransmission signaled by DSACK is not accounted

The new approach is simpler and more robust. For SACK connections,
tp->delivered increments as packets are being acked or sacked in
SACK and ACK processing.

For non-sack connections, it's done in tcp_remove_reno_sacks() and
tcp_add_reno_sack(). When an ACK advances the SND.UNA, tp->delivered
is incremented by the number of packets ACKed (less the current
number of DUPACKs received plus one packet hole).  Upon receiving
a DUPACK, tp->delivered is incremented assuming one out-of-order
packet is delivered.

Upon receiving a DSACK, tp->delivered is incremtened assuming one
retransmission is delivered in tcp_sacktag_write_queue().

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h  |  1 +
 net/ipv4/tcp_input.c | 21 +++++++++++++++------
 2 files changed, 16 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index b386361ba3e8..d909feeeaea2 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -256,6 +256,7 @@ struct tcp_sock {
 	u32	prr_delivered;	/* Number of newly delivered packets to
 				 * receiver in Recovery. */
 	u32	prr_out;	/* Total number of pkts sent during Recovery. */
+	u32	delivered;	/* Total data packets delivered incl. rexmits */
 
  	u32	rcv_wnd;	/* Current receiver window		*/
 	u32	write_seq;	/* Tail(+1) of data held in tcp send buffer */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index dc810df53e90..2d690b3f0a7b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1214,6 +1214,7 @@ static u8 tcp_sacktag_one(struct sock *sk,
 		sacked |= TCPCB_SACKED_ACKED;
 		state->flag |= FLAG_DATA_SACKED;
 		tp->sacked_out += pcount;
+		tp->delivered += pcount;  /* Out-of-order packets delivered */
 
 		fack_count += pcount;
 
@@ -1825,8 +1826,12 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend)
 static void tcp_add_reno_sack(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	u32 prior_sacked = tp->sacked_out;
+
 	tp->sacked_out++;
 	tcp_check_reno_reordering(sk, 0);
+	if (tp->sacked_out > prior_sacked)
+		tp->delivered++; /* Some out-of-order packet is delivered */
 	tcp_verify_left_out(tp);
 }
 
@@ -1838,6 +1843,7 @@ static void tcp_remove_reno_sacks(struct sock *sk, int acked)
 
 	if (acked > 0) {
 		/* One ACK acked hole. The rest eat duplicate ACKs. */
+		tp->delivered += max_t(int, acked - tp->sacked_out, 1);
 		if (acked - 1 >= tp->sacked_out)
 			tp->sacked_out = 0;
 		else
@@ -3156,10 +3162,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 				flag |= FLAG_ORIG_SACK_ACKED;
 		}
 
-		if (sacked & TCPCB_SACKED_ACKED)
+		if (sacked & TCPCB_SACKED_ACKED) {
 			tp->sacked_out -= acked_pcount;
-		else if (tcp_is_sack(tp) && !tcp_skb_spurious_retrans(tp, skb))
-			tcp_rack_advance(tp, &skb->skb_mstamp, sacked);
+		} else if (tcp_is_sack(tp)) {
+			tp->delivered += acked_pcount;
+			if (!tcp_skb_spurious_retrans(tp, skb))
+				tcp_rack_advance(tp, &skb->skb_mstamp, sacked);
+		}
 		if (sacked & TCPCB_LOST)
 			tp->lost_out -= acked_pcount;
 
@@ -3541,9 +3550,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	bool is_dupack = false;
 	u32 prior_fackets;
 	int prior_packets = tp->packets_out;
-	const int prior_unsacked = tp->packets_out - tp->sacked_out;
+	u32 prior_delivered = tp->delivered;
 	int acked = 0; /* Number of packets newly acked */
-	int acked_sacked; /* Number of packets newly acked or sacked */
+	u32 acked_sacked; /* Number of packets newly acked or sacked */
 	int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
 
 	sack_state.first_sackt.v64 = 0;
@@ -3645,7 +3654,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	if (tp->tlp_high_seq)
 		tcp_process_tlp_ack(sk, ack, flag);
 
-	acked_sacked = prior_unsacked - (tp->packets_out - tp->sacked_out);
+	acked_sacked = tp->delivered - prior_delivered;
 	/* Advance cwnd if state allows */
 	if (tcp_in_cwnd_reduction(sk)) {
 		/* Reduce cwnd if state mandates */
-- 
cgit v1.2.3-59-g8ed1b


From 3ebd88710584d494b670e54b2c339e3be290956c Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Tue, 2 Feb 2016 10:33:07 -0800
Subject: tcp: refactor pkts acked accounting

A small refactoring that gets number of packets cumulatively acked
from tcp_clean_rtx_queue() directly.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2d690b3f0a7b..40824b380ef8 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3104,7 +3104,7 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
  * arrived at the other end.
  */
 static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
-			       u32 prior_snd_una,
+			       u32 prior_snd_una, int *acked,
 			       struct tcp_sacktag_state *sack)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -3277,6 +3277,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 		}
 	}
 #endif
+	*acked = pkts_acked;
 	return flag;
 }
 
@@ -3642,10 +3643,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 		goto no_queue;
 
 	/* See if we can take anything off of the retransmit queue. */
-	acked = tp->packets_out;
-	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una,
+	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
 				    &sack_state);
-	acked -= tp->packets_out;
 
 	if (tcp_ack_is_dubious(sk, flag)) {
 		is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
-- 
cgit v1.2.3-59-g8ed1b


From 2d14a4def4fc87cb2d2712f7841b45189d75e301 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Tue, 2 Feb 2016 10:33:08 -0800
Subject: tcp: make congestion control more robust against reordering

This change enables congestion control to update cwnd based on
not only packet cumulatively acked but also packets delivered
out-of-order. This makes congestion control robust against packet
reordering because it may raise cwnd as long as packets are being
delivered once reordering has been detected (i.e., it only cares
the amount of packets delivered, not the ordering among them).

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 40824b380ef8..d598ff408cb9 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3660,7 +3660,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 		tcp_cwnd_reduction(sk, acked_sacked, flag);
 	} else if (tcp_may_raise_cwnd(sk, flag)) {
 		/* Advance cwnd if state allows */
-		tcp_cong_avoid(sk, ack, acked);
+		tcp_cong_avoid(sk, ack, acked_sacked);
 	}
 
 	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
-- 
cgit v1.2.3-59-g8ed1b


From d452e6caf8367cc70cf940c24a6a6cc2d521d3c1 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Tue, 2 Feb 2016 10:33:09 -0800
Subject: tcp: tcp_cong_control helper

Refactor and consolidate cwnd and rate updates into a new function
tcp_cong_control().

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d598ff408cb9..596c1cb6759a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3323,6 +3323,24 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
 	return flag & FLAG_DATA_ACKED;
 }
 
+/* The "ultimate" congestion control function that aims to replace the rigid
+ * cwnd increase and decrease control (tcp_cong_avoid,tcp_*cwnd_reduction).
+ * It's called toward the end of processing an ACK with precise rate
+ * information. All transmission or retransmission are delayed afterwards.
+ */
+static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
+			     int flag)
+{
+	if (tcp_in_cwnd_reduction(sk)) {
+		/* Reduce cwnd if state mandates */
+		tcp_cwnd_reduction(sk, acked_sacked, flag);
+	} else if (tcp_may_raise_cwnd(sk, flag)) {
+		/* Advance cwnd if state allows */
+		tcp_cong_avoid(sk, ack, acked_sacked);
+	}
+	tcp_update_pacing_rate(sk);
+}
+
 /* Check that window update is acceptable.
  * The function assumes that snd_una<=ack<=snd_next.
  */
@@ -3553,7 +3571,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	int prior_packets = tp->packets_out;
 	u32 prior_delivered = tp->delivered;
 	int acked = 0; /* Number of packets newly acked */
-	u32 acked_sacked; /* Number of packets newly acked or sacked */
 	int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
 
 	sack_state.first_sackt.v64 = 0;
@@ -3653,16 +3670,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	if (tp->tlp_high_seq)
 		tcp_process_tlp_ack(sk, ack, flag);
 
-	acked_sacked = tp->delivered - prior_delivered;
-	/* Advance cwnd if state allows */
-	if (tcp_in_cwnd_reduction(sk)) {
-		/* Reduce cwnd if state mandates */
-		tcp_cwnd_reduction(sk, acked_sacked, flag);
-	} else if (tcp_may_raise_cwnd(sk, flag)) {
-		/* Advance cwnd if state allows */
-		tcp_cong_avoid(sk, ack, acked_sacked);
-	}
-
 	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
 		struct dst_entry *dst = __sk_dst_get(sk);
 		if (dst)
@@ -3671,7 +3678,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS)
 		tcp_schedule_loss_probe(sk);
-	tcp_update_pacing_rate(sk);
+	tcp_cong_control(sk, ack, tp->delivered - prior_delivered, flag);
 	tcp_xmit_recovery(sk, rexmit);
 	return 1;
 
-- 
cgit v1.2.3-59-g8ed1b


From 6fa251663069e05daadd1666cbf3b658bf840ea4 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Wed, 3 Feb 2016 09:46:49 +0200
Subject: ipv4: Namespaceify tcp syn retries sysctl knob

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h   |  2 ++
 include/net/tcp.h          |  1 -
 net/ipv4/sysctl_net_ipv4.c | 18 +++++++++---------
 net/ipv4/tcp.c             |  3 ++-
 net/ipv4/tcp_ipv4.c        |  2 ++
 net/ipv4/tcp_timer.c       |  4 ++--
 6 files changed, 17 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 2b7907a35568..b7b5bd64df35 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -98,6 +98,8 @@ struct netns_ipv4 {
 	int sysctl_tcp_keepalive_probes;
 	int sysctl_tcp_keepalive_intvl;
 
+	int sysctl_tcp_syn_retries;
+
 	struct ping_group_range ping_group_range;
 
 	atomic_t dev_addr_genid;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 479d535609fd..825485c7cc1a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -240,7 +240,6 @@ extern int sysctl_tcp_timestamps;
 extern int sysctl_tcp_window_scaling;
 extern int sysctl_tcp_sack;
 extern int sysctl_tcp_fin_timeout;
-extern int sysctl_tcp_syn_retries;
 extern int sysctl_tcp_synack_retries;
 extern int sysctl_tcp_retries1;
 extern int sysctl_tcp_retries2;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 4d367b4139a3..ae9dd8823134 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -291,15 +291,6 @@ static struct ctl_table ipv4_table[] = {
 		.extra1		= &ip_ttl_min,
 		.extra2		= &ip_ttl_max,
 	},
-	{
-		.procname	= "tcp_syn_retries",
-		.data		= &sysctl_tcp_syn_retries,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &tcp_syn_retries_min,
-		.extra2		= &tcp_syn_retries_max
-	},
 	{
 		.procname	= "tcp_synack_retries",
 		.data		= &sysctl_tcp_synack_retries,
@@ -960,6 +951,15 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
 	},
+	{
+		.procname	= "tcp_syn_retries",
+		.data		= &init_net.ipv4.sysctl_tcp_syn_retries,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &tcp_syn_retries_min,
+		.extra2		= &tcp_syn_retries_max
+	},
 	{ }
 };
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c5075779e017..3dbb3637bb4b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2731,6 +2731,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
 	int val, len;
 
 	if (get_user(len, optlen))
@@ -2765,7 +2766,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 		val = keepalive_probes(tp);
 		break;
 	case TCP_SYNCNT:
-		val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
+		val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
 		break;
 	case TCP_LINGER2:
 		val = tp->linger2;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a4d523709ab3..f7464852aaa1 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2388,6 +2388,8 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
 
+	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
+
 	return 0;
 fail:
 	tcp_sk_exit(net);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index a4730a28b220..c5d51f530c65 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,7 +22,6 @@
 #include <linux/gfp.h>
 #include <net/tcp.h>
 
-int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES;
 int sysctl_tcp_synack_retries __read_mostly = TCP_SYNACK_RETRIES;
 int sysctl_tcp_retries1 __read_mostly = TCP_RETR1;
 int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
@@ -157,6 +156,7 @@ static int tcp_write_timeout(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
 	int retry_until;
 	bool do_reset, syn_set = false;
 
@@ -169,7 +169,7 @@ static int tcp_write_timeout(struct sock *sk)
 				NET_INC_STATS_BH(sock_net(sk),
 						 LINUX_MIB_TCPFASTOPENACTIVEFAIL);
 		}
-		retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
+		retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
 		syn_set = true;
 	} else {
 		if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
-- 
cgit v1.2.3-59-g8ed1b


From 7c083ecb3ba4583a625d5ff9655d1a819e374493 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Wed, 3 Feb 2016 09:46:50 +0200
Subject: ipv4: Namespaceify tcp synack retries sysctl knob

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h        |  1 +
 include/net/tcp.h               |  1 -
 net/ipv4/inet_connection_sock.c |  7 ++-----
 net/ipv4/sysctl_net_ipv4.c      | 14 +++++++-------
 net/ipv4/tcp_ipv4.c             |  1 +
 net/ipv4/tcp_timer.c            |  3 +--
 6 files changed, 12 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index b7b5bd64df35..9e83084ab8c1 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -99,6 +99,7 @@ struct netns_ipv4 {
 	int sysctl_tcp_keepalive_intvl;
 
 	int sysctl_tcp_syn_retries;
+	int sysctl_tcp_synack_retries;
 
 	struct ping_group_range ping_group_range;
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 825485c7cc1a..05659e860039 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -240,7 +240,6 @@ extern int sysctl_tcp_timestamps;
 extern int sysctl_tcp_window_scaling;
 extern int sysctl_tcp_sack;
 extern int sysctl_tcp_fin_timeout;
-extern int sysctl_tcp_synack_retries;
 extern int sysctl_tcp_retries1;
 extern int sysctl_tcp_retries2;
 extern int sysctl_tcp_orphan_retries;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 46b9c887bede..9b17c1792dce 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -482,10 +482,6 @@ EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
 #define AF_INET_FAMILY(fam) true
 #endif
 
-/* Only thing we need from tcp.h */
-extern int sysctl_tcp_synack_retries;
-
-
 /* Decide when to expire the request and when to resend SYN-ACK */
 static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
 				  const int max_retries,
@@ -557,6 +553,7 @@ static void reqsk_timer_handler(unsigned long data)
 {
 	struct request_sock *req = (struct request_sock *)data;
 	struct sock *sk_listener = req->rsk_listener;
+	struct net *net = sock_net(sk_listener);
 	struct inet_connection_sock *icsk = inet_csk(sk_listener);
 	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
 	int qlen, expire = 0, resend = 0;
@@ -566,7 +563,7 @@ static void reqsk_timer_handler(unsigned long data)
 	if (sk_state_load(sk_listener) != TCP_LISTEN)
 		goto drop;
 
-	max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
+	max_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries;
 	thresh = max_retries;
 	/* Normally all the openreqs are young and become mature
 	 * (i.e. converted to established socket) for first timeout.
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index ae9dd8823134..bb682e36d8b7 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -291,13 +291,6 @@ static struct ctl_table ipv4_table[] = {
 		.extra1		= &ip_ttl_min,
 		.extra2		= &ip_ttl_max,
 	},
-	{
-		.procname	= "tcp_synack_retries",
-		.data		= &sysctl_tcp_synack_retries,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec
-	},
 	{
 		.procname	= "tcp_max_orphans",
 		.data		= &sysctl_tcp_max_orphans,
@@ -960,6 +953,13 @@ static struct ctl_table ipv4_net_table[] = {
 		.extra1		= &tcp_syn_retries_min,
 		.extra2		= &tcp_syn_retries_max
 	},
+	{
+		.procname	= "tcp_synack_retries",
+		.data		= &init_net.ipv4.sysctl_tcp_synack_retries,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{ }
 };
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f7464852aaa1..3146279695b9 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2389,6 +2389,7 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
 
 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
+	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
 
 	return 0;
 fail:
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index c5d51f530c65..ca25fdf0c525 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,7 +22,6 @@
 #include <linux/gfp.h>
 #include <net/tcp.h>
 
-int sysctl_tcp_synack_retries __read_mostly = TCP_SYNACK_RETRIES;
 int sysctl_tcp_retries1 __read_mostly = TCP_RETR1;
 int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
 int sysctl_tcp_orphan_retries __read_mostly;
@@ -332,7 +331,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	int max_retries = icsk->icsk_syn_retries ? :
-	    sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */
+	    sock_net(sk)->ipv4.sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */
 	struct request_sock *req;
 
 	req = tcp_sk(sk)->fastopen_rsk;
-- 
cgit v1.2.3-59-g8ed1b


From 12ed8244ed8b31b023ea6d2851fd8b15f2999e9b Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Wed, 3 Feb 2016 09:46:51 +0200
Subject: ipv4: Namespaceify tcp syncookies sysctl knob

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h   |  2 ++
 include/net/tcp.h          |  1 -
 net/ipv4/syncookies.c      |  4 +---
 net/ipv4/sysctl_net_ipv4.c | 18 +++++++++---------
 net/ipv4/tcp_input.c       | 10 ++++++----
 net/ipv4/tcp_ipv4.c        |  3 ++-
 net/ipv4/tcp_minisocks.c   |  3 ---
 net/ipv6/syncookies.c      |  2 +-
 8 files changed, 21 insertions(+), 22 deletions(-)

(limited to 'net')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 9e83084ab8c1..ac000fccdf0f 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -101,6 +101,8 @@ struct netns_ipv4 {
 	int sysctl_tcp_syn_retries;
 	int sysctl_tcp_synack_retries;
 
+	int sysctl_tcp_syncookies;
+
 	struct ping_group_range ping_group_range;
 
 	atomic_t dev_addr_genid;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 05659e860039..1fb23b70d237 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -243,7 +243,6 @@ extern int sysctl_tcp_fin_timeout;
 extern int sysctl_tcp_retries1;
 extern int sysctl_tcp_retries2;
 extern int sysctl_tcp_orphan_retries;
-extern int sysctl_tcp_syncookies;
 extern int sysctl_tcp_fastopen;
 extern int sysctl_tcp_retrans_collapse;
 extern int sysctl_tcp_stdurg;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 643a86c49020..ba0dcffada3b 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -19,8 +19,6 @@
 #include <net/tcp.h>
 #include <net/route.h>
 
-extern int sysctl_tcp_syncookies;
-
 static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
 
 #define COOKIEBITS 24	/* Upper bits store count */
@@ -307,7 +305,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 	__u8 rcv_wscale;
 	struct flowi4 fl4;
 
-	if (!sysctl_tcp_syncookies || !th->ack || th->rst)
+	if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies || !th->ack || th->rst)
 		goto out;
 
 	if (tcp_synq_no_recent_overflow(sk))
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index bb682e36d8b7..d80142570a8d 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -341,15 +341,6 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
 	},
-#ifdef CONFIG_SYN_COOKIES
-	{
-		.procname	= "tcp_syncookies",
-		.data		= &sysctl_tcp_syncookies,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec
-	},
-#endif
 	{
 		.procname	= "tcp_fastopen",
 		.data		= &sysctl_tcp_fastopen,
@@ -960,6 +951,15 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+#ifdef CONFIG_SYN_COOKIES
+	{
+		.procname	= "tcp_syncookies",
+		.data		= &init_net.ipv4.sysctl_tcp_syncookies,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+#endif
 	{ }
 };
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 596c1cb6759a..b17aba42a368 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6163,9 +6163,10 @@ static bool tcp_syn_flood_action(const struct sock *sk,
 	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
 	const char *msg = "Dropping request";
 	bool want_cookie = false;
+	struct net *net = sock_net(sk);
 
 #ifdef CONFIG_SYN_COOKIES
-	if (sysctl_tcp_syncookies) {
+	if (net->ipv4.sysctl_tcp_syncookies) {
 		msg = "Sending cookies";
 		want_cookie = true;
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
@@ -6174,7 +6175,7 @@ static bool tcp_syn_flood_action(const struct sock *sk,
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
 
 	if (!queue->synflood_warned &&
-	    sysctl_tcp_syncookies != 2 &&
+	    net->ipv4.sysctl_tcp_syncookies != 2 &&
 	    xchg(&queue->synflood_warned, 1) == 0)
 		pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
 			proto, ntohs(tcp_hdr(skb)->dest), msg);
@@ -6207,6 +6208,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	__u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
 	struct tcp_options_received tmp_opt;
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
 	struct sock *fastopen_sk = NULL;
 	struct dst_entry *dst = NULL;
 	struct request_sock *req;
@@ -6217,7 +6219,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	 * limitations, they conserve resources and peer is
 	 * evidently real one.
 	 */
-	if ((sysctl_tcp_syncookies == 2 ||
+	if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
 	     inet_csk_reqsk_queue_is_full(sk)) && !isn) {
 		want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
 		if (!want_cookie)
@@ -6283,7 +6285,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 			}
 		}
 		/* Kill the following clause, if you dislike this way. */
-		else if (!sysctl_tcp_syncookies &&
+		else if (!net->ipv4.sysctl_tcp_syncookies &&
 			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
 			  (sysctl_max_syn_backlog >> 2)) &&
 			 !tcp_peer_is_proven(req, dst, false,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 3146279695b9..98313d10a2e0 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -860,7 +860,6 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
 	kfree(inet_rsk(req)->opt);
 }
 
-
 #ifdef CONFIG_TCP_MD5SIG
 /*
  * RFC2385 MD5 checksumming requires a mapping of
@@ -2391,6 +2390,8 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
 
+	net->ipv4.sysctl_tcp_syncookies = 0;
+
 	return 0;
 fail:
 	tcp_sk_exit(net);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 75632a925824..fadd8b978951 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -27,9 +27,6 @@
 #include <net/inet_common.h>
 #include <net/xfrm.h>
 
-int sysctl_tcp_syncookies __read_mostly = 1;
-EXPORT_SYMBOL(sysctl_tcp_syncookies);
-
 int sysctl_tcp_abort_on_overflow __read_mostly;
 
 struct inet_timewait_death_row tcp_death_row = {
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 2906ef20795e..0e393ff7f5d0 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -148,7 +148,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 	struct dst_entry *dst;
 	__u8 rcv_wscale;
 
-	if (!sysctl_tcp_syncookies || !th->ack || th->rst)
+	if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies || !th->ack || th->rst)
 		goto out;
 
 	if (tcp_synq_no_recent_overflow(sk))
-- 
cgit v1.2.3-59-g8ed1b


From 1043e25ff96a1efc7bd34d11f5f32203a28a3bd7 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Wed, 3 Feb 2016 09:46:52 +0200
Subject: ipv4: Namespaceify tcp reordering sysctl knob

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h   |  2 +-
 include/net/tcp.h          |  4 +++-
 net/ipv4/sysctl_net_ipv4.c | 14 +++++++-------
 net/ipv4/tcp.c             |  2 +-
 net/ipv4/tcp_input.c       | 12 ++++++------
 net/ipv4/tcp_ipv4.c        |  2 +-
 net/ipv4/tcp_metrics.c     |  3 ++-
 7 files changed, 21 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index ac000fccdf0f..eb4cd0a3c296 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -100,8 +100,8 @@ struct netns_ipv4 {
 
 	int sysctl_tcp_syn_retries;
 	int sysctl_tcp_synack_retries;
-
 	int sysctl_tcp_syncookies;
+	int sysctl_tcp_reordering;
 
 	struct ping_group_range ping_group_range;
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 1fb23b70d237..7e9a147cabae 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -961,9 +961,11 @@ static inline void tcp_enable_fack(struct tcp_sock *tp)
  */
 static inline void tcp_enable_early_retrans(struct tcp_sock *tp)
 {
+	struct net *net = sock_net((struct sock *)tp);
+
 	tp->do_early_retrans = sysctl_tcp_early_retrans &&
 		sysctl_tcp_early_retrans < 4 && !sysctl_tcp_thin_dupack &&
-		sysctl_tcp_reordering == 3;
+		net->ipv4.sysctl_tcp_reordering == 3;
 }
 
 static inline void tcp_disable_early_retrans(struct tcp_sock *tp)
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d80142570a8d..7cd20570588f 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -455,13 +455,6 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
-	{
-		.procname	= "tcp_reordering",
-		.data		= &sysctl_tcp_reordering,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec
-	},
 	{
 		.procname	= "tcp_max_reordering",
 		.data		= &sysctl_tcp_max_reordering,
@@ -960,6 +953,13 @@ static struct ctl_table ipv4_net_table[] = {
 		.proc_handler	= proc_dointvec
 	},
 #endif
+	{
+		.procname	= "tcp_reordering",
+		.data		= &init_net.ipv4.sysctl_tcp_reordering,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{ }
 };
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3dbb3637bb4b..f4db6b04cdb4 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -406,7 +406,7 @@ void tcp_init_sock(struct sock *sk)
 	tp->mss_cache = TCP_MSS_DEFAULT;
 	u64_stats_init(&tp->syncp);
 
-	tp->reordering = sysctl_tcp_reordering;
+	tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
 	tcp_enable_early_retrans(tp);
 	tcp_assign_congestion_control(sk);
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b17aba42a368..5ee6fe0d152d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -80,9 +80,7 @@ int sysctl_tcp_timestamps __read_mostly = 1;
 int sysctl_tcp_window_scaling __read_mostly = 1;
 int sysctl_tcp_sack __read_mostly = 1;
 int sysctl_tcp_fack __read_mostly = 1;
-int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
 int sysctl_tcp_max_reordering __read_mostly = 300;
-EXPORT_SYMBOL(sysctl_tcp_reordering);
 int sysctl_tcp_dsack __read_mostly = 1;
 int sysctl_tcp_app_win __read_mostly = 31;
 int sysctl_tcp_adv_win_scale __read_mostly = 1;
@@ -1883,6 +1881,7 @@ void tcp_enter_loss(struct sock *sk)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
 	struct sk_buff *skb;
 	bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
 	bool is_reneg;			/* is receiver reneging on SACKs? */
@@ -1933,9 +1932,9 @@ void tcp_enter_loss(struct sock *sk)
 	 * suggests that the degree of reordering is over-estimated.
 	 */
 	if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
-	    tp->sacked_out >= sysctl_tcp_reordering)
+	    tp->sacked_out >= net->ipv4.sysctl_tcp_reordering)
 		tp->reordering = min_t(unsigned int, tp->reordering,
-				       sysctl_tcp_reordering);
+				       net->ipv4.sysctl_tcp_reordering);
 	tcp_set_ca_state(sk, TCP_CA_Loss);
 	tp->high_seq = tp->snd_nxt;
 	tcp_ecn_queue_cwr(tp);
@@ -2119,6 +2118,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	__u32 packets_out;
+	int tcp_reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
 
 	/* Trick#1: The loss is proven. */
 	if (tp->lost_out)
@@ -2133,7 +2133,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
 	 */
 	packets_out = tp->packets_out;
 	if (packets_out <= tp->reordering &&
-	    tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) &&
+	    tp->sacked_out >= max_t(__u32, packets_out/2, tcp_reordering) &&
 	    !tcp_may_send_now(sk)) {
 		/* We have nothing to send. This connection is limited
 		 * either by receiver window or by application.
@@ -3317,7 +3317,7 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
 	 * new SACK or ECE mark may first advance cwnd here and later reduce
 	 * cwnd in tcp_fastretrans_alert() based on more states.
 	 */
-	if (tcp_sk(sk)->reordering > sysctl_tcp_reordering)
+	if (tcp_sk(sk)->reordering > sock_net(sk)->ipv4.sysctl_tcp_reordering)
 		return flag & FLAG_FORWARD_PROGRESS;
 
 	return flag & FLAG_DATA_ACKED;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 98313d10a2e0..10dfc8b5c0f8 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2389,8 +2389,8 @@ static int __net_init tcp_sk_init(struct net *net)
 
 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
-
 	net->ipv4.sysctl_tcp_syncookies = 0;
+	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
 
 	return 0;
 fail:
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index c8cbc2b4b792..c26241f3057b 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -369,6 +369,7 @@ void tcp_update_metrics(struct sock *sk)
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct dst_entry *dst = __sk_dst_get(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
 	struct tcp_metrics_block *tm;
 	unsigned long rtt;
 	u32 val;
@@ -473,7 +474,7 @@ void tcp_update_metrics(struct sock *sk)
 		if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) {
 			val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
 			if (val < tp->reordering &&
-			    tp->reordering != sysctl_tcp_reordering)
+			    tp->reordering != net->ipv4.sysctl_tcp_reordering)
 				tcp_metric_set(tm, TCP_METRIC_REORDERING,
 					       tp->reordering);
 		}
-- 
cgit v1.2.3-59-g8ed1b


From ae5c3f406cffe15ffd2aa544961b7cd027468d46 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Wed, 3 Feb 2016 09:46:53 +0200
Subject: ipv4: Namespaceify tcp_retries1 sysctl knob

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h   |  1 +
 include/net/tcp.h          |  1 -
 net/ipv4/sysctl_net_ipv4.c | 16 ++++++++--------
 net/ipv4/tcp_ipv4.c        |  1 +
 net/ipv4/tcp_timer.c       |  8 ++++----
 5 files changed, 14 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index eb4cd0a3c296..dee6ba647461 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -102,6 +102,7 @@ struct netns_ipv4 {
 	int sysctl_tcp_synack_retries;
 	int sysctl_tcp_syncookies;
 	int sysctl_tcp_reordering;
+	int sysctl_tcp_retries1;
 
 	struct ping_group_range ping_group_range;
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7e9a147cabae..da96b9af3e5f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -240,7 +240,6 @@ extern int sysctl_tcp_timestamps;
 extern int sysctl_tcp_window_scaling;
 extern int sysctl_tcp_sack;
 extern int sysctl_tcp_fin_timeout;
-extern int sysctl_tcp_retries1;
 extern int sysctl_tcp_retries2;
 extern int sysctl_tcp_orphan_retries;
 extern int sysctl_tcp_fastopen;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 7cd20570588f..52853c6dc929 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -319,14 +319,6 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
-	{
-		.procname	= "tcp_retries1",
-		.data		= &sysctl_tcp_retries1,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra2		= &tcp_retr1_max
-	},
 	{
 		.procname	= "tcp_retries2",
 		.data		= &sysctl_tcp_retries2,
@@ -960,6 +952,14 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "tcp_retries1",
+		.data		= &init_net.ipv4.sysctl_tcp_retries1,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra2		= &tcp_retr1_max
+	},
 	{ }
 };
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 10dfc8b5c0f8..57fe3c6bfb30 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2391,6 +2391,7 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
 	net->ipv4.sysctl_tcp_syncookies = 0;
 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
+	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
 
 	return 0;
 fail:
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index ca25fdf0c525..6694e33149b9 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,7 +22,6 @@
 #include <linux/gfp.h>
 #include <net/tcp.h>
 
-int sysctl_tcp_retries1 __read_mostly = TCP_RETR1;
 int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
 int sysctl_tcp_orphan_retries __read_mostly;
 int sysctl_tcp_thin_linear_timeouts __read_mostly;
@@ -171,7 +170,7 @@ static int tcp_write_timeout(struct sock *sk)
 		retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
 		syn_set = true;
 	} else {
-		if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
+		if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0, 0)) {
 			/* Some middle-boxes may black-hole Fast Open _after_
 			 * the handshake. Therefore we conservatively disable
 			 * Fast Open on this path on recurring timeouts with
@@ -180,7 +179,7 @@ static int tcp_write_timeout(struct sock *sk)
 			if (tp->syn_data_acked &&
 			    tp->bytes_acked <= tp->rx_opt.mss_clamp) {
 				tcp_fastopen_cache_set(sk, 0, NULL, true, 0);
-				if (icsk->icsk_retransmits == sysctl_tcp_retries1)
+				if (icsk->icsk_retransmits == net->ipv4.sysctl_tcp_retries1)
 					NET_INC_STATS_BH(sock_net(sk),
 							 LINUX_MIB_TCPFASTOPENACTIVEFAIL);
 			}
@@ -359,6 +358,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk)
 void tcp_retransmit_timer(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
 	if (tp->fastopen_rsk) {
@@ -489,7 +489,7 @@ out_reset_timer:
 		icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
 	}
 	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
-	if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0))
+	if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0, 0))
 		__sk_dst_reset(sk);
 
 out:;
-- 
cgit v1.2.3-59-g8ed1b


From c6214a97c86c660de4f7ddb8eed925192e646161 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Wed, 3 Feb 2016 09:46:54 +0200
Subject: ipv4: Namespaceify tcp_retries2 sysctl knob

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h   |  1 +
 include/net/tcp.h          |  1 -
 net/ipv4/sysctl_net_ipv4.c | 14 +++++++-------
 net/ipv4/tcp_ipv4.c        |  1 +
 net/ipv4/tcp_output.c      |  3 ++-
 net/ipv4/tcp_timer.c       |  5 ++---
 6 files changed, 13 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index dee6ba647461..d92c8e5d0fbc 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -103,6 +103,7 @@ struct netns_ipv4 {
 	int sysctl_tcp_syncookies;
 	int sysctl_tcp_reordering;
 	int sysctl_tcp_retries1;
+	int sysctl_tcp_retries2;
 
 	struct ping_group_range ping_group_range;
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index da96b9af3e5f..a786cfa6301b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -240,7 +240,6 @@ extern int sysctl_tcp_timestamps;
 extern int sysctl_tcp_window_scaling;
 extern int sysctl_tcp_sack;
 extern int sysctl_tcp_fin_timeout;
-extern int sysctl_tcp_retries2;
 extern int sysctl_tcp_orphan_retries;
 extern int sysctl_tcp_fastopen;
 extern int sysctl_tcp_retrans_collapse;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 52853c6dc929..8e339d43619c 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -319,13 +319,6 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
-	{
-		.procname	= "tcp_retries2",
-		.data		= &sysctl_tcp_retries2,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec
-	},
 	{
 		.procname	= "tcp_fin_timeout",
 		.data		= &sysctl_tcp_fin_timeout,
@@ -960,6 +953,13 @@ static struct ctl_table ipv4_net_table[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra2		= &tcp_retr1_max
 	},
+	{
+		.procname	= "tcp_retries2",
+		.data		= &init_net.ipv4.sysctl_tcp_retries2,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{ }
 };
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 57fe3c6bfb30..0710e6108a5e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2392,6 +2392,7 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_syncookies = 0;
 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
+	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
 
 	return 0;
 fail:
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index fda379cd600d..7beb3f688b7a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3476,6 +3476,7 @@ void tcp_send_probe0(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
 	unsigned long probe_max;
 	int err;
 
@@ -3489,7 +3490,7 @@ void tcp_send_probe0(struct sock *sk)
 	}
 
 	if (err <= 0) {
-		if (icsk->icsk_backoff < sysctl_tcp_retries2)
+		if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2)
 			icsk->icsk_backoff++;
 		icsk->icsk_probes_out++;
 		probe_max = TCP_RTO_MAX;
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 6694e33149b9..09f4e0297e56 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,7 +22,6 @@
 #include <linux/gfp.h>
 #include <net/tcp.h>
 
-int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
 int sysctl_tcp_orphan_retries __read_mostly;
 int sysctl_tcp_thin_linear_timeouts __read_mostly;
 
@@ -189,7 +188,7 @@ static int tcp_write_timeout(struct sock *sk)
 			dst_negative_advice(sk);
 		}
 
-		retry_until = sysctl_tcp_retries2;
+		retry_until = net->ipv4.sysctl_tcp_retries2;
 		if (sock_flag(sk, SOCK_DEAD)) {
 			const bool alive = icsk->icsk_rto < TCP_RTO_MAX;
 
@@ -303,7 +302,7 @@ static void tcp_probe_timer(struct sock *sk)
 		 (s32)(tcp_time_stamp - start_ts) > icsk->icsk_user_timeout)
 		goto abort;
 
-	max_probes = sysctl_tcp_retries2;
+	max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2;
 	if (sock_flag(sk, SOCK_DEAD)) {
 		const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
 
-- 
cgit v1.2.3-59-g8ed1b


From c402d9beffb6141ab2e4d2ad8be71128803a28ca Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Wed, 3 Feb 2016 09:46:55 +0200
Subject: ipv4: Namespaceify tcp_orphan_retries sysctl knob

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h   |  1 +
 include/net/tcp.h          |  1 -
 net/ipv4/sysctl_net_ipv4.c | 14 +++++++-------
 net/ipv4/tcp_ipv4.c        |  1 +
 net/ipv4/tcp_timer.c       |  3 +--
 5 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index d92c8e5d0fbc..080230321985 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -104,6 +104,7 @@ struct netns_ipv4 {
 	int sysctl_tcp_reordering;
 	int sysctl_tcp_retries1;
 	int sysctl_tcp_retries2;
+	int sysctl_tcp_orphan_retries;
 
 	struct ping_group_range ping_group_range;
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index a786cfa6301b..71f840b89c76 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -240,7 +240,6 @@ extern int sysctl_tcp_timestamps;
 extern int sysctl_tcp_window_scaling;
 extern int sysctl_tcp_sack;
 extern int sysctl_tcp_fin_timeout;
-extern int sysctl_tcp_orphan_retries;
 extern int sysctl_tcp_fastopen;
 extern int sysctl_tcp_retrans_collapse;
 extern int sysctl_tcp_stdurg;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 8e339d43619c..b7af6336985f 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -419,13 +419,6 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
 	},
-	{
-		.procname	= "tcp_orphan_retries",
-		.data		= &sysctl_tcp_orphan_retries,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec
-	},
 	{
 		.procname	= "tcp_fack",
 		.data		= &sysctl_tcp_fack,
@@ -960,6 +953,13 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "tcp_orphan_retries",
+		.data		= &init_net.ipv4.sysctl_tcp_orphan_retries,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{ }
 };
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 0710e6108a5e..1240dd62eee1 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2393,6 +2393,7 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
+	net->ipv4.sysctl_tcp_orphan_retries = 0;
 
 	return 0;
 fail:
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 09f4e0297e56..49bc474f8e35 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,7 +22,6 @@
 #include <linux/gfp.h>
 #include <net/tcp.h>
 
-int sysctl_tcp_orphan_retries __read_mostly;
 int sysctl_tcp_thin_linear_timeouts __read_mostly;
 
 static void tcp_write_err(struct sock *sk)
@@ -78,7 +77,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
 /* Calculate maximal number or retries on an orphaned socket. */
 static int tcp_orphan_retries(struct sock *sk, bool alive)
 {
-	int retries = sysctl_tcp_orphan_retries; /* May be zero. */
+	int retries = sock_net(sk)->ipv4.sysctl_tcp_orphan_retries; /* May be zero. */
 
 	/* We know from an ICMP that something is wrong. */
 	if (sk->sk_err_soft && !alive)
-- 
cgit v1.2.3-59-g8ed1b


From 1e579caa18b96f9eb18f4f5416658cd15f37c062 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Wed, 3 Feb 2016 09:46:56 +0200
Subject: ipv4: Namespaceify tcp_fin_timeout sysctl knob

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h   |  1 +
 include/net/tcp.h          |  3 +--
 net/ipv4/sysctl_net_ipv4.c | 14 +++++++-------
 net/ipv4/tcp.c             |  7 +++----
 net/ipv4/tcp_ipv4.c        |  1 +
 5 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 080230321985..de5ff4385e84 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -105,6 +105,7 @@ struct netns_ipv4 {
 	int sysctl_tcp_retries1;
 	int sysctl_tcp_retries2;
 	int sysctl_tcp_orphan_retries;
+	int sysctl_tcp_fin_timeout;
 
 	struct ping_group_range ping_group_range;
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 71f840b89c76..3f160c2e6960 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -239,7 +239,6 @@ extern struct inet_timewait_death_row tcp_death_row;
 extern int sysctl_tcp_timestamps;
 extern int sysctl_tcp_window_scaling;
 extern int sysctl_tcp_sack;
-extern int sysctl_tcp_fin_timeout;
 extern int sysctl_tcp_fastopen;
 extern int sysctl_tcp_retrans_collapse;
 extern int sysctl_tcp_stdurg;
@@ -1249,7 +1248,7 @@ static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp)
 
 static inline int tcp_fin_time(const struct sock *sk)
 {
-	int fin_timeout = tcp_sk(sk)->linger2 ? : sysctl_tcp_fin_timeout;
+	int fin_timeout = tcp_sk(sk)->linger2 ? : sock_net(sk)->ipv4.sysctl_tcp_fin_timeout;
 	const int rto = inet_csk(sk)->icsk_rto;
 
 	if (fin_timeout < (rto << 2) - (rto >> 1))
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index b7af6336985f..8bd335a2cba8 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -319,13 +319,6 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
-	{
-		.procname	= "tcp_fin_timeout",
-		.data		= &sysctl_tcp_fin_timeout,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_jiffies,
-	},
 	{
 		.procname	= "tcp_fastopen",
 		.data		= &sysctl_tcp_fastopen,
@@ -960,6 +953,13 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "tcp_fin_timeout",
+		.data		= &init_net.ipv4.sysctl_tcp_fin_timeout,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
 	{ }
 };
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f4db6b04cdb4..014f18e2f7b3 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -282,8 +282,6 @@
 #include <asm/unaligned.h>
 #include <net/busy_poll.h>
 
-int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
-
 int sysctl_tcp_min_tso_segs __read_mostly = 2;
 
 int sysctl_tcp_autocorking __read_mostly = 1;
@@ -2330,6 +2328,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct net *net = sock_net(sk);
 	int val;
 	int err = 0;
 
@@ -2526,7 +2525,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 	case TCP_LINGER2:
 		if (val < 0)
 			tp->linger2 = -1;
-		else if (val > sysctl_tcp_fin_timeout / HZ)
+		else if (val > net->ipv4.sysctl_tcp_fin_timeout / HZ)
 			tp->linger2 = 0;
 		else
 			tp->linger2 = val * HZ;
@@ -2771,7 +2770,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 	case TCP_LINGER2:
 		val = tp->linger2;
 		if (val >= 0)
-			val = (val ? : sysctl_tcp_fin_timeout) / HZ;
+			val = (val ? : net->ipv4.sysctl_tcp_fin_timeout) / HZ;
 		break;
 	case TCP_DEFER_ACCEPT:
 		val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1240dd62eee1..36c83c28d9c9 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2394,6 +2394,7 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
 	net->ipv4.sysctl_tcp_orphan_retries = 0;
+	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
 
 	return 0;
 fail:
-- 
cgit v1.2.3-59-g8ed1b


From 4979f2d9f7262b9b180bc83de8d70f7a7721c085 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Wed, 3 Feb 2016 09:46:57 +0200
Subject: ipv4: Namespaceify tcp_notsent_lowat sysctl knob

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h   |  1 +
 include/net/tcp.h          |  4 ++--
 net/ipv4/sysctl_net_ipv4.c | 14 +++++++-------
 net/ipv4/tcp_ipv4.c        |  1 +
 net/ipv4/tcp_output.c      |  3 ---
 5 files changed, 11 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index de5ff4385e84..4d6ec3f6fafe 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -106,6 +106,7 @@ struct netns_ipv4 {
 	int sysctl_tcp_retries2;
 	int sysctl_tcp_orphan_retries;
 	int sysctl_tcp_fin_timeout;
+	unsigned int sysctl_tcp_notsent_lowat;
 
 	struct ping_group_range ping_group_range;
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3f160c2e6960..9b2cb0c8d876 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -267,7 +267,6 @@ extern int sysctl_tcp_thin_dupack;
 extern int sysctl_tcp_early_retrans;
 extern int sysctl_tcp_limit_output_bytes;
 extern int sysctl_tcp_challenge_ack_limit;
-extern unsigned int sysctl_tcp_notsent_lowat;
 extern int sysctl_tcp_min_tso_segs;
 extern int sysctl_tcp_min_rtt_wlen;
 extern int sysctl_tcp_autocorking;
@@ -1682,7 +1681,8 @@ void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr);
 
 static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp)
 {
-	return tp->notsent_lowat ?: sysctl_tcp_notsent_lowat;
+	struct net *net = sock_net((struct sock *)tp);
+	return tp->notsent_lowat ?: net->ipv4.sysctl_tcp_notsent_lowat;
 }
 
 static inline bool tcp_stream_memory_free(const struct sock *sk)
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 8bd335a2cba8..44bb59824267 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -455,13 +455,6 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &one,
 	},
-	{
-		.procname	= "tcp_notsent_lowat",
-		.data		= &sysctl_tcp_notsent_lowat,
-		.maxlen		= sizeof(sysctl_tcp_notsent_lowat),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
 	{
 		.procname	= "tcp_rmem",
 		.data		= &sysctl_tcp_rmem,
@@ -960,6 +953,13 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
 	},
+	{
+		.procname	= "tcp_notsent_lowat",
+		.data		= &init_net.ipv4.sysctl_tcp_notsent_lowat,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 	{ }
 };
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 36c83c28d9c9..11ae706f53a1 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2395,6 +2395,7 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
 	net->ipv4.sysctl_tcp_orphan_retries = 0;
 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
+	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
 
 	return 0;
 fail:
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7beb3f688b7a..7d2c7a400456 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -62,9 +62,6 @@ int sysctl_tcp_tso_win_divisor __read_mostly = 3;
 /* By default, RFC2861 behavior.  */
 int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
 
-unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX;
-EXPORT_SYMBOL(sysctl_tcp_notsent_lowat);
-
 static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			   int push_one, gfp_t gfp);
 
-- 
cgit v1.2.3-59-g8ed1b


From 0aca737d4623b7b791b4b1e5f9b015fff5f7180c Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 8 Feb 2016 04:24:33 -0500
Subject: tcp: Fix syncookies sysctl default.

Unintentionally the default was changed to zero, fix
that.

Fixes: 12ed8244ed ("ipv4: Namespaceify tcp syncookies sysctl knob")
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 11ae706f53a1..0d381fa164f8 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2389,7 +2389,7 @@ static int __net_init tcp_sk_init(struct net *net)
 
 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
-	net->ipv4.sysctl_tcp_syncookies = 0;
+	net->ipv4.sysctl_tcp_syncookies = 1;
 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
-- 
cgit v1.2.3-59-g8ed1b


From 9d06b6d8a3fcba8eb8e1c62c92942452afea8057 Mon Sep 17 00:00:00 2001
From: Elad Raz <eladr@mellanox.com>
Date: Wed, 3 Feb 2016 09:57:05 +0100
Subject: bridge: mdb: Separate br_mdb_entry->state from
 net_bridge_port_group->state

Change net_bridge_port_group 'state' member to 'flags' and define new set
of flags internal to the kernel.

Signed-off-by: Elad Raz <eladr@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_mdb.c       | 16 ++++++++++++----
 net/bridge/br_multicast.c | 16 ++++++++--------
 net/bridge/br_private.h   |  9 ++++++---
 3 files changed, 26 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 30e105f57f0d..5312570e79fa 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -41,6 +41,14 @@ fail:
 	return -EMSGSIZE;
 }
 
+static void __mdb_entry_fill_flags(struct br_mdb_entry *e, unsigned char flags)
+{
+	e->state = flags & MDB_PG_FLAGS_PERMANENT;
+	e->flags = 0;
+	if (flags & MDB_PG_FLAGS_OFFLOAD)
+		e->flags |= MDB_FLAGS_OFFLOAD;
+}
+
 static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
 			    struct net_device *dev)
 {
@@ -85,8 +93,8 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
 					struct br_mdb_entry e;
 					memset(&e, 0, sizeof(e));
 					e.ifindex = port->dev->ifindex;
-					e.state = p->state;
 					e.vid = p->addr.vid;
+					__mdb_entry_fill_flags(&e, p->flags);
 					if (p->addr.proto == htons(ETH_P_IP))
 						e.addr.u.ip4 = p->addr.u.ip4;
 #if IS_ENABLED(CONFIG_IPV6)
@@ -254,7 +262,7 @@ errout:
 }
 
 void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
-		   struct br_ip *group, int type, u8 state)
+		   struct br_ip *group, int type, u8 flags)
 {
 	struct br_mdb_entry entry;
 
@@ -265,8 +273,8 @@ void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
 #if IS_ENABLED(CONFIG_IPV6)
 	entry.addr.u.ip6 = group->u.ip6;
 #endif
-	entry.state = state;
 	entry.vid = group->vid;
+	__mdb_entry_fill_flags(&entry, flags);
 	__br_mdb_notify(dev, &entry, type);
 }
 
@@ -568,7 +576,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry)
 		if (p->port->state == BR_STATE_DISABLED)
 			goto unlock;
 
-		entry->state = p->state;
+		__mdb_entry_fill_flags(entry, p->flags);
 		rcu_assign_pointer(*pp, p->next);
 		hlist_del_init(&p->mglist);
 		del_timer(&p->timer);
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 03661d97463c..d156491ee4e8 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -284,7 +284,7 @@ static void br_multicast_del_pg(struct net_bridge *br,
 		hlist_del_init(&p->mglist);
 		del_timer(&p->timer);
 		br_mdb_notify(br->dev, p->port, &pg->addr, RTM_DELMDB,
-			      p->state);
+			      p->flags);
 		call_rcu_bh(&p->rcu, br_multicast_free_pg);
 
 		if (!mp->ports && !mp->mglist &&
@@ -304,7 +304,7 @@ static void br_multicast_port_group_expired(unsigned long data)
 
 	spin_lock(&br->multicast_lock);
 	if (!netif_running(br->dev) || timer_pending(&pg->timer) ||
-	    hlist_unhashed(&pg->mglist) || pg->state & MDB_PERMANENT)
+	    hlist_unhashed(&pg->mglist) || pg->flags & MDB_PG_FLAGS_PERMANENT)
 		goto out;
 
 	br_multicast_del_pg(br, pg);
@@ -649,7 +649,7 @@ struct net_bridge_port_group *br_multicast_new_port_group(
 			struct net_bridge_port *port,
 			struct br_ip *group,
 			struct net_bridge_port_group __rcu *next,
-			unsigned char state)
+			unsigned char flags)
 {
 	struct net_bridge_port_group *p;
 
@@ -659,7 +659,7 @@ struct net_bridge_port_group *br_multicast_new_port_group(
 
 	p->addr = *group;
 	p->port = port;
-	p->state = state;
+	p->flags = flags;
 	rcu_assign_pointer(p->next, next);
 	hlist_add_head(&p->mglist, &port->mglist);
 	setup_timer(&p->timer, br_multicast_port_group_expired,
@@ -702,11 +702,11 @@ static int br_multicast_add_group(struct net_bridge *br,
 			break;
 	}
 
-	p = br_multicast_new_port_group(port, group, *pp, MDB_TEMPORARY);
+	p = br_multicast_new_port_group(port, group, *pp, 0);
 	if (unlikely(!p))
 		goto err;
 	rcu_assign_pointer(*pp, p);
-	br_mdb_notify(br->dev, port, group, RTM_NEWMDB, MDB_TEMPORARY);
+	br_mdb_notify(br->dev, port, group, RTM_NEWMDB, 0);
 
 found:
 	mod_timer(&p->timer, now + br->multicast_membership_interval);
@@ -975,7 +975,7 @@ void br_multicast_disable_port(struct net_bridge_port *port)
 
 	spin_lock(&br->multicast_lock);
 	hlist_for_each_entry_safe(pg, n, &port->mglist, mglist)
-		if (pg->state == MDB_TEMPORARY)
+		if (!(pg->flags & MDB_PG_FLAGS_PERMANENT))
 			br_multicast_del_pg(br, pg);
 
 	if (!hlist_unhashed(&port->rlist)) {
@@ -1454,7 +1454,7 @@ br_multicast_leave_group(struct net_bridge *br,
 			del_timer(&p->timer);
 			call_rcu_bh(&p->rcu, br_multicast_free_pg);
 			br_mdb_notify(br->dev, port, group, RTM_DELMDB,
-				      p->state);
+				      p->flags);
 
 			if (!mp->ports && !mp->mglist &&
 			    netif_running(br->dev))
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 216018c76018..67f4dce3c3d2 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -150,6 +150,9 @@ struct net_bridge_fdb_entry
 	struct rcu_head			rcu;
 };
 
+#define MDB_PG_FLAGS_PERMANENT	BIT(0)
+#define MDB_PG_FLAGS_OFFLOAD	BIT(1)
+
 struct net_bridge_port_group {
 	struct net_bridge_port		*port;
 	struct net_bridge_port_group __rcu *next;
@@ -157,7 +160,7 @@ struct net_bridge_port_group {
 	struct rcu_head			rcu;
 	struct timer_list		timer;
 	struct br_ip			addr;
-	unsigned char			state;
+	unsigned char			flags;
 };
 
 struct net_bridge_mdb_entry
@@ -554,11 +557,11 @@ void br_multicast_free_pg(struct rcu_head *head);
 struct net_bridge_port_group *
 br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group,
 			    struct net_bridge_port_group __rcu *next,
-			    unsigned char state);
+			    unsigned char flags);
 void br_mdb_init(void);
 void br_mdb_uninit(void);
 void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
-		   struct br_ip *group, int type, u8 state);
+		   struct br_ip *group, int type, u8 flags);
 void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port,
 		   int type);
 
-- 
cgit v1.2.3-59-g8ed1b


From 9e8430f8d60d985c9f292b5eb5790cc13a87200c Mon Sep 17 00:00:00 2001
From: Elad Raz <eladr@mellanox.com>
Date: Wed, 3 Feb 2016 09:57:06 +0100
Subject: bridge: mdb: Passing the port-group pointer to br_mdb module

Passing the port-group to br_mdb in order to allow direct access to the
structure. br_mdb will later use the structure to reflect HW reflection
status via "state" variable.

Signed-off-by: Elad Raz <eladr@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_mdb.c       | 51 +++++++++++++++++++++++++++--------------------
 net/bridge/br_multicast.c |  8 +++-----
 net/bridge/br_private.h   |  4 ++--
 3 files changed, 34 insertions(+), 29 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 5312570e79fa..ac089286526e 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -217,7 +217,7 @@ static inline size_t rtnl_mdb_nlmsg_size(void)
 }
 
 static void __br_mdb_notify(struct net_device *dev, struct br_mdb_entry *entry,
-			    int type)
+			    int type, struct net_bridge_port_group *pg)
 {
 	struct switchdev_obj_port_mdb mdb = {
 		.obj = {
@@ -240,10 +240,13 @@ static void __br_mdb_notify(struct net_device *dev, struct br_mdb_entry *entry,
 #endif
 
 	mdb.obj.orig_dev = port_dev;
-	if (port_dev && type == RTM_NEWMDB)
-		switchdev_port_obj_add(port_dev, &mdb.obj);
-	else if (port_dev && type == RTM_DELMDB)
+	if (port_dev && type == RTM_NEWMDB) {
+		err = switchdev_port_obj_add(port_dev, &mdb.obj);
+		if (!err && pg)
+			pg->flags |= MDB_PG_FLAGS_OFFLOAD;
+	} else if (port_dev && type == RTM_DELMDB) {
 		switchdev_port_obj_del(port_dev, &mdb.obj);
+	}
 
 	skb = nlmsg_new(rtnl_mdb_nlmsg_size(), GFP_ATOMIC);
 	if (!skb)
@@ -261,21 +264,21 @@ errout:
 	rtnl_set_sk_err(net, RTNLGRP_MDB, err);
 }
 
-void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
-		   struct br_ip *group, int type, u8 flags)
+void br_mdb_notify(struct net_device *dev, struct net_bridge_port_group *pg,
+		   int type)
 {
 	struct br_mdb_entry entry;
 
 	memset(&entry, 0, sizeof(entry));
-	entry.ifindex = port->dev->ifindex;
-	entry.addr.proto = group->proto;
-	entry.addr.u.ip4 = group->u.ip4;
+	entry.ifindex = pg->port->dev->ifindex;
+	entry.addr.proto = pg->addr.proto;
+	entry.addr.u.ip4 = pg->addr.u.ip4;
 #if IS_ENABLED(CONFIG_IPV6)
-	entry.addr.u.ip6 = group->u.ip6;
+	entry.addr.u.ip6 = pg->addr.u.ip6;
 #endif
-	entry.vid = group->vid;
-	__mdb_entry_fill_flags(&entry, flags);
-	__br_mdb_notify(dev, &entry, type);
+	entry.vid = pg->addr.vid;
+	__mdb_entry_fill_flags(&entry, pg->flags);
+	__br_mdb_notify(dev, &entry, type, pg);
 }
 
 static int nlmsg_populate_rtr_fill(struct sk_buff *skb,
@@ -420,7 +423,8 @@ static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh,
 }
 
 static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
-			    struct br_ip *group, unsigned char state)
+			    struct br_ip *group, unsigned char state,
+			    struct net_bridge_port_group **pg)
 {
 	struct net_bridge_mdb_entry *mp;
 	struct net_bridge_port_group *p;
@@ -451,6 +455,7 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
 	if (unlikely(!p))
 		return -ENOMEM;
 	rcu_assign_pointer(*pp, p);
+	*pg = p;
 	if (state == MDB_TEMPORARY)
 		mod_timer(&p->timer, now + br->multicast_membership_interval);
 
@@ -458,7 +463,8 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
 }
 
 static int __br_mdb_add(struct net *net, struct net_bridge *br,
-			struct br_mdb_entry *entry)
+			struct br_mdb_entry *entry,
+			struct net_bridge_port_group **pg)
 {
 	struct br_ip ip;
 	struct net_device *dev;
@@ -487,7 +493,7 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br,
 #endif
 
 	spin_lock_bh(&br->multicast_lock);
-	ret = br_mdb_add_group(br, p, &ip, entry->state);
+	ret = br_mdb_add_group(br, p, &ip, entry->state, pg);
 	spin_unlock_bh(&br->multicast_lock);
 	return ret;
 }
@@ -495,6 +501,7 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br,
 static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
 	struct net *net = sock_net(skb->sk);
+	struct net_bridge_port_group *pg;
 	struct net_bridge_vlan_group *vg;
 	struct net_device *dev, *pdev;
 	struct br_mdb_entry *entry;
@@ -524,15 +531,15 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
 	if (br_vlan_enabled(br) && vg && entry->vid == 0) {
 		list_for_each_entry(v, &vg->vlan_list, vlist) {
 			entry->vid = v->vid;
-			err = __br_mdb_add(net, br, entry);
+			err = __br_mdb_add(net, br, entry, &pg);
 			if (err)
 				break;
-			__br_mdb_notify(dev, entry, RTM_NEWMDB);
+			__br_mdb_notify(dev, entry, RTM_NEWMDB, pg);
 		}
 	} else {
-		err = __br_mdb_add(net, br, entry);
+		err = __br_mdb_add(net, br, entry, &pg);
 		if (!err)
-			__br_mdb_notify(dev, entry, RTM_NEWMDB);
+			__br_mdb_notify(dev, entry, RTM_NEWMDB, pg);
 	}
 
 	return err;
@@ -628,12 +635,12 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
 			entry->vid = v->vid;
 			err = __br_mdb_del(br, entry);
 			if (!err)
-				__br_mdb_notify(dev, entry, RTM_DELMDB);
+				__br_mdb_notify(dev, entry, RTM_DELMDB, NULL);
 		}
 	} else {
 		err = __br_mdb_del(br, entry);
 		if (!err)
-			__br_mdb_notify(dev, entry, RTM_DELMDB);
+			__br_mdb_notify(dev, entry, RTM_DELMDB, NULL);
 	}
 
 	return err;
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index d156491ee4e8..8b6e4249be1b 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -283,8 +283,7 @@ static void br_multicast_del_pg(struct net_bridge *br,
 		rcu_assign_pointer(*pp, p->next);
 		hlist_del_init(&p->mglist);
 		del_timer(&p->timer);
-		br_mdb_notify(br->dev, p->port, &pg->addr, RTM_DELMDB,
-			      p->flags);
+		br_mdb_notify(br->dev, p, RTM_DELMDB);
 		call_rcu_bh(&p->rcu, br_multicast_free_pg);
 
 		if (!mp->ports && !mp->mglist &&
@@ -706,7 +705,7 @@ static int br_multicast_add_group(struct net_bridge *br,
 	if (unlikely(!p))
 		goto err;
 	rcu_assign_pointer(*pp, p);
-	br_mdb_notify(br->dev, port, group, RTM_NEWMDB, 0);
+	br_mdb_notify(br->dev, p, RTM_NEWMDB);
 
 found:
 	mod_timer(&p->timer, now + br->multicast_membership_interval);
@@ -1453,8 +1452,7 @@ br_multicast_leave_group(struct net_bridge *br,
 			hlist_del_init(&p->mglist);
 			del_timer(&p->timer);
 			call_rcu_bh(&p->rcu, br_multicast_free_pg);
-			br_mdb_notify(br->dev, port, group, RTM_DELMDB,
-				      p->flags);
+			br_mdb_notify(br->dev, p, RTM_DELMDB);
 
 			if (!mp->ports && !mp->mglist &&
 			    netif_running(br->dev))
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 67f4dce3c3d2..302ab0a43725 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -560,8 +560,8 @@ br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group,
 			    unsigned char flags);
 void br_mdb_init(void);
 void br_mdb_uninit(void);
-void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
-		   struct br_ip *group, int type, u8 flags);
+void br_mdb_notify(struct net_device *dev, struct net_bridge_port_group *pg,
+		   int type);
 void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port,
 		   int type);
 
-- 
cgit v1.2.3-59-g8ed1b


From 16cc1400456a4d5d6df9f5e7f1b2c7cb8c50404b Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Wed, 3 Feb 2016 18:02:14 -0500
Subject: packet: move vnet_hdr code to helper functions

packet_snd and packet_rcv support virtio net headers for GSO.
Move this logic into helper functions to be able to reuse it in
tpacket_snd and tpacket_rcv.

This is a straighforward code move with one exception. Instead of
creating and passing a separate gso_type variable, reuse
vnet_hdr.gso_type after conversion from virtio to kernel gso type.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 261 ++++++++++++++++++++++++++++---------------------
 1 file changed, 148 insertions(+), 113 deletions(-)

(limited to 'net')

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 992396aa635c..bd3de7b4fb79 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1960,6 +1960,64 @@ static unsigned int run_filter(struct sk_buff *skb,
 	return res;
 }
 
+static int __packet_rcv_vnet(const struct sk_buff *skb,
+			     struct virtio_net_hdr *vnet_hdr)
+{
+	*vnet_hdr = (const struct virtio_net_hdr) { 0 };
+
+	if (skb_is_gso(skb)) {
+		struct skb_shared_info *sinfo = skb_shinfo(skb);
+
+		/* This is a hint as to how much should be linear. */
+		vnet_hdr->hdr_len =
+			__cpu_to_virtio16(vio_le(), skb_headlen(skb));
+		vnet_hdr->gso_size =
+			__cpu_to_virtio16(vio_le(), sinfo->gso_size);
+
+		if (sinfo->gso_type & SKB_GSO_TCPV4)
+			vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
+		else if (sinfo->gso_type & SKB_GSO_TCPV6)
+			vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
+		else if (sinfo->gso_type & SKB_GSO_UDP)
+			vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
+		else if (sinfo->gso_type & SKB_GSO_FCOE)
+			return -EINVAL;
+		else
+			BUG();
+
+		if (sinfo->gso_type & SKB_GSO_TCP_ECN)
+			vnet_hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN;
+	} else
+		vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
+
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		vnet_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+		vnet_hdr->csum_start = __cpu_to_virtio16(vio_le(),
+				  skb_checksum_start_offset(skb));
+		vnet_hdr->csum_offset = __cpu_to_virtio16(vio_le(),
+						 skb->csum_offset);
+	} else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
+		vnet_hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID;
+	} /* else everything is zero */
+
+	return 0;
+}
+
+static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
+			   size_t *len)
+{
+	struct virtio_net_hdr vnet_hdr;
+
+	if (*len < sizeof(vnet_hdr))
+		return -EINVAL;
+	*len -= sizeof(vnet_hdr);
+
+	if (__packet_rcv_vnet(skb, &vnet_hdr))
+		return -EINVAL;
+
+	return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
+}
+
 /*
  * This function makes lazy skb cloning in hope that most of packets
  * are discarded by BPF.
@@ -2347,6 +2405,84 @@ static void tpacket_set_protocol(const struct net_device *dev,
 	}
 }
 
+static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
+{
+	unsigned short gso_type = 0;
+
+	if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
+	    (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
+	     __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
+	      __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
+		vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
+			 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
+			__virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
+
+	if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
+		return -EINVAL;
+
+	if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
+		switch (vnet_hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
+		case VIRTIO_NET_HDR_GSO_TCPV4:
+			gso_type = SKB_GSO_TCPV4;
+			break;
+		case VIRTIO_NET_HDR_GSO_TCPV6:
+			gso_type = SKB_GSO_TCPV6;
+			break;
+		case VIRTIO_NET_HDR_GSO_UDP:
+			gso_type = SKB_GSO_UDP;
+			break;
+		default:
+			return -EINVAL;
+		}
+
+		if (vnet_hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN)
+			gso_type |= SKB_GSO_TCP_ECN;
+
+		if (vnet_hdr->gso_size == 0)
+			return -EINVAL;
+	}
+
+	vnet_hdr->gso_type = gso_type;	/* changes type, temporary storage */
+	return 0;
+}
+
+static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
+				 struct virtio_net_hdr *vnet_hdr)
+{
+	int n;
+
+	if (*len < sizeof(*vnet_hdr))
+		return -EINVAL;
+	*len -= sizeof(*vnet_hdr);
+
+	n = copy_from_iter(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter);
+	if (n != sizeof(*vnet_hdr))
+		return -EFAULT;
+
+	return __packet_snd_vnet_parse(vnet_hdr, *len);
+}
+
+static int packet_snd_vnet_gso(struct sk_buff *skb,
+			       struct virtio_net_hdr *vnet_hdr)
+{
+	if (vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
+		u16 s = __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start);
+		u16 o = __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset);
+
+		if (!skb_partial_csum_set(skb, s, o))
+			return -EINVAL;
+	}
+
+	skb_shinfo(skb)->gso_size =
+		__virtio16_to_cpu(vio_le(), vnet_hdr->gso_size);
+	skb_shinfo(skb)->gso_type = vnet_hdr->gso_type;
+
+	/* Header must be checked, and gso_segs computed. */
+	skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
+	skb_shinfo(skb)->gso_segs = 0;
+	return 0;
+}
+
 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 		void *frame, struct net_device *dev, int size_max,
 		__be16 proto, unsigned char *addr, int hlen)
@@ -2643,12 +2779,9 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 	struct sockcm_cookie sockc;
 	struct virtio_net_hdr vnet_hdr = { 0 };
 	int offset = 0;
-	int vnet_hdr_len;
 	struct packet_sock *po = pkt_sk(sk);
-	unsigned short gso_type = 0;
 	int hlen, tlen;
 	int extra_len = 0;
-	ssize_t n;
 
 	/*
 	 *	Get and verify the address.
@@ -2686,53 +2819,9 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 	if (sock->type == SOCK_RAW)
 		reserve = dev->hard_header_len;
 	if (po->has_vnet_hdr) {
-		vnet_hdr_len = sizeof(vnet_hdr);
-
-		err = -EINVAL;
-		if (len < vnet_hdr_len)
-			goto out_unlock;
-
-		len -= vnet_hdr_len;
-
-		err = -EFAULT;
-		n = copy_from_iter(&vnet_hdr, vnet_hdr_len, &msg->msg_iter);
-		if (n != vnet_hdr_len)
-			goto out_unlock;
-
-		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
-		    (__virtio16_to_cpu(vio_le(), vnet_hdr.csum_start) +
-		     __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset) + 2 >
-		      __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len)))
-			vnet_hdr.hdr_len = __cpu_to_virtio16(vio_le(),
-				 __virtio16_to_cpu(vio_le(), vnet_hdr.csum_start) +
-				__virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset) + 2);
-
-		err = -EINVAL;
-		if (__virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len) > len)
+		err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
+		if (err)
 			goto out_unlock;
-
-		if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
-			switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
-			case VIRTIO_NET_HDR_GSO_TCPV4:
-				gso_type = SKB_GSO_TCPV4;
-				break;
-			case VIRTIO_NET_HDR_GSO_TCPV6:
-				gso_type = SKB_GSO_TCPV6;
-				break;
-			case VIRTIO_NET_HDR_GSO_UDP:
-				gso_type = SKB_GSO_UDP;
-				break;
-			default:
-				goto out_unlock;
-			}
-
-			if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
-				gso_type |= SKB_GSO_TCP_ECN;
-
-			if (vnet_hdr.gso_size == 0)
-				goto out_unlock;
-
-		}
 	}
 
 	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
@@ -2744,7 +2833,8 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 	}
 
 	err = -EMSGSIZE;
-	if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
+	if (!vnet_hdr.gso_type &&
+	    (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
 		goto out_unlock;
 
 	err = -ENOBUFS;
@@ -2775,7 +2865,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 
 	sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
 
-	if (!gso_type && (len > dev->mtu + reserve + extra_len) &&
+	if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
 	    !packet_extra_vlan_len_allowed(dev, skb)) {
 		err = -EMSGSIZE;
 		goto out_free;
@@ -2789,24 +2879,10 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 	packet_pick_tx_queue(dev, skb);
 
 	if (po->has_vnet_hdr) {
-		if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
-			u16 s = __virtio16_to_cpu(vio_le(), vnet_hdr.csum_start);
-			u16 o = __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset);
-			if (!skb_partial_csum_set(skb, s, o)) {
-				err = -EINVAL;
-				goto out_free;
-			}
-		}
-
-		skb_shinfo(skb)->gso_size =
-			__virtio16_to_cpu(vio_le(), vnet_hdr.gso_size);
-		skb_shinfo(skb)->gso_type = gso_type;
-
-		/* Header must be checked, and gso_segs computed. */
-		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
-		skb_shinfo(skb)->gso_segs = 0;
-
-		len += vnet_hdr_len;
+		err = packet_snd_vnet_gso(skb, &vnet_hdr);
+		if (err)
+			goto out_free;
+		len += sizeof(vnet_hdr);
 	}
 
 	skb_probe_transport_header(skb, reserve);
@@ -3177,51 +3253,10 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 		packet_rcv_has_room(pkt_sk(sk), NULL);
 
 	if (pkt_sk(sk)->has_vnet_hdr) {
-		struct virtio_net_hdr vnet_hdr = { 0 };
-
-		err = -EINVAL;
-		vnet_hdr_len = sizeof(vnet_hdr);
-		if (len < vnet_hdr_len)
-			goto out_free;
-
-		len -= vnet_hdr_len;
-
-		if (skb_is_gso(skb)) {
-			struct skb_shared_info *sinfo = skb_shinfo(skb);
-
-			/* This is a hint as to how much should be linear. */
-			vnet_hdr.hdr_len =
-				__cpu_to_virtio16(vio_le(), skb_headlen(skb));
-			vnet_hdr.gso_size =
-				__cpu_to_virtio16(vio_le(), sinfo->gso_size);
-			if (sinfo->gso_type & SKB_GSO_TCPV4)
-				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
-			else if (sinfo->gso_type & SKB_GSO_TCPV6)
-				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
-			else if (sinfo->gso_type & SKB_GSO_UDP)
-				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
-			else if (sinfo->gso_type & SKB_GSO_FCOE)
-				goto out_free;
-			else
-				BUG();
-			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
-				vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
-		} else
-			vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
-
-		if (skb->ip_summed == CHECKSUM_PARTIAL) {
-			vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
-			vnet_hdr.csum_start = __cpu_to_virtio16(vio_le(),
-					  skb_checksum_start_offset(skb));
-			vnet_hdr.csum_offset = __cpu_to_virtio16(vio_le(),
-							 skb->csum_offset);
-		} else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
-			vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
-		} /* else everything is zero */
-
-		err = memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_len);
-		if (err < 0)
+		err = packet_rcv_vnet(msg, skb, &len);
+		if (err)
 			goto out_free;
+		vnet_hdr_len = sizeof(struct virtio_net_hdr);
 	}
 
 	/* You lose any data beyond the buffer you gave. If it worries
-- 
cgit v1.2.3-59-g8ed1b


From 58d19b19cd99b438541eea4cdbf5c171900b25e5 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Wed, 3 Feb 2016 18:02:15 -0500
Subject: packet: vnet_hdr support for tpacket_rcv

Support socket option PACKET_VNET_HDR together with PACKET_RX_RING.
When enabled, a struct virtio_net_hdr will precede the data in the
packet ring slots.

Verified with test program at
github.com/wdebruij/kerneltools/blob/master/tests/psock_rxring_vnet.c

  pkt: 1454269209.798420 len=5066
  vnet: gso_type=tcpv4 gso_size=1448 hlen=66 ecn=off
  csum: start=34 off=16
  eth: proto=0x800
  ip: src=<masked> dst=<masked> proto=6 len=5052

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index bd3de7b4fb79..b26df32cc64f 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2206,7 +2206,9 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		unsigned int maclen = skb_network_offset(skb);
 		netoff = TPACKET_ALIGN(po->tp_hdrlen +
 				       (maclen < 16 ? 16 : maclen)) +
-			po->tp_reserve;
+				       po->tp_reserve;
+		if (po->has_vnet_hdr)
+			netoff += sizeof(struct virtio_net_hdr);
 		macoff = netoff - maclen;
 	}
 	if (po->tp_version <= TPACKET_V2) {
@@ -2243,7 +2245,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	h.raw = packet_current_rx_frame(po, skb,
 					TP_STATUS_KERNEL, (macoff+snaplen));
 	if (!h.raw)
-		goto ring_is_full;
+		goto drop_n_account;
 	if (po->tp_version <= TPACKET_V2) {
 		packet_increment_rx_head(po, &po->rx_ring);
 	/*
@@ -2262,6 +2264,14 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	}
 	spin_unlock(&sk->sk_receive_queue.lock);
 
+	if (po->has_vnet_hdr) {
+		if (__packet_rcv_vnet(skb, h.raw + macoff -
+					   sizeof(struct virtio_net_hdr))) {
+			spin_lock(&sk->sk_receive_queue.lock);
+			goto drop_n_account;
+		}
+	}
+
 	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
 
 	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
@@ -2357,7 +2367,7 @@ drop:
 	kfree_skb(skb);
 	return 0;
 
-ring_is_full:
+drop_n_account:
 	po->stats.stats1.tp_drops++;
 	spin_unlock(&sk->sk_receive_queue.lock);
 
@@ -3587,7 +3597,8 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 		}
 		if (optlen < len)
 			return -EINVAL;
-		if (pkt_sk(sk)->has_vnet_hdr)
+		if (pkt_sk(sk)->has_vnet_hdr &&
+		    optname == PACKET_TX_RING)
 			return -EINVAL;
 		if (copy_from_user(&req_u.req, optval, len))
 			return -EFAULT;
-- 
cgit v1.2.3-59-g8ed1b


From 8d39b4a6b83c141acaf0b5e8f90fd5e67721ff90 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Wed, 3 Feb 2016 18:02:16 -0500
Subject: packet: parse tpacket header before skb alloc

GSO packet headers must be stored in the linear skb segment.
Move tpacket header parsing before sock_alloc_send_skb. The GSO
follow-on patch will later increase the skb linear argument to
sock_alloc_send_skb if needed for large packets.

The header parsing code does not require an allocated skb, so is
safe to move. Later pass to tpacket_fill_skb the computed data
start and length.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 111 +++++++++++++++++++++++++++++--------------------
 1 file changed, 65 insertions(+), 46 deletions(-)

(limited to 'net')

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index b26df32cc64f..89377bf283c8 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2494,14 +2494,13 @@ static int packet_snd_vnet_gso(struct sk_buff *skb,
 }
 
 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
-		void *frame, struct net_device *dev, int size_max,
+		void *frame, struct net_device *dev, void *data, int tp_len,
 		__be16 proto, unsigned char *addr, int hlen)
 {
 	union tpacket_uhdr ph;
-	int to_write, offset, len, tp_len, nr_frags, len_max;
+	int to_write, offset, len, nr_frags, len_max;
 	struct socket *sock = po->sk.sk_socket;
 	struct page *page;
-	void *data;
 	int err;
 
 	ph.raw = frame;
@@ -2513,51 +2512,9 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 	sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
 	skb_shinfo(skb)->destructor_arg = ph.raw;
 
-	switch (po->tp_version) {
-	case TPACKET_V2:
-		tp_len = ph.h2->tp_len;
-		break;
-	default:
-		tp_len = ph.h1->tp_len;
-		break;
-	}
-	if (unlikely(tp_len > size_max)) {
-		pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
-		return -EMSGSIZE;
-	}
-
 	skb_reserve(skb, hlen);
 	skb_reset_network_header(skb);
 
-	if (unlikely(po->tp_tx_has_off)) {
-		int off_min, off_max, off;
-		off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
-		off_max = po->tx_ring.frame_size - tp_len;
-		if (sock->type == SOCK_DGRAM) {
-			switch (po->tp_version) {
-			case TPACKET_V2:
-				off = ph.h2->tp_net;
-				break;
-			default:
-				off = ph.h1->tp_net;
-				break;
-			}
-		} else {
-			switch (po->tp_version) {
-			case TPACKET_V2:
-				off = ph.h2->tp_mac;
-				break;
-			default:
-				off = ph.h1->tp_mac;
-				break;
-			}
-		}
-		if (unlikely((off < off_min) || (off_max < off)))
-			return -EINVAL;
-		data = ph.raw + off;
-	} else {
-		data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
-	}
 	to_write = tp_len;
 
 	if (sock->type == SOCK_DGRAM) {
@@ -2615,6 +2572,61 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 	return tp_len;
 }
 
+static int tpacket_parse_header(struct packet_sock *po, void *frame,
+				int size_max, void **data)
+{
+	union tpacket_uhdr ph;
+	int tp_len, off;
+
+	ph.raw = frame;
+
+	switch (po->tp_version) {
+	case TPACKET_V2:
+		tp_len = ph.h2->tp_len;
+		break;
+	default:
+		tp_len = ph.h1->tp_len;
+		break;
+	}
+	if (unlikely(tp_len > size_max)) {
+		pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
+		return -EMSGSIZE;
+	}
+
+	if (unlikely(po->tp_tx_has_off)) {
+		int off_min, off_max;
+
+		off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
+		off_max = po->tx_ring.frame_size - tp_len;
+		if (po->sk.sk_type == SOCK_DGRAM) {
+			switch (po->tp_version) {
+			case TPACKET_V2:
+				off = ph.h2->tp_net;
+				break;
+			default:
+				off = ph.h1->tp_net;
+				break;
+			}
+		} else {
+			switch (po->tp_version) {
+			case TPACKET_V2:
+				off = ph.h2->tp_mac;
+				break;
+			default:
+				off = ph.h1->tp_mac;
+				break;
+			}
+		}
+		if (unlikely((off < off_min) || (off_max < off)))
+			return -EINVAL;
+	} else {
+		off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
+	}
+
+	*data = frame + off;
+	return tp_len;
+}
+
 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 {
 	struct sk_buff *skb;
@@ -2626,6 +2638,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 	bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
 	int tp_len, size_max;
 	unsigned char *addr;
+	void *data;
 	int len_sum = 0;
 	int status = TP_STATUS_AVAILABLE;
 	int hlen, tlen;
@@ -2673,6 +2686,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 			continue;
 		}
 
+		skb = NULL;
+		tp_len = tpacket_parse_header(po, ph, size_max, &data);
+		if (tp_len < 0)
+			goto tpacket_error;
+
 		status = TP_STATUS_SEND_REQUEST;
 		hlen = LL_RESERVED_SPACE(dev);
 		tlen = dev->needed_tailroom;
@@ -2686,7 +2704,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 				err = len_sum;
 			goto out_status;
 		}
-		tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
+		tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
 					  addr, hlen);
 		if (likely(tp_len >= 0) &&
 		    tp_len > dev->mtu + reserve &&
@@ -2694,6 +2712,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 			tp_len = -EMSGSIZE;
 
 		if (unlikely(tp_len < 0)) {
+tpacket_error:
 			if (po->tp_loss) {
 				__packet_set_status(po, ph,
 						TP_STATUS_AVAILABLE);
-- 
cgit v1.2.3-59-g8ed1b


From 1d036d25e5609ba73fee6a88db01c306b140d512 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Wed, 3 Feb 2016 18:02:17 -0500
Subject: packet: tpacket_snd gso and checksum offload

Support socket option PACKET_VNET_HDR together with PACKET_TX_RING.

When enabled, a struct virtio_net_hdr is expected to precede the data
in the ring. The vnet option must be set before the ring is created.

The implementation reuses the existing skb_copy_bits code that is used
when dev->hard_header_len is non-zero. Move this ll_header check to
before the skb alloc and combine it with a test for vnet_hdr->hdr_len.
Allocate and copy the max of the two.

Verified with test program at
github.com/wdebruij/kerneltools/blob/master/tests/psock_txring_vnet.c

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 53 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 37 insertions(+), 16 deletions(-)

(limited to 'net')

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 89377bf283c8..b7e7851ddc5d 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2495,7 +2495,7 @@ static int packet_snd_vnet_gso(struct sk_buff *skb,
 
 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 		void *frame, struct net_device *dev, void *data, int tp_len,
-		__be16 proto, unsigned char *addr, int hlen)
+		__be16 proto, unsigned char *addr, int hlen, int copylen)
 {
 	union tpacket_uhdr ph;
 	int to_write, offset, len, nr_frags, len_max;
@@ -2522,20 +2522,17 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 				NULL, tp_len);
 		if (unlikely(err < 0))
 			return -EINVAL;
-	} else if (dev->hard_header_len) {
-		if (ll_header_truncated(dev, tp_len))
-			return -EINVAL;
-
+	} else if (copylen) {
 		skb_push(skb, dev->hard_header_len);
-		err = skb_store_bits(skb, 0, data,
-				dev->hard_header_len);
+		skb_put(skb, copylen - dev->hard_header_len);
+		err = skb_store_bits(skb, 0, data, copylen);
 		if (unlikely(err))
 			return err;
 		if (!skb->protocol)
 			tpacket_set_protocol(dev, skb);
 
-		data += dev->hard_header_len;
-		to_write -= dev->hard_header_len;
+		data += copylen;
+		to_write -= copylen;
 	}
 
 	offset = offset_in_page(data);
@@ -2631,6 +2628,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 {
 	struct sk_buff *skb;
 	struct net_device *dev;
+	struct virtio_net_hdr *vnet_hdr = NULL;
 	__be16 proto;
 	int err, reserve = 0;
 	void *ph;
@@ -2641,7 +2639,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 	void *data;
 	int len_sum = 0;
 	int status = TP_STATUS_AVAILABLE;
-	int hlen, tlen;
+	int hlen, tlen, copylen = 0;
 
 	mutex_lock(&po->pg_vec_lock);
 
@@ -2674,7 +2672,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 	size_max = po->tx_ring.frame_size
 		- (po->tp_hdrlen - sizeof(struct sockaddr_ll));
 
-	if (size_max > dev->mtu + reserve + VLAN_HLEN)
+	if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
 		size_max = dev->mtu + reserve + VLAN_HLEN;
 
 	do {
@@ -2694,8 +2692,28 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 		status = TP_STATUS_SEND_REQUEST;
 		hlen = LL_RESERVED_SPACE(dev);
 		tlen = dev->needed_tailroom;
+		if (po->has_vnet_hdr) {
+			vnet_hdr = data;
+			data += sizeof(*vnet_hdr);
+			tp_len -= sizeof(*vnet_hdr);
+			if (tp_len < 0 ||
+			    __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
+				tp_len = -EINVAL;
+				goto tpacket_error;
+			}
+			copylen = __virtio16_to_cpu(vio_le(),
+						    vnet_hdr->hdr_len);
+		}
+		if (dev->hard_header_len) {
+			if (ll_header_truncated(dev, tp_len)) {
+				tp_len = -EINVAL;
+				goto tpacket_error;
+			}
+			copylen = max_t(int, copylen, dev->hard_header_len);
+		}
 		skb = sock_alloc_send_skb(&po->sk,
-				hlen + tlen + sizeof(struct sockaddr_ll),
+				hlen + tlen + sizeof(struct sockaddr_ll) +
+				(copylen - dev->hard_header_len),
 				!need_wait, &err);
 
 		if (unlikely(skb == NULL)) {
@@ -2705,9 +2723,10 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 			goto out_status;
 		}
 		tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
-					  addr, hlen);
+					  addr, hlen, copylen);
 		if (likely(tp_len >= 0) &&
 		    tp_len > dev->mtu + reserve &&
+		    !po->has_vnet_hdr &&
 		    !packet_extra_vlan_len_allowed(dev, skb))
 			tp_len = -EMSGSIZE;
 
@@ -2726,6 +2745,11 @@ tpacket_error:
 			}
 		}
 
+		if (po->has_vnet_hdr && packet_snd_vnet_gso(skb, vnet_hdr)) {
+			tp_len = -EINVAL;
+			goto tpacket_error;
+		}
+
 		packet_pick_tx_queue(dev, skb);
 
 		skb->destructor = tpacket_destruct_skb;
@@ -3616,9 +3640,6 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 		}
 		if (optlen < len)
 			return -EINVAL;
-		if (pkt_sk(sk)->has_vnet_hdr &&
-		    optname == PACKET_TX_RING)
-			return -EINVAL;
 		if (copy_from_user(&req_u.req, optval, len))
 			return -EFAULT;
 		return packet_set_ring(sk, &req_u, 0,
-- 
cgit v1.2.3-59-g8ed1b


From 140ed8e87ca8f4875c2b146cdb2cdbf0c9ac6080 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Tue, 5 Jan 2016 12:06:26 +0100
Subject: batman-adv: Drop reference to netdevice on last reference

The references to the network device should be dropped inside the release
function for batadv_hard_iface similar to what is done with the batman-adv
internal datastructures.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/hard-interface.c | 13 ++++++++-----
 net/batman-adv/hard-interface.h |  6 +++---
 2 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index db90022c00a4..b17e272b5cb3 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -47,13 +47,16 @@
 #include "sysfs.h"
 #include "translation-table.h"
 
-void batadv_hardif_free_rcu(struct rcu_head *rcu)
+/**
+ * batadv_hardif_release - release hard interface from lists and queue for
+ *  free after rcu grace period
+ * @hard_iface: the hard interface to free
+ */
+void batadv_hardif_release(struct batadv_hard_iface *hard_iface)
 {
-	struct batadv_hard_iface *hard_iface;
-
-	hard_iface = container_of(rcu, struct batadv_hard_iface, rcu);
 	dev_put(hard_iface->net_dev);
-	kfree(hard_iface);
+
+	kfree_rcu(hard_iface, rcu);
 }
 
 struct batadv_hard_iface *
diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h
index 4d6b5e12331f..9eb08ad6032d 100644
--- a/net/batman-adv/hard-interface.h
+++ b/net/batman-adv/hard-interface.h
@@ -61,18 +61,18 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface,
 void batadv_hardif_remove_interfaces(void);
 int batadv_hardif_min_mtu(struct net_device *soft_iface);
 void batadv_update_min_mtu(struct net_device *soft_iface);
-void batadv_hardif_free_rcu(struct rcu_head *rcu);
+void batadv_hardif_release(struct batadv_hard_iface *hard_iface);
 
 /**
  * batadv_hardif_free_ref - decrement the hard interface refcounter and
- *  possibly free it
+ *  possibly release it
  * @hard_iface: the hard interface to free
  */
 static inline void
 batadv_hardif_free_ref(struct batadv_hard_iface *hard_iface)
 {
 	if (atomic_dec_and_test(&hard_iface->refcount))
-		call_rcu(&hard_iface->rcu, batadv_hardif_free_rcu);
+		batadv_hardif_release(hard_iface);
 }
 
 static inline struct batadv_hard_iface *
-- 
cgit v1.2.3-59-g8ed1b


From 81f02683504dc5590204c3fa059eac86fbfa0927 Mon Sep 17 00:00:00 2001
From: Simon Wunderlich <simon@open-mesh.com>
Date: Mon, 23 Nov 2015 19:57:22 +0100
Subject: batman-adv: add seqno maximum age and protection start flag
 parameters

To allow future use of the window protected function with different
maximum sequence numbers, add a parameter to set this value which
was previously hardcoded. Another parameter added for future use is a
flag to return whether the protection window has started.

While at it, also fix the kerneldoc.

Signed-off-by: Simon Wunderlich <simon@open-mesh.com>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/bat_iv_ogm.c |  3 ++-
 net/batman-adv/main.h       |  3 +++
 net/batman-adv/routing.c    | 13 ++++++++++---
 net/batman-adv/routing.h    |  3 ++-
 4 files changed, 17 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 3266bcb5bb06..23ce90e21a40 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -1315,7 +1315,8 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr,
 	/* signalize caller that the packet is to be dropped. */
 	if (!hlist_empty(&orig_node->neigh_list) &&
 	    batadv_window_protected(bat_priv, seq_diff,
-				    &orig_ifinfo->batman_seqno_reset)) {
+				    BATADV_TQ_LOCAL_WINDOW_SIZE,
+				    &orig_ifinfo->batman_seqno_reset, NULL)) {
 		ret = BATADV_PROTECTED;
 		goto out;
 	}
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index a7dc41a2709b..32dfc9e578af 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -35,6 +35,9 @@
 /* Time To Live of broadcast messages */
 #define BATADV_TTL 50
 
+/* maximum sequence number age of broadcast messages */
+#define BATADV_BCAST_MAX_AGE 64
+
 /* purge originators after time in seconds if no valid packet comes in
  * -> TODO: check influence on BATADV_TQ_LOCAL_WINDOW_SIZE
  */
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index f4b60b1fb50e..1c8b35df50cf 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -146,23 +146,29 @@ out:
  * @bat_priv: the bat priv with all the soft interface information
  * @seq_num_diff: difference between the current/received sequence number and
  *  the last sequence number
+ * @seq_old_max_diff: maximum age of sequence number not considered as restart
  * @last_reset: jiffies timestamp of the last reset, will be updated when reset
  *  is detected
+ * @protection_started: is set to true if the protection window was started,
+ *   doesn't change otherwise.
  *
  * Return:
  *  0 if the packet is to be accepted.
  *  1 if the packet is to be ignored.
  */
 int batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff,
-			    unsigned long *last_reset)
+			    s32 seq_old_max_diff, unsigned long *last_reset,
+			    bool *protection_started)
 {
-	if (seq_num_diff <= -BATADV_TQ_LOCAL_WINDOW_SIZE ||
+	if (seq_num_diff <= -seq_old_max_diff ||
 	    seq_num_diff >= BATADV_EXPECTED_SEQNO_RANGE) {
 		if (!batadv_has_timed_out(*last_reset,
 					  BATADV_RESET_PROTECTION_MS))
 			return 1;
 
 		*last_reset = jiffies;
+		if (protection_started)
+			*protection_started = true;
 		batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
 			   "old packet received, start protection\n");
 	}
@@ -1073,7 +1079,8 @@ int batadv_recv_bcast_packet(struct sk_buff *skb,
 
 	/* check whether the packet is old and the host just restarted. */
 	if (batadv_window_protected(bat_priv, seq_diff,
-				    &orig_node->bcast_seqno_reset))
+				    BATADV_BCAST_MAX_AGE,
+				    &orig_node->bcast_seqno_reset, NULL))
 		goto spin_unlock;
 
 	/* mark broadcast in flood history, update window position
diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h
index c776e9655b9b..02a5caa84127 100644
--- a/net/batman-adv/routing.h
+++ b/net/batman-adv/routing.h
@@ -52,6 +52,7 @@ batadv_find_router(struct batadv_priv *bat_priv,
 		   struct batadv_orig_node *orig_node,
 		   struct batadv_hard_iface *recv_if);
 int batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff,
-			    unsigned long *last_reset);
+			    s32 seq_old_max_diff, unsigned long *last_reset,
+			    bool *protection_started);
 
 #endif /* _NET_BATMAN_ADV_ROUTING_H_ */
-- 
cgit v1.2.3-59-g8ed1b


From dded0692247c76f892adee3ba6878ff5f394e210 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 20 Dec 2015 09:04:03 +0100
Subject: batman-adv: Add lockdep assert for container_list_lock

The batadv_tvlv_container* functions state in their kernel-doc that they
require tvlv.container_list_lock. Add an assert to automatically detect
when this might have been ignored by the caller.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/main.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 5f319fd6ecd7..a9b4f75db874 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -697,6 +697,8 @@ static struct batadv_tvlv_container
 {
 	struct batadv_tvlv_container *tvlv_tmp, *tvlv = NULL;
 
+	lockdep_assert_held(&bat_priv->tvlv.container_list_lock);
+
 	hlist_for_each_entry(tvlv_tmp, &bat_priv->tvlv.container_list, list) {
 		if (tvlv_tmp->tvlv_hdr.type != type)
 			continue;
@@ -729,6 +731,8 @@ static u16 batadv_tvlv_container_list_size(struct batadv_priv *bat_priv)
 	struct batadv_tvlv_container *tvlv;
 	u16 tvlv_len = 0;
 
+	lockdep_assert_held(&bat_priv->tvlv.container_list_lock);
+
 	hlist_for_each_entry(tvlv, &bat_priv->tvlv.container_list, list) {
 		tvlv_len += sizeof(struct batadv_tvlv_hdr);
 		tvlv_len += ntohs(tvlv->tvlv_hdr.len);
-- 
cgit v1.2.3-59-g8ed1b


From 90f564dff43c9c62d68568c550c011825f14a899 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sat, 16 Jan 2016 10:29:40 +0100
Subject: batman-adv: Convert batadv_hardif_neigh_node to kref

batman-adv uses a self-written reference implementation which is just based
on atomic_t. This is less obvious when reading the code than kref and
therefore increases the change that the reference counting will be missed.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/originator.c | 20 ++++++++++++--------
 net/batman-adv/types.h      |  3 ++-
 2 files changed, 14 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index d4a30db0158a..7710595ffc42 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -23,6 +23,7 @@
 #include <linux/fs.h>
 #include <linux/jiffies.h>
 #include <linux/kernel.h>
+#include <linux/kref.h>
 #include <linux/list.h>
 #include <linux/lockdep.h>
 #include <linux/netdevice.h>
@@ -196,11 +197,15 @@ void batadv_neigh_ifinfo_free_ref(struct batadv_neigh_ifinfo *neigh_ifinfo)
 /**
  * batadv_hardif_neigh_release - release hardif neigh node from lists and
  *  queue for free after rcu grace period
- * @hardif_neigh: hardif neigh neighbor to free
+ * @ref: kref pointer of the neigh_node
  */
-static void
-batadv_hardif_neigh_release(struct batadv_hardif_neigh_node *hardif_neigh)
+static void batadv_hardif_neigh_release(struct kref *ref)
 {
+	struct batadv_hardif_neigh_node *hardif_neigh;
+
+	hardif_neigh = container_of(ref, struct batadv_hardif_neigh_node,
+				    refcount);
+
 	spin_lock_bh(&hardif_neigh->if_incoming->neigh_list_lock);
 	hlist_del_init_rcu(&hardif_neigh->list);
 	spin_unlock_bh(&hardif_neigh->if_incoming->neigh_list_lock);
@@ -216,8 +221,7 @@ batadv_hardif_neigh_release(struct batadv_hardif_neigh_node *hardif_neigh)
  */
 void batadv_hardif_neigh_free_ref(struct batadv_hardif_neigh_node *hardif_neigh)
 {
-	if (atomic_dec_and_test(&hardif_neigh->refcount))
-		batadv_hardif_neigh_release(hardif_neigh);
+	kref_put(&hardif_neigh->refcount, batadv_hardif_neigh_release);
 }
 
 /**
@@ -529,7 +533,7 @@ batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface,
 	hardif_neigh->if_incoming = hard_iface;
 	hardif_neigh->last_seen = jiffies;
 
-	atomic_set(&hardif_neigh->refcount, 1);
+	kref_init(&hardif_neigh->refcount);
 
 	if (bat_priv->bat_algo_ops->bat_hardif_neigh_init)
 		bat_priv->bat_algo_ops->bat_hardif_neigh_init(hardif_neigh);
@@ -584,7 +588,7 @@ batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface,
 		if (!batadv_compare_eth(tmp_hardif_neigh->addr, neigh_addr))
 			continue;
 
-		if (!atomic_inc_not_zero(&tmp_hardif_neigh->refcount))
+		if (!kref_get_unless_zero(&tmp_hardif_neigh->refcount))
 			continue;
 
 		hardif_neigh = tmp_hardif_neigh;
@@ -648,7 +652,7 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node,
 	spin_unlock_bh(&orig_node->neigh_list_lock);
 
 	/* increment unique neighbor refcount */
-	atomic_inc(&hardif_neigh->refcount);
+	kref_get(&hardif_neigh->refcount);
 
 	batadv_dbg(BATADV_DBG_BATMAN, orig_node->bat_priv,
 		   "Creating new neighbor %pM for orig_node %pM on interface %s\n",
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 8974bc0dc15c..ace7900347d4 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -25,6 +25,7 @@
 #include <linux/bitops.h>
 #include <linux/compiler.h>
 #include <linux/if_ether.h>
+#include <linux/kref.h>
 #include <linux/netdevice.h>
 #include <linux/sched.h> /* for linux/wait.h */
 #include <linux/spinlock.h>
@@ -359,7 +360,7 @@ struct batadv_hardif_neigh_node {
 	u8 addr[ETH_ALEN];
 	struct batadv_hard_iface *if_incoming;
 	unsigned long last_seen;
-	atomic_t refcount;
+	struct kref refcount;
 	struct rcu_head rcu;
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From e7aed321b8c07d2019b003c3f51a067f2b4f78ab Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sat, 16 Jan 2016 10:29:41 +0100
Subject: batman-adv: Convert batadv_gw_node to kref

batman-adv uses a self-written reference implementation which is just based
on atomic_t. This is less obvious when reading the code than kref and
therefore increases the change that the reference counting will be missed.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/gateway_client.c | 40 +++++++++++++++++++++++++++++-----------
 net/batman-adv/types.h          |  2 +-
 2 files changed, 30 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index 5950974de7b1..1f3847c61db8 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -28,6 +28,7 @@
 #include <linux/ip.h>
 #include <linux/ipv6.h>
 #include <linux/kernel.h>
+#include <linux/kref.h>
 #include <linux/list.h>
 #include <linux/netdevice.h>
 #include <linux/rculist.h>
@@ -59,12 +60,29 @@
  */
 #define BATADV_DHCP_CHADDR_OFFSET	28
 
+/**
+ * batadv_gw_node_release - release gw_node from lists and queue for free after
+ *  rcu grace period
+ * @ref: kref pointer of the gw_node
+ */
+static void batadv_gw_node_release(struct kref *ref)
+{
+	struct batadv_gw_node *gw_node;
+
+	gw_node = container_of(ref, struct batadv_gw_node, refcount);
+
+	batadv_orig_node_free_ref(gw_node->orig_node);
+	kfree_rcu(gw_node, rcu);
+}
+
+/**
+ * batadv_gw_node_free_ref - decrement the gw_node refcounter and possibly
+ *  release it
+ * @gw_node: gateway node to free
+ */
 static void batadv_gw_node_free_ref(struct batadv_gw_node *gw_node)
 {
-	if (atomic_dec_and_test(&gw_node->refcount)) {
-		batadv_orig_node_free_ref(gw_node->orig_node);
-		kfree_rcu(gw_node, rcu);
-	}
+	kref_put(&gw_node->refcount, batadv_gw_node_release);
 }
 
 static struct batadv_gw_node *
@@ -77,7 +95,7 @@ batadv_gw_get_selected_gw_node(struct batadv_priv *bat_priv)
 	if (!gw_node)
 		goto out;
 
-	if (!atomic_inc_not_zero(&gw_node->refcount))
+	if (!kref_get_unless_zero(&gw_node->refcount))
 		gw_node = NULL;
 
 out:
@@ -118,7 +136,7 @@ static void batadv_gw_select(struct batadv_priv *bat_priv,
 
 	spin_lock_bh(&bat_priv->gw.list_lock);
 
-	if (new_gw_node && !atomic_inc_not_zero(&new_gw_node->refcount))
+	if (new_gw_node && !kref_get_unless_zero(&new_gw_node->refcount))
 		new_gw_node = NULL;
 
 	curr_gw_node = rcu_dereference_protected(bat_priv->gw.curr_gw, 1);
@@ -170,7 +188,7 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv)
 		if (!router_ifinfo)
 			goto next;
 
-		if (!atomic_inc_not_zero(&gw_node->refcount))
+		if (!kref_get_unless_zero(&gw_node->refcount))
 			goto next;
 
 		tq_avg = router_ifinfo->bat_iv.tq_avg;
@@ -188,7 +206,7 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv)
 				if (curr_gw)
 					batadv_gw_node_free_ref(curr_gw);
 				curr_gw = gw_node;
-				atomic_inc(&curr_gw->refcount);
+				kref_get(&curr_gw->refcount);
 			}
 			break;
 
@@ -203,7 +221,7 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv)
 				if (curr_gw)
 					batadv_gw_node_free_ref(curr_gw);
 				curr_gw = gw_node;
-				atomic_inc(&curr_gw->refcount);
+				kref_get(&curr_gw->refcount);
 			}
 			break;
 		}
@@ -436,7 +454,7 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv,
 	gw_node->orig_node = orig_node;
 	gw_node->bandwidth_down = ntohl(gateway->bandwidth_down);
 	gw_node->bandwidth_up = ntohl(gateway->bandwidth_up);
-	atomic_set(&gw_node->refcount, 1);
+	kref_init(&gw_node->refcount);
 
 	spin_lock_bh(&bat_priv->gw.list_lock);
 	hlist_add_head_rcu(&gw_node->list, &bat_priv->gw.list);
@@ -469,7 +487,7 @@ batadv_gw_node_get(struct batadv_priv *bat_priv,
 		if (gw_node_tmp->orig_node != orig_node)
 			continue;
 
-		if (!atomic_inc_not_zero(&gw_node_tmp->refcount))
+		if (!kref_get_unless_zero(&gw_node_tmp->refcount))
 			continue;
 
 		gw_node = gw_node_tmp;
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index ace7900347d4..06c38881e382 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -342,7 +342,7 @@ struct batadv_gw_node {
 	struct batadv_orig_node *orig_node;
 	u32 bandwidth_down;
 	u32 bandwidth_up;
-	atomic_t refcount;
+	struct kref refcount;
 	struct rcu_head rcu;
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 6be4d30c18270fe275084c7cd30884535424266e Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sat, 16 Jan 2016 10:29:42 +0100
Subject: batman-adv: Convert batadv_softif_vlan to kref

batman-adv uses a self-written reference implementation which is just based
on atomic_t. This is less obvious when reading the code than kref and
therefore increases the change that the reference counting will be missed.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/soft-interface.c | 33 +++++++++++++++++++++++----------
 net/batman-adv/sysfs.c          |  3 ++-
 net/batman-adv/types.h          |  2 +-
 3 files changed, 26 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 4bf35b8c3d23..d4490ff75edd 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -30,6 +30,7 @@
 #include <linux/if_vlan.h>
 #include <linux/jiffies.h>
 #include <linux/kernel.h>
+#include <linux/kref.h>
 #include <linux/list.h>
 #include <linux/lockdep.h>
 #include <linux/netdevice.h>
@@ -477,9 +478,27 @@ out:
 	return;
 }
 
+/**
+ * batadv_softif_vlan_release - release vlan from lists and queue for free after
+ *  rcu grace period
+ * @ref: kref pointer of the vlan object
+ */
+static void batadv_softif_vlan_release(struct kref *ref)
+{
+	struct batadv_softif_vlan *vlan;
+
+	vlan = container_of(ref, struct batadv_softif_vlan, refcount);
+
+	spin_lock_bh(&vlan->bat_priv->softif_vlan_list_lock);
+	hlist_del_rcu(&vlan->list);
+	spin_unlock_bh(&vlan->bat_priv->softif_vlan_list_lock);
+
+	kfree_rcu(vlan, rcu);
+}
+
 /**
  * batadv_softif_vlan_free_ref - decrease the vlan object refcounter and
- *  possibly free it
+ *  possibly release it
  * @vlan: the vlan object to release
  */
 void batadv_softif_vlan_free_ref(struct batadv_softif_vlan *vlan)
@@ -487,13 +506,7 @@ void batadv_softif_vlan_free_ref(struct batadv_softif_vlan *vlan)
 	if (!vlan)
 		return;
 
-	if (atomic_dec_and_test(&vlan->refcount)) {
-		spin_lock_bh(&vlan->bat_priv->softif_vlan_list_lock);
-		hlist_del_rcu(&vlan->list);
-		spin_unlock_bh(&vlan->bat_priv->softif_vlan_list_lock);
-
-		kfree_rcu(vlan, rcu);
-	}
+	kref_put(&vlan->refcount, batadv_softif_vlan_release);
 }
 
 /**
@@ -514,7 +527,7 @@ struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv,
 		if (vlan_tmp->vid != vid)
 			continue;
 
-		if (!atomic_inc_not_zero(&vlan_tmp->refcount))
+		if (!kref_get_unless_zero(&vlan_tmp->refcount))
 			continue;
 
 		vlan = vlan_tmp;
@@ -549,7 +562,7 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid)
 
 	vlan->bat_priv = bat_priv;
 	vlan->vid = vid;
-	atomic_set(&vlan->refcount, 1);
+	kref_init(&vlan->refcount);
 
 	atomic_set(&vlan->ap_isolation, 0);
 
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index 964fc5986b2c..ab4382ba3855 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -25,6 +25,7 @@
 #include <linux/fs.h>
 #include <linux/if.h>
 #include <linux/if_vlan.h>
+#include <linux/kref.h>
 #include <linux/kernel.h>
 #include <linux/netdevice.h>
 #include <linux/printk.h>
@@ -97,7 +98,7 @@ batadv_kobj_to_vlan(struct batadv_priv *bat_priv, struct kobject *obj)
 		if (vlan_tmp->kobj != obj)
 			continue;
 
-		if (!atomic_inc_not_zero(&vlan_tmp->refcount))
+		if (!kref_get_unless_zero(&vlan_tmp->refcount))
 			continue;
 
 		vlan = vlan_tmp;
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 06c38881e382..868f6b592b7c 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -746,7 +746,7 @@ struct batadv_softif_vlan {
 	atomic_t ap_isolation;		/* boolean */
 	struct batadv_vlan_tt tt;
 	struct hlist_node list;
-	atomic_t refcount;
+	struct kref refcount;
 	struct rcu_head rcu;
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 06e56ded867d1035b877aba28189736f41e85ab0 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sat, 16 Jan 2016 10:29:43 +0100
Subject: batman-adv: Convert batadv_bla_backbone_gw to kref

batman-adv uses a self-written reference implementation which is just based
on atomic_t. This is less obvious when reading the code than kref and
therefore increases the change that the reference counting will be missed.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/bridge_loop_avoidance.c | 29 +++++++++++++++++++++++------
 net/batman-adv/types.h                 |  2 +-
 2 files changed, 24 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index 779160934844..41116e4c2e34 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -31,6 +31,7 @@
 #include <linux/jhash.h>
 #include <linux/jiffies.h>
 #include <linux/kernel.h>
+#include <linux/kref.h>
 #include <linux/list.h>
 #include <linux/lockdep.h>
 #include <linux/netdevice.h>
@@ -143,14 +144,29 @@ static int batadv_compare_claim(const struct hlist_node *node,
 }
 
 /**
- * batadv_compare_backbone_gw - free backbone gw
+ * batadv_backbone_gw_release - release backbone gw from lists and queue for
+ *  free after rcu grace period
+ * @ref: kref pointer of the backbone gw
+ */
+static void batadv_backbone_gw_release(struct kref *ref)
+{
+	struct batadv_bla_backbone_gw *backbone_gw;
+
+	backbone_gw = container_of(ref, struct batadv_bla_backbone_gw,
+				   refcount);
+
+	kfree_rcu(backbone_gw, rcu);
+}
+
+/**
+ * batadv_backbone_gw_free_ref - decrement the backbone gw refcounter and
+ *  possibly release it
  * @backbone_gw: backbone gateway to be free'd
  */
 static void
 batadv_backbone_gw_free_ref(struct batadv_bla_backbone_gw *backbone_gw)
 {
-	if (atomic_dec_and_test(&backbone_gw->refcount))
-		kfree_rcu(backbone_gw, rcu);
+	kref_put(&backbone_gw->refcount, batadv_backbone_gw_release);
 }
 
 /**
@@ -247,7 +263,7 @@ batadv_backbone_hash_find(struct batadv_priv *bat_priv, u8 *addr,
 						&search_entry))
 			continue;
 
-		if (!atomic_inc_not_zero(&backbone_gw->refcount))
+		if (!kref_get_unless_zero(&backbone_gw->refcount))
 			continue;
 
 		backbone_gw_tmp = backbone_gw;
@@ -448,7 +464,8 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig,
 	ether_addr_copy(entry->orig, orig);
 
 	/* one for the hash, one for returning */
-	atomic_set(&entry->refcount, 2);
+	kref_init(&entry->refcount);
+	kref_get(&entry->refcount);
 
 	hash_added = batadv_hash_add(bat_priv->bla.backbone_hash,
 				     batadv_compare_backbone_gw,
@@ -664,7 +681,7 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv,
 		batadv_backbone_gw_free_ref(claim->backbone_gw);
 	}
 	/* set (new) backbone gw */
-	atomic_inc(&backbone_gw->refcount);
+	kref_get(&backbone_gw->refcount);
 	claim->backbone_gw = backbone_gw;
 
 	spin_lock_bh(&backbone_gw->crc_lock);
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 868f6b592b7c..595f52400b1f 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -930,7 +930,7 @@ struct batadv_bla_backbone_gw {
 	atomic_t request_sent;
 	u16 crc;
 	spinlock_t crc_lock; /* protects crc */
-	atomic_t refcount;
+	struct kref refcount;
 	struct rcu_head rcu;
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 71b7e3d3161f7b1922a2c3b5d39b7b451c3f8cf7 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sat, 16 Jan 2016 10:29:44 +0100
Subject: batman-adv: Convert batadv_bla_claim to kref

batman-adv uses a self-written reference implementation which is just based
on atomic_t. This is less obvious when reading the code than kref and
therefore increases the change that the reference counting will be missed.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/bridge_loop_avoidance.c | 14 +++++++++-----
 net/batman-adv/types.h                 |  2 +-
 2 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index 41116e4c2e34..7781f39c174d 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -174,8 +174,12 @@ batadv_backbone_gw_free_ref(struct batadv_bla_backbone_gw *backbone_gw)
  *  grace period
  * @ref: kref pointer of the claim
  */
-static void batadv_claim_release(struct batadv_bla_claim *claim)
+static void batadv_claim_release(struct kref *ref)
 {
+	struct batadv_bla_claim *claim;
+
+	claim = container_of(ref, struct batadv_bla_claim, refcount);
+
 	batadv_backbone_gw_free_ref(claim->backbone_gw);
 	kfree_rcu(claim, rcu);
 }
@@ -187,8 +191,7 @@ static void batadv_claim_release(struct batadv_bla_claim *claim)
  */
 static void batadv_claim_free_ref(struct batadv_bla_claim *claim)
 {
-	if (atomic_dec_and_test(&claim->refcount))
-		batadv_claim_release(claim);
+	kref_put(&claim->refcount, batadv_claim_release);
 }
 
 /**
@@ -219,7 +222,7 @@ static struct batadv_bla_claim
 		if (!batadv_compare_claim(&claim->hash_entry, data))
 			continue;
 
-		if (!atomic_inc_not_zero(&claim->refcount))
+		if (!kref_get_unless_zero(&claim->refcount))
 			continue;
 
 		claim_tmp = claim;
@@ -651,7 +654,8 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv,
 		claim->lasttime = jiffies;
 		claim->backbone_gw = backbone_gw;
 
-		atomic_set(&claim->refcount, 2);
+		kref_init(&claim->refcount);
+		kref_get(&claim->refcount);
 		batadv_dbg(BATADV_DBG_BLA, bat_priv,
 			   "bla_add_claim(): adding new entry %pM, vid %d to hash ...\n",
 			   mac, BATADV_PRINT_VID(vid));
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 595f52400b1f..34c13b65bec2 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -951,7 +951,7 @@ struct batadv_bla_claim {
 	unsigned long lasttime;
 	struct hlist_node hash_entry;
 	struct rcu_head rcu;
-	atomic_t refcount;
+	struct kref refcount;
 };
 #endif
 
-- 
cgit v1.2.3-59-g8ed1b


From daf99b4810658d41d22be367fc3c1ae22505bef2 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sat, 16 Jan 2016 10:29:45 +0100
Subject: batman-adv: Convert batadv_nc_node to kref

batman-adv uses a self-written reference implementation which is just based
on atomic_t. This is less obvious when reading the code than kref and
therefore increases the change that the reference counting will be missed.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/network-coding.c | 21 +++++++++++++--------
 net/batman-adv/types.h          |  2 +-
 2 files changed, 14 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index 0b30c15eee5f..7a1216eeb4cf 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -32,6 +32,7 @@
 #include <linux/jhash.h>
 #include <linux/jiffies.h>
 #include <linux/kernel.h>
+#include <linux/kref.h>
 #include <linux/list.h>
 #include <linux/lockdep.h>
 #include <linux/netdevice.h>
@@ -209,23 +210,26 @@ void batadv_nc_init_orig(struct batadv_orig_node *orig_node)
 /**
  * batadv_nc_node_release - release nc_node from lists and queue for free after
  *  rcu grace period
- * @nc_node: the nc node to free
+ * @ref: kref pointer of the nc_node
  */
-static void batadv_nc_node_release(struct batadv_nc_node *nc_node)
+static void batadv_nc_node_release(struct kref *ref)
 {
+	struct batadv_nc_node *nc_node;
+
+	nc_node = container_of(ref, struct batadv_nc_node, refcount);
+
 	batadv_orig_node_free_ref(nc_node->orig_node);
 	kfree_rcu(nc_node, rcu);
 }
 
 /**
- * batadv_nc_node_free_ref - decrement the nc node refcounter and possibly
+ * batadv_nc_node_free_ref - decrement the nc_node refcounter and possibly
  *  release it
- * @nc_node: the nc node to free
+ * @nc_node: nc_node to be free'd
  */
 static void batadv_nc_node_free_ref(struct batadv_nc_node *nc_node)
 {
-	if (atomic_dec_and_test(&nc_node->refcount))
-		batadv_nc_node_release(nc_node);
+	kref_put(&nc_node->refcount, batadv_nc_node_release);
 }
 
 /**
@@ -797,7 +801,7 @@ static struct batadv_nc_node
 		if (!batadv_compare_eth(nc_node->addr, orig_node->orig))
 			continue;
 
-		if (!atomic_inc_not_zero(&nc_node->refcount))
+		if (!kref_get_unless_zero(&nc_node->refcount))
 			continue;
 
 		/* Found a match */
@@ -848,7 +852,8 @@ static struct batadv_nc_node
 	INIT_LIST_HEAD(&nc_node->list);
 	ether_addr_copy(nc_node->addr, orig_node->orig);
 	nc_node->orig_node = orig_neigh_node;
-	atomic_set(&nc_node->refcount, 2);
+	kref_init(&nc_node->refcount);
+	kref_get(&nc_node->refcount);
 
 	/* Select ingoing or outgoing coding node */
 	if (in_coding) {
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 34c13b65bec2..05421afb1d74 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -1067,7 +1067,7 @@ struct batadv_tt_roam_node {
 struct batadv_nc_node {
 	struct list_head list;
 	u8 addr[ETH_ALEN];
-	atomic_t refcount;
+	struct kref refcount;
 	struct rcu_head rcu;
 	struct batadv_orig_node *orig_node;
 	unsigned long last_seen;
-- 
cgit v1.2.3-59-g8ed1b


From 727e0cd59e7be176edc7728cef883f912a29168f Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sat, 16 Jan 2016 10:29:46 +0100
Subject: batman-adv: Convert batadv_nc_path to kref

batman-adv uses a self-written reference implementation which is just based
on atomic_t. This is less obvious when reading the code than kref and
therefore increases the change that the reference counting will be missed.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/network-coding.c | 28 +++++++++++++++++++++-------
 net/batman-adv/types.h          |  2 +-
 2 files changed, 22 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index 7a1216eeb4cf..371fc6bbf9d6 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -233,14 +233,27 @@ static void batadv_nc_node_free_ref(struct batadv_nc_node *nc_node)
 }
 
 /**
- * batadv_nc_path_free_ref - decrements the nc path refcounter and possibly
- * frees it
- * @nc_path: the nc node to free
+ * batadv_nc_path_release - release nc_path from lists and queue for free after
+ *  rcu grace period
+ * @ref: kref pointer of the nc_path
+ */
+static void batadv_nc_path_release(struct kref *ref)
+{
+	struct batadv_nc_path *nc_path;
+
+	nc_path = container_of(ref, struct batadv_nc_path, refcount);
+
+	kfree_rcu(nc_path, rcu);
+}
+
+/**
+ * batadv_nc_path_free_ref - decrement the nc_path refcounter and possibly
+ *  release it
+ * @nc_path: nc_path to be free'd
  */
 static void batadv_nc_path_free_ref(struct batadv_nc_path *nc_path)
 {
-	if (atomic_dec_and_test(&nc_path->refcount))
-		kfree_rcu(nc_path, rcu);
+	kref_put(&nc_path->refcount, batadv_nc_path_release);
 }
 
 /**
@@ -545,7 +558,7 @@ batadv_nc_hash_find(struct batadv_hashtable *hash,
 		if (!batadv_nc_hash_compare(&nc_path->hash_entry, data))
 			continue;
 
-		if (!atomic_inc_not_zero(&nc_path->refcount))
+		if (!kref_get_unless_zero(&nc_path->refcount))
 			continue;
 
 		nc_path_tmp = nc_path;
@@ -972,7 +985,8 @@ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv,
 	/* Initialize nc_path */
 	INIT_LIST_HEAD(&nc_path->packet_list);
 	spin_lock_init(&nc_path->packet_list_lock);
-	atomic_set(&nc_path->refcount, 2);
+	kref_init(&nc_path->refcount);
+	kref_get(&nc_path->refcount);
 	nc_path->last_valid = jiffies;
 	ether_addr_copy(nc_path->next_hop, dst);
 	ether_addr_copy(nc_path->prev_hop, src);
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 05421afb1d74..b3d9b5b566a2 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -1087,7 +1087,7 @@ struct batadv_nc_node {
 struct batadv_nc_path {
 	struct hlist_node hash_entry;
 	struct rcu_head rcu;
-	atomic_t refcount;
+	struct kref refcount;
 	struct list_head packet_list;
 	spinlock_t packet_list_lock; /* Protects packet_list */
 	u8 next_hop[ETH_ALEN];
-- 
cgit v1.2.3-59-g8ed1b


From 68a6722cc41f92ceffa8915bfe6a9c0307f2abeb Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sat, 16 Jan 2016 10:29:47 +0100
Subject: batman-adv: Convert batadv_dat_entry to kref

batman-adv uses a self-written reference implementation which is just based
on atomic_t. This is less obvious when reading the code than kref and
therefore increases the change that the reference counting will be missed.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/distributed-arp-table.c | 27 +++++++++++++++++++++------
 net/batman-adv/types.h                 |  2 +-
 2 files changed, 22 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index 017fffe9a5b8..921e25255cab 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -30,6 +30,7 @@
 #include <linux/in.h>
 #include <linux/jiffies.h>
 #include <linux/kernel.h>
+#include <linux/kref.h>
 #include <linux/list.h>
 #include <linux/rculist.h>
 #include <linux/rcupdate.h>
@@ -61,15 +62,28 @@ static void batadv_dat_start_timer(struct batadv_priv *bat_priv)
 			   msecs_to_jiffies(10000));
 }
 
+/**
+ * batadv_dat_entry_release - release dat_entry from lists and queue for free
+ *  after rcu grace period
+ * @ref: kref pointer of the dat_entry
+ */
+static void batadv_dat_entry_release(struct kref *ref)
+{
+	struct batadv_dat_entry *dat_entry;
+
+	dat_entry = container_of(ref, struct batadv_dat_entry, refcount);
+
+	kfree_rcu(dat_entry, rcu);
+}
+
 /**
  * batadv_dat_entry_free_ref - decrement the dat_entry refcounter and possibly
- * free it
- * @dat_entry: the entry to free
+ *  release it
+ * @dat_entry: dat_entry to be free'd
  */
 static void batadv_dat_entry_free_ref(struct batadv_dat_entry *dat_entry)
 {
-	if (atomic_dec_and_test(&dat_entry->refcount))
-		kfree_rcu(dat_entry, rcu);
+	kref_put(&dat_entry->refcount, batadv_dat_entry_release);
 }
 
 /**
@@ -281,7 +295,7 @@ batadv_dat_entry_hash_find(struct batadv_priv *bat_priv, __be32 ip,
 		if (dat_entry->ip != ip)
 			continue;
 
-		if (!atomic_inc_not_zero(&dat_entry->refcount))
+		if (!kref_get_unless_zero(&dat_entry->refcount))
 			continue;
 
 		dat_entry_tmp = dat_entry;
@@ -326,7 +340,8 @@ static void batadv_dat_entry_add(struct batadv_priv *bat_priv, __be32 ip,
 	dat_entry->vid = vid;
 	ether_addr_copy(dat_entry->mac_addr, mac_addr);
 	dat_entry->last_update = jiffies;
-	atomic_set(&dat_entry->refcount, 2);
+	kref_init(&dat_entry->refcount);
+	kref_get(&dat_entry->refcount);
 
 	hash_added = batadv_hash_add(bat_priv->dat.hash, batadv_compare_dat,
 				     batadv_hash_dat, dat_entry,
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index b3d9b5b566a2..9df8f39e4785 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -1230,7 +1230,7 @@ struct batadv_dat_entry {
 	unsigned short vid;
 	unsigned long last_update;
 	struct hlist_node hash_entry;
-	atomic_t refcount;
+	struct kref refcount;
 	struct rcu_head rcu;
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From f7157dd135012d9e9b4d74a9f8b2426ea92131f8 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sat, 16 Jan 2016 10:29:48 +0100
Subject: batman-adv: Convert batadv_tvlv_container to kref

batman-adv uses a self-written reference implementation which is just based
on atomic_t. This is less obvious when reading the code than kref and
therefore increases the change that the reference counting will be missed.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/main.c  | 22 +++++++++++++++++-----
 net/batman-adv/types.h |  2 +-
 2 files changed, 18 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index a9b4f75db874..4b6c2589fa34 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -29,6 +29,7 @@
 #include <linux/ip.h>
 #include <linux/ipv6.h>
 #include <linux/kernel.h>
+#include <linux/kref.h>
 #include <linux/list.h>
 #include <linux/lockdep.h>
 #include <linux/module.h>
@@ -669,15 +670,26 @@ static struct batadv_tvlv_handler
 	return tvlv_handler;
 }
 
+/**
+ * batadv_tvlv_container_release - release tvlv from lists and free
+ * @ref: kref pointer of the tvlv
+ */
+static void batadv_tvlv_container_release(struct kref *ref)
+{
+	struct batadv_tvlv_container *tvlv;
+
+	tvlv = container_of(ref, struct batadv_tvlv_container, refcount);
+	kfree(tvlv);
+}
+
 /**
  * batadv_tvlv_container_free_ref - decrement the tvlv container refcounter and
- *  possibly free it
+ *  possibly release it
  * @tvlv: the tvlv container to free
  */
 static void batadv_tvlv_container_free_ref(struct batadv_tvlv_container *tvlv)
 {
-	if (atomic_dec_and_test(&tvlv->refcount))
-		kfree(tvlv);
+	kref_put(&tvlv->refcount, batadv_tvlv_container_release);
 }
 
 /**
@@ -706,7 +718,7 @@ static struct batadv_tvlv_container
 		if (tvlv_tmp->tvlv_hdr.version != version)
 			continue;
 
-		if (!atomic_inc_not_zero(&tvlv_tmp->refcount))
+		if (!kref_get_unless_zero(&tvlv_tmp->refcount))
 			continue;
 
 		tvlv = tvlv_tmp;
@@ -814,7 +826,7 @@ void batadv_tvlv_container_register(struct batadv_priv *bat_priv,
 
 	memcpy(tvlv_new + 1, tvlv_value, ntohs(tvlv_new->tvlv_hdr.len));
 	INIT_HLIST_NODE(&tvlv_new->list);
-	atomic_set(&tvlv_new->refcount, 1);
+	kref_init(&tvlv_new->refcount);
 
 	spin_lock_bh(&bat_priv->tvlv.container_list_lock);
 	tvlv_old = batadv_tvlv_container_get(bat_priv, type, version);
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 9df8f39e4785..c55925bb34af 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -1266,7 +1266,7 @@ struct batadv_dat_candidate {
 struct batadv_tvlv_container {
 	struct hlist_node list;
 	struct batadv_tvlv_hdr tvlv_hdr;
-	atomic_t refcount;
+	struct kref refcount;
 };
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From 32836f56f88c280d39fc302bcc0e0e3ed6bb412e Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sat, 16 Jan 2016 10:29:49 +0100
Subject: batman-adv: Convert batadv_tvlv_handler to kref

batman-adv uses a self-written reference implementation which is just based
on atomic_t. This is less obvious when reading the code than kref and
therefore increases the change that the reference counting will be missed.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/main.c  | 24 ++++++++++++++++++------
 net/batman-adv/types.h |  2 +-
 2 files changed, 19 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 4b6c2589fa34..568c5503f637 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -625,15 +625,27 @@ __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr)
 }
 
 /**
- * batadv_tvlv_handler_free_ref - decrement the tvlv handler refcounter and
- *  possibly free it
+ * batadv_tvlv_handler_release - release tvlv handler from lists and queue for
+ *  free after rcu grace period
+ * @ref: kref pointer of the tvlv
+ */
+static void batadv_tvlv_handler_release(struct kref *ref)
+{
+	struct batadv_tvlv_handler *tvlv_handler;
+
+	tvlv_handler = container_of(ref, struct batadv_tvlv_handler, refcount);
+	kfree_rcu(tvlv_handler, rcu);
+}
+
+/**
+ * batadv_tvlv_handler_free_ref - decrement the tvlv container refcounter and
+ *  possibly release it
  * @tvlv_handler: the tvlv handler to free
  */
 static void
 batadv_tvlv_handler_free_ref(struct batadv_tvlv_handler *tvlv_handler)
 {
-	if (atomic_dec_and_test(&tvlv_handler->refcount))
-		kfree_rcu(tvlv_handler, rcu);
+	kref_put(&tvlv_handler->refcount, batadv_tvlv_handler_release);
 }
 
 /**
@@ -659,7 +671,7 @@ static struct batadv_tvlv_handler
 		if (tvlv_handler_tmp->version != version)
 			continue;
 
-		if (!atomic_inc_not_zero(&tvlv_handler_tmp->refcount))
+		if (!kref_get_unless_zero(&tvlv_handler_tmp->refcount))
 			continue;
 
 		tvlv_handler = tvlv_handler_tmp;
@@ -1112,7 +1124,7 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv,
 	tvlv_handler->type = type;
 	tvlv_handler->version = version;
 	tvlv_handler->flags = flags;
-	atomic_set(&tvlv_handler->refcount, 1);
+	kref_init(&tvlv_handler->refcount);
 	INIT_HLIST_NODE(&tvlv_handler->list);
 
 	spin_lock_bh(&bat_priv->tvlv.handler_list_lock);
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index c55925bb34af..f89dd977f348 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -1293,7 +1293,7 @@ struct batadv_tvlv_handler {
 	u8 type;
 	u8 version;
 	u8 flags;
-	atomic_t refcount;
+	struct kref refcount;
 	struct rcu_head rcu;
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 6e8ef69dd46044abcee3c3dbc5a42afce229ac48 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sat, 16 Jan 2016 10:29:50 +0100
Subject: batman-adv: Convert batadv_tt_orig_list_entry to kref

batman-adv uses a self-written reference implementation which is just based
on atomic_t. This is less obvious when reading the code than kref and
therefore increases the change that the reference counting will be missed.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/translation-table.c | 26 +++++++++++++++++---------
 net/batman-adv/types.h             |  2 +-
 2 files changed, 18 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 0dc8a5ca33bf..68494a10848c 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -31,6 +31,7 @@
 #include <linux/jhash.h>
 #include <linux/jiffies.h>
 #include <linux/kernel.h>
+#include <linux/kref.h>
 #include <linux/list.h>
 #include <linux/lockdep.h>
 #include <linux/netdevice.h>
@@ -346,22 +347,28 @@ static void batadv_tt_global_size_dec(struct batadv_orig_node *orig_node,
 /**
  * batadv_tt_orig_list_entry_release - release tt orig entry from lists and
  *  queue for free after rcu grace period
- * @orig_entry: tt orig entry to be free'd
+ * @ref: kref pointer of the tt orig entry
  */
-static void
-batadv_tt_orig_list_entry_release(struct batadv_tt_orig_list_entry *orig_entry)
+static void batadv_tt_orig_list_entry_release(struct kref *ref)
 {
+	struct batadv_tt_orig_list_entry *orig_entry;
+
+	orig_entry = container_of(ref, struct batadv_tt_orig_list_entry,
+				  refcount);
+
 	batadv_orig_node_free_ref(orig_entry->orig_node);
 	kfree_rcu(orig_entry, rcu);
 }
 
+/**
+ * batadv_tt_orig_list_entry_free_ref - decrement the tt orig entry refcounter
+ *  and possibly release it
+ * @orig_entry: tt orig entry to be free'd
+ */
 static void
 batadv_tt_orig_list_entry_free_ref(struct batadv_tt_orig_list_entry *orig_entry)
 {
-	if (!atomic_dec_and_test(&orig_entry->refcount))
-		return;
-
-	batadv_tt_orig_list_entry_release(orig_entry);
+	kref_put(&orig_entry->refcount, batadv_tt_orig_list_entry_release);
 }
 
 /**
@@ -1270,7 +1277,7 @@ batadv_tt_global_orig_entry_find(const struct batadv_tt_global_entry *entry,
 	hlist_for_each_entry_rcu(tmp_orig_entry, head, list) {
 		if (tmp_orig_entry->orig_node != orig_node)
 			continue;
-		if (!atomic_inc_not_zero(&tmp_orig_entry->refcount))
+		if (!kref_get_unless_zero(&tmp_orig_entry->refcount))
 			continue;
 
 		orig_entry = tmp_orig_entry;
@@ -1331,7 +1338,8 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global,
 	batadv_tt_global_size_inc(orig_node, tt_global->common.vid);
 	orig_entry->orig_node = orig_node;
 	orig_entry->ttvn = ttvn;
-	atomic_set(&orig_entry->refcount, 2);
+	kref_init(&orig_entry->refcount);
+	kref_get(&orig_entry->refcount);
 
 	spin_lock_bh(&tt_global->list_lock);
 	hlist_add_head_rcu(&orig_entry->list,
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index f89dd977f348..17f97ff5b4d1 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -1014,7 +1014,7 @@ struct batadv_tt_orig_list_entry {
 	struct batadv_orig_node *orig_node;
 	u8 ttvn;
 	struct hlist_node list;
-	atomic_t refcount;
+	struct kref refcount;
 	struct rcu_head rcu;
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 962c68328b14eb311edc950801cda2fceddad2df Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sat, 16 Jan 2016 10:29:51 +0100
Subject: batman-adv: Convert batadv_neigh_ifinfo to kref

batman-adv uses a self-written reference implementation which is just based
on atomic_t. This is less obvious when reading the code than kref and
therefore increases the change that the reference counting will be missed.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/originator.c | 17 ++++++++++-------
 net/batman-adv/types.h      |  2 +-
 2 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 7710595ffc42..1c50abe52c6d 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -172,11 +172,14 @@ err:
 /**
  * batadv_neigh_ifinfo_release - release neigh_ifinfo from lists and queue for
  *  free after rcu grace period
- * @neigh_ifinfo: the neigh_ifinfo object to release
+ * @ref: kref pointer of the neigh_ifinfo
  */
-static void
-batadv_neigh_ifinfo_release(struct batadv_neigh_ifinfo *neigh_ifinfo)
+static void batadv_neigh_ifinfo_release(struct kref *ref)
 {
+	struct batadv_neigh_ifinfo *neigh_ifinfo;
+
+	neigh_ifinfo = container_of(ref, struct batadv_neigh_ifinfo, refcount);
+
 	if (neigh_ifinfo->if_outgoing != BATADV_IF_DEFAULT)
 		batadv_hardif_free_ref(neigh_ifinfo->if_outgoing);
 
@@ -190,8 +193,7 @@ batadv_neigh_ifinfo_release(struct batadv_neigh_ifinfo *neigh_ifinfo)
  */
 void batadv_neigh_ifinfo_free_ref(struct batadv_neigh_ifinfo *neigh_ifinfo)
 {
-	if (atomic_dec_and_test(&neigh_ifinfo->refcount))
-		batadv_neigh_ifinfo_release(neigh_ifinfo);
+	kref_put(&neigh_ifinfo->refcount, batadv_neigh_ifinfo_release);
 }
 
 /**
@@ -405,7 +407,7 @@ batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh,
 		if (tmp_neigh_ifinfo->if_outgoing != if_outgoing)
 			continue;
 
-		if (!atomic_inc_not_zero(&tmp_neigh_ifinfo->refcount))
+		if (!kref_get_unless_zero(&tmp_neigh_ifinfo->refcount))
 			continue;
 
 		neigh_ifinfo = tmp_neigh_ifinfo;
@@ -450,7 +452,8 @@ batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh,
 	}
 
 	INIT_HLIST_NODE(&neigh_ifinfo->list);
-	atomic_set(&neigh_ifinfo->refcount, 2);
+	kref_init(&neigh_ifinfo->refcount);
+	kref_get(&neigh_ifinfo->refcount);
 	neigh_ifinfo->if_outgoing = if_outgoing;
 
 	hlist_add_head_rcu(&neigh_ifinfo->list, &neigh->ifinfo_list);
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 17f97ff5b4d1..81d9f8d6c54b 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -420,7 +420,7 @@ struct batadv_neigh_ifinfo {
 	struct batadv_hard_iface *if_outgoing;
 	struct batadv_neigh_ifinfo_bat_iv bat_iv;
 	u8 last_ttl;
-	atomic_t refcount;
+	struct kref refcount;
 	struct rcu_head rcu;
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From a6ba0d340db63b95cd07484370cca69929d8f4e4 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sat, 16 Jan 2016 10:29:52 +0100
Subject: batman-adv: Convert batadv_orig_ifinfo to kref

batman-adv uses a self-written reference implementation which is just based
on atomic_t. This is less obvious when reading the code than kref and
therefore increases the change that the reference counting will be missed.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/originator.c | 15 +++++++++------
 net/batman-adv/routing.c    |  5 +++--
 net/batman-adv/types.h      |  2 +-
 3 files changed, 13 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 1c50abe52c6d..428983a2036f 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -326,7 +326,7 @@ batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node,
 		if (tmp->if_outgoing != if_outgoing)
 			continue;
 
-		if (!atomic_inc_not_zero(&tmp->refcount))
+		if (!kref_get_unless_zero(&tmp->refcount))
 			continue;
 
 		orig_ifinfo = tmp;
@@ -377,7 +377,8 @@ batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node,
 	orig_ifinfo->batman_seqno_reset = reset_time;
 	orig_ifinfo->if_outgoing = if_outgoing;
 	INIT_HLIST_NODE(&orig_ifinfo->list);
-	atomic_set(&orig_ifinfo->refcount, 2);
+	kref_init(&orig_ifinfo->refcount);
+	kref_get(&orig_ifinfo->refcount);
 	hlist_add_head_rcu(&orig_ifinfo->list,
 			   &orig_node->ifinfo_list);
 out:
@@ -704,12 +705,15 @@ int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset)
 /**
  * batadv_orig_ifinfo_release - release orig_ifinfo from lists and queue for
  *  free after rcu grace period
- * @orig_ifinfo: the orig_ifinfo object to release
+ * @ref: kref pointer of the orig_ifinfo
  */
-static void batadv_orig_ifinfo_release(struct batadv_orig_ifinfo *orig_ifinfo)
+static void batadv_orig_ifinfo_release(struct kref *ref)
 {
+	struct batadv_orig_ifinfo *orig_ifinfo;
 	struct batadv_neigh_node *router;
 
+	orig_ifinfo = container_of(ref, struct batadv_orig_ifinfo, refcount);
+
 	if (orig_ifinfo->if_outgoing != BATADV_IF_DEFAULT)
 		batadv_hardif_free_ref(orig_ifinfo->if_outgoing);
 
@@ -728,8 +732,7 @@ static void batadv_orig_ifinfo_release(struct batadv_orig_ifinfo *orig_ifinfo)
  */
 void batadv_orig_ifinfo_free_ref(struct batadv_orig_ifinfo *orig_ifinfo)
 {
-	if (atomic_dec_and_test(&orig_ifinfo->refcount))
-		batadv_orig_ifinfo_release(orig_ifinfo);
+	kref_put(&orig_ifinfo->refcount, batadv_orig_ifinfo_release);
 }
 
 /**
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index 1c8b35df50cf..4a5cd8bf2661 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -25,6 +25,7 @@
 #include <linux/etherdevice.h>
 #include <linux/if_ether.h>
 #include <linux/jiffies.h>
+#include <linux/kref.h>
 #include <linux/netdevice.h>
 #include <linux/printk.h>
 #include <linux/rculist.h>
@@ -497,7 +498,7 @@ batadv_find_router(struct batadv_priv *bat_priv,
 
 	hlist_for_each_entry_rcu(cand, &orig_node->ifinfo_list, list) {
 		/* acquire some structures and references ... */
-		if (!atomic_inc_not_zero(&cand->refcount))
+		if (!kref_get_unless_zero(&cand->refcount))
 			continue;
 
 		cand_router = rcu_dereference(cand->router);
@@ -524,7 +525,7 @@ batadv_find_router(struct batadv_priv *bat_priv,
 		/* mark the first possible candidate */
 		if (!first_candidate) {
 			atomic_inc(&cand_router->refcount);
-			atomic_inc(&cand->refcount);
+			kref_get(&cand->refcount);
 			first_candidate = cand;
 			first_candidate_router = cand_router;
 		}
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 81d9f8d6c54b..6193c01b47d2 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -141,7 +141,7 @@ struct batadv_orig_ifinfo {
 	u32 last_real_seqno;
 	u8 last_ttl;
 	unsigned long batman_seqno_reset;
-	atomic_t refcount;
+	struct kref refcount;
 	struct rcu_head rcu;
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 77ae32e8989a26d24bd8007be3b9dc4a348175a9 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sat, 16 Jan 2016 10:29:53 +0100
Subject: batman-adv: Convert batadv_neigh_node to kref

batman-adv uses a self-written reference implementation which is just based
on atomic_t. This is less obvious when reading the code than kref and
therefore increases the change that the reference counting will be missed.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/bat_iv_ogm.c |  5 +++--
 net/batman-adv/originator.c | 22 ++++++++++++----------
 net/batman-adv/routing.c    |  8 ++++----
 net/batman-adv/types.h      |  2 +-
 4 files changed, 20 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 23ce90e21a40..affcbb571426 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -31,6 +31,7 @@
 #include <linux/init.h>
 #include <linux/jiffies.h>
 #include <linux/list.h>
+#include <linux/kref.h>
 #include <linux/netdevice.h>
 #include <linux/pkt_sched.h>
 #include <linux/printk.h>
@@ -1002,7 +1003,7 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv,
 		neigh_addr = tmp_neigh_node->addr;
 		if (batadv_compare_eth(neigh_addr, ethhdr->h_source) &&
 		    tmp_neigh_node->if_incoming == if_incoming &&
-		    atomic_inc_not_zero(&tmp_neigh_node->refcount)) {
+		    kref_get_unless_zero(&tmp_neigh_node->refcount)) {
 			if (WARN(neigh_node, "too many matching neigh_nodes"))
 				batadv_neigh_node_free_ref(neigh_node);
 			neigh_node = tmp_neigh_node;
@@ -1161,7 +1162,7 @@ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
 		if (tmp_neigh_node->if_incoming != if_incoming)
 			continue;
 
-		if (!atomic_inc_not_zero(&tmp_neigh_node->refcount))
+		if (!kref_get_unless_zero(&tmp_neigh_node->refcount))
 			continue;
 
 		neigh_node = tmp_neigh_node;
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 428983a2036f..9e3dbd88c69e 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -229,15 +229,17 @@ void batadv_hardif_neigh_free_ref(struct batadv_hardif_neigh_node *hardif_neigh)
 /**
  * batadv_neigh_node_release - release neigh_node from lists and queue for
  *  free after rcu grace period
- * @neigh_node: neigh neighbor to free
+ * @ref: kref pointer of the neigh_node
  */
-static void batadv_neigh_node_release(struct batadv_neigh_node *neigh_node)
+static void batadv_neigh_node_release(struct kref *ref)
 {
 	struct hlist_node *node_tmp;
+	struct batadv_neigh_node *neigh_node;
 	struct batadv_hardif_neigh_node *hardif_neigh;
 	struct batadv_neigh_ifinfo *neigh_ifinfo;
 	struct batadv_algo_ops *bao;
 
+	neigh_node = container_of(ref, struct batadv_neigh_node, refcount);
 	bao = neigh_node->orig_node->bat_priv->bat_algo_ops;
 
 	hlist_for_each_entry_safe(neigh_ifinfo, node_tmp,
@@ -262,14 +264,13 @@ static void batadv_neigh_node_release(struct batadv_neigh_node *neigh_node)
 }
 
 /**
- * batadv_neigh_node_free_ref - decrement the neighbors refcounter
- *  and possibly release it
+ * batadv_neigh_node_free_ref - decrement the neighbors refcounter and possibly
+ *  release it
  * @neigh_node: neigh neighbor to free
  */
 void batadv_neigh_node_free_ref(struct batadv_neigh_node *neigh_node)
 {
-	if (atomic_dec_and_test(&neigh_node->refcount))
-		batadv_neigh_node_release(neigh_node);
+	kref_put(&neigh_node->refcount, batadv_neigh_node_release);
 }
 
 /**
@@ -298,7 +299,7 @@ batadv_orig_router_get(struct batadv_orig_node *orig_node,
 		break;
 	}
 
-	if (router && !atomic_inc_not_zero(&router->refcount))
+	if (router && !kref_get_unless_zero(&router->refcount))
 		router = NULL;
 
 	rcu_read_unlock();
@@ -491,7 +492,7 @@ batadv_neigh_node_get(const struct batadv_orig_node *orig_node,
 		if (tmp_neigh_node->if_incoming != hard_iface)
 			continue;
 
-		if (!atomic_inc_not_zero(&tmp_neigh_node->refcount))
+		if (!kref_get_unless_zero(&tmp_neigh_node->refcount))
 			continue;
 
 		res = tmp_neigh_node;
@@ -649,7 +650,8 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node,
 	neigh_node->orig_node = orig_node;
 
 	/* extra reference for return */
-	atomic_set(&neigh_node->refcount, 2);
+	kref_init(&neigh_node->refcount);
+	kref_get(&neigh_node->refcount);
 
 	spin_lock_bh(&orig_node->neigh_list_lock);
 	hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list);
@@ -1084,7 +1086,7 @@ batadv_find_best_neighbor(struct batadv_priv *bat_priv,
 						best, if_outgoing) <= 0))
 			continue;
 
-		if (!atomic_inc_not_zero(&neigh->refcount))
+		if (!kref_get_unless_zero(&neigh->refcount))
 			continue;
 
 		if (best)
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index 4a5cd8bf2661..205310b56c2b 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -73,7 +73,7 @@ static void _batadv_update_route(struct batadv_priv *bat_priv,
 
 	rcu_read_lock();
 	curr_router = rcu_dereference(orig_ifinfo->router);
-	if (curr_router && !atomic_inc_not_zero(&curr_router->refcount))
+	if (curr_router && !kref_get_unless_zero(&curr_router->refcount))
 		curr_router = NULL;
 	rcu_read_unlock();
 
@@ -101,7 +101,7 @@ static void _batadv_update_route(struct batadv_priv *bat_priv,
 		batadv_neigh_node_free_ref(curr_router);
 
 	/* increase refcount of new best neighbor */
-	if (neigh_node && !atomic_inc_not_zero(&neigh_node->refcount))
+	if (neigh_node && !kref_get_unless_zero(&neigh_node->refcount))
 		neigh_node = NULL;
 
 	spin_lock_bh(&orig_node->neigh_list_lock);
@@ -505,7 +505,7 @@ batadv_find_router(struct batadv_priv *bat_priv,
 		if (!cand_router)
 			goto next;
 
-		if (!atomic_inc_not_zero(&cand_router->refcount)) {
+		if (!kref_get_unless_zero(&cand_router->refcount)) {
 			cand_router = NULL;
 			goto next;
 		}
@@ -524,7 +524,7 @@ batadv_find_router(struct batadv_priv *bat_priv,
 
 		/* mark the first possible candidate */
 		if (!first_candidate) {
-			atomic_inc(&cand_router->refcount);
+			kref_get(&cand_router->refcount);
 			kref_get(&cand->refcount);
 			first_candidate = cand;
 			first_candidate_router = cand_router;
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 6193c01b47d2..68ca39e1713d 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -384,7 +384,7 @@ struct batadv_neigh_node {
 	spinlock_t ifinfo_lock;	/* protects ifinfo_list and its members */
 	struct batadv_hard_iface *if_incoming;
 	unsigned long last_seen;
-	atomic_t refcount;
+	struct kref refcount;
 	struct rcu_head rcu;
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 7a659d5694b9b580615f0a813b1e6675f26c629e Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sat, 16 Jan 2016 10:29:54 +0100
Subject: batman-adv: Convert batadv_hard_iface to kref

batman-adv uses a self-written reference implementation which is just based
on atomic_t. This is less obvious when reading the code than kref and
therefore increases the change that the reference counting will be missed.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/bat_iv_ogm.c     |  4 ++--
 net/batman-adv/hard-interface.c | 20 +++++++++++++-------
 net/batman-adv/hard-interface.h | 16 +++++++---------
 net/batman-adv/originator.c     |  8 ++++----
 net/batman-adv/types.h          |  2 +-
 5 files changed, 27 insertions(+), 23 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index affcbb571426..bf0e7d6f12bb 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -644,10 +644,10 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff,
 	unsigned char *skb_buff;
 	unsigned int skb_size;
 
-	if (!atomic_inc_not_zero(&if_incoming->refcount))
+	if (!kref_get_unless_zero(&if_incoming->refcount))
 		return;
 
-	if (!atomic_inc_not_zero(&if_outgoing->refcount))
+	if (!kref_get_unless_zero(&if_outgoing->refcount))
 		goto out_free_incoming;
 
 	/* own packet should always be scheduled */
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index b17e272b5cb3..fb2d9c058ed0 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -18,6 +18,7 @@
 #include "hard-interface.h"
 #include "main.h"
 
+#include <linux/atomic.h>
 #include <linux/bug.h>
 #include <linux/byteorder/generic.h>
 #include <linux/errno.h>
@@ -26,6 +27,7 @@
 #include <linux/if_ether.h>
 #include <linux/if.h>
 #include <linux/kernel.h>
+#include <linux/kref.h>
 #include <linux/list.h>
 #include <linux/netdevice.h>
 #include <linux/printk.h>
@@ -50,10 +52,13 @@
 /**
  * batadv_hardif_release - release hard interface from lists and queue for
  *  free after rcu grace period
- * @hard_iface: the hard interface to free
+ * @ref: kref pointer of the hard interface
  */
-void batadv_hardif_release(struct batadv_hard_iface *hard_iface)
+void batadv_hardif_release(struct kref *ref)
 {
+	struct batadv_hard_iface *hard_iface;
+
+	hard_iface = container_of(ref, struct batadv_hard_iface, refcount);
 	dev_put(hard_iface->net_dev);
 
 	kfree_rcu(hard_iface, rcu);
@@ -67,7 +72,7 @@ batadv_hardif_get_by_netdev(const struct net_device *net_dev)
 	rcu_read_lock();
 	list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
 		if (hard_iface->net_dev == net_dev &&
-		    atomic_inc_not_zero(&hard_iface->refcount))
+		    kref_get_unless_zero(&hard_iface->refcount))
 			goto out;
 	}
 
@@ -172,7 +177,7 @@ batadv_hardif_get_active(const struct net_device *soft_iface)
 			continue;
 
 		if (hard_iface->if_status == BATADV_IF_ACTIVE &&
-		    atomic_inc_not_zero(&hard_iface->refcount))
+		    kref_get_unless_zero(&hard_iface->refcount))
 			goto out;
 	}
 
@@ -206,7 +211,7 @@ static void batadv_primary_if_select(struct batadv_priv *bat_priv,
 
 	ASSERT_RTNL();
 
-	if (new_hard_iface && !atomic_inc_not_zero(&new_hard_iface->refcount))
+	if (new_hard_iface && !kref_get_unless_zero(&new_hard_iface->refcount))
 		new_hard_iface = NULL;
 
 	curr_hard_iface = rcu_dereference_protected(bat_priv->primary_if, 1);
@@ -434,7 +439,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
 	if (hard_iface->if_status != BATADV_IF_NOT_IN_USE)
 		goto out;
 
-	if (!atomic_inc_not_zero(&hard_iface->refcount))
+	if (!kref_get_unless_zero(&hard_iface->refcount))
 		goto out;
 
 	soft_iface = dev_get_by_name(&init_net, iface_name);
@@ -655,7 +660,8 @@ batadv_hardif_add_interface(struct net_device *net_dev)
 		hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS;
 
 	/* extra reference for return */
-	atomic_set(&hard_iface->refcount, 2);
+	kref_init(&hard_iface->refcount);
+	kref_get(&hard_iface->refcount);
 
 	batadv_check_known_mac_addr(hard_iface->net_dev);
 	list_add_tail_rcu(&hard_iface->list, &batadv_hardif_list);
diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h
index 9eb08ad6032d..5cecc6bc1b1e 100644
--- a/net/batman-adv/hard-interface.h
+++ b/net/batman-adv/hard-interface.h
@@ -20,8 +20,8 @@
 
 #include "main.h"
 
-#include <linux/atomic.h>
 #include <linux/compiler.h>
+#include <linux/kref.h>
 #include <linux/notifier.h>
 #include <linux/rcupdate.h>
 #include <linux/stddef.h>
@@ -61,18 +61,16 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface,
 void batadv_hardif_remove_interfaces(void);
 int batadv_hardif_min_mtu(struct net_device *soft_iface);
 void batadv_update_min_mtu(struct net_device *soft_iface);
-void batadv_hardif_release(struct batadv_hard_iface *hard_iface);
+void batadv_hardif_release(struct kref *ref);
 
 /**
- * batadv_hardif_free_ref - decrement the hard interface refcounter and
- *  possibly release it
+ * batadv_hardif_free_ref - decrement the hard interface refcounter and possibly
+ *  release it
  * @hard_iface: the hard interface to free
  */
-static inline void
-batadv_hardif_free_ref(struct batadv_hard_iface *hard_iface)
+static inline void batadv_hardif_free_ref(struct batadv_hard_iface *hard_iface)
 {
-	if (atomic_dec_and_test(&hard_iface->refcount))
-		batadv_hardif_release(hard_iface);
+	kref_put(&hard_iface->refcount, batadv_hardif_release);
 }
 
 static inline struct batadv_hard_iface *
@@ -85,7 +83,7 @@ batadv_primary_if_get_selected(struct batadv_priv *bat_priv)
 	if (!hard_iface)
 		goto out;
 
-	if (!atomic_inc_not_zero(&hard_iface->refcount))
+	if (!kref_get_unless_zero(&hard_iface->refcount))
 		hard_iface = NULL;
 
 out:
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 9e3dbd88c69e..bf27007a574a 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -367,7 +367,7 @@ batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node,
 		goto out;
 
 	if (if_outgoing != BATADV_IF_DEFAULT &&
-	    !atomic_inc_not_zero(&if_outgoing->refcount)) {
+	    !kref_get_unless_zero(&if_outgoing->refcount)) {
 		kfree(orig_ifinfo);
 		orig_ifinfo = NULL;
 		goto out;
@@ -447,7 +447,7 @@ batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh,
 	if (!neigh_ifinfo)
 		goto out;
 
-	if (if_outgoing && !atomic_inc_not_zero(&if_outgoing->refcount)) {
+	if (if_outgoing && !kref_get_unless_zero(&if_outgoing->refcount)) {
 		kfree(neigh_ifinfo);
 		neigh_ifinfo = NULL;
 		goto out;
@@ -524,7 +524,7 @@ batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface,
 	if (hardif_neigh)
 		goto out;
 
-	if (!atomic_inc_not_zero(&hard_iface->refcount))
+	if (!kref_get_unless_zero(&hard_iface->refcount))
 		goto out;
 
 	hardif_neigh = kzalloc(sizeof(*hardif_neigh), GFP_ATOMIC);
@@ -635,7 +635,7 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node,
 	if (!neigh_node)
 		goto out;
 
-	if (!atomic_inc_not_zero(&hard_iface->refcount)) {
+	if (!kref_get_unless_zero(&hard_iface->refcount)) {
 		kfree(neigh_node);
 		neigh_node = NULL;
 		goto out;
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 68ca39e1713d..74cb26372ed1 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -111,7 +111,7 @@ struct batadv_hard_iface {
 	struct net_device *net_dev;
 	u8 num_bcasts;
 	struct kobject *hardif_obj;
-	atomic_t refcount;
+	struct kref refcount;
 	struct packet_type batman_adv_ptype;
 	struct net_device *soft_iface;
 	struct rcu_head rcu;
-- 
cgit v1.2.3-59-g8ed1b


From 161a3be9324eaed3ecd1879c994d58b16bbf4732 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sat, 16 Jan 2016 10:29:55 +0100
Subject: batman-adv: Convert batadv_orig_node_vlan to kref

batman-adv uses a self-written reference implementation which is just based
on atomic_t. This is less obvious when reading the code than kref and
therefore increases the change that the reference counting will be missed.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/originator.c | 26 ++++++++++++++++++++------
 net/batman-adv/types.h      |  2 +-
 2 files changed, 21 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index bf27007a574a..8b2ef8de93e9 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -82,7 +82,7 @@ batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node,
 		if (tmp->vid != vid)
 			continue;
 
-		if (!atomic_inc_not_zero(&tmp->refcount))
+		if (!kref_get_unless_zero(&tmp->refcount))
 			continue;
 
 		vlan = tmp;
@@ -123,7 +123,8 @@ batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node,
 	if (!vlan)
 		goto out;
 
-	atomic_set(&vlan->refcount, 2);
+	kref_init(&vlan->refcount);
+	kref_get(&vlan->refcount);
 	vlan->vid = vid;
 
 	hlist_add_head_rcu(&vlan->list, &orig_node->vlan_list);
@@ -135,14 +136,27 @@ out:
 }
 
 /**
- * batadv_orig_node_vlan_free_ref - decrement the refcounter and possibly free
- *  the originator-vlan object
+ * batadv_orig_node_vlan_release - release originator-vlan object from lists
+ *  and queue for free after rcu grace period
+ * @ref: kref pointer of the originator-vlan object
+ */
+static void batadv_orig_node_vlan_release(struct kref *ref)
+{
+	struct batadv_orig_node_vlan *orig_vlan;
+
+	orig_vlan = container_of(ref, struct batadv_orig_node_vlan, refcount);
+
+	kfree_rcu(orig_vlan, rcu);
+}
+
+/**
+ * batadv_orig_node_vlan_free_ref - decrement the refcounter and possibly
+ *  release the originator-vlan object
  * @orig_vlan: the originator-vlan object to release
  */
 void batadv_orig_node_vlan_free_ref(struct batadv_orig_node_vlan *orig_vlan)
 {
-	if (atomic_dec_and_test(&orig_vlan->refcount))
-		kfree_rcu(orig_vlan, rcu);
+	kref_put(&orig_vlan->refcount, batadv_orig_node_vlan_release);
 }
 
 int batadv_originator_init(struct batadv_priv *bat_priv)
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 74cb26372ed1..504f3dbdbcc1 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -197,7 +197,7 @@ struct batadv_orig_node_vlan {
 	unsigned short vid;
 	struct batadv_vlan_tt tt;
 	struct hlist_node list;
-	atomic_t refcount;
+	struct kref refcount;
 	struct rcu_head rcu;
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 7c12439115da77389733d9bf313f64eedaec3b73 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sat, 16 Jan 2016 10:29:56 +0100
Subject: batman-adv: Convert batadv_orig_node to kref

batman-adv uses a self-written reference implementation which is just based
on atomic_t. This is less obvious when reading the code than kref and
therefore increases the change that the reference counting will be missed.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/distributed-arp-table.c |  2 +-
 net/batman-adv/gateway_client.c        |  4 ++--
 net/batman-adv/multicast.c             |  7 ++++---
 net/batman-adv/network-coding.c        |  2 +-
 net/batman-adv/originator.c            | 14 +++++++++-----
 net/batman-adv/originator.h            |  4 ++--
 net/batman-adv/translation-table.c     |  4 ++--
 net/batman-adv/types.h                 |  2 +-
 8 files changed, 22 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index 921e25255cab..e3261118130a 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -542,7 +542,7 @@ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv,
 							  max_orig_node))
 				continue;
 
-			if (!atomic_inc_not_zero(&orig_node->refcount))
+			if (!kref_get_unless_zero(&orig_node->refcount))
 				continue;
 
 			max = tmp_max;
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index 1f3847c61db8..261866e38502 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -118,7 +118,7 @@ batadv_gw_get_selected_orig(struct batadv_priv *bat_priv)
 	if (!orig_node)
 		goto unlock;
 
-	if (!atomic_inc_not_zero(&orig_node->refcount))
+	if (!kref_get_unless_zero(&orig_node->refcount))
 		orig_node = NULL;
 
 unlock:
@@ -441,7 +441,7 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv,
 	if (gateway->bandwidth_down == 0)
 		return;
 
-	if (!atomic_inc_not_zero(&orig_node->refcount))
+	if (!kref_get_unless_zero(&orig_node->refcount))
 		return;
 
 	gw_node = kzalloc(sizeof(*gw_node), GFP_ATOMIC);
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index 155565e0fecc..8caa2c72efa3 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -30,6 +30,7 @@
 #include <linux/in.h>
 #include <linux/ip.h>
 #include <linux/ipv6.h>
+#include <linux/kref.h>
 #include <linux/list.h>
 #include <linux/lockdep.h>
 #include <linux/netdevice.h>
@@ -447,7 +448,7 @@ batadv_mcast_forw_ipv4_node_get(struct batadv_priv *bat_priv)
 	hlist_for_each_entry_rcu(tmp_orig_node,
 				 &bat_priv->mcast.want_all_ipv4_list,
 				 mcast_want_all_ipv4_node) {
-		if (!atomic_inc_not_zero(&tmp_orig_node->refcount))
+		if (!kref_get_unless_zero(&tmp_orig_node->refcount))
 			continue;
 
 		orig_node = tmp_orig_node;
@@ -474,7 +475,7 @@ batadv_mcast_forw_ipv6_node_get(struct batadv_priv *bat_priv)
 	hlist_for_each_entry_rcu(tmp_orig_node,
 				 &bat_priv->mcast.want_all_ipv6_list,
 				 mcast_want_all_ipv6_node) {
-		if (!atomic_inc_not_zero(&tmp_orig_node->refcount))
+		if (!kref_get_unless_zero(&tmp_orig_node->refcount))
 			continue;
 
 		orig_node = tmp_orig_node;
@@ -525,7 +526,7 @@ batadv_mcast_forw_unsnoop_node_get(struct batadv_priv *bat_priv)
 	hlist_for_each_entry_rcu(tmp_orig_node,
 				 &bat_priv->mcast.want_all_unsnoopables_list,
 				 mcast_want_all_unsnoopables_node) {
-		if (!atomic_inc_not_zero(&tmp_orig_node->refcount))
+		if (!kref_get_unless_zero(&tmp_orig_node->refcount))
 			continue;
 
 		orig_node = tmp_orig_node;
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index 371fc6bbf9d6..a4eb8ee4abb1 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -858,7 +858,7 @@ static struct batadv_nc_node
 	if (!nc_node)
 		return NULL;
 
-	if (!atomic_inc_not_zero(&orig_neigh_node->refcount))
+	if (!kref_get_unless_zero(&orig_neigh_node->refcount))
 		goto free;
 
 	/* Initialize nc_node */
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 8b2ef8de93e9..eacd0e5a0238 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -18,6 +18,7 @@
 #include "originator.h"
 #include "main.h"
 
+#include <linux/atomic.h>
 #include <linux/errno.h>
 #include <linux/etherdevice.h>
 #include <linux/fs.h>
@@ -775,14 +776,17 @@ static void batadv_orig_node_free_rcu(struct rcu_head *rcu)
 /**
  * batadv_orig_node_release - release orig_node from lists and queue for
  *  free after rcu grace period
- * @orig_node: the orig node to free
+ * @ref: kref pointer of the orig_node
  */
-static void batadv_orig_node_release(struct batadv_orig_node *orig_node)
+static void batadv_orig_node_release(struct kref *ref)
 {
 	struct hlist_node *node_tmp;
 	struct batadv_neigh_node *neigh_node;
+	struct batadv_orig_node *orig_node;
 	struct batadv_orig_ifinfo *orig_ifinfo;
 
+	orig_node = container_of(ref, struct batadv_orig_node, refcount);
+
 	spin_lock_bh(&orig_node->neigh_list_lock);
 
 	/* for all neighbors towards this originator ... */
@@ -812,8 +816,7 @@ static void batadv_orig_node_release(struct batadv_orig_node *orig_node)
  */
 void batadv_orig_node_free_ref(struct batadv_orig_node *orig_node)
 {
-	if (atomic_dec_and_test(&orig_node->refcount))
-		batadv_orig_node_release(orig_node);
+	kref_put(&orig_node->refcount, batadv_orig_node_release);
 }
 
 void batadv_originator_free(struct batadv_priv *bat_priv)
@@ -885,7 +888,8 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv,
 	batadv_nc_init_orig(orig_node);
 
 	/* extra reference for return */
-	atomic_set(&orig_node->refcount, 2);
+	kref_init(&orig_node->refcount);
+	kref_get(&orig_node->refcount);
 
 	orig_node->bat_priv = bat_priv;
 	ether_addr_copy(orig_node->orig, addr);
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index 745b4e4fcdc4..99507408f4cf 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -20,10 +20,10 @@
 
 #include "main.h"
 
-#include <linux/atomic.h>
 #include <linux/compiler.h>
 #include <linux/if_ether.h>
 #include <linux/jhash.h>
+#include <linux/kref.h>
 #include <linux/rculist.h>
 #include <linux/rcupdate.h>
 #include <linux/stddef.h>
@@ -115,7 +115,7 @@ batadv_orig_hash_find(struct batadv_priv *bat_priv, const void *data)
 		if (!batadv_compare_eth(orig_node, data))
 			continue;
 
-		if (!atomic_inc_not_zero(&orig_node->refcount))
+		if (!kref_get_unless_zero(&orig_node->refcount))
 			continue;
 
 		orig_node_tmp = orig_node;
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 68494a10848c..96b1571213bd 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -1334,7 +1334,7 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global,
 		goto out;
 
 	INIT_HLIST_NODE(&orig_entry->list);
-	atomic_inc(&orig_node->refcount);
+	kref_get(&orig_node->refcount);
 	batadv_tt_global_size_inc(orig_node, tt_global->common.vid);
 	orig_entry->orig_node = orig_node;
 	orig_entry->ttvn = ttvn;
@@ -2097,7 +2097,7 @@ struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv,
 	/* found anything? */
 	if (best_entry)
 		orig_node = best_entry->orig_node;
-	if (orig_node && !atomic_inc_not_zero(&orig_node->refcount))
+	if (orig_node && !kref_get_unless_zero(&orig_node->refcount))
 		orig_node = NULL;
 	rcu_read_unlock();
 
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 504f3dbdbcc1..aea82a121c02 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -299,7 +299,7 @@ struct batadv_orig_node {
 	struct batadv_priv *bat_priv;
 	/* bcast_seqno_lock protects: bcast_bits & last_bcast_seqno */
 	spinlock_t bcast_seqno_lock;
-	atomic_t refcount;
+	struct kref refcount;
 	struct rcu_head rcu;
 #ifdef CONFIG_BATMAN_ADV_NC
 	struct list_head in_coding_list;
-- 
cgit v1.2.3-59-g8ed1b


From 92dcdf09a139e51b7b043b9443b8fd69de298dff Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sat, 16 Jan 2016 10:29:57 +0100
Subject: batman-adv: Convert batadv_tt_common_entry to kref

batman-adv uses a self-written reference implementation which is just based
on atomic_t. This is less obvious when reading the code than kref and
therefore increases the change that the reference counting will be missed.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/translation-table.c | 60 ++++++++++++++++++++++++++++++--------
 net/batman-adv/types.h             |  2 +-
 2 files changed, 49 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 96b1571213bd..11882793f0a4 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -142,7 +142,7 @@ batadv_tt_hash_find(struct batadv_hashtable *hash, const u8 *addr,
 		if (tt->vid != vid)
 			continue;
 
-		if (!atomic_inc_not_zero(&tt->refcount))
+		if (!kref_get_unless_zero(&tt->refcount))
 			continue;
 
 		tt_tmp = tt;
@@ -203,25 +203,59 @@ batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr,
 	return tt_global_entry;
 }
 
+/**
+ * batadv_tt_local_entry_release - release tt_local_entry from lists and queue
+ *  for free after rcu grace period
+ * @ref: kref pointer of the nc_node
+ */
+static void batadv_tt_local_entry_release(struct kref *ref)
+{
+	struct batadv_tt_local_entry *tt_local_entry;
+
+	tt_local_entry = container_of(ref, struct batadv_tt_local_entry,
+				      common.refcount);
+
+	kfree_rcu(tt_local_entry, common.rcu);
+}
+
+/**
+ * batadv_tt_local_entry_free_ref - decrement the tt_local_entry refcounter and
+ *  possibly release it
+ * @tt_local_entry: tt_local_entry to be free'd
+ */
 static void
 batadv_tt_local_entry_free_ref(struct batadv_tt_local_entry *tt_local_entry)
 {
-	if (atomic_dec_and_test(&tt_local_entry->common.refcount))
-		kfree_rcu(tt_local_entry, common.rcu);
+	kref_put(&tt_local_entry->common.refcount,
+		 batadv_tt_local_entry_release);
 }
 
 /**
- * batadv_tt_global_entry_free_ref - decrement the refcounter for a
- *  tt_global_entry and possibly free it
- * @tt_global_entry: the object to free
+ * batadv_tt_global_entry_release - release tt_global_entry from lists and queue
+ *  for free after rcu grace period
+ * @ref: kref pointer of the nc_node
+ */
+static void batadv_tt_global_entry_release(struct kref *ref)
+{
+	struct batadv_tt_global_entry *tt_global_entry;
+
+	tt_global_entry = container_of(ref, struct batadv_tt_global_entry,
+				       common.refcount);
+
+	batadv_tt_global_del_orig_list(tt_global_entry);
+	kfree_rcu(tt_global_entry, common.rcu);
+}
+
+/**
+ * batadv_tt_global_entry_free_ref - decrement the tt_global_entry refcounter
+ *  and possibly release it
+ * @tt_global_entry: tt_global_entry to be free'd
  */
 static void
 batadv_tt_global_entry_free_ref(struct batadv_tt_global_entry *tt_global_entry)
 {
-	if (atomic_dec_and_test(&tt_global_entry->common.refcount)) {
-		batadv_tt_global_del_orig_list(tt_global_entry);
-		kfree_rcu(tt_global_entry, common.rcu);
-	}
+	kref_put(&tt_global_entry->common.refcount,
+		 batadv_tt_global_entry_release);
 }
 
 /**
@@ -633,7 +667,8 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
 	tt_local->common.vid = vid;
 	if (batadv_is_wifi_netdev(in_dev))
 		tt_local->common.flags |= BATADV_TT_CLIENT_WIFI;
-	atomic_set(&tt_local->common.refcount, 2);
+	kref_init(&tt_local->common.refcount);
+	kref_get(&tt_local->common.refcount);
 	tt_local->last_seen = jiffies;
 	tt_local->common.added_at = tt_local->last_seen;
 
@@ -1415,7 +1450,8 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv,
 		 */
 		if (flags & BATADV_TT_CLIENT_ROAM)
 			tt_global_entry->roam_at = jiffies;
-		atomic_set(&common->refcount, 2);
+		kref_init(&common->refcount);
+		kref_get(&common->refcount);
 		common->added_at = jiffies;
 
 		INIT_HLIST_HEAD(&tt_global_entry->orig_list);
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index aea82a121c02..612de23178e6 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -972,7 +972,7 @@ struct batadv_tt_common_entry {
 	struct hlist_node hash_entry;
 	u16 flags;
 	unsigned long added_at;
-	atomic_t refcount;
+	struct kref refcount;
 	struct rcu_head rcu;
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 086c653f5862591a9cfe2386f5650d03adacc33a Mon Sep 17 00:00:00 2001
From: Craig Gallek <kraig@google.com>
Date: Wed, 10 Feb 2016 11:50:35 -0500
Subject: sock: struct proto hash function may error

In order to support fast reuseport lookups in TCP, the hash function
defined in struct proto must be capable of returning an error code.
This patch changes the function signature of all related hash functions
to return an integer and handles or propagates this return value at
all call sites.

Signed-off-by: Craig Gallek <kraig@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_hashtables.h   |  2 +-
 include/net/phonet/phonet.h     |  2 +-
 include/net/ping.h              |  2 +-
 include/net/raw.h               |  2 +-
 include/net/sock.h              |  6 +++---
 include/net/udp.h               |  3 ++-
 net/ieee802154/socket.c         | 17 +++++++++++++----
 net/ipv4/af_inet.c              |  9 ++++++---
 net/ipv4/inet_connection_sock.c |  8 +++++---
 net/ipv4/inet_hashtables.c      |  4 +++-
 net/ipv4/ping.c                 |  4 +++-
 net/ipv4/raw.c                  |  4 +++-
 net/ipv6/af_inet6.c             |  6 +++++-
 net/phonet/socket.c             |  6 ++++--
 net/sctp/socket.c               |  3 ++-
 15 files changed, 53 insertions(+), 25 deletions(-)

(limited to 'net')

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index de2e3ade6102..554440e7f83d 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -208,7 +208,7 @@ void inet_hashinfo_init(struct inet_hashinfo *h);
 bool inet_ehash_insert(struct sock *sk, struct sock *osk);
 bool inet_ehash_nolisten(struct sock *sk, struct sock *osk);
 void __inet_hash(struct sock *sk, struct sock *osk);
-void inet_hash(struct sock *sk);
+int inet_hash(struct sock *sk);
 void inet_unhash(struct sock *sk);
 
 struct sock *__inet_lookup_listener(struct net *net,
diff --git a/include/net/phonet/phonet.h b/include/net/phonet/phonet.h
index 68e509750caa..039cc29cb4a8 100644
--- a/include/net/phonet/phonet.h
+++ b/include/net/phonet/phonet.h
@@ -51,7 +51,7 @@ void pn_sock_init(void);
 struct sock *pn_find_sock_by_sa(struct net *net, const struct sockaddr_pn *sa);
 void pn_deliver_sock_broadcast(struct net *net, struct sk_buff *skb);
 void phonet_get_local_port_range(int *min, int *max);
-void pn_sock_hash(struct sock *sk);
+int pn_sock_hash(struct sock *sk);
 void pn_sock_unhash(struct sock *sk);
 int pn_sock_get_port(struct sock *sk, unsigned short sport);
 
diff --git a/include/net/ping.h b/include/net/ping.h
index ac80cb45e630..5fd7cc244833 100644
--- a/include/net/ping.h
+++ b/include/net/ping.h
@@ -65,7 +65,7 @@ struct pingfakehdr {
 };
 
 int  ping_get_port(struct sock *sk, unsigned short ident);
-void ping_hash(struct sock *sk);
+int ping_hash(struct sock *sk);
 void ping_unhash(struct sock *sk);
 
 int  ping_init_sock(struct sock *sk);
diff --git a/include/net/raw.h b/include/net/raw.h
index 6a40c6562dd2..3e789008394d 100644
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -57,7 +57,7 @@ int raw_seq_open(struct inode *ino, struct file *file,
 
 #endif
 
-void raw_hash_sk(struct sock *sk);
+int raw_hash_sk(struct sock *sk);
 void raw_unhash_sk(struct sock *sk);
 
 struct raw_sock {
diff --git a/include/net/sock.h b/include/net/sock.h
index f5ea148853e2..255d3e03727b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -984,7 +984,7 @@ struct proto {
 	void		(*release_cb)(struct sock *sk);
 
 	/* Keeping track of sk's, looking them up, and port selection methods. */
-	void			(*hash)(struct sock *sk);
+	int			(*hash)(struct sock *sk);
 	void			(*unhash)(struct sock *sk);
 	void			(*rehash)(struct sock *sk);
 	int			(*get_port)(struct sock *sk, unsigned short snum);
@@ -1194,10 +1194,10 @@ static inline void sock_prot_inuse_add(struct net *net, struct proto *prot,
 /* With per-bucket locks this operation is not-atomic, so that
  * this version is not worse.
  */
-static inline void __sk_prot_rehash(struct sock *sk)
+static inline int __sk_prot_rehash(struct sock *sk)
 {
 	sk->sk_prot->unhash(sk);
-	sk->sk_prot->hash(sk);
+	return sk->sk_prot->hash(sk);
 }
 
 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size);
diff --git a/include/net/udp.h b/include/net/udp.h
index 2842541e28e7..92927f729ac8 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -177,9 +177,10 @@ static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb)
 }
 
 /* hash routines shared between UDPv4/6 and UDP-Litev4/6 */
-static inline void udp_lib_hash(struct sock *sk)
+static inline int udp_lib_hash(struct sock *sk)
 {
 	BUG();
+	return 0;
 }
 
 void udp_lib_unhash(struct sock *sk);
diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c
index a548be247e15..e0bd013a1e5e 100644
--- a/net/ieee802154/socket.c
+++ b/net/ieee802154/socket.c
@@ -182,12 +182,14 @@ static int ieee802154_sock_ioctl(struct socket *sock, unsigned int cmd,
 static HLIST_HEAD(raw_head);
 static DEFINE_RWLOCK(raw_lock);
 
-static void raw_hash(struct sock *sk)
+static int raw_hash(struct sock *sk)
 {
 	write_lock_bh(&raw_lock);
 	sk_add_node(sk, &raw_head);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 	write_unlock_bh(&raw_lock);
+
+	return 0;
 }
 
 static void raw_unhash(struct sock *sk)
@@ -462,12 +464,14 @@ static inline struct dgram_sock *dgram_sk(const struct sock *sk)
 	return container_of(sk, struct dgram_sock, sk);
 }
 
-static void dgram_hash(struct sock *sk)
+static int dgram_hash(struct sock *sk)
 {
 	write_lock_bh(&dgram_lock);
 	sk_add_node(sk, &dgram_head);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 	write_unlock_bh(&dgram_lock);
+
+	return 0;
 }
 
 static void dgram_unhash(struct sock *sk)
@@ -1026,8 +1030,13 @@ static int ieee802154_create(struct net *net, struct socket *sock,
 	/* Checksums on by default */
 	sock_set_flag(sk, SOCK_ZAPPED);
 
-	if (sk->sk_prot->hash)
-		sk->sk_prot->hash(sk);
+	if (sk->sk_prot->hash) {
+		rc = sk->sk_prot->hash(sk);
+		if (rc) {
+			sk_common_release(sk);
+			goto out;
+		}
+	}
 
 	if (sk->sk_prot->init) {
 		rc = sk->sk_prot->init(sk);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 5c5db6636704..eade66db214e 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -370,7 +370,11 @@ lookup_protocol:
 		 */
 		inet->inet_sport = htons(inet->inet_num);
 		/* Add to protocol hash chains. */
-		sk->sk_prot->hash(sk);
+		err = sk->sk_prot->hash(sk);
+		if (err) {
+			sk_common_release(sk);
+			goto out;
+		}
 	}
 
 	if (sk->sk_prot->init) {
@@ -1142,8 +1146,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
 	 * Besides that, it does not check for connection
 	 * uniqueness. Wait for troubles.
 	 */
-	__sk_prot_rehash(sk);
-	return 0;
+	return __sk_prot_rehash(sk);
 }
 
 int inet_sk_rebuild_header(struct sock *sk)
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 9b17c1792dce..12c8d389dc18 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -734,6 +734,7 @@ int inet_csk_listen_start(struct sock *sk, int backlog)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct inet_sock *inet = inet_sk(sk);
+	int err = -EADDRINUSE;
 
 	reqsk_queue_alloc(&icsk->icsk_accept_queue);
 
@@ -751,13 +752,14 @@ int inet_csk_listen_start(struct sock *sk, int backlog)
 		inet->inet_sport = htons(inet->inet_num);
 
 		sk_dst_reset(sk);
-		sk->sk_prot->hash(sk);
+		err = sk->sk_prot->hash(sk);
 
-		return 0;
+		if (likely(!err))
+			return 0;
 	}
 
 	sk->sk_state = TCP_CLOSE;
-	return -EADDRINUSE;
+	return err;
 }
 EXPORT_SYMBOL_GPL(inet_csk_listen_start);
 
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index ccc5980797fc..b6023b7baae0 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -468,13 +468,15 @@ void __inet_hash(struct sock *sk, struct sock *osk)
 }
 EXPORT_SYMBOL(__inet_hash);
 
-void inet_hash(struct sock *sk)
+int inet_hash(struct sock *sk)
 {
 	if (sk->sk_state != TCP_CLOSE) {
 		local_bh_disable();
 		__inet_hash(sk, NULL);
 		local_bh_enable();
 	}
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(inet_hash);
 
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index c117b21b937d..f6f93fc2c61f 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -145,10 +145,12 @@ fail:
 }
 EXPORT_SYMBOL_GPL(ping_get_port);
 
-void ping_hash(struct sock *sk)
+int ping_hash(struct sock *sk)
 {
 	pr_debug("ping_hash(sk->port=%u)\n", inet_sk(sk)->inet_num);
 	BUG(); /* "Please do not press this button again." */
+
+	return 0;
 }
 
 void ping_unhash(struct sock *sk)
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index bc35f1842512..d6352515d738 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -93,7 +93,7 @@ static struct raw_hashinfo raw_v4_hashinfo = {
 	.lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock),
 };
 
-void raw_hash_sk(struct sock *sk)
+int raw_hash_sk(struct sock *sk)
 {
 	struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
 	struct hlist_head *head;
@@ -104,6 +104,8 @@ void raw_hash_sk(struct sock *sk)
 	sk_add_node(sk, head);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 	write_unlock_bh(&h->lock);
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(raw_hash_sk);
 
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 9f5137cd604e..b11c37cfd67c 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -235,7 +235,11 @@ lookup_protocol:
 		 * creation time automatically shares.
 		 */
 		inet->inet_sport = htons(inet->inet_num);
-		sk->sk_prot->hash(sk);
+		err = sk->sk_prot->hash(sk);
+		if (err) {
+			sk_common_release(sk);
+			goto out;
+		}
 	}
 	if (sk->sk_prot->init) {
 		err = sk->sk_prot->init(sk);
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index d575ef4e9aa6..ffd5f2297584 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -140,13 +140,15 @@ void pn_deliver_sock_broadcast(struct net *net, struct sk_buff *skb)
 	rcu_read_unlock();
 }
 
-void pn_sock_hash(struct sock *sk)
+int pn_sock_hash(struct sock *sk)
 {
 	struct hlist_head *hlist = pn_hash_list(pn_sk(sk)->sobject);
 
 	mutex_lock(&pnsocks.lock);
 	sk_add_node_rcu(sk, hlist);
 	mutex_unlock(&pnsocks.lock);
+
+	return 0;
 }
 EXPORT_SYMBOL(pn_sock_hash);
 
@@ -200,7 +202,7 @@ static int pn_socket_bind(struct socket *sock, struct sockaddr *addr, int len)
 	pn->resource = spn->spn_resource;
 
 	/* Enable RX on the socket */
-	sk->sk_prot->hash(sk);
+	err = sk->sk_prot->hash(sk);
 out_port:
 	mutex_unlock(&port_mutex);
 out:
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 5ca2ebfe0be8..6427b9d1197e 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -6101,9 +6101,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname,
 	return retval;
 }
 
-static void sctp_hash(struct sock *sk)
+static int sctp_hash(struct sock *sk)
 {
 	/* STUB */
+	return 0;
 }
 
 static void sctp_unhash(struct sock *sk)
-- 
cgit v1.2.3-59-g8ed1b


From 496611d7b5eaf59c03440c8f2def1d9988ad2459 Mon Sep 17 00:00:00 2001
From: Craig Gallek <kraig@google.com>
Date: Wed, 10 Feb 2016 11:50:36 -0500
Subject: inet: create IPv6-equivalent inet_hash function

In order to support fast lookups for TCP sockets with SO_REUSEPORT,
the function that adds sockets to the listening hash set needs
to be able to check receive address equality.  Since this equality
check is different for IPv4 and IPv6, we will need two different
socket hashing functions.

This patch adds inet6_hash identical to the existing inet_hash function
and updates the appropriate references.  A following patch will
differentiate the two by passing different comparison functions to
__inet_hash.

Additionally, in order to use the IPv6 address equality function from
inet6_hashtables (which is compiled as a built-in object when IPv6 is
enabled) it also needs to be in a built-in object file as well.  This
moves ipv6_rcv_saddr_equal into inet_hashtables to accomplish this.

Signed-off-by: Craig Gallek <kraig@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet6_hashtables.h |  2 ++
 net/dccp/ipv6.c                |  2 +-
 net/ipv6/inet6_hashtables.c    | 56 ++++++++++++++++++++++++++++++++++++++++++
 net/ipv6/tcp_ipv6.c            |  2 +-
 net/ipv6/udp.c                 | 44 +--------------------------------
 net/l2tp/l2tp_ip6.c            |  3 ++-
 6 files changed, 63 insertions(+), 46 deletions(-)

(limited to 'net')

diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index 7ff588ca6817..b3c28a9dfbf1 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -96,6 +96,8 @@ struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo,
 			  const struct in6_addr *saddr, const __be16 sport,
 			  const struct in6_addr *daddr, const __be16 dport,
 			  const int dif);
+
+int inet6_hash(struct sock *sk);
 #endif /* IS_ENABLED(CONFIG_IPV6) */
 
 #define INET6_MATCH(__sk, __net, __saddr, __daddr, __ports, __dif)	\
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 9c6d0508e63a..90a8269b28d0 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -993,7 +993,7 @@ static struct proto dccp_v6_prot = {
 	.sendmsg	   = dccp_sendmsg,
 	.recvmsg	   = dccp_recvmsg,
 	.backlog_rcv	   = dccp_v6_do_rcv,
-	.hash		   = inet_hash,
+	.hash		   = inet6_hash,
 	.unhash		   = inet_unhash,
 	.accept		   = inet_csk_accept,
 	.get_port	   = inet_csk_get_port,
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 21ace5a2bf7c..072653dd9c98 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -274,3 +274,59 @@ int inet6_hash_connect(struct inet_timewait_death_row *death_row,
 				   __inet6_check_established);
 }
 EXPORT_SYMBOL_GPL(inet6_hash_connect);
+
+int inet6_hash(struct sock *sk)
+{
+	if (sk->sk_state != TCP_CLOSE) {
+		local_bh_disable();
+		__inet_hash(sk, NULL);
+		local_bh_enable();
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(inet6_hash);
+
+/* match_wildcard == true:  IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6
+ *                          only, and any IPv4 addresses if not IPv6 only
+ * match_wildcard == false: addresses must be exactly the same, i.e.
+ *                          IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
+ *                          and 0.0.0.0 equals to 0.0.0.0 only
+ */
+int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
+			 bool match_wildcard)
+{
+	const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2);
+	int sk2_ipv6only = inet_v6_ipv6only(sk2);
+	int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
+	int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
+
+	/* if both are mapped, treat as IPv4 */
+	if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) {
+		if (!sk2_ipv6only) {
+			if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr)
+				return 1;
+			if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr)
+				return match_wildcard;
+		}
+		return 0;
+	}
+
+	if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY)
+		return 1;
+
+	if (addr_type2 == IPV6_ADDR_ANY && match_wildcard &&
+	    !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
+		return 1;
+
+	if (addr_type == IPV6_ADDR_ANY && match_wildcard &&
+	    !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED))
+		return 1;
+
+	if (sk2_rcv_saddr6 &&
+	    ipv6_addr_equal(&sk->sk_v6_rcv_saddr, sk2_rcv_saddr6))
+		return 1;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ipv6_rcv_saddr_equal);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 006396e31cb0..d72bcfb326d8 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1865,7 +1865,7 @@ struct proto tcpv6_prot = {
 	.sendpage		= tcp_sendpage,
 	.backlog_rcv		= tcp_v6_do_rcv,
 	.release_cb		= tcp_release_cb,
-	.hash			= inet_hash,
+	.hash			= inet6_hash,
 	.unhash			= inet_unhash,
 	.get_port		= inet_csk_get_port,
 	.enter_memory_pressure	= tcp_enter_memory_pressure,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 22e28a44e3c8..ac4e7e03dded 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -37,6 +37,7 @@
 #include <linux/slab.h>
 #include <asm/uaccess.h>
 
+#include <net/addrconf.h>
 #include <net/ndisc.h>
 #include <net/protocol.h>
 #include <net/transp_v6.h>
@@ -77,49 +78,6 @@ static u32 udp6_ehashfn(const struct net *net,
 			       udp_ipv6_hash_secret + net_hash_mix(net));
 }
 
-/* match_wildcard == true:  IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6
- *                          only, and any IPv4 addresses if not IPv6 only
- * match_wildcard == false: addresses must be exactly the same, i.e.
- *                          IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
- *                          and 0.0.0.0 equals to 0.0.0.0 only
- */
-int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
-			 bool match_wildcard)
-{
-	const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2);
-	int sk2_ipv6only = inet_v6_ipv6only(sk2);
-	int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
-	int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
-
-	/* if both are mapped, treat as IPv4 */
-	if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) {
-		if (!sk2_ipv6only) {
-			if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr)
-				return 1;
-			if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr)
-				return match_wildcard;
-		}
-		return 0;
-	}
-
-	if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY)
-		return 1;
-
-	if (addr_type2 == IPV6_ADDR_ANY && match_wildcard &&
-	    !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
-		return 1;
-
-	if (addr_type == IPV6_ADDR_ANY && match_wildcard &&
-	    !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED))
-		return 1;
-
-	if (sk2_rcv_saddr6 &&
-	    ipv6_addr_equal(&sk->sk_v6_rcv_saddr, sk2_rcv_saddr6))
-		return 1;
-
-	return 0;
-}
-
 static u32 udp6_portaddr_hash(const struct net *net,
 			      const struct in6_addr *addr6,
 			      unsigned int port)
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index a2c8747d2936..6b54ff3ff4cb 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -25,6 +25,7 @@
 #include <net/udp.h>
 #include <net/inet_common.h>
 #include <net/inet_hashtables.h>
+#include <net/inet6_hashtables.h>
 #include <net/tcp_states.h>
 #include <net/protocol.h>
 #include <net/xfrm.h>
@@ -718,7 +719,7 @@ static struct proto l2tp_ip6_prot = {
 	.sendmsg	   = l2tp_ip6_sendmsg,
 	.recvmsg	   = l2tp_ip6_recvmsg,
 	.backlog_rcv	   = l2tp_ip6_backlog_recv,
-	.hash		   = inet_hash,
+	.hash		   = inet6_hash,
 	.unhash		   = inet_unhash,
 	.obj_size	   = sizeof(struct l2tp_ip6_sock),
 #ifdef CONFIG_COMPAT
-- 
cgit v1.2.3-59-g8ed1b


From a583636a83ea383fd07517e5a7a2eedbc5d90fb1 Mon Sep 17 00:00:00 2001
From: Craig Gallek <kraig@google.com>
Date: Wed, 10 Feb 2016 11:50:38 -0500
Subject: inet: refactor inet[6]_lookup functions to take skb

This is a preliminary step to allow fast socket lookup of SO_REUSEPORT
groups.  Doing so with a BPF filter will require access to the
skb in question.  This change plumbs the skb (and offset to payload
data) through the call stack to the listening socket lookup
implementations where it will be used in a following patch.

Signed-off-by: Craig Gallek <kraig@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/addrconf.h         |  2 ++
 include/net/inet6_hashtables.h | 11 +++++++----
 include/net/inet_hashtables.h  | 18 ++++++++++++------
 net/dccp/ipv4.c                |  2 +-
 net/dccp/ipv6.c                |  2 +-
 net/ipv4/inet_diag.c           |  6 +++---
 net/ipv4/inet_hashtables.c     |  1 +
 net/ipv4/tcp_ipv4.c            | 10 ++++++----
 net/ipv6/inet6_hashtables.c    |  8 ++++++--
 net/ipv6/tcp_ipv6.c            |  8 +++++---
 net/netfilter/xt_TPROXY.c      | 31 ++++++++++++++++++++-----------
 net/netfilter/xt_socket.c      | 28 +++++++++++++++++++++-------
 12 files changed, 85 insertions(+), 42 deletions(-)

(limited to 'net')

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 47f52d3cd8df..730d856683e5 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -87,6 +87,8 @@ int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr,
 		      u32 banned_flags);
 int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr,
 		    u32 banned_flags);
+int ipv4_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
+			 bool match_wildcard);
 int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
 			 bool match_wildcard);
 void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr);
diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index b3c28a9dfbf1..28332bdac333 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -53,6 +53,7 @@ struct sock *__inet6_lookup_established(struct net *net,
 
 struct sock *inet6_lookup_listener(struct net *net,
 				   struct inet_hashinfo *hashinfo,
+				   struct sk_buff *skb, int doff,
 				   const struct in6_addr *saddr,
 				   const __be16 sport,
 				   const struct in6_addr *daddr,
@@ -60,6 +61,7 @@ struct sock *inet6_lookup_listener(struct net *net,
 
 static inline struct sock *__inet6_lookup(struct net *net,
 					  struct inet_hashinfo *hashinfo,
+					  struct sk_buff *skb, int doff,
 					  const struct in6_addr *saddr,
 					  const __be16 sport,
 					  const struct in6_addr *daddr,
@@ -71,12 +73,12 @@ static inline struct sock *__inet6_lookup(struct net *net,
 	if (sk)
 		return sk;
 
-	return inet6_lookup_listener(net, hashinfo, saddr, sport,
+	return inet6_lookup_listener(net, hashinfo, skb, doff, saddr, sport,
 				     daddr, hnum, dif);
 }
 
 static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo,
-					      struct sk_buff *skb,
+					      struct sk_buff *skb, int doff,
 					      const __be16 sport,
 					      const __be16 dport,
 					      int iif)
@@ -86,13 +88,14 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo,
 	if (sk)
 		return sk;
 
-	return __inet6_lookup(dev_net(skb_dst(skb)->dev), hashinfo,
-			      &ipv6_hdr(skb)->saddr, sport,
+	return __inet6_lookup(dev_net(skb_dst(skb)->dev), hashinfo, skb,
+			      doff, &ipv6_hdr(skb)->saddr, sport,
 			      &ipv6_hdr(skb)->daddr, ntohs(dport),
 			      iif);
 }
 
 struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo,
+			  struct sk_buff *skb, int doff,
 			  const struct in6_addr *saddr, const __be16 sport,
 			  const struct in6_addr *daddr, const __be16 dport,
 			  const int dif);
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 554440e7f83d..82403390af58 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -213,6 +213,7 @@ void inet_unhash(struct sock *sk);
 
 struct sock *__inet_lookup_listener(struct net *net,
 				    struct inet_hashinfo *hashinfo,
+				    struct sk_buff *skb, int doff,
 				    const __be32 saddr, const __be16 sport,
 				    const __be32 daddr,
 				    const unsigned short hnum,
@@ -220,10 +221,11 @@ struct sock *__inet_lookup_listener(struct net *net,
 
 static inline struct sock *inet_lookup_listener(struct net *net,
 		struct inet_hashinfo *hashinfo,
+		struct sk_buff *skb, int doff,
 		__be32 saddr, __be16 sport,
 		__be32 daddr, __be16 dport, int dif)
 {
-	return __inet_lookup_listener(net, hashinfo, saddr, sport,
+	return __inet_lookup_listener(net, hashinfo, skb, doff, saddr, sport,
 				      daddr, ntohs(dport), dif);
 }
 
@@ -299,6 +301,7 @@ static inline struct sock *
 
 static inline struct sock *__inet_lookup(struct net *net,
 					 struct inet_hashinfo *hashinfo,
+					 struct sk_buff *skb, int doff,
 					 const __be32 saddr, const __be16 sport,
 					 const __be32 daddr, const __be16 dport,
 					 const int dif)
@@ -307,12 +310,13 @@ static inline struct sock *__inet_lookup(struct net *net,
 	struct sock *sk = __inet_lookup_established(net, hashinfo,
 				saddr, sport, daddr, hnum, dif);
 
-	return sk ? : __inet_lookup_listener(net, hashinfo, saddr, sport,
-					     daddr, hnum, dif);
+	return sk ? : __inet_lookup_listener(net, hashinfo, skb, doff, saddr,
+					     sport, daddr, hnum, dif);
 }
 
 static inline struct sock *inet_lookup(struct net *net,
 				       struct inet_hashinfo *hashinfo,
+				       struct sk_buff *skb, int doff,
 				       const __be32 saddr, const __be16 sport,
 				       const __be32 daddr, const __be16 dport,
 				       const int dif)
@@ -320,7 +324,8 @@ static inline struct sock *inet_lookup(struct net *net,
 	struct sock *sk;
 
 	local_bh_disable();
-	sk = __inet_lookup(net, hashinfo, saddr, sport, daddr, dport, dif);
+	sk = __inet_lookup(net, hashinfo, skb, doff, saddr, sport, daddr,
+			   dport, dif);
 	local_bh_enable();
 
 	return sk;
@@ -328,6 +333,7 @@ static inline struct sock *inet_lookup(struct net *net,
 
 static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
 					     struct sk_buff *skb,
+					     int doff,
 					     const __be16 sport,
 					     const __be16 dport)
 {
@@ -337,8 +343,8 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
 	if (sk)
 		return sk;
 	else
-		return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo,
-				     iph->saddr, sport,
+		return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo, skb,
+				     doff, iph->saddr, sport,
 				     iph->daddr, dport, inet_iif(skb));
 }
 
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 5684e14932bd..1e0c600c83ae 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -802,7 +802,7 @@ static int dccp_v4_rcv(struct sk_buff *skb)
 	}
 
 lookup:
-	sk = __inet_lookup_skb(&dccp_hashinfo, skb,
+	sk = __inet_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh),
 			       dh->dccph_sport, dh->dccph_dport);
 	if (!sk) {
 		dccp_pr_debug("failed to look up flow ID in table and "
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 90a8269b28d0..45cbe85f0940 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -668,7 +668,7 @@ static int dccp_v6_rcv(struct sk_buff *skb)
 		DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb);
 
 lookup:
-	sk = __inet6_lookup_skb(&dccp_hashinfo, skb,
+	sk = __inet6_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh),
 			        dh->dccph_sport, dh->dccph_dport,
 				inet6_iif(skb));
 	if (!sk) {
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 6029157a19ed..50c0d96b8441 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -357,18 +357,18 @@ struct sock *inet_diag_find_one_icsk(struct net *net,
 	struct sock *sk;
 
 	if (req->sdiag_family == AF_INET)
-		sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0],
+		sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[0],
 				 req->id.idiag_dport, req->id.idiag_src[0],
 				 req->id.idiag_sport, req->id.idiag_if);
 #if IS_ENABLED(CONFIG_IPV6)
 	else if (req->sdiag_family == AF_INET6) {
 		if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) &&
 		    ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src))
-			sk = inet_lookup(net, hashinfo, req->id.idiag_dst[3],
+			sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[3],
 					 req->id.idiag_dport, req->id.idiag_src[3],
 					 req->id.idiag_sport, req->id.idiag_if);
 		else
-			sk = inet6_lookup(net, hashinfo,
+			sk = inet6_lookup(net, hashinfo, NULL, 0,
 					  (struct in6_addr *)req->id.idiag_dst,
 					  req->id.idiag_dport,
 					  (struct in6_addr *)req->id.idiag_src,
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index b6023b7baae0..5e4290b83255 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -205,6 +205,7 @@ static inline int compute_score(struct sock *sk, struct net *net,
 
 struct sock *__inet_lookup_listener(struct net *net,
 				    struct inet_hashinfo *hashinfo,
+				    struct sk_buff *skb, int doff,
 				    const __be32 saddr, __be16 sport,
 				    const __be32 daddr, const unsigned short hnum,
 				    const int dif)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 0d381fa164f8..3f872a6bc274 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -637,8 +637,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 		 * Incoming packet is checked with md5 hash with finding key,
 		 * no RST generated if md5 hash doesn't match.
 		 */
-		sk1 = __inet_lookup_listener(net,
-					     &tcp_hashinfo, ip_hdr(skb)->saddr,
+		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
+					     ip_hdr(skb)->saddr,
 					     th->source, ip_hdr(skb)->daddr,
 					     ntohs(th->source), inet_iif(skb));
 		/* don't send rst if it can't find key */
@@ -1581,7 +1581,8 @@ int tcp_v4_rcv(struct sk_buff *skb)
 	TCP_SKB_CB(skb)->sacked	 = 0;
 
 lookup:
-	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
+	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
+			       th->dest);
 	if (!sk)
 		goto no_tcp_socket;
 
@@ -1695,7 +1696,8 @@ do_time_wait:
 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
 	case TCP_TW_SYN: {
 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
-							&tcp_hashinfo,
+							&tcp_hashinfo, skb,
+							__tcp_hdrlen(th),
 							iph->saddr, th->source,
 							iph->daddr, th->dest,
 							inet_iif(skb));
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 072653dd9c98..004345d26808 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -121,7 +121,9 @@ static inline int compute_score(struct sock *sk, struct net *net,
 }
 
 struct sock *inet6_lookup_listener(struct net *net,
-		struct inet_hashinfo *hashinfo, const struct in6_addr *saddr,
+		struct inet_hashinfo *hashinfo,
+		struct sk_buff *skb, int doff,
+		const struct in6_addr *saddr,
 		const __be16 sport, const struct in6_addr *daddr,
 		const unsigned short hnum, const int dif)
 {
@@ -177,6 +179,7 @@ begin:
 EXPORT_SYMBOL_GPL(inet6_lookup_listener);
 
 struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo,
+			  struct sk_buff *skb, int doff,
 			  const struct in6_addr *saddr, const __be16 sport,
 			  const struct in6_addr *daddr, const __be16 dport,
 			  const int dif)
@@ -184,7 +187,8 @@ struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo,
 	struct sock *sk;
 
 	local_bh_disable();
-	sk = __inet6_lookup(net, hashinfo, saddr, sport, daddr, ntohs(dport), dif);
+	sk = __inet6_lookup(net, hashinfo, skb, doff, saddr, sport, daddr,
+			    ntohs(dport), dif);
 	local_bh_enable();
 
 	return sk;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d72bcfb326d8..9977b6f19f2a 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -866,7 +866,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
 		 * no RST generated if md5 hash doesn't match.
 		 */
 		sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev),
-					   &tcp_hashinfo, &ipv6h->saddr,
+					   &tcp_hashinfo, NULL, 0,
+					   &ipv6h->saddr,
 					   th->source, &ipv6h->daddr,
 					   ntohs(th->source), tcp_v6_iif(skb));
 		if (!sk1)
@@ -1375,8 +1376,8 @@ static int tcp_v6_rcv(struct sk_buff *skb)
 	hdr = ipv6_hdr(skb);
 
 lookup:
-	sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest,
-				inet6_iif(skb));
+	sk = __inet6_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th),
+				th->source, th->dest, inet6_iif(skb));
 	if (!sk)
 		goto no_tcp_socket;
 
@@ -1500,6 +1501,7 @@ do_time_wait:
 		struct sock *sk2;
 
 		sk2 = inet6_lookup_listener(dev_net(skb->dev), &tcp_hashinfo,
+					    skb, __tcp_hdrlen(th),
 					    &ipv6_hdr(skb)->saddr, th->source,
 					    &ipv6_hdr(skb)->daddr,
 					    ntohs(th->dest), tcp_v6_iif(skb));
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 3ab591e73ec0..7f4414d26a66 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -105,19 +105,24 @@ tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
  * belonging to established connections going through that one.
  */
 static inline struct sock *
-nf_tproxy_get_sock_v4(struct net *net, const u8 protocol,
+nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
+		      const u8 protocol,
 		      const __be32 saddr, const __be32 daddr,
 		      const __be16 sport, const __be16 dport,
 		      const struct net_device *in,
 		      const enum nf_tproxy_lookup_t lookup_type)
 {
 	struct sock *sk;
+	struct tcphdr *tcph;
 
 	switch (protocol) {
 	case IPPROTO_TCP:
 		switch (lookup_type) {
 		case NFT_LOOKUP_LISTENER:
-			sk = inet_lookup_listener(net, &tcp_hashinfo,
+			tcph = hp;
+			sk = inet_lookup_listener(net, &tcp_hashinfo, skb,
+						    ip_hdrlen(skb) +
+						      __tcp_hdrlen(tcph),
 						    saddr, sport,
 						    daddr, dport,
 						    in->ifindex);
@@ -169,19 +174,23 @@ nf_tproxy_get_sock_v4(struct net *net, const u8 protocol,
 
 #ifdef XT_TPROXY_HAVE_IPV6
 static inline struct sock *
-nf_tproxy_get_sock_v6(struct net *net, const u8 protocol,
+nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp,
+		      const u8 protocol,
 		      const struct in6_addr *saddr, const struct in6_addr *daddr,
 		      const __be16 sport, const __be16 dport,
 		      const struct net_device *in,
 		      const enum nf_tproxy_lookup_t lookup_type)
 {
 	struct sock *sk;
+	struct tcphdr *tcph;
 
 	switch (protocol) {
 	case IPPROTO_TCP:
 		switch (lookup_type) {
 		case NFT_LOOKUP_LISTENER:
-			sk = inet6_lookup_listener(net, &tcp_hashinfo,
+			tcph = hp;
+			sk = inet6_lookup_listener(net, &tcp_hashinfo, skb,
+						   thoff + __tcp_hdrlen(tcph),
 						   saddr, sport,
 						   daddr, ntohs(dport),
 						   in->ifindex);
@@ -267,7 +276,7 @@ tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb,
 		 * to a listener socket if there's one */
 		struct sock *sk2;
 
-		sk2 = nf_tproxy_get_sock_v4(net, iph->protocol,
+		sk2 = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
 					    iph->saddr, laddr ? laddr : iph->daddr,
 					    hp->source, lport ? lport : hp->dest,
 					    skb->dev, NFT_LOOKUP_LISTENER);
@@ -305,7 +314,7 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
 	 * addresses, this happens if the redirect already happened
 	 * and the current packet belongs to an already established
 	 * connection */
-	sk = nf_tproxy_get_sock_v4(net, iph->protocol,
+	sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
 				   iph->saddr, iph->daddr,
 				   hp->source, hp->dest,
 				   skb->dev, NFT_LOOKUP_ESTABLISHED);
@@ -321,7 +330,7 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
 	else if (!sk)
 		/* no, there's no established connection, check if
 		 * there's a listener on the redirected addr/port */
-		sk = nf_tproxy_get_sock_v4(net, iph->protocol,
+		sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
 					   iph->saddr, laddr,
 					   hp->source, lport,
 					   skb->dev, NFT_LOOKUP_LISTENER);
@@ -429,7 +438,7 @@ tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
 		 * to a listener socket if there's one */
 		struct sock *sk2;
 
-		sk2 = nf_tproxy_get_sock_v6(par->net, tproto,
+		sk2 = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp, tproto,
 					    &iph->saddr,
 					    tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr),
 					    hp->source,
@@ -472,7 +481,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
 	 * addresses, this happens if the redirect already happened
 	 * and the current packet belongs to an already established
 	 * connection */
-	sk = nf_tproxy_get_sock_v6(par->net, tproto,
+	sk = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp, tproto,
 				   &iph->saddr, &iph->daddr,
 				   hp->source, hp->dest,
 				   par->in, NFT_LOOKUP_ESTABLISHED);
@@ -487,8 +496,8 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
 	else if (!sk)
 		/* no there's no established connection, check if
 		 * there's a listener on the redirected addr/port */
-		sk = nf_tproxy_get_sock_v6(par->net, tproto,
-					   &iph->saddr, laddr,
+		sk = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp,
+					   tproto, &iph->saddr, laddr,
 					   hp->source, lport,
 					   par->in, NFT_LOOKUP_LISTENER);
 
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 2ec08f04b816..49d14ecad444 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -112,14 +112,15 @@ extract_icmp4_fields(const struct sk_buff *skb,
  *     box.
  */
 static struct sock *
-xt_socket_get_sock_v4(struct net *net, const u8 protocol,
+xt_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff,
+		      const u8 protocol,
 		      const __be32 saddr, const __be32 daddr,
 		      const __be16 sport, const __be16 dport,
 		      const struct net_device *in)
 {
 	switch (protocol) {
 	case IPPROTO_TCP:
-		return __inet_lookup(net, &tcp_hashinfo,
+		return __inet_lookup(net, &tcp_hashinfo, skb, doff,
 				     saddr, sport, daddr, dport,
 				     in->ifindex);
 	case IPPROTO_UDP:
@@ -148,6 +149,8 @@ static struct sock *xt_socket_lookup_slow_v4(struct net *net,
 					     const struct net_device *indev)
 {
 	const struct iphdr *iph = ip_hdr(skb);
+	struct sk_buff *data_skb = NULL;
+	int doff = 0;
 	__be32 uninitialized_var(daddr), uninitialized_var(saddr);
 	__be16 uninitialized_var(dport), uninitialized_var(sport);
 	u8 uninitialized_var(protocol);
@@ -169,6 +172,10 @@ static struct sock *xt_socket_lookup_slow_v4(struct net *net,
 		sport = hp->source;
 		daddr = iph->daddr;
 		dport = hp->dest;
+		data_skb = (struct sk_buff *)skb;
+		doff = iph->protocol == IPPROTO_TCP ?
+			ip_hdrlen(skb) + __tcp_hdrlen((struct tcphdr *)hp) :
+			ip_hdrlen(skb) + sizeof(*hp);
 
 	} else if (iph->protocol == IPPROTO_ICMP) {
 		if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr,
@@ -198,8 +205,8 @@ static struct sock *xt_socket_lookup_slow_v4(struct net *net,
 	}
 #endif
 
-	return xt_socket_get_sock_v4(net, protocol, saddr, daddr,
-				     sport, dport, indev);
+	return xt_socket_get_sock_v4(net, data_skb, doff, protocol, saddr,
+				     daddr, sport, dport, indev);
 }
 
 static bool
@@ -318,14 +325,15 @@ extract_icmp6_fields(const struct sk_buff *skb,
 }
 
 static struct sock *
-xt_socket_get_sock_v6(struct net *net, const u8 protocol,
+xt_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff,
+		      const u8 protocol,
 		      const struct in6_addr *saddr, const struct in6_addr *daddr,
 		      const __be16 sport, const __be16 dport,
 		      const struct net_device *in)
 {
 	switch (protocol) {
 	case IPPROTO_TCP:
-		return inet6_lookup(net, &tcp_hashinfo,
+		return inet6_lookup(net, &tcp_hashinfo, skb, doff,
 				    saddr, sport, daddr, dport,
 				    in->ifindex);
 	case IPPROTO_UDP:
@@ -343,6 +351,8 @@ static struct sock *xt_socket_lookup_slow_v6(struct net *net,
 	__be16 uninitialized_var(dport), uninitialized_var(sport);
 	const struct in6_addr *daddr = NULL, *saddr = NULL;
 	struct ipv6hdr *iph = ipv6_hdr(skb);
+	struct sk_buff *data_skb = NULL;
+	int doff = 0;
 	int thoff = 0, tproto;
 
 	tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);
@@ -362,6 +372,10 @@ static struct sock *xt_socket_lookup_slow_v6(struct net *net,
 		sport = hp->source;
 		daddr = &iph->daddr;
 		dport = hp->dest;
+		data_skb = (struct sk_buff *)skb;
+		doff = tproto == IPPROTO_TCP ?
+			thoff + __tcp_hdrlen((struct tcphdr *)hp) :
+			thoff + sizeof(*hp);
 
 	} else if (tproto == IPPROTO_ICMPV6) {
 		struct ipv6hdr ipv6_var;
@@ -373,7 +387,7 @@ static struct sock *xt_socket_lookup_slow_v6(struct net *net,
 		return NULL;
 	}
 
-	return xt_socket_get_sock_v6(net, tproto, saddr, daddr,
+	return xt_socket_get_sock_v6(net, data_skb, doff, tproto, saddr, daddr,
 				     sport, dport, indev);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From fa463497679352c04d201631534955e6be66eef8 Mon Sep 17 00:00:00 2001
From: Craig Gallek <kraig@google.com>
Date: Wed, 10 Feb 2016 11:50:39 -0500
Subject: soreuseport: Prep for fast reuseport TCP socket selection

Both of the lines in this patch probably should have been included
in the initial implementation of this code for generic socket
support, but weren't technically necessary since only UDP sockets
were supported.

First, the sk_reuseport_cb points to a structure which assumes
each socket in the group has this pointer assigned at the same
time it's added to the array in the structure.  The sk_clone_lock
function breaks this assumption.  Since a child socket shouldn't
implicitly be in a reuseport group, the simple fix is to clear
the field in the clone.

Second, the SO_ATTACH_REUSEPORT_xBPF socket options require that
SO_REUSEPORT also be set first.  For UDP sockets, this is easily
enforced at bind-time since that process both puts the socket in
the appropriate receive hlist and updates the reuseport structures.
Since these operations can happen at two different times for TCP
sockets (bind and listen) it must be explicitly checked to enforce
the use of SO_REUSEPORT with SO_ATTACH_REUSEPORT_xBPF in the
setsockopt call.

Signed-off-by: Craig Gallek <kraig@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/filter.c | 2 +-
 net/core/sock.c   | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index 94d26201080d..2a6e9562f1ab 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1181,7 +1181,7 @@ static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk)
 	if (bpf_prog_size(prog->len) > sysctl_optmem_max)
 		return -ENOMEM;
 
-	if (sk_unhashed(sk)) {
+	if (sk_unhashed(sk) && sk->sk_reuseport) {
 		err = reuseport_alloc(sk);
 		if (err)
 			return err;
diff --git a/net/core/sock.c b/net/core/sock.c
index 6c1c8bc93412..46dc8ad7d050 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1531,6 +1531,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 			newsk = NULL;
 			goto out;
 		}
+		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
 
 		newsk->sk_err	   = 0;
 		newsk->sk_priority = 0;
-- 
cgit v1.2.3-59-g8ed1b


From c125e80b88687b25b321795457309eaaee4bf270 Mon Sep 17 00:00:00 2001
From: Craig Gallek <kraig@google.com>
Date: Wed, 10 Feb 2016 11:50:40 -0500
Subject: soreuseport: fast reuseport TCP socket selection

This change extends the fast SO_REUSEPORT socket lookup implemented
for UDP to TCP.  Listener sockets with SO_REUSEPORT and the same
receive address are additionally added to an array for faster
random access.  This means that only a single socket from the group
must be found in the listener list before any socket in the group can
be used to receive a packet.  Previously, every socket in the group
needed to be considered before handing off the incoming packet.

This feature also exposes the ability to use a BPF program when
selecting a socket from a reuseport group.

Signed-off-by: Craig Gallek <kraig@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_hashtables.h    |  5 +++-
 net/ipv4/inet_connection_sock.c  | 14 ++++++---
 net/ipv4/inet_hashtables.c       | 64 +++++++++++++++++++++++++++++++++++++---
 net/ipv4/udp.c                   |  4 +--
 net/ipv6/inet6_connection_sock.c |  2 ++
 net/ipv6/inet6_hashtables.c      | 16 +++++++++-
 6 files changed, 93 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 82403390af58..50f635c2c536 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -207,7 +207,10 @@ void inet_hashinfo_init(struct inet_hashinfo *h);
 
 bool inet_ehash_insert(struct sock *sk, struct sock *osk);
 bool inet_ehash_nolisten(struct sock *sk, struct sock *osk);
-void __inet_hash(struct sock *sk, struct sock *osk);
+int __inet_hash(struct sock *sk, struct sock *osk,
+		int (*saddr_same)(const struct sock *sk1,
+				  const struct sock *sk2,
+				  bool match_wildcard));
 int inet_hash(struct sock *sk);
 void inet_unhash(struct sock *sk);
 
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 12c8d389dc18..c16a2e6273d9 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -24,6 +24,7 @@
 #include <net/tcp_states.h>
 #include <net/xfrm.h>
 #include <net/tcp.h>
+#include <net/sock_reuseport.h>
 
 #ifdef INET_CSK_DEBUG
 const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
@@ -67,7 +68,8 @@ int inet_csk_bind_conflict(const struct sock *sk,
 			if ((!reuse || !sk2->sk_reuse ||
 			    sk2->sk_state == TCP_LISTEN) &&
 			    (!reuseport || !sk2->sk_reuseport ||
-			    (sk2->sk_state != TCP_TIME_WAIT &&
+			     rcu_access_pointer(sk->sk_reuseport_cb) ||
+			     (sk2->sk_state != TCP_TIME_WAIT &&
 			     !uid_eq(uid, sock_i_uid(sk2))))) {
 
 				if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
@@ -132,6 +134,7 @@ again:
 					      sk->sk_state != TCP_LISTEN) ||
 					     (tb->fastreuseport > 0 &&
 					      sk->sk_reuseport &&
+					      !rcu_access_pointer(sk->sk_reuseport_cb) &&
 					      uid_eq(tb->fastuid, uid))) &&
 					    (tb->num_owners < smallest_size || smallest_size == -1)) {
 						smallest_size = tb->num_owners;
@@ -193,15 +196,18 @@ tb_found:
 		if (((tb->fastreuse > 0 &&
 		      sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
 		     (tb->fastreuseport > 0 &&
-		      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
-		    smallest_size == -1) {
+		      sk->sk_reuseport &&
+		      !rcu_access_pointer(sk->sk_reuseport_cb) &&
+		      uid_eq(tb->fastuid, uid))) && smallest_size == -1) {
 			goto success;
 		} else {
 			ret = 1;
 			if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
 				if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
 				     (tb->fastreuseport > 0 &&
-				      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
+				      sk->sk_reuseport &&
+				      !rcu_access_pointer(sk->sk_reuseport_cb) &&
+				      uid_eq(tb->fastuid, uid))) &&
 				    smallest_size != -1 && --attempts >= 0) {
 					spin_unlock(&head->lock);
 					goto again;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 5e4290b83255..c0f9942de924 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -20,10 +20,12 @@
 #include <linux/wait.h>
 #include <linux/vmalloc.h>
 
+#include <net/addrconf.h>
 #include <net/inet_connection_sock.h>
 #include <net/inet_hashtables.h>
 #include <net/secure_seq.h>
 #include <net/ip.h>
+#include <net/sock_reuseport.h>
 
 static u32 inet_ehashfn(const struct net *net, const __be32 laddr,
 			const __u16 lport, const __be32 faddr,
@@ -215,6 +217,7 @@ struct sock *__inet_lookup_listener(struct net *net,
 	unsigned int hash = inet_lhashfn(net, hnum);
 	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
 	int score, hiscore, matches = 0, reuseport = 0;
+	bool select_ok = true;
 	u32 phash = 0;
 
 	rcu_read_lock();
@@ -230,6 +233,15 @@ begin:
 			if (reuseport) {
 				phash = inet_ehashfn(net, daddr, hnum,
 						     saddr, sport);
+				if (select_ok) {
+					struct sock *sk2;
+					sk2 = reuseport_select_sock(sk, phash,
+								    skb, doff);
+					if (sk2) {
+						result = sk2;
+						goto found;
+					}
+				}
 				matches = 1;
 			}
 		} else if (score == hiscore && reuseport) {
@@ -247,11 +259,13 @@ begin:
 	if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
 		goto begin;
 	if (result) {
+found:
 		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
 			result = NULL;
 		else if (unlikely(compute_score(result, net, hnum, daddr,
 				  dif) < hiscore)) {
 			sock_put(result);
+			select_ok = false;
 			goto begin;
 		}
 	}
@@ -450,34 +464,74 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)
 }
 EXPORT_SYMBOL_GPL(inet_ehash_nolisten);
 
-void __inet_hash(struct sock *sk, struct sock *osk)
+static int inet_reuseport_add_sock(struct sock *sk,
+				   struct inet_listen_hashbucket *ilb,
+				   int (*saddr_same)(const struct sock *sk1,
+						     const struct sock *sk2,
+						     bool match_wildcard))
+{
+	struct sock *sk2;
+	struct hlist_nulls_node *node;
+	kuid_t uid = sock_i_uid(sk);
+
+	sk_nulls_for_each_rcu(sk2, node, &ilb->head) {
+		if (sk2 != sk &&
+		    sk2->sk_family == sk->sk_family &&
+		    ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
+		    sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
+		    sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
+		    saddr_same(sk, sk2, false))
+			return reuseport_add_sock(sk, sk2);
+	}
+
+	/* Initial allocation may have already happened via setsockopt */
+	if (!rcu_access_pointer(sk->sk_reuseport_cb))
+		return reuseport_alloc(sk);
+	return 0;
+}
+
+int __inet_hash(struct sock *sk, struct sock *osk,
+		 int (*saddr_same)(const struct sock *sk1,
+				   const struct sock *sk2,
+				   bool match_wildcard))
 {
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
 	struct inet_listen_hashbucket *ilb;
+	int err = 0;
 
 	if (sk->sk_state != TCP_LISTEN) {
 		inet_ehash_nolisten(sk, osk);
-		return;
+		return 0;
 	}
 	WARN_ON(!sk_unhashed(sk));
 	ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
 
 	spin_lock(&ilb->lock);
+	if (sk->sk_reuseport) {
+		err = inet_reuseport_add_sock(sk, ilb, saddr_same);
+		if (err)
+			goto unlock;
+	}
 	__sk_nulls_add_node_rcu(sk, &ilb->head);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+unlock:
 	spin_unlock(&ilb->lock);
+
+	return err;
 }
 EXPORT_SYMBOL(__inet_hash);
 
 int inet_hash(struct sock *sk)
 {
+	int err = 0;
+
 	if (sk->sk_state != TCP_CLOSE) {
 		local_bh_disable();
-		__inet_hash(sk, NULL);
+		err = __inet_hash(sk, NULL, ipv4_rcv_saddr_equal);
 		local_bh_enable();
 	}
 
-	return 0;
+	return err;
 }
 EXPORT_SYMBOL_GPL(inet_hash);
 
@@ -496,6 +550,8 @@ void inet_unhash(struct sock *sk)
 		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
 
 	spin_lock_bh(lock);
+	if (rcu_access_pointer(sk->sk_reuseport_cb))
+		reuseport_detach_sock(sk);
 	done = __sk_nulls_del_node_init_rcu(sk);
 	if (done)
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index be0b21852b13..ac3cedb25a9f 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -356,8 +356,8 @@ EXPORT_SYMBOL(udp_lib_get_port);
  * match_wildcard == false: addresses must be exactly the same, i.e.
  *                          0.0.0.0 only equals to 0.0.0.0
  */
-static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2,
-				bool match_wildcard)
+int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2,
+			 bool match_wildcard)
 {
 	struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2);
 
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 36c3f0155010..532c3ef282c5 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -26,6 +26,7 @@
 #include <net/ip6_route.h>
 #include <net/sock.h>
 #include <net/inet6_connection_sock.h>
+#include <net/sock_reuseport.h>
 
 int inet6_csk_bind_conflict(const struct sock *sk,
 			    const struct inet_bind_bucket *tb, bool relax)
@@ -48,6 +49,7 @@ int inet6_csk_bind_conflict(const struct sock *sk,
 			if ((!reuse || !sk2->sk_reuse ||
 			     sk2->sk_state == TCP_LISTEN) &&
 			    (!reuseport || !sk2->sk_reuseport ||
+			     rcu_access_pointer(sk->sk_reuseport_cb) ||
 			     (sk2->sk_state != TCP_TIME_WAIT &&
 			      !uid_eq(uid,
 				      sock_i_uid((struct sock *)sk2))))) {
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 004345d26808..70f2628be6fa 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -17,11 +17,13 @@
 #include <linux/module.h>
 #include <linux/random.h>
 
+#include <net/addrconf.h>
 #include <net/inet_connection_sock.h>
 #include <net/inet_hashtables.h>
 #include <net/inet6_hashtables.h>
 #include <net/secure_seq.h>
 #include <net/ip.h>
+#include <net/sock_reuseport.h>
 
 u32 inet6_ehashfn(const struct net *net,
 		  const struct in6_addr *laddr, const u16 lport,
@@ -131,6 +133,7 @@ struct sock *inet6_lookup_listener(struct net *net,
 	const struct hlist_nulls_node *node;
 	struct sock *result;
 	int score, hiscore, matches = 0, reuseport = 0;
+	bool select_ok = true;
 	u32 phash = 0;
 	unsigned int hash = inet_lhashfn(net, hnum);
 	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
@@ -148,6 +151,15 @@ begin:
 			if (reuseport) {
 				phash = inet6_ehashfn(net, daddr, hnum,
 						      saddr, sport);
+				if (select_ok) {
+					struct sock *sk2;
+					sk2 = reuseport_select_sock(sk, phash,
+								    skb, doff);
+					if (sk2) {
+						result = sk2;
+						goto found;
+					}
+				}
 				matches = 1;
 			}
 		} else if (score == hiscore && reuseport) {
@@ -165,11 +177,13 @@ begin:
 	if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
 		goto begin;
 	if (result) {
+found:
 		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
 			result = NULL;
 		else if (unlikely(compute_score(result, net, hnum, daddr,
 				  dif) < hiscore)) {
 			sock_put(result);
+			select_ok = false;
 			goto begin;
 		}
 	}
@@ -283,7 +297,7 @@ int inet6_hash(struct sock *sk)
 {
 	if (sk->sk_state != TCP_CLOSE) {
 		local_bh_disable();
-		__inet_hash(sk, NULL);
+		__inet_hash(sk, NULL, ipv6_rcv_saddr_equal);
 		local_bh_enable();
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From dc599f76c22b0de55a2a9141dfe52e70b32a1194 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Tue, 2 Feb 2016 08:17:07 -0800
Subject: net: Add support for filtering link dump by master device and kind

Add support for filtering link dumps by master device and kind, similar
to the filtering implemented for neighbor dumps.

Each net_device that exists adds between 1196 bytes (eth) and 1556 bytes
(bridge) to the link dump. As the number of interfaces increases so does
the amount of data pushed to user space for a link list. If the user
only wants to see a list of specific devices (e.g., interfaces enslaved
to a specific bridge or a list of VRFs) most of that data is thrown away.
Passing the filters to the kernel to have only relevant data returned
makes the dump more efficient.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Acked-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 67 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 20d71358c143..62737f437c8e 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1414,6 +1414,58 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
 	[IFLA_PORT_RESPONSE]	= { .type = NLA_U16, },
 };
 
+static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla)
+{
+	const struct rtnl_link_ops *ops = NULL;
+	struct nlattr *linfo[IFLA_INFO_MAX + 1];
+
+	if (nla_parse_nested(linfo, IFLA_INFO_MAX, nla, ifla_info_policy) < 0)
+		return NULL;
+
+	if (linfo[IFLA_INFO_KIND]) {
+		char kind[MODULE_NAME_LEN];
+
+		nla_strlcpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind));
+		ops = rtnl_link_ops_get(kind);
+	}
+
+	return ops;
+}
+
+static bool link_master_filtered(struct net_device *dev, int master_idx)
+{
+	struct net_device *master;
+
+	if (!master_idx)
+		return false;
+
+	master = netdev_master_upper_dev_get(dev);
+	if (!master || master->ifindex != master_idx)
+		return true;
+
+	return false;
+}
+
+static bool link_kind_filtered(const struct net_device *dev,
+			       const struct rtnl_link_ops *kind_ops)
+{
+	if (kind_ops && dev->rtnl_link_ops != kind_ops)
+		return true;
+
+	return false;
+}
+
+static bool link_dump_filtered(struct net_device *dev,
+			       int master_idx,
+			       const struct rtnl_link_ops *kind_ops)
+{
+	if (link_master_filtered(dev, master_idx) ||
+	    link_kind_filtered(dev, kind_ops))
+		return true;
+
+	return false;
+}
+
 static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct net *net = sock_net(skb->sk);
@@ -1423,6 +1475,9 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 	struct hlist_head *head;
 	struct nlattr *tb[IFLA_MAX+1];
 	u32 ext_filter_mask = 0;
+	const struct rtnl_link_ops *kind_ops = NULL;
+	unsigned int flags = NLM_F_MULTI;
+	int master_idx = 0;
 	int err;
 	int hdrlen;
 
@@ -1445,18 +1500,29 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 
 		if (tb[IFLA_EXT_MASK])
 			ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
+
+		if (tb[IFLA_MASTER])
+			master_idx = nla_get_u32(tb[IFLA_MASTER]);
+
+		if (tb[IFLA_LINKINFO])
+			kind_ops = linkinfo_to_kind_ops(tb[IFLA_LINKINFO]);
+
+		if (master_idx || kind_ops)
+			flags |= NLM_F_DUMP_FILTERED;
 	}
 
 	for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
 		idx = 0;
 		head = &net->dev_index_head[h];
 		hlist_for_each_entry(dev, head, index_hlist) {
+			if (link_dump_filtered(dev, master_idx, kind_ops))
+				continue;
 			if (idx < s_idx)
 				goto cont;
 			err = rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK,
 					       NETLINK_CB(cb->skb).portid,
 					       cb->nlh->nlmsg_seq, 0,
-					       NLM_F_MULTI,
+					       flags,
 					       ext_filter_mask);
 			/* If we ran out of room on the first message,
 			 * we're in trouble
-- 
cgit v1.2.3-59-g8ed1b


From 12b74dfadb5a7a23baf4db941dc9fd9d371f249a Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 4 Feb 2016 13:31:17 +0100
Subject: ipv4: add option to drop unicast encapsulated in L2 multicast

In order to solve a problem with 802.11, the so-called hole-196 attack,
add an option (sysctl) called "drop_unicast_in_l2_multicast" which, if
enabled, causes the stack to drop IPv4 unicast packets encapsulated in
link-layer multi- or broadcast frames. Such frames can (as an attack)
be created by any member of the same wireless network and transmitted
as valid encrypted frames since the symmetric key for broadcast frames
is shared between all stations.

Additionally, enabling this option provides compliance with a SHOULD
clause of RFC 1122.

Reviewed-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  7 +++++++
 include/uapi/linux/ip.h                |  1 +
 net/ipv4/devinet.c                     |  2 ++
 net/ipv4/ip_input.c                    | 25 ++++++++++++++++++++++++-
 4 files changed, 34 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 73b36d7c7b0d..d5910d63214d 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1216,6 +1216,13 @@ promote_secondaries - BOOLEAN
 	promote a corresponding secondary IP address instead of
 	removing all the corresponding secondary IP addresses.
 
+drop_unicast_in_l2_multicast - BOOLEAN
+	Drop any unicast IP packets that are received in link-layer
+	multicast (or broadcast) frames.
+	This behavior (for multicast) is actually a SHOULD in RFC
+	1122, but is disabled by default for compatibility reasons.
+	Default: off (0)
+
 
 tag - INTEGER
 	Allows you to write a number, which can be used as required.
diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h
index 08f894d2ddbd..584834f7e95c 100644
--- a/include/uapi/linux/ip.h
+++ b/include/uapi/linux/ip.h
@@ -165,6 +165,7 @@ enum
 	IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL,
 	IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL,
 	IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
+	IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST,
 	__IPV4_DEVCONF_MAX
 };
 
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index cebd9d31e65a..dbbab28a52a4 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -2192,6 +2192,8 @@ static struct devinet_sysctl_table {
 					      "promote_secondaries"),
 		DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET,
 					      "route_localnet"),
+		DEVINET_SYSCTL_FLUSHING_ENTRY(DROP_UNICAST_IN_L2_MULTICAST,
+					      "drop_unicast_in_l2_multicast"),
 	},
 };
 
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index d77eb0c3b684..852002f64c68 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -362,8 +362,31 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 	rt = skb_rtable(skb);
 	if (rt->rt_type == RTN_MULTICAST) {
 		IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INMCAST, skb->len);
-	} else if (rt->rt_type == RTN_BROADCAST)
+	} else if (rt->rt_type == RTN_BROADCAST) {
 		IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INBCAST, skb->len);
+	} else if (skb->pkt_type == PACKET_BROADCAST ||
+		   skb->pkt_type == PACKET_MULTICAST) {
+		struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+
+		/* RFC 1122 3.3.6:
+		 *
+		 *   When a host sends a datagram to a link-layer broadcast
+		 *   address, the IP destination address MUST be a legal IP
+		 *   broadcast or IP multicast address.
+		 *
+		 *   A host SHOULD silently discard a datagram that is received
+		 *   via a link-layer broadcast (see Section 2.4) but does not
+		 *   specify an IP multicast or broadcast destination address.
+		 *
+		 * This doesn't explicitly say L2 *broadcast*, but broadcast is
+		 * in a way a form of multicast and the most common use case for
+		 * this is 802.11 protecting against cross-station spoofing (the
+		 * so-called "hole-196" attack) so do it for both.
+		 */
+		if (in_dev &&
+		    IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST))
+			goto drop;
+	}
 
 	return dst_input(skb);
 
-- 
cgit v1.2.3-59-g8ed1b


From 97daf331455077645ae1f13438bebd3d1a2e94ee Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 4 Feb 2016 13:31:18 +0100
Subject: ipv4: add option to drop gratuitous ARP packets

In certain 802.11 wireless deployments, there will be ARP proxies
that use knowledge of the network to correctly answer requests.
To prevent gratuitous ARP frames on the shared medium from being
a problem, on such deployments wireless needs to drop them.

Enable this by providing an option called "drop_gratuitous_arp".

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 6 ++++++
 include/uapi/linux/ip.h                | 1 +
 net/ipv4/arp.c                         | 8 ++++++++
 net/ipv4/devinet.c                     | 2 ++
 4 files changed, 17 insertions(+)

(limited to 'net')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index d5910d63214d..a53bbfaff1c7 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1223,6 +1223,12 @@ drop_unicast_in_l2_multicast - BOOLEAN
 	1122, but is disabled by default for compatibility reasons.
 	Default: off (0)
 
+drop_gratuitous_arp - BOOLEAN
+	Drop all gratuitous ARP frames, for example if there's a known
+	good ARP proxy on the network and such frames need not be used
+	(or in the case of 802.11, must not be used to prevent attacks.)
+	Default: off (0)
+
 
 tag - INTEGER
 	Allows you to write a number, which can be used as required.
diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h
index 584834f7e95c..f291569768dd 100644
--- a/include/uapi/linux/ip.h
+++ b/include/uapi/linux/ip.h
@@ -166,6 +166,7 @@ enum
 	IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL,
 	IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
 	IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST,
+	IPV4_DEVCONF_DROP_GRATUITOUS_ARP,
 	__IPV4_DEVCONF_MAX
 };
 
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 59b3e0e8fd51..c102eb5ac55c 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -735,6 +735,14 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
 	    (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip)))
 		goto out;
 
+ /*
+  *	For some 802.11 wireless deployments (and possibly other networks),
+  *	there will be an ARP proxy and gratuitous ARP frames are attacks
+  *	and thus should not be accepted.
+  */
+	if (sip == tip && IN_DEV_ORCONF(in_dev, DROP_GRATUITOUS_ARP))
+		goto out;
+
 /*
  *     Special case: We must set Frame Relay source Q.922 address
  */
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index dbbab28a52a4..3d835313575e 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -2185,6 +2185,8 @@ static struct devinet_sysctl_table {
 					"igmpv3_unsolicited_report_interval"),
 		DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN,
 					"ignore_routes_with_linkdown"),
+		DEVINET_SYSCTL_RW_ENTRY(DROP_GRATUITOUS_ARP,
+					"drop_gratuitous_arp"),
 
 		DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
 		DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
-- 
cgit v1.2.3-59-g8ed1b


From abbc30436d39dfed8ebfca338d253f211ac7b094 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 4 Feb 2016 13:31:19 +0100
Subject: ipv6: add option to drop unicast encapsulated in L2 multicast

In order to solve a problem with 802.11, the so-called hole-196 attack,
add an option (sysctl) called "drop_unicast_in_l2_multicast" which, if
enabled, causes the stack to drop IPv6 unicast packets encapsulated in
link-layer multi- or broadcast frames. Such frames can (as an attack)
be created by any member of the same wireless network and transmitted
as valid encrypted frames since the symmetric key for broadcast frames
is shared between all stations.

Reviewed-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  6 ++++++
 include/linux/ipv6.h                   |  1 +
 include/uapi/linux/ipv6.h              |  1 +
 net/ipv6/addrconf.c                    |  8 ++++++++
 net/ipv6/ip6_input.c                   | 10 ++++++++++
 5 files changed, 26 insertions(+)

(limited to 'net')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index a53bbfaff1c7..e0e7350a4e6a 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1674,6 +1674,12 @@ stable_secret - IPv6 address
 
 	By default the stable secret is unset.
 
+drop_unicast_in_l2_multicast - BOOLEAN
+	Drop any unicast IPv6 packets that are received in link-layer
+	multicast (or broadcast) frames.
+
+	By default this is turned off.
+
 icmp/*:
 ratelimit - INTEGER
 	Limit the maximal rates for sending ICMPv6 packets.
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 402753bccafa..4a4c1ae826cb 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -50,6 +50,7 @@ struct ipv6_devconf {
 	__s32		mc_forwarding;
 #endif
 	__s32		disable_ipv6;
+	__s32		drop_unicast_in_l2_multicast;
 	__s32		accept_dad;
 	__s32		force_tllao;
 	__s32           ndisc_notify;
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index 38b4fef20219..4c413570efe8 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -174,6 +174,7 @@ enum {
 	DEVCONF_USE_OIF_ADDRS_ONLY,
 	DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT,
 	DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
+	DEVCONF_DROP_UNICAST_IN_L2_MULTICAST,
 	DEVCONF_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 38eeddedfc21..23e325f39f8e 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4711,6 +4711,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN] = cnf->ignore_routes_with_linkdown;
 	/* we omit DEVCONF_STABLE_SECRET for now */
 	array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only;
+	array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = cnf->drop_unicast_in_l2_multicast;
 }
 
 static inline size_t inet6_ifla6_size(void)
@@ -5784,6 +5785,13 @@ static struct addrconf_sysctl_table
 			.mode		= 0644,
 			.proc_handler	= addrconf_sysctl_ignore_routes_with_linkdown,
 		},
+		{
+			.procname	= "drop_unicast_in_l2_multicast",
+			.data		= &ipv6_devconf.drop_unicast_in_l2_multicast,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
 		{
 			/* sentinel */
 		}
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 9075acf081dd..31ac3c56da4b 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -134,6 +134,16 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
 	    IPV6_ADDR_MC_SCOPE(&hdr->daddr) == 1)
 		goto err;
 
+	/* If enabled, drop unicast packets that were encapsulated in link-layer
+	 * multicast or broadcast to protected against the so-called "hole-196"
+	 * attack in 802.11 wireless.
+	 */
+	if (!ipv6_addr_is_multicast(&hdr->daddr) &&
+	    (skb->pkt_type == PACKET_BROADCAST ||
+	     skb->pkt_type == PACKET_MULTICAST) &&
+	    idev->cnf.drop_unicast_in_l2_multicast)
+		goto err;
+
 	/* RFC4291 2.7
 	 * Nodes must not originate a packet to a multicast address whose scope
 	 * field contains the reserved value 0; if such a packet is received, it
-- 
cgit v1.2.3-59-g8ed1b


From 7a02bf892d8f1e5298af1676f001bee410509d80 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 4 Feb 2016 13:31:20 +0100
Subject: ipv6: add option to drop unsolicited neighbor advertisements

In certain 802.11 wireless deployments, there will be NA proxies
that use knowledge of the network to correctly answer requests.
To prevent unsolicitd advertisements on the shared medium from
being a problem, on such deployments wireless needs to drop them.

Enable this by providing an option called "drop_unsolicited_na".

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 7 +++++++
 include/linux/ipv6.h                   | 1 +
 include/uapi/linux/ipv6.h              | 1 +
 net/ipv6/addrconf.c                    | 8 ++++++++
 net/ipv6/ndisc.c                       | 9 +++++++++
 5 files changed, 26 insertions(+)

(limited to 'net')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index e0e7350a4e6a..24ce97f42d35 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1680,6 +1680,13 @@ drop_unicast_in_l2_multicast - BOOLEAN
 
 	By default this is turned off.
 
+drop_unsolicited_na - BOOLEAN
+	Drop all unsolicited neighbor advertisements, for example if there's
+	a known good NA proxy on the network and such frames need not be used
+	(or in the case of 802.11, must not be used to prevent attacks.)
+
+	By default this is turned off.
+
 icmp/*:
 ratelimit - INTEGER
 	Limit the maximal rates for sending ICMPv6 packets.
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 4a4c1ae826cb..4b2267e1b7c3 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -56,6 +56,7 @@ struct ipv6_devconf {
 	__s32           ndisc_notify;
 	__s32		suppress_frag_ndisc;
 	__s32		accept_ra_mtu;
+	__s32		drop_unsolicited_na;
 	struct ipv6_stable_secret {
 		bool initialized;
 		struct in6_addr secret;
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index 4c413570efe8..ec117b65d5a5 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -175,6 +175,7 @@ enum {
 	DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT,
 	DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
 	DEVCONF_DROP_UNICAST_IN_L2_MULTICAST,
+	DEVCONF_DROP_UNSOLICITED_NA,
 	DEVCONF_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 23e325f39f8e..ac0ba9e4e06b 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4712,6 +4712,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	/* we omit DEVCONF_STABLE_SECRET for now */
 	array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only;
 	array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = cnf->drop_unicast_in_l2_multicast;
+	array[DEVCONF_DROP_UNSOLICITED_NA] = cnf->drop_unsolicited_na;
 }
 
 static inline size_t inet6_ifla6_size(void)
@@ -5792,6 +5793,13 @@ static struct addrconf_sysctl_table
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec,
 		},
+		{
+			.procname	= "drop_unsolicited_na",
+			.data		= &ipv6_devconf.drop_unsolicited_na,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
 		{
 			/* sentinel */
 		}
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 84afb9a77278..c245895a3d41 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -883,6 +883,7 @@ static void ndisc_recv_na(struct sk_buff *skb)
 				    offsetof(struct nd_msg, opt));
 	struct ndisc_options ndopts;
 	struct net_device *dev = skb->dev;
+	struct inet6_dev *idev = __in6_dev_get(dev);
 	struct inet6_ifaddr *ifp;
 	struct neighbour *neigh;
 
@@ -902,6 +903,14 @@ static void ndisc_recv_na(struct sk_buff *skb)
 		return;
 	}
 
+	/* For some 802.11 wireless deployments (and possibly other networks),
+	 * there will be a NA proxy and unsolicitd packets are attacks
+	 * and thus should not be accepted.
+	 */
+	if (!msg->icmph.icmp6_solicited && idev &&
+	    idev->cnf.drop_unsolicited_na)
+		return;
+
 	if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) {
 		ND_PRINTK(2, warn, "NS: invalid ND option\n");
 		return;
-- 
cgit v1.2.3-59-g8ed1b


From bef3c6c9374da40ec63698cd504474366a9de2ff Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Fri, 5 Feb 2016 15:27:31 -0800
Subject: net: Drop unecessary enc_features variable from tunnel segmentation
 functions

The enc_features variable isn't necessary since features isn't used
anywhere after we create enc_features so instead just use a destructive AND
on features itself and save ourselves the variable declaration.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Acked-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/gre_offload.c | 6 +++---
 net/ipv4/udp_offload.c | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 5a8ee3282550..02cb1a416c7d 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -19,7 +19,6 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 				       netdev_features_t features)
 {
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
-	netdev_features_t enc_features;
 	int ghl;
 	struct gre_base_hdr *greh;
 	u16 mac_offset = skb->mac_header;
@@ -68,9 +67,10 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 	skb_set_network_header(skb, skb_inner_network_offset(skb));
 	skb->mac_len = skb_inner_network_offset(skb);
 
+	features &= skb->dev->hw_enc_features;
+
 	/* segment inner packet. */
-	enc_features = skb->dev->hw_enc_features & features;
-	segs = skb_mac_gso_segment(skb, enc_features);
+	segs = skb_mac_gso_segment(skb, features);
 	if (IS_ERR_OR_NULL(segs)) {
 		skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len);
 		goto out;
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 4c519c1dc161..ce64c2b7ba55 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -37,7 +37,6 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 	int mac_len = skb->mac_len;
 	int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
 	__be16 protocol = skb->protocol;
-	netdev_features_t enc_features;
 	int udp_offset, outer_hlen;
 	unsigned int oldlen;
 	bool need_csum = !!(skb_shinfo(skb)->gso_type &
@@ -65,9 +64,10 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 			   (skb->dev->features & (is_ipv6 ?
 			    NETIF_F_IPV6_CSUM : NETIF_F_IP_CSUM))));
 
+	features &= skb->dev->hw_enc_features;
+
 	/* segment inner packet. */
-	enc_features = skb->dev->hw_enc_features & features;
-	segs = gso_inner_segment(skb, enc_features);
+	segs = gso_inner_segment(skb, features);
 	if (IS_ERR_OR_NULL(segs)) {
 		skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset,
 				     mac_len);
-- 
cgit v1.2.3-59-g8ed1b


From 76443456227097179c14826425f88a95d81a892e Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Fri, 5 Feb 2016 15:27:37 -0800
Subject: net: Move GSO csum into SKB_GSO_CB

This patch moves the checksum maintained by GSO out of skb->csum and into
the GSO context block in order to allow for us to work on outer checksums
while maintaining the inner checksum offsets in the case of the inner
checksum being offloaded, while the outer checksums will be computed.

While updating the code I also did a minor cleanu-up on gso_make_checksum.
The change is mostly to make it so that we store the values and compute the
checksum instead of computing the checksum and then storing the values we
needed to update.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Acked-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 14 +++++++-------
 net/core/skbuff.c      | 16 +++++++++-------
 2 files changed, 16 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 11f935c1a090..acece7ce376f 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3549,6 +3549,7 @@ static inline struct sec_path *skb_sec_path(struct sk_buff *skb)
 struct skb_gso_cb {
 	int	mac_offset;
 	int	encap_level;
+	__wsum	csum;
 	__u16	csum_start;
 };
 #define SKB_SGO_CB_OFFSET	32
@@ -3585,15 +3586,14 @@ static inline int gso_pskb_expand_head(struct sk_buff *skb, int extra)
  */
 static inline __sum16 gso_make_checksum(struct sk_buff *skb, __wsum res)
 {
-	int plen = SKB_GSO_CB(skb)->csum_start - skb_headroom(skb) -
-		   skb_transport_offset(skb);
-	__wsum partial;
+	unsigned char *csum_start = skb_transport_header(skb);
+	int plen = (skb->head + SKB_GSO_CB(skb)->csum_start) - csum_start;
+	__wsum partial = SKB_GSO_CB(skb)->csum;
 
-	partial = csum_partial(skb_transport_header(skb), plen, skb->csum);
-	skb->csum = res;
-	SKB_GSO_CB(skb)->csum_start -= plen;
+	SKB_GSO_CB(skb)->csum = res;
+	SKB_GSO_CB(skb)->csum_start = csum_start - skb->head;
 
-	return csum_fold(partial);
+	return csum_fold(csum_partial(csum_start, plen, partial));
 }
 
 static inline bool skb_is_gso(const struct sk_buff *skb)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b2df375ec9c2..02c638a643ea 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3100,11 +3100,12 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
 
 		if (!sg && !nskb->remcsum_offload) {
 			nskb->ip_summed = CHECKSUM_NONE;
-			nskb->csum = skb_copy_and_csum_bits(head_skb, offset,
-							    skb_put(nskb, len),
-							    len, 0);
+			SKB_GSO_CB(nskb)->csum =
+				skb_copy_and_csum_bits(head_skb, offset,
+						       skb_put(nskb, len),
+						       len, 0);
 			SKB_GSO_CB(nskb)->csum_start =
-			    skb_headroom(nskb) + doffset;
+				skb_headroom(nskb) + doffset;
 			continue;
 		}
 
@@ -3171,11 +3172,12 @@ skip_fraglist:
 
 perform_csum_check:
 		if (!csum && !nskb->remcsum_offload) {
-			nskb->csum = skb_checksum(nskb, doffset,
-						  nskb->len - doffset, 0);
 			nskb->ip_summed = CHECKSUM_NONE;
+			SKB_GSO_CB(nskb)->csum =
+				skb_checksum(nskb, doffset,
+					     nskb->len - doffset, 0);
 			SKB_GSO_CB(nskb)->csum_start =
-			    skb_headroom(nskb) + doffset;
+				skb_headroom(nskb) + doffset;
 		}
 	} while ((offset += len) < head_skb->len);
 
-- 
cgit v1.2.3-59-g8ed1b


From 7fbeffed77c130ecf64e8a2f7f9d6d63a9d60a19 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Fri, 5 Feb 2016 15:27:43 -0800
Subject: net: Update remote checksum segmentation to support use of GSO
 checksum

This patch addresses two main issues.

First in the case of remote checksum offload we were avoiding dealing with
scatter-gather issues.  As a result it would be possible to assemble a
series of frames that used frags instead of being linearized as they should
have if remote checksum offload was enabled.

Second I have updated the code so that we now let GSO take care of doing
the checksum on the data itself and drop the special case that was added
for remote checksum offload.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c      | 10 ++++++----
 net/ipv4/udp_offload.c | 22 ++++++++++------------
 2 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'net')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 02c638a643ea..9c065ac72e87 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3098,8 +3098,9 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
 		if (nskb->len == len + doffset)
 			goto perform_csum_check;
 
-		if (!sg && !nskb->remcsum_offload) {
-			nskb->ip_summed = CHECKSUM_NONE;
+		if (!sg) {
+			if (!nskb->remcsum_offload)
+				nskb->ip_summed = CHECKSUM_NONE;
 			SKB_GSO_CB(nskb)->csum =
 				skb_copy_and_csum_bits(head_skb, offset,
 						       skb_put(nskb, len),
@@ -3171,8 +3172,9 @@ skip_fraglist:
 		nskb->truesize += nskb->data_len;
 
 perform_csum_check:
-		if (!csum && !nskb->remcsum_offload) {
-			nskb->ip_summed = CHECKSUM_NONE;
+		if (!csum) {
+			if (!nskb->remcsum_offload)
+				nskb->ip_summed = CHECKSUM_NONE;
 			SKB_GSO_CB(nskb)->csum =
 				skb_checksum(nskb, doffset,
 					     nskb->len - doffset, 0);
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index ce64c2b7ba55..86687f58d613 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -66,6 +66,16 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 
 	features &= skb->dev->hw_enc_features;
 
+	/* The only checksum offload we care about from here on out is the
+	 * outer one so strip the existing checksum feature flags and
+	 * instead set the flag based on our outer checksum offload value.
+	 */
+	if (remcsum) {
+		features &= ~NETIF_F_CSUM_MASK;
+		if (offload_csum)
+			features |= NETIF_F_HW_CSUM;
+	}
+
 	/* segment inner packet. */
 	segs = gso_inner_segment(skb, features);
 	if (IS_ERR_OR_NULL(segs)) {
@@ -116,18 +126,6 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 			skb->ip_summed = CHECKSUM_PARTIAL;
 			skb->csum_start = skb_transport_header(skb) - skb->head;
 			skb->csum_offset = offsetof(struct udphdr, check);
-		} else if (remcsum) {
-			/* Need to calculate checksum from scratch,
-			 * inner checksums are never when doing
-			 * remote_checksum_offload.
-			 */
-
-			skb->csum = skb_checksum(skb, udp_offset,
-						 skb->len - udp_offset,
-						 0);
-			uh->check = csum_fold(skb->csum);
-			if (uh->check == 0)
-				uh->check = CSUM_MANGLED_0;
 		} else {
 			uh->check = gso_make_checksum(skb, ~uh->check);
 
-- 
cgit v1.2.3-59-g8ed1b


From 08b64fcca942733413bc5ac2321d57021d3e8578 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Fri, 5 Feb 2016 15:27:49 -0800
Subject: net: Store checksum result for offloaded GSO checksums

This patch makes it so that we can offload the checksums for a packet up
to a certain point and then begin computing the checksums via software.
Setting this up is fairly straight forward as all we need to do is reset
the values stored in csum and csum_start for the GSO context block.

One complication for this is remote checksum offload.  In order to allow
the inner checksums to be offloaded while computing the outer checksum
manually we needed to have some way of indicating that the offload wasn't
real.  In order to do that I replaced CHECKSUM_PARTIAL with
CHECKSUM_UNNECESSARY in the case of us computing checksums for the outer
header while skipping computing checksums for the inner headers.  We clean
up the ip_summed flag and set it to either CHECKSUM_PARTIAL or
CHECKSUM_NONE once we hand the packet off to the next lower level.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 15 +++++++++++++++
 net/ipv4/tcp_offload.c |  8 ++++++--
 2 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index acece7ce376f..a8fc2220e8ce 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2161,6 +2161,11 @@ static inline int skb_checksum_start_offset(const struct sk_buff *skb)
 	return skb->csum_start - skb_headroom(skb);
 }
 
+static inline unsigned char *skb_checksum_start(const struct sk_buff *skb)
+{
+	return skb->head + skb->csum_start;
+}
+
 static inline int skb_transport_offset(const struct sk_buff *skb)
 {
 	return skb_transport_header(skb) - skb->data;
@@ -3576,6 +3581,16 @@ static inline int gso_pskb_expand_head(struct sk_buff *skb, int extra)
 	return 0;
 }
 
+static inline void gso_reset_checksum(struct sk_buff *skb, __wsum res)
+{
+	/* Do not update partial checksums if remote checksum is enabled. */
+	if (skb->remcsum_offload)
+		return;
+
+	SKB_GSO_CB(skb)->csum = res;
+	SKB_GSO_CB(skb)->csum_start = skb_checksum_start(skb) - skb->head;
+}
+
 /* Compute the checksum for a gso segment. First compute the checksum value
  * from the start of transport header to SKB_GSO_CB(skb)->csum_start, and
  * then add in skb->csum (checksum from csum_start to end of packet).
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 9864a2dbadce..773083b7f1e9 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -135,7 +135,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 		th->fin = th->psh = 0;
 		th->check = newcheck;
 
-		if (skb->ip_summed != CHECKSUM_PARTIAL)
+		if (skb->ip_summed == CHECKSUM_PARTIAL)
+			gso_reset_checksum(skb, ~th->check);
+		else
 			th->check = gso_make_checksum(skb, ~th->check);
 
 		seq += mss;
@@ -169,7 +171,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 		      skb->data_len);
 	th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
 				(__force u32)delta));
-	if (skb->ip_summed != CHECKSUM_PARTIAL)
+	if (skb->ip_summed == CHECKSUM_PARTIAL)
+		gso_reset_checksum(skb, ~th->check);
+	else
 		th->check = gso_make_checksum(skb, ~th->check);
 out:
 	return segs;
-- 
cgit v1.2.3-59-g8ed1b


From ddff00d420432d54eb420bb33034bb8e22dd2543 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Fri, 5 Feb 2016 15:27:55 -0800
Subject: net: Move skb_has_shared_frag check out of GRE code and into
 segmentation

The call skb_has_shared_frag is used in the GRE path and skb_checksum_help
to verify that no frags can be modified by an external entity.  This check
really doesn't belong in the GRE path but in the skb_segment function
itself.  This way any protocol that might be segmented will be performing
this check before attempting to offload a checksum to software.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Acked-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c      |  5 +++++
 net/ipv4/gre_offload.c | 11 -----------
 2 files changed, 5 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 9c065ac72e87..88262c82b96a 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3173,6 +3173,11 @@ skip_fraglist:
 
 perform_csum_check:
 		if (!csum) {
+			if (skb_has_shared_frag(nskb)) {
+				err = __skb_linearize(nskb);
+				if (err)
+					goto err;
+			}
 			if (!nskb->remcsum_offload)
 				nskb->ip_summed = CHECKSUM_NONE;
 			SKB_GSO_CB(nskb)->csum =
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 02cb1a416c7d..35a8dd35ed4e 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -83,17 +83,6 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 		if (csum) {
 			__be32 *pcsum;
 
-			if (skb_has_shared_frag(skb)) {
-				int err;
-
-				err = __skb_linearize(skb);
-				if (err) {
-					kfree_skb_list(segs);
-					segs = ERR_PTR(err);
-					goto out;
-				}
-			}
-
 			skb_reset_transport_header(skb);
 
 			greh = (struct gre_base_hdr *)
-- 
cgit v1.2.3-59-g8ed1b


From 2e598af7135d1434b0ebf2e9c7ff8e014f48d572 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Fri, 5 Feb 2016 15:28:01 -0800
Subject: gre: Use GSO flags to determine csum need instead of GRE flags

This patch updates the gre checksum path to follow something much closer to
the UDP checksum path.  By doing this we can avoid needing to do as much
header inspection and can just make use of the fields we were already
reading in the sk_buff structure.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/gre_offload.c | 64 +++++++++++++++++++++++---------------------------
 1 file changed, 30 insertions(+), 34 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 35a8dd35ed4e..c15441b5ff61 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -18,14 +18,14 @@
 static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 				       netdev_features_t features)
 {
+	int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
-	int ghl;
 	struct gre_base_hdr *greh;
 	u16 mac_offset = skb->mac_header;
-	int mac_len = skb->mac_len;
 	__be16 protocol = skb->protocol;
-	int tnl_hlen;
-	bool csum;
+	u16 mac_len = skb->mac_len;
+	int gre_offset, outer_hlen;
+	bool need_csum;
 
 	if (unlikely(skb_shinfo(skb)->gso_type &
 				~(SKB_GSO_TCPV4 |
@@ -42,64 +42,60 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 	if (!skb->encapsulation)
 		goto out;
 
-	if (unlikely(!pskb_may_pull(skb, sizeof(*greh))))
+	if (unlikely(tnl_hlen < sizeof(struct gre_base_hdr)))
 		goto out;
 
-	greh = (struct gre_base_hdr *)skb_transport_header(skb);
-
-	ghl = skb_inner_mac_header(skb) - skb_transport_header(skb);
-	if (unlikely(ghl < sizeof(*greh)))
+	if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
 		goto out;
 
-	csum = !!(greh->flags & GRE_CSUM);
-	if (csum)
-		skb->encap_hdr_csum = 1;
+	greh = (struct gre_base_hdr *)skb_transport_header(skb);
 
 	/* setup inner skb. */
 	skb->protocol = greh->protocol;
 	skb->encapsulation = 0;
-
-	if (unlikely(!pskb_may_pull(skb, ghl)))
-		goto out;
-
-	__skb_pull(skb, ghl);
+	__skb_pull(skb, tnl_hlen);
 	skb_reset_mac_header(skb);
 	skb_set_network_header(skb, skb_inner_network_offset(skb));
 	skb->mac_len = skb_inner_network_offset(skb);
 
+	need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM);
+	skb->encap_hdr_csum = need_csum;
+
 	features &= skb->dev->hw_enc_features;
 
 	/* segment inner packet. */
 	segs = skb_mac_gso_segment(skb, features);
 	if (IS_ERR_OR_NULL(segs)) {
-		skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len);
+		skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset,
+				     mac_len);
 		goto out;
 	}
 
+	outer_hlen = skb_tnl_header_len(skb);
+	gre_offset = outer_hlen - tnl_hlen;
 	skb = segs;
-	tnl_hlen = skb_tnl_header_len(skb);
 	do {
-		__skb_push(skb, ghl);
-		if (csum) {
-			__be32 *pcsum;
-
-			skb_reset_transport_header(skb);
-
-			greh = (struct gre_base_hdr *)
-			    skb_transport_header(skb);
-			pcsum = (__be32 *)(greh + 1);
-			*pcsum = 0;
-			*(__sum16 *)pcsum = gso_make_checksum(skb, 0);
-		}
-		__skb_push(skb, tnl_hlen - ghl);
+		__be32 *pcsum;
 
 		skb_reset_inner_headers(skb);
 		skb->encapsulation = 1;
 
-		skb_reset_mac_header(skb);
-		skb_set_network_header(skb, mac_len);
 		skb->mac_len = mac_len;
 		skb->protocol = protocol;
+
+		__skb_push(skb, outer_hlen);
+		skb_reset_mac_header(skb);
+		skb_set_network_header(skb, mac_len);
+		skb_set_transport_header(skb, gre_offset);
+
+		if (!need_csum)
+			continue;
+
+		greh = (struct gre_base_hdr *)skb_transport_header(skb);
+		pcsum = (__be32 *)(greh + 1);
+
+		*pcsum = 0;
+		*(__sum16 *)pcsum = gso_make_checksum(skb, 0);
 	} while ((skb = skb->next));
 out:
 	return segs;
-- 
cgit v1.2.3-59-g8ed1b


From 38720352412a4305a108ff37ae6099dab3f44a38 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Fri, 5 Feb 2016 15:28:08 -0800
Subject: gre: Use inner_proto to obtain inner header protocol

Instead of parsing headers to determine the inner protocol we can just pull
the value from inner_proto.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/gre_offload.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index c15441b5ff61..003b0ebbcfdd 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -20,7 +20,6 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 {
 	int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
-	struct gre_base_hdr *greh;
 	u16 mac_offset = skb->mac_header;
 	__be16 protocol = skb->protocol;
 	u16 mac_len = skb->mac_len;
@@ -48,15 +47,13 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 	if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
 		goto out;
 
-	greh = (struct gre_base_hdr *)skb_transport_header(skb);
-
 	/* setup inner skb. */
-	skb->protocol = greh->protocol;
 	skb->encapsulation = 0;
 	__skb_pull(skb, tnl_hlen);
 	skb_reset_mac_header(skb);
 	skb_set_network_header(skb, skb_inner_network_offset(skb));
 	skb->mac_len = skb_inner_network_offset(skb);
+	skb->protocol = skb->inner_protocol;
 
 	need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM);
 	skb->encap_hdr_csum = need_csum;
@@ -75,6 +72,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 	gre_offset = outer_hlen - tnl_hlen;
 	skb = segs;
 	do {
+		struct gre_base_hdr *greh;
 		__be32 *pcsum;
 
 		skb_reset_inner_headers(skb);
-- 
cgit v1.2.3-59-g8ed1b


From fdaefd62fd658b266a8f389cdf7991630b4bc7b4 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Fri, 5 Feb 2016 15:28:14 -0800
Subject: udp: Clean up the use of flags in UDP segmentation offload

This patch goes though and cleans up the logic related to several of the
control flags used in UDP segmentation.  Specifically the use of dont_encap
isn't really needed as we can just check the skb for CHECKSUM_PARTIAL and
if it isn't set then we don't need to update the internal headers.  As such
we can just drop that value.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Acked-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp_offload.c | 37 ++++++++++++++++++-------------------
 1 file changed, 18 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 86687f58d613..9e4816fc9927 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -33,16 +33,13 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 	__be16 new_protocol, bool is_ipv6)
 {
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
+	bool remcsum, need_csum, offload_csum;
 	u16 mac_offset = skb->mac_header;
 	int mac_len = skb->mac_len;
 	int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
 	__be16 protocol = skb->protocol;
 	int udp_offset, outer_hlen;
 	unsigned int oldlen;
-	bool need_csum = !!(skb_shinfo(skb)->gso_type &
-			    SKB_GSO_UDP_TUNNEL_CSUM);
-	bool remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
-	bool offload_csum = false, dont_encap = (need_csum || remcsum);
 
 	oldlen = (u16)~skb->len;
 
@@ -55,14 +52,18 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 	skb_set_network_header(skb, skb_inner_network_offset(skb));
 	skb->mac_len = skb_inner_network_offset(skb);
 	skb->protocol = new_protocol;
+
+	need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM);
 	skb->encap_hdr_csum = need_csum;
+
+	remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
 	skb->remcsum_offload = remcsum;
 
 	/* Try to offload checksum if possible */
 	offload_csum = !!(need_csum &&
-			  ((skb->dev->features & NETIF_F_HW_CSUM) ||
-			   (skb->dev->features & (is_ipv6 ?
-			    NETIF_F_IPV6_CSUM : NETIF_F_IP_CSUM))));
+			  (skb->dev->features &
+			   (is_ipv6 ? (NETIF_F_HW_CSUM | NETIF_F_IPV6_CSUM) :
+				      (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM))));
 
 	features &= skb->dev->hw_enc_features;
 
@@ -92,13 +93,11 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 		int len;
 		__be32 delta;
 
-		if (dont_encap) {
-			skb->encapsulation = 0;
+		if (remcsum)
 			skb->ip_summed = CHECKSUM_NONE;
-		} else {
-			/* Only set up inner headers if we might be offloading
-			 * inner checksum.
-			 */
+
+		/* Set up inner headers if we are offloading inner checksum */
+		if (skb->ip_summed == CHECKSUM_PARTIAL) {
 			skb_reset_inner_headers(skb);
 			skb->encapsulation = 1;
 		}
@@ -122,15 +121,15 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 		uh->check = ~csum_fold((__force __wsum)
 				       ((__force u32)uh->check +
 					(__force u32)delta));
-		if (offload_csum) {
-			skb->ip_summed = CHECKSUM_PARTIAL;
-			skb->csum_start = skb_transport_header(skb) - skb->head;
-			skb->csum_offset = offsetof(struct udphdr, check);
-		} else {
-			uh->check = gso_make_checksum(skb, ~uh->check);
 
+		if (skb->encapsulation || !offload_csum) {
+			uh->check = gso_make_checksum(skb, ~uh->check);
 			if (uh->check == 0)
 				uh->check = CSUM_MANGLED_0;
+		} else {
+			skb->ip_summed = CHECKSUM_PARTIAL;
+			skb->csum_start = skb_transport_header(skb) - skb->head;
+			skb->csum_offset = offsetof(struct udphdr, check);
 		}
 	} while ((skb = skb->next));
 out:
-- 
cgit v1.2.3-59-g8ed1b


From dbef491ebe7f3a4fb1b9111878b86a426fd540b7 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Fri, 5 Feb 2016 15:28:20 -0800
Subject: udp: Use uh->len instead of skb->len to compute checksum in
 segmentation

The segmentation code was having to do a bunch of work to pull the
skb->len and strip the udp header offset before the value could be used to
adjust the checksum.  Instead of doing all this work we can just use the
value that goes into uh->len since that is the correct value with the
correct byte order that we need anyway.  By using this value we can save
ourselves a bunch of pain as there is no need to do multiple byte swaps.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Acked-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp_offload.c | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 9e4816fc9927..56c4c8b88b28 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -32,20 +32,23 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 					     netdev_features_t features),
 	__be16 new_protocol, bool is_ipv6)
 {
+	int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
 	bool remcsum, need_csum, offload_csum;
+	struct udphdr *uh = udp_hdr(skb);
 	u16 mac_offset = skb->mac_header;
-	int mac_len = skb->mac_len;
-	int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
 	__be16 protocol = skb->protocol;
+	u16 mac_len = skb->mac_len;
 	int udp_offset, outer_hlen;
-	unsigned int oldlen;
-
-	oldlen = (u16)~skb->len;
+	u32 partial;
 
 	if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
 		goto out;
 
+	/* adjust partial header checksum to negate old length */
+	partial = (__force u32)uh->check + (__force u16)~uh->len;
+
+	/* setup inner skb. */
 	skb->encapsulation = 0;
 	__skb_pull(skb, tnl_hlen);
 	skb_reset_mac_header(skb);
@@ -89,9 +92,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 	udp_offset = outer_hlen - tnl_hlen;
 	skb = segs;
 	do {
-		struct udphdr *uh;
-		int len;
-		__be32 delta;
+		__be16 len;
 
 		if (remcsum)
 			skb->ip_summed = CHECKSUM_NONE;
@@ -105,22 +106,19 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 		skb->mac_len = mac_len;
 		skb->protocol = protocol;
 
-		skb_push(skb, outer_hlen);
+		__skb_push(skb, outer_hlen);
 		skb_reset_mac_header(skb);
 		skb_set_network_header(skb, mac_len);
 		skb_set_transport_header(skb, udp_offset);
-		len = skb->len - udp_offset;
+		len = htons(skb->len - udp_offset);
 		uh = udp_hdr(skb);
-		uh->len = htons(len);
+		uh->len = len;
 
 		if (!need_csum)
 			continue;
 
-		delta = htonl(oldlen + len);
-
 		uh->check = ~csum_fold((__force __wsum)
-				       ((__force u32)uh->check +
-					(__force u32)delta));
+				       ((__force u32)len + partial));
 
 		if (skb->encapsulation || !offload_csum) {
 			uh->check = gso_make_checksum(skb, ~uh->check);
-- 
cgit v1.2.3-59-g8ed1b


From f245d079c1d11dc6927e56f5a89dd566fef2a415 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Fri, 5 Feb 2016 15:28:26 -0800
Subject: net: Allow tunnels to use inner checksum offloads with outer
 checksums needed

This patch enables us to use inner checksum offloads if provided by
hardware with outer checksums computed by software.

It basically reduces encap_hdr_csum to an advisory flag for now, but based
on the fact that SCTP may be getting segmentation support before long I
thought we may want to keep it as it is possible we may need to support
CRC32c and 1's compliment checksum in the same packet at some point in the
future.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Acked-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 88262c82b96a..b0cce744e2a0 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3004,8 +3004,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
 	if (unlikely(!proto))
 		return ERR_PTR(-EINVAL);
 
-	csum = !head_skb->encap_hdr_csum &&
-	    !!can_checksum_protocol(features, proto);
+	csum = !!can_checksum_protocol(features, proto);
 
 	headroom = skb_headroom(head_skb);
 	pos = skb_headlen(head_skb);
-- 
cgit v1.2.3-59-g8ed1b


From f48e72318aa6f6ac792eaf74f3b77491a5c43891 Mon Sep 17 00:00:00 2001
From: stephen hemminger <stephen@networkplumber.org>
Date: Tue, 9 Feb 2016 22:04:47 -0800
Subject: rds: duplicate include net/tcp.h

Duplicate include detected.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/tcp.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 9d6ddbacd875..ad60299b088b 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -37,7 +37,6 @@
 #include <net/tcp.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
-#include <net/tcp.h>
 
 #include "rds.h"
 #include "tcp.h"
-- 
cgit v1.2.3-59-g8ed1b


From 4a92602aa1cd5bbaeedbd9536ff992f7d26fe9d1 Mon Sep 17 00:00:00 2001
From: Tycho Andersen <tycho.andersen@canonical.com>
Date: Fri, 5 Feb 2016 09:20:52 -0700
Subject: openvswitch: allow management from inside user namespaces

Operations with the GENL_ADMIN_PERM flag fail permissions checks because
this flag means we call netlink_capable, which uses the init user ns.

Instead, let's introduce a new flag, GENL_UNS_ADMIN_PERM for operations
which should be allowed inside a user namespace.

The motivation for this is to be able to run openvswitch in unprivileged
containers. I've tested this and it seems to work, but I really have no
idea about the security consequences of this patch, so thoughts would be
much appreciated.

v2: use the GENL_UNS_ADMIN_PERM flag instead of a check in each function
v3: use separate ifs for UNS_ADMIN_PERM and ADMIN_PERM, instead of one
    massive one

Reported-by: James Page <james.page@canonical.com>
Signed-off-by: Tycho Andersen <tycho.andersen@canonical.com>
CC: Eric Biederman <ebiederm@xmission.com>
CC: Pravin Shelar <pshelar@ovn.org>
CC: Justin Pettit <jpettit@nicira.com>
CC: "David S. Miller" <davem@davemloft.net>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/genetlink.h |  1 +
 net/netlink/genetlink.c        |  4 ++++
 net/openvswitch/datapath.c     | 20 ++++++++++----------
 3 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/genetlink.h b/include/uapi/linux/genetlink.h
index c3363ba1ae05..5512c90af7e3 100644
--- a/include/uapi/linux/genetlink.h
+++ b/include/uapi/linux/genetlink.h
@@ -21,6 +21,7 @@ struct genlmsghdr {
 #define GENL_CMD_CAP_DO		0x02
 #define GENL_CMD_CAP_DUMP	0x04
 #define GENL_CMD_CAP_HASPOL	0x08
+#define GENL_UNS_ADMIN_PERM	0x10
 
 /*
  * List of reserved static generic netlink identifiers:
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index f830326b3b1d..0ffd721126e7 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -580,6 +580,10 @@ static int genl_family_rcv_msg(struct genl_family *family,
 	    !netlink_capable(skb, CAP_NET_ADMIN))
 		return -EPERM;
 
+	if ((ops->flags & GENL_UNS_ADMIN_PERM) &&
+	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
+		return -EPERM;
+
 	if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) {
 		int rc;
 
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index deadfdab1bc3..d6f7fe92744a 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -654,7 +654,7 @@ static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = {
 
 static const struct genl_ops dp_packet_genl_ops[] = {
 	{ .cmd = OVS_PACKET_CMD_EXECUTE,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = packet_policy,
 	  .doit = ovs_packet_cmd_execute
 	}
@@ -1391,12 +1391,12 @@ static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = {
 
 static const struct genl_ops dp_flow_genl_ops[] = {
 	{ .cmd = OVS_FLOW_CMD_NEW,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = flow_policy,
 	  .doit = ovs_flow_cmd_new
 	},
 	{ .cmd = OVS_FLOW_CMD_DEL,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = flow_policy,
 	  .doit = ovs_flow_cmd_del
 	},
@@ -1407,7 +1407,7 @@ static const struct genl_ops dp_flow_genl_ops[] = {
 	  .dumpit = ovs_flow_cmd_dump
 	},
 	{ .cmd = OVS_FLOW_CMD_SET,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = flow_policy,
 	  .doit = ovs_flow_cmd_set,
 	},
@@ -1777,12 +1777,12 @@ static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = {
 
 static const struct genl_ops dp_datapath_genl_ops[] = {
 	{ .cmd = OVS_DP_CMD_NEW,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = datapath_policy,
 	  .doit = ovs_dp_cmd_new
 	},
 	{ .cmd = OVS_DP_CMD_DEL,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = datapath_policy,
 	  .doit = ovs_dp_cmd_del
 	},
@@ -1793,7 +1793,7 @@ static const struct genl_ops dp_datapath_genl_ops[] = {
 	  .dumpit = ovs_dp_cmd_dump
 	},
 	{ .cmd = OVS_DP_CMD_SET,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = datapath_policy,
 	  .doit = ovs_dp_cmd_set,
 	},
@@ -2158,12 +2158,12 @@ static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = {
 
 static const struct genl_ops dp_vport_genl_ops[] = {
 	{ .cmd = OVS_VPORT_CMD_NEW,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = vport_policy,
 	  .doit = ovs_vport_cmd_new
 	},
 	{ .cmd = OVS_VPORT_CMD_DEL,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = vport_policy,
 	  .doit = ovs_vport_cmd_del
 	},
@@ -2174,7 +2174,7 @@ static const struct genl_ops dp_vport_genl_ops[] = {
 	  .dumpit = ovs_vport_cmd_dump
 	},
 	{ .cmd = OVS_VPORT_CMD_SET,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = vport_policy,
 	  .doit = ovs_vport_cmd_set,
 	},
-- 
cgit v1.2.3-59-g8ed1b


From 815c52700746cdcc0874a33390bac334a4b90107 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Mon, 8 Feb 2016 23:29:21 +0200
Subject: igmp: Namespaceify igmp_max_memberships sysctl knob

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/igmp.h       |  1 -
 include/net/netns/ipv4.h   |  2 ++
 net/ipv4/igmp.c            |  4 +---
 net/ipv4/sysctl_net_ipv4.c | 14 +++++++-------
 net/ipv4/tcp_ipv4.c        |  2 ++
 5 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/include/linux/igmp.h b/include/linux/igmp.h
index 9c9de11549a7..57d6d06ce0b3 100644
--- a/include/linux/igmp.h
+++ b/include/linux/igmp.h
@@ -38,7 +38,6 @@ static inline struct igmpv3_query *
 }
 
 extern int sysctl_igmp_llm_reports;
-extern int sysctl_igmp_max_memberships;
 extern int sysctl_igmp_max_msf;
 extern int sysctl_igmp_qrv;
 
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 4d6ec3f6fafe..759cf624eec2 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -108,6 +108,8 @@ struct netns_ipv4 {
 	int sysctl_tcp_fin_timeout;
 	unsigned int sysctl_tcp_notsent_lowat;
 
+	int sysctl_igmp_max_memberships;
+
 	struct ping_group_range ping_group_range;
 
 	atomic_t dev_addr_genid;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 05e4cba14162..5b86257c9d6b 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -107,7 +107,6 @@
 #include <linux/seq_file.h>
 #endif
 
-#define IP_MAX_MEMBERSHIPS	20
 #define IP_MAX_MSF		10
 
 /* IGMP reports for link-local multicast groups are enabled by default */
@@ -1727,7 +1726,6 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
 /*
  *	Join a socket to a group
  */
-int sysctl_igmp_max_memberships __read_mostly = IP_MAX_MEMBERSHIPS;
 int sysctl_igmp_max_msf __read_mostly = IP_MAX_MSF;
 #ifdef CONFIG_IP_MULTICAST
 int sysctl_igmp_qrv __read_mostly = IGMP_QUERY_ROBUSTNESS_VARIABLE;
@@ -2074,7 +2072,7 @@ int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)
 		count++;
 	}
 	err = -ENOBUFS;
-	if (count >= sysctl_igmp_max_memberships)
+	if (count >= net->ipv4.sysctl_igmp_max_memberships)
 		goto done;
 	iml = sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
 	if (!iml)
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 44bb59824267..6ea3dbb96db4 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -367,13 +367,6 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
-	{
-		.procname	= "igmp_max_memberships",
-		.data		= &sysctl_igmp_max_memberships,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec
-	},
 	{
 		.procname	= "igmp_max_msf",
 		.data		= &sysctl_igmp_max_msf,
@@ -871,6 +864,13 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "igmp_max_memberships",
+		.data		= &init_net.ipv4.sysctl_igmp_max_memberships,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{
 		.procname	= "tcp_keepalive_time",
 		.data		= &init_net.ipv4.sysctl_tcp_keepalive_time,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 3f872a6bc274..4b203789900b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2399,6 +2399,8 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
 
+	net->ipv4.sysctl_igmp_max_memberships = 20;
+
 	return 0;
 fail:
 	tcp_sk_exit(net);
-- 
cgit v1.2.3-59-g8ed1b


From 166b6b2d6f01be67a83b87ab5c91350a68b17115 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Mon, 8 Feb 2016 23:29:22 +0200
Subject: igmp: Namespaceify igmp_max_msf sysctl knob

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/igmp.h       |  1 -
 include/net/netns/ipv4.h   |  1 +
 net/ipv4/igmp.c            |  5 +----
 net/ipv4/ip_sockglue.c     |  5 +++--
 net/ipv4/sysctl_net_ipv4.c | 14 +++++++-------
 net/ipv4/tcp_ipv4.c        |  1 +
 6 files changed, 13 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/include/linux/igmp.h b/include/linux/igmp.h
index 57d6d06ce0b3..a91ec9f575e7 100644
--- a/include/linux/igmp.h
+++ b/include/linux/igmp.h
@@ -38,7 +38,6 @@ static inline struct igmpv3_query *
 }
 
 extern int sysctl_igmp_llm_reports;
-extern int sysctl_igmp_max_msf;
 extern int sysctl_igmp_qrv;
 
 struct ip_sf_socklist {
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 759cf624eec2..522a2cfe1ad9 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -109,6 +109,7 @@ struct netns_ipv4 {
 	unsigned int sysctl_tcp_notsent_lowat;
 
 	int sysctl_igmp_max_memberships;
+	int sysctl_igmp_max_msf;
 
 	struct ping_group_range ping_group_range;
 
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 5b86257c9d6b..6da2e467b63c 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -107,8 +107,6 @@
 #include <linux/seq_file.h>
 #endif
 
-#define IP_MAX_MSF		10
-
 /* IGMP reports for link-local multicast groups are enabled by default */
 int sysctl_igmp_llm_reports __read_mostly = 1;
 
@@ -1726,7 +1724,6 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
 /*
  *	Join a socket to a group
  */
-int sysctl_igmp_max_msf __read_mostly = IP_MAX_MSF;
 #ifdef CONFIG_IP_MULTICAST
 int sysctl_igmp_qrv __read_mostly = IGMP_QUERY_ROBUSTNESS_VARIABLE;
 #endif
@@ -2244,7 +2241,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
 	}
 	/* else, add a new source to the filter */
 
-	if (psl && psl->sl_count >= sysctl_igmp_max_msf) {
+	if (psl && psl->sl_count >= net->ipv4.sysctl_igmp_max_msf) {
 		err = -ENOBUFS;
 		goto done;
 	}
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 5f73a7c03e27..92808f147ef5 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -571,6 +571,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 			    int optname, char __user *optval, unsigned int optlen)
 {
 	struct inet_sock *inet = inet_sk(sk);
+	struct net *net = sock_net(sk);
 	int val = 0, err;
 	bool needs_rtnl = setsockopt_needs_rtnl(optname);
 
@@ -910,7 +911,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		}
 		/* numsrc >= (1G-4) overflow in 32 bits */
 		if (msf->imsf_numsrc >= 0x3ffffffcU ||
-		    msf->imsf_numsrc > sysctl_igmp_max_msf) {
+		    msf->imsf_numsrc > net->ipv4.sysctl_igmp_max_msf) {
 			kfree(msf);
 			err = -ENOBUFS;
 			break;
@@ -1065,7 +1066,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 
 		/* numsrc >= (4G-140)/128 overflow in 32 bits */
 		if (gsf->gf_numsrc >= 0x1ffffff ||
-		    gsf->gf_numsrc > sysctl_igmp_max_msf) {
+		    gsf->gf_numsrc > net->ipv4.sysctl_igmp_max_msf) {
 			err = -ENOBUFS;
 			goto mc_msf_out;
 		}
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 6ea3dbb96db4..225659a02cf2 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -367,13 +367,6 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
-	{
-		.procname	= "igmp_max_msf",
-		.data		= &sysctl_igmp_max_msf,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec
-	},
 #ifdef CONFIG_IP_MULTICAST
 	{
 		.procname	= "igmp_qrv",
@@ -871,6 +864,13 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "igmp_max_msf",
+		.data		= &init_net.ipv4.sysctl_igmp_max_msf,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{
 		.procname	= "tcp_keepalive_time",
 		.data		= &init_net.ipv4.sysctl_tcp_keepalive_time,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4b203789900b..055d8a9a0c61 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2400,6 +2400,7 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
 
 	net->ipv4.sysctl_igmp_max_memberships = 20;
+	net->ipv4.sysctl_igmp_max_msf = 10;
 
 	return 0;
 fail:
-- 
cgit v1.2.3-59-g8ed1b


From 87a8a2ae65b7721893c7922f963502be8fa01c94 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <n.borisov@siteground.com>
Date: Tue, 9 Feb 2016 00:13:50 +0200
Subject: igmp: Namespaceify igmp_llm_reports sysctl knob

This was initially introduced in df2cf4a78e488d26 ("IGMP: Inhibit
reports for local multicast groups") by defining the sysctl in the
ipv4_net_table array, however it was never implemented to be
namespace aware. Fix this by changing the code accordingly.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/igmp.h       |  1 -
 include/net/netns/ipv4.h   |  1 +
 net/ipv4/igmp.c            | 26 +++++++++++++++-----------
 net/ipv4/sysctl_net_ipv4.c |  2 +-
 net/ipv4/tcp_ipv4.c        |  2 ++
 5 files changed, 19 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/include/linux/igmp.h b/include/linux/igmp.h
index a91ec9f575e7..c683f4bf642b 100644
--- a/include/linux/igmp.h
+++ b/include/linux/igmp.h
@@ -37,7 +37,6 @@ static inline struct igmpv3_query *
 	return (struct igmpv3_query *)skb_transport_header(skb);
 }
 
-extern int sysctl_igmp_llm_reports;
 extern int sysctl_igmp_qrv;
 
 struct ip_sf_socklist {
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 522a2cfe1ad9..cbbf8115e8a7 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -110,6 +110,7 @@ struct netns_ipv4 {
 
 	int sysctl_igmp_max_memberships;
 	int sysctl_igmp_max_msf;
+	int sysctl_igmp_llm_reports;
 
 	struct ping_group_range ping_group_range;
 
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 6da2e467b63c..2e22ee0efc98 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -107,9 +107,6 @@
 #include <linux/seq_file.h>
 #endif
 
-/* IGMP reports for link-local multicast groups are enabled by default */
-int sysctl_igmp_llm_reports __read_mostly = 1;
-
 #ifdef CONFIG_IP_MULTICAST
 /* Parameter names and values are taken from igmp-v2-06 draft */
 
@@ -430,6 +427,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
 	int type, int gdeleted, int sdeleted)
 {
 	struct net_device *dev = pmc->interface->dev;
+	struct net *net = dev_net(dev);
 	struct igmpv3_report *pih;
 	struct igmpv3_grec *pgr = NULL;
 	struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list;
@@ -437,7 +435,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
 
 	if (pmc->multiaddr == IGMP_ALL_HOSTS)
 		return skb;
-	if (ipv4_is_local_multicast(pmc->multiaddr) && !sysctl_igmp_llm_reports)
+	if (ipv4_is_local_multicast(pmc->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports)
 		return skb;
 
 	isquery = type == IGMPV3_MODE_IS_INCLUDE ||
@@ -540,6 +538,7 @@ empty_source:
 static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
 {
 	struct sk_buff *skb = NULL;
+	struct net *net = dev_net(in_dev->dev);
 	int type;
 
 	if (!pmc) {
@@ -548,7 +547,7 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
 			if (pmc->multiaddr == IGMP_ALL_HOSTS)
 				continue;
 			if (ipv4_is_local_multicast(pmc->multiaddr) &&
-			     !sysctl_igmp_llm_reports)
+			     !net->ipv4.sysctl_igmp_llm_reports)
 				continue;
 			spin_lock_bh(&pmc->lock);
 			if (pmc->sfcount[MCAST_EXCLUDE])
@@ -684,7 +683,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
 	if (type == IGMPV3_HOST_MEMBERSHIP_REPORT)
 		return igmpv3_send_report(in_dev, pmc);
 
-	if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports)
+	if (ipv4_is_local_multicast(group) && !net->ipv4.sysctl_igmp_llm_reports)
 		return 0;
 
 	if (type == IGMP_HOST_LEAVE_MESSAGE)
@@ -855,12 +854,13 @@ static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
 static bool igmp_heard_report(struct in_device *in_dev, __be32 group)
 {
 	struct ip_mc_list *im;
+	struct net *net = dev_net(in_dev->dev);
 
 	/* Timers are only set for non-local groups */
 
 	if (group == IGMP_ALL_HOSTS)
 		return false;
-	if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports)
+	if (ipv4_is_local_multicast(group) && !net->ipv4.sysctl_igmp_llm_reports)
 		return false;
 
 	rcu_read_lock();
@@ -884,6 +884,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 	__be32			group = ih->group;
 	int			max_delay;
 	int			mark = 0;
+	struct net		*net = dev_net(in_dev->dev);
 
 
 	if (len == 8) {
@@ -969,7 +970,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 		if (im->multiaddr == IGMP_ALL_HOSTS)
 			continue;
 		if (ipv4_is_local_multicast(im->multiaddr) &&
-		    !sysctl_igmp_llm_reports)
+		    !net->ipv4.sysctl_igmp_llm_reports)
 			continue;
 		spin_lock_bh(&im->lock);
 		if (im->tm_running)
@@ -1184,6 +1185,7 @@ static void igmp_group_dropped(struct ip_mc_list *im)
 {
 	struct in_device *in_dev = im->interface;
 #ifdef CONFIG_IP_MULTICAST
+	struct net *net = dev_net(in_dev->dev);
 	int reporter;
 #endif
 
@@ -1195,7 +1197,7 @@ static void igmp_group_dropped(struct ip_mc_list *im)
 #ifdef CONFIG_IP_MULTICAST
 	if (im->multiaddr == IGMP_ALL_HOSTS)
 		return;
-	if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports)
+	if (ipv4_is_local_multicast(im->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports)
 		return;
 
 	reporter = im->reporter;
@@ -1220,6 +1222,7 @@ static void igmp_group_dropped(struct ip_mc_list *im)
 static void igmp_group_added(struct ip_mc_list *im)
 {
 	struct in_device *in_dev = im->interface;
+	struct net *net = dev_net(in_dev->dev);
 
 	if (im->loaded == 0) {
 		im->loaded = 1;
@@ -1229,7 +1232,7 @@ static void igmp_group_added(struct ip_mc_list *im)
 #ifdef CONFIG_IP_MULTICAST
 	if (im->multiaddr == IGMP_ALL_HOSTS)
 		return;
-	if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports)
+	if (ipv4_is_local_multicast(im->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports)
 		return;
 
 	if (in_dev->dead)
@@ -1530,6 +1533,7 @@ static void ip_mc_rejoin_groups(struct in_device *in_dev)
 #ifdef CONFIG_IP_MULTICAST
 	struct ip_mc_list *im;
 	int type;
+	struct net *net = dev_net(in_dev->dev);
 
 	ASSERT_RTNL();
 
@@ -1537,7 +1541,7 @@ static void ip_mc_rejoin_groups(struct in_device *in_dev)
 		if (im->multiaddr == IGMP_ALL_HOSTS)
 			continue;
 		if (ipv4_is_local_multicast(im->multiaddr) &&
-		    !sysctl_igmp_llm_reports)
+		    !net->ipv4.sysctl_igmp_llm_reports)
 			continue;
 
 		/* a failover is happening and switches
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 225659a02cf2..fc40fa1303d3 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -852,7 +852,7 @@ static struct ctl_table ipv4_net_table[] = {
 	},
 	{
 		.procname	= "igmp_link_local_mcast_reports",
-		.data		= &sysctl_igmp_llm_reports,
+		.data		= &init_net.ipv4.sysctl_igmp_llm_reports,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 055d8a9a0c61..6c3c1d5232c6 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2401,6 +2401,8 @@ static int __net_init tcp_sk_init(struct net *net)
 
 	net->ipv4.sysctl_igmp_max_memberships = 20;
 	net->ipv4.sysctl_igmp_max_msf = 10;
+	/* IGMP reports for link-local multicast groups are enabled by default */
+	net->ipv4.sysctl_igmp_llm_reports = 1;
 
 	return 0;
 fail:
-- 
cgit v1.2.3-59-g8ed1b


From 165094afcee79e4d5b6e94032a5d3be157460b4a Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Mon, 8 Feb 2016 23:29:24 +0200
Subject: igmp: Namespacify igmp_qrv sysctl knob

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/igmp.h       |  2 --
 include/net/netns/ipv4.h   |  1 +
 net/ipv4/igmp.c            | 29 +++++++++++++++++------------
 net/ipv4/sysctl_net_ipv4.c | 20 ++++++++++----------
 net/ipv4/tcp_ipv4.c        |  1 +
 5 files changed, 29 insertions(+), 24 deletions(-)

(limited to 'net')

diff --git a/include/linux/igmp.h b/include/linux/igmp.h
index c683f4bf642b..12f6fba6d21a 100644
--- a/include/linux/igmp.h
+++ b/include/linux/igmp.h
@@ -37,8 +37,6 @@ static inline struct igmpv3_query *
 	return (struct igmpv3_query *)skb_transport_header(skb);
 }
 
-extern int sysctl_igmp_qrv;
-
 struct ip_sf_socklist {
 	unsigned int		sl_max;
 	unsigned int		sl_count;
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index cbbf8115e8a7..848fe8056534 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -111,6 +111,7 @@ struct netns_ipv4 {
 	int sysctl_igmp_max_memberships;
 	int sysctl_igmp_max_msf;
 	int sysctl_igmp_llm_reports;
+	int sysctl_igmp_qrv;
 
 	struct ping_group_range ping_group_range;
 
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 2e22ee0efc98..7c95335bf85e 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -762,9 +762,10 @@ static void igmp_ifc_timer_expire(unsigned long data)
 
 static void igmp_ifc_event(struct in_device *in_dev)
 {
+	struct net *net = dev_net(in_dev->dev);
 	if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev))
 		return;
-	in_dev->mr_ifc_count = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+	in_dev->mr_ifc_count = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
 	igmp_ifc_start_timer(in_dev, 1);
 }
 
@@ -1086,6 +1087,7 @@ static void ip_mc_filter_del(struct in_device *in_dev, __be32 addr)
 static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
 {
 	struct ip_mc_list *pmc;
+	struct net *net = dev_net(in_dev->dev);
 
 	/* this is an "ip_mc_list" for convenience; only the fields below
 	 * are actually used. In particular, the refcnt and users are not
@@ -1100,7 +1102,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
 	pmc->interface = im->interface;
 	in_dev_hold(in_dev);
 	pmc->multiaddr = im->multiaddr;
-	pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+	pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
 	pmc->sfmode = im->sfmode;
 	if (pmc->sfmode == MCAST_INCLUDE) {
 		struct ip_sf_list *psf;
@@ -1245,7 +1247,7 @@ static void igmp_group_added(struct ip_mc_list *im)
 	}
 	/* else, v3 */
 
-	im->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+	im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
 	igmp_ifc_event(in_dev);
 #endif
 }
@@ -1314,6 +1316,7 @@ static void ip_mc_hash_remove(struct in_device *in_dev,
 void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
 {
 	struct ip_mc_list *im;
+	struct net *net = dev_net(in_dev->dev);
 
 	ASSERT_RTNL();
 
@@ -1340,7 +1343,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
 	spin_lock_init(&im->lock);
 #ifdef CONFIG_IP_MULTICAST
 	setup_timer(&im->timer, igmp_timer_expire, (unsigned long)im);
-	im->unsolicit_count = sysctl_igmp_qrv;
+	im->unsolicit_count = net->ipv4.sysctl_igmp_qrv;
 #endif
 
 	im->next_rcu = in_dev->mc_list;
@@ -1640,6 +1643,7 @@ void ip_mc_down(struct in_device *in_dev)
 
 void ip_mc_init_dev(struct in_device *in_dev)
 {
+	struct net *net = dev_net(in_dev->dev);
 	ASSERT_RTNL();
 
 #ifdef CONFIG_IP_MULTICAST
@@ -1647,7 +1651,7 @@ void ip_mc_init_dev(struct in_device *in_dev)
 			(unsigned long)in_dev);
 	setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire,
 			(unsigned long)in_dev);
-	in_dev->mr_qrv = sysctl_igmp_qrv;
+	in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv;
 #endif
 
 	spin_lock_init(&in_dev->mc_tomb_lock);
@@ -1658,11 +1662,12 @@ void ip_mc_init_dev(struct in_device *in_dev)
 void ip_mc_up(struct in_device *in_dev)
 {
 	struct ip_mc_list *pmc;
+	struct net *net = dev_net(in_dev->dev);
 
 	ASSERT_RTNL();
 
 #ifdef CONFIG_IP_MULTICAST
-	in_dev->mr_qrv = sysctl_igmp_qrv;
+	in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv;
 #endif
 	ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);
 
@@ -1728,9 +1733,6 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
 /*
  *	Join a socket to a group
  */
-#ifdef CONFIG_IP_MULTICAST
-int sysctl_igmp_qrv __read_mostly = IGMP_QUERY_ROBUSTNESS_VARIABLE;
-#endif
 
 static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
 	__be32 *psfsrc)
@@ -1755,6 +1757,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
 	if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) {
 #ifdef CONFIG_IP_MULTICAST
 		struct in_device *in_dev = pmc->interface;
+		struct net *net = dev_net(in_dev->dev);
 #endif
 
 		/* no more filters for this source */
@@ -1765,7 +1768,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
 #ifdef CONFIG_IP_MULTICAST
 		if (psf->sf_oldin &&
 		    !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) {
-			psf->sf_crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+			psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
 			psf->sf_next = pmc->tomb;
 			pmc->tomb = psf;
 			rv = 1;
@@ -1823,12 +1826,13 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
 	    pmc->sfcount[MCAST_INCLUDE]) {
 #ifdef CONFIG_IP_MULTICAST
 		struct ip_sf_list *psf;
+		struct net *net = dev_net(in_dev->dev);
 #endif
 
 		/* filter mode change */
 		pmc->sfmode = MCAST_INCLUDE;
 #ifdef CONFIG_IP_MULTICAST
-		pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+		pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
 		in_dev->mr_ifc_count = pmc->crcount;
 		for (psf = pmc->sources; psf; psf = psf->sf_next)
 			psf->sf_crcount = 0;
@@ -1995,6 +1999,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
 	} else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) {
 #ifdef CONFIG_IP_MULTICAST
 		struct ip_sf_list *psf;
+		struct net *net = dev_net(pmc->interface->dev);
 		in_dev = pmc->interface;
 #endif
 
@@ -2006,7 +2011,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
 #ifdef CONFIG_IP_MULTICAST
 		/* else no filters; keep old mode for reports */
 
-		pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+		pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
 		in_dev->mr_ifc_count = pmc->crcount;
 		for (psf = pmc->sources; psf; psf = psf->sf_next)
 			psf->sf_crcount = 0;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index fc40fa1303d3..b537338f5c97 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -367,16 +367,6 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
-#ifdef CONFIG_IP_MULTICAST
-	{
-		.procname	= "igmp_qrv",
-		.data		= &sysctl_igmp_qrv,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one
-	},
-#endif
 	{
 		.procname	= "inet_peer_threshold",
 		.data		= &inet_peer_threshold,
@@ -871,6 +861,16 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+#ifdef CONFIG_IP_MULTICAST
+	{
+		.procname	= "igmp_qrv",
+		.data		= &init_net.ipv4.sysctl_igmp_qrv,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one
+	},
+#endif
 	{
 		.procname	= "tcp_keepalive_time",
 		.data		= &init_net.ipv4.sysctl_tcp_keepalive_time,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 6c3c1d5232c6..ba5d0146e3f0 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2403,6 +2403,7 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_igmp_max_msf = 10;
 	/* IGMP reports for link-local multicast groups are enabled by default */
 	net->ipv4.sysctl_igmp_llm_reports = 1;
+	net->ipv4.sysctl_igmp_qrv = 2;
 
 	return 0;
 fail:
-- 
cgit v1.2.3-59-g8ed1b


From 795bb1c00dd338aa0d12f9a7f1f4776fb3160416 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Mon, 8 Feb 2016 13:14:59 +0100
Subject: net: bulk free infrastructure for NAPI context, use napi_consume_skb

Discovered that network stack were hitting the kmem_cache/SLUB
slowpath when freeing SKBs.  Doing bulk free with kmem_cache_free_bulk
can speedup this slowpath.

NAPI context is a bit special, lets take advantage of that for bulk
free'ing SKBs.

In NAPI context we are running in softirq, which gives us certain
protection.  A softirq can run on several CPUs at once.  BUT the
important part is a softirq will never preempt another softirq running
on the same CPU.  This gives us the opportunity to access per-cpu
variables in softirq context.

Extend napi_alloc_cache (before only contained page_frag_cache) to be
a struct with a small array based stack for holding SKBs.  Introduce a
SKB defer and flush API for accessing this.

Introduce napi_consume_skb() as replacement for e.g. dev_consume_skb_any()
when running in NAPI context.  A small trick to handle/detect if we
are called from netpoll is to see if budget is 0.  In that case, we
need to invoke dev_consume_skb_irq().

Joint work with Alexander Duyck.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  3 ++
 net/core/dev.c         |  1 +
 net/core/skbuff.c      | 83 ++++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 81 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a8fc2220e8ce..b56c0103fa15 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2404,6 +2404,9 @@ static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi,
 {
 	return __napi_alloc_skb(napi, length, GFP_ATOMIC);
 }
+void napi_consume_skb(struct sk_buff *skb, int budget);
+
+void __kfree_skb_flush(void);
 
 /**
  * __dev_alloc_pages - allocate page for network Rx
diff --git a/net/core/dev.c b/net/core/dev.c
index f1284835b8c9..9b2c7a999e71 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5155,6 +5155,7 @@ static void net_rx_action(struct softirq_action *h)
 		}
 	}
 
+	__kfree_skb_flush();
 	local_irq_disable();
 
 	list_splice_tail_init(&sd->poll_list, &list);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b0cce744e2a0..b64187b87773 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -347,8 +347,16 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size)
 }
 EXPORT_SYMBOL(build_skb);
 
+#define NAPI_SKB_CACHE_SIZE	64
+
+struct napi_alloc_cache {
+	struct page_frag_cache page;
+	size_t skb_count;
+	void *skb_cache[NAPI_SKB_CACHE_SIZE];
+};
+
 static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
-static DEFINE_PER_CPU(struct page_frag_cache, napi_alloc_cache);
+static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
 
 static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
 {
@@ -378,9 +386,9 @@ EXPORT_SYMBOL(netdev_alloc_frag);
 
 static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
 {
-	struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
 
-	return __alloc_page_frag(nc, fragsz, gfp_mask);
+	return __alloc_page_frag(&nc->page, fragsz, gfp_mask);
 }
 
 void *napi_alloc_frag(unsigned int fragsz)
@@ -474,7 +482,7 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
 struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
 				 gfp_t gfp_mask)
 {
-	struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
 	struct sk_buff *skb;
 	void *data;
 
@@ -494,7 +502,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
 	if (sk_memalloc_socks())
 		gfp_mask |= __GFP_MEMALLOC;
 
-	data = __alloc_page_frag(nc, len, gfp_mask);
+	data = __alloc_page_frag(&nc->page, len, gfp_mask);
 	if (unlikely(!data))
 		return NULL;
 
@@ -505,7 +513,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
 	}
 
 	/* use OR instead of assignment to avoid clearing of bits in mask */
-	if (nc->pfmemalloc)
+	if (nc->page.pfmemalloc)
 		skb->pfmemalloc = 1;
 	skb->head_frag = 1;
 
@@ -747,6 +755,69 @@ void consume_skb(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(consume_skb);
 
+void __kfree_skb_flush(void)
+{
+	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+
+	/* flush skb_cache if containing objects */
+	if (nc->skb_count) {
+		kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
+				     nc->skb_cache);
+		nc->skb_count = 0;
+	}
+}
+
+static void __kfree_skb_defer(struct sk_buff *skb)
+{
+	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+
+	/* drop skb->head and call any destructors for packet */
+	skb_release_all(skb);
+
+	/* record skb to CPU local list */
+	nc->skb_cache[nc->skb_count++] = skb;
+
+#ifdef CONFIG_SLUB
+	/* SLUB writes into objects when freeing */
+	prefetchw(skb);
+#endif
+
+	/* flush skb_cache if it is filled */
+	if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
+		kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_SIZE,
+				     nc->skb_cache);
+		nc->skb_count = 0;
+	}
+}
+
+void napi_consume_skb(struct sk_buff *skb, int budget)
+{
+	if (unlikely(!skb))
+		return;
+
+	/* if budget is 0 assume netpoll w/ IRQs disabled */
+	if (unlikely(!budget)) {
+		dev_consume_skb_irq(skb);
+		return;
+	}
+
+	if (likely(atomic_read(&skb->users) == 1))
+		smp_rmb();
+	else if (likely(!atomic_dec_and_test(&skb->users)))
+		return;
+	/* if reaching here SKB is ready to free */
+	trace_consume_skb(skb);
+
+	/* if SKB is a clone, don't handle this case */
+	if (unlikely(skb->fclone != SKB_FCLONE_UNAVAILABLE)) {
+		__kfree_skb(skb);
+		return;
+	}
+
+	__kfree_skb_defer(skb);
+}
+EXPORT_SYMBOL(napi_consume_skb);
+
 /* Make sure a field is enclosed inside headers_start/headers_end section */
 #define CHECK_SKB_FIELD(field) \
 	BUILD_BUG_ON(offsetof(struct sk_buff, field) <		\
-- 
cgit v1.2.3-59-g8ed1b


From 15fad714be86eab13e7568fecaf475b2a9730d3e Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Mon, 8 Feb 2016 13:15:04 +0100
Subject: net: bulk free SKBs that were delay free'ed due to IRQ context

The network stack defers SKBs free, in-case free happens in IRQ or
when IRQs are disabled. This happens in __dev_kfree_skb_irq() that
writes SKBs that were free'ed during IRQ to the softirq completion
queue (softnet_data.completion_queue).

These SKBs are naturally delayed, and cleaned up during NET_TX_SOFTIRQ
in function net_tx_action().  Take advantage of this a use the skb
defer and flush API, as we are already in softirq context.

For modern drivers this rarely happens. Although most drivers do call
dev_kfree_skb_any(), which detects the situation and calls
__dev_kfree_skb_irq() when needed.  This due to netpoll can call from
IRQ context.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 1 +
 net/core/dev.c         | 8 +++++++-
 net/core/skbuff.c      | 8 ++++++--
 3 files changed, 14 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b56c0103fa15..6ec86f1a2ed9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2407,6 +2407,7 @@ static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi,
 void napi_consume_skb(struct sk_buff *skb, int budget);
 
 void __kfree_skb_flush(void);
+void __kfree_skb_defer(struct sk_buff *skb);
 
 /**
  * __dev_alloc_pages - allocate page for network Rx
diff --git a/net/core/dev.c b/net/core/dev.c
index 9b2c7a999e71..3f4071a84a03 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3829,8 +3829,14 @@ static void net_tx_action(struct softirq_action *h)
 				trace_consume_skb(skb);
 			else
 				trace_kfree_skb(skb, net_tx_action);
-			__kfree_skb(skb);
+
+			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
+				__kfree_skb(skb);
+			else
+				__kfree_skb_defer(skb);
 		}
+
+		__kfree_skb_flush();
 	}
 
 	if (sd->output_queue) {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b64187b87773..a5bd067ec1a3 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -767,7 +767,7 @@ void __kfree_skb_flush(void)
 	}
 }
 
-static void __kfree_skb_defer(struct sk_buff *skb)
+static inline void _kfree_skb_defer(struct sk_buff *skb)
 {
 	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
 
@@ -789,6 +789,10 @@ static void __kfree_skb_defer(struct sk_buff *skb)
 		nc->skb_count = 0;
 	}
 }
+void __kfree_skb_defer(struct sk_buff *skb)
+{
+	_kfree_skb_defer(skb);
+}
 
 void napi_consume_skb(struct sk_buff *skb, int budget)
 {
@@ -814,7 +818,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
 		return;
 	}
 
-	__kfree_skb_defer(skb);
+	_kfree_skb_defer(skb);
 }
 EXPORT_SYMBOL(napi_consume_skb);
 
-- 
cgit v1.2.3-59-g8ed1b


From 1580ab63fc9a03593072cc5656167a75c4f1d173 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 11 Feb 2016 16:28:49 -0800
Subject: tcp/dccp: better use of ephemeral ports in connect()

In commit 07f4c90062f8 ("tcp/dccp: try to not exhaust ip_local_port_range
in connect()"), I added a very simple heuristic, so that we got better
chances to use even ports, and allow bind() users to have more available
slots.

It gave nice results, but with more than 200,000 TCP sessions on a typical
server, the ~30,000 ephemeral ports are still a rare resource.

I chose to go a step further, by looking at all even ports, and if none
was available, fallback to odd ports.

The companion patch does the same in bind(), but in opposite way.

I've seen exec times of up to 30ms on busy servers, so I no longer
disable BH for the whole traversal, but only for each hash bucket.
I also call cond_resched() to be gentle to other tasks.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_hashtables.c | 170 ++++++++++++++++++++++-----------------------
 1 file changed, 85 insertions(+), 85 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index c0f9942de924..bc68eced0105 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -565,106 +565,106 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 			struct sock *, __u16, struct inet_timewait_sock **))
 {
 	struct inet_hashinfo *hinfo = death_row->hashinfo;
-	const unsigned short snum = inet_sk(sk)->inet_num;
+	struct inet_timewait_sock *tw = NULL;
 	struct inet_bind_hashbucket *head;
-	struct inet_bind_bucket *tb;
-	int ret;
+	int port = inet_sk(sk)->inet_num;
 	struct net *net = sock_net(sk);
+	struct inet_bind_bucket *tb;
+	u32 remaining, offset;
+	int ret, i, low, high;
+	static u32 hint;
+
+	if (port) {
+		head = &hinfo->bhash[inet_bhashfn(net, port,
+						  hinfo->bhash_size)];
+		tb = inet_csk(sk)->icsk_bind_hash;
+		spin_lock_bh(&head->lock);
+		if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
+			inet_ehash_nolisten(sk, NULL);
+			spin_unlock_bh(&head->lock);
+			return 0;
+		}
+		spin_unlock(&head->lock);
+		/* No definite answer... Walk to established hash table */
+		ret = check_established(death_row, sk, port, NULL);
+		local_bh_enable();
+		return ret;
+	}
 
-	if (!snum) {
-		int i, remaining, low, high, port;
-		static u32 hint;
-		u32 offset = hint + port_offset;
-		struct inet_timewait_sock *tw = NULL;
+	inet_get_local_port_range(net, &low, &high);
+	high++; /* [32768, 60999] -> [32768, 61000[ */
+	remaining = high - low;
+	if (likely(remaining > 1))
+		remaining &= ~1U;
 
-		inet_get_local_port_range(net, &low, &high);
-		remaining = (high - low) + 1;
+	offset = (hint + port_offset) % remaining;
+	/* In first pass we try ports of @low parity.
+	 * inet_csk_get_port() does the opposite choice.
+	 */
+	offset &= ~1U;
+other_parity_scan:
+	port = low + offset;
+	for (i = 0; i < remaining; i += 2, port += 2) {
+		if (unlikely(port >= high))
+			port -= remaining;
+		if (inet_is_local_reserved_port(net, port))
+			continue;
+		head = &hinfo->bhash[inet_bhashfn(net, port,
+						  hinfo->bhash_size)];
+		spin_lock_bh(&head->lock);
 
-		/* By starting with offset being an even number,
-		 * we tend to leave about 50% of ports for other uses,
-		 * like bind(0).
+		/* Does not bother with rcv_saddr checks, because
+		 * the established check is already unique enough.
 		 */
-		offset &= ~1;
-
-		local_bh_disable();
-		for (i = 0; i < remaining; i++) {
-			port = low + (i + offset) % remaining;
-			if (inet_is_local_reserved_port(net, port))
-				continue;
-			head = &hinfo->bhash[inet_bhashfn(net, port,
-					hinfo->bhash_size)];
-			spin_lock(&head->lock);
-
-			/* Does not bother with rcv_saddr checks,
-			 * because the established check is already
-			 * unique enough.
-			 */
-			inet_bind_bucket_for_each(tb, &head->chain) {
-				if (net_eq(ib_net(tb), net) &&
-				    tb->port == port) {
-					if (tb->fastreuse >= 0 ||
-					    tb->fastreuseport >= 0)
-						goto next_port;
-					WARN_ON(hlist_empty(&tb->owners));
-					if (!check_established(death_row, sk,
-								port, &tw))
-						goto ok;
+		inet_bind_bucket_for_each(tb, &head->chain) {
+			if (net_eq(ib_net(tb), net) && tb->port == port) {
+				if (tb->fastreuse >= 0 ||
+				    tb->fastreuseport >= 0)
 					goto next_port;
-				}
+				WARN_ON(hlist_empty(&tb->owners));
+				if (!check_established(death_row, sk,
+						       port, &tw))
+					goto ok;
+				goto next_port;
 			}
-
-			tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
-					net, head, port);
-			if (!tb) {
-				spin_unlock(&head->lock);
-				break;
-			}
-			tb->fastreuse = -1;
-			tb->fastreuseport = -1;
-			goto ok;
-
-		next_port:
-			spin_unlock(&head->lock);
 		}
-		local_bh_enable();
-
-		return -EADDRNOTAVAIL;
 
-ok:
-		hint += (i + 2) & ~1;
-
-		/* Head lock still held and bh's disabled */
-		inet_bind_hash(sk, tb, port);
-		if (sk_unhashed(sk)) {
-			inet_sk(sk)->inet_sport = htons(port);
-			inet_ehash_nolisten(sk, (struct sock *)tw);
+		tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
+					     net, head, port);
+		if (!tb) {
+			spin_unlock_bh(&head->lock);
+			return -ENOMEM;
 		}
-		if (tw)
-			inet_twsk_bind_unhash(tw, hinfo);
-		spin_unlock(&head->lock);
+		tb->fastreuse = -1;
+		tb->fastreuseport = -1;
+		goto ok;
+next_port:
+		spin_unlock_bh(&head->lock);
+		cond_resched();
+	}
 
-		if (tw)
-			inet_twsk_deschedule_put(tw);
+	offset++;
+	if ((offset & 1) && remaining > 1)
+		goto other_parity_scan;
 
-		ret = 0;
-		goto out;
-	}
+	return -EADDRNOTAVAIL;
 
-	head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
-	tb  = inet_csk(sk)->icsk_bind_hash;
-	spin_lock_bh(&head->lock);
-	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
-		inet_ehash_nolisten(sk, NULL);
-		spin_unlock_bh(&head->lock);
-		return 0;
-	} else {
-		spin_unlock(&head->lock);
-		/* No definite answer... Walk to established hash table */
-		ret = check_established(death_row, sk, snum, NULL);
-out:
-		local_bh_enable();
-		return ret;
+ok:
+	hint += i + 2;
+
+	/* Head lock still held and bh's disabled */
+	inet_bind_hash(sk, tb, port);
+	if (sk_unhashed(sk)) {
+		inet_sk(sk)->inet_sport = htons(port);
+		inet_ehash_nolisten(sk, (struct sock *)tw);
 	}
+	if (tw)
+		inet_twsk_bind_unhash(tw, hinfo);
+	spin_unlock(&head->lock);
+	if (tw)
+		inet_twsk_deschedule_put(tw);
+	local_bh_enable();
+	return 0;
 }
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From ea8add2b190395408b22a9127bed2c0912aecbc8 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 11 Feb 2016 16:28:50 -0800
Subject: tcp/dccp: better use of ephemeral ports in bind()

Implement strategy used in __inet_hash_connect() in opposite way :

Try to find a candidate using odd ports, then fallback to even ports.

We no longer disable BH for whole traversal, but one bucket at a time.
We also use cond_resched() to yield cpu to other tasks if needed.

I removed one indentation level and tried to mirror the loop we have
in __inet_hash_connect() and variable names to ease code maintenance.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_connection_sock.c | 240 +++++++++++++++++++---------------------
 1 file changed, 114 insertions(+), 126 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index c16a2e6273d9..3d28c6d5c3c3 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -91,165 +91,153 @@ EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
 
 /* Obtain a reference to a local port for the given sock,
  * if snum is zero it means select any available local port.
+ * We try to allocate an odd port (and leave even ports for connect())
  */
 int inet_csk_get_port(struct sock *sk, unsigned short snum)
 {
-	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+	bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
+	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
+	int ret = 1, attempts = 5, port = snum;
+	int smallest_size = -1, smallest_port;
 	struct inet_bind_hashbucket *head;
-	struct inet_bind_bucket *tb;
-	int ret, attempts = 5;
 	struct net *net = sock_net(sk);
-	int smallest_size = -1, smallest_rover;
+	int i, low, high, attempt_half;
+	struct inet_bind_bucket *tb;
 	kuid_t uid = sock_i_uid(sk);
-	int attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
+	u32 remaining, offset;
 
-	local_bh_disable();
-	if (!snum) {
-		int remaining, rover, low, high;
+	if (port) {
+have_port:
+		head = &hinfo->bhash[inet_bhashfn(net, port,
+						  hinfo->bhash_size)];
+		spin_lock_bh(&head->lock);
+		inet_bind_bucket_for_each(tb, &head->chain)
+			if (net_eq(ib_net(tb), net) && tb->port == port)
+				goto tb_found;
 
+		goto tb_not_found;
+	}
 again:
-		inet_get_local_port_range(net, &low, &high);
-		if (attempt_half) {
-			int half = low + ((high - low) >> 1);
-
-			if (attempt_half == 1)
-				high = half;
-			else
-				low = half;
-		}
-		remaining = (high - low) + 1;
-		smallest_rover = rover = prandom_u32() % remaining + low;
-
-		smallest_size = -1;
-		do {
-			if (inet_is_local_reserved_port(net, rover))
-				goto next_nolock;
-			head = &hashinfo->bhash[inet_bhashfn(net, rover,
-					hashinfo->bhash_size)];
-			spin_lock(&head->lock);
-			inet_bind_bucket_for_each(tb, &head->chain)
-				if (net_eq(ib_net(tb), net) && tb->port == rover) {
-					if (((tb->fastreuse > 0 &&
-					      sk->sk_reuse &&
-					      sk->sk_state != TCP_LISTEN) ||
-					     (tb->fastreuseport > 0 &&
-					      sk->sk_reuseport &&
-					      !rcu_access_pointer(sk->sk_reuseport_cb) &&
-					      uid_eq(tb->fastuid, uid))) &&
-					    (tb->num_owners < smallest_size || smallest_size == -1)) {
-						smallest_size = tb->num_owners;
-						smallest_rover = rover;
-					}
-					if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
-						snum = rover;
-						goto tb_found;
-					}
-					goto next;
+	attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
+other_half_scan:
+	inet_get_local_port_range(net, &low, &high);
+	high++; /* [32768, 60999] -> [32768, 61000[ */
+	if (high - low < 4)
+		attempt_half = 0;
+	if (attempt_half) {
+		int half = low + (((high - low) >> 2) << 1);
+
+		if (attempt_half == 1)
+			high = half;
+		else
+			low = half;
+	}
+	remaining = high - low;
+	if (likely(remaining > 1))
+		remaining &= ~1U;
+
+	offset = prandom_u32() % remaining;
+	/* __inet_hash_connect() favors ports having @low parity
+	 * We do the opposite to not pollute connect() users.
+	 */
+	offset |= 1U;
+	smallest_size = -1;
+	smallest_port = low; /* avoid compiler warning */
+
+other_parity_scan:
+	port = low + offset;
+	for (i = 0; i < remaining; i += 2, port += 2) {
+		if (unlikely(port >= high))
+			port -= remaining;
+		if (inet_is_local_reserved_port(net, port))
+			continue;
+		head = &hinfo->bhash[inet_bhashfn(net, port,
+						  hinfo->bhash_size)];
+		spin_lock_bh(&head->lock);
+		inet_bind_bucket_for_each(tb, &head->chain)
+			if (net_eq(ib_net(tb), net) && tb->port == port) {
+				if (((tb->fastreuse > 0 && reuse) ||
+				     (tb->fastreuseport > 0 &&
+				      sk->sk_reuseport &&
+				      !rcu_access_pointer(sk->sk_reuseport_cb) &&
+				      uid_eq(tb->fastuid, uid))) &&
+				    (tb->num_owners < smallest_size || smallest_size == -1)) {
+					smallest_size = tb->num_owners;
+					smallest_port = port;
 				}
-			break;
-		next:
-			spin_unlock(&head->lock);
-		next_nolock:
-			if (++rover > high)
-				rover = low;
-		} while (--remaining > 0);
-
-		/* Exhausted local port range during search?  It is not
-		 * possible for us to be holding one of the bind hash
-		 * locks if this test triggers, because if 'remaining'
-		 * drops to zero, we broke out of the do/while loop at
-		 * the top level, not from the 'break;' statement.
-		 */
-		ret = 1;
-		if (remaining <= 0) {
-			if (smallest_size != -1) {
-				snum = smallest_rover;
-				goto have_snum;
+				if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false))
+					goto tb_found;
+				goto next_port;
 			}
-			if (attempt_half == 1) {
-				/* OK we now try the upper half of the range */
-				attempt_half = 2;
-				goto again;
-			}
-			goto fail;
-		}
-		/* OK, here is the one we will use.  HEAD is
-		 * non-NULL and we hold it's mutex.
-		 */
-		snum = rover;
-	} else {
-have_snum:
-		head = &hashinfo->bhash[inet_bhashfn(net, snum,
-				hashinfo->bhash_size)];
-		spin_lock(&head->lock);
-		inet_bind_bucket_for_each(tb, &head->chain)
-			if (net_eq(ib_net(tb), net) && tb->port == snum)
-				goto tb_found;
+		goto tb_not_found;
+next_port:
+		spin_unlock_bh(&head->lock);
+		cond_resched();
 	}
-	tb = NULL;
-	goto tb_not_found;
+
+	if (smallest_size != -1) {
+		port = smallest_port;
+		goto have_port;
+	}
+	offset--;
+	if (!(offset & 1))
+		goto other_parity_scan;
+
+	if (attempt_half == 1) {
+		/* OK we now try the upper half of the range */
+		attempt_half = 2;
+		goto other_half_scan;
+	}
+	return ret;
+
+tb_not_found:
+	tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
+				     net, head, port);
+	if (!tb)
+		goto fail_unlock;
 tb_found:
 	if (!hlist_empty(&tb->owners)) {
 		if (sk->sk_reuse == SK_FORCE_REUSE)
 			goto success;
 
-		if (((tb->fastreuse > 0 &&
-		      sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
+		if (((tb->fastreuse > 0 && reuse) ||
 		     (tb->fastreuseport > 0 &&
-		      sk->sk_reuseport &&
-		      !rcu_access_pointer(sk->sk_reuseport_cb) &&
-		      uid_eq(tb->fastuid, uid))) && smallest_size == -1) {
+		      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
+		    smallest_size == -1)
 			goto success;
-		} else {
-			ret = 1;
-			if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
-				if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
-				     (tb->fastreuseport > 0 &&
-				      sk->sk_reuseport &&
-				      !rcu_access_pointer(sk->sk_reuseport_cb) &&
-				      uid_eq(tb->fastuid, uid))) &&
-				    smallest_size != -1 && --attempts >= 0) {
-					spin_unlock(&head->lock);
-					goto again;
-				}
-
-				goto fail_unlock;
+		if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
+			if ((reuse ||
+			     (tb->fastreuseport > 0 &&
+			      sk->sk_reuseport &&
+			      !rcu_access_pointer(sk->sk_reuseport_cb) &&
+			      uid_eq(tb->fastuid, uid))) &&
+			    smallest_size != -1 && --attempts >= 0) {
+				spin_unlock_bh(&head->lock);
+				goto again;
 			}
+			goto fail_unlock;
 		}
-	}
-tb_not_found:
-	ret = 1;
-	if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
-					net, head, snum)) == NULL)
-		goto fail_unlock;
-	if (hlist_empty(&tb->owners)) {
-		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
-			tb->fastreuse = 1;
-		else
+		if (!reuse)
 			tb->fastreuse = 0;
+		if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))
+			tb->fastreuseport = 0;
+	} else {
+		tb->fastreuse = reuse;
 		if (sk->sk_reuseport) {
 			tb->fastreuseport = 1;
 			tb->fastuid = uid;
-		} else
-			tb->fastreuseport = 0;
-	} else {
-		if (tb->fastreuse &&
-		    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
-			tb->fastreuse = 0;
-		if (tb->fastreuseport &&
-		    (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid)))
+		} else {
 			tb->fastreuseport = 0;
+		}
 	}
 success:
 	if (!inet_csk(sk)->icsk_bind_hash)
-		inet_bind_hash(sk, tb, snum);
+		inet_bind_hash(sk, tb, port);
 	WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
 	ret = 0;
 
 fail_unlock:
-	spin_unlock(&head->lock);
-fail:
-	local_bh_enable();
+	spin_unlock_bh(&head->lock);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(inet_csk_get_port);
-- 
cgit v1.2.3-59-g8ed1b


From 179bc67f69b6cb53ad68cfdec5a917c2a2248355 Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Thu, 11 Feb 2016 20:48:04 +0000
Subject: net: local checksum offload for encapsulation

The arithmetic properties of the ones-complement checksum mean that a
 correctly checksummed inner packet, including its checksum, has a ones
 complement sum depending only on whatever value was used to initialise
 the checksum field before checksumming (in the case of TCP and UDP,
 this is the ones complement sum of the pseudo header, complemented).
Consequently, if we are going to offload the inner checksum with
 CHECKSUM_PARTIAL, we can compute the outer checksum based only on the
 packed data not covered by the inner checksum, and the initial value of
 the inner checksum field.

Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h    | 24 ++++++++++++++++++++++++
 net/ipv4/ip_tunnel_core.c | 10 +++++-----
 net/ipv4/udp.c            | 20 ++++++++++----------
 net/ipv6/ip6_checksum.c   | 14 +++++++-------
 4 files changed, 46 insertions(+), 22 deletions(-)

(limited to 'net')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 6ec86f1a2ed9..cf906d1ce8a7 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3702,5 +3702,29 @@ static inline unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
 	return hdr_len + skb_gso_transport_seglen(skb);
 }
 
+/* Local Checksum Offload.
+ * Compute outer checksum based on the assumption that the
+ * inner checksum will be offloaded later.
+ * Fill in outer checksum adjustment (e.g. with sum of outer
+ * pseudo-header) before calling.
+ * Also ensure that inner checksum is in linear data area.
+ */
+static inline __wsum lco_csum(struct sk_buff *skb)
+{
+	char *inner_csum_field;
+	__wsum csum;
+
+	/* Start with complement of inner checksum adjustment */
+	inner_csum_field = skb->data + skb_checksum_start_offset(skb) +
+				skb->csum_offset;
+	csum = ~csum_unfold(*(__force __sum16 *)inner_csum_field);
+	/* Add in checksum of our headers (incl. outer checksum
+	 * adjustment filled in by caller)
+	 */
+	csum = skb_checksum(skb, 0, skb_checksum_start_offset(skb), csum);
+	/* The result is the checksum from skb->data to end of packet */
+	return csum;
+}
+
 #endif	/* __KERNEL__ */
 #endif	/* _LINUX_SKBUFF_H */
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 859d415c0b2d..d74ce93de1fe 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -166,20 +166,20 @@ struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb,
 		return skb;
 	}
 
-	/* If packet is not gso and we are resolving any partial checksum,
+	/* If packet is not gso and we are not offloading inner checksum,
 	 * clear encapsulation flag. This allows setting CHECKSUM_PARTIAL
 	 * on the outer header without confusing devices that implement
 	 * NETIF_F_IP_CSUM with encapsulation.
 	 */
-	if (csum_help)
-		skb->encapsulation = 0;
-
 	if (skb->ip_summed == CHECKSUM_PARTIAL && csum_help) {
+		skb->encapsulation = 0;
 		err = skb_checksum_help(skb);
 		if (unlikely(err))
 			goto error;
-	} else if (skb->ip_summed != CHECKSUM_PARTIAL)
+	} else if (skb->ip_summed != CHECKSUM_PARTIAL) {
 		skb->ip_summed = CHECKSUM_NONE;
+		skb->encapsulation = 0;
+	}
 
 	return skb;
 error:
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index ac3cedb25a9f..a59341cf483e 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -848,16 +848,18 @@ void udp_set_csum(bool nocheck, struct sk_buff *skb,
 {
 	struct udphdr *uh = udp_hdr(skb);
 
-	if (nocheck)
+	if (nocheck) {
 		uh->check = 0;
-	else if (skb_is_gso(skb))
+	} else if (skb_is_gso(skb)) {
 		uh->check = ~udp_v4_check(len, saddr, daddr, 0);
-	else if (skb_dst(skb) && skb_dst(skb)->dev &&
-		 (skb_dst(skb)->dev->features &
-		  (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM))) {
-
-		BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL);
-
+	} else if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		uh->check = 0;
+		uh->check = udp_v4_check(len, saddr, daddr, lco_csum(skb));
+		if (uh->check == 0)
+			uh->check = CSUM_MANGLED_0;
+	} else if (skb_dst(skb) && skb_dst(skb)->dev &&
+		   (skb_dst(skb)->dev->features &
+		    (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM))) {
 		skb->ip_summed = CHECKSUM_PARTIAL;
 		skb->csum_start = skb_transport_header(skb) - skb->head;
 		skb->csum_offset = offsetof(struct udphdr, check);
@@ -865,8 +867,6 @@ void udp_set_csum(bool nocheck, struct sk_buff *skb,
 	} else {
 		__wsum csum;
 
-		BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL);
-
 		uh->check = 0;
 		csum = skb_checksum(skb, 0, len, 0);
 		uh->check = udp_v4_check(len, saddr, daddr, csum);
diff --git a/net/ipv6/ip6_checksum.c b/net/ipv6/ip6_checksum.c
index 9a4d7322fb22..4924bd704e89 100644
--- a/net/ipv6/ip6_checksum.c
+++ b/net/ipv6/ip6_checksum.c
@@ -98,11 +98,13 @@ void udp6_set_csum(bool nocheck, struct sk_buff *skb,
 		uh->check = 0;
 	else if (skb_is_gso(skb))
 		uh->check = ~udp_v6_check(len, saddr, daddr, 0);
-	else if (skb_dst(skb) && skb_dst(skb)->dev &&
-		 (skb_dst(skb)->dev->features & NETIF_F_IPV6_CSUM)) {
-
-		BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL);
-
+	else if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		uh->check = 0;
+		uh->check = udp_v6_check(len, saddr, daddr, lco_csum(skb));
+		if (uh->check == 0)
+			uh->check = CSUM_MANGLED_0;
+	} else if (skb_dst(skb) && skb_dst(skb)->dev &&
+		   (skb_dst(skb)->dev->features & NETIF_F_IPV6_CSUM)) {
 		skb->ip_summed = CHECKSUM_PARTIAL;
 		skb->csum_start = skb_transport_header(skb) - skb->head;
 		skb->csum_offset = offsetof(struct udphdr, check);
@@ -110,8 +112,6 @@ void udp6_set_csum(bool nocheck, struct sk_buff *skb,
 	} else {
 		__wsum csum;
 
-		BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL);
-
 		uh->check = 0;
 		csum = skb_checksum(skb, 0, len, 0);
 		uh->check = udp_v6_check(len, saddr, daddr, csum);
-- 
cgit v1.2.3-59-g8ed1b


From d75f1306d9464d535007f05e5da0afcf9e3916d9 Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Thu, 11 Feb 2016 20:49:40 +0000
Subject: net: udp: always set up for CHECKSUM_PARTIAL offload

If the dst device doesn't support it, it'll get fixed up later anyway
 by validate_xmit_skb().  Also, this allows us to take advantage of LCO
 to avoid summing the payload multiple times.

Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c          | 14 +-------------
 net/ipv6/ip6_checksum.c | 13 +------------
 2 files changed, 2 insertions(+), 25 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index a59341cf483e..9fc4e9c06aae 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -857,23 +857,11 @@ void udp_set_csum(bool nocheck, struct sk_buff *skb,
 		uh->check = udp_v4_check(len, saddr, daddr, lco_csum(skb));
 		if (uh->check == 0)
 			uh->check = CSUM_MANGLED_0;
-	} else if (skb_dst(skb) && skb_dst(skb)->dev &&
-		   (skb_dst(skb)->dev->features &
-		    (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM))) {
+	} else {
 		skb->ip_summed = CHECKSUM_PARTIAL;
 		skb->csum_start = skb_transport_header(skb) - skb->head;
 		skb->csum_offset = offsetof(struct udphdr, check);
 		uh->check = ~udp_v4_check(len, saddr, daddr, 0);
-	} else {
-		__wsum csum;
-
-		uh->check = 0;
-		csum = skb_checksum(skb, 0, len, 0);
-		uh->check = udp_v4_check(len, saddr, daddr, csum);
-		if (uh->check == 0)
-			uh->check = CSUM_MANGLED_0;
-
-		skb->ip_summed = CHECKSUM_UNNECESSARY;
 	}
 }
 EXPORT_SYMBOL(udp_set_csum);
diff --git a/net/ipv6/ip6_checksum.c b/net/ipv6/ip6_checksum.c
index 4924bd704e89..8f920580976f 100644
--- a/net/ipv6/ip6_checksum.c
+++ b/net/ipv6/ip6_checksum.c
@@ -103,22 +103,11 @@ void udp6_set_csum(bool nocheck, struct sk_buff *skb,
 		uh->check = udp_v6_check(len, saddr, daddr, lco_csum(skb));
 		if (uh->check == 0)
 			uh->check = CSUM_MANGLED_0;
-	} else if (skb_dst(skb) && skb_dst(skb)->dev &&
-		   (skb_dst(skb)->dev->features & NETIF_F_IPV6_CSUM)) {
+	} else {
 		skb->ip_summed = CHECKSUM_PARTIAL;
 		skb->csum_start = skb_transport_header(skb) - skb->head;
 		skb->csum_offset = offsetof(struct udphdr, check);
 		uh->check = ~udp_v6_check(len, saddr, daddr, 0);
-	} else {
-		__wsum csum;
-
-		uh->check = 0;
-		csum = skb_checksum(skb, 0, len, 0);
-		uh->check = udp_v6_check(len, saddr, daddr, csum);
-		if (uh->check == 0)
-			uh->check = CSUM_MANGLED_0;
-
-		skb->ip_summed = CHECKSUM_UNNECESSARY;
 	}
 }
 EXPORT_SYMBOL(udp6_set_csum);
-- 
cgit v1.2.3-59-g8ed1b


From 06f622926d0cdf923e9b33ab9f0a4d4b45983e8a Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Thu, 11 Feb 2016 21:00:16 +0000
Subject: fou: enable LCO in FOU and GUE

Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fou.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 976f0dcf6991..dac1874a5911 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -774,7 +774,6 @@ static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e,
 	uh->dest = e->dport;
 	uh->source = sport;
 	uh->len = htons(skb->len);
-	uh->check = 0;
 	udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb,
 		     fl4->saddr, fl4->daddr, skb->len);
 
@@ -784,11 +783,11 @@ static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e,
 int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
 		     u8 *protocol, struct flowi4 *fl4)
 {
-	bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM);
-	int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
+	int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM :
+						       SKB_GSO_UDP_TUNNEL;
 	__be16 sport;
 
-	skb = iptunnel_handle_offloads(skb, csum, type);
+	skb = iptunnel_handle_offloads(skb, false, type);
 
 	if (IS_ERR(skb))
 		return PTR_ERR(skb);
@@ -804,8 +803,8 @@ EXPORT_SYMBOL(fou_build_header);
 int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
 		     u8 *protocol, struct flowi4 *fl4)
 {
-	bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM);
-	int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
+	int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM :
+						       SKB_GSO_UDP_TUNNEL;
 	struct guehdr *guehdr;
 	size_t hdrlen, optlen = 0;
 	__be16 sport;
@@ -814,7 +813,6 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
 
 	if ((e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) &&
 	    skb->ip_summed == CHECKSUM_PARTIAL) {
-		csum = false;
 		optlen += GUE_PLEN_REMCSUM;
 		type |= SKB_GSO_TUNNEL_REMCSUM;
 		need_priv = true;
@@ -822,7 +820,7 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
 
 	optlen += need_priv ? GUE_LEN_PRIV : 0;
 
-	skb = iptunnel_handle_offloads(skb, csum, type);
+	skb = iptunnel_handle_offloads(skb, false, type);
 
 	if (IS_ERR(skb))
 		return PTR_ERR(skb);
-- 
cgit v1.2.3-59-g8ed1b


From 53936107ba2462f714bae19a754f3ebf69a11e40 Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Thu, 11 Feb 2016 21:01:07 +0000
Subject: net: gre: Implement LCO for GRE over IPv4

Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 7c51c4e1661f..9b31532d95f4 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -440,6 +440,17 @@ drop:
 	return 0;
 }
 
+static __sum16 gre_checksum(struct sk_buff *skb)
+{
+	__wsum csum;
+
+	if (skb->ip_summed == CHECKSUM_PARTIAL)
+		csum = lco_csum(skb);
+	else
+		csum = skb_checksum(skb, 0, skb->len, 0);
+	return csum_fold(csum);
+}
+
 static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags,
 			 __be16 proto, __be32 key, __be32 seq)
 {
@@ -467,8 +478,7 @@ static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags,
 		    !(skb_shinfo(skb)->gso_type &
 		      (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) {
 			*ptr = 0;
-			*(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
-								 skb->len, 0));
+			*(__sum16 *)ptr = gre_checksum(skb);
 		}
 	}
 }
@@ -493,7 +503,7 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
 static struct sk_buff *gre_handle_offloads(struct sk_buff *skb,
 					   bool csum)
 {
-	return iptunnel_handle_offloads(skb, csum,
+	return iptunnel_handle_offloads(skb, false,
 					csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 6fa79666e24d32be1b709f5269af41ed9e829e7e Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Thu, 11 Feb 2016 21:02:31 +0000
Subject: net: ip_tunnel: remove 'csum_help' argument to
 iptunnel_handle_offloads

All users now pass false, so we can remove it, and remove the code that
 was conditional upon it.

Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c             |  2 +-
 include/net/ip_tunnels.h        |  3 +--
 include/net/udp_tunnel.h        |  3 +--
 net/ipv4/fou.c                  |  4 ++--
 net/ipv4/ip_gre.c               |  3 +--
 net/ipv4/ip_tunnel_core.c       | 18 ++++++------------
 net/ipv4/ipip.c                 |  2 +-
 net/ipv6/sit.c                  |  4 ++--
 net/netfilter/ipvs/ip_vs_xmit.c |  6 ++----
 9 files changed, 17 insertions(+), 28 deletions(-)

(limited to 'net')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 9f52203ac860..0a23c64379d6 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1721,7 +1721,7 @@ static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst,
 	if (WARN_ON(!skb))
 		return -ENOMEM;
 
-	skb = iptunnel_handle_offloads(skb, false, type);
+	skb = iptunnel_handle_offloads(skb, type);
 	if (IS_ERR(skb))
 		return PTR_ERR(skb);
 
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 6db96ea0144f..bc439f32baa9 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -279,8 +279,7 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
 struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
 					     gfp_t flags);
 
-struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, bool gre_csum,
-					 int gso_type_mask);
+struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, int gso_type_mask);
 
 static inline void iptunnel_xmit_stats(struct net_device *dev, int pkt_len)
 {
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 734c15662ea9..97f5adb121a6 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -103,8 +103,7 @@ static inline struct sk_buff *udp_tunnel_handle_offloads(struct sk_buff *skb,
 {
 	int type = udp_csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
 
-	/* As we're a UDP tunnel, we support LCO, so don't need csum_help */
-	return iptunnel_handle_offloads(skb, false, type);
+	return iptunnel_handle_offloads(skb, type);
 }
 
 static inline void udp_tunnel_gro_complete(struct sk_buff *skb, int nhoff)
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index dac1874a5911..88dab0c1670c 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -787,7 +787,7 @@ int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
 						       SKB_GSO_UDP_TUNNEL;
 	__be16 sport;
 
-	skb = iptunnel_handle_offloads(skb, false, type);
+	skb = iptunnel_handle_offloads(skb, type);
 
 	if (IS_ERR(skb))
 		return PTR_ERR(skb);
@@ -820,7 +820,7 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
 
 	optlen += need_priv ? GUE_LEN_PRIV : 0;
 
-	skb = iptunnel_handle_offloads(skb, false, type);
+	skb = iptunnel_handle_offloads(skb, type);
 
 	if (IS_ERR(skb))
 		return PTR_ERR(skb);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 9b31532d95f4..65748db44285 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -503,8 +503,7 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
 static struct sk_buff *gre_handle_offloads(struct sk_buff *skb,
 					   bool csum)
 {
-	return iptunnel_handle_offloads(skb, false,
-					csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
+	return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
 }
 
 static struct rtable *gre_get_rt(struct sk_buff *skb,
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index d74ce93de1fe..a6e58b6141cd 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -148,7 +148,6 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
 EXPORT_SYMBOL_GPL(iptunnel_metadata_reply);
 
 struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb,
-					 bool csum_help,
 					 int gso_type_mask)
 {
 	int err;
@@ -166,18 +165,13 @@ struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb,
 		return skb;
 	}
 
-	/* If packet is not gso and we are not offloading inner checksum,
-	 * clear encapsulation flag. This allows setting CHECKSUM_PARTIAL
-	 * on the outer header without confusing devices that implement
-	 * NETIF_F_IP_CSUM with encapsulation.
-	 */
-	if (skb->ip_summed == CHECKSUM_PARTIAL && csum_help) {
-		skb->encapsulation = 0;
-		err = skb_checksum_help(skb);
-		if (unlikely(err))
-			goto error;
-	} else if (skb->ip_summed != CHECKSUM_PARTIAL) {
+	if (skb->ip_summed != CHECKSUM_PARTIAL) {
 		skb->ip_summed = CHECKSUM_NONE;
+		/* We clear encapsulation here to prevent badly-written
+		 * drivers potentially deciding to offload an inner checksum
+		 * if we set CHECKSUM_PARTIAL on the outer header.
+		 * This should go away when the drivers are all fixed.
+		 */
 		skb->encapsulation = 0;
 	}
 
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 4044da61e747..6ec5b42fd172 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -219,7 +219,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (unlikely(skb->protocol != htons(ETH_P_IP)))
 		goto tx_error;
 
-	skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP);
+	skb = iptunnel_handle_offloads(skb, SKB_GSO_IPIP);
 	if (IS_ERR(skb))
 		goto out;
 
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 2066d1c25a11..9a6b407f5840 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -911,7 +911,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
 		goto tx_error;
 	}
 
-	skb = iptunnel_handle_offloads(skb, false, SKB_GSO_SIT);
+	skb = iptunnel_handle_offloads(skb, SKB_GSO_SIT);
 	if (IS_ERR(skb)) {
 		ip_rt_put(rt);
 		goto out;
@@ -1000,7 +1000,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 	const struct iphdr  *tiph = &tunnel->parms.iph;
 
-	skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP);
+	skb = iptunnel_handle_offloads(skb, SKB_GSO_IPIP);
 	if (IS_ERR(skb))
 		goto out;
 
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 3264cb49b333..a3f5cd9b3c4c 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -1019,8 +1019,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	if (IS_ERR(skb))
 		goto tx_error;
 
-	skb = iptunnel_handle_offloads(
-		skb, false, __tun_gso_type_mask(AF_INET, cp->af));
+	skb = iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET, cp->af));
 	if (IS_ERR(skb))
 		goto tx_error;
 
@@ -1112,8 +1111,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	if (IS_ERR(skb))
 		goto tx_error;
 
-	skb = iptunnel_handle_offloads(
-		skb, false, __tun_gso_type_mask(AF_INET6, cp->af));
+	skb = iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET6, cp->af));
 	if (IS_ERR(skb))
 		goto tx_error;
 
-- 
cgit v1.2.3-59-g8ed1b


From fc4fa6e112c0f999fab022a4eb7f6614bb47c7ab Mon Sep 17 00:00:00 2001
From: Masanari Iida <standby24x7@gmail.com>
Date: Sun, 13 Dec 2015 15:26:11 +0900
Subject: treewide: Fix typo in printk

This patch fix spelling typos found in printk and Kconfig.

Signed-off-by: Masanari Iida <standby24x7@gmail.com>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/atm/firestream.c                            | 2 +-
 drivers/crypto/nx/nx-842.c                          | 2 +-
 drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c       | 2 +-
 drivers/input/touchscreen/wdt87xx_i2c.c             | 2 +-
 drivers/net/ethernet/nuvoton/w90p910_ether.c        | 2 +-
 drivers/net/wireless/realtek/rtlwifi/rtl8821ae/dm.c | 2 +-
 drivers/usb/gadget/legacy/Kconfig                   | 3 +--
 kernel/time/timekeeping.c                           | 2 +-
 lib/842/842_decompress.c                            | 2 +-
 net/openvswitch/vport-geneve.c                      | 2 +-
 tools/testing/selftests/timers/alarmtimer-suspend.c | 2 +-
 11 files changed, 11 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/drivers/atm/firestream.c b/drivers/atm/firestream.c
index 82f2ae0d7cc4..a969a7e443be 100644
--- a/drivers/atm/firestream.c
+++ b/drivers/atm/firestream.c
@@ -168,7 +168,7 @@ static char *res_strings[] = {
 	"reserved 14", 
 	"Unrecognized cell", 
 	"reserved 16", 
-	"reassemby abort: AAL5 abort", 
+	"reassembly abort: AAL5 abort", 
 	"packet purged", 
 	"packet ageing timeout", 
 	"channel ageing timeout", 
diff --git a/drivers/crypto/nx/nx-842.c b/drivers/crypto/nx/nx-842.c
index 046c1c45411b..d94e25df503b 100644
--- a/drivers/crypto/nx/nx-842.c
+++ b/drivers/crypto/nx/nx-842.c
@@ -308,7 +308,7 @@ int nx842_crypto_compress(struct crypto_tfm *tfm,
 		h = !n && add_header ? hdrsize : 0;
 
 		if (ignore)
-			pr_warn("interal error, ignore is set %x\n", ignore);
+			pr_warn("internal error, ignore is set %x\n", ignore);
 
 		ret = compress(ctx, &p, &hdr->group[n], &c, &ignore, h);
 		if (ret)
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c
index 85dc3f989ff7..704d08744116 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c
@@ -64,7 +64,7 @@ const char *usnic_ib_qp_grp_state_to_string(enum ib_qp_state state)
 	case IB_QPS_ERR:
 		return "ERR";
 	default:
-		return "UNKOWN STATE";
+		return "UNKNOWN STATE";
 
 	}
 }
diff --git a/drivers/input/touchscreen/wdt87xx_i2c.c b/drivers/input/touchscreen/wdt87xx_i2c.c
index 515c20a6e10f..73861ad22df4 100644
--- a/drivers/input/touchscreen/wdt87xx_i2c.c
+++ b/drivers/input/touchscreen/wdt87xx_i2c.c
@@ -848,7 +848,7 @@ static int wdt87xx_do_update_firmware(struct i2c_client *client,
 	error = wdt87xx_get_sysparam(client, &wdt->param);
 	if (error)
 		dev_err(&client->dev,
-			"failed to refresh system paramaters: %d\n", error);
+			"failed to refresh system parameters: %d\n", error);
 out:
 	enable_irq(client->irq);
 	mutex_unlock(&wdt->fw_mutex);
diff --git a/drivers/net/ethernet/nuvoton/w90p910_ether.c b/drivers/net/ethernet/nuvoton/w90p910_ether.c
index afa445842f3e..52d9a94aebb9 100644
--- a/drivers/net/ethernet/nuvoton/w90p910_ether.c
+++ b/drivers/net/ethernet/nuvoton/w90p910_ether.c
@@ -1038,7 +1038,7 @@ static int w90p910_ether_probe(struct platform_device *pdev)
 
 	error = register_netdev(dev);
 	if (error != 0) {
-		dev_err(&pdev->dev, "Regiter EMC w90p910 FAILED\n");
+		dev_err(&pdev->dev, "Register EMC w90p910 FAILED\n");
 		error = -ENODEV;
 		goto failed_put_rmiiclk;
 	}
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/dm.c b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/dm.c
index b57cfd965196..95dcbff4673b 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/dm.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/dm.c
@@ -626,7 +626,7 @@ static void rtl8821ae_dm_find_minimum_rssi(struct ieee80211_hw *hw)
 		rtl_dm_dig->min_undec_pwdb_for_dm =
 		    rtlpriv->dm.entry_min_undec_sm_pwdb;
 		RT_TRACE(rtlpriv, COMP_BB_POWERSAVING, DBG_LOUD,
-			 "AP Ext Port or disconnet PWDB = 0x%x\n",
+			 "AP Ext Port or disconnect PWDB = 0x%x\n",
 			 rtl_dm_dig->min_undec_pwdb_for_dm);
 	}
 	RT_TRACE(rtlpriv, COMP_DIG, DBG_LOUD,
diff --git a/drivers/usb/gadget/legacy/Kconfig b/drivers/usb/gadget/legacy/Kconfig
index 4d682ad7bf23..fb1cdd9af472 100644
--- a/drivers/usb/gadget/legacy/Kconfig
+++ b/drivers/usb/gadget/legacy/Kconfig
@@ -103,8 +103,7 @@ config USB_ETH
 	   - CDC Ethernet Emulation Model (EEM) is a newer standard that has
 	     a simpler interface that can be used by more USB hardware.
 
-	  RNDIS support is an additional option, more demanding than than
-	  subset.
+	  RNDIS support is an additional option, more demanding than subset.
 
 	  Within the USB device, this gadget driver exposes a network device
 	  "usbX", where X depends on what other networking devices you have.
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index b1356b7ae570..0d4cc7601df7 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -131,7 +131,7 @@ static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
 		printk_deferred("         timekeeping: Your kernel is sick, but tries to cope by capping time updates\n");
 	} else {
 		if (offset > (max_cycles >> 1)) {
-			printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the the '%s' clock's 50%% safety margin (%lld)\n",
+			printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the '%s' clock's 50%% safety margin (%lld)\n",
 					offset, name, max_cycles >> 1);
 			printk_deferred("      timekeeping: Your kernel is still fine, but is feeling a bit nervous\n");
 		}
diff --git a/lib/842/842_decompress.c b/lib/842/842_decompress.c
index 8881dad2a6a0..a2a941f8112d 100644
--- a/lib/842/842_decompress.c
+++ b/lib/842/842_decompress.c
@@ -250,7 +250,7 @@ static int do_op(struct sw842_param *p, u8 o)
 		case OP_ACTION_NOOP:
 			break;
 		default:
-			pr_err("Interal error, invalid op %x\n", op);
+			pr_err("Internal error, invalid op %x\n", op);
 			return -EINVAL;
 		}
 
diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c
index efb736bb6855..69f1de58a3b4 100644
--- a/net/openvswitch/vport-geneve.c
+++ b/net/openvswitch/vport-geneve.c
@@ -133,6 +133,6 @@ static void __exit ovs_geneve_tnl_exit(void)
 module_init(ovs_geneve_tnl_init);
 module_exit(ovs_geneve_tnl_exit);
 
-MODULE_DESCRIPTION("OVS: Geneve swiching port");
+MODULE_DESCRIPTION("OVS: Geneve switching port");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("vport-type-5");
diff --git a/tools/testing/selftests/timers/alarmtimer-suspend.c b/tools/testing/selftests/timers/alarmtimer-suspend.c
index 72cacf5383dd..2b361b830395 100644
--- a/tools/testing/selftests/timers/alarmtimer-suspend.c
+++ b/tools/testing/selftests/timers/alarmtimer-suspend.c
@@ -153,7 +153,7 @@ int main(void)
 
 		alarmcount = 0;
 		if (timer_create(alarm_clock_id, &se, &tm1) == -1) {
-			printf("timer_create failled, %s unspported?\n",
+			printf("timer_create failed, %s unsupported?\n",
 					clockstring(alarm_clock_id));
 			break;
 		}
-- 
cgit v1.2.3-59-g8ed1b


From d4ab4286276fcd6c155bafdf4422b712068d2516 Mon Sep 17 00:00:00 2001
From: "Keller, Jacob E" <jacob.e.keller@intel.com>
Date: Mon, 8 Feb 2016 16:05:03 -0800
Subject: ethtool: correctly ensure {GS}CHANNELS doesn't conflict with GS{RXFH}

Ethernet drivers implementing both {GS}RXFH and {GS}CHANNELS ethtool ops
incorrectly allow SCHANNELS when it would conflict with the settings
from SRXFH. This occurs because it is not possible for drivers to
understand whether their Rx flow indirection table has been configured
or is in the default state. In addition, drivers currently behave in
various ways when increasing the number of Rx channels.

Some drivers will always destroy the Rx flow indirection table when this
occurs, whether it has been set by the user or not. Other drivers will
attempt to preserve the table even if the user has never modified it
from the default driver settings. Neither of these situation is
desirable because it leads to unexpected behavior or loss of user
configuration.

The correct behavior is to simply return -EINVAL when SCHANNELS would
conflict with the current Rx flow table settings. However, it should
only do so if the current settings were modified by the user. If we
required that the new settings never conflict with the current (default)
Rx flow settings, we would force users to first reduce their Rx flow
settings and then reduce the number of Rx channels.

This patch proposes a solution implemented in net/core/ethtool.c which
ensures that all drivers behave correctly. It checks whether the RXFH
table has been configured to non-default settings, and stores this
information in a private netdev flag. When the number of channels is
requested to change, it first ensures that the current Rx flow table is
not going to assign flows to now disabled channels.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  8 +++++++
 net/core/ethtool.c        | 55 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+)

(limited to 'net')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 219f53c30cb3..0499569c256d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1291,6 +1291,7 @@ struct net_device_ops {
  * @IFF_OPENVSWITCH: device is a Open vSwitch master
  * @IFF_L3MDEV_SLAVE: device is enslaved to an L3 master device
  * @IFF_TEAM: device is a team device
+ * @IFF_RXFH_CONFIGURED: device has had Rx Flow indirection table configured
  */
 enum netdev_priv_flags {
 	IFF_802_1Q_VLAN			= 1<<0,
@@ -1318,6 +1319,7 @@ enum netdev_priv_flags {
 	IFF_OPENVSWITCH			= 1<<22,
 	IFF_L3MDEV_SLAVE		= 1<<23,
 	IFF_TEAM			= 1<<24,
+	IFF_RXFH_CONFIGURED		= 1<<25,
 };
 
 #define IFF_802_1Q_VLAN			IFF_802_1Q_VLAN
@@ -1345,6 +1347,7 @@ enum netdev_priv_flags {
 #define IFF_OPENVSWITCH			IFF_OPENVSWITCH
 #define IFF_L3MDEV_SLAVE		IFF_L3MDEV_SLAVE
 #define IFF_TEAM			IFF_TEAM
+#define IFF_RXFH_CONFIGURED		IFF_RXFH_CONFIGURED
 
 /**
  *	struct net_device - The DEVICE structure.
@@ -4048,6 +4051,11 @@ static inline bool netif_is_lag_port(const struct net_device *dev)
 	return netif_is_bond_slave(dev) || netif_is_team_port(dev);
 }
 
+static inline bool netif_is_rxfh_configured(const struct net_device *dev)
+{
+	return dev->priv_flags & IFF_RXFH_CONFIGURED;
+}
+
 /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */
 static inline void netif_keep_dst(struct net_device *dev)
 {
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 453c803f1c87..379bdc59b1c8 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -642,6 +642,37 @@ void netdev_rss_key_fill(void *buffer, size_t len)
 }
 EXPORT_SYMBOL(netdev_rss_key_fill);
 
+static int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max)
+{
+	u32 dev_size, current_max = 0;
+	u32 *indir;
+	int ret;
+
+	if (!dev->ethtool_ops->get_rxfh_indir_size ||
+	    !dev->ethtool_ops->get_rxfh)
+		return -EOPNOTSUPP;
+	dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev);
+	if (dev_size == 0)
+		return -EOPNOTSUPP;
+
+	indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER);
+	if (!indir)
+		return -ENOMEM;
+
+	ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL);
+	if (ret)
+		goto out;
+
+	while (dev_size--)
+		current_max = max(current_max, indir[dev_size]);
+
+	*max = current_max;
+
+out:
+	kfree(indir);
+	return ret;
+}
+
 static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
 						     void __user *useraddr)
 {
@@ -738,6 +769,14 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,
 	}
 
 	ret = ops->set_rxfh(dev, indir, NULL, ETH_RSS_HASH_NO_CHANGE);
+	if (ret)
+		goto out;
+
+	/* indicate whether rxfh was set to default */
+	if (user_size == 0)
+		dev->priv_flags &= ~IFF_RXFH_CONFIGURED;
+	else
+		dev->priv_flags |= IFF_RXFH_CONFIGURED;
 
 out:
 	kfree(indir);
@@ -897,6 +936,14 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
 	}
 
 	ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc);
+	if (ret)
+		goto out;
+
+	/* indicate whether rxfh was set to default */
+	if (rxfh.indir_size == 0)
+		dev->priv_flags &= ~IFF_RXFH_CONFIGURED;
+	else if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE)
+		dev->priv_flags |= IFF_RXFH_CONFIGURED;
 
 out:
 	kfree(rss_config);
@@ -1228,6 +1275,7 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
 						   void __user *useraddr)
 {
 	struct ethtool_channels channels;
+	u32 max_rx_in_use = 0;
 
 	if (!dev->ethtool_ops->set_channels)
 		return -EOPNOTSUPP;
@@ -1235,6 +1283,13 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
 	if (copy_from_user(&channels, useraddr, sizeof(channels)))
 		return -EFAULT;
 
+	/* ensure the new Rx count fits within the configured Rx flow
+	 * indirection table settings */
+	if (netif_is_rxfh_configured(dev) &&
+	    !ethtool_get_max_rxfh_channel(dev, &max_rx_in_use) &&
+	    (channels.combined_count + channels.rx_count) <= max_rx_in_use)
+	    return -EINVAL;
+
 	return dev->ethtool_ops->set_channels(dev, &channels);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 8bf3686204861d39803797ebbd1e264442421907 Mon Sep 17 00:00:00 2001
From: "Keller, Jacob E" <jacob.e.keller@intel.com>
Date: Mon, 8 Feb 2016 16:05:04 -0800
Subject: ethtool: ensure channel counts are within bounds during SCHANNELS

Add a sanity check to ensure that all requested channel sizes are within
bounds, which should reduce errors in driver implementation.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 379bdc59b1c8..65f907aea777 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1274,15 +1274,24 @@ static noinline_for_stack int ethtool_get_channels(struct net_device *dev,
 static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
 						   void __user *useraddr)
 {
-	struct ethtool_channels channels;
+	struct ethtool_channels channels, max;
 	u32 max_rx_in_use = 0;
 
-	if (!dev->ethtool_ops->set_channels)
+	if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels)
 		return -EOPNOTSUPP;
 
 	if (copy_from_user(&channels, useraddr, sizeof(channels)))
 		return -EFAULT;
 
+	dev->ethtool_ops->get_channels(dev, &max);
+
+	/* ensure new counts are within the maximums */
+	if ((channels.rx_count > max.max_rx) ||
+	    (channels.tx_count > max.max_tx) ||
+	    (channels.combined_count > max.max_combined) ||
+	    (channels.other_count > max.max_other))
+		return -EINVAL;
+
 	/* ensure the new Rx count fits within the configured Rx flow
 	 * indirection table settings */
 	if (netif_is_rxfh_configured(dev) &&
-- 
cgit v1.2.3-59-g8ed1b


From 4952cd3e7b47dfe8f7d6c69973b13eb487eb2bd0 Mon Sep 17 00:00:00 2001
From: Richard Alpe <richard.alpe@ericsson.com>
Date: Thu, 11 Feb 2016 10:43:15 +0100
Subject: tipc: refactor node xmit and fix memory leaks

Refactor tipc_node_xmit() to fail fast and fail early. Fix several
potential memory leaks in unexpected error paths.

Reported-by: Dmitry Vyukov <dvyukov@google.com>
Reviewed-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Richard Alpe <richard.alpe@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/link.c |  8 ++++++--
 net/tipc/node.c | 54 ++++++++++++++++++++++++++++++++----------------------
 2 files changed, 38 insertions(+), 24 deletions(-)

(limited to 'net')

diff --git a/net/tipc/link.c b/net/tipc/link.c
index 6f4a6d9b0149..3e513daecf80 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -903,8 +903,10 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
 		if (unlikely(l->backlog[i].len >= l->backlog[i].limit))
 			return link_schedule_user(l, list);
 	}
-	if (unlikely(msg_size(hdr) > mtu))
+	if (unlikely(msg_size(hdr) > mtu)) {
+		skb_queue_purge(list);
 		return -EMSGSIZE;
+	}
 
 	/* Prepare each packet for sending, and add to relevant queue: */
 	while (skb_queue_len(list)) {
@@ -916,8 +918,10 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
 
 		if (likely(skb_queue_len(transmq) < maxwin)) {
 			_skb = skb_clone(skb, GFP_ATOMIC);
-			if (!_skb)
+			if (!_skb) {
+				skb_queue_purge(list);
 				return -ENOBUFS;
+			}
 			__skb_dequeue(list);
 			__skb_queue_tail(transmq, skb);
 			__skb_queue_tail(xmitq, _skb);
diff --git a/net/tipc/node.c b/net/tipc/node.c
index f8a8255a7182..10a1e8717c6f 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -1166,7 +1166,7 @@ msg_full:
  * @dnode: address of destination node
  * @selector: a number used for deterministic link selection
  * Consumes the buffer chain, except when returning -ELINKCONG
- * Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE
+ * Returns 0 if success, otherwise: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE,-ENOBUF
  */
 int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
 		   u32 dnode, int selector)
@@ -1174,33 +1174,43 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
 	struct tipc_link_entry *le = NULL;
 	struct tipc_node *n;
 	struct sk_buff_head xmitq;
-	int bearer_id = -1;
-	int rc = -EHOSTUNREACH;
+	int bearer_id;
+	int rc;
+
+	if (in_own_node(net, dnode)) {
+		tipc_sk_rcv(net, list);
+		return 0;
+	}
 
-	__skb_queue_head_init(&xmitq);
 	n = tipc_node_find(net, dnode);
-	if (likely(n)) {
-		tipc_node_read_lock(n);
-		bearer_id = n->active_links[selector & 1];
-		if (bearer_id >= 0) {
-			le = &n->links[bearer_id];
-			spin_lock_bh(&le->lock);
-			rc = tipc_link_xmit(le->link, list, &xmitq);
-			spin_unlock_bh(&le->lock);
-		}
+	if (unlikely(!n)) {
+		skb_queue_purge(list);
+		return -EHOSTUNREACH;
+	}
+
+	tipc_node_read_lock(n);
+	bearer_id = n->active_links[selector & 1];
+	if (unlikely(bearer_id == INVALID_BEARER_ID)) {
 		tipc_node_read_unlock(n);
-		if (likely(!rc))
-			tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr);
-		else if (rc == -ENOBUFS)
-			tipc_node_link_down(n, bearer_id, false);
 		tipc_node_put(n);
-		return rc;
+		skb_queue_purge(list);
+		return -EHOSTUNREACH;
 	}
 
-	if (likely(in_own_node(net, dnode))) {
-		tipc_sk_rcv(net, list);
-		return 0;
-	}
+	__skb_queue_head_init(&xmitq);
+	le = &n->links[bearer_id];
+	spin_lock_bh(&le->lock);
+	rc = tipc_link_xmit(le->link, list, &xmitq);
+	spin_unlock_bh(&le->lock);
+	tipc_node_read_unlock(n);
+
+	if (likely(rc == 0))
+		tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr);
+	else if (rc == -ENOBUFS)
+		tipc_node_link_down(n, bearer_id, false);
+
+	tipc_node_put(n);
+
 	return rc;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 911362c70df5b766c243dc297fadeaced786ffd8 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 12 Feb 2016 15:43:53 +0100
Subject: net: add dst_cache support

This patch add a generic, lockless dst cache implementation.
The need for lock is avoided updating the dst cache fields
only in per cpu scope, and requiring that the cache manipulation
functions are invoked with the local bh disabled.

The refresh_ts and reset_ts fields are used to ensure the cache
consistency in case of cuncurrent cache update (dst_cache_set*) and
reset operation (dst_cache_reset).

Consider the following scenario:

CPU1:                                   	CPU2:
  <cache lookup with emtpy cache: it fails>
  <get dst via uncached route lookup>
						<related configuration changes>
                                        	dst_cache_reset()
  dst_cache_set()

The dst entry set passed to dst_cache_set() should not be used
for later dst cache lookup, because it's obtained using old
configuration values.

Since the refresh_ts is updated only on dst_cache lookup, the
cached value in the above scenario will be discarded on the next
lookup.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Suggested-and-acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst_cache.h |  97 ++++++++++++++++++++++++++++
 net/Kconfig             |   4 ++
 net/core/Makefile       |   1 +
 net/core/dst_cache.c    | 168 ++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 270 insertions(+)
 create mode 100644 include/net/dst_cache.h
 create mode 100644 net/core/dst_cache.c

(limited to 'net')

diff --git a/include/net/dst_cache.h b/include/net/dst_cache.h
new file mode 100644
index 000000000000..151accae708b
--- /dev/null
+++ b/include/net/dst_cache.h
@@ -0,0 +1,97 @@
+#ifndef _NET_DST_CACHE_H
+#define _NET_DST_CACHE_H
+
+#include <linux/jiffies.h>
+#include <net/dst.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ip6_fib.h>
+#endif
+
+struct dst_cache {
+	struct dst_cache_pcpu __percpu *cache;
+	unsigned long reset_ts;
+};
+
+/**
+ *	dst_cache_get - perform cache lookup
+ *	@dst_cache: the cache
+ *
+ *	The caller should use dst_cache_get_ip4() if it need to retrieve the
+ *	source address to be used when xmitting to the cached dst.
+ *	local BH must be disabled.
+ */
+struct dst_entry *dst_cache_get(struct dst_cache *dst_cache);
+
+/**
+ *	dst_cache_get_ip4 - perform cache lookup and fetch ipv4 source address
+ *	@dst_cache: the cache
+ *	@saddr: return value for the retrieved source address
+ *
+ *	local BH must be disabled.
+ */
+struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr);
+
+/**
+ *	dst_cache_set_ip4 - store the ipv4 dst into the cache
+ *	@dst_cache: the cache
+ *	@dst: the entry to be cached
+ *	@saddr: the source address to be stored inside the cache
+ *
+ *	local BH must be disabled.
+ */
+void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst,
+		       __be32 saddr);
+
+#if IS_ENABLED(CONFIG_IPV6)
+
+/**
+ *	dst_cache_set_ip6 - store the ipv6 dst into the cache
+ *	@dst_cache: the cache
+ *	@dst: the entry to be cached
+ *	@saddr: the source address to be stored inside the cache
+ *
+ *	local BH must be disabled.
+ */
+void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst,
+		       const struct in6_addr *addr);
+
+/**
+ *	dst_cache_get_ip6 - perform cache lookup and fetch ipv6 source address
+ *	@dst_cache: the cache
+ *	@saddr: return value for the retrieved source address
+ *
+ *	local BH must be disabled.
+ */
+struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache,
+				    struct in6_addr *saddr);
+#endif
+
+/**
+ *	dst_cache_reset - invalidate the cache contents
+ *	@dst_cache: the cache
+ *
+ *	This do not free the cached dst to avoid races and contentions.
+ *	the dst will be freed on later cache lookup.
+ */
+static inline void dst_cache_reset(struct dst_cache *dst_cache)
+{
+	dst_cache->reset_ts = jiffies;
+}
+
+/**
+ *	dst_cache_init - initialize the cache, allocating the required storage
+ *	@dst_cache: the cache
+ *	@gfp: allocation flags
+ */
+int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp);
+
+/**
+ *	dst_cache_destroy - empty the cache and free the allocated storage
+ *	@dst_cache: the cache
+ *
+ *	No synchronization is enforced: it must be called only when the cache
+ *	is unsed.
+ */
+void dst_cache_destroy(struct dst_cache *dst_cache);
+
+#endif
diff --git a/net/Kconfig b/net/Kconfig
index 174354618f8a..b80efecfc1a0 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -392,6 +392,10 @@ config LWTUNNEL
 	  weight tunnel endpoint. Tunnel encapsulation parameters are stored
 	  with light weight tunnel state associated with fib routes.
 
+config DST_CACHE
+	bool "dst cache"
+	default n
+
 endif   # if NET
 
 # Used by archs to tell that they support BPF_JIT
diff --git a/net/core/Makefile b/net/core/Makefile
index 0b835de04de3..7a8fb8aef992 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -24,3 +24,4 @@ obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
 obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
 obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
 obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
+obj-$(CONFIG_DST_CACHE) += dst_cache.o
diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c
new file mode 100644
index 000000000000..3938f3f38d69
--- /dev/null
+++ b/net/core/dst_cache.c
@@ -0,0 +1,168 @@
+/*
+ * net/core/dst_cache.c - dst entry cache
+ *
+ * Copyright (c) 2016 Paolo Abeni <pabeni@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <net/dst_cache.h>
+#include <net/route.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ip6_fib.h>
+#endif
+#include <uapi/linux/in.h>
+
+struct dst_cache_pcpu {
+	unsigned long refresh_ts;
+	struct dst_entry *dst;
+	u32 cookie;
+	union {
+		struct in_addr in_saddr;
+		struct in6_addr in6_saddr;
+	};
+};
+
+void dst_cache_per_cpu_dst_set(struct dst_cache_pcpu *dst_cache,
+			       struct dst_entry *dst, u32 cookie)
+{
+	dst_release(dst_cache->dst);
+	if (dst)
+		dst_hold(dst);
+
+	dst_cache->cookie = cookie;
+	dst_cache->dst = dst;
+}
+
+struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache,
+					struct dst_cache_pcpu *idst)
+{
+	struct dst_entry *dst;
+
+	dst = idst->dst;
+	if (!dst)
+		goto fail;
+
+	/* the cache already hold a dst reference; it can't go away */
+	dst_hold(dst);
+
+	if (unlikely(!time_after(idst->refresh_ts, dst_cache->reset_ts) ||
+		     (dst->obsolete && !dst->ops->check(dst, idst->cookie)))) {
+		dst_cache_per_cpu_dst_set(idst, NULL, 0);
+		dst_release(dst);
+		goto fail;
+	}
+	return dst;
+
+fail:
+	idst->refresh_ts = jiffies;
+	return NULL;
+}
+
+struct dst_entry *dst_cache_get(struct dst_cache *dst_cache)
+{
+	if (!dst_cache->cache)
+		return NULL;
+
+	return dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache));
+}
+EXPORT_SYMBOL_GPL(dst_cache_get);
+
+struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr)
+{
+	struct dst_cache_pcpu *idst;
+	struct dst_entry *dst;
+
+	if (!dst_cache->cache)
+		return NULL;
+
+	idst = this_cpu_ptr(dst_cache->cache);
+	dst = dst_cache_per_cpu_get(dst_cache, idst);
+	if (!dst)
+		return NULL;
+
+	*saddr = idst->in_saddr.s_addr;
+	return container_of(dst, struct rtable, dst);
+}
+EXPORT_SYMBOL_GPL(dst_cache_get_ip4);
+
+void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst,
+		       __be32 saddr)
+{
+	struct dst_cache_pcpu *idst;
+
+	if (!dst_cache->cache)
+		return;
+
+	idst = this_cpu_ptr(dst_cache->cache);
+	dst_cache_per_cpu_dst_set(idst, dst, 0);
+	idst->in_saddr.s_addr = saddr;
+}
+EXPORT_SYMBOL_GPL(dst_cache_set_ip4);
+
+#if IS_ENABLED(CONFIG_IPV6)
+void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst,
+		       const struct in6_addr *addr)
+{
+	struct dst_cache_pcpu *idst;
+
+	if (!dst_cache->cache)
+		return;
+
+	idst = this_cpu_ptr(dst_cache->cache);
+	dst_cache_per_cpu_dst_set(this_cpu_ptr(dst_cache->cache), dst,
+				  rt6_get_cookie((struct rt6_info *)dst));
+	idst->in6_saddr = *addr;
+}
+EXPORT_SYMBOL_GPL(dst_cache_set_ip6);
+
+struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache,
+				    struct in6_addr *saddr)
+{
+	struct dst_cache_pcpu *idst;
+	struct dst_entry *dst;
+
+	if (!dst_cache->cache)
+		return NULL;
+
+	idst = this_cpu_ptr(dst_cache->cache);
+	dst = dst_cache_per_cpu_get(dst_cache, idst);
+	if (!dst)
+		return NULL;
+
+	*saddr = idst->in6_saddr;
+	return dst;
+}
+EXPORT_SYMBOL_GPL(dst_cache_get_ip6);
+#endif
+
+int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp)
+{
+	dst_cache->cache = alloc_percpu_gfp(struct dst_cache_pcpu,
+					    gfp | __GFP_ZERO);
+	if (!dst_cache->cache)
+		return -ENOMEM;
+
+	dst_cache_reset(dst_cache);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dst_cache_init);
+
+void dst_cache_destroy(struct dst_cache *dst_cache)
+{
+	int i;
+
+	if (!dst_cache->cache)
+		return;
+
+	for_each_possible_cpu(i)
+		dst_release(per_cpu_ptr(dst_cache->cache, i)->dst);
+
+	free_percpu(dst_cache->cache);
+}
+EXPORT_SYMBOL_GPL(dst_cache_destroy);
-- 
cgit v1.2.3-59-g8ed1b


From 607f725f6f7d5ec3759fbc16224afb60e2152a5b Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 12 Feb 2016 15:43:54 +0100
Subject: net: replace dst_cache ip6_tunnel implementation with the generic one

This also fix a potential race into the existing tunnel code, which
could lead to the wrong dst to be permanenty cached:

CPU1:					CPU2:
  <xmit on ip6_tunnel>
  <cache lookup fails>
  dst = ip6_route_output(...)
					<tunnel params are changed via nl>
					dst_cache_reset() // no effect,
							// the cache is empty
  dst_cache_set() // the wrong dst
	// is permanenty stored
	// into the cache

With the new dst implementation the above race is not possible
since the first cache lookup after dst_cache_reset will fail due
to the timestamp check

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Suggested-and-acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_tunnel.h |  14 +------
 net/ipv6/Kconfig         |   1 +
 net/ipv6/ip6_gre.c       |  12 +++---
 net/ipv6/ip6_tunnel.c    | 103 +++--------------------------------------------
 net/ipv6/ip6_vti.c       |   2 +-
 5 files changed, 16 insertions(+), 116 deletions(-)

(limited to 'net')

diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h
index 0d0ce0b2d870..499a707765ea 100644
--- a/include/net/ip6_tunnel.h
+++ b/include/net/ip6_tunnel.h
@@ -6,6 +6,7 @@
 #include <linux/if_tunnel.h>
 #include <linux/ip6_tunnel.h>
 #include <net/ip_tunnels.h>
+#include <net/dst_cache.h>
 
 #define IP6TUNNEL_ERR_TIMEO (30*HZ)
 
@@ -33,12 +34,6 @@ struct __ip6_tnl_parm {
 	__be32			o_key;
 };
 
-struct ip6_tnl_dst {
-	seqlock_t lock;
-	struct dst_entry __rcu *dst;
-	u32 cookie;
-};
-
 /* IPv6 tunnel */
 struct ip6_tnl {
 	struct ip6_tnl __rcu *next;	/* next tunnel in list */
@@ -46,7 +41,7 @@ struct ip6_tnl {
 	struct net *net;	/* netns for packet i/o */
 	struct __ip6_tnl_parm parms;	/* tunnel configuration parameters */
 	struct flowi fl;	/* flowi template for xmit */
-	struct ip6_tnl_dst __percpu *dst_cache;	/* cached dst */
+	struct dst_cache dst_cache;	/* cached dst */
 
 	int err_count;
 	unsigned long err_time;
@@ -66,11 +61,6 @@ struct ipv6_tlv_tnl_enc_lim {
 	__u8 encap_limit;	/* tunnel encapsulation limit   */
 } __packed;
 
-struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t);
-int ip6_tnl_dst_init(struct ip6_tnl *t);
-void ip6_tnl_dst_destroy(struct ip6_tnl *t);
-void ip6_tnl_dst_reset(struct ip6_tnl *t);
-void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst);
 int ip6_tnl_rcv_ctl(struct ip6_tnl *t, const struct in6_addr *laddr,
 		const struct in6_addr *raddr);
 int ip6_tnl_xmit_ctl(struct ip6_tnl *t, const struct in6_addr *laddr,
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 40c897515ddc..11e875ffd7ac 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -207,6 +207,7 @@ config IPV6_NDISC_NODETYPE
 config IPV6_TUNNEL
 	tristate "IPv6: IP-in-IPv6 tunnel (RFC2473)"
 	select INET6_TUNNEL
+	select DST_CACHE
 	---help---
 	  Support for IPv6-in-IPv6 and IPv4-in-IPv6 tunnels described in
 	  RFC 2473.
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index f37f18b6b40c..a94e50602813 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -360,7 +360,7 @@ static void ip6gre_tunnel_uninit(struct net_device *dev)
 	struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id);
 
 	ip6gre_tunnel_unlink(ign, t);
-	ip6_tnl_dst_reset(t);
+	dst_cache_reset(&t->dst_cache);
 	dev_put(dev);
 }
 
@@ -633,7 +633,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb,
 	}
 
 	if (!fl6->flowi6_mark)
-		dst = ip6_tnl_dst_get(tunnel);
+		dst = dst_cache_get(&tunnel->dst_cache);
 
 	if (!dst) {
 		dst = ip6_route_output(net, NULL, fl6);
@@ -702,7 +702,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb,
 	}
 
 	if (!fl6->flowi6_mark && ndst)
-		ip6_tnl_dst_set(tunnel, ndst);
+		dst_cache_set_ip6(&tunnel->dst_cache, ndst, &fl6->saddr);
 	skb_dst_set(skb, dst);
 
 	proto = NEXTHDR_GRE;
@@ -1009,7 +1009,7 @@ static int ip6gre_tnl_change(struct ip6_tnl *t,
 	t->parms.o_key = p->o_key;
 	t->parms.i_flags = p->i_flags;
 	t->parms.o_flags = p->o_flags;
-	ip6_tnl_dst_reset(t);
+	dst_cache_reset(&t->dst_cache);
 	ip6gre_tnl_link_config(t, set_mtu);
 	return 0;
 }
@@ -1219,7 +1219,7 @@ static void ip6gre_dev_free(struct net_device *dev)
 {
 	struct ip6_tnl *t = netdev_priv(dev);
 
-	ip6_tnl_dst_destroy(t);
+	dst_cache_destroy(&t->dst_cache);
 	free_percpu(dev->tstats);
 	free_netdev(dev);
 }
@@ -1257,7 +1257,7 @@ static int ip6gre_tunnel_init_common(struct net_device *dev)
 	if (!dev->tstats)
 		return -ENOMEM;
 
-	ret = ip6_tnl_dst_init(tunnel);
+	ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
 	if (ret) {
 		free_percpu(dev->tstats);
 		dev->tstats = NULL;
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 137fca42aaa6..3f3aabd2f07b 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -122,97 +122,6 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev)
 	return &dev->stats;
 }
 
-/*
- * Locking : hash tables are protected by RCU and RTNL
- */
-
-static void ip6_tnl_per_cpu_dst_set(struct ip6_tnl_dst *idst,
-				    struct dst_entry *dst)
-{
-	write_seqlock_bh(&idst->lock);
-	dst_release(rcu_dereference_protected(
-			    idst->dst,
-			    lockdep_is_held(&idst->lock.lock)));
-	if (dst) {
-		dst_hold(dst);
-		idst->cookie = rt6_get_cookie((struct rt6_info *)dst);
-	} else {
-		idst->cookie = 0;
-	}
-	rcu_assign_pointer(idst->dst, dst);
-	write_sequnlock_bh(&idst->lock);
-}
-
-struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t)
-{
-	struct ip6_tnl_dst *idst;
-	struct dst_entry *dst;
-	unsigned int seq;
-	u32 cookie;
-
-	idst = raw_cpu_ptr(t->dst_cache);
-
-	rcu_read_lock();
-	do {
-		seq = read_seqbegin(&idst->lock);
-		dst = rcu_dereference(idst->dst);
-		cookie = idst->cookie;
-	} while (read_seqretry(&idst->lock, seq));
-
-	if (dst && !atomic_inc_not_zero(&dst->__refcnt))
-		dst = NULL;
-	rcu_read_unlock();
-
-	if (dst && dst->obsolete && !dst->ops->check(dst, cookie)) {
-		ip6_tnl_per_cpu_dst_set(idst, NULL);
-		dst_release(dst);
-		dst = NULL;
-	}
-	return dst;
-}
-EXPORT_SYMBOL_GPL(ip6_tnl_dst_get);
-
-void ip6_tnl_dst_reset(struct ip6_tnl *t)
-{
-	int i;
-
-	for_each_possible_cpu(i)
-		ip6_tnl_per_cpu_dst_set(per_cpu_ptr(t->dst_cache, i), NULL);
-}
-EXPORT_SYMBOL_GPL(ip6_tnl_dst_reset);
-
-void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst)
-{
-	ip6_tnl_per_cpu_dst_set(raw_cpu_ptr(t->dst_cache), dst);
-
-}
-EXPORT_SYMBOL_GPL(ip6_tnl_dst_set);
-
-void ip6_tnl_dst_destroy(struct ip6_tnl *t)
-{
-	if (!t->dst_cache)
-		return;
-
-	ip6_tnl_dst_reset(t);
-	free_percpu(t->dst_cache);
-}
-EXPORT_SYMBOL_GPL(ip6_tnl_dst_destroy);
-
-int ip6_tnl_dst_init(struct ip6_tnl *t)
-{
-	int i;
-
-	t->dst_cache = alloc_percpu(struct ip6_tnl_dst);
-	if (!t->dst_cache)
-		return -ENOMEM;
-
-	for_each_possible_cpu(i)
-		seqlock_init(&per_cpu_ptr(t->dst_cache, i)->lock);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(ip6_tnl_dst_init);
-
 /**
  * ip6_tnl_lookup - fetch tunnel matching the end-point addresses
  *   @remote: the address of the tunnel exit-point
@@ -329,7 +238,7 @@ static void ip6_dev_free(struct net_device *dev)
 {
 	struct ip6_tnl *t = netdev_priv(dev);
 
-	ip6_tnl_dst_destroy(t);
+	dst_cache_destroy(&t->dst_cache);
 	free_percpu(dev->tstats);
 	free_netdev(dev);
 }
@@ -462,7 +371,7 @@ ip6_tnl_dev_uninit(struct net_device *dev)
 		RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL);
 	else
 		ip6_tnl_unlink(ip6n, t);
-	ip6_tnl_dst_reset(t);
+	dst_cache_reset(&t->dst_cache);
 	dev_put(dev);
 }
 
@@ -1069,7 +978,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
 		memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr));
 		neigh_release(neigh);
 	} else if (!fl6->flowi6_mark)
-		dst = ip6_tnl_dst_get(t);
+		dst = dst_cache_get(&t->dst_cache);
 
 	if (!ip6_tnl_xmit_ctl(t, &fl6->saddr, &fl6->daddr))
 		goto tx_err_link_failure;
@@ -1133,7 +1042,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
 	}
 
 	if (!fl6->flowi6_mark && ndst)
-		ip6_tnl_dst_set(t, ndst);
+		dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr);
 	skb_dst_set(skb, dst);
 
 	skb->transport_header = skb->network_header;
@@ -1366,7 +1275,7 @@ ip6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p)
 	t->parms.flowinfo = p->flowinfo;
 	t->parms.link = p->link;
 	t->parms.proto = p->proto;
-	ip6_tnl_dst_reset(t);
+	dst_cache_reset(&t->dst_cache);
 	ip6_tnl_link_config(t);
 	return 0;
 }
@@ -1637,7 +1546,7 @@ ip6_tnl_dev_init_gen(struct net_device *dev)
 	if (!dev->tstats)
 		return -ENOMEM;
 
-	ret = ip6_tnl_dst_init(t);
+	ret = dst_cache_init(&t->dst_cache, GFP_KERNEL);
 	if (ret) {
 		free_percpu(dev->tstats);
 		dev->tstats = NULL;
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index 0a8610b33d79..d90a11f14040 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -640,7 +640,7 @@ vti6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p)
 	t->parms.i_key = p->i_key;
 	t->parms.o_key = p->o_key;
 	t->parms.proto = p->proto;
-	ip6_tnl_dst_reset(t);
+	dst_cache_reset(&t->dst_cache);
 	vti6_link_config(t);
 	return 0;
 }
-- 
cgit v1.2.3-59-g8ed1b


From e09acddf873bf775b208b452a4c3a3fd26fa9427 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 12 Feb 2016 15:43:55 +0100
Subject: ip_tunnel: replace dst_cache with generic implementation

The current ip_tunnel cache implementation is prone to a race
that will cause the wrong dst to be cached on cuncurrent dst cache
miss and ip tunnel update via netlink.

Replacing with the generic implementation fix the issue.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Suggested-and-acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_tunnels.h |  9 ++----
 net/ipv4/Kconfig         |  1 +
 net/ipv4/ip_tunnel.c     | 78 ++++++++----------------------------------------
 net/ipv6/sit.c           | 17 ++++++-----
 4 files changed, 25 insertions(+), 80 deletions(-)

(limited to 'net')

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index bc439f32baa9..fd36936d85a6 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -13,6 +13,7 @@
 #include <net/netns/generic.h>
 #include <net/rtnetlink.h>
 #include <net/lwtunnel.h>
+#include <net/dst_cache.h>
 
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/ipv6.h>
@@ -85,11 +86,6 @@ struct ip_tunnel_prl_entry {
 	struct rcu_head			rcu_head;
 };
 
-struct ip_tunnel_dst {
-	struct dst_entry __rcu 		*dst;
-	__be32				 saddr;
-};
-
 struct metadata_dst;
 
 struct ip_tunnel {
@@ -108,7 +104,7 @@ struct ip_tunnel {
 	int		tun_hlen;	/* Precalculated header length */
 	int		mlink;
 
-	struct ip_tunnel_dst __percpu *dst_cache;
+	struct dst_cache dst_cache;
 
 	struct ip_tunnel_parm parms;
 
@@ -247,7 +243,6 @@ int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
 		      struct ip_tunnel_parm *p);
 void ip_tunnel_setup(struct net_device *dev, int net_id);
-void ip_tunnel_dst_reset_all(struct ip_tunnel *t);
 int ip_tunnel_encap_setup(struct ip_tunnel *t,
 			  struct ip_tunnel_encap *ipencap);
 
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 775824720b6b..395d82754626 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -186,6 +186,7 @@ config NET_IPGRE_DEMUX
 
 config NET_IP_TUNNEL
 	tristate
+	select DST_CACHE
 	default n
 
 config NET_IPGRE
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index c7bd72e9b544..4569da7dfa88 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -68,61 +68,6 @@ static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
 			 IP_TNL_HASH_BITS);
 }
 
-static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
-			     struct dst_entry *dst, __be32 saddr)
-{
-	struct dst_entry *old_dst;
-
-	dst_clone(dst);
-	old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
-	dst_release(old_dst);
-	idst->saddr = saddr;
-}
-
-static noinline void tunnel_dst_set(struct ip_tunnel *t,
-			   struct dst_entry *dst, __be32 saddr)
-{
-	__tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr);
-}
-
-static void tunnel_dst_reset(struct ip_tunnel *t)
-{
-	tunnel_dst_set(t, NULL, 0);
-}
-
-void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
-{
-	int i;
-
-	for_each_possible_cpu(i)
-		__tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0);
-}
-EXPORT_SYMBOL(ip_tunnel_dst_reset_all);
-
-static struct rtable *tunnel_rtable_get(struct ip_tunnel *t,
-					u32 cookie, __be32 *saddr)
-{
-	struct ip_tunnel_dst *idst;
-	struct dst_entry *dst;
-
-	rcu_read_lock();
-	idst = raw_cpu_ptr(t->dst_cache);
-	dst = rcu_dereference(idst->dst);
-	if (dst && !atomic_inc_not_zero(&dst->__refcnt))
-		dst = NULL;
-	if (dst) {
-		if (!dst->obsolete || dst->ops->check(dst, cookie)) {
-			*saddr = idst->saddr;
-		} else {
-			tunnel_dst_reset(t);
-			dst_release(dst);
-			dst = NULL;
-		}
-	}
-	rcu_read_unlock();
-	return (struct rtable *)dst;
-}
-
 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
 				__be16 flags, __be32 key)
 {
@@ -381,7 +326,8 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
 
 		if (!IS_ERR(rt)) {
 			tdev = rt->dst.dev;
-			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
+			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
+					  fl4.saddr);
 			ip_rt_put(rt);
 		}
 		if (dev->type != ARPHRD_ETHER)
@@ -729,7 +675,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
 		goto tx_error;
 
-	rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;
+	rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
+			 NULL;
 
 	if (!rt) {
 		rt = ip_route_output_key(tunnel->net, &fl4);
@@ -739,7 +686,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 			goto tx_error;
 		}
 		if (connected)
-			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
+			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
+					  fl4.saddr);
 	}
 
 	if (rt->dst.dev == dev) {
@@ -836,7 +784,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn,
 		if (set_mtu)
 			dev->mtu = mtu;
 	}
-	ip_tunnel_dst_reset_all(t);
+	dst_cache_reset(&t->dst_cache);
 	netdev_state_change(dev);
 }
 
@@ -961,7 +909,7 @@ static void ip_tunnel_dev_free(struct net_device *dev)
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 
 	gro_cells_destroy(&tunnel->gro_cells);
-	free_percpu(tunnel->dst_cache);
+	dst_cache_destroy(&tunnel->dst_cache);
 	free_percpu(dev->tstats);
 	free_netdev(dev);
 }
@@ -1155,15 +1103,15 @@ int ip_tunnel_init(struct net_device *dev)
 	if (!dev->tstats)
 		return -ENOMEM;
 
-	tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
-	if (!tunnel->dst_cache) {
+	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
+	if (err) {
 		free_percpu(dev->tstats);
-		return -ENOMEM;
+		return err;
 	}
 
 	err = gro_cells_init(&tunnel->gro_cells, dev);
 	if (err) {
-		free_percpu(tunnel->dst_cache);
+		dst_cache_destroy(&tunnel->dst_cache);
 		free_percpu(dev->tstats);
 		return err;
 	}
@@ -1193,7 +1141,7 @@ void ip_tunnel_uninit(struct net_device *dev)
 	if (itn->fb_tunnel_dev != dev)
 		ip_tunnel_del(itn, netdev_priv(dev));
 
-	ip_tunnel_dst_reset_all(tunnel);
+	dst_cache_reset(&tunnel->dst_cache);
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
 
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 9a6b407f5840..0625ac6356b5 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -475,7 +475,7 @@ static void ipip6_tunnel_uninit(struct net_device *dev)
 		ipip6_tunnel_unlink(sitn, tunnel);
 		ipip6_tunnel_del_prl(tunnel, NULL);
 	}
-	ip_tunnel_dst_reset_all(tunnel);
+	dst_cache_reset(&tunnel->dst_cache);
 	dev_put(dev);
 }
 
@@ -1093,7 +1093,7 @@ static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p)
 		t->parms.link = p->link;
 		ipip6_tunnel_bind_dev(t->dev);
 	}
-	ip_tunnel_dst_reset_all(t);
+	dst_cache_reset(&t->dst_cache);
 	netdev_state_change(t->dev);
 }
 
@@ -1124,7 +1124,7 @@ static int ipip6_tunnel_update_6rd(struct ip_tunnel *t,
 	t->ip6rd.relay_prefix = relay_prefix;
 	t->ip6rd.prefixlen = ip6rd->prefixlen;
 	t->ip6rd.relay_prefixlen = ip6rd->relay_prefixlen;
-	ip_tunnel_dst_reset_all(t);
+	dst_cache_reset(&t->dst_cache);
 	netdev_state_change(t->dev);
 	return 0;
 }
@@ -1278,7 +1278,7 @@ ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 			err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL);
 			break;
 		}
-		ip_tunnel_dst_reset_all(t);
+		dst_cache_reset(&t->dst_cache);
 		netdev_state_change(dev);
 		break;
 
@@ -1339,7 +1339,7 @@ static void ipip6_dev_free(struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 
-	free_percpu(tunnel->dst_cache);
+	dst_cache_destroy(&tunnel->dst_cache);
 	free_percpu(dev->tstats);
 	free_netdev(dev);
 }
@@ -1372,6 +1372,7 @@ static void ipip6_tunnel_setup(struct net_device *dev)
 static int ipip6_tunnel_init(struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
+	int err;
 
 	tunnel->dev = dev;
 	tunnel->net = dev_net(dev);
@@ -1382,10 +1383,10 @@ static int ipip6_tunnel_init(struct net_device *dev)
 	if (!dev->tstats)
 		return -ENOMEM;
 
-	tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
-	if (!tunnel->dst_cache) {
+	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
+	if (err) {
 		free_percpu(dev->tstats);
-		return -ENOMEM;
+		return err;
 	}
 
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From d71785ffc7e7cae3fbdc4ea8a9d05b7a1c59f7b8 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 12 Feb 2016 15:43:57 +0100
Subject: net: add dst_cache to ovs vxlan lwtunnel

In case of UDP traffic with datagram length
below MTU this give about 2% performance increase
when tunneling over ipv4 and about 60% when tunneling
over ipv6

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Suggested-and-acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c            | 15 ++++++++-------
 include/net/dst_metadata.h     |  1 +
 include/net/ip_tunnels.h       |  3 +++
 net/core/dst.c                 | 10 +++++++++-
 net/openvswitch/Kconfig        |  1 +
 net/openvswitch/flow_netlink.c |  6 ++++++
 6 files changed, 28 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index ad673037bd73..ee1206d9f8df 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1775,7 +1775,7 @@ static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan,
 	/* when the ip_tunnel_info is availble, the tos used for lookup is
 	 * packet independent, so we can use the cache
 	 */
-	if (dst_cache && !skb->mark && (!tos || info)) {
+	if (!skb->mark && (!tos || info)) {
 		use_cache = true;
 		rt = dst_cache_get_ip4(dst_cache, saddr);
 		if (rt)
@@ -1806,13 +1806,11 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
 					  struct in6_addr *saddr,
 					  struct dst_cache *dst_cache)
 {
-	bool use_cache = false;
 	struct dst_entry *ndst;
 	struct flowi6 fl6;
 	int err;
 
-	if (dst_cache && !skb->mark) {
-		use_cache = true;
+	if (!skb->mark) {
 		ndst = dst_cache_get_ip6(dst_cache, saddr);
 		if (ndst)
 			return ndst;
@@ -1832,7 +1830,7 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
 		return ERR_PTR(err);
 
 	*saddr = fl6.saddr;
-	if (use_cache)
+	if (!skb->mark)
 		dst_cache_set_ip6(dst_cache, ndst, saddr);
 	return ndst;
 }
@@ -1886,6 +1884,7 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
 static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 			   struct vxlan_rdst *rdst, bool did_rsc)
 {
+	struct dst_cache *dst_cache;
 	struct ip_tunnel_info *info;
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct sock *sk;
@@ -1910,6 +1909,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port;
 		vni = rdst->remote_vni;
 		dst = &rdst->remote_ip;
+		dst_cache = &rdst->dst_cache;
 	} else {
 		if (!info) {
 			WARN_ONCE(1, "%s: Missing encapsulation instructions\n",
@@ -1924,6 +1924,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		else
 			remote_ip.sin6.sin6_addr = info->key.u.ipv6.dst;
 		dst = &remote_ip;
+		dst_cache = &info->dst_cache;
 	}
 
 	if (vxlan_addr_any(dst)) {
@@ -1976,7 +1977,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		rt = vxlan_get_route(vxlan, skb,
 				     rdst ? rdst->remote_ifindex : 0, tos,
 				     dst->sin.sin_addr.s_addr, &saddr,
-				     rdst ? &rdst->dst_cache : NULL, info);
+				     dst_cache, info);
 		if (IS_ERR(rt)) {
 			netdev_dbg(dev, "no route to %pI4\n",
 				   &dst->sin.sin_addr.s_addr);
@@ -2029,7 +2030,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		ndst = vxlan6_get_route(vxlan, skb,
 					rdst ? rdst->remote_ifindex : 0,
 					&dst->sin6.sin6_addr, &saddr,
-					rdst ? &rdst->dst_cache : NULL);
+					dst_cache);
 		if (IS_ERR(ndst)) {
 			netdev_dbg(dev, "no route to %pI6\n",
 				   &dst->sin6.sin6_addr);
diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index 30a56ab2ccfb..84b833af6882 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -62,6 +62,7 @@ static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a,
 		      sizeof(a->u.tun_info) + a->u.tun_info.options_len);
 }
 
+void metadata_dst_free(struct metadata_dst *);
 struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags);
 struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags);
 
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index fd36936d85a6..87408ab80856 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -58,6 +58,9 @@ struct ip_tunnel_key {
 
 struct ip_tunnel_info {
 	struct ip_tunnel_key	key;
+#ifdef CONFIG_DST_CACHE
+	struct dst_cache	dst_cache;
+#endif
 	u8			options_len;
 	u8			mode;
 };
diff --git a/net/core/dst.c b/net/core/dst.c
index a1656e3b8d72..b5cbbe07f786 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -265,7 +265,7 @@ again:
 	lwtstate_put(dst->lwtstate);
 
 	if (dst->flags & DST_METADATA)
-		kfree(dst);
+		metadata_dst_free((struct metadata_dst *)dst);
 	else
 		kmem_cache_free(dst->ops->kmem_cachep, dst);
 
@@ -395,6 +395,14 @@ struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags)
 }
 EXPORT_SYMBOL_GPL(metadata_dst_alloc);
 
+void metadata_dst_free(struct metadata_dst *md_dst)
+{
+#ifdef CONFIG_DST_CACHE
+	dst_cache_destroy(&md_dst->u.tun_info.dst_cache);
+#endif
+	kfree(md_dst);
+}
+
 struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags)
 {
 	int cpu;
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index d143aa9f6654..cd5fd9d728a7 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -10,6 +10,7 @@ config OPENVSWITCH
 	select LIBCRC32C
 	select MPLS
 	select NET_MPLS_GSO
+	select DST_CACHE
 	---help---
 	  Open vSwitch is a multilayer Ethernet switch targeted at virtualized
 	  environments.  In addition to supporting a variety of features
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index d1bd4a45ca2d..58b8efc23668 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -1959,6 +1959,12 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
 	if (!tun_dst)
 		return -ENOMEM;
 
+	err = dst_cache_init(&tun_dst->u.tun_info.dst_cache, GFP_KERNEL);
+	if (err) {
+		dst_release((struct dst_entry *)tun_dst);
+		return err;
+	}
+
 	a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL,
 			 sizeof(*ovs_tun), log);
 	if (IS_ERR(a)) {
-- 
cgit v1.2.3-59-g8ed1b


From 3c1cb4d2604c03779a6c9485204e2a80be6c28f0 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 12 Feb 2016 15:43:59 +0100
Subject: net/ipv4: add dst cache support for gre lwtunnels

In case of UDP traffic with datagram length below MTU this
gives about 4% performance increase

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Suggested-and-Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 65748db44285..917c2c1bfadd 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -540,9 +540,16 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
 		goto err_free_skb;
 
 	key = &tun_info->key;
-	rt = gre_get_rt(skb, dev, &fl, key);
-	if (IS_ERR(rt))
-		goto err_free_skb;
+	rt = !skb->mark ? dst_cache_get_ip4(&tun_info->dst_cache, &fl.saddr) :
+			 NULL;
+	if (!rt) {
+		rt = gre_get_rt(skb, dev, &fl, key);
+		if (IS_ERR(rt))
+				goto err_free_skb;
+		if (!skb->mark)
+			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
+					  fl.saddr);
+	}
 
 	tunnel_hlen = ip_gre_calc_hlen(key->tun_flags);
 
-- 
cgit v1.2.3-59-g8ed1b


From cd9b266095f422267bddbec88f9098b48ea548fc Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 11 Feb 2016 22:02:53 -0800
Subject: tcp: add tcpi_min_rtt and tcpi_notsent_bytes to tcp_info

tcpi_min_rtt reports the minimal rtt observed by TCP stack for the flow,
in usec unit. Might be ~0U if not yet known.

tcpi_notsent_bytes reports the amount of bytes in the write queue that
were not yet sent.

This is done in a single patch to not add a temporary 32bit padding hole
in tcp_info.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tcp.h | 3 +++
 net/ipv4/tcp.c           | 6 ++++++
 2 files changed, 9 insertions(+)

(limited to 'net')

diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 65a77b071e22..fe95446e9abf 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -196,6 +196,9 @@ struct tcp_info {
 	__u64	tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */
 	__u32	tcpi_segs_out;	     /* RFC4898 tcpEStatsPerfSegsOut */
 	__u32	tcpi_segs_in;	     /* RFC4898 tcpEStatsPerfSegsIn */
+
+	__u32	tcpi_notsent_bytes;
+	__u32	tcpi_min_rtt;
 };
 
 /* for TCP_MD5SIG socket option */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 014f18e2f7b3..f93150d15199 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2642,6 +2642,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	u32 now = tcp_time_stamp;
 	unsigned int start;
+	int notsent_bytes;
 	u64 rate64;
 	u32 rate;
 
@@ -2722,6 +2723,11 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	} while (u64_stats_fetch_retry_irq(&tp->syncp, start));
 	info->tcpi_segs_out = tp->segs_out;
 	info->tcpi_segs_in = tp->segs_in;
+
+	notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt);
+	info->tcpi_notsent_bytes = max(0, notsent_bytes);
+
+	info->tcpi_min_rtt = tcp_min_rtt(tp);
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
 
-- 
cgit v1.2.3-59-g8ed1b


From fa50d974d104113630d68b7d03233a6686230d0c Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Mon, 15 Feb 2016 12:11:27 +0200
Subject: ipv4: Namespaceify ip_default_ttl sysctl knob

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h                 |  1 +
 include/net/route.h                      |  5 ++---
 net/bridge/netfilter/nft_reject_bridge.c |  8 +++++---
 net/ipv4/ip_output.c                     |  3 ---
 net/ipv4/ip_sockglue.c                   |  5 ++++-
 net/ipv4/netfilter/ipt_SYNPROXY.c        |  3 ++-
 net/ipv4/proc.c                          |  2 +-
 net/ipv4/sysctl_net_ipv4.c               | 20 +++++++++++---------
 8 files changed, 26 insertions(+), 21 deletions(-)

(limited to 'net')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 848fe8056534..bc8f7f94abcb 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -80,6 +80,7 @@ struct netns_ipv4 {
 	int sysctl_tcp_ecn;
 	int sysctl_tcp_ecn_fallback;
 
+	int sysctl_ip_default_ttl;
 	int sysctl_ip_no_pmtu_disc;
 	int sysctl_ip_fwd_use_pmtu;
 	int sysctl_ip_nonlocal_bind;
diff --git a/include/net/route.h b/include/net/route.h
index a3b9ef74a389..9b0a523bb428 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -329,14 +329,13 @@ static inline int inet_iif(const struct sk_buff *skb)
 	return skb->skb_iif;
 }
 
-extern int sysctl_ip_default_ttl;
-
 static inline int ip4_dst_hoplimit(const struct dst_entry *dst)
 {
 	int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
+	struct net *net = dev_net(dst->dev);
 
 	if (hoplimit == 0)
-		hoplimit = sysctl_ip_default_ttl;
+		hoplimit = net->ipv4.sysctl_ip_default_ttl;
 	return hoplimit;
 }
 
diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c
index fdba3d9fbff3..adc8d7221dbb 100644
--- a/net/bridge/netfilter/nft_reject_bridge.c
+++ b/net/bridge/netfilter/nft_reject_bridge.c
@@ -48,6 +48,7 @@ static void nft_reject_br_send_v4_tcp_reset(struct sk_buff *oldskb,
 	struct iphdr *niph;
 	const struct tcphdr *oth;
 	struct tcphdr _oth;
+	struct net *net = sock_net(oldskb->sk);
 
 	if (!nft_bridge_iphdr_validate(oldskb))
 		return;
@@ -63,9 +64,9 @@ static void nft_reject_br_send_v4_tcp_reset(struct sk_buff *oldskb,
 
 	skb_reserve(nskb, LL_MAX_HEADER);
 	niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
-				   sysctl_ip_default_ttl);
+				   net->ipv4.sysctl_ip_default_ttl);
 	nf_reject_ip_tcphdr_put(nskb, oldskb, oth);
-	niph->ttl	= sysctl_ip_default_ttl;
+	niph->ttl	= net->ipv4.sysctl_ip_default_ttl;
 	niph->tot_len	= htons(nskb->len);
 	ip_send_check(niph);
 
@@ -85,6 +86,7 @@ static void nft_reject_br_send_v4_unreach(struct sk_buff *oldskb,
 	void *payload;
 	__wsum csum;
 	u8 proto;
+	struct net *net = sock_net(oldskb->sk);
 
 	if (oldskb->csum_bad || !nft_bridge_iphdr_validate(oldskb))
 		return;
@@ -119,7 +121,7 @@ static void nft_reject_br_send_v4_unreach(struct sk_buff *oldskb,
 
 	skb_reserve(nskb, LL_MAX_HEADER);
 	niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_ICMP,
-				   sysctl_ip_default_ttl);
+				   net->ipv4.sysctl_ip_default_ttl);
 
 	skb_reset_transport_header(nskb);
 	icmph = (struct icmphdr *)skb_put(nskb, sizeof(struct icmphdr));
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 64878efa045c..f734c42acdaf 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -79,9 +79,6 @@
 #include <linux/netlink.h>
 #include <linux/tcp.h>
 
-int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
-EXPORT_SYMBOL(sysctl_ip_default_ttl);
-
 static int
 ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 	    unsigned int mtu,
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 92808f147ef5..3f1befc4e17b 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1341,10 +1341,13 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
 		val = inet->tos;
 		break;
 	case IP_TTL:
+	{
+		struct net *net = sock_net(sk);
 		val = (inet->uc_ttl == -1 ?
-		       sysctl_ip_default_ttl :
+		       net->ipv4.sysctl_ip_default_ttl :
 		       inet->uc_ttl);
 		break;
+	}
 	case IP_HDRINCL:
 		val = inet->hdrincl;
 		break;
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index 5fdc556514ba..7b8fbb352877 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -21,6 +21,7 @@ static struct iphdr *
 synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 {
 	struct iphdr *iph;
+	struct net *net = sock_net(skb->sk);
 
 	skb_reset_network_header(skb);
 	iph = (struct iphdr *)skb_put(skb, sizeof(*iph));
@@ -29,7 +30,7 @@ synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 	iph->tos	= 0;
 	iph->id		= 0;
 	iph->frag_off	= htons(IP_DF);
-	iph->ttl	= sysctl_ip_default_ttl;
+	iph->ttl	= net->ipv4.sysctl_ip_default_ttl;
 	iph->protocol	= IPPROTO_TCP;
 	iph->check	= 0;
 	iph->saddr	= saddr;
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 3abd9d7a3adf..9f665b63a927 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -390,7 +390,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
 
 	seq_printf(seq, "\nIp: %d %d",
 		   IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2,
-		   sysctl_ip_default_ttl);
+		   net->ipv4.sysctl_ip_default_ttl);
 
 	BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);
 	for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index b537338f5c97..a833a9f9e4cd 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -282,15 +282,6 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
-	{
-		.procname	= "ip_default_ttl",
-		.data		= &sysctl_ip_default_ttl,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &ip_ttl_min,
-		.extra2		= &ip_ttl_max,
-	},
 	{
 		.procname	= "tcp_max_orphans",
 		.data		= &sysctl_tcp_max_orphans,
@@ -752,6 +743,15 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "ip_default_ttl",
+		.data		= &init_net.ipv4.sysctl_ip_default_ttl,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &ip_ttl_min,
+		.extra2		= &ip_ttl_max,
+	},
 	{
 		.procname	= "ip_local_port_range",
 		.maxlen		= sizeof(init_net.ipv4.ip_local_ports.range),
@@ -988,6 +988,8 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
 	if (!net->ipv4.sysctl_local_reserved_ports)
 		goto err_ports;
 
+	net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
+
 	return 0;
 
 err_ports:
-- 
cgit v1.2.3-59-g8ed1b


From dcd87999d415d39cf2ae510bfed6b8206d778e1c Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Mon, 15 Feb 2016 12:11:28 +0200
Subject: igmp: net: Move igmp namespace init to correct file

When igmp related sysctl were namespacified their initializatin was
erroneously put into the tcp socket namespace constructor. This
patch moves the relevant code into the igmp namespace constructor to
keep things consistent.

Also sprinkle some #ifdefs to silence warnings

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/igmp.c     | 14 ++++++++++++++
 net/ipv4/tcp_ipv4.c |  6 ------
 2 files changed, 14 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 7c95335bf85e..2aea9f1a2a31 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1224,7 +1224,9 @@ static void igmp_group_dropped(struct ip_mc_list *im)
 static void igmp_group_added(struct ip_mc_list *im)
 {
 	struct in_device *in_dev = im->interface;
+#ifdef CONFIG_IP_MULTICAST
 	struct net *net = dev_net(in_dev->dev);
+#endif
 
 	if (im->loaded == 0) {
 		im->loaded = 1;
@@ -1316,7 +1318,9 @@ static void ip_mc_hash_remove(struct in_device *in_dev,
 void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
 {
 	struct ip_mc_list *im;
+#ifdef CONFIG_IP_MULTICAST
 	struct net *net = dev_net(in_dev->dev);
+#endif
 
 	ASSERT_RTNL();
 
@@ -1643,7 +1647,9 @@ void ip_mc_down(struct in_device *in_dev)
 
 void ip_mc_init_dev(struct in_device *in_dev)
 {
+#ifdef CONFIG_IP_MULTICAST
 	struct net *net = dev_net(in_dev->dev);
+#endif
 	ASSERT_RTNL();
 
 #ifdef CONFIG_IP_MULTICAST
@@ -1662,7 +1668,9 @@ void ip_mc_init_dev(struct in_device *in_dev)
 void ip_mc_up(struct in_device *in_dev)
 {
 	struct ip_mc_list *pmc;
+#ifdef CONFIG_IP_MULTICAST
 	struct net *net = dev_net(in_dev->dev);
+#endif
 
 	ASSERT_RTNL();
 
@@ -2923,6 +2931,12 @@ static int __net_init igmp_net_init(struct net *net)
 		goto out_sock;
 	}
 
+	/* Sysctl initialization */
+	net->ipv4.sysctl_igmp_max_memberships = 20;
+	net->ipv4.sysctl_igmp_max_msf = 10;
+	/* IGMP reports for link-local multicast groups are enabled by default */
+	net->ipv4.sysctl_igmp_llm_reports = 1;
+	net->ipv4.sysctl_igmp_qrv = 2;
 	return 0;
 
 out_sock:
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ba5d0146e3f0..3f872a6bc274 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2399,12 +2399,6 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
 
-	net->ipv4.sysctl_igmp_max_memberships = 20;
-	net->ipv4.sysctl_igmp_max_msf = 10;
-	/* IGMP reports for link-local multicast groups are enabled by default */
-	net->ipv4.sysctl_igmp_llm_reports = 1;
-	net->ipv4.sysctl_igmp_qrv = 2;
-
 	return 0;
 fail:
 	tcp_sk_exit(net);
-- 
cgit v1.2.3-59-g8ed1b


From 287b7f38fd6842e534db1783cead3843f7677b79 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Mon, 15 Feb 2016 12:11:29 +0200
Subject: ipv4: Namespacify ip_dynaddr sysctl knob

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h           |  3 ---
 include/net/netns/ipv4.h   |  2 ++
 net/ipv4/af_inet.c         | 10 ++--------
 net/ipv4/sysctl_net_ipv4.c | 15 ++++++++-------
 4 files changed, 12 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/include/net/ip.h b/include/net/ip.h
index 1a98f1ca1638..e3fb25d76421 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -248,9 +248,6 @@ extern int inet_peer_maxttl;
 /* From ip_input.c */
 extern int sysctl_ip_early_demux;
 
-/* From ip_output.c */
-extern int sysctl_ip_dynaddr;
-
 void ipfrag_init(void);
 
 void ip_static_sysctl_init(void);
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index bc8f7f94abcb..b7e3fb2587da 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -84,6 +84,8 @@ struct netns_ipv4 {
 	int sysctl_ip_no_pmtu_disc;
 	int sysctl_ip_fwd_use_pmtu;
 	int sysctl_ip_nonlocal_bind;
+	/* Shall we try to damage output packets if routing dev changes? */
+	int sysctl_ip_dynaddr;
 
 	int sysctl_fwmark_reflect;
 	int sysctl_tcp_fwmark_accept;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index eade66db214e..209d1ed28954 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1095,12 +1095,6 @@ void inet_unregister_protosw(struct inet_protosw *p)
 }
 EXPORT_SYMBOL(inet_unregister_protosw);
 
-/*
- *      Shall we try to damage output packets if routing dev changes?
- */
-
-int sysctl_ip_dynaddr __read_mostly;
-
 static int inet_sk_reselect_saddr(struct sock *sk)
 {
 	struct inet_sock *inet = inet_sk(sk);
@@ -1131,7 +1125,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
 	if (new_saddr == old_saddr)
 		return 0;
 
-	if (sysctl_ip_dynaddr > 1) {
+	if (sock_net(sk)->ipv4.sysctl_ip_dynaddr > 1) {
 		pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n",
 			__func__, &old_saddr, &new_saddr);
 	}
@@ -1186,7 +1180,7 @@ int inet_sk_rebuild_header(struct sock *sk)
 		 * Other protocols have to map its equivalent state to TCP_SYN_SENT.
 		 * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme
 		 */
-		if (!sysctl_ip_dynaddr ||
+		if (!sock_net(sk)->ipv4.sysctl_ip_dynaddr ||
 		    sk->sk_state != TCP_SYN_SENT ||
 		    (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
 		    (err = inet_sk_reselect_saddr(sk)) != 0)
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index a833a9f9e4cd..04ac5b763385 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -303,13 +303,6 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
-	{
-		.procname	= "ip_dynaddr",
-		.data		= &sysctl_ip_dynaddr,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec
-	},
 	{
 		.procname	= "tcp_fastopen",
 		.data		= &sysctl_tcp_fastopen,
@@ -743,6 +736,13 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "ip_dynaddr",
+		.data		= &init_net.ipv4.sysctl_ip_dynaddr,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{
 		.procname	= "ip_default_ttl",
 		.data		= &init_net.ipv4.sysctl_ip_default_ttl,
@@ -989,6 +989,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
 		goto err_ports;
 
 	net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
+	net->ipv4.sysctl_ip_dynaddr = 0;
 
 	return 0;
 
-- 
cgit v1.2.3-59-g8ed1b


From e21145a9871aa5ae07e01926105bb8e523d64095 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Mon, 15 Feb 2016 12:11:30 +0200
Subject: ipv4: namespacify ip_early_demux sysctl knob

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h           |  3 ---
 include/net/netns/ipv4.h   |  1 +
 net/ipv4/ip_input.c        |  5 +----
 net/ipv4/sysctl_net_ipv4.c | 15 ++++++++-------
 net/ipv6/ip6_input.c       |  2 +-
 5 files changed, 11 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/include/net/ip.h b/include/net/ip.h
index e3fb25d76421..cbb134b2f0e4 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -245,9 +245,6 @@ extern int inet_peer_threshold;
 extern int inet_peer_minttl;
 extern int inet_peer_maxttl;
 
-/* From ip_input.c */
-extern int sysctl_ip_early_demux;
-
 void ipfrag_init(void);
 
 void ip_static_sysctl_init(void);
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index b7e3fb2587da..a69cde3ce460 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -86,6 +86,7 @@ struct netns_ipv4 {
 	int sysctl_ip_nonlocal_bind;
 	/* Shall we try to damage output packets if routing dev changes? */
 	int sysctl_ip_dynaddr;
+	int sysctl_ip_early_demux;
 
 	int sysctl_fwmark_reflect;
 	int sysctl_tcp_fwmark_accept;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 852002f64c68..e3d782746d9d 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -308,15 +308,12 @@ drop:
 	return true;
 }
 
-int sysctl_ip_early_demux __read_mostly = 1;
-EXPORT_SYMBOL(sysctl_ip_early_demux);
-
 static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	const struct iphdr *iph = ip_hdr(skb);
 	struct rtable *rt;
 
-	if (sysctl_ip_early_demux &&
+	if (net->ipv4.sysctl_ip_early_demux &&
 	    !skb_dst(skb) &&
 	    !skb->sk &&
 	    !ip_is_fragment(iph)) {
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 04ac5b763385..1e1fe6086dd9 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -296,13 +296,6 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
-	{
-		.procname	= "ip_early_demux",
-		.data		= &sysctl_ip_early_demux,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec
-	},
 	{
 		.procname	= "tcp_fastopen",
 		.data		= &sysctl_tcp_fastopen,
@@ -743,6 +736,13 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "ip_early_demux",
+		.data		= &init_net.ipv4.sysctl_ip_early_demux,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{
 		.procname	= "ip_default_ttl",
 		.data		= &init_net.ipv4.sysctl_ip_default_ttl,
@@ -990,6 +990,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
 
 	net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
 	net->ipv4.sysctl_ip_dynaddr = 0;
+	net->ipv4.sysctl_ip_early_demux = 1;
 
 	return 0;
 
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 31ac3c56da4b..c05c425c2389 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -49,7 +49,7 @@
 
 int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
-	if (sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) {
+	if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) {
 		const struct inet6_protocol *ipprot;
 
 		ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]);
-- 
cgit v1.2.3-59-g8ed1b


From 0fbf4cb27e061204c8cee8e7eb2870416bdf30fd Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Mon, 15 Feb 2016 12:11:31 +0200
Subject: ipv4: namespacify ip fragment max dist sysctl knob

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_frag.h |  1 +
 net/ipv4/ip_fragment.c  | 25 +++++++++++++------------
 2 files changed, 14 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 12aac0fd6ee7..909972aa3acd 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -13,6 +13,7 @@ struct netns_frags {
 	int			timeout;
 	int			high_thresh;
 	int			low_thresh;
+	int			max_dist;
 };
 
 /**
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 187c6fcc3027..957161413335 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -54,8 +54,6 @@
  * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
  * as well. Or notify me, at least. --ANK
  */
-
-static int sysctl_ipfrag_max_dist __read_mostly = 64;
 static const char ip_frag_cache_name[] = "ip4-frags";
 
 struct ipfrag_skb_cb
@@ -150,7 +148,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
 	qp->daddr = arg->iph->daddr;
 	qp->vif = arg->vif;
 	qp->user = arg->user;
-	qp->peer = sysctl_ipfrag_max_dist ?
+	qp->peer = q->net->max_dist ?
 		inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) :
 		NULL;
 }
@@ -275,7 +273,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph,
 static int ip_frag_too_far(struct ipq *qp)
 {
 	struct inet_peer *peer = qp->peer;
-	unsigned int max = sysctl_ipfrag_max_dist;
+	unsigned int max = qp->q.net->max_dist;
 	unsigned int start, end;
 
 	int rc;
@@ -749,6 +747,14 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
 	},
+	{
+		.procname	= "ipfrag_max_dist",
+		.data		= &init_net.ipv4.frags.max_dist,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero
+	},
 	{ }
 };
 
@@ -762,14 +768,6 @@ static struct ctl_table ip4_frags_ctl_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
 	},
-	{
-		.procname	= "ipfrag_max_dist",
-		.data		= &sysctl_ipfrag_max_dist,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero
-	},
 	{ }
 };
 
@@ -790,6 +788,7 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net)
 		table[1].data = &net->ipv4.frags.low_thresh;
 		table[1].extra2 = &net->ipv4.frags.high_thresh;
 		table[2].data = &net->ipv4.frags.timeout;
+		table[3].data = &net->ipv4.frags.max_dist;
 
 		/* Don't export sysctls to unprivileged users */
 		if (net->user_ns != &init_user_ns)
@@ -865,6 +864,8 @@ static int __net_init ipv4_frags_init_net(struct net *net)
 	 */
 	net->ipv4.frags.timeout = IP_FRAG_TIME;
 
+	net->ipv4.frags.max_dist = 64;
+
 	res = inet_frags_init_net(&net->ipv4.frags);
 	if (res)
 		return res;
-- 
cgit v1.2.3-59-g8ed1b


From 52a773d645e95515016b9d896ba8e323e0aaa57b Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Mon, 15 Feb 2016 12:11:32 +0200
Subject: net: Export ip fragment sysctl to unprivileged users

Now that all the ip fragmentation related sysctls are namespaceified
there is no reason to hide them anymore from "root" users inside
containers.

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_fragment.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 957161413335..efbd47d1a531 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -789,10 +789,6 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net)
 		table[1].extra2 = &net->ipv4.frags.high_thresh;
 		table[2].data = &net->ipv4.frags.timeout;
 		table[3].data = &net->ipv4.frags.max_dist;
-
-		/* Don't export sysctls to unprivileged users */
-		if (net->user_ns != &init_user_ns)
-			table[0].procname = NULL;
 	}
 
 	hdr = register_net_sysctl(net, "net/ipv4", table);
-- 
cgit v1.2.3-59-g8ed1b


From e4c6734eaab90695db0ea8456307790cb0c1ccb5 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Tue, 16 Feb 2016 21:16:15 -0800
Subject: net: rework ndo tc op to consume additional qdisc handle parameter

The ndo_setup_tc() op was added to support drivers offloading tx
qdiscs however only support for mqprio was ever added. So we
only ever added support for passing the number of traffic classes
to the driver.

This patch generalizes the ndo_setup_tc op so that a handle can
be provided to indicate if the offload is for ingress or egress
or potentially even child qdiscs.

CC: Murali Karicheri <m-karicheri2@ti.com>
CC: Shradha Shah <sshah@solarflare.com>
CC: Or Gerlitz <ogerlitz@mellanox.com>
CC: Ariel Elior <ariel.elior@qlogic.com>
CC: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
CC: Bruce Allan <bruce.w.allan@intel.com>
CC: Jesse Brandeburg <jesse.brandeburg@intel.com>
CC: Don Skidmore <donald.c.skidmore@intel.com>
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c         |  5 ++++-
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c  |  7 +++++++
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h  |  1 +
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c |  2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt.c        |  5 ++++-
 drivers/net/ethernet/intel/fm10k/fm10k_netdev.c  | 10 +++++++++-
 drivers/net/ethernet/intel/i40e/i40e.h           |  2 +-
 drivers/net/ethernet/intel/i40e/i40e_fcoe.c      |  2 +-
 drivers/net/ethernet/intel/i40e/i40e_main.c      | 17 ++++++++++++-----
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c    | 11 ++++++++++-
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c   | 12 ++++++++++--
 drivers/net/ethernet/sfc/efx.h                   |  2 +-
 drivers/net/ethernet/sfc/tx.c                    |  5 ++++-
 drivers/net/ethernet/ti/netcp_core.c             |  5 ++++-
 include/linux/netdevice.h                        |  3 ++-
 net/sched/sch_mqprio.c                           |  5 +++--
 16 files changed, 74 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
index 8a9b493566c9..9955cae3cabc 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -1626,12 +1626,15 @@ static void xgbe_poll_controller(struct net_device *netdev)
 }
 #endif /* End CONFIG_NET_POLL_CONTROLLER */
 
-static int xgbe_setup_tc(struct net_device *netdev, u8 tc)
+static int xgbe_setup_tc(struct net_device *netdev, u32 handle, u8 tc)
 {
 	struct xgbe_prv_data *pdata = netdev_priv(netdev);
 	unsigned int offset, queue;
 	u8 i;
 
+	if (handle != TC_H_ROOT)
+		return -EINVAL;
+
 	if (tc && (tc != pdata->hw_feat.tc_cnt))
 		return -EINVAL;
 
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 9e42bcaf9917..b262cba34dfa 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -4272,6 +4272,13 @@ int bnx2x_setup_tc(struct net_device *dev, u8 num_tc)
 	return 0;
 }
 
+int __bnx2x_setup_tc(struct net_device *dev, u32 handle, u8 num_tc)
+{
+	if (handle != TC_H_ROOT)
+		return -EINVAL;
+	return bnx2x_setup_tc(dev, num_tc);
+}
+
 /* called with rtnl_lock */
 int bnx2x_change_mac_addr(struct net_device *dev, void *p)
 {
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
index 37369865ca6d..60a4109dcdeb 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
@@ -486,6 +486,7 @@ netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev);
 
 /* setup_tc callback */
 int bnx2x_setup_tc(struct net_device *dev, u8 num_tc);
+int __bnx2x_setup_tc(struct net_device *dev, u32 handle, u8 num_tc);
 
 int bnx2x_get_vf_config(struct net_device *dev, int vf,
 			struct ifla_vf_info *ivi);
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index c5845252c920..81fc51c4ec2b 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -13061,7 +13061,7 @@ static const struct net_device_ops bnx2x_netdev_ops = {
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= poll_bnx2x,
 #endif
-	.ndo_setup_tc		= bnx2x_setup_tc,
+	.ndo_setup_tc		= __bnx2x_setup_tc,
 #ifdef CONFIG_BNX2X_SRIOV
 	.ndo_set_vf_mac		= bnx2x_set_vf_mac,
 	.ndo_set_vf_vlan	= bnx2x_set_vf_vlan,
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 5dc89e527e7d..ff08faf44ee5 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -5370,10 +5370,13 @@ static int bnxt_change_mtu(struct net_device *dev, int new_mtu)
 	return 0;
 }
 
-static int bnxt_setup_tc(struct net_device *dev, u8 tc)
+static int bnxt_setup_tc(struct net_device *dev, u32 handle, u8 tc)
 {
 	struct bnxt *bp = netdev_priv(dev);
 
+	if (handle != TC_H_ROOT)
+		return -EINVAL;
+
 	if (tc > bp->max_tc) {
 		netdev_err(dev, "too many traffic classes requested: %d Max supported is %d\n",
 			   tc, bp->max_tc);
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
index 662569d5b7c0..12701a492325 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
@@ -1204,6 +1204,14 @@ err_queueing_scheme:
 	return err;
 }
 
+static int __fm10k_setup_tc(struct net_device *dev, u32 handle, u8 tc)
+{
+	if (handle != TC_H_ROOT)
+		return -EINVAL;
+
+	return fm10k_setup_tc(dev, tc);
+}
+
 static int fm10k_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd)
 {
 	switch (cmd) {
@@ -1386,7 +1394,7 @@ static const struct net_device_ops fm10k_netdev_ops = {
 	.ndo_vlan_rx_kill_vid	= fm10k_vlan_rx_kill_vid,
 	.ndo_set_rx_mode	= fm10k_set_rx_mode,
 	.ndo_get_stats64	= fm10k_get_stats64,
-	.ndo_setup_tc		= fm10k_setup_tc,
+	.ndo_setup_tc		= __fm10k_setup_tc,
 	.ndo_set_vf_mac		= fm10k_ndo_set_vf_mac,
 	.ndo_set_vf_vlan	= fm10k_ndo_set_vf_vlan,
 	.ndo_set_vf_rate	= fm10k_ndo_set_vf_bw,
diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h
index 53ed3bdd8363..ef9ca075d5e5 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -788,7 +788,7 @@ struct i40e_mac_filter *i40e_find_mac(struct i40e_vsi *vsi, u8 *macaddr,
 				      bool is_vf, bool is_netdev);
 #ifdef I40E_FCOE
 int i40e_close(struct net_device *netdev);
-int i40e_setup_tc(struct net_device *netdev, u8 tc);
+int __i40e_setup_tc(struct net_device *netdev, u32 handle, u8 tc);
 void i40e_netpoll(struct net_device *netdev);
 int i40e_fcoe_enable(struct net_device *netdev);
 int i40e_fcoe_disable(struct net_device *netdev);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_fcoe.c b/drivers/net/ethernet/intel/i40e/i40e_fcoe.c
index 579a46ca82df..7c66ce416ec7 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_fcoe.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_fcoe.c
@@ -1457,7 +1457,7 @@ static const struct net_device_ops i40e_fcoe_netdev_ops = {
 	.ndo_tx_timeout		= i40e_tx_timeout,
 	.ndo_vlan_rx_add_vid	= i40e_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid	= i40e_vlan_rx_kill_vid,
-	.ndo_setup_tc		= i40e_setup_tc,
+	.ndo_setup_tc		= __i40e_setup_tc,
 
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= i40e_netpoll,
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 320b0491abd9..abcb6c152186 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -5253,11 +5253,7 @@ void i40e_down(struct i40e_vsi *vsi)
  * @netdev: net device to configure
  * @tc: number of traffic classes to enable
  **/
-#ifdef I40E_FCOE
-int i40e_setup_tc(struct net_device *netdev, u8 tc)
-#else
 static int i40e_setup_tc(struct net_device *netdev, u8 tc)
-#endif
 {
 	struct i40e_netdev_priv *np = netdev_priv(netdev);
 	struct i40e_vsi *vsi = np->vsi;
@@ -5310,6 +5306,17 @@ exit:
 	return ret;
 }
 
+#ifdef I40E_FCOE
+int __i40e_setup_tc(struct net_device *netdev, u32 handle, u8 tc)
+#else
+static int __i40e_setup_tc(struct net_device *netdev, u32 handle, u8 tc)
+#endif
+{
+	if (handle != TC_H_ROOT)
+		return -EINVAL;
+	return i40e_setup_tc(netdev, tc);
+}
+
 /**
  * i40e_open - Called when a network interface is made active
  * @netdev: network interface device structure
@@ -8951,7 +8958,7 @@ static const struct net_device_ops i40e_netdev_ops = {
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= i40e_netpoll,
 #endif
-	.ndo_setup_tc		= i40e_setup_tc,
+	.ndo_setup_tc		= __i40e_setup_tc,
 #ifdef I40E_FCOE
 	.ndo_fcoe_enable	= i40e_fcoe_enable,
 	.ndo_fcoe_disable	= i40e_fcoe_disable,
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 0c701b8438b6..1ba714efd78c 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -8200,6 +8200,15 @@ int ixgbe_setup_tc(struct net_device *dev, u8 tc)
 	return 0;
 }
 
+int __ixgbe_setup_tc(struct net_device *dev, u32 handle, u8 tc)
+{
+	/* Only support egress tc setup for now */
+	if (handle != TC_H_ROOT)
+		return -EINVAL;
+
+	return ixgbe_setup_tc(dev, tc);
+}
+
 #ifdef CONFIG_PCI_IOV
 void ixgbe_sriov_reinit(struct ixgbe_adapter *adapter)
 {
@@ -8658,7 +8667,7 @@ static const struct net_device_ops ixgbe_netdev_ops = {
 	.ndo_get_vf_config	= ixgbe_ndo_get_vf_config,
 	.ndo_get_stats64	= ixgbe_get_stats64,
 #ifdef CONFIG_IXGBE_DCB
-	.ndo_setup_tc		= ixgbe_setup_tc,
+	.ndo_setup_tc		= __ixgbe_setup_tc,
 #endif
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= ixgbe_netpoll,
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 0c7e3f69a73b..d5c6c16b9457 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -69,6 +69,14 @@ int mlx4_en_setup_tc(struct net_device *dev, u8 up)
 	return 0;
 }
 
+static int __mlx4_en_setup_tc(struct net_device *dev, u32 handle, u8 up)
+{
+	if (handle != TC_H_ROOT)
+		return -EINVAL;
+
+	return mlx4_en_setup_tc(dev, up);
+}
+
 #ifdef CONFIG_RFS_ACCEL
 
 struct mlx4_en_filter {
@@ -2466,7 +2474,7 @@ static const struct net_device_ops mlx4_netdev_ops = {
 #endif
 	.ndo_set_features	= mlx4_en_set_features,
 	.ndo_fix_features	= mlx4_en_fix_features,
-	.ndo_setup_tc		= mlx4_en_setup_tc,
+	.ndo_setup_tc		= __mlx4_en_setup_tc,
 #ifdef CONFIG_RFS_ACCEL
 	.ndo_rx_flow_steer	= mlx4_en_filter_rfs,
 #endif
@@ -2504,7 +2512,7 @@ static const struct net_device_ops mlx4_netdev_ops_master = {
 #endif
 	.ndo_set_features	= mlx4_en_set_features,
 	.ndo_fix_features	= mlx4_en_fix_features,
-	.ndo_setup_tc		= mlx4_en_setup_tc,
+	.ndo_setup_tc		= __mlx4_en_setup_tc,
 #ifdef CONFIG_RFS_ACCEL
 	.ndo_rx_flow_steer	= mlx4_en_filter_rfs,
 #endif
diff --git a/drivers/net/ethernet/sfc/efx.h b/drivers/net/ethernet/sfc/efx.h
index 10827476bc0b..7815fa09b15d 100644
--- a/drivers/net/ethernet/sfc/efx.h
+++ b/drivers/net/ethernet/sfc/efx.h
@@ -32,7 +32,7 @@ netdev_tx_t efx_hard_start_xmit(struct sk_buff *skb,
 				struct net_device *net_dev);
 netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb);
 void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index);
-int efx_setup_tc(struct net_device *net_dev, u8 num_tc);
+int efx_setup_tc(struct net_device *net_dev, u32 handle, u8 num_tc);
 unsigned int efx_tx_max_skb_descs(struct efx_nic *efx);
 extern unsigned int efx_piobuf_size;
 extern bool efx_separate_tx_channels;
diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c
index f7a0ec1bca97..8f1d53e2aca7 100644
--- a/drivers/net/ethernet/sfc/tx.c
+++ b/drivers/net/ethernet/sfc/tx.c
@@ -562,7 +562,7 @@ void efx_init_tx_queue_core_txq(struct efx_tx_queue *tx_queue)
 				     efx->n_tx_channels : 0));
 }
 
-int efx_setup_tc(struct net_device *net_dev, u8 num_tc)
+int efx_setup_tc(struct net_device *net_dev, u32 handle, u8 num_tc)
 {
 	struct efx_nic *efx = netdev_priv(net_dev);
 	struct efx_channel *channel;
@@ -570,6 +570,9 @@ int efx_setup_tc(struct net_device *net_dev, u8 num_tc)
 	unsigned tc;
 	int rc;
 
+	if (handle != TC_H_ROOT)
+		return -EINVAL;
+
 	if (efx_nic_rev(efx) < EFX_REV_FALCON_B0 || num_tc > EFX_MAX_TX_TC)
 		return -EINVAL;
 
diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c
index c61d66d38634..40cde814608b 100644
--- a/drivers/net/ethernet/ti/netcp_core.c
+++ b/drivers/net/ethernet/ti/netcp_core.c
@@ -1835,13 +1835,16 @@ static u16 netcp_select_queue(struct net_device *dev, struct sk_buff *skb,
 	return 0;
 }
 
-static int netcp_setup_tc(struct net_device *dev, u8 num_tc)
+static int netcp_setup_tc(struct net_device *dev, u32 handle, u8 num_tc)
 {
 	int i;
 
 	/* setup tc must be called under rtnl lock */
 	ASSERT_RTNL();
 
+	if (handle != TC_H_ROOT)
+		return -EINVAL;
+
 	/* Sanity-check the number of traffic classes requested */
 	if ((dev->real_num_tx_queues <= 1) ||
 	    (dev->real_num_tx_queues < num_tc))
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0499569c256d..48928b6f9cb6 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -51,6 +51,7 @@
 #include <linux/neighbour.h>
 #include <uapi/linux/netdevice.h>
 #include <uapi/linux/if_bonding.h>
+#include <uapi/linux/pkt_cls.h>
 
 struct netpoll_info;
 struct device;
@@ -1150,7 +1151,7 @@ struct net_device_ops {
 	int			(*ndo_set_vf_rss_query_en)(
 						   struct net_device *dev,
 						   int vf, bool setting);
-	int			(*ndo_setup_tc)(struct net_device *dev, u8 tc);
+	int			(*ndo_setup_tc)(struct net_device *dev, u32 handle, u8 tc);
 #if IS_ENABLED(CONFIG_FCOE)
 	int			(*ndo_fcoe_enable)(struct net_device *dev);
 	int			(*ndo_fcoe_disable)(struct net_device *dev);
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index ad70ecf57ce7..f5a0e8a4dbd7 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -39,7 +39,7 @@ static void mqprio_destroy(struct Qdisc *sch)
 	}
 
 	if (priv->hw_owned && dev->netdev_ops->ndo_setup_tc)
-		dev->netdev_ops->ndo_setup_tc(dev, 0);
+		dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0);
 	else
 		netdev_set_num_tc(dev, 0);
 }
@@ -141,7 +141,8 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
 	 */
 	if (qopt->hw) {
 		priv->hw_owned = 1;
-		err = dev->netdev_ops->ndo_setup_tc(dev, qopt->num_tc);
+		err = dev->netdev_ops->ndo_setup_tc(dev, sch->handle,
+						    qopt->num_tc);
 		if (err)
 			goto err;
 	} else {
-- 
cgit v1.2.3-59-g8ed1b


From 16e5cc647173a97e33b3e3ba81f73eb455561794 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Tue, 16 Feb 2016 21:16:43 -0800
Subject: net: rework setup_tc ndo op to consume general tc operand

This patch updates setup_tc so we can pass additional parameters into
the ndo op in a generic way. To do this we provide structured union
and type flag.

This lets each classifier and qdisc provide its own set of attributes
without having to add new ndo ops or grow the signature of the
callback.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c        |  9 ++++++---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c |  7 ++++---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h |  3 ++-
 drivers/net/ethernet/broadcom/bnxt/bnxt.c       |  8 ++++++--
 drivers/net/ethernet/intel/fm10k/fm10k_netdev.c |  7 ++++---
 drivers/net/ethernet/intel/i40e/i40e.h          |  3 ++-
 drivers/net/ethernet/intel/i40e/i40e_main.c     | 10 ++++++----
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c   |  7 ++++---
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c  |  7 ++++---
 drivers/net/ethernet/sfc/efx.h                  |  3 ++-
 drivers/net/ethernet/sfc/tx.c                   |  9 ++++++---
 drivers/net/ethernet/ti/netcp_core.c            | 13 +++++++------
 include/linux/netdevice.h                       | 20 +++++++++++++++++++-
 net/sched/sch_mqprio.c                          |  9 ++++++---
 14 files changed, 78 insertions(+), 37 deletions(-)

(limited to 'net')

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
index 9955cae3cabc..cfd3f7efda1c 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -1626,15 +1626,18 @@ static void xgbe_poll_controller(struct net_device *netdev)
 }
 #endif /* End CONFIG_NET_POLL_CONTROLLER */
 
-static int xgbe_setup_tc(struct net_device *netdev, u32 handle, u8 tc)
+static int xgbe_setup_tc(struct net_device *netdev, u32 handle, __be16 proto,
+			 struct tc_to_netdev *tc_to_netdev)
 {
 	struct xgbe_prv_data *pdata = netdev_priv(netdev);
 	unsigned int offset, queue;
-	u8 i;
+	u8 i, tc;
 
-	if (handle != TC_H_ROOT)
+	if (handle != TC_H_ROOT || tc_to_netdev->type != TC_SETUP_MQPRIO)
 		return -EINVAL;
 
+	tc = tc_to_netdev->tc;
+
 	if (tc && (tc != pdata->hw_feat.tc_cnt))
 		return -EINVAL;
 
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index b262cba34dfa..45843d150868 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -4272,11 +4272,12 @@ int bnx2x_setup_tc(struct net_device *dev, u8 num_tc)
 	return 0;
 }
 
-int __bnx2x_setup_tc(struct net_device *dev, u32 handle, u8 num_tc)
+int __bnx2x_setup_tc(struct net_device *dev, u32 handle, __be16 proto,
+		     struct tc_to_netdev *tc)
 {
-	if (handle != TC_H_ROOT)
+	if (handle != TC_H_ROOT || tc->type != TC_SETUP_MQPRIO)
 		return -EINVAL;
-	return bnx2x_setup_tc(dev, num_tc);
+	return bnx2x_setup_tc(dev, tc->tc);
 }
 
 /* called with rtnl_lock */
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
index 60a4109dcdeb..0e68fadecfdb 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
@@ -486,7 +486,8 @@ netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev);
 
 /* setup_tc callback */
 int bnx2x_setup_tc(struct net_device *dev, u8 num_tc);
-int __bnx2x_setup_tc(struct net_device *dev, u32 handle, u8 num_tc);
+int __bnx2x_setup_tc(struct net_device *dev, u32 handle, __be16 proto,
+		     struct tc_to_netdev *tc);
 
 int bnx2x_get_vf_config(struct net_device *dev, int vf,
 			struct ifla_vf_info *ivi);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index ff08faf44ee5..169920aa39f3 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -5370,13 +5370,17 @@ static int bnxt_change_mtu(struct net_device *dev, int new_mtu)
 	return 0;
 }
 
-static int bnxt_setup_tc(struct net_device *dev, u32 handle, u8 tc)
+static int bnxt_setup_tc(struct net_device *dev, u32 handle, __be16 proto,
+			 struct tc_to_netdev *ntc)
 {
 	struct bnxt *bp = netdev_priv(dev);
+	u8 tc;
 
-	if (handle != TC_H_ROOT)
+	if (handle != TC_H_ROOT || ntc->type != TC_SETUP_MQPRIO)
 		return -EINVAL;
 
+	tc = ntc->tc;
+
 	if (tc > bp->max_tc) {
 		netdev_err(dev, "too many traffic classes requested: %d Max supported is %d\n",
 			   tc, bp->max_tc);
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
index 12701a492325..dc1a82148ff0 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
@@ -1204,12 +1204,13 @@ err_queueing_scheme:
 	return err;
 }
 
-static int __fm10k_setup_tc(struct net_device *dev, u32 handle, u8 tc)
+static int __fm10k_setup_tc(struct net_device *dev, u32 handle, __be16 proto,
+			    struct tc_to_netdev *tc)
 {
-	if (handle != TC_H_ROOT)
+	if (handle != TC_H_ROOT || tc->type != TC_SETUP_MQPRIO)
 		return -EINVAL;
 
-	return fm10k_setup_tc(dev, tc);
+	return fm10k_setup_tc(dev, tc->tc);
 }
 
 static int fm10k_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd)
diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h
index ef9ca075d5e5..933c4b3d92c8 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -788,7 +788,8 @@ struct i40e_mac_filter *i40e_find_mac(struct i40e_vsi *vsi, u8 *macaddr,
 				      bool is_vf, bool is_netdev);
 #ifdef I40E_FCOE
 int i40e_close(struct net_device *netdev);
-int __i40e_setup_tc(struct net_device *netdev, u32 handle, u8 tc);
+int __i40e_setup_tc(struct net_device *netdev, u32 handle, __be16 proto,
+		    struct tc_to_netdev *tc);
 void i40e_netpoll(struct net_device *netdev);
 int i40e_fcoe_enable(struct net_device *netdev);
 int i40e_fcoe_disable(struct net_device *netdev);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index abcb6c152186..257d16207976 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -5307,14 +5307,16 @@ exit:
 }
 
 #ifdef I40E_FCOE
-int __i40e_setup_tc(struct net_device *netdev, u32 handle, u8 tc)
+int __i40e_setup_tc(struct net_device *netdev, u32 handle, __be16 proto,
+		    struct tc_to_netdev *tc)
 #else
-static int __i40e_setup_tc(struct net_device *netdev, u32 handle, u8 tc)
+static int __i40e_setup_tc(struct net_device *netdev, u32 handle, __be16 proto,
+			   struct tc_to_netdev *tc)
 #endif
 {
-	if (handle != TC_H_ROOT)
+	if (handle != TC_H_ROOT || tc->type != TC_SETUP_MQPRIO)
 		return -EINVAL;
-	return i40e_setup_tc(netdev, tc);
+	return i40e_setup_tc(netdev, tc->tc);
 }
 
 /**
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 1ba714efd78c..dca2298f4c36 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -8200,13 +8200,14 @@ int ixgbe_setup_tc(struct net_device *dev, u8 tc)
 	return 0;
 }
 
-int __ixgbe_setup_tc(struct net_device *dev, u32 handle, u8 tc)
+int __ixgbe_setup_tc(struct net_device *dev, u32 handle, __be16 proto,
+		     struct tc_to_netdev *tc)
 {
 	/* Only support egress tc setup for now */
-	if (handle != TC_H_ROOT)
+	if (handle != TC_H_ROOT || tc->type != TC_SETUP_MQPRIO)
 		return -EINVAL;
 
-	return ixgbe_setup_tc(dev, tc);
+	return ixgbe_setup_tc(dev, tc->tc);
 }
 
 #ifdef CONFIG_PCI_IOV
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index d5c6c16b9457..01d6a9695586 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -69,12 +69,13 @@ int mlx4_en_setup_tc(struct net_device *dev, u8 up)
 	return 0;
 }
 
-static int __mlx4_en_setup_tc(struct net_device *dev, u32 handle, u8 up)
+static int __mlx4_en_setup_tc(struct net_device *dev, u32 handle, __be16 proto,
+			      struct tc_to_netdev *tc)
 {
-	if (handle != TC_H_ROOT)
+	if (handle != TC_H_ROOT || tc->type != TC_SETUP_MQPRIO)
 		return -EINVAL;
 
-	return mlx4_en_setup_tc(dev, up);
+	return mlx4_en_setup_tc(dev, tc->tc);
 }
 
 #ifdef CONFIG_RFS_ACCEL
diff --git a/drivers/net/ethernet/sfc/efx.h b/drivers/net/ethernet/sfc/efx.h
index 7815fa09b15d..5e3f93f04e62 100644
--- a/drivers/net/ethernet/sfc/efx.h
+++ b/drivers/net/ethernet/sfc/efx.h
@@ -32,7 +32,8 @@ netdev_tx_t efx_hard_start_xmit(struct sk_buff *skb,
 				struct net_device *net_dev);
 netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb);
 void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index);
-int efx_setup_tc(struct net_device *net_dev, u32 handle, u8 num_tc);
+int efx_setup_tc(struct net_device *net_dev, u32 handle, __be16 proto,
+		 struct tc_to_netdev *tc);
 unsigned int efx_tx_max_skb_descs(struct efx_nic *efx);
 extern unsigned int efx_piobuf_size;
 extern bool efx_separate_tx_channels;
diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c
index 8f1d53e2aca7..2cdb5718ed66 100644
--- a/drivers/net/ethernet/sfc/tx.c
+++ b/drivers/net/ethernet/sfc/tx.c
@@ -562,17 +562,20 @@ void efx_init_tx_queue_core_txq(struct efx_tx_queue *tx_queue)
 				     efx->n_tx_channels : 0));
 }
 
-int efx_setup_tc(struct net_device *net_dev, u32 handle, u8 num_tc)
+int efx_setup_tc(struct net_device *net_dev, u32 handle, __be16 proto,
+		 struct tc_to_netdev *ntc)
 {
 	struct efx_nic *efx = netdev_priv(net_dev);
 	struct efx_channel *channel;
 	struct efx_tx_queue *tx_queue;
-	unsigned tc;
+	unsigned tc, num_tc;
 	int rc;
 
-	if (handle != TC_H_ROOT)
+	if (handle != TC_H_ROOT || ntc->type != TC_SETUP_MQPRIO)
 		return -EINVAL;
 
+	num_tc = ntc->tc;
+
 	if (efx_nic_rev(efx) < EFX_REV_FALCON_B0 || num_tc > EFX_MAX_TX_TC)
 		return -EINVAL;
 
diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c
index 40cde814608b..8586a2034019 100644
--- a/drivers/net/ethernet/ti/netcp_core.c
+++ b/drivers/net/ethernet/ti/netcp_core.c
@@ -1835,25 +1835,26 @@ static u16 netcp_select_queue(struct net_device *dev, struct sk_buff *skb,
 	return 0;
 }
 
-static int netcp_setup_tc(struct net_device *dev, u32 handle, u8 num_tc)
+static int netcp_setup_tc(struct net_device *dev, u32 handle, __be16 proto,
+			  struct tc_to_netdev tc)
 {
 	int i;
 
 	/* setup tc must be called under rtnl lock */
 	ASSERT_RTNL();
 
-	if (handle != TC_H_ROOT)
+	if (handle != TC_H_ROOT || tc->type != TC_SETUP_MQPRIO)
 		return -EINVAL;
 
 	/* Sanity-check the number of traffic classes requested */
 	if ((dev->real_num_tx_queues <= 1) ||
-	    (dev->real_num_tx_queues < num_tc))
+	    (dev->real_num_tx_queues < tc->tc))
 		return -EINVAL;
 
 	/* Configure traffic class to queue mappings */
-	if (num_tc) {
-		netdev_set_num_tc(dev, num_tc);
-		for (i = 0; i < num_tc; i++)
+	if (tc->tc) {
+		netdev_set_num_tc(dev, tc->tc);
+		for (i = 0; i < tc->tc; i++)
 			netdev_set_tc_queue(dev, i, 1, i);
 	} else {
 		netdev_reset_tc(dev);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 48928b6f9cb6..e396060f815f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -779,6 +779,21 @@ static inline bool netdev_phys_item_id_same(struct netdev_phys_item_id *a,
 typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
 				       struct sk_buff *skb);
 
+/* This structure holds attributes of qdisc and classifiers
+ * that are being passed to the netdevice through the setup_tc op.
+ */
+enum {
+	TC_SETUP_MQPRIO,
+};
+
+struct tc_to_netdev {
+	unsigned int type;
+	union {
+		u8 tc;
+	};
+};
+
+
 /*
  * This structure defines the management hooks for network devices.
  * The following hooks can be defined; unless noted otherwise, they are
@@ -1151,7 +1166,10 @@ struct net_device_ops {
 	int			(*ndo_set_vf_rss_query_en)(
 						   struct net_device *dev,
 						   int vf, bool setting);
-	int			(*ndo_setup_tc)(struct net_device *dev, u32 handle, u8 tc);
+	int			(*ndo_setup_tc)(struct net_device *dev,
+						u32 handle,
+						__be16 protocol,
+						struct tc_to_netdev *tc);
 #if IS_ENABLED(CONFIG_FCOE)
 	int			(*ndo_fcoe_enable)(struct net_device *dev);
 	int			(*ndo_fcoe_disable)(struct net_device *dev);
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index f5a0e8a4dbd7..f9947d1f4952 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -28,6 +28,7 @@ static void mqprio_destroy(struct Qdisc *sch)
 {
 	struct net_device *dev = qdisc_dev(sch);
 	struct mqprio_sched *priv = qdisc_priv(sch);
+	struct tc_to_netdev tc = {.type = TC_SETUP_MQPRIO};
 	unsigned int ntx;
 
 	if (priv->qdiscs) {
@@ -39,7 +40,7 @@ static void mqprio_destroy(struct Qdisc *sch)
 	}
 
 	if (priv->hw_owned && dev->netdev_ops->ndo_setup_tc)
-		dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0);
+		dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0, &tc);
 	else
 		netdev_set_num_tc(dev, 0);
 }
@@ -140,9 +141,11 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
 	 * supplied and verified mapping
 	 */
 	if (qopt->hw) {
+		struct tc_to_netdev tc = {.type = TC_SETUP_MQPRIO,
+					  .tc = qopt->num_tc};
+
 		priv->hw_owned = 1;
-		err = dev->netdev_ops->ndo_setup_tc(dev, sch->handle,
-						    qopt->num_tc);
+		err = dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0, &tc);
 		if (err)
 			goto err;
 	} else {
-- 
cgit v1.2.3-59-g8ed1b


From a1b7c5fd7fe98f51fbbc393ee1fc4c1cdb2f0119 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Tue, 16 Feb 2016 21:17:09 -0800
Subject: net: sched: add cls_u32 offload hooks for netdevs

This patch allows netdev drivers to consume cls_u32 offloads via
the ndo_setup_tc ndo op.

This works aligns with how network drivers have been doing qdisc
offloads for mqprio.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  6 ++-
 include/net/pkt_cls.h     | 34 ++++++++++++++++
 net/sched/cls_u32.c       | 99 ++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 136 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e396060f815f..47671ce04ac4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -779,17 +779,21 @@ static inline bool netdev_phys_item_id_same(struct netdev_phys_item_id *a,
 typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
 				       struct sk_buff *skb);
 
-/* This structure holds attributes of qdisc and classifiers
+/* These structures hold the attributes of qdisc and classifiers
  * that are being passed to the netdevice through the setup_tc op.
  */
 enum {
 	TC_SETUP_MQPRIO,
+	TC_SETUP_CLSU32,
 };
 
+struct tc_cls_u32_offload;
+
 struct tc_to_netdev {
 	unsigned int type;
 	union {
 		u8 tc;
+		struct tc_cls_u32_offload *cls_u32;
 	};
 };
 
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index bc49967e1a68..59789ca6e2c8 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -358,4 +358,38 @@ tcf_match_indev(struct sk_buff *skb, int ifindex)
 }
 #endif /* CONFIG_NET_CLS_IND */
 
+struct tc_cls_u32_knode {
+	struct tcf_exts *exts;
+	u8 fshift;
+	u32 handle;
+	u32 val;
+	u32 mask;
+	u32 link_handle;
+	struct tc_u32_sel *sel;
+};
+
+struct tc_cls_u32_hnode {
+	u32 handle;
+	u32 prio;
+	unsigned int divisor;
+};
+
+enum tc_clsu32_command {
+	TC_CLSU32_NEW_KNODE,
+	TC_CLSU32_REPLACE_KNODE,
+	TC_CLSU32_DELETE_KNODE,
+	TC_CLSU32_NEW_HNODE,
+	TC_CLSU32_REPLACE_HNODE,
+	TC_CLSU32_DELETE_HNODE,
+};
+
+struct tc_cls_u32_offload {
+	/* knode values */
+	enum tc_clsu32_command command;
+	union {
+		struct tc_cls_u32_knode knode;
+		struct tc_cls_u32_hnode hnode;
+	};
+};
+
 #endif
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 4fbb67430ce4..d54bc942ea87 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -43,6 +43,7 @@
 #include <net/netlink.h>
 #include <net/act_api.h>
 #include <net/pkt_cls.h>
+#include <linux/netdevice.h>
 
 struct tc_u_knode {
 	struct tc_u_knode __rcu	*next;
@@ -424,6 +425,93 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)
 	return 0;
 }
 
+static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
+{
+	struct net_device *dev = tp->q->dev_queue->dev;
+	struct tc_cls_u32_offload u32_offload = {0};
+	struct tc_to_netdev offload;
+
+	offload.type = TC_SETUP_CLSU32;
+	offload.cls_u32 = &u32_offload;
+
+	if (dev->netdev_ops->ndo_setup_tc) {
+		offload.cls_u32->command = TC_CLSU32_DELETE_KNODE;
+		offload.cls_u32->knode.handle = handle;
+		dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
+					      tp->protocol, &offload);
+	}
+}
+
+static void u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
+{
+	struct net_device *dev = tp->q->dev_queue->dev;
+	struct tc_cls_u32_offload u32_offload = {0};
+	struct tc_to_netdev offload;
+
+	offload.type = TC_SETUP_CLSU32;
+	offload.cls_u32 = &u32_offload;
+
+	if (dev->netdev_ops->ndo_setup_tc) {
+		offload.cls_u32->command = TC_CLSU32_NEW_HNODE;
+		offload.cls_u32->hnode.divisor = h->divisor;
+		offload.cls_u32->hnode.handle = h->handle;
+		offload.cls_u32->hnode.prio = h->prio;
+
+		dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
+					      tp->protocol, &offload);
+	}
+}
+
+static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
+{
+	struct net_device *dev = tp->q->dev_queue->dev;
+	struct tc_cls_u32_offload u32_offload = {0};
+	struct tc_to_netdev offload;
+
+	offload.type = TC_SETUP_CLSU32;
+	offload.cls_u32 = &u32_offload;
+
+	if (dev->netdev_ops->ndo_setup_tc) {
+		offload.cls_u32->command = TC_CLSU32_DELETE_HNODE;
+		offload.cls_u32->hnode.divisor = h->divisor;
+		offload.cls_u32->hnode.handle = h->handle;
+		offload.cls_u32->hnode.prio = h->prio;
+
+		dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
+					      tp->protocol, &offload);
+	}
+}
+
+static void u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n)
+{
+	struct net_device *dev = tp->q->dev_queue->dev;
+	struct tc_cls_u32_offload u32_offload = {0};
+	struct tc_to_netdev offload;
+
+	offload.type = TC_SETUP_CLSU32;
+	offload.cls_u32 = &u32_offload;
+
+	if (dev->netdev_ops->ndo_setup_tc) {
+		offload.cls_u32->command = TC_CLSU32_REPLACE_KNODE;
+		offload.cls_u32->knode.handle = n->handle;
+		offload.cls_u32->knode.fshift = n->fshift;
+#ifdef CONFIG_CLS_U32_MARK
+		offload.cls_u32->knode.val = n->val;
+		offload.cls_u32->knode.mask = n->mask;
+#else
+		offload.cls_u32->knode.val = 0;
+		offload.cls_u32->knode.mask = 0;
+#endif
+		offload.cls_u32->knode.sel = &n->sel;
+		offload.cls_u32->knode.exts = &n->exts;
+		if (n->ht_down)
+			offload.cls_u32->knode.link_handle = n->ht_down->handle;
+
+		dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
+					      tp->protocol, &offload);
+	}
+}
+
 static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
 {
 	struct tc_u_knode *n;
@@ -434,6 +522,7 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
 			RCU_INIT_POINTER(ht->ht[h],
 					 rtnl_dereference(n->next));
 			tcf_unbind_filter(tp, &n->res);
+			u32_remove_hw_knode(tp, n->handle);
 			call_rcu(&n->rcu, u32_delete_key_freepf_rcu);
 		}
 	}
@@ -454,6 +543,7 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
 	     phn;
 	     hn = &phn->next, phn = rtnl_dereference(*hn)) {
 		if (phn == ht) {
+			u32_clear_hw_hnode(tp, ht);
 			RCU_INIT_POINTER(*hn, ht->next);
 			kfree_rcu(ht, rcu);
 			return 0;
@@ -540,8 +630,10 @@ static int u32_delete(struct tcf_proto *tp, unsigned long arg)
 	if (ht == NULL)
 		return 0;
 
-	if (TC_U32_KEY(ht->handle))
+	if (TC_U32_KEY(ht->handle)) {
+		u32_remove_hw_knode(tp, ht->handle);
 		return u32_delete_key(tp, (struct tc_u_knode *)ht);
+	}
 
 	if (root_ht == ht)
 		return -EINVAL;
@@ -769,6 +861,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 		u32_replace_knode(tp, tp_c, new);
 		tcf_unbind_filter(tp, &n->res);
 		call_rcu(&n->rcu, u32_delete_key_rcu);
+		u32_replace_hw_knode(tp, new);
 		return 0;
 	}
 
@@ -795,6 +888,8 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 		RCU_INIT_POINTER(ht->next, tp_c->hlist);
 		rcu_assign_pointer(tp_c->hlist, ht);
 		*arg = (unsigned long)ht;
+
+		u32_replace_hw_hnode(tp, ht);
 		return 0;
 	}
 
@@ -877,7 +972,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 
 		RCU_INIT_POINTER(n->next, pins);
 		rcu_assign_pointer(*ins, n);
-
+		u32_replace_hw_knode(tp, n);
 		*arg = (unsigned long)n;
 		return 0;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 1c78c64e9c6f43a490427d55cd2d213b7c6795c1 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Tue, 16 Feb 2016 21:17:37 -0800
Subject: net: add tc offload feature flag

Its useful to turn off the qdisc offload feature at a per device
level. This gives us a big hammer to enable/disable offloading.
More fine grained control (i.e. per rule) may be supported later.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdev_features.h | 3 +++
 net/core/ethtool.c              | 1 +
 2 files changed, 4 insertions(+)

(limited to 'net')

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index d9654f0eecb3..a734bf43d190 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -67,6 +67,8 @@ enum {
 	NETIF_F_HW_L2FW_DOFFLOAD_BIT,	/* Allow L2 Forwarding in Hardware */
 	NETIF_F_BUSY_POLL_BIT,		/* Busy poll */
 
+	NETIF_F_HW_TC_BIT,		/* Offload TC infrastructure */
+
 	/*
 	 * Add your fresh new feature above and remember to update
 	 * netdev_features_strings[] in net/core/ethtool.c and maybe
@@ -124,6 +126,7 @@ enum {
 #define NETIF_F_HW_VLAN_STAG_TX	__NETIF_F(HW_VLAN_STAG_TX)
 #define NETIF_F_HW_L2FW_DOFFLOAD	__NETIF_F(HW_L2FW_DOFFLOAD)
 #define NETIF_F_BUSY_POLL	__NETIF_F(BUSY_POLL)
+#define NETIF_F_HW_TC		__NETIF_F(HW_TC)
 
 #define for_each_netdev_feature(mask_addr, bit)	\
 	for_each_set_bit(bit, (unsigned long *)mask_addr, NETDEV_FEATURE_COUNT)
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 65f907aea777..c2d3118b1395 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -98,6 +98,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
 	[NETIF_F_RXALL_BIT] =            "rx-all",
 	[NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload",
 	[NETIF_F_BUSY_POLL_BIT] =        "busy-poll",
+	[NETIF_F_HW_TC_BIT] =		 "hw-tc-offload",
 };
 
 static const char
-- 
cgit v1.2.3-59-g8ed1b


From bd4508e8507cbeee3c0fd96a1f96327b07c490b9 Mon Sep 17 00:00:00 2001
From: "Rosen, Rami" <rami.rosen@intel.com>
Date: Mon, 15 Feb 2016 02:39:43 +0200
Subject: core: remove unneded headers for net cgroup controllers.

commit 3ed80a6 (cgroup: drop module support) made including
module.h redundant in the net cgroup controllers,
netclassid_cgroup.c and netprio_cgroup.c. This patch
removes them.

Signed-off-by: Rami Rosen <rami.rosen@intel.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/netclassid_cgroup.c | 1 -
 net/core/netprio_cgroup.c    | 1 -
 2 files changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index 0260c84ed83c..11fce17274f6 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -9,7 +9,6 @@
  * Authors:	Thomas Graf <tgraf@suug.ch>
  */
 
-#include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/cgroup.h>
 #include <linux/fdtable.h>
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index f1efbc39ef6b..2ec86fc552df 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -11,7 +11,6 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
-#include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/types.h>
 #include <linux/string.h>
-- 
cgit v1.2.3-59-g8ed1b


From f46c7011b062726b798fa3326b5b0c2711ead2a0 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 15 Feb 2016 14:28:03 +0800
Subject: sctp: move rcu_read_lock from __sctp_lookup_association to
 sctp_lookup_association

__sctp_lookup_association() is only invoked by sctp_v4_err() and
sctp_rcv(), both which run on the rx BH, and it has been protected
by rcu_read_lock [see ip_local_deliver_finish() / ipv6_rcv()].

So we can move it to sctp_lookup_association, only let
sctp_lookup_association use rcu_read_lock.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/input.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sctp/input.c b/net/sctp/input.c
index 49d2cc751386..21a2d6b7abaf 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -937,7 +937,6 @@ static struct sctp_association *__sctp_lookup_association(
 	struct sctp_transport *t;
 	struct sctp_association *asoc = NULL;
 
-	rcu_read_lock();
 	t = sctp_addrs_lookup_transport(net, local, peer);
 	if (!t || !sctp_transport_hold(t))
 		goto out;
@@ -949,7 +948,6 @@ static struct sctp_association *__sctp_lookup_association(
 	sctp_transport_put(t);
 
 out:
-	rcu_read_unlock();
 	return asoc;
 }
 
@@ -962,7 +960,9 @@ struct sctp_association *sctp_lookup_association(struct net *net,
 {
 	struct sctp_association *asoc;
 
+	rcu_read_lock();
 	asoc = __sctp_lookup_association(net, laddr, paddr, transportp);
+	rcu_read_unlock();
 
 	return asoc;
 }
-- 
cgit v1.2.3-59-g8ed1b


From ac1efde8022acf705c4b8393d5ee4b070594ba88 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 15 Feb 2016 14:28:04 +0800
Subject: sctp: remove rcu_read_lock in sctp_seq_dump_remote_addrs()

sctp_seq_dump_remote_addrs is only called by sctp_assocs_seq_show()
and it has been protected by rcu_read_lock that is from
rhashtable_walk_start().

So we will remove this one.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/proc.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index ded7d931a6a5..cfc3c7101a38 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -161,7 +161,6 @@ static void sctp_seq_dump_remote_addrs(struct seq_file *seq, struct sctp_associa
 	struct sctp_af *af;
 
 	primary = &assoc->peer.primary_addr;
-	rcu_read_lock();
 	list_for_each_entry_rcu(transport, &assoc->peer.transport_addr_list,
 			transports) {
 		addr = &transport->ipaddr;
@@ -172,7 +171,6 @@ static void sctp_seq_dump_remote_addrs(struct seq_file *seq, struct sctp_associa
 		}
 		af->seq_dump_addr(seq, addr);
 	}
-	rcu_read_unlock();
 }
 
 static void *sctp_eps_seq_start(struct seq_file *seq, loff_t *pos)
-- 
cgit v1.2.3-59-g8ed1b


From 1cd4d5c4326a7ed3bb0e346bd7d20f5057a80ae6 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 15 Feb 2016 14:28:05 +0800
Subject: sctp: remove the unused sctp_datamsg_free()

Since commit 8b570dc9f7b6 ("sctp: only drop the reference on the datamsg
after sending a msg") used sctp_datamsg_put in sctp_sendmsg, instead of
sctp_datamsg_free, this function has no use in sctp.

So we will remove it.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h |  1 -
 net/sctp/chunk.c           | 13 -------------
 2 files changed, 14 deletions(-)

(limited to 'net')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 205630bb5010..d05b56641abc 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -535,7 +535,6 @@ struct sctp_datamsg {
 struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *,
 					    struct sctp_sndrcvinfo *,
 					    struct iov_iter *);
-void sctp_datamsg_free(struct sctp_datamsg *);
 void sctp_datamsg_put(struct sctp_datamsg *);
 void sctp_chunk_fail(struct sctp_chunk *, int error);
 int sctp_chunk_abandoned(struct sctp_chunk *);
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index a3380917f197..3aa43073e0b9 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -70,19 +70,6 @@ static struct sctp_datamsg *sctp_datamsg_new(gfp_t gfp)
 	return msg;
 }
 
-void sctp_datamsg_free(struct sctp_datamsg *msg)
-{
-	struct sctp_chunk *chunk;
-
-	/* This doesn't have to be a _safe vairant because
-	 * sctp_chunk_free() only drops the refs.
-	 */
-	list_for_each_entry(chunk, &msg->chunks, frag_list)
-		sctp_chunk_free(chunk);
-
-	sctp_datamsg_put(msg);
-}
-
 /* Final destructruction of datamsg memory. */
 static void sctp_datamsg_destroy(struct sctp_datamsg *msg)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 82aaf4fcbe0f26adb14e72c9df6a2c4ed3f8c34d Mon Sep 17 00:00:00 2001
From: One Thousand Gnomes <gnomes@lxorguk.ukuu.org.uk>
Date: Mon, 15 Feb 2016 19:41:51 +0000
Subject: af_llc: fix types on llc_ui_wait_for_conn

The timeout is a long, we return it truncated if it is huge. Basically
harmless as the only caller does a boolean check, but tidy it up anyway.

(64bit build tested this time. Thank you 0day)

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/llc/af_llc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 8dab4e569571..b3c52e3f689a 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -38,7 +38,7 @@ static u16 llc_ui_sap_link_no_max[256];
 static struct sockaddr_llc llc_ui_addrnull;
 static const struct proto_ops llc_ui_ops;
 
-static int llc_ui_wait_for_conn(struct sock *sk, long timeout);
+static long llc_ui_wait_for_conn(struct sock *sk, long timeout);
 static int llc_ui_wait_for_disc(struct sock *sk, long timeout);
 static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout);
 
@@ -551,7 +551,7 @@ static int llc_ui_wait_for_disc(struct sock *sk, long timeout)
 	return rc;
 }
 
-static int llc_ui_wait_for_conn(struct sock *sk, long timeout)
+static long llc_ui_wait_for_conn(struct sock *sk, long timeout)
 {
 	DEFINE_WAIT(wait);
 
-- 
cgit v1.2.3-59-g8ed1b


From 7bbf3cae65b6e438bf52033b63fdce4a86e89e17 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben@decadent.org.uk>
Date: Mon, 15 Feb 2016 21:25:57 +0000
Subject: ipv4: Remove inet_lro library

There are no longer any in-tree drivers that use it.

Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/inet_lro.h | 142 ------------------
 net/ipv4/Kconfig         |   8 -
 net/ipv4/Makefile        |   1 -
 net/ipv4/inet_lro.c      | 374 -----------------------------------------------
 4 files changed, 525 deletions(-)
 delete mode 100644 include/linux/inet_lro.h
 delete mode 100644 net/ipv4/inet_lro.c

(limited to 'net')

diff --git a/include/linux/inet_lro.h b/include/linux/inet_lro.h
deleted file mode 100644
index 9a715cfa1fe3..000000000000
--- a/include/linux/inet_lro.h
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- *  linux/include/linux/inet_lro.h
- *
- *  Large Receive Offload (ipv4 / tcp)
- *
- *  (C) Copyright IBM Corp. 2007
- *
- *  Authors:
- *       Jan-Bernd Themann <themann@de.ibm.com>
- *       Christoph Raisch <raisch@de.ibm.com>
- *
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#ifndef __INET_LRO_H_
-#define __INET_LRO_H_
-
-#include <net/ip.h>
-#include <net/tcp.h>
-
-/*
- * LRO statistics
- */
-
-struct net_lro_stats {
-	unsigned long aggregated;
-	unsigned long flushed;
-	unsigned long no_desc;
-};
-
-/*
- * LRO descriptor for a tcp session
- */
-struct net_lro_desc {
-	struct sk_buff *parent;
-	struct sk_buff *last_skb;
-	struct skb_frag_struct *next_frag;
-	struct iphdr *iph;
-	struct tcphdr *tcph;
-	__wsum  data_csum;
-	__be32 tcp_rcv_tsecr;
-	__be32 tcp_rcv_tsval;
-	__be32 tcp_ack;
-	u32 tcp_next_seq;
-	u32 skb_tot_frags_len;
-	u16 ip_tot_len;
-	u16 tcp_saw_tstamp; 		/* timestamps enabled */
-	__be16 tcp_window;
-	int pkt_aggr_cnt;		/* counts aggregated packets */
-	int vlan_packet;
-	int mss;
-	int active;
-};
-
-/*
- * Large Receive Offload (LRO) Manager
- *
- * Fields must be set by driver
- */
-
-struct net_lro_mgr {
-	struct net_device *dev;
-	struct net_lro_stats stats;
-
-	/* LRO features */
-	unsigned long features;
-#define LRO_F_NAPI            1  /* Pass packets to stack via NAPI */
-#define LRO_F_EXTRACT_VLAN_ID 2  /* Set flag if VLAN IDs are extracted
-				    from received packets and eth protocol
-				    is still ETH_P_8021Q */
-
-	/*
-	 * Set for generated SKBs that are not added to
-	 * the frag list in fragmented mode
-	 */
-	u32 ip_summed;
-	u32 ip_summed_aggr; /* Set in aggregated SKBs: CHECKSUM_UNNECESSARY
-			     * or CHECKSUM_NONE */
-
-	int max_desc; /* Max number of LRO descriptors  */
-	int max_aggr; /* Max number of LRO packets to be aggregated */
-
-	int frag_align_pad; /* Padding required to properly align layer 3
-			     * headers in generated skb when using frags */
-
-	struct net_lro_desc *lro_arr; /* Array of LRO descriptors */
-
-	/*
-	 * Optimized driver functions
-	 *
-	 * get_skb_header: returns tcp and ip header for packet in SKB
-	 */
-	int (*get_skb_header)(struct sk_buff *skb, void **ip_hdr,
-			      void **tcpudp_hdr, u64 *hdr_flags, void *priv);
-
-	/* hdr_flags: */
-#define LRO_IPV4 1 /* ip_hdr is IPv4 header */
-#define LRO_TCP  2 /* tcpudp_hdr is TCP header */
-
-	/*
-	 * get_frag_header: returns mac, tcp and ip header for packet in SKB
-	 *
-	 * @hdr_flags: Indicate what kind of LRO has to be done
-	 *             (IPv4/IPv6/TCP/UDP)
-	 */
-	int (*get_frag_header)(struct skb_frag_struct *frag, void **mac_hdr,
-			       void **ip_hdr, void **tcpudp_hdr, u64 *hdr_flags,
-			       void *priv);
-};
-
-/*
- * Processes a SKB
- *
- * @lro_mgr: LRO manager to use
- * @skb: SKB to aggregate
- * @priv: Private data that may be used by driver functions
- *        (for example get_tcp_ip_hdr)
- */
-
-void lro_receive_skb(struct net_lro_mgr *lro_mgr,
-		     struct sk_buff *skb,
-		     void *priv);
-/*
- * Forward all aggregated SKBs held by lro_mgr to network stack
- */
-
-void lro_flush_all(struct net_lro_mgr *lro_mgr);
-
-#endif
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 395d82754626..238225b0c970 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -406,14 +406,6 @@ config INET_XFRM_MODE_BEET
 
 	  If unsure, say Y.
 
-config INET_LRO
-	tristate "Large Receive Offload (ipv4/tcp)"
-	default y
-	---help---
-	  Support for Large Receive Offload (ipv4/tcp).
-
-	  If unsure, say Y.
-
 config INET_DIAG
 	tristate "INET: socket monitoring interface"
 	default y
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 62c049b647e9..bfa133691cde 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -32,7 +32,6 @@ obj-$(CONFIG_INET_ESP) += esp4.o
 obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
 obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o
 obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o
-obj-$(CONFIG_INET_LRO) += inet_lro.o
 obj-$(CONFIG_INET_TUNNEL) += tunnel4.o
 obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
 obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c
deleted file mode 100644
index f17ea49b28fb..000000000000
--- a/net/ipv4/inet_lro.c
+++ /dev/null
@@ -1,374 +0,0 @@
-/*
- *  linux/net/ipv4/inet_lro.c
- *
- *  Large Receive Offload (ipv4 / tcp)
- *
- *  (C) Copyright IBM Corp. 2007
- *
- *  Authors:
- *       Jan-Bernd Themann <themann@de.ibm.com>
- *       Christoph Raisch <raisch@de.ibm.com>
- *
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-
-#include <linux/module.h>
-#include <linux/if_vlan.h>
-#include <linux/inet_lro.h>
-#include <net/checksum.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jan-Bernd Themann <themann@de.ibm.com>");
-MODULE_DESCRIPTION("Large Receive Offload (ipv4 / tcp)");
-
-#define TCP_HDR_LEN(tcph) (tcph->doff << 2)
-#define IP_HDR_LEN(iph) (iph->ihl << 2)
-#define TCP_PAYLOAD_LENGTH(iph, tcph) \
-	(ntohs(iph->tot_len) - IP_HDR_LEN(iph) - TCP_HDR_LEN(tcph))
-
-#define IPH_LEN_WO_OPTIONS 5
-#define TCPH_LEN_WO_OPTIONS 5
-#define TCPH_LEN_W_TIMESTAMP 8
-
-#define LRO_MAX_PG_HLEN 64
-
-#define LRO_INC_STATS(lro_mgr, attr) { lro_mgr->stats.attr++; }
-
-/*
- * Basic tcp checks whether packet is suitable for LRO
- */
-
-static int lro_tcp_ip_check(const struct iphdr *iph, const struct tcphdr *tcph,
-			    int len, const struct net_lro_desc *lro_desc)
-{
-        /* check ip header: don't aggregate padded frames */
-	if (ntohs(iph->tot_len) != len)
-		return -1;
-
-	if (TCP_PAYLOAD_LENGTH(iph, tcph) == 0)
-		return -1;
-
-	if (iph->ihl != IPH_LEN_WO_OPTIONS)
-		return -1;
-
-	if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack ||
-	    tcph->rst || tcph->syn || tcph->fin)
-		return -1;
-
-	if (INET_ECN_is_ce(ipv4_get_dsfield(iph)))
-		return -1;
-
-	if (tcph->doff != TCPH_LEN_WO_OPTIONS &&
-	    tcph->doff != TCPH_LEN_W_TIMESTAMP)
-		return -1;
-
-	/* check tcp options (only timestamp allowed) */
-	if (tcph->doff == TCPH_LEN_W_TIMESTAMP) {
-		__be32 *topt = (__be32 *)(tcph + 1);
-
-		if (*topt != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
-				   | (TCPOPT_TIMESTAMP << 8)
-				   | TCPOLEN_TIMESTAMP))
-			return -1;
-
-		/* timestamp should be in right order */
-		topt++;
-		if (lro_desc && after(ntohl(lro_desc->tcp_rcv_tsval),
-				      ntohl(*topt)))
-			return -1;
-
-		/* timestamp reply should not be zero */
-		topt++;
-		if (*topt == 0)
-			return -1;
-	}
-
-	return 0;
-}
-
-static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc)
-{
-	struct iphdr *iph = lro_desc->iph;
-	struct tcphdr *tcph = lro_desc->tcph;
-	__be32 *p;
-	__wsum tcp_hdr_csum;
-
-	tcph->ack_seq = lro_desc->tcp_ack;
-	tcph->window = lro_desc->tcp_window;
-
-	if (lro_desc->tcp_saw_tstamp) {
-		p = (__be32 *)(tcph + 1);
-		*(p+2) = lro_desc->tcp_rcv_tsecr;
-	}
-
-	csum_replace2(&iph->check, iph->tot_len, htons(lro_desc->ip_tot_len));
-	iph->tot_len = htons(lro_desc->ip_tot_len);
-
-	tcph->check = 0;
-	tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), 0);
-	lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum);
-	tcph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
-					lro_desc->ip_tot_len -
-					IP_HDR_LEN(iph), IPPROTO_TCP,
-					lro_desc->data_csum);
-}
-
-static __wsum lro_tcp_data_csum(struct iphdr *iph, struct tcphdr *tcph, int len)
-{
-	__wsum tcp_csum;
-	__wsum tcp_hdr_csum;
-	__wsum tcp_ps_hdr_csum;
-
-	tcp_csum = ~csum_unfold(tcph->check);
-	tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), tcp_csum);
-
-	tcp_ps_hdr_csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
-					     len + TCP_HDR_LEN(tcph),
-					     IPPROTO_TCP, 0);
-
-	return csum_sub(csum_sub(tcp_csum, tcp_hdr_csum),
-			tcp_ps_hdr_csum);
-}
-
-static void lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb,
-			  struct iphdr *iph, struct tcphdr *tcph)
-{
-	int nr_frags;
-	__be32 *ptr;
-	u32 tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);
-
-	nr_frags = skb_shinfo(skb)->nr_frags;
-	lro_desc->parent = skb;
-	lro_desc->next_frag = &(skb_shinfo(skb)->frags[nr_frags]);
-	lro_desc->iph = iph;
-	lro_desc->tcph = tcph;
-	lro_desc->tcp_next_seq = ntohl(tcph->seq) + tcp_data_len;
-	lro_desc->tcp_ack = tcph->ack_seq;
-	lro_desc->tcp_window = tcph->window;
-
-	lro_desc->pkt_aggr_cnt = 1;
-	lro_desc->ip_tot_len = ntohs(iph->tot_len);
-
-	if (tcph->doff == 8) {
-		ptr = (__be32 *)(tcph+1);
-		lro_desc->tcp_saw_tstamp = 1;
-		lro_desc->tcp_rcv_tsval = *(ptr+1);
-		lro_desc->tcp_rcv_tsecr = *(ptr+2);
-	}
-
-	lro_desc->mss = tcp_data_len;
-	lro_desc->active = 1;
-
-	lro_desc->data_csum = lro_tcp_data_csum(iph, tcph,
-						tcp_data_len);
-}
-
-static inline void lro_clear_desc(struct net_lro_desc *lro_desc)
-{
-	memset(lro_desc, 0, sizeof(struct net_lro_desc));
-}
-
-static void lro_add_common(struct net_lro_desc *lro_desc, struct iphdr *iph,
-			   struct tcphdr *tcph, int tcp_data_len)
-{
-	struct sk_buff *parent = lro_desc->parent;
-	__be32 *topt;
-
-	lro_desc->pkt_aggr_cnt++;
-	lro_desc->ip_tot_len += tcp_data_len;
-	lro_desc->tcp_next_seq += tcp_data_len;
-	lro_desc->tcp_window = tcph->window;
-	lro_desc->tcp_ack = tcph->ack_seq;
-
-	/* don't update tcp_rcv_tsval, would not work with PAWS */
-	if (lro_desc->tcp_saw_tstamp) {
-		topt = (__be32 *) (tcph + 1);
-		lro_desc->tcp_rcv_tsecr = *(topt + 2);
-	}
-
-	lro_desc->data_csum = csum_block_add(lro_desc->data_csum,
-					     lro_tcp_data_csum(iph, tcph,
-							       tcp_data_len),
-					     parent->len);
-
-	parent->len += tcp_data_len;
-	parent->data_len += tcp_data_len;
-	if (tcp_data_len > lro_desc->mss)
-		lro_desc->mss = tcp_data_len;
-}
-
-static void lro_add_packet(struct net_lro_desc *lro_desc, struct sk_buff *skb,
-			   struct iphdr *iph, struct tcphdr *tcph)
-{
-	struct sk_buff *parent = lro_desc->parent;
-	int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);
-
-	lro_add_common(lro_desc, iph, tcph, tcp_data_len);
-
-	skb_pull(skb, (skb->len - tcp_data_len));
-	parent->truesize += skb->truesize;
-
-	if (lro_desc->last_skb)
-		lro_desc->last_skb->next = skb;
-	else
-		skb_shinfo(parent)->frag_list = skb;
-
-	lro_desc->last_skb = skb;
-}
-
-
-static int lro_check_tcp_conn(struct net_lro_desc *lro_desc,
-			      struct iphdr *iph,
-			      struct tcphdr *tcph)
-{
-	if ((lro_desc->iph->saddr != iph->saddr) ||
-	    (lro_desc->iph->daddr != iph->daddr) ||
-	    (lro_desc->tcph->source != tcph->source) ||
-	    (lro_desc->tcph->dest != tcph->dest))
-		return -1;
-	return 0;
-}
-
-static struct net_lro_desc *lro_get_desc(struct net_lro_mgr *lro_mgr,
-					 struct net_lro_desc *lro_arr,
-					 struct iphdr *iph,
-					 struct tcphdr *tcph)
-{
-	struct net_lro_desc *lro_desc = NULL;
-	struct net_lro_desc *tmp;
-	int max_desc = lro_mgr->max_desc;
-	int i;
-
-	for (i = 0; i < max_desc; i++) {
-		tmp = &lro_arr[i];
-		if (tmp->active)
-			if (!lro_check_tcp_conn(tmp, iph, tcph)) {
-				lro_desc = tmp;
-				goto out;
-			}
-	}
-
-	for (i = 0; i < max_desc; i++) {
-		if (!lro_arr[i].active) {
-			lro_desc = &lro_arr[i];
-			goto out;
-		}
-	}
-
-	LRO_INC_STATS(lro_mgr, no_desc);
-out:
-	return lro_desc;
-}
-
-static void lro_flush(struct net_lro_mgr *lro_mgr,
-		      struct net_lro_desc *lro_desc)
-{
-	if (lro_desc->pkt_aggr_cnt > 1)
-		lro_update_tcp_ip_header(lro_desc);
-
-	skb_shinfo(lro_desc->parent)->gso_size = lro_desc->mss;
-
-	if (lro_mgr->features & LRO_F_NAPI)
-		netif_receive_skb(lro_desc->parent);
-	else
-		netif_rx(lro_desc->parent);
-
-	LRO_INC_STATS(lro_mgr, flushed);
-	lro_clear_desc(lro_desc);
-}
-
-static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
-			  void *priv)
-{
-	struct net_lro_desc *lro_desc;
-	struct iphdr *iph;
-	struct tcphdr *tcph;
-	u64 flags;
-	int vlan_hdr_len = 0;
-
-	if (!lro_mgr->get_skb_header ||
-	    lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph,
-				    &flags, priv))
-		goto out;
-
-	if (!(flags & LRO_IPV4) || !(flags & LRO_TCP))
-		goto out;
-
-	lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph);
-	if (!lro_desc)
-		goto out;
-
-	if ((skb->protocol == htons(ETH_P_8021Q)) &&
-	    !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID))
-		vlan_hdr_len = VLAN_HLEN;
-
-	if (!lro_desc->active) { /* start new lro session */
-		if (lro_tcp_ip_check(iph, tcph, skb->len - vlan_hdr_len, NULL))
-			goto out;
-
-		skb->ip_summed = lro_mgr->ip_summed_aggr;
-		lro_init_desc(lro_desc, skb, iph, tcph);
-		LRO_INC_STATS(lro_mgr, aggregated);
-		return 0;
-	}
-
-	if (lro_desc->tcp_next_seq != ntohl(tcph->seq))
-		goto out2;
-
-	if (lro_tcp_ip_check(iph, tcph, skb->len, lro_desc))
-		goto out2;
-
-	lro_add_packet(lro_desc, skb, iph, tcph);
-	LRO_INC_STATS(lro_mgr, aggregated);
-
-	if ((lro_desc->pkt_aggr_cnt >= lro_mgr->max_aggr) ||
-	    lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu))
-		lro_flush(lro_mgr, lro_desc);
-
-	return 0;
-
-out2: /* send aggregated SKBs to stack */
-	lro_flush(lro_mgr, lro_desc);
-
-out:
-	return 1;
-}
-
-void lro_receive_skb(struct net_lro_mgr *lro_mgr,
-		     struct sk_buff *skb,
-		     void *priv)
-{
-	if (__lro_proc_skb(lro_mgr, skb, priv)) {
-		if (lro_mgr->features & LRO_F_NAPI)
-			netif_receive_skb(skb);
-		else
-			netif_rx(skb);
-	}
-}
-EXPORT_SYMBOL(lro_receive_skb);
-
-void lro_flush_all(struct net_lro_mgr *lro_mgr)
-{
-	int i;
-	struct net_lro_desc *lro_desc = lro_mgr->lro_arr;
-
-	for (i = 0; i < lro_mgr->max_desc; i++) {
-		if (lro_desc[i].active)
-			lro_flush(lro_mgr, &lro_desc[i]);
-	}
-}
-EXPORT_SYMBOL(lro_flush_all);
-- 
cgit v1.2.3-59-g8ed1b


From 2d9e9b0d05876c9e6b52cfec142e51da9b9b3f6d Mon Sep 17 00:00:00 2001
From: Yannick Brosseau <scientist@fb.com>
Date: Mon, 25 Jan 2016 17:12:25 -0800
Subject: netfilter: ipvs: Remove noisy debug print from ip_vs_del_service

This have been there for a long time, but does not seem to add value

Signed-off-by: Yannick Brosseau <scientist@fb.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_ctl.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index e7c1b052c2a3..daf4cb746974 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1376,8 +1376,6 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
 	struct ip_vs_pe *old_pe;
 	struct netns_ipvs *ipvs = svc->ipvs;
 
-	pr_info("%s: enter\n", __func__);
-
 	/* Count only IPv4 services for old get/setsockopt interface */
 	if (svc->af == AF_INET)
 		ipvs->num_services--;
-- 
cgit v1.2.3-59-g8ed1b


From f6ca9f46f6615c3a87529550058d1b468c0cad89 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 27 Jan 2016 14:52:01 +0100
Subject: netfilter: ipvs: avoid unused variable warnings

The proc_create() and remove_proc_entry() functions do not reference
their arguments when CONFIG_PROC_FS is disabled, so we get a couple
of warnings about unused variables in IPVS:

ipvs/ip_vs_app.c:608:14: warning: unused variable 'net' [-Wunused-variable]
ipvs/ip_vs_ctl.c:3950:14: warning: unused variable 'net' [-Wunused-variable]
ipvs/ip_vs_ctl.c:3994:14: warning: unused variable 'net' [-Wunused-variable]

This removes the local variables and instead looks them up separately
for each use, which obviously avoids the warning.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Fixes: 4c50a8ce2b63 ("netfilter: ipvs: avoid unused variable warning")
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_app.c |  8 ++------
 net/netfilter/ipvs/ip_vs_ctl.c | 15 ++++++---------
 2 files changed, 8 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index 0328f7250693..299edc6add5a 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -605,17 +605,13 @@ static const struct file_operations ip_vs_app_fops = {
 
 int __net_init ip_vs_app_net_init(struct netns_ipvs *ipvs)
 {
-	struct net *net = ipvs->net;
-
 	INIT_LIST_HEAD(&ipvs->app_list);
-	proc_create("ip_vs_app", 0, net->proc_net, &ip_vs_app_fops);
+	proc_create("ip_vs_app", 0, ipvs->net->proc_net, &ip_vs_app_fops);
 	return 0;
 }
 
 void __net_exit ip_vs_app_net_cleanup(struct netns_ipvs *ipvs)
 {
-	struct net *net = ipvs->net;
-
 	unregister_ip_vs_app(ipvs, NULL /* all */);
-	remove_proc_entry("ip_vs_app", net->proc_net);
+	remove_proc_entry("ip_vs_app", ipvs->net->proc_net);
 }
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index daf4cb746974..404b2a4f4b5b 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -3945,7 +3945,6 @@ static struct notifier_block ip_vs_dst_notifier = {
 
 int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
 {
-	struct net *net = ipvs->net;
 	int i, idx;
 
 	/* Initialize rs_table */
@@ -3972,9 +3971,9 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
 
 	spin_lock_init(&ipvs->tot_stats.lock);
 
-	proc_create("ip_vs", 0, net->proc_net, &ip_vs_info_fops);
-	proc_create("ip_vs_stats", 0, net->proc_net, &ip_vs_stats_fops);
-	proc_create("ip_vs_stats_percpu", 0, net->proc_net,
+	proc_create("ip_vs", 0, ipvs->net->proc_net, &ip_vs_info_fops);
+	proc_create("ip_vs_stats", 0, ipvs->net->proc_net, &ip_vs_stats_fops);
+	proc_create("ip_vs_stats_percpu", 0, ipvs->net->proc_net,
 		    &ip_vs_stats_percpu_fops);
 
 	if (ip_vs_control_net_init_sysctl(ipvs))
@@ -3989,13 +3988,11 @@ err:
 
 void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs)
 {
-	struct net *net = ipvs->net;
-
 	ip_vs_trash_cleanup(ipvs);
 	ip_vs_control_net_cleanup_sysctl(ipvs);
-	remove_proc_entry("ip_vs_stats_percpu", net->proc_net);
-	remove_proc_entry("ip_vs_stats", net->proc_net);
-	remove_proc_entry("ip_vs", net->proc_net);
+	remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);
+	remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);
+	remove_proc_entry("ip_vs", ipvs->net->proc_net);
 	free_percpu(ipvs->tot_stats.cpustats);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From e4999f256cdde4f2cbb8fc26fd61819cf4d95cf7 Mon Sep 17 00:00:00 2001
From: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
Date: Thu, 18 Feb 2016 02:29:30 +0000
Subject: vlan: change return type of vlan_proc_rem_dev

Since function vlan_proc_rem_dev() will only return 0, it's better to
return void instead of int.

Signed-off-by: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/8021q/vlanproc.c | 3 +--
 net/8021q/vlanproc.h | 4 ++--
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c
index ae63cf72a953..5f1446c9f098 100644
--- a/net/8021q/vlanproc.c
+++ b/net/8021q/vlanproc.c
@@ -184,12 +184,11 @@ int vlan_proc_add_dev(struct net_device *vlandev)
 /*
  *	Delete directory entry for VLAN device.
  */
-int vlan_proc_rem_dev(struct net_device *vlandev)
+void vlan_proc_rem_dev(struct net_device *vlandev)
 {
 	/** NOTE:  This will consume the memory pointed to by dent, it seems. */
 	proc_remove(vlan_dev_priv(vlandev)->dent);
 	vlan_dev_priv(vlandev)->dent = NULL;
-	return 0;
 }
 
 /****** Proc filesystem entry points ****************************************/
diff --git a/net/8021q/vlanproc.h b/net/8021q/vlanproc.h
index 063f60a3d5cc..8838a2e92eb6 100644
--- a/net/8021q/vlanproc.h
+++ b/net/8021q/vlanproc.h
@@ -5,7 +5,7 @@
 struct net;
 
 int vlan_proc_init(struct net *net);
-int vlan_proc_rem_dev(struct net_device *vlandev);
+void vlan_proc_rem_dev(struct net_device *vlandev);
 int vlan_proc_add_dev(struct net_device *vlandev);
 void vlan_proc_cleanup(struct net *net);
 
@@ -14,7 +14,7 @@ void vlan_proc_cleanup(struct net *net);
 #define vlan_proc_init(net)	(0)
 #define vlan_proc_cleanup(net)	do {} while (0)
 #define vlan_proc_add_dev(dev)	({(void)(dev), 0; })
-#define vlan_proc_rem_dev(dev)	({(void)(dev), 0; })
+#define vlan_proc_rem_dev(dev)	do {} while (0)
 #endif
 
 #endif /* !(__BEN_VLAN_PROC_INC__) */
-- 
cgit v1.2.3-59-g8ed1b


From fbbef866fce2d81aa1791d0c762afb07dbc4e660 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Mon, 15 Feb 2016 22:54:47 +0000
Subject: net-sysfs: remove unused fmt_long_hex

Ever since commit 04ed3e741d0f133e02bed7fa5c98edba128f90e7
("net: change netdev->features to u32") the format string
fmt_long_hex has not been used, so we may as well remove it.

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net-sysfs.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index da7dbc237a5f..4ae17c3166fc 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -29,7 +29,6 @@
 
 #ifdef CONFIG_SYSFS
 static const char fmt_hex[] = "%#x\n";
-static const char fmt_long_hex[] = "%#lx\n";
 static const char fmt_dec[] = "%d\n";
 static const char fmt_ulong[] = "%lu\n";
 static const char fmt_u64[] = "%llu\n";
-- 
cgit v1.2.3-59-g8ed1b


From 7fbac984f33a374a06fa7b1fc41127bdaf7d6c07 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Thu, 18 Feb 2016 14:01:46 +0100
Subject: bridge: switchdev: Offload VLAN flags to hardware bridge

When VLANs are created / destroyed on a VLAN filtering bridge (MASTER
flag set), the configuration is passed down to the hardware. However,
when only the flags (e.g. PVID) are toggled, the configuration is done
in the software bridge alone.

While it is possible to pass these flags to hardware when invoked with
the SELF flag set, this creates inconsistency with regards to the way
the VLANs are initially configured.

Pass the flags down to the hardware even when the VLAN already exists
and only the flags are toggled.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_vlan.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'net')

diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 85e43af4af7a..9309bb4f2a5b 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -955,6 +955,13 @@ err_rhtbl:
  */
 int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags)
 {
+	struct switchdev_obj_port_vlan v = {
+		.obj.orig_dev = port->dev,
+		.obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
+		.flags = flags,
+		.vid_begin = vid,
+		.vid_end = vid,
+	};
 	struct net_bridge_vlan *vlan;
 	int ret;
 
@@ -962,6 +969,10 @@ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags)
 
 	vlan = br_vlan_find(nbp_vlan_group(port), vid);
 	if (vlan) {
+		/* Pass the flags to the hardware bridge */
+		ret = switchdev_port_obj_add(port->dev, &v.obj);
+		if (ret && ret != -EOPNOTSUPP)
+			return ret;
 		__vlan_add_flags(vlan, flags);
 		return 0;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 7e6e18fbc033e00a4d4af3d4ea7bad0db6b7ad1b Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Thu, 18 Feb 2016 08:04:43 -0500
Subject: net_sched: Improve readability of filter processing

Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_api.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index b5c2cf2aa6d4..c9673b5eaf77 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1841,7 +1841,7 @@ reclassify:
 			return err;
 	}
 
-	return -1;
+	return TC_ACT_UNSPEC; /* signal: continue lookup */
 #ifdef CONFIG_NET_CLS_ACT
 reset:
 	if (unlikely(limit++ >= MAX_REC_LOOP)) {
-- 
cgit v1.2.3-59-g8ed1b


From d1b4c689d4130bcfd3532680b64db562300716b6 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 18 Feb 2016 15:03:24 +0100
Subject: netlink: remove mmapped netlink support

mmapped netlink has a number of unresolved issues:

- TX zerocopy support had to be disabled more than a year ago via
  commit 4682a0358639b29cf ("netlink: Always copy on mmap TX.")
  because the content of the mmapped area can change after netlink
  attribute validation but before message processing.

- RX support was implemented mainly to speed up nfqueue dumping packet
  payload to userspace.  However, since commit ae08ce0021087a5d812d2
  ("netfilter: nfnetlink_queue: zero copy support") we avoid one copy
  with the socket-based interface too (via the skb_zerocopy helper).

The other problem is that skbs attached to mmaped netlink socket
behave different from normal skbs:

- they don't have a shinfo area, so all functions that use skb_shinfo()
(e.g. skb_clone) cannot be used.

- reserving headroom prevents userspace from seeing the content as
it expects message to start at skb->head.
See for instance
commit aa3a022094fa ("netlink: not trim skb for mmaped socket when dump").

- skbs handed e.g. to netlink_ack must have non-NULL skb->sk, else we
crash because it needs the sk to check if a tx ring is attached.

Also not obvious, leads to non-intuitive bug fixes such as 7c7bdf359
("netfilter: nfnetlink: use original skbuff when acking batches").

mmaped netlink also didn't play nicely with the skb_zerocopy helper
used by nfqueue and openvswitch.  Daniel Borkmann fixed this via
commit 6bb0fef489f6 ("netlink, mmap: fix edge-case leakages in nf queue
zero-copy")' but at the cost of also needing to provide remaining
length to the allocation function.

nfqueue also has problems when used with mmaped rx netlink:
- mmaped netlink doesn't allow use of nfqueue batch verdict messages.
  Problem is that in the mmap case, the allocation time also determines
  the ordering in which the frame will be seen by userspace (A
  allocating before B means that A is located in earlier ring slot,
  but this also means that B might get a lower sequence number then A
  since seqno is decided later.  To fix this we would need to extend the
  spinlocked region to also cover the allocation and message setup which
  isn't desirable.
- nfqueue can now be configured to queue large (GSO) skbs to userspace.
  Queing GSO packets is faster than having to force a software segmentation
  in the kernel, so this is a desirable option.  However, with a mmap based
  ring one has to use 64kb per ring slot element, else mmap has to fall back
  to the socket path (NL_MMAP_STATUS_COPY) for all large packets.

To use the mmap interface, userspace not only has to probe for mmap netlink
support, it also has to implement a recv/socket receive path in order to
handle messages that exceed the size of an rx ring element.

Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Ken-ichirou MATSUZAWA <chamaken@gmail.com>
Cc: Pablo Neira Ayuso <pablo@netfilter.org>
Cc: Patrick McHardy <kaber@trash.net>
Cc: Thomas Graf <tgraf@suug.ch>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/netlink_mmap.txt | 332 -------------
 include/uapi/linux/netlink.h              |   4 +
 include/uapi/linux/netlink_diag.h         |   2 +
 net/netlink/Kconfig                       |   9 -
 net/netlink/af_netlink.c                  | 754 +-----------------------------
 net/netlink/af_netlink.h                  |  15 -
 net/netlink/diag.c                        |  39 --
 7 files changed, 15 insertions(+), 1140 deletions(-)
 delete mode 100644 Documentation/networking/netlink_mmap.txt

(limited to 'net')

diff --git a/Documentation/networking/netlink_mmap.txt b/Documentation/networking/netlink_mmap.txt
deleted file mode 100644
index 54f10478e8e3..000000000000
--- a/Documentation/networking/netlink_mmap.txt
+++ /dev/null
@@ -1,332 +0,0 @@
-This file documents how to use memory mapped I/O with netlink.
-
-Author: Patrick McHardy <kaber@trash.net>
-
-Overview
---------
-
-Memory mapped netlink I/O can be used to increase throughput and decrease
-overhead of unicast receive and transmit operations. Some netlink subsystems
-require high throughput, these are mainly the netfilter subsystems
-nfnetlink_queue and nfnetlink_log, but it can also help speed up large
-dump operations of f.i. the routing database.
-
-Memory mapped netlink I/O used two circular ring buffers for RX and TX which
-are mapped into the processes address space.
-
-The RX ring is used by the kernel to directly construct netlink messages into
-user-space memory without copying them as done with regular socket I/O,
-additionally as long as the ring contains messages no recvmsg() or poll()
-syscalls have to be issued by user-space to get more message.
-
-The TX ring is used to process messages directly from user-space memory, the
-kernel processes all messages contained in the ring using a single sendmsg()
-call.
-
-Usage overview
---------------
-
-In order to use memory mapped netlink I/O, user-space needs three main changes:
-
-- ring setup
-- conversion of the RX path to get messages from the ring instead of recvmsg()
-- conversion of the TX path to construct messages into the ring
-
-Ring setup is done using setsockopt() to provide the ring parameters to the
-kernel, then a call to mmap() to map the ring into the processes address space:
-
-- setsockopt(fd, SOL_NETLINK, NETLINK_RX_RING, &params, sizeof(params));
-- setsockopt(fd, SOL_NETLINK, NETLINK_TX_RING, &params, sizeof(params));
-- ring = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0)
-
-Usage of either ring is optional, but even if only the RX ring is used the
-mapping still needs to be writable in order to update the frame status after
-processing.
-
-Conversion of the reception path involves calling poll() on the file
-descriptor, once the socket is readable the frames from the ring are
-processed in order until no more messages are available, as indicated by
-a status word in the frame header.
-
-On kernel side, in order to make use of memory mapped I/O on receive, the
-originating netlink subsystem needs to support memory mapped I/O, otherwise
-it will use an allocated socket buffer as usual and the contents will be
- copied to the ring on transmission, nullifying most of the performance gains.
-Dumps of kernel databases automatically support memory mapped I/O.
-
-Conversion of the transmit path involves changing message construction to
-use memory from the TX ring instead of (usually) a buffer declared on the
-stack and setting up the frame header appropriately. Optionally poll() can
-be used to wait for free frames in the TX ring.
-
-Structured and definitions for using memory mapped I/O are contained in
-<linux/netlink.h>.
-
-RX and TX rings
-----------------
-
-Each ring contains a number of continuous memory blocks, containing frames of
-fixed size dependent on the parameters used for ring setup.
-
-Ring:	[ block 0 ]
-		[ frame 0 ]
-		[ frame 1 ]
-	[ block 1 ]
-		[ frame 2 ]
-		[ frame 3 ]
-	...
-	[ block n ]
-		[ frame 2 * n ]
-		[ frame 2 * n + 1 ]
-
-The blocks are only visible to the kernel, from the point of view of user-space
-the ring just contains the frames in a continuous memory zone.
-
-The ring parameters used for setting up the ring are defined as follows:
-
-struct nl_mmap_req {
-	unsigned int	nm_block_size;
-	unsigned int	nm_block_nr;
-	unsigned int	nm_frame_size;
-	unsigned int	nm_frame_nr;
-};
-
-Frames are grouped into blocks, where each block is a continuous region of memory
-and holds nm_block_size / nm_frame_size frames. The total number of frames in
-the ring is nm_frame_nr. The following invariants hold:
-
-- frames_per_block = nm_block_size / nm_frame_size
-
-- nm_frame_nr = frames_per_block * nm_block_nr
-
-Some parameters are constrained, specifically:
-
-- nm_block_size must be a multiple of the architectures memory page size.
-  The getpagesize() function can be used to get the page size.
-
-- nm_frame_size must be equal or larger to NL_MMAP_HDRLEN, IOW a frame must be
-  able to hold at least the frame header
-
-- nm_frame_size must be smaller or equal to nm_block_size
-
-- nm_frame_size must be a multiple of NL_MMAP_MSG_ALIGNMENT
-
-- nm_frame_nr must equal the actual number of frames as specified above.
-
-When the kernel can't allocate physically continuous memory for a ring block,
-it will fall back to use physically discontinuous memory. This might affect
-performance negatively, in order to avoid this the nm_frame_size parameter
-should be chosen to be as small as possible for the required frame size and
-the number of blocks should be increased instead.
-
-Ring frames
-------------
-
-Each frames contain a frame header, consisting of a synchronization word and some
-meta-data, and the message itself.
-
-Frame:	[ header message ]
-
-The frame header is defined as follows:
-
-struct nl_mmap_hdr {
-	unsigned int	nm_status;
-	unsigned int	nm_len;
-	__u32		nm_group;
-	/* credentials */
-	__u32		nm_pid;
-	__u32		nm_uid;
-	__u32		nm_gid;
-};
-
-- nm_status is used for synchronizing processing between the kernel and user-
-  space and specifies ownership of the frame as well as the operation to perform
-
-- nm_len contains the length of the message contained in the data area
-
-- nm_group specified the destination multicast group of message
-
-- nm_pid, nm_uid and nm_gid contain the netlink pid, UID and GID of the sending
-  process. These values correspond to the data available using SOCK_PASSCRED in
-  the SCM_CREDENTIALS cmsg.
-
-The possible values in the status word are:
-
-- NL_MMAP_STATUS_UNUSED:
-	RX ring:	frame belongs to the kernel and contains no message
-			for user-space. Approriate action is to invoke poll()
-			to wait for new messages.
-
-	TX ring:	frame belongs to user-space and can be used for
-			message construction.
-
-- NL_MMAP_STATUS_RESERVED:
-	RX ring only:	frame is currently used by the kernel for message
-			construction and contains no valid message yet.
-			Appropriate action is to invoke poll() to wait for
-			new messages.
-
-- NL_MMAP_STATUS_VALID:
-	RX ring:	frame contains a valid message. Approriate action is
-			to process the message and release the frame back to
-			the kernel by setting the status to
-			NL_MMAP_STATUS_UNUSED or queue the frame by setting the
-			status to NL_MMAP_STATUS_SKIP.
-
-	TX ring:	the frame contains a valid message from user-space to
-			be processed by the kernel. After completing processing
-			the kernel will release the frame back to user-space by
-			setting the status to NL_MMAP_STATUS_UNUSED.
-
-- NL_MMAP_STATUS_COPY:
-	RX ring only:	a message is ready to be processed but could not be
-			stored in the ring, either because it exceeded the
-			frame size or because the originating subsystem does
-			not support memory mapped I/O. Appropriate action is
-			to invoke recvmsg() to receive the message and release
-			the frame back to the kernel by setting the status to
-			NL_MMAP_STATUS_UNUSED.
-
-- NL_MMAP_STATUS_SKIP:
-	RX ring only:	user-space queued the message for later processing, but
-			processed some messages following it in the ring. The
-			kernel should skip this frame when looking for unused
-			frames.
-
-The data area of a frame begins at a offset of NL_MMAP_HDRLEN relative to the
-frame header.
-
-TX limitations
---------------
-
-As of Jan 2015 the message is always copied from the ring frame to an
-allocated buffer due to unresolved security concerns.
-See commit 4682a0358639b29cf ("netlink: Always copy on mmap TX.").
-
-Example
--------
-
-Ring setup:
-
-	unsigned int block_size = 16 * getpagesize();
-	struct nl_mmap_req req = {
-		.nm_block_size		= block_size,
-		.nm_block_nr		= 64,
-		.nm_frame_size		= 16384,
-		.nm_frame_nr		= 64 * block_size / 16384,
-	};
-	unsigned int ring_size;
-	void *rx_ring, *tx_ring;
-
-	/* Configure ring parameters */
-	if (setsockopt(fd, SOL_NETLINK, NETLINK_RX_RING, &req, sizeof(req)) < 0)
-		exit(1);
-	if (setsockopt(fd, SOL_NETLINK, NETLINK_TX_RING, &req, sizeof(req)) < 0)
-		exit(1)
-
-	/* Calculate size of each individual ring */
-	ring_size = req.nm_block_nr * req.nm_block_size;
-
-	/* Map RX/TX rings. The TX ring is located after the RX ring */
-	rx_ring = mmap(NULL, 2 * ring_size, PROT_READ | PROT_WRITE,
-		       MAP_SHARED, fd, 0);
-	if ((long)rx_ring == -1L)
-		exit(1);
-	tx_ring = rx_ring + ring_size:
-
-Message reception:
-
-This example assumes some ring parameters of the ring setup are available.
-
-	unsigned int frame_offset = 0;
-	struct nl_mmap_hdr *hdr;
-	struct nlmsghdr *nlh;
-	unsigned char buf[16384];
-	ssize_t len;
-
-	while (1) {
-		struct pollfd pfds[1];
-
-		pfds[0].fd	= fd;
-		pfds[0].events	= POLLIN | POLLERR;
-		pfds[0].revents	= 0;
-
-		if (poll(pfds, 1, -1) < 0 && errno != -EINTR)
-			exit(1);
-
-		/* Check for errors. Error handling omitted */
-		if (pfds[0].revents & POLLERR)
-			<handle error>
-
-		/* If no new messages, poll again */
-		if (!(pfds[0].revents & POLLIN))
-			continue;
-
-		/* Process all frames */
-		while (1) {
-			/* Get next frame header */
-			hdr = rx_ring + frame_offset;
-
-			if (hdr->nm_status == NL_MMAP_STATUS_VALID) {
-				/* Regular memory mapped frame */
-				nlh = (void *)hdr + NL_MMAP_HDRLEN;
-				len = hdr->nm_len;
-
-				/* Release empty message immediately. May happen
-				 * on error during message construction.
-				 */
-				if (len == 0)
-					goto release;
-			} else if (hdr->nm_status == NL_MMAP_STATUS_COPY) {
-				/* Frame queued to socket receive queue */
-				len = recv(fd, buf, sizeof(buf), MSG_DONTWAIT);
-				if (len <= 0)
-					break;
-				nlh = buf;
-			} else
-				/* No more messages to process, continue polling */
-				break;
-
-			process_msg(nlh);
-release:
-			/* Release frame back to the kernel */
-			hdr->nm_status = NL_MMAP_STATUS_UNUSED;
-
-			/* Advance frame offset to next frame */
-			frame_offset = (frame_offset + frame_size) % ring_size;
-		}
-	}
-
-Message transmission:
-
-This example assumes some ring parameters of the ring setup are available.
-A single message is constructed and transmitted, to send multiple messages
-at once they would be constructed in consecutive frames before a final call
-to sendto().
-
-	unsigned int frame_offset = 0;
-	struct nl_mmap_hdr *hdr;
-	struct nlmsghdr *nlh;
-	struct sockaddr_nl addr = {
-		.nl_family	= AF_NETLINK,
-	};
-
-	hdr = tx_ring + frame_offset;
-	if (hdr->nm_status != NL_MMAP_STATUS_UNUSED)
-		/* No frame available. Use poll() to avoid. */
-		exit(1);
-
-	nlh = (void *)hdr + NL_MMAP_HDRLEN;
-
-	/* Build message */
-	build_message(nlh);
-
-	/* Fill frame header: length and status need to be set */
-	hdr->nm_len	= nlh->nlmsg_len;
-	hdr->nm_status	= NL_MMAP_STATUS_VALID;
-
-	if (sendto(fd, NULL, 0, 0, &addr, sizeof(addr)) < 0)
-		exit(1);
-
-	/* Advance frame offset to next frame */
-	frame_offset = (frame_offset + frame_size) % ring_size;
diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h
index f095155d8749..0dba4e4ed2be 100644
--- a/include/uapi/linux/netlink.h
+++ b/include/uapi/linux/netlink.h
@@ -107,8 +107,10 @@ struct nlmsgerr {
 #define NETLINK_PKTINFO			3
 #define NETLINK_BROADCAST_ERROR		4
 #define NETLINK_NO_ENOBUFS		5
+#ifndef __KERNEL__
 #define NETLINK_RX_RING			6
 #define NETLINK_TX_RING			7
+#endif
 #define NETLINK_LISTEN_ALL_NSID		8
 #define NETLINK_LIST_MEMBERSHIPS	9
 #define NETLINK_CAP_ACK			10
@@ -134,6 +136,7 @@ struct nl_mmap_hdr {
 	__u32		nm_gid;
 };
 
+#ifndef __KERNEL__
 enum nl_mmap_status {
 	NL_MMAP_STATUS_UNUSED,
 	NL_MMAP_STATUS_RESERVED,
@@ -145,6 +148,7 @@ enum nl_mmap_status {
 #define NL_MMAP_MSG_ALIGNMENT		NLMSG_ALIGNTO
 #define NL_MMAP_MSG_ALIGN(sz)		__ALIGN_KERNEL(sz, NL_MMAP_MSG_ALIGNMENT)
 #define NL_MMAP_HDRLEN			NL_MMAP_MSG_ALIGN(sizeof(struct nl_mmap_hdr))
+#endif
 
 #define NET_MAJOR 36		/* Major 36 is reserved for networking 						*/
 
diff --git a/include/uapi/linux/netlink_diag.h b/include/uapi/linux/netlink_diag.h
index f2159d30d1f5..d79399394b46 100644
--- a/include/uapi/linux/netlink_diag.h
+++ b/include/uapi/linux/netlink_diag.h
@@ -48,6 +48,8 @@ enum {
 
 #define NDIAG_SHOW_MEMINFO	0x00000001 /* show memory info of a socket */
 #define NDIAG_SHOW_GROUPS	0x00000002 /* show groups of a netlink socket */
+#ifndef __KERNEL__
 #define NDIAG_SHOW_RING_CFG	0x00000004 /* show ring configuration */
+#endif
 
 #endif
diff --git a/net/netlink/Kconfig b/net/netlink/Kconfig
index 2c5e95e9bfbd..5d6e8c05b3d4 100644
--- a/net/netlink/Kconfig
+++ b/net/netlink/Kconfig
@@ -2,15 +2,6 @@
 # Netlink Sockets
 #
 
-config NETLINK_MMAP
-	bool "NETLINK: mmaped IO"
-	---help---
-	  This option enables support for memory mapped netlink IO. This
-	  reduces overhead by avoiding copying data between kernel- and
-	  userspace.
-
-	  If unsure, say N.
-
 config NETLINK_DIAG
 	tristate "NETLINK: socket monitoring interface"
 	default n
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index f1ffb34e253f..85aa6ef86dfd 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -225,7 +225,7 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb,
 
 	dev_hold(dev);
 
-	if (netlink_skb_is_mmaped(skb) || is_vmalloc_addr(skb->head))
+	if (is_vmalloc_addr(skb->head))
 		nskb = netlink_to_full_skb(skb, GFP_ATOMIC);
 	else
 		nskb = skb_clone(skb, GFP_ATOMIC);
@@ -300,610 +300,8 @@ static void netlink_rcv_wake(struct sock *sk)
 		wake_up_interruptible(&nlk->wait);
 }
 
-#ifdef CONFIG_NETLINK_MMAP
-static bool netlink_rx_is_mmaped(struct sock *sk)
-{
-	return nlk_sk(sk)->rx_ring.pg_vec != NULL;
-}
-
-static bool netlink_tx_is_mmaped(struct sock *sk)
-{
-	return nlk_sk(sk)->tx_ring.pg_vec != NULL;
-}
-
-static __pure struct page *pgvec_to_page(const void *addr)
-{
-	if (is_vmalloc_addr(addr))
-		return vmalloc_to_page(addr);
-	else
-		return virt_to_page(addr);
-}
-
-static void free_pg_vec(void **pg_vec, unsigned int order, unsigned int len)
-{
-	unsigned int i;
-
-	for (i = 0; i < len; i++) {
-		if (pg_vec[i] != NULL) {
-			if (is_vmalloc_addr(pg_vec[i]))
-				vfree(pg_vec[i]);
-			else
-				free_pages((unsigned long)pg_vec[i], order);
-		}
-	}
-	kfree(pg_vec);
-}
-
-static void *alloc_one_pg_vec_page(unsigned long order)
-{
-	void *buffer;
-	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO |
-			  __GFP_NOWARN | __GFP_NORETRY;
-
-	buffer = (void *)__get_free_pages(gfp_flags, order);
-	if (buffer != NULL)
-		return buffer;
-
-	buffer = vzalloc((1 << order) * PAGE_SIZE);
-	if (buffer != NULL)
-		return buffer;
-
-	gfp_flags &= ~__GFP_NORETRY;
-	return (void *)__get_free_pages(gfp_flags, order);
-}
-
-static void **alloc_pg_vec(struct netlink_sock *nlk,
-			   struct nl_mmap_req *req, unsigned int order)
-{
-	unsigned int block_nr = req->nm_block_nr;
-	unsigned int i;
-	void **pg_vec;
-
-	pg_vec = kcalloc(block_nr, sizeof(void *), GFP_KERNEL);
-	if (pg_vec == NULL)
-		return NULL;
-
-	for (i = 0; i < block_nr; i++) {
-		pg_vec[i] = alloc_one_pg_vec_page(order);
-		if (pg_vec[i] == NULL)
-			goto err1;
-	}
-
-	return pg_vec;
-err1:
-	free_pg_vec(pg_vec, order, block_nr);
-	return NULL;
-}
-
-
-static void
-__netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, bool tx_ring, void **pg_vec,
-		   unsigned int order)
-{
-	struct netlink_sock *nlk = nlk_sk(sk);
-	struct sk_buff_head *queue;
-	struct netlink_ring *ring;
-
-	queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
-	ring  = tx_ring ? &nlk->tx_ring : &nlk->rx_ring;
-
-	spin_lock_bh(&queue->lock);
-
-	ring->frame_max		= req->nm_frame_nr - 1;
-	ring->head		= 0;
-	ring->frame_size	= req->nm_frame_size;
-	ring->pg_vec_pages	= req->nm_block_size / PAGE_SIZE;
-
-	swap(ring->pg_vec_len, req->nm_block_nr);
-	swap(ring->pg_vec_order, order);
-	swap(ring->pg_vec, pg_vec);
-
-	__skb_queue_purge(queue);
-	spin_unlock_bh(&queue->lock);
-
-	WARN_ON(atomic_read(&nlk->mapped));
-
-	if (pg_vec)
-		free_pg_vec(pg_vec, order, req->nm_block_nr);
-}
-
-static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req,
-			    bool tx_ring)
-{
-	struct netlink_sock *nlk = nlk_sk(sk);
-	struct netlink_ring *ring;
-	void **pg_vec = NULL;
-	unsigned int order = 0;
-
-	ring  = tx_ring ? &nlk->tx_ring : &nlk->rx_ring;
-
-	if (atomic_read(&nlk->mapped))
-		return -EBUSY;
-	if (atomic_read(&ring->pending))
-		return -EBUSY;
-
-	if (req->nm_block_nr) {
-		if (ring->pg_vec != NULL)
-			return -EBUSY;
-
-		if ((int)req->nm_block_size <= 0)
-			return -EINVAL;
-		if (!PAGE_ALIGNED(req->nm_block_size))
-			return -EINVAL;
-		if (req->nm_frame_size < NL_MMAP_HDRLEN)
-			return -EINVAL;
-		if (!IS_ALIGNED(req->nm_frame_size, NL_MMAP_MSG_ALIGNMENT))
-			return -EINVAL;
-
-		ring->frames_per_block = req->nm_block_size /
-					 req->nm_frame_size;
-		if (ring->frames_per_block == 0)
-			return -EINVAL;
-		if (ring->frames_per_block * req->nm_block_nr !=
-		    req->nm_frame_nr)
-			return -EINVAL;
-
-		order = get_order(req->nm_block_size);
-		pg_vec = alloc_pg_vec(nlk, req, order);
-		if (pg_vec == NULL)
-			return -ENOMEM;
-	} else {
-		if (req->nm_frame_nr)
-			return -EINVAL;
-	}
-
-	mutex_lock(&nlk->pg_vec_lock);
-	if (atomic_read(&nlk->mapped) == 0) {
-		__netlink_set_ring(sk, req, tx_ring, pg_vec, order);
-		mutex_unlock(&nlk->pg_vec_lock);
-		return 0;
-	}
-
-	mutex_unlock(&nlk->pg_vec_lock);
-
-	if (pg_vec)
-		free_pg_vec(pg_vec, order, req->nm_block_nr);
-
-	return -EBUSY;
-}
-
-static void netlink_mm_open(struct vm_area_struct *vma)
-{
-	struct file *file = vma->vm_file;
-	struct socket *sock = file->private_data;
-	struct sock *sk = sock->sk;
-
-	if (sk)
-		atomic_inc(&nlk_sk(sk)->mapped);
-}
-
-static void netlink_mm_close(struct vm_area_struct *vma)
-{
-	struct file *file = vma->vm_file;
-	struct socket *sock = file->private_data;
-	struct sock *sk = sock->sk;
-
-	if (sk)
-		atomic_dec(&nlk_sk(sk)->mapped);
-}
-
-static const struct vm_operations_struct netlink_mmap_ops = {
-	.open	= netlink_mm_open,
-	.close	= netlink_mm_close,
-};
-
-static int netlink_mmap(struct file *file, struct socket *sock,
-			struct vm_area_struct *vma)
-{
-	struct sock *sk = sock->sk;
-	struct netlink_sock *nlk = nlk_sk(sk);
-	struct netlink_ring *ring;
-	unsigned long start, size, expected;
-	unsigned int i;
-	int err = -EINVAL;
-
-	if (vma->vm_pgoff)
-		return -EINVAL;
-
-	mutex_lock(&nlk->pg_vec_lock);
-
-	expected = 0;
-	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
-		if (ring->pg_vec == NULL)
-			continue;
-		expected += ring->pg_vec_len * ring->pg_vec_pages * PAGE_SIZE;
-	}
-
-	if (expected == 0)
-		goto out;
-
-	size = vma->vm_end - vma->vm_start;
-	if (size != expected)
-		goto out;
-
-	start = vma->vm_start;
-	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
-		if (ring->pg_vec == NULL)
-			continue;
-
-		for (i = 0; i < ring->pg_vec_len; i++) {
-			struct page *page;
-			void *kaddr = ring->pg_vec[i];
-			unsigned int pg_num;
-
-			for (pg_num = 0; pg_num < ring->pg_vec_pages; pg_num++) {
-				page = pgvec_to_page(kaddr);
-				err = vm_insert_page(vma, start, page);
-				if (err < 0)
-					goto out;
-				start += PAGE_SIZE;
-				kaddr += PAGE_SIZE;
-			}
-		}
-	}
-
-	atomic_inc(&nlk->mapped);
-	vma->vm_ops = &netlink_mmap_ops;
-	err = 0;
-out:
-	mutex_unlock(&nlk->pg_vec_lock);
-	return err;
-}
-
-static void netlink_frame_flush_dcache(const struct nl_mmap_hdr *hdr, unsigned int nm_len)
-{
-#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
-	struct page *p_start, *p_end;
-
-	/* First page is flushed through netlink_{get,set}_status */
-	p_start = pgvec_to_page(hdr + PAGE_SIZE);
-	p_end   = pgvec_to_page((void *)hdr + NL_MMAP_HDRLEN + nm_len - 1);
-	while (p_start <= p_end) {
-		flush_dcache_page(p_start);
-		p_start++;
-	}
-#endif
-}
-
-static enum nl_mmap_status netlink_get_status(const struct nl_mmap_hdr *hdr)
-{
-	smp_rmb();
-	flush_dcache_page(pgvec_to_page(hdr));
-	return hdr->nm_status;
-}
-
-static void netlink_set_status(struct nl_mmap_hdr *hdr,
-			       enum nl_mmap_status status)
-{
-	smp_mb();
-	hdr->nm_status = status;
-	flush_dcache_page(pgvec_to_page(hdr));
-}
-
-static struct nl_mmap_hdr *
-__netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos)
-{
-	unsigned int pg_vec_pos, frame_off;
-
-	pg_vec_pos = pos / ring->frames_per_block;
-	frame_off  = pos % ring->frames_per_block;
-
-	return ring->pg_vec[pg_vec_pos] + (frame_off * ring->frame_size);
-}
-
-static struct nl_mmap_hdr *
-netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos,
-		     enum nl_mmap_status status)
-{
-	struct nl_mmap_hdr *hdr;
-
-	hdr = __netlink_lookup_frame(ring, pos);
-	if (netlink_get_status(hdr) != status)
-		return NULL;
-
-	return hdr;
-}
-
-static struct nl_mmap_hdr *
-netlink_current_frame(const struct netlink_ring *ring,
-		      enum nl_mmap_status status)
-{
-	return netlink_lookup_frame(ring, ring->head, status);
-}
-
-static void netlink_increment_head(struct netlink_ring *ring)
-{
-	ring->head = ring->head != ring->frame_max ? ring->head + 1 : 0;
-}
-
-static void netlink_forward_ring(struct netlink_ring *ring)
-{
-	unsigned int head = ring->head;
-	const struct nl_mmap_hdr *hdr;
-
-	do {
-		hdr = __netlink_lookup_frame(ring, ring->head);
-		if (hdr->nm_status == NL_MMAP_STATUS_UNUSED)
-			break;
-		if (hdr->nm_status != NL_MMAP_STATUS_SKIP)
-			break;
-		netlink_increment_head(ring);
-	} while (ring->head != head);
-}
-
-static bool netlink_has_valid_frame(struct netlink_ring *ring)
-{
-	unsigned int head = ring->head, pos = head;
-	const struct nl_mmap_hdr *hdr;
-
-	do {
-		hdr = __netlink_lookup_frame(ring, pos);
-		if (hdr->nm_status == NL_MMAP_STATUS_VALID)
-			return true;
-		pos = pos != 0 ? pos - 1 : ring->frame_max;
-	} while (pos != head);
-
-	return false;
-}
-
-static bool netlink_dump_space(struct netlink_sock *nlk)
-{
-	struct netlink_ring *ring = &nlk->rx_ring;
-	struct nl_mmap_hdr *hdr;
-	unsigned int n;
-
-	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
-	if (hdr == NULL)
-		return false;
-
-	n = ring->head + ring->frame_max / 2;
-	if (n > ring->frame_max)
-		n -= ring->frame_max;
-
-	hdr = __netlink_lookup_frame(ring, n);
-
-	return hdr->nm_status == NL_MMAP_STATUS_UNUSED;
-}
-
-static unsigned int netlink_poll(struct file *file, struct socket *sock,
-				 poll_table *wait)
-{
-	struct sock *sk = sock->sk;
-	struct netlink_sock *nlk = nlk_sk(sk);
-	unsigned int mask;
-	int err;
-
-	if (nlk->rx_ring.pg_vec != NULL) {
-		/* Memory mapped sockets don't call recvmsg(), so flow control
-		 * for dumps is performed here. A dump is allowed to continue
-		 * if at least half the ring is unused.
-		 */
-		while (nlk->cb_running && netlink_dump_space(nlk)) {
-			err = netlink_dump(sk);
-			if (err < 0) {
-				sk->sk_err = -err;
-				sk->sk_error_report(sk);
-				break;
-			}
-		}
-		netlink_rcv_wake(sk);
-	}
-
-	mask = datagram_poll(file, sock, wait);
-
-	/* We could already have received frames in the normal receive
-	 * queue, that will show up as NL_MMAP_STATUS_COPY in the ring,
-	 * so if mask contains pollin/etc already, there's no point
-	 * walking the ring.
-	 */
-	if ((mask & (POLLIN | POLLRDNORM)) != (POLLIN | POLLRDNORM)) {
-		spin_lock_bh(&sk->sk_receive_queue.lock);
-		if (nlk->rx_ring.pg_vec) {
-			if (netlink_has_valid_frame(&nlk->rx_ring))
-				mask |= POLLIN | POLLRDNORM;
-		}
-		spin_unlock_bh(&sk->sk_receive_queue.lock);
-	}
-
-	spin_lock_bh(&sk->sk_write_queue.lock);
-	if (nlk->tx_ring.pg_vec) {
-		if (netlink_current_frame(&nlk->tx_ring, NL_MMAP_STATUS_UNUSED))
-			mask |= POLLOUT | POLLWRNORM;
-	}
-	spin_unlock_bh(&sk->sk_write_queue.lock);
-
-	return mask;
-}
-
-static struct nl_mmap_hdr *netlink_mmap_hdr(struct sk_buff *skb)
-{
-	return (struct nl_mmap_hdr *)(skb->head - NL_MMAP_HDRLEN);
-}
-
-static void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk,
-				   struct netlink_ring *ring,
-				   struct nl_mmap_hdr *hdr)
-{
-	unsigned int size;
-	void *data;
-
-	size = ring->frame_size - NL_MMAP_HDRLEN;
-	data = (void *)hdr + NL_MMAP_HDRLEN;
-
-	skb->head	= data;
-	skb->data	= data;
-	skb_reset_tail_pointer(skb);
-	skb->end	= skb->tail + size;
-	skb->len	= 0;
-
-	skb->destructor	= netlink_skb_destructor;
-	NETLINK_CB(skb).flags |= NETLINK_SKB_MMAPED;
-	NETLINK_CB(skb).sk = sk;
-}
-
-static int netlink_mmap_sendmsg(struct sock *sk, struct msghdr *msg,
-				u32 dst_portid, u32 dst_group,
-				struct scm_cookie *scm)
-{
-	struct netlink_sock *nlk = nlk_sk(sk);
-	struct netlink_ring *ring;
-	struct nl_mmap_hdr *hdr;
-	struct sk_buff *skb;
-	unsigned int maxlen;
-	int err = 0, len = 0;
-
-	mutex_lock(&nlk->pg_vec_lock);
-
-	ring   = &nlk->tx_ring;
-	maxlen = ring->frame_size - NL_MMAP_HDRLEN;
-
-	do {
-		unsigned int nm_len;
-
-		hdr = netlink_current_frame(ring, NL_MMAP_STATUS_VALID);
-		if (hdr == NULL) {
-			if (!(msg->msg_flags & MSG_DONTWAIT) &&
-			    atomic_read(&nlk->tx_ring.pending))
-				schedule();
-			continue;
-		}
-
-		nm_len = ACCESS_ONCE(hdr->nm_len);
-		if (nm_len > maxlen) {
-			err = -EINVAL;
-			goto out;
-		}
-
-		netlink_frame_flush_dcache(hdr, nm_len);
-
-		skb = alloc_skb(nm_len, GFP_KERNEL);
-		if (skb == NULL) {
-			err = -ENOBUFS;
-			goto out;
-		}
-		__skb_put(skb, nm_len);
-		memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, nm_len);
-		netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
-
-		netlink_increment_head(ring);
-
-		NETLINK_CB(skb).portid	  = nlk->portid;
-		NETLINK_CB(skb).dst_group = dst_group;
-		NETLINK_CB(skb).creds	  = scm->creds;
-
-		err = security_netlink_send(sk, skb);
-		if (err) {
-			kfree_skb(skb);
-			goto out;
-		}
-
-		if (unlikely(dst_group)) {
-			atomic_inc(&skb->users);
-			netlink_broadcast(sk, skb, dst_portid, dst_group,
-					  GFP_KERNEL);
-		}
-		err = netlink_unicast(sk, skb, dst_portid,
-				      msg->msg_flags & MSG_DONTWAIT);
-		if (err < 0)
-			goto out;
-		len += err;
-
-	} while (hdr != NULL ||
-		 (!(msg->msg_flags & MSG_DONTWAIT) &&
-		  atomic_read(&nlk->tx_ring.pending)));
-
-	if (len > 0)
-		err = len;
-out:
-	mutex_unlock(&nlk->pg_vec_lock);
-	return err;
-}
-
-static void netlink_queue_mmaped_skb(struct sock *sk, struct sk_buff *skb)
-{
-	struct nl_mmap_hdr *hdr;
-
-	hdr = netlink_mmap_hdr(skb);
-	hdr->nm_len	= skb->len;
-	hdr->nm_group	= NETLINK_CB(skb).dst_group;
-	hdr->nm_pid	= NETLINK_CB(skb).creds.pid;
-	hdr->nm_uid	= from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid);
-	hdr->nm_gid	= from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid);
-	netlink_frame_flush_dcache(hdr, hdr->nm_len);
-	netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
-
-	NETLINK_CB(skb).flags |= NETLINK_SKB_DELIVERED;
-	kfree_skb(skb);
-}
-
-static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb)
-{
-	struct netlink_sock *nlk = nlk_sk(sk);
-	struct netlink_ring *ring = &nlk->rx_ring;
-	struct nl_mmap_hdr *hdr;
-
-	spin_lock_bh(&sk->sk_receive_queue.lock);
-	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
-	if (hdr == NULL) {
-		spin_unlock_bh(&sk->sk_receive_queue.lock);
-		kfree_skb(skb);
-		netlink_overrun(sk);
-		return;
-	}
-	netlink_increment_head(ring);
-	__skb_queue_tail(&sk->sk_receive_queue, skb);
-	spin_unlock_bh(&sk->sk_receive_queue.lock);
-
-	hdr->nm_len	= skb->len;
-	hdr->nm_group	= NETLINK_CB(skb).dst_group;
-	hdr->nm_pid	= NETLINK_CB(skb).creds.pid;
-	hdr->nm_uid	= from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid);
-	hdr->nm_gid	= from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid);
-	netlink_set_status(hdr, NL_MMAP_STATUS_COPY);
-}
-
-#else /* CONFIG_NETLINK_MMAP */
-#define netlink_rx_is_mmaped(sk)	false
-#define netlink_tx_is_mmaped(sk)	false
-#define netlink_mmap			sock_no_mmap
-#define netlink_poll			datagram_poll
-#define netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, scm)	0
-#endif /* CONFIG_NETLINK_MMAP */
-
 static void netlink_skb_destructor(struct sk_buff *skb)
 {
-#ifdef CONFIG_NETLINK_MMAP
-	struct nl_mmap_hdr *hdr;
-	struct netlink_ring *ring;
-	struct sock *sk;
-
-	/* If a packet from the kernel to userspace was freed because of an
-	 * error without being delivered to userspace, the kernel must reset
-	 * the status. In the direction userspace to kernel, the status is
-	 * always reset here after the packet was processed and freed.
-	 */
-	if (netlink_skb_is_mmaped(skb)) {
-		hdr = netlink_mmap_hdr(skb);
-		sk = NETLINK_CB(skb).sk;
-
-		if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) {
-			netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
-			ring = &nlk_sk(sk)->tx_ring;
-		} else {
-			if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) {
-				hdr->nm_len = 0;
-				netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
-			}
-			ring = &nlk_sk(sk)->rx_ring;
-		}
-
-		WARN_ON(atomic_read(&ring->pending) == 0);
-		atomic_dec(&ring->pending);
-		sock_put(sk);
-
-		skb->head = NULL;
-	}
-#endif
 	if (is_vmalloc_addr(skb->head)) {
 		if (!skb->cloned ||
 		    !atomic_dec_return(&(skb_shinfo(skb)->dataref)))
@@ -937,18 +335,6 @@ static void netlink_sock_destruct(struct sock *sk)
 	}
 
 	skb_queue_purge(&sk->sk_receive_queue);
-#ifdef CONFIG_NETLINK_MMAP
-	if (1) {
-		struct nl_mmap_req req;
-
-		memset(&req, 0, sizeof(req));
-		if (nlk->rx_ring.pg_vec)
-			__netlink_set_ring(sk, &req, false, NULL, 0);
-		memset(&req, 0, sizeof(req));
-		if (nlk->tx_ring.pg_vec)
-			__netlink_set_ring(sk, &req, true, NULL, 0);
-	}
-#endif /* CONFIG_NETLINK_MMAP */
 
 	if (!sock_flag(sk, SOCK_DEAD)) {
 		printk(KERN_ERR "Freeing alive netlink socket %p\n", sk);
@@ -1194,9 +580,6 @@ static int __netlink_create(struct net *net, struct socket *sock,
 		mutex_init(nlk->cb_mutex);
 	}
 	init_waitqueue_head(&nlk->wait);
-#ifdef CONFIG_NETLINK_MMAP
-	mutex_init(&nlk->pg_vec_lock);
-#endif
 
 	sk->sk_destruct = netlink_sock_destruct;
 	sk->sk_protocol = protocol;
@@ -1728,8 +1111,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
 	nlk = nlk_sk(sk);
 
 	if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
-	     test_bit(NETLINK_S_CONGESTED, &nlk->state)) &&
-	    !netlink_skb_is_mmaped(skb)) {
+	     test_bit(NETLINK_S_CONGESTED, &nlk->state))) {
 		DECLARE_WAITQUEUE(wait, current);
 		if (!*timeo) {
 			if (!ssk || netlink_is_kernel(ssk))
@@ -1767,14 +1149,7 @@ static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb)
 
 	netlink_deliver_tap(skb);
 
-#ifdef CONFIG_NETLINK_MMAP
-	if (netlink_skb_is_mmaped(skb))
-		netlink_queue_mmaped_skb(sk, skb);
-	else if (netlink_rx_is_mmaped(sk))
-		netlink_ring_set_copied(sk, skb);
-	else
-#endif /* CONFIG_NETLINK_MMAP */
-		skb_queue_tail(&sk->sk_receive_queue, skb);
+	skb_queue_tail(&sk->sk_receive_queue, skb);
 	sk->sk_data_ready(sk);
 	return len;
 }
@@ -1798,9 +1173,6 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
 	int delta;
 
 	WARN_ON(skb->sk != NULL);
-	if (netlink_skb_is_mmaped(skb))
-		return skb;
-
 	delta = skb->end - skb->tail;
 	if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize)
 		return skb;
@@ -1880,71 +1252,6 @@ struct sk_buff *__netlink_alloc_skb(struct sock *ssk, unsigned int size,
 				    unsigned int ldiff, u32 dst_portid,
 				    gfp_t gfp_mask)
 {
-#ifdef CONFIG_NETLINK_MMAP
-	unsigned int maxlen, linear_size;
-	struct sock *sk = NULL;
-	struct sk_buff *skb;
-	struct netlink_ring *ring;
-	struct nl_mmap_hdr *hdr;
-
-	sk = netlink_getsockbyportid(ssk, dst_portid);
-	if (IS_ERR(sk))
-		goto out;
-
-	ring = &nlk_sk(sk)->rx_ring;
-	/* fast-path without atomic ops for common case: non-mmaped receiver */
-	if (ring->pg_vec == NULL)
-		goto out_put;
-
-	/* We need to account the full linear size needed as a ring
-	 * slot cannot have non-linear parts.
-	 */
-	linear_size = size + ldiff;
-	if (ring->frame_size - NL_MMAP_HDRLEN < linear_size)
-		goto out_put;
-
-	skb = alloc_skb_head(gfp_mask);
-	if (skb == NULL)
-		goto err1;
-
-	spin_lock_bh(&sk->sk_receive_queue.lock);
-	/* check again under lock */
-	if (ring->pg_vec == NULL)
-		goto out_free;
-
-	/* check again under lock */
-	maxlen = ring->frame_size - NL_MMAP_HDRLEN;
-	if (maxlen < linear_size)
-		goto out_free;
-
-	netlink_forward_ring(ring);
-	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
-	if (hdr == NULL)
-		goto err2;
-
-	netlink_ring_setup_skb(skb, sk, ring, hdr);
-	netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED);
-	atomic_inc(&ring->pending);
-	netlink_increment_head(ring);
-
-	spin_unlock_bh(&sk->sk_receive_queue.lock);
-	return skb;
-
-err2:
-	kfree_skb(skb);
-	spin_unlock_bh(&sk->sk_receive_queue.lock);
-	netlink_overrun(sk);
-err1:
-	sock_put(sk);
-	return NULL;
-
-out_free:
-	kfree_skb(skb);
-	spin_unlock_bh(&sk->sk_receive_queue.lock);
-out_put:
-	sock_put(sk);
-out:
-#endif
 	return alloc_skb(size, gfp_mask);
 }
 EXPORT_SYMBOL_GPL(__netlink_alloc_skb);
@@ -2225,8 +1532,7 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 	if (level != SOL_NETLINK)
 		return -ENOPROTOOPT;
 
-	if (optname != NETLINK_RX_RING && optname != NETLINK_TX_RING &&
-	    optlen >= sizeof(int) &&
+	if (optlen >= sizeof(int) &&
 	    get_user(val, (unsigned int __user *)optval))
 		return -EFAULT;
 
@@ -2279,25 +1585,6 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 		}
 		err = 0;
 		break;
-#ifdef CONFIG_NETLINK_MMAP
-	case NETLINK_RX_RING:
-	case NETLINK_TX_RING: {
-		struct nl_mmap_req req;
-
-		/* Rings might consume more memory than queue limits, require
-		 * CAP_NET_ADMIN.
-		 */
-		if (!capable(CAP_NET_ADMIN))
-			return -EPERM;
-		if (optlen < sizeof(req))
-			return -EINVAL;
-		if (copy_from_user(&req, optval, sizeof(req)))
-			return -EFAULT;
-		err = netlink_set_ring(sk, &req,
-				       optname == NETLINK_TX_RING);
-		break;
-	}
-#endif /* CONFIG_NETLINK_MMAP */
 	case NETLINK_LISTEN_ALL_NSID:
 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST))
 			return -EPERM;
@@ -2467,18 +1754,6 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 		smp_rmb();
 	}
 
-	/* It's a really convoluted way for userland to ask for mmaped
-	 * sendmsg(), but that's what we've got...
-	 */
-	if (netlink_tx_is_mmaped(sk) &&
-	    iter_is_iovec(&msg->msg_iter) &&
-	    msg->msg_iter.nr_segs == 1 &&
-	    msg->msg_iter.iov->iov_base == NULL) {
-		err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group,
-					   &scm);
-		goto out;
-	}
-
 	err = -EMSGSIZE;
 	if (len > sk->sk_sndbuf - 32)
 		goto out;
@@ -2794,8 +2069,7 @@ static int netlink_dump(struct sock *sk)
 		goto errout_skb;
 	}
 
-	if (!netlink_rx_is_mmaped(sk) &&
-	    atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
 		goto errout_skb;
 
 	/* NLMSG_GOODSIZE is small to avoid high order allocations being
@@ -2831,8 +2105,7 @@ static int netlink_dump(struct sock *sk)
 	 * reasonable static buffer based on the expected largest dump of a
 	 * single netdev. The outcome is MSG_TRUNC error.
 	 */
-	if (!netlink_rx_is_mmaped(sk))
-		skb_reserve(skb, skb_tailroom(skb) - alloc_size);
+	skb_reserve(skb, skb_tailroom(skb) - alloc_size);
 	netlink_skb_set_owner_r(skb, sk);
 
 	len = cb->dump(skb, cb);
@@ -2884,16 +2157,7 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
 	struct netlink_sock *nlk;
 	int ret;
 
-	/* Memory mapped dump requests need to be copied to avoid looping
-	 * on the pending state in netlink_mmap_sendmsg() while the CB hold
-	 * a reference to the skb.
-	 */
-	if (netlink_skb_is_mmaped(skb)) {
-		skb = skb_copy(skb, GFP_KERNEL);
-		if (skb == NULL)
-			return -ENOBUFS;
-	} else
-		atomic_inc(&skb->users);
+	atomic_inc(&skb->users);
 
 	sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid);
 	if (sk == NULL) {
@@ -3241,7 +2505,7 @@ static const struct proto_ops netlink_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	sock_no_accept,
 	.getname =	netlink_getname,
-	.poll =		netlink_poll,
+	.poll =		datagram_poll,
 	.ioctl =	sock_no_ioctl,
 	.listen =	sock_no_listen,
 	.shutdown =	sock_no_shutdown,
@@ -3249,7 +2513,7 @@ static const struct proto_ops netlink_ops = {
 	.getsockopt =	netlink_getsockopt,
 	.sendmsg =	netlink_sendmsg,
 	.recvmsg =	netlink_recvmsg,
-	.mmap =		netlink_mmap,
+	.mmap =		sock_no_mmap,
 	.sendpage =	sock_no_sendpage,
 };
 
diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h
index 14437d9b1965..e68ef9ccd703 100644
--- a/net/netlink/af_netlink.h
+++ b/net/netlink/af_netlink.h
@@ -44,12 +44,6 @@ struct netlink_sock {
 	int			(*netlink_bind)(struct net *net, int group);
 	void			(*netlink_unbind)(struct net *net, int group);
 	struct module		*module;
-#ifdef CONFIG_NETLINK_MMAP
-	struct mutex		pg_vec_lock;
-	struct netlink_ring	rx_ring;
-	struct netlink_ring	tx_ring;
-	atomic_t		mapped;
-#endif /* CONFIG_NETLINK_MMAP */
 
 	struct rhash_head	node;
 	struct rcu_head		rcu;
@@ -60,15 +54,6 @@ static inline struct netlink_sock *nlk_sk(struct sock *sk)
 	return container_of(sk, struct netlink_sock, sk);
 }
 
-static inline bool netlink_skb_is_mmaped(const struct sk_buff *skb)
-{
-#ifdef CONFIG_NETLINK_MMAP
-	return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED;
-#else
-	return false;
-#endif /* CONFIG_NETLINK_MMAP */
-}
-
 struct netlink_table {
 	struct rhashtable	hash;
 	struct hlist_head	mc_list;
diff --git a/net/netlink/diag.c b/net/netlink/diag.c
index 3ee63a3cff30..8dd836a8dd60 100644
--- a/net/netlink/diag.c
+++ b/net/netlink/diag.c
@@ -8,41 +8,6 @@
 
 #include "af_netlink.h"
 
-#ifdef CONFIG_NETLINK_MMAP
-static int sk_diag_put_ring(struct netlink_ring *ring, int nl_type,
-			    struct sk_buff *nlskb)
-{
-	struct netlink_diag_ring ndr;
-
-	ndr.ndr_block_size = ring->pg_vec_pages << PAGE_SHIFT;
-	ndr.ndr_block_nr   = ring->pg_vec_len;
-	ndr.ndr_frame_size = ring->frame_size;
-	ndr.ndr_frame_nr   = ring->frame_max + 1;
-
-	return nla_put(nlskb, nl_type, sizeof(ndr), &ndr);
-}
-
-static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb)
-{
-	struct netlink_sock *nlk = nlk_sk(sk);
-	int ret;
-
-	mutex_lock(&nlk->pg_vec_lock);
-	ret = sk_diag_put_ring(&nlk->rx_ring, NETLINK_DIAG_RX_RING, nlskb);
-	if (!ret)
-		ret = sk_diag_put_ring(&nlk->tx_ring, NETLINK_DIAG_TX_RING,
-				       nlskb);
-	mutex_unlock(&nlk->pg_vec_lock);
-
-	return ret;
-}
-#else
-static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb)
-{
-	return 0;
-}
-#endif
-
 static int sk_diag_dump_groups(struct sock *sk, struct sk_buff *nlskb)
 {
 	struct netlink_sock *nlk = nlk_sk(sk);
@@ -87,10 +52,6 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
 	    sock_diag_put_meminfo(sk, skb, NETLINK_DIAG_MEMINFO))
 		goto out_nlmsg_trim;
 
-	if ((req->ndiag_show & NDIAG_SHOW_RING_CFG) &&
-	    sk_diag_put_rings_cfg(sk, skb))
-		goto out_nlmsg_trim;
-
 	nlmsg_end(skb, nlh);
 	return 0;
 
-- 
cgit v1.2.3-59-g8ed1b


From 551ddc057e290cc07ff900050da242f35d615d3c Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 18 Feb 2016 15:03:25 +0100
Subject: openvswitch: Revert: "Enable memory mapped Netlink i/o"

revert commit 795449d8b846 ("openvswitch: Enable memory mapped Netlink i/o").
Following the mmaped netlink removal this code can be removed.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/datapath.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index d6f7fe92744a..35a2659a277e 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -422,10 +422,6 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
 	struct sk_buff *nskb = NULL;
 	struct sk_buff *user_skb = NULL; /* to be queued to userspace */
 	struct nlattr *nla;
-	struct genl_info info = {
-		.dst_sk = ovs_dp_get_net(dp)->genl_sock,
-		.snd_portid = upcall_info->portid,
-	};
 	size_t len;
 	unsigned int hlen;
 	int err, dp_ifindex;
@@ -466,7 +462,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
 		hlen = skb->len;
 
 	len = upcall_msg_size(upcall_info, hlen);
-	user_skb = genlmsg_new_unicast(len, &info, GFP_ATOMIC);
+	user_skb = genlmsg_new(len, GFP_ATOMIC);
 	if (!user_skb) {
 		err = -ENOMEM;
 		goto out;
@@ -876,7 +872,7 @@ static struct sk_buff *ovs_flow_cmd_alloc_info(const struct sw_flow_actions *act
 		return NULL;
 
 	len = ovs_flow_cmd_msg_size(acts, sfid, ufid_flags);
-	skb = genlmsg_new_unicast(len, info, GFP_KERNEL);
+	skb = genlmsg_new(len, GFP_KERNEL);
 	if (!skb)
 		return ERR_PTR(-ENOMEM);
 
@@ -1483,7 +1479,7 @@ error:
 
 static struct sk_buff *ovs_dp_cmd_alloc_info(struct genl_info *info)
 {
-	return genlmsg_new_unicast(ovs_dp_cmd_msg_size(), info, GFP_KERNEL);
+	return genlmsg_new(ovs_dp_cmd_msg_size(), GFP_KERNEL);
 }
 
 /* Called with rcu_read_lock or ovs_mutex. */
-- 
cgit v1.2.3-59-g8ed1b


From 263ea09084d172cac6e40459a690babe8de8e448 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 18 Feb 2016 15:03:26 +0100
Subject: Revert "genl: Add genlmsg_new_unicast() for unicast message
 allocation"

This reverts commit bb9b18fb55b0 ("genl: Add genlmsg_new_unicast() for
unicast message allocation")'.

Nothing wrong with it; its no longer needed since this was only for
mmapped netlink support.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/genetlink.h    |  4 ----
 net/netlink/genetlink.c    | 21 ---------------------
 net/openvswitch/datapath.c | 10 +++++-----
 net/tipc/netlink_compat.c  |  1 -
 4 files changed, 5 insertions(+), 31 deletions(-)

(limited to 'net')

diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 43c0e771f417..8d4608ce8716 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -83,7 +83,6 @@ struct genl_family {
  * @attrs: netlink attributes
  * @_net: network namespace
  * @user_ptr: user pointers
- * @dst_sk: destination socket
  */
 struct genl_info {
 	u32			snd_seq;
@@ -94,7 +93,6 @@ struct genl_info {
 	struct nlattr **	attrs;
 	possible_net_t		_net;
 	void *			user_ptr[2];
-	struct sock *		dst_sk;
 };
 
 static inline struct net *genl_info_net(struct genl_info *info)
@@ -188,8 +186,6 @@ int genl_unregister_family(struct genl_family *family);
 void genl_notify(struct genl_family *family, struct sk_buff *skb,
 		 struct genl_info *info, u32 group, gfp_t flags);
 
-struct sk_buff *genlmsg_new_unicast(size_t payload, struct genl_info *info,
-				    gfp_t flags);
 void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq,
 		  struct genl_family *family, int flags, u8 cmd);
 
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 0ffd721126e7..a09132a69869 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -462,26 +462,6 @@ int genl_unregister_family(struct genl_family *family)
 }
 EXPORT_SYMBOL(genl_unregister_family);
 
-/**
- * genlmsg_new_unicast - Allocate generic netlink message for unicast
- * @payload: size of the message payload
- * @info: information on destination
- * @flags: the type of memory to allocate
- *
- * Allocates a new sk_buff large enough to cover the specified payload
- * plus required Netlink headers. Will check receiving socket for
- * memory mapped i/o capability and use it if enabled. Will fall back
- * to non-mapped skb if message size exceeds the frame size of the ring.
- */
-struct sk_buff *genlmsg_new_unicast(size_t payload, struct genl_info *info,
-				    gfp_t flags)
-{
-	size_t len = nlmsg_total_size(genlmsg_total_size(payload));
-
-	return netlink_alloc_skb(info->dst_sk, len, info->snd_portid, flags);
-}
-EXPORT_SYMBOL_GPL(genlmsg_new_unicast);
-
 /**
  * genlmsg_put - Add generic netlink header to netlink message
  * @skb: socket buffer holding the message
@@ -642,7 +622,6 @@ static int genl_family_rcv_msg(struct genl_family *family,
 	info.genlhdr = nlmsg_data(nlh);
 	info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN;
 	info.attrs = attrbuf;
-	info.dst_sk = skb->sk;
 	genl_info_net_set(&info, net);
 	memset(&info.user_ptr, 0, sizeof(info.user_ptr));
 
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 35a2659a277e..c4e8455d5d56 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1477,7 +1477,7 @@ error:
 	return -EMSGSIZE;
 }
 
-static struct sk_buff *ovs_dp_cmd_alloc_info(struct genl_info *info)
+static struct sk_buff *ovs_dp_cmd_alloc_info(void)
 {
 	return genlmsg_new(ovs_dp_cmd_msg_size(), GFP_KERNEL);
 }
@@ -1532,7 +1532,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
 	if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID])
 		goto err;
 
-	reply = ovs_dp_cmd_alloc_info(info);
+	reply = ovs_dp_cmd_alloc_info();
 	if (!reply)
 		return -ENOMEM;
 
@@ -1653,7 +1653,7 @@ static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info)
 	struct datapath *dp;
 	int err;
 
-	reply = ovs_dp_cmd_alloc_info(info);
+	reply = ovs_dp_cmd_alloc_info();
 	if (!reply)
 		return -ENOMEM;
 
@@ -1686,7 +1686,7 @@ static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info)
 	struct datapath *dp;
 	int err;
 
-	reply = ovs_dp_cmd_alloc_info(info);
+	reply = ovs_dp_cmd_alloc_info();
 	if (!reply)
 		return -ENOMEM;
 
@@ -1719,7 +1719,7 @@ static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info)
 	struct datapath *dp;
 	int err;
 
-	reply = ovs_dp_cmd_alloc_info(info);
+	reply = ovs_dp_cmd_alloc_info();
 	if (!reply)
 		return -ENOMEM;
 
diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
index 2c016fdefe97..de66d8f945ed 100644
--- a/net/tipc/netlink_compat.c
+++ b/net/tipc/netlink_compat.c
@@ -1104,7 +1104,6 @@ static int tipc_nl_compat_recv(struct sk_buff *skb, struct genl_info *info)
 	req_nlh = (struct nlmsghdr *)skb->data;
 	msg.req = nlmsg_data(req_nlh) + GENL_HDRLEN + TIPC_GENL_HDRLEN;
 	msg.cmd = req_userhdr->cmd;
-	msg.dst_sk = info->dst_sk;
 	msg.net = genl_info_net(info);
 
 	if ((msg.cmd & 0xC000) && (!netlink_net_capable(skb, CAP_NET_ADMIN))) {
-- 
cgit v1.2.3-59-g8ed1b


From 905f0a739ad82c6371fb0cb0e71db14a750702ad Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 18 Feb 2016 15:03:27 +0100
Subject: nfnetlink: remove nfnetlink_alloc_skb

Following mmapped netlink removal this code can be simplified by
removing the alloc wrapper.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/nfnetlink.h | 2 --
 net/netfilter/nfnetlink.c           | 7 -------
 net/netfilter/nfnetlink_log.c       | 5 ++---
 3 files changed, 2 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
index ba0d9789eb6e..1d82dd5e9a08 100644
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -34,8 +34,6 @@ int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n);
 int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n);
 
 int nfnetlink_has_listeners(struct net *net, unsigned int group);
-struct sk_buff *nfnetlink_alloc_skb(struct net *net, unsigned int size,
-				    u32 dst_portid, gfp_t gfp_mask);
 int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid,
 		   unsigned int group, int echo, gfp_t flags);
 int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error);
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index a7ba23353dab..9a99f686d06f 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -127,13 +127,6 @@ int nfnetlink_has_listeners(struct net *net, unsigned int group)
 }
 EXPORT_SYMBOL_GPL(nfnetlink_has_listeners);
 
-struct sk_buff *nfnetlink_alloc_skb(struct net *net, unsigned int size,
-				    u32 dst_portid, gfp_t gfp_mask)
-{
-	return netlink_alloc_skb(net->nfnl, size, dst_portid, gfp_mask);
-}
-EXPORT_SYMBOL_GPL(nfnetlink_alloc_skb);
-
 int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid,
 		   unsigned int group, int echo, gfp_t flags)
 {
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 8ca932057c13..11f81c8385fc 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -330,14 +330,13 @@ nfulnl_alloc_skb(struct net *net, u32 peer_portid, unsigned int inst_size,
 	 * message.  WARNING: has to be <= 128k due to slab restrictions */
 
 	n = max(inst_size, pkt_size);
-	skb = nfnetlink_alloc_skb(net, n, peer_portid, GFP_ATOMIC);
+	skb = alloc_skb(n, GFP_ATOMIC);
 	if (!skb) {
 		if (n > pkt_size) {
 			/* try to allocate only as much as we need for current
 			 * packet */
 
-			skb = nfnetlink_alloc_skb(net, pkt_size,
-						  peer_portid, GFP_ATOMIC);
+			skb = alloc_skb(pkt_size, GFP_ATOMIC);
 		}
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From c5b0db3263b92526bc0c1b6380c0c99f91f069fc Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 18 Feb 2016 15:03:28 +0100
Subject: nfnetlink: Revert "nfnetlink: add support for memory mapped netlink"

reverts commit 3ab1f683bf8b ("nfnetlink: add support for memory mapped
netlink")'

Like previous commits in the series, remove wrappers that are not needed
after mmapped netlink removal.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netlink.h         | 10 ----------
 net/netfilter/nfnetlink_queue.c |  6 ++----
 net/netlink/af_netlink.c        | 20 ++++----------------
 3 files changed, 6 insertions(+), 30 deletions(-)

(limited to 'net')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 0b41959aab9f..da14ab61f363 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -69,16 +69,6 @@ extern void __netlink_clear_multicast_users(struct sock *sk, unsigned int group)
 extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err);
 extern int netlink_has_listeners(struct sock *sk, unsigned int group);
 
-extern struct sk_buff *__netlink_alloc_skb(struct sock *ssk, unsigned int size,
-					   unsigned int ldiff, u32 dst_portid,
-					   gfp_t gfp_mask);
-static inline struct sk_buff *
-netlink_alloc_skb(struct sock *ssk, unsigned int size, u32 dst_portid,
-		  gfp_t gfp_mask)
-{
-	return __netlink_alloc_skb(ssk, size, 0, dst_portid, gfp_mask);
-}
-
 extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock);
 extern int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 portid,
 			     __u32 group, gfp_t allocation);
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 1d3936587ace..75429997ed41 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -301,7 +301,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
 			   __be32 **packet_id_ptr)
 {
 	size_t size;
-	size_t data_len = 0, cap_len = 0, rem_len = 0;
+	size_t data_len = 0, cap_len = 0;
 	unsigned int hlen = 0;
 	struct sk_buff *skb;
 	struct nlattr *nla;
@@ -361,7 +361,6 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
 		hlen = min_t(unsigned int, hlen, data_len);
 		size += sizeof(struct nlattr) + hlen;
 		cap_len = entskb->len;
-		rem_len = data_len - hlen;
 		break;
 	}
 
@@ -386,8 +385,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
 			size += nla_total_size(seclen);
 	}
 
-	skb = __netlink_alloc_skb(net->nfnl, size, rem_len, queue->peer_portid,
-				  GFP_ATOMIC);
+	skb = alloc_skb(size, GFP_ATOMIC);
 	if (!skb) {
 		skb_tx_error(entskb);
 		return NULL;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 85aa6ef86dfd..c8416792cce0 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1248,14 +1248,6 @@ retry:
 }
 EXPORT_SYMBOL(netlink_unicast);
 
-struct sk_buff *__netlink_alloc_skb(struct sock *ssk, unsigned int size,
-				    unsigned int ldiff, u32 dst_portid,
-				    gfp_t gfp_mask)
-{
-	return alloc_skb(size, gfp_mask);
-}
-EXPORT_SYMBOL_GPL(__netlink_alloc_skb);
-
 int netlink_has_listeners(struct sock *sk, unsigned int group)
 {
 	int res = 0;
@@ -2082,15 +2074,12 @@ static int netlink_dump(struct sock *sk)
 
 	if (alloc_min_size < nlk->max_recvmsg_len) {
 		alloc_size = nlk->max_recvmsg_len;
-		skb = netlink_alloc_skb(sk, alloc_size, nlk->portid,
-					GFP_KERNEL |
-					__GFP_NOWARN |
-					__GFP_NORETRY);
+		skb = alloc_skb(alloc_size, GFP_KERNEL |
+					    __GFP_NOWARN | __GFP_NORETRY);
 	}
 	if (!skb) {
 		alloc_size = alloc_min_size;
-		skb = netlink_alloc_skb(sk, alloc_size, nlk->portid,
-					GFP_KERNEL);
+		skb = alloc_skb(alloc_size, GFP_KERNEL);
 	}
 	if (!skb)
 		goto errout_skb;
@@ -2230,8 +2219,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err)
 	if (!(nlk->flags & NETLINK_F_CAP_ACK) && err)
 		payload += nlmsg_len(nlh);
 
-	skb = netlink_alloc_skb(in_skb->sk, nlmsg_total_size(payload),
-				NETLINK_CB(in_skb).portid, GFP_KERNEL);
+	skb = nlmsg_new(payload, GFP_KERNEL);
 	if (!skb) {
 		struct sock *sk;
 
-- 
cgit v1.2.3-59-g8ed1b


From 7c25b16dbbcf37ce8019bacf634ee47d852225d9 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Tue, 16 Feb 2016 10:09:51 -0500
Subject: net: bridge: log port STP state on change

Remove the shared br_log_state function and print the info directly in
br_set_state, where the net_bridge_port state is actually changed.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Acked-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_netlink.c   |  1 -
 net/bridge/br_private.h   |  1 -
 net/bridge/br_stp.c       | 14 ++++----------
 net/bridge/br_stp_if.c    |  2 --
 net/bridge/br_stp_timer.c |  1 -
 5 files changed, 4 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 40197ff8918a..e9c635eae24d 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -598,7 +598,6 @@ static int br_set_port_state(struct net_bridge_port *p, u8 state)
 		return -ENETDOWN;
 
 	br_set_state(p, state);
-	br_log_state(p);
 	br_port_state_selection(p->br);
 	return 0;
 }
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 302ab0a43725..1b5d145dfcbf 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -900,7 +900,6 @@ static inline void br_nf_core_fini(void) {}
 #endif
 
 /* br_stp.c */
-void br_log_state(const struct net_bridge_port *p);
 void br_set_state(struct net_bridge_port *p, unsigned int state);
 struct net_bridge_port *br_get_port(struct net_bridge *br, u16 port_no);
 void br_init_port(struct net_bridge_port *p);
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index b3cca126b103..c22816a0b1b1 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -30,13 +30,6 @@ static const char *const br_port_state_names[] = {
 	[BR_STATE_BLOCKING] = "blocking",
 };
 
-void br_log_state(const struct net_bridge_port *p)
-{
-	br_info(p->br, "port %u(%s) entered %s state\n",
-		(unsigned int) p->port_no, p->dev->name,
-		br_port_state_names[p->state]);
-}
-
 void br_set_state(struct net_bridge_port *p, unsigned int state)
 {
 	struct switchdev_attr attr = {
@@ -52,6 +45,10 @@ void br_set_state(struct net_bridge_port *p, unsigned int state)
 	if (err && err != -EOPNOTSUPP)
 		br_warn(p->br, "error setting offload STP state on port %u(%s)\n",
 				(unsigned int) p->port_no, p->dev->name);
+	else
+		br_info(p->br, "port %u(%s) entered %s state\n",
+				(unsigned int) p->port_no, p->dev->name,
+				br_port_state_names[p->state]);
 }
 
 /* called under bridge lock */
@@ -126,7 +123,6 @@ static void br_root_port_block(const struct net_bridge *br,
 		  (unsigned int) p->port_no, p->dev->name);
 
 	br_set_state(p, BR_STATE_LISTENING);
-	br_log_state(p);
 	br_ifinfo_notify(RTM_NEWLINK, p);
 
 	if (br->forward_delay > 0)
@@ -407,7 +403,6 @@ static void br_make_blocking(struct net_bridge_port *p)
 			br_topology_change_detection(p->br);
 
 		br_set_state(p, BR_STATE_BLOCKING);
-		br_log_state(p);
 		br_ifinfo_notify(RTM_NEWLINK, p);
 
 		del_timer(&p->forward_delay_timer);
@@ -431,7 +426,6 @@ static void br_make_forwarding(struct net_bridge_port *p)
 	else
 		br_set_state(p, BR_STATE_LEARNING);
 
-	br_log_state(p);
 	br_ifinfo_notify(RTM_NEWLINK, p);
 
 	if (br->forward_delay != 0)
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index a31ac6ad76a2..984d46263007 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -102,7 +102,6 @@ void br_stp_enable_port(struct net_bridge_port *p)
 {
 	br_init_port(p);
 	br_port_state_selection(p->br);
-	br_log_state(p);
 	br_ifinfo_notify(RTM_NEWLINK, p);
 }
 
@@ -118,7 +117,6 @@ void br_stp_disable_port(struct net_bridge_port *p)
 	p->topology_change_ack = 0;
 	p->config_pending = 0;
 
-	br_log_state(p);
 	br_ifinfo_notify(RTM_NEWLINK, p);
 
 	del_timer(&p->message_age_timer);
diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c
index 5f0f5af0ec35..da058b85aa22 100644
--- a/net/bridge/br_stp_timer.c
+++ b/net/bridge/br_stp_timer.c
@@ -98,7 +98,6 @@ static void br_forward_delay_timer_expired(unsigned long arg)
 			br_topology_change_detection(br);
 		netif_carrier_on(br->dev);
 	}
-	br_log_state(p);
 	rcu_read_lock();
 	br_ifinfo_notify(RTM_NEWLINK, p);
 	rcu_read_unlock();
-- 
cgit v1.2.3-59-g8ed1b


From 7f290c94352e59b1d720055fce760a69a63bd0a1 Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Thu, 18 Feb 2016 11:22:52 +0100
Subject: iptunnel: scrub packet in iptunnel_pull_header

Part of skb_scrub_packet was open coded in iptunnel_pull_header. Let it call
skb_scrub_packet directly instead.

Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c      | 4 ++--
 drivers/net/vxlan.c       | 4 ++--
 include/net/ip_tunnels.h  | 3 ++-
 net/ipv4/ip_gre.c         | 2 +-
 net/ipv4/ip_tunnel_core.c | 8 +++-----
 net/ipv4/ipip.c           | 2 +-
 net/ipv6/sit.c            | 2 +-
 7 files changed, 12 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 4ceccf871b3f..dfbe3ca687f7 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -237,7 +237,6 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs,
 	}
 
 	skb_reset_mac_header(skb);
-	skb_scrub_packet(skb, !net_eq(geneve->net, dev_net(geneve->dev)));
 	skb->protocol = eth_type_trans(skb, geneve->dev);
 	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 
@@ -356,7 +355,8 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 
 	opts_len = geneveh->opt_len * 4;
 	if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len,
-				 htons(ETH_P_TEB)))
+				 htons(ETH_P_TEB),
+				 !net_eq(geneve->net, dev_net(geneve->dev))))
 		goto drop;
 
 	geneve_rx(geneve, gs, skb);
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 16a176cd0dad..c963897e713d 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1198,7 +1198,6 @@ static void vxlan_rcv(struct vxlan_dev *vxlan, struct vxlan_sock *vs,
 	int err = 0;
 
 	skb_reset_mac_header(skb);
-	skb_scrub_packet(skb, !net_eq(vxlan->net, dev_net(vxlan->dev)));
 	skb->protocol = eth_type_trans(skb, vxlan->dev);
 	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 
@@ -1305,7 +1304,8 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 	if (!vxlan)
 		goto drop;
 
-	if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB)))
+	if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB),
+				 !net_eq(vxlan->net, dev_net(vxlan->dev))))
 		goto drop;
 
 	if (vxlan_collect_metadata(vs)) {
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 87408ab80856..4dd616376fec 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -270,7 +270,8 @@ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph,
 	return INET_ECN_encapsulate(tos, inner);
 }
 
-int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto);
+int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto,
+			 bool xnet);
 void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
 		   __be32 src, __be32 dst, u8 proto,
 		   u8 tos, u8 ttl, __be16 df, bool xnet);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 917c2c1bfadd..12071e28d958 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -238,7 +238,7 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 				return -EINVAL;
 		}
 	}
-	return iptunnel_pull_header(skb, hdr_len, tpi->proto);
+	return iptunnel_pull_header(skb, hdr_len, tpi->proto, false);
 }
 
 static void ipgre_err(struct sk_buff *skb, u32 info,
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index a6e58b6141cd..eaca2449a09a 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -86,7 +86,8 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
 }
 EXPORT_SYMBOL_GPL(iptunnel_xmit);
 
-int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto)
+int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto,
+			 bool xnet)
 {
 	if (unlikely(!pskb_may_pull(skb, hdr_len)))
 		return -ENOMEM;
@@ -109,13 +110,10 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto)
 		skb->protocol = inner_proto;
 	}
 
-	nf_reset(skb);
-	secpath_reset(skb);
 	skb_clear_hash_if_not_l4(skb);
-	skb_dst_drop(skb);
 	skb->vlan_tci = 0;
 	skb_set_queue_mapping(skb, 0);
-	skb->pkt_type = PACKET_HOST;
+	skb_scrub_packet(skb, xnet);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(iptunnel_pull_header);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 6ec5b42fd172..ec51d02166de 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -195,7 +195,7 @@ static int ipip_rcv(struct sk_buff *skb)
 	if (tunnel) {
 		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
 			goto drop;
-		if (iptunnel_pull_header(skb, 0, tpi.proto))
+		if (iptunnel_pull_header(skb, 0, tpi.proto, false))
 			goto drop;
 		return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error);
 	}
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 0625ac6356b5..f45b8ffc2840 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -740,7 +740,7 @@ static int ipip_rcv(struct sk_buff *skb)
 
 		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
 			goto drop;
-		if (iptunnel_pull_header(skb, 0, tpi.proto))
+		if (iptunnel_pull_header(skb, 0, tpi.proto, false))
 			goto drop;
 		return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 76cc173d48d9ed2206a1affe13b70d0e8e134010 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Tue, 16 Feb 2016 12:46:53 +0100
Subject: bridge: mdb: reduce the indentation level in br_mdb_fill_info

Switch the port check and skip if it's null, this allows us to reduce one
indentation level.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_mdb.c | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index ac089286526e..e66619171386 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -88,25 +88,26 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
 			for (pp = &mp->ports;
 			     (p = rcu_dereference(*pp)) != NULL;
 			      pp = &p->next) {
+				struct br_mdb_entry e;
+
 				port = p->port;
-				if (port) {
-					struct br_mdb_entry e;
-					memset(&e, 0, sizeof(e));
-					e.ifindex = port->dev->ifindex;
-					e.vid = p->addr.vid;
-					__mdb_entry_fill_flags(&e, p->flags);
-					if (p->addr.proto == htons(ETH_P_IP))
-						e.addr.u.ip4 = p->addr.u.ip4;
+				if (!port)
+					continue;
+				memset(&e, 0, sizeof(e));
+				e.ifindex = port->dev->ifindex;
+				e.vid = p->addr.vid;
+				__mdb_entry_fill_flags(&e, p->flags);
+				if (p->addr.proto == htons(ETH_P_IP))
+					e.addr.u.ip4 = p->addr.u.ip4;
 #if IS_ENABLED(CONFIG_IPV6)
-					if (p->addr.proto == htons(ETH_P_IPV6))
-						e.addr.u.ip6 = p->addr.u.ip6;
+				if (p->addr.proto == htons(ETH_P_IPV6))
+					e.addr.u.ip6 = p->addr.u.ip6;
 #endif
-					e.addr.proto = p->addr.proto;
-					if (nla_put(skb, MDBA_MDB_ENTRY_INFO, sizeof(e), &e)) {
-						nla_nest_cancel(skb, nest2);
-						err = -EMSGSIZE;
-						goto out;
-					}
+				e.addr.proto = p->addr.proto;
+				if (nla_put(skb, MDBA_MDB_ENTRY_INFO, sizeof(e), &e)) {
+					nla_nest_cancel(skb, nest2);
+					err = -EMSGSIZE;
+					goto out;
 				}
 			}
 			nla_nest_end(skb, nest2);
-- 
cgit v1.2.3-59-g8ed1b


From 2125715635053d4207a756a35aa718f548824e58 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Tue, 16 Feb 2016 12:46:54 +0100
Subject: bridge: mdb: add support for more attributes and export timer

Currently mdb entries are exported directly as a structure inside
MDBA_MDB_ENTRY_INFO attribute, we can't really extend it without
breaking user-space. In order to export new mdb fields, I've converted
the MDBA_MDB_ENTRY_INFO into a nested attribute which starts like before
with struct br_mdb_entry (without header, as it's casted directly in
iproute2) and continues with MDBA_MDB_EATTR_ attributes. This way we
keep compatibility with older users and can export new data.
I've tested this with iproute2, both with and without support for the
added attribute and it works fine.
So basically we again have MDBA_MDB_ENTRY_INFO with struct br_mdb_entry
inside but it may contain also some additional MDBA_MDB_EATTR_ attributes
such as MDBA_MDB_EATTR_TIMER which can be parsed by user-space.

So the new structure is:
[MDBA_MDB] = {
     [MDBA_MDB_ENTRY] = {
         [MDBA_MDB_ENTRY_INFO]
         [MDBA_MDB_ENTRY_INFO] { <- Nested attribute
             struct br_mdb_entry <- nla_put_nohdr()
             [MDBA_MDB_ENTRY attributes] <- normal netlink attributes
         }
     }
}

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h | 13 ++++++++++++-
 net/bridge/br_mdb.c            | 16 +++++++++++++++-
 2 files changed, 27 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index ec3547234998..0890b217580d 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -137,7 +137,10 @@ struct bridge_vlan_info {
 /* Bridge multicast database attributes
  * [MDBA_MDB] = {
  *     [MDBA_MDB_ENTRY] = {
- *         [MDBA_MDB_ENTRY_INFO]
+ *         [MDBA_MDB_ENTRY_INFO] {
+ *		struct br_mdb_entry
+ *		[MDBA_MDB_EATTR attributes]
+ *         }
  *     }
  * }
  * [MDBA_ROUTER] = {
@@ -166,6 +169,14 @@ enum {
 };
 #define MDBA_MDB_ENTRY_MAX (__MDBA_MDB_ENTRY_MAX - 1)
 
+/* per mdb entry additional attributes */
+enum {
+	MDBA_MDB_EATTR_UNSPEC,
+	MDBA_MDB_EATTR_TIMER,
+	__MDBA_MDB_EATTR_MAX
+};
+#define MDBA_MDB_EATTR_MAX (__MDBA_MDB_EATTR_MAX - 1)
+
 enum {
 	MDBA_ROUTER_UNSPEC,
 	MDBA_ROUTER_PORT,
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index e66619171386..cf51b7bcb5d5 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -88,11 +88,13 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
 			for (pp = &mp->ports;
 			     (p = rcu_dereference(*pp)) != NULL;
 			      pp = &p->next) {
+				struct nlattr *nest_ent;
 				struct br_mdb_entry e;
 
 				port = p->port;
 				if (!port)
 					continue;
+
 				memset(&e, 0, sizeof(e));
 				e.ifindex = port->dev->ifindex;
 				e.vid = p->addr.vid;
@@ -104,11 +106,23 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
 					e.addr.u.ip6 = p->addr.u.ip6;
 #endif
 				e.addr.proto = p->addr.proto;
-				if (nla_put(skb, MDBA_MDB_ENTRY_INFO, sizeof(e), &e)) {
+				nest_ent = nla_nest_start(skb,
+							  MDBA_MDB_ENTRY_INFO);
+				if (!nest_ent) {
+					nla_nest_cancel(skb, nest2);
+					err = -EMSGSIZE;
+					goto out;
+				}
+				if (nla_put_nohdr(skb, sizeof(e), &e) ||
+				    nla_put_u32(skb,
+						MDBA_MDB_EATTR_TIMER,
+						br_timer_value(&p->timer))) {
+					nla_nest_cancel(skb, nest_ent);
 					nla_nest_cancel(skb, nest2);
 					err = -EMSGSIZE;
 					goto out;
 				}
+				nla_nest_end(skb, nest_ent);
 			}
 			nla_nest_end(skb, nest2);
 		skip:
-- 
cgit v1.2.3-59-g8ed1b


From e0d8c1b7381133240f0c69eaf740f64975b6058e Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Wed, 17 Feb 2016 13:58:22 -0800
Subject: ipv6: pass up EMSGSIZE msg for UDP socket in Ipv6

In ipv4,  when  the machine receives a ICMP_FRAG_NEEDED message,  the
connected UDP socket will get EMSGSIZE message on its next read from the
socket.
However, this is not the case for ipv6.
This fix modifies the udp err handler in Ipv6 for ICMP6_PKT_TOOBIG to
make it similar to ipv4 behavior. That is when the machine gets an
ICMP6_PKT_TOOBIG message, the connected UDP socket will get EMSGSIZE
message on its next read from the socket.

Signed-off-by: Wei Wang <weiwan@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/udp.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index ac4e7e03dded..0711f8fe4d44 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -548,6 +548,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	const struct in6_addr *daddr = &hdr->daddr;
 	struct udphdr *uh = (struct udphdr *)(skb->data+offset);
 	struct sock *sk;
+	int harderr;
 	int err;
 	struct net *net = dev_net(skb->dev);
 
@@ -559,26 +560,27 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		return;
 	}
 
+	harderr = icmpv6_err_convert(type, code, &err);
+	np = inet6_sk(sk);
+
 	if (type == ICMPV6_PKT_TOOBIG) {
 		if (!ip6_sk_accept_pmtu(sk))
 			goto out;
 		ip6_sk_update_pmtu(skb, sk, info);
+		if (np->pmtudisc != IPV6_PMTUDISC_DONT)
+			harderr = 1;
 	}
 	if (type == NDISC_REDIRECT) {
 		ip6_sk_redirect(skb, sk);
 		goto out;
 	}
 
-	np = inet6_sk(sk);
-
-	if (!icmpv6_err_convert(type, code, &err) && !np->recverr)
-		goto out;
-
-	if (sk->sk_state != TCP_ESTABLISHED && !np->recverr)
-		goto out;
-
-	if (np->recverr)
+	if (!np->recverr) {
+		if (!harderr || sk->sk_state != TCP_ESTABLISHED)
+			goto out;
+	} else {
 		ipv6_icmp_error(sk, skb, err, uh->dest, ntohl(info), (u8 *)(uh+1));
+	}
 
 	sk->sk_err = err;
 	sk->sk_error_report(sk);
-- 
cgit v1.2.3-59-g8ed1b


From ac2c7ad0e5d6030452c9af2fafd192e17fd04264 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@intel.com>
Date: Fri, 19 Feb 2016 09:24:01 -0500
Subject: net/ethtool: introduce a new ioctl for per queue setting

Introduce a new ioctl ETHTOOL_PERQUEUE for per queue parameters setting.
The following patches will enable some SUB_COMMANDs for per queue
setting.

Signed-off-by: Kan Liang <kan.liang@intel.com>
Reviewed-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 17 +++++++++++++++++
 net/core/ethtool.c           | 27 +++++++++++++++++++++++++--
 2 files changed, 42 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 190aea0faaf4..f15ae02621a1 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1202,6 +1202,21 @@ enum ethtool_sfeatures_retval_bits {
 #define ETHTOOL_F_WISH          (1 << ETHTOOL_F_WISH__BIT)
 #define ETHTOOL_F_COMPAT        (1 << ETHTOOL_F_COMPAT__BIT)
 
+#define MAX_NUM_QUEUE		4096
+
+/**
+ * struct ethtool_per_queue_op - apply sub command to the queues in mask.
+ * @cmd: ETHTOOL_PERQUEUE
+ * @sub_command: the sub command which apply to each queues
+ * @queue_mask: Bitmap of the queues which sub command apply to
+ * @data: A complete command structure following for each of the queues addressed
+ */
+struct ethtool_per_queue_op {
+	__u32	cmd;
+	__u32	sub_command;
+	__u32	queue_mask[DIV_ROUND_UP(MAX_NUM_QUEUE, 32)];
+	char	data[];
+};
 
 /* CMDs currently supported */
 #define ETHTOOL_GSET		0x00000001 /* Get settings. */
@@ -1285,6 +1300,8 @@ enum ethtool_sfeatures_retval_bits {
 #define ETHTOOL_STUNABLE	0x00000049 /* Set tunable configuration */
 #define ETHTOOL_GPHYSTATS	0x0000004a /* get PHY-specific statistics */
 
+#define ETHTOOL_PERQUEUE	0x0000004b /* Set per queue options */
+
 /* compatibility with older code */
 #define SPARC_ETH_GSET		ETHTOOL_GSET
 #define SPARC_ETH_SSET		ETHTOOL_SSET
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index c2d3118b1395..d640ecf71e74 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1888,13 +1888,27 @@ out:
 	return ret;
 }
 
+static int ethtool_set_per_queue(struct net_device *dev, void __user *useraddr)
+{
+	struct ethtool_per_queue_op per_queue_opt;
+
+	if (copy_from_user(&per_queue_opt, useraddr, sizeof(per_queue_opt)))
+		return -EFAULT;
+
+	switch (per_queue_opt.sub_command) {
+
+	default:
+		return -EOPNOTSUPP;
+	};
+}
+
 /* The main entry point in this file.  Called from net/core/dev_ioctl.c */
 
 int dev_ethtool(struct net *net, struct ifreq *ifr)
 {
 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
 	void __user *useraddr = ifr->ifr_data;
-	u32 ethcmd;
+	u32 ethcmd, sub_cmd;
 	int rc;
 	netdev_features_t old_features;
 
@@ -1904,8 +1918,14 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd)))
 		return -EFAULT;
 
+	if (ethcmd == ETHTOOL_PERQUEUE) {
+		if (copy_from_user(&sub_cmd, useraddr + sizeof(ethcmd), sizeof(sub_cmd)))
+			return -EFAULT;
+	} else {
+		sub_cmd = ethcmd;
+	}
 	/* Allow some commands to be done by anyone */
-	switch (ethcmd) {
+	switch (sub_cmd) {
 	case ETHTOOL_GSET:
 	case ETHTOOL_GDRVINFO:
 	case ETHTOOL_GMSGLVL:
@@ -2135,6 +2155,9 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	case ETHTOOL_GPHYSTATS:
 		rc = ethtool_get_phy_stats(dev, useraddr);
 		break;
+	case ETHTOOL_PERQUEUE:
+		rc = ethtool_set_per_queue(dev, useraddr);
+		break;
 	default:
 		rc = -EOPNOTSUPP;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 421797b1aa363cb897f29f7d365e068dc9d9db81 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@intel.com>
Date: Fri, 19 Feb 2016 09:24:02 -0500
Subject: net/ethtool: support get coalesce per queue

This patch implements sub command ETHTOOL_GCOALESCE for ioctl
ETHTOOL_PERQUEUE. It introduces an interface get_per_queue_coalesce to
get coalesce of each masked queue from device driver. Then the interrupt
coalescing parameters will be copied back to user space one by one.

Signed-off-by: Kan Liang <kan.liang@intel.com>
Reviewed-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h |  8 +++++++-
 net/core/ethtool.c      | 35 ++++++++++++++++++++++++++++++++++-
 2 files changed, 41 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 653dc9c4ebac..de56600023a7 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -201,6 +201,11 @@ static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings)
  * @get_module_eeprom: Get the eeprom information from the plug-in module
  * @get_eee: Get Energy-Efficient (EEE) supported and status.
  * @set_eee: Set EEE status (enable/disable) as well as LPI timers.
+ * @get_per_queue_coalesce: Get interrupt coalescing parameters per queue.
+ *	It must check that the given queue number is valid. If neither a RX nor
+ *	a TX queue has this number, return -EINVAL. If only a RX queue or a TX
+ *	queue has this number, set the inapplicable fields to ~0 and return 0.
+ *	Returns a negative error code or zero.
  *
  * All operations are optional (i.e. the function pointer may be set
  * to %NULL) and callers must take this into account.  Callers must
@@ -279,7 +284,8 @@ struct ethtool_ops {
 			       const struct ethtool_tunable *, void *);
 	int	(*set_tunable)(struct net_device *,
 			       const struct ethtool_tunable *, const void *);
-
+	int	(*get_per_queue_coalesce)(struct net_device *, u32,
+					  struct ethtool_coalesce *);
 
 };
 #endif /* _LINUX_ETHTOOL_H */
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index d640ecf71e74..2a6c3a26f63f 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1888,6 +1888,38 @@ out:
 	return ret;
 }
 
+static int ethtool_get_per_queue_coalesce(struct net_device *dev,
+					  void __user *useraddr,
+					  struct ethtool_per_queue_op *per_queue_opt)
+{
+	u32 bit;
+	int ret;
+	DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE);
+
+	if (!dev->ethtool_ops->get_per_queue_coalesce)
+		return -EOPNOTSUPP;
+
+	useraddr += sizeof(*per_queue_opt);
+
+	bitmap_from_u32array(queue_mask,
+			     MAX_NUM_QUEUE,
+			     per_queue_opt->queue_mask,
+			     DIV_ROUND_UP(MAX_NUM_QUEUE, 32));
+
+	for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) {
+		struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE };
+
+		ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, &coalesce);
+		if (ret != 0)
+			return ret;
+		if (copy_to_user(useraddr, &coalesce, sizeof(coalesce)))
+			return -EFAULT;
+		useraddr += sizeof(coalesce);
+	}
+
+	return 0;
+}
+
 static int ethtool_set_per_queue(struct net_device *dev, void __user *useraddr)
 {
 	struct ethtool_per_queue_op per_queue_opt;
@@ -1896,7 +1928,8 @@ static int ethtool_set_per_queue(struct net_device *dev, void __user *useraddr)
 		return -EFAULT;
 
 	switch (per_queue_opt.sub_command) {
-
+	case ETHTOOL_GCOALESCE:
+		return ethtool_get_per_queue_coalesce(dev, useraddr, &per_queue_opt);
 	default:
 		return -EOPNOTSUPP;
 	};
-- 
cgit v1.2.3-59-g8ed1b


From f38d138a7da6510a1184e3bc5f425deb187c3265 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@intel.com>
Date: Fri, 19 Feb 2016 09:24:03 -0500
Subject: net/ethtool: support set coalesce per queue

This patch implements sub command ETHTOOL_SCOALESCE for ioctl
ETHTOOL_PERQUEUE. It introduces an interface set_per_queue_coalesce to
set coalesce of each masked queue to device driver. The wanted coalesce
information are stored in "data" for each masked queue, which can copy
from userspace.
If it fails to set coalesce to device driver, the value which already
set to specific queue will be tried to rollback.

Signed-off-by: Kan Liang <kan.liang@intel.com>
Reviewed-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h |  7 ++++++
 net/core/ethtool.c      | 61 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+)

(limited to 'net')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index de56600023a7..472d7d7b01c2 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -206,6 +206,11 @@ static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings)
  *	a TX queue has this number, return -EINVAL. If only a RX queue or a TX
  *	queue has this number, set the inapplicable fields to ~0 and return 0.
  *	Returns a negative error code or zero.
+ * @set_per_queue_coalesce: Set interrupt coalescing parameters per queue.
+ *	It must check that the given queue number is valid. If neither a RX nor
+ *	a TX queue has this number, return -EINVAL. If only a RX queue or a TX
+ *	queue has this number, ignore the inapplicable fields.
+ *	Returns a negative error code or zero.
  *
  * All operations are optional (i.e. the function pointer may be set
  * to %NULL) and callers must take this into account.  Callers must
@@ -286,6 +291,8 @@ struct ethtool_ops {
 			       const struct ethtool_tunable *, const void *);
 	int	(*get_per_queue_coalesce)(struct net_device *, u32,
 					  struct ethtool_coalesce *);
+	int	(*set_per_queue_coalesce)(struct net_device *, u32,
+					  struct ethtool_coalesce *);
 
 };
 #endif /* _LINUX_ETHTOOL_H */
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 2a6c3a26f63f..2406101002b1 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1920,6 +1920,65 @@ static int ethtool_get_per_queue_coalesce(struct net_device *dev,
 	return 0;
 }
 
+static int ethtool_set_per_queue_coalesce(struct net_device *dev,
+					  void __user *useraddr,
+					  struct ethtool_per_queue_op *per_queue_opt)
+{
+	u32 bit;
+	int i, ret = 0;
+	int n_queue;
+	struct ethtool_coalesce *backup = NULL, *tmp = NULL;
+	DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE);
+
+	if ((!dev->ethtool_ops->set_per_queue_coalesce) ||
+	    (!dev->ethtool_ops->get_per_queue_coalesce))
+		return -EOPNOTSUPP;
+
+	useraddr += sizeof(*per_queue_opt);
+
+	bitmap_from_u32array(queue_mask,
+			     MAX_NUM_QUEUE,
+			     per_queue_opt->queue_mask,
+			     DIV_ROUND_UP(MAX_NUM_QUEUE, 32));
+	n_queue = bitmap_weight(queue_mask, MAX_NUM_QUEUE);
+	tmp = backup = kmalloc_array(n_queue, sizeof(*backup), GFP_KERNEL);
+	if (!backup)
+		return -ENOMEM;
+
+	for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) {
+		struct ethtool_coalesce coalesce;
+
+		ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, tmp);
+		if (ret != 0)
+			goto roll_back;
+
+		tmp++;
+
+		if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) {
+			ret = -EFAULT;
+			goto roll_back;
+		}
+
+		ret = dev->ethtool_ops->set_per_queue_coalesce(dev, bit, &coalesce);
+		if (ret != 0)
+			goto roll_back;
+
+		useraddr += sizeof(coalesce);
+	}
+
+roll_back:
+	if (ret != 0) {
+		tmp = backup;
+		for_each_set_bit(i, queue_mask, bit) {
+			dev->ethtool_ops->set_per_queue_coalesce(dev, i, tmp);
+			tmp++;
+		}
+	}
+	kfree(backup);
+
+	return ret;
+}
+
 static int ethtool_set_per_queue(struct net_device *dev, void __user *useraddr)
 {
 	struct ethtool_per_queue_op per_queue_opt;
@@ -1930,6 +1989,8 @@ static int ethtool_set_per_queue(struct net_device *dev, void __user *useraddr)
 	switch (per_queue_opt.sub_command) {
 	case ETHTOOL_GCOALESCE:
 		return ethtool_get_per_queue_coalesce(dev, useraddr, &per_queue_opt);
+	case ETHTOOL_SCOALESCE:
+		return ethtool_set_per_queue_coalesce(dev, useraddr, &per_queue_opt);
 	default:
 		return -EOPNOTSUPP;
 	};
-- 
cgit v1.2.3-59-g8ed1b


From 6b83d28a55a891a9d70fc61ccb1c138e47dcbe74 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 20 Feb 2016 00:29:30 +0100
Subject: net: use skb_postpush_rcsum instead of own implementations

Replace individual implementations with the recently introduced
skb_postpush_rcsum() helper.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Tom Herbert <tom@herbertland.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c              | 4 +---
 net/ipv6/reassembly.c          | 6 ++----
 net/openvswitch/actions.c      | 8 +++-----
 net/openvswitch/vport-netdev.c | 2 +-
 net/openvswitch/vport.h        | 7 -------
 5 files changed, 7 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index a5bd067ec1a3..8bd4b7951bc0 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4496,9 +4496,7 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
 		skb->mac_len += VLAN_HLEN;
 		__skb_pull(skb, offset);
 
-		if (skb->ip_summed == CHECKSUM_COMPLETE)
-			skb->csum = csum_add(skb->csum, csum_partial(skb->data
-					+ (2 * ETH_ALEN), VLAN_HLEN, 0));
+		skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
 	}
 	__vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
 	return 0;
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 18f3498a6c80..e2ea31175ef9 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -496,10 +496,8 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
 	IP6CB(head)->flags |= IP6SKB_FRAGMENTED;
 
 	/* Yes, and fold redundant checksum back. 8) */
-	if (head->ip_summed == CHECKSUM_COMPLETE)
-		head->csum = csum_partial(skb_network_header(head),
-					  skb_network_header_len(head),
-					  head->csum);
+	skb_postpush_rcsum(head, skb_network_header(head),
+			   skb_network_header_len(head));
 
 	rcu_read_lock();
 	IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 2d59df521915..e9dd47b2a85b 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -158,9 +158,7 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
 	new_mpls_lse = (__be32 *)skb_mpls_header(skb);
 	*new_mpls_lse = mpls->mpls_lse;
 
-	if (skb->ip_summed == CHECKSUM_COMPLETE)
-		skb->csum = csum_add(skb->csum, csum_partial(new_mpls_lse,
-							     MPLS_HLEN, 0));
+	skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN);
 
 	hdr = eth_hdr(skb);
 	hdr->h_proto = mpls->mpls_ethertype;
@@ -280,7 +278,7 @@ static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key,
 	ether_addr_copy_masked(eth_hdr(skb)->h_dest, key->eth_dst,
 			       mask->eth_dst);
 
-	ovs_skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2);
+	skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2);
 
 	ether_addr_copy(flow_key->eth.src, eth_hdr(skb)->h_source);
 	ether_addr_copy(flow_key->eth.dst, eth_hdr(skb)->h_dest);
@@ -639,7 +637,7 @@ static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *sk
 	/* Reconstruct the MAC header.  */
 	skb_push(skb, data->l2_len);
 	memcpy(skb->data, &data->l2_data, data->l2_len);
-	ovs_skb_postpush_rcsum(skb, skb->data, data->l2_len);
+	skb_postpush_rcsum(skb, skb->data, data->l2_len);
 	skb_reset_mac_header(skb);
 
 	ovs_vport_send(vport, skb);
diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
index 6a6adf314363..4e3972344aa6 100644
--- a/net/openvswitch/vport-netdev.c
+++ b/net/openvswitch/vport-netdev.c
@@ -58,7 +58,7 @@ static void netdev_port_receive(struct sk_buff *skb)
 		return;
 
 	skb_push(skb, ETH_HLEN);
-	ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN);
+	skb_postpush_rcsum(skb, skb->data, ETH_HLEN);
 	ovs_vport_receive(vport, skb, skb_tunnel_info(skb));
 	return;
 error:
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
index c10899cb9040..f01f28a567ad 100644
--- a/net/openvswitch/vport.h
+++ b/net/openvswitch/vport.h
@@ -185,13 +185,6 @@ static inline struct vport *vport_from_priv(void *priv)
 int ovs_vport_receive(struct vport *, struct sk_buff *,
 		      const struct ip_tunnel_info *);
 
-static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb,
-				      const void *start, unsigned int len)
-{
-	if (skb->ip_summed == CHECKSUM_COMPLETE)
-		skb->csum = csum_add(skb->csum, csum_partial(start, len, 0));
-}
-
 static inline const char *ovs_vport_name(struct vport *vport)
 {
 	return vport->dev->name;
-- 
cgit v1.2.3-59-g8ed1b


From e817af27e01f9294055a441d1034bbc353ee39d2 Mon Sep 17 00:00:00 2001
From: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
Date: Thu, 18 Feb 2016 10:00:11 +0000
Subject: vlan: turn on unicast filtering on vlan device

Currently vlan device inherits unicast filtering flag from underlying
device. If underlying device doesn't support unicast filter, this will
put vlan device into promiscuous mode when it's stacked.

Tun on IFF_UNICAST_FLT on the vlan device in any case so that it does
not go into promiscuous mode needlessly. If underlying device does not
support unicast filtering, that device will enter promiscuous mode.

Signed-off-by: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/8021q/vlan.c     | 1 -
 net/8021q/vlan_dev.c | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index d2cd9de4b724..69929c05c843 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -261,7 +261,6 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id)
 	 * hope the underlying device can handle it.
 	 */
 	new_dev->mtu = real_dev->mtu;
-	new_dev->priv_flags |= (real_dev->priv_flags & IFF_UNICAST_FLT);
 
 	vlan = vlan_dev_priv(new_dev);
 	vlan->vlan_proto = htons(ETH_P_8021Q);
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index ad5e2fd1012c..055f0e989e90 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -799,6 +799,7 @@ void vlan_setup(struct net_device *dev)
 	ether_setup(dev);
 
 	dev->priv_flags		|= IFF_802_1Q_VLAN | IFF_NO_QUEUE;
+	dev->priv_flags		|= IFF_UNICAST_FLT;
 	dev->priv_flags		&= ~IFF_TX_SKB_SHARING;
 	netif_keep_dst(dev);
 
-- 
cgit v1.2.3-59-g8ed1b


From 745041e2aaf1d668f293aaab4b0f6ad7daa056a5 Mon Sep 17 00:00:00 2001
From: Robert Shearman <rshearma@brocade.com>
Date: Fri, 19 Feb 2016 09:43:16 +0000
Subject: lwtunnel: autoload of lwt modules

The lwt implementations using net devices can autoload using the
existing mechanism using IFLA_INFO_KIND. However, there's no mechanism
that lwt modules not using net devices can use.

Therefore, add the ability to autoload modules registering lwt
operations for lwt implementations not using a net device so that
users don't have to manually load the modules.

Only users with the CAP_NET_ADMIN capability can cause modules to be
loaded, which is ensured by rtnetlink_rcv_msg rejecting non-RTM_GETxxx
messages for users without this capability, and by
lwtunnel_build_state not being called in response to RTM_GETxxx
messages.

Signed-off-by: Robert Shearman <rshearma@brocade.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/lwtunnel.h |  4 +++-
 net/core/lwtunnel.c    | 37 +++++++++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index 66350ce3e955..e9f116e29c22 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -170,6 +170,8 @@ static inline int lwtunnel_input(struct sk_buff *skb)
 	return -EOPNOTSUPP;
 }
 
-#endif
+#endif /* CONFIG_LWTUNNEL */
+
+#define MODULE_ALIAS_RTNL_LWT(encap_type) MODULE_ALIAS("rtnl-lwt-" __stringify(encap_type))
 
 #endif /* __NET_LWTUNNEL_H */
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 299cfc24d888..669ecc9f884e 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -27,6 +27,31 @@
 #include <net/rtnetlink.h>
 #include <net/ip6_fib.h>
 
+#ifdef CONFIG_MODULES
+
+static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
+{
+	/* Only lwt encaps implemented without using an interface for
+	 * the encap need to return a string here.
+	 */
+	switch (encap_type) {
+	case LWTUNNEL_ENCAP_MPLS:
+		return "MPLS";
+	case LWTUNNEL_ENCAP_ILA:
+		return "ILA";
+	case LWTUNNEL_ENCAP_IP6:
+	case LWTUNNEL_ENCAP_IP:
+	case LWTUNNEL_ENCAP_NONE:
+	case __LWTUNNEL_ENCAP_MAX:
+		/* should not have got here */
+		WARN_ON(1);
+		break;
+	}
+	return NULL;
+}
+
+#endif /* CONFIG_MODULES */
+
 struct lwtunnel_state *lwtunnel_state_alloc(int encap_len)
 {
 	struct lwtunnel_state *lws;
@@ -85,6 +110,18 @@ int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
 	ret = -EOPNOTSUPP;
 	rcu_read_lock();
 	ops = rcu_dereference(lwtun_encaps[encap_type]);
+#ifdef CONFIG_MODULES
+	if (!ops) {
+		const char *encap_type_str = lwtunnel_encap_str(encap_type);
+
+		if (encap_type_str) {
+			rcu_read_unlock();
+			request_module("rtnl-lwt-%s", encap_type_str);
+			rcu_read_lock();
+			ops = rcu_dereference(lwtun_encaps[encap_type]);
+		}
+	}
+#endif
 	if (likely(ops && ops->build_state))
 		ret = ops->build_state(dev, encap, family, cfg, lws);
 	rcu_read_unlock();
-- 
cgit v1.2.3-59-g8ed1b


From b2b04edceb7cc3c3023fea7d4ccc7dd243e839a9 Mon Sep 17 00:00:00 2001
From: Robert Shearman <rshearma@brocade.com>
Date: Fri, 19 Feb 2016 09:43:17 +0000
Subject: mpls: autoload lwt module

Avoid users having to manually load the module by adding a module
alias allowing it to be autoloaded by the lwt infra.

Signed-off-by: Robert Shearman <rshearma@brocade.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/mpls/mpls_iptunnel.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
index fb31aa87de81..644a8da6d4bd 100644
--- a/net/mpls/mpls_iptunnel.c
+++ b/net/mpls/mpls_iptunnel.c
@@ -227,5 +227,6 @@ static void __exit mpls_iptunnel_exit(void)
 }
 module_exit(mpls_iptunnel_exit);
 
+MODULE_ALIAS_RTNL_LWT(MPLS);
 MODULE_DESCRIPTION("MultiProtocol Label Switching IP Tunnels");
 MODULE_LICENSE("GPL v2");
-- 
cgit v1.2.3-59-g8ed1b


From 84a8cbe46af0945859a8f61bce6100ddc3d51a51 Mon Sep 17 00:00:00 2001
From: Robert Shearman <rshearma@brocade.com>
Date: Fri, 19 Feb 2016 09:43:18 +0000
Subject: ila: autoload module

Avoid users having to manually load the module by adding a module
alias allowing it to be autoloaded by the lwt infra.

Signed-off-by: Robert Shearman <rshearma@brocade.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ila/ila_common.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c
index 32dc9aab7297..30613050e4ca 100644
--- a/net/ipv6/ila/ila_common.c
+++ b/net/ipv6/ila/ila_common.c
@@ -99,5 +99,6 @@ static void __exit ila_fini(void)
 
 module_init(ila_init);
 module_exit(ila_fini);
+MODULE_ALIAS_RTNL_LWT(ILA);
 MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>");
 MODULE_LICENSE("GPL");
-- 
cgit v1.2.3-59-g8ed1b


From 7d672345ed295b1356a5d9f7111da1d1d7d65867 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 19 Feb 2016 23:05:23 +0100
Subject: bpf: add generic bpf_csum_diff helper

For L4 checksums, we currently have bpf_l4_csum_replace() helper. It's
currently limited to handle 2 and 4 byte changes in a header and feeds the
from/to into inet_proto_csum_replace{2,4}() helpers of the kernel. When
working with IPv6, for example, this makes it rather cumbersome to deal
with, similarly when editing larger parts of a header.

Instead, extend the API in a more generic way: For bpf_l4_csum_replace(),
add a case for header field mask of 0 to change the checksum at a given
offset through inet_proto_csum_replace_by_diff(), and provide a helper
bpf_csum_diff() that can generically calculate a from/to diff for arbitrary
amounts of data.

This can be used in multiple ways: for the bpf_l4_csum_replace() only
part, this even provides us with the option to insert precalculated diffs
from user space f.e. from a map, or from bpf_csum_diff() during runtime.

bpf_csum_diff() has a optional from/to stack buffer input, so we can
calculate a diff by using a scratchbuffer for scenarios where we're
inserting (from is NULL), removing (to is NULL) or diffing (from/to buffers
don't need to be of equal size) data. Also, bpf_csum_diff() allows to
feed a previous csum into csum_partial(), so the function can also be
cascaded.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 11 ++++++++++
 net/core/filter.c        | 53 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+)

(limited to 'net')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d3e77da8e9e8..48d0a6c54609 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -287,6 +287,17 @@ enum bpf_func_id {
 	 * Return: >= 0 stackid on success or negative error
 	 */
 	BPF_FUNC_get_stackid,
+
+	/**
+	 * bpf_csum_diff(from, from_size, to, to_size, seed) - calculate csum diff
+	 * @from: raw from buffer
+	 * @from_size: length of from buffer
+	 * @to: raw to buffer
+	 * @to_size: length of to buffer
+	 * @seed: optional seed
+	 * Return: csum result
+	 */
+	BPF_FUNC_csum_diff,
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 2a6e9562f1ab..bf504f8fbe15 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1491,6 +1491,12 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
 		return -EFAULT;
 
 	switch (flags & BPF_F_HDR_FIELD_MASK) {
+	case 0:
+		if (unlikely(from != 0))
+			return -EINVAL;
+
+		inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo);
+		break;
 	case 2:
 		inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo);
 		break;
@@ -1519,6 +1525,51 @@ const struct bpf_func_proto bpf_l4_csum_replace_proto = {
 	.arg5_type	= ARG_ANYTHING,
 };
 
+struct bpf_csum_scratchpad {
+	__be32 diff[128];
+};
+
+static DEFINE_PER_CPU(struct bpf_csum_scratchpad, bpf_csum_sp);
+
+static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed)
+{
+	struct bpf_csum_scratchpad *sp = this_cpu_ptr(&bpf_csum_sp);
+	u64 diff_size = from_size + to_size;
+	__be32 *from = (__be32 *) (long) r1;
+	__be32 *to   = (__be32 *) (long) r3;
+	int i, j = 0;
+
+	/* This is quite flexible, some examples:
+	 *
+	 * from_size == 0, to_size > 0,  seed := csum --> pushing data
+	 * from_size > 0,  to_size == 0, seed := csum --> pulling data
+	 * from_size > 0,  to_size > 0,  seed := 0    --> diffing data
+	 *
+	 * Even for diffing, from_size and to_size don't need to be equal.
+	 */
+	if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) ||
+		     diff_size > sizeof(sp->diff)))
+		return -EINVAL;
+
+	for (i = 0; i < from_size / sizeof(__be32); i++, j++)
+		sp->diff[j] = ~from[i];
+	for (i = 0; i <   to_size / sizeof(__be32); i++, j++)
+		sp->diff[j] = to[i];
+
+	return csum_partial(sp->diff, diff_size, seed);
+}
+
+const struct bpf_func_proto bpf_csum_diff_proto = {
+	.func		= bpf_csum_diff,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_STACK,
+	.arg2_type	= ARG_CONST_STACK_SIZE_OR_ZERO,
+	.arg3_type	= ARG_PTR_TO_STACK,
+	.arg4_type	= ARG_CONST_STACK_SIZE_OR_ZERO,
+	.arg5_type	= ARG_ANYTHING,
+};
+
 static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
 {
 	struct sk_buff *skb = (struct sk_buff *) (long) r1, *skb2;
@@ -1849,6 +1900,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 		return &bpf_skb_store_bytes_proto;
 	case BPF_FUNC_skb_load_bytes:
 		return &bpf_skb_load_bytes_proto;
+	case BPF_FUNC_csum_diff:
+		return &bpf_csum_diff_proto;
 	case BPF_FUNC_l3_csum_replace:
 		return &bpf_l3_csum_replace_proto;
 	case BPF_FUNC_l4_csum_replace:
-- 
cgit v1.2.3-59-g8ed1b


From 21cafc1dc2da999dabc5ed7aa94230454471fcf0 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 19 Feb 2016 23:05:24 +0100
Subject: bpf: remove artificial bpf_skb_{load, store}_bytes buffer limitation

We currently limit bpf_skb_store_bytes() and bpf_skb_load_bytes()
helpers to only store or load a maximum buffer of 16 bytes. Thus,
loading, rewriting and storing headers require several bpf_skb_load_bytes()
and bpf_skb_store_bytes() calls.

Also here we can use a per-cpu scratch buffer instead in order to not
pressure stack space any further. I do suspect that this limit was mainly
set in place for this particular reason. So, ease program development
by removing this limitation and make the scratchpad generic, so it can
be reused.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/filter.c | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index bf504f8fbe15..ea391e6be7fa 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1333,15 +1333,22 @@ int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
 	return 0;
 }
 
-#define BPF_LDST_LEN 16U
+struct bpf_scratchpad {
+	union {
+		__be32 diff[MAX_BPF_STACK / sizeof(__be32)];
+		u8     buff[MAX_BPF_STACK];
+	};
+};
+
+static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp);
 
 static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
 {
+	struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
 	struct sk_buff *skb = (struct sk_buff *) (long) r1;
 	int offset = (int) r2;
 	void *from = (void *) (long) r3;
 	unsigned int len = (unsigned int) r4;
-	char buf[BPF_LDST_LEN];
 	void *ptr;
 
 	if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM)))
@@ -1355,14 +1362,14 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
 	 *
 	 * so check for invalid 'offset' and too large 'len'
 	 */
-	if (unlikely((u32) offset > 0xffff || len > sizeof(buf)))
+	if (unlikely((u32) offset > 0xffff || len > sizeof(sp->buff)))
 		return -EFAULT;
 
 	if (unlikely(skb_cloned(skb) &&
 		     !skb_clone_writable(skb, offset + len)))
 		return -EFAULT;
 
-	ptr = skb_header_pointer(skb, offset, len, buf);
+	ptr = skb_header_pointer(skb, offset, len, sp->buff);
 	if (unlikely(!ptr))
 		return -EFAULT;
 
@@ -1371,7 +1378,7 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
 
 	memcpy(ptr, from, len);
 
-	if (ptr == buf)
+	if (ptr == sp->buff)
 		/* skb_store_bits cannot return -EFAULT here */
 		skb_store_bits(skb, offset, ptr, len);
 
@@ -1400,7 +1407,7 @@ static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 	unsigned int len = (unsigned int) r4;
 	void *ptr;
 
-	if (unlikely((u32) offset > 0xffff || len > BPF_LDST_LEN))
+	if (unlikely((u32) offset > 0xffff || len > MAX_BPF_STACK))
 		return -EFAULT;
 
 	ptr = skb_header_pointer(skb, offset, len, to);
@@ -1525,15 +1532,9 @@ const struct bpf_func_proto bpf_l4_csum_replace_proto = {
 	.arg5_type	= ARG_ANYTHING,
 };
 
-struct bpf_csum_scratchpad {
-	__be32 diff[128];
-};
-
-static DEFINE_PER_CPU(struct bpf_csum_scratchpad, bpf_csum_sp);
-
 static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed)
 {
-	struct bpf_csum_scratchpad *sp = this_cpu_ptr(&bpf_csum_sp);
+	struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
 	u64 diff_size = from_size + to_size;
 	__be32 *from = (__be32 *) (long) r1;
 	__be32 *to   = (__be32 *) (long) r3;
-- 
cgit v1.2.3-59-g8ed1b


From 3697649ff29e0f647565eed04b27a7779c646a22 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 19 Feb 2016 23:05:25 +0100
Subject: bpf: try harder on clones when writing into skb

When we're dealing with clones and the area is not writeable, try
harder and get a copy via pskb_expand_head(). Replace also other
occurences in tc actions with the new skb_try_make_writable().

Reported-by: Ashhad Sheikh <ashhadsheikh394@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  7 +++++++
 net/core/filter.c      | 19 ++++++++++---------
 net/sched/act_csum.c   |  8 ++------
 net/sched/act_nat.c    | 18 +++++-------------
 4 files changed, 24 insertions(+), 28 deletions(-)

(limited to 'net')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 89b536796e53..6a57757a86cf 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2630,6 +2630,13 @@ static inline int skb_clone_writable(const struct sk_buff *skb, unsigned int len
 	       skb_headroom(skb) + len <= skb->hdr_len;
 }
 
+static inline int skb_try_make_writable(struct sk_buff *skb,
+					unsigned int write_len)
+{
+	return skb_cloned(skb) && !skb_clone_writable(skb, write_len) &&
+	       pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+}
+
 static inline int __skb_cow(struct sk_buff *skb, unsigned int headroom,
 			    int cloned)
 {
diff --git a/net/core/filter.c b/net/core/filter.c
index ea391e6be7fa..f031b82128f3 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1364,9 +1364,7 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
 	 */
 	if (unlikely((u32) offset > 0xffff || len > sizeof(sp->buff)))
 		return -EFAULT;
-
-	if (unlikely(skb_cloned(skb) &&
-		     !skb_clone_writable(skb, offset + len)))
+	if (unlikely(skb_try_make_writable(skb, offset + len)))
 		return -EFAULT;
 
 	ptr = skb_header_pointer(skb, offset, len, sp->buff);
@@ -1439,9 +1437,7 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
 		return -EINVAL;
 	if (unlikely((u32) offset > 0xffff))
 		return -EFAULT;
-
-	if (unlikely(skb_cloned(skb) &&
-		     !skb_clone_writable(skb, offset + sizeof(sum))))
+	if (unlikely(skb_try_make_writable(skb, offset + sizeof(sum))))
 		return -EFAULT;
 
 	ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
@@ -1488,9 +1484,7 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
 		return -EINVAL;
 	if (unlikely((u32) offset > 0xffff))
 		return -EFAULT;
-
-	if (unlikely(skb_cloned(skb) &&
-		     !skb_clone_writable(skb, offset + sizeof(sum))))
+	if (unlikely(skb_try_make_writable(skb, offset + sizeof(sum))))
 		return -EFAULT;
 
 	ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
@@ -1734,6 +1728,13 @@ bool bpf_helper_changes_skb_data(void *func)
 		return true;
 	if (func == bpf_skb_vlan_pop)
 		return true;
+	if (func == bpf_skb_store_bytes)
+		return true;
+	if (func == bpf_l3_csum_replace)
+		return true;
+	if (func == bpf_l4_csum_replace)
+		return true;
+
 	return false;
 }
 
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index b07c535ba8e7..eeb3eb3ea9eb 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -105,9 +105,7 @@ static void *tcf_csum_skb_nextlayer(struct sk_buff *skb,
 	int hl = ihl + jhl;
 
 	if (!pskb_may_pull(skb, ipl + ntkoff) || (ipl < hl) ||
-	    (skb_cloned(skb) &&
-	     !skb_clone_writable(skb, hl + ntkoff) &&
-	     pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+	    skb_try_make_writable(skb, hl + ntkoff))
 		return NULL;
 	else
 		return (void *)(skb_network_header(skb) + ihl);
@@ -365,9 +363,7 @@ static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags)
 	}
 
 	if (update_flags & TCA_CSUM_UPDATE_FLAG_IPV4HDR) {
-		if (skb_cloned(skb) &&
-		    !skb_clone_writable(skb, sizeof(*iph) + ntkoff) &&
-		    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+		if (skb_try_make_writable(skb, sizeof(*iph) + ntkoff))
 			goto fail;
 
 		ip_send_check(ip_hdr(skb));
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index b7c4ead8b5a8..27607b863aba 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -126,9 +126,7 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a,
 		addr = iph->daddr;
 
 	if (!((old_addr ^ addr) & mask)) {
-		if (skb_cloned(skb) &&
-		    !skb_clone_writable(skb, sizeof(*iph) + noff) &&
-		    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+		if (skb_try_make_writable(skb, sizeof(*iph) + noff))
 			goto drop;
 
 		new_addr &= mask;
@@ -156,9 +154,7 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a,
 		struct tcphdr *tcph;
 
 		if (!pskb_may_pull(skb, ihl + sizeof(*tcph) + noff) ||
-		    (skb_cloned(skb) &&
-		     !skb_clone_writable(skb, ihl + sizeof(*tcph) + noff) &&
-		     pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+		    skb_try_make_writable(skb, ihl + sizeof(*tcph) + noff))
 			goto drop;
 
 		tcph = (void *)(skb_network_header(skb) + ihl);
@@ -171,9 +167,7 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a,
 		struct udphdr *udph;
 
 		if (!pskb_may_pull(skb, ihl + sizeof(*udph) + noff) ||
-		    (skb_cloned(skb) &&
-		     !skb_clone_writable(skb, ihl + sizeof(*udph) + noff) &&
-		     pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+		    skb_try_make_writable(skb, ihl + sizeof(*udph) + noff))
 			goto drop;
 
 		udph = (void *)(skb_network_header(skb) + ihl);
@@ -213,10 +207,8 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a,
 		if ((old_addr ^ addr) & mask)
 			break;
 
-		if (skb_cloned(skb) &&
-		    !skb_clone_writable(skb, ihl + sizeof(*icmph) +
-					     sizeof(*iph) + noff) &&
-		    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+		if (skb_try_make_writable(skb, ihl + sizeof(*icmph) +
+					  sizeof(*iph) + noff))
 			goto drop;
 
 		icmph = (void *)(skb_network_header(skb) + ihl);
-- 
cgit v1.2.3-59-g8ed1b


From 2f72959a9c1260ade234f353ccca91118151af66 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 19 Feb 2016 23:05:26 +0100
Subject: bpf: fix csum update in bpf_l4_csum_replace helper for udp

When using this helper for updating UDP checksums, we need to extend
this in order to write CSUM_MANGLED_0 for csum computations that result
into 0 as sum. Reason we need this is because packets with a checksum
could otherwise become incorrectly marked as a packet without a checksum.
Likewise, if the user indicates BPF_F_MARK_MANGLED_0, then we should
not turn packets without a checksum into ones with a checksum.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 1 +
 net/core/filter.c        | 8 +++++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 48d0a6c54609..6496f98d3d68 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -313,6 +313,7 @@ enum bpf_func_id {
 
 /* BPF_FUNC_l4_csum_replace flags. */
 #define BPF_F_PSEUDO_HDR		(1ULL << 4)
+#define BPF_F_MARK_MANGLED_0		(1ULL << 5)
 
 /* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */
 #define BPF_F_INGRESS			(1ULL << 0)
diff --git a/net/core/filter.c b/net/core/filter.c
index f031b82128f3..8a0b8c3eb189 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1477,10 +1477,12 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
 {
 	struct sk_buff *skb = (struct sk_buff *) (long) r1;
 	bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
+	bool is_mmzero = flags & BPF_F_MARK_MANGLED_0;
 	int offset = (int) r2;
 	__sum16 sum, *ptr;
 
-	if (unlikely(flags & ~(BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK)))
+	if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR |
+			       BPF_F_HDR_FIELD_MASK)))
 		return -EINVAL;
 	if (unlikely((u32) offset > 0xffff))
 		return -EFAULT;
@@ -1490,6 +1492,8 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
 	ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
 	if (unlikely(!ptr))
 		return -EFAULT;
+	if (is_mmzero && !*ptr)
+		return 0;
 
 	switch (flags & BPF_F_HDR_FIELD_MASK) {
 	case 0:
@@ -1508,6 +1512,8 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
 		return -EINVAL;
 	}
 
+	if (is_mmzero && !*ptr)
+		*ptr = CSUM_MANGLED_0;
 	if (ptr == &sum)
 		/* skb_store_bits guaranteed to not return -EFAULT here */
 		skb_store_bits(skb, offset, ptr, sizeof(sum));
-- 
cgit v1.2.3-59-g8ed1b


From 6205b9cf200d1c3dda5491666ddc33e7b70fe469 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 19 Feb 2016 23:05:27 +0100
Subject: bpf: don't emit mov A,A on return

While debugging with bpf_jit_disasm I noticed emissions of 'mov %eax,%eax',
and found that this comes from BPF_RET | BPF_A translations from classic
BPF. Emitting this is unnecessary as BPF_REG_A is mapped into BPF_REG_0
already, therefore only emit a mov when immediates are used as return value.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/filter.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index 8a0b8c3eb189..a3aba15a8025 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -530,12 +530,14 @@ do_pass:
 			*insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
 			break;
 
-		/* RET_K, RET_A are remaped into 2 insns. */
+		/* RET_K is remaped into 2 insns. RET_A case doesn't need an
+		 * extra mov as BPF_REG_0 is already mapped into BPF_REG_A.
+		 */
 		case BPF_RET | BPF_A:
 		case BPF_RET | BPF_K:
-			*insn++ = BPF_MOV32_RAW(BPF_RVAL(fp->code) == BPF_K ?
-						BPF_K : BPF_X, BPF_REG_0,
-						BPF_REG_A, fp->k);
+			if (BPF_RVAL(fp->code) == BPF_K)
+				*insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0,
+							0, fp->k);
 			*insn = BPF_EXIT_INSN();
 			break;
 
-- 
cgit v1.2.3-59-g8ed1b


From e9ccd7e39cd87cb5ff87e2b293dcd467377d93d6 Mon Sep 17 00:00:00 2001
From: Antonio Quartulli <a@unstable.cc>
Date: Tue, 12 Jan 2016 09:15:25 +0800
Subject: batman-adv: remove unused BATADV_BONDING_TQ_THRESHOLD constant

BATADV_BONDING_TQ_THRESHOLD is not used anymore since the implementation
of the bat_neigh_is_similar_or_better() API function.
Such function uses the more generic BATADV_TQ_SIMILARITY_THRESHOLD
constant.

Therefore, remove definition of the unused BATADV_BONDING_TQ_THRESHOLD
constant.

Signed-off-by: Antonio Quartulli <a@unstable.cc>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
---
 net/batman-adv/main.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 32dfc9e578af..8c01f54c61f3 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -100,11 +100,6 @@
  */
 #define BATADV_TQ_SIMILARITY_THRESHOLD 50
 
-/* how much worse secondary interfaces may be to be considered as bonding
- * candidates
- */
-#define BATADV_BONDING_TQ_THRESHOLD	50
-
 /* should not be bigger than 512 bytes or change the size of
  * forw_packet->direct_link_flags
  */
-- 
cgit v1.2.3-59-g8ed1b


From 5d9673109c7c7c6a22b7bedba22701e173e1ea0d Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 17 Jan 2016 11:01:09 +0100
Subject: batman-adv: Rename batadv_orig_node *_free_ref function to *_put

The batman-adv source code is the only place in the kernel which uses the
*_free_ref naming scheme for the *_put functions. Changing it to *_put
makes it more consistent and makes it easier to understand the connection
to the *_get functions.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/bat_iv_ogm.c            | 16 ++++++++--------
 net/batman-adv/bridge_loop_avoidance.c |  4 ++--
 net/batman-adv/distributed-arp-table.c |  4 ++--
 net/batman-adv/fragmentation.c         |  2 +-
 net/batman-adv/gateway_client.c        |  8 ++++----
 net/batman-adv/icmp_socket.c           |  2 +-
 net/batman-adv/main.c                  |  2 +-
 net/batman-adv/network-coding.c        |  4 ++--
 net/batman-adv/originator.c            |  8 ++++----
 net/batman-adv/originator.h            |  2 +-
 net/batman-adv/routing.c               | 18 +++++++++---------
 net/batman-adv/send.c                  |  2 +-
 net/batman-adv/translation-table.c     | 14 +++++++-------
 13 files changed, 43 insertions(+), 43 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index bf0e7d6f12bb..32df38bfff25 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -287,8 +287,8 @@ batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr)
 
 free_orig_node:
 	/* free twice, as batadv_orig_node_new sets refcount to 2 */
-	batadv_orig_node_free_ref(orig_node);
-	batadv_orig_node_free_ref(orig_node);
+	batadv_orig_node_put(orig_node);
+	batadv_orig_node_put(orig_node);
 
 	return NULL;
 }
@@ -1041,7 +1041,7 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv,
 						     ethhdr->h_source,
 						     orig_node, orig_tmp);
 
-		batadv_orig_node_free_ref(orig_tmp);
+		batadv_orig_node_put(orig_tmp);
 		if (!neigh_node)
 			goto unlock;
 	} else {
@@ -1306,7 +1306,7 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr,
 
 	orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing);
 	if (WARN_ON(!orig_ifinfo)) {
-		batadv_orig_node_free_ref(orig_node);
+		batadv_orig_node_put(orig_node);
 		return 0;
 	}
 
@@ -1367,7 +1367,7 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr,
 
 out:
 	spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock);
-	batadv_orig_node_free_ref(orig_node);
+	batadv_orig_node_put(orig_node);
 	batadv_orig_ifinfo_free_ref(orig_ifinfo);
 	return ret;
 }
@@ -1563,7 +1563,7 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset,
 
 out_neigh:
 	if ((orig_neigh_node) && (!is_single_hop_neigh))
-		batadv_orig_node_free_ref(orig_neigh_node);
+		batadv_orig_node_put(orig_neigh_node);
 out:
 	if (router_ifinfo)
 		batadv_neigh_ifinfo_free_ref(router_ifinfo);
@@ -1697,7 +1697,7 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset,
 
 		batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
 			   "Drop packet: originator packet from myself (via neighbor)\n");
-		batadv_orig_node_free_ref(orig_neigh_node);
+		batadv_orig_node_put(orig_neigh_node);
 		return;
 	}
 
@@ -1735,7 +1735,7 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset,
 	}
 	rcu_read_unlock();
 
-	batadv_orig_node_free_ref(orig_node);
+	batadv_orig_node_put(orig_node);
 }
 
 static int batadv_iv_ogm_receive(struct sk_buff *skb,
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index 7781f39c174d..49008d98a8aa 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -486,7 +486,7 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig,
 	if (orig_node) {
 		batadv_tt_global_del_orig(bat_priv, orig_node, vid,
 					  "became a backbone gateway");
-		batadv_orig_node_free_ref(orig_node);
+		batadv_orig_node_put(orig_node);
 	}
 
 	if (own_backbone) {
@@ -965,7 +965,7 @@ static int batadv_check_claim_group(struct batadv_priv *bat_priv,
 		bla_dst_own->group = bla_dst->group;
 	}
 
-	batadv_orig_node_free_ref(orig_node);
+	batadv_orig_node_put(orig_node);
 
 	return 2;
 }
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index e3261118130a..8f466e782bc7 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -547,7 +547,7 @@ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv,
 
 			max = tmp_max;
 			if (max_orig_node)
-				batadv_orig_node_free_ref(max_orig_node);
+				batadv_orig_node_put(max_orig_node);
 			max_orig_node = orig_node;
 		}
 		rcu_read_unlock();
@@ -676,7 +676,7 @@ static bool batadv_dat_send_data(struct batadv_priv *bat_priv,
 free_neigh:
 		batadv_neigh_node_free_ref(neigh_node);
 free_orig:
-		batadv_orig_node_free_ref(cand[i].orig_node);
+		batadv_orig_node_put(cand[i].orig_node);
 	}
 
 out:
diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index 55656e84bc7e..67cb224981fb 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -385,7 +385,7 @@ bool batadv_frag_skb_fwd(struct sk_buff *skb,
 
 out:
 	if (orig_node_dst)
-		batadv_orig_node_free_ref(orig_node_dst);
+		batadv_orig_node_put(orig_node_dst);
 	if (neigh_node)
 		batadv_neigh_node_free_ref(neigh_node);
 	return ret;
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index 4b598bd531ea..9cb098f79155 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -71,7 +71,7 @@ static void batadv_gw_node_release(struct kref *ref)
 
 	gw_node = container_of(ref, struct batadv_gw_node, refcount);
 
-	batadv_orig_node_free_ref(gw_node->orig_node);
+	batadv_orig_node_put(gw_node->orig_node);
 	kfree_rcu(gw_node, rcu);
 }
 
@@ -415,7 +415,7 @@ reselect:
 	batadv_gw_reselect(bat_priv);
 out:
 	if (curr_gw_orig)
-		batadv_orig_node_free_ref(curr_gw_orig);
+		batadv_orig_node_put(curr_gw_orig);
 	if (router_gw)
 		batadv_neigh_node_free_ref(router_gw);
 	if (router_orig)
@@ -446,7 +446,7 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv,
 
 	gw_node = kzalloc(sizeof(*gw_node), GFP_ATOMIC);
 	if (!gw_node) {
-		batadv_orig_node_free_ref(orig_node);
+		batadv_orig_node_put(orig_node);
 		return;
 	}
 
@@ -879,7 +879,7 @@ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv,
 
 out:
 	if (orig_dst_node)
-		batadv_orig_node_free_ref(orig_dst_node);
+		batadv_orig_node_put(orig_dst_node);
 	if (curr_gw)
 		batadv_gw_node_free_ref(curr_gw);
 	if (gw_node)
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
index a69da37bbad5..7d944af59ef0 100644
--- a/net/batman-adv/icmp_socket.c
+++ b/net/batman-adv/icmp_socket.c
@@ -292,7 +292,7 @@ out:
 	if (neigh_node)
 		batadv_neigh_node_free_ref(neigh_node);
 	if (orig_node)
-		batadv_orig_node_free_ref(orig_node);
+		batadv_orig_node_put(orig_node);
 	return len;
 }
 
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 568c5503f637..5b573d1c8fcc 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -1212,7 +1212,7 @@ void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src,
 	if (batadv_send_skb_to_orig(skb, orig_node, NULL) == NET_XMIT_DROP)
 		kfree_skb(skb);
 out:
-	batadv_orig_node_free_ref(orig_node);
+	batadv_orig_node_put(orig_node);
 }
 
 /**
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index a4eb8ee4abb1..00f74d40d81a 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -218,7 +218,7 @@ static void batadv_nc_node_release(struct kref *ref)
 
 	nc_node = container_of(ref, struct batadv_nc_node, refcount);
 
-	batadv_orig_node_free_ref(nc_node->orig_node);
+	batadv_orig_node_put(nc_node->orig_node);
 	kfree_rcu(nc_node, rcu);
 }
 
@@ -1372,7 +1372,7 @@ batadv_nc_skb_src_search(struct batadv_priv *bat_priv,
 	}
 	rcu_read_unlock();
 
-	batadv_orig_node_free_ref(orig_node);
+	batadv_orig_node_put(orig_node);
 	return nc_packet;
 }
 
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index eacd0e5a0238..4df9be423583 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -810,11 +810,11 @@ static void batadv_orig_node_release(struct kref *ref)
 }
 
 /**
- * batadv_orig_node_free_ref - decrement the orig node refcounter and possibly
+ * batadv_orig_node_put - decrement the orig node refcounter and possibly
  *  release it
  * @orig_node: the orig node to free
  */
-void batadv_orig_node_free_ref(struct batadv_orig_node *orig_node)
+void batadv_orig_node_put(struct batadv_orig_node *orig_node)
 {
 	kref_put(&orig_node->refcount, batadv_orig_node_release);
 }
@@ -843,7 +843,7 @@ void batadv_originator_free(struct batadv_priv *bat_priv)
 		hlist_for_each_entry_safe(orig_node, node_tmp,
 					  head, hash_entry) {
 			hlist_del_rcu(&orig_node->hash_entry);
-			batadv_orig_node_free_ref(orig_node);
+			batadv_orig_node_put(orig_node);
 		}
 		spin_unlock_bh(list_lock);
 	}
@@ -1204,7 +1204,7 @@ static void _batadv_purge_orig(struct batadv_priv *bat_priv)
 				batadv_tt_global_del_orig(orig_node->bat_priv,
 							  orig_node, -1,
 							  "originator timed out");
-				batadv_orig_node_free_ref(orig_node);
+				batadv_orig_node_put(orig_node);
 				continue;
 			}
 
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index 99507408f4cf..c151f6a8f5ec 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -37,7 +37,7 @@ int batadv_compare_orig(const struct hlist_node *node, const void *data2);
 int batadv_originator_init(struct batadv_priv *bat_priv);
 void batadv_originator_free(struct batadv_priv *bat_priv);
 void batadv_purge_orig_ref(struct batadv_priv *bat_priv);
-void batadv_orig_node_free_ref(struct batadv_orig_node *orig_node);
+void batadv_orig_node_put(struct batadv_orig_node *orig_node);
 struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv,
 					      const u8 *addr);
 struct batadv_hardif_neigh_node *
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index 205310b56c2b..d2c490df963b 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -271,7 +271,7 @@ out:
 	if (primary_if)
 		batadv_hardif_free_ref(primary_if);
 	if (orig_node)
-		batadv_orig_node_free_ref(orig_node);
+		batadv_orig_node_put(orig_node);
 	return ret;
 }
 
@@ -319,7 +319,7 @@ out:
 	if (primary_if)
 		batadv_hardif_free_ref(primary_if);
 	if (orig_node)
-		batadv_orig_node_free_ref(orig_node);
+		batadv_orig_node_put(orig_node);
 	return ret;
 }
 
@@ -403,7 +403,7 @@ int batadv_recv_icmp_packet(struct sk_buff *skb,
 
 out:
 	if (orig_node)
-		batadv_orig_node_free_ref(orig_node);
+		batadv_orig_node_put(orig_node);
 	return ret;
 }
 
@@ -649,7 +649,7 @@ static int batadv_route_unicast_packet(struct sk_buff *skb,
 
 out:
 	if (orig_node)
-		batadv_orig_node_free_ref(orig_node);
+		batadv_orig_node_put(orig_node);
 	return ret;
 }
 
@@ -704,7 +704,7 @@ out:
 	if (primary_if)
 		batadv_hardif_free_ref(primary_if);
 	if (orig_node)
-		batadv_orig_node_free_ref(orig_node);
+		batadv_orig_node_put(orig_node);
 
 	return ret;
 }
@@ -768,7 +768,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv,
 			return 0;
 
 		curr_ttvn = (u8)atomic_read(&orig_node->last_ttvn);
-		batadv_orig_node_free_ref(orig_node);
+		batadv_orig_node_put(orig_node);
 	}
 
 	/* check if the TTVN contained in the packet is fresher than what the
@@ -908,7 +908,7 @@ int batadv_recv_unicast_packet(struct sk_buff *skb,
 
 rx_success:
 		if (orig_node)
-			batadv_orig_node_free_ref(orig_node);
+			batadv_orig_node_put(orig_node);
 
 		return NET_RX_SUCCESS;
 	}
@@ -1019,7 +1019,7 @@ int batadv_recv_frag_packet(struct sk_buff *skb,
 
 out:
 	if (orig_node_src)
-		batadv_orig_node_free_ref(orig_node_src);
+		batadv_orig_node_put(orig_node_src);
 
 	return ret;
 }
@@ -1124,6 +1124,6 @@ spin_unlock:
 	spin_unlock_bh(&orig_node->bcast_seqno_lock);
 out:
 	if (orig_node)
-		batadv_orig_node_free_ref(orig_node);
+		batadv_orig_node_put(orig_node);
 	return ret;
 }
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index d8b03fd604e0..261309ea5fec 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -317,7 +317,7 @@ int batadv_send_skb_unicast(struct batadv_priv *bat_priv,
 
 out:
 	if (orig_node)
-		batadv_orig_node_free_ref(orig_node);
+		batadv_orig_node_put(orig_node);
 	if (ret == NET_XMIT_DROP)
 		kfree_skb(skb);
 	return ret;
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 5c7fa02ea57b..107532c32c57 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -392,7 +392,7 @@ static void batadv_tt_orig_list_entry_release(struct kref *ref)
 	orig_entry = container_of(ref, struct batadv_tt_orig_list_entry,
 				  refcount);
 
-	batadv_orig_node_free_ref(orig_entry->orig_node);
+	batadv_orig_node_put(orig_entry->orig_node);
 	kfree_rcu(orig_entry, rcu);
 }
 
@@ -2776,9 +2776,9 @@ unlock:
 
 out:
 	if (res_dst_orig_node)
-		batadv_orig_node_free_ref(res_dst_orig_node);
+		batadv_orig_node_put(res_dst_orig_node);
 	if (req_dst_orig_node)
-		batadv_orig_node_free_ref(req_dst_orig_node);
+		batadv_orig_node_put(req_dst_orig_node);
 	kfree(tvlv_tt_data);
 	return ret;
 }
@@ -2893,7 +2893,7 @@ unlock:
 out:
 	spin_unlock_bh(&bat_priv->tt.commit_lock);
 	if (orig_node)
-		batadv_orig_node_free_ref(orig_node);
+		batadv_orig_node_put(orig_node);
 	if (primary_if)
 		batadv_hardif_free_ref(primary_if);
 	kfree(tvlv_tt_data);
@@ -2981,7 +2981,7 @@ static void batadv_tt_fill_gtable(struct batadv_priv *bat_priv,
 
 out:
 	if (orig_node)
-		batadv_orig_node_free_ref(orig_node);
+		batadv_orig_node_put(orig_node);
 }
 
 static void batadv_tt_update_changes(struct batadv_priv *bat_priv,
@@ -3087,7 +3087,7 @@ static void batadv_handle_tt_response(struct batadv_priv *bat_priv,
 	spin_unlock_bh(&bat_priv->tt.req_list_lock);
 out:
 	if (orig_node)
-		batadv_orig_node_free_ref(orig_node);
+		batadv_orig_node_put(orig_node);
 }
 
 static void batadv_tt_roam_list_free(struct batadv_priv *bat_priv)
@@ -3802,7 +3802,7 @@ static int batadv_roam_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv,
 
 out:
 	if (orig_node)
-		batadv_orig_node_free_ref(orig_node);
+		batadv_orig_node_put(orig_node);
 	return NET_RX_SUCCESS;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 82047ad7fecdc2d58bbeeb3872811b1c528fdf10 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 17 Jan 2016 11:01:10 +0100
Subject: batman-adv: Rename batadv_hardif *_free_ref function to *_put

The batman-adv source code is the only place in the kernel which uses the
*_free_ref naming scheme for the *_put functions. Changing it to *_put
makes it more consistent and makes it easier to understand the connection
to the *_get functions.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/bat_iv_ogm.c            | 10 +++++-----
 net/batman-adv/bridge_loop_avoidance.c | 18 +++++++++---------
 net/batman-adv/distributed-arp-table.c |  2 +-
 net/batman-adv/fragmentation.c         |  2 +-
 net/batman-adv/gateway_client.c        |  2 +-
 net/batman-adv/hard-interface.c        | 20 ++++++++++----------
 net/batman-adv/hard-interface.h        |  4 ++--
 net/batman-adv/icmp_socket.c           |  2 +-
 net/batman-adv/main.c                  |  2 +-
 net/batman-adv/network-coding.c        |  2 +-
 net/batman-adv/originator.c            | 16 ++++++++--------
 net/batman-adv/routing.c               |  8 ++++----
 net/batman-adv/send.c                  |  8 ++++----
 net/batman-adv/soft-interface.c        |  6 +++---
 net/batman-adv/sysfs.c                 |  8 ++++----
 net/batman-adv/translation-table.c     | 10 +++++-----
 16 files changed, 60 insertions(+), 60 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 32df38bfff25..f01ee23e35e6 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -515,7 +515,7 @@ static void batadv_iv_ogm_emit(struct batadv_forw_packet *forw_packet)
 
 out:
 	if (primary_if)
-		batadv_hardif_free_ref(primary_if);
+		batadv_hardif_put(primary_if);
 }
 
 /**
@@ -617,7 +617,7 @@ batadv_iv_ogm_can_aggregate(const struct batadv_ogm_packet *new_bat_ogm_packet,
 
 out:
 	if (primary_if)
-		batadv_hardif_free_ref(primary_if);
+		batadv_hardif_put(primary_if);
 	return res;
 }
 
@@ -711,9 +711,9 @@ out_nomem:
 	if (!own_packet)
 		atomic_inc(&bat_priv->batman_queue_left);
 out_free_outgoing:
-	batadv_hardif_free_ref(if_outgoing);
+	batadv_hardif_put(if_outgoing);
 out_free_incoming:
-	batadv_hardif_free_ref(if_incoming);
+	batadv_hardif_put(if_incoming);
 }
 
 /* aggregate a new packet into the existing ogm packet */
@@ -958,7 +958,7 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface)
 
 out:
 	if (primary_if)
-		batadv_hardif_free_ref(primary_if);
+		batadv_hardif_put(primary_if);
 }
 
 /**
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index 49008d98a8aa..b7167cf3e907 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -424,7 +424,7 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac,
 	netif_rx(skb);
 out:
 	if (primary_if)
-		batadv_hardif_free_ref(primary_if);
+		batadv_hardif_put(primary_if);
 }
 
 /**
@@ -1282,7 +1282,7 @@ void batadv_bla_status_update(struct net_device *net_dev)
 	 * so just call that one.
 	 */
 	batadv_bla_update_orig_address(bat_priv, primary_if, primary_if);
-	batadv_hardif_free_ref(primary_if);
+	batadv_hardif_put(primary_if);
 }
 
 /**
@@ -1356,7 +1356,7 @@ static void batadv_bla_periodic_work(struct work_struct *work)
 	}
 out:
 	if (primary_if)
-		batadv_hardif_free_ref(primary_if);
+		batadv_hardif_put(primary_if);
 
 	queue_delayed_work(batadv_event_workqueue, &bat_priv->bla.work,
 			   msecs_to_jiffies(BATADV_BLA_PERIOD_LENGTH));
@@ -1395,7 +1395,7 @@ int batadv_bla_init(struct batadv_priv *bat_priv)
 	if (primary_if) {
 		crc = crc16(0, primary_if->net_dev->dev_addr, ETH_ALEN);
 		bat_priv->bla.claim_dest.group = htons(crc);
-		batadv_hardif_free_ref(primary_if);
+		batadv_hardif_put(primary_if);
 	} else {
 		bat_priv->bla.claim_dest.group = 0; /* will be set later */
 	}
@@ -1599,7 +1599,7 @@ void batadv_bla_free(struct batadv_priv *bat_priv)
 		bat_priv->bla.backbone_hash = NULL;
 	}
 	if (primary_if)
-		batadv_hardif_free_ref(primary_if);
+		batadv_hardif_put(primary_if);
 }
 
 /**
@@ -1692,7 +1692,7 @@ handled:
 
 out:
 	if (primary_if)
-		batadv_hardif_free_ref(primary_if);
+		batadv_hardif_put(primary_if);
 	if (claim)
 		batadv_claim_free_ref(claim);
 	return ret;
@@ -1781,7 +1781,7 @@ handled:
 	ret = 1;
 out:
 	if (primary_if)
-		batadv_hardif_free_ref(primary_if);
+		batadv_hardif_put(primary_if);
 	if (claim)
 		batadv_claim_free_ref(claim);
 	return ret;
@@ -1839,7 +1839,7 @@ int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset)
 	}
 out:
 	if (primary_if)
-		batadv_hardif_free_ref(primary_if);
+		batadv_hardif_put(primary_if);
 	return 0;
 }
 
@@ -1904,6 +1904,6 @@ int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset)
 	}
 out:
 	if (primary_if)
-		batadv_hardif_free_ref(primary_if);
+		batadv_hardif_put(primary_if);
 	return 0;
 }
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index 8f466e782bc7..00408d6231b6 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -840,7 +840,7 @@ int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset)
 
 out:
 	if (primary_if)
-		batadv_hardif_free_ref(primary_if);
+		batadv_hardif_put(primary_if);
 	return 0;
 }
 
diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index 67cb224981fb..d13c5b601b67 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -512,7 +512,7 @@ bool batadv_frag_send_packet(struct sk_buff *skb,
 
 out_err:
 	if (primary_if)
-		batadv_hardif_free_ref(primary_if);
+		batadv_hardif_put(primary_if);
 
 	return ret;
 }
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index 9cb098f79155..7bf84c136205 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -663,7 +663,7 @@ int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset)
 
 out:
 	if (primary_if)
-		batadv_hardif_free_ref(primary_if);
+		batadv_hardif_put(primary_if);
 	return 0;
 }
 
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index e2aaa4cc39a7..b22b2775a0a5 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -226,7 +226,7 @@ static void batadv_primary_if_update_addr(struct batadv_priv *bat_priv,
 	batadv_bla_update_orig_address(bat_priv, primary_if, oldif);
 out:
 	if (primary_if)
-		batadv_hardif_free_ref(primary_if);
+		batadv_hardif_put(primary_if);
 }
 
 static void batadv_primary_if_select(struct batadv_priv *bat_priv,
@@ -250,7 +250,7 @@ static void batadv_primary_if_select(struct batadv_priv *bat_priv,
 
 out:
 	if (curr_hard_iface)
-		batadv_hardif_free_ref(curr_hard_iface);
+		batadv_hardif_put(curr_hard_iface);
 }
 
 static bool
@@ -409,7 +409,7 @@ batadv_hardif_activate_interface(struct batadv_hard_iface *hard_iface)
 
 out:
 	if (primary_if)
-		batadv_hardif_free_ref(primary_if);
+		batadv_hardif_put(primary_if);
 }
 
 static void
@@ -562,7 +562,7 @@ err_dev:
 	hard_iface->soft_iface = NULL;
 	dev_put(soft_iface);
 err:
-	batadv_hardif_free_ref(hard_iface);
+	batadv_hardif_put(hard_iface);
 	return ret;
 }
 
@@ -593,7 +593,7 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface,
 		batadv_primary_if_select(bat_priv, new_if);
 
 		if (new_if)
-			batadv_hardif_free_ref(new_if);
+			batadv_hardif_put(new_if);
 	}
 
 	bat_priv->bat_algo_ops->bat_iface_disable(hard_iface);
@@ -616,11 +616,11 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface,
 	}
 
 	hard_iface->soft_iface = NULL;
-	batadv_hardif_free_ref(hard_iface);
+	batadv_hardif_put(hard_iface);
 
 out:
 	if (primary_if)
-		batadv_hardif_free_ref(primary_if);
+		batadv_hardif_put(primary_if);
 }
 
 /**
@@ -639,7 +639,7 @@ static void batadv_hardif_remove_interface_finish(struct work_struct *work)
 
 	batadv_debugfs_del_hardif(hard_iface);
 	batadv_sysfs_del_hardif(&hard_iface->hardif_obj);
-	batadv_hardif_free_ref(hard_iface);
+	batadv_hardif_put(hard_iface);
 }
 
 static struct batadv_hard_iface *
@@ -794,10 +794,10 @@ static int batadv_hard_if_event(struct notifier_block *this,
 	}
 
 hardif_put:
-	batadv_hardif_free_ref(hard_iface);
+	batadv_hardif_put(hard_iface);
 out:
 	if (primary_if)
-		batadv_hardif_free_ref(primary_if);
+		batadv_hardif_put(primary_if);
 	return NOTIFY_DONE;
 }
 
diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h
index 5cecc6bc1b1e..d74f1983f33e 100644
--- a/net/batman-adv/hard-interface.h
+++ b/net/batman-adv/hard-interface.h
@@ -64,11 +64,11 @@ void batadv_update_min_mtu(struct net_device *soft_iface);
 void batadv_hardif_release(struct kref *ref);
 
 /**
- * batadv_hardif_free_ref - decrement the hard interface refcounter and possibly
+ * batadv_hardif_put - decrement the hard interface refcounter and possibly
  *  release it
  * @hard_iface: the hard interface to free
  */
-static inline void batadv_hardif_free_ref(struct batadv_hard_iface *hard_iface)
+static inline void batadv_hardif_put(struct batadv_hard_iface *hard_iface)
 {
 	kref_put(&hard_iface->refcount, batadv_hardif_release);
 }
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
index 7d944af59ef0..292a648c2f09 100644
--- a/net/batman-adv/icmp_socket.c
+++ b/net/batman-adv/icmp_socket.c
@@ -288,7 +288,7 @@ free_skb:
 	kfree_skb(skb);
 out:
 	if (primary_if)
-		batadv_hardif_free_ref(primary_if);
+		batadv_hardif_put(primary_if);
 	if (neigh_node)
 		batadv_neigh_node_free_ref(neigh_node);
 	if (orig_node)
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 5b573d1c8fcc..476554b3adce 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -287,7 +287,7 @@ batadv_seq_print_text_primary_if_get(struct seq_file *seq)
 	seq_printf(seq,
 		   "BATMAN mesh %s disabled - primary interface not active\n",
 		   net_dev->name);
-	batadv_hardif_free_ref(primary_if);
+	batadv_hardif_put(primary_if);
 	primary_if = NULL;
 
 out:
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index 00f74d40d81a..3a42e26b2240 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -1950,7 +1950,7 @@ int batadv_nc_nodes_seq_print_text(struct seq_file *seq, void *offset)
 
 out:
 	if (primary_if)
-		batadv_hardif_free_ref(primary_if);
+		batadv_hardif_put(primary_if);
 	return 0;
 }
 
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 4df9be423583..3e959f48f845 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -196,7 +196,7 @@ static void batadv_neigh_ifinfo_release(struct kref *ref)
 	neigh_ifinfo = container_of(ref, struct batadv_neigh_ifinfo, refcount);
 
 	if (neigh_ifinfo->if_outgoing != BATADV_IF_DEFAULT)
-		batadv_hardif_free_ref(neigh_ifinfo->if_outgoing);
+		batadv_hardif_put(neigh_ifinfo->if_outgoing);
 
 	kfree_rcu(neigh_ifinfo, rcu);
 }
@@ -227,7 +227,7 @@ static void batadv_hardif_neigh_release(struct kref *ref)
 	hlist_del_init_rcu(&hardif_neigh->list);
 	spin_unlock_bh(&hardif_neigh->if_incoming->neigh_list_lock);
 
-	batadv_hardif_free_ref(hardif_neigh->if_incoming);
+	batadv_hardif_put(hardif_neigh->if_incoming);
 	kfree_rcu(hardif_neigh, rcu);
 }
 
@@ -273,7 +273,7 @@ static void batadv_neigh_node_release(struct kref *ref)
 	if (bao->bat_neigh_free)
 		bao->bat_neigh_free(neigh_node);
 
-	batadv_hardif_free_ref(neigh_node->if_incoming);
+	batadv_hardif_put(neigh_node->if_incoming);
 
 	kfree_rcu(neigh_node, rcu);
 }
@@ -544,7 +544,7 @@ batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface,
 
 	hardif_neigh = kzalloc(sizeof(*hardif_neigh), GFP_ATOMIC);
 	if (!hardif_neigh) {
-		batadv_hardif_free_ref(hard_iface);
+		batadv_hardif_put(hard_iface);
 		goto out;
 	}
 
@@ -707,7 +707,7 @@ int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset)
 		   primary_if->net_dev->dev_addr, net_dev->name,
 		   bat_priv->bat_algo_ops->name);
 
-	batadv_hardif_free_ref(primary_if);
+	batadv_hardif_put(primary_if);
 
 	if (!bat_priv->bat_algo_ops->bat_neigh_print) {
 		seq_puts(seq,
@@ -732,7 +732,7 @@ static void batadv_orig_ifinfo_release(struct kref *ref)
 	orig_ifinfo = container_of(ref, struct batadv_orig_ifinfo, refcount);
 
 	if (orig_ifinfo->if_outgoing != BATADV_IF_DEFAULT)
-		batadv_hardif_free_ref(orig_ifinfo->if_outgoing);
+		batadv_hardif_put(orig_ifinfo->if_outgoing);
 
 	/* this is the last reference to this object */
 	router = rcu_dereference_protected(orig_ifinfo->router, true);
@@ -1250,7 +1250,7 @@ int batadv_orig_seq_print_text(struct seq_file *seq, void *offset)
 		   primary_if->net_dev->dev_addr, net_dev->name,
 		   bat_priv->bat_algo_ops->name);
 
-	batadv_hardif_free_ref(primary_if);
+	batadv_hardif_put(primary_if);
 
 	if (!bat_priv->bat_algo_ops->bat_orig_print) {
 		seq_puts(seq,
@@ -1306,7 +1306,7 @@ int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset)
 
 out:
 	if (hard_iface)
-		batadv_hardif_free_ref(hard_iface);
+		batadv_hardif_put(hard_iface);
 	return 0;
 }
 
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index d2c490df963b..a14ca42e927e 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -269,7 +269,7 @@ static int batadv_recv_my_icmp_packet(struct batadv_priv *bat_priv,
 	}
 out:
 	if (primary_if)
-		batadv_hardif_free_ref(primary_if);
+		batadv_hardif_put(primary_if);
 	if (orig_node)
 		batadv_orig_node_put(orig_node);
 	return ret;
@@ -317,7 +317,7 @@ static int batadv_recv_icmp_ttl_exceeded(struct batadv_priv *bat_priv,
 
 out:
 	if (primary_if)
-		batadv_hardif_free_ref(primary_if);
+		batadv_hardif_put(primary_if);
 	if (orig_node)
 		batadv_orig_node_put(orig_node);
 	return ret;
@@ -702,7 +702,7 @@ batadv_reroute_unicast_packet(struct batadv_priv *bat_priv,
 	ret = true;
 out:
 	if (primary_if)
-		batadv_hardif_free_ref(primary_if);
+		batadv_hardif_put(primary_if);
 	if (orig_node)
 		batadv_orig_node_put(orig_node);
 
@@ -808,7 +808,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv,
 
 	ether_addr_copy(unicast_packet->dest, primary_if->net_dev->dev_addr);
 
-	batadv_hardif_free_ref(primary_if);
+	batadv_hardif_put(primary_if);
 
 	unicast_packet->ttvn = curr_ttvn;
 
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index 261309ea5fec..c9cafc0f2d1b 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -246,7 +246,7 @@ bool batadv_send_skb_prepare_unicast_4addr(struct batadv_priv *bat_priv,
 	ret = true;
 out:
 	if (primary_if)
-		batadv_hardif_free_ref(primary_if);
+		batadv_hardif_put(primary_if);
 	return ret;
 }
 
@@ -409,9 +409,9 @@ static void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet)
 {
 	kfree_skb(forw_packet->skb);
 	if (forw_packet->if_incoming)
-		batadv_hardif_free_ref(forw_packet->if_incoming);
+		batadv_hardif_put(forw_packet->if_incoming);
 	if (forw_packet->if_outgoing)
-		batadv_hardif_free_ref(forw_packet->if_outgoing);
+		batadv_hardif_put(forw_packet->if_outgoing);
 	kfree(forw_packet);
 }
 
@@ -497,7 +497,7 @@ out_and_inc:
 	atomic_inc(&bat_priv->bcast_queue_left);
 out:
 	if (primary_if)
-		batadv_hardif_free_ref(primary_if);
+		batadv_hardif_put(primary_if);
 	return NETDEV_TX_BUSY;
 }
 
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index d4490ff75edd..ca8fa4e28d4f 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -377,7 +377,7 @@ dropped_freed:
 	batadv_inc_counter(bat_priv, BATADV_CNT_TX_DROPPED);
 end:
 	if (primary_if)
-		batadv_hardif_free_ref(primary_if);
+		batadv_hardif_put(primary_if);
 	return NETDEV_TX_OK;
 }
 
@@ -878,7 +878,7 @@ static int batadv_softif_slave_add(struct net_device *dev,
 
 out:
 	if (hard_iface)
-		batadv_hardif_free_ref(hard_iface);
+		batadv_hardif_put(hard_iface);
 	return ret;
 }
 
@@ -905,7 +905,7 @@ static int batadv_softif_slave_del(struct net_device *dev,
 
 out:
 	if (hard_iface)
-		batadv_hardif_free_ref(hard_iface);
+		batadv_hardif_put(hard_iface);
 	return ret;
 }
 
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index ab4382ba3855..0db7591a000f 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -771,7 +771,7 @@ static ssize_t batadv_show_mesh_iface(struct kobject *kobj,
 
 	length = sprintf(buff, "%s\n", ifname);
 
-	batadv_hardif_free_ref(hard_iface);
+	batadv_hardif_put(hard_iface);
 
 	return length;
 }
@@ -795,7 +795,7 @@ static ssize_t batadv_store_mesh_iface(struct kobject *kobj,
 	if (strlen(buff) >= IFNAMSIZ) {
 		pr_err("Invalid parameter for 'mesh_iface' setting received: interface name too long '%s'\n",
 		       buff);
-		batadv_hardif_free_ref(hard_iface);
+		batadv_hardif_put(hard_iface);
 		return -EINVAL;
 	}
 
@@ -829,7 +829,7 @@ static ssize_t batadv_store_mesh_iface(struct kobject *kobj,
 unlock:
 	rtnl_unlock();
 out:
-	batadv_hardif_free_ref(hard_iface);
+	batadv_hardif_put(hard_iface);
 	return ret;
 }
 
@@ -863,7 +863,7 @@ static ssize_t batadv_show_iface_status(struct kobject *kobj,
 		break;
 	}
 
-	batadv_hardif_free_ref(hard_iface);
+	batadv_hardif_put(hard_iface);
 
 	return length;
 }
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 107532c32c57..c6c633828bc8 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -1060,7 +1060,7 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset)
 	}
 out:
 	if (primary_if)
-		batadv_hardif_free_ref(primary_if);
+		batadv_hardif_put(primary_if);
 	return 0;
 }
 
@@ -1725,7 +1725,7 @@ int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset)
 	}
 out:
 	if (primary_if)
-		batadv_hardif_free_ref(primary_if);
+		batadv_hardif_put(primary_if);
 	return 0;
 }
 
@@ -2638,7 +2638,7 @@ static int batadv_send_tt_request(struct batadv_priv *bat_priv,
 
 out:
 	if (primary_if)
-		batadv_hardif_free_ref(primary_if);
+		batadv_hardif_put(primary_if);
 	if (ret && tt_req_node) {
 		spin_lock_bh(&bat_priv->tt.req_list_lock);
 		/* hlist_del_init() verifies tt_req_node still is in the list */
@@ -2895,7 +2895,7 @@ out:
 	if (orig_node)
 		batadv_orig_node_put(orig_node);
 	if (primary_if)
-		batadv_hardif_free_ref(primary_if);
+		batadv_hardif_put(primary_if);
 	kfree(tvlv_tt_data);
 	/* The packet was for this host, so it doesn't need to be re-routed */
 	return true;
@@ -3218,7 +3218,7 @@ static void batadv_send_roam_adv(struct batadv_priv *bat_priv, u8 *client,
 
 out:
 	if (primary_if)
-		batadv_hardif_free_ref(primary_if);
+		batadv_hardif_put(primary_if);
 }
 
 static void batadv_tt_purge(struct work_struct *work)
-- 
cgit v1.2.3-59-g8ed1b


From 25bb250996eea422171ede0ada8814188dda8937 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 17 Jan 2016 11:01:11 +0100
Subject: batman-adv: Rename batadv_neigh_node *_free_ref function to *_put

The batman-adv source code is the only place in the kernel which uses the
*_free_ref naming scheme for the *_put functions. Changing it to *_put
makes it more consistent and makes it easier to understand the connection
to the *_get functions.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/bat_iv_ogm.c            | 16 ++++++++--------
 net/batman-adv/distributed-arp-table.c |  2 +-
 net/batman-adv/fragmentation.c         |  2 +-
 net/batman-adv/gateway_client.c        | 14 +++++++-------
 net/batman-adv/icmp_socket.c           |  2 +-
 net/batman-adv/network-coding.c        |  4 ++--
 net/batman-adv/originator.c            | 16 ++++++++--------
 net/batman-adv/originator.h            |  2 +-
 net/batman-adv/routing.c               | 14 +++++++-------
 net/batman-adv/send.c                  |  2 +-
 net/batman-adv/translation-table.c     |  6 +++---
 11 files changed, 40 insertions(+), 40 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index f01ee23e35e6..0f8742b4bfaa 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -1005,7 +1005,7 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv,
 		    tmp_neigh_node->if_incoming == if_incoming &&
 		    kref_get_unless_zero(&tmp_neigh_node->refcount)) {
 			if (WARN(neigh_node, "too many matching neigh_nodes"))
-				batadv_neigh_node_free_ref(neigh_node);
+				batadv_neigh_node_put(neigh_node);
 			neigh_node = tmp_neigh_node;
 			continue;
 		}
@@ -1116,9 +1116,9 @@ unlock:
 	rcu_read_unlock();
 out:
 	if (neigh_node)
-		batadv_neigh_node_free_ref(neigh_node);
+		batadv_neigh_node_put(neigh_node);
 	if (router)
-		batadv_neigh_node_free_ref(router);
+		batadv_neigh_node_put(router);
 	if (neigh_ifinfo)
 		batadv_neigh_ifinfo_free_ref(neigh_ifinfo);
 	if (router_ifinfo)
@@ -1265,7 +1265,7 @@ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
 
 out:
 	if (neigh_node)
-		batadv_neigh_node_free_ref(neigh_node);
+		batadv_neigh_node_put(neigh_node);
 	return ret;
 }
 
@@ -1568,11 +1568,11 @@ out:
 	if (router_ifinfo)
 		batadv_neigh_ifinfo_free_ref(router_ifinfo);
 	if (router)
-		batadv_neigh_node_free_ref(router);
+		batadv_neigh_node_put(router);
 	if (router_router)
-		batadv_neigh_node_free_ref(router_router);
+		batadv_neigh_node_put(router_router);
 	if (orig_neigh_router)
-		batadv_neigh_node_free_ref(orig_neigh_router);
+		batadv_neigh_node_put(orig_neigh_router);
 	if (hardif_neigh)
 		batadv_hardif_neigh_free_ref(hardif_neigh);
 
@@ -1868,7 +1868,7 @@ static void batadv_iv_ogm_orig_print(struct batadv_priv *bat_priv,
 			batman_count++;
 
 next:
-			batadv_neigh_node_free_ref(neigh_node);
+			batadv_neigh_node_put(neigh_node);
 			if (n_ifinfo)
 				batadv_neigh_ifinfo_free_ref(n_ifinfo);
 		}
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index 00408d6231b6..919f3445fff3 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -674,7 +674,7 @@ static bool batadv_dat_send_data(struct batadv_priv *bat_priv,
 			ret = true;
 		}
 free_neigh:
-		batadv_neigh_node_free_ref(neigh_node);
+		batadv_neigh_node_put(neigh_node);
 free_orig:
 		batadv_orig_node_put(cand[i].orig_node);
 	}
diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index d13c5b601b67..adb9c3989add 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -387,7 +387,7 @@ out:
 	if (orig_node_dst)
 		batadv_orig_node_put(orig_node_dst);
 	if (neigh_node)
-		batadv_neigh_node_free_ref(neigh_node);
+		batadv_neigh_node_put(neigh_node);
 	return ret;
 }
 
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index 7bf84c136205..ea03c056a345 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -235,7 +235,7 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv)
 		batadv_gw_node_free_ref(gw_node);
 
 next:
-		batadv_neigh_node_free_ref(router);
+		batadv_neigh_node_put(router);
 		if (router_ifinfo)
 			batadv_neigh_ifinfo_free_ref(router_ifinfo);
 	}
@@ -352,7 +352,7 @@ out:
 	if (next_gw)
 		batadv_gw_node_free_ref(next_gw);
 	if (router)
-		batadv_neigh_node_free_ref(router);
+		batadv_neigh_node_put(router);
 	if (router_ifinfo)
 		batadv_neigh_ifinfo_free_ref(router_ifinfo);
 }
@@ -417,9 +417,9 @@ out:
 	if (curr_gw_orig)
 		batadv_orig_node_put(curr_gw_orig);
 	if (router_gw)
-		batadv_neigh_node_free_ref(router_gw);
+		batadv_neigh_node_put(router_gw);
 	if (router_orig)
-		batadv_neigh_node_free_ref(router_orig);
+		batadv_neigh_node_put(router_orig);
 	if (router_gw_tq)
 		batadv_neigh_ifinfo_free_ref(router_gw_tq);
 	if (router_orig_tq)
@@ -626,7 +626,7 @@ out:
 	if (router_ifinfo)
 		batadv_neigh_ifinfo_free_ref(router_ifinfo);
 	if (router)
-		batadv_neigh_node_free_ref(router);
+		batadv_neigh_node_put(router);
 	return ret;
 }
 
@@ -885,8 +885,8 @@ out:
 	if (gw_node)
 		batadv_gw_node_free_ref(gw_node);
 	if (neigh_old)
-		batadv_neigh_node_free_ref(neigh_old);
+		batadv_neigh_node_put(neigh_old);
 	if (neigh_curr)
-		batadv_neigh_node_free_ref(neigh_curr);
+		batadv_neigh_node_put(neigh_curr);
 	return out_of_range;
 }
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
index 292a648c2f09..6268f08b7154 100644
--- a/net/batman-adv/icmp_socket.c
+++ b/net/batman-adv/icmp_socket.c
@@ -290,7 +290,7 @@ out:
 	if (primary_if)
 		batadv_hardif_put(primary_if);
 	if (neigh_node)
-		batadv_neigh_node_free_ref(neigh_node);
+		batadv_neigh_node_put(neigh_node);
 	if (orig_node)
 		batadv_orig_node_put(orig_node);
 	return len;
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index 3a42e26b2240..e9409bad35e7 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -1228,9 +1228,9 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv,
 	res = true;
 out:
 	if (router_neigh)
-		batadv_neigh_node_free_ref(router_neigh);
+		batadv_neigh_node_put(router_neigh);
 	if (router_coding)
-		batadv_neigh_node_free_ref(router_coding);
+		batadv_neigh_node_put(router_coding);
 	if (router_neigh_ifinfo)
 		batadv_neigh_ifinfo_free_ref(router_neigh_ifinfo);
 	if (router_coding_ifinfo)
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 3e959f48f845..060f3a670d63 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -279,11 +279,11 @@ static void batadv_neigh_node_release(struct kref *ref)
 }
 
 /**
- * batadv_neigh_node_free_ref - decrement the neighbors refcounter and possibly
+ * batadv_neigh_node_put - decrement the neighbors refcounter and possibly
  *  release it
  * @neigh_node: neigh neighbor to free
  */
-void batadv_neigh_node_free_ref(struct batadv_neigh_node *neigh_node)
+void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node)
 {
 	kref_put(&neigh_node->refcount, batadv_neigh_node_release);
 }
@@ -737,7 +737,7 @@ static void batadv_orig_ifinfo_release(struct kref *ref)
 	/* this is the last reference to this object */
 	router = rcu_dereference_protected(orig_ifinfo->router, true);
 	if (router)
-		batadv_neigh_node_free_ref(router);
+		batadv_neigh_node_put(router);
 
 	kfree_rcu(orig_ifinfo, rcu);
 }
@@ -793,7 +793,7 @@ static void batadv_orig_node_release(struct kref *ref)
 	hlist_for_each_entry_safe(neigh_node, node_tmp,
 				  &orig_node->neigh_list, list) {
 		hlist_del_rcu(&neigh_node->list);
-		batadv_neigh_node_free_ref(neigh_node);
+		batadv_neigh_node_put(neigh_node);
 	}
 
 	hlist_for_each_entry_safe(orig_ifinfo, node_tmp,
@@ -1069,7 +1069,7 @@ batadv_purge_orig_neighbors(struct batadv_priv *bat_priv,
 			neigh_purged = true;
 
 			hlist_del_rcu(&neigh_node->list);
-			batadv_neigh_node_free_ref(neigh_node);
+			batadv_neigh_node_put(neigh_node);
 		} else {
 			/* only necessary if not the whole neighbor is to be
 			 * deleted, but some interface has been removed.
@@ -1108,7 +1108,7 @@ batadv_find_best_neighbor(struct batadv_priv *bat_priv,
 			continue;
 
 		if (best)
-			batadv_neigh_node_free_ref(best);
+			batadv_neigh_node_put(best);
 
 		best = neigh;
 	}
@@ -1154,7 +1154,7 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv,
 	batadv_update_route(bat_priv, orig_node, BATADV_IF_DEFAULT,
 			    best_neigh_node);
 	if (best_neigh_node)
-		batadv_neigh_node_free_ref(best_neigh_node);
+		batadv_neigh_node_put(best_neigh_node);
 
 	/* ... then for all other interfaces. */
 	rcu_read_lock();
@@ -1171,7 +1171,7 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv,
 		batadv_update_route(bat_priv, orig_node, hard_iface,
 				    best_neigh_node);
 		if (best_neigh_node)
-			batadv_neigh_node_free_ref(best_neigh_node);
+			batadv_neigh_node_put(best_neigh_node);
 	}
 	rcu_read_unlock();
 
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index c151f6a8f5ec..78f409cf7fc3 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -49,7 +49,7 @@ struct batadv_neigh_node *
 batadv_neigh_node_new(struct batadv_orig_node *orig_node,
 		      struct batadv_hard_iface *hard_iface,
 		      const u8 *neigh_addr);
-void batadv_neigh_node_free_ref(struct batadv_neigh_node *neigh_node);
+void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node);
 struct batadv_neigh_node *
 batadv_orig_router_get(struct batadv_orig_node *orig_node,
 		       const struct batadv_hard_iface *if_outgoing);
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index a14ca42e927e..9107f69873ea 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -98,7 +98,7 @@ static void _batadv_update_route(struct batadv_priv *bat_priv,
 	}
 
 	if (curr_router)
-		batadv_neigh_node_free_ref(curr_router);
+		batadv_neigh_node_put(curr_router);
 
 	/* increase refcount of new best neighbor */
 	if (neigh_node && !kref_get_unless_zero(&neigh_node->refcount))
@@ -111,7 +111,7 @@ static void _batadv_update_route(struct batadv_priv *bat_priv,
 
 	/* decrease refcount of previous best neighbor */
 	if (curr_router)
-		batadv_neigh_node_free_ref(curr_router);
+		batadv_neigh_node_put(curr_router);
 }
 
 /**
@@ -138,7 +138,7 @@ void batadv_update_route(struct batadv_priv *bat_priv,
 
 out:
 	if (router)
-		batadv_neigh_node_free_ref(router);
+		batadv_neigh_node_put(router);
 }
 
 /**
@@ -545,7 +545,7 @@ batadv_find_router(struct batadv_priv *bat_priv,
 next:
 		/* free references */
 		if (cand_router) {
-			batadv_neigh_node_free_ref(cand_router);
+			batadv_neigh_node_put(cand_router);
 			cand_router = NULL;
 		}
 		batadv_orig_ifinfo_free_ref(cand);
@@ -562,17 +562,17 @@ next:
 	 * 3) there is no candidate at all, return the default router
 	 */
 	if (next_candidate) {
-		batadv_neigh_node_free_ref(router);
+		batadv_neigh_node_put(router);
 
 		/* remove references to first candidate, we don't need it. */
 		if (first_candidate) {
-			batadv_neigh_node_free_ref(first_candidate_router);
+			batadv_neigh_node_put(first_candidate_router);
 			batadv_orig_ifinfo_free_ref(first_candidate);
 		}
 		router = next_candidate_router;
 		orig_node->last_bonding_candidate = next_candidate;
 	} else if (first_candidate) {
-		batadv_neigh_node_free_ref(router);
+		batadv_neigh_node_put(router);
 
 		/* refcounting has already been done in the loop above. */
 		router = first_candidate_router;
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index c9cafc0f2d1b..caff32cf6fe7 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -153,7 +153,7 @@ int batadv_send_skb_to_orig(struct sk_buff *skb,
 
 out:
 	if (neigh_node)
-		batadv_neigh_node_free_ref(neigh_node);
+		batadv_neigh_node_put(neigh_node);
 
 	return ret;
 }
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index c6c633828bc8..132635aa7b11 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -1586,20 +1586,20 @@ batadv_transtable_best_orig(struct batadv_priv *bat_priv,
 		if (best_router &&
 		    bao->bat_neigh_cmp(router, BATADV_IF_DEFAULT,
 				       best_router, BATADV_IF_DEFAULT) <= 0) {
-			batadv_neigh_node_free_ref(router);
+			batadv_neigh_node_put(router);
 			continue;
 		}
 
 		/* release the refcount for the "old" best */
 		if (best_router)
-			batadv_neigh_node_free_ref(best_router);
+			batadv_neigh_node_put(best_router);
 
 		best_entry = orig_entry;
 		best_router = router;
 	}
 
 	if (best_router)
-		batadv_neigh_node_free_ref(best_router);
+		batadv_neigh_node_put(best_router);
 
 	return best_entry;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 044fa3ae124118c62575566cfc88e6046a64ae0e Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 17 Jan 2016 11:01:12 +0100
Subject: batman-adv: Rename batadv_neigh_ifinfo *_free_ref function to *_put

The batman-adv source code is the only place in the kernel which uses the
*_free_ref naming scheme for the *_put functions. Changing it to *_put
makes it more consistent and makes it easier to understand the connection
to the *_get functions.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/bat_iv_ogm.c     | 24 ++++++++++++------------
 net/batman-adv/gateway_client.c | 14 +++++++-------
 net/batman-adv/network-coding.c |  4 ++--
 net/batman-adv/originator.c     |  8 ++++----
 net/batman-adv/originator.h     |  2 +-
 5 files changed, 26 insertions(+), 26 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 0f8742b4bfaa..7566cf6614f5 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -1026,7 +1026,7 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv,
 		neigh_ifinfo->bat_iv.tq_avg = tq_avg;
 		spin_unlock_bh(&tmp_neigh_node->ifinfo_lock);
 
-		batadv_neigh_ifinfo_free_ref(neigh_ifinfo);
+		batadv_neigh_ifinfo_put(neigh_ifinfo);
 		neigh_ifinfo = NULL;
 	}
 
@@ -1120,9 +1120,9 @@ out:
 	if (router)
 		batadv_neigh_node_put(router);
 	if (neigh_ifinfo)
-		batadv_neigh_ifinfo_free_ref(neigh_ifinfo);
+		batadv_neigh_ifinfo_put(neigh_ifinfo);
 	if (router_ifinfo)
-		batadv_neigh_ifinfo_free_ref(router_ifinfo);
+		batadv_neigh_ifinfo_put(router_ifinfo);
 }
 
 /**
@@ -1192,7 +1192,7 @@ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
 	neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing);
 	if (neigh_ifinfo) {
 		neigh_rq_count = neigh_ifinfo->bat_iv.real_packet_count;
-		batadv_neigh_ifinfo_free_ref(neigh_ifinfo);
+		batadv_neigh_ifinfo_put(neigh_ifinfo);
 	} else {
 		neigh_rq_count = 0;
 	}
@@ -1353,7 +1353,7 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr,
 		packet_count = bitmap_weight(bitmap,
 					     BATADV_TQ_LOCAL_WINDOW_SIZE);
 		neigh_ifinfo->bat_iv.real_packet_count = packet_count;
-		batadv_neigh_ifinfo_free_ref(neigh_ifinfo);
+		batadv_neigh_ifinfo_put(neigh_ifinfo);
 	}
 	rcu_read_unlock();
 
@@ -1566,7 +1566,7 @@ out_neigh:
 		batadv_orig_node_put(orig_neigh_node);
 out:
 	if (router_ifinfo)
-		batadv_neigh_ifinfo_free_ref(router_ifinfo);
+		batadv_neigh_ifinfo_put(router_ifinfo);
 	if (router)
 		batadv_neigh_node_put(router);
 	if (router_router)
@@ -1805,7 +1805,7 @@ batadv_iv_ogm_orig_print_neigh(struct batadv_orig_node *orig_node,
 			   neigh_node->addr,
 			   n_ifinfo->bat_iv.tq_avg);
 
-		batadv_neigh_ifinfo_free_ref(n_ifinfo);
+		batadv_neigh_ifinfo_put(n_ifinfo);
 	}
 }
 
@@ -1870,7 +1870,7 @@ static void batadv_iv_ogm_orig_print(struct batadv_priv *bat_priv,
 next:
 			batadv_neigh_node_put(neigh_node);
 			if (n_ifinfo)
-				batadv_neigh_ifinfo_free_ref(n_ifinfo);
+				batadv_neigh_ifinfo_put(n_ifinfo);
 		}
 		rcu_read_unlock();
 	}
@@ -1964,9 +1964,9 @@ static int batadv_iv_ogm_neigh_cmp(struct batadv_neigh_node *neigh1,
 
 out:
 	if (neigh1_ifinfo)
-		batadv_neigh_ifinfo_free_ref(neigh1_ifinfo);
+		batadv_neigh_ifinfo_put(neigh1_ifinfo);
 	if (neigh2_ifinfo)
-		batadv_neigh_ifinfo_free_ref(neigh2_ifinfo);
+		batadv_neigh_ifinfo_put(neigh2_ifinfo);
 
 	return diff;
 }
@@ -2007,9 +2007,9 @@ batadv_iv_ogm_neigh_is_sob(struct batadv_neigh_node *neigh1,
 
 out:
 	if (neigh1_ifinfo)
-		batadv_neigh_ifinfo_free_ref(neigh1_ifinfo);
+		batadv_neigh_ifinfo_put(neigh1_ifinfo);
 	if (neigh2_ifinfo)
-		batadv_neigh_ifinfo_free_ref(neigh2_ifinfo);
+		batadv_neigh_ifinfo_put(neigh2_ifinfo);
 
 	return ret;
 }
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index ea03c056a345..37972dfda400 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -237,7 +237,7 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv)
 next:
 		batadv_neigh_node_put(router);
 		if (router_ifinfo)
-			batadv_neigh_ifinfo_free_ref(router_ifinfo);
+			batadv_neigh_ifinfo_put(router_ifinfo);
 	}
 	rcu_read_unlock();
 
@@ -354,7 +354,7 @@ out:
 	if (router)
 		batadv_neigh_node_put(router);
 	if (router_ifinfo)
-		batadv_neigh_ifinfo_free_ref(router_ifinfo);
+		batadv_neigh_ifinfo_put(router_ifinfo);
 }
 
 void batadv_gw_check_election(struct batadv_priv *bat_priv,
@@ -421,9 +421,9 @@ out:
 	if (router_orig)
 		batadv_neigh_node_put(router_orig);
 	if (router_gw_tq)
-		batadv_neigh_ifinfo_free_ref(router_gw_tq);
+		batadv_neigh_ifinfo_put(router_gw_tq);
 	if (router_orig_tq)
-		batadv_neigh_ifinfo_free_ref(router_orig_tq);
+		batadv_neigh_ifinfo_put(router_orig_tq);
 }
 
 /**
@@ -624,7 +624,7 @@ static int batadv_write_buffer_text(struct batadv_priv *bat_priv,
 		batadv_gw_node_free_ref(curr_gw);
 out:
 	if (router_ifinfo)
-		batadv_neigh_ifinfo_free_ref(router_ifinfo);
+		batadv_neigh_ifinfo_put(router_ifinfo);
 	if (router)
 		batadv_neigh_node_put(router);
 	return ret;
@@ -857,7 +857,7 @@ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv,
 			goto out;
 
 		curr_tq_avg = curr_ifinfo->bat_iv.tq_avg;
-		batadv_neigh_ifinfo_free_ref(curr_ifinfo);
+		batadv_neigh_ifinfo_put(curr_ifinfo);
 
 		break;
 	case BATADV_GW_MODE_OFF:
@@ -875,7 +875,7 @@ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv,
 
 	if ((curr_tq_avg - old_ifinfo->bat_iv.tq_avg) > BATADV_GW_THRESHOLD)
 		out_of_range = true;
-	batadv_neigh_ifinfo_free_ref(old_ifinfo);
+	batadv_neigh_ifinfo_put(old_ifinfo);
 
 out:
 	if (orig_dst_node)
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index e9409bad35e7..8b367659f1ae 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -1232,9 +1232,9 @@ out:
 	if (router_coding)
 		batadv_neigh_node_put(router_coding);
 	if (router_neigh_ifinfo)
-		batadv_neigh_ifinfo_free_ref(router_neigh_ifinfo);
+		batadv_neigh_ifinfo_put(router_neigh_ifinfo);
 	if (router_coding_ifinfo)
-		batadv_neigh_ifinfo_free_ref(router_coding_ifinfo);
+		batadv_neigh_ifinfo_put(router_coding_ifinfo);
 	return res;
 }
 
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 060f3a670d63..7965fe5947fb 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -202,11 +202,11 @@ static void batadv_neigh_ifinfo_release(struct kref *ref)
 }
 
 /**
- * batadv_neigh_ifinfo_free_ref - decrement the refcounter and possibly release
+ * batadv_neigh_ifinfo_put - decrement the refcounter and possibly release
  *  the neigh_ifinfo
  * @neigh_ifinfo: the neigh_ifinfo object to release
  */
-void batadv_neigh_ifinfo_free_ref(struct batadv_neigh_ifinfo *neigh_ifinfo)
+void batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo)
 {
 	kref_put(&neigh_ifinfo->refcount, batadv_neigh_ifinfo_release);
 }
@@ -259,7 +259,7 @@ static void batadv_neigh_node_release(struct kref *ref)
 
 	hlist_for_each_entry_safe(neigh_ifinfo, node_tmp,
 				  &neigh_node->ifinfo_list, list) {
-		batadv_neigh_ifinfo_free_ref(neigh_ifinfo);
+		batadv_neigh_ifinfo_put(neigh_ifinfo);
 	}
 
 	hardif_neigh = batadv_hardif_neigh_get(neigh_node->if_incoming,
@@ -966,7 +966,7 @@ batadv_purge_neigh_ifinfo(struct batadv_priv *bat_priv,
 			   neigh->addr, if_outgoing->net_dev->name);
 
 		hlist_del_rcu(&neigh_ifinfo->list);
-		batadv_neigh_ifinfo_free_ref(neigh_ifinfo);
+		batadv_neigh_ifinfo_put(neigh_ifinfo);
 	}
 
 	spin_unlock_bh(&neigh->ifinfo_lock);
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index 78f409cf7fc3..b4fe82804fc4 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -59,7 +59,7 @@ batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh,
 struct batadv_neigh_ifinfo *
 batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh,
 			struct batadv_hard_iface *if_outgoing);
-void batadv_neigh_ifinfo_free_ref(struct batadv_neigh_ifinfo *neigh_ifinfo);
+void batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo);
 
 int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset);
 
-- 
cgit v1.2.3-59-g8ed1b


From 35f94779c96b4f79e2fd3f2e59b5713704171c3f Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 17 Jan 2016 11:01:13 +0100
Subject: batman-adv: Rename batadv_orig_ifinfo *_free_ref function to *_put

The batman-adv source code is the only place in the kernel which uses the
*_free_ref naming scheme for the *_put functions. Changing it to *_put
makes it more consistent and makes it easier to understand the connection
to the *_get functions.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/bat_iv_ogm.c     |  4 ++--
 net/batman-adv/network-coding.c |  2 +-
 net/batman-adv/originator.c     | 10 +++++-----
 net/batman-adv/originator.h     |  2 +-
 net/batman-adv/routing.c        |  8 ++++----
 5 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 7566cf6614f5..ba48fb2818fc 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -1368,7 +1368,7 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr,
 out:
 	spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock);
 	batadv_orig_node_put(orig_node);
-	batadv_orig_ifinfo_free_ref(orig_ifinfo);
+	batadv_orig_ifinfo_put(orig_ifinfo);
 	return ret;
 }
 
@@ -1514,7 +1514,7 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset,
 					  ogm_packet, if_incoming,
 					  if_outgoing, dup_status);
 	}
-	batadv_orig_ifinfo_free_ref(orig_ifinfo);
+	batadv_orig_ifinfo_put(orig_ifinfo);
 
 	/* only forward for specific interface, not for the default one. */
 	if (if_outgoing == BATADV_IF_DEFAULT)
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index 8b367659f1ae..9180ff114a26 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -772,7 +772,7 @@ static bool batadv_can_nc_with_orig(struct batadv_priv *bat_priv,
 
 	last_ttl = orig_ifinfo->last_ttl;
 	last_real_seqno = orig_ifinfo->last_real_seqno;
-	batadv_orig_ifinfo_free_ref(orig_ifinfo);
+	batadv_orig_ifinfo_put(orig_ifinfo);
 
 	if (last_real_seqno != ntohl(ogm_packet->seqno))
 		return false;
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 7965fe5947fb..605641536c82 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -743,11 +743,11 @@ static void batadv_orig_ifinfo_release(struct kref *ref)
 }
 
 /**
- * batadv_orig_ifinfo_free_ref - decrement the refcounter and possibly release
+ * batadv_orig_ifinfo_put - decrement the refcounter and possibly release
  *  the orig_ifinfo
  * @orig_ifinfo: the orig_ifinfo object to release
  */
-void batadv_orig_ifinfo_free_ref(struct batadv_orig_ifinfo *orig_ifinfo)
+void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo)
 {
 	kref_put(&orig_ifinfo->refcount, batadv_orig_ifinfo_release);
 }
@@ -799,7 +799,7 @@ static void batadv_orig_node_release(struct kref *ref)
 	hlist_for_each_entry_safe(orig_ifinfo, node_tmp,
 				  &orig_node->ifinfo_list, list) {
 		hlist_del_rcu(&orig_ifinfo->list);
-		batadv_orig_ifinfo_free_ref(orig_ifinfo);
+		batadv_orig_ifinfo_put(orig_ifinfo);
 	}
 	spin_unlock_bh(&orig_node->neigh_list_lock);
 
@@ -1012,10 +1012,10 @@ batadv_purge_orig_ifinfo(struct batadv_priv *bat_priv,
 		ifinfo_purged = true;
 
 		hlist_del_rcu(&orig_ifinfo->list);
-		batadv_orig_ifinfo_free_ref(orig_ifinfo);
+		batadv_orig_ifinfo_put(orig_ifinfo);
 		if (orig_node->last_bonding_candidate == orig_ifinfo) {
 			orig_node->last_bonding_candidate = NULL;
-			batadv_orig_ifinfo_free_ref(orig_ifinfo);
+			batadv_orig_ifinfo_put(orig_ifinfo);
 		}
 	}
 
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index b4fe82804fc4..adbe7aa749a9 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -69,7 +69,7 @@ batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node,
 struct batadv_orig_ifinfo *
 batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node,
 		       struct batadv_hard_iface *if_outgoing);
-void batadv_orig_ifinfo_free_ref(struct batadv_orig_ifinfo *orig_ifinfo);
+void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo);
 
 int batadv_orig_seq_print_text(struct seq_file *seq, void *offset);
 int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset);
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index 9107f69873ea..4dd646a52f1a 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -107,7 +107,7 @@ static void _batadv_update_route(struct batadv_priv *bat_priv,
 	spin_lock_bh(&orig_node->neigh_list_lock);
 	rcu_assign_pointer(orig_ifinfo->router, neigh_node);
 	spin_unlock_bh(&orig_node->neigh_list_lock);
-	batadv_orig_ifinfo_free_ref(orig_ifinfo);
+	batadv_orig_ifinfo_put(orig_ifinfo);
 
 	/* decrease refcount of previous best neighbor */
 	if (curr_router)
@@ -548,13 +548,13 @@ next:
 			batadv_neigh_node_put(cand_router);
 			cand_router = NULL;
 		}
-		batadv_orig_ifinfo_free_ref(cand);
+		batadv_orig_ifinfo_put(cand);
 	}
 	rcu_read_unlock();
 
 	/* last_bonding_candidate is reset below, remove the old reference. */
 	if (orig_node->last_bonding_candidate)
-		batadv_orig_ifinfo_free_ref(orig_node->last_bonding_candidate);
+		batadv_orig_ifinfo_put(orig_node->last_bonding_candidate);
 
 	/* After finding candidates, handle the three cases:
 	 * 1) there is a next candidate, use that
@@ -567,7 +567,7 @@ next:
 		/* remove references to first candidate, we don't need it. */
 		if (first_candidate) {
 			batadv_neigh_node_put(first_candidate_router);
-			batadv_orig_ifinfo_free_ref(first_candidate);
+			batadv_orig_ifinfo_put(first_candidate);
 		}
 		router = next_candidate_router;
 		orig_node->last_bonding_candidate = next_candidate;
-- 
cgit v1.2.3-59-g8ed1b


From accadc35a147a8faa14a55e176cb94986f28888c Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 17 Jan 2016 11:01:14 +0100
Subject: batman-adv: Rename batadv_hardif_neigh *_free_ref function to *_put

The batman-adv source code is the only place in the kernel which uses the
*_free_ref naming scheme for the *_put functions. Changing it to *_put
makes it more consistent and makes it easier to understand the connection
to the *_get functions.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/bat_iv_ogm.c |  2 +-
 net/batman-adv/originator.c | 10 +++++-----
 net/batman-adv/originator.h |  2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index ba48fb2818fc..5651e33ca6bd 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -1574,7 +1574,7 @@ out:
 	if (orig_neigh_router)
 		batadv_neigh_node_put(orig_neigh_router);
 	if (hardif_neigh)
-		batadv_hardif_neigh_free_ref(hardif_neigh);
+		batadv_hardif_neigh_put(hardif_neigh);
 
 	kfree_skb(skb_priv);
 }
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 605641536c82..0b1a77da2e1e 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -232,11 +232,11 @@ static void batadv_hardif_neigh_release(struct kref *ref)
 }
 
 /**
- * batadv_hardif_neigh_free_ref - decrement the hardif neighbors refcounter
+ * batadv_hardif_neigh_put - decrement the hardif neighbors refcounter
  *  and possibly release it
  * @hardif_neigh: hardif neigh neighbor to free
  */
-void batadv_hardif_neigh_free_ref(struct batadv_hardif_neigh_node *hardif_neigh)
+void batadv_hardif_neigh_put(struct batadv_hardif_neigh_node *hardif_neigh)
 {
 	kref_put(&hardif_neigh->refcount, batadv_hardif_neigh_release);
 }
@@ -266,8 +266,8 @@ static void batadv_neigh_node_release(struct kref *ref)
 					       neigh_node->addr);
 	if (hardif_neigh) {
 		/* batadv_hardif_neigh_get() increases refcount too */
-		batadv_hardif_neigh_free_ref(hardif_neigh);
-		batadv_hardif_neigh_free_ref(hardif_neigh);
+		batadv_hardif_neigh_put(hardif_neigh);
+		batadv_hardif_neigh_put(hardif_neigh);
 	}
 
 	if (bao->bat_neigh_free)
@@ -681,7 +681,7 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node,
 
 out:
 	if (hardif_neigh)
-		batadv_hardif_neigh_free_ref(hardif_neigh);
+		batadv_hardif_neigh_put(hardif_neigh);
 	return neigh_node;
 }
 
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index adbe7aa749a9..97748e84b66b 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -44,7 +44,7 @@ struct batadv_hardif_neigh_node *
 batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface,
 			const u8 *neigh_addr);
 void
-batadv_hardif_neigh_free_ref(struct batadv_hardif_neigh_node *hardif_neigh);
+batadv_hardif_neigh_put(struct batadv_hardif_neigh_node *hardif_neigh);
 struct batadv_neigh_node *
 batadv_neigh_node_new(struct batadv_orig_node *orig_node,
 		      struct batadv_hard_iface *hard_iface,
-- 
cgit v1.2.3-59-g8ed1b


From c8b86c1241e39e6b86ef3a5083b149336942008e Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 17 Jan 2016 11:01:15 +0100
Subject: batman-adv: Rename batadv_backbone_gw *_free_ref function to *_put

The batman-adv source code is the only place in the kernel which uses the
*_free_ref naming scheme for the *_put functions. Changing it to *_put
makes it more consistent and makes it easier to understand the connection
to the *_get functions.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/bridge_loop_avoidance.c | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index b7167cf3e907..ce2d0701fb59 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -159,12 +159,11 @@ static void batadv_backbone_gw_release(struct kref *ref)
 }
 
 /**
- * batadv_backbone_gw_free_ref - decrement the backbone gw refcounter and
- *  possibly release it
+ * batadv_backbone_gw_put - decrement the backbone gw refcounter and possibly
+ *  release it
  * @backbone_gw: backbone gateway to be free'd
  */
-static void
-batadv_backbone_gw_free_ref(struct batadv_bla_backbone_gw *backbone_gw)
+static void batadv_backbone_gw_put(struct batadv_bla_backbone_gw *backbone_gw)
 {
 	kref_put(&backbone_gw->refcount, batadv_backbone_gw_release);
 }
@@ -180,7 +179,7 @@ static void batadv_claim_release(struct kref *ref)
 
 	claim = container_of(ref, struct batadv_bla_claim, refcount);
 
-	batadv_backbone_gw_free_ref(claim->backbone_gw);
+	batadv_backbone_gw_put(claim->backbone_gw);
 	kfree_rcu(claim, rcu);
 }
 
@@ -524,7 +523,7 @@ batadv_bla_update_own_backbone_gw(struct batadv_priv *bat_priv,
 		return;
 
 	backbone_gw->lasttime = jiffies;
-	batadv_backbone_gw_free_ref(backbone_gw);
+	batadv_backbone_gw_put(backbone_gw);
 }
 
 /**
@@ -573,7 +572,7 @@ static void batadv_bla_answer_request(struct batadv_priv *bat_priv,
 
 	/* finally, send an announcement frame */
 	batadv_bla_send_announce(bat_priv, backbone_gw);
-	batadv_backbone_gw_free_ref(backbone_gw);
+	batadv_backbone_gw_put(backbone_gw);
 }
 
 /**
@@ -682,7 +681,7 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv,
 		spin_lock_bh(&claim->backbone_gw->crc_lock);
 		claim->backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN);
 		spin_unlock_bh(&claim->backbone_gw->crc_lock);
-		batadv_backbone_gw_free_ref(claim->backbone_gw);
+		batadv_backbone_gw_put(claim->backbone_gw);
 	}
 	/* set (new) backbone gw */
 	kref_get(&backbone_gw->refcount);
@@ -783,7 +782,7 @@ static int batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr,
 		}
 	}
 
-	batadv_backbone_gw_free_ref(backbone_gw);
+	batadv_backbone_gw_put(backbone_gw);
 	return 1;
 }
 
@@ -854,7 +853,7 @@ static int batadv_handle_unclaim(struct batadv_priv *bat_priv,
 		   claim_addr, BATADV_PRINT_VID(vid), backbone_gw->orig);
 
 	batadv_bla_del_claim(bat_priv, claim_addr, vid);
-	batadv_backbone_gw_free_ref(backbone_gw);
+	batadv_backbone_gw_put(backbone_gw);
 	return 1;
 }
 
@@ -891,7 +890,7 @@ static int batadv_handle_claim(struct batadv_priv *bat_priv,
 
 	/* TODO: we could call something like tt_local_del() here. */
 
-	batadv_backbone_gw_free_ref(backbone_gw);
+	batadv_backbone_gw_put(backbone_gw);
 	return 1;
 }
 
@@ -1154,7 +1153,7 @@ purge_now:
 			batadv_bla_del_backbone_claims(backbone_gw);
 
 			hlist_del_rcu(&backbone_gw->hash_entry);
-			batadv_backbone_gw_free_ref(backbone_gw);
+			batadv_backbone_gw_put(backbone_gw);
 		}
 		spin_unlock_bh(list_lock);
 	}
@@ -1571,7 +1570,7 @@ int batadv_bla_is_backbone_gw(struct sk_buff *skb,
 	if (!backbone_gw)
 		return 0;
 
-	batadv_backbone_gw_free_ref(backbone_gw);
+	batadv_backbone_gw_put(backbone_gw);
 	return 1;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 321e3e0884829427b9bcf2c112b0945af3160875 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 17 Jan 2016 11:01:16 +0100
Subject: batman-adv: Rename batadv_claim *_free_ref function to *_put

The batman-adv source code is the only place in the kernel which uses the
*_free_ref naming scheme for the *_put functions. Changing it to *_put
makes it more consistent and makes it easier to understand the connection
to the *_get functions.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/bridge_loop_avoidance.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index ce2d0701fb59..0a6c8b824a00 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -184,11 +184,11 @@ static void batadv_claim_release(struct kref *ref)
 }
 
 /**
- * batadv_claim_free_ref - decrement the claim refcounter and possibly
+ * batadv_claim_put - decrement the claim refcounter and possibly
  *  release it
  * @claim: claim to be free'd
  */
-static void batadv_claim_free_ref(struct batadv_bla_claim *claim)
+static void batadv_claim_put(struct batadv_bla_claim *claim)
 {
 	kref_put(&claim->refcount, batadv_claim_release);
 }
@@ -304,7 +304,7 @@ batadv_bla_del_backbone_claims(struct batadv_bla_backbone_gw *backbone_gw)
 			if (claim->backbone_gw != backbone_gw)
 				continue;
 
-			batadv_claim_free_ref(claim);
+			batadv_claim_put(claim);
 			hlist_del_rcu(&claim->hash_entry);
 		}
 		spin_unlock_bh(list_lock);
@@ -693,7 +693,7 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv,
 	backbone_gw->lasttime = jiffies;
 
 claim_free_ref:
-	batadv_claim_free_ref(claim);
+	batadv_claim_put(claim);
 }
 
 /**
@@ -718,14 +718,14 @@ static void batadv_bla_del_claim(struct batadv_priv *bat_priv,
 
 	batadv_hash_remove(bat_priv->bla.claim_hash, batadv_compare_claim,
 			   batadv_choose_claim, claim);
-	batadv_claim_free_ref(claim); /* reference from the hash is gone */
+	batadv_claim_put(claim); /* reference from the hash is gone */
 
 	spin_lock_bh(&claim->backbone_gw->crc_lock);
 	claim->backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN);
 	spin_unlock_bh(&claim->backbone_gw->crc_lock);
 
 	/* don't need the reference from hash_find() anymore */
-	batadv_claim_free_ref(claim);
+	batadv_claim_put(claim);
 }
 
 /**
@@ -1693,7 +1693,7 @@ out:
 	if (primary_if)
 		batadv_hardif_put(primary_if);
 	if (claim)
-		batadv_claim_free_ref(claim);
+		batadv_claim_put(claim);
 	return ret;
 }
 
@@ -1782,7 +1782,7 @@ out:
 	if (primary_if)
 		batadv_hardif_put(primary_if);
 	if (claim)
-		batadv_claim_free_ref(claim);
+		batadv_claim_put(claim);
 	return ret;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From a6416f9ffcb80a84a3ac8d7eb76b277107cae67c Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 17 Jan 2016 11:01:17 +0100
Subject: batman-adv: Rename batadv_dat_entry *_free_ref function to *_put

The batman-adv source code is the only place in the kernel which uses the
*_free_ref naming scheme for the *_put functions. Changing it to *_put
makes it more consistent and makes it easier to understand the connection
to the *_get functions.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/distributed-arp-table.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index 919f3445fff3..4c9b69d465a6 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -77,11 +77,11 @@ static void batadv_dat_entry_release(struct kref *ref)
 }
 
 /**
- * batadv_dat_entry_free_ref - decrement the dat_entry refcounter and possibly
+ * batadv_dat_entry_put - decrement the dat_entry refcounter and possibly
  *  release it
  * @dat_entry: dat_entry to be free'd
  */
-static void batadv_dat_entry_free_ref(struct batadv_dat_entry *dat_entry)
+static void batadv_dat_entry_put(struct batadv_dat_entry *dat_entry)
 {
 	kref_put(&dat_entry->refcount, batadv_dat_entry_release);
 }
@@ -135,7 +135,7 @@ static void __batadv_dat_purge(struct batadv_priv *bat_priv,
 				continue;
 
 			hlist_del_rcu(&dat_entry->hash_entry);
-			batadv_dat_entry_free_ref(dat_entry);
+			batadv_dat_entry_put(dat_entry);
 		}
 		spin_unlock_bh(list_lock);
 	}
@@ -349,7 +349,7 @@ static void batadv_dat_entry_add(struct batadv_priv *bat_priv, __be32 ip,
 
 	if (unlikely(hash_added != 0)) {
 		/* remove the reference for the hash */
-		batadv_dat_entry_free_ref(dat_entry);
+		batadv_dat_entry_put(dat_entry);
 		goto out;
 	}
 
@@ -358,7 +358,7 @@ static void batadv_dat_entry_add(struct batadv_priv *bat_priv, __be32 ip,
 
 out:
 	if (dat_entry)
-		batadv_dat_entry_free_ref(dat_entry);
+		batadv_dat_entry_put(dat_entry);
 }
 
 #ifdef CONFIG_BATMAN_ADV_DEBUG
@@ -1029,7 +1029,7 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv,
 	}
 out:
 	if (dat_entry)
-		batadv_dat_entry_free_ref(dat_entry);
+		batadv_dat_entry_put(dat_entry);
 	return ret;
 }
 
@@ -1109,7 +1109,7 @@ bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv,
 	}
 out:
 	if (dat_entry)
-		batadv_dat_entry_free_ref(dat_entry);
+		batadv_dat_entry_put(dat_entry);
 	if (ret)
 		kfree_skb(skb);
 	return ret;
@@ -1262,6 +1262,6 @@ bool batadv_dat_drop_broadcast_packet(struct batadv_priv *bat_priv,
 
 out:
 	if (dat_entry)
-		batadv_dat_entry_free_ref(dat_entry);
+		batadv_dat_entry_put(dat_entry);
 	return ret;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 3a01743ddf2e64558d87b4af616916e4729e2041 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 17 Jan 2016 11:01:18 +0100
Subject: batman-adv: Rename batadv_gw_node *_free_ref function to *_put

The batman-adv source code is the only place in the kernel which uses the
*_free_ref naming scheme for the *_put functions. Changing it to *_put
makes it more consistent and makes it easier to understand the connection
to the *_get functions.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/gateway_client.c | 35 +++++++++++++++++------------------
 1 file changed, 17 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index 37972dfda400..c59aff5ccac8 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -76,11 +76,10 @@ static void batadv_gw_node_release(struct kref *ref)
 }
 
 /**
- * batadv_gw_node_free_ref - decrement the gw_node refcounter and possibly
- *  release it
+ * batadv_gw_node_put - decrement the gw_node refcounter and possibly release it
  * @gw_node: gateway node to free
  */
-static void batadv_gw_node_free_ref(struct batadv_gw_node *gw_node)
+static void batadv_gw_node_put(struct batadv_gw_node *gw_node)
 {
 	kref_put(&gw_node->refcount, batadv_gw_node_release);
 }
@@ -125,7 +124,7 @@ unlock:
 	rcu_read_unlock();
 out:
 	if (gw_node)
-		batadv_gw_node_free_ref(gw_node);
+		batadv_gw_node_put(gw_node);
 	return orig_node;
 }
 
@@ -143,7 +142,7 @@ static void batadv_gw_select(struct batadv_priv *bat_priv,
 	rcu_assign_pointer(bat_priv->gw.curr_gw, new_gw_node);
 
 	if (curr_gw_node)
-		batadv_gw_node_free_ref(curr_gw_node);
+		batadv_gw_node_put(curr_gw_node);
 
 	spin_unlock_bh(&bat_priv->gw.list_lock);
 }
@@ -204,7 +203,7 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv)
 			    ((tmp_gw_factor == max_gw_factor) &&
 			     (tq_avg > max_tq))) {
 				if (curr_gw)
-					batadv_gw_node_free_ref(curr_gw);
+					batadv_gw_node_put(curr_gw);
 				curr_gw = gw_node;
 				kref_get(&curr_gw->refcount);
 			}
@@ -219,7 +218,7 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv)
 			  */
 			if (tq_avg > max_tq) {
 				if (curr_gw)
-					batadv_gw_node_free_ref(curr_gw);
+					batadv_gw_node_put(curr_gw);
 				curr_gw = gw_node;
 				kref_get(&curr_gw->refcount);
 			}
@@ -232,7 +231,7 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv)
 		if (tmp_gw_factor > max_gw_factor)
 			max_gw_factor = tmp_gw_factor;
 
-		batadv_gw_node_free_ref(gw_node);
+		batadv_gw_node_put(gw_node);
 
 next:
 		batadv_neigh_node_put(router);
@@ -273,7 +272,7 @@ void batadv_gw_check_client_stop(struct batadv_priv *bat_priv)
 	 */
 	batadv_throw_uevent(bat_priv, BATADV_UEV_GW, BATADV_UEV_DEL, NULL);
 
-	batadv_gw_node_free_ref(curr_gw);
+	batadv_gw_node_put(curr_gw);
 }
 
 void batadv_gw_election(struct batadv_priv *bat_priv)
@@ -348,9 +347,9 @@ void batadv_gw_election(struct batadv_priv *bat_priv)
 
 out:
 	if (curr_gw)
-		batadv_gw_node_free_ref(curr_gw);
+		batadv_gw_node_put(curr_gw);
 	if (next_gw)
-		batadv_gw_node_free_ref(next_gw);
+		batadv_gw_node_put(next_gw);
 	if (router)
 		batadv_neigh_node_put(router);
 	if (router_ifinfo)
@@ -547,7 +546,7 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv,
 		spin_lock_bh(&bat_priv->gw.list_lock);
 		if (!hlist_unhashed(&gw_node->list)) {
 			hlist_del_init_rcu(&gw_node->list);
-			batadv_gw_node_free_ref(gw_node);
+			batadv_gw_node_put(gw_node);
 		}
 		spin_unlock_bh(&bat_priv->gw.list_lock);
 
@@ -556,12 +555,12 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv,
 			batadv_gw_reselect(bat_priv);
 
 		if (curr_gw)
-			batadv_gw_node_free_ref(curr_gw);
+			batadv_gw_node_put(curr_gw);
 	}
 
 out:
 	if (gw_node)
-		batadv_gw_node_free_ref(gw_node);
+		batadv_gw_node_put(gw_node);
 }
 
 void batadv_gw_node_delete(struct batadv_priv *bat_priv,
@@ -584,7 +583,7 @@ void batadv_gw_node_free(struct batadv_priv *bat_priv)
 	hlist_for_each_entry_safe(gw_node, node_tmp,
 				  &bat_priv->gw.list, list) {
 		hlist_del_init_rcu(&gw_node->list);
-		batadv_gw_node_free_ref(gw_node);
+		batadv_gw_node_put(gw_node);
 	}
 	spin_unlock_bh(&bat_priv->gw.list_lock);
 }
@@ -621,7 +620,7 @@ static int batadv_write_buffer_text(struct batadv_priv *bat_priv,
 	ret = seq_has_overflowed(seq) ? -1 : 0;
 
 	if (curr_gw)
-		batadv_gw_node_free_ref(curr_gw);
+		batadv_gw_node_put(curr_gw);
 out:
 	if (router_ifinfo)
 		batadv_neigh_ifinfo_put(router_ifinfo);
@@ -881,9 +880,9 @@ out:
 	if (orig_dst_node)
 		batadv_orig_node_put(orig_dst_node);
 	if (curr_gw)
-		batadv_gw_node_free_ref(curr_gw);
+		batadv_gw_node_put(curr_gw);
 	if (gw_node)
-		batadv_gw_node_free_ref(gw_node);
+		batadv_gw_node_put(gw_node);
 	if (neigh_old)
 		batadv_neigh_node_put(neigh_old);
 	if (neigh_curr)
-- 
cgit v1.2.3-59-g8ed1b


From ba610043afbcd12a1c34ed57fc0689a29aa35b58 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 17 Jan 2016 11:01:19 +0100
Subject: batman-adv: Rename batadv_tvlv_handler *_free_ref function to *_put

The batman-adv source code is the only place in the kernel which uses the
*_free_ref naming scheme for the *_put functions. Changing it to *_put
makes it more consistent and makes it easier to understand the connection
to the *_get functions.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/main.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 476554b3adce..95aaff371966 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -638,12 +638,11 @@ static void batadv_tvlv_handler_release(struct kref *ref)
 }
 
 /**
- * batadv_tvlv_handler_free_ref - decrement the tvlv container refcounter and
+ * batadv_tvlv_handler_put - decrement the tvlv container refcounter and
  *  possibly release it
  * @tvlv_handler: the tvlv handler to free
  */
-static void
-batadv_tvlv_handler_free_ref(struct batadv_tvlv_handler *tvlv_handler)
+static void batadv_tvlv_handler_put(struct batadv_tvlv_handler *tvlv_handler)
 {
 	kref_put(&tvlv_handler->refcount, batadv_tvlv_handler_release);
 }
@@ -1031,7 +1030,7 @@ int batadv_tvlv_containers_process(struct batadv_priv *bat_priv,
 						src, dst, tvlv_value,
 						tvlv_value_cont_len);
 		if (tvlv_handler)
-			batadv_tvlv_handler_free_ref(tvlv_handler);
+			batadv_tvlv_handler_put(tvlv_handler);
 		tvlv_value = (u8 *)tvlv_value + tvlv_value_cont_len;
 		tvlv_value_len -= tvlv_value_cont_len;
 	}
@@ -1111,7 +1110,7 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv,
 
 	tvlv_handler = batadv_tvlv_handler_get(bat_priv, type, version);
 	if (tvlv_handler) {
-		batadv_tvlv_handler_free_ref(tvlv_handler);
+		batadv_tvlv_handler_put(tvlv_handler);
 		return;
 	}
 
@@ -1148,11 +1147,11 @@ void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv,
 	if (!tvlv_handler)
 		return;
 
-	batadv_tvlv_handler_free_ref(tvlv_handler);
+	batadv_tvlv_handler_put(tvlv_handler);
 	spin_lock_bh(&bat_priv->tvlv.handler_list_lock);
 	hlist_del_rcu(&tvlv_handler->list);
 	spin_unlock_bh(&bat_priv->tvlv.handler_list_lock);
-	batadv_tvlv_handler_free_ref(tvlv_handler);
+	batadv_tvlv_handler_put(tvlv_handler);
 }
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From 4a13147cb5d2cb45984074c1becb164a6e9d465f Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 17 Jan 2016 11:01:20 +0100
Subject: batman-adv: Rename batadv_tvlv_container *_free_ref function to *_put

The batman-adv source code is the only place in the kernel which uses the
*_free_ref naming scheme for the *_put functions. Changing it to *_put
makes it more consistent and makes it easier to understand the connection
to the *_get functions.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/main.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 95aaff371966..bf8a01b8f026 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -694,11 +694,11 @@ static void batadv_tvlv_container_release(struct kref *ref)
 }
 
 /**
- * batadv_tvlv_container_free_ref - decrement the tvlv container refcounter and
+ * batadv_tvlv_container_put - decrement the tvlv container refcounter and
  *  possibly release it
  * @tvlv: the tvlv container to free
  */
-static void batadv_tvlv_container_free_ref(struct batadv_tvlv_container *tvlv)
+static void batadv_tvlv_container_put(struct batadv_tvlv_container *tvlv)
 {
 	kref_put(&tvlv->refcount, batadv_tvlv_container_release);
 }
@@ -784,8 +784,8 @@ static void batadv_tvlv_container_remove(struct batadv_priv *bat_priv,
 	hlist_del(&tvlv->list);
 
 	/* first call to decrement the counter, second call to free */
-	batadv_tvlv_container_free_ref(tvlv);
-	batadv_tvlv_container_free_ref(tvlv);
+	batadv_tvlv_container_put(tvlv);
+	batadv_tvlv_container_put(tvlv);
 }
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From 9c3bf08189bd4c0675032f5d3ca9dcb42cbb3ec8 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 17 Jan 2016 11:01:21 +0100
Subject: batman-adv: Rename batadv_softif_vlan *_free_ref function to *_put

The batman-adv source code is the only place in the kernel which uses the
*_free_ref naming scheme for the *_put functions. Changing it to *_put
makes it more consistent and makes it easier to understand the connection
to the *_get functions.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/main.c              |  2 +-
 net/batman-adv/soft-interface.c    | 14 +++++++-------
 net/batman-adv/soft-interface.h    |  2 +-
 net/batman-adv/sysfs.c             |  4 ++--
 net/batman-adv/translation-table.c | 20 ++++++++++----------
 5 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index bf8a01b8f026..e3d7051747b0 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -1261,7 +1261,7 @@ bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid)
 	vlan = batadv_softif_vlan_get(bat_priv, vid);
 	if (vlan) {
 		ap_isolation_enabled = atomic_read(&vlan->ap_isolation);
-		batadv_softif_vlan_free_ref(vlan);
+		batadv_softif_vlan_put(vlan);
 	}
 
 	return ap_isolation_enabled;
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index ca8fa4e28d4f..0710379491bf 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -497,11 +497,11 @@ static void batadv_softif_vlan_release(struct kref *ref)
 }
 
 /**
- * batadv_softif_vlan_free_ref - decrease the vlan object refcounter and
+ * batadv_softif_vlan_put - decrease the vlan object refcounter and
  *  possibly release it
  * @vlan: the vlan object to release
  */
-void batadv_softif_vlan_free_ref(struct batadv_softif_vlan *vlan)
+void batadv_softif_vlan_put(struct batadv_softif_vlan *vlan)
 {
 	if (!vlan)
 		return;
@@ -552,7 +552,7 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid)
 
 	vlan = batadv_softif_vlan_get(bat_priv, vid);
 	if (vlan) {
-		batadv_softif_vlan_free_ref(vlan);
+		batadv_softif_vlan_put(vlan);
 		return -EEXIST;
 	}
 
@@ -601,7 +601,7 @@ static void batadv_softif_destroy_vlan(struct batadv_priv *bat_priv,
 			       vlan->vid, "vlan interface destroyed", false);
 
 	batadv_sysfs_del_vlan(bat_priv, vlan);
-	batadv_softif_vlan_free_ref(vlan);
+	batadv_softif_vlan_put(vlan);
 }
 
 /**
@@ -646,7 +646,7 @@ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto,
 	if (!vlan->kobj) {
 		ret = batadv_sysfs_add_vlan(bat_priv->soft_iface, vlan);
 		if (ret) {
-			batadv_softif_vlan_free_ref(vlan);
+			batadv_softif_vlan_put(vlan);
 			return ret;
 		}
 	}
@@ -693,7 +693,7 @@ static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto,
 	batadv_softif_destroy_vlan(bat_priv, vlan);
 
 	/* finally free the vlan object */
-	batadv_softif_vlan_free_ref(vlan);
+	batadv_softif_vlan_put(vlan);
 
 	return 0;
 }
@@ -749,7 +749,7 @@ static void batadv_softif_destroy_finish(struct work_struct *work)
 	vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS);
 	if (vlan) {
 		batadv_softif_destroy_vlan(bat_priv, vlan);
-		batadv_softif_vlan_free_ref(vlan);
+		batadv_softif_vlan_put(vlan);
 	}
 
 	batadv_sysfs_del_meshif(soft_iface);
diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h
index d17cfbacf809..9ae265703d23 100644
--- a/net/batman-adv/soft-interface.h
+++ b/net/batman-adv/soft-interface.h
@@ -34,7 +34,7 @@ void batadv_softif_destroy_sysfs(struct net_device *soft_iface);
 int batadv_softif_is_valid(const struct net_device *net_dev);
 extern struct rtnl_link_ops batadv_link_ops;
 int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid);
-void batadv_softif_vlan_free_ref(struct batadv_softif_vlan *softif_vlan);
+void batadv_softif_vlan_put(struct batadv_softif_vlan *softif_vlan);
 struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv,
 						  unsigned short vid);
 
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index 0db7591a000f..4d70d4413e40 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -216,7 +216,7 @@ ssize_t batadv_store_vlan_##_name(struct kobject *kobj,			\
 					      attr, &vlan->_name,	\
 					      bat_priv->soft_iface);	\
 									\
-	batadv_softif_vlan_free_ref(vlan);				\
+	batadv_softif_vlan_put(vlan);					\
 	return res;							\
 }
 
@@ -231,7 +231,7 @@ ssize_t batadv_show_vlan_##_name(struct kobject *kobj,			\
 			     atomic_read(&vlan->_name) == 0 ?		\
 			     "disabled" : "enabled");			\
 									\
-	batadv_softif_vlan_free_ref(vlan);				\
+	batadv_softif_vlan_put(vlan);					\
 	return res;							\
 }
 
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 132635aa7b11..b81f8ea3395d 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -301,7 +301,7 @@ static void batadv_tt_local_size_mod(struct batadv_priv *bat_priv,
 
 	atomic_add(v, &vlan->tt.num_entries);
 
-	batadv_softif_vlan_free_ref(vlan);
+	batadv_softif_vlan_put(vlan);
 }
 
 /**
@@ -688,7 +688,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
 	if (unlikely(hash_added != 0)) {
 		/* remove the reference for the hash */
 		batadv_tt_local_entry_free_ref(tt_local);
-		batadv_softif_vlan_free_ref(vlan);
+		batadv_softif_vlan_put(vlan);
 		goto out;
 	}
 
@@ -1054,7 +1054,7 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset)
 				   no_purge ? 0 : last_seen_msecs,
 				   vlan->tt.crc);
 
-			batadv_softif_vlan_free_ref(vlan);
+			batadv_softif_vlan_put(vlan);
 		}
 		rcu_read_unlock();
 	}
@@ -1144,8 +1144,8 @@ u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr,
 	if (!vlan)
 		goto out;
 
-	batadv_softif_vlan_free_ref(vlan);
-	batadv_softif_vlan_free_ref(vlan);
+	batadv_softif_vlan_put(vlan);
+	batadv_softif_vlan_put(vlan);
 
 out:
 	if (tt_local_entry)
@@ -1245,8 +1245,8 @@ static void batadv_tt_local_table_free(struct batadv_priv *bat_priv)
 			vlan = batadv_softif_vlan_get(bat_priv,
 						      tt_common_entry->vid);
 			if (vlan) {
-				batadv_softif_vlan_free_ref(vlan);
-				batadv_softif_vlan_free_ref(vlan);
+				batadv_softif_vlan_put(vlan);
+				batadv_softif_vlan_put(vlan);
 			}
 
 			batadv_tt_local_entry_free_ref(tt_local);
@@ -3342,8 +3342,8 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv)
 			/* decrease the reference held for this vlan */
 			vlan = batadv_softif_vlan_get(bat_priv, tt_common->vid);
 			if (vlan) {
-				batadv_softif_vlan_free_ref(vlan);
-				batadv_softif_vlan_free_ref(vlan);
+				batadv_softif_vlan_put(vlan);
+				batadv_softif_vlan_put(vlan);
 			}
 
 			batadv_tt_local_entry_free_ref(tt_local);
@@ -3429,7 +3429,7 @@ bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst,
 	ret = true;
 
 out:
-	batadv_softif_vlan_free_ref(vlan);
+	batadv_softif_vlan_put(vlan);
 	if (tt_global_entry)
 		batadv_tt_global_entry_free_ref(tt_global_entry);
 	if (tt_local_entry)
-- 
cgit v1.2.3-59-g8ed1b


From 27ad7545bb4fc849f84382717a486438310c6ae4 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 17 Jan 2016 11:01:22 +0100
Subject: batman-adv: Rename batadv_nc_node *_free_ref function to *_put

The batman-adv source code is the only place in the kernel which uses the
*_free_ref naming scheme for the *_put functions. Changing it to *_put
makes it more consistent and makes it easier to understand the connection
to the *_get functions.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/network-coding.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index 9180ff114a26..6105cfb0574c 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -223,11 +223,11 @@ static void batadv_nc_node_release(struct kref *ref)
 }
 
 /**
- * batadv_nc_node_free_ref - decrement the nc_node refcounter and possibly
+ * batadv_nc_node_put - decrement the nc_node refcounter and possibly
  *  release it
  * @nc_node: nc_node to be free'd
  */
-static void batadv_nc_node_free_ref(struct batadv_nc_node *nc_node)
+static void batadv_nc_node_put(struct batadv_nc_node *nc_node)
 {
 	kref_put(&nc_node->refcount, batadv_nc_node_release);
 }
@@ -356,7 +356,7 @@ batadv_nc_purge_orig_nc_nodes(struct batadv_priv *bat_priv,
 			   "Removing nc_node %pM -> %pM\n",
 			   nc_node->addr, nc_node->orig_node->orig);
 		list_del_rcu(&nc_node->list);
-		batadv_nc_node_free_ref(nc_node);
+		batadv_nc_node_put(nc_node);
 	}
 	spin_unlock_bh(lock);
 }
@@ -942,9 +942,9 @@ void batadv_nc_update_nc_node(struct batadv_priv *bat_priv,
 
 out:
 	if (in_nc_node)
-		batadv_nc_node_free_ref(in_nc_node);
+		batadv_nc_node_put(in_nc_node);
 	if (out_nc_node)
-		batadv_nc_node_free_ref(out_nc_node);
+		batadv_nc_node_put(out_nc_node);
 }
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From 5fff28255fa6cadbdc46fa43873de2c7f98b850d Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 17 Jan 2016 11:01:23 +0100
Subject: batman-adv: Rename batadv_nc_path *_free_ref function to *_put

The batman-adv source code is the only place in the kernel which uses the
*_free_ref naming scheme for the *_put functions. Changing it to *_put
makes it more consistent and makes it easier to understand the connection
to the *_get functions.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/network-coding.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index 6105cfb0574c..d253bb23e2ac 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -247,11 +247,11 @@ static void batadv_nc_path_release(struct kref *ref)
 }
 
 /**
- * batadv_nc_path_free_ref - decrement the nc_path refcounter and possibly
+ * batadv_nc_path_put - decrement the nc_path refcounter and possibly
  *  release it
  * @nc_path: nc_path to be free'd
  */
-static void batadv_nc_path_free_ref(struct batadv_nc_path *nc_path)
+static void batadv_nc_path_put(struct batadv_nc_path *nc_path)
 {
 	kref_put(&nc_path->refcount, batadv_nc_path_release);
 }
@@ -263,7 +263,7 @@ static void batadv_nc_path_free_ref(struct batadv_nc_path *nc_path)
 static void batadv_nc_packet_free(struct batadv_nc_packet *nc_packet)
 {
 	kfree_skb(nc_packet->skb);
-	batadv_nc_path_free_ref(nc_packet->nc_path);
+	batadv_nc_path_put(nc_packet->nc_path);
 	kfree(nc_packet);
 }
 
@@ -467,7 +467,7 @@ static void batadv_nc_purge_paths(struct batadv_priv *bat_priv,
 				   "Remove nc_path %pM -> %pM\n",
 				   nc_path->prev_hop, nc_path->next_hop);
 			hlist_del_rcu(&nc_path->hash_entry);
-			batadv_nc_path_free_ref(nc_path);
+			batadv_nc_path_put(nc_path);
 		}
 		spin_unlock_bh(lock);
 	}
@@ -1555,7 +1555,7 @@ bool batadv_nc_skb_forward(struct sk_buff *skb,
 	return true;
 
 free_nc_path:
-	batadv_nc_path_free_ref(nc_path);
+	batadv_nc_path_put(nc_path);
 out:
 	/* Packet is not consumed */
 	return false;
@@ -1617,7 +1617,7 @@ void batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv,
 free_skb:
 	kfree_skb(skb);
 free_nc_path:
-	batadv_nc_path_free_ref(nc_path);
+	batadv_nc_path_put(nc_path);
 out:
 	return;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 21754e2501c1faf9cf530fb69e97f463937f8083 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 17 Jan 2016 11:01:24 +0100
Subject: batman-adv: Rename batadv_orig_node_vlan *_free_ref function to *_put

The batman-adv source code is the only place in the kernel which uses the
*_free_ref naming scheme for the *_put functions. Changing it to *_put
makes it more consistent and makes it easier to understand the connection
to the *_get functions.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/originator.c        |  8 ++++----
 net/batman-adv/originator.h        |  2 +-
 net/batman-adv/translation-table.c | 10 +++++-----
 3 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 0b1a77da2e1e..e4cbb0753e37 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -151,11 +151,11 @@ static void batadv_orig_node_vlan_release(struct kref *ref)
 }
 
 /**
- * batadv_orig_node_vlan_free_ref - decrement the refcounter and possibly
- *  release the originator-vlan object
+ * batadv_orig_node_vlan_put - decrement the refcounter and possibly release
+ *  the originator-vlan object
  * @orig_vlan: the originator-vlan object to release
  */
-void batadv_orig_node_vlan_free_ref(struct batadv_orig_node_vlan *orig_vlan)
+void batadv_orig_node_vlan_put(struct batadv_orig_node_vlan *orig_vlan)
 {
 	kref_put(&orig_vlan->refcount, batadv_orig_node_vlan_release);
 }
@@ -917,7 +917,7 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv,
 	 * Immediately release vlan since it is not needed anymore in this
 	 * context
 	 */
-	batadv_orig_node_vlan_free_ref(vlan);
+	batadv_orig_node_vlan_put(vlan);
 
 	for (i = 0; i < BATADV_FRAG_BUFFER_COUNT; i++) {
 		INIT_HLIST_HEAD(&orig_node->fragments[i].head);
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index 97748e84b66b..4e8b67f11051 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -83,7 +83,7 @@ batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node,
 struct batadv_orig_node_vlan *
 batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node,
 			  unsigned short vid);
-void batadv_orig_node_vlan_free_ref(struct batadv_orig_node_vlan *orig_vlan);
+void batadv_orig_node_vlan_put(struct batadv_orig_node_vlan *orig_vlan);
 
 /* hashfunction to choose an entry in a hash table of given size
  * hash algorithm from http://en.wikipedia.org/wiki/Hash_table
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index b81f8ea3395d..396b095c8a1d 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -348,12 +348,12 @@ static void batadv_tt_global_size_mod(struct batadv_orig_node *orig_node,
 		spin_lock_bh(&orig_node->vlan_list_lock);
 		if (!hlist_unhashed(&vlan->list)) {
 			hlist_del_init_rcu(&vlan->list);
-			batadv_orig_node_vlan_free_ref(vlan);
+			batadv_orig_node_vlan_put(vlan);
 		}
 		spin_unlock_bh(&orig_node->vlan_list_lock);
 	}
 
-	batadv_orig_node_vlan_free_ref(vlan);
+	batadv_orig_node_vlan_put(vlan);
 }
 
 /**
@@ -1652,7 +1652,7 @@ batadv_tt_global_print_entry(struct batadv_priv *bat_priv,
 			   ((flags & BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'),
 			   ((flags & BATADV_TT_CLIENT_TEMP) ? 'T' : '.'));
 
-		batadv_orig_node_vlan_free_ref(vlan);
+		batadv_orig_node_vlan_put(vlan);
 	}
 
 print_list:
@@ -1684,7 +1684,7 @@ print_list:
 			   ((flags & BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'),
 			   ((flags & BATADV_TT_CLIENT_TEMP) ? 'T' : '.'));
 
-		batadv_orig_node_vlan_free_ref(vlan);
+		batadv_orig_node_vlan_put(vlan);
 	}
 }
 
@@ -2503,7 +2503,7 @@ static bool batadv_tt_global_check_crc(struct batadv_orig_node *orig_node,
 			return false;
 
 		crc = vlan->tt.crc;
-		batadv_orig_node_vlan_free_ref(vlan);
+		batadv_orig_node_vlan_put(vlan);
 
 		if (crc != ntohl(tt_vlan_tmp->crc))
 			return false;
-- 
cgit v1.2.3-59-g8ed1b


From 95c0db90c791ebb73de89f960d12806f1708b31e Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 17 Jan 2016 11:01:25 +0100
Subject: batman-adv: Rename batadv_tt_local_entry *_free_ref function to *_put

The batman-adv source code is the only place in the kernel which uses the
*_free_ref naming scheme for the *_put functions. Changing it to *_put
makes it more consistent and makes it easier to understand the connection
to the *_get functions.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/translation-table.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 396b095c8a1d..de3e731542e3 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -219,12 +219,12 @@ static void batadv_tt_local_entry_release(struct kref *ref)
 }
 
 /**
- * batadv_tt_local_entry_free_ref - decrement the tt_local_entry refcounter and
+ * batadv_tt_local_entry_put - decrement the tt_local_entry refcounter and
  *  possibly release it
  * @tt_local_entry: tt_local_entry to be free'd
  */
 static void
-batadv_tt_local_entry_free_ref(struct batadv_tt_local_entry *tt_local_entry)
+batadv_tt_local_entry_put(struct batadv_tt_local_entry *tt_local_entry)
 {
 	kref_put(&tt_local_entry->common.refcount,
 		 batadv_tt_local_entry_release);
@@ -687,7 +687,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
 
 	if (unlikely(hash_added != 0)) {
 		/* remove the reference for the hash */
-		batadv_tt_local_entry_free_ref(tt_local);
+		batadv_tt_local_entry_put(tt_local);
 		batadv_softif_vlan_put(vlan);
 		goto out;
 	}
@@ -754,7 +754,7 @@ out:
 	if (in_dev)
 		dev_put(in_dev);
 	if (tt_local)
-		batadv_tt_local_entry_free_ref(tt_local);
+		batadv_tt_local_entry_put(tt_local);
 	if (tt_global)
 		batadv_tt_global_entry_free_ref(tt_global);
 	return ret;
@@ -1137,7 +1137,7 @@ u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr,
 		goto out;
 
 	/* extra call to free the local tt entry */
-	batadv_tt_local_entry_free_ref(tt_local_entry);
+	batadv_tt_local_entry_put(tt_local_entry);
 
 	/* decrease the reference held for this vlan */
 	vlan = batadv_softif_vlan_get(bat_priv, vid);
@@ -1149,7 +1149,7 @@ u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr,
 
 out:
 	if (tt_local_entry)
-		batadv_tt_local_entry_free_ref(tt_local_entry);
+		batadv_tt_local_entry_put(tt_local_entry);
 
 	return curr_flags;
 }
@@ -1249,7 +1249,7 @@ static void batadv_tt_local_table_free(struct batadv_priv *bat_priv)
 				batadv_softif_vlan_put(vlan);
 			}
 
-			batadv_tt_local_entry_free_ref(tt_local);
+			batadv_tt_local_entry_put(tt_local);
 		}
 		spin_unlock_bh(list_lock);
 	}
@@ -1555,7 +1555,7 @@ out:
 	if (tt_global_entry)
 		batadv_tt_global_entry_free_ref(tt_global_entry);
 	if (tt_local_entry)
-		batadv_tt_local_entry_free_ref(tt_local_entry);
+		batadv_tt_local_entry_put(tt_local_entry);
 	return ret;
 }
 
@@ -1911,7 +1911,7 @@ out:
 	if (tt_global_entry)
 		batadv_tt_global_entry_free_ref(tt_global_entry);
 	if (local_entry)
-		batadv_tt_local_entry_free_ref(local_entry);
+		batadv_tt_local_entry_put(local_entry);
 }
 
 /**
@@ -2143,7 +2143,7 @@ out:
 	if (tt_global_entry)
 		batadv_tt_global_entry_free_ref(tt_global_entry);
 	if (tt_local_entry)
-		batadv_tt_local_entry_free_ref(tt_local_entry);
+		batadv_tt_local_entry_put(tt_local_entry);
 
 	return orig_node;
 }
@@ -3023,7 +3023,7 @@ bool batadv_is_my_client(struct batadv_priv *bat_priv, const u8 *addr,
 	ret = true;
 out:
 	if (tt_local_entry)
-		batadv_tt_local_entry_free_ref(tt_local_entry);
+		batadv_tt_local_entry_put(tt_local_entry);
 	return ret;
 }
 
@@ -3346,7 +3346,7 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv)
 				batadv_softif_vlan_put(vlan);
 			}
 
-			batadv_tt_local_entry_free_ref(tt_local);
+			batadv_tt_local_entry_put(tt_local);
 		}
 		spin_unlock_bh(list_lock);
 	}
@@ -3433,7 +3433,7 @@ out:
 	if (tt_global_entry)
 		batadv_tt_global_entry_free_ref(tt_global_entry);
 	if (tt_local_entry)
-		batadv_tt_local_entry_free_ref(tt_local_entry);
+		batadv_tt_local_entry_put(tt_local_entry);
 	return ret;
 }
 
@@ -3569,7 +3569,7 @@ bool batadv_tt_local_client_is_roaming(struct batadv_priv *bat_priv,
 		goto out;
 
 	ret = tt_local_entry->common.flags & BATADV_TT_CLIENT_ROAM;
-	batadv_tt_local_entry_free_ref(tt_local_entry);
+	batadv_tt_local_entry_put(tt_local_entry);
 out:
 	return ret;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 5dafd8a6ccf4175ba29f80430915eae2c5558e1e Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 17 Jan 2016 11:01:26 +0100
Subject: batman-adv: Rename batadv_tt_global_entry *_free_ref function to
 *_put

The batman-adv source code is the only place in the kernel which uses the
*_free_ref naming scheme for the *_put functions. Changing it to *_put
makes it more consistent and makes it easier to understand the connection
to the *_get functions.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/translation-table.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index de3e731542e3..7f65a8f17f52 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -247,12 +247,12 @@ static void batadv_tt_global_entry_release(struct kref *ref)
 }
 
 /**
- * batadv_tt_global_entry_free_ref - decrement the tt_global_entry refcounter
- *  and possibly release it
+ * batadv_tt_global_entry_put - decrement the tt_global_entry refcounter and
+ *  possibly release it
  * @tt_global_entry: tt_global_entry to be free'd
  */
 static void
-batadv_tt_global_entry_free_ref(struct batadv_tt_global_entry *tt_global_entry)
+batadv_tt_global_entry_put(struct batadv_tt_global_entry *tt_global_entry)
 {
 	kref_put(&tt_global_entry->common.refcount,
 		 batadv_tt_global_entry_release);
@@ -278,7 +278,7 @@ int batadv_tt_global_hash_count(struct batadv_priv *bat_priv,
 		return 0;
 
 	count = atomic_read(&tt_global_entry->orig_list_count);
-	batadv_tt_global_entry_free_ref(tt_global_entry);
+	batadv_tt_global_entry_put(tt_global_entry);
 
 	return count;
 }
@@ -561,7 +561,7 @@ static void batadv_tt_global_free(struct batadv_priv *bat_priv,
 
 	batadv_hash_remove(bat_priv->tt.global_hash, batadv_compare_tt,
 			   batadv_choose_tt, &tt_global->common);
-	batadv_tt_global_entry_free_ref(tt_global);
+	batadv_tt_global_entry_put(tt_global);
 }
 
 /**
@@ -756,7 +756,7 @@ out:
 	if (tt_local)
 		batadv_tt_local_entry_put(tt_local);
 	if (tt_global)
-		batadv_tt_global_entry_free_ref(tt_global);
+		batadv_tt_global_entry_put(tt_global);
 	return ret;
 }
 
@@ -1467,7 +1467,7 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv,
 
 		if (unlikely(hash_added != 0)) {
 			/* remove the reference for the hash */
-			batadv_tt_global_entry_free_ref(tt_global_entry);
+			batadv_tt_global_entry_put(tt_global_entry);
 			goto out_remove;
 		}
 	} else {
@@ -1553,7 +1553,7 @@ out_remove:
 
 out:
 	if (tt_global_entry)
-		batadv_tt_global_entry_free_ref(tt_global_entry);
+		batadv_tt_global_entry_put(tt_global_entry);
 	if (tt_local_entry)
 		batadv_tt_local_entry_put(tt_local_entry);
 	return ret;
@@ -1909,7 +1909,7 @@ static void batadv_tt_global_del(struct batadv_priv *bat_priv,
 
 out:
 	if (tt_global_entry)
-		batadv_tt_global_entry_free_ref(tt_global_entry);
+		batadv_tt_global_entry_put(tt_global_entry);
 	if (local_entry)
 		batadv_tt_local_entry_put(local_entry);
 }
@@ -1965,7 +1965,7 @@ void batadv_tt_global_del_orig(struct batadv_priv *bat_priv,
 					   tt_global->common.addr,
 					   BATADV_PRINT_VID(vid), message);
 				hlist_del_rcu(&tt_common_entry->hash_entry);
-				batadv_tt_global_entry_free_ref(tt_global);
+				batadv_tt_global_entry_put(tt_global);
 			}
 		}
 		spin_unlock_bh(list_lock);
@@ -2028,7 +2028,7 @@ static void batadv_tt_global_purge(struct batadv_priv *bat_priv)
 
 			hlist_del_rcu(&tt_common->hash_entry);
 
-			batadv_tt_global_entry_free_ref(tt_global);
+			batadv_tt_global_entry_put(tt_global);
 		}
 		spin_unlock_bh(list_lock);
 	}
@@ -2060,7 +2060,7 @@ static void batadv_tt_global_table_free(struct batadv_priv *bat_priv)
 			tt_global = container_of(tt_common_entry,
 						 struct batadv_tt_global_entry,
 						 common);
-			batadv_tt_global_entry_free_ref(tt_global);
+			batadv_tt_global_entry_put(tt_global);
 		}
 		spin_unlock_bh(list_lock);
 	}
@@ -2141,7 +2141,7 @@ struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv,
 
 out:
 	if (tt_global_entry)
-		batadv_tt_global_entry_free_ref(tt_global_entry);
+		batadv_tt_global_entry_put(tt_global_entry);
 	if (tt_local_entry)
 		batadv_tt_local_entry_put(tt_local_entry);
 
@@ -3431,7 +3431,7 @@ bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst,
 out:
 	batadv_softif_vlan_put(vlan);
 	if (tt_global_entry)
-		batadv_tt_global_entry_free_ref(tt_global_entry);
+		batadv_tt_global_entry_put(tt_global_entry);
 	if (tt_local_entry)
 		batadv_tt_local_entry_put(tt_local_entry);
 	return ret;
@@ -3543,7 +3543,7 @@ bool batadv_tt_global_client_is_roaming(struct batadv_priv *bat_priv,
 		goto out;
 
 	ret = tt_global_entry->common.flags & BATADV_TT_CLIENT_ROAM;
-	batadv_tt_global_entry_free_ref(tt_global_entry);
+	batadv_tt_global_entry_put(tt_global_entry);
 out:
 	return ret;
 }
@@ -3863,7 +3863,7 @@ bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv,
 
 	ret = tt->common.flags & BATADV_TT_CLIENT_ISOLA;
 
-	batadv_tt_global_entry_free_ref(tt);
+	batadv_tt_global_entry_put(tt);
 
 	return ret;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 7e2366c6267d4ea257c4d0deb2fcf0744749de5f Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 17 Jan 2016 11:01:27 +0100
Subject: batman-adv: Rename batadv_tt_orig_list_entry *_free_ref function to
 *_put

The batman-adv source code is the only place in the kernel which uses the
*_free_ref naming scheme for the *_put functions. Changing it to *_put
makes it more consistent and makes it easier to understand the connection
to the *_get functions.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/translation-table.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 7f65a8f17f52..0b43e86328a5 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -397,12 +397,12 @@ static void batadv_tt_orig_list_entry_release(struct kref *ref)
 }
 
 /**
- * batadv_tt_orig_list_entry_free_ref - decrement the tt orig entry refcounter
- *  and possibly release it
+ * batadv_tt_orig_list_entry_put - decrement the tt orig entry refcounter and
+ *  possibly release it
  * @orig_entry: tt orig entry to be free'd
  */
 static void
-batadv_tt_orig_list_entry_free_ref(struct batadv_tt_orig_list_entry *orig_entry)
+batadv_tt_orig_list_entry_put(struct batadv_tt_orig_list_entry *orig_entry)
 {
 	kref_put(&orig_entry->refcount, batadv_tt_orig_list_entry_release);
 }
@@ -1345,7 +1345,7 @@ batadv_tt_global_entry_has_orig(const struct batadv_tt_global_entry *entry,
 	orig_entry = batadv_tt_global_orig_entry_find(entry, orig_node);
 	if (orig_entry) {
 		found = true;
-		batadv_tt_orig_list_entry_free_ref(orig_entry);
+		batadv_tt_orig_list_entry_put(orig_entry);
 	}
 
 	return found;
@@ -1386,7 +1386,7 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global,
 
 out:
 	if (orig_entry)
-		batadv_tt_orig_list_entry_free_ref(orig_entry);
+		batadv_tt_orig_list_entry_put(orig_entry);
 }
 
 /**
@@ -1753,7 +1753,7 @@ _batadv_tt_global_del_orig_entry(struct batadv_tt_global_entry *tt_global_entry,
 	 * being part of a list
 	 */
 	hlist_del_rcu(&orig_entry->list);
-	batadv_tt_orig_list_entry_free_ref(orig_entry);
+	batadv_tt_orig_list_entry_put(orig_entry);
 }
 
 /* deletes the orig list of a tt_global_entry */
-- 
cgit v1.2.3-59-g8ed1b


From 6d5d2ee63cee7025badda3b74ae2ef7ab097acfa Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Fri, 8 Jan 2016 19:28:58 +0100
Subject: Bluetooth: add LED trigger for indicating HCI is powered up

Add support for LED triggers to the Bluetooth subsystem and add kernel
config symbol BT_LEDS for it.

For now one trigger for indicating "HCI is powered up" is supported.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/hci_core.h |  3 ++
 net/bluetooth/Kconfig            |  9 +++++
 net/bluetooth/Makefile           |  1 +
 net/bluetooth/hci_core.c         |  8 ++++
 net/bluetooth/leds.c             | 80 ++++++++++++++++++++++++++++++++++++++++
 net/bluetooth/leds.h             | 18 +++++++++
 6 files changed, 119 insertions(+)
 create mode 100644 net/bluetooth/leds.c
 create mode 100644 net/bluetooth/leds.h

(limited to 'net')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index d4f82edb5cff..dc71473462ac 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -25,6 +25,7 @@
 #ifndef __HCI_CORE_H
 #define __HCI_CORE_H
 
+#include <linux/leds.h>
 #include <net/bluetooth/hci.h>
 #include <net/bluetooth/hci_sock.h>
 
@@ -396,6 +397,8 @@ struct hci_dev {
 	struct delayed_work	rpa_expired;
 	bdaddr_t		rpa;
 
+	struct led_trigger	*power_led;
+
 	int (*open)(struct hci_dev *hdev);
 	int (*close)(struct hci_dev *hdev);
 	int (*flush)(struct hci_dev *hdev);
diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig
index 95d1a66ba03a..06c31b9a68b0 100644
--- a/net/bluetooth/Kconfig
+++ b/net/bluetooth/Kconfig
@@ -69,6 +69,15 @@ config BT_6LOWPAN
 	help
 	  IPv6 compression over Bluetooth Low Energy.
 
+config BT_LEDS
+	bool "Enable LED triggers"
+	depends on BT
+	depends on LEDS_CLASS
+	select LEDS_TRIGGERS
+	help
+	  This option selects a few LED triggers for different
+	  Bluetooth events.
+
 config BT_SELFTEST
 	bool "Bluetooth self testing support"
 	depends on BT && DEBUG_KERNEL
diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile
index 2b15ae8c1def..b3ff12eb9b6d 100644
--- a/net/bluetooth/Makefile
+++ b/net/bluetooth/Makefile
@@ -17,6 +17,7 @@ bluetooth-y := af_bluetooth.o hci_core.o hci_conn.o hci_event.o mgmt.o \
 
 bluetooth-$(CONFIG_BT_BREDR) += sco.o
 bluetooth-$(CONFIG_BT_HS) += a2mp.o amp.o
+bluetooth-$(CONFIG_BT_LEDS) += leds.o
 bluetooth-$(CONFIG_BT_DEBUGFS) += hci_debugfs.o
 bluetooth-$(CONFIG_BT_SELFTEST) += selftest.o
 
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 883c821a9e78..88f1ef3589d8 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -40,6 +40,7 @@
 #include "hci_request.h"
 #include "hci_debugfs.h"
 #include "smp.h"
+#include "leds.h"
 
 static void hci_rx_work(struct work_struct *work);
 static void hci_cmd_work(struct work_struct *work);
@@ -1395,6 +1396,7 @@ static int hci_dev_do_open(struct hci_dev *hdev)
 		hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
 		set_bit(HCI_UP, &hdev->flags);
 		hci_sock_dev_event(hdev, HCI_DEV_UP);
+		hci_leds_update_powered(hdev, true);
 		if (!hci_dev_test_flag(hdev, HCI_SETUP) &&
 		    !hci_dev_test_flag(hdev, HCI_CONFIG) &&
 		    !hci_dev_test_flag(hdev, HCI_UNCONFIGURED) &&
@@ -1532,6 +1534,8 @@ int hci_dev_do_close(struct hci_dev *hdev)
 		return 0;
 	}
 
+	hci_leds_update_powered(hdev, false);
+
 	/* Flush RX and TX works */
 	flush_work(&hdev->tx_work);
 	flush_work(&hdev->rx_work);
@@ -3067,6 +3071,8 @@ int hci_register_dev(struct hci_dev *hdev)
 	if (error < 0)
 		goto err_wqueue;
 
+	hci_leds_init(hdev);
+
 	hdev->rfkill = rfkill_alloc(hdev->name, &hdev->dev,
 				    RFKILL_TYPE_BLUETOOTH, &hci_rfkill_ops,
 				    hdev);
@@ -3128,6 +3134,8 @@ void hci_unregister_dev(struct hci_dev *hdev)
 
 	id = hdev->id;
 
+	hci_leds_exit(hdev);
+
 	write_lock(&hci_dev_list_lock);
 	list_del(&hdev->list);
 	write_unlock(&hci_dev_list_lock);
diff --git a/net/bluetooth/leds.c b/net/bluetooth/leds.c
new file mode 100644
index 000000000000..ded7c88eaccc
--- /dev/null
+++ b/net/bluetooth/leds.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright 2015, Heiner Kallweit <hkallweit1@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+
+#include "leds.h"
+
+struct hci_basic_led_trigger {
+	struct led_trigger	led_trigger;
+	struct hci_dev		*hdev;
+};
+
+#define to_hci_basic_led_trigger(arg) container_of(arg, \
+			struct hci_basic_led_trigger, led_trigger)
+
+void hci_leds_update_powered(struct hci_dev *hdev, bool enabled)
+{
+	if (hdev->power_led)
+		led_trigger_event(hdev->power_led,
+				  enabled ? LED_FULL : LED_OFF);
+}
+
+static void power_activate(struct led_classdev *led_cdev)
+{
+	struct hci_basic_led_trigger *htrig;
+	bool powered;
+
+	htrig = to_hci_basic_led_trigger(led_cdev->trigger);
+	powered = test_bit(HCI_UP, &htrig->hdev->flags);
+
+	led_trigger_event(led_cdev->trigger, powered ? LED_FULL : LED_OFF);
+}
+
+static struct led_trigger *led_allocate_basic(struct hci_dev *hdev,
+			void (*activate)(struct led_classdev *led_cdev),
+			const char *name)
+{
+	struct hci_basic_led_trigger *htrig;
+
+	htrig =	devm_kzalloc(&hdev->dev, sizeof(*htrig), GFP_KERNEL);
+	if (!htrig)
+		return NULL;
+
+	htrig->hdev = hdev;
+	htrig->led_trigger.activate = activate;
+	htrig->led_trigger.name = devm_kasprintf(&hdev->dev, GFP_KERNEL,
+						 "%s-%s", hdev->name,
+						 name);
+	if (!htrig->led_trigger.name)
+		goto err_alloc;
+
+	if (led_trigger_register(&htrig->led_trigger))
+		goto err_register;
+
+	return &htrig->led_trigger;
+
+err_register:
+	devm_kfree(&hdev->dev, (void *)htrig->led_trigger.name);
+err_alloc:
+	devm_kfree(&hdev->dev, htrig);
+	return NULL;
+}
+
+void hci_leds_init(struct hci_dev *hdev)
+{
+	/* initialize power_led */
+	hdev->power_led = led_allocate_basic(hdev, power_activate, "power");
+}
+
+void hci_leds_exit(struct hci_dev *hdev)
+{
+	if (hdev->power_led)
+		led_trigger_unregister(hdev->power_led);
+}
diff --git a/net/bluetooth/leds.h b/net/bluetooth/leds.h
new file mode 100644
index 000000000000..068261a4e12c
--- /dev/null
+++ b/net/bluetooth/leds.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright 2015, Heiner Kallweit <hkallweit1@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#if IS_ENABLED(CONFIG_BT_LEDS)
+void hci_leds_update_powered(struct hci_dev *hdev, bool enabled);
+void hci_leds_init(struct hci_dev *hdev);
+void hci_leds_exit(struct hci_dev *hdev);
+#else
+static inline void hci_leds_update_powered(struct hci_dev *hdev,
+					   bool enabled) {}
+static inline void hci_leds_init(struct hci_dev *hdev) {}
+static inline void hci_leds_exit(struct hci_dev *hdev) {}
+#endif
-- 
cgit v1.2.3-59-g8ed1b


From b6e402fc84a76c0238b64de497920aeeaa495026 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Mon, 25 Jan 2016 20:46:21 +0100
Subject: Bluetooth: Use managed version of led_trigger_register in LED trigger

Recently a managed version of led_trigger_register was introduced.
Using devm_led_trigger_register allows to simplify the LED trigger code.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 net/bluetooth/hci_core.c | 2 --
 net/bluetooth/leds.c     | 8 +-------
 net/bluetooth/leds.h     | 2 --
 3 files changed, 1 insertion(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 88f1ef3589d8..9c0a6830ff92 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -3134,8 +3134,6 @@ void hci_unregister_dev(struct hci_dev *hdev)
 
 	id = hdev->id;
 
-	hci_leds_exit(hdev);
-
 	write_lock(&hci_dev_list_lock);
 	list_del(&hdev->list);
 	write_unlock(&hci_dev_list_lock);
diff --git a/net/bluetooth/leds.c b/net/bluetooth/leds.c
index ded7c88eaccc..8319c8440c89 100644
--- a/net/bluetooth/leds.c
+++ b/net/bluetooth/leds.c
@@ -55,7 +55,7 @@ static struct led_trigger *led_allocate_basic(struct hci_dev *hdev,
 	if (!htrig->led_trigger.name)
 		goto err_alloc;
 
-	if (led_trigger_register(&htrig->led_trigger))
+	if (devm_led_trigger_register(&hdev->dev, &htrig->led_trigger))
 		goto err_register;
 
 	return &htrig->led_trigger;
@@ -72,9 +72,3 @@ void hci_leds_init(struct hci_dev *hdev)
 	/* initialize power_led */
 	hdev->power_led = led_allocate_basic(hdev, power_activate, "power");
 }
-
-void hci_leds_exit(struct hci_dev *hdev)
-{
-	if (hdev->power_led)
-		led_trigger_unregister(hdev->power_led);
-}
diff --git a/net/bluetooth/leds.h b/net/bluetooth/leds.h
index 068261a4e12c..a9c4d6ea01cf 100644
--- a/net/bluetooth/leds.h
+++ b/net/bluetooth/leds.h
@@ -9,10 +9,8 @@
 #if IS_ENABLED(CONFIG_BT_LEDS)
 void hci_leds_update_powered(struct hci_dev *hdev, bool enabled);
 void hci_leds_init(struct hci_dev *hdev);
-void hci_leds_exit(struct hci_dev *hdev);
 #else
 static inline void hci_leds_update_powered(struct hci_dev *hdev,
 					   bool enabled) {}
 static inline void hci_leds_init(struct hci_dev *hdev) {}
-static inline void hci_leds_exit(struct hci_dev *hdev) {}
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From d82142a8b1338e6a4339920863423379c27b0b16 Mon Sep 17 00:00:00 2001
From: Wei-Ning Huang <wnhuang@chromium.org>
Date: Mon, 15 Feb 2016 17:09:51 +0800
Subject: Bluetooth: hci_core: cancel power off delayed work properly

When the HCI_AUTO_OFF flag is cleared, the power_off delayed work need
to be cancel or HCI will be powered off even if it's managed.

Signed-off-by: Wei-Ning Huang <wnhuang@chromium.org>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 net/bluetooth/hci_core.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 9c0a6830ff92..2713fc86e85a 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -2021,6 +2021,7 @@ static void hci_power_on(struct work_struct *work)
 	if (test_bit(HCI_UP, &hdev->flags) &&
 	    hci_dev_test_flag(hdev, HCI_MGMT) &&
 	    hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) {
+		cancel_delayed_work(&hdev->power_off);
 		hci_req_sync_lock(hdev);
 		err = __hci_req_hci_power_on(hdev);
 		hci_req_sync_unlock(hdev);
-- 
cgit v1.2.3-59-g8ed1b


From aef00c15b8c503083a703900a755fdb1cf2436e0 Mon Sep 17 00:00:00 2001
From: Koen Zandberg <koen@bergzand.net>
Date: Wed, 10 Feb 2016 11:49:38 +0100
Subject: mac802154: Fixes kernel oops when unloading a radio driver

Destroying the workqueue before unregistering the net device caused a
kernel oops

Signed-off-by: Koen Zandberg <koen@bergzand.net>
Acked-by: Alexander Aring <aar@pengutronix.de>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 net/mac802154/main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac802154/main.c b/net/mac802154/main.c
index e8cab5bb80c6..87da85ae5a6b 100644
--- a/net/mac802154/main.c
+++ b/net/mac802154/main.c
@@ -218,7 +218,6 @@ void ieee802154_unregister_hw(struct ieee802154_hw *hw)
 
 	tasklet_kill(&local->tasklet);
 	flush_workqueue(local->workqueue);
-	destroy_workqueue(local->workqueue);
 
 	rtnl_lock();
 
@@ -226,6 +225,7 @@ void ieee802154_unregister_hw(struct ieee802154_hw *hw)
 
 	rtnl_unlock();
 
+	destroy_workqueue(local->workqueue);
 	wpan_phy_unregister(local->phy);
 }
 EXPORT_SYMBOL(ieee802154_unregister_hw);
-- 
cgit v1.2.3-59-g8ed1b


From 5609c185f24dffca5f6a9c127106869da150be03 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aar@pengutronix.de>
Date: Mon, 22 Feb 2016 09:13:54 +0100
Subject: 6lowpan: iphc: add support for stateful compression

This patch introduce support for IPHC stateful address compression. It
will offer the context table via one debugfs entry.
This debugfs has and directory for each cid entry for the context table.
Inside each cid directory there exists the following files:

 - "active": If the entry is added or deleted. The context table is
   original a list implementation, this flag will indicate if the
   context is part of list or not.
 - "prefix": The ipv6 prefix.
 - "prefix_length": The prefix length for the prefix.
 - "compression": The compression flag according RFC6775.

This part should be moved into sysfs after some testing time.

Also the debugfs entry contains a "show" file which is a pretty-printout
for the current context table information.

Reviewed-by: Stefan Schmidt <stefan@osg.samsung.com>
Signed-off-by: Alexander Aring <aar@pengutronix.de>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/6lowpan.h |  32 ++++
 net/6lowpan/core.c    |  39 ++++-
 net/6lowpan/debugfs.c | 247 ++++++++++++++++++++++++++++++
 net/6lowpan/iphc.c    | 414 +++++++++++++++++++++++++++++++++++++++++++-------
 4 files changed, 674 insertions(+), 58 deletions(-)

(limited to 'net')

diff --git a/include/net/6lowpan.h b/include/net/6lowpan.h
index 2f6a3f2233ed..da3a77d25fcb 100644
--- a/include/net/6lowpan.h
+++ b/include/net/6lowpan.h
@@ -75,6 +75,8 @@
 #define LOWPAN_IPHC_MAX_HC_BUF_LEN	(sizeof(struct ipv6hdr) +	\
 					 LOWPAN_IPHC_MAX_HEADER_LEN +	\
 					 LOWPAN_NHC_MAX_HDR_LEN)
+/* SCI/DCI is 4 bit width, so we have maximum 16 entries */
+#define LOWPAN_IPHC_CTX_TABLE_SIZE	(1 << 4)
 
 #define LOWPAN_DISPATCH_IPV6		0x41 /* 01000001 = 65 */
 #define LOWPAN_DISPATCH_IPHC		0x60 /* 011xxxxx = ... */
@@ -98,9 +100,39 @@ enum lowpan_lltypes {
 	LOWPAN_LLTYPE_IEEE802154,
 };
 
+enum lowpan_iphc_ctx_flags {
+	LOWPAN_IPHC_CTX_FLAG_ACTIVE,
+	LOWPAN_IPHC_CTX_FLAG_COMPRESSION,
+};
+
+struct lowpan_iphc_ctx {
+	u8 id;
+	struct in6_addr pfx;
+	u8 plen;
+	unsigned long flags;
+};
+
+struct lowpan_iphc_ctx_table {
+	spinlock_t lock;
+	const struct lowpan_iphc_ctx_ops *ops;
+	struct lowpan_iphc_ctx table[LOWPAN_IPHC_CTX_TABLE_SIZE];
+};
+
+static inline bool lowpan_iphc_ctx_is_active(const struct lowpan_iphc_ctx *ctx)
+{
+	return test_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE, &ctx->flags);
+}
+
+static inline bool
+lowpan_iphc_ctx_is_compression(const struct lowpan_iphc_ctx *ctx)
+{
+	return test_bit(LOWPAN_IPHC_CTX_FLAG_COMPRESSION, &ctx->flags);
+}
+
 struct lowpan_priv {
 	enum lowpan_lltypes lltype;
 	struct dentry *iface_debugfs;
+	struct lowpan_iphc_ctx_table ctx;
 
 	/* must be last */
 	u8 priv[0] __aligned(sizeof(void *));
diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c
index faf65baed617..34e44c0c0836 100644
--- a/net/6lowpan/core.c
+++ b/net/6lowpan/core.c
@@ -20,7 +20,7 @@
 int lowpan_register_netdevice(struct net_device *dev,
 			      enum lowpan_lltypes lltype)
 {
-	int ret;
+	int i, ret;
 
 	dev->addr_len = EUI64_ADDR_LEN;
 	dev->type = ARPHRD_6LOWPAN;
@@ -29,6 +29,10 @@ int lowpan_register_netdevice(struct net_device *dev,
 
 	lowpan_priv(dev)->lltype = lltype;
 
+	spin_lock_init(&lowpan_priv(dev)->ctx.lock);
+	for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++)
+		lowpan_priv(dev)->ctx.table[i].id = i;
+
 	ret = register_netdevice(dev);
 	if (ret < 0)
 		return ret;
@@ -68,6 +72,32 @@ void lowpan_unregister_netdev(struct net_device *dev)
 }
 EXPORT_SYMBOL(lowpan_unregister_netdev);
 
+static int lowpan_event(struct notifier_block *unused,
+			unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	int i;
+
+	if (dev->type != ARPHRD_6LOWPAN)
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case NETDEV_DOWN:
+		for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++)
+			clear_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE,
+				  &lowpan_priv(dev)->ctx.table[i].flags);
+		break;
+	default:
+		return NOTIFY_DONE;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block lowpan_notifier = {
+	.notifier_call = lowpan_event,
+};
+
 static int __init lowpan_module_init(void)
 {
 	int ret;
@@ -76,6 +106,12 @@ static int __init lowpan_module_init(void)
 	if (ret < 0)
 		return ret;
 
+	ret = register_netdevice_notifier(&lowpan_notifier);
+	if (ret < 0) {
+		lowpan_debugfs_exit();
+		return ret;
+	}
+
 	request_module_nowait("ipv6");
 
 	request_module_nowait("nhc_dest");
@@ -92,6 +128,7 @@ static int __init lowpan_module_init(void)
 static void __exit lowpan_module_exit(void)
 {
 	lowpan_debugfs_exit();
+	unregister_netdevice_notifier(&lowpan_notifier);
 }
 
 module_init(lowpan_module_init);
diff --git a/net/6lowpan/debugfs.c b/net/6lowpan/debugfs.c
index 88eef84df0fc..aa49ff4ce6fd 100644
--- a/net/6lowpan/debugfs.c
+++ b/net/6lowpan/debugfs.c
@@ -16,19 +16,266 @@
 
 #include "6lowpan_i.h"
 
+#define LOWPAN_DEBUGFS_CTX_PFX_NUM_ARGS	8
+
 static struct dentry *lowpan_debugfs;
 
+static int lowpan_ctx_flag_active_set(void *data, u64 val)
+{
+	struct lowpan_iphc_ctx *ctx = data;
+
+	if (val != 0 && val != 1)
+		return -EINVAL;
+
+	if (val)
+		set_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE, &ctx->flags);
+	else
+		clear_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE, &ctx->flags);
+
+	return 0;
+}
+
+static int lowpan_ctx_flag_active_get(void *data, u64 *val)
+{
+	*val = lowpan_iphc_ctx_is_active(data);
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(lowpan_ctx_flag_active_fops,
+			lowpan_ctx_flag_active_get,
+			lowpan_ctx_flag_active_set, "%llu\n");
+
+static int lowpan_ctx_flag_c_set(void *data, u64 val)
+{
+	struct lowpan_iphc_ctx *ctx = data;
+
+	if (val != 0 && val != 1)
+		return -EINVAL;
+
+	if (val)
+		set_bit(LOWPAN_IPHC_CTX_FLAG_COMPRESSION, &ctx->flags);
+	else
+		clear_bit(LOWPAN_IPHC_CTX_FLAG_COMPRESSION, &ctx->flags);
+
+	return 0;
+}
+
+static int lowpan_ctx_flag_c_get(void *data, u64 *val)
+{
+	*val = lowpan_iphc_ctx_is_compression(data);
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(lowpan_ctx_flag_c_fops, lowpan_ctx_flag_c_get,
+			lowpan_ctx_flag_c_set, "%llu\n");
+
+static int lowpan_ctx_plen_set(void *data, u64 val)
+{
+	struct lowpan_iphc_ctx *ctx = data;
+	struct lowpan_iphc_ctx_table *t =
+		container_of(ctx, struct lowpan_iphc_ctx_table, table[ctx->id]);
+
+	if (val > 128)
+		return -EINVAL;
+
+	spin_lock_bh(&t->lock);
+	ctx->plen = val;
+	spin_unlock_bh(&t->lock);
+
+	return 0;
+}
+
+static int lowpan_ctx_plen_get(void *data, u64 *val)
+{
+	struct lowpan_iphc_ctx *ctx = data;
+	struct lowpan_iphc_ctx_table *t =
+		container_of(ctx, struct lowpan_iphc_ctx_table, table[ctx->id]);
+
+	spin_lock_bh(&t->lock);
+	*val = ctx->plen;
+	spin_unlock_bh(&t->lock);
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(lowpan_ctx_plen_fops, lowpan_ctx_plen_get,
+			lowpan_ctx_plen_set, "%llu\n");
+
+static int lowpan_ctx_pfx_show(struct seq_file *file, void *offset)
+{
+	struct lowpan_iphc_ctx *ctx = file->private;
+	struct lowpan_iphc_ctx_table *t =
+		container_of(ctx, struct lowpan_iphc_ctx_table, table[ctx->id]);
+
+	spin_lock_bh(&t->lock);
+	seq_printf(file, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
+		   be16_to_cpu(ctx->pfx.s6_addr16[0]),
+		   be16_to_cpu(ctx->pfx.s6_addr16[1]),
+		   be16_to_cpu(ctx->pfx.s6_addr16[2]),
+		   be16_to_cpu(ctx->pfx.s6_addr16[3]),
+		   be16_to_cpu(ctx->pfx.s6_addr16[4]),
+		   be16_to_cpu(ctx->pfx.s6_addr16[5]),
+		   be16_to_cpu(ctx->pfx.s6_addr16[6]),
+		   be16_to_cpu(ctx->pfx.s6_addr16[7]));
+	spin_unlock_bh(&t->lock);
+
+	return 0;
+}
+
+static int lowpan_ctx_pfx_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, lowpan_ctx_pfx_show, inode->i_private);
+}
+
+static ssize_t lowpan_ctx_pfx_write(struct file *fp,
+				    const char __user *user_buf, size_t count,
+				    loff_t *ppos)
+{
+	char buf[128] = {};
+	struct seq_file *file = fp->private_data;
+	struct lowpan_iphc_ctx *ctx = file->private;
+	struct lowpan_iphc_ctx_table *t =
+		container_of(ctx, struct lowpan_iphc_ctx_table, table[ctx->id]);
+	int status = count, n, i;
+	unsigned int addr[8];
+
+	if (copy_from_user(&buf, user_buf, min_t(size_t, sizeof(buf) - 1,
+						 count))) {
+		status = -EFAULT;
+		goto out;
+	}
+
+	n = sscanf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x",
+		   &addr[0], &addr[1], &addr[2], &addr[3], &addr[4],
+		   &addr[5], &addr[6], &addr[7]);
+	if (n != LOWPAN_DEBUGFS_CTX_PFX_NUM_ARGS) {
+		status = -EINVAL;
+		goto out;
+	}
+
+	spin_lock_bh(&t->lock);
+	for (i = 0; i < 8; i++)
+		ctx->pfx.s6_addr16[i] = cpu_to_be16(addr[i] & 0xffff);
+	spin_unlock_bh(&t->lock);
+
+out:
+	return status;
+}
+
+const struct file_operations lowpan_ctx_pfx_fops = {
+	.open		= lowpan_ctx_pfx_open,
+	.read		= seq_read,
+	.write		= lowpan_ctx_pfx_write,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int lowpan_dev_debugfs_ctx_init(struct net_device *dev,
+				       struct dentry *ctx, u8 id)
+{
+	struct lowpan_priv *lpriv = lowpan_priv(dev);
+	struct dentry *dentry, *root;
+	char buf[32];
+
+	WARN_ON_ONCE(id > LOWPAN_IPHC_CTX_TABLE_SIZE);
+
+	sprintf(buf, "%d", id);
+
+	root = debugfs_create_dir(buf, ctx);
+	if (!root)
+		return -EINVAL;
+
+	dentry = debugfs_create_file("active", 0644, root,
+				     &lpriv->ctx.table[id],
+				     &lowpan_ctx_flag_active_fops);
+	if (!dentry)
+		return -EINVAL;
+
+	dentry = debugfs_create_file("compression", 0644, root,
+				     &lpriv->ctx.table[id],
+				     &lowpan_ctx_flag_c_fops);
+	if (!dentry)
+		return -EINVAL;
+
+	dentry = debugfs_create_file("prefix", 0644, root,
+				     &lpriv->ctx.table[id],
+				     &lowpan_ctx_pfx_fops);
+	if (!dentry)
+		return -EINVAL;
+
+	dentry = debugfs_create_file("prefix_len", 0644, root,
+				     &lpriv->ctx.table[id],
+				     &lowpan_ctx_plen_fops);
+	if (!dentry)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int lowpan_context_show(struct seq_file *file, void *offset)
+{
+	struct lowpan_iphc_ctx_table *t = file->private;
+	int i;
+
+	seq_printf(file, "%3s|%-43s|%c\n", "cid", "prefix", 'C');
+	seq_puts(file, "-------------------------------------------------\n");
+
+	spin_lock_bh(&t->lock);
+	for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) {
+		if (!lowpan_iphc_ctx_is_active(&t->table[i]))
+			continue;
+
+		seq_printf(file, "%3d|%39pI6c/%-3d|%d\n", t->table[i].id,
+			   &t->table[i].pfx, t->table[i].plen,
+			   lowpan_iphc_ctx_is_compression(&t->table[i]));
+	}
+	spin_unlock_bh(&t->lock);
+
+	return 0;
+}
+
+static int lowpan_context_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, lowpan_context_show, inode->i_private);
+}
+
+const struct file_operations lowpan_context_fops = {
+	.open		= lowpan_context_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 int lowpan_dev_debugfs_init(struct net_device *dev)
 {
 	struct lowpan_priv *lpriv = lowpan_priv(dev);
+	struct dentry *contexts, *dentry;
+	int ret, i;
 
 	/* creating the root */
 	lpriv->iface_debugfs = debugfs_create_dir(dev->name, lowpan_debugfs);
 	if (!lpriv->iface_debugfs)
 		goto fail;
 
+	contexts = debugfs_create_dir("contexts", lpriv->iface_debugfs);
+	if (!contexts)
+		goto remove_root;
+
+	dentry = debugfs_create_file("show", 0644, contexts,
+				     &lowpan_priv(dev)->ctx,
+				     &lowpan_context_fops);
+	if (!dentry)
+		goto remove_root;
+
+	for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) {
+		ret = lowpan_dev_debugfs_ctx_init(dev, contexts, i);
+		if (ret < 0)
+			goto remove_root;
+	}
+
 	return 0;
 
+remove_root:
+	lowpan_dev_debugfs_exit(dev);
 fail:
 	return -EINVAL;
 }
diff --git a/net/6lowpan/iphc.c b/net/6lowpan/iphc.c
index 346b5c1a9185..d2a565cde4f4 100644
--- a/net/6lowpan/iphc.c
+++ b/net/6lowpan/iphc.c
@@ -56,6 +56,7 @@
 /* special link-layer handling */
 #include <net/mac802154.h>
 
+#include "6lowpan_i.h"
 #include "nhc.h"
 
 /* Values of fields within the IPHC encoding first byte */
@@ -147,6 +148,9 @@
 	 (((a)->s6_addr16[6]) == 0) &&		\
 	 (((a)->s6_addr[14]) == 0))
 
+#define LOWPAN_IPHC_CID_DCI(cid)	(cid & 0x0f)
+#define LOWPAN_IPHC_CID_SCI(cid)	((cid & 0xf0) >> 4)
+
 static inline void iphc_uncompress_eui64_lladdr(struct in6_addr *ipaddr,
 						const void *lladdr)
 {
@@ -195,6 +199,98 @@ static inline void iphc_uncompress_802154_lladdr(struct in6_addr *ipaddr,
 	}
 }
 
+static struct lowpan_iphc_ctx *
+lowpan_iphc_ctx_get_by_id(const struct net_device *dev, u8 id)
+{
+	struct lowpan_iphc_ctx *ret = &lowpan_priv(dev)->ctx.table[id];
+
+	if (!lowpan_iphc_ctx_is_active(ret))
+		return NULL;
+
+	return ret;
+}
+
+static struct lowpan_iphc_ctx *
+lowpan_iphc_ctx_get_by_addr(const struct net_device *dev,
+			    const struct in6_addr *addr)
+{
+	struct lowpan_iphc_ctx *table = lowpan_priv(dev)->ctx.table;
+	struct lowpan_iphc_ctx *ret = NULL;
+	struct in6_addr addr_pfx;
+	u8 addr_plen;
+	int i;
+
+	for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) {
+		/* Check if context is valid. A context that is not valid
+		 * MUST NOT be used for compression.
+		 */
+		if (!lowpan_iphc_ctx_is_active(&table[i]) ||
+		    !lowpan_iphc_ctx_is_compression(&table[i]))
+			continue;
+
+		ipv6_addr_prefix(&addr_pfx, addr, table[i].plen);
+
+		/* if prefix len < 64, the remaining bits until 64th bit is
+		 * zero. Otherwise we use table[i]->plen.
+		 */
+		if (table[i].plen < 64)
+			addr_plen = 64;
+		else
+			addr_plen = table[i].plen;
+
+		if (ipv6_prefix_equal(&addr_pfx, &table[i].pfx, addr_plen)) {
+			/* remember first match */
+			if (!ret) {
+				ret = &table[i];
+				continue;
+			}
+
+			/* get the context with longest prefix len */
+			if (table[i].plen > ret->plen)
+				ret = &table[i];
+		}
+	}
+
+	return ret;
+}
+
+static struct lowpan_iphc_ctx *
+lowpan_iphc_ctx_get_by_mcast_addr(const struct net_device *dev,
+				  const struct in6_addr *addr)
+{
+	struct lowpan_iphc_ctx *table = lowpan_priv(dev)->ctx.table;
+	struct lowpan_iphc_ctx *ret = NULL;
+	struct in6_addr addr_mcast, network_pfx = {};
+	int i;
+
+	/* init mcast address with  */
+	memcpy(&addr_mcast, addr, sizeof(*addr));
+
+	for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) {
+		/* Check if context is valid. A context that is not valid
+		 * MUST NOT be used for compression.
+		 */
+		if (!lowpan_iphc_ctx_is_active(&table[i]) ||
+		    !lowpan_iphc_ctx_is_compression(&table[i]))
+			continue;
+
+		/* setting plen */
+		addr_mcast.s6_addr[3] = table[i].plen;
+		/* get network prefix to copy into multicast address */
+		ipv6_addr_prefix(&network_pfx, &table[i].pfx,
+				 table[i].plen);
+		/* setting network prefix */
+		memcpy(&addr_mcast.s6_addr[4], &network_pfx, 8);
+
+		if (ipv6_addr_equal(addr, &addr_mcast)) {
+			ret = &table[i];
+			break;
+		}
+	}
+
+	return ret;
+}
+
 /* Uncompress address function for source and
  * destination address(non-multicast).
  *
@@ -259,30 +355,59 @@ static int uncompress_addr(struct sk_buff *skb, const struct net_device *dev,
 /* Uncompress address function for source context
  * based address(non-multicast).
  */
-static int uncompress_context_based_src_addr(struct sk_buff *skb,
-					     struct in6_addr *ipaddr,
-					     u8 address_mode)
+static int uncompress_ctx_addr(struct sk_buff *skb,
+			       const struct net_device *dev,
+			       const struct lowpan_iphc_ctx *ctx,
+			       struct in6_addr *ipaddr, u8 address_mode,
+			       const void *lladdr)
 {
+	bool fail;
+
 	switch (address_mode) {
-	case LOWPAN_IPHC_SAM_00:
-		/* unspec address ::
+	/* SAM and DAM are the same here */
+	case LOWPAN_IPHC_DAM_00:
+		fail = false;
+		/* SAM_00 -> unspec address ::
 		 * Do nothing, address is already ::
+		 *
+		 * DAM 00 -> reserved should never occur.
 		 */
 		break;
 	case LOWPAN_IPHC_SAM_01:
-		/* TODO */
+	case LOWPAN_IPHC_DAM_01:
+		fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[8], 8);
+		ipv6_addr_prefix_copy(ipaddr, &ctx->pfx, ctx->plen);
+		break;
 	case LOWPAN_IPHC_SAM_10:
-		/* TODO */
+	case LOWPAN_IPHC_DAM_10:
+		ipaddr->s6_addr[11] = 0xFF;
+		ipaddr->s6_addr[12] = 0xFE;
+		fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[14], 2);
+		ipv6_addr_prefix_copy(ipaddr, &ctx->pfx, ctx->plen);
+		break;
 	case LOWPAN_IPHC_SAM_11:
-		/* TODO */
-		netdev_warn(skb->dev, "SAM value 0x%x not supported\n",
-			    address_mode);
-		return -EINVAL;
+	case LOWPAN_IPHC_DAM_11:
+		fail = false;
+		switch (lowpan_priv(dev)->lltype) {
+		case LOWPAN_LLTYPE_IEEE802154:
+			iphc_uncompress_802154_lladdr(ipaddr, lladdr);
+			break;
+		default:
+			iphc_uncompress_eui64_lladdr(ipaddr, lladdr);
+			break;
+		}
+		ipv6_addr_prefix_copy(ipaddr, &ctx->pfx, ctx->plen);
+		break;
 	default:
 		pr_debug("Invalid sam value: 0x%x\n", address_mode);
 		return -EINVAL;
 	}
 
+	if (fail) {
+		pr_debug("Failed to fetch skb data\n");
+		return -EIO;
+	}
+
 	raw_dump_inline(NULL,
 			"Reconstructed context based ipv6 src addr is",
 			ipaddr->s6_addr, 16);
@@ -346,6 +471,30 @@ static int lowpan_uncompress_multicast_daddr(struct sk_buff *skb,
 	return 0;
 }
 
+static int lowpan_uncompress_multicast_ctx_daddr(struct sk_buff *skb,
+						 struct lowpan_iphc_ctx *ctx,
+						 struct in6_addr *ipaddr,
+						 u8 address_mode)
+{
+	struct in6_addr network_pfx = {};
+	bool fail;
+
+	ipaddr->s6_addr[0] = 0xFF;
+	fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[1], 2);
+	fail |= lowpan_fetch_skb(skb, &ipaddr->s6_addr[12], 4);
+	if (fail < 0)
+		return -EIO;
+
+	/* take prefix_len and network prefix from the context */
+	ipaddr->s6_addr[3] = ctx->plen;
+	/* get network prefix to copy into multicast address */
+	ipv6_addr_prefix(&network_pfx, &ctx->pfx, ctx->plen);
+	/* setting network prefix */
+	memcpy(&ipaddr->s6_addr[4], &network_pfx, 8);
+
+	return 0;
+}
+
 /* get the ecn values from iphc tf format and set it to ipv6hdr */
 static inline void lowpan_iphc_tf_set_ecn(struct ipv6hdr *hdr, const u8 *tf)
 {
@@ -459,7 +608,8 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev,
 			     const void *daddr, const void *saddr)
 {
 	struct ipv6hdr hdr = {};
-	u8 iphc0, iphc1;
+	struct lowpan_iphc_ctx *ci;
+	u8 iphc0, iphc1, cid = 0;
 	int err;
 
 	raw_dump_table(__func__, "raw skb data dump uncompressed",
@@ -469,12 +619,14 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev,
 	    lowpan_fetch_skb(skb, &iphc1, sizeof(iphc1)))
 		return -EINVAL;
 
-	/* another if the CID flag is set */
-	if (iphc1 & LOWPAN_IPHC_CID)
-		return -ENOTSUPP;
-
 	hdr.version = 6;
 
+	/* default CID = 0, another if the CID flag is set */
+	if (iphc1 & LOWPAN_IPHC_CID) {
+		if (lowpan_fetch_skb(skb, &cid, sizeof(cid)))
+			return -EINVAL;
+	}
+
 	err = lowpan_iphc_tf_decompress(skb, &hdr,
 					iphc0 & LOWPAN_IPHC_TF_MASK);
 	if (err < 0)
@@ -500,10 +652,17 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev,
 	}
 
 	if (iphc1 & LOWPAN_IPHC_SAC) {
-		/* Source address context based uncompression */
+		spin_lock_bh(&lowpan_priv(dev)->ctx.lock);
+		ci = lowpan_iphc_ctx_get_by_id(dev, LOWPAN_IPHC_CID_SCI(cid));
+		if (!ci) {
+			spin_unlock_bh(&lowpan_priv(dev)->ctx.lock);
+			return -EINVAL;
+		}
+
 		pr_debug("SAC bit is set. Handle context based source address.\n");
-		err = uncompress_context_based_src_addr(skb, &hdr.saddr,
-							iphc1 & LOWPAN_IPHC_SAM_MASK);
+		err = uncompress_ctx_addr(skb, dev, ci, &hdr.saddr,
+					  iphc1 & LOWPAN_IPHC_SAM_MASK, saddr);
+		spin_unlock_bh(&lowpan_priv(dev)->ctx.lock);
 	} else {
 		/* Source address uncompression */
 		pr_debug("source address stateless compression\n");
@@ -515,27 +674,52 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev,
 	if (err)
 		return -EINVAL;
 
-	/* check for Multicast Compression */
-	if (iphc1 & LOWPAN_IPHC_M) {
-		if (iphc1 & LOWPAN_IPHC_DAC) {
-			pr_debug("dest: context-based mcast compression\n");
-			/* TODO: implement this */
-		} else {
-			err = lowpan_uncompress_multicast_daddr(skb, &hdr.daddr,
-								iphc1 & LOWPAN_IPHC_DAM_MASK);
+	switch (iphc1 & (LOWPAN_IPHC_M | LOWPAN_IPHC_DAC)) {
+	case LOWPAN_IPHC_M | LOWPAN_IPHC_DAC:
+		spin_lock_bh(&lowpan_priv(dev)->ctx.lock);
+		ci = lowpan_iphc_ctx_get_by_id(dev, LOWPAN_IPHC_CID_DCI(cid));
+		if (!ci) {
+			spin_unlock_bh(&lowpan_priv(dev)->ctx.lock);
+			return -EINVAL;
+		}
 
-			if (err)
-				return -EINVAL;
+		/* multicast with context */
+		pr_debug("dest: context-based mcast compression\n");
+		err = lowpan_uncompress_multicast_ctx_daddr(skb, ci,
+							    &hdr.daddr,
+							    iphc1 & LOWPAN_IPHC_DAM_MASK);
+		spin_unlock_bh(&lowpan_priv(dev)->ctx.lock);
+		break;
+	case LOWPAN_IPHC_M:
+		/* multicast */
+		err = lowpan_uncompress_multicast_daddr(skb, &hdr.daddr,
+							iphc1 & LOWPAN_IPHC_DAM_MASK);
+		break;
+	case LOWPAN_IPHC_DAC:
+		spin_lock_bh(&lowpan_priv(dev)->ctx.lock);
+		ci = lowpan_iphc_ctx_get_by_id(dev, LOWPAN_IPHC_CID_DCI(cid));
+		if (!ci) {
+			spin_unlock_bh(&lowpan_priv(dev)->ctx.lock);
+			return -EINVAL;
 		}
-	} else {
+
+		/* Destination address context based uncompression */
+		pr_debug("DAC bit is set. Handle context based destination address.\n");
+		err = uncompress_ctx_addr(skb, dev, ci, &hdr.daddr,
+					  iphc1 & LOWPAN_IPHC_DAM_MASK, daddr);
+		spin_unlock_bh(&lowpan_priv(dev)->ctx.lock);
+		break;
+	default:
 		err = uncompress_addr(skb, dev, &hdr.daddr,
 				      iphc1 & LOWPAN_IPHC_DAM_MASK, daddr);
 		pr_debug("dest: stateless compression mode %d dest %pI6c\n",
 			 iphc1 & LOWPAN_IPHC_DAM_MASK, &hdr.daddr);
-		if (err)
-			return -EINVAL;
+		break;
 	}
 
+	if (err)
+		return -EINVAL;
+
 	/* Next header data uncompression */
 	if (iphc0 & LOWPAN_IPHC_NH) {
 		err = lowpan_nhc_do_uncompression(skb, dev, &hdr);
@@ -585,6 +769,58 @@ static const u8 lowpan_iphc_dam_to_sam_value[] = {
 	[LOWPAN_IPHC_DAM_11] = LOWPAN_IPHC_SAM_11,
 };
 
+static u8 lowpan_compress_ctx_addr(u8 **hc_ptr, const struct in6_addr *ipaddr,
+				   const struct lowpan_iphc_ctx *ctx,
+				   const unsigned char *lladdr, bool sam)
+{
+	struct in6_addr tmp = {};
+	u8 dam;
+
+	/* check for SAM/DAM = 11 */
+	memcpy(&tmp.s6_addr[8], lladdr, 8);
+	/* second bit-flip (Universe/Local) is done according RFC2464 */
+	tmp.s6_addr[8] ^= 0x02;
+	/* context information are always used */
+	ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen);
+	if (ipv6_addr_equal(&tmp, ipaddr)) {
+		dam = LOWPAN_IPHC_DAM_11;
+		goto out;
+	}
+
+	memset(&tmp, 0, sizeof(tmp));
+	/* check for SAM/DAM = 01 */
+	tmp.s6_addr[11] = 0xFF;
+	tmp.s6_addr[12] = 0xFE;
+	memcpy(&tmp.s6_addr[14], &ipaddr->s6_addr[14], 2);
+	/* context information are always used */
+	ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen);
+	if (ipv6_addr_equal(&tmp, ipaddr)) {
+		lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr[14], 2);
+		dam = LOWPAN_IPHC_DAM_10;
+		goto out;
+	}
+
+	memset(&tmp, 0, sizeof(tmp));
+	/* check for SAM/DAM = 10, should always match */
+	memcpy(&tmp.s6_addr[8], &ipaddr->s6_addr[8], 8);
+	/* context information are always used */
+	ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen);
+	if (ipv6_addr_equal(&tmp, ipaddr)) {
+		lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr[8], 8);
+		dam = LOWPAN_IPHC_DAM_01;
+		goto out;
+	}
+
+	WARN_ON_ONCE("context found but no address mode matched\n");
+	return -EINVAL;
+out:
+
+	if (sam)
+		return lowpan_iphc_dam_to_sam_value[dam];
+	else
+		return dam;
+}
+
 static u8 lowpan_compress_addr_64(u8 **hc_ptr, const struct in6_addr *ipaddr,
 				  const unsigned char *lladdr, bool sam)
 {
@@ -708,6 +944,21 @@ static u8 lowpan_iphc_tf_compress(u8 **hc_ptr, const struct ipv6hdr *hdr)
 	return val;
 }
 
+static u8 lowpan_iphc_mcast_ctx_addr_compress(u8 **hc_ptr,
+					      const struct lowpan_iphc_ctx *ctx,
+					      const struct in6_addr *ipaddr)
+{
+	u8 data[6];
+
+	/* flags/scope, reserved (RIID) */
+	memcpy(data, &ipaddr->s6_addr[1], 2);
+	/* group ID */
+	memcpy(&data[1], &ipaddr->s6_addr[11], 4);
+	lowpan_push_hc_data(hc_ptr, data, 6);
+
+	return LOWPAN_IPHC_DAM_00;
+}
+
 static u8 lowpan_iphc_mcast_addr_compress(u8 **hc_ptr,
 					  const struct in6_addr *ipaddr)
 {
@@ -742,10 +993,11 @@ static u8 lowpan_iphc_mcast_addr_compress(u8 **hc_ptr,
 int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev,
 			   const void *daddr, const void *saddr)
 {
-	u8 iphc0, iphc1, *hc_ptr;
+	u8 iphc0, iphc1, *hc_ptr, cid = 0;
 	struct ipv6hdr *hdr;
 	u8 head[LOWPAN_IPHC_MAX_HC_BUF_LEN] = {};
-	int ret, addr_type;
+	struct lowpan_iphc_ctx *dci, *sci, dci_entry, sci_entry;
+	int ret, ipv6_daddr_type, ipv6_saddr_type;
 
 	if (skb->protocol != htons(ETH_P_IPV6))
 		return -EINVAL;
@@ -769,14 +1021,38 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev,
 	iphc0 = LOWPAN_DISPATCH_IPHC;
 	iphc1 = 0;
 
-	/* TODO: context lookup */
-
 	raw_dump_inline(__func__, "saddr", saddr, EUI64_ADDR_LEN);
 	raw_dump_inline(__func__, "daddr", daddr, EUI64_ADDR_LEN);
 
 	raw_dump_table(__func__, "sending raw skb network uncompressed packet",
 		       skb->data, skb->len);
 
+	ipv6_daddr_type = ipv6_addr_type(&hdr->daddr);
+	spin_lock_bh(&lowpan_priv(dev)->ctx.lock);
+	if (ipv6_daddr_type & IPV6_ADDR_MULTICAST)
+		dci = lowpan_iphc_ctx_get_by_mcast_addr(dev, &hdr->daddr);
+	else
+		dci = lowpan_iphc_ctx_get_by_addr(dev, &hdr->daddr);
+	if (dci) {
+		memcpy(&dci_entry, dci, sizeof(*dci));
+		cid |= dci->id;
+	}
+	spin_unlock_bh(&lowpan_priv(dev)->ctx.lock);
+
+	spin_lock_bh(&lowpan_priv(dev)->ctx.lock);
+	sci = lowpan_iphc_ctx_get_by_addr(dev, &hdr->saddr);
+	if (sci) {
+		memcpy(&sci_entry, sci, sizeof(*sci));
+		cid |= (sci->id << 4);
+	}
+	spin_unlock_bh(&lowpan_priv(dev)->ctx.lock);
+
+	/* if cid is zero it will be compressed */
+	if (cid) {
+		iphc1 |= LOWPAN_IPHC_CID;
+		lowpan_push_hc_data(&hc_ptr, &cid, sizeof(cid));
+	}
+
 	/* Traffic Class, Flow Label compression */
 	iphc0 |= lowpan_iphc_tf_compress(&hc_ptr, hdr);
 
@@ -813,39 +1089,63 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev,
 				    sizeof(hdr->hop_limit));
 	}
 
-	addr_type = ipv6_addr_type(&hdr->saddr);
+	ipv6_saddr_type = ipv6_addr_type(&hdr->saddr);
 	/* source address compression */
-	if (addr_type == IPV6_ADDR_ANY) {
+	if (ipv6_saddr_type == IPV6_ADDR_ANY) {
 		pr_debug("source address is unspecified, setting SAC\n");
 		iphc1 |= LOWPAN_IPHC_SAC;
 	} else {
-		if (addr_type & IPV6_ADDR_LINKLOCAL) {
-			iphc1 |= lowpan_compress_addr_64(&hc_ptr, &hdr->saddr,
-							 saddr, true);
-			pr_debug("source address unicast link-local %pI6c iphc1 0x%02x\n",
-				 &hdr->saddr, iphc1);
+		if (sci) {
+			iphc1 |= lowpan_compress_ctx_addr(&hc_ptr, &hdr->saddr,
+							  &sci_entry, saddr,
+							  true);
+			iphc1 |= LOWPAN_IPHC_SAC;
 		} else {
-			pr_debug("send the full source address\n");
-			lowpan_push_hc_data(&hc_ptr, hdr->saddr.s6_addr, 16);
+			if (ipv6_saddr_type & IPV6_ADDR_LINKLOCAL) {
+				iphc1 |= lowpan_compress_addr_64(&hc_ptr,
+								 &hdr->saddr,
+								 saddr, true);
+				pr_debug("source address unicast link-local %pI6c iphc1 0x%02x\n",
+					 &hdr->saddr, iphc1);
+			} else {
+				pr_debug("send the full source address\n");
+				lowpan_push_hc_data(&hc_ptr,
+						    hdr->saddr.s6_addr, 16);
+			}
 		}
 	}
 
-	addr_type = ipv6_addr_type(&hdr->daddr);
 	/* destination address compression */
-	if (addr_type & IPV6_ADDR_MULTICAST) {
+	if (ipv6_daddr_type & IPV6_ADDR_MULTICAST) {
 		pr_debug("destination address is multicast: ");
-		iphc1 |= LOWPAN_IPHC_M;
-		iphc1 |= lowpan_iphc_mcast_addr_compress(&hc_ptr, &hdr->daddr);
+		if (dci) {
+			iphc1 |= lowpan_iphc_mcast_ctx_addr_compress(&hc_ptr,
+								     &dci_entry,
+								     &hdr->daddr);
+		} else {
+			iphc1 |= LOWPAN_IPHC_M;
+			iphc1 |= lowpan_iphc_mcast_addr_compress(&hc_ptr,
+								 &hdr->daddr);
+		}
 	} else {
-		if (addr_type & IPV6_ADDR_LINKLOCAL) {
-			/* TODO: context lookup */
-			iphc1 |= lowpan_compress_addr_64(&hc_ptr, &hdr->daddr,
-							 daddr, false);
-			pr_debug("dest address unicast link-local %pI6c "
-				 "iphc1 0x%02x\n", &hdr->daddr, iphc1);
+		if (dci) {
+			iphc1 |= lowpan_compress_ctx_addr(&hc_ptr, &hdr->daddr,
+							  &dci_entry, daddr,
+							  false);
+			iphc1 |= LOWPAN_IPHC_DAC;
 		} else {
-			pr_debug("dest address unicast %pI6c\n", &hdr->daddr);
-			lowpan_push_hc_data(&hc_ptr, hdr->daddr.s6_addr, 16);
+			if (ipv6_daddr_type & IPV6_ADDR_LINKLOCAL) {
+				iphc1 |= lowpan_compress_addr_64(&hc_ptr,
+								 &hdr->daddr,
+								 daddr, false);
+				pr_debug("dest address unicast link-local %pI6c iphc1 0x%02x\n",
+					 &hdr->daddr, iphc1);
+			} else {
+				pr_debug("dest address unicast %pI6c\n",
+					 &hdr->daddr);
+				lowpan_push_hc_data(&hc_ptr,
+						    hdr->daddr.s6_addr, 16);
+			}
 		}
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From ebba380cc94689e372387aa7f63c063a3663e846 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aar@pengutronix.de>
Date: Mon, 22 Feb 2016 09:13:55 +0100
Subject: ieee802154: 6lowpan: fix return of netdev notifier

This patch fixed the return value of netdev notifier. If the command is
a don't care a NOTIFY_DONE should be returned. If the command matched a
NOTIFY_OK should be returned.

Reviewed-by: Stefan Schmidt <stefan@osg.samsung.com>
Signed-off-by: Alexander Aring <aar@pengutronix.de>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 net/ieee802154/6lowpan/core.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c
index 737c87a2a41e..0023c9048812 100644
--- a/net/ieee802154/6lowpan/core.c
+++ b/net/ieee802154/6lowpan/core.c
@@ -207,7 +207,7 @@ static int lowpan_device_event(struct notifier_block *unused,
 	struct net_device *wdev = netdev_notifier_info_to_dev(ptr);
 
 	if (wdev->type != ARPHRD_IEEE802154)
-		goto out;
+		return NOTIFY_DONE;
 
 	switch (event) {
 	case NETDEV_UNREGISTER:
@@ -219,11 +219,10 @@ static int lowpan_device_event(struct notifier_block *unused,
 			lowpan_dellink(wdev->ieee802154_ptr->lowpan_dev, NULL);
 		break;
 	default:
-		break;
+		return NOTIFY_DONE;
 	}
 
-out:
-	return NOTIFY_DONE;
+	return NOTIFY_OK;
 }
 
 static struct notifier_block lowpan_dev_notifier = {
-- 
cgit v1.2.3-59-g8ed1b


From a6692754d61a6b3735803783f394880805675f99 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Fri, 12 Feb 2016 12:09:39 -0500
Subject: net: dsa: pass bridge down to drivers

Some DSA drivers may or may not support multiple software bridges on top
of an hardware switch.

It is more convenient for them to access the bridge's net_device for
finer configuration.

Removing the need to craft and access a bitmask also simplifies the
code.

This patch changes the signature of bridge related functions, update DSA
drivers, and removes dsa_slave_br_port_mask.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Tested-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/dsa/dsa.txt |  7 ++-----
 drivers/net/dsa/bcm_sf2.c            | 12 +++++++-----
 drivers/net/dsa/bcm_sf2.h            |  2 ++
 drivers/net/dsa/mv88e6xxx.c          | 13 +++++++++++--
 drivers/net/dsa/mv88e6xxx.h          |  6 ++++--
 include/net/dsa.h                    |  5 ++---
 net/dsa/slave.c                      | 31 ++-----------------------------
 7 files changed, 30 insertions(+), 46 deletions(-)

(limited to 'net')

diff --git a/Documentation/networking/dsa/dsa.txt b/Documentation/networking/dsa/dsa.txt
index aa9c1f9313cd..ebf21530471f 100644
--- a/Documentation/networking/dsa/dsa.txt
+++ b/Documentation/networking/dsa/dsa.txt
@@ -524,17 +524,14 @@ Bridge layer
 - port_join_bridge: bridge layer function invoked when a given switch port is
   added to a bridge, this function should be doing the necessary at the switch
   level to permit the joining port from being added to the relevant logical
-  domain for it to ingress/egress traffic with other members of the bridge. DSA
-  does nothing but calculate a bitmask of switch ports currently members of the
-  specified bridge being requested the join
+  domain for it to ingress/egress traffic with other members of the bridge.
 
 - port_leave_bridge: bridge layer function invoked when a given switch port is
   removed from a bridge, this function should be doing the necessary at the
   switch level to deny the leaving port from ingress/egress traffic from the
   remaining bridge members. When the port leaves the bridge, it should be aged
   out at the switch hardware for the switch to (re) learn MAC addresses behind
-  this port. DSA calculates the bitmask of ports still members of the bridge
-  being left
+  this port.
 
 - port_stp_update: bridge layer function invoked when a given switch port STP
   state is computed by the bridge layer and should be propagated to switch
diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 6f946fedbb77..3f627598f277 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -483,16 +483,17 @@ static int bcm_sf2_sw_fast_age_port(struct dsa_switch  *ds, int port)
 }
 
 static int bcm_sf2_sw_br_join(struct dsa_switch *ds, int port,
-			      u32 br_port_mask)
+			      struct net_device *bridge)
 {
 	struct bcm_sf2_priv *priv = ds_to_priv(ds);
 	unsigned int i;
 	u32 reg, p_ctl;
 
+	priv->port_sts[port].bridge_dev = bridge;
 	p_ctl = core_readl(priv, CORE_PORT_VLAN_CTL_PORT(port));
 
 	for (i = 0; i < priv->hw_params.num_ports; i++) {
-		if (!((1 << i) & br_port_mask))
+		if (priv->port_sts[i].bridge_dev != bridge)
 			continue;
 
 		/* Add this local port to the remote port VLAN control
@@ -515,10 +516,10 @@ static int bcm_sf2_sw_br_join(struct dsa_switch *ds, int port,
 	return 0;
 }
 
-static int bcm_sf2_sw_br_leave(struct dsa_switch *ds, int port,
-			       u32 br_port_mask)
+static int bcm_sf2_sw_br_leave(struct dsa_switch *ds, int port)
 {
 	struct bcm_sf2_priv *priv = ds_to_priv(ds);
+	struct net_device *bridge = priv->port_sts[port].bridge_dev;
 	unsigned int i;
 	u32 reg, p_ctl;
 
@@ -526,7 +527,7 @@ static int bcm_sf2_sw_br_leave(struct dsa_switch *ds, int port,
 
 	for (i = 0; i < priv->hw_params.num_ports; i++) {
 		/* Don't touch the remaining ports */
-		if (!((1 << i) & br_port_mask))
+		if (priv->port_sts[i].bridge_dev != bridge)
 			continue;
 
 		reg = core_readl(priv, CORE_PORT_VLAN_CTL_PORT(i));
@@ -541,6 +542,7 @@ static int bcm_sf2_sw_br_leave(struct dsa_switch *ds, int port,
 
 	core_writel(priv, p_ctl, CORE_PORT_VLAN_CTL_PORT(port));
 	priv->port_sts[port].vlan_ctl_mask = p_ctl;
+	priv->port_sts[port].bridge_dev = NULL;
 
 	return 0;
 }
diff --git a/drivers/net/dsa/bcm_sf2.h b/drivers/net/dsa/bcm_sf2.h
index 6bba1c98d764..200b1f5fdb56 100644
--- a/drivers/net/dsa/bcm_sf2.h
+++ b/drivers/net/dsa/bcm_sf2.h
@@ -50,6 +50,8 @@ struct bcm_sf2_port_status {
 	struct ethtool_eee eee;
 
 	u32 vlan_ctl_mask;
+
+	struct net_device *bridge_dev;
 };
 
 struct bcm_sf2_arl_entry {
diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
index b0e00edb302e..2e515e8a95fe 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx.c
@@ -1889,13 +1889,22 @@ unlock:
 	return err;
 }
 
-int mv88e6xxx_port_bridge_join(struct dsa_switch *ds, int port, u32 members)
+int mv88e6xxx_port_bridge_join(struct dsa_switch *ds, int port,
+			       struct net_device *bridge)
 {
+	struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
+
+	ps->ports[port].bridge_dev = bridge;
+
 	return 0;
 }
 
-int mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port, u32 members)
+int mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port)
 {
+	struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
+
+	ps->ports[port].bridge_dev = NULL;
+
 	return 0;
 }
 
diff --git a/drivers/net/dsa/mv88e6xxx.h b/drivers/net/dsa/mv88e6xxx.h
index 63a6f587e9e8..260b4918e427 100644
--- a/drivers/net/dsa/mv88e6xxx.h
+++ b/drivers/net/dsa/mv88e6xxx.h
@@ -380,6 +380,7 @@ struct mv88e6xxx_vtu_stu_entry {
 };
 
 struct mv88e6xxx_priv_port {
+	struct net_device *bridge_dev;
 	u8 state;
 };
 
@@ -481,8 +482,9 @@ int mv88e6xxx_phy_write_indirect(struct dsa_switch *ds, int addr, int regnum,
 int mv88e6xxx_get_eee(struct dsa_switch *ds, int port, struct ethtool_eee *e);
 int mv88e6xxx_set_eee(struct dsa_switch *ds, int port,
 		      struct phy_device *phydev, struct ethtool_eee *e);
-int mv88e6xxx_port_bridge_join(struct dsa_switch *ds, int port, u32 members);
-int mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port, u32 members);
+int mv88e6xxx_port_bridge_join(struct dsa_switch *ds, int port,
+			       struct net_device *bridge);
+int mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port);
 int mv88e6xxx_port_stp_update(struct dsa_switch *ds, int port, u8 state);
 int mv88e6xxx_port_vlan_prepare(struct dsa_switch *ds, int port,
 				const struct switchdev_obj_port_vlan *vlan,
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 26a0e86e611e..1c845d7bf0b2 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -297,9 +297,8 @@ struct dsa_switch_driver {
 	 * Bridge integration
 	 */
 	int	(*port_join_bridge)(struct dsa_switch *ds, int port,
-				    u32 br_port_mask);
-	int	(*port_leave_bridge)(struct dsa_switch *ds, int port,
-				     u32 br_port_mask);
+				    struct net_device *bridge);
+	int	(*port_leave_bridge)(struct dsa_switch *ds, int port);
 	int	(*port_stp_update)(struct dsa_switch *ds, int port,
 				   u8 state);
 
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index ab24521beb4d..ab515df5f493 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -385,31 +385,6 @@ static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	return -EOPNOTSUPP;
 }
 
-/* Return a bitmask of all ports being currently bridged within a given bridge
- * device. Note that on leave, the mask will still return the bitmask of ports
- * currently bridged, prior to port removal, and this is exactly what we want.
- */
-static u32 dsa_slave_br_port_mask(struct dsa_switch *ds,
-				  struct net_device *bridge)
-{
-	struct dsa_slave_priv *p;
-	unsigned int port;
-	u32 mask = 0;
-
-	for (port = 0; port < DSA_MAX_PORTS; port++) {
-		if (!dsa_is_port_initialized(ds, port))
-			continue;
-
-		p = netdev_priv(ds->ports[port]);
-
-		if (ds->ports[port]->priv_flags & IFF_BRIDGE_PORT &&
-		    p->bridge_dev == bridge)
-			mask |= 1 << port;
-	}
-
-	return mask;
-}
-
 static int dsa_slave_stp_update(struct net_device *dev, u8 state)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
@@ -533,8 +508,7 @@ static int dsa_slave_bridge_port_join(struct net_device *dev,
 	p->bridge_dev = br;
 
 	if (ds->drv->port_join_bridge)
-		ret = ds->drv->port_join_bridge(ds, p->port,
-						dsa_slave_br_port_mask(ds, br));
+		ret = ds->drv->port_join_bridge(ds, p->port, br);
 
 	return ret;
 }
@@ -547,8 +521,7 @@ static int dsa_slave_bridge_port_leave(struct net_device *dev)
 
 
 	if (ds->drv->port_leave_bridge)
-		ret = ds->drv->port_leave_bridge(ds, p->port,
-						 dsa_slave_br_port_mask(ds, p->bridge_dev));
+		ret = ds->drv->port_leave_bridge(ds, p->port);
 
 	p->bridge_dev = NULL;
 
-- 
cgit v1.2.3-59-g8ed1b


From 9d2dd736698e6f1dc176b9669b6390ddcd2063b1 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Fri, 12 Feb 2016 12:09:41 -0500
Subject: net: dsa: remove dsa_bridge_check_vlan_range

DSA drivers may support multiple bridge groups with the same hardware
VLAN. The mv88e6xxx driver which cannot yet, already has its own check
for overlapping bridges. Thus remove the check from the DSA layer.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/slave.c | 50 --------------------------------------------------
 1 file changed, 50 deletions(-)

(limited to 'net')

diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index ab515df5f493..14ca9784ec0c 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -201,47 +201,6 @@ out:
 	return 0;
 }
 
-static int dsa_bridge_check_vlan_range(struct dsa_switch *ds,
-				       const struct net_device *bridge,
-				       u16 vid_begin, u16 vid_end)
-{
-	struct dsa_slave_priv *p;
-	struct net_device *dev, *vlan_br;
-	DECLARE_BITMAP(members, DSA_MAX_PORTS);
-	DECLARE_BITMAP(untagged, DSA_MAX_PORTS);
-	u16 vid;
-	int member, err;
-
-	if (!ds->drv->vlan_getnext || !vid_begin)
-		return -EOPNOTSUPP;
-
-	vid = vid_begin - 1;
-
-	do {
-		err = ds->drv->vlan_getnext(ds, &vid, members, untagged);
-		if (err)
-			break;
-
-		if (vid > vid_end)
-			break;
-
-		member = find_first_bit(members, DSA_MAX_PORTS);
-		if (member == DSA_MAX_PORTS)
-			continue;
-
-		dev = ds->ports[member];
-		p = netdev_priv(dev);
-		vlan_br = p->bridge_dev;
-		if (vlan_br == bridge)
-			continue;
-
-		netdev_dbg(vlan_br, "hardware VLAN %d already in use\n", vid);
-		return -EOPNOTSUPP;
-	} while (vid < vid_end);
-
-	return err == -ENOENT ? 0 : err;
-}
-
 static int dsa_slave_port_vlan_add(struct net_device *dev,
 				   const struct switchdev_obj_port_vlan *vlan,
 				   struct switchdev_trans *trans)
@@ -254,15 +213,6 @@ static int dsa_slave_port_vlan_add(struct net_device *dev,
 		if (!ds->drv->port_vlan_prepare || !ds->drv->port_vlan_add)
 			return -EOPNOTSUPP;
 
-		/* If the requested port doesn't belong to the same bridge as
-		 * the VLAN members, fallback to software VLAN (hopefully).
-		 */
-		err = dsa_bridge_check_vlan_range(ds, p->bridge_dev,
-						  vlan->vid_begin,
-						  vlan->vid_end);
-		if (err)
-			return err;
-
 		err = ds->drv->port_vlan_prepare(ds, p->port, vlan, trans);
 		if (err)
 			return err;
-- 
cgit v1.2.3-59-g8ed1b


From 412a6d800c7380c1b87c11080c7da905c27cfea8 Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Tue, 8 Dec 2015 19:09:05 +0200
Subject: mac80211: support hw managing reorder logic

Enable driver to manage the reordering logic itself.
This is needed for example for the iwlwifi driver that
will support hardware assisted reordering.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h  |  6 ++++++
 net/mac80211/agg-rx.c   | 24 ++++++++++++++++++++++--
 net/mac80211/debugfs.c  |  1 +
 net/mac80211/sta_info.h | 21 ++++++++++++---------
 4 files changed, 41 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 6c9c559394b0..ee6305a52251 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1929,6 +1929,11 @@ struct ieee80211_txq {
  *	by just its MAC address; this prevents, for example, the same station
  *	from connecting to two virtual AP interfaces at the same time.
  *
+ * @IEEE80211_HW_SUPPORTS_REORDERING_BUFFER: Hardware (or driver) manages the
+ *	reordering buffer internally, guaranteeing mac80211 receives frames in
+ *	order and does not need to manage its own reorder buffer or BA session
+ *	timeout.
+ *
  * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays
  */
 enum ieee80211_hw_flags {
@@ -1965,6 +1970,7 @@ enum ieee80211_hw_flags {
 	IEEE80211_HW_SUPPORTS_AMSDU_IN_AMPDU,
 	IEEE80211_HW_BEACON_TX_STATUS,
 	IEEE80211_HW_NEEDS_UNIQUE_STA_ADDR,
+	IEEE80211_HW_SUPPORTS_REORDERING_BUFFER,
 
 	/* keep last, obviously */
 	NUM_IEEE80211_HW_FLAGS
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index ec80db7c955c..2ab54791281d 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -76,10 +76,11 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
 	tid_rx = rcu_dereference_protected(sta->ampdu_mlme.tid_rx[tid],
 					lockdep_is_held(&sta->ampdu_mlme.mtx));
 
-	if (!tid_rx)
+	if (!test_bit(tid, sta->ampdu_mlme.agg_session_valid))
 		return;
 
 	RCU_INIT_POINTER(sta->ampdu_mlme.tid_rx[tid], NULL);
+	__clear_bit(tid, sta->ampdu_mlme.agg_session_valid);
 
 	ht_dbg(sta->sdata,
 	       "Rx BA session stop requested for %pM tid %u %s reason: %d\n",
@@ -97,6 +98,13 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
 		ieee80211_send_delba(sta->sdata, sta->sta.addr,
 				     tid, WLAN_BACK_RECIPIENT, reason);
 
+	/*
+	 * return here in case tid_rx is not assigned - which will happen if
+	 * IEEE80211_HW_SUPPORTS_REORDERING_BUFFER is set.
+	 */
+	if (!tid_rx)
+		return;
+
 	del_timer_sync(&tid_rx->session_timer);
 
 	/* make sure ieee80211_sta_reorder_release() doesn't re-arm the timer */
@@ -297,7 +305,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
 	/* examine state machine */
 	mutex_lock(&sta->ampdu_mlme.mtx);
 
-	if (sta->ampdu_mlme.tid_rx[tid]) {
+	if (test_bit(tid, sta->ampdu_mlme.agg_session_valid)) {
 		ht_dbg_ratelimited(sta->sdata,
 				   "unexpected AddBA Req from %pM on tid %u\n",
 				   sta->sta.addr, tid);
@@ -308,6 +316,16 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
 						false);
 	}
 
+	if (ieee80211_hw_check(&local->hw, SUPPORTS_REORDERING_BUFFER)) {
+		ret = drv_ampdu_action(local, sta->sdata, &params);
+		ht_dbg(sta->sdata,
+		       "Rx A-MPDU request on %pM tid %d result %d\n",
+		       sta->sta.addr, tid, ret);
+		if (!ret)
+			status = WLAN_STATUS_SUCCESS;
+		goto end;
+	}
+
 	/* prepare A-MPDU MLME for Rx aggregation */
 	tid_agg_rx = kmalloc(sizeof(struct tid_ampdu_rx), GFP_KERNEL);
 	if (!tid_agg_rx)
@@ -369,6 +387,8 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
 	}
 
 end:
+	if (status == WLAN_STATUS_SUCCESS)
+		__set_bit(tid, sta->ampdu_mlme.agg_session_valid);
 	mutex_unlock(&sta->ampdu_mlme.mtx);
 
 end_no_lock:
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index abbdff03ce92..e433d0c97e86 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -126,6 +126,7 @@ static const char *hw_flag_names[NUM_IEEE80211_HW_FLAGS + 1] = {
 	FLAG(SUPPORTS_AMSDU_IN_AMPDU),
 	FLAG(BEACON_TX_STATUS),
 	FLAG(NEEDS_UNIQUE_STA_ADDR),
+	FLAG(SUPPORTS_REORDERING_BUFFER),
 
 	/* keep last for the build bug below */
 	(void *)0x1
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index d6051629ed15..f4d38994ecee 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -1,6 +1,7 @@
 /*
  * Copyright 2002-2005, Devicescape Software, Inc.
  * Copyright 2013-2014  Intel Mobile Communications GmbH
+ * Copyright(c) 2015 Intel Deutschland GmbH
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -212,20 +213,21 @@ struct tid_ampdu_rx {
 /**
  * struct sta_ampdu_mlme - STA aggregation information.
  *
+ * @mtx: mutex to protect all TX data (except non-NULL assignments
+ *	to tid_tx[idx], which are protected by the sta spinlock)
+ *	tid_start_tx is also protected by sta->lock.
  * @tid_rx: aggregation info for Rx per TID -- RCU protected
- * @tid_tx: aggregation info for Tx per TID
- * @tid_start_tx: sessions where start was requested
- * @addba_req_num: number of times addBA request has been sent.
- * @last_addba_req_time: timestamp of the last addBA request.
- * @dialog_token_allocator: dialog token enumerator for each new session;
- * @work: work struct for starting/stopping aggregation
  * @tid_rx_timer_expired: bitmap indicating on which TIDs the
  *	RX timer expired until the work for it runs
  * @tid_rx_stop_requested:  bitmap indicating which BA sessions per TID the
  *	driver requested to close until the work for it runs
- * @mtx: mutex to protect all TX data (except non-NULL assignments
- *	to tid_tx[idx], which are protected by the sta spinlock)
- *	tid_start_tx is also protected by sta->lock.
+ * @agg_session_valid: bitmap indicating which TID has a rx BA session open on
+ * @work: work struct for starting/stopping aggregation
+ * @tid_tx: aggregation info for Tx per TID
+ * @tid_start_tx: sessions where start was requested
+ * @last_addba_req_time: timestamp of the last addBA request.
+ * @addba_req_num: number of times addBA request has been sent.
+ * @dialog_token_allocator: dialog token enumerator for each new session;
  */
 struct sta_ampdu_mlme {
 	struct mutex mtx;
@@ -233,6 +235,7 @@ struct sta_ampdu_mlme {
 	struct tid_ampdu_rx __rcu *tid_rx[IEEE80211_NUM_TIDS];
 	unsigned long tid_rx_timer_expired[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
 	unsigned long tid_rx_stop_requested[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
+	unsigned long agg_session_valid[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
 	/* tx */
 	struct work_struct work;
 	struct tid_ampdu_tx __rcu *tid_tx[IEEE80211_NUM_TIDS];
-- 
cgit v1.2.3-59-g8ed1b


From 178830481eee5eea147a1c8fab67a96e09d80345 Mon Sep 17 00:00:00 2001
From: Grzegorz Bajorski <grzegorz.bajorski@tieto.com>
Date: Fri, 11 Dec 2015 14:39:46 +0100
Subject: mac80211: allow drivers to report (non-)monitor frames

Some drivers offload some frames internally (e.g.
AddBa). Reporting such frames to mac80211 would
only confuse MLME. However it would be useful to
be able to pass such frames to monitor interfaces
for sniffing purposes, e.g. when running AP +
monitor.

To do that allow drivers to tell mac80211 whether
a given frame should be:
 - processed but not delivered to any monitor vif
 - not processed but delievered to monitor vifs
   only

Signed-off-by: Grzegorz Bajorski <grzegorz.bajorski@tieto.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 11 +++++++++++
 net/mac80211/rx.c      |  5 +++--
 2 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index ee6305a52251..5910085af9e6 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1031,6 +1031,14 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info)
  * @RX_FLAG_AMPDU_DELIM_CRC_KNOWN: The delimiter CRC field is known (the CRC
  *	is stored in the @ampdu_delimiter_crc field)
  * @RX_FLAG_LDPC: LDPC was used
+ * @RX_FLAG_ONLY_MONITOR: Report frame only to monitor interfaces without
+ *	processing it in any regular way.
+ *	This is useful if drivers offload some frames but still want to report
+ *	them for sniffing purposes.
+ * @RX_FLAG_SKIP_MONITOR: Process and report frame to all interfaces except
+ *	monitor interfaces.
+ *	This is useful if drivers offload some frames but still want to report
+ *	them for sniffing purposes.
  * @RX_FLAG_STBC_MASK: STBC 2 bit bitmask. 1 - Nss=1, 2 - Nss=2, 3 - Nss=3
  * @RX_FLAG_10MHZ: 10 MHz (half channel) was used
  * @RX_FLAG_5MHZ: 5 MHz (quarter channel) was used
@@ -1071,6 +1079,8 @@ enum mac80211_rx_flags {
 	RX_FLAG_MACTIME_END		= BIT(21),
 	RX_FLAG_VHT			= BIT(22),
 	RX_FLAG_LDPC			= BIT(23),
+	RX_FLAG_ONLY_MONITOR		= BIT(24),
+	RX_FLAG_SKIP_MONITOR		= BIT(25),
 	RX_FLAG_STBC_MASK		= BIT(26) | BIT(27),
 	RX_FLAG_10MHZ			= BIT(28),
 	RX_FLAG_5MHZ			= BIT(29),
@@ -1089,6 +1099,7 @@ enum mac80211_rx_flags {
  * @RX_VHT_FLAG_160MHZ: 160 MHz was used
  * @RX_VHT_FLAG_BF: packet was beamformed
  */
+
 enum mac80211_rx_vht_flags {
 	RX_VHT_FLAG_80MHZ		= BIT(0),
 	RX_VHT_FLAG_160MHZ		= BIT(1),
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index fe675d76f29c..ae993edfdecf 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -122,7 +122,8 @@ static inline bool should_drop_frame(struct sk_buff *skb, int present_fcs_len,
 	hdr = (void *)(skb->data + rtap_vendor_space);
 
 	if (status->flag & (RX_FLAG_FAILED_FCS_CRC |
-			    RX_FLAG_FAILED_PLCP_CRC))
+			    RX_FLAG_FAILED_PLCP_CRC |
+			    RX_FLAG_ONLY_MONITOR))
 		return true;
 
 	if (unlikely(skb->len < 16 + present_fcs_len + rtap_vendor_space))
@@ -507,7 +508,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
 		return NULL;
 	}
 
-	if (!local->monitors) {
+	if (!local->monitors || (status->flag & RX_FLAG_SKIP_MONITOR)) {
 		if (should_drop_frame(origskb, present_fcs_len,
 				      rtap_vendor_space)) {
 			dev_kfree_skb(origskb);
-- 
cgit v1.2.3-59-g8ed1b


From a7201a6c5ea0157e96d17240d185a538bfdb7038 Mon Sep 17 00:00:00 2001
From: Ilan Peer <ilan.peer@intel.com>
Date: Sun, 13 Dec 2015 13:41:43 +0200
Subject: mac80211: Recalc min chandef when station is associated

The minimum chandef bandwidth calculation was done only in case
a new station was inserted (or when an existing station was removed).
However, it is possible that stations are inserted before they are
associated, e.g., when FULL_AP_CLIENT_STATE is supported and user
space adds stations unassociated.

Fix this by calling ieee80211_recalc_min_chandef() whenever
a station transitions in/out the associated state, and only
consider station marked as associated.

Signed-off-by: Ilan Peer <ilan.peer@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/chan.c     | 2 +-
 net/mac80211/sta_info.c | 9 +++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index 1d1b9b7bdefe..283981108ca8 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -231,7 +231,7 @@ ieee80211_get_max_required_bw(struct ieee80211_sub_if_data *sdata)
 		    !(sta->sdata->bss && sta->sdata->bss == sdata->bss))
 			continue;
 
-		if (!sta->uploaded)
+		if (!sta->uploaded || !test_sta_flag(sta, WLAN_STA_ASSOC))
 			continue;
 
 		max_bw = max(max_bw, ieee80211_get_sta_bw(&sta->sta));
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 4402ad5b27d1..7e007cf12cb2 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -530,7 +530,6 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU)
 	/* accept BA sessions now */
 	clear_sta_flag(sta, WLAN_STA_BLOCK_BA);
 
-	ieee80211_recalc_min_chandef(sdata);
 	ieee80211_sta_debugfs_add(sta);
 	rate_control_add_sta_debugfs(sta);
 
@@ -941,7 +940,6 @@ static void __sta_info_destroy_part2(struct sta_info *sta)
 
 	rate_control_remove_sta_debugfs(sta);
 	ieee80211_sta_debugfs_remove(sta);
-	ieee80211_recalc_min_chandef(sdata);
 
 	cleanup_single_sta(sta);
 }
@@ -1808,14 +1806,17 @@ int sta_info_move_state(struct sta_info *sta,
 			clear_bit(WLAN_STA_AUTH, &sta->_flags);
 		break;
 	case IEEE80211_STA_AUTH:
-		if (sta->sta_state == IEEE80211_STA_NONE)
+		if (sta->sta_state == IEEE80211_STA_NONE) {
 			set_bit(WLAN_STA_AUTH, &sta->_flags);
-		else if (sta->sta_state == IEEE80211_STA_ASSOC)
+		} else if (sta->sta_state == IEEE80211_STA_ASSOC) {
 			clear_bit(WLAN_STA_ASSOC, &sta->_flags);
+			ieee80211_recalc_min_chandef(sta->sdata);
+		}
 		break;
 	case IEEE80211_STA_ASSOC:
 		if (sta->sta_state == IEEE80211_STA_AUTH) {
 			set_bit(WLAN_STA_ASSOC, &sta->_flags);
+			ieee80211_recalc_min_chandef(sta->sdata);
 		} else if (sta->sta_state == IEEE80211_STA_AUTHORIZED) {
 			if (sta->sdata->vif.type == NL80211_IFTYPE_AP ||
 			    (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
-- 
cgit v1.2.3-59-g8ed1b


From 506bcfa8abebdbcebdc17b03e96e38dc0b8ce765 Mon Sep 17 00:00:00 2001
From: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Date: Sun, 13 Dec 2015 15:41:05 +0200
Subject: mac80211: limit the A-MSDU Tx based on peer's capabilities

In VHT, the specification allows to limit the number of
MSDUs in an A-MSDU in the Extended Capabilities IE. There
is also a limitation on the byte size in the VHT IE.
In HT, the only limitation is on the byte size.
Parse the capabilities from the peer and make them
available to the driver.

In HT, there is another limitation when a BA agreement
is active: the byte size can't be greater than 4095.
This is not enforced here.

Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 19 +++++++++++++++++++
 include/net/mac80211.h    | 14 ++++++++++++++
 net/mac80211/cfg.c        | 29 +++++++++++++++++++++++++++++
 net/mac80211/ht.c         |  5 +++++
 net/mac80211/vht.c        | 17 +++++++++++++++++
 5 files changed, 84 insertions(+)

(limited to 'net')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index d9ddb89533a7..3b1f6cef9513 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -163,6 +163,14 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2)
 /* 30 byte 4 addr hdr, 2 byte QoS, 2304 byte MSDU, 12 byte crypt, 4 byte FCS */
 #define IEEE80211_MAX_FRAME_LEN		2352
 
+/* Maximal size of an A-MSDU */
+#define IEEE80211_MAX_MPDU_LEN_HT_3839		3839
+#define IEEE80211_MAX_MPDU_LEN_HT_7935		7935
+
+#define IEEE80211_MAX_MPDU_LEN_VHT_3895		3895
+#define IEEE80211_MAX_MPDU_LEN_VHT_7991		7991
+#define IEEE80211_MAX_MPDU_LEN_VHT_11454	11454
+
 #define IEEE80211_MAX_SSID_LEN		32
 
 #define IEEE80211_MAX_MESH_ID_LEN	32
@@ -1505,6 +1513,7 @@ struct ieee80211_vht_operation {
 #define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895			0x00000000
 #define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991			0x00000001
 #define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454			0x00000002
+#define IEEE80211_VHT_CAP_MAX_MPDU_MASK				0x00000003
 #define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ		0x00000004
 #define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ	0x00000008
 #define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK			0x0000000C
@@ -2086,6 +2095,16 @@ enum ieee80211_tdls_actioncode {
 #define WLAN_EXT_CAPA8_TDLS_WIDE_BW_ENABLED	BIT(5)
 #define WLAN_EXT_CAPA8_OPMODE_NOTIF	BIT(6)
 
+/* Defines the maximal number of MSDUs in an A-MSDU. */
+#define WLAN_EXT_CAPA8_MAX_MSDU_IN_AMSDU_LSB	BIT(7)
+#define WLAN_EXT_CAPA9_MAX_MSDU_IN_AMSDU_MSB	BIT(0)
+
+/*
+ * Fine Timing Measurement Initiator - bit 71 of @WLAN_EID_EXT_CAPABILITY
+ * information element
+ */
+#define WLAN_EXT_CAPA9_FTM_INITIATOR	BIT(7)
+
 /* TDLS specific payload type in the LLC/SNAP header */
 #define WLAN_TDLS_SNAP_RFTYPE	0x2
 
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 5910085af9e6..df5698ed8052 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1714,6 +1714,18 @@ struct ieee80211_sta_rates {
  * @tdls_initiator: indicates the STA is an initiator of the TDLS link. Only
  *	valid if the STA is a TDLS peer in the first place.
  * @mfp: indicates whether the STA uses management frame protection or not.
+ * @max_amsdu_subframes: indicates the maximal number of MSDUs in a single
+ *	A-MSDU. Taken from the Extended Capabilities element. 0 means
+ *	unlimited.
+ * @max_amsdu_len: indicates the maximal length of an A-MSDU in bytes. This
+ *	field is always valid for packets with a VHT preamble. For packets
+ *	with a HT preamble, additional limits apply:
+ *		+ If the skb is transmitted as part of a BA agreement, the
+ *		  A-MSDU maximal size is min(max_amsdu_len, 4065) bytes.
+ *		+ If the skb is not part of a BA aggreement, the A-MSDU maximal
+ *		  size is min(max_amsdu_len, 7935) bytes.
+ *	Both additional HT limits must be enforced by the low level driver.
+ *	This is defined by the spec (IEEE 802.11-2012 section 8.3.2.2 NOTE 2).
  * @txq: per-TID data TX queues (if driver uses the TXQ abstraction)
  */
 struct ieee80211_sta {
@@ -1732,6 +1744,8 @@ struct ieee80211_sta {
 	bool tdls;
 	bool tdls_initiator;
 	bool mfp;
+	u8 max_amsdu_subframes;
+	u16 max_amsdu_len;
 
 	struct ieee80211_txq *txq[IEEE80211_NUM_TIDS];
 
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 166a29fe6c35..66d22de93c8d 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1131,6 +1131,34 @@ static int sta_apply_parameters(struct ieee80211_local *local,
 		sta->sta.max_sp = params->max_sp;
 	}
 
+	/* The sender might not have sent the last bit, consider it to be 0 */
+	if (params->ext_capab_len >= 8) {
+		u8 val = (params->ext_capab[7] &
+			  WLAN_EXT_CAPA8_MAX_MSDU_IN_AMSDU_LSB) >> 7;
+
+		/* we did get all the bits, take the MSB as well */
+		if (params->ext_capab_len >= 9) {
+			u8 val_msb = params->ext_capab[8] &
+				WLAN_EXT_CAPA9_MAX_MSDU_IN_AMSDU_MSB;
+			val_msb <<= 1;
+			val |= val_msb;
+		}
+
+		switch (val) {
+		case 1:
+			sta->sta.max_amsdu_subframes = 32;
+			break;
+		case 2:
+			sta->sta.max_amsdu_subframes = 16;
+			break;
+		case 3:
+			sta->sta.max_amsdu_subframes = 8;
+			break;
+		default:
+			sta->sta.max_amsdu_subframes = 0;
+		}
+	}
+
 	/*
 	 * cfg80211 validates this (1-2007) and allows setting the AID
 	 * only when creating a new station entry
@@ -1160,6 +1188,7 @@ static int sta_apply_parameters(struct ieee80211_local *local,
 		ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband,
 						  params->ht_capa, sta);
 
+	/* VHT can override some HT caps such as the A-MSDU max length */
 	if (params->vht_capa)
 		ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband,
 						    params->vht_capa, sta);
diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index 7a76ce639d58..f4a528773563 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -230,6 +230,11 @@ bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata,
 	/* set Rx highest rate */
 	ht_cap.mcs.rx_highest = ht_cap_ie->mcs.rx_highest;
 
+	if (ht_cap.cap & IEEE80211_HT_CAP_MAX_AMSDU)
+		sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_HT_7935;
+	else
+		sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_HT_3839;
+
  apply:
 	changed = memcmp(&sta->sta.ht_cap, &ht_cap, sizeof(ht_cap));
 
diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index 050de08bf82e..204cf9ad3019 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -281,6 +281,23 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
 	}
 
 	sta->sta.bandwidth = ieee80211_sta_cur_vht_bw(sta);
+
+	/* If HT IE reported 3839 bytes only, stay with that size. */
+	if (sta->sta.max_amsdu_len == IEEE80211_MAX_MPDU_LEN_HT_3839)
+		return;
+
+	switch (vht_cap->cap & IEEE80211_VHT_CAP_MAX_MPDU_MASK) {
+	case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454:
+		sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_11454;
+		break;
+	case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991:
+		sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_7991;
+		break;
+	case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895:
+	default:
+		sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_3895;
+		break;
+	}
 }
 
 enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta)
-- 
cgit v1.2.3-59-g8ed1b


From de3bb771f471ba7c745a913b95eee70d5fb199b1 Mon Sep 17 00:00:00 2001
From: Ola Olsson <ola1olsson@gmail.com>
Date: Wed, 16 Dec 2015 22:43:35 +0100
Subject: cfg80211: add more warnings for inconsistent ops

Print a warning whenever an expected callback function
lacks implementation.

Signed-off-by: Ola Olsson <ola.olsson@sonymobile.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/core.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'net')

diff --git a/net/wireless/core.c b/net/wireless/core.c
index b0915515640e..3a9c41bc849a 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -352,6 +352,16 @@ struct wiphy *wiphy_new_nm(const struct cfg80211_ops *ops, int sizeof_priv,
 	WARN_ON(ops->add_station && !ops->del_station);
 	WARN_ON(ops->add_mpath && !ops->del_mpath);
 	WARN_ON(ops->join_mesh && !ops->leave_mesh);
+	WARN_ON(ops->start_p2p_device && !ops->stop_p2p_device);
+	WARN_ON(ops->start_ap && !ops->stop_ap);
+	WARN_ON(ops->join_ocb && !ops->leave_ocb);
+	WARN_ON(ops->suspend && !ops->resume);
+	WARN_ON(ops->sched_scan_start && !ops->sched_scan_stop);
+	WARN_ON(ops->remain_on_channel && !ops->cancel_remain_on_channel);
+	WARN_ON(ops->tdls_channel_switch && !ops->tdls_cancel_channel_switch);
+	WARN_ON(ops->add_tx_ts && !ops->del_tx_ts);
+	WARN_ON(ops->set_tx_power && !ops->get_tx_power);
+	WARN_ON(ops->set_antenna && !ops->get_antenna);
 
 	alloc_size = sizeof(*rdev) + sizeof_priv;
 
-- 
cgit v1.2.3-59-g8ed1b


From 8ac3c70419176b0fbc4aeae30de661f690b2e3ae Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 18 Dec 2015 15:08:34 +0100
Subject: mac80211: refactor HT/VHT to chandef code

The station MLME and IBSS/mesh ones use entirely different
code for interpreting HT and VHT operation elements. Change
the code that interprets them a bit - it now modifies an
existing chandef - and use it also in the MLME code.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/ibss.c        | 10 ++++-----
 net/mac80211/ieee80211_i.h | 10 ++++-----
 net/mac80211/mesh.c        |  9 ++++----
 net/mac80211/mlme.c        | 41 ++++-------------------------------
 net/mac80211/util.c        | 54 +++++++++++++++++++++++++---------------------
 5 files changed, 46 insertions(+), 78 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index f7fc0e00497f..e5e8a317b865 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -1050,9 +1050,8 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata,
 		struct cfg80211_chan_def chandef;
 		enum ieee80211_sta_rx_bandwidth bw = sta->sta.bandwidth;
 
-		ieee80211_ht_oper_to_chandef(channel,
-					     elems->ht_operation,
-					     &chandef);
+		cfg80211_chandef_create(&chandef, channel, NL80211_CHAN_NO_HT);
+		ieee80211_chandef_ht_oper(elems->ht_operation, &chandef);
 
 		memcpy(&htcap_ie, elems->ht_cap_elem, sizeof(htcap_ie));
 		rates_updated |= ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband,
@@ -1066,9 +1065,8 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata,
 			struct ieee80211_vht_cap cap_ie;
 			struct ieee80211_sta_vht_cap cap = sta->sta.vht_cap;
 
-			ieee80211_vht_oper_to_chandef(channel,
-						      elems->vht_operation,
-						      &chandef);
+			ieee80211_chandef_vht_oper(elems->vht_operation,
+						   &chandef);
 			memcpy(&cap_ie, elems->vht_cap_elem, sizeof(cap_ie));
 			ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband,
 							    &cap_ie, sta);
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 747402d8c7a9..9934447a8b9a 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1988,12 +1988,10 @@ int ieee80211_add_ext_srates_ie(struct ieee80211_sub_if_data *sdata,
 u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo);
 
 /* channel management */
-void ieee80211_ht_oper_to_chandef(struct ieee80211_channel *control_chan,
-				  const struct ieee80211_ht_operation *ht_oper,
-				  struct cfg80211_chan_def *chandef);
-void ieee80211_vht_oper_to_chandef(struct ieee80211_channel *control_chan,
-				   const struct ieee80211_vht_operation *oper,
-				   struct cfg80211_chan_def *chandef);
+bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper,
+			       struct cfg80211_chan_def *chandef);
+bool ieee80211_chandef_vht_oper(const struct ieee80211_vht_operation *oper,
+				struct cfg80211_chan_def *chandef);
 u32 ieee80211_chandef_downgrade(struct cfg80211_chan_def *c);
 
 int __must_check
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index fa28500f28fd..9a8e7b57c86e 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -91,11 +91,10 @@ bool mesh_matches_local(struct ieee80211_sub_if_data *sdata,
 	if (sdata->vif.bss_conf.basic_rates != basic_rates)
 		return false;
 
-	ieee80211_ht_oper_to_chandef(sdata->vif.bss_conf.chandef.chan,
-				     ie->ht_operation, &sta_chan_def);
-
-	ieee80211_vht_oper_to_chandef(sdata->vif.bss_conf.chandef.chan,
-				      ie->vht_operation, &sta_chan_def);
+	cfg80211_chandef_create(&sta_chan_def, sdata->vif.bss_conf.chandef.chan,
+				NL80211_CHAN_NO_HT);
+	ieee80211_chandef_ht_oper(ie->ht_operation, &sta_chan_def);
+	ieee80211_chandef_vht_oper(ie->vht_operation, &sta_chan_def);
 
 	if (!cfg80211_chandef_compatible(&sdata->vif.bss_conf.chandef,
 					 &sta_chan_def))
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 4af9b2bcc020..e6d198b981ab 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -196,16 +196,7 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata,
 
 	/* check 40 MHz support, if we have it */
 	if (sta_ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) {
-		switch (ht_oper->ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET) {
-		case IEEE80211_HT_PARAM_CHA_SEC_ABOVE:
-			chandef->width = NL80211_CHAN_WIDTH_40;
-			chandef->center_freq1 += 10;
-			break;
-		case IEEE80211_HT_PARAM_CHA_SEC_BELOW:
-			chandef->width = NL80211_CHAN_WIDTH_40;
-			chandef->center_freq1 -= 10;
-			break;
-		}
+		ieee80211_chandef_ht_oper(ht_oper, chandef);
 	} else {
 		/* 40 MHz (and 80 MHz) must be supported for VHT */
 		ret = IEEE80211_STA_DISABLE_VHT;
@@ -219,35 +210,11 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata,
 		goto out;
 	}
 
-	vht_chandef.chan = channel;
-	vht_chandef.center_freq1 =
-		ieee80211_channel_to_frequency(vht_oper->center_freq_seg1_idx,
-					       channel->band);
-	vht_chandef.center_freq2 = 0;
-
-	switch (vht_oper->chan_width) {
-	case IEEE80211_VHT_CHANWIDTH_USE_HT:
-		vht_chandef.width = chandef->width;
-		vht_chandef.center_freq1 = chandef->center_freq1;
-		break;
-	case IEEE80211_VHT_CHANWIDTH_80MHZ:
-		vht_chandef.width = NL80211_CHAN_WIDTH_80;
-		break;
-	case IEEE80211_VHT_CHANWIDTH_160MHZ:
-		vht_chandef.width = NL80211_CHAN_WIDTH_160;
-		break;
-	case IEEE80211_VHT_CHANWIDTH_80P80MHZ:
-		vht_chandef.width = NL80211_CHAN_WIDTH_80P80;
-		vht_chandef.center_freq2 =
-			ieee80211_channel_to_frequency(
-				vht_oper->center_freq_seg2_idx,
-				channel->band);
-		break;
-	default:
+	vht_chandef = *chandef;
+	if (!ieee80211_chandef_vht_oper(vht_oper, &vht_chandef)) {
 		if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT))
 			sdata_info(sdata,
-				   "AP VHT operation IE has invalid channel width (%d), disable VHT\n",
-				   vht_oper->chan_width);
+				   "AP VHT information is invalid, disable VHT\n");
 		ret = IEEE80211_STA_DISABLE_VHT;
 		goto out;
 	}
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index f4b2c04e7d81..7d0479e31674 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -2383,17 +2383,13 @@ u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap,
 	return pos + sizeof(struct ieee80211_vht_operation);
 }
 
-void ieee80211_ht_oper_to_chandef(struct ieee80211_channel *control_chan,
-				  const struct ieee80211_ht_operation *ht_oper,
-				  struct cfg80211_chan_def *chandef)
+bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper,
+			       struct cfg80211_chan_def *chandef)
 {
 	enum nl80211_channel_type channel_type;
 
-	if (!ht_oper) {
-		cfg80211_chandef_create(chandef, control_chan,
-					NL80211_CHAN_NO_HT);
-		return;
-	}
+	if (!ht_oper)
+		return false;
 
 	switch (ht_oper->ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET) {
 	case IEEE80211_HT_PARAM_CHA_SEC_NONE:
@@ -2407,42 +2403,52 @@ void ieee80211_ht_oper_to_chandef(struct ieee80211_channel *control_chan,
 		break;
 	default:
 		channel_type = NL80211_CHAN_NO_HT;
+		return false;
 	}
 
-	cfg80211_chandef_create(chandef, control_chan, channel_type);
+	cfg80211_chandef_create(chandef, chandef->chan, channel_type);
+	return true;
 }
 
-void ieee80211_vht_oper_to_chandef(struct ieee80211_channel *control_chan,
-				   const struct ieee80211_vht_operation *oper,
-				   struct cfg80211_chan_def *chandef)
+bool ieee80211_chandef_vht_oper(const struct ieee80211_vht_operation *oper,
+				struct cfg80211_chan_def *chandef)
 {
+	struct cfg80211_chan_def new = *chandef;
+	int cf1, cf2;
+
 	if (!oper)
-		return;
+		return false;
 
-	chandef->chan = control_chan;
+	cf1 = ieee80211_channel_to_frequency(oper->center_freq_seg1_idx,
+					     chandef->chan->band);
+	cf2 = ieee80211_channel_to_frequency(oper->center_freq_seg2_idx,
+					     chandef->chan->band);
 
 	switch (oper->chan_width) {
 	case IEEE80211_VHT_CHANWIDTH_USE_HT:
 		break;
 	case IEEE80211_VHT_CHANWIDTH_80MHZ:
-		chandef->width = NL80211_CHAN_WIDTH_80;
+		new.width = NL80211_CHAN_WIDTH_80;
+		new.center_freq1 = cf1;
 		break;
 	case IEEE80211_VHT_CHANWIDTH_160MHZ:
-		chandef->width = NL80211_CHAN_WIDTH_160;
+		new.width = NL80211_CHAN_WIDTH_160;
+		new.center_freq1 = cf1;
 		break;
 	case IEEE80211_VHT_CHANWIDTH_80P80MHZ:
-		chandef->width = NL80211_CHAN_WIDTH_80P80;
+		new.width = NL80211_CHAN_WIDTH_80P80;
+		new.center_freq1 = cf1;
+		new.center_freq2 = cf2;
 		break;
 	default:
-		break;
+		return false;
 	}
 
-	chandef->center_freq1 =
-		ieee80211_channel_to_frequency(oper->center_freq_seg1_idx,
-					       control_chan->band);
-	chandef->center_freq2 =
-		ieee80211_channel_to_frequency(oper->center_freq_seg2_idx,
-					       control_chan->band);
+	if (!cfg80211_chandef_valid(&new))
+		return false;
+
+	*chandef = new;
+	return true;
 }
 
 int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef,
-- 
cgit v1.2.3-59-g8ed1b


From dd21dfc645d5dce0657af78761b3fa11a3a95398 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 20 Jan 2016 10:39:23 +0100
Subject: rfkill: disentangle polling pause and suspend

When suspended while polling is paused, polling will erroneously
resume at resume time. Fix this by tracking pause and suspend in
separate state variable and adding the necessary checks.

Clarify the documentation on this as well.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/rfkill.h |  3 ++-
 net/rfkill/core.c      | 17 +++++++++++++++--
 2 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h
index d9010789b4e8..7af625f6d226 100644
--- a/include/linux/rfkill.h
+++ b/include/linux/rfkill.h
@@ -104,7 +104,8 @@ int __must_check rfkill_register(struct rfkill *rfkill);
  *
  * Pause polling -- say transmitter is off for other reasons.
  * NOTE: not necessary for suspend/resume -- in that case the
- * core stops polling anyway
+ * core stops polling anyway (but will also correctly handle
+ * the case of polling having been paused before suspend.)
  */
 void rfkill_pause_polling(struct rfkill *rfkill);
 
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index f53bf3b6558b..166439995f34 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -57,6 +57,8 @@ struct rfkill {
 
 	bool			registered;
 	bool			persistent;
+	bool			polling_paused;
+	bool			suspended;
 
 	const struct rfkill_ops	*ops;
 	void			*data;
@@ -786,6 +788,7 @@ void rfkill_pause_polling(struct rfkill *rfkill)
 	if (!rfkill->ops->poll)
 		return;
 
+	rfkill->polling_paused = true;
 	cancel_delayed_work_sync(&rfkill->poll_work);
 }
 EXPORT_SYMBOL(rfkill_pause_polling);
@@ -797,6 +800,11 @@ void rfkill_resume_polling(struct rfkill *rfkill)
 	if (!rfkill->ops->poll)
 		return;
 
+	rfkill->polling_paused = false;
+
+	if (rfkill->suspended)
+		return;
+
 	queue_delayed_work(system_power_efficient_wq,
 			   &rfkill->poll_work, 0);
 }
@@ -807,7 +815,8 @@ static int rfkill_suspend(struct device *dev)
 {
 	struct rfkill *rfkill = to_rfkill(dev);
 
-	rfkill_pause_polling(rfkill);
+	rfkill->suspended = true;
+	cancel_delayed_work_sync(&rfkill->poll_work);
 
 	return 0;
 }
@@ -817,12 +826,16 @@ static int rfkill_resume(struct device *dev)
 	struct rfkill *rfkill = to_rfkill(dev);
 	bool cur;
 
+	rfkill->suspended = false;
+
 	if (!rfkill->persistent) {
 		cur = !!(rfkill->state & RFKILL_BLOCK_SW);
 		rfkill_set_block(rfkill, cur);
 	}
 
-	rfkill_resume_polling(rfkill);
+	if (rfkill->ops->poll && !rfkill->polling_paused)
+		queue_delayed_work(system_power_efficient_wq,
+				   &rfkill->poll_work, 0);
 
 	return 0;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 573a2b51acb48e745094cf3cb942cf6a55d2bf30 Mon Sep 17 00:00:00 2001
From: Ola Olsson <ola1olsson@gmail.com>
Date: Sun, 10 Jan 2016 18:41:42 +0100
Subject: cfg80211: Fix some linguistics in Kconfig

Signed-off-by: Ola Olsson <ola.olsson@sonymobile.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/Kconfig | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig
index ec3bf30dd526..6c606120abfe 100644
--- a/net/wireless/Kconfig
+++ b/net/wireless/Kconfig
@@ -50,8 +50,8 @@ config CFG80211_DEVELOPER_WARNINGS
 	default n
 	help
 	  This option enables some additional warnings that help
-	  cfg80211 developers and driver developers, but that can
-	  trigger due to races with userspace.
+	  cfg80211 developers and driver developers, but beware that
+	  they can also trigger due to races with userspace.
 
 	  For example, when a driver reports that it was disconnected
 	  from the AP, but the user disconnects manually at the same
@@ -110,7 +110,7 @@ config CFG80211_REG_RELAX_NO_IR
 	 interface which associated to an AP which userspace assumes or confirms
 	 to be an authorized master, i.e., with radar detection support and DFS
 	 capabilities. However, note that in order to not create daisy chain
-	 scenarios, this relaxation is not allowed in cases that the BSS client
+	 scenarios, this relaxation is not allowed in cases where the BSS client
 	 is associated to P2P GO and in addition the P2P GO instantiated on
 	 a channel due to this relaxation should not allow connection from
 	 non P2P clients.
@@ -135,7 +135,7 @@ config CFG80211_DEBUGFS
 	depends on CFG80211
 	depends on DEBUG_FS
 	---help---
-	  You can enable this if you want to debugfs entries for cfg80211.
+	  You can enable this if you want debugfs entries for cfg80211.
 
 	  If unsure, say N.
 
@@ -146,7 +146,7 @@ config CFG80211_INTERNAL_REGDB
 	---help---
 	  This option generates an internal data structure representing
 	  the wireless regulatory rules described in net/wireless/db.txt
-	  and includes code to query that database.  This is an alternative
+	  and includes code to query that database. This is an alternative
 	  to using CRDA for defining regulatory rules for the kernel.
 
 	  Using this option requires some parsing of the db.txt at build time,
@@ -159,7 +159,7 @@ config CFG80211_INTERNAL_REGDB
 
 	  http://wireless.kernel.org/en/developers/Regulatory
 
-	  Most distributions have a CRDA package.  So if unsure, say N.
+	  Most distributions have a CRDA package. So if unsure, say N.
 
 config CFG80211_CRDA_SUPPORT
 	bool "support CRDA" if CFG80211_INTERNAL_REGDB
-- 
cgit v1.2.3-59-g8ed1b


From f3e7fae248224b9441098e3c344b48b1cd1d9eca Mon Sep 17 00:00:00 2001
From: João Paulo Rechi Vita <jprvita@gmail.com>
Date: Tue, 19 Jan 2016 10:42:37 -0500
Subject: rfkill: use variable instead of duplicating the expression
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

RFKILL_BLOCK_SW value have just been saved to prev, no need to check it
again in the if expression. This makes code a little bit easier to read.

Signed-off-by: João Paulo Rechi Vita <jprvita@endlessm.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/rfkill/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index 166439995f34..061ed37c07cf 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -287,7 +287,7 @@ static void rfkill_set_block(struct rfkill *rfkill, bool blocked)
 	spin_lock_irqsave(&rfkill->lock, flags);
 	prev = rfkill->state & RFKILL_BLOCK_SW;
 
-	if (rfkill->state & RFKILL_BLOCK_SW)
+	if (prev)
 		rfkill->state |= RFKILL_BLOCK_SW_PREV;
 	else
 		rfkill->state &= ~RFKILL_BLOCK_SW_PREV;
-- 
cgit v1.2.3-59-g8ed1b


From 1926e260d8d377b95b57a49eefeb54ac42919f53 Mon Sep 17 00:00:00 2001
From: João Paulo Rechi Vita <jprvita@gmail.com>
Date: Tue, 19 Jan 2016 10:42:38 -0500
Subject: rfkill: remove/inline __rfkill_set_hw_state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

__rfkill_set_hw_state() is only one used in rfkill_set_hw_state(), and
none of them are long or complicated, so merging the two makes the code
easier to read.

Signed-off-by: João Paulo Rechi Vita <jprvita@endlessm.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/rfkill/core.c | 41 +++++++++++++++--------------------------
 1 file changed, 15 insertions(+), 26 deletions(-)

(limited to 'net')

diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index 061ed37c07cf..076590e92dde 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -235,29 +235,6 @@ static void rfkill_event(struct rfkill *rfkill)
 	rfkill_send_events(rfkill, RFKILL_OP_CHANGE);
 }
 
-static bool __rfkill_set_hw_state(struct rfkill *rfkill,
-				  bool blocked, bool *change)
-{
-	unsigned long flags;
-	bool prev, any;
-
-	BUG_ON(!rfkill);
-
-	spin_lock_irqsave(&rfkill->lock, flags);
-	prev = !!(rfkill->state & RFKILL_BLOCK_HW);
-	if (blocked)
-		rfkill->state |= RFKILL_BLOCK_HW;
-	else
-		rfkill->state &= ~RFKILL_BLOCK_HW;
-	*change = prev != blocked;
-	any = !!(rfkill->state & RFKILL_BLOCK_ANY);
-	spin_unlock_irqrestore(&rfkill->lock, flags);
-
-	rfkill_led_trigger_event(rfkill);
-
-	return any;
-}
-
 /**
  * rfkill_set_block - wrapper for set_block method
  *
@@ -482,14 +459,26 @@ bool rfkill_get_global_sw_state(const enum rfkill_type type)
 
 bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked)
 {
-	bool ret, change;
+	unsigned long flags;
+	bool ret, prev;
+
+	BUG_ON(!rfkill);
+
+	spin_lock_irqsave(&rfkill->lock, flags);
+	prev = !!(rfkill->state & RFKILL_BLOCK_HW);
+	if (blocked)
+		rfkill->state |= RFKILL_BLOCK_HW;
+	else
+		rfkill->state &= ~RFKILL_BLOCK_HW;
+	ret = !!(rfkill->state & RFKILL_BLOCK_ANY);
+	spin_unlock_irqrestore(&rfkill->lock, flags);
 
-	ret = __rfkill_set_hw_state(rfkill, blocked, &change);
+	rfkill_led_trigger_event(rfkill);
 
 	if (!rfkill->registered)
 		return ret;
 
-	if (change)
+	if (prev != blocked)
 		schedule_work(&rfkill->uevent_work);
 
 	return ret;
-- 
cgit v1.2.3-59-g8ed1b


From e2a35e89291d70d1c4668b3216f84ec740d36be3 Mon Sep 17 00:00:00 2001
From: João Paulo Rechi Vita <jprvita@gmail.com>
Date: Tue, 19 Jan 2016 10:42:39 -0500
Subject: rfkill: Remove obsolete "claim" sysfs interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This was scheduled to be removed in 2012 by:

 commit 69c86373c6ea1149aa559e6088362d58d8ec8835
 Author: florian@mickler.org <florian@mickler.org>
 Date:   Wed Feb 24 12:05:16 2010 +0100

     Document the rfkill sysfs ABI

     This moves sysfs ABI info from Documentation/rfkill.txt to the
     ABI subfolder and reformats it.

     This also schedules the deprecated sysfs parts to be removed in
     2012 (claim file) and 2014 (state file).

     Signed-off-by: Florian Mickler <florian@mickler.org>
     Signed-off-by: John W. Linville <linville@tuxdriver.com>

Signed-off-by: João Paulo Rechi Vita <jprvita@endlessm.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 Documentation/ABI/obsolete/sysfs-class-rfkill |  9 ---------
 Documentation/ABI/removed/sysfs-class-rfkill  | 13 +++++++++++++
 net/rfkill/core.c                             | 11 +----------
 3 files changed, 14 insertions(+), 19 deletions(-)
 create mode 100644 Documentation/ABI/removed/sysfs-class-rfkill

(limited to 'net')

diff --git a/Documentation/ABI/obsolete/sysfs-class-rfkill b/Documentation/ABI/obsolete/sysfs-class-rfkill
index ff60ad9eca4c..e736d145085f 100644
--- a/Documentation/ABI/obsolete/sysfs-class-rfkill
+++ b/Documentation/ABI/obsolete/sysfs-class-rfkill
@@ -18,12 +18,3 @@ Values: 	A numeric value.
 		2: RFKILL_STATE_HARD_BLOCKED
 			transmitter is forced off by something outside of
 			the driver's control.
-
-What:		/sys/class/rfkill/rfkill[0-9]+/claim
-Date:		09-Jul-2007
-KernelVersion	v2.6.22
-Contact:	linux-wireless@vger.kernel.org
-Description:	This file is deprecated because there no longer is a way to
-		claim just control over a single rfkill instance.
-		This file is scheduled to be removed in 2012.
-Values: 	0: Kernel handles events
diff --git a/Documentation/ABI/removed/sysfs-class-rfkill b/Documentation/ABI/removed/sysfs-class-rfkill
new file mode 100644
index 000000000000..3ce6231f20b2
--- /dev/null
+++ b/Documentation/ABI/removed/sysfs-class-rfkill
@@ -0,0 +1,13 @@
+rfkill - radio frequency (RF) connector kill switch support
+
+For details to this subsystem look at Documentation/rfkill.txt.
+
+What:		/sys/class/rfkill/rfkill[0-9]+/claim
+Date:		09-Jul-2007
+KernelVersion	v2.6.22
+Contact:	linux-wireless@vger.kernel.org
+Description:	This file was deprecated because there no longer was a way to
+		claim just control over a single rfkill instance.
+		This file was scheduled to be removed in 2012, and was removed
+		in 2016.
+Values: 	0: Kernel handles events
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index 076590e92dde..a805831d5d9b 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -311,8 +311,7 @@ static atomic_t rfkill_input_disabled = ATOMIC_INIT(0);
  * @blocked: the new state
  *
  * This function sets the state of all switches of given type,
- * unless a specific switch is claimed by userspace (in which case,
- * that switch is left alone) or suspended.
+ * unless a specific switch is suspended.
  *
  * Caller must have acquired rfkill_global_mutex.
  */
@@ -721,20 +720,12 @@ static ssize_t state_store(struct device *dev, struct device_attribute *attr,
 }
 static DEVICE_ATTR_RW(state);
 
-static ssize_t claim_show(struct device *dev, struct device_attribute *attr,
-			  char *buf)
-{
-	return sprintf(buf, "%d\n", 0);
-}
-static DEVICE_ATTR_RO(claim);
-
 static struct attribute *rfkill_dev_attrs[] = {
 	&dev_attr_name.attr,
 	&dev_attr_type.attr,
 	&dev_attr_index.attr,
 	&dev_attr_persistent.attr,
 	&dev_attr_state.attr,
-	&dev_attr_claim.attr,
 	&dev_attr_soft.attr,
 	&dev_attr_hard.attr,
 	NULL,
-- 
cgit v1.2.3-59-g8ed1b


From b8631c003307bc7abf0180d950c42d9032ebda70 Mon Sep 17 00:00:00 2001
From: Sunil Shahu <shshahu@gmail.com>
Date: Thu, 21 Jan 2016 11:06:25 +0530
Subject: mac80211: mesh_plink: remove redundant sta_info check

Remove unnecessory "if" statement and club it with previos "if" block.

Signed-off-by: Sunil Shahu <shshahu@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/mesh_plink.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index bd3d55eb21d4..a07e93c21c9e 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -976,6 +976,10 @@ mesh_plink_get_event(struct ieee80211_sub_if_data *sdata,
 			mpl_dbg(sdata, "Mesh plink error: no more free plinks\n");
 			goto out;
 		}
+
+		/* new matching peer */
+		event = OPN_ACPT;
+		goto out;
 	} else {
 		if (!test_sta_flag(sta, WLAN_STA_AUTH)) {
 			mpl_dbg(sdata, "Mesh plink: Action frame from non-authed peer\n");
@@ -985,12 +989,6 @@ mesh_plink_get_event(struct ieee80211_sub_if_data *sdata,
 			goto out;
 	}
 
-	/* new matching peer */
-	if (!sta) {
-		event = OPN_ACPT;
-		goto out;
-	}
-
 	switch (ftype) {
 	case WLAN_SP_MESH_PEERING_OPEN:
 		if (!matches_local)
-- 
cgit v1.2.3-59-g8ed1b


From 2a58d42c1e018ad514d4e23fd33fb2ded95d3ee6 Mon Sep 17 00:00:00 2001
From: Michal Kazior <michal.kazior@tieto.com>
Date: Thu, 21 Jan 2016 14:23:07 +0100
Subject: mac80211: fix txq queue related crashes

The driver can access the queue simultanously
while mac80211 tears down the interface. Without
spinlock protection this could lead to corrupting
sk_buff_head and subsequently to an invalid
pointer dereference.

Fixes: ba8c3d6f16a1 ("mac80211: add an intermediate software queue implementation")
Signed-off-by: Michal Kazior <michal.kazior@tieto.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/iface.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 33ae3c81bfc5..0451f120746e 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -977,7 +977,10 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
 	if (sdata->vif.txq) {
 		struct txq_info *txqi = to_txq_info(sdata->vif.txq);
 
+		spin_lock_bh(&txqi->queue.lock);
 		ieee80211_purge_tx_queue(&local->hw, &txqi->queue);
+		spin_unlock_bh(&txqi->queue.lock);
+
 		atomic_set(&sdata->txqs_len[txqi->txq.ac], 0);
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From cf44012810ccdd8fd947518e965cb04b7b8498be Mon Sep 17 00:00:00 2001
From: Michal Kazior <michal.kazior@tieto.com>
Date: Mon, 25 Jan 2016 14:43:24 +0100
Subject: mac80211: fix unnecessary frame drops in mesh fwding

The ieee80211_queue_stopped() expects hw queue
number but it was given raw WMM AC number instead.

This could cause frame drops and problems with
traffic in some cases - most notably if driver
doesn't map AC numbers to queue numbers 1:1 and
uses ieee80211_stop_queues() and
ieee80211_wake_queue() only without ever calling
ieee80211_wake_queues().

On ath10k it was possible to hit this problem in
the following case:

  1. wlan0 uses queue 0
     (ath10k maps queues per vif)
  2. offchannel uses queue 15
  3. queues 1-14 are unused
  4. ieee80211_stop_queues()
  5. ieee80211_wake_queue(q=0)
  6. ieee80211_wake_queue(q=15)
     (other queues are not woken up because both
      driver and mac80211 know other queues are
      unused)
  7. ieee80211_rx_h_mesh_fwding()
  8. ieee80211_select_queue_80211() returns 2
  9. ieee80211_queue_stopped(q=2) returns true
 10. frame is dropped (oops!)

Fixes: d3c1597b8d1b ("mac80211: fix forwarded mesh frame queue mapping")
Signed-off-by: Michal Kazior <michal.kazior@tieto.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/rx.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index ae993edfdecf..664e8861edbe 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -2235,7 +2235,7 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
 	struct ieee80211_local *local = rx->local;
 	struct ieee80211_sub_if_data *sdata = rx->sdata;
 	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
-	u16 q, hdrlen;
+	u16 ac, q, hdrlen;
 
 	hdr = (struct ieee80211_hdr *) skb->data;
 	hdrlen = ieee80211_hdrlen(hdr->frame_control);
@@ -2304,7 +2304,8 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
 	    ether_addr_equal(sdata->vif.addr, hdr->addr3))
 		return RX_CONTINUE;
 
-	q = ieee80211_select_queue_80211(sdata, skb, hdr);
+	ac = ieee80211_select_queue_80211(sdata, skb, hdr);
+	q = sdata->vif.hw_queue[ac];
 	if (ieee80211_queue_stopped(&local->hw, q)) {
 		IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, dropped_frames_congestion);
 		return RX_DROP_MONITOR;
-- 
cgit v1.2.3-59-g8ed1b


From f4a0f0c5264e72d9279fbf9cf48a061526e8f788 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 25 Jan 2016 15:46:34 +0200
Subject: mac80211: add RX_FLAG_MACTIME_PLCP_START

The timestamp given by iwlwifi is at the beginning of the
frame over the air, at (or during) the SYNC field. Allow
such timestamps to be given to mac80211, at least (for now)
for frames with non-HT/VHT preambles.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     |  3 +++
 net/mac80211/ieee80211_i.h |  8 +++++++-
 net/mac80211/util.c        | 14 +++++++++++++-
 3 files changed, 23 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 566df20dc957..31337f81ec03 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1010,6 +1010,8 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info)
  * @RX_FLAG_MACTIME_END: The timestamp passed in the RX status (@mactime
  *	field) is valid and contains the time the last symbol of the MPDU
  *	(including FCS) was received.
+ * @RX_FLAG_MACTIME_PLCP_START: The timestamp passed in the RX status (@mactime
+ *	field) is valid and contains the time the SYNC preamble was received.
  * @RX_FLAG_SHORTPRE: Short preamble was used for this frame
  * @RX_FLAG_HT: HT MCS was used and rate_idx is MCS index
  * @RX_FLAG_VHT: VHT MCS was used and rate_index is MCS index
@@ -1058,6 +1060,7 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info)
 enum mac80211_rx_flags {
 	RX_FLAG_MMIC_ERROR		= BIT(0),
 	RX_FLAG_DECRYPTED		= BIT(1),
+	RX_FLAG_MACTIME_PLCP_START	= BIT(2),
 	RX_FLAG_MMIC_STRIPPED		= BIT(3),
 	RX_FLAG_IV_STRIPPED		= BIT(4),
 	RX_FLAG_FAILED_FCS_CRC		= BIT(5),
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 9934447a8b9a..a29f61dc9c06 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1466,7 +1466,13 @@ ieee80211_have_rx_timestamp(struct ieee80211_rx_status *status)
 {
 	WARN_ON_ONCE(status->flag & RX_FLAG_MACTIME_START &&
 		     status->flag & RX_FLAG_MACTIME_END);
-	return status->flag & (RX_FLAG_MACTIME_START | RX_FLAG_MACTIME_END);
+	if (status->flag & (RX_FLAG_MACTIME_START | RX_FLAG_MACTIME_END))
+		return true;
+	/* can't handle HT/VHT preamble yet */
+	if (status->flag & RX_FLAG_MACTIME_PLCP_START &&
+	    !(status->flag & (RX_FLAG_HT | RX_FLAG_VHT)))
+		return true;
+	return false;
 }
 
 u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local,
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 7d0479e31674..fb90d9c5df59 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -4,7 +4,7 @@
  * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
  * Copyright 2007	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
- * Copyright (C) 2015	Intel Deutschland GmbH
+ * Copyright (C) 2015-2016	Intel Deutschland GmbH
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -2671,6 +2671,18 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local,
 		sband = local->hw.wiphy->bands[status->band];
 		bitrate = sband->bitrates[status->rate_idx].bitrate;
 		ri.legacy = DIV_ROUND_UP(bitrate, (1 << shift));
+
+		if (status->flag & RX_FLAG_MACTIME_PLCP_START) {
+			/* TODO: handle HT/VHT preambles */
+			if (status->band == IEEE80211_BAND_5GHZ) {
+				ts += 20 << shift;
+				mpdu_offset += 2;
+			} else if (status->flag & RX_FLAG_SHORTPRE) {
+				ts += 96;
+			} else {
+				ts += 192;
+			}
+		}
 	}
 
 	rate = cfg80211_calculate_bitrate(&ri);
-- 
cgit v1.2.3-59-g8ed1b


From d321cd014e51baab475efbdec468255b9e0ec822 Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Mon, 25 Jan 2016 15:46:35 +0200
Subject: mac80211: fix ibss scan parameters

When joining IBSS a full scan should be initiated in order to search
for existing cell, unless the fixed_channel parameter was set.
A default channel to create the IBSS on if no cell was found is
provided as well.
However - a scan is initiated only on the default channel provided
regardless of whether ifibss->fixed_channel is set or not, with the
obvious result of the cell not joining existing IBSS cell that is
on another channel.

Fixes: 76bed0f43b27 ("mac80211: IBSS fix scan request")
Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/ibss.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index e5e8a317b865..9b983788ee51 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -7,6 +7,7 @@
  * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
  * Copyright 2009, Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
+ * Copyright(c) 2016 Intel Deutschland GmbH
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -1483,14 +1484,21 @@ static void ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata)
 
 		sdata_info(sdata, "Trigger new scan to find an IBSS to join\n");
 
-		num = ieee80211_ibss_setup_scan_channels(local->hw.wiphy,
-							 &ifibss->chandef,
-							 channels,
-							 ARRAY_SIZE(channels));
 		scan_width = cfg80211_chandef_to_scan_width(&ifibss->chandef);
-		ieee80211_request_ibss_scan(sdata, ifibss->ssid,
-					    ifibss->ssid_len, channels, num,
-					    scan_width);
+
+		if (ifibss->fixed_channel) {
+			num = ieee80211_ibss_setup_scan_channels(local->hw.wiphy,
+								 &ifibss->chandef,
+								 channels,
+								 ARRAY_SIZE(channels));
+			ieee80211_request_ibss_scan(sdata, ifibss->ssid,
+						    ifibss->ssid_len, channels,
+						    num, scan_width);
+		} else {
+			ieee80211_request_ibss_scan(sdata, ifibss->ssid,
+						    ifibss->ssid_len, NULL,
+						    0, scan_width);
+		}
 	} else {
 		int interval = IEEE80211_SCAN_INTERVAL;
 
-- 
cgit v1.2.3-59-g8ed1b


From 89f774e6c41efdcede9d4b37867af5aabcc835d5 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 25 Jan 2016 15:46:36 +0200
Subject: mac80211: always print a message when disconnecting

Make sure there's at least a debug message whenever the
connection to the AP is terminated.

Also change one message from wiphy_debug() to the common
mlme_dbg().

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/mlme.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index e6d198b981ab..857089de475f 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -3544,6 +3544,9 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
 				elems.ht_cap_elem, elems.ht_operation,
 				elems.vht_operation, bssid, &changed)) {
 		mutex_unlock(&local->sta_mtx);
+		sdata_info(sdata,
+			   "failed to follow AP %pM bandwidth change, disconnect\n",
+			   bssid);
 		ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH,
 				       WLAN_REASON_DEAUTH_LEAVING,
 				       true, deauth_buf);
@@ -3919,11 +3922,9 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata)
 			 * We actually lost the connection ... or did we?
 			 * Let's make sure!
 			 */
-			wiphy_debug(local->hw.wiphy,
-				    "%s: No probe response from AP %pM"
-				    " after %dms, disconnecting.\n",
-				    sdata->name,
-				    bssid, probe_wait_ms);
+			mlme_dbg(sdata,
+				 "No probe response from AP %pM after %dms, disconnecting.\n",
+				 bssid, probe_wait_ms);
 
 			ieee80211_sta_connection_lost(sdata, bssid,
 				WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY, false);
@@ -4511,6 +4512,9 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
 	if (ifmgd->associated) {
 		u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN];
 
+		sdata_info(sdata,
+			   "disconnect from AP %pM for new auth to %pM\n",
+			   ifmgd->associated->bssid, req->bss->bssid);
 		ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH,
 				       WLAN_REASON_UNSPECIFIED,
 				       false, frame_buf);
@@ -4579,6 +4583,9 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
 	if (ifmgd->associated) {
 		u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN];
 
+		sdata_info(sdata,
+			   "disconnect from AP %pM for new assoc to %pM\n",
+			   ifmgd->associated->bssid, req->bss->bssid);
 		ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH,
 				       WLAN_REASON_UNSPECIFIED,
 				       false, frame_buf);
-- 
cgit v1.2.3-59-g8ed1b


From 0ef049dc1167fe834d0ad5d63f89eddc5c70f6e4 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 26 Jan 2016 23:05:31 +0100
Subject: mac80211: avoid excessive stack usage in sta_info

When CONFIG_OPTIMIZE_INLINING is set, the sta_info_insert_finish
function consumes more stack than normally, exceeding the
1024 byte limit on ARM:

net/mac80211/sta_info.c: In function 'sta_info_insert_finish':
net/mac80211/sta_info.c:561:1: error: the frame size of 1080 bytes is larger than 1024 bytes [-Werror=frame-larger-than=]

It turns out that there are two functions that put a 'struct station_info'
on the stack: __sta_info_destroy_part2 and sta_info_insert_finish, and
this structure alone requires up to 792 bytes.

Hoping that both are called rarely enough, this replaces the
on-stack structure with a dynamic allocation, which unfortunately
requires some suboptimal error handling for out-of-memory.

The __sta_info_destroy_part2 function is actually affected by the
stack usage twice because it calls cfg80211_del_sta_sinfo(), which
has another instance of struct station_info on its stack.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Fixes: 98b6218388e3 ("mac80211/cfg80211: add station events")
Fixes: 6f7a8d26e266 ("mac80211: send statistics with delete station event")
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/sta_info.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 7e007cf12cb2..b28e7a220d56 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -498,11 +498,17 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU)
 {
 	struct ieee80211_local *local = sta->local;
 	struct ieee80211_sub_if_data *sdata = sta->sdata;
-	struct station_info sinfo;
+	struct station_info *sinfo;
 	int err = 0;
 
 	lockdep_assert_held(&local->sta_mtx);
 
+	sinfo = kzalloc(sizeof(struct station_info), GFP_KERNEL);
+	if (!sinfo) {
+		err = -ENOMEM;
+		goto out_err;
+	}
+
 	/* check if STA exists already */
 	if (sta_info_get_bss(sdata, sta->sta.addr)) {
 		err = -EEXIST;
@@ -533,10 +539,9 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU)
 	ieee80211_sta_debugfs_add(sta);
 	rate_control_add_sta_debugfs(sta);
 
-	memset(&sinfo, 0, sizeof(sinfo));
-	sinfo.filled = 0;
-	sinfo.generation = local->sta_generation;
-	cfg80211_new_sta(sdata->dev, sta->sta.addr, &sinfo, GFP_KERNEL);
+	sinfo->generation = local->sta_generation;
+	cfg80211_new_sta(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL);
+	kfree(sinfo);
 
 	sta_dbg(sdata, "Inserted STA %pM\n", sta->sta.addr);
 
@@ -897,7 +902,7 @@ static void __sta_info_destroy_part2(struct sta_info *sta)
 {
 	struct ieee80211_local *local = sta->local;
 	struct ieee80211_sub_if_data *sdata = sta->sdata;
-	struct station_info sinfo = {};
+	struct station_info *sinfo;
 	int ret;
 
 	/*
@@ -935,8 +940,11 @@ static void __sta_info_destroy_part2(struct sta_info *sta)
 
 	sta_dbg(sdata, "Removed STA %pM\n", sta->sta.addr);
 
-	sta_set_sinfo(sta, &sinfo);
-	cfg80211_del_sta_sinfo(sdata->dev, sta->sta.addr, &sinfo, GFP_KERNEL);
+	sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL);
+	if (sinfo)
+		sta_set_sinfo(sta, sinfo);
+	cfg80211_del_sta_sinfo(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL);
+	kfree(sinfo);
 
 	rate_control_remove_sta_debugfs(sta);
 	ieee80211_sta_debugfs_remove(sta);
-- 
cgit v1.2.3-59-g8ed1b


From 3f73fe9fd8703123faf663908923ca9e62cb984e Mon Sep 17 00:00:00 2001
From: Bob Copeland <me@bobcopeland.com>
Date: Sat, 30 Jan 2016 13:17:03 -0500
Subject: mac80211: mesh: drop constant field mean_chain_len

The mean_chain_len field in struct mesh_table is copied whenever a
new mesh table is allocated, but only ever has the value 2 and is
never otherwise updated, so just remove it and use the related
define instead.

Signed-off-by: Bob Copeland <me@bobcopeland.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/mesh.h         | 3 ---
 net/mac80211/mesh_pathtbl.c | 9 +++------
 2 files changed, 3 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h
index a1596344c3ba..d941f3a73a4f 100644
--- a/net/mac80211/mesh.h
+++ b/net/mac80211/mesh.h
@@ -137,8 +137,6 @@ struct mesh_path {
  * @copy_node: function to copy nodes of the table
  * @size_order: determines size of the table, there will be 2^size_order hash
  *	buckets
- * @mean_chain_len: maximum average length for the hash buckets' list, if it is
- *	reached, the table will grow
  * @known_gates: list of known mesh gates and their mpaths by the station. The
  * gate's mpath may or may not be resolved and active.
  *
@@ -154,7 +152,6 @@ struct mesh_table {
 	void (*free_node) (struct hlist_node *p, bool free_leafs);
 	int (*copy_node) (struct hlist_node *p, struct mesh_table *newtbl);
 	int size_order;
-	int mean_chain_len;
 	struct hlist_head *known_gates;
 	spinlock_t gates_lock;
 
diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c
index dadf8dc6f1cf..c95bdb8bb452 100644
--- a/net/mac80211/mesh_pathtbl.c
+++ b/net/mac80211/mesh_pathtbl.c
@@ -160,11 +160,10 @@ static int mesh_table_grow(struct mesh_table *oldtbl,
 	int i;
 
 	if (atomic_read(&oldtbl->entries)
-			< oldtbl->mean_chain_len * (oldtbl->hash_mask + 1))
+			< MEAN_CHAIN_LEN * (oldtbl->hash_mask + 1))
 		return -EAGAIN;
 
 	newtbl->free_node = oldtbl->free_node;
-	newtbl->mean_chain_len = oldtbl->mean_chain_len;
 	newtbl->copy_node = oldtbl->copy_node;
 	newtbl->known_gates = oldtbl->known_gates;
 	atomic_set(&newtbl->entries, atomic_read(&oldtbl->entries));
@@ -585,7 +584,7 @@ struct mesh_path *mesh_path_add(struct ieee80211_sub_if_data *sdata,
 
 	hlist_add_head_rcu(&new_node->list, bucket);
 	if (atomic_inc_return(&tbl->entries) >=
-	    tbl->mean_chain_len * (tbl->hash_mask + 1))
+	    MEAN_CHAIN_LEN * (tbl->hash_mask + 1))
 		grow = 1;
 
 	mesh_paths_generation++;
@@ -714,7 +713,7 @@ int mpp_path_add(struct ieee80211_sub_if_data *sdata,
 
 	hlist_add_head_rcu(&new_node->list, bucket);
 	if (atomic_inc_return(&tbl->entries) >=
-	    tbl->mean_chain_len * (tbl->hash_mask + 1))
+	    MEAN_CHAIN_LEN * (tbl->hash_mask + 1))
 		grow = 1;
 
 	spin_unlock(&tbl->hashwlock[hash_idx]);
@@ -1076,7 +1075,6 @@ int mesh_pathtbl_init(void)
 		return -ENOMEM;
 	tbl_path->free_node = &mesh_path_node_free;
 	tbl_path->copy_node = &mesh_path_node_copy;
-	tbl_path->mean_chain_len = MEAN_CHAIN_LEN;
 	tbl_path->known_gates = kzalloc(sizeof(struct hlist_head), GFP_ATOMIC);
 	if (!tbl_path->known_gates) {
 		ret = -ENOMEM;
@@ -1092,7 +1090,6 @@ int mesh_pathtbl_init(void)
 	}
 	tbl_mpp->free_node = &mesh_path_node_free;
 	tbl_mpp->copy_node = &mesh_path_node_copy;
-	tbl_mpp->mean_chain_len = MEAN_CHAIN_LEN;
 	tbl_mpp->known_gates = kzalloc(sizeof(struct hlist_head), GFP_ATOMIC);
 	if (!tbl_mpp->known_gates) {
 		ret = -ENOMEM;
-- 
cgit v1.2.3-59-g8ed1b


From dfdfc2beb0dd7e3a067d2eeacb4623cb48e77658 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Tue, 26 Jan 2016 17:11:13 +0100
Subject: mac80211: Parse legacy and HT rate in injected frames

Drivers/devices without their own rate control algorithm can get the
information what rates they should use from either the radiotap header of
injected frames or from the rate control algorithm. But the parsing of the
legacy rate information from the radiotap header was removed in commit
e6a9854b05c1 ("mac80211/drivers: rewrite the rate control API").

The removal of this feature heavily reduced the usefulness of frame
injection when wanting to simulate specific transmission behavior. Having
rate parsing together with MCS rates and retry support allows a fine
grained selection of the tx behavior of injected frames for these kind of
tests.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Cc: Simon Wunderlich <sw@simonwunderlich.de>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 Documentation/networking/mac80211-injection.txt | 17 ++++++
 include/net/mac80211.h                          |  2 +
 net/mac80211/tx.c                               | 72 ++++++++++++++++++++++++-
 3 files changed, 89 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/Documentation/networking/mac80211-injection.txt b/Documentation/networking/mac80211-injection.txt
index 3a930072b161..ec8f934c2eb2 100644
--- a/Documentation/networking/mac80211-injection.txt
+++ b/Documentation/networking/mac80211-injection.txt
@@ -28,6 +28,23 @@ radiotap headers and used to control injection:
    IEEE80211_RADIOTAP_F_TX_NOACK: frame should be sent without waiting for
 				  an ACK even if it is a unicast frame
 
+ * IEEE80211_RADIOTAP_RATE
+
+   legacy rate for the transmission (only for devices without own rate control)
+
+ * IEEE80211_RADIOTAP_MCS
+
+   HT rate for the transmission (only for devices without own rate control).
+   Also some flags are parsed
+
+   IEEE80211_TX_RC_SHORT_GI: use short guard interval
+   IEEE80211_TX_RC_40_MHZ_WIDTH: send in HT40 mode
+
+ * IEEE80211_RADIOTAP_DATA_RETRIES
+
+   number of retries when either IEEE80211_RADIOTAP_RATE or
+   IEEE80211_RADIOTAP_MCS was used
+
 The injection code can also skip all other currently defined radiotap fields
 facilitating replay of captured radiotap headers directly.
 
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 31337f81ec03..dbcd69a6bfda 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -708,12 +708,14 @@ enum mac80211_tx_info_flags {
  *	protocol frame (e.g. EAP)
  * @IEEE80211_TX_CTRL_PS_RESPONSE: This frame is a response to a poll
  *	frame (PS-Poll or uAPSD).
+ * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information
  *
  * These flags are used in tx_info->control.flags.
  */
 enum mac80211_tx_control_flags {
 	IEEE80211_TX_CTRL_PORT_CTRL_PROTO	= BIT(0),
 	IEEE80211_TX_CTRL_PS_RESPONSE		= BIT(1),
+	IEEE80211_TX_CTRL_RATE_INJECT		= BIT(2),
 };
 
 /*
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 3311ce0f3d6c..723cd7aa8953 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -710,6 +710,10 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx)
 
 	info->control.short_preamble = txrc.short_preamble;
 
+	/* don't ask rate control when rate already injected via radiotap */
+	if (info->control.flags & IEEE80211_TX_CTRL_RATE_INJECT)
+		return TX_CONTINUE;
+
 	if (tx->sta)
 		assoc = test_sta_flag(tx->sta, WLAN_STA_ASSOC);
 
@@ -1665,15 +1669,24 @@ void ieee80211_xmit(struct ieee80211_sub_if_data *sdata,
 	ieee80211_tx(sdata, sta, skb, false);
 }
 
-static bool ieee80211_parse_tx_radiotap(struct sk_buff *skb)
+static bool ieee80211_parse_tx_radiotap(struct ieee80211_local *local,
+					struct sk_buff *skb)
 {
 	struct ieee80211_radiotap_iterator iterator;
 	struct ieee80211_radiotap_header *rthdr =
 		(struct ieee80211_radiotap_header *) skb->data;
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_supported_band *sband =
+		local->hw.wiphy->bands[info->band];
 	int ret = ieee80211_radiotap_iterator_init(&iterator, rthdr, skb->len,
 						   NULL);
 	u16 txflags;
+	u16 rate = 0;
+	bool rate_found = false;
+	u8 rate_retries = 0;
+	u16 rate_flags = 0;
+	u8 mcs_known, mcs_flags;
+	int i;
 
 	info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT |
 		       IEEE80211_TX_CTL_DONTFRAG;
@@ -1724,6 +1737,35 @@ static bool ieee80211_parse_tx_radiotap(struct sk_buff *skb)
 				info->flags |= IEEE80211_TX_CTL_NO_ACK;
 			break;
 
+		case IEEE80211_RADIOTAP_RATE:
+			rate = *iterator.this_arg;
+			rate_flags = 0;
+			rate_found = true;
+			break;
+
+		case IEEE80211_RADIOTAP_DATA_RETRIES:
+			rate_retries = *iterator.this_arg;
+			break;
+
+		case IEEE80211_RADIOTAP_MCS:
+			mcs_known = iterator.this_arg[0];
+			mcs_flags = iterator.this_arg[1];
+			if (!(mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_MCS))
+				break;
+
+			rate_found = true;
+			rate = iterator.this_arg[2];
+			rate_flags = IEEE80211_TX_RC_MCS;
+
+			if (mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_GI &&
+			    mcs_flags & IEEE80211_RADIOTAP_MCS_SGI)
+				rate_flags |= IEEE80211_TX_RC_SHORT_GI;
+
+			if (mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_BW &&
+			    mcs_flags & IEEE80211_RADIOTAP_MCS_BW_40)
+				rate_flags |= IEEE80211_TX_RC_40_MHZ_WIDTH;
+			break;
+
 		/*
 		 * Please update the file
 		 * Documentation/networking/mac80211-injection.txt
@@ -1738,6 +1780,32 @@ static bool ieee80211_parse_tx_radiotap(struct sk_buff *skb)
 	if (ret != -ENOENT) /* ie, if we didn't simply run out of fields */
 		return false;
 
+	if (rate_found) {
+		info->control.flags |= IEEE80211_TX_CTRL_RATE_INJECT;
+
+		for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) {
+			info->control.rates[i].idx = -1;
+			info->control.rates[i].flags = 0;
+			info->control.rates[i].count = 0;
+		}
+
+		if (rate_flags & IEEE80211_TX_RC_MCS) {
+			info->control.rates[0].idx = rate;
+		} else {
+			for (i = 0; i < sband->n_bitrates; i++) {
+				if (rate * 5 != sband->bitrates[i].bitrate)
+					continue;
+
+				info->control.rates[0].idx = i;
+				break;
+			}
+		}
+
+		info->control.rates[0].flags = rate_flags;
+		info->control.rates[0].count = min_t(u8, rate_retries + 1,
+						     local->hw.max_rate_tries);
+	}
+
 	/*
 	 * remove the radiotap header
 	 * iterator->_max_length was sanity-checked against
@@ -1819,7 +1887,7 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb,
 		      IEEE80211_TX_CTL_INJECTED;
 
 	/* process and remove the injection radiotap header */
-	if (!ieee80211_parse_tx_radiotap(skb))
+	if (!ieee80211_parse_tx_radiotap(local, skb))
 		goto fail;
 
 	rcu_read_lock();
-- 
cgit v1.2.3-59-g8ed1b


From f2ac7e301ae6397669ff3f79e691942a9b5d2f39 Mon Sep 17 00:00:00 2001
From: Michal Kazior <michal.kazior@tieto.com>
Date: Wed, 27 Jan 2016 15:26:12 +0100
Subject: mac80211: expose txq queue depth and size to drivers

This will allow drivers to make more educated
decisions whether to defer transmission or not.

Relying on wake_tx_queue() call count implicitly
was not possible because it could be called
without queued frame count actually changing on
software tx aggregation start/stop code paths.

It was also not possible to know how long
byte-wise queue was without dequeueing.

Signed-off-by: Michal Kazior <michal.kazior@tieto.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     | 15 +++++++++++++++
 net/mac80211/ieee80211_i.h |  1 +
 net/mac80211/iface.c       |  1 +
 net/mac80211/sta_info.c    |  1 +
 net/mac80211/tx.c          |  8 +++++++-
 net/mac80211/util.c        | 14 ++++++++++++++
 6 files changed, 39 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index dbcd69a6bfda..fd35fc4d7127 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -5596,4 +5596,19 @@ void ieee80211_unreserve_tid(struct ieee80211_sta *sta, u8 tid);
  */
 struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 				     struct ieee80211_txq *txq);
+
+/**
+ * ieee80211_txq_get_depth - get pending frame/byte count of given txq
+ *
+ * The values are not guaranteed to be coherent with regard to each other, i.e.
+ * txq state can change half-way of this function and the caller may end up
+ * with "new" frame_cnt and "old" byte_cnt or vice-versa.
+ *
+ * @txq: pointer obtained from station or virtual interface
+ * @frame_cnt: pointer to store frame count
+ * @byte_cnt: pointer to store byte count
+ */
+void ieee80211_txq_get_depth(struct ieee80211_txq *txq,
+			     unsigned long *frame_cnt,
+			     unsigned long *byte_cnt);
 #endif /* MAC80211_H */
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index a29f61dc9c06..a96f8c0461f6 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -804,6 +804,7 @@ enum txq_info_flags {
 struct txq_info {
 	struct sk_buff_head queue;
 	unsigned long flags;
+	unsigned long byte_cnt;
 
 	/* keep last! */
 	struct ieee80211_txq txq;
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 0451f120746e..453b4e741780 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -979,6 +979,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
 
 		spin_lock_bh(&txqi->queue.lock);
 		ieee80211_purge_tx_queue(&local->hw, &txqi->queue);
+		txqi->byte_cnt = 0;
 		spin_unlock_bh(&txqi->queue.lock);
 
 		atomic_set(&sdata->txqs_len[txqi->txq.ac], 0);
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index b28e7a220d56..5894c0a1c01f 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -116,6 +116,7 @@ static void __cleanup_single_sta(struct sta_info *sta)
 
 			ieee80211_purge_tx_queue(&local->hw, &txqi->queue);
 			atomic_sub(n, &sdata->txqs_len[txqi->txq.ac]);
+			txqi->byte_cnt = 0;
 		}
 	}
 
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 723cd7aa8953..a5aa275d0434 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1270,7 +1270,11 @@ static void ieee80211_drv_tx(struct ieee80211_local *local,
 	if (atomic_read(&sdata->txqs_len[ac]) >= local->hw.txq_ac_max_pending)
 		netif_stop_subqueue(sdata->dev, ac);
 
-	skb_queue_tail(&txqi->queue, skb);
+	spin_lock_bh(&txqi->queue.lock);
+	txqi->byte_cnt += skb->len;
+	__skb_queue_tail(&txqi->queue, skb);
+	spin_unlock_bh(&txqi->queue.lock);
+
 	drv_wake_tx_queue(local, txqi);
 
 	return;
@@ -1298,6 +1302,8 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 	if (!skb)
 		goto out;
 
+	txqi->byte_cnt -= skb->len;
+
 	atomic_dec(&sdata->txqs_len[ac]);
 	if (__netif_subqueue_stopped(sdata->dev, ac))
 		ieee80211_propagate_queue_wake(local, sdata->vif.hw_queue[ac]);
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index fb90d9c5df59..091f3dd62ad1 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -3368,3 +3368,17 @@ void ieee80211_init_tx_queue(struct ieee80211_sub_if_data *sdata,
 		txqi->txq.ac = IEEE80211_AC_BE;
 	}
 }
+
+void ieee80211_txq_get_depth(struct ieee80211_txq *txq,
+			     unsigned long *frame_cnt,
+			     unsigned long *byte_cnt)
+{
+	struct txq_info *txqi = to_txq_info(txq);
+
+	if (frame_cnt)
+		*frame_cnt = txqi->queue.qlen;
+
+	if (byte_cnt)
+		*byte_cnt = txqi->byte_cnt;
+}
+EXPORT_SYMBOL(ieee80211_txq_get_depth);
-- 
cgit v1.2.3-59-g8ed1b


From fb4ea054857ec719dba84c3b5a7c4ee50d42bd5b Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 28 Jan 2016 16:19:24 +0200
Subject: mac80211: change ieee80211_rx_reorder_ready() arguments

Clean up ieee80211_rx_reorder_ready() callers by passing the RX
TID struct and the index, instead of the frames list. This will
make it more extensible as well.

While at it, move the inline to rx.c as it's only used there.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/ieee80211_i.h | 14 --------------
 net/mac80211/rx.c          | 33 ++++++++++++++++++++++++---------
 2 files changed, 24 insertions(+), 23 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index a96f8c0461f6..a49c10361f1c 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1838,20 +1838,6 @@ static inline void ieee802_11_parse_elems(const u8 *start, size_t len,
 	ieee802_11_parse_elems_crc(start, len, action, elems, 0, 0);
 }
 
-static inline bool ieee80211_rx_reorder_ready(struct sk_buff_head *frames)
-{
-	struct sk_buff *tail = skb_peek_tail(frames);
-	struct ieee80211_rx_status *status;
-
-	if (!tail)
-		return false;
-
-	status = IEEE80211_SKB_RXCB(tail);
-	if (status->flag & RX_FLAG_AMSDU_MORE)
-		return false;
-
-	return true;
-}
 
 extern const int ieee802_1d_to_ac[8];
 
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 664e8861edbe..1153871b570f 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -4,6 +4,7 @@
  * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
  * Copyright 2007-2010	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
+ * Copyright(c) 2015 - 2016 Intel Deutschland GmbH
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -798,6 +799,23 @@ static ieee80211_rx_result ieee80211_rx_mesh_check(struct ieee80211_rx_data *rx)
 	return RX_CONTINUE;
 }
 
+static inline bool ieee80211_rx_reorder_ready(struct tid_ampdu_rx *tid_agg_rx,
+					      int index)
+{
+	struct sk_buff_head *frames = &tid_agg_rx->reorder_buf[index];
+	struct sk_buff *tail = skb_peek_tail(frames);
+	struct ieee80211_rx_status *status;
+
+	if (!tail)
+		return false;
+
+	status = IEEE80211_SKB_RXCB(tail);
+	if (status->flag & RX_FLAG_AMSDU_MORE)
+		return false;
+
+	return true;
+}
+
 static void ieee80211_release_reorder_frame(struct ieee80211_sub_if_data *sdata,
 					    struct tid_ampdu_rx *tid_agg_rx,
 					    int index,
@@ -812,7 +830,7 @@ static void ieee80211_release_reorder_frame(struct ieee80211_sub_if_data *sdata,
 	if (skb_queue_empty(skb_list))
 		goto no_frame;
 
-	if (!ieee80211_rx_reorder_ready(skb_list)) {
+	if (!ieee80211_rx_reorder_ready(tid_agg_rx, index)) {
 		__skb_queue_purge(skb_list);
 		goto no_frame;
 	}
@@ -866,7 +884,7 @@ static void ieee80211_sta_reorder_release(struct ieee80211_sub_if_data *sdata,
 
 	/* release the buffer until next missing frame */
 	index = tid_agg_rx->head_seq_num % tid_agg_rx->buf_size;
-	if (!ieee80211_rx_reorder_ready(&tid_agg_rx->reorder_buf[index]) &&
+	if (!ieee80211_rx_reorder_ready(tid_agg_rx, index) &&
 	    tid_agg_rx->stored_mpdu_num) {
 		/*
 		 * No buffers ready to be released, but check whether any
@@ -875,8 +893,7 @@ static void ieee80211_sta_reorder_release(struct ieee80211_sub_if_data *sdata,
 		int skipped = 1;
 		for (j = (index + 1) % tid_agg_rx->buf_size; j != index;
 		     j = (j + 1) % tid_agg_rx->buf_size) {
-			if (!ieee80211_rx_reorder_ready(
-					&tid_agg_rx->reorder_buf[j])) {
+			if (!ieee80211_rx_reorder_ready(tid_agg_rx, j)) {
 				skipped++;
 				continue;
 			}
@@ -903,8 +920,7 @@ static void ieee80211_sta_reorder_release(struct ieee80211_sub_if_data *sdata,
 				 skipped) & IEEE80211_SN_MASK;
 			skipped = 0;
 		}
-	} else while (ieee80211_rx_reorder_ready(
-				&tid_agg_rx->reorder_buf[index])) {
+	} else while (ieee80211_rx_reorder_ready(tid_agg_rx, index)) {
 		ieee80211_release_reorder_frame(sdata, tid_agg_rx, index,
 						frames);
 		index =	tid_agg_rx->head_seq_num % tid_agg_rx->buf_size;
@@ -915,8 +931,7 @@ static void ieee80211_sta_reorder_release(struct ieee80211_sub_if_data *sdata,
 
 		for (; j != (index - 1) % tid_agg_rx->buf_size;
 		     j = (j + 1) % tid_agg_rx->buf_size) {
-			if (ieee80211_rx_reorder_ready(
-					&tid_agg_rx->reorder_buf[j]))
+			if (ieee80211_rx_reorder_ready(tid_agg_rx, j))
 				break;
 		}
 
@@ -987,7 +1002,7 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_sub_if_data *sdata
 	index = mpdu_seq_num % tid_agg_rx->buf_size;
 
 	/* check if we already stored this frame */
-	if (ieee80211_rx_reorder_ready(&tid_agg_rx->reorder_buf[index])) {
+	if (ieee80211_rx_reorder_ready(tid_agg_rx, index)) {
 		dev_kfree_skb(skb);
 		goto out;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 06470f7468c8b6c95e72ebda803a61a99f4ee446 Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Thu, 28 Jan 2016 16:19:25 +0200
Subject: mac80211: add API to allow filtering frames in BA sessions

If any frames are dropped that are part of a BA session, the reorder
buffer will "indefinitely" (until the timeout) wait for them to come
in (or a BAR moving the window) and won't release frames after them.
This means it isn't possible to filter frames within a BA session in
firmware.

Introduce an API function that allows such filtering. Calling this
function will move the BA window forward to the new SSN, and allows
marking frames after the SSN as having been filtered, so any future
reordering activity will release frames while skipping the holes.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h  | 20 +++++++++++-
 net/mac80211/agg-rx.c   |  1 +
 net/mac80211/rx.c       | 84 +++++++++++++++++++++++++++++++++++++++++++++++++
 net/mac80211/sta_info.h |  3 ++
 4 files changed, 107 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index fd35fc4d7127..57147749ae42 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -5,7 +5,7 @@
  * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
  * Copyright 2007-2010	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
- * Copyright (C) 2015 Intel Deutschland GmbH
+ * Copyright (C) 2015 - 2016 Intel Deutschland GmbH
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -5193,6 +5193,24 @@ void ieee80211_remain_on_channel_expired(struct ieee80211_hw *hw);
 void ieee80211_stop_rx_ba_session(struct ieee80211_vif *vif, u16 ba_rx_bitmap,
 				  const u8 *addr);
 
+/**
+ * ieee80211_mark_rx_ba_filtered_frames - move RX BA window and mark filtered
+ * @pubsta: station struct
+ * @tid: the session's TID
+ * @ssn: starting sequence number of the bitmap, all frames before this are
+ *	assumed to be out of the window after the call
+ * @filtered: bitmap of filtered frames, BIT(0) is the @ssn entry etc.
+ * @received_mpdus: number of received mpdus in firmware
+ *
+ * This function moves the BA window and releases all frames before @ssn, and
+ * marks frames marked in the bitmap as having been filtered. Afterwards, it
+ * checks if any frames in the window starting from @ssn can now be released
+ * (in case they were only waiting for frames that were filtered.)
+ */
+void ieee80211_mark_rx_ba_filtered_frames(struct ieee80211_sta *pubsta, u8 tid,
+					  u16 ssn, u64 filtered,
+					  u16 received_mpdus);
+
 /**
  * ieee80211_send_bar - send a BlockAckReq frame
  *
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index 2ab54791281d..1b8a5caa221e 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -376,6 +376,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
 	tid_agg_rx->timeout = timeout;
 	tid_agg_rx->stored_mpdu_num = 0;
 	tid_agg_rx->auto_seq = auto_seq;
+	tid_agg_rx->reorder_buf_filtered = 0;
 	status = WLAN_STATUS_SUCCESS;
 
 	/* activate it for RX */
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 1153871b570f..9fb7074f0280 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -19,6 +19,7 @@
 #include <linux/etherdevice.h>
 #include <linux/rcupdate.h>
 #include <linux/export.h>
+#include <linux/bitops.h>
 #include <net/mac80211.h>
 #include <net/ieee80211_radiotap.h>
 #include <asm/unaligned.h>
@@ -806,6 +807,9 @@ static inline bool ieee80211_rx_reorder_ready(struct tid_ampdu_rx *tid_agg_rx,
 	struct sk_buff *tail = skb_peek_tail(frames);
 	struct ieee80211_rx_status *status;
 
+	if (tid_agg_rx->reorder_buf_filtered & BIT_ULL(index))
+		return true;
+
 	if (!tail)
 		return false;
 
@@ -844,6 +848,7 @@ static void ieee80211_release_reorder_frame(struct ieee80211_sub_if_data *sdata,
 	}
 
 no_frame:
+	tid_agg_rx->reorder_buf_filtered &= ~BIT_ULL(index);
 	tid_agg_rx->head_seq_num = ieee80211_sn_inc(tid_agg_rx->head_seq_num);
 }
 
@@ -3300,6 +3305,85 @@ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid)
 	ieee80211_rx_handlers(&rx, &frames);
 }
 
+void ieee80211_mark_rx_ba_filtered_frames(struct ieee80211_sta *pubsta, u8 tid,
+					  u16 ssn, u64 filtered,
+					  u16 received_mpdus)
+{
+	struct sta_info *sta;
+	struct tid_ampdu_rx *tid_agg_rx;
+	struct sk_buff_head frames;
+	struct ieee80211_rx_data rx = {
+		/* This is OK -- must be QoS data frame */
+		.security_idx = tid,
+		.seqno_idx = tid,
+	};
+	int i, diff;
+
+	if (WARN_ON(!pubsta || tid >= IEEE80211_NUM_TIDS))
+		return;
+
+	__skb_queue_head_init(&frames);
+
+	sta = container_of(pubsta, struct sta_info, sta);
+
+	rx.sta = sta;
+	rx.sdata = sta->sdata;
+	rx.local = sta->local;
+
+	rcu_read_lock();
+	tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]);
+	if (!tid_agg_rx)
+		goto out;
+
+	spin_lock_bh(&tid_agg_rx->reorder_lock);
+
+	if (received_mpdus >= IEEE80211_SN_MODULO >> 1) {
+		int release;
+
+		/* release all frames in the reorder buffer */
+		release = (tid_agg_rx->head_seq_num + tid_agg_rx->buf_size) %
+			   IEEE80211_SN_MODULO;
+		ieee80211_release_reorder_frames(sta->sdata, tid_agg_rx,
+						 release, &frames);
+		/* update ssn to match received ssn */
+		tid_agg_rx->head_seq_num = ssn;
+	} else {
+		ieee80211_release_reorder_frames(sta->sdata, tid_agg_rx, ssn,
+						 &frames);
+	}
+
+	/* handle the case that received ssn is behind the mac ssn.
+	 * it can be tid_agg_rx->buf_size behind and still be valid */
+	diff = (tid_agg_rx->head_seq_num - ssn) & IEEE80211_SN_MASK;
+	if (diff >= tid_agg_rx->buf_size) {
+		tid_agg_rx->reorder_buf_filtered = 0;
+		goto release;
+	}
+	filtered = filtered >> diff;
+	ssn += diff;
+
+	/* update bitmap */
+	for (i = 0; i < tid_agg_rx->buf_size; i++) {
+		int index = (ssn + i) % tid_agg_rx->buf_size;
+
+		tid_agg_rx->reorder_buf_filtered &= ~BIT_ULL(index);
+		if (filtered & BIT_ULL(i))
+			tid_agg_rx->reorder_buf_filtered |= BIT_ULL(index);
+	}
+
+	/* now process also frames that the filter marking released */
+	ieee80211_sta_reorder_release(sta->sdata, tid_agg_rx, &frames);
+
+release:
+	spin_unlock_bh(&tid_agg_rx->reorder_lock);
+
+	ieee80211_rx_handlers(&rx, &frames);
+
+ out:
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL(ieee80211_mark_rx_ba_filtered_frames);
+
 /* main receive path */
 
 static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx)
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index f4d38994ecee..053f5c4fa495 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -168,6 +168,8 @@ struct tid_ampdu_tx {
  *
  * @reorder_buf: buffer to reorder incoming aggregated MPDUs. An MPDU may be an
  *	A-MSDU with individually reported subframes.
+ * @reorder_buf_filtered: bitmap indicating where there are filtered frames in
+ *	the reorder buffer that should be ignored when releasing frames
  * @reorder_time: jiffies when skb was added
  * @session_timer: check if peer keeps Tx-ing on the TID (by timeout value)
  * @reorder_timer: releases expired frames from the reorder buffer.
@@ -195,6 +197,7 @@ struct tid_ampdu_tx {
 struct tid_ampdu_rx {
 	struct rcu_head rcu_head;
 	spinlock_t reorder_lock;
+	u64 reorder_buf_filtered;
 	struct sk_buff_head *reorder_buf;
 	unsigned long *reorder_time;
 	struct timer_list session_timer;
-- 
cgit v1.2.3-59-g8ed1b


From 23665aaf9170ae6328cc4f68250c529a628af2ab Mon Sep 17 00:00:00 2001
From: Jouni Malinen <jouni@qca.qualcomm.com>
Date: Mon, 1 Feb 2016 11:40:55 +0200
Subject: mac80211: Interoperability workaround for 80+80 and 160 MHz channels

Number of deployed 80 MHz capable VHT stations that do not support 80+80
and 160 MHz bandwidths seem to misbehave when trying to connect to an AP
that advertises 80+80 or 160 MHz channel bandwidth in the VHT Operation
element. To avoid such issues with deployed devices, modify the design
based on recently accepted IEEE 802.11 standard changes (*).

This allows poorly implemented VHT 80 MHz stations to connect with the
AP in 80 MHz mode. 80+80 and 160 MHz capable stations need to support
the new workaround mechanism to allow full bandwidth to be used.
However, there are more or less no impacted station with 80+80/160
capability deployed.

The rebased version of this patch is based on the updated version from
Johannes Berg to take the HT/VHT chandef refactoring into account.

(*) Changes in
https://mentor.ieee.org/802.11/dcn/15/11-15-1530-04-000m-vht160-operation-signaling-through-non-zero-ccfs1.docx
were accepted during the IEEE 802.11 January 2016 meeting.

Signed-off-by: Jouni Malinen <jouni@qca.qualcomm.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/util.c | 31 +++++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 091f3dd62ad1..f1e5b76eda70 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -2364,10 +2364,23 @@ u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap,
 
 	switch (chandef->width) {
 	case NL80211_CHAN_WIDTH_160:
-		vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_160MHZ;
+		/*
+		 * Convert 160 MHz channel width to new style as interop
+		 * workaround.
+		 */
+		vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ;
+		vht_oper->center_freq_seg2_idx = vht_oper->center_freq_seg1_idx;
+		if (chandef->chan->center_freq < chandef->center_freq1)
+			vht_oper->center_freq_seg1_idx -= 8;
+		else
+			vht_oper->center_freq_seg1_idx += 8;
 		break;
 	case NL80211_CHAN_WIDTH_80P80:
-		vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80P80MHZ;
+		/*
+		 * Convert 80+80 MHz channel width to new style as interop
+		 * workaround.
+		 */
+		vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ;
 		break;
 	case NL80211_CHAN_WIDTH_80:
 		vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ;
@@ -2430,6 +2443,20 @@ bool ieee80211_chandef_vht_oper(const struct ieee80211_vht_operation *oper,
 	case IEEE80211_VHT_CHANWIDTH_80MHZ:
 		new.width = NL80211_CHAN_WIDTH_80;
 		new.center_freq1 = cf1;
+		/* If needed, adjust based on the newer interop workaround. */
+		if (oper->center_freq_seg2_idx) {
+			unsigned int diff;
+
+			diff = abs(oper->center_freq_seg2_idx -
+				   oper->center_freq_seg1_idx);
+			if (diff == 8) {
+				new.width = NL80211_CHAN_WIDTH_160;
+				new.center_freq1 = cf2;
+			} else if (diff > 8) {
+				new.width = NL80211_CHAN_WIDTH_80P80;
+				new.center_freq2 = cf2;
+			}
+		}
 		break;
 	case IEEE80211_VHT_CHANWIDTH_160MHZ:
 		new.width = NL80211_CHAN_WIDTH_160;
-- 
cgit v1.2.3-59-g8ed1b


From ea32f065bd3e3e09f0bcb3042f1664caf6b3e233 Mon Sep 17 00:00:00 2001
From: Sudip Mukherjee <sudip@vectorindia.org>
Date: Tue, 2 Feb 2016 13:21:14 +0530
Subject: mac80211: fix memory leak

On error we jumped to the error label and returned the error code but we
missed releasing sinfo.

Fixes: 5fe74014172d ("mac80211: avoid excessive stack usage in sta_info")
Reviewed-by: Julian Calaby <julian.calaby@gmail.com>
Signed-off-by: Sudip Mukherjee <sudip@vectorindia.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/sta_info.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 5894c0a1c01f..87b7e7a7df6c 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -562,6 +562,7 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU)
 	__cleanup_single_sta(sta);
  out_err:
 	mutex_unlock(&local->sta_mtx);
+	kfree(sinfo);
 	rcu_read_lock();
 	return err;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 88665f5a797a832ff7926ad2287ee41738bc09b9 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@openwrt.org>
Date: Tue, 2 Feb 2016 14:39:08 +0100
Subject: mac80211: move A-MSDU skb_linearize call to ieee80211_amsdu_to_8023s

Prepararation for zero-copy A-MSDU support with page fragment SKBs

Signed-off-by: Felix Fietkau <nbd@openwrt.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/rx.c   | 3 ---
 net/wireless/util.c | 3 +++
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 9fb7074f0280..44192143ebdf 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -2223,9 +2223,6 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx)
 	skb->dev = dev;
 	__skb_queue_head_init(&frame_list);
 
-	if (skb_linearize(skb))
-		return RX_DROP_UNUSABLE;
-
 	ieee80211_amsdu_to_8023s(skb, &frame_list, dev->dev_addr,
 				 rx->sdata->vif.type,
 				 rx->local->hw.extra_tx_headroom, true);
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 92770427b211..6a407699a900 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -657,6 +657,9 @@ void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list,
 	int remaining, err;
 	u8 dst[ETH_ALEN], src[ETH_ALEN];
 
+	if (skb_linearize(skb))
+		goto out;
+
 	if (has_80211_header) {
 		err = ieee80211_data_to_8023(skb, addr, iftype);
 		if (err)
-- 
cgit v1.2.3-59-g8ed1b


From 2d1c304cb2d5cf28d8362496bd9ea6aadf4f01d2 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@openwrt.org>
Date: Tue, 2 Feb 2016 14:39:09 +0100
Subject: cfg80211: add function for 802.3 conversion with separate output
 buffer

Use skb_copy_bits in preparation for allowing fragmented skbs

Signed-off-by: Felix Fietkau <nbd@openwrt.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/util.c | 106 ++++++++++++++++++++++++++--------------------------
 1 file changed, 53 insertions(+), 53 deletions(-)

(limited to 'net')

diff --git a/net/wireless/util.c b/net/wireless/util.c
index 6a407699a900..f75f6010eb08 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -393,9 +393,9 @@ unsigned int ieee80211_get_hdrlen_from_skb(const struct sk_buff *skb)
 }
 EXPORT_SYMBOL(ieee80211_get_hdrlen_from_skb);
 
-unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr)
+static unsigned int __ieee80211_get_mesh_hdrlen(u8 flags)
 {
-	int ae = meshhdr->flags & MESH_FLAGS_AE;
+	int ae = flags & MESH_FLAGS_AE;
 	/* 802.11-2012, 8.2.4.7.3 */
 	switch (ae) {
 	default:
@@ -407,21 +407,31 @@ unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr)
 		return 18;
 	}
 }
+
+unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr)
+{
+	return __ieee80211_get_mesh_hdrlen(meshhdr->flags);
+}
 EXPORT_SYMBOL(ieee80211_get_mesh_hdrlen);
 
-int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr,
-			   enum nl80211_iftype iftype)
+static int __ieee80211_data_to_8023(struct sk_buff *skb, struct ethhdr *ehdr,
+				    const u8 *addr, enum nl80211_iftype iftype)
 {
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
-	u16 hdrlen, ethertype;
-	u8 *payload;
-	u8 dst[ETH_ALEN];
-	u8 src[ETH_ALEN] __aligned(2);
+	struct {
+		u8 hdr[ETH_ALEN] __aligned(2);
+		__be16 proto;
+	} payload;
+	struct ethhdr tmp;
+	u16 hdrlen;
+	u8 mesh_flags = 0;
 
 	if (unlikely(!ieee80211_is_data_present(hdr->frame_control)))
 		return -1;
 
 	hdrlen = ieee80211_hdrlen(hdr->frame_control);
+	if (skb->len < hdrlen + 8)
+		return -1;
 
 	/* convert IEEE 802.11 header + possible LLC headers into Ethernet
 	 * header
@@ -432,8 +442,11 @@ int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr,
 	 *   1     0   BSSID SA    DA    n/a
 	 *   1     1   RA    TA    DA    SA
 	 */
-	memcpy(dst, ieee80211_get_DA(hdr), ETH_ALEN);
-	memcpy(src, ieee80211_get_SA(hdr), ETH_ALEN);
+	memcpy(tmp.h_dest, ieee80211_get_DA(hdr), ETH_ALEN);
+	memcpy(tmp.h_source, ieee80211_get_SA(hdr), ETH_ALEN);
+
+	if (iftype == NL80211_IFTYPE_MESH_POINT)
+		skb_copy_bits(skb, hdrlen, &mesh_flags, 1);
 
 	switch (hdr->frame_control &
 		cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) {
@@ -450,44 +463,31 @@ int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr,
 			     iftype != NL80211_IFTYPE_STATION))
 			return -1;
 		if (iftype == NL80211_IFTYPE_MESH_POINT) {
-			struct ieee80211s_hdr *meshdr =
-				(struct ieee80211s_hdr *) (skb->data + hdrlen);
-			/* make sure meshdr->flags is on the linear part */
-			if (!pskb_may_pull(skb, hdrlen + 1))
-				return -1;
-			if (meshdr->flags & MESH_FLAGS_AE_A4)
+			if (mesh_flags & MESH_FLAGS_AE_A4)
 				return -1;
-			if (meshdr->flags & MESH_FLAGS_AE_A5_A6) {
+			if (mesh_flags & MESH_FLAGS_AE_A5_A6) {
 				skb_copy_bits(skb, hdrlen +
 					offsetof(struct ieee80211s_hdr, eaddr1),
-				       	dst, ETH_ALEN);
-				skb_copy_bits(skb, hdrlen +
-					offsetof(struct ieee80211s_hdr, eaddr2),
-				        src, ETH_ALEN);
+					tmp.h_dest, 2 * ETH_ALEN);
 			}
-			hdrlen += ieee80211_get_mesh_hdrlen(meshdr);
+			hdrlen += __ieee80211_get_mesh_hdrlen(mesh_flags);
 		}
 		break;
 	case cpu_to_le16(IEEE80211_FCTL_FROMDS):
 		if ((iftype != NL80211_IFTYPE_STATION &&
 		     iftype != NL80211_IFTYPE_P2P_CLIENT &&
 		     iftype != NL80211_IFTYPE_MESH_POINT) ||
-		    (is_multicast_ether_addr(dst) &&
-		     ether_addr_equal(src, addr)))
+		    (is_multicast_ether_addr(tmp.h_dest) &&
+		     ether_addr_equal(tmp.h_source, addr)))
 			return -1;
 		if (iftype == NL80211_IFTYPE_MESH_POINT) {
-			struct ieee80211s_hdr *meshdr =
-				(struct ieee80211s_hdr *) (skb->data + hdrlen);
-			/* make sure meshdr->flags is on the linear part */
-			if (!pskb_may_pull(skb, hdrlen + 1))
+			if (mesh_flags & MESH_FLAGS_AE_A5_A6)
 				return -1;
-			if (meshdr->flags & MESH_FLAGS_AE_A5_A6)
-				return -1;
-			if (meshdr->flags & MESH_FLAGS_AE_A4)
+			if (mesh_flags & MESH_FLAGS_AE_A4)
 				skb_copy_bits(skb, hdrlen +
 					offsetof(struct ieee80211s_hdr, eaddr1),
-					src, ETH_ALEN);
-			hdrlen += ieee80211_get_mesh_hdrlen(meshdr);
+					tmp.h_source, ETH_ALEN);
+			hdrlen += __ieee80211_get_mesh_hdrlen(mesh_flags);
 		}
 		break;
 	case cpu_to_le16(0):
@@ -498,33 +498,33 @@ int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr,
 		break;
 	}
 
-	if (!pskb_may_pull(skb, hdrlen + 8))
-		return -1;
-
-	payload = skb->data + hdrlen;
-	ethertype = (payload[6] << 8) | payload[7];
+	skb_copy_bits(skb, hdrlen, &payload, sizeof(payload));
+	tmp.h_proto = payload.proto;
 
-	if (likely((ether_addr_equal(payload, rfc1042_header) &&
-		    ethertype != ETH_P_AARP && ethertype != ETH_P_IPX) ||
-		   ether_addr_equal(payload, bridge_tunnel_header))) {
+	if (likely((ether_addr_equal(payload.hdr, rfc1042_header) &&
+		    tmp.h_proto != htons(ETH_P_AARP) &&
+		    tmp.h_proto != htons(ETH_P_IPX)) ||
+		   ether_addr_equal(payload.hdr, bridge_tunnel_header)))
 		/* remove RFC1042 or Bridge-Tunnel encapsulation and
 		 * replace EtherType */
-		skb_pull(skb, hdrlen + 6);
-		memcpy(skb_push(skb, ETH_ALEN), src, ETH_ALEN);
-		memcpy(skb_push(skb, ETH_ALEN), dst, ETH_ALEN);
-	} else {
-		struct ethhdr *ehdr;
-		__be16 len;
+		hdrlen += ETH_ALEN + 2;
+	else
+		tmp.h_proto = htons(skb->len);
+
+	pskb_pull(skb, hdrlen);
 
-		skb_pull(skb, hdrlen);
-		len = htons(skb->len);
+	if (!ehdr)
 		ehdr = (struct ethhdr *) skb_push(skb, sizeof(struct ethhdr));
-		memcpy(ehdr->h_dest, dst, ETH_ALEN);
-		memcpy(ehdr->h_source, src, ETH_ALEN);
-		ehdr->h_proto = len;
-	}
+	memcpy(ehdr, &tmp, sizeof(tmp));
+
 	return 0;
 }
+
+int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr,
+			   enum nl80211_iftype iftype)
+{
+	return __ieee80211_data_to_8023(skb, NULL, addr, iftype);
+}
 EXPORT_SYMBOL(ieee80211_data_to_8023);
 
 int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr,
-- 
cgit v1.2.3-59-g8ed1b


From 230fd28a91542f0abfe390bda74702a3504c3df6 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@openwrt.org>
Date: Tue, 2 Feb 2016 14:39:10 +0100
Subject: cfg80211: add support for non-linear skbs in ieee80211_amsdu_to_8023s

Signed-off-by: Felix Fietkau <nbd@openwrt.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/util.c | 100 +++++++++++++++++++++++++---------------------------
 1 file changed, 49 insertions(+), 51 deletions(-)

(limited to 'net')

diff --git a/net/wireless/util.c b/net/wireless/util.c
index f75f6010eb08..6e4eb3555177 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -644,73 +644,75 @@ int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr,
 }
 EXPORT_SYMBOL(ieee80211_data_from_8023);
 
+static struct sk_buff *
+__ieee80211_amsdu_copy(struct sk_buff *skb, unsigned int hlen,
+		       int offset, int len)
+{
+	struct sk_buff *frame;
+
+	if (skb->len - offset < len)
+		return NULL;
+
+	/*
+	 * Allocate and reserve two bytes more for payload
+	 * alignment since sizeof(struct ethhdr) is 14.
+	 */
+	frame = dev_alloc_skb(hlen + sizeof(struct ethhdr) + 2 + len);
+
+	skb_reserve(frame, hlen + sizeof(struct ethhdr) + 2);
+	skb_copy_bits(skb, offset, skb_put(frame, len), len);
+
+	return frame;
+}
 
 void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list,
 			      const u8 *addr, enum nl80211_iftype iftype,
 			      const unsigned int extra_headroom,
 			      bool has_80211_header)
 {
+	unsigned int hlen = ALIGN(extra_headroom, 4);
 	struct sk_buff *frame = NULL;
 	u16 ethertype;
 	u8 *payload;
-	const struct ethhdr *eth;
-	int remaining, err;
-	u8 dst[ETH_ALEN], src[ETH_ALEN];
-
-	if (skb_linearize(skb))
-		goto out;
+	int offset = 0, remaining, err;
+	struct ethhdr eth;
+	bool reuse_skb = true;
+	bool last = false;
 
 	if (has_80211_header) {
-		err = ieee80211_data_to_8023(skb, addr, iftype);
+		err = __ieee80211_data_to_8023(skb, &eth, addr, iftype);
 		if (err)
 			goto out;
-
-		/* skip the wrapping header */
-		eth = (struct ethhdr *) skb_pull(skb, sizeof(struct ethhdr));
-		if (!eth)
-			goto out;
-	} else {
-		eth = (struct ethhdr *) skb->data;
 	}
 
-	while (skb != frame) {
+	while (!last) {
+		unsigned int subframe_len;
+		int len;
 		u8 padding;
-		__be16 len = eth->h_proto;
-		unsigned int subframe_len = sizeof(struct ethhdr) + ntohs(len);
-
-		remaining = skb->len;
-		memcpy(dst, eth->h_dest, ETH_ALEN);
-		memcpy(src, eth->h_source, ETH_ALEN);
 
+		skb_copy_bits(skb, offset, &eth, sizeof(eth));
+		len = ntohs(eth.h_proto);
+		subframe_len = sizeof(struct ethhdr) + len;
 		padding = (4 - subframe_len) & 0x3;
+
 		/* the last MSDU has no padding */
+		remaining = skb->len - offset;
 		if (subframe_len > remaining)
 			goto purge;
 
-		skb_pull(skb, sizeof(struct ethhdr));
+		offset += sizeof(struct ethhdr);
 		/* reuse skb for the last subframe */
-		if (remaining <= subframe_len + padding)
+		last = remaining <= subframe_len + padding;
+		if (!skb_is_nonlinear(skb) && last) {
+			skb_pull(skb, offset);
 			frame = skb;
-		else {
-			unsigned int hlen = ALIGN(extra_headroom, 4);
-			/*
-			 * Allocate and reserve two bytes more for payload
-			 * alignment since sizeof(struct ethhdr) is 14.
-			 */
-			frame = dev_alloc_skb(hlen + subframe_len + 2);
+			reuse_skb = true;
+		} else {
+			frame = __ieee80211_amsdu_copy(skb, hlen, offset, len);
 			if (!frame)
 				goto purge;
 
-			skb_reserve(frame, hlen + sizeof(struct ethhdr) + 2);
-			memcpy(skb_put(frame, ntohs(len)), skb->data,
-				ntohs(len));
-
-			eth = (struct ethhdr *)skb_pull(skb, ntohs(len) +
-							padding);
-			if (!eth) {
-				dev_kfree_skb(frame);
-				goto purge;
-			}
+			offset += len + padding;
 		}
 
 		skb_reset_network_header(frame);
@@ -719,24 +721,20 @@ void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list,
 
 		payload = frame->data;
 		ethertype = (payload[6] << 8) | payload[7];
-
 		if (likely((ether_addr_equal(payload, rfc1042_header) &&
 			    ethertype != ETH_P_AARP && ethertype != ETH_P_IPX) ||
 			   ether_addr_equal(payload, bridge_tunnel_header))) {
-			/* remove RFC1042 or Bridge-Tunnel
-			 * encapsulation and replace EtherType */
-			skb_pull(frame, 6);
-			memcpy(skb_push(frame, ETH_ALEN), src, ETH_ALEN);
-			memcpy(skb_push(frame, ETH_ALEN), dst, ETH_ALEN);
-		} else {
-			memcpy(skb_push(frame, sizeof(__be16)), &len,
-				sizeof(__be16));
-			memcpy(skb_push(frame, ETH_ALEN), src, ETH_ALEN);
-			memcpy(skb_push(frame, ETH_ALEN), dst, ETH_ALEN);
+			eth.h_proto = htons(ethertype);
+			skb_pull(frame, ETH_ALEN + 2);
 		}
+
+		memcpy(skb_push(frame, sizeof(eth)), &eth, sizeof(eth));
 		__skb_queue_tail(list, frame);
 	}
 
+	if (!reuse_skb)
+		dev_kfree_skb(skb);
+
 	return;
 
  purge:
-- 
cgit v1.2.3-59-g8ed1b


From 34d505193bd10668acf1caba02d2f66bddc23fea Mon Sep 17 00:00:00 2001
From: Lior David <liord@codeaurora.org>
Date: Thu, 28 Jan 2016 10:58:25 +0200
Subject: cfg80211: basic support for PBSS network type

PBSS (Personal Basic Service Set) is a new BSS type for DMG
networks. It is similar to infrastructure BSS, having an AP-like
entity called PCP (PBSS Control Point), but it has few differences.
PBSS support is mandatory for 11ad devices.

Add support for PBSS by introducing a new PBSS flag attribute.
The PBSS flag is used in the START_AP command to request starting
a PCP instead of an AP, and in the CONNECT command to request
connecting to a PCP instead of an AP.

Signed-off-by: Lior David <liord@codeaurora.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/wil6210/cfg80211.c | 10 ++++++++++
 include/net/cfg80211.h                      |  8 ++++++++
 include/uapi/linux/nl80211.h                |  6 ++++++
 net/wireless/nl80211.c                      | 11 +++++++++++
 net/wireless/sme.c                          |  9 ++++++---
 5 files changed, 41 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/drivers/net/wireless/ath/wil6210/cfg80211.c b/drivers/net/wireless/ath/wil6210/cfg80211.c
index 20d07ef679e8..1f231cd08138 100644
--- a/drivers/net/wireless/ath/wil6210/cfg80211.c
+++ b/drivers/net/wireless/ath/wil6210/cfg80211.c
@@ -422,6 +422,11 @@ static int wil_cfg80211_connect(struct wiphy *wiphy,
 	if (sme->privacy && !rsn_eid)
 		wil_info(wil, "WSC connection\n");
 
+	if (sme->pbss) {
+		wil_err(wil, "connect - PBSS not yet supported\n");
+		return -EOPNOTSUPP;
+	}
+
 	bss = cfg80211_get_bss(wiphy, sme->channel, sme->bssid,
 			       sme->ssid, sme->ssid_len,
 			       IEEE80211_BSS_TYPE_ESS, IEEE80211_PRIVACY_ANY);
@@ -870,6 +875,11 @@ static int wil_cfg80211_start_ap(struct wiphy *wiphy,
 		return -EINVAL;
 	}
 
+	if (info->pbss) {
+		wil_err(wil, "AP: PBSS not yet supported\n");
+		return -EOPNOTSUPP;
+	}
+
 	switch (info->hidden_ssid) {
 	case NL80211_HIDDEN_SSID_NOT_IN_USE:
 		hidden_ssid = WMI_HIDDEN_SSID_DISABLED;
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 9bcaaf7cd15a..9e1b24c29f0c 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -712,6 +712,8 @@ struct cfg80211_acl_data {
  * @p2p_opp_ps: P2P opportunistic PS
  * @acl: ACL configuration used by the drivers which has support for
  *	MAC address based access control
+ * @pbss: If set, start as a PCP instead of AP. Relevant for DMG
+ *	networks.
  */
 struct cfg80211_ap_settings {
 	struct cfg80211_chan_def chandef;
@@ -730,6 +732,7 @@ struct cfg80211_ap_settings {
 	u8 p2p_ctwindow;
 	bool p2p_opp_ps;
 	const struct cfg80211_acl_data *acl;
+	bool pbss;
 };
 
 /**
@@ -1888,6 +1891,8 @@ struct cfg80211_ibss_params {
  * @ht_capa_mask:  The bits of ht_capa which are to be used.
  * @vht_capa:  VHT Capability overrides
  * @vht_capa_mask: The bits of vht_capa which are to be used.
+ * @pbss: if set, connect to a PCP instead of AP. Valid for DMG
+ *	networks.
  */
 struct cfg80211_connect_params {
 	struct ieee80211_channel *channel;
@@ -1910,6 +1915,7 @@ struct cfg80211_connect_params {
 	struct ieee80211_ht_cap ht_capa_mask;
 	struct ieee80211_vht_cap vht_capa;
 	struct ieee80211_vht_cap vht_capa_mask;
+	bool pbss;
 };
 
 /**
@@ -3489,6 +3495,7 @@ struct cfg80211_cached_keys;
  *	registered for unexpected class 3 frames (AP mode)
  * @conn: (private) cfg80211 software SME connection state machine data
  * @connect_keys: (private) keys to set after connection is established
+ * @conn_bss_type: connecting/connected BSS type
  * @ibss_fixed: (private) IBSS is using fixed BSSID
  * @ibss_dfs_possible: (private) IBSS may change to a DFS channel
  * @event_list: (private) list for internal event processing
@@ -3519,6 +3526,7 @@ struct wireless_dev {
 	u8 ssid_len, mesh_id_len, mesh_id_up_len;
 	struct cfg80211_conn *conn;
 	struct cfg80211_cached_keys *connect_keys;
+	enum ieee80211_bss_type conn_bss_type;
 
 	struct list_head event_list;
 	spinlock_t event_lock;
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 5b7b5ebe7ca8..7758969a2a8e 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1789,6 +1789,10 @@ enum nl80211_commands {
  *	thus it must not specify the number of iterations, only the interval
  *	between scans. The scan plans are executed sequentially.
  *	Each scan plan is a nested attribute of &enum nl80211_sched_scan_plan.
+ * @NL80211_ATTR_PBSS: flag attribute. If set it means operate
+ *	in a PBSS. Specified in %NL80211_CMD_CONNECT to request
+ *	connecting to a PCP, and in %NL80211_CMD_START_AP to start
+ *	a PCP instead of AP. Relevant for DMG networks only.
  *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
@@ -2164,6 +2168,8 @@ enum nl80211_attrs {
 	NL80211_ATTR_MAX_SCAN_PLAN_ITERATIONS,
 	NL80211_ATTR_SCHED_SCAN_PLANS,
 
+	NL80211_ATTR_PBSS,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index d4786f2802aa..268cb493f6a5 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -401,6 +401,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_NETNS_FD] = { .type = NLA_U32 },
 	[NL80211_ATTR_SCHED_SCAN_DELAY] = { .type = NLA_U32 },
 	[NL80211_ATTR_REG_INDOOR] = { .type = NLA_FLAG },
+	[NL80211_ATTR_PBSS] = { .type = NLA_FLAG },
 };
 
 /* policy for the key attributes */
@@ -3461,6 +3462,10 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 			return PTR_ERR(params.acl);
 	}
 
+	params.pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]);
+	if (params.pbss && !rdev->wiphy.bands[IEEE80211_BAND_60GHZ])
+		return -EOPNOTSUPP;
+
 	wdev_lock(wdev);
 	err = rdev_start_ap(rdev, dev, &params);
 	if (!err) {
@@ -7980,6 +7985,12 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
 		connect.flags |= ASSOC_REQ_USE_RRM;
 	}
 
+	connect.pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]);
+	if (connect.pbss && !rdev->wiphy.bands[IEEE80211_BAND_60GHZ]) {
+		kzfree(connkeys);
+		return -EOPNOTSUPP;
+	}
+
 	wdev_lock(dev->ieee80211_ptr);
 	err = cfg80211_connect(rdev, dev, &connect, connkeys, NULL);
 	wdev_unlock(dev->ieee80211_ptr);
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index 8020b5b094d4..79bd3a171caa 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -264,7 +264,7 @@ static struct cfg80211_bss *cfg80211_get_conn_bss(struct wireless_dev *wdev)
 			       wdev->conn->params.bssid,
 			       wdev->conn->params.ssid,
 			       wdev->conn->params.ssid_len,
-			       IEEE80211_BSS_TYPE_ESS,
+			       wdev->conn_bss_type,
 			       IEEE80211_PRIVACY(wdev->conn->params.privacy));
 	if (!bss)
 		return NULL;
@@ -687,7 +687,7 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
 		WARN_ON_ONCE(!wiphy_to_rdev(wdev->wiphy)->ops->connect);
 		bss = cfg80211_get_bss(wdev->wiphy, NULL, bssid,
 				       wdev->ssid, wdev->ssid_len,
-				       IEEE80211_BSS_TYPE_ESS,
+				       wdev->conn_bss_type,
 				       IEEE80211_PRIVACY_ANY);
 		if (bss)
 			cfg80211_hold_bss(bss_from_pub(bss));
@@ -846,7 +846,7 @@ void cfg80211_roamed(struct net_device *dev,
 
 	bss = cfg80211_get_bss(wdev->wiphy, channel, bssid, wdev->ssid,
 			       wdev->ssid_len,
-			       IEEE80211_BSS_TYPE_ESS, IEEE80211_PRIVACY_ANY);
+			       wdev->conn_bss_type, IEEE80211_PRIVACY_ANY);
 	if (WARN_ON(!bss))
 		return;
 
@@ -1017,6 +1017,9 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev,
 	memcpy(wdev->ssid, connect->ssid, connect->ssid_len);
 	wdev->ssid_len = connect->ssid_len;
 
+	wdev->conn_bss_type = connect->pbss ? IEEE80211_BSS_TYPE_PBSS :
+					      IEEE80211_BSS_TYPE_ESS;
+
 	if (!rdev->ops->connect)
 		err = cfg80211_sme_connect(wdev, connect, prev_bssid);
 	else
-- 
cgit v1.2.3-59-g8ed1b


From bf5a70e105820eb214d3f33376cf6a3011d2e20c Mon Sep 17 00:00:00 2001
From: Henning Rogge <hrogge@gmail.com>
Date: Wed, 3 Feb 2016 13:58:36 +0100
Subject: mac80211: Remove MPP table entries with MPath

Make the mesh_path_del() function remove all mpp table entries
that are proxied by the removed mesh path.

Acked-by: Bob Copeland <me@bobcopeland.com>
Signed-off-by: Henning Rogge <henning.rogge@fkie.fraunhofer.de>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/mesh_pathtbl.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c
index c95bdb8bb452..40ed206b0f60 100644
--- a/net/mac80211/mesh_pathtbl.c
+++ b/net/mac80211/mesh_pathtbl.c
@@ -834,6 +834,29 @@ void mesh_path_flush_by_nexthop(struct sta_info *sta)
 	rcu_read_unlock();
 }
 
+static void mpp_flush_by_proxy(struct ieee80211_sub_if_data *sdata,
+			       const u8 *proxy)
+{
+	struct mesh_table *tbl;
+	struct mesh_path *mpp;
+	struct mpath_node *node;
+	int i;
+
+	rcu_read_lock();
+	read_lock_bh(&pathtbl_resize_lock);
+	tbl = resize_dereference_mpp_paths();
+	for_each_mesh_entry(tbl, node, i) {
+		mpp = node->mpath;
+		if (ether_addr_equal(mpp->mpp, proxy)) {
+			spin_lock(&tbl->hashwlock[i]);
+			__mesh_path_del(tbl, node);
+			spin_unlock(&tbl->hashwlock[i]);
+		}
+	}
+	read_unlock_bh(&pathtbl_resize_lock);
+	rcu_read_unlock();
+}
+
 static void table_flush_by_iface(struct mesh_table *tbl,
 				 struct ieee80211_sub_if_data *sdata)
 {
@@ -891,6 +914,9 @@ int mesh_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr)
 	int hash_idx;
 	int err = 0;
 
+	/* flush relevant mpp entries first */
+	mpp_flush_by_proxy(sdata, addr);
+
 	read_lock_bh(&pathtbl_resize_lock);
 	tbl = resize_dereference_mesh_paths();
 	hash_idx = mesh_table_hash(addr, sdata, tbl);
-- 
cgit v1.2.3-59-g8ed1b


From ab1c7906b0bead549065a3a2def5aeda21818c99 Mon Sep 17 00:00:00 2001
From: Henning Rogge <hrogge@gmail.com>
Date: Wed, 3 Feb 2016 13:58:37 +0100
Subject: mac80211: let unused MPP table entries timeout

Remember the last time when a mpp table entry is used for
rx or tx and remove them after MESH_PATH_EXPIRE time.

Acked-by: Bob Copeland <me@bobcopeland.com>
Signed-off-by: Henning Rogge <henning.rogge@fkie.fraunhofer.de>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/mesh_pathtbl.c | 51 +++++++++++++++++++++++++++++++++++++++++++++
 net/mac80211/rx.c           |  1 +
 net/mac80211/tx.c           |  5 ++++-
 3 files changed, 56 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c
index 40ed206b0f60..29e50407507a 100644
--- a/net/mac80211/mesh_pathtbl.c
+++ b/net/mac80211/mesh_pathtbl.c
@@ -940,6 +940,46 @@ enddel:
 	return err;
 }
 
+/**
+ * mpp_path_del - delete a mesh proxy path from the table
+ *
+ * @addr: addr address (ETH_ALEN length)
+ * @sdata: local subif
+ *
+ * Returns: 0 if successful
+ */
+static int mpp_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr)
+{
+	struct mesh_table *tbl;
+	struct mesh_path *mpath;
+	struct mpath_node *node;
+	struct hlist_head *bucket;
+	int hash_idx;
+	int err = 0;
+
+	read_lock_bh(&pathtbl_resize_lock);
+	tbl = resize_dereference_mpp_paths();
+	hash_idx = mesh_table_hash(addr, sdata, tbl);
+	bucket = &tbl->hash_buckets[hash_idx];
+
+	spin_lock(&tbl->hashwlock[hash_idx]);
+	hlist_for_each_entry(node, bucket, list) {
+		mpath = node->mpath;
+		if (mpath->sdata == sdata &&
+		    ether_addr_equal(addr, mpath->dst)) {
+			__mesh_path_del(tbl, node);
+			goto enddel;
+		}
+	}
+
+	err = -ENXIO;
+enddel:
+	mesh_paths_generation++;
+	spin_unlock(&tbl->hashwlock[hash_idx]);
+	read_unlock_bh(&pathtbl_resize_lock);
+	return err;
+}
+
 /**
  * mesh_path_tx_pending - sends pending frames in a mesh path queue
  *
@@ -1154,6 +1194,17 @@ void mesh_path_expire(struct ieee80211_sub_if_data *sdata)
 		     time_after(jiffies, mpath->exp_time + MESH_PATH_EXPIRE))
 			mesh_path_del(mpath->sdata, mpath->dst);
 	}
+
+	tbl = rcu_dereference(mpp_paths);
+	for_each_mesh_entry(tbl, node, i) {
+		if (node->mpath->sdata != sdata)
+			continue;
+		mpath = node->mpath;
+		if ((!(mpath->flags & MESH_PATH_FIXED)) &&
+		    time_after(jiffies, mpath->exp_time + MESH_PATH_EXPIRE))
+			mpp_path_del(mpath->sdata, mpath->dst);
+	}
+
 	rcu_read_unlock();
 }
 
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 44192143ebdf..91279576f4a7 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -2311,6 +2311,7 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
 			spin_lock_bh(&mppath->state_lock);
 			if (!ether_addr_equal(mppath->mpp, mpp_addr))
 				memcpy(mppath->mpp, mpp_addr, ETH_ALEN);
+			mppath->exp_time = jiffies;
 			spin_unlock_bh(&mppath->state_lock);
 		}
 		rcu_read_unlock();
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index a5aa275d0434..7bb67fa9f4d2 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -2173,8 +2173,11 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
 					mpp_lookup = true;
 			}
 
-			if (mpp_lookup)
+			if (mpp_lookup) {
 				mppath = mpp_path_lookup(sdata, skb->data);
+				if (mppath)
+					mppath->exp_time = jiffies;
+			}
 
 			if (mppath && mpath)
 				mesh_path_del(mpath->sdata, mpath->dst);
-- 
cgit v1.2.3-59-g8ed1b


From 4cc955de7c99fe60b09688d7c2564f8fc7d00ca2 Mon Sep 17 00:00:00 2001
From: Henning Rogge <hrogge@gmail.com>
Date: Wed, 3 Feb 2016 13:58:38 +0100
Subject: mac80211: Unify mesh and mpp path removal function

mpp_path_del() and mesh_path_del() are mostly the same function.
Move common code into a new static function.

Acked-by: Bob Copeland <me@bobcopeland.com>
Signed-off-by: Henning Rogge <henning.rogge@fkie.fraunhofer.de>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/mesh_pathtbl.c | 77 ++++++++++++++++++++++++---------------------
 1 file changed, 41 insertions(+), 36 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c
index 29e50407507a..2ba7aa56b11c 100644
--- a/net/mac80211/mesh_pathtbl.c
+++ b/net/mac80211/mesh_pathtbl.c
@@ -55,16 +55,21 @@ int mpp_paths_generation;
 static DEFINE_RWLOCK(pathtbl_resize_lock);
 
 
+static inline struct mesh_table *resize_dereference_paths(
+	struct mesh_table __rcu *table)
+{
+	return rcu_dereference_protected(table,
+					lockdep_is_held(&pathtbl_resize_lock));
+}
+
 static inline struct mesh_table *resize_dereference_mesh_paths(void)
 {
-	return rcu_dereference_protected(mesh_paths,
-		lockdep_is_held(&pathtbl_resize_lock));
+	return resize_dereference_paths(mesh_paths);
 }
 
 static inline struct mesh_table *resize_dereference_mpp_paths(void)
 {
-	return rcu_dereference_protected(mpp_paths,
-		lockdep_is_held(&pathtbl_resize_lock));
+	return resize_dereference_paths(mpp_paths);
 }
 
 /*
@@ -898,14 +903,17 @@ void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata)
 }
 
 /**
- * mesh_path_del - delete a mesh path from the table
+ * table_path_del - delete a path from the mesh or mpp table
  *
- * @addr: dst address (ETH_ALEN length)
+ * @tbl: mesh or mpp path table
  * @sdata: local subif
+ * @addr: dst address (ETH_ALEN length)
  *
  * Returns: 0 if successful
  */
-int mesh_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr)
+static int table_path_del(struct mesh_table __rcu *rcu_tbl,
+			  struct ieee80211_sub_if_data *sdata,
+			  const u8 *addr)
 {
 	struct mesh_table *tbl;
 	struct mesh_path *mpath;
@@ -914,11 +922,7 @@ int mesh_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr)
 	int hash_idx;
 	int err = 0;
 
-	/* flush relevant mpp entries first */
-	mpp_flush_by_proxy(sdata, addr);
-
-	read_lock_bh(&pathtbl_resize_lock);
-	tbl = resize_dereference_mesh_paths();
+	tbl = resize_dereference_paths(rcu_tbl);
 	hash_idx = mesh_table_hash(addr, sdata, tbl);
 	bucket = &tbl->hash_buckets[hash_idx];
 
@@ -934,9 +938,30 @@ int mesh_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr)
 
 	err = -ENXIO;
 enddel:
-	mesh_paths_generation++;
 	spin_unlock(&tbl->hashwlock[hash_idx]);
+	return err;
+}
+
+/**
+ * mesh_path_del - delete a mesh path from the table
+ *
+ * @addr: dst address (ETH_ALEN length)
+ * @sdata: local subif
+ *
+ * Returns: 0 if successful
+ */
+int mesh_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr)
+{
+	int err = 0;
+
+	/* flush relevant mpp entries first */
+	mpp_flush_by_proxy(sdata, addr);
+
+	read_lock_bh(&pathtbl_resize_lock);
+	err = table_path_del(mesh_paths, sdata, addr);
+	mesh_paths_generation++;
 	read_unlock_bh(&pathtbl_resize_lock);
+
 	return err;
 }
 
@@ -950,33 +975,13 @@ enddel:
  */
 static int mpp_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr)
 {
-	struct mesh_table *tbl;
-	struct mesh_path *mpath;
-	struct mpath_node *node;
-	struct hlist_head *bucket;
-	int hash_idx;
 	int err = 0;
 
 	read_lock_bh(&pathtbl_resize_lock);
-	tbl = resize_dereference_mpp_paths();
-	hash_idx = mesh_table_hash(addr, sdata, tbl);
-	bucket = &tbl->hash_buckets[hash_idx];
-
-	spin_lock(&tbl->hashwlock[hash_idx]);
-	hlist_for_each_entry(node, bucket, list) {
-		mpath = node->mpath;
-		if (mpath->sdata == sdata &&
-		    ether_addr_equal(addr, mpath->dst)) {
-			__mesh_path_del(tbl, node);
-			goto enddel;
-		}
-	}
-
-	err = -ENXIO;
-enddel:
-	mesh_paths_generation++;
-	spin_unlock(&tbl->hashwlock[hash_idx]);
+	err = table_path_del(mpp_paths, sdata, addr);
+	mpp_paths_generation++;
 	read_unlock_bh(&pathtbl_resize_lock);
+
 	return err;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 0241fa19ef4021c3db6f70b8f08d69f9e21cc043 Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Wed, 3 Feb 2016 21:52:23 +0200
Subject: mac80211: fix VHT MU-MIMO frame processing

The source and destintation addresses in the memcpy arguments
are flipped. Fix that.

Fixes: 23a1f8d44c0b("mac80211: process and save VHT MU-MIMO group frame")
Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/vht.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index 204cf9ad3019..341d192cea52 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -459,10 +459,12 @@ void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata,
 		    bss_conf->mu_group.membership, WLAN_MEMBERSHIP_LEN))
 		return;
 
-	memcpy(mgmt->u.action.u.vht_group_notif.membership,
-	       bss_conf->mu_group.membership, WLAN_MEMBERSHIP_LEN);
-	memcpy(mgmt->u.action.u.vht_group_notif.position,
-	       bss_conf->mu_group.position, WLAN_USER_POSITION_LEN);
+	memcpy(bss_conf->mu_group.membership,
+	       mgmt->u.action.u.vht_group_notif.membership,
+	       WLAN_MEMBERSHIP_LEN);
+	memcpy(bss_conf->mu_group.position,
+	       mgmt->u.action.u.vht_group_notif.position,
+	       WLAN_USER_POSITION_LEN);
 
 	ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_MU_GROUPS);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 2bf0ccc7095e6cf665bbdb7c32c352d24f1ae033 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@openwrt.org>
Date: Mon, 8 Feb 2016 14:25:26 +0100
Subject: cfg80211: fix faulty variable initialization in
 ieee80211_amsdu_to_8023s

reuse_skb is set to true if the code decides to use the last segment.
Fixes a memory leak

Signed-off-by: Felix Fietkau <nbd@openwrt.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/util.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/wireless/util.c b/net/wireless/util.c
index 6e4eb3555177..9880c894c58d 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -676,7 +676,7 @@ void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list,
 	u8 *payload;
 	int offset = 0, remaining, err;
 	struct ethhdr eth;
-	bool reuse_skb = true;
+	bool reuse_skb = false;
 	bool last = false;
 
 	if (has_80211_header) {
-- 
cgit v1.2.3-59-g8ed1b


From 2b67f944f88c29e71efdcfc4d0c21ab28fbf23ae Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@openwrt.org>
Date: Mon, 8 Feb 2016 14:34:42 +0100
Subject: cfg80211: reuse existing page fragments in A-MSDU rx

This massively reduces data copying and thus improves rx performance

Signed-off-by: Felix Fietkau <nbd@openwrt.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/util.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 77 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/wireless/util.c b/net/wireless/util.c
index 9880c894c58d..c7f6820bb258 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -644,23 +644,93 @@ int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr,
 }
 EXPORT_SYMBOL(ieee80211_data_from_8023);
 
+static void
+__frame_add_frag(struct sk_buff *skb, struct page *page,
+		 void *ptr, int len, int size)
+{
+	struct skb_shared_info *sh = skb_shinfo(skb);
+	int page_offset;
+
+	atomic_inc(&page->_count);
+	page_offset = ptr - page_address(page);
+	skb_add_rx_frag(skb, sh->nr_frags, page, page_offset, len, size);
+}
+
+static void
+__ieee80211_amsdu_copy_frag(struct sk_buff *skb, struct sk_buff *frame,
+			    int offset, int len)
+{
+	struct skb_shared_info *sh = skb_shinfo(skb);
+	const skb_frag_t *frag = &sh->frags[-1];
+	struct page *frag_page;
+	void *frag_ptr;
+	int frag_len, frag_size;
+	int head_size = skb->len - skb->data_len;
+	int cur_len;
+
+	frag_page = virt_to_head_page(skb->head);
+	frag_ptr = skb->data;
+	frag_size = head_size;
+
+	while (offset >= frag_size) {
+		offset -= frag_size;
+		frag++;
+		frag_page = skb_frag_page(frag);
+		frag_ptr = skb_frag_address(frag);
+		frag_size = skb_frag_size(frag);
+	}
+
+	frag_ptr += offset;
+	frag_len = frag_size - offset;
+
+	cur_len = min(len, frag_len);
+
+	__frame_add_frag(frame, frag_page, frag_ptr, cur_len, frag_size);
+	len -= cur_len;
+
+	while (len > 0) {
+		frag++;
+		frag_len = skb_frag_size(frag);
+		cur_len = min(len, frag_len);
+		__frame_add_frag(frame, skb_frag_page(frag),
+				 skb_frag_address(frag), cur_len, frag_len);
+		len -= cur_len;
+	}
+}
+
 static struct sk_buff *
 __ieee80211_amsdu_copy(struct sk_buff *skb, unsigned int hlen,
-		       int offset, int len)
+		       int offset, int len, bool reuse_frag)
 {
 	struct sk_buff *frame;
+	int cur_len = len;
 
 	if (skb->len - offset < len)
 		return NULL;
 
+	/*
+	 * When reusing framents, copy some data to the head to simplify
+	 * ethernet header handling and speed up protocol header processing
+	 * in the stack later.
+	 */
+	if (reuse_frag)
+		cur_len = min_t(int, len, 32);
+
 	/*
 	 * Allocate and reserve two bytes more for payload
 	 * alignment since sizeof(struct ethhdr) is 14.
 	 */
-	frame = dev_alloc_skb(hlen + sizeof(struct ethhdr) + 2 + len);
+	frame = dev_alloc_skb(hlen + sizeof(struct ethhdr) + 2 + cur_len);
 
 	skb_reserve(frame, hlen + sizeof(struct ethhdr) + 2);
-	skb_copy_bits(skb, offset, skb_put(frame, len), len);
+	skb_copy_bits(skb, offset, skb_put(frame, cur_len), cur_len);
+
+	len -= cur_len;
+	if (!len)
+		return frame;
+
+	offset += cur_len;
+	__ieee80211_amsdu_copy_frag(skb, frame, offset, len);
 
 	return frame;
 }
@@ -676,6 +746,7 @@ void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list,
 	u8 *payload;
 	int offset = 0, remaining, err;
 	struct ethhdr eth;
+	bool reuse_frag = skb->head_frag && !skb_has_frag_list(skb);
 	bool reuse_skb = false;
 	bool last = false;
 
@@ -703,12 +774,13 @@ void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list,
 		offset += sizeof(struct ethhdr);
 		/* reuse skb for the last subframe */
 		last = remaining <= subframe_len + padding;
-		if (!skb_is_nonlinear(skb) && last) {
+		if (!skb_is_nonlinear(skb) && !reuse_frag && last) {
 			skb_pull(skb, offset);
 			frame = skb;
 			reuse_skb = true;
 		} else {
-			frame = __ieee80211_amsdu_copy(skb, hlen, offset, len);
+			frame = __ieee80211_amsdu_copy(skb, hlen, offset, len,
+						       reuse_frag);
 			if (!frame)
 				goto purge;
 
-- 
cgit v1.2.3-59-g8ed1b


From 109843b07aed8663a3c1fe9ef02c540b6ff203b4 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo.bianconi83@gmail.com>
Date: Fri, 19 Feb 2016 12:18:01 +0100
Subject: mac80211: fix wiphy supported_band access

Fix wiphy supported_band access in tx radiotap parsing introduced
in commit 5ec3aed9ba4c ("mac80211: Parse legacy and HT rate in
injected frames"). In particular, info->band is always set to 0
(IEEE80211_BAND_2GHZ) since it has not assigned yet.
This cause a kernel crash on 5GHz only devices.
Move ieee80211_parse_tx_radiotap() after info->band assignment

Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi83@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/tx.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 7bb67fa9f4d2..3a7475ff1a41 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1892,10 +1892,6 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb,
 	info->flags = IEEE80211_TX_CTL_REQ_TX_STATUS |
 		      IEEE80211_TX_CTL_INJECTED;
 
-	/* process and remove the injection radiotap header */
-	if (!ieee80211_parse_tx_radiotap(local, skb))
-		goto fail;
-
 	rcu_read_lock();
 
 	/*
@@ -1957,6 +1953,11 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb,
 		goto fail_rcu;
 
 	info->band = chandef->chan->band;
+
+	/* process and remove the injection radiotap header */
+	if (!ieee80211_parse_tx_radiotap(local, skb))
+		goto fail_rcu;
+
 	ieee80211_xmit(sdata, NULL, skb);
 	rcu_read_unlock();
 
-- 
cgit v1.2.3-59-g8ed1b


From f8079d43cf0f1f0171606e75fcef6fe17bb183f2 Mon Sep 17 00:00:00 2001
From: Eliad Peller <eliad@wizery.com>
Date: Sun, 14 Feb 2016 13:56:35 +0200
Subject: mac80211: move TKIP TX IVs to public part of key struct

Some drivers/devices might want to set the IVs by
themselves (and still let mac80211 generate MMIC).

Specifically, this is needed when the device does
offloading at certain times, and the driver has
to make sure that the IVs of new tx frames (from
the host) are synchronized with IVs that were
potentially used during the offloading.

Similarly to CCMP, move the TX IVs of TKIP keys to the
public part of the key struct, and export a function
to add the IV right into the crypto header.

The public tx_pn field is defined as atomic64, so define
TKIP_PN_TO_IV16/32 helper macros to convert it to iv16/32
when needed.

Since the iv32 used for the p1k cache is taken
directly from the frame, we can safely remove
iv16/32 from being protected by tkip.txlock.

Signed-off-by: Eliad Peller <eliadx.peller@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     | 23 ++++++++++++++++++++---
 net/mac80211/cfg.c         |  5 +++--
 net/mac80211/debugfs_key.c |  5 +++--
 net/mac80211/key.c         |  9 +++++----
 net/mac80211/key.h         | 10 +++++++---
 net/mac80211/tkip.c        | 36 ++++++++++++++++++------------------
 net/mac80211/tkip.h        |  2 --
 net/mac80211/wpa.c         | 11 ++++-------
 8 files changed, 60 insertions(+), 41 deletions(-)

(limited to 'net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 57147749ae42..15879b49baad 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1521,9 +1521,8 @@ enum ieee80211_key_flags {
  *	wants to be given when a frame is transmitted and needs to be
  *	encrypted in hardware.
  * @cipher: The key's cipher suite selector.
- * @tx_pn: PN used for TX on non-TKIP keys, may be used by the driver
- *	as well if it needs to do software PN assignment by itself
- *	(e.g. due to TSO)
+ * @tx_pn: PN used for TX keys, may be used by the driver as well if it
+ *	needs to do software PN assignment by itself (e.g. due to TSO)
  * @flags: key flags, see &enum ieee80211_key_flags.
  * @keyidx: the key index (0-3)
  * @keylen: key material length
@@ -1549,6 +1548,9 @@ struct ieee80211_key_conf {
 
 #define IEEE80211_MAX_PN_LEN	16
 
+#define TKIP_PN_TO_IV16(pn) ((u16)(pn & 0xffff))
+#define TKIP_PN_TO_IV32(pn) ((u32)((pn >> 16) & 0xffffffff))
+
 /**
  * struct ieee80211_key_seq - key sequence counter
  *
@@ -4446,6 +4448,21 @@ void ieee80211_get_tkip_rx_p1k(struct ieee80211_key_conf *keyconf,
 void ieee80211_get_tkip_p2k(struct ieee80211_key_conf *keyconf,
 			    struct sk_buff *skb, u8 *p2k);
 
+/**
+ * ieee80211_tkip_add_iv - write TKIP IV and Ext. IV to pos
+ *
+ * @pos: start of crypto header
+ * @keyconf: the parameter passed with the set key
+ * @pn: PN to add
+ *
+ * Returns: pointer to the octet following IVs (i.e. beginning of
+ * the packet payload)
+ *
+ * This function writes the tkip IV value to pos (which should
+ * point to the crypto header)
+ */
+u8 *ieee80211_tkip_add_iv(u8 *pos, struct ieee80211_key_conf *keyconf, u64 pn);
+
 /**
  * ieee80211_get_key_tx_seq - get key TX sequence counter
  *
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 66d22de93c8d..fe1704c4e8fb 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -339,8 +339,9 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
 
 	switch (key->conf.cipher) {
 	case WLAN_CIPHER_SUITE_TKIP:
-		iv32 = key->u.tkip.tx.iv32;
-		iv16 = key->u.tkip.tx.iv16;
+		pn64 = atomic64_read(&key->conf.tx_pn);
+		iv32 = TKIP_PN_TO_IV32(pn64);
+		iv16 = TKIP_PN_TO_IV16(pn64);
 
 		if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE &&
 		    !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) {
diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c
index 7961e7d0b61e..a2ef95f16f11 100644
--- a/net/mac80211/debugfs_key.c
+++ b/net/mac80211/debugfs_key.c
@@ -132,9 +132,10 @@ static ssize_t key_tx_spec_read(struct file *file, char __user *userbuf,
 		len = scnprintf(buf, sizeof(buf), "\n");
 		break;
 	case WLAN_CIPHER_SUITE_TKIP:
+		pn = atomic64_read(&key->conf.tx_pn);
 		len = scnprintf(buf, sizeof(buf), "%08x %04x\n",
-				key->u.tkip.tx.iv32,
-				key->u.tkip.tx.iv16);
+				TKIP_PN_TO_IV32(pn),
+				TKIP_PN_TO_IV16(pn));
 		break;
 	case WLAN_CIPHER_SUITE_CCMP:
 	case WLAN_CIPHER_SUITE_CCMP_256:
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index 5e5bc599da4c..f9c4cb9c6e06 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -945,8 +945,9 @@ void ieee80211_get_key_tx_seq(struct ieee80211_key_conf *keyconf,
 
 	switch (key->conf.cipher) {
 	case WLAN_CIPHER_SUITE_TKIP:
-		seq->tkip.iv32 = key->u.tkip.tx.iv32;
-		seq->tkip.iv16 = key->u.tkip.tx.iv16;
+		pn64 = atomic64_read(&key->conf.tx_pn);
+		seq->tkip.iv32 = TKIP_PN_TO_IV32(pn64);
+		seq->tkip.iv16 = TKIP_PN_TO_IV16(pn64);
 		break;
 	case WLAN_CIPHER_SUITE_CCMP:
 	case WLAN_CIPHER_SUITE_CCMP_256:
@@ -1039,8 +1040,8 @@ void ieee80211_set_key_tx_seq(struct ieee80211_key_conf *keyconf,
 
 	switch (key->conf.cipher) {
 	case WLAN_CIPHER_SUITE_TKIP:
-		key->u.tkip.tx.iv32 = seq->tkip.iv32;
-		key->u.tkip.tx.iv16 = seq->tkip.iv16;
+		pn64 = (u64)seq->tkip.iv16 | ((u64)seq->tkip.iv32 << 16);
+		atomic64_set(&key->conf.tx_pn, pn64);
 		break;
 	case WLAN_CIPHER_SUITE_CCMP:
 	case WLAN_CIPHER_SUITE_CCMP_256:
diff --git a/net/mac80211/key.h b/net/mac80211/key.h
index 9951ef06323e..4aa20cef0859 100644
--- a/net/mac80211/key.h
+++ b/net/mac80211/key.h
@@ -44,13 +44,17 @@ enum ieee80211_internal_tkip_state {
 };
 
 struct tkip_ctx {
-	u32 iv32;	/* current iv32 */
-	u16 iv16;	/* current iv16 */
 	u16 p1k[5];	/* p1k cache */
 	u32 p1k_iv32;	/* iv32 for which p1k computed */
 	enum ieee80211_internal_tkip_state state;
 };
 
+struct tkip_ctx_rx {
+	struct tkip_ctx ctx;
+	u32 iv32;	/* current iv32 */
+	u16 iv16;	/* current iv16 */
+};
+
 struct ieee80211_key {
 	struct ieee80211_local *local;
 	struct ieee80211_sub_if_data *sdata;
@@ -71,7 +75,7 @@ struct ieee80211_key {
 			struct tkip_ctx tx;
 
 			/* last received RSC */
-			struct tkip_ctx rx[IEEE80211_NUM_TIDS];
+			struct tkip_ctx_rx rx[IEEE80211_NUM_TIDS];
 
 			/* number of mic failures */
 			u32 mic_failures;
diff --git a/net/mac80211/tkip.c b/net/mac80211/tkip.c
index 0ae207771a58..b3622823bad2 100644
--- a/net/mac80211/tkip.c
+++ b/net/mac80211/tkip.c
@@ -1,6 +1,7 @@
 /*
  * Copyright 2002-2004, Instant802 Networks, Inc.
  * Copyright 2005, Devicescape Software, Inc.
+ * Copyright (C) 2016 Intel Deutschland GmbH
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -142,15 +143,14 @@ static void tkip_mixing_phase2(const u8 *tk, struct tkip_ctx *ctx,
 /* Add TKIP IV and Ext. IV at @pos. @iv0, @iv1, and @iv2 are the first octets
  * of the IV. Returns pointer to the octet following IVs (i.e., beginning of
  * the packet payload). */
-u8 *ieee80211_tkip_add_iv(u8 *pos, struct ieee80211_key *key)
+u8 *ieee80211_tkip_add_iv(u8 *pos, struct ieee80211_key_conf *keyconf, u64 pn)
 {
-	lockdep_assert_held(&key->u.tkip.txlock);
-
-	pos = write_tkip_iv(pos, key->u.tkip.tx.iv16);
-	*pos++ = (key->conf.keyidx << 6) | (1 << 5) /* Ext IV */;
-	put_unaligned_le32(key->u.tkip.tx.iv32, pos);
+	pos = write_tkip_iv(pos, TKIP_PN_TO_IV16(pn));
+	*pos++ = (keyconf->keyidx << 6) | (1 << 5) /* Ext IV */;
+	put_unaligned_le32(TKIP_PN_TO_IV32(pn), pos);
 	return pos + 4;
 }
+EXPORT_SYMBOL_GPL(ieee80211_tkip_add_iv);
 
 static void ieee80211_compute_tkip_p1k(struct ieee80211_key *key, u32 iv32)
 {
@@ -250,6 +250,7 @@ int ieee80211_tkip_decrypt_data(struct crypto_cipher *tfm,
 	u8 rc4key[16], keyid, *pos = payload;
 	int res;
 	const u8 *tk = &key->conf.key[NL80211_TKIP_DATA_OFFSET_ENCR_KEY];
+	struct tkip_ctx_rx *rx_ctx = &key->u.tkip.rx[queue];
 
 	if (payload_len < 12)
 		return -1;
@@ -265,37 +266,36 @@ int ieee80211_tkip_decrypt_data(struct crypto_cipher *tfm,
 	if ((keyid >> 6) != key->conf.keyidx)
 		return TKIP_DECRYPT_INVALID_KEYIDX;
 
-	if (key->u.tkip.rx[queue].state != TKIP_STATE_NOT_INIT &&
-	    (iv32 < key->u.tkip.rx[queue].iv32 ||
-	     (iv32 == key->u.tkip.rx[queue].iv32 &&
-	      iv16 <= key->u.tkip.rx[queue].iv16)))
+	if (rx_ctx->ctx.state != TKIP_STATE_NOT_INIT &&
+	    (iv32 < rx_ctx->iv32 ||
+	     (iv32 == rx_ctx->iv32 && iv16 <= rx_ctx->iv16)))
 		return TKIP_DECRYPT_REPLAY;
 
 	if (only_iv) {
 		res = TKIP_DECRYPT_OK;
-		key->u.tkip.rx[queue].state = TKIP_STATE_PHASE1_HW_UPLOADED;
+		rx_ctx->ctx.state = TKIP_STATE_PHASE1_HW_UPLOADED;
 		goto done;
 	}
 
-	if (key->u.tkip.rx[queue].state == TKIP_STATE_NOT_INIT ||
-	    key->u.tkip.rx[queue].iv32 != iv32) {
+	if (rx_ctx->ctx.state == TKIP_STATE_NOT_INIT ||
+	    rx_ctx->iv32 != iv32) {
 		/* IV16 wrapped around - perform TKIP phase 1 */
-		tkip_mixing_phase1(tk, &key->u.tkip.rx[queue], ta, iv32);
+		tkip_mixing_phase1(tk, &rx_ctx->ctx, ta, iv32);
 	}
 	if (key->local->ops->update_tkip_key &&
 	    key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE &&
-	    key->u.tkip.rx[queue].state != TKIP_STATE_PHASE1_HW_UPLOADED) {
+	    rx_ctx->ctx.state != TKIP_STATE_PHASE1_HW_UPLOADED) {
 		struct ieee80211_sub_if_data *sdata = key->sdata;
 
 		if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
 			sdata = container_of(key->sdata->bss,
 					struct ieee80211_sub_if_data, u.ap);
 		drv_update_tkip_key(key->local, sdata, &key->conf, key->sta,
-				iv32, key->u.tkip.rx[queue].p1k);
-		key->u.tkip.rx[queue].state = TKIP_STATE_PHASE1_HW_UPLOADED;
+				iv32, rx_ctx->ctx.p1k);
+		rx_ctx->ctx.state = TKIP_STATE_PHASE1_HW_UPLOADED;
 	}
 
-	tkip_mixing_phase2(tk, &key->u.tkip.rx[queue], iv16, rc4key);
+	tkip_mixing_phase2(tk, &rx_ctx->ctx, iv16, rc4key);
 
 	res = ieee80211_wep_decrypt_data(tfm, rc4key, 16, pos, payload_len - 12);
  done:
diff --git a/net/mac80211/tkip.h b/net/mac80211/tkip.h
index e3ecb659b90a..a1bcbfbefe7c 100644
--- a/net/mac80211/tkip.h
+++ b/net/mac80211/tkip.h
@@ -13,8 +13,6 @@
 #include <linux/crypto.h>
 #include "key.h"
 
-u8 *ieee80211_tkip_add_iv(u8 *pos, struct ieee80211_key *key);
-
 int ieee80211_tkip_encrypt_data(struct crypto_cipher *tfm,
 				struct ieee80211_key *key,
 				struct sk_buff *skb,
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index d824c38971ed..18848258adde 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -1,6 +1,7 @@
 /*
  * Copyright 2002-2004, Instant802 Networks, Inc.
  * Copyright 2008, Jouni Malinen <j@w1.fi>
+ * Copyright (C) 2016 Intel Deutschland GmbH
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -183,7 +184,6 @@ mic_fail_no_key:
 	return RX_DROP_UNUSABLE;
 }
 
-
 static int tkip_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb)
 {
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
@@ -191,6 +191,7 @@ static int tkip_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb)
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 	unsigned int hdrlen;
 	int len, tail;
+	u64 pn;
 	u8 *pos;
 
 	if (info->control.hw_key &&
@@ -222,12 +223,8 @@ static int tkip_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb)
 		return 0;
 
 	/* Increase IV for the frame */
-	spin_lock(&key->u.tkip.txlock);
-	key->u.tkip.tx.iv16++;
-	if (key->u.tkip.tx.iv16 == 0)
-		key->u.tkip.tx.iv32++;
-	pos = ieee80211_tkip_add_iv(pos, key);
-	spin_unlock(&key->u.tkip.txlock);
+	pn = atomic64_inc_return(&key->conf.tx_pn);
+	pos = ieee80211_tkip_add_iv(pos, &key->conf, pn);
 
 	/* hwaccel - with software IV */
 	if (info->control.hw_key)
-- 
cgit v1.2.3-59-g8ed1b


From ca48ebbc7ea7e82e3ae4b55aacead0cdb54ff008 Mon Sep 17 00:00:00 2001
From: Eliad Peller <eliad@wizery.com>
Date: Mon, 15 Feb 2016 12:34:10 +0200
Subject: mac80211: remove ieee80211_get_key_tx_seq/ieee80211_set_key_tx_seq

Since the PNs of all the tx keys are now tracked in the public
part of the key struct (with atomic counter), we no longer
need these functions.

dvm and vt665{5,6} are currently the only users of these functions,
so update them accordingly.

Signed-off-by: Eliad Peller <eliadx.peller@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/dvm/lib.c | 20 +++----
 drivers/staging/vt6655/rxtx.c                | 12 ++--
 drivers/staging/vt6656/rxtx.c                | 12 ++--
 include/net/mac80211.h                       | 34 -----------
 net/mac80211/key.c                           | 87 ----------------------------
 5 files changed, 24 insertions(+), 141 deletions(-)

(limited to 'net')

diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/lib.c b/drivers/net/wireless/intel/iwlwifi/dvm/lib.c
index 4841be2aa499..1799469268ea 100644
--- a/drivers/net/wireless/intel/iwlwifi/dvm/lib.c
+++ b/drivers/net/wireless/intel/iwlwifi/dvm/lib.c
@@ -943,14 +943,16 @@ static void iwlagn_wowlan_program_keys(struct ieee80211_hw *hw,
 	switch (key->cipher) {
 	case WLAN_CIPHER_SUITE_TKIP:
 		if (sta) {
+			u64 pn64;
+
 			tkip_sc = data->rsc_tsc->all_tsc_rsc.tkip.unicast_rsc;
 			tkip_tx_sc = &data->rsc_tsc->all_tsc_rsc.tkip.tsc;
 
 			rx_p1ks = data->tkip->rx_uni;
 
-			ieee80211_get_key_tx_seq(key, &seq);
-			tkip_tx_sc->iv16 = cpu_to_le16(seq.tkip.iv16);
-			tkip_tx_sc->iv32 = cpu_to_le32(seq.tkip.iv32);
+			pn64 = atomic64_read(&key->tx_pn);
+			tkip_tx_sc->iv16 = cpu_to_le16(TKIP_PN_TO_IV16(pn64));
+			tkip_tx_sc->iv32 = cpu_to_le32(TKIP_PN_TO_IV32(pn64));
 
 			ieee80211_get_tkip_p1k_iv(key, seq.tkip.iv32, p1k);
 			iwlagn_convert_p1k(p1k, data->tkip->tx.p1k);
@@ -996,19 +998,13 @@ static void iwlagn_wowlan_program_keys(struct ieee80211_hw *hw,
 		break;
 	case WLAN_CIPHER_SUITE_CCMP:
 		if (sta) {
-			u8 *pn = seq.ccmp.pn;
+			u64 pn64;
 
 			aes_sc = data->rsc_tsc->all_tsc_rsc.aes.unicast_rsc;
 			aes_tx_sc = &data->rsc_tsc->all_tsc_rsc.aes.tsc;
 
-			ieee80211_get_key_tx_seq(key, &seq);
-			aes_tx_sc->pn = cpu_to_le64(
-					(u64)pn[5] |
-					((u64)pn[4] << 8) |
-					((u64)pn[3] << 16) |
-					((u64)pn[2] << 24) |
-					((u64)pn[1] << 32) |
-					((u64)pn[0] << 40));
+			pn64 = atomic64_read(&key->tx_pn);
+			aes_tx_sc->pn = cpu_to_le64(pn64);
 		} else
 			aes_sc = data->rsc_tsc->all_tsc_rsc.aes.multicast_rsc;
 
diff --git a/drivers/staging/vt6655/rxtx.c b/drivers/staging/vt6655/rxtx.c
index b668db6a45fb..1a2dda09b69d 100644
--- a/drivers/staging/vt6655/rxtx.c
+++ b/drivers/staging/vt6655/rxtx.c
@@ -1210,7 +1210,7 @@ static void vnt_fill_txkey(struct ieee80211_hdr *hdr, u8 *key_buffer,
 			   struct sk_buff *skb,	u16 payload_len,
 			   struct vnt_mic_hdr *mic_hdr)
 {
-	struct ieee80211_key_seq seq;
+	u64 pn64;
 	u8 *iv = ((u8 *)hdr + ieee80211_get_hdrlen_from_skb(skb));
 
 	/* strip header and icv len from payload */
@@ -1243,9 +1243,13 @@ static void vnt_fill_txkey(struct ieee80211_hdr *hdr, u8 *key_buffer,
 		mic_hdr->payload_len = cpu_to_be16(payload_len);
 		ether_addr_copy(mic_hdr->mic_addr2, hdr->addr2);
 
-		ieee80211_get_key_tx_seq(tx_key, &seq);
-
-		memcpy(mic_hdr->ccmp_pn, seq.ccmp.pn, IEEE80211_CCMP_PN_LEN);
+		pn64 = atomic64_read(&tx_key->tx_pn);
+		mic_hdr->ccmp_pn[5] = pn64;
+		mic_hdr->ccmp_pn[4] = pn64 >> 8;
+		mic_hdr->ccmp_pn[3] = pn64 >> 16;
+		mic_hdr->ccmp_pn[2] = pn64 >> 24;
+		mic_hdr->ccmp_pn[1] = pn64 >> 32;
+		mic_hdr->ccmp_pn[0] = pn64 >> 40;
 
 		if (ieee80211_has_a4(hdr->frame_control))
 			mic_hdr->hlen = cpu_to_be16(28);
diff --git a/drivers/staging/vt6656/rxtx.c b/drivers/staging/vt6656/rxtx.c
index efb54f53b4f9..76378d225b46 100644
--- a/drivers/staging/vt6656/rxtx.c
+++ b/drivers/staging/vt6656/rxtx.c
@@ -719,7 +719,7 @@ static void vnt_fill_txkey(struct vnt_usb_send_context *tx_context,
 	u16 payload_len, struct vnt_mic_hdr *mic_hdr)
 {
 	struct ieee80211_hdr *hdr = tx_context->hdr;
-	struct ieee80211_key_seq seq;
+	u64 pn64;
 	u8 *iv = ((u8 *)hdr + ieee80211_get_hdrlen_from_skb(skb));
 
 	/* strip header and icv len from payload */
@@ -752,9 +752,13 @@ static void vnt_fill_txkey(struct vnt_usb_send_context *tx_context,
 		mic_hdr->payload_len = cpu_to_be16(payload_len);
 		ether_addr_copy(mic_hdr->mic_addr2, hdr->addr2);
 
-		ieee80211_get_key_tx_seq(tx_key, &seq);
-
-		memcpy(mic_hdr->ccmp_pn, seq.ccmp.pn, IEEE80211_CCMP_PN_LEN);
+		pn64 = atomic64_read(&tx_key->tx_pn);
+		mic_hdr->ccmp_pn[5] = pn64;
+		mic_hdr->ccmp_pn[4] = pn64 >> 8;
+		mic_hdr->ccmp_pn[3] = pn64 >> 16;
+		mic_hdr->ccmp_pn[2] = pn64 >> 24;
+		mic_hdr->ccmp_pn[1] = pn64 >> 32;
+		mic_hdr->ccmp_pn[0] = pn64 >> 40;
 
 		if (ieee80211_has_a4(hdr->frame_control))
 			mic_hdr->hlen = cpu_to_be16(28);
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 15879b49baad..66155d3ad7e6 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -4463,23 +4463,6 @@ void ieee80211_get_tkip_p2k(struct ieee80211_key_conf *keyconf,
  */
 u8 *ieee80211_tkip_add_iv(u8 *pos, struct ieee80211_key_conf *keyconf, u64 pn);
 
-/**
- * ieee80211_get_key_tx_seq - get key TX sequence counter
- *
- * @keyconf: the parameter passed with the set key
- * @seq: buffer to receive the sequence data
- *
- * This function allows a driver to retrieve the current TX IV/PN
- * for the given key. It must not be called if IV generation is
- * offloaded to the device.
- *
- * Note that this function may only be called when no TX processing
- * can be done concurrently, for example when queues are stopped
- * and the stop has been synchronized.
- */
-void ieee80211_get_key_tx_seq(struct ieee80211_key_conf *keyconf,
-			      struct ieee80211_key_seq *seq);
-
 /**
  * ieee80211_get_key_rx_seq - get key RX sequence counter
  *
@@ -4499,23 +4482,6 @@ void ieee80211_get_key_tx_seq(struct ieee80211_key_conf *keyconf,
 void ieee80211_get_key_rx_seq(struct ieee80211_key_conf *keyconf,
 			      int tid, struct ieee80211_key_seq *seq);
 
-/**
- * ieee80211_set_key_tx_seq - set key TX sequence counter
- *
- * @keyconf: the parameter passed with the set key
- * @seq: new sequence data
- *
- * This function allows a driver to set the current TX IV/PNs for the
- * given key. This is useful when resuming from WoWLAN sleep and the
- * device may have transmitted frames using the PTK, e.g. replies to
- * ARP requests.
- *
- * Note that this function may only be called when no TX processing
- * can be done concurrently.
- */
-void ieee80211_set_key_tx_seq(struct ieee80211_key_conf *keyconf,
-			      struct ieee80211_key_seq *seq);
-
 /**
  * ieee80211_set_key_rx_seq - set key RX sequence counter
  *
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index f9c4cb9c6e06..3df7b0392d30 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -932,51 +932,6 @@ void ieee80211_gtk_rekey_notify(struct ieee80211_vif *vif, const u8 *bssid,
 }
 EXPORT_SYMBOL_GPL(ieee80211_gtk_rekey_notify);
 
-void ieee80211_get_key_tx_seq(struct ieee80211_key_conf *keyconf,
-			      struct ieee80211_key_seq *seq)
-{
-	struct ieee80211_key *key;
-	u64 pn64;
-
-	if (WARN_ON(!(keyconf->flags & IEEE80211_KEY_FLAG_GENERATE_IV)))
-		return;
-
-	key = container_of(keyconf, struct ieee80211_key, conf);
-
-	switch (key->conf.cipher) {
-	case WLAN_CIPHER_SUITE_TKIP:
-		pn64 = atomic64_read(&key->conf.tx_pn);
-		seq->tkip.iv32 = TKIP_PN_TO_IV32(pn64);
-		seq->tkip.iv16 = TKIP_PN_TO_IV16(pn64);
-		break;
-	case WLAN_CIPHER_SUITE_CCMP:
-	case WLAN_CIPHER_SUITE_CCMP_256:
-	case WLAN_CIPHER_SUITE_AES_CMAC:
-	case WLAN_CIPHER_SUITE_BIP_CMAC_256:
-		BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) !=
-			     offsetof(typeof(*seq), aes_cmac));
-	case WLAN_CIPHER_SUITE_BIP_GMAC_128:
-	case WLAN_CIPHER_SUITE_BIP_GMAC_256:
-		BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) !=
-			     offsetof(typeof(*seq), aes_gmac));
-	case WLAN_CIPHER_SUITE_GCMP:
-	case WLAN_CIPHER_SUITE_GCMP_256:
-		BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) !=
-			     offsetof(typeof(*seq), gcmp));
-		pn64 = atomic64_read(&key->conf.tx_pn);
-		seq->ccmp.pn[5] = pn64;
-		seq->ccmp.pn[4] = pn64 >> 8;
-		seq->ccmp.pn[3] = pn64 >> 16;
-		seq->ccmp.pn[2] = pn64 >> 24;
-		seq->ccmp.pn[1] = pn64 >> 32;
-		seq->ccmp.pn[0] = pn64 >> 40;
-		break;
-	default:
-		WARN_ON(1);
-	}
-}
-EXPORT_SYMBOL(ieee80211_get_key_tx_seq);
-
 void ieee80211_get_key_rx_seq(struct ieee80211_key_conf *keyconf,
 			      int tid, struct ieee80211_key_seq *seq)
 {
@@ -1030,48 +985,6 @@ void ieee80211_get_key_rx_seq(struct ieee80211_key_conf *keyconf,
 }
 EXPORT_SYMBOL(ieee80211_get_key_rx_seq);
 
-void ieee80211_set_key_tx_seq(struct ieee80211_key_conf *keyconf,
-			      struct ieee80211_key_seq *seq)
-{
-	struct ieee80211_key *key;
-	u64 pn64;
-
-	key = container_of(keyconf, struct ieee80211_key, conf);
-
-	switch (key->conf.cipher) {
-	case WLAN_CIPHER_SUITE_TKIP:
-		pn64 = (u64)seq->tkip.iv16 | ((u64)seq->tkip.iv32 << 16);
-		atomic64_set(&key->conf.tx_pn, pn64);
-		break;
-	case WLAN_CIPHER_SUITE_CCMP:
-	case WLAN_CIPHER_SUITE_CCMP_256:
-	case WLAN_CIPHER_SUITE_AES_CMAC:
-	case WLAN_CIPHER_SUITE_BIP_CMAC_256:
-		BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) !=
-			     offsetof(typeof(*seq), aes_cmac));
-	case WLAN_CIPHER_SUITE_BIP_GMAC_128:
-	case WLAN_CIPHER_SUITE_BIP_GMAC_256:
-		BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) !=
-			     offsetof(typeof(*seq), aes_gmac));
-	case WLAN_CIPHER_SUITE_GCMP:
-	case WLAN_CIPHER_SUITE_GCMP_256:
-		BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) !=
-			     offsetof(typeof(*seq), gcmp));
-		pn64 = (u64)seq->ccmp.pn[5] |
-		       ((u64)seq->ccmp.pn[4] << 8) |
-		       ((u64)seq->ccmp.pn[3] << 16) |
-		       ((u64)seq->ccmp.pn[2] << 24) |
-		       ((u64)seq->ccmp.pn[1] << 32) |
-		       ((u64)seq->ccmp.pn[0] << 40);
-		atomic64_set(&key->conf.tx_pn, pn64);
-		break;
-	default:
-		WARN_ON(1);
-		break;
-	}
-}
-EXPORT_SYMBOL_GPL(ieee80211_set_key_tx_seq);
-
 void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf,
 			      int tid, struct ieee80211_key_seq *seq)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 65554d07adfc22bb9e14f6df8c609a646f869a74 Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Tue, 16 Feb 2016 12:48:17 +0200
Subject: mac80211: provide interface to driver to set VHT MU-MIMO data

Provide an interface to the lower level driver to set the VHT
MU-MIMO data. This is needed for example when there is an update
of the group data during low power state, where the management
frame will not be passed to the host at all.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 15 +++++++++++++++
 net/mac80211/vht.c     | 16 +++++++++++++++-
 2 files changed, 30 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 66155d3ad7e6..23f2a5ecf669 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -5445,6 +5445,21 @@ ieee80211_vif_type_p2p(struct ieee80211_vif *vif)
 	return ieee80211_iftype_p2p(vif->type, vif->p2p);
 }
 
+/**
+ * ieee80211_update_mu_groups - set the VHT MU-MIMO groud data
+ *
+ * @vif: the specified virtual interface
+ * @membership: 64 bits array - a bit is set if station is member of the group
+ * @position: 2 bits per group id indicating the position in the group
+ *
+ * Note: This function assumes that the given vif is valid and the position and
+ * membership data is of the correct size and are in the same byte order as the
+ * matching GroupId management frame.
+ * Calls to this function need to be serialized with RX path.
+ */
+void ieee80211_update_mu_groups(struct ieee80211_vif *vif,
+				const u8 *membership, const u8 *position);
+
 void ieee80211_enable_rssi_reports(struct ieee80211_vif *vif,
 				   int rssi_min_thold,
 				   int rssi_max_thold);
diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index 341d192cea52..f8f161179b5d 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -2,7 +2,7 @@
  * VHT handling
  *
  * Portions of this file
- * Copyright(c) 2015 Intel Deutschland GmbH
+ * Copyright(c) 2015 - 2016 Intel Deutschland GmbH
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -469,6 +469,20 @@ void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata,
 	ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_MU_GROUPS);
 }
 
+void ieee80211_update_mu_groups(struct ieee80211_vif *vif,
+				const u8 *membership, const u8 *position)
+{
+	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+	struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf;
+
+	if (WARN_ON_ONCE(!(sdata->flags & IEEE80211_SDATA_MU_MIMO_OWNER)))
+		return;
+
+	memcpy(bss_conf->mu_group.membership, membership, WLAN_MEMBERSHIP_LEN);
+	memcpy(bss_conf->mu_group.position, position, WLAN_USER_POSITION_LEN);
+}
+EXPORT_SYMBOL_GPL(ieee80211_update_mu_groups);
+
 void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
 				 struct sta_info *sta, u8 opmode,
 				 enum ieee80211_band band)
-- 
cgit v1.2.3-59-g8ed1b


From b5a33d52595f0cb153f09bf45a5dcd66a7418dbb Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Tue, 16 Feb 2016 12:48:18 +0200
Subject: mac80211: move MU_MIMO_OWNER flag to ieee80211_vif

Drivers may need to track which vif is using VHT MU-MIMO.
Move the flag indicationg the ownership of MU_MIMO to
ieee80211_vif.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     |  2 ++
 net/mac80211/ieee80211_i.h |  2 --
 net/mac80211/mlme.c        | 11 ++++++-----
 net/mac80211/util.c        |  2 +-
 net/mac80211/vht.c         |  7 +++----
 5 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 23f2a5ecf669..0c09da34b67a 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1382,6 +1382,7 @@ enum ieee80211_vif_flags {
  * @csa_active: marks whether a channel switch is going on. Internally it is
  *	write-protected by sdata_lock and local->mtx so holding either is fine
  *	for read access.
+ * @mu_mimo_owner: indicates interface owns MU-MIMO capability
  * @driver_flags: flags/capabilities the driver has for this interface,
  *	these need to be set (or cleared) when the interface is added
  *	or, if supported by the driver, the interface type is changed
@@ -1408,6 +1409,7 @@ struct ieee80211_vif {
 	u8 addr[ETH_ALEN];
 	bool p2p;
 	bool csa_active;
+	bool mu_mimo_owner;
 
 	u8 cab_queue;
 	u8 hw_queue[IEEE80211_NUM_ACS];
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index a49c10361f1c..1630975c89f1 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -716,7 +716,6 @@ struct ieee80211_if_mesh {
  *	back to wireless media and to the local net stack.
  * @IEEE80211_SDATA_DISCONNECT_RESUME: Disconnect after resume.
  * @IEEE80211_SDATA_IN_DRIVER: indicates interface was added to driver
- * @IEEE80211_SDATA_MU_MIMO_OWNER: indicates interface owns MU-MIMO capability
  */
 enum ieee80211_sub_if_data_flags {
 	IEEE80211_SDATA_ALLMULTI		= BIT(0),
@@ -724,7 +723,6 @@ enum ieee80211_sub_if_data_flags {
 	IEEE80211_SDATA_DONT_BRIDGE_PACKETS	= BIT(3),
 	IEEE80211_SDATA_DISCONNECT_RESUME	= BIT(4),
 	IEEE80211_SDATA_IN_DRIVER		= BIT(5),
-	IEEE80211_SDATA_MU_MIMO_OWNER		= BIT(6),
 };
 
 /**
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 857089de475f..f41625bcd879 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -6,7 +6,7 @@
  * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
  * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
- * Copyright (C) 2015 Intel Deutschland GmbH
+ * Copyright (C) 2015 - 2016 Intel Deutschland GmbH
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -559,7 +559,7 @@ static void ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata,
 		struct ieee80211_sub_if_data *other;
 
 		list_for_each_entry_rcu(other, &local->interfaces, list) {
-			if (other->flags & IEEE80211_SDATA_MU_MIMO_OWNER) {
+			if (other->vif.mu_mimo_owner) {
 				disable_mu_mimo = true;
 				break;
 			}
@@ -567,7 +567,7 @@ static void ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata,
 		if (disable_mu_mimo)
 			cap &= ~IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE;
 		else
-			sdata->flags |= IEEE80211_SDATA_MU_MIMO_OWNER;
+			sdata->vif.mu_mimo_owner = true;
 	}
 
 	mask = IEEE80211_VHT_CAP_BEAMFORMEE_STS_MASK;
@@ -2052,7 +2052,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 	memset(sdata->vif.bss_conf.mu_group.position, 0,
 	       sizeof(sdata->vif.bss_conf.mu_group.position));
 	changed |= BSS_CHANGED_MU_GROUPS;
-	sdata->flags &= ~IEEE80211_SDATA_MU_MIMO_OWNER;
+	sdata->vif.mu_mimo_owner = false;
 
 	sdata->ap_power_level = IEEE80211_UNSET_POWER_LEVEL;
 
@@ -2509,7 +2509,8 @@ static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata,
 		eth_zero_addr(sdata->u.mgd.bssid);
 		ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID);
 		sdata->u.mgd.flags = 0;
-		sdata->flags &= ~IEEE80211_SDATA_MU_MIMO_OWNER;
+		sdata->vif.mu_mimo_owner = false;
+
 		mutex_lock(&sdata->local->mtx);
 		ieee80211_vif_release_channel(sdata);
 		mutex_unlock(&sdata->local->mtx);
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index f1e5b76eda70..89f71799df84 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -1928,7 +1928,7 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 			  BSS_CHANGED_IDLE |
 			  BSS_CHANGED_TXPOWER;
 
-		if (sdata->flags & IEEE80211_SDATA_MU_MIMO_OWNER)
+		if (sdata->vif.mu_mimo_owner)
 			changed |= BSS_CHANGED_MU_GROUPS;
 
 		switch (sdata->vif.type) {
diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index f8f161179b5d..89e04d55aa18 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -450,7 +450,7 @@ void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata,
 {
 	struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf;
 
-	if (!(sdata->flags & IEEE80211_SDATA_MU_MIMO_OWNER))
+	if (!sdata->vif.mu_mimo_owner)
 		return;
 
 	if (!memcmp(mgmt->u.action.u.vht_group_notif.position,
@@ -472,10 +472,9 @@ void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata,
 void ieee80211_update_mu_groups(struct ieee80211_vif *vif,
 				const u8 *membership, const u8 *position)
 {
-	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
-	struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf;
+	struct ieee80211_bss_conf *bss_conf = &vif->bss_conf;
 
-	if (WARN_ON_ONCE(!(sdata->flags & IEEE80211_SDATA_MU_MIMO_OWNER)))
+	if (WARN_ON_ONCE(!vif->mu_mimo_owner))
 		return;
 
 	memcpy(bss_conf->mu_group.membership, membership, WLAN_MEMBERSHIP_LEN);
-- 
cgit v1.2.3-59-g8ed1b


From 0c9ca11b1ae8eb16c1b6bbae91991392d2321372 Mon Sep 17 00:00:00 2001
From: Beni Lev <beni.lev@intel.com>
Date: Wed, 17 Feb 2016 20:30:00 +0200
Subject: cfg80211: Add global RRM capability

Today, the supplicant will add the RRM capabilities
Information Element in the association request only if
Quiet period is supported (NL80211_FEATURE_QUIET).

Quiet is one of many RRM features, and there are other RRM
features that are not related to Quiet (e.g. neighbor
report). Therefore, requiring Quiet to enable RRM is too
restrictive.
Some of the features, like neighbor report, can be
supported by user space without any help from the kernel.
Hence adding the RRM capabilities IE to association request
should be the sole user space's decision.
Removing the RRM dependency on Quiet in the driver solves
this problem, but using an old driver with a user space
tool that would not require Quiet feature would be
problematic: the user space would add NL80211_ATTR_USE_RRM
in the association request even if the kernel doesn't
advertize NL80211_FEATURE_QUIET and the association would
be denied by the kernel.

This solution adds a global RRM capability, that tells user
space that it can request RRM capabilities IE publishment
without any specific feature support in the kernel.

Signed-off-by: Beni Lev <beni.lev@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h |  8 ++++++++
 net/wireless/nl80211.c       | 18 +++++++++++-------
 2 files changed, 19 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 7758969a2a8e..5a30a7563633 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1727,6 +1727,8 @@ enum nl80211_commands {
  *	underlying device supports these minimal RRM features:
  *		%NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES,
  *		%NL80211_FEATURE_QUIET,
+ *	Or, if global RRM is supported, see:
+ *		%NL80211_EXT_FEATURE_RRM
  *	If this flag is used, driver must add the Power Capabilities IE to the
  *	association request. In addition, it must also set the RRM capability
  *	flag in the association request's Capability Info field.
@@ -4402,12 +4404,18 @@ enum nl80211_feature_flags {
 /**
  * enum nl80211_ext_feature_index - bit index of extended features.
  * @NL80211_EXT_FEATURE_VHT_IBSS: This driver supports IBSS with VHT datarates.
+ * @NL80211_EXT_FEATURE_RRM: This driver supports RRM. When featured, user can
+ *	can request to use RRM (see %NL80211_ATTR_USE_RRM) with
+ *	%NL80211_CMD_ASSOCIATE and %NL80211_CMD_CONNECT requests, which will set
+ *	the ASSOC_REQ_USE_RRM flag in the association request even if
+ *	NL80211_FEATURE_QUIET is not advertized.
  *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
  */
 enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_VHT_IBSS,
+	NL80211_EXT_FEATURE_RRM,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 268cb493f6a5..90890f183c0e 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -3,7 +3,7 @@
  *
  * Copyright 2006-2010	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
- * Copyright 2015	Intel Deutschland GmbH
+ * Copyright 2015-2016	Intel Deutschland GmbH
  */
 
 #include <linux/if.h>
@@ -7286,9 +7286,11 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
 	}
 
 	if (nla_get_flag(info->attrs[NL80211_ATTR_USE_RRM])) {
-		if (!(rdev->wiphy.features &
-		      NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) ||
-		    !(rdev->wiphy.features & NL80211_FEATURE_QUIET))
+		if (!((rdev->wiphy.features &
+			NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) &&
+		       (rdev->wiphy.features & NL80211_FEATURE_QUIET)) &&
+		    !wiphy_ext_feature_isset(&rdev->wiphy,
+					     NL80211_EXT_FEATURE_RRM))
 			return -EINVAL;
 		req.flags |= ASSOC_REQ_USE_RRM;
 	}
@@ -7976,9 +7978,11 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
 	}
 
 	if (nla_get_flag(info->attrs[NL80211_ATTR_USE_RRM])) {
-		if (!(rdev->wiphy.features &
-		      NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) ||
-		    !(rdev->wiphy.features & NL80211_FEATURE_QUIET)) {
+		if (!((rdev->wiphy.features &
+			NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) &&
+		       (rdev->wiphy.features & NL80211_FEATURE_QUIET)) &&
+		    !wiphy_ext_feature_isset(&rdev->wiphy,
+					     NL80211_EXT_FEATURE_RRM)) {
 			kzfree(connkeys);
 			return -EINVAL;
 		}
-- 
cgit v1.2.3-59-g8ed1b


From 7837a7778268191dae5f6622f2b92b8b37cb8d7f Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo.bianconi83@gmail.com>
Date: Fri, 19 Feb 2016 11:43:04 +0100
Subject: cfg80211: add radiotap VHT info to rtap_namespace_sizes

Add IEEE80211_RADIOTAP_VHT entry to rtap_namespace_sizes array in order to
define alignment and size of VHT info in tx radiotap

Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi83@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/radiotap.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/wireless/radiotap.c b/net/wireless/radiotap.c
index 722da616438c..6582d155e2fc 100644
--- a/net/wireless/radiotap.c
+++ b/net/wireless/radiotap.c
@@ -43,6 +43,7 @@ static const struct radiotap_align_size rtap_namespace_sizes[] = {
 	[IEEE80211_RADIOTAP_DATA_RETRIES] = { .align = 1, .size = 1, },
 	[IEEE80211_RADIOTAP_MCS] = { .align = 1, .size = 3, },
 	[IEEE80211_RADIOTAP_AMPDU_STATUS] = { .align = 4, .size = 8, },
+	[IEEE80211_RADIOTAP_VHT] = { .align = 2, .size = 12, },
 	/*
 	 * add more here as they are defined in radiotap.h
 	 */
-- 
cgit v1.2.3-59-g8ed1b


From 648b50dd6abf8e6e5b589bb8e6873a4596389dbe Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Mon, 25 Jan 2016 12:03:46 +0300
Subject: net: rfkill: add rfkill_find_type function

Helper for finding the type based on name. Useful if the
type needs to be determined based on device property.

Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
[modify rfkill_types array and BUILD_BUG_ON to not cause errors]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/rfkill.h | 15 +++++++++++++
 net/rfkill/core.c      | 58 ++++++++++++++++++++++++++------------------------
 2 files changed, 45 insertions(+), 28 deletions(-)

(limited to 'net')

diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h
index 7af625f6d226..e6a0031d1b1f 100644
--- a/include/linux/rfkill.h
+++ b/include/linux/rfkill.h
@@ -213,6 +213,15 @@ void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw);
  * @rfkill: rfkill struct to query
  */
 bool rfkill_blocked(struct rfkill *rfkill);
+
+/**
+ * rfkill_find_type - Helpper for finding rfkill type by name
+ * @name: the name of the type
+ *
+ * Returns enum rfkill_type that conrresponds the name.
+ */
+enum rfkill_type rfkill_find_type(const char *name);
+
 #else /* !RFKILL */
 static inline struct rfkill * __must_check
 rfkill_alloc(const char *name,
@@ -269,6 +278,12 @@ static inline bool rfkill_blocked(struct rfkill *rfkill)
 {
 	return false;
 }
+
+static inline enum rfkill_type rfkill_find_type(const char *name)
+{
+	return RFKILL_TYPE_ALL;
+}
+
 #endif /* RFKILL || RFKILL_MODULE */
 
 
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index a805831d5d9b..2a23479a49f2 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -572,6 +572,34 @@ void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw)
 }
 EXPORT_SYMBOL(rfkill_set_states);
 
+static const char * const rfkill_types[] = {
+	NULL, /* RFKILL_TYPE_ALL */
+	"wlan",
+	"bluetooth",
+	"ultrawideband",
+	"wimax",
+	"wwan",
+	"gps",
+	"fm",
+	"nfc",
+};
+
+enum rfkill_type rfkill_find_type(const char *name)
+{
+	int i;
+
+	BUILD_BUG_ON(ARRAY_SIZE(rfkill_types) != NUM_RFKILL_TYPES);
+
+	if (!name)
+		return RFKILL_TYPE_ALL;
+
+	for (i = 1; i < NUM_RFKILL_TYPES; i++)
+		if (!strcmp(name, rfkill_types[i]))
+			return i;
+	return RFKILL_TYPE_ALL;
+}
+EXPORT_SYMBOL(rfkill_find_type);
+
 static ssize_t name_show(struct device *dev, struct device_attribute *attr,
 			 char *buf)
 {
@@ -581,38 +609,12 @@ static ssize_t name_show(struct device *dev, struct device_attribute *attr,
 }
 static DEVICE_ATTR_RO(name);
 
-static const char *rfkill_get_type_str(enum rfkill_type type)
-{
-	BUILD_BUG_ON(NUM_RFKILL_TYPES != RFKILL_TYPE_NFC + 1);
-
-	switch (type) {
-	case RFKILL_TYPE_WLAN:
-		return "wlan";
-	case RFKILL_TYPE_BLUETOOTH:
-		return "bluetooth";
-	case RFKILL_TYPE_UWB:
-		return "ultrawideband";
-	case RFKILL_TYPE_WIMAX:
-		return "wimax";
-	case RFKILL_TYPE_WWAN:
-		return "wwan";
-	case RFKILL_TYPE_GPS:
-		return "gps";
-	case RFKILL_TYPE_FM:
-		return "fm";
-	case RFKILL_TYPE_NFC:
-		return "nfc";
-	default:
-		BUG();
-	}
-}
-
 static ssize_t type_show(struct device *dev, struct device_attribute *attr,
 			 char *buf)
 {
 	struct rfkill *rfkill = to_rfkill(dev);
 
-	return sprintf(buf, "%s\n", rfkill_get_type_str(rfkill->type));
+	return sprintf(buf, "%s\n", rfkill_types[rfkill->type]);
 }
 static DEVICE_ATTR_RO(type);
 
@@ -750,7 +752,7 @@ static int rfkill_dev_uevent(struct device *dev, struct kobj_uevent_env *env)
 	if (error)
 		return error;
 	error = add_uevent_var(env, "RFKILL_TYPE=%s",
-			       rfkill_get_type_str(rfkill->type));
+			       rfkill_types[rfkill->type]);
 	if (error)
 		return error;
 	spin_lock_irqsave(&rfkill->lock, flags);
-- 
cgit v1.2.3-59-g8ed1b


From 7d5e9737efda16535e5b54bd627ef4881d11d31f Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Mon, 25 Jan 2016 12:03:47 +0300
Subject: net: rfkill: gpio: get the name and type from device property

This prepares the driver for removal of platform data.

Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/rfkill/rfkill-gpio.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c
index 4b1e3f35f06c..1a9c0316aad1 100644
--- a/net/rfkill/rfkill-gpio.c
+++ b/net/rfkill/rfkill-gpio.c
@@ -81,7 +81,6 @@ static int rfkill_gpio_acpi_probe(struct device *dev,
 	if (!id)
 		return -ENODEV;
 
-	rfkill->name = dev_name(dev);
 	rfkill->type = (unsigned)id->driver_data;
 
 	return acpi_dev_add_driver_gpios(ACPI_COMPANION(dev),
@@ -93,12 +92,21 @@ static int rfkill_gpio_probe(struct platform_device *pdev)
 	struct rfkill_gpio_platform_data *pdata = pdev->dev.platform_data;
 	struct rfkill_gpio_data *rfkill;
 	struct gpio_desc *gpio;
+	const char *type_name;
 	int ret;
 
 	rfkill = devm_kzalloc(&pdev->dev, sizeof(*rfkill), GFP_KERNEL);
 	if (!rfkill)
 		return -ENOMEM;
 
+	device_property_read_string(&pdev->dev, "name", &rfkill->name);
+	device_property_read_string(&pdev->dev, "type", &type_name);
+
+	if (!rfkill->name)
+		rfkill->name = dev_name(&pdev->dev);
+
+	rfkill->type = rfkill_find_type(type_name);
+
 	if (ACPI_HANDLE(&pdev->dev)) {
 		ret = rfkill_gpio_acpi_probe(&pdev->dev, rfkill);
 		if (ret)
@@ -124,10 +132,8 @@ static int rfkill_gpio_probe(struct platform_device *pdev)
 
 	rfkill->shutdown_gpio = gpio;
 
-	/* Make sure at-least one of the GPIO is defined and that
-	 * a name is specified for this instance
-	 */
-	if ((!rfkill->reset_gpio && !rfkill->shutdown_gpio) || !rfkill->name) {
+	/* Make sure at-least one GPIO is defined for this instance */
+	if (!rfkill->reset_gpio && !rfkill->shutdown_gpio) {
 		dev_err(&pdev->dev, "invalid platform data\n");
 		return -EINVAL;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From fb2e6b7b7b02ab35a9d5355a69097a6f60c69d38 Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Mon, 25 Jan 2016 12:03:49 +0300
Subject: net: rfkill: gpio: remove rfkill_gpio_platform_data

No more users for it.

Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/rfkill-gpio.h | 37 -------------------------------------
 net/rfkill/Kconfig          |  3 +--
 net/rfkill/rfkill-gpio.c    |  8 --------
 3 files changed, 1 insertion(+), 47 deletions(-)
 delete mode 100644 include/linux/rfkill-gpio.h

(limited to 'net')

diff --git a/include/linux/rfkill-gpio.h b/include/linux/rfkill-gpio.h
deleted file mode 100644
index 20bcb55498cd..000000000000
--- a/include/linux/rfkill-gpio.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2011, NVIDIA Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
- */
-
-
-#ifndef __RFKILL_GPIO_H
-#define __RFKILL_GPIO_H
-
-#include <linux/types.h>
-#include <linux/rfkill.h>
-
-/**
- * struct rfkill_gpio_platform_data - platform data for rfkill gpio device.
- * for unused gpio's, the expected value is -1.
- * @name:		name for the gpio rf kill instance
- */
-
-struct rfkill_gpio_platform_data {
-	char			*name;
-	enum rfkill_type	type;
-};
-
-#endif /* __RFKILL_GPIO_H */
diff --git a/net/rfkill/Kconfig b/net/rfkill/Kconfig
index 598d374f6a35..868f1ad0415a 100644
--- a/net/rfkill/Kconfig
+++ b/net/rfkill/Kconfig
@@ -41,5 +41,4 @@ config RFKILL_GPIO
 	default n
 	help
 	  If you say yes here you get support of a generic gpio RFKILL
-	  driver. The platform should fill in the appropriate fields in the
-	  rfkill_gpio_platform_data structure and pass that to the driver.
+	  driver.
diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c
index 1a9c0316aad1..76c01cbd56e3 100644
--- a/net/rfkill/rfkill-gpio.c
+++ b/net/rfkill/rfkill-gpio.c
@@ -27,8 +27,6 @@
 #include <linux/acpi.h>
 #include <linux/gpio/consumer.h>
 
-#include <linux/rfkill-gpio.h>
-
 struct rfkill_gpio_data {
 	const char		*name;
 	enum rfkill_type	type;
@@ -89,7 +87,6 @@ static int rfkill_gpio_acpi_probe(struct device *dev,
 
 static int rfkill_gpio_probe(struct platform_device *pdev)
 {
-	struct rfkill_gpio_platform_data *pdata = pdev->dev.platform_data;
 	struct rfkill_gpio_data *rfkill;
 	struct gpio_desc *gpio;
 	const char *type_name;
@@ -111,11 +108,6 @@ static int rfkill_gpio_probe(struct platform_device *pdev)
 		ret = rfkill_gpio_acpi_probe(&pdev->dev, rfkill);
 		if (ret)
 			return ret;
-	} else if (pdata) {
-		rfkill->name = pdata->name;
-		rfkill->type = pdata->type;
-	} else {
-		return -ENODEV;
 	}
 
 	rfkill->clk = devm_clk_get(&pdev->dev, NULL);
-- 
cgit v1.2.3-59-g8ed1b


From 3ff707d66881d308ef71a0939aa7a92ae1290702 Mon Sep 17 00:00:00 2001
From: João Paulo Rechi Vita <jprvita@gmail.com>
Date: Mon, 22 Feb 2016 11:36:32 -0500
Subject: rfkill: Improve documentation language
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: João Paulo Rechi Vita <jprvita@endlessm.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/rfkill/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index 2a23479a49f2..8f6906eb08f3 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -282,8 +282,8 @@ static void rfkill_set_block(struct rfkill *rfkill, bool blocked)
 	spin_lock_irqsave(&rfkill->lock, flags);
 	if (err) {
 		/*
-		 * Failed -- reset status to _prev, this may be different
-		 * from what set set _PREV to earlier in this function
+		 * Failed -- reset status to _PREV, which may be different
+		 * from what we have set _PREV to earlier in this function
 		 * if rfkill_set_sw_state was invoked.
 		 */
 		if (rfkill->state & RFKILL_BLOCK_SW_PREV)
-- 
cgit v1.2.3-59-g8ed1b


From 1a1078901b72e5e8046579cc54250e4f167269d5 Mon Sep 17 00:00:00 2001
From: João Paulo Rechi Vita <jprvita@gmail.com>
Date: Mon, 22 Feb 2016 11:36:33 -0500
Subject: rfkill: Remove extra blank line
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: João Paulo Rechi Vita <jprvita@endlessm.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/rfkill/core.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index 8f6906eb08f3..f843eeeac46d 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -455,7 +455,6 @@ bool rfkill_get_global_sw_state(const enum rfkill_type type)
 }
 #endif
 
-
 bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked)
 {
 	unsigned long flags;
-- 
cgit v1.2.3-59-g8ed1b


From 9487bd6b96a98ee07661a7b5e6f6afce67e2860b Mon Sep 17 00:00:00 2001
From: João Paulo Rechi Vita <jprvita@gmail.com>
Date: Mon, 22 Feb 2016 11:36:36 -0500
Subject: rfkill: Factor rfkill_global_states[].cur assignments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Factor all assignments to rfkill_global_states[].cur into a single
function rfkill_update_global_state().

Signed-off-by: João Paulo Rechi Vita <jprvita@endlessm.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/rfkill/core.c | 38 +++++++++++++++++---------------------
 1 file changed, 17 insertions(+), 21 deletions(-)

(limited to 'net')

diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index f843eeeac46d..a8c05e18da58 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -302,6 +302,19 @@ static void rfkill_set_block(struct rfkill *rfkill, bool blocked)
 		rfkill_event(rfkill);
 }
 
+static void rfkill_update_global_state(enum rfkill_type type, bool blocked)
+{
+	int i;
+
+	if (type != RFKILL_TYPE_ALL) {
+		rfkill_global_states[type].cur = blocked;
+		return;
+	}
+
+	for (i = 0; i < NUM_RFKILL_TYPES; i++)
+		rfkill_global_states[i].cur = blocked;
+}
+
 #ifdef CONFIG_RFKILL_INPUT
 static atomic_t rfkill_input_disabled = ATOMIC_INIT(0);
 
@@ -319,15 +332,7 @@ static void __rfkill_switch_all(const enum rfkill_type type, bool blocked)
 {
 	struct rfkill *rfkill;
 
-	if (type == RFKILL_TYPE_ALL) {
-		int i;
-
-		for (i = 0; i < NUM_RFKILL_TYPES; i++)
-			rfkill_global_states[i].cur = blocked;
-	} else {
-		rfkill_global_states[type].cur = blocked;
-	}
-
+	rfkill_update_global_state(type, blocked);
 	list_for_each_entry(rfkill, &rfkill_list, node) {
 		if (rfkill->type != type && type != RFKILL_TYPE_ALL)
 			continue;
@@ -1166,15 +1171,8 @@ static ssize_t rfkill_fop_write(struct file *file, const char __user *buf,
 
 	mutex_lock(&rfkill_global_mutex);
 
-	if (ev.op == RFKILL_OP_CHANGE_ALL) {
-		if (ev.type == RFKILL_TYPE_ALL) {
-			enum rfkill_type i;
-			for (i = 0; i < NUM_RFKILL_TYPES; i++)
-				rfkill_global_states[i].cur = ev.soft;
-		} else {
-			rfkill_global_states[ev.type].cur = ev.soft;
-		}
-	}
+	if (ev.op == RFKILL_OP_CHANGE_ALL)
+		rfkill_update_global_state(ev.type, ev.soft);
 
 	list_for_each_entry(rfkill, &rfkill_list, node) {
 		if (rfkill->idx != ev.idx && ev.op != RFKILL_OP_CHANGE_ALL)
@@ -1263,10 +1261,8 @@ static struct miscdevice rfkill_miscdev = {
 static int __init rfkill_init(void)
 {
 	int error;
-	int i;
 
-	for (i = 0; i < NUM_RFKILL_TYPES; i++)
-		rfkill_global_states[i].cur = !rfkill_default_state;
+	rfkill_update_global_state(RFKILL_TYPE_ALL, !rfkill_default_state);
 
 	error = class_register(&rfkill_class);
 	if (error)
-- 
cgit v1.2.3-59-g8ed1b


From 8790404de0e5f47ef721e2ed36ebb0f2314722ac Mon Sep 17 00:00:00 2001
From: Alexander Aring <aar@pengutronix.de>
Date: Wed, 24 Feb 2016 12:01:03 +0100
Subject: 6lowpan: iphc: fix stateful multicast compression

In case of multicast address we need to set always the LOWPAN_IPHC_M bit
and if a destination context identifier was found for a multicast
address then we need to set the LOWPAN_IPHC_DAC as well.

Signed-off-by: Alexander Aring <aar@pengutronix.de>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 net/6lowpan/iphc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/6lowpan/iphc.c b/net/6lowpan/iphc.c
index d2a565cde4f4..8dd8388c8eba 100644
--- a/net/6lowpan/iphc.c
+++ b/net/6lowpan/iphc.c
@@ -1118,12 +1118,13 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev,
 	/* destination address compression */
 	if (ipv6_daddr_type & IPV6_ADDR_MULTICAST) {
 		pr_debug("destination address is multicast: ");
+		iphc1 |= LOWPAN_IPHC_M;
 		if (dci) {
 			iphc1 |= lowpan_iphc_mcast_ctx_addr_compress(&hc_ptr,
 								     &dci_entry,
 								     &hdr->daddr);
+			iphc1 |= LOWPAN_IPHC_DAC;
 		} else {
-			iphc1 |= LOWPAN_IPHC_M;
 			iphc1 |= lowpan_iphc_mcast_addr_compress(&hc_ptr,
 								 &hdr->daddr);
 		}
-- 
cgit v1.2.3-59-g8ed1b


From 4c23d8745731b1b54bb8eadfc42070d7f9893b46 Mon Sep 17 00:00:00 2001
From: Andrzej Hajda <a.hajda@samsung.com>
Date: Wed, 24 Feb 2016 09:32:13 +0100
Subject: 6lowpan: fix error checking code

Bool variable 'fail' is always non-negative, it indicates an error if it
is true.

The problem has been detected using coccinelle script
scripts/coccinelle/tests/unsigned_lesser_than_zero.cocci

Signed-off-by: Andrzej Hajda <a.hajda@samsung.com>
Acked-by: Alexander Aring <aar@pengutronix.de>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 net/6lowpan/iphc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/6lowpan/iphc.c b/net/6lowpan/iphc.c
index 8dd8388c8eba..06287ac03211 100644
--- a/net/6lowpan/iphc.c
+++ b/net/6lowpan/iphc.c
@@ -482,7 +482,7 @@ static int lowpan_uncompress_multicast_ctx_daddr(struct sk_buff *skb,
 	ipaddr->s6_addr[0] = 0xFF;
 	fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[1], 2);
 	fail |= lowpan_fetch_skb(skb, &ipaddr->s6_addr[12], 4);
-	if (fail < 0)
+	if (fail)
 		return -EIO;
 
 	/* take prefix_len and network prefix from the context */
-- 
cgit v1.2.3-59-g8ed1b


From e5fbfc1c2d7657eafed645727cb5d74731f6d68c Mon Sep 17 00:00:00 2001
From: Craig Gallek <kraig@google.com>
Date: Mon, 22 Feb 2016 10:45:29 -0500
Subject: soreuseport: fix merge conflict in tcp bind

One of the validation checks for the new array-based TCP SO_REUSEPORT
validation was unintentionally dropped in ea8add2b1903.  This adds it back.

Lack of this check allows the user to allocate multiple sock_reuseport
structures (leaking all but the first).

Fixes: ea8add2b1903 ("tcp/dccp: better use of ephemeral ports in bind()")
Signed-off-by: Craig Gallek <kraig@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_connection_sock.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index d7682306370b..bc5196ea1bdf 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -202,6 +202,7 @@ tb_found:
 
 		if (((tb->fastreuse > 0 && reuse) ||
 		     (tb->fastreuseport > 0 &&
+		      !rcu_access_pointer(sk->sk_reuseport_cb) &&
 		      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
 		    smallest_size == -1)
 			goto success;
-- 
cgit v1.2.3-59-g8ed1b


From 918c023f29ab2dd8c63cfcc6a1239ee15933871a Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Wed, 24 Feb 2016 09:29:38 -0800
Subject: flow_dissector: Check for IP fragmentation even if not using IPv4
 address

This patch corrects the logic for the IPv4 parsing so that it is consistent
with how we handle IPv6.  Specifically if we do not have the flow key
indicating we want the addresses we still may need to take a look at the IP
fragmentation bits and to see if we should stop after we have recognized
the L3 header.

Fixes: 807e165dc44f ("flow_dissector: Add control/reporting of fragmentation")
Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Acked-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/flow_dissector.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 12e700332010..1f88f8280280 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -178,15 +178,16 @@ ip:
 
 		ip_proto = iph->protocol;
 
-		if (!dissector_uses_key(flow_dissector,
-					FLOW_DISSECTOR_KEY_IPV4_ADDRS))
-			break;
+		if (dissector_uses_key(flow_dissector,
+				       FLOW_DISSECTOR_KEY_IPV4_ADDRS)) {
+			key_addrs = skb_flow_dissector_target(flow_dissector,
+							      FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+							      target_container);
 
-		key_addrs = skb_flow_dissector_target(flow_dissector,
-			      FLOW_DISSECTOR_KEY_IPV4_ADDRS, target_container);
-		memcpy(&key_addrs->v4addrs, &iph->saddr,
-		       sizeof(key_addrs->v4addrs));
-		key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+			memcpy(&key_addrs->v4addrs, &iph->saddr,
+			       sizeof(key_addrs->v4addrs));
+			key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+		}
 
 		if (ip_is_fragment(iph)) {
 			key_control->flags |= FLOW_DIS_IS_FRAGMENT;
-- 
cgit v1.2.3-59-g8ed1b


From 43d2ccb3c122a47524019d1831a54f07f7fcb978 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Wed, 24 Feb 2016 09:29:44 -0800
Subject: flow_dissector: Fix fragment handling for header length computation

It turns out that for IPv4 we were reporting the ip_proto of the fragment,
and for IPv6 we were not.  This patch updates that behavior so that we
always report the IP protocol of the fragment.  In addition it takes the
steps of updating the payload offset code so that we will determine the
start of the payload not including the L4 header for any fragment after the
first.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Acked-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/flow_dissector.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 1f88f8280280..8bd745f72734 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -448,13 +448,12 @@ ip_proto_again:
 		key_control->flags |= FLOW_DIS_IS_FRAGMENT;
 
 		nhoff += sizeof(_fh);
+		ip_proto = fh->nexthdr;
 
 		if (!(fh->frag_off & htons(IP6_OFFSET))) {
 			key_control->flags |= FLOW_DIS_FIRST_FRAG;
-			if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) {
-				ip_proto = fh->nexthdr;
+			if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG)
 				goto ip_proto_again;
-			}
 		}
 		goto out_good;
 	}
@@ -741,6 +740,11 @@ u32 __skb_get_poff(const struct sk_buff *skb, void *data,
 {
 	u32 poff = keys->control.thoff;
 
+	/* skip L4 headers for fragments after the first */
+	if ((keys->control.flags & FLOW_DIS_IS_FRAGMENT) &&
+	    !(keys->control.flags & FLOW_DIS_FIRST_FRAG))
+		return poff;
+
 	switch (keys->basic.ip_proto) {
 	case IPPROTO_TCP: {
 		/* access doff as u8 to avoid unaligned access */
-- 
cgit v1.2.3-59-g8ed1b


From 224516b3a798a0563346748744f8cd19feaf09be Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Wed, 24 Feb 2016 09:29:51 -0800
Subject: flow_dissector: Correctly handle parsing FCoE

The flow dissector bits handling FCoE didn't bother to actually validate
that the space there was enough for the FCoE header.  So we need to update
things so that if there is room we add the header and report a good result,
otherwise we do not add the header, and report the bad result.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Acked-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/flow_dissector.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 8bd745f72734..6288153d7f36 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -340,8 +340,11 @@ mpls:
 	}
 
 	case htons(ETH_P_FCOE):
-		key_control->thoff = (u16)(nhoff + FCOE_HEADER_LEN);
-		/* fall through */
+		if ((hlen - nhoff) < FCOE_HEADER_LEN)
+			goto out_bad;
+
+		nhoff += FCOE_HEADER_LEN;
+		goto out_good;
 	default:
 		goto out_bad;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From b3c3106ce3f4646a008cd238b16f899ae14fd2a7 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Wed, 24 Feb 2016 09:29:57 -0800
Subject: flow_dissector: Use same pointer for IPv4 and IPv6 addresses

The IPv6 parsing was using a local pointer when it could use the same
pointer as the IPv4 portion of the code since the key_addrs can support
both IPv4 and IPv6 as it is just a pointer.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Acked-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/flow_dissector.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 6288153d7f36..7c7b8739b8b8 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -220,13 +220,12 @@ ipv6:
 
 		if (dissector_uses_key(flow_dissector,
 				       FLOW_DISSECTOR_KEY_IPV6_ADDRS)) {
-			struct flow_dissector_key_ipv6_addrs *key_ipv6_addrs;
-
-			key_ipv6_addrs = skb_flow_dissector_target(flow_dissector,
-								   FLOW_DISSECTOR_KEY_IPV6_ADDRS,
-								   target_container);
+			key_addrs = skb_flow_dissector_target(flow_dissector,
+							      FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+							      target_container);
 
-			memcpy(key_ipv6_addrs, &iph->saddr, sizeof(*key_ipv6_addrs));
+			memcpy(&key_addrs->v6addrs, &iph->saddr,
+			       sizeof(key_addrs->v6addrs));
 			key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
 		}
 
-- 
cgit v1.2.3-59-g8ed1b


From d975ddd69698d068a8ebd294c382f1c791b380da Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Wed, 24 Feb 2016 09:30:04 -0800
Subject: eth: Pull header from first fragment via eth_get_headlen

We want to try and pull the L4 header in if it is available in the first
fragment.  As such add the flag to indicate we want to pull the headers on
the first fragment in.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Acked-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ethernet/eth.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 103871784e50..66dff5e3d772 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -125,6 +125,7 @@ EXPORT_SYMBOL(eth_header);
  */
 u32 eth_get_headlen(void *data, unsigned int len)
 {
+	const unsigned int flags = FLOW_DISSECTOR_F_PARSE_1ST_FRAG;
 	const struct ethhdr *eth = (const struct ethhdr *)data;
 	struct flow_keys keys;
 
@@ -134,7 +135,7 @@ u32 eth_get_headlen(void *data, unsigned int len)
 
 	/* parse any remaining L2/L3 headers, check for L4 */
 	if (!skb_flow_dissect_flow_keys_buf(&keys, data, eth->h_proto,
-					    sizeof(*eth), len, 0))
+					    sizeof(*eth), len, flags))
 		return max_t(u32, keys.control.thoff, sizeof(*eth));
 
 	/* parse for any L4 headers */
-- 
cgit v1.2.3-59-g8ed1b


From 45040978c8994d1401baf5cc5ac71c1495d4e120 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Wed, 24 Feb 2016 20:32:21 +0100
Subject: netfilter: ipset: Fix set:list type crash when flush/dump set in
 parallel

Flushing/listing entries was not RCU safe, so parallel flush/dump
could lead to kernel crash. Bug reported by Deniz Eren.

Fixes netfilter bugzilla id #1050.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 net/netfilter/ipset/ip_set_core.c     |  3 ++
 net/netfilter/ipset/ip_set_list_set.c | 55 ++++++++++++++++-------------------
 2 files changed, 28 insertions(+), 30 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 95db43fc0303..7e6568cad494 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -985,6 +985,9 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl,
 	if (unlikely(protocol_failed(attr)))
 		return -IPSET_ERR_PROTOCOL;
 
+	/* Must wait for flush to be really finished in list:set */
+	rcu_barrier();
+
 	/* Commands are serialized and references are
 	 * protected by the ip_set_ref_lock.
 	 * External systems (i.e. xt_set) must call
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index bbede95c9f68..24c6c1962aea 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -30,6 +30,7 @@ MODULE_ALIAS("ip_set_list:set");
 struct set_elem {
 	struct rcu_head rcu;
 	struct list_head list;
+	struct ip_set *set;	/* Sigh, in order to cleanup reference */
 	ip_set_id_t id;
 } __aligned(__alignof__(u64));
 
@@ -151,30 +152,29 @@ list_set_kadt(struct ip_set *set, const struct sk_buff *skb,
 /* Userspace interfaces: we are protected by the nfnl mutex */
 
 static void
-__list_set_del(struct ip_set *set, struct set_elem *e)
+__list_set_del_rcu(struct rcu_head * rcu)
 {
+	struct set_elem *e = container_of(rcu, struct set_elem, rcu);
+	struct ip_set *set = e->set;
 	struct list_set *map = set->data;
 
 	ip_set_put_byindex(map->net, e->id);
-	/* We may call it, because we don't have a to be destroyed
-	 * extension which is used by the kernel.
-	 */
 	ip_set_ext_destroy(set, e);
-	kfree_rcu(e, rcu);
+	kfree(e);
 }
 
 static inline void
 list_set_del(struct ip_set *set, struct set_elem *e)
 {
 	list_del_rcu(&e->list);
-	__list_set_del(set, e);
+	call_rcu(&e->rcu, __list_set_del_rcu);
 }
 
 static inline void
-list_set_replace(struct ip_set *set, struct set_elem *e, struct set_elem *old)
+list_set_replace(struct set_elem *e, struct set_elem *old)
 {
 	list_replace_rcu(&old->list, &e->list);
-	__list_set_del(set, old);
+	call_rcu(&old->rcu, __list_set_del_rcu);
 }
 
 static void
@@ -244,9 +244,6 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	struct set_elem *e, *n, *prev, *next;
 	bool flag_exist = flags & IPSET_FLAG_EXIST;
 
-	if (SET_WITH_TIMEOUT(set))
-		set_cleanup_entries(set);
-
 	/* Find where to add the new entry */
 	n = prev = next = NULL;
 	list_for_each_entry(e, &map->members, list) {
@@ -301,10 +298,11 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	if (!e)
 		return -ENOMEM;
 	e->id = d->id;
+	e->set = set;
 	INIT_LIST_HEAD(&e->list);
 	list_set_init_extensions(set, ext, e);
 	if (n)
-		list_set_replace(set, e, n);
+		list_set_replace(e, n);
 	else if (next)
 		list_add_tail_rcu(&e->list, &next->list);
 	else if (prev)
@@ -431,6 +429,7 @@ list_set_destroy(struct ip_set *set)
 
 	if (SET_WITH_TIMEOUT(set))
 		del_timer_sync(&map->gc);
+
 	list_for_each_entry_safe(e, n, &map->members, list) {
 		list_del(&e->list);
 		ip_set_put_byindex(map->net, e->id);
@@ -450,8 +449,10 @@ list_set_head(struct ip_set *set, struct sk_buff *skb)
 	struct set_elem *e;
 	u32 n = 0;
 
-	list_for_each_entry(e, &map->members, list)
+	rcu_read_lock();
+	list_for_each_entry_rcu(e, &map->members, list)
 		n++;
+	rcu_read_unlock();
 
 	nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
 	if (!nested)
@@ -483,33 +484,25 @@ list_set_list(const struct ip_set *set,
 	atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
 	if (!atd)
 		return -EMSGSIZE;
-	list_for_each_entry(e, &map->members, list) {
-		if (i == first)
-			break;
-		i++;
-	}
 
 	rcu_read_lock();
-	list_for_each_entry_from(e, &map->members, list) {
-		i++;
-		if (SET_WITH_TIMEOUT(set) &&
-		    ip_set_timeout_expired(ext_timeout(e, set)))
+	list_for_each_entry_rcu(e, &map->members, list) {
+		if (i < first ||
+		    (SET_WITH_TIMEOUT(set) &&
+		     ip_set_timeout_expired(ext_timeout(e, set)))) {
+			i++;
 			continue;
+		}
 		nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
-		if (!nested) {
-			if (i == first) {
-				nla_nest_cancel(skb, atd);
-				ret = -EMSGSIZE;
-				goto out;
-			}
+		if (!nested)
 			goto nla_put_failure;
-		}
 		if (nla_put_string(skb, IPSET_ATTR_NAME,
 				   ip_set_name_byindex(map->net, e->id)))
 			goto nla_put_failure;
 		if (ip_set_put_extensions(skb, set, e, true))
 			goto nla_put_failure;
 		ipset_nest_end(skb, nested);
+		i++;
 	}
 
 	ipset_nest_end(skb, atd);
@@ -520,10 +513,12 @@ list_set_list(const struct ip_set *set,
 nla_put_failure:
 	nla_nest_cancel(skb, nested);
 	if (unlikely(i == first)) {
+		nla_nest_cancel(skb, atd);
 		cb->args[IPSET_CB_ARG0] = 0;
 		ret = -EMSGSIZE;
+	} else {
+		cb->args[IPSET_CB_ARG0] = i;
 	}
-	cb->args[IPSET_CB_ARG0] = i - 1;
 	ipset_nest_end(skb, atd);
 out:
 	rcu_read_unlock();
-- 
cgit v1.2.3-59-g8ed1b


From 81ca7835f2cb0c3ba4236e3bcf31d997c6f5d71a Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Fri, 29 Jan 2016 11:24:24 -0800
Subject: NFC: Use GFP_USER for user-controlled kmalloc

These two functions are called in sendmsg path, and the
'len' is passed from user-space, so we should not allow
malicious users to OOM kernel on purpose.

Reported-by: Dmitry Vyukov <dvyukov@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Julian Calaby <julian.calaby@gmail.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 net/nfc/llcp_commands.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/nfc/llcp_commands.c b/net/nfc/llcp_commands.c
index 3621a902cb6e..3425532c39f7 100644
--- a/net/nfc/llcp_commands.c
+++ b/net/nfc/llcp_commands.c
@@ -663,7 +663,7 @@ int nfc_llcp_send_i_frame(struct nfc_llcp_sock *sock,
 		return -ENOBUFS;
 	}
 
-	msg_data = kzalloc(len, GFP_KERNEL);
+	msg_data = kmalloc(len, GFP_USER | __GFP_NOWARN);
 	if (msg_data == NULL)
 		return -ENOMEM;
 
@@ -729,7 +729,7 @@ int nfc_llcp_send_ui_frame(struct nfc_llcp_sock *sock, u8 ssap, u8 dsap,
 	if (local == NULL)
 		return -ENODEV;
 
-	msg_data = kzalloc(len, GFP_KERNEL);
+	msg_data = kmalloc(len, GFP_USER | __GFP_NOWARN);
 	if (msg_data == NULL)
 		return -ENOMEM;
 
-- 
cgit v1.2.3-59-g8ed1b


From 03c05355543149bf610f4375e8382ee4fc0aaade Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Fri, 29 Jan 2016 11:37:40 -0800
Subject: NFC: Close a race condition in llcp_sock_getname()

llcp_sock_getname() checks llcp_sock->dev to make sure
llcp_sock is already connected or bound, however, we could
be in the middle of llcp_sock_bind() where llcp_sock->dev
is bound and llcp_sock->service_name_len is set,
but llcp_sock->service_name is not, in this case we would
lead to copy some bytes from a NULL pointer.

Just lock the sock since this is not a hot path anyway.

Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 net/nfc/llcp_sock.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
index ecf0a0196f18..b9edf5fae6ae 100644
--- a/net/nfc/llcp_sock.c
+++ b/net/nfc/llcp_sock.c
@@ -509,6 +509,11 @@ static int llcp_sock_getname(struct socket *sock, struct sockaddr *uaddr,
 	memset(llcp_addr, 0, sizeof(*llcp_addr));
 	*len = sizeof(struct sockaddr_nfc_llcp);
 
+	lock_sock(sk);
+	if (!llcp_sock->dev) {
+		release_sock(sk);
+		return -EBADFD;
+	}
 	llcp_addr->sa_family = AF_NFC;
 	llcp_addr->dev_idx = llcp_sock->dev->idx;
 	llcp_addr->target_idx = llcp_sock->target_idx;
@@ -518,6 +523,7 @@ static int llcp_sock_getname(struct socket *sock, struct sockaddr *uaddr,
 	llcp_addr->service_name_len = llcp_sock->service_name_len;
 	memcpy(llcp_addr->service_name, llcp_sock->service_name,
 	       llcp_addr->service_name_len);
+	release_sock(sk);
 
 	return 0;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 1d4150c02c5709fdfd80f10368a31867de35e72e Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Mon, 22 Feb 2016 15:57:52 -0800
Subject: net_sched: prepare tcf_hashinfo_destroy() for netns support

We only release the memory of the hashtable itself, not its
entries inside. This is not a problem yet since we only call
it in module release path, and module is refcount'ed by
actions. This would be a problem after we move the per module
hinfo into per netns in the latter patch.

Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h |  5 -----
 net/sched/act_api.c   | 32 +++++++++++++++++++++++++++++---
 2 files changed, 29 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 9d446f136607..8c4e3ff723fb 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -65,11 +65,6 @@ static inline int tcf_hashinfo_init(struct tcf_hashinfo *hf, unsigned int mask)
 	return 0;
 }
 
-static inline void tcf_hashinfo_destroy(struct tcf_hashinfo *hf)
-{
-	kfree(hf->htab);
-}
-
 /* Update lastuse only if needed, to avoid dirtying a cache line.
  * We use a temp variable to avoid fetching jiffies twice.
  */
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 06e7c4a37245..acafaf7434fc 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -69,7 +69,7 @@ int __tcf_hash_release(struct tc_action *a, bool bind, bool strict)
 			if (a->ops->cleanup)
 				a->ops->cleanup(a, bind);
 			tcf_hash_destroy(a);
-			ret = 1;
+			ret = ACT_P_DELETED;
 		}
 	}
 
@@ -302,6 +302,32 @@ void tcf_hash_insert(struct tc_action *a)
 }
 EXPORT_SYMBOL(tcf_hash_insert);
 
+static void tcf_hashinfo_destroy(const struct tc_action_ops *ops)
+{
+	struct tcf_hashinfo *hinfo = ops->hinfo;
+	struct tc_action a = {
+		.ops = ops,
+	};
+	int i;
+
+	for (i = 0; i < hinfo->hmask + 1; i++) {
+		struct tcf_common *p;
+		struct hlist_node *n;
+
+		hlist_for_each_entry_safe(p, n, &hinfo->htab[i], tcfc_head) {
+			int ret;
+
+			a.priv = p;
+			ret = __tcf_hash_release(&a, false, true);
+			if (ret == ACT_P_DELETED)
+				module_put(ops->owner);
+			else if (ret < 0)
+				return;
+		}
+	}
+	kfree(hinfo->htab);
+}
+
 static LIST_HEAD(act_base);
 static DEFINE_RWLOCK(act_mod_lock);
 
@@ -333,7 +359,7 @@ int tcf_register_action(struct tc_action_ops *act, unsigned int mask)
 	list_for_each_entry(a, &act_base, head) {
 		if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) {
 			write_unlock(&act_mod_lock);
-			tcf_hashinfo_destroy(act->hinfo);
+			tcf_hashinfo_destroy(act);
 			kfree(act->hinfo);
 			return -EEXIST;
 		}
@@ -353,7 +379,7 @@ int tcf_unregister_action(struct tc_action_ops *act)
 	list_for_each_entry(a, &act_base, head) {
 		if (a == act) {
 			list_del(&act->head);
-			tcf_hashinfo_destroy(act->hinfo);
+			tcf_hashinfo_destroy(act);
 			kfree(act->hinfo);
 			err = 0;
 			break;
-- 
cgit v1.2.3-59-g8ed1b


From ddf97ccdd7cb7e00daba465a5c947b8d941dc2a4 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Mon, 22 Feb 2016 15:57:53 -0800
Subject: net_sched: add network namespace support for tc actions

Currently tc actions are stored in a per-module hashtable,
therefore are visible to all network namespaces. This is
probably the last part of the tc subsystem which is not
aware of netns now. This patch makes them per-netns,
several tc action API's need to be adjusted for this.

The tc action API code is ugly due to historical reasons,
we need to refactor that code in the future.

Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h    |  58 ++++++++++++++++++----
 net/sched/act_api.c      | 113 +++++++++++++++++++++--------------------
 net/sched/act_bpf.c      |  52 +++++++++++++++++--
 net/sched/act_connmark.c |  54 +++++++++++++++++---
 net/sched/act_csum.c     |  59 +++++++++++++++++++---
 net/sched/act_gact.c     |  55 +++++++++++++++++---
 net/sched/act_ipt.c      | 127 ++++++++++++++++++++++++++++++++++++++++++-----
 net/sched/act_mirred.c   |  54 +++++++++++++++++---
 net/sched/act_nat.c      |  54 +++++++++++++++++---
 net/sched/act_pedit.c    |  54 +++++++++++++++++---
 net/sched/act_police.c   |  52 +++++++++++++++----
 net/sched/act_simple.c   |  55 +++++++++++++++++---
 net/sched/act_skbedit.c  |  54 +++++++++++++++++---
 net/sched/act_vlan.c     |  54 +++++++++++++++++---
 14 files changed, 746 insertions(+), 149 deletions(-)

(limited to 'net')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 8c4e3ff723fb..342be6c5ab5c 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -7,6 +7,8 @@
 
 #include <net/sch_generic.h>
 #include <net/pkt_sched.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
 
 struct tcf_common {
 	struct hlist_node		tcfc_head;
@@ -87,31 +89,65 @@ struct tc_action {
 	__u32			type; /* for backward compat(TCA_OLD_COMPAT) */
 	__u32			order;
 	struct list_head	list;
+	struct tcf_hashinfo	*hinfo;
 };
 
 struct tc_action_ops {
 	struct list_head head;
-	struct tcf_hashinfo *hinfo;
 	char    kind[IFNAMSIZ];
 	__u32   type; /* TBD to match kind */
 	struct module		*owner;
 	int     (*act)(struct sk_buff *, const struct tc_action *, struct tcf_result *);
 	int     (*dump)(struct sk_buff *, struct tc_action *, int, int);
 	void	(*cleanup)(struct tc_action *, int bind);
-	int     (*lookup)(struct tc_action *, u32);
+	int     (*lookup)(struct net *, struct tc_action *, u32);
 	int     (*init)(struct net *net, struct nlattr *nla,
 			struct nlattr *est, struct tc_action *act, int ovr,
 			int bind);
-	int     (*walk)(struct sk_buff *, struct netlink_callback *, int, struct tc_action *);
+	int     (*walk)(struct net *, struct sk_buff *,
+			struct netlink_callback *, int, struct tc_action *);
+};
+
+struct tc_action_net {
+	struct tcf_hashinfo *hinfo;
+	const struct tc_action_ops *ops;
 };
 
-int tcf_hash_search(struct tc_action *a, u32 index);
-u32 tcf_hash_new_index(struct tcf_hashinfo *hinfo);
-int tcf_hash_check(u32 index, struct tc_action *a, int bind);
-int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a,
-		    int size, int bind, bool cpustats);
+static inline
+int tc_action_net_init(struct tc_action_net *tn, const struct tc_action_ops *ops,
+		       unsigned int mask)
+{
+	int err = 0;
+
+	tn->hinfo = kmalloc(sizeof(*tn->hinfo), GFP_KERNEL);
+	if (!tn->hinfo)
+		return -ENOMEM;
+	tn->ops = ops;
+	err = tcf_hashinfo_init(tn->hinfo, mask);
+	if (err)
+		kfree(tn->hinfo);
+	return err;
+}
+
+void tcf_hashinfo_destroy(const struct tc_action_ops *ops,
+			  struct tcf_hashinfo *hinfo);
+
+static inline void tc_action_net_exit(struct tc_action_net *tn)
+{
+	tcf_hashinfo_destroy(tn->ops, tn->hinfo);
+}
+
+int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb,
+		       struct netlink_callback *cb, int type,
+		       struct tc_action *a);
+int tcf_hash_search(struct tc_action_net *tn, struct tc_action *a, u32 index);
+u32 tcf_hash_new_index(struct tc_action_net *tn);
+int tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a,
+		   int bind);
+int tcf_hash_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
+		    struct tc_action *a, int size, int bind, bool cpustats);
 void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est);
-void tcf_hash_insert(struct tc_action *a);
+void tcf_hash_insert(struct tc_action_net *tn, struct tc_action *a);
 
 int __tcf_hash_release(struct tc_action *a, bool bind, bool strict);
 
@@ -120,8 +156,8 @@ static inline int tcf_hash_release(struct tc_action *a, bool bind)
 	return __tcf_hash_release(a, bind, false);
 }
 
-int tcf_register_action(struct tc_action_ops *a, unsigned int mask);
-int tcf_unregister_action(struct tc_action_ops *a);
+int tcf_register_action(struct tc_action_ops *a, struct pernet_operations *ops);
+int tcf_unregister_action(struct tc_action_ops *a, struct pernet_operations *ops);
 int tcf_action_destroy(struct list_head *actions, int bind);
 int tcf_action_exec(struct sk_buff *skb, const struct list_head *actions,
 		    struct tcf_result *res);
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index acafaf7434fc..96066665e376 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -36,10 +36,9 @@ static void free_tcf(struct rcu_head *head)
 	kfree(p);
 }
 
-static void tcf_hash_destroy(struct tc_action *a)
+static void tcf_hash_destroy(struct tcf_hashinfo *hinfo, struct tc_action *a)
 {
 	struct tcf_common *p = a->priv;
-	struct tcf_hashinfo *hinfo = a->ops->hinfo;
 
 	spin_lock_bh(&hinfo->lock);
 	hlist_del(&p->tcfc_head);
@@ -68,7 +67,7 @@ int __tcf_hash_release(struct tc_action *a, bool bind, bool strict)
 		if (p->tcfc_bindcnt <= 0 && p->tcfc_refcnt <= 0) {
 			if (a->ops->cleanup)
 				a->ops->cleanup(a, bind);
-			tcf_hash_destroy(a);
+			tcf_hash_destroy(a->hinfo, a);
 			ret = ACT_P_DELETED;
 		}
 	}
@@ -77,10 +76,9 @@ int __tcf_hash_release(struct tc_action *a, bool bind, bool strict)
 }
 EXPORT_SYMBOL(__tcf_hash_release);
 
-static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb,
-			   struct tc_action *a)
+static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
+			   struct netlink_callback *cb, struct tc_action *a)
 {
-	struct tcf_hashinfo *hinfo = a->ops->hinfo;
 	struct hlist_head *head;
 	struct tcf_common *p;
 	int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
@@ -126,9 +124,9 @@ nla_put_failure:
 	goto done;
 }
 
-static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a)
+static int tcf_del_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
+			  struct tc_action *a)
 {
-	struct tcf_hashinfo *hinfo = a->ops->hinfo;
 	struct hlist_head *head;
 	struct hlist_node *n;
 	struct tcf_common *p;
@@ -163,18 +161,24 @@ nla_put_failure:
 	return ret;
 }
 
-static int tcf_generic_walker(struct sk_buff *skb, struct netlink_callback *cb,
-			      int type, struct tc_action *a)
+int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb,
+		       struct netlink_callback *cb, int type,
+		       struct tc_action *a)
 {
+	struct tcf_hashinfo *hinfo = tn->hinfo;
+
+	a->hinfo = hinfo;
+
 	if (type == RTM_DELACTION) {
-		return tcf_del_walker(skb, a);
+		return tcf_del_walker(hinfo, skb, a);
 	} else if (type == RTM_GETACTION) {
-		return tcf_dump_walker(skb, cb, a);
+		return tcf_dump_walker(hinfo, skb, cb, a);
 	} else {
 		WARN(1, "tcf_generic_walker: unknown action %d\n", type);
 		return -EINVAL;
 	}
 }
+EXPORT_SYMBOL(tcf_generic_walker);
 
 static struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo)
 {
@@ -191,8 +195,9 @@ static struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo)
 	return p;
 }
 
-u32 tcf_hash_new_index(struct tcf_hashinfo *hinfo)
+u32 tcf_hash_new_index(struct tc_action_net *tn)
 {
+	struct tcf_hashinfo *hinfo = tn->hinfo;
 	u32 val = hinfo->index;
 
 	do {
@@ -205,28 +210,31 @@ u32 tcf_hash_new_index(struct tcf_hashinfo *hinfo)
 }
 EXPORT_SYMBOL(tcf_hash_new_index);
 
-int tcf_hash_search(struct tc_action *a, u32 index)
+int tcf_hash_search(struct tc_action_net *tn, struct tc_action *a, u32 index)
 {
-	struct tcf_hashinfo *hinfo = a->ops->hinfo;
+	struct tcf_hashinfo *hinfo = tn->hinfo;
 	struct tcf_common *p = tcf_hash_lookup(index, hinfo);
 
 	if (p) {
 		a->priv = p;
+		a->hinfo = hinfo;
 		return 1;
 	}
 	return 0;
 }
 EXPORT_SYMBOL(tcf_hash_search);
 
-int tcf_hash_check(u32 index, struct tc_action *a, int bind)
+int tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a,
+		   int bind)
 {
-	struct tcf_hashinfo *hinfo = a->ops->hinfo;
+	struct tcf_hashinfo *hinfo = tn->hinfo;
 	struct tcf_common *p = NULL;
 	if (index && (p = tcf_hash_lookup(index, hinfo)) != NULL) {
 		if (bind)
 			p->tcfc_bindcnt++;
 		p->tcfc_refcnt++;
 		a->priv = p;
+		a->hinfo = hinfo;
 		return 1;
 	}
 	return 0;
@@ -243,11 +251,11 @@ void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est)
 }
 EXPORT_SYMBOL(tcf_hash_cleanup);
 
-int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a,
-		    int size, int bind, bool cpustats)
+int tcf_hash_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
+		    struct tc_action *a, int size, int bind, bool cpustats)
 {
-	struct tcf_hashinfo *hinfo = a->ops->hinfo;
 	struct tcf_common *p = kzalloc(size, GFP_KERNEL);
+	struct tcf_hashinfo *hinfo = tn->hinfo;
 	int err = -ENOMEM;
 
 	if (unlikely(!p))
@@ -272,7 +280,7 @@ err2:
 	}
 	spin_lock_init(&p->tcfc_lock);
 	INIT_HLIST_NODE(&p->tcfc_head);
-	p->tcfc_index = index ? index : tcf_hash_new_index(hinfo);
+	p->tcfc_index = index ? index : tcf_hash_new_index(tn);
 	p->tcfc_tm.install = jiffies;
 	p->tcfc_tm.lastuse = jiffies;
 	if (est) {
@@ -286,14 +294,15 @@ err2:
 	}
 
 	a->priv = (void *) p;
+	a->hinfo = hinfo;
 	return 0;
 }
 EXPORT_SYMBOL(tcf_hash_create);
 
-void tcf_hash_insert(struct tc_action *a)
+void tcf_hash_insert(struct tc_action_net *tn, struct tc_action *a)
 {
 	struct tcf_common *p = a->priv;
-	struct tcf_hashinfo *hinfo = a->ops->hinfo;
+	struct tcf_hashinfo *hinfo = tn->hinfo;
 	unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask);
 
 	spin_lock_bh(&hinfo->lock);
@@ -302,11 +311,12 @@ void tcf_hash_insert(struct tc_action *a)
 }
 EXPORT_SYMBOL(tcf_hash_insert);
 
-static void tcf_hashinfo_destroy(const struct tc_action_ops *ops)
+void tcf_hashinfo_destroy(const struct tc_action_ops *ops,
+			  struct tcf_hashinfo *hinfo)
 {
-	struct tcf_hashinfo *hinfo = ops->hinfo;
 	struct tc_action a = {
 		.ops = ops,
+		.hinfo = hinfo,
 	};
 	int i;
 
@@ -327,60 +337,52 @@ static void tcf_hashinfo_destroy(const struct tc_action_ops *ops)
 	}
 	kfree(hinfo->htab);
 }
+EXPORT_SYMBOL(tcf_hashinfo_destroy);
 
 static LIST_HEAD(act_base);
 static DEFINE_RWLOCK(act_mod_lock);
 
-int tcf_register_action(struct tc_action_ops *act, unsigned int mask)
+int tcf_register_action(struct tc_action_ops *act,
+			struct pernet_operations *ops)
 {
 	struct tc_action_ops *a;
-	int err;
+	int ret;
 
-	/* Must supply act, dump and init */
-	if (!act->act || !act->dump || !act->init)
+	if (!act->act || !act->dump || !act->init || !act->walk || !act->lookup)
 		return -EINVAL;
 
-	/* Supply defaults */
-	if (!act->lookup)
-		act->lookup = tcf_hash_search;
-	if (!act->walk)
-		act->walk = tcf_generic_walker;
-
-	act->hinfo = kmalloc(sizeof(struct tcf_hashinfo), GFP_KERNEL);
-	if (!act->hinfo)
-		return -ENOMEM;
-	err = tcf_hashinfo_init(act->hinfo, mask);
-	if (err) {
-		kfree(act->hinfo);
-		return err;
-	}
-
 	write_lock(&act_mod_lock);
 	list_for_each_entry(a, &act_base, head) {
 		if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) {
 			write_unlock(&act_mod_lock);
-			tcf_hashinfo_destroy(act);
-			kfree(act->hinfo);
 			return -EEXIST;
 		}
 	}
 	list_add_tail(&act->head, &act_base);
 	write_unlock(&act_mod_lock);
+
+	ret = register_pernet_subsys(ops);
+	if (ret) {
+		tcf_unregister_action(act, ops);
+		return ret;
+	}
+
 	return 0;
 }
 EXPORT_SYMBOL(tcf_register_action);
 
-int tcf_unregister_action(struct tc_action_ops *act)
+int tcf_unregister_action(struct tc_action_ops *act,
+			  struct pernet_operations *ops)
 {
 	struct tc_action_ops *a;
 	int err = -ENOENT;
 
+	unregister_pernet_subsys(ops);
+
 	write_lock(&act_mod_lock);
 	list_for_each_entry(a, &act_base, head) {
 		if (a == act) {
 			list_del(&act->head);
-			tcf_hashinfo_destroy(act);
-			kfree(act->hinfo);
 			err = 0;
 			break;
 		}
@@ -747,8 +749,8 @@ static struct tc_action *create_a(int i)
 	return act;
 }
 
-static struct tc_action *
-tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 portid)
+static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla,
+					  struct nlmsghdr *n, u32 portid)
 {
 	struct nlattr *tb[TCA_ACT_MAX + 1];
 	struct tc_action *a;
@@ -775,7 +777,7 @@ tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 portid)
 	if (a->ops == NULL) /* could happen in batch of actions */
 		goto err_free;
 	err = -ENOENT;
-	if (a->ops->lookup(a, index) == 0)
+	if (a->ops->lookup(net, a, index) == 0)
 		goto err_mod;
 
 	module_put(a->ops->owner);
@@ -845,7 +847,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
 	if (nest == NULL)
 		goto out_module_put;
 
-	err = a.ops->walk(skb, &dcb, RTM_DELACTION, &a);
+	err = a.ops->walk(net, skb, &dcb, RTM_DELACTION, &a);
 	if (err < 0)
 		goto out_module_put;
 	if (err == 0)
@@ -923,7 +925,7 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
 	}
 
 	for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
-		act = tcf_action_get_1(tb[i], n, portid);
+		act = tcf_action_get_1(net, tb[i], n, portid);
 		if (IS_ERR(act)) {
 			ret = PTR_ERR(act);
 			goto err;
@@ -1070,6 +1072,7 @@ find_dump_kind(const struct nlmsghdr *n)
 static int
 tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
 {
+	struct net *net = sock_net(skb->sk);
 	struct nlmsghdr *nlh;
 	unsigned char *b = skb_tail_pointer(skb);
 	struct nlattr *nest;
@@ -1104,7 +1107,7 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
 	if (nest == NULL)
 		goto out_module_put;
 
-	ret = a_o->walk(skb, cb, RTM_GETACTION, &a);
+	ret = a_o->walk(net, skb, cb, RTM_GETACTION, &a);
 	if (ret < 0)
 		goto out_module_put;
 
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 0bc6f912f870..8c9f1f0459ab 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -33,6 +33,8 @@ struct tcf_bpf_cfg {
 	bool is_ebpf;
 };
 
+static int bpf_net_id;
+
 static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
 		   struct tcf_result *res)
 {
@@ -275,6 +277,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
 			struct nlattr *est, struct tc_action *act,
 			int replace, int bind)
 {
+	struct tc_action_net *tn = net_generic(net, bpf_net_id);
 	struct nlattr *tb[TCA_ACT_BPF_MAX + 1];
 	struct tcf_bpf_cfg cfg, old;
 	struct tc_act_bpf *parm;
@@ -294,8 +297,8 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
 
 	parm = nla_data(tb[TCA_ACT_BPF_PARMS]);
 
-	if (!tcf_hash_check(parm->index, act, bind)) {
-		ret = tcf_hash_create(parm->index, est, act,
+	if (!tcf_hash_check(tn, parm->index, act, bind)) {
+		ret = tcf_hash_create(tn, parm->index, est, act,
 				      sizeof(*prog), bind, true);
 		if (ret < 0)
 			return ret;
@@ -344,7 +347,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
 	rcu_assign_pointer(prog->filter, cfg.filter);
 
 	if (res == ACT_P_CREATED) {
-		tcf_hash_insert(act);
+		tcf_hash_insert(tn, act);
 	} else {
 		/* make sure the program being replaced is no longer executing */
 		synchronize_rcu();
@@ -367,6 +370,22 @@ static void tcf_bpf_cleanup(struct tc_action *act, int bind)
 	tcf_bpf_cfg_cleanup(&tmp);
 }
 
+static int tcf_bpf_walker(struct net *net, struct sk_buff *skb,
+			  struct netlink_callback *cb, int type,
+			  struct tc_action *a)
+{
+	struct tc_action_net *tn = net_generic(net, bpf_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_bpf_search(struct net *net, struct tc_action *a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, bpf_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
 static struct tc_action_ops act_bpf_ops __read_mostly = {
 	.kind		=	"bpf",
 	.type		=	TCA_ACT_BPF,
@@ -375,16 +394,39 @@ static struct tc_action_ops act_bpf_ops __read_mostly = {
 	.dump		=	tcf_bpf_dump,
 	.cleanup	=	tcf_bpf_cleanup,
 	.init		=	tcf_bpf_init,
+	.walk		=	tcf_bpf_walker,
+	.lookup		=	tcf_bpf_search,
+};
+
+static __net_init int bpf_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, bpf_net_id);
+
+	return tc_action_net_init(tn, &act_bpf_ops, BPF_TAB_MASK);
+}
+
+static void __net_exit bpf_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, bpf_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations bpf_net_ops = {
+	.init = bpf_init_net,
+	.exit = bpf_exit_net,
+	.id   = &bpf_net_id,
+	.size = sizeof(struct tc_action_net),
 };
 
 static int __init bpf_init_module(void)
 {
-	return tcf_register_action(&act_bpf_ops, BPF_TAB_MASK);
+	return tcf_register_action(&act_bpf_ops, &bpf_net_ops);
 }
 
 static void __exit bpf_cleanup_module(void)
 {
-	tcf_unregister_action(&act_bpf_ops);
+	tcf_unregister_action(&act_bpf_ops, &bpf_net_ops);
 }
 
 module_init(bpf_init_module);
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index bb41699c6c49..c0ed93ce2391 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -30,6 +30,8 @@
 
 #define CONNMARK_TAB_MASK     3
 
+static int connmark_net_id;
+
 static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a,
 			struct tcf_result *res)
 {
@@ -97,6 +99,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
 			     struct nlattr *est, struct tc_action *a,
 			     int ovr, int bind)
 {
+	struct tc_action_net *tn = net_generic(net, connmark_net_id);
 	struct nlattr *tb[TCA_CONNMARK_MAX + 1];
 	struct tcf_connmark_info *ci;
 	struct tc_connmark *parm;
@@ -111,9 +114,9 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
 
 	parm = nla_data(tb[TCA_CONNMARK_PARMS]);
 
-	if (!tcf_hash_check(parm->index, a, bind)) {
-		ret = tcf_hash_create(parm->index, est, a, sizeof(*ci),
-				      bind, false);
+	if (!tcf_hash_check(tn, parm->index, a, bind)) {
+		ret = tcf_hash_create(tn, parm->index, est, a,
+				      sizeof(*ci), bind, false);
 		if (ret)
 			return ret;
 
@@ -122,7 +125,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
 		ci->net = net;
 		ci->zone = parm->zone;
 
-		tcf_hash_insert(a);
+		tcf_hash_insert(tn, a);
 		ret = ACT_P_CREATED;
 	} else {
 		ci = to_connmark(a);
@@ -169,6 +172,22 @@ nla_put_failure:
 	return -1;
 }
 
+static int tcf_connmark_walker(struct net *net, struct sk_buff *skb,
+			       struct netlink_callback *cb, int type,
+			       struct tc_action *a)
+{
+	struct tc_action_net *tn = net_generic(net, connmark_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_connmark_search(struct net *net, struct tc_action *a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, connmark_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
 static struct tc_action_ops act_connmark_ops = {
 	.kind		=	"connmark",
 	.type		=	TCA_ACT_CONNMARK,
@@ -176,16 +195,39 @@ static struct tc_action_ops act_connmark_ops = {
 	.act		=	tcf_connmark,
 	.dump		=	tcf_connmark_dump,
 	.init		=	tcf_connmark_init,
+	.walk		=	tcf_connmark_walker,
+	.lookup		=	tcf_connmark_search,
+};
+
+static __net_init int connmark_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, connmark_net_id);
+
+	return tc_action_net_init(tn, &act_connmark_ops, CONNMARK_TAB_MASK);
+}
+
+static void __net_exit connmark_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, connmark_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations connmark_net_ops = {
+	.init = connmark_init_net,
+	.exit = connmark_exit_net,
+	.id   = &connmark_net_id,
+	.size = sizeof(struct tc_action_net),
 };
 
 static int __init connmark_init_module(void)
 {
-	return tcf_register_action(&act_connmark_ops, CONNMARK_TAB_MASK);
+	return tcf_register_action(&act_connmark_ops, &connmark_net_ops);
 }
 
 static void __exit connmark_cleanup_module(void)
 {
-	tcf_unregister_action(&act_connmark_ops);
+	tcf_unregister_action(&act_connmark_ops, &connmark_net_ops);
 }
 
 module_init(connmark_init_module);
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index eeb3eb3ea9eb..d22426cdebc0 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -42,9 +42,13 @@ static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = {
 	[TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), },
 };
 
-static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est,
-			 struct tc_action *a, int ovr, int bind)
+static int csum_net_id;
+
+static int tcf_csum_init(struct net *net, struct nlattr *nla,
+			 struct nlattr *est, struct tc_action *a, int ovr,
+			 int bind)
 {
+	struct tc_action_net *tn = net_generic(net, csum_net_id);
 	struct nlattr *tb[TCA_CSUM_MAX + 1];
 	struct tc_csum *parm;
 	struct tcf_csum *p;
@@ -61,9 +65,9 @@ static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est,
 		return -EINVAL;
 	parm = nla_data(tb[TCA_CSUM_PARMS]);
 
-	if (!tcf_hash_check(parm->index, a, bind)) {
-		ret = tcf_hash_create(parm->index, est, a, sizeof(*p),
-				      bind, false);
+	if (!tcf_hash_check(tn, parm->index, a, bind)) {
+		ret = tcf_hash_create(tn, parm->index, est, a,
+				      sizeof(*p), bind, false);
 		if (ret)
 			return ret;
 		ret = ACT_P_CREATED;
@@ -82,7 +86,7 @@ static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est,
 	spin_unlock_bh(&p->tcf_lock);
 
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(a);
+		tcf_hash_insert(tn, a);
 
 	return ret;
 }
@@ -555,6 +559,22 @@ nla_put_failure:
 	return -1;
 }
 
+static int tcf_csum_walker(struct net *net, struct sk_buff *skb,
+			   struct netlink_callback *cb, int type,
+			   struct tc_action *a)
+{
+	struct tc_action_net *tn = net_generic(net, csum_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_csum_search(struct net *net, struct tc_action *a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, csum_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
 static struct tc_action_ops act_csum_ops = {
 	.kind		= "csum",
 	.type		= TCA_ACT_CSUM,
@@ -562,6 +582,29 @@ static struct tc_action_ops act_csum_ops = {
 	.act		= tcf_csum,
 	.dump		= tcf_csum_dump,
 	.init		= tcf_csum_init,
+	.walk		= tcf_csum_walker,
+	.lookup		= tcf_csum_search,
+};
+
+static __net_init int csum_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, csum_net_id);
+
+	return tc_action_net_init(tn, &act_csum_ops, CSUM_TAB_MASK);
+}
+
+static void __net_exit csum_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, csum_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations csum_net_ops = {
+	.init = csum_init_net,
+	.exit = csum_exit_net,
+	.id   = &csum_net_id,
+	.size = sizeof(struct tc_action_net),
 };
 
 MODULE_DESCRIPTION("Checksum updating actions");
@@ -569,12 +612,12 @@ MODULE_LICENSE("GPL");
 
 static int __init csum_init_module(void)
 {
-	return tcf_register_action(&act_csum_ops, CSUM_TAB_MASK);
+	return tcf_register_action(&act_csum_ops, &csum_net_ops);
 }
 
 static void __exit csum_cleanup_module(void)
 {
-	tcf_unregister_action(&act_csum_ops);
+	tcf_unregister_action(&act_csum_ops, &csum_net_ops);
 }
 
 module_init(csum_init_module);
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index 5c1b05170736..887fc1f209ff 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -25,6 +25,8 @@
 
 #define GACT_TAB_MASK	15
 
+static int gact_net_id;
+
 #ifdef CONFIG_GACT_PROB
 static int gact_net_rand(struct tcf_gact *gact)
 {
@@ -57,6 +59,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
 			 struct nlattr *est, struct tc_action *a,
 			 int ovr, int bind)
 {
+	struct tc_action_net *tn = net_generic(net, gact_net_id);
 	struct nlattr *tb[TCA_GACT_MAX + 1];
 	struct tc_gact *parm;
 	struct tcf_gact *gact;
@@ -88,9 +91,9 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
 	}
 #endif
 
-	if (!tcf_hash_check(parm->index, a, bind)) {
-		ret = tcf_hash_create(parm->index, est, a, sizeof(*gact),
-				      bind, true);
+	if (!tcf_hash_check(tn, parm->index, a, bind)) {
+		ret = tcf_hash_create(tn, parm->index, est, a,
+				      sizeof(*gact), bind, true);
 		if (ret)
 			return ret;
 		ret = ACT_P_CREATED;
@@ -118,7 +121,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
 	}
 #endif
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(a);
+		tcf_hash_insert(tn, a);
 	return ret;
 }
 
@@ -183,6 +186,22 @@ nla_put_failure:
 	return -1;
 }
 
+static int tcf_gact_walker(struct net *net, struct sk_buff *skb,
+			   struct netlink_callback *cb, int type,
+			   struct tc_action *a)
+{
+	struct tc_action_net *tn = net_generic(net, gact_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_gact_search(struct net *net, struct tc_action *a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, gact_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
 static struct tc_action_ops act_gact_ops = {
 	.kind		=	"gact",
 	.type		=	TCA_ACT_GACT,
@@ -190,6 +209,29 @@ static struct tc_action_ops act_gact_ops = {
 	.act		=	tcf_gact,
 	.dump		=	tcf_gact_dump,
 	.init		=	tcf_gact_init,
+	.walk		=	tcf_gact_walker,
+	.lookup		=	tcf_gact_search,
+};
+
+static __net_init int gact_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, gact_net_id);
+
+	return tc_action_net_init(tn, &act_gact_ops, GACT_TAB_MASK);
+}
+
+static void __net_exit gact_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, gact_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations gact_net_ops = {
+	.init = gact_init_net,
+	.exit = gact_exit_net,
+	.id   = &gact_net_id,
+	.size = sizeof(struct tc_action_net),
 };
 
 MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
@@ -203,12 +245,13 @@ static int __init gact_init_module(void)
 #else
 	pr_info("GACT probability NOT on\n");
 #endif
-	return tcf_register_action(&act_gact_ops, GACT_TAB_MASK);
+
+	return tcf_register_action(&act_gact_ops, &gact_net_ops);
 }
 
 static void __exit gact_cleanup_module(void)
 {
-	tcf_unregister_action(&act_gact_ops);
+	tcf_unregister_action(&act_gact_ops, &gact_net_ops);
 }
 
 module_init(gact_init_module);
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index d05869646515..89c41a1f3589 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -30,6 +30,10 @@
 
 #define IPT_TAB_MASK     15
 
+static int ipt_net_id;
+
+static int xt_net_id;
+
 static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int hook)
 {
 	struct xt_tgchk_param par;
@@ -83,8 +87,9 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = {
 	[TCA_IPT_TARG]	= { .len = sizeof(struct xt_entry_target) },
 };
 
-static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est,
-			struct tc_action *a, int ovr, int bind)
+static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla,
+			  struct nlattr *est, struct tc_action *a, int ovr,
+			  int bind)
 {
 	struct nlattr *tb[TCA_IPT_MAX + 1];
 	struct tcf_ipt *ipt;
@@ -113,8 +118,9 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est,
 	if (tb[TCA_IPT_INDEX] != NULL)
 		index = nla_get_u32(tb[TCA_IPT_INDEX]);
 
-	if (!tcf_hash_check(index, a, bind) ) {
-		ret = tcf_hash_create(index, est, a, sizeof(*ipt), bind, false);
+	if (!tcf_hash_check(tn, index, a, bind)) {
+		ret = tcf_hash_create(tn, index, est, a, sizeof(*ipt), bind,
+				      false);
 		if (ret)
 			return ret;
 		ret = ACT_P_CREATED;
@@ -157,7 +163,7 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est,
 	ipt->tcfi_hook  = hook;
 	spin_unlock_bh(&ipt->tcf_lock);
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(a);
+		tcf_hash_insert(tn, a);
 	return ret;
 
 err3:
@@ -170,6 +176,24 @@ err1:
 	return err;
 }
 
+static int tcf_ipt_init(struct net *net, struct nlattr *nla,
+			struct nlattr *est, struct tc_action *a, int ovr,
+			int bind)
+{
+	struct tc_action_net *tn = net_generic(net, ipt_net_id);
+
+	return __tcf_ipt_init(tn, nla, est, a, ovr, bind);
+}
+
+static int tcf_xt_init(struct net *net, struct nlattr *nla,
+		       struct nlattr *est, struct tc_action *a, int ovr,
+		       int bind)
+{
+	struct tc_action_net *tn = net_generic(net, xt_net_id);
+
+	return __tcf_ipt_init(tn, nla, est, a, ovr, bind);
+}
+
 static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a,
 		   struct tcf_result *res)
 {
@@ -260,6 +284,22 @@ nla_put_failure:
 	return -1;
 }
 
+static int tcf_ipt_walker(struct net *net, struct sk_buff *skb,
+			  struct netlink_callback *cb, int type,
+			  struct tc_action *a)
+{
+	struct tc_action_net *tn = net_generic(net, ipt_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_ipt_search(struct net *net, struct tc_action *a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, ipt_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
 static struct tc_action_ops act_ipt_ops = {
 	.kind		=	"ipt",
 	.type		=	TCA_ACT_IPT,
@@ -268,8 +308,47 @@ static struct tc_action_ops act_ipt_ops = {
 	.dump		=	tcf_ipt_dump,
 	.cleanup	=	tcf_ipt_release,
 	.init		=	tcf_ipt_init,
+	.walk		=	tcf_ipt_walker,
+	.lookup		=	tcf_ipt_search,
+};
+
+static __net_init int ipt_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, ipt_net_id);
+
+	return tc_action_net_init(tn, &act_ipt_ops, IPT_TAB_MASK);
+}
+
+static void __net_exit ipt_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, ipt_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations ipt_net_ops = {
+	.init = ipt_init_net,
+	.exit = ipt_exit_net,
+	.id   = &ipt_net_id,
+	.size = sizeof(struct tc_action_net),
 };
 
+static int tcf_xt_walker(struct net *net, struct sk_buff *skb,
+			 struct netlink_callback *cb, int type,
+			 struct tc_action *a)
+{
+	struct tc_action_net *tn = net_generic(net, xt_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_xt_search(struct net *net, struct tc_action *a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, xt_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
 static struct tc_action_ops act_xt_ops = {
 	.kind		=	"xt",
 	.type		=	TCA_ACT_XT,
@@ -277,7 +356,30 @@ static struct tc_action_ops act_xt_ops = {
 	.act		=	tcf_ipt,
 	.dump		=	tcf_ipt_dump,
 	.cleanup	=	tcf_ipt_release,
-	.init		=	tcf_ipt_init,
+	.init		=	tcf_xt_init,
+	.walk		=	tcf_xt_walker,
+	.lookup		=	tcf_xt_search,
+};
+
+static __net_init int xt_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, xt_net_id);
+
+	return tc_action_net_init(tn, &act_xt_ops, IPT_TAB_MASK);
+}
+
+static void __net_exit xt_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, xt_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations xt_net_ops = {
+	.init = xt_init_net,
+	.exit = xt_exit_net,
+	.id   = &xt_net_id,
+	.size = sizeof(struct tc_action_net),
 };
 
 MODULE_AUTHOR("Jamal Hadi Salim(2002-13)");
@@ -289,12 +391,13 @@ static int __init ipt_init_module(void)
 {
 	int ret1, ret2;
 
-	ret1 = tcf_register_action(&act_xt_ops, IPT_TAB_MASK);
+	ret1 = tcf_register_action(&act_xt_ops, &xt_net_ops);
 	if (ret1 < 0)
-		printk("Failed to load xt action\n");
-	ret2 = tcf_register_action(&act_ipt_ops, IPT_TAB_MASK);
+		pr_err("Failed to load xt action\n");
+
+	ret2 = tcf_register_action(&act_ipt_ops, &ipt_net_ops);
 	if (ret2 < 0)
-		printk("Failed to load ipt action\n");
+		pr_err("Failed to load ipt action\n");
 
 	if (ret1 < 0 && ret2 < 0) {
 		return ret1;
@@ -304,8 +407,8 @@ static int __init ipt_init_module(void)
 
 static void __exit ipt_cleanup_module(void)
 {
-	tcf_unregister_action(&act_xt_ops);
-	tcf_unregister_action(&act_ipt_ops);
+	tcf_unregister_action(&act_ipt_ops, &ipt_net_ops);
+	tcf_unregister_action(&act_xt_ops, &xt_net_ops);
 }
 
 module_init(ipt_init_module);
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 32fcdecdb9e2..6b284d991e0b 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -50,10 +50,13 @@ static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = {
 	[TCA_MIRRED_PARMS]	= { .len = sizeof(struct tc_mirred) },
 };
 
+static int mirred_net_id;
+
 static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 			   struct nlattr *est, struct tc_action *a, int ovr,
 			   int bind)
 {
+	struct tc_action_net *tn = net_generic(net, mirred_net_id);
 	struct nlattr *tb[TCA_MIRRED_MAX + 1];
 	struct tc_mirred *parm;
 	struct tcf_mirred *m;
@@ -96,11 +99,11 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 		dev = NULL;
 	}
 
-	if (!tcf_hash_check(parm->index, a, bind)) {
+	if (!tcf_hash_check(tn, parm->index, a, bind)) {
 		if (dev == NULL)
 			return -EINVAL;
-		ret = tcf_hash_create(parm->index, est, a, sizeof(*m),
-				      bind, true);
+		ret = tcf_hash_create(tn, parm->index, est, a,
+				      sizeof(*m), bind, true);
 		if (ret)
 			return ret;
 		ret = ACT_P_CREATED;
@@ -130,7 +133,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 		spin_lock_bh(&mirred_list_lock);
 		list_add(&m->tcfm_list, &mirred_list);
 		spin_unlock_bh(&mirred_list_lock);
-		tcf_hash_insert(a);
+		tcf_hash_insert(tn, a);
 	}
 
 	return ret;
@@ -221,6 +224,22 @@ nla_put_failure:
 	return -1;
 }
 
+static int tcf_mirred_walker(struct net *net, struct sk_buff *skb,
+			     struct netlink_callback *cb, int type,
+			     struct tc_action *a)
+{
+	struct tc_action_net *tn = net_generic(net, mirred_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_mirred_search(struct net *net, struct tc_action *a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, mirred_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
 static int mirred_device_event(struct notifier_block *unused,
 			       unsigned long event, void *ptr)
 {
@@ -257,6 +276,29 @@ static struct tc_action_ops act_mirred_ops = {
 	.dump		=	tcf_mirred_dump,
 	.cleanup	=	tcf_mirred_release,
 	.init		=	tcf_mirred_init,
+	.walk		=	tcf_mirred_walker,
+	.lookup		=	tcf_mirred_search,
+};
+
+static __net_init int mirred_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, mirred_net_id);
+
+	return tc_action_net_init(tn, &act_mirred_ops, MIRRED_TAB_MASK);
+}
+
+static void __net_exit mirred_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, mirred_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations mirred_net_ops = {
+	.init = mirred_init_net,
+	.exit = mirred_exit_net,
+	.id   = &mirred_net_id,
+	.size = sizeof(struct tc_action_net),
 };
 
 MODULE_AUTHOR("Jamal Hadi Salim(2002)");
@@ -270,12 +312,12 @@ static int __init mirred_init_module(void)
 		return err;
 
 	pr_info("Mirror/redirect action on\n");
-	return tcf_register_action(&act_mirred_ops, MIRRED_TAB_MASK);
+	return tcf_register_action(&act_mirred_ops, &mirred_net_ops);
 }
 
 static void __exit mirred_cleanup_module(void)
 {
-	tcf_unregister_action(&act_mirred_ops);
+	tcf_unregister_action(&act_mirred_ops, &mirred_net_ops);
 	unregister_netdevice_notifier(&mirred_device_notifier);
 }
 
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 27607b863aba..0f65cdfbfb1d 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -31,6 +31,8 @@
 
 #define NAT_TAB_MASK	15
 
+static int nat_net_id;
+
 static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = {
 	[TCA_NAT_PARMS]	= { .len = sizeof(struct tc_nat) },
 };
@@ -38,6 +40,7 @@ static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = {
 static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
 			struct tc_action *a, int ovr, int bind)
 {
+	struct tc_action_net *tn = net_generic(net, nat_net_id);
 	struct nlattr *tb[TCA_NAT_MAX + 1];
 	struct tc_nat *parm;
 	int ret = 0, err;
@@ -54,9 +57,9 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
 		return -EINVAL;
 	parm = nla_data(tb[TCA_NAT_PARMS]);
 
-	if (!tcf_hash_check(parm->index, a, bind)) {
-		ret = tcf_hash_create(parm->index, est, a, sizeof(*p),
-				      bind, false);
+	if (!tcf_hash_check(tn, parm->index, a, bind)) {
+		ret = tcf_hash_create(tn, parm->index, est, a,
+				      sizeof(*p), bind, false);
 		if (ret)
 			return ret;
 		ret = ACT_P_CREATED;
@@ -79,7 +82,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
 	spin_unlock_bh(&p->tcf_lock);
 
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(a);
+		tcf_hash_insert(tn, a);
 
 	return ret;
 }
@@ -274,6 +277,22 @@ nla_put_failure:
 	return -1;
 }
 
+static int tcf_nat_walker(struct net *net, struct sk_buff *skb,
+			  struct netlink_callback *cb, int type,
+			  struct tc_action *a)
+{
+	struct tc_action_net *tn = net_generic(net, nat_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_nat_search(struct net *net, struct tc_action *a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, nat_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
 static struct tc_action_ops act_nat_ops = {
 	.kind		=	"nat",
 	.type		=	TCA_ACT_NAT,
@@ -281,6 +300,29 @@ static struct tc_action_ops act_nat_ops = {
 	.act		=	tcf_nat,
 	.dump		=	tcf_nat_dump,
 	.init		=	tcf_nat_init,
+	.walk		=	tcf_nat_walker,
+	.lookup		=	tcf_nat_search,
+};
+
+static __net_init int nat_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, nat_net_id);
+
+	return tc_action_net_init(tn, &act_nat_ops, NAT_TAB_MASK);
+}
+
+static void __net_exit nat_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, nat_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations nat_net_ops = {
+	.init = nat_init_net,
+	.exit = nat_exit_net,
+	.id   = &nat_net_id,
+	.size = sizeof(struct tc_action_net),
 };
 
 MODULE_DESCRIPTION("Stateless NAT actions");
@@ -288,12 +330,12 @@ MODULE_LICENSE("GPL");
 
 static int __init nat_init_module(void)
 {
-	return tcf_register_action(&act_nat_ops, NAT_TAB_MASK);
+	return tcf_register_action(&act_nat_ops, &nat_net_ops);
 }
 
 static void __exit nat_cleanup_module(void)
 {
-	tcf_unregister_action(&act_nat_ops);
+	tcf_unregister_action(&act_nat_ops, &nat_net_ops);
 }
 
 module_init(nat_init_module);
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index e38a7701f154..429c3ab65142 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -25,6 +25,8 @@
 
 #define PEDIT_TAB_MASK	15
 
+static int pedit_net_id;
+
 static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = {
 	[TCA_PEDIT_PARMS]	= { .len = sizeof(struct tc_pedit) },
 };
@@ -33,6 +35,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 			  struct nlattr *est, struct tc_action *a,
 			  int ovr, int bind)
 {
+	struct tc_action_net *tn = net_generic(net, pedit_net_id);
 	struct nlattr *tb[TCA_PEDIT_MAX + 1];
 	struct tc_pedit *parm;
 	int ret = 0, err;
@@ -54,11 +57,11 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 	if (nla_len(tb[TCA_PEDIT_PARMS]) < sizeof(*parm) + ksize)
 		return -EINVAL;
 
-	if (!tcf_hash_check(parm->index, a, bind)) {
+	if (!tcf_hash_check(tn, parm->index, a, bind)) {
 		if (!parm->nkeys)
 			return -EINVAL;
-		ret = tcf_hash_create(parm->index, est, a, sizeof(*p),
-				      bind, false);
+		ret = tcf_hash_create(tn, parm->index, est, a,
+				      sizeof(*p), bind, false);
 		if (ret)
 			return ret;
 		p = to_pedit(a);
@@ -93,7 +96,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 	memcpy(p->tcfp_keys, parm->keys, ksize);
 	spin_unlock_bh(&p->tcf_lock);
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(a);
+		tcf_hash_insert(tn, a);
 	return ret;
 }
 
@@ -211,6 +214,22 @@ nla_put_failure:
 	return -1;
 }
 
+static int tcf_pedit_walker(struct net *net, struct sk_buff *skb,
+			    struct netlink_callback *cb, int type,
+			    struct tc_action *a)
+{
+	struct tc_action_net *tn = net_generic(net, pedit_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_pedit_search(struct net *net, struct tc_action *a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, pedit_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
 static struct tc_action_ops act_pedit_ops = {
 	.kind		=	"pedit",
 	.type		=	TCA_ACT_PEDIT,
@@ -219,6 +238,29 @@ static struct tc_action_ops act_pedit_ops = {
 	.dump		=	tcf_pedit_dump,
 	.cleanup	=	tcf_pedit_cleanup,
 	.init		=	tcf_pedit_init,
+	.walk		=	tcf_pedit_walker,
+	.lookup		=	tcf_pedit_search,
+};
+
+static __net_init int pedit_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, pedit_net_id);
+
+	return tc_action_net_init(tn, &act_pedit_ops, PEDIT_TAB_MASK);
+}
+
+static void __net_exit pedit_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, pedit_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations pedit_net_ops = {
+	.init = pedit_init_net,
+	.exit = pedit_exit_net,
+	.id   = &pedit_net_id,
+	.size = sizeof(struct tc_action_net),
 };
 
 MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
@@ -227,12 +269,12 @@ MODULE_LICENSE("GPL");
 
 static int __init pedit_init_module(void)
 {
-	return tcf_register_action(&act_pedit_ops, PEDIT_TAB_MASK);
+	return tcf_register_action(&act_pedit_ops, &pedit_net_ops);
 }
 
 static void __exit pedit_cleanup_module(void)
 {
-	tcf_unregister_action(&act_pedit_ops);
+	tcf_unregister_action(&act_pedit_ops, &pedit_net_ops);
 }
 
 module_init(pedit_init_module);
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 9a1c42a43f92..330f14e302e8 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -55,10 +55,14 @@ struct tc_police_compat {
 
 /* Each policer is serialized by its individual spinlock */
 
-static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *cb,
-			      int type, struct tc_action *a)
+static int police_net_id;
+
+static int tcf_act_police_walker(struct net *net, struct sk_buff *skb,
+				 struct netlink_callback *cb, int type,
+				 struct tc_action *a)
 {
-	struct tcf_hashinfo *hinfo = a->ops->hinfo;
+	struct tc_action_net *tn = net_generic(net, police_net_id);
+	struct tcf_hashinfo *hinfo = tn->hinfo;
 	struct hlist_head *head;
 	struct tcf_common *p;
 	int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
@@ -121,7 +125,8 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla,
 	struct tc_police *parm;
 	struct tcf_police *police;
 	struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL;
-	struct tcf_hashinfo *hinfo = a->ops->hinfo;
+	struct tc_action_net *tn = net_generic(net, police_net_id);
+	struct tcf_hashinfo *hinfo = tn->hinfo;
 	int size;
 
 	if (nla == NULL)
@@ -139,7 +144,7 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla,
 	parm = nla_data(tb[TCA_POLICE_TBF]);
 
 	if (parm->index) {
-		if (tcf_hash_search(a, parm->index)) {
+		if (tcf_hash_search(tn, a, parm->index)) {
 			police = to_police(a->priv);
 			if (bind) {
 				police->tcf_bindcnt += 1;
@@ -233,7 +238,7 @@ override:
 
 	police->tcfp_t_c = ktime_get_ns();
 	police->tcf_index = parm->index ? parm->index :
-		tcf_hash_new_index(hinfo);
+		tcf_hash_new_index(tn);
 	h = tcf_hash(police->tcf_index, POL_TAB_MASK);
 	spin_lock_bh(&hinfo->lock);
 	hlist_add_head(&police->tcf_head, &hinfo->htab[h]);
@@ -342,6 +347,13 @@ nla_put_failure:
 	return -1;
 }
 
+static int tcf_police_search(struct net *net, struct tc_action *a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, police_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
 MODULE_AUTHOR("Alexey Kuznetsov");
 MODULE_DESCRIPTION("Policing actions");
 MODULE_LICENSE("GPL");
@@ -353,19 +365,41 @@ static struct tc_action_ops act_police_ops = {
 	.act		=	tcf_act_police,
 	.dump		=	tcf_act_police_dump,
 	.init		=	tcf_act_police_locate,
-	.walk		=	tcf_act_police_walker
+	.walk		=	tcf_act_police_walker,
+	.lookup		=	tcf_police_search,
+};
+
+static __net_init int police_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, police_net_id);
+
+	return tc_action_net_init(tn, &act_police_ops, POL_TAB_MASK);
+}
+
+static void __net_exit police_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, police_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations police_net_ops = {
+	.init = police_init_net,
+	.exit = police_exit_net,
+	.id   = &police_net_id,
+	.size = sizeof(struct tc_action_net),
 };
 
 static int __init
 police_init_module(void)
 {
-	return tcf_register_action(&act_police_ops, POL_TAB_MASK);
+	return tcf_register_action(&act_police_ops, &police_net_ops);
 }
 
 static void __exit
 police_cleanup_module(void)
 {
-	tcf_unregister_action(&act_police_ops);
+	tcf_unregister_action(&act_police_ops, &police_net_ops);
 }
 
 module_init(police_init_module);
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index d6b708d6afdf..75b2be13fbcc 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -26,6 +26,8 @@
 
 #define SIMP_TAB_MASK     7
 
+static int simp_net_id;
+
 #define SIMP_MAX_DATA	32
 static int tcf_simp(struct sk_buff *skb, const struct tc_action *a,
 		    struct tcf_result *res)
@@ -80,6 +82,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
 			 struct nlattr *est, struct tc_action *a,
 			 int ovr, int bind)
 {
+	struct tc_action_net *tn = net_generic(net, simp_net_id);
 	struct nlattr *tb[TCA_DEF_MAX + 1];
 	struct tc_defact *parm;
 	struct tcf_defact *d;
@@ -102,9 +105,9 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
 	parm = nla_data(tb[TCA_DEF_PARMS]);
 	defdata = nla_data(tb[TCA_DEF_DATA]);
 
-	if (!tcf_hash_check(parm->index, a, bind)) {
-		ret = tcf_hash_create(parm->index, est, a, sizeof(*d),
-				      bind, false);
+	if (!tcf_hash_check(tn, parm->index, a, bind)) {
+		ret = tcf_hash_create(tn, parm->index, est, a,
+				      sizeof(*d), bind, false);
 		if (ret)
 			return ret;
 
@@ -129,7 +132,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
 	}
 
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(a);
+		tcf_hash_insert(tn, a);
 	return ret;
 }
 
@@ -161,6 +164,22 @@ nla_put_failure:
 	return -1;
 }
 
+static int tcf_simp_walker(struct net *net, struct sk_buff *skb,
+			   struct netlink_callback *cb, int type,
+			   struct tc_action *a)
+{
+	struct tc_action_net *tn = net_generic(net, simp_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_simp_search(struct net *net, struct tc_action *a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, simp_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
 static struct tc_action_ops act_simp_ops = {
 	.kind		=	"simple",
 	.type		=	TCA_ACT_SIMP,
@@ -169,6 +188,29 @@ static struct tc_action_ops act_simp_ops = {
 	.dump		=	tcf_simp_dump,
 	.cleanup	=	tcf_simp_release,
 	.init		=	tcf_simp_init,
+	.walk		=	tcf_simp_walker,
+	.lookup		=	tcf_simp_search,
+};
+
+static __net_init int simp_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, simp_net_id);
+
+	return tc_action_net_init(tn, &act_simp_ops, SIMP_TAB_MASK);
+}
+
+static void __net_exit simp_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, simp_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations simp_net_ops = {
+	.init = simp_init_net,
+	.exit = simp_exit_net,
+	.id   = &simp_net_id,
+	.size = sizeof(struct tc_action_net),
 };
 
 MODULE_AUTHOR("Jamal Hadi Salim(2005)");
@@ -177,8 +219,7 @@ MODULE_LICENSE("GPL");
 
 static int __init simp_init_module(void)
 {
-	int ret;
-	ret = tcf_register_action(&act_simp_ops, SIMP_TAB_MASK);
+	int ret = tcf_register_action(&act_simp_ops, &simp_net_ops);
 	if (!ret)
 		pr_info("Simple TC action Loaded\n");
 	return ret;
@@ -186,7 +227,7 @@ static int __init simp_init_module(void)
 
 static void __exit simp_cleanup_module(void)
 {
-	tcf_unregister_action(&act_simp_ops);
+	tcf_unregister_action(&act_simp_ops, &simp_net_ops);
 }
 
 module_init(simp_init_module);
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 6751b5f8c046..cfcdbdc00c9b 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -29,6 +29,8 @@
 
 #define SKBEDIT_TAB_MASK     15
 
+static int skbedit_net_id;
+
 static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a,
 		       struct tcf_result *res)
 {
@@ -61,6 +63,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
 			    struct nlattr *est, struct tc_action *a,
 			    int ovr, int bind)
 {
+	struct tc_action_net *tn = net_generic(net, skbedit_net_id);
 	struct nlattr *tb[TCA_SKBEDIT_MAX + 1];
 	struct tc_skbedit *parm;
 	struct tcf_skbedit *d;
@@ -98,9 +101,9 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
 
 	parm = nla_data(tb[TCA_SKBEDIT_PARMS]);
 
-	if (!tcf_hash_check(parm->index, a, bind)) {
-		ret = tcf_hash_create(parm->index, est, a, sizeof(*d),
-				      bind, false);
+	if (!tcf_hash_check(tn, parm->index, a, bind)) {
+		ret = tcf_hash_create(tn, parm->index, est, a,
+				      sizeof(*d), bind, false);
 		if (ret)
 			return ret;
 
@@ -130,7 +133,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
 	spin_unlock_bh(&d->tcf_lock);
 
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(a);
+		tcf_hash_insert(tn, a);
 	return ret;
 }
 
@@ -173,6 +176,22 @@ nla_put_failure:
 	return -1;
 }
 
+static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb,
+			      struct netlink_callback *cb, int type,
+			      struct tc_action *a)
+{
+	struct tc_action_net *tn = net_generic(net, skbedit_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_skbedit_search(struct net *net, struct tc_action *a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, skbedit_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
 static struct tc_action_ops act_skbedit_ops = {
 	.kind		=	"skbedit",
 	.type		=	TCA_ACT_SKBEDIT,
@@ -180,6 +199,29 @@ static struct tc_action_ops act_skbedit_ops = {
 	.act		=	tcf_skbedit,
 	.dump		=	tcf_skbedit_dump,
 	.init		=	tcf_skbedit_init,
+	.walk		=	tcf_skbedit_walker,
+	.lookup		=	tcf_skbedit_search,
+};
+
+static __net_init int skbedit_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, skbedit_net_id);
+
+	return tc_action_net_init(tn, &act_skbedit_ops, SKBEDIT_TAB_MASK);
+}
+
+static void __net_exit skbedit_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, skbedit_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations skbedit_net_ops = {
+	.init = skbedit_init_net,
+	.exit = skbedit_exit_net,
+	.id   = &skbedit_net_id,
+	.size = sizeof(struct tc_action_net),
 };
 
 MODULE_AUTHOR("Alexander Duyck, <alexander.h.duyck@intel.com>");
@@ -188,12 +230,12 @@ MODULE_LICENSE("GPL");
 
 static int __init skbedit_init_module(void)
 {
-	return tcf_register_action(&act_skbedit_ops, SKBEDIT_TAB_MASK);
+	return tcf_register_action(&act_skbedit_ops, &skbedit_net_ops);
 }
 
 static void __exit skbedit_cleanup_module(void)
 {
-	tcf_unregister_action(&act_skbedit_ops);
+	tcf_unregister_action(&act_skbedit_ops, &skbedit_net_ops);
 }
 
 module_init(skbedit_init_module);
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 796785e0bf96..bab8ae0cefc0 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -21,6 +21,8 @@
 
 #define VLAN_TAB_MASK     15
 
+static int vlan_net_id;
+
 static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a,
 		    struct tcf_result *res)
 {
@@ -68,6 +70,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
 			 struct nlattr *est, struct tc_action *a,
 			 int ovr, int bind)
 {
+	struct tc_action_net *tn = net_generic(net, vlan_net_id);
 	struct nlattr *tb[TCA_VLAN_MAX + 1];
 	struct tc_vlan *parm;
 	struct tcf_vlan *v;
@@ -115,9 +118,9 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
 	}
 	action = parm->v_action;
 
-	if (!tcf_hash_check(parm->index, a, bind)) {
-		ret = tcf_hash_create(parm->index, est, a, sizeof(*v),
-				      bind, false);
+	if (!tcf_hash_check(tn, parm->index, a, bind)) {
+		ret = tcf_hash_create(tn, parm->index, est, a,
+				      sizeof(*v), bind, false);
 		if (ret)
 			return ret;
 
@@ -143,7 +146,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
 	spin_unlock_bh(&v->tcf_lock);
 
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(a);
+		tcf_hash_insert(tn, a);
 	return ret;
 }
 
@@ -181,6 +184,22 @@ nla_put_failure:
 	return -1;
 }
 
+static int tcf_vlan_walker(struct net *net, struct sk_buff *skb,
+			   struct netlink_callback *cb, int type,
+			   struct tc_action *a)
+{
+	struct tc_action_net *tn = net_generic(net, vlan_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_vlan_search(struct net *net, struct tc_action *a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, vlan_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
 static struct tc_action_ops act_vlan_ops = {
 	.kind		=	"vlan",
 	.type		=	TCA_ACT_VLAN,
@@ -188,16 +207,39 @@ static struct tc_action_ops act_vlan_ops = {
 	.act		=	tcf_vlan,
 	.dump		=	tcf_vlan_dump,
 	.init		=	tcf_vlan_init,
+	.walk		=	tcf_vlan_walker,
+	.lookup		=	tcf_vlan_search,
+};
+
+static __net_init int vlan_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, vlan_net_id);
+
+	return tc_action_net_init(tn, &act_vlan_ops, VLAN_TAB_MASK);
+}
+
+static void __net_exit vlan_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, vlan_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations vlan_net_ops = {
+	.init = vlan_init_net,
+	.exit = vlan_exit_net,
+	.id   = &vlan_net_id,
+	.size = sizeof(struct tc_action_net),
 };
 
 static int __init vlan_init_module(void)
 {
-	return tcf_register_action(&act_vlan_ops, VLAN_TAB_MASK);
+	return tcf_register_action(&act_vlan_ops, &vlan_net_ops);
 }
 
 static void __exit vlan_cleanup_module(void)
 {
-	tcf_unregister_action(&act_vlan_ops);
+	tcf_unregister_action(&act_vlan_ops, &vlan_net_ops);
 }
 
 module_init(vlan_init_module);
-- 
cgit v1.2.3-59-g8ed1b


From 65aebfc002abc1827ac7c8644a2bba0459ce3ce2 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Tue, 23 Feb 2016 12:13:54 -0500
Subject: net: dsa: add port_vlan_dump routine

Similar to port_fdb_dump, add a port_vlan_dump function to DSA drivers
which gets passed the switchdev VLAN object and callback.

This function, if implemented, takes precedence over the soon legacy
vlan_getnext/port_pvid_get approach.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/dsa/dsa.txt | 4 ++++
 include/net/dsa.h                    | 3 +++
 net/dsa/slave.c                      | 3 +++
 3 files changed, 10 insertions(+)

(limited to 'net')

diff --git a/Documentation/networking/dsa/dsa.txt b/Documentation/networking/dsa/dsa.txt
index ebf21530471f..350a502e031f 100644
--- a/Documentation/networking/dsa/dsa.txt
+++ b/Documentation/networking/dsa/dsa.txt
@@ -554,6 +554,10 @@ Bridge VLAN filtering
 - port_vlan_del: bridge layer function invoked when a VLAN is removed from the
   given switch port
 
+- port_vlan_dump: bridge layer function invoked with a switchdev callback
+  function that the driver has to call for each VLAN the given port is a member
+  of. A switchdev object is used to carry the VID and bridge flags.
+
 - vlan_getnext: bridge layer function invoked to query the next configured VLAN
   in the switch, i.e. returns the bitmaps of members and untagged ports
 
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 1c845d7bf0b2..ebc0d9ea96a1 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -313,6 +313,9 @@ struct dsa_switch_driver {
 				 struct switchdev_trans *trans);
 	int	(*port_vlan_del)(struct dsa_switch *ds, int port,
 				 const struct switchdev_obj_port_vlan *vlan);
+	int	(*port_vlan_dump)(struct dsa_switch *ds, int port,
+				  struct switchdev_obj_port_vlan *vlan,
+				  int (*cb)(struct switchdev_obj *obj));
 	int	(*port_pvid_get)(struct dsa_switch *ds, int port, u16 *pvid);
 	int	(*vlan_getnext)(struct dsa_switch *ds, u16 *vid,
 				unsigned long *ports, unsigned long *untagged);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 14ca9784ec0c..a9cbb72fb155 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -248,6 +248,9 @@ static int dsa_slave_port_vlan_dump(struct net_device *dev,
 	u16 pvid, vid = 0;
 	int err;
 
+	if (ds->drv->port_vlan_dump)
+		return ds->drv->port_vlan_dump(ds, p->port, vlan, cb);
+
 	if (!ds->drv->vlan_getnext || !ds->drv->port_pvid_get)
 		return -EOPNOTSUPP;
 
-- 
cgit v1.2.3-59-g8ed1b


From 477b184526a7f44164029eea720da0e0c888cac6 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Tue, 23 Feb 2016 12:13:56 -0500
Subject: net: dsa: drop vlan_getnext

The VLAN GetNext operation is specific to some switches, and thus can be
complicated to implement for some drivers.

Remove the support for the vlan_getnext/port_pvid_get approach in favor
of the generic and simpler port_vlan_dump function.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/dsa/dsa.txt |  9 ---------
 include/net/dsa.h                    |  3 ---
 net/dsa/slave.c                      | 35 +----------------------------------
 3 files changed, 1 insertion(+), 46 deletions(-)

(limited to 'net')

diff --git a/Documentation/networking/dsa/dsa.txt b/Documentation/networking/dsa/dsa.txt
index 350a502e031f..974e9c387d1e 100644
--- a/Documentation/networking/dsa/dsa.txt
+++ b/Documentation/networking/dsa/dsa.txt
@@ -542,12 +542,6 @@ Bridge layer
 Bridge VLAN filtering
 ---------------------
 
-- port_pvid_get: bridge layer function invoked when a Port-based VLAN ID is
-  queried for the given switch port
-
-- port_pvid_set: bridge layer function invoked when a Port-based VLAN ID needs
-  to be configured on the given switch port
-
 - port_vlan_add: bridge layer function invoked when a VLAN is configured
   (tagged or untagged) for the given switch port
 
@@ -558,9 +552,6 @@ Bridge VLAN filtering
   function that the driver has to call for each VLAN the given port is a member
   of. A switchdev object is used to carry the VID and bridge flags.
 
-- vlan_getnext: bridge layer function invoked to query the next configured VLAN
-  in the switch, i.e. returns the bitmaps of members and untagged ports
-
 - port_fdb_add: bridge layer function invoked when the bridge wants to install a
   Forwarding Database entry, the switch hardware should be programmed with the
   specified address in the specified VLAN Id in the forwarding database
diff --git a/include/net/dsa.h b/include/net/dsa.h
index ebc0d9ea96a1..3dd54867174a 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -316,9 +316,6 @@ struct dsa_switch_driver {
 	int	(*port_vlan_dump)(struct dsa_switch *ds, int port,
 				  struct switchdev_obj_port_vlan *vlan,
 				  int (*cb)(struct switchdev_obj *obj));
-	int	(*port_pvid_get)(struct dsa_switch *ds, int port, u16 *pvid);
-	int	(*vlan_getnext)(struct dsa_switch *ds, u16 *vid,
-				unsigned long *ports, unsigned long *untagged);
 
 	/*
 	 * Forwarding database
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index a9cbb72fb155..cde29239b60d 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -243,44 +243,11 @@ static int dsa_slave_port_vlan_dump(struct net_device *dev,
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
 	struct dsa_switch *ds = p->parent;
-	DECLARE_BITMAP(members, DSA_MAX_PORTS);
-	DECLARE_BITMAP(untagged, DSA_MAX_PORTS);
-	u16 pvid, vid = 0;
-	int err;
 
 	if (ds->drv->port_vlan_dump)
 		return ds->drv->port_vlan_dump(ds, p->port, vlan, cb);
 
-	if (!ds->drv->vlan_getnext || !ds->drv->port_pvid_get)
-		return -EOPNOTSUPP;
-
-	err = ds->drv->port_pvid_get(ds, p->port, &pvid);
-	if (err)
-		return err;
-
-	for (;;) {
-		err = ds->drv->vlan_getnext(ds, &vid, members, untagged);
-		if (err)
-			break;
-
-		if (!test_bit(p->port, members))
-			continue;
-
-		memset(vlan, 0, sizeof(*vlan));
-		vlan->vid_begin = vlan->vid_end = vid;
-
-		if (vid == pvid)
-			vlan->flags |= BRIDGE_VLAN_INFO_PVID;
-
-		if (test_bit(p->port, untagged))
-			vlan->flags |= BRIDGE_VLAN_INFO_UNTAGGED;
-
-		err = cb(&vlan->obj);
-		if (err)
-			break;
-	}
-
-	return err == -ENOENT ? 0 : err;
+	return -EOPNOTSUPP;
 }
 
 static int dsa_slave_port_fdb_add(struct net_device *dev,
-- 
cgit v1.2.3-59-g8ed1b


From b170997acedc6c11ed2ec07b8d415601e65bb452 Mon Sep 17 00:00:00 2001
From: Jon Paul Maloy <jon.maloy@ericsson.com>
Date: Wed, 24 Feb 2016 11:00:19 -0500
Subject: tipc: eliminate risk of finding to-be-deleted node instance

Although we have never seen it happen, we have identified the
following problematic scenario when nodes are stopped and deleted:

CPU0:                            CPU1:

tipc_node_xxx()                                   //ref == 1
   tipc_node_put()                                //ref -> 0
                                 tipc_node_find() // node still in table
       tipc_node_delete()
         list_del_rcu(n. list)
                                 tipc_node_get()  //ref -> 1, bad
         kfree_rcu()

                                 tipc_node_put() //ref to 0 again.
                                 kfree_rcu()     // BOOM!

We fix this by introducing use of the conditional kref_get_if_not_zero()
instead of kref_get() in the function tipc_node_find(). This eliminates
any risk of post-mortem access.

Reported-by: Zhijiang Hu <huzhijiang@gmail.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/node.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/tipc/node.c b/net/tipc/node.c
index 9fcc2fb0ee00..792bbcbb3eed 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -245,23 +245,23 @@ static void tipc_node_get(struct tipc_node *node)
  */
 static struct tipc_node *tipc_node_find(struct net *net, u32 addr)
 {
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
+	struct tipc_net *tn = tipc_net(net);
 	struct tipc_node *node;
+	unsigned int thash = tipc_hashfn(addr);
 
 	if (unlikely(!in_own_cluster_exact(net, addr)))
 		return NULL;
 
 	rcu_read_lock();
-	hlist_for_each_entry_rcu(node, &tn->node_htable[tipc_hashfn(addr)],
-				 hash) {
-		if (node->addr == addr) {
-			tipc_node_get(node);
-			rcu_read_unlock();
-			return node;
-		}
+	hlist_for_each_entry_rcu(node, &tn->node_htable[thash], hash) {
+		if (node->addr != addr)
+			continue;
+		if (!kref_get_unless_zero(&node->kref))
+			node = NULL;
+		break;
 	}
 	rcu_read_unlock();
-	return NULL;
+	return node;
 }
 
 static void tipc_node_read_lock(struct tipc_node *n)
-- 
cgit v1.2.3-59-g8ed1b


From d25a01257e422a4bdeb426f69529d57c73b235fe Mon Sep 17 00:00:00 2001
From: Jon Paul Maloy <jon.maloy@ericsson.com>
Date: Wed, 24 Feb 2016 11:10:48 -0500
Subject: tipc: fix crash during node removal

When the TIPC module is unloaded, we have identified a race condition
that allows a node reference counter to go to zero and the node instance
being freed before the node timer is finished with accessing it. This
leads to occasional crashes, especially in multi-namespace environments.

The scenario goes as follows:

CPU0:(node_stop)                       CPU1:(node_timeout)  // ref == 2

1:                                          if(!mod_timer())
2: if (del_timer())
3:   tipc_node_put()                                        // ref -> 1
4: tipc_node_put()                                          // ref -> 0
5:   kfree_rcu(node);
6:                                               tipc_node_get(node)
7:                                               // BOOM!

We now clean up this functionality as follows:

1) We remove the node pointer from the node lookup table before we
   attempt deactivating the timer. This way, we reduce the risk that
   tipc_node_find() may obtain a valid pointer to an instance marked
   for deletion; a harmless but undesirable situation.

2) We use del_timer_sync() instead of del_timer() to safely deactivate
   the node timer without any risk that it might be reactivated by the
   timeout handler. There is no risk of deadlock here, since the two
   functions never touch the same spinlocks.

3: We remove a pointless tipc_node_get() + tipc_node_put() from the
   timeout handler.

Reported-by: Zhijiang Hu <huzhijiang@gmail.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/node.c | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/tipc/node.c b/net/tipc/node.c
index 792bbcbb3eed..cdb79503d890 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -225,9 +225,10 @@ static unsigned int tipc_hashfn(u32 addr)
 
 static void tipc_node_kref_release(struct kref *kref)
 {
-	struct tipc_node *node = container_of(kref, struct tipc_node, kref);
+	struct tipc_node *n = container_of(kref, struct tipc_node, kref);
 
-	tipc_node_delete(node);
+	kfree(n->bc_entry.link);
+	kfree_rcu(n, rcu);
 }
 
 static void tipc_node_put(struct tipc_node *node)
@@ -395,21 +396,20 @@ static void tipc_node_delete(struct tipc_node *node)
 {
 	list_del_rcu(&node->list);
 	hlist_del_rcu(&node->hash);
-	kfree(node->bc_entry.link);
-	kfree_rcu(node, rcu);
+	tipc_node_put(node);
+
+	del_timer_sync(&node->timer);
+	tipc_node_put(node);
 }
 
 void tipc_node_stop(struct net *net)
 {
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
+	struct tipc_net *tn = tipc_net(net);
 	struct tipc_node *node, *t_node;
 
 	spin_lock_bh(&tn->node_list_lock);
-	list_for_each_entry_safe(node, t_node, &tn->node_list, list) {
-		if (del_timer(&node->timer))
-			tipc_node_put(node);
-		tipc_node_put(node);
-	}
+	list_for_each_entry_safe(node, t_node, &tn->node_list, list)
+		tipc_node_delete(node);
 	spin_unlock_bh(&tn->node_list_lock);
 }
 
@@ -530,9 +530,7 @@ static void tipc_node_timeout(unsigned long data)
 		if (rc & TIPC_LINK_DOWN_EVT)
 			tipc_node_link_down(n, bearer_id, false);
 	}
-	if (!mod_timer(&n->timer, jiffies + n->keepalive_intv))
-		tipc_node_get(n);
-	tipc_node_put(n);
+	mod_timer(&n->timer, jiffies + n->keepalive_intv);
 }
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From 619b17452a388d6be69a7b28dc92310ea3f5aa65 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 24 Feb 2016 17:20:17 +0100
Subject: tipc: fix null deref crash in compat config path

msg.dst_sk needs to be set up with a valid socket because some callbacks
later derive the netns from it.

Fixes: 263ea09084d172d ("Revert "genl: Add genlmsg_new_unicast() for unicast message allocation")
Reported-by: Jon Maloy <maloy@donjonn.com>
Bisected-by: Jon Maloy <maloy@donjonn.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/netlink_compat.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
index de66d8f945ed..d7d050f44fc1 100644
--- a/net/tipc/netlink_compat.c
+++ b/net/tipc/netlink_compat.c
@@ -1105,6 +1105,7 @@ static int tipc_nl_compat_recv(struct sk_buff *skb, struct genl_info *info)
 	msg.req = nlmsg_data(req_nlh) + GENL_HDRLEN + TIPC_GENL_HDRLEN;
 	msg.cmd = req_userhdr->cmd;
 	msg.net = genl_info_net(info);
+	msg.dst_sk = skb->sk;
 
 	if ((msg.cmd & 0xC000) && (!netlink_net_capable(skb, CAP_NET_ADMIN))) {
 		msg.rep = tipc_get_err_tlv(TIPC_CFG_NOT_NET_ADMIN);
-- 
cgit v1.2.3-59-g8ed1b


From f1705ec197e705b79ea40fe7a2cc5acfa1d3bfac Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Wed, 24 Feb 2016 09:25:37 -0800
Subject: net: ipv6: Make address flushing on ifdown optional

Currently, all ipv6 addresses are flushed when the interface is configured
down, including global, static addresses:

    $ ip -6 addr show dev eth1
    3: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 state UP qlen 1000
        inet6 2100:1::2/120 scope global
           valid_lft forever preferred_lft forever
        inet6 fe80::e0:f9ff:fe79:34bd/64 scope link
           valid_lft forever preferred_lft forever
    $ ip link set dev eth1 down
    $ ip -6 addr show dev eth1
    << nothing; all addresses have been flushed>>

Add a new sysctl to make this behavior optional. The new setting defaults to
flush all addresses to maintain backwards compatibility. When the set global
addresses with no expire times are not flushed on an admin down. The sysctl
is per-interface or system-wide for all interfaces

    $ sysctl -w net.ipv6.conf.eth1.keep_addr_on_down=1
or
    $ sysctl -w net.ipv6.conf.all.keep_addr_on_down=1

Will keep addresses on eth1 on an admin down.

    $ ip -6 addr show dev eth1
    3: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 state UP qlen 1000
        inet6 2100:1::2/120 scope global
           valid_lft forever preferred_lft forever
        inet6 fe80::e0:f9ff:fe79:34bd/64 scope link
           valid_lft forever preferred_lft forever
    $ ip link set dev eth1 down
    $ ip -6 addr show dev eth1
    3: eth1: <BROADCAST,MULTICAST> mtu 1500 state DOWN qlen 1000
        inet6 2100:1::2/120 scope global tentative
           valid_lft forever preferred_lft forever
        inet6 fe80::e0:f9ff:fe79:34bd/64 scope link tentative
           valid_lft forever preferred_lft forever

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |   9 +++
 include/linux/ipv6.h                   |   1 +
 include/uapi/linux/ipv6.h              |   1 +
 net/ipv6/addrconf.c                    | 136 +++++++++++++++++++++++++++++----
 4 files changed, 132 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 24ce97f42d35..d5df40c75aa4 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1563,6 +1563,15 @@ temp_prefered_lft - INTEGER
 	Preferred lifetime (in seconds) for temporary addresses.
 	Default: 86400 (1 day)
 
+keep_addr_on_down - INTEGER
+	Keep all IPv6 addresses on an interface down event. If set static
+	global addresses with no expiration time are not flushed.
+	  >0 : enabled
+	   0 : system default
+	  <0 : disabled
+
+	Default: 0 (addresses are removed)
+
 max_desync_factor - INTEGER
 	Maximum value for DESYNC_FACTOR, which is a random value
 	that ensures that clients don't synchronize with each
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 4b2267e1b7c3..7edc14fb66b6 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -62,6 +62,7 @@ struct ipv6_devconf {
 		struct in6_addr secret;
 	} stable_secret;
 	__s32		use_oif_addrs_only;
+	__s32		keep_addr_on_down;
 	void		*sysctl;
 };
 
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index ec117b65d5a5..395876060f50 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -176,6 +176,7 @@ enum {
 	DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
 	DEVCONF_DROP_UNICAST_IN_L2_MULTICAST,
 	DEVCONF_DROP_UNSOLICITED_NA,
+	DEVCONF_KEEP_ADDR_ON_DOWN,
 	DEVCONF_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 4751f8922362..a2d6f6c242af 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -216,6 +216,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
 	},
 	.use_oif_addrs_only	= 0,
 	.ignore_routes_with_linkdown = 0,
+	.keep_addr_on_down	= 0,
 };
 
 static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -260,6 +261,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 	},
 	.use_oif_addrs_only	= 0,
 	.ignore_routes_with_linkdown = 0,
+	.keep_addr_on_down	= 0,
 };
 
 /* Check if a valid qdisc is available */
@@ -3168,6 +3170,55 @@ static void addrconf_gre_config(struct net_device *dev)
 }
 #endif
 
+static int fixup_permanent_addr(struct inet6_dev *idev,
+				struct inet6_ifaddr *ifp)
+{
+	if (!ifp->rt) {
+		struct rt6_info *rt;
+
+		rt = addrconf_dst_alloc(idev, &ifp->addr, false);
+		if (unlikely(IS_ERR(rt)))
+			return PTR_ERR(rt);
+
+		ifp->rt = rt;
+	}
+
+	if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) {
+		addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
+				      idev->dev, 0, 0);
+	}
+
+	addrconf_dad_start(ifp);
+
+	return 0;
+}
+
+static void addrconf_permanent_addr(struct net_device *dev)
+{
+	struct inet6_ifaddr *ifp, *tmp;
+	struct inet6_dev *idev;
+
+	idev = __in6_dev_get(dev);
+	if (!idev)
+		return;
+
+	write_lock_bh(&idev->lock);
+
+	list_for_each_entry_safe(ifp, tmp, &idev->addr_list, if_list) {
+		if ((ifp->flags & IFA_F_PERMANENT) &&
+		    fixup_permanent_addr(idev, ifp) < 0) {
+			write_unlock_bh(&idev->lock);
+			ipv6_del_addr(ifp);
+			write_lock_bh(&idev->lock);
+
+			net_info_ratelimited("%s: Failed to add prefix route for address %pI6c; dropping\n",
+					     idev->dev->name, &ifp->addr);
+		}
+	}
+
+	write_unlock_bh(&idev->lock);
+}
+
 static int addrconf_notify(struct notifier_block *this, unsigned long event,
 			   void *ptr)
 {
@@ -3253,6 +3304,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
 			run_pending = 1;
 		}
 
+		/* restore routes for permanent addresses */
+		addrconf_permanent_addr(dev);
+
 		switch (dev->type) {
 #if IS_ENABLED(CONFIG_IPV6_SIT)
 		case ARPHRD_SIT:
@@ -3356,7 +3410,10 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 {
 	struct net *net = dev_net(dev);
 	struct inet6_dev *idev;
-	struct inet6_ifaddr *ifa;
+	struct inet6_ifaddr *ifa, *tmp;
+	struct list_head del_list;
+	int _keep_addr;
+	bool keep_addr;
 	int state, i;
 
 	ASSERT_RTNL();
@@ -3383,6 +3440,16 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 
 	}
 
+	/* aggregate the system setting and interface setting */
+	_keep_addr = net->ipv6.devconf_all->keep_addr_on_down;
+	if (!_keep_addr)
+		_keep_addr = idev->cnf.keep_addr_on_down;
+
+	/* combine the user config with event to determine if permanent
+	 * addresses are to be removed from address hash table
+	 */
+	keep_addr = !(how || _keep_addr <= 0);
+
 	/* Step 2: clear hash table */
 	for (i = 0; i < IN6_ADDR_HSIZE; i++) {
 		struct hlist_head *h = &inet6_addr_lst[i];
@@ -3391,9 +3458,15 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 restart:
 		hlist_for_each_entry_rcu(ifa, h, addr_lst) {
 			if (ifa->idev == idev) {
-				hlist_del_init_rcu(&ifa->addr_lst);
 				addrconf_del_dad_work(ifa);
-				goto restart;
+				/* combined flag + permanent flag decide if
+				 * address is retained on a down event
+				 */
+				if (!keep_addr ||
+				    !(ifa->flags & IFA_F_PERMANENT)) {
+					hlist_del_init_rcu(&ifa->addr_lst);
+					goto restart;
+				}
 			}
 		}
 		spin_unlock_bh(&addrconf_hash_lock);
@@ -3427,31 +3500,53 @@ restart:
 		write_lock_bh(&idev->lock);
 	}
 
-	while (!list_empty(&idev->addr_list)) {
-		ifa = list_first_entry(&idev->addr_list,
-				       struct inet6_ifaddr, if_list);
-		addrconf_del_dad_work(ifa);
+	/* re-combine the user config with event to determine if permanent
+	 * addresses are to be removed from the interface list
+	 */
+	keep_addr = (!how && _keep_addr > 0);
 
-		list_del(&ifa->if_list);
+	INIT_LIST_HEAD(&del_list);
+	list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) {
+		addrconf_del_dad_work(ifa);
 
 		write_unlock_bh(&idev->lock);
-
 		spin_lock_bh(&ifa->lock);
-		state = ifa->state;
-		ifa->state = INET6_IFADDR_STATE_DEAD;
+
+		if (keep_addr && (ifa->flags & IFA_F_PERMANENT)) {
+			/* set state to skip the notifier below */
+			state = INET6_IFADDR_STATE_DEAD;
+			ifa->state = 0;
+			if (!(ifa->flags & IFA_F_NODAD))
+				ifa->flags |= IFA_F_TENTATIVE;
+		} else {
+			state = ifa->state;
+			ifa->state = INET6_IFADDR_STATE_DEAD;
+
+			list_del(&ifa->if_list);
+			list_add(&ifa->if_list, &del_list);
+		}
+
 		spin_unlock_bh(&ifa->lock);
 
 		if (state != INET6_IFADDR_STATE_DEAD) {
 			__ipv6_ifa_notify(RTM_DELADDR, ifa);
 			inet6addr_notifier_call_chain(NETDEV_DOWN, ifa);
 		}
-		in6_ifa_put(ifa);
 
 		write_lock_bh(&idev->lock);
 	}
 
 	write_unlock_bh(&idev->lock);
 
+	/* now clean up addresses to be removed */
+	while (!list_empty(&del_list)) {
+		ifa = list_first_entry(&del_list,
+				       struct inet6_ifaddr, if_list);
+		list_del(&ifa->if_list);
+
+		in6_ifa_put(ifa);
+	}
+
 	/* Step 5: Discard anycast and multicast list */
 	if (how) {
 		ipv6_ac_destroy_dev(idev);
@@ -4716,6 +4811,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only;
 	array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = cnf->drop_unicast_in_l2_multicast;
 	array[DEVCONF_DROP_UNSOLICITED_NA] = cnf->drop_unsolicited_na;
+	array[DEVCONF_KEEP_ADDR_ON_DOWN] = cnf->keep_addr_on_down;
 }
 
 static inline size_t inet6_ifla6_size(void)
@@ -5197,10 +5293,12 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
 			if (rt)
 				ip6_del_rt(rt);
 		}
-		dst_hold(&ifp->rt->dst);
-
-		ip6_del_rt(ifp->rt);
+		if (ifp->rt) {
+			dst_hold(&ifp->rt->dst);
 
+			ip6_del_rt(ifp->rt);
+			ifp->rt = NULL;
+		}
 		rt_genid_bump_ipv6(net);
 		break;
 	}
@@ -5803,6 +5901,14 @@ static struct addrconf_sysctl_table
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec,
 		},
+		{
+			.procname       = "keep_addr_on_down",
+			.data           = &ipv6_devconf.keep_addr_on_down,
+			.maxlen         = sizeof(int),
+			.mode           = 0644,
+			.proc_handler   = proc_dointvec,
+
+		},
 		{
 			/* sentinel */
 		}
-- 
cgit v1.2.3-59-g8ed1b


From a87cb3e48ee86d29868d3f59cfb9ce1a8fa63314 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Wed, 24 Feb 2016 10:02:52 -0800
Subject: net: Facility to report route quality of connected sockets

This patch add the SO_CNX_ADVICE socket option (setsockopt only). The
purpose is to allow an application to give feedback to the kernel about
the quality of the network path for a connected socket. The value
argument indicates the type of quality report. For this initial patch
the only supported advice is a value of 1 which indicates "bad path,
please reroute"-- the action taken by the kernel is to call
dst_negative_advice which will attempt to choose a different ECMP route,
reset the TX hash for flow label and UDP source port in encapsulation,
etc.

This facility should be useful for connected UDP sockets where only the
application can provide any feedback about path quality. It could also
be useful for TCP applications that have additional knowledge about the
path outside of the normal TCP control loop.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/alpha/include/uapi/asm/socket.h   | 2 ++
 arch/avr32/include/uapi/asm/socket.h   | 2 ++
 arch/frv/include/uapi/asm/socket.h     | 2 ++
 arch/ia64/include/uapi/asm/socket.h    | 2 ++
 arch/m32r/include/uapi/asm/socket.h    | 2 ++
 arch/mips/include/uapi/asm/socket.h    | 2 ++
 arch/mn10300/include/uapi/asm/socket.h | 2 ++
 arch/parisc/include/uapi/asm/socket.h  | 2 ++
 arch/powerpc/include/uapi/asm/socket.h | 2 ++
 arch/s390/include/uapi/asm/socket.h    | 2 ++
 arch/sparc/include/uapi/asm/socket.h   | 2 ++
 arch/xtensa/include/uapi/asm/socket.h  | 2 ++
 include/uapi/asm-generic/socket.h      | 2 ++
 net/core/sock.c                        | 4 ++++
 14 files changed, 30 insertions(+)

(limited to 'net')

diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index c5fb9e6bc3a5..9e46d6e656d9 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -95,4 +95,6 @@
 #define SO_ATTACH_REUSEPORT_CBPF	51
 #define SO_ATTACH_REUSEPORT_EBPF	52
 
+#define SO_CNX_ADVICE		53
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/avr32/include/uapi/asm/socket.h b/arch/avr32/include/uapi/asm/socket.h
index 9de0796240a0..1fd147f09a38 100644
--- a/arch/avr32/include/uapi/asm/socket.h
+++ b/arch/avr32/include/uapi/asm/socket.h
@@ -88,4 +88,6 @@
 #define SO_ATTACH_REUSEPORT_CBPF	51
 #define SO_ATTACH_REUSEPORT_EBPF	52
 
+#define SO_CNX_ADVICE		53
+
 #endif /* _UAPI__ASM_AVR32_SOCKET_H */
diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h
index f02e4849ae83..afbc98f02d27 100644
--- a/arch/frv/include/uapi/asm/socket.h
+++ b/arch/frv/include/uapi/asm/socket.h
@@ -88,5 +88,7 @@
 #define SO_ATTACH_REUSEPORT_CBPF	51
 #define SO_ATTACH_REUSEPORT_EBPF	52
 
+#define SO_CNX_ADVICE		53
+
 #endif /* _ASM_SOCKET_H */
 
diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h
index bce29166de1b..0018fad9039f 100644
--- a/arch/ia64/include/uapi/asm/socket.h
+++ b/arch/ia64/include/uapi/asm/socket.h
@@ -97,4 +97,6 @@
 #define SO_ATTACH_REUSEPORT_CBPF	51
 #define SO_ATTACH_REUSEPORT_EBPF	52
 
+#define SO_CNX_ADVICE		53
+
 #endif /* _ASM_IA64_SOCKET_H */
diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h
index 14aa4a6bccf1..5fe42fc7b6c5 100644
--- a/arch/m32r/include/uapi/asm/socket.h
+++ b/arch/m32r/include/uapi/asm/socket.h
@@ -88,4 +88,6 @@
 #define SO_ATTACH_REUSEPORT_CBPF	51
 #define SO_ATTACH_REUSEPORT_EBPF	52
 
+#define SO_CNX_ADVICE		53
+
 #endif /* _ASM_M32R_SOCKET_H */
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index 5910fe294e93..2027240aafbb 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -106,4 +106,6 @@
 #define SO_ATTACH_REUSEPORT_CBPF	51
 #define SO_ATTACH_REUSEPORT_EBPF	52
 
+#define SO_CNX_ADVICE		53
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h
index 58b1aa01ab9f..5129f23a9ee1 100644
--- a/arch/mn10300/include/uapi/asm/socket.h
+++ b/arch/mn10300/include/uapi/asm/socket.h
@@ -88,4 +88,6 @@
 #define SO_ATTACH_REUSEPORT_CBPF	51
 #define SO_ATTACH_REUSEPORT_EBPF	52
 
+#define SO_CNX_ADVICE		53
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index f9cf1223422c..9c935d717df9 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -87,4 +87,6 @@
 #define SO_ATTACH_REUSEPORT_CBPF	0x402C
 #define SO_ATTACH_REUSEPORT_EBPF	0x402D
 
+#define SO_CNX_ADVICE		0x402E
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/powerpc/include/uapi/asm/socket.h b/arch/powerpc/include/uapi/asm/socket.h
index dd54f28ecdec..1672e3398270 100644
--- a/arch/powerpc/include/uapi/asm/socket.h
+++ b/arch/powerpc/include/uapi/asm/socket.h
@@ -95,4 +95,6 @@
 #define SO_ATTACH_REUSEPORT_CBPF	51
 #define SO_ATTACH_REUSEPORT_EBPF	52
 
+#define SO_CNX_ADVICE		53
+
 #endif	/* _ASM_POWERPC_SOCKET_H */
diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h
index d02e89d14fef..41b51c2f4f1b 100644
--- a/arch/s390/include/uapi/asm/socket.h
+++ b/arch/s390/include/uapi/asm/socket.h
@@ -94,4 +94,6 @@
 #define SO_ATTACH_REUSEPORT_CBPF	51
 #define SO_ATTACH_REUSEPORT_EBPF	52
 
+#define SO_CNX_ADVICE		53
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index d270ee91968e..31aede3af088 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -84,6 +84,8 @@
 #define SO_ATTACH_REUSEPORT_CBPF	0x0035
 #define SO_ATTACH_REUSEPORT_EBPF	0x0036
 
+#define SO_CNX_ADVICE		0x0037
+
 /* Security levels - as per NRL IPv6 - don't actually do anything */
 #define SO_SECURITY_AUTHENTICATION		0x5001
 #define SO_SECURITY_ENCRYPTION_TRANSPORT	0x5002
diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h
index fd3b96d1153f..81435d995e11 100644
--- a/arch/xtensa/include/uapi/asm/socket.h
+++ b/arch/xtensa/include/uapi/asm/socket.h
@@ -99,4 +99,6 @@
 #define SO_ATTACH_REUSEPORT_CBPF	51
 #define SO_ATTACH_REUSEPORT_EBPF	52
 
+#define SO_CNX_ADVICE		53
+
 #endif	/* _XTENSA_SOCKET_H */
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index fb8a41668382..67d632f1743d 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -90,4 +90,6 @@
 #define SO_ATTACH_REUSEPORT_CBPF	51
 #define SO_ATTACH_REUSEPORT_EBPF	52
 
+#define SO_CNX_ADVICE		53
+
 #endif /* __ASM_GENERIC_SOCKET_H */
diff --git a/net/core/sock.c b/net/core/sock.c
index 46dc8ad7d050..4493ff820c2c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -987,6 +987,10 @@ set_rcvbuf:
 		sk->sk_incoming_cpu = val;
 		break;
 
+	case SO_CNX_ADVICE:
+		if (val == 1)
+			dst_negative_advice(sk);
+		break;
 	default:
 		ret = -ENOPROTOOPT;
 		break;
-- 
cgit v1.2.3-59-g8ed1b


From 3f1ac7a700d039c61d8d8b99f28d605d489a60cf Mon Sep 17 00:00:00 2001
From: David Decotigny <decot@googlers.com>
Date: Wed, 24 Feb 2016 10:57:59 -0800
Subject: net: ethtool: add new ETHTOOL_xLINKSETTINGS API

This patch defines a new ETHTOOL_GLINKSETTINGS/SLINKSETTINGS API,
handled by the new get_link_ksettings/set_link_ksettings callbacks.
This API provides support for most legacy ethtool_cmd fields, adds
support for larger link mode masks (up to 4064 bits, variable length),
and removes ethtool_cmd deprecated
fields (transceiver/maxrxpkt/maxtxpkt).

This API is deprecating the legacy ETHTOOL_GSET/SSET API and provides
the following backward compatibility properties:
 - legacy ethtool with legacy drivers: no change, still using the
   get_settings/set_settings callbacks.
 - legacy ethtool with new get/set_link_ksettings drivers: the new
   driver callbacks are used, data internally converted to legacy
   ethtool_cmd. ETHTOOL_GSET will return only the 1st 32b of each link
   mode mask. ETHTOOL_SSET will fail if user tries to set the
   ethtool_cmd deprecated fields to
   non-0 (transceiver/maxrxpkt/maxtxpkt). A kernel warning is logged if
   driver sets higher bits.
 - future ethtool with legacy drivers: no change, still using the
   get_settings/set_settings callbacks, internally converted to new data
   structure. Deprecated fields (transceiver/maxrxpkt/maxtxpkt) will be
   ignored and seen as 0 from user space. Note that that "future"
   ethtool tool will not allow changes to these deprecated fields.
 - future ethtool with new drivers: direct call to the new callbacks.

By "future" ethtool, what is meant is:
 - query: first try ETHTOOL_GLINKSETTINGS, and revert to ETHTOOL_GSET if
   fails
 - set: query first and remember which of ETHTOOL_GLINKSETTINGS or
   ETHTOOL_GSET was successful
   + if ETHTOOL_GLINKSETTINGS was successful, then change config with
     ETHTOOL_SLINKSETTINGS. A failure there is final (do not try
     ETHTOOL_SSET).
   + otherwise ETHTOOL_GSET was successful, change config with
     ETHTOOL_SSET. A failure there is final (do not try
     ETHTOOL_SLINKSETTINGS).

The interaction user/kernel via the new API requires a small
ETHTOOL_GLINKSETTINGS handshake first to agree on the length of the link
mode bitmaps. If kernel doesn't agree with user, it returns the bitmap
length it is expecting from user as a negative length (and cmd field is
0). When kernel and user agree, kernel returns valid info in all
fields (ie. link mode length > 0 and cmd is ETHTOOL_GLINKSETTINGS).

Data structure crossing user/kernel boundary is 32/64-bit
agnostic. Converted internally to a legal kernel bitmap.

The internal __ethtool_get_settings kernel helper will gradually be
replaced by __ethtool_get_link_ksettings by the time the first
"link_settings" drivers start to appear. So this patch doesn't change
it, it will be removed before it needs to be changed.

Signed-off-by: David Decotigny <decot@googlers.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h      |  91 ++++++++-
 include/uapi/linux/ethtool.h | 322 +++++++++++++++++++++++-------
 net/core/ethtool.c           | 453 ++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 786 insertions(+), 80 deletions(-)

(limited to 'net')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 472d7d7b01c2..8a400a54c92e 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -12,6 +12,7 @@
 #ifndef _LINUX_ETHTOOL_H
 #define _LINUX_ETHTOOL_H
 
+#include <linux/bitmap.h>
 #include <linux/compat.h>
 #include <uapi/linux/ethtool.h>
 
@@ -40,9 +41,6 @@ struct compat_ethtool_rxnfc {
 
 #include <linux/rculist.h>
 
-extern int __ethtool_get_settings(struct net_device *dev,
-				  struct ethtool_cmd *cmd);
-
 /**
  * enum ethtool_phys_id_state - indicator state for physical identification
  * @ETHTOOL_ID_INACTIVE: Physical ID indicator should be deactivated
@@ -97,13 +95,74 @@ static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings)
 	return index % n_rx_rings;
 }
 
+/* number of link mode bits/ulongs handled internally by kernel */
+#define __ETHTOOL_LINK_MODE_MASK_NBITS			\
+	(__ETHTOOL_LINK_MODE_LAST + 1)
+
+/* declare a link mode bitmap */
+#define __ETHTOOL_DECLARE_LINK_MODE_MASK(name)		\
+	DECLARE_BITMAP(name, __ETHTOOL_LINK_MODE_MASK_NBITS)
+
+/* drivers must ignore base.cmd and base.link_mode_masks_nwords
+ * fields, but they are allowed to overwrite them (will be ignored).
+ */
+struct ethtool_link_ksettings {
+	struct ethtool_link_settings base;
+	struct {
+		__ETHTOOL_DECLARE_LINK_MODE_MASK(supported);
+		__ETHTOOL_DECLARE_LINK_MODE_MASK(advertising);
+		__ETHTOOL_DECLARE_LINK_MODE_MASK(lp_advertising);
+	} link_modes;
+};
+
+/**
+ * ethtool_link_ksettings_zero_link_mode - clear link_ksettings link mode mask
+ *   @ptr : pointer to struct ethtool_link_ksettings
+ *   @name : one of supported/advertising/lp_advertising
+ */
+#define ethtool_link_ksettings_zero_link_mode(ptr, name)		\
+	bitmap_zero((ptr)->link_modes.name, __ETHTOOL_LINK_MODE_MASK_NBITS)
+
+/**
+ * ethtool_link_ksettings_add_link_mode - set bit in link_ksettings
+ * link mode mask
+ *   @ptr : pointer to struct ethtool_link_ksettings
+ *   @name : one of supported/advertising/lp_advertising
+ *   @mode : one of the ETHTOOL_LINK_MODE_*_BIT
+ * (not atomic, no bound checking)
+ */
+#define ethtool_link_ksettings_add_link_mode(ptr, name, mode)		\
+	__set_bit(ETHTOOL_LINK_MODE_ ## mode ## _BIT, (ptr)->link_modes.name)
+
+/**
+ * ethtool_link_ksettings_test_link_mode - test bit in ksettings link mode mask
+ *   @ptr : pointer to struct ethtool_link_ksettings
+ *   @name : one of supported/advertising/lp_advertising
+ *   @mode : one of the ETHTOOL_LINK_MODE_*_BIT
+ * (not atomic, no bound checking)
+ *
+ * Returns true/false.
+ */
+#define ethtool_link_ksettings_test_link_mode(ptr, name, mode)		\
+	test_bit(ETHTOOL_LINK_MODE_ ## mode ## _BIT, (ptr)->link_modes.name)
+
+extern int
+__ethtool_get_link_ksettings(struct net_device *dev,
+			     struct ethtool_link_ksettings *link_ksettings);
+
+/* DEPRECATED, use __ethtool_get_link_ksettings */
+extern int __ethtool_get_settings(struct net_device *dev,
+				  struct ethtool_cmd *cmd);
+
 /**
  * struct ethtool_ops - optional netdev operations
- * @get_settings: Get various device settings including Ethernet link
+ * @get_settings: DEPRECATED, use %get_link_ksettings/%set_link_ksettings
+ *	API. Get various device settings including Ethernet link
  *	settings. The @cmd parameter is expected to have been cleared
- *	before get_settings is called. Returns a negative error code or
- *	zero.
- * @set_settings: Set various device settings including Ethernet link
+ *	before get_settings is called. Returns a negative error code
+ *	or zero.
+ * @set_settings: DEPRECATED, use %get_link_ksettings/%set_link_ksettings
+ *	API. Set various device settings including Ethernet link
  *	settings.  Returns a negative error code or zero.
  * @get_drvinfo: Report driver/device information.  Should only set the
  *	@driver, @version, @fw_version and @bus_info fields.  If not
@@ -211,6 +270,19 @@ static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings)
  *	a TX queue has this number, return -EINVAL. If only a RX queue or a TX
  *	queue has this number, ignore the inapplicable fields.
  *	Returns a negative error code or zero.
+ * @get_link_ksettings: When defined, takes precedence over the
+ *	%get_settings method. Get various device settings
+ *	including Ethernet link settings. The %cmd and
+ *	%link_mode_masks_nwords fields should be ignored (use
+ *	%__ETHTOOL_LINK_MODE_MASK_NBITS instead of the latter), any
+ *	change to them will be overwritten by kernel. Returns a
+ *	negative error code or zero.
+ * @set_link_ksettings: When defined, takes precedence over the
+ *	%set_settings method. Set various device settings including
+ *	Ethernet link settings. The %cmd and %link_mode_masks_nwords
+ *	fields should be ignored (use %__ETHTOOL_LINK_MODE_MASK_NBITS
+ *	instead of the latter), any change to them will be overwritten
+ *	by kernel. Returns a negative error code or zero.
  *
  * All operations are optional (i.e. the function pointer may be set
  * to %NULL) and callers must take this into account.  Callers must
@@ -293,6 +365,9 @@ struct ethtool_ops {
 					  struct ethtool_coalesce *);
 	int	(*set_per_queue_coalesce)(struct net_device *, u32,
 					  struct ethtool_coalesce *);
-
+	int	(*get_link_ksettings)(struct net_device *,
+				      struct ethtool_link_ksettings *);
+	int	(*set_link_ksettings)(struct net_device *,
+				      const struct ethtool_link_ksettings *);
 };
 #endif /* _LINUX_ETHTOOL_H */
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index f15ae02621a1..37fd6dc33de4 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -21,7 +21,8 @@
  */
 
 /**
- * struct ethtool_cmd - link control and status
+ * struct ethtool_cmd - DEPRECATED, link control and status
+ * This structure is DEPRECATED, please use struct ethtool_link_settings.
  * @cmd: Command number = %ETHTOOL_GSET or %ETHTOOL_SSET
  * @supported: Bitmask of %SUPPORTED_* flags for the link modes,
  *	physical connectors and other link features for which the
@@ -1219,8 +1220,12 @@ struct ethtool_per_queue_op {
 };
 
 /* CMDs currently supported */
-#define ETHTOOL_GSET		0x00000001 /* Get settings. */
-#define ETHTOOL_SSET		0x00000002 /* Set settings. */
+#define ETHTOOL_GSET		0x00000001 /* DEPRECATED, Get settings.
+					    * Please use ETHTOOL_GLINKSETTINGS
+					    */
+#define ETHTOOL_SSET		0x00000002 /* DEPRECATED, Set settings.
+					    * Please use ETHTOOL_SLINKSETTINGS
+					    */
 #define ETHTOOL_GDRVINFO	0x00000003 /* Get driver info. */
 #define ETHTOOL_GREGS		0x00000004 /* Get NIC registers. */
 #define ETHTOOL_GWOL		0x00000005 /* Get wake-on-lan options. */
@@ -1302,73 +1307,139 @@ struct ethtool_per_queue_op {
 
 #define ETHTOOL_PERQUEUE	0x0000004b /* Set per queue options */
 
+#define ETHTOOL_GLINKSETTINGS	0x0000004c /* Get ethtool_link_settings */
+#define ETHTOOL_SLINKSETTINGS	0x0000004d /* Set ethtool_link_settings */
+
+
 /* compatibility with older code */
 #define SPARC_ETH_GSET		ETHTOOL_GSET
 #define SPARC_ETH_SSET		ETHTOOL_SSET
 
-#define SUPPORTED_10baseT_Half		(1 << 0)
-#define SUPPORTED_10baseT_Full		(1 << 1)
-#define SUPPORTED_100baseT_Half		(1 << 2)
-#define SUPPORTED_100baseT_Full		(1 << 3)
-#define SUPPORTED_1000baseT_Half	(1 << 4)
-#define SUPPORTED_1000baseT_Full	(1 << 5)
-#define SUPPORTED_Autoneg		(1 << 6)
-#define SUPPORTED_TP			(1 << 7)
-#define SUPPORTED_AUI			(1 << 8)
-#define SUPPORTED_MII			(1 << 9)
-#define SUPPORTED_FIBRE			(1 << 10)
-#define SUPPORTED_BNC			(1 << 11)
-#define SUPPORTED_10000baseT_Full	(1 << 12)
-#define SUPPORTED_Pause			(1 << 13)
-#define SUPPORTED_Asym_Pause		(1 << 14)
-#define SUPPORTED_2500baseX_Full	(1 << 15)
-#define SUPPORTED_Backplane		(1 << 16)
-#define SUPPORTED_1000baseKX_Full	(1 << 17)
-#define SUPPORTED_10000baseKX4_Full	(1 << 18)
-#define SUPPORTED_10000baseKR_Full	(1 << 19)
-#define SUPPORTED_10000baseR_FEC	(1 << 20)
-#define SUPPORTED_20000baseMLD2_Full	(1 << 21)
-#define SUPPORTED_20000baseKR2_Full	(1 << 22)
-#define SUPPORTED_40000baseKR4_Full	(1 << 23)
-#define SUPPORTED_40000baseCR4_Full	(1 << 24)
-#define SUPPORTED_40000baseSR4_Full	(1 << 25)
-#define SUPPORTED_40000baseLR4_Full	(1 << 26)
-#define SUPPORTED_56000baseKR4_Full	(1 << 27)
-#define SUPPORTED_56000baseCR4_Full	(1 << 28)
-#define SUPPORTED_56000baseSR4_Full	(1 << 29)
-#define SUPPORTED_56000baseLR4_Full	(1 << 30)
-
-#define ADVERTISED_10baseT_Half		(1 << 0)
-#define ADVERTISED_10baseT_Full		(1 << 1)
-#define ADVERTISED_100baseT_Half	(1 << 2)
-#define ADVERTISED_100baseT_Full	(1 << 3)
-#define ADVERTISED_1000baseT_Half	(1 << 4)
-#define ADVERTISED_1000baseT_Full	(1 << 5)
-#define ADVERTISED_Autoneg		(1 << 6)
-#define ADVERTISED_TP			(1 << 7)
-#define ADVERTISED_AUI			(1 << 8)
-#define ADVERTISED_MII			(1 << 9)
-#define ADVERTISED_FIBRE		(1 << 10)
-#define ADVERTISED_BNC			(1 << 11)
-#define ADVERTISED_10000baseT_Full	(1 << 12)
-#define ADVERTISED_Pause		(1 << 13)
-#define ADVERTISED_Asym_Pause		(1 << 14)
-#define ADVERTISED_2500baseX_Full	(1 << 15)
-#define ADVERTISED_Backplane		(1 << 16)
-#define ADVERTISED_1000baseKX_Full	(1 << 17)
-#define ADVERTISED_10000baseKX4_Full	(1 << 18)
-#define ADVERTISED_10000baseKR_Full	(1 << 19)
-#define ADVERTISED_10000baseR_FEC	(1 << 20)
-#define ADVERTISED_20000baseMLD2_Full	(1 << 21)
-#define ADVERTISED_20000baseKR2_Full	(1 << 22)
-#define ADVERTISED_40000baseKR4_Full	(1 << 23)
-#define ADVERTISED_40000baseCR4_Full	(1 << 24)
-#define ADVERTISED_40000baseSR4_Full	(1 << 25)
-#define ADVERTISED_40000baseLR4_Full	(1 << 26)
-#define ADVERTISED_56000baseKR4_Full	(1 << 27)
-#define ADVERTISED_56000baseCR4_Full	(1 << 28)
-#define ADVERTISED_56000baseSR4_Full	(1 << 29)
-#define ADVERTISED_56000baseLR4_Full	(1 << 30)
+/* Link mode bit indices */
+enum ethtool_link_mode_bit_indices {
+	ETHTOOL_LINK_MODE_10baseT_Half_BIT	= 0,
+	ETHTOOL_LINK_MODE_10baseT_Full_BIT	= 1,
+	ETHTOOL_LINK_MODE_100baseT_Half_BIT	= 2,
+	ETHTOOL_LINK_MODE_100baseT_Full_BIT	= 3,
+	ETHTOOL_LINK_MODE_1000baseT_Half_BIT	= 4,
+	ETHTOOL_LINK_MODE_1000baseT_Full_BIT	= 5,
+	ETHTOOL_LINK_MODE_Autoneg_BIT		= 6,
+	ETHTOOL_LINK_MODE_TP_BIT		= 7,
+	ETHTOOL_LINK_MODE_AUI_BIT		= 8,
+	ETHTOOL_LINK_MODE_MII_BIT		= 9,
+	ETHTOOL_LINK_MODE_FIBRE_BIT		= 10,
+	ETHTOOL_LINK_MODE_BNC_BIT		= 11,
+	ETHTOOL_LINK_MODE_10000baseT_Full_BIT	= 12,
+	ETHTOOL_LINK_MODE_Pause_BIT		= 13,
+	ETHTOOL_LINK_MODE_Asym_Pause_BIT	= 14,
+	ETHTOOL_LINK_MODE_2500baseX_Full_BIT	= 15,
+	ETHTOOL_LINK_MODE_Backplane_BIT		= 16,
+	ETHTOOL_LINK_MODE_1000baseKX_Full_BIT	= 17,
+	ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT	= 18,
+	ETHTOOL_LINK_MODE_10000baseKR_Full_BIT	= 19,
+	ETHTOOL_LINK_MODE_10000baseR_FEC_BIT	= 20,
+	ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT = 21,
+	ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT	= 22,
+	ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT	= 23,
+	ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT	= 24,
+	ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT	= 25,
+	ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT	= 26,
+	ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT	= 27,
+	ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT	= 28,
+	ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT	= 29,
+	ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT	= 30,
+
+	/* Last allowed bit for __ETHTOOL_LINK_MODE_LEGACY_MASK is bit
+	 * 31. Please do NOT define any SUPPORTED_* or ADVERTISED_*
+	 * macro for bits > 31. The only way to use indices > 31 is to
+	 * use the new ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API.
+	 */
+
+	__ETHTOOL_LINK_MODE_LAST
+	  = ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT,
+};
+
+#define __ETHTOOL_LINK_MODE_LEGACY_MASK(base_name)	\
+	(1UL << (ETHTOOL_LINK_MODE_ ## base_name ## _BIT))
+
+/* DEPRECATED macros. Please migrate to
+ * ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. Please do NOT
+ * define any new SUPPORTED_* macro for bits > 31.
+ */
+#define SUPPORTED_10baseT_Half		__ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Half)
+#define SUPPORTED_10baseT_Full		__ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Full)
+#define SUPPORTED_100baseT_Half		__ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Half)
+#define SUPPORTED_100baseT_Full		__ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Full)
+#define SUPPORTED_1000baseT_Half	__ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Half)
+#define SUPPORTED_1000baseT_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Full)
+#define SUPPORTED_Autoneg		__ETHTOOL_LINK_MODE_LEGACY_MASK(Autoneg)
+#define SUPPORTED_TP			__ETHTOOL_LINK_MODE_LEGACY_MASK(TP)
+#define SUPPORTED_AUI			__ETHTOOL_LINK_MODE_LEGACY_MASK(AUI)
+#define SUPPORTED_MII			__ETHTOOL_LINK_MODE_LEGACY_MASK(MII)
+#define SUPPORTED_FIBRE			__ETHTOOL_LINK_MODE_LEGACY_MASK(FIBRE)
+#define SUPPORTED_BNC			__ETHTOOL_LINK_MODE_LEGACY_MASK(BNC)
+#define SUPPORTED_10000baseT_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseT_Full)
+#define SUPPORTED_Pause			__ETHTOOL_LINK_MODE_LEGACY_MASK(Pause)
+#define SUPPORTED_Asym_Pause		__ETHTOOL_LINK_MODE_LEGACY_MASK(Asym_Pause)
+#define SUPPORTED_2500baseX_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(2500baseX_Full)
+#define SUPPORTED_Backplane		__ETHTOOL_LINK_MODE_LEGACY_MASK(Backplane)
+#define SUPPORTED_1000baseKX_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseKX_Full)
+#define SUPPORTED_10000baseKX4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKX4_Full)
+#define SUPPORTED_10000baseKR_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKR_Full)
+#define SUPPORTED_10000baseR_FEC	__ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseR_FEC)
+#define SUPPORTED_20000baseMLD2_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseMLD2_Full)
+#define SUPPORTED_20000baseKR2_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseKR2_Full)
+#define SUPPORTED_40000baseKR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseKR4_Full)
+#define SUPPORTED_40000baseCR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseCR4_Full)
+#define SUPPORTED_40000baseSR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseSR4_Full)
+#define SUPPORTED_40000baseLR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseLR4_Full)
+#define SUPPORTED_56000baseKR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseKR4_Full)
+#define SUPPORTED_56000baseCR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseCR4_Full)
+#define SUPPORTED_56000baseSR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseSR4_Full)
+#define SUPPORTED_56000baseLR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseLR4_Full)
+/* Please do not define any new SUPPORTED_* macro for bits > 31, see
+ * notice above.
+ */
+
+/*
+ * DEPRECATED macros. Please migrate to
+ * ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. Please do NOT
+ * define any new ADERTISE_* macro for bits > 31.
+ */
+#define ADVERTISED_10baseT_Half		__ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Half)
+#define ADVERTISED_10baseT_Full		__ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Full)
+#define ADVERTISED_100baseT_Half	__ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Half)
+#define ADVERTISED_100baseT_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Full)
+#define ADVERTISED_1000baseT_Half	__ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Half)
+#define ADVERTISED_1000baseT_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Full)
+#define ADVERTISED_Autoneg		__ETHTOOL_LINK_MODE_LEGACY_MASK(Autoneg)
+#define ADVERTISED_TP			__ETHTOOL_LINK_MODE_LEGACY_MASK(TP)
+#define ADVERTISED_AUI			__ETHTOOL_LINK_MODE_LEGACY_MASK(AUI)
+#define ADVERTISED_MII			__ETHTOOL_LINK_MODE_LEGACY_MASK(MII)
+#define ADVERTISED_FIBRE		__ETHTOOL_LINK_MODE_LEGACY_MASK(FIBRE)
+#define ADVERTISED_BNC			__ETHTOOL_LINK_MODE_LEGACY_MASK(BNC)
+#define ADVERTISED_10000baseT_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseT_Full)
+#define ADVERTISED_Pause		__ETHTOOL_LINK_MODE_LEGACY_MASK(Pause)
+#define ADVERTISED_Asym_Pause		__ETHTOOL_LINK_MODE_LEGACY_MASK(Asym_Pause)
+#define ADVERTISED_2500baseX_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(2500baseX_Full)
+#define ADVERTISED_Backplane		__ETHTOOL_LINK_MODE_LEGACY_MASK(Backplane)
+#define ADVERTISED_1000baseKX_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseKX_Full)
+#define ADVERTISED_10000baseKX4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKX4_Full)
+#define ADVERTISED_10000baseKR_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKR_Full)
+#define ADVERTISED_10000baseR_FEC	__ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseR_FEC)
+#define ADVERTISED_20000baseMLD2_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseMLD2_Full)
+#define ADVERTISED_20000baseKR2_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseKR2_Full)
+#define ADVERTISED_40000baseKR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseKR4_Full)
+#define ADVERTISED_40000baseCR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseCR4_Full)
+#define ADVERTISED_40000baseSR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseSR4_Full)
+#define ADVERTISED_40000baseLR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseLR4_Full)
+#define ADVERTISED_56000baseKR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseKR4_Full)
+#define ADVERTISED_56000baseCR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseCR4_Full)
+#define ADVERTISED_56000baseSR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseSR4_Full)
+#define ADVERTISED_56000baseLR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseLR4_Full)
+/* Please do not define any new ADVERTISED_* macro for bits > 31, see
+ * notice above.
+ */
 
 /* The following are all involved in forcing a particular link
  * mode for the device for setting things.  When getting the
@@ -1533,4 +1604,123 @@ enum ethtool_reset_flags {
 };
 #define ETH_RESET_SHARED_SHIFT	16
 
+
+/**
+ * struct ethtool_link_settings - link control and status
+ *
+ * IMPORTANT, Backward compatibility notice: When implementing new
+ *	user-space tools, please first try %ETHTOOL_GLINKSETTINGS, and
+ *	if it succeeds use %ETHTOOL_SLINKSETTINGS to change link
+ *	settings; do not use %ETHTOOL_SSET if %ETHTOOL_GLINKSETTINGS
+ *	succeeded: stick to %ETHTOOL_GLINKSETTINGS/%SLINKSETTINGS in
+ *	that case.  Conversely, if %ETHTOOL_GLINKSETTINGS fails, use
+ *	%ETHTOOL_GSET to query and %ETHTOOL_SSET to change link
+ *	settings; do not use %ETHTOOL_SLINKSETTINGS if
+ *	%ETHTOOL_GLINKSETTINGS failed: stick to
+ *	%ETHTOOL_GSET/%ETHTOOL_SSET in that case.
+ *
+ * @cmd: Command number = %ETHTOOL_GLINKSETTINGS or %ETHTOOL_SLINKSETTINGS
+ * @speed: Link speed (Mbps)
+ * @duplex: Duplex mode; one of %DUPLEX_*
+ * @port: Physical connector type; one of %PORT_*
+ * @phy_address: MDIO address of PHY (transceiver); 0 or 255 if not
+ *	applicable.  For clause 45 PHYs this is the PRTAD.
+ * @autoneg: Enable/disable autonegotiation and auto-detection;
+ *	either %AUTONEG_DISABLE or %AUTONEG_ENABLE
+ * @mdio_support: Bitmask of %ETH_MDIO_SUPPORTS_* flags for the MDIO
+ *	protocols supported by the interface; 0 if unknown.
+ *	Read-only.
+ * @eth_tp_mdix: Ethernet twisted-pair MDI(-X) status; one of
+ *	%ETH_TP_MDI_*.  If the status is unknown or not applicable, the
+ *	value will be %ETH_TP_MDI_INVALID.  Read-only.
+ * @eth_tp_mdix_ctrl: Ethernet twisted pair MDI(-X) control; one of
+ *	%ETH_TP_MDI_*.  If MDI(-X) control is not implemented, reads
+ *	yield %ETH_TP_MDI_INVALID and writes may be ignored or rejected.
+ *	When written successfully, the link should be renegotiated if
+ *	necessary.
+ * @link_mode_masks_nwords: Number of 32-bit words for each of the
+ *	supported, advertising, lp_advertising link mode bitmaps. For
+ *	%ETHTOOL_GLINKSETTINGS: on entry, number of words passed by user
+ *	(>= 0); on return, if handshake in progress, negative if
+ *	request size unsupported by kernel: absolute value indicates
+ *	kernel recommended size and cmd field is 0, as well as all the
+ *	other fields; otherwise (handshake completed), strictly
+ *	positive to indicate size used by kernel and cmd field is
+ *	%ETHTOOL_GLINKSETTINGS, all other fields populated by driver. For
+ *	%ETHTOOL_SLINKSETTINGS: must be valid on entry, ie. a positive
+ *	value returned previously by %ETHTOOL_GLINKSETTINGS, otherwise
+ *	refused. For drivers: ignore this field (use kernel's
+ *	__ETHTOOL_LINK_MODE_MASK_NBITS instead), any change to it will
+ *	be overwritten by kernel.
+ * @supported: Bitmap with each bit meaning given by
+ *	%ethtool_link_mode_bit_indices for the link modes, physical
+ *	connectors and other link features for which the interface
+ *	supports autonegotiation or auto-detection.  Read-only.
+ * @advertising: Bitmap with each bit meaning given by
+ *	%ethtool_link_mode_bit_indices for the link modes, physical
+ *	connectors and other link features that are advertised through
+ *	autonegotiation or enabled for auto-detection.
+ * @lp_advertising: Bitmap with each bit meaning given by
+ *	%ethtool_link_mode_bit_indices for the link modes, and other
+ *	link features that the link partner advertised through
+ *	autonegotiation; 0 if unknown or not applicable.  Read-only.
+ *
+ * If autonegotiation is disabled, the speed and @duplex represent the
+ * fixed link mode and are writable if the driver supports multiple
+ * link modes.  If it is enabled then they are read-only; if the link
+ * is up they represent the negotiated link mode; if the link is down,
+ * the speed is 0, %SPEED_UNKNOWN or the highest enabled speed and
+ * @duplex is %DUPLEX_UNKNOWN or the best enabled duplex mode.
+ *
+ * Some hardware interfaces may have multiple PHYs and/or physical
+ * connectors fitted or do not allow the driver to detect which are
+ * fitted.  For these interfaces @port and/or @phy_address may be
+ * writable, possibly dependent on @autoneg being %AUTONEG_DISABLE.
+ * Otherwise, attempts to write different values may be ignored or
+ * rejected.
+ *
+ * Deprecated %ethtool_cmd fields transceiver, maxtxpkt and maxrxpkt
+ * are not available in %ethtool_link_settings. Until all drivers are
+ * converted to ignore them or to the new %ethtool_link_settings API,
+ * for both queries and changes, users should always try
+ * %ETHTOOL_GLINKSETTINGS first, and if it fails with -ENOTSUPP stick
+ * only to %ETHTOOL_GSET and %ETHTOOL_SSET consistently. If it
+ * succeeds, then users should stick to %ETHTOOL_GLINKSETTINGS and
+ * %ETHTOOL_SLINKSETTINGS (which would support drivers implementing
+ * either %ethtool_cmd or %ethtool_link_settings).
+ *
+ * Users should assume that all fields not marked read-only are
+ * writable and subject to validation by the driver.  They should use
+ * %ETHTOOL_GLINKSETTINGS to get the current values before making specific
+ * changes and then applying them with %ETHTOOL_SLINKSETTINGS.
+ *
+ * Drivers that implement %get_link_ksettings and/or
+ * %set_link_ksettings should ignore the @cmd
+ * and @link_mode_masks_nwords fields (any change to them overwritten
+ * by kernel), and rely only on kernel's internal
+ * %__ETHTOOL_LINK_MODE_MASK_NBITS and
+ * %ethtool_link_mode_mask_t. Drivers that implement
+ * %set_link_ksettings() should validate all fields other than @cmd
+ * and @link_mode_masks_nwords that are not described as read-only or
+ * deprecated, and must ignore all fields described as read-only.
+ */
+struct ethtool_link_settings {
+	__u32	cmd;
+	__u32	speed;
+	__u8	duplex;
+	__u8	port;
+	__u8	phy_address;
+	__u8	autoneg;
+	__u8	mdio_support;
+	__u8	eth_tp_mdix;
+	__u8	eth_tp_mdix_ctrl;
+	__s8	link_mode_masks_nwords;
+	__u32	reserved[8];
+	__u32	link_mode_masks[0];
+	/* layout of link_mode_masks fields:
+	 * __u32 map_supported[link_mode_masks_nwords];
+	 * __u32 map_advertising[link_mode_masks_nwords];
+	 * __u32 map_lp_advertising[link_mode_masks_nwords];
+	 */
+};
 #endif /* _UAPI_LINUX_ETHTOOL_H */
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 2406101002b1..edcec56ed228 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -387,6 +387,359 @@ static int __ethtool_set_flags(struct net_device *dev, u32 data)
 	return 0;
 }
 
+static void convert_legacy_u32_to_link_mode(unsigned long *dst, u32 legacy_u32)
+{
+	bitmap_zero(dst, __ETHTOOL_LINK_MODE_MASK_NBITS);
+	dst[0] = legacy_u32;
+}
+
+/* return false if src had higher bits set. lower bits always updated. */
+static bool convert_link_mode_to_legacy_u32(u32 *legacy_u32,
+					    const unsigned long *src)
+{
+	bool retval = true;
+
+	/* TODO: following test will soon always be true */
+	if (__ETHTOOL_LINK_MODE_MASK_NBITS > 32) {
+		__ETHTOOL_DECLARE_LINK_MODE_MASK(ext);
+
+		bitmap_zero(ext, __ETHTOOL_LINK_MODE_MASK_NBITS);
+		bitmap_fill(ext, 32);
+		bitmap_complement(ext, ext, __ETHTOOL_LINK_MODE_MASK_NBITS);
+		if (bitmap_intersects(ext, src,
+				      __ETHTOOL_LINK_MODE_MASK_NBITS)) {
+			/* src mask goes beyond bit 31 */
+			retval = false;
+		}
+	}
+	*legacy_u32 = src[0];
+	return retval;
+}
+
+/* return false if legacy contained non-0 deprecated fields
+ * transceiver/maxtxpkt/maxrxpkt. rest of ksettings always updated
+ */
+static bool
+convert_legacy_settings_to_link_ksettings(
+	struct ethtool_link_ksettings *link_ksettings,
+	const struct ethtool_cmd *legacy_settings)
+{
+	bool retval = true;
+
+	memset(link_ksettings, 0, sizeof(*link_ksettings));
+
+	/* This is used to tell users that driver is still using these
+	 * deprecated legacy fields, and they should not use
+	 * %ETHTOOL_GLINKSETTINGS/%ETHTOOL_SLINKSETTINGS
+	 */
+	if (legacy_settings->transceiver ||
+	    legacy_settings->maxtxpkt ||
+	    legacy_settings->maxrxpkt)
+		retval = false;
+
+	convert_legacy_u32_to_link_mode(
+		link_ksettings->link_modes.supported,
+		legacy_settings->supported);
+	convert_legacy_u32_to_link_mode(
+		link_ksettings->link_modes.advertising,
+		legacy_settings->advertising);
+	convert_legacy_u32_to_link_mode(
+		link_ksettings->link_modes.lp_advertising,
+		legacy_settings->lp_advertising);
+	link_ksettings->base.speed
+		= ethtool_cmd_speed(legacy_settings);
+	link_ksettings->base.duplex
+		= legacy_settings->duplex;
+	link_ksettings->base.port
+		= legacy_settings->port;
+	link_ksettings->base.phy_address
+		= legacy_settings->phy_address;
+	link_ksettings->base.autoneg
+		= legacy_settings->autoneg;
+	link_ksettings->base.mdio_support
+		= legacy_settings->mdio_support;
+	link_ksettings->base.eth_tp_mdix
+		= legacy_settings->eth_tp_mdix;
+	link_ksettings->base.eth_tp_mdix_ctrl
+		= legacy_settings->eth_tp_mdix_ctrl;
+	return retval;
+}
+
+/* return false if ksettings link modes had higher bits
+ * set. legacy_settings always updated (best effort)
+ */
+static bool
+convert_link_ksettings_to_legacy_settings(
+	struct ethtool_cmd *legacy_settings,
+	const struct ethtool_link_ksettings *link_ksettings)
+{
+	bool retval = true;
+
+	memset(legacy_settings, 0, sizeof(*legacy_settings));
+	/* this also clears the deprecated fields in legacy structure:
+	 * __u8		transceiver;
+	 * __u32	maxtxpkt;
+	 * __u32	maxrxpkt;
+	 */
+
+	retval &= convert_link_mode_to_legacy_u32(
+		&legacy_settings->supported,
+		link_ksettings->link_modes.supported);
+	retval &= convert_link_mode_to_legacy_u32(
+		&legacy_settings->advertising,
+		link_ksettings->link_modes.advertising);
+	retval &= convert_link_mode_to_legacy_u32(
+		&legacy_settings->lp_advertising,
+		link_ksettings->link_modes.lp_advertising);
+	ethtool_cmd_speed_set(legacy_settings, link_ksettings->base.speed);
+	legacy_settings->duplex
+		= link_ksettings->base.duplex;
+	legacy_settings->port
+		= link_ksettings->base.port;
+	legacy_settings->phy_address
+		= link_ksettings->base.phy_address;
+	legacy_settings->autoneg
+		= link_ksettings->base.autoneg;
+	legacy_settings->mdio_support
+		= link_ksettings->base.mdio_support;
+	legacy_settings->eth_tp_mdix
+		= link_ksettings->base.eth_tp_mdix;
+	legacy_settings->eth_tp_mdix_ctrl
+		= link_ksettings->base.eth_tp_mdix_ctrl;
+	return retval;
+}
+
+/* number of 32-bit words to store the user's link mode bitmaps */
+#define __ETHTOOL_LINK_MODE_MASK_NU32			\
+	DIV_ROUND_UP(__ETHTOOL_LINK_MODE_MASK_NBITS, 32)
+
+/* layout of the struct passed from/to userland */
+struct ethtool_link_usettings {
+	struct ethtool_link_settings base;
+	struct {
+		__u32 supported[__ETHTOOL_LINK_MODE_MASK_NU32];
+		__u32 advertising[__ETHTOOL_LINK_MODE_MASK_NU32];
+		__u32 lp_advertising[__ETHTOOL_LINK_MODE_MASK_NU32];
+	} link_modes;
+};
+
+/* Internal kernel helper to query a device ethtool_link_settings.
+ *
+ * Backward compatibility note: for compatibility with legacy drivers
+ * that implement only the ethtool_cmd API, this has to work with both
+ * drivers implementing get_link_ksettings API and drivers
+ * implementing get_settings API. When drivers implement get_settings
+ * and report ethtool_cmd deprecated fields
+ * (transceiver/maxrxpkt/maxtxpkt), these fields are silently ignored
+ * because the resulting struct ethtool_link_settings does not report them.
+ */
+int __ethtool_get_link_ksettings(struct net_device *dev,
+				 struct ethtool_link_ksettings *link_ksettings)
+{
+	int err;
+	struct ethtool_cmd cmd;
+
+	ASSERT_RTNL();
+
+	if (dev->ethtool_ops->get_link_ksettings) {
+		memset(link_ksettings, 0, sizeof(*link_ksettings));
+		return dev->ethtool_ops->get_link_ksettings(dev,
+							    link_ksettings);
+	}
+
+	/* driver doesn't support %ethtool_link_ksettings API. revert to
+	 * legacy %ethtool_cmd API, unless it's not supported either.
+	 * TODO: remove when ethtool_ops::get_settings disappears internally
+	 */
+	err = __ethtool_get_settings(dev, &cmd);
+	if (err < 0)
+		return err;
+
+	/* we ignore deprecated fields transceiver/maxrxpkt/maxtxpkt
+	 */
+	convert_legacy_settings_to_link_ksettings(link_ksettings, &cmd);
+	return err;
+}
+EXPORT_SYMBOL(__ethtool_get_link_ksettings);
+
+/* convert ethtool_link_usettings in user space to a kernel internal
+ * ethtool_link_ksettings. return 0 on success, errno on error.
+ */
+static int load_link_ksettings_from_user(struct ethtool_link_ksettings *to,
+					 const void __user *from)
+{
+	struct ethtool_link_usettings link_usettings;
+
+	if (copy_from_user(&link_usettings, from, sizeof(link_usettings)))
+		return -EFAULT;
+
+	memcpy(&to->base, &link_usettings.base, sizeof(to->base));
+	bitmap_from_u32array(to->link_modes.supported,
+			     __ETHTOOL_LINK_MODE_MASK_NBITS,
+			     link_usettings.link_modes.supported,
+			     __ETHTOOL_LINK_MODE_MASK_NU32);
+	bitmap_from_u32array(to->link_modes.advertising,
+			     __ETHTOOL_LINK_MODE_MASK_NBITS,
+			     link_usettings.link_modes.advertising,
+			     __ETHTOOL_LINK_MODE_MASK_NU32);
+	bitmap_from_u32array(to->link_modes.lp_advertising,
+			     __ETHTOOL_LINK_MODE_MASK_NBITS,
+			     link_usettings.link_modes.lp_advertising,
+			     __ETHTOOL_LINK_MODE_MASK_NU32);
+
+	return 0;
+}
+
+/* convert a kernel internal ethtool_link_ksettings to
+ * ethtool_link_usettings in user space. return 0 on success, errno on
+ * error.
+ */
+static int
+store_link_ksettings_for_user(void __user *to,
+			      const struct ethtool_link_ksettings *from)
+{
+	struct ethtool_link_usettings link_usettings;
+
+	memcpy(&link_usettings.base, &from->base, sizeof(link_usettings));
+	bitmap_to_u32array(link_usettings.link_modes.supported,
+			   __ETHTOOL_LINK_MODE_MASK_NU32,
+			   from->link_modes.supported,
+			   __ETHTOOL_LINK_MODE_MASK_NBITS);
+	bitmap_to_u32array(link_usettings.link_modes.advertising,
+			   __ETHTOOL_LINK_MODE_MASK_NU32,
+			   from->link_modes.advertising,
+			   __ETHTOOL_LINK_MODE_MASK_NBITS);
+	bitmap_to_u32array(link_usettings.link_modes.lp_advertising,
+			   __ETHTOOL_LINK_MODE_MASK_NU32,
+			   from->link_modes.lp_advertising,
+			   __ETHTOOL_LINK_MODE_MASK_NBITS);
+
+	if (copy_to_user(to, &link_usettings, sizeof(link_usettings)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/* Query device for its ethtool_link_settings.
+ *
+ * Backward compatibility note: this function must fail when driver
+ * does not implement ethtool::get_link_ksettings, even if legacy
+ * ethtool_ops::get_settings is implemented. This tells new versions
+ * of ethtool that they should use the legacy API %ETHTOOL_GSET for
+ * this driver, so that they can correctly access the ethtool_cmd
+ * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver
+ * implements ethtool_ops::get_settings anymore.
+ */
+static int ethtool_get_link_ksettings(struct net_device *dev,
+				      void __user *useraddr)
+{
+	int err = 0;
+	struct ethtool_link_ksettings link_ksettings;
+
+	ASSERT_RTNL();
+
+	if (!dev->ethtool_ops->get_link_ksettings)
+		return -EOPNOTSUPP;
+
+	/* handle bitmap nbits handshake */
+	if (copy_from_user(&link_ksettings.base, useraddr,
+			   sizeof(link_ksettings.base)))
+		return -EFAULT;
+
+	if (__ETHTOOL_LINK_MODE_MASK_NU32
+	    != link_ksettings.base.link_mode_masks_nwords) {
+		/* wrong link mode nbits requested */
+		memset(&link_ksettings, 0, sizeof(link_ksettings));
+		/* keep cmd field reset to 0 */
+		/* send back number of words required as negative val */
+		compiletime_assert(__ETHTOOL_LINK_MODE_MASK_NU32 <= S8_MAX,
+				   "need too many bits for link modes!");
+		link_ksettings.base.link_mode_masks_nwords
+			= -((s8)__ETHTOOL_LINK_MODE_MASK_NU32);
+
+		/* copy the base fields back to user, not the link
+		 * mode bitmaps
+		 */
+		if (copy_to_user(useraddr, &link_ksettings.base,
+				 sizeof(link_ksettings.base)))
+			return -EFAULT;
+
+		return 0;
+	}
+
+	/* handshake successful: user/kernel agree on
+	 * link_mode_masks_nwords
+	 */
+
+	memset(&link_ksettings, 0, sizeof(link_ksettings));
+	err = dev->ethtool_ops->get_link_ksettings(dev, &link_ksettings);
+	if (err < 0)
+		return err;
+
+	/* make sure we tell the right values to user */
+	link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS;
+	link_ksettings.base.link_mode_masks_nwords
+		= __ETHTOOL_LINK_MODE_MASK_NU32;
+
+	return store_link_ksettings_for_user(useraddr, &link_ksettings);
+}
+
+/* Update device ethtool_link_settings.
+ *
+ * Backward compatibility note: this function must fail when driver
+ * does not implement ethtool::set_link_ksettings, even if legacy
+ * ethtool_ops::set_settings is implemented. This tells new versions
+ * of ethtool that they should use the legacy API %ETHTOOL_SSET for
+ * this driver, so that they can correctly update the ethtool_cmd
+ * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver
+ * implements ethtool_ops::get_settings anymore.
+ */
+static int ethtool_set_link_ksettings(struct net_device *dev,
+				      void __user *useraddr)
+{
+	int err;
+	struct ethtool_link_ksettings link_ksettings;
+
+	ASSERT_RTNL();
+
+	if (!dev->ethtool_ops->set_link_ksettings)
+		return -EOPNOTSUPP;
+
+	/* make sure nbits field has expected value */
+	if (copy_from_user(&link_ksettings.base, useraddr,
+			   sizeof(link_ksettings.base)))
+		return -EFAULT;
+
+	if (__ETHTOOL_LINK_MODE_MASK_NU32
+	    != link_ksettings.base.link_mode_masks_nwords)
+		return -EINVAL;
+
+	/* copy the whole structure, now that we know it has expected
+	 * format
+	 */
+	err = load_link_ksettings_from_user(&link_ksettings, useraddr);
+	if (err)
+		return err;
+
+	/* re-check nwords field, just in case */
+	if (__ETHTOOL_LINK_MODE_MASK_NU32
+	    != link_ksettings.base.link_mode_masks_nwords)
+		return -EINVAL;
+
+	return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings);
+}
+
+/* Internal kernel helper to query a device ethtool_cmd settings.
+ *
+ * Note about transition to ethtool_link_settings API: We do not need
+ * (or want) this function to support "dev" instances that implement
+ * the ethtool_link_settings API as we will update the drivers calling
+ * this function to call __ethtool_get_link_ksettings instead, before
+ * the first drivers implement ethtool_ops::get_link_ksettings.
+ *
+ * TODO 1: at least make this function static when no driver is using it
+ * TODO 2: remove when ethtool_ops::get_settings disappears internally
+ */
 int __ethtool_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 {
 	ASSERT_RTNL();
@@ -400,30 +753,112 @@ int __ethtool_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 }
 EXPORT_SYMBOL(__ethtool_get_settings);
 
+static void
+warn_incomplete_ethtool_legacy_settings_conversion(const char *details)
+{
+	char name[sizeof(current->comm)];
+
+	pr_info_once("warning: `%s' uses legacy ethtool link settings API, %s\n",
+		     get_task_comm(name, current), details);
+}
+
+/* Query device for its ethtool_cmd settings.
+ *
+ * Backward compatibility note: for compatibility with legacy ethtool,
+ * this has to work with both drivers implementing get_link_ksettings
+ * API and drivers implementing get_settings API. When drivers
+ * implement get_link_ksettings and report higher link mode bits, a
+ * kernel warning is logged once (with name of 1st driver/device) to
+ * recommend user to upgrade ethtool, but the command is successful
+ * (only the lower link mode bits reported back to user).
+ */
 static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
 {
-	int err;
 	struct ethtool_cmd cmd;
 
-	err = __ethtool_get_settings(dev, &cmd);
-	if (err < 0)
-		return err;
+	ASSERT_RTNL();
+
+	if (dev->ethtool_ops->get_link_ksettings) {
+		/* First, use link_ksettings API if it is supported */
+		int err;
+		struct ethtool_link_ksettings link_ksettings;
+
+		memset(&link_ksettings, 0, sizeof(link_ksettings));
+		err = dev->ethtool_ops->get_link_ksettings(dev,
+							   &link_ksettings);
+		if (err < 0)
+			return err;
+		if (!convert_link_ksettings_to_legacy_settings(&cmd,
+							       &link_ksettings))
+			warn_incomplete_ethtool_legacy_settings_conversion(
+				"link modes are only partially reported");
+
+		/* send a sensible cmd tag back to user */
+		cmd.cmd = ETHTOOL_GSET;
+	} else {
+		int err;
+		/* TODO: return -EOPNOTSUPP when
+		 * ethtool_ops::get_settings disappears internally
+		 */
+
+		/* driver doesn't support %ethtool_link_ksettings
+		 * API. revert to legacy %ethtool_cmd API, unless it's
+		 * not supported either.
+		 */
+		err = __ethtool_get_settings(dev, &cmd);
+		if (err < 0)
+			return err;
+	}
 
 	if (copy_to_user(useraddr, &cmd, sizeof(cmd)))
 		return -EFAULT;
+
 	return 0;
 }
 
+/* Update device link settings with given ethtool_cmd.
+ *
+ * Backward compatibility note: for compatibility with legacy ethtool,
+ * this has to work with both drivers implementing set_link_ksettings
+ * API and drivers implementing set_settings API. When drivers
+ * implement set_link_ksettings and user's request updates deprecated
+ * ethtool_cmd fields (transceiver/maxrxpkt/maxtxpkt), a kernel
+ * warning is logged once (with name of 1st driver/device) to
+ * recommend user to upgrade ethtool, and the request is rejected.
+ */
 static int ethtool_set_settings(struct net_device *dev, void __user *useraddr)
 {
 	struct ethtool_cmd cmd;
 
-	if (!dev->ethtool_ops->set_settings)
-		return -EOPNOTSUPP;
+	ASSERT_RTNL();
 
 	if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
 		return -EFAULT;
 
+	/* first, try new %ethtool_link_ksettings API. */
+	if (dev->ethtool_ops->set_link_ksettings) {
+		struct ethtool_link_ksettings link_ksettings;
+
+		if (!convert_legacy_settings_to_link_ksettings(&link_ksettings,
+							       &cmd))
+			return -EINVAL;
+
+		link_ksettings.base.cmd = ETHTOOL_SLINKSETTINGS;
+		link_ksettings.base.link_mode_masks_nwords
+			= __ETHTOOL_LINK_MODE_MASK_NU32;
+		return dev->ethtool_ops->set_link_ksettings(dev,
+							    &link_ksettings);
+	}
+
+	/* legacy %ethtool_cmd API */
+
+	/* TODO: return -EOPNOTSUPP when ethtool_ops::get_settings
+	 * disappears internally
+	 */
+
+	if (!dev->ethtool_ops->set_settings)
+		return -EOPNOTSUPP;
+
 	return dev->ethtool_ops->set_settings(dev, &cmd);
 }
 
@@ -2252,6 +2687,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	case ETHTOOL_PERQUEUE:
 		rc = ethtool_set_per_queue(dev, useraddr);
 		break;
+	case ETHTOOL_GLINKSETTINGS:
+		rc = ethtool_get_link_ksettings(dev, useraddr);
+		break;
+	case ETHTOOL_SLINKSETTINGS:
+		rc = ethtool_set_link_ksettings(dev, useraddr);
+		break;
 	default:
 		rc = -EOPNOTSUPP;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 577097981d72a034e4e2354881478cf202a9ba72 Mon Sep 17 00:00:00 2001
From: David Decotigny <decot@googlers.com>
Date: Wed, 24 Feb 2016 10:58:08 -0800
Subject: net: 8021q: use __ethtool_get_ksettings

Signed-off-by: David Decotigny <decot@googlers.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/8021q/vlan_dev.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 055f0e989e90..e416a4038a12 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -621,12 +621,12 @@ static netdev_features_t vlan_dev_fix_features(struct net_device *dev,
 	return features;
 }
 
-static int vlan_ethtool_get_settings(struct net_device *dev,
-				     struct ethtool_cmd *cmd)
+static int vlan_ethtool_get_link_ksettings(struct net_device *dev,
+					   struct ethtool_link_ksettings *cmd)
 {
 	const struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
 
-	return __ethtool_get_settings(vlan->real_dev, cmd);
+	return __ethtool_get_link_ksettings(vlan->real_dev, cmd);
 }
 
 static void vlan_ethtool_get_drvinfo(struct net_device *dev,
@@ -741,7 +741,7 @@ static int vlan_dev_get_iflink(const struct net_device *dev)
 }
 
 static const struct ethtool_ops vlan_ethtool_ops = {
-	.get_settings	        = vlan_ethtool_get_settings,
+	.get_link_ksettings	= vlan_ethtool_get_link_ksettings,
 	.get_drvinfo	        = vlan_ethtool_get_drvinfo,
 	.get_link		= ethtool_op_get_link,
 	.get_ts_info		= vlan_ethtool_get_ts_info,
-- 
cgit v1.2.3-59-g8ed1b


From 702b26a24d3d012b9286efb3efbf2b81b23271f5 Mon Sep 17 00:00:00 2001
From: David Decotigny <decot@googlers.com>
Date: Wed, 24 Feb 2016 10:58:09 -0800
Subject: net: bridge: use __ethtool_get_ksettings

Signed-off-by: David Decotigny <decot@googlers.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_if.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index c367b3e1b5ac..b37a1cc97d98 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -36,10 +36,10 @@
  */
 static int port_cost(struct net_device *dev)
 {
-	struct ethtool_cmd ecmd;
+	struct ethtool_link_ksettings ecmd;
 
-	if (!__ethtool_get_settings(dev, &ecmd)) {
-		switch (ethtool_cmd_speed(&ecmd)) {
+	if (!__ethtool_get_link_ksettings(dev, &ecmd)) {
+		switch (ecmd.base.speed) {
 		case SPEED_10000:
 			return 2;
 		case SPEED_1000:
-- 
cgit v1.2.3-59-g8ed1b


From 7cad1bac96d381d953b47695f6602b4e2480aa5d Mon Sep 17 00:00:00 2001
From: David Decotigny <decot@googlers.com>
Date: Wed, 24 Feb 2016 10:58:10 -0800
Subject: net: core: use __ethtool_get_ksettings

Signed-off-by: David Decotigny <decot@googlers.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net-sysfs.c   | 15 +++++++++------
 net/packet/af_packet.c | 11 +++++------
 2 files changed, 14 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 4ae17c3166fc..2b3f76fe65f4 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -198,9 +198,10 @@ static ssize_t speed_show(struct device *dev,
 		return restart_syscall();
 
 	if (netif_running(netdev)) {
-		struct ethtool_cmd cmd;
-		if (!__ethtool_get_settings(netdev, &cmd))
-			ret = sprintf(buf, fmt_dec, ethtool_cmd_speed(&cmd));
+		struct ethtool_link_ksettings cmd;
+
+		if (!__ethtool_get_link_ksettings(netdev, &cmd))
+			ret = sprintf(buf, fmt_dec, cmd.base.speed);
 	}
 	rtnl_unlock();
 	return ret;
@@ -217,10 +218,12 @@ static ssize_t duplex_show(struct device *dev,
 		return restart_syscall();
 
 	if (netif_running(netdev)) {
-		struct ethtool_cmd cmd;
-		if (!__ethtool_get_settings(netdev, &cmd)) {
+		struct ethtool_link_ksettings cmd;
+
+		if (!__ethtool_get_link_ksettings(netdev, &cmd)) {
 			const char *duplex;
-			switch (cmd.duplex) {
+
+			switch (cmd.base.duplex) {
 			case DUPLEX_HALF:
 				duplex = "half";
 				break;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index b7e7851ddc5d..d41b1074cb2d 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -557,9 +557,8 @@ static int prb_calc_retire_blk_tmo(struct packet_sock *po,
 {
 	struct net_device *dev;
 	unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
-	struct ethtool_cmd ecmd;
+	struct ethtool_link_ksettings ecmd;
 	int err;
-	u32 speed;
 
 	rtnl_lock();
 	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
@@ -567,19 +566,19 @@ static int prb_calc_retire_blk_tmo(struct packet_sock *po,
 		rtnl_unlock();
 		return DEFAULT_PRB_RETIRE_TOV;
 	}
-	err = __ethtool_get_settings(dev, &ecmd);
-	speed = ethtool_cmd_speed(&ecmd);
+	err = __ethtool_get_link_ksettings(dev, &ecmd);
 	rtnl_unlock();
 	if (!err) {
 		/*
 		 * If the link speed is so slow you don't really
 		 * need to worry about perf anyways
 		 */
-		if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
+		if (ecmd.base.speed < SPEED_1000 ||
+		    ecmd.base.speed == SPEED_UNKNOWN) {
 			return DEFAULT_PRB_RETIRE_TOV;
 		} else {
 			msec = 1;
-			div = speed / 1000;
+			div = ecmd.base.speed / 1000;
 		}
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 3237fc63a3297d472a8cec33cb914f20570cfc23 Mon Sep 17 00:00:00 2001
From: David Decotigny <decot@googlers.com>
Date: Wed, 24 Feb 2016 10:58:11 -0800
Subject: net: ethtool: remove unused __ethtool_get_settings

replaced by __ethtool_get_link_ksettings.

Signed-off-by: David Decotigny <decot@googlers.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h |  4 ----
 net/core/ethtool.c      | 45 ++++++++++++++-------------------------------
 2 files changed, 14 insertions(+), 35 deletions(-)

(limited to 'net')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 8a400a54c92e..e2b7bf27c03e 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -150,10 +150,6 @@ extern int
 __ethtool_get_link_ksettings(struct net_device *dev,
 			     struct ethtool_link_ksettings *link_ksettings);
 
-/* DEPRECATED, use __ethtool_get_link_ksettings */
-extern int __ethtool_get_settings(struct net_device *dev,
-				  struct ethtool_cmd *cmd);
-
 /**
  * struct ethtool_ops - optional netdev operations
  * @get_settings: DEPRECATED, use %get_link_ksettings/%set_link_ksettings
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index edcec56ed228..2966cd0d7c93 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -551,7 +551,12 @@ int __ethtool_get_link_ksettings(struct net_device *dev,
 	 * legacy %ethtool_cmd API, unless it's not supported either.
 	 * TODO: remove when ethtool_ops::get_settings disappears internally
 	 */
-	err = __ethtool_get_settings(dev, &cmd);
+	if (!dev->ethtool_ops->get_settings)
+		return -EOPNOTSUPP;
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.cmd = ETHTOOL_GSET;
+	err = dev->ethtool_ops->get_settings(dev, &cmd);
 	if (err < 0)
 		return err;
 
@@ -729,30 +734,6 @@ static int ethtool_set_link_ksettings(struct net_device *dev,
 	return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings);
 }
 
-/* Internal kernel helper to query a device ethtool_cmd settings.
- *
- * Note about transition to ethtool_link_settings API: We do not need
- * (or want) this function to support "dev" instances that implement
- * the ethtool_link_settings API as we will update the drivers calling
- * this function to call __ethtool_get_link_ksettings instead, before
- * the first drivers implement ethtool_ops::get_link_ksettings.
- *
- * TODO 1: at least make this function static when no driver is using it
- * TODO 2: remove when ethtool_ops::get_settings disappears internally
- */
-int __ethtool_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
-{
-	ASSERT_RTNL();
-
-	if (!dev->ethtool_ops->get_settings)
-		return -EOPNOTSUPP;
-
-	memset(cmd, 0, sizeof(struct ethtool_cmd));
-	cmd->cmd = ETHTOOL_GSET;
-	return dev->ethtool_ops->get_settings(dev, cmd);
-}
-EXPORT_SYMBOL(__ethtool_get_settings);
-
 static void
 warn_incomplete_ethtool_legacy_settings_conversion(const char *details)
 {
@@ -796,16 +777,18 @@ static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
 		/* send a sensible cmd tag back to user */
 		cmd.cmd = ETHTOOL_GSET;
 	} else {
-		int err;
-		/* TODO: return -EOPNOTSUPP when
-		 * ethtool_ops::get_settings disappears internally
-		 */
-
 		/* driver doesn't support %ethtool_link_ksettings
 		 * API. revert to legacy %ethtool_cmd API, unless it's
 		 * not supported either.
 		 */
-		err = __ethtool_get_settings(dev, &cmd);
+		int err;
+
+		if (!dev->ethtool_ops->get_settings)
+			return -EOPNOTSUPP;
+
+		memset(&cmd, 0, sizeof(cmd));
+		cmd.cmd = ETHTOOL_GSET;
+		err = dev->ethtool_ops->get_settings(dev, &cmd);
 		if (err < 0)
 			return err;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 2306f65637079c8922aec15b4ced75ea457e757b Mon Sep 17 00:00:00 2001
From: Alexander Aring <aar@pengutronix.de>
Date: Fri, 26 Feb 2016 09:06:07 +0100
Subject: 6lowpan: iphc: fix invalid case handling

This patch fixes the return value in a case which should never occur.
Instead returning "-EINVAL" we return LOWPAN_IPHC_DAM_00 which is
invalid on context based addresses. Also change the WARN_ON_ONCE to
WARN_ONCE which was suggested by Dan Carpenter.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Alexander Aring <aar@pengutronix.de>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 net/6lowpan/iphc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/6lowpan/iphc.c b/net/6lowpan/iphc.c
index 06287ac03211..72172514fea0 100644
--- a/net/6lowpan/iphc.c
+++ b/net/6lowpan/iphc.c
@@ -811,8 +811,8 @@ static u8 lowpan_compress_ctx_addr(u8 **hc_ptr, const struct in6_addr *ipaddr,
 		goto out;
 	}
 
-	WARN_ON_ONCE("context found but no address mode matched\n");
-	return -EINVAL;
+	WARN_ONCE(1, "context found but no address mode matched\n");
+	return LOWPAN_IPHC_DAM_00;
 out:
 
 	if (sam)
-- 
cgit v1.2.3-59-g8ed1b


From 3f2fb9a834cb1fcddbae22deca7fde136944dc89 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Wed, 24 Feb 2016 11:47:02 -0800
Subject: net: l3mdev: address selection should only consider devices in L3
 domain

David Lamparter noted a use case where the source address selection fails
to pick an address from a VRF interface - unnumbered interfaces.

Relevant commands from his script:
    ip addr add 9.9.9.9/32 dev lo
    ip link set lo up

    ip link add name vrf0 type vrf table 101
    ip rule add oif vrf0 table 101
    ip rule add iif vrf0 table 101
    ip link set vrf0 up
    ip addr add 10.0.0.3/32 dev vrf0

    ip link add name dummy2 type dummy
    ip link set dummy2 master vrf0 up

    --> note dummy2 has no address - unnumbered device

    ip route add 10.2.2.2/32 dev dummy2 table 101
    ip neigh add 10.2.2.2 dev dummy2 lladdr 02:00:00:00:00:02

    tcpdump -ni dummy2 &

And using ping instead of his socat example:
    $ ping -I vrf0 -c1 10.2.2.2
    ping: Warning: source address might be selected on device other than vrf0.
    PING 10.2.2.2 (10.2.2.2) from 9.9.9.9 vrf0: 56(84) bytes of data.

>From tcpdump:
    12:57:29.449128 IP 9.9.9.9 > 10.2.2.2: ICMP echo request, id 2491, seq 1, length 64

Note the source address is from lo and is not a VRF local address. With
this patch:

    $ ping -I vrf0 -c1 10.2.2.2
    PING 10.2.2.2 (10.2.2.2) from 10.0.0.3 vrf0: 56(84) bytes of data.

>From tcpdump:
    12:59:25.096426 IP 10.0.0.3 > 10.2.2.2: ICMP echo request, id 2113, seq 1, length 64

Now the source address comes from vrf0.

The ipv4 function for selecting source address takes a const argument.
Removing the const requires touching a lot of places, so instead
l3mdev_master_ifindex_rcu is changed to take a const argument and then
do the typecast to non-const as required by netdev_master_upper_dev_get_rcu.
This is similar to what l3mdev_fib_table_rcu does.

IPv6 for unnumbered interfaces appears to be selecting the addresses
properly.

Cc: David Lamparter <david@opensourcerouting.org>
Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/l3mdev.h |  4 ++--
 net/ipv4/devinet.c   |  5 +++++
 net/l3mdev/l3mdev.c  | 11 +++++++++--
 3 files changed, 16 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h
index 5567d46b3cff..c43a9c73de5e 100644
--- a/include/net/l3mdev.h
+++ b/include/net/l3mdev.h
@@ -39,7 +39,7 @@ struct l3mdev_ops {
 
 #ifdef CONFIG_NET_L3_MASTER_DEV
 
-int l3mdev_master_ifindex_rcu(struct net_device *dev);
+int l3mdev_master_ifindex_rcu(const struct net_device *dev);
 static inline int l3mdev_master_ifindex(struct net_device *dev)
 {
 	int ifindex;
@@ -179,7 +179,7 @@ struct dst_entry *l3mdev_rt6_dst_by_oif(struct net *net,
 
 #else
 
-static inline int l3mdev_master_ifindex_rcu(struct net_device *dev)
+static inline int l3mdev_master_ifindex_rcu(const struct net_device *dev)
 {
 	return 0;
 }
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 29b8d3a7b19b..18d510fa7ee2 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1194,6 +1194,7 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
 	__be32 addr = 0;
 	struct in_device *in_dev;
 	struct net *net = dev_net(dev);
+	int master_idx;
 
 	rcu_read_lock();
 	in_dev = __in_dev_get_rcu(dev);
@@ -1214,12 +1215,16 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
 	if (addr)
 		goto out_unlock;
 no_in_dev:
+	master_idx = l3mdev_master_ifindex_rcu(dev);
 
 	/* Not loopback addresses on loopback should be preferred
 	   in this case. It is important that lo is the first interface
 	   in dev_base list.
 	 */
 	for_each_netdev_rcu(net, dev) {
+		if (l3mdev_master_ifindex_rcu(dev) != master_idx)
+			continue;
+
 		in_dev = __in_dev_get_rcu(dev);
 		if (!in_dev)
 			continue;
diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c
index 8e5ead366e7f..e925037fa0df 100644
--- a/net/l3mdev/l3mdev.c
+++ b/net/l3mdev/l3mdev.c
@@ -17,7 +17,7 @@
  *	@dev: targeted interface
  */
 
-int l3mdev_master_ifindex_rcu(struct net_device *dev)
+int l3mdev_master_ifindex_rcu(const struct net_device *dev)
 {
 	int ifindex = 0;
 
@@ -28,8 +28,15 @@ int l3mdev_master_ifindex_rcu(struct net_device *dev)
 		ifindex = dev->ifindex;
 	} else if (netif_is_l3_slave(dev)) {
 		struct net_device *master;
+		struct net_device *_dev = (struct net_device *)dev;
 
-		master = netdev_master_upper_dev_get_rcu(dev);
+		/* netdev_master_upper_dev_get_rcu calls
+		 * list_first_or_null_rcu to walk the upper dev list.
+		 * list_first_or_null_rcu does not handle a const arg. We aren't
+		 * making changes, just want the master device from that list so
+		 * typecast to remove the const
+		 */
+		master = netdev_master_upper_dev_get_rcu(_dev);
 		if (master)
 			ifindex = master->ifindex;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 17b693cdd87635ae813ad61ad0b8a8458f4c3960 Mon Sep 17 00:00:00 2001
From: David Lamparter <equinox@diac24.net>
Date: Wed, 24 Feb 2016 11:47:03 -0800
Subject: net: l3mdev: prefer VRF master for source address selection

When selecting an address in context of a VRF, the vrf master should be
preferred for address selection.  If it isn't, the user has a hard time
getting the system to select to their preference - the code will pick
the address off the first in-VRF interface it can find, which on a
router could well be a non-routable address.

Signed-off-by: David Lamparter <equinox@diac24.net>
Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
[dsa: Fixed comment style and removed extra blank link ]
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/devinet.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 18d510fa7ee2..8c3df2ccba45 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1217,6 +1217,23 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
 no_in_dev:
 	master_idx = l3mdev_master_ifindex_rcu(dev);
 
+	/* For VRFs, the VRF device takes the place of the loopback device,
+	 * with addresses on it being preferred.  Note in such cases the
+	 * loopback device will be among the devices that fail the master_idx
+	 * equality check in the loop below.
+	 */
+	if (master_idx &&
+	    (dev = dev_get_by_index_rcu(net, master_idx)) &&
+	    (in_dev = __in_dev_get_rcu(dev))) {
+		for_primary_ifa(in_dev) {
+			if (ifa->ifa_scope != RT_SCOPE_LINK &&
+			    ifa->ifa_scope <= scope) {
+				addr = ifa->ifa_local;
+				goto out_unlock;
+			}
+		} endfor_ifa(in_dev);
+	}
+
 	/* Not loopback addresses on loopback should be preferred
 	   in this case. It is important that lo is the first interface
 	   in dev_base list.
-- 
cgit v1.2.3-59-g8ed1b


From 224638766235ba82c53b4216e4dabc510701fbf2 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Wed, 24 Feb 2016 16:46:21 -0800
Subject: GSO: Provide software checksum of tunneled UDP fragmentation offload

On reviewing the code I realized that GRE and UDP tunnels could cause a
kernel panic if we used GSO to segment a large UDP frame that was sent
through the tunnel with an outer checksum and hardware offloads were not
available.

In order to correct this we need to update the feature flags that are
passed to the skb_segment function so that in the event of UDP
fragmentation being requested for the inner header the segmentation
function will correctly generate the checksum for the payload if we cannot
segment the outer header.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/gre_offload.c | 21 ++++++++++++++++++---
 net/ipv4/udp_offload.c | 15 ++++++++++++---
 net/ipv6/udp_offload.c |  8 +++++++-
 3 files changed, 37 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 003b0ebbcfdd..47f4c544c916 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -24,7 +24,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 	__be16 protocol = skb->protocol;
 	u16 mac_len = skb->mac_len;
 	int gre_offset, outer_hlen;
-	bool need_csum;
+	bool need_csum, ufo;
 
 	if (unlikely(skb_shinfo(skb)->gso_type &
 				~(SKB_GSO_TCPV4 |
@@ -58,8 +58,20 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 	need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM);
 	skb->encap_hdr_csum = need_csum;
 
+	ufo = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
+
 	features &= skb->dev->hw_enc_features;
 
+	/* The only checksum offload we care about from here on out is the
+	 * outer one so strip the existing checksum feature flags based
+	 * on the fact that we will be computing our checksum in software.
+	 */
+	if (ufo) {
+		features &= ~NETIF_F_CSUM_MASK;
+		if (!need_csum)
+			features |= NETIF_F_HW_CSUM;
+	}
+
 	/* segment inner packet. */
 	segs = skb_mac_gso_segment(skb, features);
 	if (IS_ERR_OR_NULL(segs)) {
@@ -75,8 +87,11 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 		struct gre_base_hdr *greh;
 		__be32 *pcsum;
 
-		skb_reset_inner_headers(skb);
-		skb->encapsulation = 1;
+		/* Set up inner headers if we are offloading inner checksum */
+		if (skb->ip_summed == CHECKSUM_PARTIAL) {
+			skb_reset_inner_headers(skb);
+			skb->encapsulation = 1;
+		}
 
 		skb->mac_len = mac_len;
 		skb->protocol = protocol;
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 56c4c8b88b28..f5abb1ae1358 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -33,8 +33,8 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 	__be16 new_protocol, bool is_ipv6)
 {
 	int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
+	bool remcsum, need_csum, offload_csum, ufo;
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
-	bool remcsum, need_csum, offload_csum;
 	struct udphdr *uh = udp_hdr(skb);
 	u16 mac_offset = skb->mac_header;
 	__be16 protocol = skb->protocol;
@@ -62,6 +62,8 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 	remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
 	skb->remcsum_offload = remcsum;
 
+	ufo = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
+
 	/* Try to offload checksum if possible */
 	offload_csum = !!(need_csum &&
 			  (skb->dev->features &
@@ -74,9 +76,9 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 	 * outer one so strip the existing checksum feature flags and
 	 * instead set the flag based on our outer checksum offload value.
 	 */
-	if (remcsum) {
+	if (remcsum || ufo) {
 		features &= ~NETIF_F_CSUM_MASK;
-		if (offload_csum)
+		if (!need_csum || offload_csum)
 			features |= NETIF_F_HW_CSUM;
 	}
 
@@ -230,6 +232,13 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
 
 	skb->ip_summed = CHECKSUM_NONE;
 
+	/* If there is no outer header we can fake a checksum offload
+	 * due to the fact that we have already done the checksum in
+	 * software prior to segmenting the frame.
+	 */
+	if (!skb->encap_hdr_csum)
+		features |= NETIF_F_HW_CSUM;
+
 	/* Fragment the skb. IP headers of the fragments are updated in
 	 * inet_gso_segment()
 	 */
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 7441e1e63893..2b0fbe6929e8 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -81,12 +81,18 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
 		csum = skb_checksum(skb, 0, skb->len, 0);
 		uh->check = udp_v6_check(skb->len, &ipv6h->saddr,
 					  &ipv6h->daddr, csum);
-
 		if (uh->check == 0)
 			uh->check = CSUM_MANGLED_0;
 
 		skb->ip_summed = CHECKSUM_NONE;
 
+		/* If there is no outer header we can fake a checksum offload
+		 * due to the fact that we have already done the checksum in
+		 * software prior to segmenting the frame.
+		 */
+		if (!skb->encap_hdr_csum)
+			features |= NETIF_F_HW_CSUM;
+
 		/* Check if there is enough headroom to insert fragment header. */
 		tnl_hlen = skb_tnl_header_len(skb);
 		if (skb->mac_header < (tnl_hlen + frag_hdr_sz)) {
-- 
cgit v1.2.3-59-g8ed1b


From 0744ff8fa8fad7aae669b73fd78f3efe1d8e31ca Mon Sep 17 00:00:00 2001
From: Linus Luessing <linus.luessing@web.de>
Date: Sat, 16 Jan 2016 16:40:08 +0800
Subject: batman-adv: Add hard_iface specific sysfs wrapper macros for UINT

This allows us to easily add a sysfs parameter for an
unsigned int later, which is not for a batman mesh interface
(e.g. bat0), but for a common interface instead. It allows
reading and writing an atomic_t in hard_iface (instead of
bat_priv compared to the mesh variant).

Developed by Linus during a 6 months trainee study period in
Ascom (Switzerland) AG.

Signed-off-by: Linus Luessing <linus.luessing@web.de>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
[antonio@open-mesh.com: rename functions and move macros]
Signed-off-by: Antonio Quartulli <antonio@open-mesh.com>
---
 net/batman-adv/sysfs.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

(limited to 'net')

diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index 4d70d4413e40..d3f69d5e79d9 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -242,6 +242,55 @@ ssize_t batadv_show_vlan_##_name(struct kobject *kobj,			\
 	static BATADV_ATTR_VLAN(_name, _mode, batadv_show_vlan_##_name,	\
 				batadv_store_vlan_##_name)
 
+#define BATADV_ATTR_HIF_STORE_UINT(_name, _var, _min, _max, _post_func)	\
+ssize_t batadv_store_##_name(struct kobject *kobj,			\
+			     struct attribute *attr, char *buff,	\
+			     size_t count)				\
+{									\
+	struct net_device *net_dev = batadv_kobj_to_netdev(kobj);	\
+	struct batadv_hard_iface *hard_iface;				\
+	ssize_t length;							\
+									\
+	hard_iface = batadv_hardif_get_by_netdev(net_dev);		\
+	if (!hard_iface)						\
+		return 0;						\
+									\
+	length = __batadv_store_uint_attr(buff, count, _min, _max,	\
+					  _post_func, attr,		\
+					  &hard_iface->_var, net_dev);	\
+									\
+	batadv_hardif_put(hard_iface);				\
+	return length;							\
+}
+
+#define BATADV_ATTR_HIF_SHOW_UINT(_name, _var)				\
+ssize_t batadv_show_##_name(struct kobject *kobj,			\
+			    struct attribute *attr, char *buff)		\
+{									\
+	struct net_device *net_dev = batadv_kobj_to_netdev(kobj);	\
+	struct batadv_hard_iface *hard_iface;				\
+	ssize_t length;							\
+									\
+	hard_iface = batadv_hardif_get_by_netdev(net_dev);		\
+	if (!hard_iface)						\
+		return 0;						\
+									\
+	length = sprintf(buff, "%i\n", atomic_read(&hard_iface->_var));	\
+									\
+	batadv_hardif_put(hard_iface);				\
+	return length;							\
+}
+
+/* Use this, if you are going to set [name] in hard_iface to an
+ * unsigned integer value
+ */
+#define BATADV_ATTR_HIF_UINT(_name, _var, _mode, _min, _max, _post_func)\
+	static BATADV_ATTR_HIF_STORE_UINT(_name, _var, _min,		\
+					  _max, _post_func)		\
+	static BATADV_ATTR_HIF_SHOW_UINT(_name, _var)			\
+	static BATADV_ATTR(_name, _mode, batadv_show_##_name,		\
+			   batadv_store_##_name)
+
 static int batadv_store_bool_attr(char *buff, size_t count,
 				  struct net_device *net_dev,
 				  const char *attr_name, atomic_t *attr,
-- 
cgit v1.2.3-59-g8ed1b


From d6f94d91f766b4205e5b0aa4b11f96271c793f6d Mon Sep 17 00:00:00 2001
From: Linus Luessing <linus.luessing@web.de>
Date: Sat, 16 Jan 2016 16:40:09 +0800
Subject: batman-adv: ELP - adding basic infrastructure

The B.A.T.M.A.N. protocol originally only used a single
message type (called OGM) to determine the link qualities to
the direct neighbors and spreading these link quality
information through the whole mesh. This procedure is
summarized on the BATMAN concept page and explained in
details in the RFC draft published in 2008.

This approach was chosen for its simplicity during the
protocol design phase and the implementation. However, it
also bears some drawbacks:

 *  Wireless interfaces usually come with some packet loss,
    therefore a higher broadcast rate is desirable to allow
    a fast reaction on flaky connections.
    Other interfaces of the same host might be connected to
    Ethernet LANs / VPNs / etc which rarely exhibit packet
    loss would benefit from a lower broadcast rate to reduce
    overhead.
 *  It generally is more desirable to detect local link
    quality changes at a faster rate than propagating all
    these changes through the entire mesh (the far end of
    the mesh does not need to care about local link quality
    changes that much). Other optimizations strategies, like
    reducing overhead, might be possible if OGMs weren't
    used for all tasks in the mesh at the same time.

As a result detecting local link qualities shall be handled
by an independent message type, ELP, whereas the OGM message
type remains responsible for flooding the mesh with these
link quality information and determining the overall path
transmit qualities.

Developed by Linus during a 6 months trainee study period in
Ascom (Switzerland) AG.

Signed-off-by: Linus Luessing <linus.luessing@web.de>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <antonio@open-mesh.com>
---
 net/batman-adv/Kconfig     |  14 ++++
 net/batman-adv/Makefile    |   2 +
 net/batman-adv/bat_algo.h  |  15 +++-
 net/batman-adv/bat_v.c     |  74 +++++++++++++++++
 net/batman-adv/bat_v_elp.c | 193 +++++++++++++++++++++++++++++++++++++++++++++
 net/batman-adv/bat_v_elp.h |  27 +++++++
 net/batman-adv/main.c      |   1 +
 net/batman-adv/packet.h    |  20 +++++
 net/batman-adv/types.h     |  18 +++++
 9 files changed, 363 insertions(+), 1 deletion(-)
 create mode 100644 net/batman-adv/bat_v.c
 create mode 100644 net/batman-adv/bat_v_elp.c
 create mode 100644 net/batman-adv/bat_v_elp.h

(limited to 'net')

diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
index 2dd40e5ea030..5c148a8489da 100644
--- a/net/batman-adv/Kconfig
+++ b/net/batman-adv/Kconfig
@@ -15,6 +15,20 @@ config BATMAN_ADV
           https://www.open-mesh.org/ for more information and user space
           tools.
 
+config BATMAN_ADV_BATMAN_V
+	bool "B.A.T.M.A.N. V protocol (experimental)"
+	depends on BATMAN_ADV
+	default n
+	help
+	  This option enables the B.A.T.M.A.N. V protocol, the successor
+	  of the currently used B.A.T.M.A.N. IV protocol. The main
+	  changes include splitting of the OGM protocol into a neighbor
+	  discovery protocol (Echo Location Protocol, ELP) and a new OGM
+	  Protocol OGMv2 for flooding protocol information through the
+	  network, as well as a throughput based metric.
+	  B.A.T.M.A.N. V is currently considered experimental and not
+	  compatible to B.A.T.M.A.N. IV networks.
+
 config BATMAN_ADV_BLA
 	bool "Bridge Loop Avoidance"
 	depends on BATMAN_ADV && INET
diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile
index 207e2af316c7..ca51686e9f72 100644
--- a/net/batman-adv/Makefile
+++ b/net/batman-adv/Makefile
@@ -18,6 +18,8 @@
 
 obj-$(CONFIG_BATMAN_ADV) += batman-adv.o
 batman-adv-y += bat_iv_ogm.o
+batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v.o
+batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v_elp.o
 batman-adv-y += bitarray.o
 batman-adv-$(CONFIG_BATMAN_ADV_BLA) += bridge_loop_avoidance.o
 batman-adv-$(CONFIG_DEBUG_FS) += debugfs.o
diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h
index a7485d676088..a4e994e26da1 100644
--- a/net/batman-adv/bat_algo.h
+++ b/net/batman-adv/bat_algo.h
@@ -1,6 +1,6 @@
 /* Copyright (C) 2011-2016  B.A.T.M.A.N. contributors:
  *
- * Marek Lindner
+ * Marek Lindner, Linus Lüssing
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -20,4 +20,17 @@
 
 int batadv_iv_init(void);
 
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+
+int batadv_v_init(void);
+
+#else
+
+static inline int batadv_v_init(void)
+{
+	return 0;
+}
+
+#endif /* CONFIG_BATMAN_ADV_BATMAN_V */
+
 #endif /* _NET_BATMAN_ADV_BAT_ALGO_H_ */
diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
new file mode 100644
index 000000000000..01327f627a08
--- /dev/null
+++ b/net/batman-adv/bat_v.c
@@ -0,0 +1,74 @@
+/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors:
+ *
+ * Linus Lüssing, Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "bat_algo.h"
+#include "main.h"
+
+#include <linux/cache.h>
+#include <linux/init.h>
+
+#include "bat_v_elp.h"
+
+static int batadv_v_iface_enable(struct batadv_hard_iface *hard_iface)
+{
+	return batadv_v_elp_iface_enable(hard_iface);
+}
+
+static void batadv_v_iface_disable(struct batadv_hard_iface *hard_iface)
+{
+	batadv_v_elp_iface_disable(hard_iface);
+}
+
+static void batadv_v_iface_update_mac(struct batadv_hard_iface *hard_iface)
+{
+}
+
+static void batadv_v_primary_iface_set(struct batadv_hard_iface *hard_iface)
+{
+	batadv_v_elp_primary_iface_set(hard_iface);
+}
+
+static void batadv_v_ogm_schedule(struct batadv_hard_iface *hard_iface)
+{
+}
+
+static void batadv_v_ogm_emit(struct batadv_forw_packet *forw_packet)
+{
+}
+
+static struct batadv_algo_ops batadv_batman_v __read_mostly = {
+	.name = "BATMAN_V",
+	.bat_iface_enable = batadv_v_iface_enable,
+	.bat_iface_disable = batadv_v_iface_disable,
+	.bat_iface_update_mac = batadv_v_iface_update_mac,
+	.bat_primary_iface_set = batadv_v_primary_iface_set,
+	.bat_ogm_emit = batadv_v_ogm_emit,
+	.bat_ogm_schedule = batadv_v_ogm_schedule,
+};
+
+/**
+ * batadv_v_init - B.A.T.M.A.N. V initialization function
+ *
+ * Description: Takes care of initializing all the subcomponents.
+ * It is invoked upon module load only.
+ *
+ * Return: 0 on success or a negative error code otherwise
+ */
+int __init batadv_v_init(void)
+{
+	return batadv_algo_register(&batadv_batman_v);
+}
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
new file mode 100644
index 000000000000..bc6e046c614c
--- /dev/null
+++ b/net/batman-adv/bat_v_elp.c
@@ -0,0 +1,193 @@
+/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors:
+ *
+ * Linus Lüssing, Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "bat_v_elp.h"
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/byteorder/generic.h>
+#include <linux/errno.h>
+#include <linux/etherdevice.h>
+#include <linux/fs.h>
+#include <linux/if_ether.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/random.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+
+#include "bat_algo.h"
+#include "hard-interface.h"
+#include "packet.h"
+#include "send.h"
+
+/**
+ * batadv_v_elp_start_timer - restart timer for ELP periodic work
+ * @hard_iface: the interface for which the timer has to be reset
+ */
+static void batadv_v_elp_start_timer(struct batadv_hard_iface *hard_iface)
+{
+	unsigned int msecs;
+
+	msecs = atomic_read(&hard_iface->bat_v.elp_interval) - BATADV_JITTER;
+	msecs += prandom_u32() % (2 * BATADV_JITTER);
+
+	queue_delayed_work(batadv_event_workqueue, &hard_iface->bat_v.elp_wq,
+			   msecs_to_jiffies(msecs));
+}
+
+/**
+ * batadv_v_elp_periodic_work - ELP periodic task per interface
+ * @work: work queue item
+ *
+ * Emits broadcast ELP message in regular intervals.
+ */
+static void batadv_v_elp_periodic_work(struct work_struct *work)
+{
+	struct batadv_hard_iface *hard_iface;
+	struct batadv_hard_iface_bat_v *bat_v;
+	struct batadv_elp_packet *elp_packet;
+	struct batadv_priv *bat_priv;
+	struct sk_buff *skb;
+	u32 elp_interval;
+
+	bat_v = container_of(work, struct batadv_hard_iface_bat_v, elp_wq.work);
+	hard_iface = container_of(bat_v, struct batadv_hard_iface, bat_v);
+	bat_priv = netdev_priv(hard_iface->soft_iface);
+
+	if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING)
+		goto out;
+
+	/* we are in the process of shutting this interface down */
+	if ((hard_iface->if_status == BATADV_IF_NOT_IN_USE) ||
+	    (hard_iface->if_status == BATADV_IF_TO_BE_REMOVED))
+		goto out;
+
+	/* the interface was enabled but may not be ready yet */
+	if (hard_iface->if_status != BATADV_IF_ACTIVE)
+		goto restart_timer;
+
+	skb = skb_copy(hard_iface->bat_v.elp_skb, GFP_ATOMIC);
+	if (!skb)
+		goto restart_timer;
+
+	elp_packet = (struct batadv_elp_packet *)skb->data;
+	elp_packet->seqno = htonl(atomic_read(&hard_iface->bat_v.elp_seqno));
+	elp_interval = atomic_read(&hard_iface->bat_v.elp_interval);
+	elp_packet->elp_interval = htonl(elp_interval);
+
+	batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+		   "Sending broadcast ELP packet on interface %s, seqno %u\n",
+		   hard_iface->net_dev->name,
+		   atomic_read(&hard_iface->bat_v.elp_seqno));
+
+	batadv_send_skb_packet(skb, hard_iface, batadv_broadcast_addr);
+
+	atomic_inc(&hard_iface->bat_v.elp_seqno);
+
+restart_timer:
+	batadv_v_elp_start_timer(hard_iface);
+out:
+	return;
+}
+
+/**
+ * batadv_v_elp_iface_enable - setup the ELP interface private resources
+ * @hard_iface: interface for which the data has to be prepared
+ *
+ * Return: 0 on success or a -ENOMEM in case of failure.
+ */
+int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface)
+{
+	struct batadv_elp_packet *elp_packet;
+	unsigned char *elp_buff;
+	u32 random_seqno;
+	size_t size;
+	int res = -ENOMEM;
+
+	size = ETH_HLEN + NET_IP_ALIGN + BATADV_ELP_HLEN;
+	hard_iface->bat_v.elp_skb = dev_alloc_skb(size);
+	if (!hard_iface->bat_v.elp_skb)
+		goto out;
+
+	skb_reserve(hard_iface->bat_v.elp_skb, ETH_HLEN + NET_IP_ALIGN);
+	elp_buff = skb_push(hard_iface->bat_v.elp_skb, BATADV_ELP_HLEN);
+	elp_packet = (struct batadv_elp_packet *)elp_buff;
+	memset(elp_packet, 0, BATADV_ELP_HLEN);
+
+	elp_packet->packet_type = BATADV_ELP;
+	elp_packet->version = BATADV_COMPAT_VERSION;
+
+	/* randomize initial seqno to avoid collision */
+	get_random_bytes(&random_seqno, sizeof(random_seqno));
+	atomic_set(&hard_iface->bat_v.elp_seqno, random_seqno);
+	atomic_set(&hard_iface->bat_v.elp_interval, 500);
+
+	INIT_DELAYED_WORK(&hard_iface->bat_v.elp_wq,
+			  batadv_v_elp_periodic_work);
+	batadv_v_elp_start_timer(hard_iface);
+	res = 0;
+
+out:
+	return res;
+}
+
+/**
+ * batadv_v_elp_iface_disable - release ELP interface private resources
+ * @hard_iface: interface for which the resources have to be released
+ */
+void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface)
+{
+	cancel_delayed_work_sync(&hard_iface->bat_v.elp_wq);
+
+	dev_kfree_skb(hard_iface->bat_v.elp_skb);
+	hard_iface->bat_v.elp_skb = NULL;
+}
+
+/**
+ * batadv_v_elp_primary_iface_set - change internal data to reflect the new
+ *  primary interface
+ * @primary_iface: the new primary interface
+ */
+void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface)
+{
+	struct batadv_hard_iface *hard_iface;
+	struct batadv_elp_packet *elp_packet;
+	struct sk_buff *skb;
+
+	/* update orig field of every elp iface belonging to this mesh */
+	rcu_read_lock();
+	list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
+		if (primary_iface->soft_iface != hard_iface->soft_iface)
+			continue;
+
+		if (!hard_iface->bat_v.elp_skb)
+			continue;
+
+		skb = hard_iface->bat_v.elp_skb;
+		elp_packet = (struct batadv_elp_packet *)skb->data;
+		ether_addr_copy(elp_packet->orig,
+				primary_iface->net_dev->dev_addr);
+	}
+	rcu_read_unlock();
+}
diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h
new file mode 100644
index 000000000000..7a584d1f055b
--- /dev/null
+++ b/net/batman-adv/bat_v_elp.h
@@ -0,0 +1,27 @@
+/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors:
+ *
+ * Linus Lüssing, Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "main.h"
+
+#ifndef _NET_BATMAN_ADV_BAT_V_ELP_H_
+#define _NET_BATMAN_ADV_BAT_V_ELP_H_
+
+int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface);
+void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface);
+void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface);
+
+#endif /* _NET_BATMAN_ADV_BAT_V_ELP_H_ */
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index e3d7051747b0..216057205e10 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -87,6 +87,7 @@ static int __init batadv_init(void)
 
 	batadv_recv_handler_init();
 
+	batadv_v_init();
 	batadv_iv_init();
 	batadv_nc_init();
 
diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h
index e7f915181aba..c698b6452953 100644
--- a/net/batman-adv/packet.h
+++ b/net/batman-adv/packet.h
@@ -26,6 +26,7 @@
  * @BATADV_IV_OGM: originator messages for B.A.T.M.A.N. IV
  * @BATADV_BCAST: broadcast packets carrying broadcast payload
  * @BATADV_CODED: network coded packets
+ * @BATADV_ELP: echo location packets for B.A.T.M.A.N. V
  *
  * @BATADV_UNICAST: unicast packets carrying unicast payload traffic
  * @BATADV_UNICAST_FRAG: unicast packets carrying a fragment of the original
@@ -40,6 +41,7 @@ enum batadv_packettype {
 	BATADV_IV_OGM           = 0x00,
 	BATADV_BCAST            = 0x01,
 	BATADV_CODED            = 0x02,
+	BATADV_ELP		= 0x03,
 	/* 0x40 - 0x7f: unicast */
 #define BATADV_UNICAST_MIN     0x40
 	BATADV_UNICAST          = 0x40,
@@ -234,6 +236,24 @@ struct batadv_ogm_packet {
 
 #define BATADV_OGM_HLEN sizeof(struct batadv_ogm_packet)
 
+/**
+ * struct batadv_elp_packet - elp (neighbor discovery) packet
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the genereal header
+ * @orig: originator mac address
+ * @seqno: sequence number
+ * @elp_interval: currently used ELP sending interval in ms
+ */
+struct batadv_elp_packet {
+	u8     packet_type;
+	u8     version;
+	u8     orig[ETH_ALEN];
+	__be32 seqno;
+	__be32 elp_interval;
+};
+
+#define BATADV_ELP_HLEN sizeof(struct batadv_elp_packet)
+
 /**
  * struct batadv_icmp_header - common members among all the ICMP packets
  * @packet_type: batman-adv packet type, part of the general header
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 612de23178e6..992d5fd5554e 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -85,6 +85,20 @@ struct batadv_hard_iface_bat_iv {
 	atomic_t ogm_seqno;
 };
 
+/**
+ * struct batadv_hard_iface_bat_v - per hard-interface B.A.T.M.A.N. V data
+ * @elp_interval: time interval between two ELP transmissions
+ * @elp_seqno: current ELP sequence number
+ * @elp_skb: base skb containing the ELP message to send
+ * @elp_wq: workqueue used to schedule ELP transmissions
+ */
+struct batadv_hard_iface_bat_v {
+	atomic_t elp_interval;
+	atomic_t elp_seqno;
+	struct sk_buff *elp_skb;
+	struct delayed_work elp_wq;
+};
+
 /**
  * struct batadv_hard_iface - network device known to batman-adv
  * @list: list node for batadv_hardif_list
@@ -99,6 +113,7 @@ struct batadv_hard_iface_bat_iv {
  * @soft_iface: the batman-adv interface which uses this network interface
  * @rcu: struct used for freeing in an RCU-safe manner
  * @bat_iv: per hard-interface B.A.T.M.A.N. IV data
+ * @bat_v: per hard-interface B.A.T.M.A.N. V data
  * @cleanup_work: work queue callback item for hard-interface deinit
  * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs
  * @neigh_list: list of unique single hop neighbors via this interface
@@ -116,6 +131,9 @@ struct batadv_hard_iface {
 	struct net_device *soft_iface;
 	struct rcu_head rcu;
 	struct batadv_hard_iface_bat_iv bat_iv;
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+	struct batadv_hard_iface_bat_v bat_v;
+#endif
 	struct work_struct cleanup_work;
 	struct dentry *debug_dir;
 	struct hlist_head neigh_list;
-- 
cgit v1.2.3-59-g8ed1b


From 162bd64c24aba8efe68948e95e61628403106cd7 Mon Sep 17 00:00:00 2001
From: Linus Luessing <linus.luessing@web.de>
Date: Sat, 16 Jan 2016 16:40:10 +0800
Subject: batman-adv: ELP - creating neighbor structures

Initially developed by Linus during a 6 months trainee study
period in Ascom (Switzerland) AG.

Signed-off-by: Linus Luessing <linus.luessing@web.de>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <antonio@open-mesh.com>
---
 net/batman-adv/bat_v.c     |  23 ++++++-
 net/batman-adv/bat_v_elp.c | 152 +++++++++++++++++++++++++++++++++++++++++++++
 net/batman-adv/bat_v_elp.h |   4 ++
 net/batman-adv/main.h      |   3 +
 net/batman-adv/types.h     |  33 ++++++++++
 5 files changed, 214 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index 01327f627a08..0fea894855ff 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -22,6 +22,7 @@
 #include <linux/init.h>
 
 #include "bat_v_elp.h"
+#include "packet.h"
 
 static int batadv_v_iface_enable(struct batadv_hard_iface *hard_iface)
 {
@@ -42,6 +43,12 @@ static void batadv_v_primary_iface_set(struct batadv_hard_iface *hard_iface)
 	batadv_v_elp_primary_iface_set(hard_iface);
 }
 
+static void
+batadv_v_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh)
+{
+	ewma_throughput_init(&hardif_neigh->bat_v.throughput);
+}
+
 static void batadv_v_ogm_schedule(struct batadv_hard_iface *hard_iface)
 {
 }
@@ -56,6 +63,7 @@ static struct batadv_algo_ops batadv_batman_v __read_mostly = {
 	.bat_iface_disable = batadv_v_iface_disable,
 	.bat_iface_update_mac = batadv_v_iface_update_mac,
 	.bat_primary_iface_set = batadv_v_primary_iface_set,
+	.bat_hardif_neigh_init = batadv_v_hardif_neigh_init,
 	.bat_ogm_emit = batadv_v_ogm_emit,
 	.bat_ogm_schedule = batadv_v_ogm_schedule,
 };
@@ -70,5 +78,18 @@ static struct batadv_algo_ops batadv_batman_v __read_mostly = {
  */
 int __init batadv_v_init(void)
 {
-	return batadv_algo_register(&batadv_batman_v);
+	int ret;
+
+	/* B.A.T.M.A.N. V echo location protocol packet  */
+	ret = batadv_recv_handler_register(BATADV_ELP,
+					   batadv_v_elp_packet_recv);
+	if (ret < 0)
+		return ret;
+
+	ret = batadv_algo_register(&batadv_batman_v);
+
+	if (ret < 0)
+		batadv_recv_handler_unregister(BATADV_ELP);
+
+	return ret;
 }
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index bc6e046c614c..dac88fad2697 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -38,7 +38,10 @@
 
 #include "bat_algo.h"
 #include "hard-interface.h"
+#include "hash.h"
+#include "originator.h"
 #include "packet.h"
+#include "routing.h"
 #include "send.h"
 
 /**
@@ -191,3 +194,152 @@ void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface)
 	}
 	rcu_read_unlock();
 }
+
+/**
+ * batadv_v_ogm_orig_get - retrieve and possibly create an originator node
+ * @bat_priv: the bat priv with all the soft interface information
+ * @addr: the address of the originator
+ *
+ * Return: the orig_node corresponding to the specified address. If such object
+ * does not exist it is allocated here. In case of allocation failure returns
+ * NULL.
+ */
+static struct batadv_orig_node *
+batadv_v_ogm_orig_get(struct batadv_priv *bat_priv,
+		      const u8 *addr)
+{
+	struct batadv_orig_node *orig_node;
+	int hash_added;
+
+	orig_node = batadv_orig_hash_find(bat_priv, addr);
+	if (orig_node)
+		return orig_node;
+
+	orig_node = batadv_orig_node_new(bat_priv, addr);
+	if (!orig_node)
+		return NULL;
+
+	hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig,
+				     batadv_choose_orig, orig_node,
+				     &orig_node->hash_entry);
+	if (hash_added != 0) {
+		/* orig_node->refcounter is initialised to 2 by
+		 * batadv_orig_node_new()
+		 */
+		batadv_orig_node_put(orig_node);
+		batadv_orig_node_put(orig_node);
+		orig_node = NULL;
+	}
+
+	return orig_node;
+}
+
+/**
+ * batadv_v_elp_neigh_update - update an ELP neighbour node
+ * @bat_priv: the bat priv with all the soft interface information
+ * @neigh_addr: the neighbour interface address
+ * @if_incoming: the interface the packet was received through
+ * @elp_packet: the received ELP packet
+ *
+ * Updates the ELP neighbour node state with the data received within the new
+ * ELP packet.
+ */
+static void batadv_v_elp_neigh_update(struct batadv_priv *bat_priv,
+				      u8 *neigh_addr,
+				      struct batadv_hard_iface *if_incoming,
+				      struct batadv_elp_packet *elp_packet)
+
+{
+	struct batadv_neigh_node *neigh;
+	struct batadv_orig_node *orig_neigh;
+	struct batadv_hardif_neigh_node *hardif_neigh;
+	s32 seqno_diff;
+	s32 elp_latest_seqno;
+
+	orig_neigh = batadv_v_ogm_orig_get(bat_priv, elp_packet->orig);
+	if (!orig_neigh)
+		return;
+
+	neigh = batadv_neigh_node_new(orig_neigh, if_incoming, neigh_addr);
+	if (!neigh)
+		goto orig_free;
+
+	hardif_neigh = batadv_hardif_neigh_get(if_incoming, neigh_addr);
+	if (!hardif_neigh)
+		goto neigh_free;
+
+	elp_latest_seqno = hardif_neigh->bat_v.elp_latest_seqno;
+	seqno_diff = ntohl(elp_packet->seqno) - elp_latest_seqno;
+
+	/* known or older sequence numbers are ignored. However always adopt
+	 * if the router seems to have been restarted.
+	 */
+	if (seqno_diff < 1 && seqno_diff > -BATADV_ELP_MAX_AGE)
+		goto hardif_free;
+
+	neigh->last_seen = jiffies;
+	hardif_neigh->last_seen = jiffies;
+	hardif_neigh->bat_v.elp_latest_seqno = ntohl(elp_packet->seqno);
+	hardif_neigh->bat_v.elp_interval = ntohl(elp_packet->elp_interval);
+
+hardif_free:
+	if (hardif_neigh)
+		batadv_hardif_neigh_put(hardif_neigh);
+neigh_free:
+	if (neigh)
+		batadv_neigh_node_put(neigh);
+orig_free:
+	if (orig_neigh)
+		batadv_orig_node_put(orig_neigh);
+}
+
+/**
+ * batadv_v_elp_packet_recv - main ELP packet handler
+ * @skb: the received packet
+ * @if_incoming: the interface this packet was received through
+ *
+ * Return: NET_RX_SUCCESS and consumes the skb if the packet was peoperly
+ * processed or NET_RX_DROP in case of failure.
+ */
+int batadv_v_elp_packet_recv(struct sk_buff *skb,
+			     struct batadv_hard_iface *if_incoming)
+{
+	struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
+	struct batadv_elp_packet *elp_packet;
+	struct batadv_hard_iface *primary_if;
+	struct ethhdr *ethhdr = (struct ethhdr *)skb_mac_header(skb);
+	bool ret;
+
+	ret = batadv_check_management_packet(skb, if_incoming, BATADV_ELP_HLEN);
+	if (!ret)
+		return NET_RX_DROP;
+
+	if (batadv_is_my_mac(bat_priv, ethhdr->h_source))
+		return NET_RX_DROP;
+
+	/* did we receive a B.A.T.M.A.N. V ELP packet on an interface
+	 * that does not have B.A.T.M.A.N. V ELP enabled ?
+	 */
+	if (strcmp(bat_priv->bat_algo_ops->name, "BATMAN_V") != 0)
+		return NET_RX_DROP;
+
+	elp_packet = (struct batadv_elp_packet *)skb->data;
+
+	batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+		   "Received ELP packet from %pM seqno %u ORIG: %pM\n",
+		   ethhdr->h_source, ntohl(elp_packet->seqno),
+		   elp_packet->orig);
+
+	primary_if = batadv_primary_if_get_selected(bat_priv);
+	if (!primary_if)
+		goto out;
+
+	batadv_v_elp_neigh_update(bat_priv, ethhdr->h_source, if_incoming,
+				  elp_packet);
+
+out:
+	if (primary_if)
+		batadv_hardif_put(primary_if);
+	consume_skb(skb);
+	return NET_RX_SUCCESS;
+}
diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h
index 7a584d1f055b..5a7bc398a9ef 100644
--- a/net/batman-adv/bat_v_elp.h
+++ b/net/batman-adv/bat_v_elp.h
@@ -20,8 +20,12 @@
 #ifndef _NET_BATMAN_ADV_BAT_V_ELP_H_
 #define _NET_BATMAN_ADV_BAT_V_ELP_H_
 
+struct sk_buff;
+
 int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface);
 void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface);
 void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface);
+int batadv_v_elp_packet_recv(struct sk_buff *skb,
+			     struct batadv_hard_iface *if_incoming);
 
 #endif /* _NET_BATMAN_ADV_BAT_V_ELP_H_ */
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 8c01f54c61f3..d91bae270f42 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -60,6 +60,9 @@
 #define BATADV_TQ_LOCAL_BIDRECT_RECV_MINIMUM 1
 #define BATADV_TQ_TOTAL_BIDRECT_LIMIT 1
 
+/* B.A.T.M.A.N. V */
+#define BATADV_ELP_MAX_AGE 64
+
 /* number of OGMs sent with the last tt diff */
 #define BATADV_TT_OGM_APPEND_MAX 3
 
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 992d5fd5554e..7fc82a5d4ac4 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -22,6 +22,7 @@
 #error only "main.h" can be included directly
 #endif
 
+#include <linux/average.h>
 #include <linux/bitops.h>
 #include <linux/compiler.h>
 #include <linux/if_ether.h>
@@ -364,12 +365,28 @@ struct batadv_gw_node {
 	struct rcu_head rcu;
 };
 
+DECLARE_EWMA(throughput, 1024, 8)
+
+/**
+ * struct batadv_hardif_neigh_node_bat_v - B.A.T.M.A.N. V private neighbor
+ *  information
+ * @throughput: ewma link throughput towards this neighbor
+ * @elp_interval: time interval between two ELP transmissions
+ * @elp_latest_seqno: latest and best known ELP sequence number
+ */
+struct batadv_hardif_neigh_node_bat_v {
+	struct ewma_throughput throughput;
+	u32 elp_interval;
+	u32 elp_latest_seqno;
+};
+
 /**
  * struct batadv_hardif_neigh_node - unique neighbor per hard-interface
  * @list: list node for batadv_hard_iface::neigh_list
  * @addr: the MAC address of the neighboring interface
  * @if_incoming: pointer to incoming hard-interface
  * @last_seen: when last packet via this neighbor was received
+ * @bat_v: B.A.T.M.A.N. V private data
  * @refcount: number of contexts the object is used
  * @rcu: struct used for freeing in a RCU-safe manner
  */
@@ -378,6 +395,9 @@ struct batadv_hardif_neigh_node {
 	u8 addr[ETH_ALEN];
 	struct batadv_hard_iface *if_incoming;
 	unsigned long last_seen;
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+	struct batadv_hardif_neigh_node_bat_v bat_v;
+#endif
 	struct kref refcount;
 	struct rcu_head rcu;
 };
@@ -424,11 +444,21 @@ struct batadv_neigh_ifinfo_bat_iv {
 	u8 real_packet_count;
 };
 
+/**
+ * struct batadv_neigh_ifinfo_bat_v - neighbor information per outgoing
+ *  interface for B.A.T.M.A.N. V
+ * @throughput: last throughput metric received from originator via this neigh
+ */
+struct batadv_neigh_ifinfo_bat_v {
+	u32 throughput;
+};
+
 /**
  * struct batadv_neigh_ifinfo - neighbor information per outgoing interface
  * @list: list node for batadv_neigh_node::ifinfo_list
  * @if_outgoing: pointer to outgoing hard-interface
  * @bat_iv: B.A.T.M.A.N. IV private structure
+ * @bat_v: B.A.T.M.A.N. V private data
  * @last_ttl: last received ttl from this neigh node
  * @refcount: number of contexts the object is used
  * @rcu: struct used for freeing in a RCU-safe manner
@@ -437,6 +467,9 @@ struct batadv_neigh_ifinfo {
 	struct hlist_node list;
 	struct batadv_hard_iface *if_outgoing;
 	struct batadv_neigh_ifinfo_bat_iv bat_iv;
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+	struct batadv_neigh_ifinfo_bat_v bat_v;
+#endif
 	u8 last_ttl;
 	struct kref refcount;
 	struct rcu_head rcu;
-- 
cgit v1.2.3-59-g8ed1b


From 7f136cd491013285442ee1e7854fab1736f5757c Mon Sep 17 00:00:00 2001
From: Linus Luessing <linus.luessing@web.de>
Date: Sat, 16 Jan 2016 16:40:11 +0800
Subject: batman-adv: ELP - adding sysfs parameter for elp interval

This parameter can be set individually on each interface and
allows the configuration of the elp interval for the link
quality measurements during runtime. Usually it is desirable
to set it to a higher (= slower) value on interfaces which
have a more static characteristic (e.g. wired interfaces)
or very dense neighbourhoods to reduce overhead.

Developed by Linus during a 6 months trainee study period in
Ascom (Switzerland) AG.

Signed-off-by: Linus Luessing <linus.luessing@web.de>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
[antonio@open-mesh.com: respin on top of the latest master]
Signed-off-by: Antonio Quartulli <antonio@open-mesh.com>
---
 Documentation/ABI/testing/sysfs-class-net-batman-adv | 8 +++++++-
 net/batman-adv/sysfs.c                               | 7 +++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/Documentation/ABI/testing/sysfs-class-net-batman-adv b/Documentation/ABI/testing/sysfs-class-net-batman-adv
index 7f34a95bb963..aea78db983bc 100644
--- a/Documentation/ABI/testing/sysfs-class-net-batman-adv
+++ b/Documentation/ABI/testing/sysfs-class-net-batman-adv
@@ -1,4 +1,11 @@
 
+What:           /sys/class/net/<iface>/batman-adv/elp_interval
+Date:           Feb 2014
+Contact:        Linus Lüssing <linus.luessing@web.de>
+Description:
+                Defines the interval in milliseconds in which batman
+                sends its probing packets for link quality measurements.
+
 What:           /sys/class/net/<iface>/batman-adv/iface_status
 Date:           May 2010
 Contact:        Marek Lindner <mareklindner@neomailbox.ch>
@@ -12,4 +19,3 @@ Description:
                 The /sys/class/net/<iface>/batman-adv/mesh_iface file
                 displays the batman mesh interface this <iface>
                 currently is associated with.
-
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index d3f69d5e79d9..e86014332e1c 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -920,10 +920,17 @@ static ssize_t batadv_show_iface_status(struct kobject *kobj,
 static BATADV_ATTR(mesh_iface, S_IRUGO | S_IWUSR, batadv_show_mesh_iface,
 		   batadv_store_mesh_iface);
 static BATADV_ATTR(iface_status, S_IRUGO, batadv_show_iface_status, NULL);
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+BATADV_ATTR_HIF_UINT(elp_interval, bat_v.elp_interval, S_IRUGO | S_IWUSR,
+		     2 * BATADV_JITTER, INT_MAX, NULL);
+#endif
 
 static struct batadv_attribute *batadv_batman_attrs[] = {
 	&batadv_attr_mesh_iface,
 	&batadv_attr_iface_status,
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+	&batadv_attr_elp_interval,
+#endif
 	NULL,
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 0da0035942d47766c32843143fb5dba7a29cb48c Mon Sep 17 00:00:00 2001
From: Antonio Quartulli <antonio@open-mesh.com>
Date: Sat, 16 Jan 2016 16:40:12 +0800
Subject: batman-adv: OGMv2 - add basic infrastructure

This is the initial implementation of the new OGM protocol
(version 2). It has been designed to work on top of the
newly added ELP.

In the previous version the OGM protocol was used to both
measure link qualities and flood the network with the metric
information. In this version the protocol is in charge of
the latter task only, leaving the former to ELP.

This means being able to decouple the interval used by the
neighbor discovery from the OGM broadcasting, which revealed
to be costly in dense networks and needed to be relaxed so
leading to a less responsive routing protocol.

Signed-off-by: Antonio Quartulli <antonio@open-mesh.com>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
---
 net/batman-adv/Makefile    |   1 +
 net/batman-adv/bat_algo.h  |  13 +++
 net/batman-adv/bat_v.c     |  51 ++++++++-
 net/batman-adv/bat_v_ogm.c | 279 +++++++++++++++++++++++++++++++++++++++++++++
 net/batman-adv/bat_v_ogm.h |  32 ++++++
 net/batman-adv/main.c      |   6 +
 net/batman-adv/main.h      |   1 +
 net/batman-adv/packet.h    |  29 +++++
 net/batman-adv/types.h     |  18 +++
 9 files changed, 427 insertions(+), 3 deletions(-)
 create mode 100644 net/batman-adv/bat_v_ogm.c
 create mode 100644 net/batman-adv/bat_v_ogm.h

(limited to 'net')

diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile
index ca51686e9f72..797cf2fc88c1 100644
--- a/net/batman-adv/Makefile
+++ b/net/batman-adv/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_BATMAN_ADV) += batman-adv.o
 batman-adv-y += bat_iv_ogm.o
 batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v.o
 batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v_elp.o
+batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v_ogm.o
 batman-adv-y += bitarray.o
 batman-adv-$(CONFIG_BATMAN_ADV_BLA) += bridge_loop_avoidance.o
 batman-adv-$(CONFIG_DEBUG_FS) += debugfs.o
diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h
index a4e994e26da1..03dafd33d23b 100644
--- a/net/batman-adv/bat_algo.h
+++ b/net/batman-adv/bat_algo.h
@@ -18,11 +18,15 @@
 #ifndef _NET_BATMAN_ADV_BAT_ALGO_H_
 #define _NET_BATMAN_ADV_BAT_ALGO_H_
 
+struct batadv_priv;
+
 int batadv_iv_init(void);
 
 #ifdef CONFIG_BATMAN_ADV_BATMAN_V
 
 int batadv_v_init(void);
+int batadv_v_mesh_init(struct batadv_priv *bat_priv);
+void batadv_v_mesh_free(struct batadv_priv *bat_priv);
 
 #else
 
@@ -31,6 +35,15 @@ static inline int batadv_v_init(void)
 	return 0;
 }
 
+static inline int batadv_v_mesh_init(struct batadv_priv *bat_priv)
+{
+	return 0;
+}
+
+static inline void batadv_v_mesh_free(struct batadv_priv *bat_priv)
+{
+}
+
 #endif /* CONFIG_BATMAN_ADV_BATMAN_V */
 
 #endif /* _NET_BATMAN_ADV_BAT_ALGO_H_ */
diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index 0fea894855ff..b90a4dfe8ba6 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -22,11 +22,22 @@
 #include <linux/init.h>
 
 #include "bat_v_elp.h"
+#include "bat_v_ogm.h"
 #include "packet.h"
 
 static int batadv_v_iface_enable(struct batadv_hard_iface *hard_iface)
 {
-	return batadv_v_elp_iface_enable(hard_iface);
+	int ret;
+
+	ret = batadv_v_elp_iface_enable(hard_iface);
+	if (ret < 0)
+		return ret;
+
+	ret = batadv_v_ogm_iface_enable(hard_iface);
+	if (ret < 0)
+		batadv_v_elp_iface_disable(hard_iface);
+
+	return ret;
 }
 
 static void batadv_v_iface_disable(struct batadv_hard_iface *hard_iface)
@@ -41,6 +52,7 @@ static void batadv_v_iface_update_mac(struct batadv_hard_iface *hard_iface)
 static void batadv_v_primary_iface_set(struct batadv_hard_iface *hard_iface)
 {
 	batadv_v_elp_primary_iface_set(hard_iface);
+	batadv_v_ogm_primary_iface_set(hard_iface);
 }
 
 static void
@@ -68,6 +80,27 @@ static struct batadv_algo_ops batadv_batman_v __read_mostly = {
 	.bat_ogm_schedule = batadv_v_ogm_schedule,
 };
 
+/**
+ * batadv_v_mesh_init - initialize the B.A.T.M.A.N. V private resources for a
+ *  mesh
+ * @bat_priv: the object representing the mesh interface to initialise
+ *
+ * Return: 0 on success or a negative error code otherwise
+ */
+int batadv_v_mesh_init(struct batadv_priv *bat_priv)
+{
+	return batadv_v_ogm_init(bat_priv);
+}
+
+/**
+ * batadv_v_mesh_free - free the B.A.T.M.A.N. V private resources for a mesh
+ * @bat_priv: the object representing the mesh interface to free
+ */
+void batadv_v_mesh_free(struct batadv_priv *bat_priv)
+{
+	batadv_v_ogm_free(bat_priv);
+}
+
 /**
  * batadv_v_init - B.A.T.M.A.N. V initialization function
  *
@@ -86,10 +119,22 @@ int __init batadv_v_init(void)
 	if (ret < 0)
 		return ret;
 
-	ret = batadv_algo_register(&batadv_batman_v);
+	ret = batadv_recv_handler_register(BATADV_OGM2,
+					   batadv_v_ogm_packet_recv);
+	if (ret < 0)
+		goto elp_unregister;
 
+	ret = batadv_algo_register(&batadv_batman_v);
 	if (ret < 0)
-		batadv_recv_handler_unregister(BATADV_ELP);
+		goto ogm_unregister;
+
+	return ret;
+
+ogm_unregister:
+	batadv_recv_handler_unregister(BATADV_OGM2);
+
+elp_unregister:
+	batadv_recv_handler_unregister(BATADV_ELP);
 
 	return ret;
 }
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
new file mode 100644
index 000000000000..23932f9b85ec
--- /dev/null
+++ b/net/batman-adv/bat_v_ogm.c
@@ -0,0 +1,279 @@
+/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors:
+ *
+ * Antonio Quartulli
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "bat_v_ogm.h"
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/byteorder/generic.h>
+#include <linux/errno.h>
+#include <linux/etherdevice.h>
+#include <linux/fs.h>
+#include <linux/if_ether.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/random.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+
+#include "hard-interface.h"
+#include "packet.h"
+#include "routing.h"
+#include "send.h"
+#include "translation-table.h"
+
+/**
+ * batadv_v_ogm_start_timer - restart the OGM sending timer
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+static void batadv_v_ogm_start_timer(struct batadv_priv *bat_priv)
+{
+	unsigned long msecs;
+	/* this function may be invoked in different contexts (ogm rescheduling
+	 * or hard_iface activation), but the work timer should not be reset
+	 */
+	if (delayed_work_pending(&bat_priv->bat_v.ogm_wq))
+		return;
+
+	msecs = atomic_read(&bat_priv->orig_interval) - BATADV_JITTER;
+	msecs += prandom_u32() % (2 * BATADV_JITTER);
+	queue_delayed_work(batadv_event_workqueue, &bat_priv->bat_v.ogm_wq,
+			   msecs_to_jiffies(msecs));
+}
+
+/**
+ * batadv_v_ogm_send_to_if - send a batman ogm using a given interface
+ * @skb: the OGM to send
+ * @hard_iface: the interface to use to send the OGM
+ */
+static void batadv_v_ogm_send_to_if(struct sk_buff *skb,
+				    struct batadv_hard_iface *hard_iface)
+{
+	struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+
+	if (hard_iface->if_status != BATADV_IF_ACTIVE)
+		return;
+
+	batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_TX);
+	batadv_add_counter(bat_priv, BATADV_CNT_MGMT_TX_BYTES,
+			   skb->len + ETH_HLEN);
+
+	batadv_send_skb_packet(skb, hard_iface, batadv_broadcast_addr);
+}
+
+/**
+ * batadv_v_ogm_send - periodic worker broadcasting the own OGM
+ * @work: work queue item
+ */
+static void batadv_v_ogm_send(struct work_struct *work)
+{
+	struct batadv_hard_iface *hard_iface;
+	struct batadv_priv_bat_v *bat_v;
+	struct batadv_priv *bat_priv;
+	struct batadv_ogm2_packet *ogm_packet;
+	struct sk_buff *skb, *skb_tmp;
+	unsigned char *ogm_buff, *pkt_buff;
+	int ogm_buff_len;
+	u16 tvlv_len = 0;
+
+	bat_v = container_of(work, struct batadv_priv_bat_v, ogm_wq.work);
+	bat_priv = container_of(bat_v, struct batadv_priv, bat_v);
+
+	if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING)
+		goto out;
+
+	ogm_buff = bat_priv->bat_v.ogm_buff;
+	ogm_buff_len = bat_priv->bat_v.ogm_buff_len;
+	/* tt changes have to be committed before the tvlv data is
+	 * appended as it may alter the tt tvlv container
+	 */
+	batadv_tt_local_commit_changes(bat_priv);
+	tvlv_len = batadv_tvlv_container_ogm_append(bat_priv, &ogm_buff,
+						    &ogm_buff_len,
+						    BATADV_OGM2_HLEN);
+
+	bat_priv->bat_v.ogm_buff = ogm_buff;
+	bat_priv->bat_v.ogm_buff_len = ogm_buff_len;
+
+	skb = netdev_alloc_skb_ip_align(NULL, ETH_HLEN + ogm_buff_len);
+	if (!skb)
+		goto reschedule;
+
+	skb_reserve(skb, ETH_HLEN);
+	pkt_buff = skb_put(skb, ogm_buff_len);
+	memcpy(pkt_buff, ogm_buff, ogm_buff_len);
+
+	ogm_packet = (struct batadv_ogm2_packet *)skb->data;
+	ogm_packet->seqno = htonl(atomic_read(&bat_priv->bat_v.ogm_seqno));
+	atomic_inc(&bat_priv->bat_v.ogm_seqno);
+	ogm_packet->tvlv_len = htons(tvlv_len);
+
+	/* broadcast on every interface */
+	rcu_read_lock();
+	list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
+		if (hard_iface->soft_iface != bat_priv->soft_iface)
+			continue;
+
+		batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+			   "Sending own OGM2 packet (originator %pM, seqno %u, throughput %u, TTL %d) on interface %s [%pM]\n",
+			   ogm_packet->orig, ntohl(ogm_packet->seqno),
+			   ntohl(ogm_packet->throughput), ogm_packet->ttl,
+			   hard_iface->net_dev->name,
+			   hard_iface->net_dev->dev_addr);
+
+		/* this skb gets consumed by batadv_v_ogm_send_to_if() */
+		skb_tmp = skb_clone(skb, GFP_ATOMIC);
+		if (!skb_tmp)
+			break;
+
+		batadv_v_ogm_send_to_if(skb_tmp, hard_iface);
+	}
+	rcu_read_unlock();
+
+	consume_skb(skb);
+
+reschedule:
+	batadv_v_ogm_start_timer(bat_priv);
+out:
+	return;
+}
+
+/**
+ * batadv_v_ogm_iface_enable - prepare an interface for B.A.T.M.A.N. V
+ * @hard_iface: the interface to prepare
+ *
+ * Takes care of scheduling own OGM sending routine for this interface.
+ *
+ * Return: 0 on success or a negative error code otherwise
+ */
+int batadv_v_ogm_iface_enable(struct batadv_hard_iface *hard_iface)
+{
+	struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+
+	batadv_v_ogm_start_timer(bat_priv);
+
+	return 0;
+}
+
+/**
+ * batadv_v_ogm_primary_iface_set - set a new primary interface
+ * @primary_iface: the new primary interface
+ */
+void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface)
+{
+	struct batadv_priv *bat_priv = netdev_priv(primary_iface->soft_iface);
+	struct batadv_ogm2_packet *ogm_packet;
+
+	if (!bat_priv->bat_v.ogm_buff)
+		return;
+
+	ogm_packet = (struct batadv_ogm2_packet *)bat_priv->bat_v.ogm_buff;
+	ether_addr_copy(ogm_packet->orig, primary_iface->net_dev->dev_addr);
+}
+
+/**
+ * batadv_v_ogm_packet_recv - OGM2 receiving handler
+ * @skb: the received OGM
+ * @if_incoming: the interface where this OGM has been received
+ *
+ * Return: NET_RX_SUCCESS and consume the skb on success or returns NET_RX_DROP
+ * (without freeing the skb) on failure
+ */
+int batadv_v_ogm_packet_recv(struct sk_buff *skb,
+			     struct batadv_hard_iface *if_incoming)
+{
+	struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
+	struct batadv_ogm2_packet *ogm_packet;
+	struct ethhdr *ethhdr = eth_hdr(skb);
+
+	/* did we receive a OGM2 packet on an interface that does not have
+	 * B.A.T.M.A.N. V enabled ?
+	 */
+	if (strcmp(bat_priv->bat_algo_ops->name, "BATMAN_V") != 0)
+		return NET_RX_DROP;
+
+	if (!batadv_check_management_packet(skb, if_incoming, BATADV_OGM2_HLEN))
+		return NET_RX_DROP;
+
+	if (batadv_is_my_mac(bat_priv, ethhdr->h_source))
+		return NET_RX_DROP;
+
+	ogm_packet = (struct batadv_ogm2_packet *)skb->data;
+
+	if (batadv_is_my_mac(bat_priv, ogm_packet->orig))
+		return NET_RX_DROP;
+
+	batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_RX);
+	batadv_add_counter(bat_priv, BATADV_CNT_MGMT_RX_BYTES,
+			   skb->len + ETH_HLEN);
+
+	consume_skb(skb);
+	return NET_RX_SUCCESS;
+}
+
+/**
+ * batadv_v_ogm_init - initialise the OGM2 engine
+ * @bat_priv: the bat priv with all the soft interface information
+ *
+ * Return: 0 on success or a negative error code in case of failure
+ */
+int batadv_v_ogm_init(struct batadv_priv *bat_priv)
+{
+	struct batadv_ogm2_packet *ogm_packet;
+	unsigned char *ogm_buff;
+	u32 random_seqno;
+
+	bat_priv->bat_v.ogm_buff_len = BATADV_OGM2_HLEN;
+	ogm_buff = kzalloc(bat_priv->bat_v.ogm_buff_len, GFP_ATOMIC);
+	if (!ogm_buff)
+		return -ENOMEM;
+
+	bat_priv->bat_v.ogm_buff = ogm_buff;
+	ogm_packet = (struct batadv_ogm2_packet *)ogm_buff;
+	ogm_packet->packet_type = BATADV_OGM2;
+	ogm_packet->version = BATADV_COMPAT_VERSION;
+	ogm_packet->ttl = BATADV_TTL;
+	ogm_packet->flags = BATADV_NO_FLAGS;
+	ogm_packet->throughput = htonl(BATADV_THROUGHPUT_MAX_VALUE);
+
+	/* randomize initial seqno to avoid collision */
+	get_random_bytes(&random_seqno, sizeof(random_seqno));
+	atomic_set(&bat_priv->bat_v.ogm_seqno, random_seqno);
+	INIT_DELAYED_WORK(&bat_priv->bat_v.ogm_wq, batadv_v_ogm_send);
+
+	return 0;
+}
+
+/**
+ * batadv_v_ogm_free - free OGM private resources
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+void batadv_v_ogm_free(struct batadv_priv *bat_priv)
+{
+	cancel_delayed_work_sync(&bat_priv->bat_v.ogm_wq);
+
+	kfree(bat_priv->bat_v.ogm_buff);
+	bat_priv->bat_v.ogm_buff = NULL;
+	bat_priv->bat_v.ogm_buff_len = 0;
+}
diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h
new file mode 100644
index 000000000000..d9f178cedcd2
--- /dev/null
+++ b/net/batman-adv/bat_v_ogm.h
@@ -0,0 +1,32 @@
+/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors:
+ *
+ * Antonio Quartulli
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _BATMAN_ADV_BATADV_V_OGM_H_
+#define _BATMAN_ADV_BATADV_V_OGM_H_
+
+struct batadv_hard_iface;
+struct batadv_priv;
+struct sk_buff;
+
+int batadv_v_ogm_init(struct batadv_priv *bat_priv);
+void batadv_v_ogm_free(struct batadv_priv *bat_priv);
+int batadv_v_ogm_iface_enable(struct batadv_hard_iface *hard_iface);
+void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface);
+int batadv_v_ogm_packet_recv(struct sk_buff *skb,
+			     struct batadv_hard_iface *if_incoming);
+
+#endif /* _BATMAN_ADV_BATADV_V_OGM_H_ */
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 216057205e10..d64ddb961979 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -160,6 +160,10 @@ int batadv_mesh_init(struct net_device *soft_iface)
 	INIT_HLIST_HEAD(&bat_priv->tvlv.handler_list);
 	INIT_HLIST_HEAD(&bat_priv->softif_vlan_list);
 
+	ret = batadv_v_mesh_init(bat_priv);
+	if (ret < 0)
+		goto err;
+
 	ret = batadv_originator_init(bat_priv);
 	if (ret < 0)
 		goto err;
@@ -202,6 +206,8 @@ void batadv_mesh_free(struct net_device *soft_iface)
 	batadv_purge_outstanding_packets(bat_priv, NULL);
 
 	batadv_gw_node_free(bat_priv);
+
+	batadv_v_mesh_free(bat_priv);
 	batadv_nc_mesh_free(bat_priv);
 	batadv_dat_free(bat_priv);
 	batadv_bla_free(bat_priv);
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index d91bae270f42..fa0b067e589d 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -30,6 +30,7 @@
 /* B.A.T.M.A.N. parameters */
 
 #define BATADV_TQ_MAX_VALUE 255
+#define BATADV_THROUGHPUT_MAX_VALUE 0xFFFFFFFF
 #define BATADV_JITTER 20
 
 /* Time To Live of broadcast messages */
diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h
index c698b6452953..8a8d7ca1a5cf 100644
--- a/net/batman-adv/packet.h
+++ b/net/batman-adv/packet.h
@@ -27,6 +27,7 @@
  * @BATADV_BCAST: broadcast packets carrying broadcast payload
  * @BATADV_CODED: network coded packets
  * @BATADV_ELP: echo location packets for B.A.T.M.A.N. V
+ * @BATADV_OGM2: originator messages for B.A.T.M.A.N. V
  *
  * @BATADV_UNICAST: unicast packets carrying unicast payload traffic
  * @BATADV_UNICAST_FRAG: unicast packets carrying a fragment of the original
@@ -42,6 +43,7 @@ enum batadv_packettype {
 	BATADV_BCAST            = 0x01,
 	BATADV_CODED            = 0x02,
 	BATADV_ELP		= 0x03,
+	BATADV_OGM2		= 0x04,
 	/* 0x40 - 0x7f: unicast */
 #define BATADV_UNICAST_MIN     0x40
 	BATADV_UNICAST          = 0x40,
@@ -236,6 +238,33 @@ struct batadv_ogm_packet {
 
 #define BATADV_OGM_HLEN sizeof(struct batadv_ogm_packet)
 
+/**
+ * struct batadv_ogm2_packet - ogm2 (routing protocol) packet
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the general header
+ * @ttl: time to live for this packet, part of the general header
+ * @flags: reseved for routing relevant flags - currently always 0
+ * @seqno: sequence number
+ * @orig: originator mac address
+ * @tvlv_len: length of the appended tvlv buffer (in bytes)
+ * @throughput: the currently flooded path throughput
+ */
+struct batadv_ogm2_packet {
+	u8     packet_type;
+	u8     version;
+	u8     ttl;
+	u8     flags;
+	__be32 seqno;
+	u8     orig[ETH_ALEN];
+	__be16 tvlv_len;
+	__be32 throughput;
+	/* __packed is not needed as the struct size is divisible by 4,
+	 * and the largest data type in this struct has a size of 4.
+	 */
+};
+
+#define BATADV_OGM2_HLEN sizeof(struct batadv_ogm2_packet)
+
 /**
  * struct batadv_elp_packet - elp (neighbor discovery) packet
  * @packet_type: batman-adv packet type, part of the general header
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 7fc82a5d4ac4..e213ccdd4c6c 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -801,6 +801,20 @@ struct batadv_softif_vlan {
 	struct rcu_head rcu;
 };
 
+/**
+ * struct batadv_priv_bat_v - B.A.T.M.A.N. V per soft-interface private data
+ * @ogm_buff: buffer holding the OGM packet
+ * @ogm_buff_len: length of the OGM packet buffer
+ * @ogm_seqno: OGM sequence number - used to identify each OGM
+ * @ogm_wq: workqueue used to schedule OGM transmissions
+ */
+struct batadv_priv_bat_v {
+	unsigned char *ogm_buff;
+	int ogm_buff_len;
+	atomic_t ogm_seqno;
+	struct delayed_work ogm_wq;
+};
+
 /**
  * struct batadv_priv - per mesh interface data
  * @mesh_state: current status of the mesh (inactive/active/deactivating)
@@ -855,6 +869,7 @@ struct batadv_softif_vlan {
  * @mcast: multicast data
  * @network_coding: bool indicating whether network coding is enabled
  * @nc: network coding data
+ * @bat_v: B.A.T.M.A.N. V per soft-interface private data
  */
 struct batadv_priv {
 	atomic_t mesh_state;
@@ -920,6 +935,9 @@ struct batadv_priv {
 	atomic_t network_coding;
 	struct batadv_priv_nc nc;
 #endif /* CONFIG_BATMAN_ADV_NC */
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+	struct batadv_priv_bat_v bat_v;
+#endif
 };
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From 9323158ef9f49935f0c61509919acd31dda8f11b Mon Sep 17 00:00:00 2001
From: Antonio Quartulli <antonio@open-mesh.com>
Date: Sat, 16 Jan 2016 16:40:13 +0800
Subject: batman-adv: OGMv2 - implement originators logic

Add the support for recognising new originators in the
network and rebroadcast their OGMs.

Signed-off-by: Antonio Quartulli <antonio@open-mesh.com>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
---
 net/batman-adv/bat_v_elp.c |  41 +---
 net/batman-adv/bat_v_ogm.c | 556 ++++++++++++++++++++++++++++++++++++++++++++-
 net/batman-adv/bat_v_ogm.h |   4 +
 net/batman-adv/main.h      |   2 +
 net/batman-adv/types.h     |   4 +
 5 files changed, 566 insertions(+), 41 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index dac88fad2697..fac5aee7ce05 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -37,8 +37,8 @@
 #include <linux/workqueue.h>
 
 #include "bat_algo.h"
+#include "bat_v_ogm.h"
 #include "hard-interface.h"
-#include "hash.h"
 #include "originator.h"
 #include "packet.h"
 #include "routing.h"
@@ -195,45 +195,6 @@ void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface)
 	rcu_read_unlock();
 }
 
-/**
- * batadv_v_ogm_orig_get - retrieve and possibly create an originator node
- * @bat_priv: the bat priv with all the soft interface information
- * @addr: the address of the originator
- *
- * Return: the orig_node corresponding to the specified address. If such object
- * does not exist it is allocated here. In case of allocation failure returns
- * NULL.
- */
-static struct batadv_orig_node *
-batadv_v_ogm_orig_get(struct batadv_priv *bat_priv,
-		      const u8 *addr)
-{
-	struct batadv_orig_node *orig_node;
-	int hash_added;
-
-	orig_node = batadv_orig_hash_find(bat_priv, addr);
-	if (orig_node)
-		return orig_node;
-
-	orig_node = batadv_orig_node_new(bat_priv, addr);
-	if (!orig_node)
-		return NULL;
-
-	hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig,
-				     batadv_choose_orig, orig_node,
-				     &orig_node->hash_entry);
-	if (hash_added != 0) {
-		/* orig_node->refcounter is initialised to 2 by
-		 * batadv_orig_node_new()
-		 */
-		batadv_orig_node_put(orig_node);
-		batadv_orig_node_put(orig_node);
-		orig_node = NULL;
-	}
-
-	return orig_node;
-}
-
 /**
  * batadv_v_elp_neigh_update - update an ELP neighbour node
  * @bat_priv: the bat priv with all the soft interface information
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
index 23932f9b85ec..22dda0e079a5 100644
--- a/net/batman-adv/bat_v_ogm.c
+++ b/net/batman-adv/bat_v_ogm.c
@@ -26,6 +26,7 @@
 #include <linux/if_ether.h>
 #include <linux/jiffies.h>
 #include <linux/kernel.h>
+#include <linux/list.h>
 #include <linux/netdevice.h>
 #include <linux/random.h>
 #include <linux/rculist.h>
@@ -38,11 +39,51 @@
 #include <linux/workqueue.h>
 
 #include "hard-interface.h"
+#include "hash.h"
+#include "originator.h"
 #include "packet.h"
 #include "routing.h"
 #include "send.h"
 #include "translation-table.h"
 
+/**
+ * batadv_v_ogm_orig_get - retrieve and possibly create an originator node
+ * @bat_priv: the bat priv with all the soft interface information
+ * @addr: the address of the originator
+ *
+ * Return: the orig_node corresponding to the specified address. If such object
+ * does not exist it is allocated here. In case of allocation failure returns
+ * NULL.
+ */
+struct batadv_orig_node *batadv_v_ogm_orig_get(struct batadv_priv *bat_priv,
+					       const u8 *addr)
+{
+	struct batadv_orig_node *orig_node;
+	int hash_added;
+
+	orig_node = batadv_orig_hash_find(bat_priv, addr);
+	if (orig_node)
+		return orig_node;
+
+	orig_node = batadv_orig_node_new(bat_priv, addr);
+	if (!orig_node)
+		return NULL;
+
+	hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig,
+				     batadv_choose_orig, orig_node,
+				     &orig_node->hash_entry);
+	if (hash_added != 0) {
+		/* orig_node->refcounter is initialised to 2 by
+		 * batadv_orig_node_new()
+		 */
+		batadv_orig_node_put(orig_node);
+		batadv_orig_node_put(orig_node);
+		orig_node = NULL;
+	}
+
+	return orig_node;
+}
+
 /**
  * batadv_v_ogm_start_timer - restart the OGM sending timer
  * @bat_priv: the bat priv with all the soft interface information
@@ -192,6 +233,500 @@ void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface)
 	ether_addr_copy(ogm_packet->orig, primary_iface->net_dev->dev_addr);
 }
 
+/**
+ * batadv_v_ogm_orig_update - update the originator status based on the received
+ *  OGM
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig_node: the originator to update
+ * @neigh_node: the neighbour the OGM has been received from (to update)
+ * @ogm2: the received OGM
+ * @if_outgoing: the interface where this OGM is going to be forwarded through
+ */
+static void
+batadv_v_ogm_orig_update(struct batadv_priv *bat_priv,
+			 struct batadv_orig_node *orig_node,
+			 struct batadv_neigh_node *neigh_node,
+			 const struct batadv_ogm2_packet *ogm2,
+			 struct batadv_hard_iface *if_outgoing)
+{
+	struct batadv_neigh_ifinfo *router_ifinfo = NULL, *neigh_ifinfo = NULL;
+	struct batadv_neigh_node *router = NULL;
+	s32 neigh_seq_diff;
+	u32 neigh_last_seqno;
+	u32 router_last_seqno;
+	u32 router_throughput, neigh_throughput;
+
+	batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+		   "Searching and updating originator entry of received packet\n");
+
+	/* if this neighbor already is our next hop there is nothing
+	 * to change
+	 */
+	router = batadv_orig_router_get(orig_node, if_outgoing);
+	if (router == neigh_node)
+		goto out;
+
+	/* don't consider neighbours with worse throughput.
+	 * also switch route if this seqno is BATADV_V_MAX_ORIGDIFF newer than
+	 * the last received seqno from our best next hop.
+	 */
+	if (router) {
+		router_ifinfo = batadv_neigh_ifinfo_get(router, if_outgoing);
+		neigh_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing);
+
+		/* if these are not allocated, something is wrong. */
+		if (!router_ifinfo || !neigh_ifinfo)
+			goto out;
+
+		neigh_last_seqno = neigh_ifinfo->bat_v.last_seqno;
+		router_last_seqno = router_ifinfo->bat_v.last_seqno;
+		neigh_seq_diff = neigh_last_seqno - router_last_seqno;
+		router_throughput = router_ifinfo->bat_v.throughput;
+		neigh_throughput = neigh_ifinfo->bat_v.throughput;
+
+		if ((neigh_seq_diff < BATADV_OGM_MAX_ORIGDIFF) &&
+		    (router_throughput >= neigh_throughput))
+			goto out;
+	}
+
+	batadv_update_route(bat_priv, orig_node, if_outgoing, neigh_node);
+
+out:
+	if (router_ifinfo)
+		batadv_neigh_ifinfo_put(router_ifinfo);
+	if (neigh_ifinfo)
+		batadv_neigh_ifinfo_put(neigh_ifinfo);
+	if (router)
+		batadv_neigh_node_put(router);
+}
+
+/**
+ * batadv_v_forward_penalty - apply a penalty to the throughput metric forwarded
+ *  with B.A.T.M.A.N. V OGMs
+ * @bat_priv: the bat priv with all the soft interface information
+ * @if_incoming: the interface where the OGM has been received
+ * @if_outgoing: the interface where the OGM has to be forwarded to
+ * @throughput: the current throughput
+ *
+ * Apply a penalty on the current throughput metric value based on the
+ * characteristic of the interface where the OGM has been received. The return
+ * value is computed as follows:
+ * - throughput * 50%          if the incoming and outgoing interface are the
+ *                             same WiFi interface and the throughput is above
+ *                             1MBit/s
+ * - throughput                if the outgoing interface is the default
+ *                             interface (i.e. this OGM is processed for the
+ *                             internal table and not forwarded)
+ * - throughput * hop penalty  otherwise
+ *
+ * Return: the penalised throughput metric.
+ */
+static u32 batadv_v_forward_penalty(struct batadv_priv *bat_priv,
+				    struct batadv_hard_iface *if_incoming,
+				    struct batadv_hard_iface *if_outgoing,
+				    u32 throughput)
+{
+	int hop_penalty = atomic_read(&bat_priv->hop_penalty);
+	int hop_penalty_max = BATADV_TQ_MAX_VALUE;
+
+	/* Don't apply hop penalty in default originator table. */
+	if (if_outgoing == BATADV_IF_DEFAULT)
+		return throughput;
+
+	/* Forwarding on the same WiFi interface cuts the throughput in half
+	 * due to the store & forward characteristics of WIFI.
+	 * Very low throughput values are the exception.
+	 */
+	if ((throughput > 10) &&
+	    (if_incoming == if_outgoing) &&
+	    (batadv_is_wifi_netdev(if_incoming->net_dev)))
+		return throughput / 2;
+
+	/* hop penalty of 255 equals 100% */
+	return throughput * (hop_penalty_max - hop_penalty) / hop_penalty_max;
+}
+
+/**
+ * batadv_v_ogm_forward - forward an OGM to the given outgoing interface
+ * @bat_priv: the bat priv with all the soft interface information
+ * @ogm_received: previously received OGM to be forwarded
+ * @throughput: throughput to announce, may vary per outgoing interface
+ * @if_incoming: the interface on which this OGM was received on
+ * @if_outgoing: the interface to which the OGM has to be forwarded to
+ *
+ * Forward an OGM to an interface after having altered the throughput metric and
+ * the TTL value contained in it. The original OGM isn't modified.
+ */
+static void batadv_v_ogm_forward(struct batadv_priv *bat_priv,
+				 const struct batadv_ogm2_packet *ogm_received,
+				 u32 throughput,
+				 struct batadv_hard_iface *if_incoming,
+				 struct batadv_hard_iface *if_outgoing)
+{
+	struct batadv_ogm2_packet *ogm_forward;
+	unsigned char *skb_buff;
+	struct sk_buff *skb;
+	size_t packet_len;
+	u16 tvlv_len;
+
+	if (ogm_received->ttl <= 1) {
+		batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "ttl exceeded\n");
+		return;
+	}
+
+	tvlv_len = ntohs(ogm_received->tvlv_len);
+
+	packet_len = BATADV_OGM2_HLEN + tvlv_len;
+	skb = netdev_alloc_skb_ip_align(if_outgoing->net_dev,
+					ETH_HLEN + packet_len);
+	if (!skb)
+		return;
+
+	skb_reserve(skb, ETH_HLEN);
+	skb_buff = skb_put(skb, packet_len);
+	memcpy(skb_buff, ogm_received, packet_len);
+
+	/* apply forward penalty */
+	ogm_forward = (struct batadv_ogm2_packet *)skb_buff;
+	ogm_forward->throughput = htonl(throughput);
+	ogm_forward->ttl--;
+
+	batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+		   "Forwarding OGM2 packet on %s: throughput %u, ttl %u, received via %s\n",
+		   if_outgoing->net_dev->name, throughput, ogm_forward->ttl,
+		   if_incoming->net_dev->name);
+
+	batadv_v_ogm_send_to_if(skb, if_outgoing);
+}
+
+/**
+ * batadv_v_ogm_metric_update - update route metric based on OGM
+ * @bat_priv: the bat priv with all the soft interface information
+ * @ogm2: OGM2 structure
+ * @orig_node: Originator structure for which the OGM has been received
+ * @neigh_node: the neigh_node through with the OGM has been received
+ * @if_incoming: the interface where this packet was received
+ * @if_outgoing: the interface for which the packet should be considered
+ *
+ * Return:
+ *  1  if the OGM is new,
+ *  0  if it is not new but valid,
+ *  <0 on error (e.g. old OGM)
+ */
+static int batadv_v_ogm_metric_update(struct batadv_priv *bat_priv,
+				      const struct batadv_ogm2_packet *ogm2,
+				      struct batadv_orig_node *orig_node,
+				      struct batadv_neigh_node *neigh_node,
+				      struct batadv_hard_iface *if_incoming,
+				      struct batadv_hard_iface *if_outgoing)
+{
+	struct batadv_orig_ifinfo *orig_ifinfo = NULL;
+	struct batadv_neigh_ifinfo *neigh_ifinfo = NULL;
+	bool protection_started = false;
+	int ret = -EINVAL;
+	u32 path_throughput;
+	s32 seq_diff;
+
+	orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing);
+	if (!orig_ifinfo)
+		goto out;
+
+	seq_diff = ntohl(ogm2->seqno) - orig_ifinfo->last_real_seqno;
+
+	if (!hlist_empty(&orig_node->neigh_list) &&
+	    batadv_window_protected(bat_priv, seq_diff,
+				    BATADV_OGM_MAX_AGE,
+				    &orig_ifinfo->batman_seqno_reset,
+				    &protection_started)) {
+		batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+			   "Drop packet: packet within window protection time from %pM\n",
+			   ogm2->orig);
+		batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+			   "Last reset: %ld, %ld\n",
+			   orig_ifinfo->batman_seqno_reset, jiffies);
+		goto out;
+	}
+
+	/* drop packets with old seqnos, however accept the first packet after
+	 * a host has been rebooted.
+	 */
+	if ((seq_diff < 0) && !protection_started)
+		goto out;
+
+	neigh_node->last_seen = jiffies;
+
+	orig_node->last_seen = jiffies;
+
+	orig_ifinfo->last_real_seqno = ntohl(ogm2->seqno);
+	orig_ifinfo->last_ttl = ogm2->ttl;
+
+	neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing);
+	if (!neigh_ifinfo)
+		goto out;
+
+	path_throughput = batadv_v_forward_penalty(bat_priv, if_incoming,
+						   if_outgoing,
+						   ntohl(ogm2->throughput));
+	neigh_ifinfo->bat_v.throughput = path_throughput;
+	neigh_ifinfo->bat_v.last_seqno = ntohl(ogm2->seqno);
+	neigh_ifinfo->last_ttl = ogm2->ttl;
+
+	if (seq_diff > 0 || protection_started)
+		ret = 1;
+	else
+		ret = 0;
+out:
+	if (orig_ifinfo)
+		batadv_orig_ifinfo_put(orig_ifinfo);
+	if (neigh_ifinfo)
+		batadv_neigh_ifinfo_put(neigh_ifinfo);
+
+	return ret;
+}
+
+/**
+ * batadv_v_ogm_route_update - update routes based on OGM
+ * @bat_priv: the bat priv with all the soft interface information
+ * @ethhdr: the Ethernet header of the OGM2
+ * @ogm2: OGM2 structure
+ * @orig_node: Originator structure for which the OGM has been received
+ * @neigh_node: the neigh_node through with the OGM has been received
+ * @if_incoming: the interface where this packet was received
+ * @if_outgoing: the interface for which the packet should be considered
+ */
+static void batadv_v_ogm_route_update(struct batadv_priv *bat_priv,
+				      const struct ethhdr *ethhdr,
+				      const struct batadv_ogm2_packet *ogm2,
+				      struct batadv_orig_node *orig_node,
+				      struct batadv_neigh_node *neigh_node,
+				      struct batadv_hard_iface *if_incoming,
+				      struct batadv_hard_iface *if_outgoing)
+{
+	struct batadv_neigh_node *router = NULL;
+	struct batadv_neigh_ifinfo *neigh_ifinfo = NULL;
+	struct batadv_orig_node *orig_neigh_node = NULL;
+	struct batadv_orig_ifinfo *orig_ifinfo = NULL;
+	struct batadv_neigh_node *orig_neigh_router = NULL;
+
+	neigh_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing);
+	if (!neigh_ifinfo)
+		goto out;
+
+	orig_neigh_node = batadv_v_ogm_orig_get(bat_priv, ethhdr->h_source);
+	if (!orig_neigh_node)
+		goto out;
+
+	orig_neigh_router = batadv_orig_router_get(orig_neigh_node,
+						   if_outgoing);
+
+	/* drop packet if sender is not a direct neighbor and if we
+	 * don't route towards it
+	 */
+	router = batadv_orig_router_get(orig_node, if_outgoing);
+	if (router && router->orig_node != orig_node && !orig_neigh_router) {
+		batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+			   "Drop packet: OGM via unknown neighbor!\n");
+		goto out;
+	}
+
+	if (router)
+		batadv_neigh_node_put(router);
+
+	/* Update routes, and check if the OGM is from the best next hop */
+	batadv_v_ogm_orig_update(bat_priv, orig_node, neigh_node, ogm2,
+				 if_outgoing);
+
+	orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing);
+	if (!orig_ifinfo)
+		goto out;
+
+	/* don't forward the same seqno twice on one interface */
+	if (orig_ifinfo->last_seqno_forwarded == ntohl(ogm2->seqno))
+		goto out;
+
+	/* acquire possibly updated router */
+	router = batadv_orig_router_get(orig_node, if_outgoing);
+
+	/* strict rule: forward packets coming from the best next hop only */
+	if (neigh_node != router)
+		goto out;
+
+	/* only forward for specific interface, not for the default one. */
+	if (if_outgoing != BATADV_IF_DEFAULT) {
+		orig_ifinfo->last_seqno_forwarded = ntohl(ogm2->seqno);
+		batadv_v_ogm_forward(bat_priv, ogm2,
+				     neigh_ifinfo->bat_v.throughput,
+				     if_incoming, if_outgoing);
+	}
+
+out:
+	if (orig_ifinfo)
+		batadv_orig_ifinfo_put(orig_ifinfo);
+	if (router)
+		batadv_neigh_node_put(router);
+	if (orig_neigh_router)
+		batadv_neigh_node_put(orig_neigh_router);
+	if (orig_neigh_node)
+		batadv_orig_node_put(orig_neigh_node);
+	if (neigh_ifinfo)
+		batadv_neigh_ifinfo_put(neigh_ifinfo);
+}
+
+/**
+ * batadv_v_ogm_process_per_outif - process a batman v OGM for an outgoing if
+ * @bat_priv: the bat priv with all the soft interface information
+ * @ethhdr: the Ethernet header of the OGM2
+ * @ogm2: OGM2 structure
+ * @orig_node: Originator structure for which the OGM has been received
+ * @neigh_node: the neigh_node through with the OGM has been received
+ * @if_incoming: the interface where this packet was received
+ * @if_outgoing: the interface for which the packet should be considered
+ */
+static void
+batadv_v_ogm_process_per_outif(struct batadv_priv *bat_priv,
+			       const struct ethhdr *ethhdr,
+			       const struct batadv_ogm2_packet *ogm2,
+			       struct batadv_orig_node *orig_node,
+			       struct batadv_neigh_node *neigh_node,
+			       struct batadv_hard_iface *if_incoming,
+			       struct batadv_hard_iface *if_outgoing)
+{
+	int seqno_age;
+
+	/* first, update the metric with according sanity checks */
+	seqno_age = batadv_v_ogm_metric_update(bat_priv, ogm2, orig_node,
+					       neigh_node, if_incoming,
+					       if_outgoing);
+
+	/* outdated sequence numbers are to be discarded */
+	if (seqno_age < 0)
+		return;
+
+	/* only unknown & newer OGMs contain TVLVs we are interested in */
+	if ((seqno_age > 0) && (if_outgoing == BATADV_IF_DEFAULT))
+		batadv_tvlv_containers_process(bat_priv, true, orig_node,
+					       NULL, NULL,
+					       (unsigned char *)(ogm2 + 1),
+					       ntohs(ogm2->tvlv_len));
+
+	/* if the metric update went through, update routes if needed */
+	batadv_v_ogm_route_update(bat_priv, ethhdr, ogm2, orig_node,
+				  neigh_node, if_incoming, if_outgoing);
+}
+
+/**
+ * batadv_v_ogm_aggr_packet - checks if there is another OGM aggregated
+ * @buff_pos: current position in the skb
+ * @packet_len: total length of the skb
+ * @tvlv_len: tvlv length of the previously considered OGM
+ *
+ * Return: true if there is enough space for another OGM, false otherwise.
+ */
+static bool batadv_v_ogm_aggr_packet(int buff_pos, int packet_len,
+				     __be16 tvlv_len)
+{
+	int next_buff_pos = 0;
+
+	next_buff_pos += buff_pos + BATADV_OGM2_HLEN;
+	next_buff_pos += ntohs(tvlv_len);
+
+	return (next_buff_pos <= packet_len) &&
+	       (next_buff_pos <= BATADV_MAX_AGGREGATION_BYTES);
+}
+
+/**
+ * batadv_v_ogm_process - process an incoming batman v OGM
+ * @skb: the skb containing the OGM
+ * @ogm_offset: offset to the OGM which should be processed (for aggregates)
+ * @if_incoming: the interface where this packet was receved
+ */
+static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset,
+				 struct batadv_hard_iface *if_incoming)
+{
+	struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
+	struct ethhdr *ethhdr;
+	struct batadv_orig_node *orig_node = NULL;
+	struct batadv_hardif_neigh_node *hardif_neigh = NULL;
+	struct batadv_neigh_node *neigh_node = NULL;
+	struct batadv_hard_iface *hard_iface;
+	struct batadv_ogm2_packet *ogm_packet;
+	u32 ogm_throughput, link_throughput, path_throughput;
+
+	ethhdr = eth_hdr(skb);
+	ogm_packet = (struct batadv_ogm2_packet *)(skb->data + ogm_offset);
+
+	ogm_throughput = ntohl(ogm_packet->throughput);
+
+	batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+		   "Received OGM2 packet via NB: %pM, IF: %s [%pM] (from OG: %pM, seqno %u, troughput %u, TTL %u, V %u, tvlv_len %u)\n",
+		   ethhdr->h_source, if_incoming->net_dev->name,
+		   if_incoming->net_dev->dev_addr, ogm_packet->orig,
+		   ntohl(ogm_packet->seqno), ogm_throughput, ogm_packet->ttl,
+		   ogm_packet->version, ntohs(ogm_packet->tvlv_len));
+
+	/* If the troughput metric is 0, immediately drop the packet. No need to
+	 * create orig_node / neigh_node for an unusable route.
+	 */
+	if (ogm_throughput == 0) {
+		batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+			   "Drop packet: originator packet with troughput metric of 0\n");
+		return;
+	}
+
+	/* require ELP packets be to received from this neighbor first */
+	hardif_neigh = batadv_hardif_neigh_get(if_incoming, ethhdr->h_source);
+	if (!hardif_neigh) {
+		batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+			   "Drop packet: OGM via unknown neighbor!\n");
+		goto out;
+	}
+
+	orig_node = batadv_v_ogm_orig_get(bat_priv, ogm_packet->orig);
+	if (!orig_node)
+		return;
+
+	neigh_node = batadv_neigh_node_new(orig_node, if_incoming,
+					   ethhdr->h_source);
+	if (!neigh_node)
+		goto out;
+
+	/* Update the received throughput metric to match the link
+	 * characteristic:
+	 *  - If this OGM traveled one hop so far (emitted by single hop
+	 *    neighbor) the path throughput metric equals the link throughput.
+	 *  - For OGMs traversing more than hop the path throughput metric is
+	 *    the smaller of the path throughput and the link throughput.
+	 */
+	link_throughput = ewma_throughput_read(&hardif_neigh->bat_v.throughput);
+	path_throughput = min_t(u32, link_throughput, ogm_throughput);
+	ogm_packet->throughput = htonl(path_throughput);
+
+	batadv_v_ogm_process_per_outif(bat_priv, ethhdr, ogm_packet, orig_node,
+				       neigh_node, if_incoming,
+				       BATADV_IF_DEFAULT);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
+		if (hard_iface->if_status != BATADV_IF_ACTIVE)
+			continue;
+
+		if (hard_iface->soft_iface != bat_priv->soft_iface)
+			continue;
+
+		batadv_v_ogm_process_per_outif(bat_priv, ethhdr, ogm_packet,
+					       orig_node, neigh_node,
+					       if_incoming, hard_iface);
+	}
+	rcu_read_unlock();
+out:
+	if (orig_node)
+		batadv_orig_node_put(orig_node);
+	if (neigh_node)
+		batadv_neigh_node_put(neigh_node);
+	if (hardif_neigh)
+		batadv_hardif_neigh_put(hardif_neigh);
+}
+
 /**
  * batadv_v_ogm_packet_recv - OGM2 receiving handler
  * @skb: the received OGM
@@ -206,6 +741,9 @@ int batadv_v_ogm_packet_recv(struct sk_buff *skb,
 	struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
 	struct batadv_ogm2_packet *ogm_packet;
 	struct ethhdr *ethhdr = eth_hdr(skb);
+	int ogm_offset;
+	u8 *packet_pos;
+	int ret = NET_RX_DROP;
 
 	/* did we receive a OGM2 packet on an interface that does not have
 	 * B.A.T.M.A.N. V enabled ?
@@ -228,8 +766,24 @@ int batadv_v_ogm_packet_recv(struct sk_buff *skb,
 	batadv_add_counter(bat_priv, BATADV_CNT_MGMT_RX_BYTES,
 			   skb->len + ETH_HLEN);
 
+	ogm_offset = 0;
+	ogm_packet = (struct batadv_ogm2_packet *)skb->data;
+
+	while (batadv_v_ogm_aggr_packet(ogm_offset, skb_headlen(skb),
+					ogm_packet->tvlv_len)) {
+		batadv_v_ogm_process(skb, ogm_offset, if_incoming);
+
+		ogm_offset += BATADV_OGM2_HLEN;
+		ogm_offset += ntohs(ogm_packet->tvlv_len);
+
+		packet_pos = skb->data + ogm_offset;
+		ogm_packet = (struct batadv_ogm2_packet *)packet_pos;
+	}
+
+	ret = NET_RX_SUCCESS;
 	consume_skb(skb);
-	return NET_RX_SUCCESS;
+
+	return ret;
 }
 
 /**
diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h
index d9f178cedcd2..d849c75ada0e 100644
--- a/net/batman-adv/bat_v_ogm.h
+++ b/net/batman-adv/bat_v_ogm.h
@@ -18,6 +18,8 @@
 #ifndef _BATMAN_ADV_BATADV_V_OGM_H_
 #define _BATMAN_ADV_BATADV_V_OGM_H_
 
+#include <linux/types.h>
+
 struct batadv_hard_iface;
 struct batadv_priv;
 struct sk_buff;
@@ -25,6 +27,8 @@ struct sk_buff;
 int batadv_v_ogm_init(struct batadv_priv *bat_priv);
 void batadv_v_ogm_free(struct batadv_priv *bat_priv);
 int batadv_v_ogm_iface_enable(struct batadv_hard_iface *hard_iface);
+struct batadv_orig_node *batadv_v_ogm_orig_get(struct batadv_priv *bat_priv,
+					       const u8 *addr);
 void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface);
 int batadv_v_ogm_packet_recv(struct sk_buff *skb,
 			     struct batadv_hard_iface *if_incoming);
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index fa0b067e589d..2e795446208f 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -63,6 +63,8 @@
 
 /* B.A.T.M.A.N. V */
 #define BATADV_ELP_MAX_AGE 64
+#define BATADV_OGM_MAX_ORIGDIFF 5
+#define BATADV_OGM_MAX_AGE 64
 
 /* number of OGMs sent with the last tt diff */
 #define BATADV_TT_OGM_APPEND_MAX 3
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index e213ccdd4c6c..cf55cd0586b5 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -149,6 +149,7 @@ struct batadv_hard_iface {
  * @router: router that should be used to reach this originator
  * @last_real_seqno: last and best known sequence number
  * @last_ttl: ttl of last received packet
+ * @last_seqno_forwarded: seqno of the OGM which was forwarded last
  * @batman_seqno_reset: time when the batman seqno window was reset
  * @refcount: number of contexts the object is used
  * @rcu: struct used for freeing in an RCU-safe manner
@@ -159,6 +160,7 @@ struct batadv_orig_ifinfo {
 	struct batadv_neigh_node __rcu *router; /* rcu protected pointer */
 	u32 last_real_seqno;
 	u8 last_ttl;
+	u32 last_seqno_forwarded;
 	unsigned long batman_seqno_reset;
 	struct kref refcount;
 	struct rcu_head rcu;
@@ -448,9 +450,11 @@ struct batadv_neigh_ifinfo_bat_iv {
  * struct batadv_neigh_ifinfo_bat_v - neighbor information per outgoing
  *  interface for B.A.T.M.A.N. V
  * @throughput: last throughput metric received from originator via this neigh
+ * @last_seqno: last sequence number known for this neighbor
  */
 struct batadv_neigh_ifinfo_bat_v {
 	u32 throughput;
+	u32 last_seqno;
 };
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From 0b5ecc6811bd576ecc9813bbe069f2293cb1c6aa Mon Sep 17 00:00:00 2001
From: Antonio Quartulli <antonio@open-mesh.com>
Date: Sat, 16 Jan 2016 16:40:14 +0800
Subject: batman-adv: add throughput override attribute to hard_ifaces

This attribute is exported to user space to disable the link
throughput auto-detection by setting a fixed value.
The throughput override value is used when batman-adv is
computing the link throughput towards a neighbour.

If the value is set to 0 then batman-adv will try to detect
the throughput by itself.

Signed-off-by: Antonio Quartulli <antonio@open-mesh.com>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
---
 .../ABI/testing/sysfs-class-net-batman-adv         |  9 +++
 net/batman-adv/bat_v.c                             |  6 ++
 net/batman-adv/gateway_common.c                    |  4 +-
 net/batman-adv/gateway_common.h                    |  2 +
 net/batman-adv/sysfs.c                             | 74 ++++++++++++++++++++++
 net/batman-adv/types.h                             |  2 +
 6 files changed, 95 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/Documentation/ABI/testing/sysfs-class-net-batman-adv b/Documentation/ABI/testing/sysfs-class-net-batman-adv
index aea78db983bc..518f6a1dbc0c 100644
--- a/Documentation/ABI/testing/sysfs-class-net-batman-adv
+++ b/Documentation/ABI/testing/sysfs-class-net-batman-adv
@@ -1,4 +1,13 @@
 
+What:		/sys/class/net/<iface>/batman-adv/throughput_override
+Date:		Feb 2014
+Contact:	Antonio Quartulli <antonio@meshcoding.com>
+description:
+		Defines the throughput value to be used by B.A.T.M.A.N. V
+		when estimating the link throughput using this interface.
+		If the value is set to 0 then batman-adv will try to
+		estimate the throughput by itself.
+
 What:           /sys/class/net/<iface>/batman-adv/elp_interval
 Date:           Feb 2014
 Contact:        Linus Lüssing <linus.luessing@web.de>
diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index b90a4dfe8ba6..d9cb5c4922c1 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -18,6 +18,7 @@
 #include "bat_algo.h"
 #include "main.h"
 
+#include <linux/atomic.h>
 #include <linux/cache.h>
 #include <linux/init.h>
 
@@ -37,6 +38,11 @@ static int batadv_v_iface_enable(struct batadv_hard_iface *hard_iface)
 	if (ret < 0)
 		batadv_v_elp_iface_disable(hard_iface);
 
+	/* enable link throughput auto-detection by setting the throughput
+	 * override to zero
+	 */
+	atomic_set(&hard_iface->bat_v.throughput_override, 0);
+
 	return ret;
 }
 
diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c
index 5ee04f7140af..4423047889e1 100644
--- a/net/batman-adv/gateway_common.c
+++ b/net/batman-adv/gateway_common.c
@@ -40,8 +40,8 @@
  *
  * Return: false on parse error and true otherwise.
  */
-static bool batadv_parse_throughput(struct net_device *net_dev, char *buff,
-				    const char *description, u32 *throughput)
+bool batadv_parse_throughput(struct net_device *net_dev, char *buff,
+			     const char *description, u32 *throughput)
 {
 	enum batadv_bandwidth_units bw_unit_type = BATADV_BW_UNIT_KBIT;
 	u64 lthroughput;
diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h
index b58346350024..8a5e1ddf1175 100644
--- a/net/batman-adv/gateway_common.h
+++ b/net/batman-adv/gateway_common.h
@@ -49,5 +49,7 @@ ssize_t batadv_gw_bandwidth_set(struct net_device *net_dev, char *buff,
 void batadv_gw_tvlv_container_update(struct batadv_priv *bat_priv);
 void batadv_gw_init(struct batadv_priv *bat_priv);
 void batadv_gw_free(struct batadv_priv *bat_priv);
+bool batadv_parse_throughput(struct net_device *net_dev, char *buff,
+			     const char *description, u32 *throughput);
 
 #endif /* _NET_BATMAN_ADV_GATEWAY_COMMON_H_ */
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index e86014332e1c..e7cf51333a36 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -917,12 +917,85 @@ static ssize_t batadv_show_iface_status(struct kobject *kobj,
 	return length;
 }
 
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+
+/**
+ * batadv_store_throughput_override - parse and store throughput override
+ *  entered by the user
+ * @kobj: kobject representing the private mesh sysfs directory
+ * @attr: the batman-adv attribute the user is interacting with
+ * @buff: the buffer containing the user data
+ * @count: number of bytes in the buffer
+ *
+ * Return: 'count' on success or a negative error code in case of failure
+ */
+static ssize_t batadv_store_throughput_override(struct kobject *kobj,
+						struct attribute *attr,
+						char *buff, size_t count)
+{
+	struct net_device *net_dev = batadv_kobj_to_netdev(kobj);
+	struct batadv_hard_iface *hard_iface;
+	u32 tp_override;
+	u32 old_tp_override;
+	bool ret;
+
+	hard_iface = batadv_hardif_get_by_netdev(net_dev);
+	if (!hard_iface)
+		return -EINVAL;
+
+	if (buff[count - 1] == '\n')
+		buff[count - 1] = '\0';
+
+	ret = batadv_parse_throughput(net_dev, buff, "throughput_override",
+				      &tp_override);
+	if (!ret)
+		return count;
+
+	old_tp_override = atomic_read(&hard_iface->bat_v.throughput_override);
+	if (old_tp_override == tp_override)
+		goto out;
+
+	batadv_info(net_dev, "%s: Changing from: %u.%u MBit to: %u.%u MBit\n",
+		    "throughput_override",
+		    old_tp_override / 10, old_tp_override % 10,
+		    tp_override / 10, tp_override % 10);
+
+	atomic_set(&hard_iface->bat_v.throughput_override, tp_override);
+
+out:
+	batadv_hardif_put(hard_iface);
+	return count;
+}
+
+static ssize_t batadv_show_throughput_override(struct kobject *kobj,
+					       struct attribute *attr,
+					       char *buff)
+{
+	struct net_device *net_dev = batadv_kobj_to_netdev(kobj);
+	struct batadv_hard_iface *hard_iface;
+	u32 tp_override;
+
+	hard_iface = batadv_hardif_get_by_netdev(net_dev);
+	if (!hard_iface)
+		return -EINVAL;
+
+	tp_override = atomic_read(&hard_iface->bat_v.throughput_override);
+
+	return sprintf(buff, "%u.%u MBit\n", tp_override / 10,
+		       tp_override % 10);
+}
+
+#endif
+
 static BATADV_ATTR(mesh_iface, S_IRUGO | S_IWUSR, batadv_show_mesh_iface,
 		   batadv_store_mesh_iface);
 static BATADV_ATTR(iface_status, S_IRUGO, batadv_show_iface_status, NULL);
 #ifdef CONFIG_BATMAN_ADV_BATMAN_V
 BATADV_ATTR_HIF_UINT(elp_interval, bat_v.elp_interval, S_IRUGO | S_IWUSR,
 		     2 * BATADV_JITTER, INT_MAX, NULL);
+static BATADV_ATTR(throughput_override, S_IRUGO | S_IWUSR,
+		   batadv_show_throughput_override,
+		   batadv_store_throughput_override);
 #endif
 
 static struct batadv_attribute *batadv_batman_attrs[] = {
@@ -930,6 +1003,7 @@ static struct batadv_attribute *batadv_batman_attrs[] = {
 	&batadv_attr_iface_status,
 #ifdef CONFIG_BATMAN_ADV_BATMAN_V
 	&batadv_attr_elp_interval,
+	&batadv_attr_throughput_override,
 #endif
 	NULL,
 };
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index cf55cd0586b5..a6f0952d2840 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -92,12 +92,14 @@ struct batadv_hard_iface_bat_iv {
  * @elp_seqno: current ELP sequence number
  * @elp_skb: base skb containing the ELP message to send
  * @elp_wq: workqueue used to schedule ELP transmissions
+ * @throughput_override: throughput override to disable link auto-detection
  */
 struct batadv_hard_iface_bat_v {
 	atomic_t elp_interval;
 	atomic_t elp_seqno;
 	struct sk_buff *elp_skb;
 	struct delayed_work elp_wq;
+	atomic_t throughput_override;
 };
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From 95d392784dd0a51e4216e075f04a68c922745985 Mon Sep 17 00:00:00 2001
From: Antonio Quartulli <antonio@open-mesh.com>
Date: Sat, 16 Jan 2016 16:40:15 +0800
Subject: batman-adv: keep track of when unicast packets are sent

To enable ELP to send probing packets over wireless links
only if needed, batman-adv must keep track of the last time
it sent a unicast packet towards every neighbour.

For this purpose a 2 main changes are introduced:
1) a new member of the elp_neigh_node structure stores the
   last time a unicast packet was sent towards this neighbour;
2) a wrapper function for sending unicast packets is
   implemented. This function will simply update the member
   describe din point 1) and then forward the packet to the
   real sending routine.

Point 2) implies that any code-path leading to a unicast
sending now has to use the new wrapper.

Signed-off-by: Antonio Quartulli <antonio@open-mesh.com>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
---
 net/batman-adv/bat_iv_ogm.c            |  2 +-
 net/batman-adv/bat_v_elp.c             |  2 +-
 net/batman-adv/bat_v_ogm.c             |  2 +-
 net/batman-adv/distributed-arp-table.c |  4 +--
 net/batman-adv/fragmentation.c         |  8 ++---
 net/batman-adv/icmp_socket.c           |  2 +-
 net/batman-adv/network-coding.c        | 22 +++++++-------
 net/batman-adv/send.c                  | 55 +++++++++++++++++++++++++++++-----
 net/batman-adv/send.h                  | 10 +++++--
 net/batman-adv/types.h                 |  2 ++
 10 files changed, 75 insertions(+), 34 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 5651e33ca6bd..cb2d1b9b0340 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -478,7 +478,7 @@ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet,
 		batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_TX);
 		batadv_add_counter(bat_priv, BATADV_CNT_MGMT_TX_BYTES,
 				   skb->len + ETH_HLEN);
-		batadv_send_skb_packet(skb, hard_iface, batadv_broadcast_addr);
+		batadv_send_broadcast_skb(skb, hard_iface);
 	}
 }
 
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index fac5aee7ce05..1e4d13cc267f 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -104,7 +104,7 @@ static void batadv_v_elp_periodic_work(struct work_struct *work)
 		   hard_iface->net_dev->name,
 		   atomic_read(&hard_iface->bat_v.elp_seqno));
 
-	batadv_send_skb_packet(skb, hard_iface, batadv_broadcast_addr);
+	batadv_send_broadcast_skb(skb, hard_iface);
 
 	atomic_inc(&hard_iface->bat_v.elp_seqno);
 
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
index 22dda0e079a5..c8000962a266 100644
--- a/net/batman-adv/bat_v_ogm.c
+++ b/net/batman-adv/bat_v_ogm.c
@@ -120,7 +120,7 @@ static void batadv_v_ogm_send_to_if(struct sk_buff *skb,
 	batadv_add_counter(bat_priv, BATADV_CNT_MGMT_TX_BYTES,
 			   skb->len + ETH_HLEN);
 
-	batadv_send_skb_packet(skb, hard_iface, batadv_broadcast_addr);
+	batadv_send_broadcast_skb(skb, hard_iface);
 }
 
 /**
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index 4c9b69d465a6..e96d7c745b4a 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -654,9 +654,7 @@ static bool batadv_dat_send_data(struct batadv_priv *bat_priv,
 			goto free_neigh;
 		}
 
-		send_status = batadv_send_skb_packet(tmp_skb,
-						     neigh_node->if_incoming,
-						     neigh_node->addr);
+		send_status = batadv_send_unicast_skb(tmp_skb, neigh_node);
 		if (send_status == NET_XMIT_SUCCESS) {
 			/* count the sent packet */
 			switch (packet_subtype) {
diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index adb9c3989add..e6956d0746a2 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -378,8 +378,7 @@ bool batadv_frag_skb_fwd(struct sk_buff *skb,
 				   skb->len + ETH_HLEN);
 
 		packet->ttl--;
-		batadv_send_skb_packet(skb, neigh_node->if_incoming,
-				       neigh_node->addr);
+		batadv_send_unicast_skb(skb, neigh_node);
 		ret = true;
 	}
 
@@ -486,8 +485,7 @@ bool batadv_frag_send_packet(struct sk_buff *skb,
 		batadv_inc_counter(bat_priv, BATADV_CNT_FRAG_TX);
 		batadv_add_counter(bat_priv, BATADV_CNT_FRAG_TX_BYTES,
 				   skb_fragment->len + ETH_HLEN);
-		batadv_send_skb_packet(skb_fragment, neigh_node->if_incoming,
-				       neigh_node->addr);
+		batadv_send_unicast_skb(skb_fragment, neigh_node);
 		frag_header.no++;
 
 		/* The initial check in this function should cover this case */
@@ -506,7 +504,7 @@ bool batadv_frag_send_packet(struct sk_buff *skb,
 	batadv_inc_counter(bat_priv, BATADV_CNT_FRAG_TX);
 	batadv_add_counter(bat_priv, BATADV_CNT_FRAG_TX_BYTES,
 			   skb->len + ETH_HLEN);
-	batadv_send_skb_packet(skb, neigh_node->if_incoming, neigh_node->addr);
+	batadv_send_unicast_skb(skb, neigh_node);
 
 	ret = true;
 
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
index 6268f08b7154..14d0013b387e 100644
--- a/net/batman-adv/icmp_socket.c
+++ b/net/batman-adv/icmp_socket.c
@@ -278,7 +278,7 @@ static ssize_t batadv_socket_write(struct file *file, const char __user *buff,
 
 	ether_addr_copy(icmp_header->orig, primary_if->net_dev->dev_addr);
 
-	batadv_send_skb_packet(skb, neigh_node->if_incoming, neigh_node->addr);
+	batadv_send_unicast_skb(skb, neigh_node);
 	goto out;
 
 dst_unreach:
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index d253bb23e2ac..b41719b6487a 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -575,9 +575,7 @@ batadv_nc_hash_find(struct batadv_hashtable *hash,
  */
 static void batadv_nc_send_packet(struct batadv_nc_packet *nc_packet)
 {
-	batadv_send_skb_packet(nc_packet->skb,
-			       nc_packet->neigh_node->if_incoming,
-			       nc_packet->nc_path->next_hop);
+	batadv_send_unicast_skb(nc_packet->skb, nc_packet->neigh_node);
 	nc_packet->skb = NULL;
 	batadv_nc_packet_free(nc_packet);
 }
@@ -1067,11 +1065,11 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv,
 	struct batadv_unicast_packet *packet1;
 	struct batadv_unicast_packet *packet2;
 	struct batadv_coded_packet *coded_packet;
-	struct batadv_neigh_node *neigh_tmp, *router_neigh;
-	struct batadv_neigh_node *router_coding = NULL;
+	struct batadv_neigh_node *neigh_tmp, *router_neigh, *first_dest;
+	struct batadv_neigh_node *router_coding = NULL, *second_dest;
 	struct batadv_neigh_ifinfo *router_neigh_ifinfo = NULL;
 	struct batadv_neigh_ifinfo *router_coding_ifinfo = NULL;
-	u8 *first_source, *first_dest, *second_source, *second_dest;
+	u8 *first_source, *second_source;
 	__be32 packet_id1, packet_id2;
 	size_t count;
 	bool res = false;
@@ -1114,9 +1112,9 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv,
 	 */
 	if (tq_weighted_neigh >= tq_weighted_coding) {
 		/* Destination from nc_packet is selected for MAC-header */
-		first_dest = nc_packet->nc_path->next_hop;
+		first_dest = nc_packet->neigh_node;
 		first_source = nc_packet->nc_path->prev_hop;
-		second_dest = neigh_node->addr;
+		second_dest = neigh_node;
 		second_source = ethhdr->h_source;
 		packet1 = (struct batadv_unicast_packet *)nc_packet->skb->data;
 		packet2 = (struct batadv_unicast_packet *)skb->data;
@@ -1125,9 +1123,9 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv,
 					      skb->data + sizeof(*packet2));
 	} else {
 		/* Destination for skb is selected for MAC-header */
-		first_dest = neigh_node->addr;
+		first_dest = neigh_node;
 		first_source = ethhdr->h_source;
-		second_dest = nc_packet->nc_path->next_hop;
+		second_dest = nc_packet->neigh_node;
 		second_source = nc_packet->nc_path->prev_hop;
 		packet1 = (struct batadv_unicast_packet *)skb->data;
 		packet2 = (struct batadv_unicast_packet *)nc_packet->skb->data;
@@ -1169,7 +1167,7 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv,
 	coded_packet->first_ttvn = packet1->ttvn;
 
 	/* Info about second unicast packet */
-	ether_addr_copy(coded_packet->second_dest, second_dest);
+	ether_addr_copy(coded_packet->second_dest, second_dest->addr);
 	ether_addr_copy(coded_packet->second_source, second_source);
 	ether_addr_copy(coded_packet->second_orig_dest, packet2->dest);
 	coded_packet->second_crc = packet_id2;
@@ -1224,7 +1222,7 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv,
 	batadv_nc_packet_free(nc_packet);
 
 	/* Send the coded packet and return true */
-	batadv_send_skb_packet(skb_dest, neigh_node->if_incoming, first_dest);
+	batadv_send_unicast_skb(skb_dest, first_dest);
 	res = true;
 out:
 	if (router_neigh)
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index caff32cf6fe7..3ce06e0a91b1 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -49,16 +49,30 @@
 
 static void batadv_send_outstanding_bcast_packet(struct work_struct *work);
 
-/* send out an already prepared packet to the given address via the
- * specified batman interface
+/**
+ * batadv_send_skb_packet - send an already prepared packet
+ * @skb: the packet to send
+ * @hard_iface: the interface to use to send the broadcast packet
+ * @dst_addr: the payload destination
+ *
+ * Send out an already prepared packet to the given neighbor or broadcast it
+ * using the specified interface. Either hard_iface or neigh_node must be not
+ * NULL.
+ * If neigh_node is NULL, then the packet is broadcasted using hard_iface,
+ * otherwise it is sent as unicast to the given neighbor.
+ *
+ * Return: NET_TX_DROP in case of error or the result of dev_queue_xmit(skb)
+ * otherwise
  */
 int batadv_send_skb_packet(struct sk_buff *skb,
 			   struct batadv_hard_iface *hard_iface,
 			   const u8 *dst_addr)
 {
-	struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+	struct batadv_priv *bat_priv;
 	struct ethhdr *ethhdr;
 
+	bat_priv = netdev_priv(hard_iface->soft_iface);
+
 	if (hard_iface->if_status != BATADV_IF_ACTIVE)
 		goto send_skb_err;
 
@@ -100,6 +114,35 @@ send_skb_err:
 	return NET_XMIT_DROP;
 }
 
+int batadv_send_broadcast_skb(struct sk_buff *skb,
+			      struct batadv_hard_iface *hard_iface)
+{
+	return batadv_send_skb_packet(skb, hard_iface, batadv_broadcast_addr);
+}
+
+int batadv_send_unicast_skb(struct sk_buff *skb,
+			    struct batadv_neigh_node *neigh)
+{
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+	struct batadv_hardif_neigh_node *hardif_neigh;
+#endif
+	int ret;
+
+	ret = batadv_send_skb_packet(skb, neigh->if_incoming, neigh->addr);
+
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+	hardif_neigh = batadv_hardif_neigh_get(neigh->if_incoming, neigh->addr);
+
+	if ((hardif_neigh) && (ret != NET_XMIT_DROP))
+		hardif_neigh->bat_v.last_unicast_tx = jiffies;
+
+	if (hardif_neigh)
+		batadv_hardif_neigh_put(hardif_neigh);
+#endif
+
+	return ret;
+}
+
 /**
  * batadv_send_skb_to_orig - Lookup next-hop and transmit skb.
  * @skb: Packet to be transmitted.
@@ -146,8 +189,7 @@ int batadv_send_skb_to_orig(struct sk_buff *skb,
 	if (recv_if && batadv_nc_skb_forward(skb, neigh_node)) {
 		ret = NET_XMIT_POLICED;
 	} else {
-		batadv_send_skb_packet(skb, neigh_node->if_incoming,
-				       neigh_node->addr);
+		batadv_send_unicast_skb(skb, neigh_node);
 		ret = NET_XMIT_SUCCESS;
 	}
 
@@ -538,8 +580,7 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work)
 		/* send a copy of the saved skb */
 		skb1 = skb_clone(forw_packet->skb, GFP_ATOMIC);
 		if (skb1)
-			batadv_send_skb_packet(skb1, hard_iface,
-					       batadv_broadcast_addr);
+			batadv_send_broadcast_skb(skb1, hard_iface);
 	}
 	rcu_read_unlock();
 
diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h
index 7ff95cada2e7..6fd7270d8ce6 100644
--- a/net/batman-adv/send.h
+++ b/net/batman-adv/send.h
@@ -28,12 +28,16 @@
 struct sk_buff;
 struct work_struct;
 
-int batadv_send_skb_packet(struct sk_buff *skb,
-			   struct batadv_hard_iface *hard_iface,
-			   const u8 *dst_addr);
 int batadv_send_skb_to_orig(struct sk_buff *skb,
 			    struct batadv_orig_node *orig_node,
 			    struct batadv_hard_iface *recv_if);
+int batadv_send_skb_packet(struct sk_buff *skb,
+			   struct batadv_hard_iface *hard_iface,
+			   const u8 *dst_addr);
+int batadv_send_broadcast_skb(struct sk_buff *skb,
+			      struct batadv_hard_iface *hard_iface);
+int batadv_send_unicast_skb(struct sk_buff *skb,
+			    struct batadv_neigh_node *neigh_node);
 void batadv_schedule_bat_ogm(struct batadv_hard_iface *hard_iface);
 int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv,
 				    const struct sk_buff *skb,
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index a6f0952d2840..c56bb8835e60 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -377,11 +377,13 @@ DECLARE_EWMA(throughput, 1024, 8)
  * @throughput: ewma link throughput towards this neighbor
  * @elp_interval: time interval between two ELP transmissions
  * @elp_latest_seqno: latest and best known ELP sequence number
+ * @last_unicast_tx: when the last unicast packet has been sent to this neighbor
  */
 struct batadv_hardif_neigh_node_bat_v {
 	struct ewma_throughput throughput;
 	u32 elp_interval;
 	u32 elp_latest_seqno;
+	unsigned long last_unicast_tx;
 };
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From c833484e5f3872a38fe232c663586069d5ad9645 Mon Sep 17 00:00:00 2001
From: Antonio Quartulli <antonio@open-mesh.com>
Date: Tue, 10 Nov 2015 18:50:51 +0100
Subject: batman-adv: ELP - compute the metric based on the estimated
 throughput

In case of wireless interface retrieve the throughput by
querying cfg80211. To perform this call a separate work
must be scheduled because the function may sleep and this
is not allowed within an RCU protected context (RCU in this
case is used to iterate over all the neighbours).

Use ethtool to retrieve information about an Ethernet link
like HALF/FULL_DUPLEX and advertised bandwidth (e.g.
100/10Mbps).

The metric is updated each time a new ELP packet is sent,
this way it is possible to timely react to a metric
variation which can imply (for example) a neighbour
disconnection.

Signed-off-by: Antonio Quartulli <antonio@open-mesh.com>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
---
 net/batman-adv/Kconfig     |   2 +-
 net/batman-adv/bat_v.c     |   3 +
 net/batman-adv/bat_v_elp.c | 140 +++++++++++++++++++++++++++++++++++++++++++++
 net/batman-adv/bat_v_elp.h |   2 +
 net/batman-adv/bat_v_ogm.c |   2 +-
 net/batman-adv/main.h      |   1 +
 net/batman-adv/types.h     |  16 ++++++
 7 files changed, 164 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
index 5c148a8489da..e651dc927bfd 100644
--- a/net/batman-adv/Kconfig
+++ b/net/batman-adv/Kconfig
@@ -17,7 +17,7 @@ config BATMAN_ADV
 
 config BATMAN_ADV_BATMAN_V
 	bool "B.A.T.M.A.N. V protocol (experimental)"
-	depends on BATMAN_ADV
+	depends on BATMAN_ADV && CFG80211
 	default n
 	help
 	  This option enables the B.A.T.M.A.N. V protocol, the successor
diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index d9cb5c4922c1..ff31f2af9cfe 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -21,6 +21,7 @@
 #include <linux/atomic.h>
 #include <linux/cache.h>
 #include <linux/init.h>
+#include <linux/workqueue.h>
 
 #include "bat_v_elp.h"
 #include "bat_v_ogm.h"
@@ -65,6 +66,8 @@ static void
 batadv_v_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh)
 {
 	ewma_throughput_init(&hardif_neigh->bat_v.throughput);
+	INIT_WORK(&hardif_neigh->bat_v.metric_work,
+		  batadv_v_elp_throughput_metric_update);
 }
 
 static void batadv_v_ogm_schedule(struct batadv_hard_iface *hard_iface)
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index 1e4d13cc267f..8376730ebd69 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -22,19 +22,23 @@
 #include <linux/byteorder/generic.h>
 #include <linux/errno.h>
 #include <linux/etherdevice.h>
+#include <linux/ethtool.h>
 #include <linux/fs.h>
 #include <linux/if_ether.h>
 #include <linux/jiffies.h>
 #include <linux/kernel.h>
+#include <linux/kref.h>
 #include <linux/netdevice.h>
 #include <linux/random.h>
 #include <linux/rculist.h>
 #include <linux/rcupdate.h>
+#include <linux/rtnetlink.h>
 #include <linux/skbuff.h>
 #include <linux/stddef.h>
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
+#include <net/cfg80211.h>
 
 #include "bat_algo.h"
 #include "bat_v_ogm.h"
@@ -59,6 +63,107 @@ static void batadv_v_elp_start_timer(struct batadv_hard_iface *hard_iface)
 			   msecs_to_jiffies(msecs));
 }
 
+/**
+ * batadv_v_elp_get_throughput - get the throughput towards a neighbour
+ * @neigh: the neighbour for which the throughput has to be obtained
+ *
+ * Return: The throughput towards the given neighbour in multiples of 100kpbs
+ *         (a value of '1' equals to 0.1Mbps, '10' equals 1Mbps, etc).
+ */
+static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh)
+{
+	struct batadv_hard_iface *hard_iface = neigh->if_incoming;
+	struct ethtool_link_ksettings link_settings;
+	struct station_info sinfo;
+	u32 throughput;
+	int ret;
+
+	/* if the user specified a customised value for this interface, then
+	 * return it directly
+	 */
+	throughput =  atomic_read(&hard_iface->bat_v.throughput_override);
+	if (throughput != 0)
+		return throughput;
+
+	/* if this is a wireless device, then ask its throughput through
+	 * cfg80211 API
+	 */
+	if (batadv_is_wifi_netdev(hard_iface->net_dev)) {
+		if (hard_iface->net_dev->ieee80211_ptr) {
+			ret = cfg80211_get_station(hard_iface->net_dev,
+						   neigh->addr, &sinfo);
+			if (ret == -ENOENT) {
+				/* Node is not associated anymore! It would be
+				 * possible to delete this neighbor. For now set
+				 * the throughput metric to 0.
+				 */
+				return 0;
+			}
+			if (!ret)
+				return sinfo.expected_throughput / 100;
+		}
+
+		/* unsupported WiFi driver version */
+		goto default_throughput;
+	}
+
+	/* if not a wifi interface, check if this device provides data via
+	 * ethtool (e.g. an Ethernet adapter)
+	 */
+	memset(&link_settings, 0, sizeof(link_settings));
+	rtnl_lock();
+	ret = __ethtool_get_link_ksettings(hard_iface->net_dev, &link_settings);
+	rtnl_unlock();
+	if (ret == 0) {
+		/* link characteristics might change over time */
+		if (link_settings.base.duplex == DUPLEX_FULL)
+			hard_iface->bat_v.flags |= BATADV_FULL_DUPLEX;
+		else
+			hard_iface->bat_v.flags &= ~BATADV_FULL_DUPLEX;
+
+		throughput = link_settings.base.speed;
+		if (throughput && (throughput != SPEED_UNKNOWN))
+			return throughput * 10;
+	}
+
+default_throughput:
+	if (!(hard_iface->bat_v.flags & BATADV_WARNING_DEFAULT)) {
+		batadv_info(hard_iface->soft_iface,
+			    "WiFi driver or ethtool info does not provide information about link speeds on interface %s, therefore defaulting to hardcoded throughput values of %u.%1u Mbps. Consider overriding the throughput manually or checking your driver.\n",
+			    hard_iface->net_dev->name,
+			    BATADV_THROUGHPUT_DEFAULT_VALUE / 10,
+			    BATADV_THROUGHPUT_DEFAULT_VALUE % 10);
+		hard_iface->bat_v.flags |= BATADV_WARNING_DEFAULT;
+	}
+
+	/* if none of the above cases apply, return the base_throughput */
+	return BATADV_THROUGHPUT_DEFAULT_VALUE;
+}
+
+/**
+ * batadv_v_elp_throughput_metric_update - worker updating the throughput metric
+ *  of a single hop neighbour
+ * @work: the work queue item
+ */
+void batadv_v_elp_throughput_metric_update(struct work_struct *work)
+{
+	struct batadv_hardif_neigh_node_bat_v *neigh_bat_v;
+	struct batadv_hardif_neigh_node *neigh;
+
+	neigh_bat_v = container_of(work, struct batadv_hardif_neigh_node_bat_v,
+				   metric_work);
+	neigh = container_of(neigh_bat_v, struct batadv_hardif_neigh_node,
+			     bat_v);
+
+	ewma_throughput_add(&neigh->bat_v.throughput,
+			    batadv_v_elp_get_throughput(neigh));
+
+	/* decrement refcounter to balance increment performed before scheduling
+	 * this task
+	 */
+	batadv_hardif_neigh_put(neigh);
+}
+
 /**
  * batadv_v_elp_periodic_work - ELP periodic task per interface
  * @work: work queue item
@@ -67,6 +172,7 @@ static void batadv_v_elp_start_timer(struct batadv_hard_iface *hard_iface)
  */
 static void batadv_v_elp_periodic_work(struct work_struct *work)
 {
+	struct batadv_hardif_neigh_node *hardif_neigh;
 	struct batadv_hard_iface *hard_iface;
 	struct batadv_hard_iface_bat_v *bat_v;
 	struct batadv_elp_packet *elp_packet;
@@ -108,6 +214,31 @@ static void batadv_v_elp_periodic_work(struct work_struct *work)
 
 	atomic_inc(&hard_iface->bat_v.elp_seqno);
 
+	/* The throughput metric is updated on each sent packet. This way, if a
+	 * node is dead and no longer sends packets, batman-adv is still able to
+	 * react timely to its death.
+	 *
+	 * The throughput metric is updated by following these steps:
+	 * 1) if the hard_iface is wifi => send a number of unicast ELPs for
+	 *    probing/sampling to each neighbor
+	 * 2) update the throughput metric value of each neighbor (note that the
+	 *    value retrieved in this step might be 100ms old because the
+	 *    probing packets at point 1) could still be in the HW queue)
+	 */
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(hardif_neigh, &hard_iface->neigh_list, list) {
+		if (!kref_get_unless_zero(&hardif_neigh->refcount))
+			continue;
+
+		/* Reading the estimated throughput from cfg80211 is a task that
+		 * may sleep and that is not allowed in an rcu protected
+		 * context. Therefore schedule a task for that.
+		 */
+		queue_work(batadv_event_workqueue,
+			   &hardif_neigh->bat_v.metric_work);
+	}
+	rcu_read_unlock();
+
 restart_timer:
 	batadv_v_elp_start_timer(hard_iface);
 out:
@@ -146,6 +277,15 @@ int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface)
 	atomic_set(&hard_iface->bat_v.elp_seqno, random_seqno);
 	atomic_set(&hard_iface->bat_v.elp_interval, 500);
 
+	/* assume full-duplex by default */
+	hard_iface->bat_v.flags |= BATADV_FULL_DUPLEX;
+
+	/* warn the user (again) if there is no throughput data is available */
+	hard_iface->bat_v.flags &= ~BATADV_WARNING_DEFAULT;
+
+	if (batadv_is_wifi_netdev(hard_iface->net_dev))
+		hard_iface->bat_v.flags &= ~BATADV_FULL_DUPLEX;
+
 	INIT_DELAYED_WORK(&hard_iface->bat_v.elp_wq,
 			  batadv_v_elp_periodic_work);
 	batadv_v_elp_start_timer(hard_iface);
diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h
index 5a7bc398a9ef..e95f1bca0785 100644
--- a/net/batman-adv/bat_v_elp.h
+++ b/net/batman-adv/bat_v_elp.h
@@ -21,11 +21,13 @@
 #define _NET_BATMAN_ADV_BAT_V_ELP_H_
 
 struct sk_buff;
+struct work_struct;
 
 int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface);
 void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface);
 void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface);
 int batadv_v_elp_packet_recv(struct sk_buff *skb,
 			     struct batadv_hard_iface *if_incoming);
+void batadv_v_elp_throughput_metric_update(struct work_struct *work);
 
 #endif /* _NET_BATMAN_ADV_BAT_V_ELP_H_ */
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
index c8000962a266..d9bcbe6e7d65 100644
--- a/net/batman-adv/bat_v_ogm.c
+++ b/net/batman-adv/bat_v_ogm.c
@@ -339,7 +339,7 @@ static u32 batadv_v_forward_penalty(struct batadv_priv *bat_priv,
 	 */
 	if ((throughput > 10) &&
 	    (if_incoming == if_outgoing) &&
-	    (batadv_is_wifi_netdev(if_incoming->net_dev)))
+	    !(if_incoming->bat_v.flags & BATADV_FULL_DUPLEX))
 		return throughput / 2;
 
 	/* hop penalty of 255 equals 100% */
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 2e795446208f..5c68bf2618c7 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -62,6 +62,7 @@
 #define BATADV_TQ_TOTAL_BIDRECT_LIMIT 1
 
 /* B.A.T.M.A.N. V */
+#define BATADV_THROUGHPUT_DEFAULT_VALUE 10 /* 1 Mbps */
 #define BATADV_ELP_MAX_AGE 64
 #define BATADV_OGM_MAX_ORIGDIFF 5
 #define BATADV_OGM_MAX_AGE 64
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index c56bb8835e60..9abfb3e73c34 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -86,6 +86,18 @@ struct batadv_hard_iface_bat_iv {
 	atomic_t ogm_seqno;
 };
 
+/**
+ * enum batadv_v_hard_iface_flags - interface flags useful to B.A.T.M.A.N. V
+ * @BATADV_FULL_DUPLEX: tells if the connection over this link is full-duplex
+ * @BATADV_WARNING_DEFAULT: tells whether we have warned the user that no
+ *  throughput data is available for this interface and that default values are
+ *  assumed.
+ */
+enum batadv_v_hard_iface_flags {
+	BATADV_FULL_DUPLEX	= BIT(0),
+	BATADV_WARNING_DEFAULT	= BIT(1),
+};
+
 /**
  * struct batadv_hard_iface_bat_v - per hard-interface B.A.T.M.A.N. V data
  * @elp_interval: time interval between two ELP transmissions
@@ -93,6 +105,7 @@ struct batadv_hard_iface_bat_iv {
  * @elp_skb: base skb containing the ELP message to send
  * @elp_wq: workqueue used to schedule ELP transmissions
  * @throughput_override: throughput override to disable link auto-detection
+ * @flags: interface specific flags
  */
 struct batadv_hard_iface_bat_v {
 	atomic_t elp_interval;
@@ -100,6 +113,7 @@ struct batadv_hard_iface_bat_v {
 	struct sk_buff *elp_skb;
 	struct delayed_work elp_wq;
 	atomic_t throughput_override;
+	u8 flags;
 };
 
 /**
@@ -378,12 +392,14 @@ DECLARE_EWMA(throughput, 1024, 8)
  * @elp_interval: time interval between two ELP transmissions
  * @elp_latest_seqno: latest and best known ELP sequence number
  * @last_unicast_tx: when the last unicast packet has been sent to this neighbor
+ * @metric_work: work queue callback item for metric update
  */
 struct batadv_hardif_neigh_node_bat_v {
 	struct ewma_throughput throughput;
 	u32 elp_interval;
 	u32 elp_latest_seqno;
 	unsigned long last_unicast_tx;
+	struct work_struct metric_work;
 };
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From 8d2d499e08145d9851097e1241ef15aad8c9170a Mon Sep 17 00:00:00 2001
From: Antonio Quartulli <antonio@open-mesh.com>
Date: Tue, 10 Nov 2015 18:51:22 +0100
Subject: batman-adv: ELP - send unicast ELP packets for throughput sampling

In case of an unused wireless link, the mac80211 throughput estimation
won't get updated further. Consequently, the reported throughput metric
will become obsolete.

With this patch unicast sampling is introduced by periodically sending
unicast ELP packets to each neighbor on idle WiFi links. These sampling
packets will fill an entire frame, so that the measurement is as
reliable as possible

Signed-off-by: Antonio Quartulli <antonio@open-mesh.com>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
---
 net/batman-adv/bat_v_elp.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++
 net/batman-adv/main.h      |  3 ++
 2 files changed, 72 insertions(+)

(limited to 'net')

diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index 8376730ebd69..3844e7efd0b0 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -164,6 +164,69 @@ void batadv_v_elp_throughput_metric_update(struct work_struct *work)
 	batadv_hardif_neigh_put(neigh);
 }
 
+/**
+ * batadv_v_elp_wifi_neigh_probe - send link probing packets to a neighbour
+ * @neigh: the neighbour to probe
+ *
+ * Sends a predefined number of unicast wifi packets to a given neighbour in
+ * order to trigger the throughput estimation on this link by the RC algorithm.
+ * Packets are sent only if there there is not enough payload unicast traffic
+ * towards this neighbour..
+ *
+ * Return: True on success and false in case of error during skb preparation.
+ */
+static bool
+batadv_v_elp_wifi_neigh_probe(struct batadv_hardif_neigh_node *neigh)
+{
+	struct batadv_hard_iface *hard_iface = neigh->if_incoming;
+	struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+	unsigned long last_tx_diff;
+	struct sk_buff *skb;
+	int probe_len, i;
+	int elp_skb_len;
+
+	/* this probing routine is for Wifi neighbours only */
+	if (!batadv_is_wifi_netdev(hard_iface->net_dev))
+		return true;
+
+	/* probe the neighbor only if no unicast packets have been sent
+	 * to it in the last 100 milliseconds: this is the rate control
+	 * algorithm sampling interval (minstrel). In this way, if not
+	 * enough traffic has been sent to the neighbor, batman-adv can
+	 * generate 2 probe packets and push the RC algorithm to perform
+	 * the sampling
+	 */
+	last_tx_diff = jiffies_to_msecs(jiffies - neigh->bat_v.last_unicast_tx);
+	if (last_tx_diff <= BATADV_ELP_PROBE_MAX_TX_DIFF)
+		return true;
+
+	probe_len = max_t(int, sizeof(struct batadv_elp_packet),
+			  BATADV_ELP_MIN_PROBE_SIZE);
+
+	for (i = 0; i < BATADV_ELP_PROBES_PER_NODE; i++) {
+		elp_skb_len = hard_iface->bat_v.elp_skb->len;
+		skb = skb_copy_expand(hard_iface->bat_v.elp_skb, 0,
+				      probe_len - elp_skb_len,
+				      GFP_ATOMIC);
+		if (!skb)
+			return false;
+
+		/* Tell the skb to get as big as the allocated space (we want
+		 * the packet to be exactly of that size to make the link
+		 * throughput estimation effective.
+		 */
+		skb_put(skb, probe_len - hard_iface->bat_v.elp_skb->len);
+
+		batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+			   "Sending unicast (probe) ELP packet on interface %s to %pM\n",
+			   hard_iface->net_dev->name, neigh->addr);
+
+		batadv_send_skb_packet(skb, hard_iface, neigh->addr);
+	}
+
+	return true;
+}
+
 /**
  * batadv_v_elp_periodic_work - ELP periodic task per interface
  * @work: work queue item
@@ -227,6 +290,12 @@ static void batadv_v_elp_periodic_work(struct work_struct *work)
 	 */
 	rcu_read_lock();
 	hlist_for_each_entry_rcu(hardif_neigh, &hard_iface->neigh_list, list) {
+		if (!batadv_v_elp_wifi_neigh_probe(hardif_neigh))
+			/* if something goes wrong while probing, better to stop
+			 * sending packets immediately and reschedule the task
+			 */
+			break;
+
 		if (!kref_get_unless_zero(&hardif_neigh->refcount))
 			continue;
 
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 5c68bf2618c7..878c35974845 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -63,6 +63,9 @@
 
 /* B.A.T.M.A.N. V */
 #define BATADV_THROUGHPUT_DEFAULT_VALUE 10 /* 1 Mbps */
+#define BATADV_ELP_PROBES_PER_NODE 2
+#define BATADV_ELP_MIN_PROBE_SIZE 200 /* bytes */
+#define BATADV_ELP_PROBE_MAX_TX_DIFF 100 /* milliseconds */
 #define BATADV_ELP_MAX_AGE 64
 #define BATADV_OGM_MAX_ORIGDIFF 5
 #define BATADV_OGM_MAX_AGE 64
-- 
cgit v1.2.3-59-g8ed1b


From 9786906022eba35763b17c54a35913ca65151a78 Mon Sep 17 00:00:00 2001
From: Antonio Quartulli <antonio@open-mesh.com>
Date: Sat, 16 Jan 2016 16:40:17 +0800
Subject: batman-adv: B.A.T.M.A.N. V - implement neighbor comparison API calls

Signed-off-by: Antonio Quartulli <antonio@open-mesh.com>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
---
 net/batman-adv/bat_v.c | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

(limited to 'net')

diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index ff31f2af9cfe..953c0d150231 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -19,12 +19,15 @@
 #include "main.h"
 
 #include <linux/atomic.h>
+#include <linux/bug.h>
 #include <linux/cache.h>
 #include <linux/init.h>
+#include <linux/types.h>
 #include <linux/workqueue.h>
 
 #include "bat_v_elp.h"
 #include "bat_v_ogm.h"
+#include "originator.h"
 #include "packet.h"
 
 static int batadv_v_iface_enable(struct batadv_hard_iface *hard_iface)
@@ -78,6 +81,39 @@ static void batadv_v_ogm_emit(struct batadv_forw_packet *forw_packet)
 {
 }
 
+static int batadv_v_neigh_cmp(struct batadv_neigh_node *neigh1,
+			      struct batadv_hard_iface *if_outgoing1,
+			      struct batadv_neigh_node *neigh2,
+			      struct batadv_hard_iface *if_outgoing2)
+{
+	struct batadv_neigh_ifinfo *ifinfo1, *ifinfo2;
+
+	ifinfo1 = batadv_neigh_ifinfo_get(neigh1, if_outgoing1);
+	ifinfo2 = batadv_neigh_ifinfo_get(neigh2, if_outgoing2);
+
+	if (WARN_ON(!ifinfo1 || !ifinfo2))
+		return 0;
+
+	return ifinfo1->bat_v.throughput - ifinfo2->bat_v.throughput;
+}
+
+static bool batadv_v_neigh_is_sob(struct batadv_neigh_node *neigh1,
+				  struct batadv_hard_iface *if_outgoing1,
+				  struct batadv_neigh_node *neigh2,
+				  struct batadv_hard_iface *if_outgoing2)
+{
+	struct batadv_neigh_ifinfo *ifinfo1, *ifinfo2;
+	u32 threshold;
+
+	ifinfo1 = batadv_neigh_ifinfo_get(neigh1, if_outgoing1);
+	ifinfo2 = batadv_neigh_ifinfo_get(neigh2, if_outgoing2);
+
+	threshold = ifinfo1->bat_v.throughput / 4;
+	threshold = ifinfo1->bat_v.throughput - threshold;
+
+	return ifinfo2->bat_v.throughput > threshold;
+}
+
 static struct batadv_algo_ops batadv_batman_v __read_mostly = {
 	.name = "BATMAN_V",
 	.bat_iface_enable = batadv_v_iface_enable,
@@ -87,6 +123,8 @@ static struct batadv_algo_ops batadv_batman_v __read_mostly = {
 	.bat_hardif_neigh_init = batadv_v_hardif_neigh_init,
 	.bat_ogm_emit = batadv_v_ogm_emit,
 	.bat_ogm_schedule = batadv_v_ogm_schedule,
+	.bat_neigh_cmp = batadv_v_neigh_cmp,
+	.bat_neigh_is_similar_or_better = batadv_v_neigh_is_sob,
 };
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From 261e264db636ae1f4c43e56b8c57d7343b166fc9 Mon Sep 17 00:00:00 2001
From: Antonio Quartulli <antonio@open-mesh.com>
Date: Sat, 16 Jan 2016 16:40:18 +0800
Subject: batman-adv: B.A.T.M.A.N. V - implement bat_orig_print API

Signed-off-by: Antonio Quartulli <antonio@open-mesh.com>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
---
 net/batman-adv/bat_v.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 105 insertions(+)

(limited to 'net')

diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index 953c0d150231..a90117c9a861 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -22,11 +22,17 @@
 #include <linux/bug.h>
 #include <linux/cache.h>
 #include <linux/init.h>
+#include <linux/jiffies.h>
+#include <linux/netdevice.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/seq_file.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
 
 #include "bat_v_elp.h"
 #include "bat_v_ogm.h"
+#include "hash.h"
 #include "originator.h"
 #include "packet.h"
 
@@ -81,6 +87,104 @@ static void batadv_v_ogm_emit(struct batadv_forw_packet *forw_packet)
 {
 }
 
+/**
+ * batadv_v_orig_print_neigh - print neighbors for the originator table
+ * @orig_node: the orig_node for which the neighbors are printed
+ * @if_outgoing: outgoing interface for these entries
+ * @seq: debugfs table seq_file struct
+ *
+ * Must be called while holding an rcu lock.
+ */
+static void
+batadv_v_orig_print_neigh(struct batadv_orig_node *orig_node,
+			  struct batadv_hard_iface *if_outgoing,
+			  struct seq_file *seq)
+{
+	struct batadv_neigh_node *neigh_node;
+	struct batadv_neigh_ifinfo *n_ifinfo;
+
+	hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) {
+		n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing);
+		if (!n_ifinfo)
+			continue;
+
+		seq_printf(seq, " %pM (%9u.%1u)",
+			   neigh_node->addr,
+			   n_ifinfo->bat_v.throughput / 10,
+			   n_ifinfo->bat_v.throughput % 10);
+
+		batadv_neigh_ifinfo_put(n_ifinfo);
+	}
+}
+
+/**
+ * batadv_v_orig_print - print the originator table
+ * @bat_priv: the bat priv with all the soft interface information
+ * @seq: debugfs table seq_file struct
+ * @if_outgoing: the outgoing interface for which this should be printed
+ */
+static void batadv_v_orig_print(struct batadv_priv *bat_priv,
+				struct seq_file *seq,
+				struct batadv_hard_iface *if_outgoing)
+{
+	struct batadv_neigh_node *neigh_node;
+	struct batadv_hashtable *hash = bat_priv->orig_hash;
+	int last_seen_msecs, last_seen_secs;
+	struct batadv_orig_node *orig_node;
+	struct batadv_neigh_ifinfo *n_ifinfo;
+	unsigned long last_seen_jiffies;
+	struct hlist_head *head;
+	int batman_count = 0;
+	u32 i;
+
+	seq_printf(seq, "  %-15s %s (%11s) %17s [%10s]: %20s ...\n",
+		   "Originator", "last-seen", "throughput", "Nexthop",
+		   "outgoingIF", "Potential nexthops");
+
+	for (i = 0; i < hash->size; i++) {
+		head = &hash->table[i];
+
+		rcu_read_lock();
+		hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
+			neigh_node = batadv_orig_router_get(orig_node,
+							    if_outgoing);
+			if (!neigh_node)
+				continue;
+
+			n_ifinfo = batadv_neigh_ifinfo_get(neigh_node,
+							   if_outgoing);
+			if (!n_ifinfo)
+				goto next;
+
+			last_seen_jiffies = jiffies - orig_node->last_seen;
+			last_seen_msecs = jiffies_to_msecs(last_seen_jiffies);
+			last_seen_secs = last_seen_msecs / 1000;
+			last_seen_msecs = last_seen_msecs % 1000;
+
+			seq_printf(seq, "%pM %4i.%03is (%9u.%1u) %pM [%10s]:",
+				   orig_node->orig, last_seen_secs,
+				   last_seen_msecs,
+				   n_ifinfo->bat_v.throughput / 10,
+				   n_ifinfo->bat_v.throughput % 10,
+				   neigh_node->addr,
+				   neigh_node->if_incoming->net_dev->name);
+
+			batadv_v_orig_print_neigh(orig_node, if_outgoing, seq);
+			seq_puts(seq, "\n");
+			batman_count++;
+
+next:
+			batadv_neigh_node_put(neigh_node);
+			if (n_ifinfo)
+				batadv_neigh_ifinfo_put(n_ifinfo);
+		}
+		rcu_read_unlock();
+	}
+
+	if (batman_count == 0)
+		seq_puts(seq, "No batman nodes in range ...\n");
+}
+
 static int batadv_v_neigh_cmp(struct batadv_neigh_node *neigh1,
 			      struct batadv_hard_iface *if_outgoing1,
 			      struct batadv_neigh_node *neigh2,
@@ -123,6 +227,7 @@ static struct batadv_algo_ops batadv_batman_v __read_mostly = {
 	.bat_hardif_neigh_init = batadv_v_hardif_neigh_init,
 	.bat_ogm_emit = batadv_v_ogm_emit,
 	.bat_ogm_schedule = batadv_v_ogm_schedule,
+	.bat_orig_print = batadv_v_orig_print,
 	.bat_neigh_cmp = batadv_v_neigh_cmp,
 	.bat_neigh_is_similar_or_better = batadv_v_neigh_is_sob,
 };
-- 
cgit v1.2.3-59-g8ed1b


From 626d23e83c88df5ff535414c2fe29e16b95d6b7a Mon Sep 17 00:00:00 2001
From: Linus Luessing <linus.luessing@web.de>
Date: Sat, 16 Jan 2016 16:40:19 +0800
Subject: batman-adv: B.A.T.M.A.N. V - implement bat_neigh_print API

Lists all neighbours detected by the Echo Locating Protocol
(ELP) and their throughput metric.

Initially Developed by Linus during a 6 months trainee study
period in Ascom (Switzerland) AG.

Signed-off-by: Linus Luessing <linus.luessing@web.de>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <antonio@open-mesh.com>
---
 net/batman-adv/bat_v.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

(limited to 'net')

diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index a90117c9a861..3315b9a598af 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -117,6 +117,60 @@ batadv_v_orig_print_neigh(struct batadv_orig_node *orig_node,
 	}
 }
 
+/**
+ * batadv_v_hardif_neigh_print - print a single ELP neighbour node
+ * @seq: neighbour table seq_file struct
+ * @hardif_neigh: hardif neighbour information
+ */
+static void
+batadv_v_hardif_neigh_print(struct seq_file *seq,
+			    struct batadv_hardif_neigh_node *hardif_neigh)
+{
+	int last_secs, last_msecs;
+	u32 throughput;
+
+	last_secs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen) / 1000;
+	last_msecs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen) % 1000;
+	throughput = ewma_throughput_read(&hardif_neigh->bat_v.throughput);
+
+	seq_printf(seq, "%pM %4i.%03is (%9u.%1u) [%10s]\n",
+		   hardif_neigh->addr, last_secs, last_msecs, throughput / 10,
+		   throughput % 10, hardif_neigh->if_incoming->net_dev->name);
+}
+
+/**
+ * batadv_v_neigh_print - print the single hop neighbour list
+ * @bat_priv: the bat priv with all the soft interface information
+ * @seq: neighbour table seq_file struct
+ */
+static void batadv_v_neigh_print(struct batadv_priv *bat_priv,
+				 struct seq_file *seq)
+{
+	struct net_device *net_dev = (struct net_device *)seq->private;
+	struct batadv_hardif_neigh_node *hardif_neigh;
+	struct batadv_hard_iface *hard_iface;
+	int batman_count = 0;
+
+	seq_printf(seq, "  %-15s %s (%11s) [%10s]\n", "Neighbor",
+		   "last-seen", "throughput", "IF");
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
+		if (hard_iface->soft_iface != net_dev)
+			continue;
+
+		hlist_for_each_entry_rcu(hardif_neigh,
+					 &hard_iface->neigh_list, list) {
+			batadv_v_hardif_neigh_print(seq, hardif_neigh);
+			batman_count++;
+		}
+	}
+	rcu_read_unlock();
+
+	if (batman_count == 0)
+		seq_puts(seq, "No batman nodes in range ...\n");
+}
+
 /**
  * batadv_v_orig_print - print the originator table
  * @bat_priv: the bat priv with all the soft interface information
@@ -230,6 +284,7 @@ static struct batadv_algo_ops batadv_batman_v __read_mostly = {
 	.bat_orig_print = batadv_v_orig_print,
 	.bat_neigh_cmp = batadv_v_neigh_cmp,
 	.bat_neigh_is_similar_or_better = batadv_v_neigh_is_sob,
+	.bat_neigh_print = batadv_v_neigh_print,
 };
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From 97575407e9547416a23de03d141072e639f04623 Mon Sep 17 00:00:00 2001
From: Simon Wunderlich <sw@simonwunderlich.de>
Date: Tue, 19 Jan 2016 17:08:29 +0100
Subject: batman-adv: Start new development cycle

Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/main.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 878c35974845..db4533631834 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -24,7 +24,7 @@
 #define BATADV_DRIVER_DEVICE "batman-adv"
 
 #ifndef BATADV_SOURCE_VERSION
-#define BATADV_SOURCE_VERSION "2016.0"
+#define BATADV_SOURCE_VERSION "2016.1"
 #endif
 
 /* B.A.T.M.A.N. parameters */
-- 
cgit v1.2.3-59-g8ed1b


From 017b1b6d28c479f1ad9a7a41f775545a3e1cba35 Mon Sep 17 00:00:00 2001
From: Phil Turnbull <phil.turnbull@oracle.com>
Date: Wed, 24 Feb 2016 15:34:43 -0500
Subject: netfilter: nfnetlink_acct: validate NFACCT_FILTER parameters

nfacct_filter_alloc doesn't validate the NFACCT_FILTER_MASK and
NFACCT_FILTER_VALUE parameters which can trigger a NULL pointer
dereference. CAP_NET_ADMIN is required to trigger the bug.

Signed-off-by: Phil Turnbull <phil.turnbull@oracle.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nfnetlink_acct.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index 5274b04c42a6..4c2b4c0c4d5f 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -242,6 +242,9 @@ nfacct_filter_alloc(const struct nlattr * const attr)
 	if (err < 0)
 		return ERR_PTR(err);
 
+	if (!tb[NFACCT_FILTER_MASK] || !tb[NFACCT_FILTER_VALUE])
+		return ERR_PTR(-EINVAL);
+
 	filter = kzalloc(sizeof(struct nfacct_filter), GFP_KERNEL);
 	if (!filter)
 		return ERR_PTR(-ENOMEM);
-- 
cgit v1.2.3-59-g8ed1b


From b07edbe1cf3dae9ba81f24888e2f2a9dbe778918 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 16 Feb 2016 17:24:08 +0100
Subject: netfilter: meta: add PRANDOM support

Can be used to randomly match packets e.g. for statistic traffic sampling.

See commit 3ad0040573b0c00f8848
("bpf: split state from prandom_u32() and consolidate {c, e}BPF prngs")
for more info why this doesn't use prandom_u32 directly.

Unlike bpf nft_meta can be built as a module, so add an EXPORT_SYMBOL
for prandom_seed_full_state too.

Cc: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  2 ++
 lib/random32.c                           |  1 +
 net/netfilter/nft_meta.c                 | 11 +++++++++++
 3 files changed, 14 insertions(+)

(limited to 'net')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index be41ffc128b8..b19be0a098c0 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -681,6 +681,7 @@ enum nft_exthdr_attributes {
  * @NFT_META_IIFGROUP: packet input interface group
  * @NFT_META_OIFGROUP: packet output interface group
  * @NFT_META_CGROUP: socket control group (skb->sk->sk_classid)
+ * @NFT_META_PRANDOM: a 32bit pseudo-random number
  */
 enum nft_meta_keys {
 	NFT_META_LEN,
@@ -707,6 +708,7 @@ enum nft_meta_keys {
 	NFT_META_IIFGROUP,
 	NFT_META_OIFGROUP,
 	NFT_META_CGROUP,
+	NFT_META_PRANDOM,
 };
 
 /**
diff --git a/lib/random32.c b/lib/random32.c
index 12111910ccd0..510d1ce7d4d2 100644
--- a/lib/random32.c
+++ b/lib/random32.c
@@ -255,6 +255,7 @@ void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state)
 		prandom_warmup(state);
 	}
 }
+EXPORT_SYMBOL(prandom_seed_full_state);
 
 /*
  *	Generate better values after random number generator
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index fe885bf271c5..16c50b0dd426 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -28,6 +28,8 @@
 
 #include <uapi/linux/netfilter_bridge.h> /* NF_BR_PRE_ROUTING */
 
+static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state);
+
 void nft_meta_get_eval(const struct nft_expr *expr,
 		       struct nft_regs *regs,
 		       const struct nft_pktinfo *pkt)
@@ -181,6 +183,11 @@ void nft_meta_get_eval(const struct nft_expr *expr,
 		*dest = sock_cgroup_classid(&sk->sk_cgrp_data);
 		break;
 #endif
+	case NFT_META_PRANDOM: {
+		struct rnd_state *state = this_cpu_ptr(&nft_prandom_state);
+		*dest = prandom_u32_state(state);
+		break;
+	}
 	default:
 		WARN_ON(1);
 		goto err;
@@ -277,6 +284,10 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
 	case NFT_META_OIFNAME:
 		len = IFNAMSIZ;
 		break;
+	case NFT_META_PRANDOM:
+		prandom_init_once(&nft_prandom_state);
+		len = sizeof(u32);
+		break;
 	default:
 		return -EOPNOTSUPP;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 8ee225e78531981e479e79b74354b63e188b4ee2 Mon Sep 17 00:00:00 2001
From: Sudip Mukherjee <sudipm.mukherjee@gmail.com>
Date: Tue, 23 Feb 2016 13:40:10 +0530
Subject: netfilter: xt_osf: remove unused variable

While building with W=1 we got the warning:
net/netfilter/xt_osf.c:265:9: warning: variable 'loop_cont' set but not used

The local variable loop_cont was only initialized and then assigned a
value but was never used or checked after that.
While removing the variable, the case of OSFOPT_TS was not removed so
that it will serve as a reminder to us that we can do something in that
particular case.

Signed-off-by: Sudip Mukherjee <sudip.mukherjee@codethink.co.uk>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_osf.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c
index 4e3c3affd285..2455b69b5810 100644
--- a/net/netfilter/xt_osf.c
+++ b/net/netfilter/xt_osf.c
@@ -262,7 +262,6 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
 			if (f->opt[optnum].kind == (*optp)) {
 				__u32 len = f->opt[optnum].length;
 				const __u8 *optend = optp + len;
-				int loop_cont = 0;
 
 				fmatch = FMATCH_OK;
 
@@ -275,7 +274,6 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
 					mss = ntohs((__force __be16)mss);
 					break;
 				case OSFOPT_TS:
-					loop_cont = 1;
 					break;
 				}
 
-- 
cgit v1.2.3-59-g8ed1b


From 86a7996cc8a078793670d82ed97d5a99bb4e8496 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Thu, 25 Feb 2016 14:55:00 -0800
Subject: net_sched: introduce qdisc_replace() helper

Remove nearly duplicated code and prepare for the following patch.

Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 17 +++++++++++++++++
 net/sched/sch_cbq.c       |  7 +------
 net/sched/sch_drr.c       |  6 +-----
 net/sched/sch_dsmark.c    |  8 +-------
 net/sched/sch_hfsc.c      |  6 +-----
 net/sched/sch_htb.c       |  9 +--------
 net/sched/sch_multiq.c    |  8 +-------
 net/sched/sch_netem.c     | 10 +---------
 net/sched/sch_prio.c      |  8 +-------
 net/sched/sch_qfq.c       |  6 +-----
 net/sched/sch_red.c       |  7 +------
 net/sched/sch_sfb.c       |  7 +------
 net/sched/sch_tbf.c       |  8 +-------
 13 files changed, 29 insertions(+), 78 deletions(-)

(limited to 'net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 636a362a0e03..8fdad9f7a2fb 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -707,6 +707,23 @@ static inline void qdisc_reset_queue(struct Qdisc *sch)
 	sch->qstats.backlog = 0;
 }
 
+static inline struct Qdisc *qdisc_replace(struct Qdisc *sch, struct Qdisc *new,
+					  struct Qdisc **pold)
+{
+	struct Qdisc *old;
+
+	sch_tree_lock(sch);
+	old = *pold;
+	*pold = new;
+	if (old != NULL) {
+		qdisc_tree_decrease_qlen(old, old->q.qlen);
+		qdisc_reset(old);
+	}
+	sch_tree_unlock(sch);
+
+	return old;
+}
+
 static inline unsigned int __qdisc_queue_drop(struct Qdisc *sch,
 					      struct sk_buff_head *list)
 {
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index c538d9e4a8f6..7f8474cdce32 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1624,13 +1624,8 @@ static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 			new->reshape_fail = cbq_reshape_fail;
 #endif
 	}
-	sch_tree_lock(sch);
-	*old = cl->q;
-	cl->q = new;
-	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
-	qdisc_reset(*old);
-	sch_tree_unlock(sch);
 
+	*old = qdisc_replace(sch, new, &cl->q);
 	return 0;
 }
 
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index a1cd778240cd..b96c9a8e70ab 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -226,11 +226,7 @@ static int drr_graft_class(struct Qdisc *sch, unsigned long arg,
 			new = &noop_qdisc;
 	}
 
-	sch_tree_lock(sch);
-	drr_purge_queue(cl);
-	*old = cl->qdisc;
-	cl->qdisc = new;
-	sch_tree_unlock(sch);
+	*old = qdisc_replace(sch, new, &cl->qdisc);
 	return 0;
 }
 
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index f357f34d02d2..cfddb1c635c3 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -73,13 +73,7 @@ static int dsmark_graft(struct Qdisc *sch, unsigned long arg,
 			new = &noop_qdisc;
 	}
 
-	sch_tree_lock(sch);
-	*old = p->q;
-	p->q = new;
-	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
-	qdisc_reset(*old);
-	sch_tree_unlock(sch);
-
+	*old = qdisc_replace(sch, new, &p->q);
 	return 0;
 }
 
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index b7ebe2c87586..089f3b667d36 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1215,11 +1215,7 @@ hfsc_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 			new = &noop_qdisc;
 	}
 
-	sch_tree_lock(sch);
-	hfsc_purge_queue(sch, cl);
-	*old = cl->qdisc;
-	cl->qdisc = new;
-	sch_tree_unlock(sch);
+	*old = qdisc_replace(sch, new, &cl->qdisc);
 	return 0;
 }
 
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 15ccd7f8fb2a..0efbcf358cd0 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1163,14 +1163,7 @@ static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 				     cl->common.classid)) == NULL)
 		return -ENOBUFS;
 
-	sch_tree_lock(sch);
-	*old = cl->un.leaf.q;
-	cl->un.leaf.q = new;
-	if (*old != NULL) {
-		qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
-		qdisc_reset(*old);
-	}
-	sch_tree_unlock(sch);
+	*old = qdisc_replace(sch, new, &cl->un.leaf.q);
 	return 0;
 }
 
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 4e904ca0af9d..a0103a138563 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -303,13 +303,7 @@ static int multiq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 	if (new == NULL)
 		new = &noop_qdisc;
 
-	sch_tree_lock(sch);
-	*old = q->queues[band];
-	q->queues[band] = new;
-	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
-	qdisc_reset(*old);
-	sch_tree_unlock(sch);
-
+	*old = qdisc_replace(sch, new, &q->queues[band]);
 	return 0;
 }
 
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 5abd1d9de989..0a6ddaf7f561 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -1037,15 +1037,7 @@ static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
 
-	sch_tree_lock(sch);
-	*old = q->qdisc;
-	q->qdisc = new;
-	if (*old) {
-		qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
-		qdisc_reset(*old);
-	}
-	sch_tree_unlock(sch);
-
+	*old = qdisc_replace(sch, new, &q->qdisc);
 	return 0;
 }
 
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index ba6487f2741f..1b4aaec64a24 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -268,13 +268,7 @@ static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 	if (new == NULL)
 		new = &noop_qdisc;
 
-	sch_tree_lock(sch);
-	*old = q->queues[band];
-	q->queues[band] = new;
-	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
-	qdisc_reset(*old);
-	sch_tree_unlock(sch);
-
+	*old = qdisc_replace(sch, new, &q->queues[band]);
 	return 0;
 }
 
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 3dc3a6e56052..b5c52caf2e73 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -617,11 +617,7 @@ static int qfq_graft_class(struct Qdisc *sch, unsigned long arg,
 			new = &noop_qdisc;
 	}
 
-	sch_tree_lock(sch);
-	qfq_purge_queue(cl);
-	*old = cl->qdisc;
-	cl->qdisc = new;
-	sch_tree_unlock(sch);
+	*old = qdisc_replace(sch, new, &cl->qdisc);
 	return 0;
 }
 
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 6c0534cc7758..d5abcee454d8 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -313,12 +313,7 @@ static int red_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 	if (new == NULL)
 		new = &noop_qdisc;
 
-	sch_tree_lock(sch);
-	*old = q->qdisc;
-	q->qdisc = new;
-	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
-	qdisc_reset(*old);
-	sch_tree_unlock(sch);
+	*old = qdisc_replace(sch, new, &q->qdisc);
 	return 0;
 }
 
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index 5bbb6332ec57..0e74e55fda15 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -606,12 +606,7 @@ static int sfb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 	if (new == NULL)
 		new = &noop_qdisc;
 
-	sch_tree_lock(sch);
-	*old = q->qdisc;
-	q->qdisc = new;
-	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
-	qdisc_reset(*old);
-	sch_tree_unlock(sch);
+	*old = qdisc_replace(sch, new, &q->qdisc);
 	return 0;
 }
 
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index a4afde14e865..56a1aef3495f 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -502,13 +502,7 @@ static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 	if (new == NULL)
 		new = &noop_qdisc;
 
-	sch_tree_lock(sch);
-	*old = q->qdisc;
-	q->qdisc = new;
-	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
-	qdisc_reset(*old);
-	sch_tree_unlock(sch);
-
+	*old = qdisc_replace(sch, new, &q->qdisc);
 	return 0;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 2ccccf5fb43ff62b2b96cc58d95fc0b3596516e4 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Thu, 25 Feb 2016 14:55:01 -0800
Subject: net_sched: update hierarchical backlog too

When the bottom qdisc decides to, for example, drop some packet,
it calls qdisc_tree_decrease_qlen() to update the queue length
for all its ancestors, we need to update the backlog too to
keep the stats on root qdisc accurate.

Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/codel.h       |  4 ++++
 include/net/sch_generic.h |  5 +++--
 net/sched/sch_api.c       |  8 +++++---
 net/sched/sch_cbq.c       |  5 +++--
 net/sched/sch_choke.c     |  6 ++++--
 net/sched/sch_codel.c     | 10 ++++++----
 net/sched/sch_drr.c       |  3 ++-
 net/sched/sch_fq.c        |  4 +++-
 net/sched/sch_fq_codel.c  | 17 ++++++++++++-----
 net/sched/sch_hfsc.c      |  3 ++-
 net/sched/sch_hhf.c       | 10 +++++++---
 net/sched/sch_htb.c       | 10 ++++++----
 net/sched/sch_multiq.c    |  8 +++++---
 net/sched/sch_netem.c     |  3 ++-
 net/sched/sch_pie.c       |  5 +++--
 net/sched/sch_prio.c      |  7 ++++---
 net/sched/sch_qfq.c       |  3 ++-
 net/sched/sch_red.c       |  3 ++-
 net/sched/sch_sfb.c       |  3 ++-
 net/sched/sch_sfq.c       | 16 +++++++++-------
 net/sched/sch_tbf.c       |  7 +++++--
 21 files changed, 91 insertions(+), 49 deletions(-)

(limited to 'net')

diff --git a/include/net/codel.h b/include/net/codel.h
index 267e70210061..d168aca115cc 100644
--- a/include/net/codel.h
+++ b/include/net/codel.h
@@ -162,12 +162,14 @@ struct codel_vars {
  * struct codel_stats - contains codel shared variables and stats
  * @maxpacket:	largest packet we've seen so far
  * @drop_count:	temp count of dropped packets in dequeue()
+ * @drop_len:	bytes of dropped packets in dequeue()
  * ecn_mark:	number of packets we ECN marked instead of dropping
  * ce_mark:	number of packets CE marked because sojourn time was above ce_threshold
  */
 struct codel_stats {
 	u32		maxpacket;
 	u32		drop_count;
+	u32		drop_len;
 	u32		ecn_mark;
 	u32		ce_mark;
 };
@@ -308,6 +310,7 @@ static struct sk_buff *codel_dequeue(struct Qdisc *sch,
 								  vars->rec_inv_sqrt);
 					goto end;
 				}
+				stats->drop_len += qdisc_pkt_len(skb);
 				qdisc_drop(skb, sch);
 				stats->drop_count++;
 				skb = dequeue_func(vars, sch);
@@ -330,6 +333,7 @@ static struct sk_buff *codel_dequeue(struct Qdisc *sch,
 		if (params->ecn && INET_ECN_set_ce(skb)) {
 			stats->ecn_mark++;
 		} else {
+			stats->drop_len += qdisc_pkt_len(skb);
 			qdisc_drop(skb, sch);
 			stats->drop_count++;
 
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 8fdad9f7a2fb..e5bba897d206 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -396,7 +396,8 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
 			      struct Qdisc *qdisc);
 void qdisc_reset(struct Qdisc *qdisc);
 void qdisc_destroy(struct Qdisc *qdisc);
-void qdisc_tree_decrease_qlen(struct Qdisc *qdisc, unsigned int n);
+void qdisc_tree_reduce_backlog(struct Qdisc *qdisc, unsigned int n,
+			       unsigned int len);
 struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 			  const struct Qdisc_ops *ops);
 struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
@@ -716,7 +717,7 @@ static inline struct Qdisc *qdisc_replace(struct Qdisc *sch, struct Qdisc *new,
 	old = *pold;
 	*pold = new;
 	if (old != NULL) {
-		qdisc_tree_decrease_qlen(old, old->q.qlen);
+		qdisc_tree_reduce_backlog(old, old->q.qlen, old->qstats.backlog);
 		qdisc_reset(old);
 	}
 	sch_tree_unlock(sch);
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index de1e176e35cc..3b180ff72f79 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -744,14 +744,15 @@ static u32 qdisc_alloc_handle(struct net_device *dev)
 	return 0;
 }
 
-void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
+void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
+			       unsigned int len)
 {
 	const struct Qdisc_class_ops *cops;
 	unsigned long cl;
 	u32 parentid;
 	int drops;
 
-	if (n == 0)
+	if (n == 0 && len == 0)
 		return;
 	drops = max_t(int, n, 0);
 	rcu_read_lock();
@@ -774,11 +775,12 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
 			cops->put(sch, cl);
 		}
 		sch->q.qlen -= n;
+		sch->qstats.backlog -= len;
 		__qdisc_qstats_drop(sch, drops);
 	}
 	rcu_read_unlock();
 }
-EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
+EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
 
 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
 			       struct nlmsghdr *n, u32 clid,
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 7f8474cdce32..baafddf229ce 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1909,7 +1909,7 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg)
 {
 	struct cbq_sched_data *q = qdisc_priv(sch);
 	struct cbq_class *cl = (struct cbq_class *)arg;
-	unsigned int qlen;
+	unsigned int qlen, backlog;
 
 	if (cl->filters || cl->children || cl == &q->link)
 		return -EBUSY;
@@ -1917,8 +1917,9 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg)
 	sch_tree_lock(sch);
 
 	qlen = cl->q->q.qlen;
+	backlog = cl->q->qstats.backlog;
 	qdisc_reset(cl->q);
-	qdisc_tree_decrease_qlen(cl->q, qlen);
+	qdisc_tree_reduce_backlog(cl->q, qlen, backlog);
 
 	if (cl->next_alive)
 		cbq_deactivate_class(cl);
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index 5ffb8b8337c7..0a08c860eee4 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -128,8 +128,8 @@ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
 		choke_zap_tail_holes(q);
 
 	qdisc_qstats_backlog_dec(sch, skb);
+	qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
 	qdisc_drop(skb, sch);
-	qdisc_tree_decrease_qlen(sch, 1);
 	--sch->q.qlen;
 }
 
@@ -456,6 +456,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt)
 		old = q->tab;
 		if (old) {
 			unsigned int oqlen = sch->q.qlen, tail = 0;
+			unsigned dropped = 0;
 
 			while (q->head != q->tail) {
 				struct sk_buff *skb = q->tab[q->head];
@@ -467,11 +468,12 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt)
 					ntab[tail++] = skb;
 					continue;
 				}
+				dropped += qdisc_pkt_len(skb);
 				qdisc_qstats_backlog_dec(sch, skb);
 				--sch->q.qlen;
 				qdisc_drop(skb, sch);
 			}
-			qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen);
+			qdisc_tree_reduce_backlog(sch, oqlen - sch->q.qlen, dropped);
 			q->head = 0;
 			q->tail = tail;
 		}
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
index 535007d5f0b5..9b7e2980ee5c 100644
--- a/net/sched/sch_codel.c
+++ b/net/sched/sch_codel.c
@@ -79,12 +79,13 @@ static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch)
 
 	skb = codel_dequeue(sch, &q->params, &q->vars, &q->stats, dequeue);
 
-	/* We cant call qdisc_tree_decrease_qlen() if our qlen is 0,
+	/* We cant call qdisc_tree_reduce_backlog() if our qlen is 0,
 	 * or HTB crashes. Defer it for next round.
 	 */
 	if (q->stats.drop_count && sch->q.qlen) {
-		qdisc_tree_decrease_qlen(sch, q->stats.drop_count);
+		qdisc_tree_reduce_backlog(sch, q->stats.drop_count, q->stats.drop_len);
 		q->stats.drop_count = 0;
+		q->stats.drop_len = 0;
 	}
 	if (skb)
 		qdisc_bstats_update(sch, skb);
@@ -116,7 +117,7 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt)
 {
 	struct codel_sched_data *q = qdisc_priv(sch);
 	struct nlattr *tb[TCA_CODEL_MAX + 1];
-	unsigned int qlen;
+	unsigned int qlen, dropped = 0;
 	int err;
 
 	if (!opt)
@@ -156,10 +157,11 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt)
 	while (sch->q.qlen > sch->limit) {
 		struct sk_buff *skb = __skb_dequeue(&sch->q);
 
+		dropped += qdisc_pkt_len(skb);
 		qdisc_qstats_backlog_dec(sch, skb);
 		qdisc_drop(skb, sch);
 	}
-	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
+	qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
 
 	sch_tree_unlock(sch);
 	return 0;
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index b96c9a8e70ab..a63e879e8975 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -53,9 +53,10 @@ static struct drr_class *drr_find_class(struct Qdisc *sch, u32 classid)
 static void drr_purge_queue(struct drr_class *cl)
 {
 	unsigned int len = cl->qdisc->q.qlen;
+	unsigned int backlog = cl->qdisc->qstats.backlog;
 
 	qdisc_reset(cl->qdisc);
-	qdisc_tree_decrease_qlen(cl->qdisc, len);
+	qdisc_tree_reduce_backlog(cl->qdisc, len, backlog);
 }
 
 static const struct nla_policy drr_policy[TCA_DRR_MAX + 1] = {
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 109b2322778f..3c6a47d66a04 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -662,6 +662,7 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt)
 	struct fq_sched_data *q = qdisc_priv(sch);
 	struct nlattr *tb[TCA_FQ_MAX + 1];
 	int err, drop_count = 0;
+	unsigned drop_len = 0;
 	u32 fq_log;
 
 	if (!opt)
@@ -736,10 +737,11 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt)
 
 		if (!skb)
 			break;
+		drop_len += qdisc_pkt_len(skb);
 		kfree_skb(skb);
 		drop_count++;
 	}
-	qdisc_tree_decrease_qlen(sch, drop_count);
+	qdisc_tree_reduce_backlog(sch, drop_count, drop_len);
 
 	sch_tree_unlock(sch);
 	return err;
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 4c834e93dafb..d3fc8f9dd3d4 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -175,7 +175,7 @@ static unsigned int fq_codel_qdisc_drop(struct Qdisc *sch)
 static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 {
 	struct fq_codel_sched_data *q = qdisc_priv(sch);
-	unsigned int idx;
+	unsigned int idx, prev_backlog;
 	struct fq_codel_flow *flow;
 	int uninitialized_var(ret);
 
@@ -203,6 +203,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	if (++sch->q.qlen <= sch->limit)
 		return NET_XMIT_SUCCESS;
 
+	prev_backlog = sch->qstats.backlog;
 	q->drop_overlimit++;
 	/* Return Congestion Notification only if we dropped a packet
 	 * from this flow.
@@ -211,7 +212,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		return NET_XMIT_CN;
 
 	/* As we dropped a packet, better let upper stack know this */
-	qdisc_tree_decrease_qlen(sch, 1);
+	qdisc_tree_reduce_backlog(sch, 1, prev_backlog - sch->qstats.backlog);
 	return NET_XMIT_SUCCESS;
 }
 
@@ -241,6 +242,7 @@ static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch)
 	struct fq_codel_flow *flow;
 	struct list_head *head;
 	u32 prev_drop_count, prev_ecn_mark;
+	unsigned int prev_backlog;
 
 begin:
 	head = &q->new_flows;
@@ -259,6 +261,7 @@ begin:
 
 	prev_drop_count = q->cstats.drop_count;
 	prev_ecn_mark = q->cstats.ecn_mark;
+	prev_backlog = sch->qstats.backlog;
 
 	skb = codel_dequeue(sch, &q->cparams, &flow->cvars, &q->cstats,
 			    dequeue);
@@ -276,12 +279,14 @@ begin:
 	}
 	qdisc_bstats_update(sch, skb);
 	flow->deficit -= qdisc_pkt_len(skb);
-	/* We cant call qdisc_tree_decrease_qlen() if our qlen is 0,
+	/* We cant call qdisc_tree_reduce_backlog() if our qlen is 0,
 	 * or HTB crashes. Defer it for next round.
 	 */
 	if (q->cstats.drop_count && sch->q.qlen) {
-		qdisc_tree_decrease_qlen(sch, q->cstats.drop_count);
+		qdisc_tree_reduce_backlog(sch, q->cstats.drop_count,
+					  q->cstats.drop_len);
 		q->cstats.drop_count = 0;
+		q->cstats.drop_len = 0;
 	}
 	return skb;
 }
@@ -372,11 +377,13 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt)
 	while (sch->q.qlen > sch->limit) {
 		struct sk_buff *skb = fq_codel_dequeue(sch);
 
+		q->cstats.drop_len += qdisc_pkt_len(skb);
 		kfree_skb(skb);
 		q->cstats.drop_count++;
 	}
-	qdisc_tree_decrease_qlen(sch, q->cstats.drop_count);
+	qdisc_tree_reduce_backlog(sch, q->cstats.drop_count, q->cstats.drop_len);
 	q->cstats.drop_count = 0;
+	q->cstats.drop_len = 0;
 
 	sch_tree_unlock(sch);
 	return 0;
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 089f3b667d36..d783d7cc3348 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -895,9 +895,10 @@ static void
 hfsc_purge_queue(struct Qdisc *sch, struct hfsc_class *cl)
 {
 	unsigned int len = cl->qdisc->q.qlen;
+	unsigned int backlog = cl->qdisc->qstats.backlog;
 
 	qdisc_reset(cl->qdisc);
-	qdisc_tree_decrease_qlen(cl->qdisc, len);
+	qdisc_tree_reduce_backlog(cl->qdisc, len, backlog);
 }
 
 static void
diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
index 86b04e31e60b..13d6f83ec491 100644
--- a/net/sched/sch_hhf.c
+++ b/net/sched/sch_hhf.c
@@ -382,6 +382,7 @@ static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	struct hhf_sched_data *q = qdisc_priv(sch);
 	enum wdrr_bucket_idx idx;
 	struct wdrr_bucket *bucket;
+	unsigned int prev_backlog;
 
 	idx = hhf_classify(skb, sch);
 
@@ -409,6 +410,7 @@ static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	if (++sch->q.qlen <= sch->limit)
 		return NET_XMIT_SUCCESS;
 
+	prev_backlog = sch->qstats.backlog;
 	q->drop_overlimit++;
 	/* Return Congestion Notification only if we dropped a packet from this
 	 * bucket.
@@ -417,7 +419,7 @@ static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		return NET_XMIT_CN;
 
 	/* As we dropped a packet, better let upper stack know this. */
-	qdisc_tree_decrease_qlen(sch, 1);
+	qdisc_tree_reduce_backlog(sch, 1, prev_backlog - sch->qstats.backlog);
 	return NET_XMIT_SUCCESS;
 }
 
@@ -527,7 +529,7 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt)
 {
 	struct hhf_sched_data *q = qdisc_priv(sch);
 	struct nlattr *tb[TCA_HHF_MAX + 1];
-	unsigned int qlen;
+	unsigned int qlen, prev_backlog;
 	int err;
 	u64 non_hh_quantum;
 	u32 new_quantum = q->quantum;
@@ -577,12 +579,14 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt)
 	}
 
 	qlen = sch->q.qlen;
+	prev_backlog = sch->qstats.backlog;
 	while (sch->q.qlen > sch->limit) {
 		struct sk_buff *skb = hhf_dequeue(sch);
 
 		kfree_skb(skb);
 	}
-	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
+	qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen,
+				  prev_backlog - sch->qstats.backlog);
 
 	sch_tree_unlock(sch);
 	return 0;
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 0efbcf358cd0..846a7f98cef9 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1265,7 +1265,6 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg)
 {
 	struct htb_sched *q = qdisc_priv(sch);
 	struct htb_class *cl = (struct htb_class *)arg;
-	unsigned int qlen;
 	struct Qdisc *new_q = NULL;
 	int last_child = 0;
 
@@ -1285,9 +1284,11 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg)
 	sch_tree_lock(sch);
 
 	if (!cl->level) {
-		qlen = cl->un.leaf.q->q.qlen;
+		unsigned int qlen = cl->un.leaf.q->q.qlen;
+		unsigned int backlog = cl->un.leaf.q->qstats.backlog;
+
 		qdisc_reset(cl->un.leaf.q);
-		qdisc_tree_decrease_qlen(cl->un.leaf.q, qlen);
+		qdisc_tree_reduce_backlog(cl->un.leaf.q, qlen, backlog);
 	}
 
 	/* delete from hash and active; remainder in destroy_class */
@@ -1421,10 +1422,11 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 		sch_tree_lock(sch);
 		if (parent && !parent->level) {
 			unsigned int qlen = parent->un.leaf.q->q.qlen;
+			unsigned int backlog = parent->un.leaf.q->qstats.backlog;
 
 			/* turn parent into inner node */
 			qdisc_reset(parent->un.leaf.q);
-			qdisc_tree_decrease_qlen(parent->un.leaf.q, qlen);
+			qdisc_tree_reduce_backlog(parent->un.leaf.q, qlen, backlog);
 			qdisc_destroy(parent->un.leaf.q);
 			if (parent->prio_activity)
 				htb_deactivate(q, parent);
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index a0103a138563..bcdd54bb101c 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -218,7 +218,8 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt)
 		if (q->queues[i] != &noop_qdisc) {
 			struct Qdisc *child = q->queues[i];
 			q->queues[i] = &noop_qdisc;
-			qdisc_tree_decrease_qlen(child, child->q.qlen);
+			qdisc_tree_reduce_backlog(child, child->q.qlen,
+						  child->qstats.backlog);
 			qdisc_destroy(child);
 		}
 	}
@@ -238,8 +239,9 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt)
 				q->queues[i] = child;
 
 				if (old != &noop_qdisc) {
-					qdisc_tree_decrease_qlen(old,
-								 old->q.qlen);
+					qdisc_tree_reduce_backlog(old,
+								  old->q.qlen,
+								  old->qstats.backlog);
 					qdisc_destroy(old);
 				}
 				sch_tree_unlock(sch);
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 0a6ddaf7f561..9640bb39a5d2 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -598,7 +598,8 @@ deliver:
 				if (unlikely(err != NET_XMIT_SUCCESS)) {
 					if (net_xmit_drop_count(err)) {
 						qdisc_qstats_drop(sch);
-						qdisc_tree_decrease_qlen(sch, 1);
+						qdisc_tree_reduce_backlog(sch, 1,
+									  qdisc_pkt_len(skb));
 					}
 				}
 				goto tfifo_dequeue;
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index b783a446d884..71ae3b9629f9 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -183,7 +183,7 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt)
 {
 	struct pie_sched_data *q = qdisc_priv(sch);
 	struct nlattr *tb[TCA_PIE_MAX + 1];
-	unsigned int qlen;
+	unsigned int qlen, dropped = 0;
 	int err;
 
 	if (!opt)
@@ -232,10 +232,11 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt)
 	while (sch->q.qlen > sch->limit) {
 		struct sk_buff *skb = __skb_dequeue(&sch->q);
 
+		dropped += qdisc_pkt_len(skb);
 		qdisc_qstats_backlog_dec(sch, skb);
 		qdisc_drop(skb, sch);
 	}
-	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
+	qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
 
 	sch_tree_unlock(sch);
 	return 0;
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 1b4aaec64a24..fee1b15506b2 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -191,7 +191,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
 		struct Qdisc *child = q->queues[i];
 		q->queues[i] = &noop_qdisc;
 		if (child != &noop_qdisc) {
-			qdisc_tree_decrease_qlen(child, child->q.qlen);
+			qdisc_tree_reduce_backlog(child, child->q.qlen, child->qstats.backlog);
 			qdisc_destroy(child);
 		}
 	}
@@ -210,8 +210,9 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
 				q->queues[i] = child;
 
 				if (old != &noop_qdisc) {
-					qdisc_tree_decrease_qlen(old,
-								 old->q.qlen);
+					qdisc_tree_reduce_backlog(old,
+								  old->q.qlen,
+								  old->qstats.backlog);
 					qdisc_destroy(old);
 				}
 				sch_tree_unlock(sch);
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index b5c52caf2e73..8d2d8d953432 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -220,9 +220,10 @@ static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid)
 static void qfq_purge_queue(struct qfq_class *cl)
 {
 	unsigned int len = cl->qdisc->q.qlen;
+	unsigned int backlog = cl->qdisc->qstats.backlog;
 
 	qdisc_reset(cl->qdisc);
-	qdisc_tree_decrease_qlen(cl->qdisc, len);
+	qdisc_tree_reduce_backlog(cl->qdisc, len, backlog);
 }
 
 static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = {
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index d5abcee454d8..8c0508c0e287 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -210,7 +210,8 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt)
 	q->flags = ctl->flags;
 	q->limit = ctl->limit;
 	if (child) {
-		qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
+		qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
+					  q->qdisc->qstats.backlog);
 		qdisc_destroy(q->qdisc);
 		q->qdisc = child;
 	}
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index 0e74e55fda15..c69611640fa5 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -510,7 +510,8 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt)
 
 	sch_tree_lock(sch);
 
-	qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
+	qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
+				  q->qdisc->qstats.backlog);
 	qdisc_destroy(q->qdisc);
 	q->qdisc = child;
 
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 3abab534eb5c..498f0a2cb47f 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -346,7 +346,7 @@ static int
 sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 {
 	struct sfq_sched_data *q = qdisc_priv(sch);
-	unsigned int hash;
+	unsigned int hash, dropped;
 	sfq_index x, qlen;
 	struct sfq_slot *slot;
 	int uninitialized_var(ret);
@@ -461,7 +461,7 @@ enqueue:
 		return NET_XMIT_SUCCESS;
 
 	qlen = slot->qlen;
-	sfq_drop(sch);
+	dropped = sfq_drop(sch);
 	/* Return Congestion Notification only if we dropped a packet
 	 * from this flow.
 	 */
@@ -469,7 +469,7 @@ enqueue:
 		return NET_XMIT_CN;
 
 	/* As we dropped a packet, better let upper stack know this */
-	qdisc_tree_decrease_qlen(sch, 1);
+	qdisc_tree_reduce_backlog(sch, 1, dropped);
 	return NET_XMIT_SUCCESS;
 }
 
@@ -537,6 +537,7 @@ static void sfq_rehash(struct Qdisc *sch)
 	struct sfq_slot *slot;
 	struct sk_buff_head list;
 	int dropped = 0;
+	unsigned int drop_len = 0;
 
 	__skb_queue_head_init(&list);
 
@@ -565,6 +566,7 @@ static void sfq_rehash(struct Qdisc *sch)
 			if (x >= SFQ_MAX_FLOWS) {
 drop:
 				qdisc_qstats_backlog_dec(sch, skb);
+				drop_len += qdisc_pkt_len(skb);
 				kfree_skb(skb);
 				dropped++;
 				continue;
@@ -594,7 +596,7 @@ drop:
 		}
 	}
 	sch->q.qlen -= dropped;
-	qdisc_tree_decrease_qlen(sch, dropped);
+	qdisc_tree_reduce_backlog(sch, dropped, drop_len);
 }
 
 static void sfq_perturbation(unsigned long arg)
@@ -618,7 +620,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
 	struct sfq_sched_data *q = qdisc_priv(sch);
 	struct tc_sfq_qopt *ctl = nla_data(opt);
 	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
-	unsigned int qlen;
+	unsigned int qlen, dropped = 0;
 	struct red_parms *p = NULL;
 
 	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
@@ -667,8 +669,8 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
 
 	qlen = sch->q.qlen;
 	while (sch->q.qlen > q->limit)
-		sfq_drop(sch);
-	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
+		dropped += sfq_drop(sch);
+	qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
 
 	del_timer(&q->perturb_timer);
 	if (q->perturb_period) {
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 56a1aef3495f..c2fbde742f37 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -160,6 +160,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
 	struct tbf_sched_data *q = qdisc_priv(sch);
 	struct sk_buff *segs, *nskb;
 	netdev_features_t features = netif_skb_features(skb);
+	unsigned int len = 0, prev_len = qdisc_pkt_len(skb);
 	int ret, nb;
 
 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
@@ -172,6 +173,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
 		nskb = segs->next;
 		segs->next = NULL;
 		qdisc_skb_cb(segs)->pkt_len = segs->len;
+		len += segs->len;
 		ret = qdisc_enqueue(segs, q->qdisc);
 		if (ret != NET_XMIT_SUCCESS) {
 			if (net_xmit_drop_count(ret))
@@ -183,7 +185,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
 	}
 	sch->q.qlen += nb;
 	if (nb > 1)
-		qdisc_tree_decrease_qlen(sch, 1 - nb);
+		qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
 	consume_skb(skb);
 	return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
 }
@@ -399,7 +401,8 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
 
 	sch_tree_lock(sch);
 	if (child) {
-		qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
+		qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
+					  q->qdisc->qstats.backlog);
 		qdisc_destroy(q->qdisc);
 		q->qdisc = child;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 431e3a8e36a05a37126f34b41aa3a5a6456af04e Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Thu, 25 Feb 2016 14:55:02 -0800
Subject: sch_htb: update backlog as well

We saw qlen!=0 but backlog==0 on our production machine:

qdisc htb 1: dev eth0 root refcnt 2 r2q 10 default 1 direct_packets_stat 0 ver 3.17
 Sent 172680457356 bytes 222469449 pkt (dropped 0, overlimits 123575834 requeues 0)
 backlog 0b 72p requeues 0

The problem is we only count qlen for HTB qdisc but not backlog.
We need to update backlog too when we update qlen, so that we
can at least know the average packet length.

Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_htb.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 846a7f98cef9..87b02ed3d5f2 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -600,6 +600,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		htb_activate(q, cl);
 	}
 
+	qdisc_qstats_backlog_inc(sch, skb);
 	sch->q.qlen++;
 	return NET_XMIT_SUCCESS;
 }
@@ -889,6 +890,7 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch)
 ok:
 		qdisc_bstats_update(sch, skb);
 		qdisc_unthrottled(sch);
+		qdisc_qstats_backlog_dec(sch, skb);
 		sch->q.qlen--;
 		return skb;
 	}
@@ -955,6 +957,7 @@ static unsigned int htb_drop(struct Qdisc *sch)
 			unsigned int len;
 			if (cl->un.leaf.q->ops->drop &&
 			    (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) {
+				sch->qstats.backlog -= len;
 				sch->q.qlen--;
 				if (!cl->un.leaf.q->q.qlen)
 					htb_deactivate(q, cl);
@@ -984,12 +987,12 @@ static void htb_reset(struct Qdisc *sch)
 			}
 			cl->prio_activity = 0;
 			cl->cmode = HTB_CAN_SEND;
-
 		}
 	}
 	qdisc_watchdog_cancel(&q->watchdog);
 	__skb_queue_purge(&q->direct_queue);
 	sch->q.qlen = 0;
+	sch->qstats.backlog = 0;
 	memset(q->hlevel, 0, sizeof(q->hlevel));
 	memset(q->row_mask, 0, sizeof(q->row_mask));
 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
-- 
cgit v1.2.3-59-g8ed1b


From bdf17661f63a79c3cb4209b970b1cc39e34f7543 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Thu, 25 Feb 2016 14:55:03 -0800
Subject: sch_dsmark: update backlog as well

Similarly, we need to update backlog too when we update qlen.

Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_dsmark.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index cfddb1c635c3..d0dff0cd8186 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -258,6 +258,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		return err;
 	}
 
+	qdisc_qstats_backlog_inc(sch, skb);
 	sch->q.qlen++;
 
 	return NET_XMIT_SUCCESS;
@@ -280,6 +281,7 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
 		return NULL;
 
 	qdisc_bstats_update(sch, skb);
+	qdisc_qstats_backlog_dec(sch, skb);
 	sch->q.qlen--;
 
 	index = skb->tc_index & (p->indices - 1);
@@ -395,6 +397,7 @@ static void dsmark_reset(struct Qdisc *sch)
 
 	pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
 	qdisc_reset(p->q);
+	sch->qstats.backlog = 0;
 	sch->q.qlen = 0;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 45493d47c3db862263583400c1f721f13f2a0610 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 26 Feb 2016 10:45:38 +0100
Subject: bridge: notify enslaved devices of headroom changes

On bridge needed_headroom changes, the enslaved devices are
notified via the ndo_set_rx_headroom method

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_if.c | 37 +++++++++++++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index b37a1cc97d98..a73df3315df9 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -223,6 +223,31 @@ static void destroy_nbp_rcu(struct rcu_head *head)
 	destroy_nbp(p);
 }
 
+static unsigned get_max_headroom(struct net_bridge *br)
+{
+	unsigned max_headroom = 0;
+	struct net_bridge_port *p;
+
+	list_for_each_entry(p, &br->port_list, list) {
+		unsigned dev_headroom = netdev_get_fwd_headroom(p->dev);
+
+		if (dev_headroom > max_headroom)
+			max_headroom = dev_headroom;
+	}
+
+	return max_headroom;
+}
+
+static void update_headroom(struct net_bridge *br, int new_hr)
+{
+	struct net_bridge_port *p;
+
+	list_for_each_entry(p, &br->port_list, list)
+		netdev_set_rx_headroom(p->dev, new_hr);
+
+	br->dev->needed_headroom = new_hr;
+}
+
 /* Delete port(interface) from bridge is done in two steps.
  * via RCU. First step, marks device as down. That deletes
  * all the timers and stops new packets from flowing through.
@@ -248,6 +273,9 @@ static void del_nbp(struct net_bridge_port *p)
 	br_ifinfo_notify(RTM_DELLINK, p);
 
 	list_del_rcu(&p->list);
+	if (netdev_get_fwd_headroom(dev) == br->dev->needed_headroom)
+		update_headroom(br, get_max_headroom(br));
+	netdev_reset_rx_headroom(dev);
 
 	nbp_vlan_flush(p);
 	br_fdb_delete_by_port(br, p, 0, 1);
@@ -438,6 +466,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
 {
 	struct net_bridge_port *p;
 	int err = 0;
+	unsigned br_hr, dev_hr;
 	bool changed_addr;
 
 	/* Don't allow bridging non-ethernet like devices, or DSA-enabled
@@ -505,8 +534,12 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
 
 	netdev_update_features(br->dev);
 
-	if (br->dev->needed_headroom < dev->needed_headroom)
-		br->dev->needed_headroom = dev->needed_headroom;
+	br_hr = br->dev->needed_headroom;
+	dev_hr = netdev_get_fwd_headroom(dev);
+	if (br_hr < dev_hr)
+		update_headroom(br, dev_hr);
+	else
+		netdev_set_rx_headroom(dev, br_hr);
 
 	if (br_fdb_insert(br, p, dev->dev_addr, 0))
 		netdev_err(dev, "failed insert local address bridge forwarding table\n");
-- 
cgit v1.2.3-59-g8ed1b


From 3a927bc7cf9d0fbe8f4a8189dd5f8440228f64e7 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 26 Feb 2016 10:45:39 +0100
Subject: ovs: propagate per dp max headroom to all vports

This patch implements bookkeeping support to compute the maximum
headroom for all the devices in each datapath. When said value
changes, the underlying devs are notified via the
ndo_set_rx_headroom method.

This also increases the internal vports xmit performance.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/datapath.c           | 40 ++++++++++++++++++++++++++++++++++++
 net/openvswitch/datapath.h           |  4 ++++
 net/openvswitch/vport-internal_dev.c | 10 ++++++++-
 3 files changed, 53 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index c4e8455d5d56..e6a7d494df24 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1908,6 +1908,29 @@ static struct vport *lookup_vport(struct net *net,
 		return ERR_PTR(-EINVAL);
 }
 
+/* Called with ovs_mutex */
+static void update_headroom(struct datapath *dp)
+{
+	unsigned dev_headroom, max_headroom = 0;
+	struct net_device *dev;
+	struct vport *vport;
+	int i;
+
+	for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
+		hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) {
+			dev = vport->dev;
+			dev_headroom = netdev_get_fwd_headroom(dev);
+			if (dev_headroom > max_headroom)
+				max_headroom = dev_headroom;
+		}
+	}
+
+	dp->max_headroom = max_headroom;
+	for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++)
+		hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node)
+			netdev_set_rx_headroom(vport->dev, max_headroom);
+}
+
 static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
 {
 	struct nlattr **a = info->attrs;
@@ -1973,6 +1996,12 @@ restart:
 
 	err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
 				      info->snd_seq, 0, OVS_VPORT_CMD_NEW);
+
+	if (netdev_get_fwd_headroom(vport->dev) > dp->max_headroom)
+		update_headroom(dp);
+	else
+		netdev_set_rx_headroom(vport->dev, dp->max_headroom);
+
 	BUG_ON(err < 0);
 	ovs_unlock();
 
@@ -2039,8 +2068,10 @@ exit_unlock_free:
 
 static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
 {
+	bool must_update_headroom = false;
 	struct nlattr **a = info->attrs;
 	struct sk_buff *reply;
+	struct datapath *dp;
 	struct vport *vport;
 	int err;
 
@@ -2062,7 +2093,16 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
 	err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
 				      info->snd_seq, 0, OVS_VPORT_CMD_DEL);
 	BUG_ON(err < 0);
+
+	/* the vport deletion may trigger dp headroom update */
+	dp = vport->dp;
+	if (netdev_get_fwd_headroom(vport->dev) == dp->max_headroom)
+		must_update_headroom = true;
+	netdev_reset_rx_headroom(vport->dev);
 	ovs_dp_detach_port(vport);
+
+	if (must_update_headroom)
+		update_headroom(dp);
 	ovs_unlock();
 
 	ovs_notify(&dp_vport_genl_family, reply, info);
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 67bdecd9fdc1..427e39a045cf 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -68,6 +68,8 @@ struct dp_stats_percpu {
  * ovs_mutex and RCU.
  * @stats_percpu: Per-CPU datapath statistics.
  * @net: Reference to net namespace.
+ * @max_headroom: the maximum headroom of all vports in this datapath; it will
+ * be used by all the internal vports in this dp.
  *
  * Context: See the comment on locking at the top of datapath.c for additional
  * locking information.
@@ -89,6 +91,8 @@ struct datapath {
 	possible_net_t net;
 
 	u32 user_features;
+
+	u32 max_headroom;
 };
 
 /**
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index ec76398a792f..83a5534abd31 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -138,6 +138,11 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
 	return stats;
 }
 
+void internal_set_rx_headroom(struct net_device *dev, int new_hr)
+{
+	dev->needed_headroom = new_hr;
+}
+
 static const struct net_device_ops internal_dev_netdev_ops = {
 	.ndo_open = internal_dev_open,
 	.ndo_stop = internal_dev_stop,
@@ -145,6 +150,7 @@ static const struct net_device_ops internal_dev_netdev_ops = {
 	.ndo_set_mac_address = eth_mac_addr,
 	.ndo_change_mtu = internal_dev_change_mtu,
 	.ndo_get_stats64 = internal_get_stats,
+	.ndo_set_rx_headroom = internal_set_rx_headroom,
 };
 
 static struct rtnl_link_ops internal_dev_link_ops __read_mostly = {
@@ -158,7 +164,8 @@ static void do_setup(struct net_device *netdev)
 	netdev->netdev_ops = &internal_dev_netdev_ops;
 
 	netdev->priv_flags &= ~IFF_TX_SKB_SHARING;
-	netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH;
+	netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH |
+			      IFF_PHONY_HEADROOM;
 	netdev->destructor = internal_dev_destructor;
 	netdev->ethtool_ops = &internal_dev_ethtool_ops;
 	netdev->rtnl_link_ops = &internal_dev_link_ops;
@@ -199,6 +206,7 @@ static struct vport *internal_dev_create(const struct vport_parms *parms)
 		err = -ENOMEM;
 		goto error_free_netdev;
 	}
+	vport->dev->needed_headroom = vport->dp->max_headroom;
 
 	dev_net_set(vport->dev, ovs_dp_get_net(vport->dp));
 	internal_dev = internal_dev_priv(vport->dev);
-- 
cgit v1.2.3-59-g8ed1b


From 6843e7a2abe7cac10c19702ffec90018df6f040d Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Fri, 26 Feb 2016 07:53:49 -0800
Subject: net: sched: consolidate offload decision in cls_u32

The offload decision was originally very basic and tied to if the dev
implemented the appropriate ndo op hook. The next step is to allow
the user to more flexibly define if any paticular rule should be
offloaded or not. In order to have this logic in one function lift
the current check into a helper routine tc_should_offload().

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h | 5 +++++
 net/sched/cls_u32.c   | 8 ++++----
 2 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 2121df574262..e64d20b81047 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -392,4 +392,9 @@ struct tc_cls_u32_offload {
 	};
 };
 
+static inline bool tc_should_offload(struct net_device *dev)
+{
+	return dev->netdev_ops->ndo_setup_tc;
+}
+
 #endif
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index d54bc942ea87..24e888b9b728 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -434,7 +434,7 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
 	offload.type = TC_SETUP_CLSU32;
 	offload.cls_u32 = &u32_offload;
 
-	if (dev->netdev_ops->ndo_setup_tc) {
+	if (tc_should_offload(dev)) {
 		offload.cls_u32->command = TC_CLSU32_DELETE_KNODE;
 		offload.cls_u32->knode.handle = handle;
 		dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
@@ -451,7 +451,7 @@ static void u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
 	offload.type = TC_SETUP_CLSU32;
 	offload.cls_u32 = &u32_offload;
 
-	if (dev->netdev_ops->ndo_setup_tc) {
+	if (tc_should_offload(dev)) {
 		offload.cls_u32->command = TC_CLSU32_NEW_HNODE;
 		offload.cls_u32->hnode.divisor = h->divisor;
 		offload.cls_u32->hnode.handle = h->handle;
@@ -471,7 +471,7 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
 	offload.type = TC_SETUP_CLSU32;
 	offload.cls_u32 = &u32_offload;
 
-	if (dev->netdev_ops->ndo_setup_tc) {
+	if (tc_should_offload(dev)) {
 		offload.cls_u32->command = TC_CLSU32_DELETE_HNODE;
 		offload.cls_u32->hnode.divisor = h->divisor;
 		offload.cls_u32->hnode.handle = h->handle;
@@ -491,7 +491,7 @@ static void u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n)
 	offload.type = TC_SETUP_CLSU32;
 	offload.cls_u32 = &u32_offload;
 
-	if (dev->netdev_ops->ndo_setup_tc) {
+	if (tc_should_offload(dev)) {
 		offload.cls_u32->command = TC_CLSU32_REPLACE_KNODE;
 		offload.cls_u32->knode.handle = n->handle;
 		offload.cls_u32->knode.fshift = n->fshift;
-- 
cgit v1.2.3-59-g8ed1b


From 9e8ce79cd711d4dfe09d8bba6822cd9bb7db96bd Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Fri, 26 Feb 2016 07:54:39 -0800
Subject: net: sched: cls_u32 add bit to specify software only rules

In the initial implementation the only way to stop a rule from being
inserted into the hardware table was via the device feature flag.
However this doesn't work well when working on an end host system
where packets are expect to hit both the hardware and software
datapaths.

For example we can imagine a rule that will match an IP address and
increment a field. If we install this rule in both hardware and
software we may increment the field twice. To date we have only
added support for the drop action so we have been able to ignore
these cases. But as we extend the action support we will hit this
example plus more such cases. Arguably these are not even corner
cases in many working systems these cases will be common.

To avoid forcing the driver to always abort (i.e. the above example)
this patch adds a flag to add a rule in software only. A careful
user can use this flag to build software and hardware datapaths
that work together. One example we have found particularly useful
is to use hardware resources to set the skb->mark on the skb when
the match may be expensive to run in software but a mark lookup
in a hash table is cheap. The idea here is hardware can do in one
lookup what the u32 classifier may need to traverse multiple lists
and hash tables to compute. The flag is only passed down on inserts.
On deletion to avoid stale references in hardware we always try
to remove a rule if it exists.

The flags field is part of the classifier specific options. Although
it is tempting to lift this into the generic structure doing this
proves difficult do to how the tc netlink attributes are implemented
along with how the dump/change routines are called. There is also
precedence for putting seemingly generic pieces in the specific
classifier options such as TCA_U32_POLICE, TCA_U32_ACT, etc. So
although not ideal I've left FLAGS in the u32 options as well as it
simplifies the code greatly and user space has already learned how
to manage these bits ala 'tc' tool.

Another thing if trying to update a rule we require the flags to
be unchanged. This is to force user space, software u32 and
the hardware u32 to keep in sync. Thanks to Simon Horman for
catching this case.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h        | 13 +++++++++++--
 include/uapi/linux/pkt_cls.h |  1 +
 net/sched/cls_u32.c          | 37 +++++++++++++++++++++++++++----------
 3 files changed, 39 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 6096e96fb78b..bea14eee373e 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -392,12 +392,21 @@ struct tc_cls_u32_offload {
 	};
 };
 
-static inline bool tc_should_offload(struct net_device *dev)
+/* tca flags definitions */
+#define TCA_CLS_FLAGS_SKIP_HW 1
+
+static inline bool tc_should_offload(struct net_device *dev, u32 flags)
 {
 	if (!(dev->features & NETIF_F_HW_TC))
 		return false;
 
-	return dev->netdev_ops->ndo_setup_tc;
+	if (flags & TCA_CLS_FLAGS_SKIP_HW)
+		return false;
+
+	if (!dev->netdev_ops->ndo_setup_tc)
+		return false;
+
+	return true;
 }
 
 #endif
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 439873775d49..9874f5680926 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -172,6 +172,7 @@ enum {
 	TCA_U32_INDEV,
 	TCA_U32_PCNT,
 	TCA_U32_MARK,
+	TCA_U32_FLAGS,
 	__TCA_U32_MAX
 };
 
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 24e888b9b728..563cdad76448 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -59,6 +59,7 @@ struct tc_u_knode {
 #ifdef CONFIG_CLS_U32_PERF
 	struct tc_u32_pcnt __percpu *pf;
 #endif
+	u32			flags;
 #ifdef CONFIG_CLS_U32_MARK
 	u32			val;
 	u32			mask;
@@ -434,7 +435,7 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
 	offload.type = TC_SETUP_CLSU32;
 	offload.cls_u32 = &u32_offload;
 
-	if (tc_should_offload(dev)) {
+	if (tc_should_offload(dev, 0)) {
 		offload.cls_u32->command = TC_CLSU32_DELETE_KNODE;
 		offload.cls_u32->knode.handle = handle;
 		dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
@@ -442,7 +443,9 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
 	}
 }
 
-static void u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
+static void u32_replace_hw_hnode(struct tcf_proto *tp,
+				 struct tc_u_hnode *h,
+				 u32 flags)
 {
 	struct net_device *dev = tp->q->dev_queue->dev;
 	struct tc_cls_u32_offload u32_offload = {0};
@@ -451,7 +454,7 @@ static void u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
 	offload.type = TC_SETUP_CLSU32;
 	offload.cls_u32 = &u32_offload;
 
-	if (tc_should_offload(dev)) {
+	if (tc_should_offload(dev, flags)) {
 		offload.cls_u32->command = TC_CLSU32_NEW_HNODE;
 		offload.cls_u32->hnode.divisor = h->divisor;
 		offload.cls_u32->hnode.handle = h->handle;
@@ -471,7 +474,7 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
 	offload.type = TC_SETUP_CLSU32;
 	offload.cls_u32 = &u32_offload;
 
-	if (tc_should_offload(dev)) {
+	if (tc_should_offload(dev, 0)) {
 		offload.cls_u32->command = TC_CLSU32_DELETE_HNODE;
 		offload.cls_u32->hnode.divisor = h->divisor;
 		offload.cls_u32->hnode.handle = h->handle;
@@ -482,7 +485,9 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
 	}
 }
 
-static void u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n)
+static void u32_replace_hw_knode(struct tcf_proto *tp,
+				 struct tc_u_knode *n,
+				 u32 flags)
 {
 	struct net_device *dev = tp->q->dev_queue->dev;
 	struct tc_cls_u32_offload u32_offload = {0};
@@ -491,7 +496,7 @@ static void u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n)
 	offload.type = TC_SETUP_CLSU32;
 	offload.cls_u32 = &u32_offload;
 
-	if (tc_should_offload(dev)) {
+	if (tc_should_offload(dev, flags)) {
 		offload.cls_u32->command = TC_CLSU32_REPLACE_KNODE;
 		offload.cls_u32->knode.handle = n->handle;
 		offload.cls_u32->knode.fshift = n->fshift;
@@ -679,6 +684,7 @@ static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = {
 	[TCA_U32_SEL]		= { .len = sizeof(struct tc_u32_sel) },
 	[TCA_U32_INDEV]		= { .type = NLA_STRING, .len = IFNAMSIZ },
 	[TCA_U32_MARK]		= { .len = sizeof(struct tc_u32_mark) },
+	[TCA_U32_FLAGS]		= { .type = NLA_U32 },
 };
 
 static int u32_set_parms(struct net *net, struct tcf_proto *tp,
@@ -786,6 +792,7 @@ static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp,
 #endif
 	new->fshift = n->fshift;
 	new->res = n->res;
+	new->flags = n->flags;
 	RCU_INIT_POINTER(new->ht_down, n->ht_down);
 
 	/* bump reference count as long as we hold pointer to structure */
@@ -825,7 +832,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 	struct tc_u32_sel *s;
 	struct nlattr *opt = tca[TCA_OPTIONS];
 	struct nlattr *tb[TCA_U32_MAX + 1];
-	u32 htid;
+	u32 htid, flags = 0;
 	int err;
 #ifdef CONFIG_CLS_U32_PERF
 	size_t size;
@@ -838,6 +845,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 	if (err < 0)
 		return err;
 
+	if (tb[TCA_U32_FLAGS])
+		flags = nla_get_u32(tb[TCA_U32_FLAGS]);
+
 	n = (struct tc_u_knode *)*arg;
 	if (n) {
 		struct tc_u_knode *new;
@@ -845,6 +855,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 		if (TC_U32_KEY(n->handle) == 0)
 			return -EINVAL;
 
+		if (n->flags != flags)
+			return -EINVAL;
+
 		new = u32_init_knode(tp, n);
 		if (!new)
 			return -ENOMEM;
@@ -861,7 +874,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 		u32_replace_knode(tp, tp_c, new);
 		tcf_unbind_filter(tp, &n->res);
 		call_rcu(&n->rcu, u32_delete_key_rcu);
-		u32_replace_hw_knode(tp, new);
+		u32_replace_hw_knode(tp, new, flags);
 		return 0;
 	}
 
@@ -889,7 +902,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 		rcu_assign_pointer(tp_c->hlist, ht);
 		*arg = (unsigned long)ht;
 
-		u32_replace_hw_hnode(tp, ht);
+		u32_replace_hw_hnode(tp, ht, flags);
 		return 0;
 	}
 
@@ -940,6 +953,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 	RCU_INIT_POINTER(n->ht_up, ht);
 	n->handle = handle;
 	n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0;
+	n->flags = flags;
 	tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE);
 	n->tp = tp;
 
@@ -972,7 +986,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 
 		RCU_INIT_POINTER(n->next, pins);
 		rcu_assign_pointer(*ins, n);
-		u32_replace_hw_knode(tp, n);
+		u32_replace_hw_knode(tp, n, flags);
 		*arg = (unsigned long)n;
 		return 0;
 	}
@@ -1077,6 +1091,9 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
 		    nla_put_u32(skb, TCA_U32_LINK, ht_down->handle))
 			goto nla_put_failure;
 
+		if (n->flags && nla_put_u32(skb, TCA_U32_FLAGS, n->flags))
+			goto nla_put_failure;
+
 #ifdef CONFIG_CLS_U32_MARK
 		if ((n->val || n->mask)) {
 			struct tc_u32_mark mark = {.val = n->val,
-- 
cgit v1.2.3-59-g8ed1b


From bfcd3a46617209454cfc0947ab093e37fd1e84ef Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Fri, 26 Feb 2016 17:32:23 +0100
Subject: Introduce devlink infrastructure

Introduce devlink infrastructure for drivers to register and expose to
userspace via generic Netlink interface.

There are two basic objects defined:
devlink - one instance for every "parent device", for example switch ASIC
devlink port - one instance for every physical port of the device.

This initial portion implements basic get/dump of objects to userspace.
Also, port splitter and port type setting is implemented.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS                  |   8 +
 include/net/devlink.h        | 140 ++++++++
 include/uapi/linux/devlink.h |  72 +++++
 net/Kconfig                  |   7 +
 net/core/Makefile            |   1 +
 net/core/devlink.c           | 738 +++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 966 insertions(+)
 create mode 100644 include/net/devlink.h
 create mode 100644 include/uapi/linux/devlink.h
 create mode 100644 net/core/devlink.c

(limited to 'net')

diff --git a/MAINTAINERS b/MAINTAINERS
index 12b764f4c93c..e45682745263 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3499,6 +3499,14 @@ F:	include/linux/device-mapper.h
 F:	include/linux/dm-*.h
 F:	include/uapi/linux/dm-*.h
 
+DEVLINK
+M:	Jiri Pirko <jiri@mellanox.com>
+L:	netdev@vger.kernel.org
+S:	Supported
+F:	net/core/devlink.c
+F:	include/net/devlink.h
+F:	include/uapi/linux/devlink.h
+
 DIALOG SEMICONDUCTOR DRIVERS
 M:	Support Opensource <support.opensource@diasemi.com>
 W:	http://www.dialog-semiconductor.com/products
diff --git a/include/net/devlink.h b/include/net/devlink.h
new file mode 100644
index 000000000000..c37d257891d6
--- /dev/null
+++ b/include/net/devlink.h
@@ -0,0 +1,140 @@
+/*
+ * include/net/devlink.h - Network physical device Netlink interface
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+#ifndef _NET_DEVLINK_H_
+#define _NET_DEVLINK_H_
+
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/gfp.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <net/net_namespace.h>
+#include <uapi/linux/devlink.h>
+
+struct devlink_ops;
+
+struct devlink {
+	struct list_head list;
+	struct list_head port_list;
+	const struct devlink_ops *ops;
+	struct device *dev;
+	possible_net_t _net;
+	char priv[0] __aligned(NETDEV_ALIGN);
+};
+
+struct devlink_port {
+	struct list_head list;
+	struct devlink *devlink;
+	unsigned index;
+	bool registered;
+	enum devlink_port_type type;
+	enum devlink_port_type desired_type;
+	void *type_dev;
+	bool split;
+	u32 split_group;
+};
+
+struct devlink_ops {
+	size_t priv_size;
+	int (*port_type_set)(struct devlink_port *devlink_port,
+			     enum devlink_port_type port_type);
+	int (*port_split)(struct devlink *devlink, unsigned int port_index,
+			  unsigned int count);
+	int (*port_unsplit)(struct devlink *devlink, unsigned int port_index);
+};
+
+static inline void *devlink_priv(struct devlink *devlink)
+{
+	BUG_ON(!devlink);
+	return &devlink->priv;
+}
+
+static inline struct devlink *priv_to_devlink(void *priv)
+{
+	BUG_ON(!priv);
+	return container_of(priv, struct devlink, priv);
+}
+
+struct ib_device;
+
+#if IS_ENABLED(CONFIG_NET_DEVLINK)
+
+struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size);
+int devlink_register(struct devlink *devlink, struct device *dev);
+void devlink_unregister(struct devlink *devlink);
+void devlink_free(struct devlink *devlink);
+int devlink_port_register(struct devlink *devlink,
+			  struct devlink_port *devlink_port,
+			  unsigned int port_index);
+void devlink_port_unregister(struct devlink_port *devlink_port);
+void devlink_port_type_eth_set(struct devlink_port *devlink_port,
+			       struct net_device *netdev);
+void devlink_port_type_ib_set(struct devlink_port *devlink_port,
+			      struct ib_device *ibdev);
+void devlink_port_type_clear(struct devlink_port *devlink_port);
+void devlink_port_split_set(struct devlink_port *devlink_port,
+			    u32 split_group);
+
+#else
+
+static inline struct devlink *devlink_alloc(const struct devlink_ops *ops,
+					    size_t priv_size)
+{
+	return kzalloc(sizeof(struct devlink) + priv_size, GFP_KERNEL);
+}
+
+static inline int devlink_register(struct devlink *devlink, struct device *dev)
+{
+	return 0;
+}
+
+static inline void devlink_unregister(struct devlink *devlink)
+{
+}
+
+static inline void devlink_free(struct devlink *devlink)
+{
+	kfree(devlink);
+}
+
+static inline int devlink_port_register(struct devlink *devlink,
+					struct devlink_port *devlink_port,
+					unsigned int port_index)
+{
+	return 0;
+}
+
+static inline void devlink_port_unregister(struct devlink_port *devlink_port)
+{
+}
+
+static inline void devlink_port_type_eth_set(struct devlink_port *devlink_port,
+					     struct net_device *netdev)
+{
+}
+
+static inline void devlink_port_type_ib_set(struct devlink_port *devlink_port,
+					    struct ib_device *ibdev)
+{
+}
+
+static inline void devlink_port_type_clear(struct devlink_port *devlink_port)
+{
+}
+
+static inline void devlink_port_split_set(struct devlink_port *devlink_port,
+					  u32 split_group)
+{
+}
+
+#endif
+
+#endif /* _NET_DEVLINK_H_ */
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
new file mode 100644
index 000000000000..c9fee5781eb1
--- /dev/null
+++ b/include/uapi/linux/devlink.h
@@ -0,0 +1,72 @@
+/*
+ * include/uapi/linux/devlink.h - Network physical device Netlink interface
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef _UAPI_LINUX_DEVLINK_H_
+#define _UAPI_LINUX_DEVLINK_H_
+
+#define DEVLINK_GENL_NAME "devlink"
+#define DEVLINK_GENL_VERSION 0x1
+#define DEVLINK_GENL_MCGRP_CONFIG_NAME "config"
+
+enum devlink_command {
+	/* don't change the order or add anything between, this is ABI! */
+	DEVLINK_CMD_UNSPEC,
+
+	DEVLINK_CMD_GET,		/* can dump */
+	DEVLINK_CMD_SET,
+	DEVLINK_CMD_NEW,
+	DEVLINK_CMD_DEL,
+
+	DEVLINK_CMD_PORT_GET,		/* can dump */
+	DEVLINK_CMD_PORT_SET,
+	DEVLINK_CMD_PORT_NEW,
+	DEVLINK_CMD_PORT_DEL,
+
+	DEVLINK_CMD_PORT_SPLIT,
+	DEVLINK_CMD_PORT_UNSPLIT,
+
+	/* add new commands above here */
+
+	__DEVLINK_CMD_MAX,
+	DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1
+};
+
+enum devlink_port_type {
+	DEVLINK_PORT_TYPE_NOTSET,
+	DEVLINK_PORT_TYPE_AUTO,
+	DEVLINK_PORT_TYPE_ETH,
+	DEVLINK_PORT_TYPE_IB,
+};
+
+enum devlink_attr {
+	/* don't change the order or add anything between, this is ABI! */
+	DEVLINK_ATTR_UNSPEC,
+
+	/* bus name + dev name together are a handle for devlink entity */
+	DEVLINK_ATTR_BUS_NAME,			/* string */
+	DEVLINK_ATTR_DEV_NAME,			/* string */
+
+	DEVLINK_ATTR_PORT_INDEX,		/* u32 */
+	DEVLINK_ATTR_PORT_TYPE,			/* u16 */
+	DEVLINK_ATTR_PORT_DESIRED_TYPE,		/* u16 */
+	DEVLINK_ATTR_PORT_NETDEV_IFINDEX,	/* u32 */
+	DEVLINK_ATTR_PORT_NETDEV_NAME,		/* string */
+	DEVLINK_ATTR_PORT_IBDEV_NAME,		/* string */
+	DEVLINK_ATTR_PORT_SPLIT_COUNT,		/* u32 */
+	DEVLINK_ATTR_PORT_SPLIT_GROUP,		/* u32 */
+
+	/* add new attributes above here, update the policy in devlink.c */
+
+	__DEVLINK_ATTR_MAX,
+	DEVLINK_ATTR_MAX = __DEVLINK_ATTR_MAX - 1
+};
+
+#endif /* _UAPI_LINUX_DEVLINK_H_ */
diff --git a/net/Kconfig b/net/Kconfig
index b80efecfc1a0..6c9cfb0d7639 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -396,6 +396,13 @@ config DST_CACHE
 	bool "dst cache"
 	default n
 
+config NET_DEVLINK
+	tristate "Network physical/parent device Netlink interface"
+	help
+	  Network physical/parent device Netlink interface provides
+	  infrastructure to support access to physical chip-wide config and
+	  monitoring.
+
 endif   # if NET
 
 # Used by archs to tell that they support BPF_JIT
diff --git a/net/core/Makefile b/net/core/Makefile
index 7a8fb8aef992..014422e2561f 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -25,3 +25,4 @@ obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
 obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
 obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
 obj-$(CONFIG_DST_CACHE) += dst_cache.o
+obj-$(CONFIG_NET_DEVLINK) += devlink.o
diff --git a/net/core/devlink.c b/net/core/devlink.c
new file mode 100644
index 000000000000..590fa561cb7f
--- /dev/null
+++ b/net/core/devlink.c
@@ -0,0 +1,738 @@
+/*
+ * net/core/devlink.c - Network physical/parent device Netlink interface
+ *
+ * Heavily inspired by net/wireless/
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/gfp.h>
+#include <linux/device.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <rdma/ib_verbs.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+#include <net/rtnetlink.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/devlink.h>
+
+static LIST_HEAD(devlink_list);
+
+/* devlink_mutex
+ *
+ * An overall lock guarding every operation coming from userspace.
+ * It also guards devlink devices list and it is taken when
+ * driver registers/unregisters it.
+ */
+static DEFINE_MUTEX(devlink_mutex);
+
+/* devlink_port_mutex
+ *
+ * Shared lock to guard lists of ports in all devlink devices.
+ */
+static DEFINE_MUTEX(devlink_port_mutex);
+
+static struct net *devlink_net(const struct devlink *devlink)
+{
+	return read_pnet(&devlink->_net);
+}
+
+static void devlink_net_set(struct devlink *devlink, struct net *net)
+{
+	write_pnet(&devlink->_net, net);
+}
+
+static struct devlink *devlink_get_from_attrs(struct net *net,
+					      struct nlattr **attrs)
+{
+	struct devlink *devlink;
+	char *busname;
+	char *devname;
+
+	if (!attrs[DEVLINK_ATTR_BUS_NAME] || !attrs[DEVLINK_ATTR_DEV_NAME])
+		return ERR_PTR(-EINVAL);
+
+	busname = nla_data(attrs[DEVLINK_ATTR_BUS_NAME]);
+	devname = nla_data(attrs[DEVLINK_ATTR_DEV_NAME]);
+
+	list_for_each_entry(devlink, &devlink_list, list) {
+		if (strcmp(devlink->dev->bus->name, busname) == 0 &&
+		    strcmp(dev_name(devlink->dev), devname) == 0 &&
+		    net_eq(devlink_net(devlink), net))
+			return devlink;
+	}
+
+	return ERR_PTR(-ENODEV);
+}
+
+static struct devlink *devlink_get_from_info(struct genl_info *info)
+{
+	return devlink_get_from_attrs(genl_info_net(info), info->attrs);
+}
+
+static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink,
+						      int port_index)
+{
+	struct devlink_port *devlink_port;
+
+	list_for_each_entry(devlink_port, &devlink->port_list, list) {
+		if (devlink_port->index == port_index)
+			return devlink_port;
+	}
+	return NULL;
+}
+
+static bool devlink_port_index_exists(struct devlink *devlink, int port_index)
+{
+	return devlink_port_get_by_index(devlink, port_index);
+}
+
+static struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink,
+							struct nlattr **attrs)
+{
+	if (attrs[DEVLINK_ATTR_PORT_INDEX]) {
+		u32 port_index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]);
+		struct devlink_port *devlink_port;
+
+		devlink_port = devlink_port_get_by_index(devlink, port_index);
+		if (!devlink_port)
+			return ERR_PTR(-ENODEV);
+		return devlink_port;
+	}
+	return ERR_PTR(-EINVAL);
+}
+
+static struct devlink_port *devlink_port_get_from_info(struct devlink *devlink,
+						       struct genl_info *info)
+{
+	return devlink_port_get_from_attrs(devlink, info->attrs);
+}
+
+#define DEVLINK_NL_FLAG_NEED_PORT	BIT(0)
+
+static int devlink_nl_pre_doit(const struct genl_ops *ops,
+			       struct sk_buff *skb, struct genl_info *info)
+{
+	struct devlink *devlink;
+
+	mutex_lock(&devlink_mutex);
+	devlink = devlink_get_from_info(info);
+	if (IS_ERR(devlink)) {
+		mutex_unlock(&devlink_mutex);
+		return PTR_ERR(devlink);
+	}
+	info->user_ptr[0] = devlink;
+	if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) {
+		struct devlink_port *devlink_port;
+
+		mutex_lock(&devlink_port_mutex);
+		devlink_port = devlink_port_get_from_info(devlink, info);
+		if (IS_ERR(devlink_port)) {
+			mutex_unlock(&devlink_port_mutex);
+			mutex_unlock(&devlink_mutex);
+			return PTR_ERR(devlink_port);
+		}
+		info->user_ptr[1] = devlink_port;
+	}
+	return 0;
+}
+
+static void devlink_nl_post_doit(const struct genl_ops *ops,
+				 struct sk_buff *skb, struct genl_info *info)
+{
+	if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT)
+		mutex_unlock(&devlink_port_mutex);
+	mutex_unlock(&devlink_mutex);
+}
+
+static struct genl_family devlink_nl_family = {
+	.id		= GENL_ID_GENERATE,
+	.name		= DEVLINK_GENL_NAME,
+	.version	= DEVLINK_GENL_VERSION,
+	.maxattr	= DEVLINK_ATTR_MAX,
+	.netnsok	= true,
+	.pre_doit	= devlink_nl_pre_doit,
+	.post_doit	= devlink_nl_post_doit,
+};
+
+enum devlink_multicast_groups {
+	DEVLINK_MCGRP_CONFIG,
+};
+
+static const struct genl_multicast_group devlink_nl_mcgrps[] = {
+	[DEVLINK_MCGRP_CONFIG] = { .name = DEVLINK_GENL_MCGRP_CONFIG_NAME },
+};
+
+static int devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink)
+{
+	if (nla_put_string(msg, DEVLINK_ATTR_BUS_NAME, devlink->dev->bus->name))
+		return -EMSGSIZE;
+	if (nla_put_string(msg, DEVLINK_ATTR_DEV_NAME, dev_name(devlink->dev)))
+		return -EMSGSIZE;
+	return 0;
+}
+
+static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink,
+			   enum devlink_command cmd, u32 portid,
+			   u32 seq, int flags)
+{
+	void *hdr;
+
+	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (devlink_nl_put_handle(msg, devlink))
+		goto nla_put_failure;
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+static void devlink_notify(struct devlink *devlink, enum devlink_command cmd)
+{
+	struct sk_buff *msg;
+	int err;
+
+	WARN_ON(cmd != DEVLINK_CMD_NEW && cmd != DEVLINK_CMD_DEL);
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return;
+
+	err = devlink_nl_fill(msg, devlink, cmd, 0, 0, 0);
+	if (err) {
+		nlmsg_free(msg);
+		return;
+	}
+
+	genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+				msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+}
+
+static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
+				struct devlink_port *devlink_port,
+				enum devlink_command cmd, u32 portid,
+				u32 seq, int flags)
+{
+	void *hdr;
+
+	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (devlink_nl_put_handle(msg, devlink))
+		goto nla_put_failure;
+	if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
+		goto nla_put_failure;
+	if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type))
+		goto nla_put_failure;
+	if (devlink_port->desired_type != DEVLINK_PORT_TYPE_NOTSET &&
+	    nla_put_u16(msg, DEVLINK_ATTR_PORT_DESIRED_TYPE,
+			devlink_port->desired_type))
+		goto nla_put_failure;
+	if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) {
+		struct net_device *netdev = devlink_port->type_dev;
+
+		if (netdev &&
+		    (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX,
+				 netdev->ifindex) ||
+		     nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME,
+				    netdev->name)))
+			goto nla_put_failure;
+	}
+	if (devlink_port->type == DEVLINK_PORT_TYPE_IB) {
+		struct ib_device *ibdev = devlink_port->type_dev;
+
+		if (ibdev &&
+		    nla_put_string(msg, DEVLINK_ATTR_PORT_IBDEV_NAME,
+				   ibdev->name))
+			goto nla_put_failure;
+	}
+	if (devlink_port->split &&
+	    nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP,
+			devlink_port->split_group))
+		goto nla_put_failure;
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+static void devlink_port_notify(struct devlink_port *devlink_port,
+				enum devlink_command cmd)
+{
+	struct devlink *devlink = devlink_port->devlink;
+	struct sk_buff *msg;
+	int err;
+
+	if (!devlink_port->registered)
+		return;
+
+	WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL);
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return;
+
+	err = devlink_nl_port_fill(msg, devlink, devlink_port, cmd, 0, 0, 0);
+	if (err) {
+		nlmsg_free(msg);
+		return;
+	}
+
+	genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+				msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+}
+
+static int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct sk_buff *msg;
+	int err;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW,
+			      info->snd_portid, info->snd_seq, 0);
+	if (err) {
+		nlmsg_free(msg);
+		return err;
+	}
+
+	return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_cmd_get_dumpit(struct sk_buff *msg,
+				     struct netlink_callback *cb)
+{
+	struct devlink *devlink;
+	int start = cb->args[0];
+	int idx = 0;
+	int err;
+
+	mutex_lock(&devlink_mutex);
+	list_for_each_entry(devlink, &devlink_list, list) {
+		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+			continue;
+		if (idx < start) {
+			idx++;
+			continue;
+		}
+		err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW,
+				      NETLINK_CB(cb->skb).portid,
+				      cb->nlh->nlmsg_seq, NLM_F_MULTI);
+		if (err)
+			goto out;
+		idx++;
+	}
+out:
+	mutex_unlock(&devlink_mutex);
+
+	cb->args[0] = idx;
+	return msg->len;
+}
+
+static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb,
+					struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_port *devlink_port = info->user_ptr[1];
+	struct sk_buff *msg;
+	int err;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	err = devlink_nl_port_fill(msg, devlink, devlink_port,
+				   DEVLINK_CMD_PORT_NEW,
+				   info->snd_portid, info->snd_seq, 0);
+	if (err) {
+		nlmsg_free(msg);
+		return err;
+	}
+
+	return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg,
+					  struct netlink_callback *cb)
+{
+	struct devlink *devlink;
+	struct devlink_port *devlink_port;
+	int start = cb->args[0];
+	int idx = 0;
+	int err;
+
+	mutex_lock(&devlink_mutex);
+	mutex_lock(&devlink_port_mutex);
+	list_for_each_entry(devlink, &devlink_list, list) {
+		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+			continue;
+		list_for_each_entry(devlink_port, &devlink->port_list, list) {
+			if (idx < start) {
+				idx++;
+				continue;
+			}
+			err = devlink_nl_port_fill(msg, devlink, devlink_port,
+						   DEVLINK_CMD_NEW,
+						   NETLINK_CB(cb->skb).portid,
+						   cb->nlh->nlmsg_seq,
+						   NLM_F_MULTI);
+			if (err)
+				goto out;
+			idx++;
+		}
+	}
+out:
+	mutex_unlock(&devlink_port_mutex);
+	mutex_unlock(&devlink_mutex);
+
+	cb->args[0] = idx;
+	return msg->len;
+}
+
+static int devlink_port_type_set(struct devlink *devlink,
+				 struct devlink_port *devlink_port,
+				 enum devlink_port_type port_type)
+
+{
+	int err;
+
+	if (devlink->ops && devlink->ops->port_type_set) {
+		if (port_type == DEVLINK_PORT_TYPE_NOTSET)
+			return -EINVAL;
+		err = devlink->ops->port_type_set(devlink_port, port_type);
+		if (err)
+			return err;
+		devlink_port->desired_type = port_type;
+		devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+		return 0;
+	}
+	return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb,
+					struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_port *devlink_port = info->user_ptr[1];
+	int err;
+
+	if (info->attrs[DEVLINK_ATTR_PORT_TYPE]) {
+		enum devlink_port_type port_type;
+
+		port_type = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_TYPE]);
+		err = devlink_port_type_set(devlink, devlink_port, port_type);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static int devlink_port_split(struct devlink *devlink,
+			      u32 port_index, u32 count)
+
+{
+	if (devlink->ops && devlink->ops->port_split)
+		return devlink->ops->port_split(devlink, port_index, count);
+	return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb,
+					  struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	u32 port_index;
+	u32 count;
+
+	if (!info->attrs[DEVLINK_ATTR_PORT_INDEX] ||
+	    !info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT])
+		return -EINVAL;
+
+	port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
+	count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]);
+	return devlink_port_split(devlink, port_index, count);
+}
+
+static int devlink_port_unsplit(struct devlink *devlink, u32 port_index)
+
+{
+	if (devlink->ops && devlink->ops->port_unsplit)
+		return devlink->ops->port_unsplit(devlink, port_index);
+	return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb,
+					    struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	u32 port_index;
+
+	if (!info->attrs[DEVLINK_ATTR_PORT_INDEX])
+		return -EINVAL;
+
+	port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
+	return devlink_port_unsplit(devlink, port_index);
+}
+
+static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
+	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
+	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
+	[DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32 },
+	[DEVLINK_ATTR_PORT_TYPE] = { .type = NLA_U16 },
+	[DEVLINK_ATTR_PORT_SPLIT_COUNT] = { .type = NLA_U32 },
+};
+
+static const struct genl_ops devlink_nl_ops[] = {
+	{
+		.cmd = DEVLINK_CMD_GET,
+		.doit = devlink_nl_cmd_get_doit,
+		.dumpit = devlink_nl_cmd_get_dumpit,
+		.policy = devlink_nl_policy,
+		/* can be retrieved by unprivileged users */
+	},
+	{
+		.cmd = DEVLINK_CMD_PORT_GET,
+		.doit = devlink_nl_cmd_port_get_doit,
+		.dumpit = devlink_nl_cmd_port_get_dumpit,
+		.policy = devlink_nl_policy,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
+		/* can be retrieved by unprivileged users */
+	},
+	{
+		.cmd = DEVLINK_CMD_PORT_SET,
+		.doit = devlink_nl_cmd_port_set_doit,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
+	},
+	{
+		.cmd = DEVLINK_CMD_PORT_SPLIT,
+		.doit = devlink_nl_cmd_port_split_doit,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = DEVLINK_CMD_PORT_UNSPLIT,
+		.doit = devlink_nl_cmd_port_unsplit_doit,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+};
+
+/**
+ *	devlink_alloc - Allocate new devlink instance resources
+ *
+ *	@ops: ops
+ *	@priv_size: size of user private data
+ *
+ *	Allocate new devlink instance resources, including devlink index
+ *	and name.
+ */
+struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
+{
+	struct devlink *devlink;
+
+	devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL);
+	if (!devlink)
+		return NULL;
+	devlink->ops = ops;
+	devlink_net_set(devlink, &init_net);
+	INIT_LIST_HEAD(&devlink->port_list);
+	return devlink;
+}
+EXPORT_SYMBOL_GPL(devlink_alloc);
+
+/**
+ *	devlink_register - Register devlink instance
+ *
+ *	@devlink: devlink
+ */
+int devlink_register(struct devlink *devlink, struct device *dev)
+{
+	mutex_lock(&devlink_mutex);
+	devlink->dev = dev;
+	list_add_tail(&devlink->list, &devlink_list);
+	devlink_notify(devlink, DEVLINK_CMD_NEW);
+	mutex_unlock(&devlink_mutex);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_register);
+
+/**
+ *	devlink_unregister - Unregister devlink instance
+ *
+ *	@devlink: devlink
+ */
+void devlink_unregister(struct devlink *devlink)
+{
+	mutex_lock(&devlink_mutex);
+	devlink_notify(devlink, DEVLINK_CMD_DEL);
+	list_del(&devlink->list);
+	mutex_unlock(&devlink_mutex);
+}
+EXPORT_SYMBOL_GPL(devlink_unregister);
+
+/**
+ *	devlink_free - Free devlink instance resources
+ *
+ *	@devlink: devlink
+ */
+void devlink_free(struct devlink *devlink)
+{
+	kfree(devlink);
+}
+EXPORT_SYMBOL_GPL(devlink_free);
+
+/**
+ *	devlink_port_register - Register devlink port
+ *
+ *	@devlink: devlink
+ *	@devlink_port: devlink port
+ *	@port_index
+ *
+ *	Register devlink port with provided port index. User can use
+ *	any indexing, even hw-related one. devlink_port structure
+ *	is convenient to be embedded inside user driver private structure.
+ *	Note that the caller should take care of zeroing the devlink_port
+ *	structure.
+ */
+int devlink_port_register(struct devlink *devlink,
+			  struct devlink_port *devlink_port,
+			  unsigned int port_index)
+{
+	mutex_lock(&devlink_port_mutex);
+	if (devlink_port_index_exists(devlink, port_index)) {
+		mutex_unlock(&devlink_port_mutex);
+		return -EEXIST;
+	}
+	devlink_port->devlink = devlink;
+	devlink_port->index = port_index;
+	devlink_port->type = DEVLINK_PORT_TYPE_NOTSET;
+	devlink_port->registered = true;
+	list_add_tail(&devlink_port->list, &devlink->port_list);
+	mutex_unlock(&devlink_port_mutex);
+	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_port_register);
+
+/**
+ *	devlink_port_unregister - Unregister devlink port
+ *
+ *	@devlink_port: devlink port
+ */
+void devlink_port_unregister(struct devlink_port *devlink_port)
+{
+	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL);
+	mutex_lock(&devlink_port_mutex);
+	list_del(&devlink_port->list);
+	mutex_unlock(&devlink_port_mutex);
+}
+EXPORT_SYMBOL_GPL(devlink_port_unregister);
+
+static void __devlink_port_type_set(struct devlink_port *devlink_port,
+				    enum devlink_port_type type,
+				    void *type_dev)
+{
+	devlink_port->type = type;
+	devlink_port->type_dev = type_dev;
+	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+}
+
+/**
+ *	devlink_port_type_eth_set - Set port type to Ethernet
+ *
+ *	@devlink_port: devlink port
+ *	@netdev: related netdevice
+ */
+void devlink_port_type_eth_set(struct devlink_port *devlink_port,
+			       struct net_device *netdev)
+{
+	return __devlink_port_type_set(devlink_port,
+				       DEVLINK_PORT_TYPE_ETH, netdev);
+}
+EXPORT_SYMBOL_GPL(devlink_port_type_eth_set);
+
+/**
+ *	devlink_port_type_ib_set - Set port type to InfiniBand
+ *
+ *	@devlink_port: devlink port
+ *	@ibdev: related IB device
+ */
+void devlink_port_type_ib_set(struct devlink_port *devlink_port,
+			      struct ib_device *ibdev)
+{
+	return __devlink_port_type_set(devlink_port,
+				       DEVLINK_PORT_TYPE_IB, ibdev);
+}
+EXPORT_SYMBOL_GPL(devlink_port_type_ib_set);
+
+/**
+ *	devlink_port_type_clear - Clear port type
+ *
+ *	@devlink_port: devlink port
+ */
+void devlink_port_type_clear(struct devlink_port *devlink_port)
+{
+	return __devlink_port_type_set(devlink_port,
+				       DEVLINK_PORT_TYPE_NOTSET, NULL);
+}
+EXPORT_SYMBOL_GPL(devlink_port_type_clear);
+
+/**
+ *	devlink_port_split_set - Set port is split
+ *
+ *	@devlink_port: devlink port
+ *	@split_group: split group - identifies group split port is part of
+ */
+void devlink_port_split_set(struct devlink_port *devlink_port,
+			    u32 split_group)
+{
+	devlink_port->split = true;
+	devlink_port->split_group = split_group;
+	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+}
+EXPORT_SYMBOL_GPL(devlink_port_split_set);
+
+static int __init devlink_module_init(void)
+{
+	return genl_register_family_with_ops_groups(&devlink_nl_family,
+						    devlink_nl_ops,
+						    devlink_nl_mcgrps);
+}
+
+static void __exit devlink_module_exit(void)
+{
+	genl_unregister_family(&devlink_nl_family);
+}
+
+module_init(devlink_module_init);
+module_exit(devlink_module_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Jiri Pirko <jiri@mellanox.com>");
+MODULE_DESCRIPTION("Network physical device Netlink interface");
+MODULE_ALIAS_GENL_FAMILY(DEVLINK_GENL_NAME);
-- 
cgit v1.2.3-59-g8ed1b


From fb2dabad69f099fb9c03a44276778911da50ba29 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Fri, 26 Feb 2016 13:16:00 -0500
Subject: net: dsa: support VLAN filtering switchdev attr

When a user explicitly requests VLAN filtering with something like:

    # echo 1 > /sys/class/net/<bridge>/bridge/vlan_filtering

Switchdev propagates a SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING port
attribute.

Add support for it in the DSA layer with a new port_vlan_filtering
function to let drivers toggle 802.1Q filtering on user demand.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h |  2 ++
 net/dsa/slave.c   | 21 +++++++++++++++++++++
 2 files changed, 23 insertions(+)

(limited to 'net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 3dd54867174a..26c0a3fa009a 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -305,6 +305,8 @@ struct dsa_switch_driver {
 	/*
 	 * VLAN support
 	 */
+	int	(*port_vlan_filtering)(struct dsa_switch *ds, int port,
+				       bool vlan_filtering);
 	int	(*port_vlan_prepare)(struct dsa_switch *ds, int port,
 				     const struct switchdev_obj_port_vlan *vlan,
 				     struct switchdev_trans *trans);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index cde29239b60d..27bf03d11670 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -317,6 +317,24 @@ static int dsa_slave_stp_update(struct net_device *dev, u8 state)
 	return ret;
 }
 
+static int dsa_slave_vlan_filtering(struct net_device *dev,
+				    const struct switchdev_attr *attr,
+				    struct switchdev_trans *trans)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct dsa_switch *ds = p->parent;
+
+	/* bridge skips -EOPNOTSUPP, so skip the prepare phase */
+	if (switchdev_trans_ph_prepare(trans))
+		return 0;
+
+	if (ds->drv->port_vlan_filtering)
+		return ds->drv->port_vlan_filtering(ds, p->port,
+						    attr->u.vlan_filtering);
+
+	return 0;
+}
+
 static int dsa_slave_port_attr_set(struct net_device *dev,
 				   const struct switchdev_attr *attr,
 				   struct switchdev_trans *trans)
@@ -333,6 +351,9 @@ static int dsa_slave_port_attr_set(struct net_device *dev,
 			ret = ds->drv->port_stp_update(ds, p->port,
 						       attr->u.stp_state);
 		break;
+	case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING:
+		ret = dsa_slave_vlan_filtering(dev, attr, trans);
+		break;
 	default:
 		ret = -EOPNOTSUPP;
 		break;
-- 
cgit v1.2.3-59-g8ed1b


From 7f0aec7a668419bdbff12de6e8016544f874e708 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Fri, 26 Feb 2016 21:20:01 +0100
Subject: bridge: mcast: use names for the different multicast_router types

Using raw values makes it difficult to extend and also understand the
code, give them names and do explicit per-option manipulation in
br_multicast_set_port_router.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h |  7 +++++
 net/bridge/br_multicast.c      | 61 +++++++++++++++++++++++-------------------
 2 files changed, 40 insertions(+), 28 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 0890b217580d..e47f3bc7f323 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -177,6 +177,13 @@ enum {
 };
 #define MDBA_MDB_EATTR_MAX (__MDBA_MDB_EATTR_MAX - 1)
 
+/* multicast router types */
+enum {
+	MDB_RTR_TYPE_DISABLED,
+	MDB_RTR_TYPE_TEMP_QUERY,
+	MDB_RTR_TYPE_PERM,
+};
+
 enum {
 	MDBA_ROUTER_UNSPEC,
 	MDBA_ROUTER_PORT,
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 8b6e4249be1b..71c109b0943f 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -759,7 +759,7 @@ static void br_multicast_router_expired(unsigned long data)
 	struct net_bridge *br = port->br;
 
 	spin_lock(&br->multicast_lock);
-	if (port->multicast_router != 1 ||
+	if (port->multicast_router != MDB_RTR_TYPE_TEMP_QUERY ||
 	    timer_pending(&port->multicast_router_timer) ||
 	    hlist_unhashed(&port->rlist))
 		goto out;
@@ -912,7 +912,7 @@ static void br_ip6_multicast_port_query_expired(unsigned long data)
 
 void br_multicast_add_port(struct net_bridge_port *port)
 {
-	port->multicast_router = 1;
+	port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
 
 	setup_timer(&port->multicast_router_timer, br_multicast_router_expired,
 		    (unsigned long)port);
@@ -959,7 +959,8 @@ void br_multicast_enable_port(struct net_bridge_port *port)
 #if IS_ENABLED(CONFIG_IPV6)
 	br_multicast_enable(&port->ip6_own_query);
 #endif
-	if (port->multicast_router == 2 && hlist_unhashed(&port->rlist))
+	if (port->multicast_router == MDB_RTR_TYPE_PERM &&
+	    hlist_unhashed(&port->rlist))
 		br_multicast_add_router(br, port);
 
 out:
@@ -1227,13 +1228,13 @@ static void br_multicast_mark_router(struct net_bridge *br,
 	unsigned long now = jiffies;
 
 	if (!port) {
-		if (br->multicast_router == 1)
+		if (br->multicast_router == MDB_RTR_TYPE_TEMP_QUERY)
 			mod_timer(&br->multicast_router_timer,
 				  now + br->multicast_querier_interval);
 		return;
 	}
 
-	if (port->multicast_router != 1)
+	if (port->multicast_router != MDB_RTR_TYPE_TEMP_QUERY)
 		return;
 
 	br_multicast_add_router(br, port);
@@ -1713,7 +1714,7 @@ void br_multicast_init(struct net_bridge *br)
 	br->hash_elasticity = 4;
 	br->hash_max = 512;
 
-	br->multicast_router = 1;
+	br->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
 	br->multicast_querier = 0;
 	br->multicast_query_use_ifaddr = 0;
 	br->multicast_last_member_count = 2;
@@ -1823,11 +1824,11 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val)
 	spin_lock_bh(&br->multicast_lock);
 
 	switch (val) {
-	case 0:
-	case 2:
+	case MDB_RTR_TYPE_DISABLED:
+	case MDB_RTR_TYPE_PERM:
 		del_timer(&br->multicast_router_timer);
 		/* fall through */
-	case 1:
+	case MDB_RTR_TYPE_TEMP_QUERY:
 		br->multicast_router = val;
 		err = 0;
 		break;
@@ -1838,6 +1839,14 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val)
 	return err;
 }
 
+static void __del_port_router(struct net_bridge_port *p)
+{
+	if (hlist_unhashed(&p->rlist))
+		return;
+	hlist_del_init_rcu(&p->rlist);
+	br_rtr_notify(p->br->dev, p, RTM_DELMDB);
+}
+
 int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val)
 {
 	struct net_bridge *br = p->br;
@@ -1846,29 +1855,25 @@ int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val)
 	spin_lock(&br->multicast_lock);
 
 	switch (val) {
-	case 0:
-	case 1:
-	case 2:
-		p->multicast_router = val;
-		err = 0;
-
-		if (val < 2 && !hlist_unhashed(&p->rlist)) {
-			hlist_del_init_rcu(&p->rlist);
-			br_rtr_notify(br->dev, p, RTM_DELMDB);
-		}
-
-		if (val == 1)
-			break;
-
+	case MDB_RTR_TYPE_DISABLED:
+		p->multicast_router = MDB_RTR_TYPE_DISABLED;
+		__del_port_router(p);
+		del_timer(&p->multicast_router_timer);
+		break;
+	case MDB_RTR_TYPE_TEMP_QUERY:
+		p->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
+		__del_port_router(p);
+		break;
+	case MDB_RTR_TYPE_PERM:
+		p->multicast_router = MDB_RTR_TYPE_PERM;
 		del_timer(&p->multicast_router_timer);
-
-		if (val == 0)
-			break;
-
 		br_multicast_add_router(br, p);
 		break;
+	default:
+		goto unlock;
 	}
-
+	err = 0;
+unlock:
 	spin_unlock(&br->multicast_lock);
 
 	return err;
-- 
cgit v1.2.3-59-g8ed1b


From 4950cfd1e6a85b5e079a78f19792e6ec54199170 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Fri, 26 Feb 2016 21:20:02 +0100
Subject: bridge: mcast: do nothing if port's multicast_router is set to the
 same val

This is needed for the upcoming temporary port router. There's no point
to go through the logic if the value is the same.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_multicast.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 71c109b0943f..f1140cf5168d 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -1853,7 +1853,10 @@ int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val)
 	int err = -EINVAL;
 
 	spin_lock(&br->multicast_lock);
-
+	if (p->multicast_router == val) {
+		err = 0;
+		goto unlock;
+	}
 	switch (val) {
 	case MDB_RTR_TYPE_DISABLED:
 		p->multicast_router = MDB_RTR_TYPE_DISABLED;
-- 
cgit v1.2.3-59-g8ed1b


From a55d8246abcc910346771175b521ee2bce5a69b3 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Fri, 26 Feb 2016 21:20:03 +0100
Subject: bridge: mcast: add support for temporary port router

Add support for a temporary router port which doesn't depend only on the
incoming query. It can be refreshed if set to the same value, which is
a no-op for the rest.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h |  1 +
 net/bridge/br_multicast.c      | 21 +++++++++++++++++++--
 2 files changed, 20 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index e47f3bc7f323..74ee03a47e79 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -182,6 +182,7 @@ enum {
 	MDB_RTR_TYPE_DISABLED,
 	MDB_RTR_TYPE_TEMP_QUERY,
 	MDB_RTR_TYPE_PERM,
+	MDB_RTR_TYPE_TEMP
 };
 
 enum {
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index f1140cf5168d..a4c15df2b792 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -759,13 +759,17 @@ static void br_multicast_router_expired(unsigned long data)
 	struct net_bridge *br = port->br;
 
 	spin_lock(&br->multicast_lock);
-	if (port->multicast_router != MDB_RTR_TYPE_TEMP_QUERY ||
+	if (port->multicast_router == MDB_RTR_TYPE_DISABLED ||
+	    port->multicast_router == MDB_RTR_TYPE_PERM ||
 	    timer_pending(&port->multicast_router_timer) ||
 	    hlist_unhashed(&port->rlist))
 		goto out;
 
 	hlist_del_init_rcu(&port->rlist);
 	br_rtr_notify(br->dev, port, RTM_DELMDB);
+	/* Don't allow timer refresh if the router expired */
+	if (port->multicast_router == MDB_RTR_TYPE_TEMP)
+		port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
 
 out:
 	spin_unlock(&br->multicast_lock);
@@ -981,6 +985,9 @@ void br_multicast_disable_port(struct net_bridge_port *port)
 	if (!hlist_unhashed(&port->rlist)) {
 		hlist_del_init_rcu(&port->rlist);
 		br_rtr_notify(br->dev, port, RTM_DELMDB);
+		/* Don't allow timer refresh if disabling */
+		if (port->multicast_router == MDB_RTR_TYPE_TEMP)
+			port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
 	}
 	del_timer(&port->multicast_router_timer);
 	del_timer(&port->ip4_own_query.timer);
@@ -1234,7 +1241,8 @@ static void br_multicast_mark_router(struct net_bridge *br,
 		return;
 	}
 
-	if (port->multicast_router != MDB_RTR_TYPE_TEMP_QUERY)
+	if (port->multicast_router == MDB_RTR_TYPE_DISABLED ||
+	    port->multicast_router == MDB_RTR_TYPE_PERM)
 		return;
 
 	br_multicast_add_router(br, port);
@@ -1850,10 +1858,15 @@ static void __del_port_router(struct net_bridge_port *p)
 int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val)
 {
 	struct net_bridge *br = p->br;
+	unsigned long now = jiffies;
 	int err = -EINVAL;
 
 	spin_lock(&br->multicast_lock);
 	if (p->multicast_router == val) {
+		/* Refresh the temp router port timer */
+		if (p->multicast_router == MDB_RTR_TYPE_TEMP)
+			mod_timer(&p->multicast_router_timer,
+				  now + br->multicast_querier_interval);
 		err = 0;
 		goto unlock;
 	}
@@ -1872,6 +1885,10 @@ int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val)
 		del_timer(&p->multicast_router_timer);
 		br_multicast_add_router(br, p);
 		break;
+	case MDB_RTR_TYPE_TEMP:
+		p->multicast_router = MDB_RTR_TYPE_TEMP;
+		br_multicast_mark_router(br, p);
+		break;
 	default:
 		goto unlock;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 59f78f9f6c2e80dcf0f520be85b660f856217b79 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Fri, 26 Feb 2016 21:20:04 +0100
Subject: bridge: mcast: add support for more router port information dumping

Allow for more multicast router port information to be dumped such as
timer and type attributes. For that that purpose we need to extend the
MDBA_ROUTER_PORT attribute similar to how it was done for the mdb entries
recently. The new format is thus:
[MDBA_ROUTER_PORT] = { <- nested attribute
    u32 ifindex <- router port ifindex for user-space compatibility
    [MDBA_ROUTER_PATTR attributes]
}
This way it remains compatible with older users (they'll simply retrieve
the u32 in the beginning) and new users can parse the remaining
attributes. It would also allow to add future extensions to the router
port without breaking compatibility.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h | 14 +++++++++++++-
 net/bridge/br_mdb.c            | 16 ++++++++++++++--
 2 files changed, 27 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 74ee03a47e79..0536eefff9bf 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -144,7 +144,10 @@ struct bridge_vlan_info {
  *     }
  * }
  * [MDBA_ROUTER] = {
- *    [MDBA_ROUTER_PORT]
+ *    [MDBA_ROUTER_PORT] = {
+ *        u32 ifindex
+ *        [MDBA_ROUTER_PATTR attributes]
+ *    }
  * }
  */
 enum {
@@ -192,6 +195,15 @@ enum {
 };
 #define MDBA_ROUTER_MAX (__MDBA_ROUTER_MAX - 1)
 
+/* router port attributes */
+enum {
+	MDBA_ROUTER_PATTR_UNSPEC,
+	MDBA_ROUTER_PATTR_TIMER,
+	MDBA_ROUTER_PATTR_TYPE,
+	__MDBA_ROUTER_PATTR_MAX
+};
+#define MDBA_ROUTER_PATTR_MAX (__MDBA_ROUTER_PATTR_MAX - 1)
+
 struct br_port_msg {
 	__u8  family;
 	__u32 ifindex;
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 73786e2fe065..253bc77eda3b 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -20,7 +20,7 @@ static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
 {
 	struct net_bridge *br = netdev_priv(dev);
 	struct net_bridge_port *p;
-	struct nlattr *nest;
+	struct nlattr *nest, *port_nest;
 
 	if (!br->multicast_router || hlist_empty(&br->router_list))
 		return 0;
@@ -30,8 +30,20 @@ static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
 		return -EMSGSIZE;
 
 	hlist_for_each_entry_rcu(p, &br->router_list, rlist) {
-		if (p && nla_put_u32(skb, MDBA_ROUTER_PORT, p->dev->ifindex))
+		if (!p)
+			continue;
+		port_nest = nla_nest_start(skb, MDBA_ROUTER_PORT);
+		if (!port_nest)
 			goto fail;
+		if (nla_put_nohdr(skb, sizeof(u32), &p->dev->ifindex) ||
+		    nla_put_u32(skb, MDBA_ROUTER_PATTR_TIMER,
+				br_timer_value(&p->multicast_router_timer)) ||
+		    nla_put_u8(skb, MDBA_ROUTER_PATTR_TYPE,
+			       p->multicast_router)) {
+			nla_nest_cancel(skb, port_nest);
+			goto fail;
+		}
+		nla_nest_end(skb, port_nest);
 	}
 
 	nla_nest_end(skb, nest);
-- 
cgit v1.2.3-59-g8ed1b


From ef6980b6becb1afd9d82a4f043749a10ae81bf14 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Sat, 27 Feb 2016 08:08:54 -0500
Subject: introduce IFE action

This action allows for a sending side to encapsulate arbitrary metadata
which is decapsulated by the receiving end.
The sender runs in encoding mode and the receiver in decode mode.
Both sender and receiver must specify the same ethertype.
At some point we hope to have a registered ethertype and we'll
then provide a default so the user doesnt have to specify it.
For now we enforce the user specify it.

Lets show example usage where we encode icmp from a sender towards
a receiver with an skbmark of 17; both sender and receiver use
ethertype of 0xdead to interop.

YYYY: Lets start with Receiver-side policy config:
xxx: add an ingress qdisc
sudo tc qdisc add dev $ETH ingress

xxx: any packets with ethertype 0xdead will be subjected to ife decoding
xxx: we then restart the classification so we can match on icmp at prio 3
sudo $TC filter add dev $ETH parent ffff: prio 2 protocol 0xdead \
u32 match u32 0 0 flowid 1:1 \
action ife decode reclassify

xxx: on restarting the classification from above if it was an icmp
xxx: packet, then match it here and continue to the next rule at prio 4
xxx: which will match based on skb mark of 17
sudo tc filter add dev $ETH parent ffff: prio 3 protocol ip \
u32 match ip protocol 1 0xff flowid 1:1 \
action continue

xxx: match on skbmark of 0x11 (decimal 17) and accept
sudo tc filter add dev $ETH parent ffff: prio 4 protocol ip \
handle 0x11 fw flowid 1:1 \
action ok

xxx: Lets show the decoding policy
sudo tc -s filter ls dev $ETH parent ffff: protocol 0xdead
xxx:
filter pref 2 u32
filter pref 2 u32 fh 800: ht divisor 1
filter pref 2 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:1  (rule hit 0 success 0)
  match 00000000/00000000 at 0 (success 0 )
        action order 1: ife decode action reclassify
         index 1 ref 1 bind 1 installed 14 sec used 14 sec
         type: 0x0
         Metadata: allow mark allow hash allow prio allow qmap
        Action statistics:
        Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
        backlog 0b 0p requeues 0
xxx:
Observe that above lists all metadatum it can decode. Typically these
submodules will already be compiled into a monolithic kernel or
loaded as modules

YYYY: Lets show the sender side now ..

xxx: Add an egress qdisc on the sender netdev
sudo tc qdisc add dev $ETH root handle 1: prio
xxx:
xxx: Match all icmp packets to 192.168.122.237/24, then
xxx: tag the packet with skb mark of decimal 17, then
xxx: Encode it with:
xxx:	ethertype 0xdead
xxx:	add skb->mark to whitelist of metadatum to send
xxx:	rewrite target dst MAC address to 02:15:15:15:15:15
xxx:
sudo $TC filter add dev $ETH parent 1: protocol ip prio 10  u32 \
match ip dst 192.168.122.237/24 \
match ip protocol 1 0xff \
flowid 1:2 \
action skbedit mark 17 \
action ife encode \
type 0xDEAD \
allow mark \
dst 02:15:15:15:15:15

xxx: Lets show the encoding policy
sudo tc -s filter ls dev $ETH parent 1: protocol ip
xxx:
filter pref 10 u32
filter pref 10 u32 fh 800: ht divisor 1
filter pref 10 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:2  (rule hit 0 success 0)
  match c0a87aed/ffffffff at 16 (success 0 )
  match 00010000/00ff0000 at 8 (success 0 )

	action order 1:  skbedit mark 17
	 index 6 ref 1 bind 1
 	Action statistics:
	Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
	backlog 0b 0p requeues 0

	action order 2: ife encode action pipe
	 index 3 ref 1 bind 1
	 dst MAC: 02:15:15:15:15:15 type: 0xDEAD
 	 Metadata: allow mark
 	Action statistics:
	Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
	backlog 0b 0p requeues 0
xxx:

test by sending ping from sender to destination

Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_ife.h        |  61 +++
 include/uapi/linux/tc_act/tc_ife.h |  38 ++
 net/sched/Kconfig                  |  12 +
 net/sched/Makefile                 |   1 +
 net/sched/act_ife.c                | 870 +++++++++++++++++++++++++++++++++++++
 5 files changed, 982 insertions(+)
 create mode 100644 include/net/tc_act/tc_ife.h
 create mode 100644 include/uapi/linux/tc_act/tc_ife.h
 create mode 100644 net/sched/act_ife.c

(limited to 'net')

diff --git a/include/net/tc_act/tc_ife.h b/include/net/tc_act/tc_ife.h
new file mode 100644
index 000000000000..dc9a09aefb33
--- /dev/null
+++ b/include/net/tc_act/tc_ife.h
@@ -0,0 +1,61 @@
+#ifndef __NET_TC_IFE_H
+#define __NET_TC_IFE_H
+
+#include <net/act_api.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+
+#define IFE_METAHDRLEN 2
+struct tcf_ife_info {
+	struct tcf_common common;
+	u8 eth_dst[ETH_ALEN];
+	u8 eth_src[ETH_ALEN];
+	u16 eth_type;
+	u16 flags;
+	/* list of metaids allowed */
+	struct list_head metalist;
+};
+#define to_ife(a) \
+	container_of(a->priv, struct tcf_ife_info, common)
+
+struct tcf_meta_info {
+	const struct tcf_meta_ops *ops;
+	void *metaval;
+	u16 metaid;
+	struct list_head metalist;
+};
+
+struct tcf_meta_ops {
+	u16 metaid; /*Maintainer provided ID */
+	u16 metatype; /*netlink attribute type (look at net/netlink.h) */
+	const char *name;
+	const char *synopsis;
+	struct list_head list;
+	int	(*check_presence)(struct sk_buff *, struct tcf_meta_info *);
+	int	(*encode)(struct sk_buff *, void *, struct tcf_meta_info *);
+	int	(*decode)(struct sk_buff *, void *, u16 len);
+	int	(*get)(struct sk_buff *skb, struct tcf_meta_info *mi);
+	int	(*alloc)(struct tcf_meta_info *, void *);
+	void	(*release)(struct tcf_meta_info *);
+	int	(*validate)(void *val, int len);
+	struct module	*owner;
+};
+
+#define MODULE_ALIAS_IFE_META(metan)   MODULE_ALIAS("ifemeta" __stringify_1(metan))
+
+int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi);
+int ife_get_meta_u16(struct sk_buff *skb, struct tcf_meta_info *mi);
+int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen,
+			const void *dval);
+int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval);
+int ife_alloc_meta_u16(struct tcf_meta_info *mi, void *metaval);
+int ife_check_meta_u32(u32 metaval, struct tcf_meta_info *mi);
+int ife_encode_meta_u32(u32 metaval, void *skbdata, struct tcf_meta_info *mi);
+int ife_validate_meta_u32(void *val, int len);
+int ife_validate_meta_u16(void *val, int len);
+void ife_release_meta_gen(struct tcf_meta_info *mi);
+int register_ife_op(struct tcf_meta_ops *mops);
+int unregister_ife_op(struct tcf_meta_ops *mops);
+
+#endif /* __NET_TC_IFE_H */
diff --git a/include/uapi/linux/tc_act/tc_ife.h b/include/uapi/linux/tc_act/tc_ife.h
new file mode 100644
index 000000000000..d648ff66586f
--- /dev/null
+++ b/include/uapi/linux/tc_act/tc_ife.h
@@ -0,0 +1,38 @@
+#ifndef __UAPI_TC_IFE_H
+#define __UAPI_TC_IFE_H
+
+#include <linux/types.h>
+#include <linux/pkt_cls.h>
+
+#define TCA_ACT_IFE 25
+/* Flag bits for now just encoding/decoding; mutually exclusive */
+#define IFE_ENCODE 1
+#define IFE_DECODE 0
+
+struct tc_ife {
+	tc_gen;
+	__u16 flags;
+};
+
+/*XXX: We need to encode the total number of bytes consumed */
+enum {
+	TCA_IFE_UNSPEC,
+	TCA_IFE_PARMS,
+	TCA_IFE_TM,
+	TCA_IFE_DMAC,
+	TCA_IFE_SMAC,
+	TCA_IFE_TYPE,
+	TCA_IFE_METALST,
+	__TCA_IFE_MAX
+};
+#define TCA_IFE_MAX (__TCA_IFE_MAX - 1)
+
+#define IFE_META_SKBMARK 1
+#define IFE_META_HASHID 2
+#define	IFE_META_PRIO 3
+#define	IFE_META_QMAP 4
+/*Can be overridden at runtime by module option*/
+#define	__IFE_META_MAX 5
+#define IFE_META_MAX (__IFE_META_MAX - 1)
+
+#endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 82830824fb1f..4d48ef57e564 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -739,6 +739,18 @@ config NET_ACT_CONNMARK
 	  To compile this code as a module, choose M here: the
 	  module will be called act_connmark.
 
+config NET_ACT_IFE
+        tristate "Inter-FE action based on IETF ForCES InterFE LFB"
+        depends on NET_CLS_ACT
+        ---help---
+	  Say Y here to allow for sourcing and terminating metadata
+	  For details refer to netdev01 paper:
+	  "Distributing Linux Traffic Control Classifier-Action Subsystem"
+	   Authors: Jamal Hadi Salim and Damascene M. Joachimpillai
+
+	  To compile this code as a module, choose M here: the
+	  module will be called act_ife.
+
 config NET_CLS_IND
 	bool "Incoming device classification"
 	depends on NET_CLS_U32 || NET_CLS_FW
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 690c1689e090..3d176671b0e1 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -19,6 +19,7 @@ obj-$(CONFIG_NET_ACT_CSUM)	+= act_csum.o
 obj-$(CONFIG_NET_ACT_VLAN)	+= act_vlan.o
 obj-$(CONFIG_NET_ACT_BPF)	+= act_bpf.o
 obj-$(CONFIG_NET_ACT_CONNMARK)	+= act_connmark.o
+obj-$(CONFIG_NET_ACT_IFE)	+= act_ife.o
 obj-$(CONFIG_NET_SCH_FIFO)	+= sch_fifo.o
 obj-$(CONFIG_NET_SCH_CBQ)	+= sch_cbq.o
 obj-$(CONFIG_NET_SCH_HTB)	+= sch_htb.o
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
new file mode 100644
index 000000000000..6e7ec257790d
--- /dev/null
+++ b/net/sched/act_ife.c
@@ -0,0 +1,870 @@
+/*
+ * net/sched/ife.c	Inter-FE action based on ForCES WG InterFE LFB
+ *
+ *		Refer to:
+ *		draft-ietf-forces-interfelfb-03
+ *		and
+ *		netdev01 paper:
+ *		"Distributing Linux Traffic Control Classifier-Action
+ *		Subsystem"
+ *		Authors: Jamal Hadi Salim and Damascene M. Joachimpillai
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * copyright Jamal Hadi Salim (2015)
+ *
+*/
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <uapi/linux/tc_act/tc_ife.h>
+#include <net/tc_act/tc_ife.h>
+#include <linux/etherdevice.h>
+
+#define IFE_TAB_MASK 15
+
+static int ife_net_id;
+static int max_metacnt = IFE_META_MAX + 1;
+
+static const struct nla_policy ife_policy[TCA_IFE_MAX + 1] = {
+	[TCA_IFE_PARMS] = { .len = sizeof(struct tc_ife)},
+	[TCA_IFE_DMAC] = { .len = ETH_ALEN},
+	[TCA_IFE_SMAC] = { .len = ETH_ALEN},
+	[TCA_IFE_TYPE] = { .type = NLA_U16},
+};
+
+/* Caller takes care of presenting data in network order
+*/
+int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval)
+{
+	u32 *tlv = (u32 *)(skbdata);
+	u16 totlen = nla_total_size(dlen);	/*alignment + hdr */
+	char *dptr = (char *)tlv + NLA_HDRLEN;
+	u32 htlv = attrtype << 16 | totlen;
+
+	*tlv = htonl(htlv);
+	memset(dptr, 0, totlen - NLA_HDRLEN);
+	memcpy(dptr, dval, dlen);
+
+	return totlen;
+}
+EXPORT_SYMBOL_GPL(ife_tlv_meta_encode);
+
+int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi)
+{
+	if (mi->metaval)
+		return nla_put_u32(skb, mi->metaid, *(u32 *)mi->metaval);
+	else
+		return nla_put(skb, mi->metaid, 0, NULL);
+}
+EXPORT_SYMBOL_GPL(ife_get_meta_u32);
+
+int ife_check_meta_u32(u32 metaval, struct tcf_meta_info *mi)
+{
+	if (metaval || mi->metaval)
+		return 8; /* T+L+V == 2+2+4 */
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ife_check_meta_u32);
+
+int ife_encode_meta_u32(u32 metaval, void *skbdata, struct tcf_meta_info *mi)
+{
+	u32 edata = metaval;
+
+	if (mi->metaval)
+		edata = *(u32 *)mi->metaval;
+	else if (metaval)
+		edata = metaval;
+
+	if (!edata) /* will not encode */
+		return 0;
+
+	edata = htonl(edata);
+	return ife_tlv_meta_encode(skbdata, mi->metaid, 4, &edata);
+}
+EXPORT_SYMBOL_GPL(ife_encode_meta_u32);
+
+int ife_get_meta_u16(struct sk_buff *skb, struct tcf_meta_info *mi)
+{
+	if (mi->metaval)
+		return nla_put_u16(skb, mi->metaid, *(u16 *)mi->metaval);
+	else
+		return nla_put(skb, mi->metaid, 0, NULL);
+}
+EXPORT_SYMBOL_GPL(ife_get_meta_u16);
+
+int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval)
+{
+	mi->metaval = kmemdup(&metaval, sizeof(u32), GFP_KERNEL);
+	if (!mi->metaval)
+		return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ife_alloc_meta_u32);
+
+int ife_alloc_meta_u16(struct tcf_meta_info *mi, void *metaval)
+{
+	mi->metaval = kmemdup(&metaval, sizeof(u16), GFP_KERNEL);
+	if (!mi->metaval)
+		return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ife_alloc_meta_u16);
+
+void ife_release_meta_gen(struct tcf_meta_info *mi)
+{
+	kfree(mi->metaval);
+}
+EXPORT_SYMBOL_GPL(ife_release_meta_gen);
+
+int ife_validate_meta_u32(void *val, int len)
+{
+	if (len == 4)
+		return 0;
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(ife_validate_meta_u32);
+
+int ife_validate_meta_u16(void *val, int len)
+{
+	/* length will include padding */
+	if (len == NLA_ALIGN(2))
+		return 0;
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(ife_validate_meta_u16);
+
+static LIST_HEAD(ifeoplist);
+static DEFINE_RWLOCK(ife_mod_lock);
+
+static struct tcf_meta_ops *find_ife_oplist(u16 metaid)
+{
+	struct tcf_meta_ops *o;
+
+	read_lock(&ife_mod_lock);
+	list_for_each_entry(o, &ifeoplist, list) {
+		if (o->metaid == metaid) {
+			if (!try_module_get(o->owner))
+				o = NULL;
+			read_unlock(&ife_mod_lock);
+			return o;
+		}
+	}
+	read_unlock(&ife_mod_lock);
+
+	return NULL;
+}
+
+int register_ife_op(struct tcf_meta_ops *mops)
+{
+	struct tcf_meta_ops *m;
+
+	if (!mops->metaid || !mops->metatype || !mops->name ||
+	    !mops->check_presence || !mops->encode || !mops->decode ||
+	    !mops->get || !mops->alloc)
+		return -EINVAL;
+
+	write_lock(&ife_mod_lock);
+
+	list_for_each_entry(m, &ifeoplist, list) {
+		if (m->metaid == mops->metaid ||
+		    (strcmp(mops->name, m->name) == 0)) {
+			write_unlock(&ife_mod_lock);
+			return -EEXIST;
+		}
+	}
+
+	if (!mops->release)
+		mops->release = ife_release_meta_gen;
+
+	list_add_tail(&mops->list, &ifeoplist);
+	write_unlock(&ife_mod_lock);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(unregister_ife_op);
+
+int unregister_ife_op(struct tcf_meta_ops *mops)
+{
+	struct tcf_meta_ops *m;
+	int err = -ENOENT;
+
+	write_lock(&ife_mod_lock);
+	list_for_each_entry(m, &ifeoplist, list) {
+		if (m->metaid == mops->metaid) {
+			list_del(&mops->list);
+			err = 0;
+			break;
+		}
+	}
+	write_unlock(&ife_mod_lock);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(register_ife_op);
+
+static int ife_validate_metatype(struct tcf_meta_ops *ops, void *val, int len)
+{
+	int ret = 0;
+	/* XXX: unfortunately cant use nla_policy at this point
+	* because a length of 0 is valid in the case of
+	* "allow". "use" semantics do enforce for proper
+	* length and i couldve use nla_policy but it makes it hard
+	* to use it just for that..
+	*/
+	if (ops->validate)
+		return ops->validate(val, len);
+
+	if (ops->metatype == NLA_U32)
+		ret = ife_validate_meta_u32(val, len);
+	else if (ops->metatype == NLA_U16)
+		ret = ife_validate_meta_u16(val, len);
+
+	return ret;
+}
+
+/* called when adding new meta information
+ * under ife->tcf_lock
+*/
+static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid,
+				void *val, int len)
+{
+	struct tcf_meta_ops *ops = find_ife_oplist(metaid);
+	int ret = 0;
+
+	if (!ops) {
+		ret = -ENOENT;
+#ifdef CONFIG_MODULES
+		spin_unlock_bh(&ife->tcf_lock);
+		rtnl_unlock();
+		request_module("ifemeta%u", metaid);
+		rtnl_lock();
+		spin_lock_bh(&ife->tcf_lock);
+		ops = find_ife_oplist(metaid);
+#endif
+	}
+
+	if (ops) {
+		ret = 0;
+		if (len)
+			ret = ife_validate_metatype(ops, val, len);
+
+		module_put(ops->owner);
+	}
+
+	return ret;
+}
+
+/* called when adding new meta information
+ * under ife->tcf_lock
+*/
+static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval,
+			int len)
+{
+	struct tcf_meta_info *mi = NULL;
+	struct tcf_meta_ops *ops = find_ife_oplist(metaid);
+	int ret = 0;
+
+	if (!ops)
+		return -ENOENT;
+
+	mi = kzalloc(sizeof(*mi), GFP_KERNEL);
+	if (!mi) {
+		/*put back what find_ife_oplist took */
+		module_put(ops->owner);
+		return -ENOMEM;
+	}
+
+	mi->metaid = metaid;
+	mi->ops = ops;
+	if (len > 0) {
+		ret = ops->alloc(mi, metaval);
+		if (ret != 0) {
+			kfree(mi);
+			module_put(ops->owner);
+			return ret;
+		}
+	}
+
+	list_add_tail(&mi->metalist, &ife->metalist);
+
+	return ret;
+}
+
+static int use_all_metadata(struct tcf_ife_info *ife)
+{
+	struct tcf_meta_ops *o;
+	int rc = 0;
+	int installed = 0;
+
+	list_for_each_entry(o, &ifeoplist, list) {
+		rc = add_metainfo(ife, o->metaid, NULL, 0);
+		if (rc == 0)
+			installed += 1;
+	}
+
+	if (installed)
+		return 0;
+	else
+		return -EINVAL;
+}
+
+static int dump_metalist(struct sk_buff *skb, struct tcf_ife_info *ife)
+{
+	struct tcf_meta_info *e;
+	struct nlattr *nest;
+	unsigned char *b = skb_tail_pointer(skb);
+	int total_encoded = 0;
+
+	/*can only happen on decode */
+	if (list_empty(&ife->metalist))
+		return 0;
+
+	nest = nla_nest_start(skb, TCA_IFE_METALST);
+	if (!nest)
+		goto out_nlmsg_trim;
+
+	list_for_each_entry(e, &ife->metalist, metalist) {
+		if (!e->ops->get(skb, e))
+			total_encoded += 1;
+	}
+
+	if (!total_encoded)
+		goto out_nlmsg_trim;
+
+	nla_nest_end(skb, nest);
+
+	return 0;
+
+out_nlmsg_trim:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+/* under ife->tcf_lock */
+static void _tcf_ife_cleanup(struct tc_action *a, int bind)
+{
+	struct tcf_ife_info *ife = a->priv;
+	struct tcf_meta_info *e, *n;
+
+	list_for_each_entry_safe(e, n, &ife->metalist, metalist) {
+		module_put(e->ops->owner);
+		list_del(&e->metalist);
+		if (e->metaval) {
+			if (e->ops->release)
+				e->ops->release(e);
+			else
+				kfree(e->metaval);
+		}
+		kfree(e);
+	}
+}
+
+static void tcf_ife_cleanup(struct tc_action *a, int bind)
+{
+	struct tcf_ife_info *ife = a->priv;
+
+	spin_lock_bh(&ife->tcf_lock);
+	_tcf_ife_cleanup(a, bind);
+	spin_unlock_bh(&ife->tcf_lock);
+}
+
+/* under ife->tcf_lock */
+static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb)
+{
+	int len = 0;
+	int rc = 0;
+	int i = 0;
+	void *val;
+
+	for (i = 1; i < max_metacnt; i++) {
+		if (tb[i]) {
+			val = nla_data(tb[i]);
+			len = nla_len(tb[i]);
+
+			rc = load_metaops_and_vet(ife, i, val, len);
+			if (rc != 0)
+				return rc;
+
+			rc = add_metainfo(ife, i, val, len);
+			if (rc)
+				return rc;
+		}
+	}
+
+	return rc;
+}
+
+static int tcf_ife_init(struct net *net, struct nlattr *nla,
+			struct nlattr *est, struct tc_action *a,
+			int ovr, int bind)
+{
+	struct tc_action_net *tn = net_generic(net, ife_net_id);
+	struct nlattr *tb[TCA_IFE_MAX + 1];
+	struct nlattr *tb2[IFE_META_MAX + 1];
+	struct tcf_ife_info *ife;
+	struct tc_ife *parm;
+	u16 ife_type = 0;
+	u8 *daddr = NULL;
+	u8 *saddr = NULL;
+	int ret = 0;
+	int err;
+
+	err = nla_parse_nested(tb, TCA_IFE_MAX, nla, ife_policy);
+	if (err < 0)
+		return err;
+
+	if (!tb[TCA_IFE_PARMS])
+		return -EINVAL;
+
+	parm = nla_data(tb[TCA_IFE_PARMS]);
+
+	if (parm->flags & IFE_ENCODE) {
+		/* Until we get issued the ethertype, we cant have
+		 * a default..
+		**/
+		if (!tb[TCA_IFE_TYPE]) {
+			pr_info("You MUST pass etherype for encoding\n");
+			return -EINVAL;
+		}
+	}
+
+	if (!tcf_hash_check(tn, parm->index, a, bind)) {
+		ret = tcf_hash_create(tn, parm->index, est, a, sizeof(*ife),
+				      bind, false);
+		if (ret)
+			return ret;
+		ret = ACT_P_CREATED;
+	} else {
+		if (bind)	/* dont override defaults */
+			return 0;
+		tcf_hash_release(a, bind);
+		if (!ovr)
+			return -EEXIST;
+	}
+
+	ife = to_ife(a);
+	ife->flags = parm->flags;
+
+	if (parm->flags & IFE_ENCODE) {
+		ife_type = nla_get_u16(tb[TCA_IFE_TYPE]);
+		if (tb[TCA_IFE_DMAC])
+			daddr = nla_data(tb[TCA_IFE_DMAC]);
+		if (tb[TCA_IFE_SMAC])
+			saddr = nla_data(tb[TCA_IFE_SMAC]);
+	}
+
+	spin_lock_bh(&ife->tcf_lock);
+	ife->tcf_action = parm->action;
+
+	if (parm->flags & IFE_ENCODE) {
+		if (daddr)
+			ether_addr_copy(ife->eth_dst, daddr);
+		else
+			eth_zero_addr(ife->eth_dst);
+
+		if (saddr)
+			ether_addr_copy(ife->eth_src, saddr);
+		else
+			eth_zero_addr(ife->eth_src);
+
+		ife->eth_type = ife_type;
+	}
+
+	if (ret == ACT_P_CREATED)
+		INIT_LIST_HEAD(&ife->metalist);
+
+	if (tb[TCA_IFE_METALST]) {
+		err = nla_parse_nested(tb2, IFE_META_MAX, tb[TCA_IFE_METALST],
+				       NULL);
+		if (err) {
+metadata_parse_err:
+			if (ret == ACT_P_CREATED)
+				_tcf_ife_cleanup(a, bind);
+
+			spin_unlock_bh(&ife->tcf_lock);
+			return err;
+		}
+
+		err = populate_metalist(ife, tb2);
+		if (err)
+			goto metadata_parse_err;
+
+	} else {
+		/* if no passed metadata allow list or passed allow-all
+		 * then here we process by adding as many supported metadatum
+		 * as we can. You better have at least one else we are
+		 * going to bail out
+		 */
+		err = use_all_metadata(ife);
+		if (err) {
+			if (ret == ACT_P_CREATED)
+				_tcf_ife_cleanup(a, bind);
+
+			spin_unlock_bh(&ife->tcf_lock);
+			return err;
+		}
+	}
+
+	spin_unlock_bh(&ife->tcf_lock);
+
+	if (ret == ACT_P_CREATED)
+		tcf_hash_insert(tn, a);
+
+	return ret;
+}
+
+static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind,
+			int ref)
+{
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tcf_ife_info *ife = a->priv;
+	struct tc_ife opt = {
+		.index = ife->tcf_index,
+		.refcnt = ife->tcf_refcnt - ref,
+		.bindcnt = ife->tcf_bindcnt - bind,
+		.action = ife->tcf_action,
+		.flags = ife->flags,
+	};
+	struct tcf_t t;
+
+	if (nla_put(skb, TCA_IFE_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
+
+	t.install = jiffies_to_clock_t(jiffies - ife->tcf_tm.install);
+	t.lastuse = jiffies_to_clock_t(jiffies - ife->tcf_tm.lastuse);
+	t.expires = jiffies_to_clock_t(ife->tcf_tm.expires);
+	if (nla_put(skb, TCA_IFE_TM, sizeof(t), &t))
+		goto nla_put_failure;
+
+	if (!is_zero_ether_addr(ife->eth_dst)) {
+		if (nla_put(skb, TCA_IFE_DMAC, ETH_ALEN, ife->eth_dst))
+			goto nla_put_failure;
+	}
+
+	if (!is_zero_ether_addr(ife->eth_src)) {
+		if (nla_put(skb, TCA_IFE_SMAC, ETH_ALEN, ife->eth_src))
+			goto nla_put_failure;
+	}
+
+	if (nla_put(skb, TCA_IFE_TYPE, 2, &ife->eth_type))
+		goto nla_put_failure;
+
+	if (dump_metalist(skb, ife)) {
+		/*ignore failure to dump metalist */
+		pr_info("Failed to dump metalist\n");
+	}
+
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_info *ife,
+		       u16 metaid, u16 mlen, void *mdata)
+{
+	struct tcf_meta_info *e;
+
+	/* XXX: use hash to speed up */
+	list_for_each_entry(e, &ife->metalist, metalist) {
+		if (metaid == e->metaid) {
+			if (e->ops) {
+				/* We check for decode presence already */
+				return e->ops->decode(skb, mdata, mlen);
+			}
+		}
+	}
+
+	return 0;
+}
+
+struct ifeheadr {
+	__be16 metalen;
+	u8 tlv_data[];
+};
+
+struct meta_tlvhdr {
+	__be16 type;
+	__be16 len;
+};
+
+static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a,
+			  struct tcf_result *res)
+{
+	struct tcf_ife_info *ife = a->priv;
+	int action = ife->tcf_action;
+	struct ifeheadr *ifehdr = (struct ifeheadr *)skb->data;
+	u16 ifehdrln = ifehdr->metalen;
+	struct meta_tlvhdr *tlv = (struct meta_tlvhdr *)(ifehdr->tlv_data);
+
+	spin_lock(&ife->tcf_lock);
+	bstats_update(&ife->tcf_bstats, skb);
+	ife->tcf_tm.lastuse = jiffies;
+	spin_unlock(&ife->tcf_lock);
+
+	ifehdrln = ntohs(ifehdrln);
+	if (unlikely(!pskb_may_pull(skb, ifehdrln))) {
+		spin_lock(&ife->tcf_lock);
+		ife->tcf_qstats.drops++;
+		spin_unlock(&ife->tcf_lock);
+		return TC_ACT_SHOT;
+	}
+
+	skb_set_mac_header(skb, ifehdrln);
+	__skb_pull(skb, ifehdrln);
+	skb->protocol = eth_type_trans(skb, skb->dev);
+	ifehdrln -= IFE_METAHDRLEN;
+
+	while (ifehdrln > 0) {
+		u8 *tlvdata = (u8 *)tlv;
+		u16 mtype = tlv->type;
+		u16 mlen = tlv->len;
+
+		mtype = ntohs(mtype);
+		mlen = ntohs(mlen);
+
+		if (find_decode_metaid(skb, ife, mtype, (mlen - 4),
+				       (void *)(tlvdata + 4))) {
+			/* abuse overlimits to count when we receive metadata
+			 * but dont have an ops for it
+			 */
+			pr_info_ratelimited("Unknown metaid %d alnlen %d\n",
+					    mtype, mlen);
+			ife->tcf_qstats.overlimits++;
+		}
+
+		tlvdata += mlen;
+		ifehdrln -= mlen;
+		tlv = (struct meta_tlvhdr *)tlvdata;
+	}
+
+	skb_reset_network_header(skb);
+	return action;
+}
+
+/*XXX: check if we can do this at install time instead of current
+ * send data path
+**/
+static int ife_get_sz(struct sk_buff *skb, struct tcf_ife_info *ife)
+{
+	struct tcf_meta_info *e, *n;
+	int tot_run_sz = 0, run_sz = 0;
+
+	list_for_each_entry_safe(e, n, &ife->metalist, metalist) {
+		if (e->ops->check_presence) {
+			run_sz = e->ops->check_presence(skb, e);
+			tot_run_sz += run_sz;
+		}
+	}
+
+	return tot_run_sz;
+}
+
+static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
+			  struct tcf_result *res)
+{
+	struct tcf_ife_info *ife = a->priv;
+	int action = ife->tcf_action;
+	struct ethhdr *oethh;	/* outer ether header */
+	struct ethhdr *iethh;	/* inner eth header */
+	struct tcf_meta_info *e;
+	/*
+	   OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA
+	   where ORIGDATA = original ethernet header ...
+	 */
+	u16 metalen = ife_get_sz(skb, ife);
+	int hdrm = metalen + skb->dev->hard_header_len + IFE_METAHDRLEN;
+	unsigned int skboff = skb->dev->hard_header_len;
+	u32 at = G_TC_AT(skb->tc_verd);
+	int new_len = skb->len + hdrm;
+	bool exceed_mtu = false;
+	int err;
+
+	if (at & AT_EGRESS) {
+		if (new_len > skb->dev->mtu)
+			exceed_mtu = true;
+	}
+
+	spin_lock(&ife->tcf_lock);
+	bstats_update(&ife->tcf_bstats, skb);
+	ife->tcf_tm.lastuse = jiffies;
+
+	if (!metalen) {		/* no metadata to send */
+		/* abuse overlimits to count when we allow packet
+		 * with no metadata
+		 */
+		ife->tcf_qstats.overlimits++;
+		spin_unlock(&ife->tcf_lock);
+		return action;
+	}
+	/* could be stupid policy setup or mtu config
+	 * so lets be conservative.. */
+	if ((action == TC_ACT_SHOT) || exceed_mtu) {
+		ife->tcf_qstats.drops++;
+		spin_unlock(&ife->tcf_lock);
+		return TC_ACT_SHOT;
+	}
+
+	iethh = eth_hdr(skb);
+
+	err = skb_cow_head(skb, hdrm);
+	if (unlikely(err)) {
+		ife->tcf_qstats.drops++;
+		spin_unlock(&ife->tcf_lock);
+		return TC_ACT_SHOT;
+	}
+
+	if (!(at & AT_EGRESS))
+		skb_push(skb, skb->dev->hard_header_len);
+
+	__skb_push(skb, hdrm);
+	memcpy(skb->data, iethh, skb->mac_len);
+	skb_reset_mac_header(skb);
+	oethh = eth_hdr(skb);
+
+	/*total metadata length */
+	metalen += IFE_METAHDRLEN;
+	metalen = htons(metalen);
+	memcpy((skb->data + skboff), &metalen, IFE_METAHDRLEN);
+	skboff += IFE_METAHDRLEN;
+
+	/* XXX: we dont have a clever way of telling encode to
+	 * not repeat some of the computations that are done by
+	 * ops->presence_check...
+	 */
+	list_for_each_entry(e, &ife->metalist, metalist) {
+		if (e->ops->encode) {
+			err = e->ops->encode(skb, (void *)(skb->data + skboff),
+					     e);
+		}
+		if (err < 0) {
+			/* too corrupt to keep around if overwritten */
+			ife->tcf_qstats.drops++;
+			spin_unlock(&ife->tcf_lock);
+			return TC_ACT_SHOT;
+		}
+		skboff += err;
+	}
+
+	if (!is_zero_ether_addr(ife->eth_src))
+		ether_addr_copy(oethh->h_source, ife->eth_src);
+	else
+		ether_addr_copy(oethh->h_source, iethh->h_source);
+	if (!is_zero_ether_addr(ife->eth_dst))
+		ether_addr_copy(oethh->h_dest, ife->eth_dst);
+	else
+		ether_addr_copy(oethh->h_dest, iethh->h_dest);
+	oethh->h_proto = htons(ife->eth_type);
+
+	if (!(at & AT_EGRESS))
+		skb_pull(skb, skb->dev->hard_header_len);
+
+	spin_unlock(&ife->tcf_lock);
+
+	return action;
+}
+
+static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a,
+		       struct tcf_result *res)
+{
+	struct tcf_ife_info *ife = a->priv;
+
+	if (ife->flags & IFE_ENCODE)
+		return tcf_ife_encode(skb, a, res);
+
+	if (!(ife->flags & IFE_ENCODE))
+		return tcf_ife_decode(skb, a, res);
+
+	pr_info_ratelimited("unknown failure(policy neither de/encode\n");
+	spin_lock(&ife->tcf_lock);
+	bstats_update(&ife->tcf_bstats, skb);
+	ife->tcf_tm.lastuse = jiffies;
+	ife->tcf_qstats.drops++;
+	spin_unlock(&ife->tcf_lock);
+
+	return TC_ACT_SHOT;
+}
+
+static int tcf_ife_walker(struct net *net, struct sk_buff *skb,
+			  struct netlink_callback *cb, int type,
+			  struct tc_action *a)
+{
+	struct tc_action_net *tn = net_generic(net, ife_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_ife_search(struct net *net, struct tc_action *a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, ife_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
+static struct tc_action_ops act_ife_ops = {
+	.kind = "ife",
+	.type = TCA_ACT_IFE,
+	.owner = THIS_MODULE,
+	.act = tcf_ife_act,
+	.dump = tcf_ife_dump,
+	.cleanup = tcf_ife_cleanup,
+	.init = tcf_ife_init,
+	.walk = tcf_ife_walker,
+	.lookup = tcf_ife_search,
+};
+
+static __net_init int ife_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, ife_net_id);
+
+	return tc_action_net_init(tn, &act_ife_ops, IFE_TAB_MASK);
+}
+
+static void __net_exit ife_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, ife_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations ife_net_ops = {
+	.init = ife_init_net,
+	.exit = ife_exit_net,
+	.id   = &ife_net_id,
+	.size = sizeof(struct tc_action_net),
+};
+
+static int __init ife_init_module(void)
+{
+	return tcf_register_action(&act_ife_ops, &ife_net_ops);
+}
+
+static void __exit ife_cleanup_module(void)
+{
+	tcf_unregister_action(&act_ife_ops, &ife_net_ops);
+}
+
+module_init(ife_init_module);
+module_exit(ife_cleanup_module);
+
+MODULE_AUTHOR("Jamal Hadi Salim(2015)");
+MODULE_DESCRIPTION("Inter-FE LFB action");
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3-59-g8ed1b


From 084e2f6566d2a39c007ed6473f58b551a2eeefeb Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Sat, 27 Feb 2016 08:08:55 -0500
Subject: Support to encoding decoding skb mark on IFE action

Example usage:
Set the skb using skbedit then allow it to be encoded

sudo tc qdisc add dev $ETH root handle 1: prio
sudo tc filter add dev $ETH parent 1: protocol ip prio 10 \
u32 match ip protocol 1 0xff flowid 1:2 \
action skbedit mark 17 \
action ife encode \
allow mark \
dst 02:15:15:15:15:15

Note: You dont need the skbedit action if you are already encoding the
skb mark earlier. A zero skb mark, when seen, will not be encoded.

Alternative hard code static mark of 0x12 every time the filter matches

sudo $TC filter add dev $ETH parent 1: protocol ip prio 10 \
u32 match ip protocol 1 0xff flowid 1:2 \
action ife encode \
type 0xDEAD \
use mark 0x12 \
dst 02:15:15:15:15:15

Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/Kconfig         |  5 +++
 net/sched/Makefile        |  1 +
 net/sched/act_meta_mark.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 85 insertions(+)
 create mode 100644 net/sched/act_meta_mark.c

(limited to 'net')

diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 4d48ef57e564..85854c089d11 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -751,6 +751,11 @@ config NET_ACT_IFE
 	  To compile this code as a module, choose M here: the
 	  module will be called act_ife.
 
+config NET_IFE_SKBMARK
+        tristate "Support to encoding decoding skb mark on IFE action"
+        depends on NET_ACT_IFE
+        ---help---
+
 config NET_CLS_IND
 	bool "Incoming device classification"
 	depends on NET_CLS_U32 || NET_CLS_FW
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 3d176671b0e1..3f7a182955c2 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_NET_ACT_VLAN)	+= act_vlan.o
 obj-$(CONFIG_NET_ACT_BPF)	+= act_bpf.o
 obj-$(CONFIG_NET_ACT_CONNMARK)	+= act_connmark.o
 obj-$(CONFIG_NET_ACT_IFE)	+= act_ife.o
+obj-$(CONFIG_NET_IFE_SKBMARK)	+= act_meta_mark.o
 obj-$(CONFIG_NET_SCH_FIFO)	+= sch_fifo.o
 obj-$(CONFIG_NET_SCH_CBQ)	+= sch_cbq.o
 obj-$(CONFIG_NET_SCH_HTB)	+= sch_htb.o
diff --git a/net/sched/act_meta_mark.c b/net/sched/act_meta_mark.c
new file mode 100644
index 000000000000..82892170ce4f
--- /dev/null
+++ b/net/sched/act_meta_mark.c
@@ -0,0 +1,79 @@
+/*
+ * net/sched/act_meta_mark.c IFE skb->mark metadata module
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * copyright Jamal Hadi Salim (2015)
+ *
+*/
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <uapi/linux/tc_act/tc_ife.h>
+#include <net/tc_act/tc_ife.h>
+#include <linux/rtnetlink.h>
+
+static int skbmark_encode(struct sk_buff *skb, void *skbdata,
+			  struct tcf_meta_info *e)
+{
+	u32 ifemark = skb->mark;
+
+	return ife_encode_meta_u32(ifemark, skbdata, e);
+}
+
+static int skbmark_decode(struct sk_buff *skb, void *data, u16 len)
+{
+	u32 ifemark = *(u32 *)data;
+
+	skb->mark = ntohl(ifemark);
+	return 0;
+}
+
+static int skbmark_check(struct sk_buff *skb, struct tcf_meta_info *e)
+{
+	return ife_check_meta_u32(skb->mark, e);
+}
+
+static struct tcf_meta_ops ife_skbmark_ops = {
+	.metaid = IFE_META_SKBMARK,
+	.metatype = NLA_U32,
+	.name = "skbmark",
+	.synopsis = "skb mark 32 bit metadata",
+	.check_presence = skbmark_check,
+	.encode = skbmark_encode,
+	.decode = skbmark_decode,
+	.get = ife_get_meta_u32,
+	.alloc = ife_alloc_meta_u32,
+	.release = ife_release_meta_gen,
+	.validate = ife_validate_meta_u32,
+	.owner = THIS_MODULE,
+};
+
+static int __init ifemark_init_module(void)
+{
+	return register_ife_op(&ife_skbmark_ops);
+}
+
+static void __exit ifemark_cleanup_module(void)
+{
+	unregister_ife_op(&ife_skbmark_ops);
+}
+
+module_init(ifemark_init_module);
+module_exit(ifemark_cleanup_module);
+
+MODULE_AUTHOR("Jamal Hadi Salim(2015)");
+MODULE_DESCRIPTION("Inter-FE skb mark metadata module");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_IFE_META(IFE_META_SKBMARK);
-- 
cgit v1.2.3-59-g8ed1b


From 200e10f46936d95a553414f249cefb765194b235 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Sat, 27 Feb 2016 08:08:56 -0500
Subject: Support to encoding decoding skb prio on IFE action

    Example usage:
    Set the skb priority using skbedit then allow it to be encoded

    sudo tc qdisc add dev $ETH root handle 1: prio
    sudo tc filter add dev $ETH parent 1: protocol ip prio 10 \
    u32 match ip protocol 1 0xff flowid 1:2 \
    action skbedit prio 17 \
    action ife encode \
    allow prio \
    dst 02:15:15:15:15:15

    Note: You dont need the skbedit action if you are already encoding the
    skb priority earlier. A zero skb priority will not be sent

    Alternative hard code static priority of decimal 33 (unlike skbedit)
    then mark of 0x12 every time the filter matches

    sudo $TC filter add dev $ETH parent 1: protocol ip prio 10 \
    u32 match ip protocol 1 0xff flowid 1:2 \
    action ife encode \
    type 0xDEAD \
    use prio 33 \
    use mark 0x12 \
    dst 02:15:15:15:15:15

    Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/Kconfig            |  5 +++
 net/sched/Makefile           |  1 +
 net/sched/act_meta_skbprio.c | 76 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 82 insertions(+)
 create mode 100644 net/sched/act_meta_skbprio.c

(limited to 'net')

diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 85854c089d11..b148302bbaf2 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -756,6 +756,11 @@ config NET_IFE_SKBMARK
         depends on NET_ACT_IFE
         ---help---
 
+config NET_IFE_SKBPRIO
+        tristate "Support to encoding decoding skb prio on IFE action"
+        depends on NET_ACT_IFE
+        ---help---
+
 config NET_CLS_IND
 	bool "Incoming device classification"
 	depends on NET_CLS_U32 || NET_CLS_FW
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 3f7a182955c2..84bddb373517 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -21,6 +21,7 @@ obj-$(CONFIG_NET_ACT_BPF)	+= act_bpf.o
 obj-$(CONFIG_NET_ACT_CONNMARK)	+= act_connmark.o
 obj-$(CONFIG_NET_ACT_IFE)	+= act_ife.o
 obj-$(CONFIG_NET_IFE_SKBMARK)	+= act_meta_mark.o
+obj-$(CONFIG_NET_IFE_SKBPRIO)	+= act_meta_skbprio.o
 obj-$(CONFIG_NET_SCH_FIFO)	+= sch_fifo.o
 obj-$(CONFIG_NET_SCH_CBQ)	+= sch_cbq.o
 obj-$(CONFIG_NET_SCH_HTB)	+= sch_htb.o
diff --git a/net/sched/act_meta_skbprio.c b/net/sched/act_meta_skbprio.c
new file mode 100644
index 000000000000..26bf4d86030b
--- /dev/null
+++ b/net/sched/act_meta_skbprio.c
@@ -0,0 +1,76 @@
+/*
+ * net/sched/act_meta_prio.c IFE skb->priority metadata module
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * copyright Jamal Hadi Salim (2015)
+ *
+*/
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <uapi/linux/tc_act/tc_ife.h>
+#include <net/tc_act/tc_ife.h>
+
+static int skbprio_check(struct sk_buff *skb, struct tcf_meta_info *e)
+{
+	return ife_check_meta_u32(skb->priority, e);
+}
+
+static int skbprio_encode(struct sk_buff *skb, void *skbdata,
+			  struct tcf_meta_info *e)
+{
+	u32 ifeprio = skb->priority; /* avoid having to cast skb->priority*/
+
+	return ife_encode_meta_u32(ifeprio, skbdata, e);
+}
+
+static int skbprio_decode(struct sk_buff *skb, void *data, u16 len)
+{
+	u32 ifeprio = *(u32 *)data;
+
+	skb->priority = ntohl(ifeprio);
+	return 0;
+}
+
+static struct tcf_meta_ops ife_prio_ops = {
+	.metaid = IFE_META_PRIO,
+	.metatype = NLA_U32,
+	.name = "skbprio",
+	.synopsis = "skb prio metadata",
+	.check_presence = skbprio_check,
+	.encode = skbprio_encode,
+	.decode = skbprio_decode,
+	.get = ife_get_meta_u32,
+	.alloc = ife_alloc_meta_u32,
+	.owner = THIS_MODULE,
+};
+
+static int __init ifeprio_init_module(void)
+{
+	return register_ife_op(&ife_prio_ops);
+}
+
+static void __exit ifeprio_cleanup_module(void)
+{
+	unregister_ife_op(&ife_prio_ops);
+}
+
+module_init(ifeprio_init_module);
+module_exit(ifeprio_cleanup_module);
+
+MODULE_AUTHOR("Jamal Hadi Salim(2015)");
+MODULE_DESCRIPTION("Inter-FE skb prio metadata action");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_IFE_META(IFE_META_PRIO);
-- 
cgit v1.2.3-59-g8ed1b


From 822c868532cae2cc1c51f4f18ab61c194d98aaf6 Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Sat, 27 Feb 2016 00:32:15 -0800
Subject: net: ipv4: Convert IP network timestamps to be y2038 safe

ICMP timestamp messages and IP source route options require
timestamps to be in milliseconds modulo 24 hours from
midnight UT format.

Add inet_current_timestamp() function to support this. The function
returns the required timestamp in network byte order.

Timestamp calculation is also changed to call ktime_get_real_ts64()
which uses struct timespec64. struct timespec64 is y2038 safe.
Previously it called getnstimeofday() which uses struct timespec.
struct timespec is not y2038 safe.

Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Cc: James Morris <jmorris@namei.org>
Cc: Patrick McHardy <kaber@trash.net>
Acked-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h      |  2 ++
 net/ipv4/af_inet.c    | 26 ++++++++++++++++++++++++++
 net/ipv4/icmp.c       |  5 +----
 net/ipv4/ip_options.c | 14 ++++++--------
 4 files changed, 35 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/include/net/ip.h b/include/net/ip.h
index cbb134b2f0e4..fad74d323bd6 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -240,6 +240,8 @@ static inline int inet_is_local_reserved_port(struct net *net, int port)
 }
 #endif
 
+__be32 inet_current_timestamp(void);
+
 /* From inetpeer.c */
 extern int inet_peer_threshold;
 extern int inet_peer_minttl;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 209d1ed28954..0cc923f83e10 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1380,6 +1380,32 @@ out:
 	return pp;
 }
 
+#define SECONDS_PER_DAY	86400
+
+/* inet_current_timestamp - Return IP network timestamp
+ *
+ * Return milliseconds since midnight in network byte order.
+ */
+__be32 inet_current_timestamp(void)
+{
+	u32 secs;
+	u32 msecs;
+	struct timespec64 ts;
+
+	ktime_get_real_ts64(&ts);
+
+	/* Get secs since midnight. */
+	(void)div_u64_rem(ts.tv_sec, SECONDS_PER_DAY, &secs);
+	/* Convert to msecs. */
+	msecs = secs * MSEC_PER_SEC;
+	/* Convert nsec to msec. */
+	msecs += (u32)ts.tv_nsec / NSEC_PER_MSEC;
+
+	/* Convert to network byte order. */
+	return htons(msecs);
+}
+EXPORT_SYMBOL(inet_current_timestamp);
+
 int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
 {
 	if (sk->sk_family == AF_INET)
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 36e26977c908..6333489771ed 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -931,7 +931,6 @@ static bool icmp_echo(struct sk_buff *skb)
  */
 static bool icmp_timestamp(struct sk_buff *skb)
 {
-	struct timespec tv;
 	struct icmp_bxm icmp_param;
 	/*
 	 *	Too short.
@@ -942,9 +941,7 @@ static bool icmp_timestamp(struct sk_buff *skb)
 	/*
 	 *	Fill in the current time as ms since midnight UT:
 	 */
-	getnstimeofday(&tv);
-	icmp_param.data.times[1] = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC +
-					 tv.tv_nsec / NSEC_PER_MSEC);
+	icmp_param.data.times[1] = inet_current_timestamp();
 	icmp_param.data.times[2] = icmp_param.data.times[1];
 	if (skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4))
 		BUG();
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index bd246792360b..4d158ff1def1 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -58,10 +58,9 @@ void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
 		if (opt->ts_needaddr)
 			ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, skb, rt);
 		if (opt->ts_needtime) {
-			struct timespec tv;
 			__be32 midtime;
-			getnstimeofday(&tv);
-			midtime = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC);
+
+			midtime = inet_current_timestamp();
 			memcpy(iph+opt->ts+iph[opt->ts+2]-5, &midtime, 4);
 		}
 		return;
@@ -415,11 +414,10 @@ int ip_options_compile(struct net *net,
 					break;
 				}
 				if (timeptr) {
-					struct timespec tv;
-					u32  midtime;
-					getnstimeofday(&tv);
-					midtime = (tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC;
-					put_unaligned_be32(midtime, timeptr);
+					__be32 midtime;
+
+					midtime = inet_current_timestamp();
+					memcpy(timeptr, &midtime, 4);
 					opt->is_changed = 1;
 				}
 			} else if ((optptr[3]&0xF) != IPOPT_TS_PRESPEC) {
-- 
cgit v1.2.3-59-g8ed1b


From b1b270d863c022f3dbf1f8786fd2956703ee10fc Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Sat, 27 Feb 2016 00:32:16 -0800
Subject: net: ipv4: tcp_probe: Replace timespec with timespec64

TCP probe log timestamps use struct timespec which is
not y2038 safe. Even though timespec might be good enough here
as it is used to represent delta time, the plan is to get rid
of all uses of timespec in the kernel.
Replace with struct timespec64 which is y2038 safe.

Prints still use unsigned long format and type.

Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: James Morris <jmorris@namei.org>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Cc: Patrick McHardy <kaber@trash.net>
Acked-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_probe.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index ebf5ff57526e..f6c50af24a64 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -187,13 +187,13 @@ static int tcpprobe_sprint(char *tbuf, int n)
 {
 	const struct tcp_log *p
 		= tcp_probe.log + tcp_probe.tail;
-	struct timespec tv
-		= ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start));
+	struct timespec64 ts
+		= ktime_to_timespec64(ktime_sub(p->tstamp, tcp_probe.start));
 
 	return scnprintf(tbuf, n,
 			"%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n",
-			(unsigned long)tv.tv_sec,
-			(unsigned long)tv.tv_nsec,
+			(unsigned long)ts.tv_sec,
+			(unsigned long)ts.tv_nsec,
 			&p->src, &p->dst, p->length, p->snd_nxt, p->snd_una,
 			p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 6497c7e6400105424a7d1dddc3f3aadcc8e3588a Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Sat, 27 Feb 2016 00:32:17 -0800
Subject: net: sctp: Convert log timestamps to be y2038 safe

SCTP probe log timestamps use struct timespec which is
not y2038 safe.
Use struct timespec64 which is 2038 safe instead.

Use monotonic time instead of real time as only time
differences are logged.

Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Cc: Vlad Yasevich <vyasevich@gmail.com>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: linux-sctp@vger.kernel.org
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/probe.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/sctp/probe.c b/net/sctp/probe.c
index 5e68b94ee640..6cc2152e0740 100644
--- a/net/sctp/probe.c
+++ b/net/sctp/probe.c
@@ -65,7 +65,7 @@ static struct {
 	struct kfifo	  fifo;
 	spinlock_t	  lock;
 	wait_queue_head_t wait;
-	struct timespec	  tstart;
+	struct timespec64 tstart;
 } sctpw;
 
 static __printf(1, 2) void printl(const char *fmt, ...)
@@ -85,7 +85,7 @@ static __printf(1, 2) void printl(const char *fmt, ...)
 static int sctpprobe_open(struct inode *inode, struct file *file)
 {
 	kfifo_reset(&sctpw.fifo);
-	getnstimeofday(&sctpw.tstart);
+	ktime_get_ts64(&sctpw.tstart);
 
 	return 0;
 }
@@ -138,7 +138,7 @@ static sctp_disposition_t jsctp_sf_eat_sack(struct net *net,
 	struct sk_buff *skb = chunk->skb;
 	struct sctp_transport *sp;
 	static __u32 lcwnd = 0;
-	struct timespec now;
+	struct timespec64 now;
 
 	sp = asoc->peer.primary_path;
 
@@ -149,8 +149,8 @@ static sctp_disposition_t jsctp_sf_eat_sack(struct net *net,
 	    (full || sp->cwnd != lcwnd)) {
 		lcwnd = sp->cwnd;
 
-		getnstimeofday(&now);
-		now = timespec_sub(now, sctpw.tstart);
+		ktime_get_ts64(&now);
+		now = timespec64_sub(now, sctpw.tstart);
 
 		printl("%lu.%06lu ", (unsigned long) now.tv_sec,
 		       (unsigned long) now.tv_nsec / NSEC_PER_USEC);
-- 
cgit v1.2.3-59-g8ed1b


From 4f25a1110cd4f5746b750a9567303c7ef6826b7c Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Sat, 27 Feb 2016 09:59:17 -0800
Subject: net: ipv6/l3mdev: Move host route on saved address if necessary

Commit f1705ec197e70 allows IPv6 addresses to be retained on a link down.
The address can have a cached host route which can point to the wrong
FIB table if the L3 enslavement is changed (e.g., route can point to local
table instead of VRF table if device is added to an L3 domain).

On link up check the table of the cached host route against the FIB
table associated with the device and correct if needed.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index a2d6f6c242af..b1169d511ab4 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3170,9 +3170,35 @@ static void addrconf_gre_config(struct net_device *dev)
 }
 #endif
 
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+/* If the host route is cached on the addr struct make sure it is associated
+ * with the proper table. e.g., enslavement can change and if so the cached
+ * host route needs to move to the new table.
+ */
+static void l3mdev_check_host_rt(struct inet6_dev *idev,
+				  struct inet6_ifaddr *ifp)
+{
+	if (ifp->rt) {
+		u32 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
+
+		if (tb_id != ifp->rt->rt6i_table->tb6_id) {
+			ip6_del_rt(ifp->rt);
+			ifp->rt = NULL;
+		}
+	}
+}
+#else
+static void l3mdev_check_host_rt(struct inet6_dev *idev,
+				  struct inet6_ifaddr *ifp)
+{
+}
+#endif
+
 static int fixup_permanent_addr(struct inet6_dev *idev,
 				struct inet6_ifaddr *ifp)
 {
+	l3mdev_check_host_rt(idev, ifp);
+
 	if (!ifp->rt) {
 		struct rt6_info *rt;
 
-- 
cgit v1.2.3-59-g8ed1b


From 64d4e3431e686dc37ce388ba531c4c4e866fb141 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Sat, 27 Feb 2016 20:19:54 -0800
Subject: net: remove skb_sender_cpu_clear()

After commit 52bd2d62ce67 ("net: better skb->sender_cpu and skb->napi_id cohabitation")
skb_sender_cpu_clear() becomes empty and can be removed.

Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h          | 4 ----
 net/bridge/br_forward.c         | 1 -
 net/core/filter.c               | 2 --
 net/core/skbuff.c               | 1 -
 net/ipv4/ip_forward.c           | 1 -
 net/ipv6/ip6_output.c           | 1 -
 net/netfilter/ipvs/ip_vs_xmit.c | 6 ------
 net/netfilter/nf_dup_netdev.c   | 1 -
 net/sched/act_mirred.c          | 1 -
 9 files changed, 18 deletions(-)

(limited to 'net')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index eab4f8fbed58..797cefb888fb 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1161,10 +1161,6 @@ static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from)
 	to->l4_hash = from->l4_hash;
 };
 
-static inline void skb_sender_cpu_clear(struct sk_buff *skb)
-{
-}
-
 #ifdef NET_SKBUFF_DATA_USES_OFFSET
 static inline unsigned char *skb_end_pointer(const struct sk_buff *skb)
 {
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index fcdb86dd5a23..f47759f05b6d 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -44,7 +44,6 @@ int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb
 
 	skb_push(skb, ETH_HLEN);
 	br_drop_fake_rtable(skb);
-	skb_sender_cpu_clear(skb);
 
 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
 	    (skb->protocol == htons(ETH_P_8021Q) ||
diff --git a/net/core/filter.c b/net/core/filter.c
index a3aba15a8025..5e2a3b5e5196 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1597,7 +1597,6 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
 	}
 
 	skb2->dev = dev;
-	skb_sender_cpu_clear(skb2);
 	return dev_queue_xmit(skb2);
 }
 
@@ -1650,7 +1649,6 @@ int skb_do_redirect(struct sk_buff *skb)
 	}
 
 	skb->dev = dev;
-	skb_sender_cpu_clear(skb);
 	return dev_queue_xmit(skb);
 }
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 488566b09c6d..7af7ec635d90 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4302,7 +4302,6 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
 	skb->skb_iif = 0;
 	skb->ignore_df = 0;
 	skb_dst_drop(skb);
-	skb_sender_cpu_clear(skb);
 	secpath_reset(skb);
 	nf_reset(skb);
 	nf_reset_trace(skb);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index da0d7ce85844..af18f1e4889e 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -71,7 +71,6 @@ static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *s
 	if (unlikely(opt->optlen))
 		ip_forward_options(skb);
 
-	skb_sender_cpu_clear(skb);
 	return dst_output(net, sk, skb);
 }
 
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index a163102f1803..9428345d3a07 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -332,7 +332,6 @@ static int ip6_forward_proxy_check(struct sk_buff *skb)
 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
 				     struct sk_buff *skb)
 {
-	skb_sender_cpu_clear(skb);
 	return dst_output(net, sk, skb);
 }
 
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index a3f5cd9b3c4c..dc196a0f501d 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -531,8 +531,6 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
 	if (ret == NF_ACCEPT) {
 		nf_reset(skb);
 		skb_forward_csum(skb);
-		if (!skb->sk)
-			skb_sender_cpu_clear(skb);
 	}
 	return ret;
 }
@@ -573,8 +571,6 @@ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
 
 	if (!local) {
 		skb_forward_csum(skb);
-		if (!skb->sk)
-			skb_sender_cpu_clear(skb);
 		NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
 			NULL, skb_dst(skb)->dev, dst_output);
 	} else
@@ -595,8 +591,6 @@ static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
 	if (!local) {
 		ip_vs_drop_early_demux_sk(skb);
 		skb_forward_csum(skb);
-		if (!skb->sk)
-			skb_sender_cpu_clear(skb);
 		NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
 			NULL, skb_dst(skb)->dev, dst_output);
 	} else
diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c
index 8414ee1a0319..7ec69723940f 100644
--- a/net/netfilter/nf_dup_netdev.c
+++ b/net/netfilter/nf_dup_netdev.c
@@ -31,7 +31,6 @@ void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif)
 		skb_push(skb, skb->mac_len);
 
 	skb->dev = dev;
-	skb_sender_cpu_clear(skb);
 	dev_queue_xmit(skb);
 }
 EXPORT_SYMBOL_GPL(nf_dup_netdev_egress);
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 6b284d991e0b..e8a760cf7775 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -182,7 +182,6 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
 
 	skb2->skb_iif = skb->dev->ifindex;
 	skb2->dev = dev;
-	skb_sender_cpu_clear(skb2);
 	err = dev_queue_xmit(skb2);
 
 	if (err) {
-- 
cgit v1.2.3-59-g8ed1b


From 241deec94425b5e962e2bf8a929d563afda045f7 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 1 Mar 2016 17:44:59 -0500
Subject: sch_mqprio: Fix build with older gcc.

  CC [M]  net/sched/sch_mqprio.o
net/sched/sch_mqprio.c: In function ?mqprio_init?:
net/sched/sch_mqprio.c:145: error: unknown field ?tc? specified in initializer
net/sched/sch_mqprio.c:145: warning: missing braces around initializer
net/sched/sch_mqprio.c:145: warning: (near initialization for ?tc.<anonymous>?)
make[2]: *** [net/sched/sch_mqprio.o] Error 1
make[1]: *** [net/sched] Error 2
make: *** [net] Error 2

Several people reported this, surround the unnamed union
member initialization with braces to fix.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_mqprio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index f9947d1f4952..02ffb3fbbc20 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -142,7 +142,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
 	 */
 	if (qopt->hw) {
 		struct tc_to_netdev tc = {.type = TC_SETUP_MQPRIO,
-					  .tc = qopt->num_tc};
+					  { .tc = qopt->num_tc }};
 
 		priv->hw_owned = 1;
 		err = dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0, &tc);
-- 
cgit v1.2.3-59-g8ed1b


From c145aeb3ff899f38bd28ccbc40dbb647dd1b1fd3 Mon Sep 17 00:00:00 2001
From: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
Date: Mon, 29 Feb 2016 08:21:30 +0000
Subject: net: pktgen: use reset to set mac header

Since offset is zero, it's not necessary to use set function.

Signed-off-by: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 1474cfd2dc1c..20999aa596dd 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2856,7 +2856,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
 		*vlan_encapsulated_proto = htons(ETH_P_IP);
 	}
 
-	skb_set_mac_header(skb, 0);
+	skb_reset_mac_header(skb);
 	skb_set_network_header(skb, skb->len);
 	iph = (struct iphdr *) skb_put(skb, sizeof(struct iphdr));
 
@@ -2983,7 +2983,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
 		*vlan_encapsulated_proto = htons(ETH_P_IPV6);
 	}
 
-	skb_set_mac_header(skb, 0);
+	skb_reset_mac_header(skb);
 	skb_set_network_header(skb, skb->len);
 	iph = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
 
-- 
cgit v1.2.3-59-g8ed1b


From fb653ebddcf871e8a83a63c1e4e73d41bc3cf599 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 2 Mar 2016 14:54:35 +0100
Subject: batman-adv: clarify CFG80211 dependency

The driver calls cfg80211_get_station, which may be part of a
module, so we must not enable BATMAN_ADV_BATMAN_V if
BATMAN_ADV=y and CFG80211=m:

net/built-in.o: In function `batadv_v_elp_get_throughput':
(text+0x5c62c): undefined reference to `cfg80211_get_station'

This clarifies the dependency to cover all combinations.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Fixes: c833484e5f38 ("batman-adv: ELP - compute the metric based on the estimated throughput")
Acked-by: Antonio Quartulli <a@unstable.cc>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/batman-adv/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
index e651dc927bfd..f66930ee3c0b 100644
--- a/net/batman-adv/Kconfig
+++ b/net/batman-adv/Kconfig
@@ -17,7 +17,7 @@ config BATMAN_ADV
 
 config BATMAN_ADV_BATMAN_V
 	bool "B.A.T.M.A.N. V protocol (experimental)"
-	depends on BATMAN_ADV && CFG80211
+	depends on BATMAN_ADV && CFG80211=y || (CFG80211=m && BATMAN_ADV=m)
 	default n
 	help
 	  This option enables the B.A.T.M.A.N. V protocol, the successor
-- 
cgit v1.2.3-59-g8ed1b


From 5f547391f5f25bb71f27860ad25bd1e4715f0752 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@ovn.org>
Date: Wed, 3 Feb 2016 10:00:10 -0800
Subject: netfilter: nf_defrag_ipv4: Drop redundant ip_send_check()

Since commit 0848f6428ba3 ("inet: frags: fix defragmented packet's IP
header for af_packet"), ip_send_check() would be called twice for
defragmentation that occurs from netfilter ipv4 defrag hooks. Remove the
extra call.

Signed-off-by: Joe Stringer <joe@ovn.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/nf_defrag_ipv4.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index a04dee536b8e..d88da36b383c 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -31,10 +31,8 @@ static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb,
 	err = ip_defrag(net, skb, user);
 	local_bh_enable();
 
-	if (!err) {
-		ip_send_check(ip_hdr(skb));
+	if (!err)
 		skb->ignore_df = 1;
-	}
 
 	return err;
 }
-- 
cgit v1.2.3-59-g8ed1b


From a67dd266adf42a24df31380e9da78390bb4d65ef Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 25 Feb 2016 10:08:35 +0100
Subject: netfilter: xtables: prepare for on-demand hook register

This change prepares for upcoming on-demand xtables hook registration.

We change the protoypes of the register/unregister functions.
A followup patch will then add nf_hook_register/unregister calls
to the iptables one.

Once a hook is registered packets will be picked up, so all assignments
of the form

net->ipv4.iptable_$table = new_table

have to be moved to ip(6)t_register_table, else we can see NULL
net->ipv4.iptable_$table later.

This patch doesn't change functionality; without this the actual change
simply gets too big.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_arp/arp_tables.h  |  9 +++++----
 include/linux/netfilter_ipv4/ip_tables.h  |  9 +++++----
 include/linux/netfilter_ipv6/ip6_tables.h |  9 +++++----
 net/ipv4/netfilter/arp_tables.c           | 25 ++++++++++++++-----------
 net/ipv4/netfilter/arptable_filter.c      | 11 ++++++-----
 net/ipv4/netfilter/ip_tables.c            | 21 ++++++++++-----------
 net/ipv4/netfilter/iptable_filter.c       |  9 +++++----
 net/ipv4/netfilter/iptable_mangle.c       |  9 +++++----
 net/ipv4/netfilter/iptable_nat.c          |  8 +++++---
 net/ipv4/netfilter/iptable_raw.c          |  9 +++++----
 net/ipv4/netfilter/iptable_security.c     |  9 +++++----
 net/ipv6/netfilter/ip6_tables.c           | 23 ++++++++++++-----------
 net/ipv6/netfilter/ip6table_filter.c      |  9 +++++----
 net/ipv6/netfilter/ip6table_mangle.c      |  9 +++++----
 net/ipv6/netfilter/ip6table_nat.c         |  8 +++++---
 net/ipv6/netfilter/ip6table_raw.c         |  9 +++++----
 net/ipv6/netfilter/ip6table_security.c    |  9 +++++----
 17 files changed, 107 insertions(+), 88 deletions(-)

(limited to 'net')

diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h
index 6f074db2f23d..029b95e8924e 100644
--- a/include/linux/netfilter_arp/arp_tables.h
+++ b/include/linux/netfilter_arp/arp_tables.h
@@ -48,10 +48,11 @@ struct arpt_error {
 }
 
 extern void *arpt_alloc_initial_table(const struct xt_table *);
-extern struct xt_table *arpt_register_table(struct net *net,
-					    const struct xt_table *table,
-					    const struct arpt_replace *repl);
-extern void arpt_unregister_table(struct xt_table *table);
+int arpt_register_table(struct net *net, const struct xt_table *table,
+			const struct arpt_replace *repl,
+			const struct nf_hook_ops *ops, struct xt_table **res);
+void arpt_unregister_table(struct net *net, struct xt_table *table,
+			   const struct nf_hook_ops *ops);
 extern unsigned int arpt_do_table(struct sk_buff *skb,
 				  const struct nf_hook_state *state,
 				  struct xt_table *table);
diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h
index aa598f942c01..7bfc5893ec31 100644
--- a/include/linux/netfilter_ipv4/ip_tables.h
+++ b/include/linux/netfilter_ipv4/ip_tables.h
@@ -24,10 +24,11 @@
 
 extern void ipt_init(void) __init;
 
-extern struct xt_table *ipt_register_table(struct net *net,
-					   const struct xt_table *table,
-					   const struct ipt_replace *repl);
-extern void ipt_unregister_table(struct net *net, struct xt_table *table);
+int ipt_register_table(struct net *net, const struct xt_table *table,
+		       const struct ipt_replace *repl,
+		       const struct nf_hook_ops *ops, struct xt_table **res);
+void ipt_unregister_table(struct net *net, struct xt_table *table,
+			  const struct nf_hook_ops *ops);
 
 /* Standard entry. */
 struct ipt_standard {
diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h
index 0f76e5c674f9..b21c392d6012 100644
--- a/include/linux/netfilter_ipv6/ip6_tables.h
+++ b/include/linux/netfilter_ipv6/ip6_tables.h
@@ -25,10 +25,11 @@
 extern void ip6t_init(void) __init;
 
 extern void *ip6t_alloc_initial_table(const struct xt_table *);
-extern struct xt_table *ip6t_register_table(struct net *net,
-					    const struct xt_table *table,
-					    const struct ip6t_replace *repl);
-extern void ip6t_unregister_table(struct net *net, struct xt_table *table);
+int ip6t_register_table(struct net *net, const struct xt_table *table,
+			const struct ip6t_replace *repl,
+			const struct nf_hook_ops *ops, struct xt_table **res);
+void ip6t_unregister_table(struct net *net, struct xt_table *table,
+			   const struct nf_hook_ops *ops);
 extern unsigned int ip6t_do_table(struct sk_buff *skb,
 				  const struct nf_hook_state *state,
 				  struct xt_table *table);
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index b488cac9c5ca..00eed0852dfc 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1780,9 +1780,11 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
 	return ret;
 }
 
-struct xt_table *arpt_register_table(struct net *net,
-				     const struct xt_table *table,
-				     const struct arpt_replace *repl)
+int arpt_register_table(struct net *net,
+			const struct xt_table *table,
+			const struct arpt_replace *repl,
+			const struct nf_hook_ops *ops,
+			struct xt_table **res)
 {
 	int ret;
 	struct xt_table_info *newinfo;
@@ -1791,10 +1793,8 @@ struct xt_table *arpt_register_table(struct net *net,
 	struct xt_table *new_table;
 
 	newinfo = xt_alloc_table_info(repl->size);
-	if (!newinfo) {
-		ret = -ENOMEM;
-		goto out;
-	}
+	if (!newinfo)
+		return -ENOMEM;
 
 	loc_cpu_entry = newinfo->entries;
 	memcpy(loc_cpu_entry, repl->entries, repl->size);
@@ -1809,15 +1809,18 @@ struct xt_table *arpt_register_table(struct net *net,
 		ret = PTR_ERR(new_table);
 		goto out_free;
 	}
-	return new_table;
+
+	WRITE_ONCE(*res, new_table);
+
+	return ret;
 
 out_free:
 	xt_free_table_info(newinfo);
-out:
-	return ERR_PTR(ret);
+	return ret;
 }
 
-void arpt_unregister_table(struct xt_table *table)
+void arpt_unregister_table(struct net *net, struct xt_table *table,
+			   const struct nf_hook_ops *ops)
 {
 	struct xt_table_info *private;
 	void *loc_cpu_entry;
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index 1897ee160920..4c0241692576 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -38,19 +38,20 @@ static struct nf_hook_ops *arpfilter_ops __read_mostly;
 static int __net_init arptable_filter_net_init(struct net *net)
 {
 	struct arpt_replace *repl;
-	
+	int err;
+
 	repl = arpt_alloc_initial_table(&packet_filter);
 	if (repl == NULL)
 		return -ENOMEM;
-	net->ipv4.arptable_filter =
-		arpt_register_table(net, &packet_filter, repl);
+	err = arpt_register_table(net, &packet_filter, repl, arpfilter_ops,
+				  &net->ipv4.arptable_filter);
 	kfree(repl);
-	return PTR_ERR_OR_ZERO(net->ipv4.arptable_filter);
+	return err;
 }
 
 static void __net_exit arptable_filter_net_exit(struct net *net)
 {
-	arpt_unregister_table(net->ipv4.arptable_filter);
+	arpt_unregister_table(net, net->ipv4.arptable_filter, arpfilter_ops);
 }
 
 static struct pernet_operations arptable_filter_net_ops = {
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index b99affad6ba1..1eb4fe5b4702 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -2062,9 +2062,9 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 	return ret;
 }
 
-struct xt_table *ipt_register_table(struct net *net,
-				    const struct xt_table *table,
-				    const struct ipt_replace *repl)
+int ipt_register_table(struct net *net, const struct xt_table *table,
+		       const struct ipt_replace *repl,
+		       const struct nf_hook_ops *ops, struct xt_table **res)
 {
 	int ret;
 	struct xt_table_info *newinfo;
@@ -2073,10 +2073,8 @@ struct xt_table *ipt_register_table(struct net *net,
 	struct xt_table *new_table;
 
 	newinfo = xt_alloc_table_info(repl->size);
-	if (!newinfo) {
-		ret = -ENOMEM;
-		goto out;
-	}
+	if (!newinfo)
+		return -ENOMEM;
 
 	loc_cpu_entry = newinfo->entries;
 	memcpy(loc_cpu_entry, repl->entries, repl->size);
@@ -2091,15 +2089,16 @@ struct xt_table *ipt_register_table(struct net *net,
 		goto out_free;
 	}
 
-	return new_table;
+	WRITE_ONCE(*res, new_table);
+	return ret;
 
 out_free:
 	xt_free_table_info(newinfo);
-out:
-	return ERR_PTR(ret);
+	return ret;
 }
 
-void ipt_unregister_table(struct net *net, struct xt_table *table)
+void ipt_unregister_table(struct net *net, struct xt_table *table,
+			  const struct nf_hook_ops *ops)
 {
 	struct xt_table_info *private;
 	void *loc_cpu_entry;
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index 397ef2dd133e..3fbe4acacb27 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -54,6 +54,7 @@ module_param(forward, bool, 0000);
 static int __net_init iptable_filter_net_init(struct net *net)
 {
 	struct ipt_replace *repl;
+	int err;
 
 	repl = ipt_alloc_initial_table(&packet_filter);
 	if (repl == NULL)
@@ -62,15 +63,15 @@ static int __net_init iptable_filter_net_init(struct net *net)
 	((struct ipt_standard *)repl->entries)[1].target.verdict =
 		forward ? -NF_ACCEPT - 1 : -NF_DROP - 1;
 
-	net->ipv4.iptable_filter =
-		ipt_register_table(net, &packet_filter, repl);
+	err = ipt_register_table(net, &packet_filter, repl, filter_ops,
+				 &net->ipv4.iptable_filter);
 	kfree(repl);
-	return PTR_ERR_OR_ZERO(net->ipv4.iptable_filter);
+	return err;
 }
 
 static void __net_exit iptable_filter_net_exit(struct net *net)
 {
-	ipt_unregister_table(net, net->ipv4.iptable_filter);
+	ipt_unregister_table(net, net->ipv4.iptable_filter, filter_ops);
 }
 
 static struct pernet_operations iptable_filter_net_ops = {
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index ba5d392a13c4..668e79166b81 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -96,19 +96,20 @@ static struct nf_hook_ops *mangle_ops __read_mostly;
 static int __net_init iptable_mangle_net_init(struct net *net)
 {
 	struct ipt_replace *repl;
+	int ret;
 
 	repl = ipt_alloc_initial_table(&packet_mangler);
 	if (repl == NULL)
 		return -ENOMEM;
-	net->ipv4.iptable_mangle =
-		ipt_register_table(net, &packet_mangler, repl);
+	ret = ipt_register_table(net, &packet_mangler, repl, mangle_ops,
+				 &net->ipv4.iptable_mangle);
 	kfree(repl);
-	return PTR_ERR_OR_ZERO(net->ipv4.iptable_mangle);
+	return ret;
 }
 
 static void __net_exit iptable_mangle_net_exit(struct net *net)
 {
-	ipt_unregister_table(net, net->ipv4.iptable_mangle);
+	ipt_unregister_table(net, net->ipv4.iptable_mangle, mangle_ops);
 }
 
 static struct pernet_operations iptable_mangle_net_ops = {
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index ae2cd2752046..e984f1d3017f 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -98,18 +98,20 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
 static int __net_init iptable_nat_net_init(struct net *net)
 {
 	struct ipt_replace *repl;
+	int ret;
 
 	repl = ipt_alloc_initial_table(&nf_nat_ipv4_table);
 	if (repl == NULL)
 		return -ENOMEM;
-	net->ipv4.nat_table = ipt_register_table(net, &nf_nat_ipv4_table, repl);
+	ret = ipt_register_table(net, &nf_nat_ipv4_table, repl,
+				 nf_nat_ipv4_ops, &net->ipv4.nat_table);
 	kfree(repl);
-	return PTR_ERR_OR_ZERO(net->ipv4.nat_table);
+	return ret;
 }
 
 static void __net_exit iptable_nat_net_exit(struct net *net)
 {
-	ipt_unregister_table(net, net->ipv4.nat_table);
+	ipt_unregister_table(net, net->ipv4.nat_table, nf_nat_ipv4_ops);
 }
 
 static struct pernet_operations iptable_nat_net_ops = {
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 1ba02811acb0..9d78780a9036 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -37,19 +37,20 @@ static struct nf_hook_ops *rawtable_ops __read_mostly;
 static int __net_init iptable_raw_net_init(struct net *net)
 {
 	struct ipt_replace *repl;
+	int ret;
 
 	repl = ipt_alloc_initial_table(&packet_raw);
 	if (repl == NULL)
 		return -ENOMEM;
-	net->ipv4.iptable_raw =
-		ipt_register_table(net, &packet_raw, repl);
+	ret = ipt_register_table(net, &packet_raw, repl, rawtable_ops,
+				 &net->ipv4.iptable_raw);
 	kfree(repl);
-	return PTR_ERR_OR_ZERO(net->ipv4.iptable_raw);
+	return ret;
 }
 
 static void __net_exit iptable_raw_net_exit(struct net *net)
 {
-	ipt_unregister_table(net, net->ipv4.iptable_raw);
+	ipt_unregister_table(net, net->ipv4.iptable_raw, rawtable_ops);
 }
 
 static struct pernet_operations iptable_raw_net_ops = {
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index c2e23d5e9cd4..88bc52fb8f4a 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -54,19 +54,20 @@ static struct nf_hook_ops *sectbl_ops __read_mostly;
 static int __net_init iptable_security_net_init(struct net *net)
 {
 	struct ipt_replace *repl;
+	int ret;
 
 	repl = ipt_alloc_initial_table(&security_table);
 	if (repl == NULL)
 		return -ENOMEM;
-	net->ipv4.iptable_security =
-		ipt_register_table(net, &security_table, repl);
+	ret = ipt_register_table(net, &security_table, repl, sectbl_ops,
+				 &net->ipv4.iptable_security);
 	kfree(repl);
-	return PTR_ERR_OR_ZERO(net->ipv4.iptable_security);
+	return ret;
 }
 
 static void __net_exit iptable_security_net_exit(struct net *net)
 {
-	ipt_unregister_table(net, net->ipv4.iptable_security);
+	ipt_unregister_table(net, net->ipv4.iptable_security, sectbl_ops);
 }
 
 static struct pernet_operations iptable_security_net_ops = {
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 99425cf2819b..052d7447b52e 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -2071,9 +2071,10 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 	return ret;
 }
 
-struct xt_table *ip6t_register_table(struct net *net,
-				     const struct xt_table *table,
-				     const struct ip6t_replace *repl)
+int ip6t_register_table(struct net *net, const struct xt_table *table,
+			const struct ip6t_replace *repl,
+			const struct nf_hook_ops *ops,
+			struct xt_table **res)
 {
 	int ret;
 	struct xt_table_info *newinfo;
@@ -2082,10 +2083,8 @@ struct xt_table *ip6t_register_table(struct net *net,
 	struct xt_table *new_table;
 
 	newinfo = xt_alloc_table_info(repl->size);
-	if (!newinfo) {
-		ret = -ENOMEM;
-		goto out;
-	}
+	if (!newinfo)
+		return -ENOMEM;
 
 	loc_cpu_entry = newinfo->entries;
 	memcpy(loc_cpu_entry, repl->entries, repl->size);
@@ -2099,15 +2098,17 @@ struct xt_table *ip6t_register_table(struct net *net,
 		ret = PTR_ERR(new_table);
 		goto out_free;
 	}
-	return new_table;
+
+	WRITE_ONCE(*res, new_table);
+	return ret;
 
 out_free:
 	xt_free_table_info(newinfo);
-out:
-	return ERR_PTR(ret);
+	return ret;
 }
 
-void ip6t_unregister_table(struct net *net, struct xt_table *table)
+void ip6t_unregister_table(struct net *net, struct xt_table *table,
+			   const struct nf_hook_ops *ops)
 {
 	struct xt_table_info *private;
 	void *loc_cpu_entry;
diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c
index 8b277b983ca5..d191d54cdf50 100644
--- a/net/ipv6/netfilter/ip6table_filter.c
+++ b/net/ipv6/netfilter/ip6table_filter.c
@@ -47,6 +47,7 @@ module_param(forward, bool, 0000);
 static int __net_init ip6table_filter_net_init(struct net *net)
 {
 	struct ip6t_replace *repl;
+	int err;
 
 	repl = ip6t_alloc_initial_table(&packet_filter);
 	if (repl == NULL)
@@ -55,15 +56,15 @@ static int __net_init ip6table_filter_net_init(struct net *net)
 	((struct ip6t_standard *)repl->entries)[1].target.verdict =
 		forward ? -NF_ACCEPT - 1 : -NF_DROP - 1;
 
-	net->ipv6.ip6table_filter =
-		ip6t_register_table(net, &packet_filter, repl);
+	err = ip6t_register_table(net, &packet_filter, repl, filter_ops,
+				  &net->ipv6.ip6table_filter);
 	kfree(repl);
-	return PTR_ERR_OR_ZERO(net->ipv6.ip6table_filter);
+	return err;
 }
 
 static void __net_exit ip6table_filter_net_exit(struct net *net)
 {
-	ip6t_unregister_table(net, net->ipv6.ip6table_filter);
+	ip6t_unregister_table(net, net->ipv6.ip6table_filter, filter_ops);
 }
 
 static struct pernet_operations ip6table_filter_net_ops = {
diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c
index abe278b07932..fe43d08284bc 100644
--- a/net/ipv6/netfilter/ip6table_mangle.c
+++ b/net/ipv6/netfilter/ip6table_mangle.c
@@ -91,19 +91,20 @@ static struct nf_hook_ops *mangle_ops __read_mostly;
 static int __net_init ip6table_mangle_net_init(struct net *net)
 {
 	struct ip6t_replace *repl;
+	int ret;
 
 	repl = ip6t_alloc_initial_table(&packet_mangler);
 	if (repl == NULL)
 		return -ENOMEM;
-	net->ipv6.ip6table_mangle =
-		ip6t_register_table(net, &packet_mangler, repl);
+	ret = ip6t_register_table(net, &packet_mangler, repl, mangle_ops,
+				  &net->ipv6.ip6table_mangle);
 	kfree(repl);
-	return PTR_ERR_OR_ZERO(net->ipv6.ip6table_mangle);
+	return ret;
 }
 
 static void __net_exit ip6table_mangle_net_exit(struct net *net)
 {
-	ip6t_unregister_table(net, net->ipv6.ip6table_mangle);
+	ip6t_unregister_table(net, net->ipv6.ip6table_mangle, mangle_ops);
 }
 
 static struct pernet_operations ip6table_mangle_net_ops = {
diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c
index de2a10a565f5..7f9740e8ef47 100644
--- a/net/ipv6/netfilter/ip6table_nat.c
+++ b/net/ipv6/netfilter/ip6table_nat.c
@@ -100,18 +100,20 @@ static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = {
 static int __net_init ip6table_nat_net_init(struct net *net)
 {
 	struct ip6t_replace *repl;
+	int ret;
 
 	repl = ip6t_alloc_initial_table(&nf_nat_ipv6_table);
 	if (repl == NULL)
 		return -ENOMEM;
-	net->ipv6.ip6table_nat = ip6t_register_table(net, &nf_nat_ipv6_table, repl);
+	ret = ip6t_register_table(net, &nf_nat_ipv6_table, repl,
+				  nf_nat_ipv6_ops, &net->ipv6.ip6table_nat);
 	kfree(repl);
-	return PTR_ERR_OR_ZERO(net->ipv6.ip6table_nat);
+	return ret;
 }
 
 static void __net_exit ip6table_nat_net_exit(struct net *net)
 {
-	ip6t_unregister_table(net, net->ipv6.ip6table_nat);
+	ip6t_unregister_table(net, net->ipv6.ip6table_nat, nf_nat_ipv6_ops);
 }
 
 static struct pernet_operations ip6table_nat_net_ops = {
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index 9021963565c3..5fac433da069 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -30,19 +30,20 @@ static struct nf_hook_ops *rawtable_ops __read_mostly;
 static int __net_init ip6table_raw_net_init(struct net *net)
 {
 	struct ip6t_replace *repl;
+	int ret;
 
 	repl = ip6t_alloc_initial_table(&packet_raw);
 	if (repl == NULL)
 		return -ENOMEM;
-	net->ipv6.ip6table_raw =
-		ip6t_register_table(net, &packet_raw, repl);
+	ret = ip6t_register_table(net, &packet_raw, repl, rawtable_ops,
+				  &net->ipv6.ip6table_raw);
 	kfree(repl);
-	return PTR_ERR_OR_ZERO(net->ipv6.ip6table_raw);
+	return ret;
 }
 
 static void __net_exit ip6table_raw_net_exit(struct net *net)
 {
-	ip6t_unregister_table(net, net->ipv6.ip6table_raw);
+	ip6t_unregister_table(net, net->ipv6.ip6table_raw, rawtable_ops);
 }
 
 static struct pernet_operations ip6table_raw_net_ops = {
diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c
index 0d856fedfeb0..cf587453e322 100644
--- a/net/ipv6/netfilter/ip6table_security.c
+++ b/net/ipv6/netfilter/ip6table_security.c
@@ -47,19 +47,20 @@ static struct nf_hook_ops *sectbl_ops __read_mostly;
 static int __net_init ip6table_security_net_init(struct net *net)
 {
 	struct ip6t_replace *repl;
+	int ret;
 
 	repl = ip6t_alloc_initial_table(&security_table);
 	if (repl == NULL)
 		return -ENOMEM;
-	net->ipv6.ip6table_security =
-		ip6t_register_table(net, &security_table, repl);
+	ret = ip6t_register_table(net, &security_table, repl, sectbl_ops,
+				  &net->ipv6.ip6table_security);
 	kfree(repl);
-	return PTR_ERR_OR_ZERO(net->ipv6.ip6table_security);
+	return ret;
 }
 
 static void __net_exit ip6table_security_net_exit(struct net *net)
 {
-	ip6t_unregister_table(net, net->ipv6.ip6table_security);
+	ip6t_unregister_table(net, net->ipv6.ip6table_security, sectbl_ops);
 }
 
 static struct pernet_operations ip6table_security_net_ops = {
-- 
cgit v1.2.3-59-g8ed1b


From b9e69e127397187b70c813a4397cce7afb5e8cb1 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 25 Feb 2016 10:08:36 +0100
Subject: netfilter: xtables: don't hook tables by default

delay hook registration until the table is being requested inside a
namespace.

Historically, a particular table (iptables mangle, ip6tables filter, etc)
was registered on module load.

When netns support was added to iptables only the ip/ip6tables ruleset was
made namespace aware, not the actual hook points.

This means f.e. that when ipt_filter table/module is loaded on a system,
then each namespace on that system has an (empty) iptables filter ruleset.

In other words, if a namespace sends a packet, such skb is 'caught' by
netfilter machinery and fed to hooking points for that table (i.e. INPUT,
FORWARD, etc).

Thanks to Eric Biederman, hooks are no longer global, but per namespace.

This means that we can avoid allocation of empty ruleset in a namespace and
defer hook registration until we need the functionality.

We register a tables hook entry points ONLY in the initial namespace.
When an iptables get/setockopt is issued inside a given namespace, we check
if the table is found in the per-namespace list.

If not, we attempt to find it in the initial namespace, and, if found,
create an empty default table in the requesting namespace and register the
needed hooks.

Hook points are destroyed only once namespace is deleted, there is no
'usage count' (it makes no sense since there is no 'remove table' operation
in xtables api).

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h     |  6 ++--
 net/ipv4/netfilter/arp_tables.c        | 41 +++++++++++++--------
 net/ipv4/netfilter/arptable_filter.c   | 29 ++++++++-------
 net/ipv4/netfilter/ip_tables.c         | 42 ++++++++++++++--------
 net/ipv4/netfilter/iptable_filter.c    | 35 ++++++++++++------
 net/ipv4/netfilter/iptable_mangle.c    | 32 ++++++++++++-----
 net/ipv4/netfilter/iptable_nat.c       | 33 ++++++++---------
 net/ipv4/netfilter/iptable_raw.c       | 29 ++++++++++-----
 net/ipv4/netfilter/iptable_security.c  | 35 +++++++++++-------
 net/ipv6/netfilter/ip6_tables.c        | 42 ++++++++++++++--------
 net/ipv6/netfilter/ip6table_filter.c   | 38 ++++++++++++--------
 net/ipv6/netfilter/ip6table_mangle.c   | 37 +++++++++++--------
 net/ipv6/netfilter/ip6table_nat.c      | 33 ++++++++---------
 net/ipv6/netfilter/ip6table_raw.c      | 37 +++++++++++--------
 net/ipv6/netfilter/ip6table_security.c | 35 +++++++++++-------
 net/netfilter/x_tables.c               | 65 +++++++++++++++++++++-------------
 16 files changed, 361 insertions(+), 208 deletions(-)

(limited to 'net')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index c5577410c25d..80a305b85323 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -200,6 +200,9 @@ struct xt_table {
 	u_int8_t af;		/* address/protocol family */
 	int priority;		/* hook order */
 
+	/* called when table is needed in the given netns */
+	int (*table_init)(struct net *net);
+
 	/* A unique name... */
 	const char name[XT_TABLE_MAXNAMELEN];
 };
@@ -408,8 +411,7 @@ xt_get_per_cpu_counter(struct xt_counters *cnt, unsigned int cpu)
 	return cnt;
 }
 
-struct nf_hook_ops *xt_hook_link(const struct xt_table *, nf_hookfn *);
-void xt_hook_unlink(const struct xt_table *, struct nf_hook_ops *);
+struct nf_hook_ops *xt_hook_ops_alloc(const struct xt_table *, nf_hookfn *);
 
 #ifdef CONFIG_COMPAT
 #include <net/compat.h>
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 00eed0852dfc..bf081927e06b 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1780,6 +1780,24 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
 	return ret;
 }
 
+static void __arpt_unregister_table(struct xt_table *table)
+{
+	struct xt_table_info *private;
+	void *loc_cpu_entry;
+	struct module *table_owner = table->me;
+	struct arpt_entry *iter;
+
+	private = xt_unregister_table(table);
+
+	/* Decrease module usage counts and free resources */
+	loc_cpu_entry = private->entries;
+	xt_entry_foreach(iter, loc_cpu_entry, private->size)
+		cleanup_entry(iter);
+	if (private->number > private->initial_entries)
+		module_put(table_owner);
+	xt_free_table_info(private);
+}
+
 int arpt_register_table(struct net *net,
 			const struct xt_table *table,
 			const struct arpt_replace *repl,
@@ -1810,8 +1828,15 @@ int arpt_register_table(struct net *net,
 		goto out_free;
 	}
 
+	/* set res now, will see skbs right after nf_register_net_hooks */
 	WRITE_ONCE(*res, new_table);
 
+	ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
+	if (ret != 0) {
+		__arpt_unregister_table(new_table);
+		*res = NULL;
+	}
+
 	return ret;
 
 out_free:
@@ -1822,20 +1847,8 @@ out_free:
 void arpt_unregister_table(struct net *net, struct xt_table *table,
 			   const struct nf_hook_ops *ops)
 {
-	struct xt_table_info *private;
-	void *loc_cpu_entry;
-	struct module *table_owner = table->me;
-	struct arpt_entry *iter;
-
-	private = xt_unregister_table(table);
-
-	/* Decrease module usage counts and free resources */
-	loc_cpu_entry = private->entries;
-	xt_entry_foreach(iter, loc_cpu_entry, private->size)
-		cleanup_entry(iter);
-	if (private->number > private->initial_entries)
-		module_put(table_owner);
-	xt_free_table_info(private);
+	nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
+	__arpt_unregister_table(table);
 }
 
 /* The built-in targets: standard (NULL) and error. */
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index 4c0241692576..dd8c80dc32a2 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -17,12 +17,15 @@ MODULE_DESCRIPTION("arptables filter table");
 #define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \
 			   (1 << NF_ARP_FORWARD))
 
+static int __net_init arptable_filter_table_init(struct net *net);
+
 static const struct xt_table packet_filter = {
 	.name		= "filter",
 	.valid_hooks	= FILTER_VALID_HOOKS,
 	.me		= THIS_MODULE,
 	.af		= NFPROTO_ARP,
 	.priority	= NF_IP_PRI_FILTER,
+	.table_init	= arptable_filter_table_init,
 };
 
 /* The work comes in here from netfilter.c */
@@ -35,11 +38,14 @@ arptable_filter_hook(void *priv, struct sk_buff *skb,
 
 static struct nf_hook_ops *arpfilter_ops __read_mostly;
 
-static int __net_init arptable_filter_net_init(struct net *net)
+static int __net_init arptable_filter_table_init(struct net *net)
 {
 	struct arpt_replace *repl;
 	int err;
 
+	if (net->ipv4.arptable_filter)
+		return 0;
+
 	repl = arpt_alloc_initial_table(&packet_filter);
 	if (repl == NULL)
 		return -ENOMEM;
@@ -51,11 +57,13 @@ static int __net_init arptable_filter_net_init(struct net *net)
 
 static void __net_exit arptable_filter_net_exit(struct net *net)
 {
+	if (!net->ipv4.arptable_filter)
+		return;
 	arpt_unregister_table(net, net->ipv4.arptable_filter, arpfilter_ops);
+	net->ipv4.arptable_filter = NULL;
 }
 
 static struct pernet_operations arptable_filter_net_ops = {
-	.init = arptable_filter_net_init,
 	.exit = arptable_filter_net_exit,
 };
 
@@ -63,26 +71,23 @@ static int __init arptable_filter_init(void)
 {
 	int ret;
 
+	arpfilter_ops = xt_hook_ops_alloc(&packet_filter, arptable_filter_hook);
+	if (IS_ERR(arpfilter_ops))
+		return PTR_ERR(arpfilter_ops);
+
 	ret = register_pernet_subsys(&arptable_filter_net_ops);
-	if (ret < 0)
+	if (ret < 0) {
+		kfree(arpfilter_ops);
 		return ret;
-
-	arpfilter_ops = xt_hook_link(&packet_filter, arptable_filter_hook);
-	if (IS_ERR(arpfilter_ops)) {
-		ret = PTR_ERR(arpfilter_ops);
-		goto cleanup_table;
 	}
-	return ret;
 
-cleanup_table:
-	unregister_pernet_subsys(&arptable_filter_net_ops);
 	return ret;
 }
 
 static void __exit arptable_filter_fini(void)
 {
-	xt_hook_unlink(&packet_filter, arpfilter_ops);
 	unregister_pernet_subsys(&arptable_filter_net_ops);
+	kfree(arpfilter_ops);
 }
 
 module_init(arptable_filter_init);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 1eb4fe5b4702..e53f8d6f326d 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -2062,6 +2062,24 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 	return ret;
 }
 
+static void __ipt_unregister_table(struct net *net, struct xt_table *table)
+{
+	struct xt_table_info *private;
+	void *loc_cpu_entry;
+	struct module *table_owner = table->me;
+	struct ipt_entry *iter;
+
+	private = xt_unregister_table(table);
+
+	/* Decrease module usage counts and free resources */
+	loc_cpu_entry = private->entries;
+	xt_entry_foreach(iter, loc_cpu_entry, private->size)
+		cleanup_entry(iter, net);
+	if (private->number > private->initial_entries)
+		module_put(table_owner);
+	xt_free_table_info(private);
+}
+
 int ipt_register_table(struct net *net, const struct xt_table *table,
 		       const struct ipt_replace *repl,
 		       const struct nf_hook_ops *ops, struct xt_table **res)
@@ -2089,7 +2107,15 @@ int ipt_register_table(struct net *net, const struct xt_table *table,
 		goto out_free;
 	}
 
+	/* set res now, will see skbs right after nf_register_net_hooks */
 	WRITE_ONCE(*res, new_table);
+
+	ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
+	if (ret != 0) {
+		__ipt_unregister_table(net, new_table);
+		*res = NULL;
+	}
+
 	return ret;
 
 out_free:
@@ -2100,20 +2126,8 @@ out_free:
 void ipt_unregister_table(struct net *net, struct xt_table *table,
 			  const struct nf_hook_ops *ops)
 {
-	struct xt_table_info *private;
-	void *loc_cpu_entry;
-	struct module *table_owner = table->me;
-	struct ipt_entry *iter;
-
-	private = xt_unregister_table(table);
-
-	/* Decrease module usage counts and free resources */
-	loc_cpu_entry = private->entries;
-	xt_entry_foreach(iter, loc_cpu_entry, private->size)
-		cleanup_entry(iter, net);
-	if (private->number > private->initial_entries)
-		module_put(table_owner);
-	xt_free_table_info(private);
+	nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
+	__ipt_unregister_table(net, table);
 }
 
 /* Returns 1 if the type and code is matched by the range, 0 otherwise */
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index 3fbe4acacb27..7667f223d7f8 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -23,6 +23,7 @@ MODULE_DESCRIPTION("iptables filter table");
 #define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \
 			    (1 << NF_INET_FORWARD) | \
 			    (1 << NF_INET_LOCAL_OUT))
+static int __net_init iptable_filter_table_init(struct net *net);
 
 static const struct xt_table packet_filter = {
 	.name		= "filter",
@@ -30,6 +31,7 @@ static const struct xt_table packet_filter = {
 	.me		= THIS_MODULE,
 	.af		= NFPROTO_IPV4,
 	.priority	= NF_IP_PRI_FILTER,
+	.table_init	= iptable_filter_table_init,
 };
 
 static unsigned int
@@ -48,14 +50,17 @@ iptable_filter_hook(void *priv, struct sk_buff *skb,
 static struct nf_hook_ops *filter_ops __read_mostly;
 
 /* Default to forward because I got too much mail already. */
-static bool forward = true;
+static bool forward __read_mostly = true;
 module_param(forward, bool, 0000);
 
-static int __net_init iptable_filter_net_init(struct net *net)
+static int __net_init iptable_filter_table_init(struct net *net)
 {
 	struct ipt_replace *repl;
 	int err;
 
+	if (net->ipv4.iptable_filter)
+		return 0;
+
 	repl = ipt_alloc_initial_table(&packet_filter);
 	if (repl == NULL)
 		return -ENOMEM;
@@ -69,9 +74,20 @@ static int __net_init iptable_filter_net_init(struct net *net)
 	return err;
 }
 
+static int __net_init iptable_filter_net_init(struct net *net)
+{
+	if (net == &init_net || !forward)
+		return iptable_filter_table_init(net);
+
+	return 0;
+}
+
 static void __net_exit iptable_filter_net_exit(struct net *net)
 {
+	if (!net->ipv4.iptable_filter)
+		return;
 	ipt_unregister_table(net, net->ipv4.iptable_filter, filter_ops);
+	net->ipv4.iptable_filter = NULL;
 }
 
 static struct pernet_operations iptable_filter_net_ops = {
@@ -83,24 +99,21 @@ static int __init iptable_filter_init(void)
 {
 	int ret;
 
+	filter_ops = xt_hook_ops_alloc(&packet_filter, iptable_filter_hook);
+	if (IS_ERR(filter_ops))
+		return PTR_ERR(filter_ops);
+
 	ret = register_pernet_subsys(&iptable_filter_net_ops);
 	if (ret < 0)
-		return ret;
-
-	/* Register hooks */
-	filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook);
-	if (IS_ERR(filter_ops)) {
-		ret = PTR_ERR(filter_ops);
-		unregister_pernet_subsys(&iptable_filter_net_ops);
-	}
+		kfree(filter_ops);
 
 	return ret;
 }
 
 static void __exit iptable_filter_fini(void)
 {
-	xt_hook_unlink(&packet_filter, filter_ops);
 	unregister_pernet_subsys(&iptable_filter_net_ops);
+	kfree(filter_ops);
 }
 
 module_init(iptable_filter_init);
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 668e79166b81..57fc97cdac70 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -28,12 +28,15 @@ MODULE_DESCRIPTION("iptables mangle table");
 			    (1 << NF_INET_LOCAL_OUT) | \
 			    (1 << NF_INET_POST_ROUTING))
 
+static int __net_init iptable_mangle_table_init(struct net *net);
+
 static const struct xt_table packet_mangler = {
 	.name		= "mangle",
 	.valid_hooks	= MANGLE_VALID_HOOKS,
 	.me		= THIS_MODULE,
 	.af		= NFPROTO_IPV4,
 	.priority	= NF_IP_PRI_MANGLE,
+	.table_init	= iptable_mangle_table_init,
 };
 
 static unsigned int
@@ -92,12 +95,14 @@ iptable_mangle_hook(void *priv,
 }
 
 static struct nf_hook_ops *mangle_ops __read_mostly;
-
-static int __net_init iptable_mangle_net_init(struct net *net)
+static int __net_init iptable_mangle_table_init(struct net *net)
 {
 	struct ipt_replace *repl;
 	int ret;
 
+	if (net->ipv4.iptable_mangle)
+		return 0;
+
 	repl = ipt_alloc_initial_table(&packet_mangler);
 	if (repl == NULL)
 		return -ENOMEM;
@@ -109,11 +114,13 @@ static int __net_init iptable_mangle_net_init(struct net *net)
 
 static void __net_exit iptable_mangle_net_exit(struct net *net)
 {
+	if (!net->ipv4.iptable_mangle)
+		return;
 	ipt_unregister_table(net, net->ipv4.iptable_mangle, mangle_ops);
+	net->ipv4.iptable_mangle = NULL;
 }
 
 static struct pernet_operations iptable_mangle_net_ops = {
-	.init = iptable_mangle_net_init,
 	.exit = iptable_mangle_net_exit,
 };
 
@@ -121,15 +128,22 @@ static int __init iptable_mangle_init(void)
 {
 	int ret;
 
+	mangle_ops = xt_hook_ops_alloc(&packet_mangler, iptable_mangle_hook);
+	if (IS_ERR(mangle_ops)) {
+		ret = PTR_ERR(mangle_ops);
+		return ret;
+	}
+
 	ret = register_pernet_subsys(&iptable_mangle_net_ops);
-	if (ret < 0)
+	if (ret < 0) {
+		kfree(mangle_ops);
 		return ret;
+	}
 
-	/* Register hooks */
-	mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook);
-	if (IS_ERR(mangle_ops)) {
-		ret = PTR_ERR(mangle_ops);
+	ret = iptable_mangle_table_init(&init_net);
+	if (ret) {
 		unregister_pernet_subsys(&iptable_mangle_net_ops);
+		kfree(mangle_ops);
 	}
 
 	return ret;
@@ -137,8 +151,8 @@ static int __init iptable_mangle_init(void)
 
 static void __exit iptable_mangle_fini(void)
 {
-	xt_hook_unlink(&packet_mangler, mangle_ops);
 	unregister_pernet_subsys(&iptable_mangle_net_ops);
+	kfree(mangle_ops);
 }
 
 module_init(iptable_mangle_init);
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index e984f1d3017f..138a24bc76ad 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -18,6 +18,8 @@
 #include <net/netfilter/nf_nat_core.h>
 #include <net/netfilter/nf_nat_l3proto.h>
 
+static int __net_init iptable_nat_table_init(struct net *net);
+
 static const struct xt_table nf_nat_ipv4_table = {
 	.name		= "nat",
 	.valid_hooks	= (1 << NF_INET_PRE_ROUTING) |
@@ -26,6 +28,7 @@ static const struct xt_table nf_nat_ipv4_table = {
 			  (1 << NF_INET_LOCAL_IN),
 	.me		= THIS_MODULE,
 	.af		= NFPROTO_IPV4,
+	.table_init	= iptable_nat_table_init,
 };
 
 static unsigned int iptable_nat_do_chain(void *priv,
@@ -95,11 +98,14 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
 	},
 };
 
-static int __net_init iptable_nat_net_init(struct net *net)
+static int __net_init iptable_nat_table_init(struct net *net)
 {
 	struct ipt_replace *repl;
 	int ret;
 
+	if (net->ipv4.nat_table)
+		return 0;
+
 	repl = ipt_alloc_initial_table(&nf_nat_ipv4_table);
 	if (repl == NULL)
 		return -ENOMEM;
@@ -111,36 +117,31 @@ static int __net_init iptable_nat_net_init(struct net *net)
 
 static void __net_exit iptable_nat_net_exit(struct net *net)
 {
+	if (!net->ipv4.nat_table)
+		return;
 	ipt_unregister_table(net, net->ipv4.nat_table, nf_nat_ipv4_ops);
+	net->ipv4.nat_table = NULL;
 }
 
 static struct pernet_operations iptable_nat_net_ops = {
-	.init	= iptable_nat_net_init,
 	.exit	= iptable_nat_net_exit,
 };
 
 static int __init iptable_nat_init(void)
 {
-	int err;
-
-	err = register_pernet_subsys(&iptable_nat_net_ops);
-	if (err < 0)
-		goto err1;
+	int ret = register_pernet_subsys(&iptable_nat_net_ops);
 
-	err = nf_register_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
-	if (err < 0)
-		goto err2;
-	return 0;
+	if (ret)
+		return ret;
 
-err2:
-	unregister_pernet_subsys(&iptable_nat_net_ops);
-err1:
-	return err;
+	ret = iptable_nat_table_init(&init_net);
+	if (ret)
+		unregister_pernet_subsys(&iptable_nat_net_ops);
+	return ret;
 }
 
 static void __exit iptable_nat_exit(void)
 {
-	nf_unregister_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
 	unregister_pernet_subsys(&iptable_nat_net_ops);
 }
 
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 9d78780a9036..2642ecd2645c 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -10,12 +10,15 @@
 
 #define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT))
 
+static int __net_init iptable_raw_table_init(struct net *net);
+
 static const struct xt_table packet_raw = {
 	.name = "raw",
 	.valid_hooks =  RAW_VALID_HOOKS,
 	.me = THIS_MODULE,
 	.af = NFPROTO_IPV4,
 	.priority = NF_IP_PRI_RAW,
+	.table_init = iptable_raw_table_init,
 };
 
 /* The work comes in here from netfilter.c. */
@@ -34,11 +37,14 @@ iptable_raw_hook(void *priv, struct sk_buff *skb,
 
 static struct nf_hook_ops *rawtable_ops __read_mostly;
 
-static int __net_init iptable_raw_net_init(struct net *net)
+static int __net_init iptable_raw_table_init(struct net *net)
 {
 	struct ipt_replace *repl;
 	int ret;
 
+	if (net->ipv4.iptable_raw)
+		return 0;
+
 	repl = ipt_alloc_initial_table(&packet_raw);
 	if (repl == NULL)
 		return -ENOMEM;
@@ -50,11 +56,13 @@ static int __net_init iptable_raw_net_init(struct net *net)
 
 static void __net_exit iptable_raw_net_exit(struct net *net)
 {
+	if (!net->ipv4.iptable_raw)
+		return;
 	ipt_unregister_table(net, net->ipv4.iptable_raw, rawtable_ops);
+	net->ipv4.iptable_raw = NULL;
 }
 
 static struct pernet_operations iptable_raw_net_ops = {
-	.init = iptable_raw_net_init,
 	.exit = iptable_raw_net_exit,
 };
 
@@ -62,15 +70,20 @@ static int __init iptable_raw_init(void)
 {
 	int ret;
 
+	rawtable_ops = xt_hook_ops_alloc(&packet_raw, iptable_raw_hook);
+	if (IS_ERR(rawtable_ops))
+		return PTR_ERR(rawtable_ops);
+
 	ret = register_pernet_subsys(&iptable_raw_net_ops);
-	if (ret < 0)
+	if (ret < 0) {
+		kfree(rawtable_ops);
 		return ret;
+	}
 
-	/* Register hooks */
-	rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook);
-	if (IS_ERR(rawtable_ops)) {
-		ret = PTR_ERR(rawtable_ops);
+	ret = iptable_raw_table_init(&init_net);
+	if (ret) {
 		unregister_pernet_subsys(&iptable_raw_net_ops);
+		kfree(rawtable_ops);
 	}
 
 	return ret;
@@ -78,8 +91,8 @@ static int __init iptable_raw_init(void)
 
 static void __exit iptable_raw_fini(void)
 {
-	xt_hook_unlink(&packet_raw, rawtable_ops);
 	unregister_pernet_subsys(&iptable_raw_net_ops);
+	kfree(rawtable_ops);
 }
 
 module_init(iptable_raw_init);
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index 88bc52fb8f4a..ff226596e4b5 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -28,12 +28,15 @@ MODULE_DESCRIPTION("iptables security table, for MAC rules");
 				(1 << NF_INET_FORWARD) | \
 				(1 << NF_INET_LOCAL_OUT)
 
+static int __net_init iptable_security_table_init(struct net *net);
+
 static const struct xt_table security_table = {
 	.name		= "security",
 	.valid_hooks	= SECURITY_VALID_HOOKS,
 	.me		= THIS_MODULE,
 	.af		= NFPROTO_IPV4,
 	.priority	= NF_IP_PRI_SECURITY,
+	.table_init	= iptable_security_table_init,
 };
 
 static unsigned int
@@ -51,11 +54,14 @@ iptable_security_hook(void *priv, struct sk_buff *skb,
 
 static struct nf_hook_ops *sectbl_ops __read_mostly;
 
-static int __net_init iptable_security_net_init(struct net *net)
+static int __net_init iptable_security_table_init(struct net *net)
 {
 	struct ipt_replace *repl;
 	int ret;
 
+	if (net->ipv4.iptable_security)
+		return 0;
+
 	repl = ipt_alloc_initial_table(&security_table);
 	if (repl == NULL)
 		return -ENOMEM;
@@ -67,11 +73,14 @@ static int __net_init iptable_security_net_init(struct net *net)
 
 static void __net_exit iptable_security_net_exit(struct net *net)
 {
+	if (!net->ipv4.iptable_security)
+		return;
+
 	ipt_unregister_table(net, net->ipv4.iptable_security, sectbl_ops);
+	net->ipv4.iptable_security = NULL;
 }
 
 static struct pernet_operations iptable_security_net_ops = {
-	.init = iptable_security_net_init,
 	.exit = iptable_security_net_exit,
 };
 
@@ -79,27 +88,29 @@ static int __init iptable_security_init(void)
 {
 	int ret;
 
+	sectbl_ops = xt_hook_ops_alloc(&security_table, iptable_security_hook);
+	if (IS_ERR(sectbl_ops))
+		return PTR_ERR(sectbl_ops);
+
 	ret = register_pernet_subsys(&iptable_security_net_ops);
-	if (ret < 0)
+	if (ret < 0) {
+		kfree(sectbl_ops);
 		return ret;
-
-	sectbl_ops = xt_hook_link(&security_table, iptable_security_hook);
-	if (IS_ERR(sectbl_ops)) {
-		ret = PTR_ERR(sectbl_ops);
-		goto cleanup_table;
 	}
 
-	return ret;
+	ret = iptable_security_table_init(&init_net);
+	if (ret) {
+		unregister_pernet_subsys(&iptable_security_net_ops);
+		kfree(sectbl_ops);
+	}
 
-cleanup_table:
-	unregister_pernet_subsys(&iptable_security_net_ops);
 	return ret;
 }
 
 static void __exit iptable_security_fini(void)
 {
-	xt_hook_unlink(&security_table, sectbl_ops);
 	unregister_pernet_subsys(&iptable_security_net_ops);
+	kfree(sectbl_ops);
 }
 
 module_init(iptable_security_init);
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 052d7447b52e..84f9baf7aee8 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -2071,6 +2071,24 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 	return ret;
 }
 
+static void __ip6t_unregister_table(struct net *net, struct xt_table *table)
+{
+	struct xt_table_info *private;
+	void *loc_cpu_entry;
+	struct module *table_owner = table->me;
+	struct ip6t_entry *iter;
+
+	private = xt_unregister_table(table);
+
+	/* Decrease module usage counts and free resources */
+	loc_cpu_entry = private->entries;
+	xt_entry_foreach(iter, loc_cpu_entry, private->size)
+		cleanup_entry(iter, net);
+	if (private->number > private->initial_entries)
+		module_put(table_owner);
+	xt_free_table_info(private);
+}
+
 int ip6t_register_table(struct net *net, const struct xt_table *table,
 			const struct ip6t_replace *repl,
 			const struct nf_hook_ops *ops,
@@ -2099,7 +2117,15 @@ int ip6t_register_table(struct net *net, const struct xt_table *table,
 		goto out_free;
 	}
 
+	/* set res now, will see skbs right after nf_register_net_hooks */
 	WRITE_ONCE(*res, new_table);
+
+	ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
+	if (ret != 0) {
+		__ip6t_unregister_table(net, new_table);
+		*res = NULL;
+	}
+
 	return ret;
 
 out_free:
@@ -2110,20 +2136,8 @@ out_free:
 void ip6t_unregister_table(struct net *net, struct xt_table *table,
 			   const struct nf_hook_ops *ops)
 {
-	struct xt_table_info *private;
-	void *loc_cpu_entry;
-	struct module *table_owner = table->me;
-	struct ip6t_entry *iter;
-
-	private = xt_unregister_table(table);
-
-	/* Decrease module usage counts and free resources */
-	loc_cpu_entry = private->entries;
-	xt_entry_foreach(iter, loc_cpu_entry, private->size)
-		cleanup_entry(iter, net);
-	if (private->number > private->initial_entries)
-		module_put(table_owner);
-	xt_free_table_info(private);
+	nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
+	__ip6t_unregister_table(net, table);
 }
 
 /* Returns 1 if the type and code is matched by the range, 0 otherwise */
diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c
index d191d54cdf50..1343077dde93 100644
--- a/net/ipv6/netfilter/ip6table_filter.c
+++ b/net/ipv6/netfilter/ip6table_filter.c
@@ -22,12 +22,15 @@ MODULE_DESCRIPTION("ip6tables filter table");
 			    (1 << NF_INET_FORWARD) | \
 			    (1 << NF_INET_LOCAL_OUT))
 
+static int __net_init ip6table_filter_table_init(struct net *net);
+
 static const struct xt_table packet_filter = {
 	.name		= "filter",
 	.valid_hooks	= FILTER_VALID_HOOKS,
 	.me		= THIS_MODULE,
 	.af		= NFPROTO_IPV6,
 	.priority	= NF_IP6_PRI_FILTER,
+	.table_init	= ip6table_filter_table_init,
 };
 
 /* The work comes in here from netfilter.c. */
@@ -44,11 +47,14 @@ static struct nf_hook_ops *filter_ops __read_mostly;
 static bool forward = true;
 module_param(forward, bool, 0000);
 
-static int __net_init ip6table_filter_net_init(struct net *net)
+static int __net_init ip6table_filter_table_init(struct net *net)
 {
 	struct ip6t_replace *repl;
 	int err;
 
+	if (net->ipv6.ip6table_filter)
+		return 0;
+
 	repl = ip6t_alloc_initial_table(&packet_filter);
 	if (repl == NULL)
 		return -ENOMEM;
@@ -62,9 +68,20 @@ static int __net_init ip6table_filter_net_init(struct net *net)
 	return err;
 }
 
+static int __net_init ip6table_filter_net_init(struct net *net)
+{
+	if (net == &init_net || !forward)
+		return ip6table_filter_table_init(net);
+
+	return 0;
+}
+
 static void __net_exit ip6table_filter_net_exit(struct net *net)
 {
+	if (!net->ipv6.ip6table_filter)
+		return;
 	ip6t_unregister_table(net, net->ipv6.ip6table_filter, filter_ops);
+	net->ipv6.ip6table_filter = NULL;
 }
 
 static struct pernet_operations ip6table_filter_net_ops = {
@@ -76,28 +93,21 @@ static int __init ip6table_filter_init(void)
 {
 	int ret;
 
+	filter_ops = xt_hook_ops_alloc(&packet_filter, ip6table_filter_hook);
+	if (IS_ERR(filter_ops))
+		return PTR_ERR(filter_ops);
+
 	ret = register_pernet_subsys(&ip6table_filter_net_ops);
 	if (ret < 0)
-		return ret;
-
-	/* Register hooks */
-	filter_ops = xt_hook_link(&packet_filter, ip6table_filter_hook);
-	if (IS_ERR(filter_ops)) {
-		ret = PTR_ERR(filter_ops);
-		goto cleanup_table;
-	}
+		kfree(filter_ops);
 
 	return ret;
-
- cleanup_table:
-	unregister_pernet_subsys(&ip6table_filter_net_ops);
-	return ret;
 }
 
 static void __exit ip6table_filter_fini(void)
 {
-	xt_hook_unlink(&packet_filter, filter_ops);
 	unregister_pernet_subsys(&ip6table_filter_net_ops);
+	kfree(filter_ops);
 }
 
 module_init(ip6table_filter_init);
diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c
index fe43d08284bc..cb2b28883252 100644
--- a/net/ipv6/netfilter/ip6table_mangle.c
+++ b/net/ipv6/netfilter/ip6table_mangle.c
@@ -23,12 +23,15 @@ MODULE_DESCRIPTION("ip6tables mangle table");
 			    (1 << NF_INET_LOCAL_OUT) | \
 			    (1 << NF_INET_POST_ROUTING))
 
+static int __net_init ip6table_mangle_table_init(struct net *net);
+
 static const struct xt_table packet_mangler = {
 	.name		= "mangle",
 	.valid_hooks	= MANGLE_VALID_HOOKS,
 	.me		= THIS_MODULE,
 	.af		= NFPROTO_IPV6,
 	.priority	= NF_IP6_PRI_MANGLE,
+	.table_init	= ip6table_mangle_table_init,
 };
 
 static unsigned int
@@ -88,11 +91,14 @@ ip6table_mangle_hook(void *priv, struct sk_buff *skb,
 }
 
 static struct nf_hook_ops *mangle_ops __read_mostly;
-static int __net_init ip6table_mangle_net_init(struct net *net)
+static int __net_init ip6table_mangle_table_init(struct net *net)
 {
 	struct ip6t_replace *repl;
 	int ret;
 
+	if (net->ipv6.ip6table_mangle)
+		return 0;
+
 	repl = ip6t_alloc_initial_table(&packet_mangler);
 	if (repl == NULL)
 		return -ENOMEM;
@@ -104,11 +110,14 @@ static int __net_init ip6table_mangle_net_init(struct net *net)
 
 static void __net_exit ip6table_mangle_net_exit(struct net *net)
 {
+	if (!net->ipv6.ip6table_mangle)
+		return;
+
 	ip6t_unregister_table(net, net->ipv6.ip6table_mangle, mangle_ops);
+	net->ipv6.ip6table_mangle = NULL;
 }
 
 static struct pernet_operations ip6table_mangle_net_ops = {
-	.init = ip6table_mangle_net_init,
 	.exit = ip6table_mangle_net_exit,
 };
 
@@ -116,28 +125,28 @@ static int __init ip6table_mangle_init(void)
 {
 	int ret;
 
+	mangle_ops = xt_hook_ops_alloc(&packet_mangler, ip6table_mangle_hook);
+	if (IS_ERR(mangle_ops))
+		return PTR_ERR(mangle_ops);
+
 	ret = register_pernet_subsys(&ip6table_mangle_net_ops);
-	if (ret < 0)
+	if (ret < 0) {
+		kfree(mangle_ops);
 		return ret;
-
-	/* Register hooks */
-	mangle_ops = xt_hook_link(&packet_mangler, ip6table_mangle_hook);
-	if (IS_ERR(mangle_ops)) {
-		ret = PTR_ERR(mangle_ops);
-		goto cleanup_table;
 	}
 
-	return ret;
-
- cleanup_table:
-	unregister_pernet_subsys(&ip6table_mangle_net_ops);
+	ret = ip6table_mangle_table_init(&init_net);
+	if (ret) {
+		unregister_pernet_subsys(&ip6table_mangle_net_ops);
+		kfree(mangle_ops);
+	}
 	return ret;
 }
 
 static void __exit ip6table_mangle_fini(void)
 {
-	xt_hook_unlink(&packet_mangler, mangle_ops);
 	unregister_pernet_subsys(&ip6table_mangle_net_ops);
+	kfree(mangle_ops);
 }
 
 module_init(ip6table_mangle_init);
diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c
index 7f9740e8ef47..7d2bd940291f 100644
--- a/net/ipv6/netfilter/ip6table_nat.c
+++ b/net/ipv6/netfilter/ip6table_nat.c
@@ -20,6 +20,8 @@
 #include <net/netfilter/nf_nat_core.h>
 #include <net/netfilter/nf_nat_l3proto.h>
 
+static int __net_init ip6table_nat_table_init(struct net *net);
+
 static const struct xt_table nf_nat_ipv6_table = {
 	.name		= "nat",
 	.valid_hooks	= (1 << NF_INET_PRE_ROUTING) |
@@ -28,6 +30,7 @@ static const struct xt_table nf_nat_ipv6_table = {
 			  (1 << NF_INET_LOCAL_IN),
 	.me		= THIS_MODULE,
 	.af		= NFPROTO_IPV6,
+	.table_init	= ip6table_nat_table_init,
 };
 
 static unsigned int ip6table_nat_do_chain(void *priv,
@@ -97,11 +100,14 @@ static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = {
 	},
 };
 
-static int __net_init ip6table_nat_net_init(struct net *net)
+static int __net_init ip6table_nat_table_init(struct net *net)
 {
 	struct ip6t_replace *repl;
 	int ret;
 
+	if (net->ipv6.ip6table_nat)
+		return 0;
+
 	repl = ip6t_alloc_initial_table(&nf_nat_ipv6_table);
 	if (repl == NULL)
 		return -ENOMEM;
@@ -113,36 +119,31 @@ static int __net_init ip6table_nat_net_init(struct net *net)
 
 static void __net_exit ip6table_nat_net_exit(struct net *net)
 {
+	if (!net->ipv6.ip6table_nat)
+		return;
 	ip6t_unregister_table(net, net->ipv6.ip6table_nat, nf_nat_ipv6_ops);
+	net->ipv6.ip6table_nat = NULL;
 }
 
 static struct pernet_operations ip6table_nat_net_ops = {
-	.init	= ip6table_nat_net_init,
 	.exit	= ip6table_nat_net_exit,
 };
 
 static int __init ip6table_nat_init(void)
 {
-	int err;
-
-	err = register_pernet_subsys(&ip6table_nat_net_ops);
-	if (err < 0)
-		goto err1;
+	int ret = register_pernet_subsys(&ip6table_nat_net_ops);
 
-	err = nf_register_hooks(nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops));
-	if (err < 0)
-		goto err2;
-	return 0;
+	if (ret)
+		return ret;
 
-err2:
-	unregister_pernet_subsys(&ip6table_nat_net_ops);
-err1:
-	return err;
+	ret = ip6table_nat_table_init(&init_net);
+	if (ret)
+		unregister_pernet_subsys(&ip6table_nat_net_ops);
+	return ret;
 }
 
 static void __exit ip6table_nat_exit(void)
 {
-	nf_unregister_hooks(nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops));
 	unregister_pernet_subsys(&ip6table_nat_net_ops);
 }
 
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index 5fac433da069..d4bc56443dc1 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -9,12 +9,15 @@
 
 #define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT))
 
+static int __net_init ip6table_raw_table_init(struct net *net);
+
 static const struct xt_table packet_raw = {
 	.name = "raw",
 	.valid_hooks = RAW_VALID_HOOKS,
 	.me = THIS_MODULE,
 	.af = NFPROTO_IPV6,
 	.priority = NF_IP6_PRI_RAW,
+	.table_init = ip6table_raw_table_init,
 };
 
 /* The work comes in here from netfilter.c. */
@@ -27,11 +30,14 @@ ip6table_raw_hook(void *priv, struct sk_buff *skb,
 
 static struct nf_hook_ops *rawtable_ops __read_mostly;
 
-static int __net_init ip6table_raw_net_init(struct net *net)
+static int __net_init ip6table_raw_table_init(struct net *net)
 {
 	struct ip6t_replace *repl;
 	int ret;
 
+	if (net->ipv6.ip6table_raw)
+		return 0;
+
 	repl = ip6t_alloc_initial_table(&packet_raw);
 	if (repl == NULL)
 		return -ENOMEM;
@@ -43,11 +49,13 @@ static int __net_init ip6table_raw_net_init(struct net *net)
 
 static void __net_exit ip6table_raw_net_exit(struct net *net)
 {
+	if (!net->ipv6.ip6table_raw)
+		return;
 	ip6t_unregister_table(net, net->ipv6.ip6table_raw, rawtable_ops);
+	net->ipv6.ip6table_raw = NULL;
 }
 
 static struct pernet_operations ip6table_raw_net_ops = {
-	.init = ip6table_raw_net_init,
 	.exit = ip6table_raw_net_exit,
 };
 
@@ -55,28 +63,29 @@ static int __init ip6table_raw_init(void)
 {
 	int ret;
 
+	/* Register hooks */
+	rawtable_ops = xt_hook_ops_alloc(&packet_raw, ip6table_raw_hook);
+	if (IS_ERR(rawtable_ops))
+		return PTR_ERR(rawtable_ops);
+
 	ret = register_pernet_subsys(&ip6table_raw_net_ops);
-	if (ret < 0)
+	if (ret < 0) {
+		kfree(rawtable_ops);
 		return ret;
-
-	/* Register hooks */
-	rawtable_ops = xt_hook_link(&packet_raw, ip6table_raw_hook);
-	if (IS_ERR(rawtable_ops)) {
-		ret = PTR_ERR(rawtable_ops);
-		goto cleanup_table;
 	}
 
-	return ret;
-
- cleanup_table:
-	unregister_pernet_subsys(&ip6table_raw_net_ops);
+	ret = ip6table_raw_table_init(&init_net);
+	if (ret) {
+		unregister_pernet_subsys(&ip6table_raw_net_ops);
+		kfree(rawtable_ops);
+	}
 	return ret;
 }
 
 static void __exit ip6table_raw_fini(void)
 {
-	xt_hook_unlink(&packet_raw, rawtable_ops);
 	unregister_pernet_subsys(&ip6table_raw_net_ops);
+	kfree(rawtable_ops);
 }
 
 module_init(ip6table_raw_init);
diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c
index cf587453e322..cf26ccb04056 100644
--- a/net/ipv6/netfilter/ip6table_security.c
+++ b/net/ipv6/netfilter/ip6table_security.c
@@ -27,12 +27,15 @@ MODULE_DESCRIPTION("ip6tables security table, for MAC rules");
 				(1 << NF_INET_FORWARD) | \
 				(1 << NF_INET_LOCAL_OUT)
 
+static int __net_init ip6table_security_table_init(struct net *net);
+
 static const struct xt_table security_table = {
 	.name		= "security",
 	.valid_hooks	= SECURITY_VALID_HOOKS,
 	.me		= THIS_MODULE,
 	.af		= NFPROTO_IPV6,
 	.priority	= NF_IP6_PRI_SECURITY,
+	.table_init     = ip6table_security_table_init,
 };
 
 static unsigned int
@@ -44,11 +47,14 @@ ip6table_security_hook(void *priv, struct sk_buff *skb,
 
 static struct nf_hook_ops *sectbl_ops __read_mostly;
 
-static int __net_init ip6table_security_net_init(struct net *net)
+static int __net_init ip6table_security_table_init(struct net *net)
 {
 	struct ip6t_replace *repl;
 	int ret;
 
+	if (net->ipv6.ip6table_security)
+		return 0;
+
 	repl = ip6t_alloc_initial_table(&security_table);
 	if (repl == NULL)
 		return -ENOMEM;
@@ -60,11 +66,13 @@ static int __net_init ip6table_security_net_init(struct net *net)
 
 static void __net_exit ip6table_security_net_exit(struct net *net)
 {
+	if (!net->ipv6.ip6table_security)
+		return;
 	ip6t_unregister_table(net, net->ipv6.ip6table_security, sectbl_ops);
+	net->ipv6.ip6table_security = NULL;
 }
 
 static struct pernet_operations ip6table_security_net_ops = {
-	.init = ip6table_security_net_init,
 	.exit = ip6table_security_net_exit,
 };
 
@@ -72,27 +80,28 @@ static int __init ip6table_security_init(void)
 {
 	int ret;
 
+	sectbl_ops = xt_hook_ops_alloc(&security_table, ip6table_security_hook);
+	if (IS_ERR(sectbl_ops))
+		return PTR_ERR(sectbl_ops);
+
 	ret = register_pernet_subsys(&ip6table_security_net_ops);
-	if (ret < 0)
+	if (ret < 0) {
+		kfree(sectbl_ops);
 		return ret;
-
-	sectbl_ops = xt_hook_link(&security_table, ip6table_security_hook);
-	if (IS_ERR(sectbl_ops)) {
-		ret = PTR_ERR(sectbl_ops);
-		goto cleanup_table;
 	}
 
-	return ret;
-
-cleanup_table:
-	unregister_pernet_subsys(&ip6table_security_net_ops);
+	ret = ip6table_security_table_init(&init_net);
+	if (ret) {
+		unregister_pernet_subsys(&ip6table_security_net_ops);
+		kfree(sectbl_ops);
+	}
 	return ret;
 }
 
 static void __exit ip6table_security_fini(void)
 {
-	xt_hook_unlink(&security_table, sectbl_ops);
 	unregister_pernet_subsys(&ip6table_security_net_ops);
+	kfree(sectbl_ops);
 }
 
 module_init(ip6table_security_init);
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index c8a0b7da5ff4..d0cd2b9bf844 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -694,12 +694,45 @@ EXPORT_SYMBOL(xt_free_table_info);
 struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
 				    const char *name)
 {
-	struct xt_table *t;
+	struct xt_table *t, *found = NULL;
 
 	mutex_lock(&xt[af].mutex);
 	list_for_each_entry(t, &net->xt.tables[af], list)
 		if (strcmp(t->name, name) == 0 && try_module_get(t->me))
 			return t;
+
+	if (net == &init_net)
+		goto out;
+
+	/* Table doesn't exist in this netns, re-try init */
+	list_for_each_entry(t, &init_net.xt.tables[af], list) {
+		if (strcmp(t->name, name))
+			continue;
+		if (!try_module_get(t->me))
+			return NULL;
+
+		mutex_unlock(&xt[af].mutex);
+		if (t->table_init(net) != 0) {
+			module_put(t->me);
+			return NULL;
+		}
+
+		found = t;
+
+		mutex_lock(&xt[af].mutex);
+		break;
+	}
+
+	if (!found)
+		goto out;
+
+	/* and once again: */
+	list_for_each_entry(t, &net->xt.tables[af], list)
+		if (strcmp(t->name, name) == 0)
+			return t;
+
+	module_put(found->me);
+ out:
 	mutex_unlock(&xt[af].mutex);
 	return NULL;
 }
@@ -1170,20 +1203,20 @@ static const struct file_operations xt_target_ops = {
 #endif /* CONFIG_PROC_FS */
 
 /**
- * xt_hook_link - set up hooks for a new table
+ * xt_hook_ops_alloc - set up hooks for a new table
  * @table:	table with metadata needed to set up hooks
  * @fn:		Hook function
  *
- * This function will take care of creating and registering the necessary
- * Netfilter hooks for XT tables.
+ * This function will create the nf_hook_ops that the x_table needs
+ * to hand to xt_hook_link_net().
  */
-struct nf_hook_ops *xt_hook_link(const struct xt_table *table, nf_hookfn *fn)
+struct nf_hook_ops *
+xt_hook_ops_alloc(const struct xt_table *table, nf_hookfn *fn)
 {
 	unsigned int hook_mask = table->valid_hooks;
 	uint8_t i, num_hooks = hweight32(hook_mask);
 	uint8_t hooknum;
 	struct nf_hook_ops *ops;
-	int ret;
 
 	ops = kmalloc(sizeof(*ops) * num_hooks, GFP_KERNEL);
 	if (ops == NULL)
@@ -1200,27 +1233,9 @@ struct nf_hook_ops *xt_hook_link(const struct xt_table *table, nf_hookfn *fn)
 		++i;
 	}
 
-	ret = nf_register_hooks(ops, num_hooks);
-	if (ret < 0) {
-		kfree(ops);
-		return ERR_PTR(ret);
-	}
-
 	return ops;
 }
-EXPORT_SYMBOL_GPL(xt_hook_link);
-
-/**
- * xt_hook_unlink - remove hooks for a table
- * @ops:	nf_hook_ops array as returned by nf_hook_link
- * @hook_mask:	the very same mask that was passed to nf_hook_link
- */
-void xt_hook_unlink(const struct xt_table *table, struct nf_hook_ops *ops)
-{
-	nf_unregister_hooks(ops, hweight32(table->valid_hooks));
-	kfree(ops);
-}
-EXPORT_SYMBOL_GPL(xt_hook_unlink);
+EXPORT_SYMBOL_GPL(xt_hook_ops_alloc);
 
 int xt_proto_init(struct net *net, u_int8_t af)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 5f6c253ebe93b02dece01c6f58447f16b29f6dd3 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 25 Feb 2016 10:08:37 +0100
Subject: netfilter: bridge: register hooks only when bridge interface is added

This moves bridge hooks to a register-when-needed scheme.

We use a device notifier to register the 'call-iptables' netfilter hooks
only once a bridge gets added.

This means that if the initial namespace uses a bridge, newly created
network namespaces no longer get the PRE_ROUTING ipt_sabotage hook.

It will registered in that network namespace once a bridge is created
within that namespace.

A few modules still use global hooks:

- conntrack
- bridge PF_BRIDGE hooks
- IPVS
- CLUSTER match (deprecated)
- SYNPROXY

As long as these modules are not loaded/used, a new network namespace has
empty hook list and NF_HOOK() will boil down to single list_empty test even
if initial namespace does stateless packet filtering.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/br_netfilter_hooks.c | 68 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 65 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 7ddbe7ec81d6..44114a94c576 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -37,6 +37,7 @@
 #include <net/addrconf.h>
 #include <net/route.h>
 #include <net/netfilter/br_netfilter.h>
+#include <net/netns/generic.h>
 
 #include <asm/uaccess.h>
 #include "br_private.h"
@@ -44,6 +45,12 @@
 #include <linux/sysctl.h>
 #endif
 
+static int brnf_net_id __read_mostly;
+
+struct brnf_net {
+	bool enabled;
+};
+
 #ifdef CONFIG_SYSCTL
 static struct ctl_table_header *brnf_sysctl_header;
 static int brnf_call_iptables __read_mostly = 1;
@@ -938,6 +945,53 @@ static struct nf_hook_ops br_nf_ops[] __read_mostly = {
 	},
 };
 
+static int brnf_device_event(struct notifier_block *unused, unsigned long event,
+			     void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct brnf_net *brnet;
+	struct net *net;
+	int ret;
+
+	if (event != NETDEV_REGISTER || !(dev->priv_flags & IFF_EBRIDGE))
+		return NOTIFY_DONE;
+
+	ASSERT_RTNL();
+
+	net = dev_net(dev);
+	brnet = net_generic(net, brnf_net_id);
+	if (brnet->enabled)
+		return NOTIFY_OK;
+
+	ret = nf_register_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops));
+	if (ret)
+		return NOTIFY_BAD;
+
+	brnet->enabled = true;
+	return NOTIFY_OK;
+}
+
+static void __net_exit brnf_exit_net(struct net *net)
+{
+	struct brnf_net *brnet = net_generic(net, brnf_net_id);
+
+	if (!brnet->enabled)
+		return;
+
+	nf_unregister_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops));
+	brnet->enabled = false;
+}
+
+static struct pernet_operations brnf_net_ops __read_mostly = {
+	.exit = brnf_exit_net,
+	.id   = &brnf_net_id,
+	.size = sizeof(struct brnf_net),
+};
+
+static struct notifier_block brnf_notifier __read_mostly = {
+	.notifier_call = brnf_device_event,
+};
+
 #ifdef CONFIG_SYSCTL
 static
 int brnf_sysctl_call_tables(struct ctl_table *ctl, int write,
@@ -1003,16 +1057,23 @@ static int __init br_netfilter_init(void)
 {
 	int ret;
 
-	ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
+	ret = register_pernet_subsys(&brnf_net_ops);
 	if (ret < 0)
 		return ret;
 
+	ret = register_netdevice_notifier(&brnf_notifier);
+	if (ret < 0) {
+		unregister_pernet_subsys(&brnf_net_ops);
+		return ret;
+	}
+
 #ifdef CONFIG_SYSCTL
 	brnf_sysctl_header = register_net_sysctl(&init_net, "net/bridge", brnf_table);
 	if (brnf_sysctl_header == NULL) {
 		printk(KERN_WARNING
 		       "br_netfilter: can't register to sysctl.\n");
-		nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
+		unregister_netdevice_notifier(&brnf_notifier);
+		unregister_pernet_subsys(&brnf_net_ops);
 		return -ENOMEM;
 	}
 #endif
@@ -1024,7 +1085,8 @@ static int __init br_netfilter_init(void)
 static void __exit br_netfilter_fini(void)
 {
 	RCU_INIT_POINTER(nf_br_ops, NULL);
-	nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
+	unregister_netdevice_notifier(&brnf_notifier);
+	unregister_pernet_subsys(&brnf_net_ops);
 #ifdef CONFIG_SYSCTL
 	unregister_net_sysctl_table(brnf_sysctl_header);
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 8a6bf5da1aefdafd60b73d9122c7af9fd2d7bb9c Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 1 Mar 2016 19:55:14 +0100
Subject: netfilter: nft_masq: support port range

Complete masquerading support by allowing port range selection.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nft_masq.h         |  4 ++-
 include/uapi/linux/netfilter/nf_tables.h |  4 +++
 net/ipv4/netfilter/nft_masq_ipv4.c       |  7 ++++-
 net/ipv6/netfilter/nft_masq_ipv6.c       |  7 ++++-
 net/netfilter/nft_masq.c                 | 51 +++++++++++++++++++++++++-------
 5 files changed, 59 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/include/net/netfilter/nft_masq.h b/include/net/netfilter/nft_masq.h
index e2a518b60e19..a3f3c11b2526 100644
--- a/include/net/netfilter/nft_masq.h
+++ b/include/net/netfilter/nft_masq.h
@@ -2,7 +2,9 @@
 #define _NFT_MASQ_H_
 
 struct nft_masq {
-	u32	flags;
+	u32			flags;
+	enum nft_registers      sreg_proto_min:8;
+	enum nft_registers      sreg_proto_max:8;
 };
 
 extern const struct nla_policy nft_masq_policy[];
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index b19be0a098c0..eeffde196f80 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -951,10 +951,14 @@ enum nft_nat_attributes {
  * enum nft_masq_attributes - nf_tables masquerade expression attributes
  *
  * @NFTA_MASQ_FLAGS: NAT flags (see NF_NAT_RANGE_* in linux/netfilter/nf_nat.h) (NLA_U32)
+ * @NFTA_MASQ_REG_PROTO_MIN: source register of proto range start (NLA_U32: nft_registers)
+ * @NFTA_MASQ_REG_PROTO_MAX: source register of proto range end (NLA_U32: nft_registers)
  */
 enum nft_masq_attributes {
 	NFTA_MASQ_UNSPEC,
 	NFTA_MASQ_FLAGS,
+	NFTA_MASQ_REG_PROTO_MIN,
+	NFTA_MASQ_REG_PROTO_MAX,
 	__NFTA_MASQ_MAX
 };
 #define NFTA_MASQ_MAX		(__NFTA_MASQ_MAX - 1)
diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c
index b72ffc58e255..51ced81b616c 100644
--- a/net/ipv4/netfilter/nft_masq_ipv4.c
+++ b/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -25,7 +25,12 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr,
 
 	memset(&range, 0, sizeof(range));
 	range.flags = priv->flags;
-
+	if (priv->sreg_proto_min) {
+		range.min_proto.all =
+			*(__be16 *)&regs->data[priv->sreg_proto_min];
+		range.max_proto.all =
+			*(__be16 *)&regs->data[priv->sreg_proto_max];
+	}
 	regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->hook,
 						    &range, pkt->out);
 }
diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c
index cd1ac1637a05..9597ffb74077 100644
--- a/net/ipv6/netfilter/nft_masq_ipv6.c
+++ b/net/ipv6/netfilter/nft_masq_ipv6.c
@@ -26,7 +26,12 @@ static void nft_masq_ipv6_eval(const struct nft_expr *expr,
 
 	memset(&range, 0, sizeof(range));
 	range.flags = priv->flags;
-
+	if (priv->sreg_proto_min) {
+		range.min_proto.all =
+			*(__be16 *)&regs->data[priv->sreg_proto_min];
+		range.max_proto.all =
+			*(__be16 *)&regs->data[priv->sreg_proto_max];
+	}
 	regs->verdict.code = nf_nat_masquerade_ipv6(pkt->skb, &range, pkt->out);
 }
 
diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c
index 9aea747b43ea..81b5ad6165ac 100644
--- a/net/netfilter/nft_masq.c
+++ b/net/netfilter/nft_masq.c
@@ -17,7 +17,9 @@
 #include <net/netfilter/nft_masq.h>
 
 const struct nla_policy nft_masq_policy[NFTA_MASQ_MAX + 1] = {
-	[NFTA_MASQ_FLAGS]	= { .type = NLA_U32 },
+	[NFTA_MASQ_FLAGS]		= { .type = NLA_U32 },
+	[NFTA_MASQ_REG_PROTO_MIN]	= { .type = NLA_U32 },
+	[NFTA_MASQ_REG_PROTO_MAX]	= { .type = NLA_U32 },
 };
 EXPORT_SYMBOL_GPL(nft_masq_policy);
 
@@ -40,6 +42,7 @@ int nft_masq_init(const struct nft_ctx *ctx,
 		  const struct nft_expr *expr,
 		  const struct nlattr * const tb[])
 {
+	u32 plen = FIELD_SIZEOF(struct nf_nat_range, min_addr.all);
 	struct nft_masq *priv = nft_expr_priv(expr);
 	int err;
 
@@ -47,12 +50,32 @@ int nft_masq_init(const struct nft_ctx *ctx,
 	if (err)
 		return err;
 
-	if (tb[NFTA_MASQ_FLAGS] == NULL)
-		return 0;
-
-	priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS]));
-	if (priv->flags & ~NF_NAT_RANGE_MASK)
-		return -EINVAL;
+	if (tb[NFTA_MASQ_FLAGS]) {
+		priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS]));
+		if (priv->flags & ~NF_NAT_RANGE_MASK)
+			return -EINVAL;
+	}
+
+	if (tb[NFTA_MASQ_REG_PROTO_MIN]) {
+		priv->sreg_proto_min =
+			nft_parse_register(tb[NFTA_MASQ_REG_PROTO_MIN]);
+
+		err = nft_validate_register_load(priv->sreg_proto_min, plen);
+		if (err < 0)
+			return err;
+
+		if (tb[NFTA_MASQ_REG_PROTO_MAX]) {
+			priv->sreg_proto_max =
+				nft_parse_register(tb[NFTA_MASQ_REG_PROTO_MAX]);
+
+			err = nft_validate_register_load(priv->sreg_proto_max,
+							 plen);
+			if (err < 0)
+				return err;
+		} else {
+			priv->sreg_proto_max = priv->sreg_proto_min;
+		}
+	}
 
 	return 0;
 }
@@ -62,12 +85,18 @@ int nft_masq_dump(struct sk_buff *skb, const struct nft_expr *expr)
 {
 	const struct nft_masq *priv = nft_expr_priv(expr);
 
-	if (priv->flags == 0)
-		return 0;
-
-	if (nla_put_be32(skb, NFTA_MASQ_FLAGS, htonl(priv->flags)))
+	if (priv->flags != 0 &&
+	    nla_put_be32(skb, NFTA_MASQ_FLAGS, htonl(priv->flags)))
 		goto nla_put_failure;
 
+	if (priv->sreg_proto_min) {
+		if (nft_dump_register(skb, NFTA_MASQ_REG_PROTO_MIN,
+				      priv->sreg_proto_min) ||
+		    nft_dump_register(skb, NFTA_MASQ_REG_PROTO_MAX,
+				      priv->sreg_proto_max))
+			goto nla_put_failure;
+	}
+
 	return 0;
 
 nla_put_failure:
-- 
cgit v1.2.3-59-g8ed1b


From dcdede0406d30e14a6ed727dce69dfac0518371d Mon Sep 17 00:00:00 2001
From: "santosh.shilimkar@oracle.com" <santosh.shilimkar@oracle.com>
Date: Tue, 1 Mar 2016 15:20:42 -0800
Subject: RDS: Drop stale iWARP RDMA transport

RDS iWarp support code has become stale and non testable. As
indicated earlier, am dropping the support for it.

If new iWarp user(s) shows up in future, we can adapat the RDS IB
transprt for the special RDMA READ sink case. iWarp needs an MR
for the RDMA READ sink.

Signed-off-by: Santosh Shilimkar <ssantosh@kernel.org>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/rds.txt |   4 +-
 net/rds/Kconfig                  |   7 +-
 net/rds/Makefile                 |   4 +-
 net/rds/iw.c                     | 312 -------------
 net/rds/iw.h                     | 398 ----------------
 net/rds/iw_cm.c                  | 769 ------------------------------
 net/rds/iw_rdma.c                | 837 ---------------------------------
 net/rds/iw_recv.c                | 904 ------------------------------------
 net/rds/iw_ring.c                | 169 -------
 net/rds/iw_send.c                | 981 ---------------------------------------
 net/rds/iw_stats.c               |  95 ----
 net/rds/iw_sysctl.c              | 123 -----
 net/rds/rdma_transport.c         |  13 +-
 net/rds/rdma_transport.h         |   5 -
 14 files changed, 7 insertions(+), 4614 deletions(-)
 delete mode 100644 net/rds/iw.c
 delete mode 100644 net/rds/iw.h
 delete mode 100644 net/rds/iw_cm.c
 delete mode 100644 net/rds/iw_rdma.c
 delete mode 100644 net/rds/iw_recv.c
 delete mode 100644 net/rds/iw_ring.c
 delete mode 100644 net/rds/iw_send.c
 delete mode 100644 net/rds/iw_stats.c
 delete mode 100644 net/rds/iw_sysctl.c

(limited to 'net')

diff --git a/Documentation/networking/rds.txt b/Documentation/networking/rds.txt
index e1a3d59bbe0f..9d219d856d46 100644
--- a/Documentation/networking/rds.txt
+++ b/Documentation/networking/rds.txt
@@ -19,9 +19,7 @@ to N*N if you use a connection-oriented socket transport like TCP.
 
 RDS is not Infiniband-specific; it was designed to support different
 transports.  The current implementation used to support RDS over TCP as well
-as IB. Work is in progress to support RDS over iWARP, and using DCE to
-guarantee no dropped packets on Ethernet, it may be possible to use RDS over
-UDP in the future.
+as IB.
 
 The high-level semantics of RDS from the application's point of view are
 
diff --git a/net/rds/Kconfig b/net/rds/Kconfig
index f2c670ba7b9b..bffde4b46c5d 100644
--- a/net/rds/Kconfig
+++ b/net/rds/Kconfig
@@ -4,14 +4,13 @@ config RDS
 	depends on INET
 	---help---
 	  The RDS (Reliable Datagram Sockets) protocol provides reliable,
-	  sequenced delivery of datagrams over Infiniband, iWARP,
-	  or TCP.
+	  sequenced delivery of datagrams over Infiniband or TCP.
 
 config RDS_RDMA
-	tristate "RDS over Infiniband and iWARP"
+	tristate "RDS over Infiniband"
 	depends on RDS && INFINIBAND && INFINIBAND_ADDR_TRANS
 	---help---
-	  Allow RDS to use Infiniband and iWARP as a transport.
+	  Allow RDS to use Infiniband as a transport.
 	  This transport supports RDMA operations.
 
 config RDS_TCP
diff --git a/net/rds/Makefile b/net/rds/Makefile
index 56d3f6023ced..19e54854ba2d 100644
--- a/net/rds/Makefile
+++ b/net/rds/Makefile
@@ -6,9 +6,7 @@ rds-y :=	af_rds.o bind.o cong.o connection.o info.o message.o   \
 obj-$(CONFIG_RDS_RDMA) += rds_rdma.o
 rds_rdma-y :=	rdma_transport.o \
 			ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \
-			ib_sysctl.o ib_rdma.o \
-			iw.o iw_cm.o iw_recv.o iw_ring.o iw_send.o iw_stats.o \
-			iw_sysctl.o iw_rdma.o
+			ib_sysctl.o ib_rdma.o
 
 
 obj-$(CONFIG_RDS_TCP) += rds_tcp.o
diff --git a/net/rds/iw.c b/net/rds/iw.c
deleted file mode 100644
index f4a9fff829e0..000000000000
--- a/net/rds/iw.c
+++ /dev/null
@@ -1,312 +0,0 @@
-/*
- * Copyright (c) 2006 Oracle.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-#include <linux/in.h>
-#include <linux/if.h>
-#include <linux/netdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/if_arp.h>
-#include <linux/delay.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-
-#include "rds.h"
-#include "iw.h"
-
-unsigned int fastreg_pool_size = RDS_FASTREG_POOL_SIZE;
-unsigned int fastreg_message_size = RDS_FASTREG_SIZE + 1; /* +1 allows for unaligned MRs */
-
-module_param(fastreg_pool_size, int, 0444);
-MODULE_PARM_DESC(fastreg_pool_size, " Max number of fastreg MRs per device");
-module_param(fastreg_message_size, int, 0444);
-MODULE_PARM_DESC(fastreg_message_size, " Max size of a RDMA transfer (fastreg MRs)");
-
-struct list_head rds_iw_devices;
-
-/* NOTE: if also grabbing iwdev lock, grab this first */
-DEFINE_SPINLOCK(iw_nodev_conns_lock);
-LIST_HEAD(iw_nodev_conns);
-
-static void rds_iw_add_one(struct ib_device *device)
-{
-	struct rds_iw_device *rds_iwdev;
-
-	/* Only handle iwarp devices */
-	if (device->node_type != RDMA_NODE_RNIC)
-		return;
-
-	rds_iwdev = kmalloc(sizeof *rds_iwdev, GFP_KERNEL);
-	if (!rds_iwdev)
-		return;
-
-	spin_lock_init(&rds_iwdev->spinlock);
-
-	rds_iwdev->dma_local_lkey = !!(device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY);
-	rds_iwdev->max_wrs = device->attrs.max_qp_wr;
-	rds_iwdev->max_sge = min(device->attrs.max_sge, RDS_IW_MAX_SGE);
-
-	rds_iwdev->dev = device;
-	rds_iwdev->pd = ib_alloc_pd(device);
-	if (IS_ERR(rds_iwdev->pd))
-		goto free_dev;
-
-	if (!rds_iwdev->dma_local_lkey) {
-		rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd,
-					IB_ACCESS_REMOTE_READ |
-					IB_ACCESS_REMOTE_WRITE |
-					IB_ACCESS_LOCAL_WRITE);
-		if (IS_ERR(rds_iwdev->mr))
-			goto err_pd;
-	} else
-		rds_iwdev->mr = NULL;
-
-	rds_iwdev->mr_pool = rds_iw_create_mr_pool(rds_iwdev);
-	if (IS_ERR(rds_iwdev->mr_pool)) {
-		rds_iwdev->mr_pool = NULL;
-		goto err_mr;
-	}
-
-	INIT_LIST_HEAD(&rds_iwdev->cm_id_list);
-	INIT_LIST_HEAD(&rds_iwdev->conn_list);
-	list_add_tail(&rds_iwdev->list, &rds_iw_devices);
-
-	ib_set_client_data(device, &rds_iw_client, rds_iwdev);
-	return;
-
-err_mr:
-	if (rds_iwdev->mr)
-		ib_dereg_mr(rds_iwdev->mr);
-err_pd:
-	ib_dealloc_pd(rds_iwdev->pd);
-free_dev:
-	kfree(rds_iwdev);
-}
-
-static void rds_iw_remove_one(struct ib_device *device, void *client_data)
-{
-	struct rds_iw_device *rds_iwdev = client_data;
-	struct rds_iw_cm_id *i_cm_id, *next;
-
-	if (!rds_iwdev)
-		return;
-
-	spin_lock_irq(&rds_iwdev->spinlock);
-	list_for_each_entry_safe(i_cm_id, next, &rds_iwdev->cm_id_list, list) {
-		list_del(&i_cm_id->list);
-		kfree(i_cm_id);
-	}
-	spin_unlock_irq(&rds_iwdev->spinlock);
-
-	rds_iw_destroy_conns(rds_iwdev);
-
-	if (rds_iwdev->mr_pool)
-		rds_iw_destroy_mr_pool(rds_iwdev->mr_pool);
-
-	if (rds_iwdev->mr)
-		ib_dereg_mr(rds_iwdev->mr);
-
-	ib_dealloc_pd(rds_iwdev->pd);
-
-	list_del(&rds_iwdev->list);
-	kfree(rds_iwdev);
-}
-
-struct ib_client rds_iw_client = {
-	.name   = "rds_iw",
-	.add    = rds_iw_add_one,
-	.remove = rds_iw_remove_one
-};
-
-static int rds_iw_conn_info_visitor(struct rds_connection *conn,
-				    void *buffer)
-{
-	struct rds_info_rdma_connection *iinfo = buffer;
-	struct rds_iw_connection *ic;
-
-	/* We will only ever look at IB transports */
-	if (conn->c_trans != &rds_iw_transport)
-		return 0;
-
-	iinfo->src_addr = conn->c_laddr;
-	iinfo->dst_addr = conn->c_faddr;
-
-	memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
-	memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
-	if (rds_conn_state(conn) == RDS_CONN_UP) {
-		struct rds_iw_device *rds_iwdev;
-		struct rdma_dev_addr *dev_addr;
-
-		ic = conn->c_transport_data;
-		dev_addr = &ic->i_cm_id->route.addr.dev_addr;
-
-		rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
-		rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
-
-		rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
-		iinfo->max_send_wr = ic->i_send_ring.w_nr;
-		iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
-		iinfo->max_send_sge = rds_iwdev->max_sge;
-		rds_iw_get_mr_info(rds_iwdev, iinfo);
-	}
-	return 1;
-}
-
-static void rds_iw_ic_info(struct socket *sock, unsigned int len,
-			   struct rds_info_iterator *iter,
-			   struct rds_info_lengths *lens)
-{
-	rds_for_each_conn_info(sock, len, iter, lens,
-				rds_iw_conn_info_visitor,
-				sizeof(struct rds_info_rdma_connection));
-}
-
-
-/*
- * Early RDS/IB was built to only bind to an address if there is an IPoIB
- * device with that address set.
- *
- * If it were me, I'd advocate for something more flexible.  Sending and
- * receiving should be device-agnostic.  Transports would try and maintain
- * connections between peers who have messages queued.  Userspace would be
- * allowed to influence which paths have priority.  We could call userspace
- * asserting this policy "routing".
- */
-static int rds_iw_laddr_check(struct net *net, __be32 addr)
-{
-	int ret;
-	struct rdma_cm_id *cm_id;
-	struct sockaddr_in sin;
-
-	/* Create a CMA ID and try to bind it. This catches both
-	 * IB and iWARP capable NICs.
-	 */
-	cm_id = rdma_create_id(&init_net, NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
-	if (IS_ERR(cm_id))
-		return PTR_ERR(cm_id);
-
-	memset(&sin, 0, sizeof(sin));
-	sin.sin_family = AF_INET;
-	sin.sin_addr.s_addr = addr;
-
-	/* rdma_bind_addr will only succeed for IB & iWARP devices */
-	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
-	/* due to this, we will claim to support IB devices unless we
-	   check node_type. */
-	if (ret || !cm_id->device ||
-	    cm_id->device->node_type != RDMA_NODE_RNIC)
-		ret = -EADDRNOTAVAIL;
-
-	rdsdebug("addr %pI4 ret %d node type %d\n",
-		&addr, ret,
-		cm_id->device ? cm_id->device->node_type : -1);
-
-	rdma_destroy_id(cm_id);
-
-	return ret;
-}
-
-void rds_iw_exit(void)
-{
-	rds_info_deregister_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info);
-	rds_iw_destroy_nodev_conns();
-	ib_unregister_client(&rds_iw_client);
-	rds_iw_sysctl_exit();
-	rds_iw_recv_exit();
-	rds_trans_unregister(&rds_iw_transport);
-}
-
-struct rds_transport rds_iw_transport = {
-	.laddr_check		= rds_iw_laddr_check,
-	.xmit_complete		= rds_iw_xmit_complete,
-	.xmit			= rds_iw_xmit,
-	.xmit_rdma		= rds_iw_xmit_rdma,
-	.recv			= rds_iw_recv,
-	.conn_alloc		= rds_iw_conn_alloc,
-	.conn_free		= rds_iw_conn_free,
-	.conn_connect		= rds_iw_conn_connect,
-	.conn_shutdown		= rds_iw_conn_shutdown,
-	.inc_copy_to_user	= rds_iw_inc_copy_to_user,
-	.inc_free		= rds_iw_inc_free,
-	.cm_initiate_connect	= rds_iw_cm_initiate_connect,
-	.cm_handle_connect	= rds_iw_cm_handle_connect,
-	.cm_connect_complete	= rds_iw_cm_connect_complete,
-	.stats_info_copy	= rds_iw_stats_info_copy,
-	.exit			= rds_iw_exit,
-	.get_mr			= rds_iw_get_mr,
-	.sync_mr		= rds_iw_sync_mr,
-	.free_mr		= rds_iw_free_mr,
-	.flush_mrs		= rds_iw_flush_mrs,
-	.t_owner		= THIS_MODULE,
-	.t_name			= "iwarp",
-	.t_type			= RDS_TRANS_IWARP,
-	.t_prefer_loopback	= 1,
-};
-
-int rds_iw_init(void)
-{
-	int ret;
-
-	INIT_LIST_HEAD(&rds_iw_devices);
-
-	ret = ib_register_client(&rds_iw_client);
-	if (ret)
-		goto out;
-
-	ret = rds_iw_sysctl_init();
-	if (ret)
-		goto out_ibreg;
-
-	ret = rds_iw_recv_init();
-	if (ret)
-		goto out_sysctl;
-
-	ret = rds_trans_register(&rds_iw_transport);
-	if (ret)
-		goto out_recv;
-
-	rds_info_register_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info);
-
-	goto out;
-
-out_recv:
-	rds_iw_recv_exit();
-out_sysctl:
-	rds_iw_sysctl_exit();
-out_ibreg:
-	ib_unregister_client(&rds_iw_client);
-out:
-	return ret;
-}
-
-MODULE_LICENSE("GPL");
-
diff --git a/net/rds/iw.h b/net/rds/iw.h
deleted file mode 100644
index 5af01d1758b3..000000000000
--- a/net/rds/iw.h
+++ /dev/null
@@ -1,398 +0,0 @@
-#ifndef _RDS_IW_H
-#define _RDS_IW_H
-
-#include <linux/interrupt.h>
-#include <rdma/ib_verbs.h>
-#include <rdma/rdma_cm.h>
-#include "rds.h"
-#include "rdma_transport.h"
-
-#define RDS_FASTREG_SIZE		20
-#define RDS_FASTREG_POOL_SIZE		2048
-
-#define RDS_IW_MAX_SGE			8
-#define RDS_IW_RECV_SGE 		2
-
-#define RDS_IW_DEFAULT_RECV_WR		1024
-#define RDS_IW_DEFAULT_SEND_WR		256
-
-#define RDS_IW_SUPPORTED_PROTOCOLS	0x00000003	/* minor versions supported */
-
-extern struct list_head rds_iw_devices;
-
-/*
- * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to
- * try and minimize the amount of memory tied up both the device and
- * socket receive queues.
- */
-/* page offset of the final full frag that fits in the page */
-#define RDS_PAGE_LAST_OFF (((PAGE_SIZE  / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE)
-struct rds_page_frag {
-	struct list_head	f_item;
-	struct page		*f_page;
-	unsigned long		f_offset;
-	dma_addr_t 		f_mapped;
-};
-
-struct rds_iw_incoming {
-	struct list_head	ii_frags;
-	struct rds_incoming	ii_inc;
-};
-
-struct rds_iw_connect_private {
-	/* Add new fields at the end, and don't permute existing fields. */
-	__be32			dp_saddr;
-	__be32			dp_daddr;
-	u8			dp_protocol_major;
-	u8			dp_protocol_minor;
-	__be16			dp_protocol_minor_mask; /* bitmask */
-	__be32			dp_reserved1;
-	__be64			dp_ack_seq;
-	__be32			dp_credit;		/* non-zero enables flow ctl */
-};
-
-struct rds_iw_scatterlist {
-	struct scatterlist	*list;
-	unsigned int		len;
-	int			dma_len;
-	unsigned int		dma_npages;
-	unsigned int		bytes;
-};
-
-struct rds_iw_mapping {
-	spinlock_t		m_lock;	/* protect the mapping struct */
-	struct list_head	m_list;
-	struct rds_iw_mr	*m_mr;
-	uint32_t		m_rkey;
-	struct rds_iw_scatterlist m_sg;
-};
-
-struct rds_iw_send_work {
-	struct rds_message	*s_rm;
-
-	/* We should really put these into a union: */
-	struct rm_rdma_op	*s_op;
-	struct rds_iw_mapping	*s_mapping;
-	struct ib_mr		*s_mr;
-	unsigned char		s_remap_count;
-
-	union {
-		struct ib_send_wr	s_send_wr;
-		struct ib_rdma_wr	s_rdma_wr;
-		struct ib_reg_wr	s_reg_wr;
-	};
-	struct ib_sge		s_sge[RDS_IW_MAX_SGE];
-	unsigned long		s_queued;
-};
-
-struct rds_iw_recv_work {
-	struct rds_iw_incoming 	*r_iwinc;
-	struct rds_page_frag	*r_frag;
-	struct ib_recv_wr	r_wr;
-	struct ib_sge		r_sge[2];
-};
-
-struct rds_iw_work_ring {
-	u32		w_nr;
-	u32		w_alloc_ptr;
-	u32		w_alloc_ctr;
-	u32		w_free_ptr;
-	atomic_t	w_free_ctr;
-};
-
-struct rds_iw_device;
-
-struct rds_iw_connection {
-
-	struct list_head	iw_node;
-	struct rds_iw_device 	*rds_iwdev;
-	struct rds_connection	*conn;
-
-	/* alphabet soup, IBTA style */
-	struct rdma_cm_id	*i_cm_id;
-	struct ib_pd		*i_pd;
-	struct ib_mr		*i_mr;
-	struct ib_cq		*i_send_cq;
-	struct ib_cq		*i_recv_cq;
-
-	/* tx */
-	struct rds_iw_work_ring	i_send_ring;
-	struct rds_message	*i_rm;
-	struct rds_header	*i_send_hdrs;
-	u64			i_send_hdrs_dma;
-	struct rds_iw_send_work *i_sends;
-
-	/* rx */
-	struct tasklet_struct	i_recv_tasklet;
-	struct mutex		i_recv_mutex;
-	struct rds_iw_work_ring	i_recv_ring;
-	struct rds_iw_incoming	*i_iwinc;
-	u32			i_recv_data_rem;
-	struct rds_header	*i_recv_hdrs;
-	u64			i_recv_hdrs_dma;
-	struct rds_iw_recv_work *i_recvs;
-	struct rds_page_frag	i_frag;
-	u64			i_ack_recv;	/* last ACK received */
-
-	/* sending acks */
-	unsigned long		i_ack_flags;
-#ifdef KERNEL_HAS_ATOMIC64
-	atomic64_t		i_ack_next;	/* next ACK to send */
-#else
-	spinlock_t		i_ack_lock;	/* protect i_ack_next */
-	u64			i_ack_next;	/* next ACK to send */
-#endif
-	struct rds_header	*i_ack;
-	struct ib_send_wr	i_ack_wr;
-	struct ib_sge		i_ack_sge;
-	u64			i_ack_dma;
-	unsigned long		i_ack_queued;
-
-	/* Flow control related information
-	 *
-	 * Our algorithm uses a pair variables that we need to access
-	 * atomically - one for the send credits, and one posted
-	 * recv credits we need to transfer to remote.
-	 * Rather than protect them using a slow spinlock, we put both into
-	 * a single atomic_t and update it using cmpxchg
-	 */
-	atomic_t		i_credits;
-
-	/* Protocol version specific information */
-	unsigned int		i_flowctl:1;	/* enable/disable flow ctl */
-	unsigned int		i_dma_local_lkey:1;
-	unsigned int		i_fastreg_posted:1; /* fastreg posted on this connection */
-	/* Batched completions */
-	unsigned int		i_unsignaled_wrs;
-	long			i_unsignaled_bytes;
-};
-
-/* This assumes that atomic_t is at least 32 bits */
-#define IB_GET_SEND_CREDITS(v)	((v) & 0xffff)
-#define IB_GET_POST_CREDITS(v)	((v) >> 16)
-#define IB_SET_SEND_CREDITS(v)	((v) & 0xffff)
-#define IB_SET_POST_CREDITS(v)	((v) << 16)
-
-struct rds_iw_cm_id {
-	struct list_head	list;
-	struct rdma_cm_id	*cm_id;
-};
-
-struct rds_iw_device {
-	struct list_head	list;
-	struct list_head	cm_id_list;
-	struct list_head	conn_list;
-	struct ib_device	*dev;
-	struct ib_pd		*pd;
-	struct ib_mr		*mr;
-	struct rds_iw_mr_pool	*mr_pool;
-	int			max_sge;
-	unsigned int		max_wrs;
-	unsigned int		dma_local_lkey:1;
-	spinlock_t		spinlock;	/* protect the above */
-};
-
-/* bits for i_ack_flags */
-#define IB_ACK_IN_FLIGHT	0
-#define IB_ACK_REQUESTED	1
-
-/* Magic WR_ID for ACKs */
-#define RDS_IW_ACK_WR_ID	((u64)0xffffffffffffffffULL)
-#define RDS_IW_REG_WR_ID	((u64)0xefefefefefefefefULL)
-#define RDS_IW_LOCAL_INV_WR_ID	((u64)0xdfdfdfdfdfdfdfdfULL)
-
-struct rds_iw_statistics {
-	uint64_t	s_iw_connect_raced;
-	uint64_t	s_iw_listen_closed_stale;
-	uint64_t	s_iw_tx_cq_call;
-	uint64_t	s_iw_tx_cq_event;
-	uint64_t	s_iw_tx_ring_full;
-	uint64_t	s_iw_tx_throttle;
-	uint64_t	s_iw_tx_sg_mapping_failure;
-	uint64_t	s_iw_tx_stalled;
-	uint64_t	s_iw_tx_credit_updates;
-	uint64_t	s_iw_rx_cq_call;
-	uint64_t	s_iw_rx_cq_event;
-	uint64_t	s_iw_rx_ring_empty;
-	uint64_t	s_iw_rx_refill_from_cq;
-	uint64_t	s_iw_rx_refill_from_thread;
-	uint64_t	s_iw_rx_alloc_limit;
-	uint64_t	s_iw_rx_credit_updates;
-	uint64_t	s_iw_ack_sent;
-	uint64_t	s_iw_ack_send_failure;
-	uint64_t	s_iw_ack_send_delayed;
-	uint64_t	s_iw_ack_send_piggybacked;
-	uint64_t	s_iw_ack_received;
-	uint64_t	s_iw_rdma_mr_alloc;
-	uint64_t	s_iw_rdma_mr_free;
-	uint64_t	s_iw_rdma_mr_used;
-	uint64_t	s_iw_rdma_mr_pool_flush;
-	uint64_t	s_iw_rdma_mr_pool_wait;
-	uint64_t	s_iw_rdma_mr_pool_depleted;
-};
-
-extern struct workqueue_struct *rds_iw_wq;
-
-/*
- * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h
- * doesn't define it.
- */
-static inline void rds_iw_dma_sync_sg_for_cpu(struct ib_device *dev,
-		struct scatterlist *sg, unsigned int sg_dma_len, int direction)
-{
-	unsigned int i;
-
-	for (i = 0; i < sg_dma_len; ++i) {
-		ib_dma_sync_single_for_cpu(dev,
-				ib_sg_dma_address(dev, &sg[i]),
-				ib_sg_dma_len(dev, &sg[i]),
-				direction);
-	}
-}
-#define ib_dma_sync_sg_for_cpu	rds_iw_dma_sync_sg_for_cpu
-
-static inline void rds_iw_dma_sync_sg_for_device(struct ib_device *dev,
-		struct scatterlist *sg, unsigned int sg_dma_len, int direction)
-{
-	unsigned int i;
-
-	for (i = 0; i < sg_dma_len; ++i) {
-		ib_dma_sync_single_for_device(dev,
-				ib_sg_dma_address(dev, &sg[i]),
-				ib_sg_dma_len(dev, &sg[i]),
-				direction);
-	}
-}
-#define ib_dma_sync_sg_for_device	rds_iw_dma_sync_sg_for_device
-
-static inline u32 rds_iw_local_dma_lkey(struct rds_iw_connection *ic)
-{
-	return ic->i_dma_local_lkey ? ic->i_cm_id->device->local_dma_lkey : ic->i_mr->lkey;
-}
-
-/* ib.c */
-extern struct rds_transport rds_iw_transport;
-extern struct ib_client rds_iw_client;
-
-extern unsigned int fastreg_pool_size;
-extern unsigned int fastreg_message_size;
-
-extern spinlock_t iw_nodev_conns_lock;
-extern struct list_head iw_nodev_conns;
-
-/* ib_cm.c */
-int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp);
-void rds_iw_conn_free(void *arg);
-int rds_iw_conn_connect(struct rds_connection *conn);
-void rds_iw_conn_shutdown(struct rds_connection *conn);
-void rds_iw_state_change(struct sock *sk);
-int rds_iw_listen_init(void);
-void rds_iw_listen_stop(void);
-void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...);
-int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
-			     struct rdma_cm_event *event);
-int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id);
-void rds_iw_cm_connect_complete(struct rds_connection *conn,
-				struct rdma_cm_event *event);
-
-
-#define rds_iw_conn_error(conn, fmt...) \
-	__rds_iw_conn_error(conn, KERN_WARNING "RDS/IW: " fmt)
-
-/* ib_rdma.c */
-int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id);
-void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn);
-void rds_iw_remove_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn);
-void __rds_iw_destroy_conns(struct list_head *list, spinlock_t *list_lock);
-static inline void rds_iw_destroy_nodev_conns(void)
-{
-	__rds_iw_destroy_conns(&iw_nodev_conns, &iw_nodev_conns_lock);
-}
-static inline void rds_iw_destroy_conns(struct rds_iw_device *rds_iwdev)
-{
-	__rds_iw_destroy_conns(&rds_iwdev->conn_list, &rds_iwdev->spinlock);
-}
-struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *);
-void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo);
-void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *);
-void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
-		    struct rds_sock *rs, u32 *key_ret);
-void rds_iw_sync_mr(void *trans_private, int dir);
-void rds_iw_free_mr(void *trans_private, int invalidate);
-void rds_iw_flush_mrs(void);
-
-/* ib_recv.c */
-int rds_iw_recv_init(void);
-void rds_iw_recv_exit(void);
-int rds_iw_recv(struct rds_connection *conn);
-int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
-		       gfp_t page_gfp, int prefill);
-void rds_iw_inc_free(struct rds_incoming *inc);
-int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to);
-void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context);
-void rds_iw_recv_tasklet_fn(unsigned long data);
-void rds_iw_recv_init_ring(struct rds_iw_connection *ic);
-void rds_iw_recv_clear_ring(struct rds_iw_connection *ic);
-void rds_iw_recv_init_ack(struct rds_iw_connection *ic);
-void rds_iw_attempt_ack(struct rds_iw_connection *ic);
-void rds_iw_ack_send_complete(struct rds_iw_connection *ic);
-u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic);
-
-/* ib_ring.c */
-void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr);
-void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr);
-u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos);
-void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val);
-void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val);
-int rds_iw_ring_empty(struct rds_iw_work_ring *ring);
-int rds_iw_ring_low(struct rds_iw_work_ring *ring);
-u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring);
-u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest);
-extern wait_queue_head_t rds_iw_ring_empty_wait;
-
-/* ib_send.c */
-void rds_iw_xmit_complete(struct rds_connection *conn);
-int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
-		unsigned int hdr_off, unsigned int sg, unsigned int off);
-void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context);
-void rds_iw_send_init_ring(struct rds_iw_connection *ic);
-void rds_iw_send_clear_ring(struct rds_iw_connection *ic);
-int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
-void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits);
-void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted);
-int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted,
-			     u32 *adv_credits, int need_posted, int max_posted);
-
-/* ib_stats.c */
-DECLARE_PER_CPU(struct rds_iw_statistics, rds_iw_stats);
-#define rds_iw_stats_inc(member) rds_stats_inc_which(rds_iw_stats, member)
-unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
-				    unsigned int avail);
-
-/* ib_sysctl.c */
-int rds_iw_sysctl_init(void);
-void rds_iw_sysctl_exit(void);
-extern unsigned long rds_iw_sysctl_max_send_wr;
-extern unsigned long rds_iw_sysctl_max_recv_wr;
-extern unsigned long rds_iw_sysctl_max_unsig_wrs;
-extern unsigned long rds_iw_sysctl_max_unsig_bytes;
-extern unsigned long rds_iw_sysctl_max_recv_allocation;
-extern unsigned int rds_iw_sysctl_flow_control;
-
-/*
- * Helper functions for getting/setting the header and data SGEs in
- * RDS packets (not RDMA)
- */
-static inline struct ib_sge *
-rds_iw_header_sge(struct rds_iw_connection *ic, struct ib_sge *sge)
-{
-	return &sge[0];
-}
-
-static inline struct ib_sge *
-rds_iw_data_sge(struct rds_iw_connection *ic, struct ib_sge *sge)
-{
-	return &sge[1];
-}
-
-#endif
diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c
deleted file mode 100644
index aea4c911bc76..000000000000
--- a/net/rds/iw_cm.c
+++ /dev/null
@@ -1,769 +0,0 @@
-/*
- * Copyright (c) 2006 Oracle.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-#include <linux/in.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/ratelimit.h>
-
-#include "rds.h"
-#include "iw.h"
-
-/*
- * Set the selected protocol version
- */
-static void rds_iw_set_protocol(struct rds_connection *conn, unsigned int version)
-{
-	conn->c_version = version;
-}
-
-/*
- * Set up flow control
- */
-static void rds_iw_set_flow_control(struct rds_connection *conn, u32 credits)
-{
-	struct rds_iw_connection *ic = conn->c_transport_data;
-
-	if (rds_iw_sysctl_flow_control && credits != 0) {
-		/* We're doing flow control */
-		ic->i_flowctl = 1;
-		rds_iw_send_add_credits(conn, credits);
-	} else {
-		ic->i_flowctl = 0;
-	}
-}
-
-/*
- * Connection established.
- * We get here for both outgoing and incoming connection.
- */
-void rds_iw_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
-{
-	const struct rds_iw_connect_private *dp = NULL;
-	struct rds_iw_connection *ic = conn->c_transport_data;
-	struct rds_iw_device *rds_iwdev;
-	int err;
-
-	if (event->param.conn.private_data_len) {
-		dp = event->param.conn.private_data;
-
-		rds_iw_set_protocol(conn,
-				RDS_PROTOCOL(dp->dp_protocol_major,
-					dp->dp_protocol_minor));
-		rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
-	}
-
-	/* update ib_device with this local ipaddr & conn */
-	rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
-	err = rds_iw_update_cm_id(rds_iwdev, ic->i_cm_id);
-	if (err)
-		printk(KERN_ERR "rds_iw_update_ipaddr failed (%d)\n", err);
-	rds_iw_add_conn(rds_iwdev, conn);
-
-	/* If the peer gave us the last packet it saw, process this as if
-	 * we had received a regular ACK. */
-	if (dp && dp->dp_ack_seq)
-		rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
-
-	printk(KERN_NOTICE "RDS/IW: connected to %pI4<->%pI4 version %u.%u%s\n",
-			&conn->c_laddr, &conn->c_faddr,
-			RDS_PROTOCOL_MAJOR(conn->c_version),
-			RDS_PROTOCOL_MINOR(conn->c_version),
-			ic->i_flowctl ? ", flow control" : "");
-
-	rds_connect_complete(conn);
-}
-
-static void rds_iw_cm_fill_conn_param(struct rds_connection *conn,
-			struct rdma_conn_param *conn_param,
-			struct rds_iw_connect_private *dp,
-			u32 protocol_version)
-{
-	struct rds_iw_connection *ic = conn->c_transport_data;
-
-	memset(conn_param, 0, sizeof(struct rdma_conn_param));
-	/* XXX tune these? */
-	conn_param->responder_resources = 1;
-	conn_param->initiator_depth = 1;
-
-	if (dp) {
-		memset(dp, 0, sizeof(*dp));
-		dp->dp_saddr = conn->c_laddr;
-		dp->dp_daddr = conn->c_faddr;
-		dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
-		dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
-		dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IW_SUPPORTED_PROTOCOLS);
-		dp->dp_ack_seq = rds_iw_piggyb_ack(ic);
-
-		/* Advertise flow control */
-		if (ic->i_flowctl) {
-			unsigned int credits;
-
-			credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits));
-			dp->dp_credit = cpu_to_be32(credits);
-			atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits);
-		}
-
-		conn_param->private_data = dp;
-		conn_param->private_data_len = sizeof(*dp);
-	}
-}
-
-static void rds_iw_cq_event_handler(struct ib_event *event, void *data)
-{
-	rdsdebug("event %u data %p\n", event->event, data);
-}
-
-static void rds_iw_qp_event_handler(struct ib_event *event, void *data)
-{
-	struct rds_connection *conn = data;
-	struct rds_iw_connection *ic = conn->c_transport_data;
-
-	rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event);
-
-	switch (event->event) {
-	case IB_EVENT_COMM_EST:
-		rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
-		break;
-	case IB_EVENT_QP_REQ_ERR:
-	case IB_EVENT_QP_FATAL:
-	default:
-		rdsdebug("Fatal QP Event %u "
-			"- connection %pI4->%pI4, reconnecting\n",
-			event->event, &conn->c_laddr,
-			&conn->c_faddr);
-		rds_conn_drop(conn);
-		break;
-	}
-}
-
-/*
- * Create a QP
- */
-static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr,
-		struct rds_iw_device *rds_iwdev,
-		struct rds_iw_work_ring *send_ring,
-		void (*send_cq_handler)(struct ib_cq *, void *),
-		struct rds_iw_work_ring *recv_ring,
-		void (*recv_cq_handler)(struct ib_cq *, void *),
-		void *context)
-{
-	struct ib_device *dev = rds_iwdev->dev;
-	struct ib_cq_init_attr cq_attr = {};
-	unsigned int send_size, recv_size;
-	int ret;
-
-	/* The offset of 1 is to accommodate the additional ACK WR. */
-	send_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_send_wr + 1);
-	recv_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_recv_wr + 1);
-	rds_iw_ring_resize(send_ring, send_size - 1);
-	rds_iw_ring_resize(recv_ring, recv_size - 1);
-
-	memset(attr, 0, sizeof(*attr));
-	attr->event_handler = rds_iw_qp_event_handler;
-	attr->qp_context = context;
-	attr->cap.max_send_wr = send_size;
-	attr->cap.max_recv_wr = recv_size;
-	attr->cap.max_send_sge = rds_iwdev->max_sge;
-	attr->cap.max_recv_sge = RDS_IW_RECV_SGE;
-	attr->sq_sig_type = IB_SIGNAL_REQ_WR;
-	attr->qp_type = IB_QPT_RC;
-
-	cq_attr.cqe = send_size;
-	attr->send_cq = ib_create_cq(dev, send_cq_handler,
-				     rds_iw_cq_event_handler,
-				     context, &cq_attr);
-	if (IS_ERR(attr->send_cq)) {
-		ret = PTR_ERR(attr->send_cq);
-		attr->send_cq = NULL;
-		rdsdebug("ib_create_cq send failed: %d\n", ret);
-		goto out;
-	}
-
-	cq_attr.cqe = recv_size;
-	attr->recv_cq = ib_create_cq(dev, recv_cq_handler,
-				     rds_iw_cq_event_handler,
-				     context, &cq_attr);
-	if (IS_ERR(attr->recv_cq)) {
-		ret = PTR_ERR(attr->recv_cq);
-		attr->recv_cq = NULL;
-		rdsdebug("ib_create_cq send failed: %d\n", ret);
-		goto out;
-	}
-
-	ret = ib_req_notify_cq(attr->send_cq, IB_CQ_NEXT_COMP);
-	if (ret) {
-		rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
-		goto out;
-	}
-
-	ret = ib_req_notify_cq(attr->recv_cq, IB_CQ_SOLICITED);
-	if (ret) {
-		rdsdebug("ib_req_notify_cq recv failed: %d\n", ret);
-		goto out;
-	}
-
-out:
-	if (ret) {
-		if (attr->send_cq)
-			ib_destroy_cq(attr->send_cq);
-		if (attr->recv_cq)
-			ib_destroy_cq(attr->recv_cq);
-	}
-	return ret;
-}
-
-/*
- * This needs to be very careful to not leave IS_ERR pointers around for
- * cleanup to trip over.
- */
-static int rds_iw_setup_qp(struct rds_connection *conn)
-{
-	struct rds_iw_connection *ic = conn->c_transport_data;
-	struct ib_device *dev = ic->i_cm_id->device;
-	struct ib_qp_init_attr attr;
-	struct rds_iw_device *rds_iwdev;
-	int ret;
-
-	/* rds_iw_add_one creates a rds_iw_device object per IB device,
-	 * and allocates a protection domain, memory range and MR pool
-	 * for each.  If that fails for any reason, it will not register
-	 * the rds_iwdev at all.
-	 */
-	rds_iwdev = ib_get_client_data(dev, &rds_iw_client);
-	if (!rds_iwdev) {
-		printk_ratelimited(KERN_NOTICE "RDS/IW: No client_data for device %s\n",
-					dev->name);
-		return -EOPNOTSUPP;
-	}
-
-	/* Protection domain and memory range */
-	ic->i_pd = rds_iwdev->pd;
-	ic->i_mr = rds_iwdev->mr;
-
-	ret = rds_iw_init_qp_attrs(&attr, rds_iwdev,
-			&ic->i_send_ring, rds_iw_send_cq_comp_handler,
-			&ic->i_recv_ring, rds_iw_recv_cq_comp_handler,
-			conn);
-	if (ret < 0)
-		goto out;
-
-	ic->i_send_cq = attr.send_cq;
-	ic->i_recv_cq = attr.recv_cq;
-
-	/*
-	 * XXX this can fail if max_*_wr is too large?  Are we supposed
-	 * to back off until we get a value that the hardware can support?
-	 */
-	ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
-	if (ret) {
-		rdsdebug("rdma_create_qp failed: %d\n", ret);
-		goto out;
-	}
-
-	ic->i_send_hdrs = ib_dma_alloc_coherent(dev,
-					   ic->i_send_ring.w_nr *
-						sizeof(struct rds_header),
-					   &ic->i_send_hdrs_dma, GFP_KERNEL);
-	if (!ic->i_send_hdrs) {
-		ret = -ENOMEM;
-		rdsdebug("ib_dma_alloc_coherent send failed\n");
-		goto out;
-	}
-
-	ic->i_recv_hdrs = ib_dma_alloc_coherent(dev,
-					   ic->i_recv_ring.w_nr *
-						sizeof(struct rds_header),
-					   &ic->i_recv_hdrs_dma, GFP_KERNEL);
-	if (!ic->i_recv_hdrs) {
-		ret = -ENOMEM;
-		rdsdebug("ib_dma_alloc_coherent recv failed\n");
-		goto out;
-	}
-
-	ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
-				       &ic->i_ack_dma, GFP_KERNEL);
-	if (!ic->i_ack) {
-		ret = -ENOMEM;
-		rdsdebug("ib_dma_alloc_coherent ack failed\n");
-		goto out;
-	}
-
-	ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work));
-	if (!ic->i_sends) {
-		ret = -ENOMEM;
-		rdsdebug("send allocation failed\n");
-		goto out;
-	}
-	rds_iw_send_init_ring(ic);
-
-	ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work));
-	if (!ic->i_recvs) {
-		ret = -ENOMEM;
-		rdsdebug("recv allocation failed\n");
-		goto out;
-	}
-
-	rds_iw_recv_init_ring(ic);
-	rds_iw_recv_init_ack(ic);
-
-	/* Post receive buffers - as a side effect, this will update
-	 * the posted credit count. */
-	rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
-
-	rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr,
-		 ic->i_send_cq, ic->i_recv_cq);
-
-out:
-	return ret;
-}
-
-static u32 rds_iw_protocol_compatible(const struct rds_iw_connect_private *dp)
-{
-	u16 common;
-	u32 version = 0;
-
-	/* rdma_cm private data is odd - when there is any private data in the
-	 * request, we will be given a pretty large buffer without telling us the
-	 * original size. The only way to tell the difference is by looking at
-	 * the contents, which are initialized to zero.
-	 * If the protocol version fields aren't set, this is a connection attempt
-	 * from an older version. This could could be 3.0 or 2.0 - we can't tell.
-	 * We really should have changed this for OFED 1.3 :-( */
-	if (dp->dp_protocol_major == 0)
-		return RDS_PROTOCOL_3_0;
-
-	common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IW_SUPPORTED_PROTOCOLS;
-	if (dp->dp_protocol_major == 3 && common) {
-		version = RDS_PROTOCOL_3_0;
-		while ((common >>= 1) != 0)
-			version++;
-	}
-	printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using "
-			"incompatible protocol version %u.%u\n",
-			&dp->dp_saddr,
-			dp->dp_protocol_major,
-			dp->dp_protocol_minor);
-	return version;
-}
-
-int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
-				    struct rdma_cm_event *event)
-{
-	const struct rds_iw_connect_private *dp = event->param.conn.private_data;
-	struct rds_iw_connect_private dp_rep;
-	struct rds_connection *conn = NULL;
-	struct rds_iw_connection *ic = NULL;
-	struct rdma_conn_param conn_param;
-	struct rds_iw_device *rds_iwdev;
-	u32 version;
-	int err, destroy = 1;
-
-	/* Check whether the remote protocol version matches ours. */
-	version = rds_iw_protocol_compatible(dp);
-	if (!version)
-		goto out;
-
-	rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u\n",
-		 &dp->dp_saddr, &dp->dp_daddr,
-		 RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version));
-
-	/* RDS/IW is not currently netns aware, thus init_net */
-	conn = rds_conn_create(&init_net, dp->dp_daddr, dp->dp_saddr,
-			       &rds_iw_transport, GFP_KERNEL);
-	if (IS_ERR(conn)) {
-		rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
-		conn = NULL;
-		goto out;
-	}
-
-	/*
-	 * The connection request may occur while the
-	 * previous connection exist, e.g. in case of failover.
-	 * But as connections may be initiated simultaneously
-	 * by both hosts, we have a random backoff mechanism -
-	 * see the comment above rds_queue_reconnect()
-	 */
-	mutex_lock(&conn->c_cm_lock);
-	if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
-		if (rds_conn_state(conn) == RDS_CONN_UP) {
-			rdsdebug("incoming connect while connecting\n");
-			rds_conn_drop(conn);
-			rds_iw_stats_inc(s_iw_listen_closed_stale);
-		} else
-		if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
-			/* Wait and see - our connect may still be succeeding */
-			rds_iw_stats_inc(s_iw_connect_raced);
-		}
-		mutex_unlock(&conn->c_cm_lock);
-		goto out;
-	}
-
-	ic = conn->c_transport_data;
-
-	rds_iw_set_protocol(conn, version);
-	rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
-
-	/* If the peer gave us the last packet it saw, process this as if
-	 * we had received a regular ACK. */
-	if (dp->dp_ack_seq)
-		rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
-
-	BUG_ON(cm_id->context);
-	BUG_ON(ic->i_cm_id);
-
-	ic->i_cm_id = cm_id;
-	cm_id->context = conn;
-
-	rds_iwdev = ib_get_client_data(cm_id->device, &rds_iw_client);
-	ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey;
-
-	/* We got halfway through setting up the ib_connection, if we
-	 * fail now, we have to take the long route out of this mess. */
-	destroy = 0;
-
-	err = rds_iw_setup_qp(conn);
-	if (err) {
-		rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", err);
-		mutex_unlock(&conn->c_cm_lock);
-		goto out;
-	}
-
-	rds_iw_cm_fill_conn_param(conn, &conn_param, &dp_rep, version);
-
-	/* rdma_accept() calls rdma_reject() internally if it fails */
-	err = rdma_accept(cm_id, &conn_param);
-	mutex_unlock(&conn->c_cm_lock);
-	if (err) {
-		rds_iw_conn_error(conn, "rdma_accept failed (%d)\n", err);
-		goto out;
-	}
-
-	return 0;
-
-out:
-	rdma_reject(cm_id, NULL, 0);
-	return destroy;
-}
-
-
-int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id)
-{
-	struct rds_connection *conn = cm_id->context;
-	struct rds_iw_connection *ic = conn->c_transport_data;
-	struct rdma_conn_param conn_param;
-	struct rds_iw_connect_private dp;
-	int ret;
-
-	/* If the peer doesn't do protocol negotiation, we must
-	 * default to RDSv3.0 */
-	rds_iw_set_protocol(conn, RDS_PROTOCOL_3_0);
-	ic->i_flowctl = rds_iw_sysctl_flow_control;	/* advertise flow control */
-
-	ret = rds_iw_setup_qp(conn);
-	if (ret) {
-		rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", ret);
-		goto out;
-	}
-
-	rds_iw_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION);
-
-	ret = rdma_connect(cm_id, &conn_param);
-	if (ret)
-		rds_iw_conn_error(conn, "rdma_connect failed (%d)\n", ret);
-
-out:
-	/* Beware - returning non-zero tells the rdma_cm to destroy
-	 * the cm_id. We should certainly not do it as long as we still
-	 * "own" the cm_id. */
-	if (ret) {
-		struct rds_iw_connection *ic = conn->c_transport_data;
-
-		if (ic->i_cm_id == cm_id)
-			ret = 0;
-	}
-	return ret;
-}
-
-int rds_iw_conn_connect(struct rds_connection *conn)
-{
-	struct rds_iw_connection *ic = conn->c_transport_data;
-	struct rds_iw_device *rds_iwdev;
-	struct sockaddr_in src, dest;
-	int ret;
-
-	/* XXX I wonder what affect the port space has */
-	/* delegate cm event handler to rdma_transport */
-	ic->i_cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, conn,
-				     RDMA_PS_TCP, IB_QPT_RC);
-	if (IS_ERR(ic->i_cm_id)) {
-		ret = PTR_ERR(ic->i_cm_id);
-		ic->i_cm_id = NULL;
-		rdsdebug("rdma_create_id() failed: %d\n", ret);
-		goto out;
-	}
-
-	rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
-
-	src.sin_family = AF_INET;
-	src.sin_addr.s_addr = (__force u32)conn->c_laddr;
-	src.sin_port = (__force u16)htons(0);
-
-	/* First, bind to the local address and device. */
-	ret = rdma_bind_addr(ic->i_cm_id, (struct sockaddr *) &src);
-	if (ret) {
-		rdsdebug("rdma_bind_addr(%pI4) failed: %d\n",
-				&conn->c_laddr, ret);
-		rdma_destroy_id(ic->i_cm_id);
-		ic->i_cm_id = NULL;
-		goto out;
-	}
-
-	rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
-	ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey;
-
-	dest.sin_family = AF_INET;
-	dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
-	dest.sin_port = (__force u16)htons(RDS_PORT);
-
-	ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
-				(struct sockaddr *)&dest,
-				RDS_RDMA_RESOLVE_TIMEOUT_MS);
-	if (ret) {
-		rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id,
-			 ret);
-		rdma_destroy_id(ic->i_cm_id);
-		ic->i_cm_id = NULL;
-	}
-
-out:
-	return ret;
-}
-
-/*
- * This is so careful about only cleaning up resources that were built up
- * so that it can be called at any point during startup.  In fact it
- * can be called multiple times for a given connection.
- */
-void rds_iw_conn_shutdown(struct rds_connection *conn)
-{
-	struct rds_iw_connection *ic = conn->c_transport_data;
-	int err = 0;
-	struct ib_qp_attr qp_attr;
-
-	rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id,
-		 ic->i_pd, ic->i_send_cq, ic->i_recv_cq,
-		 ic->i_cm_id ? ic->i_cm_id->qp : NULL);
-
-	if (ic->i_cm_id) {
-		struct ib_device *dev = ic->i_cm_id->device;
-
-		rdsdebug("disconnecting cm %p\n", ic->i_cm_id);
-		err = rdma_disconnect(ic->i_cm_id);
-		if (err) {
-			/* Actually this may happen quite frequently, when
-			 * an outgoing connect raced with an incoming connect.
-			 */
-			rdsdebug("failed to disconnect, cm: %p err %d\n",
-				 ic->i_cm_id, err);
-		}
-
-		if (ic->i_cm_id->qp) {
-			qp_attr.qp_state = IB_QPS_ERR;
-			ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
-		}
-
-		wait_event(rds_iw_ring_empty_wait,
-			rds_iw_ring_empty(&ic->i_send_ring) &&
-			rds_iw_ring_empty(&ic->i_recv_ring));
-
-		if (ic->i_send_hdrs)
-			ib_dma_free_coherent(dev,
-					   ic->i_send_ring.w_nr *
-						sizeof(struct rds_header),
-					   ic->i_send_hdrs,
-					   ic->i_send_hdrs_dma);
-
-		if (ic->i_recv_hdrs)
-			ib_dma_free_coherent(dev,
-					   ic->i_recv_ring.w_nr *
-						sizeof(struct rds_header),
-					   ic->i_recv_hdrs,
-					   ic->i_recv_hdrs_dma);
-
-		if (ic->i_ack)
-			ib_dma_free_coherent(dev, sizeof(struct rds_header),
-					     ic->i_ack, ic->i_ack_dma);
-
-		if (ic->i_sends)
-			rds_iw_send_clear_ring(ic);
-		if (ic->i_recvs)
-			rds_iw_recv_clear_ring(ic);
-
-		if (ic->i_cm_id->qp)
-			rdma_destroy_qp(ic->i_cm_id);
-		if (ic->i_send_cq)
-			ib_destroy_cq(ic->i_send_cq);
-		if (ic->i_recv_cq)
-			ib_destroy_cq(ic->i_recv_cq);
-
-		/*
-		 * If associated with an rds_iw_device:
-		 * 	Move connection back to the nodev list.
-		 * 	Remove cm_id from the device cm_id list.
-		 */
-		if (ic->rds_iwdev)
-			rds_iw_remove_conn(ic->rds_iwdev, conn);
-
-		rdma_destroy_id(ic->i_cm_id);
-
-		ic->i_cm_id = NULL;
-		ic->i_pd = NULL;
-		ic->i_mr = NULL;
-		ic->i_send_cq = NULL;
-		ic->i_recv_cq = NULL;
-		ic->i_send_hdrs = NULL;
-		ic->i_recv_hdrs = NULL;
-		ic->i_ack = NULL;
-	}
-	BUG_ON(ic->rds_iwdev);
-
-	/* Clear pending transmit */
-	if (ic->i_rm) {
-		rds_message_put(ic->i_rm);
-		ic->i_rm = NULL;
-	}
-
-	/* Clear the ACK state */
-	clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
-#ifdef KERNEL_HAS_ATOMIC64
-	atomic64_set(&ic->i_ack_next, 0);
-#else
-	ic->i_ack_next = 0;
-#endif
-	ic->i_ack_recv = 0;
-
-	/* Clear flow control state */
-	ic->i_flowctl = 0;
-	atomic_set(&ic->i_credits, 0);
-
-	rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr);
-	rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr);
-
-	if (ic->i_iwinc) {
-		rds_inc_put(&ic->i_iwinc->ii_inc);
-		ic->i_iwinc = NULL;
-	}
-
-	vfree(ic->i_sends);
-	ic->i_sends = NULL;
-	vfree(ic->i_recvs);
-	ic->i_recvs = NULL;
-	rdsdebug("shutdown complete\n");
-}
-
-int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp)
-{
-	struct rds_iw_connection *ic;
-	unsigned long flags;
-
-	/* XXX too lazy? */
-	ic = kzalloc(sizeof(struct rds_iw_connection), gfp);
-	if (!ic)
-		return -ENOMEM;
-
-	INIT_LIST_HEAD(&ic->iw_node);
-	tasklet_init(&ic->i_recv_tasklet, rds_iw_recv_tasklet_fn,
-		     (unsigned long) ic);
-	mutex_init(&ic->i_recv_mutex);
-#ifndef KERNEL_HAS_ATOMIC64
-	spin_lock_init(&ic->i_ack_lock);
-#endif
-
-	/*
-	 * rds_iw_conn_shutdown() waits for these to be emptied so they
-	 * must be initialized before it can be called.
-	 */
-	rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr);
-	rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr);
-
-	ic->conn = conn;
-	conn->c_transport_data = ic;
-
-	spin_lock_irqsave(&iw_nodev_conns_lock, flags);
-	list_add_tail(&ic->iw_node, &iw_nodev_conns);
-	spin_unlock_irqrestore(&iw_nodev_conns_lock, flags);
-
-
-	rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data);
-	return 0;
-}
-
-/*
- * Free a connection. Connection must be shut down and not set for reconnect.
- */
-void rds_iw_conn_free(void *arg)
-{
-	struct rds_iw_connection *ic = arg;
-	spinlock_t	*lock_ptr;
-
-	rdsdebug("ic %p\n", ic);
-
-	/*
-	 * Conn is either on a dev's list or on the nodev list.
-	 * A race with shutdown() or connect() would cause problems
-	 * (since rds_iwdev would change) but that should never happen.
-	 */
-	lock_ptr = ic->rds_iwdev ? &ic->rds_iwdev->spinlock : &iw_nodev_conns_lock;
-
-	spin_lock_irq(lock_ptr);
-	list_del(&ic->iw_node);
-	spin_unlock_irq(lock_ptr);
-
-	kfree(ic);
-}
-
-/*
- * An error occurred on the connection
- */
-void
-__rds_iw_conn_error(struct rds_connection *conn, const char *fmt, ...)
-{
-	va_list ap;
-
-	rds_conn_drop(conn);
-
-	va_start(ap, fmt);
-	vprintk(fmt, ap);
-	va_end(ap);
-}
diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c
deleted file mode 100644
index b09a40c1adce..000000000000
--- a/net/rds/iw_rdma.c
+++ /dev/null
@@ -1,837 +0,0 @@
-/*
- * Copyright (c) 2006 Oracle.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/ratelimit.h>
-
-#include "rds.h"
-#include "iw.h"
-
-
-/*
- * This is stored as mr->r_trans_private.
- */
-struct rds_iw_mr {
-	struct rds_iw_device	*device;
-	struct rds_iw_mr_pool	*pool;
-	struct rdma_cm_id	*cm_id;
-
-	struct ib_mr	*mr;
-
-	struct rds_iw_mapping	mapping;
-	unsigned char		remap_count;
-};
-
-/*
- * Our own little MR pool
- */
-struct rds_iw_mr_pool {
-	struct rds_iw_device	*device;		/* back ptr to the device that owns us */
-
-	struct mutex		flush_lock;		/* serialize fmr invalidate */
-	struct work_struct	flush_worker;		/* flush worker */
-
-	spinlock_t		list_lock;		/* protect variables below */
-	atomic_t		item_count;		/* total # of MRs */
-	atomic_t		dirty_count;		/* # dirty of MRs */
-	struct list_head	dirty_list;		/* dirty mappings */
-	struct list_head	clean_list;		/* unused & unamapped MRs */
-	atomic_t		free_pinned;		/* memory pinned by free MRs */
-	unsigned long		max_message_size;	/* in pages */
-	unsigned long		max_items;
-	unsigned long		max_items_soft;
-	unsigned long		max_free_pinned;
-	int			max_pages;
-};
-
-static void rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all);
-static void rds_iw_mr_pool_flush_worker(struct work_struct *work);
-static int rds_iw_init_reg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
-static int rds_iw_map_reg(struct rds_iw_mr_pool *pool,
-			  struct rds_iw_mr *ibmr,
-			  struct scatterlist *sg, unsigned int nents);
-static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
-static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
-			struct list_head *unmap_list,
-			struct list_head *kill_list,
-			int *unpinned);
-static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
-
-static int rds_iw_get_device(struct sockaddr_in *src, struct sockaddr_in *dst,
-			     struct rds_iw_device **rds_iwdev,
-			     struct rdma_cm_id **cm_id)
-{
-	struct rds_iw_device *iwdev;
-	struct rds_iw_cm_id *i_cm_id;
-
-	*rds_iwdev = NULL;
-	*cm_id = NULL;
-
-	list_for_each_entry(iwdev, &rds_iw_devices, list) {
-		spin_lock_irq(&iwdev->spinlock);
-		list_for_each_entry(i_cm_id, &iwdev->cm_id_list, list) {
-			struct sockaddr_in *src_addr, *dst_addr;
-
-			src_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.src_addr;
-			dst_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.dst_addr;
-
-			rdsdebug("local ipaddr = %x port %d, "
-				 "remote ipaddr = %x port %d"
-				 "..looking for %x port %d, "
-				 "remote ipaddr = %x port %d\n",
-				src_addr->sin_addr.s_addr,
-				src_addr->sin_port,
-				dst_addr->sin_addr.s_addr,
-				dst_addr->sin_port,
-				src->sin_addr.s_addr,
-				src->sin_port,
-				dst->sin_addr.s_addr,
-				dst->sin_port);
-#ifdef WORKING_TUPLE_DETECTION
-			if (src_addr->sin_addr.s_addr == src->sin_addr.s_addr &&
-			    src_addr->sin_port == src->sin_port &&
-			    dst_addr->sin_addr.s_addr == dst->sin_addr.s_addr &&
-			    dst_addr->sin_port == dst->sin_port) {
-#else
-			/* FIXME - needs to compare the local and remote
-			 * ipaddr/port tuple, but the ipaddr is the only
-			 * available information in the rds_sock (as the rest are
-			 * zero'ed.  It doesn't appear to be properly populated
-			 * during connection setup...
-			 */
-			if (src_addr->sin_addr.s_addr == src->sin_addr.s_addr) {
-#endif
-				spin_unlock_irq(&iwdev->spinlock);
-				*rds_iwdev = iwdev;
-				*cm_id = i_cm_id->cm_id;
-				return 0;
-			}
-		}
-		spin_unlock_irq(&iwdev->spinlock);
-	}
-
-	return 1;
-}
-
-static int rds_iw_add_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
-{
-	struct rds_iw_cm_id *i_cm_id;
-
-	i_cm_id = kmalloc(sizeof *i_cm_id, GFP_KERNEL);
-	if (!i_cm_id)
-		return -ENOMEM;
-
-	i_cm_id->cm_id = cm_id;
-
-	spin_lock_irq(&rds_iwdev->spinlock);
-	list_add_tail(&i_cm_id->list, &rds_iwdev->cm_id_list);
-	spin_unlock_irq(&rds_iwdev->spinlock);
-
-	return 0;
-}
-
-static void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev,
-				struct rdma_cm_id *cm_id)
-{
-	struct rds_iw_cm_id *i_cm_id;
-
-	spin_lock_irq(&rds_iwdev->spinlock);
-	list_for_each_entry(i_cm_id, &rds_iwdev->cm_id_list, list) {
-		if (i_cm_id->cm_id == cm_id) {
-			list_del(&i_cm_id->list);
-			kfree(i_cm_id);
-			break;
-		}
-	}
-	spin_unlock_irq(&rds_iwdev->spinlock);
-}
-
-
-int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
-{
-	struct sockaddr_in *src_addr, *dst_addr;
-	struct rds_iw_device *rds_iwdev_old;
-	struct rdma_cm_id *pcm_id;
-	int rc;
-
-	src_addr = (struct sockaddr_in *)&cm_id->route.addr.src_addr;
-	dst_addr = (struct sockaddr_in *)&cm_id->route.addr.dst_addr;
-
-	rc = rds_iw_get_device(src_addr, dst_addr, &rds_iwdev_old, &pcm_id);
-	if (rc)
-		rds_iw_remove_cm_id(rds_iwdev, cm_id);
-
-	return rds_iw_add_cm_id(rds_iwdev, cm_id);
-}
-
-void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn)
-{
-	struct rds_iw_connection *ic = conn->c_transport_data;
-
-	/* conn was previously on the nodev_conns_list */
-	spin_lock_irq(&iw_nodev_conns_lock);
-	BUG_ON(list_empty(&iw_nodev_conns));
-	BUG_ON(list_empty(&ic->iw_node));
-	list_del(&ic->iw_node);
-
-	spin_lock(&rds_iwdev->spinlock);
-	list_add_tail(&ic->iw_node, &rds_iwdev->conn_list);
-	spin_unlock(&rds_iwdev->spinlock);
-	spin_unlock_irq(&iw_nodev_conns_lock);
-
-	ic->rds_iwdev = rds_iwdev;
-}
-
-void rds_iw_remove_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn)
-{
-	struct rds_iw_connection *ic = conn->c_transport_data;
-
-	/* place conn on nodev_conns_list */
-	spin_lock(&iw_nodev_conns_lock);
-
-	spin_lock_irq(&rds_iwdev->spinlock);
-	BUG_ON(list_empty(&ic->iw_node));
-	list_del(&ic->iw_node);
-	spin_unlock_irq(&rds_iwdev->spinlock);
-
-	list_add_tail(&ic->iw_node, &iw_nodev_conns);
-
-	spin_unlock(&iw_nodev_conns_lock);
-
-	rds_iw_remove_cm_id(ic->rds_iwdev, ic->i_cm_id);
-	ic->rds_iwdev = NULL;
-}
-
-void __rds_iw_destroy_conns(struct list_head *list, spinlock_t *list_lock)
-{
-	struct rds_iw_connection *ic, *_ic;
-	LIST_HEAD(tmp_list);
-
-	/* avoid calling conn_destroy with irqs off */
-	spin_lock_irq(list_lock);
-	list_splice(list, &tmp_list);
-	INIT_LIST_HEAD(list);
-	spin_unlock_irq(list_lock);
-
-	list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node)
-		rds_conn_destroy(ic->conn);
-}
-
-static void rds_iw_set_scatterlist(struct rds_iw_scatterlist *sg,
-		struct scatterlist *list, unsigned int sg_len)
-{
-	sg->list = list;
-	sg->len = sg_len;
-	sg->dma_len = 0;
-	sg->dma_npages = 0;
-	sg->bytes = 0;
-}
-
-static int rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev,
-				  struct rds_iw_scatterlist *sg)
-{
-	struct ib_device *dev = rds_iwdev->dev;
-	int i, ret;
-
-	WARN_ON(sg->dma_len);
-
-	sg->dma_len = ib_dma_map_sg(dev, sg->list, sg->len, DMA_BIDIRECTIONAL);
-	if (unlikely(!sg->dma_len)) {
-		printk(KERN_WARNING "RDS/IW: dma_map_sg failed!\n");
-		return -EBUSY;
-	}
-
-	sg->bytes = 0;
-	sg->dma_npages = 0;
-
-	ret = -EINVAL;
-	for (i = 0; i < sg->dma_len; ++i) {
-		unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]);
-		u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]);
-		u64 end_addr;
-
-		sg->bytes += dma_len;
-
-		end_addr = dma_addr + dma_len;
-		if (dma_addr & PAGE_MASK) {
-			if (i > 0)
-				goto out_unmap;
-			dma_addr &= ~PAGE_MASK;
-		}
-		if (end_addr & PAGE_MASK) {
-			if (i < sg->dma_len - 1)
-				goto out_unmap;
-			end_addr = (end_addr + PAGE_MASK) & ~PAGE_MASK;
-		}
-
-		sg->dma_npages += (end_addr - dma_addr) >> PAGE_SHIFT;
-	}
-
-	/* Now gather the dma addrs into one list */
-	if (sg->dma_npages > fastreg_message_size)
-		goto out_unmap;
-
-
-
-	return 0;
-
-out_unmap:
-	ib_dma_unmap_sg(rds_iwdev->dev, sg->list, sg->len, DMA_BIDIRECTIONAL);
-	sg->dma_len = 0;
-	return ret;
-}
-
-
-struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *rds_iwdev)
-{
-	struct rds_iw_mr_pool *pool;
-
-	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
-	if (!pool) {
-		printk(KERN_WARNING "RDS/IW: rds_iw_create_mr_pool alloc error\n");
-		return ERR_PTR(-ENOMEM);
-	}
-
-	pool->device = rds_iwdev;
-	INIT_LIST_HEAD(&pool->dirty_list);
-	INIT_LIST_HEAD(&pool->clean_list);
-	mutex_init(&pool->flush_lock);
-	spin_lock_init(&pool->list_lock);
-	INIT_WORK(&pool->flush_worker, rds_iw_mr_pool_flush_worker);
-
-	pool->max_message_size = fastreg_message_size;
-	pool->max_items = fastreg_pool_size;
-	pool->max_free_pinned = pool->max_items * pool->max_message_size / 4;
-	pool->max_pages = fastreg_message_size;
-
-	/* We never allow more than max_items MRs to be allocated.
-	 * When we exceed more than max_items_soft, we start freeing
-	 * items more aggressively.
-	 * Make sure that max_items > max_items_soft > max_items / 2
-	 */
-	pool->max_items_soft = pool->max_items * 3 / 4;
-
-	return pool;
-}
-
-void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo)
-{
-	struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
-
-	iinfo->rdma_mr_max = pool->max_items;
-	iinfo->rdma_mr_size = pool->max_pages;
-}
-
-void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *pool)
-{
-	flush_workqueue(rds_wq);
-	rds_iw_flush_mr_pool(pool, 1);
-	BUG_ON(atomic_read(&pool->item_count));
-	BUG_ON(atomic_read(&pool->free_pinned));
-	kfree(pool);
-}
-
-static inline struct rds_iw_mr *rds_iw_reuse_fmr(struct rds_iw_mr_pool *pool)
-{
-	struct rds_iw_mr *ibmr = NULL;
-	unsigned long flags;
-
-	spin_lock_irqsave(&pool->list_lock, flags);
-	if (!list_empty(&pool->clean_list)) {
-		ibmr = list_entry(pool->clean_list.next, struct rds_iw_mr, mapping.m_list);
-		list_del_init(&ibmr->mapping.m_list);
-	}
-	spin_unlock_irqrestore(&pool->list_lock, flags);
-
-	return ibmr;
-}
-
-static struct rds_iw_mr *rds_iw_alloc_mr(struct rds_iw_device *rds_iwdev)
-{
-	struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
-	struct rds_iw_mr *ibmr = NULL;
-	int err = 0, iter = 0;
-
-	while (1) {
-		ibmr = rds_iw_reuse_fmr(pool);
-		if (ibmr)
-			return ibmr;
-
-		/* No clean MRs - now we have the choice of either
-		 * allocating a fresh MR up to the limit imposed by the
-		 * driver, or flush any dirty unused MRs.
-		 * We try to avoid stalling in the send path if possible,
-		 * so we allocate as long as we're allowed to.
-		 *
-		 * We're fussy with enforcing the FMR limit, though. If the driver
-		 * tells us we can't use more than N fmrs, we shouldn't start
-		 * arguing with it */
-		if (atomic_inc_return(&pool->item_count) <= pool->max_items)
-			break;
-
-		atomic_dec(&pool->item_count);
-
-		if (++iter > 2) {
-			rds_iw_stats_inc(s_iw_rdma_mr_pool_depleted);
-			return ERR_PTR(-EAGAIN);
-		}
-
-		/* We do have some empty MRs. Flush them out. */
-		rds_iw_stats_inc(s_iw_rdma_mr_pool_wait);
-		rds_iw_flush_mr_pool(pool, 0);
-	}
-
-	ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
-	if (!ibmr) {
-		err = -ENOMEM;
-		goto out_no_cigar;
-	}
-
-	spin_lock_init(&ibmr->mapping.m_lock);
-	INIT_LIST_HEAD(&ibmr->mapping.m_list);
-	ibmr->mapping.m_mr = ibmr;
-
-	err = rds_iw_init_reg(pool, ibmr);
-	if (err)
-		goto out_no_cigar;
-
-	rds_iw_stats_inc(s_iw_rdma_mr_alloc);
-	return ibmr;
-
-out_no_cigar:
-	if (ibmr) {
-		rds_iw_destroy_fastreg(pool, ibmr);
-		kfree(ibmr);
-	}
-	atomic_dec(&pool->item_count);
-	return ERR_PTR(err);
-}
-
-void rds_iw_sync_mr(void *trans_private, int direction)
-{
-	struct rds_iw_mr *ibmr = trans_private;
-	struct rds_iw_device *rds_iwdev = ibmr->device;
-
-	switch (direction) {
-	case DMA_FROM_DEVICE:
-		ib_dma_sync_sg_for_cpu(rds_iwdev->dev, ibmr->mapping.m_sg.list,
-			ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL);
-		break;
-	case DMA_TO_DEVICE:
-		ib_dma_sync_sg_for_device(rds_iwdev->dev, ibmr->mapping.m_sg.list,
-			ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL);
-		break;
-	}
-}
-
-/*
- * Flush our pool of MRs.
- * At a minimum, all currently unused MRs are unmapped.
- * If the number of MRs allocated exceeds the limit, we also try
- * to free as many MRs as needed to get back to this limit.
- */
-static void rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all)
-{
-	struct rds_iw_mr *ibmr, *next;
-	LIST_HEAD(unmap_list);
-	LIST_HEAD(kill_list);
-	unsigned long flags;
-	unsigned int nfreed = 0, ncleaned = 0, unpinned = 0;
-
-	rds_iw_stats_inc(s_iw_rdma_mr_pool_flush);
-
-	mutex_lock(&pool->flush_lock);
-
-	spin_lock_irqsave(&pool->list_lock, flags);
-	/* Get the list of all mappings to be destroyed */
-	list_splice_init(&pool->dirty_list, &unmap_list);
-	if (free_all)
-		list_splice_init(&pool->clean_list, &kill_list);
-	spin_unlock_irqrestore(&pool->list_lock, flags);
-
-	/* Batched invalidate of dirty MRs.
-	 * For FMR based MRs, the mappings on the unmap list are
-	 * actually members of an ibmr (ibmr->mapping). They either
-	 * migrate to the kill_list, or have been cleaned and should be
-	 * moved to the clean_list.
-	 * For fastregs, they will be dynamically allocated, and
-	 * will be destroyed by the unmap function.
-	 */
-	if (!list_empty(&unmap_list)) {
-		ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list,
-						     &kill_list, &unpinned);
-		/* If we've been asked to destroy all MRs, move those
-		 * that were simply cleaned to the kill list */
-		if (free_all)
-			list_splice_init(&unmap_list, &kill_list);
-	}
-
-	/* Destroy any MRs that are past their best before date */
-	list_for_each_entry_safe(ibmr, next, &kill_list, mapping.m_list) {
-		rds_iw_stats_inc(s_iw_rdma_mr_free);
-		list_del(&ibmr->mapping.m_list);
-		rds_iw_destroy_fastreg(pool, ibmr);
-		kfree(ibmr);
-		nfreed++;
-	}
-
-	/* Anything that remains are laundered ibmrs, which we can add
-	 * back to the clean list. */
-	if (!list_empty(&unmap_list)) {
-		spin_lock_irqsave(&pool->list_lock, flags);
-		list_splice(&unmap_list, &pool->clean_list);
-		spin_unlock_irqrestore(&pool->list_lock, flags);
-	}
-
-	atomic_sub(unpinned, &pool->free_pinned);
-	atomic_sub(ncleaned, &pool->dirty_count);
-	atomic_sub(nfreed, &pool->item_count);
-
-	mutex_unlock(&pool->flush_lock);
-}
-
-static void rds_iw_mr_pool_flush_worker(struct work_struct *work)
-{
-	struct rds_iw_mr_pool *pool = container_of(work, struct rds_iw_mr_pool, flush_worker);
-
-	rds_iw_flush_mr_pool(pool, 0);
-}
-
-void rds_iw_free_mr(void *trans_private, int invalidate)
-{
-	struct rds_iw_mr *ibmr = trans_private;
-	struct rds_iw_mr_pool *pool = ibmr->device->mr_pool;
-
-	rdsdebug("RDS/IW: free_mr nents %u\n", ibmr->mapping.m_sg.len);
-	if (!pool)
-		return;
-
-	/* Return it to the pool's free list */
-	rds_iw_free_fastreg(pool, ibmr);
-
-	/* If we've pinned too many pages, request a flush */
-	if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
-	    atomic_read(&pool->dirty_count) >= pool->max_items / 10)
-		queue_work(rds_wq, &pool->flush_worker);
-
-	if (invalidate) {
-		if (likely(!in_interrupt())) {
-			rds_iw_flush_mr_pool(pool, 0);
-		} else {
-			/* We get here if the user created a MR marked
-			 * as use_once and invalidate at the same time. */
-			queue_work(rds_wq, &pool->flush_worker);
-		}
-	}
-}
-
-void rds_iw_flush_mrs(void)
-{
-	struct rds_iw_device *rds_iwdev;
-
-	list_for_each_entry(rds_iwdev, &rds_iw_devices, list) {
-		struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
-
-		if (pool)
-			rds_iw_flush_mr_pool(pool, 0);
-	}
-}
-
-void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
-		    struct rds_sock *rs, u32 *key_ret)
-{
-	struct rds_iw_device *rds_iwdev;
-	struct rds_iw_mr *ibmr = NULL;
-	struct rdma_cm_id *cm_id;
-	struct sockaddr_in src = {
-		.sin_addr.s_addr = rs->rs_bound_addr,
-		.sin_port = rs->rs_bound_port,
-	};
-	struct sockaddr_in dst = {
-		.sin_addr.s_addr = rs->rs_conn_addr,
-		.sin_port = rs->rs_conn_port,
-	};
-	int ret;
-
-	ret = rds_iw_get_device(&src, &dst, &rds_iwdev, &cm_id);
-	if (ret || !cm_id) {
-		ret = -ENODEV;
-		goto out;
-	}
-
-	if (!rds_iwdev->mr_pool) {
-		ret = -ENODEV;
-		goto out;
-	}
-
-	ibmr = rds_iw_alloc_mr(rds_iwdev);
-	if (IS_ERR(ibmr))
-		return ibmr;
-
-	ibmr->cm_id = cm_id;
-	ibmr->device = rds_iwdev;
-
-	ret = rds_iw_map_reg(rds_iwdev->mr_pool, ibmr, sg, nents);
-	if (ret == 0)
-		*key_ret = ibmr->mr->rkey;
-	else
-		printk(KERN_WARNING "RDS/IW: failed to map mr (errno=%d)\n", ret);
-
-out:
-	if (ret) {
-		if (ibmr)
-			rds_iw_free_mr(ibmr, 0);
-		ibmr = ERR_PTR(ret);
-	}
-	return ibmr;
-}
-
-/*
- * iWARP reg handling
- *
- * The life cycle of a fastreg registration is a bit different from
- * FMRs.
- * The idea behind fastreg is to have one MR, to which we bind different
- * mappings over time. To avoid stalling on the expensive map and invalidate
- * operations, these operations are pipelined on the same send queue on
- * which we want to send the message containing the r_key.
- *
- * This creates a bit of a problem for us, as we do not have the destination
- * IP in GET_MR, so the connection must be setup prior to the GET_MR call for
- * RDMA to be correctly setup.  If a fastreg request is present, rds_iw_xmit
- * will try to queue a LOCAL_INV (if needed) and a REG_MR work request
- * before queuing the SEND. When completions for these arrive, they are
- * dispatched to the MR has a bit set showing that RDMa can be performed.
- *
- * There is another interesting aspect that's related to invalidation.
- * The application can request that a mapping is invalidated in FREE_MR.
- * The expectation there is that this invalidation step includes ALL
- * PREVIOUSLY FREED MRs.
- */
-static int rds_iw_init_reg(struct rds_iw_mr_pool *pool,
-			   struct rds_iw_mr *ibmr)
-{
-	struct rds_iw_device *rds_iwdev = pool->device;
-	struct ib_mr *mr;
-	int err;
-
-	mr = ib_alloc_mr(rds_iwdev->pd, IB_MR_TYPE_MEM_REG,
-			 pool->max_message_size);
-	if (IS_ERR(mr)) {
-		err = PTR_ERR(mr);
-
-		printk(KERN_WARNING "RDS/IW: ib_alloc_mr failed (err=%d)\n", err);
-		return err;
-	}
-
-	ibmr->mr = mr;
-	return 0;
-}
-
-static int rds_iw_rdma_reg_mr(struct rds_iw_mapping *mapping)
-{
-	struct rds_iw_mr *ibmr = mapping->m_mr;
-	struct rds_iw_scatterlist *m_sg = &mapping->m_sg;
-	struct ib_reg_wr reg_wr;
-	struct ib_send_wr *failed_wr;
-	int ret, n;
-
-	n = ib_map_mr_sg_zbva(ibmr->mr, m_sg->list, m_sg->len, PAGE_SIZE);
-	if (unlikely(n != m_sg->len))
-		return n < 0 ? n : -EINVAL;
-
-	reg_wr.wr.next = NULL;
-	reg_wr.wr.opcode = IB_WR_REG_MR;
-	reg_wr.wr.wr_id = RDS_IW_REG_WR_ID;
-	reg_wr.wr.num_sge = 0;
-	reg_wr.mr = ibmr->mr;
-	reg_wr.key = mapping->m_rkey;
-	reg_wr.access = IB_ACCESS_LOCAL_WRITE |
-			IB_ACCESS_REMOTE_READ |
-			IB_ACCESS_REMOTE_WRITE;
-
-	/*
-	 * Perform a WR for the reg_mr. Each individual page
-	 * in the sg list is added to the fast reg page list and placed
-	 * inside the reg_mr WR.  The key used is a rolling 8bit
-	 * counter, which should guarantee uniqueness.
-	 */
-	ib_update_fast_reg_key(ibmr->mr, ibmr->remap_count++);
-	mapping->m_rkey = ibmr->mr->rkey;
-
-	failed_wr = &reg_wr.wr;
-	ret = ib_post_send(ibmr->cm_id->qp, &reg_wr.wr, &failed_wr);
-	BUG_ON(failed_wr != &reg_wr.wr);
-	if (ret)
-		printk_ratelimited(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
-			__func__, __LINE__, ret);
-	return ret;
-}
-
-static int rds_iw_rdma_fastreg_inv(struct rds_iw_mr *ibmr)
-{
-	struct ib_send_wr s_wr, *failed_wr;
-	int ret = 0;
-
-	if (!ibmr->cm_id->qp || !ibmr->mr)
-		goto out;
-
-	memset(&s_wr, 0, sizeof(s_wr));
-	s_wr.wr_id = RDS_IW_LOCAL_INV_WR_ID;
-	s_wr.opcode = IB_WR_LOCAL_INV;
-	s_wr.ex.invalidate_rkey = ibmr->mr->rkey;
-	s_wr.send_flags = IB_SEND_SIGNALED;
-
-	failed_wr = &s_wr;
-	ret = ib_post_send(ibmr->cm_id->qp, &s_wr, &failed_wr);
-	if (ret) {
-		printk_ratelimited(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
-			__func__, __LINE__, ret);
-		goto out;
-	}
-out:
-	return ret;
-}
-
-static int rds_iw_map_reg(struct rds_iw_mr_pool *pool,
-			  struct rds_iw_mr *ibmr,
-			  struct scatterlist *sg,
-			  unsigned int sg_len)
-{
-	struct rds_iw_device *rds_iwdev = pool->device;
-	struct rds_iw_mapping *mapping = &ibmr->mapping;
-	u64 *dma_pages;
-	int ret = 0;
-
-	rds_iw_set_scatterlist(&mapping->m_sg, sg, sg_len);
-
-	ret = rds_iw_map_scatterlist(rds_iwdev, &mapping->m_sg);
-	if (ret) {
-		dma_pages = NULL;
-		goto out;
-	}
-
-	if (mapping->m_sg.dma_len > pool->max_message_size) {
-		ret = -EMSGSIZE;
-		goto out;
-	}
-
-	ret = rds_iw_rdma_reg_mr(mapping);
-	if (ret)
-		goto out;
-
-	rds_iw_stats_inc(s_iw_rdma_mr_used);
-
-out:
-	kfree(dma_pages);
-
-	return ret;
-}
-
-/*
- * "Free" a fastreg MR.
- */
-static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool,
-		struct rds_iw_mr *ibmr)
-{
-	unsigned long flags;
-	int ret;
-
-	if (!ibmr->mapping.m_sg.dma_len)
-		return;
-
-	ret = rds_iw_rdma_fastreg_inv(ibmr);
-	if (ret)
-		return;
-
-	/* Try to post the LOCAL_INV WR to the queue. */
-	spin_lock_irqsave(&pool->list_lock, flags);
-
-	list_add_tail(&ibmr->mapping.m_list, &pool->dirty_list);
-	atomic_add(ibmr->mapping.m_sg.len, &pool->free_pinned);
-	atomic_inc(&pool->dirty_count);
-
-	spin_unlock_irqrestore(&pool->list_lock, flags);
-}
-
-static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
-				struct list_head *unmap_list,
-				struct list_head *kill_list,
-				int *unpinned)
-{
-	struct rds_iw_mapping *mapping, *next;
-	unsigned int ncleaned = 0;
-	LIST_HEAD(laundered);
-
-	/* Batched invalidation of fastreg MRs.
-	 * Why do we do it this way, even though we could pipeline unmap
-	 * and remap? The reason is the application semantics - when the
-	 * application requests an invalidation of MRs, it expects all
-	 * previously released R_Keys to become invalid.
-	 *
-	 * If we implement MR reuse naively, we risk memory corruption
-	 * (this has actually been observed). So the default behavior
-	 * requires that a MR goes through an explicit unmap operation before
-	 * we can reuse it again.
-	 *
-	 * We could probably improve on this a little, by allowing immediate
-	 * reuse of a MR on the same socket (eg you could add small
-	 * cache of unused MRs to strct rds_socket - GET_MR could grab one
-	 * of these without requiring an explicit invalidate).
-	 */
-	while (!list_empty(unmap_list)) {
-		unsigned long flags;
-
-		spin_lock_irqsave(&pool->list_lock, flags);
-		list_for_each_entry_safe(mapping, next, unmap_list, m_list) {
-			*unpinned += mapping->m_sg.len;
-			list_move(&mapping->m_list, &laundered);
-			ncleaned++;
-		}
-		spin_unlock_irqrestore(&pool->list_lock, flags);
-	}
-
-	/* Move all laundered mappings back to the unmap list.
-	 * We do not kill any WRs right now - it doesn't seem the
-	 * fastreg API has a max_remap limit. */
-	list_splice_init(&laundered, unmap_list);
-
-	return ncleaned;
-}
-
-static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool,
-		struct rds_iw_mr *ibmr)
-{
-	if (ibmr->mr)
-		ib_dereg_mr(ibmr->mr);
-}
diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c
deleted file mode 100644
index a66d1794b2d0..000000000000
--- a/net/rds/iw_recv.c
+++ /dev/null
@@ -1,904 +0,0 @@
-/*
- * Copyright (c) 2006 Oracle.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/pci.h>
-#include <linux/dma-mapping.h>
-#include <rdma/rdma_cm.h>
-
-#include "rds.h"
-#include "iw.h"
-
-static struct kmem_cache *rds_iw_incoming_slab;
-static struct kmem_cache *rds_iw_frag_slab;
-static atomic_t	rds_iw_allocation = ATOMIC_INIT(0);
-
-static void rds_iw_frag_drop_page(struct rds_page_frag *frag)
-{
-	rdsdebug("frag %p page %p\n", frag, frag->f_page);
-	__free_page(frag->f_page);
-	frag->f_page = NULL;
-}
-
-static void rds_iw_frag_free(struct rds_page_frag *frag)
-{
-	rdsdebug("frag %p page %p\n", frag, frag->f_page);
-	BUG_ON(frag->f_page);
-	kmem_cache_free(rds_iw_frag_slab, frag);
-}
-
-/*
- * We map a page at a time.  Its fragments are posted in order.  This
- * is called in fragment order as the fragments get send completion events.
- * Only the last frag in the page performs the unmapping.
- *
- * It's OK for ring cleanup to call this in whatever order it likes because
- * DMA is not in flight and so we can unmap while other ring entries still
- * hold page references in their frags.
- */
-static void rds_iw_recv_unmap_page(struct rds_iw_connection *ic,
-				   struct rds_iw_recv_work *recv)
-{
-	struct rds_page_frag *frag = recv->r_frag;
-
-	rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
-	if (frag->f_mapped)
-		ib_dma_unmap_page(ic->i_cm_id->device,
-			       frag->f_mapped,
-			       RDS_FRAG_SIZE, DMA_FROM_DEVICE);
-	frag->f_mapped = 0;
-}
-
-void rds_iw_recv_init_ring(struct rds_iw_connection *ic)
-{
-	struct rds_iw_recv_work *recv;
-	u32 i;
-
-	for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) {
-		struct ib_sge *sge;
-
-		recv->r_iwinc = NULL;
-		recv->r_frag = NULL;
-
-		recv->r_wr.next = NULL;
-		recv->r_wr.wr_id = i;
-		recv->r_wr.sg_list = recv->r_sge;
-		recv->r_wr.num_sge = RDS_IW_RECV_SGE;
-
-		sge = rds_iw_data_sge(ic, recv->r_sge);
-		sge->addr = 0;
-		sge->length = RDS_FRAG_SIZE;
-		sge->lkey = 0;
-
-		sge = rds_iw_header_sge(ic, recv->r_sge);
-		sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
-		sge->length = sizeof(struct rds_header);
-		sge->lkey = 0;
-	}
-}
-
-static void rds_iw_recv_clear_one(struct rds_iw_connection *ic,
-				  struct rds_iw_recv_work *recv)
-{
-	if (recv->r_iwinc) {
-		rds_inc_put(&recv->r_iwinc->ii_inc);
-		recv->r_iwinc = NULL;
-	}
-	if (recv->r_frag) {
-		rds_iw_recv_unmap_page(ic, recv);
-		if (recv->r_frag->f_page)
-			rds_iw_frag_drop_page(recv->r_frag);
-		rds_iw_frag_free(recv->r_frag);
-		recv->r_frag = NULL;
-	}
-}
-
-void rds_iw_recv_clear_ring(struct rds_iw_connection *ic)
-{
-	u32 i;
-
-	for (i = 0; i < ic->i_recv_ring.w_nr; i++)
-		rds_iw_recv_clear_one(ic, &ic->i_recvs[i]);
-
-	if (ic->i_frag.f_page)
-		rds_iw_frag_drop_page(&ic->i_frag);
-}
-
-static int rds_iw_recv_refill_one(struct rds_connection *conn,
-				  struct rds_iw_recv_work *recv,
-				  gfp_t kptr_gfp, gfp_t page_gfp)
-{
-	struct rds_iw_connection *ic = conn->c_transport_data;
-	dma_addr_t dma_addr;
-	struct ib_sge *sge;
-	int ret = -ENOMEM;
-
-	if (!recv->r_iwinc) {
-		if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) {
-			rds_iw_stats_inc(s_iw_rx_alloc_limit);
-			goto out;
-		}
-		recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab,
-						 kptr_gfp);
-		if (!recv->r_iwinc) {
-			atomic_dec(&rds_iw_allocation);
-			goto out;
-		}
-		INIT_LIST_HEAD(&recv->r_iwinc->ii_frags);
-		rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr);
-	}
-
-	if (!recv->r_frag) {
-		recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp);
-		if (!recv->r_frag)
-			goto out;
-		INIT_LIST_HEAD(&recv->r_frag->f_item);
-		recv->r_frag->f_page = NULL;
-	}
-
-	if (!ic->i_frag.f_page) {
-		ic->i_frag.f_page = alloc_page(page_gfp);
-		if (!ic->i_frag.f_page)
-			goto out;
-		ic->i_frag.f_offset = 0;
-	}
-
-	dma_addr = ib_dma_map_page(ic->i_cm_id->device,
-				  ic->i_frag.f_page,
-				  ic->i_frag.f_offset,
-				  RDS_FRAG_SIZE,
-				  DMA_FROM_DEVICE);
-	if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
-		goto out;
-
-	/*
-	 * Once we get the RDS_PAGE_LAST_OFF frag then rds_iw_frag_unmap()
-	 * must be called on this recv.  This happens as completions hit
-	 * in order or on connection shutdown.
-	 */
-	recv->r_frag->f_page = ic->i_frag.f_page;
-	recv->r_frag->f_offset = ic->i_frag.f_offset;
-	recv->r_frag->f_mapped = dma_addr;
-
-	sge = rds_iw_data_sge(ic, recv->r_sge);
-	sge->addr = dma_addr;
-	sge->length = RDS_FRAG_SIZE;
-
-	sge = rds_iw_header_sge(ic, recv->r_sge);
-	sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
-	sge->length = sizeof(struct rds_header);
-
-	get_page(recv->r_frag->f_page);
-
-	if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) {
-		ic->i_frag.f_offset += RDS_FRAG_SIZE;
-	} else {
-		put_page(ic->i_frag.f_page);
-		ic->i_frag.f_page = NULL;
-		ic->i_frag.f_offset = 0;
-	}
-
-	ret = 0;
-out:
-	return ret;
-}
-
-/*
- * This tries to allocate and post unused work requests after making sure that
- * they have all the allocations they need to queue received fragments into
- * sockets.  The i_recv_mutex is held here so that ring_alloc and _unalloc
- * pairs don't go unmatched.
- *
- * -1 is returned if posting fails due to temporary resource exhaustion.
- */
-int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
-		       gfp_t page_gfp, int prefill)
-{
-	struct rds_iw_connection *ic = conn->c_transport_data;
-	struct rds_iw_recv_work *recv;
-	struct ib_recv_wr *failed_wr;
-	unsigned int posted = 0;
-	int ret = 0;
-	u32 pos;
-
-	while ((prefill || rds_conn_up(conn)) &&
-	       rds_iw_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
-		if (pos >= ic->i_recv_ring.w_nr) {
-			printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
-					pos);
-			ret = -EINVAL;
-			break;
-		}
-
-		recv = &ic->i_recvs[pos];
-		ret = rds_iw_recv_refill_one(conn, recv, kptr_gfp, page_gfp);
-		if (ret) {
-			ret = -1;
-			break;
-		}
-
-		/* XXX when can this fail? */
-		ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
-		rdsdebug("recv %p iwinc %p page %p addr %lu ret %d\n", recv,
-			 recv->r_iwinc, recv->r_frag->f_page,
-			 (long) recv->r_frag->f_mapped, ret);
-		if (ret) {
-			rds_iw_conn_error(conn, "recv post on "
-			       "%pI4 returned %d, disconnecting and "
-			       "reconnecting\n", &conn->c_faddr,
-			       ret);
-			ret = -1;
-			break;
-		}
-
-		posted++;
-	}
-
-	/* We're doing flow control - update the window. */
-	if (ic->i_flowctl && posted)
-		rds_iw_advertise_credits(conn, posted);
-
-	if (ret)
-		rds_iw_ring_unalloc(&ic->i_recv_ring, 1);
-	return ret;
-}
-
-static void rds_iw_inc_purge(struct rds_incoming *inc)
-{
-	struct rds_iw_incoming *iwinc;
-	struct rds_page_frag *frag;
-	struct rds_page_frag *pos;
-
-	iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
-	rdsdebug("purging iwinc %p inc %p\n", iwinc, inc);
-
-	list_for_each_entry_safe(frag, pos, &iwinc->ii_frags, f_item) {
-		list_del_init(&frag->f_item);
-		rds_iw_frag_drop_page(frag);
-		rds_iw_frag_free(frag);
-	}
-}
-
-void rds_iw_inc_free(struct rds_incoming *inc)
-{
-	struct rds_iw_incoming *iwinc;
-
-	iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
-
-	rds_iw_inc_purge(inc);
-	rdsdebug("freeing iwinc %p inc %p\n", iwinc, inc);
-	BUG_ON(!list_empty(&iwinc->ii_frags));
-	kmem_cache_free(rds_iw_incoming_slab, iwinc);
-	atomic_dec(&rds_iw_allocation);
-	BUG_ON(atomic_read(&rds_iw_allocation) < 0);
-}
-
-int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
-{
-	struct rds_iw_incoming *iwinc;
-	struct rds_page_frag *frag;
-	unsigned long to_copy;
-	unsigned long frag_off = 0;
-	int copied = 0;
-	int ret;
-	u32 len;
-
-	iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
-	frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item);
-	len = be32_to_cpu(inc->i_hdr.h_len);
-
-	while (iov_iter_count(to) && copied < len) {
-		if (frag_off == RDS_FRAG_SIZE) {
-			frag = list_entry(frag->f_item.next,
-					  struct rds_page_frag, f_item);
-			frag_off = 0;
-		}
-		to_copy = min_t(unsigned long, iov_iter_count(to),
-				RDS_FRAG_SIZE - frag_off);
-		to_copy = min_t(unsigned long, to_copy, len - copied);
-
-		/* XXX needs + offset for multiple recvs per page */
-		rds_stats_add(s_copy_to_user, to_copy);
-		ret = copy_page_to_iter(frag->f_page,
-					frag->f_offset + frag_off,
-					to_copy,
-					to);
-		if (ret != to_copy)
-			return -EFAULT;
-
-		frag_off += to_copy;
-		copied += to_copy;
-	}
-
-	return copied;
-}
-
-/* ic starts out kzalloc()ed */
-void rds_iw_recv_init_ack(struct rds_iw_connection *ic)
-{
-	struct ib_send_wr *wr = &ic->i_ack_wr;
-	struct ib_sge *sge = &ic->i_ack_sge;
-
-	sge->addr = ic->i_ack_dma;
-	sge->length = sizeof(struct rds_header);
-	sge->lkey = rds_iw_local_dma_lkey(ic);
-
-	wr->sg_list = sge;
-	wr->num_sge = 1;
-	wr->opcode = IB_WR_SEND;
-	wr->wr_id = RDS_IW_ACK_WR_ID;
-	wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
-}
-
-/*
- * You'd think that with reliable IB connections you wouldn't need to ack
- * messages that have been received.  The problem is that IB hardware generates
- * an ack message before it has DMAed the message into memory.  This creates a
- * potential message loss if the HCA is disabled for any reason between when it
- * sends the ack and before the message is DMAed and processed.  This is only a
- * potential issue if another HCA is available for fail-over.
- *
- * When the remote host receives our ack they'll free the sent message from
- * their send queue.  To decrease the latency of this we always send an ack
- * immediately after we've received messages.
- *
- * For simplicity, we only have one ack in flight at a time.  This puts
- * pressure on senders to have deep enough send queues to absorb the latency of
- * a single ack frame being in flight.  This might not be good enough.
- *
- * This is implemented by have a long-lived send_wr and sge which point to a
- * statically allocated ack frame.  This ack wr does not fall under the ring
- * accounting that the tx and rx wrs do.  The QP attribute specifically makes
- * room for it beyond the ring size.  Send completion notices its special
- * wr_id and avoids working with the ring in that case.
- */
-#ifndef KERNEL_HAS_ATOMIC64
-static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq,
-				int ack_required)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&ic->i_ack_lock, flags);
-	ic->i_ack_next = seq;
-	if (ack_required)
-		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
-	spin_unlock_irqrestore(&ic->i_ack_lock, flags);
-}
-
-static u64 rds_iw_get_ack(struct rds_iw_connection *ic)
-{
-	unsigned long flags;
-	u64 seq;
-
-	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
-
-	spin_lock_irqsave(&ic->i_ack_lock, flags);
-	seq = ic->i_ack_next;
-	spin_unlock_irqrestore(&ic->i_ack_lock, flags);
-
-	return seq;
-}
-#else
-static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq,
-				int ack_required)
-{
-	atomic64_set(&ic->i_ack_next, seq);
-	if (ack_required) {
-		smp_mb__before_atomic();
-		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
-	}
-}
-
-static u64 rds_iw_get_ack(struct rds_iw_connection *ic)
-{
-	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
-	smp_mb__after_atomic();
-
-	return atomic64_read(&ic->i_ack_next);
-}
-#endif
-
-
-static void rds_iw_send_ack(struct rds_iw_connection *ic, unsigned int adv_credits)
-{
-	struct rds_header *hdr = ic->i_ack;
-	struct ib_send_wr *failed_wr;
-	u64 seq;
-	int ret;
-
-	seq = rds_iw_get_ack(ic);
-
-	rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq);
-	rds_message_populate_header(hdr, 0, 0, 0);
-	hdr->h_ack = cpu_to_be64(seq);
-	hdr->h_credit = adv_credits;
-	rds_message_make_checksum(hdr);
-	ic->i_ack_queued = jiffies;
-
-	ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr);
-	if (unlikely(ret)) {
-		/* Failed to send. Release the WR, and
-		 * force another ACK.
-		 */
-		clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
-		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
-
-		rds_iw_stats_inc(s_iw_ack_send_failure);
-
-		rds_iw_conn_error(ic->conn, "sending ack failed\n");
-	} else
-		rds_iw_stats_inc(s_iw_ack_sent);
-}
-
-/*
- * There are 3 ways of getting acknowledgements to the peer:
- *  1.	We call rds_iw_attempt_ack from the recv completion handler
- *	to send an ACK-only frame.
- *	However, there can be only one such frame in the send queue
- *	at any time, so we may have to postpone it.
- *  2.	When another (data) packet is transmitted while there's
- *	an ACK in the queue, we piggyback the ACK sequence number
- *	on the data packet.
- *  3.	If the ACK WR is done sending, we get called from the
- *	send queue completion handler, and check whether there's
- *	another ACK pending (postponed because the WR was on the
- *	queue). If so, we transmit it.
- *
- * We maintain 2 variables:
- *  -	i_ack_flags, which keeps track of whether the ACK WR
- *	is currently in the send queue or not (IB_ACK_IN_FLIGHT)
- *  -	i_ack_next, which is the last sequence number we received
- *
- * Potentially, send queue and receive queue handlers can run concurrently.
- * It would be nice to not have to use a spinlock to synchronize things,
- * but the one problem that rules this out is that 64bit updates are
- * not atomic on all platforms. Things would be a lot simpler if
- * we had atomic64 or maybe cmpxchg64 everywhere.
- *
- * Reconnecting complicates this picture just slightly. When we
- * reconnect, we may be seeing duplicate packets. The peer
- * is retransmitting them, because it hasn't seen an ACK for
- * them. It is important that we ACK these.
- *
- * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with
- * this flag set *MUST* be acknowledged immediately.
- */
-
-/*
- * When we get here, we're called from the recv queue handler.
- * Check whether we ought to transmit an ACK.
- */
-void rds_iw_attempt_ack(struct rds_iw_connection *ic)
-{
-	unsigned int adv_credits;
-
-	if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
-		return;
-
-	if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
-		rds_iw_stats_inc(s_iw_ack_send_delayed);
-		return;
-	}
-
-	/* Can we get a send credit? */
-	if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) {
-		rds_iw_stats_inc(s_iw_tx_throttle);
-		clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
-		return;
-	}
-
-	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
-	rds_iw_send_ack(ic, adv_credits);
-}
-
-/*
- * We get here from the send completion handler, when the
- * adapter tells us the ACK frame was sent.
- */
-void rds_iw_ack_send_complete(struct rds_iw_connection *ic)
-{
-	clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
-	rds_iw_attempt_ack(ic);
-}
-
-/*
- * This is called by the regular xmit code when it wants to piggyback
- * an ACK on an outgoing frame.
- */
-u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic)
-{
-	if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
-		rds_iw_stats_inc(s_iw_ack_send_piggybacked);
-	return rds_iw_get_ack(ic);
-}
-
-/*
- * It's kind of lame that we're copying from the posted receive pages into
- * long-lived bitmaps.  We could have posted the bitmaps and rdma written into
- * them.  But receiving new congestion bitmaps should be a *rare* event, so
- * hopefully we won't need to invest that complexity in making it more
- * efficient.  By copying we can share a simpler core with TCP which has to
- * copy.
- */
-static void rds_iw_cong_recv(struct rds_connection *conn,
-			      struct rds_iw_incoming *iwinc)
-{
-	struct rds_cong_map *map;
-	unsigned int map_off;
-	unsigned int map_page;
-	struct rds_page_frag *frag;
-	unsigned long frag_off;
-	unsigned long to_copy;
-	unsigned long copied;
-	uint64_t uncongested = 0;
-	void *addr;
-
-	/* catch completely corrupt packets */
-	if (be32_to_cpu(iwinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
-		return;
-
-	map = conn->c_fcong;
-	map_page = 0;
-	map_off = 0;
-
-	frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item);
-	frag_off = 0;
-
-	copied = 0;
-
-	while (copied < RDS_CONG_MAP_BYTES) {
-		uint64_t *src, *dst;
-		unsigned int k;
-
-		to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
-		BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
-
-		addr = kmap_atomic(frag->f_page);
-
-		src = addr + frag_off;
-		dst = (void *)map->m_page_addrs[map_page] + map_off;
-		for (k = 0; k < to_copy; k += 8) {
-			/* Record ports that became uncongested, ie
-			 * bits that changed from 0 to 1. */
-			uncongested |= ~(*src) & *dst;
-			*dst++ = *src++;
-		}
-		kunmap_atomic(addr);
-
-		copied += to_copy;
-
-		map_off += to_copy;
-		if (map_off == PAGE_SIZE) {
-			map_off = 0;
-			map_page++;
-		}
-
-		frag_off += to_copy;
-		if (frag_off == RDS_FRAG_SIZE) {
-			frag = list_entry(frag->f_item.next,
-					  struct rds_page_frag, f_item);
-			frag_off = 0;
-		}
-	}
-
-	/* the congestion map is in little endian order */
-	uncongested = le64_to_cpu(uncongested);
-
-	rds_cong_map_updated(map, uncongested);
-}
-
-/*
- * Rings are posted with all the allocations they'll need to queue the
- * incoming message to the receiving socket so this can't fail.
- * All fragments start with a header, so we can make sure we're not receiving
- * garbage, and we can tell a small 8 byte fragment from an ACK frame.
- */
-struct rds_iw_ack_state {
-	u64		ack_next;
-	u64		ack_recv;
-	unsigned int	ack_required:1;
-	unsigned int	ack_next_valid:1;
-	unsigned int	ack_recv_valid:1;
-};
-
-static void rds_iw_process_recv(struct rds_connection *conn,
-				struct rds_iw_recv_work *recv, u32 byte_len,
-				struct rds_iw_ack_state *state)
-{
-	struct rds_iw_connection *ic = conn->c_transport_data;
-	struct rds_iw_incoming *iwinc = ic->i_iwinc;
-	struct rds_header *ihdr, *hdr;
-
-	/* XXX shut down the connection if port 0,0 are seen? */
-
-	rdsdebug("ic %p iwinc %p recv %p byte len %u\n", ic, iwinc, recv,
-		 byte_len);
-
-	if (byte_len < sizeof(struct rds_header)) {
-		rds_iw_conn_error(conn, "incoming message "
-		       "from %pI4 didn't include a "
-		       "header, disconnecting and "
-		       "reconnecting\n",
-		       &conn->c_faddr);
-		return;
-	}
-	byte_len -= sizeof(struct rds_header);
-
-	ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
-
-	/* Validate the checksum. */
-	if (!rds_message_verify_checksum(ihdr)) {
-		rds_iw_conn_error(conn, "incoming message "
-		       "from %pI4 has corrupted header - "
-		       "forcing a reconnect\n",
-		       &conn->c_faddr);
-		rds_stats_inc(s_recv_drop_bad_checksum);
-		return;
-	}
-
-	/* Process the ACK sequence which comes with every packet */
-	state->ack_recv = be64_to_cpu(ihdr->h_ack);
-	state->ack_recv_valid = 1;
-
-	/* Process the credits update if there was one */
-	if (ihdr->h_credit)
-		rds_iw_send_add_credits(conn, ihdr->h_credit);
-
-	if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) {
-		/* This is an ACK-only packet. The fact that it gets
-		 * special treatment here is that historically, ACKs
-		 * were rather special beasts.
-		 */
-		rds_iw_stats_inc(s_iw_ack_received);
-
-		/*
-		 * Usually the frags make their way on to incs and are then freed as
-		 * the inc is freed.  We don't go that route, so we have to drop the
-		 * page ref ourselves.  We can't just leave the page on the recv
-		 * because that confuses the dma mapping of pages and each recv's use
-		 * of a partial page.  We can leave the frag, though, it will be
-		 * reused.
-		 *
-		 * FIXME: Fold this into the code path below.
-		 */
-		rds_iw_frag_drop_page(recv->r_frag);
-		return;
-	}
-
-	/*
-	 * If we don't already have an inc on the connection then this
-	 * fragment has a header and starts a message.. copy its header
-	 * into the inc and save the inc so we can hang upcoming fragments
-	 * off its list.
-	 */
-	if (!iwinc) {
-		iwinc = recv->r_iwinc;
-		recv->r_iwinc = NULL;
-		ic->i_iwinc = iwinc;
-
-		hdr = &iwinc->ii_inc.i_hdr;
-		memcpy(hdr, ihdr, sizeof(*hdr));
-		ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
-
-		rdsdebug("ic %p iwinc %p rem %u flag 0x%x\n", ic, iwinc,
-			 ic->i_recv_data_rem, hdr->h_flags);
-	} else {
-		hdr = &iwinc->ii_inc.i_hdr;
-		/* We can't just use memcmp here; fragments of a
-		 * single message may carry different ACKs */
-		if (hdr->h_sequence != ihdr->h_sequence ||
-		    hdr->h_len != ihdr->h_len ||
-		    hdr->h_sport != ihdr->h_sport ||
-		    hdr->h_dport != ihdr->h_dport) {
-			rds_iw_conn_error(conn,
-				"fragment header mismatch; forcing reconnect\n");
-			return;
-		}
-	}
-
-	list_add_tail(&recv->r_frag->f_item, &iwinc->ii_frags);
-	recv->r_frag = NULL;
-
-	if (ic->i_recv_data_rem > RDS_FRAG_SIZE)
-		ic->i_recv_data_rem -= RDS_FRAG_SIZE;
-	else {
-		ic->i_recv_data_rem = 0;
-		ic->i_iwinc = NULL;
-
-		if (iwinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
-			rds_iw_cong_recv(conn, iwinc);
-		else {
-			rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
-					  &iwinc->ii_inc, GFP_ATOMIC);
-			state->ack_next = be64_to_cpu(hdr->h_sequence);
-			state->ack_next_valid = 1;
-		}
-
-		/* Evaluate the ACK_REQUIRED flag *after* we received
-		 * the complete frame, and after bumping the next_rx
-		 * sequence. */
-		if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) {
-			rds_stats_inc(s_recv_ack_required);
-			state->ack_required = 1;
-		}
-
-		rds_inc_put(&iwinc->ii_inc);
-	}
-}
-
-/*
- * Plucking the oldest entry from the ring can be done concurrently with
- * the thread refilling the ring.  Each ring operation is protected by
- * spinlocks and the transient state of refilling doesn't change the
- * recording of which entry is oldest.
- *
- * This relies on IB only calling one cq comp_handler for each cq so that
- * there will only be one caller of rds_recv_incoming() per RDS connection.
- */
-void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context)
-{
-	struct rds_connection *conn = context;
-	struct rds_iw_connection *ic = conn->c_transport_data;
-
-	rdsdebug("conn %p cq %p\n", conn, cq);
-
-	rds_iw_stats_inc(s_iw_rx_cq_call);
-
-	tasklet_schedule(&ic->i_recv_tasklet);
-}
-
-static inline void rds_poll_cq(struct rds_iw_connection *ic,
-			       struct rds_iw_ack_state *state)
-{
-	struct rds_connection *conn = ic->conn;
-	struct ib_wc wc;
-	struct rds_iw_recv_work *recv;
-
-	while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) {
-		rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
-			 (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
-			 be32_to_cpu(wc.ex.imm_data));
-		rds_iw_stats_inc(s_iw_rx_cq_event);
-
-		recv = &ic->i_recvs[rds_iw_ring_oldest(&ic->i_recv_ring)];
-
-		rds_iw_recv_unmap_page(ic, recv);
-
-		/*
-		 * Also process recvs in connecting state because it is possible
-		 * to get a recv completion _before_ the rdmacm ESTABLISHED
-		 * event is processed.
-		 */
-		if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
-			/* We expect errors as the qp is drained during shutdown */
-			if (wc.status == IB_WC_SUCCESS) {
-				rds_iw_process_recv(conn, recv, wc.byte_len, state);
-			} else {
-				rds_iw_conn_error(conn, "recv completion on "
-				       "%pI4 had status %u, disconnecting and "
-				       "reconnecting\n", &conn->c_faddr,
-				       wc.status);
-			}
-		}
-
-		rds_iw_ring_free(&ic->i_recv_ring, 1);
-	}
-}
-
-void rds_iw_recv_tasklet_fn(unsigned long data)
-{
-	struct rds_iw_connection *ic = (struct rds_iw_connection *) data;
-	struct rds_connection *conn = ic->conn;
-	struct rds_iw_ack_state state = { 0, };
-
-	rds_poll_cq(ic, &state);
-	ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
-	rds_poll_cq(ic, &state);
-
-	if (state.ack_next_valid)
-		rds_iw_set_ack(ic, state.ack_next, state.ack_required);
-	if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
-		rds_send_drop_acked(conn, state.ack_recv, NULL);
-		ic->i_ack_recv = state.ack_recv;
-	}
-	if (rds_conn_up(conn))
-		rds_iw_attempt_ack(ic);
-
-	/* If we ever end up with a really empty receive ring, we're
-	 * in deep trouble, as the sender will definitely see RNR
-	 * timeouts. */
-	if (rds_iw_ring_empty(&ic->i_recv_ring))
-		rds_iw_stats_inc(s_iw_rx_ring_empty);
-
-	/*
-	 * If the ring is running low, then schedule the thread to refill.
-	 */
-	if (rds_iw_ring_low(&ic->i_recv_ring))
-		queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
-}
-
-int rds_iw_recv(struct rds_connection *conn)
-{
-	struct rds_iw_connection *ic = conn->c_transport_data;
-	int ret = 0;
-
-	rdsdebug("conn %p\n", conn);
-
-	/*
-	 * If we get a temporary posting failure in this context then
-	 * we're really low and we want the caller to back off for a bit.
-	 */
-	mutex_lock(&ic->i_recv_mutex);
-	if (rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0))
-		ret = -ENOMEM;
-	else
-		rds_iw_stats_inc(s_iw_rx_refill_from_thread);
-	mutex_unlock(&ic->i_recv_mutex);
-
-	if (rds_conn_up(conn))
-		rds_iw_attempt_ack(ic);
-
-	return ret;
-}
-
-int rds_iw_recv_init(void)
-{
-	struct sysinfo si;
-	int ret = -ENOMEM;
-
-	/* Default to 30% of all available RAM for recv memory */
-	si_meminfo(&si);
-	rds_iw_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE;
-
-	rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming",
-					sizeof(struct rds_iw_incoming),
-					0, 0, NULL);
-	if (!rds_iw_incoming_slab)
-		goto out;
-
-	rds_iw_frag_slab = kmem_cache_create("rds_iw_frag",
-					sizeof(struct rds_page_frag),
-					0, 0, NULL);
-	if (!rds_iw_frag_slab)
-		kmem_cache_destroy(rds_iw_incoming_slab);
-	else
-		ret = 0;
-out:
-	return ret;
-}
-
-void rds_iw_recv_exit(void)
-{
-	kmem_cache_destroy(rds_iw_incoming_slab);
-	kmem_cache_destroy(rds_iw_frag_slab);
-}
diff --git a/net/rds/iw_ring.c b/net/rds/iw_ring.c
deleted file mode 100644
index da8e3b63f663..000000000000
--- a/net/rds/iw_ring.c
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
- * Copyright (c) 2006 Oracle.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-
-#include "rds.h"
-#include "iw.h"
-
-/*
- * Locking for IB rings.
- * We assume that allocation is always protected by a mutex
- * in the caller (this is a valid assumption for the current
- * implementation).
- *
- * Freeing always happens in an interrupt, and hence only
- * races with allocations, but not with other free()s.
- *
- * The interaction between allocation and freeing is that
- * the alloc code has to determine the number of free entries.
- * To this end, we maintain two counters; an allocation counter
- * and a free counter. Both are allowed to run freely, and wrap
- * around.
- * The number of used entries is always (alloc_ctr - free_ctr) % NR.
- *
- * The current implementation makes free_ctr atomic. When the
- * caller finds an allocation fails, it should set an "alloc fail"
- * bit and retry the allocation. The "alloc fail" bit essentially tells
- * the CQ completion handlers to wake it up after freeing some
- * more entries.
- */
-
-/*
- * This only happens on shutdown.
- */
-DECLARE_WAIT_QUEUE_HEAD(rds_iw_ring_empty_wait);
-
-void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr)
-{
-	memset(ring, 0, sizeof(*ring));
-	ring->w_nr = nr;
-	rdsdebug("ring %p nr %u\n", ring, ring->w_nr);
-}
-
-static inline u32 __rds_iw_ring_used(struct rds_iw_work_ring *ring)
-{
-	u32 diff;
-
-	/* This assumes that atomic_t has at least as many bits as u32 */
-	diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr);
-	BUG_ON(diff > ring->w_nr);
-
-	return diff;
-}
-
-void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr)
-{
-	/* We only ever get called from the connection setup code,
-	 * prior to creating the QP. */
-	BUG_ON(__rds_iw_ring_used(ring));
-	ring->w_nr = nr;
-}
-
-static int __rds_iw_ring_empty(struct rds_iw_work_ring *ring)
-{
-	return __rds_iw_ring_used(ring) == 0;
-}
-
-u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos)
-{
-	u32 ret = 0, avail;
-
-	avail = ring->w_nr - __rds_iw_ring_used(ring);
-
-	rdsdebug("ring %p val %u next %u free %u\n", ring, val,
-		 ring->w_alloc_ptr, avail);
-
-	if (val && avail) {
-		ret = min(val, avail);
-		*pos = ring->w_alloc_ptr;
-
-		ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr;
-		ring->w_alloc_ctr += ret;
-	}
-
-	return ret;
-}
-
-void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val)
-{
-	ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr;
-	atomic_add(val, &ring->w_free_ctr);
-
-	if (__rds_iw_ring_empty(ring) &&
-	    waitqueue_active(&rds_iw_ring_empty_wait))
-		wake_up(&rds_iw_ring_empty_wait);
-}
-
-void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val)
-{
-	ring->w_alloc_ptr = (ring->w_alloc_ptr - val) % ring->w_nr;
-	ring->w_alloc_ctr -= val;
-}
-
-int rds_iw_ring_empty(struct rds_iw_work_ring *ring)
-{
-	return __rds_iw_ring_empty(ring);
-}
-
-int rds_iw_ring_low(struct rds_iw_work_ring *ring)
-{
-	return __rds_iw_ring_used(ring) <= (ring->w_nr >> 1);
-}
-
-
-/*
- * returns the oldest alloced ring entry.  This will be the next one
- * freed.  This can't be called if there are none allocated.
- */
-u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring)
-{
-	return ring->w_free_ptr;
-}
-
-/*
- * returns the number of completed work requests.
- */
-
-u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest)
-{
-	u32 ret;
-
-	if (oldest <= (unsigned long long)wr_id)
-		ret = (unsigned long long)wr_id - oldest + 1;
-	else
-		ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1;
-
-	rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret,
-		 wr_id, oldest);
-	return ret;
-}
diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c
deleted file mode 100644
index e20bd503f4bd..000000000000
--- a/net/rds/iw_send.c
+++ /dev/null
@@ -1,981 +0,0 @@
-/*
- * Copyright (c) 2006 Oracle.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-#include <linux/in.h>
-#include <linux/device.h>
-#include <linux/dmapool.h>
-#include <linux/ratelimit.h>
-
-#include "rds.h"
-#include "iw.h"
-
-static void rds_iw_send_rdma_complete(struct rds_message *rm,
-				      int wc_status)
-{
-	int notify_status;
-
-	switch (wc_status) {
-	case IB_WC_WR_FLUSH_ERR:
-		return;
-
-	case IB_WC_SUCCESS:
-		notify_status = RDS_RDMA_SUCCESS;
-		break;
-
-	case IB_WC_REM_ACCESS_ERR:
-		notify_status = RDS_RDMA_REMOTE_ERROR;
-		break;
-
-	default:
-		notify_status = RDS_RDMA_OTHER_ERROR;
-		break;
-	}
-	rds_rdma_send_complete(rm, notify_status);
-}
-
-static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic,
-				   struct rm_rdma_op *op)
-{
-	if (op->op_mapped) {
-		ib_dma_unmap_sg(ic->i_cm_id->device,
-			op->op_sg, op->op_nents,
-			op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
-		op->op_mapped = 0;
-	}
-}
-
-static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
-			  struct rds_iw_send_work *send,
-			  int wc_status)
-{
-	struct rds_message *rm = send->s_rm;
-
-	rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
-
-	ib_dma_unmap_sg(ic->i_cm_id->device,
-		     rm->data.op_sg, rm->data.op_nents,
-		     DMA_TO_DEVICE);
-
-	if (rm->rdma.op_active) {
-		rds_iw_send_unmap_rdma(ic, &rm->rdma);
-
-		/* If the user asked for a completion notification on this
-		 * message, we can implement three different semantics:
-		 *  1.	Notify when we received the ACK on the RDS message
-		 *	that was queued with the RDMA. This provides reliable
-		 *	notification of RDMA status at the expense of a one-way
-		 *	packet delay.
-		 *  2.	Notify when the IB stack gives us the completion event for
-		 *	the RDMA operation.
-		 *  3.	Notify when the IB stack gives us the completion event for
-		 *	the accompanying RDS messages.
-		 * Here, we implement approach #3. To implement approach #2,
-		 * call rds_rdma_send_complete from the cq_handler. To implement #1,
-		 * don't call rds_rdma_send_complete at all, and fall back to the notify
-		 * handling in the ACK processing code.
-		 *
-		 * Note: There's no need to explicitly sync any RDMA buffers using
-		 * ib_dma_sync_sg_for_cpu - the completion for the RDMA
-		 * operation itself unmapped the RDMA buffers, which takes care
-		 * of synching.
-		 */
-		rds_iw_send_rdma_complete(rm, wc_status);
-
-		if (rm->rdma.op_write)
-			rds_stats_add(s_send_rdma_bytes, rm->rdma.op_bytes);
-		else
-			rds_stats_add(s_recv_rdma_bytes, rm->rdma.op_bytes);
-	}
-
-	/* If anyone waited for this message to get flushed out, wake
-	 * them up now */
-	rds_message_unmapped(rm);
-
-	rds_message_put(rm);
-	send->s_rm = NULL;
-}
-
-void rds_iw_send_init_ring(struct rds_iw_connection *ic)
-{
-	struct rds_iw_send_work *send;
-	u32 i;
-
-	for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
-		struct ib_sge *sge;
-
-		send->s_rm = NULL;
-		send->s_op = NULL;
-		send->s_mapping = NULL;
-
-		send->s_send_wr.next = NULL;
-		send->s_send_wr.wr_id = i;
-		send->s_send_wr.sg_list = send->s_sge;
-		send->s_send_wr.num_sge = 1;
-		send->s_send_wr.opcode = IB_WR_SEND;
-		send->s_send_wr.send_flags = 0;
-		send->s_send_wr.ex.imm_data = 0;
-
-		sge = rds_iw_data_sge(ic, send->s_sge);
-		sge->lkey = 0;
-
-		sge = rds_iw_header_sge(ic, send->s_sge);
-		sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
-		sge->length = sizeof(struct rds_header);
-		sge->lkey = 0;
-
-		send->s_mr = ib_alloc_mr(ic->i_pd, IB_MR_TYPE_MEM_REG,
-					 fastreg_message_size);
-		if (IS_ERR(send->s_mr)) {
-			printk(KERN_WARNING "RDS/IW: ib_alloc_mr failed\n");
-			break;
-		}
-	}
-}
-
-void rds_iw_send_clear_ring(struct rds_iw_connection *ic)
-{
-	struct rds_iw_send_work *send;
-	u32 i;
-
-	for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
-		BUG_ON(!send->s_mr);
-		ib_dereg_mr(send->s_mr);
-		if (send->s_send_wr.opcode == 0xdead)
-			continue;
-		if (send->s_rm)
-			rds_iw_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR);
-		if (send->s_op)
-			rds_iw_send_unmap_rdma(ic, send->s_op);
-	}
-}
-
-/*
- * The _oldest/_free ring operations here race cleanly with the alloc/unalloc
- * operations performed in the send path.  As the sender allocs and potentially
- * unallocs the next free entry in the ring it doesn't alter which is
- * the next to be freed, which is what this is concerned with.
- */
-void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context)
-{
-	struct rds_connection *conn = context;
-	struct rds_iw_connection *ic = conn->c_transport_data;
-	struct ib_wc wc;
-	struct rds_iw_send_work *send;
-	u32 completed;
-	u32 oldest;
-	u32 i;
-	int ret;
-
-	rdsdebug("cq %p conn %p\n", cq, conn);
-	rds_iw_stats_inc(s_iw_tx_cq_call);
-	ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
-	if (ret)
-		rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
-
-	while (ib_poll_cq(cq, 1, &wc) > 0) {
-		rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
-			 (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
-			 be32_to_cpu(wc.ex.imm_data));
-		rds_iw_stats_inc(s_iw_tx_cq_event);
-
-		if (wc.status != IB_WC_SUCCESS) {
-			printk(KERN_ERR "WC Error:  status = %d opcode = %d\n", wc.status, wc.opcode);
-			break;
-		}
-
-		if (wc.opcode == IB_WC_LOCAL_INV && wc.wr_id == RDS_IW_LOCAL_INV_WR_ID) {
-			ic->i_fastreg_posted = 0;
-			continue;
-		}
-
-		if (wc.opcode == IB_WC_REG_MR && wc.wr_id == RDS_IW_REG_WR_ID) {
-			ic->i_fastreg_posted = 1;
-			continue;
-		}
-
-		if (wc.wr_id == RDS_IW_ACK_WR_ID) {
-			if (time_after(jiffies, ic->i_ack_queued + HZ/2))
-				rds_iw_stats_inc(s_iw_tx_stalled);
-			rds_iw_ack_send_complete(ic);
-			continue;
-		}
-
-		oldest = rds_iw_ring_oldest(&ic->i_send_ring);
-
-		completed = rds_iw_ring_completed(&ic->i_send_ring, wc.wr_id, oldest);
-
-		for (i = 0; i < completed; i++) {
-			send = &ic->i_sends[oldest];
-
-			/* In the error case, wc.opcode sometimes contains garbage */
-			switch (send->s_send_wr.opcode) {
-			case IB_WR_SEND:
-				if (send->s_rm)
-					rds_iw_send_unmap_rm(ic, send, wc.status);
-				break;
-			case IB_WR_REG_MR:
-			case IB_WR_RDMA_WRITE:
-			case IB_WR_RDMA_READ:
-			case IB_WR_RDMA_READ_WITH_INV:
-				/* Nothing to be done - the SG list will be unmapped
-				 * when the SEND completes. */
-				break;
-			default:
-				printk_ratelimited(KERN_NOTICE
-						"RDS/IW: %s: unexpected opcode 0x%x in WR!\n",
-						__func__, send->s_send_wr.opcode);
-				break;
-			}
-
-			send->s_send_wr.opcode = 0xdead;
-			send->s_send_wr.num_sge = 1;
-			if (time_after(jiffies, send->s_queued + HZ/2))
-				rds_iw_stats_inc(s_iw_tx_stalled);
-
-			/* If a RDMA operation produced an error, signal this right
-			 * away. If we don't, the subsequent SEND that goes with this
-			 * RDMA will be canceled with ERR_WFLUSH, and the application
-			 * never learn that the RDMA failed. */
-			if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) {
-				struct rds_message *rm;
-
-				rm = rds_send_get_message(conn, send->s_op);
-				if (rm)
-					rds_iw_send_rdma_complete(rm, wc.status);
-			}
-
-			oldest = (oldest + 1) % ic->i_send_ring.w_nr;
-		}
-
-		rds_iw_ring_free(&ic->i_send_ring, completed);
-
-		if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
-		    test_bit(0, &conn->c_map_queued))
-			queue_delayed_work(rds_wq, &conn->c_send_w, 0);
-
-		/* We expect errors as the qp is drained during shutdown */
-		if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) {
-			rds_iw_conn_error(conn,
-				"send completion on %pI4 "
-				"had status %u, disconnecting and reconnecting\n",
-				&conn->c_faddr, wc.status);
-		}
-	}
-}
-
-/*
- * This is the main function for allocating credits when sending
- * messages.
- *
- * Conceptually, we have two counters:
- *  -	send credits: this tells us how many WRs we're allowed
- *	to submit without overruning the receiver's queue. For
- *	each SEND WR we post, we decrement this by one.
- *
- *  -	posted credits: this tells us how many WRs we recently
- *	posted to the receive queue. This value is transferred
- *	to the peer as a "credit update" in a RDS header field.
- *	Every time we transmit credits to the peer, we subtract
- *	the amount of transferred credits from this counter.
- *
- * It is essential that we avoid situations where both sides have
- * exhausted their send credits, and are unable to send new credits
- * to the peer. We achieve this by requiring that we send at least
- * one credit update to the peer before exhausting our credits.
- * When new credits arrive, we subtract one credit that is withheld
- * until we've posted new buffers and are ready to transmit these
- * credits (see rds_iw_send_add_credits below).
- *
- * The RDS send code is essentially single-threaded; rds_send_xmit
- * grabs c_send_lock to ensure exclusive access to the send ring.
- * However, the ACK sending code is independent and can race with
- * message SENDs.
- *
- * In the send path, we need to update the counters for send credits
- * and the counter of posted buffers atomically - when we use the
- * last available credit, we cannot allow another thread to race us
- * and grab the posted credits counter.  Hence, we have to use a
- * spinlock to protect the credit counter, or use atomics.
- *
- * Spinlocks shared between the send and the receive path are bad,
- * because they create unnecessary delays. An early implementation
- * using a spinlock showed a 5% degradation in throughput at some
- * loads.
- *
- * This implementation avoids spinlocks completely, putting both
- * counters into a single atomic, and updating that atomic using
- * atomic_add (in the receive path, when receiving fresh credits),
- * and using atomic_cmpxchg when updating the two counters.
- */
-int rds_iw_send_grab_credits(struct rds_iw_connection *ic,
-			     u32 wanted, u32 *adv_credits, int need_posted, int max_posted)
-{
-	unsigned int avail, posted, got = 0, advertise;
-	long oldval, newval;
-
-	*adv_credits = 0;
-	if (!ic->i_flowctl)
-		return wanted;
-
-try_again:
-	advertise = 0;
-	oldval = newval = atomic_read(&ic->i_credits);
-	posted = IB_GET_POST_CREDITS(oldval);
-	avail = IB_GET_SEND_CREDITS(oldval);
-
-	rdsdebug("wanted=%u credits=%u posted=%u\n",
-			wanted, avail, posted);
-
-	/* The last credit must be used to send a credit update. */
-	if (avail && !posted)
-		avail--;
-
-	if (avail < wanted) {
-		struct rds_connection *conn = ic->i_cm_id->context;
-
-		/* Oops, there aren't that many credits left! */
-		set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
-		got = avail;
-	} else {
-		/* Sometimes you get what you want, lalala. */
-		got = wanted;
-	}
-	newval -= IB_SET_SEND_CREDITS(got);
-
-	/*
-	 * If need_posted is non-zero, then the caller wants
-	 * the posted regardless of whether any send credits are
-	 * available.
-	 */
-	if (posted && (got || need_posted)) {
-		advertise = min_t(unsigned int, posted, max_posted);
-		newval -= IB_SET_POST_CREDITS(advertise);
-	}
-
-	/* Finally bill everything */
-	if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval)
-		goto try_again;
-
-	*adv_credits = advertise;
-	return got;
-}
-
-void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits)
-{
-	struct rds_iw_connection *ic = conn->c_transport_data;
-
-	if (credits == 0)
-		return;
-
-	rdsdebug("credits=%u current=%u%s\n",
-			credits,
-			IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)),
-			test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : "");
-
-	atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits);
-	if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags))
-		queue_delayed_work(rds_wq, &conn->c_send_w, 0);
-
-	WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384);
-
-	rds_iw_stats_inc(s_iw_rx_credit_updates);
-}
-
-void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted)
-{
-	struct rds_iw_connection *ic = conn->c_transport_data;
-
-	if (posted == 0)
-		return;
-
-	atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits);
-
-	/* Decide whether to send an update to the peer now.
-	 * If we would send a credit update for every single buffer we
-	 * post, we would end up with an ACK storm (ACK arrives,
-	 * consumes buffer, we refill the ring, send ACK to remote
-	 * advertising the newly posted buffer... ad inf)
-	 *
-	 * Performance pretty much depends on how often we send
-	 * credit updates - too frequent updates mean lots of ACKs.
-	 * Too infrequent updates, and the peer will run out of
-	 * credits and has to throttle.
-	 * For the time being, 16 seems to be a good compromise.
-	 */
-	if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16)
-		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
-}
-
-static inline void
-rds_iw_xmit_populate_wr(struct rds_iw_connection *ic,
-		struct rds_iw_send_work *send, unsigned int pos,
-		unsigned long buffer, unsigned int length,
-		int send_flags)
-{
-	struct ib_sge *sge;
-
-	WARN_ON(pos != send - ic->i_sends);
-
-	send->s_send_wr.send_flags = send_flags;
-	send->s_send_wr.opcode = IB_WR_SEND;
-	send->s_send_wr.num_sge = 2;
-	send->s_send_wr.next = NULL;
-	send->s_queued = jiffies;
-	send->s_op = NULL;
-
-	if (length != 0) {
-		sge = rds_iw_data_sge(ic, send->s_sge);
-		sge->addr = buffer;
-		sge->length = length;
-		sge->lkey = rds_iw_local_dma_lkey(ic);
-
-		sge = rds_iw_header_sge(ic, send->s_sge);
-	} else {
-		/* We're sending a packet with no payload. There is only
-		 * one SGE */
-		send->s_send_wr.num_sge = 1;
-		sge = &send->s_sge[0];
-	}
-
-	sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header));
-	sge->length = sizeof(struct rds_header);
-	sge->lkey = rds_iw_local_dma_lkey(ic);
-}
-
-/*
- * This can be called multiple times for a given message.  The first time
- * we see a message we map its scatterlist into the IB device so that
- * we can provide that mapped address to the IB scatter gather entries
- * in the IB work requests.  We translate the scatterlist into a series
- * of work requests that fragment the message.  These work requests complete
- * in order so we pass ownership of the message to the completion handler
- * once we send the final fragment.
- *
- * The RDS core uses the c_send_lock to only enter this function once
- * per connection.  This makes sure that the tx ring alloc/unalloc pairs
- * don't get out of sync and confuse the ring.
- */
-int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
-		unsigned int hdr_off, unsigned int sg, unsigned int off)
-{
-	struct rds_iw_connection *ic = conn->c_transport_data;
-	struct ib_device *dev = ic->i_cm_id->device;
-	struct rds_iw_send_work *send = NULL;
-	struct rds_iw_send_work *first;
-	struct rds_iw_send_work *prev;
-	struct ib_send_wr *failed_wr;
-	struct scatterlist *scat;
-	u32 pos;
-	u32 i;
-	u32 work_alloc;
-	u32 credit_alloc;
-	u32 posted;
-	u32 adv_credits = 0;
-	int send_flags = 0;
-	int sent;
-	int ret;
-	int flow_controlled = 0;
-
-	BUG_ON(off % RDS_FRAG_SIZE);
-	BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
-
-	/* Fastreg support */
-	if (rds_rdma_cookie_key(rm->m_rdma_cookie) && !ic->i_fastreg_posted) {
-		ret = -EAGAIN;
-		goto out;
-	}
-
-	/* FIXME we may overallocate here */
-	if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0)
-		i = 1;
-	else
-		i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE);
-
-	work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);
-	if (work_alloc == 0) {
-		set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
-		rds_iw_stats_inc(s_iw_tx_ring_full);
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	credit_alloc = work_alloc;
-	if (ic->i_flowctl) {
-		credit_alloc = rds_iw_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT);
-		adv_credits += posted;
-		if (credit_alloc < work_alloc) {
-			rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
-			work_alloc = credit_alloc;
-			flow_controlled++;
-		}
-		if (work_alloc == 0) {
-			set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
-			rds_iw_stats_inc(s_iw_tx_throttle);
-			ret = -ENOMEM;
-			goto out;
-		}
-	}
-
-	/* map the message the first time we see it */
-	if (!ic->i_rm) {
-		/*
-		printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n",
-				be16_to_cpu(rm->m_inc.i_hdr.h_dport),
-				rm->m_inc.i_hdr.h_flags,
-				be32_to_cpu(rm->m_inc.i_hdr.h_len));
-		   */
-		if (rm->data.op_nents) {
-			rm->data.op_count = ib_dma_map_sg(dev,
-							  rm->data.op_sg,
-							  rm->data.op_nents,
-							  DMA_TO_DEVICE);
-			rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
-			if (rm->data.op_count == 0) {
-				rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
-				rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
-				ret = -ENOMEM; /* XXX ? */
-				goto out;
-			}
-		} else {
-			rm->data.op_count = 0;
-		}
-
-		ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
-		ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes;
-		rds_message_addref(rm);
-		rm->data.op_dmasg = 0;
-		rm->data.op_dmaoff = 0;
-		ic->i_rm = rm;
-
-		/* Finalize the header */
-		if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
-			rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED;
-		if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))
-			rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED;
-
-		/* If it has a RDMA op, tell the peer we did it. This is
-		 * used by the peer to release use-once RDMA MRs. */
-		if (rm->rdma.op_active) {
-			struct rds_ext_header_rdma ext_hdr;
-
-			ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
-			rds_message_add_extension(&rm->m_inc.i_hdr,
-					RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
-		}
-		if (rm->m_rdma_cookie) {
-			rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
-					rds_rdma_cookie_key(rm->m_rdma_cookie),
-					rds_rdma_cookie_offset(rm->m_rdma_cookie));
-		}
-
-		/* Note - rds_iw_piggyb_ack clears the ACK_REQUIRED bit, so
-		 * we should not do this unless we have a chance of at least
-		 * sticking the header into the send ring. Which is why we
-		 * should call rds_iw_ring_alloc first. */
-		rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_iw_piggyb_ack(ic));
-		rds_message_make_checksum(&rm->m_inc.i_hdr);
-
-		/*
-		 * Update adv_credits since we reset the ACK_REQUIRED bit.
-		 */
-		rds_iw_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits);
-		adv_credits += posted;
-		BUG_ON(adv_credits > 255);
-	}
-
-	send = &ic->i_sends[pos];
-	first = send;
-	prev = NULL;
-	scat = &rm->data.op_sg[rm->data.op_dmasg];
-	sent = 0;
-	i = 0;
-
-	/* Sometimes you want to put a fence between an RDMA
-	 * READ and the following SEND.
-	 * We could either do this all the time
-	 * or when requested by the user. Right now, we let
-	 * the application choose.
-	 */
-	if (rm->rdma.op_active && rm->rdma.op_fence)
-		send_flags = IB_SEND_FENCE;
-
-	/*
-	 * We could be copying the header into the unused tail of the page.
-	 * That would need to be changed in the future when those pages might
-	 * be mapped userspace pages or page cache pages.  So instead we always
-	 * use a second sge and our long-lived ring of mapped headers.  We send
-	 * the header after the data so that the data payload can be aligned on
-	 * the receiver.
-	 */
-
-	/* handle a 0-len message */
-	if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) {
-		rds_iw_xmit_populate_wr(ic, send, pos, 0, 0, send_flags);
-		goto add_header;
-	}
-
-	/* if there's data reference it with a chain of work reqs */
-	for (; i < work_alloc && scat != &rm->data.op_sg[rm->data.op_count]; i++) {
-		unsigned int len;
-
-		send = &ic->i_sends[pos];
-
-		len = min(RDS_FRAG_SIZE,
-			  ib_sg_dma_len(dev, scat) - rm->data.op_dmaoff);
-		rds_iw_xmit_populate_wr(ic, send, pos,
-			ib_sg_dma_address(dev, scat) + rm->data.op_dmaoff, len,
-			send_flags);
-
-		/*
-		 * We want to delay signaling completions just enough to get
-		 * the batching benefits but not so much that we create dead time
-		 * on the wire.
-		 */
-		if (ic->i_unsignaled_wrs-- == 0) {
-			ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
-			send->s_send_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
-		}
-
-		ic->i_unsignaled_bytes -= len;
-		if (ic->i_unsignaled_bytes <= 0) {
-			ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes;
-			send->s_send_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
-		}
-
-		/*
-		 * Always signal the last one if we're stopping due to flow control.
-		 */
-		if (flow_controlled && i == (work_alloc-1))
-			send->s_send_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
-
-		rdsdebug("send %p wr %p num_sge %u next %p\n", send,
-			 &send->s_send_wr, send->s_send_wr.num_sge, send->s_send_wr.next);
-
-		sent += len;
-		rm->data.op_dmaoff += len;
-		if (rm->data.op_dmaoff == ib_sg_dma_len(dev, scat)) {
-			scat++;
-			rm->data.op_dmaoff = 0;
-			rm->data.op_dmasg++;
-		}
-
-add_header:
-		/* Tack on the header after the data. The header SGE should already
-		 * have been set up to point to the right header buffer. */
-		memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
-
-		if (0) {
-			struct rds_header *hdr = &ic->i_send_hdrs[pos];
-
-			printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n",
-				be16_to_cpu(hdr->h_dport),
-				hdr->h_flags,
-				be32_to_cpu(hdr->h_len));
-		}
-		if (adv_credits) {
-			struct rds_header *hdr = &ic->i_send_hdrs[pos];
-
-			/* add credit and redo the header checksum */
-			hdr->h_credit = adv_credits;
-			rds_message_make_checksum(hdr);
-			adv_credits = 0;
-			rds_iw_stats_inc(s_iw_tx_credit_updates);
-		}
-
-		if (prev)
-			prev->s_send_wr.next = &send->s_send_wr;
-		prev = send;
-
-		pos = (pos + 1) % ic->i_send_ring.w_nr;
-	}
-
-	/* Account the RDS header in the number of bytes we sent, but just once.
-	 * The caller has no concept of fragmentation. */
-	if (hdr_off == 0)
-		sent += sizeof(struct rds_header);
-
-	/* if we finished the message then send completion owns it */
-	if (scat == &rm->data.op_sg[rm->data.op_count]) {
-		prev->s_rm = ic->i_rm;
-		prev->s_send_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
-		ic->i_rm = NULL;
-	}
-
-	if (i < work_alloc) {
-		rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i);
-		work_alloc = i;
-	}
-	if (ic->i_flowctl && i < credit_alloc)
-		rds_iw_send_add_credits(conn, credit_alloc - i);
-
-	/* XXX need to worry about failed_wr and partial sends. */
-	failed_wr = &first->s_send_wr;
-	ret = ib_post_send(ic->i_cm_id->qp, &first->s_send_wr, &failed_wr);
-	rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
-		 first, &first->s_send_wr, ret, failed_wr);
-	BUG_ON(failed_wr != &first->s_send_wr);
-	if (ret) {
-		printk(KERN_WARNING "RDS/IW: ib_post_send to %pI4 "
-		       "returned %d\n", &conn->c_faddr, ret);
-		rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
-		if (prev->s_rm) {
-			ic->i_rm = prev->s_rm;
-			prev->s_rm = NULL;
-		}
-		goto out;
-	}
-
-	ret = sent;
-out:
-	BUG_ON(adv_credits);
-	return ret;
-}
-
-static int rds_iw_build_send_reg(struct rds_iw_send_work *send,
-				 struct scatterlist *sg,
-				 int sg_nents)
-{
-	int n;
-
-	n = ib_map_mr_sg(send->s_mr, sg, sg_nents, PAGE_SIZE);
-	if (unlikely(n != sg_nents))
-		return n < 0 ? n : -EINVAL;
-
-	send->s_reg_wr.wr.opcode = IB_WR_REG_MR;
-	send->s_reg_wr.wr.wr_id = 0;
-	send->s_reg_wr.wr.num_sge = 0;
-	send->s_reg_wr.mr = send->s_mr;
-	send->s_reg_wr.key = send->s_mr->rkey;
-	send->s_reg_wr.access = IB_ACCESS_REMOTE_WRITE;
-
-	ib_update_fast_reg_key(send->s_mr, send->s_remap_count++);
-
-	return 0;
-}
-
-int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
-{
-	struct rds_iw_connection *ic = conn->c_transport_data;
-	struct rds_iw_send_work *send = NULL;
-	struct rds_iw_send_work *first;
-	struct rds_iw_send_work *prev;
-	struct ib_send_wr *failed_wr;
-	struct rds_iw_device *rds_iwdev;
-	struct scatterlist *scat;
-	unsigned long len;
-	u64 remote_addr = op->op_remote_addr;
-	u32 pos, fr_pos;
-	u32 work_alloc;
-	u32 i;
-	u32 j;
-	int sent;
-	int ret;
-	int num_sge;
-	int sg_nents;
-
-	rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
-
-	/* map the message the first time we see it */
-	if (!op->op_mapped) {
-		op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
-					     op->op_sg, op->op_nents, (op->op_write) ?
-					     DMA_TO_DEVICE : DMA_FROM_DEVICE);
-		rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
-		if (op->op_count == 0) {
-			rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
-			ret = -ENOMEM; /* XXX ? */
-			goto out;
-		}
-
-		op->op_mapped = 1;
-	}
-
-	if (!op->op_write) {
-		/* Alloc space on the send queue for the fastreg */
-		work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos);
-		if (work_alloc != 1) {
-			rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
-			rds_iw_stats_inc(s_iw_tx_ring_full);
-			ret = -ENOMEM;
-			goto out;
-		}
-	}
-
-	/*
-	 * Instead of knowing how to return a partial rdma read/write we insist that there
-	 * be enough work requests to send the entire message.
-	 */
-	i = ceil(op->op_count, rds_iwdev->max_sge);
-
-	work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);
-	if (work_alloc != i) {
-		rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
-		rds_iw_stats_inc(s_iw_tx_ring_full);
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	send = &ic->i_sends[pos];
-	if (!op->op_write) {
-		first = prev = &ic->i_sends[fr_pos];
-	} else {
-		first = send;
-		prev = NULL;
-	}
-	scat = &op->op_sg[0];
-	sent = 0;
-	num_sge = op->op_count;
-	sg_nents = 0;
-
-	for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
-		send->s_rdma_wr.wr.send_flags = 0;
-		send->s_queued = jiffies;
-
-		/*
-		 * We want to delay signaling completions just enough to get
-		 * the batching benefits but not so much that we create dead time on the wire.
-		 */
-		if (ic->i_unsignaled_wrs-- == 0) {
-			ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
-			send->s_rdma_wr.wr.send_flags = IB_SEND_SIGNALED;
-		}
-
-		/* To avoid the need to have the plumbing to invalidate the fastreg_mr used
-		 * for local access after RDS is finished with it, using
-		 * IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed.
-		 */
-		if (op->op_write)
-			send->s_rdma_wr.wr.opcode = IB_WR_RDMA_WRITE;
-		else
-			send->s_rdma_wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV;
-
-		send->s_rdma_wr.remote_addr = remote_addr;
-		send->s_rdma_wr.rkey = op->op_rkey;
-		send->s_op = op;
-
-		if (num_sge > rds_iwdev->max_sge) {
-			send->s_rdma_wr.wr.num_sge = rds_iwdev->max_sge;
-			num_sge -= rds_iwdev->max_sge;
-		} else
-			send->s_rdma_wr.wr.num_sge = num_sge;
-
-		send->s_rdma_wr.wr.next = NULL;
-
-		if (prev)
-			prev->s_send_wr.next = &send->s_rdma_wr.wr;
-
-		for (j = 0; j < send->s_rdma_wr.wr.num_sge &&
-		     scat != &op->op_sg[op->op_count]; j++) {
-			len = ib_sg_dma_len(ic->i_cm_id->device, scat);
-
-			if (send->s_rdma_wr.wr.opcode == IB_WR_RDMA_READ_WITH_INV)
-				sg_nents++;
-			else {
-				send->s_sge[j].addr = ib_sg_dma_address(ic->i_cm_id->device, scat);
-				send->s_sge[j].length = len;
-				send->s_sge[j].lkey = rds_iw_local_dma_lkey(ic);
-			}
-
-			sent += len;
-			rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);
-			remote_addr += len;
-
-			scat++;
-		}
-
-		if (send->s_rdma_wr.wr.opcode == IB_WR_RDMA_READ_WITH_INV) {
-			send->s_rdma_wr.wr.num_sge = 1;
-			send->s_sge[0].addr = conn->c_xmit_rm->m_rs->rs_user_addr;
-			send->s_sge[0].length = conn->c_xmit_rm->m_rs->rs_user_bytes;
-			send->s_sge[0].lkey = ic->i_sends[fr_pos].s_mr->lkey;
-		}
-
-		rdsdebug("send %p wr %p num_sge %u next %p\n", send,
-			&send->s_rdma_wr,
-			send->s_rdma_wr.wr.num_sge,
-			send->s_rdma_wr.wr.next);
-
-		prev = send;
-		if (++send == &ic->i_sends[ic->i_send_ring.w_nr])
-			send = ic->i_sends;
-	}
-
-	/* if we finished the message then send completion owns it */
-	if (scat == &op->op_sg[op->op_count])
-		first->s_rdma_wr.wr.send_flags = IB_SEND_SIGNALED;
-
-	if (i < work_alloc) {
-		rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i);
-		work_alloc = i;
-	}
-
-	/* On iWARP, local memory access by a remote system (ie, RDMA Read) is not
-	 * recommended.  Putting the lkey on the wire is a security hole, as it can
-	 * allow for memory access to all of memory on the remote system.  Some
-	 * adapters do not allow using the lkey for this at all.  To bypass this use a
-	 * fastreg_mr (or possibly a dma_mr)
-	 */
-	if (!op->op_write) {
-		ret = rds_iw_build_send_reg(&ic->i_sends[fr_pos],
-					    &op->op_sg[0], sg_nents);
-		if (ret) {
-			printk(KERN_WARNING "RDS/IW: failed to reg send mem\n");
-			goto out;
-		}
-		work_alloc++;
-	}
-
-	failed_wr = &first->s_rdma_wr.wr;
-	ret = ib_post_send(ic->i_cm_id->qp, &first->s_rdma_wr.wr, &failed_wr);
-	rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
-		 first, &first->s_rdma_wr, ret, failed_wr);
-	BUG_ON(failed_wr != &first->s_rdma_wr.wr);
-	if (ret) {
-		printk(KERN_WARNING "RDS/IW: rdma ib_post_send to %pI4 "
-		       "returned %d\n", &conn->c_faddr, ret);
-		rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
-		goto out;
-	}
-
-out:
-	return ret;
-}
-
-void rds_iw_xmit_complete(struct rds_connection *conn)
-{
-	struct rds_iw_connection *ic = conn->c_transport_data;
-
-	/* We may have a pending ACK or window update we were unable
-	 * to send previously (due to flow control). Try again. */
-	rds_iw_attempt_ack(ic);
-}
diff --git a/net/rds/iw_stats.c b/net/rds/iw_stats.c
deleted file mode 100644
index 5fe67f6a1d80..000000000000
--- a/net/rds/iw_stats.c
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright (c) 2006 Oracle.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/percpu.h>
-#include <linux/seq_file.h>
-#include <linux/proc_fs.h>
-
-#include "rds.h"
-#include "iw.h"
-
-DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_iw_statistics, rds_iw_stats);
-
-static const char *const rds_iw_stat_names[] = {
-	"iw_connect_raced",
-	"iw_listen_closed_stale",
-	"iw_tx_cq_call",
-	"iw_tx_cq_event",
-	"iw_tx_ring_full",
-	"iw_tx_throttle",
-	"iw_tx_sg_mapping_failure",
-	"iw_tx_stalled",
-	"iw_tx_credit_updates",
-	"iw_rx_cq_call",
-	"iw_rx_cq_event",
-	"iw_rx_ring_empty",
-	"iw_rx_refill_from_cq",
-	"iw_rx_refill_from_thread",
-	"iw_rx_alloc_limit",
-	"iw_rx_credit_updates",
-	"iw_ack_sent",
-	"iw_ack_send_failure",
-	"iw_ack_send_delayed",
-	"iw_ack_send_piggybacked",
-	"iw_ack_received",
-	"iw_rdma_mr_alloc",
-	"iw_rdma_mr_free",
-	"iw_rdma_mr_used",
-	"iw_rdma_mr_pool_flush",
-	"iw_rdma_mr_pool_wait",
-	"iw_rdma_mr_pool_depleted",
-};
-
-unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
-				    unsigned int avail)
-{
-	struct rds_iw_statistics stats = {0, };
-	uint64_t *src;
-	uint64_t *sum;
-	size_t i;
-	int cpu;
-
-	if (avail < ARRAY_SIZE(rds_iw_stat_names))
-		goto out;
-
-	for_each_online_cpu(cpu) {
-		src = (uint64_t *)&(per_cpu(rds_iw_stats, cpu));
-		sum = (uint64_t *)&stats;
-		for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
-			*(sum++) += *(src++);
-	}
-
-	rds_stats_info_copy(iter, (uint64_t *)&stats, rds_iw_stat_names,
-			    ARRAY_SIZE(rds_iw_stat_names));
-out:
-	return ARRAY_SIZE(rds_iw_stat_names);
-}
diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c
deleted file mode 100644
index 139239d2cb22..000000000000
--- a/net/rds/iw_sysctl.c
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Copyright (c) 2006 Oracle.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-#include <linux/sysctl.h>
-#include <linux/proc_fs.h>
-
-#include "iw.h"
-
-static struct ctl_table_header *rds_iw_sysctl_hdr;
-
-unsigned long rds_iw_sysctl_max_send_wr = RDS_IW_DEFAULT_SEND_WR;
-unsigned long rds_iw_sysctl_max_recv_wr = RDS_IW_DEFAULT_RECV_WR;
-unsigned long rds_iw_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE;
-static unsigned long rds_iw_sysctl_max_wr_min = 1;
-/* hardware will fail CQ creation long before this */
-static unsigned long rds_iw_sysctl_max_wr_max = (u32)~0;
-
-unsigned long rds_iw_sysctl_max_unsig_wrs = 16;
-static unsigned long rds_iw_sysctl_max_unsig_wr_min = 1;
-static unsigned long rds_iw_sysctl_max_unsig_wr_max = 64;
-
-unsigned long rds_iw_sysctl_max_unsig_bytes = (16 << 20);
-static unsigned long rds_iw_sysctl_max_unsig_bytes_min = 1;
-static unsigned long rds_iw_sysctl_max_unsig_bytes_max = ~0UL;
-
-unsigned int rds_iw_sysctl_flow_control = 1;
-
-static struct ctl_table rds_iw_sysctl_table[] = {
-	{
-		.procname       = "max_send_wr",
-		.data		= &rds_iw_sysctl_max_send_wr,
-		.maxlen         = sizeof(unsigned long),
-		.mode           = 0644,
-		.proc_handler   = proc_doulongvec_minmax,
-		.extra1		= &rds_iw_sysctl_max_wr_min,
-		.extra2		= &rds_iw_sysctl_max_wr_max,
-	},
-	{
-		.procname       = "max_recv_wr",
-		.data		= &rds_iw_sysctl_max_recv_wr,
-		.maxlen         = sizeof(unsigned long),
-		.mode           = 0644,
-		.proc_handler   = proc_doulongvec_minmax,
-		.extra1		= &rds_iw_sysctl_max_wr_min,
-		.extra2		= &rds_iw_sysctl_max_wr_max,
-	},
-	{
-		.procname       = "max_unsignaled_wr",
-		.data		= &rds_iw_sysctl_max_unsig_wrs,
-		.maxlen         = sizeof(unsigned long),
-		.mode           = 0644,
-		.proc_handler   = proc_doulongvec_minmax,
-		.extra1		= &rds_iw_sysctl_max_unsig_wr_min,
-		.extra2		= &rds_iw_sysctl_max_unsig_wr_max,
-	},
-	{
-		.procname       = "max_unsignaled_bytes",
-		.data		= &rds_iw_sysctl_max_unsig_bytes,
-		.maxlen         = sizeof(unsigned long),
-		.mode           = 0644,
-		.proc_handler   = proc_doulongvec_minmax,
-		.extra1		= &rds_iw_sysctl_max_unsig_bytes_min,
-		.extra2		= &rds_iw_sysctl_max_unsig_bytes_max,
-	},
-	{
-		.procname       = "max_recv_allocation",
-		.data		= &rds_iw_sysctl_max_recv_allocation,
-		.maxlen         = sizeof(unsigned long),
-		.mode           = 0644,
-		.proc_handler   = proc_doulongvec_minmax,
-	},
-	{
-		.procname	= "flow_control",
-		.data		= &rds_iw_sysctl_flow_control,
-		.maxlen		= sizeof(rds_iw_sysctl_flow_control),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-	{ }
-};
-
-void rds_iw_sysctl_exit(void)
-{
-	unregister_net_sysctl_table(rds_iw_sysctl_hdr);
-}
-
-int rds_iw_sysctl_init(void)
-{
-	rds_iw_sysctl_hdr = register_net_sysctl(&init_net, "net/rds/iw", rds_iw_sysctl_table);
-	if (!rds_iw_sysctl_hdr)
-		return -ENOMEM;
-	return 0;
-}
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index 9c1fed81bf0f..4f4b3d88319d 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -49,9 +49,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
 	rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id,
 		 event->event, rdma_event_msg(event->event));
 
-	if (cm_id->device->node_type == RDMA_NODE_RNIC)
-		trans = &rds_iw_transport;
-	else
+	if (cm_id->device->node_type == RDMA_NODE_IB_CA)
 		trans = &rds_ib_transport;
 
 	/* Prevent shutdown from tearing down the connection
@@ -200,10 +198,6 @@ static int rds_rdma_init(void)
 	if (ret)
 		goto out;
 
-	ret = rds_iw_init();
-	if (ret)
-		goto err_iw_init;
-
 	ret = rds_ib_init();
 	if (ret)
 		goto err_ib_init;
@@ -211,8 +205,6 @@ static int rds_rdma_init(void)
 	goto out;
 
 err_ib_init:
-	rds_iw_exit();
-err_iw_init:
 	rds_rdma_listen_stop();
 out:
 	return ret;
@@ -224,11 +216,10 @@ static void rds_rdma_exit(void)
 	/* stop listening first to ensure no new connections are attempted */
 	rds_rdma_listen_stop();
 	rds_ib_exit();
-	rds_iw_exit();
 }
 module_exit(rds_rdma_exit);
 
 MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
-MODULE_DESCRIPTION("RDS: IB/iWARP transport");
+MODULE_DESCRIPTION("RDS: IB transport");
 MODULE_LICENSE("Dual BSD/GPL");
 
diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h
index faba4e382695..ff2010e9d20c 100644
--- a/net/rds/rdma_transport.h
+++ b/net/rds/rdma_transport.h
@@ -16,9 +16,4 @@ extern struct rds_transport rds_ib_transport;
 int rds_ib_init(void);
 void rds_ib_exit(void);
 
-/* from iw.c */
-extern struct rds_transport rds_iw_transport;
-int rds_iw_init(void);
-void rds_iw_exit(void);
-
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 5711f8b353176393ee2ab023808a0a8c4a9d39b5 Mon Sep 17 00:00:00 2001
From: "santosh.shilimkar@oracle.com" <santosh.shilimkar@oracle.com>
Date: Tue, 1 Mar 2016 15:20:43 -0800
Subject: RDS: Add support for SO_TIMESTAMP for incoming messages

The SO_TIMESTAMP generates time stamp for each incoming RDS messages
User app can enable it by using SO_TIMESTAMP setsocketopt() at
SOL_SOCKET level. CMSG data of cmsg type SO_TIMESTAMP contains the
time stamp in struct timeval format.

Reviewed-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Signed-off-by: Santosh Shilimkar <ssantosh@kernel.org>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/af_rds.c | 26 ++++++++++++++++++++++++++
 net/rds/rds.h    |  1 +
 net/rds/recv.c   | 20 ++++++++++++++++++--
 3 files changed, 45 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index b5476aebd68d..6beaeb1138f3 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -277,6 +277,27 @@ static int rds_set_transport(struct rds_sock *rs, char __user *optval,
 	return rs->rs_transport ? 0 : -ENOPROTOOPT;
 }
 
+static int rds_enable_recvtstamp(struct sock *sk, char __user *optval,
+				 int optlen)
+{
+	int val, valbool;
+
+	if (optlen != sizeof(int))
+		return -EFAULT;
+
+	if (get_user(val, (int __user *)optval))
+		return -EFAULT;
+
+	valbool = val ? 1 : 0;
+
+	if (valbool)
+		sock_set_flag(sk, SOCK_RCVTSTAMP);
+	else
+		sock_reset_flag(sk, SOCK_RCVTSTAMP);
+
+	return 0;
+}
+
 static int rds_setsockopt(struct socket *sock, int level, int optname,
 			  char __user *optval, unsigned int optlen)
 {
@@ -312,6 +333,11 @@ static int rds_setsockopt(struct socket *sock, int level, int optname,
 		ret = rds_set_transport(rs, optval, optlen);
 		release_sock(sock->sk);
 		break;
+	case SO_TIMESTAMP:
+		lock_sock(sock->sk);
+		ret = rds_enable_recvtstamp(sock->sk, optval, optlen);
+		release_sock(sock->sk);
+		break;
 	default:
 		ret = -ENOPROTOOPT;
 	}
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 0e2797bdc316..80256b08eac0 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -222,6 +222,7 @@ struct rds_incoming {
 	__be32			i_saddr;
 
 	rds_rdma_cookie_t	i_rdma_cookie;
+	struct timeval		i_rx_tstamp;
 };
 
 struct rds_mr {
diff --git a/net/rds/recv.c b/net/rds/recv.c
index a00462b0d01d..c0be1ecd11c9 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -35,6 +35,8 @@
 #include <net/sock.h>
 #include <linux/in.h>
 #include <linux/export.h>
+#include <linux/time.h>
+#include <linux/rds.h>
 
 #include "rds.h"
 
@@ -46,6 +48,8 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
 	inc->i_conn = conn;
 	inc->i_saddr = saddr;
 	inc->i_rdma_cookie = 0;
+	inc->i_rx_tstamp.tv_sec = 0;
+	inc->i_rx_tstamp.tv_usec = 0;
 }
 EXPORT_SYMBOL_GPL(rds_inc_init);
 
@@ -228,6 +232,8 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
 		rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
 				      be32_to_cpu(inc->i_hdr.h_len),
 				      inc->i_hdr.h_dport);
+		if (sock_flag(sk, SOCK_RCVTSTAMP))
+			do_gettimeofday(&inc->i_rx_tstamp);
 		rds_inc_addref(inc);
 		list_add_tail(&inc->i_item, &rs->rs_recv_queue);
 		__rds_wake_sk_sleep(sk);
@@ -381,7 +387,8 @@ static int rds_notify_cong(struct rds_sock *rs, struct msghdr *msghdr)
 /*
  * Receive any control messages.
  */
-static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg)
+static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
+			 struct rds_sock *rs)
 {
 	int ret = 0;
 
@@ -392,6 +399,15 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg)
 			return ret;
 	}
 
+	if ((inc->i_rx_tstamp.tv_sec != 0) &&
+	    sock_flag(rds_rs_to_sk(rs), SOCK_RCVTSTAMP)) {
+		ret = put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP,
+			       sizeof(struct timeval),
+			       &inc->i_rx_tstamp);
+		if (ret)
+			return ret;
+	}
+
 	return 0;
 }
 
@@ -474,7 +490,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
 			msg->msg_flags |= MSG_TRUNC;
 		}
 
-		if (rds_cmsg_recv(inc, msg)) {
+		if (rds_cmsg_recv(inc, msg, rs)) {
 			ret = -EFAULT;
 			goto out;
 		}
-- 
cgit v1.2.3-59-g8ed1b


From dcfd041c8710320d59fce322fd901bddaf912ae8 Mon Sep 17 00:00:00 2001
From: "santosh.shilimkar@oracle.com" <santosh.shilimkar@oracle.com>
Date: Tue, 1 Mar 2016 15:20:45 -0800
Subject: RDS: IB: Remove the RDS_IB_SEND_OP dependency

This helps to combine asynchronous fastreg MR completion handler
with send completion handler.

No functional change.

Signed-off-by: Santosh Shilimkar <ssantosh@kernel.org>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/ib.h      |  1 -
 net/rds/ib_cm.c   | 42 +++++++++++++++++++++++++++---------------
 net/rds/ib_send.c |  6 ++----
 3 files changed, 29 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib.h b/net/rds/ib.h
index b3fdebb57460..09cd8e3a90aa 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -28,7 +28,6 @@
 #define RDS_IB_RECYCLE_BATCH_COUNT	32
 
 #define RDS_IB_WC_MAX			32
-#define RDS_IB_SEND_OP			BIT_ULL(63)
 
 extern struct rw_semaphore rds_ib_devices_lock;
 extern struct list_head rds_ib_devices;
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index da5a7fb98c77..7f68abc8a5bf 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -236,12 +236,10 @@ static void rds_ib_cq_comp_handler_recv(struct ib_cq *cq, void *context)
 	tasklet_schedule(&ic->i_recv_tasklet);
 }
 
-static void poll_cq(struct rds_ib_connection *ic, struct ib_cq *cq,
-		    struct ib_wc *wcs,
-		    struct rds_ib_ack_state *ack_state)
+static void poll_scq(struct rds_ib_connection *ic, struct ib_cq *cq,
+		     struct ib_wc *wcs)
 {
-	int nr;
-	int i;
+	int nr, i;
 	struct ib_wc *wc;
 
 	while ((nr = ib_poll_cq(cq, RDS_IB_WC_MAX, wcs)) > 0) {
@@ -251,10 +249,7 @@ static void poll_cq(struct rds_ib_connection *ic, struct ib_cq *cq,
 				 (unsigned long long)wc->wr_id, wc->status,
 				 wc->byte_len, be32_to_cpu(wc->ex.imm_data));
 
-			if (wc->wr_id & RDS_IB_SEND_OP)
-				rds_ib_send_cqe_handler(ic, wc);
-			else
-				rds_ib_recv_cqe_handler(ic, wc, ack_state);
+			rds_ib_send_cqe_handler(ic, wc);
 		}
 	}
 }
@@ -263,14 +258,12 @@ static void rds_ib_tasklet_fn_send(unsigned long data)
 {
 	struct rds_ib_connection *ic = (struct rds_ib_connection *)data;
 	struct rds_connection *conn = ic->conn;
-	struct rds_ib_ack_state state;
 
 	rds_ib_stats_inc(s_ib_tasklet_call);
 
-	memset(&state, 0, sizeof(state));
-	poll_cq(ic, ic->i_send_cq, ic->i_send_wc, &state);
+	poll_scq(ic, ic->i_send_cq, ic->i_send_wc);
 	ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
-	poll_cq(ic, ic->i_send_cq, ic->i_send_wc, &state);
+	poll_scq(ic, ic->i_send_cq, ic->i_send_wc);
 
 	if (rds_conn_up(conn) &&
 	    (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
@@ -278,6 +271,25 @@ static void rds_ib_tasklet_fn_send(unsigned long data)
 		rds_send_xmit(ic->conn);
 }
 
+static void poll_rcq(struct rds_ib_connection *ic, struct ib_cq *cq,
+		     struct ib_wc *wcs,
+		     struct rds_ib_ack_state *ack_state)
+{
+	int nr, i;
+	struct ib_wc *wc;
+
+	while ((nr = ib_poll_cq(cq, RDS_IB_WC_MAX, wcs)) > 0) {
+		for (i = 0; i < nr; i++) {
+			wc = wcs + i;
+			rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
+				 (unsigned long long)wc->wr_id, wc->status,
+				 wc->byte_len, be32_to_cpu(wc->ex.imm_data));
+
+			rds_ib_recv_cqe_handler(ic, wc, ack_state);
+		}
+	}
+}
+
 static void rds_ib_tasklet_fn_recv(unsigned long data)
 {
 	struct rds_ib_connection *ic = (struct rds_ib_connection *)data;
@@ -291,9 +303,9 @@ static void rds_ib_tasklet_fn_recv(unsigned long data)
 	rds_ib_stats_inc(s_ib_tasklet_call);
 
 	memset(&state, 0, sizeof(state));
-	poll_cq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
+	poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
 	ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
-	poll_cq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
+	poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
 
 	if (state.ack_next_valid)
 		rds_ib_set_ack(ic, state.ack_next, state.ack_required);
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index eac30bf486d7..f27d2c82b036 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -195,7 +195,7 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic)
 
 		send->s_op = NULL;
 
-		send->s_wr.wr_id = i | RDS_IB_SEND_OP;
+		send->s_wr.wr_id = i;
 		send->s_wr.sg_list = send->s_sge;
 		send->s_wr.ex.imm_data = 0;
 
@@ -263,9 +263,7 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
 
 	oldest = rds_ib_ring_oldest(&ic->i_send_ring);
 
-	completed = rds_ib_ring_completed(&ic->i_send_ring,
-					  (wc->wr_id & ~RDS_IB_SEND_OP),
-					  oldest);
+	completed = rds_ib_ring_completed(&ic->i_send_ring, wc->wr_id, oldest);
 
 	for (i = 0; i < completed; i++) {
 		send = &ic->i_sends[oldest];
-- 
cgit v1.2.3-59-g8ed1b


From f6df683f32f09f8c0c440bdd3741a1e7dc7927fe Mon Sep 17 00:00:00 2001
From: "santosh.shilimkar@oracle.com" <santosh.shilimkar@oracle.com>
Date: Tue, 1 Mar 2016 15:20:46 -0800
Subject: RDS: IB: Re-organise ibmr code

No functional changes. This is in preperation towards adding
fastreg memory resgitration support.

Signed-off-by: Santosh Shilimkar <ssantosh@kernel.org>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/Makefile  |   2 +-
 net/rds/ib.c      |  37 +++---
 net/rds/ib.h      |  25 +---
 net/rds/ib_fmr.c  | 217 +++++++++++++++++++++++++++++++
 net/rds/ib_mr.h   | 109 ++++++++++++++++
 net/rds/ib_rdma.c | 379 +++++++++++-------------------------------------------
 6 files changed, 422 insertions(+), 347 deletions(-)
 create mode 100644 net/rds/ib_fmr.c
 create mode 100644 net/rds/ib_mr.h

(limited to 'net')

diff --git a/net/rds/Makefile b/net/rds/Makefile
index 19e54854ba2d..bcf5591b493a 100644
--- a/net/rds/Makefile
+++ b/net/rds/Makefile
@@ -6,7 +6,7 @@ rds-y :=	af_rds.o bind.o cong.o connection.o info.o message.o   \
 obj-$(CONFIG_RDS_RDMA) += rds_rdma.o
 rds_rdma-y :=	rdma_transport.o \
 			ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \
-			ib_sysctl.o ib_rdma.o
+			ib_sysctl.o ib_rdma.o ib_fmr.o
 
 
 obj-$(CONFIG_RDS_TCP) += rds_tcp.o
diff --git a/net/rds/ib.c b/net/rds/ib.c
index 9481d55ff6cb..bb32cb9d85f1 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -42,15 +42,16 @@
 
 #include "rds.h"
 #include "ib.h"
+#include "ib_mr.h"
 
-unsigned int rds_ib_fmr_1m_pool_size = RDS_FMR_1M_POOL_SIZE;
-unsigned int rds_ib_fmr_8k_pool_size = RDS_FMR_8K_POOL_SIZE;
+unsigned int rds_ib_mr_1m_pool_size = RDS_MR_1M_POOL_SIZE;
+unsigned int rds_ib_mr_8k_pool_size = RDS_MR_8K_POOL_SIZE;
 unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT;
 
-module_param(rds_ib_fmr_1m_pool_size, int, 0444);
-MODULE_PARM_DESC(rds_ib_fmr_1m_pool_size, " Max number of 1M fmr per HCA");
-module_param(rds_ib_fmr_8k_pool_size, int, 0444);
-MODULE_PARM_DESC(rds_ib_fmr_8k_pool_size, " Max number of 8K fmr per HCA");
+module_param(rds_ib_mr_1m_pool_size, int, 0444);
+MODULE_PARM_DESC(rds_ib_mr_1m_pool_size, " Max number of 1M mr per HCA");
+module_param(rds_ib_mr_8k_pool_size, int, 0444);
+MODULE_PARM_DESC(rds_ib_mr_8k_pool_size, " Max number of 8K mr per HCA");
 module_param(rds_ib_retry_count, int, 0444);
 MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error");
 
@@ -140,13 +141,13 @@ static void rds_ib_add_one(struct ib_device *device)
 	rds_ibdev->max_sge = min(device->attrs.max_sge, RDS_IB_MAX_SGE);
 
 	rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32;
-	rds_ibdev->max_1m_fmrs = device->attrs.max_mr ?
+	rds_ibdev->max_1m_mrs = device->attrs.max_mr ?
 		min_t(unsigned int, (device->attrs.max_mr / 2),
-		      rds_ib_fmr_1m_pool_size) : rds_ib_fmr_1m_pool_size;
+		      rds_ib_mr_1m_pool_size) : rds_ib_mr_1m_pool_size;
 
-	rds_ibdev->max_8k_fmrs = device->attrs.max_mr ?
+	rds_ibdev->max_8k_mrs = device->attrs.max_mr ?
 		min_t(unsigned int, ((device->attrs.max_mr / 2) * RDS_MR_8K_SCALE),
-		      rds_ib_fmr_8k_pool_size) : rds_ib_fmr_8k_pool_size;
+		      rds_ib_mr_8k_pool_size) : rds_ib_mr_8k_pool_size;
 
 	rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom;
 	rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom;
@@ -172,10 +173,10 @@ static void rds_ib_add_one(struct ib_device *device)
 		goto put_dev;
 	}
 
-	rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_fmrs = %d, max_8k_fmrs = %d\n",
+	rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_mrs = %d, max_8k_mrs = %d\n",
 		 device->attrs.max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge,
-		 rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_fmrs,
-		 rds_ibdev->max_8k_fmrs);
+		 rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_mrs,
+		 rds_ibdev->max_8k_mrs);
 
 	INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
 	INIT_LIST_HEAD(&rds_ibdev->conn_list);
@@ -364,7 +365,7 @@ void rds_ib_exit(void)
 	rds_ib_sysctl_exit();
 	rds_ib_recv_exit();
 	rds_trans_unregister(&rds_ib_transport);
-	rds_ib_fmr_exit();
+	rds_ib_mr_exit();
 }
 
 struct rds_transport rds_ib_transport = {
@@ -400,13 +401,13 @@ int rds_ib_init(void)
 
 	INIT_LIST_HEAD(&rds_ib_devices);
 
-	ret = rds_ib_fmr_init();
+	ret = rds_ib_mr_init();
 	if (ret)
 		goto out;
 
 	ret = ib_register_client(&rds_ib_client);
 	if (ret)
-		goto out_fmr_exit;
+		goto out_mr_exit;
 
 	ret = rds_ib_sysctl_init();
 	if (ret)
@@ -430,8 +431,8 @@ out_sysctl:
 	rds_ib_sysctl_exit();
 out_ibreg:
 	rds_ib_unregister_client();
-out_fmr_exit:
-	rds_ib_fmr_exit();
+out_mr_exit:
+	rds_ib_mr_exit();
 out:
 	return ret;
 }
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 09cd8e3a90aa..c88cb229db16 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -9,12 +9,6 @@
 #include "rds.h"
 #include "rdma_transport.h"
 
-#define RDS_FMR_1M_POOL_SIZE		(8192 / 2)
-#define RDS_FMR_1M_MSG_SIZE		256
-#define RDS_FMR_8K_MSG_SIZE		2
-#define RDS_MR_8K_SCALE			(256 / (RDS_FMR_8K_MSG_SIZE + 1))
-#define RDS_FMR_8K_POOL_SIZE		(RDS_MR_8K_SCALE * (8192 / 2))
-
 #define RDS_IB_MAX_SGE			8
 #define RDS_IB_RECV_SGE 		2
 
@@ -206,12 +200,12 @@ struct rds_ib_device {
 	struct list_head	conn_list;
 	struct ib_device	*dev;
 	struct ib_pd		*pd;
-	unsigned int		max_fmrs;
+	unsigned int		max_mrs;
 	struct rds_ib_mr_pool	*mr_1m_pool;
 	struct rds_ib_mr_pool   *mr_8k_pool;
 	unsigned int		fmr_max_remaps;
-	unsigned int		max_8k_fmrs;
-	unsigned int		max_1m_fmrs;
+	unsigned int		max_8k_mrs;
+	unsigned int		max_1m_mrs;
 	int			max_sge;
 	unsigned int		max_wrs;
 	unsigned int		max_initiator_depth;
@@ -316,8 +310,6 @@ struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device);
 void rds_ib_dev_put(struct rds_ib_device *rds_ibdev);
 extern struct ib_client rds_ib_client;
 
-extern unsigned int rds_ib_fmr_1m_pool_size;
-extern unsigned int rds_ib_fmr_8k_pool_size;
 extern unsigned int rds_ib_retry_count;
 
 extern spinlock_t ib_nodev_conns_lock;
@@ -347,17 +339,6 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
 void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
 void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
 void rds_ib_destroy_nodev_conns(void);
-struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev,
-					     int npages);
-void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo);
-void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
-void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
-		    struct rds_sock *rs, u32 *key_ret);
-void rds_ib_sync_mr(void *trans_private, int dir);
-void rds_ib_free_mr(void *trans_private, int invalidate);
-void rds_ib_flush_mrs(void);
-int rds_ib_fmr_init(void);
-void rds_ib_fmr_exit(void);
 
 /* ib_recv.c */
 int rds_ib_recv_init(void);
diff --git a/net/rds/ib_fmr.c b/net/rds/ib_fmr.c
new file mode 100644
index 000000000000..d4f200dc61e8
--- /dev/null
+++ b/net/rds/ib_fmr.c
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2016 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ib_mr.h"
+
+struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev, int npages)
+{
+	struct rds_ib_mr_pool *pool;
+	struct rds_ib_mr *ibmr = NULL;
+	int err = 0, iter = 0;
+
+	if (npages <= RDS_MR_8K_MSG_SIZE)
+		pool = rds_ibdev->mr_8k_pool;
+	else
+		pool = rds_ibdev->mr_1m_pool;
+
+	if (atomic_read(&pool->dirty_count) >= pool->max_items / 10)
+		queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10);
+
+	/* Switch pools if one of the pool is reaching upper limit */
+	if (atomic_read(&pool->dirty_count) >=  pool->max_items * 9 / 10) {
+		if (pool->pool_type == RDS_IB_MR_8K_POOL)
+			pool = rds_ibdev->mr_1m_pool;
+		else
+			pool = rds_ibdev->mr_8k_pool;
+	}
+
+	while (1) {
+		ibmr = rds_ib_reuse_mr(pool);
+		if (ibmr)
+			return ibmr;
+
+		/* No clean MRs - now we have the choice of either
+		 * allocating a fresh MR up to the limit imposed by the
+		 * driver, or flush any dirty unused MRs.
+		 * We try to avoid stalling in the send path if possible,
+		 * so we allocate as long as we're allowed to.
+		 *
+		 * We're fussy with enforcing the FMR limit, though. If the
+		 * driver tells us we can't use more than N fmrs, we shouldn't
+		 * start arguing with it
+		 */
+		if (atomic_inc_return(&pool->item_count) <= pool->max_items)
+			break;
+
+		atomic_dec(&pool->item_count);
+
+		if (++iter > 2) {
+			if (pool->pool_type == RDS_IB_MR_8K_POOL)
+				rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted);
+			else
+				rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted);
+			return ERR_PTR(-EAGAIN);
+		}
+
+		/* We do have some empty MRs. Flush them out. */
+		if (pool->pool_type == RDS_IB_MR_8K_POOL)
+			rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_wait);
+		else
+			rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_wait);
+		rds_ib_flush_mr_pool(pool, 0, &ibmr);
+		if (ibmr)
+			return ibmr;
+	}
+
+	ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL,
+			    rdsibdev_to_node(rds_ibdev));
+	if (!ibmr) {
+		err = -ENOMEM;
+		goto out_no_cigar;
+	}
+
+	ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
+			(IB_ACCESS_LOCAL_WRITE |
+			 IB_ACCESS_REMOTE_READ |
+			 IB_ACCESS_REMOTE_WRITE |
+			 IB_ACCESS_REMOTE_ATOMIC),
+			&pool->fmr_attr);
+	if (IS_ERR(ibmr->fmr)) {
+		err = PTR_ERR(ibmr->fmr);
+		ibmr->fmr = NULL;
+		pr_warn("RDS/IB: %s failed (err=%d)\n", __func__, err);
+		goto out_no_cigar;
+	}
+
+	ibmr->pool = pool;
+	if (pool->pool_type == RDS_IB_MR_8K_POOL)
+		rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc);
+	else
+		rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc);
+
+	return ibmr;
+
+out_no_cigar:
+	if (ibmr) {
+		if (ibmr->fmr)
+			ib_dealloc_fmr(ibmr->fmr);
+		kfree(ibmr);
+	}
+	atomic_dec(&pool->item_count);
+	return ERR_PTR(err);
+}
+
+int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr,
+		   struct scatterlist *sg, unsigned int nents)
+{
+	struct ib_device *dev = rds_ibdev->dev;
+	struct scatterlist *scat = sg;
+	u64 io_addr = 0;
+	u64 *dma_pages;
+	u32 len;
+	int page_cnt, sg_dma_len;
+	int i, j;
+	int ret;
+
+	sg_dma_len = ib_dma_map_sg(dev, sg, nents, DMA_BIDIRECTIONAL);
+	if (unlikely(!sg_dma_len)) {
+		pr_warn("RDS/IB: %s failed!\n", __func__);
+		return -EBUSY;
+	}
+
+	len = 0;
+	page_cnt = 0;
+
+	for (i = 0; i < sg_dma_len; ++i) {
+		unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
+		u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
+
+		if (dma_addr & ~PAGE_MASK) {
+			if (i > 0)
+				return -EINVAL;
+			else
+				++page_cnt;
+		}
+		if ((dma_addr + dma_len) & ~PAGE_MASK) {
+			if (i < sg_dma_len - 1)
+				return -EINVAL;
+			else
+				++page_cnt;
+		}
+
+		len += dma_len;
+	}
+
+	page_cnt += len >> PAGE_SHIFT;
+	if (page_cnt > ibmr->pool->fmr_attr.max_pages)
+		return -EINVAL;
+
+	dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC,
+				 rdsibdev_to_node(rds_ibdev));
+	if (!dma_pages)
+		return -ENOMEM;
+
+	page_cnt = 0;
+	for (i = 0; i < sg_dma_len; ++i) {
+		unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
+		u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
+
+		for (j = 0; j < dma_len; j += PAGE_SIZE)
+			dma_pages[page_cnt++] =
+				(dma_addr & PAGE_MASK) + j;
+	}
+
+	ret = ib_map_phys_fmr(ibmr->fmr, dma_pages, page_cnt, io_addr);
+	if (ret)
+		goto out;
+
+	/* Success - we successfully remapped the MR, so we can
+	 * safely tear down the old mapping.
+	 */
+	rds_ib_teardown_mr(ibmr);
+
+	ibmr->sg = scat;
+	ibmr->sg_len = nents;
+	ibmr->sg_dma_len = sg_dma_len;
+	ibmr->remap_count++;
+
+	if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
+		rds_ib_stats_inc(s_ib_rdma_mr_8k_used);
+	else
+		rds_ib_stats_inc(s_ib_rdma_mr_1m_used);
+	ret = 0;
+
+out:
+	kfree(dma_pages);
+
+	return ret;
+}
diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h
new file mode 100644
index 000000000000..d88724fe9a0b
--- /dev/null
+++ b/net/rds/ib_mr.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2016 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _RDS_IB_MR_H
+#define _RDS_IB_MR_H
+
+#include <linux/kernel.h>
+
+#include "rds.h"
+#include "ib.h"
+
+#define RDS_MR_1M_POOL_SIZE		(8192 / 2)
+#define RDS_MR_1M_MSG_SIZE		256
+#define RDS_MR_8K_MSG_SIZE		2
+#define RDS_MR_8K_SCALE			(256 / (RDS_MR_8K_MSG_SIZE + 1))
+#define RDS_MR_8K_POOL_SIZE		(RDS_MR_8K_SCALE * (8192 / 2))
+
+/* This is stored as mr->r_trans_private. */
+struct rds_ib_mr {
+	struct rds_ib_device	*device;
+	struct rds_ib_mr_pool	*pool;
+	struct ib_fmr		*fmr;
+
+	struct llist_node	llnode;
+
+	/* unmap_list is for freeing */
+	struct list_head	unmap_list;
+	unsigned int		remap_count;
+
+	struct scatterlist	*sg;
+	unsigned int		sg_len;
+	u64			*dma;
+	int			sg_dma_len;
+};
+
+/* Our own little MR pool */
+struct rds_ib_mr_pool {
+	unsigned int            pool_type;
+	struct mutex		flush_lock;	/* serialize fmr invalidate */
+	struct delayed_work	flush_worker;	/* flush worker */
+
+	atomic_t		item_count;	/* total # of MRs */
+	atomic_t		dirty_count;	/* # dirty of MRs */
+
+	struct llist_head	drop_list;	/* MRs not reached max_maps */
+	struct llist_head	free_list;	/* unused MRs */
+	struct llist_head	clean_list;	/* unused & unmapped MRs */
+	wait_queue_head_t	flush_wait;
+
+	atomic_t		free_pinned;	/* memory pinned by free MRs */
+	unsigned long		max_items;
+	unsigned long		max_items_soft;
+	unsigned long		max_free_pinned;
+	struct ib_fmr_attr	fmr_attr;
+};
+
+extern struct workqueue_struct *rds_ib_mr_wq;
+extern unsigned int rds_ib_mr_1m_pool_size;
+extern unsigned int rds_ib_mr_8k_pool_size;
+
+struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev,
+					     int npages);
+void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
+			struct rds_info_rdma_connection *iinfo);
+void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
+void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
+		    struct rds_sock *rs, u32 *key_ret);
+void rds_ib_sync_mr(void *trans_private, int dir);
+void rds_ib_free_mr(void *trans_private, int invalidate);
+void rds_ib_flush_mrs(void);
+int rds_ib_mr_init(void);
+void rds_ib_mr_exit(void);
+
+void __rds_ib_teardown_mr(struct rds_ib_mr *);
+void rds_ib_teardown_mr(struct rds_ib_mr *);
+struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *, int);
+int rds_ib_map_fmr(struct rds_ib_device *, struct rds_ib_mr *,
+		   struct scatterlist *, unsigned int);
+struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *);
+int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *, int, struct rds_ib_mr **);
+#endif
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index a2340748ec86..c59451938b36 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -35,78 +35,13 @@
 #include <linux/rculist.h>
 #include <linux/llist.h>
 
-#include "rds.h"
-#include "ib.h"
+#include "ib_mr.h"
+
+struct workqueue_struct *rds_ib_mr_wq;
 
 static DEFINE_PER_CPU(unsigned long, clean_list_grace);
 #define CLEAN_LIST_BUSY_BIT 0
 
-/*
- * This is stored as mr->r_trans_private.
- */
-struct rds_ib_mr {
-	struct rds_ib_device	*device;
-	struct rds_ib_mr_pool	*pool;
-	struct ib_fmr		*fmr;
-
-	struct llist_node	llnode;
-
-	/* unmap_list is for freeing */
-	struct list_head	unmap_list;
-	unsigned int		remap_count;
-
-	struct scatterlist	*sg;
-	unsigned int		sg_len;
-	u64			*dma;
-	int			sg_dma_len;
-};
-
-/*
- * Our own little FMR pool
- */
-struct rds_ib_mr_pool {
-	unsigned int            pool_type;
-	struct mutex		flush_lock;		/* serialize fmr invalidate */
-	struct delayed_work	flush_worker;		/* flush worker */
-
-	atomic_t		item_count;		/* total # of MRs */
-	atomic_t		dirty_count;		/* # dirty of MRs */
-
-	struct llist_head	drop_list;		/* MRs that have reached their max_maps limit */
-	struct llist_head	free_list;		/* unused MRs */
-	struct llist_head	clean_list;		/* global unused & unamapped MRs */
-	wait_queue_head_t	flush_wait;
-
-	atomic_t		free_pinned;		/* memory pinned by free MRs */
-	unsigned long		max_items;
-	unsigned long		max_items_soft;
-	unsigned long		max_free_pinned;
-	struct ib_fmr_attr	fmr_attr;
-};
-
-static struct workqueue_struct *rds_ib_fmr_wq;
-
-int rds_ib_fmr_init(void)
-{
-	rds_ib_fmr_wq = create_workqueue("rds_fmr_flushd");
-	if (!rds_ib_fmr_wq)
-		return -ENOMEM;
-	return 0;
-}
-
-/* By the time this is called all the IB devices should have been torn down and
- * had their pools freed.  As each pool is freed its work struct is waited on,
- * so the pool flushing work queue should be idle by the time we get here.
- */
-void rds_ib_fmr_exit(void)
-{
-	destroy_workqueue(rds_ib_fmr_wq);
-}
-
-static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **);
-static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr);
-static void rds_ib_mr_pool_flush_worker(struct work_struct *work);
-
 static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
 {
 	struct rds_ib_device *rds_ibdev;
@@ -235,41 +170,6 @@ void rds_ib_destroy_nodev_conns(void)
 		rds_conn_destroy(ic->conn);
 }
 
-struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
-					     int pool_type)
-{
-	struct rds_ib_mr_pool *pool;
-
-	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
-	if (!pool)
-		return ERR_PTR(-ENOMEM);
-
-	pool->pool_type = pool_type;
-	init_llist_head(&pool->free_list);
-	init_llist_head(&pool->drop_list);
-	init_llist_head(&pool->clean_list);
-	mutex_init(&pool->flush_lock);
-	init_waitqueue_head(&pool->flush_wait);
-	INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
-
-	if (pool_type == RDS_IB_MR_1M_POOL) {
-		/* +1 allows for unaligned MRs */
-		pool->fmr_attr.max_pages = RDS_FMR_1M_MSG_SIZE + 1;
-		pool->max_items = RDS_FMR_1M_POOL_SIZE;
-	} else {
-		/* pool_type == RDS_IB_MR_8K_POOL */
-		pool->fmr_attr.max_pages = RDS_FMR_8K_MSG_SIZE + 1;
-		pool->max_items = RDS_FMR_8K_POOL_SIZE;
-	}
-
-	pool->max_free_pinned = pool->max_items * pool->fmr_attr.max_pages / 4;
-	pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
-	pool->fmr_attr.page_shift = PAGE_SHIFT;
-	pool->max_items_soft = rds_ibdev->max_fmrs * 3 / 4;
-
-	return pool;
-}
-
 void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo)
 {
 	struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool;
@@ -278,16 +178,7 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co
 	iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages;
 }
 
-void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
-{
-	cancel_delayed_work_sync(&pool->flush_worker);
-	rds_ib_flush_mr_pool(pool, 1, NULL);
-	WARN_ON(atomic_read(&pool->item_count));
-	WARN_ON(atomic_read(&pool->free_pinned));
-	kfree(pool);
-}
-
-static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool)
+struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool)
 {
 	struct rds_ib_mr *ibmr = NULL;
 	struct llist_node *ret;
@@ -317,190 +208,6 @@ static inline void wait_clean_list_grace(void)
 	}
 }
 
-static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev,
-					  int npages)
-{
-	struct rds_ib_mr_pool *pool;
-	struct rds_ib_mr *ibmr = NULL;
-	int err = 0, iter = 0;
-
-	if (npages <= RDS_FMR_8K_MSG_SIZE)
-		pool = rds_ibdev->mr_8k_pool;
-	else
-		pool = rds_ibdev->mr_1m_pool;
-
-	if (atomic_read(&pool->dirty_count) >= pool->max_items / 10)
-		queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);
-
-	/* Switch pools if one of the pool is reaching upper limit */
-	if (atomic_read(&pool->dirty_count) >=  pool->max_items * 9 / 10) {
-		if (pool->pool_type == RDS_IB_MR_8K_POOL)
-			pool = rds_ibdev->mr_1m_pool;
-		else
-			pool = rds_ibdev->mr_8k_pool;
-	}
-
-	while (1) {
-		ibmr = rds_ib_reuse_fmr(pool);
-		if (ibmr)
-			return ibmr;
-
-		/* No clean MRs - now we have the choice of either
-		 * allocating a fresh MR up to the limit imposed by the
-		 * driver, or flush any dirty unused MRs.
-		 * We try to avoid stalling in the send path if possible,
-		 * so we allocate as long as we're allowed to.
-		 *
-		 * We're fussy with enforcing the FMR limit, though. If the driver
-		 * tells us we can't use more than N fmrs, we shouldn't start
-		 * arguing with it */
-		if (atomic_inc_return(&pool->item_count) <= pool->max_items)
-			break;
-
-		atomic_dec(&pool->item_count);
-
-		if (++iter > 2) {
-			if (pool->pool_type == RDS_IB_MR_8K_POOL)
-				rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted);
-			else
-				rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted);
-			return ERR_PTR(-EAGAIN);
-		}
-
-		/* We do have some empty MRs. Flush them out. */
-		if (pool->pool_type == RDS_IB_MR_8K_POOL)
-			rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_wait);
-		else
-			rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_wait);
-		rds_ib_flush_mr_pool(pool, 0, &ibmr);
-		if (ibmr)
-			return ibmr;
-	}
-
-	ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, rdsibdev_to_node(rds_ibdev));
-	if (!ibmr) {
-		err = -ENOMEM;
-		goto out_no_cigar;
-	}
-
-	ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
-			(IB_ACCESS_LOCAL_WRITE |
-			 IB_ACCESS_REMOTE_READ |
-			 IB_ACCESS_REMOTE_WRITE|
-			 IB_ACCESS_REMOTE_ATOMIC),
-			&pool->fmr_attr);
-	if (IS_ERR(ibmr->fmr)) {
-		err = PTR_ERR(ibmr->fmr);
-		ibmr->fmr = NULL;
-		printk(KERN_WARNING "RDS/IB: ib_alloc_fmr failed (err=%d)\n", err);
-		goto out_no_cigar;
-	}
-
-	ibmr->pool = pool;
-	if (pool->pool_type == RDS_IB_MR_8K_POOL)
-		rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc);
-	else
-		rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc);
-
-	return ibmr;
-
-out_no_cigar:
-	if (ibmr) {
-		if (ibmr->fmr)
-			ib_dealloc_fmr(ibmr->fmr);
-		kfree(ibmr);
-	}
-	atomic_dec(&pool->item_count);
-	return ERR_PTR(err);
-}
-
-static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr,
-	       struct scatterlist *sg, unsigned int nents)
-{
-	struct ib_device *dev = rds_ibdev->dev;
-	struct scatterlist *scat = sg;
-	u64 io_addr = 0;
-	u64 *dma_pages;
-	u32 len;
-	int page_cnt, sg_dma_len;
-	int i, j;
-	int ret;
-
-	sg_dma_len = ib_dma_map_sg(dev, sg, nents,
-				 DMA_BIDIRECTIONAL);
-	if (unlikely(!sg_dma_len)) {
-		printk(KERN_WARNING "RDS/IB: dma_map_sg failed!\n");
-		return -EBUSY;
-	}
-
-	len = 0;
-	page_cnt = 0;
-
-	for (i = 0; i < sg_dma_len; ++i) {
-		unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
-		u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
-
-		if (dma_addr & ~PAGE_MASK) {
-			if (i > 0)
-				return -EINVAL;
-			else
-				++page_cnt;
-		}
-		if ((dma_addr + dma_len) & ~PAGE_MASK) {
-			if (i < sg_dma_len - 1)
-				return -EINVAL;
-			else
-				++page_cnt;
-		}
-
-		len += dma_len;
-	}
-
-	page_cnt += len >> PAGE_SHIFT;
-	if (page_cnt > ibmr->pool->fmr_attr.max_pages)
-		return -EINVAL;
-
-	dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC,
-				 rdsibdev_to_node(rds_ibdev));
-	if (!dma_pages)
-		return -ENOMEM;
-
-	page_cnt = 0;
-	for (i = 0; i < sg_dma_len; ++i) {
-		unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
-		u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
-
-		for (j = 0; j < dma_len; j += PAGE_SIZE)
-			dma_pages[page_cnt++] =
-				(dma_addr & PAGE_MASK) + j;
-	}
-
-	ret = ib_map_phys_fmr(ibmr->fmr,
-				   dma_pages, page_cnt, io_addr);
-	if (ret)
-		goto out;
-
-	/* Success - we successfully remapped the MR, so we can
-	 * safely tear down the old mapping. */
-	rds_ib_teardown_mr(ibmr);
-
-	ibmr->sg = scat;
-	ibmr->sg_len = nents;
-	ibmr->sg_dma_len = sg_dma_len;
-	ibmr->remap_count++;
-
-	if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
-		rds_ib_stats_inc(s_ib_rdma_mr_8k_used);
-	else
-		rds_ib_stats_inc(s_ib_rdma_mr_1m_used);
-	ret = 0;
-
-out:
-	kfree(dma_pages);
-
-	return ret;
-}
-
 void rds_ib_sync_mr(void *trans_private, int direction)
 {
 	struct rds_ib_mr *ibmr = trans_private;
@@ -518,7 +225,7 @@ void rds_ib_sync_mr(void *trans_private, int direction)
 	}
 }
 
-static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
+void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
 {
 	struct rds_ib_device *rds_ibdev = ibmr->device;
 
@@ -549,7 +256,7 @@ static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
 	}
 }
 
-static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
+void rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
 {
 	unsigned int pinned = ibmr->sg_len;
 
@@ -623,8 +330,8 @@ static void list_to_llist_nodes(struct rds_ib_mr_pool *pool,
  * If the number of MRs allocated exceeds the limit, we also try
  * to free as many MRs as needed to get back to this limit.
  */
-static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
-				int free_all, struct rds_ib_mr **ibmr_ret)
+int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
+			 int free_all, struct rds_ib_mr **ibmr_ret)
 {
 	struct rds_ib_mr *ibmr, *next;
 	struct llist_node *clean_nodes;
@@ -643,7 +350,7 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
 	if (ibmr_ret) {
 		DEFINE_WAIT(wait);
 		while (!mutex_trylock(&pool->flush_lock)) {
-			ibmr = rds_ib_reuse_fmr(pool);
+			ibmr = rds_ib_reuse_mr(pool);
 			if (ibmr) {
 				*ibmr_ret = ibmr;
 				finish_wait(&pool->flush_wait, &wait);
@@ -655,7 +362,7 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
 			if (llist_empty(&pool->clean_list))
 				schedule();
 
-			ibmr = rds_ib_reuse_fmr(pool);
+			ibmr = rds_ib_reuse_mr(pool);
 			if (ibmr) {
 				*ibmr_ret = ibmr;
 				finish_wait(&pool->flush_wait, &wait);
@@ -667,7 +374,7 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
 		mutex_lock(&pool->flush_lock);
 
 	if (ibmr_ret) {
-		ibmr = rds_ib_reuse_fmr(pool);
+		ibmr = rds_ib_reuse_mr(pool);
 		if (ibmr) {
 			*ibmr_ret = ibmr;
 			goto out;
@@ -773,7 +480,7 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
 	/* If we've pinned too many pages, request a flush */
 	if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
 	    atomic_read(&pool->dirty_count) >= pool->max_items / 5)
-		queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);
+		queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10);
 
 	if (invalidate) {
 		if (likely(!in_interrupt())) {
@@ -782,7 +489,7 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
 			/* We get here if the user created a MR marked
 			 * as use_once and invalidate at the same time.
 			 */
-			queue_delayed_work(rds_ib_fmr_wq,
+			queue_delayed_work(rds_ib_mr_wq,
 					   &pool->flush_worker, 10);
 		}
 	}
@@ -849,3 +556,63 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
 	return ibmr;
 }
 
+void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
+{
+	cancel_delayed_work_sync(&pool->flush_worker);
+	rds_ib_flush_mr_pool(pool, 1, NULL);
+	WARN_ON(atomic_read(&pool->item_count));
+	WARN_ON(atomic_read(&pool->free_pinned));
+	kfree(pool);
+}
+
+struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
+					     int pool_type)
+{
+	struct rds_ib_mr_pool *pool;
+
+	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+	if (!pool)
+		return ERR_PTR(-ENOMEM);
+
+	pool->pool_type = pool_type;
+	init_llist_head(&pool->free_list);
+	init_llist_head(&pool->drop_list);
+	init_llist_head(&pool->clean_list);
+	mutex_init(&pool->flush_lock);
+	init_waitqueue_head(&pool->flush_wait);
+	INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
+
+	if (pool_type == RDS_IB_MR_1M_POOL) {
+		/* +1 allows for unaligned MRs */
+		pool->fmr_attr.max_pages = RDS_MR_1M_MSG_SIZE + 1;
+		pool->max_items = RDS_MR_1M_POOL_SIZE;
+	} else {
+		/* pool_type == RDS_IB_MR_8K_POOL */
+		pool->fmr_attr.max_pages = RDS_MR_8K_MSG_SIZE + 1;
+		pool->max_items = RDS_MR_8K_POOL_SIZE;
+	}
+
+	pool->max_free_pinned = pool->max_items * pool->fmr_attr.max_pages / 4;
+	pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
+	pool->fmr_attr.page_shift = PAGE_SHIFT;
+	pool->max_items_soft = rds_ibdev->max_mrs * 3 / 4;
+
+	return pool;
+}
+
+int rds_ib_mr_init(void)
+{
+	rds_ib_mr_wq = create_workqueue("rds_mr_flushd");
+	if (!rds_ib_mr_wq)
+		return -ENOMEM;
+	return 0;
+}
+
+/* By the time this is called all the IB devices should have been torn down and
+ * had their pools freed.  As each pool is freed its work struct is waited on,
+ * so the pool flushing work queue should be idle by the time we get here.
+ */
+void rds_ib_mr_exit(void)
+{
+	destroy_workqueue(rds_ib_mr_wq);
+}
-- 
cgit v1.2.3-59-g8ed1b


From a69365a39cd35d0ad11be7d4a896b2f76a8fde44 Mon Sep 17 00:00:00 2001
From: "santosh.shilimkar@oracle.com" <santosh.shilimkar@oracle.com>
Date: Tue, 1 Mar 2016 15:20:47 -0800
Subject: RDS: IB: create struct rds_ib_fmr

Keep fmr related filed in its own struct. Fastreg MR structure
will be added to the union.

Signed-off-by: Santosh Shilimkar <ssantosh@kernel.org>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/ib_fmr.c  | 17 ++++++++++-------
 net/rds/ib_mr.h   | 11 +++++++++--
 net/rds/ib_rdma.c | 14 ++++++++++----
 3 files changed, 29 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib_fmr.c b/net/rds/ib_fmr.c
index d4f200dc61e8..74f2c21ebc7a 100644
--- a/net/rds/ib_fmr.c
+++ b/net/rds/ib_fmr.c
@@ -36,6 +36,7 @@ struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev, int npages)
 {
 	struct rds_ib_mr_pool *pool;
 	struct rds_ib_mr *ibmr = NULL;
+	struct rds_ib_fmr *fmr;
 	int err = 0, iter = 0;
 
 	if (npages <= RDS_MR_8K_MSG_SIZE)
@@ -99,15 +100,16 @@ struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev, int npages)
 		goto out_no_cigar;
 	}
 
-	ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
+	fmr = &ibmr->u.fmr;
+	fmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
 			(IB_ACCESS_LOCAL_WRITE |
 			 IB_ACCESS_REMOTE_READ |
 			 IB_ACCESS_REMOTE_WRITE |
 			 IB_ACCESS_REMOTE_ATOMIC),
 			&pool->fmr_attr);
-	if (IS_ERR(ibmr->fmr)) {
-		err = PTR_ERR(ibmr->fmr);
-		ibmr->fmr = NULL;
+	if (IS_ERR(fmr->fmr)) {
+		err = PTR_ERR(fmr->fmr);
+		fmr->fmr = NULL;
 		pr_warn("RDS/IB: %s failed (err=%d)\n", __func__, err);
 		goto out_no_cigar;
 	}
@@ -122,8 +124,8 @@ struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev, int npages)
 
 out_no_cigar:
 	if (ibmr) {
-		if (ibmr->fmr)
-			ib_dealloc_fmr(ibmr->fmr);
+		if (fmr->fmr)
+			ib_dealloc_fmr(fmr->fmr);
 		kfree(ibmr);
 	}
 	atomic_dec(&pool->item_count);
@@ -134,6 +136,7 @@ int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr,
 		   struct scatterlist *sg, unsigned int nents)
 {
 	struct ib_device *dev = rds_ibdev->dev;
+	struct rds_ib_fmr *fmr = &ibmr->u.fmr;
 	struct scatterlist *scat = sg;
 	u64 io_addr = 0;
 	u64 *dma_pages;
@@ -190,7 +193,7 @@ int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr,
 				(dma_addr & PAGE_MASK) + j;
 	}
 
-	ret = ib_map_phys_fmr(ibmr->fmr, dma_pages, page_cnt, io_addr);
+	ret = ib_map_phys_fmr(fmr->fmr, dma_pages, page_cnt, io_addr);
 	if (ret)
 		goto out;
 
diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h
index d88724fe9a0b..309ad59bf218 100644
--- a/net/rds/ib_mr.h
+++ b/net/rds/ib_mr.h
@@ -43,11 +43,15 @@
 #define RDS_MR_8K_SCALE			(256 / (RDS_MR_8K_MSG_SIZE + 1))
 #define RDS_MR_8K_POOL_SIZE		(RDS_MR_8K_SCALE * (8192 / 2))
 
+struct rds_ib_fmr {
+	struct ib_fmr		*fmr;
+	u64			*dma;
+};
+
 /* This is stored as mr->r_trans_private. */
 struct rds_ib_mr {
 	struct rds_ib_device	*device;
 	struct rds_ib_mr_pool	*pool;
-	struct ib_fmr		*fmr;
 
 	struct llist_node	llnode;
 
@@ -57,8 +61,11 @@ struct rds_ib_mr {
 
 	struct scatterlist	*sg;
 	unsigned int		sg_len;
-	u64			*dma;
 	int			sg_dma_len;
+
+	union {
+		struct rds_ib_fmr	fmr;
+	} u;
 };
 
 /* Our own little MR pool */
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index c59451938b36..9e608d99a570 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -334,6 +334,7 @@ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
 			 int free_all, struct rds_ib_mr **ibmr_ret)
 {
 	struct rds_ib_mr *ibmr, *next;
+	struct rds_ib_fmr *fmr;
 	struct llist_node *clean_nodes;
 	struct llist_node *clean_tail;
 	LIST_HEAD(unmap_list);
@@ -395,8 +396,10 @@ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
 		goto out;
 
 	/* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
-	list_for_each_entry(ibmr, &unmap_list, unmap_list)
-		list_add(&ibmr->fmr->list, &fmr_list);
+	list_for_each_entry(ibmr, &unmap_list, unmap_list) {
+		fmr = &ibmr->u.fmr;
+		list_add(&fmr->fmr->list, &fmr_list);
+	}
 
 	ret = ib_unmap_fmr(&fmr_list);
 	if (ret)
@@ -405,6 +408,7 @@ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
 	/* Now we can destroy the DMA mapping and unpin any pages */
 	list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) {
 		unpinned += ibmr->sg_len;
+		fmr = &ibmr->u.fmr;
 		__rds_ib_teardown_mr(ibmr);
 		if (nfreed < free_goal ||
 		    ibmr->remap_count >= pool->fmr_attr.max_maps) {
@@ -413,7 +417,7 @@ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
 			else
 				rds_ib_stats_inc(s_ib_rdma_mr_1m_free);
 			list_del(&ibmr->unmap_list);
-			ib_dealloc_fmr(ibmr->fmr);
+			ib_dealloc_fmr(fmr->fmr);
 			kfree(ibmr);
 			nfreed++;
 		}
@@ -517,6 +521,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
 {
 	struct rds_ib_device *rds_ibdev;
 	struct rds_ib_mr *ibmr = NULL;
+	struct rds_ib_fmr *fmr;
 	int ret;
 
 	rds_ibdev = rds_ib_get_device(rs->rs_bound_addr);
@@ -536,9 +541,10 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
 		return ibmr;
 	}
 
+	fmr = &ibmr->u.fmr;
 	ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents);
 	if (ret == 0)
-		*key_ret = ibmr->fmr->rkey;
+		*key_ret = fmr->fmr->rkey;
 	else
 		printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret);
 
-- 
cgit v1.2.3-59-g8ed1b


From 490ea5967b0d94b84747d3327903d31f287f30e5 Mon Sep 17 00:00:00 2001
From: "santosh.shilimkar@oracle.com" <santosh.shilimkar@oracle.com>
Date: Tue, 1 Mar 2016 15:20:48 -0800
Subject: RDS: IB: move FMR code to its own file

No functional change.

Signed-off-by: Santosh Shilimkar <ssantosh@kernel.org>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/ib_fmr.c  | 126 +++++++++++++++++++++++++++++++++---------------------
 net/rds/ib_mr.h   |   6 +++
 net/rds/ib_rdma.c | 108 ++++++++++++++++++++++------------------------
 3 files changed, 134 insertions(+), 106 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib_fmr.c b/net/rds/ib_fmr.c
index 74f2c21ebc7a..4fe8f4fec4ee 100644
--- a/net/rds/ib_fmr.c
+++ b/net/rds/ib_fmr.c
@@ -37,61 +37,16 @@ struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev, int npages)
 	struct rds_ib_mr_pool *pool;
 	struct rds_ib_mr *ibmr = NULL;
 	struct rds_ib_fmr *fmr;
-	int err = 0, iter = 0;
+	int err = 0;
 
 	if (npages <= RDS_MR_8K_MSG_SIZE)
 		pool = rds_ibdev->mr_8k_pool;
 	else
 		pool = rds_ibdev->mr_1m_pool;
 
-	if (atomic_read(&pool->dirty_count) >= pool->max_items / 10)
-		queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10);
-
-	/* Switch pools if one of the pool is reaching upper limit */
-	if (atomic_read(&pool->dirty_count) >=  pool->max_items * 9 / 10) {
-		if (pool->pool_type == RDS_IB_MR_8K_POOL)
-			pool = rds_ibdev->mr_1m_pool;
-		else
-			pool = rds_ibdev->mr_8k_pool;
-	}
-
-	while (1) {
-		ibmr = rds_ib_reuse_mr(pool);
-		if (ibmr)
-			return ibmr;
-
-		/* No clean MRs - now we have the choice of either
-		 * allocating a fresh MR up to the limit imposed by the
-		 * driver, or flush any dirty unused MRs.
-		 * We try to avoid stalling in the send path if possible,
-		 * so we allocate as long as we're allowed to.
-		 *
-		 * We're fussy with enforcing the FMR limit, though. If the
-		 * driver tells us we can't use more than N fmrs, we shouldn't
-		 * start arguing with it
-		 */
-		if (atomic_inc_return(&pool->item_count) <= pool->max_items)
-			break;
-
-		atomic_dec(&pool->item_count);
-
-		if (++iter > 2) {
-			if (pool->pool_type == RDS_IB_MR_8K_POOL)
-				rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted);
-			else
-				rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted);
-			return ERR_PTR(-EAGAIN);
-		}
-
-		/* We do have some empty MRs. Flush them out. */
-		if (pool->pool_type == RDS_IB_MR_8K_POOL)
-			rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_wait);
-		else
-			rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_wait);
-		rds_ib_flush_mr_pool(pool, 0, &ibmr);
-		if (ibmr)
-			return ibmr;
-	}
+	ibmr = rds_ib_try_reuse_ibmr(pool);
+	if (ibmr)
+		return ibmr;
 
 	ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL,
 			    rdsibdev_to_node(rds_ibdev));
@@ -218,3 +173,76 @@ out:
 
 	return ret;
 }
+
+struct rds_ib_mr *rds_ib_reg_fmr(struct rds_ib_device *rds_ibdev,
+				 struct scatterlist *sg,
+				 unsigned long nents,
+				 u32 *key)
+{
+	struct rds_ib_mr *ibmr = NULL;
+	struct rds_ib_fmr *fmr;
+	int ret;
+
+	ibmr = rds_ib_alloc_fmr(rds_ibdev, nents);
+	if (IS_ERR(ibmr))
+		return ibmr;
+
+	ibmr->device = rds_ibdev;
+	fmr = &ibmr->u.fmr;
+	ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents);
+	if (ret == 0)
+		*key = fmr->fmr->rkey;
+	else
+		rds_ib_free_mr(ibmr, 0);
+
+	return ibmr;
+}
+
+void rds_ib_unreg_fmr(struct list_head *list, unsigned int *nfreed,
+		      unsigned long *unpinned, unsigned int goal)
+{
+	struct rds_ib_mr *ibmr, *next;
+	struct rds_ib_fmr *fmr;
+	LIST_HEAD(fmr_list);
+	int ret = 0;
+	unsigned int freed = *nfreed;
+
+	/* String all ib_mr's onto one list and hand them to  ib_unmap_fmr */
+	list_for_each_entry(ibmr, list, unmap_list) {
+		fmr = &ibmr->u.fmr;
+		list_add(&fmr->fmr->list, &fmr_list);
+	}
+
+	ret = ib_unmap_fmr(&fmr_list);
+	if (ret)
+		pr_warn("RDS/IB: FMR invalidation failed (err=%d)\n", ret);
+
+	/* Now we can destroy the DMA mapping and unpin any pages */
+	list_for_each_entry_safe(ibmr, next, list, unmap_list) {
+		fmr = &ibmr->u.fmr;
+		*unpinned += ibmr->sg_len;
+		__rds_ib_teardown_mr(ibmr);
+		if (freed < goal ||
+		    ibmr->remap_count >= ibmr->pool->fmr_attr.max_maps) {
+			if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
+				rds_ib_stats_inc(s_ib_rdma_mr_8k_free);
+			else
+				rds_ib_stats_inc(s_ib_rdma_mr_1m_free);
+			list_del(&ibmr->unmap_list);
+			ib_dealloc_fmr(fmr->fmr);
+			kfree(ibmr);
+			freed++;
+		}
+	}
+	*nfreed = freed;
+}
+
+void rds_ib_free_fmr_list(struct rds_ib_mr *ibmr)
+{
+	struct rds_ib_mr_pool *pool = ibmr->pool;
+
+	if (ibmr->remap_count >= pool->fmr_attr.max_maps)
+		llist_add(&ibmr->llnode, &pool->drop_list);
+	else
+		llist_add(&ibmr->llnode, &pool->free_list);
+}
diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h
index 309ad59bf218..f5c1fcb4b91e 100644
--- a/net/rds/ib_mr.h
+++ b/net/rds/ib_mr.h
@@ -113,4 +113,10 @@ int rds_ib_map_fmr(struct rds_ib_device *, struct rds_ib_mr *,
 		   struct scatterlist *, unsigned int);
 struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *);
 int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *, int, struct rds_ib_mr **);
+struct rds_ib_mr *rds_ib_reg_fmr(struct rds_ib_device *, struct scatterlist *,
+				 unsigned long, u32 *);
+struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *);
+void rds_ib_unreg_fmr(struct list_head *, unsigned int *,
+		      unsigned long *, unsigned int);
+void rds_ib_free_fmr_list(struct rds_ib_mr *);
 #endif
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 9e608d99a570..0e848433597f 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -333,15 +333,12 @@ static void list_to_llist_nodes(struct rds_ib_mr_pool *pool,
 int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
 			 int free_all, struct rds_ib_mr **ibmr_ret)
 {
-	struct rds_ib_mr *ibmr, *next;
-	struct rds_ib_fmr *fmr;
+	struct rds_ib_mr *ibmr;
 	struct llist_node *clean_nodes;
 	struct llist_node *clean_tail;
 	LIST_HEAD(unmap_list);
-	LIST_HEAD(fmr_list);
 	unsigned long unpinned = 0;
 	unsigned int nfreed = 0, dirty_to_clean = 0, free_goal;
-	int ret = 0;
 
 	if (pool->pool_type == RDS_IB_MR_8K_POOL)
 		rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_flush);
@@ -395,33 +392,7 @@ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
 	if (list_empty(&unmap_list))
 		goto out;
 
-	/* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
-	list_for_each_entry(ibmr, &unmap_list, unmap_list) {
-		fmr = &ibmr->u.fmr;
-		list_add(&fmr->fmr->list, &fmr_list);
-	}
-
-	ret = ib_unmap_fmr(&fmr_list);
-	if (ret)
-		printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret);
-
-	/* Now we can destroy the DMA mapping and unpin any pages */
-	list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) {
-		unpinned += ibmr->sg_len;
-		fmr = &ibmr->u.fmr;
-		__rds_ib_teardown_mr(ibmr);
-		if (nfreed < free_goal ||
-		    ibmr->remap_count >= pool->fmr_attr.max_maps) {
-			if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
-				rds_ib_stats_inc(s_ib_rdma_mr_8k_free);
-			else
-				rds_ib_stats_inc(s_ib_rdma_mr_1m_free);
-			list_del(&ibmr->unmap_list);
-			ib_dealloc_fmr(fmr->fmr);
-			kfree(ibmr);
-			nfreed++;
-		}
-	}
+	rds_ib_unreg_fmr(&unmap_list, &nfreed, &unpinned, free_goal);
 
 	if (!list_empty(&unmap_list)) {
 		/* we have to make sure that none of the things we're about
@@ -454,7 +425,47 @@ out:
 	if (waitqueue_active(&pool->flush_wait))
 		wake_up(&pool->flush_wait);
 out_nolock:
-	return ret;
+	return 0;
+}
+
+struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *pool)
+{
+	struct rds_ib_mr *ibmr = NULL;
+	int iter = 0;
+
+	if (atomic_read(&pool->dirty_count) >= pool->max_items_soft / 10)
+		queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10);
+
+	while (1) {
+		ibmr = rds_ib_reuse_mr(pool);
+		if (ibmr)
+			return ibmr;
+
+		if (atomic_inc_return(&pool->item_count) <= pool->max_items)
+			break;
+
+		atomic_dec(&pool->item_count);
+
+		if (++iter > 2) {
+			if (pool->pool_type == RDS_IB_MR_8K_POOL)
+				rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted);
+			else
+				rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted);
+			return ERR_PTR(-EAGAIN);
+		}
+
+		/* We do have some empty MRs. Flush them out. */
+		if (pool->pool_type == RDS_IB_MR_8K_POOL)
+			rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_wait);
+		else
+			rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_wait);
+
+		rds_ib_flush_mr_pool(pool, 0, &ibmr);
+		if (ibmr)
+			return ibmr;
+	}
+
+	return ibmr;
 }
 
 static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
@@ -473,10 +484,7 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
 	rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
 
 	/* Return it to the pool's free list */
-	if (ibmr->remap_count >= pool->fmr_attr.max_maps)
-		llist_add(&ibmr->llnode, &pool->drop_list);
-	else
-		llist_add(&ibmr->llnode, &pool->free_list);
+	rds_ib_free_fmr_list(ibmr);
 
 	atomic_add(ibmr->sg_len, &pool->free_pinned);
 	atomic_inc(&pool->dirty_count);
@@ -521,7 +529,6 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
 {
 	struct rds_ib_device *rds_ibdev;
 	struct rds_ib_mr *ibmr = NULL;
-	struct rds_ib_fmr *fmr;
 	int ret;
 
 	rds_ibdev = rds_ib_get_device(rs->rs_bound_addr);
@@ -535,30 +542,17 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
 		goto out;
 	}
 
-	ibmr = rds_ib_alloc_fmr(rds_ibdev, nents);
-	if (IS_ERR(ibmr)) {
-		rds_ib_dev_put(rds_ibdev);
-		return ibmr;
-	}
-
-	fmr = &ibmr->u.fmr;
-	ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents);
-	if (ret == 0)
-		*key_ret = fmr->fmr->rkey;
-	else
-		printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret);
-
-	ibmr->device = rds_ibdev;
-	rds_ibdev = NULL;
+	ibmr = rds_ib_reg_fmr(rds_ibdev, sg, nents, key_ret);
+	if (ibmr)
+		rds_ibdev = NULL;
 
  out:
-	if (ret) {
-		if (ibmr)
-			rds_ib_free_mr(ibmr, 0);
-		ibmr = ERR_PTR(ret);
-	}
+	if (!ibmr)
+		pr_warn("RDS/IB: rds_ib_get_mr failed (errno=%d)\n", ret);
+
 	if (rds_ibdev)
 		rds_ib_dev_put(rds_ibdev);
+
 	return ibmr;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From d4de76da5c39582aa14a94186c35b4aec6ef292c Mon Sep 17 00:00:00 2001
From: "santosh.shilimkar@oracle.com" <santosh.shilimkar@oracle.com>
Date: Tue, 1 Mar 2016 15:20:49 -0800
Subject: RDS: IB: add connection info to ibmr

Preperatory patch for FRMR support. From connection info,
we can retrieve cm_id which contains qp handled needed for
work request posting.

We also need to drop the RDS connection on QP error states
where connection handle becomes useful.

Signed-off-by: Santosh Shilimkar <ssantosh@kernel.org>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/ib_mr.h | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h
index f5c1fcb4b91e..add7725bc062 100644
--- a/net/rds/ib_mr.h
+++ b/net/rds/ib_mr.h
@@ -50,18 +50,19 @@ struct rds_ib_fmr {
 
 /* This is stored as mr->r_trans_private. */
 struct rds_ib_mr {
-	struct rds_ib_device	*device;
-	struct rds_ib_mr_pool	*pool;
+	struct rds_ib_device		*device;
+	struct rds_ib_mr_pool		*pool;
+	struct rds_ib_connection	*ic;
 
-	struct llist_node	llnode;
+	struct llist_node		llnode;
 
 	/* unmap_list is for freeing */
-	struct list_head	unmap_list;
-	unsigned int		remap_count;
+	struct list_head		unmap_list;
+	unsigned int			remap_count;
 
-	struct scatterlist	*sg;
-	unsigned int		sg_len;
-	int			sg_dma_len;
+	struct scatterlist		*sg;
+	unsigned int			sg_len;
+	int				sg_dma_len;
 
 	union {
 		struct rds_ib_fmr	fmr;
-- 
cgit v1.2.3-59-g8ed1b


From 37ea401e9ccc44453da6785514bb64ad6e9c181f Mon Sep 17 00:00:00 2001
From: "santosh.shilimkar@oracle.com" <santosh.shilimkar@oracle.com>
Date: Tue, 1 Mar 2016 15:20:50 -0800
Subject: RDS: IB: handle the RDMA CM time wait event

Drop the RDS connection on RDMA_CM_EVENT_TIMEWAIT_EXIT so that
it can reconnect and resume.

While testing fastreg, this error happened in couple of tests but
was getting un-noticed.

Signed-off-by: Santosh Shilimkar <ssantosh@kernel.org>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/rdma_transport.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'net')

diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index 4f4b3d88319d..7220bebcf558 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -117,6 +117,14 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
 		rds_conn_drop(conn);
 		break;
 
+	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
+		if (conn) {
+			pr_info("RDS: RDMA_CM_EVENT_TIMEWAIT_EXIT event: dropping connection %pI4->%pI4\n",
+				&conn->c_laddr, &conn->c_faddr);
+			rds_conn_drop(conn);
+		}
+		break;
+
 	default:
 		/* things like device disconnect? */
 		printk(KERN_ERR "RDS: unknown event %u (%s)!\n",
-- 
cgit v1.2.3-59-g8ed1b


From db42753adb638b63572583162bb08ea193947309 Mon Sep 17 00:00:00 2001
From: "santosh.shilimkar@oracle.com" <santosh.shilimkar@oracle.com>
Date: Tue, 1 Mar 2016 15:20:51 -0800
Subject: RDS: IB: add mr reused stats

Add MR reuse statistics to RDS IB transport.

Signed-off-by: Santosh Shilimkar <ssantosh@kernel.org>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/ib.h       | 2 ++
 net/rds/ib_rdma.c  | 7 ++++++-
 net/rds/ib_stats.c | 2 ++
 3 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rds/ib.h b/net/rds/ib.h
index c88cb229db16..62fe7d5330ad 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -259,6 +259,8 @@ struct rds_ib_statistics {
 	uint64_t	s_ib_rdma_mr_1m_pool_flush;
 	uint64_t	s_ib_rdma_mr_1m_pool_wait;
 	uint64_t	s_ib_rdma_mr_1m_pool_depleted;
+	uint64_t	s_ib_rdma_mr_8k_reused;
+	uint64_t	s_ib_rdma_mr_1m_reused;
 	uint64_t	s_ib_atomic_cswp;
 	uint64_t	s_ib_atomic_fadd;
 };
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 0e848433597f..ec7ea32265c0 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -188,8 +188,13 @@ struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool)
 	flag = this_cpu_ptr(&clean_list_grace);
 	set_bit(CLEAN_LIST_BUSY_BIT, flag);
 	ret = llist_del_first(&pool->clean_list);
-	if (ret)
+	if (ret) {
 		ibmr = llist_entry(ret, struct rds_ib_mr, llnode);
+		if (pool->pool_type == RDS_IB_MR_8K_POOL)
+			rds_ib_stats_inc(s_ib_rdma_mr_8k_reused);
+		else
+			rds_ib_stats_inc(s_ib_rdma_mr_1m_reused);
+	}
 
 	clear_bit(CLEAN_LIST_BUSY_BIT, flag);
 	preempt_enable();
diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c
index d77e04473056..7e78dca1f252 100644
--- a/net/rds/ib_stats.c
+++ b/net/rds/ib_stats.c
@@ -73,6 +73,8 @@ static const char *const rds_ib_stat_names[] = {
 	"ib_rdma_mr_1m_pool_flush",
 	"ib_rdma_mr_1m_pool_wait",
 	"ib_rdma_mr_1m_pool_depleted",
+	"ib_rdma_mr_8k_reused",
+	"ib_rdma_mr_1m_reused",
 	"ib_atomic_cswp",
 	"ib_atomic_fadd",
 };
-- 
cgit v1.2.3-59-g8ed1b


From 2cb2912d65633e751d3f8397377174501412aa47 Mon Sep 17 00:00:00 2001
From: "santosh.shilimkar@oracle.com" <santosh.shilimkar@oracle.com>
Date: Tue, 1 Mar 2016 15:20:52 -0800
Subject: RDS: IB: add Fastreg MR (FRMR) detection support

Discovere Fast Memmory Registration support using IB device
IB_DEVICE_MEM_MGT_EXTENSIONS. Certain HCA might support just FRMR
or FMR or both FMR and FRWR. In case both mr type are supported,
default FMR is used.

Default MR is still kept as FMR against what everyone else
is following. Default will be changed to FRMR once the
RDS performance with FRMR is comparable with FMR. The
work is in progress for the same.

Signed-off-by: Santosh Shilimkar <ssantosh@kernel.org>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/ib.c    | 10 ++++++++++
 net/rds/ib.h    |  4 ++++
 net/rds/ib_mr.h |  1 +
 3 files changed, 15 insertions(+)

(limited to 'net')

diff --git a/net/rds/ib.c b/net/rds/ib.c
index bb32cb9d85f1..b5342fddaf98 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -140,6 +140,12 @@ static void rds_ib_add_one(struct ib_device *device)
 	rds_ibdev->max_wrs = device->attrs.max_qp_wr;
 	rds_ibdev->max_sge = min(device->attrs.max_sge, RDS_IB_MAX_SGE);
 
+	rds_ibdev->has_fr = (device->attrs.device_cap_flags &
+				  IB_DEVICE_MEM_MGT_EXTENSIONS);
+	rds_ibdev->has_fmr = (device->alloc_fmr && device->dealloc_fmr &&
+			    device->map_phys_fmr && device->unmap_fmr);
+	rds_ibdev->use_fastreg = (rds_ibdev->has_fr && !rds_ibdev->has_fmr);
+
 	rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32;
 	rds_ibdev->max_1m_mrs = device->attrs.max_mr ?
 		min_t(unsigned int, (device->attrs.max_mr / 2),
@@ -178,6 +184,10 @@ static void rds_ib_add_one(struct ib_device *device)
 		 rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_mrs,
 		 rds_ibdev->max_8k_mrs);
 
+	pr_info("RDS/IB: %s: %s supported and preferred\n",
+		device->name,
+		rds_ibdev->use_fastreg ? "FRMR" : "FMR");
+
 	INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
 	INIT_LIST_HEAD(&rds_ibdev->conn_list);
 
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 62fe7d5330ad..c5eddc2086ff 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -200,6 +200,10 @@ struct rds_ib_device {
 	struct list_head	conn_list;
 	struct ib_device	*dev;
 	struct ib_pd		*pd;
+	bool                    has_fmr;
+	bool                    has_fr;
+	bool                    use_fastreg;
+
 	unsigned int		max_mrs;
 	struct rds_ib_mr_pool	*mr_1m_pool;
 	struct rds_ib_mr_pool   *mr_8k_pool;
diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h
index add7725bc062..2f9b9c3afc59 100644
--- a/net/rds/ib_mr.h
+++ b/net/rds/ib_mr.h
@@ -93,6 +93,7 @@ struct rds_ib_mr_pool {
 extern struct workqueue_struct *rds_ib_mr_wq;
 extern unsigned int rds_ib_mr_1m_pool_size;
 extern unsigned int rds_ib_mr_8k_pool_size;
+extern bool prefer_frmr;
 
 struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev,
 					     int npages);
-- 
cgit v1.2.3-59-g8ed1b


From ad6832f950d35df8c70b577993a24b31b34d88e4 Mon Sep 17 00:00:00 2001
From: "santosh.shilimkar@oracle.com" <santosh.shilimkar@oracle.com>
Date: Tue, 1 Mar 2016 15:20:53 -0800
Subject: RDS: IB: allocate extra space on queues for FRMR support

Fastreg MR(FRMR) memory registration and invalidation makes use
of work request and completion queues for its operation. Patch
allocates extra queue space towards these operation(s).

Signed-off-by: Santosh Shilimkar <ssantosh@kernel.org>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/ib.h    |  4 ++++
 net/rds/ib_cm.c | 16 ++++++++++++----
 2 files changed, 16 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib.h b/net/rds/ib.h
index c5eddc2086ff..eeb0d6c85cb3 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -14,6 +14,7 @@
 
 #define RDS_IB_DEFAULT_RECV_WR		1024
 #define RDS_IB_DEFAULT_SEND_WR		256
+#define RDS_IB_DEFAULT_FR_WR		512
 
 #define RDS_IB_DEFAULT_RETRY_COUNT	2
 
@@ -122,6 +123,9 @@ struct rds_ib_connection {
 	struct ib_wc		i_send_wc[RDS_IB_WC_MAX];
 	struct ib_wc		i_recv_wc[RDS_IB_WC_MAX];
 
+	/* To control the number of wrs from fastreg */
+	atomic_t		i_fastreg_wrs;
+
 	/* interrupt handling */
 	struct tasklet_struct	i_send_tasklet;
 	struct tasklet_struct	i_recv_tasklet;
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 7f68abc8a5bf..83f4673970e7 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -363,7 +363,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 	struct ib_qp_init_attr attr;
 	struct ib_cq_init_attr cq_attr = {};
 	struct rds_ib_device *rds_ibdev;
-	int ret;
+	int ret, fr_queue_space;
 
 	/*
 	 * It's normal to see a null device if an incoming connection races
@@ -373,6 +373,12 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 	if (!rds_ibdev)
 		return -EOPNOTSUPP;
 
+	/* The fr_queue_space is currently set to 512, to add extra space on
+	 * completion queue and send queue. This extra space is used for FRMR
+	 * registration and invalidation work requests
+	 */
+	fr_queue_space = (rds_ibdev->use_fastreg ? RDS_IB_DEFAULT_FR_WR : 0);
+
 	/* add the conn now so that connection establishment has the dev */
 	rds_ib_add_conn(rds_ibdev, conn);
 
@@ -384,7 +390,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 	/* Protection domain and memory range */
 	ic->i_pd = rds_ibdev->pd;
 
-	cq_attr.cqe = ic->i_send_ring.w_nr + 1;
+	cq_attr.cqe = ic->i_send_ring.w_nr + fr_queue_space + 1;
 
 	ic->i_send_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_send,
 				     rds_ib_cq_event_handler, conn,
@@ -424,7 +430,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 	attr.event_handler = rds_ib_qp_event_handler;
 	attr.qp_context = conn;
 	/* + 1 to allow for the single ack message */
-	attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1;
+	attr.cap.max_send_wr = ic->i_send_ring.w_nr + fr_queue_space + 1;
 	attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1;
 	attr.cap.max_send_sge = rds_ibdev->max_sge;
 	attr.cap.max_recv_sge = RDS_IB_RECV_SGE;
@@ -432,6 +438,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 	attr.qp_type = IB_QPT_RC;
 	attr.send_cq = ic->i_send_cq;
 	attr.recv_cq = ic->i_recv_cq;
+	atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR);
 
 	/*
 	 * XXX this can fail if max_*_wr is too large?  Are we supposed
@@ -751,7 +758,8 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
 		 */
 		wait_event(rds_ib_ring_empty_wait,
 			   rds_ib_ring_empty(&ic->i_recv_ring) &&
-			   (atomic_read(&ic->i_signaled_sends) == 0));
+			   (atomic_read(&ic->i_signaled_sends) == 0) &&
+			   (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR));
 		tasklet_kill(&ic->i_send_tasklet);
 		tasklet_kill(&ic->i_recv_tasklet);
 
-- 
cgit v1.2.3-59-g8ed1b


From 1659185fb4d0025835eb2058a141f0746c5cab00 Mon Sep 17 00:00:00 2001
From: Avinash Repaka <avinash.repaka@oracle.com>
Date: Tue, 1 Mar 2016 15:20:54 -0800
Subject: RDS: IB: Support Fastreg MR (FRMR) memory registration mode

Fastreg MR(FRMR) is another method with which one can
register memory to HCA. Some of the newer HCAs supports only fastreg
mr mode, so we need to add support for it to have RDS functional
on them.

Signed-off-by: Santosh Shilimkar <ssantosh@kernel.org>
Signed-off-by: Avinash Repaka <avinash.repaka@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/Makefile  |   2 +-
 net/rds/ib.h      |   1 +
 net/rds/ib_cm.c   |   7 +-
 net/rds/ib_frmr.c | 376 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/rds/ib_mr.h   |  24 ++++
 net/rds/ib_rdma.c |  17 ++-
 6 files changed, 422 insertions(+), 5 deletions(-)
 create mode 100644 net/rds/ib_frmr.c

(limited to 'net')

diff --git a/net/rds/Makefile b/net/rds/Makefile
index bcf5591b493a..0e72bec1529f 100644
--- a/net/rds/Makefile
+++ b/net/rds/Makefile
@@ -6,7 +6,7 @@ rds-y :=	af_rds.o bind.o cong.o connection.o info.o message.o   \
 obj-$(CONFIG_RDS_RDMA) += rds_rdma.o
 rds_rdma-y :=	rdma_transport.o \
 			ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \
-			ib_sysctl.o ib_rdma.o ib_fmr.o
+			ib_sysctl.o ib_rdma.o ib_fmr.o ib_frmr.o
 
 
 obj-$(CONFIG_RDS_TCP) += rds_tcp.o
diff --git a/net/rds/ib.h b/net/rds/ib.h
index eeb0d6c85cb3..627fb79aee65 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -349,6 +349,7 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
 void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
 void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
 void rds_ib_destroy_nodev_conns(void);
+void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc);
 
 /* ib_recv.c */
 int rds_ib_recv_init(void);
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 83f4673970e7..8764970f0c24 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -249,7 +249,12 @@ static void poll_scq(struct rds_ib_connection *ic, struct ib_cq *cq,
 				 (unsigned long long)wc->wr_id, wc->status,
 				 wc->byte_len, be32_to_cpu(wc->ex.imm_data));
 
-			rds_ib_send_cqe_handler(ic, wc);
+			if (wc->wr_id <= ic->i_send_ring.w_nr ||
+			    wc->wr_id == RDS_IB_ACK_WR_ID)
+				rds_ib_send_cqe_handler(ic, wc);
+			else
+				rds_ib_mr_cqe_handler(ic, wc);
+
 		}
 	}
 }
diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
new file mode 100644
index 000000000000..93ff038ea9d1
--- /dev/null
+++ b/net/rds/ib_frmr.c
@@ -0,0 +1,376 @@
+/*
+ * Copyright (c) 2016 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ib_mr.h"
+
+static struct rds_ib_mr *rds_ib_alloc_frmr(struct rds_ib_device *rds_ibdev,
+					   int npages)
+{
+	struct rds_ib_mr_pool *pool;
+	struct rds_ib_mr *ibmr = NULL;
+	struct rds_ib_frmr *frmr;
+	int err = 0;
+
+	if (npages <= RDS_MR_8K_MSG_SIZE)
+		pool = rds_ibdev->mr_8k_pool;
+	else
+		pool = rds_ibdev->mr_1m_pool;
+
+	ibmr = rds_ib_try_reuse_ibmr(pool);
+	if (ibmr)
+		return ibmr;
+
+	ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL,
+			    rdsibdev_to_node(rds_ibdev));
+	if (!ibmr) {
+		err = -ENOMEM;
+		goto out_no_cigar;
+	}
+
+	frmr = &ibmr->u.frmr;
+	frmr->mr = ib_alloc_mr(rds_ibdev->pd, IB_MR_TYPE_MEM_REG,
+			 pool->fmr_attr.max_pages);
+	if (IS_ERR(frmr->mr)) {
+		pr_warn("RDS/IB: %s failed to allocate MR", __func__);
+		goto out_no_cigar;
+	}
+
+	ibmr->pool = pool;
+	if (pool->pool_type == RDS_IB_MR_8K_POOL)
+		rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc);
+	else
+		rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc);
+
+	if (atomic_read(&pool->item_count) > pool->max_items_soft)
+		pool->max_items_soft = pool->max_items;
+
+	frmr->fr_state = FRMR_IS_FREE;
+	return ibmr;
+
+out_no_cigar:
+	kfree(ibmr);
+	atomic_dec(&pool->item_count);
+	return ERR_PTR(err);
+}
+
+static void rds_ib_free_frmr(struct rds_ib_mr *ibmr, bool drop)
+{
+	struct rds_ib_mr_pool *pool = ibmr->pool;
+
+	if (drop)
+		llist_add(&ibmr->llnode, &pool->drop_list);
+	else
+		llist_add(&ibmr->llnode, &pool->free_list);
+	atomic_add(ibmr->sg_len, &pool->free_pinned);
+	atomic_inc(&pool->dirty_count);
+
+	/* If we've pinned too many pages, request a flush */
+	if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
+	    atomic_read(&pool->dirty_count) >= pool->max_items / 5)
+		queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10);
+}
+
+static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
+{
+	struct rds_ib_frmr *frmr = &ibmr->u.frmr;
+	struct ib_send_wr *failed_wr;
+	struct ib_reg_wr reg_wr;
+	int ret;
+
+	while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) {
+		atomic_inc(&ibmr->ic->i_fastreg_wrs);
+		cpu_relax();
+	}
+
+	ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, PAGE_SIZE);
+	if (unlikely(ret != ibmr->sg_len))
+		return ret < 0 ? ret : -EINVAL;
+
+	/* Perform a WR for the fast_reg_mr. Each individual page
+	 * in the sg list is added to the fast reg page list and placed
+	 * inside the fast_reg_mr WR.  The key used is a rolling 8bit
+	 * counter, which should guarantee uniqueness.
+	 */
+	ib_update_fast_reg_key(frmr->mr, ibmr->remap_count++);
+	frmr->fr_state = FRMR_IS_INUSE;
+
+	memset(&reg_wr, 0, sizeof(reg_wr));
+	reg_wr.wr.wr_id = (unsigned long)(void *)ibmr;
+	reg_wr.wr.opcode = IB_WR_REG_MR;
+	reg_wr.wr.num_sge = 0;
+	reg_wr.mr = frmr->mr;
+	reg_wr.key = frmr->mr->rkey;
+	reg_wr.access = IB_ACCESS_LOCAL_WRITE |
+			IB_ACCESS_REMOTE_READ |
+			IB_ACCESS_REMOTE_WRITE;
+	reg_wr.wr.send_flags = IB_SEND_SIGNALED;
+
+	failed_wr = &reg_wr.wr;
+	ret = ib_post_send(ibmr->ic->i_cm_id->qp, &reg_wr.wr, &failed_wr);
+	WARN_ON(failed_wr != &reg_wr.wr);
+	if (unlikely(ret)) {
+		/* Failure here can be because of -ENOMEM as well */
+		frmr->fr_state = FRMR_IS_STALE;
+		atomic_inc(&ibmr->ic->i_fastreg_wrs);
+		if (printk_ratelimit())
+			pr_warn("RDS/IB: %s returned error(%d)\n",
+				__func__, ret);
+	}
+	return ret;
+}
+
+static int rds_ib_map_frmr(struct rds_ib_device *rds_ibdev,
+			   struct rds_ib_mr_pool *pool,
+			   struct rds_ib_mr *ibmr,
+			   struct scatterlist *sg, unsigned int sg_len)
+{
+	struct ib_device *dev = rds_ibdev->dev;
+	struct rds_ib_frmr *frmr = &ibmr->u.frmr;
+	int i;
+	u32 len;
+	int ret = 0;
+
+	/* We want to teardown old ibmr values here and fill it up with
+	 * new sg values
+	 */
+	rds_ib_teardown_mr(ibmr);
+
+	ibmr->sg = sg;
+	ibmr->sg_len = sg_len;
+	ibmr->sg_dma_len = 0;
+	frmr->sg_byte_len = 0;
+	WARN_ON(ibmr->sg_dma_len);
+	ibmr->sg_dma_len = ib_dma_map_sg(dev, ibmr->sg, ibmr->sg_len,
+					 DMA_BIDIRECTIONAL);
+	if (unlikely(!ibmr->sg_dma_len)) {
+		pr_warn("RDS/IB: %s failed!\n", __func__);
+		return -EBUSY;
+	}
+
+	frmr->sg_byte_len = 0;
+	frmr->dma_npages = 0;
+	len = 0;
+
+	ret = -EINVAL;
+	for (i = 0; i < ibmr->sg_dma_len; ++i) {
+		unsigned int dma_len = ib_sg_dma_len(dev, &ibmr->sg[i]);
+		u64 dma_addr = ib_sg_dma_address(dev, &ibmr->sg[i]);
+
+		frmr->sg_byte_len += dma_len;
+		if (dma_addr & ~PAGE_MASK) {
+			if (i > 0)
+				goto out_unmap;
+			else
+				++frmr->dma_npages;
+		}
+
+		if ((dma_addr + dma_len) & ~PAGE_MASK) {
+			if (i < ibmr->sg_dma_len - 1)
+				goto out_unmap;
+			else
+				++frmr->dma_npages;
+		}
+
+		len += dma_len;
+	}
+	frmr->dma_npages += len >> PAGE_SHIFT;
+
+	if (frmr->dma_npages > ibmr->pool->fmr_attr.max_pages) {
+		ret = -EMSGSIZE;
+		goto out_unmap;
+	}
+
+	ret = rds_ib_post_reg_frmr(ibmr);
+	if (ret)
+		goto out_unmap;
+
+	if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
+		rds_ib_stats_inc(s_ib_rdma_mr_8k_used);
+	else
+		rds_ib_stats_inc(s_ib_rdma_mr_1m_used);
+
+	return ret;
+
+out_unmap:
+	ib_dma_unmap_sg(rds_ibdev->dev, ibmr->sg, ibmr->sg_len,
+			DMA_BIDIRECTIONAL);
+	ibmr->sg_dma_len = 0;
+	return ret;
+}
+
+static int rds_ib_post_inv(struct rds_ib_mr *ibmr)
+{
+	struct ib_send_wr *s_wr, *failed_wr;
+	struct rds_ib_frmr *frmr = &ibmr->u.frmr;
+	struct rdma_cm_id *i_cm_id = ibmr->ic->i_cm_id;
+	int ret = -EINVAL;
+
+	if (!i_cm_id || !i_cm_id->qp || !frmr->mr)
+		goto out;
+
+	if (frmr->fr_state != FRMR_IS_INUSE)
+		goto out;
+
+	while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) {
+		atomic_inc(&ibmr->ic->i_fastreg_wrs);
+		cpu_relax();
+	}
+
+	frmr->fr_inv = true;
+	s_wr = &frmr->fr_wr;
+
+	memset(s_wr, 0, sizeof(*s_wr));
+	s_wr->wr_id = (unsigned long)(void *)ibmr;
+	s_wr->opcode = IB_WR_LOCAL_INV;
+	s_wr->ex.invalidate_rkey = frmr->mr->rkey;
+	s_wr->send_flags = IB_SEND_SIGNALED;
+
+	failed_wr = s_wr;
+	ret = ib_post_send(i_cm_id->qp, s_wr, &failed_wr);
+	WARN_ON(failed_wr != s_wr);
+	if (unlikely(ret)) {
+		frmr->fr_state = FRMR_IS_STALE;
+		frmr->fr_inv = false;
+		atomic_inc(&ibmr->ic->i_fastreg_wrs);
+		pr_err("RDS/IB: %s returned error(%d)\n", __func__, ret);
+		goto out;
+	}
+out:
+	return ret;
+}
+
+void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
+{
+	struct rds_ib_mr *ibmr = (void *)(unsigned long)wc->wr_id;
+	struct rds_ib_frmr *frmr = &ibmr->u.frmr;
+
+	if (wc->status != IB_WC_SUCCESS) {
+		frmr->fr_state = FRMR_IS_STALE;
+		if (rds_conn_up(ic->conn))
+			rds_ib_conn_error(ic->conn,
+					  "frmr completion <%pI4,%pI4> status %u(%s), vendor_err 0x%x, disconnecting and reconnecting\n",
+					  &ic->conn->c_laddr,
+					  &ic->conn->c_faddr,
+					  wc->status,
+					  ib_wc_status_msg(wc->status),
+					  wc->vendor_err);
+	}
+
+	if (frmr->fr_inv) {
+		frmr->fr_state = FRMR_IS_FREE;
+		frmr->fr_inv = false;
+	}
+
+	atomic_inc(&ic->i_fastreg_wrs);
+}
+
+void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed,
+		       unsigned long *unpinned, unsigned int goal)
+{
+	struct rds_ib_mr *ibmr, *next;
+	struct rds_ib_frmr *frmr;
+	int ret = 0;
+	unsigned int freed = *nfreed;
+
+	/* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
+	list_for_each_entry(ibmr, list, unmap_list) {
+		if (ibmr->sg_dma_len)
+			ret |= rds_ib_post_inv(ibmr);
+	}
+	if (ret)
+		pr_warn("RDS/IB: %s failed (err=%d)\n", __func__, ret);
+
+	/* Now we can destroy the DMA mapping and unpin any pages */
+	list_for_each_entry_safe(ibmr, next, list, unmap_list) {
+		*unpinned += ibmr->sg_len;
+		frmr = &ibmr->u.frmr;
+		__rds_ib_teardown_mr(ibmr);
+		if (freed < goal || frmr->fr_state == FRMR_IS_STALE) {
+			/* Don't de-allocate if the MR is not free yet */
+			if (frmr->fr_state == FRMR_IS_INUSE)
+				continue;
+
+			if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
+				rds_ib_stats_inc(s_ib_rdma_mr_8k_free);
+			else
+				rds_ib_stats_inc(s_ib_rdma_mr_1m_free);
+			list_del(&ibmr->unmap_list);
+			if (frmr->mr)
+				ib_dereg_mr(frmr->mr);
+			kfree(ibmr);
+			freed++;
+		}
+	}
+	*nfreed = freed;
+}
+
+struct rds_ib_mr *rds_ib_reg_frmr(struct rds_ib_device *rds_ibdev,
+				  struct rds_ib_connection *ic,
+				  struct scatterlist *sg,
+				  unsigned long nents, u32 *key)
+{
+	struct rds_ib_mr *ibmr = NULL;
+	struct rds_ib_frmr *frmr;
+	int ret;
+
+	do {
+		if (ibmr)
+			rds_ib_free_frmr(ibmr, true);
+		ibmr = rds_ib_alloc_frmr(rds_ibdev, nents);
+		if (IS_ERR(ibmr))
+			return ibmr;
+		frmr = &ibmr->u.frmr;
+	} while (frmr->fr_state != FRMR_IS_FREE);
+
+	ibmr->ic = ic;
+	ibmr->device = rds_ibdev;
+	ret = rds_ib_map_frmr(rds_ibdev, ibmr->pool, ibmr, sg, nents);
+	if (ret == 0) {
+		*key = frmr->mr->rkey;
+	} else {
+		rds_ib_free_frmr(ibmr, false);
+		ibmr = ERR_PTR(ret);
+	}
+
+	return ibmr;
+}
+
+void rds_ib_free_frmr_list(struct rds_ib_mr *ibmr)
+{
+	struct rds_ib_mr_pool *pool = ibmr->pool;
+	struct rds_ib_frmr *frmr = &ibmr->u.frmr;
+
+	if (frmr->fr_state == FRMR_IS_STALE)
+		llist_add(&ibmr->llnode, &pool->drop_list);
+	else
+		llist_add(&ibmr->llnode, &pool->free_list);
+}
diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h
index 2f9b9c3afc59..1c754f4acbe5 100644
--- a/net/rds/ib_mr.h
+++ b/net/rds/ib_mr.h
@@ -48,6 +48,21 @@ struct rds_ib_fmr {
 	u64			*dma;
 };
 
+enum rds_ib_fr_state {
+	FRMR_IS_FREE,	/* mr invalidated & ready for use */
+	FRMR_IS_INUSE,	/* mr is in use or used & can be invalidated */
+	FRMR_IS_STALE,	/* Stale MR and needs to be dropped  */
+};
+
+struct rds_ib_frmr {
+	struct ib_mr		*mr;
+	enum rds_ib_fr_state	fr_state;
+	bool			fr_inv;
+	struct ib_send_wr	fr_wr;
+	unsigned int		dma_npages;
+	unsigned int		sg_byte_len;
+};
+
 /* This is stored as mr->r_trans_private. */
 struct rds_ib_mr {
 	struct rds_ib_device		*device;
@@ -66,6 +81,7 @@ struct rds_ib_mr {
 
 	union {
 		struct rds_ib_fmr	fmr;
+		struct rds_ib_frmr	frmr;
 	} u;
 };
 
@@ -88,6 +104,7 @@ struct rds_ib_mr_pool {
 	unsigned long		max_items_soft;
 	unsigned long		max_free_pinned;
 	struct ib_fmr_attr	fmr_attr;
+	bool			use_fastreg;
 };
 
 extern struct workqueue_struct *rds_ib_mr_wq;
@@ -121,4 +138,11 @@ struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *);
 void rds_ib_unreg_fmr(struct list_head *, unsigned int *,
 		      unsigned long *, unsigned int);
 void rds_ib_free_fmr_list(struct rds_ib_mr *);
+struct rds_ib_mr *rds_ib_reg_frmr(struct rds_ib_device *rds_ibdev,
+				  struct rds_ib_connection *ic,
+				  struct scatterlist *sg,
+				  unsigned long nents, u32 *key);
+void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed,
+		       unsigned long *unpinned, unsigned int goal);
+void rds_ib_free_frmr_list(struct rds_ib_mr *);
 #endif
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index ec7ea32265c0..f7164ac1ffc1 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -397,7 +397,10 @@ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
 	if (list_empty(&unmap_list))
 		goto out;
 
-	rds_ib_unreg_fmr(&unmap_list, &nfreed, &unpinned, free_goal);
+	if (pool->use_fastreg)
+		rds_ib_unreg_frmr(&unmap_list, &nfreed, &unpinned, free_goal);
+	else
+		rds_ib_unreg_fmr(&unmap_list, &nfreed, &unpinned, free_goal);
 
 	if (!list_empty(&unmap_list)) {
 		/* we have to make sure that none of the things we're about
@@ -489,7 +492,10 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
 	rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
 
 	/* Return it to the pool's free list */
-	rds_ib_free_fmr_list(ibmr);
+	if (rds_ibdev->use_fastreg)
+		rds_ib_free_frmr_list(ibmr);
+	else
+		rds_ib_free_fmr_list(ibmr);
 
 	atomic_add(ibmr->sg_len, &pool->free_pinned);
 	atomic_inc(&pool->dirty_count);
@@ -534,6 +540,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
 {
 	struct rds_ib_device *rds_ibdev;
 	struct rds_ib_mr *ibmr = NULL;
+	struct rds_ib_connection *ic = rs->rs_conn->c_transport_data;
 	int ret;
 
 	rds_ibdev = rds_ib_get_device(rs->rs_bound_addr);
@@ -547,7 +554,10 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
 		goto out;
 	}
 
-	ibmr = rds_ib_reg_fmr(rds_ibdev, sg, nents, key_ret);
+	if (rds_ibdev->use_fastreg)
+		ibmr = rds_ib_reg_frmr(rds_ibdev, ic, sg, nents, key_ret);
+	else
+		ibmr = rds_ib_reg_fmr(rds_ibdev, sg, nents, key_ret);
 	if (ibmr)
 		rds_ibdev = NULL;
 
@@ -601,6 +611,7 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
 	pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
 	pool->fmr_attr.page_shift = PAGE_SHIFT;
 	pool->max_items_soft = rds_ibdev->max_mrs * 3 / 4;
+	pool->use_fastreg = rds_ibdev->use_fastreg;
 
 	return pool;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 6353e1875dd775fc915cff3a2d53cb163bdeb63e Mon Sep 17 00:00:00 2001
From: Eric Engestrom <eric.engestrom@imgtec.com>
Date: Mon, 29 Feb 2016 16:36:38 +0000
Subject: net/rtnetlink: remove dead code

3b766cd832328fcb87db3507e7b98cf42f21689d ("net/core: Add reading VF
statistics through the PF netdevice") added that variable but it's never
been used.

Signed-off-by: Eric Engestrom <eric.engestrom@imgtec.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'net')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 62737f437c8e..6128aac01b11 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1391,15 +1391,6 @@ static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
 	[IFLA_VF_TRUST]		= { .len = sizeof(struct ifla_vf_trust) },
 };
 
-static const struct nla_policy ifla_vf_stats_policy[IFLA_VF_STATS_MAX + 1] = {
-	[IFLA_VF_STATS_RX_PACKETS]	= { .type = NLA_U64 },
-	[IFLA_VF_STATS_TX_PACKETS]	= { .type = NLA_U64 },
-	[IFLA_VF_STATS_RX_BYTES]	= { .type = NLA_U64 },
-	[IFLA_VF_STATS_TX_BYTES]	= { .type = NLA_U64 },
-	[IFLA_VF_STATS_BROADCAST]	= { .type = NLA_U64 },
-	[IFLA_VF_STATS_MULTICAST]	= { .type = NLA_U64 },
-};
-
 static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
 	[IFLA_PORT_VF]		= { .type = NLA_U32 },
 	[IFLA_PORT_PROFILE]	= { .type = NLA_STRING,
-- 
cgit v1.2.3-59-g8ed1b


From a9d562358b5c12a3d1c343f45a3c56df47dad753 Mon Sep 17 00:00:00 2001
From: Eric Engestrom <eric.engestrom@imgtec.com>
Date: Mon, 29 Feb 2016 16:38:06 +0000
Subject: net/ipv4: remove left over dead code

8cc785f6f429c2a3fb81745dc142cbd72a462c4a ("net: ipv4: make the ping
/proc code AF-independent") removed the code using it, but renamed this
variable instead of removing it.

Signed-off-by: Eric Engestrom <eric.engestrom@imgtec.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ping.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 76dce90c4581..cf9700b1a106 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -1142,13 +1142,6 @@ static int ping_v4_seq_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static const struct seq_operations ping_v4_seq_ops = {
-	.show		= ping_v4_seq_show,
-	.start		= ping_v4_seq_start,
-	.next		= ping_seq_next,
-	.stop		= ping_seq_stop,
-};
-
 static int ping_seq_open(struct inode *inode, struct file *file)
 {
 	struct ping_seq_afinfo *afinfo = PDE_DATA(inode);
-- 
cgit v1.2.3-59-g8ed1b


From 3d1cbe839ac3aa7b23a274b691092c90f9bf1b8e Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 2 Mar 2016 10:40:54 +0100
Subject: net: mellanox: add DEVLINK dependencies

The new NET_DEVLINK infrastructure can be a loadable module, but the drivers
using it might be built-in, which causes link errors like:

drivers/net/built-in.o: In function `mlx4_load_one':
:(.text+0x2fbfda): undefined reference to `devlink_port_register'
:(.text+0x2fc084): undefined reference to `devlink_port_unregister'
drivers/net/built-in.o: In function `mlxsw_sx_port_remove':
:(.text+0x33a03a): undefined reference to `devlink_port_type_clear'
:(.text+0x33a04e): undefined reference to `devlink_port_unregister'

There are multiple ways to avoid this:

a) add 'depends on NET_DEVLINK || !NET_DEVLINK' dependencies
   for each user
b) use 'select NET_DEVLINK' from each driver that uses it
   and hide the symbol in Kconfig.
c) make NET_DEVLINK a 'bool' option so we don't have to
   list it as a dependency, and rely on the APIs to be
   stubbed out when it is disabled
d) use IS_REACHABLE() rather than IS_ENABLED() to check for
   NET_DEVLINK in include/net/devlink.h

This implements a variation of approach a) by adding an
intermediate symbol that drivers can depend on, and changes
the three drivers using it.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Fixes: 09d4d087cd48 ("mlx4: Implement devlink interface")
Fixes: c4745500e988 ("mlxsw: Implement devlink interface")
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/mlx4/Kconfig          | 1 +
 drivers/net/ethernet/mellanox/mlx4/Kconfig  | 1 +
 drivers/net/ethernet/mellanox/mlxsw/Kconfig | 1 +
 net/Kconfig                                 | 9 +++++++++
 4 files changed, 12 insertions(+)

(limited to 'net')

diff --git a/drivers/infiniband/hw/mlx4/Kconfig b/drivers/infiniband/hw/mlx4/Kconfig
index fc01deac1d3c..db4aa13ebae0 100644
--- a/drivers/infiniband/hw/mlx4/Kconfig
+++ b/drivers/infiniband/hw/mlx4/Kconfig
@@ -1,6 +1,7 @@
 config MLX4_INFINIBAND
 	tristate "Mellanox ConnectX HCA support"
 	depends on NETDEVICES && ETHERNET && PCI && INET
+	depends on MAY_USE_DEVLINK
 	select NET_VENDOR_MELLANOX
 	select MLX4_CORE
 	---help---
diff --git a/drivers/net/ethernet/mellanox/mlx4/Kconfig b/drivers/net/ethernet/mellanox/mlx4/Kconfig
index 1486ce902a56..9ca3734ebb6b 100644
--- a/drivers/net/ethernet/mellanox/mlx4/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlx4/Kconfig
@@ -4,6 +4,7 @@
 
 config MLX4_EN
 	tristate "Mellanox Technologies 1/10/40Gbit Ethernet support"
+	depends on MAY_USE_DEVLINK
 	depends on PCI
 	select MLX4_CORE
 	select PTP_1588_CLOCK
diff --git a/drivers/net/ethernet/mellanox/mlxsw/Kconfig b/drivers/net/ethernet/mellanox/mlxsw/Kconfig
index ce26adcb4988..2ad7f67854d5 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlxsw/Kconfig
@@ -4,6 +4,7 @@
 
 config MLXSW_CORE
 	tristate "Mellanox Technologies Switch ASICs support"
+	depends on MAY_USE_DEVLINK
 	---help---
 	  This driver supports Mellanox Technologies Switch ASICs family.
 
diff --git a/net/Kconfig b/net/Kconfig
index 6c9cfb0d7639..2760825e53fa 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -403,6 +403,15 @@ config NET_DEVLINK
 	  infrastructure to support access to physical chip-wide config and
 	  monitoring.
 
+config MAY_USE_DEVLINK
+	tristate
+	default m if NET_DEVLINK=m
+	default y if NET_DEVLINK=y || NET_DEVLINK=n
+	help
+	  Drivers using the devlink infrastructure should have a dependency
+	  on MAY_USE_DEVLINK to ensure they do not cause link errors when
+	  devlink is a loadable module and the driver using it is built-in.
+
 endif   # if NET
 
 # Used by archs to tell that they support BPF_JIT
-- 
cgit v1.2.3-59-g8ed1b


From 799977d9aafbf0ca0b9c39b04cbfb16db71302c9 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Wed, 2 Mar 2016 11:30:07 -0800
Subject: net: ipv6: Fix refcnt on host routes

Andrew and Ying Huang's test robot both reported usage count problems that
trace back to the 'keep address on ifdown' patch.

>From Andrew:
We execute CRIU test on linux-next. On the current linux-next kernel
they hangs on creating a network namespace.

The kernel log contains many massages like this:
[ 1036.122108] unregister_netdevice: waiting for lo to become free.
Usage count = 2
[ 1046.165156] unregister_netdevice: waiting for lo to become free.
Usage count = 2
[ 1056.210287] unregister_netdevice: waiting for lo to become free.
Usage count = 2

I tried to revert this patch and the bug disappeared.

Here is a set of commands to reproduce this bug:

[root@linux-next-test linux-next]# uname -a
Linux linux-next-test 4.5.0-rc6-next-20160301+ #3 SMP Wed Mar 2
17:32:18 UTC 2016 x86_64 x86_64 x86_64 GNU/Linux

[root@linux-next-test ~]# unshare -n
[root@linux-next-test ~]# ip link set up dev lo
[root@linux-next-test ~]# ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN
group default qlen 1
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    inet 127.0.0.1/8 scope host lo
       valid_lft forever preferred_lft forever
    inet6 ::1/128 scope host
       valid_lft forever preferred_lft forever
[root@linux-next-test ~]# logout
[root@linux-next-test ~]# unshare -n

 -----

The problem is a change made to RTM_DELADDR case in __ipv6_ifa_notify that
was added in an early version of the offending patch and is no longer
needed.

Fixes: f1705ec197e7 ("net: ipv6: Make address flushing on ifdown optional")
Cc: Andrey Wagin <avagin@gmail.com>
Cc: Ying Huang <ying.huang@linux.intel.com>
Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Tested-by: Jeremiah Mahler <jmmahler@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index b1169d511ab4..8c0dab2de5c9 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -5319,12 +5319,10 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
 			if (rt)
 				ip6_del_rt(rt);
 		}
-		if (ifp->rt) {
-			dst_hold(&ifp->rt->dst);
+		dst_hold(&ifp->rt->dst);
+
+		ip6_del_rt(ifp->rt);
 
-			ip6_del_rt(ifp->rt);
-			ifp->rt = NULL;
-		}
 		rt_genid_bump_ipv6(net);
 		break;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 1f27cde313d72d6b44a73ba89c8b2c6a99c628cf Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 2 Mar 2016 08:21:43 -0800
Subject: net: sched: use pfifo_fast for non real queues

Some devices declare a high number of TX queues, then set a much
lower real_num_tx_queues

This cause setups using fq_codel, sfq or fq as the default qdisc to consume
more memory than really needed.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 6 ++++++
 net/sched/sch_generic.c   | 1 +
 net/sched/sch_mq.c        | 2 +-
 net/sched/sch_mqprio.c    | 3 ++-
 4 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index e5bba897d206..46e55f0202a6 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -345,6 +345,12 @@ extern struct Qdisc_ops pfifo_fast_ops;
 extern struct Qdisc_ops mq_qdisc_ops;
 extern struct Qdisc_ops noqueue_qdisc_ops;
 extern const struct Qdisc_ops *default_qdisc_ops;
+static inline const struct Qdisc_ops *
+get_default_qdisc_ops(const struct net_device *dev, int ntx)
+{
+	return ntx < dev->real_num_tx_queues ?
+			default_qdisc_ops : &pfifo_fast_ops;
+}
 
 struct Qdisc_class_common {
 	u32			classid;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 16bc83b2842a..f18c35024207 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -567,6 +567,7 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = {
 	.dump		=	pfifo_fast_dump,
 	.owner		=	THIS_MODULE,
 };
+EXPORT_SYMBOL(pfifo_fast_ops);
 
 static struct lock_class_key qdisc_tx_busylock;
 
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index 3e82f047caaf..56a77b878eb3 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -57,7 +57,7 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt)
 
 	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
 		dev_queue = netdev_get_tx_queue(dev, ntx);
-		qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops,
+		qdisc = qdisc_create_dflt(dev_queue, get_default_qdisc_ops(dev, ntx),
 					  TC_H_MAKE(TC_H_MAJ(sch->handle),
 						    TC_H_MIN(ntx + 1)));
 		if (qdisc == NULL)
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 02ffb3fbbc20..b8002ce3d010 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -125,7 +125,8 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
 
 	for (i = 0; i < dev->num_tx_queues; i++) {
 		dev_queue = netdev_get_tx_queue(dev, i);
-		qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops,
+		qdisc = qdisc_create_dflt(dev_queue,
+					  get_default_qdisc_ops(dev, i),
 					  TC_H_MAKE(TC_H_MAJ(sch->handle),
 						    TC_H_MIN(i + 1)));
 		if (qdisc == NULL) {
-- 
cgit v1.2.3-59-g8ed1b


From e721498a63147618283d2a8093664809f300778c Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 4 Mar 2016 15:53:46 +0000
Subject: rxrpc: Fix a case where a call event bit is being used as a flag bit

Fix a case where RXRPC_CALL_RELEASE (an event) is being used to specify a
flag bit.  RXRPC_CALL_RELEASED should be used instead.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/ar-accept.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rxrpc/ar-accept.c b/net/rxrpc/ar-accept.c
index 6d79310fcaae..65ea81399f6a 100644
--- a/net/rxrpc/ar-accept.c
+++ b/net/rxrpc/ar-accept.c
@@ -185,7 +185,7 @@ invalid_service:
 	read_unlock_bh(&local->services_lock);
 
 	read_lock_bh(&call->state_lock);
-	if (!test_bit(RXRPC_CALL_RELEASE, &call->flags) &&
+	if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
 	    !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events)) {
 		rxrpc_get_call(call);
 		rxrpc_queue_call(call);
-- 
cgit v1.2.3-59-g8ed1b


From 5b8848d14912d5e9974d5d83d71ed6cba2b49345 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 4 Mar 2016 15:53:46 +0000
Subject: rxrpc: Convert call flag and event numbers into enums

Convert call flag and event numbers into enums and move their definitions
outside of the struct.

Also move the call state enum outside of the struct and add an extra
element to count the number of states.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/ar-ack.c      |   3 +-
 net/rxrpc/ar-call.c     |   2 +-
 net/rxrpc/ar-internal.h | 107 ++++++++++++++++++++++++++++--------------------
 3 files changed, 65 insertions(+), 47 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/ar-ack.c b/net/rxrpc/ar-ack.c
index adc555e0323d..7cb7e45c8bad 100644
--- a/net/rxrpc/ar-ack.c
+++ b/net/rxrpc/ar-ack.c
@@ -886,10 +886,11 @@ void rxrpc_process_call(struct work_struct *work)
 	struct rxrpc_header hdr;
 	struct msghdr msg;
 	struct kvec iov[5];
+	enum rxrpc_call_event genbit;
 	unsigned long bits;
 	__be32 data, pad;
 	size_t len;
-	int genbit, loop, nbit, ioc, ret, mtu;
+	int loop, nbit, ioc, ret, mtu;
 	u32 abort_code = RX_PROTOCOL_ERROR;
 	u8 *acks = NULL;
 
diff --git a/net/rxrpc/ar-call.c b/net/rxrpc/ar-call.c
index a9e05db0f5d5..d9a8eeb9290a 100644
--- a/net/rxrpc/ar-call.c
+++ b/net/rxrpc/ar-call.c
@@ -28,7 +28,7 @@ unsigned rxrpc_max_call_lifetime = 60 * HZ;
  */
 unsigned rxrpc_dead_call_expiry = 2 * HZ;
 
-const char *const rxrpc_call_states[] = {
+const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = {
 	[RXRPC_CALL_CLIENT_SEND_REQUEST]	= "ClSndReq",
 	[RXRPC_CALL_CLIENT_AWAIT_REPLY]		= "ClAwtRpl",
 	[RXRPC_CALL_CLIENT_RECV_REPLY]		= "ClRcvRpl",
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 2934a73a5981..3aea424a88e4 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -292,6 +292,67 @@ struct rxrpc_connection {
 	u8			out_clientflag;	/* RXRPC_CLIENT_INITIATED if we are client */
 };
 
+/*
+ * Flags in call->flags.
+ */
+enum rxrpc_call_flag {
+	RXRPC_CALL_RELEASED,		/* call has been released - no more message to userspace */
+	RXRPC_CALL_TERMINAL_MSG,	/* call has given the socket its final message */
+	RXRPC_CALL_RCVD_LAST,		/* all packets received */
+	RXRPC_CALL_RUN_RTIMER,		/* Tx resend timer started */
+	RXRPC_CALL_TX_SOFT_ACK,		/* sent some soft ACKs */
+	RXRPC_CALL_PROC_BUSY,		/* the processor is busy */
+	RXRPC_CALL_INIT_ACCEPT,		/* acceptance was initiated */
+	RXRPC_CALL_HAS_USERID,		/* has a user ID attached */
+	RXRPC_CALL_EXPECT_OOS,		/* expect out of sequence packets */
+};
+
+/*
+ * Events that can be raised on a call.
+ */
+enum rxrpc_call_event {
+	RXRPC_CALL_RCVD_ACKALL,		/* ACKALL or reply received */
+	RXRPC_CALL_RCVD_BUSY,		/* busy packet received */
+	RXRPC_CALL_RCVD_ABORT,		/* abort packet received */
+	RXRPC_CALL_RCVD_ERROR,		/* network error received */
+	RXRPC_CALL_ACK_FINAL,		/* need to generate final ACK (and release call) */
+	RXRPC_CALL_ACK,			/* need to generate ACK */
+	RXRPC_CALL_REJECT_BUSY,		/* need to generate busy message */
+	RXRPC_CALL_ABORT,		/* need to generate abort */
+	RXRPC_CALL_CONN_ABORT,		/* local connection abort generated */
+	RXRPC_CALL_RESEND_TIMER,	/* Tx resend timer expired */
+	RXRPC_CALL_RESEND,		/* Tx resend required */
+	RXRPC_CALL_DRAIN_RX_OOS,	/* drain the Rx out of sequence queue */
+	RXRPC_CALL_LIFE_TIMER,		/* call's lifetimer ran out */
+	RXRPC_CALL_ACCEPTED,		/* incoming call accepted by userspace app */
+	RXRPC_CALL_SECURED,		/* incoming call's connection is now secure */
+	RXRPC_CALL_POST_ACCEPT,		/* need to post an "accept?" message to the app */
+	RXRPC_CALL_RELEASE,		/* need to release the call's resources */
+};
+
+/*
+ * The states that a call can be in.
+ */
+enum rxrpc_call_state {
+	RXRPC_CALL_CLIENT_SEND_REQUEST,	/* - client sending request phase */
+	RXRPC_CALL_CLIENT_AWAIT_REPLY,	/* - client awaiting reply */
+	RXRPC_CALL_CLIENT_RECV_REPLY,	/* - client receiving reply phase */
+	RXRPC_CALL_CLIENT_FINAL_ACK,	/* - client sending final ACK phase */
+	RXRPC_CALL_SERVER_SECURING,	/* - server securing request connection */
+	RXRPC_CALL_SERVER_ACCEPTING,	/* - server accepting request */
+	RXRPC_CALL_SERVER_RECV_REQUEST,	/* - server receiving request */
+	RXRPC_CALL_SERVER_ACK_REQUEST,	/* - server pending ACK of request */
+	RXRPC_CALL_SERVER_SEND_REPLY,	/* - server sending reply */
+	RXRPC_CALL_SERVER_AWAIT_ACK,	/* - server awaiting final ACK */
+	RXRPC_CALL_COMPLETE,		/* - call completed */
+	RXRPC_CALL_SERVER_BUSY,		/* - call rejected by busy server */
+	RXRPC_CALL_REMOTELY_ABORTED,	/* - call aborted by peer */
+	RXRPC_CALL_LOCALLY_ABORTED,	/* - call aborted locally on error or close */
+	RXRPC_CALL_NETWORK_ERROR,	/* - call terminated by network error */
+	RXRPC_CALL_DEAD,		/* - call is dead */
+	NR__RXRPC_CALL_STATES
+};
+
 /*
  * RxRPC call definition
  * - matched by { connection, call_id }
@@ -317,57 +378,13 @@ struct rxrpc_call {
 	unsigned long		user_call_ID;	/* user-defined call ID */
 	unsigned long		creation_jif;	/* time of call creation */
 	unsigned long		flags;
-#define RXRPC_CALL_RELEASED	0	/* call has been released - no more message to userspace */
-#define RXRPC_CALL_TERMINAL_MSG	1	/* call has given the socket its final message */
-#define RXRPC_CALL_RCVD_LAST	2	/* all packets received */
-#define RXRPC_CALL_RUN_RTIMER	3	/* Tx resend timer started */
-#define RXRPC_CALL_TX_SOFT_ACK	4	/* sent some soft ACKs */
-#define RXRPC_CALL_PROC_BUSY	5	/* the processor is busy */
-#define RXRPC_CALL_INIT_ACCEPT	6	/* acceptance was initiated */
-#define RXRPC_CALL_HAS_USERID	7	/* has a user ID attached */
-#define RXRPC_CALL_EXPECT_OOS	8	/* expect out of sequence packets */
 	unsigned long		events;
-#define RXRPC_CALL_RCVD_ACKALL	0	/* ACKALL or reply received */
-#define RXRPC_CALL_RCVD_BUSY	1	/* busy packet received */
-#define RXRPC_CALL_RCVD_ABORT	2	/* abort packet received */
-#define RXRPC_CALL_RCVD_ERROR	3	/* network error received */
-#define RXRPC_CALL_ACK_FINAL	4	/* need to generate final ACK (and release call) */
-#define RXRPC_CALL_ACK		5	/* need to generate ACK */
-#define RXRPC_CALL_REJECT_BUSY	6	/* need to generate busy message */
-#define RXRPC_CALL_ABORT	7	/* need to generate abort */
-#define RXRPC_CALL_CONN_ABORT	8	/* local connection abort generated */
-#define RXRPC_CALL_RESEND_TIMER	9	/* Tx resend timer expired */
-#define RXRPC_CALL_RESEND	10	/* Tx resend required */
-#define RXRPC_CALL_DRAIN_RX_OOS	11	/* drain the Rx out of sequence queue */
-#define RXRPC_CALL_LIFE_TIMER	12	/* call's lifetimer ran out */
-#define RXRPC_CALL_ACCEPTED	13	/* incoming call accepted by userspace app */
-#define RXRPC_CALL_SECURED	14	/* incoming call's connection is now secure */
-#define RXRPC_CALL_POST_ACCEPT	15	/* need to post an "accept?" message to the app */
-#define RXRPC_CALL_RELEASE	16	/* need to release the call's resources */
-
 	spinlock_t		lock;
 	rwlock_t		state_lock;	/* lock for state transition */
 	atomic_t		usage;
 	atomic_t		sequence;	/* Tx data packet sequence counter */
 	u32			abort_code;	/* local/remote abort code */
-	enum {					/* current state of call */
-		RXRPC_CALL_CLIENT_SEND_REQUEST,	/* - client sending request phase */
-		RXRPC_CALL_CLIENT_AWAIT_REPLY,	/* - client awaiting reply */
-		RXRPC_CALL_CLIENT_RECV_REPLY,	/* - client receiving reply phase */
-		RXRPC_CALL_CLIENT_FINAL_ACK,	/* - client sending final ACK phase */
-		RXRPC_CALL_SERVER_SECURING,	/* - server securing request connection */
-		RXRPC_CALL_SERVER_ACCEPTING,	/* - server accepting request */
-		RXRPC_CALL_SERVER_RECV_REQUEST,	/* - server receiving request */
-		RXRPC_CALL_SERVER_ACK_REQUEST,	/* - server pending ACK of request */
-		RXRPC_CALL_SERVER_SEND_REPLY,	/* - server sending reply */
-		RXRPC_CALL_SERVER_AWAIT_ACK,	/* - server awaiting final ACK */
-		RXRPC_CALL_COMPLETE,		/* - call completed */
-		RXRPC_CALL_SERVER_BUSY,		/* - call rejected by busy server */
-		RXRPC_CALL_REMOTELY_ABORTED,	/* - call aborted by peer */
-		RXRPC_CALL_LOCALLY_ABORTED,	/* - call aborted locally on error or close */
-		RXRPC_CALL_NETWORK_ERROR,	/* - call terminated by network error */
-		RXRPC_CALL_DEAD,		/* - call is dead */
-	} state;
+	enum rxrpc_call_state	state : 8;	/* current state of call */
 	int			debug_id;	/* debug ID for printks */
 	u8			channel;	/* connection channel occupied by this call */
 
-- 
cgit v1.2.3-59-g8ed1b


From 4c198ad17a7253cc8ef3ff39bfe73d6b5e65ceef Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 4 Mar 2016 15:53:46 +0000
Subject: rxrpc: Rename call events to begin RXRPC_CALL_EV_

Rename call event names to begin RXRPC_CALL_EV_ to distinguish them from the
flags.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/ar-accept.c    |  10 ++---
 net/rxrpc/ar-ack.c       | 102 +++++++++++++++++++++++------------------------
 net/rxrpc/ar-call.c      |  14 +++----
 net/rxrpc/ar-connevent.c |   6 +--
 net/rxrpc/ar-error.c     |   2 +-
 net/rxrpc/ar-input.c     |  20 +++++-----
 net/rxrpc/ar-internal.h  |  36 ++++++++---------
 net/rxrpc/ar-output.c    |   8 ++--
 net/rxrpc/ar-recvmsg.c   |   2 +-
 net/rxrpc/ar-skbuff.c    |   2 +-
 10 files changed, 101 insertions(+), 101 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/ar-accept.c b/net/rxrpc/ar-accept.c
index 65ea81399f6a..9a49f32e9e1e 100644
--- a/net/rxrpc/ar-accept.c
+++ b/net/rxrpc/ar-accept.c
@@ -186,7 +186,7 @@ invalid_service:
 
 	read_lock_bh(&call->state_lock);
 	if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
-	    !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events)) {
+	    !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) {
 		rxrpc_get_call(call);
 		rxrpc_queue_call(call);
 	}
@@ -379,7 +379,7 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx,
 	rb_insert_color(&call->sock_node, &rx->calls);
 	if (test_and_set_bit(RXRPC_CALL_HAS_USERID, &call->flags))
 		BUG();
-	if (test_and_set_bit(RXRPC_CALL_ACCEPTED, &call->events))
+	if (test_and_set_bit(RXRPC_CALL_EV_ACCEPTED, &call->events))
 		BUG();
 	rxrpc_queue_call(call);
 
@@ -395,7 +395,7 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx,
 out_release:
 	_debug("release %p", call);
 	if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
-	    !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events))
+	    !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events))
 		rxrpc_queue_call(call);
 out_discard:
 	write_unlock_bh(&call->state_lock);
@@ -434,7 +434,7 @@ int rxrpc_reject_call(struct rxrpc_sock *rx)
 	switch (call->state) {
 	case RXRPC_CALL_SERVER_ACCEPTING:
 		call->state = RXRPC_CALL_SERVER_BUSY;
-		if (test_and_set_bit(RXRPC_CALL_REJECT_BUSY, &call->events))
+		if (test_and_set_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events))
 			rxrpc_queue_call(call);
 		ret = 0;
 		goto out_release;
@@ -458,7 +458,7 @@ int rxrpc_reject_call(struct rxrpc_sock *rx)
 out_release:
 	_debug("release %p", call);
 	if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
-	    !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events))
+	    !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events))
 		rxrpc_queue_call(call);
 out_discard:
 	write_unlock_bh(&call->state_lock);
diff --git a/net/rxrpc/ar-ack.c b/net/rxrpc/ar-ack.c
index 7cb7e45c8bad..9183da740600 100644
--- a/net/rxrpc/ar-ack.c
+++ b/net/rxrpc/ar-ack.c
@@ -161,7 +161,7 @@ cancel_timer:
 	try_to_del_timer_sync(&call->ack_timer);
 	read_lock_bh(&call->state_lock);
 	if (call->state <= RXRPC_CALL_COMPLETE &&
-	    !test_and_set_bit(RXRPC_CALL_ACK, &call->events))
+	    !test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events))
 		rxrpc_queue_call(call);
 	read_unlock_bh(&call->state_lock);
 }
@@ -193,7 +193,7 @@ static void rxrpc_set_resend(struct rxrpc_call *call, u8 resend,
 
 	if (resend & 1) {
 		_debug("SET RESEND");
-		set_bit(RXRPC_CALL_RESEND, &call->events);
+		set_bit(RXRPC_CALL_EV_RESEND, &call->events);
 	}
 
 	if (resend & 2) {
@@ -203,7 +203,7 @@ static void rxrpc_set_resend(struct rxrpc_call *call, u8 resend,
 	} else {
 		_debug("KILL RESEND TIMER");
 		del_timer_sync(&call->resend_timer);
-		clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events);
+		clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events);
 		clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
 	}
 	read_unlock_bh(&call->state_lock);
@@ -555,7 +555,7 @@ inserted:
 	if (call->state < RXRPC_CALL_COMPLETE &&
 	    call->rx_data_post == call->rx_first_oos) {
 		_debug("drain rx oos now");
-		set_bit(RXRPC_CALL_DRAIN_RX_OOS, &call->events);
+		set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events);
 	}
 	read_unlock(&call->state_lock);
 
@@ -793,7 +793,7 @@ all_acked:
 
 	del_timer_sync(&call->resend_timer);
 	clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
-	clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events);
+	clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events);
 
 	if (call->acks_window)
 		rxrpc_zap_tx_window(call);
@@ -928,17 +928,17 @@ void rxrpc_process_call(struct work_struct *work)
 	iov[0].iov_len	= sizeof(hdr);
 
 	/* deal with events of a final nature */
-	if (test_bit(RXRPC_CALL_RELEASE, &call->events)) {
+	if (test_bit(RXRPC_CALL_EV_RELEASE, &call->events)) {
 		rxrpc_release_call(call);
-		clear_bit(RXRPC_CALL_RELEASE, &call->events);
+		clear_bit(RXRPC_CALL_EV_RELEASE, &call->events);
 	}
 
-	if (test_bit(RXRPC_CALL_RCVD_ERROR, &call->events)) {
+	if (test_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events)) {
 		int error;
 
-		clear_bit(RXRPC_CALL_CONN_ABORT, &call->events);
-		clear_bit(RXRPC_CALL_REJECT_BUSY, &call->events);
-		clear_bit(RXRPC_CALL_ABORT, &call->events);
+		clear_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events);
+		clear_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events);
+		clear_bit(RXRPC_CALL_EV_ABORT, &call->events);
 
 		error = call->conn->trans->peer->net_error;
 		_debug("post net error %d", error);
@@ -946,32 +946,32 @@ void rxrpc_process_call(struct work_struct *work)
 		if (rxrpc_post_message(call, RXRPC_SKB_MARK_NET_ERROR,
 				       error, true) < 0)
 			goto no_mem;
-		clear_bit(RXRPC_CALL_RCVD_ERROR, &call->events);
+		clear_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events);
 		goto kill_ACKs;
 	}
 
-	if (test_bit(RXRPC_CALL_CONN_ABORT, &call->events)) {
+	if (test_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events)) {
 		ASSERTCMP(call->state, >, RXRPC_CALL_COMPLETE);
 
-		clear_bit(RXRPC_CALL_REJECT_BUSY, &call->events);
-		clear_bit(RXRPC_CALL_ABORT, &call->events);
+		clear_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events);
+		clear_bit(RXRPC_CALL_EV_ABORT, &call->events);
 
 		_debug("post conn abort");
 
 		if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR,
 				       call->conn->error, true) < 0)
 			goto no_mem;
-		clear_bit(RXRPC_CALL_CONN_ABORT, &call->events);
+		clear_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events);
 		goto kill_ACKs;
 	}
 
-	if (test_bit(RXRPC_CALL_REJECT_BUSY, &call->events)) {
+	if (test_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events)) {
 		hdr.type = RXRPC_PACKET_TYPE_BUSY;
-		genbit = RXRPC_CALL_REJECT_BUSY;
+		genbit = RXRPC_CALL_EV_REJECT_BUSY;
 		goto send_message;
 	}
 
-	if (test_bit(RXRPC_CALL_ABORT, &call->events)) {
+	if (test_bit(RXRPC_CALL_EV_ABORT, &call->events)) {
 		ASSERTCMP(call->state, >, RXRPC_CALL_COMPLETE);
 
 		if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR,
@@ -981,12 +981,12 @@ void rxrpc_process_call(struct work_struct *work)
 		data = htonl(call->abort_code);
 		iov[1].iov_base = &data;
 		iov[1].iov_len = sizeof(data);
-		genbit = RXRPC_CALL_ABORT;
+		genbit = RXRPC_CALL_EV_ABORT;
 		goto send_message;
 	}
 
-	if (test_bit(RXRPC_CALL_ACK_FINAL, &call->events)) {
-		genbit = RXRPC_CALL_ACK_FINAL;
+	if (test_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events)) {
+		genbit = RXRPC_CALL_EV_ACK_FINAL;
 
 		ack.bufferSpace	= htons(8);
 		ack.maxSkew	= 0;
@@ -1012,12 +1012,12 @@ void rxrpc_process_call(struct work_struct *work)
 		goto send_ACK;
 	}
 
-	if (call->events & ((1 << RXRPC_CALL_RCVD_BUSY) |
-			    (1 << RXRPC_CALL_RCVD_ABORT))
+	if (call->events & ((1 << RXRPC_CALL_EV_RCVD_BUSY) |
+			    (1 << RXRPC_CALL_EV_RCVD_ABORT))
 	    ) {
 		u32 mark;
 
-		if (test_bit(RXRPC_CALL_RCVD_ABORT, &call->events))
+		if (test_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events))
 			mark = RXRPC_SKB_MARK_REMOTE_ABORT;
 		else
 			mark = RXRPC_SKB_MARK_BUSY;
@@ -1027,22 +1027,22 @@ void rxrpc_process_call(struct work_struct *work)
 		if (rxrpc_post_message(call, mark, ECONNABORTED, true) < 0)
 			goto no_mem;
 
-		clear_bit(RXRPC_CALL_RCVD_BUSY, &call->events);
-		clear_bit(RXRPC_CALL_RCVD_ABORT, &call->events);
+		clear_bit(RXRPC_CALL_EV_RCVD_BUSY, &call->events);
+		clear_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events);
 		goto kill_ACKs;
 	}
 
-	if (test_and_clear_bit(RXRPC_CALL_RCVD_ACKALL, &call->events)) {
+	if (test_and_clear_bit(RXRPC_CALL_EV_RCVD_ACKALL, &call->events)) {
 		_debug("do implicit ackall");
 		rxrpc_clear_tx_window(call);
 	}
 
-	if (test_bit(RXRPC_CALL_LIFE_TIMER, &call->events)) {
+	if (test_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events)) {
 		write_lock_bh(&call->state_lock);
 		if (call->state <= RXRPC_CALL_COMPLETE) {
 			call->state = RXRPC_CALL_LOCALLY_ABORTED;
 			call->abort_code = RX_CALL_TIMEOUT;
-			set_bit(RXRPC_CALL_ABORT, &call->events);
+			set_bit(RXRPC_CALL_EV_ABORT, &call->events);
 		}
 		write_unlock_bh(&call->state_lock);
 
@@ -1051,7 +1051,7 @@ void rxrpc_process_call(struct work_struct *work)
 				       ETIME, true) < 0)
 			goto no_mem;
 
-		clear_bit(RXRPC_CALL_LIFE_TIMER, &call->events);
+		clear_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events);
 		goto kill_ACKs;
 	}
 
@@ -1072,13 +1072,13 @@ void rxrpc_process_call(struct work_struct *work)
 	}
 
 	/* handle resending */
-	if (test_and_clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events))
+	if (test_and_clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events))
 		rxrpc_resend_timer(call);
-	if (test_and_clear_bit(RXRPC_CALL_RESEND, &call->events))
+	if (test_and_clear_bit(RXRPC_CALL_EV_RESEND, &call->events))
 		rxrpc_resend(call);
 
 	/* consider sending an ordinary ACK */
-	if (test_bit(RXRPC_CALL_ACK, &call->events)) {
+	if (test_bit(RXRPC_CALL_EV_ACK, &call->events)) {
 		_debug("send ACK: window: %d - %d { %lx }",
 		       call->rx_data_eaten, call->ackr_win_top,
 		       call->ackr_window[0]);
@@ -1086,11 +1086,11 @@ void rxrpc_process_call(struct work_struct *work)
 		if (call->state > RXRPC_CALL_SERVER_ACK_REQUEST &&
 		    call->ackr_reason != RXRPC_ACK_PING_RESPONSE) {
 			/* ACK by sending reply DATA packet in this state */
-			clear_bit(RXRPC_CALL_ACK, &call->events);
+			clear_bit(RXRPC_CALL_EV_ACK, &call->events);
 			goto maybe_reschedule;
 		}
 
-		genbit = RXRPC_CALL_ACK;
+		genbit = RXRPC_CALL_EV_ACK;
 
 		acks = kzalloc(call->ackr_win_top - call->rx_data_eaten,
 			       GFP_NOFS);
@@ -1153,7 +1153,7 @@ void rxrpc_process_call(struct work_struct *work)
 
 	/* handle completion of security negotiations on an incoming
 	 * connection */
-	if (test_and_clear_bit(RXRPC_CALL_SECURED, &call->events)) {
+	if (test_and_clear_bit(RXRPC_CALL_EV_SECURED, &call->events)) {
 		_debug("secured");
 		spin_lock_bh(&call->lock);
 
@@ -1161,7 +1161,7 @@ void rxrpc_process_call(struct work_struct *work)
 			_debug("securing");
 			write_lock(&call->conn->lock);
 			if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
-			    !test_bit(RXRPC_CALL_RELEASE, &call->events)) {
+			    !test_bit(RXRPC_CALL_EV_RELEASE, &call->events)) {
 				_debug("not released");
 				call->state = RXRPC_CALL_SERVER_ACCEPTING;
 				list_move_tail(&call->accept_link,
@@ -1170,39 +1170,39 @@ void rxrpc_process_call(struct work_struct *work)
 			write_unlock(&call->conn->lock);
 			read_lock(&call->state_lock);
 			if (call->state < RXRPC_CALL_COMPLETE)
-				set_bit(RXRPC_CALL_POST_ACCEPT, &call->events);
+				set_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events);
 			read_unlock(&call->state_lock);
 		}
 
 		spin_unlock_bh(&call->lock);
-		if (!test_bit(RXRPC_CALL_POST_ACCEPT, &call->events))
+		if (!test_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events))
 			goto maybe_reschedule;
 	}
 
 	/* post a notification of an acceptable connection to the app */
-	if (test_bit(RXRPC_CALL_POST_ACCEPT, &call->events)) {
+	if (test_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events)) {
 		_debug("post accept");
 		if (rxrpc_post_message(call, RXRPC_SKB_MARK_NEW_CALL,
 				       0, false) < 0)
 			goto no_mem;
-		clear_bit(RXRPC_CALL_POST_ACCEPT, &call->events);
+		clear_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events);
 		goto maybe_reschedule;
 	}
 
 	/* handle incoming call acceptance */
-	if (test_and_clear_bit(RXRPC_CALL_ACCEPTED, &call->events)) {
+	if (test_and_clear_bit(RXRPC_CALL_EV_ACCEPTED, &call->events)) {
 		_debug("accepted");
 		ASSERTCMP(call->rx_data_post, ==, 0);
 		call->rx_data_post = 1;
 		read_lock_bh(&call->state_lock);
 		if (call->state < RXRPC_CALL_COMPLETE)
-			set_bit(RXRPC_CALL_DRAIN_RX_OOS, &call->events);
+			set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events);
 		read_unlock_bh(&call->state_lock);
 	}
 
 	/* drain the out of sequence received packet queue into the packet Rx
 	 * queue */
-	if (test_and_clear_bit(RXRPC_CALL_DRAIN_RX_OOS, &call->events)) {
+	if (test_and_clear_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events)) {
 		while (call->rx_data_post == call->rx_first_oos)
 			if (rxrpc_drain_rx_oos_queue(call) < 0)
 				break;
@@ -1281,12 +1281,12 @@ send_message_2:
 	}
 
 	switch (genbit) {
-	case RXRPC_CALL_ABORT:
+	case RXRPC_CALL_EV_ABORT:
 		clear_bit(genbit, &call->events);
-		clear_bit(RXRPC_CALL_RCVD_ABORT, &call->events);
+		clear_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events);
 		goto kill_ACKs;
 
-	case RXRPC_CALL_ACK_FINAL:
+	case RXRPC_CALL_EV_ACK_FINAL:
 		write_lock_bh(&call->state_lock);
 		if (call->state == RXRPC_CALL_CLIENT_FINAL_ACK)
 			call->state = RXRPC_CALL_COMPLETE;
@@ -1311,9 +1311,9 @@ send_message_2:
 
 kill_ACKs:
 	del_timer_sync(&call->ack_timer);
-	if (test_and_clear_bit(RXRPC_CALL_ACK_FINAL, &call->events))
+	if (test_and_clear_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events))
 		rxrpc_put_call(call);
-	clear_bit(RXRPC_CALL_ACK, &call->events);
+	clear_bit(RXRPC_CALL_EV_ACK, &call->events);
 
 maybe_reschedule:
 	if (call->events || !skb_queue_empty(&call->rx_queue)) {
@@ -1332,7 +1332,7 @@ maybe_reschedule:
 
 		read_lock_bh(&call->state_lock);
 		if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
-		    !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events))
+		    !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events))
 			rxrpc_queue_call(call);
 		read_unlock_bh(&call->state_lock);
 	}
diff --git a/net/rxrpc/ar-call.c b/net/rxrpc/ar-call.c
index d9a8eeb9290a..3468a0705ab7 100644
--- a/net/rxrpc/ar-call.c
+++ b/net/rxrpc/ar-call.c
@@ -452,7 +452,7 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx,
 		read_lock(&call->state_lock);
 		switch (call->state) {
 		case RXRPC_CALL_LOCALLY_ABORTED:
-			if (!test_and_set_bit(RXRPC_CALL_ABORT, &call->events))
+			if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events))
 				rxrpc_queue_call(call);
 		case RXRPC_CALL_REMOTELY_ABORTED:
 			read_unlock(&call->state_lock);
@@ -686,7 +686,7 @@ void rxrpc_release_call(struct rxrpc_call *call)
 		_debug("+++ ABORTING STATE %d +++\n", call->state);
 		call->state = RXRPC_CALL_LOCALLY_ABORTED;
 		call->abort_code = RX_CALL_DEAD;
-		set_bit(RXRPC_CALL_ABORT, &call->events);
+		set_bit(RXRPC_CALL_EV_ABORT, &call->events);
 		rxrpc_queue_call(call);
 	}
 	write_unlock(&call->state_lock);
@@ -763,10 +763,10 @@ static void rxrpc_mark_call_released(struct rxrpc_call *call)
 			_debug("abort call %p", call);
 			call->state = RXRPC_CALL_LOCALLY_ABORTED;
 			call->abort_code = RX_CALL_DEAD;
-			if (!test_and_set_bit(RXRPC_CALL_ABORT, &call->events))
+			if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events))
 				sched = true;
 		}
-		if (!test_and_set_bit(RXRPC_CALL_RELEASE, &call->events))
+		if (!test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events))
 			sched = true;
 		if (sched)
 			rxrpc_queue_call(call);
@@ -975,7 +975,7 @@ static void rxrpc_call_life_expired(unsigned long _call)
 	_enter("{%d}", call->debug_id);
 	read_lock_bh(&call->state_lock);
 	if (call->state < RXRPC_CALL_COMPLETE) {
-		set_bit(RXRPC_CALL_LIFE_TIMER, &call->events);
+		set_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events);
 		rxrpc_queue_call(call);
 	}
 	read_unlock_bh(&call->state_lock);
@@ -995,7 +995,7 @@ static void rxrpc_resend_time_expired(unsigned long _call)
 		return;
 
 	clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
-	if (!test_and_set_bit(RXRPC_CALL_RESEND_TIMER, &call->events))
+	if (!test_and_set_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events))
 		rxrpc_queue_call(call);
 }
 
@@ -1013,7 +1013,7 @@ static void rxrpc_ack_time_expired(unsigned long _call)
 
 	read_lock_bh(&call->state_lock);
 	if (call->state < RXRPC_CALL_COMPLETE &&
-	    !test_and_set_bit(RXRPC_CALL_ACK, &call->events))
+	    !test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events))
 		rxrpc_queue_call(call);
 	read_unlock_bh(&call->state_lock);
 }
diff --git a/net/rxrpc/ar-connevent.c b/net/rxrpc/ar-connevent.c
index e7ed43a54c41..45e81b7e96ae 100644
--- a/net/rxrpc/ar-connevent.c
+++ b/net/rxrpc/ar-connevent.c
@@ -42,9 +42,9 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, int state,
 			call->state = state;
 			call->abort_code = abort_code;
 			if (state == RXRPC_CALL_LOCALLY_ABORTED)
-				set_bit(RXRPC_CALL_CONN_ABORT, &call->events);
+				set_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events);
 			else
-				set_bit(RXRPC_CALL_RCVD_ABORT, &call->events);
+				set_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events);
 			rxrpc_queue_call(call);
 		}
 		write_unlock(&call->state_lock);
@@ -132,7 +132,7 @@ static void rxrpc_call_is_secure(struct rxrpc_call *call)
 	if (call) {
 		read_lock(&call->state_lock);
 		if (call->state < RXRPC_CALL_COMPLETE &&
-		    !test_and_set_bit(RXRPC_CALL_SECURED, &call->events))
+		    !test_and_set_bit(RXRPC_CALL_EV_SECURED, &call->events))
 			rxrpc_queue_call(call);
 		read_unlock(&call->state_lock);
 	}
diff --git a/net/rxrpc/ar-error.c b/net/rxrpc/ar-error.c
index 0610efa83d72..3c47441c472b 100644
--- a/net/rxrpc/ar-error.c
+++ b/net/rxrpc/ar-error.c
@@ -222,7 +222,7 @@ void rxrpc_UDP_error_handler(struct work_struct *work)
 			if (call->state != RXRPC_CALL_COMPLETE &&
 			    call->state < RXRPC_CALL_NETWORK_ERROR) {
 				call->state = RXRPC_CALL_NETWORK_ERROR;
-				set_bit(RXRPC_CALL_RCVD_ERROR, &call->events);
+				set_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events);
 				rxrpc_queue_call(call);
 			}
 			write_unlock(&call->state_lock);
diff --git a/net/rxrpc/ar-input.c b/net/rxrpc/ar-input.c
index 4505a691d88c..9185535af5f5 100644
--- a/net/rxrpc/ar-input.c
+++ b/net/rxrpc/ar-input.c
@@ -231,7 +231,7 @@ static int rxrpc_fast_process_data(struct rxrpc_call *call,
 		_debug("drain rx oos now");
 		read_lock(&call->state_lock);
 		if (call->state < RXRPC_CALL_COMPLETE &&
-		    !test_and_set_bit(RXRPC_CALL_DRAIN_RX_OOS, &call->events))
+		    !test_and_set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events))
 			rxrpc_queue_call(call);
 		read_unlock(&call->state_lock);
 	}
@@ -287,12 +287,12 @@ static void rxrpc_assume_implicit_ackall(struct rxrpc_call *call, u32 serial)
 		call->acks_latest = serial;
 
 		_debug("implicit ACKALL %%%u", call->acks_latest);
-		set_bit(RXRPC_CALL_RCVD_ACKALL, &call->events);
+		set_bit(RXRPC_CALL_EV_RCVD_ACKALL, &call->events);
 		write_unlock_bh(&call->state_lock);
 
 		if (try_to_del_timer_sync(&call->resend_timer) >= 0) {
-			clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events);
-			clear_bit(RXRPC_CALL_RESEND, &call->events);
+			clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events);
+			clear_bit(RXRPC_CALL_EV_RESEND, &call->events);
 			clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
 		}
 		break;
@@ -358,7 +358,7 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb)
 		if (call->state < RXRPC_CALL_COMPLETE) {
 			call->state = RXRPC_CALL_REMOTELY_ABORTED;
 			call->abort_code = abort_code;
-			set_bit(RXRPC_CALL_RCVD_ABORT, &call->events);
+			set_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events);
 			rxrpc_queue_call(call);
 		}
 		goto free_packet_unlock;
@@ -373,7 +373,7 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb)
 		switch (call->state) {
 		case RXRPC_CALL_CLIENT_SEND_REQUEST:
 			call->state = RXRPC_CALL_SERVER_BUSY;
-			set_bit(RXRPC_CALL_RCVD_BUSY, &call->events);
+			set_bit(RXRPC_CALL_EV_RCVD_BUSY, &call->events);
 			rxrpc_queue_call(call);
 		case RXRPC_CALL_SERVER_BUSY:
 			goto free_packet_unlock;
@@ -433,7 +433,7 @@ protocol_error_locked:
 	if (call->state <= RXRPC_CALL_COMPLETE) {
 		call->state = RXRPC_CALL_LOCALLY_ABORTED;
 		call->abort_code = RX_PROTOCOL_ERROR;
-		set_bit(RXRPC_CALL_ABORT, &call->events);
+		set_bit(RXRPC_CALL_EV_ABORT, &call->events);
 		rxrpc_queue_call(call);
 	}
 free_packet_unlock:
@@ -505,7 +505,7 @@ protocol_error:
 	if (call->state <= RXRPC_CALL_COMPLETE) {
 		call->state = RXRPC_CALL_LOCALLY_ABORTED;
 		call->abort_code = RX_PROTOCOL_ERROR;
-		set_bit(RXRPC_CALL_ABORT, &call->events);
+		set_bit(RXRPC_CALL_EV_ABORT, &call->events);
 		rxrpc_queue_call(call);
 	}
 	write_unlock_bh(&call->state_lock);
@@ -530,7 +530,7 @@ static void rxrpc_post_packet_to_call(struct rxrpc_call *call,
 	read_lock(&call->state_lock);
 	switch (call->state) {
 	case RXRPC_CALL_LOCALLY_ABORTED:
-		if (!test_and_set_bit(RXRPC_CALL_ABORT, &call->events)) {
+		if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events)) {
 			rxrpc_queue_call(call);
 			goto free_unlock;
 		}
@@ -546,7 +546,7 @@ static void rxrpc_post_packet_to_call(struct rxrpc_call *call,
 		/* resend last packet of a completed call */
 		_debug("final ack again");
 		rxrpc_get_call(call);
-		set_bit(RXRPC_CALL_ACK_FINAL, &call->events);
+		set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events);
 		rxrpc_queue_call(call);
 		goto free_unlock;
 	default:
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 3aea424a88e4..3f2940626569 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -311,23 +311,23 @@ enum rxrpc_call_flag {
  * Events that can be raised on a call.
  */
 enum rxrpc_call_event {
-	RXRPC_CALL_RCVD_ACKALL,		/* ACKALL or reply received */
-	RXRPC_CALL_RCVD_BUSY,		/* busy packet received */
-	RXRPC_CALL_RCVD_ABORT,		/* abort packet received */
-	RXRPC_CALL_RCVD_ERROR,		/* network error received */
-	RXRPC_CALL_ACK_FINAL,		/* need to generate final ACK (and release call) */
-	RXRPC_CALL_ACK,			/* need to generate ACK */
-	RXRPC_CALL_REJECT_BUSY,		/* need to generate busy message */
-	RXRPC_CALL_ABORT,		/* need to generate abort */
-	RXRPC_CALL_CONN_ABORT,		/* local connection abort generated */
-	RXRPC_CALL_RESEND_TIMER,	/* Tx resend timer expired */
-	RXRPC_CALL_RESEND,		/* Tx resend required */
-	RXRPC_CALL_DRAIN_RX_OOS,	/* drain the Rx out of sequence queue */
-	RXRPC_CALL_LIFE_TIMER,		/* call's lifetimer ran out */
-	RXRPC_CALL_ACCEPTED,		/* incoming call accepted by userspace app */
-	RXRPC_CALL_SECURED,		/* incoming call's connection is now secure */
-	RXRPC_CALL_POST_ACCEPT,		/* need to post an "accept?" message to the app */
-	RXRPC_CALL_RELEASE,		/* need to release the call's resources */
+	RXRPC_CALL_EV_RCVD_ACKALL,	/* ACKALL or reply received */
+	RXRPC_CALL_EV_RCVD_BUSY,	/* busy packet received */
+	RXRPC_CALL_EV_RCVD_ABORT,	/* abort packet received */
+	RXRPC_CALL_EV_RCVD_ERROR,	/* network error received */
+	RXRPC_CALL_EV_ACK_FINAL,	/* need to generate final ACK (and release call) */
+	RXRPC_CALL_EV_ACK,		/* need to generate ACK */
+	RXRPC_CALL_EV_REJECT_BUSY,	/* need to generate busy message */
+	RXRPC_CALL_EV_ABORT,		/* need to generate abort */
+	RXRPC_CALL_EV_CONN_ABORT,	/* local connection abort generated */
+	RXRPC_CALL_EV_RESEND_TIMER,	/* Tx resend timer expired */
+	RXRPC_CALL_EV_RESEND,		/* Tx resend required */
+	RXRPC_CALL_EV_DRAIN_RX_OOS,	/* drain the Rx out of sequence queue */
+	RXRPC_CALL_EV_LIFE_TIMER,	/* call's lifetimer ran out */
+	RXRPC_CALL_EV_ACCEPTED,		/* incoming call accepted by userspace app */
+	RXRPC_CALL_EV_SECURED,		/* incoming call's connection is now secure */
+	RXRPC_CALL_EV_POST_ACCEPT,	/* need to post an "accept?" message to the app */
+	RXRPC_CALL_EV_RELEASE,		/* need to release the call's resources */
 };
 
 /*
@@ -440,7 +440,7 @@ static inline void rxrpc_abort_call(struct rxrpc_call *call, u32 abort_code)
 	if (call->state < RXRPC_CALL_COMPLETE) {
 		call->abort_code = abort_code;
 		call->state = RXRPC_CALL_LOCALLY_ABORTED;
-		set_bit(RXRPC_CALL_ABORT, &call->events);
+		set_bit(RXRPC_CALL_EV_ABORT, &call->events);
 	}
 	write_unlock_bh(&call->state_lock);
 }
diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c
index 14c4e12c47b0..9e1527a6d026 100644
--- a/net/rxrpc/ar-output.c
+++ b/net/rxrpc/ar-output.c
@@ -111,11 +111,11 @@ static void rxrpc_send_abort(struct rxrpc_call *call, u32 abort_code)
 	if (call->state <= RXRPC_CALL_COMPLETE) {
 		call->state = RXRPC_CALL_LOCALLY_ABORTED;
 		call->abort_code = abort_code;
-		set_bit(RXRPC_CALL_ABORT, &call->events);
+		set_bit(RXRPC_CALL_EV_ABORT, &call->events);
 		del_timer_sync(&call->resend_timer);
 		del_timer_sync(&call->ack_timer);
-		clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events);
-		clear_bit(RXRPC_CALL_ACK, &call->events);
+		clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events);
+		clear_bit(RXRPC_CALL_EV_ACK, &call->events);
 		clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
 		rxrpc_queue_call(call);
 	}
@@ -437,7 +437,7 @@ static inline void rxrpc_instant_resend(struct rxrpc_call *call)
 	if (try_to_del_timer_sync(&call->resend_timer) >= 0) {
 		clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
 		if (call->state < RXRPC_CALL_COMPLETE &&
-		    !test_and_set_bit(RXRPC_CALL_RESEND_TIMER, &call->events))
+		    !test_and_set_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events))
 			rxrpc_queue_call(call);
 	}
 	read_unlock_bh(&call->state_lock);
diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c
index b92beded7459..70f47033ff2f 100644
--- a/net/rxrpc/ar-recvmsg.c
+++ b/net/rxrpc/ar-recvmsg.c
@@ -33,7 +33,7 @@ void rxrpc_remove_user_ID(struct rxrpc_sock *rx, struct rxrpc_call *call)
 
 	read_lock_bh(&call->state_lock);
 	if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
-	    !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events))
+	    !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events))
 		rxrpc_queue_call(call);
 	read_unlock_bh(&call->state_lock);
 }
diff --git a/net/rxrpc/ar-skbuff.c b/net/rxrpc/ar-skbuff.c
index 4cfab49e329d..81f3c0238b9a 100644
--- a/net/rxrpc/ar-skbuff.c
+++ b/net/rxrpc/ar-skbuff.c
@@ -34,7 +34,7 @@ static void rxrpc_request_final_ACK(struct rxrpc_call *call)
 		/* get an extra ref on the call for the final-ACK generator to
 		 * release */
 		rxrpc_get_call(call);
-		set_bit(RXRPC_CALL_ACK_FINAL, &call->events);
+		set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events);
 		if (try_to_del_timer_sync(&call->ack_timer) >= 0)
 			rxrpc_queue_call(call);
 		break;
-- 
cgit v1.2.3-59-g8ed1b


From 0d12f8a4027d021c9cc942f09f38d28288020c5d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 4 Mar 2016 15:53:46 +0000
Subject: rxrpc: Keep the skb private record of the Rx header in host byte
 order

Currently, a copy of the Rx packet header is copied into the the sk_buff
private data so that we can advance the pointer into the buffer,
potentially discarding the original.  At the moment, this copy is held in
network byte order, but this means we're doing a lot of unnecessary
translations.

The reasons it was done this way are that we need the values in network
byte order occasionally and we can use the copy, slightly modified, as part
of an iov array when sending an ack or an abort packet.

However, it seems more reasonable on review that it would be better kept in
host byte order and that we make up a new header when we want to send
another packet.

To this end, rename the original header struct to rxrpc_wire_header (with
BE fields) and institute a variant called rxrpc_host_header that has host
order fields.  Change the struct in the sk_buff private data into an
rxrpc_host_header and translate the values when filling it in.

This further allows us to keep values kept in various structures in host
byte order rather than network byte order and allows removal of some fields
that are byteswapped duplicates.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 include/rxrpc/packet.h    |   4 +-
 net/rxrpc/af_rxrpc.c      |  20 +++----
 net/rxrpc/ar-accept.c     |  40 ++++++++------
 net/rxrpc/ar-ack.c        | 108 +++++++++++++++++-------------------
 net/rxrpc/ar-call.c       |  68 +++++++++++------------
 net/rxrpc/ar-connection.c |  83 ++++++++++++++--------------
 net/rxrpc/ar-connevent.c  |  73 ++++++++++++------------
 net/rxrpc/ar-input.c      |  95 +++++++++++++++++--------------
 net/rxrpc/ar-internal.h   |  65 +++++++++++++---------
 net/rxrpc/ar-local.c      |  29 ++++++----
 net/rxrpc/ar-output.c     |  54 ++++++++++++------
 net/rxrpc/ar-peer.c       |   2 +-
 net/rxrpc/ar-proc.c       |  10 ++--
 net/rxrpc/ar-recvmsg.c    |  18 +++---
 net/rxrpc/ar-security.c   |   4 +-
 net/rxrpc/ar-skbuff.c     |   4 +-
 net/rxrpc/ar-transport.c  |   1 +
 net/rxrpc/rxkad.c         | 138 +++++++++++++++++++++++-----------------------
 18 files changed, 432 insertions(+), 384 deletions(-)

(limited to 'net')

diff --git a/include/rxrpc/packet.h b/include/rxrpc/packet.h
index 4dce116bfd80..de1e67988ada 100644
--- a/include/rxrpc/packet.h
+++ b/include/rxrpc/packet.h
@@ -22,7 +22,7 @@ typedef __be32	rxrpc_serial_net_t; /* on-the-wire Rx message serial number */
  * on-the-wire Rx packet header
  * - all multibyte fields should be in network byte order
  */
-struct rxrpc_header {
+struct rxrpc_wire_header {
 	__be32		epoch;		/* client boot timestamp */
 
 	__be32		cid;		/* connection and channel ID */
@@ -68,8 +68,6 @@ struct rxrpc_header {
 
 } __packed;
 
-#define __rxrpc_header_off(X) offsetof(struct rxrpc_header,X)
-
 extern const char *rxrpc_pkts[];
 
 /*****************************************************************************/
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 7e2d1057d8bc..7bb5cca0ae32 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -37,7 +37,7 @@ static struct proto rxrpc_proto;
 static const struct proto_ops rxrpc_rpc_ops;
 
 /* local epoch for detecting local-end reset */
-__be32 rxrpc_epoch;
+u32 rxrpc_epoch;
 
 /* current debugging ID */
 atomic_t rxrpc_debug_id;
@@ -125,7 +125,6 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len)
 	struct sock *sk = sock->sk;
 	struct rxrpc_local *local;
 	struct rxrpc_sock *rx = rxrpc_sk(sk), *prx;
-	__be16 service_id;
 	int ret;
 
 	_enter("%p,%p,%d", rx, saddr, len);
@@ -152,14 +151,12 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len)
 
 	rx->local = local;
 	if (srx->srx_service) {
-		service_id = htons(srx->srx_service);
 		write_lock_bh(&local->services_lock);
 		list_for_each_entry(prx, &local->services, listen_link) {
-			if (prx->service_id == service_id)
+			if (prx->srx.srx_service == srx->srx_service)
 				goto service_in_use;
 		}
 
-		rx->service_id = service_id;
 		list_add_tail(&rx->listen_link, &local->services);
 		write_unlock_bh(&local->services_lock);
 
@@ -276,7 +273,6 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
 	struct rxrpc_transport *trans;
 	struct rxrpc_call *call;
 	struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
-	__be16 service_id;
 
 	_enter(",,%x,%lx", key_serial(key), user_call_ID);
 
@@ -299,16 +295,15 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
 		atomic_inc(&trans->usage);
 	}
 
-	service_id = rx->service_id;
-	if (srx)
-		service_id = htons(srx->srx_service);
+	if (!srx)
+		srx = &rx->srx;
 
 	if (!key)
 		key = rx->key;
 	if (key && !key->payload.data[0])
 		key = NULL; /* a no-security key */
 
-	bundle = rxrpc_get_bundle(rx, trans, key, service_id, gfp);
+	bundle = rxrpc_get_bundle(rx, trans, key, srx->srx_service, gfp);
 	if (IS_ERR(bundle)) {
 		call = ERR_CAST(bundle);
 		goto out;
@@ -425,7 +420,6 @@ static int rxrpc_connect(struct socket *sock, struct sockaddr *addr,
 	}
 
 	rx->trans = trans;
-	rx->service_id = htons(srx->srx_service);
 	rx->sk.sk_state = RXRPC_CLIENT_CONNECTED;
 
 	release_sock(&rx->sk);
@@ -778,7 +772,7 @@ static struct proto rxrpc_proto = {
 	.name		= "RXRPC",
 	.owner		= THIS_MODULE,
 	.obj_size	= sizeof(struct rxrpc_sock),
-	.max_header	= sizeof(struct rxrpc_header),
+	.max_header	= sizeof(struct rxrpc_wire_header),
 };
 
 static const struct net_proto_family rxrpc_family_ops = {
@@ -796,7 +790,7 @@ static int __init af_rxrpc_init(void)
 
 	BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > FIELD_SIZEOF(struct sk_buff, cb));
 
-	rxrpc_epoch = htonl(get_seconds());
+	rxrpc_epoch = get_seconds();
 
 	ret = -ENOMEM;
 	rxrpc_call_jar = kmem_cache_create(
diff --git a/net/rxrpc/ar-accept.c b/net/rxrpc/ar-accept.c
index 9a49f32e9e1e..73c905416271 100644
--- a/net/rxrpc/ar-accept.c
+++ b/net/rxrpc/ar-accept.c
@@ -27,7 +27,7 @@
  * generate a connection-level abort
  */
 static int rxrpc_busy(struct rxrpc_local *local, struct sockaddr_rxrpc *srx,
-		      struct rxrpc_header *hdr)
+		      struct rxrpc_wire_header *whdr)
 {
 	struct msghdr msg;
 	struct kvec iov[1];
@@ -36,25 +36,21 @@ static int rxrpc_busy(struct rxrpc_local *local, struct sockaddr_rxrpc *srx,
 
 	_enter("%d,,", local->debug_id);
 
+	whdr->type	= RXRPC_PACKET_TYPE_BUSY;
+	whdr->serial	= htonl(1);
+
 	msg.msg_name	= &srx->transport.sin;
 	msg.msg_namelen	= sizeof(srx->transport.sin);
 	msg.msg_control	= NULL;
 	msg.msg_controllen = 0;
 	msg.msg_flags	= 0;
 
-	hdr->seq	= 0;
-	hdr->type	= RXRPC_PACKET_TYPE_BUSY;
-	hdr->flags	= 0;
-	hdr->userStatus	= 0;
-	hdr->_rsvd	= 0;
-
-	iov[0].iov_base	= hdr;
-	iov[0].iov_len	= sizeof(*hdr);
+	iov[0].iov_base	= whdr;
+	iov[0].iov_len	= sizeof(*whdr);
 
 	len = iov[0].iov_len;
 
-	hdr->serial = htonl(1);
-	_proto("Tx BUSY %%%u", ntohl(hdr->serial));
+	_proto("Tx BUSY %%1");
 
 	ret = kernel_sendmsg(local->socket, &msg, iov, 1, len);
 	if (ret < 0) {
@@ -211,8 +207,8 @@ void rxrpc_accept_incoming_calls(struct work_struct *work)
 	struct rxrpc_skb_priv *sp;
 	struct sockaddr_rxrpc srx;
 	struct rxrpc_sock *rx;
+	struct rxrpc_wire_header whdr;
 	struct sk_buff *skb;
-	__be16 service_id;
 	int ret;
 
 	_enter("%d", local->debug_id);
@@ -240,6 +236,19 @@ process_next_packet:
 
 	sp = rxrpc_skb(skb);
 
+	/* Set up a response packet header in case we need it */
+	whdr.epoch	= htonl(sp->hdr.epoch);
+	whdr.cid	= htonl(sp->hdr.cid);
+	whdr.callNumber	= htonl(sp->hdr.callNumber);
+	whdr.seq	= htonl(sp->hdr.seq);
+	whdr.serial	= 0;
+	whdr.flags	= 0;
+	whdr.type	= 0;
+	whdr.userStatus	= 0;
+	whdr.securityIndex = sp->hdr.securityIndex;
+	whdr._rsvd	= 0;
+	whdr.serviceId	= htons(sp->hdr.serviceId);
+
 	/* determine the remote address */
 	memset(&srx, 0, sizeof(srx));
 	srx.srx_family = AF_RXRPC;
@@ -256,10 +265,9 @@ process_next_packet:
 	}
 
 	/* get the socket providing the service */
-	service_id = sp->hdr.serviceId;
 	read_lock_bh(&local->services_lock);
 	list_for_each_entry(rx, &local->services, listen_link) {
-		if (rx->service_id == service_id &&
+		if (rx->srx.srx_service == sp->hdr.serviceId &&
 		    rx->sk.sk_state != RXRPC_CLOSE)
 			goto found_service;
 	}
@@ -267,7 +275,7 @@ process_next_packet:
 	goto invalid_service;
 
 found_service:
-	_debug("found service %hd", ntohs(rx->service_id));
+	_debug("found service %hd", rx->srx.srx_service);
 	if (sk_acceptq_is_full(&rx->sk))
 		goto backlog_full;
 	sk_acceptq_added(&rx->sk);
@@ -296,7 +304,7 @@ found_service:
 backlog_full:
 	read_unlock_bh(&local->services_lock);
 busy:
-	rxrpc_busy(local, &srx, &sp->hdr);
+	rxrpc_busy(local, &srx, &whdr);
 	rxrpc_free_skb(skb);
 	goto process_next_packet;
 
diff --git a/net/rxrpc/ar-ack.c b/net/rxrpc/ar-ack.c
index 9183da740600..20f3f001694e 100644
--- a/net/rxrpc/ar-ack.c
+++ b/net/rxrpc/ar-ack.c
@@ -91,7 +91,7 @@ static const s8 rxrpc_ack_priority[] = {
  * propose an ACK be sent
  */
 void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
-			 __be32 serial, bool immediate)
+			 u32 serial, bool immediate)
 {
 	unsigned long expiry;
 	s8 prior = rxrpc_ack_priority[ack_reason];
@@ -99,8 +99,7 @@ void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
 	ASSERTCMP(prior, >, 0);
 
 	_enter("{%d},%s,%%%x,%u",
-	       call->debug_id, rxrpc_acks(ack_reason), ntohl(serial),
-	       immediate);
+	       call->debug_id, rxrpc_acks(ack_reason), serial, immediate);
 
 	if (prior < rxrpc_ack_priority[call->ackr_reason]) {
 		if (immediate)
@@ -139,7 +138,7 @@ void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
 		expiry = rxrpc_requested_ack_delay;
 		if (!expiry)
 			goto cancel_timer;
-		if (!immediate || serial == cpu_to_be32(1)) {
+		if (!immediate || serial == 1) {
 			_debug("run defer timer");
 			goto run_timer;
 		}
@@ -157,7 +156,7 @@ run_timer:
 	return;
 
 cancel_timer:
-	_debug("cancel timer %%%u", ntohl(serial));
+	_debug("cancel timer %%%u", serial);
 	try_to_del_timer_sync(&call->ack_timer);
 	read_lock_bh(&call->state_lock);
 	if (call->state <= RXRPC_CALL_COMPLETE &&
@@ -170,7 +169,7 @@ cancel_timer:
  * propose an ACK be sent, locking the call structure
  */
 void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
-		       __be32 serial, bool immediate)
+		       u32 serial, bool immediate)
 {
 	s8 prior = rxrpc_ack_priority[ack_reason];
 
@@ -214,8 +213,8 @@ static void rxrpc_set_resend(struct rxrpc_call *call, u8 resend,
  */
 static void rxrpc_resend(struct rxrpc_call *call)
 {
+	struct rxrpc_wire_header *whdr;
 	struct rxrpc_skb_priv *sp;
-	struct rxrpc_header *hdr;
 	struct sk_buff *txb;
 	unsigned long *p_txb, resend_at;
 	bool stop;
@@ -247,14 +246,13 @@ static void rxrpc_resend(struct rxrpc_call *call)
 			sp->need_resend = false;
 
 			/* each Tx packet has a new serial number */
-			sp->hdr.serial =
-				htonl(atomic_inc_return(&call->conn->serial));
+			sp->hdr.serial = atomic_inc_return(&call->conn->serial);
 
-			hdr = (struct rxrpc_header *) txb->head;
-			hdr->serial = sp->hdr.serial;
+			whdr = (struct rxrpc_wire_header *)txb->head;
+			whdr->serial = htonl(sp->hdr.serial);
 
 			_proto("Tx DATA %%%u { #%d }",
-			       ntohl(sp->hdr.serial), ntohl(sp->hdr.seq));
+			       sp->hdr.serial, sp->hdr.seq);
 			if (rxrpc_send_packet(call->conn->trans, txb) < 0) {
 				stop = true;
 				sp->resend_at = jiffies + 3;
@@ -428,7 +426,7 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, u32 hard)
 	int tail = call->acks_tail, old_tail;
 	int win = CIRC_CNT(call->acks_head, tail, call->acks_winsz);
 
-	_enter("{%u,%u},%u", call->acks_hard, win, hard);
+	kenter("{%u,%u},%u", call->acks_hard, win, hard);
 
 	ASSERTCMP(hard - call->acks_hard, <=, win);
 
@@ -478,11 +476,11 @@ static int rxrpc_drain_rx_oos_queue(struct rxrpc_call *call)
 		sp = rxrpc_skb(skb);
 
 		_debug("drain OOS packet %d [%d]",
-		       ntohl(sp->hdr.seq), call->rx_first_oos);
+		       sp->hdr.seq, call->rx_first_oos);
 
-		if (ntohl(sp->hdr.seq) != call->rx_first_oos) {
+		if (sp->hdr.seq != call->rx_first_oos) {
 			skb_queue_head(&call->rx_oos_queue, skb);
-			call->rx_first_oos = ntohl(rxrpc_skb(skb)->hdr.seq);
+			call->rx_first_oos = rxrpc_skb(skb)->hdr.seq;
 			_debug("requeue %p {%u}", skb, call->rx_first_oos);
 		} else {
 			skb->mark = RXRPC_SKB_MARK_DATA;
@@ -496,8 +494,7 @@ static int rxrpc_drain_rx_oos_queue(struct rxrpc_call *call)
 			/* find out what the next packet is */
 			skb = skb_peek(&call->rx_oos_queue);
 			if (skb)
-				call->rx_first_oos =
-					ntohl(rxrpc_skb(skb)->hdr.seq);
+				call->rx_first_oos = rxrpc_skb(skb)->hdr.seq;
 			else
 				call->rx_first_oos = 0;
 			_debug("peek %p {%u}", skb, call->rx_first_oos);
@@ -522,7 +519,7 @@ static void rxrpc_insert_oos_packet(struct rxrpc_call *call,
 	u32 seq;
 
 	sp = rxrpc_skb(skb);
-	seq = ntohl(sp->hdr.seq);
+	seq = sp->hdr.seq;
 	_enter(",,{%u}", seq);
 
 	skb->destructor = rxrpc_packet_destructor;
@@ -535,9 +532,8 @@ static void rxrpc_insert_oos_packet(struct rxrpc_call *call,
 
 	skb_queue_walk(&call->rx_oos_queue, p) {
 		psp = rxrpc_skb(p);
-		if (ntohl(psp->hdr.seq) > seq) {
-			_debug("insert oos #%u before #%u",
-			       seq, ntohl(psp->hdr.seq));
+		if (psp->hdr.seq > seq) {
+			_debug("insert oos #%u before #%u", seq, psp->hdr.seq);
 			skb_insert(p, skb, &call->rx_oos_queue);
 			goto inserted;
 		}
@@ -586,7 +582,7 @@ static void rxrpc_zap_tx_window(struct rxrpc_call *call)
 
 		skb = (struct sk_buff *) _skb;
 		sp = rxrpc_skb(skb);
-		_debug("+++ clear Tx %u", ntohl(sp->hdr.seq));
+		_debug("+++ clear Tx %u", sp->hdr.seq);
 		rxrpc_free_skb(skb);
 	}
 
@@ -657,8 +653,7 @@ process_further:
 		/* data packets that wind up here have been received out of
 		 * order, need security processing or are jumbo packets */
 	case RXRPC_PACKET_TYPE_DATA:
-		_proto("OOSQ DATA %%%u { #%u }",
-		       ntohl(sp->hdr.serial), ntohl(sp->hdr.seq));
+		_proto("OOSQ DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq);
 
 		/* secured packets must be verified and possibly decrypted */
 		if (rxrpc_verify_packet(call, skb, _abort_code) < 0)
@@ -676,7 +671,7 @@ process_further:
 		if (!skb_pull(skb, sizeof(ack)))
 			BUG();
 
-		latest = ntohl(sp->hdr.serial);
+		latest = sp->hdr.serial;
 		hard = ntohl(ack.firstPacket);
 		tx = atomic_read(&call->sequence);
 
@@ -881,9 +876,9 @@ void rxrpc_process_call(struct work_struct *work)
 {
 	struct rxrpc_call *call =
 		container_of(work, struct rxrpc_call, processor);
+	struct rxrpc_wire_header whdr;
 	struct rxrpc_ackpacket ack;
 	struct rxrpc_ackinfo ackinfo;
-	struct rxrpc_header hdr;
 	struct msghdr msg;
 	struct kvec iov[5];
 	enum rxrpc_call_event genbit;
@@ -891,7 +886,7 @@ void rxrpc_process_call(struct work_struct *work)
 	__be32 data, pad;
 	size_t len;
 	int loop, nbit, ioc, ret, mtu;
-	u32 abort_code = RX_PROTOCOL_ERROR;
+	u32 serial, abort_code = RX_PROTOCOL_ERROR;
 	u8 *acks = NULL;
 
 	//printk("\n--------------------\n");
@@ -912,20 +907,20 @@ void rxrpc_process_call(struct work_struct *work)
 	msg.msg_controllen = 0;
 	msg.msg_flags	= 0;
 
-	hdr.epoch	= call->conn->epoch;
-	hdr.cid		= call->cid;
-	hdr.callNumber	= call->call_id;
-	hdr.seq		= 0;
-	hdr.type	= RXRPC_PACKET_TYPE_ACK;
-	hdr.flags	= call->conn->out_clientflag;
-	hdr.userStatus	= 0;
-	hdr.securityIndex = call->conn->security_ix;
-	hdr._rsvd	= 0;
-	hdr.serviceId	= call->conn->service_id;
+	whdr.epoch	= htonl(call->conn->epoch);
+	whdr.cid	= htonl(call->cid);
+	whdr.callNumber	= htonl(call->call_id);
+	whdr.seq	= 0;
+	whdr.type	= RXRPC_PACKET_TYPE_ACK;
+	whdr.flags	= call->conn->out_clientflag;
+	whdr.userStatus	= 0;
+	whdr.securityIndex = call->conn->security_ix;
+	whdr._rsvd	= 0;
+	whdr.serviceId	= htons(call->service_id);
 
 	memset(iov, 0, sizeof(iov));
-	iov[0].iov_base	= &hdr;
-	iov[0].iov_len	= sizeof(hdr);
+	iov[0].iov_base	= &whdr;
+	iov[0].iov_len	= sizeof(whdr);
 
 	/* deal with events of a final nature */
 	if (test_bit(RXRPC_CALL_EV_RELEASE, &call->events)) {
@@ -966,7 +961,7 @@ void rxrpc_process_call(struct work_struct *work)
 	}
 
 	if (test_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events)) {
-		hdr.type = RXRPC_PACKET_TYPE_BUSY;
+		whdr.type = RXRPC_PACKET_TYPE_BUSY;
 		genbit = RXRPC_CALL_EV_REJECT_BUSY;
 		goto send_message;
 	}
@@ -977,7 +972,7 @@ void rxrpc_process_call(struct work_struct *work)
 		if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR,
 				       ECONNABORTED, true) < 0)
 			goto no_mem;
-		hdr.type = RXRPC_PACKET_TYPE_ABORT;
+		whdr.type = RXRPC_PACKET_TYPE_ABORT;
 		data = htonl(call->abort_code);
 		iov[1].iov_base = &data;
 		iov[1].iov_len = sizeof(data);
@@ -996,9 +991,9 @@ void rxrpc_process_call(struct work_struct *work)
 		call->ackr_reason = 0;
 
 		spin_lock_bh(&call->lock);
-		ack.serial = call->ackr_serial;
-		ack.previousPacket = call->ackr_prev_seq;
-		ack.firstPacket = htonl(call->rx_data_eaten + 1);
+		ack.serial	= htonl(call->ackr_serial);
+		ack.previousPacket = htonl(call->ackr_prev_seq);
+		ack.firstPacket	= htonl(call->rx_data_eaten + 1);
 		spin_unlock_bh(&call->lock);
 
 		pad = 0;
@@ -1100,13 +1095,11 @@ void rxrpc_process_call(struct work_struct *work)
 		//hdr.flags	= RXRPC_SLOW_START_OK;
 		ack.bufferSpace	= htons(8);
 		ack.maxSkew	= 0;
-		ack.serial	= 0;
-		ack.reason	= 0;
 
 		spin_lock_bh(&call->lock);
-		ack.reason = call->ackr_reason;
-		ack.serial = call->ackr_serial;
-		ack.previousPacket = call->ackr_prev_seq;
+		ack.reason	= call->ackr_reason;
+		ack.serial	= htonl(call->ackr_serial);
+		ack.previousPacket = htonl(call->ackr_prev_seq);
 		ack.firstPacket = htonl(call->rx_data_eaten + 1);
 
 		ack.nAcks = 0;
@@ -1225,9 +1218,10 @@ send_ACK:
 	ackinfo.rxMTU	= htonl(rxrpc_rx_mtu);
 	ackinfo.jumbo_max = htonl(rxrpc_rx_jumbo_max);
 
-	hdr.serial = htonl(atomic_inc_return(&call->conn->serial));
+	serial = atomic_inc_return(&call->conn->serial);
+	whdr.serial = htonl(serial);
 	_proto("Tx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }",
-	       ntohl(hdr.serial),
+	       serial,
 	       ntohs(ack.maxSkew),
 	       ntohl(ack.firstPacket),
 	       ntohl(ack.previousPacket),
@@ -1243,8 +1237,9 @@ send_ACK:
 send_message:
 	_debug("send message");
 
-	hdr.serial = htonl(atomic_inc_return(&call->conn->serial));
-	_proto("Tx %s %%%u", rxrpc_pkts[hdr.type], ntohl(hdr.serial));
+	serial = atomic_inc_return(&call->conn->serial);
+	whdr.serial = htonl(serial);
+	_proto("Tx %s %%%u", rxrpc_pkts[whdr.type], serial);
 send_message_2:
 
 	len = iov[0].iov_len;
@@ -1327,8 +1322,7 @@ maybe_reschedule:
 	if (call->state >= RXRPC_CALL_COMPLETE &&
 	    !list_empty(&call->accept_link)) {
 		_debug("X unlinking once-pending call %p { e=%lx f=%lx c=%x }",
-		       call, call->events, call->flags,
-		       ntohl(call->conn->cid));
+		       call, call->events, call->flags, call->conn->cid);
 
 		read_lock_bh(&call->state_lock);
 		if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
@@ -1346,7 +1340,7 @@ error:
 	 * this means there's a race between clearing the flag and setting the
 	 * work pending bit and the work item being processed again */
 	if (call->events && !work_pending(&call->processor)) {
-		_debug("jumpstart %x", ntohl(call->conn->cid));
+		_debug("jumpstart %x", call->conn->cid);
 		rxrpc_queue_call(call);
 	}
 
diff --git a/net/rxrpc/ar-call.c b/net/rxrpc/ar-call.c
index 3468a0705ab7..4a499e0100f1 100644
--- a/net/rxrpc/ar-call.c
+++ b/net/rxrpc/ar-call.c
@@ -64,11 +64,11 @@ static DEFINE_HASHTABLE(rxrpc_call_hash, 10);
  * Hash function for rxrpc_call_hash
  */
 static unsigned long rxrpc_call_hashfunc(
-	u8		clientflag,
-	__be32		cid,
-	__be32		call_id,
-	__be32		epoch,
-	__be16		service_id,
+	u8		in_clientflag,
+	u32		cid,
+	u32		call_id,
+	u32		epoch,
+	u16		service_id,
 	sa_family_t	proto,
 	void		*localptr,
 	unsigned int	addr_size,
@@ -77,7 +77,6 @@ static unsigned long rxrpc_call_hashfunc(
 	const u16 *p;
 	unsigned int i;
 	unsigned long key;
-	u32 hcid = ntohl(cid);
 
 	_enter("");
 
@@ -85,12 +84,12 @@ static unsigned long rxrpc_call_hashfunc(
 	/* We just want to add up the __be32 values, so forcing the
 	 * cast should be okay.
 	 */
-	key += (__force u32)epoch;
-	key += (__force u16)service_id;
-	key += (__force u32)call_id;
-	key += (hcid & RXRPC_CIDMASK) >> RXRPC_CIDSHIFT;
-	key += hcid & RXRPC_CHANNELMASK;
-	key += clientflag;
+	key += epoch;
+	key += service_id;
+	key += call_id;
+	key += (cid & RXRPC_CIDMASK) >> RXRPC_CIDSHIFT;
+	key += cid & RXRPC_CHANNELMASK;
+	key += in_clientflag;
 	key += proto;
 	/* Step through the peer address in 16-bit portions for speed */
 	for (i = 0, p = (const u16 *)peer_addr; i < addr_size >> 1; i++, p++)
@@ -148,19 +147,16 @@ static void rxrpc_call_hash_del(struct rxrpc_call *call)
  * isn't there.
  */
 struct rxrpc_call *rxrpc_find_call_hash(
-	u8		clientflag,
-	__be32		cid,
-	__be32		call_id,
-	__be32		epoch,
-	__be16		service_id,
+	struct rxrpc_host_header *hdr,
 	void		*localptr,
 	sa_family_t	proto,
-	const u8	*peer_addr)
+	const void	*peer_addr)
 {
 	unsigned long key;
 	unsigned int addr_size = 0;
 	struct rxrpc_call *call = NULL;
 	struct rxrpc_call *ret = NULL;
+	u8 in_clientflag = hdr->flags & RXRPC_CLIENT_INITIATED;
 
 	_enter("");
 	switch (proto) {
@@ -174,20 +170,21 @@ struct rxrpc_call *rxrpc_find_call_hash(
 		break;
 	}
 
-	key = rxrpc_call_hashfunc(clientflag, cid, call_id, epoch,
-				  service_id, proto, localptr, addr_size,
+	key = rxrpc_call_hashfunc(in_clientflag, hdr->cid, hdr->callNumber,
+				  hdr->epoch, hdr->serviceId,
+				  proto, localptr, addr_size,
 				  peer_addr);
 	hash_for_each_possible_rcu(rxrpc_call_hash, call, hash_node, key) {
 		if (call->hash_key == key &&
-		    call->call_id == call_id &&
-		    call->cid == cid &&
-		    call->in_clientflag == clientflag &&
-		    call->service_id == service_id &&
+		    call->call_id == hdr->callNumber &&
+		    call->cid == hdr->cid &&
+		    call->in_clientflag == in_clientflag &&
+		    call->service_id == hdr->serviceId &&
 		    call->proto == proto &&
 		    call->local == localptr &&
 		    memcmp(call->peer_ip.ipv6_addr, peer_addr,
-			      addr_size) == 0 &&
-		    call->epoch == epoch) {
+			   addr_size) == 0 &&
+		    call->epoch == hdr->epoch) {
 			ret = call;
 			break;
 		}
@@ -414,12 +411,12 @@ found_extant_second:
  */
 struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx,
 				       struct rxrpc_connection *conn,
-				       struct rxrpc_header *hdr,
+				       struct rxrpc_host_header *hdr,
 				       gfp_t gfp)
 {
 	struct rxrpc_call *call, *candidate;
 	struct rb_node **p, *parent;
-	__be32 call_id;
+	u32 call_id;
 
 	_enter(",%d,,%x", conn->debug_id, gfp);
 
@@ -433,7 +430,7 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx,
 	candidate->conn = conn;
 	candidate->cid = hdr->cid;
 	candidate->call_id = hdr->callNumber;
-	candidate->channel = ntohl(hdr->cid) & RXRPC_CHANNELMASK;
+	candidate->channel = hdr->cid & RXRPC_CHANNELMASK;
 	candidate->rx_data_post = 0;
 	candidate->state = RXRPC_CALL_SERVER_ACCEPTING;
 	if (conn->security_ix > 0)
@@ -492,9 +489,9 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx,
 		/* The tree is sorted in order of the __be32 value without
 		 * turning it into host order.
 		 */
-		if ((__force u32)call_id < (__force u32)call->call_id)
+		if (call_id < call->call_id)
 			p = &(*p)->rb_left;
-		else if ((__force u32)call_id > (__force u32)call->call_id)
+		else if (call_id > call->call_id)
 			p = &(*p)->rb_right;
 		else
 			goto old_call;
@@ -714,8 +711,7 @@ void rxrpc_release_call(struct rxrpc_call *call)
 
 			_debug("- zap %s %%%u #%u",
 			       rxrpc_pkts[sp->hdr.type],
-			       ntohl(sp->hdr.serial),
-			       ntohl(sp->hdr.seq));
+			       sp->hdr.serial, sp->hdr.seq);
 			rxrpc_free_skb(skb);
 			spin_lock_bh(&call->lock);
 		}
@@ -873,9 +869,9 @@ static void rxrpc_cleanup_call(struct rxrpc_call *call)
 			unsigned long _skb;
 
 			_skb = call->acks_window[call->acks_tail] & ~1;
-			sp = rxrpc_skb((struct sk_buff *) _skb);
-			_debug("+++ clear Tx %u", ntohl(sp->hdr.seq));
-			rxrpc_free_skb((struct sk_buff *) _skb);
+			sp = rxrpc_skb((struct sk_buff *)_skb);
+			_debug("+++ clear Tx %u", sp->hdr.seq);
+			rxrpc_free_skb((struct sk_buff *)_skb);
 			call->acks_tail =
 				(call->acks_tail + 1) & (call->acks_winsz - 1);
 		}
diff --git a/net/rxrpc/ar-connection.c b/net/rxrpc/ar-connection.c
index 6c71ed1caf16..53df14cb8d25 100644
--- a/net/rxrpc/ar-connection.c
+++ b/net/rxrpc/ar-connection.c
@@ -57,10 +57,10 @@ static struct rxrpc_conn_bundle *rxrpc_alloc_bundle(gfp_t gfp)
  */
 static inline
 int rxrpc_cmp_bundle(const struct rxrpc_conn_bundle *bundle,
-		     struct key *key, __be16 service_id)
+		     struct key *key, u16 service_id)
 {
 	return (bundle->service_id - service_id) ?:
-		((unsigned long) bundle->key - (unsigned long) key);
+		((unsigned long)bundle->key - (unsigned long)key);
 }
 
 /*
@@ -69,14 +69,14 @@ int rxrpc_cmp_bundle(const struct rxrpc_conn_bundle *bundle,
 struct rxrpc_conn_bundle *rxrpc_get_bundle(struct rxrpc_sock *rx,
 					   struct rxrpc_transport *trans,
 					   struct key *key,
-					   __be16 service_id,
+					   u16 service_id,
 					   gfp_t gfp)
 {
 	struct rxrpc_conn_bundle *bundle, *candidate;
 	struct rb_node *p, *parent, **pp;
 
 	_enter("%p{%x},%x,%hx,",
-	       rx, key_serial(key), trans->debug_id, ntohs(service_id));
+	       rx, key_serial(key), trans->debug_id, service_id);
 
 	if (rx->trans == trans && rx->bundle) {
 		atomic_inc(&rx->bundle->usage);
@@ -213,7 +213,7 @@ static struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp)
 		conn->debug_id = atomic_inc_return(&rxrpc_debug_id);
 		conn->avail_calls = RXRPC_MAXCALLS;
 		conn->size_align = 4;
-		conn->header_size = sizeof(struct rxrpc_header);
+		conn->header_size = sizeof(struct rxrpc_wire_header);
 	}
 
 	_leave(" = %p{%d}", conn, conn ? conn->debug_id : 0);
@@ -230,7 +230,7 @@ static void rxrpc_assign_connection_id(struct rxrpc_connection *conn)
 	struct rxrpc_connection *xconn;
 	struct rb_node *parent, **p;
 	__be32 epoch;
-	u32 real_conn_id;
+	u32 cid;
 
 	_enter("");
 
@@ -241,7 +241,7 @@ static void rxrpc_assign_connection_id(struct rxrpc_connection *conn)
 	conn->trans->conn_idcounter += RXRPC_CID_INC;
 	if (conn->trans->conn_idcounter < RXRPC_CID_INC)
 		conn->trans->conn_idcounter = RXRPC_CID_INC;
-	real_conn_id = conn->trans->conn_idcounter;
+	cid = conn->trans->conn_idcounter;
 
 attempt_insertion:
 	parent = NULL;
@@ -255,9 +255,9 @@ attempt_insertion:
 			p = &(*p)->rb_left;
 		else if (epoch > xconn->epoch)
 			p = &(*p)->rb_right;
-		else if (real_conn_id < xconn->real_conn_id)
+		else if (cid < xconn->cid)
 			p = &(*p)->rb_left;
-		else if (real_conn_id > xconn->real_conn_id)
+		else if (cid > xconn->cid)
 			p = &(*p)->rb_right;
 		else
 			goto id_exists;
@@ -268,20 +268,19 @@ attempt_insertion:
 	rb_link_node(&conn->node, parent, p);
 	rb_insert_color(&conn->node, &conn->trans->client_conns);
 
-	conn->real_conn_id = real_conn_id;
-	conn->cid = htonl(real_conn_id);
+	conn->cid = cid;
 	write_unlock_bh(&conn->trans->conn_lock);
-	_leave(" [CONNID %x CID %x]", real_conn_id, ntohl(conn->cid));
+	_leave(" [CID %x]", cid);
 	return;
 
 	/* we found a connection with the proposed ID - walk the tree from that
 	 * point looking for the next unused ID */
 id_exists:
 	for (;;) {
-		real_conn_id += RXRPC_CID_INC;
-		if (real_conn_id < RXRPC_CID_INC) {
-			real_conn_id = RXRPC_CID_INC;
-			conn->trans->conn_idcounter = real_conn_id;
+		cid += RXRPC_CID_INC;
+		if (cid < RXRPC_CID_INC) {
+			cid = RXRPC_CID_INC;
+			conn->trans->conn_idcounter = cid;
 			goto attempt_insertion;
 		}
 
@@ -291,7 +290,7 @@ id_exists:
 
 		xconn = rb_entry(parent, struct rxrpc_connection, node);
 		if (epoch < xconn->epoch ||
-		    real_conn_id < xconn->real_conn_id)
+		    cid < xconn->cid)
 			goto attempt_insertion;
 	}
 }
@@ -334,7 +333,7 @@ static void rxrpc_add_call_ID_to_conn(struct rxrpc_connection *conn,
  */
 static int rxrpc_connect_exclusive(struct rxrpc_sock *rx,
 				   struct rxrpc_transport *trans,
-				   __be16 service_id,
+				   u16 service_id,
 				   struct rxrpc_call *call,
 				   gfp_t gfp)
 {
@@ -404,11 +403,11 @@ found_channel:
 	conn->channels[chan] = call;
 	call->conn = conn;
 	call->channel = chan;
-	call->cid = conn->cid | htonl(chan);
-	call->call_id = htonl(++conn->call_counter);
+	call->cid = conn->cid | chan;
+	call->call_id = ++conn->call_counter;
 
 	_net("CONNECT client on conn %d chan %d as call %x",
-	     conn->debug_id, chan, ntohl(call->call_id));
+	     conn->debug_id, chan, call->call_id);
 
 	spin_unlock(&trans->client_lock);
 
@@ -593,11 +592,11 @@ found_channel:
 	conn->channels[chan] = call;
 	call->conn = conn;
 	call->channel = chan;
-	call->cid = conn->cid | htonl(chan);
-	call->call_id = htonl(++conn->call_counter);
+	call->cid = conn->cid | chan;
+	call->call_id = ++conn->call_counter;
 
 	_net("CONNECT client on conn %d chan %d as call %x",
-	     conn->debug_id, chan, ntohl(call->call_id));
+	     conn->debug_id, chan, call->call_id);
 
 	ASSERTCMP(conn->avail_calls, <, RXRPC_MAXCALLS);
 	spin_unlock(&trans->client_lock);
@@ -620,21 +619,21 @@ interrupted:
  */
 struct rxrpc_connection *
 rxrpc_incoming_connection(struct rxrpc_transport *trans,
-			  struct rxrpc_header *hdr,
+			  struct rxrpc_host_header *hdr,
 			  gfp_t gfp)
 {
 	struct rxrpc_connection *conn, *candidate = NULL;
 	struct rb_node *p, **pp;
 	const char *new = "old";
 	__be32 epoch;
-	u32 conn_id;
+	u32 cid;
 
 	_enter("");
 
 	ASSERT(hdr->flags & RXRPC_CLIENT_INITIATED);
 
 	epoch = hdr->epoch;
-	conn_id = ntohl(hdr->cid) & RXRPC_CIDMASK;
+	cid = hdr->cid & RXRPC_CIDMASK;
 
 	/* search the connection list first */
 	read_lock_bh(&trans->conn_lock);
@@ -643,15 +642,15 @@ rxrpc_incoming_connection(struct rxrpc_transport *trans,
 	while (p) {
 		conn = rb_entry(p, struct rxrpc_connection, node);
 
-		_debug("maybe %x", conn->real_conn_id);
+		_debug("maybe %x", conn->cid);
 
 		if (epoch < conn->epoch)
 			p = p->rb_left;
 		else if (epoch > conn->epoch)
 			p = p->rb_right;
-		else if (conn_id < conn->real_conn_id)
+		else if (cid < conn->cid)
 			p = p->rb_left;
-		else if (conn_id > conn->real_conn_id)
+		else if (cid > conn->cid)
 			p = p->rb_right;
 		else
 			goto found_extant_connection;
@@ -668,12 +667,11 @@ rxrpc_incoming_connection(struct rxrpc_transport *trans,
 
 	candidate->trans = trans;
 	candidate->epoch = hdr->epoch;
-	candidate->cid = hdr->cid & cpu_to_be32(RXRPC_CIDMASK);
+	candidate->cid = hdr->cid & RXRPC_CIDMASK;
 	candidate->service_id = hdr->serviceId;
 	candidate->security_ix = hdr->securityIndex;
 	candidate->in_clientflag = RXRPC_CLIENT_INITIATED;
 	candidate->out_clientflag = 0;
-	candidate->real_conn_id = conn_id;
 	candidate->state = RXRPC_CONN_SERVER;
 	if (candidate->service_id)
 		candidate->state = RXRPC_CONN_SERVER_UNSECURED;
@@ -690,9 +688,9 @@ rxrpc_incoming_connection(struct rxrpc_transport *trans,
 			pp = &(*pp)->rb_left;
 		else if (epoch > conn->epoch)
 			pp = &(*pp)->rb_right;
-		else if (conn_id < conn->real_conn_id)
+		else if (cid < conn->cid)
 			pp = &(*pp)->rb_left;
-		else if (conn_id > conn->real_conn_id)
+		else if (cid > conn->cid)
 			pp = &(*pp)->rb_right;
 		else
 			goto found_extant_second;
@@ -714,7 +712,7 @@ rxrpc_incoming_connection(struct rxrpc_transport *trans,
 	new = "new";
 
 success:
-	_net("CONNECTION %s %d {%x}", new, conn->debug_id, conn->real_conn_id);
+	_net("CONNECTION %s %d {%x}", new, conn->debug_id, conn->cid);
 
 	_leave(" = %p {u=%d}", conn, atomic_read(&conn->usage));
 	return conn;
@@ -751,18 +749,17 @@ security_mismatch:
  * packet
  */
 struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *trans,
-					       struct rxrpc_header *hdr)
+					       struct rxrpc_host_header *hdr)
 {
 	struct rxrpc_connection *conn;
 	struct rb_node *p;
-	__be32 epoch;
-	u32 conn_id;
+	u32 epoch, cid;
 
-	_enter(",{%x,%x}", ntohl(hdr->cid), hdr->flags);
+	_enter(",{%x,%x}", hdr->cid, hdr->flags);
 
 	read_lock_bh(&trans->conn_lock);
 
-	conn_id = ntohl(hdr->cid) & RXRPC_CIDMASK;
+	cid = hdr->cid & RXRPC_CIDMASK;
 	epoch = hdr->epoch;
 
 	if (hdr->flags & RXRPC_CLIENT_INITIATED)
@@ -773,15 +770,15 @@ struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *trans,
 	while (p) {
 		conn = rb_entry(p, struct rxrpc_connection, node);
 
-		_debug("maybe %x", conn->real_conn_id);
+		_debug("maybe %x", conn->cid);
 
 		if (epoch < conn->epoch)
 			p = p->rb_left;
 		else if (epoch > conn->epoch)
 			p = p->rb_right;
-		else if (conn_id < conn->real_conn_id)
+		else if (cid < conn->cid)
 			p = p->rb_left;
-		else if (conn_id > conn->real_conn_id)
+		else if (cid > conn->cid)
 			p = p->rb_right;
 		else
 			goto found;
diff --git a/net/rxrpc/ar-connevent.c b/net/rxrpc/ar-connevent.c
index 45e81b7e96ae..1bdaaed8cdc4 100644
--- a/net/rxrpc/ar-connevent.c
+++ b/net/rxrpc/ar-connevent.c
@@ -60,11 +60,12 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, int state,
 static int rxrpc_abort_connection(struct rxrpc_connection *conn,
 				  u32 error, u32 abort_code)
 {
-	struct rxrpc_header hdr;
+	struct rxrpc_wire_header whdr;
 	struct msghdr msg;
 	struct kvec iov[2];
 	__be32 word;
 	size_t len;
+	u32 serial;
 	int ret;
 
 	_enter("%d,,%u,%u", conn->debug_id, error, abort_code);
@@ -89,28 +90,29 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn,
 	msg.msg_controllen = 0;
 	msg.msg_flags	= 0;
 
-	hdr.epoch	= conn->epoch;
-	hdr.cid		= conn->cid;
-	hdr.callNumber	= 0;
-	hdr.seq		= 0;
-	hdr.type	= RXRPC_PACKET_TYPE_ABORT;
-	hdr.flags	= conn->out_clientflag;
-	hdr.userStatus	= 0;
-	hdr.securityIndex = conn->security_ix;
-	hdr._rsvd	= 0;
-	hdr.serviceId	= conn->service_id;
+	whdr.epoch	= htonl(conn->epoch);
+	whdr.cid	= htonl(conn->cid);
+	whdr.callNumber	= 0;
+	whdr.seq	= 0;
+	whdr.type	= RXRPC_PACKET_TYPE_ABORT;
+	whdr.flags	= conn->out_clientflag;
+	whdr.userStatus	= 0;
+	whdr.securityIndex = conn->security_ix;
+	whdr._rsvd	= 0;
+	whdr.serviceId	= htons(conn->service_id);
 
 	word = htonl(abort_code);
 
-	iov[0].iov_base	= &hdr;
-	iov[0].iov_len	= sizeof(hdr);
+	iov[0].iov_base	= &whdr;
+	iov[0].iov_len	= sizeof(whdr);
 	iov[1].iov_base	= &word;
 	iov[1].iov_len	= sizeof(word);
 
 	len = iov[0].iov_len + iov[1].iov_len;
 
-	hdr.serial = htonl(atomic_inc_return(&conn->serial));
-	_proto("Tx CONN ABORT %%%u { %d }", ntohl(hdr.serial), abort_code);
+	serial = atomic_inc_return(&conn->serial);
+	whdr.serial = htonl(serial);
+	_proto("Tx CONN ABORT %%%u { %d }", serial, abort_code);
 
 	ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 2, len);
 	if (ret < 0) {
@@ -146,8 +148,8 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
 			       u32 *_abort_code)
 {
 	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
-	__be32 tmp;
-	u32 serial;
+	__be32 wtmp;
+	u32 abort_code;
 	int loop, ret;
 
 	if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) {
@@ -155,19 +157,18 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
 		return -ECONNABORTED;
 	}
 
-	serial = ntohl(sp->hdr.serial);
-
-	_enter("{%d},{%u,%%%u},", conn->debug_id, sp->hdr.type, serial);
+	_enter("{%d},{%u,%%%u},", conn->debug_id, sp->hdr.type, sp->hdr.serial);
 
 	switch (sp->hdr.type) {
 	case RXRPC_PACKET_TYPE_ABORT:
-		if (skb_copy_bits(skb, 0, &tmp, sizeof(tmp)) < 0)
+		if (skb_copy_bits(skb, 0, &wtmp, sizeof(wtmp)) < 0)
 			return -EPROTO;
-		_proto("Rx ABORT %%%u { ac=%d }", serial, ntohl(tmp));
+		abort_code = ntohl(wtmp);
+		_proto("Rx ABORT %%%u { ac=%d }", sp->hdr.serial, abort_code);
 
 		conn->state = RXRPC_CONN_REMOTELY_ABORTED;
 		rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED,
-				  ntohl(tmp));
+				  abort_code);
 		return -ECONNABORTED;
 
 	case RXRPC_PACKET_TYPE_CHALLENGE:
@@ -335,7 +336,7 @@ void rxrpc_reject_packets(struct work_struct *work)
 		struct sockaddr_in sin;
 	} sa;
 	struct rxrpc_skb_priv *sp;
-	struct rxrpc_header hdr;
+	struct rxrpc_wire_header whdr;
 	struct rxrpc_local *local;
 	struct sk_buff *skb;
 	struct msghdr msg;
@@ -348,11 +349,11 @@ void rxrpc_reject_packets(struct work_struct *work)
 
 	_enter("%d", local->debug_id);
 
-	iov[0].iov_base = &hdr;
-	iov[0].iov_len = sizeof(hdr);
+	iov[0].iov_base = &whdr;
+	iov[0].iov_len = sizeof(whdr);
 	iov[1].iov_base = &code;
 	iov[1].iov_len = sizeof(code);
-	size = sizeof(hdr) + sizeof(code);
+	size = sizeof(whdr) + sizeof(code);
 
 	msg.msg_name = &sa;
 	msg.msg_control = NULL;
@@ -370,8 +371,8 @@ void rxrpc_reject_packets(struct work_struct *work)
 		break;
 	}
 
-	memset(&hdr, 0, sizeof(hdr));
-	hdr.type = RXRPC_PACKET_TYPE_ABORT;
+	memset(&whdr, 0, sizeof(whdr));
+	whdr.type = RXRPC_PACKET_TYPE_ABORT;
 
 	while ((skb = skb_dequeue(&local->reject_queue))) {
 		sp = rxrpc_skb(skb);
@@ -381,13 +382,13 @@ void rxrpc_reject_packets(struct work_struct *work)
 			sa.sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
 			code = htonl(skb->priority);
 
-			hdr.epoch = sp->hdr.epoch;
-			hdr.cid = sp->hdr.cid;
-			hdr.callNumber = sp->hdr.callNumber;
-			hdr.serviceId = sp->hdr.serviceId;
-			hdr.flags = sp->hdr.flags;
-			hdr.flags ^= RXRPC_CLIENT_INITIATED;
-			hdr.flags &= RXRPC_CLIENT_INITIATED;
+			whdr.epoch	= htonl(sp->hdr.epoch);
+			whdr.cid	= htonl(sp->hdr.cid);
+			whdr.callNumber	= htonl(sp->hdr.callNumber);
+			whdr.serviceId	= htons(sp->hdr.serviceId);
+			whdr.flags	= sp->hdr.flags;
+			whdr.flags	^= RXRPC_CLIENT_INITIATED;
+			whdr.flags	&= RXRPC_CLIENT_INITIATED;
 
 			kernel_sendmsg(local->socket, &msg, iov, 2, size);
 			break;
diff --git a/net/rxrpc/ar-input.c b/net/rxrpc/ar-input.c
index 9185535af5f5..e6396a8c969f 100644
--- a/net/rxrpc/ar-input.c
+++ b/net/rxrpc/ar-input.c
@@ -310,8 +310,8 @@ static void rxrpc_assume_implicit_ackall(struct rxrpc_call *call, u32 serial)
 void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb)
 {
 	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
-	__be32 _abort_code;
-	u32 serial, hi_serial, seq, abort_code;
+	__be32 wtmp;
+	u32 hi_serial, abort_code;
 
 	_enter("%p,%p", call, skb);
 
@@ -330,16 +330,15 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb)
 
 	/* track the latest serial number on this connection for ACK packet
 	 * information */
-	serial = ntohl(sp->hdr.serial);
 	hi_serial = atomic_read(&call->conn->hi_serial);
-	while (serial > hi_serial)
+	while (sp->hdr.serial > hi_serial)
 		hi_serial = atomic_cmpxchg(&call->conn->hi_serial, hi_serial,
-					   serial);
+					   sp->hdr.serial);
 
 	/* request ACK generation for any ACK or DATA packet that requests
 	 * it */
 	if (sp->hdr.flags & RXRPC_REQUEST_ACK) {
-		_proto("ACK Requested on %%%u", serial);
+		_proto("ACK Requested on %%%u", sp->hdr.serial);
 		rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED, sp->hdr.serial, false);
 	}
 
@@ -347,12 +346,11 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb)
 	case RXRPC_PACKET_TYPE_ABORT:
 		_debug("abort");
 
-		if (skb_copy_bits(skb, 0, &_abort_code,
-				  sizeof(_abort_code)) < 0)
+		if (skb_copy_bits(skb, 0, &wtmp, sizeof(wtmp)) < 0)
 			goto protocol_error;
 
-		abort_code = ntohl(_abort_code);
-		_proto("Rx ABORT %%%u { %x }", serial, abort_code);
+		abort_code = ntohl(wtmp);
+		_proto("Rx ABORT %%%u { %x }", sp->hdr.serial, abort_code);
 
 		write_lock_bh(&call->state_lock);
 		if (call->state < RXRPC_CALL_COMPLETE) {
@@ -364,7 +362,7 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb)
 		goto free_packet_unlock;
 
 	case RXRPC_PACKET_TYPE_BUSY:
-		_proto("Rx BUSY %%%u", serial);
+		_proto("Rx BUSY %%%u", sp->hdr.serial);
 
 		if (call->conn->out_clientflag)
 			goto protocol_error;
@@ -382,15 +380,13 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb)
 		}
 
 	default:
-		_proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], serial);
+		_proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], sp->hdr.serial);
 		goto protocol_error;
 
 	case RXRPC_PACKET_TYPE_DATA:
-		seq = ntohl(sp->hdr.seq);
+		_proto("Rx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq);
 
-		_proto("Rx DATA %%%u { #%u }", serial, seq);
-
-		if (seq == 0)
+		if (sp->hdr.seq == 0)
 			goto protocol_error;
 
 		call->ackr_prev_seq = sp->hdr.seq;
@@ -398,9 +394,9 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb)
 		/* received data implicitly ACKs all of the request packets we
 		 * sent when we're acting as a client */
 		if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY)
-			rxrpc_assume_implicit_ackall(call, serial);
+			rxrpc_assume_implicit_ackall(call, sp->hdr.serial);
 
-		switch (rxrpc_fast_process_data(call, skb, seq)) {
+		switch (rxrpc_fast_process_data(call, skb, sp->hdr.seq)) {
 		case 0:
 			skb = NULL;
 			goto done;
@@ -481,12 +477,12 @@ static void rxrpc_process_jumbo_packet(struct rxrpc_call *call,
 		if (!pskb_pull(jumbo, sizeof(jhdr)))
 			BUG();
 
-		sp->hdr.seq	= htonl(ntohl(sp->hdr.seq) + 1);
-		sp->hdr.serial	= htonl(ntohl(sp->hdr.serial) + 1);
+		sp->hdr.seq	+= 1;
+		sp->hdr.serial	+= 1;
 		sp->hdr.flags	= jhdr.flags;
 		sp->hdr._rsvd	= jhdr._rsvd;
 
-		_proto("Rx DATA Jumbo %%%u", ntohl(sp->hdr.serial) - 1);
+		_proto("Rx DATA Jumbo %%%u", sp->hdr.serial - 1);
 
 		rxrpc_fast_process_packet(call, part);
 		part = NULL;
@@ -607,6 +603,35 @@ static void rxrpc_post_packet_to_local(struct rxrpc_local *local,
 	rxrpc_queue_work(&local->event_processor);
 }
 
+/*
+ * Extract the wire header from a packet and translate the byte order.
+ */
+static noinline
+int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb)
+{
+	struct rxrpc_wire_header whdr;
+
+	/* dig out the RxRPC connection details */
+	if (skb_copy_bits(skb, sizeof(struct udphdr), &whdr, sizeof(whdr)) < 0)
+		return -EBADMSG;
+	if (!pskb_pull(skb, sizeof(struct udphdr) + sizeof(whdr)))
+		BUG();
+
+	memset(sp, 0, sizeof(*sp));
+	sp->hdr.epoch		= ntohl(whdr.epoch);
+	sp->hdr.cid		= ntohl(whdr.cid);
+	sp->hdr.callNumber	= ntohl(whdr.callNumber);
+	sp->hdr.seq		= ntohl(whdr.seq);
+	sp->hdr.serial		= ntohl(whdr.serial);
+	sp->hdr.flags		= whdr.flags;
+	sp->hdr.type		= whdr.type;
+	sp->hdr.userStatus	= whdr.userStatus;
+	sp->hdr.securityIndex	= whdr.securityIndex;
+	sp->hdr._rsvd		= ntohs(whdr._rsvd);
+	sp->hdr.serviceId	= ntohs(whdr.serviceId);
+	return 0;
+}
+
 static struct rxrpc_connection *rxrpc_conn_from_local(struct rxrpc_local *local,
 					       struct sk_buff *skb,
 					       struct rxrpc_skb_priv *sp)
@@ -686,27 +711,22 @@ void rxrpc_data_ready(struct sock *sk)
 
 	UDP_INC_STATS_BH(&init_net, UDP_MIB_INDATAGRAMS, 0);
 
-	/* the socket buffer we have is owned by UDP, with UDP's data all over
-	 * it, but we really want our own */
+	/* The socket buffer we have is owned by UDP, with UDP's data all over
+	 * it, but we really want our own data there.
+	 */
 	skb_orphan(skb);
 	sp = rxrpc_skb(skb);
-	memset(sp, 0, sizeof(*sp));
 
 	_net("Rx UDP packet from %08x:%04hu",
 	     ntohl(ip_hdr(skb)->saddr), ntohs(udp_hdr(skb)->source));
 
 	/* dig out the RxRPC connection details */
-	if (skb_copy_bits(skb, sizeof(struct udphdr), &sp->hdr,
-			  sizeof(sp->hdr)) < 0)
+	if (rxrpc_extract_header(sp, skb) < 0)
 		goto bad_message;
-	if (!pskb_pull(skb, sizeof(struct udphdr) + sizeof(sp->hdr)))
-		BUG();
 
 	_net("Rx RxRPC %s ep=%x call=%x:%x",
 	     sp->hdr.flags & RXRPC_CLIENT_INITIATED ? "ToServer" : "ToClient",
-	     ntohl(sp->hdr.epoch),
-	     ntohl(sp->hdr.cid),
-	     ntohl(sp->hdr.callNumber));
+	     sp->hdr.epoch, sp->hdr.cid, sp->hdr.callNumber);
 
 	if (sp->hdr.type == 0 || sp->hdr.type >= RXRPC_N_PACKET_TYPES) {
 		_proto("Rx Bad Packet Type %u", sp->hdr.type);
@@ -737,14 +757,9 @@ void rxrpc_data_ready(struct sock *sk)
 		rxrpc_put_connection(conn);
 	} else {
 		struct rxrpc_call *call;
-		u8 in_clientflag = 0;
-
-		if (sp->hdr.flags & RXRPC_CLIENT_INITIATED)
-			in_clientflag = RXRPC_CLIENT_INITIATED;
-		call = rxrpc_find_call_hash(in_clientflag, sp->hdr.cid,
-					    sp->hdr.callNumber, sp->hdr.epoch,
-					    sp->hdr.serviceId, local, AF_INET,
-					    (u8 *)&ip_hdr(skb)->saddr);
+
+		call = rxrpc_find_call_hash(&sp->hdr, local,
+					    AF_INET, &ip_hdr(skb)->saddr);
 		if (call)
 			rxrpc_post_packet_to_call(call, skb);
 		else
@@ -759,7 +774,7 @@ cant_route_call:
 	_debug("can't route call");
 	if (sp->hdr.flags & RXRPC_CLIENT_INITIATED &&
 	    sp->hdr.type == RXRPC_PACKET_TYPE_DATA) {
-		if (sp->hdr.seq == cpu_to_be32(1)) {
+		if (sp->hdr.seq == 1) {
 			_debug("first packet");
 			skb_queue_tail(&local->accept_queue, skb);
 			rxrpc_queue_work(&local->acceptor);
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 3f2940626569..06bf5abd920d 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -70,11 +70,30 @@ struct rxrpc_sock {
 #define RXRPC_SECURITY_MAX	RXRPC_SECURITY_ENCRYPT
 	struct sockaddr_rxrpc	srx;		/* local address */
 	sa_family_t		proto;		/* protocol created with */
-	__be16			service_id;	/* service ID of local/remote service */
 };
 
 #define rxrpc_sk(__sk) container_of((__sk), struct rxrpc_sock, sk)
 
+/*
+ * CPU-byteorder normalised Rx packet header.
+ */
+struct rxrpc_host_header {
+	u32		epoch;		/* client boot timestamp */
+	u32		cid;		/* connection and channel ID */
+	u32		callNumber;	/* call ID (0 for connection-level packets) */
+	u32		seq;		/* sequence number of pkt in call stream */
+	u32		serial;		/* serial number of pkt sent to network */
+	u8		type;		/* packet type */
+	u8		flags;		/* packet flags */
+	u8		userStatus;	/* app-layer defined status */
+	u8		securityIndex;	/* security protocol ID */
+	union {
+		u16	_rsvd;		/* reserved */
+		u16	cksum;		/* kerberos security checksum */
+	};
+	u16		serviceId;	/* service ID */
+} __packed;
+
 /*
  * RxRPC socket buffer private variables
  * - max 48 bytes (struct sk_buff::cb)
@@ -89,7 +108,7 @@ struct rxrpc_skb_priv {
 		bool		need_resend;	/* T if needs resending */
 	};
 
-	struct rxrpc_header	hdr;		/* RxRPC packet header from this packet */
+	struct rxrpc_host_header hdr;		/* RxRPC packet header from this packet */
 };
 
 #define rxrpc_skb(__skb) ((struct rxrpc_skb_priv *) &(__skb)->cb)
@@ -230,7 +249,7 @@ struct rxrpc_conn_bundle {
 	atomic_t		usage;
 	int			debug_id;	/* debug ID for printks */
 	unsigned short		num_conns;	/* number of connections in this bundle */
-	__be16			service_id;	/* service ID */
+	u16			service_id;	/* Service ID for this bundle */
 	u8			security_ix;	/* security type */
 };
 
@@ -260,7 +279,6 @@ struct rxrpc_connection {
 	rwlock_t		lock;		/* access lock */
 	spinlock_t		state_lock;	/* state-change lock */
 	atomic_t		usage;
-	u32			real_conn_id;	/* connection ID (host-endian) */
 	enum {					/* current state of connection */
 		RXRPC_CONN_UNUSED,		/* - connection not yet attempted */
 		RXRPC_CONN_CLIENT,		/* - client connection */
@@ -282,11 +300,9 @@ struct rxrpc_connection {
 	u8			security_size;	/* security header size */
 	u32			security_level;	/* security level negotiated */
 	u32			security_nonce;	/* response re-use preventer */
-
-	/* the following are all in net order */
-	__be32			epoch;		/* epoch of this connection */
-	__be32			cid;		/* connection ID */
-	__be16			service_id;	/* service ID */
+	u32			epoch;		/* epoch of this connection */
+	u32			cid;		/* connection ID */
+	u16			service_id;	/* service ID for this connection */
 	u8			security_ix;	/* security type */
 	u8			in_clientflag;	/* RXRPC_CLIENT_INITIATED if we are server */
 	u8			out_clientflag;	/* RXRPC_CLIENT_INITIATED if we are client */
@@ -406,9 +422,9 @@ struct rxrpc_call {
 	rxrpc_seq_t		rx_data_eaten;	/* last data seq ID consumed by recvmsg */
 	rxrpc_seq_t		rx_first_oos;	/* first packet in rx_oos_queue (or 0) */
 	rxrpc_seq_t		ackr_win_top;	/* top of ACK window (rx_data_eaten is bottom) */
-	rxrpc_seq_net_t		ackr_prev_seq;	/* previous sequence number received */
+	rxrpc_seq_t		ackr_prev_seq;	/* previous sequence number received */
 	u8			ackr_reason;	/* reason to ACK */
-	__be32			ackr_serial;	/* serial of packet being ACK'd */
+	rxrpc_serial_t		ackr_serial;	/* serial of packet being ACK'd */
 	atomic_t		ackr_not_idle;	/* number of packets in Rx queue */
 
 	/* received packet records, 1 bit per record */
@@ -420,11 +436,10 @@ struct rxrpc_call {
 	u8			in_clientflag;	/* Copy of conn->in_clientflag for hashing */
 	struct rxrpc_local	*local;		/* Local endpoint. Used for hashing. */
 	sa_family_t		proto;		/* Frame protocol */
-	/* the following should all be in net order */
-	__be32			cid;		/* connection ID + channel index  */
-	__be32			call_id;	/* call ID on connection  */
-	__be32			epoch;		/* epoch of this connection */
-	__be16			service_id;	/* service ID */
+	u32			call_id;	/* call ID on connection  */
+	u32			cid;		/* connection ID plus channel index */
+	u32			epoch;		/* epoch of this connection */
+	u16			service_id;	/* service ID */
 	union {					/* Peer IP address for hashing */
 		__be32	ipv4_addr;
 		__u8	ipv6_addr[16];		/* Anticipates eventual IPv6 support */
@@ -449,7 +464,7 @@ static inline void rxrpc_abort_call(struct rxrpc_call *call, u32 abort_code)
  * af_rxrpc.c
  */
 extern atomic_t rxrpc_n_skbs;
-extern __be32 rxrpc_epoch;
+extern u32 rxrpc_epoch;
 extern atomic_t rxrpc_debug_id;
 extern struct workqueue_struct *rxrpc_workqueue;
 
@@ -470,8 +485,8 @@ extern unsigned rxrpc_rx_window_size;
 extern unsigned rxrpc_rx_mtu;
 extern unsigned rxrpc_rx_jumbo_max;
 
-void __rxrpc_propose_ACK(struct rxrpc_call *, u8, __be32, bool);
-void rxrpc_propose_ACK(struct rxrpc_call *, u8, __be32, bool);
+void __rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool);
+void rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool);
 void rxrpc_process_call(struct work_struct *);
 
 /*
@@ -483,15 +498,15 @@ extern struct kmem_cache *rxrpc_call_jar;
 extern struct list_head rxrpc_calls;
 extern rwlock_t rxrpc_call_lock;
 
-struct rxrpc_call *rxrpc_find_call_hash(u8,  __be32, __be32, __be32,
-					__be16, void *, sa_family_t, const u8 *);
+struct rxrpc_call *rxrpc_find_call_hash(struct rxrpc_host_header *,
+					void *, sa_family_t, const void *);
 struct rxrpc_call *rxrpc_get_client_call(struct rxrpc_sock *,
 					 struct rxrpc_transport *,
 					 struct rxrpc_conn_bundle *,
 					 unsigned long, int, gfp_t);
 struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *,
 				       struct rxrpc_connection *,
-				       struct rxrpc_header *, gfp_t);
+				       struct rxrpc_host_header *, gfp_t);
 struct rxrpc_call *rxrpc_find_server_call(struct rxrpc_sock *, unsigned long);
 void rxrpc_release_call(struct rxrpc_call *);
 void rxrpc_release_calls_on_socket(struct rxrpc_sock *);
@@ -507,16 +522,16 @@ extern rwlock_t rxrpc_connection_lock;
 
 struct rxrpc_conn_bundle *rxrpc_get_bundle(struct rxrpc_sock *,
 					   struct rxrpc_transport *,
-					   struct key *, __be16, gfp_t);
+					   struct key *, u16, gfp_t);
 void rxrpc_put_bundle(struct rxrpc_transport *, struct rxrpc_conn_bundle *);
 int rxrpc_connect_call(struct rxrpc_sock *, struct rxrpc_transport *,
 		       struct rxrpc_conn_bundle *, struct rxrpc_call *, gfp_t);
 void rxrpc_put_connection(struct rxrpc_connection *);
 void __exit rxrpc_destroy_all_connections(void);
 struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *,
-					       struct rxrpc_header *);
+					       struct rxrpc_host_header *);
 extern struct rxrpc_connection *
-rxrpc_incoming_connection(struct rxrpc_transport *, struct rxrpc_header *,
+rxrpc_incoming_connection(struct rxrpc_transport *, struct rxrpc_host_header *,
 			  gfp_t);
 
 /*
diff --git a/net/rxrpc/ar-local.c b/net/rxrpc/ar-local.c
index 78483b4602bf..4e1e6db0050b 100644
--- a/net/rxrpc/ar-local.c
+++ b/net/rxrpc/ar-local.c
@@ -323,9 +323,11 @@ void __exit rxrpc_destroy_all_locals(void)
  * Reply to a version request
  */
 static void rxrpc_send_version_request(struct rxrpc_local *local,
-				       struct rxrpc_header *hdr,
+				       struct rxrpc_host_header *hdr,
 				       struct sk_buff *skb)
 {
+	struct rxrpc_wire_header whdr;
+	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
 	struct sockaddr_in sin;
 	struct msghdr msg;
 	struct kvec iov[2];
@@ -344,15 +346,20 @@ static void rxrpc_send_version_request(struct rxrpc_local *local,
 	msg.msg_controllen = 0;
 	msg.msg_flags	= 0;
 
-	hdr->seq	= 0;
-	hdr->serial	= 0;
-	hdr->type	= RXRPC_PACKET_TYPE_VERSION;
-	hdr->flags	= RXRPC_LAST_PACKET | (~hdr->flags & RXRPC_CLIENT_INITIATED);
-	hdr->userStatus	= 0;
-	hdr->_rsvd	= 0;
-
-	iov[0].iov_base	= hdr;
-	iov[0].iov_len	= sizeof(*hdr);
+	whdr.epoch	= htonl(sp->hdr.epoch);
+	whdr.cid	= htonl(sp->hdr.cid);
+	whdr.callNumber	= htonl(sp->hdr.callNumber);
+	whdr.seq	= 0;
+	whdr.serial	= 0;
+	whdr.type	= RXRPC_PACKET_TYPE_VERSION;
+	whdr.flags	= RXRPC_LAST_PACKET | (~hdr->flags & RXRPC_CLIENT_INITIATED);
+	whdr.userStatus	= 0;
+	whdr.securityIndex = 0;
+	whdr._rsvd	= 0;
+	whdr.serviceId	= htons(sp->hdr.serviceId);
+
+	iov[0].iov_base	= &whdr;
+	iov[0].iov_len	= sizeof(whdr);
 	iov[1].iov_base	= (char *)rxrpc_version_string;
 	iov[1].iov_len	= sizeof(rxrpc_version_string);
 
@@ -383,7 +390,7 @@ static void rxrpc_process_local_events(struct work_struct *work)
 	while ((skb = skb_dequeue(&local->event_queue))) {
 		struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
 
-		kdebug("{%d},{%u}", local->debug_id, sp->hdr.type);
+		_debug("{%d},{%u}", local->debug_id, sp->hdr.type);
 
 		switch (sp->hdr.type) {
 		case RXRPC_PACKET_TYPE_VERSION:
diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c
index 9e1527a6d026..353f5c9141ea 100644
--- a/net/rxrpc/ar-output.c
+++ b/net/rxrpc/ar-output.c
@@ -136,7 +136,7 @@ int rxrpc_client_sendmsg(struct rxrpc_sock *rx, struct rxrpc_transport *trans,
 	struct rxrpc_call *call;
 	unsigned long user_call_ID = 0;
 	struct key *key;
-	__be16 service_id;
+	u16 service_id;
 	u32 abort_code = 0;
 	int ret;
 
@@ -151,11 +151,11 @@ int rxrpc_client_sendmsg(struct rxrpc_sock *rx, struct rxrpc_transport *trans,
 
 	bundle = NULL;
 	if (trans) {
-		service_id = rx->service_id;
+		service_id = rx->srx.srx_service;
 		if (msg->msg_name) {
 			DECLARE_SOCKADDR(struct sockaddr_rxrpc *, srx,
 					 msg->msg_name);
-			service_id = htons(srx->srx_service);
+			service_id = srx->srx_service;
 		}
 		key = rx->key;
 		if (key && !rx->key->payload.data[0])
@@ -348,7 +348,7 @@ int rxrpc_send_packet(struct rxrpc_transport *trans, struct sk_buff *skb)
 
 	/* send the packet with the don't fragment bit set if we currently
 	 * think it's small enough */
-	if (skb->len - sizeof(struct rxrpc_header) < trans->peer->maxdata) {
+	if (skb->len - sizeof(struct rxrpc_wire_header) < trans->peer->maxdata) {
 		down_read(&trans->local->defrag_sem);
 		/* send the packet by UDP
 		 * - returns -EMSGSIZE if UDP would have to fragment the packet
@@ -480,8 +480,7 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb,
 		write_unlock_bh(&call->state_lock);
 	}
 
-	_proto("Tx DATA %%%u { #%u }",
-	       ntohl(sp->hdr.serial), ntohl(sp->hdr.seq));
+	_proto("Tx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq);
 
 	sp->need_resend = false;
 	sp->resend_at = jiffies + rxrpc_resend_timeout;
@@ -512,6 +511,29 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb,
 	_leave("");
 }
 
+/*
+ * Convert a host-endian header into a network-endian header.
+ */
+static void rxrpc_insert_header(struct sk_buff *skb)
+{
+	struct rxrpc_wire_header whdr;
+	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+
+	whdr.epoch	= htonl(sp->hdr.epoch);
+	whdr.cid	= htonl(sp->hdr.cid);
+	whdr.callNumber	= htonl(sp->hdr.callNumber);
+	whdr.seq	= htonl(sp->hdr.seq);
+	whdr.serial	= htonl(sp->hdr.serial);
+	whdr.type	= sp->hdr.type;
+	whdr.flags	= sp->hdr.flags;
+	whdr.userStatus	= sp->hdr.userStatus;
+	whdr.securityIndex = sp->hdr.securityIndex;
+	whdr._rsvd	= htons(sp->hdr._rsvd);
+	whdr.serviceId	= htons(sp->hdr.serviceId);
+
+	memcpy(skb->head, &whdr, sizeof(whdr));
+}
+
 /*
  * send data through a socket
  * - must be called in process context
@@ -650,17 +672,16 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
 
 			seq = atomic_inc_return(&call->sequence);
 
-			sp->hdr.epoch = conn->epoch;
-			sp->hdr.cid = call->cid;
+			sp->hdr.epoch	= conn->epoch;
+			sp->hdr.cid	= call->cid;
 			sp->hdr.callNumber = call->call_id;
-			sp->hdr.seq = htonl(seq);
-			sp->hdr.serial =
-				htonl(atomic_inc_return(&conn->serial));
-			sp->hdr.type = RXRPC_PACKET_TYPE_DATA;
+			sp->hdr.seq	= seq;
+			sp->hdr.serial	= atomic_inc_return(&conn->serial);
+			sp->hdr.type	= RXRPC_PACKET_TYPE_DATA;
 			sp->hdr.userStatus = 0;
 			sp->hdr.securityIndex = conn->security_ix;
-			sp->hdr._rsvd = 0;
-			sp->hdr.serviceId = conn->service_id;
+			sp->hdr._rsvd	= 0;
+			sp->hdr.serviceId = call->service_id;
 
 			sp->hdr.flags = conn->out_clientflag;
 			if (msg_data_left(msg) == 0 && !more)
@@ -673,12 +694,11 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
 
 			ret = rxrpc_secure_packet(
 				call, skb, skb->mark,
-				skb->head + sizeof(struct rxrpc_header));
+				skb->head + sizeof(struct rxrpc_wire_header));
 			if (ret < 0)
 				goto out;
 
-			memcpy(skb->head, &sp->hdr,
-			       sizeof(struct rxrpc_header));
+			rxrpc_insert_header(skb);
 			rxrpc_queue_packet(call, skb, !msg_data_left(msg) && !more);
 			skb = NULL;
 		}
diff --git a/net/rxrpc/ar-peer.c b/net/rxrpc/ar-peer.c
index bebaa43484bc..dc089b1976aa 100644
--- a/net/rxrpc/ar-peer.c
+++ b/net/rxrpc/ar-peer.c
@@ -92,7 +92,7 @@ static struct rxrpc_peer *rxrpc_alloc_peer(struct sockaddr_rxrpc *srx,
 			BUG();
 		}
 
-		peer->hdrsize += sizeof(struct rxrpc_header);
+		peer->hdrsize += sizeof(struct rxrpc_wire_header);
 		peer->maxdata = peer->mtu - peer->hdrsize;
 	}
 
diff --git a/net/rxrpc/ar-proc.c b/net/rxrpc/ar-proc.c
index 38047f713f2c..525b2ba5a8f4 100644
--- a/net/rxrpc/ar-proc.c
+++ b/net/rxrpc/ar-proc.c
@@ -74,9 +74,9 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
 		   " %-8.8s %08x %lx\n",
 		   lbuff,
 		   rbuff,
-		   ntohs(call->conn->service_id),
-		   ntohl(call->conn->cid),
-		   ntohl(call->call_id),
+		   call->conn->service_id,
+		   call->cid,
+		   call->call_id,
 		   call->conn->in_clientflag ? "Svc" : "Clt",
 		   atomic_read(&call->usage),
 		   rxrpc_call_states[call->state],
@@ -157,8 +157,8 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v)
 		   " %s %08x %08x %08x\n",
 		   lbuff,
 		   rbuff,
-		   ntohs(conn->service_id),
-		   ntohl(conn->cid),
+		   conn->service_id,
+		   conn->cid,
 		   conn->call_counter,
 		   conn->in_clientflag ? "Svc" : "Clt",
 		   atomic_read(&conn->usage),
diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c
index 70f47033ff2f..64facba24a45 100644
--- a/net/rxrpc/ar-recvmsg.c
+++ b/net/rxrpc/ar-recvmsg.c
@@ -158,7 +158,7 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 			goto receive_non_data_message;
 
 		_debug("recvmsg DATA #%u { %d, %d }",
-		       ntohl(sp->hdr.seq), skb->len, sp->offset);
+		       sp->hdr.seq, skb->len, sp->offset);
 
 		if (!continue_call) {
 			/* only set the control data once per recvmsg() */
@@ -169,11 +169,11 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 			ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags));
 		}
 
-		ASSERTCMP(ntohl(sp->hdr.seq), >=, call->rx_data_recv);
-		ASSERTCMP(ntohl(sp->hdr.seq), <=, call->rx_data_recv + 1);
-		call->rx_data_recv = ntohl(sp->hdr.seq);
+		ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv);
+		ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1);
+		call->rx_data_recv = sp->hdr.seq;
 
-		ASSERTCMP(ntohl(sp->hdr.seq), >, call->rx_data_eaten);
+		ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten);
 
 		offset = sp->offset;
 		copy = skb->len - offset;
@@ -364,11 +364,11 @@ void rxrpc_kernel_data_delivered(struct sk_buff *skb)
 	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
 	struct rxrpc_call *call = sp->call;
 
-	ASSERTCMP(ntohl(sp->hdr.seq), >=, call->rx_data_recv);
-	ASSERTCMP(ntohl(sp->hdr.seq), <=, call->rx_data_recv + 1);
-	call->rx_data_recv = ntohl(sp->hdr.seq);
+	ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv);
+	ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1);
+	call->rx_data_recv = sp->hdr.seq;
 
-	ASSERTCMP(ntohl(sp->hdr.seq), >, call->rx_data_eaten);
+	ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten);
 	rxrpc_free_skb(skb);
 }
 
diff --git a/net/rxrpc/ar-security.c b/net/rxrpc/ar-security.c
index 8334474eb26c..e2f4c49a9246 100644
--- a/net/rxrpc/ar-security.c
+++ b/net/rxrpc/ar-security.c
@@ -171,7 +171,7 @@ int rxrpc_init_server_conn_security(struct rxrpc_connection *conn)
 
 	_enter("");
 
-	sprintf(kdesc, "%u:%u", ntohs(conn->service_id), conn->security_ix);
+	sprintf(kdesc, "%u:%u", conn->service_id, conn->security_ix);
 
 	sec = rxrpc_security_lookup(conn->security_ix);
 	if (!sec) {
@@ -182,7 +182,7 @@ int rxrpc_init_server_conn_security(struct rxrpc_connection *conn)
 	/* find the service */
 	read_lock_bh(&local->services_lock);
 	list_for_each_entry(rx, &local->services, listen_link) {
-		if (rx->service_id == conn->service_id)
+		if (rx->srx.srx_service == conn->service_id)
 			goto found_service;
 	}
 
diff --git a/net/rxrpc/ar-skbuff.c b/net/rxrpc/ar-skbuff.c
index 81f3c0238b9a..ae9f93f94ed2 100644
--- a/net/rxrpc/ar-skbuff.c
+++ b/net/rxrpc/ar-skbuff.c
@@ -59,7 +59,7 @@ static void rxrpc_hard_ACK_data(struct rxrpc_call *call,
 
 	spin_lock_bh(&call->lock);
 
-	_debug("hard ACK #%u", ntohl(sp->hdr.seq));
+	_debug("hard ACK #%u", sp->hdr.seq);
 
 	for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) {
 		call->ackr_window[loop] >>= 1;
@@ -67,7 +67,7 @@ static void rxrpc_hard_ACK_data(struct rxrpc_call *call,
 			call->ackr_window[loop + 1] << (BITS_PER_LONG - 1);
 	}
 
-	seq = ntohl(sp->hdr.seq);
+	seq = sp->hdr.seq;
 	ASSERTCMP(seq, ==, call->rx_data_eaten + 1);
 	call->rx_data_eaten = seq;
 
diff --git a/net/rxrpc/ar-transport.c b/net/rxrpc/ar-transport.c
index 9946467f16b4..5f9b9d462f53 100644
--- a/net/rxrpc/ar-transport.c
+++ b/net/rxrpc/ar-transport.c
@@ -51,6 +51,7 @@ static struct rxrpc_transport *rxrpc_alloc_transport(struct rxrpc_local *local,
 		spin_lock_init(&trans->client_lock);
 		rwlock_init(&trans->conn_lock);
 		atomic_set(&trans->usage, 1);
+		trans->conn_idcounter = peer->srx.srx_service << 16;
 		trans->debug_id = atomic_inc_return(&rxrpc_debug_id);
 
 		if (peer->srx.transport.family == AF_INET) {
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index d7a9ab5a9d9c..160480221224 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -132,8 +132,8 @@ static void rxkad_prime_packet_security(struct rxrpc_connection *conn)
 	desc.info = iv.x;
 	desc.flags = 0;
 
-	tmpbuf.x[0] = conn->epoch;
-	tmpbuf.x[1] = conn->cid;
+	tmpbuf.x[0] = htonl(conn->epoch);
+	tmpbuf.x[1] = htonl(conn->cid);
 	tmpbuf.x[2] = 0;
 	tmpbuf.x[3] = htonl(conn->security_ix);
 
@@ -169,8 +169,8 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
 
 	_enter("");
 
-	check = ntohl(sp->hdr.seq ^ sp->hdr.callNumber);
-	data_size |= (u32) check << 16;
+	check = sp->hdr.seq ^ sp->hdr.callNumber;
+	data_size |= (u32)check << 16;
 
 	tmpbuf.hdr.data_size = htonl(data_size);
 	memcpy(&tmpbuf.first, sechdr + 4, sizeof(tmpbuf.first));
@@ -215,9 +215,9 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
 
 	_enter("");
 
-	check = ntohl(sp->hdr.seq ^ sp->hdr.callNumber);
+	check = sp->hdr.seq ^ sp->hdr.callNumber;
 
-	rxkhdr.data_size = htonl(data_size | (u32) check << 16);
+	rxkhdr.data_size = htonl(data_size | (u32)check << 16);
 	rxkhdr.checksum = 0;
 
 	/* encrypt from the session key */
@@ -262,14 +262,13 @@ static int rxkad_secure_packet(const struct rxrpc_call *call,
 	struct {
 		__be32 x[2];
 	} tmpbuf __attribute__((aligned(8))); /* must all be in same page */
-	__be32 x;
-	u32 y;
+	u32 x, y;
 	int ret;
 
 	sp = rxrpc_skb(skb);
 
 	_enter("{%d{%x}},{#%u},%zu,",
-	       call->debug_id, key_serial(call->conn->key), ntohl(sp->hdr.seq),
+	       call->debug_id, key_serial(call->conn->key), sp->hdr.seq,
 	       data_size);
 
 	if (!call->conn->cipher)
@@ -286,10 +285,10 @@ static int rxkad_secure_packet(const struct rxrpc_call *call,
 	desc.flags = 0;
 
 	/* calculate the security checksum */
-	x = htonl(call->channel << (32 - RXRPC_CIDSHIFT));
-	x |= sp->hdr.seq & cpu_to_be32(0x3fffffff);
-	tmpbuf.x[0] = sp->hdr.callNumber;
-	tmpbuf.x[1] = x;
+	x = call->channel << (32 - RXRPC_CIDSHIFT);
+	x |= sp->hdr.seq & 0x3fffffff;
+	tmpbuf.x[0] = htonl(sp->hdr.callNumber);
+	tmpbuf.x[1] = htonl(x);
 
 	sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf));
 	sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf));
@@ -299,7 +298,7 @@ static int rxkad_secure_packet(const struct rxrpc_call *call,
 	y = (y >> 16) & 0xffff;
 	if (y == 0)
 		y = 1; /* zero checksums are not permitted */
-	sp->hdr.cksum = htons(y);
+	sp->hdr.cksum = y;
 
 	switch (call->conn->security_level) {
 	case RXRPC_SECURITY_PLAIN:
@@ -368,7 +367,7 @@ static int rxkad_verify_packet_auth(const struct rxrpc_call *call,
 	data_size = buf & 0xffff;
 
 	check = buf >> 16;
-	check ^= ntohl(sp->hdr.seq ^ sp->hdr.callNumber);
+	check ^= sp->hdr.seq ^ sp->hdr.callNumber;
 	check &= 0xffff;
 	if (check != 0) {
 		*_abort_code = RXKADSEALEDINCON;
@@ -453,7 +452,7 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call,
 	data_size = buf & 0xffff;
 
 	check = buf >> 16;
-	check ^= ntohl(sp->hdr.seq ^ sp->hdr.callNumber);
+	check ^= sp->hdr.seq ^ sp->hdr.callNumber;
 	check &= 0xffff;
 	if (check != 0) {
 		*_abort_code = RXKADSEALEDINCON;
@@ -494,16 +493,14 @@ static int rxkad_verify_packet(const struct rxrpc_call *call,
 	struct {
 		__be32 x[2];
 	} tmpbuf __attribute__((aligned(8))); /* must all be in same page */
-	__be32 x;
-	__be16 cksum;
-	u32 y;
+	u16 cksum;
+	u32 x, y;
 	int ret;
 
 	sp = rxrpc_skb(skb);
 
 	_enter("{%d{%x}},{#%u}",
-	       call->debug_id, key_serial(call->conn->key),
-	       ntohl(sp->hdr.seq));
+	       call->debug_id, key_serial(call->conn->key), sp->hdr.seq);
 
 	if (!call->conn->cipher)
 		return 0;
@@ -521,21 +518,20 @@ static int rxkad_verify_packet(const struct rxrpc_call *call,
 	desc.flags = 0;
 
 	/* validate the security checksum */
-	x = htonl(call->channel << (32 - RXRPC_CIDSHIFT));
-	x |= sp->hdr.seq & cpu_to_be32(0x3fffffff);
-	tmpbuf.x[0] = call->call_id;
-	tmpbuf.x[1] = x;
+	x = call->channel << (32 - RXRPC_CIDSHIFT);
+	x |= sp->hdr.seq & 0x3fffffff;
+	tmpbuf.x[0] = htonl(call->call_id);
+	tmpbuf.x[1] = htonl(x);
 
 	sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf));
 	sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf));
 	crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(tmpbuf));
 
 	y = ntohl(tmpbuf.x[1]);
-	y = (y >> 16) & 0xffff;
-	if (y == 0)
-		y = 1; /* zero checksums are not permitted */
+	cksum = (y >> 16) & 0xffff;
+	if (cksum == 0)
+		cksum = 1; /* zero checksums are not permitted */
 
-	cksum = htons(y);
 	if (sp->hdr.cksum != cksum) {
 		*_abort_code = RXKADSEALEDINCON;
 		_leave(" = -EPROTO [csum failed]");
@@ -567,10 +563,11 @@ static int rxkad_verify_packet(const struct rxrpc_call *call,
 static int rxkad_issue_challenge(struct rxrpc_connection *conn)
 {
 	struct rxkad_challenge challenge;
-	struct rxrpc_header hdr;
+	struct rxrpc_wire_header whdr;
 	struct msghdr msg;
 	struct kvec iov[2];
 	size_t len;
+	u32 serial;
 	int ret;
 
 	_enter("{%d,%x}", conn->debug_id, key_serial(conn->key));
@@ -592,26 +589,27 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn)
 	msg.msg_controllen = 0;
 	msg.msg_flags	= 0;
 
-	hdr.epoch	= conn->epoch;
-	hdr.cid		= conn->cid;
-	hdr.callNumber	= 0;
-	hdr.seq		= 0;
-	hdr.type	= RXRPC_PACKET_TYPE_CHALLENGE;
-	hdr.flags	= conn->out_clientflag;
-	hdr.userStatus	= 0;
-	hdr.securityIndex = conn->security_ix;
-	hdr._rsvd	= 0;
-	hdr.serviceId	= conn->service_id;
-
-	iov[0].iov_base	= &hdr;
-	iov[0].iov_len	= sizeof(hdr);
+	whdr.epoch	= htonl(conn->epoch);
+	whdr.cid	= htonl(conn->cid);
+	whdr.callNumber	= 0;
+	whdr.seq	= 0;
+	whdr.type	= RXRPC_PACKET_TYPE_CHALLENGE;
+	whdr.flags	= conn->out_clientflag;
+	whdr.userStatus	= 0;
+	whdr.securityIndex = conn->security_ix;
+	whdr._rsvd	= 0;
+	whdr.serviceId	= htons(conn->service_id);
+
+	iov[0].iov_base	= &whdr;
+	iov[0].iov_len	= sizeof(whdr);
 	iov[1].iov_base	= &challenge;
 	iov[1].iov_len	= sizeof(challenge);
 
 	len = iov[0].iov_len + iov[1].iov_len;
 
-	hdr.serial = htonl(atomic_inc_return(&conn->serial));
-	_proto("Tx CHALLENGE %%%u", ntohl(hdr.serial));
+	serial = atomic_inc_return(&conn->serial);
+	whdr.serial = htonl(serial);
+	_proto("Tx CHALLENGE %%%u", serial);
 
 	ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 2, len);
 	if (ret < 0) {
@@ -627,13 +625,15 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn)
  * send a Kerberos security response
  */
 static int rxkad_send_response(struct rxrpc_connection *conn,
-			       struct rxrpc_header *hdr,
+			       struct rxrpc_host_header *hdr,
 			       struct rxkad_response *resp,
 			       const struct rxkad_key *s2)
 {
+	struct rxrpc_wire_header whdr;
 	struct msghdr msg;
 	struct kvec iov[3];
 	size_t len;
+	u32 serial;
 	int ret;
 
 	_enter("");
@@ -644,24 +644,26 @@ static int rxkad_send_response(struct rxrpc_connection *conn,
 	msg.msg_controllen = 0;
 	msg.msg_flags	= 0;
 
-	hdr->epoch	= conn->epoch;
-	hdr->seq	= 0;
-	hdr->type	= RXRPC_PACKET_TYPE_RESPONSE;
-	hdr->flags	= conn->out_clientflag;
-	hdr->userStatus	= 0;
-	hdr->_rsvd	= 0;
+	memset(&whdr, 0, sizeof(whdr));
+	whdr.epoch	= htonl(hdr->epoch);
+	whdr.cid	= htonl(hdr->cid);
+	whdr.type	= RXRPC_PACKET_TYPE_RESPONSE;
+	whdr.flags	= conn->out_clientflag;
+	whdr.securityIndex = hdr->securityIndex;
+	whdr.serviceId	= htons(hdr->serviceId);
 
-	iov[0].iov_base	= hdr;
-	iov[0].iov_len	= sizeof(*hdr);
+	iov[0].iov_base	= &whdr;
+	iov[0].iov_len	= sizeof(whdr);
 	iov[1].iov_base	= resp;
 	iov[1].iov_len	= sizeof(*resp);
-	iov[2].iov_base	= (void *) s2->ticket;
+	iov[2].iov_base	= (void *)s2->ticket;
 	iov[2].iov_len	= s2->ticket_len;
 
 	len = iov[0].iov_len + iov[1].iov_len + iov[2].iov_len;
 
-	hdr->serial = htonl(atomic_inc_return(&conn->serial));
-	_proto("Tx RESPONSE %%%u", ntohl(hdr->serial));
+	serial = atomic_inc_return(&conn->serial);
+	whdr.serial = htonl(serial);
+	_proto("Tx RESPONSE %%%u", serial);
 
 	ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 3, len);
 	if (ret < 0) {
@@ -770,7 +772,7 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
 	min_level = ntohl(challenge.min_level);
 
 	_proto("Rx CHALLENGE %%%u { v=%u n=%u ml=%u }",
-	       ntohl(sp->hdr.serial), version, nonce, min_level);
+	       sp->hdr.serial, version, nonce, min_level);
 
 	abort_code = RXKADINCONSISTENCY;
 	if (version != RXKAD_VERSION)
@@ -786,17 +788,17 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
 	memset(&resp, 0, sizeof(resp));
 
 	resp.version = RXKAD_VERSION;
-	resp.encrypted.epoch = conn->epoch;
-	resp.encrypted.cid = conn->cid;
+	resp.encrypted.epoch = htonl(conn->epoch);
+	resp.encrypted.cid = htonl(conn->cid);
 	resp.encrypted.securityIndex = htonl(conn->security_ix);
 	resp.encrypted.call_id[0] =
-		(conn->channels[0] ? conn->channels[0]->call_id : 0);
+		htonl(conn->channels[0] ? conn->channels[0]->call_id : 0);
 	resp.encrypted.call_id[1] =
-		(conn->channels[1] ? conn->channels[1]->call_id : 0);
+		htonl(conn->channels[1] ? conn->channels[1]->call_id : 0);
 	resp.encrypted.call_id[2] =
-		(conn->channels[2] ? conn->channels[2]->call_id : 0);
+		htonl(conn->channels[2] ? conn->channels[2]->call_id : 0);
 	resp.encrypted.call_id[3] =
-		(conn->channels[3] ? conn->channels[3]->call_id : 0);
+		htonl(conn->channels[3] ? conn->channels[3]->call_id : 0);
 	resp.encrypted.inc_nonce = htonl(nonce + 1);
 	resp.encrypted.level = htonl(conn->security_level);
 	resp.kvno = htonl(token->kad->kvno);
@@ -1022,7 +1024,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
 	kvno = ntohl(response.kvno);
 	sp = rxrpc_skb(skb);
 	_proto("Rx RESPONSE %%%u { v=%u kv=%u tl=%u }",
-	       ntohl(sp->hdr.serial), version, kvno, ticket_len);
+	       sp->hdr.serial, version, kvno, ticket_len);
 
 	abort_code = RXKADINCONSISTENCY;
 	if (version != RXKAD_VERSION)
@@ -1058,9 +1060,9 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
 	rxkad_decrypt_response(conn, &response, &session_key);
 
 	abort_code = RXKADSEALEDINCON;
-	if (response.encrypted.epoch != conn->epoch)
+	if (ntohl(response.encrypted.epoch) != conn->epoch)
 		goto protocol_error_free;
-	if (response.encrypted.cid != conn->cid)
+	if (ntohl(response.encrypted.cid) != conn->cid)
 		goto protocol_error_free;
 	if (ntohl(response.encrypted.securityIndex) != conn->security_ix)
 		goto protocol_error_free;
@@ -1077,7 +1079,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
 		goto protocol_error_free;
 
 	abort_code = RXKADOUTOFSEQUENCE;
-	if (response.encrypted.inc_nonce != htonl(conn->security_nonce + 1))
+	if (ntohl(response.encrypted.inc_nonce) != conn->security_nonce + 1)
 		goto protocol_error_free;
 
 	abort_code = RXKADLEVELFAIL;
-- 
cgit v1.2.3-59-g8ed1b


From e33b3d97bc443aa330f086ca90a209a0d35dfa86 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 4 Mar 2016 15:54:27 +0000
Subject: rxrpc: The protocol family should be set to PF_RXRPC not PF_UNIX

Fix the protocol family set in the proto_ops for rxrpc to be PF_RXRPC not
PF_UNIX.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/af_rxrpc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 7bb5cca0ae32..5e810b83269f 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -748,7 +748,7 @@ static int rxrpc_release(struct socket *sock)
  * RxRPC network protocol
  */
 static const struct proto_ops rxrpc_rpc_ops = {
-	.family		= PF_UNIX,
+	.family		= PF_RXRPC,
 	.owner		= THIS_MODULE,
 	.release	= rxrpc_release,
 	.bind		= rxrpc_bind,
-- 
cgit v1.2.3-59-g8ed1b


From ee6fe085a992d40d2b5d3e7e2b9e0eec1cdacaac Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 4 Mar 2016 15:55:32 +0000
Subject: rxrpc: Fix defined range for /proc/sys/net/rxrpc/rx_mtu

The upper bound of the defined range for rx_mtu is being set in the same
member as the lower bound (extra1) rather than the correct place (extra2).
I'm not entirely sure why this compiles.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/sysctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c
index 50a98a910eb1..093547ac2bcd 100644
--- a/net/rxrpc/sysctl.c
+++ b/net/rxrpc/sysctl.c
@@ -115,7 +115,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= (void *)&one,
-		.extra1		= (void *)&n_65535,
+		.extra2		= (void *)&n_65535,
 	},
 	{
 		.procname	= "rx_jumbo_max",
-- 
cgit v1.2.3-59-g8ed1b


From 351c1e648623b742fe1687636117306adc8b561c Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 4 Mar 2016 15:56:06 +0000
Subject: rxrpc: Be more selective about the types of received packets we
 accept

Currently, received RxRPC packets outside the range 1-13 are rejected.
There are, however, holes in the range that should also be rejected - plus
at least one type we don't yet support - so reject these also.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 include/rxrpc/packet.h | 11 +++++++++++
 net/rxrpc/ar-input.c   |  3 ++-
 2 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/rxrpc/packet.h b/include/rxrpc/packet.h
index de1e67988ada..9ebab3a8cf0a 100644
--- a/include/rxrpc/packet.h
+++ b/include/rxrpc/packet.h
@@ -70,6 +70,17 @@ struct rxrpc_wire_header {
 
 extern const char *rxrpc_pkts[];
 
+#define RXRPC_SUPPORTED_PACKET_TYPES (			\
+		(1 << RXRPC_PACKET_TYPE_DATA) |		\
+		(1 << RXRPC_PACKET_TYPE_ACK) |		\
+		(1 << RXRPC_PACKET_TYPE_BUSY) |		\
+		(1 << RXRPC_PACKET_TYPE_ABORT) |	\
+		(1 << RXRPC_PACKET_TYPE_ACKALL) |	\
+		(1 << RXRPC_PACKET_TYPE_CHALLENGE) |	\
+		(1 << RXRPC_PACKET_TYPE_RESPONSE) |	\
+		/*(1 << RXRPC_PACKET_TYPE_DEBUG) | */	\
+		(1 << RXRPC_PACKET_TYPE_VERSION))
+
 /*****************************************************************************/
 /*
  * jumbo packet secondary header
diff --git a/net/rxrpc/ar-input.c b/net/rxrpc/ar-input.c
index e6396a8c969f..63ed75c40e29 100644
--- a/net/rxrpc/ar-input.c
+++ b/net/rxrpc/ar-input.c
@@ -728,7 +728,8 @@ void rxrpc_data_ready(struct sock *sk)
 	     sp->hdr.flags & RXRPC_CLIENT_INITIATED ? "ToServer" : "ToClient",
 	     sp->hdr.epoch, sp->hdr.cid, sp->hdr.callNumber);
 
-	if (sp->hdr.type == 0 || sp->hdr.type >= RXRPC_N_PACKET_TYPES) {
+	if (sp->hdr.type >= RXRPC_N_PACKET_TYPES ||
+	    !((RXRPC_SUPPORTED_PACKET_TYPES >> sp->hdr.type) & 1)) {
 		_proto("Rx Bad Packet Type %u", sp->hdr.type);
 		goto bad_message;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From b4f1342f915201ee15ef6890857b5469879ee402 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 4 Mar 2016 15:56:19 +0000
Subject: rxrpc: Adjust some whitespace and comments

Remove some excess whitespace, insert some missing spaces and adjust a
couple of comments.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/af_rxrpc.c    |  9 +++------
 net/rxrpc/ar-accept.c   |  4 +---
 net/rxrpc/ar-error.c    |  1 -
 net/rxrpc/ar-internal.h | 22 +++++++++++-----------
 net/rxrpc/ar-security.c |  2 +-
 net/rxrpc/ar-skbuff.c   |  1 -
 net/rxrpc/rxkad.c       | 12 ++++++------
 7 files changed, 22 insertions(+), 29 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 5e810b83269f..b46f7742d336 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -121,7 +121,7 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx,
  */
 static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len)
 {
-	struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *) saddr;
+	struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *)saddr;
 	struct sock *sk = sock->sk;
 	struct rxrpc_local *local;
 	struct rxrpc_sock *rx = rxrpc_sk(sk), *prx;
@@ -142,7 +142,7 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len)
 
 	memcpy(&rx->srx, srx, sizeof(rx->srx));
 
-	/* find a local transport endpoint if we don't have one already */
+	/* Find or create a local transport endpoint to use */
 	local = rxrpc_lookup_local(&rx->srx);
 	if (IS_ERR(local)) {
 		ret = PTR_ERR(local);
@@ -297,7 +297,6 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
 
 	if (!srx)
 		srx = &rx->srx;
-
 	if (!key)
 		key = rx->key;
 	if (key && !key->payload.data[0])
@@ -319,7 +318,6 @@ out_notrans:
 	_leave(" = %p", call);
 	return call;
 }
-
 EXPORT_SYMBOL(rxrpc_kernel_begin_call);
 
 /**
@@ -335,7 +333,6 @@ void rxrpc_kernel_end_call(struct rxrpc_call *call)
 	rxrpc_remove_user_ID(call->socket, call);
 	rxrpc_put_call(call);
 }
-
 EXPORT_SYMBOL(rxrpc_kernel_end_call);
 
 /**
@@ -616,7 +613,7 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol,
 	if (!net_eq(net, &init_net))
 		return -EAFNOSUPPORT;
 
-	/* we support transport protocol UDP only */
+	/* we support transport protocol UDP/UDP6 only */
 	if (protocol != PF_INET)
 		return -EPROTONOSUPPORT;
 
diff --git a/net/rxrpc/ar-accept.c b/net/rxrpc/ar-accept.c
index 73c905416271..277731a5e67a 100644
--- a/net/rxrpc/ar-accept.c
+++ b/net/rxrpc/ar-accept.c
@@ -415,7 +415,7 @@ out:
 }
 
 /*
- * handle rejectance of a call by userspace
+ * Handle rejection of a call by userspace
  * - reject the call at the front of the queue
  */
 int rxrpc_reject_call(struct rxrpc_sock *rx)
@@ -495,7 +495,6 @@ struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *sock,
 	_leave(" = %p", call);
 	return call;
 }
-
 EXPORT_SYMBOL(rxrpc_kernel_accept_call);
 
 /**
@@ -514,5 +513,4 @@ int rxrpc_kernel_reject_call(struct socket *sock)
 	_leave(" = %d", ret);
 	return ret;
 }
-
 EXPORT_SYMBOL(rxrpc_kernel_reject_call);
diff --git a/net/rxrpc/ar-error.c b/net/rxrpc/ar-error.c
index 3c47441c472b..5948191bf4eb 100644
--- a/net/rxrpc/ar-error.c
+++ b/net/rxrpc/ar-error.c
@@ -115,7 +115,6 @@ void rxrpc_UDP_error_report(struct sock *sk)
 	/* pass the transport ref to error_handler to release */
 	skb_queue_tail(&trans->error_queue, skb);
 	rxrpc_queue_work(&trans->error_handler);
-
 	_leave("");
 }
 
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 06bf5abd920d..8b495aed517d 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -16,7 +16,7 @@
 	BUG_ON(atomic_read((X)) >> (sizeof(atomic_t) - 2) == \
 	       (POISON_FREE << 8 | POISON_FREE))
 #else
-#define CHECK_SLAB_OKAY(X) do {} while(0)
+#define CHECK_SLAB_OKAY(X) do {} while (0)
 #endif
 
 #define FCRYPT_BSIZE 8
@@ -726,7 +726,7 @@ do {								\
 		printk(KERN_ERR "RxRPC: Assertion failed\n");	\
 		BUG();						\
 	}							\
-} while(0)
+} while (0)
 
 #define ASSERTCMP(X, OP, Y)						\
 do {									\
@@ -739,7 +739,7 @@ do {									\
 		       (unsigned long)(X), (unsigned long)(Y));		\
 		BUG();							\
 	}								\
-} while(0)
+} while (0)
 
 #define ASSERTIF(C, X)						\
 do {								\
@@ -748,7 +748,7 @@ do {								\
 		printk(KERN_ERR "RxRPC: Assertion failed\n");	\
 		BUG();						\
 	}							\
-} while(0)
+} while (0)
 
 #define ASSERTIFCMP(C, X, OP, Y)					\
 do {									\
@@ -761,25 +761,25 @@ do {									\
 		       (unsigned long)(X), (unsigned long)(Y));		\
 		BUG();							\
 	}								\
-} while(0)
+} while (0)
 
 #else
 
 #define ASSERT(X)				\
 do {						\
-} while(0)
+} while (0)
 
 #define ASSERTCMP(X, OP, Y)			\
 do {						\
-} while(0)
+} while (0)
 
 #define ASSERTIF(C, X)				\
 do {						\
-} while(0)
+} while (0)
 
 #define ASSERTIFCMP(C, X, OP, Y)		\
 do {						\
-} while(0)
+} while (0)
 
 #endif /* __KDEBUGALL */
 
@@ -836,9 +836,9 @@ do {							\
 	CHECK_SLAB_OKAY(&(CALL)->usage);		\
 	if (atomic_inc_return(&(CALL)->usage) == 1)	\
 		BUG();					\
-} while(0)
+} while (0)
 
 #define rxrpc_put_call(CALL)				\
 do {							\
 	__rxrpc_put_call(CALL);				\
-} while(0)
+} while (0)
diff --git a/net/rxrpc/ar-security.c b/net/rxrpc/ar-security.c
index e2f4c49a9246..ceff6394a65f 100644
--- a/net/rxrpc/ar-security.c
+++ b/net/rxrpc/ar-security.c
@@ -167,7 +167,7 @@ int rxrpc_init_server_conn_security(struct rxrpc_connection *conn)
 	struct rxrpc_sock *rx;
 	struct key *key;
 	key_ref_t kref;
-	char kdesc[5+1+3+1];
+	char kdesc[5 + 1 + 3 + 1];
 
 	_enter("");
 
diff --git a/net/rxrpc/ar-skbuff.c b/net/rxrpc/ar-skbuff.c
index ae9f93f94ed2..62a267472fce 100644
--- a/net/rxrpc/ar-skbuff.c
+++ b/net/rxrpc/ar-skbuff.c
@@ -133,5 +133,4 @@ void rxrpc_kernel_free_skb(struct sk_buff *skb)
 {
 	rxrpc_free_skb(skb);
 }
-
 EXPORT_SYMBOL(rxrpc_kernel_free_skb);
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index 160480221224..0f27524536bd 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -195,9 +195,9 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
  * wholly encrypt a packet (level 2 security)
  */
 static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
-					struct sk_buff *skb,
-					u32 data_size,
-					void *sechdr)
+				       struct sk_buff *skb,
+				       u32 data_size,
+				       void *sechdr)
 {
 	const struct rxrpc_key_token *token;
 	struct rxkad_level2_hdr rxkhdr
@@ -251,9 +251,9 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
  * checksum an RxRPC packet header
  */
 static int rxkad_secure_packet(const struct rxrpc_call *call,
-				struct sk_buff *skb,
-				size_t data_size,
-				void *sechdr)
+			       struct sk_buff *skb,
+			       size_t data_size,
+			       void *sechdr)
 {
 	struct rxrpc_skb_priv *sp;
 	struct blkcipher_desc desc;
-- 
cgit v1.2.3-59-g8ed1b


From ee72b9fddb41a2087b4a7aa74791ef1700f3f6cc Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 4 Mar 2016 15:58:06 +0000
Subject: rxrpc: Use ACCESS_ONCE() when accessing circular buffer pointers

Use ACCESS_ONCE() when accessing the other-end pointer into a circular
buffer as it's possible the other-end pointer might change whilst we're
doing this, and if we access it twice, we might get some weird things
happening.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/ar-output.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c
index 353f5c9141ea..14c8df6b7f41 100644
--- a/net/rxrpc/ar-output.c
+++ b/net/rxrpc/ar-output.c
@@ -401,7 +401,8 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx,
 	int ret;
 
 	_enter(",{%d},%ld",
-	       CIRC_SPACE(call->acks_head, call->acks_tail, call->acks_winsz),
+	       CIRC_SPACE(call->acks_head, ACCESS_ONCE(call->acks_tail),
+			  call->acks_winsz),
 	       *timeo);
 
 	add_wait_queue(&call->tx_waitq, &myself);
@@ -409,7 +410,7 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx,
 	for (;;) {
 		set_current_state(TASK_INTERRUPTIBLE);
 		ret = 0;
-		if (CIRC_SPACE(call->acks_head, call->acks_tail,
+		if (CIRC_SPACE(call->acks_head, ACCESS_ONCE(call->acks_tail),
 			       call->acks_winsz) > 0)
 			break;
 		if (signal_pending(current)) {
@@ -570,7 +571,8 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
 
 			_debug("alloc");
 
-			if (CIRC_SPACE(call->acks_head, call->acks_tail,
+			if (CIRC_SPACE(call->acks_head,
+				       ACCESS_ONCE(call->acks_tail),
 				       call->acks_winsz) <= 0) {
 				ret = -EAGAIN;
 				if (msg->msg_flags & MSG_DONTWAIT)
@@ -686,7 +688,8 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
 			sp->hdr.flags = conn->out_clientflag;
 			if (msg_data_left(msg) == 0 && !more)
 				sp->hdr.flags |= RXRPC_LAST_PACKET;
-			else if (CIRC_SPACE(call->acks_head, call->acks_tail,
+			else if (CIRC_SPACE(call->acks_head,
+					    ACCESS_ONCE(call->acks_tail),
 					    call->acks_winsz) > 1)
 				sp->hdr.flags |= RXRPC_MORE_PACKETS;
 			if (more && seq & 1)
-- 
cgit v1.2.3-59-g8ed1b


From 098a20991d15b76a7e6835b95bd58269e10e3a60 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 4 Mar 2016 15:59:00 +0000
Subject: rxrpc: rxkad: The version number in the response should be net byte
 order

The version number rxkad places in the response should be network byte
order.

Whilst we're at it, rearrange the code to be more readable.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/rxkad.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index 0f27524536bd..ca746f382517 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -787,10 +787,15 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
 	/* build the response packet */
 	memset(&resp, 0, sizeof(resp));
 
-	resp.version = RXKAD_VERSION;
-	resp.encrypted.epoch = htonl(conn->epoch);
-	resp.encrypted.cid = htonl(conn->cid);
-	resp.encrypted.securityIndex = htonl(conn->security_ix);
+	resp.version			= htonl(RXKAD_VERSION);
+	resp.encrypted.epoch		= htonl(conn->epoch);
+	resp.encrypted.cid		= htonl(conn->cid);
+	resp.encrypted.securityIndex	= htonl(conn->security_ix);
+	resp.encrypted.inc_nonce	= htonl(nonce + 1);
+	resp.encrypted.level		= htonl(conn->security_level);
+	resp.kvno			= htonl(token->kad->kvno);
+	resp.ticket_len			= htonl(token->kad->ticket_len);
+
 	resp.encrypted.call_id[0] =
 		htonl(conn->channels[0] ? conn->channels[0]->call_id : 0);
 	resp.encrypted.call_id[1] =
@@ -799,10 +804,6 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
 		htonl(conn->channels[2] ? conn->channels[2]->call_id : 0);
 	resp.encrypted.call_id[3] =
 		htonl(conn->channels[3] ? conn->channels[3]->call_id : 0);
-	resp.encrypted.inc_nonce = htonl(nonce + 1);
-	resp.encrypted.level = htonl(conn->security_level);
-	resp.kvno = htonl(token->kad->kvno);
-	resp.ticket_len = htonl(token->kad->ticket_len);
 
 	/* calculate the response checksum and then do the encryption */
 	rxkad_calc_response_checksum(&resp);
-- 
cgit v1.2.3-59-g8ed1b


From 2b15ef15bca689d60be27f6ffdaa2c92a59ae6e7 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 4 Mar 2016 15:59:13 +0000
Subject: rxrpc: rxkad: Casts are needed when comparing be32 values

Forced casts are needed to avoid sparse warning when directly comparing
be32 values.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/rxkad.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index ca746f382517..3106a0c4960b 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -142,7 +142,7 @@ static void rxkad_prime_packet_security(struct rxrpc_connection *conn)
 	crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(tmpbuf));
 
 	memcpy(&conn->csum_iv, &tmpbuf.x[2], sizeof(conn->csum_iv));
-	ASSERTCMP(conn->csum_iv.n[0], ==, tmpbuf.x[2]);
+	ASSERTCMP((u32 __force)conn->csum_iv.n[0], ==, (u32 __force)tmpbuf.x[2]);
 
 	_leave("");
 }
-- 
cgit v1.2.3-59-g8ed1b


From ab802ee0ab5f4e1e7ee17632627726fbd7881e6c Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 4 Mar 2016 15:59:49 +0000
Subject: rxrpc: Clear the unused part of a sockaddr_rxrpc for memcmp() use

Clear the unused part of a sockaddr_rxrpc structs so that memcmp() can be
used to compare them.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/af_rxrpc.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index b46f7742d336..a76501757b59 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -81,6 +81,8 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx,
 				  struct sockaddr_rxrpc *srx,
 				  int len)
 {
+	unsigned tail;
+
 	if (len < sizeof(struct sockaddr_rxrpc))
 		return -EINVAL;
 
@@ -103,9 +105,7 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx,
 		_debug("INET: %x @ %pI4",
 		       ntohs(srx->transport.sin.sin_port),
 		       &srx->transport.sin.sin_addr);
-		if (srx->transport_len > 8)
-			memset((void *)&srx->transport + 8, 0,
-			       srx->transport_len - 8);
+		tail = offsetof(struct sockaddr_rxrpc, transport.sin.__pad);
 		break;
 
 	case AF_INET6:
@@ -113,6 +113,8 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx,
 		return -EAFNOSUPPORT;
 	}
 
+	if (tail < len)
+		memset((void *)srx + tail, 0, len - tail);
 	return 0;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From a4373a489e87e2bf8794194bc275b6c05f1af2f7 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 4 Mar 2016 16:02:03 +0000
Subject: rxrpc: Don't try to map ICMP to error as the lower layer already did
 that

In the ICMP message processing code, don't try to map ICMP codes to UNIX
error codes as the caller (IPv4/IPv6) already did that for us (ee_errno).

Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/ar-error.c | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/ar-error.c b/net/rxrpc/ar-error.c
index 5948191bf4eb..3e82d6f0313c 100644
--- a/net/rxrpc/ar-error.c
+++ b/net/rxrpc/ar-error.c
@@ -151,28 +151,18 @@ void rxrpc_UDP_error_handler(struct work_struct *work)
 			switch (ee->ee_code) {
 			case ICMP_NET_UNREACH:
 				_net("Rx Received ICMP Network Unreachable");
-				err = ENETUNREACH;
 				break;
 			case ICMP_HOST_UNREACH:
 				_net("Rx Received ICMP Host Unreachable");
-				err = EHOSTUNREACH;
 				break;
 			case ICMP_PORT_UNREACH:
 				_net("Rx Received ICMP Port Unreachable");
-				err = ECONNREFUSED;
-				break;
-			case ICMP_FRAG_NEEDED:
-				_net("Rx Received ICMP Fragmentation Needed (%d)",
-				     ee->ee_info);
-				err = 0; /* dealt with elsewhere */
 				break;
 			case ICMP_NET_UNKNOWN:
 				_net("Rx Received ICMP Unknown Network");
-				err = ENETUNREACH;
 				break;
 			case ICMP_HOST_UNKNOWN:
 				_net("Rx Received ICMP Unknown Host");
-				err = EHOSTUNREACH;
 				break;
 			default:
 				_net("Rx Received ICMP DestUnreach code=%u",
-- 
cgit v1.2.3-59-g8ed1b


From d57a544d71092a81c2064ff242009a38e7b1a631 Mon Sep 17 00:00:00 2001
From: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
Date: Thu, 3 Mar 2016 01:16:56 +0000
Subject: mac80211: use reset to set header pointer

Since offset is zero, it's not necessary to use set function. Reset
function is straightforward, and will remove the unnecessary add
operation in set function.

Signed-off-by: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/mac80211/mesh_hwmp.c | 6 +++---
 net/mac80211/rx.c        | 2 +-
 net/mac80211/status.c    | 2 +-
 net/mac80211/tx.c        | 8 ++++----
 4 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
index c6be0b4f4058..5b6aec1a0630 100644
--- a/net/mac80211/mesh_hwmp.c
+++ b/net/mac80211/mesh_hwmp.c
@@ -205,9 +205,9 @@ static void prepare_frame_for_deferred_tx(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
 
-	skb_set_mac_header(skb, 0);
-	skb_set_network_header(skb, 0);
-	skb_set_transport_header(skb, 0);
+	skb_reset_mac_header(skb);
+	skb_reset_network_header(skb);
+	skb_reset_transport_header(skb);
 
 	/* Send all internal mgmt frames on VO. Accordingly set TID to 7. */
 	skb_set_queue_mapping(skb, IEEE80211_AC_VO);
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 91279576f4a7..5690e4c67486 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -3101,7 +3101,7 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx,
 	ieee80211_add_rx_radiotap_header(local, skb, rate, needed_headroom,
 					 false);
 
-	skb_set_mac_header(skb, 0);
+	skb_reset_mac_header(skb);
 	skb->ip_summed = CHECKSUM_UNNECESSARY;
 	skb->pkt_type = PACKET_OTHERHOST;
 	skb->protocol = htons(ETH_P_802_2);
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index 6101deb805a8..8b1b2ea03eb5 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -697,7 +697,7 @@ void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb,
 					 rtap_len, shift);
 
 	/* XXX: is this sufficient for BPF? */
-	skb_set_mac_header(skb, 0);
+	skb_reset_mac_header(skb);
 	skb->ip_summed = CHECKSUM_UNNECESSARY;
 	skb->pkt_type = PACKET_OTHERHOST;
 	skb->protocol = htons(ETH_P_802_2);
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 3a7475ff1a41..62ad5321257d 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -2458,7 +2458,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
 	/* Update skb pointers to various headers since this modified frame
 	 * is going to go through Linux networking code that may potentially
 	 * need things like pointer to IP header. */
-	skb_set_mac_header(skb, 0);
+	skb_reset_mac_header(skb);
 	skb_set_network_header(skb, nh_pos);
 	skb_set_transport_header(skb, h_pos);
 
@@ -3973,9 +3973,9 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
 {
 	int ac = ieee802_1d_to_ac[tid & 7];
 
-	skb_set_mac_header(skb, 0);
-	skb_set_network_header(skb, 0);
-	skb_set_transport_header(skb, 0);
+	skb_reset_mac_header(skb);
+	skb_reset_network_header(skb);
+	skb_reset_transport_header(skb);
 
 	skb_set_queue_mapping(skb, ac);
 	skb->priority = tid;
-- 
cgit v1.2.3-59-g8ed1b


From 3ef523aeeee88130e5de10adac719dbd6fa266e5 Mon Sep 17 00:00:00 2001
From: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
Date: Thu, 3 Mar 2016 01:16:57 +0000
Subject: wireless: use reset to set mac header

Since offset is zero, it's not necessary to use set function. Reset
function is straightforward, and will remove the unnecessary add
operation in set function.

Signed-off-by: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/wireless/util.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/wireless/util.c b/net/wireless/util.c
index c7f6820bb258..9f440a9de63b 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -636,7 +636,7 @@ int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr,
 	/* Update skb pointers to various headers since this modified frame
 	 * is going to go through Linux networking code that may potentially
 	 * need things like pointer to IP header. */
-	skb_set_mac_header(skb, 0);
+	skb_reset_mac_header(skb);
 	skb_set_network_header(skb, nh_pos);
 	skb_set_transport_header(skb, h_pos);
 
-- 
cgit v1.2.3-59-g8ed1b


From 3f20efba41916ee17ce82f0fdd02581ada2872b2 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 27 Jan 2016 14:52:02 +0100
Subject: ipvs: handle ip_vs_fill_iph_skb_off failure

ip_vs_fill_iph_skb_off() may not find an IP header, and gcc has
determined that ip_vs_sip_fill_param() then incorrectly accesses
the protocol fields:

net/netfilter/ipvs/ip_vs_pe_sip.c: In function 'ip_vs_sip_fill_param':
net/netfilter/ipvs/ip_vs_pe_sip.c:76:5: error: 'iph.protocol' may be used uninitialized in this function [-Werror=maybe-uninitialized]
  if (iph.protocol != IPPROTO_UDP)
     ^
net/netfilter/ipvs/ip_vs_pe_sip.c:81:10: error: 'iph.len' may be used uninitialized in this function [-Werror=maybe-uninitialized]
  dataoff = iph.len + sizeof(struct udphdr);
          ^

This adds a check for the ip_vs_fill_iph_skb_off() return code
before looking at the ip header data returned from it.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Fixes: b0e010c527de ("ipvs: replace ip_vs_fill_ip4hdr with ip_vs_fill_iph_skb_off")
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_pe_sip.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c
index 1b8d594e493a..c4e9ca016a88 100644
--- a/net/netfilter/ipvs/ip_vs_pe_sip.c
+++ b/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -70,10 +70,10 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
 	const char *dptr;
 	int retc;
 
-	ip_vs_fill_iph_skb(p->af, skb, false, &iph);
+	retc = ip_vs_fill_iph_skb(p->af, skb, false, &iph);
 
 	/* Only useful with UDP */
-	if (iph.protocol != IPPROTO_UDP)
+	if (!retc || iph.protocol != IPPROTO_UDP)
 		return -EINVAL;
 	/* todo: IPv6 fragments:
 	 *       I think this only should be done for the first fragment. /HS
-- 
cgit v1.2.3-59-g8ed1b


From f719e3754ee2f7275437e61a6afd520181fdd43b Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Sat, 5 Mar 2016 15:03:22 +0200
Subject: ipvs: drop first packet to redirect conntrack

Jiri Bohac is reporting for a problem where the attempt
to reschedule existing connection to another real server
needs proper redirect for the conntrack used by the IPVS
connection. For example, when IPVS connection is created
to NAT-ed real server we alter the reply direction of
conntrack. If we later decide to select different real
server we can not alter again the conntrack. And if we
expire the old connection, the new connection is left
without conntrack.

So, the only way to redirect both the IPVS connection and
the Netfilter's conntrack is to drop the SYN packet that
hits existing connection, to wait for the next jiffie
to expire the old connection and its conntrack and to rely
on client's retransmission to create new connection as
usually.

Jiri Bohac provided a fix that drops all SYNs on rescheduling,
I extended his patch to do such drops only for connections
that use conntrack. Here is the original report from Jiri Bohac:

Since commit dc7b3eb900aa ("ipvs: Fix reuse connection if real server
is dead"), new connections to dead servers are redistributed
immediately to new servers.  The old connection is expired using
ip_vs_conn_expire_now() which sets the connection timer to expire
immediately.

However, before the timer callback, ip_vs_conn_expire(), is run
to clean the connection's conntrack entry, the new redistributed
connection may already be established and its conntrack removed
instead.

Fix this by dropping the first packet of the new connection
instead, like we do when the destination server is not available.
The timer will have deleted the old conntrack entry long before
the first packet of the new connection is retransmitted.

Fixes: dc7b3eb900aa ("ipvs: Fix reuse connection if real server is dead")
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/net/ip_vs.h             | 17 +++++++++++++++++
 net/netfilter/ipvs/ip_vs_core.c | 37 ++++++++++++++++++++++++++++---------
 2 files changed, 45 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 0816c872b689..a6cc576fd467 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -1588,6 +1588,23 @@ static inline void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
 }
 #endif /* CONFIG_IP_VS_NFCT */
 
+/* Really using conntrack? */
+static inline bool ip_vs_conn_uses_conntrack(struct ip_vs_conn *cp,
+					     struct sk_buff *skb)
+{
+#ifdef CONFIG_IP_VS_NFCT
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+
+	if (!(cp->flags & IP_VS_CONN_F_NFCT))
+		return false;
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct && !nf_ct_is_untracked(ct))
+		return true;
+#endif
+	return false;
+}
+
 static inline int
 ip_vs_dest_conn_overhead(struct ip_vs_dest *dest)
 {
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index f57b4dcdb233..4da560005b0e 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1757,15 +1757,34 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
 	cp = pp->conn_in_get(ipvs, af, skb, &iph);
 
 	conn_reuse_mode = sysctl_conn_reuse_mode(ipvs);
-	if (conn_reuse_mode && !iph.fragoffs &&
-	    is_new_conn(skb, &iph) && cp &&
-	    ((unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest &&
-	      unlikely(!atomic_read(&cp->dest->weight))) ||
-	     unlikely(is_new_conn_expected(cp, conn_reuse_mode)))) {
-		if (!atomic_read(&cp->n_control))
-			ip_vs_conn_expire_now(cp);
-		__ip_vs_conn_put(cp);
-		cp = NULL;
+	if (conn_reuse_mode && !iph.fragoffs && is_new_conn(skb, &iph) && cp) {
+		bool uses_ct = false, resched = false;
+
+		if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest &&
+		    unlikely(!atomic_read(&cp->dest->weight))) {
+			resched = true;
+			uses_ct = ip_vs_conn_uses_conntrack(cp, skb);
+		} else if (is_new_conn_expected(cp, conn_reuse_mode)) {
+			uses_ct = ip_vs_conn_uses_conntrack(cp, skb);
+			if (!atomic_read(&cp->n_control)) {
+				resched = true;
+			} else {
+				/* Do not reschedule controlling connection
+				 * that uses conntrack while it is still
+				 * referenced by controlled connection(s).
+				 */
+				resched = !uses_ct;
+			}
+		}
+
+		if (resched) {
+			if (!atomic_read(&cp->n_control))
+				ip_vs_conn_expire_now(cp);
+			__ip_vs_conn_put(cp);
+			if (uses_ct)
+				return NF_DROP;
+			cp = NULL;
+		}
 	}
 
 	if (unlikely(!cp)) {
-- 
cgit v1.2.3-59-g8ed1b


From f911b675a06d05211da52cf8267db80c81b6aee4 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Sat, 5 Mar 2016 15:03:23 +0200
Subject: ipvs: allow rescheduling after RST

"RFC 5961, 4.2. Mitigation" describes a mechanism to request
client to confirm with RST the restart of TCP connection
before resending its SYN. As result, IPVS can see SYNs for
existing connection in CLOSE state. Add check to allow
rescheduling in this state.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_core.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 4da560005b0e..b9a4082afa3a 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1089,6 +1089,7 @@ static inline bool is_new_conn_expected(const struct ip_vs_conn *cp,
 	switch (cp->protocol) {
 	case IPPROTO_TCP:
 		return (cp->state == IP_VS_TCP_S_TIME_WAIT) ||
+		       (cp->state == IP_VS_TCP_S_CLOSE) ||
 			((conn_reuse_mode & 2) &&
 			 (cp->state == IP_VS_TCP_S_FIN_WAIT) &&
 			 (cp->flags & IP_VS_CONN_F_NOOUTPUT));
-- 
cgit v1.2.3-59-g8ed1b


From 7617a24f83b5d67f4dab1844956be1cebc44aec8 Mon Sep 17 00:00:00 2001
From: Marco Angaroni <marcoangaroni@gmail.com>
Date: Sat, 5 Mar 2016 12:10:02 +0100
Subject: ipvs: correct initial offset of Call-ID header search in SIP
 persistence engine

The IPVS SIP persistence engine is not able to parse the SIP header
"Call-ID" when such header is inserted in the first positions of
the SIP message.

When IPVS is configured with "--pe sip" option, like for example:
ipvsadm -A -u 1.2.3.4:5060 -s rr --pe sip -p 120 -o
some particular messages (see below for details) do not create entries
in the connection template table, which can be listed with:
ipvsadm -Lcn --persistent-conn

Problematic SIP messages are SIP responses having "Call-ID" header
positioned just after message first line:
SIP/2.0 200 OK
[Call-ID header here]
[rest of the headers]

When "Call-ID" header is positioned down (after a few other headers)
it is correctly recognized.

This is due to the data offset used in get_callid function call inside
ip_vs_pe_sip.c file: since dptr already points to the start of the
SIP message, the value of dataoff should be initially 0.
Otherwise the header is searched starting from some bytes after the
first character of the SIP message.

Fixes: 758ff0338722 ("IPVS: sip persistence engine")
Signed-off-by: Marco Angaroni <marcoangaroni@gmail.com>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_pe_sip.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c
index c4e9ca016a88..0a6eb5c0d9e9 100644
--- a/net/netfilter/ipvs/ip_vs_pe_sip.c
+++ b/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -88,7 +88,7 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
 	dptr = skb->data + dataoff;
 	datalen = skb->len - dataoff;
 
-	if (get_callid(dptr, dataoff, datalen, &matchoff, &matchlen))
+	if (get_callid(dptr, 0, datalen, &matchoff, &matchlen))
 		return -EINVAL;
 
 	/* N.B: pe_data is only set on success,
-- 
cgit v1.2.3-59-g8ed1b


From d1491fa54ceecfc32147626375e27937cb54b091 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Wed, 2 Mar 2016 11:20:36 -0800
Subject: act_ife: fix a typo in kmemdup() parameters

Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_ife.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 6e7ec257790d..c589a9ba506a 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -108,7 +108,7 @@ EXPORT_SYMBOL_GPL(ife_get_meta_u16);
 
 int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval)
 {
-	mi->metaval = kmemdup(&metaval, sizeof(u32), GFP_KERNEL);
+	mi->metaval = kmemdup(metaval, sizeof(u32), GFP_KERNEL);
 	if (!mi->metaval)
 		return -ENOMEM;
 
@@ -118,7 +118,7 @@ EXPORT_SYMBOL_GPL(ife_alloc_meta_u32);
 
 int ife_alloc_meta_u16(struct tcf_meta_info *mi, void *metaval)
 {
-	mi->metaval = kmemdup(&metaval, sizeof(u16), GFP_KERNEL);
+	mi->metaval = kmemdup(metaval, sizeof(u16), GFP_KERNEL);
 	if (!mi->metaval)
 		return -ENOMEM;
 
-- 
cgit v1.2.3-59-g8ed1b


From 9b3009604b8e305c32125530a3cbcc57d88f3bc1 Mon Sep 17 00:00:00 2001
From: Richard Alpe <richard.alpe@ericsson.com>
Date: Thu, 3 Mar 2016 14:20:40 +0100
Subject: tipc: add net device to skb before UDP xmit

Prior to this patch enabling a IPv4 UDP bearer caused a null pointer
dereference in iptunnel_xmit_stats(), when it tried to dereference the
net device from the skb. To resolve this we now point the skb device
to the net device resolved from the routing table.

Fixes: 039f50629b7f (ip_tunnel: Move stats update to iptunnel_xmit())
Signed-off-by: Richard Alpe <richard.alpe@ericsson.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Reviewed-by: Erik Hugne <erik.hugne@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/udp_media.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index d63a911e7fe2..f22a5bb169c9 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -181,6 +181,8 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,
 			err = PTR_ERR(rt);
 			goto tx_error;
 		}
+
+		skb->dev = rt->dst.dev;
 		ttl = ip4_dst_hoplimit(&rt->dst);
 		udp_tunnel_xmit_skb(rt, ub->ubsock->sk, skb, src->ipv4.s_addr,
 				    dst->ipv4.s_addr, 0, ttl, 0, src->udp_port,
-- 
cgit v1.2.3-59-g8ed1b


From 2837f39c7cdbd209ab04d1c1f4eca015a40d5cd6 Mon Sep 17 00:00:00 2001
From: Richard Alpe <richard.alpe@ericsson.com>
Date: Thu, 3 Mar 2016 14:20:41 +0100
Subject: tipc: don't check link reset on non existing link

Make sure we have a link before checking if it has been reset or not.

Prior to this patch tipc_link_is_reset() could be called with a non
existing link, resulting in a null pointer dereference.

Signed-off-by: Richard Alpe <richard.alpe@ericsson.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Reviewed-by: Erik Hugne <erik.hugne@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/node.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/tipc/node.c b/net/tipc/node.c
index cdb79503d890..590d597589cf 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -843,7 +843,7 @@ void tipc_node_check_dest(struct net *net, u32 onode,
 	memcpy(&le->maddr, maddr, sizeof(*maddr));
 exit:
 	tipc_node_write_unlock(n);
-	if (reset && !tipc_link_is_reset(l))
+	if (reset && l && !tipc_link_is_reset(l))
 		tipc_node_link_down(n, b->identity, false);
 	tipc_node_put(n);
 }
-- 
cgit v1.2.3-59-g8ed1b


From ddb3712552c8807c75576fb4fbdbb16f0d48b161 Mon Sep 17 00:00:00 2001
From: Richard Alpe <richard.alpe@ericsson.com>
Date: Thu, 3 Mar 2016 14:20:42 +0100
Subject: tipc: safely copy UDP netlink data from user

The netlink policy for TIPC_NLA_UDP_LOCAL and TIPC_NLA_UDP_REMOTE
is of type binary with a defined length. This causes the policy
framework to threat the defined length as maximum length.

There is however no protection against a user sending a smaller
amount of data. Prior to this patch this wasn't handled which could
result in a partially incomplete sockaddr_storage struct containing
uninitialized data.

In this patch we use nla_memcpy() when copying the user data. This
ensures a potential gap at the end is cleared out properly.

This was found by Julia with Coccinelle tool.

Reported-by: Daniel Borkmann <daniel@iogearbox.net>
Reported-by: Julia Lawall <julia.lawall@lip6.fr>
Signed-off-by: Richard Alpe <richard.alpe@ericsson.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Reviewed-by: Erik Hugne <erik.hugne@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/udp_media.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index f22a5bb169c9..6fe8740a226f 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -276,7 +276,7 @@ static int parse_options(struct nlattr *attrs[], struct udp_bearer *ub,
 			 struct udp_media_addr *remote)
 {
 	struct nlattr *opts[TIPC_NLA_UDP_MAX + 1];
-	struct sockaddr_storage *sa_local, *sa_remote;
+	struct sockaddr_storage sa_local, sa_remote;
 
 	if (!attrs[TIPC_NLA_BEARER_UDP_OPTS])
 		goto err;
@@ -285,41 +285,43 @@ static int parse_options(struct nlattr *attrs[], struct udp_bearer *ub,
 			     tipc_nl_udp_policy))
 		goto err;
 	if (opts[TIPC_NLA_UDP_LOCAL] && opts[TIPC_NLA_UDP_REMOTE]) {
-		sa_local = nla_data(opts[TIPC_NLA_UDP_LOCAL]);
-		sa_remote = nla_data(opts[TIPC_NLA_UDP_REMOTE]);
+		nla_memcpy(&sa_local, opts[TIPC_NLA_UDP_LOCAL],
+			   sizeof(sa_local));
+		nla_memcpy(&sa_remote, opts[TIPC_NLA_UDP_REMOTE],
+			   sizeof(sa_remote));
 	} else {
 err:
 		pr_err("Invalid UDP bearer configuration");
 		return -EINVAL;
 	}
-	if ((sa_local->ss_family & sa_remote->ss_family) == AF_INET) {
+	if ((sa_local.ss_family & sa_remote.ss_family) == AF_INET) {
 		struct sockaddr_in *ip4;
 
-		ip4 = (struct sockaddr_in *)sa_local;
+		ip4 = (struct sockaddr_in *)&sa_local;
 		local->proto = htons(ETH_P_IP);
 		local->udp_port = ip4->sin_port;
 		local->ipv4.s_addr = ip4->sin_addr.s_addr;
 
-		ip4 = (struct sockaddr_in *)sa_remote;
+		ip4 = (struct sockaddr_in *)&sa_remote;
 		remote->proto = htons(ETH_P_IP);
 		remote->udp_port = ip4->sin_port;
 		remote->ipv4.s_addr = ip4->sin_addr.s_addr;
 		return 0;
 
 #if IS_ENABLED(CONFIG_IPV6)
-	} else if ((sa_local->ss_family & sa_remote->ss_family) == AF_INET6) {
+	} else if ((sa_local.ss_family & sa_remote.ss_family) == AF_INET6) {
 		struct sockaddr_in6 *ip6;
 
-		ip6 = (struct sockaddr_in6 *)sa_local;
+		ip6 = (struct sockaddr_in6 *)&sa_local;
 		local->proto = htons(ETH_P_IPV6);
 		local->udp_port = ip6->sin6_port;
-		local->ipv6 = ip6->sin6_addr;
+		memcpy(&local->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr));
 		ub->ifindex = ip6->sin6_scope_id;
 
-		ip6 = (struct sockaddr_in6 *)sa_remote;
+		ip6 = (struct sockaddr_in6 *)&sa_remote;
 		remote->proto = htons(ETH_P_IPV6);
 		remote->udp_port = ip6->sin6_port;
-		remote->ipv6 = ip6->sin6_addr;
+		memcpy(&remote->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr));
 		return 0;
 #endif
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 34f65dbb6c7572b826d836d62b99de32ecf2af48 Mon Sep 17 00:00:00 2001
From: Richard Alpe <richard.alpe@ericsson.com>
Date: Thu, 3 Mar 2016 14:20:43 +0100
Subject: tipc: make sure required IPv6 addresses are scoped

Make sure the user has provided a scope for multicast and link local
addresses used locally by a UDP bearer.

Signed-off-by: Richard Alpe <richard.alpe@ericsson.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Reviewed-by: Erik Hugne <erik.hugne@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/udp_media.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'net')

diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index 6fe8740a226f..fb2f7ec68eef 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -310,9 +310,14 @@ err:
 
 #if IS_ENABLED(CONFIG_IPV6)
 	} else if ((sa_local.ss_family & sa_remote.ss_family) == AF_INET6) {
+		int atype;
 		struct sockaddr_in6 *ip6;
 
 		ip6 = (struct sockaddr_in6 *)&sa_local;
+		atype = ipv6_addr_type(&ip6->sin6_addr);
+		if (__ipv6_addr_needs_scope_id(atype) && !ip6->sin6_scope_id)
+			return -EINVAL;
+
 		local->proto = htons(ETH_P_IPV6);
 		local->udp_port = ip6->sin6_port;
 		memcpy(&local->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr));
-- 
cgit v1.2.3-59-g8ed1b


From e74a386d70c793ef44678b44b021c99094b28c47 Mon Sep 17 00:00:00 2001
From: Jon Paul Maloy <jon.maloy@ericsson.com>
Date: Thu, 3 Mar 2016 14:23:21 -0500
Subject: tipc: remove pre-allocated message header in link struct

Until now, we have kept a pre-allocated protocol message header
aggregated into struct tipc_link. Apart from adding unnecessary
footprint to the link instances, this requires extra code both to
initialize and re-initialize it.

We now remove this sub-optimization. This change also makes it
possible to clean up the function tipc_build_proto_msg() and remove
a couple of small functions that were accessing the mentioned header.
In particular, we can replace all occurrences of the local function
call link_own_addr(link) with the generic tipc_own_addr(net).

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/bcast.c |   5 ---
 net/tipc/bcast.h |   1 -
 net/tipc/link.c  | 114 ++++++++++++++++++++++---------------------------------
 net/tipc/link.h  |   1 -
 net/tipc/net.c   |   1 -
 5 files changed, 45 insertions(+), 77 deletions(-)

(limited to 'net')

diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index e401108360a2..ae469b37d852 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -412,11 +412,6 @@ enomem:
 	return -ENOMEM;
 }
 
-void tipc_bcast_reinit(struct net *net)
-{
-	tipc_link_reinit(tipc_bc_sndlink(net), tipc_own_addr(net));
-}
-
 void tipc_bcast_stop(struct net *net)
 {
 	struct tipc_net *tn = net_generic(net, tipc_net_id);
diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h
index 1944c6c00bb9..d5e79b3767fd 100644
--- a/net/tipc/bcast.h
+++ b/net/tipc/bcast.h
@@ -46,7 +46,6 @@ struct tipc_node_map;
 extern const char tipc_bclink_name[];
 
 int tipc_bcast_init(struct net *net);
-void tipc_bcast_reinit(struct net *net);
 void tipc_bcast_stop(struct net *net);
 void tipc_bcast_add_peer(struct net *net, struct tipc_link *l,
 			 struct sk_buff_head *xmitq);
diff --git a/net/tipc/link.c b/net/tipc/link.c
index e31d92f80572..fc2fa833749e 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -1,7 +1,7 @@
 /*
  * net/tipc/link.c: TIPC link code
  *
- * Copyright (c) 1996-2007, 2012-2015, Ericsson AB
+ * Copyright (c) 1996-2007, 2012-2016, Ericsson AB
  * Copyright (c) 2004-2007, 2010-2013, Wind River Systems
  * All rights reserved.
  *
@@ -127,6 +127,7 @@ struct tipc_link {
 
 	/* Management and link supervision data */
 	u32 peer_session;
+	u32 session;
 	u32 peer_bearer_id;
 	u32 bearer_id;
 	u32 tolerance;
@@ -136,11 +137,7 @@ struct tipc_link {
 	u16 peer_caps;
 	bool active;
 	u32 silent_intv_cnt;
-	struct {
-		unchar hdr[INT_H_SIZE];
-		unchar body[TIPC_MAX_IF_NAME];
-	} proto_msg;
-	struct tipc_msg *pmsg;
+	char if_name[TIPC_MAX_IF_NAME];
 	u32 priority;
 	char net_plane;
 
@@ -215,10 +212,11 @@ enum {
  * Interval between NACKs when packets arrive out of order
  */
 #define TIPC_NACK_INTV (TIPC_MIN_LINK_WIN * 2)
-/*
- * Out-of-range value for link session numbers
+
+/* Wildcard value for link session numbers. When it is known that
+ * peer endpoint is down, any session number must be accepted.
  */
-#define WILDCARD_SESSION 0x10000
+#define ANY_SESSION 0x10000
 
 /* Link FSM states:
  */
@@ -398,16 +396,6 @@ char *tipc_link_name(struct tipc_link *l)
 	return l->name;
 }
 
-static u32 link_own_addr(struct tipc_link *l)
-{
-	return msg_prevnode(l->pmsg);
-}
-
-void tipc_link_reinit(struct tipc_link *l, u32 addr)
-{
-	msg_set_prevnode(l->pmsg, addr);
-}
-
 /**
  * tipc_link_create - create a new link
  * @n: pointer to associated node
@@ -441,29 +429,22 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id,
 		      struct tipc_link **link)
 {
 	struct tipc_link *l;
-	struct tipc_msg *hdr;
 
 	l = kzalloc(sizeof(*l), GFP_ATOMIC);
 	if (!l)
 		return false;
 	*link = l;
-	l->pmsg = (struct tipc_msg *)&l->proto_msg;
-	hdr = l->pmsg;
-	tipc_msg_init(ownnode, hdr, LINK_PROTOCOL, RESET_MSG, INT_H_SIZE, peer);
-	msg_set_size(hdr, sizeof(l->proto_msg));
-	msg_set_session(hdr, session);
-	msg_set_bearer_id(hdr, l->bearer_id);
+	l->session = session;
 
 	/* Note: peer i/f name is completed by reset/activate message */
 	sprintf(l->name, "%u.%u.%u:%s-%u.%u.%u:unknown",
 		tipc_zone(ownnode), tipc_cluster(ownnode), tipc_node(ownnode),
 		if_name, tipc_zone(peer), tipc_cluster(peer), tipc_node(peer));
-	strcpy((char *)msg_data(hdr), if_name);
-
+	strcpy(l->if_name, if_name);
 	l->addr = peer;
 	l->peer_caps = peer_caps;
 	l->net = net;
-	l->peer_session = WILDCARD_SESSION;
+	l->peer_session = ANY_SESSION;
 	l->bearer_id = bearer_id;
 	l->tolerance = tolerance;
 	l->net_plane = net_plane;
@@ -790,7 +771,7 @@ static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list)
 	struct tipc_msg *msg = buf_msg(skb_peek(list));
 	int imp = msg_importance(msg);
 	u32 oport = msg_origport(msg);
-	u32 addr = link_own_addr(link);
+	u32 addr = tipc_own_addr(link->net);
 	struct sk_buff *skb;
 
 	/* This really cannot happen...  */
@@ -839,16 +820,9 @@ void link_prepare_wakeup(struct tipc_link *l)
 
 void tipc_link_reset(struct tipc_link *l)
 {
-	/* Link is down, accept any session */
-	l->peer_session = WILDCARD_SESSION;
-
-	/* If peer is up, it only accepts an incremented session number */
-	msg_set_session(l->pmsg, msg_session(l->pmsg) + 1);
-
-	/* Prepare for renewed mtu size negotiation */
+	l->peer_session = ANY_SESSION;
+	l->session++;
 	l->mtu = l->advertised_mtu;
-
-	/* Clean up all queues and counters: */
 	__skb_queue_purge(&l->transmq);
 	__skb_queue_purge(&l->deferdq);
 	skb_queue_splice_init(&l->wakeupq, l->inputq);
@@ -1156,7 +1130,7 @@ int tipc_link_build_ack_msg(struct tipc_link *l, struct sk_buff_head *xmitq)
 
 	/* Broadcast ACK must be sent via a unicast link => defer to caller */
 	if (link_is_bc_rcvlink(l)) {
-		if (((l->rcv_nxt ^ link_own_addr(l)) & 0xf) != 0xf)
+		if (((l->rcv_nxt ^ tipc_own_addr(l->net)) & 0xf) != 0xf)
 			return 0;
 		l->rcv_unacked = 0;
 		return TIPC_LINK_SND_BC_ACK;
@@ -1268,15 +1242,30 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
 				      u16 rcvgap, int tolerance, int priority,
 				      struct sk_buff_head *xmitq)
 {
-	struct sk_buff *skb = NULL;
-	struct tipc_msg *hdr = l->pmsg;
+	struct sk_buff *skb;
+	struct tipc_msg *hdr;
+	struct sk_buff_head *dfq = &l->deferdq;
 	bool node_up = link_is_up(l->bc_rcvlink);
 
 	/* Don't send protocol message during reset or link failover */
 	if (tipc_link_is_blocked(l))
 		return;
 
-	msg_set_type(hdr, mtyp);
+	if (!tipc_link_is_up(l) && (mtyp == STATE_MSG))
+		return;
+
+	if (!skb_queue_empty(dfq))
+		rcvgap = buf_seqno(skb_peek(dfq)) - l->rcv_nxt;
+
+	skb = tipc_msg_create(LINK_PROTOCOL, mtyp, INT_H_SIZE,
+			      TIPC_MAX_IF_NAME, l->addr,
+			      tipc_own_addr(l->net), 0, 0, 0);
+	if (!skb)
+		return;
+
+	hdr = buf_msg(skb);
+	msg_set_session(hdr, l->session);
+	msg_set_bearer_id(hdr, l->bearer_id);
 	msg_set_net_plane(hdr, l->net_plane);
 	msg_set_next_sent(hdr, l->snd_nxt);
 	msg_set_ack(hdr, l->rcv_nxt - 1);
@@ -1286,36 +1275,23 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
 	msg_set_linkprio(hdr, priority);
 	msg_set_redundant_link(hdr, node_up);
 	msg_set_seq_gap(hdr, 0);
-
-	/* Compatibility: created msg must not be in sequence with pkt flow */
 	msg_set_seqno(hdr, l->snd_nxt + U16_MAX / 2);
 
 	if (mtyp == STATE_MSG) {
-		if (!tipc_link_is_up(l))
-			return;
-
-		/* Override rcvgap if there are packets in deferred queue */
-		if (!skb_queue_empty(&l->deferdq))
-			rcvgap = buf_seqno(skb_peek(&l->deferdq)) - l->rcv_nxt;
-		if (rcvgap) {
-			msg_set_seq_gap(hdr, rcvgap);
-			l->stats.sent_nacks++;
-		}
+		msg_set_seq_gap(hdr, rcvgap);
+		msg_set_size(hdr, INT_H_SIZE);
 		msg_set_probe(hdr, probe);
-		if (probe)
-			l->stats.sent_probes++;
 		l->stats.sent_states++;
 		l->rcv_unacked = 0;
 	} else {
 		/* RESET_MSG or ACTIVATE_MSG */
 		msg_set_max_pkt(hdr, l->advertised_mtu);
-		msg_set_ack(hdr, l->rcv_nxt - 1);
-		msg_set_next_sent(hdr, 1);
+		strcpy(msg_data(hdr), l->if_name);
 	}
-	skb = tipc_buf_acquire(msg_size(hdr));
-	if (!skb)
-		return;
-	skb_copy_to_linear_data(skb, hdr, msg_size(hdr));
+	if (probe)
+		l->stats.sent_probes++;
+	if (rcvgap)
+		l->stats.sent_nacks++;
 	skb->priority = TC_PRIO_CONTROL;
 	__skb_queue_tail(xmitq, skb);
 }
@@ -1340,7 +1316,7 @@ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl,
 
 	/* At least one packet required for safe algorithm => add dummy */
 	skb = tipc_msg_create(TIPC_LOW_IMPORTANCE, TIPC_DIRECT_MSG,
-			      BASIC_H_SIZE, 0, l->addr, link_own_addr(l),
+			      BASIC_H_SIZE, 0, l->addr, tipc_own_addr(l->net),
 			      0, 0, TIPC_ERR_NO_PORT);
 	if (!skb) {
 		pr_warn("%sunable to create tunnel packet\n", link_co_err);
@@ -1351,7 +1327,7 @@ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl,
 	__skb_queue_purge(&tmpxq);
 
 	/* Initialize reusable tunnel packet header */
-	tipc_msg_init(link_own_addr(l), &tnlhdr, TUNNEL_PROTOCOL,
+	tipc_msg_init(tipc_own_addr(l->net), &tnlhdr, TUNNEL_PROTOCOL,
 		      mtyp, INT_H_SIZE, l->addr);
 	pktcnt = skb_queue_len(&l->transmq) + skb_queue_len(&l->backlogq);
 	msg_set_msgcnt(&tnlhdr, pktcnt);
@@ -1410,7 +1386,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
 	if (tipc_link_is_blocked(l) || !xmitq)
 		goto exit;
 
-	if (link_own_addr(l) > msg_prevnode(hdr))
+	if (tipc_own_addr(l->net) > msg_prevnode(hdr))
 		l->net_plane = msg_net_plane(hdr);
 
 	switch (mtyp) {
@@ -1418,7 +1394,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
 
 		/* Ignore duplicate RESET with old session number */
 		if ((less_eq(msg_session(hdr), l->peer_session)) &&
-		    (l->peer_session != WILDCARD_SESSION))
+		    (l->peer_session != ANY_SESSION))
 			break;
 		/* fall thru' */
 
@@ -1515,7 +1491,7 @@ static bool tipc_link_build_bc_proto_msg(struct tipc_link *l, bool bcast,
 	u16 gap_to = peers_snd_nxt - 1;
 
 	skb = tipc_msg_create(BCAST_PROTOCOL, STATE_MSG, INT_H_SIZE,
-			      0, l->addr, link_own_addr(l), 0, 0, 0);
+			      0, l->addr, tipc_own_addr(l->net), 0, 0, 0);
 	if (!skb)
 		return false;
 	hdr = buf_msg(skb);
@@ -1670,7 +1646,7 @@ int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb,
 	if (mtyp != STATE_MSG)
 		return 0;
 
-	if (dnode == link_own_addr(l)) {
+	if (dnode == tipc_own_addr(l->net)) {
 		tipc_link_bc_ack_rcv(l, acked, xmitq);
 		rc = tipc_link_retrans(l->bc_sndlink, from, to, xmitq);
 		l->stats.recv_nacks++;
diff --git a/net/tipc/link.h b/net/tipc/link.h
index b4ee9d6e181d..6a94175ee20a 100644
--- a/net/tipc/link.h
+++ b/net/tipc/link.h
@@ -86,7 +86,6 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer,
 			 struct sk_buff_head *namedq,
 			 struct tipc_link *bc_sndlink,
 			 struct tipc_link **link);
-void tipc_link_reinit(struct tipc_link *l, u32 addr);
 void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl,
 			   int mtyp, struct sk_buff_head *xmitq);
 void tipc_link_build_reset_msg(struct tipc_link *l, struct sk_buff_head *xmitq);
diff --git a/net/tipc/net.c b/net/tipc/net.c
index 77bf9113c7a7..86d68b352bd6 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -116,7 +116,6 @@ int tipc_net_start(struct net *net, u32 addr)
 	tn->own_addr = addr;
 	tipc_named_reinit(net);
 	tipc_sk_reinit(net);
-	tipc_bcast_reinit(net);
 
 	tipc_nametbl_publish(net, TIPC_CFG_SRV, tn->own_addr, tn->own_addr,
 			     TIPC_ZONE_SCOPE, 0, tn->own_addr);
-- 
cgit v1.2.3-59-g8ed1b


From 795f3512ca952965dd2546ceed3d9636abcb7920 Mon Sep 17 00:00:00 2001
From: Wei Tang <tangwei@cmss.chinamobile.com>
Date: Mon, 7 Mar 2016 14:25:10 +0800
Subject: netlink: do not initialise statics to 0 or NULL

This patch fixes the checkpatch.pl error to netlabel_unlabeled.c:

ERROR: do not initialise statics to 0 or NULL

Signed-off-by: Wei Tang <tangwei@cmss.chinamobile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netlabel/netlabel_unlabeled.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index b0380927f05f..9eaa9a1e8629 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -116,11 +116,11 @@ struct netlbl_unlhsh_walk_arg {
 static DEFINE_SPINLOCK(netlbl_unlhsh_lock);
 #define netlbl_unlhsh_rcu_deref(p) \
 	rcu_dereference_check(p, lockdep_is_held(&netlbl_unlhsh_lock))
-static struct netlbl_unlhsh_tbl *netlbl_unlhsh = NULL;
-static struct netlbl_unlhsh_iface *netlbl_unlhsh_def = NULL;
+static struct netlbl_unlhsh_tbl *netlbl_unlhsh;
+static struct netlbl_unlhsh_iface *netlbl_unlhsh_def;
 
 /* Accept unlabeled packets flag */
-static u8 netlabel_unlabel_acceptflg = 0;
+static u8 netlabel_unlabel_acceptflg;
 
 /* NetLabel Generic NETLINK unlabeled family */
 static struct genl_family netlbl_unlabel_gnl_family = {
-- 
cgit v1.2.3-59-g8ed1b


From 8303394d811ad4e1245b2d72e15c39880ec3d0ff Mon Sep 17 00:00:00 2001
From: Wei Tang <tangwei@cmss.chinamobile.com>
Date: Mon, 7 Mar 2016 14:29:53 +0800
Subject: netlabel: do not initialise statics to NULL

This patch fixes the checkpatch.pl error to netlabel_domainhash.c:

ERROR: do not initialise statics to NULL

Signed-off-by: Wei Tang <tangwei@cmss.chinamobile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netlabel/netlabel_domainhash.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c
index f0cb92f3ddaf..ada67422234b 100644
--- a/net/netlabel/netlabel_domainhash.c
+++ b/net/netlabel/netlabel_domainhash.c
@@ -55,8 +55,8 @@ struct netlbl_domhsh_tbl {
 static DEFINE_SPINLOCK(netlbl_domhsh_lock);
 #define netlbl_domhsh_rcu_deref(p) \
 	rcu_dereference_check(p, lockdep_is_held(&netlbl_domhsh_lock))
-static struct netlbl_domhsh_tbl *netlbl_domhsh = NULL;
-static struct netlbl_dom_map *netlbl_domhsh_def = NULL;
+static struct netlbl_domhsh_tbl *netlbl_domhsh;
+static struct netlbl_dom_map *netlbl_domhsh_def;
 
 /*
  * Domain Hash Table Helper Functions
-- 
cgit v1.2.3-59-g8ed1b


From 8dfd329fbc240729938d24bf87aca49ea89289c5 Mon Sep 17 00:00:00 2001
From: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
Date: Fri, 4 Mar 2016 14:07:54 +0000
Subject: arp: correct return value of arp_rcv

Currently, arp_rcv() always return zero on a packet delivery upcall.

To make its behavior more compliant with the way this API should be
used, this patch changes this to let it return NET_RX_SUCCESS when the
packet is proper handled, and NET_RX_DROP otherwise.

v1->v2:
If sanity check is failed, call kfree_skb() instead of consume_skb(), then
return the correct return value.

Signed-off-by: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/arp.c | 35 ++++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index c102eb5ac55c..c34c7544d1db 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -665,7 +665,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
 	 */
 
 	if (!in_dev)
-		goto out;
+		goto out_free_skb;
 
 	arp = arp_hdr(skb);
 
@@ -673,7 +673,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
 	default:
 		if (arp->ar_pro != htons(ETH_P_IP) ||
 		    htons(dev_type) != arp->ar_hrd)
-			goto out;
+			goto out_free_skb;
 		break;
 	case ARPHRD_ETHER:
 	case ARPHRD_FDDI:
@@ -690,17 +690,17 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
 		if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&
 		     arp->ar_hrd != htons(ARPHRD_IEEE802)) ||
 		    arp->ar_pro != htons(ETH_P_IP))
-			goto out;
+			goto out_free_skb;
 		break;
 	case ARPHRD_AX25:
 		if (arp->ar_pro != htons(AX25_P_IP) ||
 		    arp->ar_hrd != htons(ARPHRD_AX25))
-			goto out;
+			goto out_free_skb;
 		break;
 	case ARPHRD_NETROM:
 		if (arp->ar_pro != htons(AX25_P_IP) ||
 		    arp->ar_hrd != htons(ARPHRD_NETROM))
-			goto out;
+			goto out_free_skb;
 		break;
 	}
 
@@ -708,7 +708,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
 
 	if (arp->ar_op != htons(ARPOP_REPLY) &&
 	    arp->ar_op != htons(ARPOP_REQUEST))
-		goto out;
+		goto out_free_skb;
 
 /*
  *	Extract fields
@@ -733,7 +733,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
  */
 	if (ipv4_is_multicast(tip) ||
 	    (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip)))
-		goto out;
+		goto out_free_skb;
 
  /*
   *	For some 802.11 wireless deployments (and possibly other networks),
@@ -741,7 +741,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
   *	and thus should not be accepted.
   */
 	if (sip == tip && IN_DEV_ORCONF(in_dev, DROP_GRATUITOUS_ARP))
-		goto out;
+		goto out_free_skb;
 
 /*
  *     Special case: We must set Frame Relay source Q.922 address
@@ -778,7 +778,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
 		    !arp_ignore(in_dev, sip, tip))
 			arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip,
 				     sha, dev->dev_addr, sha, reply_dst);
-		goto out;
+		goto out_consume_skb;
 	}
 
 	if (arp->ar_op == htons(ARPOP_REQUEST) &&
@@ -803,7 +803,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
 					neigh_release(n);
 				}
 			}
-			goto out;
+			goto out_consume_skb;
 		} else if (IN_DEV_FORWARD(in_dev)) {
 			if (addr_type == RTN_UNICAST  &&
 			    (arp_fwd_proxy(in_dev, dev, rt) ||
@@ -826,7 +826,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
 						       in_dev->arp_parms, skb);
 					goto out_free_dst;
 				}
-				goto out;
+				goto out_consume_skb;
 			}
 		}
 	}
@@ -876,11 +876,16 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
 		neigh_release(n);
 	}
 
-out:
+out_consume_skb:
 	consume_skb(skb);
+
 out_free_dst:
 	dst_release(reply_dst);
-	return 0;
+	return NET_RX_SUCCESS;
+
+out_free_skb:
+	kfree_skb(skb);
+	return NET_RX_DROP;
 }
 
 static void parp_redo(struct sk_buff *skb)
@@ -924,11 +929,11 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
 
 consumeskb:
 	consume_skb(skb);
-	return 0;
+	return NET_RX_SUCCESS;
 freeskb:
 	kfree_skb(skb);
 out_of_mem:
-	return 0;
+	return NET_RX_DROP;
 }
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From 49cc66eaee19e772997b63b057ea4b4bf7d48db0 Mon Sep 17 00:00:00 2001
From: Richard Alpe <richard.alpe@ericsson.com>
Date: Fri, 4 Mar 2016 17:04:42 +0100
Subject: tipc: move netlink policies to netlink.c

Make the c files less cluttered and enable netlink attributes to be
shared between files.

Signed-off-by: Richard Alpe <richard.alpe@ericsson.com>
Reviewed-by: Jon Maloy <jon.maloy@ericsson.com>
Acked-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/bearer.c     | 18 +-------------
 net/tipc/link.c       |  8 ------
 net/tipc/name_table.c |  6 -----
 net/tipc/net.c        |  6 +----
 net/tipc/netlink.c    | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++
 net/tipc/netlink.h    | 11 ++++++++
 net/tipc/node.c       | 23 +----------------
 net/tipc/socket.c     |  9 +------
 net/tipc/udp_media.c  |  9 +------
 9 files changed, 85 insertions(+), 74 deletions(-)

(limited to 'net')

diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 802ffad3200d..27a5406213c6 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -40,6 +40,7 @@
 #include "link.h"
 #include "discover.h"
 #include "bcast.h"
+#include "netlink.h"
 
 #define MAX_ADDR_STR 60
 
@@ -54,23 +55,6 @@ static struct tipc_media * const media_info_array[] = {
 	NULL
 };
 
-static const struct nla_policy
-tipc_nl_bearer_policy[TIPC_NLA_BEARER_MAX + 1]	= {
-	[TIPC_NLA_BEARER_UNSPEC]		= { .type = NLA_UNSPEC },
-	[TIPC_NLA_BEARER_NAME] = {
-		.type = NLA_STRING,
-		.len = TIPC_MAX_BEARER_NAME
-	},
-	[TIPC_NLA_BEARER_PROP]			= { .type = NLA_NESTED },
-	[TIPC_NLA_BEARER_DOMAIN]		= { .type = NLA_U32 }
-};
-
-static const struct nla_policy tipc_nl_media_policy[TIPC_NLA_MEDIA_MAX + 1] = {
-	[TIPC_NLA_MEDIA_UNSPEC]		= { .type = NLA_UNSPEC },
-	[TIPC_NLA_MEDIA_NAME]		= { .type = NLA_STRING },
-	[TIPC_NLA_MEDIA_PROP]		= { .type = NLA_NESTED }
-};
-
 static void bearer_disable(struct net *net, struct tipc_bearer *b);
 
 /**
diff --git a/net/tipc/link.c b/net/tipc/link.c
index fc2fa833749e..7d2bb3e70baa 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -192,14 +192,6 @@ struct tipc_link {
 static const char *link_co_err = "Link tunneling error, ";
 static const char *link_rst_msg = "Resetting link ";
 
-/* Properties valid for media, bearar and link */
-static const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = {
-	[TIPC_NLA_PROP_UNSPEC]		= { .type = NLA_UNSPEC },
-	[TIPC_NLA_PROP_PRIO]		= { .type = NLA_U32 },
-	[TIPC_NLA_PROP_TOL]		= { .type = NLA_U32 },
-	[TIPC_NLA_PROP_WIN]		= { .type = NLA_U32 }
-};
-
 /* Send states for broadcast NACKs
  */
 enum {
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index 777b979b8463..e190460fe0d3 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -47,12 +47,6 @@
 
 #define TIPC_NAMETBL_SIZE 1024		/* must be a power of 2 */
 
-static const struct nla_policy
-tipc_nl_name_table_policy[TIPC_NLA_NAME_TABLE_MAX + 1] = {
-	[TIPC_NLA_NAME_TABLE_UNSPEC]	= { .type = NLA_UNSPEC },
-	[TIPC_NLA_NAME_TABLE_PUBL]	= { .type = NLA_NESTED }
-};
-
 /**
  * struct name_info - name sequence publication info
  * @node_list: circular list of publications made by own node
diff --git a/net/tipc/net.c b/net/tipc/net.c
index 86d68b352bd6..28bf4feeb81c 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -41,11 +41,7 @@
 #include "socket.h"
 #include "node.h"
 #include "bcast.h"
-
-static const struct nla_policy tipc_nl_net_policy[TIPC_NLA_NET_MAX + 1] = {
-	[TIPC_NLA_NET_UNSPEC]	= { .type = NLA_UNSPEC },
-	[TIPC_NLA_NET_ID]	= { .type = NLA_U32 }
-};
+#include "netlink.h"
 
 /*
  * The TIPC locking policy is designed to ensure a very fine locking
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
index 8975b0135b76..56935df2167a 100644
--- a/net/tipc/netlink.c
+++ b/net/tipc/netlink.c
@@ -55,6 +55,75 @@ static const struct nla_policy tipc_nl_policy[TIPC_NLA_MAX + 1] = {
 	[TIPC_NLA_NAME_TABLE]	= { .type = NLA_NESTED, }
 };
 
+const struct nla_policy
+tipc_nl_name_table_policy[TIPC_NLA_NAME_TABLE_MAX + 1] = {
+	[TIPC_NLA_NAME_TABLE_UNSPEC]	= { .type = NLA_UNSPEC },
+	[TIPC_NLA_NAME_TABLE_PUBL]	= { .type = NLA_NESTED }
+};
+
+const struct nla_policy tipc_nl_sock_policy[TIPC_NLA_SOCK_MAX + 1] = {
+	[TIPC_NLA_SOCK_UNSPEC]		= { .type = NLA_UNSPEC },
+	[TIPC_NLA_SOCK_ADDR]		= { .type = NLA_U32 },
+	[TIPC_NLA_SOCK_REF]		= { .type = NLA_U32 },
+	[TIPC_NLA_SOCK_CON]		= { .type = NLA_NESTED },
+	[TIPC_NLA_SOCK_HAS_PUBL]	= { .type = NLA_FLAG }
+};
+
+const struct nla_policy tipc_nl_net_policy[TIPC_NLA_NET_MAX + 1] = {
+	[TIPC_NLA_NET_UNSPEC]		= { .type = NLA_UNSPEC },
+	[TIPC_NLA_NET_ID]		= { .type = NLA_U32 }
+};
+
+const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = {
+	[TIPC_NLA_LINK_UNSPEC]		= { .type = NLA_UNSPEC },
+	[TIPC_NLA_LINK_NAME]		= { .type = NLA_STRING,
+					    .len = TIPC_MAX_LINK_NAME },
+	[TIPC_NLA_LINK_MTU]		= { .type = NLA_U32 },
+	[TIPC_NLA_LINK_BROADCAST]	= { .type = NLA_FLAG },
+	[TIPC_NLA_LINK_UP]		= { .type = NLA_FLAG },
+	[TIPC_NLA_LINK_ACTIVE]		= { .type = NLA_FLAG },
+	[TIPC_NLA_LINK_PROP]		= { .type = NLA_NESTED },
+	[TIPC_NLA_LINK_STATS]		= { .type = NLA_NESTED },
+	[TIPC_NLA_LINK_RX]		= { .type = NLA_U32 },
+	[TIPC_NLA_LINK_TX]		= { .type = NLA_U32 }
+};
+
+const struct nla_policy tipc_nl_node_policy[TIPC_NLA_NODE_MAX + 1] = {
+	[TIPC_NLA_NODE_UNSPEC]		= { .type = NLA_UNSPEC },
+	[TIPC_NLA_NODE_ADDR]		= { .type = NLA_U32 },
+	[TIPC_NLA_NODE_UP]		= { .type = NLA_FLAG }
+};
+
+/* Properties valid for media, bearer and link */
+const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = {
+	[TIPC_NLA_PROP_UNSPEC]		= { .type = NLA_UNSPEC },
+	[TIPC_NLA_PROP_PRIO]		= { .type = NLA_U32 },
+	[TIPC_NLA_PROP_TOL]		= { .type = NLA_U32 },
+	[TIPC_NLA_PROP_WIN]		= { .type = NLA_U32 }
+};
+
+const struct nla_policy tipc_nl_bearer_policy[TIPC_NLA_BEARER_MAX + 1]	= {
+	[TIPC_NLA_BEARER_UNSPEC]	= { .type = NLA_UNSPEC },
+	[TIPC_NLA_BEARER_NAME]		= { .type = NLA_STRING,
+					    .len = TIPC_MAX_BEARER_NAME },
+	[TIPC_NLA_BEARER_PROP]		= { .type = NLA_NESTED },
+	[TIPC_NLA_BEARER_DOMAIN]	= { .type = NLA_U32 }
+};
+
+const struct nla_policy tipc_nl_media_policy[TIPC_NLA_MEDIA_MAX + 1] = {
+	[TIPC_NLA_MEDIA_UNSPEC]		= { .type = NLA_UNSPEC },
+	[TIPC_NLA_MEDIA_NAME]		= { .type = NLA_STRING },
+	[TIPC_NLA_MEDIA_PROP]		= { .type = NLA_NESTED }
+};
+
+const struct nla_policy tipc_nl_udp_policy[TIPC_NLA_UDP_MAX + 1] = {
+	[TIPC_NLA_UDP_UNSPEC]	= {.type = NLA_UNSPEC},
+	[TIPC_NLA_UDP_LOCAL]	= {.type = NLA_BINARY,
+				   .len = sizeof(struct sockaddr_storage)},
+	[TIPC_NLA_UDP_REMOTE]	= {.type = NLA_BINARY,
+				   .len = sizeof(struct sockaddr_storage)},
+};
+
 /* Users of the legacy API (tipc-config) can't handle that we add operations,
  * so we have a separate genl handling for the new API.
  */
diff --git a/net/tipc/netlink.h b/net/tipc/netlink.h
index 08a1db67b927..ed1dbcb4afbd 100644
--- a/net/tipc/netlink.h
+++ b/net/tipc/netlink.h
@@ -35,6 +35,7 @@
 
 #ifndef _TIPC_NETLINK_H
 #define _TIPC_NETLINK_H
+#include <net/netlink.h>
 
 extern struct genl_family tipc_genl_family;
 int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***buf);
@@ -45,6 +46,16 @@ struct tipc_nl_msg {
 	u32 seq;
 };
 
+extern const struct nla_policy tipc_nl_name_table_policy[];
+extern const struct nla_policy tipc_nl_sock_policy[];
+extern const struct nla_policy tipc_nl_net_policy[];
+extern const struct nla_policy tipc_nl_link_policy[];
+extern const struct nla_policy tipc_nl_node_policy[];
+extern const struct nla_policy tipc_nl_prop_policy[];
+extern const struct nla_policy tipc_nl_bearer_policy[];
+extern const struct nla_policy tipc_nl_media_policy[];
+extern const struct nla_policy tipc_nl_udp_policy[];
+
 int tipc_netlink_start(void);
 int tipc_netlink_compat_start(void);
 void tipc_netlink_stop(void);
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 590d597589cf..ace178fd3850 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -41,6 +41,7 @@
 #include "socket.h"
 #include "bcast.h"
 #include "discover.h"
+#include "netlink.h"
 
 #define INVALID_NODE_SIG	0x10000
 
@@ -164,28 +165,6 @@ struct tipc_sock_conn {
 	struct list_head list;
 };
 
-static const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = {
-	[TIPC_NLA_LINK_UNSPEC]		= { .type = NLA_UNSPEC },
-	[TIPC_NLA_LINK_NAME] = {
-		.type = NLA_STRING,
-		.len = TIPC_MAX_LINK_NAME
-	},
-	[TIPC_NLA_LINK_MTU]		= { .type = NLA_U32 },
-	[TIPC_NLA_LINK_BROADCAST]	= { .type = NLA_FLAG },
-	[TIPC_NLA_LINK_UP]		= { .type = NLA_FLAG },
-	[TIPC_NLA_LINK_ACTIVE]		= { .type = NLA_FLAG },
-	[TIPC_NLA_LINK_PROP]		= { .type = NLA_NESTED },
-	[TIPC_NLA_LINK_STATS]		= { .type = NLA_NESTED },
-	[TIPC_NLA_LINK_RX]		= { .type = NLA_U32 },
-	[TIPC_NLA_LINK_TX]		= { .type = NLA_U32 }
-};
-
-static const struct nla_policy tipc_nl_node_policy[TIPC_NLA_NODE_MAX + 1] = {
-	[TIPC_NLA_NODE_UNSPEC]		= { .type = NLA_UNSPEC },
-	[TIPC_NLA_NODE_ADDR]		= { .type = NLA_U32 },
-	[TIPC_NLA_NODE_UP]		= { .type = NLA_FLAG }
-};
-
 static struct tipc_link *node_active_link(struct tipc_node *n, int sel)
 {
 	int bearer_id = n->active_links[sel & 1];
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 69c29050f14a..56b8a96c2257 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -42,6 +42,7 @@
 #include "name_distr.h"
 #include "socket.h"
 #include "bcast.h"
+#include "netlink.h"
 
 #define SS_LISTENING		-1	/* socket is listening */
 #define SS_READY		-2	/* socket is connectionless */
@@ -126,14 +127,6 @@ static const struct proto_ops stream_ops;
 static const struct proto_ops msg_ops;
 static struct proto tipc_proto;
 
-static const struct nla_policy tipc_nl_sock_policy[TIPC_NLA_SOCK_MAX + 1] = {
-	[TIPC_NLA_SOCK_UNSPEC]		= { .type = NLA_UNSPEC },
-	[TIPC_NLA_SOCK_ADDR]		= { .type = NLA_U32 },
-	[TIPC_NLA_SOCK_REF]		= { .type = NLA_U32 },
-	[TIPC_NLA_SOCK_CON]		= { .type = NLA_NESTED },
-	[TIPC_NLA_SOCK_HAS_PUBL]	= { .type = NLA_FLAG }
-};
-
 static const struct rhashtable_params tsk_rht_params;
 
 /*
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index fb2f7ec68eef..49b3c2ede7ab 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -48,20 +48,13 @@
 #include <linux/tipc_netlink.h>
 #include "core.h"
 #include "bearer.h"
+#include "netlink.h"
 
 /* IANA assigned UDP port */
 #define UDP_PORT_DEFAULT	6118
 
 #define UDP_MIN_HEADROOM        28
 
-static const struct nla_policy tipc_nl_udp_policy[TIPC_NLA_UDP_MAX + 1] = {
-	[TIPC_NLA_UDP_UNSPEC]	= {.type = NLA_UNSPEC},
-	[TIPC_NLA_UDP_LOCAL]	= {.type = NLA_BINARY,
-				   .len = sizeof(struct sockaddr_storage)},
-	[TIPC_NLA_UDP_REMOTE]	= {.type = NLA_BINARY,
-				   .len = sizeof(struct sockaddr_storage)},
-};
-
 /**
  * struct udp_media_addr - IP/UDP addressing information
  *
-- 
cgit v1.2.3-59-g8ed1b


From 8050c0f0274a15841756968857cfb07b3ab809ae Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 Mar 2016 15:15:02 +0100
Subject: bpf: allow bpf_csum_diff to feed bpf_l3_csum_replace as well

Commit 7d672345ed29 ("bpf: add generic bpf_csum_diff helper") added a
generic checksum diff helper that can feed bpf_l4_csum_replace() with
a target __wsum diff that is to be applied to the L4 checksum. This
facility is very flexible, can be cascaded, allows for adding, removing,
or diffing data, or for calculating the pseudo header checksum from
scratch, but it can also be reused for working with the IPv4 header
checksum.

Thus, analogous to bpf_l4_csum_replace(), add a case for header field
value of 0 to change the checksum at a given offset through a new helper
csum_replace_by_diff(). Also, in addition to that, this provides an
easy to use interface for feeding precalculated diffs f.e. coming from
a map. It nicely complements bpf_l3_csum_replace() that currently allows
only for csum updates of 2 and 4 byte diffs.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/checksum.h | 5 +++++
 net/core/filter.c      | 6 ++++++
 2 files changed, 11 insertions(+)

(limited to 'net')

diff --git a/include/net/checksum.h b/include/net/checksum.h
index 10a16b5bd1c7..abffc64e7300 100644
--- a/include/net/checksum.h
+++ b/include/net/checksum.h
@@ -120,6 +120,11 @@ static inline __wsum csum_partial_ext(const void *buff, int len, __wsum sum)
 
 #define CSUM_MANGLED_0 ((__force __sum16)0xffff)
 
+static inline void csum_replace_by_diff(__sum16 *sum, __wsum diff)
+{
+	*sum = csum_fold(csum_add(diff, ~csum_unfold(*sum)));
+}
+
 static inline void csum_replace4(__sum16 *sum, __be32 from, __be32 to)
 {
 	__wsum tmp = csum_sub(~csum_unfold(*sum), (__force __wsum)from);
diff --git a/net/core/filter.c b/net/core/filter.c
index 69f4ffc0a282..356a251657a5 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1447,6 +1447,12 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
 		return -EFAULT;
 
 	switch (flags & BPF_F_HDR_FIELD_MASK) {
+	case 0:
+		if (unlikely(from != 0))
+			return -EINVAL;
+
+		csum_replace_by_diff(ptr, to);
+		break;
 	case 2:
 		csum_replace2(ptr, from, to);
 		break;
-- 
cgit v1.2.3-59-g8ed1b


From 8afd54c87ad7089734ef0527937a256586ba828a Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 Mar 2016 15:15:03 +0100
Subject: bpf: add flags to bpf_skb_store_bytes for clearing hash

When overwriting parts of the packet with bpf_skb_store_bytes() that
were fed previously into skb->hash calculation, we should clear the
current hash with skb_clear_hash(), so that a next skb_get_hash() call
can determine the correct hash related to this skb.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 1 +
 net/core/filter.c        | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ee2193287cbe..2e3e90309904 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -305,6 +305,7 @@ enum bpf_func_id {
 
 /* BPF_FUNC_skb_store_bytes flags. */
 #define BPF_F_RECOMPUTE_CSUM		(1ULL << 0)
+#define BPF_F_INVALIDATE_HASH		(1ULL << 1)
 
 /* BPF_FUNC_l3_csum_replace and BPF_FUNC_l4_csum_replace flags.
  * First 4 bits are for passing the header field size.
diff --git a/net/core/filter.c b/net/core/filter.c
index 356a251657a5..a1fe246a6147 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1353,7 +1353,7 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
 	unsigned int len = (unsigned int) r4;
 	void *ptr;
 
-	if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM)))
+	if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
 		return -EINVAL;
 
 	/* bpf verifier guarantees that:
@@ -1384,6 +1384,8 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
 
 	if (flags & BPF_F_RECOMPUTE_CSUM)
 		skb_postpush_rcsum(skb, ptr, len);
+	if (flags & BPF_F_INVALIDATE_HASH)
+		skb_clear_hash(skb);
 
 	return 0;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 577c50aade0f34926e4a47f61629739e6da91af6 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 Mar 2016 15:15:04 +0100
Subject: bpf: make helper function protos static

They are only used here, so there's no reason they should not be static.
Only the vlan push/pop protos are used in the test_bpf suite.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/filter.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index a1fe246a6147..ce4e18dd2c89 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1390,7 +1390,7 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
 	return 0;
 }
 
-const struct bpf_func_proto bpf_skb_store_bytes_proto = {
+static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
 	.func		= bpf_skb_store_bytes,
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
@@ -1421,7 +1421,7 @@ static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 	return 0;
 }
 
-const struct bpf_func_proto bpf_skb_load_bytes_proto = {
+static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
 	.func		= bpf_skb_load_bytes,
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
@@ -1472,7 +1472,7 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
 	return 0;
 }
 
-const struct bpf_func_proto bpf_l3_csum_replace_proto = {
+static const struct bpf_func_proto bpf_l3_csum_replace_proto = {
 	.func		= bpf_l3_csum_replace,
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
@@ -1531,7 +1531,7 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
 	return 0;
 }
 
-const struct bpf_func_proto bpf_l4_csum_replace_proto = {
+static const struct bpf_func_proto bpf_l4_csum_replace_proto = {
 	.func		= bpf_l4_csum_replace,
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
@@ -1570,7 +1570,7 @@ static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed)
 	return csum_partial(sp->diff, diff_size, seed);
 }
 
-const struct bpf_func_proto bpf_csum_diff_proto = {
+static const struct bpf_func_proto bpf_csum_diff_proto = {
 	.func		= bpf_csum_diff,
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
@@ -1608,7 +1608,7 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
 	return dev_queue_xmit(skb2);
 }
 
-const struct bpf_func_proto bpf_clone_redirect_proto = {
+static const struct bpf_func_proto bpf_clone_redirect_proto = {
 	.func           = bpf_clone_redirect,
 	.gpl_only       = false,
 	.ret_type       = RET_INTEGER,
@@ -1660,7 +1660,7 @@ int skb_do_redirect(struct sk_buff *skb)
 	return dev_queue_xmit(skb);
 }
 
-const struct bpf_func_proto bpf_redirect_proto = {
+static const struct bpf_func_proto bpf_redirect_proto = {
 	.func           = bpf_redirect,
 	.gpl_only       = false,
 	.ret_type       = RET_INTEGER,
@@ -1799,7 +1799,7 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 	return 0;
 }
 
-const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
+static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
 	.func		= bpf_skb_get_tunnel_key,
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
@@ -1861,7 +1861,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 	return 0;
 }
 
-const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
+static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
 	.func		= bpf_skb_set_tunnel_key,
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
-- 
cgit v1.2.3-59-g8ed1b


From 2208087061c4ad88de188911367effc550144836 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 Mar 2016 15:15:05 +0100
Subject: bpf: allow to propagate df in bpf_skb_set_tunnel_key

Added by 9a628224a61b ("ip_tunnel: Add dont fragment flag."), allow to
feed df flag into tunneling facilities (currently supported on TX by
vxlan, geneve and gre) as a hint from eBPF's bpf_skb_set_tunnel_key()
helper.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 1 +
 net/core/filter.c        | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2e3e90309904..21ee6d52016f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -330,6 +330,7 @@ enum bpf_func_id {
 
 /* BPF_FUNC_skb_set_tunnel_key flags. */
 #define BPF_F_ZERO_CSUM_TX		(1ULL << 1)
+#define BPF_F_DONT_FRAGMENT		(1ULL << 2)
 
 /* user accessible mirror of in-kernel sk_buff.
  * new fields can only be added to the end of this structure
diff --git a/net/core/filter.c b/net/core/filter.c
index ce4e18dd2c89..6c9d15561d04 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1819,7 +1819,8 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 	u8 compat[sizeof(struct bpf_tunnel_key)];
 	struct ip_tunnel_info *info;
 
-	if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX)))
+	if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
+			       BPF_F_DONT_FRAGMENT)))
 		return -EINVAL;
 	if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
 		switch (size) {
@@ -1844,6 +1845,9 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 	info->mode = IP_TUNNEL_INFO_TX;
 
 	info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM;
+	if (flags & BPF_F_DONT_FRAGMENT)
+		info->key.tun_flags |= TUNNEL_DONT_FRAGMENT;
+
 	info->key.tun_id = cpu_to_be64(from->tunnel_id);
 	info->key.tos = from->tunnel_tos;
 	info->key.ttl = from->tunnel_ttl;
-- 
cgit v1.2.3-59-g8ed1b


From 14ca0751c96f8d3d0f52e8ed3b3236f8b34d3460 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 Mar 2016 15:15:06 +0100
Subject: bpf: support for access to tunnel options

After eBPF being able to programmatically access/manage tunnel key meta
data via commit d3aa45ce6b94 ("bpf: add helpers to access tunnel metadata")
and more recently also for IPv6 through c6c33454072f ("bpf: support ipv6
for bpf_skb_{set,get}_tunnel_key"), this work adds two complementary
helpers to generically access their auxiliary tunnel options.

Geneve and vxlan support this facility. For geneve, TLVs can be pushed,
and for the vxlan case its GBP extension. I.e. setting tunnel key for geneve
case only makes sense, if we can also read/write TLVs into it. In the GBP
case, it provides the flexibility to easily map the group policy ID in
combination with other helpers or maps.

I chose to model this as two separate helpers, bpf_skb_{set,get}_tunnel_opt(),
for a couple of reasons. bpf_skb_{set,get}_tunnel_key() is already rather
complex by itself, and there may be cases for tunnel key backends where
tunnel options are not always needed. If we would have integrated this
into bpf_skb_{set,get}_tunnel_key() nevertheless, we are very limited with
remaining helper arguments, so keeping compatibility on structs in case of
passing in a flat buffer gets more cumbersome. Separating both also allows
for more flexibility and future extensibility, f.e. options could be fed
directly from a map, etc.

Moreover, change geneve's xmit path to test only for info->options_len
instead of TUNNEL_GENEVE_OPT flag. This makes it more consistent with vxlan's
xmit path and allows for avoiding to specify a protocol flag in the API on
xmit, so it can be protocol agnostic. Having info->options_len is enough
information that is needed. Tested with vxlan and geneve.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c     |  4 +--
 include/uapi/linux/bpf.h | 11 +++++++
 net/core/filter.c        | 83 ++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 90 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index bc5da357e16d..36db4cf0579c 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -940,7 +940,7 @@ static netdev_tx_t geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 		u8 vni[3];
 
 		tunnel_id_to_vni(key->tun_id, vni);
-		if (key->tun_flags & TUNNEL_GENEVE_OPT)
+		if (info->options_len)
 			opts = ip_tunnel_info_opts(info);
 
 		if (key->tun_flags & TUNNEL_CSUM)
@@ -1027,7 +1027,7 @@ static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 		u8 vni[3];
 
 		tunnel_id_to_vni(key->tun_id, vni);
-		if (key->tun_flags & TUNNEL_GENEVE_OPT)
+		if (info->options_len)
 			opts = ip_tunnel_info_opts(info);
 
 		if (key->tun_flags & TUNNEL_CSUM)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 21ee6d52016f..9221f653fee3 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -298,6 +298,17 @@ enum bpf_func_id {
 	 * Return: csum result
 	 */
 	BPF_FUNC_csum_diff,
+
+	/**
+	 * bpf_skb_[gs]et_tunnel_opt(skb, opt, size)
+	 * retrieve or populate tunnel options metadata
+	 * @skb: pointer to skb
+	 * @opt: pointer to raw tunnel option data
+	 * @size: size of @opt
+	 * Return: 0 on success for set, option size for get
+	 */
+	BPF_FUNC_skb_get_tunnel_opt,
+	BPF_FUNC_skb_set_tunnel_opt,
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 6c9d15561d04..012a10c2da94 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1809,6 +1809,32 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
 	.arg4_type	= ARG_ANYTHING,
 };
 
+static u64 bpf_skb_get_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
+{
+	struct sk_buff *skb = (struct sk_buff *) (long) r1;
+	u8 *to = (u8 *) (long) r2;
+	const struct ip_tunnel_info *info = skb_tunnel_info(skb);
+
+	if (unlikely(!info ||
+		     !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT)))
+		return -ENOENT;
+	if (unlikely(size < info->options_len))
+		return -ENOMEM;
+
+	ip_tunnel_info_opts_get(to, info);
+
+	return info->options_len;
+}
+
+static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
+	.func		= bpf_skb_get_tunnel_opt,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_STACK,
+	.arg3_type	= ARG_CONST_STACK_SIZE,
+};
+
 static struct metadata_dst __percpu *md_dst;
 
 static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
@@ -1875,17 +1901,58 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
 	.arg4_type	= ARG_ANYTHING,
 };
 
-static const struct bpf_func_proto *bpf_get_skb_set_tunnel_key_proto(void)
+#define BPF_TUNLEN_MAX	255
+
+static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
+{
+	struct sk_buff *skb = (struct sk_buff *) (long) r1;
+	u8 *from = (u8 *) (long) r2;
+	struct ip_tunnel_info *info = skb_tunnel_info(skb);
+	const struct metadata_dst *md = this_cpu_ptr(md_dst);
+
+	if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1))))
+		return -EINVAL;
+	if (unlikely(size > BPF_TUNLEN_MAX))
+		return -ENOMEM;
+
+	ip_tunnel_info_opts_set(info, from, size);
+
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
+	.func		= bpf_skb_set_tunnel_opt,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_STACK,
+	.arg3_type	= ARG_CONST_STACK_SIZE,
+};
+
+static const struct bpf_func_proto *
+bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
 {
 	if (!md_dst) {
-		/* race is not possible, since it's called from
-		 * verifier that is holding verifier mutex
+		BUILD_BUG_ON(FIELD_SIZEOF(struct ip_tunnel_info,
+					  options_len) != 1);
+
+		/* Race is not possible, since it's called from verifier
+		 * that is holding verifier mutex.
 		 */
-		md_dst = metadata_dst_alloc_percpu(0, GFP_KERNEL);
+		md_dst = metadata_dst_alloc_percpu(BPF_TUNLEN_MAX,
+						   GFP_KERNEL);
 		if (!md_dst)
 			return NULL;
 	}
-	return &bpf_skb_set_tunnel_key_proto;
+
+	switch (which) {
+	case BPF_FUNC_skb_set_tunnel_key:
+		return &bpf_skb_set_tunnel_key_proto;
+	case BPF_FUNC_skb_set_tunnel_opt:
+		return &bpf_skb_set_tunnel_opt_proto;
+	default:
+		return NULL;
+	}
 }
 
 static const struct bpf_func_proto *
@@ -1939,7 +2006,11 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 	case BPF_FUNC_skb_get_tunnel_key:
 		return &bpf_skb_get_tunnel_key_proto;
 	case BPF_FUNC_skb_set_tunnel_key:
-		return bpf_get_skb_set_tunnel_key_proto();
+		return bpf_get_skb_set_tunnel_proto(func_id);
+	case BPF_FUNC_skb_get_tunnel_opt:
+		return &bpf_skb_get_tunnel_opt_proto;
+	case BPF_FUNC_skb_set_tunnel_opt:
+		return bpf_get_skb_set_tunnel_proto(func_id);
 	case BPF_FUNC_redirect:
 		return &bpf_redirect_proto;
 	case BPF_FUNC_get_route_realm:
-- 
cgit v1.2.3-59-g8ed1b


From db3c6139e6ead91b42e7c2ad044ed8beaee884e6 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 Mar 2016 15:15:07 +0100
Subject: bpf, vxlan, geneve, gre: fix usage of dst_cache on xmit

The assumptions from commit 0c1d70af924b ("net: use dst_cache for vxlan
device"), 468dfffcd762 ("geneve: add dst caching support") and 3c1cb4d2604c
("net/ipv4: add dst cache support for gre lwtunnels") on dst_cache usage
when ip_tunnel_info is used is unfortunately not always valid as assumed.

While it seems correct for ip_tunnel_info front-ends such as OVS, eBPF
however can fill in ip_tunnel_info for consumers like vxlan, geneve or gre
with different remote dsts, tos, etc, therefore they cannot be assumed as
packet independent.

Right now vxlan, geneve, gre would cache the dst for eBPF and every packet
would reuse the same entry that was first created on the initial route
lookup. eBPF doesn't store/cache the ip_tunnel_info, so each skb may have
a different one.

Fix it by adding a flag that checks the ip_tunnel_info. Also the !tos test
in vxlan needs to be handeled differently in this context as it is currently
inferred from ip_tunnel_info as well if present. ip_tunnel_dst_cache_usable()
helper is added for the three tunnel cases, which checks if we can use dst
cache.

Fixes: 0c1d70af924b ("net: use dst_cache for vxlan device")
Fixes: 468dfffcd762 ("geneve: add dst caching support")
Fixes: 3c1cb4d2604c ("net/ipv4: add dst cache support for gre lwtunnels")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c     |  6 ++----
 drivers/net/vxlan.c      | 24 ++++++++++++------------
 include/net/ip_tunnels.h | 15 +++++++++++++++
 net/core/filter.c        |  2 +-
 net/ipv4/ip_gre.c        | 10 ++++++----
 5 files changed, 36 insertions(+), 21 deletions(-)

(limited to 'net')

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 36db4cf0579c..6a0cbbe03e5d 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -775,10 +775,10 @@ static struct rtable *geneve_get_v4_rt(struct sk_buff *skb,
 				       struct flowi4 *fl4,
 				       struct ip_tunnel_info *info)
 {
+	bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
 	struct geneve_dev *geneve = netdev_priv(dev);
 	struct dst_cache *dst_cache;
 	struct rtable *rt = NULL;
-	bool use_cache = true;
 	__u8 tos;
 
 	memset(fl4, 0, sizeof(*fl4));
@@ -804,7 +804,6 @@ static struct rtable *geneve_get_v4_rt(struct sk_buff *skb,
 		dst_cache = &geneve->dst_cache;
 	}
 
-	use_cache = use_cache && !skb->mark;
 	if (use_cache) {
 		rt = dst_cache_get_ip4(dst_cache, &fl4->saddr);
 		if (rt)
@@ -832,11 +831,11 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb,
 					   struct flowi6 *fl6,
 					   struct ip_tunnel_info *info)
 {
+	bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
 	struct geneve_dev *geneve = netdev_priv(dev);
 	struct geneve_sock *gs6 = geneve->sock6;
 	struct dst_entry *dst = NULL;
 	struct dst_cache *dst_cache;
-	bool use_cache = true;
 	__u8 prio;
 
 	memset(fl6, 0, sizeof(*fl6));
@@ -862,7 +861,6 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb,
 		dst_cache = &geneve->dst_cache;
 	}
 
-	use_cache = use_cache && !skb->mark;
 	if (use_cache) {
 		dst = dst_cache_get_ip6(dst_cache, &fl6->saddr);
 		if (dst)
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index fc998a3bd234..7294a459b13c 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1756,17 +1756,15 @@ static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan,
 				      struct sk_buff *skb, int oif, u8 tos,
 				      __be32 daddr, __be32 *saddr,
 				      struct dst_cache *dst_cache,
-				      struct ip_tunnel_info *info)
+				      const struct ip_tunnel_info *info)
 {
+	bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
 	struct rtable *rt = NULL;
-	bool use_cache = false;
 	struct flowi4 fl4;
 
-	/* when the ip_tunnel_info is availble, the tos used for lookup is
-	 * packet independent, so we can use the cache
-	 */
-	if (!skb->mark && (!tos || info)) {
-		use_cache = true;
+	if (tos && !info)
+		use_cache = false;
+	if (use_cache) {
 		rt = dst_cache_get_ip4(dst_cache, saddr);
 		if (rt)
 			return rt;
@@ -1794,13 +1792,15 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
 					  struct sk_buff *skb, int oif,
 					  const struct in6_addr *daddr,
 					  struct in6_addr *saddr,
-					  struct dst_cache *dst_cache)
+					  struct dst_cache *dst_cache,
+					  const struct ip_tunnel_info *info)
 {
+	bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
 	struct dst_entry *ndst;
 	struct flowi6 fl6;
 	int err;
 
-	if (!skb->mark) {
+	if (use_cache) {
 		ndst = dst_cache_get_ip6(dst_cache, saddr);
 		if (ndst)
 			return ndst;
@@ -1820,7 +1820,7 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
 		return ERR_PTR(err);
 
 	*saddr = fl6.saddr;
-	if (!skb->mark)
+	if (use_cache)
 		dst_cache_set_ip6(dst_cache, ndst, saddr);
 	return ndst;
 }
@@ -2018,7 +2018,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		ndst = vxlan6_get_route(vxlan, skb,
 					rdst ? rdst->remote_ifindex : 0,
 					&dst->sin6.sin6_addr, &saddr,
-					dst_cache);
+					dst_cache, info);
 		if (IS_ERR(ndst)) {
 			netdev_dbg(dev, "no route to %pI6\n",
 				   &dst->sin6.sin6_addr);
@@ -2387,7 +2387,7 @@ static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 			return -EINVAL;
 		ndst = vxlan6_get_route(vxlan, skb, 0,
 					&info->key.u.ipv6.dst,
-					&info->key.u.ipv6.src, NULL);
+					&info->key.u.ipv6.src, NULL, info);
 		if (IS_ERR(ndst))
 			return PTR_ERR(ndst);
 		dst_release(ndst);
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 5f28b606633e..e1395d70fb48 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -140,6 +140,7 @@ struct ip_tunnel {
 #define TUNNEL_CRIT_OPT		__cpu_to_be16(0x0400)
 #define TUNNEL_GENEVE_OPT	__cpu_to_be16(0x0800)
 #define TUNNEL_VXLAN_OPT	__cpu_to_be16(0x1000)
+#define TUNNEL_NOCACHE		__cpu_to_be16(0x2000)
 
 #define TUNNEL_OPTIONS_PRESENT	(TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT)
 
@@ -206,6 +207,20 @@ static inline void ip_tunnel_key_init(struct ip_tunnel_key *key,
 		       0, sizeof(*key) - IP_TUNNEL_KEY_SIZE);
 }
 
+static inline bool
+ip_tunnel_dst_cache_usable(const struct sk_buff *skb,
+			   const struct ip_tunnel_info *info)
+{
+	if (skb->mark)
+		return false;
+	if (!info)
+		return true;
+	if (info->key.tun_flags & TUNNEL_NOCACHE)
+		return false;
+
+	return true;
+}
+
 static inline unsigned short ip_tunnel_info_af(const struct ip_tunnel_info
 					       *tun_info)
 {
diff --git a/net/core/filter.c b/net/core/filter.c
index 012a10c2da94..a66dc03c261f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1870,7 +1870,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 	info = &md->u.tun_info;
 	info->mode = IP_TUNNEL_INFO_TX;
 
-	info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM;
+	info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
 	if (flags & BPF_F_DONT_FRAGMENT)
 		info->key.tun_flags |= TUNNEL_DONT_FRAGMENT;
 
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 202437d6087b..31936d387cfd 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -527,11 +527,12 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ip_tunnel_info *tun_info;
 	const struct ip_tunnel_key *key;
+	struct rtable *rt = NULL;
 	struct flowi4 fl;
-	struct rtable *rt;
 	int min_headroom;
 	int tunnel_hlen;
 	__be16 df, flags;
+	bool use_cache;
 	int err;
 
 	tun_info = skb_tunnel_info(skb);
@@ -540,13 +541,14 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
 		goto err_free_skb;
 
 	key = &tun_info->key;
-	rt = !skb->mark ? dst_cache_get_ip4(&tun_info->dst_cache, &fl.saddr) :
-			 NULL;
+	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
+	if (use_cache)
+		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl.saddr);
 	if (!rt) {
 		rt = gre_get_rt(skb, dev, &fl, key);
 		if (IS_ERR(rt))
 				goto err_free_skb;
-		if (!skb->mark)
+		if (use_cache)
 			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
 					  fl.saddr);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From f8b33d8e870758ccff13e5f81fd5050b52a42d35 Mon Sep 17 00:00:00 2001
From: Kyeong Yoo <Kyeong.Yoo@alliedtelesis.co.nz>
Date: Mon, 7 Mar 2016 17:07:57 +1300
Subject: net_sched: dsmark: use qdisc_dequeue_peeked()

This fix is for dsmark similar to commit 3557619f0f6f7496ed453d4825e249
("net_sched: prio: use qdisc_dequeue_peeked")
and makes use of qdisc_dequeue_peeked() instead of direct dequeue() call.

First time, wrr peeks dsmark, which will then peek into sfq.
sfq dequeues an skb and it's stored in sch->gso_skb.
Next time, wrr tries to dequeue from dsmark, which will call sfq dequeue
directly. This results skipping the previously peeked skb.

So changed dsmark dequeue to call qdisc_dequeue_peeked() instead to use
peeked skb if exists.

Signed-off-by: Kyeong Yoo <kyeong.yoo@alliedtelesis.co.nz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_dsmark.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index d0dff0cd8186..34b4ddaca27c 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -276,7 +276,7 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
 
 	pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
 
-	skb = p->q->ops->dequeue(p->q);
+	skb = qdisc_dequeue_peeked(p->q);
 	if (skb == NULL)
 		return NULL;
 
-- 
cgit v1.2.3-59-g8ed1b


From d8aacd87180141ff6b812b53de77a4336e87c91a Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Tue, 8 Mar 2016 20:29:10 +0100
Subject: netfilter: ipset: Check IPSET_ATTR_ETHER netlink attribute length

Julia Lawall pointed out that IPSET_ATTR_ETHER netlink attribute length
was not checked explicitly, just for the maximum possible size. Malicious
netlink clients could send shorter attribute and thus resulting a kernel
read after the buffer.

The patch adds the explicit length checkings.

Reported-by: Julia Lawall <julia.lawall@lip6.fr>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 net/netfilter/ipset/ip_set_bitmap_ipmac.c | 2 ++
 net/netfilter/ipset/ip_set_hash_mac.c     | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
index 29dde208381d..9a065f672d3a 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -267,6 +267,8 @@ bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[],
 
 	e.id = ip_to_id(map, ip);
 	if (tb[IPSET_ATTR_ETHER]) {
+		if (nla_len(tb[IPSET_ATTR_ETHER]) != ETH_ALEN)
+			return -IPSET_ERR_PROTOCOL;
 		memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN);
 		e.add_mac = 1;
 	}
diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c
index f1e7d2c0f685..8f004edad396 100644
--- a/net/netfilter/ipset/ip_set_hash_mac.c
+++ b/net/netfilter/ipset/ip_set_hash_mac.c
@@ -110,7 +110,8 @@ hash_mac4_uadt(struct ip_set *set, struct nlattr *tb[],
 	if (tb[IPSET_ATTR_LINENO])
 		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
 
-	if (unlikely(!tb[IPSET_ATTR_ETHER]))
+	if (unlikely(!tb[IPSET_ATTR_ETHER] ||
+		     nla_len(tb[IPSET_ATTR_ETHER]) != ETH_ALEN))
 		return -IPSET_ERR_PROTOCOL;
 
 	ret = ip_set_get_extensions(set, tb, &ext);
-- 
cgit v1.2.3-59-g8ed1b


From 3570df914f9cae15df7f6ba472a51eaf798fdb46 Mon Sep 17 00:00:00 2001
From: Michal Kubeček <mkubecek@suse.cz>
Date: Tue, 8 Mar 2016 14:44:25 +0100
Subject: ipv6: replace global gc_args with local variable

Global variable gc_args is only used in fib6_run_gc() and functions
called from it. As fib6_run_gc() makes sure there is at most one
instance of fib6_clean_all() running at any moment, we can replace
gc_args with a local variable which will be needed once multiple
instances (per netns) of garbage collector are allowed.

Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
Reviewed-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_fib.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 0c7e276c230e..d7c715accac9 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1725,14 +1725,15 @@ static void fib6_flush_trees(struct net *net)
  *	Garbage collection
  */
 
-static struct fib6_gc_args
+struct fib6_gc_args
 {
 	int			timeout;
 	int			more;
-} gc_args;
+};
 
 static int fib6_age(struct rt6_info *rt, void *arg)
 {
+	struct fib6_gc_args *gc_args = arg;
 	unsigned long now = jiffies;
 
 	/*
@@ -1748,10 +1749,10 @@ static int fib6_age(struct rt6_info *rt, void *arg)
 			RT6_TRACE("expiring %p\n", rt);
 			return -1;
 		}
-		gc_args.more++;
+		gc_args->more++;
 	} else if (rt->rt6i_flags & RTF_CACHE) {
 		if (atomic_read(&rt->dst.__refcnt) == 0 &&
-		    time_after_eq(now, rt->dst.lastuse + gc_args.timeout)) {
+		    time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
 			RT6_TRACE("aging clone %p\n", rt);
 			return -1;
 		} else if (rt->rt6i_flags & RTF_GATEWAY) {
@@ -1769,7 +1770,7 @@ static int fib6_age(struct rt6_info *rt, void *arg)
 				return -1;
 			}
 		}
-		gc_args.more++;
+		gc_args->more++;
 	}
 
 	return 0;
@@ -1779,6 +1780,7 @@ static DEFINE_SPINLOCK(fib6_gc_lock);
 
 void fib6_run_gc(unsigned long expires, struct net *net, bool force)
 {
+	struct fib6_gc_args gc_args;
 	unsigned long now;
 
 	if (force) {
@@ -1792,7 +1794,7 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force)
 
 	gc_args.more = icmp6_dst_gc();
 
-	fib6_clean_all(net, fib6_age, NULL);
+	fib6_clean_all(net, fib6_age, &gc_args);
 	now = jiffies;
 	net->ipv6.ip6_rt_last_gc = now;
 
-- 
cgit v1.2.3-59-g8ed1b


From 9a03cd8f38efb83c13fbe62aff50eea4efff93da Mon Sep 17 00:00:00 2001
From: Michal Kubeček <mkubecek@suse.cz>
Date: Tue, 8 Mar 2016 14:44:35 +0100
Subject: ipv6: per netns fib6 walkers

The IPv6 FIB data structures are separated per network namespace but
there is still only one global walkers list and one global walker list
lock. This means changes in one namespace unnecessarily interfere with
walkers in other namespaces.

Replace the global list with per-netns lists (and give each its own
lock).

Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
Reviewed-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv6.h |  2 ++
 net/ipv6/ip6_fib.c       | 68 +++++++++++++++++++++++++-----------------------
 2 files changed, 38 insertions(+), 32 deletions(-)

(limited to 'net')

diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index c0368db6df54..f0109b973648 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -58,7 +58,9 @@ struct netns_ipv6 {
 	struct timer_list       ip6_fib_timer;
 	struct hlist_head       *fib_table_hash;
 	struct fib6_table       *fib6_main_tbl;
+	struct list_head	fib6_walkers;
 	struct dst_ops		ip6_dst_ops;
+	rwlock_t		fib6_walker_lock;
 	unsigned int		 ip6_rt_gc_expire;
 	unsigned long		 ip6_rt_last_gc;
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index d7c715accac9..883f2836beab 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -55,8 +55,6 @@ struct fib6_cleaner {
 	void *arg;
 };
 
-static DEFINE_RWLOCK(fib6_walker_lock);
-
 #ifdef CONFIG_IPV6_SUBTREES
 #define FWS_INIT FWS_S
 #else
@@ -66,7 +64,7 @@ static DEFINE_RWLOCK(fib6_walker_lock);
 static void fib6_prune_clones(struct net *net, struct fib6_node *fn);
 static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn);
 static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn);
-static int fib6_walk(struct fib6_walker *w);
+static int fib6_walk(struct net *net, struct fib6_walker *w);
 static int fib6_walk_continue(struct fib6_walker *w);
 
 /*
@@ -78,21 +76,21 @@ static int fib6_walk_continue(struct fib6_walker *w);
 
 static void fib6_gc_timer_cb(unsigned long arg);
 
-static LIST_HEAD(fib6_walkers);
-#define FOR_WALKERS(w) list_for_each_entry(w, &fib6_walkers, lh)
+#define FOR_WALKERS(net, w) \
+	list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh)
 
-static void fib6_walker_link(struct fib6_walker *w)
+static void fib6_walker_link(struct net *net, struct fib6_walker *w)
 {
-	write_lock_bh(&fib6_walker_lock);
-	list_add(&w->lh, &fib6_walkers);
-	write_unlock_bh(&fib6_walker_lock);
+	write_lock_bh(&net->ipv6.fib6_walker_lock);
+	list_add(&w->lh, &net->ipv6.fib6_walkers);
+	write_unlock_bh(&net->ipv6.fib6_walker_lock);
 }
 
-static void fib6_walker_unlink(struct fib6_walker *w)
+static void fib6_walker_unlink(struct net *net, struct fib6_walker *w)
 {
-	write_lock_bh(&fib6_walker_lock);
+	write_lock_bh(&net->ipv6.fib6_walker_lock);
 	list_del(&w->lh);
-	write_unlock_bh(&fib6_walker_lock);
+	write_unlock_bh(&net->ipv6.fib6_walker_lock);
 }
 
 static int fib6_new_sernum(struct net *net)
@@ -325,12 +323,13 @@ static int fib6_dump_node(struct fib6_walker *w)
 
 static void fib6_dump_end(struct netlink_callback *cb)
 {
+	struct net *net = sock_net(cb->skb->sk);
 	struct fib6_walker *w = (void *)cb->args[2];
 
 	if (w) {
 		if (cb->args[4]) {
 			cb->args[4] = 0;
-			fib6_walker_unlink(w);
+			fib6_walker_unlink(net, w);
 		}
 		cb->args[2] = 0;
 		kfree(w);
@@ -348,6 +347,7 @@ static int fib6_dump_done(struct netlink_callback *cb)
 static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
 			   struct netlink_callback *cb)
 {
+	struct net *net = sock_net(skb->sk);
 	struct fib6_walker *w;
 	int res;
 
@@ -359,7 +359,7 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
 		w->skip = 0;
 
 		read_lock_bh(&table->tb6_lock);
-		res = fib6_walk(w);
+		res = fib6_walk(net, w);
 		read_unlock_bh(&table->tb6_lock);
 		if (res > 0) {
 			cb->args[4] = 1;
@@ -379,7 +379,7 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
 		res = fib6_walk_continue(w);
 		read_unlock_bh(&table->tb6_lock);
 		if (res <= 0) {
-			fib6_walker_unlink(w);
+			fib6_walker_unlink(net, w);
 			cb->args[4] = 0;
 		}
 	}
@@ -1340,8 +1340,8 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
 		}
 #endif
 
-		read_lock(&fib6_walker_lock);
-		FOR_WALKERS(w) {
+		read_lock(&net->ipv6.fib6_walker_lock);
+		FOR_WALKERS(net, w) {
 			if (!child) {
 				if (w->root == fn) {
 					w->root = w->node = NULL;
@@ -1368,7 +1368,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
 				}
 			}
 		}
-		read_unlock(&fib6_walker_lock);
+		read_unlock(&net->ipv6.fib6_walker_lock);
 
 		node_free(fn);
 		if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn))
@@ -1411,8 +1411,8 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
 	}
 
 	/* Adjust walkers */
-	read_lock(&fib6_walker_lock);
-	FOR_WALKERS(w) {
+	read_lock(&net->ipv6.fib6_walker_lock);
+	FOR_WALKERS(net, w) {
 		if (w->state == FWS_C && w->leaf == rt) {
 			RT6_TRACE("walker %p adjusted by delroute\n", w);
 			w->leaf = rt->dst.rt6_next;
@@ -1420,7 +1420,7 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
 				w->state = FWS_U;
 		}
 	}
-	read_unlock(&fib6_walker_lock);
+	read_unlock(&net->ipv6.fib6_walker_lock);
 
 	rt->dst.rt6_next = NULL;
 
@@ -1588,17 +1588,17 @@ skip:
 	}
 }
 
-static int fib6_walk(struct fib6_walker *w)
+static int fib6_walk(struct net *net, struct fib6_walker *w)
 {
 	int res;
 
 	w->state = FWS_INIT;
 	w->node = w->root;
 
-	fib6_walker_link(w);
+	fib6_walker_link(net, w);
 	res = fib6_walk_continue(w);
 	if (res <= 0)
-		fib6_walker_unlink(w);
+		fib6_walker_unlink(net, w);
 	return res;
 }
 
@@ -1668,7 +1668,7 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root,
 	c.arg = arg;
 	c.net = net;
 
-	fib6_walk(&c.w);
+	fib6_walk(net, &c.w);
 }
 
 static void __fib6_clean_all(struct net *net,
@@ -1816,6 +1816,8 @@ static int __net_init fib6_net_init(struct net *net)
 {
 	size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ;
 
+	rwlock_init(&net->ipv6.fib6_walker_lock);
+	INIT_LIST_HEAD(&net->ipv6.fib6_walkers);
 	setup_timer(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, (unsigned long)net);
 
 	net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL);
@@ -1976,7 +1978,8 @@ static int ipv6_route_yield(struct fib6_walker *w)
 	return 0;
 }
 
-static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter)
+static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter,
+				      struct net *net)
 {
 	memset(&iter->w, 0, sizeof(iter->w));
 	iter->w.func = ipv6_route_yield;
@@ -1986,7 +1989,7 @@ static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter)
 	iter->w.args = iter;
 	iter->sernum = iter->w.root->fn_sernum;
 	INIT_LIST_HEAD(&iter->w.lh);
-	fib6_walker_link(&iter->w);
+	fib6_walker_link(net, &iter->w);
 }
 
 static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl,
@@ -2047,16 +2050,16 @@ iter_table:
 			++*pos;
 		return iter->w.leaf;
 	} else if (r < 0) {
-		fib6_walker_unlink(&iter->w);
+		fib6_walker_unlink(net, &iter->w);
 		return NULL;
 	}
-	fib6_walker_unlink(&iter->w);
+	fib6_walker_unlink(net, &iter->w);
 
 	iter->tbl = ipv6_route_seq_next_table(iter->tbl, net);
 	if (!iter->tbl)
 		return NULL;
 
-	ipv6_route_seq_setup_walk(iter);
+	ipv6_route_seq_setup_walk(iter, net);
 	goto iter_table;
 }
 
@@ -2071,7 +2074,7 @@ static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos)
 	iter->skip = *pos;
 
 	if (iter->tbl) {
-		ipv6_route_seq_setup_walk(iter);
+		ipv6_route_seq_setup_walk(iter, net);
 		return ipv6_route_seq_next(seq, NULL, pos);
 	} else {
 		return NULL;
@@ -2087,10 +2090,11 @@ static bool ipv6_route_iter_active(struct ipv6_route_iter *iter)
 static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
 	__releases(RCU_BH)
 {
+	struct net *net = seq_file_net(seq);
 	struct ipv6_route_iter *iter = seq->private;
 
 	if (ipv6_route_iter_active(iter))
-		fib6_walker_unlink(&iter->w);
+		fib6_walker_unlink(net, &iter->w);
 
 	rcu_read_unlock_bh();
 }
-- 
cgit v1.2.3-59-g8ed1b


From 3dc94f93be161ec4203673de9a34b7362d8985b5 Mon Sep 17 00:00:00 2001
From: Michal Kubeček <mkubecek@suse.cz>
Date: Tue, 8 Mar 2016 14:44:45 +0100
Subject: ipv6: per netns FIB garbage collection

One of our customers observed issues with FIB6 garbage collectors
running in different network namespaces blocking each other, resulting
in soft lockups (fib6_run_gc() initiated from timer runs always in
forced mode).

Now that FIB6 walkers are separated per namespace, there is no more need
for instances of fib6_run_gc() in different namespaces blocking each
other. There is still a call to icmp6_dst_gc() which operates on shared
data but this function is protected by its own shared lock.

Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
Reviewed-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv6.h | 1 +
 net/ipv6/ip6_fib.c       | 9 ++++-----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index f0109b973648..10d0848f5b8a 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -61,6 +61,7 @@ struct netns_ipv6 {
 	struct list_head	fib6_walkers;
 	struct dst_ops		ip6_dst_ops;
 	rwlock_t		fib6_walker_lock;
+	spinlock_t		fib6_gc_lock;
 	unsigned int		 ip6_rt_gc_expire;
 	unsigned long		 ip6_rt_last_gc;
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 883f2836beab..ea071fad67a0 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1776,16 +1776,14 @@ static int fib6_age(struct rt6_info *rt, void *arg)
 	return 0;
 }
 
-static DEFINE_SPINLOCK(fib6_gc_lock);
-
 void fib6_run_gc(unsigned long expires, struct net *net, bool force)
 {
 	struct fib6_gc_args gc_args;
 	unsigned long now;
 
 	if (force) {
-		spin_lock_bh(&fib6_gc_lock);
-	} else if (!spin_trylock_bh(&fib6_gc_lock)) {
+		spin_lock_bh(&net->ipv6.fib6_gc_lock);
+	} else if (!spin_trylock_bh(&net->ipv6.fib6_gc_lock)) {
 		mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ);
 		return;
 	}
@@ -1804,7 +1802,7 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force)
 					+ net->ipv6.sysctl.ip6_rt_gc_interval));
 	else
 		del_timer(&net->ipv6.ip6_fib_timer);
-	spin_unlock_bh(&fib6_gc_lock);
+	spin_unlock_bh(&net->ipv6.fib6_gc_lock);
 }
 
 static void fib6_gc_timer_cb(unsigned long arg)
@@ -1816,6 +1814,7 @@ static int __net_init fib6_net_init(struct net *net)
 {
 	size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ;
 
+	spin_lock_init(&net->ipv6.fib6_gc_lock);
 	rwlock_init(&net->ipv6.fib6_walker_lock);
 	INIT_LIST_HEAD(&net->ipv6.fib6_walkers);
 	setup_timer(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, (unsigned long)net);
-- 
cgit v1.2.3-59-g8ed1b


From f4a00aacdb5f6784d46e8c999b6bb52ece4b306b Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Mon, 7 Mar 2016 14:11:01 -0800
Subject: net: Make sock_alloc exportable

Export it for cases where we want to create sockets by hand.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/net.h | 1 +
 net/socket.c        | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/linux/net.h b/include/linux/net.h
index 0b4ac7da583a..49175e4ced11 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -215,6 +215,7 @@ int __sock_create(struct net *net, int family, int type, int proto,
 int sock_create(int family, int type, int proto, struct socket **res);
 int sock_create_kern(struct net *net, int family, int type, int proto, struct socket **res);
 int sock_create_lite(int family, int type, int proto, struct socket **res);
+struct socket *sock_alloc(void);
 void sock_release(struct socket *sock);
 int sock_sendmsg(struct socket *sock, struct msghdr *msg);
 int sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
diff --git a/net/socket.c b/net/socket.c
index c044d1e8508c..38a78d4d50f5 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -533,7 +533,7 @@ static const struct inode_operations sockfs_inode_ops = {
  *	NULL is returned.
  */
 
-static struct socket *sock_alloc(void)
+struct socket *sock_alloc(void)
 {
 	struct inode *inode;
 	struct socket *sock;
@@ -554,6 +554,7 @@ static struct socket *sock_alloc(void)
 	this_cpu_add(sockets_in_use, 1);
 	return sock;
 }
+EXPORT_SYMBOL(sock_alloc);
 
 /**
  *	sock_release	-	close a socket
-- 
cgit v1.2.3-59-g8ed1b


From 28a94d8fb35b3a75b802f368ae6f4a9f6b0d435a Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Mon, 7 Mar 2016 14:11:02 -0800
Subject: net: Allow MSG_EOR in each msghdr of sendmmsg

This patch allows setting MSG_EOR in each individual msghdr passed
in sendmmsg. This allows a sendmmsg to send multiple messages when
using SOCK_SEQPACKET.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/socket.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/socket.c b/net/socket.c
index 38a78d4d50f5..0dd4dd818f41 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1875,7 +1875,8 @@ static int copy_msghdr_from_user(struct msghdr *kmsg,
 
 static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
 			 struct msghdr *msg_sys, unsigned int flags,
-			 struct used_address *used_address)
+			 struct used_address *used_address,
+			 unsigned int allowed_msghdr_flags)
 {
 	struct compat_msghdr __user *msg_compat =
 	    (struct compat_msghdr __user *)msg;
@@ -1901,6 +1902,7 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
 
 	if (msg_sys->msg_controllen > INT_MAX)
 		goto out_freeiov;
+	flags |= (msg_sys->msg_flags & allowed_msghdr_flags);
 	ctl_len = msg_sys->msg_controllen;
 	if ((MSG_CMSG_COMPAT & flags) && ctl_len) {
 		err =
@@ -1979,7 +1981,7 @@ long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned flags)
 	if (!sock)
 		goto out;
 
-	err = ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL);
+	err = ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL, 0);
 
 	fput_light(sock->file, fput_needed);
 out:
@@ -2024,7 +2026,7 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
 	while (datagrams < vlen) {
 		if (MSG_CMSG_COMPAT & flags) {
 			err = ___sys_sendmsg(sock, (struct user_msghdr __user *)compat_entry,
-					     &msg_sys, flags, &used_address);
+					     &msg_sys, flags, &used_address, MSG_EOR);
 			if (err < 0)
 				break;
 			err = __put_user(err, &compat_entry->msg_len);
@@ -2032,7 +2034,7 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
 		} else {
 			err = ___sys_sendmsg(sock,
 					     (struct user_msghdr __user *)entry,
-					     &msg_sys, flags, &used_address);
+					     &msg_sys, flags, &used_address, MSG_EOR);
 			if (err < 0)
 				break;
 			err = put_user(err, &entry->msg_len);
-- 
cgit v1.2.3-59-g8ed1b


From f092276d85b82504e8a07498f4e9e0c51f06745c Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Mon, 7 Mar 2016 14:11:03 -0800
Subject: net: Add MSG_BATCH flag

Add a new msg flag called MSG_BATCH. This flag is used in sendmsg to
indicate that more messages will follow (i.e. a batch of messages is
being sent). This is similar to MSG_MORE except that the following
messages are not merged into one packet, they are sent individually.
sendmmsg is updated so that each contained message except for the
last one is marked as MSG_BATCH.

MSG_BATCH is a performance optimization in cases where a socket
implementation can benefit by transmitting packets in a batch.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/socket.h | 1 +
 net/socket.c           | 5 +++++
 2 files changed, 6 insertions(+)

(limited to 'net')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 5bf59c8493b7..d834af22a460 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -274,6 +274,7 @@ struct ucred {
 #define MSG_MORE	0x8000	/* Sender will send more */
 #define MSG_WAITFORONE	0x10000	/* recvmmsg(): block until 1+ packets avail */
 #define MSG_SENDPAGE_NOTLAST 0x20000 /* sendpage() internal : not the last page */
+#define MSG_BATCH	0x40000 /* sendmmsg(): more messages coming */
 #define MSG_EOF         MSG_FIN
 
 #define MSG_FASTOPEN	0x20000000	/* Send data in TCP SYN */
diff --git a/net/socket.c b/net/socket.c
index 0dd4dd818f41..886649c88d8f 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2008,6 +2008,7 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
 	struct compat_mmsghdr __user *compat_entry;
 	struct msghdr msg_sys;
 	struct used_address used_address;
+	unsigned int oflags = flags;
 
 	if (vlen > UIO_MAXIOV)
 		vlen = UIO_MAXIOV;
@@ -2022,8 +2023,12 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
 	entry = mmsg;
 	compat_entry = (struct compat_mmsghdr __user *)mmsg;
 	err = 0;
+	flags |= MSG_BATCH;
 
 	while (datagrams < vlen) {
+		if (datagrams == vlen - 1)
+			flags = oflags;
+
 		if (MSG_CMSG_COMPAT & flags) {
 			err = ___sys_sendmsg(sock, (struct user_msghdr __user *)compat_entry,
 					     &msg_sys, flags, &used_address, MSG_EOR);
-- 
cgit v1.2.3-59-g8ed1b


From fa9835e52e3ea946916c2ce6c625c86421131740 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Mon, 7 Mar 2016 14:11:04 -0800
Subject: net: Walk fragments in __skb_splice_bits

Add walking of fragments in __skb_splice_bits.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 39 ++++++++++++++++-----------------------
 1 file changed, 16 insertions(+), 23 deletions(-)

(limited to 'net')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 9d7be61e5e6b..51d768e7bc90 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1918,6 +1918,7 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
 			      struct splice_pipe_desc *spd, struct sock *sk)
 {
 	int seg;
+	struct sk_buff *iter;
 
 	/* map the linear part :
 	 * If skb->head_frag is set, this 'linear' part is backed by a
@@ -1944,6 +1945,19 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
 			return true;
 	}
 
+	skb_walk_frags(skb, iter) {
+		if (*offset >= iter->len) {
+			*offset -= iter->len;
+			continue;
+		}
+		/* __skb_splice_bits() only fails if the output has no room
+		 * left, so no point in going over the frag_list for the error
+		 * case.
+		 */
+		if (__skb_splice_bits(iter, pipe, offset, len, spd, sk))
+			return true;
+	}
+
 	return false;
 }
 
@@ -1970,9 +1984,7 @@ ssize_t skb_socket_splice(struct sock *sk,
 
 /*
  * Map data from the skb to a pipe. Should handle both the linear part,
- * the fragments, and the frag list. It does NOT handle frag lists within
- * the frag list, if such a thing exists. We'd probably need to recurse to
- * handle that cleanly.
+ * the fragments, and the frag list.
  */
 int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
 		    struct pipe_inode_info *pipe, unsigned int tlen,
@@ -1991,29 +2003,10 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
 		.ops = &nosteal_pipe_buf_ops,
 		.spd_release = sock_spd_release,
 	};
-	struct sk_buff *frag_iter;
 	int ret = 0;
 
-	/*
-	 * __skb_splice_bits() only fails if the output has no room left,
-	 * so no point in going over the frag_list for the error case.
-	 */
-	if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk))
-		goto done;
-	else if (!tlen)
-		goto done;
+	__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk);
 
-	/*
-	 * now see if we have a frag_list to map
-	 */
-	skb_walk_frags(skb, frag_iter) {
-		if (!tlen)
-			break;
-		if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk))
-			break;
-	}
-
-done:
 	if (spd.nr_pages)
 		ret = splice_cb(sk, pipe, &spd);
 
-- 
cgit v1.2.3-59-g8ed1b


From 473bd239b808a8af5241ce9996a16d283d88ddff Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Mon, 7 Mar 2016 14:11:05 -0800
Subject: tcp: Add tcp_inq to get available receive bytes on socket

Create a common kernel function to get the number of bytes available
on a TCP socket. This is based on code in INQ getsockopt and we now call
the function for that getsockopt.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h | 24 ++++++++++++++++++++++++
 net/ipv4/tcp.c    | 15 +--------------
 2 files changed, 25 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index e90db8546806..0302636af98c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1816,4 +1816,28 @@ static inline void skb_set_tcp_pure_ack(struct sk_buff *skb)
 	skb->truesize = 2;
 }
 
+static inline int tcp_inq(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int answ;
+
+	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+		answ = 0;
+	} else if (sock_flag(sk, SOCK_URGINLINE) ||
+		   !tp->urg_data ||
+		   before(tp->urg_seq, tp->copied_seq) ||
+		   !before(tp->urg_seq, tp->rcv_nxt)) {
+
+		answ = tp->rcv_nxt - tp->copied_seq;
+
+		/* Subtract 1, if FIN was received */
+		if (answ && sock_flag(sk, SOCK_DONE))
+			answ--;
+	} else {
+		answ = tp->urg_seq - tp->copied_seq;
+	}
+
+	return answ;
+}
+
 #endif	/* _TCP_H */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f9faadb42485..a265f00b9df9 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -556,20 +556,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 			return -EINVAL;
 
 		slow = lock_sock_fast(sk);
-		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
-			answ = 0;
-		else if (sock_flag(sk, SOCK_URGINLINE) ||
-			 !tp->urg_data ||
-			 before(tp->urg_seq, tp->copied_seq) ||
-			 !before(tp->urg_seq, tp->rcv_nxt)) {
-
-			answ = tp->rcv_nxt - tp->copied_seq;
-
-			/* Subtract 1, if FIN was received */
-			if (answ && sock_flag(sk, SOCK_DONE))
-				answ--;
-		} else
-			answ = tp->urg_seq - tp->copied_seq;
+		answ = tcp_inq(sk);
 		unlock_sock_fast(sk, slow);
 		break;
 	case SIOCATMARK:
-- 
cgit v1.2.3-59-g8ed1b


From ab7ac4eb9832e32a09f4e8042705484d2fb0aad3 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Mon, 7 Mar 2016 14:11:06 -0800
Subject: kcm: Kernel Connection Multiplexor module

This module implements the Kernel Connection Multiplexor.

Kernel Connection Multiplexor (KCM) is a facility that provides a
message based interface over TCP for generic application protocols.
With KCM an application can efficiently send and receive application
protocol messages over TCP using datagram sockets.

For more information see the included Documentation/networking/kcm.txt

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/socket.h   |    6 +-
 include/net/kcm.h        |  125 +++
 include/uapi/linux/kcm.h |   40 +
 net/Kconfig              |    1 +
 net/Makefile             |    1 +
 net/kcm/Kconfig          |   10 +
 net/kcm/Makefile         |    3 +
 net/kcm/kcmsock.c        | 2016 ++++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 2201 insertions(+), 1 deletion(-)
 create mode 100644 include/net/kcm.h
 create mode 100644 include/uapi/linux/kcm.h
 create mode 100644 net/kcm/Kconfig
 create mode 100644 net/kcm/Makefile
 create mode 100644 net/kcm/kcmsock.c

(limited to 'net')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index d834af22a460..73bf6c6a833b 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -200,7 +200,9 @@ struct ucred {
 #define AF_ALG		38	/* Algorithm sockets		*/
 #define AF_NFC		39	/* NFC sockets			*/
 #define AF_VSOCK	40	/* vSockets			*/
-#define AF_MAX		41	/* For now.. */
+#define AF_KCM		41	/* Kernel Connection Multiplexor*/
+
+#define AF_MAX		42	/* For now.. */
 
 /* Protocol families, same as address families. */
 #define PF_UNSPEC	AF_UNSPEC
@@ -246,6 +248,7 @@ struct ucred {
 #define PF_ALG		AF_ALG
 #define PF_NFC		AF_NFC
 #define PF_VSOCK	AF_VSOCK
+#define PF_KCM		AF_KCM
 #define PF_MAX		AF_MAX
 
 /* Maximum queue length specifiable by listen.  */
@@ -323,6 +326,7 @@ struct ucred {
 #define SOL_CAIF	278
 #define SOL_ALG		279
 #define SOL_NFC		280
+#define SOL_KCM		281
 
 /* IPX options */
 #define IPX_TYPE	1
diff --git a/include/net/kcm.h b/include/net/kcm.h
new file mode 100644
index 000000000000..1bcae39070ec
--- /dev/null
+++ b/include/net/kcm.h
@@ -0,0 +1,125 @@
+/*
+ * Kernel Connection Multiplexor
+ *
+ * Copyright (c) 2016 Tom Herbert <tom@herbertland.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ */
+
+#ifndef __NET_KCM_H_
+#define __NET_KCM_H_
+
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <uapi/linux/kcm.h>
+
+extern unsigned int kcm_net_id;
+
+struct kcm_tx_msg {
+	unsigned int sent;
+	unsigned int fragidx;
+	unsigned int frag_offset;
+	unsigned int msg_flags;
+	struct sk_buff *frag_skb;
+	struct sk_buff *last_skb;
+};
+
+struct kcm_rx_msg {
+	int full_len;
+	int accum_len;
+	int offset;
+};
+
+/* Socket structure for KCM client sockets */
+struct kcm_sock {
+	struct sock sk;
+	struct kcm_mux *mux;
+	struct list_head kcm_sock_list;
+	int index;
+	u32 done : 1;
+	struct work_struct done_work;
+
+	/* Transmit */
+	struct kcm_psock *tx_psock;
+	struct work_struct tx_work;
+	struct list_head wait_psock_list;
+	struct sk_buff *seq_skb;
+
+	/* Don't use bit fields here, these are set under different locks */
+	bool tx_wait;
+	bool tx_wait_more;
+
+	/* Receive */
+	struct kcm_psock *rx_psock;
+	struct list_head wait_rx_list; /* KCMs waiting for receiving */
+	bool rx_wait;
+	u32 rx_disabled : 1;
+};
+
+struct bpf_prog;
+
+/* Structure for an attached lower socket */
+struct kcm_psock {
+	struct sock *sk;
+	struct kcm_mux *mux;
+	int index;
+
+	u32 tx_stopped : 1;
+	u32 rx_stopped : 1;
+	u32 done : 1;
+	u32 unattaching : 1;
+
+	void (*save_state_change)(struct sock *sk);
+	void (*save_data_ready)(struct sock *sk);
+	void (*save_write_space)(struct sock *sk);
+
+	struct list_head psock_list;
+
+	/* Receive */
+	struct sk_buff *rx_skb_head;
+	struct sk_buff **rx_skb_nextp;
+	struct sk_buff *ready_rx_msg;
+	struct list_head psock_ready_list;
+	struct work_struct rx_work;
+	struct delayed_work rx_delayed_work;
+	struct bpf_prog *bpf_prog;
+	struct kcm_sock *rx_kcm;
+
+	/* Transmit */
+	struct kcm_sock *tx_kcm;
+	struct list_head psock_avail_list;
+};
+
+/* Per net MUX list */
+struct kcm_net {
+	struct mutex mutex;
+	struct list_head mux_list;
+	int count;
+};
+
+/* Structure for a MUX */
+struct kcm_mux {
+	struct list_head kcm_mux_list;
+	struct rcu_head rcu;
+	struct kcm_net *knet;
+
+	struct list_head kcm_socks;	/* All KCM sockets on MUX */
+	int kcm_socks_cnt;		/* Total KCM socket count for MUX */
+	struct list_head psocks;	/* List of all psocks on MUX */
+	int psocks_cnt;		/* Total attached sockets */
+
+	/* Receive */
+	spinlock_t rx_lock ____cacheline_aligned_in_smp;
+	struct list_head kcm_rx_waiters; /* KCMs waiting for receiving */
+	struct list_head psocks_ready;	/* List of psocks with a msg ready */
+	struct sk_buff_head rx_hold_queue;
+
+	/* Transmit */
+	spinlock_t  lock ____cacheline_aligned_in_smp;	/* TX and mux locking */
+	struct list_head psocks_avail;	/* List of available psocks */
+	struct list_head kcm_tx_waiters; /* KCMs waiting for a TX psock */
+};
+
+#endif /* __NET_KCM_H_ */
diff --git a/include/uapi/linux/kcm.h b/include/uapi/linux/kcm.h
new file mode 100644
index 000000000000..a5a530940b99
--- /dev/null
+++ b/include/uapi/linux/kcm.h
@@ -0,0 +1,40 @@
+/*
+ * Kernel Connection Multiplexor
+ *
+ * Copyright (c) 2016 Tom Herbert <tom@herbertland.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * User API to clone KCM sockets and attach transport socket to a KCM
+ * multiplexor.
+ */
+
+#ifndef KCM_KERNEL_H
+#define KCM_KERNEL_H
+
+struct kcm_attach {
+	int fd;
+	int bpf_fd;
+};
+
+struct kcm_unattach {
+	int fd;
+};
+
+struct kcm_clone {
+	int fd;
+};
+
+#define SIOCKCMATTACH	(SIOCPROTOPRIVATE + 0)
+#define SIOCKCMUNATTACH	(SIOCPROTOPRIVATE + 1)
+#define SIOCKCMCLONE	(SIOCPROTOPRIVATE + 2)
+
+#define KCMPROTO_CONNECTED	0
+
+/* Socket options */
+#define KCM_RECV_DISABLE	1
+
+#endif
+
diff --git a/net/Kconfig b/net/Kconfig
index 2760825e53fa..10640d5f8bee 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -360,6 +360,7 @@ source "net/can/Kconfig"
 source "net/irda/Kconfig"
 source "net/bluetooth/Kconfig"
 source "net/rxrpc/Kconfig"
+source "net/kcm/Kconfig"
 
 config FIB_RULES
 	bool
diff --git a/net/Makefile b/net/Makefile
index a5d04098dfce..81d14119eab5 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_IRDA)		+= irda/
 obj-$(CONFIG_BT)		+= bluetooth/
 obj-$(CONFIG_SUNRPC)		+= sunrpc/
 obj-$(CONFIG_AF_RXRPC)		+= rxrpc/
+obj-$(CONFIG_AF_KCM)		+= kcm/
 obj-$(CONFIG_ATM)		+= atm/
 obj-$(CONFIG_L2TP)		+= l2tp/
 obj-$(CONFIG_DECNET)		+= decnet/
diff --git a/net/kcm/Kconfig b/net/kcm/Kconfig
new file mode 100644
index 000000000000..5db94d940ecc
--- /dev/null
+++ b/net/kcm/Kconfig
@@ -0,0 +1,10 @@
+
+config AF_KCM
+	tristate "KCM sockets"
+	depends on INET
+	select BPF_SYSCALL
+	---help---
+	  KCM (Kernel Connection Multiplexor) sockets provide a method
+	  for multiplexing messages of a message based application
+	  protocol over kernel connectons (e.g. TCP connections).
+
diff --git a/net/kcm/Makefile b/net/kcm/Makefile
new file mode 100644
index 000000000000..cb525f7c5a13
--- /dev/null
+++ b/net/kcm/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_AF_KCM) += kcm.o
+
+kcm-y := kcmsock.o
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
new file mode 100644
index 000000000000..30ef69ac6b81
--- /dev/null
+++ b/net/kcm/kcmsock.c
@@ -0,0 +1,2016 @@
+#include <linux/bpf.h>
+#include <linux/errno.h>
+#include <linux/errqueue.h>
+#include <linux/file.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/poll.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/uaccess.h>
+#include <linux/workqueue.h>
+#include <net/kcm.h>
+#include <net/netns/generic.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <uapi/linux/kcm.h>
+
+unsigned int kcm_net_id;
+
+static struct kmem_cache *kcm_psockp __read_mostly;
+static struct kmem_cache *kcm_muxp __read_mostly;
+static struct workqueue_struct *kcm_wq;
+
+static inline struct kcm_sock *kcm_sk(const struct sock *sk)
+{
+	return (struct kcm_sock *)sk;
+}
+
+static inline struct kcm_tx_msg *kcm_tx_msg(struct sk_buff *skb)
+{
+	return (struct kcm_tx_msg *)skb->cb;
+}
+
+static inline struct kcm_rx_msg *kcm_rx_msg(struct sk_buff *skb)
+{
+	return (struct kcm_rx_msg *)((void *)skb->cb +
+				     offsetof(struct qdisc_skb_cb, data));
+}
+
+static void report_csk_error(struct sock *csk, int err)
+{
+	csk->sk_err = EPIPE;
+	csk->sk_error_report(csk);
+}
+
+/* Callback lock held */
+static void kcm_abort_rx_psock(struct kcm_psock *psock, int err,
+			       struct sk_buff *skb)
+{
+	struct sock *csk = psock->sk;
+
+	/* Unrecoverable error in receive */
+
+	if (psock->rx_stopped)
+		return;
+
+	psock->rx_stopped = 1;
+
+	/* Report an error on the lower socket */
+	report_csk_error(csk, err);
+}
+
+static void kcm_abort_tx_psock(struct kcm_psock *psock, int err,
+			       bool wakeup_kcm)
+{
+	struct sock *csk = psock->sk;
+	struct kcm_mux *mux = psock->mux;
+
+	/* Unrecoverable error in transmit */
+
+	spin_lock_bh(&mux->lock);
+
+	if (psock->tx_stopped) {
+		spin_unlock_bh(&mux->lock);
+		return;
+	}
+
+	psock->tx_stopped = 1;
+
+	if (!psock->tx_kcm) {
+		/* Take off psocks_avail list */
+		list_del(&psock->psock_avail_list);
+	} else if (wakeup_kcm) {
+		/* In this case psock is being aborted while outside of
+		 * write_msgs and psock is reserved. Schedule tx_work
+		 * to handle the failure there. Need to commit tx_stopped
+		 * before queuing work.
+		 */
+		smp_mb();
+
+		queue_work(kcm_wq, &psock->tx_kcm->tx_work);
+	}
+
+	spin_unlock_bh(&mux->lock);
+
+	/* Report error on lower socket */
+	report_csk_error(csk, err);
+}
+
+static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
+
+/* KCM is ready to receive messages on its queue-- either the KCM is new or
+ * has become unblocked after being blocked on full socket buffer. Queue any
+ * pending ready messages on a psock. RX mux lock held.
+ */
+static void kcm_rcv_ready(struct kcm_sock *kcm)
+{
+	struct kcm_mux *mux = kcm->mux;
+	struct kcm_psock *psock;
+	struct sk_buff *skb;
+
+	if (unlikely(kcm->rx_wait || kcm->rx_psock || kcm->rx_disabled))
+		return;
+
+	while (unlikely((skb = __skb_dequeue(&mux->rx_hold_queue)))) {
+		if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
+			/* Assuming buffer limit has been reached */
+			skb_queue_head(&mux->rx_hold_queue, skb);
+			WARN_ON(!sk_rmem_alloc_get(&kcm->sk));
+			return;
+		}
+	}
+
+	while (!list_empty(&mux->psocks_ready)) {
+		psock = list_first_entry(&mux->psocks_ready, struct kcm_psock,
+					 psock_ready_list);
+
+		if (kcm_queue_rcv_skb(&kcm->sk, psock->ready_rx_msg)) {
+			/* Assuming buffer limit has been reached */
+			WARN_ON(!sk_rmem_alloc_get(&kcm->sk));
+			return;
+		}
+
+		/* Consumed the ready message on the psock. Schedule rx_work to
+		 * get more messages.
+		 */
+		list_del(&psock->psock_ready_list);
+		psock->ready_rx_msg = NULL;
+
+		/* Commit clearing of ready_rx_msg for queuing work */
+		smp_mb();
+
+		queue_work(kcm_wq, &psock->rx_work);
+	}
+
+	/* Buffer limit is okay now, add to ready list */
+	list_add_tail(&kcm->wait_rx_list,
+		      &kcm->mux->kcm_rx_waiters);
+	kcm->rx_wait = true;
+}
+
+static void kcm_rfree(struct sk_buff *skb)
+{
+	struct sock *sk = skb->sk;
+	struct kcm_sock *kcm = kcm_sk(sk);
+	struct kcm_mux *mux = kcm->mux;
+	unsigned int len = skb->truesize;
+
+	sk_mem_uncharge(sk, len);
+	atomic_sub(len, &sk->sk_rmem_alloc);
+
+	/* For reading rx_wait and rx_psock without holding lock */
+	smp_mb__after_atomic();
+
+	if (!kcm->rx_wait && !kcm->rx_psock &&
+	    sk_rmem_alloc_get(sk) < sk->sk_rcvlowat) {
+		spin_lock_bh(&mux->rx_lock);
+		kcm_rcv_ready(kcm);
+		spin_unlock_bh(&mux->rx_lock);
+	}
+}
+
+static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	struct sk_buff_head *list = &sk->sk_receive_queue;
+
+	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+		return -ENOMEM;
+
+	if (!sk_rmem_schedule(sk, skb, skb->truesize))
+		return -ENOBUFS;
+
+	skb->dev = NULL;
+
+	skb_orphan(skb);
+	skb->sk = sk;
+	skb->destructor = kcm_rfree;
+	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+	sk_mem_charge(sk, skb->truesize);
+
+	skb_queue_tail(list, skb);
+
+	if (!sock_flag(sk, SOCK_DEAD))
+		sk->sk_data_ready(sk);
+
+	return 0;
+}
+
+/* Requeue received messages for a kcm socket to other kcm sockets. This is
+ * called with a kcm socket is receive disabled.
+ * RX mux lock held.
+ */
+static void requeue_rx_msgs(struct kcm_mux *mux, struct sk_buff_head *head)
+{
+	struct sk_buff *skb;
+	struct kcm_sock *kcm;
+
+	while ((skb = __skb_dequeue(head))) {
+		/* Reset destructor to avoid calling kcm_rcv_ready */
+		skb->destructor = sock_rfree;
+		skb_orphan(skb);
+try_again:
+		if (list_empty(&mux->kcm_rx_waiters)) {
+			skb_queue_tail(&mux->rx_hold_queue, skb);
+			continue;
+		}
+
+		kcm = list_first_entry(&mux->kcm_rx_waiters,
+				       struct kcm_sock, wait_rx_list);
+
+		if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
+			/* Should mean socket buffer full */
+			list_del(&kcm->wait_rx_list);
+			kcm->rx_wait = false;
+
+			/* Commit rx_wait to read in kcm_free */
+			smp_wmb();
+
+			goto try_again;
+		}
+	}
+}
+
+/* Lower sock lock held */
+static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock,
+				       struct sk_buff *head)
+{
+	struct kcm_mux *mux = psock->mux;
+	struct kcm_sock *kcm;
+
+	WARN_ON(psock->ready_rx_msg);
+
+	if (psock->rx_kcm)
+		return psock->rx_kcm;
+
+	spin_lock_bh(&mux->rx_lock);
+
+	if (psock->rx_kcm) {
+		spin_unlock_bh(&mux->rx_lock);
+		return psock->rx_kcm;
+	}
+
+	if (list_empty(&mux->kcm_rx_waiters)) {
+		psock->ready_rx_msg = head;
+		list_add_tail(&psock->psock_ready_list,
+			      &mux->psocks_ready);
+		spin_unlock_bh(&mux->rx_lock);
+		return NULL;
+	}
+
+	kcm = list_first_entry(&mux->kcm_rx_waiters,
+			       struct kcm_sock, wait_rx_list);
+	list_del(&kcm->wait_rx_list);
+	kcm->rx_wait = false;
+
+	psock->rx_kcm = kcm;
+	kcm->rx_psock = psock;
+
+	spin_unlock_bh(&mux->rx_lock);
+
+	return kcm;
+}
+
+static void kcm_done(struct kcm_sock *kcm);
+
+static void kcm_done_work(struct work_struct *w)
+{
+	kcm_done(container_of(w, struct kcm_sock, done_work));
+}
+
+/* Lower sock held */
+static void unreserve_rx_kcm(struct kcm_psock *psock,
+			     bool rcv_ready)
+{
+	struct kcm_sock *kcm = psock->rx_kcm;
+	struct kcm_mux *mux = psock->mux;
+
+	if (!kcm)
+		return;
+
+	spin_lock_bh(&mux->rx_lock);
+
+	psock->rx_kcm = NULL;
+	kcm->rx_psock = NULL;
+
+	/* Commit kcm->rx_psock before sk_rmem_alloc_get to sync with
+	 * kcm_rfree
+	 */
+	smp_mb();
+
+	if (unlikely(kcm->done)) {
+		spin_unlock_bh(&mux->rx_lock);
+
+		/* Need to run kcm_done in a task since we need to qcquire
+		 * callback locks which may already be held here.
+		 */
+		INIT_WORK(&kcm->done_work, kcm_done_work);
+		schedule_work(&kcm->done_work);
+		return;
+	}
+
+	if (unlikely(kcm->rx_disabled)) {
+		requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue);
+	} else if (rcv_ready || unlikely(!sk_rmem_alloc_get(&kcm->sk))) {
+		/* Check for degenerative race with rx_wait that all
+		 * data was dequeued (accounted for in kcm_rfree).
+		 */
+		kcm_rcv_ready(kcm);
+	}
+	spin_unlock_bh(&mux->rx_lock);
+}
+
+/* Macro to invoke filter function. */
+#define KCM_RUN_FILTER(prog, ctx) \
+	(*prog->bpf_func)(ctx, prog->insnsi)
+
+/* Lower socket lock held */
+static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
+			unsigned int orig_offset, size_t orig_len)
+{
+	struct kcm_psock *psock = (struct kcm_psock *)desc->arg.data;
+	struct kcm_rx_msg *rxm;
+	struct kcm_sock *kcm;
+	struct sk_buff *head, *skb;
+	size_t eaten = 0, cand_len;
+	ssize_t extra;
+	int err;
+	bool cloned_orig = false;
+
+	if (psock->ready_rx_msg)
+		return 0;
+
+	head = psock->rx_skb_head;
+	if (head) {
+		/* Message already in progress */
+
+		if (unlikely(orig_offset)) {
+			/* Getting data with a non-zero offset when a message is
+			 * in progress is not expected. If it does happen, we
+			 * need to clone and pull since we can't deal with
+			 * offsets in the skbs for a message expect in the head.
+			 */
+			orig_skb = skb_clone(orig_skb, GFP_ATOMIC);
+			if (!orig_skb) {
+				desc->error = -ENOMEM;
+				return 0;
+			}
+			if (!pskb_pull(orig_skb, orig_offset)) {
+				kfree_skb(orig_skb);
+				desc->error = -ENOMEM;
+				return 0;
+			}
+			cloned_orig = true;
+			orig_offset = 0;
+		}
+
+		if (!psock->rx_skb_nextp) {
+			/* We are going to append to the frags_list of head.
+			 * Need to unshare the frag_list.
+			 */
+			err = skb_unclone(head, GFP_ATOMIC);
+			if (err) {
+				desc->error = err;
+				return 0;
+			}
+
+			if (unlikely(skb_shinfo(head)->frag_list)) {
+				/* We can't append to an sk_buff that already
+				 * has a frag_list. We create a new head, point
+				 * the frag_list of that to the old head, and
+				 * then are able to use the old head->next for
+				 * appending to the message.
+				 */
+				if (WARN_ON(head->next)) {
+					desc->error = -EINVAL;
+					return 0;
+				}
+
+				skb = alloc_skb(0, GFP_ATOMIC);
+				if (!skb) {
+					desc->error = -ENOMEM;
+					return 0;
+				}
+				skb->len = head->len;
+				skb->data_len = head->len;
+				skb->truesize = head->truesize;
+				*kcm_rx_msg(skb) = *kcm_rx_msg(head);
+				psock->rx_skb_nextp = &head->next;
+				skb_shinfo(skb)->frag_list = head;
+				psock->rx_skb_head = skb;
+				head = skb;
+			} else {
+				psock->rx_skb_nextp =
+				    &skb_shinfo(head)->frag_list;
+			}
+		}
+	}
+
+	while (eaten < orig_len) {
+		/* Always clone since we will consume something */
+		skb = skb_clone(orig_skb, GFP_ATOMIC);
+		if (!skb) {
+			desc->error = -ENOMEM;
+			break;
+		}
+
+		cand_len = orig_len - eaten;
+
+		head = psock->rx_skb_head;
+		if (!head) {
+			head = skb;
+			psock->rx_skb_head = head;
+			/* Will set rx_skb_nextp on next packet if needed */
+			psock->rx_skb_nextp = NULL;
+			rxm = kcm_rx_msg(head);
+			memset(rxm, 0, sizeof(*rxm));
+			rxm->offset = orig_offset + eaten;
+		} else {
+			/* Unclone since we may be appending to an skb that we
+			 * already share a frag_list with.
+			 */
+			err = skb_unclone(skb, GFP_ATOMIC);
+			if (err) {
+				desc->error = err;
+				break;
+			}
+
+			rxm = kcm_rx_msg(head);
+			*psock->rx_skb_nextp = skb;
+			psock->rx_skb_nextp = &skb->next;
+			head->data_len += skb->len;
+			head->len += skb->len;
+			head->truesize += skb->truesize;
+		}
+
+		if (!rxm->full_len) {
+			ssize_t len;
+
+			len = KCM_RUN_FILTER(psock->bpf_prog, head);
+
+			if (!len) {
+				/* Need more header to determine length */
+				rxm->accum_len += cand_len;
+				eaten += cand_len;
+				WARN_ON(eaten != orig_len);
+				break;
+			} else if (len <= (ssize_t)head->len -
+					  skb->len - rxm->offset) {
+				/* Length must be into new skb (and also
+				 * greater than zero)
+				 */
+				desc->error = -EPROTO;
+				psock->rx_skb_head = NULL;
+				kcm_abort_rx_psock(psock, EPROTO, head);
+				break;
+			}
+
+			rxm->full_len = len;
+		}
+
+		extra = (ssize_t)(rxm->accum_len + cand_len) - rxm->full_len;
+
+		if (extra < 0) {
+			/* Message not complete yet. */
+			rxm->accum_len += cand_len;
+			eaten += cand_len;
+			WARN_ON(eaten != orig_len);
+			break;
+		}
+
+		/* Positive extra indicates ore bytes than needed for the
+		 * message
+		 */
+
+		WARN_ON(extra > cand_len);
+
+		eaten += (cand_len - extra);
+
+		/* Hurray, we have a new message! */
+		psock->rx_skb_head = NULL;
+
+try_queue:
+		kcm = reserve_rx_kcm(psock, head);
+		if (!kcm) {
+			/* Unable to reserve a KCM, message is held in psock. */
+			break;
+		}
+
+		if (kcm_queue_rcv_skb(&kcm->sk, head)) {
+			/* Should mean socket buffer full */
+			unreserve_rx_kcm(psock, false);
+			goto try_queue;
+		}
+	}
+
+	if (cloned_orig)
+		kfree_skb(orig_skb);
+
+	return eaten;
+}
+
+/* Called with lock held on lower socket */
+static int psock_tcp_read_sock(struct kcm_psock *psock)
+{
+	read_descriptor_t desc;
+
+	desc.arg.data = psock;
+	desc.error = 0;
+	desc.count = 1; /* give more than one skb per call */
+
+	/* sk should be locked here, so okay to do tcp_read_sock */
+	tcp_read_sock(psock->sk, &desc, kcm_tcp_recv);
+
+	unreserve_rx_kcm(psock, true);
+
+	return desc.error;
+}
+
+/* Lower sock lock held */
+static void psock_tcp_data_ready(struct sock *sk)
+{
+	struct kcm_psock *psock;
+
+	read_lock_bh(&sk->sk_callback_lock);
+
+	psock = (struct kcm_psock *)sk->sk_user_data;
+	if (unlikely(!psock || psock->rx_stopped))
+		goto out;
+
+	if (psock->ready_rx_msg)
+		goto out;
+
+	if (psock_tcp_read_sock(psock) == -ENOMEM)
+		queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0);
+
+out:
+	read_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void do_psock_rx_work(struct kcm_psock *psock)
+{
+	read_descriptor_t rd_desc;
+	struct sock *csk = psock->sk;
+
+	/* We need the read lock to synchronize with psock_tcp_data_ready. We
+	 * need the socket lock for calling tcp_read_sock.
+	 */
+	lock_sock(csk);
+	read_lock_bh(&csk->sk_callback_lock);
+
+	if (unlikely(csk->sk_user_data != psock))
+		goto out;
+
+	if (unlikely(psock->rx_stopped))
+		goto out;
+
+	if (psock->ready_rx_msg)
+		goto out;
+
+	rd_desc.arg.data = psock;
+
+	if (psock_tcp_read_sock(psock) == -ENOMEM)
+		queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0);
+
+out:
+	read_unlock_bh(&csk->sk_callback_lock);
+	release_sock(csk);
+}
+
+static void psock_rx_work(struct work_struct *w)
+{
+	do_psock_rx_work(container_of(w, struct kcm_psock, rx_work));
+}
+
+static void psock_rx_delayed_work(struct work_struct *w)
+{
+	do_psock_rx_work(container_of(w, struct kcm_psock,
+				      rx_delayed_work.work));
+}
+
+static void psock_tcp_state_change(struct sock *sk)
+{
+	/* TCP only does a POLLIN for a half close. Do a POLLHUP here
+	 * since application will normally not poll with POLLIN
+	 * on the TCP sockets.
+	 */
+
+	report_csk_error(sk, EPIPE);
+}
+
+static void psock_tcp_write_space(struct sock *sk)
+{
+	struct kcm_psock *psock;
+	struct kcm_mux *mux;
+	struct kcm_sock *kcm;
+
+	read_lock_bh(&sk->sk_callback_lock);
+
+	psock = (struct kcm_psock *)sk->sk_user_data;
+	if (unlikely(!psock))
+		goto out;
+
+	mux = psock->mux;
+
+	spin_lock_bh(&mux->lock);
+
+	/* Check if the socket is reserved so someone is waiting for sending. */
+	kcm = psock->tx_kcm;
+	if (kcm)
+		queue_work(kcm_wq, &kcm->tx_work);
+
+	spin_unlock_bh(&mux->lock);
+out:
+	read_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void unreserve_psock(struct kcm_sock *kcm);
+
+/* kcm sock is locked. */
+static struct kcm_psock *reserve_psock(struct kcm_sock *kcm)
+{
+	struct kcm_mux *mux = kcm->mux;
+	struct kcm_psock *psock;
+
+	psock = kcm->tx_psock;
+
+	smp_rmb(); /* Must read tx_psock before tx_wait */
+
+	if (psock) {
+		WARN_ON(kcm->tx_wait);
+		if (unlikely(psock->tx_stopped))
+			unreserve_psock(kcm);
+		else
+			return kcm->tx_psock;
+	}
+
+	spin_lock_bh(&mux->lock);
+
+	/* Check again under lock to see if psock was reserved for this
+	 * psock via psock_unreserve.
+	 */
+	psock = kcm->tx_psock;
+	if (unlikely(psock)) {
+		WARN_ON(kcm->tx_wait);
+		spin_unlock_bh(&mux->lock);
+		return kcm->tx_psock;
+	}
+
+	if (!list_empty(&mux->psocks_avail)) {
+		psock = list_first_entry(&mux->psocks_avail,
+					 struct kcm_psock,
+					 psock_avail_list);
+		list_del(&psock->psock_avail_list);
+		if (kcm->tx_wait) {
+			list_del(&kcm->wait_psock_list);
+			kcm->tx_wait = false;
+		}
+		kcm->tx_psock = psock;
+		psock->tx_kcm = kcm;
+	} else if (!kcm->tx_wait) {
+		list_add_tail(&kcm->wait_psock_list,
+			      &mux->kcm_tx_waiters);
+		kcm->tx_wait = true;
+	}
+
+	spin_unlock_bh(&mux->lock);
+
+	return psock;
+}
+
+/* mux lock held */
+static void psock_now_avail(struct kcm_psock *psock)
+{
+	struct kcm_mux *mux = psock->mux;
+	struct kcm_sock *kcm;
+
+	if (list_empty(&mux->kcm_tx_waiters)) {
+		list_add_tail(&psock->psock_avail_list,
+			      &mux->psocks_avail);
+	} else {
+		kcm = list_first_entry(&mux->kcm_tx_waiters,
+				       struct kcm_sock,
+				       wait_psock_list);
+		list_del(&kcm->wait_psock_list);
+		kcm->tx_wait = false;
+		psock->tx_kcm = kcm;
+
+		/* Commit before changing tx_psock since that is read in
+		 * reserve_psock before queuing work.
+		 */
+		smp_mb();
+
+		kcm->tx_psock = psock;
+		queue_work(kcm_wq, &kcm->tx_work);
+	}
+}
+
+/* kcm sock is locked. */
+static void unreserve_psock(struct kcm_sock *kcm)
+{
+	struct kcm_psock *psock;
+	struct kcm_mux *mux = kcm->mux;
+
+	spin_lock_bh(&mux->lock);
+
+	psock = kcm->tx_psock;
+
+	if (WARN_ON(!psock)) {
+		spin_unlock_bh(&mux->lock);
+		return;
+	}
+
+	smp_rmb(); /* Read tx_psock before tx_wait */
+
+	WARN_ON(kcm->tx_wait);
+
+	kcm->tx_psock = NULL;
+	psock->tx_kcm = NULL;
+
+	if (unlikely(psock->tx_stopped)) {
+		if (psock->done) {
+			/* Deferred free */
+			list_del(&psock->psock_list);
+			mux->psocks_cnt--;
+			sock_put(psock->sk);
+			fput(psock->sk->sk_socket->file);
+			kmem_cache_free(kcm_psockp, psock);
+		}
+
+		/* Don't put back on available list */
+
+		spin_unlock_bh(&mux->lock);
+
+		return;
+	}
+
+	psock_now_avail(psock);
+
+	spin_unlock_bh(&mux->lock);
+}
+
+/* Write any messages ready on the kcm socket.  Called with kcm sock lock
+ * held.  Return bytes actually sent or error.
+ */
+static int kcm_write_msgs(struct kcm_sock *kcm)
+{
+	struct sock *sk = &kcm->sk;
+	struct kcm_psock *psock;
+	struct sk_buff *skb, *head;
+	struct kcm_tx_msg *txm;
+	unsigned short fragidx, frag_offset;
+	unsigned int sent, total_sent = 0;
+	int ret = 0;
+
+	kcm->tx_wait_more = false;
+	psock = kcm->tx_psock;
+	if (unlikely(psock && psock->tx_stopped)) {
+		/* A reserved psock was aborted asynchronously. Unreserve
+		 * it and we'll retry the message.
+		 */
+		unreserve_psock(kcm);
+		if (skb_queue_empty(&sk->sk_write_queue))
+			return 0;
+
+		kcm_tx_msg(skb_peek(&sk->sk_write_queue))->sent = 0;
+
+	} else if (skb_queue_empty(&sk->sk_write_queue)) {
+		return 0;
+	}
+
+	head = skb_peek(&sk->sk_write_queue);
+	txm = kcm_tx_msg(head);
+
+	if (txm->sent) {
+		/* Send of first skbuff in queue already in progress */
+		if (WARN_ON(!psock)) {
+			ret = -EINVAL;
+			goto out;
+		}
+		sent = txm->sent;
+		frag_offset = txm->frag_offset;
+		fragidx = txm->fragidx;
+		skb = txm->frag_skb;
+
+		goto do_frag;
+	}
+
+try_again:
+	psock = reserve_psock(kcm);
+	if (!psock)
+		goto out;
+
+	do {
+		skb = head;
+		txm = kcm_tx_msg(head);
+		sent = 0;
+
+do_frag_list:
+		if (WARN_ON(!skb_shinfo(skb)->nr_frags)) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags;
+		     fragidx++) {
+			skb_frag_t *frag;
+
+			frag_offset = 0;
+do_frag:
+			frag = &skb_shinfo(skb)->frags[fragidx];
+			if (WARN_ON(!frag->size)) {
+				ret = -EINVAL;
+				goto out;
+			}
+
+			ret = kernel_sendpage(psock->sk->sk_socket,
+					      frag->page.p,
+					      frag->page_offset + frag_offset,
+					      frag->size - frag_offset,
+					      MSG_DONTWAIT);
+			if (ret <= 0) {
+				if (ret == -EAGAIN) {
+					/* Save state to try again when there's
+					 * write space on the socket
+					 */
+					txm->sent = sent;
+					txm->frag_offset = frag_offset;
+					txm->fragidx = fragidx;
+					txm->frag_skb = skb;
+
+					ret = 0;
+					goto out;
+				}
+
+				/* Hard failure in sending message, abort this
+				 * psock since it has lost framing
+				 * synchonization and retry sending the
+				 * message from the beginning.
+				 */
+				kcm_abort_tx_psock(psock, ret ? -ret : EPIPE,
+						   true);
+				unreserve_psock(kcm);
+
+				txm->sent = 0;
+				ret = 0;
+
+				goto try_again;
+			}
+
+			sent += ret;
+			frag_offset += ret;
+			if (frag_offset < frag->size) {
+				/* Not finished with this frag */
+				goto do_frag;
+			}
+		}
+
+		if (skb == head) {
+			if (skb_has_frag_list(skb)) {
+				skb = skb_shinfo(skb)->frag_list;
+				goto do_frag_list;
+			}
+		} else if (skb->next) {
+			skb = skb->next;
+			goto do_frag_list;
+		}
+
+		/* Successfully sent the whole packet, account for it. */
+		skb_dequeue(&sk->sk_write_queue);
+		kfree_skb(head);
+		sk->sk_wmem_queued -= sent;
+		total_sent += sent;
+	} while ((head = skb_peek(&sk->sk_write_queue)));
+out:
+	if (!head) {
+		/* Done with all queued messages. */
+		WARN_ON(!skb_queue_empty(&sk->sk_write_queue));
+		unreserve_psock(kcm);
+	}
+
+	/* Check if write space is available */
+	sk->sk_write_space(sk);
+
+	return total_sent ? : ret;
+}
+
+static void kcm_tx_work(struct work_struct *w)
+{
+	struct kcm_sock *kcm = container_of(w, struct kcm_sock, tx_work);
+	struct sock *sk = &kcm->sk;
+	int err;
+
+	lock_sock(sk);
+
+	/* Primarily for SOCK_DGRAM sockets, also handle asynchronous tx
+	 * aborts
+	 */
+	err = kcm_write_msgs(kcm);
+	if (err < 0) {
+		/* Hard failure in write, report error on KCM socket */
+		pr_warn("KCM: Hard failure on kcm_write_msgs %d\n", err);
+		report_csk_error(&kcm->sk, -err);
+		goto out;
+	}
+
+	/* Primarily for SOCK_SEQPACKET sockets */
+	if (likely(sk->sk_socket) &&
+	    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
+		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+		sk->sk_write_space(sk);
+	}
+
+out:
+	release_sock(sk);
+}
+
+static void kcm_push(struct kcm_sock *kcm)
+{
+	if (kcm->tx_wait_more)
+		kcm_write_msgs(kcm);
+}
+
+static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+{
+	struct sock *sk = sock->sk;
+	struct kcm_sock *kcm = kcm_sk(sk);
+	struct sk_buff *skb = NULL, *head = NULL;
+	size_t copy, copied = 0;
+	long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+	int eor = (sock->type == SOCK_DGRAM) ?
+		  !(msg->msg_flags & MSG_MORE) : !!(msg->msg_flags & MSG_EOR);
+	int err = -EPIPE;
+
+	lock_sock(sk);
+
+	/* Per tcp_sendmsg this should be in poll */
+	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+
+	if (sk->sk_err)
+		goto out_error;
+
+	if (kcm->seq_skb) {
+		/* Previously opened message */
+		head = kcm->seq_skb;
+		skb = kcm_tx_msg(head)->last_skb;
+		goto start;
+	}
+
+	/* Call the sk_stream functions to manage the sndbuf mem. */
+	if (!sk_stream_memory_free(sk)) {
+		kcm_push(kcm);
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+		err = sk_stream_wait_memory(sk, &timeo);
+		if (err)
+			goto out_error;
+	}
+
+	/* New message, alloc head skb */
+	head = alloc_skb(0, sk->sk_allocation);
+	while (!head) {
+		kcm_push(kcm);
+		err = sk_stream_wait_memory(sk, &timeo);
+		if (err)
+			goto out_error;
+
+		head = alloc_skb(0, sk->sk_allocation);
+	}
+
+	skb = head;
+
+	/* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling
+	 * csum_and_copy_from_iter from skb_do_copy_data_nocache.
+	 */
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+start:
+	while (msg_data_left(msg)) {
+		bool merge = true;
+		int i = skb_shinfo(skb)->nr_frags;
+		struct page_frag *pfrag = sk_page_frag(sk);
+
+		if (!sk_page_frag_refill(sk, pfrag))
+			goto wait_for_memory;
+
+		if (!skb_can_coalesce(skb, i, pfrag->page,
+				      pfrag->offset)) {
+			if (i == MAX_SKB_FRAGS) {
+				struct sk_buff *tskb;
+
+				tskb = alloc_skb(0, sk->sk_allocation);
+				if (!tskb)
+					goto wait_for_memory;
+
+				if (head == skb)
+					skb_shinfo(head)->frag_list = tskb;
+				else
+					skb->next = tskb;
+
+				skb = tskb;
+				skb->ip_summed = CHECKSUM_UNNECESSARY;
+				continue;
+			}
+			merge = false;
+		}
+
+		copy = min_t(int, msg_data_left(msg),
+			     pfrag->size - pfrag->offset);
+
+		if (!sk_wmem_schedule(sk, copy))
+			goto wait_for_memory;
+
+		err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
+					       pfrag->page,
+					       pfrag->offset,
+					       copy);
+		if (err)
+			goto out_error;
+
+		/* Update the skb. */
+		if (merge) {
+			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+		} else {
+			skb_fill_page_desc(skb, i, pfrag->page,
+					   pfrag->offset, copy);
+			get_page(pfrag->page);
+		}
+
+		pfrag->offset += copy;
+		copied += copy;
+		if (head != skb) {
+			head->len += copy;
+			head->data_len += copy;
+		}
+
+		continue;
+
+wait_for_memory:
+		kcm_push(kcm);
+		err = sk_stream_wait_memory(sk, &timeo);
+		if (err)
+			goto out_error;
+	}
+
+	if (eor) {
+		bool not_busy = skb_queue_empty(&sk->sk_write_queue);
+
+		/* Message complete, queue it on send buffer */
+		__skb_queue_tail(&sk->sk_write_queue, head);
+		kcm->seq_skb = NULL;
+
+		if (msg->msg_flags & MSG_BATCH) {
+			kcm->tx_wait_more = true;
+		} else if (kcm->tx_wait_more || not_busy) {
+			err = kcm_write_msgs(kcm);
+			if (err < 0) {
+				/* We got a hard error in write_msgs but have
+				 * already queued this message. Report an error
+				 * in the socket, but don't affect return value
+				 * from sendmsg
+				 */
+				pr_warn("KCM: Hard failure on kcm_write_msgs\n");
+				report_csk_error(&kcm->sk, -err);
+			}
+		}
+	} else {
+		/* Message not complete, save state */
+partial_message:
+		kcm->seq_skb = head;
+		kcm_tx_msg(head)->last_skb = skb;
+	}
+
+	release_sock(sk);
+	return copied;
+
+out_error:
+	kcm_push(kcm);
+
+	if (copied && sock->type == SOCK_SEQPACKET) {
+		/* Wrote some bytes before encountering an
+		 * error, return partial success.
+		 */
+		goto partial_message;
+	}
+
+	if (head != kcm->seq_skb)
+		kfree_skb(head);
+
+	err = sk_stream_error(sk, msg->msg_flags, err);
+
+	/* make sure we wake any epoll edge trigger waiter */
+	if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
+		sk->sk_write_space(sk);
+
+	release_sock(sk);
+	return err;
+}
+
+static struct sk_buff *kcm_wait_data(struct sock *sk, int flags,
+				     long timeo, int *err)
+{
+	struct sk_buff *skb;
+
+	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
+		if (sk->sk_err) {
+			*err = sock_error(sk);
+			return NULL;
+		}
+
+		if (sock_flag(sk, SOCK_DONE))
+			return NULL;
+
+		if ((flags & MSG_DONTWAIT) || !timeo) {
+			*err = -EAGAIN;
+			return NULL;
+		}
+
+		sk_wait_data(sk, &timeo, NULL);
+
+		/* Handle signals */
+		if (signal_pending(current)) {
+			*err = sock_intr_errno(timeo);
+			return NULL;
+		}
+	}
+
+	return skb;
+}
+
+static int kcm_recvmsg(struct socket *sock, struct msghdr *msg,
+		       size_t len, int flags)
+{
+	struct sock *sk = sock->sk;
+	int err = 0;
+	long timeo;
+	struct kcm_rx_msg *rxm;
+	int copied = 0;
+	struct sk_buff *skb;
+
+	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+
+	lock_sock(sk);
+
+	skb = kcm_wait_data(sk, flags, timeo, &err);
+	if (!skb)
+		goto out;
+
+	/* Okay, have a message on the receive queue */
+
+	rxm = kcm_rx_msg(skb);
+
+	if (len > rxm->full_len)
+		len = rxm->full_len;
+
+	err = skb_copy_datagram_msg(skb, rxm->offset, msg, len);
+	if (err < 0)
+		goto out;
+
+	copied = len;
+	if (likely(!(flags & MSG_PEEK))) {
+		if (copied < rxm->full_len) {
+			if (sock->type == SOCK_DGRAM) {
+				/* Truncated message */
+				msg->msg_flags |= MSG_TRUNC;
+				goto msg_finished;
+			}
+			rxm->offset += copied;
+			rxm->full_len -= copied;
+		} else {
+msg_finished:
+			/* Finished with message */
+			msg->msg_flags |= MSG_EOR;
+			skb_unlink(skb, &sk->sk_receive_queue);
+			kfree_skb(skb);
+		}
+	}
+
+out:
+	release_sock(sk);
+
+	return copied ? : err;
+}
+
+/* kcm sock lock held */
+static void kcm_recv_disable(struct kcm_sock *kcm)
+{
+	struct kcm_mux *mux = kcm->mux;
+
+	if (kcm->rx_disabled)
+		return;
+
+	spin_lock_bh(&mux->rx_lock);
+
+	kcm->rx_disabled = 1;
+
+	/* If a psock is reserved we'll do cleanup in unreserve */
+	if (!kcm->rx_psock) {
+		if (kcm->rx_wait) {
+			list_del(&kcm->wait_rx_list);
+			kcm->rx_wait = false;
+		}
+
+		requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue);
+	}
+
+	spin_unlock_bh(&mux->rx_lock);
+}
+
+/* kcm sock lock held */
+static void kcm_recv_enable(struct kcm_sock *kcm)
+{
+	struct kcm_mux *mux = kcm->mux;
+
+	if (!kcm->rx_disabled)
+		return;
+
+	spin_lock_bh(&mux->rx_lock);
+
+	kcm->rx_disabled = 0;
+	kcm_rcv_ready(kcm);
+
+	spin_unlock_bh(&mux->rx_lock);
+}
+
+static int kcm_setsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, unsigned int optlen)
+{
+	struct kcm_sock *kcm = kcm_sk(sock->sk);
+	int val, valbool;
+	int err = 0;
+
+	if (level != SOL_KCM)
+		return -ENOPROTOOPT;
+
+	if (optlen < sizeof(int))
+		return -EINVAL;
+
+	if (get_user(val, (int __user *)optval))
+		return -EINVAL;
+
+	valbool = val ? 1 : 0;
+
+	switch (optname) {
+	case KCM_RECV_DISABLE:
+		lock_sock(&kcm->sk);
+		if (valbool)
+			kcm_recv_disable(kcm);
+		else
+			kcm_recv_enable(kcm);
+		release_sock(&kcm->sk);
+		break;
+	default:
+		err = -ENOPROTOOPT;
+	}
+
+	return err;
+}
+
+static int kcm_getsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{
+	struct kcm_sock *kcm = kcm_sk(sock->sk);
+	int val, len;
+
+	if (level != SOL_KCM)
+		return -ENOPROTOOPT;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	len = min_t(unsigned int, len, sizeof(int));
+	if (len < 0)
+		return -EINVAL;
+
+	switch (optname) {
+	case KCM_RECV_DISABLE:
+		val = kcm->rx_disabled;
+		break;
+	default:
+		return -ENOPROTOOPT;
+	}
+
+	if (put_user(len, optlen))
+		return -EFAULT;
+	if (copy_to_user(optval, &val, len))
+		return -EFAULT;
+	return 0;
+}
+
+static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux)
+{
+	struct kcm_sock *tkcm;
+	struct list_head *head;
+	int index = 0;
+
+	/* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so
+	 * we set sk_state, otherwise epoll_wait always returns right away with
+	 * POLLHUP
+	 */
+	kcm->sk.sk_state = TCP_ESTABLISHED;
+
+	/* Add to mux's kcm sockets list */
+	kcm->mux = mux;
+	spin_lock_bh(&mux->lock);
+
+	head = &mux->kcm_socks;
+	list_for_each_entry(tkcm, &mux->kcm_socks, kcm_sock_list) {
+		if (tkcm->index != index)
+			break;
+		head = &tkcm->kcm_sock_list;
+		index++;
+	}
+
+	list_add(&kcm->kcm_sock_list, head);
+	kcm->index = index;
+
+	mux->kcm_socks_cnt++;
+	spin_unlock_bh(&mux->lock);
+
+	INIT_WORK(&kcm->tx_work, kcm_tx_work);
+
+	spin_lock_bh(&mux->rx_lock);
+	kcm_rcv_ready(kcm);
+	spin_unlock_bh(&mux->rx_lock);
+}
+
+static int kcm_attach(struct socket *sock, struct socket *csock,
+		      struct bpf_prog *prog)
+{
+	struct kcm_sock *kcm = kcm_sk(sock->sk);
+	struct kcm_mux *mux = kcm->mux;
+	struct sock *csk;
+	struct kcm_psock *psock = NULL, *tpsock;
+	struct list_head *head;
+	int index = 0;
+
+	if (csock->ops->family != PF_INET &&
+	    csock->ops->family != PF_INET6)
+		return -EINVAL;
+
+	csk = csock->sk;
+	if (!csk)
+		return -EINVAL;
+
+	/* Only support TCP for now */
+	if (csk->sk_protocol != IPPROTO_TCP)
+		return -EINVAL;
+
+	psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL);
+	if (!psock)
+		return -ENOMEM;
+
+	psock->mux = mux;
+	psock->sk = csk;
+	psock->bpf_prog = prog;
+	INIT_WORK(&psock->rx_work, psock_rx_work);
+	INIT_DELAYED_WORK(&psock->rx_delayed_work, psock_rx_delayed_work);
+
+	sock_hold(csk);
+
+	write_lock_bh(&csk->sk_callback_lock);
+	psock->save_data_ready = csk->sk_data_ready;
+	psock->save_write_space = csk->sk_write_space;
+	psock->save_state_change = csk->sk_state_change;
+	csk->sk_user_data = psock;
+	csk->sk_data_ready = psock_tcp_data_ready;
+	csk->sk_write_space = psock_tcp_write_space;
+	csk->sk_state_change = psock_tcp_state_change;
+	write_unlock_bh(&csk->sk_callback_lock);
+
+	/* Finished initialization, now add the psock to the MUX. */
+	spin_lock_bh(&mux->lock);
+	head = &mux->psocks;
+	list_for_each_entry(tpsock, &mux->psocks, psock_list) {
+		if (tpsock->index != index)
+			break;
+		head = &tpsock->psock_list;
+		index++;
+	}
+
+	list_add(&psock->psock_list, head);
+	psock->index = index;
+
+	mux->psocks_cnt++;
+	psock_now_avail(psock);
+	spin_unlock_bh(&mux->lock);
+
+	/* Schedule RX work in case there are already bytes queued */
+	queue_work(kcm_wq, &psock->rx_work);
+
+	return 0;
+}
+
+static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info)
+{
+	struct socket *csock;
+	struct bpf_prog *prog;
+	int err;
+
+	csock = sockfd_lookup(info->fd, &err);
+	if (!csock)
+		return -ENOENT;
+
+	prog = bpf_prog_get(info->bpf_fd);
+	if (IS_ERR(prog)) {
+		err = PTR_ERR(prog);
+		goto out;
+	}
+
+	if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) {
+		bpf_prog_put(prog);
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = kcm_attach(sock, csock, prog);
+	if (err) {
+		bpf_prog_put(prog);
+		goto out;
+	}
+
+	/* Keep reference on file also */
+
+	return 0;
+out:
+	fput(csock->file);
+	return err;
+}
+
+static void kcm_unattach(struct kcm_psock *psock)
+{
+	struct sock *csk = psock->sk;
+	struct kcm_mux *mux = psock->mux;
+
+	/* Stop getting callbacks from TCP socket. After this there should
+	 * be no way to reserve a kcm for this psock.
+	 */
+	write_lock_bh(&csk->sk_callback_lock);
+	csk->sk_user_data = NULL;
+	csk->sk_data_ready = psock->save_data_ready;
+	csk->sk_write_space = psock->save_write_space;
+	csk->sk_state_change = psock->save_state_change;
+	psock->rx_stopped = 1;
+
+	if (WARN_ON(psock->rx_kcm)) {
+		write_unlock_bh(&csk->sk_callback_lock);
+		return;
+	}
+
+	spin_lock_bh(&mux->rx_lock);
+
+	/* Stop receiver activities. After this point psock should not be
+	 * able to get onto ready list either through callbacks or work.
+	 */
+	if (psock->ready_rx_msg) {
+		list_del(&psock->psock_ready_list);
+		kfree_skb(psock->ready_rx_msg);
+		psock->ready_rx_msg = NULL;
+	}
+
+	spin_unlock_bh(&mux->rx_lock);
+
+	write_unlock_bh(&csk->sk_callback_lock);
+
+	cancel_work_sync(&psock->rx_work);
+	cancel_delayed_work_sync(&psock->rx_delayed_work);
+
+	bpf_prog_put(psock->bpf_prog);
+
+	kfree_skb(psock->rx_skb_head);
+	psock->rx_skb_head = NULL;
+
+	spin_lock_bh(&mux->lock);
+
+	if (psock->tx_kcm) {
+		/* psock was reserved.  Just mark it finished and we will clean
+		 * up in the kcm paths, we need kcm lock which can not be
+		 * acquired here.
+		 */
+		spin_unlock_bh(&mux->lock);
+
+		/* We are unattaching a socket that is reserved. Abort the
+		 * socket since we may be out of sync in sending on it. We need
+		 * to do this without the mux lock.
+		 */
+		kcm_abort_tx_psock(psock, EPIPE, false);
+
+		spin_lock_bh(&mux->lock);
+		if (!psock->tx_kcm) {
+			/* psock now unreserved in window mux was unlocked */
+			goto no_reserved;
+		}
+		psock->done = 1;
+
+		/* Commit done before queuing work to process it */
+		smp_mb();
+
+		/* Queue tx work to make sure psock->done is handled */
+		queue_work(kcm_wq, &psock->tx_kcm->tx_work);
+		spin_unlock_bh(&mux->lock);
+	} else {
+no_reserved:
+		if (!psock->tx_stopped)
+			list_del(&psock->psock_avail_list);
+		list_del(&psock->psock_list);
+		mux->psocks_cnt--;
+		spin_unlock_bh(&mux->lock);
+
+		sock_put(csk);
+		fput(csk->sk_socket->file);
+		kmem_cache_free(kcm_psockp, psock);
+	}
+}
+
+static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info)
+{
+	struct kcm_sock *kcm = kcm_sk(sock->sk);
+	struct kcm_mux *mux = kcm->mux;
+	struct kcm_psock *psock;
+	struct socket *csock;
+	struct sock *csk;
+	int err;
+
+	csock = sockfd_lookup(info->fd, &err);
+	if (!csock)
+		return -ENOENT;
+
+	csk = csock->sk;
+	if (!csk) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = -ENOENT;
+
+	spin_lock_bh(&mux->lock);
+
+	list_for_each_entry(psock, &mux->psocks, psock_list) {
+		if (psock->sk != csk)
+			continue;
+
+		/* Found the matching psock */
+
+		if (psock->unattaching || WARN_ON(psock->done)) {
+			err = -EALREADY;
+			break;
+		}
+
+		psock->unattaching = 1;
+
+		spin_unlock_bh(&mux->lock);
+
+		kcm_unattach(psock);
+
+		err = 0;
+		goto out;
+	}
+
+	spin_unlock_bh(&mux->lock);
+
+out:
+	fput(csock->file);
+	return err;
+}
+
+static struct proto kcm_proto = {
+	.name	= "KCM",
+	.owner	= THIS_MODULE,
+	.obj_size = sizeof(struct kcm_sock),
+};
+
+/* Clone a kcm socket. */
+static int kcm_clone(struct socket *osock, struct kcm_clone *info,
+		     struct socket **newsockp)
+{
+	struct socket *newsock;
+	struct sock *newsk;
+	struct file *newfile;
+	int err, newfd;
+
+	err = -ENFILE;
+	newsock = sock_alloc();
+	if (!newsock)
+		goto out;
+
+	newsock->type = osock->type;
+	newsock->ops = osock->ops;
+
+	__module_get(newsock->ops->owner);
+
+	newfd = get_unused_fd_flags(0);
+	if (unlikely(newfd < 0)) {
+		err = newfd;
+		goto out_fd_fail;
+	}
+
+	newfile = sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name);
+	if (unlikely(IS_ERR(newfile))) {
+		err = PTR_ERR(newfile);
+		goto out_sock_alloc_fail;
+	}
+
+	newsk = sk_alloc(sock_net(osock->sk), PF_KCM, GFP_KERNEL,
+			 &kcm_proto, true);
+	if (!newsk) {
+		err = -ENOMEM;
+		goto out_sk_alloc_fail;
+	}
+
+	sock_init_data(newsock, newsk);
+	init_kcm_sock(kcm_sk(newsk), kcm_sk(osock->sk)->mux);
+
+	fd_install(newfd, newfile);
+	*newsockp = newsock;
+	info->fd = newfd;
+
+	return 0;
+
+out_sk_alloc_fail:
+	fput(newfile);
+out_sock_alloc_fail:
+	put_unused_fd(newfd);
+out_fd_fail:
+	sock_release(newsock);
+out:
+	return err;
+}
+
+static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	int err;
+
+	switch (cmd) {
+	case SIOCKCMATTACH: {
+		struct kcm_attach info;
+
+		if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
+			err = -EFAULT;
+
+		err = kcm_attach_ioctl(sock, &info);
+
+		break;
+	}
+	case SIOCKCMUNATTACH: {
+		struct kcm_unattach info;
+
+		if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
+			err = -EFAULT;
+
+		err = kcm_unattach_ioctl(sock, &info);
+
+		break;
+	}
+	case SIOCKCMCLONE: {
+		struct kcm_clone info;
+		struct socket *newsock = NULL;
+
+		if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
+			err = -EFAULT;
+
+		err = kcm_clone(sock, &info, &newsock);
+
+		if (!err) {
+			if (copy_to_user((void __user *)arg, &info,
+					 sizeof(info))) {
+				err = -EFAULT;
+				sock_release(newsock);
+			}
+		}
+
+		break;
+	}
+	default:
+		err = -ENOIOCTLCMD;
+		break;
+	}
+
+	return err;
+}
+
+static void free_mux(struct rcu_head *rcu)
+{
+	struct kcm_mux *mux = container_of(rcu,
+	    struct kcm_mux, rcu);
+
+	kmem_cache_free(kcm_muxp, mux);
+}
+
+static void release_mux(struct kcm_mux *mux)
+{
+	struct kcm_net *knet = mux->knet;
+	struct kcm_psock *psock, *tmp_psock;
+
+	/* Release psocks */
+	list_for_each_entry_safe(psock, tmp_psock,
+				 &mux->psocks, psock_list) {
+		if (!WARN_ON(psock->unattaching))
+			kcm_unattach(psock);
+	}
+
+	if (WARN_ON(mux->psocks_cnt))
+		return;
+
+	__skb_queue_purge(&mux->rx_hold_queue);
+
+	mutex_lock(&knet->mutex);
+	list_del_rcu(&mux->kcm_mux_list);
+	knet->count--;
+	mutex_unlock(&knet->mutex);
+
+	call_rcu(&mux->rcu, free_mux);
+}
+
+static void kcm_done(struct kcm_sock *kcm)
+{
+	struct kcm_mux *mux = kcm->mux;
+	struct sock *sk = &kcm->sk;
+	int socks_cnt;
+
+	spin_lock_bh(&mux->rx_lock);
+	if (kcm->rx_psock) {
+		/* Cleanup in unreserve_rx_kcm */
+		WARN_ON(kcm->done);
+		kcm->rx_disabled = 1;
+		kcm->done = 1;
+		spin_unlock_bh(&mux->rx_lock);
+		return;
+	}
+
+	if (kcm->rx_wait) {
+		list_del(&kcm->wait_rx_list);
+		kcm->rx_wait = false;
+	}
+	/* Move any pending receive messages to other kcm sockets */
+	requeue_rx_msgs(mux, &sk->sk_receive_queue);
+
+	spin_unlock_bh(&mux->rx_lock);
+
+	if (WARN_ON(sk_rmem_alloc_get(sk)))
+		return;
+
+	/* Detach from MUX */
+	spin_lock_bh(&mux->lock);
+
+	list_del(&kcm->kcm_sock_list);
+	mux->kcm_socks_cnt--;
+	socks_cnt = mux->kcm_socks_cnt;
+
+	spin_unlock_bh(&mux->lock);
+
+	if (!socks_cnt) {
+		/* We are done with the mux now. */
+		release_mux(mux);
+	}
+
+	WARN_ON(kcm->rx_wait);
+
+	sock_put(&kcm->sk);
+}
+
+/* Called by kcm_release to close a KCM socket.
+ * If this is the last KCM socket on the MUX, destroy the MUX.
+ */
+static int kcm_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct kcm_sock *kcm;
+	struct kcm_mux *mux;
+	struct kcm_psock *psock;
+
+	if (!sk)
+		return 0;
+
+	kcm = kcm_sk(sk);
+	mux = kcm->mux;
+
+	sock_orphan(sk);
+	kfree_skb(kcm->seq_skb);
+
+	lock_sock(sk);
+	/* Purge queue under lock to avoid race condition with tx_work trying
+	 * to act when queue is nonempty. If tx_work runs after this point
+	 * it will just return.
+	 */
+	__skb_queue_purge(&sk->sk_write_queue);
+	release_sock(sk);
+
+	spin_lock_bh(&mux->lock);
+	if (kcm->tx_wait) {
+		/* Take of tx_wait list, after this point there should be no way
+		 * that a psock will be assigned to this kcm.
+		 */
+		list_del(&kcm->wait_psock_list);
+		kcm->tx_wait = false;
+	}
+	spin_unlock_bh(&mux->lock);
+
+	/* Cancel work. After this point there should be no outside references
+	 * to the kcm socket.
+	 */
+	cancel_work_sync(&kcm->tx_work);
+
+	lock_sock(sk);
+	psock = kcm->tx_psock;
+	if (psock) {
+		/* A psock was reserved, so we need to kill it since it
+		 * may already have some bytes queued from a message. We
+		 * need to do this after removing kcm from tx_wait list.
+		 */
+		kcm_abort_tx_psock(psock, EPIPE, false);
+		unreserve_psock(kcm);
+	}
+	release_sock(sk);
+
+	WARN_ON(kcm->tx_wait);
+	WARN_ON(kcm->tx_psock);
+
+	sock->sk = NULL;
+
+	kcm_done(kcm);
+
+	return 0;
+}
+
+static const struct proto_ops kcm_ops = {
+	.family =	PF_KCM,
+	.owner =	THIS_MODULE,
+	.release =	kcm_release,
+	.bind =		sock_no_bind,
+	.connect =	sock_no_connect,
+	.socketpair =	sock_no_socketpair,
+	.accept =	sock_no_accept,
+	.getname =	sock_no_getname,
+	.poll =		datagram_poll,
+	.ioctl =	kcm_ioctl,
+	.listen =	sock_no_listen,
+	.shutdown =	sock_no_shutdown,
+	.setsockopt =	kcm_setsockopt,
+	.getsockopt =	kcm_getsockopt,
+	.sendmsg =	kcm_sendmsg,
+	.recvmsg =	kcm_recvmsg,
+	.mmap =		sock_no_mmap,
+	.sendpage =	sock_no_sendpage,
+};
+
+/* Create proto operation for kcm sockets */
+static int kcm_create(struct net *net, struct socket *sock,
+		      int protocol, int kern)
+{
+	struct kcm_net *knet = net_generic(net, kcm_net_id);
+	struct sock *sk;
+	struct kcm_mux *mux;
+
+	switch (sock->type) {
+	case SOCK_DGRAM:
+	case SOCK_SEQPACKET:
+		sock->ops = &kcm_ops;
+		break;
+	default:
+		return -ESOCKTNOSUPPORT;
+	}
+
+	if (protocol != KCMPROTO_CONNECTED)
+		return -EPROTONOSUPPORT;
+
+	sk = sk_alloc(net, PF_KCM, GFP_KERNEL, &kcm_proto, kern);
+	if (!sk)
+		return -ENOMEM;
+
+	/* Allocate a kcm mux, shared between KCM sockets */
+	mux = kmem_cache_zalloc(kcm_muxp, GFP_KERNEL);
+	if (!mux) {
+		sk_free(sk);
+		return -ENOMEM;
+	}
+
+	spin_lock_init(&mux->lock);
+	spin_lock_init(&mux->rx_lock);
+	INIT_LIST_HEAD(&mux->kcm_socks);
+	INIT_LIST_HEAD(&mux->kcm_rx_waiters);
+	INIT_LIST_HEAD(&mux->kcm_tx_waiters);
+
+	INIT_LIST_HEAD(&mux->psocks);
+	INIT_LIST_HEAD(&mux->psocks_ready);
+	INIT_LIST_HEAD(&mux->psocks_avail);
+
+	mux->knet = knet;
+
+	/* Add new MUX to list */
+	mutex_lock(&knet->mutex);
+	list_add_rcu(&mux->kcm_mux_list, &knet->mux_list);
+	knet->count++;
+	mutex_unlock(&knet->mutex);
+
+	skb_queue_head_init(&mux->rx_hold_queue);
+
+	/* Init KCM socket */
+	sock_init_data(sock, sk);
+	init_kcm_sock(kcm_sk(sk), mux);
+
+	return 0;
+}
+
+static struct net_proto_family kcm_family_ops = {
+	.family = PF_KCM,
+	.create = kcm_create,
+	.owner  = THIS_MODULE,
+};
+
+static __net_init int kcm_init_net(struct net *net)
+{
+	struct kcm_net *knet = net_generic(net, kcm_net_id);
+
+	INIT_LIST_HEAD_RCU(&knet->mux_list);
+	mutex_init(&knet->mutex);
+
+	return 0;
+}
+
+static __net_exit void kcm_exit_net(struct net *net)
+{
+	struct kcm_net *knet = net_generic(net, kcm_net_id);
+
+	/* All KCM sockets should be closed at this point, which should mean
+	 * that all multiplexors and psocks have been destroyed.
+	 */
+	WARN_ON(!list_empty(&knet->mux_list));
+}
+
+static struct pernet_operations kcm_net_ops = {
+	.init = kcm_init_net,
+	.exit = kcm_exit_net,
+	.id   = &kcm_net_id,
+	.size = sizeof(struct kcm_net),
+};
+
+static int __init kcm_init(void)
+{
+	int err = -ENOMEM;
+
+	kcm_muxp = kmem_cache_create("kcm_mux_cache",
+				     sizeof(struct kcm_mux), 0,
+				     SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+	if (!kcm_muxp)
+		goto fail;
+
+	kcm_psockp = kmem_cache_create("kcm_psock_cache",
+				       sizeof(struct kcm_psock), 0,
+					SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+	if (!kcm_psockp)
+		goto fail;
+
+	kcm_wq = create_singlethread_workqueue("kkcmd");
+	if (!kcm_wq)
+		goto fail;
+
+	err = proto_register(&kcm_proto, 1);
+	if (err)
+		goto fail;
+
+	err = sock_register(&kcm_family_ops);
+	if (err)
+		goto sock_register_fail;
+
+	err = register_pernet_device(&kcm_net_ops);
+	if (err)
+		goto net_ops_fail;
+
+	return 0;
+
+net_ops_fail:
+	sock_unregister(PF_KCM);
+
+sock_register_fail:
+	proto_unregister(&kcm_proto);
+
+fail:
+	kmem_cache_destroy(kcm_muxp);
+	kmem_cache_destroy(kcm_psockp);
+
+	if (kcm_wq)
+		destroy_workqueue(kcm_wq);
+
+	return err;
+}
+
+static void __exit kcm_exit(void)
+{
+	unregister_pernet_device(&kcm_net_ops);
+	sock_unregister(PF_KCM);
+	proto_unregister(&kcm_proto);
+	destroy_workqueue(kcm_wq);
+
+	kmem_cache_destroy(kcm_muxp);
+	kmem_cache_destroy(kcm_psockp);
+}
+
+module_init(kcm_init);
+module_exit(kcm_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_KCM);
+
-- 
cgit v1.2.3-59-g8ed1b


From cd6e111bf5be5c70aef96a86d791ee7be0c0e137 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Mon, 7 Mar 2016 14:11:07 -0800
Subject: kcm: Add statistics and proc interfaces

This patch adds various counters for KCM. These include counters for
messages and bytes received or sent, as well as counters for number of
attached/unattached TCP sockets and other error or edge events.

The statistics are exposed via a proc interface. /proc/net/kcm provides
statistics per KCM socket and per psock (attached TCP sockets).
/proc/net/kcm_stats provides aggregate statistics.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/kcm.h |  94 ++++++++++++
 net/kcm/Makefile  |   2 +-
 net/kcm/kcmproc.c | 422 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/kcm/kcmsock.c |  80 +++++++++++
 4 files changed, 597 insertions(+), 1 deletion(-)
 create mode 100644 net/kcm/kcmproc.c

(limited to 'net')

diff --git a/include/net/kcm.h b/include/net/kcm.h
index 1bcae39070ec..39c7abe98552 100644
--- a/include/net/kcm.h
+++ b/include/net/kcm.h
@@ -17,6 +17,42 @@
 
 extern unsigned int kcm_net_id;
 
+#define KCM_STATS_ADD(stat, count) ((stat) += (count))
+#define KCM_STATS_INCR(stat) ((stat)++)
+
+struct kcm_psock_stats {
+	unsigned long long rx_msgs;
+	unsigned long long rx_bytes;
+	unsigned long long tx_msgs;
+	unsigned long long tx_bytes;
+	unsigned int rx_aborts;
+	unsigned int rx_mem_fail;
+	unsigned int rx_need_more_hdr;
+	unsigned int rx_bad_hdr_len;
+	unsigned long long reserved;
+	unsigned long long unreserved;
+	unsigned int tx_aborts;
+};
+
+struct kcm_mux_stats {
+	unsigned long long rx_msgs;
+	unsigned long long rx_bytes;
+	unsigned long long tx_msgs;
+	unsigned long long tx_bytes;
+	unsigned int rx_ready_drops;
+	unsigned int tx_retries;
+	unsigned int psock_attach;
+	unsigned int psock_unattach_rsvd;
+	unsigned int psock_unattach;
+};
+
+struct kcm_stats {
+	unsigned long long rx_msgs;
+	unsigned long long rx_bytes;
+	unsigned long long tx_msgs;
+	unsigned long long tx_bytes;
+};
+
 struct kcm_tx_msg {
 	unsigned int sent;
 	unsigned int fragidx;
@@ -41,6 +77,8 @@ struct kcm_sock {
 	u32 done : 1;
 	struct work_struct done_work;
 
+	struct kcm_stats stats;
+
 	/* Transmit */
 	struct kcm_psock *tx_psock;
 	struct work_struct tx_work;
@@ -77,6 +115,8 @@ struct kcm_psock {
 
 	struct list_head psock_list;
 
+	struct kcm_psock_stats stats;
+
 	/* Receive */
 	struct sk_buff *rx_skb_head;
 	struct sk_buff **rx_skb_nextp;
@@ -86,15 +126,21 @@ struct kcm_psock {
 	struct delayed_work rx_delayed_work;
 	struct bpf_prog *bpf_prog;
 	struct kcm_sock *rx_kcm;
+	unsigned long long saved_rx_bytes;
+	unsigned long long saved_rx_msgs;
 
 	/* Transmit */
 	struct kcm_sock *tx_kcm;
 	struct list_head psock_avail_list;
+	unsigned long long saved_tx_bytes;
+	unsigned long long saved_tx_msgs;
 };
 
 /* Per net MUX list */
 struct kcm_net {
 	struct mutex mutex;
+	struct kcm_psock_stats aggregate_psock_stats;
+	struct kcm_mux_stats aggregate_mux_stats;
 	struct list_head mux_list;
 	int count;
 };
@@ -110,6 +156,9 @@ struct kcm_mux {
 	struct list_head psocks;	/* List of all psocks on MUX */
 	int psocks_cnt;		/* Total attached sockets */
 
+	struct kcm_mux_stats stats;
+	struct kcm_psock_stats aggregate_psock_stats;
+
 	/* Receive */
 	spinlock_t rx_lock ____cacheline_aligned_in_smp;
 	struct list_head kcm_rx_waiters; /* KCMs waiting for receiving */
@@ -122,4 +171,49 @@ struct kcm_mux {
 	struct list_head kcm_tx_waiters; /* KCMs waiting for a TX psock */
 };
 
+#ifdef CONFIG_PROC_FS
+int kcm_proc_init(void);
+void kcm_proc_exit(void);
+#else
+static int kcm_proc_init(void) { return 0; }
+static void kcm_proc_exit(void) { }
+#endif
+
+static inline void aggregate_psock_stats(struct kcm_psock_stats *stats,
+					 struct kcm_psock_stats *agg_stats)
+{
+	/* Save psock statistics in the mux when psock is being unattached. */
+
+#define SAVE_PSOCK_STATS(_stat) (agg_stats->_stat += stats->_stat)
+	SAVE_PSOCK_STATS(rx_msgs);
+	SAVE_PSOCK_STATS(rx_bytes);
+	SAVE_PSOCK_STATS(rx_aborts);
+	SAVE_PSOCK_STATS(rx_mem_fail);
+	SAVE_PSOCK_STATS(rx_need_more_hdr);
+	SAVE_PSOCK_STATS(rx_bad_hdr_len);
+	SAVE_PSOCK_STATS(tx_msgs);
+	SAVE_PSOCK_STATS(tx_bytes);
+	SAVE_PSOCK_STATS(reserved);
+	SAVE_PSOCK_STATS(unreserved);
+	SAVE_PSOCK_STATS(tx_aborts);
+#undef SAVE_PSOCK_STATS
+}
+
+static inline void aggregate_mux_stats(struct kcm_mux_stats *stats,
+				       struct kcm_mux_stats *agg_stats)
+{
+	/* Save psock statistics in the mux when psock is being unattached. */
+
+#define SAVE_MUX_STATS(_stat) (agg_stats->_stat += stats->_stat)
+	SAVE_MUX_STATS(rx_msgs);
+	SAVE_MUX_STATS(rx_bytes);
+	SAVE_MUX_STATS(tx_msgs);
+	SAVE_MUX_STATS(tx_bytes);
+	SAVE_MUX_STATS(rx_ready_drops);
+	SAVE_MUX_STATS(psock_attach);
+	SAVE_MUX_STATS(psock_unattach_rsvd);
+	SAVE_MUX_STATS(psock_unattach);
+#undef SAVE_MUX_STATS
+}
+
 #endif /* __NET_KCM_H_ */
diff --git a/net/kcm/Makefile b/net/kcm/Makefile
index cb525f7c5a13..71256133e677 100644
--- a/net/kcm/Makefile
+++ b/net/kcm/Makefile
@@ -1,3 +1,3 @@
 obj-$(CONFIG_AF_KCM) += kcm.o
 
-kcm-y := kcmsock.o
+kcm-y := kcmsock.o kcmproc.o
diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c
new file mode 100644
index 000000000000..5eb9809c0f59
--- /dev/null
+++ b/net/kcm/kcmproc.c
@@ -0,0 +1,422 @@
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/proc_fs.h>
+#include <linux/rculist.h>
+#include <linux/seq_file.h>
+#include <linux/socket.h>
+#include <net/inet_sock.h>
+#include <net/kcm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/tcp.h>
+
+#ifdef CONFIG_PROC_FS
+struct kcm_seq_muxinfo {
+	char				*name;
+	const struct file_operations	*seq_fops;
+	const struct seq_operations	seq_ops;
+};
+
+static struct kcm_mux *kcm_get_first(struct seq_file *seq)
+{
+	struct net *net = seq_file_net(seq);
+	struct kcm_net *knet = net_generic(net, kcm_net_id);
+
+	return list_first_or_null_rcu(&knet->mux_list,
+				      struct kcm_mux, kcm_mux_list);
+}
+
+static struct kcm_mux *kcm_get_next(struct kcm_mux *mux)
+{
+	struct kcm_net *knet = mux->knet;
+
+	return list_next_or_null_rcu(&knet->mux_list, &mux->kcm_mux_list,
+				     struct kcm_mux, kcm_mux_list);
+}
+
+static struct kcm_mux *kcm_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct net *net = seq_file_net(seq);
+	struct kcm_net *knet = net_generic(net, kcm_net_id);
+	struct kcm_mux *m;
+
+	list_for_each_entry_rcu(m, &knet->mux_list, kcm_mux_list) {
+		if (!pos)
+			return m;
+		--pos;
+	}
+	return NULL;
+}
+
+static void *kcm_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	void *p;
+
+	if (v == SEQ_START_TOKEN)
+		p = kcm_get_first(seq);
+	else
+		p = kcm_get_next(v);
+	++*pos;
+	return p;
+}
+
+static void *kcm_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(rcu)
+{
+	rcu_read_lock();
+
+	if (!*pos)
+		return SEQ_START_TOKEN;
+	else
+		return kcm_get_idx(seq, *pos - 1);
+}
+
+static void kcm_seq_stop(struct seq_file *seq, void *v)
+	__releases(rcu)
+{
+	rcu_read_unlock();
+}
+
+struct kcm_proc_mux_state {
+	struct seq_net_private p;
+	int idx;
+};
+
+static int kcm_seq_open(struct inode *inode, struct file *file)
+{
+	struct kcm_seq_muxinfo *muxinfo = PDE_DATA(inode);
+	int err;
+
+	err = seq_open_net(inode, file, &muxinfo->seq_ops,
+			   sizeof(struct kcm_proc_mux_state));
+	if (err < 0)
+		return err;
+	return err;
+}
+
+static void kcm_format_mux_header(struct seq_file *seq)
+{
+	struct net *net = seq_file_net(seq);
+	struct kcm_net *knet = net_generic(net, kcm_net_id);
+
+	seq_printf(seq,
+		   "*** KCM statistics (%d MUX) ****\n",
+		   knet->count);
+
+	seq_printf(seq,
+		   "%-14s %-10s %-16s %-10s %-16s %-8s %-8s %-8s %-8s %s",
+		   "Object",
+		   "RX-Msgs",
+		   "RX-Bytes",
+		   "TX-Msgs",
+		   "TX-Bytes",
+		   "Recv-Q",
+		   "Rmem",
+		   "Send-Q",
+		   "Smem",
+		   "Status");
+
+	/* XXX: pdsts header stuff here */
+	seq_puts(seq, "\n");
+}
+
+static void kcm_format_sock(struct kcm_sock *kcm, struct seq_file *seq,
+			    int i, int *len)
+{
+	seq_printf(seq,
+		   "   kcm-%-7u %-10llu %-16llu %-10llu %-16llu %-8d %-8d %-8d %-8s ",
+		   kcm->index,
+		   kcm->stats.rx_msgs,
+		   kcm->stats.rx_bytes,
+		   kcm->stats.tx_msgs,
+		   kcm->stats.tx_bytes,
+		   kcm->sk.sk_receive_queue.qlen,
+		   sk_rmem_alloc_get(&kcm->sk),
+		   kcm->sk.sk_write_queue.qlen,
+		   "-");
+
+	if (kcm->tx_psock)
+		seq_printf(seq, "Psck-%u ", kcm->tx_psock->index);
+
+	if (kcm->tx_wait)
+		seq_puts(seq, "TxWait ");
+
+	if (kcm->tx_wait_more)
+		seq_puts(seq, "WMore ");
+
+	if (kcm->rx_wait)
+		seq_puts(seq, "RxWait ");
+
+	seq_puts(seq, "\n");
+}
+
+static void kcm_format_psock(struct kcm_psock *psock, struct seq_file *seq,
+			     int i, int *len)
+{
+	seq_printf(seq,
+		   "   psock-%-5u %-10llu %-16llu %-10llu %-16llu %-8d %-8d %-8d %-8d ",
+		   psock->index,
+		   psock->stats.rx_msgs,
+		   psock->stats.rx_bytes,
+		   psock->stats.tx_msgs,
+		   psock->stats.tx_bytes,
+		   psock->sk->sk_receive_queue.qlen,
+		   atomic_read(&psock->sk->sk_rmem_alloc),
+		   psock->sk->sk_write_queue.qlen,
+		   atomic_read(&psock->sk->sk_wmem_alloc));
+
+	if (psock->done)
+		seq_puts(seq, "Done ");
+
+	if (psock->tx_stopped)
+		seq_puts(seq, "TxStop ");
+
+	if (psock->rx_stopped)
+		seq_puts(seq, "RxStop ");
+
+	if (psock->tx_kcm)
+		seq_printf(seq, "Rsvd-%d ", psock->tx_kcm->index);
+
+	if (psock->ready_rx_msg)
+		seq_puts(seq, "RdyRx ");
+
+	seq_puts(seq, "\n");
+}
+
+static void
+kcm_format_mux(struct kcm_mux *mux, loff_t idx, struct seq_file *seq)
+{
+	int i, len;
+	struct kcm_sock *kcm;
+	struct kcm_psock *psock;
+
+	/* mux information */
+	seq_printf(seq,
+		   "%-6s%-8s %-10llu %-16llu %-10llu %-16llu %-8s %-8s %-8s %-8s ",
+		   "mux", "",
+		   mux->stats.rx_msgs,
+		   mux->stats.rx_bytes,
+		   mux->stats.tx_msgs,
+		   mux->stats.tx_bytes,
+		   "-", "-", "-", "-");
+
+	seq_printf(seq, "KCMs: %d, Psocks %d\n",
+		   mux->kcm_socks_cnt, mux->psocks_cnt);
+
+	/* kcm sock information */
+	i = 0;
+	spin_lock_bh(&mux->lock);
+	list_for_each_entry(kcm, &mux->kcm_socks, kcm_sock_list) {
+		kcm_format_sock(kcm, seq, i, &len);
+		i++;
+	}
+	i = 0;
+	list_for_each_entry(psock, &mux->psocks, psock_list) {
+		kcm_format_psock(psock, seq, i, &len);
+		i++;
+	}
+	spin_unlock_bh(&mux->lock);
+}
+
+static int kcm_seq_show(struct seq_file *seq, void *v)
+{
+	struct kcm_proc_mux_state *mux_state;
+
+	mux_state = seq->private;
+	if (v == SEQ_START_TOKEN) {
+		mux_state->idx = 0;
+		kcm_format_mux_header(seq);
+	} else {
+		kcm_format_mux(v, mux_state->idx, seq);
+		mux_state->idx++;
+	}
+	return 0;
+}
+
+static const struct file_operations kcm_seq_fops = {
+	.owner		= THIS_MODULE,
+	.open		= kcm_seq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+};
+
+static struct kcm_seq_muxinfo kcm_seq_muxinfo = {
+	.name		= "kcm",
+	.seq_fops	= &kcm_seq_fops,
+	.seq_ops	= {
+		.show	= kcm_seq_show,
+		.start	= kcm_seq_start,
+		.next	= kcm_seq_next,
+		.stop	= kcm_seq_stop,
+	}
+};
+
+static int kcm_proc_register(struct net *net, struct kcm_seq_muxinfo *muxinfo)
+{
+	struct proc_dir_entry *p;
+	int rc = 0;
+
+	p = proc_create_data(muxinfo->name, S_IRUGO, net->proc_net,
+			     muxinfo->seq_fops, muxinfo);
+	if (!p)
+		rc = -ENOMEM;
+	return rc;
+}
+EXPORT_SYMBOL(kcm_proc_register);
+
+static void kcm_proc_unregister(struct net *net,
+				struct kcm_seq_muxinfo *muxinfo)
+{
+	remove_proc_entry(muxinfo->name, net->proc_net);
+}
+EXPORT_SYMBOL(kcm_proc_unregister);
+
+static int kcm_stats_seq_show(struct seq_file *seq, void *v)
+{
+	struct kcm_psock_stats psock_stats;
+	struct kcm_mux_stats mux_stats;
+	struct kcm_mux *mux;
+	struct kcm_psock *psock;
+	struct net *net = seq->private;
+	struct kcm_net *knet = net_generic(net, kcm_net_id);
+
+	memset(&mux_stats, 0, sizeof(mux_stats));
+	memset(&psock_stats, 0, sizeof(psock_stats));
+
+	mutex_lock(&knet->mutex);
+
+	aggregate_mux_stats(&knet->aggregate_mux_stats, &mux_stats);
+	aggregate_psock_stats(&knet->aggregate_psock_stats,
+			      &psock_stats);
+
+	list_for_each_entry_rcu(mux, &knet->mux_list, kcm_mux_list) {
+		spin_lock_bh(&mux->lock);
+		aggregate_mux_stats(&mux->stats, &mux_stats);
+		aggregate_psock_stats(&mux->aggregate_psock_stats,
+				      &psock_stats);
+		list_for_each_entry(psock, &mux->psocks, psock_list)
+			aggregate_psock_stats(&psock->stats, &psock_stats);
+		spin_unlock_bh(&mux->lock);
+	}
+
+	mutex_unlock(&knet->mutex);
+
+	seq_printf(seq,
+		   "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s\n",
+		   "MUX",
+		   "RX-Msgs",
+		   "RX-Bytes",
+		   "TX-Msgs",
+		   "TX-Bytes",
+		   "TX-Retries",
+		   "Attach",
+		   "Unattach",
+		   "UnattchRsvd",
+		   "RX-RdyDrops");
+
+	seq_printf(seq,
+		   "%-8s %-10llu %-16llu %-10llu %-16llu %-10u %-10u %-10u %-10u %-10u\n",
+		   "",
+		   mux_stats.rx_msgs,
+		   mux_stats.rx_bytes,
+		   mux_stats.tx_msgs,
+		   mux_stats.tx_bytes,
+		   mux_stats.tx_retries,
+		   mux_stats.psock_attach,
+		   mux_stats.psock_unattach_rsvd,
+		   mux_stats.psock_unattach,
+		   mux_stats.rx_ready_drops);
+
+	seq_printf(seq,
+		   "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s %-10s %-10s\n",
+		   "Psock",
+		   "RX-Msgs",
+		   "RX-Bytes",
+		   "TX-Msgs",
+		   "TX-Bytes",
+		   "Reserved",
+		   "Unreserved",
+		   "RX-Aborts",
+		   "RX-MemFail",
+		   "RX-NeedMor",
+		   "RX-BadLen",
+		   "TX-Aborts");
+
+	seq_printf(seq,
+		   "%-8s %-10llu %-16llu %-10llu %-16llu %-10llu %-10llu %-10u %-10u %-10u %-10u %-10u\n",
+		   "",
+		   psock_stats.rx_msgs,
+		   psock_stats.rx_bytes,
+		   psock_stats.tx_msgs,
+		   psock_stats.tx_bytes,
+		   psock_stats.reserved,
+		   psock_stats.unreserved,
+		   psock_stats.rx_aborts,
+		   psock_stats.rx_mem_fail,
+		   psock_stats.rx_need_more_hdr,
+		   psock_stats.rx_bad_hdr_len,
+		   psock_stats.tx_aborts);
+
+	return 0;
+}
+
+static int kcm_stats_seq_open(struct inode *inode, struct file *file)
+{
+	return single_open_net(inode, file, kcm_stats_seq_show);
+}
+
+static const struct file_operations kcm_stats_seq_fops = {
+	.owner   = THIS_MODULE,
+	.open    = kcm_stats_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = single_release_net,
+};
+
+static int kcm_proc_init_net(struct net *net)
+{
+	int err;
+
+	if (!proc_create("kcm_stats", S_IRUGO, net->proc_net,
+			 &kcm_stats_seq_fops)) {
+		err = -ENOMEM;
+		goto out_kcm_stats;
+	}
+
+	err = kcm_proc_register(net, &kcm_seq_muxinfo);
+	if (err)
+		goto out_kcm;
+
+	return 0;
+
+out_kcm:
+	remove_proc_entry("kcm_stats", net->proc_net);
+out_kcm_stats:
+	return err;
+}
+
+static void kcm_proc_exit_net(struct net *net)
+{
+	kcm_proc_unregister(net, &kcm_seq_muxinfo);
+	remove_proc_entry("kcm_stats", net->proc_net);
+}
+
+static struct pernet_operations kcm_net_ops = {
+	.init = kcm_proc_init_net,
+	.exit = kcm_proc_exit_net,
+};
+
+int __init kcm_proc_init(void)
+{
+	return register_pernet_subsys(&kcm_net_ops);
+}
+
+void __exit kcm_proc_exit(void)
+{
+	unregister_pernet_subsys(&kcm_net_ops);
+}
+
+#endif /* CONFIG_PROC_FS */
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 30ef69ac6b81..f938d7d3e6e2 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -59,6 +59,7 @@ static void kcm_abort_rx_psock(struct kcm_psock *psock, int err,
 		return;
 
 	psock->rx_stopped = 1;
+	KCM_STATS_INCR(psock->stats.rx_aborts);
 
 	/* Report an error on the lower socket */
 	report_csk_error(csk, err);
@@ -80,6 +81,7 @@ static void kcm_abort_tx_psock(struct kcm_psock *psock, int err,
 	}
 
 	psock->tx_stopped = 1;
+	KCM_STATS_INCR(psock->stats.tx_aborts);
 
 	if (!psock->tx_kcm) {
 		/* Take off psocks_avail list */
@@ -101,6 +103,29 @@ static void kcm_abort_tx_psock(struct kcm_psock *psock, int err,
 	report_csk_error(csk, err);
 }
 
+/* RX mux lock held. */
+static void kcm_update_rx_mux_stats(struct kcm_mux *mux,
+				    struct kcm_psock *psock)
+{
+	KCM_STATS_ADD(mux->stats.rx_bytes,
+		      psock->stats.rx_bytes - psock->saved_rx_bytes);
+	mux->stats.rx_msgs +=
+		psock->stats.rx_msgs - psock->saved_rx_msgs;
+	psock->saved_rx_msgs = psock->stats.rx_msgs;
+	psock->saved_rx_bytes = psock->stats.rx_bytes;
+}
+
+static void kcm_update_tx_mux_stats(struct kcm_mux *mux,
+				    struct kcm_psock *psock)
+{
+	KCM_STATS_ADD(mux->stats.tx_bytes,
+		      psock->stats.tx_bytes - psock->saved_tx_bytes);
+	mux->stats.tx_msgs +=
+		psock->stats.tx_msgs - psock->saved_tx_msgs;
+	psock->saved_tx_msgs = psock->stats.tx_msgs;
+	psock->saved_tx_bytes = psock->stats.tx_bytes;
+}
+
 static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
 
 /* KCM is ready to receive messages on its queue-- either the KCM is new or
@@ -254,6 +279,8 @@ static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock,
 		return psock->rx_kcm;
 	}
 
+	kcm_update_rx_mux_stats(mux, psock);
+
 	if (list_empty(&mux->kcm_rx_waiters)) {
 		psock->ready_rx_msg = head;
 		list_add_tail(&psock->psock_ready_list,
@@ -356,10 +383,12 @@ static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 			 */
 			orig_skb = skb_clone(orig_skb, GFP_ATOMIC);
 			if (!orig_skb) {
+				KCM_STATS_INCR(psock->stats.rx_mem_fail);
 				desc->error = -ENOMEM;
 				return 0;
 			}
 			if (!pskb_pull(orig_skb, orig_offset)) {
+				KCM_STATS_INCR(psock->stats.rx_mem_fail);
 				kfree_skb(orig_skb);
 				desc->error = -ENOMEM;
 				return 0;
@@ -374,6 +403,7 @@ static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 			 */
 			err = skb_unclone(head, GFP_ATOMIC);
 			if (err) {
+				KCM_STATS_INCR(psock->stats.rx_mem_fail);
 				desc->error = err;
 				return 0;
 			}
@@ -392,6 +422,7 @@ static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 
 				skb = alloc_skb(0, GFP_ATOMIC);
 				if (!skb) {
+					KCM_STATS_INCR(psock->stats.rx_mem_fail);
 					desc->error = -ENOMEM;
 					return 0;
 				}
@@ -414,6 +445,7 @@ static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 		/* Always clone since we will consume something */
 		skb = skb_clone(orig_skb, GFP_ATOMIC);
 		if (!skb) {
+			KCM_STATS_INCR(psock->stats.rx_mem_fail);
 			desc->error = -ENOMEM;
 			break;
 		}
@@ -435,6 +467,7 @@ static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 			 */
 			err = skb_unclone(skb, GFP_ATOMIC);
 			if (err) {
+				KCM_STATS_INCR(psock->stats.rx_mem_fail);
 				desc->error = err;
 				break;
 			}
@@ -456,6 +489,7 @@ static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 				/* Need more header to determine length */
 				rxm->accum_len += cand_len;
 				eaten += cand_len;
+				KCM_STATS_INCR(psock->stats.rx_need_more_hdr);
 				WARN_ON(eaten != orig_len);
 				break;
 			} else if (len <= (ssize_t)head->len -
@@ -463,6 +497,7 @@ static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 				/* Length must be into new skb (and also
 				 * greater than zero)
 				 */
+				KCM_STATS_INCR(psock->stats.rx_bad_hdr_len);
 				desc->error = -EPROTO;
 				psock->rx_skb_head = NULL;
 				kcm_abort_rx_psock(psock, EPROTO, head);
@@ -492,6 +527,7 @@ static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 
 		/* Hurray, we have a new message! */
 		psock->rx_skb_head = NULL;
+		KCM_STATS_INCR(psock->stats.rx_msgs);
 
 try_queue:
 		kcm = reserve_rx_kcm(psock, head);
@@ -510,6 +546,8 @@ try_queue:
 	if (cloned_orig)
 		kfree_skb(orig_skb);
 
+	KCM_STATS_ADD(psock->stats.rx_bytes, eaten);
+
 	return eaten;
 }
 
@@ -671,6 +709,7 @@ static struct kcm_psock *reserve_psock(struct kcm_sock *kcm)
 		}
 		kcm->tx_psock = psock;
 		psock->tx_kcm = kcm;
+		KCM_STATS_INCR(psock->stats.reserved);
 	} else if (!kcm->tx_wait) {
 		list_add_tail(&kcm->wait_psock_list,
 			      &mux->kcm_tx_waiters);
@@ -705,6 +744,7 @@ static void psock_now_avail(struct kcm_psock *psock)
 		smp_mb();
 
 		kcm->tx_psock = psock;
+		KCM_STATS_INCR(psock->stats.reserved);
 		queue_work(kcm_wq, &kcm->tx_work);
 	}
 }
@@ -726,10 +766,13 @@ static void unreserve_psock(struct kcm_sock *kcm)
 
 	smp_rmb(); /* Read tx_psock before tx_wait */
 
+	kcm_update_tx_mux_stats(mux, psock);
+
 	WARN_ON(kcm->tx_wait);
 
 	kcm->tx_psock = NULL;
 	psock->tx_kcm = NULL;
+	KCM_STATS_INCR(psock->stats.unreserved);
 
 	if (unlikely(psock->tx_stopped)) {
 		if (psock->done) {
@@ -753,6 +796,15 @@ static void unreserve_psock(struct kcm_sock *kcm)
 	spin_unlock_bh(&mux->lock);
 }
 
+static void kcm_report_tx_retry(struct kcm_sock *kcm)
+{
+	struct kcm_mux *mux = kcm->mux;
+
+	spin_lock_bh(&mux->lock);
+	KCM_STATS_INCR(mux->stats.tx_retries);
+	spin_unlock_bh(&mux->lock);
+}
+
 /* Write any messages ready on the kcm socket.  Called with kcm sock lock
  * held.  Return bytes actually sent or error.
  */
@@ -773,6 +825,7 @@ static int kcm_write_msgs(struct kcm_sock *kcm)
 		 * it and we'll retry the message.
 		 */
 		unreserve_psock(kcm);
+		kcm_report_tx_retry(kcm);
 		if (skb_queue_empty(&sk->sk_write_queue))
 			return 0;
 
@@ -856,6 +909,7 @@ do_frag:
 				unreserve_psock(kcm);
 
 				txm->sent = 0;
+				kcm_report_tx_retry(kcm);
 				ret = 0;
 
 				goto try_again;
@@ -863,6 +917,7 @@ do_frag:
 
 			sent += ret;
 			frag_offset += ret;
+			KCM_STATS_ADD(psock->stats.tx_bytes, ret);
 			if (frag_offset < frag->size) {
 				/* Not finished with this frag */
 				goto do_frag;
@@ -884,6 +939,7 @@ do_frag:
 		kfree_skb(head);
 		sk->sk_wmem_queued -= sent;
 		total_sent += sent;
+		KCM_STATS_INCR(psock->stats.tx_msgs);
 	} while ((head = skb_peek(&sk->sk_write_queue)));
 out:
 	if (!head) {
@@ -1061,6 +1117,7 @@ wait_for_memory:
 		/* Message complete, queue it on send buffer */
 		__skb_queue_tail(&sk->sk_write_queue, head);
 		kcm->seq_skb = NULL;
+		KCM_STATS_INCR(kcm->stats.tx_msgs);
 
 		if (msg->msg_flags & MSG_BATCH) {
 			kcm->tx_wait_more = true;
@@ -1083,6 +1140,8 @@ partial_message:
 		kcm_tx_msg(head)->last_skb = skb;
 	}
 
+	KCM_STATS_ADD(kcm->stats.tx_bytes, copied);
+
 	release_sock(sk);
 	return copied;
 
@@ -1144,6 +1203,7 @@ static int kcm_recvmsg(struct socket *sock, struct msghdr *msg,
 		       size_t len, int flags)
 {
 	struct sock *sk = sock->sk;
+	struct kcm_sock *kcm = kcm_sk(sk);
 	int err = 0;
 	long timeo;
 	struct kcm_rx_msg *rxm;
@@ -1171,6 +1231,7 @@ static int kcm_recvmsg(struct socket *sock, struct msghdr *msg,
 
 	copied = len;
 	if (likely(!(flags & MSG_PEEK))) {
+		KCM_STATS_ADD(kcm->stats.rx_bytes, copied);
 		if (copied < rxm->full_len) {
 			if (sock->type == SOCK_DGRAM) {
 				/* Truncated message */
@@ -1183,6 +1244,7 @@ static int kcm_recvmsg(struct socket *sock, struct msghdr *msg,
 msg_finished:
 			/* Finished with message */
 			msg->msg_flags |= MSG_EOR;
+			KCM_STATS_INCR(kcm->stats.rx_msgs);
 			skb_unlink(skb, &sk->sk_receive_queue);
 			kfree_skb(skb);
 		}
@@ -1394,6 +1456,7 @@ static int kcm_attach(struct socket *sock, struct socket *csock,
 	list_add(&psock->psock_list, head);
 	psock->index = index;
 
+	KCM_STATS_INCR(mux->stats.psock_attach);
 	mux->psocks_cnt++;
 	psock_now_avail(psock);
 	spin_unlock_bh(&mux->lock);
@@ -1469,6 +1532,7 @@ static void kcm_unattach(struct kcm_psock *psock)
 		list_del(&psock->psock_ready_list);
 		kfree_skb(psock->ready_rx_msg);
 		psock->ready_rx_msg = NULL;
+		KCM_STATS_INCR(mux->stats.rx_ready_drops);
 	}
 
 	spin_unlock_bh(&mux->rx_lock);
@@ -1485,11 +1549,16 @@ static void kcm_unattach(struct kcm_psock *psock)
 
 	spin_lock_bh(&mux->lock);
 
+	aggregate_psock_stats(&psock->stats, &mux->aggregate_psock_stats);
+
+	KCM_STATS_INCR(mux->stats.psock_unattach);
+
 	if (psock->tx_kcm) {
 		/* psock was reserved.  Just mark it finished and we will clean
 		 * up in the kcm paths, we need kcm lock which can not be
 		 * acquired here.
 		 */
+		KCM_STATS_INCR(mux->stats.psock_unattach_rsvd);
 		spin_unlock_bh(&mux->lock);
 
 		/* We are unattaching a socket that is reserved. Abort the
@@ -1717,6 +1786,9 @@ static void release_mux(struct kcm_mux *mux)
 	__skb_queue_purge(&mux->rx_hold_queue);
 
 	mutex_lock(&knet->mutex);
+	aggregate_mux_stats(&mux->stats, &knet->aggregate_mux_stats);
+	aggregate_psock_stats(&mux->aggregate_psock_stats,
+			      &knet->aggregate_psock_stats);
 	list_del_rcu(&mux->kcm_mux_list);
 	knet->count--;
 	mutex_unlock(&knet->mutex);
@@ -1979,8 +2051,15 @@ static int __init kcm_init(void)
 	if (err)
 		goto net_ops_fail;
 
+	err = kcm_proc_init();
+	if (err)
+		goto proc_init_fail;
+
 	return 0;
 
+proc_init_fail:
+	unregister_pernet_device(&kcm_net_ops);
+
 net_ops_fail:
 	sock_unregister(PF_KCM);
 
@@ -1999,6 +2078,7 @@ fail:
 
 static void __exit kcm_exit(void)
 {
+	kcm_proc_exit();
 	unregister_pernet_device(&kcm_net_ops);
 	sock_unregister(PF_KCM);
 	proto_unregister(&kcm_proto);
-- 
cgit v1.2.3-59-g8ed1b


From 91687355b92735e5f247ed163b3b0b4d14c3cab6 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Mon, 7 Mar 2016 14:11:08 -0800
Subject: kcm: Splice support

Implement kcm_splice_read. This is supported only for seqpacket.
Add kcm_seqpacket_ops and set splice read to kcm_splice_read.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/kcm/kcmsock.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 96 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index f938d7d3e6e2..982ea5f77bfc 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1256,6 +1256,76 @@ out:
 	return copied ? : err;
 }
 
+static ssize_t kcm_sock_splice(struct sock *sk,
+			       struct pipe_inode_info *pipe,
+			       struct splice_pipe_desc *spd)
+{
+	int ret;
+
+	release_sock(sk);
+	ret = splice_to_pipe(pipe, spd);
+	lock_sock(sk);
+
+	return ret;
+}
+
+static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos,
+			       struct pipe_inode_info *pipe, size_t len,
+			       unsigned int flags)
+{
+	struct sock *sk = sock->sk;
+	struct kcm_sock *kcm = kcm_sk(sk);
+	long timeo;
+	struct kcm_rx_msg *rxm;
+	int err = 0;
+	size_t copied;
+	struct sk_buff *skb;
+
+	/* Only support splice for SOCKSEQPACKET */
+
+	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+
+	lock_sock(sk);
+
+	skb = kcm_wait_data(sk, flags, timeo, &err);
+	if (!skb)
+		goto err_out;
+
+	/* Okay, have a message on the receive queue */
+
+	rxm = kcm_rx_msg(skb);
+
+	if (len > rxm->full_len)
+		len = rxm->full_len;
+
+	copied = skb_splice_bits(skb, sk, rxm->offset, pipe, len, flags,
+				 kcm_sock_splice);
+	if (copied < 0) {
+		err = copied;
+		goto err_out;
+	}
+
+	KCM_STATS_ADD(kcm->stats.rx_bytes, copied);
+
+	rxm->offset += copied;
+	rxm->full_len -= copied;
+
+	/* We have no way to return MSG_EOR. If all the bytes have been
+	 * read we still leave the message in the receive socket buffer.
+	 * A subsequent recvmsg needs to be done to return MSG_EOR and
+	 * finish reading the message.
+	 */
+
+	release_sock(sk);
+
+	return copied;
+
+err_out:
+	release_sock(sk);
+
+	return err;
+}
+
 /* kcm sock lock held */
 static void kcm_recv_disable(struct kcm_sock *kcm)
 {
@@ -1907,7 +1977,7 @@ static int kcm_release(struct socket *sock)
 	return 0;
 }
 
-static const struct proto_ops kcm_ops = {
+static const struct proto_ops kcm_dgram_ops = {
 	.family =	PF_KCM,
 	.owner =	THIS_MODULE,
 	.release =	kcm_release,
@@ -1928,6 +1998,28 @@ static const struct proto_ops kcm_ops = {
 	.sendpage =	sock_no_sendpage,
 };
 
+static const struct proto_ops kcm_seqpacket_ops = {
+	.family =	PF_KCM,
+	.owner =	THIS_MODULE,
+	.release =	kcm_release,
+	.bind =		sock_no_bind,
+	.connect =	sock_no_connect,
+	.socketpair =	sock_no_socketpair,
+	.accept =	sock_no_accept,
+	.getname =	sock_no_getname,
+	.poll =		datagram_poll,
+	.ioctl =	kcm_ioctl,
+	.listen =	sock_no_listen,
+	.shutdown =	sock_no_shutdown,
+	.setsockopt =	kcm_setsockopt,
+	.getsockopt =	kcm_getsockopt,
+	.sendmsg =	kcm_sendmsg,
+	.recvmsg =	kcm_recvmsg,
+	.mmap =		sock_no_mmap,
+	.sendpage =	sock_no_sendpage,
+	.splice_read =	kcm_splice_read,
+};
+
 /* Create proto operation for kcm sockets */
 static int kcm_create(struct net *net, struct socket *sock,
 		      int protocol, int kern)
@@ -1938,8 +2030,10 @@ static int kcm_create(struct net *net, struct socket *sock,
 
 	switch (sock->type) {
 	case SOCK_DGRAM:
+		sock->ops = &kcm_dgram_ops;
+		break;
 	case SOCK_SEQPACKET:
-		sock->ops = &kcm_ops;
+		sock->ops = &kcm_seqpacket_ops;
 		break;
 	default:
 		return -ESOCKTNOSUPPORT;
-- 
cgit v1.2.3-59-g8ed1b


From f29698fc6b3a45a5c6147eca8379f38be8232117 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Mon, 7 Mar 2016 14:11:09 -0800
Subject: kcm: Sendpage support

Implement kcm_sendpage. Set in sendpage to kcm_sendpage in both
dgram and seqpacket ops.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/kcm/kcmsock.c | 147 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 145 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 982ea5f77bfc..9ac24995691c 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -990,6 +990,149 @@ static void kcm_push(struct kcm_sock *kcm)
 		kcm_write_msgs(kcm);
 }
 
+static ssize_t kcm_sendpage(struct socket *sock, struct page *page,
+			    int offset, size_t size, int flags)
+
+{
+	struct sock *sk = sock->sk;
+	struct kcm_sock *kcm = kcm_sk(sk);
+	struct sk_buff *skb = NULL, *head = NULL;
+	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+	bool eor;
+	int err = 0;
+	int i;
+
+	if (flags & MSG_SENDPAGE_NOTLAST)
+		flags |= MSG_MORE;
+
+	/* No MSG_EOR from splice, only look at MSG_MORE */
+	eor = !(flags & MSG_MORE);
+
+	lock_sock(sk);
+
+	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+
+	err = -EPIPE;
+	if (sk->sk_err)
+		goto out_error;
+
+	if (kcm->seq_skb) {
+		/* Previously opened message */
+		head = kcm->seq_skb;
+		skb = kcm_tx_msg(head)->last_skb;
+		i = skb_shinfo(skb)->nr_frags;
+
+		if (skb_can_coalesce(skb, i, page, offset)) {
+			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
+			skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
+			goto coalesced;
+		}
+
+		if (i >= MAX_SKB_FRAGS) {
+			struct sk_buff *tskb;
+
+			tskb = alloc_skb(0, sk->sk_allocation);
+			while (!tskb) {
+				kcm_push(kcm);
+				err = sk_stream_wait_memory(sk, &timeo);
+				if (err)
+					goto out_error;
+			}
+
+			if (head == skb)
+				skb_shinfo(head)->frag_list = tskb;
+			else
+				skb->next = tskb;
+
+			skb = tskb;
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+			i = 0;
+		}
+	} else {
+		/* Call the sk_stream functions to manage the sndbuf mem. */
+		if (!sk_stream_memory_free(sk)) {
+			kcm_push(kcm);
+			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+			err = sk_stream_wait_memory(sk, &timeo);
+			if (err)
+				goto out_error;
+		}
+
+		head = alloc_skb(0, sk->sk_allocation);
+		while (!head) {
+			kcm_push(kcm);
+			err = sk_stream_wait_memory(sk, &timeo);
+			if (err)
+				goto out_error;
+		}
+
+		skb = head;
+		i = 0;
+	}
+
+	get_page(page);
+	skb_fill_page_desc(skb, i, page, offset, size);
+	skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
+
+coalesced:
+	skb->len += size;
+	skb->data_len += size;
+	skb->truesize += size;
+	sk->sk_wmem_queued += size;
+	sk_mem_charge(sk, size);
+
+	if (head != skb) {
+		head->len += size;
+		head->data_len += size;
+		head->truesize += size;
+	}
+
+	if (eor) {
+		bool not_busy = skb_queue_empty(&sk->sk_write_queue);
+
+		/* Message complete, queue it on send buffer */
+		__skb_queue_tail(&sk->sk_write_queue, head);
+		kcm->seq_skb = NULL;
+		KCM_STATS_INCR(kcm->stats.tx_msgs);
+
+		if (flags & MSG_BATCH) {
+			kcm->tx_wait_more = true;
+		} else if (kcm->tx_wait_more || not_busy) {
+			err = kcm_write_msgs(kcm);
+			if (err < 0) {
+				/* We got a hard error in write_msgs but have
+				 * already queued this message. Report an error
+				 * in the socket, but don't affect return value
+				 * from sendmsg
+				 */
+				pr_warn("KCM: Hard failure on kcm_write_msgs\n");
+				report_csk_error(&kcm->sk, -err);
+			}
+		}
+	} else {
+		/* Message not complete, save state */
+		kcm->seq_skb = head;
+		kcm_tx_msg(head)->last_skb = skb;
+	}
+
+	KCM_STATS_ADD(kcm->stats.tx_bytes, size);
+
+	release_sock(sk);
+	return size;
+
+out_error:
+	kcm_push(kcm);
+
+	err = sk_stream_error(sk, flags, err);
+
+	/* make sure we wake any epoll edge trigger waiter */
+	if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
+		sk->sk_write_space(sk);
+
+	release_sock(sk);
+	return err;
+}
+
 static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 {
 	struct sock *sk = sock->sk;
@@ -1995,7 +2138,7 @@ static const struct proto_ops kcm_dgram_ops = {
 	.sendmsg =	kcm_sendmsg,
 	.recvmsg =	kcm_recvmsg,
 	.mmap =		sock_no_mmap,
-	.sendpage =	sock_no_sendpage,
+	.sendpage =	kcm_sendpage,
 };
 
 static const struct proto_ops kcm_seqpacket_ops = {
@@ -2016,7 +2159,7 @@ static const struct proto_ops kcm_seqpacket_ops = {
 	.sendmsg =	kcm_sendmsg,
 	.recvmsg =	kcm_recvmsg,
 	.mmap =		sock_no_mmap,
-	.sendpage =	sock_no_sendpage,
+	.sendpage =	kcm_sendpage,
 	.splice_read =	kcm_splice_read,
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 7ced95ef525c329f947c424859cf2b0a3b731f8c Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Mon, 7 Mar 2016 14:11:10 -0800
Subject: kcm: Add memory limit for receive message construction

Message assembly is performed on the TCP socket. This is logically
equivalent of an application that performs a peek on the socket to find
out how much memory is needed for a receive buffer. The receive socket
buffer also provides the maximum message size which is checked.

The receive algorithm is something like:

   1) Receive the first skbuf for a message (or skbufs if multiple are
      needed to determine message length).
   2) Check the message length against the number of bytes in the TCP
      receive queue (tcp_inq()).
	- If all the bytes of the message are in the queue (incluing the
	  skbuf received), then proceed with message assembly (it should
	  complete with the tcp_read_sock)
        - Else, mark the psock with the number of bytes needed to
	  complete the message.
   3) In TCP data ready function, if the psock indicates that we are
      waiting for the rest of the bytes of a messages, check the number
      of queued bytes against that.
        - If there are still not enough bytes for the message, just
	  return
        - Else, clear the waiting bytes and proceed to receive the
	  skbufs.  The message should now be received in one
	  tcp_read_sock

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/kcm.h |  4 ++++
 net/kcm/kcmproc.c |  6 ++++--
 net/kcm/kcmsock.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 52 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/include/net/kcm.h b/include/net/kcm.h
index 39c7abe98552..d892956ff552 100644
--- a/include/net/kcm.h
+++ b/include/net/kcm.h
@@ -28,6 +28,7 @@ struct kcm_psock_stats {
 	unsigned int rx_aborts;
 	unsigned int rx_mem_fail;
 	unsigned int rx_need_more_hdr;
+	unsigned int rx_msg_too_big;
 	unsigned int rx_bad_hdr_len;
 	unsigned long long reserved;
 	unsigned long long unreserved;
@@ -66,6 +67,7 @@ struct kcm_rx_msg {
 	int full_len;
 	int accum_len;
 	int offset;
+	int early_eaten;
 };
 
 /* Socket structure for KCM client sockets */
@@ -128,6 +130,7 @@ struct kcm_psock {
 	struct kcm_sock *rx_kcm;
 	unsigned long long saved_rx_bytes;
 	unsigned long long saved_rx_msgs;
+	unsigned int rx_need_bytes;
 
 	/* Transmit */
 	struct kcm_sock *tx_kcm;
@@ -190,6 +193,7 @@ static inline void aggregate_psock_stats(struct kcm_psock_stats *stats,
 	SAVE_PSOCK_STATS(rx_aborts);
 	SAVE_PSOCK_STATS(rx_mem_fail);
 	SAVE_PSOCK_STATS(rx_need_more_hdr);
+	SAVE_PSOCK_STATS(rx_msg_too_big);
 	SAVE_PSOCK_STATS(rx_bad_hdr_len);
 	SAVE_PSOCK_STATS(tx_msgs);
 	SAVE_PSOCK_STATS(tx_bytes);
diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c
index 5eb9809c0f59..7638b3555b17 100644
--- a/net/kcm/kcmproc.c
+++ b/net/kcm/kcmproc.c
@@ -331,7 +331,7 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v)
 		   mux_stats.rx_ready_drops);
 
 	seq_printf(seq,
-		   "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s %-10s %-10s\n",
+		   "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s\n",
 		   "Psock",
 		   "RX-Msgs",
 		   "RX-Bytes",
@@ -343,10 +343,11 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v)
 		   "RX-MemFail",
 		   "RX-NeedMor",
 		   "RX-BadLen",
+		   "RX-TooBig",
 		   "TX-Aborts");
 
 	seq_printf(seq,
-		   "%-8s %-10llu %-16llu %-10llu %-16llu %-10llu %-10llu %-10u %-10u %-10u %-10u %-10u\n",
+		   "%-8s %-10llu %-16llu %-10llu %-16llu %-10llu %-10llu %-10u %-10u %-10u %-10u %-10u %-10u\n",
 		   "",
 		   psock_stats.rx_msgs,
 		   psock_stats.rx_bytes,
@@ -358,6 +359,7 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v)
 		   psock_stats.rx_mem_fail,
 		   psock_stats.rx_need_more_hdr,
 		   psock_stats.rx_bad_hdr_len,
+		   psock_stats.rx_msg_too_big,
 		   psock_stats.tx_aborts);
 
 	return 0;
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 9ac24995691c..8bc38d3fff9a 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -375,6 +375,19 @@ static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 	if (head) {
 		/* Message already in progress */
 
+		rxm = kcm_rx_msg(head);
+		if (unlikely(rxm->early_eaten)) {
+			/* Already some number of bytes on the receive sock
+			 * data saved in rx_skb_head, just indicate they
+			 * are consumed.
+			 */
+			eaten = orig_len <= rxm->early_eaten ?
+				orig_len : rxm->early_eaten;
+			rxm->early_eaten -= eaten;
+
+			return eaten;
+		}
+
 		if (unlikely(orig_offset)) {
 			/* Getting data with a non-zero offset when a message is
 			 * in progress is not expected. If it does happen, we
@@ -492,6 +505,13 @@ static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 				KCM_STATS_INCR(psock->stats.rx_need_more_hdr);
 				WARN_ON(eaten != orig_len);
 				break;
+			} else if (len > psock->sk->sk_rcvbuf) {
+				/* Message length exceeds maximum allowed */
+				KCM_STATS_INCR(psock->stats.rx_msg_too_big);
+				desc->error = -EMSGSIZE;
+				psock->rx_skb_head = NULL;
+				kcm_abort_rx_psock(psock, EMSGSIZE, head);
+				break;
 			} else if (len <= (ssize_t)head->len -
 					  skb->len - rxm->offset) {
 				/* Length must be into new skb (and also
@@ -511,6 +531,23 @@ static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 
 		if (extra < 0) {
 			/* Message not complete yet. */
+			if (rxm->full_len - rxm->accum_len >
+			    tcp_inq(psock->sk)) {
+				/* Don't have the whole messages in the socket
+				 * buffer. Set psock->rx_need_bytes to wait for
+				 * the rest of the message. Also, set "early
+				 * eaten" since we've already buffered the skb
+				 * but don't consume yet per tcp_read_sock.
+				 */
+
+				psock->rx_need_bytes = rxm->full_len -
+						       rxm->accum_len;
+				rxm->accum_len += cand_len;
+				rxm->early_eaten = cand_len;
+				KCM_STATS_ADD(psock->stats.rx_bytes, cand_len);
+				desc->count = 0; /* Stop reading socket */
+				break;
+			}
 			rxm->accum_len += cand_len;
 			eaten += cand_len;
 			WARN_ON(eaten != orig_len);
@@ -582,6 +619,13 @@ static void psock_tcp_data_ready(struct sock *sk)
 	if (psock->ready_rx_msg)
 		goto out;
 
+	if (psock->rx_need_bytes) {
+		if (tcp_inq(sk) >= psock->rx_need_bytes)
+			psock->rx_need_bytes = 0;
+		else
+			goto out;
+	}
+
 	if (psock_tcp_read_sock(psock) == -ENOMEM)
 		queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0);
 
-- 
cgit v1.2.3-59-g8ed1b


From 29152a34f72cb4d7ab32885ad2f20a482c92a8f3 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Mon, 7 Mar 2016 14:11:11 -0800
Subject: kcm: Add receive message timeout

This patch adds receive timeout for message assembly on the attached TCP
sockets. The timeout is set when a new messages is started and the whole
message has not been received by TCP (not in the receive queue). If the
completely message is subsequently received the timer is cancelled, if the
timer expires the RX side is aborted.

The timeout value is taken from the socket timeout (SO_RCVTIMEO) that is
set on a TCP socket (i.e. set by get sockopt before attaching a TCP socket
to KCM.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/kcm.h |  3 +++
 net/kcm/kcmproc.c |  6 ++++--
 net/kcm/kcmsock.c | 32 ++++++++++++++++++++++++++++++++
 3 files changed, 39 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/include/net/kcm.h b/include/net/kcm.h
index d892956ff552..95c425ca97b6 100644
--- a/include/net/kcm.h
+++ b/include/net/kcm.h
@@ -29,6 +29,7 @@ struct kcm_psock_stats {
 	unsigned int rx_mem_fail;
 	unsigned int rx_need_more_hdr;
 	unsigned int rx_msg_too_big;
+	unsigned int rx_msg_timeouts;
 	unsigned int rx_bad_hdr_len;
 	unsigned long long reserved;
 	unsigned long long unreserved;
@@ -130,6 +131,7 @@ struct kcm_psock {
 	struct kcm_sock *rx_kcm;
 	unsigned long long saved_rx_bytes;
 	unsigned long long saved_rx_msgs;
+	struct timer_list rx_msg_timer;
 	unsigned int rx_need_bytes;
 
 	/* Transmit */
@@ -194,6 +196,7 @@ static inline void aggregate_psock_stats(struct kcm_psock_stats *stats,
 	SAVE_PSOCK_STATS(rx_mem_fail);
 	SAVE_PSOCK_STATS(rx_need_more_hdr);
 	SAVE_PSOCK_STATS(rx_msg_too_big);
+	SAVE_PSOCK_STATS(rx_msg_timeouts);
 	SAVE_PSOCK_STATS(rx_bad_hdr_len);
 	SAVE_PSOCK_STATS(tx_msgs);
 	SAVE_PSOCK_STATS(tx_bytes);
diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c
index 7638b3555b17..738008726cc6 100644
--- a/net/kcm/kcmproc.c
+++ b/net/kcm/kcmproc.c
@@ -331,7 +331,7 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v)
 		   mux_stats.rx_ready_drops);
 
 	seq_printf(seq,
-		   "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s\n",
+		   "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s\n",
 		   "Psock",
 		   "RX-Msgs",
 		   "RX-Bytes",
@@ -344,10 +344,11 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v)
 		   "RX-NeedMor",
 		   "RX-BadLen",
 		   "RX-TooBig",
+		   "RX-Timeout",
 		   "TX-Aborts");
 
 	seq_printf(seq,
-		   "%-8s %-10llu %-16llu %-10llu %-16llu %-10llu %-10llu %-10u %-10u %-10u %-10u %-10u %-10u\n",
+		   "%-8s %-10llu %-16llu %-10llu %-16llu %-10llu %-10llu %-10u %-10u %-10u %-10u %-10u %-10u %-10u\n",
 		   "",
 		   psock_stats.rx_msgs,
 		   psock_stats.rx_bytes,
@@ -360,6 +361,7 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v)
 		   psock_stats.rx_need_more_hdr,
 		   psock_stats.rx_bad_hdr_len,
 		   psock_stats.rx_msg_too_big,
+		   psock_stats.rx_msg_timeouts,
 		   psock_stats.tx_aborts);
 
 	return 0;
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 8bc38d3fff9a..40662d73204f 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -55,6 +55,8 @@ static void kcm_abort_rx_psock(struct kcm_psock *psock, int err,
 
 	/* Unrecoverable error in receive */
 
+	del_timer(&psock->rx_msg_timer);
+
 	if (psock->rx_stopped)
 		return;
 
@@ -351,6 +353,12 @@ static void unreserve_rx_kcm(struct kcm_psock *psock,
 	spin_unlock_bh(&mux->rx_lock);
 }
 
+static void kcm_start_rx_timer(struct kcm_psock *psock)
+{
+	if (psock->sk->sk_rcvtimeo)
+		mod_timer(&psock->rx_msg_timer, psock->sk->sk_rcvtimeo);
+}
+
 /* Macro to invoke filter function. */
 #define KCM_RUN_FILTER(prog, ctx) \
 	(*prog->bpf_func)(ctx, prog->insnsi)
@@ -500,6 +508,10 @@ static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 
 			if (!len) {
 				/* Need more header to determine length */
+				if (!rxm->accum_len) {
+					/* Start RX timer for new message */
+					kcm_start_rx_timer(psock);
+				}
 				rxm->accum_len += cand_len;
 				eaten += cand_len;
 				KCM_STATS_INCR(psock->stats.rx_need_more_hdr);
@@ -540,6 +552,11 @@ static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 				 * but don't consume yet per tcp_read_sock.
 				 */
 
+				if (!rxm->accum_len) {
+					/* Start RX timer for new message */
+					kcm_start_rx_timer(psock);
+				}
+
 				psock->rx_need_bytes = rxm->full_len -
 						       rxm->accum_len;
 				rxm->accum_len += cand_len;
@@ -563,6 +580,7 @@ static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 		eaten += (cand_len - extra);
 
 		/* Hurray, we have a new message! */
+		del_timer(&psock->rx_msg_timer);
 		psock->rx_skb_head = NULL;
 		KCM_STATS_INCR(psock->stats.rx_msgs);
 
@@ -1656,6 +1674,15 @@ static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux)
 	spin_unlock_bh(&mux->rx_lock);
 }
 
+static void kcm_rx_msg_timeout(unsigned long arg)
+{
+	struct kcm_psock *psock = (struct kcm_psock *)arg;
+
+	/* Message assembly timed out */
+	KCM_STATS_INCR(psock->stats.rx_msg_timeouts);
+	kcm_abort_rx_psock(psock, ETIMEDOUT, NULL);
+}
+
 static int kcm_attach(struct socket *sock, struct socket *csock,
 		      struct bpf_prog *prog)
 {
@@ -1685,6 +1712,10 @@ static int kcm_attach(struct socket *sock, struct socket *csock,
 	psock->mux = mux;
 	psock->sk = csk;
 	psock->bpf_prog = prog;
+
+	setup_timer(&psock->rx_msg_timer, kcm_rx_msg_timeout,
+		    (unsigned long)psock);
+
 	INIT_WORK(&psock->rx_work, psock_rx_work);
 	INIT_DELAYED_WORK(&psock->rx_delayed_work, psock_rx_delayed_work);
 
@@ -1796,6 +1827,7 @@ static void kcm_unattach(struct kcm_psock *psock)
 
 	write_unlock_bh(&csk->sk_callback_lock);
 
+	del_timer_sync(&psock->rx_msg_timer);
 	cancel_work_sync(&psock->rx_work);
 	cancel_delayed_work_sync(&psock->rx_delayed_work);
 
-- 
cgit v1.2.3-59-g8ed1b


From ea47781c26510e5d97f80f9aceafe9065bd5e3aa Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Wed, 9 Mar 2016 21:58:33 -0500
Subject: ax25: add link layer header validation function

As variable length protocol, AX25 fails link layer header validation
tests based on a minimum length. header_ops.validate allows protocols
to validate headers that are shorter than hard_header_len. Implement
this callback for AX25.

See also http://comments.gmane.org/gmane.linux.network/401064

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ax25/ax25_ip.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'net')

diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c
index b563a3f5f2a8..2fa3be965101 100644
--- a/net/ax25/ax25_ip.c
+++ b/net/ax25/ax25_ip.c
@@ -228,8 +228,23 @@ netdev_tx_t ax25_ip_xmit(struct sk_buff *skb)
 }
 #endif
 
+static bool ax25_validate_header(const char *header, unsigned int len)
+{
+	ax25_digi digi;
+
+	if (!len)
+		return false;
+
+	if (header[0])
+		return true;
+
+	return ax25_addr_parse(header + 1, len - 1, NULL, NULL, &digi, NULL,
+			       NULL);
+}
+
 const struct header_ops ax25_header_ops = {
 	.create = ax25_hard_header,
+	.validate = ax25_validate_header,
 };
 
 EXPORT_SYMBOL(ax25_header_ops);
-- 
cgit v1.2.3-59-g8ed1b


From 9ed988cd591500c040b2a6257bc68543e08ceeef Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Wed, 9 Mar 2016 21:58:34 -0500
Subject: packet: validate variable length ll headers

Replace link layer header validation check ll_header_truncate with
more generic dev_validate_header.

Validation based on hard_header_len incorrectly drops valid packets
in variable length protocols, such as AX25. dev_validate_header
calls header_ops.validate for such protocols to ensure correctness
below hard_header_len.

See also http://comments.gmane.org/gmane.linux.network/401064

Fixes 9c7077622dd9 ("packet: make packet_snd fail on len smaller than l2 header")
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 43 ++++++++++++++++++-------------------------
 1 file changed, 18 insertions(+), 25 deletions(-)

(limited to 'net')

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index d41b1074cb2d..1ecfa710ca98 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1915,6 +1915,10 @@ retry:
 		goto retry;
 	}
 
+	if (!dev_validate_header(dev, skb->data, len)) {
+		err = -EINVAL;
+		goto out_unlock;
+	}
 	if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
 	    !packet_extra_vlan_len_allowed(dev, skb)) {
 		err = -EMSGSIZE;
@@ -2393,18 +2397,6 @@ static void tpacket_destruct_skb(struct sk_buff *skb)
 	sock_wfree(skb);
 }
 
-static bool ll_header_truncated(const struct net_device *dev, int len)
-{
-	/* net device doesn't like empty head */
-	if (unlikely(len < dev->hard_header_len)) {
-		net_warn_ratelimited("%s: packet size is too short (%d < %d)\n",
-				     current->comm, len, dev->hard_header_len);
-		return true;
-	}
-
-	return false;
-}
-
 static void tpacket_set_protocol(const struct net_device *dev,
 				 struct sk_buff *skb)
 {
@@ -2522,16 +2514,20 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 		if (unlikely(err < 0))
 			return -EINVAL;
 	} else if (copylen) {
+		int hdrlen = min_t(int, copylen, tp_len);
+
 		skb_push(skb, dev->hard_header_len);
 		skb_put(skb, copylen - dev->hard_header_len);
-		err = skb_store_bits(skb, 0, data, copylen);
+		err = skb_store_bits(skb, 0, data, hdrlen);
 		if (unlikely(err))
 			return err;
+		if (!dev_validate_header(dev, skb->data, hdrlen))
+			return -EINVAL;
 		if (!skb->protocol)
 			tpacket_set_protocol(dev, skb);
 
-		data += copylen;
-		to_write -= copylen;
+		data += hdrlen;
+		to_write -= hdrlen;
 	}
 
 	offset = offset_in_page(data);
@@ -2703,13 +2699,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 			copylen = __virtio16_to_cpu(vio_le(),
 						    vnet_hdr->hdr_len);
 		}
-		if (dev->hard_header_len) {
-			if (ll_header_truncated(dev, tp_len)) {
-				tp_len = -EINVAL;
-				goto tpacket_error;
-			}
-			copylen = max_t(int, copylen, dev->hard_header_len);
-		}
+		copylen = max_t(int, copylen, dev->hard_header_len);
 		skb = sock_alloc_send_skb(&po->sk,
 				hlen + tlen + sizeof(struct sockaddr_ll) +
 				(copylen - dev->hard_header_len),
@@ -2905,9 +2895,6 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 		offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
 		if (unlikely(offset < 0))
 			goto out_free;
-	} else {
-		if (ll_header_truncated(dev, len))
-			goto out_free;
 	}
 
 	/* Returns -EFAULT on error */
@@ -2915,6 +2902,12 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 	if (err)
 		goto out_free;
 
+	if (sock->type == SOCK_RAW &&
+	    !dev_validate_header(dev, skb->data, len)) {
+		err = -EINVAL;
+		goto out_free;
+	}
+
 	sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
 
 	if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
-- 
cgit v1.2.3-59-g8ed1b


From 6aaf37b41a37b184e509aa3ec29d96b7712dfd43 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aar@pengutronix.de>
Date: Fri, 4 Mar 2016 10:10:21 +0100
Subject: 6lowpan: debugfs: add missing static

This patch solves the sparse warning:

net/6lowpan/debugfs.c:164:30: warning: symbol 'lowpan_ctx_pfx_fops' was
not declared. Should it be static?
net/6lowpan/debugfs.c:241:30: warning: symbol 'lowpan_context_fops' was
not declared. Should it be static?

Signed-off-by: Alexander Aring <aar@pengutronix.de>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 net/6lowpan/debugfs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/6lowpan/debugfs.c b/net/6lowpan/debugfs.c
index aa49ff4ce6fd..0793a8157472 100644
--- a/net/6lowpan/debugfs.c
+++ b/net/6lowpan/debugfs.c
@@ -161,7 +161,7 @@ out:
 	return status;
 }
 
-const struct file_operations lowpan_ctx_pfx_fops = {
+static const struct file_operations lowpan_ctx_pfx_fops = {
 	.open		= lowpan_ctx_pfx_open,
 	.read		= seq_read,
 	.write		= lowpan_ctx_pfx_write,
@@ -238,7 +238,7 @@ static int lowpan_context_open(struct inode *inode, struct file *file)
 	return single_open(file, lowpan_context_show, inode->i_private);
 }
 
-const struct file_operations lowpan_context_fops = {
+static const struct file_operations lowpan_context_fops = {
 	.open		= lowpan_context_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-- 
cgit v1.2.3-59-g8ed1b


From 24c4a81d765738655f94d242eb821cb26e6c143a Mon Sep 17 00:00:00 2001
From: Alexander Aring <aar@pengutronix.de>
Date: Mon, 7 Mar 2016 20:07:31 +0100
Subject: 6lowpan: iphc: fix SAM/DAM bit comment

This patch fixes the comments for SAM/DAM value.

Signed-off-by: Alexander Aring <aar@pengutronix.de>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 net/6lowpan/iphc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/6lowpan/iphc.c b/net/6lowpan/iphc.c
index 72172514fea0..99bb22aea346 100644
--- a/net/6lowpan/iphc.c
+++ b/net/6lowpan/iphc.c
@@ -788,7 +788,7 @@ static u8 lowpan_compress_ctx_addr(u8 **hc_ptr, const struct in6_addr *ipaddr,
 	}
 
 	memset(&tmp, 0, sizeof(tmp));
-	/* check for SAM/DAM = 01 */
+	/* check for SAM/DAM = 10 */
 	tmp.s6_addr[11] = 0xFF;
 	tmp.s6_addr[12] = 0xFE;
 	memcpy(&tmp.s6_addr[14], &ipaddr->s6_addr[14], 2);
@@ -801,7 +801,7 @@ static u8 lowpan_compress_ctx_addr(u8 **hc_ptr, const struct in6_addr *ipaddr,
 	}
 
 	memset(&tmp, 0, sizeof(tmp));
-	/* check for SAM/DAM = 10, should always match */
+	/* check for SAM/DAM = 01, should always match */
 	memcpy(&tmp.s6_addr[8], &ipaddr->s6_addr[8], 8);
 	/* context information are always used */
 	ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen);
-- 
cgit v1.2.3-59-g8ed1b


From eec7a01dc8366f7e43fc0417f4aee70eaeaca9a9 Mon Sep 17 00:00:00 2001
From: Johan Hedberg <johan.hedberg@intel.com>
Date: Wed, 9 Mar 2016 17:30:32 +0200
Subject: Bluetooth: Move memset closer to where it's needed

Minor fix to not do the memset until the variable it clears is
actually used.

Signed-off-by: Johan Hedberg <johan.hedberg@intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 net/bluetooth/hci_conn.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 32575b49f4a0..7264025dc781 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -726,14 +726,14 @@ static void hci_req_add_le_create_conn(struct hci_request *req,
 	struct hci_dev *hdev = conn->hdev;
 	u8 own_addr_type;
 
-	memset(&cp, 0, sizeof(cp));
-
 	/* Update random address, but set require_privacy to false so
 	 * that we never connect with an non-resolvable address.
 	 */
 	if (hci_update_random_address(req, false, &own_addr_type))
 		return;
 
+	memset(&cp, 0, sizeof(cp));
+
 	/* Set window to be the same value as the interval to enable
 	 * continuous scanning.
 	 */
-- 
cgit v1.2.3-59-g8ed1b


From d43efbd0d545f476be0bbef30cbe18234f8a9631 Mon Sep 17 00:00:00 2001
From: Johan Hedberg <johan.hedberg@intel.com>
Date: Wed, 9 Mar 2016 17:30:33 +0200
Subject: Bluetooth: Fix adding discoverable to adv instance flags

When lookup up the advertising instance flags for the default
advertising instance (0) the discoverable flag should be filled in
based on the HCI_DISCOVERABLE flag.

Signed-off-by: Johan Hedberg <johan.hedberg@intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 net/bluetooth/hci_request.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index c78ee2dc9323..77be344efd18 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -866,6 +866,9 @@ static u32 get_adv_instance_flags(struct hci_dev *hdev, u8 instance)
 		if (hci_dev_test_flag(hdev, HCI_ADVERTISING_CONNECTABLE))
 			flags |= MGMT_ADV_FLAG_CONNECTABLE;
 
+		if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE))
+			flags |= MGMT_ADV_FLAG_DISCOV;
+
 		return flags;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 82a37adeedd38880940e2772ec1ae27a09353e5a Mon Sep 17 00:00:00 2001
From: Johan Hedberg <johan.hedberg@intel.com>
Date: Wed, 9 Mar 2016 17:30:34 +0200
Subject: Bluetooth: Add support for limited privacy mode

Introduce a limited privacy mode indicated by value 0x02 to the mgmt
Set Privacy command.

With value 0x02 the kernel will use privacy mode with a resolvable
private address. In case the controller is bondable and discoverable
the identity address will be used.

Signed-off-by: Johan Hedberg <johan.hedberg@intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/hci.h |  1 +
 net/bluetooth/hci_conn.c    | 13 ++++++++++--
 net/bluetooth/hci_request.c | 51 +++++++++++++++++++++++++++++++++++++++------
 net/bluetooth/hci_request.h |  2 +-
 net/bluetooth/mgmt.c        | 20 ++++++++++++++++--
 5 files changed, 76 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 339ea57be423..5d38d980b89d 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -233,6 +233,7 @@ enum {
 	HCI_SC_ENABLED,
 	HCI_SC_ONLY,
 	HCI_PRIVACY,
+	HCI_LIMITED_PRIVACY,
 	HCI_RPA_EXPIRED,
 	HCI_RPA_RESOLVING,
 	HCI_HS_ENABLED,
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 7264025dc781..bf9f8a801a2e 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -719,6 +719,13 @@ done:
 	hci_dev_unlock(hdev);
 }
 
+static bool conn_use_rpa(struct hci_conn *conn)
+{
+	struct hci_dev *hdev = conn->hdev;
+
+	return hci_dev_test_flag(hdev, HCI_PRIVACY);
+}
+
 static void hci_req_add_le_create_conn(struct hci_request *req,
 				       struct hci_conn *conn)
 {
@@ -729,7 +736,8 @@ static void hci_req_add_le_create_conn(struct hci_request *req,
 	/* Update random address, but set require_privacy to false so
 	 * that we never connect with an non-resolvable address.
 	 */
-	if (hci_update_random_address(req, false, &own_addr_type))
+	if (hci_update_random_address(req, false, conn_use_rpa(conn),
+				      &own_addr_type))
 		return;
 
 	memset(&cp, 0, sizeof(cp));
@@ -774,7 +782,8 @@ static void hci_req_directed_advertising(struct hci_request *req,
 	/* Set require_privacy to false so that the remote device has a
 	 * chance of identifying us.
 	 */
-	if (hci_update_random_address(req, false, &own_addr_type) < 0)
+	if (hci_update_random_address(req, false, conn_use_rpa(conn),
+				      &own_addr_type) < 0)
 		return;
 
 	memset(&cp, 0, sizeof(cp));
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index 77be344efd18..95a545ca9dbc 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -771,6 +771,11 @@ static u8 update_white_list(struct hci_request *req)
 	return 0x01;
 }
 
+static bool scan_use_rpa(struct hci_dev *hdev)
+{
+	return hci_dev_test_flag(hdev, HCI_PRIVACY);
+}
+
 void hci_req_add_le_passive_scan(struct hci_request *req)
 {
 	struct hci_cp_le_set_scan_param param_cp;
@@ -785,7 +790,8 @@ void hci_req_add_le_passive_scan(struct hci_request *req)
 	 * advertising with our address will be correctly reported
 	 * by the controller.
 	 */
-	if (hci_update_random_address(req, false, &own_addr_type))
+	if (hci_update_random_address(req, false, scan_use_rpa(hdev),
+				      &own_addr_type))
 		return;
 
 	/* Adding or removing entries from the white list must
@@ -881,6 +887,29 @@ static u32 get_adv_instance_flags(struct hci_dev *hdev, u8 instance)
 	return adv_instance->flags;
 }
 
+static bool adv_use_rpa(struct hci_dev *hdev, uint32_t flags)
+{
+	/* If privacy is not enabled don't use RPA */
+	if (!hci_dev_test_flag(hdev, HCI_PRIVACY))
+		return false;
+
+	/* If basic privacy mode is enabled use RPA */
+	if (!hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY))
+		return true;
+
+	/* If limited privacy mode is enabled don't use RPA if we're
+	 * both discoverable and bondable.
+	 */
+	if ((flags & MGMT_ADV_FLAG_DISCOV) &&
+	    hci_dev_test_flag(hdev, HCI_BONDABLE))
+		return false;
+
+	/* We're neither bondable nor discoverable in the limited
+	 * privacy mode, therefore use RPA.
+	 */
+	return true;
+}
+
 void __hci_req_enable_advertising(struct hci_request *req)
 {
 	struct hci_dev *hdev = req->hdev;
@@ -914,7 +943,9 @@ void __hci_req_enable_advertising(struct hci_request *req)
 	 * advertising is used. In that case it is fine to use a
 	 * non-resolvable private address.
 	 */
-	if (hci_update_random_address(req, !connectable, &own_addr_type) < 0)
+	if (hci_update_random_address(req, !connectable,
+				      adv_use_rpa(hdev, flags),
+				      &own_addr_type) < 0)
 		return;
 
 	memset(&cp, 0, sizeof(cp));
@@ -1328,7 +1359,7 @@ static void set_random_addr(struct hci_request *req, bdaddr_t *rpa)
 }
 
 int hci_update_random_address(struct hci_request *req, bool require_privacy,
-			      u8 *own_addr_type)
+			      bool use_rpa, u8 *own_addr_type)
 {
 	struct hci_dev *hdev = req->hdev;
 	int err;
@@ -1337,7 +1368,7 @@ int hci_update_random_address(struct hci_request *req, bool require_privacy,
 	 * current RPA has expired or there is something else than
 	 * the current RPA in use, then generate a new one.
 	 */
-	if (hci_dev_test_flag(hdev, HCI_PRIVACY)) {
+	if (use_rpa) {
 		int to;
 
 		*own_addr_type = ADDR_LE_DEV_RANDOM;
@@ -1599,9 +1630,16 @@ static int discoverable_update(struct hci_request *req, unsigned long opt)
 	/* Advertising instances don't use the global discoverable setting, so
 	 * only update AD if advertising was enabled using Set Advertising.
 	 */
-	if (hci_dev_test_flag(hdev, HCI_ADVERTISING))
+	if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) {
 		__hci_req_update_adv_data(req, 0x00);
 
+		/* Discoverable mode affects the local advertising
+		 * address in limited privacy mode.
+		 */
+		if (hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY))
+			__hci_req_enable_advertising(req);
+	}
+
 	hci_dev_unlock(hdev);
 
 	return 0;
@@ -1944,7 +1982,8 @@ static int active_scan(struct hci_request *req, unsigned long opt)
 	 * address (when privacy feature has been enabled) or non-resolvable
 	 * private address.
 	 */
-	err = hci_update_random_address(req, true, &own_addr_type);
+	err = hci_update_random_address(req, true, scan_use_rpa(hdev),
+					&own_addr_type);
 	if (err < 0)
 		own_addr_type = ADDR_LE_DEV_PUBLIC;
 
diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h
index 64ff8c040d50..b2d044bdc732 100644
--- a/net/bluetooth/hci_request.h
+++ b/net/bluetooth/hci_request.h
@@ -89,7 +89,7 @@ static inline void hci_req_update_scan(struct hci_dev *hdev)
 void __hci_req_update_scan(struct hci_request *req);
 
 int hci_update_random_address(struct hci_request *req, bool require_privacy,
-			      u8 *own_addr_type);
+			      bool use_rpa, u8 *own_addr_type);
 
 int hci_abort_conn(struct hci_conn *conn, u8 reason);
 void __hci_abort_conn(struct hci_request *req, struct hci_conn *conn,
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 5a5089cb6570..2ca355519d79 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -1382,8 +1382,19 @@ static int set_bondable(struct sock *sk, struct hci_dev *hdev, void *data,
 	if (err < 0)
 		goto unlock;
 
-	if (changed)
+	if (changed) {
+		/* In limited privacy mode the change of bondable mode
+		 * may affect the local advertising address.
+		 */
+		if (hdev_is_powered(hdev) &&
+		    hci_dev_test_flag(hdev, HCI_ADVERTISING) &&
+		    hci_dev_test_flag(hdev, HCI_DISCOVERABLE) &&
+		    hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY))
+			queue_work(hdev->req_workqueue,
+				   &hdev->discoverable_update);
+
 		err = new_settings(hdev, sk);
+	}
 
 unlock:
 	hci_dev_unlock(hdev);
@@ -4423,7 +4434,7 @@ static int set_privacy(struct sock *sk, struct hci_dev *hdev, void *cp_data,
 		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PRIVACY,
 				       MGMT_STATUS_NOT_SUPPORTED);
 
-	if (cp->privacy != 0x00 && cp->privacy != 0x01)
+	if (cp->privacy != 0x00 && cp->privacy != 0x01 && cp->privacy != 0x02)
 		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PRIVACY,
 				       MGMT_STATUS_INVALID_PARAMS);
 
@@ -4442,10 +4453,15 @@ static int set_privacy(struct sock *sk, struct hci_dev *hdev, void *cp_data,
 		changed = !hci_dev_test_and_set_flag(hdev, HCI_PRIVACY);
 		memcpy(hdev->irk, cp->irk, sizeof(hdev->irk));
 		hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
+		if (cp->privacy == 0x02)
+			hci_dev_set_flag(hdev, HCI_LIMITED_PRIVACY);
+		else
+			hci_dev_clear_flag(hdev, HCI_LIMITED_PRIVACY);
 	} else {
 		changed = hci_dev_test_and_clear_flag(hdev, HCI_PRIVACY);
 		memset(hdev->irk, 0, sizeof(hdev->irk));
 		hci_dev_clear_flag(hdev, HCI_RPA_EXPIRED);
+		hci_dev_clear_flag(hdev, HCI_LIMITED_PRIVACY);
 	}
 
 	err = send_settings_rsp(sk, MGMT_OP_SET_PRIVACY, hdev);
-- 
cgit v1.2.3-59-g8ed1b


From 9c176cec42e4b0deeed901fc363bf4235c5ca96a Mon Sep 17 00:00:00 2001
From: Johan Hedberg <johan.hedberg@intel.com>
Date: Wed, 9 Mar 2016 17:52:24 +0200
Subject: Bluetooth: Increment management interface revision

Increment the mgmt revision due to the recently added limited
privacy mode.

Signed-off-by: Johan Hedberg <johan.hedberg@intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 net/bluetooth/mgmt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 2ca355519d79..c856fb65812c 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -38,7 +38,7 @@
 #include "mgmt_util.h"
 
 #define MGMT_VERSION	1
-#define MGMT_REVISION	11
+#define MGMT_REVISION	12
 
 static const u16 mgmt_commands[] = {
 	MGMT_OP_READ_INDEX_LIST,
-- 
cgit v1.2.3-59-g8ed1b


From 04761890a7cec6a1ff9aafd909004da4fe8059db Mon Sep 17 00:00:00 2001
From: Neil Armstrong <narmstrong@baylibre.com>
Date: Tue, 8 Mar 2016 10:36:20 +0100
Subject: net: dsa: Fix cleanup resources upon module removal

The initial commit badly merged into the dsa_resume method instead
of the dsa_remove_dst method.
As consequence, the dst->master_netdev->dsa_ptr is not set to NULL on
removal and re-bind of the dsa device fails with error -17.

Fixes: b0dc635d923c ("net: dsa: cleanup resources upon module removal ")
Signed-off-by: Neil Armstrong <narmstrong@baylibre.com>
Acked-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/dsa.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index fa4daba8db55..d8fb47fcad05 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -935,6 +935,14 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst)
 {
 	int i;
 
+	dst->master_netdev->dsa_ptr = NULL;
+
+	/* If we used a tagging format that doesn't have an ethertype
+	 * field, make sure that all packets from this point get sent
+	 * without the tag and go through the regular receive path.
+	 */
+	wmb();
+
 	for (i = 0; i < dst->pd->nr_chips; i++) {
 		struct dsa_switch *ds = dst->ds[i];
 
@@ -988,14 +996,6 @@ static int dsa_suspend(struct device *d)
 	struct dsa_switch_tree *dst = platform_get_drvdata(pdev);
 	int i, ret = 0;
 
-	dst->master_netdev->dsa_ptr = NULL;
-
-	/* If we used a tagging format that doesn't have an ethertype
-	 * field, make sure that all packets from this point get sent
-	 * without the tag and go through the regular receive path.
-	 */
-	wmb();
-
 	for (i = 0; i < dst->pd->nr_chips; i++) {
 		struct dsa_switch *ds = dst->ds[i];
 
-- 
cgit v1.2.3-59-g8ed1b


From 5b33f48842fa1e13e9c0ea8cc59c1d0df19042db Mon Sep 17 00:00:00 2001
From: Amir Vadai <amir@vadai.me>
Date: Tue, 8 Mar 2016 12:42:29 +0200
Subject: net/flower: Introduce hardware offload support

This patch is based on a patch made by John Fastabend.
It adds support for offloading cls_flower.
when NETIF_F_HW_TC is on:
  flags = 0       => Rule will be processed twice - by hardware, and if
                     still relevant, by software.
  flags = SKIP_HW => Rull will be processed by software only

If hardware fail/not capabale to apply the rule, operation will NOT
fail. Filter will be processed by SW only.

Acked-by: Jiri Pirko <jiri@mellanox.com>
Suggested-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Amir Vadai <amir@vadai.me>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h    |  2 ++
 include/net/pkt_cls.h        | 14 ++++++++++
 include/uapi/linux/pkt_cls.h |  2 ++
 net/sched/cls_flower.c       | 64 +++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 81 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index fd30cb545c45..41df0b450757 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -786,6 +786,7 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
 enum {
 	TC_SETUP_MQPRIO,
 	TC_SETUP_CLSU32,
+	TC_SETUP_CLSFLOWER,
 };
 
 struct tc_cls_u32_offload;
@@ -795,6 +796,7 @@ struct tc_to_netdev {
 	union {
 		u8 tc;
 		struct tc_cls_u32_offload *cls_u32;
+		struct tc_cls_flower_offload *cls_flower;
 	};
 };
 
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index bea14eee373e..5b4e8f08b8f0 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -409,4 +409,18 @@ static inline bool tc_should_offload(struct net_device *dev, u32 flags)
 	return true;
 }
 
+enum tc_fl_command {
+	TC_CLSFLOWER_REPLACE,
+	TC_CLSFLOWER_DESTROY,
+};
+
+struct tc_cls_flower_offload {
+	enum tc_fl_command command;
+	u64 cookie;
+	struct flow_dissector *dissector;
+	struct fl_flow_key *mask;
+	struct fl_flow_key *key;
+	struct tcf_exts *exts;
+};
+
 #endif
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 9874f5680926..c43c5f78b9c4 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -417,6 +417,8 @@ enum {
 	TCA_FLOWER_KEY_TCP_DST,		/* be16 */
 	TCA_FLOWER_KEY_UDP_SRC,		/* be16 */
 	TCA_FLOWER_KEY_UDP_DST,		/* be16 */
+
+	TCA_FLOWER_FLAGS,
 	__TCA_FLOWER_MAX,
 };
 
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 95b021243233..25d87666bf1e 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -165,6 +165,51 @@ static void fl_destroy_filter(struct rcu_head *head)
 	kfree(f);
 }
 
+static void fl_hw_destroy_filter(struct tcf_proto *tp, u64 cookie)
+{
+	struct net_device *dev = tp->q->dev_queue->dev;
+	struct tc_cls_flower_offload offload = {0};
+	struct tc_to_netdev tc;
+
+	if (!tc_should_offload(dev, 0))
+		return;
+
+	offload.command = TC_CLSFLOWER_DESTROY;
+	offload.cookie = cookie;
+
+	tc.type = TC_SETUP_CLSFLOWER;
+	tc.cls_flower = &offload;
+
+	dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc);
+}
+
+static void fl_hw_replace_filter(struct tcf_proto *tp,
+				 struct flow_dissector *dissector,
+				 struct fl_flow_key *mask,
+				 struct fl_flow_key *key,
+				 struct tcf_exts *actions,
+				 u64 cookie, u32 flags)
+{
+	struct net_device *dev = tp->q->dev_queue->dev;
+	struct tc_cls_flower_offload offload = {0};
+	struct tc_to_netdev tc;
+
+	if (!tc_should_offload(dev, flags))
+		return;
+
+	offload.command = TC_CLSFLOWER_REPLACE;
+	offload.cookie = cookie;
+	offload.dissector = dissector;
+	offload.mask = mask;
+	offload.key = key;
+	offload.exts = actions;
+
+	tc.type = TC_SETUP_CLSFLOWER;
+	tc.cls_flower = &offload;
+
+	dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc);
+}
+
 static bool fl_destroy(struct tcf_proto *tp, bool force)
 {
 	struct cls_fl_head *head = rtnl_dereference(tp->root);
@@ -174,6 +219,7 @@ static bool fl_destroy(struct tcf_proto *tp, bool force)
 		return false;
 
 	list_for_each_entry_safe(f, next, &head->filters, list) {
+		fl_hw_destroy_filter(tp, (u64)f);
 		list_del_rcu(&f->list);
 		call_rcu(&f->rcu, fl_destroy_filter);
 	}
@@ -459,6 +505,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 	struct cls_fl_filter *fnew;
 	struct nlattr *tb[TCA_FLOWER_MAX + 1];
 	struct fl_flow_mask mask = {};
+	u32 flags = 0;
 	int err;
 
 	if (!tca[TCA_OPTIONS])
@@ -486,6 +533,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 	}
 	fnew->handle = handle;
 
+	if (tb[TCA_FLOWER_FLAGS])
+		flags = nla_get_u32(tb[TCA_FLOWER_FLAGS]);
+
 	err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr);
 	if (err)
 		goto errout;
@@ -498,9 +548,20 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 				     head->ht_params);
 	if (err)
 		goto errout;
-	if (fold)
+
+	fl_hw_replace_filter(tp,
+			     &head->dissector,
+			     &mask.key,
+			     &fnew->key,
+			     &fnew->exts,
+			     (u64)fnew,
+			     flags);
+
+	if (fold) {
 		rhashtable_remove_fast(&head->ht, &fold->ht_node,
 				       head->ht_params);
+		fl_hw_destroy_filter(tp, (u64)fold);
+	}
 
 	*arg = (unsigned long) fnew;
 
@@ -527,6 +588,7 @@ static int fl_delete(struct tcf_proto *tp, unsigned long arg)
 	rhashtable_remove_fast(&head->ht, &f->ht_node,
 			       head->ht_params);
 	list_del_rcu(&f->list);
+	fl_hw_destroy_filter(tp, (u64)f);
 	tcf_unbind_filter(tp, &f->res);
 	call_rcu(&f->rcu, fl_destroy_filter);
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From 8de2d793daf784f8f109565bcc023a6d198bad85 Mon Sep 17 00:00:00 2001
From: Amir Vadai <amir@vadai.me>
Date: Tue, 8 Mar 2016 12:42:30 +0200
Subject: net/flow_dissector: Make dissector_uses_key() and
 skb_flow_dissector_target() public

Will be used in a following patch to query if a key is being used, and
what it's value in the target object.

Acked-by: John Fastabend <john.r.fastabend@intel.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Amir Vadai <amir@vadai.me>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h | 13 +++++++++++++
 net/core/flow_dissector.c    | 13 -------------
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 8c8548cf5888..d3d60dccd19f 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -184,4 +184,17 @@ static inline bool flow_keys_have_l4(struct flow_keys *keys)
 
 u32 flow_hash_from_keys(struct flow_keys *keys);
 
+static inline bool dissector_uses_key(const struct flow_dissector *flow_dissector,
+				      enum flow_dissector_key_id key_id)
+{
+	return flow_dissector->used_keys & (1 << key_id);
+}
+
+static inline void *skb_flow_dissector_target(struct flow_dissector *flow_dissector,
+					      enum flow_dissector_key_id key_id,
+					      void *target_container)
+{
+	return ((char *)target_container) + flow_dissector->offset[key_id];
+}
+
 #endif
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 7c7b8739b8b8..a669dea146c6 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -19,25 +19,12 @@
 #include <net/flow_dissector.h>
 #include <scsi/fc/fc_fcoe.h>
 
-static bool dissector_uses_key(const struct flow_dissector *flow_dissector,
-			       enum flow_dissector_key_id key_id)
-{
-	return flow_dissector->used_keys & (1 << key_id);
-}
-
 static void dissector_set_key(struct flow_dissector *flow_dissector,
 			      enum flow_dissector_key_id key_id)
 {
 	flow_dissector->used_keys |= (1 << key_id);
 }
 
-static void *skb_flow_dissector_target(struct flow_dissector *flow_dissector,
-				       enum flow_dissector_key_id key_id,
-				       void *target_container)
-{
-	return ((char *) target_container) + flow_dissector->offset[key_id];
-}
-
 void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
 			     const struct flow_dissector_key *key,
 			     unsigned int key_count)
-- 
cgit v1.2.3-59-g8ed1b


From 7cf20fc62428367bbf853a8d968804a6ec6a4973 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 3 Mar 2016 09:36:06 +0100
Subject: net/9p: convert to new CQ API

Trivial conversion to the new RDMA CQ API.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Dominique Martinet <dominique.martinet@cea.fr>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 net/9p/trans_rdma.c | 86 +++++++++++++++++++----------------------------------
 1 file changed, 31 insertions(+), 55 deletions(-)

(limited to 'net')

diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index 52b4a2f993f2..1852e383afd6 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -109,14 +109,13 @@ struct p9_trans_rdma {
 /**
  * p9_rdma_context - Keeps track of in-process WR
  *
- * @wc_op: The original WR op for when the CQE completes in error.
  * @busa: Bus address to unmap when the WR completes
  * @req: Keeps track of requests (send)
  * @rc: Keepts track of replies (receive)
  */
 struct p9_rdma_req;
 struct p9_rdma_context {
-	enum ib_wc_opcode wc_op;
+	struct ib_cqe cqe;
 	dma_addr_t busa;
 	union {
 		struct p9_req_t *req;
@@ -284,9 +283,12 @@ p9_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
 }
 
 static void
-handle_recv(struct p9_client *client, struct p9_trans_rdma *rdma,
-	    struct p9_rdma_context *c, enum ib_wc_status status, u32 byte_len)
+recv_done(struct ib_cq *cq, struct ib_wc *wc)
 {
+	struct p9_client *client = cq->cq_context;
+	struct p9_trans_rdma *rdma = client->trans;
+	struct p9_rdma_context *c =
+		container_of(wc->wr_cqe, struct p9_rdma_context, cqe);
 	struct p9_req_t *req;
 	int err = 0;
 	int16_t tag;
@@ -295,7 +297,7 @@ handle_recv(struct p9_client *client, struct p9_trans_rdma *rdma,
 	ib_dma_unmap_single(rdma->cm_id->device, c->busa, client->msize,
 							 DMA_FROM_DEVICE);
 
-	if (status != IB_WC_SUCCESS)
+	if (wc->status != IB_WC_SUCCESS)
 		goto err_out;
 
 	err = p9_parse_header(c->rc, NULL, NULL, &tag, 1);
@@ -316,21 +318,32 @@ handle_recv(struct p9_client *client, struct p9_trans_rdma *rdma,
 	req->rc = c->rc;
 	p9_client_cb(client, req, REQ_STATUS_RCVD);
 
+ out:
+	up(&rdma->rq_sem);
+	kfree(c);
 	return;
 
  err_out:
-	p9_debug(P9_DEBUG_ERROR, "req %p err %d status %d\n", req, err, status);
+	p9_debug(P9_DEBUG_ERROR, "req %p err %d status %d\n",
+			req, err, wc->status);
 	rdma->state = P9_RDMA_FLUSHING;
 	client->status = Disconnected;
+	goto out;
 }
 
 static void
-handle_send(struct p9_client *client, struct p9_trans_rdma *rdma,
-	    struct p9_rdma_context *c, enum ib_wc_status status, u32 byte_len)
+send_done(struct ib_cq *cq, struct ib_wc *wc)
 {
+	struct p9_client *client = cq->cq_context;
+	struct p9_trans_rdma *rdma = client->trans;
+	struct p9_rdma_context *c =
+		container_of(wc->wr_cqe, struct p9_rdma_context, cqe);
+
 	ib_dma_unmap_single(rdma->cm_id->device,
 			    c->busa, c->req->tc->size,
 			    DMA_TO_DEVICE);
+	up(&rdma->sq_sem);
+	kfree(c);
 }
 
 static void qp_event_handler(struct ib_event *event, void *context)
@@ -339,42 +352,6 @@ static void qp_event_handler(struct ib_event *event, void *context)
 		 event->event, context);
 }
 
-static void cq_comp_handler(struct ib_cq *cq, void *cq_context)
-{
-	struct p9_client *client = cq_context;
-	struct p9_trans_rdma *rdma = client->trans;
-	int ret;
-	struct ib_wc wc;
-
-	ib_req_notify_cq(rdma->cq, IB_CQ_NEXT_COMP);
-	while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) {
-		struct p9_rdma_context *c = (void *) (unsigned long) wc.wr_id;
-
-		switch (c->wc_op) {
-		case IB_WC_RECV:
-			handle_recv(client, rdma, c, wc.status, wc.byte_len);
-			up(&rdma->rq_sem);
-			break;
-
-		case IB_WC_SEND:
-			handle_send(client, rdma, c, wc.status, wc.byte_len);
-			up(&rdma->sq_sem);
-			break;
-
-		default:
-			pr_err("unexpected completion type, c->wc_op=%d, wc.opcode=%d, status=%d\n",
-			       c->wc_op, wc.opcode, wc.status);
-			break;
-		}
-		kfree(c);
-	}
-}
-
-static void cq_event_handler(struct ib_event *e, void *v)
-{
-	p9_debug(P9_DEBUG_ERROR, "CQ event %d context %p\n", e->event, v);
-}
-
 static void rdma_destroy_trans(struct p9_trans_rdma *rdma)
 {
 	if (!rdma)
@@ -387,7 +364,7 @@ static void rdma_destroy_trans(struct p9_trans_rdma *rdma)
 		ib_dealloc_pd(rdma->pd);
 
 	if (rdma->cq && !IS_ERR(rdma->cq))
-		ib_destroy_cq(rdma->cq);
+		ib_free_cq(rdma->cq);
 
 	if (rdma->cm_id && !IS_ERR(rdma->cm_id))
 		rdma_destroy_id(rdma->cm_id);
@@ -408,13 +385,14 @@ post_recv(struct p9_client *client, struct p9_rdma_context *c)
 	if (ib_dma_mapping_error(rdma->cm_id->device, c->busa))
 		goto error;
 
+	c->cqe.done = recv_done;
+
 	sge.addr = c->busa;
 	sge.length = client->msize;
 	sge.lkey = rdma->pd->local_dma_lkey;
 
 	wr.next = NULL;
-	c->wc_op = IB_WC_RECV;
-	wr.wr_id = (unsigned long) c;
+	wr.wr_cqe = &c->cqe;
 	wr.sg_list = &sge;
 	wr.num_sge = 1;
 	return ib_post_recv(rdma->qp, &wr, &bad_wr);
@@ -499,13 +477,14 @@ dont_need_post_recv:
 		goto send_error;
 	}
 
+	c->cqe.done = send_done;
+
 	sge.addr = c->busa;
 	sge.length = c->req->tc->size;
 	sge.lkey = rdma->pd->local_dma_lkey;
 
 	wr.next = NULL;
-	c->wc_op = IB_WC_SEND;
-	wr.wr_id = (unsigned long) c;
+	wr.wr_cqe = &c->cqe;
 	wr.opcode = IB_WR_SEND;
 	wr.send_flags = IB_SEND_SIGNALED;
 	wr.sg_list = &sge;
@@ -642,7 +621,6 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args)
 	struct p9_trans_rdma *rdma;
 	struct rdma_conn_param conn_param;
 	struct ib_qp_init_attr qp_attr;
-	struct ib_cq_init_attr cq_attr = {};
 
 	/* Parse the transport specific mount options */
 	err = parse_opts(args, &opts);
@@ -695,13 +673,11 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args)
 		goto error;
 
 	/* Create the Completion Queue */
-	cq_attr.cqe = opts.sq_depth + opts.rq_depth + 1;
-	rdma->cq = ib_create_cq(rdma->cm_id->device, cq_comp_handler,
-				cq_event_handler, client,
-				&cq_attr);
+	rdma->cq = ib_alloc_cq(rdma->cm_id->device, client,
+			opts.sq_depth + opts.rq_depth + 1,
+			0, IB_POLL_SOFTIRQ);
 	if (IS_ERR(rdma->cq))
 		goto error;
-	ib_req_notify_cq(rdma->cq, IB_CQ_NEXT_COMP);
 
 	/* Create the Protection Domain */
 	rdma->pd = ib_alloc_pd(rdma->cm_id->device);
-- 
cgit v1.2.3-59-g8ed1b


From f0716cd6eb89f769bff3b37120219720c994bb7e Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 9 Mar 2016 00:04:21 +0100
Subject: netfilter: nft_compat: check match/targetinfo attr size

We copy according to ->target|matchsize, so check that the netlink attribute
(which can include padding and might be larger) contains enough data.

Reported-by: Julia Lawall <Julia.Lawall@lip6.fr>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_compat.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 454841baa4d0..6228c422c766 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -660,6 +660,9 @@ nft_match_select_ops(const struct nft_ctx *ctx,
 	if (IS_ERR(match))
 		return ERR_PTR(-ENOENT);
 
+	if (match->matchsize > nla_len(tb[NFTA_MATCH_INFO]))
+		return ERR_PTR(-EINVAL);
+
 	/* This is the first time we use this match, allocate operations */
 	nft_match = kzalloc(sizeof(struct nft_xt), GFP_KERNEL);
 	if (nft_match == NULL)
@@ -740,6 +743,9 @@ nft_target_select_ops(const struct nft_ctx *ctx,
 	if (IS_ERR(target))
 		return ERR_PTR(-ENOENT);
 
+	if (target->targetsize > nla_len(tb[NFTA_TARGET_INFO]))
+		return ERR_PTR(-EINVAL);
+
 	/* This is the first time we use this target, allocate operations */
 	nft_target = kzalloc(sizeof(struct nft_xt), GFP_KERNEL);
 	if (nft_target == NULL)
-- 
cgit v1.2.3-59-g8ed1b


From 6a19cc8c892b113fb84947207346f783de76dbd1 Mon Sep 17 00:00:00 2001
From: Johan Hedberg <johan.hedberg@intel.com>
Date: Fri, 11 Mar 2016 09:56:32 +0200
Subject: Bluetooth: Fix setting correct flags in AD

A recent change added MGMT_ADV_FLAG_DISCOV to the flags returned by
get_adv_instance_flags(), however failed to take into account limited
discoverable mode. This patch fixes the issue by setting the correct
discoverability flag in the AD data.

Signed-off-by: Johan Hedberg <johan.hedberg@intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 net/bluetooth/hci_request.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index 95a545ca9dbc..6e125d76df0d 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -872,7 +872,9 @@ static u32 get_adv_instance_flags(struct hci_dev *hdev, u8 instance)
 		if (hci_dev_test_flag(hdev, HCI_ADVERTISING_CONNECTABLE))
 			flags |= MGMT_ADV_FLAG_CONNECTABLE;
 
-		if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE))
+		if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE))
+			flags |= MGMT_ADV_FLAG_LIMITED_DISCOV;
+		else if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE))
 			flags |= MGMT_ADV_FLAG_DISCOV;
 
 		return flags;
-- 
cgit v1.2.3-59-g8ed1b


From 6a0e78072c2ae7b20b14e0249d8108441ea928d2 Mon Sep 17 00:00:00 2001
From: Johan Hedberg <johan.hedberg@intel.com>
Date: Fri, 11 Mar 2016 09:56:33 +0200
Subject: Bluetooth: Fix potential buffer overflow with Add Advertising

The Add Advertising command handler does the appropriate checks for
the AD and Scan Response data, however fails to take into account the
general length of the mgmt command itself, which could lead to
potential buffer overflows. This patch adds the necessary check that
the mgmt command length is consistent with the given ad and scan_rsp
lengths.

Signed-off-by: Johan Hedberg <johan.hedberg@intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
Cc: stable@vger.kernel.org
---
 net/bluetooth/mgmt.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index c856fb65812c..9e4b931588cf 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -5995,6 +5995,10 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev,
 		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
 				       MGMT_STATUS_INVALID_PARAMS);
 
+	if (data_len != sizeof(*cp) + cp->adv_data_len + cp->scan_rsp_len)
+		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
+				       MGMT_STATUS_INVALID_PARAMS);
+
 	flags = __le32_to_cpu(cp->flags);
 	timeout = __le16_to_cpu(cp->timeout);
 	duration = __le16_to_cpu(cp->duration);
-- 
cgit v1.2.3-59-g8ed1b


From 8208d21bf309551686b7a76d19059ae182a956d0 Mon Sep 17 00:00:00 2001
From: Amir Vadai <amir@vadai.me>
Date: Fri, 11 Mar 2016 11:08:45 +0200
Subject: net/flower: Fix pointer cast

Cast pointer to unsigned long instead of u64, to fix compilation warning
on 32 bit arch, spotted by 0day build.

Fixes: 5b33f48 ("net/flower: Introduce hardware offload support")
Signed-off-by: Amir Vadai <amir@vadai.me>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h  |  2 +-
 net/sched/cls_flower.c | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 5b4e8f08b8f0..caa5e18636df 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -416,7 +416,7 @@ enum tc_fl_command {
 
 struct tc_cls_flower_offload {
 	enum tc_fl_command command;
-	u64 cookie;
+	unsigned long cookie;
 	struct flow_dissector *dissector;
 	struct fl_flow_key *mask;
 	struct fl_flow_key *key;
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 25d87666bf1e..2181ffc76638 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -165,7 +165,7 @@ static void fl_destroy_filter(struct rcu_head *head)
 	kfree(f);
 }
 
-static void fl_hw_destroy_filter(struct tcf_proto *tp, u64 cookie)
+static void fl_hw_destroy_filter(struct tcf_proto *tp, unsigned long cookie)
 {
 	struct net_device *dev = tp->q->dev_queue->dev;
 	struct tc_cls_flower_offload offload = {0};
@@ -188,7 +188,7 @@ static void fl_hw_replace_filter(struct tcf_proto *tp,
 				 struct fl_flow_key *mask,
 				 struct fl_flow_key *key,
 				 struct tcf_exts *actions,
-				 u64 cookie, u32 flags)
+				 unsigned long cookie, u32 flags)
 {
 	struct net_device *dev = tp->q->dev_queue->dev;
 	struct tc_cls_flower_offload offload = {0};
@@ -219,7 +219,7 @@ static bool fl_destroy(struct tcf_proto *tp, bool force)
 		return false;
 
 	list_for_each_entry_safe(f, next, &head->filters, list) {
-		fl_hw_destroy_filter(tp, (u64)f);
+		fl_hw_destroy_filter(tp, (unsigned long)f);
 		list_del_rcu(&f->list);
 		call_rcu(&f->rcu, fl_destroy_filter);
 	}
@@ -554,13 +554,13 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 			     &mask.key,
 			     &fnew->key,
 			     &fnew->exts,
-			     (u64)fnew,
+			     (unsigned long)fnew,
 			     flags);
 
 	if (fold) {
 		rhashtable_remove_fast(&head->ht, &fold->ht_node,
 				       head->ht_params);
-		fl_hw_destroy_filter(tp, (u64)fold);
+		fl_hw_destroy_filter(tp, (unsigned long)fold);
 	}
 
 	*arg = (unsigned long) fnew;
@@ -588,7 +588,7 @@ static int fl_delete(struct tcf_proto *tp, unsigned long arg)
 	rhashtable_remove_fast(&head->ht, &f->ht_node,
 			       head->ht_params);
 	list_del_rcu(&f->list);
-	fl_hw_destroy_filter(tp, (u64)f);
+	fl_hw_destroy_filter(tp, (unsigned long)f);
 	tcf_unbind_filter(tp, &f->res);
 	call_rcu(&f->rcu, fl_destroy_filter);
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From 4c656c13b254d598e83e586b7b4d36a2043dad85 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemming@brocade.com>
Date: Tue, 8 Mar 2016 12:59:35 -0800
Subject: bridge: allow zero ageing time

This fixes a regression in the bridge ageing time caused by:
commit c62987bbd8a1 ("bridge: push bridge setting ageing_time down to switchdev")

There are users of Linux bridge which use the feature that if ageing time
is set to 0 it causes entries to never expire. See:
  https://www.linuxfoundation.org/collaborate/workgroups/networking/bridge

For a pure software bridge, it is unnecessary for the code to have
arbitrary restrictions on what values are allowable.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_bridge.h |  4 ----
 net/bridge/br_stp.c       | 11 ++++++++---
 2 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index a338a688ee4a..dcb89e3515db 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -46,10 +46,6 @@ struct br_ip_list {
 #define BR_LEARNING_SYNC	BIT(9)
 #define BR_PROXYARP_WIFI	BIT(10)
 
-/* values as per ieee8021QBridgeFdbAgingTime */
-#define BR_MIN_AGEING_TIME	(10 * HZ)
-#define BR_MAX_AGEING_TIME	(1000000 * HZ)
-
 #define BR_DEFAULT_AGEING_TIME	(300 * HZ)
 
 extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __user *));
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index c22816a0b1b1..e23449094188 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -562,6 +562,14 @@ int br_set_max_age(struct net_bridge *br, unsigned long val)
 
 }
 
+/* Set time interval that dynamic forwarding entries live
+ * For pure software bridge, allow values outside the 802.1
+ * standard specification for special cases:
+ *  0 - entry never ages (all permanant)
+ *  1 - entry disappears (no persistance)
+ *
+ * Offloaded switch entries maybe more restrictive
+ */
 int br_set_ageing_time(struct net_bridge *br, u32 ageing_time)
 {
 	struct switchdev_attr attr = {
@@ -573,9 +581,6 @@ int br_set_ageing_time(struct net_bridge *br, u32 ageing_time)
 	unsigned long t = clock_t_to_jiffies(ageing_time);
 	int err;
 
-	if (t < BR_MIN_AGEING_TIME || t > BR_MAX_AGEING_TIME)
-		return -ERANGE;
-
 	err = switchdev_port_attr_set(br->dev, &attr);
 	if (err)
 		return err;
-- 
cgit v1.2.3-59-g8ed1b


From 134611446dc657e1bbc73ca0e4e6b599df687db0 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 9 Mar 2016 03:00:02 +0100
Subject: ip_tunnel: add support for setting flow label via collect metadata

This patch extends udp_tunnel6_xmit_skb() to pass in the IPv6 flow label
from call sites. Currently, there's no such option and it's always set to
zero when writing ip6_flow_hdr(). Add a label member to ip_tunnel_key, so
that flow-based tunnels via collect metadata frontends can make use of it.
vxlan and geneve will be converted to add flow label support separately.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c       | 2 +-
 drivers/net/vxlan.c        | 2 +-
 include/net/dst_metadata.h | 5 ++++-
 include/net/ip_tunnels.h   | 4 +++-
 include/net/udp_tunnel.h   | 4 ++--
 net/ipv6/ip6_udp_tunnel.c  | 6 +++---
 net/tipc/udp_media.c       | 2 +-
 7 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 6a0cbbe03e5d..89ccff79d76c 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -1054,7 +1054,7 @@ static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 		ttl = ttl ? : ip6_dst_hoplimit(dst);
 	}
 	udp_tunnel6_xmit_skb(dst, gs6->sock->sk, skb, dev,
-			     &fl6.saddr, &fl6.daddr, prio, ttl,
+			     &fl6.saddr, &fl6.daddr, prio, ttl, 0,
 			     sport, geneve->dst_port,
 			     !!(flags & GENEVE_F_UDP_ZERO_CSUM6_TX));
 	return NETDEV_TX_OK;
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 2399099e68cf..8bdcd5ea8424 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2066,7 +2066,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 			return;
 		}
 		udp_tunnel6_xmit_skb(ndst, sk, skb, dev,
-				     &saddr, &dst->sin6.sin6_addr, tos, ttl,
+				     &saddr, &dst->sin6.sin6_addr, tos, ttl, 0,
 				     src_port, dst_port, !udp_sum);
 #endif
 	}
diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index 84b833af6882..5db9f5910428 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -126,7 +126,7 @@ static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb,
 
 	ip_tunnel_key_init(&tun_dst->u.tun_info.key,
 			   iph->saddr, iph->daddr, iph->tos, iph->ttl,
-			   0, 0, tunnel_id, flags);
+			   0, 0, 0, tunnel_id, flags);
 	return tun_dst;
 }
 
@@ -152,8 +152,11 @@ static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb,
 
 	info->key.u.ipv6.src = ip6h->saddr;
 	info->key.u.ipv6.dst = ip6h->daddr;
+
 	info->key.tos = ipv6_get_dsfield(ip6h);
 	info->key.ttl = ip6h->hop_limit;
+	info->key.label = ip6_flowlabel(ip6h);
+
 	return tun_dst;
 }
 
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 0acd80fadb32..5dc2e454f866 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -48,6 +48,7 @@ struct ip_tunnel_key {
 	__be16			tun_flags;
 	u8			tos;		/* TOS for IPv4, TC for IPv6 */
 	u8			ttl;		/* TTL for IPv4, HL for IPv6 */
+	__be32			label;		/* Flow Label for IPv6 */
 	__be16			tp_src;
 	__be16			tp_dst;
 };
@@ -181,7 +182,7 @@ int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op,
 
 static inline void ip_tunnel_key_init(struct ip_tunnel_key *key,
 				      __be32 saddr, __be32 daddr,
-				      u8 tos, u8 ttl,
+				      u8 tos, u8 ttl, __be32 label,
 				      __be16 tp_src, __be16 tp_dst,
 				      __be64 tun_id, __be16 tun_flags)
 {
@@ -192,6 +193,7 @@ static inline void ip_tunnel_key_init(struct ip_tunnel_key *key,
 	       0, IP_TUNNEL_KEY_IPV4_PAD_LEN);
 	key->tos = tos;
 	key->ttl = ttl;
+	key->label = label;
 	key->tun_flags = tun_flags;
 
 	/* For the tunnel types on the top of IPsec, the tp_src and tp_dst of
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 97f5adb121a6..b83114077cee 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -88,8 +88,8 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
 			 struct sk_buff *skb,
 			 struct net_device *dev, struct in6_addr *saddr,
 			 struct in6_addr *daddr,
-			 __u8 prio, __u8 ttl, __be16 src_port,
-			 __be16 dst_port, bool nocheck);
+			 __u8 prio, __u8 ttl, __be32 label,
+			 __be16 src_port, __be16 dst_port, bool nocheck);
 #endif
 
 void udp_tunnel_sock_release(struct socket *sock);
diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c
index 14dacf1df529..a7520528ecd2 100644
--- a/net/ipv6/ip6_udp_tunnel.c
+++ b/net/ipv6/ip6_udp_tunnel.c
@@ -73,8 +73,8 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
 			 struct sk_buff *skb,
 			 struct net_device *dev, struct in6_addr *saddr,
 			 struct in6_addr *daddr,
-			 __u8 prio, __u8 ttl, __be16 src_port,
-			 __be16 dst_port, bool nocheck)
+			 __u8 prio, __u8 ttl, __be32 label,
+			 __be16 src_port, __be16 dst_port, bool nocheck)
 {
 	struct udphdr *uh;
 	struct ipv6hdr *ip6h;
@@ -98,7 +98,7 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
 	__skb_push(skb, sizeof(*ip6h));
 	skb_reset_network_header(skb);
 	ip6h		  = ipv6_hdr(skb);
-	ip6_flow_hdr(ip6h, prio, htonl(0));
+	ip6_flow_hdr(ip6h, prio, label);
 	ip6h->payload_len = htons(skb->len);
 	ip6h->nexthdr     = IPPROTO_UDP;
 	ip6h->hop_limit   = ttl;
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index 49b3c2ede7ab..c94f9a15e2cd 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -196,7 +196,7 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,
 		ttl = ip6_dst_hoplimit(ndst);
 		err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb,
 					   ndst->dev, &src->ipv6,
-					   &dst->ipv6, 0, ttl, src->udp_port,
+					   &dst->ipv6, 0, ttl, 0, src->udp_port,
 					   dst->udp_port, false);
 #endif
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 4018ab1875e0d00b84ac61bc15427136ad55849e Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 9 Mar 2016 03:00:05 +0100
Subject: bpf: support flow label for bpf_skb_{set, get}_tunnel_key

This patch extends bpf_tunnel_key with a tunnel_label member, that maps
to ip_tunnel_key's label so underlying backends like vxlan and geneve
can propagate the label to udp_tunnel6_xmit_skb(), where it's being set
in the IPv6 header. It allows for having 20 more bits to encode/decode
flow related meta information programmatically. Tested with vxlan and
geneve.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h |  1 +
 net/core/filter.c        | 14 ++++++++++++--
 2 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0e30b19012a5..924f537183fd 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -375,6 +375,7 @@ struct bpf_tunnel_key {
 	};
 	__u8 tunnel_tos;
 	__u8 tunnel_ttl;
+	__u32 tunnel_label;
 };
 
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/net/core/filter.c b/net/core/filter.c
index a66dc03c261f..6fc3893a6170 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1770,12 +1770,15 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 		return -EPROTO;
 	if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
 		switch (size) {
+		case offsetof(struct bpf_tunnel_key, tunnel_label):
+			goto set_compat;
 		case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
 			/* Fixup deprecated structure layouts here, so we have
 			 * a common path later on.
 			 */
 			if (ip_tunnel_info_af(info) != AF_INET)
 				return -EINVAL;
+set_compat:
 			to = (struct bpf_tunnel_key *)compat;
 			break;
 		default:
@@ -1787,11 +1790,13 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 	to->tunnel_tos = info->key.tos;
 	to->tunnel_ttl = info->key.ttl;
 
-	if (flags & BPF_F_TUNINFO_IPV6)
+	if (flags & BPF_F_TUNINFO_IPV6) {
 		memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
 		       sizeof(to->remote_ipv6));
-	else
+		to->tunnel_label = be32_to_cpu(info->key.label);
+	} else {
 		to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
+	}
 
 	if (unlikely(size != sizeof(struct bpf_tunnel_key)))
 		memcpy((void *)(long) r2, to, size);
@@ -1850,6 +1855,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 		return -EINVAL;
 	if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
 		switch (size) {
+		case offsetof(struct bpf_tunnel_key, tunnel_label):
 		case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
 			/* Fixup deprecated structure layouts here, so we have
 			 * a common path later on.
@@ -1862,6 +1868,8 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 			return -EINVAL;
 		}
 	}
+	if (unlikely(!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label))
+		return -EINVAL;
 
 	skb_dst_drop(skb);
 	dst_hold((struct dst_entry *) md);
@@ -1882,6 +1890,8 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 		info->mode |= IP_TUNNEL_INFO_IPV6;
 		memcpy(&info->key.u.ipv6.dst, from->remote_ipv6,
 		       sizeof(from->remote_ipv6));
+		info->key.label = cpu_to_be32(from->tunnel_label) &
+				  IPV6_FLOWLABEL_MASK;
 	} else {
 		info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
 		if (flags & BPF_F_ZERO_CSUM_TX)
-- 
cgit v1.2.3-59-g8ed1b


From d157bd761585605b7882935ffb86286919f62ea1 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 10 Mar 2016 01:56:23 +0100
Subject: netfilter: x_tables: check for size overflow

Ben Hawkes says:
 integer overflow in xt_alloc_table_info, which on 32-bit systems can
 lead to small structure allocation and a copy_from_user based heap
 corruption.

Reported-by: Ben Hawkes <hawkes@google.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/x_tables.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index d0cd2b9bf844..582c9cfd6567 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -659,6 +659,9 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size)
 	struct xt_table_info *info = NULL;
 	size_t sz = sizeof(*info) + size;
 
+	if (sz < sizeof(*info))
+		return NULL;
+
 	/* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */
 	if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages)
 		return NULL;
-- 
cgit v1.2.3-59-g8ed1b


From c194cf93c164ed1c71142485ee0f70f9f2d1fe35 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Wed, 9 Mar 2016 09:24:23 -0800
Subject: gro: Defer clearing of flush bit in tunnel paths

This patch updates the GRO handlers for GRE, VXLAN, GENEVE, and FOU so that
we do not clear the flush bit until after we have called the next level GRO
handler.  Previously this was being cleared before parsing through the list
of frames, however this resulted in several paths where either the bit
needed to be reset but wasn't as in the case of FOU, or cases where it was
being set as in GENEVE.  By just deferring the clearing of the bit until
after the next level protocol has been parsed we can avoid any unnecessary
bit twiddling and avoid bugs.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c   | 7 ++-----
 drivers/net/vxlan.c    | 3 +--
 net/ipv4/fou.c         | 3 +--
 net/ipv4/gre_offload.c | 3 +--
 4 files changed, 5 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 33185b9a435e..192631a345df 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -463,8 +463,6 @@ static struct sk_buff **geneve_gro_receive(struct sk_buff **head,
 			goto out;
 	}
 
-	flush = 0;
-
 	for (p = *head; p; p = p->next) {
 		if (!NAPI_GRO_CB(p)->same_flow)
 			continue;
@@ -481,14 +479,13 @@ static struct sk_buff **geneve_gro_receive(struct sk_buff **head,
 
 	rcu_read_lock();
 	ptype = gro_find_receive_by_type(type);
-	if (!ptype) {
-		flush = 1;
+	if (!ptype)
 		goto out_unlock;
-	}
 
 	skb_gro_pull(skb, gh_len);
 	skb_gro_postpull_rcsum(skb, gh, gh_len);
 	pp = ptype->callbacks.gro_receive(head, skb);
+	flush = 0;
 
 out_unlock:
 	rcu_read_unlock();
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 8eda76f9e474..800106a7246c 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -591,8 +591,6 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head,
 
 	skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */
 
-	flush = 0;
-
 	for (p = *head; p; p = p->next) {
 		if (!NAPI_GRO_CB(p)->same_flow)
 			continue;
@@ -606,6 +604,7 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head,
 	}
 
 	pp = eth_gro_receive(head, skb);
+	flush = 0;
 
 out:
 	skb_gro_remcsum_cleanup(skb, &grc);
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 88dab0c1670c..780484243e14 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -319,8 +319,6 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head,
 
 	skb_gro_pull(skb, hdrlen);
 
-	flush = 0;
-
 	for (p = *head; p; p = p->next) {
 		const struct guehdr *guehdr2;
 
@@ -352,6 +350,7 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head,
 		goto out_unlock;
 
 	pp = ops->callbacks.gro_receive(head, skb);
+	flush = 0;
 
 out_unlock:
 	rcu_read_unlock();
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 47f4c544c916..540866dbd27d 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -175,8 +175,6 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head,
 					     null_compute_pseudo);
 	}
 
-	flush = 0;
-
 	for (p = *head; p; p = p->next) {
 		const struct gre_base_hdr *greh2;
 
@@ -213,6 +211,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head,
 	skb_gro_postpull_rcsum(skb, greh, grehlen);
 
 	pp = ptype->callbacks.gro_receive(head, skb);
+	flush = 0;
 
 out_unlock:
 	rcu_read_unlock();
-- 
cgit v1.2.3-59-g8ed1b


From dad8aff754247781514c9d38f6db89557685136d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 9 Mar 2016 23:22:56 +0000
Subject: rxrpc: Replace all unsigned with unsigned int

Replace all "unsigned" types with "unsigned int" types.

Reported-by: David Miller <davem@davemloft.net>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rxrpc/af_rxrpc.c      |  2 +-
 net/rxrpc/ar-ack.c        | 12 ++++++------
 net/rxrpc/ar-call.c       |  4 ++--
 net/rxrpc/ar-connection.c |  2 +-
 net/rxrpc/ar-internal.h   | 22 +++++++++++-----------
 net/rxrpc/ar-output.c     |  2 +-
 net/rxrpc/ar-transport.c  |  2 +-
 net/rxrpc/sysctl.c        | 32 ++++++++++++++++----------------
 8 files changed, 39 insertions(+), 39 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index a76501757b59..9d935fa5a2a9 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -81,7 +81,7 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx,
 				  struct sockaddr_rxrpc *srx,
 				  int len)
 {
-	unsigned tail;
+	unsigned int tail;
 
 	if (len < sizeof(struct sockaddr_rxrpc))
 		return -EINVAL;
diff --git a/net/rxrpc/ar-ack.c b/net/rxrpc/ar-ack.c
index 20f3f001694e..16d967075eaf 100644
--- a/net/rxrpc/ar-ack.c
+++ b/net/rxrpc/ar-ack.c
@@ -23,7 +23,7 @@
  * How long to wait before scheduling ACK generation after seeing a
  * packet with RXRPC_REQUEST_ACK set (in jiffies).
  */
-unsigned rxrpc_requested_ack_delay = 1;
+unsigned int rxrpc_requested_ack_delay = 1;
 
 /*
  * How long to wait before scheduling an ACK with subtype DELAY (in jiffies).
@@ -32,7 +32,7 @@ unsigned rxrpc_requested_ack_delay = 1;
  * all consumed within this time we will send a DELAY ACK if an ACK was not
  * requested to let the sender know it doesn't need to resend.
  */
-unsigned rxrpc_soft_ack_delay = 1 * HZ;
+unsigned int rxrpc_soft_ack_delay = 1 * HZ;
 
 /*
  * How long to wait before scheduling an ACK with subtype IDLE (in jiffies).
@@ -41,7 +41,7 @@ unsigned rxrpc_soft_ack_delay = 1 * HZ;
  * further packets aren't immediately received to decide when to send an IDLE
  * ACK let the other end know that it can free up its Tx buffer space.
  */
-unsigned rxrpc_idle_ack_delay = 0.5 * HZ;
+unsigned int rxrpc_idle_ack_delay = 0.5 * HZ;
 
 /*
  * Receive window size in packets.  This indicates the maximum number of
@@ -49,19 +49,19 @@ unsigned rxrpc_idle_ack_delay = 0.5 * HZ;
  * limit is hit, we should generate an EXCEEDS_WINDOW ACK and discard further
  * packets.
  */
-unsigned rxrpc_rx_window_size = 32;
+unsigned int rxrpc_rx_window_size = 32;
 
 /*
  * Maximum Rx MTU size.  This indicates to the sender the size of jumbo packet
  * made by gluing normal packets together that we're willing to handle.
  */
-unsigned rxrpc_rx_mtu = 5692;
+unsigned int rxrpc_rx_mtu = 5692;
 
 /*
  * The maximum number of fragments in a received jumbo packet that we tell the
  * sender that we're willing to handle.
  */
-unsigned rxrpc_rx_jumbo_max = 4;
+unsigned int rxrpc_rx_jumbo_max = 4;
 
 static const char *rxrpc_acks(u8 reason)
 {
diff --git a/net/rxrpc/ar-call.c b/net/rxrpc/ar-call.c
index 4a499e0100f1..7c8d300ade9b 100644
--- a/net/rxrpc/ar-call.c
+++ b/net/rxrpc/ar-call.c
@@ -21,12 +21,12 @@
 /*
  * Maximum lifetime of a call (in jiffies).
  */
-unsigned rxrpc_max_call_lifetime = 60 * HZ;
+unsigned int rxrpc_max_call_lifetime = 60 * HZ;
 
 /*
  * Time till dead call expires after last use (in jiffies).
  */
-unsigned rxrpc_dead_call_expiry = 2 * HZ;
+unsigned int rxrpc_dead_call_expiry = 2 * HZ;
 
 const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = {
 	[RXRPC_CALL_CLIENT_SEND_REQUEST]	= "ClSndReq",
diff --git a/net/rxrpc/ar-connection.c b/net/rxrpc/ar-connection.c
index 53df14cb8d25..9942da1edbf6 100644
--- a/net/rxrpc/ar-connection.c
+++ b/net/rxrpc/ar-connection.c
@@ -21,7 +21,7 @@
 /*
  * Time till a connection expires after last use (in seconds).
  */
-unsigned rxrpc_connection_expiry = 10 * 60;
+unsigned int rxrpc_connection_expiry = 10 * 60;
 
 static void rxrpc_connection_reaper(struct work_struct *work);
 
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 8b495aed517d..a3002f4ddc90 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -478,12 +478,12 @@ int rxrpc_reject_call(struct rxrpc_sock *);
 /*
  * ar-ack.c
  */
-extern unsigned rxrpc_requested_ack_delay;
-extern unsigned rxrpc_soft_ack_delay;
-extern unsigned rxrpc_idle_ack_delay;
-extern unsigned rxrpc_rx_window_size;
-extern unsigned rxrpc_rx_mtu;
-extern unsigned rxrpc_rx_jumbo_max;
+extern unsigned int rxrpc_requested_ack_delay;
+extern unsigned int rxrpc_soft_ack_delay;
+extern unsigned int rxrpc_idle_ack_delay;
+extern unsigned int rxrpc_rx_window_size;
+extern unsigned int rxrpc_rx_mtu;
+extern unsigned int rxrpc_rx_jumbo_max;
 
 void __rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool);
 void rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool);
@@ -492,8 +492,8 @@ void rxrpc_process_call(struct work_struct *);
 /*
  * ar-call.c
  */
-extern unsigned rxrpc_max_call_lifetime;
-extern unsigned rxrpc_dead_call_expiry;
+extern unsigned int rxrpc_max_call_lifetime;
+extern unsigned int rxrpc_dead_call_expiry;
 extern struct kmem_cache *rxrpc_call_jar;
 extern struct list_head rxrpc_calls;
 extern rwlock_t rxrpc_call_lock;
@@ -516,7 +516,7 @@ void __exit rxrpc_destroy_all_calls(void);
 /*
  * ar-connection.c
  */
-extern unsigned rxrpc_connection_expiry;
+extern unsigned int rxrpc_connection_expiry;
 extern struct list_head rxrpc_connections;
 extern rwlock_t rxrpc_connection_lock;
 
@@ -579,7 +579,7 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *, time_t,
 /*
  * ar-output.c
  */
-extern unsigned rxrpc_resend_timeout;
+extern unsigned int rxrpc_resend_timeout;
 
 int rxrpc_send_packet(struct rxrpc_transport *, struct sk_buff *);
 int rxrpc_client_sendmsg(struct rxrpc_sock *, struct rxrpc_transport *,
@@ -627,7 +627,7 @@ void rxrpc_packet_destructor(struct sk_buff *);
 /*
  * ar-transport.c
  */
-extern unsigned rxrpc_transport_expiry;
+extern unsigned int rxrpc_transport_expiry;
 
 struct rxrpc_transport *rxrpc_get_transport(struct rxrpc_local *,
 					    struct rxrpc_peer *, gfp_t);
diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c
index 14c8df6b7f41..d36fb6e1a29c 100644
--- a/net/rxrpc/ar-output.c
+++ b/net/rxrpc/ar-output.c
@@ -21,7 +21,7 @@
 /*
  * Time till packet resend (in jiffies).
  */
-unsigned rxrpc_resend_timeout = 4 * HZ;
+unsigned int rxrpc_resend_timeout = 4 * HZ;
 
 static int rxrpc_send_data(struct rxrpc_sock *rx,
 			   struct rxrpc_call *call,
diff --git a/net/rxrpc/ar-transport.c b/net/rxrpc/ar-transport.c
index 5f9b9d462f53..66a1a5676446 100644
--- a/net/rxrpc/ar-transport.c
+++ b/net/rxrpc/ar-transport.c
@@ -20,7 +20,7 @@
 /*
  * Time after last use at which transport record is cleaned up.
  */
-unsigned rxrpc_transport_expiry = 3600 * 24;
+unsigned int rxrpc_transport_expiry = 3600 * 24;
 
 static void rxrpc_transport_reaper(struct work_struct *work);
 
diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c
index 093547ac2bcd..d20ed575acf4 100644
--- a/net/rxrpc/sysctl.c
+++ b/net/rxrpc/sysctl.c
@@ -15,11 +15,11 @@
 #include "ar-internal.h"
 
 static struct ctl_table_header *rxrpc_sysctl_reg_table;
-static const unsigned zero = 0;
-static const unsigned one = 1;
-static const unsigned four = 4;
-static const unsigned n_65535 = 65535;
-static const unsigned n_max_acks = RXRPC_MAXACKS;
+static const unsigned int zero = 0;
+static const unsigned int one = 1;
+static const unsigned int four = 4;
+static const unsigned int n_65535 = 65535;
+static const unsigned int n_max_acks = RXRPC_MAXACKS;
 
 /*
  * RxRPC operating parameters.
@@ -32,7 +32,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
 	{
 		.procname	= "req_ack_delay",
 		.data		= &rxrpc_requested_ack_delay,
-		.maxlen		= sizeof(unsigned),
+		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_ms_jiffies,
 		.extra1		= (void *)&zero,
@@ -40,7 +40,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
 	{
 		.procname	= "soft_ack_delay",
 		.data		= &rxrpc_soft_ack_delay,
-		.maxlen		= sizeof(unsigned),
+		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_ms_jiffies,
 		.extra1		= (void *)&one,
@@ -48,7 +48,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
 	{
 		.procname	= "idle_ack_delay",
 		.data		= &rxrpc_idle_ack_delay,
-		.maxlen		= sizeof(unsigned),
+		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_ms_jiffies,
 		.extra1		= (void *)&one,
@@ -56,7 +56,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
 	{
 		.procname	= "resend_timeout",
 		.data		= &rxrpc_resend_timeout,
-		.maxlen		= sizeof(unsigned),
+		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_ms_jiffies,
 		.extra1		= (void *)&one,
@@ -66,7 +66,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
 	{
 		.procname	= "max_call_lifetime",
 		.data		= &rxrpc_max_call_lifetime,
-		.maxlen		= sizeof(unsigned),
+		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
 		.extra1		= (void *)&one,
@@ -74,7 +74,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
 	{
 		.procname	= "dead_call_expiry",
 		.data		= &rxrpc_dead_call_expiry,
-		.maxlen		= sizeof(unsigned),
+		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
 		.extra1		= (void *)&one,
@@ -84,7 +84,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
 	{
 		.procname	= "connection_expiry",
 		.data		= &rxrpc_connection_expiry,
-		.maxlen		= sizeof(unsigned),
+		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= (void *)&one,
@@ -92,7 +92,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
 	{
 		.procname	= "transport_expiry",
 		.data		= &rxrpc_transport_expiry,
-		.maxlen		= sizeof(unsigned),
+		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= (void *)&one,
@@ -102,7 +102,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
 	{
 		.procname	= "rx_window_size",
 		.data		= &rxrpc_rx_window_size,
-		.maxlen		= sizeof(unsigned),
+		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= (void *)&one,
@@ -111,7 +111,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
 	{
 		.procname	= "rx_mtu",
 		.data		= &rxrpc_rx_mtu,
-		.maxlen		= sizeof(unsigned),
+		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= (void *)&one,
@@ -120,7 +120,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
 	{
 		.procname	= "rx_jumbo_max",
 		.data		= &rxrpc_rx_jumbo_max,
-		.maxlen		= sizeof(unsigned),
+		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= (void *)&one,
-- 
cgit v1.2.3-59-g8ed1b


From 39d2adebf137de5f900843f69f5e500932e31047 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Thu, 10 Mar 2016 15:31:57 +0800
Subject: sctp: fix the transports round robin issue when init is retransmitted

prior to this patch, at the beginning if we have two paths in one assoc,
they may have the same params other than the last_time_heard, it will try
the paths like this:

1st cycle
  try trans1 fail.
  then trans2 is selected.(cause it's last_time_heard is after trans1).

2nd cycle:
  try  trans2 fail
  then trans2 is selected.(cause it's last_time_heard is after trans1).

3rd cycle:
  try  trans2 fail
  then trans2 is selected.(cause it's last_time_heard is after trans1).

....

trans1 will never have change to be selected, which is not what we expect.
we should keeping round robin all the paths if they are just added at the
beginning.

So at first every tranport's last_time_heard should be initialized 0, so
that we ensure they have the same value at the beginning, only by this,
all the transports could get equal chance to be selected.

Then for sctp_trans_elect_best, it should return the trans_next one when
*trans == *trans_next, so that we can try next if it fails,  but now it
always return trans. so we can fix it by exchanging these two params when
we calls sctp_trans_elect_tie().

Fixes: 4c47af4d5eb2 ('net: sctp: rework multihoming retransmission path selection to rfc4960')
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/associola.c | 2 +-
 net/sctp/transport.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 2bf8ec92dde4..cd873446433c 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -1263,7 +1263,7 @@ static struct sctp_transport *sctp_trans_elect_best(struct sctp_transport *curr,
 	if (score_curr > score_best)
 		return curr;
 	else if (score_curr == score_best)
-		return sctp_trans_elect_tie(curr, best);
+		return sctp_trans_elect_tie(best, curr);
 	else
 		return best;
 }
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index a431c14044a4..d517153891a6 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -72,7 +72,7 @@ static struct sctp_transport *sctp_transport_init(struct net *net,
 	 */
 	peer->rto = msecs_to_jiffies(net->sctp.rto_initial);
 
-	peer->last_time_heard = ktime_get();
+	peer->last_time_heard = ktime_set(0, 0);
 	peer->last_time_ecne_reduced = jiffies;
 
 	peer->param_flags = SPP_HB_DISABLE |
-- 
cgit v1.2.3-59-g8ed1b


From 136ba622de49a6bf1f6e5eab3391ed5d5dbe30e3 Mon Sep 17 00:00:00 2001
From: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
Date: Thu, 10 Mar 2016 08:55:50 +0000
Subject: netconf: add macro to represent all attributes

This patch adds macro NETCONFA_ALL to represent all type of netconf
attributes for IPv4 and IPv6.

Signed-off-by: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/netconf.h |  1 +
 net/ipv4/devinet.c           | 40 +++++++++++++++++++++++-----------------
 net/ipv6/addrconf.c          | 36 +++++++++++++++++++++---------------
 3 files changed, 45 insertions(+), 32 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/netconf.h b/include/uapi/linux/netconf.h
index 23cbd34e4ac7..45dfad509c4d 100644
--- a/include/uapi/linux/netconf.h
+++ b/include/uapi/linux/netconf.h
@@ -19,6 +19,7 @@ enum {
 	__NETCONFA_MAX
 };
 #define NETCONFA_MAX	(__NETCONFA_MAX - 1)
+#define NETCONFA_ALL	-1
 
 #define NETCONFA_IFINDEX_ALL		-1
 #define NETCONFA_IFINDEX_DEFAULT	-2
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 8c3df2ccba45..65e76a48382c 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1753,17 +1753,20 @@ static int inet_netconf_msgsize_devconf(int type)
 {
 	int size = NLMSG_ALIGN(sizeof(struct netconfmsg))
 		   + nla_total_size(4);	/* NETCONFA_IFINDEX */
+	bool all = false;
 
-	/* type -1 is used for ALL */
-	if (type == -1 || type == NETCONFA_FORWARDING)
+	if (type == NETCONFA_ALL)
+		all = true;
+
+	if (all || type == NETCONFA_FORWARDING)
 		size += nla_total_size(4);
-	if (type == -1 || type == NETCONFA_RP_FILTER)
+	if (all || type == NETCONFA_RP_FILTER)
 		size += nla_total_size(4);
-	if (type == -1 || type == NETCONFA_MC_FORWARDING)
+	if (all || type == NETCONFA_MC_FORWARDING)
 		size += nla_total_size(4);
-	if (type == -1 || type == NETCONFA_PROXY_NEIGH)
+	if (all || type == NETCONFA_PROXY_NEIGH)
 		size += nla_total_size(4);
-	if (type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
+	if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
 		size += nla_total_size(4);
 
 	return size;
@@ -1776,36 +1779,39 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
 {
 	struct nlmsghdr  *nlh;
 	struct netconfmsg *ncm;
+	bool all = false;
 
 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg),
 			flags);
 	if (!nlh)
 		return -EMSGSIZE;
 
+	if (type == NETCONFA_ALL)
+		all = true;
+
 	ncm = nlmsg_data(nlh);
 	ncm->ncm_family = AF_INET;
 
 	if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0)
 		goto nla_put_failure;
 
-	/* type -1 is used for ALL */
-	if ((type == -1 || type == NETCONFA_FORWARDING) &&
+	if ((all || type == NETCONFA_FORWARDING) &&
 	    nla_put_s32(skb, NETCONFA_FORWARDING,
 			IPV4_DEVCONF(*devconf, FORWARDING)) < 0)
 		goto nla_put_failure;
-	if ((type == -1 || type == NETCONFA_RP_FILTER) &&
+	if ((all || type == NETCONFA_RP_FILTER) &&
 	    nla_put_s32(skb, NETCONFA_RP_FILTER,
 			IPV4_DEVCONF(*devconf, RP_FILTER)) < 0)
 		goto nla_put_failure;
-	if ((type == -1 || type == NETCONFA_MC_FORWARDING) &&
+	if ((all || type == NETCONFA_MC_FORWARDING) &&
 	    nla_put_s32(skb, NETCONFA_MC_FORWARDING,
 			IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0)
 		goto nla_put_failure;
-	if ((type == -1 || type == NETCONFA_PROXY_NEIGH) &&
+	if ((all || type == NETCONFA_PROXY_NEIGH) &&
 	    nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
 			IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0)
 		goto nla_put_failure;
-	if ((type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
+	if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
 	    nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
 			IPV4_DEVCONF(*devconf, IGNORE_ROUTES_WITH_LINKDOWN)) < 0)
 		goto nla_put_failure;
@@ -1893,14 +1899,14 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb,
 	}
 
 	err = -ENOBUFS;
-	skb = nlmsg_new(inet_netconf_msgsize_devconf(-1), GFP_ATOMIC);
+	skb = nlmsg_new(inet_netconf_msgsize_devconf(NETCONFA_ALL), GFP_ATOMIC);
 	if (!skb)
 		goto errout;
 
 	err = inet_netconf_fill_devconf(skb, ifindex, devconf,
 					NETLINK_CB(in_skb).portid,
 					nlh->nlmsg_seq, RTM_NEWNETCONF, 0,
-					-1);
+					NETCONFA_ALL);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */
 		WARN_ON(err == -EMSGSIZE);
@@ -1944,7 +1950,7 @@ static int inet_netconf_dump_devconf(struct sk_buff *skb,
 						      cb->nlh->nlmsg_seq,
 						      RTM_NEWNETCONF,
 						      NLM_F_MULTI,
-						      -1) < 0) {
+						      NETCONFA_ALL) < 0) {
 				rcu_read_unlock();
 				goto done;
 			}
@@ -1960,7 +1966,7 @@ cont:
 					      NETLINK_CB(cb->skb).portid,
 					      cb->nlh->nlmsg_seq,
 					      RTM_NEWNETCONF, NLM_F_MULTI,
-					      -1) < 0)
+					      NETCONFA_ALL) < 0)
 			goto done;
 		else
 			h++;
@@ -1971,7 +1977,7 @@ cont:
 					      NETLINK_CB(cb->skb).portid,
 					      cb->nlh->nlmsg_seq,
 					      RTM_NEWNETCONF, NLM_F_MULTI,
-					      -1) < 0)
+					      NETCONFA_ALL) < 0)
 			goto done;
 		else
 			h++;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 8c0dab2de5c9..27aed1afcf81 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -473,18 +473,21 @@ static int inet6_netconf_msgsize_devconf(int type)
 {
 	int size =  NLMSG_ALIGN(sizeof(struct netconfmsg))
 		    + nla_total_size(4);	/* NETCONFA_IFINDEX */
+	bool all = false;
 
-	/* type -1 is used for ALL */
-	if (type == -1 || type == NETCONFA_FORWARDING)
+	if (type == NETCONFA_ALL)
+		all = true;
+
+	if (all || type == NETCONFA_FORWARDING)
 		size += nla_total_size(4);
 #ifdef CONFIG_IPV6_MROUTE
-	if (type == -1 || type == NETCONFA_MC_FORWARDING)
+	if (all || type == NETCONFA_MC_FORWARDING)
 		size += nla_total_size(4);
 #endif
-	if (type == -1 || type == NETCONFA_PROXY_NEIGH)
+	if (all || type == NETCONFA_PROXY_NEIGH)
 		size += nla_total_size(4);
 
-	if (type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
+	if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
 		size += nla_total_size(4);
 
 	return size;
@@ -497,33 +500,36 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
 {
 	struct nlmsghdr  *nlh;
 	struct netconfmsg *ncm;
+	bool all = false;
 
 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg),
 			flags);
 	if (!nlh)
 		return -EMSGSIZE;
 
+	if (type == NETCONFA_ALL)
+		all = true;
+
 	ncm = nlmsg_data(nlh);
 	ncm->ncm_family = AF_INET6;
 
 	if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0)
 		goto nla_put_failure;
 
-	/* type -1 is used for ALL */
-	if ((type == -1 || type == NETCONFA_FORWARDING) &&
+	if ((all || type == NETCONFA_FORWARDING) &&
 	    nla_put_s32(skb, NETCONFA_FORWARDING, devconf->forwarding) < 0)
 		goto nla_put_failure;
 #ifdef CONFIG_IPV6_MROUTE
-	if ((type == -1 || type == NETCONFA_MC_FORWARDING) &&
+	if ((all || type == NETCONFA_MC_FORWARDING) &&
 	    nla_put_s32(skb, NETCONFA_MC_FORWARDING,
 			devconf->mc_forwarding) < 0)
 		goto nla_put_failure;
 #endif
-	if ((type == -1 || type == NETCONFA_PROXY_NEIGH) &&
+	if ((all || type == NETCONFA_PROXY_NEIGH) &&
 	    nla_put_s32(skb, NETCONFA_PROXY_NEIGH, devconf->proxy_ndp) < 0)
 		goto nla_put_failure;
 
-	if ((type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
+	if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
 	    nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
 			devconf->ignore_routes_with_linkdown) < 0)
 		goto nla_put_failure;
@@ -609,14 +615,14 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb,
 	}
 
 	err = -ENOBUFS;
-	skb = nlmsg_new(inet6_netconf_msgsize_devconf(-1), GFP_ATOMIC);
+	skb = nlmsg_new(inet6_netconf_msgsize_devconf(NETCONFA_ALL), GFP_ATOMIC);
 	if (!skb)
 		goto errout;
 
 	err = inet6_netconf_fill_devconf(skb, ifindex, devconf,
 					 NETLINK_CB(in_skb).portid,
 					 nlh->nlmsg_seq, RTM_NEWNETCONF, 0,
-					 -1);
+					 NETCONFA_ALL);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in inet6_netconf_msgsize_devconf() */
 		WARN_ON(err == -EMSGSIZE);
@@ -660,7 +666,7 @@ static int inet6_netconf_dump_devconf(struct sk_buff *skb,
 						       cb->nlh->nlmsg_seq,
 						       RTM_NEWNETCONF,
 						       NLM_F_MULTI,
-						       -1) < 0) {
+						       NETCONFA_ALL) < 0) {
 				rcu_read_unlock();
 				goto done;
 			}
@@ -676,7 +682,7 @@ cont:
 					       NETLINK_CB(cb->skb).portid,
 					       cb->nlh->nlmsg_seq,
 					       RTM_NEWNETCONF, NLM_F_MULTI,
-					       -1) < 0)
+					       NETCONFA_ALL) < 0)
 			goto done;
 		else
 			h++;
@@ -687,7 +693,7 @@ cont:
 					       NETLINK_CB(cb->skb).portid,
 					       cb->nlh->nlmsg_seq,
 					       RTM_NEWNETCONF, NLM_F_MULTI,
-					       -1) < 0)
+					       NETCONFA_ALL) < 0)
 			goto done;
 		else
 			h++;
-- 
cgit v1.2.3-59-g8ed1b


From 6f15cdbf8a8ac2e22767cc8b1eae225702733c95 Mon Sep 17 00:00:00 2001
From: Samuel Gauthier <samuel.gauthier@6wind.com>
Date: Thu, 10 Mar 2016 17:14:59 +0100
Subject: ovs: allow nl 'flow set' to use ufid without flow key

When we want to change a flow using netlink, we have to identify it to
be able to perform a lookup. Both the flow key and unique flow ID
(ufid) are valid identifiers, but we always have to specify the flow
key in the netlink message. When both attributes are there, the ufid
is used. The flow key is used to validate the actions provided by
the userland.

This commit allows to use the ufid without having to provide the flow
key, as it is already done in the netlink 'flow get' and 'flow del'
path. The flow key remains mandatory when an action is provided.

Signed-off-by: Samuel Gauthier <samuel.gauthier@6wind.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/datapath.c | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index e6a7d494df24..0cc66a4e492d 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1096,26 +1096,32 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
 	struct sw_flow_match match;
 	struct sw_flow_id sfid;
 	u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
-	int error;
+	int error = 0;
 	bool log = !a[OVS_FLOW_ATTR_PROBE];
 	bool ufid_present;
 
-	/* Extract key. */
-	error = -EINVAL;
-	if (!a[OVS_FLOW_ATTR_KEY]) {
-		OVS_NLERR(log, "Flow key attribute not present in set flow.");
-		goto error;
-	}
-
 	ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log);
-	ovs_match_init(&match, &key, &mask);
-	error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
-				  a[OVS_FLOW_ATTR_MASK], log);
+	if (a[OVS_FLOW_ATTR_KEY]) {
+		ovs_match_init(&match, &key, &mask);
+		error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
+					  a[OVS_FLOW_ATTR_MASK], log);
+	} else if (!ufid_present) {
+		OVS_NLERR(log,
+			  "Flow set message rejected, Key attribute missing.");
+		error = -EINVAL;
+	}
 	if (error)
 		goto error;
 
 	/* Validate actions. */
 	if (a[OVS_FLOW_ATTR_ACTIONS]) {
+		if (!a[OVS_FLOW_ATTR_KEY]) {
+			OVS_NLERR(log,
+				  "Flow key attribute not present in set flow.");
+			error = -EINVAL;
+			goto error;
+		}
+
 		acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &key,
 					&mask, log);
 		if (IS_ERR(acts)) {
-- 
cgit v1.2.3-59-g8ed1b


From cea8768f333e3f0bc231d8b815aa4a9e63fa990c Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Thu, 10 Mar 2016 18:33:07 -0300
Subject: sctp: allow sctp_transmit_packet and others to use gfp

Currently sctp_sendmsg() triggers some calls that will allocate memory
with GFP_ATOMIC even when not necessary. In the case of
sctp_packet_transmit it will allocate a linear skb that will be used to
construct the packet and this may cause sends to fail due to ENOMEM more
often than anticipated specially with big MTUs.

This patch thus allows it to inherit gfp flags from upper calls so that
it can use GFP_KERNEL if it was triggered by a sctp_sendmsg call or
similar. All others, like retransmits or flushes started from BH, are
still allocated using GFP_ATOMIC.

In netperf tests this didn't result in any performance drawbacks when
memory is not too fragmented and made it trigger ENOMEM way less often.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sm.h      |  2 +-
 include/net/sctp/structs.h | 10 +++---
 net/sctp/associola.c       |  2 +-
 net/sctp/chunk.c           |  6 ++--
 net/sctp/input.c           |  2 +-
 net/sctp/output.c          |  6 ++--
 net/sctp/outqueue.c        | 30 ++++++++---------
 net/sctp/sm_make_chunk.c   | 80 +++++++++++++++++++++++++++-------------------
 net/sctp/sm_sideeffect.c   | 23 ++++++-------
 9 files changed, 89 insertions(+), 72 deletions(-)

(limited to 'net')

diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index 487ef34bbd63..efc01743b9d6 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -201,7 +201,7 @@ struct sctp_chunk *sctp_make_cwr(const struct sctp_association *,
 struct sctp_chunk * sctp_make_datafrag_empty(struct sctp_association *,
 					const struct sctp_sndrcvinfo *sinfo,
 					int len, const __u8 flags,
-					__u16 ssn);
+					__u16 ssn, gfp_t gfp);
 struct sctp_chunk *sctp_make_ecne(const struct sctp_association *,
 				  const __u32);
 struct sctp_chunk *sctp_make_sack(const struct sctp_association *);
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index d05b56641abc..9d237669c52c 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -655,7 +655,7 @@ void sctp_chunk_free(struct sctp_chunk *);
 void  *sctp_addto_chunk(struct sctp_chunk *, int len, const void *data);
 struct sctp_chunk *sctp_chunkify(struct sk_buff *,
 				 const struct sctp_association *,
-				 struct sock *);
+				 struct sock *, gfp_t gfp);
 void sctp_init_addrs(struct sctp_chunk *, union sctp_addr *,
 		     union sctp_addr *);
 const union sctp_addr *sctp_source(const struct sctp_chunk *chunk);
@@ -717,10 +717,10 @@ struct sctp_packet *sctp_packet_init(struct sctp_packet *,
 				     __u16 sport, __u16 dport);
 struct sctp_packet *sctp_packet_config(struct sctp_packet *, __u32 vtag, int);
 sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *,
-                                       struct sctp_chunk *, int);
+				       struct sctp_chunk *, int, gfp_t);
 sctp_xmit_t sctp_packet_append_chunk(struct sctp_packet *,
                                      struct sctp_chunk *);
-int sctp_packet_transmit(struct sctp_packet *);
+int sctp_packet_transmit(struct sctp_packet *, gfp_t);
 void sctp_packet_free(struct sctp_packet *);
 
 static inline int sctp_packet_empty(struct sctp_packet *packet)
@@ -1053,7 +1053,7 @@ struct sctp_outq {
 void sctp_outq_init(struct sctp_association *, struct sctp_outq *);
 void sctp_outq_teardown(struct sctp_outq *);
 void sctp_outq_free(struct sctp_outq*);
-int sctp_outq_tail(struct sctp_outq *, struct sctp_chunk *chunk);
+int sctp_outq_tail(struct sctp_outq *, struct sctp_chunk *chunk, gfp_t);
 int sctp_outq_sack(struct sctp_outq *, struct sctp_chunk *);
 int sctp_outq_is_empty(const struct sctp_outq *);
 void sctp_outq_restart(struct sctp_outq *);
@@ -1061,7 +1061,7 @@ void sctp_outq_restart(struct sctp_outq *);
 void sctp_retransmit(struct sctp_outq *, struct sctp_transport *,
 		     sctp_retransmit_reason_t);
 void sctp_retransmit_mark(struct sctp_outq *, struct sctp_transport *, __u8);
-int sctp_outq_uncork(struct sctp_outq *);
+int sctp_outq_uncork(struct sctp_outq *, gfp_t gfp);
 /* Uncork and flush an outqueue.  */
 static inline void sctp_outq_cork(struct sctp_outq *q)
 {
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index cd873446433c..a19b3e607703 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -1493,7 +1493,7 @@ void sctp_assoc_rwnd_increase(struct sctp_association *asoc, unsigned int len)
 
 		asoc->peer.sack_needed = 0;
 
-		sctp_outq_tail(&asoc->outqueue, sack);
+		sctp_outq_tail(&asoc->outqueue, sack, GFP_ATOMIC);
 
 		/* Stop the SACK timer.  */
 		timer = &asoc->timers[SCTP_EVENT_TIMEOUT_SACK];
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 3aa43073e0b9..958ef5f33f4b 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -260,7 +260,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 				frag |= SCTP_DATA_SACK_IMM;
 		}
 
-		chunk = sctp_make_datafrag_empty(asoc, sinfo, len, frag, 0);
+		chunk = sctp_make_datafrag_empty(asoc, sinfo, len, frag,
+						 0, GFP_KERNEL);
 
 		if (!chunk) {
 			err = -ENOMEM;
@@ -296,7 +297,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 		    (sinfo->sinfo_flags & SCTP_SACK_IMMEDIATELY))
 			frag |= SCTP_DATA_SACK_IMM;
 
-		chunk = sctp_make_datafrag_empty(asoc, sinfo, over, frag, 0);
+		chunk = sctp_make_datafrag_empty(asoc, sinfo, over, frag,
+						 0, GFP_KERNEL);
 
 		if (!chunk) {
 			err = -ENOMEM;
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 21a2d6b7abaf..db76f1ab4ac2 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -221,7 +221,7 @@ int sctp_rcv(struct sk_buff *skb)
 		goto discard_release;
 
 	/* Create an SCTP packet structure. */
-	chunk = sctp_chunkify(skb, asoc, sk);
+	chunk = sctp_chunkify(skb, asoc, sk, GFP_ATOMIC);
 	if (!chunk)
 		goto discard_release;
 	SCTP_INPUT_CB(skb)->chunk = chunk;
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 9d610eddd19e..736c004abfbc 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -153,7 +153,7 @@ void sctp_packet_free(struct sctp_packet *packet)
  */
 sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet,
 				       struct sctp_chunk *chunk,
-				       int one_packet)
+				       int one_packet, gfp_t gfp)
 {
 	sctp_xmit_t retval;
 	int error = 0;
@@ -163,7 +163,7 @@ sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet,
 	switch ((retval = (sctp_packet_append_chunk(packet, chunk)))) {
 	case SCTP_XMIT_PMTU_FULL:
 		if (!packet->has_cookie_echo) {
-			error = sctp_packet_transmit(packet);
+			error = sctp_packet_transmit(packet, gfp);
 			if (error < 0)
 				chunk->skb->sk->sk_err = -error;
 
@@ -376,7 +376,7 @@ static void sctp_packet_set_owner_w(struct sk_buff *skb, struct sock *sk)
  *
  * The return value is a normal kernel error return value.
  */
-int sctp_packet_transmit(struct sctp_packet *packet)
+int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
 {
 	struct sctp_transport *tp = packet->transport;
 	struct sctp_association *asoc = tp->asoc;
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index c0380cfb16ae..f03541d0f12d 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -68,7 +68,7 @@ static void sctp_mark_missing(struct sctp_outq *q,
 
 static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn);
 
-static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout);
+static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp);
 
 /* Add data to the front of the queue. */
 static inline void sctp_outq_head_data(struct sctp_outq *q,
@@ -285,7 +285,7 @@ void sctp_outq_free(struct sctp_outq *q)
 }
 
 /* Put a new chunk in an sctp_outq.  */
-int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk)
+int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp)
 {
 	struct net *net = sock_net(q->asoc->base.sk);
 	int error = 0;
@@ -341,7 +341,7 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk)
 		return error;
 
 	if (!q->cork)
-		error = sctp_outq_flush(q, 0);
+		error = sctp_outq_flush(q, 0, gfp);
 
 	return error;
 }
@@ -510,7 +510,7 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport,
 	 * will be flushed at the end.
 	 */
 	if (reason != SCTP_RTXR_FAST_RTX)
-		error = sctp_outq_flush(q, /* rtx_timeout */ 1);
+		error = sctp_outq_flush(q, /* rtx_timeout */ 1, GFP_ATOMIC);
 
 	if (error)
 		q->asoc->base.sk->sk_err = -error;
@@ -601,12 +601,12 @@ redo:
 				 * control chunks are already freed so there
 				 * is nothing we can do.
 				 */
-				sctp_packet_transmit(pkt);
+				sctp_packet_transmit(pkt, GFP_ATOMIC);
 				goto redo;
 			}
 
 			/* Send this packet.  */
-			error = sctp_packet_transmit(pkt);
+			error = sctp_packet_transmit(pkt, GFP_ATOMIC);
 
 			/* If we are retransmitting, we should only
 			 * send a single packet.
@@ -622,7 +622,7 @@ redo:
 
 		case SCTP_XMIT_RWND_FULL:
 			/* Send this packet. */
-			error = sctp_packet_transmit(pkt);
+			error = sctp_packet_transmit(pkt, GFP_ATOMIC);
 
 			/* Stop sending DATA as there is no more room
 			 * at the receiver.
@@ -632,7 +632,7 @@ redo:
 
 		case SCTP_XMIT_DELAY:
 			/* Send this packet. */
-			error = sctp_packet_transmit(pkt);
+			error = sctp_packet_transmit(pkt, GFP_ATOMIC);
 
 			/* Stop sending DATA because of nagle delay. */
 			done = 1;
@@ -685,12 +685,12 @@ redo:
 }
 
 /* Cork the outqueue so queued chunks are really queued. */
-int sctp_outq_uncork(struct sctp_outq *q)
+int sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp)
 {
 	if (q->cork)
 		q->cork = 0;
 
-	return sctp_outq_flush(q, 0);
+	return sctp_outq_flush(q, 0, gfp);
 }
 
 
@@ -703,7 +703,7 @@ int sctp_outq_uncork(struct sctp_outq *q)
  * locking concerns must be made.  Today we use the sock lock to protect
  * this function.
  */
-static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
+static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 {
 	struct sctp_packet *packet;
 	struct sctp_packet singleton;
@@ -825,7 +825,7 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 			sctp_packet_init(&singleton, transport, sport, dport);
 			sctp_packet_config(&singleton, vtag, 0);
 			sctp_packet_append_chunk(&singleton, chunk);
-			error = sctp_packet_transmit(&singleton);
+			error = sctp_packet_transmit(&singleton, gfp);
 			if (error < 0)
 				return error;
 			break;
@@ -856,7 +856,7 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 		case SCTP_CID_ASCONF:
 		case SCTP_CID_FWD_TSN:
 			status = sctp_packet_transmit_chunk(packet, chunk,
-							    one_packet);
+							    one_packet, gfp);
 			if (status  != SCTP_XMIT_OK) {
 				/* put the chunk back */
 				list_add(&chunk->list, &q->control_chunk_list);
@@ -1011,7 +1011,7 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 				 atomic_read(&chunk->skb->users) : -1);
 
 			/* Add the chunk to the packet.  */
-			status = sctp_packet_transmit_chunk(packet, chunk, 0);
+			status = sctp_packet_transmit_chunk(packet, chunk, 0, gfp);
 
 			switch (status) {
 			case SCTP_XMIT_PMTU_FULL:
@@ -1088,7 +1088,7 @@ sctp_flush_out:
 						      send_ready);
 		packet = &t->packet;
 		if (!sctp_packet_empty(packet))
-			error = sctp_packet_transmit(packet);
+			error = sctp_packet_transmit(packet, gfp);
 
 		/* Clear the burst limited state, if any */
 		sctp_transport_burst_reset(t);
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 5d6a03fad378..8449ca26aa0b 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -62,11 +62,13 @@
 #include <net/sctp/sm.h>
 
 static struct sctp_chunk *sctp_make_control(const struct sctp_association *asoc,
-					    __u8 type, __u8 flags, int paylen);
+					    __u8 type, __u8 flags, int paylen,
+					    gfp_t gfp);
 static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc,
-					 __u8 flags, int paylen);
+					 __u8 flags, int paylen, gfp_t gfp);
 static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc,
-					   __u8 type, __u8 flags, int paylen);
+					   __u8 type, __u8 flags, int paylen,
+					   gfp_t gfp);
 static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep,
 					const struct sctp_association *asoc,
 					const struct sctp_chunk *init_chunk,
@@ -318,7 +320,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
 	 * PLEASE DO NOT FIXME [This version does not support Host Name.]
 	 */
 
-	retval = sctp_make_control(asoc, SCTP_CID_INIT, 0, chunksize);
+	retval = sctp_make_control(asoc, SCTP_CID_INIT, 0, chunksize, gfp);
 	if (!retval)
 		goto nodata;
 
@@ -465,7 +467,7 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
 					num_ext);
 
 	/* Now allocate and fill out the chunk.  */
-	retval = sctp_make_control(asoc, SCTP_CID_INIT_ACK, 0, chunksize);
+	retval = sctp_make_control(asoc, SCTP_CID_INIT_ACK, 0, chunksize, gfp);
 	if (!retval)
 		goto nomem_chunk;
 
@@ -570,7 +572,8 @@ struct sctp_chunk *sctp_make_cookie_echo(const struct sctp_association *asoc,
 	cookie_len = asoc->peer.cookie_len;
 
 	/* Build a cookie echo chunk.  */
-	retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ECHO, 0, cookie_len);
+	retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ECHO, 0,
+				   cookie_len, GFP_ATOMIC);
 	if (!retval)
 		goto nodata;
 	retval->subh.cookie_hdr =
@@ -615,7 +618,7 @@ struct sctp_chunk *sctp_make_cookie_ack(const struct sctp_association *asoc,
 {
 	struct sctp_chunk *retval;
 
-	retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ACK, 0, 0);
+	retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ACK, 0, 0, GFP_ATOMIC);
 
 	/* RFC 2960 6.4 Multi-homed SCTP Endpoints
 	 *
@@ -664,7 +667,7 @@ struct sctp_chunk *sctp_make_cwr(const struct sctp_association *asoc,
 
 	cwr.lowest_tsn = htonl(lowest_tsn);
 	retval = sctp_make_control(asoc, SCTP_CID_ECN_CWR, 0,
-				   sizeof(sctp_cwrhdr_t));
+				   sizeof(sctp_cwrhdr_t), GFP_ATOMIC);
 
 	if (!retval)
 		goto nodata;
@@ -698,7 +701,7 @@ struct sctp_chunk *sctp_make_ecne(const struct sctp_association *asoc,
 
 	ecne.lowest_tsn = htonl(lowest_tsn);
 	retval = sctp_make_control(asoc, SCTP_CID_ECN_ECNE, 0,
-				   sizeof(sctp_ecnehdr_t));
+				   sizeof(sctp_ecnehdr_t), GFP_ATOMIC);
 	if (!retval)
 		goto nodata;
 	retval->subh.ecne_hdr =
@@ -713,7 +716,8 @@ nodata:
  */
 struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc,
 				       const struct sctp_sndrcvinfo *sinfo,
-				       int data_len, __u8 flags, __u16 ssn)
+				       int data_len, __u8 flags, __u16 ssn,
+				       gfp_t gfp)
 {
 	struct sctp_chunk *retval;
 	struct sctp_datahdr dp;
@@ -734,7 +738,7 @@ struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc,
 		dp.ssn = htons(ssn);
 
 	chunk_len = sizeof(dp) + data_len;
-	retval = sctp_make_data(asoc, flags, chunk_len);
+	retval = sctp_make_data(asoc, flags, chunk_len, gfp);
 	if (!retval)
 		goto nodata;
 
@@ -781,7 +785,7 @@ struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc)
 		+ sizeof(__u32) * num_dup_tsns;
 
 	/* Create the chunk.  */
-	retval = sctp_make_control(asoc, SCTP_CID_SACK, 0, len);
+	retval = sctp_make_control(asoc, SCTP_CID_SACK, 0, len, GFP_ATOMIC);
 	if (!retval)
 		goto nodata;
 
@@ -861,7 +865,7 @@ struct sctp_chunk *sctp_make_shutdown(const struct sctp_association *asoc,
 	shut.cum_tsn_ack = htonl(ctsn);
 
 	retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN, 0,
-				   sizeof(sctp_shutdownhdr_t));
+				   sizeof(sctp_shutdownhdr_t), GFP_ATOMIC);
 	if (!retval)
 		goto nodata;
 
@@ -879,7 +883,8 @@ struct sctp_chunk *sctp_make_shutdown_ack(const struct sctp_association *asoc,
 {
 	struct sctp_chunk *retval;
 
-	retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_ACK, 0, 0);
+	retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_ACK, 0, 0,
+				   GFP_ATOMIC);
 
 	/* RFC 2960 6.4 Multi-homed SCTP Endpoints
 	 *
@@ -908,7 +913,8 @@ struct sctp_chunk *sctp_make_shutdown_complete(
 	 */
 	flags |= asoc ? 0 : SCTP_CHUNK_FLAG_T;
 
-	retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_COMPLETE, flags, 0);
+	retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_COMPLETE, flags,
+				   0, GFP_ATOMIC);
 
 	/* RFC 2960 6.4 Multi-homed SCTP Endpoints
 	 *
@@ -947,7 +953,8 @@ struct sctp_chunk *sctp_make_abort(const struct sctp_association *asoc,
 			flags = SCTP_CHUNK_FLAG_T;
 	}
 
-	retval = sctp_make_control(asoc, SCTP_CID_ABORT, flags, hint);
+	retval = sctp_make_control(asoc, SCTP_CID_ABORT, flags, hint,
+				   GFP_ATOMIC);
 
 	/* RFC 2960 6.4 Multi-homed SCTP Endpoints
 	 *
@@ -1139,7 +1146,8 @@ struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc,
 	struct sctp_chunk *retval;
 	sctp_sender_hb_info_t hbinfo;
 
-	retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT, 0, sizeof(hbinfo));
+	retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT, 0,
+				   sizeof(hbinfo), GFP_ATOMIC);
 
 	if (!retval)
 		goto nodata;
@@ -1167,7 +1175,8 @@ struct sctp_chunk *sctp_make_heartbeat_ack(const struct sctp_association *asoc,
 {
 	struct sctp_chunk *retval;
 
-	retval  = sctp_make_control(asoc, SCTP_CID_HEARTBEAT_ACK, 0, paylen);
+	retval  = sctp_make_control(asoc, SCTP_CID_HEARTBEAT_ACK, 0, paylen,
+				    GFP_ATOMIC);
 	if (!retval)
 		goto nodata;
 
@@ -1200,7 +1209,7 @@ static struct sctp_chunk *sctp_make_op_error_space(
 	struct sctp_chunk *retval;
 
 	retval = sctp_make_control(asoc, SCTP_CID_ERROR, 0,
-				   sizeof(sctp_errhdr_t) + size);
+				   sizeof(sctp_errhdr_t) + size, GFP_ATOMIC);
 	if (!retval)
 		goto nodata;
 
@@ -1271,7 +1280,8 @@ struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc)
 		return NULL;
 
 	retval = sctp_make_control(asoc, SCTP_CID_AUTH, 0,
-			hmac_desc->hmac_len + sizeof(sctp_authhdr_t));
+			hmac_desc->hmac_len + sizeof(sctp_authhdr_t),
+			GFP_ATOMIC);
 	if (!retval)
 		return NULL;
 
@@ -1309,11 +1319,11 @@ struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc)
  */
 struct sctp_chunk *sctp_chunkify(struct sk_buff *skb,
 			    const struct sctp_association *asoc,
-			    struct sock *sk)
+			    struct sock *sk, gfp_t gfp)
 {
 	struct sctp_chunk *retval;
 
-	retval = kmem_cache_zalloc(sctp_chunk_cachep, GFP_ATOMIC);
+	retval = kmem_cache_zalloc(sctp_chunk_cachep, gfp);
 
 	if (!retval)
 		goto nodata;
@@ -1361,7 +1371,8 @@ const union sctp_addr *sctp_source(const struct sctp_chunk *chunk)
  * arguments, reserving enough space for a 'paylen' byte payload.
  */
 static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc,
-					    __u8 type, __u8 flags, int paylen)
+					    __u8 type, __u8 flags, int paylen,
+					    gfp_t gfp)
 {
 	struct sctp_chunk *retval;
 	sctp_chunkhdr_t *chunk_hdr;
@@ -1369,8 +1380,7 @@ static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc,
 	struct sock *sk;
 
 	/* No need to allocate LL here, as this is only a chunk. */
-	skb = alloc_skb(WORD_ROUND(sizeof(sctp_chunkhdr_t) + paylen),
-			GFP_ATOMIC);
+	skb = alloc_skb(WORD_ROUND(sizeof(sctp_chunkhdr_t) + paylen), gfp);
 	if (!skb)
 		goto nodata;
 
@@ -1381,7 +1391,7 @@ static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc,
 	chunk_hdr->length = htons(sizeof(sctp_chunkhdr_t));
 
 	sk = asoc ? asoc->base.sk : NULL;
-	retval = sctp_chunkify(skb, asoc, sk);
+	retval = sctp_chunkify(skb, asoc, sk, gfp);
 	if (!retval) {
 		kfree_skb(skb);
 		goto nodata;
@@ -1400,16 +1410,18 @@ nodata:
 }
 
 static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc,
-					 __u8 flags, int paylen)
+					 __u8 flags, int paylen, gfp_t gfp)
 {
-	return _sctp_make_chunk(asoc, SCTP_CID_DATA, flags, paylen);
+	return _sctp_make_chunk(asoc, SCTP_CID_DATA, flags, paylen, gfp);
 }
 
 static struct sctp_chunk *sctp_make_control(const struct sctp_association *asoc,
-					    __u8 type, __u8 flags, int paylen)
+					    __u8 type, __u8 flags, int paylen,
+					    gfp_t gfp)
 {
-	struct sctp_chunk *chunk = _sctp_make_chunk(asoc, type, flags, paylen);
+	struct sctp_chunk *chunk;
 
+	chunk = _sctp_make_chunk(asoc, type, flags, paylen, gfp);
 	if (chunk)
 		sctp_control_set_owner_w(chunk);
 
@@ -2756,7 +2768,8 @@ static struct sctp_chunk *sctp_make_asconf(struct sctp_association *asoc,
 	length += addrlen;
 
 	/* Create the chunk.  */
-	retval = sctp_make_control(asoc, SCTP_CID_ASCONF, 0, length);
+	retval = sctp_make_control(asoc, SCTP_CID_ASCONF, 0, length,
+				   GFP_ATOMIC);
 	if (!retval)
 		return NULL;
 
@@ -2940,7 +2953,8 @@ static struct sctp_chunk *sctp_make_asconf_ack(const struct sctp_association *as
 	int			length = sizeof(asconf) + vparam_len;
 
 	/* Create the chunk.  */
-	retval = sctp_make_control(asoc, SCTP_CID_ASCONF_ACK, 0, length);
+	retval = sctp_make_control(asoc, SCTP_CID_ASCONF_ACK, 0, length,
+				   GFP_ATOMIC);
 	if (!retval)
 		return NULL;
 
@@ -3500,7 +3514,7 @@ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc,
 
 	hint = (nstreams + 1) * sizeof(__u32);
 
-	retval = sctp_make_control(asoc, SCTP_CID_FWD_TSN, 0, hint);
+	retval = sctp_make_control(asoc, SCTP_CID_FWD_TSN, 0, hint, GFP_ATOMIC);
 
 	if (!retval)
 		return NULL;
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index b5327bb77458..3c22c41a2bc2 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -1019,13 +1019,13 @@ static void sctp_cmd_t1_timer_update(struct sctp_association *asoc,
  * encouraged for small fragments.
  */
 static int sctp_cmd_send_msg(struct sctp_association *asoc,
-				struct sctp_datamsg *msg)
+				struct sctp_datamsg *msg, gfp_t gfp)
 {
 	struct sctp_chunk *chunk;
 	int error = 0;
 
 	list_for_each_entry(chunk, &msg->chunks, frag_list) {
-		error = sctp_outq_tail(&asoc->outqueue, chunk);
+		error = sctp_outq_tail(&asoc->outqueue, chunk, gfp);
 		if (error)
 			break;
 	}
@@ -1249,7 +1249,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 		case SCTP_CMD_NEW_ASOC:
 			/* Register a new association.  */
 			if (local_cork) {
-				sctp_outq_uncork(&asoc->outqueue);
+				sctp_outq_uncork(&asoc->outqueue, gfp);
 				local_cork = 0;
 			}
 
@@ -1269,7 +1269,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 
 		case SCTP_CMD_DELETE_TCB:
 			if (local_cork) {
-				sctp_outq_uncork(&asoc->outqueue);
+				sctp_outq_uncork(&asoc->outqueue, gfp);
 				local_cork = 0;
 			}
 			/* Delete the current association.  */
@@ -1423,13 +1423,14 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 				local_cork = 1;
 			}
 			/* Send a chunk to our peer.  */
-			error = sctp_outq_tail(&asoc->outqueue, cmd->obj.chunk);
+			error = sctp_outq_tail(&asoc->outqueue, cmd->obj.chunk,
+					       gfp);
 			break;
 
 		case SCTP_CMD_SEND_PKT:
 			/* Send a full packet to our peer.  */
 			packet = cmd->obj.packet;
-			sctp_packet_transmit(packet);
+			sctp_packet_transmit(packet, gfp);
 			sctp_ootb_pkt_free(packet);
 			break;
 
@@ -1639,7 +1640,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 			 */
 			chunk->pdiscard = 1;
 			if (asoc) {
-				sctp_outq_uncork(&asoc->outqueue);
+				sctp_outq_uncork(&asoc->outqueue, gfp);
 				local_cork = 0;
 			}
 			break;
@@ -1677,7 +1678,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 		case SCTP_CMD_FORCE_PRIM_RETRAN:
 			t = asoc->peer.retran_path;
 			asoc->peer.retran_path = asoc->peer.primary_path;
-			error = sctp_outq_uncork(&asoc->outqueue);
+			error = sctp_outq_uncork(&asoc->outqueue, gfp);
 			local_cork = 0;
 			asoc->peer.retran_path = t;
 			break;
@@ -1704,7 +1705,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 				sctp_outq_cork(&asoc->outqueue);
 				local_cork = 1;
 			}
-			error = sctp_cmd_send_msg(asoc, cmd->obj.msg);
+			error = sctp_cmd_send_msg(asoc, cmd->obj.msg, gfp);
 			break;
 		case SCTP_CMD_SEND_NEXT_ASCONF:
 			sctp_cmd_send_asconf(asoc);
@@ -1734,9 +1735,9 @@ out:
 	 */
 	if (asoc && SCTP_EVENT_T_CHUNK == event_type && chunk) {
 		if (chunk->end_of_packet || chunk->singleton)
-			error = sctp_outq_uncork(&asoc->outqueue);
+			error = sctp_outq_uncork(&asoc->outqueue, gfp);
 	} else if (local_cork)
-		error = sctp_outq_uncork(&asoc->outqueue);
+		error = sctp_outq_uncork(&asoc->outqueue, gfp);
 	return error;
 nomem:
 	error = -ENOMEM;
-- 
cgit v1.2.3-59-g8ed1b


From 885eb0a516e4d686849b91c5a1ba25c70b7a6540 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Fri, 11 Mar 2016 09:43:58 +0100
Subject: net: adjust napi_consume_skb to handle non-NAPI callers

Some drivers reuse/share code paths that free SKBs between NAPI
and non-NAPI calls. Adjust napi_consume_skb to handle this
use-case.

Before, calls from netpoll (w/ IRQs disabled) was handled and
indicated with a budget zero indication.  Use the same zero
indication to handle calls not originating from NAPI/softirq.
Simply handled by using dev_consume_skb_any().

This adds an extra branch+call for the netpoll case (checking
in_irq() + irqs_disabled()), but that is okay as this is a slowpath.

Suggested-by: Alexander Duyck <aduyck@mirantis.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 51d768e7bc90..f044f970f1a6 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -801,9 +801,9 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
 	if (unlikely(!skb))
 		return;
 
-	/* if budget is 0 assume netpoll w/ IRQs disabled */
+	/* Zero budget indicate non-NAPI context called us, like netpoll */
 	if (unlikely(!budget)) {
-		dev_consume_skb_irq(skb);
+		dev_consume_skb_any(skb);
 		return;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From f3c986908cc3b369b57f75de306c635a0074b76b Mon Sep 17 00:00:00 2001
From: "liping.zhang" <liping.zhang@spreadtrum.com>
Date: Fri, 11 Mar 2016 23:08:36 +0800
Subject: net: socket: use pr_info_once to tip the obsolete usage of PF_PACKET

There is no need to use the static variable here, pr_info_once is more
concise.

Signed-off-by: Liping Zhang <liping.zhang@spreadtrum.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/socket.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/socket.c b/net/socket.c
index 886649c88d8f..c5ddc52cf2b2 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1107,12 +1107,8 @@ int __sock_create(struct net *net, int family, int type, int protocol,
 	   deadlock in module load.
 	 */
 	if (family == PF_INET && type == SOCK_PACKET) {
-		static int warned;
-		if (!warned) {
-			warned = 1;
-			pr_info("%s uses obsolete (PF_INET,SOCK_PACKET)\n",
-				current->comm);
-		}
+		pr_info_once("%s uses obsolete (PF_INET,SOCK_PACKET)\n",
+			     current->comm);
 		family = PF_PACKET;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From fbd40ea0180a2d328c5adc61414dc8bab9335ce2 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sun, 13 Mar 2016 23:28:00 -0400
Subject: ipv4: Don't do expensive useless work during inetdev destroy.

When an inetdev is destroyed, every address assigned to the interface
is removed.  And in this scenerio we do two pointless things which can
be very expensive if the number of assigned interfaces is large:

1) Address promotion.  We are deleting all addresses, so there is no
   point in doing this.

2) A full nf conntrack table purge for every address.  We only need to
   do this once, as is already caught by the existing
   masq_dev_notifier so masq_inet_event() can skip this.

Reported-by: Solar Designer <solar@openwall.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Tested-by: Cyrill Gorcunov <gorcunov@openvz.org>
---
 net/ipv4/devinet.c                          |  4 ++++
 net/ipv4/fib_frontend.c                     |  4 ++++
 net/ipv4/netfilter/nf_nat_masquerade_ipv4.c | 12 ++++++++++--
 3 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 65e76a48382c..e333bc86bd39 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -334,6 +334,9 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 
 	ASSERT_RTNL();
 
+	if (in_dev->dead)
+		goto no_promotions;
+
 	/* 1. Deleting primary ifaddr forces deletion all secondaries
 	 * unless alias promotion is set
 	 **/
@@ -380,6 +383,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 			fib_del_ifaddr(ifa, ifa1);
 	}
 
+no_promotions:
 	/* 2. Unlink it */
 
 	*ifap = ifa1->ifa_next;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 473447593060..21add552e56a 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -922,6 +922,9 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
 		subnet = 1;
 	}
 
+	if (in_dev->dead)
+		goto no_promotions;
+
 	/* Deletion is more complicated than add.
 	 * We should take care of not to delete too much :-)
 	 *
@@ -997,6 +1000,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
 		}
 	}
 
+no_promotions:
 	if (!(ok & BRD_OK))
 		fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
 	if (subnet && ifa->ifa_prefixlen < 31) {
diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
index c6eb42100e9a..ea91058b5f6f 100644
--- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
@@ -108,10 +108,18 @@ static int masq_inet_event(struct notifier_block *this,
 			   unsigned long event,
 			   void *ptr)
 {
-	struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev;
+	struct in_device *idev = ((struct in_ifaddr *)ptr)->ifa_dev;
 	struct netdev_notifier_info info;
 
-	netdev_notifier_info_init(&info, dev);
+	/* The masq_dev_notifier will catch the case of the device going
+	 * down.  So if the inetdev is dead and being destroyed we have
+	 * no work to do.  Otherwise this is an individual address removal
+	 * and we have to perform the flush.
+	 */
+	if (idev->dead)
+		return NOTIFY_DONE;
+
+	netdev_notifier_info_init(&info, idev->dev);
 	return masq_device_event(this, event, &info);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 1e94082963747b551b129528714827f76a090e93 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Fri, 11 Mar 2016 14:05:41 -0800
Subject: ipv6: Pass proto to csum_ipv6_magic as __u8 instead of unsigned short

This patch updates csum_ipv6_magic so that it correctly recognizes that
protocol is a unsigned 8 bit value.

This will allow us to better understand what limitations may or may not be
present in how we handle the data.  For example there are a number of
places that call htonl on the protocol value.  This is likely not necessary
and can be replaced with a multiplication by ntohl(1) which will be
converted to a shift by the compiler.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/alpha/include/asm/checksum.h    | 3 +--
 arch/arm/include/asm/checksum.h      | 4 ++--
 arch/frv/include/asm/checksum.h      | 2 +-
 arch/ia64/include/asm/checksum.h     | 4 ++--
 arch/m68k/include/asm/checksum.h     | 2 +-
 arch/mips/include/asm/checksum.h     | 2 +-
 arch/parisc/include/asm/checksum.h   | 2 +-
 arch/score/include/asm/checksum.h    | 5 ++---
 arch/sh/include/asm/checksum_32.h    | 3 +--
 arch/sparc/include/asm/checksum_32.h | 3 +--
 arch/sparc/include/asm/checksum_64.h | 3 +--
 arch/x86/include/asm/checksum_32.h   | 3 +--
 arch/x86/include/asm/checksum_64.h   | 2 +-
 arch/x86/lib/csum-wrappers_64.c      | 2 +-
 arch/x86/um/asm/checksum_32.h        | 2 +-
 arch/xtensa/include/asm/checksum.h   | 2 +-
 include/net/ip6_checksum.h           | 3 +--
 net/ipv6/ip6_checksum.c              | 3 +--
 18 files changed, 21 insertions(+), 29 deletions(-)

(limited to 'net')

diff --git a/arch/alpha/include/asm/checksum.h b/arch/alpha/include/asm/checksum.h
index cba34b1c738c..f2bbdd2ace51 100644
--- a/arch/alpha/include/asm/checksum.h
+++ b/arch/alpha/include/asm/checksum.h
@@ -67,6 +67,5 @@ static inline __sum16 csum_fold(__wsum csum)
 #define _HAVE_ARCH_IPV6_CSUM
 extern __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 			       const struct in6_addr *daddr,
-			       __u32 len, unsigned short proto,
-			       __wsum sum);
+			       __u32 len, __u8 proto, __wsum sum);
 #endif
diff --git a/arch/arm/include/asm/checksum.h b/arch/arm/include/asm/checksum.h
index 42d020b7dfba..524692f4acab 100644
--- a/arch/arm/include/asm/checksum.h
+++ b/arch/arm/include/asm/checksum.h
@@ -144,8 +144,8 @@ __csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr, __
 		__be32 proto, __wsum sum);
 
 static inline __sum16
-csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr, __u32 len,
-		unsigned short proto, __wsum sum)
+csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr,
+		__u32 len, __u8 proto, __wsum sum)
 {
 	return csum_fold(__csum_ipv6_magic(saddr, daddr, htonl(len),
 					   htonl(proto), sum));
diff --git a/arch/frv/include/asm/checksum.h b/arch/frv/include/asm/checksum.h
index cd59cd4fd2d9..b77388c5901d 100644
--- a/arch/frv/include/asm/checksum.h
+++ b/arch/frv/include/asm/checksum.h
@@ -135,7 +135,7 @@ extern __sum16 ip_compute_csum(const void *buff, int len);
 #define _HAVE_ARCH_IPV6_CSUM
 static inline __sum16
 csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr,
-		__u32 len, unsigned short proto, __wsum sum)
+		__u32 len, __u8 proto, __wsum sum)
 {
 	unsigned long tmp, tmp2;
 
diff --git a/arch/ia64/include/asm/checksum.h b/arch/ia64/include/asm/checksum.h
index ac9c687e8384..7accf54162b2 100644
--- a/arch/ia64/include/asm/checksum.h
+++ b/arch/ia64/include/asm/checksum.h
@@ -69,7 +69,7 @@ static inline __sum16 csum_fold(__wsum csum)
 #define _HAVE_ARCH_IPV6_CSUM	1
 struct in6_addr;
 extern __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
-	const struct in6_addr *daddr, __u32 len, unsigned short proto,
-	__wsum csum);
+			       const struct in6_addr *daddr,
+			       __u32 len, __u8 proto, __wsum csum);
 
 #endif /* _ASM_IA64_CHECKSUM_H */
diff --git a/arch/m68k/include/asm/checksum.h b/arch/m68k/include/asm/checksum.h
index 2f88d867c711..75e91f03b178 100644
--- a/arch/m68k/include/asm/checksum.h
+++ b/arch/m68k/include/asm/checksum.h
@@ -117,7 +117,7 @@ static inline __sum16 ip_compute_csum(const void *buff, int len)
 #define _HAVE_ARCH_IPV6_CSUM
 static __inline__ __sum16
 csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr,
-		__u32 len, unsigned short proto, __wsum sum)
+		__u32 len, __u8 proto, __wsum sum)
 {
 	register unsigned long tmp;
 	__asm__("addl %2@,%0\n\t"
diff --git a/arch/mips/include/asm/checksum.h b/arch/mips/include/asm/checksum.h
index c635541d40b8..bce1ce53149a 100644
--- a/arch/mips/include/asm/checksum.h
+++ b/arch/mips/include/asm/checksum.h
@@ -215,7 +215,7 @@ static inline __sum16 ip_compute_csum(const void *buff, int len)
 #define _HAVE_ARCH_IPV6_CSUM
 static __inline__ __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 					  const struct in6_addr *daddr,
-					  __u32 len, unsigned short proto,
+					  __u32 len, __u8 proto,
 					  __wsum sum)
 {
 	__wsum tmp;
diff --git a/arch/parisc/include/asm/checksum.h b/arch/parisc/include/asm/checksum.h
index 9815ab1fc8aa..60c2c42619c9 100644
--- a/arch/parisc/include/asm/checksum.h
+++ b/arch/parisc/include/asm/checksum.h
@@ -122,7 +122,7 @@ static inline __sum16 ip_compute_csum(const void *buf, int len)
 #define _HAVE_ARCH_IPV6_CSUM
 static __inline__ __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 					  const struct in6_addr *daddr,
-					  __u32 len, unsigned short proto,
+					  __u32 len, __u8 proto,
 					  __wsum sum)
 {
 	__asm__ __volatile__ (
diff --git a/arch/score/include/asm/checksum.h b/arch/score/include/asm/checksum.h
index a375bc2700be..539d9fd45d21 100644
--- a/arch/score/include/asm/checksum.h
+++ b/arch/score/include/asm/checksum.h
@@ -179,9 +179,8 @@ static inline unsigned short ip_compute_csum(const void *buff, int len)
 
 #define _HAVE_ARCH_IPV6_CSUM
 static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
-				const struct in6_addr *daddr,
-				__u32 len, unsigned short proto,
-				__wsum sum)
+				      const struct in6_addr *daddr,
+				      __u32 len, __u8 proto, __wsum sum)
 {
 	__asm__ __volatile__(
 		".set\tvolatile\t\t\t# csum_ipv6_magic\n\t"
diff --git a/arch/sh/include/asm/checksum_32.h b/arch/sh/include/asm/checksum_32.h
index fd730f140c06..9c84386d35cb 100644
--- a/arch/sh/include/asm/checksum_32.h
+++ b/arch/sh/include/asm/checksum_32.h
@@ -159,8 +159,7 @@ static inline __sum16 ip_compute_csum(const void *buff, int len)
 #define _HAVE_ARCH_IPV6_CSUM
 static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 				      const struct in6_addr *daddr,
-				      __u32 len, unsigned short proto,
-				      __wsum sum)
+				      __u32 len, __u8 proto, __wsum sum)
 {
 	unsigned int __dummy;
 	__asm__("clrt\n\t"
diff --git a/arch/sparc/include/asm/checksum_32.h b/arch/sparc/include/asm/checksum_32.h
index 86ae655a3c0f..eff748c871ec 100644
--- a/arch/sparc/include/asm/checksum_32.h
+++ b/arch/sparc/include/asm/checksum_32.h
@@ -199,8 +199,7 @@ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
 
 static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 				      const struct in6_addr *daddr,
-				      __u32 len, unsigned short proto,
-				      __wsum sum)
+				      __u32 len, __u8 proto, __wsum sum)
 {
 	__asm__ __volatile__ (
 		"addcc	%3, %4, %%g4\n\t"
diff --git a/arch/sparc/include/asm/checksum_64.h b/arch/sparc/include/asm/checksum_64.h
index ef0c6f48189a..0395d75322e9 100644
--- a/arch/sparc/include/asm/checksum_64.h
+++ b/arch/sparc/include/asm/checksum_64.h
@@ -125,8 +125,7 @@ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
 
 static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 				      const struct in6_addr *daddr,
-				      __u32 len, unsigned short proto,
-				      __wsum sum)
+				      __u32 len, __u8 proto, __wsum sum)
 {
 	__asm__ __volatile__ (
 "	addcc		%3, %4, %%g7\n"
diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h
index 6f380605403d..532f85e6651f 100644
--- a/arch/x86/include/asm/checksum_32.h
+++ b/arch/x86/include/asm/checksum_32.h
@@ -149,8 +149,7 @@ static inline __sum16 ip_compute_csum(const void *buff, int len)
 #define _HAVE_ARCH_IPV6_CSUM
 static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 				      const struct in6_addr *daddr,
-				      __u32 len, unsigned short proto,
-				      __wsum sum)
+				      __u32 len, __u8 proto, __wsum sum)
 {
 	asm("addl 0(%1), %0	;\n"
 	    "adcl 4(%1), %0	;\n"
diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h
index 97b98e2039bc..c020ee75dce7 100644
--- a/arch/x86/include/asm/checksum_64.h
+++ b/arch/x86/include/asm/checksum_64.h
@@ -177,7 +177,7 @@ struct in6_addr;
 #define _HAVE_ARCH_IPV6_CSUM 1
 extern __sum16
 csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr,
-		__u32 len, unsigned short proto, __wsum sum);
+		__u32 len, __u8 proto, __wsum sum);
 
 static inline unsigned add32_with_carry(unsigned a, unsigned b)
 {
diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c
index 1318f75d56e4..28a6654f0d08 100644
--- a/arch/x86/lib/csum-wrappers_64.c
+++ b/arch/x86/lib/csum-wrappers_64.c
@@ -135,7 +135,7 @@ EXPORT_SYMBOL(csum_partial_copy_nocheck);
 
 __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 			const struct in6_addr *daddr,
-			__u32 len, unsigned short proto, __wsum sum)
+			__u32 len, __u8 proto, __wsum sum)
 {
 	__u64 rest, sum64;
 
diff --git a/arch/x86/um/asm/checksum_32.h b/arch/x86/um/asm/checksum_32.h
index ab77b6f9a4bf..83a75f8a1233 100644
--- a/arch/x86/um/asm/checksum_32.h
+++ b/arch/x86/um/asm/checksum_32.h
@@ -13,7 +13,7 @@ static inline __sum16 ip_compute_csum(const void *buff, int len)
 #define _HAVE_ARCH_IPV6_CSUM
 static __inline__ __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 					  const struct in6_addr *daddr,
-					  __u32 len, unsigned short proto,
+					  __u32 len, __u8 proto,
 					  __wsum sum)
 {
 	__asm__(
diff --git a/arch/xtensa/include/asm/checksum.h b/arch/xtensa/include/asm/checksum.h
index 62254e6688f5..ec35074fcb03 100644
--- a/arch/xtensa/include/asm/checksum.h
+++ b/arch/xtensa/include/asm/checksum.h
@@ -175,7 +175,7 @@ static __inline__ __sum16 ip_compute_csum(const void *buff, int len)
 #define _HAVE_ARCH_IPV6_CSUM
 static __inline__ __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 					  const struct in6_addr *daddr,
-					  __u32 len, unsigned short proto,
+					  __u32 len, __u8 proto,
 					  __wsum sum)
 {
 	unsigned int __dummy;
diff --git a/include/net/ip6_checksum.h b/include/net/ip6_checksum.h
index 1a49b73f7f6e..cca840584c88 100644
--- a/include/net/ip6_checksum.h
+++ b/include/net/ip6_checksum.h
@@ -37,8 +37,7 @@
 #ifndef _HAVE_ARCH_IPV6_CSUM
 __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 			const struct in6_addr *daddr,
-			__u32 len, unsigned short proto,
-			__wsum csum);
+			__u32 len, __u8 proto, __wsum csum);
 #endif
 
 static inline __wsum ip6_compute_pseudo(struct sk_buff *skb, int proto)
diff --git a/net/ipv6/ip6_checksum.c b/net/ipv6/ip6_checksum.c
index 8f920580976f..b2025bf3da4a 100644
--- a/net/ipv6/ip6_checksum.c
+++ b/net/ipv6/ip6_checksum.c
@@ -6,8 +6,7 @@
 #ifndef _HAVE_ARCH_IPV6_CSUM
 __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 			const struct in6_addr *daddr,
-			__u32 len, unsigned short proto,
-			__wsum csum)
+			__u32 len, __u8 proto, __wsum csum)
 {
 
 	int carry;
-- 
cgit v1.2.3-59-g8ed1b


From 08334824951dd6d1295860da07b1236d18b0b8df Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Fri, 11 Mar 2016 14:05:47 -0800
Subject: GSO/UDP: Use skb->len instead of udph->len to determine length of
 original skb

It is possible for tunnels to end up generating IP or IPv6 datagrams that
are larger than 64K and expecting to be segmented.  As such we need to deal
with length values greater than 64K.  In order to accommodate this we need
to update the code to work with a 32b length value instead of a 16b one.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp_offload.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index f5abb1ae1358..8a3405a80260 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -40,13 +40,19 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 	__be16 protocol = skb->protocol;
 	u16 mac_len = skb->mac_len;
 	int udp_offset, outer_hlen;
-	u32 partial;
+	__wsum partial;
 
 	if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
 		goto out;
 
-	/* adjust partial header checksum to negate old length */
-	partial = (__force u32)uh->check + (__force u16)~uh->len;
+	/* Adjust partial header checksum to negate old length.
+	 * We cannot rely on the value contained in uh->len as it is
+	 * possible that the actual value exceeds the boundaries of the
+	 * 16 bit length field due to the header being added outside of an
+	 * IP or IPv6 frame that was already limited to 64K - 1.
+	 */
+	partial = csum_sub(csum_unfold(uh->check),
+			   (__force __wsum)htonl(skb->len));
 
 	/* setup inner skb. */
 	skb->encapsulation = 0;
@@ -119,8 +125,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 		if (!need_csum)
 			continue;
 
-		uh->check = ~csum_fold((__force __wsum)
-				       ((__force u32)len + partial));
+		uh->check = ~csum_fold(csum_add(partial, (__force __wsum)len));
 
 		if (skb->encapsulation || !offload_csum) {
 			uh->check = gso_make_checksum(skb, ~uh->check);
-- 
cgit v1.2.3-59-g8ed1b


From 8cb2d8bf57e6e004c37db2fb4ce74f4d032b7cd0 Mon Sep 17 00:00:00 2001
From: Gregory CLEMENT <gregory.clement@free-electrons.com>
Date: Mon, 14 Mar 2016 09:39:04 +0100
Subject: net: add a hardware buffer management helper API

This basic implementation allows to share code between driver using
hardware buffer management. As the code is hardware agnostic, there is
few helpers, most of the optimization brought by the an HW BM has to be
done at driver level.

Tested-by: Sebastian Careba <nitroshift@yahoo.com>
Signed-off-by: Gregory CLEMENT <gregory.clement@free-electrons.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/hwbm.h | 28 ++++++++++++++++++
 net/Kconfig        |  3 ++
 net/core/Makefile  |  1 +
 net/core/hwbm.c    | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 119 insertions(+)
 create mode 100644 include/net/hwbm.h
 create mode 100644 net/core/hwbm.c

(limited to 'net')

diff --git a/include/net/hwbm.h b/include/net/hwbm.h
new file mode 100644
index 000000000000..47d08662501b
--- /dev/null
+++ b/include/net/hwbm.h
@@ -0,0 +1,28 @@
+#ifndef _HWBM_H
+#define _HWBM_H
+
+struct hwbm_pool {
+	/* Capacity of the pool */
+	int size;
+	/* Size of the buffers managed */
+	int frag_size;
+	/* Number of buffers currently used by this pool */
+	int buf_num;
+	/* constructor called during alocation */
+	int (*construct)(struct hwbm_pool *bm_pool, void *buf);
+	/* protect acces to the buffer counter*/
+	spinlock_t lock;
+	/* private data */
+	void *priv;
+};
+#ifdef CONFIG_HWBM
+void hwbm_buf_free(struct hwbm_pool *bm_pool, void *buf);
+int hwbm_pool_refill(struct hwbm_pool *bm_pool, gfp_t gfp);
+int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp);
+#else
+void hwbm_buf_free(struct hwbm_pool *bm_pool, void *buf) {}
+int hwbm_pool_refill(struct hwbm_pool *bm_pool, gfp_t gfp) { return 0; }
+int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp)
+{ return 0; }
+#endif /* CONFIG_HWBM */
+#endif /* _HWBM_H */
diff --git a/net/Kconfig b/net/Kconfig
index 10640d5f8bee..e13449870d06 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -253,6 +253,9 @@ config XPS
 	depends on SMP
 	default y
 
+config HWBM
+       bool
+
 config SOCK_CGROUP_DATA
 	bool
 	default n
diff --git a/net/core/Makefile b/net/core/Makefile
index 014422e2561f..d6508c2ddca5 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -25,4 +25,5 @@ obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
 obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
 obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
 obj-$(CONFIG_DST_CACHE) += dst_cache.o
+obj-$(CONFIG_HWBM) += hwbm.o
 obj-$(CONFIG_NET_DEVLINK) += devlink.o
diff --git a/net/core/hwbm.c b/net/core/hwbm.c
new file mode 100644
index 000000000000..941c28486896
--- /dev/null
+++ b/net/core/hwbm.c
@@ -0,0 +1,87 @@
+/* Support for hardware buffer manager.
+ *
+ * Copyright (C) 2016 Marvell
+ *
+ * Gregory CLEMENT <gregory.clement@free-electrons.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/printk.h>
+#include <linux/skbuff.h>
+#include <net/hwbm.h>
+
+void hwbm_buf_free(struct hwbm_pool *bm_pool, void *buf)
+{
+	if (likely(bm_pool->frag_size <= PAGE_SIZE))
+		skb_free_frag(buf);
+	else
+		kfree(buf);
+}
+EXPORT_SYMBOL_GPL(hwbm_buf_free);
+
+/* Refill processing for HW buffer management */
+int hwbm_pool_refill(struct hwbm_pool *bm_pool, gfp_t gfp)
+{
+	int frag_size = bm_pool->frag_size;
+	void *buf;
+
+	if (likely(frag_size <= PAGE_SIZE))
+		buf = netdev_alloc_frag(frag_size);
+	else
+		buf = kmalloc(frag_size, gfp);
+
+	if (!buf)
+		return -ENOMEM;
+
+	if (bm_pool->construct)
+		if (bm_pool->construct(bm_pool, buf)) {
+			hwbm_buf_free(bm_pool, buf);
+			return -ENOMEM;
+		}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(hwbm_pool_refill);
+
+int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp)
+{
+	int err, i;
+	unsigned long flags;
+
+	spin_lock_irqsave(&bm_pool->lock, flags);
+	if (bm_pool->buf_num == bm_pool->size) {
+		pr_warn("pool already filled\n");
+		return bm_pool->buf_num;
+	}
+
+	if (buf_num + bm_pool->buf_num > bm_pool->size) {
+		pr_warn("cannot allocate %d buffers for pool\n",
+			buf_num);
+		return 0;
+	}
+
+	if ((buf_num + bm_pool->buf_num) < bm_pool->buf_num) {
+		pr_warn("Adding %d buffers to the %d current buffers will overflow\n",
+			buf_num,  bm_pool->buf_num);
+		return 0;
+	}
+
+	for (i = 0; i < buf_num; i++) {
+		err = hwbm_pool_refill(bm_pool, gfp);
+		if (err < 0)
+			break;
+	}
+
+	/* Update BM driver with number of buffers added to pool */
+	bm_pool->buf_num += i;
+
+	pr_debug("hwpm pool: %d of %d buffers added\n", i, buf_num);
+	spin_unlock_irqrestore(&bm_pool->lock, flags);
+
+	return i;
+}
+EXPORT_SYMBOL_GPL(hwbm_pool_add);
-- 
cgit v1.2.3-59-g8ed1b


From 9bd160bfa27fa41927dbbce7ee0ea779700e09ef Mon Sep 17 00:00:00 2001
From: Richard Alpe <richard.alpe@ericsson.com>
Date: Mon, 14 Mar 2016 09:43:52 +0100
Subject: tipc: make sure IPv6 header fits in skb headroom

Expand headroom further in order to be able to fit the larger IPv6
header. Prior to this patch this caused a skb under panic for certain
tipc packets when using IPv6 UDP bearer(s).

Signed-off-by: Richard Alpe <richard.alpe@ericsson.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/udp_media.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index c94f9a15e2cd..c9cf2be3674a 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -53,7 +53,7 @@
 /* IANA assigned UDP port */
 #define UDP_PORT_DEFAULT	6118
 
-#define UDP_MIN_HEADROOM        28
+#define UDP_MIN_HEADROOM        48
 
 /**
  * struct udp_media_addr - IP/UDP addressing information
-- 
cgit v1.2.3-59-g8ed1b


From 34b88a68f26a75e4fded796f1a49c40f82234b7d Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 14 Mar 2016 09:56:35 -0300
Subject: net: Fix use after free in the recvmmsg exit path

The syzkaller fuzzer hit the following use-after-free:

  Call Trace:
   [<ffffffff8175ea0e>] __asan_report_load8_noabort+0x3e/0x40 mm/kasan/report.c:295
   [<ffffffff851cc31a>] __sys_recvmmsg+0x6fa/0x7f0 net/socket.c:2261
   [<     inline     >] SYSC_recvmmsg net/socket.c:2281
   [<ffffffff851cc57f>] SyS_recvmmsg+0x16f/0x180 net/socket.c:2270
   [<ffffffff86332bb6>] entry_SYSCALL_64_fastpath+0x16/0x7a
  arch/x86/entry/entry_64.S:185

And, as Dmitry rightly assessed, that is because we can drop the
reference and then touch it when the underlying recvmsg calls return
some packets and then hit an error, which will make recvmmsg to set
sock->sk->sk_err, oops, fix it.

Reported-and-Tested-by: Dmitry Vyukov <dvyukov@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Kostya Serebryany <kcc@google.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Fixes: a2e2725541fa ("net: Introduce recvmmsg socket syscall")
http://lkml.kernel.org/r/20160122211644.GC2470@redhat.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/socket.c | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/net/socket.c b/net/socket.c
index c5ddc52cf2b2..5f77a8e93830 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2244,31 +2244,31 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
 		cond_resched();
 	}
 
-out_put:
-	fput_light(sock->file, fput_needed);
-
 	if (err == 0)
-		return datagrams;
+		goto out_put;
 
-	if (datagrams != 0) {
+	if (datagrams == 0) {
+		datagrams = err;
+		goto out_put;
+	}
+
+	/*
+	 * We may return less entries than requested (vlen) if the
+	 * sock is non block and there aren't enough datagrams...
+	 */
+	if (err != -EAGAIN) {
 		/*
-		 * We may return less entries than requested (vlen) if the
-		 * sock is non block and there aren't enough datagrams...
+		 * ... or  if recvmsg returns an error after we
+		 * received some datagrams, where we record the
+		 * error to return on the next call or if the
+		 * app asks about it using getsockopt(SO_ERROR).
 		 */
-		if (err != -EAGAIN) {
-			/*
-			 * ... or  if recvmsg returns an error after we
-			 * received some datagrams, where we record the
-			 * error to return on the next call or if the
-			 * app asks about it using getsockopt(SO_ERROR).
-			 */
-			sock->sk->sk_err = -err;
-		}
-
-		return datagrams;
+		sock->sk->sk_err = -err;
 	}
+out_put:
+	fput_light(sock->file, fput_needed);
 
-	return err;
+	return datagrams;
 }
 
 SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg,
-- 
cgit v1.2.3-59-g8ed1b


From 8e0cc8c326d99e41468c96fea9785ab78883a281 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 14 Mar 2016 15:18:38 +0100
Subject: net: caif: fix misleading indentation

gcc points out code that is not indented the way it is
interpreted:

net/caif/cfpkt_skbuff.c: In function 'cfpkt_setlen':
net/caif/cfpkt_skbuff.c:289:4: error: statement is indented as if it were guarded by... [-Werror=misleading-indentation]
    return cfpkt_getlen(pkt);
    ^~~~~~
net/caif/cfpkt_skbuff.c:286:3: note: ...this 'else' clause, but it is not
   else
   ^~~~

It is clear from the context that not returning here would be
a bug, as we'd end up passing a negative length into a function
that takes a u16 length, so it is not missing curly braces
here, and I'm assuming that the indentation is the only part
that's wrong about it.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/caif/cfpkt_skbuff.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/caif/cfpkt_skbuff.c b/net/caif/cfpkt_skbuff.c
index f6c3b2137eea..59ce1fcc220c 100644
--- a/net/caif/cfpkt_skbuff.c
+++ b/net/caif/cfpkt_skbuff.c
@@ -286,7 +286,7 @@ int cfpkt_setlen(struct cfpkt *pkt, u16 len)
 		else
 			skb_trim(skb, len);
 
-			return cfpkt_getlen(pkt);
+		return cfpkt_getlen(pkt);
 	}
 
 	/* Need to expand SKB */
-- 
cgit v1.2.3-59-g8ed1b


From a44d6eacdaf56f74fad699af7f4925a5f5ac0e7f Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Mon, 14 Mar 2016 10:52:15 -0700
Subject: tcp: Add RFC4898 tcpEStatsPerfDataSegsOut/In

Per RFC4898, they count segments sent/received
containing a positive length data segment (that includes
retransmission segments carrying data).  Unlike
tcpi_segs_out/in, tcpi_data_segs_out/in excludes segments
carrying no data (e.g. pure ack).

The patch also updates the segs_in in tcp_fastopen_add_skb()
so that segs_in >= data_segs_in property is kept.

Together with retransmission data, tcpi_data_segs_out
gives a better signal on the rxmit rate.

v6: Rebase on the latest net-next

v5: Eric pointed out that checking skb->len is still needed in
tcp_fastopen_add_skb() because skb can carry a FIN without data.
Hence, instead of open coding segs_in and data_segs_in, tcp_segs_in()
helper is used.  Comment is added to the fastopen case to explain why
segs_in has to be reset and tcp_segs_in() has to be called before
__skb_pull().

v4: Add comment to the changes in tcp_fastopen_add_skb()
and also add remark on this case in the commit message.

v3: Add const modifier to the skb parameter in tcp_segs_in()

v2: Rework based on recent fix by Eric:
commit a9d99ce28ed3 ("tcp: fix tcpi_segs_in after connection establishment")

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Chris Rapier <rapier@psc.edu>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Marcelo Ricardo Leitner <mleitner@redhat.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      |  6 ++++++
 include/net/tcp.h        | 10 ++++++++++
 include/uapi/linux/tcp.h |  2 ++
 net/ipv4/tcp.c           |  2 ++
 net/ipv4/tcp_fastopen.c  |  8 ++++++++
 net/ipv4/tcp_ipv4.c      |  2 +-
 net/ipv4/tcp_minisocks.c |  2 +-
 net/ipv4/tcp_output.c    |  4 +++-
 net/ipv6/tcp_ipv6.c      |  2 +-
 9 files changed, 34 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index bcbf51da4e1e..7be9b1242354 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -158,6 +158,9 @@ struct tcp_sock {
 	u32	segs_in;	/* RFC4898 tcpEStatsPerfSegsIn
 				 * total number of segments in.
 				 */
+	u32	data_segs_in;	/* RFC4898 tcpEStatsPerfDataSegsIn
+				 * total number of data segments in.
+				 */
  	u32	rcv_nxt;	/* What we want to receive next 	*/
 	u32	copied_seq;	/* Head of yet unread data		*/
 	u32	rcv_wup;	/* rcv_nxt on last window update sent	*/
@@ -165,6 +168,9 @@ struct tcp_sock {
 	u32	segs_out;	/* RFC4898 tcpEStatsPerfSegsOut
 				 * The total number of segments sent.
 				 */
+	u32	data_segs_out;	/* RFC4898 tcpEStatsPerfDataSegsOut
+				 * total number of data segments sent.
+				 */
 	u64	bytes_acked;	/* RFC4898 tcpEStatsAppHCThruOctetsAcked
 				 * sum(delta(snd_una)), or how many bytes
 				 * were acked.
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0302636af98c..c8dbd293daae 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1840,4 +1840,14 @@ static inline int tcp_inq(struct sock *sk)
 	return answ;
 }
 
+static inline void tcp_segs_in(struct tcp_sock *tp, const struct sk_buff *skb)
+{
+	u16 segs_in;
+
+	segs_in = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+	tp->segs_in += segs_in;
+	if (skb->len > tcp_hdrlen(skb))
+		tp->data_segs_in += segs_in;
+}
+
 #endif	/* _TCP_H */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index fe95446e9abf..53e8e3fe6b1b 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -199,6 +199,8 @@ struct tcp_info {
 
 	__u32	tcpi_notsent_bytes;
 	__u32	tcpi_min_rtt;
+	__u32	tcpi_data_segs_in;	/* RFC4898 tcpEStatsDataSegsIn */
+	__u32	tcpi_data_segs_out;	/* RFC4898 tcpEStatsDataSegsOut */
 };
 
 /* for TCP_MD5SIG socket option */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index a265f00b9df9..992b3103ec3e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2715,6 +2715,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	info->tcpi_notsent_bytes = max(0, notsent_bytes);
 
 	info->tcpi_min_rtt = tcp_min_rtt(tp);
+	info->tcpi_data_segs_in = tp->data_segs_in;
+	info->tcpi_data_segs_out = tp->data_segs_out;
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
 
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index fdb286ddba04..4fc0061bebf4 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -140,6 +140,14 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb)
 		return;
 
 	skb_dst_drop(skb);
+	/* segs_in has been initialized to 1 in tcp_create_openreq_child().
+	 * Hence, reset segs_in to 0 before calling tcp_segs_in()
+	 * to avoid double counting.  Also, tcp_segs_in() expects
+	 * skb->len to include the tcp_hdrlen.  Hence, it should
+	 * be called before __skb_pull().
+	 */
+	tp->segs_in = 0;
+	tcp_segs_in(tp, skb);
 	__skb_pull(skb, tcp_hdrlen(skb));
 	skb_set_owner_r(skb, sk);
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4c8d58dfac9b..0b02ef773705 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1650,7 +1650,7 @@ process:
 	sk_incoming_cpu_update(sk);
 
 	bh_lock_sock_nested(sk);
-	tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+	tcp_segs_in(tcp_sk(sk), skb);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
 		if (!tcp_prequeue(sk, skb))
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index ae90e4b34bd3..acb366dd61e6 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -812,7 +812,7 @@ int tcp_child_process(struct sock *parent, struct sock *child,
 	int ret = 0;
 	int state = child->sk_state;
 
-	tcp_sk(child)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+	tcp_segs_in(tcp_sk(child), skb);
 	if (!sock_owned_by_user(child)) {
 		ret = tcp_rcv_state_process(child, skb);
 		/* Wakeup parent, send SIGIO */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7d2c7a400456..7d2dc015cd19 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1003,8 +1003,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	if (likely(tcb->tcp_flags & TCPHDR_ACK))
 		tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
 
-	if (skb->len != tcp_header_size)
+	if (skb->len != tcp_header_size) {
 		tcp_event_data_sent(tp, sk);
+		tp->data_segs_out += tcp_skb_pcount(skb);
+	}
 
 	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
 		TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 33f2820181f9..9c16565b70cc 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1443,7 +1443,7 @@ process:
 	sk_incoming_cpu_update(sk);
 
 	bh_lock_sock_nested(sk);
-	tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+	tcp_segs_in(tcp_sk(sk), skb);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
 		if (!tcp_prequeue(sk, skb))
-- 
cgit v1.2.3-59-g8ed1b


From b71be352f76ac3c0559feec2fd960ab9c31404c9 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sat, 12 Mar 2016 00:01:37 +0100
Subject: dsa: slave: Don't reference NULL pointer during phy_disconnect

When the phy is disconnected, the parent pointer to the netdev it was
attached to is set to NULL. The code then tries to suspend the phy,
but dsa_slave_fixed_link_update needs the parent pointer to determine
which switch the phy is connected to. So it dereferenced a NULL
pointer. Check for this condition.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/slave.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 27bf03d11670..49056d90b179 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -896,11 +896,15 @@ static void dsa_slave_adjust_link(struct net_device *dev)
 static int dsa_slave_fixed_link_update(struct net_device *dev,
 				       struct fixed_phy_status *status)
 {
-	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->parent;
+	struct dsa_slave_priv *p;
+	struct dsa_switch *ds;
 
-	if (ds->drv->fixed_link_update)
-		ds->drv->fixed_link_update(ds, p->port, status);
+	if (dev) {
+		p = netdev_priv(dev);
+		ds = p->parent;
+		if (ds->drv->fixed_link_update)
+			ds->drv->fixed_link_update(ds, p->port, status);
+	}
 
 	return 0;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 3a44514ff979d7b0192a9f67445509f4ea7dfe2f Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sat, 12 Mar 2016 00:01:38 +0100
Subject: dsa: Destroy fixed link phys after the phy has been disconnected

The phy is disconnected from the slave in dsa_slave_destroy(). Don't
destroy fixed link phys until after this, since there can be fixed
linked phys connected to ports.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/dsa.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index d8fb47fcad05..1018e7dcfcc9 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -430,7 +430,18 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
 		hwmon_device_unregister(ds->hwmon_dev);
 #endif
 
-	/* Disable configuration of the CPU and DSA ports */
+	/* Destroy network devices for physical switch ports. */
+	for (port = 0; port < DSA_MAX_PORTS; port++) {
+		if (!(ds->phys_port_mask & (1 << port)))
+			continue;
+
+		if (!ds->ports[port])
+			continue;
+
+		dsa_slave_destroy(ds->ports[port]);
+	}
+
+	/* Remove any fixed link PHYs */
 	for (port = 0; port < DSA_MAX_PORTS; port++) {
 		if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)))
 			continue;
@@ -448,17 +459,6 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
 		}
 	}
 
-	/* Destroy network devices for physical switch ports. */
-	for (port = 0; port < DSA_MAX_PORTS; port++) {
-		if (!(ds->phys_port_mask & (1 << port)))
-			continue;
-
-		if (!ds->ports[port])
-			continue;
-
-		dsa_slave_destroy(ds->ports[port]);
-	}
-
 	mdiobus_unregister(ds->slave_mii_bus);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From ec777e6b4254648be51146d4e7d735ee3d06bd13 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sat, 12 Mar 2016 00:01:39 +0100
Subject: dsa: dsa: Fix freeing of fixed-phys from user ports.

All ports types can have a fixed PHY associated with it. Remove the
check which limits removal to only CPU and DSA ports.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/dsa.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'net')

diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 1018e7dcfcc9..f100f340d93f 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -443,9 +443,6 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
 
 	/* Remove any fixed link PHYs */
 	for (port = 0; port < DSA_MAX_PORTS; port++) {
-		if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)))
-			continue;
-
 		port_dn = cd->port_dn[port];
 		if (of_phy_is_fixed_link(port_dn)) {
 			phydev = of_phy_find_device(port_dn);
-- 
cgit v1.2.3-59-g8ed1b


From 5bcbe0f35fb13e31fdd0b2dc9695f19ab0208207 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sat, 12 Mar 2016 00:01:40 +0100
Subject: phy: fixed: Fix removal of phys.

The fixed phys delete function simply removed the fixed phy from the
internal linked list and freed the memory. It however did not
unregister the associated phy device. This meant it was still possible
to find the phy device on the mdio bus.

Make fixed_phy_del() an internal function and add a
fixed_phy_unregister() to unregisters the phy device and then uses
fixed_phy_del() to free resources.

Modify DSA to use this new API function, so we don't leak phys.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/fixed_phy.c | 11 +++++++++--
 include/linux/phy_fixed.h   |  5 ++---
 net/dsa/dsa.c               |  4 +---
 3 files changed, 12 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/drivers/net/phy/fixed_phy.c b/drivers/net/phy/fixed_phy.c
index ab9c473d75ea..fc07a8866020 100644
--- a/drivers/net/phy/fixed_phy.c
+++ b/drivers/net/phy/fixed_phy.c
@@ -285,7 +285,7 @@ err_regs:
 }
 EXPORT_SYMBOL_GPL(fixed_phy_add);
 
-void fixed_phy_del(int phy_addr)
+static void fixed_phy_del(int phy_addr)
 {
 	struct fixed_mdio_bus *fmb = &platform_fmb;
 	struct fixed_phy *fp, *tmp;
@@ -300,7 +300,6 @@ void fixed_phy_del(int phy_addr)
 		}
 	}
 }
-EXPORT_SYMBOL_GPL(fixed_phy_del);
 
 static int phy_fixed_addr;
 static DEFINE_SPINLOCK(phy_fixed_addr_lock);
@@ -371,6 +370,14 @@ struct phy_device *fixed_phy_register(unsigned int irq,
 }
 EXPORT_SYMBOL_GPL(fixed_phy_register);
 
+void fixed_phy_unregister(struct phy_device *phy)
+{
+	phy_device_remove(phy);
+
+	fixed_phy_del(phy->mdio.addr);
+}
+EXPORT_SYMBOL_GPL(fixed_phy_unregister);
+
 static int __init fixed_mdio_bus_init(void)
 {
 	struct fixed_mdio_bus *fmb = &platform_fmb;
diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h
index 2400d2ea4f34..1d41ec44e39d 100644
--- a/include/linux/phy_fixed.h
+++ b/include/linux/phy_fixed.h
@@ -19,7 +19,7 @@ extern struct phy_device *fixed_phy_register(unsigned int irq,
 					     struct fixed_phy_status *status,
 					     int link_gpio,
 					     struct device_node *np);
-extern void fixed_phy_del(int phy_addr);
+extern void fixed_phy_unregister(struct phy_device *phydev);
 extern int fixed_phy_set_link_update(struct phy_device *phydev,
 			int (*link_update)(struct net_device *,
 					   struct fixed_phy_status *));
@@ -40,9 +40,8 @@ static inline struct phy_device *fixed_phy_register(unsigned int irq,
 {
 	return ERR_PTR(-ENODEV);
 }
-static inline int fixed_phy_del(int phy_addr)
+static inline void fixed_phy_unregister(struct phy_device *phydev)
 {
-	return -ENODEV;
 }
 static inline int fixed_phy_set_link_update(struct phy_device *phydev,
 			int (*link_update)(struct net_device *,
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index f100f340d93f..c28c47463b7e 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -447,11 +447,9 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
 		if (of_phy_is_fixed_link(port_dn)) {
 			phydev = of_phy_find_device(port_dn);
 			if (phydev) {
-				int addr = phydev->mdio.addr;
-
 				phy_device_free(phydev);
 				of_node_put(port_dn);
-				fixed_phy_del(addr);
+				fixed_phy_unregister(phydev);
 			}
 		}
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 8626c56c8279b88b91b8df55345cdf89542d5e90 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sat, 12 Mar 2016 11:14:42 +0100
Subject: bridge: fix potential use-after-free when hook returns QUEUE or
 STOLEN verdict

Zefir Kurtisi reported kernel panic with an openwrt specific patch.
However, it turns out that mainline has a similar bug waiting to happen.

Once NF_HOOK() returns the skb is in undefined state and must not be
used.   Moreover, the okfn must consume the skb to support async
processing (NF_QUEUE).

Current okfn in this spot doesn't consume it and caller assumes that
NF_HOOK return value tells us if skb was freed or not, but thats wrong.

It "works" because no in-tree user registers a NFPROTO_BRIDGE hook at
LOCAL_IN that returns STOLEN or NF_QUEUE verdicts.

Once we add NF_QUEUE support for nftables bridge this will break --
NF_QUEUE holds the skb for async processing, caller will erronoulsy
return RX_HANDLER_PASS and on reinject netfilter will access free'd skb.

Fix this by pushing skb up the stack in the okfn instead.

NB: It also seems dubious to use LOCAL_IN while bypassing PRE_ROUTING
completely in this case but this is how its been forever so it seems
preferable to not change this.

Cc: Felix Fietkau <nbd@openwrt.org>
Cc: Zefir Kurtisi <zefir.kurtisi@neratec.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Tested-by: Zefir Kurtisi <zefir.kurtisi@neratec.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_input.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index f7fba74108a9..160797722228 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -222,7 +222,10 @@ static int br_handle_local_finish(struct net *net, struct sock *sk, struct sk_bu
 	/* check if vlan is allowed, to avoid spoofing */
 	if (p->flags & BR_LEARNING && br_should_learn(p, skb, &vid))
 		br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, false);
-	return 0;	 /* process further */
+
+	BR_INPUT_SKB_CB(skb)->brdev = p->br->dev;
+	br_pass_frame_up(skb);
+	return 0;
 }
 
 /*
@@ -284,14 +287,9 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
 		}
 
 		/* Deliver packet to local host only */
-		if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN,
-			    dev_net(skb->dev), NULL, skb, skb->dev, NULL,
-			    br_handle_local_finish)) {
-			return RX_HANDLER_CONSUMED; /* consumed by filter */
-		} else {
-			*pskb = skb;
-			return RX_HANDLER_PASS;	/* continue processing */
-		}
+		NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, dev_net(skb->dev),
+			NULL, skb, skb->dev, NULL, br_handle_local_finish);
+		return RX_HANDLER_CONSUMED;
 	}
 
 forward:
-- 
cgit v1.2.3-59-g8ed1b


From 71327a4e7d997276d49db92fd3d30008389ee6d5 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Sun, 13 Mar 2016 16:21:32 -0400
Subject: net: dsa: rename port_*_bridge routines

Rename DSA port_join_bridge and port_leave_bridge routines to
respectively port_bridge_join and port_bridge_leave in order to respect
an implicit Port::Bridge namespace.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/dsa/dsa.txt | 4 ++--
 drivers/net/dsa/bcm_sf2.c            | 4 ++--
 drivers/net/dsa/mv88e6171.c          | 4 ++--
 drivers/net/dsa/mv88e6352.c          | 4 ++--
 include/net/dsa.h                    | 4 ++--
 net/dsa/slave.c                      | 8 ++++----
 6 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/Documentation/networking/dsa/dsa.txt b/Documentation/networking/dsa/dsa.txt
index 974e9c387d1e..3b196c304b73 100644
--- a/Documentation/networking/dsa/dsa.txt
+++ b/Documentation/networking/dsa/dsa.txt
@@ -521,12 +521,12 @@ See Documentation/hwmon/sysfs-interface for details.
 Bridge layer
 ------------
 
-- port_join_bridge: bridge layer function invoked when a given switch port is
+- port_bridge_join: bridge layer function invoked when a given switch port is
   added to a bridge, this function should be doing the necessary at the switch
   level to permit the joining port from being added to the relevant logical
   domain for it to ingress/egress traffic with other members of the bridge.
 
-- port_leave_bridge: bridge layer function invoked when a given switch port is
+- port_bridge_leave: bridge layer function invoked when a given switch port is
   removed from a bridge, this function should be doing the necessary at the
   switch level to deny the leaving port from ingress/egress traffic from the
   remaining bridge members. When the port leaves the bridge, it should be aged
diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 3f627598f277..4bcc9ebf5e06 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -1387,8 +1387,8 @@ static struct dsa_switch_driver bcm_sf2_switch_driver = {
 	.port_disable		= bcm_sf2_port_disable,
 	.get_eee		= bcm_sf2_sw_get_eee,
 	.set_eee		= bcm_sf2_sw_set_eee,
-	.port_join_bridge	= bcm_sf2_sw_br_join,
-	.port_leave_bridge	= bcm_sf2_sw_br_leave,
+	.port_bridge_join	= bcm_sf2_sw_br_join,
+	.port_bridge_leave	= bcm_sf2_sw_br_leave,
 	.port_stp_update	= bcm_sf2_sw_br_set_stp_state,
 	.port_fdb_prepare	= bcm_sf2_sw_fdb_prepare,
 	.port_fdb_add		= bcm_sf2_sw_fdb_add,
diff --git a/drivers/net/dsa/mv88e6171.c b/drivers/net/dsa/mv88e6171.c
index d72ccbdf53ec..c0164b98fc08 100644
--- a/drivers/net/dsa/mv88e6171.c
+++ b/drivers/net/dsa/mv88e6171.c
@@ -103,8 +103,8 @@ struct dsa_switch_driver mv88e6171_switch_driver = {
 #endif
 	.get_regs_len		= mv88e6xxx_get_regs_len,
 	.get_regs		= mv88e6xxx_get_regs,
-	.port_join_bridge	= mv88e6xxx_port_bridge_join,
-	.port_leave_bridge	= mv88e6xxx_port_bridge_leave,
+	.port_bridge_join	= mv88e6xxx_port_bridge_join,
+	.port_bridge_leave	= mv88e6xxx_port_bridge_leave,
 	.port_stp_update        = mv88e6xxx_port_stp_update,
 	.port_vlan_filtering	= mv88e6xxx_port_vlan_filtering,
 	.port_vlan_prepare	= mv88e6xxx_port_vlan_prepare,
diff --git a/drivers/net/dsa/mv88e6352.c b/drivers/net/dsa/mv88e6352.c
index a41fa5043d77..5f528abc8af1 100644
--- a/drivers/net/dsa/mv88e6352.c
+++ b/drivers/net/dsa/mv88e6352.c
@@ -324,8 +324,8 @@ struct dsa_switch_driver mv88e6352_switch_driver = {
 	.set_eeprom		= mv88e6352_set_eeprom,
 	.get_regs_len		= mv88e6xxx_get_regs_len,
 	.get_regs		= mv88e6xxx_get_regs,
-	.port_join_bridge	= mv88e6xxx_port_bridge_join,
-	.port_leave_bridge	= mv88e6xxx_port_bridge_leave,
+	.port_bridge_join	= mv88e6xxx_port_bridge_join,
+	.port_bridge_leave	= mv88e6xxx_port_bridge_leave,
 	.port_stp_update	= mv88e6xxx_port_stp_update,
 	.port_vlan_filtering	= mv88e6xxx_port_vlan_filtering,
 	.port_vlan_prepare	= mv88e6xxx_port_vlan_prepare,
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 26c0a3fa009a..004e034184c1 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -296,9 +296,9 @@ struct dsa_switch_driver {
 	/*
 	 * Bridge integration
 	 */
-	int	(*port_join_bridge)(struct dsa_switch *ds, int port,
+	int	(*port_bridge_join)(struct dsa_switch *ds, int port,
 				    struct net_device *bridge);
-	int	(*port_leave_bridge)(struct dsa_switch *ds, int port);
+	int	(*port_bridge_leave)(struct dsa_switch *ds, int port);
 	int	(*port_stp_update)(struct dsa_switch *ds, int port,
 				   u8 state);
 
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 49056d90b179..52653d715f64 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -448,8 +448,8 @@ static int dsa_slave_bridge_port_join(struct net_device *dev,
 
 	p->bridge_dev = br;
 
-	if (ds->drv->port_join_bridge)
-		ret = ds->drv->port_join_bridge(ds, p->port, br);
+	if (ds->drv->port_bridge_join)
+		ret = ds->drv->port_bridge_join(ds, p->port, br);
 
 	return ret;
 }
@@ -461,8 +461,8 @@ static int dsa_slave_bridge_port_leave(struct net_device *dev)
 	int ret = -EOPNOTSUPP;
 
 
-	if (ds->drv->port_leave_bridge)
-		ret = ds->drv->port_leave_bridge(ds, p->port);
+	if (ds->drv->port_bridge_leave)
+		ret = ds->drv->port_bridge_leave(ds, p->port);
 
 	p->bridge_dev = NULL;
 
-- 
cgit v1.2.3-59-g8ed1b


From 16bfa7024eba5e36aff38ba62086b9027373007d Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Sun, 13 Mar 2016 16:21:33 -0400
Subject: net: dsa: make port_bridge_leave return void

netdev_upper_dev_unlink() which notifies NETDEV_CHANGEUPPER, returns
void, as well as del_nbp(). So there's no advantage to catch an eventual
error from the port_bridge_leave routine at the DSA level.

Make this routine void for the DSA layer and its existing drivers.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/bcm_sf2.c   |  4 +---
 drivers/net/dsa/mv88e6xxx.c | 28 +++++++++-------------------
 drivers/net/dsa/mv88e6xxx.h |  2 +-
 include/net/dsa.h           |  2 +-
 net/dsa/slave.c             |  9 +++------
 5 files changed, 15 insertions(+), 30 deletions(-)

(limited to 'net')

diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 4bcc9ebf5e06..95944d5e3e22 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -516,7 +516,7 @@ static int bcm_sf2_sw_br_join(struct dsa_switch *ds, int port,
 	return 0;
 }
 
-static int bcm_sf2_sw_br_leave(struct dsa_switch *ds, int port)
+static void bcm_sf2_sw_br_leave(struct dsa_switch *ds, int port)
 {
 	struct bcm_sf2_priv *priv = ds_to_priv(ds);
 	struct net_device *bridge = priv->port_sts[port].bridge_dev;
@@ -543,8 +543,6 @@ static int bcm_sf2_sw_br_leave(struct dsa_switch *ds, int port)
 	core_writel(priv, p_ctl, CORE_PORT_VLAN_CTL_PORT(port));
 	priv->port_sts[port].vlan_ctl_mask = p_ctl;
 	priv->port_sts[port].bridge_dev = NULL;
-
-	return 0;
 }
 
 static int bcm_sf2_sw_br_set_stp_state(struct dsa_switch *ds, int port,
diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
index 5309c738ff00..fa086e09d6b7 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx.c
@@ -2219,39 +2219,29 @@ unlock:
 	return err;
 }
 
-int mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port)
+void mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port)
 {
 	struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
 	struct net_device *bridge = ps->ports[port].bridge_dev;
 	u16 fid;
-	int i, err;
+	int i;
 
 	mutex_lock(&ps->smi_mutex);
 
 	/* Give the port a fresh Filtering Information Database */
-	err = _mv88e6xxx_fid_new(ds, &fid);
-	if (err)
-		goto unlock;
-
-	err = _mv88e6xxx_port_fid_set(ds, port, fid);
-	if (err)
-		goto unlock;
+	if (_mv88e6xxx_fid_new(ds, &fid) ||
+	    _mv88e6xxx_port_fid_set(ds, port, fid))
+		netdev_warn(ds->ports[port], "failed to assign a new FID\n");
 
 	/* Unassign the bridge and remap each port's VLANTable */
 	ps->ports[port].bridge_dev = NULL;
 
-	for (i = 0; i < ps->num_ports; ++i) {
-		if (i == port || ps->ports[i].bridge_dev == bridge) {
-			err = _mv88e6xxx_port_based_vlan_map(ds, i);
-			if (err)
-				break;
-		}
-	}
+	for (i = 0; i < ps->num_ports; ++i)
+		if (i == port || ps->ports[i].bridge_dev == bridge)
+			if (_mv88e6xxx_port_based_vlan_map(ds, i))
+				netdev_warn(ds->ports[i], "failed to remap\n");
 
-unlock:
 	mutex_unlock(&ps->smi_mutex);
-
-	return err;
 }
 
 static void mv88e6xxx_bridge_work(struct work_struct *work)
diff --git a/drivers/net/dsa/mv88e6xxx.h b/drivers/net/dsa/mv88e6xxx.h
index 281cefe86afd..9a038aba48fb 100644
--- a/drivers/net/dsa/mv88e6xxx.h
+++ b/drivers/net/dsa/mv88e6xxx.h
@@ -488,7 +488,7 @@ int mv88e6xxx_set_eee(struct dsa_switch *ds, int port,
 		      struct phy_device *phydev, struct ethtool_eee *e);
 int mv88e6xxx_port_bridge_join(struct dsa_switch *ds, int port,
 			       struct net_device *bridge);
-int mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port);
+void mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port);
 int mv88e6xxx_port_stp_update(struct dsa_switch *ds, int port, u8 state);
 int mv88e6xxx_port_vlan_filtering(struct dsa_switch *ds, int port,
 				  bool vlan_filtering);
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 004e034184c1..6463bb2863ac 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -298,7 +298,7 @@ struct dsa_switch_driver {
 	 */
 	int	(*port_bridge_join)(struct dsa_switch *ds, int port,
 				    struct net_device *bridge);
-	int	(*port_bridge_leave)(struct dsa_switch *ds, int port);
+	void	(*port_bridge_leave)(struct dsa_switch *ds, int port);
 	int	(*port_stp_update)(struct dsa_switch *ds, int port,
 				   u8 state);
 
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 52653d715f64..8e00f1d83eb8 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -454,15 +454,14 @@ static int dsa_slave_bridge_port_join(struct net_device *dev,
 	return ret;
 }
 
-static int dsa_slave_bridge_port_leave(struct net_device *dev)
+static void dsa_slave_bridge_port_leave(struct net_device *dev)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
 	struct dsa_switch *ds = p->parent;
-	int ret = -EOPNOTSUPP;
 
 
 	if (ds->drv->port_bridge_leave)
-		ret = ds->drv->port_bridge_leave(ds, p->port);
+		ds->drv->port_bridge_leave(ds, p->port);
 
 	p->bridge_dev = NULL;
 
@@ -470,8 +469,6 @@ static int dsa_slave_bridge_port_leave(struct net_device *dev)
 	 * so allow it to be in BR_STATE_FORWARDING to be kept functional
 	 */
 	dsa_slave_stp_update(dev, BR_STATE_FORWARDING);
-
-	return ret;
 }
 
 static int dsa_slave_port_attr_get(struct net_device *dev,
@@ -1156,7 +1153,7 @@ static int dsa_slave_master_changed(struct net_device *dev)
 	    !strcmp(master->rtnl_link_ops->kind, "bridge"))
 		err = dsa_slave_bridge_port_join(dev, master);
 	else if (dsa_port_is_bridged(p))
-		err = dsa_slave_bridge_port_leave(dev);
+		dsa_slave_bridge_port_leave(dev);
 
 	return err;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 6debb68a2d1acd1f5eca1d9d1c79a54dc998b6d6 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Sun, 13 Mar 2016 16:21:34 -0400
Subject: net: dsa: refine netdev event notifier

Rework the netdev event handler, similar to what the Mellanox Spectrum
driver does, to easily welcome more events later (for example
NETDEV_PRECHANGEUPPER) and use netdev helpers (such as
netif_is_bridge_master).

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/slave.c | 54 ++++++++++++++++++++++++++++++------------------------
 1 file changed, 30 insertions(+), 24 deletions(-)

(limited to 'net')

diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 8e00f1d83eb8..a575f0350d5a 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -451,7 +451,7 @@ static int dsa_slave_bridge_port_join(struct net_device *dev,
 	if (ds->drv->port_bridge_join)
 		ret = ds->drv->port_bridge_join(ds, p->port, br);
 
-	return ret;
+	return ret == -EOPNOTSUPP ? 0 : ret;
 }
 
 static void dsa_slave_bridge_port_leave(struct net_device *dev)
@@ -1143,40 +1143,46 @@ static bool dsa_slave_dev_check(struct net_device *dev)
 	return dev->netdev_ops == &dsa_slave_netdev_ops;
 }
 
-static int dsa_slave_master_changed(struct net_device *dev)
+static int dsa_slave_port_upper_event(struct net_device *dev,
+				      unsigned long event, void *ptr)
 {
-	struct net_device *master = netdev_master_upper_dev_get(dev);
-	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct netdev_notifier_changeupper_info *info = ptr;
+	struct net_device *upper = info->upper_dev;
 	int err = 0;
 
-	if (master && master->rtnl_link_ops &&
-	    !strcmp(master->rtnl_link_ops->kind, "bridge"))
-		err = dsa_slave_bridge_port_join(dev, master);
-	else if (dsa_port_is_bridged(p))
-		dsa_slave_bridge_port_leave(dev);
+	switch (event) {
+	case NETDEV_CHANGEUPPER:
+		if (netif_is_bridge_master(upper)) {
+			if (info->linking)
+				err = dsa_slave_bridge_port_join(dev, upper);
+			else
+				dsa_slave_bridge_port_leave(dev);
+		}
 
-	return err;
+		break;
+	}
+
+	return notifier_from_errno(err);
 }
 
-int dsa_slave_netdevice_event(struct notifier_block *unused,
-			      unsigned long event, void *ptr)
+static int dsa_slave_port_event(struct net_device *dev, unsigned long event,
+				void *ptr)
 {
-	struct net_device *dev;
-	int err = 0;
-
 	switch (event) {
 	case NETDEV_CHANGEUPPER:
-		dev = netdev_notifier_info_to_dev(ptr);
-		if (!dsa_slave_dev_check(dev))
-			goto out;
+		return dsa_slave_port_upper_event(dev, event, ptr);
+	}
 
-		err = dsa_slave_master_changed(dev);
-		if (err && err != -EOPNOTSUPP)
-			netdev_warn(dev, "failed to reflect master change\n");
+	return NOTIFY_DONE;
+}
 
-		break;
-	}
+int dsa_slave_netdevice_event(struct notifier_block *unused,
+			      unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+	if (dsa_slave_dev_check(dev))
+		return dsa_slave_port_event(dev, event, ptr);
 
-out:
 	return NOTIFY_DONE;
 }
-- 
cgit v1.2.3-59-g8ed1b


From bfa3f9d7f3b349acea8982d2248e33a0ed84c687 Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme <jarno@ovn.org>
Date: Thu, 10 Mar 2016 10:54:16 -0800
Subject: netfilter: Remove IP_CT_NEW_REPLY definition.

Remove the definition of IP_CT_NEW_REPLY from the kernel as it does
not make sense.  This allows the definition of IP_CT_NUMBER to be
simplified as well.

Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_conntrack_common.h | 12 +++++++++---
 net/openvswitch/conntrack.c                        |  2 --
 2 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h
index 319f47128db8..6d074d14ee27 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
@@ -20,9 +20,15 @@ enum ip_conntrack_info {
 
 	IP_CT_ESTABLISHED_REPLY = IP_CT_ESTABLISHED + IP_CT_IS_REPLY,
 	IP_CT_RELATED_REPLY = IP_CT_RELATED + IP_CT_IS_REPLY,
-	IP_CT_NEW_REPLY = IP_CT_NEW + IP_CT_IS_REPLY,	
-	/* Number of distinct IP_CT types (no NEW in reply dirn). */
-	IP_CT_NUMBER = IP_CT_IS_REPLY * 2 - 1
+	/* No NEW in reply direction. */
+
+	/* Number of distinct IP_CT types. */
+	IP_CT_NUMBER,
+
+	/* only for userspace compatibility */
+#ifndef __KERNEL__
+	IP_CT_NEW_REPLY = IP_CT_NUMBER,
+#endif
 };
 
 #define NF_CT_STATE_INVALID_BIT			(1 << 0)
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index ee6ff8ffc12d..304529015744 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -75,7 +75,6 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo)
 	switch (ctinfo) {
 	case IP_CT_ESTABLISHED_REPLY:
 	case IP_CT_RELATED_REPLY:
-	case IP_CT_NEW_REPLY:
 		ct_state |= OVS_CS_F_REPLY_DIR;
 		break;
 	default:
@@ -92,7 +91,6 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo)
 		ct_state |= OVS_CS_F_RELATED;
 		break;
 	case IP_CT_NEW:
-	case IP_CT_NEW_REPLY:
 		ct_state |= OVS_CS_F_NEW;
 		break;
 	default:
-- 
cgit v1.2.3-59-g8ed1b


From 264619055bd52bc2278af848472176642d759874 Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme <jarno@ovn.org>
Date: Thu, 10 Mar 2016 10:54:17 -0800
Subject: netfilter: Allow calling into nat helper without skb_dst.

NAT checksum recalculation code assumes existence of skb_dst, which
becomes a problem for a later patch in the series ("openvswitch:
Interface with NAT.").  Simplify this by removing the check on
skb_dst, as the checksum will be dealt with later in the stack.

Suggested-by: Pravin Shelar <pshelar@nicira.com>
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 30 ++++++++----------------------
 net/ipv6/netfilter/nf_nat_l3proto_ipv6.c | 30 ++++++++----------------------
 2 files changed, 16 insertions(+), 44 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index 61c7cc22ea68..f8aad03d674b 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -127,29 +127,15 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
 				    u8 proto, void *data, __sum16 *check,
 				    int datalen, int oldlen)
 {
-	const struct iphdr *iph = ip_hdr(skb);
-	struct rtable *rt = skb_rtable(skb);
-
 	if (skb->ip_summed != CHECKSUM_PARTIAL) {
-		if (!(rt->rt_flags & RTCF_LOCAL) &&
-		    (!skb->dev || skb->dev->features &
-		     (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM))) {
-			skb->ip_summed = CHECKSUM_PARTIAL;
-			skb->csum_start = skb_headroom(skb) +
-					  skb_network_offset(skb) +
-					  ip_hdrlen(skb);
-			skb->csum_offset = (void *)check - data;
-			*check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
-						    datalen, proto, 0);
-		} else {
-			*check = 0;
-			*check = csum_tcpudp_magic(iph->saddr, iph->daddr,
-						   datalen, proto,
-						   csum_partial(data, datalen,
-								0));
-			if (proto == IPPROTO_UDP && !*check)
-				*check = CSUM_MANGLED_0;
-		}
+		const struct iphdr *iph = ip_hdr(skb);
+
+		skb->ip_summed = CHECKSUM_PARTIAL;
+		skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) +
+			ip_hdrlen(skb);
+		skb->csum_offset = (void *)check - data;
+		*check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, datalen,
+					    proto, 0);
 	} else
 		inet_proto_csum_replace2(check, skb,
 					 htons(oldlen), htons(datalen), true);
diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
index 6ce309928841..e0be97e636a4 100644
--- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
@@ -131,29 +131,15 @@ static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb,
 				    u8 proto, void *data, __sum16 *check,
 				    int datalen, int oldlen)
 {
-	const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
-	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
-
 	if (skb->ip_summed != CHECKSUM_PARTIAL) {
-		if (!(rt->rt6i_flags & RTF_LOCAL) &&
-		    (!skb->dev || skb->dev->features &
-		     (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))) {
-			skb->ip_summed = CHECKSUM_PARTIAL;
-			skb->csum_start = skb_headroom(skb) +
-					  skb_network_offset(skb) +
-					  (data - (void *)skb->data);
-			skb->csum_offset = (void *)check - data;
-			*check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
-						  datalen, proto, 0);
-		} else {
-			*check = 0;
-			*check = csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
-						 datalen, proto,
-						 csum_partial(data, datalen,
-							      0));
-			if (proto == IPPROTO_UDP && !*check)
-				*check = CSUM_MANGLED_0;
-		}
+		const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+
+		skb->ip_summed = CHECKSUM_PARTIAL;
+		skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) +
+			(data - (void *)skb->data);
+		skb->csum_offset = (void *)check - data;
+		*check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
+					  datalen, proto, 0);
 	} else
 		inet_proto_csum_replace2(check, skb,
 					 htons(oldlen), htons(datalen), true);
-- 
cgit v1.2.3-59-g8ed1b


From 9f13ded8d3c715147c4759f937cfb712c185ca13 Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme <jarno@ovn.org>
Date: Thu, 10 Mar 2016 10:54:18 -0800
Subject: openvswitch: Add commentary to conntrack.c

This makes the code easier to understand and the following patches
more focused.

Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Joe Stringer <joe@ovn.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/openvswitch/conntrack.c | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 304529015744..2c2bf071f6d6 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -152,8 +152,12 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
 	ct = nf_ct_get(skb, &ctinfo);
 	if (ct) {
 		state = ovs_ct_get_state(ctinfo);
+		/* All unconfirmed entries are NEW connections. */
 		if (!nf_ct_is_confirmed(ct))
 			state |= OVS_CS_F_NEW;
+		/* OVS persists the related flag for the duration of the
+		 * connection.
+		 */
 		if (ct->master)
 			state |= OVS_CS_F_RELATED;
 		zone = nf_ct_zone(ct);
@@ -165,6 +169,9 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
 	__ovs_ct_update_key(key, state, zone, ct);
 }
 
+/* This is called to initialize CT key fields possibly coming in from the local
+ * stack.
+ */
 void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key)
 {
 	ovs_ct_update_key(skb, NULL, key, false);
@@ -199,7 +206,6 @@ static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key,
 	struct nf_conn *ct;
 	u32 new_mark;
 
-
 	/* The connection could be invalid, in which case set_mark is no-op. */
 	ct = nf_ct_get(skb, &ctinfo);
 	if (!ct)
@@ -375,6 +381,11 @@ static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb,
 	return true;
 }
 
+/* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if
+ * not done already.  Update key with new CT state.
+ * Note that if the packet is deemed invalid by conntrack, skb->nfct will be
+ * set to NULL and 0 will be returned.
+ */
 static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 			   const struct ovs_conntrack_info *info,
 			   struct sk_buff *skb)
@@ -418,6 +429,13 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 {
 	struct nf_conntrack_expect *exp;
 
+	/* If we pass an expected packet through nf_conntrack_in() the
+	 * expectation is typically removed, but the packet could still be
+	 * lost in upcall processing.  To prevent this from happening we
+	 * perform an explicit expectation lookup.  Expected connections are
+	 * always new, and will be passed through conntrack only when they are
+	 * committed, as it is OK to remove the expectation at that time.
+	 */
 	exp = ovs_ct_expect_find(net, &info->zone, info->family, skb);
 	if (exp) {
 		u8 state;
@@ -455,6 +473,7 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
 	err = __ovs_ct_lookup(net, key, info, skb);
 	if (err)
 		return err;
+	/* This is a no-op if the connection has already been confirmed. */
 	if (nf_conntrack_confirm(skb) != NF_ACCEPT)
 		return -EINVAL;
 
-- 
cgit v1.2.3-59-g8ed1b


From 394e910e909b174270b8231fd51942eb2f541fb9 Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme <jarno@ovn.org>
Date: Thu, 10 Mar 2016 10:54:19 -0800
Subject: openvswitch: Update the CT state key only after nf_conntrack_in().

Only a successful nf_conntrack_in() call can effect a connection state
change, so it suffices to update the key only after the
nf_conntrack_in() returns.

This change is needed for the later NAT patches.

Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Joe Stringer <joe@ovn.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/openvswitch/conntrack.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 2c2bf071f6d6..a487bb3486e1 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -382,7 +382,8 @@ static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb,
 }
 
 /* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if
- * not done already.  Update key with new CT state.
+ * not done already.  Update key with new CT state after passing the packet
+ * through conntrack.
  * Note that if the packet is deemed invalid by conntrack, skb->nfct will be
  * set to NULL and 0 will be returned.
  */
@@ -411,14 +412,14 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 				    skb) != NF_ACCEPT)
 			return -ENOENT;
 
+		ovs_ct_update_key(skb, info, key, true);
+
 		if (ovs_ct_helper(skb, info->family) != NF_ACCEPT) {
 			WARN_ONCE(1, "helper rejected packet");
 			return -EINVAL;
 		}
 	}
 
-	ovs_ct_update_key(skb, info, key, true);
-
 	return 0;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 289f225349cb2a97448fd14599ab34b741f706f3 Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme <jarno@ovn.org>
Date: Thu, 10 Mar 2016 10:54:20 -0800
Subject: openvswitch: Find existing conntrack entry after upcall.

Add a new function ovs_ct_find_existing() to find an existing
conntrack entry for which this packet was already applied to.  This is
only to be called when there is evidence that the packet was already
tracked and committed, but we lost the ct reference due to an
userspace upcall.

ovs_ct_find_existing() is called from skb_nfct_cached(), which can now
hide the fact that the ct reference may have been lost due to an
upcall.  This allows ovs_ct_commit() to be simplified.

This patch is needed by later "openvswitch: Interface with NAT" patch,
as we need to be able to pass the packet through NAT using the
original ct reference also after the reference is lost after an
upcall.

Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Joe Stringer <joe@ovn.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/openvswitch/conntrack.c | 103 ++++++++++++++++++++++++++++++++++++++------
 1 file changed, 90 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index a487bb3486e1..ae36fe2ed483 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -356,14 +356,101 @@ ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone,
 	return __nf_ct_expect_find(net, zone, &tuple);
 }
 
+/* This replicates logic from nf_conntrack_core.c that is not exported. */
+static enum ip_conntrack_info
+ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h)
+{
+	const struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+
+	if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
+		return IP_CT_ESTABLISHED_REPLY;
+	/* Once we've had two way comms, always ESTABLISHED. */
+	if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status))
+		return IP_CT_ESTABLISHED;
+	if (test_bit(IPS_EXPECTED_BIT, &ct->status))
+		return IP_CT_RELATED;
+	return IP_CT_NEW;
+}
+
+/* Find an existing connection which this packet belongs to without
+ * re-attributing statistics or modifying the connection state.  This allows an
+ * skb->nfct lost due to an upcall to be recovered during actions execution.
+ *
+ * Must be called with rcu_read_lock.
+ *
+ * On success, populates skb->nfct and skb->nfctinfo, and returns the
+ * connection.  Returns NULL if there is no existing entry.
+ */
+static struct nf_conn *
+ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
+		     u8 l3num, struct sk_buff *skb)
+{
+	struct nf_conntrack_l3proto *l3proto;
+	struct nf_conntrack_l4proto *l4proto;
+	struct nf_conntrack_tuple tuple;
+	struct nf_conntrack_tuple_hash *h;
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+	unsigned int dataoff;
+	u8 protonum;
+
+	l3proto = __nf_ct_l3proto_find(l3num);
+	if (!l3proto) {
+		pr_debug("ovs_ct_find_existing: Can't get l3proto\n");
+		return NULL;
+	}
+	if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff,
+				 &protonum) <= 0) {
+		pr_debug("ovs_ct_find_existing: Can't get protonum\n");
+		return NULL;
+	}
+	l4proto = __nf_ct_l4proto_find(l3num, protonum);
+	if (!l4proto) {
+		pr_debug("ovs_ct_find_existing: Can't get l4proto\n");
+		return NULL;
+	}
+	if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
+			     protonum, net, &tuple, l3proto, l4proto)) {
+		pr_debug("ovs_ct_find_existing: Can't get tuple\n");
+		return NULL;
+	}
+
+	/* look for tuple match */
+	h = nf_conntrack_find_get(net, zone, &tuple);
+	if (!h)
+		return NULL;   /* Not found. */
+
+	ct = nf_ct_tuplehash_to_ctrack(h);
+
+	ctinfo = ovs_ct_get_info(h);
+	if (ctinfo == IP_CT_NEW) {
+		/* This should not happen. */
+		WARN_ONCE(1, "ovs_ct_find_existing: new packet for %p\n", ct);
+	}
+	skb->nfct = &ct->ct_general;
+	skb->nfctinfo = ctinfo;
+	return ct;
+}
+
 /* Determine whether skb->nfct is equal to the result of conntrack lookup. */
-static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb,
-			    const struct ovs_conntrack_info *info)
+static bool skb_nfct_cached(struct net *net,
+			    const struct sw_flow_key *key,
+			    const struct ovs_conntrack_info *info,
+			    struct sk_buff *skb)
 {
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct;
 
 	ct = nf_ct_get(skb, &ctinfo);
+	/* If no ct, check if we have evidence that an existing conntrack entry
+	 * might be found for this skb.  This happens when we lose a skb->nfct
+	 * due to an upcall.  If the connection was not confirmed, it is not
+	 * cached and needs to be run through conntrack again.
+	 */
+	if (!ct && key->ct.state & OVS_CS_F_TRACKED &&
+	    !(key->ct.state & OVS_CS_F_INVALID) &&
+	    key->ct.zone == info->zone.id)
+		ct = ovs_ct_find_existing(net, &info->zone, info->family, skb);
 	if (!ct)
 		return false;
 	if (!net_eq(net, read_pnet(&ct->ct_net)))
@@ -396,7 +483,7 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 	 * actually run the packet through conntrack twice unless it's for a
 	 * different zone.
 	 */
-	if (!skb_nfct_cached(net, skb, info)) {
+	if (!skb_nfct_cached(net, key, info, skb)) {
 		struct nf_conn *tmpl = info->ct;
 
 		/* Associate skb with specified zone. */
@@ -459,18 +546,8 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
 			 const struct ovs_conntrack_info *info,
 			 struct sk_buff *skb)
 {
-	u8 state;
 	int err;
 
-	state = key->ct.state;
-	if (key->ct.zone == info->zone.id &&
-	    ((state & OVS_CS_F_TRACKED) && !(state & OVS_CS_F_NEW))) {
-		/* Previous lookup has shown that this connection is already
-		 * tracked and committed. Skip committing.
-		 */
-		return 0;
-	}
-
 	err = __ovs_ct_lookup(net, key, info, skb);
 	if (err)
 		return err;
-- 
cgit v1.2.3-59-g8ed1b


From 5b6b929376a621e2bd3367f5de563d7123506597 Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme <jarno@ovn.org>
Date: Thu, 10 Mar 2016 10:54:21 -0800
Subject: openvswitch: Handle NF_REPEAT in conntrack action.

Repeat the nf_conntrack_in() call when it returns NF_REPEAT.  This
avoids dropping a SYN packet re-opening an existing TCP connection.

Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Joe Stringer <joe@ovn.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/openvswitch/conntrack.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index ae36fe2ed483..85256b312455 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -485,6 +485,7 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 	 */
 	if (!skb_nfct_cached(net, key, info, skb)) {
 		struct nf_conn *tmpl = info->ct;
+		int err;
 
 		/* Associate skb with specified zone. */
 		if (tmpl) {
@@ -495,8 +496,13 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 			skb->nfctinfo = IP_CT_NEW;
 		}
 
-		if (nf_conntrack_in(net, info->family, NF_INET_PRE_ROUTING,
-				    skb) != NF_ACCEPT)
+		/* Repeat if requested, see nf_iterate(). */
+		do {
+			err = nf_conntrack_in(net, info->family,
+					      NF_INET_PRE_ROUTING, skb);
+		} while (err == NF_REPEAT);
+
+		if (err != NF_ACCEPT)
 			return -ENOENT;
 
 		ovs_ct_update_key(skb, info, key, true);
-- 
cgit v1.2.3-59-g8ed1b


From 28b6e0c1ace45779c60e7cefe6d469b7ecb520b8 Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme <jarno@ovn.org>
Date: Thu, 10 Mar 2016 10:54:22 -0800
Subject: openvswitch: Delay conntrack helper call for new connections.

There is no need to help connections that are not confirmed, so we can
delay helping new connections to the time when they are confirmed.
This change is needed for NAT support, and having this as a separate
patch will make the following NAT patch a bit easier to review.

Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Joe Stringer <joe@ovn.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/openvswitch/conntrack.c | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 85256b312455..f718b724e650 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -483,7 +483,11 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 	 * actually run the packet through conntrack twice unless it's for a
 	 * different zone.
 	 */
-	if (!skb_nfct_cached(net, key, info, skb)) {
+	bool cached = skb_nfct_cached(net, key, info, skb);
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+
+	if (!cached) {
 		struct nf_conn *tmpl = info->ct;
 		int err;
 
@@ -506,11 +510,18 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 			return -ENOENT;
 
 		ovs_ct_update_key(skb, info, key, true);
+	}
 
-		if (ovs_ct_helper(skb, info->family) != NF_ACCEPT) {
-			WARN_ONCE(1, "helper rejected packet");
-			return -EINVAL;
-		}
+	/* Call the helper only if:
+	 * - nf_conntrack_in() was executed above ("!cached") for a confirmed
+	 *   connection, or
+	 * - When committing an unconfirmed connection.
+	 */
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct && (nf_ct_is_confirmed(ct) ? !cached : info->commit) &&
+	    ovs_ct_helper(skb, info->family) != NF_ACCEPT) {
+		WARN_ONCE(1, "helper rejected packet");
+		return -EINVAL;
 	}
 
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From 05752523e56502cd9975aec0a2ded465d51a71f3 Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme <jarno@ovn.org>
Date: Thu, 10 Mar 2016 10:54:23 -0800
Subject: openvswitch: Interface with NAT.

Extend OVS conntrack interface to cover NAT.  New nested
OVS_CT_ATTR_NAT attribute may be used to include NAT with a CT action.
A bare OVS_CT_ATTR_NAT only mangles existing and expected connections.
If OVS_NAT_ATTR_SRC or OVS_NAT_ATTR_DST is included within the nested
attributes, new (non-committed/non-confirmed) connections are mangled
according to the rest of the nested attributes.

The corresponding OVS userspace patch series includes test cases (in
tests/system-traffic.at) that also serve as example uses.

This work extends on a branch by Thomas Graf at
https://github.com/tgraf/ovs/tree/nat.

Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Joe Stringer <joe@ovn.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/openvswitch.h |  49 ++++
 net/openvswitch/Kconfig          |   3 +-
 net/openvswitch/conntrack.c      | 524 +++++++++++++++++++++++++++++++++++++--
 net/openvswitch/conntrack.h      |   3 +-
 4 files changed, 551 insertions(+), 28 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index a27222d5b413..616d04761730 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -454,6 +454,14 @@ struct ovs_key_ct_labels {
 #define OVS_CS_F_REPLY_DIR         0x08 /* Flow is in the reply direction. */
 #define OVS_CS_F_INVALID           0x10 /* Could not track connection. */
 #define OVS_CS_F_TRACKED           0x20 /* Conntrack has occurred. */
+#define OVS_CS_F_SRC_NAT           0x40 /* Packet's source address/port was
+					 * mangled by NAT.
+					 */
+#define OVS_CS_F_DST_NAT           0x80 /* Packet's destination address/port
+					 * was mangled by NAT.
+					 */
+
+#define OVS_CS_F_NAT_MASK (OVS_CS_F_SRC_NAT | OVS_CS_F_DST_NAT)
 
 /**
  * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands.
@@ -632,6 +640,8 @@ struct ovs_action_hash {
  * mask. For each bit set in the mask, the corresponding bit in the value is
  * copied to the connection tracking label field in the connection.
  * @OVS_CT_ATTR_HELPER: variable length string defining conntrack ALG.
+ * @OVS_CT_ATTR_NAT: Nested OVS_NAT_ATTR_* for performing L3 network address
+ * translation (NAT) on the packet.
  */
 enum ovs_ct_attr {
 	OVS_CT_ATTR_UNSPEC,
@@ -641,11 +651,50 @@ enum ovs_ct_attr {
 	OVS_CT_ATTR_LABELS,     /* labels to associate with this connection. */
 	OVS_CT_ATTR_HELPER,     /* netlink helper to assist detection of
 				   related connections. */
+	OVS_CT_ATTR_NAT,        /* Nested OVS_NAT_ATTR_* */
 	__OVS_CT_ATTR_MAX
 };
 
 #define OVS_CT_ATTR_MAX (__OVS_CT_ATTR_MAX - 1)
 
+/**
+ * enum ovs_nat_attr - Attributes for %OVS_CT_ATTR_NAT.
+ *
+ * @OVS_NAT_ATTR_SRC: Flag for Source NAT (mangle source address/port).
+ * @OVS_NAT_ATTR_DST: Flag for Destination NAT (mangle destination
+ * address/port).  Only one of (@OVS_NAT_ATTR_SRC, @OVS_NAT_ATTR_DST) may be
+ * specified.  Effective only for packets for ct_state NEW connections.
+ * Packets of committed connections are mangled by the NAT action according to
+ * the committed NAT type regardless of the flags specified.  As a corollary, a
+ * NAT action without a NAT type flag will only mangle packets of committed
+ * connections.  The following NAT attributes only apply for NEW
+ * (non-committed) connections, and they may be included only when the CT
+ * action has the @OVS_CT_ATTR_COMMIT flag and either @OVS_NAT_ATTR_SRC or
+ * @OVS_NAT_ATTR_DST is also included.
+ * @OVS_NAT_ATTR_IP_MIN: struct in_addr or struct in6_addr
+ * @OVS_NAT_ATTR_IP_MAX: struct in_addr or struct in6_addr
+ * @OVS_NAT_ATTR_PROTO_MIN: u16 L4 protocol specific lower boundary (port)
+ * @OVS_NAT_ATTR_PROTO_MAX: u16 L4 protocol specific upper boundary (port)
+ * @OVS_NAT_ATTR_PERSISTENT: Flag for persistent IP mapping across reboots
+ * @OVS_NAT_ATTR_PROTO_HASH: Flag for pseudo random L4 port mapping (MD5)
+ * @OVS_NAT_ATTR_PROTO_RANDOM: Flag for fully randomized L4 port mapping
+ */
+enum ovs_nat_attr {
+	OVS_NAT_ATTR_UNSPEC,
+	OVS_NAT_ATTR_SRC,
+	OVS_NAT_ATTR_DST,
+	OVS_NAT_ATTR_IP_MIN,
+	OVS_NAT_ATTR_IP_MAX,
+	OVS_NAT_ATTR_PROTO_MIN,
+	OVS_NAT_ATTR_PROTO_MAX,
+	OVS_NAT_ATTR_PERSISTENT,
+	OVS_NAT_ATTR_PROTO_HASH,
+	OVS_NAT_ATTR_PROTO_RANDOM,
+	__OVS_NAT_ATTR_MAX,
+};
+
+#define OVS_NAT_ATTR_MAX (__OVS_NAT_ATTR_MAX - 1)
+
 /**
  * enum ovs_action_attr - Action types.
  *
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index cd5fd9d728a7..234a73344c6e 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -6,7 +6,8 @@ config OPENVSWITCH
 	tristate "Open vSwitch"
 	depends on INET
 	depends on !NF_CONNTRACK || \
-		   (NF_CONNTRACK && (!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6))
+		   (NF_CONNTRACK && ((!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6) && \
+				     (!NF_NAT || NF_NAT)))
 	select LIBCRC32C
 	select MPLS
 	select NET_MPLS_GSO
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index f718b724e650..dc5eb29fe7d6 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -13,21 +13,31 @@
 
 #include <linux/module.h>
 #include <linux/openvswitch.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/sctp.h>
 #include <net/ip.h>
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_labels.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
 #include <net/netfilter/nf_conntrack_zones.h>
 #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
 
+#ifdef CONFIG_NF_NAT_NEEDED
+#include <linux/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#endif
+
 #include "datapath.h"
 #include "conntrack.h"
 #include "flow.h"
 #include "flow_netlink.h"
 
 struct ovs_ct_len_tbl {
-	size_t maxlen;
-	size_t minlen;
+	int maxlen;
+	int minlen;
 };
 
 /* Metadata mark for masked write to conntrack mark */
@@ -42,15 +52,25 @@ struct md_labels {
 	struct ovs_key_ct_labels mask;
 };
 
+enum ovs_ct_nat {
+	OVS_CT_NAT = 1 << 0,     /* NAT for committed connections only. */
+	OVS_CT_SRC_NAT = 1 << 1, /* Source NAT for NEW connections. */
+	OVS_CT_DST_NAT = 1 << 2, /* Destination NAT for NEW connections. */
+};
+
 /* Conntrack action context for execution. */
 struct ovs_conntrack_info {
 	struct nf_conntrack_helper *helper;
 	struct nf_conntrack_zone zone;
 	struct nf_conn *ct;
 	u8 commit : 1;
+	u8 nat : 3;                 /* enum ovs_ct_nat */
 	u16 family;
 	struct md_mark mark;
 	struct md_labels labels;
+#ifdef CONFIG_NF_NAT_NEEDED
+	struct nf_nat_range range;  /* Only present for SRC NAT and DST NAT. */
+#endif
 };
 
 static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info);
@@ -137,12 +157,15 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state,
 	ovs_ct_get_labels(ct, &key->ct.labels);
 }
 
-/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has
- * previously sent the packet to conntrack via the ct action.
+/* Update 'key' based on skb->nfct.  If 'post_ct' is true, then OVS has
+ * previously sent the packet to conntrack via the ct action.  If
+ * 'keep_nat_flags' is true, the existing NAT flags retained, else they are
+ * initialized from the connection status.
  */
 static void ovs_ct_update_key(const struct sk_buff *skb,
 			      const struct ovs_conntrack_info *info,
-			      struct sw_flow_key *key, bool post_ct)
+			      struct sw_flow_key *key, bool post_ct,
+			      bool keep_nat_flags)
 {
 	const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
 	enum ip_conntrack_info ctinfo;
@@ -160,6 +183,14 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
 		 */
 		if (ct->master)
 			state |= OVS_CS_F_RELATED;
+		if (keep_nat_flags) {
+			state |= key->ct.state & OVS_CS_F_NAT_MASK;
+		} else {
+			if (ct->status & IPS_SRC_NAT)
+				state |= OVS_CS_F_SRC_NAT;
+			if (ct->status & IPS_DST_NAT)
+				state |= OVS_CS_F_DST_NAT;
+		}
 		zone = nf_ct_zone(ct);
 	} else if (post_ct) {
 		state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID;
@@ -174,7 +205,7 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
  */
 void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key)
 {
-	ovs_ct_update_key(skb, NULL, key, false);
+	ovs_ct_update_key(skb, NULL, key, false, false);
 }
 
 int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb)
@@ -263,6 +294,7 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
 	enum ip_conntrack_info ctinfo;
 	unsigned int protoff;
 	struct nf_conn *ct;
+	int err;
 
 	ct = nf_ct_get(skb, &ctinfo);
 	if (!ct || ctinfo == IP_CT_RELATED_REPLY)
@@ -299,7 +331,18 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
 		return NF_DROP;
 	}
 
-	return helper->help(skb, protoff, ct, ctinfo);
+	err = helper->help(skb, protoff, ct, ctinfo);
+	if (err != NF_ACCEPT)
+		return err;
+
+	/* Adjust seqs after helper.  This is needed due to some helpers (e.g.,
+	 * FTP with NAT) adusting the TCP payload size when mangling IP
+	 * addresses and/or port numbers in the text-based control connection.
+	 */
+	if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
+	    !nf_ct_seq_adjust(skb, ct, ctinfo, protoff))
+		return NF_DROP;
+	return NF_ACCEPT;
 }
 
 /* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
@@ -468,6 +511,200 @@ static bool skb_nfct_cached(struct net *net,
 	return true;
 }
 
+#ifdef CONFIG_NF_NAT_NEEDED
+/* Modelled after nf_nat_ipv[46]_fn().
+ * range is only used for new, uninitialized NAT state.
+ * Returns either NF_ACCEPT or NF_DROP.
+ */
+static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
+			      enum ip_conntrack_info ctinfo,
+			      const struct nf_nat_range *range,
+			      enum nf_nat_manip_type maniptype)
+{
+	int hooknum, nh_off, err = NF_ACCEPT;
+
+	nh_off = skb_network_offset(skb);
+	skb_pull(skb, nh_off);
+
+	/* See HOOK2MANIP(). */
+	if (maniptype == NF_NAT_MANIP_SRC)
+		hooknum = NF_INET_LOCAL_IN; /* Source NAT */
+	else
+		hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */
+
+	switch (ctinfo) {
+	case IP_CT_RELATED:
+	case IP_CT_RELATED_REPLY:
+		if (skb->protocol == htons(ETH_P_IP) &&
+		    ip_hdr(skb)->protocol == IPPROTO_ICMP) {
+			if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
+							   hooknum))
+				err = NF_DROP;
+			goto push;
+#if IS_ENABLED(CONFIG_NF_NAT_IPV6)
+		} else if (skb->protocol == htons(ETH_P_IPV6)) {
+			__be16 frag_off;
+			u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+			int hdrlen = ipv6_skip_exthdr(skb,
+						      sizeof(struct ipv6hdr),
+						      &nexthdr, &frag_off);
+
+			if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
+				if (!nf_nat_icmpv6_reply_translation(skb, ct,
+								     ctinfo,
+								     hooknum,
+								     hdrlen))
+					err = NF_DROP;
+				goto push;
+			}
+#endif
+		}
+		/* Non-ICMP, fall thru to initialize if needed. */
+	case IP_CT_NEW:
+		/* Seen it before?  This can happen for loopback, retrans,
+		 * or local packets.
+		 */
+		if (!nf_nat_initialized(ct, maniptype)) {
+			/* Initialize according to the NAT action. */
+			err = (range && range->flags & NF_NAT_RANGE_MAP_IPS)
+				/* Action is set up to establish a new
+				 * mapping.
+				 */
+				? nf_nat_setup_info(ct, range, maniptype)
+				: nf_nat_alloc_null_binding(ct, hooknum);
+			if (err != NF_ACCEPT)
+				goto push;
+		}
+		break;
+
+	case IP_CT_ESTABLISHED:
+	case IP_CT_ESTABLISHED_REPLY:
+		break;
+
+	default:
+		err = NF_DROP;
+		goto push;
+	}
+
+	err = nf_nat_packet(ct, ctinfo, hooknum, skb);
+push:
+	skb_push(skb, nh_off);
+
+	return err;
+}
+
+static void ovs_nat_update_key(struct sw_flow_key *key,
+			       const struct sk_buff *skb,
+			       enum nf_nat_manip_type maniptype)
+{
+	if (maniptype == NF_NAT_MANIP_SRC) {
+		__be16 src;
+
+		key->ct.state |= OVS_CS_F_SRC_NAT;
+		if (key->eth.type == htons(ETH_P_IP))
+			key->ipv4.addr.src = ip_hdr(skb)->saddr;
+		else if (key->eth.type == htons(ETH_P_IPV6))
+			memcpy(&key->ipv6.addr.src, &ipv6_hdr(skb)->saddr,
+			       sizeof(key->ipv6.addr.src));
+		else
+			return;
+
+		if (key->ip.proto == IPPROTO_UDP)
+			src = udp_hdr(skb)->source;
+		else if (key->ip.proto == IPPROTO_TCP)
+			src = tcp_hdr(skb)->source;
+		else if (key->ip.proto == IPPROTO_SCTP)
+			src = sctp_hdr(skb)->source;
+		else
+			return;
+
+		key->tp.src = src;
+	} else {
+		__be16 dst;
+
+		key->ct.state |= OVS_CS_F_DST_NAT;
+		if (key->eth.type == htons(ETH_P_IP))
+			key->ipv4.addr.dst = ip_hdr(skb)->daddr;
+		else if (key->eth.type == htons(ETH_P_IPV6))
+			memcpy(&key->ipv6.addr.dst, &ipv6_hdr(skb)->daddr,
+			       sizeof(key->ipv6.addr.dst));
+		else
+			return;
+
+		if (key->ip.proto == IPPROTO_UDP)
+			dst = udp_hdr(skb)->dest;
+		else if (key->ip.proto == IPPROTO_TCP)
+			dst = tcp_hdr(skb)->dest;
+		else if (key->ip.proto == IPPROTO_SCTP)
+			dst = sctp_hdr(skb)->dest;
+		else
+			return;
+
+		key->tp.dst = dst;
+	}
+}
+
+/* Returns NF_DROP if the packet should be dropped, NF_ACCEPT otherwise. */
+static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
+		      const struct ovs_conntrack_info *info,
+		      struct sk_buff *skb, struct nf_conn *ct,
+		      enum ip_conntrack_info ctinfo)
+{
+	enum nf_nat_manip_type maniptype;
+	int err;
+
+	if (nf_ct_is_untracked(ct)) {
+		/* A NAT action may only be performed on tracked packets. */
+		return NF_ACCEPT;
+	}
+
+	/* Add NAT extension if not confirmed yet. */
+	if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct))
+		return NF_ACCEPT;   /* Can't NAT. */
+
+	/* Determine NAT type.
+	 * Check if the NAT type can be deduced from the tracked connection.
+	 * Make sure expected traffic is NATted only when committing.
+	 */
+	if (info->nat & OVS_CT_NAT && ctinfo != IP_CT_NEW &&
+	    ct->status & IPS_NAT_MASK &&
+	    (!(ct->status & IPS_EXPECTED_BIT) || info->commit)) {
+		/* NAT an established or related connection like before. */
+		if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY)
+			/* This is the REPLY direction for a connection
+			 * for which NAT was applied in the forward
+			 * direction.  Do the reverse NAT.
+			 */
+			maniptype = ct->status & IPS_SRC_NAT
+				? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC;
+		else
+			maniptype = ct->status & IPS_SRC_NAT
+				? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST;
+	} else if (info->nat & OVS_CT_SRC_NAT) {
+		maniptype = NF_NAT_MANIP_SRC;
+	} else if (info->nat & OVS_CT_DST_NAT) {
+		maniptype = NF_NAT_MANIP_DST;
+	} else {
+		return NF_ACCEPT; /* Connection is not NATed. */
+	}
+	err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype);
+
+	/* Mark NAT done if successful and update the flow key. */
+	if (err == NF_ACCEPT)
+		ovs_nat_update_key(key, skb, maniptype);
+
+	return err;
+}
+#else /* !CONFIG_NF_NAT_NEEDED */
+static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
+		      const struct ovs_conntrack_info *info,
+		      struct sk_buff *skb, struct nf_conn *ct,
+		      enum ip_conntrack_info ctinfo)
+{
+	return NF_ACCEPT;
+}
+#endif
+
 /* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if
  * not done already.  Update key with new CT state after passing the packet
  * through conntrack.
@@ -509,19 +746,43 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 		if (err != NF_ACCEPT)
 			return -ENOENT;
 
-		ovs_ct_update_key(skb, info, key, true);
+		/* Clear CT state NAT flags to mark that we have not yet done
+		 * NAT after the nf_conntrack_in() call.  We can actually clear
+		 * the whole state, as it will be re-initialized below.
+		 */
+		key->ct.state = 0;
+
+		/* Update the key, but keep the NAT flags. */
+		ovs_ct_update_key(skb, info, key, true, true);
 	}
 
-	/* Call the helper only if:
-	 * - nf_conntrack_in() was executed above ("!cached") for a confirmed
-	 *   connection, or
-	 * - When committing an unconfirmed connection.
-	 */
 	ct = nf_ct_get(skb, &ctinfo);
-	if (ct && (nf_ct_is_confirmed(ct) ? !cached : info->commit) &&
-	    ovs_ct_helper(skb, info->family) != NF_ACCEPT) {
-		WARN_ONCE(1, "helper rejected packet");
-		return -EINVAL;
+	if (ct) {
+		/* Packets starting a new connection must be NATted before the
+		 * helper, so that the helper knows about the NAT.  We enforce
+		 * this by delaying both NAT and helper calls for unconfirmed
+		 * connections until the committing CT action.  For later
+		 * packets NAT and Helper may be called in either order.
+		 *
+		 * NAT will be done only if the CT action has NAT, and only
+		 * once per packet (per zone), as guarded by the NAT bits in
+		 * the key->ct.state.
+		 */
+		if (info->nat && !(key->ct.state & OVS_CS_F_NAT_MASK) &&
+		    (nf_ct_is_confirmed(ct) || info->commit) &&
+		    ovs_ct_nat(net, key, info, skb, ct, ctinfo) != NF_ACCEPT) {
+			return -EINVAL;
+		}
+
+		/* Call the helper only if:
+		 * - nf_conntrack_in() was executed above ("!cached") for a
+		 *   confirmed connection, or
+		 * - When committing an unconfirmed connection.
+		 */
+		if ((nf_ct_is_confirmed(ct) ? !cached : info->commit) &&
+		    ovs_ct_helper(skb, info->family) != NF_ACCEPT) {
+			return -EINVAL;
+		}
 	}
 
 	return 0;
@@ -545,15 +806,13 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 	if (exp) {
 		u8 state;
 
+		/* NOTE: New connections are NATted and Helped only when
+		 * committed, so we are not calling into NAT here.
+		 */
 		state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED;
 		__ovs_ct_update_key(key, state, &info->zone, exp->master);
-	} else {
-		int err;
-
-		err = __ovs_ct_lookup(net, key, info, skb);
-		if (err)
-			return err;
-	}
+	} else
+		return __ovs_ct_lookup(net, key, info, skb);
 
 	return 0;
 }
@@ -653,6 +912,135 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
 	return 0;
 }
 
+#ifdef CONFIG_NF_NAT_NEEDED
+static int parse_nat(const struct nlattr *attr,
+		     struct ovs_conntrack_info *info, bool log)
+{
+	struct nlattr *a;
+	int rem;
+	bool have_ip_max = false;
+	bool have_proto_max = false;
+	bool ip_vers = (info->family == NFPROTO_IPV6);
+
+	nla_for_each_nested(a, attr, rem) {
+		static const int ovs_nat_attr_lens[OVS_NAT_ATTR_MAX + 1][2] = {
+			[OVS_NAT_ATTR_SRC] = {0, 0},
+			[OVS_NAT_ATTR_DST] = {0, 0},
+			[OVS_NAT_ATTR_IP_MIN] = {sizeof(struct in_addr),
+						 sizeof(struct in6_addr)},
+			[OVS_NAT_ATTR_IP_MAX] = {sizeof(struct in_addr),
+						 sizeof(struct in6_addr)},
+			[OVS_NAT_ATTR_PROTO_MIN] = {sizeof(u16), sizeof(u16)},
+			[OVS_NAT_ATTR_PROTO_MAX] = {sizeof(u16), sizeof(u16)},
+			[OVS_NAT_ATTR_PERSISTENT] = {0, 0},
+			[OVS_NAT_ATTR_PROTO_HASH] = {0, 0},
+			[OVS_NAT_ATTR_PROTO_RANDOM] = {0, 0},
+		};
+		int type = nla_type(a);
+
+		if (type > OVS_NAT_ATTR_MAX) {
+			OVS_NLERR(log,
+				  "Unknown NAT attribute (type=%d, max=%d).\n",
+				  type, OVS_NAT_ATTR_MAX);
+			return -EINVAL;
+		}
+
+		if (nla_len(a) != ovs_nat_attr_lens[type][ip_vers]) {
+			OVS_NLERR(log,
+				  "NAT attribute type %d has unexpected length (%d != %d).\n",
+				  type, nla_len(a),
+				  ovs_nat_attr_lens[type][ip_vers]);
+			return -EINVAL;
+		}
+
+		switch (type) {
+		case OVS_NAT_ATTR_SRC:
+		case OVS_NAT_ATTR_DST:
+			if (info->nat) {
+				OVS_NLERR(log,
+					  "Only one type of NAT may be specified.\n"
+					  );
+				return -ERANGE;
+			}
+			info->nat |= OVS_CT_NAT;
+			info->nat |= ((type == OVS_NAT_ATTR_SRC)
+					? OVS_CT_SRC_NAT : OVS_CT_DST_NAT);
+			break;
+
+		case OVS_NAT_ATTR_IP_MIN:
+			nla_memcpy(&info->range.min_addr, a, nla_len(a));
+			info->range.flags |= NF_NAT_RANGE_MAP_IPS;
+			break;
+
+		case OVS_NAT_ATTR_IP_MAX:
+			have_ip_max = true;
+			nla_memcpy(&info->range.max_addr, a,
+				   sizeof(info->range.max_addr));
+			info->range.flags |= NF_NAT_RANGE_MAP_IPS;
+			break;
+
+		case OVS_NAT_ATTR_PROTO_MIN:
+			info->range.min_proto.all = htons(nla_get_u16(a));
+			info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+			break;
+
+		case OVS_NAT_ATTR_PROTO_MAX:
+			have_proto_max = true;
+			info->range.max_proto.all = htons(nla_get_u16(a));
+			info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+			break;
+
+		case OVS_NAT_ATTR_PERSISTENT:
+			info->range.flags |= NF_NAT_RANGE_PERSISTENT;
+			break;
+
+		case OVS_NAT_ATTR_PROTO_HASH:
+			info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM;
+			break;
+
+		case OVS_NAT_ATTR_PROTO_RANDOM:
+			info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM_FULLY;
+			break;
+
+		default:
+			OVS_NLERR(log, "Unknown nat attribute (%d).\n", type);
+			return -EINVAL;
+		}
+	}
+
+	if (rem > 0) {
+		OVS_NLERR(log, "NAT attribute has %d unknown bytes.\n", rem);
+		return -EINVAL;
+	}
+	if (!info->nat) {
+		/* Do not allow flags if no type is given. */
+		if (info->range.flags) {
+			OVS_NLERR(log,
+				  "NAT flags may be given only when NAT range (SRC or DST) is also specified.\n"
+				  );
+			return -EINVAL;
+		}
+		info->nat = OVS_CT_NAT;   /* NAT existing connections. */
+	} else if (!info->commit) {
+		OVS_NLERR(log,
+			  "NAT attributes may be specified only when CT COMMIT flag is also specified.\n"
+			  );
+		return -EINVAL;
+	}
+	/* Allow missing IP_MAX. */
+	if (info->range.flags & NF_NAT_RANGE_MAP_IPS && !have_ip_max) {
+		memcpy(&info->range.max_addr, &info->range.min_addr,
+		       sizeof(info->range.max_addr));
+	}
+	/* Allow missing PROTO_MAX. */
+	if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED &&
+	    !have_proto_max) {
+		info->range.max_proto.all = info->range.min_proto.all;
+	}
+	return 0;
+}
+#endif
+
 static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
 	[OVS_CT_ATTR_COMMIT]	= { .minlen = 0, .maxlen = 0 },
 	[OVS_CT_ATTR_ZONE]	= { .minlen = sizeof(u16),
@@ -662,7 +1050,11 @@ static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
 	[OVS_CT_ATTR_LABELS]	= { .minlen = sizeof(struct md_labels),
 				    .maxlen = sizeof(struct md_labels) },
 	[OVS_CT_ATTR_HELPER]	= { .minlen = 1,
-				    .maxlen = NF_CT_HELPER_NAME_LEN }
+				    .maxlen = NF_CT_HELPER_NAME_LEN },
+#ifdef CONFIG_NF_NAT_NEEDED
+	/* NAT length is checked when parsing the nested attributes. */
+	[OVS_CT_ATTR_NAT]	= { .minlen = 0, .maxlen = INT_MAX },
+#endif
 };
 
 static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
@@ -729,6 +1121,15 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
 				return -EINVAL;
 			}
 			break;
+#ifdef CONFIG_NF_NAT_NEEDED
+		case OVS_CT_ATTR_NAT: {
+			int err = parse_nat(a, info, log);
+
+			if (err)
+				return err;
+			break;
+		}
+#endif
 		default:
 			OVS_NLERR(log, "Unknown conntrack attr (%d)",
 				  type);
@@ -816,6 +1217,74 @@ err_free_ct:
 	return err;
 }
 
+#ifdef CONFIG_NF_NAT_NEEDED
+static bool ovs_ct_nat_to_attr(const struct ovs_conntrack_info *info,
+			       struct sk_buff *skb)
+{
+	struct nlattr *start;
+
+	start = nla_nest_start(skb, OVS_CT_ATTR_NAT);
+	if (!start)
+		return false;
+
+	if (info->nat & OVS_CT_SRC_NAT) {
+		if (nla_put_flag(skb, OVS_NAT_ATTR_SRC))
+			return false;
+	} else if (info->nat & OVS_CT_DST_NAT) {
+		if (nla_put_flag(skb, OVS_NAT_ATTR_DST))
+			return false;
+	} else {
+		goto out;
+	}
+
+	if (info->range.flags & NF_NAT_RANGE_MAP_IPS) {
+		if (info->family == NFPROTO_IPV4) {
+			if (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MIN,
+					    info->range.min_addr.ip) ||
+			    (info->range.max_addr.ip
+			     != info->range.min_addr.ip &&
+			     (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MAX,
+					      info->range.max_addr.ip))))
+				return false;
+#if IS_ENABLED(CONFIG_NF_NAT_IPV6)
+		} else if (info->family == NFPROTO_IPV6) {
+			if (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MIN,
+					     &info->range.min_addr.in6) ||
+			    (memcmp(&info->range.max_addr.in6,
+				    &info->range.min_addr.in6,
+				    sizeof(info->range.max_addr.in6)) &&
+			     (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MAX,
+					       &info->range.max_addr.in6))))
+				return false;
+#endif
+		} else {
+			return false;
+		}
+	}
+	if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED &&
+	    (nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MIN,
+			 ntohs(info->range.min_proto.all)) ||
+	     (info->range.max_proto.all != info->range.min_proto.all &&
+	      nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MAX,
+			  ntohs(info->range.max_proto.all)))))
+		return false;
+
+	if (info->range.flags & NF_NAT_RANGE_PERSISTENT &&
+	    nla_put_flag(skb, OVS_NAT_ATTR_PERSISTENT))
+		return false;
+	if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM &&
+	    nla_put_flag(skb, OVS_NAT_ATTR_PROTO_HASH))
+		return false;
+	if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY &&
+	    nla_put_flag(skb, OVS_NAT_ATTR_PROTO_RANDOM))
+		return false;
+out:
+	nla_nest_end(skb, start);
+
+	return true;
+}
+#endif
+
 int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
 			  struct sk_buff *skb)
 {
@@ -844,7 +1313,10 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
 				   ct_info->helper->name))
 			return -EMSGSIZE;
 	}
-
+#ifdef CONFIG_NF_NAT_NEEDED
+	if (ct_info->nat && !ovs_ct_nat_to_attr(ct_info, skb))
+		return -EMSGSIZE;
+#endif
 	nla_nest_end(skb, start);
 
 	return 0;
diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h
index a7544f405c16..8f6230bd6183 100644
--- a/net/openvswitch/conntrack.h
+++ b/net/openvswitch/conntrack.h
@@ -37,7 +37,8 @@ void ovs_ct_free_action(const struct nlattr *a);
 
 #define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \
 			   OVS_CS_F_RELATED | OVS_CS_F_REPLY_DIR | \
-			   OVS_CS_F_INVALID | OVS_CS_F_TRACKED)
+			   OVS_CS_F_INVALID | OVS_CS_F_TRACKED | \
+			   OVS_CS_F_SRC_NAT | OVS_CS_F_DST_NAT)
 #else
 #include <linux/errno.h>
 
-- 
cgit v1.2.3-59-g8ed1b


From acffb584cda7069b0c2c83045503ccd07516a891 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 14 Mar 2016 15:40:00 -0700
Subject: net: diag: add a scheduling point in inet_diag_dump_icsk()

On loaded TCP servers, looking at millions of sockets can hold
cpu for many seconds, if the lookup condition is very narrow.

(eg : ss dst 1.2.3.4 )

Better add a cond_resched() to allow other processes to access
the cpu.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_diag.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 50c0d96b8441..5fdb02f5598e 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -879,6 +879,7 @@ next_normal:
 		}
 
 		spin_unlock_bh(lock);
+		cond_resched();
 	}
 
 done:
-- 
cgit v1.2.3-59-g8ed1b


From e39365be031e37b229f745ea49db0b25e82436fa Mon Sep 17 00:00:00 2001
From: Nicholas Mc Guire <hofrat@osadl.org>
Date: Mon, 14 Mar 2016 12:39:02 +0100
Subject: netfilter: nf_conntrack: consolidate lock/unlock into unlock_wait

The spin_lock()/spin_unlock() is synchronizing on the
nf_conntrack_locks_all_lock which is equivalent to
spin_unlock_wait() but the later should be more efficient.

Signed-off-by: Nicholas Mc Guire <hofrat@osadl.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_core.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index f60b4fdeeb8c..afde5f5e728a 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -74,8 +74,7 @@ void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
 	spin_lock(lock);
 	while (unlikely(nf_conntrack_locks_all)) {
 		spin_unlock(lock);
-		spin_lock(&nf_conntrack_locks_all_lock);
-		spin_unlock(&nf_conntrack_locks_all_lock);
+		spin_unlock_wait(&nf_conntrack_locks_all_lock);
 		spin_lock(lock);
 	}
 }
@@ -121,8 +120,7 @@ static void nf_conntrack_all_lock(void)
 	nf_conntrack_locks_all = true;
 
 	for (i = 0; i < CONNTRACK_LOCKS; i++) {
-		spin_lock(&nf_conntrack_locks[i]);
-		spin_unlock(&nf_conntrack_locks[i]);
+		spin_unlock_wait(&nf_conntrack_locks[i]);
 	}
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 25528213fe9f75f4e286f08d35a73ca2bb634a50 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 15 Mar 2016 14:52:49 -0700
Subject: tags: Fix DEFINE_PER_CPU expansions

$ make tags
  GEN     tags
ctags: Warning: drivers/acpi/processor_idle.c:64: null expansion of name pattern "\1"
ctags: Warning: drivers/xen/events/events_2l.c:41: null expansion of name pattern "\1"
ctags: Warning: kernel/locking/lockdep.c:151: null expansion of name pattern "\1"
ctags: Warning: kernel/rcu/rcutorture.c:133: null expansion of name pattern "\1"
ctags: Warning: kernel/rcu/rcutorture.c:135: null expansion of name pattern "\1"
ctags: Warning: kernel/workqueue.c:323: null expansion of name pattern "\1"
ctags: Warning: net/ipv4/syncookies.c:53: null expansion of name pattern "\1"
ctags: Warning: net/ipv6/syncookies.c:44: null expansion of name pattern "\1"
ctags: Warning: net/rds/page.c:45: null expansion of name pattern "\1"

Which are all the result of the DEFINE_PER_CPU pattern:

  scripts/tags.sh:200:	'/\<DEFINE_PER_CPU([^,]*, *\([[:alnum:]_]*\)/\1/v/'
  scripts/tags.sh:201:	'/\<DEFINE_PER_CPU_SHARED_ALIGNED([^,]*, *\([[:alnum:]_]*\)/\1/v/'

The below cures them. All except the workqueue one are within reasonable
distance of the 80 char limit. TJ do you have any preference on how to
fix the wq one, or shall we just not care its too long?

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/acpi/processor_idle.c  | 4 ++--
 drivers/xen/events/events_2l.c | 5 +++--
 kernel/locking/lockdep.c       | 3 +--
 kernel/rcu/rcutorture.c        | 6 ++----
 kernel/workqueue.c             | 3 +--
 net/ipv4/syncookies.c          | 3 +--
 net/ipv6/syncookies.c          | 3 +--
 net/rds/page.c                 | 4 ++--
 8 files changed, 13 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 175c86bee3a9..9ca2b2fefd76 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -61,8 +61,8 @@ module_param(latency_factor, uint, 0644);
 
 static DEFINE_PER_CPU(struct cpuidle_device *, acpi_cpuidle_device);
 
-static DEFINE_PER_CPU(struct acpi_processor_cx * [CPUIDLE_STATE_MAX],
-								acpi_cstate);
+static
+DEFINE_PER_CPU(struct acpi_processor_cx * [CPUIDLE_STATE_MAX], acpi_cstate);
 
 static int disabled_by_idle_boot_param(void)
 {
diff --git a/drivers/xen/events/events_2l.c b/drivers/xen/events/events_2l.c
index 7dd46312c180..403fe3955393 100644
--- a/drivers/xen/events/events_2l.c
+++ b/drivers/xen/events/events_2l.c
@@ -38,8 +38,9 @@
 /* Find the first set bit in a evtchn mask */
 #define EVTCHN_FIRST_BIT(w) find_first_bit(BM(&(w)), BITS_PER_EVTCHN_WORD)
 
-static DEFINE_PER_CPU(xen_ulong_t [EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD],
-		      cpu_evtchn_mask);
+#define EVTCHN_MASK_SIZE (EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD)
+
+static DEFINE_PER_CPU(xen_ulong_t [EVTCHN_MASK_SIZE], cpu_evtchn_mask);
 
 static unsigned evtchn_2l_max_channels(void)
 {
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index f894a2cd9b2a..53ab2f85d77e 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -148,8 +148,7 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock)
 }
 
 #ifdef CONFIG_LOCK_STAT
-static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS],
-		      cpu_lock_stats);
+static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], cpu_lock_stats);
 
 static inline u64 lockstat_clock(void)
 {
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index d2988d047d66..4d5cc6aa7e1e 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -130,10 +130,8 @@ static struct rcu_torture __rcu *rcu_torture_current;
 static unsigned long rcu_torture_current_version;
 static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
 static DEFINE_SPINLOCK(rcu_torture_lock);
-static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1],
-		      rcu_torture_count) = { 0 };
-static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1],
-		      rcu_torture_batch) = { 0 };
+static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = { 0 };
+static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = { 0 };
 static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
 static atomic_t n_rcu_torture_alloc;
 static atomic_t n_rcu_torture_alloc_fail;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 7ff5dc7d2ac5..16e13d8628a3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -320,8 +320,7 @@ static bool wq_debug_force_rr_cpu = false;
 module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644);
 
 /* the per-cpu worker pools */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
-				     cpu_worker_pools);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools);
 
 static DEFINE_IDR(worker_pool_idr);	/* PR: idr of all pools */
 
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 643a86c49020..2d5589b61e9f 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -50,8 +50,7 @@ static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
 #define TSBITS	6
 #define TSMASK	(((__u32)1 << TSBITS) - 1)
 
-static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
-		      ipv4_cookie_scratch);
+static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv4_cookie_scratch);
 
 static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
 		       u32 count, int c)
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 2906ef20795e..aae3e5ca63ea 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -41,8 +41,7 @@ static __u16 const msstab[] = {
 	9000 - 60,
 };
 
-static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
-		      ipv6_cookie_scratch);
+static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv6_cookie_scratch);
 
 static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *daddr,
 		       __be16 sport, __be16 dport, u32 count, int c)
diff --git a/net/rds/page.c b/net/rds/page.c
index 5a14e6d6a926..616f21f4e7d7 100644
--- a/net/rds/page.c
+++ b/net/rds/page.c
@@ -42,8 +42,8 @@ struct rds_page_remainder {
 	unsigned long	r_offset;
 };
 
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder,
-				     rds_page_remainders);
+static
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders);
 
 /*
  * returns 0 on success or -errno on failure.
-- 
cgit v1.2.3-59-g8ed1b


From 489ce5f441e4094a3419d7e8e29fd756b7f391d4 Mon Sep 17 00:00:00 2001
From: Nicholas Mc Guire <hofrat@osadl.org>
Date: Sun, 13 Mar 2016 11:48:24 +0100
Subject: sctp: consolidate local_bh_disable/enable + spin_lock/unlock to _bh
 variant

local_bh_disable() + spin_lock() is equivalent to spin_lock_bh(), same for
the unlock/enable case, so replace the calls by the appropriate wrappers.

Signed-off-by: Nicholas Mc Guire <hofrat@osadl.org>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index b89501e5c1a1..f28ecf88cfea 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -7254,14 +7254,12 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
 	/* Hook this new socket in to the bind_hash list. */
 	head = &sctp_port_hashtable[sctp_phashfn(sock_net(oldsk),
 						 inet_sk(oldsk)->inet_num)];
-	local_bh_disable();
-	spin_lock(&head->lock);
+	spin_lock_bh(&head->lock);
 	pp = sctp_sk(oldsk)->bind_hash;
 	sk_add_bind_node(newsk, &pp->owner);
 	sctp_sk(newsk)->bind_hash = pp;
 	inet_sk(newsk)->inet_num = inet_sk(oldsk)->inet_num;
-	spin_unlock(&head->lock);
-	local_bh_enable();
+	spin_unlock_bh(&head->lock);
 
 	/* Copy the bind_addr list from the original endpoint to the new
 	 * endpoint so that we can handle restarts properly
-- 
cgit v1.2.3-59-g8ed1b


From 793cf87de9d1a62dc9079c3ec5fcc01cfc62fafb Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben@decadent.org.uk>
Date: Mon, 14 Mar 2016 01:05:38 +0000
Subject: ethtool: Set cmd field in ETHTOOL_GLINKSETTINGS response to wrong
 nwords

When the ETHTOOL_GLINKSETTINGS implementation finds that userland is
using the wrong number of words of link mode bitmaps (or is trying to
find out the right numbers) it sets the cmd field to 0 in the response
structure.

This is inconsistent with the implementation of every other ethtool
command, so let's remove that inconsistency before it gets into a
stable release.

Fixes: 3f1ac7a700d03 ("net: ethtool: add new ETHTOOL_xLINKSETTINGS API")
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 2966cd0d7c93..f426c5ad6149 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -655,7 +655,7 @@ static int ethtool_get_link_ksettings(struct net_device *dev,
 	    != link_ksettings.base.link_mode_masks_nwords) {
 		/* wrong link mode nbits requested */
 		memset(&link_ksettings, 0, sizeof(link_ksettings));
-		/* keep cmd field reset to 0 */
+		link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS;
 		/* send back number of words required as negative val */
 		compiletime_assert(__ETHTOOL_LINK_MODE_MASK_NU32 <= S8_MAX,
 				   "need too many bits for link modes!");
-- 
cgit v1.2.3-59-g8ed1b


From fe896d1878949ea92ba547587bc3075cc688fb8f Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Date: Thu, 17 Mar 2016 14:19:26 -0700
Subject: mm: introduce page reference manipulation functions

The success of CMA allocation largely depends on the success of
migration and key factor of it is page reference count.  Until now, page
reference is manipulated by direct calling atomic functions so we cannot
follow up who and where manipulate it.  Then, it is hard to find actual
reason of CMA allocation failure.  CMA allocation should be guaranteed
to succeed so finding offending place is really important.

In this patch, call sites where page reference is manipulated are
converted to introduced wrapper function.  This is preparation step to
add tracepoint to each page reference manipulation function.  With this
facility, we can easily find reason of CMA allocation failure.  There is
no functional change in this patch.

In addition, this patch also converts reference read sites.  It will
help a second step that renames page._count to something else and
prevents later attempt to direct access to it (Suggested by Andrew).

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Acked-by: Michal Nazarewicz <mina86@mina86.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky.work@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/mips/mm/gup.c                                |  2 +-
 arch/powerpc/mm/mmu_context_hash64.c              |  3 +-
 arch/powerpc/mm/pgtable_64.c                      |  2 +-
 arch/powerpc/platforms/512x/mpc512x_shared.c      |  2 +-
 arch/x86/mm/gup.c                                 |  2 +-
 drivers/block/aoe/aoecmd.c                        |  4 +-
 drivers/net/ethernet/freescale/gianfar.c          |  2 +-
 drivers/net/ethernet/intel/fm10k/fm10k_main.c     |  2 +-
 drivers/net/ethernet/intel/igb/igb_main.c         |  2 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c     |  2 +-
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c |  2 +-
 drivers/net/ethernet/mellanox/mlx4/en_rx.c        |  9 ++-
 drivers/net/ethernet/sun/niu.c                    |  2 +-
 fs/nilfs2/page.c                                  |  2 +-
 include/linux/mm.h                                | 25 ++-----
 include/linux/page_ref.h                          | 85 +++++++++++++++++++++++
 include/linux/pagemap.h                           | 19 +----
 mm/debug.c                                        |  2 +-
 mm/huge_memory.c                                  |  6 +-
 mm/internal.h                                     |  7 +-
 mm/memory_hotplug.c                               |  4 +-
 mm/migrate.c                                      | 10 +--
 mm/page_alloc.c                                   | 12 ++--
 mm/vmscan.c                                       |  6 +-
 net/core/sock.c                                   |  2 +-
 25 files changed, 134 insertions(+), 82 deletions(-)
 create mode 100644 include/linux/page_ref.h

(limited to 'net')

diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c
index 1afd87c999b0..6cdffc76735c 100644
--- a/arch/mips/mm/gup.c
+++ b/arch/mips/mm/gup.c
@@ -64,7 +64,7 @@ static inline void get_head_page_multiple(struct page *page, int nr)
 {
 	VM_BUG_ON(page != compound_head(page));
 	VM_BUG_ON(page_count(page) == 0);
-	atomic_add(nr, &page->_count);
+	page_ref_add(page, nr);
 	SetPageReferenced(page);
 }
 
diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c
index 4e4efbc2658e..9ca6fe16cb29 100644
--- a/arch/powerpc/mm/mmu_context_hash64.c
+++ b/arch/powerpc/mm/mmu_context_hash64.c
@@ -118,8 +118,7 @@ static void destroy_pagetable_page(struct mm_struct *mm)
 	/* drop all the pending references */
 	count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT;
 	/* We allow PTE_FRAG_NR fragments from a PTE page */
-	count = atomic_sub_return(PTE_FRAG_NR - count, &page->_count);
-	if (!count) {
+	if (page_ref_sub_and_test(page, PTE_FRAG_NR - count)) {
 		pgtable_page_dtor(page);
 		free_hot_cold_page(page, 0);
 	}
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index cdf2123d46db..d9cc66cbdbb7 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -403,7 +403,7 @@ static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel)
 	 * count.
 	 */
 	if (likely(!mm->context.pte_frag)) {
-		atomic_set(&page->_count, PTE_FRAG_NR);
+		set_page_count(page, PTE_FRAG_NR);
 		mm->context.pte_frag = ret + PTE_FRAG_SIZE;
 	}
 	spin_unlock(&mm->page_table_lock);
diff --git a/arch/powerpc/platforms/512x/mpc512x_shared.c b/arch/powerpc/platforms/512x/mpc512x_shared.c
index 711f3d352af7..452da2391153 100644
--- a/arch/powerpc/platforms/512x/mpc512x_shared.c
+++ b/arch/powerpc/platforms/512x/mpc512x_shared.c
@@ -188,7 +188,7 @@ static struct fsl_diu_shared_fb __attribute__ ((__aligned__(8))) diu_shared_fb;
 static inline void mpc512x_free_bootmem(struct page *page)
 {
 	BUG_ON(PageTail(page));
-	BUG_ON(atomic_read(&page->_count) > 1);
+	BUG_ON(page_ref_count(page) > 1);
 	free_reserved_page(page);
 }
 
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index d8a798d8bf50..f8d0b5e8bdfd 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -131,7 +131,7 @@ static inline void get_head_page_multiple(struct page *page, int nr)
 {
 	VM_BUG_ON_PAGE(page != compound_head(page), page);
 	VM_BUG_ON_PAGE(page_count(page) == 0, page);
-	atomic_add(nr, &page->_count);
+	page_ref_add(page, nr);
 	SetPageReferenced(page);
 }
 
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index d048d2009e89..437b3a822f44 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -875,7 +875,7 @@ bio_pageinc(struct bio *bio)
 		 * compound pages is no longer allowed by the kernel.
 		 */
 		page = compound_head(bv.bv_page);
-		atomic_inc(&page->_count);
+		page_ref_inc(page);
 	}
 }
 
@@ -888,7 +888,7 @@ bio_pagedec(struct bio *bio)
 
 	bio_for_each_segment(bv, bio, iter) {
 		page = compound_head(bv.bv_page);
-		atomic_dec(&page->_count);
+		page_ref_dec(page);
 	}
 }
 
diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c
index b9ecf197ad11..f21b2c479780 100644
--- a/drivers/net/ethernet/freescale/gianfar.c
+++ b/drivers/net/ethernet/freescale/gianfar.c
@@ -2944,7 +2944,7 @@ static bool gfar_add_rx_frag(struct gfar_rx_buff *rxb, u32 lstatus,
 	/* change offset to the other half */
 	rxb->page_offset ^= GFAR_RXB_TRUESIZE;
 
-	atomic_inc(&page->_count);
+	page_ref_inc(page);
 
 	return true;
 }
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
index b243c3cbe68f..b4547ebed774 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
@@ -243,7 +243,7 @@ static bool fm10k_can_reuse_rx_page(struct fm10k_rx_buffer *rx_buffer,
 	/* Even if we own the page, we are not allowed to use atomic_set()
 	 * This would break get_page_unless_zero() users.
 	 */
-	atomic_inc(&page->_count);
+	page_ref_inc(page);
 
 	return true;
 }
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 31e5f3942839..5b4ad1ad4d5f 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -6630,7 +6630,7 @@ static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer,
 	/* Even if we own the page, we are not allowed to use atomic_set()
 	 * This would break get_page_unless_zero() users.
 	 */
-	atomic_inc(&page->_count);
+	page_ref_inc(page);
 
 	return true;
 }
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index c4003a88bbf6..e6035ff6b861 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1942,7 +1942,7 @@ static bool ixgbe_add_rx_frag(struct ixgbe_ring *rx_ring,
 	/* Even if we own the page, we are not allowed to use atomic_set()
 	 * This would break get_page_unless_zero() users.
 	 */
-	atomic_inc(&page->_count);
+	page_ref_inc(page);
 
 	return true;
 }
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 3558f019b631..0ea14c0a2e74 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -837,7 +837,7 @@ add_tail_frag:
 	/* Even if we own the page, we are not allowed to use atomic_set()
 	 * This would break get_page_unless_zero() users.
 	 */
-	atomic_inc(&page->_count);
+	page_ref_inc(page);
 
 	return true;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 41440b2b20a3..86bcfe510e4e 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -82,8 +82,7 @@ static int mlx4_alloc_pages(struct mlx4_en_priv *priv,
 	/* Not doing get_page() for each frag is a big win
 	 * on asymetric workloads. Note we can not use atomic_set().
 	 */
-	atomic_add(page_alloc->page_size / frag_info->frag_stride - 1,
-		   &page->_count);
+	page_ref_add(page, page_alloc->page_size / frag_info->frag_stride - 1);
 	return 0;
 }
 
@@ -127,7 +126,7 @@ out:
 			dma_unmap_page(priv->ddev, page_alloc[i].dma,
 				page_alloc[i].page_size, PCI_DMA_FROMDEVICE);
 			page = page_alloc[i].page;
-			atomic_set(&page->_count, 1);
+			set_page_count(page, 1);
 			put_page(page);
 		}
 	}
@@ -165,7 +164,7 @@ static int mlx4_en_init_allocator(struct mlx4_en_priv *priv,
 
 		en_dbg(DRV, priv, "  frag %d allocator: - size:%d frags:%d\n",
 		       i, ring->page_alloc[i].page_size,
-		       atomic_read(&ring->page_alloc[i].page->_count));
+		       page_ref_count(ring->page_alloc[i].page));
 	}
 	return 0;
 
@@ -177,7 +176,7 @@ out:
 		dma_unmap_page(priv->ddev, page_alloc->dma,
 			       page_alloc->page_size, PCI_DMA_FROMDEVICE);
 		page = page_alloc->page;
-		atomic_set(&page->_count, 1);
+		set_page_count(page, 1);
 		put_page(page);
 		page_alloc->page = NULL;
 	}
diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c
index ab6051a43134..9cc45649f477 100644
--- a/drivers/net/ethernet/sun/niu.c
+++ b/drivers/net/ethernet/sun/niu.c
@@ -3341,7 +3341,7 @@ static int niu_rbr_add_page(struct niu *np, struct rx_ring_info *rp,
 
 	niu_hash_page(rp, page, addr);
 	if (rp->rbr_blocks_per_page > 1)
-		atomic_add(rp->rbr_blocks_per_page - 1, &page->_count);
+		page_ref_add(page, rp->rbr_blocks_per_page - 1);
 
 	for (i = 0; i < rp->rbr_blocks_per_page; i++) {
 		__le32 *rbr = &rp->rbr[start_index + i];
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 45d650addd56..c20df77eff99 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -180,7 +180,7 @@ void nilfs_page_bug(struct page *page)
 
 	printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx "
 	       "mapping=%p ino=%lu\n",
-	       page, atomic_read(&page->_count),
+	       page, page_ref_count(page),
 	       (unsigned long long)page->index, page->flags, m, ino);
 
 	if (page_has_buffers(page)) {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f7fd64227d3a..997fc2e5d9d8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -22,6 +22,7 @@
 #include <linux/resource.h>
 #include <linux/page_ext.h>
 #include <linux/err.h>
+#include <linux/page_ref.h>
 
 struct mempolicy;
 struct anon_vma;
@@ -386,8 +387,8 @@ static inline int pmd_devmap(pmd_t pmd)
  */
 static inline int put_page_testzero(struct page *page)
 {
-	VM_BUG_ON_PAGE(atomic_read(&page->_count) == 0, page);
-	return atomic_dec_and_test(&page->_count);
+	VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
+	return page_ref_dec_and_test(page);
 }
 
 /*
@@ -398,7 +399,7 @@ static inline int put_page_testzero(struct page *page)
  */
 static inline int get_page_unless_zero(struct page *page)
 {
-	return atomic_inc_not_zero(&page->_count);
+	return page_ref_add_unless(page, 1, 0);
 }
 
 extern int page_is_ram(unsigned long pfn);
@@ -486,11 +487,6 @@ static inline int total_mapcount(struct page *page)
 }
 #endif
 
-static inline int page_count(struct page *page)
-{
-	return atomic_read(&compound_head(page)->_count);
-}
-
 static inline struct page *virt_to_head_page(const void *x)
 {
 	struct page *page = virt_to_page(x);
@@ -498,15 +494,6 @@ static inline struct page *virt_to_head_page(const void *x)
 	return compound_head(page);
 }
 
-/*
- * Setup the page count before being freed into the page allocator for
- * the first time (boot or memory hotplug)
- */
-static inline void init_page_count(struct page *page)
-{
-	atomic_set(&page->_count, 1);
-}
-
 void __put_page(struct page *page);
 
 void put_pages_list(struct list_head *pages);
@@ -716,8 +703,8 @@ static inline void get_page(struct page *page)
 	 * Getting a normal page or the head of a compound page
 	 * requires to already have an elevated page->_count.
 	 */
-	VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
-	atomic_inc(&page->_count);
+	VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page);
+	page_ref_inc(page);
 
 	if (unlikely(is_zone_device_page(page)))
 		get_zone_device_page(page);
diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h
new file mode 100644
index 000000000000..30f5817f6b8e
--- /dev/null
+++ b/include/linux/page_ref.h
@@ -0,0 +1,85 @@
+#ifndef _LINUX_PAGE_REF_H
+#define _LINUX_PAGE_REF_H
+
+#include <linux/atomic.h>
+#include <linux/mm_types.h>
+#include <linux/page-flags.h>
+
+static inline int page_ref_count(struct page *page)
+{
+	return atomic_read(&page->_count);
+}
+
+static inline int page_count(struct page *page)
+{
+	return atomic_read(&compound_head(page)->_count);
+}
+
+static inline void set_page_count(struct page *page, int v)
+{
+	atomic_set(&page->_count, v);
+}
+
+/*
+ * Setup the page count before being freed into the page allocator for
+ * the first time (boot or memory hotplug)
+ */
+static inline void init_page_count(struct page *page)
+{
+	set_page_count(page, 1);
+}
+
+static inline void page_ref_add(struct page *page, int nr)
+{
+	atomic_add(nr, &page->_count);
+}
+
+static inline void page_ref_sub(struct page *page, int nr)
+{
+	atomic_sub(nr, &page->_count);
+}
+
+static inline void page_ref_inc(struct page *page)
+{
+	atomic_inc(&page->_count);
+}
+
+static inline void page_ref_dec(struct page *page)
+{
+	atomic_dec(&page->_count);
+}
+
+static inline int page_ref_sub_and_test(struct page *page, int nr)
+{
+	return atomic_sub_and_test(nr, &page->_count);
+}
+
+static inline int page_ref_dec_and_test(struct page *page)
+{
+	return atomic_dec_and_test(&page->_count);
+}
+
+static inline int page_ref_dec_return(struct page *page)
+{
+	return atomic_dec_return(&page->_count);
+}
+
+static inline int page_ref_add_unless(struct page *page, int nr, int u)
+{
+	return atomic_add_unless(&page->_count, nr, u);
+}
+
+static inline int page_ref_freeze(struct page *page, int count)
+{
+	return likely(atomic_cmpxchg(&page->_count, count, 0) == count);
+}
+
+static inline void page_ref_unfreeze(struct page *page, int count)
+{
+	VM_BUG_ON_PAGE(page_count(page) != 0, page);
+	VM_BUG_ON(count == 0);
+
+	atomic_set(&page->_count, count);
+}
+
+#endif
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 183b15ea052b..1ebd65c91422 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -165,7 +165,7 @@ static inline int page_cache_get_speculative(struct page *page)
 	 * SMP requires.
 	 */
 	VM_BUG_ON_PAGE(page_count(page) == 0, page);
-	atomic_inc(&page->_count);
+	page_ref_inc(page);
 
 #else
 	if (unlikely(!get_page_unless_zero(page))) {
@@ -194,10 +194,10 @@ static inline int page_cache_add_speculative(struct page *page, int count)
 	VM_BUG_ON(!in_atomic());
 # endif
 	VM_BUG_ON_PAGE(page_count(page) == 0, page);
-	atomic_add(count, &page->_count);
+	page_ref_add(page, count);
 
 #else
-	if (unlikely(!atomic_add_unless(&page->_count, count, 0)))
+	if (unlikely(!page_ref_add_unless(page, count, 0)))
 		return 0;
 #endif
 	VM_BUG_ON_PAGE(PageCompound(page) && page != compound_head(page), page);
@@ -205,19 +205,6 @@ static inline int page_cache_add_speculative(struct page *page, int count)
 	return 1;
 }
 
-static inline int page_freeze_refs(struct page *page, int count)
-{
-	return likely(atomic_cmpxchg(&page->_count, count, 0) == count);
-}
-
-static inline void page_unfreeze_refs(struct page *page, int count)
-{
-	VM_BUG_ON_PAGE(page_count(page) != 0, page);
-	VM_BUG_ON(count == 0);
-
-	atomic_set(&page->_count, count);
-}
-
 #ifdef CONFIG_NUMA
 extern struct page *__page_cache_alloc(gfp_t gfp);
 #else
diff --git a/mm/debug.c b/mm/debug.c
index df7247b0b532..8865bfb41b0b 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -43,7 +43,7 @@ const struct trace_print_flags vmaflag_names[] = {
 void __dump_page(struct page *page, const char *reason)
 {
 	pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx",
-		  page, atomic_read(&page->_count), page_mapcount(page),
+		  page, page_ref_count(page), page_mapcount(page),
 		  page->mapping, page->index);
 	if (PageCompound(page))
 		pr_cont(" compound_mapcount: %d", compound_mapcount(page));
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e08b1659ff19..bb944c771c82 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2888,7 +2888,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 
 	page = pmd_page(*pmd);
 	VM_BUG_ON_PAGE(!page_count(page), page);
-	atomic_add(HPAGE_PMD_NR - 1, &page->_count);
+	page_ref_add(page, HPAGE_PMD_NR - 1);
 	write = pmd_write(*pmd);
 	young = pmd_young(*pmd);
 	dirty = pmd_dirty(*pmd);
@@ -3257,7 +3257,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
 	struct page *page_tail = head + tail;
 
 	VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
-	VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail);
+	VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail);
 
 	/*
 	 * tail_page->_count is zero and not changing from under us. But
@@ -3270,7 +3270,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
 	 * atomic_set() here would be safe on all archs (and not only on x86),
 	 * it's safer to use atomic_inc().
 	 */
-	atomic_inc(&page_tail->_count);
+	page_ref_inc(page_tail);
 
 	page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
 	page_tail->flags |= (head->flags &
diff --git a/mm/internal.h b/mm/internal.h
index 4042a8a05672..57d7b0e839f0 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -38,11 +38,6 @@
 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
 		unsigned long floor, unsigned long ceiling);
 
-static inline void set_page_count(struct page *page, int v)
-{
-	atomic_set(&page->_count, v);
-}
-
 extern int __do_page_cache_readahead(struct address_space *mapping,
 		struct file *filp, pgoff_t offset, unsigned long nr_to_read,
 		unsigned long lookahead_size);
@@ -64,7 +59,7 @@ static inline unsigned long ra_submit(struct file_ra_state *ra,
 static inline void set_page_refcounted(struct page *page)
 {
 	VM_BUG_ON_PAGE(PageTail(page), page);
-	VM_BUG_ON_PAGE(atomic_read(&page->_count), page);
+	VM_BUG_ON_PAGE(page_ref_count(page), page);
 	set_page_count(page, 1);
 }
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c832ef3565cc..e62aa078f5c9 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -167,7 +167,7 @@ void get_page_bootmem(unsigned long info,  struct page *page,
 	page->lru.next = (struct list_head *) type;
 	SetPagePrivate(page);
 	set_page_private(page, info);
-	atomic_inc(&page->_count);
+	page_ref_inc(page);
 }
 
 void put_page_bootmem(struct page *page)
@@ -178,7 +178,7 @@ void put_page_bootmem(struct page *page)
 	BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
 	       type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
 
-	if (atomic_dec_return(&page->_count) == 1) {
+	if (page_ref_dec_return(page) == 1) {
 		ClearPagePrivate(page);
 		set_page_private(page, 0);
 		INIT_LIST_HEAD(&page->lru);
diff --git a/mm/migrate.c b/mm/migrate.c
index fdaf0818fb30..577c94b8e959 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -349,7 +349,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
 		return -EAGAIN;
 	}
 
-	if (!page_freeze_refs(page, expected_count)) {
+	if (!page_ref_freeze(page, expected_count)) {
 		spin_unlock_irq(&mapping->tree_lock);
 		return -EAGAIN;
 	}
@@ -363,7 +363,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
 	 */
 	if (mode == MIGRATE_ASYNC && head &&
 			!buffer_migrate_lock_buffers(head, mode)) {
-		page_unfreeze_refs(page, expected_count);
+		page_ref_unfreeze(page, expected_count);
 		spin_unlock_irq(&mapping->tree_lock);
 		return -EAGAIN;
 	}
@@ -397,7 +397,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
 	 * to one less reference.
 	 * We know this isn't the last reference.
 	 */
-	page_unfreeze_refs(page, expected_count - 1);
+	page_ref_unfreeze(page, expected_count - 1);
 
 	spin_unlock(&mapping->tree_lock);
 	/* Leave irq disabled to prevent preemption while updating stats */
@@ -451,7 +451,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
 		return -EAGAIN;
 	}
 
-	if (!page_freeze_refs(page, expected_count)) {
+	if (!page_ref_freeze(page, expected_count)) {
 		spin_unlock_irq(&mapping->tree_lock);
 		return -EAGAIN;
 	}
@@ -463,7 +463,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
 
 	radix_tree_replace_slot(pslot, newpage);
 
-	page_unfreeze_refs(page, expected_count - 1);
+	page_ref_unfreeze(page, expected_count - 1);
 
 	spin_unlock_irq(&mapping->tree_lock);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 096a00d98a45..30134a8f7cc8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -766,7 +766,7 @@ static inline int free_pages_check(struct page *page)
 		bad_reason = "nonzero mapcount";
 	if (unlikely(page->mapping != NULL))
 		bad_reason = "non-NULL mapping";
-	if (unlikely(atomic_read(&page->_count) != 0))
+	if (unlikely(page_ref_count(page) != 0))
 		bad_reason = "nonzero _count";
 	if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
 		bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
@@ -1462,7 +1462,7 @@ static inline int check_new_page(struct page *page)
 		bad_reason = "nonzero mapcount";
 	if (unlikely(page->mapping != NULL))
 		bad_reason = "non-NULL mapping";
-	if (unlikely(atomic_read(&page->_count) != 0))
+	if (unlikely(page_ref_count(page) != 0))
 		bad_reason = "nonzero _count";
 	if (unlikely(page->flags & __PG_HWPOISON)) {
 		bad_reason = "HWPoisoned (hardware-corrupted)";
@@ -3475,7 +3475,7 @@ refill:
 		/* Even if we own the page, we do not use atomic_set().
 		 * This would break get_page_unless_zero() users.
 		 */
-		atomic_add(size - 1, &page->_count);
+		page_ref_add(page, size - 1);
 
 		/* reset page count bias and offset to start of new frag */
 		nc->pfmemalloc = page_is_pfmemalloc(page);
@@ -3487,7 +3487,7 @@ refill:
 	if (unlikely(offset < 0)) {
 		page = virt_to_page(nc->va);
 
-		if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count))
+		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
 			goto refill;
 
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
@@ -3495,7 +3495,7 @@ refill:
 		size = nc->size;
 #endif
 		/* OK, page count is 0, we can safely set it */
-		atomic_set(&page->_count, size);
+		set_page_count(page, size);
 
 		/* reset page count bias and offset to start of new frag */
 		nc->pagecnt_bias = size;
@@ -6852,7 +6852,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
 		 * This check already skips compound tails of THP
 		 * because their page->_count is zero at all time.
 		 */
-		if (!atomic_read(&page->_count)) {
+		if (!page_ref_count(page)) {
 			if (PageBuddy(page))
 				iter += (1 << page_order(page)) - 1;
 			continue;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b41b82d4bab1..b934223eaa45 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -638,11 +638,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 	 * Note that if SetPageDirty is always performed via set_page_dirty,
 	 * and thus under tree_lock, then this ordering is not required.
 	 */
-	if (!page_freeze_refs(page, 2))
+	if (!page_ref_freeze(page, 2))
 		goto cannot_free;
 	/* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
 	if (unlikely(PageDirty(page))) {
-		page_unfreeze_refs(page, 2);
+		page_ref_unfreeze(page, 2);
 		goto cannot_free;
 	}
 
@@ -704,7 +704,7 @@ int remove_mapping(struct address_space *mapping, struct page *page)
 		 * drops the pagecache ref for us without requiring another
 		 * atomic operation.
 		 */
-		page_unfreeze_refs(page, 1);
+		page_ref_unfreeze(page, 1);
 		return 1;
 	}
 	return 0;
diff --git a/net/core/sock.c b/net/core/sock.c
index 6c1c8bc93412..67e7efe12ff7 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1903,7 +1903,7 @@ EXPORT_SYMBOL(sock_cmsg_send);
 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
 {
 	if (pfrag->page) {
-		if (atomic_read(&pfrag->page->_count) == 1) {
+		if (page_ref_count(pfrag->page) == 1) {
 			pfrag->offset = 0;
 			return true;
 		}
-- 
cgit v1.2.3-59-g8ed1b


From f6773c5e95a6dc1af82157d4e96e412dee3abf31 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 16 Mar 2016 21:59:49 -0700
Subject: vlan: propagate gso_max_segs

vlan drivers lack proper propagation of gso_max_segs from
lower device.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ipvlan/ipvlan_main.c | 2 ++
 drivers/net/macvlan.c            | 2 ++
 net/8021q/vlan.c                 | 1 +
 net/8021q/vlan_dev.c             | 1 +
 4 files changed, 6 insertions(+)

(limited to 'net')

diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index 5802b9025765..57941d3f4227 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -119,6 +119,7 @@ static int ipvlan_init(struct net_device *dev)
 	dev->features = phy_dev->features & IPVLAN_FEATURES;
 	dev->features |= NETIF_F_LLTX;
 	dev->gso_max_size = phy_dev->gso_max_size;
+	dev->gso_max_segs = phy_dev->gso_max_segs;
 	dev->hard_header_len = phy_dev->hard_header_len;
 
 	ipvlan_set_lockdep_class(dev);
@@ -592,6 +593,7 @@ static int ipvlan_device_event(struct notifier_block *unused,
 		list_for_each_entry(ipvlan, &port->ipvlans, pnode) {
 			ipvlan->dev->features = dev->features & IPVLAN_FEATURES;
 			ipvlan->dev->gso_max_size = dev->gso_max_size;
+			ipvlan->dev->gso_max_segs = dev->gso_max_segs;
 			netdev_features_change(ipvlan->dev);
 		}
 		break;
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 6e953e3a460a..2bcf1f321bea 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -803,6 +803,7 @@ static int macvlan_init(struct net_device *dev)
 	dev->hw_features	|= NETIF_F_LRO;
 	dev->vlan_features	= lowerdev->vlan_features & MACVLAN_FEATURES;
 	dev->gso_max_size	= lowerdev->gso_max_size;
+	dev->gso_max_segs	= lowerdev->gso_max_segs;
 	dev->hard_header_len	= lowerdev->hard_header_len;
 
 	macvlan_set_lockdep_class(dev);
@@ -1532,6 +1533,7 @@ static int macvlan_device_event(struct notifier_block *unused,
 	case NETDEV_FEAT_CHANGE:
 		list_for_each_entry(vlan, &port->vlans, list) {
 			vlan->dev->gso_max_size = dev->gso_max_size;
+			vlan->dev->gso_max_segs = dev->gso_max_segs;
 			netdev_update_features(vlan->dev);
 		}
 		break;
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 69929c05c843..a1e273af6fc8 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -311,6 +311,7 @@ static void vlan_transfer_features(struct net_device *dev,
 	struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev);
 
 	vlandev->gso_max_size = dev->gso_max_size;
+	vlandev->gso_max_segs = dev->gso_max_segs;
 
 	if (vlan_hw_offload_capable(dev->features, vlan->vlan_proto))
 		vlandev->hard_header_len = dev->hard_header_len;
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index e416a4038a12..e7e62570bdb8 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -551,6 +551,7 @@ static int vlan_dev_init(struct net_device *dev)
 	dev->features |= real_dev->vlan_features | NETIF_F_LLTX |
 			 NETIF_F_GSO_SOFTWARE;
 	dev->gso_max_size = real_dev->gso_max_size;
+	dev->gso_max_segs = real_dev->gso_max_segs;
 	if (dev->features & NETIF_F_VLAN_FEATURES)
 		netdev_warn(real_dev, "VLAN features are set incorrectly.  Q-in-Q configurations may not work correctly.\n");
 
-- 
cgit v1.2.3-59-g8ed1b


From e316ea62e3203d524ff0239a40c56d3a39ad1b5c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 16 Mar 2016 22:52:15 -0700
Subject: tcp/dccp: remove obsolete WARN_ON() in icmp handlers

Now SYN_RECV request sockets are installed in ehash table, an ICMP
handler can find a request socket while another cpu handles an incoming
packet transforming this SYN_RECV request socket into an ESTABLISHED
socket.

We need to remove the now obsolete WARN_ON(req->sk), since req->sk
is set when a new child is created and added into listener accept queue.

If this race happens, the ICMP will do nothing special.

Fixes: 079096f103fa ("tcp/dccp: install syn_recv requests into ehash table")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Ben Lazarus <blazarus@google.com>
Reported-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/ipv4.c     | 2 --
 net/ipv4/tcp_ipv4.c | 2 --
 2 files changed, 4 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index b5672e5fe649..9c67a961ba53 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -204,8 +204,6 @@ void dccp_req_err(struct sock *sk, u64 seq)
 	 * ICMPs are not backlogged, hence we cannot get an established
 	 * socket here.
 	 */
-	WARN_ON(req->sk);
-
 	if (!between48(seq, dccp_rsk(req)->dreq_iss, dccp_rsk(req)->dreq_gss)) {
 		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 	} else {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 0b02ef773705..e7528b101e68 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -319,8 +319,6 @@ void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 	/* ICMPs are not backlogged, hence we cannot get
 	 * an established socket here.
 	 */
-	WARN_ON(req->sk);
-
 	if (seq != tcp_rsk(req)->snt_isn) {
 		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 	} else if (abort) {
-- 
cgit v1.2.3-59-g8ed1b


From b73f96fcb49ec90c2f837719893e7b25fcdf08d8 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Fri, 18 Mar 2016 23:27:28 +0800
Subject: net: dst_cache_per_cpu_dst_set() can be static

Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dst_cache.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c
index 3938f3f38d69..554d36449231 100644
--- a/net/core/dst_cache.c
+++ b/net/core/dst_cache.c
@@ -28,8 +28,8 @@ struct dst_cache_pcpu {
 	};
 };
 
-void dst_cache_per_cpu_dst_set(struct dst_cache_pcpu *dst_cache,
-			       struct dst_entry *dst, u32 cookie)
+static void dst_cache_per_cpu_dst_set(struct dst_cache_pcpu *dst_cache,
+				      struct dst_entry *dst, u32 cookie)
 {
 	dst_release(dst_cache->dst);
 	if (dst)
@@ -39,8 +39,8 @@ void dst_cache_per_cpu_dst_set(struct dst_cache_pcpu *dst_cache,
 	dst_cache->dst = dst;
 }
 
-struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache,
-					struct dst_cache_pcpu *idst)
+static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache,
+					       struct dst_cache_pcpu *idst)
 {
 	struct dst_entry *dst;
 
-- 
cgit v1.2.3-59-g8ed1b


From e014e8468552236f0f0cb64c7c341c1dce506070 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Sat, 19 Mar 2016 00:54:50 +0800
Subject: ovs: internal_set_rx_headroom() can be static

Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/vport-internal_dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index 83a5534abd31..7c8b90bf0e54 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -138,7 +138,7 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
 	return stats;
 }
 
-void internal_set_rx_headroom(struct net_device *dev, int new_hr)
+static void internal_set_rx_headroom(struct net_device *dev, int new_hr)
 {
 	dev->needed_headroom = new_hr;
 }
-- 
cgit v1.2.3-59-g8ed1b


From fe3a5f6c795810edb1646a840fec3c8c350c2a4e Mon Sep 17 00:00:00 2001
From: Simon Horman <simon.horman@netronome.com>
Date: Tue, 15 Mar 2016 12:00:35 +0900
Subject: openvswitch: allow output of MPLS packets on tunnel vports

Currently output of MPLS packets on tunnel vports is not allowed by Open
vSwitch. This is because historically encapsulation was done in such a way
that the inner_protocol field of the skb needed to hold the inner protocol
for both MPLS and tunnel encapsulation in order for GSO segmentation to be
performed correctly.

Since b2acd1dc3949 ("openvswitch: Use regular GRE net_device instead of
vport") Open vSwitch makes use of lwt to output to tunnel netdevs which
perform encapsulation. As no drivers expose support for MPLS offloads this
means that GSO packets are segmented in software by validate_xmit_skb(),
which is called from __dev_queue_xmit(), before tunnel encapsulation occurs.
This means that the inner protocol of MPLS is no longer needed by the time
encapsulation occurs and the contention on the inner_protocol field of the
skb no longer occurs.

Thus it is now safe to output MPLS to tunnel vports.

Signed-off-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Jesse Gross <jesse@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/flow_netlink.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 58b8efc23668..689c17264221 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -2044,9 +2044,6 @@ static int validate_set(const struct nlattr *a,
 		break;
 
 	case OVS_KEY_ATTR_TUNNEL:
-		if (eth_p_mpls(eth_type))
-			return -EINVAL;
-
 		if (masked)
 			return -EINVAL; /* Masked tunnel set not supported. */
 
-- 
cgit v1.2.3-59-g8ed1b


From 3a461da1d03e7a857edfa6a002040d07e118c639 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 15 Mar 2016 22:41:22 +0100
Subject: cls_bpf: reset class and reuse major in da

There are two issues with the current code. First one is that we need
to set res->class to 0 in case we use non-default classid matching.

This is important for the case where cls_bpf was initially set up with
an optional binding to a default class with tcf_bind_filter(), where
the underlying qdisc implements bind_tcf() that fills res->class and
tests for it later on when doing the classification. Convention for
these cases is that after tc_classify() was called, such qdiscs (atm,
drr, qfq, cbq, hfsc, htb) first test class, and if 0, then they lookup
based on classid.

Second, there's a bug with da mode, where res->classid is only assigned
a 16 bit minor, but it needs to expand to the full 32 bit major/minor
combination instead, therefore we need to expand with the bound major.
This is fine as classes belonging to a classful qdisc must share the
same major.

Fixes: 045efa82ff56 ("cls_bpf: introduce integrated actions")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_bpf.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 8dc84300ee79..425fe6a0eda3 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -103,8 +103,9 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 		}
 
 		if (prog->exts_integrated) {
-			res->class = prog->res.class;
-			res->classid = qdisc_skb_cb(skb)->tc_classid;
+			res->class   = 0;
+			res->classid = TC_H_MAJ(prog->res.classid) |
+				       qdisc_skb_cb(skb)->tc_classid;
 
 			ret = cls_bpf_exec_opcode(filter_res);
 			if (ret == TC_ACT_UNSPEC)
@@ -114,10 +115,12 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 
 		if (filter_res == 0)
 			continue;
-
-		*res = prog->res;
-		if (filter_res != -1)
+		if (filter_res != -1) {
+			res->class   = 0;
 			res->classid = filter_res;
+		} else {
+			*res = prog->res;
+		}
 
 		ret = tcf_exts_exec(skb, &prog->exts, res);
 		if (ret < 0)
-- 
cgit v1.2.3-59-g8ed1b


From 09c37a2c5bbc28b5fbc07a01db4bccdbd0a5d8a2 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 16 Mar 2016 01:42:49 +0100
Subject: bpf: make skb->tc_classid also readable

Currently, the tc_classid from eBPF skb context is write-only, but there's
no good reason for tc programs to limit it to write-only. For example,
it can be used to transfer its state via tail calls where the resulting
tc_classid gets filled gradually.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/filter.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index 6fc3893a6170..69c7b2fecf44 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2069,16 +2069,14 @@ static bool sk_filter_is_valid_access(int off, int size,
 static bool tc_cls_act_is_valid_access(int off, int size,
 				       enum bpf_access_type type)
 {
-	if (off == offsetof(struct __sk_buff, tc_classid))
-		return type == BPF_WRITE ? true : false;
-
 	if (type == BPF_WRITE) {
 		switch (off) {
 		case offsetof(struct __sk_buff, mark):
 		case offsetof(struct __sk_buff, tc_index):
 		case offsetof(struct __sk_buff, priority):
 		case offsetof(struct __sk_buff, cb[0]) ...
-			offsetof(struct __sk_buff, cb[4]):
+		     offsetof(struct __sk_buff, cb[4]):
+		case offsetof(struct __sk_buff, tc_classid):
 			break;
 		default:
 			return false;
@@ -2195,8 +2193,10 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 		ctx_off -= offsetof(struct __sk_buff, tc_classid);
 		ctx_off += offsetof(struct sk_buff, cb);
 		ctx_off += offsetof(struct qdisc_skb_cb, tc_classid);
-		WARN_ON(type != BPF_WRITE);
-		*insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
+		if (type == BPF_WRITE)
+			*insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
+		else
+			*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
 		break;
 
 	case offsetof(struct __sk_buff, tc_index):
-- 
cgit v1.2.3-59-g8ed1b


From 808c1b697c3c4dd2a7132882424c390b0d0acfb9 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 16 Mar 2016 01:42:50 +0100
Subject: bpf, dst: add and use dst_tclassid helper

We can just add a small helper dst_tclassid() for retrieving the
dst->tclassid value. It makes the code a bit better in that we can
get rid of the ifdef from filter.c by moving this into the header.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst.h | 12 ++++++++++++
 net/core/filter.c |  9 +--------
 2 files changed, 13 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/include/net/dst.h b/include/net/dst.h
index c7329dcd90cc..5c98443c1c9e 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -398,6 +398,18 @@ static inline void skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev,
 	__skb_tunnel_rx(skb, dev, net);
 }
 
+static inline u32 dst_tclassid(const struct sk_buff *skb)
+{
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	const struct dst_entry *dst;
+
+	dst = skb_dst(skb);
+	if (dst)
+		return dst->tclassid;
+#endif
+	return 0;
+}
+
 int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
 static inline int dst_discard(struct sk_buff *skb)
 {
diff --git a/net/core/filter.c b/net/core/filter.c
index 69c7b2fecf44..4c35d8325c34 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1682,14 +1682,7 @@ static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
 
 static u64 bpf_get_route_realm(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 {
-#ifdef CONFIG_IP_ROUTE_CLASSID
-	const struct dst_entry *dst;
-
-	dst = skb_dst((struct sk_buff *) (unsigned long) r1);
-	if (dst)
-		return dst->tclassid;
-#endif
-	return 0;
+	return dst_tclassid((struct sk_buff *) (unsigned long) r1);
 }
 
 static const struct bpf_func_proto bpf_get_route_realm_proto = {
-- 
cgit v1.2.3-59-g8ed1b


From fca5fdf67de9e092fda23c9eb059ba968e7b5267 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 16 Mar 2016 01:42:51 +0100
Subject: ip_tunnels, bpf: define IP_TUNNEL_OPTS_MAX and use it

eBPF defines this as BPF_TUNLEN_MAX and OVS just uses the hard-coded
value inside struct sw_flow_key. Thus, add and use IP_TUNNEL_OPTS_MAX
for this, which makes the code a bit more generic and allows to remove
BPF_TUNLEN_MAX from eBPF code.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_tunnels.h  | 7 +++++++
 net/core/filter.c         | 9 ++-------
 net/ipv4/ip_tunnel_core.c | 6 ++++++
 net/openvswitch/flow.h    | 2 +-
 4 files changed, 16 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 5dc2e454f866..c35dda9ec991 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -7,6 +7,8 @@
 #include <linux/socket.h>
 #include <linux/types.h>
 #include <linux/u64_stats_sync.h>
+#include <linux/bitops.h>
+
 #include <net/dsfield.h>
 #include <net/gro_cells.h>
 #include <net/inet_ecn.h>
@@ -57,6 +59,11 @@ struct ip_tunnel_key {
 #define IP_TUNNEL_INFO_TX	0x01	/* represents tx tunnel parameters */
 #define IP_TUNNEL_INFO_IPV6	0x02	/* key contains IPv6 addresses */
 
+/* Maximum tunnel options length. */
+#define IP_TUNNEL_OPTS_MAX					\
+	GENMASK((FIELD_SIZEOF(struct ip_tunnel_info,		\
+			      options_len) * BITS_PER_BYTE) - 1, 0)
+
 struct ip_tunnel_info {
 	struct ip_tunnel_key	key;
 #ifdef CONFIG_DST_CACHE
diff --git a/net/core/filter.c b/net/core/filter.c
index 4c35d8325c34..b7177d01ecb0 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1904,8 +1904,6 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
 	.arg4_type	= ARG_ANYTHING,
 };
 
-#define BPF_TUNLEN_MAX	255
-
 static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
 {
 	struct sk_buff *skb = (struct sk_buff *) (long) r1;
@@ -1915,7 +1913,7 @@ static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
 
 	if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1))))
 		return -EINVAL;
-	if (unlikely(size > BPF_TUNLEN_MAX))
+	if (unlikely(size > IP_TUNNEL_OPTS_MAX))
 		return -ENOMEM;
 
 	ip_tunnel_info_opts_set(info, from, size);
@@ -1936,13 +1934,10 @@ static const struct bpf_func_proto *
 bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
 {
 	if (!md_dst) {
-		BUILD_BUG_ON(FIELD_SIZEOF(struct ip_tunnel_info,
-					  options_len) != 1);
-
 		/* Race is not possible, since it's called from verifier
 		 * that is holding verifier mutex.
 		 */
-		md_dst = metadata_dst_alloc_percpu(BPF_TUNLEN_MAX,
+		md_dst = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX,
 						   GFP_KERNEL);
 		if (!md_dst)
 			return NULL;
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index eaca2449a09a..d27276f6f8dd 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -398,6 +398,12 @@ static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = {
 
 void __init ip_tunnel_core_init(void)
 {
+	/* If you land here, make sure whether increasing ip_tunnel_info's
+	 * options_len is a reasonable choice with its usage in front ends
+	 * (f.e., it's part of flow keys, etc).
+	 */
+	BUILD_BUG_ON(IP_TUNNEL_OPTS_MAX != 255);
+
 	lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP);
 	lwtunnel_encap_add_ops(&ip6_tun_lwt_ops, LWTUNNEL_ENCAP_IP6);
 }
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index 1d055c559eaf..03378e75a67c 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -55,7 +55,7 @@ struct ovs_tunnel_info {
 	FIELD_SIZEOF(struct sw_flow_key, recirc_id))
 
 struct sw_flow_key {
-	u8 tun_opts[255];
+	u8 tun_opts[IP_TUNNEL_OPTS_MAX];
 	u8 tun_opts_len;
 	struct ip_tunnel_key tun_key;	/* Encapsulating tunnel key. */
 	struct {
-- 
cgit v1.2.3-59-g8ed1b


From c6a58ffed53612be86b758df1cdb0b0f4305e9cb Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Wed, 16 Mar 2016 11:38:12 -0700
Subject: RDS: TCP: Add sysctl tunables for sndbuf/rcvbuf on rds-tcp socket

Add per-net sysctl tunables to set the size of sndbuf and
rcvbuf on the kernel tcp socket.

The tunables are added at /proc/sys/net/rds/tcp/rds_tcp_sndbuf
and /proc/sys/net/rds/tcp/rds_tcp_rcvbuf.

These values must be set before accept() or connect(),
and there may be an arbitrary number of existing rds-tcp
sockets when the tunable is modified. To make sure that all
connections in the netns pick up the same value for the tunable,
we reset existing rds-tcp connections in the netns, so that
they can reconnect with the new parameters.

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/tcp.c | 145 ++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 135 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index ad60299b088b..380278540465 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -54,6 +54,35 @@ static struct kmem_cache *rds_tcp_conn_slab;
 
 #define RDS_TCP_DEFAULT_BUFSIZE (128 * 1024)
 
+static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write,
+				 void __user *buffer, size_t *lenp,
+				 loff_t *fpos);
+
+int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF;
+int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF;
+
+static struct ctl_table rds_tcp_sysctl_table[] = {
+#define	RDS_TCP_SNDBUF	0
+	{
+		.procname       = "rds_tcp_sndbuf",
+		/* data is per-net pointer */
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = rds_tcp_skbuf_handler,
+		.extra1		= &rds_tcp_min_sndbuf,
+	},
+#define	RDS_TCP_RCVBUF	1
+	{
+		.procname       = "rds_tcp_rcvbuf",
+		/* data is per-net pointer */
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = rds_tcp_skbuf_handler,
+		.extra1		= &rds_tcp_min_rcvbuf,
+	},
+	{ }
+};
+
 /* doing it this way avoids calling tcp_sk() */
 void rds_tcp_nonagle(struct socket *sock)
 {
@@ -66,15 +95,6 @@ void rds_tcp_nonagle(struct socket *sock)
 	set_fs(oldfs);
 }
 
-/* All module specific customizations to the RDS-TCP socket should be done in
- * rds_tcp_tune() and applied after socket creation. In general these
- * customizations should be tunable via module_param()
- */
-void rds_tcp_tune(struct socket *sock)
-{
-	rds_tcp_nonagle(sock);
-}
-
 u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc)
 {
 	return tcp_sk(tc->t_sock->sk)->snd_nxt;
@@ -272,8 +292,34 @@ static int rds_tcp_netid;
 struct rds_tcp_net {
 	struct socket *rds_tcp_listen_sock;
 	struct work_struct rds_tcp_accept_w;
+	struct ctl_table_header *rds_tcp_sysctl;
+	struct ctl_table *ctl_table;
+	int sndbuf_size;
+	int rcvbuf_size;
 };
 
+/* All module specific customizations to the RDS-TCP socket should be done in
+ * rds_tcp_tune() and applied after socket creation.
+ */
+void rds_tcp_tune(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct net *net = sock_net(sk);
+	struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+
+	rds_tcp_nonagle(sock);
+	lock_sock(sk);
+	if (rtn->sndbuf_size > 0) {
+		sk->sk_sndbuf = rtn->sndbuf_size;
+		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+	}
+	if (rtn->rcvbuf_size > 0) {
+		sk->sk_sndbuf = rtn->rcvbuf_size;
+		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+	}
+	release_sock(sk);
+}
+
 static void rds_tcp_accept_worker(struct work_struct *work)
 {
 	struct rds_tcp_net *rtn = container_of(work,
@@ -295,20 +341,60 @@ void rds_tcp_accept_work(struct sock *sk)
 static __net_init int rds_tcp_init_net(struct net *net)
 {
 	struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+	struct ctl_table *tbl;
+	int err = 0;
 
+	memset(rtn, 0, sizeof(*rtn));
+
+	/* {snd, rcv}buf_size default to 0, which implies we let the
+	 * stack pick the value, and permit auto-tuning of buffer size.
+	 */
+	if (net == &init_net) {
+		tbl = rds_tcp_sysctl_table;
+	} else {
+		tbl = kmemdup(rds_tcp_sysctl_table,
+			      sizeof(rds_tcp_sysctl_table), GFP_KERNEL);
+		if (!tbl) {
+			pr_warn("could not set allocate syctl table\n");
+			return -ENOMEM;
+		}
+		rtn->ctl_table = tbl;
+	}
+	tbl[RDS_TCP_SNDBUF].data = &rtn->sndbuf_size;
+	tbl[RDS_TCP_RCVBUF].data = &rtn->rcvbuf_size;
+	rtn->rds_tcp_sysctl = register_net_sysctl(net, "net/rds/tcp", tbl);
+	if (!rtn->rds_tcp_sysctl) {
+		pr_warn("could not register sysctl\n");
+		err = -ENOMEM;
+		goto fail;
+	}
 	rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net);
 	if (!rtn->rds_tcp_listen_sock) {
 		pr_warn("could not set up listen sock\n");
-		return -EAFNOSUPPORT;
+		unregister_net_sysctl_table(rtn->rds_tcp_sysctl);
+		rtn->rds_tcp_sysctl = NULL;
+		err = -EAFNOSUPPORT;
+		goto fail;
 	}
 	INIT_WORK(&rtn->rds_tcp_accept_w, rds_tcp_accept_worker);
 	return 0;
+
+fail:
+	if (net != &init_net)
+		kfree(tbl);
+	return err;
 }
 
 static void __net_exit rds_tcp_exit_net(struct net *net)
 {
 	struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
 
+	if (rtn->rds_tcp_sysctl)
+		unregister_net_sysctl_table(rtn->rds_tcp_sysctl);
+
+	if (net != &init_net && rtn->ctl_table)
+		kfree(rtn->ctl_table);
+
 	/* If rds_tcp_exit_net() is called as a result of netns deletion,
 	 * the rds_tcp_kill_sock() device notifier would already have cleaned
 	 * up the listen socket, thus there is no work to do in this function.
@@ -383,6 +469,45 @@ static struct notifier_block rds_tcp_dev_notifier = {
 	.priority = -10, /* must be called after other network notifiers */
 };
 
+/* when sysctl is used to modify some kernel socket parameters,this
+ * function  resets the RDS connections in that netns  so that we can
+ * restart with new parameters.  The assumption is that such reset
+ * events are few and far-between.
+ */
+static void rds_tcp_sysctl_reset(struct net *net)
+{
+	struct rds_tcp_connection *tc, *_tc;
+
+	spin_lock_irq(&rds_tcp_conn_lock);
+	list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
+		struct net *c_net = read_pnet(&tc->conn->c_net);
+
+		if (net != c_net || !tc->t_sock)
+			continue;
+
+		rds_conn_drop(tc->conn); /* reconnect with new parameters */
+	}
+	spin_unlock_irq(&rds_tcp_conn_lock);
+}
+
+static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write,
+				 void __user *buffer, size_t *lenp,
+				 loff_t *fpos)
+{
+	struct net *net = current->nsproxy->net_ns;
+	int err;
+
+	err = proc_dointvec_minmax(ctl, write, buffer, lenp, fpos);
+	if (err < 0) {
+		pr_warn("Invalid input. Must be >= %d\n",
+			*(int *)(ctl->extra1));
+		return err;
+	}
+	if (write)
+		rds_tcp_sysctl_reset(net);
+	return 0;
+}
+
 static void rds_tcp_exit(void)
 {
 	rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
-- 
cgit v1.2.3-59-g8ed1b


From a3382e408b645b4f68ec01f9c048e356c62598fb Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Wed, 16 Mar 2016 11:38:13 -0700
Subject: RDS: TCP: Remove unused constant

RDS_TCP_DEFAULT_BUFSIZE has been unused since commit 1edd6a14d24f
("RDS-TCP: Do not bloat sndbuf/rcvbuf in rds_tcp_tune").

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/tcp.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 380278540465..61ed2a8764ba 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -52,8 +52,6 @@ static LIST_HEAD(rds_tcp_conn_list);
 
 static struct kmem_cache *rds_tcp_conn_slab;
 
-#define RDS_TCP_DEFAULT_BUFSIZE (128 * 1024)
-
 static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write,
 				 void __user *buffer, size_t *lenp,
 				 loff_t *fpos);
-- 
cgit v1.2.3-59-g8ed1b