From 4bdafb9ddfa4b3d970e2194d00e1c6d5002f513f Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Wed, 26 Sep 2018 21:12:22 +0100
Subject: drm/i915: Remove i915.enable_ppgtt override

Now that we are confident in providing full-ppgtt where supported,
remove the ability to override the context isolation.

v2: Remove faked aliasing-ppgtt for testing as it no longer is accepted.
v3: s/USES/HAS/ to match usage and reject attempts to load the module on
old GVT-g setups that do not provide support for full-ppgtt.
v4: Insulate ABI ppGTT values from our internal enum (later plans
involve moving ppGTT depth out of the enum, thus potentially breaking
ABI unless we document the current values).

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Matthew Auld <matthew.auld@intel.com>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Acked-by: Zhi Wang <zhi.a.wang@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180926201222.5643-1-chris@chris-wilson.co.uk
---
 include/uapi/drm/i915_drm.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index a4446f452040..298b2e197744 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -412,6 +412,14 @@ typedef struct drm_i915_irq_wait {
 	int irq_seq;
 } drm_i915_irq_wait_t;
 
+/*
+ * Different modes of per-process Graphics Translation Table,
+ * see I915_PARAM_HAS_ALIASING_PPGTT
+ */
+#define I915_GEM_PPGTT_NONE	0
+#define I915_GEM_PPGTT_ALIASING	1
+#define I915_GEM_PPGTT_FULL	2
+
 /* Ioctl to query kernel params:
  */
 #define I915_PARAM_IRQ_ACTIVE            1
-- 
cgit v1.2.3-59-g8ed1b


From 4fa825bf405082a342b3a8717337ba58c7260c3f Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 28 Sep 2018 16:21:25 -0700
Subject: drm/v3d: Add some better documentation of the in_sync arguments.

Since this is UAPI, it's good to document what exactly the guarantees
we're providing are.

Signed-off-by: Eric Anholt <eric@anholt.net>
Link: https://patchwork.freedesktop.org/patch/msgid/20180928232126.4332-3-eric@anholt.net
Reviewed-by: Boris Brezillon <boris.brezillon@bootlin.com>
---
 include/uapi/drm/v3d_drm.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/drm/v3d_drm.h b/include/uapi/drm/v3d_drm.h
index 7b6627783608..f446656d00b1 100644
--- a/include/uapi/drm/v3d_drm.h
+++ b/include/uapi/drm/v3d_drm.h
@@ -58,6 +58,11 @@ struct drm_v3d_submit_cl {
 	 * coordinate shader to determine where primitives land on the screen,
 	 * then writes out the state updates and draw calls necessary per tile
 	 * to the tile allocation BO.
+	 *
+	 * This BCL will block on any previous BCL submitted on the
+	 * same FD, but not on any RCL or BCLs submitted by other
+	 * clients -- that is left up to the submitter to control
+	 * using in_sync_bcl if necessary.
 	 */
 	__u32 bcl_start;
 
@@ -69,6 +74,11 @@ struct drm_v3d_submit_cl {
 	 * This is the second set of commands executed, which will either
 	 * execute the tiles that have been set up by the BCL, or a fixed set
 	 * of tiles (in the case of RCL-only blits).
+	 *
+	 * This RCL will block on this submit's BCL, and any previous
+	 * RCL submitted on the same FD, but not on any RCL or BCLs
+	 * submitted by other clients -- that is left up to the
+	 * submitter to control using in_sync_rcl if necessary.
 	 */
 	__u32 rcl_start;
 
-- 
cgit v1.2.3-59-g8ed1b


From 48197bc564c7a1888c86024a1ba4f956e0ec2300 Mon Sep 17 00:00:00 2001
From: Chunming Zhou <david1.zhou@amd.com>
Date: Thu, 18 Oct 2018 14:18:36 +0800
Subject: drm: add syncobj timeline support v9
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch is for VK_KHR_timeline_semaphore extension, semaphore is called syncobj in kernel side:
This extension introduces a new type of syncobj that has an integer payload
identifying a point in a timeline. Such timeline syncobjs support the
following operations:
   * CPU query - A host operation that allows querying the payload of the
     timeline syncobj.
   * CPU wait - A host operation that allows a blocking wait for a
     timeline syncobj to reach a specified value.
   * Device wait - A device operation that allows waiting for a
     timeline syncobj to reach a specified value.
   * Device signal - A device operation that allows advancing the
     timeline syncobj to a specified value.

v1:
Since it's a timeline, that means the front time point(PT) always is signaled before the late PT.
a. signal PT design:
Signal PT fence N depends on PT[N-1] fence and signal opertion fence, when PT[N] fence is signaled,
the timeline will increase to value of PT[N].
b. wait PT design:
Wait PT fence is signaled by reaching timeline point value, when timeline is increasing, will compare
wait PTs value with new timeline value, if PT value is lower than timeline value, then wait PT will be
signaled, otherwise keep in list. syncobj wait operation can wait on any point of timeline,
so need a RB tree to order them. And wait PT could ahead of signal PT, we need a sumission fence to
perform that.

v2:
1. remove unused DRM_SYNCOBJ_CREATE_TYPE_NORMAL. (Christian)
2. move unexposed denitions to .c file. (Daniel Vetter)
3. split up the change to drm_syncobj_find_fence() in a separate patch. (Christian)
4. split up the change to drm_syncobj_replace_fence() in a separate patch.
5. drop the submission_fence implementation and instead use wait_event() for that. (Christian)
6. WARN_ON(point != 0) for NORMAL type syncobj case. (Daniel Vetter)

v3:
1. replace normal syncobj with timeline implemenation. (Vetter and Christian)
    a. normal syncobj signal op will create a signal PT to tail of signal pt list.
    b. normal syncobj wait op will create a wait pt with last signal point, and this wait PT is only signaled by related signal point PT.
2. many bug fix and clean up
3. stub fence moving is moved to other patch.

v4：
1. fix RB tree loop with while(node=rb_first(...)). (Christian)
2. fix syncobj lifecycle. (Christian)
3. only enable_signaling when there is wait_pt. (Christian)
4. fix timeline path issues.
5. write a timeline test in libdrm

v5: (Christian)
1. semaphore is called syncobj in kernel side.
2. don't need 'timeline' characters in some function name.
3. keep syncobj cb.

v6: (Christian)
1. merge syncobj_timeline to syncobj structure.
2. simplify some check sentences.
3. some misc change.
4. fix CTS failed issue.

v7: (Christian)
1. error handling when creating signal pt.
2. remove timeline naming in func.
3. export flags in find_fence.
4. allow reset timeline.

v8:
1. use wait_event_interruptible without timeout
2. rename _TYPE_INDIVIDUAL to _TYPE_BINARY

v9:
1. rename signal_pt->base to signal_pt->fence_array to avoid misleading
2. improve kerneldoc

individual syncobj is tested by ./deqp-vk -n dEQP-VK*semaphore*
timeline syncobj is tested by ./amdgpu_test -s 9

Signed-off-by: Chunming Zhou <david1.zhou@amd.com>
Signed-off-by: Christian König <christian.koenig@amd.com>
Cc: Christian Konig <christian.koenig@amd.com>
Cc: Dave Airlie <airlied@redhat.com>
Cc: Daniel Rakos <Daniel.Rakos@amd.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Cc: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Christian König <christian.koenig@amd.com>
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/257258/
---
 drivers/gpu/drm/drm_syncobj.c              | 306 +++++++++++++++++++++++++----
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |   2 +-
 include/drm/drm_syncobj.h                  |  67 +++----
 include/uapi/drm/drm.h                     |   1 +
 4 files changed, 301 insertions(+), 75 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c
index e2c5b3ca4824..57bf6006394d 100644
--- a/drivers/gpu/drm/drm_syncobj.c
+++ b/drivers/gpu/drm/drm_syncobj.c
@@ -56,6 +56,9 @@
 #include "drm_internal.h"
 #include <drm/drm_syncobj.h>
 
+/* merge normal syncobj to timeline syncobj, the point interval is 1 */
+#define DRM_SYNCOBJ_BINARY_POINT 1
+
 struct drm_syncobj_stub_fence {
 	struct dma_fence base;
 	spinlock_t lock;
@@ -71,6 +74,11 @@ static const struct dma_fence_ops drm_syncobj_stub_fence_ops = {
 	.get_timeline_name = drm_syncobj_stub_fence_get_name,
 };
 
+struct drm_syncobj_signal_pt {
+	struct dma_fence_array *fence_array;
+	u64    value;
+	struct list_head list;
+};
 
 /**
  * drm_syncobj_find - lookup and reference a sync object.
@@ -113,8 +121,8 @@ static int drm_syncobj_fence_get_or_add_callback(struct drm_syncobj *syncobj,
 {
 	int ret;
 
-	*fence = drm_syncobj_fence_get(syncobj);
-	if (*fence)
+	ret = drm_syncobj_search_fence(syncobj, 0, 0, fence);
+	if (!ret)
 		return 1;
 
 	spin_lock(&syncobj->lock);
@@ -122,10 +130,12 @@ static int drm_syncobj_fence_get_or_add_callback(struct drm_syncobj *syncobj,
 	 * have the lock, try one more time just to be sure we don't add a
 	 * callback when a fence has already been set.
 	 */
-	if (syncobj->fence) {
-		*fence = dma_fence_get(rcu_dereference_protected(syncobj->fence,
-								 lockdep_is_held(&syncobj->lock)));
-		ret = 1;
+	if (!list_empty(&syncobj->signal_pt_list)) {
+		spin_unlock(&syncobj->lock);
+		drm_syncobj_search_fence(syncobj, 0, 0, fence);
+		if (*fence)
+			return 1;
+		spin_lock(&syncobj->lock);
 	} else {
 		*fence = NULL;
 		drm_syncobj_add_callback_locked(syncobj, cb, func);
@@ -153,6 +163,160 @@ void drm_syncobj_remove_callback(struct drm_syncobj *syncobj,
 	spin_unlock(&syncobj->lock);
 }
 
+static void drm_syncobj_init(struct drm_syncobj *syncobj)
+{
+	spin_lock(&syncobj->lock);
+	syncobj->timeline_context = dma_fence_context_alloc(1);
+	syncobj->timeline = 0;
+	syncobj->signal_point = 0;
+	init_waitqueue_head(&syncobj->wq);
+
+	INIT_LIST_HEAD(&syncobj->signal_pt_list);
+	spin_unlock(&syncobj->lock);
+}
+
+static void drm_syncobj_fini(struct drm_syncobj *syncobj)
+{
+	struct drm_syncobj_signal_pt *signal_pt = NULL, *tmp;
+
+	spin_lock(&syncobj->lock);
+	list_for_each_entry_safe(signal_pt, tmp,
+				 &syncobj->signal_pt_list, list) {
+		list_del(&signal_pt->list);
+		dma_fence_put(&signal_pt->fence_array->base);
+		kfree(signal_pt);
+	}
+	spin_unlock(&syncobj->lock);
+}
+
+static struct dma_fence
+*drm_syncobj_find_signal_pt_for_point(struct drm_syncobj *syncobj,
+				      uint64_t point)
+{
+	struct drm_syncobj_signal_pt *signal_pt;
+
+	if ((syncobj->type == DRM_SYNCOBJ_TYPE_TIMELINE) &&
+	    (point <= syncobj->timeline)) {
+		struct drm_syncobj_stub_fence *fence =
+			kzalloc(sizeof(struct drm_syncobj_stub_fence),
+				GFP_KERNEL);
+
+		if (!fence)
+			return NULL;
+		spin_lock_init(&fence->lock);
+		dma_fence_init(&fence->base,
+			       &drm_syncobj_stub_fence_ops,
+			       &fence->lock,
+			       syncobj->timeline_context,
+			       point);
+
+		dma_fence_signal(&fence->base);
+		return &fence->base;
+	}
+
+	list_for_each_entry(signal_pt, &syncobj->signal_pt_list, list) {
+		if (point > signal_pt->value)
+			continue;
+		if ((syncobj->type == DRM_SYNCOBJ_TYPE_BINARY) &&
+		    (point != signal_pt->value))
+			continue;
+		return dma_fence_get(&signal_pt->fence_array->base);
+	}
+	return NULL;
+}
+
+static int drm_syncobj_create_signal_pt(struct drm_syncobj *syncobj,
+					struct dma_fence *fence,
+					u64 point)
+{
+	struct drm_syncobj_signal_pt *signal_pt =
+		kzalloc(sizeof(struct drm_syncobj_signal_pt), GFP_KERNEL);
+	struct drm_syncobj_signal_pt *tail_pt;
+	struct dma_fence **fences;
+	int num_fences = 0;
+	int ret = 0, i;
+
+	if (!signal_pt)
+		return -ENOMEM;
+	if (!fence)
+		goto out;
+
+	fences = kmalloc_array(sizeof(void *), 2, GFP_KERNEL);
+	if (!fences) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	fences[num_fences++] = dma_fence_get(fence);
+	/* timeline syncobj must take this dependency */
+	if (syncobj->type == DRM_SYNCOBJ_TYPE_TIMELINE) {
+		spin_lock(&syncobj->lock);
+		if (!list_empty(&syncobj->signal_pt_list)) {
+			tail_pt = list_last_entry(&syncobj->signal_pt_list,
+						  struct drm_syncobj_signal_pt, list);
+			fences[num_fences++] =
+				dma_fence_get(&tail_pt->fence_array->base);
+		}
+		spin_unlock(&syncobj->lock);
+	}
+	signal_pt->fence_array = dma_fence_array_create(num_fences, fences,
+							syncobj->timeline_context,
+							point, false);
+	if (!signal_pt->fence_array) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	spin_lock(&syncobj->lock);
+	if (syncobj->signal_point >= point) {
+		DRM_WARN("A later signal is ready!");
+		spin_unlock(&syncobj->lock);
+		goto exist;
+	}
+	signal_pt->value = point;
+	list_add_tail(&signal_pt->list, &syncobj->signal_pt_list);
+	syncobj->signal_point = point;
+	spin_unlock(&syncobj->lock);
+	wake_up_all(&syncobj->wq);
+
+	return 0;
+exist:
+	dma_fence_put(&signal_pt->fence_array->base);
+fail:
+	for (i = 0; i < num_fences; i++)
+		dma_fence_put(fences[i]);
+	kfree(fences);
+out:
+	kfree(signal_pt);
+	return ret;
+}
+
+static void drm_syncobj_garbage_collection(struct drm_syncobj *syncobj)
+{
+	struct drm_syncobj_signal_pt *signal_pt, *tmp, *tail_pt;
+
+	spin_lock(&syncobj->lock);
+	tail_pt = list_last_entry(&syncobj->signal_pt_list,
+				  struct drm_syncobj_signal_pt,
+				  list);
+	list_for_each_entry_safe(signal_pt, tmp,
+				 &syncobj->signal_pt_list, list) {
+		if (syncobj->type == DRM_SYNCOBJ_TYPE_BINARY &&
+		    signal_pt == tail_pt)
+			continue;
+		if (dma_fence_is_signaled(&signal_pt->fence_array->base)) {
+			syncobj->timeline = signal_pt->value;
+			list_del(&signal_pt->list);
+			dma_fence_put(&signal_pt->fence_array->base);
+			kfree(signal_pt);
+		} else {
+			/*signal_pt is in order in list, from small to big, so
+			 * the later must not be signal either */
+			break;
+		}
+	}
+
+	spin_unlock(&syncobj->lock);
+}
 /**
  * drm_syncobj_replace_fence - replace fence in a sync object.
  * @syncobj: Sync object to replace fence in
@@ -165,28 +329,29 @@ void drm_syncobj_replace_fence(struct drm_syncobj *syncobj,
 			       u64 point,
 			       struct dma_fence *fence)
 {
-	struct dma_fence *old_fence;
-	struct drm_syncobj_cb *cur, *tmp;
-
-	if (fence)
-		dma_fence_get(fence);
-
-	spin_lock(&syncobj->lock);
-
-	old_fence = rcu_dereference_protected(syncobj->fence,
-					      lockdep_is_held(&syncobj->lock));
-	rcu_assign_pointer(syncobj->fence, fence);
+	u64 pt_value = point;
+
+	drm_syncobj_garbage_collection(syncobj);
+	if (syncobj->type == DRM_SYNCOBJ_TYPE_BINARY) {
+		if (!fence) {
+			drm_syncobj_fini(syncobj);
+			drm_syncobj_init(syncobj);
+			return;
+		}
+		pt_value = syncobj->signal_point +
+			DRM_SYNCOBJ_BINARY_POINT;
+	}
+	drm_syncobj_create_signal_pt(syncobj, fence, pt_value);
+	if (fence) {
+		struct drm_syncobj_cb *cur, *tmp;
 
-	if (fence != old_fence) {
+		spin_lock(&syncobj->lock);
 		list_for_each_entry_safe(cur, tmp, &syncobj->cb_list, node) {
 			list_del_init(&cur->node);
 			cur->func(syncobj, cur);
 		}
+		spin_unlock(&syncobj->lock);
 	}
-
-	spin_unlock(&syncobj->lock);
-
-	dma_fence_put(old_fence);
 }
 EXPORT_SYMBOL(drm_syncobj_replace_fence);
 
@@ -209,6 +374,64 @@ static int drm_syncobj_assign_null_handle(struct drm_syncobj *syncobj)
 	return 0;
 }
 
+static int
+drm_syncobj_point_get(struct drm_syncobj *syncobj, u64 point, u64 flags,
+		      struct dma_fence **fence)
+{
+	int ret = 0;
+
+	if (flags & DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT) {
+		ret = wait_event_interruptible(syncobj->wq,
+					       point <= syncobj->signal_point);
+		if (ret < 0)
+			return ret;
+	}
+	spin_lock(&syncobj->lock);
+	*fence = drm_syncobj_find_signal_pt_for_point(syncobj, point);
+	if (!*fence)
+		ret = -EINVAL;
+	spin_unlock(&syncobj->lock);
+	return ret;
+}
+
+/**
+ * drm_syncobj_search_fence - lookup and reference the fence in a sync object or
+ * in a timeline point
+ * @syncobj: sync object pointer
+ * @point: timeline point
+ * @flags: DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT or not
+ * @fence: out parameter for the fence
+ *
+ * if flags is DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT, the function will block
+ * here until specific timeline points is reached.
+ * if not, you need a submit thread and block in userspace until all future
+ * timeline points have materialized, only then you can submit to the kernel,
+ * otherwise, function will fail to return fence.
+ *
+ * Returns 0 on success or a negative error value on failure. On success @fence
+ * contains a reference to the fence, which must be released by calling
+ * dma_fence_put().
+ */
+int drm_syncobj_search_fence(struct drm_syncobj *syncobj, u64 point,
+			     u64 flags, struct dma_fence **fence)
+{
+	u64 pt_value = point;
+
+	if (!syncobj)
+		return -ENOENT;
+
+	drm_syncobj_garbage_collection(syncobj);
+	if (syncobj->type == DRM_SYNCOBJ_TYPE_BINARY) {
+		/*BINARY syncobj always wait on last pt */
+		pt_value = syncobj->signal_point;
+
+		if (pt_value == 0)
+			pt_value += DRM_SYNCOBJ_BINARY_POINT;
+	}
+	return drm_syncobj_point_get(syncobj, pt_value, flags, fence);
+}
+EXPORT_SYMBOL(drm_syncobj_search_fence);
+
 /**
  * drm_syncobj_find_fence - lookup and reference the fence in a sync object
  * @file_private: drm file private pointer
@@ -218,7 +441,7 @@ static int drm_syncobj_assign_null_handle(struct drm_syncobj *syncobj)
  * @fence: out parameter for the fence
  *
  * This is just a convenience function that combines drm_syncobj_find() and
- * drm_syncobj_fence_get().
+ * drm_syncobj_lookup_fence().
  *
  * Returns 0 on success or a negative error value on failure. On success @fence
  * contains a reference to the fence, which must be released by calling
@@ -229,15 +452,9 @@ int drm_syncobj_find_fence(struct drm_file *file_private,
 			   struct dma_fence **fence)
 {
 	struct drm_syncobj *syncobj = drm_syncobj_find(file_private, handle);
-	int ret = 0;
-
-	if (!syncobj)
-		return -ENOENT;
+	int ret;
 
-	*fence = drm_syncobj_fence_get(syncobj);
-	if (!*fence) {
-		ret = -EINVAL;
-	}
+	ret = drm_syncobj_search_fence(syncobj, point, flags, fence);
 	drm_syncobj_put(syncobj);
 	return ret;
 }
@@ -254,7 +471,7 @@ void drm_syncobj_free(struct kref *kref)
 	struct drm_syncobj *syncobj = container_of(kref,
 						   struct drm_syncobj,
 						   refcount);
-	drm_syncobj_replace_fence(syncobj, 0, NULL);
+	drm_syncobj_fini(syncobj);
 	kfree(syncobj);
 }
 EXPORT_SYMBOL(drm_syncobj_free);
@@ -284,6 +501,11 @@ int drm_syncobj_create(struct drm_syncobj **out_syncobj, uint32_t flags,
 	kref_init(&syncobj->refcount);
 	INIT_LIST_HEAD(&syncobj->cb_list);
 	spin_lock_init(&syncobj->lock);
+	if (flags & DRM_SYNCOBJ_CREATE_TYPE_TIMELINE)
+		syncobj->type = DRM_SYNCOBJ_TYPE_TIMELINE;
+	else
+		syncobj->type = DRM_SYNCOBJ_TYPE_BINARY;
+	drm_syncobj_init(syncobj);
 
 	if (flags & DRM_SYNCOBJ_CREATE_SIGNALED) {
 		ret = drm_syncobj_assign_null_handle(syncobj);
@@ -566,7 +788,8 @@ drm_syncobj_create_ioctl(struct drm_device *dev, void *data,
 		return -EOPNOTSUPP;
 
 	/* no valid flags yet */
-	if (args->flags & ~DRM_SYNCOBJ_CREATE_SIGNALED)
+	if (args->flags & ~(DRM_SYNCOBJ_CREATE_SIGNALED |
+			    DRM_SYNCOBJ_CREATE_TYPE_TIMELINE))
 		return -EINVAL;
 
 	return drm_syncobj_create_as_handle(file_private,
@@ -659,9 +882,8 @@ static void syncobj_wait_syncobj_func(struct drm_syncobj *syncobj,
 	struct syncobj_wait_entry *wait =
 		container_of(cb, struct syncobj_wait_entry, syncobj_cb);
 
-	/* This happens inside the syncobj lock */
-	wait->fence = dma_fence_get(rcu_dereference_protected(syncobj->fence,
-							      lockdep_is_held(&syncobj->lock)));
+	drm_syncobj_search_fence(syncobj, 0, 0, &wait->fence);
+
 	wake_up_process(wait->task);
 }
 
@@ -687,7 +909,8 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
 	signaled_count = 0;
 	for (i = 0; i < count; ++i) {
 		entries[i].task = current;
-		entries[i].fence = drm_syncobj_fence_get(syncobjs[i]);
+		drm_syncobj_search_fence(syncobjs[i], 0, 0,
+					 &entries[i].fence);
 		if (!entries[i].fence) {
 			if (flags & DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT) {
 				continue;
@@ -949,12 +1172,13 @@ drm_syncobj_reset_ioctl(struct drm_device *dev, void *data,
 	if (ret < 0)
 		return ret;
 
-	for (i = 0; i < args->count_handles; i++)
-		drm_syncobj_replace_fence(syncobjs[i], 0, NULL);
-
+	for (i = 0; i < args->count_handles; i++) {
+		drm_syncobj_fini(syncobjs[i]);
+		drm_syncobj_init(syncobjs[i]);
+	}
 	drm_syncobj_array_free(syncobjs, args->count_handles);
 
-	return 0;
+	return ret;
 }
 
 int
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 22b4cb775576..6d99651c6c4b 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -2152,7 +2152,7 @@ await_fence_array(struct i915_execbuffer *eb,
 		if (!(flags & I915_EXEC_FENCE_WAIT))
 			continue;
 
-		fence = drm_syncobj_fence_get(syncobj);
+		drm_syncobj_search_fence(syncobj, 0, 0, &fence);
 		if (!fence)
 			return -EINVAL;
 
diff --git a/include/drm/drm_syncobj.h b/include/drm/drm_syncobj.h
index 2eda44def639..5e8c5c027e09 100644
--- a/include/drm/drm_syncobj.h
+++ b/include/drm/drm_syncobj.h
@@ -30,10 +30,15 @@
 
 struct drm_syncobj_cb;
 
+enum drm_syncobj_type {
+	DRM_SYNCOBJ_TYPE_BINARY,
+	DRM_SYNCOBJ_TYPE_TIMELINE
+};
+
 /**
  * struct drm_syncobj - sync object.
  *
- * This structure defines a generic sync object which wraps a &dma_fence.
+ * This structure defines a generic sync object which is timeline based.
  */
 struct drm_syncobj {
 	/**
@@ -41,19 +46,36 @@ struct drm_syncobj {
 	 */
 	struct kref refcount;
 	/**
-	 * @fence:
-	 * NULL or a pointer to the fence bound to this object.
-	 *
-	 * This field should not be used directly. Use drm_syncobj_fence_get()
-	 * and drm_syncobj_replace_fence() instead.
+	 * @type: indicate syncobj type
+	 */
+	enum drm_syncobj_type type;
+	/**
+	 * @wq: wait signal operation work queue
+	 */
+	wait_queue_head_t	wq;
+	/**
+	 * @timeline_context: fence context used by timeline
 	 */
-	struct dma_fence __rcu *fence;
+	u64 timeline_context;
 	/**
-	 * @cb_list: List of callbacks to call when the &fence gets replaced.
+	 * @timeline: syncobj timeline value, which indicates point is signaled.
 	 */
+	u64 timeline;
+	/**
+	 * @signal_point: which indicates the latest signaler point.
+	 */
+	u64 signal_point;
+	/**
+	 * @signal_pt_list: signaler point list.
+	 */
+	struct list_head signal_pt_list;
+
+	/**
+         * @cb_list: List of callbacks to call when the &fence gets replaced.
+         */
 	struct list_head cb_list;
 	/**
-	 * @lock: Protects &cb_list and write-locks &fence.
+	 * @lock: Protects syncobj list and write-locks &fence.
 	 */
 	spinlock_t lock;
 	/**
@@ -68,7 +90,7 @@ typedef void (*drm_syncobj_func_t)(struct drm_syncobj *syncobj,
 /**
  * struct drm_syncobj_cb - callback for drm_syncobj_add_callback
  * @node: used by drm_syncob_add_callback to append this struct to
- *	  &drm_syncobj.cb_list
+ *       &drm_syncobj.cb_list
  * @func: drm_syncobj_func_t to call
  *
  * This struct will be initialized by drm_syncobj_add_callback, additional
@@ -106,29 +128,6 @@ drm_syncobj_put(struct drm_syncobj *obj)
 	kref_put(&obj->refcount, drm_syncobj_free);
 }
 
-/**
- * drm_syncobj_fence_get - get a reference to a fence in a sync object
- * @syncobj: sync object.
- *
- * This acquires additional reference to &drm_syncobj.fence contained in @obj,
- * if not NULL. It is illegal to call this without already holding a reference.
- * No locks required.
- *
- * Returns:
- * Either the fence of @obj or NULL if there's none.
- */
-static inline struct dma_fence *
-drm_syncobj_fence_get(struct drm_syncobj *syncobj)
-{
-	struct dma_fence *fence;
-
-	rcu_read_lock();
-	fence = dma_fence_get_rcu_safe(&syncobj->fence);
-	rcu_read_unlock();
-
-	return fence;
-}
-
 struct drm_syncobj *drm_syncobj_find(struct drm_file *file_private,
 				     u32 handle);
 void drm_syncobj_replace_fence(struct drm_syncobj *syncobj, u64 point,
@@ -142,5 +141,7 @@ int drm_syncobj_create(struct drm_syncobj **out_syncobj, uint32_t flags,
 int drm_syncobj_get_handle(struct drm_file *file_private,
 			   struct drm_syncobj *syncobj, u32 *handle);
 int drm_syncobj_get_fd(struct drm_syncobj *syncobj, int *p_fd);
+int drm_syncobj_search_fence(struct drm_syncobj *syncobj, u64 point, u64 flags,
+			     struct dma_fence **fence);
 
 #endif
diff --git a/include/uapi/drm/drm.h b/include/uapi/drm/drm.h
index 300f336633f2..cebdb2541eb7 100644
--- a/include/uapi/drm/drm.h
+++ b/include/uapi/drm/drm.h
@@ -717,6 +717,7 @@ struct drm_prime_handle {
 struct drm_syncobj_create {
 	__u32 handle;
 #define DRM_SYNCOBJ_CREATE_SIGNALED (1 << 0)
+#define DRM_SYNCOBJ_CREATE_TYPE_TIMELINE (1 << 1)
 	__u32 flags;
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From cd956bfcd0f58d20485ac0a785415f7d9327a95f Mon Sep 17 00:00:00 2001
From: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Date: Tue, 23 Oct 2018 11:07:07 +0100
Subject: drm/i915/perf: add a parameter to control the size of OA buffer

The way our hardware is designed doesn't seem to let us use the
MI_RECORD_PERF_COUNT command without setting up a circular buffer.

In the case where the user didn't request OA reports to be available
through the i915 perf stream, we can set the OA buffer to the minimum
size to avoid consuming memory which won't be used by the driver.

v2: Simplify oa buffer size exponent selection (Chris)
    Reuse vma size field (Lionel)

v3: Restrict size opening parameter to values supported by HW (Chris)

v4: Drop out of date comment (Matt)
    Add debug message when buffer size is rejected (Matt)

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Matthew Auld <matthew.auld@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20181023100707.31738-5-lionel.g.landwerlin@intel.com
---
 drivers/gpu/drm/i915/i915_drv.h  |  1 +
 drivers/gpu/drm/i915/i915_perf.c | 99 ++++++++++++++++++++++++++--------------
 drivers/gpu/drm/i915/i915_reg.h  |  2 +
 include/uapi/drm/i915_drm.h      |  7 +++
 4 files changed, 76 insertions(+), 33 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 135a8522a803..22cf3c75558c 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2006,6 +2006,7 @@ struct drm_i915_private {
 				u32 last_ctx_id;
 				int format;
 				int format_size;
+				int size_exponent;
 
 				/**
 				 * Locks reads and writes to all head/tail state
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 4529edfdcfc8..1712b68de8f5 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -212,13 +212,7 @@
 #include "i915_oa_icl.h"
 #include "intel_lrc_reg.h"
 
-/* HW requires this to be a power of two, between 128k and 16M, though driver
- * is currently generally designed assuming the largest 16M size is used such
- * that the overflow cases are unlikely in normal operation.
- */
-#define OA_BUFFER_SIZE		SZ_16M
-
-#define OA_TAKEN(tail, head)	((tail - head) & (OA_BUFFER_SIZE - 1))
+#define OA_TAKEN(tail, head)	(((tail) - (head)) & (dev_priv->perf.oa.oa_buffer.vma->size - 1))
 
 /**
  * DOC: OA Tail Pointer Race
@@ -361,6 +355,7 @@ struct perf_open_properties {
 	int oa_format;
 	bool oa_periodic;
 	int oa_period_exponent;
+	u32 oa_buffer_size_exponent;
 };
 
 static void free_oa_config(struct drm_i915_private *dev_priv,
@@ -523,7 +518,7 @@ static bool oa_buffer_check_unlocked(struct drm_i915_private *dev_priv)
 		 * could put the tail out of bounds...
 		 */
 		if (hw_tail >= gtt_offset &&
-		    hw_tail < (gtt_offset + OA_BUFFER_SIZE)) {
+		    hw_tail < (gtt_offset + dev_priv->perf.oa.oa_buffer.vma->size)) {
 			dev_priv->perf.oa.oa_buffer.tails[!aged_idx].offset =
 				aging_tail = hw_tail;
 			dev_priv->perf.oa.oa_buffer.aging_timestamp = now;
@@ -652,7 +647,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
 	u8 *oa_buf_base = dev_priv->perf.oa.oa_buffer.vaddr;
 	u32 gtt_offset = i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma);
-	u32 mask = (OA_BUFFER_SIZE - 1);
+	u32 mask = (dev_priv->perf.oa.oa_buffer.vma->size - 1);
 	size_t start_offset = *offset;
 	unsigned long flags;
 	unsigned int aged_tail_idx;
@@ -692,8 +687,8 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 	 * only be incremented by multiples of the report size (notably also
 	 * all a power of two).
 	 */
-	if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size ||
-		      tail > OA_BUFFER_SIZE || tail % report_size,
+	if (WARN_ONCE(head > dev_priv->perf.oa.oa_buffer.vma->size || head % report_size ||
+		      tail > dev_priv->perf.oa.oa_buffer.vma->size || tail % report_size,
 		      "Inconsistent OA buffer pointers: head = %u, tail = %u\n",
 		      head, tail))
 		return -EIO;
@@ -716,7 +711,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 		 * here would imply a driver bug that would result
 		 * in an overrun.
 		 */
-		if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) {
+		if (WARN_ON((dev_priv->perf.oa.oa_buffer.vma->size - head) < report_size)) {
 			DRM_ERROR("Spurious OA head ptr: non-integral report offset\n");
 			break;
 		}
@@ -875,11 +870,6 @@ static int gen8_oa_read(struct i915_perf_stream *stream,
 	 * automatically triggered reports in this condition and so we
 	 * have to assume that old reports are now being trampled
 	 * over.
-	 *
-	 * Considering how we don't currently give userspace control
-	 * over the OA buffer size and always configure a large 16MB
-	 * buffer, then a buffer overflow does anyway likely indicate
-	 * that something has gone quite badly wrong.
 	 */
 	if (oastatus & GEN8_OASTATUS_OABUFFER_OVERFLOW) {
 		ret = append_oa_status(stream, buf, count, offset,
@@ -941,7 +931,7 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
 	u8 *oa_buf_base = dev_priv->perf.oa.oa_buffer.vaddr;
 	u32 gtt_offset = i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma);
-	u32 mask = (OA_BUFFER_SIZE - 1);
+	u32 mask = (dev_priv->perf.oa.oa_buffer.vma->size - 1);
 	size_t start_offset = *offset;
 	unsigned long flags;
 	unsigned int aged_tail_idx;
@@ -978,8 +968,8 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 	 * only be incremented by multiples of the report size (notably also
 	 * all a power of two).
 	 */
-	if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size ||
-		      tail > OA_BUFFER_SIZE || tail % report_size,
+	if (WARN_ONCE(head > dev_priv->perf.oa.oa_buffer.vma->size || head % report_size ||
+		      tail > dev_priv->perf.oa.oa_buffer.vma->size || tail % report_size,
 		      "Inconsistent OA buffer pointers: head = %u, tail = %u\n",
 		      head, tail))
 		return -EIO;
@@ -999,7 +989,7 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 		 * here would imply a driver bug that would result
 		 * in an overrun.
 		 */
-		if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) {
+		if (WARN_ON((dev_priv->perf.oa.oa_buffer.vma->size - head) < report_size)) {
 			DRM_ERROR("Spurious OA head ptr: non-integral report offset\n");
 			break;
 		}
@@ -1394,7 +1384,9 @@ static void gen7_init_oa_buffer(struct drm_i915_private *dev_priv)
 
 	I915_WRITE(GEN7_OABUFFER, gtt_offset);
 
-	I915_WRITE(GEN7_OASTATUS1, gtt_offset | OABUFFER_SIZE_16M); /* tail */
+	I915_WRITE(GEN7_OASTATUS1, gtt_offset |
+		   ((dev_priv->perf.oa.oa_buffer.size_exponent - 17) <<
+		    GEN7_OASTATUS1_BUFFER_SIZE_SHIFT)); /* tail */
 
 	/* Mark that we need updated tail pointers to read from... */
 	dev_priv->perf.oa.oa_buffer.tails[0].offset = INVALID_TAIL_PTR;
@@ -1419,7 +1411,8 @@ static void gen7_init_oa_buffer(struct drm_i915_private *dev_priv)
 	 * the assumption that new reports are being written to zeroed
 	 * memory...
 	 */
-	memset(dev_priv->perf.oa.oa_buffer.vaddr, 0, OA_BUFFER_SIZE);
+	memset(dev_priv->perf.oa.oa_buffer.vaddr, 0,
+	       dev_priv->perf.oa.oa_buffer.vma->size);
 
 	/* Maybe make ->pollin per-stream state if we support multiple
 	 * concurrent streams in the future.
@@ -1449,7 +1442,9 @@ static void gen8_init_oa_buffer(struct drm_i915_private *dev_priv)
 	 *  bit."
 	 */
 	I915_WRITE(GEN8_OABUFFER, gtt_offset |
-		   OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT);
+		   ((dev_priv->perf.oa.oa_buffer.size_exponent - 17) <<
+		    GEN8_OABUFFER_BUFFER_SIZE_SHIFT) |
+		   GEN8_OABUFFER_MEM_SELECT_GGTT);
 	I915_WRITE(GEN8_OATAILPTR, gtt_offset & GEN8_OATAILPTR_MASK);
 
 	/* Mark that we need updated tail pointers to read from... */
@@ -1477,7 +1472,8 @@ static void gen8_init_oa_buffer(struct drm_i915_private *dev_priv)
 	 * the assumption that new reports are being written to zeroed
 	 * memory...
 	 */
-	memset(dev_priv->perf.oa.oa_buffer.vaddr, 0, OA_BUFFER_SIZE);
+	memset(dev_priv->perf.oa.oa_buffer.vaddr, 0,
+	       dev_priv->perf.oa.oa_buffer.vma->size);
 
 	/*
 	 * Maybe make ->pollin per-stream state if we support multiple
@@ -1486,23 +1482,24 @@ static void gen8_init_oa_buffer(struct drm_i915_private *dev_priv)
 	dev_priv->perf.oa.pollin = false;
 }
 
-static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
+static int alloc_oa_buffer(struct drm_i915_private *dev_priv, int size_exponent)
 {
 	struct drm_i915_gem_object *bo;
 	struct i915_vma *vma;
+	size_t size = 1U << size_exponent;
 	int ret;
 
 	if (WARN_ON(dev_priv->perf.oa.oa_buffer.vma))
 		return -ENODEV;
 
+	if (WARN_ON(size < SZ_128K || size > SZ_16M))
+		return -EINVAL;
+
 	ret = i915_mutex_lock_interruptible(&dev_priv->drm);
 	if (ret)
 		return ret;
 
-	BUILD_BUG_ON_NOT_POWER_OF_2(OA_BUFFER_SIZE);
-	BUILD_BUG_ON(OA_BUFFER_SIZE < SZ_128K || OA_BUFFER_SIZE > SZ_16M);
-
-	bo = i915_gem_object_create(dev_priv, OA_BUFFER_SIZE);
+	bo = i915_gem_object_create(dev_priv, size);
 	if (IS_ERR(bo)) {
 		DRM_ERROR("Failed to allocate OA buffer\n");
 		ret = PTR_ERR(bo);
@@ -1520,6 +1517,7 @@ static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
 		goto err_unref;
 	}
 	dev_priv->perf.oa.oa_buffer.vma = vma;
+	dev_priv->perf.oa.oa_buffer.size_exponent = size_exponent;
 
 	dev_priv->perf.oa.oa_buffer.vaddr =
 		i915_gem_object_pin_map(bo, I915_MAP_WB);
@@ -1528,9 +1526,10 @@ static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
 		goto err_unpin;
 	}
 
-	DRM_DEBUG_DRIVER("OA Buffer initialized, gtt offset = 0x%x, vaddr = %p\n",
+	DRM_DEBUG_DRIVER("OA Buffer initialized, gtt offset = 0x%x, vaddr = %p, size = %llu\n",
 			 i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma),
-			 dev_priv->perf.oa.oa_buffer.vaddr);
+			 dev_priv->perf.oa.oa_buffer.vaddr,
+			 dev_priv->perf.oa.oa_buffer.vma->size);
 
 	goto unlock;
 
@@ -2090,7 +2089,7 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
 	intel_runtime_pm_get(dev_priv);
 	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
 
-	ret = alloc_oa_buffer(dev_priv);
+	ret = alloc_oa_buffer(dev_priv, props->oa_buffer_size_exponent);
 	if (ret)
 		goto err_oa_buf_alloc;
 
@@ -2649,6 +2648,26 @@ static u64 oa_exponent_to_ns(struct drm_i915_private *dev_priv, int exponent)
 			 1000ULL * INTEL_INFO(dev_priv)->cs_timestamp_frequency_khz);
 }
 
+static int
+select_oa_buffer_exponent(struct drm_i915_private *i915,
+			  u64 requested_size)
+{
+	int order;
+
+	/*
+	 * When no size is specified, use the largest size supported by all
+	 * generations.
+	 */
+	if (!requested_size)
+		return order_base_2(SZ_16M);
+
+	order = order_base_2(clamp_t(u64, requested_size, SZ_128K, SZ_16M));
+	if (requested_size != (1UL << order))
+		return -EINVAL;
+
+	return order;
+}
+
 /**
  * read_properties_unlocked - validate + copy userspace stream open properties
  * @dev_priv: i915 device instance
@@ -2776,6 +2795,14 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv,
 			props->oa_periodic = true;
 			props->oa_period_exponent = value;
 			break;
+		case DRM_I915_PERF_PROP_OA_BUFFER_SIZE:
+			ret = select_oa_buffer_exponent(dev_priv, value);
+			if (ret < 0) {
+				DRM_DEBUG("OA buffer size invalid %llu\n", value);
+				return ret;
+			}
+			props->oa_buffer_size_exponent = ret;
+			break;
 		case DRM_I915_PERF_PROP_MAX:
 			MISSING_CASE(id);
 			return -EINVAL;
@@ -2784,6 +2811,12 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv,
 		uprop += 2;
 	}
 
+	/* If no buffer size was requested, select the default one. */
+	if (!props->oa_buffer_size_exponent) {
+		props->oa_buffer_size_exponent =
+			select_oa_buffer_exponent(dev_priv, 0);
+	}
+
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 8bd61f946714..4f9c2fe51f27 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -569,12 +569,14 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define GEN8_OABUFFER_UDW _MMIO(0x23b4)
 #define GEN8_OABUFFER _MMIO(0x2b14)
 #define  GEN8_OABUFFER_MEM_SELECT_GGTT      (1 << 0)  /* 0: PPGTT, 1: GGTT */
+#define  GEN8_OABUFFER_BUFFER_SIZE_SHIFT    3
 
 #define GEN7_OASTATUS1 _MMIO(0x2364)
 #define  GEN7_OASTATUS1_TAIL_MASK	    0xffffffc0
 #define  GEN7_OASTATUS1_COUNTER_OVERFLOW    (1 << 2)
 #define  GEN7_OASTATUS1_OABUFFER_OVERFLOW   (1 << 1)
 #define  GEN7_OASTATUS1_REPORT_LOST	    (1 << 0)
+#define  GEN7_OASTATUS1_BUFFER_SIZE_SHIFT   3
 
 #define GEN7_OASTATUS2 _MMIO(0x2368)
 #define  GEN7_OASTATUS2_HEAD_MASK           0xffffffc0
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 298b2e197744..e477ef8c644e 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1540,6 +1540,13 @@ enum drm_i915_perf_property_id {
 	 */
 	DRM_I915_PERF_PROP_OA_EXPONENT,
 
+	/**
+	 * Specify a global OA buffer size to be allocated in bytes. The size
+	 * specified must be supported by HW (currently supported sizes are
+	 * powers of 2 ranging from 128Kb to 16Mb).
+	 */
+	DRM_I915_PERF_PROP_OA_BUFFER_SIZE,
+
 	DRM_I915_PERF_PROP_MAX /* non-ABI */
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 23c42a403a9cfdbad6004a556c927be7dd61a8ee Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Sat, 27 Oct 2018 15:07:40 +0200
Subject: netfilter: ipset: Introduction of new commands and protocol version 7

Two new commands (IPSET_CMD_GET_BYNAME, IPSET_CMD_GET_BYINDEX) are
introduced. The new commands makes possible to eliminate the getsockopt
operation (in iptables set/SET match/target) and thus use only netlink
communication between userspace and kernel for ipset. With the new
protocol version, userspace can exactly know which functionality is
supported by the running kernel.

Both the kernel and userspace is fully backward compatible.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 include/linux/netfilter/ipset/ip_set.h      |   2 +-
 include/uapi/linux/netfilter/ipset/ip_set.h |  19 ++--
 net/netfilter/ipset/ip_set_core.c           | 164 +++++++++++++++++++++++++---
 3 files changed, 160 insertions(+), 25 deletions(-)

(limited to 'include/uapi')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 34fc80f3eb90..c4ce07402c24 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -303,11 +303,11 @@ ip_set_put_flags(struct sk_buff *skb, struct ip_set *set)
 /* Netlink CB args */
 enum {
 	IPSET_CB_NET = 0,	/* net namespace */
+	IPSET_CB_PROTO,		/* ipset protocol */
 	IPSET_CB_DUMP,		/* dump single set/all sets */
 	IPSET_CB_INDEX,		/* set index */
 	IPSET_CB_PRIVATE,	/* set private data */
 	IPSET_CB_ARG0,		/* type specific */
-	IPSET_CB_ARG1,
 };
 
 /* register and unregister set references */
diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h
index 60236f694143..ea69ca21ff23 100644
--- a/include/uapi/linux/netfilter/ipset/ip_set.h
+++ b/include/uapi/linux/netfilter/ipset/ip_set.h
@@ -13,8 +13,9 @@
 
 #include <linux/types.h>
 
-/* The protocol version */
-#define IPSET_PROTOCOL		6
+/* The protocol versions */
+#define IPSET_PROTOCOL		7
+#define IPSET_PROTOCOL_MIN	6
 
 /* The max length of strings including NUL: set and type identifiers */
 #define IPSET_MAXNAMELEN	32
@@ -38,17 +39,19 @@ enum ipset_cmd {
 	IPSET_CMD_TEST,		/* 11: Test an element in a set */
 	IPSET_CMD_HEADER,	/* 12: Get set header data only */
 	IPSET_CMD_TYPE,		/* 13: Get set type */
+	IPSET_CMD_GET_BYNAME,	/* 14: Get set index by name */
+	IPSET_CMD_GET_BYINDEX,	/* 15: Get set name by index */
 	IPSET_MSG_MAX,		/* Netlink message commands */
 
 	/* Commands in userspace: */
-	IPSET_CMD_RESTORE = IPSET_MSG_MAX, /* 14: Enter restore mode */
-	IPSET_CMD_HELP,		/* 15: Get help */
-	IPSET_CMD_VERSION,	/* 16: Get program version */
-	IPSET_CMD_QUIT,		/* 17: Quit from interactive mode */
+	IPSET_CMD_RESTORE = IPSET_MSG_MAX, /* 16: Enter restore mode */
+	IPSET_CMD_HELP,		/* 17: Get help */
+	IPSET_CMD_VERSION,	/* 18: Get program version */
+	IPSET_CMD_QUIT,		/* 19: Quit from interactive mode */
 
 	IPSET_CMD_MAX,
 
-	IPSET_CMD_COMMIT = IPSET_CMD_MAX, /* 18: Commit buffered commands */
+	IPSET_CMD_COMMIT = IPSET_CMD_MAX, /* 20: Commit buffered commands */
 };
 
 /* Attributes at command level */
@@ -66,6 +69,7 @@ enum {
 	IPSET_ATTR_LINENO,	/* 9: Restore lineno */
 	IPSET_ATTR_PROTOCOL_MIN, /* 10: Minimal supported version number */
 	IPSET_ATTR_REVISION_MIN	= IPSET_ATTR_PROTOCOL_MIN, /* type rev min */
+	IPSET_ATTR_INDEX,	/* 11: Kernel index of set */
 	__IPSET_ATTR_CMD_MAX,
 };
 #define IPSET_ATTR_CMD_MAX	(__IPSET_ATTR_CMD_MAX - 1)
@@ -223,6 +227,7 @@ enum ipset_adt {
 
 /* Sets are identified by an index in kernel space. Tweak with ip_set_id_t
  * and IPSET_INVALID_ID if you want to increase the max number of sets.
+ * Also, IPSET_ATTR_INDEX must be changed.
  */
 typedef __u16 ip_set_id_t;
 
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index bc4bd247bb7d..847f764b2aeb 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -768,11 +768,21 @@ EXPORT_SYMBOL_GPL(ip_set_nfnl_put);
  * The commands are serialized by the nfnl mutex.
  */
 
+static inline u8 protocol(const struct nlattr * const tb[])
+{
+	return nla_get_u8(tb[IPSET_ATTR_PROTOCOL]);
+}
+
 static inline bool
 protocol_failed(const struct nlattr * const tb[])
 {
-	return !tb[IPSET_ATTR_PROTOCOL] ||
-	       nla_get_u8(tb[IPSET_ATTR_PROTOCOL]) != IPSET_PROTOCOL;
+	return !tb[IPSET_ATTR_PROTOCOL] || protocol(tb) != IPSET_PROTOCOL;
+}
+
+static inline bool
+protocol_min_failed(const struct nlattr * const tb[])
+{
+	return !tb[IPSET_ATTR_PROTOCOL] || protocol(tb) < IPSET_PROTOCOL_MIN;
 }
 
 static inline u32
@@ -886,7 +896,7 @@ static int ip_set_create(struct net *net, struct sock *ctnl,
 	u32 flags = flag_exist(nlh);
 	int ret = 0;
 
-	if (unlikely(protocol_failed(attr) ||
+	if (unlikely(protocol_min_failed(attr) ||
 		     !attr[IPSET_ATTR_SETNAME] ||
 		     !attr[IPSET_ATTR_TYPENAME] ||
 		     !attr[IPSET_ATTR_REVISION] ||
@@ -1024,7 +1034,7 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl,
 	ip_set_id_t i;
 	int ret = 0;
 
-	if (unlikely(protocol_failed(attr)))
+	if (unlikely(protocol_min_failed(attr)))
 		return -IPSET_ERR_PROTOCOL;
 
 	/* Must wait for flush to be really finished in list:set */
@@ -1102,7 +1112,7 @@ static int ip_set_flush(struct net *net, struct sock *ctnl, struct sk_buff *skb,
 	struct ip_set *s;
 	ip_set_id_t i;
 
-	if (unlikely(protocol_failed(attr)))
+	if (unlikely(protocol_min_failed(attr)))
 		return -IPSET_ERR_PROTOCOL;
 
 	if (!attr[IPSET_ATTR_SETNAME]) {
@@ -1144,7 +1154,7 @@ static int ip_set_rename(struct net *net, struct sock *ctnl,
 	ip_set_id_t i;
 	int ret = 0;
 
-	if (unlikely(protocol_failed(attr) ||
+	if (unlikely(protocol_min_failed(attr) ||
 		     !attr[IPSET_ATTR_SETNAME] ||
 		     !attr[IPSET_ATTR_SETNAME2]))
 		return -IPSET_ERR_PROTOCOL;
@@ -1193,7 +1203,7 @@ static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb,
 	ip_set_id_t from_id, to_id;
 	char from_name[IPSET_MAXNAMELEN];
 
-	if (unlikely(protocol_failed(attr) ||
+	if (unlikely(protocol_min_failed(attr) ||
 		     !attr[IPSET_ATTR_SETNAME] ||
 		     !attr[IPSET_ATTR_SETNAME2]))
 		return -IPSET_ERR_PROTOCOL;
@@ -1288,6 +1298,7 @@ dump_init(struct netlink_callback *cb, struct ip_set_net *inst)
 	nla_parse(cda, IPSET_ATTR_CMD_MAX, attr, nlh->nlmsg_len - min_len,
 		  ip_set_setname_policy, NULL);
 
+	cb->args[IPSET_CB_PROTO] = nla_get_u8(cda[IPSET_ATTR_PROTOCOL]);
 	if (cda[IPSET_ATTR_SETNAME]) {
 		struct ip_set *set;
 
@@ -1389,7 +1400,8 @@ dump_last:
 			ret = -EMSGSIZE;
 			goto release_refcount;
 		}
-		if (nla_put_u8(skb, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL) ||
+		if (nla_put_u8(skb, IPSET_ATTR_PROTOCOL,
+			       cb->args[IPSET_CB_PROTO]) ||
 		    nla_put_string(skb, IPSET_ATTR_SETNAME, set->name))
 			goto nla_put_failure;
 		if (dump_flags & IPSET_FLAG_LIST_SETNAME)
@@ -1404,6 +1416,9 @@ dump_last:
 			    nla_put_u8(skb, IPSET_ATTR_REVISION,
 				       set->revision))
 				goto nla_put_failure;
+			if (cb->args[IPSET_CB_PROTO] > IPSET_PROTOCOL_MIN &&
+			    nla_put_net16(skb, IPSET_ATTR_INDEX, htons(index)))
+				goto nla_put_failure;
 			ret = set->variant->head(set, skb);
 			if (ret < 0)
 				goto release_refcount;
@@ -1463,7 +1478,7 @@ static int ip_set_dump(struct net *net, struct sock *ctnl, struct sk_buff *skb,
 		       const struct nlattr * const attr[],
 		       struct netlink_ext_ack *extack)
 {
-	if (unlikely(protocol_failed(attr)))
+	if (unlikely(protocol_min_failed(attr)))
 		return -IPSET_ERR_PROTOCOL;
 
 	{
@@ -1557,7 +1572,7 @@ static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb,
 	bool use_lineno;
 	int ret = 0;
 
-	if (unlikely(protocol_failed(attr) ||
+	if (unlikely(protocol_min_failed(attr) ||
 		     !attr[IPSET_ATTR_SETNAME] ||
 		     !((attr[IPSET_ATTR_DATA] != NULL) ^
 		       (attr[IPSET_ATTR_ADT] != NULL)) ||
@@ -1612,7 +1627,7 @@ static int ip_set_udel(struct net *net, struct sock *ctnl, struct sk_buff *skb,
 	bool use_lineno;
 	int ret = 0;
 
-	if (unlikely(protocol_failed(attr) ||
+	if (unlikely(protocol_min_failed(attr) ||
 		     !attr[IPSET_ATTR_SETNAME] ||
 		     !((attr[IPSET_ATTR_DATA] != NULL) ^
 		       (attr[IPSET_ATTR_ADT] != NULL)) ||
@@ -1664,7 +1679,7 @@ static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb,
 	struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {};
 	int ret = 0;
 
-	if (unlikely(protocol_failed(attr) ||
+	if (unlikely(protocol_min_failed(attr) ||
 		     !attr[IPSET_ATTR_SETNAME] ||
 		     !attr[IPSET_ATTR_DATA] ||
 		     !flag_nested(attr[IPSET_ATTR_DATA])))
@@ -1701,7 +1716,7 @@ static int ip_set_header(struct net *net, struct sock *ctnl,
 	struct nlmsghdr *nlh2;
 	int ret = 0;
 
-	if (unlikely(protocol_failed(attr) ||
+	if (unlikely(protocol_min_failed(attr) ||
 		     !attr[IPSET_ATTR_SETNAME]))
 		return -IPSET_ERR_PROTOCOL;
 
@@ -1717,7 +1732,7 @@ static int ip_set_header(struct net *net, struct sock *ctnl,
 			 IPSET_CMD_HEADER);
 	if (!nlh2)
 		goto nlmsg_failure;
-	if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL) ||
+	if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, protocol(attr)) ||
 	    nla_put_string(skb2, IPSET_ATTR_SETNAME, set->name) ||
 	    nla_put_string(skb2, IPSET_ATTR_TYPENAME, set->type->name) ||
 	    nla_put_u8(skb2, IPSET_ATTR_FAMILY, set->family) ||
@@ -1758,7 +1773,7 @@ static int ip_set_type(struct net *net, struct sock *ctnl, struct sk_buff *skb,
 	const char *typename;
 	int ret = 0;
 
-	if (unlikely(protocol_failed(attr) ||
+	if (unlikely(protocol_min_failed(attr) ||
 		     !attr[IPSET_ATTR_TYPENAME] ||
 		     !attr[IPSET_ATTR_FAMILY]))
 		return -IPSET_ERR_PROTOCOL;
@@ -1777,7 +1792,7 @@ static int ip_set_type(struct net *net, struct sock *ctnl, struct sk_buff *skb,
 			 IPSET_CMD_TYPE);
 	if (!nlh2)
 		goto nlmsg_failure;
-	if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL) ||
+	if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, protocol(attr)) ||
 	    nla_put_string(skb2, IPSET_ATTR_TYPENAME, typename) ||
 	    nla_put_u8(skb2, IPSET_ATTR_FAMILY, family) ||
 	    nla_put_u8(skb2, IPSET_ATTR_REVISION, max) ||
@@ -1828,6 +1843,111 @@ static int ip_set_protocol(struct net *net, struct sock *ctnl,
 		goto nlmsg_failure;
 	if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL))
 		goto nla_put_failure;
+	if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL_MIN, IPSET_PROTOCOL_MIN))
+		goto nla_put_failure;
+	nlmsg_end(skb2, nlh2);
+
+	ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+
+nla_put_failure:
+	nlmsg_cancel(skb2, nlh2);
+nlmsg_failure:
+	kfree_skb(skb2);
+	return -EMSGSIZE;
+}
+
+/* Get set by name or index, from userspace */
+
+static int ip_set_byname(struct net *net, struct sock *ctnl,
+			 struct sk_buff *skb, const struct nlmsghdr *nlh,
+			 const struct nlattr * const attr[],
+			 struct netlink_ext_ack *extack)
+{
+	struct ip_set_net *inst = ip_set_pernet(net);
+	struct sk_buff *skb2;
+	struct nlmsghdr *nlh2;
+	ip_set_id_t id = IPSET_INVALID_ID;
+	const struct ip_set *set;
+	int ret = 0;
+
+	if (unlikely(protocol_failed(attr) ||
+		     !attr[IPSET_ATTR_SETNAME]))
+		return -IPSET_ERR_PROTOCOL;
+
+	set = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), &id);
+	if (id == IPSET_INVALID_ID)
+		return -ENOENT;
+
+	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!skb2)
+		return -ENOMEM;
+
+	nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+			 IPSET_CMD_GET_BYNAME);
+	if (!nlh2)
+		goto nlmsg_failure;
+	if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, protocol(attr)) ||
+	    nla_put_u8(skb2, IPSET_ATTR_FAMILY, set->family) ||
+	    nla_put_net16(skb2, IPSET_ATTR_INDEX, htons(id)))
+		goto nla_put_failure;
+	nlmsg_end(skb2, nlh2);
+
+	ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+
+nla_put_failure:
+	nlmsg_cancel(skb2, nlh2);
+nlmsg_failure:
+	kfree_skb(skb2);
+	return -EMSGSIZE;
+}
+
+static const struct nla_policy ip_set_index_policy[IPSET_ATTR_CMD_MAX + 1] = {
+	[IPSET_ATTR_PROTOCOL]	= { .type = NLA_U8 },
+	[IPSET_ATTR_INDEX]	= { .type = NLA_U16 },
+};
+
+static int ip_set_byindex(struct net *net, struct sock *ctnl,
+			  struct sk_buff *skb, const struct nlmsghdr *nlh,
+			  const struct nlattr * const attr[],
+			  struct netlink_ext_ack *extack)
+{
+	struct ip_set_net *inst = ip_set_pernet(net);
+	struct sk_buff *skb2;
+	struct nlmsghdr *nlh2;
+	ip_set_id_t id = IPSET_INVALID_ID;
+	const struct ip_set *set;
+	int ret = 0;
+
+	if (unlikely(protocol_failed(attr) ||
+		     !attr[IPSET_ATTR_INDEX]))
+		return -IPSET_ERR_PROTOCOL;
+
+	id = ip_set_get_h16(attr[IPSET_ATTR_INDEX]);
+	if (id >= inst->ip_set_max)
+		return -ENOENT;
+	set = ip_set(inst, id);
+	if (set == NULL)
+		return -ENOENT;
+
+	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!skb2)
+		return -ENOMEM;
+
+	nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+			 IPSET_CMD_GET_BYINDEX);
+	if (!nlh2)
+		goto nlmsg_failure;
+	if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, protocol(attr)) ||
+	    nla_put_string(skb, IPSET_ATTR_SETNAME, set->name))
+		goto nla_put_failure;
 	nlmsg_end(skb2, nlh2);
 
 	ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
@@ -1913,6 +2033,16 @@ static const struct nfnl_callback ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = {
 		.attr_count	= IPSET_ATTR_CMD_MAX,
 		.policy		= ip_set_protocol_policy,
 	},
+	[IPSET_CMD_GET_BYNAME]	= {
+		.call		= ip_set_byname,
+		.attr_count	= IPSET_ATTR_CMD_MAX,
+		.policy		= ip_set_setname_policy,
+	},
+	[IPSET_CMD_GET_BYINDEX]	= {
+		.call		= ip_set_byindex,
+		.attr_count	= IPSET_ATTR_CMD_MAX,
+		.policy		= ip_set_index_policy,
+	},
 };
 
 static struct nfnetlink_subsystem ip_set_netlink_subsys __read_mostly = {
@@ -1958,7 +2088,7 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
 			goto done;
 		}
 
-		if (req_version->version != IPSET_PROTOCOL) {
+		if (req_version->version < IPSET_PROTOCOL_MIN) {
 			ret = -EPROTO;
 			goto done;
 		}
-- 
cgit v1.2.3-59-g8ed1b


From 697b6b85042ef0fb9838e3100700eed727c211e9 Mon Sep 17 00:00:00 2001
From: Alexandru Gheorghe <alexandru-cosmin.gheorghe@arm.com>
Date: Thu, 1 Nov 2018 15:11:30 +0000
Subject: drm/fourcc: Add fourcc for Mali linear tiled formats

Mali-DP implements a number of tiled yuv formats which are not
currently described in drm_fourcc.h.
This adds those definitions and describes their memory layout by
using the newly added char_per_block, block_w, block_h.

Reviewed-by: Brian Starkey <brian.starkey@arm.com>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: Alexandru Gheorghe <alexandru-cosmin.gheorghe@arm.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20181101151051.1509-3-alexandru-cosmin.gheorghe@arm.com
---
 drivers/gpu/drm/drm_fourcc.c  | 12 ++++++++++++
 include/uapi/drm/drm_fourcc.h | 14 ++++++++++++++
 2 files changed, 26 insertions(+)

(limited to 'include/uapi')

diff --git a/drivers/gpu/drm/drm_fourcc.c b/drivers/gpu/drm/drm_fourcc.c
index 8c4a79b3a4ad..f523948c82b1 100644
--- a/drivers/gpu/drm/drm_fourcc.c
+++ b/drivers/gpu/drm/drm_fourcc.c
@@ -225,6 +225,18 @@ const struct drm_format_info *__drm_format_info(u32 format)
 		{ .format = DRM_FORMAT_UYVY,		.depth = 0,  .num_planes = 1, .cpp = { 2, 0, 0 }, .hsub = 2, .vsub = 1, .is_yuv = true },
 		{ .format = DRM_FORMAT_VYUY,		.depth = 0,  .num_planes = 1, .cpp = { 2, 0, 0 }, .hsub = 2, .vsub = 1, .is_yuv = true },
 		{ .format = DRM_FORMAT_AYUV,		.depth = 0,  .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true, .is_yuv = true },
+		{ .format = DRM_FORMAT_Y0L0,		.depth = 0,  .num_planes = 1,
+		  .char_per_block = { 8, 0, 0 }, .block_w = { 2, 0, 0 }, .block_h = { 2, 0, 0 },
+		  .hsub = 2, .vsub = 2, .has_alpha = true, .is_yuv = true },
+		{ .format = DRM_FORMAT_X0L0,		.depth = 0,  .num_planes = 1,
+		  .char_per_block = { 8, 0, 0 }, .block_w = { 2, 0, 0 }, .block_h = { 2, 0, 0 },
+		  .hsub = 2, .vsub = 2, .is_yuv = true },
+		{ .format = DRM_FORMAT_Y0L2,		.depth = 0,  .num_planes = 1,
+		  .char_per_block = { 8, 0, 0 }, .block_w = { 2, 0, 0 }, .block_h = { 2, 0, 0 },
+		  .hsub = 2, .vsub = 2, .has_alpha = true, .is_yuv = true },
+		{ .format = DRM_FORMAT_X0L2,		.depth = 0,  .num_planes = 1,
+		  .char_per_block = { 8, 0, 0 }, .block_w = { 2, 0, 0 }, .block_h = { 2, 0, 0 },
+		  .hsub = 2, .vsub = 2, .is_yuv = true },
 	};
 
 	unsigned int i;
diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h
index 0cd40ebfa1b1..e7e48f1f4a74 100644
--- a/include/uapi/drm/drm_fourcc.h
+++ b/include/uapi/drm/drm_fourcc.h
@@ -152,6 +152,20 @@ extern "C" {
 
 #define DRM_FORMAT_AYUV		fourcc_code('A', 'Y', 'U', 'V') /* [31:0] A:Y:Cb:Cr 8:8:8:8 little endian */
 
+/*
+ * packed YCbCr420 2x2 tiled formats
+ * first 64 bits will contain Y,Cb,Cr components for a 2x2 tile
+ */
+/* [63:0]   A3:A2:Y3:0:Cr0:0:Y2:0:A1:A0:Y1:0:Cb0:0:Y0:0  1:1:8:2:8:2:8:2:1:1:8:2:8:2:8:2 little endian */
+#define DRM_FORMAT_Y0L0		fourcc_code('Y', '0', 'L', '0')
+/* [63:0]   X3:X2:Y3:0:Cr0:0:Y2:0:X1:X0:Y1:0:Cb0:0:Y0:0  1:1:8:2:8:2:8:2:1:1:8:2:8:2:8:2 little endian */
+#define DRM_FORMAT_X0L0		fourcc_code('X', '0', 'L', '0')
+
+/* [63:0]   A3:A2:Y3:Cr0:Y2:A1:A0:Y1:Cb0:Y0  1:1:10:10:10:1:1:10:10:10 little endian */
+#define DRM_FORMAT_Y0L2		fourcc_code('Y', '0', 'L', '2')
+/* [63:0]   X3:X2:Y3:Cr0:Y2:X1:X0:Y1:Cb0:Y0  1:1:10:10:10:1:1:10:10:10 little endian */
+#define DRM_FORMAT_X0L2		fourcc_code('X', '0', 'L', '2')
+
 /*
  * 2 plane RGB + A
  * index 0 = RGB plane, same format as the corresponding non _A8 format has
-- 
cgit v1.2.3-59-g8ed1b


From ce331f8f7c04186b7b130550250d9c4e752b4f8f Mon Sep 17 00:00:00 2001
From: Nicholas Kazlauskas <nicholas.kazlauskas@amd.com>
Date: Tue, 23 Oct 2018 10:04:54 -0400
Subject: drm/amdgpu: Add DCC flags for GFX9 amdgpu_bo
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[Why]
Hardware support for Delta Color Compression (DCC) decompression is
available in DC for GFX9 but there's no way for userspace to enable
the feature.

Enabling the feature can provide improved GFX performance and
power savings in many situations.

[How]
Extend the GFX9 tiling flags to include DCC parameters. These are
logically grouped together with tiling flags even if they are
technically distinct.

This trivially maintains backwards compatibility with existing
users of amdgpu_gem_metadata. No new IOCTls or data structures are
needed to support DCC.

This patch helps expose DCC attributes to both libdrm and amdgpu_dm.

Signed-off-by: Nicholas Kazlauskas <nicholas.kazlauskas@amd.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 include/uapi/drm/amdgpu_drm.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index 370e9a5536ef..be84e43c1e19 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -326,6 +326,12 @@ struct drm_amdgpu_gem_userptr {
 /* GFX9 and later: */
 #define AMDGPU_TILING_SWIZZLE_MODE_SHIFT		0
 #define AMDGPU_TILING_SWIZZLE_MODE_MASK			0x1f
+#define AMDGPU_TILING_DCC_OFFSET_256B_SHIFT		5
+#define AMDGPU_TILING_DCC_OFFSET_256B_MASK		0xFFFFFF
+#define AMDGPU_TILING_DCC_PITCH_MAX_SHIFT		29
+#define AMDGPU_TILING_DCC_PITCH_MAX_MASK		0x3FFF
+#define AMDGPU_TILING_DCC_INDEPENDENT_64B_SHIFT		43
+#define AMDGPU_TILING_DCC_INDEPENDENT_64B_MASK		0x1
 
 /* Set/Get helpers for tiling flags. */
 #define AMDGPU_TILING_SET(field, value) \
-- 
cgit v1.2.3-59-g8ed1b


From 9349e23907be1954ccdf6d771d640e2788da1643 Mon Sep 17 00:00:00 2001
From: "Dmitry V. Levin" <ldv@altlinux.org>
Date: Thu, 1 Nov 2018 14:03:08 +0300
Subject: uapi: fix linux/kfd_ioctl.h userspace compilation errors

Consistently use types provided by <linux/types.h> via <drm/drm.h>
to fix the following linux/kfd_ioctl.h userspace compilation errors:

/usr/include/linux/kfd_ioctl.h:250:2: error: unknown type name 'uint32_t'
  uint32_t reset_type;
/usr/include/linux/kfd_ioctl.h:251:2: error: unknown type name 'uint32_t'
  uint32_t reset_cause;
/usr/include/linux/kfd_ioctl.h:252:2: error: unknown type name 'uint32_t'
  uint32_t memory_lost;
/usr/include/linux/kfd_ioctl.h:253:2: error: unknown type name 'uint32_t'
  uint32_t gpu_id;

Fixes: 0c119abad7f0d ("drm/amd: Add kfd ioctl defines for hw_exception event")
Cc: <stable@vger.kernel.org> # v4.19
Signed-off-by: Dmitry V. Levin <ldv@altlinux.org>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 include/uapi/linux/kfd_ioctl.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index f5ff8a76e208..dae897f38e59 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -255,10 +255,10 @@ struct kfd_hsa_memory_exception_data {
 
 /* hw exception data */
 struct kfd_hsa_hw_exception_data {
-	uint32_t reset_type;
-	uint32_t reset_cause;
-	uint32_t memory_lost;
-	uint32_t gpu_id;
+	__u32 reset_type;
+	__u32 reset_cause;
+	__u32 memory_lost;
+	__u32 gpu_id;
 };
 
 /* Event data */
-- 
cgit v1.2.3-59-g8ed1b


From 788012e33cb4ba7eedae6b621f0fa27f9196f2d0 Mon Sep 17 00:00:00 2001
From: "Dmitry V. Levin" <ldv@altlinux.org>
Date: Thu, 1 Nov 2018 14:03:28 +0300
Subject: uapi: fix more linux/kfd_ioctl.h userspace compilation errors

Consistently use types provided by <linux/types.h> via <drm/drm.h>
to fix struct kfd_ioctl_get_queue_wave_state_args userspace compilation errors.

Fixes: 5df099e8bc83f ("drm/amdkfd: Add wavefront context save state retrieval ioctl")
Signed-off-by: Dmitry V. Levin <ldv@altlinux.org>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 include/uapi/linux/kfd_ioctl.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index dae897f38e59..b01eb502d49c 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -83,11 +83,11 @@ struct kfd_ioctl_set_cu_mask_args {
 };
 
 struct kfd_ioctl_get_queue_wave_state_args {
-	uint64_t ctl_stack_address;	/* to KFD */
-	uint32_t ctl_stack_used_size;	/* from KFD */
-	uint32_t save_area_used_size;	/* from KFD */
-	uint32_t queue_id;		/* to KFD */
-	uint32_t pad;
+	__u64 ctl_stack_address;	/* to KFD */
+	__u32 ctl_stack_used_size;	/* from KFD */
+	__u32 save_area_used_size;	/* from KFD */
+	__u32 queue_id;			/* to KFD */
+	__u32 pad;
 };
 
 /* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */
-- 
cgit v1.2.3-59-g8ed1b


From e20cf8d3f1f763ad28a9cb3b41305b8a8a42653e Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 7 Nov 2018 12:38:29 +0100
Subject: udp: implement GRO for plain UDP sockets.

This is the RX counterpart of commit bec1f6f69736 ("udp: generate gso
with UDP_SEGMENT"). When UDP_GRO is enabled, such socket is also
eligible for GRO in the rx path: UDP segments directed to such socket
are assembled into a larger GSO_UDP_L4 packet.

The core UDP GRO support is enabled with setsockopt(UDP_GRO).

Initial benchmark numbers:

Before:
udp rx:   1079 MB/s   769065 calls/s

After:
udp rx:   1466 MB/s    24877 calls/s

This change introduces a side effect in respect to UDP tunnels:
after a UDP tunnel creation, now the kernel performs a lookup per ingress
UDP packet, while before such lookup happened only if the ingress packet
carried a valid internal header csum.

rfc v2 -> rfc v3:
 - fixed typos in macro name and comments
 - really enforce UDP_GRO_CNT_MAX, instead of UDP_GRO_CNT_MAX + 1
 - acquire socket lock in UDP_GRO setsockopt

rfc v1 -> rfc v2:
 - use a new option to enable UDP GRO
 - use static keys to protect the UDP GRO socket lookup

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/udp.h      |   3 +-
 include/uapi/linux/udp.h |   1 +
 net/ipv4/udp.c           |   8 ++++
 net/ipv4/udp_offload.c   | 109 +++++++++++++++++++++++++++++++++++++----------
 net/ipv6/udp_offload.c   |   6 +--
 5 files changed, 99 insertions(+), 28 deletions(-)

(limited to 'include/uapi')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index a4dafff407fb..f613b329852e 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -50,11 +50,12 @@ struct udp_sock {
 	__u8		 encap_type;	/* Is this an Encapsulation socket? */
 	unsigned char	 no_check6_tx:1,/* Send zero UDP6 checksums on TX? */
 			 no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */
-			 encap_enabled:1; /* This socket enabled encap
+			 encap_enabled:1, /* This socket enabled encap
 					   * processing; UDP tunnels and
 					   * different encapsulation layer set
 					   * this
 					   */
+			 gro_enabled:1;	/* Can accept GRO packets */
 	/*
 	 * Following member retains the information to create a UDP header
 	 * when the socket is uncorked.
diff --git a/include/uapi/linux/udp.h b/include/uapi/linux/udp.h
index 09502de447f5..30baccb6c9c4 100644
--- a/include/uapi/linux/udp.h
+++ b/include/uapi/linux/udp.h
@@ -33,6 +33,7 @@ struct udphdr {
 #define UDP_NO_CHECK6_TX 101	/* Disable sending checksum for UDP6X */
 #define UDP_NO_CHECK6_RX 102	/* Disable accpeting checksum for UDP6 */
 #define UDP_SEGMENT	103	/* Set GSO segmentation size */
+#define UDP_GRO		104	/* This socket can receive UDP GRO packets */
 
 /* UDP encapsulation types */
 #define UDP_ENCAP_ESPINUDP_NON_IKE	1 /* draft-ietf-ipsec-nat-t-ike-00/01 */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index f81409921e27..9fc08b098ced 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2473,6 +2473,14 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
 		up->gso_size = val;
 		break;
 
+	case UDP_GRO:
+		lock_sock(sk);
+		if (valbool)
+			udp_tunnel_encap_enable(sk->sk_socket);
+		up->gro_enabled = valbool;
+		release_sock(sk);
+		break;
+
 	/*
 	 * 	UDP-Lite's partial checksum coverage (RFC 3828).
 	 */
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 802f2bc00d69..0646d61f4fa8 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -343,6 +343,54 @@ out:
 	return segs;
 }
 
+#define UDP_GRO_CNT_MAX 64
+static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
+					       struct sk_buff *skb)
+{
+	struct udphdr *uh = udp_hdr(skb);
+	struct sk_buff *pp = NULL;
+	struct udphdr *uh2;
+	struct sk_buff *p;
+
+	/* requires non zero csum, for symmetry with GSO */
+	if (!uh->check) {
+		NAPI_GRO_CB(skb)->flush = 1;
+		return NULL;
+	}
+
+	/* pull encapsulating udp header */
+	skb_gro_pull(skb, sizeof(struct udphdr));
+	skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
+
+	list_for_each_entry(p, head, list) {
+		if (!NAPI_GRO_CB(p)->same_flow)
+			continue;
+
+		uh2 = udp_hdr(p);
+
+		/* Match ports only, as csum is always non zero */
+		if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) {
+			NAPI_GRO_CB(p)->same_flow = 0;
+			continue;
+		}
+
+		/* Terminate the flow on len mismatch or if it grow "too much".
+		 * Under small packet flood GRO count could elsewhere grow a lot
+		 * leading to execessive truesize values
+		 */
+		if (!skb_gro_receive(p, skb) &&
+		    NAPI_GRO_CB(p)->count >= UDP_GRO_CNT_MAX)
+			pp = p;
+		else if (uh->len != uh2->len)
+			pp = p;
+
+		return pp;
+	}
+
+	/* mismatch, but we never need to flush */
+	return NULL;
+}
+
 struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
 				struct udphdr *uh, udp_lookup_t lookup)
 {
@@ -353,23 +401,27 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
 	int flush = 1;
 	struct sock *sk;
 
+	rcu_read_lock();
+	sk = (*lookup)(skb, uh->source, uh->dest);
+	if (!sk)
+		goto out_unlock;
+
+	if (udp_sk(sk)->gro_enabled) {
+		pp = call_gro_receive(udp_gro_receive_segment, head, skb);
+		rcu_read_unlock();
+		return pp;
+	}
+
 	if (NAPI_GRO_CB(skb)->encap_mark ||
 	    (skb->ip_summed != CHECKSUM_PARTIAL &&
 	     NAPI_GRO_CB(skb)->csum_cnt == 0 &&
-	     !NAPI_GRO_CB(skb)->csum_valid))
-		goto out;
+	     !NAPI_GRO_CB(skb)->csum_valid) ||
+	    !udp_sk(sk)->gro_receive)
+		goto out_unlock;
 
 	/* mark that this skb passed once through the tunnel gro layer */
 	NAPI_GRO_CB(skb)->encap_mark = 1;
 
-	rcu_read_lock();
-	sk = (*lookup)(skb, uh->source, uh->dest);
-
-	if (sk && udp_sk(sk)->gro_receive)
-		goto unflush;
-	goto out_unlock;
-
-unflush:
 	flush = 0;
 
 	list_for_each_entry(p, head, list) {
@@ -394,7 +446,6 @@ unflush:
 
 out_unlock:
 	rcu_read_unlock();
-out:
 	skb_gro_flush_final(skb, pp, flush);
 	return pp;
 }
@@ -427,6 +478,19 @@ flush:
 	return NULL;
 }
 
+static int udp_gro_complete_segment(struct sk_buff *skb)
+{
+	struct udphdr *uh = udp_hdr(skb);
+
+	skb->csum_start = (unsigned char *)uh - skb->head;
+	skb->csum_offset = offsetof(struct udphdr, check);
+	skb->ip_summed = CHECKSUM_PARTIAL;
+
+	skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+	skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_L4;
+	return 0;
+}
+
 int udp_gro_complete(struct sk_buff *skb, int nhoff,
 		     udp_lookup_t lookup)
 {
@@ -437,16 +501,21 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff,
 
 	uh->len = newlen;
 
-	/* Set encapsulation before calling into inner gro_complete() functions
-	 * to make them set up the inner offsets.
-	 */
-	skb->encapsulation = 1;
-
 	rcu_read_lock();
 	sk = (*lookup)(skb, uh->source, uh->dest);
-	if (sk && udp_sk(sk)->gro_complete)
+	if (sk && udp_sk(sk)->gro_enabled) {
+		err = udp_gro_complete_segment(skb);
+	} else if (sk && udp_sk(sk)->gro_complete) {
+		skb_shinfo(skb)->gso_type = uh->check ? SKB_GSO_UDP_TUNNEL_CSUM
+					: SKB_GSO_UDP_TUNNEL;
+
+		/* Set encapsulation before calling into inner gro_complete()
+		 * functions to make them set up the inner offsets.
+		 */
+		skb->encapsulation = 1;
 		err = udp_sk(sk)->gro_complete(sk, skb,
 				nhoff + sizeof(struct udphdr));
+	}
 	rcu_read_unlock();
 
 	if (skb->remcsum_offload)
@@ -461,13 +530,9 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
 	const struct iphdr *iph = ip_hdr(skb);
 	struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
 
-	if (uh->check) {
-		skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
+	if (uh->check)
 		uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr,
 					  iph->daddr, 0);
-	} else {
-		skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
-	}
 
 	return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb);
 }
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 1b8e161ac527..828b2457f97b 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -147,13 +147,9 @@ static int udp6_gro_complete(struct sk_buff *skb, int nhoff)
 	const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
 	struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
 
-	if (uh->check) {
-		skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
+	if (uh->check)
 		uh->check = ~udp_v6_check(skb->len - nhoff, &ipv6h->saddr,
 					  &ipv6h->daddr, 0);
-	} else {
-		skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
-	}
 
 	return udp_gro_complete(skb, nhoff, udp6_lib_lookup_skb);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 783195ec1cada862d54dee8f312a60bcbba5c0e4 Mon Sep 17 00:00:00 2001
From: Christian König <christian.koenig@amd.com>
Date: Thu, 8 Nov 2018 09:39:46 +0100
Subject: drm/syncobj: disable the timeline UAPI for now v2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Until we have sorted out all problems.

v2: return -EINVAL during create if flag is set.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/260937/
---
 drivers/gpu/drm/drm_syncobj.c | 4 ++++
 include/drm/drm_syncobj.h     | 3 +++
 include/uapi/drm/drm.h        | 1 -
 3 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c
index da8175d9c6ff..da2b85eec6cf 100644
--- a/drivers/gpu/drm/drm_syncobj.c
+++ b/drivers/gpu/drm/drm_syncobj.c
@@ -483,6 +483,10 @@ int drm_syncobj_create(struct drm_syncobj **out_syncobj, uint32_t flags,
 	int ret;
 	struct drm_syncobj *syncobj;
 
+	/* Disabled for now */
+	if (flags & DRM_SYNCOBJ_CREATE_TYPE_TIMELINE)
+		return -EINVAL;
+
 	syncobj = kzalloc(sizeof(struct drm_syncobj), GFP_KERNEL);
 	if (!syncobj)
 		return -ENOMEM;
diff --git a/include/drm/drm_syncobj.h b/include/drm/drm_syncobj.h
index 29244cbcd05e..ffd1f4fcf519 100644
--- a/include/drm/drm_syncobj.h
+++ b/include/drm/drm_syncobj.h
@@ -30,6 +30,9 @@
 
 struct drm_syncobj_cb;
 
+/* Move the define here for the moment to avoid exposing the UAPI just yet */
+#define DRM_SYNCOBJ_CREATE_TYPE_TIMELINE (1 << 1)
+
 enum drm_syncobj_type {
 	DRM_SYNCOBJ_TYPE_BINARY,
 	DRM_SYNCOBJ_TYPE_TIMELINE
diff --git a/include/uapi/drm/drm.h b/include/uapi/drm/drm.h
index cebdb2541eb7..300f336633f2 100644
--- a/include/uapi/drm/drm.h
+++ b/include/uapi/drm/drm.h
@@ -717,7 +717,6 @@ struct drm_prime_handle {
 struct drm_syncobj_create {
 	__u32 handle;
 #define DRM_SYNCOBJ_CREATE_SIGNALED (1 << 0)
-#define DRM_SYNCOBJ_CREATE_TYPE_TIMELINE (1 << 1)
 	__u32 flags;
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From b4d3069783bccf0c965468da7db141d359d796fc Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 8 Nov 2018 12:19:16 +0100
Subject: vxlan: Allow configuration of DF behaviour

Allow users to set the IPv4 DF bit in outgoing packets, or to inherit its
value from the IPv4 inner header. If the encapsulated protocol is IPv6 and
DF is configured to be inherited, always set it.

For IPv4, inheriting DF from the inner header was probably intended from
the very beginning judging by the comment to vxlan_xmit(), but it wasn't
actually implemented -- also because it would have done more harm than
good, without handling for ICMP Fragmentation Needed messages.

According to RFC 7348, "Path MTU discovery MAY be used". An expired RFC
draft, draft-saum-nvo3-pmtud-over-vxlan-05, whose purpose was to describe
PMTUD implementation, says that "is a MUST that Vxlan gateways [...]
SHOULD set the DF-bit [...]", whatever that means.

Given this background, the only sane option is probably to let the user
decide, and keep the current behaviour as default.

This only applies to non-lwt tunnels: if an external control plane is
used, tunnel key will still control the DF flag.

v2:
- DF behaviour configuration only applies for non-lwt tunnels, move DF
  setting to if (!info) block in vxlan_xmit_one() (Stephen Hemminger)

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c          | 29 ++++++++++++++++++++++++++++-
 include/net/vxlan.h          |  1 +
 include/uapi/linux/if_link.h |  9 +++++++++
 3 files changed, 38 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 0851af6733f3..c3e65e78f015 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2278,13 +2278,24 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 			goto tx_error;
 		}
 
-		/* Bypass encapsulation if the destination is local */
 		if (!info) {
+			/* Bypass encapsulation if the destination is local */
 			err = encap_bypass_if_local(skb, dev, vxlan, dst,
 						    dst_port, ifindex, vni,
 						    &rt->dst, rt->rt_flags);
 			if (err)
 				goto out_unlock;
+
+			if (vxlan->cfg.df == VXLAN_DF_SET) {
+				df = htons(IP_DF);
+			} else if (vxlan->cfg.df == VXLAN_DF_INHERIT) {
+				struct ethhdr *eth = eth_hdr(skb);
+
+				if (ntohs(eth->h_proto) == ETH_P_IPV6 ||
+				    (ntohs(eth->h_proto) == ETH_P_IP &&
+				     old_iph->frag_off & htons(IP_DF)))
+					df = htons(IP_DF);
+			}
 		} else if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) {
 			df = htons(IP_DF);
 		}
@@ -2837,6 +2848,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
 	[IFLA_VXLAN_GPE]	= { .type = NLA_FLAG, },
 	[IFLA_VXLAN_REMCSUM_NOPARTIAL]	= { .type = NLA_FLAG },
 	[IFLA_VXLAN_TTL_INHERIT]	= { .type = NLA_FLAG },
+	[IFLA_VXLAN_DF]		= { .type = NLA_U8 },
 };
 
 static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
@@ -2893,6 +2905,16 @@ static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
 		}
 	}
 
+	if (data[IFLA_VXLAN_DF]) {
+		enum ifla_vxlan_df df = nla_get_u8(data[IFLA_VXLAN_DF]);
+
+		if (df < 0 || df > VXLAN_DF_MAX) {
+			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_DF],
+					    "Invalid DF attribute");
+			return -EINVAL;
+		}
+	}
+
 	return 0;
 }
 
@@ -3538,6 +3560,9 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
 		conf->mtu = nla_get_u32(tb[IFLA_MTU]);
 	}
 
+	if (data[IFLA_VXLAN_DF])
+		conf->df = nla_get_u8(data[IFLA_VXLAN_DF]);
+
 	return 0;
 }
 
@@ -3630,6 +3655,7 @@ static size_t vxlan_get_size(const struct net_device *dev)
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL_INHERIT */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
+		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_DF */
 		nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_PROXY */
@@ -3696,6 +3722,7 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	    nla_put_u8(skb, IFLA_VXLAN_TTL_INHERIT,
 		       !!(vxlan->cfg.flags & VXLAN_F_TTL_INHERIT)) ||
 	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) ||
+	    nla_put_u8(skb, IFLA_VXLAN_DF, vxlan->cfg.df) ||
 	    nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) ||
 	    nla_put_u8(skb, IFLA_VXLAN_LEARNING,
 			!!(vxlan->cfg.flags & VXLAN_F_LEARN)) ||
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 03431c148e16..ec999c49df1f 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -216,6 +216,7 @@ struct vxlan_config {
 	unsigned long		age_interval;
 	unsigned int		addrmax;
 	bool			no_share;
+	enum ifla_vxlan_df	df;
 };
 
 struct vxlan_dev_node {
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 1debfa42cba1..efc588949431 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -533,6 +533,7 @@ enum {
 	IFLA_VXLAN_LABEL,
 	IFLA_VXLAN_GPE,
 	IFLA_VXLAN_TTL_INHERIT,
+	IFLA_VXLAN_DF,
 	__IFLA_VXLAN_MAX
 };
 #define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
@@ -542,6 +543,14 @@ struct ifla_vxlan_port_range {
 	__be16	high;
 };
 
+enum ifla_vxlan_df {
+	VXLAN_DF_UNSET = 0,
+	VXLAN_DF_SET,
+	VXLAN_DF_INHERIT,
+	__VXLAN_DF_END,
+	VXLAN_DF_MAX = __VXLAN_DF_END - 1,
+};
+
 /* GENEVE section */
 enum {
 	IFLA_GENEVE_UNSPEC,
-- 
cgit v1.2.3-59-g8ed1b


From a025fb5f49ad38cf749753b16fcd031d0d678f2b Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 8 Nov 2018 12:19:19 +0100
Subject: geneve: Allow configuration of DF behaviour

draft-ietf-nvo3-geneve-08 says:

   It is strongly RECOMMENDED that Path MTU Discovery ([RFC1191],
   [RFC1981]) be used by setting the DF bit in the IP header when Geneve
   packets are transmitted over IPv4 (this is the default with IPv6).

Now that ICMP error handling is working for GENEVE, we can comply with
this recommendation.

Make this configurable, though, to avoid breaking existing setups. By
default, DF won't be set. It can be set or inherited from inner IPv4
packets. If it's configured to be inherited and we are encapsulating IPv6,
it will be set.

This only applies to non-lwt tunnels: if an external control plane is
used, tunnel key will still control the DF flag.

v2:
- DF behaviour configuration only applies for non-lwt tunnels, apply DF
  setting only if (!geneve->collect_md) in geneve_xmit_skb()
  (Stephen Hemminger)

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c         | 55 +++++++++++++++++++++++++++++++++++++-------
 include/uapi/linux/if_link.h |  9 ++++++++
 2 files changed, 56 insertions(+), 8 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 4dc457489770..7c53e06b31c3 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -70,6 +70,7 @@ struct geneve_dev {
 	bool		   collect_md;
 	bool		   use_udp6_rx_checksums;
 	bool		   ttl_inherit;
+	enum ifla_geneve_df df;
 };
 
 struct geneve_sock {
@@ -875,8 +876,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	struct rtable *rt;
 	struct flowi4 fl4;
 	__u8 tos, ttl;
+	__be16 df = 0;
 	__be16 sport;
-	__be16 df;
 	int err;
 
 	rt = geneve_get_v4_rt(skb, dev, gs4, &fl4, info);
@@ -890,6 +891,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	if (geneve->collect_md) {
 		tos = ip_tunnel_ecn_encap(key->tos, ip_hdr(skb), skb);
 		ttl = key->ttl;
+
+		df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
 	} else {
 		tos = ip_tunnel_ecn_encap(fl4.flowi4_tos, ip_hdr(skb), skb);
 		if (geneve->ttl_inherit)
@@ -897,8 +900,22 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 		else
 			ttl = key->ttl;
 		ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
+
+		if (geneve->df == GENEVE_DF_SET) {
+			df = htons(IP_DF);
+		} else if (geneve->df == GENEVE_DF_INHERIT) {
+			struct ethhdr *eth = eth_hdr(skb);
+
+			if (ntohs(eth->h_proto) == ETH_P_IPV6) {
+				df = htons(IP_DF);
+			} else if (ntohs(eth->h_proto) == ETH_P_IP) {
+				struct iphdr *iph = ip_hdr(skb);
+
+				if (iph->frag_off & htons(IP_DF))
+					df = htons(IP_DF);
+			}
+		}
 	}
-	df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
 
 	err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr));
 	if (unlikely(err))
@@ -1145,6 +1162,7 @@ static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = {
 	[IFLA_GENEVE_UDP_ZERO_CSUM6_TX]	= { .type = NLA_U8 },
 	[IFLA_GENEVE_UDP_ZERO_CSUM6_RX]	= { .type = NLA_U8 },
 	[IFLA_GENEVE_TTL_INHERIT]	= { .type = NLA_U8 },
+	[IFLA_GENEVE_DF]		= { .type = NLA_U8 },
 };
 
 static int geneve_validate(struct nlattr *tb[], struct nlattr *data[],
@@ -1180,6 +1198,16 @@ static int geneve_validate(struct nlattr *tb[], struct nlattr *data[],
 		}
 	}
 
+	if (data[IFLA_GENEVE_DF]) {
+		enum ifla_geneve_df df = nla_get_u8(data[IFLA_GENEVE_DF]);
+
+		if (df < 0 || df > GENEVE_DF_MAX) {
+			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_GENEVE_DF],
+					    "Invalid DF attribute");
+			return -EINVAL;
+		}
+	}
+
 	return 0;
 }
 
@@ -1225,7 +1253,7 @@ static int geneve_configure(struct net *net, struct net_device *dev,
 			    struct netlink_ext_ack *extack,
 			    const struct ip_tunnel_info *info,
 			    bool metadata, bool ipv6_rx_csum,
-			    bool ttl_inherit)
+			    bool ttl_inherit, enum ifla_geneve_df df)
 {
 	struct geneve_net *gn = net_generic(net, geneve_net_id);
 	struct geneve_dev *t, *geneve = netdev_priv(dev);
@@ -1275,6 +1303,7 @@ static int geneve_configure(struct net *net, struct net_device *dev,
 	geneve->collect_md = metadata;
 	geneve->use_udp6_rx_checksums = ipv6_rx_csum;
 	geneve->ttl_inherit = ttl_inherit;
+	geneve->df = df;
 
 	err = register_netdevice(dev);
 	if (err)
@@ -1294,7 +1323,7 @@ static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[],
 			  struct netlink_ext_ack *extack,
 			  struct ip_tunnel_info *info, bool *metadata,
 			  bool *use_udp6_rx_checksums, bool *ttl_inherit,
-			  bool changelink)
+			  enum ifla_geneve_df *df, bool changelink)
 {
 	int attrtype;
 
@@ -1382,6 +1411,9 @@ static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[],
 	if (data[IFLA_GENEVE_TOS])
 		info->key.tos = nla_get_u8(data[IFLA_GENEVE_TOS]);
 
+	if (data[IFLA_GENEVE_DF])
+		*df = nla_get_u8(data[IFLA_GENEVE_DF]);
+
 	if (data[IFLA_GENEVE_LABEL]) {
 		info->key.label = nla_get_be32(data[IFLA_GENEVE_LABEL]) &
 				  IPV6_FLOWLABEL_MASK;
@@ -1500,6 +1532,7 @@ static int geneve_newlink(struct net *net, struct net_device *dev,
 			  struct nlattr *tb[], struct nlattr *data[],
 			  struct netlink_ext_ack *extack)
 {
+	enum ifla_geneve_df df = GENEVE_DF_UNSET;
 	bool use_udp6_rx_checksums = false;
 	struct ip_tunnel_info info;
 	bool ttl_inherit = false;
@@ -1508,12 +1541,12 @@ static int geneve_newlink(struct net *net, struct net_device *dev,
 
 	init_tnl_info(&info, GENEVE_UDP_PORT);
 	err = geneve_nl2info(tb, data, extack, &info, &metadata,
-			     &use_udp6_rx_checksums, &ttl_inherit, false);
+			     &use_udp6_rx_checksums, &ttl_inherit, &df, false);
 	if (err)
 		return err;
 
 	err = geneve_configure(net, dev, extack, &info, metadata,
-			       use_udp6_rx_checksums, ttl_inherit);
+			       use_udp6_rx_checksums, ttl_inherit, df);
 	if (err)
 		return err;
 
@@ -1576,6 +1609,7 @@ static int geneve_changelink(struct net_device *dev, struct nlattr *tb[],
 	struct ip_tunnel_info info;
 	bool metadata;
 	bool use_udp6_rx_checksums;
+	enum ifla_geneve_df df;
 	bool ttl_inherit;
 	int err;
 
@@ -1591,7 +1625,7 @@ static int geneve_changelink(struct net_device *dev, struct nlattr *tb[],
 	use_udp6_rx_checksums = geneve->use_udp6_rx_checksums;
 	ttl_inherit = geneve->ttl_inherit;
 	err = geneve_nl2info(tb, data, extack, &info, &metadata,
-			     &use_udp6_rx_checksums, &ttl_inherit, true);
+			     &use_udp6_rx_checksums, &ttl_inherit, &df, true);
 	if (err)
 		return err;
 
@@ -1624,6 +1658,7 @@ static size_t geneve_get_size(const struct net_device *dev)
 		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_GENEVE_REMOTE{6} */
 		nla_total_size(sizeof(__u8)) +  /* IFLA_GENEVE_TTL */
 		nla_total_size(sizeof(__u8)) +  /* IFLA_GENEVE_TOS */
+		nla_total_size(sizeof(__u8)) +	/* IFLA_GENEVE_DF */
 		nla_total_size(sizeof(__be32)) +  /* IFLA_GENEVE_LABEL */
 		nla_total_size(sizeof(__be16)) +  /* IFLA_GENEVE_PORT */
 		nla_total_size(0) +	 /* IFLA_GENEVE_COLLECT_METADATA */
@@ -1672,6 +1707,9 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	    nla_put_be32(skb, IFLA_GENEVE_LABEL, info->key.label))
 		goto nla_put_failure;
 
+	if (nla_put_u8(skb, IFLA_GENEVE_DF, geneve->df))
+		goto nla_put_failure;
+
 	if (nla_put_be16(skb, IFLA_GENEVE_PORT, info->key.tp_dst))
 		goto nla_put_failure;
 
@@ -1723,7 +1761,8 @@ struct net_device *geneve_dev_create_fb(struct net *net, const char *name,
 		return dev;
 
 	init_tnl_info(&info, dst_port);
-	err = geneve_configure(net, dev, NULL, &info, true, true, false);
+	err = geneve_configure(net, dev, NULL, &info,
+			       true, true, false, GENEVE_DF_UNSET);
 	if (err) {
 		free_netdev(dev);
 		return ERR_PTR(err);
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index efc588949431..f42c069d81db 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -566,10 +566,19 @@ enum {
 	IFLA_GENEVE_UDP_ZERO_CSUM6_RX,
 	IFLA_GENEVE_LABEL,
 	IFLA_GENEVE_TTL_INHERIT,
+	IFLA_GENEVE_DF,
 	__IFLA_GENEVE_MAX
 };
 #define IFLA_GENEVE_MAX	(__IFLA_GENEVE_MAX - 1)
 
+enum ifla_geneve_df {
+	GENEVE_DF_UNSET = 0,
+	GENEVE_DF_SET,
+	GENEVE_DF_INHERIT,
+	__GENEVE_DF_END,
+	GENEVE_DF_MAX = __GENEVE_DF_END - 1,
+};
+
 /* PPP section */
 enum {
 	IFLA_PPP_UNSPEC,
-- 
cgit v1.2.3-59-g8ed1b


From c8123ead13a5c92dc5fd15c0fdfe88eef41e6ac1 Mon Sep 17 00:00:00 2001
From: Nitin Hande <nitin.hande@gmail.com>
Date: Sun, 28 Oct 2018 21:02:45 -0700
Subject: bpf: Extend the sk_lookup() helper to XDP hookpoint.

This patch proposes to extend the sk_lookup() BPF API to the
XDP hookpoint. The sk_lookup() helper supports a lookup
on incoming packet to find the corresponding socket that will
receive this packet. Current support for this BPF API is
at the tc hookpoint. This patch will extend this API at XDP
hookpoint. A XDP program can map the incoming packet to the
5-tuple parameter and invoke the API to find the corresponding
socket structure.

Signed-off-by: Nitin Hande <Nitin.Hande@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h |   4 ++
 net/core/filter.c        | 105 ++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 90 insertions(+), 19 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 852dc17ab47a..47d606d744cc 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2201,6 +2201,8 @@ union bpf_attr {
  *		**CONFIG_NET** configuration option.
  *	Return
  *		Pointer to *struct bpf_sock*, or NULL in case of failure.
+ *		For sockets with reuseport option, *struct bpf_sock*
+ *		return is from reuse->socks[] using hash of the packet.
  *
  * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags)
  *	Description
@@ -2233,6 +2235,8 @@ union bpf_attr {
  *		**CONFIG_NET** configuration option.
  *	Return
  *		Pointer to *struct bpf_sock*, or NULL in case of failure.
+ *		For sockets with reuseport option, *struct bpf_sock*
+ *		return is from reuse->socks[] using hash of the packet.
  *
  * int bpf_sk_release(struct bpf_sock *sk)
  *	Description
diff --git a/net/core/filter.c b/net/core/filter.c
index ba97a6bee6f9..53d50fb75ea1 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4845,38 +4845,32 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
 
 #ifdef CONFIG_INET
 static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
-			      struct sk_buff *skb, u8 family, u8 proto)
+			      int dif, int sdif, u8 family, u8 proto)
 {
 	bool refcounted = false;
 	struct sock *sk = NULL;
-	int dif = 0;
-
-	if (skb->dev)
-		dif = skb->dev->ifindex;
 
 	if (family == AF_INET) {
 		__be32 src4 = tuple->ipv4.saddr;
 		__be32 dst4 = tuple->ipv4.daddr;
-		int sdif = inet_sdif(skb);
 
 		if (proto == IPPROTO_TCP)
-			sk = __inet_lookup(net, &tcp_hashinfo, skb, 0,
+			sk = __inet_lookup(net, &tcp_hashinfo, NULL, 0,
 					   src4, tuple->ipv4.sport,
 					   dst4, tuple->ipv4.dport,
 					   dif, sdif, &refcounted);
 		else
 			sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport,
 					       dst4, tuple->ipv4.dport,
-					       dif, sdif, &udp_table, skb);
+					       dif, sdif, &udp_table, NULL);
 #if IS_ENABLED(CONFIG_IPV6)
 	} else {
 		struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr;
 		struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr;
 		u16 hnum = ntohs(tuple->ipv6.dport);
-		int sdif = inet6_sdif(skb);
 
 		if (proto == IPPROTO_TCP)
-			sk = __inet6_lookup(net, &tcp_hashinfo, skb, 0,
+			sk = __inet6_lookup(net, &tcp_hashinfo, NULL, 0,
 					    src6, tuple->ipv6.sport,
 					    dst6, hnum,
 					    dif, sdif, &refcounted);
@@ -4885,7 +4879,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
 							    src6, tuple->ipv6.sport,
 							    dst6, hnum,
 							    dif, sdif,
-							    &udp_table, skb);
+							    &udp_table, NULL);
 #endif
 	}
 
@@ -4902,31 +4896,33 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
  * callers to satisfy BPF_CALL declarations.
  */
 static unsigned long
-bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
-	      u8 proto, u64 netns_id, u64 flags)
+__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
+		struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
+		u64 flags)
 {
-	struct net *caller_net;
 	struct sock *sk = NULL;
 	u8 family = AF_UNSPEC;
 	struct net *net;
+	int sdif;
 
 	family = len == sizeof(tuple->ipv4) ? AF_INET : AF_INET6;
 	if (unlikely(family == AF_UNSPEC || netns_id > U32_MAX || flags))
 		goto out;
 
-	if (skb->dev)
-		caller_net = dev_net(skb->dev);
+	if (family == AF_INET)
+		sdif = inet_sdif(skb);
 	else
-		caller_net = sock_net(skb->sk);
+		sdif = inet6_sdif(skb);
+
 	if (netns_id) {
 		net = get_net_ns_by_id(caller_net, netns_id);
 		if (unlikely(!net))
 			goto out;
-		sk = sk_lookup(net, tuple, skb, family, proto);
+		sk = sk_lookup(net, tuple, ifindex, sdif, family, proto);
 		put_net(net);
 	} else {
 		net = caller_net;
-		sk = sk_lookup(net, tuple, skb, family, proto);
+		sk = sk_lookup(net, tuple, ifindex, sdif, family, proto);
 	}
 
 	if (sk)
@@ -4935,6 +4931,25 @@ out:
 	return (unsigned long) sk;
 }
 
+static unsigned long
+bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
+	      u8 proto, u64 netns_id, u64 flags)
+{
+	struct net *caller_net;
+	int ifindex;
+
+	if (skb->dev) {
+		caller_net = dev_net(skb->dev);
+		ifindex = skb->dev->ifindex;
+	} else {
+		caller_net = sock_net(skb->sk);
+		ifindex = 0;
+	}
+
+	return __bpf_sk_lookup(skb, tuple, len, caller_net, ifindex,
+			      proto, netns_id, flags);
+}
+
 BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb,
 	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
 {
@@ -4984,6 +4999,50 @@ static const struct bpf_func_proto bpf_sk_release_proto = {
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_SOCKET,
 };
+
+BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx,
+	   struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
+{
+	struct net *caller_net = dev_net(ctx->rxq->dev);
+	int ifindex = ctx->rxq->dev->ifindex;
+
+	return __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex,
+			      IPPROTO_UDP, netns_id, flags);
+}
+
+static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = {
+	.func           = bpf_xdp_sk_lookup_udp,
+	.gpl_only       = false,
+	.pkt_access     = true,
+	.ret_type       = RET_PTR_TO_SOCKET_OR_NULL,
+	.arg1_type      = ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_PTR_TO_MEM,
+	.arg3_type      = ARG_CONST_SIZE,
+	.arg4_type      = ARG_ANYTHING,
+	.arg5_type      = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx,
+	   struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
+{
+	struct net *caller_net = dev_net(ctx->rxq->dev);
+	int ifindex = ctx->rxq->dev->ifindex;
+
+	return __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex,
+			      IPPROTO_TCP, netns_id, flags);
+}
+
+static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = {
+	.func           = bpf_xdp_sk_lookup_tcp,
+	.gpl_only       = false,
+	.pkt_access     = true,
+	.ret_type       = RET_PTR_TO_SOCKET_OR_NULL,
+	.arg1_type      = ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_PTR_TO_MEM,
+	.arg3_type      = ARG_CONST_SIZE,
+	.arg4_type      = ARG_ANYTHING,
+	.arg5_type      = ARG_ANYTHING,
+};
 #endif /* CONFIG_INET */
 
 bool bpf_helper_changes_pkt_data(void *func)
@@ -5234,6 +5293,14 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_xdp_adjust_tail_proto;
 	case BPF_FUNC_fib_lookup:
 		return &bpf_xdp_fib_lookup_proto;
+#ifdef CONFIG_INET
+	case BPF_FUNC_sk_lookup_udp:
+		return &bpf_xdp_sk_lookup_udp_proto;
+	case BPF_FUNC_sk_lookup_tcp:
+		return &bpf_xdp_sk_lookup_tcp_proto;
+	case BPF_FUNC_sk_release:
+		return &bpf_sk_release_proto;
+#endif
 	default:
 		return bpf_base_func_proto(func_id);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 9bb7e0f24e7e7d00daa1219b14539e2e602649b2 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 10 Sep 2018 13:29:12 +0200
Subject: cfg80211: add peer measurement with FTM initiator API

Add a new "peer measurement" API, that can be used to measure
certain things related to a peer. Right now, only implement
FTM (flight time measurement) over it, but the idea is that
it'll be extensible to also support measuring the necessary
things to calculate e.g. angle-of-arrival for WiGig.

The API is structured to have a generic list of peers and
channels to measure with/on, and then for each of those a
set of measurements (again, only FTM right now) to perform.

Results are sent to the requesting socket, including a final
complete message.

Closing the controlling netlink socket will abort a running
measurement.

v3:
 - add a bit to report "final" for partial results
 - remove list keeping etc. and just unicast out the results
   to the requester (big code reduction ...)
 - also send complete message unicast, and as a result
   remove the multicast group
 - separate out struct cfg80211_pmsr_ftm_request_peer
   from struct cfg80211_pmsr_request_peer
 - document timeout == 0 if no timeout
 - disallow setting timeout nl80211 attribute to 0,
   must not include attribute for no timeout
 - make MAC address randomization optional
 - change num bursts exponent default to 0 (1 burst, rather
   rather than the old default of 15==don't care)

v4:
 - clarify NL80211_ATTR_TIMEOUT documentation

v5:
 - remove unnecessary nl80211 multicast/family changes
 - remove partial results bit/flag, final is sufficient
 - add max_bursts_exponent, max_ftms_per_burst to capability
 - rename "frames per burst" -> "FTMs per burst"

v6:
 - rename cfg80211_pmsr_free_wdev() to cfg80211_pmsr_wdev_down()
   and call it in leave, so the device can't go down with any
   pending measurements

v7:
 - wording fixes (Lior)
 - fix ftm.max_bursts_exponent to allow having the limit of 0 (Lior)

v8:
 - copyright statements
 - minor coding style fixes
 - fix error path leak

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 263 +++++++++++++++++++
 include/uapi/linux/nl80211.h | 418 ++++++++++++++++++++++++++++++
 net/wireless/Makefile        |   1 +
 net/wireless/core.c          |  34 +++
 net/wireless/core.h          |   5 +
 net/wireless/nl80211.c       | 192 ++++++++++++--
 net/wireless/nl80211.h       |  32 +++
 net/wireless/pmsr.c          | 590 +++++++++++++++++++++++++++++++++++++++++++
 net/wireless/rdev-ops.h      |  25 ++
 net/wireless/trace.h         |  68 +++++
 10 files changed, 1609 insertions(+), 19 deletions(-)
 create mode 100644 net/wireless/pmsr.c

(limited to 'include/uapi')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 1fa41b7a1be3..c21c5c70a2fd 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2848,6 +2848,190 @@ struct cfg80211_ftm_responder_stats {
 	u32 out_of_window_triggers_num;
 };
 
+/**
+ * struct cfg80211_pmsr_ftm_result - FTM result
+ * @failure_reason: if this measurement failed (PMSR status is
+ *	%NL80211_PMSR_STATUS_FAILURE), this gives a more precise
+ *	reason than just "failure"
+ * @burst_index: if reporting partial results, this is the index
+ *	in [0 .. num_bursts-1] of the burst that's being reported
+ * @num_ftmr_attempts: number of FTM request frames transmitted
+ * @num_ftmr_successes: number of FTM request frames acked
+ * @busy_retry_time: if failure_reason is %NL80211_PMSR_FTM_FAILURE_PEER_BUSY,
+ *	fill this to indicate in how many seconds a retry is deemed possible
+ *	by the responder
+ * @num_bursts_exp: actual number of bursts exponent negotiated
+ * @burst_duration: actual burst duration negotiated
+ * @ftms_per_burst: actual FTMs per burst negotiated
+ * @lci_len: length of LCI information (if present)
+ * @civicloc_len: length of civic location information (if present)
+ * @lci: LCI data (may be %NULL)
+ * @civicloc: civic location data (may be %NULL)
+ * @rssi_avg: average RSSI over FTM action frames reported
+ * @rssi_spread: spread of the RSSI over FTM action frames reported
+ * @tx_rate: bitrate for transmitted FTM action frame response
+ * @rx_rate: bitrate of received FTM action frame
+ * @rtt_avg: average of RTTs measured (must have either this or @dist_avg)
+ * @rtt_variance: variance of RTTs measured (note that standard deviation is
+ *	the square root of the variance)
+ * @rtt_spread: spread of the RTTs measured
+ * @dist_avg: average of distances (mm) measured
+ *	(must have either this or @rtt_avg)
+ * @dist_variance: variance of distances measured (see also @rtt_variance)
+ * @dist_spread: spread of distances measured (see also @rtt_spread)
+ * @num_ftmr_attempts_valid: @num_ftmr_attempts is valid
+ * @num_ftmr_successes_valid: @num_ftmr_successes is valid
+ * @rssi_avg_valid: @rssi_avg is valid
+ * @rssi_spread_valid: @rssi_spread is valid
+ * @tx_rate_valid: @tx_rate is valid
+ * @rx_rate_valid: @rx_rate is valid
+ * @rtt_avg_valid: @rtt_avg is valid
+ * @rtt_variance_valid: @rtt_variance is valid
+ * @rtt_spread_valid: @rtt_spread is valid
+ * @dist_avg_valid: @dist_avg is valid
+ * @dist_variance_valid: @dist_variance is valid
+ * @dist_spread_valid: @dist_spread is valid
+ */
+struct cfg80211_pmsr_ftm_result {
+	const u8 *lci;
+	const u8 *civicloc;
+	unsigned int lci_len;
+	unsigned int civicloc_len;
+	enum nl80211_peer_measurement_ftm_failure_reasons failure_reason;
+	u32 num_ftmr_attempts, num_ftmr_successes;
+	s16 burst_index;
+	u8 busy_retry_time;
+	u8 num_bursts_exp;
+	u8 burst_duration;
+	u8 ftms_per_burst;
+	s32 rssi_avg;
+	s32 rssi_spread;
+	struct rate_info tx_rate, rx_rate;
+	s64 rtt_avg;
+	s64 rtt_variance;
+	s64 rtt_spread;
+	s64 dist_avg;
+	s64 dist_variance;
+	s64 dist_spread;
+
+	u16 num_ftmr_attempts_valid:1,
+	    num_ftmr_successes_valid:1,
+	    rssi_avg_valid:1,
+	    rssi_spread_valid:1,
+	    tx_rate_valid:1,
+	    rx_rate_valid:1,
+	    rtt_avg_valid:1,
+	    rtt_variance_valid:1,
+	    rtt_spread_valid:1,
+	    dist_avg_valid:1,
+	    dist_variance_valid:1,
+	    dist_spread_valid:1;
+};
+
+/**
+ * struct cfg80211_pmsr_result - peer measurement result
+ * @addr: address of the peer
+ * @host_time: host time (use ktime_get_boottime() adjust to the time when the
+ *	measurement was made)
+ * @ap_tsf: AP's TSF at measurement time
+ * @status: status of the measurement
+ * @final: if reporting partial results, mark this as the last one; if not
+ *	reporting partial results always set this flag
+ * @ap_tsf_valid: indicates the @ap_tsf value is valid
+ * @type: type of the measurement reported, note that we only support reporting
+ *	one type at a time, but you can report multiple results separately and
+ *	they're all aggregated for userspace.
+ */
+struct cfg80211_pmsr_result {
+	u64 host_time, ap_tsf;
+	enum nl80211_peer_measurement_status status;
+
+	u8 addr[ETH_ALEN];
+
+	u8 final:1,
+	   ap_tsf_valid:1;
+
+	enum nl80211_peer_measurement_type type;
+
+	union {
+		struct cfg80211_pmsr_ftm_result ftm;
+	};
+};
+
+/**
+ * struct cfg80211_pmsr_ftm_request_peer - FTM request data
+ * @requested: indicates FTM is requested
+ * @preamble: frame preamble to use
+ * @burst_period: burst period to use
+ * @asap: indicates to use ASAP mode
+ * @num_bursts_exp: number of bursts exponent
+ * @burst_duration: burst duration
+ * @ftms_per_burst: number of FTMs per burst
+ * @ftmr_retries: number of retries for FTM request
+ * @request_lci: request LCI information
+ * @request_civicloc: request civic location information
+ *
+ * See also nl80211 for the respective attribute documentation.
+ */
+struct cfg80211_pmsr_ftm_request_peer {
+	enum nl80211_preamble preamble;
+	u16 burst_period;
+	u8 requested:1,
+	   asap:1,
+	   request_lci:1,
+	   request_civicloc:1;
+	u8 num_bursts_exp;
+	u8 burst_duration;
+	u8 ftms_per_burst;
+	u8 ftmr_retries;
+};
+
+/**
+ * struct cfg80211_pmsr_request_peer - peer data for a peer measurement request
+ * @addr: MAC address
+ * @chandef: channel to use
+ * @report_ap_tsf: report the associated AP's TSF
+ * @ftm: FTM data, see &struct cfg80211_pmsr_ftm_request_peer
+ */
+struct cfg80211_pmsr_request_peer {
+	u8 addr[ETH_ALEN];
+	struct cfg80211_chan_def chandef;
+	u8 report_ap_tsf:1;
+	struct cfg80211_pmsr_ftm_request_peer ftm;
+};
+
+/**
+ * struct cfg80211_pmsr_request - peer measurement request
+ * @cookie: cookie, set by cfg80211
+ * @nl_portid: netlink portid - used by cfg80211
+ * @drv_data: driver data for this request, if required for aborting,
+ *	not otherwise freed or anything by cfg80211
+ * @mac_addr: MAC address used for (randomised) request
+ * @mac_addr_mask: MAC address mask used for randomisation, bits that
+ *	are 0 in the mask should be randomised, bits that are 1 should
+ *	be taken from the @mac_addr
+ * @list: used by cfg80211 to hold on to the request
+ * @timeout: timeout (in milliseconds) for the whole operation, if
+ *	zero it means there's no timeout
+ * @n_peers: number of peers to do measurements with
+ * @peers: per-peer measurement request data
+ */
+struct cfg80211_pmsr_request {
+	u64 cookie;
+	void *drv_data;
+	u32 n_peers;
+	u32 nl_portid;
+
+	u32 timeout;
+
+	u8 mac_addr[ETH_ALEN] __aligned(2);
+	u8 mac_addr_mask[ETH_ALEN] __aligned(2);
+
+	struct list_head list;
+
+	struct cfg80211_pmsr_request_peer peers[];
+};
+
 /**
  * struct cfg80211_ops - backend description for wireless configuration
  *
@@ -3183,6 +3367,8 @@ struct cfg80211_ftm_responder_stats {
  *
  * @get_ftm_responder_stats: Retrieve FTM responder statistics, if available.
  *	Statistics should be cumulative, currently no way to reset is provided.
+ * @start_pmsr: start peer measurement (e.g. FTM)
+ * @abort_pmsr: abort peer measurement
  */
 struct cfg80211_ops {
 	int	(*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow);
@@ -3492,6 +3678,11 @@ struct cfg80211_ops {
 	int	(*get_ftm_responder_stats)(struct wiphy *wiphy,
 				struct net_device *dev,
 				struct cfg80211_ftm_responder_stats *ftm_stats);
+
+	int	(*start_pmsr)(struct wiphy *wiphy, struct wireless_dev *wdev,
+			      struct cfg80211_pmsr_request *request);
+	void	(*abort_pmsr)(struct wiphy *wiphy, struct wireless_dev *wdev,
+			      struct cfg80211_pmsr_request *request);
 };
 
 /*
@@ -3863,6 +4054,42 @@ struct wiphy_iftype_ext_capab {
 	u8 extended_capabilities_len;
 };
 
+/**
+ * struct cfg80211_pmsr_capabilities - cfg80211 peer measurement capabilities
+ * @max_peers: maximum number of peers in a single measurement
+ * @report_ap_tsf: can report assoc AP's TSF for radio resource measurement
+ * @randomize_mac_addr: can randomize MAC address for measurement
+ * @ftm.supported: FTM measurement is supported
+ * @ftm.asap: ASAP-mode is supported
+ * @ftm.non_asap: non-ASAP-mode is supported
+ * @ftm.request_lci: can request LCI data
+ * @ftm.request_civicloc: can request civic location data
+ * @ftm.preambles: bitmap of preambles supported (&enum nl80211_preamble)
+ * @ftm.bandwidths: bitmap of bandwidths supported (&enum nl80211_chan_width)
+ * @ftm.max_bursts_exponent: maximum burst exponent supported
+ *	(set to -1 if not limited; note that setting this will necessarily
+ *	forbid using the value 15 to let the responder pick)
+ * @ftm.max_ftms_per_burst: maximum FTMs per burst supported (set to 0 if
+ *	not limited)
+ */
+struct cfg80211_pmsr_capabilities {
+	unsigned int max_peers;
+	u8 report_ap_tsf:1,
+	   randomize_mac_addr:1;
+
+	struct {
+		u32 preambles;
+		u32 bandwidths;
+		s8 max_bursts_exponent;
+		u8 max_ftms_per_burst;
+		u8 supported:1,
+		   asap:1,
+		   non_asap:1,
+		   request_lci:1,
+		   request_civicloc:1;
+	} ftm;
+};
+
 /**
  * struct wiphy - wireless hardware description
  * @reg_notifier: the driver's regulatory notification callback,
@@ -4027,6 +4254,8 @@ struct wiphy_iftype_ext_capab {
  * @txq_limit: configuration of internal TX queue frame limit
  * @txq_memory_limit: configuration internal TX queue memory limit
  * @txq_quantum: configuration of internal TX queue scheduler quantum
+ *
+ * @pmsr_capa: peer measurement capabilities
  */
 struct wiphy {
 	/* assign these fields before you register the wiphy */
@@ -4163,6 +4392,8 @@ struct wiphy {
 	u32 txq_memory_limit;
 	u32 txq_quantum;
 
+	const struct cfg80211_pmsr_capabilities *pmsr_capa;
+
 	char priv[0] __aligned(NETDEV_ALIGN);
 };
 
@@ -4365,6 +4596,9 @@ struct cfg80211_cqm_config;
  * @owner_nlportid: (private) owner socket port ID
  * @nl_owner_dead: (private) owner socket went away
  * @cqm_config: (private) nl80211 RSSI monitor state
+ * @pmsr_list: (private) peer measurement requests
+ * @pmsr_lock: (private) peer measurements requests/results lock
+ * @pmsr_free_wk: (private) peer measurements cleanup work
  */
 struct wireless_dev {
 	struct wiphy *wiphy;
@@ -4436,6 +4670,10 @@ struct wireless_dev {
 #endif
 
 	struct cfg80211_cqm_config *cqm_config;
+
+	struct list_head pmsr_list;
+	spinlock_t pmsr_lock;
+	struct work_struct pmsr_free_wk;
 };
 
 static inline u8 *wdev_address(struct wireless_dev *wdev)
@@ -6630,6 +6868,31 @@ int cfg80211_external_auth_request(struct net_device *netdev,
 				   struct cfg80211_external_auth_params *params,
 				   gfp_t gfp);
 
+/**
+ * cfg80211_pmsr_report - report peer measurement result data
+ * @wdev: the wireless device reporting the measurement
+ * @req: the original measurement request
+ * @result: the result data
+ * @gfp: allocation flags
+ */
+void cfg80211_pmsr_report(struct wireless_dev *wdev,
+			  struct cfg80211_pmsr_request *req,
+			  struct cfg80211_pmsr_result *result,
+			  gfp_t gfp);
+
+/**
+ * cfg80211_pmsr_complete - report peer measurement completed
+ * @wdev: the wireless device reporting the measurement
+ * @req: the original measurement request
+ * @gfp: allocation flags
+ *
+ * Report that the entire measurement completed, after this
+ * the request pointer will no longer be valid.
+ */
+void cfg80211_pmsr_complete(struct wireless_dev *wdev,
+			    struct cfg80211_pmsr_request *req,
+			    gfp_t gfp);
+
 /* Logging, debugging and troubleshooting/diagnostic helpers. */
 
 /* wiphy_printk helpers, similar to dev_printk */
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 6d610bae30a9..e45b88925783 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1036,6 +1036,30 @@
  * @NL80211_CMD_GET_FTM_RESPONDER_STATS: Retrieve FTM responder statistics, in
  *	the %NL80211_ATTR_FTM_RESPONDER_STATS attribute.
  *
+ * @NL80211_CMD_PEER_MEASUREMENT_START: start a (set of) peer measurement(s)
+ *	with the given parameters, which are encapsulated in the nested
+ *	%NL80211_ATTR_PEER_MEASUREMENTS attribute. Optionally, MAC address
+ *	randomization may be enabled and configured by specifying the
+ *	%NL80211_ATTR_MAC and %NL80211_ATTR_MAC_MASK attributes.
+ *	If a timeout is requested, use the %NL80211_ATTR_TIMEOUT attribute.
+ *	A u64 cookie for further %NL80211_ATTR_COOKIE use is is returned in
+ *	the netlink extended ack message.
+ *
+ *	To cancel a measurement, close the socket that requested it.
+ *
+ *	Measurement results are reported to the socket that requested the
+ *	measurement using @NL80211_CMD_PEER_MEASUREMENT_RESULT when they
+ *	become available, so applications must ensure a large enough socket
+ *	buffer size.
+ *
+ *	Depending on driver support it may or may not be possible to start
+ *	multiple concurrent measurements.
+ * @NL80211_CMD_PEER_MEASUREMENT_RESULT: This command number is used for the
+ *	result notification from the driver to the requesting socket.
+ * @NL80211_CMD_PEER_MEASUREMENT_COMPLETE: Notification only, indicating that
+ *	the measurement completed, using the measurement cookie
+ *	(%NL80211_ATTR_COOKIE).
+ *
  * @NL80211_CMD_MAX: highest used command number
  * @__NL80211_CMD_AFTER_LAST: internal use
  */
@@ -1250,6 +1274,10 @@ enum nl80211_commands {
 
 	NL80211_CMD_GET_FTM_RESPONDER_STATS,
 
+	NL80211_CMD_PEER_MEASUREMENT_START,
+	NL80211_CMD_PEER_MEASUREMENT_RESULT,
+	NL80211_CMD_PEER_MEASUREMENT_COMPLETE,
+
 	/* add new commands above here */
 
 	/* used to define NL80211_CMD_MAX below */
@@ -2254,6 +2282,16 @@ enum nl80211_commands {
  * @NL80211_ATTR_FTM_RESPONDER_STATS: Nested attribute with FTM responder
  *	statistics, see &enum nl80211_ftm_responder_stats.
  *
+ * @NL80211_ATTR_TIMEOUT: Timeout for the given operation in milliseconds (u32),
+ *	if the attribute is not given no timeout is requested. Note that 0 is an
+ *	invalid value.
+ *
+ * @NL80211_ATTR_PEER_MEASUREMENTS: peer measurements request (and result)
+ *	data, uses nested attributes specified in
+ *	&enum nl80211_peer_measurement_attrs.
+ *	This is also used for capability advertisement in the wiphy information,
+ *	with the appropriate sub-attributes.
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -2699,6 +2737,10 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_FTM_RESPONDER_STATS,
 
+	NL80211_ATTR_TIMEOUT,
+
+	NL80211_ATTR_PEER_MEASUREMENTS,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
@@ -5906,4 +5948,380 @@ enum nl80211_ftm_responder_stats {
 	NL80211_FTM_STATS_MAX = __NL80211_FTM_STATS_AFTER_LAST - 1
 };
 
+/**
+ * enum nl80211_preamble - frame preamble types
+ * @NL80211_PREAMBLE_LEGACY: legacy (HR/DSSS, OFDM, ERP PHY) preamble
+ * @NL80211_PREAMBLE_HT: HT preamble
+ * @NL80211_PREAMBLE_VHT: VHT preamble
+ * @NL80211_PREAMBLE_DMG: DMG preamble
+ */
+enum nl80211_preamble {
+	NL80211_PREAMBLE_LEGACY,
+	NL80211_PREAMBLE_HT,
+	NL80211_PREAMBLE_VHT,
+	NL80211_PREAMBLE_DMG,
+};
+
+/**
+ * enum nl80211_peer_measurement_type - peer measurement types
+ * @NL80211_PMSR_TYPE_INVALID: invalid/unused, needed as we use
+ *	these numbers also for attributes
+ *
+ * @NL80211_PMSR_TYPE_FTM: flight time measurement
+ *
+ * @NUM_NL80211_PMSR_TYPES: internal
+ * @NL80211_PMSR_TYPE_MAX: highest type number
+ */
+enum nl80211_peer_measurement_type {
+	NL80211_PMSR_TYPE_INVALID,
+
+	NL80211_PMSR_TYPE_FTM,
+
+	NUM_NL80211_PMSR_TYPES,
+	NL80211_PMSR_TYPE_MAX = NUM_NL80211_PMSR_TYPES - 1
+};
+
+/**
+ * enum nl80211_peer_measurement_status - peer measurement status
+ * @NL80211_PMSR_STATUS_SUCCESS: measurement completed successfully
+ * @NL80211_PMSR_STATUS_REFUSED: measurement was locally refused
+ * @NL80211_PMSR_STATUS_TIMEOUT: measurement timed out
+ * @NL80211_PMSR_STATUS_FAILURE: measurement failed, a type-dependent
+ *	reason may be available in the response data
+ */
+enum nl80211_peer_measurement_status {
+	NL80211_PMSR_STATUS_SUCCESS,
+	NL80211_PMSR_STATUS_REFUSED,
+	NL80211_PMSR_STATUS_TIMEOUT,
+	NL80211_PMSR_STATUS_FAILURE,
+};
+
+/**
+ * enum nl80211_peer_measurement_req - peer measurement request attributes
+ * @__NL80211_PMSR_REQ_ATTR_INVALID: invalid
+ *
+ * @NL80211_PMSR_REQ_ATTR_DATA: This is a nested attribute with measurement
+ *	type-specific request data inside. The attributes used are from the
+ *	enums named nl80211_peer_measurement_<type>_req.
+ * @NL80211_PMSR_REQ_ATTR_GET_AP_TSF: include AP TSF timestamp, if supported
+ *	(flag attribute)
+ *
+ * @NUM_NL80211_PMSR_REQ_ATTRS: internal
+ * @NL80211_PMSR_REQ_ATTR_MAX: highest attribute number
+ */
+enum nl80211_peer_measurement_req {
+	__NL80211_PMSR_REQ_ATTR_INVALID,
+
+	NL80211_PMSR_REQ_ATTR_DATA,
+	NL80211_PMSR_REQ_ATTR_GET_AP_TSF,
+
+	/* keep last */
+	NUM_NL80211_PMSR_REQ_ATTRS,
+	NL80211_PMSR_REQ_ATTR_MAX = NUM_NL80211_PMSR_REQ_ATTRS - 1
+};
+
+/**
+ * enum nl80211_peer_measurement_resp - peer measurement response attributes
+ * @__NL80211_PMSR_RESP_ATTR_INVALID: invalid
+ *
+ * @NL80211_PMSR_RESP_ATTR_DATA: This is a nested attribute with measurement
+ *	type-specific results inside. The attributes used are from the enums
+ *	named nl80211_peer_measurement_<type>_resp.
+ * @NL80211_PMSR_RESP_ATTR_STATUS: u32 value with the measurement status
+ *	(using values from &enum nl80211_peer_measurement_status.)
+ * @NL80211_PMSR_RESP_ATTR_HOST_TIME: host time (%CLOCK_BOOTTIME) when the
+ *	result was measured; this value is not expected to be accurate to
+ *	more than 20ms. (u64, nanoseconds)
+ * @NL80211_PMSR_RESP_ATTR_AP_TSF: TSF of the AP that the interface
+ *	doing the measurement is connected to when the result was measured.
+ *	This shall be accurately reported if supported and requested
+ *	(u64, usec)
+ * @NL80211_PMSR_RESP_ATTR_FINAL: If results are sent to the host partially
+ *	(*e.g. with FTM per-burst data) this flag will be cleared on all but
+ *	the last result; if all results are combined it's set on the single
+ *	result.
+ * @NL80211_PMSR_RESP_ATTR_PAD: padding for 64-bit attributes, ignore
+ *
+ * @NUM_NL80211_PMSR_RESP_ATTRS: internal
+ * @NL80211_PMSR_RESP_ATTR_MAX: highest attribute number
+ */
+enum nl80211_peer_measurement_resp {
+	__NL80211_PMSR_RESP_ATTR_INVALID,
+
+	NL80211_PMSR_RESP_ATTR_DATA,
+	NL80211_PMSR_RESP_ATTR_STATUS,
+	NL80211_PMSR_RESP_ATTR_HOST_TIME,
+	NL80211_PMSR_RESP_ATTR_AP_TSF,
+	NL80211_PMSR_RESP_ATTR_FINAL,
+	NL80211_PMSR_RESP_ATTR_PAD,
+
+	/* keep last */
+	NUM_NL80211_PMSR_RESP_ATTRS,
+	NL80211_PMSR_RESP_ATTR_MAX = NUM_NL80211_PMSR_RESP_ATTRS - 1
+};
+
+/**
+ * enum nl80211_peer_measurement_peer_attrs - peer attributes for measurement
+ * @__NL80211_PMSR_PEER_ATTR_INVALID: invalid
+ *
+ * @NL80211_PMSR_PEER_ATTR_ADDR: peer's MAC address
+ * @NL80211_PMSR_PEER_ATTR_CHAN: channel definition, nested, using top-level
+ *	attributes like %NL80211_ATTR_WIPHY_FREQ etc.
+ * @NL80211_PMSR_PEER_ATTR_REQ: This is a nested attribute indexed by
+ *	measurement type, with attributes from the
+ *	&enum nl80211_peer_measurement_req inside.
+ * @NL80211_PMSR_PEER_ATTR_RESP: This is a nested attribute indexed by
+ *	measurement type, with attributes from the
+ *	&enum nl80211_peer_measurement_resp inside.
+ *
+ * @NUM_NL80211_PMSR_PEER_ATTRS: internal
+ * @NL80211_PMSR_PEER_ATTR_MAX: highest attribute number
+ */
+enum nl80211_peer_measurement_peer_attrs {
+	__NL80211_PMSR_PEER_ATTR_INVALID,
+
+	NL80211_PMSR_PEER_ATTR_ADDR,
+	NL80211_PMSR_PEER_ATTR_CHAN,
+	NL80211_PMSR_PEER_ATTR_REQ,
+	NL80211_PMSR_PEER_ATTR_RESP,
+
+	/* keep last */
+	NUM_NL80211_PMSR_PEER_ATTRS,
+	NL80211_PMSR_PEER_ATTR_MAX = NUM_NL80211_PMSR_PEER_ATTRS - 1,
+};
+
+/**
+ * enum nl80211_peer_measurement_attrs - peer measurement attributes
+ * @__NL80211_PMSR_ATTR_INVALID: invalid
+ *
+ * @NL80211_PMSR_ATTR_MAX_PEERS: u32 attribute used for capability
+ *	advertisement only, indicates the maximum number of peers
+ *	measurements can be done with in a single request
+ * @NL80211_PMSR_ATTR_REPORT_AP_TSF: flag attribute in capability
+ *	indicating that the connected AP's TSF can be reported in
+ *	measurement results
+ * @NL80211_PMSR_ATTR_RANDOMIZE_MAC_ADDR: flag attribute in capability
+ *	indicating that MAC address randomization is supported.
+ * @NL80211_PMSR_ATTR_TYPE_CAPA: capabilities reported by the device,
+ *	this contains a nesting indexed by measurement type, and
+ *	type-specific capabilities inside, which are from the enums
+ *	named nl80211_peer_measurement_<type>_capa.
+ * @NL80211_PMSR_ATTR_PEERS: nested attribute, the nesting index is
+ *	meaningless, just a list of peers to measure with, with the
+ *	sub-attributes taken from
+ *	&enum nl80211_peer_measurement_peer_attrs.
+ *
+ * @NUM_NL80211_PMSR_ATTR: internal
+ * @NL80211_PMSR_ATTR_MAX: highest attribute number
+ */
+enum nl80211_peer_measurement_attrs {
+	__NL80211_PMSR_ATTR_INVALID,
+
+	NL80211_PMSR_ATTR_MAX_PEERS,
+	NL80211_PMSR_ATTR_REPORT_AP_TSF,
+	NL80211_PMSR_ATTR_RANDOMIZE_MAC_ADDR,
+	NL80211_PMSR_ATTR_TYPE_CAPA,
+	NL80211_PMSR_ATTR_PEERS,
+
+	/* keep last */
+	NUM_NL80211_PMSR_ATTR,
+	NL80211_PMSR_ATTR_MAX = NUM_NL80211_PMSR_ATTR - 1
+};
+
+/**
+ * enum nl80211_peer_measurement_ftm_capa - FTM capabilities
+ * @__NL80211_PMSR_FTM_CAPA_ATTR_INVALID: invalid
+ *
+ * @NL80211_PMSR_FTM_CAPA_ATTR_ASAP: flag attribute indicating ASAP mode
+ *	is supported
+ * @NL80211_PMSR_FTM_CAPA_ATTR_NON_ASAP: flag attribute indicating non-ASAP
+ *	mode is supported
+ * @NL80211_PMSR_FTM_CAPA_ATTR_REQ_LCI: flag attribute indicating if LCI
+ *	data can be requested during the measurement
+ * @NL80211_PMSR_FTM_CAPA_ATTR_REQ_CIVICLOC: flag attribute indicating if civic
+ *	location data can be requested during the measurement
+ * @NL80211_PMSR_FTM_CAPA_ATTR_PREAMBLES: u32 bitmap attribute of bits
+ *	from &enum nl80211_preamble.
+ * @NL80211_PMSR_FTM_CAPA_ATTR_BANDWIDTHS: bitmap of values from
+ *	&enum nl80211_chan_width indicating the supported channel
+ *	bandwidths for FTM. Note that a higher channel bandwidth may be
+ *	configured to allow for other measurements types with different
+ *	bandwidth requirement in the same measurement.
+ * @NL80211_PMSR_FTM_CAPA_ATTR_MAX_BURSTS_EXPONENT: u32 attribute indicating
+ *	the maximum bursts exponent that can be used (if not present anything
+ *	is valid)
+ * @NL80211_PMSR_FTM_CAPA_ATTR_MAX_FTMS_PER_BURST: u32 attribute indicating
+ *	the maximum FTMs per burst (if not present anything is valid)
+ *
+ * @NUM_NL80211_PMSR_FTM_CAPA_ATTR: internal
+ * @NL80211_PMSR_FTM_CAPA_ATTR_MAX: highest attribute number
+ */
+enum nl80211_peer_measurement_ftm_capa {
+	__NL80211_PMSR_FTM_CAPA_ATTR_INVALID,
+
+	NL80211_PMSR_FTM_CAPA_ATTR_ASAP,
+	NL80211_PMSR_FTM_CAPA_ATTR_NON_ASAP,
+	NL80211_PMSR_FTM_CAPA_ATTR_REQ_LCI,
+	NL80211_PMSR_FTM_CAPA_ATTR_REQ_CIVICLOC,
+	NL80211_PMSR_FTM_CAPA_ATTR_PREAMBLES,
+	NL80211_PMSR_FTM_CAPA_ATTR_BANDWIDTHS,
+	NL80211_PMSR_FTM_CAPA_ATTR_MAX_BURSTS_EXPONENT,
+	NL80211_PMSR_FTM_CAPA_ATTR_MAX_FTMS_PER_BURST,
+
+	/* keep last */
+	NUM_NL80211_PMSR_FTM_CAPA_ATTR,
+	NL80211_PMSR_FTM_CAPA_ATTR_MAX = NUM_NL80211_PMSR_FTM_CAPA_ATTR - 1
+};
+
+/**
+ * enum nl80211_peer_measurement_ftm_req - FTM request attributes
+ * @__NL80211_PMSR_FTM_REQ_ATTR_INVALID: invalid
+ *
+ * @NL80211_PMSR_FTM_REQ_ATTR_ASAP: ASAP mode requested (flag)
+ * @NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE: preamble type (see
+ *	&enum nl80211_preamble), optional for DMG (u32)
+ * @NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP: number of bursts exponent as in
+ *	802.11-2016 9.4.2.168 "Fine Timing Measurement Parameters element"
+ *	(u8, 0-15, optional with default 15 i.e. "no preference")
+ * @NL80211_PMSR_FTM_REQ_ATTR_BURST_PERIOD: interval between bursts in units
+ *	of 100ms (u16, optional with default 0)
+ * @NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION: burst duration, as in 802.11-2016
+ *	Table 9-257 "Burst Duration field encoding" (u8, 0-15, optional with
+ *	default 15 i.e. "no preference")
+ * @NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST: number of successful FTM frames
+ *	requested per burst
+ *	(u8, 0-31, optional with default 0 i.e. "no preference")
+ * @NL80211_PMSR_FTM_REQ_ATTR_NUM_FTMR_RETRIES: number of FTMR frame retries
+ *	(u8, default 3)
+ * @NL80211_PMSR_FTM_REQ_ATTR_REQUEST_LCI: request LCI data (flag)
+ * @NL80211_PMSR_FTM_REQ_ATTR_REQUEST_CIVICLOC: request civic location data
+ *	(flag)
+ *
+ * @NUM_NL80211_PMSR_FTM_REQ_ATTR: internal
+ * @NL80211_PMSR_FTM_REQ_ATTR_MAX: highest attribute number
+ */
+enum nl80211_peer_measurement_ftm_req {
+	__NL80211_PMSR_FTM_REQ_ATTR_INVALID,
+
+	NL80211_PMSR_FTM_REQ_ATTR_ASAP,
+	NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE,
+	NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP,
+	NL80211_PMSR_FTM_REQ_ATTR_BURST_PERIOD,
+	NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION,
+	NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST,
+	NL80211_PMSR_FTM_REQ_ATTR_NUM_FTMR_RETRIES,
+	NL80211_PMSR_FTM_REQ_ATTR_REQUEST_LCI,
+	NL80211_PMSR_FTM_REQ_ATTR_REQUEST_CIVICLOC,
+
+	/* keep last */
+	NUM_NL80211_PMSR_FTM_REQ_ATTR,
+	NL80211_PMSR_FTM_REQ_ATTR_MAX = NUM_NL80211_PMSR_FTM_REQ_ATTR - 1
+};
+
+/**
+ * enum nl80211_peer_measurement_ftm_failure_reasons - FTM failure reasons
+ * @NL80211_PMSR_FTM_FAILURE_UNSPECIFIED: unspecified failure, not used
+ * @NL80211_PMSR_FTM_FAILURE_NO_RESPONSE: no response from the FTM responder
+ * @NL80211_PMSR_FTM_FAILURE_REJECTED: FTM responder rejected measurement
+ * @NL80211_PMSR_FTM_FAILURE_WRONG_CHANNEL: we already know the peer is
+ *	on a different channel, so can't measure (if we didn't know, we'd
+ *	try and get no response)
+ * @NL80211_PMSR_FTM_FAILURE_PEER_NOT_CAPABLE: peer can't actually do FTM
+ * @NL80211_PMSR_FTM_FAILURE_INVALID_TIMESTAMP: invalid T1/T4 timestamps
+ *	received
+ * @NL80211_PMSR_FTM_FAILURE_PEER_BUSY: peer reports busy, you may retry
+ *	later (see %NL80211_PMSR_FTM_RESP_ATTR_BUSY_RETRY_TIME)
+ * @NL80211_PMSR_FTM_FAILURE_BAD_CHANGED_PARAMS: parameters were changed
+ *	by the peer and are no longer supported
+ */
+enum nl80211_peer_measurement_ftm_failure_reasons {
+	NL80211_PMSR_FTM_FAILURE_UNSPECIFIED,
+	NL80211_PMSR_FTM_FAILURE_NO_RESPONSE,
+	NL80211_PMSR_FTM_FAILURE_REJECTED,
+	NL80211_PMSR_FTM_FAILURE_WRONG_CHANNEL,
+	NL80211_PMSR_FTM_FAILURE_PEER_NOT_CAPABLE,
+	NL80211_PMSR_FTM_FAILURE_INVALID_TIMESTAMP,
+	NL80211_PMSR_FTM_FAILURE_PEER_BUSY,
+	NL80211_PMSR_FTM_FAILURE_BAD_CHANGED_PARAMS,
+};
+
+/**
+ * enum nl80211_peer_measurement_ftm_resp - FTM response attributes
+ * @__NL80211_PMSR_FTM_RESP_ATTR_INVALID: invalid
+ *
+ * @NL80211_PMSR_FTM_RESP_ATTR_FAIL_REASON: FTM-specific failure reason
+ *	(u32, optional)
+ * @NL80211_PMSR_FTM_RESP_ATTR_BURST_INDEX: optional, if bursts are reported
+ *	as separate results then it will be the burst index 0...(N-1) and
+ *	the top level will indicate partial results (u32)
+ * @NL80211_PMSR_FTM_RESP_ATTR_NUM_FTMR_ATTEMPTS: number of FTM Request frames
+ *	transmitted (u32, optional)
+ * @NL80211_PMSR_FTM_RESP_ATTR_NUM_FTMR_SUCCESSES: number of FTM Request frames
+ *	that were acknowleged (u32, optional)
+ * @NL80211_PMSR_FTM_RESP_ATTR_BUSY_RETRY_TIME: retry time received from the
+ *	busy peer (u32, seconds)
+ * @NL80211_PMSR_FTM_RESP_ATTR_NUM_BURSTS_EXP: actual number of bursts exponent
+ *	used by the responder (similar to request, u8)
+ * @NL80211_PMSR_FTM_RESP_ATTR_BURST_DURATION: actual burst duration used by
+ *	the responder (similar to request, u8)
+ * @NL80211_PMSR_FTM_RESP_ATTR_FTMS_PER_BURST: actual FTMs per burst used
+ *	by the responder (similar to request, u8)
+ * @NL80211_PMSR_FTM_RESP_ATTR_RSSI_AVG: average RSSI across all FTM action
+ *	frames (optional, s32, 1/2 dBm)
+ * @NL80211_PMSR_FTM_RESP_ATTR_RSSI_SPREAD: RSSI spread across all FTM action
+ *	frames (optional, s32, 1/2 dBm)
+ * @NL80211_PMSR_FTM_RESP_ATTR_TX_RATE: bitrate we used for the response to the
+ *	FTM action frame (optional, nested, using &enum nl80211_rate_info
+ *	attributes)
+ * @NL80211_PMSR_FTM_RESP_ATTR_RX_RATE: bitrate the responder used for the FTM
+ *	action frame (optional, nested, using &enum nl80211_rate_info attrs)
+ * @NL80211_PMSR_FTM_RESP_ATTR_RTT_AVG: average RTT (s64, picoseconds, optional
+ *	but one of RTT/DIST must be present)
+ * @NL80211_PMSR_FTM_RESP_ATTR_RTT_VARIANCE: RTT variance (u64, ps^2, note that
+ *	standard deviation is the square root of variance, optional)
+ * @NL80211_PMSR_FTM_RESP_ATTR_RTT_SPREAD: RTT spread (u64, picoseconds,
+ *	optional)
+ * @NL80211_PMSR_FTM_RESP_ATTR_DIST_AVG: average distance (s64, mm, optional
+ *	but one of RTT/DIST must be present)
+ * @NL80211_PMSR_FTM_RESP_ATTR_DIST_VARIANCE: distance variance (u64, mm^2, note
+ *	that standard deviation is the square root of variance, optional)
+ * @NL80211_PMSR_FTM_RESP_ATTR_DIST_SPREAD: distance spread (u64, mm, optional)
+ * @NL80211_PMSR_FTM_RESP_ATTR_LCI: LCI data from peer (binary, optional)
+ * @NL80211_PMSR_FTM_RESP_ATTR_CIVICLOC: civic location data from peer
+ *	(binary, optional)
+ * @NL80211_PMSR_FTM_RESP_ATTR_PAD: ignore, for u64/s64 padding only
+ *
+ * @NUM_NL80211_PMSR_FTM_RESP_ATTR: internal
+ * @NL80211_PMSR_FTM_RESP_ATTR_MAX: highest attribute number
+ */
+enum nl80211_peer_measurement_ftm_resp {
+	__NL80211_PMSR_FTM_RESP_ATTR_INVALID,
+
+	NL80211_PMSR_FTM_RESP_ATTR_FAIL_REASON,
+	NL80211_PMSR_FTM_RESP_ATTR_BURST_INDEX,
+	NL80211_PMSR_FTM_RESP_ATTR_NUM_FTMR_ATTEMPTS,
+	NL80211_PMSR_FTM_RESP_ATTR_NUM_FTMR_SUCCESSES,
+	NL80211_PMSR_FTM_RESP_ATTR_BUSY_RETRY_TIME,
+	NL80211_PMSR_FTM_RESP_ATTR_NUM_BURSTS_EXP,
+	NL80211_PMSR_FTM_RESP_ATTR_BURST_DURATION,
+	NL80211_PMSR_FTM_RESP_ATTR_FTMS_PER_BURST,
+	NL80211_PMSR_FTM_RESP_ATTR_RSSI_AVG,
+	NL80211_PMSR_FTM_RESP_ATTR_RSSI_SPREAD,
+	NL80211_PMSR_FTM_RESP_ATTR_TX_RATE,
+	NL80211_PMSR_FTM_RESP_ATTR_RX_RATE,
+	NL80211_PMSR_FTM_RESP_ATTR_RTT_AVG,
+	NL80211_PMSR_FTM_RESP_ATTR_RTT_VARIANCE,
+	NL80211_PMSR_FTM_RESP_ATTR_RTT_SPREAD,
+	NL80211_PMSR_FTM_RESP_ATTR_DIST_AVG,
+	NL80211_PMSR_FTM_RESP_ATTR_DIST_VARIANCE,
+	NL80211_PMSR_FTM_RESP_ATTR_DIST_SPREAD,
+	NL80211_PMSR_FTM_RESP_ATTR_LCI,
+	NL80211_PMSR_FTM_RESP_ATTR_CIVICLOC,
+	NL80211_PMSR_FTM_RESP_ATTR_PAD,
+
+	/* keep last */
+	NUM_NL80211_PMSR_FTM_RESP_ATTR,
+	NL80211_PMSR_FTM_RESP_ATTR_MAX = NUM_NL80211_PMSR_FTM_RESP_ATTR - 1
+};
+
 #endif /* __LINUX_NL80211_H */
diff --git a/net/wireless/Makefile b/net/wireless/Makefile
index 1d84f91bbfb0..72a224ce8e0a 100644
--- a/net/wireless/Makefile
+++ b/net/wireless/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_WEXT_PRIV) += wext-priv.o
 
 cfg80211-y += core.o sysfs.o radiotap.o util.o reg.o scan.o nl80211.o
 cfg80211-y += mlme.o ibss.o sme.o chan.o ethtool.o mesh.o ap.o trace.o ocb.o
+cfg80211-y += pmsr.o
 cfg80211-$(CONFIG_OF) += of.o
 cfg80211-$(CONFIG_CFG80211_DEBUGFS) += debugfs.o
 cfg80211-$(CONFIG_CFG80211_WEXT) += wext-compat.o wext-sme.o
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 5bd01058b9e6..0a3092c56b3e 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -4,6 +4,7 @@
  * Copyright 2006-2010		Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright 2015-2017	Intel Deutschland GmbH
+ * Copyright (C) 2018 Intel Corporation
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -664,6 +665,34 @@ int wiphy_register(struct wiphy *wiphy)
 		return -EINVAL;
 #endif
 
+	if (WARN_ON(wiphy->pmsr_capa && !wiphy->pmsr_capa->ftm.supported))
+		return -EINVAL;
+
+	if (wiphy->pmsr_capa && wiphy->pmsr_capa->ftm.supported) {
+		if (WARN_ON(!wiphy->pmsr_capa->ftm.asap &&
+			    !wiphy->pmsr_capa->ftm.non_asap))
+			return -EINVAL;
+		if (WARN_ON(!wiphy->pmsr_capa->ftm.preambles ||
+			    !wiphy->pmsr_capa->ftm.bandwidths))
+			return -EINVAL;
+		if (WARN_ON(wiphy->pmsr_capa->ftm.preambles &
+				~(BIT(NL80211_PREAMBLE_LEGACY) |
+				  BIT(NL80211_PREAMBLE_HT) |
+				  BIT(NL80211_PREAMBLE_VHT) |
+				  BIT(NL80211_PREAMBLE_DMG))))
+			return -EINVAL;
+		if (WARN_ON(wiphy->pmsr_capa->ftm.bandwidths &
+				~(BIT(NL80211_CHAN_WIDTH_20_NOHT) |
+				  BIT(NL80211_CHAN_WIDTH_20) |
+				  BIT(NL80211_CHAN_WIDTH_40) |
+				  BIT(NL80211_CHAN_WIDTH_80) |
+				  BIT(NL80211_CHAN_WIDTH_80P80) |
+				  BIT(NL80211_CHAN_WIDTH_160) |
+				  BIT(NL80211_CHAN_WIDTH_5) |
+				  BIT(NL80211_CHAN_WIDTH_10))))
+			return -EINVAL;
+	}
+
 	/*
 	 * if a wiphy has unsupported modes for regulatory channel enforcement,
 	 * opt-out of enforcement checking
@@ -1087,6 +1116,8 @@ void __cfg80211_leave(struct cfg80211_registered_device *rdev,
 	ASSERT_RTNL();
 	ASSERT_WDEV_LOCK(wdev);
 
+	cfg80211_pmsr_wdev_down(wdev);
+
 	switch (wdev->iftype) {
 	case NL80211_IFTYPE_ADHOC:
 		__cfg80211_leave_ibss(rdev, dev, true);
@@ -1174,6 +1205,9 @@ void cfg80211_init_wdev(struct cfg80211_registered_device *rdev,
 	spin_lock_init(&wdev->event_lock);
 	INIT_LIST_HEAD(&wdev->mgmt_registrations);
 	spin_lock_init(&wdev->mgmt_registrations_lock);
+	INIT_LIST_HEAD(&wdev->pmsr_list);
+	spin_lock_init(&wdev->pmsr_lock);
+	INIT_WORK(&wdev->pmsr_free_wk, cfg80211_pmsr_free_wk);
 
 	/*
 	 * We get here also when the interface changes network namespaces,
diff --git a/net/wireless/core.h b/net/wireless/core.h
index c61dbba8bf47..c5d6f3418601 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -3,6 +3,7 @@
  * Wireless configuration interface internals.
  *
  * Copyright 2006-2010	Johannes Berg <johannes@sipsolutions.net>
+ * Copyright (C) 2018 Intel Corporation
  */
 #ifndef __NET_WIRELESS_CORE_H
 #define __NET_WIRELESS_CORE_H
@@ -530,4 +531,8 @@ void cfg80211_stop_nan(struct cfg80211_registered_device *rdev,
 
 void cfg80211_cqm_config_free(struct wireless_dev *wdev);
 
+void cfg80211_release_pmsr(struct wireless_dev *wdev, u32 portid);
+void cfg80211_pmsr_wdev_down(struct wireless_dev *wdev);
+void cfg80211_pmsr_free_wk(struct work_struct *work);
+
 #endif /* __NET_WIRELESS_CORE_H */
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 744b5851bbf9..6fd93eb0df6d 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -240,7 +240,63 @@ nl80211_ftm_responder_policy[NL80211_FTM_RESP_ATTR_MAX + 1] = {
 					     .len = U8_MAX },
 };
 
-static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
+static const struct nla_policy
+nl80211_pmsr_ftm_req_attr_policy[NL80211_PMSR_FTM_REQ_ATTR_MAX + 1] = {
+	[NL80211_PMSR_FTM_REQ_ATTR_ASAP] = { .type = NLA_FLAG },
+	[NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE] = { .type = NLA_U32 },
+	[NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP] =
+		NLA_POLICY_MAX(NLA_U8, 15),
+	[NL80211_PMSR_FTM_REQ_ATTR_BURST_PERIOD] = { .type = NLA_U16 },
+	[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION] =
+		NLA_POLICY_MAX(NLA_U8, 15),
+	[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST] =
+		NLA_POLICY_MAX(NLA_U8, 15),
+	[NL80211_PMSR_FTM_REQ_ATTR_NUM_FTMR_RETRIES] = { .type = NLA_U8 },
+	[NL80211_PMSR_FTM_REQ_ATTR_REQUEST_LCI] = { .type = NLA_FLAG },
+	[NL80211_PMSR_FTM_REQ_ATTR_REQUEST_CIVICLOC] = { .type = NLA_FLAG },
+};
+
+static const struct nla_policy
+nl80211_pmsr_req_data_policy[NL80211_PMSR_TYPE_MAX + 1] = {
+	[NL80211_PMSR_TYPE_FTM] =
+		NLA_POLICY_NESTED(NL80211_PMSR_FTM_REQ_ATTR_MAX,
+				  nl80211_pmsr_ftm_req_attr_policy),
+};
+
+static const struct nla_policy
+nl80211_pmsr_req_attr_policy[NL80211_PMSR_REQ_ATTR_MAX + 1] = {
+	[NL80211_PMSR_REQ_ATTR_DATA] =
+		NLA_POLICY_NESTED(NL80211_PMSR_TYPE_MAX,
+				  nl80211_pmsr_req_data_policy),
+	[NL80211_PMSR_REQ_ATTR_GET_AP_TSF] = { .type = NLA_FLAG },
+};
+
+static const struct nla_policy
+nl80211_psmr_peer_attr_policy[NL80211_PMSR_PEER_ATTR_MAX + 1] = {
+	[NL80211_PMSR_PEER_ATTR_ADDR] = NLA_POLICY_ETH_ADDR,
+	/*
+	 * we could specify this again to be the top-level policy,
+	 * but that would open us up to recursion problems ...
+	 */
+	[NL80211_PMSR_PEER_ATTR_CHAN] = { .type = NLA_NESTED },
+	[NL80211_PMSR_PEER_ATTR_REQ] =
+		NLA_POLICY_NESTED(NL80211_PMSR_REQ_ATTR_MAX,
+				  nl80211_pmsr_req_attr_policy),
+	[NL80211_PMSR_PEER_ATTR_RESP] = { .type = NLA_REJECT },
+};
+
+static const struct nla_policy
+nl80211_pmsr_attr_policy[NL80211_PMSR_ATTR_MAX + 1] = {
+	[NL80211_PMSR_ATTR_MAX_PEERS] = { .type = NLA_REJECT },
+	[NL80211_PMSR_ATTR_REPORT_AP_TSF] = { .type = NLA_REJECT },
+	[NL80211_PMSR_ATTR_RANDOMIZE_MAC_ADDR] = { .type = NLA_REJECT },
+	[NL80211_PMSR_ATTR_TYPE_CAPA] = { .type = NLA_REJECT },
+	[NL80211_PMSR_ATTR_PEERS] =
+		NLA_POLICY_NESTED_ARRAY(NL80211_PMSR_PEER_ATTR_MAX,
+					nl80211_psmr_peer_attr_policy),
+};
+
+const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_WIPHY] = { .type = NLA_U32 },
 	[NL80211_ATTR_WIPHY_NAME] = { .type = NLA_NUL_STRING,
 				      .len = 20-1 },
@@ -497,6 +553,10 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 		.type = NLA_NESTED,
 		.validation_data = nl80211_ftm_responder_policy,
 	},
+	[NL80211_ATTR_TIMEOUT] = NLA_POLICY_MIN(NLA_U32, 1),
+	[NL80211_ATTR_PEER_MEASUREMENTS] =
+		NLA_POLICY_NESTED(NL80211_PMSR_FTM_REQ_ATTR_MAX,
+				  nl80211_pmsr_attr_policy),
 };
 
 /* policy for the key attributes */
@@ -637,9 +697,9 @@ nl80211_packet_pattern_policy[MAX_NL80211_PKTPAT + 1] = {
 	[NL80211_PKTPAT_OFFSET] = { .type = NLA_U32 },
 };
 
-static int nl80211_prepare_wdev_dump(struct netlink_callback *cb,
-				     struct cfg80211_registered_device **rdev,
-				     struct wireless_dev **wdev)
+int nl80211_prepare_wdev_dump(struct netlink_callback *cb,
+			      struct cfg80211_registered_device **rdev,
+			      struct wireless_dev **wdev)
 {
 	int err;
 
@@ -684,8 +744,8 @@ static int nl80211_prepare_wdev_dump(struct netlink_callback *cb,
 }
 
 /* message building helper */
-static inline void *nl80211hdr_put(struct sk_buff *skb, u32 portid, u32 seq,
-				   int flags, u8 cmd)
+void *nl80211hdr_put(struct sk_buff *skb, u32 portid, u32 seq,
+		     int flags, u8 cmd)
 {
 	/* since there is no private header just add the generic one */
 	return genlmsg_put(skb, portid, seq, &nl80211_fam, flags, cmd);
@@ -1615,6 +1675,91 @@ static int nl80211_add_commands_unsplit(struct cfg80211_registered_device *rdev,
 	return -ENOBUFS;
 }
 
+static int
+nl80211_send_pmsr_ftm_capa(const struct cfg80211_pmsr_capabilities *cap,
+			   struct sk_buff *msg)
+{
+	struct nlattr *ftm;
+
+	if (!cap->ftm.supported)
+		return 0;
+
+	ftm = nla_nest_start(msg, NL80211_PMSR_TYPE_FTM);
+	if (!ftm)
+		return -ENOBUFS;
+
+	if (cap->ftm.asap && nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_ASAP))
+		return -ENOBUFS;
+	if (cap->ftm.non_asap &&
+	    nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_NON_ASAP))
+		return -ENOBUFS;
+	if (cap->ftm.request_lci &&
+	    nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_REQ_LCI))
+		return -ENOBUFS;
+	if (cap->ftm.request_civicloc &&
+	    nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_REQ_CIVICLOC))
+		return -ENOBUFS;
+	if (nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_PREAMBLES,
+			cap->ftm.preambles))
+		return -ENOBUFS;
+	if (nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_BANDWIDTHS,
+			cap->ftm.bandwidths))
+		return -ENOBUFS;
+	if (cap->ftm.max_bursts_exponent >= 0 &&
+	    nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_BURSTS_EXPONENT,
+			cap->ftm.max_bursts_exponent))
+		return -ENOBUFS;
+	if (cap->ftm.max_ftms_per_burst &&
+	    nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_FTMS_PER_BURST,
+			cap->ftm.max_ftms_per_burst))
+		return -ENOBUFS;
+
+	nla_nest_end(msg, ftm);
+	return 0;
+}
+
+static int nl80211_send_pmsr_capa(struct cfg80211_registered_device *rdev,
+				  struct sk_buff *msg)
+{
+	const struct cfg80211_pmsr_capabilities *cap = rdev->wiphy.pmsr_capa;
+	struct nlattr *pmsr, *caps;
+
+	if (!cap)
+		return 0;
+
+	/*
+	 * we don't need to clean up anything here since the caller
+	 * will genlmsg_cancel() if we fail
+	 */
+
+	pmsr = nla_nest_start(msg, NL80211_ATTR_PEER_MEASUREMENTS);
+	if (!pmsr)
+		return -ENOBUFS;
+
+	if (nla_put_u32(msg, NL80211_PMSR_ATTR_MAX_PEERS, cap->max_peers))
+		return -ENOBUFS;
+
+	if (cap->report_ap_tsf &&
+	    nla_put_flag(msg, NL80211_PMSR_ATTR_REPORT_AP_TSF))
+		return -ENOBUFS;
+
+	if (cap->randomize_mac_addr &&
+	    nla_put_flag(msg, NL80211_PMSR_ATTR_RANDOMIZE_MAC_ADDR))
+		return -ENOBUFS;
+
+	caps = nla_nest_start(msg, NL80211_PMSR_ATTR_TYPE_CAPA);
+	if (!caps)
+		return -ENOBUFS;
+
+	if (nl80211_send_pmsr_ftm_capa(cap, msg))
+		return -ENOBUFS;
+
+	nla_nest_end(msg, caps);
+	nla_nest_end(msg, pmsr);
+
+	return 0;
+}
+
 struct nl80211_dump_wiphy_state {
 	s64 filter_wiphy;
 	long start;
@@ -2118,6 +2263,12 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
 				goto nla_put_failure;
 		}
 
+		state->split_start++;
+		break;
+	case 14:
+		if (nl80211_send_pmsr_capa(rdev, msg))
+			goto nla_put_failure;
+
 		/* done */
 		state->split_start = 0;
 		break;
@@ -2318,9 +2469,9 @@ static bool nl80211_can_set_dev_channel(struct wireless_dev *wdev)
 		wdev->iftype == NL80211_IFTYPE_P2P_GO;
 }
 
-static int nl80211_parse_chandef(struct cfg80211_registered_device *rdev,
-				 struct genl_info *info,
-				 struct cfg80211_chan_def *chandef)
+int nl80211_parse_chandef(struct cfg80211_registered_device *rdev,
+			  struct genl_info *info,
+			  struct cfg80211_chan_def *chandef)
 {
 	struct netlink_ext_ack *extack = info->extack;
 	struct nlattr **attrs = info->attrs;
@@ -2794,12 +2945,6 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 	return 0;
 }
 
-static inline u64 wdev_id(struct wireless_dev *wdev)
-{
-	return (u64)wdev->identifier |
-	       ((u64)wiphy_to_rdev(wdev->wiphy)->wiphy_idx << 32);
-}
-
 static int nl80211_send_chandef(struct sk_buff *msg,
 				const struct cfg80211_chan_def *chandef)
 {
@@ -4521,8 +4666,7 @@ static int parse_station_flags(struct genl_info *info,
 	return 0;
 }
 
-static bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info,
-				 int attr)
+bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, int attr)
 {
 	struct nlattr *rate;
 	u32 bitrate;
@@ -6855,8 +6999,8 @@ static int parse_bss_select(struct nlattr *nla, struct wiphy *wiphy,
 	return 0;
 }
 
-static int nl80211_parse_random_mac(struct nlattr **attrs,
-				    u8 *mac_addr, u8 *mac_addr_mask)
+int nl80211_parse_random_mac(struct nlattr **attrs,
+			     u8 *mac_addr, u8 *mac_addr_mask)
 {
 	int i;
 
@@ -13898,6 +14042,14 @@ static const struct genl_ops nl80211_ops[] = {
 		.internal_flags = NL80211_FLAG_NEED_NETDEV |
 				  NL80211_FLAG_NEED_RTNL,
 	},
+	{
+		.cmd = NL80211_CMD_PEER_MEASUREMENT_START,
+		.doit = nl80211_pmsr_start,
+		.policy = nl80211_policy,
+		.flags = GENL_UNS_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_WDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
 };
 
 static struct genl_family nl80211_fam __ro_after_init = {
@@ -15881,6 +16033,8 @@ static int nl80211_netlink_notify(struct notifier_block * nb,
 			} else if (wdev->conn_owner_nlportid == notify->portid) {
 				schedule_work(&wdev->disconnect_wk);
 			}
+
+			cfg80211_release_pmsr(wdev, notify->portid);
 		}
 
 		spin_lock_bh(&rdev->beacon_registrations_lock);
diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h
index 79e47fe60c35..531c82dcba6b 100644
--- a/net/wireless/nl80211.h
+++ b/net/wireless/nl80211.h
@@ -1,4 +1,8 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Portions of this file
+ * Copyright (C) 2018 Intel Corporation
+ */
 #ifndef __NET_WIRELESS_NL80211_H
 #define __NET_WIRELESS_NL80211_H
 
@@ -6,6 +10,30 @@
 
 int nl80211_init(void);
 void nl80211_exit(void);
+
+extern const struct nla_policy nl80211_policy[NUM_NL80211_ATTR];
+
+void *nl80211hdr_put(struct sk_buff *skb, u32 portid, u32 seq,
+		     int flags, u8 cmd);
+bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info,
+			  int attr);
+
+static inline u64 wdev_id(struct wireless_dev *wdev)
+{
+	return (u64)wdev->identifier |
+	       ((u64)wiphy_to_rdev(wdev->wiphy)->wiphy_idx << 32);
+}
+
+int nl80211_prepare_wdev_dump(struct netlink_callback *cb,
+			      struct cfg80211_registered_device **rdev,
+			      struct wireless_dev **wdev);
+
+int nl80211_parse_chandef(struct cfg80211_registered_device *rdev,
+			  struct genl_info *info,
+			  struct cfg80211_chan_def *chandef);
+int nl80211_parse_random_mac(struct nlattr **attrs,
+			     u8 *mac_addr, u8 *mac_addr_mask);
+
 void nl80211_notify_wiphy(struct cfg80211_registered_device *rdev,
 			  enum nl80211_commands cmd);
 void nl80211_notify_iface(struct cfg80211_registered_device *rdev,
@@ -95,4 +123,8 @@ void nl80211_send_ap_stopped(struct wireless_dev *wdev);
 
 void cfg80211_rdev_free_coalesce(struct cfg80211_registered_device *rdev);
 
+/* peer measurement */
+int nl80211_pmsr_start(struct sk_buff *skb, struct genl_info *info);
+int nl80211_pmsr_dump_results(struct sk_buff *skb, struct netlink_callback *cb);
+
 #endif /* __NET_WIRELESS_NL80211_H */
diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c
new file mode 100644
index 000000000000..de9286703280
--- /dev/null
+++ b/net/wireless/pmsr.c
@@ -0,0 +1,590 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2018 Intel Corporation
+ */
+#ifndef __PMSR_H
+#define __PMSR_H
+#include <net/cfg80211.h>
+#include "core.h"
+#include "nl80211.h"
+#include "rdev-ops.h"
+
+static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev,
+			  struct nlattr *ftmreq,
+			  struct cfg80211_pmsr_request_peer *out,
+			  struct genl_info *info)
+{
+	const struct cfg80211_pmsr_capabilities *capa = rdev->wiphy.pmsr_capa;
+	struct nlattr *tb[NL80211_PMSR_FTM_REQ_ATTR_MAX + 1];
+	u32 preamble = NL80211_PREAMBLE_DMG; /* only optional in DMG */
+
+	/* validate existing data */
+	if (!(rdev->wiphy.pmsr_capa->ftm.bandwidths & BIT(out->chandef.width))) {
+		NL_SET_ERR_MSG(info->extack, "FTM: unsupported bandwidth");
+		return -EINVAL;
+	}
+
+	/* no validation needed - was already done via nested policy */
+	nla_parse_nested(tb, NL80211_PMSR_FTM_REQ_ATTR_MAX, ftmreq, NULL, NULL);
+
+	if (tb[NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE])
+		preamble = nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE]);
+
+	/* set up values - struct is 0-initialized */
+	out->ftm.requested = true;
+
+	switch (out->chandef.chan->band) {
+	case NL80211_BAND_60GHZ:
+		/* optional */
+		break;
+	default:
+		if (!tb[NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE]) {
+			NL_SET_ERR_MSG(info->extack,
+				       "FTM: must specify preamble");
+			return -EINVAL;
+		}
+	}
+
+	if (!(capa->ftm.preambles & BIT(preamble))) {
+		NL_SET_ERR_MSG_ATTR(info->extack,
+				    tb[NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE],
+				    "FTM: invalid preamble");
+		return -EINVAL;
+	}
+
+	out->ftm.preamble = preamble;
+
+	out->ftm.burst_period = 0;
+	if (tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_PERIOD])
+		out->ftm.burst_period =
+			nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_PERIOD]);
+
+	out->ftm.asap = !!tb[NL80211_PMSR_FTM_REQ_ATTR_ASAP];
+	if (out->ftm.asap && !capa->ftm.asap) {
+		NL_SET_ERR_MSG_ATTR(info->extack,
+				    tb[NL80211_PMSR_FTM_REQ_ATTR_ASAP],
+				    "FTM: ASAP mode not supported");
+		return -EINVAL;
+	}
+
+	if (!out->ftm.asap && !capa->ftm.non_asap) {
+		NL_SET_ERR_MSG(info->extack,
+			       "FTM: non-ASAP mode not supported");
+		return -EINVAL;
+	}
+
+	out->ftm.num_bursts_exp = 0;
+	if (tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP])
+		out->ftm.num_bursts_exp =
+			nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP]);
+
+	if (capa->ftm.max_bursts_exponent >= 0 &&
+	    out->ftm.num_bursts_exp > capa->ftm.max_bursts_exponent) {
+		NL_SET_ERR_MSG_ATTR(info->extack,
+				    tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP],
+				    "FTM: max NUM_BURSTS_EXP must be set lower than the device limit");
+		return -EINVAL;
+	}
+
+	out->ftm.burst_duration = 15;
+	if (tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION])
+		out->ftm.burst_duration =
+			nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION]);
+
+	out->ftm.ftms_per_burst = 0;
+	if (tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST])
+		out->ftm.ftms_per_burst =
+			nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST]);
+
+	if (capa->ftm.max_ftms_per_burst &&
+	    (out->ftm.ftms_per_burst > capa->ftm.max_ftms_per_burst ||
+	     out->ftm.ftms_per_burst == 0)) {
+		NL_SET_ERR_MSG_ATTR(info->extack,
+				    tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST],
+				    "FTM: FTMs per burst must be set lower than the device limit but non-zero");
+		return -EINVAL;
+	}
+
+	out->ftm.ftmr_retries = 3;
+	if (tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_FTMR_RETRIES])
+		out->ftm.ftmr_retries =
+			nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_FTMR_RETRIES]);
+
+	out->ftm.request_lci = !!tb[NL80211_PMSR_FTM_REQ_ATTR_REQUEST_LCI];
+	if (out->ftm.request_lci && !capa->ftm.request_lci) {
+		NL_SET_ERR_MSG_ATTR(info->extack,
+				    tb[NL80211_PMSR_FTM_REQ_ATTR_REQUEST_LCI],
+				    "FTM: LCI request not supported");
+	}
+
+	out->ftm.request_civicloc =
+		!!tb[NL80211_PMSR_FTM_REQ_ATTR_REQUEST_CIVICLOC];
+	if (out->ftm.request_civicloc && !capa->ftm.request_civicloc) {
+		NL_SET_ERR_MSG_ATTR(info->extack,
+				    tb[NL80211_PMSR_FTM_REQ_ATTR_REQUEST_CIVICLOC],
+			    "FTM: civic location request not supported");
+	}
+
+	return 0;
+}
+
+static int pmsr_parse_peer(struct cfg80211_registered_device *rdev,
+			   struct nlattr *peer,
+			   struct cfg80211_pmsr_request_peer *out,
+			   struct genl_info *info)
+{
+	struct nlattr *tb[NL80211_PMSR_PEER_ATTR_MAX + 1];
+	struct nlattr *req[NL80211_PMSR_REQ_ATTR_MAX + 1];
+	struct nlattr *treq;
+	int err, rem;
+
+	/* no validation needed - was already done via nested policy */
+	nla_parse_nested(tb, NL80211_PMSR_PEER_ATTR_MAX, peer, NULL, NULL);
+
+	if (!tb[NL80211_PMSR_PEER_ATTR_ADDR] ||
+	    !tb[NL80211_PMSR_PEER_ATTR_CHAN] ||
+	    !tb[NL80211_PMSR_PEER_ATTR_REQ]) {
+		NL_SET_ERR_MSG_ATTR(info->extack, peer,
+				    "insufficient peer data");
+		return -EINVAL;
+	}
+
+	memcpy(out->addr, nla_data(tb[NL80211_PMSR_PEER_ATTR_ADDR]), ETH_ALEN);
+
+	/* reuse info->attrs */
+	memset(info->attrs, 0, sizeof(*info->attrs) * (NL80211_ATTR_MAX + 1));
+	/* need to validate here, we don't want to have validation recursion */
+	err = nla_parse_nested(info->attrs, NL80211_ATTR_MAX,
+			       tb[NL80211_PMSR_PEER_ATTR_CHAN],
+			       nl80211_policy, info->extack);
+	if (err)
+		return err;
+
+	err = nl80211_parse_chandef(rdev, info, &out->chandef);
+	if (err)
+		return err;
+
+	/* no validation needed - was already done via nested policy */
+	nla_parse_nested(req, NL80211_PMSR_REQ_ATTR_MAX,
+			 tb[NL80211_PMSR_PEER_ATTR_REQ],
+			 NULL, NULL);
+
+	if (!req[NL80211_PMSR_REQ_ATTR_DATA]) {
+		NL_SET_ERR_MSG_ATTR(info->extack,
+				    tb[NL80211_PMSR_PEER_ATTR_REQ],
+				    "missing request type/data");
+		return -EINVAL;
+	}
+
+	if (req[NL80211_PMSR_REQ_ATTR_GET_AP_TSF])
+		out->report_ap_tsf = true;
+
+	if (out->report_ap_tsf && !rdev->wiphy.pmsr_capa->report_ap_tsf) {
+		NL_SET_ERR_MSG_ATTR(info->extack,
+				    req[NL80211_PMSR_REQ_ATTR_GET_AP_TSF],
+				    "reporting AP TSF is not supported");
+		return -EINVAL;
+	}
+
+	nla_for_each_nested(treq, req[NL80211_PMSR_REQ_ATTR_DATA], rem) {
+		switch (nla_type(treq)) {
+		case NL80211_PMSR_TYPE_FTM:
+			err = pmsr_parse_ftm(rdev, treq, out, info);
+			break;
+		default:
+			NL_SET_ERR_MSG_ATTR(info->extack, treq,
+					    "unsupported measurement type");
+			err = -EINVAL;
+		}
+	}
+
+	if (err)
+		return err;
+
+	return 0;
+}
+
+int nl80211_pmsr_start(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr *reqattr = info->attrs[NL80211_ATTR_PEER_MEASUREMENTS];
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct wireless_dev *wdev = info->user_ptr[1];
+	struct cfg80211_pmsr_request *req;
+	struct nlattr *peers, *peer;
+	int count, rem, err, idx;
+
+	if (!rdev->wiphy.pmsr_capa)
+		return -EOPNOTSUPP;
+
+	if (!reqattr)
+		return -EINVAL;
+
+	peers = nla_find(nla_data(reqattr), nla_len(reqattr),
+			 NL80211_PMSR_ATTR_PEERS);
+	if (!peers)
+		return -EINVAL;
+
+	count = 0;
+	nla_for_each_nested(peer, peers, rem) {
+		count++;
+
+		if (count > rdev->wiphy.pmsr_capa->max_peers) {
+			NL_SET_ERR_MSG_ATTR(info->extack, peer,
+					    "Too many peers used");
+			return -EINVAL;
+		}
+	}
+
+	req = kzalloc(struct_size(req, peers, count), GFP_KERNEL);
+	if (!req)
+		return -ENOMEM;
+
+	if (info->attrs[NL80211_ATTR_TIMEOUT])
+		req->timeout = nla_get_u32(info->attrs[NL80211_ATTR_TIMEOUT]);
+
+	if (info->attrs[NL80211_ATTR_MAC]) {
+		if (!rdev->wiphy.pmsr_capa->randomize_mac_addr) {
+			NL_SET_ERR_MSG_ATTR(info->extack,
+					    info->attrs[NL80211_ATTR_MAC],
+					    "device cannot randomize MAC address");
+			err = -EINVAL;
+			goto out_err;
+		}
+
+		err = nl80211_parse_random_mac(info->attrs, req->mac_addr,
+					       req->mac_addr_mask);
+		if (err)
+			goto out_err;
+	} else {
+		memcpy(req->mac_addr, nla_data(info->attrs[NL80211_ATTR_MAC]),
+		       ETH_ALEN);
+		memset(req->mac_addr_mask, 0xff, ETH_ALEN);
+	}
+
+	idx = 0;
+	nla_for_each_nested(peer, peers, rem) {
+		/* NB: this reuses info->attrs, but we no longer need it */
+		err = pmsr_parse_peer(rdev, peer, &req->peers[idx], info);
+		if (err)
+			goto out_err;
+		idx++;
+	}
+
+	req->n_peers = count;
+	req->cookie = cfg80211_assign_cookie(rdev);
+
+	err = rdev_start_pmsr(rdev, wdev, req);
+	if (err)
+		goto out_err;
+
+	list_add_tail(&req->list, &wdev->pmsr_list);
+
+	nl_set_extack_cookie_u64(info->extack, req->cookie);
+	return 0;
+out_err:
+	kfree(req);
+	return err;
+}
+
+void cfg80211_pmsr_complete(struct wireless_dev *wdev,
+			    struct cfg80211_pmsr_request *req,
+			    gfp_t gfp)
+{
+	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+	struct sk_buff *msg;
+	void *hdr;
+
+	trace_cfg80211_pmsr_complete(wdev->wiphy, wdev, req->cookie);
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+	if (!msg)
+		goto free_request;
+
+	hdr = nl80211hdr_put(msg, 0, 0, 0,
+			     NL80211_CMD_PEER_MEASUREMENT_COMPLETE);
+	if (!hdr)
+		goto free_msg;
+
+	if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+	    nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
+			      NL80211_ATTR_PAD))
+		goto free_msg;
+
+	if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, req->cookie,
+			      NL80211_ATTR_PAD))
+		goto free_msg;
+
+	genlmsg_end(msg, hdr);
+	genlmsg_unicast(wiphy_net(wdev->wiphy), msg, req->nl_portid);
+	goto free_request;
+free_msg:
+	nlmsg_free(msg);
+free_request:
+	spin_lock_bh(&wdev->pmsr_lock);
+	list_del(&req->list);
+	spin_unlock_bh(&wdev->pmsr_lock);
+	kfree(req);
+}
+EXPORT_SYMBOL_GPL(cfg80211_pmsr_complete);
+
+static int nl80211_pmsr_send_ftm_res(struct sk_buff *msg,
+				     struct cfg80211_pmsr_result *res)
+{
+	if (res->status == NL80211_PMSR_STATUS_FAILURE) {
+		if (nla_put_u32(msg, NL80211_PMSR_FTM_RESP_ATTR_FAIL_REASON,
+				res->ftm.failure_reason))
+			goto error;
+
+		if (res->ftm.failure_reason ==
+			NL80211_PMSR_FTM_FAILURE_PEER_BUSY &&
+		    res->ftm.busy_retry_time &&
+		    nla_put_u32(msg, NL80211_PMSR_FTM_RESP_ATTR_BUSY_RETRY_TIME,
+				res->ftm.busy_retry_time))
+			goto error;
+
+		return 0;
+	}
+
+#define PUT(tp, attr, val)						\
+	do {								\
+		if (nla_put_##tp(msg,					\
+				 NL80211_PMSR_FTM_RESP_ATTR_##attr,	\
+				 res->ftm.val))				\
+			goto error;					\
+	} while (0)
+
+#define PUTOPT(tp, attr, val)						\
+	do {								\
+		if (res->ftm.val##_valid)				\
+			PUT(tp, attr, val);				\
+	} while (0)
+
+#define PUT_U64(attr, val)						\
+	do {								\
+		if (nla_put_u64_64bit(msg,				\
+				      NL80211_PMSR_FTM_RESP_ATTR_##attr,\
+				      res->ftm.val,			\
+				      NL80211_PMSR_FTM_RESP_ATTR_PAD))	\
+			goto error;					\
+	} while (0)
+
+#define PUTOPT_U64(attr, val)						\
+	do {								\
+		if (res->ftm.val##_valid)				\
+			PUT_U64(attr, val);				\
+	} while (0)
+
+	if (res->ftm.burst_index >= 0)
+		PUT(u32, BURST_INDEX, burst_index);
+	PUTOPT(u32, NUM_FTMR_ATTEMPTS, num_ftmr_attempts);
+	PUTOPT(u32, NUM_FTMR_SUCCESSES, num_ftmr_successes);
+	PUT(u8, NUM_BURSTS_EXP, num_bursts_exp);
+	PUT(u8, BURST_DURATION, burst_duration);
+	PUT(u8, FTMS_PER_BURST, ftms_per_burst);
+	PUTOPT(s32, RSSI_AVG, rssi_avg);
+	PUTOPT(s32, RSSI_SPREAD, rssi_spread);
+	if (res->ftm.tx_rate_valid &&
+	    !nl80211_put_sta_rate(msg, &res->ftm.tx_rate,
+				  NL80211_PMSR_FTM_RESP_ATTR_TX_RATE))
+		goto error;
+	if (res->ftm.rx_rate_valid &&
+	    !nl80211_put_sta_rate(msg, &res->ftm.rx_rate,
+				  NL80211_PMSR_FTM_RESP_ATTR_RX_RATE))
+		goto error;
+	PUTOPT_U64(RTT_AVG, rtt_avg);
+	PUTOPT_U64(RTT_VARIANCE, rtt_variance);
+	PUTOPT_U64(RTT_SPREAD, rtt_spread);
+	PUTOPT_U64(DIST_AVG, dist_avg);
+	PUTOPT_U64(DIST_VARIANCE, dist_variance);
+	PUTOPT_U64(DIST_SPREAD, dist_spread);
+	if (res->ftm.lci && res->ftm.lci_len &&
+	    nla_put(msg, NL80211_PMSR_FTM_RESP_ATTR_LCI,
+		    res->ftm.lci_len, res->ftm.lci))
+		goto error;
+	if (res->ftm.civicloc && res->ftm.civicloc_len &&
+	    nla_put(msg, NL80211_PMSR_FTM_RESP_ATTR_CIVICLOC,
+		    res->ftm.civicloc_len, res->ftm.civicloc))
+		goto error;
+#undef PUT
+#undef PUTOPT
+#undef PUT_U64
+#undef PUTOPT_U64
+
+	return 0;
+error:
+	return -ENOSPC;
+}
+
+static int nl80211_pmsr_send_result(struct sk_buff *msg,
+				    struct cfg80211_pmsr_result *res)
+{
+	struct nlattr *pmsr, *peers, *peer, *resp, *data, *typedata;
+
+	pmsr = nla_nest_start(msg, NL80211_ATTR_PEER_MEASUREMENTS);
+	if (!pmsr)
+		goto error;
+
+	peers = nla_nest_start(msg, NL80211_PMSR_ATTR_PEERS);
+	if (!peers)
+		goto error;
+
+	peer = nla_nest_start(msg, 1);
+	if (!peer)
+		goto error;
+
+	if (nla_put(msg, NL80211_PMSR_PEER_ATTR_ADDR, ETH_ALEN, res->addr))
+		goto error;
+
+	resp = nla_nest_start(msg, NL80211_PMSR_PEER_ATTR_RESP);
+	if (!resp)
+		goto error;
+
+	if (nla_put_u32(msg, NL80211_PMSR_RESP_ATTR_STATUS, res->status) ||
+	    nla_put_u64_64bit(msg, NL80211_PMSR_RESP_ATTR_HOST_TIME,
+			      res->host_time, NL80211_PMSR_RESP_ATTR_PAD))
+		goto error;
+
+	if (res->ap_tsf_valid &&
+	    nla_put_u64_64bit(msg, NL80211_PMSR_RESP_ATTR_AP_TSF,
+			      res->host_time, NL80211_PMSR_RESP_ATTR_PAD))
+		goto error;
+
+	if (res->final && nla_put_flag(msg, NL80211_PMSR_RESP_ATTR_FINAL))
+		goto error;
+
+	data = nla_nest_start(msg, NL80211_PMSR_RESP_ATTR_DATA);
+	if (!data)
+		goto error;
+
+	typedata = nla_nest_start(msg, res->type);
+	if (!typedata)
+		goto error;
+
+	switch (res->type) {
+	case NL80211_PMSR_TYPE_FTM:
+		if (nl80211_pmsr_send_ftm_res(msg, res))
+			goto error;
+		break;
+	default:
+		WARN_ON(1);
+	}
+
+	nla_nest_end(msg, typedata);
+	nla_nest_end(msg, data);
+	nla_nest_end(msg, resp);
+	nla_nest_end(msg, peer);
+	nla_nest_end(msg, peers);
+	nla_nest_end(msg, pmsr);
+
+	return 0;
+error:
+	return -ENOSPC;
+}
+
+void cfg80211_pmsr_report(struct wireless_dev *wdev,
+			  struct cfg80211_pmsr_request *req,
+			  struct cfg80211_pmsr_result *result,
+			  gfp_t gfp)
+{
+	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+	struct sk_buff *msg;
+	void *hdr;
+	int err;
+
+	trace_cfg80211_pmsr_report(wdev->wiphy, wdev, req->cookie,
+				   result->addr);
+
+	/*
+	 * Currently, only variable items are LCI and civic location,
+	 * both of which are reasonably short so we don't need to
+	 * worry about them here for the allocation.
+	 */
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+	if (!msg)
+		return;
+
+	hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_PEER_MEASUREMENT_RESULT);
+	if (!hdr)
+		goto free;
+
+	if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+	    nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
+			      NL80211_ATTR_PAD))
+		goto free;
+
+	if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, req->cookie,
+			      NL80211_ATTR_PAD))
+		goto free;
+
+	err = nl80211_pmsr_send_result(msg, result);
+	if (err) {
+		pr_err_ratelimited("peer measurement result: message didn't fit!");
+		goto free;
+	}
+
+	genlmsg_end(msg, hdr);
+	genlmsg_unicast(wiphy_net(wdev->wiphy), msg, req->nl_portid);
+	return;
+free:
+	nlmsg_free(msg);
+}
+EXPORT_SYMBOL_GPL(cfg80211_pmsr_report);
+
+void cfg80211_pmsr_free_wk(struct work_struct *work)
+{
+	struct wireless_dev *wdev = container_of(work, struct wireless_dev,
+						 pmsr_free_wk);
+	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+	struct cfg80211_pmsr_request *req, *tmp;
+	LIST_HEAD(free_list);
+
+	spin_lock_bh(&wdev->pmsr_lock);
+	list_for_each_entry_safe(req, tmp, &wdev->pmsr_list, list) {
+		if (req->nl_portid)
+			continue;
+		list_move_tail(&req->list, &free_list);
+	}
+	spin_unlock_bh(&wdev->pmsr_lock);
+
+	list_for_each_entry_safe(req, tmp, &free_list, list) {
+		wdev_lock(wdev);
+		rdev_abort_pmsr(rdev, wdev, req);
+		wdev_unlock(wdev);
+
+		kfree(req);
+	}
+}
+
+void cfg80211_pmsr_wdev_down(struct wireless_dev *wdev)
+{
+	struct cfg80211_pmsr_request *req;
+	bool found = false;
+
+	spin_lock_bh(&wdev->pmsr_lock);
+	list_for_each_entry(req, &wdev->pmsr_list, list) {
+		found = true;
+		req->nl_portid = 0;
+	}
+	spin_unlock_bh(&wdev->pmsr_lock);
+
+	if (found)
+		schedule_work(&wdev->pmsr_free_wk);
+	flush_work(&wdev->pmsr_free_wk);
+	WARN_ON(!list_empty(&wdev->pmsr_list));
+}
+
+void cfg80211_release_pmsr(struct wireless_dev *wdev, u32 portid)
+{
+	struct cfg80211_pmsr_request *req;
+
+	spin_lock_bh(&wdev->pmsr_lock);
+	list_for_each_entry(req, &wdev->pmsr_list, list) {
+		if (req->nl_portid == portid) {
+			req->nl_portid = 0;
+			schedule_work(&wdev->pmsr_free_wk);
+		}
+	}
+	spin_unlock_bh(&wdev->pmsr_lock);
+}
+
+#endif /* __PMSR_H */
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index 51380b5c32f2..5cb48d135fab 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -1247,4 +1247,29 @@ rdev_get_ftm_responder_stats(struct cfg80211_registered_device *rdev,
 	return ret;
 }
 
+static inline int
+rdev_start_pmsr(struct cfg80211_registered_device *rdev,
+		struct wireless_dev *wdev,
+		struct cfg80211_pmsr_request *request)
+{
+	int ret = -EOPNOTSUPP;
+
+	trace_rdev_start_pmsr(&rdev->wiphy, wdev, request->cookie);
+	if (rdev->ops->start_pmsr)
+		ret = rdev->ops->start_pmsr(&rdev->wiphy, wdev, request);
+	trace_rdev_return_int(&rdev->wiphy, ret);
+	return ret;
+}
+
+static inline void
+rdev_abort_pmsr(struct cfg80211_registered_device *rdev,
+		struct wireless_dev *wdev,
+		struct cfg80211_pmsr_request *request)
+{
+	trace_rdev_abort_pmsr(&rdev->wiphy, wdev, request->cookie);
+	if (rdev->ops->abort_pmsr)
+		rdev->ops->abort_pmsr(&rdev->wiphy, wdev, request);
+	trace_rdev_return_void(&rdev->wiphy);
+}
+
 #endif /* __CFG80211_RDEV_OPS */
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index f7909867d8fb..44b2ce1bb13a 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -361,6 +361,24 @@ DECLARE_EVENT_CLASS(wiphy_wdev_evt,
 	TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT, WIPHY_PR_ARG, WDEV_PR_ARG)
 );
 
+DECLARE_EVENT_CLASS(wiphy_wdev_cookie_evt,
+	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie),
+	TP_ARGS(wiphy, wdev, cookie),
+	TP_STRUCT__entry(
+		WIPHY_ENTRY
+		WDEV_ENTRY
+		__field(u64, cookie)
+	),
+	TP_fast_assign(
+		WIPHY_ASSIGN;
+		WDEV_ASSIGN;
+		__entry->cookie = cookie;
+	),
+	TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie: %lld",
+		  WIPHY_PR_ARG, WDEV_PR_ARG,
+		  (unsigned long long)__entry->cookie)
+);
+
 DEFINE_EVENT(wiphy_wdev_evt, rdev_return_wdev,
 	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
 	TP_ARGS(wiphy, wdev)
@@ -2502,6 +2520,16 @@ TRACE_EVENT(rdev_get_ftm_responder_stats,
 		__entry->out_of_window)
 );
 
+DEFINE_EVENT(wiphy_wdev_cookie_evt, rdev_start_pmsr,
+	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie),
+	TP_ARGS(wiphy, wdev, cookie)
+);
+
+DEFINE_EVENT(wiphy_wdev_cookie_evt, rdev_abort_pmsr,
+	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie),
+	TP_ARGS(wiphy, wdev, cookie)
+);
+
 /*************************************************************
  *	     cfg80211 exported functions traces		     *
  *************************************************************/
@@ -3294,6 +3322,46 @@ TRACE_EVENT(cfg80211_stop_iface,
 	TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT,
 		  WIPHY_PR_ARG, WDEV_PR_ARG)
 );
+
+TRACE_EVENT(cfg80211_pmsr_report,
+	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
+		 u64 cookie, const u8 *addr),
+	TP_ARGS(wiphy, wdev, cookie, addr),
+	TP_STRUCT__entry(
+		WIPHY_ENTRY
+		WDEV_ENTRY
+		__field(u64, cookie)
+		MAC_ENTRY(addr)
+	),
+	TP_fast_assign(
+		WIPHY_ASSIGN;
+		WDEV_ASSIGN;
+		__entry->cookie = cookie;
+		MAC_ASSIGN(addr, addr);
+	),
+	TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie:%lld, " MAC_PR_FMT,
+		  WIPHY_PR_ARG, WDEV_PR_ARG,
+		  (unsigned long long)__entry->cookie,
+		  MAC_PR_ARG(addr))
+);
+
+TRACE_EVENT(cfg80211_pmsr_complete,
+	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie),
+	TP_ARGS(wiphy, wdev, cookie),
+	TP_STRUCT__entry(
+		WIPHY_ENTRY
+		WDEV_ENTRY
+		__field(u64, cookie)
+	),
+	TP_fast_assign(
+		WIPHY_ASSIGN;
+		WDEV_ASSIGN;
+		__entry->cookie = cookie;
+	),
+	TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie:%lld",
+		  WIPHY_PR_ARG, WDEV_PR_ARG,
+		  (unsigned long long)__entry->cookie)
+);
 #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */
 
 #undef TRACE_INCLUDE_PATH
-- 
cgit v1.2.3-59-g8ed1b


From dbdaee7aa6e61f56aac61b71a7807e76f92cc895 Mon Sep 17 00:00:00 2001
From: Bob Copeland <me@bobcopeland.com>
Date: Thu, 25 Oct 2018 15:48:53 -0400
Subject: {nl,mac}80211: report gate connectivity in station info

Capture the current state of gate connectivity from the mesh
formation field in mesh config whenever we receive a beacon,
and report that via GET_STATION.  This allows applications
doing mesh peering in userspace to make peering decisions
based on peers' current upstream connectivity.

Signed-off-by: Bob Copeland <bobcopeland@fb.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h    | 2 ++
 include/net/cfg80211.h       | 3 +++
 include/uapi/linux/nl80211.h | 3 +++
 net/mac80211/mesh_plink.c    | 3 +++
 net/mac80211/sta_info.c      | 4 +++-
 net/mac80211/sta_info.h      | 2 ++
 net/wireless/nl80211.c       | 1 +
 7 files changed, 17 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 0ef67f837ae1..407d6fd66fa9 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -812,6 +812,8 @@ enum mesh_config_capab_flags {
 	IEEE80211_MESHCONF_CAPAB_POWER_SAVE_LEVEL	= 0x40,
 };
 
+#define IEEE80211_MESHCONF_FORM_CONNECTED_TO_GATE 0x1
+
 /**
  * mesh channel switch parameters element's flag indicator
  *
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index c21c5c70a2fd..24d2db8e082d 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1296,6 +1296,7 @@ struct cfg80211_tid_stats {
  * @rx_beacon: number of beacons received from this peer
  * @rx_beacon_signal_avg: signal strength average (in dBm) for beacons received
  *	from this peer
+ * @connected_to_gate: true if mesh STA has a path to mesh gate
  * @rx_duration: aggregate PPDU duration(usecs) for all the frames from a peer
  * @pertid: per-TID statistics, see &struct cfg80211_tid_stats, using the last
  *	(IEEE80211_NUM_TIDS) index for MSDUs not encapsulated in QoS-MPDUs.
@@ -1350,6 +1351,8 @@ struct station_info {
 	u64 rx_beacon;
 	u64 rx_duration;
 	u8 rx_beacon_signal_avg;
+	u8 connected_to_gate;
+
 	struct cfg80211_tid_stats *pertid;
 	s8 ack_signal;
 	s8 avg_ack_signal;
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index e45b88925783..ff6005edf32f 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -3116,6 +3116,8 @@ enum nl80211_sta_bss_param {
  *	with an FCS error (u32, from this station). This count may not include
  *	some packets with an FCS error due to TA corruption. Hence this counter
  *	might not be fully accurate.
+ * @NL80211_STA_INFO_CONNECTED_TO_GATE: set to true if STA has a path to a
+ *	mesh gate
  * @__NL80211_STA_INFO_AFTER_LAST: internal
  * @NL80211_STA_INFO_MAX: highest possible station info attribute
  */
@@ -3158,6 +3160,7 @@ enum nl80211_sta_info {
 	NL80211_STA_INFO_ACK_SIGNAL_AVG,
 	NL80211_STA_INFO_RX_MPDUS,
 	NL80211_STA_INFO_FCS_ERROR_COUNT,
+	NL80211_STA_INFO_CONNECTED_TO_GATE,
 
 	/* keep last */
 	__NL80211_STA_INFO_AFTER_LAST,
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index 5b5b0f95ffd1..5f45a2b273df 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -590,6 +590,9 @@ void mesh_neighbour_update(struct ieee80211_sub_if_data *sdata,
 	if (!sta)
 		goto out;
 
+	sta->mesh->connected_to_gate = elems->mesh_config->meshconf_form &
+		IEEE80211_MESHCONF_FORM_CONNECTED_TO_GATE;
+
 	if (mesh_peer_accepts_plinks(elems) &&
 	    sta->mesh->plink_state == NL80211_PLINK_LISTEN &&
 	    sdata->u.mesh.accepting_plinks &&
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 11b7ae691db0..c4a8f115ed33 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -2264,7 +2264,8 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
 				 BIT_ULL(NL80211_STA_INFO_PLINK_STATE) |
 				 BIT_ULL(NL80211_STA_INFO_LOCAL_PM) |
 				 BIT_ULL(NL80211_STA_INFO_PEER_PM) |
-				 BIT_ULL(NL80211_STA_INFO_NONPEER_PM);
+				 BIT_ULL(NL80211_STA_INFO_NONPEER_PM) |
+				 BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_GATE);
 
 		sinfo->llid = sta->mesh->llid;
 		sinfo->plid = sta->mesh->plid;
@@ -2276,6 +2277,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
 		sinfo->local_pm = sta->mesh->local_pm;
 		sinfo->peer_pm = sta->mesh->peer_pm;
 		sinfo->nonpeer_pm = sta->mesh->nonpeer_pm;
+		sinfo->connected_to_gate = sta->mesh->connected_to_gate;
 #endif
 	}
 
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 9a04327d71d1..8eb29041be54 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -364,6 +364,7 @@ DECLARE_EWMA(mesh_fail_avg, 20, 8)
  * @nonpeer_pm: STA power save mode towards non-peer neighbors
  * @processed_beacon: set to true after peer rates and capabilities are
  *	processed
+ * @connected_to_gate: true if mesh STA has a path to a mesh gate
  * @fail_avg: moving percentage of failed MSDUs
  */
 struct mesh_sta {
@@ -381,6 +382,7 @@ struct mesh_sta {
 	u8 plink_retries;
 
 	bool processed_beacon;
+	bool connected_to_gate;
 
 	enum nl80211_plink_state plink_state;
 	u32 plink_timeout;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 5e7178954d61..f231059242cc 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -4883,6 +4883,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
 	PUT_SINFO(LOCAL_PM, local_pm, u32);
 	PUT_SINFO(PEER_PM, peer_pm, u32);
 	PUT_SINFO(NONPEER_PM, nonpeer_pm, u32);
+	PUT_SINFO(CONNECTED_TO_GATE, connected_to_gate, u8);
 
 	if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_BSS_PARAM)) {
 		bss_param = nla_nest_start(msg, NL80211_STA_INFO_BSS_PARAM);
-- 
cgit v1.2.3-59-g8ed1b


From 01d66fbd5b18ac9f01a6a2ae1278189d19208ad5 Mon Sep 17 00:00:00 2001
From: Bob Copeland <me@bobcopeland.com>
Date: Thu, 25 Oct 2018 17:36:34 -0400
Subject: {nl,mac}80211: add dot11MeshConnectedToMeshGate to meshconf

When userspace is controlling mesh routing, it may have better
knowledge about whether a mesh STA is connected to a mesh
gate than the kernel mpath table.  Add dot11MeshConnectedToMeshGate
to the mesh config so that such applications can explicitly
signal that a mesh STA is connected to a gate, which will then
be advertised in the beacon.

Signed-off-by: Bob Copeland <bobcopeland@fb.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h        | 5 +++++
 include/uapi/linux/nl80211.h  | 8 +++++++-
 net/mac80211/cfg.c            | 3 +++
 net/mac80211/debugfs_netdev.c | 3 +++
 net/mac80211/mesh.c           | 3 ++-
 net/wireless/nl80211.c        | 8 +++++++-
 6 files changed, 27 insertions(+), 3 deletions(-)

(limited to 'include/uapi')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 24d2db8e082d..16d595b93ba3 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1562,6 +1562,10 @@ struct bss_parameters {
  * @plink_timeout: If no tx activity is seen from a STA we've established
  *	peering with for longer than this time (in seconds), then remove it
  *	from the STA's list of peers.  Default is 30 minutes.
+ * @dot11MeshConnectedToMeshGate: if set to true, advertise that this STA is
+ *      connected to a mesh gate in mesh formation info.  If false, the
+ *      value in mesh formation is determined by the presence of root paths
+ *      in the mesh path table
  */
 struct mesh_config {
 	u16 dot11MeshRetryTimeout;
@@ -1581,6 +1585,7 @@ struct mesh_config {
 	u16 dot11MeshHWMPperrMinInterval;
 	u16 dot11MeshHWMPnetDiameterTraversalTime;
 	u8 dot11MeshHWMPRootMode;
+	bool dot11MeshConnectedToMeshGate;
 	u16 dot11MeshHWMPRannInterval;
 	bool dot11MeshGateAnnouncementProtocol;
 	bool dot11MeshForwarding;
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index ff6005edf32f..51bd85b7d839 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -3117,7 +3117,7 @@ enum nl80211_sta_bss_param {
  *	some packets with an FCS error due to TA corruption. Hence this counter
  *	might not be fully accurate.
  * @NL80211_STA_INFO_CONNECTED_TO_GATE: set to true if STA has a path to a
- *	mesh gate
+ *	mesh gate (u8, 0 or 1)
  * @__NL80211_STA_INFO_AFTER_LAST: internal
  * @NL80211_STA_INFO_MAX: highest possible station info attribute
  */
@@ -3940,6 +3940,11 @@ enum nl80211_mesh_power_mode {
  *	remove it from the STA's list of peers. You may set this to 0 to disable
  *	the removal of the STA. Default is 30 minutes.
  *
+ * @NL80211_MESHCONF_CONNECTED_TO_GATE: If set to true then this mesh STA
+ *	will advertise that it is connected to a gate in the mesh formation
+ *	field.  If left unset then the mesh formation field will only
+ *	advertise such if there is an active root mesh path.
+ *
  * @__NL80211_MESHCONF_ATTR_AFTER_LAST: internal use
  */
 enum nl80211_meshconf_params {
@@ -3972,6 +3977,7 @@ enum nl80211_meshconf_params {
 	NL80211_MESHCONF_POWER_MODE,
 	NL80211_MESHCONF_AWAKE_WINDOW,
 	NL80211_MESHCONF_PLINK_TIMEOUT,
+	NL80211_MESHCONF_CONNECTED_TO_GATE,
 
 	/* keep last */
 	__NL80211_MESHCONF_ATTR_AFTER_LAST,
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 2fccccfbbf4d..cf8f946ae724 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -2028,6 +2028,9 @@ static int ieee80211_update_mesh_config(struct wiphy *wiphy,
 			nconf->dot11MeshAwakeWindowDuration;
 	if (_chg_mesh_attr(NL80211_MESHCONF_PLINK_TIMEOUT, mask))
 		conf->plink_timeout = nconf->plink_timeout;
+	if (_chg_mesh_attr(NL80211_MESHCONF_CONNECTED_TO_GATE, mask))
+		conf->dot11MeshConnectedToMeshGate =
+			nconf->dot11MeshConnectedToMeshGate;
 	ieee80211_mbss_info_change_notify(sdata, BSS_CHANGED_BEACON);
 	return 0;
 }
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index c813207bb123..cff0fb3578c9 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -641,6 +641,8 @@ IEEE80211_IF_FILE(dot11MeshHWMPconfirmationInterval,
 IEEE80211_IF_FILE(power_mode, u.mesh.mshcfg.power_mode, DEC);
 IEEE80211_IF_FILE(dot11MeshAwakeWindowDuration,
 		  u.mesh.mshcfg.dot11MeshAwakeWindowDuration, DEC);
+IEEE80211_IF_FILE(dot11MeshConnectedToMeshGate,
+		  u.mesh.mshcfg.dot11MeshConnectedToMeshGate, DEC);
 #endif
 
 #define DEBUGFS_ADD_MODE(name, mode) \
@@ -762,6 +764,7 @@ static void add_mesh_config(struct ieee80211_sub_if_data *sdata)
 	MESHPARAMS_ADD(dot11MeshHWMPconfirmationInterval);
 	MESHPARAMS_ADD(power_mode);
 	MESHPARAMS_ADD(dot11MeshAwakeWindowDuration);
+	MESHPARAMS_ADD(dot11MeshConnectedToMeshGate);
 #undef MESHPARAMS_ADD
 }
 #endif
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 19205c821dee..4869280a6413 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -255,7 +255,8 @@ int mesh_add_meshconf_ie(struct ieee80211_sub_if_data *sdata,
 	u8 *pos, neighbors;
 	u8 meshconf_len = sizeof(struct ieee80211_meshconf_ie);
 	bool is_connected_to_gate = ifmsh->num_gates > 0 ||
-		ifmsh->mshcfg.dot11MeshGateAnnouncementProtocol;
+		ifmsh->mshcfg.dot11MeshGateAnnouncementProtocol ||
+		ifmsh->mshcfg.dot11MeshConnectedToMeshGate;
 
 	if (skb_tailroom(skb) < 2 + meshconf_len)
 		return -ENOMEM;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index f231059242cc..d5f0ffd076b2 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -6275,7 +6275,9 @@ static int nl80211_get_mesh_config(struct sk_buff *skb,
 	    nla_put_u16(msg, NL80211_MESHCONF_AWAKE_WINDOW,
 			cur_params.dot11MeshAwakeWindowDuration) ||
 	    nla_put_u32(msg, NL80211_MESHCONF_PLINK_TIMEOUT,
-			cur_params.plink_timeout))
+			cur_params.plink_timeout) ||
+	    nla_put_u8(msg, NL80211_MESHCONF_CONNECTED_TO_GATE,
+		       cur_params.dot11MeshConnectedToMeshGate))
 		goto nla_put_failure;
 	nla_nest_end(msg, pinfoattr);
 	genlmsg_end(msg, hdr);
@@ -6332,6 +6334,7 @@ nl80211_meshconf_params_policy[NL80211_MESHCONF_ATTR_MAX+1] = {
 				 NL80211_MESH_POWER_MAX),
 	[NL80211_MESHCONF_AWAKE_WINDOW] = { .type = NLA_U16 },
 	[NL80211_MESHCONF_PLINK_TIMEOUT] = { .type = NLA_U32 },
+	[NL80211_MESHCONF_CONNECTED_TO_GATE] = NLA_POLICY_RANGE(NLA_U8, 0, 1),
 };
 
 static const struct nla_policy
@@ -6443,6 +6446,9 @@ do {									\
 	FILL_IN_MESH_PARAM_IF_SET(tb, cfg, rssi_threshold, mask,
 				  NL80211_MESHCONF_RSSI_THRESHOLD,
 				  nla_get_s32);
+	FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshConnectedToMeshGate, mask,
+				  NL80211_MESHCONF_CONNECTED_TO_GATE,
+				  nla_get_u8);
 	/*
 	 * Check HT operation mode based on
 	 * IEEE 802.11-2016 9.4.2.57 HT Operation element.
-- 
cgit v1.2.3-59-g8ed1b


From 361800876f80da3915c46e388fc682532228b2c3 Mon Sep 17 00:00:00 2001
From: Miroslav Lichvar <mlichvar@redhat.com>
Date: Fri, 9 Nov 2018 11:14:44 +0100
Subject: ptp: add PTP_SYS_OFFSET_EXTENDED ioctl

The PTP_SYS_OFFSET ioctl, which can be used to measure the offset
between a PHC and the system clock, includes the total time that the
driver needs to read the PHC timestamp.

This typically involves reading of multiple PCI registers (sometimes in
multiple iterations) and the register that contains the lowest bits of
the timestamp is not read in the middle between the two readings of the
system clock. This asymmetry causes the measured offset to have a
significant error.

Introduce a new ioctl, driver function, and helper functions, which
allow the reading of the lowest register to be isolated from the other
readings in order to reduce the asymmetry. The ioctl returns three
timestamps for each measurement:
- system time right before reading the lowest bits of the PHC timestamp
- PHC time
- system time immediately after reading the lowest bits of the PHC
  timestamp

Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jacob Keller <jacob.e.keller@intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ptp/ptp_chardev.c        | 33 +++++++++++++++++++++++++++++++++
 include/linux/ptp_clock_kernel.h | 31 +++++++++++++++++++++++++++++++
 include/uapi/linux/ptp_clock.h   | 12 ++++++++++++
 3 files changed, 76 insertions(+)

(limited to 'include/uapi')

diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c
index 3c681bed5703..aad0d36cf5c0 100644
--- a/drivers/ptp/ptp_chardev.c
+++ b/drivers/ptp/ptp_chardev.c
@@ -122,10 +122,12 @@ int ptp_open(struct posix_clock *pc, fmode_t fmode)
 long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
 {
 	struct ptp_clock *ptp = container_of(pc, struct ptp_clock, clock);
+	struct ptp_sys_offset_extended *extoff = NULL;
 	struct ptp_sys_offset_precise precise_offset;
 	struct system_device_crosststamp xtstamp;
 	struct ptp_clock_info *ops = ptp->info;
 	struct ptp_sys_offset *sysoff = NULL;
+	struct ptp_system_timestamp sts;
 	struct ptp_clock_request req;
 	struct ptp_clock_caps caps;
 	struct ptp_clock_time *pct;
@@ -211,6 +213,36 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
 			err = -EFAULT;
 		break;
 
+	case PTP_SYS_OFFSET_EXTENDED:
+		if (!ptp->info->gettimex64) {
+			err = -EOPNOTSUPP;
+			break;
+		}
+		extoff = memdup_user((void __user *)arg, sizeof(*extoff));
+		if (IS_ERR(extoff)) {
+			err = PTR_ERR(extoff);
+			extoff = NULL;
+			break;
+		}
+		if (extoff->n_samples > PTP_MAX_SAMPLES) {
+			err = -EINVAL;
+			break;
+		}
+		for (i = 0; i < extoff->n_samples; i++) {
+			err = ptp->info->gettimex64(ptp->info, &ts, &sts);
+			if (err)
+				goto out;
+			extoff->ts[i][0].sec = sts.pre_ts.tv_sec;
+			extoff->ts[i][0].nsec = sts.pre_ts.tv_nsec;
+			extoff->ts[i][1].sec = ts.tv_sec;
+			extoff->ts[i][1].nsec = ts.tv_nsec;
+			extoff->ts[i][2].sec = sts.post_ts.tv_sec;
+			extoff->ts[i][2].nsec = sts.post_ts.tv_nsec;
+		}
+		if (copy_to_user((void __user *)arg, extoff, sizeof(*extoff)))
+			err = -EFAULT;
+		break;
+
 	case PTP_SYS_OFFSET:
 		sysoff = memdup_user((void __user *)arg, sizeof(*sysoff));
 		if (IS_ERR(sysoff)) {
@@ -284,6 +316,7 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
 	}
 
 out:
+	kfree(extoff);
 	kfree(sysoff);
 	return err;
 }
diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h
index 51349d124ee5..a1ec0448e341 100644
--- a/include/linux/ptp_clock_kernel.h
+++ b/include/linux/ptp_clock_kernel.h
@@ -39,6 +39,15 @@ struct ptp_clock_request {
 };
 
 struct system_device_crosststamp;
+
+/**
+ * struct ptp_system_timestamp - system time corresponding to a PHC timestamp
+ */
+struct ptp_system_timestamp {
+	struct timespec64 pre_ts;
+	struct timespec64 post_ts;
+};
+
 /**
  * struct ptp_clock_info - decribes a PTP hardware clock
  *
@@ -75,6 +84,14 @@ struct system_device_crosststamp;
  * @gettime64:  Reads the current time from the hardware clock.
  *              parameter ts: Holds the result.
  *
+ * @gettimex64:  Reads the current time from the hardware clock and optionally
+ *               also the system clock.
+ *               parameter ts: Holds the PHC timestamp.
+ *               parameter sts: If not NULL, it holds a pair of timestamps from
+ *               the system clock. The first reading is made right before
+ *               reading the lowest bits of the PHC timestamp and the second
+ *               reading immediately follows that.
+ *
  * @getcrosststamp:  Reads the current time from the hardware clock and
  *                   system clock simultaneously.
  *                   parameter cts: Contains timestamp (device,system) pair,
@@ -124,6 +141,8 @@ struct ptp_clock_info {
 	int (*adjfreq)(struct ptp_clock_info *ptp, s32 delta);
 	int (*adjtime)(struct ptp_clock_info *ptp, s64 delta);
 	int (*gettime64)(struct ptp_clock_info *ptp, struct timespec64 *ts);
+	int (*gettimex64)(struct ptp_clock_info *ptp, struct timespec64 *ts,
+			  struct ptp_system_timestamp *sts);
 	int (*getcrosststamp)(struct ptp_clock_info *ptp,
 			      struct system_device_crosststamp *cts);
 	int (*settime64)(struct ptp_clock_info *p, const struct timespec64 *ts);
@@ -247,4 +266,16 @@ static inline int ptp_schedule_worker(struct ptp_clock *ptp,
 
 #endif
 
+static inline void ptp_read_system_prets(struct ptp_system_timestamp *sts)
+{
+	if (sts)
+		ktime_get_real_ts64(&sts->pre_ts);
+}
+
+static inline void ptp_read_system_postts(struct ptp_system_timestamp *sts)
+{
+	if (sts)
+		ktime_get_real_ts64(&sts->post_ts);
+}
+
 #endif
diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h
index 3039bf6a742e..d73d83950265 100644
--- a/include/uapi/linux/ptp_clock.h
+++ b/include/uapi/linux/ptp_clock.h
@@ -84,6 +84,16 @@ struct ptp_sys_offset {
 	struct ptp_clock_time ts[2 * PTP_MAX_SAMPLES + 1];
 };
 
+struct ptp_sys_offset_extended {
+	unsigned int n_samples; /* Desired number of measurements. */
+	unsigned int rsv[3];    /* Reserved for future use. */
+	/*
+	 * Array of [system, phc, system] time stamps. The kernel will provide
+	 * 3*n_samples time stamps.
+	 */
+	struct ptp_clock_time ts[PTP_MAX_SAMPLES][3];
+};
+
 struct ptp_sys_offset_precise {
 	struct ptp_clock_time device;
 	struct ptp_clock_time sys_realtime;
@@ -136,6 +146,8 @@ struct ptp_pin_desc {
 #define PTP_PIN_SETFUNC    _IOW(PTP_CLK_MAGIC, 7, struct ptp_pin_desc)
 #define PTP_SYS_OFFSET_PRECISE \
 	_IOWR(PTP_CLK_MAGIC, 8, struct ptp_sys_offset_precise)
+#define PTP_SYS_OFFSET_EXTENDED \
+	_IOW(PTP_CLK_MAGIC, 9, struct ptp_sys_offset_extended)
 
 struct ptp_extts_event {
 	struct ptp_clock_time t; /* Time event occured. */
-- 
cgit v1.2.3-59-g8ed1b


From 48872c11b77271ef9b070bdc50afe6655c4eb9aa Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 11 Nov 2018 09:11:31 -0800
Subject: net_sched: sch_fq: add dctcp-like marking

Similar to 80ba92fa1a92 ("codel: add ce_threshold attribute")

After EDT adoption, it became easier to implement DCTCP-like CE marking.

In many cases, queues are not building in the network fabric but on
the hosts themselves.

If packets leaving fq missed their Earliest Departure Time by XXX usec,
we mark them with ECN CE. This gives a feedback (after one RTT) to
the sender to slow down and find better operating mode.

Example :

tc qd replace dev eth0 root fq ce_threshold 2.5ms

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h |  3 +++
 net/sched/sch_fq.c             | 21 +++++++++++++++++++++
 2 files changed, 24 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 89ee47c2f17d..ee017bc057a3 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -864,6 +864,8 @@ enum {
 
 	TCA_FQ_LOW_RATE_THRESHOLD, /* per packet delay under this rate */
 
+	TCA_FQ_CE_THRESHOLD,	/* DCTCP-like CE-marking threshold */
+
 	__TCA_FQ_MAX
 };
 
@@ -882,6 +884,7 @@ struct tc_fq_qd_stats {
 	__u32	inactive_flows;
 	__u32	throttled_flows;
 	__u32	unthrottle_latency_ns;
+	__u64	ce_mark;		/* packets above ce_threshold */
 };
 
 /* Heavy-Hitter Filter */
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 4b1af706896c..3671eab91107 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -94,6 +94,7 @@ struct fq_sched_data {
 	u32		flow_refill_delay;
 	u32		flow_plimit;	/* max packets per flow */
 	unsigned long	flow_max_rate;	/* optional max rate per flow */
+	u64		ce_threshold;
 	u32		orphan_mask;	/* mask for orphaned skb */
 	u32		low_rate_threshold;
 	struct rb_root	*fq_root;
@@ -107,6 +108,7 @@ struct fq_sched_data {
 	u64		stat_gc_flows;
 	u64		stat_internal_packets;
 	u64		stat_throttled;
+	u64		stat_ce_mark;
 	u64		stat_flows_plimit;
 	u64		stat_pkts_too_long;
 	u64		stat_allocation_errors;
@@ -454,6 +456,11 @@ begin:
 			fq_flow_set_throttled(q, f);
 			goto begin;
 		}
+		if (time_next_packet &&
+		    (s64)(now - time_next_packet - q->ce_threshold) > 0) {
+			INET_ECN_set_ce(skb);
+			q->stat_ce_mark++;
+		}
 	}
 
 	skb = fq_dequeue_head(sch, f);
@@ -650,6 +657,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
 	[TCA_FQ_BUCKETS_LOG]		= { .type = NLA_U32 },
 	[TCA_FQ_FLOW_REFILL_DELAY]	= { .type = NLA_U32 },
 	[TCA_FQ_LOW_RATE_THRESHOLD]	= { .type = NLA_U32 },
+	[TCA_FQ_CE_THRESHOLD]		= { .type = NLA_U32 },
 };
 
 static int fq_change(struct Qdisc *sch, struct nlattr *opt,
@@ -729,6 +737,10 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt,
 	if (tb[TCA_FQ_ORPHAN_MASK])
 		q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]);
 
+	if (tb[TCA_FQ_CE_THRESHOLD])
+		q->ce_threshold = (u64)NSEC_PER_USEC *
+				  nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]);
+
 	if (!err) {
 		sch_tree_unlock(sch);
 		err = fq_resize(sch, fq_log);
@@ -779,6 +791,10 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
 	q->fq_trees_log		= ilog2(1024);
 	q->orphan_mask		= 1024 - 1;
 	q->low_rate_threshold	= 550000 / 8;
+
+	/* Default ce_threshold of 4294 seconds */
+	q->ce_threshold		= (u64)NSEC_PER_USEC * ~0U;
+
 	qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC);
 
 	if (opt)
@@ -792,6 +808,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
 static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
 {
 	struct fq_sched_data *q = qdisc_priv(sch);
+	u64 ce_threshold = q->ce_threshold;
 	struct nlattr *opts;
 
 	opts = nla_nest_start(skb, TCA_OPTIONS);
@@ -800,6 +817,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
 
 	/* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */
 
+	do_div(ce_threshold, NSEC_PER_USEC);
+
 	if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
 	    nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
 	    nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
@@ -812,6 +831,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
 	    nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
 	    nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
 			q->low_rate_threshold) ||
+	    nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) ||
 	    nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
 		goto nla_put_failure;
 
@@ -841,6 +861,7 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 	st.throttled_flows	  = q->throttled_flows;
 	st.unthrottle_latency_ns  = min_t(unsigned long,
 					  q->unthrottle_latency_ns, ~0U);
+	st.ce_mark		  = q->stat_ce_mark;
 	sch_tree_unlock(sch);
 
 	return gnet_stats_copy_app(d, &st, sizeof(st));
-- 
cgit v1.2.3-59-g8ed1b


From 9b076f1c0f4869b838a1b7aa0edb5664d47ec8aa Mon Sep 17 00:00:00 2001
From: Matthew Bobrowski <mbobrowski@mbobrowski.org>
Date: Thu, 8 Nov 2018 14:07:14 +1100
Subject: fanotify: introduce new event mask FAN_OPEN_EXEC

A new event mask FAN_OPEN_EXEC has been defined so that users have the
ability to receive events specifically when a file has been opened with
the intent to be executed. Events of FAN_OPEN_EXEC type will be
generated when a file has been opened using either execve(), execveat()
or uselib() system calls.

The feature is implemented within fsnotify_open() by generating the
FAN_OPEN_EXEC event type if __FMODE_EXEC is set within file->f_flags.

Signed-off-by: Matthew Bobrowski <mbobrowski@mbobrowski.org>
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/notify/fanotify/fanotify.c    | 3 ++-
 fs/notify/fsnotify.c             | 2 +-
 include/linux/fanotify.h         | 2 +-
 include/linux/fsnotify.h         | 2 ++
 include/linux/fsnotify_backend.h | 7 +++++--
 include/uapi/linux/fanotify.h    | 1 +
 6 files changed, 12 insertions(+), 5 deletions(-)

(limited to 'include/uapi')

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index f4f8359bc597..5a1a15f646ba 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -210,8 +210,9 @@ static int fanotify_handle_event(struct fsnotify_group *group,
 	BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
 	BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
 	BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
+	BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC);
 
-	BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 10);
+	BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 11);
 
 	mask = fanotify_group_event_mask(iter_info, mask, data, data_type);
 	if (!mask)
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index d2c34900ae05..b3f58f36a0ab 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -401,7 +401,7 @@ static __init int fsnotify_init(void)
 {
 	int ret;
 
-	BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 23);
+	BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 24);
 
 	ret = init_srcu_struct(&fsnotify_mark_srcu);
 	if (ret)
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index a5a60691e48b..c521e4264f2b 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -37,7 +37,7 @@
 
 /* Events that user can request to be notified on */
 #define FANOTIFY_EVENTS		(FAN_ACCESS | FAN_MODIFY | \
-				 FAN_CLOSE | FAN_OPEN)
+				 FAN_CLOSE | FAN_OPEN | FAN_OPEN_EXEC)
 
 /* Events that require a permission response from user */
 #define FANOTIFY_PERM_EVENTS	(FAN_OPEN_PERM | FAN_ACCESS_PERM)
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index fd1ce10553bf..1fe5ac93b252 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -215,6 +215,8 @@ static inline void fsnotify_open(struct file *file)
 
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_ISDIR;
+	if (file->f_flags & __FMODE_EXEC)
+		mask |= FS_OPEN_EXEC;
 
 	fsnotify_parent(path, NULL, mask);
 	fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 135b973e44d1..39d94e62a836 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -38,6 +38,7 @@
 #define FS_DELETE		0x00000200	/* Subfile was deleted */
 #define FS_DELETE_SELF		0x00000400	/* Self was deleted */
 #define FS_MOVE_SELF		0x00000800	/* Self was moved */
+#define FS_OPEN_EXEC		0x00001000	/* File was opened for exec */
 
 #define FS_UNMOUNT		0x00002000	/* inode on umount fs */
 #define FS_Q_OVERFLOW		0x00004000	/* Event queued overflowed */
@@ -62,7 +63,8 @@
 #define FS_EVENTS_POSS_ON_CHILD   (FS_ACCESS | FS_MODIFY | FS_ATTRIB |\
 				   FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | FS_OPEN |\
 				   FS_MOVED_FROM | FS_MOVED_TO | FS_CREATE |\
-				   FS_DELETE | FS_OPEN_PERM | FS_ACCESS_PERM)
+				   FS_DELETE | FS_OPEN_PERM | FS_ACCESS_PERM | \
+				   FS_OPEN_EXEC)
 
 #define FS_MOVE			(FS_MOVED_FROM | FS_MOVED_TO)
 
@@ -74,7 +76,8 @@
 			     FS_MOVED_FROM | FS_MOVED_TO | FS_CREATE | \
 			     FS_DELETE | FS_DELETE_SELF | FS_MOVE_SELF | \
 			     FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED | \
-			     FS_OPEN_PERM | FS_ACCESS_PERM | FS_DN_RENAME)
+			     FS_OPEN_PERM | FS_ACCESS_PERM | FS_DN_RENAME | \
+			     FS_OPEN_EXEC)
 
 /* Extra flags that may be reported with event or control handling of events */
 #define ALL_FSNOTIFY_FLAGS  (FS_EXCL_UNLINK | FS_ISDIR | FS_IN_ONESHOT | \
diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
index b86740d1c50a..d9664fbc905b 100644
--- a/include/uapi/linux/fanotify.h
+++ b/include/uapi/linux/fanotify.h
@@ -10,6 +10,7 @@
 #define FAN_CLOSE_WRITE		0x00000008	/* Writtable file closed */
 #define FAN_CLOSE_NOWRITE	0x00000010	/* Unwrittable file closed */
 #define FAN_OPEN		0x00000020	/* File was opened */
+#define FAN_OPEN_EXEC		0x00001000	/* File was opened for exec */
 
 #define FAN_Q_OVERFLOW		0x00004000	/* Event queued overflowed */
 
-- 
cgit v1.2.3-59-g8ed1b


From 66917a3130f218dcef9eeab4fd11a71cd00cd7c9 Mon Sep 17 00:00:00 2001
From: Matthew Bobrowski <mbobrowski@mbobrowski.org>
Date: Thu, 8 Nov 2018 14:12:44 +1100
Subject: fanotify: introduce new event mask FAN_OPEN_EXEC_PERM

A new event mask FAN_OPEN_EXEC_PERM has been defined. This allows users
to receive events and grant access to files that are intending to be
opened for execution. Events of FAN_OPEN_EXEC_PERM type will be
generated when a file has been opened by using either execve(),
execveat() or uselib() system calls.

This acts in the same manner as previous permission event mask, meaning
that an access response is required from the user application in order
to permit any further operations on the file.

Signed-off-by: Matthew Bobrowski <mbobrowski@mbobrowski.org>
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/notify/fanotify/fanotify.c    |  3 ++-
 fs/notify/fsnotify.c             |  2 +-
 include/linux/fanotify.h         |  3 ++-
 include/linux/fsnotify.h         | 17 ++++++++++++-----
 include/linux/fsnotify_backend.h |  8 +++++---
 include/uapi/linux/fanotify.h    |  1 +
 6 files changed, 23 insertions(+), 11 deletions(-)

(limited to 'include/uapi')

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 5a1a15f646ba..3723f3d18d20 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -211,8 +211,9 @@ static int fanotify_handle_event(struct fsnotify_group *group,
 	BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
 	BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
 	BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC);
+	BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM);
 
-	BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 11);
+	BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 12);
 
 	mask = fanotify_group_event_mask(iter_info, mask, data, data_type);
 	if (!mask)
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index b3f58f36a0ab..ecf09b6243d9 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -401,7 +401,7 @@ static __init int fsnotify_init(void)
 {
 	int ret;
 
-	BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 24);
+	BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 25);
 
 	ret = init_srcu_struct(&fsnotify_mark_srcu);
 	if (ret)
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index c521e4264f2b..9e2142795335 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -40,7 +40,8 @@
 				 FAN_CLOSE | FAN_OPEN | FAN_OPEN_EXEC)
 
 /* Events that require a permission response from user */
-#define FANOTIFY_PERM_EVENTS	(FAN_OPEN_PERM | FAN_ACCESS_PERM)
+#define FANOTIFY_PERM_EVENTS	(FAN_OPEN_PERM | FAN_ACCESS_PERM | \
+				 FAN_OPEN_EXEC_PERM)
 
 /* Extra flags that may be reported with event or control handling of events */
 #define FANOTIFY_EVENT_FLAGS	(FAN_EVENT_ON_CHILD | FAN_ONDIR)
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index c29f2f072c2c..2ccb08cb5d6a 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -40,9 +40,10 @@ static inline int fsnotify_path(struct inode *inode, const struct path *path,
 	return fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
 }
 
-/* simple call site for access decisions */
+/* Simple call site for access decisions */
 static inline int fsnotify_perm(struct file *file, int mask)
 {
+	int ret;
 	const struct path *path = &file->f_path;
 	struct inode *inode = file_inode(file);
 	__u32 fsnotify_mask = 0;
@@ -51,12 +52,18 @@ static inline int fsnotify_perm(struct file *file, int mask)
 		return 0;
 	if (!(mask & (MAY_READ | MAY_OPEN)))
 		return 0;
-	if (mask & MAY_OPEN)
+	if (mask & MAY_OPEN) {
 		fsnotify_mask = FS_OPEN_PERM;
-	else if (mask & MAY_READ)
+
+		if (file->f_flags & __FMODE_EXEC) {
+			ret = fsnotify_path(inode, path, FS_OPEN_EXEC_PERM);
+
+			if (ret)
+				return ret;
+		}
+	} else if (mask & MAY_READ) {
 		fsnotify_mask = FS_ACCESS_PERM;
-	else
-		BUG();
+	}
 
 	return fsnotify_path(inode, path, fsnotify_mask);
 }
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 39d94e62a836..7639774e7475 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -46,6 +46,7 @@
 
 #define FS_OPEN_PERM		0x00010000	/* open event in an permission hook */
 #define FS_ACCESS_PERM		0x00020000	/* access event in a permissions hook */
+#define FS_OPEN_EXEC_PERM	0x00040000	/* open/exec event in a permission hook */
 
 #define FS_EXCL_UNLINK		0x04000000	/* do not send events if object is unlinked */
 #define FS_ISDIR		0x40000000	/* event occurred against dir */
@@ -64,11 +65,12 @@
 				   FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | FS_OPEN |\
 				   FS_MOVED_FROM | FS_MOVED_TO | FS_CREATE |\
 				   FS_DELETE | FS_OPEN_PERM | FS_ACCESS_PERM | \
-				   FS_OPEN_EXEC)
+				   FS_OPEN_EXEC | FS_OPEN_EXEC_PERM)
 
 #define FS_MOVE			(FS_MOVED_FROM | FS_MOVED_TO)
 
-#define ALL_FSNOTIFY_PERM_EVENTS (FS_OPEN_PERM | FS_ACCESS_PERM)
+#define ALL_FSNOTIFY_PERM_EVENTS (FS_OPEN_PERM | FS_ACCESS_PERM | \
+				  FS_OPEN_EXEC_PERM)
 
 /* Events that can be reported to backends */
 #define ALL_FSNOTIFY_EVENTS (FS_ACCESS | FS_MODIFY | FS_ATTRIB | \
@@ -77,7 +79,7 @@
 			     FS_DELETE | FS_DELETE_SELF | FS_MOVE_SELF | \
 			     FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED | \
 			     FS_OPEN_PERM | FS_ACCESS_PERM | FS_DN_RENAME | \
-			     FS_OPEN_EXEC)
+			     FS_OPEN_EXEC | FS_OPEN_EXEC_PERM)
 
 /* Extra flags that may be reported with event or control handling of events */
 #define ALL_FSNOTIFY_FLAGS  (FS_EXCL_UNLINK | FS_ISDIR | FS_IN_ONESHOT | \
diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
index d9664fbc905b..909c98fcace2 100644
--- a/include/uapi/linux/fanotify.h
+++ b/include/uapi/linux/fanotify.h
@@ -16,6 +16,7 @@
 
 #define FAN_OPEN_PERM		0x00010000	/* File open in perm check */
 #define FAN_ACCESS_PERM		0x00020000	/* File accessed in perm check */
+#define FAN_OPEN_EXEC_PERM	0x00040000	/* File open/exec in perm check */
 
 #define FAN_ONDIR		0x40000000	/* event occurred against dir */
 
-- 
cgit v1.2.3-59-g8ed1b


From a56f9c868ccf56f0ab6e3e64693e6a39323bf8d8 Mon Sep 17 00:00:00 2001
From: Robert Foss <robert.foss@collabora.com>
Date: Mon, 12 Nov 2018 17:51:55 +0100
Subject: drm/virtio: add uapi for in and out explicit fences

Add a new field called fence_fd that will be used by userspace to send
in-fences to the kernel and receive out-fences created by the kernel.

This uapi enables virtio to take advantage of explicit synchronization of
dma-bufs.

There are two new flags:

* VIRTGPU_EXECBUF_FENCE_FD_IN to be used when passing an in-fence fd.
* VIRTGPU_EXECBUF_FENCE_FD_OUT to be used when requesting an out-fence fd

The execbuffer IOCTL is now read-write to allow the userspace to read the
out-fence.

On error -1 should be returned in the fence_fd field.

Signed-off-by: Gustavo Padovan <gustavo.padovan@collabora.com>
Signed-off-by: Robert Foss <robert.foss@collabora.com>
Reviewed-by: Emil Velikov <emil.velikov@collabora.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20181112165157.32765-3-robert.foss@collabora.com
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 drivers/gpu/drm/virtio/virtgpu_ioctl.c |  5 +++++
 include/uapi/drm/virtgpu_drm.h         | 13 ++++++++++---
 2 files changed, 15 insertions(+), 3 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/gpu/drm/virtio/virtgpu_ioctl.c b/drivers/gpu/drm/virtio/virtgpu_ioctl.c
index d69fc356df0a..3d497835b0f5 100644
--- a/drivers/gpu/drm/virtio/virtgpu_ioctl.c
+++ b/drivers/gpu/drm/virtio/virtgpu_ioctl.c
@@ -119,6 +119,11 @@ static int virtio_gpu_execbuffer_ioctl(struct drm_device *dev, void *data,
 	if (vgdev->has_virgl_3d == false)
 		return -ENOSYS;
 
+	if ((exbuf->flags & ~VIRTGPU_EXECBUF_FLAGS))
+		return -EINVAL;
+
+	exbuf->fence_fd = -1;
+
 	INIT_LIST_HEAD(&validate_list);
 	if (exbuf->num_bo_handles) {
 
diff --git a/include/uapi/drm/virtgpu_drm.h b/include/uapi/drm/virtgpu_drm.h
index 9a781f0611df..91062f4ac7c5 100644
--- a/include/uapi/drm/virtgpu_drm.h
+++ b/include/uapi/drm/virtgpu_drm.h
@@ -47,6 +47,13 @@ extern "C" {
 #define DRM_VIRTGPU_WAIT     0x08
 #define DRM_VIRTGPU_GET_CAPS  0x09
 
+#define VIRTGPU_EXECBUF_FENCE_FD_IN	0x01
+#define VIRTGPU_EXECBUF_FENCE_FD_OUT	0x02
+#define VIRTGPU_EXECBUF_FLAGS  (\
+		VIRTGPU_EXECBUF_FENCE_FD_IN |\
+		VIRTGPU_EXECBUF_FENCE_FD_OUT |\
+		0)
+
 struct drm_virtgpu_map {
 	__u64 offset; /* use for mmap system call */
 	__u32 handle;
@@ -54,12 +61,12 @@ struct drm_virtgpu_map {
 };
 
 struct drm_virtgpu_execbuffer {
-	__u32		flags;		/* for future use */
+	__u32 flags;
 	__u32 size;
 	__u64 command; /* void* */
 	__u64 bo_handles;
 	__u32 num_bo_handles;
-	__u32 pad;
+	__s32 fence_fd;
 };
 
 #define VIRTGPU_PARAM_3D_FEATURES 1 /* do we have 3D features in the hw */
@@ -137,7 +144,7 @@ struct drm_virtgpu_get_caps {
 	DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_MAP, struct drm_virtgpu_map)
 
 #define DRM_IOCTL_VIRTGPU_EXECBUFFER \
-	DRM_IOW(DRM_COMMAND_BASE + DRM_VIRTGPU_EXECBUFFER,\
+	DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_EXECBUFFER,\
 		struct drm_virtgpu_execbuffer)
 
 #define DRM_IOCTL_VIRTGPU_GETPARAM \
-- 
cgit v1.2.3-59-g8ed1b


From 2cd7b6f08bc4cf06d1f00f8b42720a8457861ca1 Mon Sep 17 00:00:00 2001
From: Robert Foss <robert.foss@collabora.com>
Date: Mon, 12 Nov 2018 17:51:56 +0100
Subject: drm/virtio: add in/out fence support for explicit synchronization

When the execbuf call receives an in-fence it will get the dma_fence
related to that fence fd and wait on it before submitting the draw call.

On the out-fence side we get fence returned by the submitted draw call
and attach it to a sync_file and send the sync_file fd to userspace. On
error -1 is returned to userspace.

VIRTGPU_EXECBUF_FENCE_FD_IN & VIRTGPU_EXECBUF_FENCE_FD_OUT
are supported at the simultaneously and can be flagged
for simultaneously.

Signed-off-by: Gustavo Padovan <gustavo.padovan@collabora.com>
Signed-off-by: Robert Foss <robert.foss@collabora.com>
Reviewed-by: Emil Velikov <emil.velikov@collabora.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20181112165157.32765-4-robert.foss@collabora.com
Suggested-by: Rob Herring <robh@kernel.org>
Reviewed-by: Emil Velikov <emil.velikov@collabora.com>
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 drivers/gpu/drm/virtio/virtgpu_ioctl.c | 81 +++++++++++++++++++++++++++-------
 include/uapi/drm/virtgpu_drm.h         |  2 +-
 2 files changed, 65 insertions(+), 18 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/gpu/drm/virtio/virtgpu_ioctl.c b/drivers/gpu/drm/virtio/virtgpu_ioctl.c
index 3d497835b0f5..340f2513d829 100644
--- a/drivers/gpu/drm/virtio/virtgpu_ioctl.c
+++ b/drivers/gpu/drm/virtio/virtgpu_ioctl.c
@@ -28,6 +28,7 @@
 #include <drm/drmP.h>
 #include <drm/virtgpu_drm.h>
 #include <drm/ttm/ttm_execbuf_util.h>
+#include <linux/sync_file.h>
 
 #include "virtgpu_drv.h"
 
@@ -105,7 +106,7 @@ static int virtio_gpu_execbuffer_ioctl(struct drm_device *dev, void *data,
 	struct virtio_gpu_device *vgdev = dev->dev_private;
 	struct virtio_gpu_fpriv *vfpriv = drm_file->driver_priv;
 	struct drm_gem_object *gobj;
-	struct virtio_gpu_fence *fence;
+	struct virtio_gpu_fence *out_fence;
 	struct virtio_gpu_object *qobj;
 	int ret;
 	uint32_t *bo_handles = NULL;
@@ -114,6 +115,9 @@ static int virtio_gpu_execbuffer_ioctl(struct drm_device *dev, void *data,
 	struct ttm_validate_buffer *buflist = NULL;
 	int i;
 	struct ww_acquire_ctx ticket;
+	struct sync_file *sync_file;
+	int in_fence_fd = exbuf->fence_fd;
+	int out_fence_fd = -1;
 	void *buf;
 
 	if (vgdev->has_virgl_3d == false)
@@ -124,6 +128,33 @@ static int virtio_gpu_execbuffer_ioctl(struct drm_device *dev, void *data,
 
 	exbuf->fence_fd = -1;
 
+	if (exbuf->flags & VIRTGPU_EXECBUF_FENCE_FD_IN) {
+		struct dma_fence *in_fence;
+
+		in_fence = sync_file_get_fence(in_fence_fd);
+
+		if (!in_fence)
+			return -EINVAL;
+
+		/*
+		 * Wait if the fence is from a foreign context, or if the fence
+		 * array contains any fence from a foreign context.
+		 */
+		ret = 0;
+		if (!dma_fence_match_context(in_fence, vgdev->fence_drv.context))
+			ret = dma_fence_wait(in_fence, true);
+
+		dma_fence_put(in_fence);
+		if (ret)
+			return ret;
+	}
+
+	if (exbuf->flags & VIRTGPU_EXECBUF_FENCE_FD_OUT) {
+		out_fence_fd = get_unused_fd_flags(O_CLOEXEC);
+		if (out_fence_fd < 0)
+			return out_fence_fd;
+	}
+
 	INIT_LIST_HEAD(&validate_list);
 	if (exbuf->num_bo_handles) {
 
@@ -133,26 +164,22 @@ static int virtio_gpu_execbuffer_ioctl(struct drm_device *dev, void *data,
 					   sizeof(struct ttm_validate_buffer),
 					   GFP_KERNEL | __GFP_ZERO);
 		if (!bo_handles || !buflist) {
-			kvfree(bo_handles);
-			kvfree(buflist);
-			return -ENOMEM;
+			ret = -ENOMEM;
+			goto out_unused_fd;
 		}
 
 		user_bo_handles = (void __user *)(uintptr_t)exbuf->bo_handles;
 		if (copy_from_user(bo_handles, user_bo_handles,
 				   exbuf->num_bo_handles * sizeof(uint32_t))) {
 			ret = -EFAULT;
-			kvfree(bo_handles);
-			kvfree(buflist);
-			return ret;
+			goto out_unused_fd;
 		}
 
 		for (i = 0; i < exbuf->num_bo_handles; i++) {
 			gobj = drm_gem_object_lookup(drm_file, bo_handles[i]);
 			if (!gobj) {
-				kvfree(bo_handles);
-				kvfree(buflist);
-				return -ENOENT;
+				ret = -ENOENT;
+				goto out_unused_fd;
 			}
 
 			qobj = gem_to_virtio_gpu_obj(gobj);
@@ -161,6 +188,7 @@ static int virtio_gpu_execbuffer_ioctl(struct drm_device *dev, void *data,
 			list_add(&buflist[i].head, &validate_list);
 		}
 		kvfree(bo_handles);
+		bo_handles = NULL;
 	}
 
 	ret = virtio_gpu_object_list_validate(&ticket, &validate_list);
@@ -174,28 +202,47 @@ static int virtio_gpu_execbuffer_ioctl(struct drm_device *dev, void *data,
 		goto out_unresv;
 	}
 
-	fence = virtio_gpu_fence_alloc(vgdev);
-	if (!fence) {
-		kfree(buf);
+	out_fence = virtio_gpu_fence_alloc(vgdev);
+	if(!out_fence) {
 		ret = -ENOMEM;
-		goto out_unresv;
+		goto out_memdup;
+	}
+
+	if (out_fence_fd >= 0) {
+		sync_file = sync_file_create(&out_fence->f);
+		if (!sync_file) {
+			dma_fence_put(&out_fence->f);
+			ret = -ENOMEM;
+			goto out_memdup;
+		}
+
+		exbuf->fence_fd = out_fence_fd;
+		fd_install(out_fence_fd, sync_file->file);
 	}
+
 	virtio_gpu_cmd_submit(vgdev, buf, exbuf->size,
-			      vfpriv->ctx_id, &fence);
+			      vfpriv->ctx_id, &out_fence);
 
-	ttm_eu_fence_buffer_objects(&ticket, &validate_list, &fence->f);
+	ttm_eu_fence_buffer_objects(&ticket, &validate_list, &out_fence->f);
 
 	/* fence the command bo */
 	virtio_gpu_unref_list(&validate_list);
 	kvfree(buflist);
-	dma_fence_put(&fence->f);
 	return 0;
 
+out_memdup:
+	kfree(buf);
 out_unresv:
 	ttm_eu_backoff_reservation(&ticket, &validate_list);
 out_free:
 	virtio_gpu_unref_list(&validate_list);
+out_unused_fd:
+	kvfree(bo_handles);
 	kvfree(buflist);
+
+	if (out_fence_fd >= 0)
+		put_unused_fd(out_fence_fd);
+
 	return ret;
 }
 
diff --git a/include/uapi/drm/virtgpu_drm.h b/include/uapi/drm/virtgpu_drm.h
index 91062f4ac7c5..f06a789f34cd 100644
--- a/include/uapi/drm/virtgpu_drm.h
+++ b/include/uapi/drm/virtgpu_drm.h
@@ -66,7 +66,7 @@ struct drm_virtgpu_execbuffer {
 	__u64 command; /* void* */
 	__u64 bo_handles;
 	__u32 num_bo_handles;
-	__s32 fence_fd;
+	__s32 fence_fd; /* in/out fence fd (see VIRTGPU_EXECBUF_FENCE_FD_IN/OUT) */
 };
 
 #define VIRTGPU_PARAM_3D_FEATURES 1 /* do we have 3D features in the hw */
-- 
cgit v1.2.3-59-g8ed1b


From 5c72299fba9df407c2f2994e194edebf878996ee Mon Sep 17 00:00:00 2001
From: Amritha Nambiar <amritha.nambiar@intel.com>
Date: Mon, 12 Nov 2018 16:15:55 -0800
Subject: net: sched: cls_flower: Classify packets using port ranges

Added support in tc flower for filtering based on port ranges.

Example:
1. Match on a port range:
-------------------------
$ tc filter add dev enp4s0 protocol ip parent ffff:\
  prio 1 flower ip_proto tcp dst_port range 20-30 skip_hw\
  action drop

$ tc -s filter show dev enp4s0 parent ffff:
filter protocol ip pref 1 flower chain 0
filter protocol ip pref 1 flower chain 0 handle 0x1
  eth_type ipv4
  ip_proto tcp
  dst_port range 20-30
  skip_hw
  not_in_hw
        action order 1: gact action drop
         random type none pass val 0
         index 1 ref 1 bind 1 installed 85 sec used 3 sec
        Action statistics:
        Sent 460 bytes 10 pkt (dropped 10, overlimits 0 requeues 0)
        backlog 0b 0p requeues 0

2. Match on IP address and port range:
--------------------------------------
$ tc filter add dev enp4s0 protocol ip parent ffff:\
  prio 1 flower dst_ip 192.168.1.1 ip_proto tcp dst_port range 100-200\
  skip_hw action drop

$ tc -s filter show dev enp4s0 parent ffff:
filter protocol ip pref 1 flower chain 0 handle 0x2
  eth_type ipv4
  ip_proto tcp
  dst_ip 192.168.1.1
  dst_port range 100-200
  skip_hw
  not_in_hw
        action order 1: gact action drop
         random type none pass val 0
         index 2 ref 1 bind 1 installed 58 sec used 2 sec
        Action statistics:
        Sent 920 bytes 20 pkt (dropped 20, overlimits 0 requeues 0)
        backlog 0b 0p requeues 0

v4:
1. Added condition before setting port key.
2. Organized setting and dumping port range keys into functions
   and added validation of input range.

v3:
1. Moved new fields in UAPI enum to the end of enum.
2. Removed couple of empty lines.

v2:
Addressed Jiri's comments:
1. Added separate functions for dst and src comparisons.
2. Removed endpoint enum.
3. Added new bit TCA_FLOWER_FLAGS_RANGE to decide normal/range
  lookup.
4. Cleaned up fl_lookup function.

Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_cls.h |   7 ++
 net/sched/cls_flower.c       | 155 +++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 156 insertions(+), 6 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 401d0c1e612d..95d0db2a8350 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -485,6 +485,11 @@ enum {
 
 	TCA_FLOWER_IN_HW_COUNT,
 
+	TCA_FLOWER_KEY_PORT_SRC_MIN,	/* be16 */
+	TCA_FLOWER_KEY_PORT_SRC_MAX,	/* be16 */
+	TCA_FLOWER_KEY_PORT_DST_MIN,	/* be16 */
+	TCA_FLOWER_KEY_PORT_DST_MAX,	/* be16 */
+
 	__TCA_FLOWER_MAX,
 };
 
@@ -518,6 +523,8 @@ enum {
 	TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1),
 };
 
+#define TCA_FLOWER_MASK_FLAGS_RANGE	(1 << 0) /* Range-based match */
+
 /* Match-all classifier */
 
 enum {
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index c6c327874abc..85e9f8e1da10 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -55,6 +55,8 @@ struct fl_flow_key {
 	struct flow_dissector_key_ip ip;
 	struct flow_dissector_key_ip enc_ip;
 	struct flow_dissector_key_enc_opts enc_opts;
+	struct flow_dissector_key_ports tp_min;
+	struct flow_dissector_key_ports tp_max;
 } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
 
 struct fl_flow_mask_range {
@@ -65,6 +67,7 @@ struct fl_flow_mask_range {
 struct fl_flow_mask {
 	struct fl_flow_key key;
 	struct fl_flow_mask_range range;
+	u32 flags;
 	struct rhash_head ht_node;
 	struct rhashtable ht;
 	struct rhashtable_params filter_ht_params;
@@ -179,13 +182,89 @@ static void fl_clear_masked_range(struct fl_flow_key *key,
 	memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask));
 }
 
-static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask,
-				       struct fl_flow_key *mkey)
+static bool fl_range_port_dst_cmp(struct cls_fl_filter *filter,
+				  struct fl_flow_key *key,
+				  struct fl_flow_key *mkey)
+{
+	__be16 min_mask, max_mask, min_val, max_val;
+
+	min_mask = htons(filter->mask->key.tp_min.dst);
+	max_mask = htons(filter->mask->key.tp_max.dst);
+	min_val = htons(filter->key.tp_min.dst);
+	max_val = htons(filter->key.tp_max.dst);
+
+	if (min_mask && max_mask) {
+		if (htons(key->tp.dst) < min_val ||
+		    htons(key->tp.dst) > max_val)
+			return false;
+
+		/* skb does not have min and max values */
+		mkey->tp_min.dst = filter->mkey.tp_min.dst;
+		mkey->tp_max.dst = filter->mkey.tp_max.dst;
+	}
+	return true;
+}
+
+static bool fl_range_port_src_cmp(struct cls_fl_filter *filter,
+				  struct fl_flow_key *key,
+				  struct fl_flow_key *mkey)
+{
+	__be16 min_mask, max_mask, min_val, max_val;
+
+	min_mask = htons(filter->mask->key.tp_min.src);
+	max_mask = htons(filter->mask->key.tp_max.src);
+	min_val = htons(filter->key.tp_min.src);
+	max_val = htons(filter->key.tp_max.src);
+
+	if (min_mask && max_mask) {
+		if (htons(key->tp.src) < min_val ||
+		    htons(key->tp.src) > max_val)
+			return false;
+
+		/* skb does not have min and max values */
+		mkey->tp_min.src = filter->mkey.tp_min.src;
+		mkey->tp_max.src = filter->mkey.tp_max.src;
+	}
+	return true;
+}
+
+static struct cls_fl_filter *__fl_lookup(struct fl_flow_mask *mask,
+					 struct fl_flow_key *mkey)
 {
 	return rhashtable_lookup_fast(&mask->ht, fl_key_get_start(mkey, mask),
 				      mask->filter_ht_params);
 }
 
+static struct cls_fl_filter *fl_lookup_range(struct fl_flow_mask *mask,
+					     struct fl_flow_key *mkey,
+					     struct fl_flow_key *key)
+{
+	struct cls_fl_filter *filter, *f;
+
+	list_for_each_entry_rcu(filter, &mask->filters, list) {
+		if (!fl_range_port_dst_cmp(filter, key, mkey))
+			continue;
+
+		if (!fl_range_port_src_cmp(filter, key, mkey))
+			continue;
+
+		f = __fl_lookup(mask, mkey);
+		if (f)
+			return f;
+	}
+	return NULL;
+}
+
+static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask,
+				       struct fl_flow_key *mkey,
+				       struct fl_flow_key *key)
+{
+	if ((mask->flags & TCA_FLOWER_MASK_FLAGS_RANGE))
+		return fl_lookup_range(mask, mkey, key);
+
+	return __fl_lookup(mask, mkey);
+}
+
 static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 		       struct tcf_result *res)
 {
@@ -208,7 +287,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 
 		fl_set_masked_key(&skb_mkey, &skb_key, mask);
 
-		f = fl_lookup(mask, &skb_mkey);
+		f = fl_lookup(mask, &skb_mkey, &skb_key);
 		if (f && !tc_skip_sw(f->flags)) {
 			*res = f->res;
 			return tcf_exts_exec(skb, &f->exts, res);
@@ -514,6 +593,31 @@ static void fl_set_key_val(struct nlattr **tb,
 		memcpy(mask, nla_data(tb[mask_type]), len);
 }
 
+static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key,
+				 struct fl_flow_key *mask)
+{
+	fl_set_key_val(tb, &key->tp_min.dst,
+		       TCA_FLOWER_KEY_PORT_DST_MIN, &mask->tp_min.dst,
+		       TCA_FLOWER_UNSPEC, sizeof(key->tp_min.dst));
+	fl_set_key_val(tb, &key->tp_max.dst,
+		       TCA_FLOWER_KEY_PORT_DST_MAX, &mask->tp_max.dst,
+		       TCA_FLOWER_UNSPEC, sizeof(key->tp_max.dst));
+	fl_set_key_val(tb, &key->tp_min.src,
+		       TCA_FLOWER_KEY_PORT_SRC_MIN, &mask->tp_min.src,
+		       TCA_FLOWER_UNSPEC, sizeof(key->tp_min.src));
+	fl_set_key_val(tb, &key->tp_max.src,
+		       TCA_FLOWER_KEY_PORT_SRC_MAX, &mask->tp_max.src,
+		       TCA_FLOWER_UNSPEC, sizeof(key->tp_max.src));
+
+	if ((mask->tp_min.dst && mask->tp_max.dst &&
+	     htons(key->tp_max.dst) <= htons(key->tp_min.dst)) ||
+	     (mask->tp_min.src && mask->tp_max.src &&
+	      htons(key->tp_max.src) <= htons(key->tp_min.src)))
+		return -EINVAL;
+
+	return 0;
+}
+
 static int fl_set_key_mpls(struct nlattr **tb,
 			   struct flow_dissector_key_mpls *key_val,
 			   struct flow_dissector_key_mpls *key_mask)
@@ -921,6 +1025,14 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
 			       sizeof(key->arp.tha));
 	}
 
+	if (key->basic.ip_proto == IPPROTO_TCP ||
+	    key->basic.ip_proto == IPPROTO_UDP ||
+	    key->basic.ip_proto == IPPROTO_SCTP) {
+		ret = fl_set_key_port_range(tb, key, mask);
+		if (ret)
+			return ret;
+	}
+
 	if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] ||
 	    tb[TCA_FLOWER_KEY_ENC_IPV4_DST]) {
 		key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
@@ -1038,8 +1150,9 @@ static void fl_init_dissector(struct flow_dissector *dissector,
 			     FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4);
 	FL_KEY_SET_IF_MASKED(mask, keys, cnt,
 			     FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6);
-	FL_KEY_SET_IF_MASKED(mask, keys, cnt,
-			     FLOW_DISSECTOR_KEY_PORTS, tp);
+	if (FL_KEY_IS_MASKED(mask, tp) ||
+	    FL_KEY_IS_MASKED(mask, tp_min) || FL_KEY_IS_MASKED(mask, tp_max))
+		FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_PORTS, tp);
 	FL_KEY_SET_IF_MASKED(mask, keys, cnt,
 			     FLOW_DISSECTOR_KEY_IP, ip);
 	FL_KEY_SET_IF_MASKED(mask, keys, cnt,
@@ -1086,6 +1199,10 @@ static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head,
 
 	fl_mask_copy(newmask, mask);
 
+	if ((newmask->key.tp_min.dst && newmask->key.tp_max.dst) ||
+	    (newmask->key.tp_min.src && newmask->key.tp_max.src))
+		newmask->flags |= TCA_FLOWER_MASK_FLAGS_RANGE;
+
 	err = fl_init_mask_hashtable(newmask);
 	if (err)
 		goto errout_free;
@@ -1239,7 +1356,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 		goto errout_idr;
 
 	if (!tc_skip_sw(fnew->flags)) {
-		if (!fold && fl_lookup(fnew->mask, &fnew->mkey)) {
+		if (!fold && __fl_lookup(fnew->mask, &fnew->mkey)) {
 			err = -EEXIST;
 			goto errout_mask;
 		}
@@ -1476,6 +1593,26 @@ static int fl_dump_key_val(struct sk_buff *skb,
 	return 0;
 }
 
+static int fl_dump_key_port_range(struct sk_buff *skb, struct fl_flow_key *key,
+				  struct fl_flow_key *mask)
+{
+	if (fl_dump_key_val(skb, &key->tp_min.dst, TCA_FLOWER_KEY_PORT_DST_MIN,
+			    &mask->tp_min.dst, TCA_FLOWER_UNSPEC,
+			    sizeof(key->tp_min.dst)) ||
+	    fl_dump_key_val(skb, &key->tp_max.dst, TCA_FLOWER_KEY_PORT_DST_MAX,
+			    &mask->tp_max.dst, TCA_FLOWER_UNSPEC,
+			    sizeof(key->tp_max.dst)) ||
+	    fl_dump_key_val(skb, &key->tp_min.src, TCA_FLOWER_KEY_PORT_SRC_MIN,
+			    &mask->tp_min.src, TCA_FLOWER_UNSPEC,
+			    sizeof(key->tp_min.src)) ||
+	    fl_dump_key_val(skb, &key->tp_max.src, TCA_FLOWER_KEY_PORT_SRC_MAX,
+			    &mask->tp_max.src, TCA_FLOWER_UNSPEC,
+			    sizeof(key->tp_max.src)))
+		return -1;
+
+	return 0;
+}
+
 static int fl_dump_key_mpls(struct sk_buff *skb,
 			    struct flow_dissector_key_mpls *mpls_key,
 			    struct flow_dissector_key_mpls *mpls_mask)
@@ -1812,6 +1949,12 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net,
 				  sizeof(key->arp.tha))))
 		goto nla_put_failure;
 
+	if ((key->basic.ip_proto == IPPROTO_TCP ||
+	     key->basic.ip_proto == IPPROTO_UDP ||
+	     key->basic.ip_proto == IPPROTO_SCTP) &&
+	     fl_dump_key_port_range(skb, key, mask))
+		goto nla_put_failure;
+
 	if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS &&
 	    (fl_dump_key_val(skb, &key->enc_ipv4.src,
 			    TCA_FLOWER_KEY_ENC_IPV4_SRC, &mask->enc_ipv4.src,
-- 
cgit v1.2.3-59-g8ed1b


From dfdda82e3b84c13601be09f8351ec4f15a4fbe03 Mon Sep 17 00:00:00 2001
From: Vitaly Chikunov <vt@altlinux.org>
Date: Wed, 7 Nov 2018 00:00:02 +0300
Subject: crypto: streebog - register Streebog in hash info for IMA

Register Streebog hash function in Hash Info arrays to let IMA use
it for its purposes.

Cc: linux-integrity@vger.kernel.org
Signed-off-by: Vitaly Chikunov <vt@altlinux.org>
Reviewed-by: Mimi Zohar <zohar@linux.ibm.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/hash_info.c             | 4 ++++
 include/crypto/hash_info.h     | 1 +
 include/uapi/linux/hash_info.h | 2 ++
 3 files changed, 7 insertions(+)

(limited to 'include/uapi')

diff --git a/crypto/hash_info.c b/crypto/hash_info.c
index 7b1e0b188ce6..1dd095e4b451 100644
--- a/crypto/hash_info.c
+++ b/crypto/hash_info.c
@@ -32,6 +32,8 @@ const char *const hash_algo_name[HASH_ALGO__LAST] = {
 	[HASH_ALGO_TGR_160]	= "tgr160",
 	[HASH_ALGO_TGR_192]	= "tgr192",
 	[HASH_ALGO_SM3_256]	= "sm3-256",
+	[HASH_ALGO_STREEBOG_256] = "streebog256",
+	[HASH_ALGO_STREEBOG_512] = "streebog512",
 };
 EXPORT_SYMBOL_GPL(hash_algo_name);
 
@@ -54,5 +56,7 @@ const int hash_digest_size[HASH_ALGO__LAST] = {
 	[HASH_ALGO_TGR_160]	= TGR160_DIGEST_SIZE,
 	[HASH_ALGO_TGR_192]	= TGR192_DIGEST_SIZE,
 	[HASH_ALGO_SM3_256]	= SM3256_DIGEST_SIZE,
+	[HASH_ALGO_STREEBOG_256] = STREEBOG256_DIGEST_SIZE,
+	[HASH_ALGO_STREEBOG_512] = STREEBOG512_DIGEST_SIZE,
 };
 EXPORT_SYMBOL_GPL(hash_digest_size);
diff --git a/include/crypto/hash_info.h b/include/crypto/hash_info.h
index 56f217d41f12..91786b68dbdb 100644
--- a/include/crypto/hash_info.h
+++ b/include/crypto/hash_info.h
@@ -15,6 +15,7 @@
 
 #include <crypto/sha.h>
 #include <crypto/md5.h>
+#include <crypto/streebog.h>
 
 #include <uapi/linux/hash_info.h>
 
diff --git a/include/uapi/linux/hash_info.h b/include/uapi/linux/hash_info.h
index eea5d02c58de..74a8609fcb4d 100644
--- a/include/uapi/linux/hash_info.h
+++ b/include/uapi/linux/hash_info.h
@@ -33,6 +33,8 @@ enum hash_algo {
 	HASH_ALGO_TGR_160,
 	HASH_ALGO_TGR_192,
 	HASH_ALGO_SM3_256,
+	HASH_ALGO_STREEBOG_256,
+	HASH_ALGO_STREEBOG_512,
 	HASH_ALGO__LAST
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 80e22e961dfd15530215f6f6dcd94cd8f65ba1ea Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 14 Nov 2018 22:23:49 -0800
Subject: net: sched: gred: provide a better structured dump and expose stats

Currently all GRED's virtual queue data is dumped in a single
array in a single attribute.  This makes it pretty much impossible
to add new fields.  In order to expose more detailed stats add a
new set of attributes.  We can now expose the 64 bit value of bytesin
and all the mark stats which were not part of the original design.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h | 26 +++++++++++++++++++++
 net/sched/sch_gred.c           | 53 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 78 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index ee017bc057a3..c8f717346b60 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -291,11 +291,37 @@ enum {
        TCA_GRED_DPS,
        TCA_GRED_MAX_P,
        TCA_GRED_LIMIT,
+       TCA_GRED_VQ_LIST,	/* nested TCA_GRED_VQ_ENTRY */
        __TCA_GRED_MAX,
 };
 
 #define TCA_GRED_MAX (__TCA_GRED_MAX - 1)
 
+enum {
+	TCA_GRED_VQ_ENTRY_UNSPEC,
+	TCA_GRED_VQ_ENTRY,	/* nested TCA_GRED_VQ_* */
+	__TCA_GRED_VQ_ENTRY_MAX,
+};
+#define TCA_GRED_VQ_ENTRY_MAX (__TCA_GRED_VQ_ENTRY_MAX - 1)
+
+enum {
+	TCA_GRED_VQ_UNSPEC,
+	TCA_GRED_VQ_PAD,
+	TCA_GRED_VQ_DP,			/* u32 */
+	TCA_GRED_VQ_STAT_BYTES,		/* u64 */
+	TCA_GRED_VQ_STAT_PACKETS,	/* u32 */
+	TCA_GRED_VQ_STAT_BACKLOG,	/* u32 */
+	TCA_GRED_VQ_STAT_PROB_DROP,	/* u32 */
+	TCA_GRED_VQ_STAT_PROB_MARK,	/* u32 */
+	TCA_GRED_VQ_STAT_FORCED_DROP,	/* u32 */
+	TCA_GRED_VQ_STAT_FORCED_MARK,	/* u32 */
+	TCA_GRED_VQ_STAT_PDROP,		/* u32 */
+	TCA_GRED_VQ_STAT_OTHER,		/* u32 */
+	__TCA_GRED_VQ_MAX
+};
+
+#define TCA_GRED_VQ_MAX (__TCA_GRED_VQ_MAX - 1)
+
 struct tc_gred_qopt {
 	__u32		limit;        /* HARD maximal queue length (bytes)    */
 	__u32		qth_min;      /* Min average length threshold (bytes) */
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index 6f209c83ee7a..dc09a32c4b4f 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -404,6 +404,7 @@ static const struct nla_policy gred_policy[TCA_GRED_MAX + 1] = {
 	[TCA_GRED_DPS]		= { .len = sizeof(struct tc_gred_sopt) },
 	[TCA_GRED_MAX_P]	= { .type = NLA_U32 },
 	[TCA_GRED_LIMIT]	= { .type = NLA_U32 },
+	[TCA_GRED_VQ_LIST]	= { .type = NLA_REJECT },
 };
 
 static int gred_change(struct Qdisc *sch, struct nlattr *opt,
@@ -517,7 +518,7 @@ static int gred_init(struct Qdisc *sch, struct nlattr *opt,
 static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
 {
 	struct gred_sched *table = qdisc_priv(sch);
-	struct nlattr *parms, *opts = NULL;
+	struct nlattr *parms, *vqs, *opts = NULL;
 	int i;
 	u32 max_p[MAX_DPs];
 	struct tc_gred_sopt sopt = {
@@ -544,6 +545,7 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
 	if (nla_put_u32(skb, TCA_GRED_LIMIT, sch->limit))
 		goto nla_put_failure;
 
+	/* Old style all-in-one dump of VQs */
 	parms = nla_nest_start(skb, TCA_GRED_PARMS);
 	if (parms == NULL)
 		goto nla_put_failure;
@@ -594,6 +596,55 @@ append_opt:
 
 	nla_nest_end(skb, parms);
 
+	/* Dump the VQs again, in more structured way */
+	vqs = nla_nest_start(skb, TCA_GRED_VQ_LIST);
+	if (!vqs)
+		goto nla_put_failure;
+
+	for (i = 0; i < MAX_DPs; i++) {
+		struct gred_sched_data *q = table->tab[i];
+		struct nlattr *vq;
+
+		if (!q)
+			continue;
+
+		vq = nla_nest_start(skb, TCA_GRED_VQ_ENTRY);
+		if (!vq)
+			goto nla_put_failure;
+
+		if (nla_put_u32(skb, TCA_GRED_VQ_DP, q->DP))
+			goto nla_put_failure;
+
+		/* Stats */
+		if (nla_put_u64_64bit(skb, TCA_GRED_VQ_STAT_BYTES, q->bytesin,
+				      TCA_GRED_VQ_PAD))
+			goto nla_put_failure;
+		if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PACKETS, q->packetsin))
+			goto nla_put_failure;
+		if (nla_put_u32(skb, TCA_GRED_VQ_STAT_BACKLOG,
+				gred_backlog(table, q, sch)))
+			goto nla_put_failure;
+		if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PROB_DROP,
+				q->stats.prob_drop))
+			goto nla_put_failure;
+		if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PROB_MARK,
+				q->stats.prob_mark))
+			goto nla_put_failure;
+		if (nla_put_u32(skb, TCA_GRED_VQ_STAT_FORCED_DROP,
+				q->stats.forced_drop))
+			goto nla_put_failure;
+		if (nla_put_u32(skb, TCA_GRED_VQ_STAT_FORCED_MARK,
+				q->stats.forced_mark))
+			goto nla_put_failure;
+		if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PDROP, q->stats.pdrop))
+			goto nla_put_failure;
+		if (nla_put_u32(skb, TCA_GRED_VQ_STAT_OTHER, q->stats.other))
+			goto nla_put_failure;
+
+		nla_nest_end(skb, vq);
+	}
+	nla_nest_end(skb, vqs);
+
 	return nla_nest_end(skb, opts);
 
 nla_put_failure:
-- 
cgit v1.2.3-59-g8ed1b


From 72111015024f4eddb5aac400ddbe38a4f8f0279a Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 14 Nov 2018 22:23:51 -0800
Subject: net: sched: gred: allow manipulating per-DP RED flags

Allow users to set and dump RED flags (ECN enabled and harddrop)
on per-virtual queue basis.  Validation of attributes is split
from changes to make sure we won't have to undo previous operations
when we find out configuration is invalid.

The objective is to allow changing per-Qdisc parameters without
overwriting the per-vq configured flags.

Old user space will not pass the TCA_GRED_VQ_FLAGS attribute and
per-Qdisc flags will always get propagated to the virtual queues.

New user space which wants to make use of per-vq flags should set
per-Qdisc flags to 0 and then configure per-vq flags as it
sees fit.  Once per-vq flags are set per-Qdisc flags can't be
changed to non-zero.  Vice versa - if the per-Qdisc flags are
non-zero the TCA_GRED_VQ_FLAGS attribute has to either be omitted
or set to the same value as per-Qdisc flags.

Update per-Qdisc parameters:
per-Qdisc | per-VQ | result
        0 |      0 | all vq flags updated
	0 |  non-0 | error (vq flags in use)
    non-0 |      0 | -- impossible --
    non-0 |  non-0 | all vq flags updated

Update per-VQ state (flags parameter not specified):
   no change to flags

Update per-VQ state (flags parameter set):
per-Qdisc | per-VQ | result
        0 |   any  | per-vq flags updated
    non-0 |      0 | -- impossible --
    non-0 |  non-0 | error (per-Qdisc flags in use)

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h |   1 +
 net/sched/sch_gred.c           | 144 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 144 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index c8f717346b60..0d18b1d1fbbc 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -317,6 +317,7 @@ enum {
 	TCA_GRED_VQ_STAT_FORCED_MARK,	/* u32 */
 	TCA_GRED_VQ_STAT_PDROP,		/* u32 */
 	TCA_GRED_VQ_STAT_OTHER,		/* u32 */
+	TCA_GRED_VQ_FLAGS,		/* u32 */
 	__TCA_GRED_VQ_MAX
 };
 
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index 47133106c7e2..8b8c325f48bc 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -152,6 +152,19 @@ static int gred_use_harddrop(struct gred_sched_data *q)
 	return q->red_flags & TC_RED_HARDDROP;
 }
 
+static bool gred_per_vq_red_flags_used(struct gred_sched *table)
+{
+	unsigned int i;
+
+	/* Local per-vq flags couldn't have been set unless global are 0 */
+	if (table->red_flags)
+		return false;
+	for (i = 0; i < MAX_DPs; i++)
+		if (table->tab[i] && table->tab[i]->red_flags)
+			return true;
+	return false;
+}
+
 static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 			struct sk_buff **to_free)
 {
@@ -329,6 +342,10 @@ static int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps,
 		NL_SET_ERR_MSG_MOD(extack, "default virtual queue above virtual queue count");
 		return -EINVAL;
 	}
+	if (sopt->flags && gred_per_vq_red_flags_used(table)) {
+		NL_SET_ERR_MSG_MOD(extack, "can't set per-Qdisc RED flags when per-virtual queue flags are used");
+		return -EINVAL;
+	}
 
 	sch_tree_lock(sch);
 	table->DPs = sopt->DPs;
@@ -410,15 +427,127 @@ static inline int gred_change_vq(struct Qdisc *sch, int dp,
 	return 0;
 }
 
+static const struct nla_policy gred_vq_policy[TCA_GRED_VQ_MAX + 1] = {
+	[TCA_GRED_VQ_DP]	= { .type = NLA_U32 },
+	[TCA_GRED_VQ_FLAGS]	= { .type = NLA_U32 },
+};
+
+static const struct nla_policy gred_vqe_policy[TCA_GRED_VQ_ENTRY_MAX + 1] = {
+	[TCA_GRED_VQ_ENTRY]	= { .type = NLA_NESTED },
+};
+
 static const struct nla_policy gred_policy[TCA_GRED_MAX + 1] = {
 	[TCA_GRED_PARMS]	= { .len = sizeof(struct tc_gred_qopt) },
 	[TCA_GRED_STAB]		= { .len = 256 },
 	[TCA_GRED_DPS]		= { .len = sizeof(struct tc_gred_sopt) },
 	[TCA_GRED_MAX_P]	= { .type = NLA_U32 },
 	[TCA_GRED_LIMIT]	= { .type = NLA_U32 },
-	[TCA_GRED_VQ_LIST]	= { .type = NLA_REJECT },
+	[TCA_GRED_VQ_LIST]	= { .type = NLA_NESTED },
 };
 
+static void gred_vq_apply(struct gred_sched *table, const struct nlattr *entry)
+{
+	struct nlattr *tb[TCA_GRED_VQ_MAX + 1];
+	u32 dp;
+
+	nla_parse_nested(tb, TCA_GRED_VQ_MAX, entry, gred_vq_policy, NULL);
+
+	dp = nla_get_u32(tb[TCA_GRED_VQ_DP]);
+
+	if (tb[TCA_GRED_VQ_FLAGS])
+		table->tab[dp]->red_flags = nla_get_u32(tb[TCA_GRED_VQ_FLAGS]);
+}
+
+static void gred_vqs_apply(struct gred_sched *table, struct nlattr *vqs)
+{
+	const struct nlattr *attr;
+	int rem;
+
+	nla_for_each_nested(attr, vqs, rem) {
+		switch (nla_type(attr)) {
+		case TCA_GRED_VQ_ENTRY:
+			gred_vq_apply(table, attr);
+			break;
+		}
+	}
+}
+
+static int gred_vq_validate(struct gred_sched *table, u32 cdp,
+			    const struct nlattr *entry,
+			    struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[TCA_GRED_VQ_MAX + 1];
+	int err;
+	u32 dp;
+
+	err = nla_parse_nested(tb, TCA_GRED_VQ_MAX, entry, gred_vq_policy,
+			       extack);
+	if (err < 0)
+		return err;
+
+	if (!tb[TCA_GRED_VQ_DP]) {
+		NL_SET_ERR_MSG_MOD(extack, "Virtual queue with no index specified");
+		return -EINVAL;
+	}
+	dp = nla_get_u32(tb[TCA_GRED_VQ_DP]);
+	if (dp >= table->DPs) {
+		NL_SET_ERR_MSG_MOD(extack, "Virtual queue with index out of bounds");
+		return -EINVAL;
+	}
+	if (dp != cdp && !table->tab[dp]) {
+		NL_SET_ERR_MSG_MOD(extack, "Virtual queue not yet instantiated");
+		return -EINVAL;
+	}
+
+	if (tb[TCA_GRED_VQ_FLAGS]) {
+		u32 red_flags = nla_get_u32(tb[TCA_GRED_VQ_FLAGS]);
+
+		if (table->red_flags && table->red_flags != red_flags) {
+			NL_SET_ERR_MSG_MOD(extack, "can't change per-virtual queue RED flags when per-Qdisc flags are used");
+			return -EINVAL;
+		}
+		if (red_flags & ~GRED_VQ_RED_FLAGS) {
+			NL_SET_ERR_MSG_MOD(extack,
+					   "invalid RED flags specified");
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+static int gred_vqs_validate(struct gred_sched *table, u32 cdp,
+			     struct nlattr *vqs, struct netlink_ext_ack *extack)
+{
+	const struct nlattr *attr;
+	int rem, err;
+
+	err = nla_validate_nested(vqs, TCA_GRED_VQ_ENTRY_MAX,
+				  gred_vqe_policy, extack);
+	if (err < 0)
+		return err;
+
+	nla_for_each_nested(attr, vqs, rem) {
+		switch (nla_type(attr)) {
+		case TCA_GRED_VQ_ENTRY:
+			err = gred_vq_validate(table, cdp, attr, extack);
+			if (err)
+				return err;
+			break;
+		default:
+			NL_SET_ERR_MSG_MOD(extack, "GRED_VQ_LIST can contain only entry attributes");
+			return -EINVAL;
+		}
+	}
+
+	if (rem > 0) {
+		NL_SET_ERR_MSG_MOD(extack, "Trailing data after parsing virtual queue list");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int gred_change(struct Qdisc *sch, struct nlattr *opt,
 		       struct netlink_ext_ack *extack)
 {
@@ -460,6 +589,13 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt,
 		return -EINVAL;
 	}
 
+	if (tb[TCA_GRED_VQ_LIST]) {
+		err = gred_vqs_validate(table, ctl->DP, tb[TCA_GRED_VQ_LIST],
+					extack);
+		if (err)
+			return err;
+	}
+
 	if (gred_rio_mode(table)) {
 		if (ctl->prio == 0) {
 			int def_prio = GRED_DEF_PRIO;
@@ -483,6 +619,9 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt,
 	if (err < 0)
 		goto err_unlock_free;
 
+	if (tb[TCA_GRED_VQ_LIST])
+		gred_vqs_apply(table, tb[TCA_GRED_VQ_LIST]);
+
 	if (gred_rio_mode(table)) {
 		gred_disable_wred_mode(table);
 		if (gred_wred_mode_check(sch))
@@ -627,6 +766,9 @@ append_opt:
 		if (nla_put_u32(skb, TCA_GRED_VQ_DP, q->DP))
 			goto nla_put_failure;
 
+		if (nla_put_u32(skb, TCA_GRED_VQ_FLAGS, q->red_flags))
+			goto nla_put_failure;
+
 		/* Stats */
 		if (nla_put_u64_64bit(skb, TCA_GRED_VQ_STAT_BYTES, q->bytesin,
 				      TCA_GRED_VQ_PAD))
-- 
cgit v1.2.3-59-g8ed1b


From 54e8cb786130949a4d37792383cb528176771e5d Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Thu, 15 Nov 2018 15:26:51 -0800
Subject: uapi/ethtool: fix spelling errors

Trivial spelling errors found by codespell.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index c8f8e2455bf3..17be76aeb468 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -882,7 +882,7 @@ struct ethtool_rx_flow_spec {
 	__u32		location;
 };
 
-/* How rings are layed out when accessing virtual functions or
+/* How rings are laid out when accessing virtual functions or
  * offloaded queues is device specific. To allow users to do flow
  * steering and specify these queues the ring cookie is partitioned
  * into a 32bit queue index with an 8 bit virtual function id.
@@ -891,7 +891,7 @@ struct ethtool_rx_flow_spec {
  * devices start supporting PCIe w/ARI. However at the moment I
  * do not know of any devices that support this so I do not reserve
  * space for this at this time. If a future patch consumes the next
- * byte it should be aware of this possiblity.
+ * byte it should be aware of this possibility.
  */
 #define ETHTOOL_RX_FLOW_SPEC_RING	0x00000000FFFFFFFFLL
 #define ETHTOOL_RX_FLOW_SPEC_RING_VF	0x000000FF00000000LL
-- 
cgit v1.2.3-59-g8ed1b


From e8bd8fca6773ef49390269bd467bf940a0841ccf Mon Sep 17 00:00:00 2001
From: Yousuk Seung <ysseung@google.com>
Date: Thu, 15 Nov 2018 16:44:12 -0800
Subject: tcp: add SRTT to SCM_TIMESTAMPING_OPT_STATS

Add TCP_NLA_SRTT to SCM_TIMESTAMPING_OPT_STATS that reports the smoothed
round trip time in microseconds (tcp_sock.srtt_us >> 3).

Signed-off-by: Yousuk Seung <ysseung@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tcp.h | 1 +
 net/ipv4/tcp.c           | 2 ++
 2 files changed, 3 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index e02d31986ff9..8bb6cc5f3235 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -266,6 +266,7 @@ enum {
 	TCP_NLA_BYTES_RETRANS,	/* Data bytes retransmitted */
 	TCP_NLA_DSACK_DUPS,	/* DSACK blocks received */
 	TCP_NLA_REORD_SEEN,	/* reordering events seen */
+	TCP_NLA_SRTT,		/* smoothed RTT in usecs */
 };
 
 /* for TCP_MD5SIG socket option */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ca2b08c0b4a0..252048776dbb 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3242,6 +3242,7 @@ static size_t tcp_opt_stats_get_size(void)
 		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */
 		nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */
 		nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */
+		nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */
 		0;
 }
 
@@ -3295,6 +3296,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
 			  TCP_NLA_PAD);
 	nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
 	nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
+	nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3);
 
 	return stats;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 8d951a75d022d94a05f5fa74217670a981e8302d Mon Sep 17 00:00:00 2001
From: Samuel Mendoza-Jonas <sam@mendozajonas.com>
Date: Fri, 16 Nov 2018 15:51:59 +1100
Subject: net/ncsi: Configure multi-package, multi-channel modes with failover

This patch extends the ncsi-netlink interface with two new commands and
three new attributes to configure multiple packages and/or channels at
once, and configure specific failover modes.

NCSI_CMD_SET_PACKAGE mask and NCSI_CMD_SET_CHANNEL_MASK set a whitelist
of packages or channels allowed to be configured with the
NCSI_ATTR_PACKAGE_MASK and NCSI_ATTR_CHANNEL_MASK attributes
respectively. If one of these whitelists is set only packages or
channels matching the whitelist are considered for the channel queue in
ncsi_choose_active_channel().

These commands may also use the NCSI_ATTR_MULTI_FLAG to signal that
multiple packages or channels may be configured simultaneously. NCSI
hardware arbitration (HWA) must be available in order to enable
multi-package mode. Multi-channel mode is always available.

If the NCSI_ATTR_CHANNEL_ID attribute is present in the
NCSI_CMD_SET_CHANNEL_MASK command the it sets the preferred channel as
with the NCSI_CMD_SET_INTERFACE command. The combination of preferred
channel and channel whitelist defines a primary channel and the allowed
failover channels.
If the NCSI_ATTR_MULTI_FLAG attribute is also present then the preferred
channel is configured for Tx/Rx and the other channels are enabled only
for Rx.

Signed-off-by: Samuel Mendoza-Jonas <sam@mendozajonas.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ncsi.h |  15 +++
 net/ncsi/internal.h       |  16 ++-
 net/ncsi/ncsi-aen.c       |  63 ++++++++---
 net/ncsi/ncsi-manage.c    | 264 ++++++++++++++++++++++++++++++++++++++--------
 net/ncsi/ncsi-netlink.c   | 221 +++++++++++++++++++++++++++++++++-----
 net/ncsi/ncsi-rsp.c       |   2 +-
 6 files changed, 493 insertions(+), 88 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/ncsi.h b/include/uapi/linux/ncsi.h
index 0a26a5576645..a3f87c54fdb3 100644
--- a/include/uapi/linux/ncsi.h
+++ b/include/uapi/linux/ncsi.h
@@ -26,6 +26,12 @@
  * @NCSI_CMD_SEND_CMD: send NC-SI command to network card.
  *	Requires NCSI_ATTR_IFINDEX, NCSI_ATTR_PACKAGE_ID
  *	and NCSI_ATTR_CHANNEL_ID.
+ * @NCSI_CMD_SET_PACKAGE_MASK: set a whitelist of allowed packages.
+ *	Requires NCSI_ATTR_IFINDEX and NCSI_ATTR_PACKAGE_MASK.
+ * @NCSI_CMD_SET_CHANNEL_MASK: set a whitelist of allowed channels.
+ *	Requires NCSI_ATTR_IFINDEX, NCSI_ATTR_PACKAGE_ID, and
+ *	NCSI_ATTR_CHANNEL_MASK. If NCSI_ATTR_CHANNEL_ID is present it sets
+ *	the primary channel.
  * @NCSI_CMD_MAX: highest command number
  */
 enum ncsi_nl_commands {
@@ -34,6 +40,8 @@ enum ncsi_nl_commands {
 	NCSI_CMD_SET_INTERFACE,
 	NCSI_CMD_CLEAR_INTERFACE,
 	NCSI_CMD_SEND_CMD,
+	NCSI_CMD_SET_PACKAGE_MASK,
+	NCSI_CMD_SET_CHANNEL_MASK,
 
 	__NCSI_CMD_AFTER_LAST,
 	NCSI_CMD_MAX = __NCSI_CMD_AFTER_LAST - 1
@@ -48,6 +56,10 @@ enum ncsi_nl_commands {
  * @NCSI_ATTR_PACKAGE_ID: package ID
  * @NCSI_ATTR_CHANNEL_ID: channel ID
  * @NCSI_ATTR_DATA: command payload
+ * @NCSI_ATTR_MULTI_FLAG: flag to signal that multi-mode should be enabled with
+ *	NCSI_CMD_SET_PACKAGE_MASK or NCSI_CMD_SET_CHANNEL_MASK.
+ * @NCSI_ATTR_PACKAGE_MASK: 32-bit mask of allowed packages.
+ * @NCSI_ATTR_CHANNEL_MASK: 32-bit mask of allowed channels.
  * @NCSI_ATTR_MAX: highest attribute number
  */
 enum ncsi_nl_attrs {
@@ -57,6 +69,9 @@ enum ncsi_nl_attrs {
 	NCSI_ATTR_PACKAGE_ID,
 	NCSI_ATTR_CHANNEL_ID,
 	NCSI_ATTR_DATA,
+	NCSI_ATTR_MULTI_FLAG,
+	NCSI_ATTR_PACKAGE_MASK,
+	NCSI_ATTR_CHANNEL_MASK,
 
 	__NCSI_ATTR_AFTER_LAST,
 	NCSI_ATTR_MAX = __NCSI_ATTR_AFTER_LAST - 1
diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h
index bda51cb179fe..9e3642b802c4 100644
--- a/net/ncsi/internal.h
+++ b/net/ncsi/internal.h
@@ -222,6 +222,10 @@ struct ncsi_package {
 	unsigned int         channel_num; /* Number of channels     */
 	struct list_head     channels;    /* List of chanels        */
 	struct list_head     node;        /* Form list of packages  */
+
+	bool                 multi_channel; /* Enable multiple channels  */
+	u32                  channel_whitelist; /* Channels to configure */
+	struct ncsi_channel  *preferred_channel; /* Primary channel      */
 };
 
 struct ncsi_request {
@@ -297,8 +301,6 @@ struct ncsi_dev_priv {
 	unsigned int        package_num;     /* Number of packages         */
 	struct list_head    packages;        /* List of packages           */
 	struct ncsi_channel *hot_channel;    /* Channel was ever active    */
-	struct ncsi_package *force_package;  /* Force a specific package   */
-	struct ncsi_channel *force_channel;  /* Force a specific channel   */
 	struct ncsi_request requests[256];   /* Request table              */
 	unsigned int        request_id;      /* Last used request ID       */
 #define NCSI_REQ_START_IDX	1
@@ -311,6 +313,9 @@ struct ncsi_dev_priv {
 	struct list_head    node;            /* Form NCSI device list      */
 #define NCSI_MAX_VLAN_VIDS	15
 	struct list_head    vlan_vids;       /* List of active VLAN IDs */
+
+	bool                multi_package;   /* Enable multiple packages   */
+	u32                 package_whitelist; /* Packages to configure    */
 };
 
 struct ncsi_cmd_arg {
@@ -364,6 +369,13 @@ struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp,
 void ncsi_free_request(struct ncsi_request *nr);
 struct ncsi_dev *ncsi_find_dev(struct net_device *dev);
 int ncsi_process_next_channel(struct ncsi_dev_priv *ndp);
+bool ncsi_channel_has_link(struct ncsi_channel *channel);
+bool ncsi_channel_is_last(struct ncsi_dev_priv *ndp,
+			  struct ncsi_channel *channel);
+int ncsi_update_tx_channel(struct ncsi_dev_priv *ndp,
+			   struct ncsi_package *np,
+			   struct ncsi_channel *disable,
+			   struct ncsi_channel *enable);
 
 /* Packet handlers */
 u32 ncsi_calculate_checksum(unsigned char *data, int len);
diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c
index 57f77e5d381a..26d67e27551f 100644
--- a/net/ncsi/ncsi-aen.c
+++ b/net/ncsi/ncsi-aen.c
@@ -50,14 +50,15 @@ static int ncsi_validate_aen_pkt(struct ncsi_aen_pkt_hdr *h,
 static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp,
 				struct ncsi_aen_pkt_hdr *h)
 {
-	struct ncsi_aen_lsc_pkt *lsc;
-	struct ncsi_channel *nc;
+	struct ncsi_channel *nc, *tmp;
 	struct ncsi_channel_mode *ncm;
-	bool chained;
-	int state;
 	unsigned long old_data, data;
-	unsigned long flags;
+	struct ncsi_aen_lsc_pkt *lsc;
+	struct ncsi_package *np;
 	bool had_link, has_link;
+	unsigned long flags;
+	bool chained;
+	int state;
 
 	/* Find the NCSI channel */
 	ncsi_find_package_and_channel(ndp, h->common.channel, NULL, &nc);
@@ -92,14 +93,52 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp,
 	if ((had_link == has_link) || chained)
 		return 0;
 
-	if (had_link)
-		ndp->flags |= NCSI_DEV_RESHUFFLE;
-	ncsi_stop_channel_monitor(nc);
-	spin_lock_irqsave(&ndp->lock, flags);
-	list_add_tail_rcu(&nc->link, &ndp->channel_queue);
-	spin_unlock_irqrestore(&ndp->lock, flags);
+	if (!ndp->multi_package && !nc->package->multi_channel) {
+		if (had_link) {
+			ndp->flags |= NCSI_DEV_RESHUFFLE;
+			ncsi_stop_channel_monitor(nc);
+			spin_lock_irqsave(&ndp->lock, flags);
+			list_add_tail_rcu(&nc->link, &ndp->channel_queue);
+			spin_unlock_irqrestore(&ndp->lock, flags);
+			return ncsi_process_next_channel(ndp);
+		}
+		/* Configured channel came up */
+		return 0;
+	}
 
-	return ncsi_process_next_channel(ndp);
+	if (had_link) {
+		ncm = &nc->modes[NCSI_MODE_TX_ENABLE];
+		if (ncsi_channel_is_last(ndp, nc)) {
+			/* No channels left, reconfigure */
+			return ncsi_reset_dev(&ndp->ndev);
+		} else if (ncm->enable) {
+			/* Need to failover Tx channel */
+			ncsi_update_tx_channel(ndp, nc->package, nc, NULL);
+		}
+	} else if (has_link && nc->package->preferred_channel == nc) {
+		/* Return Tx to preferred channel */
+		ncsi_update_tx_channel(ndp, nc->package, NULL, nc);
+	} else if (has_link) {
+		NCSI_FOR_EACH_PACKAGE(ndp, np) {
+			NCSI_FOR_EACH_CHANNEL(np, tmp) {
+				/* Enable Tx on this channel if the current Tx
+				 * channel is down.
+				 */
+				ncm = &tmp->modes[NCSI_MODE_TX_ENABLE];
+				if (ncm->enable &&
+				    !ncsi_channel_has_link(tmp)) {
+					ncsi_update_tx_channel(ndp, nc->package,
+							       tmp, nc);
+					break;
+				}
+			}
+		}
+	}
+
+	/* Leave configured channels active in a multi-channel scenario so
+	 * AEN events are still received.
+	 */
+	return 0;
 }
 
 static int ncsi_aen_handler_cr(struct ncsi_dev_priv *ndp,
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index c814ce9bb3de..92e59f07f9a7 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -28,6 +28,29 @@
 LIST_HEAD(ncsi_dev_list);
 DEFINE_SPINLOCK(ncsi_dev_lock);
 
+bool ncsi_channel_has_link(struct ncsi_channel *channel)
+{
+	return !!(channel->modes[NCSI_MODE_LINK].data[2] & 0x1);
+}
+
+bool ncsi_channel_is_last(struct ncsi_dev_priv *ndp,
+			  struct ncsi_channel *channel)
+{
+	struct ncsi_package *np;
+	struct ncsi_channel *nc;
+
+	NCSI_FOR_EACH_PACKAGE(ndp, np)
+		NCSI_FOR_EACH_CHANNEL(np, nc) {
+			if (nc == channel)
+				continue;
+			if (nc->state == NCSI_CHANNEL_ACTIVE &&
+			    ncsi_channel_has_link(nc))
+				return false;
+		}
+
+	return true;
+}
+
 static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down)
 {
 	struct ncsi_dev *nd = &ndp->ndev;
@@ -52,7 +75,7 @@ static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down)
 				continue;
 			}
 
-			if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1) {
+			if (ncsi_channel_has_link(nc)) {
 				spin_unlock_irqrestore(&nc->lock, flags);
 				nd->link_up = 1;
 				goto report;
@@ -267,6 +290,7 @@ struct ncsi_package *ncsi_add_package(struct ncsi_dev_priv *ndp,
 	np->ndp = ndp;
 	spin_lock_init(&np->lock);
 	INIT_LIST_HEAD(&np->channels);
+	np->channel_whitelist = UINT_MAX;
 
 	spin_lock_irqsave(&ndp->lock, flags);
 	tmp = ncsi_find_package(ndp, id);
@@ -728,13 +752,144 @@ static int ncsi_gma_handler(struct ncsi_cmd_arg *nca, unsigned int mf_id)
 
 #endif /* CONFIG_NCSI_OEM_CMD_GET_MAC */
 
+/* Determine if a given channel from the channel_queue should be used for Tx */
+static bool ncsi_channel_is_tx(struct ncsi_dev_priv *ndp,
+			       struct ncsi_channel *nc)
+{
+	struct ncsi_channel_mode *ncm;
+	struct ncsi_channel *channel;
+	struct ncsi_package *np;
+
+	/* Check if any other channel has Tx enabled; a channel may have already
+	 * been configured and removed from the channel queue.
+	 */
+	NCSI_FOR_EACH_PACKAGE(ndp, np) {
+		if (!ndp->multi_package && np != nc->package)
+			continue;
+		NCSI_FOR_EACH_CHANNEL(np, channel) {
+			ncm = &channel->modes[NCSI_MODE_TX_ENABLE];
+			if (ncm->enable)
+				return false;
+		}
+	}
+
+	/* This channel is the preferred channel and has link */
+	list_for_each_entry_rcu(channel, &ndp->channel_queue, link) {
+		np = channel->package;
+		if (np->preferred_channel &&
+		    ncsi_channel_has_link(np->preferred_channel)) {
+			return np->preferred_channel == nc;
+		}
+	}
+
+	/* This channel has link */
+	if (ncsi_channel_has_link(nc))
+		return true;
+
+	list_for_each_entry_rcu(channel, &ndp->channel_queue, link)
+		if (ncsi_channel_has_link(channel))
+			return false;
+
+	/* No other channel has link; default to this one */
+	return true;
+}
+
+/* Change the active Tx channel in a multi-channel setup */
+int ncsi_update_tx_channel(struct ncsi_dev_priv *ndp,
+			   struct ncsi_package *package,
+			   struct ncsi_channel *disable,
+			   struct ncsi_channel *enable)
+{
+	struct ncsi_cmd_arg nca;
+	struct ncsi_channel *nc;
+	struct ncsi_package *np;
+	int ret = 0;
+
+	if (!package->multi_channel && !ndp->multi_package)
+		netdev_warn(ndp->ndev.dev,
+			    "NCSI: Trying to update Tx channel in single-channel mode\n");
+	nca.ndp = ndp;
+	nca.req_flags = 0;
+
+	/* Find current channel with Tx enabled */
+	NCSI_FOR_EACH_PACKAGE(ndp, np) {
+		if (disable)
+			break;
+		if (!ndp->multi_package && np != package)
+			continue;
+
+		NCSI_FOR_EACH_CHANNEL(np, nc)
+			if (nc->modes[NCSI_MODE_TX_ENABLE].enable) {
+				disable = nc;
+				break;
+			}
+	}
+
+	/* Find a suitable channel for Tx */
+	NCSI_FOR_EACH_PACKAGE(ndp, np) {
+		if (enable)
+			break;
+		if (!ndp->multi_package && np != package)
+			continue;
+		if (!(ndp->package_whitelist & (0x1 << np->id)))
+			continue;
+
+		if (np->preferred_channel &&
+		    ncsi_channel_has_link(np->preferred_channel)) {
+			enable = np->preferred_channel;
+			break;
+		}
+
+		NCSI_FOR_EACH_CHANNEL(np, nc) {
+			if (!(np->channel_whitelist & 0x1 << nc->id))
+				continue;
+			if (nc->state != NCSI_CHANNEL_ACTIVE)
+				continue;
+			if (ncsi_channel_has_link(nc)) {
+				enable = nc;
+				break;
+			}
+		}
+	}
+
+	if (disable == enable)
+		return -1;
+
+	if (!enable)
+		return -1;
+
+	if (disable) {
+		nca.channel = disable->id;
+		nca.package = disable->package->id;
+		nca.type = NCSI_PKT_CMD_DCNT;
+		ret = ncsi_xmit_cmd(&nca);
+		if (ret)
+			netdev_err(ndp->ndev.dev,
+				   "Error %d sending DCNT\n",
+				   ret);
+	}
+
+	netdev_info(ndp->ndev.dev, "NCSI: channel %u enables Tx\n", enable->id);
+
+	nca.channel = enable->id;
+	nca.package = enable->package->id;
+	nca.type = NCSI_PKT_CMD_ECNT;
+	ret = ncsi_xmit_cmd(&nca);
+	if (ret)
+		netdev_err(ndp->ndev.dev,
+			   "Error %d sending ECNT\n",
+			   ret);
+
+	return ret;
+}
+
 static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
 {
-	struct ncsi_dev *nd = &ndp->ndev;
-	struct net_device *dev = nd->dev;
 	struct ncsi_package *np = ndp->active_package;
 	struct ncsi_channel *nc = ndp->active_channel;
 	struct ncsi_channel *hot_nc = NULL;
+	struct ncsi_dev *nd = &ndp->ndev;
+	struct net_device *dev = nd->dev;
 	struct ncsi_cmd_arg nca;
 	unsigned char index;
 	unsigned long flags;
@@ -856,20 +1011,29 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
 		} else if (nd->state == ncsi_dev_state_config_ebf) {
 			nca.type = NCSI_PKT_CMD_EBF;
 			nca.dwords[0] = nc->caps[NCSI_CAP_BC].cap;
-			nd->state = ncsi_dev_state_config_ecnt;
+			if (ncsi_channel_is_tx(ndp, nc))
+				nd->state = ncsi_dev_state_config_ecnt;
+			else
+				nd->state = ncsi_dev_state_config_ec;
 #if IS_ENABLED(CONFIG_IPV6)
 			if (ndp->inet6_addr_num > 0 &&
 			    (nc->caps[NCSI_CAP_GENERIC].cap &
 			     NCSI_CAP_GENERIC_MC))
 				nd->state = ncsi_dev_state_config_egmf;
-			else
-				nd->state = ncsi_dev_state_config_ecnt;
 		} else if (nd->state == ncsi_dev_state_config_egmf) {
 			nca.type = NCSI_PKT_CMD_EGMF;
 			nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap;
-			nd->state = ncsi_dev_state_config_ecnt;
+			if (ncsi_channel_is_tx(ndp, nc))
+				nd->state = ncsi_dev_state_config_ecnt;
+			else
+				nd->state = ncsi_dev_state_config_ec;
 #endif /* CONFIG_IPV6 */
 		} else if (nd->state == ncsi_dev_state_config_ecnt) {
+			if (np->preferred_channel &&
+			    nc != np->preferred_channel)
+				netdev_info(ndp->ndev.dev,
+					    "NCSI: Tx failed over to channel %u\n",
+					    nc->id);
 			nca.type = NCSI_PKT_CMD_ECNT;
 			nd->state = ncsi_dev_state_config_ec;
 		} else if (nd->state == ncsi_dev_state_config_ec) {
@@ -959,43 +1123,35 @@ error:
 
 static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp)
 {
-	struct ncsi_package *np, *force_package;
-	struct ncsi_channel *nc, *found, *hot_nc, *force_channel;
+	struct ncsi_channel *nc, *found, *hot_nc;
 	struct ncsi_channel_mode *ncm;
-	unsigned long flags;
+	unsigned long flags, cflags;
+	struct ncsi_package *np;
+	bool with_link;
 
 	spin_lock_irqsave(&ndp->lock, flags);
 	hot_nc = ndp->hot_channel;
-	force_channel = ndp->force_channel;
-	force_package = ndp->force_package;
 	spin_unlock_irqrestore(&ndp->lock, flags);
 
-	/* Force a specific channel whether or not it has link if we have been
-	 * configured to do so
-	 */
-	if (force_package && force_channel) {
-		found = force_channel;
-		ncm = &found->modes[NCSI_MODE_LINK];
-		if (!(ncm->data[2] & 0x1))
-			netdev_info(ndp->ndev.dev,
-				    "NCSI: Channel %u forced, but it is link down\n",
-				    found->id);
-		goto out;
-	}
-
-	/* The search is done once an inactive channel with up
-	 * link is found.
+	/* By default the search is done once an inactive channel with up
+	 * link is found, unless a preferred channel is set.
+	 * If multi_package or multi_channel are configured all channels in the
+	 * whitelist are added to the channel queue.
 	 */
 	found = NULL;
+	with_link = false;
 	NCSI_FOR_EACH_PACKAGE(ndp, np) {
-		if (ndp->force_package && np != ndp->force_package)
+		if (!(ndp->package_whitelist & (0x1 << np->id)))
 			continue;
 		NCSI_FOR_EACH_CHANNEL(np, nc) {
-			spin_lock_irqsave(&nc->lock, flags);
+			if (!(np->channel_whitelist & (0x1 << nc->id)))
+				continue;
+
+			spin_lock_irqsave(&nc->lock, cflags);
 
 			if (!list_empty(&nc->link) ||
 			    nc->state != NCSI_CHANNEL_INACTIVE) {
-				spin_unlock_irqrestore(&nc->lock, flags);
+				spin_unlock_irqrestore(&nc->lock, cflags);
 				continue;
 			}
 
@@ -1007,32 +1163,49 @@ static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp)
 
 			ncm = &nc->modes[NCSI_MODE_LINK];
 			if (ncm->data[2] & 0x1) {
-				spin_unlock_irqrestore(&nc->lock, flags);
 				found = nc;
-				goto out;
+				with_link = true;
 			}
 
-			spin_unlock_irqrestore(&nc->lock, flags);
+			/* If multi_channel is enabled configure all valid
+			 * channels whether or not they currently have link
+			 * so they will have AENs enabled.
+			 */
+			if (with_link || np->multi_channel) {
+				spin_lock_irqsave(&ndp->lock, flags);
+				list_add_tail_rcu(&nc->link,
+						  &ndp->channel_queue);
+				spin_unlock_irqrestore(&ndp->lock, flags);
+
+				netdev_dbg(ndp->ndev.dev,
+					   "NCSI: Channel %u added to queue (link %s)\n",
+					   nc->id,
+					   ncm->data[2] & 0x1 ? "up" : "down");
+			}
+
+			spin_unlock_irqrestore(&nc->lock, cflags);
+
+			if (with_link && !np->multi_channel)
+				break;
 		}
+		if (with_link && !ndp->multi_package)
+			break;
 	}
 
-	if (!found) {
+	if (list_empty(&ndp->channel_queue) && found) {
+		netdev_info(ndp->ndev.dev,
+			    "NCSI: No channel with link found, configuring channel %u\n",
+			    found->id);
+		spin_lock_irqsave(&ndp->lock, flags);
+		list_add_tail_rcu(&found->link, &ndp->channel_queue);
+		spin_unlock_irqrestore(&ndp->lock, flags);
+	} else if (!found) {
 		netdev_warn(ndp->ndev.dev,
-			    "NCSI: No channel found with link\n");
+			    "NCSI: No channel found to configure!\n");
 		ncsi_report_link(ndp, true);
 		return -ENODEV;
 	}
 
-	ncm = &found->modes[NCSI_MODE_LINK];
-	netdev_dbg(ndp->ndev.dev,
-		   "NCSI: Channel %u added to queue (link %s)\n",
-		   found->id, ncm->data[2] & 0x1 ? "up" : "down");
-
-out:
-	spin_lock_irqsave(&ndp->lock, flags);
-	list_add_tail_rcu(&found->link, &ndp->channel_queue);
-	spin_unlock_irqrestore(&ndp->lock, flags);
-
 	return ncsi_process_next_channel(ndp);
 }
 
@@ -1517,6 +1690,7 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev,
 	INIT_LIST_HEAD(&ndp->channel_queue);
 	INIT_LIST_HEAD(&ndp->vlan_vids);
 	INIT_WORK(&ndp->work, ncsi_dev_work);
+	ndp->package_whitelist = UINT_MAX;
 
 	/* Initialize private NCSI device */
 	spin_lock_init(&ndp->lock);
diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c
index cde48fe43dba..5d782445d2fc 100644
--- a/net/ncsi/ncsi-netlink.c
+++ b/net/ncsi/ncsi-netlink.c
@@ -30,6 +30,9 @@ static const struct nla_policy ncsi_genl_policy[NCSI_ATTR_MAX + 1] = {
 	[NCSI_ATTR_PACKAGE_ID] =	{ .type = NLA_U32 },
 	[NCSI_ATTR_CHANNEL_ID] =	{ .type = NLA_U32 },
 	[NCSI_ATTR_DATA] =		{ .type = NLA_BINARY, .len = 2048 },
+	[NCSI_ATTR_MULTI_FLAG] =	{ .type = NLA_FLAG },
+	[NCSI_ATTR_PACKAGE_MASK] =	{ .type = NLA_U32 },
+	[NCSI_ATTR_CHANNEL_MASK] =	{ .type = NLA_U32 },
 };
 
 static struct ncsi_dev_priv *ndp_from_ifindex(struct net *net, u32 ifindex)
@@ -69,7 +72,7 @@ static int ncsi_write_channel_info(struct sk_buff *skb,
 	nla_put_u32(skb, NCSI_CHANNEL_ATTR_LINK_STATE, m->data[2]);
 	if (nc->state == NCSI_CHANNEL_ACTIVE)
 		nla_put_flag(skb, NCSI_CHANNEL_ATTR_ACTIVE);
-	if (ndp->force_channel == nc)
+	if (nc == nc->package->preferred_channel)
 		nla_put_flag(skb, NCSI_CHANNEL_ATTR_FORCED);
 
 	nla_put_u32(skb, NCSI_CHANNEL_ATTR_VERSION_MAJOR, nc->version.version);
@@ -114,7 +117,7 @@ static int ncsi_write_package_info(struct sk_buff *skb,
 		if (!pnest)
 			return -ENOMEM;
 		nla_put_u32(skb, NCSI_PKG_ATTR_ID, np->id);
-		if (ndp->force_package == np)
+		if ((0x1 << np->id) == ndp->package_whitelist)
 			nla_put_flag(skb, NCSI_PKG_ATTR_FORCED);
 		cnest = nla_nest_start(skb, NCSI_PKG_ATTR_CHANNEL_LIST);
 		if (!cnest) {
@@ -290,45 +293,54 @@ static int ncsi_set_interface_nl(struct sk_buff *msg, struct genl_info *info)
 	package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]);
 	package = NULL;
 
-	spin_lock_irqsave(&ndp->lock, flags);
-
 	NCSI_FOR_EACH_PACKAGE(ndp, np)
 		if (np->id == package_id)
 			package = np;
 	if (!package) {
 		/* The user has set a package that does not exist */
-		spin_unlock_irqrestore(&ndp->lock, flags);
 		return -ERANGE;
 	}
 
 	channel = NULL;
-	if (!info->attrs[NCSI_ATTR_CHANNEL_ID]) {
-		/* Allow any channel */
-		channel_id = NCSI_RESERVED_CHANNEL;
-	} else {
+	if (info->attrs[NCSI_ATTR_CHANNEL_ID]) {
 		channel_id = nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_ID]);
 		NCSI_FOR_EACH_CHANNEL(package, nc)
-			if (nc->id == channel_id)
+			if (nc->id == channel_id) {
 				channel = nc;
+				break;
+			}
+		if (!channel) {
+			netdev_info(ndp->ndev.dev,
+				    "NCSI: Channel %u does not exist!\n",
+				    channel_id);
+			return -ERANGE;
+		}
 	}
 
-	if (channel_id != NCSI_RESERVED_CHANNEL && !channel) {
-		/* The user has set a channel that does not exist on this
-		 * package
-		 */
-		spin_unlock_irqrestore(&ndp->lock, flags);
-		netdev_info(ndp->ndev.dev, "NCSI: Channel %u does not exist!\n",
-			    channel_id);
-		return -ERANGE;
-	}
-
-	ndp->force_package = package;
-	ndp->force_channel = channel;
+	spin_lock_irqsave(&ndp->lock, flags);
+	ndp->package_whitelist = 0x1 << package->id;
+	ndp->multi_package = false;
 	spin_unlock_irqrestore(&ndp->lock, flags);
 
-	netdev_info(ndp->ndev.dev, "Set package 0x%x, channel 0x%x%s as preferred\n",
-		    package_id, channel_id,
-		    channel_id == NCSI_RESERVED_CHANNEL ? " (any)" : "");
+	spin_lock_irqsave(&package->lock, flags);
+	package->multi_channel = false;
+	if (channel) {
+		package->channel_whitelist = 0x1 << channel->id;
+		package->preferred_channel = channel;
+	} else {
+		/* Allow any channel */
+		package->channel_whitelist = UINT_MAX;
+		package->preferred_channel = NULL;
+	}
+	spin_unlock_irqrestore(&package->lock, flags);
+
+	if (channel)
+		netdev_info(ndp->ndev.dev,
+			    "Set package 0x%x, channel 0x%x as preferred\n",
+			    package_id, channel_id);
+	else
+		netdev_info(ndp->ndev.dev, "Set package 0x%x as preferred\n",
+			    package_id);
 
 	/* Update channel configuration */
 	if (!(ndp->flags & NCSI_DEV_RESET))
@@ -340,6 +352,7 @@ static int ncsi_set_interface_nl(struct sk_buff *msg, struct genl_info *info)
 static int ncsi_clear_interface_nl(struct sk_buff *msg, struct genl_info *info)
 {
 	struct ncsi_dev_priv *ndp;
+	struct ncsi_package *np;
 	unsigned long flags;
 
 	if (!info || !info->attrs)
@@ -353,11 +366,19 @@ static int ncsi_clear_interface_nl(struct sk_buff *msg, struct genl_info *info)
 	if (!ndp)
 		return -ENODEV;
 
-	/* Clear any override */
+	/* Reset any whitelists and disable multi mode */
 	spin_lock_irqsave(&ndp->lock, flags);
-	ndp->force_package = NULL;
-	ndp->force_channel = NULL;
+	ndp->package_whitelist = UINT_MAX;
+	ndp->multi_package = false;
 	spin_unlock_irqrestore(&ndp->lock, flags);
+
+	NCSI_FOR_EACH_PACKAGE(ndp, np) {
+		spin_lock_irqsave(&np->lock, flags);
+		np->multi_channel = false;
+		np->channel_whitelist = UINT_MAX;
+		np->preferred_channel = NULL;
+		spin_unlock_irqrestore(&np->lock, flags);
+	}
 	netdev_info(ndp->ndev.dev, "NCSI: Cleared preferred package/channel\n");
 
 	/* Update channel configuration */
@@ -563,6 +584,138 @@ int ncsi_send_netlink_err(struct net_device *dev,
 	return nlmsg_unicast(net->genl_sock, skb, snd_portid);
 }
 
+static int ncsi_set_package_mask_nl(struct sk_buff *msg,
+				    struct genl_info *info)
+{
+	struct ncsi_dev_priv *ndp;
+	unsigned long flags;
+	int rc;
+
+	if (!info || !info->attrs)
+		return -EINVAL;
+
+	if (!info->attrs[NCSI_ATTR_IFINDEX])
+		return -EINVAL;
+
+	if (!info->attrs[NCSI_ATTR_PACKAGE_MASK])
+		return -EINVAL;
+
+	ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)),
+			       nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX]));
+	if (!ndp)
+		return -ENODEV;
+
+	spin_lock_irqsave(&ndp->lock, flags);
+	if (nla_get_flag(info->attrs[NCSI_ATTR_MULTI_FLAG])) {
+		if (ndp->flags & NCSI_DEV_HWA) {
+			ndp->multi_package = true;
+			rc = 0;
+		} else {
+			netdev_err(ndp->ndev.dev,
+				   "NCSI: Can't use multiple packages without HWA\n");
+			rc = -EPERM;
+		}
+	} else {
+		ndp->multi_package = false;
+		rc = 0;
+	}
+
+	if (!rc)
+		ndp->package_whitelist =
+			nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_MASK]);
+	spin_unlock_irqrestore(&ndp->lock, flags);
+
+	if (!rc) {
+		/* Update channel configuration */
+		if (!(ndp->flags & NCSI_DEV_RESET))
+			ncsi_reset_dev(&ndp->ndev);
+	}
+
+	return rc;
+}
+
+static int ncsi_set_channel_mask_nl(struct sk_buff *msg,
+				    struct genl_info *info)
+{
+	struct ncsi_package *np, *package;
+	struct ncsi_channel *nc, *channel;
+	u32 package_id, channel_id;
+	struct ncsi_dev_priv *ndp;
+	unsigned long flags;
+
+	if (!info || !info->attrs)
+		return -EINVAL;
+
+	if (!info->attrs[NCSI_ATTR_IFINDEX])
+		return -EINVAL;
+
+	if (!info->attrs[NCSI_ATTR_PACKAGE_ID])
+		return -EINVAL;
+
+	if (!info->attrs[NCSI_ATTR_CHANNEL_MASK])
+		return -EINVAL;
+
+	ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)),
+			       nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX]));
+	if (!ndp)
+		return -ENODEV;
+
+	package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]);
+	package = NULL;
+	NCSI_FOR_EACH_PACKAGE(ndp, np)
+		if (np->id == package_id) {
+			package = np;
+			break;
+		}
+	if (!package)
+		return -ERANGE;
+
+	spin_lock_irqsave(&package->lock, flags);
+
+	channel = NULL;
+	if (info->attrs[NCSI_ATTR_CHANNEL_ID]) {
+		channel_id = nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_ID]);
+		NCSI_FOR_EACH_CHANNEL(np, nc)
+			if (nc->id == channel_id) {
+				channel = nc;
+				break;
+			}
+		if (!channel) {
+			spin_unlock_irqrestore(&package->lock, flags);
+			return -ERANGE;
+		}
+		netdev_dbg(ndp->ndev.dev,
+			   "NCSI: Channel %u set as preferred channel\n",
+			   channel->id);
+	}
+
+	package->channel_whitelist =
+		nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_MASK]);
+	if (package->channel_whitelist == 0)
+		netdev_dbg(ndp->ndev.dev,
+			   "NCSI: Package %u set to all channels disabled\n",
+			   package->id);
+
+	package->preferred_channel = channel;
+
+	if (nla_get_flag(info->attrs[NCSI_ATTR_MULTI_FLAG])) {
+		package->multi_channel = true;
+		netdev_info(ndp->ndev.dev,
+			    "NCSI: Multi-channel enabled on package %u\n",
+			    package_id);
+	} else {
+		package->multi_channel = false;
+	}
+
+	spin_unlock_irqrestore(&package->lock, flags);
+
+	/* Update channel configuration */
+	if (!(ndp->flags & NCSI_DEV_RESET))
+		ncsi_reset_dev(&ndp->ndev);
+
+	return 0;
+}
+
 static const struct genl_ops ncsi_ops[] = {
 	{
 		.cmd = NCSI_CMD_PKG_INFO,
@@ -589,6 +742,18 @@ static const struct genl_ops ncsi_ops[] = {
 		.doit = ncsi_send_cmd_nl,
 		.flags = GENL_ADMIN_PERM,
 	},
+	{
+		.cmd = NCSI_CMD_SET_PACKAGE_MASK,
+		.policy = ncsi_genl_policy,
+		.doit = ncsi_set_package_mask_nl,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = NCSI_CMD_SET_CHANNEL_MASK,
+		.policy = ncsi_genl_policy,
+		.doit = ncsi_set_channel_mask_nl,
+		.flags = GENL_ADMIN_PERM,
+	},
 };
 
 static struct genl_family ncsi_genl_family __ro_after_init = {
diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c
index 77e07ba3f493..de7737a27889 100644
--- a/net/ncsi/ncsi-rsp.c
+++ b/net/ncsi/ncsi-rsp.c
@@ -256,7 +256,7 @@ static int ncsi_rsp_handler_dcnt(struct ncsi_request *nr)
 	if (!ncm->enable)
 		return 0;
 
-	ncm->enable = 1;
+	ncm->enable = 0;
 	return 0;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From fe841686470d383e33b606d0704ef4295141c582 Mon Sep 17 00:00:00 2001
From: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Date: Fri, 16 Nov 2018 15:55:09 +0200
Subject: Revert "drm/i915/perf: add a parameter to control the size of OA
 buffer"

Userspace portion is still missing.

This reverts commit cd956bfcd0f58d20485ac0a785415f7d9327a95f.

Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: Matthew Auld <matthew.auld@intel.com>
Signed-off-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Acked-by: Matthew Auld <matthew.auld@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20181116135510.13807-1-joonas.lahtinen@linux.intel.com
---
 drivers/gpu/drm/i915/i915_drv.h  |  1 -
 drivers/gpu/drm/i915/i915_perf.c | 99 ++++++++++++++--------------------------
 drivers/gpu/drm/i915/i915_reg.h  |  2 -
 include/uapi/drm/i915_drm.h      |  7 ---
 4 files changed, 33 insertions(+), 76 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index d69b71d368d3..017f851a586a 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2004,7 +2004,6 @@ struct drm_i915_private {
 				u32 last_ctx_id;
 				int format;
 				int format_size;
-				int size_exponent;
 
 				/**
 				 * Locks reads and writes to all head/tail state
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 2c2b63be7a6c..c762418d3b01 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -212,7 +212,13 @@
 #include "i915_oa_icl.h"
 #include "intel_lrc_reg.h"
 
-#define OA_TAKEN(tail, head)	(((tail) - (head)) & (dev_priv->perf.oa.oa_buffer.vma->size - 1))
+/* HW requires this to be a power of two, between 128k and 16M, though driver
+ * is currently generally designed assuming the largest 16M size is used such
+ * that the overflow cases are unlikely in normal operation.
+ */
+#define OA_BUFFER_SIZE		SZ_16M
+
+#define OA_TAKEN(tail, head)	((tail - head) & (OA_BUFFER_SIZE - 1))
 
 /**
  * DOC: OA Tail Pointer Race
@@ -356,7 +362,6 @@ struct perf_open_properties {
 	int oa_format;
 	bool oa_periodic;
 	int oa_period_exponent;
-	u32 oa_buffer_size_exponent;
 };
 
 static void free_oa_config(struct drm_i915_private *dev_priv,
@@ -519,7 +524,7 @@ static bool oa_buffer_check_unlocked(struct drm_i915_private *dev_priv)
 		 * could put the tail out of bounds...
 		 */
 		if (hw_tail >= gtt_offset &&
-		    hw_tail < (gtt_offset + dev_priv->perf.oa.oa_buffer.vma->size)) {
+		    hw_tail < (gtt_offset + OA_BUFFER_SIZE)) {
 			dev_priv->perf.oa.oa_buffer.tails[!aged_idx].offset =
 				aging_tail = hw_tail;
 			dev_priv->perf.oa.oa_buffer.aging_timestamp = now;
@@ -648,7 +653,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
 	u8 *oa_buf_base = dev_priv->perf.oa.oa_buffer.vaddr;
 	u32 gtt_offset = i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma);
-	u32 mask = (dev_priv->perf.oa.oa_buffer.vma->size - 1);
+	u32 mask = (OA_BUFFER_SIZE - 1);
 	size_t start_offset = *offset;
 	unsigned long flags;
 	unsigned int aged_tail_idx;
@@ -688,8 +693,8 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 	 * only be incremented by multiples of the report size (notably also
 	 * all a power of two).
 	 */
-	if (WARN_ONCE(head > dev_priv->perf.oa.oa_buffer.vma->size || head % report_size ||
-		      tail > dev_priv->perf.oa.oa_buffer.vma->size || tail % report_size,
+	if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size ||
+		      tail > OA_BUFFER_SIZE || tail % report_size,
 		      "Inconsistent OA buffer pointers: head = %u, tail = %u\n",
 		      head, tail))
 		return -EIO;
@@ -712,7 +717,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 		 * here would imply a driver bug that would result
 		 * in an overrun.
 		 */
-		if (WARN_ON((dev_priv->perf.oa.oa_buffer.vma->size - head) < report_size)) {
+		if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) {
 			DRM_ERROR("Spurious OA head ptr: non-integral report offset\n");
 			break;
 		}
@@ -871,6 +876,11 @@ static int gen8_oa_read(struct i915_perf_stream *stream,
 	 * automatically triggered reports in this condition and so we
 	 * have to assume that old reports are now being trampled
 	 * over.
+	 *
+	 * Considering how we don't currently give userspace control
+	 * over the OA buffer size and always configure a large 16MB
+	 * buffer, then a buffer overflow does anyway likely indicate
+	 * that something has gone quite badly wrong.
 	 */
 	if (oastatus & GEN8_OASTATUS_OABUFFER_OVERFLOW) {
 		ret = append_oa_status(stream, buf, count, offset,
@@ -932,7 +942,7 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
 	u8 *oa_buf_base = dev_priv->perf.oa.oa_buffer.vaddr;
 	u32 gtt_offset = i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma);
-	u32 mask = (dev_priv->perf.oa.oa_buffer.vma->size - 1);
+	u32 mask = (OA_BUFFER_SIZE - 1);
 	size_t start_offset = *offset;
 	unsigned long flags;
 	unsigned int aged_tail_idx;
@@ -969,8 +979,8 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 	 * only be incremented by multiples of the report size (notably also
 	 * all a power of two).
 	 */
-	if (WARN_ONCE(head > dev_priv->perf.oa.oa_buffer.vma->size || head % report_size ||
-		      tail > dev_priv->perf.oa.oa_buffer.vma->size || tail % report_size,
+	if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size ||
+		      tail > OA_BUFFER_SIZE || tail % report_size,
 		      "Inconsistent OA buffer pointers: head = %u, tail = %u\n",
 		      head, tail))
 		return -EIO;
@@ -990,7 +1000,7 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 		 * here would imply a driver bug that would result
 		 * in an overrun.
 		 */
-		if (WARN_ON((dev_priv->perf.oa.oa_buffer.vma->size - head) < report_size)) {
+		if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) {
 			DRM_ERROR("Spurious OA head ptr: non-integral report offset\n");
 			break;
 		}
@@ -1385,9 +1395,7 @@ static void gen7_init_oa_buffer(struct drm_i915_private *dev_priv)
 
 	I915_WRITE(GEN7_OABUFFER, gtt_offset);
 
-	I915_WRITE(GEN7_OASTATUS1, gtt_offset |
-		   ((dev_priv->perf.oa.oa_buffer.size_exponent - 17) <<
-		    GEN7_OASTATUS1_BUFFER_SIZE_SHIFT)); /* tail */
+	I915_WRITE(GEN7_OASTATUS1, gtt_offset | OABUFFER_SIZE_16M); /* tail */
 
 	/* Mark that we need updated tail pointers to read from... */
 	dev_priv->perf.oa.oa_buffer.tails[0].offset = INVALID_TAIL_PTR;
@@ -1412,8 +1420,7 @@ static void gen7_init_oa_buffer(struct drm_i915_private *dev_priv)
 	 * the assumption that new reports are being written to zeroed
 	 * memory...
 	 */
-	memset(dev_priv->perf.oa.oa_buffer.vaddr, 0,
-	       dev_priv->perf.oa.oa_buffer.vma->size);
+	memset(dev_priv->perf.oa.oa_buffer.vaddr, 0, OA_BUFFER_SIZE);
 
 	/* Maybe make ->pollin per-stream state if we support multiple
 	 * concurrent streams in the future.
@@ -1443,9 +1450,7 @@ static void gen8_init_oa_buffer(struct drm_i915_private *dev_priv)
 	 *  bit."
 	 */
 	I915_WRITE(GEN8_OABUFFER, gtt_offset |
-		   ((dev_priv->perf.oa.oa_buffer.size_exponent - 17) <<
-		    GEN8_OABUFFER_BUFFER_SIZE_SHIFT) |
-		   GEN8_OABUFFER_MEM_SELECT_GGTT);
+		   OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT);
 	I915_WRITE(GEN8_OATAILPTR, gtt_offset & GEN8_OATAILPTR_MASK);
 
 	/* Mark that we need updated tail pointers to read from... */
@@ -1473,8 +1478,7 @@ static void gen8_init_oa_buffer(struct drm_i915_private *dev_priv)
 	 * the assumption that new reports are being written to zeroed
 	 * memory...
 	 */
-	memset(dev_priv->perf.oa.oa_buffer.vaddr, 0,
-	       dev_priv->perf.oa.oa_buffer.vma->size);
+	memset(dev_priv->perf.oa.oa_buffer.vaddr, 0, OA_BUFFER_SIZE);
 
 	/*
 	 * Maybe make ->pollin per-stream state if we support multiple
@@ -1483,24 +1487,23 @@ static void gen8_init_oa_buffer(struct drm_i915_private *dev_priv)
 	dev_priv->perf.oa.pollin = false;
 }
 
-static int alloc_oa_buffer(struct drm_i915_private *dev_priv, int size_exponent)
+static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
 {
 	struct drm_i915_gem_object *bo;
 	struct i915_vma *vma;
-	size_t size = 1U << size_exponent;
 	int ret;
 
 	if (WARN_ON(dev_priv->perf.oa.oa_buffer.vma))
 		return -ENODEV;
 
-	if (WARN_ON(size < SZ_128K || size > SZ_16M))
-		return -EINVAL;
-
 	ret = i915_mutex_lock_interruptible(&dev_priv->drm);
 	if (ret)
 		return ret;
 
-	bo = i915_gem_object_create(dev_priv, size);
+	BUILD_BUG_ON_NOT_POWER_OF_2(OA_BUFFER_SIZE);
+	BUILD_BUG_ON(OA_BUFFER_SIZE < SZ_128K || OA_BUFFER_SIZE > SZ_16M);
+
+	bo = i915_gem_object_create(dev_priv, OA_BUFFER_SIZE);
 	if (IS_ERR(bo)) {
 		DRM_ERROR("Failed to allocate OA buffer\n");
 		ret = PTR_ERR(bo);
@@ -1518,7 +1521,6 @@ static int alloc_oa_buffer(struct drm_i915_private *dev_priv, int size_exponent)
 		goto err_unref;
 	}
 	dev_priv->perf.oa.oa_buffer.vma = vma;
-	dev_priv->perf.oa.oa_buffer.size_exponent = size_exponent;
 
 	dev_priv->perf.oa.oa_buffer.vaddr =
 		i915_gem_object_pin_map(bo, I915_MAP_WB);
@@ -1527,10 +1529,9 @@ static int alloc_oa_buffer(struct drm_i915_private *dev_priv, int size_exponent)
 		goto err_unpin;
 	}
 
-	DRM_DEBUG_DRIVER("OA Buffer initialized, gtt offset = 0x%x, vaddr = %p, size = %llu\n",
+	DRM_DEBUG_DRIVER("OA Buffer initialized, gtt offset = 0x%x, vaddr = %p\n",
 			 i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma),
-			 dev_priv->perf.oa.oa_buffer.vaddr,
-			 dev_priv->perf.oa.oa_buffer.vma->size);
+			 dev_priv->perf.oa.oa_buffer.vaddr);
 
 	goto unlock;
 
@@ -2090,7 +2091,7 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
 	intel_runtime_pm_get(dev_priv);
 	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
 
-	ret = alloc_oa_buffer(dev_priv, props->oa_buffer_size_exponent);
+	ret = alloc_oa_buffer(dev_priv);
 	if (ret)
 		goto err_oa_buf_alloc;
 
@@ -2649,26 +2650,6 @@ static u64 oa_exponent_to_ns(struct drm_i915_private *dev_priv, int exponent)
 			 1000ULL * INTEL_INFO(dev_priv)->cs_timestamp_frequency_khz);
 }
 
-static int
-select_oa_buffer_exponent(struct drm_i915_private *i915,
-			  u64 requested_size)
-{
-	int order;
-
-	/*
-	 * When no size is specified, use the largest size supported by all
-	 * generations.
-	 */
-	if (!requested_size)
-		return order_base_2(SZ_16M);
-
-	order = order_base_2(clamp_t(u64, requested_size, SZ_128K, SZ_16M));
-	if (requested_size != (1UL << order))
-		return -EINVAL;
-
-	return order;
-}
-
 /**
  * read_properties_unlocked - validate + copy userspace stream open properties
  * @dev_priv: i915 device instance
@@ -2796,14 +2777,6 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv,
 			props->oa_periodic = true;
 			props->oa_period_exponent = value;
 			break;
-		case DRM_I915_PERF_PROP_OA_BUFFER_SIZE:
-			ret = select_oa_buffer_exponent(dev_priv, value);
-			if (ret < 0) {
-				DRM_DEBUG("OA buffer size invalid %llu\n", value);
-				return ret;
-			}
-			props->oa_buffer_size_exponent = ret;
-			break;
 		case DRM_I915_PERF_PROP_MAX:
 			MISSING_CASE(id);
 			return -EINVAL;
@@ -2812,12 +2785,6 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv,
 		uprop += 2;
 	}
 
-	/* If no buffer size was requested, select the default one. */
-	if (!props->oa_buffer_size_exponent) {
-		props->oa_buffer_size_exponent =
-			select_oa_buffer_exponent(dev_priv, 0);
-	}
-
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 94ba86018a4f..edb58af1e903 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -586,14 +586,12 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define GEN8_OABUFFER_UDW _MMIO(0x23b4)
 #define GEN8_OABUFFER _MMIO(0x2b14)
 #define  GEN8_OABUFFER_MEM_SELECT_GGTT      (1 << 0)  /* 0: PPGTT, 1: GGTT */
-#define  GEN8_OABUFFER_BUFFER_SIZE_SHIFT    3
 
 #define GEN7_OASTATUS1 _MMIO(0x2364)
 #define  GEN7_OASTATUS1_TAIL_MASK	    0xffffffc0
 #define  GEN7_OASTATUS1_COUNTER_OVERFLOW    (1 << 2)
 #define  GEN7_OASTATUS1_OABUFFER_OVERFLOW   (1 << 1)
 #define  GEN7_OASTATUS1_REPORT_LOST	    (1 << 0)
-#define  GEN7_OASTATUS1_BUFFER_SIZE_SHIFT   3
 
 #define GEN7_OASTATUS2 _MMIO(0x2368)
 #define  GEN7_OASTATUS2_HEAD_MASK           0xffffffc0
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index e477ef8c644e..298b2e197744 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1540,13 +1540,6 @@ enum drm_i915_perf_property_id {
 	 */
 	DRM_I915_PERF_PROP_OA_EXPONENT,
 
-	/**
-	 * Specify a global OA buffer size to be allocated in bytes. The size
-	 * specified must be supported by HW (currently supported sizes are
-	 * powers of 2 ranging from 128Kb to 16Mb).
-	 */
-	DRM_I915_PERF_PROP_OA_BUFFER_SIZE,
-
 	DRM_I915_PERF_PROP_MAX /* non-ABI */
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 2cc0eeb67636e0339ad7b6cdfa305f63983642af Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sun, 18 Nov 2018 16:08:51 +0800
Subject: sctp: define subscribe in sctp_sock as __u16

The member subscribe in sctp_sock is used to indicate to which of
the events it is subscribed, more like a group of flags. So it's
better to be defined as __u16 (2 bytpes), instead of struct
sctp_event_subscribe (13 bytes).

Note that sctp_event_subscribe is an UAPI struct, used on sockopt
calls, and thus it will not be removed. This patch only changes
the internal storage of the flags.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h   |  2 +-
 include/net/sctp/ulpevent.h  | 39 ++++++++++++++++++++++++---------------
 include/uapi/linux/sctp.h    |  6 +++++-
 net/sctp/chunk.c             |  4 ++--
 net/sctp/socket.c            | 35 ++++++++++++++++++++++++++---------
 net/sctp/stream_interleave.c | 11 ++++++-----
 net/sctp/ulpqueue.c          |  8 ++++----
 7 files changed, 68 insertions(+), 37 deletions(-)

(limited to 'include/uapi')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index af9d494120ba..bc7808aa2760 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -217,7 +217,7 @@ struct sctp_sock {
 	 * These two structures must be grouped together for the usercopy
 	 * whitelist region.
 	 */
-	struct sctp_event_subscribe subscribe;
+	__u16 subscribe;
 	struct sctp_initmsg initmsg;
 
 	int user_frag;
diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h
index 51b4e0626c34..bd922a0fe914 100644
--- a/include/net/sctp/ulpevent.h
+++ b/include/net/sctp/ulpevent.h
@@ -164,30 +164,39 @@ void sctp_ulpevent_read_nxtinfo(const struct sctp_ulpevent *event,
 
 __u16 sctp_ulpevent_get_notification_type(const struct sctp_ulpevent *event);
 
+static inline void sctp_ulpevent_type_set(__u16 *subscribe,
+					  __u16 sn_type, __u8 on)
+{
+	if (sn_type > SCTP_SN_TYPE_MAX)
+		return;
+
+	if (on)
+		*subscribe |=  (1 << (sn_type - SCTP_SN_TYPE_BASE));
+	else
+		*subscribe &= ~(1 << (sn_type - SCTP_SN_TYPE_BASE));
+}
+
 /* Is this event type enabled? */
-static inline int sctp_ulpevent_type_enabled(__u16 sn_type,
-					     struct sctp_event_subscribe *mask)
+static inline bool sctp_ulpevent_type_enabled(__u16 subscribe, __u16 sn_type)
 {
-	int offset = sn_type - SCTP_SN_TYPE_BASE;
-	char *amask = (char *) mask;
+	if (sn_type > SCTP_SN_TYPE_MAX)
+		return false;
 
-	if (offset >= sizeof(struct sctp_event_subscribe))
-		return 0;
-	return amask[offset];
+	return subscribe & (1 << (sn_type - SCTP_SN_TYPE_BASE));
 }
 
 /* Given an event subscription, is this event enabled? */
-static inline int sctp_ulpevent_is_enabled(const struct sctp_ulpevent *event,
-					   struct sctp_event_subscribe *mask)
+static inline bool sctp_ulpevent_is_enabled(const struct sctp_ulpevent *event,
+					    __u16 subscribe)
 {
 	__u16 sn_type;
-	int enabled = 1;
 
-	if (sctp_ulpevent_is_notification(event)) {
-		sn_type = sctp_ulpevent_get_notification_type(event);
-		enabled = sctp_ulpevent_type_enabled(sn_type, mask);
-	}
-	return enabled;
+	if (!sctp_ulpevent_is_notification(event))
+		return true;
+
+	sn_type = sctp_ulpevent_get_notification_type(event);
+
+	return sctp_ulpevent_type_enabled(subscribe, sn_type);
 }
 
 #endif /* __sctp_ulpevent_h__ */
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index c81feb373d3e..66afa5b4ab6b 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -632,7 +632,9 @@ union sctp_notification {
  */
 
 enum sctp_sn_type {
-	SCTP_SN_TYPE_BASE     = (1<<15),
+	SCTP_SN_TYPE_BASE	= (1<<15),
+	SCTP_DATA_IO_EVENT	= SCTP_SN_TYPE_BASE,
+#define SCTP_DATA_IO_EVENT		SCTP_DATA_IO_EVENT
 	SCTP_ASSOC_CHANGE,
 #define SCTP_ASSOC_CHANGE		SCTP_ASSOC_CHANGE
 	SCTP_PEER_ADDR_CHANGE,
@@ -657,6 +659,8 @@ enum sctp_sn_type {
 #define SCTP_ASSOC_RESET_EVENT		SCTP_ASSOC_RESET_EVENT
 	SCTP_STREAM_CHANGE_EVENT,
 #define SCTP_STREAM_CHANGE_EVENT	SCTP_STREAM_CHANGE_EVENT
+	SCTP_SN_TYPE_MAX	= SCTP_STREAM_CHANGE_EVENT,
+#define SCTP_SN_TYPE_MAX		SCTP_SN_TYPE_MAX
 };
 
 /* Notification error codes used to fill up the error fields in some
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index ce8087846f05..6c761af960fd 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -109,8 +109,8 @@ static void sctp_datamsg_destroy(struct sctp_datamsg *msg)
 				error = asoc->outqueue.error;
 
 			sp = sctp_sk(asoc->base.sk);
-			notify = sctp_ulpevent_type_enabled(SCTP_SEND_FAILED,
-							    &sp->subscribe);
+			notify = sctp_ulpevent_type_enabled(sp->subscribe,
+							    SCTP_SEND_FAILED);
 		}
 
 		/* Generate a SEND FAILED event only if enabled. */
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 5299add6d7aa..9d7512958a6a 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2230,7 +2230,7 @@ static int sctp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 	if (sp->recvrcvinfo)
 		sctp_ulpevent_read_rcvinfo(event, msg);
 	/* Check if we allow SCTP_SNDRCVINFO. */
-	if (sp->subscribe.sctp_data_io_event)
+	if (sctp_ulpevent_type_enabled(sp->subscribe, SCTP_DATA_IO_EVENT))
 		sctp_ulpevent_read_sndrcvinfo(event, msg);
 
 	err = copied;
@@ -2304,21 +2304,28 @@ static int sctp_setsockopt_disable_fragments(struct sock *sk,
 static int sctp_setsockopt_events(struct sock *sk, char __user *optval,
 				  unsigned int optlen)
 {
-	struct sctp_association *asoc;
-	struct sctp_ulpevent *event;
+	struct sctp_event_subscribe subscribe;
+	__u8 *sn_type = (__u8 *)&subscribe;
+	struct sctp_sock *sp = sctp_sk(sk);
+	int i;
 
 	if (optlen > sizeof(struct sctp_event_subscribe))
 		return -EINVAL;
-	if (copy_from_user(&sctp_sk(sk)->subscribe, optval, optlen))
+
+	if (copy_from_user(&subscribe, optval, optlen))
 		return -EFAULT;
 
+	for (i = 0; i < optlen; i++)
+		sctp_ulpevent_type_set(&sp->subscribe, SCTP_SN_TYPE_BASE + i,
+				       sn_type[i]);
+
 	/* At the time when a user app subscribes to SCTP_SENDER_DRY_EVENT,
 	 * if there is no data to be sent or retransmit, the stack will
 	 * immediately send up this notification.
 	 */
-	if (sctp_ulpevent_type_enabled(SCTP_SENDER_DRY_EVENT,
-				       &sctp_sk(sk)->subscribe)) {
-		asoc = sctp_id2assoc(sk, 0);
+	if (sctp_ulpevent_type_enabled(sp->subscribe, SCTP_SENDER_DRY_EVENT)) {
+		struct sctp_association *asoc = sctp_id2assoc(sk, 0);
+		struct sctp_ulpevent *event;
 
 		if (asoc && sctp_outq_is_empty(&asoc->outqueue)) {
 			event = sctp_ulpevent_make_sender_dry_event(asoc,
@@ -4722,7 +4729,7 @@ static int sctp_init_sock(struct sock *sk)
 	/* Initialize default event subscriptions. By default, all the
 	 * options are off.
 	 */
-	memset(&sp->subscribe, 0, sizeof(struct sctp_event_subscribe));
+	sp->subscribe = 0;
 
 	/* Default Peer Address Parameters.  These defaults can
 	 * be modified via SCTP_PEER_ADDR_PARAMS
@@ -5267,14 +5274,24 @@ static int sctp_getsockopt_disable_fragments(struct sock *sk, int len,
 static int sctp_getsockopt_events(struct sock *sk, int len, char __user *optval,
 				  int __user *optlen)
 {
+	struct sctp_event_subscribe subscribe;
+	__u8 *sn_type = (__u8 *)&subscribe;
+	int i;
+
 	if (len == 0)
 		return -EINVAL;
 	if (len > sizeof(struct sctp_event_subscribe))
 		len = sizeof(struct sctp_event_subscribe);
 	if (put_user(len, optlen))
 		return -EFAULT;
-	if (copy_to_user(optval, &sctp_sk(sk)->subscribe, len))
+
+	for (i = 0; i < len; i++)
+		sn_type[i] = sctp_ulpevent_type_enabled(sctp_sk(sk)->subscribe,
+							SCTP_SN_TYPE_BASE + i);
+
+	if (copy_to_user(optval, &subscribe, len))
 		return -EFAULT;
+
 	return 0;
 }
 
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index 2b499a85db0e..ceef5a3a5aac 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -503,7 +503,7 @@ static int sctp_enqueue_event(struct sctp_ulpq *ulpq,
 		sk_incoming_cpu_update(sk);
 	}
 
-	if (!sctp_ulpevent_is_enabled(event, &sp->subscribe))
+	if (!sctp_ulpevent_is_enabled(event, sp->subscribe))
 		goto out_free;
 
 	if (skb_list)
@@ -992,10 +992,11 @@ static void sctp_intl_stream_abort_pd(struct sctp_ulpq *ulpq, __u16 sid,
 				      __u32 mid, __u16 flags, gfp_t gfp)
 {
 	struct sock *sk = ulpq->asoc->base.sk;
+	struct sctp_sock *sp = sctp_sk(sk);
 	struct sctp_ulpevent *ev = NULL;
 
-	if (!sctp_ulpevent_type_enabled(SCTP_PARTIAL_DELIVERY_EVENT,
-					&sctp_sk(sk)->subscribe))
+	if (!sctp_ulpevent_type_enabled(sp->subscribe,
+					SCTP_PARTIAL_DELIVERY_EVENT))
 		return;
 
 	ev = sctp_ulpevent_make_pdapi(ulpq->asoc, SCTP_PARTIAL_DELIVERY_ABORTED,
@@ -1003,8 +1004,8 @@ static void sctp_intl_stream_abort_pd(struct sctp_ulpq *ulpq, __u16 sid,
 	if (ev) {
 		__skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev));
 
-		if (!sctp_sk(sk)->data_ready_signalled) {
-			sctp_sk(sk)->data_ready_signalled = 1;
+		if (!sp->data_ready_signalled) {
+			sp->data_ready_signalled = 1;
 			sk->sk_data_ready(sk);
 		}
 	}
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index 331cc734e3db..b36dd9024da3 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -219,7 +219,7 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
 		sk_incoming_cpu_update(sk);
 	}
 	/* Check if the user wishes to receive this event.  */
-	if (!sctp_ulpevent_is_enabled(event, &sp->subscribe))
+	if (!sctp_ulpevent_is_enabled(event, sp->subscribe))
 		goto out_free;
 
 	/* If we are in partial delivery mode, post to the lobby until
@@ -1129,16 +1129,16 @@ void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
 void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp)
 {
 	struct sctp_ulpevent *ev = NULL;
-	struct sock *sk;
 	struct sctp_sock *sp;
+	struct sock *sk;
 
 	if (!ulpq->pd_mode)
 		return;
 
 	sk = ulpq->asoc->base.sk;
 	sp = sctp_sk(sk);
-	if (sctp_ulpevent_type_enabled(SCTP_PARTIAL_DELIVERY_EVENT,
-				       &sctp_sk(sk)->subscribe))
+	if (sctp_ulpevent_type_enabled(sp->subscribe,
+				       SCTP_PARTIAL_DELIVERY_EVENT))
 		ev = sctp_ulpevent_make_pdapi(ulpq->asoc,
 					      SCTP_PARTIAL_DELIVERY_ABORTED,
 					      0, 0, 0, gfp);
-- 
cgit v1.2.3-59-g8ed1b


From 480ba9c18a27ff77b02a2012e50dfd3e20ee9f7a Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sun, 18 Nov 2018 16:08:54 +0800
Subject: sctp: add sockopt SCTP_EVENT

This patch adds sockopt SCTP_EVENT described in rfc6525#section-6.2.
With this sockopt users can subscribe to an event from a specified
asoc.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/sctp.h |  7 ++++
 net/sctp/socket.c         | 88 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 95 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index 66afa5b4ab6b..d584073532b8 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -129,6 +129,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_STREAM_SCHEDULER_VALUE	124
 #define SCTP_INTERLEAVING_SUPPORTED	125
 #define SCTP_SENDMSG_CONNECT	126
+#define SCTP_EVENT	127
 
 /* PR-SCTP policies */
 #define SCTP_PR_SCTP_NONE	0x0000
@@ -1154,6 +1155,12 @@ struct sctp_add_streams {
 	uint16_t sas_outstrms;
 };
 
+struct sctp_event {
+	sctp_assoc_t se_assoc_id;
+	uint16_t se_type;
+	uint8_t se_on;
+};
+
 /* SCTP Stream schedulers */
 enum sctp_sched_type {
 	SCTP_SS_FCFS,
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index c7718272d69b..e16c090e89f0 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4288,6 +4288,57 @@ static int sctp_setsockopt_reuse_port(struct sock *sk, char __user *optval,
 	return 0;
 }
 
+static int sctp_setsockopt_event(struct sock *sk, char __user *optval,
+				 unsigned int optlen)
+{
+	struct sctp_association *asoc;
+	struct sctp_ulpevent *event;
+	struct sctp_event param;
+	int retval = 0;
+
+	if (optlen < sizeof(param)) {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	optlen = sizeof(param);
+	if (copy_from_user(&param, optval, optlen)) {
+		retval = -EFAULT;
+		goto out;
+	}
+
+	if (param.se_type < SCTP_SN_TYPE_BASE ||
+	    param.se_type > SCTP_SN_TYPE_MAX) {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	asoc = sctp_id2assoc(sk, param.se_assoc_id);
+	if (!asoc) {
+		sctp_ulpevent_type_set(&sctp_sk(sk)->subscribe,
+				       param.se_type, param.se_on);
+		goto out;
+	}
+
+	sctp_ulpevent_type_set(&asoc->subscribe, param.se_type, param.se_on);
+
+	if (param.se_type == SCTP_SENDER_DRY_EVENT && param.se_on) {
+		if (sctp_outq_is_empty(&asoc->outqueue)) {
+			event = sctp_ulpevent_make_sender_dry_event(asoc,
+					GFP_USER | __GFP_NOWARN);
+			if (!event) {
+				retval = -ENOMEM;
+				goto out;
+			}
+
+			asoc->stream.si->enqueue_event(&asoc->ulpq, event);
+		}
+	}
+
+out:
+	return retval;
+}
+
 /* API 6.2 setsockopt(), getsockopt()
  *
  * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -4485,6 +4536,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname,
 	case SCTP_REUSE_PORT:
 		retval = sctp_setsockopt_reuse_port(sk, optval, optlen);
 		break;
+	case SCTP_EVENT:
+		retval = sctp_setsockopt_event(sk, optval, optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
@@ -7430,6 +7484,37 @@ static int sctp_getsockopt_reuse_port(struct sock *sk, int len,
 	return 0;
 }
 
+static int sctp_getsockopt_event(struct sock *sk, int len, char __user *optval,
+				 int __user *optlen)
+{
+	struct sctp_association *asoc;
+	struct sctp_event param;
+	__u16 subscribe;
+
+	if (len < sizeof(param))
+		return -EINVAL;
+
+	len = sizeof(param);
+	if (copy_from_user(&param, optval, len))
+		return -EFAULT;
+
+	if (param.se_type < SCTP_SN_TYPE_BASE ||
+	    param.se_type > SCTP_SN_TYPE_MAX)
+		return -EINVAL;
+
+	asoc = sctp_id2assoc(sk, param.se_assoc_id);
+	subscribe = asoc ? asoc->subscribe : sctp_sk(sk)->subscribe;
+	param.se_on = sctp_ulpevent_type_enabled(subscribe, param.se_type);
+
+	if (put_user(len, optlen))
+		return -EFAULT;
+
+	if (copy_to_user(optval, &param, len))
+		return -EFAULT;
+
+	return 0;
+}
+
 static int sctp_getsockopt(struct sock *sk, int level, int optname,
 			   char __user *optval, int __user *optlen)
 {
@@ -7628,6 +7713,9 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname,
 	case SCTP_REUSE_PORT:
 		retval = sctp_getsockopt_reuse_port(sk, len, optval, optlen);
 		break;
+	case SCTP_EVENT:
+		retval = sctp_getsockopt_event(sk, len, optval, optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
-- 
cgit v1.2.3-59-g8ed1b


From 96b3b6c9091d23289721350e32c63cc8749686be Mon Sep 17 00:00:00 2001
From: Lorenz Bauer <lmb@cloudflare.com>
Date: Fri, 16 Nov 2018 11:41:08 +0000
Subject: bpf: allow zero-initializing hash map seed

Add a new flag BPF_F_ZERO_SEED, which forces a hash map
to initialize the seed to zero. This is useful when doing
performance analysis both on individual BPF programs, as
well as the kernel's hash table implementation.

Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h |  3 +++
 kernel/bpf/hashtab.c     | 13 +++++++++++--
 2 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 47d606d744cc..8c01b89a4cb4 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -269,6 +269,9 @@ enum bpf_attach_type {
 /* Flag for stack_map, store build_id+offset instead of pointer */
 #define BPF_F_STACK_BUILD_ID	(1U << 5)
 
+/* Zero-initialize hash function seed. This should only be used for testing. */
+#define BPF_F_ZERO_SEED		(1U << 6)
+
 enum bpf_stack_build_id_status {
 	/* user space need an empty entry to identify end of a trace */
 	BPF_STACK_BUILD_ID_EMPTY = 0,
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 2c1790288138..4b7c76765d9d 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -23,7 +23,7 @@
 
 #define HTAB_CREATE_FLAG_MASK						\
 	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |	\
-	 BPF_F_RDONLY | BPF_F_WRONLY)
+	 BPF_F_RDONLY | BPF_F_WRONLY | BPF_F_ZERO_SEED)
 
 struct bucket {
 	struct hlist_nulls_head head;
@@ -244,6 +244,7 @@ static int htab_map_alloc_check(union bpf_attr *attr)
 	 */
 	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
 	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
+	bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
 	int numa_node = bpf_map_attr_numa_node(attr);
 
 	BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
@@ -257,6 +258,10 @@ static int htab_map_alloc_check(union bpf_attr *attr)
 		 */
 		return -EPERM;
 
+	if (zero_seed && !capable(CAP_SYS_ADMIN))
+		/* Guard against local DoS, and discourage production use. */
+		return -EPERM;
+
 	if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK)
 		/* reserved bits should not be used */
 		return -EINVAL;
@@ -373,7 +378,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	if (!htab->buckets)
 		goto free_htab;
 
-	htab->hashrnd = get_random_int();
+	if (htab->map.map_flags & BPF_F_ZERO_SEED)
+		htab->hashrnd = 0;
+	else
+		htab->hashrnd = get_random_int();
+
 	for (i = 0; i < htab->n_buckets; i++) {
 		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
 		raw_spin_lock_init(&htab->buckets[i].lock);
-- 
cgit v1.2.3-59-g8ed1b


From 2f1833607aed6a9c1e1729bf0e2588c341ceb409 Mon Sep 17 00:00:00 2001
From: Lorenz Bauer <lmb@cloudflare.com>
Date: Fri, 16 Nov 2018 11:41:09 +0000
Subject: bpf: move BPF_F_QUERY_EFFECTIVE after map flags

BPF_F_QUERY_EFFECTIVE is in the middle of the flags valid
for BPF_MAP_CREATE. Move it to its own section to reduce confusion.

Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 8c01b89a4cb4..05d95290b848 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -257,9 +257,6 @@ enum bpf_attach_type {
 /* Specify numa node during map creation */
 #define BPF_F_NUMA_NODE		(1U << 2)
 
-/* flags for BPF_PROG_QUERY */
-#define BPF_F_QUERY_EFFECTIVE	(1U << 0)
-
 #define BPF_OBJ_NAME_LEN 16U
 
 /* Flags for accessing BPF object */
@@ -272,6 +269,9 @@ enum bpf_attach_type {
 /* Zero-initialize hash function seed. This should only be used for testing. */
 #define BPF_F_ZERO_SEED		(1U << 6)
 
+/* flags for BPF_PROG_QUERY */
+#define BPF_F_QUERY_EFFECTIVE	(1U << 0)
+
 enum bpf_stack_build_id_status {
 	/* user space need an empty entry to identify end of a trace */
 	BPF_STACK_BUILD_ID_EMPTY = 0,
-- 
cgit v1.2.3-59-g8ed1b


From 23464f8c3407b83106463999b64fe10dc66ff6a3 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Tue, 20 Nov 2018 10:52:33 +0900
Subject: aio: Comment use of IOCB_FLAG_IOPRIO aio flag

Comment the use of the IOCB_FLAG_IOPRIO aio flag similarly to the
IOCB_FLAG_RESFD flag.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/aio_abi.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
index ce43d340f010..8387e0af0f76 100644
--- a/include/uapi/linux/aio_abi.h
+++ b/include/uapi/linux/aio_abi.h
@@ -50,6 +50,8 @@ enum {
  *
  * IOCB_FLAG_RESFD - Set if the "aio_resfd" member of the "struct iocb"
  *                   is valid.
+ * IOCB_FLAG_IOPRIO - Set if the "aio_reqprio" member of the "struct iocb"
+ *                    is valid.
  */
 #define IOCB_FLAG_RESFD		(1 << 0)
 #define IOCB_FLAG_IOPRIO	(1 << 1)
-- 
cgit v1.2.3-59-g8ed1b


From 61e49394a31aee438e026e553a1422e13e0309d9 Mon Sep 17 00:00:00 2001
From: Stanislav Lisovskiy <stanislav.lisovskiy@intel.com>
Date: Fri, 9 Nov 2018 11:39:15 +0200
Subject: drm: Introduce new DRM_FORMAT_XYUV
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

v5: This is YUV444 packed format same as AYUV, but without alpha,
    as supported by i915.

v6: Removed unneeded initializer for new XYUV format.

v7: Added is_yuv field initialization according to latest
    drm_fourcc format structure initialization changes.

v8: Edited commit message to be more clear about skl+, renamed
    PLANE_CTL_FORMAT_AYUV to PLANE_CTL_FORMAT_XYUV as this format
    doesn't support per-pixel alpha. Fixed minor code issues.

v9: Moved DRM format check to proper place in intel_framebuffer_init.

v10: Changed DRM_FORMAT_XYUV to be DRM_FORMAT_XYUV8888

v11: Fixed rebase conflict, caused by added new formats to drm-tip
     meanwhile.

Reviewed-by: Alexandru Gheorghe <alexandru-cosmin.gheorghe@arm.com>
Signed-off-by: Stanislav Lisovskiy <stanislav.lisovskiy@intel.com>
[vsyrjala: Removed stray tab and sorted the formats differently]
Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20181109093916.25858-2-stanislav.lisovskiy@intel.com
---
 drivers/gpu/drm/drm_fourcc.c  | 1 +
 include/uapi/drm/drm_fourcc.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/uapi')

diff --git a/drivers/gpu/drm/drm_fourcc.c b/drivers/gpu/drm/drm_fourcc.c
index f523948c82b1..d90ee03a84c6 100644
--- a/drivers/gpu/drm/drm_fourcc.c
+++ b/drivers/gpu/drm/drm_fourcc.c
@@ -224,6 +224,7 @@ const struct drm_format_info *__drm_format_info(u32 format)
 		{ .format = DRM_FORMAT_YVYU,		.depth = 0,  .num_planes = 1, .cpp = { 2, 0, 0 }, .hsub = 2, .vsub = 1, .is_yuv = true },
 		{ .format = DRM_FORMAT_UYVY,		.depth = 0,  .num_planes = 1, .cpp = { 2, 0, 0 }, .hsub = 2, .vsub = 1, .is_yuv = true },
 		{ .format = DRM_FORMAT_VYUY,		.depth = 0,  .num_planes = 1, .cpp = { 2, 0, 0 }, .hsub = 2, .vsub = 1, .is_yuv = true },
+		{ .format = DRM_FORMAT_XYUV8888,	.depth = 0,  .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 1, .vsub = 1, .is_yuv = true },
 		{ .format = DRM_FORMAT_AYUV,		.depth = 0,  .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true, .is_yuv = true },
 		{ .format = DRM_FORMAT_Y0L0,		.depth = 0,  .num_planes = 1,
 		  .char_per_block = { 8, 0, 0 }, .block_w = { 2, 0, 0 }, .block_h = { 2, 0, 0 },
diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h
index e7e48f1f4a74..0b44260a5ee9 100644
--- a/include/uapi/drm/drm_fourcc.h
+++ b/include/uapi/drm/drm_fourcc.h
@@ -151,6 +151,7 @@ extern "C" {
 #define DRM_FORMAT_VYUY		fourcc_code('V', 'Y', 'U', 'Y') /* [31:0] Y1:Cb0:Y0:Cr0 8:8:8:8 little endian */
 
 #define DRM_FORMAT_AYUV		fourcc_code('A', 'Y', 'U', 'V') /* [31:0] A:Y:Cb:Cr 8:8:8:8 little endian */
+#define DRM_FORMAT_XYUV8888		fourcc_code('X', 'Y', 'U', 'V') /* [31:0] X:Y:Cb:Cr 8:8:8:8 little endian */
 
 /*
  * packed YCbCr420 2x2 tiled formats
-- 
cgit v1.2.3-59-g8ed1b


From 177bbc67812d96dfec517ef293017ca614a0955a Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hans.verkuil@cisco.com>
Date: Thu, 4 Oct 2018 16:30:04 -0400
Subject: media: v4l2-common.h: put backwards compat defines under #ifndef
 __KERNEL__
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This ensures that they won't be used in kernel code.

Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
Reviewed-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Tested-by: Sylwester Nawrocki <s.nawrocki@samsung.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 include/uapi/linux/v4l2-common.h | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/v4l2-common.h b/include/uapi/linux/v4l2-common.h
index 4f7b892377cd..7d21c1634b4d 100644
--- a/include/uapi/linux/v4l2-common.h
+++ b/include/uapi/linux/v4l2-common.h
@@ -79,24 +79,11 @@
 /* Current composing area plus all padding pixels */
 #define V4L2_SEL_TGT_COMPOSE_PADDED	0x0103
 
-/* Backward compatibility target definitions --- to be removed. */
-#define V4L2_SEL_TGT_CROP_ACTIVE	V4L2_SEL_TGT_CROP
-#define V4L2_SEL_TGT_COMPOSE_ACTIVE	V4L2_SEL_TGT_COMPOSE
-#define V4L2_SUBDEV_SEL_TGT_CROP_ACTUAL	V4L2_SEL_TGT_CROP
-#define V4L2_SUBDEV_SEL_TGT_COMPOSE_ACTUAL V4L2_SEL_TGT_COMPOSE
-#define V4L2_SUBDEV_SEL_TGT_CROP_BOUNDS	V4L2_SEL_TGT_CROP_BOUNDS
-#define V4L2_SUBDEV_SEL_TGT_COMPOSE_BOUNDS V4L2_SEL_TGT_COMPOSE_BOUNDS
-
 /* Selection flags */
 #define V4L2_SEL_FLAG_GE		(1 << 0)
 #define V4L2_SEL_FLAG_LE		(1 << 1)
 #define V4L2_SEL_FLAG_KEEP_CONFIG	(1 << 2)
 
-/* Backward compatibility flag definitions --- to be removed. */
-#define V4L2_SUBDEV_SEL_FLAG_SIZE_GE	V4L2_SEL_FLAG_GE
-#define V4L2_SUBDEV_SEL_FLAG_SIZE_LE	V4L2_SEL_FLAG_LE
-#define V4L2_SUBDEV_SEL_FLAG_KEEP_CONFIG V4L2_SEL_FLAG_KEEP_CONFIG
-
 struct v4l2_edid {
 	__u32 pad;
 	__u32 start_block;
@@ -105,4 +92,19 @@ struct v4l2_edid {
 	__u8  *edid;
 };
 
+#ifndef __KERNEL__
+/* Backward compatibility target definitions --- to be removed. */
+#define V4L2_SEL_TGT_CROP_ACTIVE	V4L2_SEL_TGT_CROP
+#define V4L2_SEL_TGT_COMPOSE_ACTIVE	V4L2_SEL_TGT_COMPOSE
+#define V4L2_SUBDEV_SEL_TGT_CROP_ACTUAL	V4L2_SEL_TGT_CROP
+#define V4L2_SUBDEV_SEL_TGT_COMPOSE_ACTUAL V4L2_SEL_TGT_COMPOSE
+#define V4L2_SUBDEV_SEL_TGT_CROP_BOUNDS	V4L2_SEL_TGT_CROP_BOUNDS
+#define V4L2_SUBDEV_SEL_TGT_COMPOSE_BOUNDS V4L2_SEL_TGT_COMPOSE_BOUNDS
+
+/* Backward compatibility flag definitions --- to be removed. */
+#define V4L2_SUBDEV_SEL_FLAG_SIZE_GE	V4L2_SEL_FLAG_GE
+#define V4L2_SUBDEV_SEL_FLAG_SIZE_LE	V4L2_SEL_FLAG_LE
+#define V4L2_SUBDEV_SEL_FLAG_KEEP_CONFIG V4L2_SEL_FLAG_KEEP_CONFIG
+#endif
+
 #endif /* __V4L2_COMMON__ */
-- 
cgit v1.2.3-59-g8ed1b


From 2667a2626f4da370409c2830552f6e8c8b8c41e2 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Mon, 19 Nov 2018 15:29:08 -0800
Subject: bpf: btf: Add BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO

This patch adds BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO
to support the function debug info.

BTF_KIND_FUNC_PROTO must not have a name (i.e. !t->name_off)
and it is followed by >= 0 'struct bpf_param' objects to
describe the function arguments.

The BTF_KIND_FUNC must have a valid name and it must
refer back to a BTF_KIND_FUNC_PROTO.

The above is the conclusion after the discussion between
Edward Cree, Alexei, Daniel, Yonghong and Martin.

By combining BTF_KIND_FUNC and BTF_LIND_FUNC_PROTO,
a complete function signature can be obtained.  It will be
used in the later patches to learn the function signature of
a running bpf program.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/btf.h |  18 ++-
 kernel/bpf/btf.c         | 389 +++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 354 insertions(+), 53 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h
index 972265f32871..14f66948fc95 100644
--- a/include/uapi/linux/btf.h
+++ b/include/uapi/linux/btf.h
@@ -40,7 +40,8 @@ struct btf_type {
 	/* "size" is used by INT, ENUM, STRUCT and UNION.
 	 * "size" tells the size of the type it is describing.
 	 *
-	 * "type" is used by PTR, TYPEDEF, VOLATILE, CONST and RESTRICT.
+	 * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT,
+	 * FUNC and FUNC_PROTO.
 	 * "type" is a type_id referring to another type.
 	 */
 	union {
@@ -64,8 +65,10 @@ struct btf_type {
 #define BTF_KIND_VOLATILE	9	/* Volatile	*/
 #define BTF_KIND_CONST		10	/* Const	*/
 #define BTF_KIND_RESTRICT	11	/* Restrict	*/
-#define BTF_KIND_MAX		11
-#define NR_BTF_KINDS		12
+#define BTF_KIND_FUNC		12	/* Function	*/
+#define BTF_KIND_FUNC_PROTO	13	/* Function Proto	*/
+#define BTF_KIND_MAX		13
+#define NR_BTF_KINDS		14
 
 /* For some specific BTF_KIND, "struct btf_type" is immediately
  * followed by extra data.
@@ -110,4 +113,13 @@ struct btf_member {
 	__u32	offset;	/* offset in bits */
 };
 
+/* BTF_KIND_FUNC_PROTO is followed by multiple "struct btf_param".
+ * The exact number of btf_param is stored in the vlen (of the
+ * info in "struct btf_type").
+ */
+struct btf_param {
+	__u32	name_off;
+	__u32	type;
+};
+
 #endif /* _UAPI__LINUX_BTF_H__ */
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 2a50d87de485..6a2be79b73fc 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5,6 +5,7 @@
 #include <uapi/linux/types.h>
 #include <linux/seq_file.h>
 #include <linux/compiler.h>
+#include <linux/ctype.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/anon_inodes.h>
@@ -259,6 +260,8 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
 	[BTF_KIND_VOLATILE]	= "VOLATILE",
 	[BTF_KIND_CONST]	= "CONST",
 	[BTF_KIND_RESTRICT]	= "RESTRICT",
+	[BTF_KIND_FUNC]		= "FUNC",
+	[BTF_KIND_FUNC_PROTO]	= "FUNC_PROTO",
 };
 
 struct btf_kind_operations {
@@ -281,6 +284,9 @@ struct btf_kind_operations {
 static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS];
 static struct btf_type btf_void;
 
+static int btf_resolve(struct btf_verifier_env *env,
+		       const struct btf_type *t, u32 type_id);
+
 static bool btf_type_is_modifier(const struct btf_type *t)
 {
 	/* Some of them is not strictly a C modifier
@@ -314,9 +320,20 @@ static bool btf_type_is_fwd(const struct btf_type *t)
 	return BTF_INFO_KIND(t->info) == BTF_KIND_FWD;
 }
 
+static bool btf_type_is_func(const struct btf_type *t)
+{
+	return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC;
+}
+
+static bool btf_type_is_func_proto(const struct btf_type *t)
+{
+	return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO;
+}
+
 static bool btf_type_nosize(const struct btf_type *t)
 {
-	return btf_type_is_void(t) || btf_type_is_fwd(t);
+	return btf_type_is_void(t) || btf_type_is_fwd(t) ||
+	       btf_type_is_func(t) || btf_type_is_func_proto(t);
 }
 
 static bool btf_type_nosize_or_null(const struct btf_type *t)
@@ -433,6 +450,30 @@ static bool btf_name_offset_valid(const struct btf *btf, u32 offset)
 		offset < btf->hdr.str_len;
 }
 
+/* Only C-style identifier is permitted. This can be relaxed if
+ * necessary.
+ */
+static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
+{
+	/* offset must be valid */
+	const char *src = &btf->strings[offset];
+	const char *src_limit;
+
+	if (!isalpha(*src) && *src != '_')
+		return false;
+
+	/* set a limit on identifier length */
+	src_limit = src + KSYM_NAME_LEN;
+	src++;
+	while (*src && src < src_limit) {
+		if (!isalnum(*src) && *src != '_')
+			return false;
+		src++;
+	}
+
+	return !*src;
+}
+
 static const char *btf_name_by_offset(const struct btf *btf, u32 offset)
 {
 	if (!offset)
@@ -747,11 +788,15 @@ static bool env_type_is_resolve_sink(const struct btf_verifier_env *env,
 		/* int, enum or void is a sink */
 		return !btf_type_needs_resolve(next_type);
 	case RESOLVE_PTR:
-		/* int, enum, void, struct or array is a sink for ptr */
+		/* int, enum, void, struct, array, func or func_proto is a sink
+		 * for ptr
+		 */
 		return !btf_type_is_modifier(next_type) &&
 			!btf_type_is_ptr(next_type);
 	case RESOLVE_STRUCT_OR_ARRAY:
-		/* int, enum, void or ptr is a sink for struct and array */
+		/* int, enum, void, ptr, func or func_proto is a sink
+		 * for struct and array
+		 */
 		return !btf_type_is_modifier(next_type) &&
 			!btf_type_is_array(next_type) &&
 			!btf_type_is_struct(next_type);
@@ -1170,10 +1215,6 @@ static int btf_modifier_resolve(struct btf_verifier_env *env,
 		return -EINVAL;
 	}
 
-	/* "typedef void new_void", "const void"...etc */
-	if (btf_type_is_void(next_type) || btf_type_is_fwd(next_type))
-		goto resolved;
-
 	if (!env_type_is_resolve_sink(env, next_type) &&
 	    !env_type_is_resolved(env, next_type_id))
 		return env_stack_push(env, next_type, next_type_id);
@@ -1184,13 +1225,18 @@ static int btf_modifier_resolve(struct btf_verifier_env *env,
 	 * save us a few type-following when we use it later (e.g. in
 	 * pretty print).
 	 */
-	if (!btf_type_id_size(btf, &next_type_id, &next_type_size) &&
-	    !btf_type_nosize(btf_type_id_resolve(btf, &next_type_id))) {
-		btf_verifier_log_type(env, v->t, "Invalid type_id");
-		return -EINVAL;
+	if (!btf_type_id_size(btf, &next_type_id, &next_type_size)) {
+		if (env_type_is_resolved(env, next_type_id))
+			next_type = btf_type_id_resolve(btf, &next_type_id);
+
+		/* "typedef void new_void", "const void"...etc */
+		if (!btf_type_is_void(next_type) &&
+		    !btf_type_is_fwd(next_type)) {
+			btf_verifier_log_type(env, v->t, "Invalid type_id");
+			return -EINVAL;
+		}
 	}
 
-resolved:
 	env_stack_pop_resolved(env, next_type_id, next_type_size);
 
 	return 0;
@@ -1203,7 +1249,6 @@ static int btf_ptr_resolve(struct btf_verifier_env *env,
 	const struct btf_type *t = v->t;
 	u32 next_type_id = t->type;
 	struct btf *btf = env->btf;
-	u32 next_type_size = 0;
 
 	next_type = btf_type_by_id(btf, next_type_id);
 	if (!next_type) {
@@ -1211,10 +1256,6 @@ static int btf_ptr_resolve(struct btf_verifier_env *env,
 		return -EINVAL;
 	}
 
-	/* "void *" */
-	if (btf_type_is_void(next_type) || btf_type_is_fwd(next_type))
-		goto resolved;
-
 	if (!env_type_is_resolve_sink(env, next_type) &&
 	    !env_type_is_resolved(env, next_type_id))
 		return env_stack_push(env, next_type, next_type_id);
@@ -1241,13 +1282,18 @@ static int btf_ptr_resolve(struct btf_verifier_env *env,
 					      resolved_type_id);
 	}
 
-	if (!btf_type_id_size(btf, &next_type_id, &next_type_size) &&
-	    !btf_type_nosize(btf_type_id_resolve(btf, &next_type_id))) {
-		btf_verifier_log_type(env, v->t, "Invalid type_id");
-		return -EINVAL;
+	if (!btf_type_id_size(btf, &next_type_id, NULL)) {
+		if (env_type_is_resolved(env, next_type_id))
+			next_type = btf_type_id_resolve(btf, &next_type_id);
+
+		if (!btf_type_is_void(next_type) &&
+		    !btf_type_is_fwd(next_type) &&
+		    !btf_type_is_func_proto(next_type)) {
+			btf_verifier_log_type(env, v->t, "Invalid type_id");
+			return -EINVAL;
+		}
 	}
 
-resolved:
 	env_stack_pop_resolved(env, next_type_id, 0);
 
 	return 0;
@@ -1787,6 +1833,232 @@ static struct btf_kind_operations enum_ops = {
 	.seq_show = btf_enum_seq_show,
 };
 
+static s32 btf_func_proto_check_meta(struct btf_verifier_env *env,
+				     const struct btf_type *t,
+				     u32 meta_left)
+{
+	u32 meta_needed = btf_type_vlen(t) * sizeof(struct btf_param);
+
+	if (meta_left < meta_needed) {
+		btf_verifier_log_basic(env, t,
+				       "meta_left:%u meta_needed:%u",
+				       meta_left, meta_needed);
+		return -EINVAL;
+	}
+
+	if (t->name_off) {
+		btf_verifier_log_type(env, t, "Invalid name");
+		return -EINVAL;
+	}
+
+	btf_verifier_log_type(env, t, NULL);
+
+	return meta_needed;
+}
+
+static void btf_func_proto_log(struct btf_verifier_env *env,
+			       const struct btf_type *t)
+{
+	const struct btf_param *args = (const struct btf_param *)(t + 1);
+	u16 nr_args = btf_type_vlen(t), i;
+
+	btf_verifier_log(env, "return=%u args=(", t->type);
+	if (!nr_args) {
+		btf_verifier_log(env, "void");
+		goto done;
+	}
+
+	if (nr_args == 1 && !args[0].type) {
+		/* Only one vararg */
+		btf_verifier_log(env, "vararg");
+		goto done;
+	}
+
+	btf_verifier_log(env, "%u %s", args[0].type,
+			 btf_name_by_offset(env->btf,
+					    args[0].name_off));
+	for (i = 1; i < nr_args - 1; i++)
+		btf_verifier_log(env, ", %u %s", args[i].type,
+				 btf_name_by_offset(env->btf,
+						    args[i].name_off));
+
+	if (nr_args > 1) {
+		const struct btf_param *last_arg = &args[nr_args - 1];
+
+		if (last_arg->type)
+			btf_verifier_log(env, ", %u %s", last_arg->type,
+					 btf_name_by_offset(env->btf,
+							    last_arg->name_off));
+		else
+			btf_verifier_log(env, ", vararg");
+	}
+
+done:
+	btf_verifier_log(env, ")");
+}
+
+static struct btf_kind_operations func_proto_ops = {
+	.check_meta = btf_func_proto_check_meta,
+	.resolve = btf_df_resolve,
+	/*
+	 * BTF_KIND_FUNC_PROTO cannot be directly referred by
+	 * a struct's member.
+	 *
+	 * It should be a funciton pointer instead.
+	 * (i.e. struct's member -> BTF_KIND_PTR -> BTF_KIND_FUNC_PROTO)
+	 *
+	 * Hence, there is no btf_func_check_member().
+	 */
+	.check_member = btf_df_check_member,
+	.log_details = btf_func_proto_log,
+	.seq_show = btf_df_seq_show,
+};
+
+static s32 btf_func_check_meta(struct btf_verifier_env *env,
+			       const struct btf_type *t,
+			       u32 meta_left)
+{
+	if (!t->name_off ||
+	    !btf_name_valid_identifier(env->btf, t->name_off)) {
+		btf_verifier_log_type(env, t, "Invalid name");
+		return -EINVAL;
+	}
+
+	if (btf_type_vlen(t)) {
+		btf_verifier_log_type(env, t, "vlen != 0");
+		return -EINVAL;
+	}
+
+	btf_verifier_log_type(env, t, NULL);
+
+	return 0;
+}
+
+static struct btf_kind_operations func_ops = {
+	.check_meta = btf_func_check_meta,
+	.resolve = btf_df_resolve,
+	.check_member = btf_df_check_member,
+	.log_details = btf_ref_type_log,
+	.seq_show = btf_df_seq_show,
+};
+
+static int btf_func_proto_check(struct btf_verifier_env *env,
+				const struct btf_type *t)
+{
+	const struct btf_type *ret_type;
+	const struct btf_param *args;
+	const struct btf *btf;
+	u16 nr_args, i;
+	int err;
+
+	btf = env->btf;
+	args = (const struct btf_param *)(t + 1);
+	nr_args = btf_type_vlen(t);
+
+	/* Check func return type which could be "void" (t->type == 0) */
+	if (t->type) {
+		u32 ret_type_id = t->type;
+
+		ret_type = btf_type_by_id(btf, ret_type_id);
+		if (!ret_type) {
+			btf_verifier_log_type(env, t, "Invalid return type");
+			return -EINVAL;
+		}
+
+		if (btf_type_needs_resolve(ret_type) &&
+		    !env_type_is_resolved(env, ret_type_id)) {
+			err = btf_resolve(env, ret_type, ret_type_id);
+			if (err)
+				return err;
+		}
+
+		/* Ensure the return type is a type that has a size */
+		if (!btf_type_id_size(btf, &ret_type_id, NULL)) {
+			btf_verifier_log_type(env, t, "Invalid return type");
+			return -EINVAL;
+		}
+	}
+
+	if (!nr_args)
+		return 0;
+
+	/* Last func arg type_id could be 0 if it is a vararg */
+	if (!args[nr_args - 1].type) {
+		if (args[nr_args - 1].name_off) {
+			btf_verifier_log_type(env, t, "Invalid arg#%u",
+					      nr_args);
+			return -EINVAL;
+		}
+		nr_args--;
+	}
+
+	err = 0;
+	for (i = 0; i < nr_args; i++) {
+		const struct btf_type *arg_type;
+		u32 arg_type_id;
+
+		arg_type_id = args[i].type;
+		arg_type = btf_type_by_id(btf, arg_type_id);
+		if (!arg_type) {
+			btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
+			err = -EINVAL;
+			break;
+		}
+
+		if (args[i].name_off &&
+		    (!btf_name_offset_valid(btf, args[i].name_off) ||
+		     !btf_name_valid_identifier(btf, args[i].name_off))) {
+			btf_verifier_log_type(env, t,
+					      "Invalid arg#%u", i + 1);
+			err = -EINVAL;
+			break;
+		}
+
+		if (btf_type_needs_resolve(arg_type) &&
+		    !env_type_is_resolved(env, arg_type_id)) {
+			err = btf_resolve(env, arg_type, arg_type_id);
+			if (err)
+				break;
+		}
+
+		if (!btf_type_id_size(btf, &arg_type_id, NULL)) {
+			btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
+			err = -EINVAL;
+			break;
+		}
+	}
+
+	return err;
+}
+
+static int btf_func_check(struct btf_verifier_env *env,
+			  const struct btf_type *t)
+{
+	const struct btf_type *proto_type;
+	const struct btf_param *args;
+	const struct btf *btf;
+	u16 nr_args, i;
+
+	btf = env->btf;
+	proto_type = btf_type_by_id(btf, t->type);
+
+	if (!proto_type || !btf_type_is_func_proto(proto_type)) {
+		btf_verifier_log_type(env, t, "Invalid type_id");
+		return -EINVAL;
+	}
+
+	args = (const struct btf_param *)(proto_type + 1);
+	nr_args = btf_type_vlen(proto_type);
+	for (i = 0; i < nr_args; i++) {
+		if (!args[i].name_off && args[i].type) {
+			btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
 static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
 	[BTF_KIND_INT] = &int_ops,
 	[BTF_KIND_PTR] = &ptr_ops,
@@ -1799,6 +2071,8 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
 	[BTF_KIND_VOLATILE] = &modifier_ops,
 	[BTF_KIND_CONST] = &modifier_ops,
 	[BTF_KIND_RESTRICT] = &modifier_ops,
+	[BTF_KIND_FUNC] = &func_ops,
+	[BTF_KIND_FUNC_PROTO] = &func_proto_ops,
 };
 
 static s32 btf_check_meta(struct btf_verifier_env *env,
@@ -1870,30 +2144,6 @@ static int btf_check_all_metas(struct btf_verifier_env *env)
 	return 0;
 }
 
-static int btf_resolve(struct btf_verifier_env *env,
-		       const struct btf_type *t, u32 type_id)
-{
-	const struct resolve_vertex *v;
-	int err = 0;
-
-	env->resolve_mode = RESOLVE_TBD;
-	env_stack_push(env, t, type_id);
-	while (!err && (v = env_stack_peak(env))) {
-		env->log_type_id = v->type_id;
-		err = btf_type_ops(v->t)->resolve(env, v);
-	}
-
-	env->log_type_id = type_id;
-	if (err == -E2BIG)
-		btf_verifier_log_type(env, t,
-				      "Exceeded max resolving depth:%u",
-				      MAX_RESOLVE_DEPTH);
-	else if (err == -EEXIST)
-		btf_verifier_log_type(env, t, "Loop detected");
-
-	return err;
-}
-
 static bool btf_resolve_valid(struct btf_verifier_env *env,
 			      const struct btf_type *t,
 			      u32 type_id)
@@ -1927,6 +2177,39 @@ static bool btf_resolve_valid(struct btf_verifier_env *env,
 	return false;
 }
 
+static int btf_resolve(struct btf_verifier_env *env,
+		       const struct btf_type *t, u32 type_id)
+{
+	u32 save_log_type_id = env->log_type_id;
+	const struct resolve_vertex *v;
+	int err = 0;
+
+	env->resolve_mode = RESOLVE_TBD;
+	env_stack_push(env, t, type_id);
+	while (!err && (v = env_stack_peak(env))) {
+		env->log_type_id = v->type_id;
+		err = btf_type_ops(v->t)->resolve(env, v);
+	}
+
+	env->log_type_id = type_id;
+	if (err == -E2BIG) {
+		btf_verifier_log_type(env, t,
+				      "Exceeded max resolving depth:%u",
+				      MAX_RESOLVE_DEPTH);
+	} else if (err == -EEXIST) {
+		btf_verifier_log_type(env, t, "Loop detected");
+	}
+
+	/* Final sanity check */
+	if (!err && !btf_resolve_valid(env, t, type_id)) {
+		btf_verifier_log_type(env, t, "Invalid resolve state");
+		err = -EINVAL;
+	}
+
+	env->log_type_id = save_log_type_id;
+	return err;
+}
+
 static int btf_check_all_types(struct btf_verifier_env *env)
 {
 	struct btf *btf = env->btf;
@@ -1949,10 +2232,16 @@ static int btf_check_all_types(struct btf_verifier_env *env)
 				return err;
 		}
 
-		if (btf_type_needs_resolve(t) &&
-		    !btf_resolve_valid(env, t, type_id)) {
-			btf_verifier_log_type(env, t, "Invalid resolve state");
-			return -EINVAL;
+		if (btf_type_is_func_proto(t)) {
+			err = btf_func_proto_check(env, t);
+			if (err)
+				return err;
+		}
+
+		if (btf_type_is_func(t)) {
+			err = btf_func_check(env, t);
+			if (err)
+				return err;
 		}
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 838e96904ff3fc6c30e5ebbc611474669856e3c0 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Mon, 19 Nov 2018 15:29:11 -0800
Subject: bpf: Introduce bpf_func_info

This patch added interface to load a program with the following
additional information:
   . prog_btf_fd
   . func_info, func_info_rec_size and func_info_cnt
where func_info will provide function range and type_id
corresponding to each function.

The func_info_rec_size is introduced in the UAPI to specify
struct bpf_func_info size passed from user space. This
intends to make bpf_func_info structure growable in the future.
If the kernel gets a different bpf_func_info size from userspace,
it will try to handle user request with part of bpf_func_info
it can understand. In this patch, kernel can understand
  struct bpf_func_info {
       __u32   insn_offset;
       __u32   type_id;
  };
If user passed a bpf func_info record size of 16 bytes, the
kernel can still handle part of records with the above definition.

If verifier agrees with function range provided by the user,
the bpf_prog ksym for each function will use the func name
provided in the type_id, which is supposed to provide better
encoding as it is not limited by 16 bytes program name
limitation and this is better for bpf program which contains
multiple subprograms.

The bpf_prog_info interface is also extended to
return btf_id, func_info, func_info_rec_size and func_info_cnt
to userspace, so userspace can print out the function prototype
for each xlated function. The insn_offset in the returned
func_info corresponds to the insn offset for xlated functions.
With other jit related fields in bpf_prog_info, userspace can also
print out function prototypes for each jited function.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h          |   5 +-
 include/linux/bpf_verifier.h |   1 +
 include/linux/btf.h          |   2 +
 include/uapi/linux/bpf.h     |  13 +++++
 kernel/bpf/btf.c             |   4 +-
 kernel/bpf/core.c            |  13 +++++
 kernel/bpf/syscall.c         |  59 +++++++++++++++++++--
 kernel/bpf/verifier.c        | 120 ++++++++++++++++++++++++++++++++++++++++++-
 8 files changed, 209 insertions(+), 8 deletions(-)

(limited to 'include/uapi')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 987815152629..7f0e225bf630 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -316,6 +316,8 @@ struct bpf_prog_aux {
 	void *security;
 #endif
 	struct bpf_prog_offload *offload;
+	struct btf *btf;
+	u32 type_id; /* type id for this prog/func */
 	union {
 		struct work_struct work;
 		struct rcu_head	rcu;
@@ -527,7 +529,8 @@ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size)
 }
 
 /* verify correctness of eBPF program */
-int bpf_check(struct bpf_prog **fp, union bpf_attr *attr);
+int bpf_check(struct bpf_prog **fp, union bpf_attr *attr,
+	      union bpf_attr __user *uattr);
 void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth);
 
 /* Map specifics */
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 11f5df1092d9..204382f46fd8 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -204,6 +204,7 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
 struct bpf_subprog_info {
 	u32 start; /* insn idx of function entry point */
 	u16 stack_depth; /* max. stack depth used by this function */
+	u32 type_id; /* btf type_id for this subprog */
 };
 
 /* single container for all structs
diff --git a/include/linux/btf.h b/include/linux/btf.h
index e076c4697049..7f2c0a4a45ea 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -46,5 +46,7 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
 		       struct seq_file *m);
 int btf_get_fd_by_id(u32 id);
 u32 btf_id(const struct btf *btf);
+const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id);
+const char *btf_name_by_offset(const struct btf *btf, u32 offset);
 
 #endif
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 05d95290b848..c1554aa07465 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -338,6 +338,10 @@ union bpf_attr {
 		 * (context accesses, allowed helpers, etc).
 		 */
 		__u32		expected_attach_type;
+		__u32		prog_btf_fd;	/* fd pointing to BTF type data */
+		__u32		func_info_rec_size;	/* userspace bpf_func_info size */
+		__aligned_u64	func_info;	/* func info */
+		__u32		func_info_cnt;	/* number of bpf_func_info records */
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -2638,6 +2642,10 @@ struct bpf_prog_info {
 	__u32 nr_jited_func_lens;
 	__aligned_u64 jited_ksyms;
 	__aligned_u64 jited_func_lens;
+	__u32 btf_id;
+	__u32 func_info_rec_size;
+	__aligned_u64 func_info;
+	__u32 func_info_cnt;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
@@ -2949,4 +2957,9 @@ struct bpf_flow_keys {
 	};
 };
 
+struct bpf_func_info {
+	__u32	insn_offset;
+	__u32	type_id;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 6a2be79b73fc..69da9169819a 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -474,7 +474,7 @@ static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
 	return !*src;
 }
 
-static const char *btf_name_by_offset(const struct btf *btf, u32 offset)
+const char *btf_name_by_offset(const struct btf *btf, u32 offset)
 {
 	if (!offset)
 		return "(anon)";
@@ -484,7 +484,7 @@ static const char *btf_name_by_offset(const struct btf *btf, u32 offset)
 		return "(invalid-name-offset)";
 }
 
-static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
+const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
 {
 	if (type_id > btf->nr_types)
 		return NULL;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 1a796e0799ec..16d77012ad3e 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -21,12 +21,14 @@
  * Kris Katterjohn - Added many additional checks in bpf_check_classic()
  */
 
+#include <uapi/linux/btf.h>
 #include <linux/filter.h>
 #include <linux/skbuff.h>
 #include <linux/vmalloc.h>
 #include <linux/random.h>
 #include <linux/moduleloader.h>
 #include <linux/bpf.h>
+#include <linux/btf.h>
 #include <linux/frame.h>
 #include <linux/rbtree_latch.h>
 #include <linux/kallsyms.h>
@@ -390,6 +392,8 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog,
 static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
 {
 	const char *end = sym + KSYM_NAME_LEN;
+	const struct btf_type *type;
+	const char *func_name;
 
 	BUILD_BUG_ON(sizeof("bpf_prog_") +
 		     sizeof(prog->tag) * 2 +
@@ -404,6 +408,15 @@ static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
 
 	sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");
 	sym  = bin2hex(sym, prog->tag, sizeof(prog->tag));
+
+	/* prog->aux->name will be ignored if full btf name is available */
+	if (prog->aux->btf) {
+		type = btf_type_by_id(prog->aux->btf, prog->aux->type_id);
+		func_name = btf_name_by_offset(prog->aux->btf, type->name_off);
+		snprintf(sym, (size_t)(end - sym), "_%s", func_name);
+		return;
+	}
+
 	if (prog->aux->name[0])
 		snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name);
 	else
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cf5040fd5434..998377808102 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1213,6 +1213,7 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
 		/* bpf_prog_free_id() must be called first */
 		bpf_prog_free_id(prog, do_idr_lock);
 		bpf_prog_kallsyms_del_all(prog);
+		btf_put(prog->aux->btf);
 
 		call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
 	}
@@ -1437,9 +1438,9 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
 }
 
 /* last field in 'union bpf_attr' used by this command */
-#define	BPF_PROG_LOAD_LAST_FIELD expected_attach_type
+#define	BPF_PROG_LOAD_LAST_FIELD func_info_cnt
 
-static int bpf_prog_load(union bpf_attr *attr)
+static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 {
 	enum bpf_prog_type type = attr->prog_type;
 	struct bpf_prog *prog;
@@ -1525,7 +1526,7 @@ static int bpf_prog_load(union bpf_attr *attr)
 		goto free_prog;
 
 	/* run eBPF verifier */
-	err = bpf_check(&prog, attr);
+	err = bpf_check(&prog, attr, uattr);
 	if (err < 0)
 		goto free_used_maps;
 
@@ -2079,6 +2080,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 		info.xlated_prog_len = 0;
 		info.nr_jited_ksyms = 0;
 		info.nr_jited_func_lens = 0;
+		info.func_info_cnt = 0;
 		goto done;
 	}
 
@@ -2216,6 +2218,55 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 		}
 	}
 
+	if (prog->aux->btf) {
+		u32 ucnt, urec_size;
+
+		info.btf_id = btf_id(prog->aux->btf);
+
+		ucnt = info.func_info_cnt;
+		info.func_info_cnt = prog->aux->func_cnt ? : 1;
+		urec_size = info.func_info_rec_size;
+		info.func_info_rec_size = sizeof(struct bpf_func_info);
+		if (ucnt) {
+			/* expect passed-in urec_size is what the kernel expects */
+			if (urec_size != info.func_info_rec_size)
+				return -EINVAL;
+
+			if (bpf_dump_raw_ok()) {
+				struct bpf_func_info kern_finfo;
+				char __user *user_finfo;
+				u32 i, insn_offset;
+
+				user_finfo = u64_to_user_ptr(info.func_info);
+				if (prog->aux->func_cnt) {
+					ucnt = min_t(u32, info.func_info_cnt, ucnt);
+					insn_offset = 0;
+					for (i = 0; i < ucnt; i++) {
+						kern_finfo.insn_offset = insn_offset;
+						kern_finfo.type_id = prog->aux->func[i]->aux->type_id;
+						if (copy_to_user(user_finfo, &kern_finfo,
+								 sizeof(kern_finfo)))
+							return -EFAULT;
+
+						/* func[i]->len holds the prog len */
+						insn_offset += prog->aux->func[i]->len;
+						user_finfo += urec_size;
+					}
+				} else {
+					kern_finfo.insn_offset = 0;
+					kern_finfo.type_id = prog->aux->type_id;
+					if (copy_to_user(user_finfo, &kern_finfo,
+							 sizeof(kern_finfo)))
+						return -EFAULT;
+				}
+			} else {
+				info.func_info_cnt = 0;
+			}
+		}
+	} else {
+		info.func_info_cnt = 0;
+	}
+
 done:
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
@@ -2501,7 +2552,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 		err = map_get_next_key(&attr);
 		break;
 	case BPF_PROG_LOAD:
-		err = bpf_prog_load(&attr);
+		err = bpf_prog_load(&attr, uattr);
 		break;
 	case BPF_OBJ_PIN:
 		err = bpf_obj_pin(&attr);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b5222aa61d54..f102c4fd0c5a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -11,10 +11,12 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  * General Public License for more details.
  */
+#include <uapi/linux/btf.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/bpf.h>
+#include <linux/btf.h>
 #include <linux/bpf_verifier.h>
 #include <linux/filter.h>
 #include <net/netlink.h>
@@ -4639,6 +4641,114 @@ err_free:
 	return ret;
 }
 
+/* The minimum supported BTF func info size */
+#define MIN_BPF_FUNCINFO_SIZE	8
+#define MAX_FUNCINFO_REC_SIZE	252
+
+static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env,
+			  union bpf_attr *attr, union bpf_attr __user *uattr)
+{
+	u32 i, nfuncs, urec_size, min_size, prev_offset;
+	u32 krec_size = sizeof(struct bpf_func_info);
+	struct bpf_func_info krecord = {};
+	const struct btf_type *type;
+	void __user *urecord;
+	struct btf *btf;
+	int ret = 0;
+
+	nfuncs = attr->func_info_cnt;
+	if (!nfuncs)
+		return 0;
+
+	if (nfuncs != env->subprog_cnt) {
+		verbose(env, "number of funcs in func_info doesn't match number of subprogs\n");
+		return -EINVAL;
+	}
+
+	urec_size = attr->func_info_rec_size;
+	if (urec_size < MIN_BPF_FUNCINFO_SIZE ||
+	    urec_size > MAX_FUNCINFO_REC_SIZE ||
+	    urec_size % sizeof(u32)) {
+		verbose(env, "invalid func info rec size %u\n", urec_size);
+		return -EINVAL;
+	}
+
+	btf = btf_get_by_fd(attr->prog_btf_fd);
+	if (IS_ERR(btf)) {
+		verbose(env, "unable to get btf from fd\n");
+		return PTR_ERR(btf);
+	}
+
+	urecord = u64_to_user_ptr(attr->func_info);
+	min_size = min_t(u32, krec_size, urec_size);
+
+	for (i = 0; i < nfuncs; i++) {
+		ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size);
+		if (ret) {
+			if (ret == -E2BIG) {
+				verbose(env, "nonzero tailing record in func info");
+				/* set the size kernel expects so loader can zero
+				 * out the rest of the record.
+				 */
+				if (put_user(min_size, &uattr->func_info_rec_size))
+					ret = -EFAULT;
+			}
+			goto free_btf;
+		}
+
+		if (copy_from_user(&krecord, urecord, min_size)) {
+			ret = -EFAULT;
+			goto free_btf;
+		}
+
+		/* check insn_offset */
+		if (i == 0) {
+			if (krecord.insn_offset) {
+				verbose(env,
+					"nonzero insn_offset %u for the first func info record",
+					krecord.insn_offset);
+				ret = -EINVAL;
+				goto free_btf;
+			}
+		} else if (krecord.insn_offset <= prev_offset) {
+			verbose(env,
+				"same or smaller insn offset (%u) than previous func info record (%u)",
+				krecord.insn_offset, prev_offset);
+			ret = -EINVAL;
+			goto free_btf;
+		}
+
+		if (env->subprog_info[i].start != krecord.insn_offset) {
+			verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n");
+			ret = -EINVAL;
+			goto free_btf;
+		}
+
+		/* check type_id */
+		type = btf_type_by_id(btf, krecord.type_id);
+		if (!type || BTF_INFO_KIND(type->info) != BTF_KIND_FUNC) {
+			verbose(env, "invalid type id %d in func info",
+				krecord.type_id);
+			ret = -EINVAL;
+			goto free_btf;
+		}
+
+		if (i == 0)
+			prog->aux->type_id = krecord.type_id;
+		env->subprog_info[i].type_id = krecord.type_id;
+
+		prev_offset = krecord.insn_offset;
+		urecord += urec_size;
+	}
+
+	prog->aux->btf = btf;
+	return 0;
+
+free_btf:
+	btf_put(btf);
+	return ret;
+}
+
 /* check %cur's range satisfies %old's */
 static bool range_within(struct bpf_reg_state *old,
 			 struct bpf_reg_state *cur)
@@ -5939,6 +6049,9 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		func[i]->aux->name[0] = 'F';
 		func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
 		func[i]->jit_requested = 1;
+		/* the btf will be freed only at prog->aux */
+		func[i]->aux->btf = prog->aux->btf;
+		func[i]->aux->type_id = env->subprog_info[i].type_id;
 		func[i] = bpf_int_jit_compile(func[i]);
 		if (!func[i]->jited) {
 			err = -ENOTSUPP;
@@ -6325,7 +6438,8 @@ static void free_states(struct bpf_verifier_env *env)
 	kfree(env->explored_states);
 }
 
-int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
+int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
+	      union bpf_attr __user *uattr)
 {
 	struct bpf_verifier_env *env;
 	struct bpf_verifier_log *log;
@@ -6397,6 +6511,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 	if (ret < 0)
 		goto skip_full_check;
 
+	ret = check_btf_func(env->prog, env, attr, uattr);
+	if (ret < 0)
+		goto skip_full_check;
+
 	ret = do_check(env);
 	if (env->cur_state) {
 		free_verifier_state(env->cur_state, true);
-- 
cgit v1.2.3-59-g8ed1b


From 3cd640832894b85b5929d5bda74505452c800421 Mon Sep 17 00:00:00 2001
From: Paul Burton <paul.burton@mips.com>
Date: Tue, 20 Nov 2018 20:41:05 +0000
Subject: MIPS: ptrace: introduce NT_MIPS_MSA regset

The current methods for obtaining FP context via ptrace only provide
either 32 or 64 bits per data register. With MSA, where vector registers
are aliased with scalar FP data registers, those registers are 128 bits
wide. Thus a new mechanism is required for userland to access those
registers via ptrace. This patch introduces an NT_MIPS_MSA regset which
provides, in this order:

  - The full 128 bits value of each vector register, in native
    endianness saved as though elements are doubles. That is, the format
    of each vector register is as would be obtained by saving it to
    memory using an st.d instruction.

  - The 32 bit scalar FP implementation register (FIR).

  - The 32 bit scalar FP control & status register (FCSR).

  - The 32 bit MSA implementation register (MSAIR).

  - The 32 bit MSA control & status register (MSACSR).

The provision of the FIR & FCSR registers in addition to the MSA
equivalents allows scalar FP context to be retrieved as a subset of
the context available via this regset. Along with the MSA equivalents
they also nicely form the final 128 bit "register" of the regset.

Signed-off-by: Paul Burton <paul.burton@mips.com>
Patchwork: https://patchwork.linux-mips.org/patch/21180/
Cc: linux-mips@linux-mips.org
---
 arch/mips/kernel/ptrace.c | 147 ++++++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/elf.h  |   1 +
 2 files changed, 148 insertions(+)

(limited to 'include/uapi')

diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index d7d032f2b656..ea54575255ea 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -622,6 +622,130 @@ static int fp_mode_set(struct task_struct *target,
 
 #endif /* CONFIG_MIPS_FP_SUPPORT */
 
+#ifdef CONFIG_CPU_HAS_MSA
+
+struct msa_control_regs {
+	unsigned int fir;
+	unsigned int fcsr;
+	unsigned int msair;
+	unsigned int msacsr;
+};
+
+static int copy_pad_fprs(struct task_struct *target,
+			 const struct user_regset *regset,
+			 unsigned int *ppos, unsigned int *pcount,
+			 void **pkbuf, void __user **pubuf,
+			 unsigned int live_sz)
+{
+	int i, j, start, start_pad, err;
+	unsigned long long fill = ~0ull;
+	unsigned int cp_sz, pad_sz;
+
+	cp_sz = min(regset->size, live_sz);
+	pad_sz = regset->size - cp_sz;
+	WARN_ON(pad_sz % sizeof(fill));
+
+	i = start = err = 0;
+	for (; i < NUM_FPU_REGS; i++, start += regset->size) {
+		err |= user_regset_copyout(ppos, pcount, pkbuf, pubuf,
+					   &target->thread.fpu.fpr[i],
+					   start, start + cp_sz);
+
+		start_pad = start + cp_sz;
+		for (j = 0; j < (pad_sz / sizeof(fill)); j++) {
+			err |= user_regset_copyout(ppos, pcount, pkbuf, pubuf,
+						   &fill, start_pad,
+						   start_pad + sizeof(fill));
+			start_pad += sizeof(fill);
+		}
+	}
+
+	return err;
+}
+
+static int msa_get(struct task_struct *target,
+		   const struct user_regset *regset,
+		   unsigned int pos, unsigned int count,
+		   void *kbuf, void __user *ubuf)
+{
+	const unsigned int wr_size = NUM_FPU_REGS * regset->size;
+	const struct msa_control_regs ctrl_regs = {
+		.fir = boot_cpu_data.fpu_id,
+		.fcsr = target->thread.fpu.fcr31,
+		.msair = boot_cpu_data.msa_id,
+		.msacsr = target->thread.fpu.msacsr,
+	};
+	int err;
+
+	if (!tsk_used_math(target)) {
+		/* The task hasn't used FP or MSA, fill with 0xff */
+		err = copy_pad_fprs(target, regset, &pos, &count,
+				    &kbuf, &ubuf, 0);
+	} else if (!test_tsk_thread_flag(target, TIF_MSA_CTX_LIVE)) {
+		/* Copy scalar FP context, fill the rest with 0xff */
+		err = copy_pad_fprs(target, regset, &pos, &count,
+				    &kbuf, &ubuf, 8);
+	} else if (sizeof(target->thread.fpu.fpr[0]) == regset->size) {
+		/* Trivially copy the vector registers */
+		err = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+					  &target->thread.fpu.fpr,
+					  0, wr_size);
+	} else {
+		/* Copy as much context as possible, fill the rest with 0xff */
+		err = copy_pad_fprs(target, regset, &pos, &count,
+				    &kbuf, &ubuf,
+				    sizeof(target->thread.fpu.fpr[0]));
+	}
+
+	err |= user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				   &ctrl_regs, wr_size,
+				   wr_size + sizeof(ctrl_regs));
+	return err;
+}
+
+static int msa_set(struct task_struct *target,
+		   const struct user_regset *regset,
+		   unsigned int pos, unsigned int count,
+		   const void *kbuf, const void __user *ubuf)
+{
+	const unsigned int wr_size = NUM_FPU_REGS * regset->size;
+	struct msa_control_regs ctrl_regs;
+	unsigned int cp_sz;
+	int i, err, start;
+
+	init_fp_ctx(target);
+
+	if (sizeof(target->thread.fpu.fpr[0]) == regset->size) {
+		/* Trivially copy the vector registers */
+		err = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+					 &target->thread.fpu.fpr,
+					 0, wr_size);
+	} else {
+		/* Copy as much context as possible */
+		cp_sz = min_t(unsigned int, regset->size,
+			      sizeof(target->thread.fpu.fpr[0]));
+
+		i = start = err = 0;
+		for (; i < NUM_FPU_REGS; i++, start += regset->size) {
+			err |= user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+						  &target->thread.fpu.fpr[i],
+						  start, start + cp_sz);
+		}
+	}
+
+	if (!err)
+		err = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ctrl_regs,
+					 wr_size, wr_size + sizeof(ctrl_regs));
+	if (!err) {
+		target->thread.fpu.fcr31 = ctrl_regs.fcsr & ~FPU_CSR_ALL_X;
+		target->thread.fpu.msacsr = ctrl_regs.msacsr & ~MSA_CSR_CAUSEF;
+	}
+
+	return err;
+}
+
+#endif /* CONFIG_CPU_HAS_MSA */
+
 #if defined(CONFIG_32BIT) || defined(CONFIG_MIPS32_O32)
 
 /*
@@ -798,6 +922,9 @@ enum mips_regset {
 	REGSET_FPR,
 	REGSET_FP_MODE,
 #endif
+#ifdef CONFIG_CPU_HAS_MSA
+	REGSET_MSA,
+#endif
 };
 
 struct pt_regs_offset {
@@ -922,6 +1049,16 @@ static const struct user_regset mips_regsets[] = {
 		.set		= fp_mode_set,
 	},
 #endif
+#ifdef CONFIG_CPU_HAS_MSA
+	[REGSET_MSA] = {
+		.core_note_type	= NT_MIPS_MSA,
+		.n		= NUM_FPU_REGS + 1,
+		.size		= 16,
+		.align		= 16,
+		.get		= msa_get,
+		.set		= msa_set,
+	},
+#endif
 };
 
 static const struct user_regset_view user_mips_view = {
@@ -972,6 +1109,16 @@ static const struct user_regset mips64_regsets[] = {
 		.set		= fpr_set,
 	},
 #endif
+#ifdef CONFIG_CPU_HAS_MSA
+	[REGSET_MSA] = {
+		.core_note_type	= NT_MIPS_MSA,
+		.n		= NUM_FPU_REGS + 1,
+		.size		= 16,
+		.align		= 16,
+		.get		= msa_get,
+		.set		= msa_set,
+	},
+#endif
 };
 
 static const struct user_regset_view user_mips64_view = {
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index c5358e0ae7c5..d1b093f931e3 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -424,6 +424,7 @@ typedef struct elf64_shdr {
 #define NT_VMCOREDD	0x700		/* Vmcore Device Dump Note */
 #define NT_MIPS_DSP	0x800		/* MIPS DSP ASE registers */
 #define NT_MIPS_FP_MODE	0x801		/* MIPS floating-point mode */
+#define NT_MIPS_MSA	0x802		/* MIPS SIMD registers */
 
 /* Note header in a PT_NOTE section */
 typedef struct elf32_note {
-- 
cgit v1.2.3-59-g8ed1b


From 610c0c2b2813c36dc16838bbdbba4c29f8680dde Mon Sep 17 00:00:00 2001
From: Gerd Hoffmann <kraxel@redhat.com>
Date: Tue, 30 Oct 2018 07:32:05 +0100
Subject: virtio-gpu: add VIRTIO_GPU_F_EDID feature

The feature allows the guest request an EDID blob (describing monitor
capabilities) for a given scanout (aka virtual monitor connector).

It brings a new command message, which has just a scanout field (beside
the standard virtio-gpu header) and a response message which carries the
EDID data.

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Reviewed-by: Dave Airlie <airlied@redhat.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20181030063206.19528-2-kraxel@redhat.com
---
 include/uapi/linux/virtio_gpu.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/virtio_gpu.h b/include/uapi/linux/virtio_gpu.h
index f43c3c6171ff..8e88eba1fa7a 100644
--- a/include/uapi/linux/virtio_gpu.h
+++ b/include/uapi/linux/virtio_gpu.h
@@ -41,6 +41,7 @@
 #include <linux/types.h>
 
 #define VIRTIO_GPU_F_VIRGL 0
+#define VIRTIO_GPU_F_EDID  1
 
 enum virtio_gpu_ctrl_type {
 	VIRTIO_GPU_UNDEFINED = 0,
@@ -56,6 +57,7 @@ enum virtio_gpu_ctrl_type {
 	VIRTIO_GPU_CMD_RESOURCE_DETACH_BACKING,
 	VIRTIO_GPU_CMD_GET_CAPSET_INFO,
 	VIRTIO_GPU_CMD_GET_CAPSET,
+	VIRTIO_GPU_CMD_GET_EDID,
 
 	/* 3d commands */
 	VIRTIO_GPU_CMD_CTX_CREATE = 0x0200,
@@ -76,6 +78,7 @@ enum virtio_gpu_ctrl_type {
 	VIRTIO_GPU_RESP_OK_DISPLAY_INFO,
 	VIRTIO_GPU_RESP_OK_CAPSET_INFO,
 	VIRTIO_GPU_RESP_OK_CAPSET,
+	VIRTIO_GPU_RESP_OK_EDID,
 
 	/* error responses */
 	VIRTIO_GPU_RESP_ERR_UNSPEC = 0x1200,
@@ -291,6 +294,21 @@ struct virtio_gpu_resp_capset {
 	__u8 capset_data[];
 };
 
+/* VIRTIO_GPU_CMD_GET_EDID */
+struct virtio_gpu_cmd_get_edid {
+	struct virtio_gpu_ctrl_hdr hdr;
+	__le32 scanout;
+	__le32 padding;
+};
+
+/* VIRTIO_GPU_RESP_OK_EDID */
+struct virtio_gpu_resp_edid {
+	struct virtio_gpu_ctrl_hdr hdr;
+	__le32 size;
+	__le32 padding;
+	__u8 edid[1024];
+};
+
 #define VIRTIO_GPU_EVENT_DISPLAY (1 << 0)
 
 struct virtio_gpu_config {
-- 
cgit v1.2.3-59-g8ed1b


From f11216b24219ab26d8d159fbfa12dff886b16e32 Mon Sep 17 00:00:00 2001
From: Vlad Dumitrescu <vladum@google.com>
Date: Thu, 22 Nov 2018 14:39:16 -0500
Subject: bpf: add skb->tstamp r/w access from tc clsact and cg skb progs

This could be used to rate limit egress traffic in concert with a qdisc
which supports Earliest Departure Time, such as FQ.

Write access from cg skb progs only with CAP_SYS_ADMIN, since the value
will be used by downstream qdiscs. It might make sense to relax this.

Changes v1 -> v2:
  - allow access from cg skb, write only with CAP_SYS_ADMIN

Signed-off-by: Vlad Dumitrescu <vladum@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h                    |  1 +
 net/core/filter.c                           | 29 +++++++++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h              |  1 +
 tools/testing/selftests/bpf/test_verifier.c | 29 +++++++++++++++++++++++++++++
 4 files changed, 60 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c1554aa07465..23e2031a43d4 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2468,6 +2468,7 @@ struct __sk_buff {
 
 	__u32 data_meta;
 	struct bpf_flow_keys *flow_keys;
+	__u64 tstamp;
 };
 
 struct bpf_tunnel_key {
diff --git a/net/core/filter.c b/net/core/filter.c
index f6ca38a7d433..65dc13aeca7c 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5573,6 +5573,10 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
 		if (size != sizeof(struct bpf_flow_keys *))
 			return false;
 		break;
+	case bpf_ctx_range(struct __sk_buff, tstamp):
+		if (size != sizeof(__u64))
+			return false;
+		break;
 	default:
 		/* Only narrow read access allowed for now. */
 		if (type == BPF_WRITE) {
@@ -5600,6 +5604,7 @@ static bool sk_filter_is_valid_access(int off, int size,
 	case bpf_ctx_range(struct __sk_buff, data_end):
 	case bpf_ctx_range(struct __sk_buff, flow_keys):
 	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
+	case bpf_ctx_range(struct __sk_buff, tstamp):
 		return false;
 	}
 
@@ -5638,6 +5643,10 @@ static bool cg_skb_is_valid_access(int off, int size,
 		case bpf_ctx_range(struct __sk_buff, priority):
 		case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
 			break;
+		case bpf_ctx_range(struct __sk_buff, tstamp):
+			if (!capable(CAP_SYS_ADMIN))
+				return false;
+			break;
 		default:
 			return false;
 		}
@@ -5665,6 +5674,7 @@ static bool lwt_is_valid_access(int off, int size,
 	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
 	case bpf_ctx_range(struct __sk_buff, data_meta):
 	case bpf_ctx_range(struct __sk_buff, flow_keys):
+	case bpf_ctx_range(struct __sk_buff, tstamp):
 		return false;
 	}
 
@@ -5874,6 +5884,7 @@ static bool tc_cls_act_is_valid_access(int off, int size,
 		case bpf_ctx_range(struct __sk_buff, priority):
 		case bpf_ctx_range(struct __sk_buff, tc_classid):
 		case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
+		case bpf_ctx_range(struct __sk_buff, tstamp):
 			break;
 		default:
 			return false;
@@ -6093,6 +6104,7 @@ static bool sk_skb_is_valid_access(int off, int size,
 	case bpf_ctx_range(struct __sk_buff, tc_classid):
 	case bpf_ctx_range(struct __sk_buff, data_meta):
 	case bpf_ctx_range(struct __sk_buff, flow_keys):
+	case bpf_ctx_range(struct __sk_buff, tstamp):
 		return false;
 	}
 
@@ -6179,6 +6191,7 @@ static bool flow_dissector_is_valid_access(int off, int size,
 	case bpf_ctx_range(struct __sk_buff, tc_classid):
 	case bpf_ctx_range(struct __sk_buff, data_meta):
 	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
+	case bpf_ctx_range(struct __sk_buff, tstamp):
 		return false;
 	}
 
@@ -6488,6 +6501,22 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
 		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
 				      si->src_reg, off);
 		break;
+
+	case offsetof(struct __sk_buff, tstamp):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tstamp) != 8);
+
+		if (type == BPF_WRITE)
+			*insn++ = BPF_STX_MEM(BPF_DW,
+					      si->dst_reg, si->src_reg,
+					      bpf_target_off(struct sk_buff,
+							     tstamp, 8,
+							     target_size));
+		else
+			*insn++ = BPF_LDX_MEM(BPF_DW,
+					      si->dst_reg, si->src_reg,
+					      bpf_target_off(struct sk_buff,
+							     tstamp, 8,
+							     target_size));
 	}
 
 	return insn - insn_buf;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index c1554aa07465..23e2031a43d4 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2468,6 +2468,7 @@ struct __sk_buff {
 
 	__u32 data_meta;
 	struct bpf_flow_keys *flow_keys;
+	__u64 tstamp;
 };
 
 struct bpf_tunnel_key {
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 54d16fbdef8b..537a8f91af02 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -2446,6 +2446,10 @@ static struct bpf_test tests[] = {
 				    offsetof(struct __sk_buff, tc_index)),
 			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
 				    offsetof(struct __sk_buff, cb[3])),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, tstamp)),
+			BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, tstamp)),
 			BPF_EXIT_INSN(),
 		},
 		.errstr_unpriv = "",
@@ -5297,6 +5301,31 @@ static struct bpf_test tests[] = {
 		.errstr_unpriv = "R2 leaks addr into helper function",
 		.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
 	},
+	{
+		"write tstamp from CGROUP_SKB",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, tstamp)),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.result_unpriv = REJECT,
+		.errstr_unpriv = "invalid bpf_context access off=152 size=8",
+		.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+	},
+	{
+		"read tstamp from CGROUP_SKB",
+		.insns = {
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, tstamp)),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+	},
 	{
 		"multiple registers share map_lookup_elem result",
 		.insns = {
-- 
cgit v1.2.3-59-g8ed1b


From d644cca50f366cd109845ae92e37c09ed79adf81 Mon Sep 17 00:00:00 2001
From: John Sheu <sheu@chromium.org>
Date: Thu, 15 Nov 2018 10:57:16 -0500
Subject: media: vb2: Allow reqbufs(0) with "in use" MMAP buffers

Videobuf2 presently does not allow VIDIOC_REQBUFS to destroy outstanding
buffers if the queue is of type V4L2_MEMORY_MMAP, and if the buffers are
considered "in use".  This is different behavior than for other memory
types and prevents us from deallocating buffers in following two cases:

1) There are outstanding mmap()ed views on the buffer. However even if
   we put the buffer in reqbufs(0), there will be remaining references,
   due to vma .open/close() adjusting vb2 buffer refcount appropriately.
   This means that the buffer will be in fact freed only when the last
   mmap()ed view is unmapped.

2) Buffer has been exported as a DMABUF. Refcount of the vb2 buffer
   is managed properly by VB2 DMABUF ops, i.e. incremented on DMABUF
   get and decremented on DMABUF release. This means that the buffer
   will be alive until all importers release it.

Considering both cases above, there does not seem to be any need to
prevent reqbufs(0) operation, because buffer lifetime is already
properly managed by both mmap() and DMABUF code paths. Let's remove it
and allow userspace freeing the queue (and potentially allocating a new
one) even though old buffers might be still in processing.

To let userspace know that the kernel now supports orphaning buffers
that are still in use, add a new V4L2_BUF_CAP_SUPPORTS_ORPHANED_BUFS
to be set by reqbufs and create_bufs.

[p.zabel@pengutronix.de: added V4L2_BUF_CAP_SUPPORTS_ORPHANED_BUFS,
 updated documentation, and added back debug message]

Signed-off-by: John Sheu <sheu@chromium.org>
Reviewed-by: Pawel Osciak <posciak@chromium.org>
Signed-off-by: Tomasz Figa <tfiga@chromium.org>
Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
[hverkuil-cisco@xs4all.nl: added V4L2-BUF-CAP-SUPPORTS-ORPHANED-BUFS ref]
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 Documentation/media/uapi/v4l/vidioc-reqbufs.rst | 17 ++++++++++++++---
 drivers/media/common/videobuf2/videobuf2-core.c |  8 +++-----
 drivers/media/common/videobuf2/videobuf2-v4l2.c |  2 +-
 include/uapi/linux/videodev2.h                  |  1 +
 4 files changed, 19 insertions(+), 9 deletions(-)

(limited to 'include/uapi')

diff --git a/Documentation/media/uapi/v4l/vidioc-reqbufs.rst b/Documentation/media/uapi/v4l/vidioc-reqbufs.rst
index d4bbbb0c60e8..e62a15782790 100644
--- a/Documentation/media/uapi/v4l/vidioc-reqbufs.rst
+++ b/Documentation/media/uapi/v4l/vidioc-reqbufs.rst
@@ -59,9 +59,14 @@ When the I/O method is not supported the ioctl returns an ``EINVAL`` error
 code.
 
 Applications can call :ref:`VIDIOC_REQBUFS` again to change the number of
-buffers, however this cannot succeed when any buffers are still mapped.
-A ``count`` value of zero frees all buffers, after aborting or finishing
-any DMA in progress, an implicit
+buffers. Note that if any buffers are still mapped or exported via DMABUF,
+then :ref:`VIDIOC_REQBUFS` can only succeed if the
+``V4L2_BUF_CAP_SUPPORTS_ORPHANED_BUFS`` capability is set. Otherwise
+:ref:`VIDIOC_REQBUFS` will return the ``EBUSY`` error code.
+If ``V4L2_BUF_CAP_SUPPORTS_ORPHANED_BUFS`` is set, then these buffers are
+orphaned and will be freed when they are unmapped or when the exported DMABUF
+fds are closed. A ``count`` value of zero frees or orphans all buffers, after
+aborting or finishing any DMA in progress, an implicit
 :ref:`VIDIOC_STREAMOFF <VIDIOC_STREAMON>`.
 
 
@@ -112,6 +117,7 @@ any DMA in progress, an implicit
 .. _V4L2-BUF-CAP-SUPPORTS-USERPTR:
 .. _V4L2-BUF-CAP-SUPPORTS-DMABUF:
 .. _V4L2-BUF-CAP-SUPPORTS-REQUESTS:
+.. _V4L2-BUF-CAP-SUPPORTS-ORPHANED-BUFS:
 
 .. cssclass:: longtable
 
@@ -132,6 +138,11 @@ any DMA in progress, an implicit
     * - ``V4L2_BUF_CAP_SUPPORTS_REQUESTS``
       - 0x00000008
       - This buffer type supports :ref:`requests <media-request-api>`.
+    * - ``V4L2_BUF_CAP_SUPPORTS_ORPHANED_BUFS``
+      - 0x00000010
+      - The kernel allows calling :ref:`VIDIOC_REQBUFS` while buffers are still
+        mapped or exported via DMABUF. These orphaned buffers will be freed
+        when they are unmapped or when the exported DMABUF fds are closed.
 
 Return Value
 ============
diff --git a/drivers/media/common/videobuf2/videobuf2-core.c b/drivers/media/common/videobuf2/videobuf2-core.c
index 03954c13024c..04d1250747cf 100644
--- a/drivers/media/common/videobuf2/videobuf2-core.c
+++ b/drivers/media/common/videobuf2/videobuf2-core.c
@@ -679,11 +679,9 @@ int vb2_core_reqbufs(struct vb2_queue *q, enum vb2_memory memory,
 		 * are not in use and can be freed.
 		 */
 		mutex_lock(&q->mmap_lock);
-		if (q->memory == VB2_MEMORY_MMAP && __buffers_in_use(q)) {
-			mutex_unlock(&q->mmap_lock);
-			dprintk(1, "memory in use, cannot free\n");
-			return -EBUSY;
-		}
+		if (debug && q->memory == VB2_MEMORY_MMAP &&
+		    __buffers_in_use(q))
+			dprintk(1, "memory in use, orphaning buffers\n");
 
 		/*
 		 * Call queue_cancel to clean up any buffers in the
diff --git a/drivers/media/common/videobuf2/videobuf2-v4l2.c b/drivers/media/common/videobuf2/videobuf2-v4l2.c
index a17033ab2c22..f02d452ceeb9 100644
--- a/drivers/media/common/videobuf2/videobuf2-v4l2.c
+++ b/drivers/media/common/videobuf2/videobuf2-v4l2.c
@@ -624,7 +624,7 @@ EXPORT_SYMBOL(vb2_querybuf);
 
 static void fill_buf_caps(struct vb2_queue *q, u32 *caps)
 {
-	*caps = 0;
+	*caps = V4L2_BUF_CAP_SUPPORTS_ORPHANED_BUFS;
 	if (q->io_modes & VB2_MMAP)
 		*caps |= V4L2_BUF_CAP_SUPPORTS_MMAP;
 	if (q->io_modes & VB2_USERPTR)
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index c8e8ff810190..2a223835214c 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -879,6 +879,7 @@ struct v4l2_requestbuffers {
 #define V4L2_BUF_CAP_SUPPORTS_USERPTR	(1 << 1)
 #define V4L2_BUF_CAP_SUPPORTS_DMABUF	(1 << 2)
 #define V4L2_BUF_CAP_SUPPORTS_REQUESTS	(1 << 3)
+#define V4L2_BUF_CAP_SUPPORTS_ORPHANED_BUFS (1 << 4)
 
 /**
  * struct v4l2_plane - plane info for multi-planar buffers
-- 
cgit v1.2.3-59-g8ed1b


From 33b2e1442aeaacabb4e29f8adb31469e87599b16 Mon Sep 17 00:00:00 2001
From: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Date: Fri, 23 Nov 2018 13:13:03 +0900
Subject: ALSA: firewire-tascam: pick up data of state from tx isochronous
 pakcets

Units of TASCAM FireWire series multiplex PCM frames and state of
control surface into the same tx isochronous packets. One isochronous
packet includes a part of the state in a quadlet data. An image of the
state consists of 64 quadlet data.

This commit demultiplexes the state from tx isochronous packets.

Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/uapi/sound/firewire.h        |  2 ++
 sound/firewire/tascam/amdtp-tascam.c | 17 ++++++++++++++++-
 sound/firewire/tascam/tascam.h       |  3 +++
 3 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/sound/firewire.h b/include/uapi/sound/firewire.h
index f0a547d86679..b9bc4bb1756b 100644
--- a/include/uapi/sound/firewire.h
+++ b/include/uapi/sound/firewire.h
@@ -88,4 +88,6 @@ struct snd_firewire_get_info {
  * Returns -EBUSY if the driver is already streaming.
  */
 
+#define SNDRV_FIREWIRE_TASCAM_STATE_COUNT	64
+
 #endif /* _UAPI_SOUND_FIREWIRE_H_INCLUDED */
diff --git a/sound/firewire/tascam/amdtp-tascam.c b/sound/firewire/tascam/amdtp-tascam.c
index ab482423c165..516cb931fd5e 100644
--- a/sound/firewire/tascam/amdtp-tascam.c
+++ b/sound/firewire/tascam/amdtp-tascam.c
@@ -117,6 +117,21 @@ int amdtp_tscm_add_pcm_hw_constraints(struct amdtp_stream *s,
 	return amdtp_stream_add_pcm_hw_constraints(s, runtime);
 }
 
+static void read_status_messages(struct amdtp_stream *s,
+				 __be32 *buffer, unsigned int data_blocks)
+{
+	struct snd_tscm *tscm = container_of(s, struct snd_tscm, tx_stream);
+	int i;
+
+	for (i = 0; i < data_blocks; i++) {
+		unsigned int index;
+
+		index = be32_to_cpu(buffer[0]) % SNDRV_FIREWIRE_TASCAM_STATE_COUNT;
+		tscm->state[index] = buffer[s->data_block_quadlets - 1];
+		buffer += s->data_block_quadlets;
+	}
+}
+
 static unsigned int process_tx_data_blocks(struct amdtp_stream *s,
 					   __be32 *buffer,
 					   unsigned int data_blocks,
@@ -128,7 +143,7 @@ static unsigned int process_tx_data_blocks(struct amdtp_stream *s,
 	if (data_blocks > 0 && pcm)
 		read_pcm_s32(s, pcm, buffer, data_blocks);
 
-	/* A place holder for control messages. */
+	read_status_messages(s, buffer, data_blocks);
 
 	return data_blocks;
 }
diff --git a/sound/firewire/tascam/tascam.h b/sound/firewire/tascam/tascam.h
index a5bd167eb5d9..c710496a99cf 100644
--- a/sound/firewire/tascam/tascam.h
+++ b/sound/firewire/tascam/tascam.h
@@ -89,6 +89,9 @@ struct snd_tscm {
 
 	/* For MIDI message outgoing transactions. */
 	struct snd_fw_async_midi_port out_ports[TSCM_MIDI_OUT_PORT_MAX];
+
+	// A cache of status information in tx isoc packets.
+	__be32 state[SNDRV_FIREWIRE_TASCAM_STATE_COUNT];
 };
 
 #define TSCM_ADDR_BASE			0xffff00000000ull
-- 
cgit v1.2.3-59-g8ed1b


From 90e8ac5c9d446124a5b43a6f135bf67e060c0c9d Mon Sep 17 00:00:00 2001
From: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Date: Fri, 23 Nov 2018 13:13:04 +0900
Subject: ALSA: firewire-tascam: add new hwdep ioctl command to get state image

In a previous commit, ALSA firewire-tascam driver stores state image
from tx isochronous packets. This image includes states of knob, fader,
button of control surface, level of gain/volume of each physical
inputs/outputs, and so on. It's useful for userspace applications to
read whole of the image.

This commit adds a unique ioctl command for ALSA hwdep interface for the
purpose. For actual meaning of each bits in this image, please refer to
discussion in alsa-devel[1].

[1] http://mailman.alsa-project.org/pipermail/alsa-devel/2018-October/140785.html

Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/uapi/sound/firewire.h        |  5 +++++
 sound/firewire/tascam/tascam-hwdep.c | 10 ++++++++++
 2 files changed, 15 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/sound/firewire.h b/include/uapi/sound/firewire.h
index b9bc4bb1756b..012f81bf4750 100644
--- a/include/uapi/sound/firewire.h
+++ b/include/uapi/sound/firewire.h
@@ -66,6 +66,7 @@ union snd_firewire_event {
 #define SNDRV_FIREWIRE_IOCTL_GET_INFO _IOR('H', 0xf8, struct snd_firewire_get_info)
 #define SNDRV_FIREWIRE_IOCTL_LOCK      _IO('H', 0xf9)
 #define SNDRV_FIREWIRE_IOCTL_UNLOCK    _IO('H', 0xfa)
+#define SNDRV_FIREWIRE_IOCTL_TASCAM_STATE _IOR('H', 0xfb, struct snd_firewire_tascam_state)
 
 #define SNDRV_FIREWIRE_TYPE_DICE	1
 #define SNDRV_FIREWIRE_TYPE_FIREWORKS	2
@@ -90,4 +91,8 @@ struct snd_firewire_get_info {
 
 #define SNDRV_FIREWIRE_TASCAM_STATE_COUNT	64
 
+struct snd_firewire_tascam_state {
+	__be32 data[SNDRV_FIREWIRE_TASCAM_STATE_COUNT];
+};
+
 #endif /* _UAPI_SOUND_FIREWIRE_H_INCLUDED */
diff --git a/sound/firewire/tascam/tascam-hwdep.c b/sound/firewire/tascam/tascam-hwdep.c
index 4e4c1e9020e8..a80397116c48 100644
--- a/sound/firewire/tascam/tascam-hwdep.c
+++ b/sound/firewire/tascam/tascam-hwdep.c
@@ -123,6 +123,14 @@ static int hwdep_unlock(struct snd_tscm *tscm)
 	return err;
 }
 
+static int tscm_hwdep_state(struct snd_tscm *tscm, void __user *arg)
+{
+	if (copy_to_user(arg, tscm->state, sizeof(tscm->state)))
+		return -EFAULT;
+
+	return 0;
+}
+
 static int hwdep_release(struct snd_hwdep *hwdep, struct file *file)
 {
 	struct snd_tscm *tscm = hwdep->private_data;
@@ -147,6 +155,8 @@ static int hwdep_ioctl(struct snd_hwdep *hwdep, struct file *file,
 		return hwdep_lock(tscm);
 	case SNDRV_FIREWIRE_IOCTL_UNLOCK:
 		return hwdep_unlock(tscm);
+	case SNDRV_FIREWIRE_IOCTL_TASCAM_STATE:
+		return tscm_hwdep_state(tscm, (void __user *)arg);
 	default:
 		return -ENOIOCTLCMD;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From d7167422433cdb61e58baee9c25543d0eba95c9d Mon Sep 17 00:00:00 2001
From: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Date: Fri, 23 Nov 2018 13:13:05 +0900
Subject: ALSA: firewire-tascam: queue events for change of control surface

Units of TASCAM FireWire series transfer image of states of the unit in
tx isochronous packets. Demultiplexing of the states from the packets
is done in software interrupt context regardless of any process context.
In a view of userspace applications, it needs to have notification
mechanism to catch change of the states.

This commit implements a queue to store events for the notification. The
image of states includes fluctuating data such as level of gain/volume
for physical input/output and position of knobs. Therefore the events
are queued corresponding to some control features only.

Furthermore, the queued events are planned to be consumed by userspace
applications via ALSA hwdep interface. This commit suppresses event
queueing when no applications open the hwdep interface.

However, the queue is maintained in an optimistic scenario, thus without
any care against overrrun. This is reasonable because target events are
useless just to handle PCM frames. It starts queueing when an usespace
application opens hwdep interface, thus it's expected to read the queued
events steadily.

Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/uapi/sound/firewire.h        |  6 ++++++
 sound/firewire/tascam/amdtp-tascam.c | 34 +++++++++++++++++++++++++++++++++-
 sound/firewire/tascam/tascam-hwdep.c |  2 ++
 sound/firewire/tascam/tascam.h       |  6 ++++++
 4 files changed, 47 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/sound/firewire.h b/include/uapi/sound/firewire.h
index 012f81bf4750..bb067efb09c6 100644
--- a/include/uapi/sound/firewire.h
+++ b/include/uapi/sound/firewire.h
@@ -53,6 +53,12 @@ struct snd_firewire_event_motu_notification {
 	__u32 message;	/* MOTU-specific bits. */
 };
 
+struct snd_firewire_tascam_change {
+	unsigned int index;
+	__be32 before;
+	__be32 after;
+};
+
 union snd_firewire_event {
 	struct snd_firewire_event_common            common;
 	struct snd_firewire_event_lock_status       lock_status;
diff --git a/sound/firewire/tascam/amdtp-tascam.c b/sound/firewire/tascam/amdtp-tascam.c
index 516cb931fd5e..0e8088c9ada9 100644
--- a/sound/firewire/tascam/amdtp-tascam.c
+++ b/sound/firewire/tascam/amdtp-tascam.c
@@ -121,13 +121,45 @@ static void read_status_messages(struct amdtp_stream *s,
 				 __be32 *buffer, unsigned int data_blocks)
 {
 	struct snd_tscm *tscm = container_of(s, struct snd_tscm, tx_stream);
+	bool used = READ_ONCE(tscm->hwdep->used);
 	int i;
 
 	for (i = 0; i < data_blocks; i++) {
 		unsigned int index;
+		__be32 before;
+		__be32 after;
 
 		index = be32_to_cpu(buffer[0]) % SNDRV_FIREWIRE_TASCAM_STATE_COUNT;
-		tscm->state[index] = buffer[s->data_block_quadlets - 1];
+		before = tscm->state[index];
+		after = buffer[s->data_block_quadlets - 1];
+
+		if (used && index > 4 && index < 16) {
+			__be32 mask;
+
+			if (index == 5)
+				mask = cpu_to_be32(~0x0000ffff);
+			else if (index == 6)
+				mask = cpu_to_be32(~0x0000ffff);
+			else if (index == 8)
+				mask = cpu_to_be32(~0x000f0f00);
+			else
+				mask = cpu_to_be32(~0x00000000);
+
+			if ((before ^ after) & mask) {
+				struct snd_firewire_tascam_change *entry =
+						&tscm->queue[tscm->push_pos];
+
+				spin_lock_irq(&tscm->lock);
+				entry->index = index;
+				entry->before = before;
+				entry->after = after;
+				if (++tscm->push_pos >= SND_TSCM_QUEUE_COUNT)
+					tscm->push_pos = 0;
+				spin_unlock_irq(&tscm->lock);
+			}
+		}
+
+		tscm->state[index] = after;
 		buffer += s->data_block_quadlets;
 	}
 }
diff --git a/sound/firewire/tascam/tascam-hwdep.c b/sound/firewire/tascam/tascam-hwdep.c
index a80397116c48..9afa827af05d 100644
--- a/sound/firewire/tascam/tascam-hwdep.c
+++ b/sound/firewire/tascam/tascam-hwdep.c
@@ -195,5 +195,7 @@ int snd_tscm_create_hwdep_device(struct snd_tscm *tscm)
 	hwdep->private_data = tscm;
 	hwdep->exclusive = true;
 
+	tscm->hwdep = hwdep;
+
 	return err;
 }
diff --git a/sound/firewire/tascam/tascam.h b/sound/firewire/tascam/tascam.h
index c710496a99cf..6a411ee0dcf1 100644
--- a/sound/firewire/tascam/tascam.h
+++ b/sound/firewire/tascam/tascam.h
@@ -62,6 +62,8 @@ struct snd_fw_async_midi_port {
 	int consume_bytes;
 };
 
+#define SND_TSCM_QUEUE_COUNT	16
+
 struct snd_tscm {
 	struct snd_card *card;
 	struct fw_unit *unit;
@@ -92,6 +94,10 @@ struct snd_tscm {
 
 	// A cache of status information in tx isoc packets.
 	__be32 state[SNDRV_FIREWIRE_TASCAM_STATE_COUNT];
+	struct snd_hwdep *hwdep;
+	struct snd_firewire_tascam_change queue[SND_TSCM_QUEUE_COUNT];
+	unsigned int pull_pos;
+	unsigned int push_pos;
 };
 
 #define TSCM_ADDR_BASE			0xffff00000000ull
-- 
cgit v1.2.3-59-g8ed1b


From a8c0d13267a4151b2ff124cde6331ec28ed0d55e Mon Sep 17 00:00:00 2001
From: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Date: Fri, 23 Nov 2018 13:13:07 +0900
Subject: ALSA: firewire-tascam: notify events of change of state for userspace
 applications

In former commits, ALSA firewire-tascam driver queues events to notify
change of state of control surface to userspace via ALSA hwdep
interface.

This commit implements actual notification of the events. The events are
not governed by real time, thus no need to care underrun.

Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/uapi/sound/firewire.h        |  7 ++++
 sound/firewire/tascam/amdtp-tascam.c |  2 ++
 sound/firewire/tascam/tascam-hwdep.c | 65 ++++++++++++++++++++++++++++++++++--
 3 files changed, 72 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/sound/firewire.h b/include/uapi/sound/firewire.h
index bb067efb09c6..ae12826ed641 100644
--- a/include/uapi/sound/firewire.h
+++ b/include/uapi/sound/firewire.h
@@ -12,6 +12,7 @@
 #define SNDRV_FIREWIRE_EVENT_EFW_RESPONSE	0x4e617475
 #define SNDRV_FIREWIRE_EVENT_DIGI00X_MESSAGE	0x746e736c
 #define SNDRV_FIREWIRE_EVENT_MOTU_NOTIFICATION	0x64776479
+#define SNDRV_FIREWIRE_EVENT_TASCAM_CONTROL	0x7473636d
 
 struct snd_firewire_event_common {
 	unsigned int type; /* SNDRV_FIREWIRE_EVENT_xxx */
@@ -59,12 +60,18 @@ struct snd_firewire_tascam_change {
 	__be32 after;
 };
 
+struct snd_firewire_event_tascam_control {
+	unsigned int type;
+	struct snd_firewire_tascam_change changes[0];
+};
+
 union snd_firewire_event {
 	struct snd_firewire_event_common            common;
 	struct snd_firewire_event_lock_status       lock_status;
 	struct snd_firewire_event_dice_notification dice_notification;
 	struct snd_firewire_event_efw_response      efw_response;
 	struct snd_firewire_event_digi00x_message   digi00x_message;
+	struct snd_firewire_event_tascam_control    tascam_control;
 	struct snd_firewire_event_motu_notification motu_notification;
 };
 
diff --git a/sound/firewire/tascam/amdtp-tascam.c b/sound/firewire/tascam/amdtp-tascam.c
index 0e8088c9ada9..a52d1f76c610 100644
--- a/sound/firewire/tascam/amdtp-tascam.c
+++ b/sound/firewire/tascam/amdtp-tascam.c
@@ -156,6 +156,8 @@ static void read_status_messages(struct amdtp_stream *s,
 				if (++tscm->push_pos >= SND_TSCM_QUEUE_COUNT)
 					tscm->push_pos = 0;
 				spin_unlock_irq(&tscm->lock);
+
+				wake_up(&tscm->hwdep_wait);
 			}
 		}
 
diff --git a/sound/firewire/tascam/tascam-hwdep.c b/sound/firewire/tascam/tascam-hwdep.c
index 8f34cede2e9f..0414abf5daa8 100644
--- a/sound/firewire/tascam/tascam-hwdep.c
+++ b/sound/firewire/tascam/tascam-hwdep.c
@@ -35,6 +35,65 @@ static long tscm_hwdep_read_locked(struct snd_tscm *tscm, char __user *buf,
 	return count;
 }
 
+static long tscm_hwdep_read_queue(struct snd_tscm *tscm, char __user *buf,
+				  long remained, loff_t *offset)
+{
+	char __user *pos = buf;
+	unsigned int type = SNDRV_FIREWIRE_EVENT_TASCAM_CONTROL;
+	struct snd_firewire_tascam_change *entries = tscm->queue;
+	long count;
+
+	// At least, one control event can be copied.
+	if (remained < sizeof(type) + sizeof(*entries)) {
+		spin_unlock_irq(&tscm->lock);
+		return -EINVAL;
+	}
+
+	// Copy the type field later.
+	count = sizeof(type);
+	remained -= sizeof(type);
+	pos += sizeof(type);
+
+	while (true) {
+		unsigned int head_pos;
+		unsigned int tail_pos;
+		unsigned int length;
+
+		if (tscm->pull_pos == tscm->push_pos)
+			break;
+		else if (tscm->pull_pos < tscm->push_pos)
+			tail_pos = tscm->push_pos;
+		else
+			tail_pos = SND_TSCM_QUEUE_COUNT;
+		head_pos = tscm->pull_pos;
+
+		length = (tail_pos - head_pos) * sizeof(*entries);
+		if (remained < length)
+			length = rounddown(remained, sizeof(*entries));
+		if (length == 0)
+			break;
+
+		spin_unlock_irq(&tscm->lock);
+		if (copy_to_user(pos, &entries[head_pos], length))
+			return -EFAULT;
+
+		spin_lock_irq(&tscm->lock);
+
+		tscm->pull_pos = tail_pos % SND_TSCM_QUEUE_COUNT;
+
+		count += length;
+		remained -= length;
+		pos += length;
+	}
+
+	spin_unlock_irq(&tscm->lock);
+
+	if (copy_to_user(buf, &type, sizeof(type)))
+		return -EFAULT;
+
+	return count;
+}
+
 static long hwdep_read(struct snd_hwdep *hwdep, char __user *buf, long count,
 		       loff_t *offset)
 {
@@ -43,7 +102,7 @@ static long hwdep_read(struct snd_hwdep *hwdep, char __user *buf, long count,
 
 	spin_lock_irq(&tscm->lock);
 
-	while (!tscm->dev_lock_changed) {
+	while (!tscm->dev_lock_changed && tscm->push_pos == tscm->pull_pos) {
 		prepare_to_wait(&tscm->hwdep_wait, &wait, TASK_INTERRUPTIBLE);
 		spin_unlock_irq(&tscm->lock);
 		schedule();
@@ -56,6 +115,8 @@ static long hwdep_read(struct snd_hwdep *hwdep, char __user *buf, long count,
 	// NOTE: The acquired lock should be released in callee side.
 	if (tscm->dev_lock_changed) {
 		count = tscm_hwdep_read_locked(tscm, buf, count, offset);
+	} else if (tscm->push_pos != tscm->pull_pos) {
+		count = tscm_hwdep_read_queue(tscm, buf, count, offset);
 	} else {
 		spin_unlock_irq(&tscm->lock);
 		count = 0;
@@ -73,7 +134,7 @@ static __poll_t hwdep_poll(struct snd_hwdep *hwdep, struct file *file,
 	poll_wait(file, &tscm->hwdep_wait, wait);
 
 	spin_lock_irq(&tscm->lock);
-	if (tscm->dev_lock_changed)
+	if (tscm->dev_lock_changed || tscm->push_pos != tscm->pull_pos)
 		events = EPOLLIN | EPOLLRDNORM;
 	else
 		events = 0;
-- 
cgit v1.2.3-59-g8ed1b


From 1b09577bef3ac135ed02115943c9ab53f2129555 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Sun, 25 Nov 2018 20:51:12 +0200
Subject: RDMA/uverbs: Add missing driver_data

If the struct is used with a driver_udata it should have a trailing
driver_data flex array to mark it as having udata.

In most cases this forces the end of the struct to be aligned to u64 which
is needed to make the trailing driver_data naturally aligned.

Unfortunately We have a few cases where the base struct is not aligned to
8 bytes, these are marked with a u32 driver_data and userspace will check
for alignment issues when it compiles the driver.

Also remove the empty ib_uverbs_modify_qp_resp as nothing uses this.

pahole says there is no change to any struct sizes by this change.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 include/uapi/rdma/ib_user_verbs.h | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h
index 1254b51a551a..c586fc43739c 100644
--- a/include/uapi/rdma/ib_user_verbs.h
+++ b/include/uapi/rdma/ib_user_verbs.h
@@ -164,6 +164,7 @@ struct ib_uverbs_get_context {
 struct ib_uverbs_get_context_resp {
 	__u32 async_fd;
 	__u32 num_comp_vectors;
+	__aligned_u64 driver_data[0];
 };
 
 struct ib_uverbs_query_device {
@@ -310,6 +311,7 @@ struct ib_uverbs_alloc_pd {
 
 struct ib_uverbs_alloc_pd_resp {
 	__u32 pd_handle;
+	__u32 driver_data[0];
 };
 
 struct ib_uverbs_dealloc_pd {
@@ -325,6 +327,7 @@ struct ib_uverbs_open_xrcd {
 
 struct ib_uverbs_open_xrcd_resp {
 	__u32 xrcd_handle;
+	__u32 driver_data[0];
 };
 
 struct ib_uverbs_close_xrcd {
@@ -345,6 +348,7 @@ struct ib_uverbs_reg_mr_resp {
 	__u32 mr_handle;
 	__u32 lkey;
 	__u32 rkey;
+	__u32 driver_data[0];
 };
 
 struct ib_uverbs_rereg_mr {
@@ -356,11 +360,13 @@ struct ib_uverbs_rereg_mr {
 	__aligned_u64 hca_va;
 	__u32 pd_handle;
 	__u32 access_flags;
+	__aligned_u64 driver_data[0];
 };
 
 struct ib_uverbs_rereg_mr_resp {
 	__u32 lkey;
 	__u32 rkey;
+	__aligned_u64 driver_data[0];
 };
 
 struct ib_uverbs_dereg_mr {
@@ -372,11 +378,13 @@ struct ib_uverbs_alloc_mw {
 	__u32 pd_handle;
 	__u8  mw_type;
 	__u8  reserved[3];
+	__aligned_u64 driver_data[0];
 };
 
 struct ib_uverbs_alloc_mw_resp {
 	__u32 mw_handle;
 	__u32 rkey;
+	__aligned_u64 driver_data[0];
 };
 
 struct ib_uverbs_dealloc_mw {
@@ -419,6 +427,7 @@ struct ib_uverbs_ex_create_cq {
 struct ib_uverbs_create_cq_resp {
 	__u32 cq_handle;
 	__u32 cqe;
+	__aligned_u64 driver_data[0];
 };
 
 struct ib_uverbs_ex_create_cq_resp {
@@ -629,6 +638,7 @@ struct ib_uverbs_create_qp_resp {
 	__u32 max_recv_sge;
 	__u32 max_inline_data;
 	__u32 reserved;
+	__u32 driver_data[0];
 };
 
 struct ib_uverbs_ex_create_qp_resp {
@@ -733,9 +743,6 @@ struct ib_uverbs_ex_modify_qp {
 	__u32	reserved;
 };
 
-struct ib_uverbs_modify_qp_resp {
-};
-
 struct ib_uverbs_ex_modify_qp_resp {
 	__u32  comp_mask;
 	__u32  response_length;
@@ -863,10 +870,12 @@ struct ib_uverbs_create_ah {
 	__u32 pd_handle;
 	__u32 reserved;
 	struct ib_uverbs_ah_attr attr;
+	__aligned_u64 driver_data[0];
 };
 
 struct ib_uverbs_create_ah_resp {
 	__u32 ah_handle;
+	__u32 driver_data[0];
 };
 
 struct ib_uverbs_destroy_ah {
@@ -1175,6 +1184,7 @@ struct ib_uverbs_create_srq_resp {
 	__u32 max_wr;
 	__u32 max_sge;
 	__u32 srqn;
+	__u32 driver_data[0];
 };
 
 struct ib_uverbs_modify_srq {
-- 
cgit v1.2.3-59-g8ed1b


From 89a9157e1253bb3384a7596cb51bf1ceeac063ed Mon Sep 17 00:00:00 2001
From: Tiwei Bie <tiwei.bie@intel.com>
Date: Wed, 21 Nov 2018 18:03:18 +0800
Subject: virtio: add packed ring types and macros

Add types and macros for packed ring.

Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/virtio_config.h |  3 +++
 include/uapi/linux/virtio_ring.h   | 52 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h
index 449132c76b1c..1196e1c1d4f6 100644
--- a/include/uapi/linux/virtio_config.h
+++ b/include/uapi/linux/virtio_config.h
@@ -75,6 +75,9 @@
  */
 #define VIRTIO_F_IOMMU_PLATFORM		33
 
+/* This feature indicates support for the packed virtqueue layout. */
+#define VIRTIO_F_RING_PACKED		34
+
 /*
  * Does the device support Single Root I/O Virtualization?
  */
diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h
index 6d5d5faa989b..2414f8af26b3 100644
--- a/include/uapi/linux/virtio_ring.h
+++ b/include/uapi/linux/virtio_ring.h
@@ -44,6 +44,13 @@
 /* This means the buffer contains a list of buffer descriptors. */
 #define VRING_DESC_F_INDIRECT	4
 
+/*
+ * Mark a descriptor as available or used in packed ring.
+ * Notice: they are defined as shifts instead of shifted values.
+ */
+#define VRING_PACKED_DESC_F_AVAIL	7
+#define VRING_PACKED_DESC_F_USED	15
+
 /* The Host uses this in used->flags to advise the Guest: don't kick me when
  * you add a buffer.  It's unreliable, so it's simply an optimization.  Guest
  * will still kick if it's out of buffers. */
@@ -53,6 +60,23 @@
  * optimization.  */
 #define VRING_AVAIL_F_NO_INTERRUPT	1
 
+/* Enable events in packed ring. */
+#define VRING_PACKED_EVENT_FLAG_ENABLE	0x0
+/* Disable events in packed ring. */
+#define VRING_PACKED_EVENT_FLAG_DISABLE	0x1
+/*
+ * Enable events for a specific descriptor in packed ring.
+ * (as specified by Descriptor Ring Change Event Offset/Wrap Counter).
+ * Only valid if VIRTIO_RING_F_EVENT_IDX has been negotiated.
+ */
+#define VRING_PACKED_EVENT_FLAG_DESC	0x2
+
+/*
+ * Wrap counter bit shift in event suppression structure
+ * of packed ring.
+ */
+#define VRING_PACKED_EVENT_F_WRAP_CTR	15
+
 /* We support indirect buffer descriptors */
 #define VIRTIO_RING_F_INDIRECT_DESC	28
 
@@ -171,4 +195,32 @@ static inline int vring_need_event(__u16 event_idx, __u16 new_idx, __u16 old)
 	return (__u16)(new_idx - event_idx - 1) < (__u16)(new_idx - old);
 }
 
+struct vring_packed_desc_event {
+	/* Descriptor Ring Change Event Offset/Wrap Counter. */
+	__le16 off_wrap;
+	/* Descriptor Ring Change Event Flags. */
+	__le16 flags;
+};
+
+struct vring_packed_desc {
+	/* Buffer Address. */
+	__le64 addr;
+	/* Buffer Length. */
+	__le32 len;
+	/* Buffer ID. */
+	__le16 id;
+	/* The flags depending on descriptor type. */
+	__le16 flags;
+};
+
+struct vring_packed {
+	unsigned int num;
+
+	struct vring_packed_desc *desc;
+
+	struct vring_packed_desc_event *driver;
+
+	struct vring_packed_desc_event *device;
+};
+
 #endif /* _UAPI_LINUX_VIRTIO_RING_H */
-- 
cgit v1.2.3-59-g8ed1b


From 3eb152720c734112c5069e531442463e0a2abfb3 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Thu, 8 Nov 2018 08:16:51 -0800
Subject: drm/v3d: Fix whitespace inconsistency in the header.

Signed-off-by: Eric Anholt <eric@anholt.net>
Link: https://patchwork.freedesktop.org/patch/msgid/20181108161654.19888-2-eric@anholt.net
Reviewed-by: Boris Brezillon <boris.brezillon@bootlin.com>
---
 include/uapi/drm/v3d_drm.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/drm/v3d_drm.h b/include/uapi/drm/v3d_drm.h
index f446656d00b1..b1e5de076b0f 100644
--- a/include/uapi/drm/v3d_drm.h
+++ b/include/uapi/drm/v3d_drm.h
@@ -66,7 +66,7 @@ struct drm_v3d_submit_cl {
 	 */
 	__u32 bcl_start;
 
-	 /** End address of the BCL (first byte after the BCL) */
+	/** End address of the BCL (first byte after the BCL) */
 	__u32 bcl_end;
 
 	/* Offset of the render command list.
@@ -82,7 +82,7 @@ struct drm_v3d_submit_cl {
 	 */
 	__u32 rcl_start;
 
-	 /** End address of the RCL (first byte after the RCL) */
+	/** End address of the RCL (first byte after the RCL) */
 	__u32 rcl_end;
 
 	/** An optional sync object to wait on before starting the BCL. */
-- 
cgit v1.2.3-59-g8ed1b


From a428afe82f98d2ffb31c981671630df1fa25906f Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Sat, 24 Nov 2018 04:34:20 +0200
Subject: net: bridge: add support for user-controlled bool options

We have been adding many new bridge options, a big number of which are
boolean but still take up netlink attribute ids and waste space in the skb.
Recently we discussed learning from link-local packets[1] and decided
yet another new boolean option will be needed, thus introducing this API
to save some bridge nl space.
The API supports changing the value of multiple boolean options at once
via the br_boolopt_multi struct which has an optmask (which options to
set, bit per opt) and optval (options' new values). Future boolean
options will only be added to the br_boolopt_id enum and then will have
to be handled in br_boolopt_toggle/get. The API will automatically
add the ability to change and export them via netlink, sysfs can use the
single boolopt function versions to do the same. The behaviour with
failing/succeeding is the same as with normal netlink option changing.

If an option requires mapping to internal kernel flag or needs special
configuration to be enabled then it should be handled in
br_boolopt_toggle. It should also be able to retrieve an option's current
state via br_boolopt_get.

v2: WARN_ON() on unsupported option as that shouldn't be possible and
    also will help catch people who add new options without handling
    them for both set and get. Pass down extack so if an option desires
    it could set it on error and be more user-friendly.

[1] https://www.spinics.net/lists/netdev/msg532698.html

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h | 18 +++++++++++
 include/uapi/linux/if_link.h   |  1 +
 net/bridge/br.c                | 71 ++++++++++++++++++++++++++++++++++++++++++
 net/bridge/br_netlink.c        | 17 +++++++++-
 net/bridge/br_private.h        |  8 +++++
 net/core/rtnetlink.c           |  2 +-
 6 files changed, 115 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index e41eda3c71f1..6dc02c03bdf8 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -292,4 +292,22 @@ struct br_mcast_stats {
 	__u64 mcast_bytes[BR_MCAST_DIR_SIZE];
 	__u64 mcast_packets[BR_MCAST_DIR_SIZE];
 };
+
+/* bridge boolean options
+ * IMPORTANT: if adding a new option do not forget to handle
+ *            it in br_boolopt_toggle/get and bridge sysfs
+ */
+enum br_boolopt_id {
+	BR_BOOLOPT_MAX
+};
+
+/* struct br_boolopt_multi - change multiple bridge boolean options
+ *
+ * @optval: new option values (bit per option)
+ * @optmask: options to change (bit per option)
+ */
+struct br_boolopt_multi {
+	__u32 optval;
+	__u32 optmask;
+};
 #endif /* _UAPI_LINUX_IF_BRIDGE_H */
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index f42c069d81db..d6533828123a 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -288,6 +288,7 @@ enum {
 	IFLA_BR_MCAST_IGMP_VERSION,
 	IFLA_BR_MCAST_MLD_VERSION,
 	IFLA_BR_VLAN_STATS_PER_PORT,
+	IFLA_BR_MULTI_BOOLOPT,
 	__IFLA_BR_MAX,
 };
 
diff --git a/net/bridge/br.c b/net/bridge/br.c
index 360ad66c21e9..c527160c1975 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -175,6 +175,77 @@ static struct notifier_block br_switchdev_notifier = {
 	.notifier_call = br_switchdev_event,
 };
 
+/* br_boolopt_toggle - change user-controlled boolean option
+ *
+ * @br: bridge device
+ * @opt: id of the option to change
+ * @on: new option value
+ * @extack: extack for error messages
+ *
+ * Changes the value of the respective boolean option to @on taking care of
+ * any internal option value mapping and configuration.
+ */
+int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on,
+		      struct netlink_ext_ack *extack)
+{
+	switch (opt) {
+	default:
+		/* shouldn't be called with unsupported options */
+		WARN_ON(1);
+		break;
+	}
+
+	return 0;
+}
+
+int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt)
+{
+	switch (opt) {
+	default:
+		/* shouldn't be called with unsupported options */
+		WARN_ON(1);
+		break;
+	}
+
+	return 0;
+}
+
+int br_boolopt_multi_toggle(struct net_bridge *br,
+			    struct br_boolopt_multi *bm,
+			    struct netlink_ext_ack *extack)
+{
+	unsigned long bitmap = bm->optmask;
+	int err = 0;
+	int opt_id;
+
+	for_each_set_bit(opt_id, &bitmap, BR_BOOLOPT_MAX) {
+		bool on = !!(bm->optval & BIT(opt_id));
+
+		err = br_boolopt_toggle(br, opt_id, on, extack);
+		if (err) {
+			br_debug(br, "boolopt multi-toggle error: option: %d current: %d new: %d error: %d\n",
+				 opt_id, br_boolopt_get(br, opt_id), on, err);
+			break;
+		}
+	}
+
+	return err;
+}
+
+void br_boolopt_multi_get(const struct net_bridge *br,
+			  struct br_boolopt_multi *bm)
+{
+	u32 optval = 0;
+	int opt_id;
+
+	for (opt_id = 0; opt_id < BR_BOOLOPT_MAX; opt_id++)
+		optval |= (br_boolopt_get(br, opt_id) << opt_id);
+
+	bm->optval = optval;
+	bm->optmask = 0;
+}
+
+/* private bridge options, controlled by the kernel */
 void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on)
 {
 	bool cur = !!br_opt_get(br, opt);
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 3345f1984542..13cd50326af2 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -1035,6 +1035,8 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = {
 	[IFLA_BR_MCAST_IGMP_VERSION] = { .type = NLA_U8 },
 	[IFLA_BR_MCAST_MLD_VERSION] = { .type = NLA_U8 },
 	[IFLA_BR_VLAN_STATS_PER_PORT] = { .type = NLA_U8 },
+	[IFLA_BR_MULTI_BOOLOPT] = { .type = NLA_EXACT_LEN,
+				    .len = sizeof(struct br_boolopt_multi) },
 };
 
 static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
@@ -1296,6 +1298,15 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
 	}
 #endif
 
+	if (data[IFLA_BR_MULTI_BOOLOPT]) {
+		struct br_boolopt_multi *bm;
+
+		bm = nla_data(data[IFLA_BR_MULTI_BOOLOPT]);
+		err = br_boolopt_multi_toggle(br, bm, extack);
+		if (err)
+			return err;
+	}
+
 	return 0;
 }
 
@@ -1374,6 +1385,7 @@ static size_t br_get_size(const struct net_device *brdev)
 	       nla_total_size(sizeof(u8)) +     /* IFLA_BR_NF_CALL_IP6TABLES */
 	       nla_total_size(sizeof(u8)) +     /* IFLA_BR_NF_CALL_ARPTABLES */
 #endif
+	       nla_total_size(sizeof(struct br_boolopt_multi)) + /* IFLA_BR_MULTI_BOOLOPT */
 	       0;
 }
 
@@ -1387,6 +1399,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
 	u32 stp_enabled = br->stp_enabled;
 	u16 priority = (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1];
 	u8 vlan_enabled = br_vlan_enabled(br->dev);
+	struct br_boolopt_multi bm;
 	u64 clockval;
 
 	clockval = br_timer_value(&br->hello_timer);
@@ -1403,6 +1416,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
 	if (nla_put_u64_64bit(skb, IFLA_BR_GC_TIMER, clockval, IFLA_BR_PAD))
 		return -EMSGSIZE;
 
+	br_boolopt_multi_get(br, &bm);
 	if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, forward_delay) ||
 	    nla_put_u32(skb, IFLA_BR_HELLO_TIME, hello_time) ||
 	    nla_put_u32(skb, IFLA_BR_MAX_AGE, age_time) ||
@@ -1420,7 +1434,8 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
 	    nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE, br->topology_change) ||
 	    nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE_DETECTED,
 		       br->topology_change_detected) ||
-	    nla_put(skb, IFLA_BR_GROUP_ADDR, ETH_ALEN, br->group_addr))
+	    nla_put(skb, IFLA_BR_GROUP_ADDR, ETH_ALEN, br->group_addr) ||
+	    nla_put(skb, IFLA_BR_MULTI_BOOLOPT, sizeof(bm), &bm))
 		return -EMSGSIZE;
 
 #ifdef CONFIG_BRIDGE_VLAN_FILTERING
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index bc2653738fc3..6d4c208fbf08 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -507,6 +507,14 @@ static inline int br_opt_get(const struct net_bridge *br,
 	return test_bit(opt, &br->options);
 }
 
+int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on,
+		      struct netlink_ext_ack *extack);
+int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt);
+int br_boolopt_multi_toggle(struct net_bridge *br,
+			    struct br_boolopt_multi *bm,
+			    struct netlink_ext_ack *extack);
+void br_boolopt_multi_get(const struct net_bridge *br,
+			  struct br_boolopt_multi *bm);
 void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on);
 
 /* br_device.c */
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 86f2d9cbdae3..a498bb41c9aa 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -59,7 +59,7 @@
 #include <net/rtnetlink.h>
 #include <net/net_namespace.h>
 
-#define RTNL_MAX_TYPE		49
+#define RTNL_MAX_TYPE		50
 #define RTNL_SLAVE_MAX_TYPE	36
 
 struct rtnl_link {
-- 
cgit v1.2.3-59-g8ed1b


From 70e4272b4c81828e7d942209bae83b9d92752cfe Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Sat, 24 Nov 2018 04:34:21 +0200
Subject: net: bridge: add no_linklocal_learn bool option

Use the new boolopt API to add an option which disables learning from
link-local packets. The default is kept as before and learning is
enabled. This is a simple map from a boolopt bit to a bridge private
flag that is tested before learning.

v2: pass NULL for extack via sysfs

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h |  3 +++
 net/bridge/br.c                |  5 +++++
 net/bridge/br_input.c          |  4 +++-
 net/bridge/br_private.h        |  1 +
 net/bridge/br_sysfs_br.c       | 22 ++++++++++++++++++++++
 5 files changed, 34 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 6dc02c03bdf8..773e476a8e54 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -294,10 +294,13 @@ struct br_mcast_stats {
 };
 
 /* bridge boolean options
+ * BR_BOOLOPT_NO_LL_LEARN - disable learning from link-local packets
+ *
  * IMPORTANT: if adding a new option do not forget to handle
  *            it in br_boolopt_toggle/get and bridge sysfs
  */
 enum br_boolopt_id {
+	BR_BOOLOPT_NO_LL_LEARN,
 	BR_BOOLOPT_MAX
 };
 
diff --git a/net/bridge/br.c b/net/bridge/br.c
index c527160c1975..b4a51a053586 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -189,6 +189,9 @@ int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on,
 		      struct netlink_ext_ack *extack)
 {
 	switch (opt) {
+	case BR_BOOLOPT_NO_LL_LEARN:
+		br_opt_toggle(br, BROPT_NO_LL_LEARN, on);
+		break;
 	default:
 		/* shouldn't be called with unsupported options */
 		WARN_ON(1);
@@ -201,6 +204,8 @@ int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on,
 int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt)
 {
 	switch (opt) {
+	case BR_BOOLOPT_NO_LL_LEARN:
+		return br_opt_get(br, BROPT_NO_LL_LEARN);
 	default:
 		/* shouldn't be called with unsupported options */
 		WARN_ON(1);
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 3ddca11f44c2..5ea7e56119c1 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -188,7 +188,9 @@ static void __br_handle_local_finish(struct sk_buff *skb)
 	u16 vid = 0;
 
 	/* check if vlan is allowed, to avoid spoofing */
-	if (p->flags & BR_LEARNING && br_should_learn(p, skb, &vid))
+	if ((p->flags & BR_LEARNING) &&
+	    !br_opt_get(p->br, BROPT_NO_LL_LEARN) &&
+	    br_should_learn(p, skb, &vid))
 		br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, false);
 }
 
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 6d4c208fbf08..d29f837cd7a2 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -328,6 +328,7 @@ enum net_bridge_opts {
 	BROPT_NEIGH_SUPPRESS_ENABLED,
 	BROPT_MTU_SET_BY_USER,
 	BROPT_VLAN_STATS_PER_PORT,
+	BROPT_NO_LL_LEARN,
 };
 
 struct net_bridge {
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index 60182bef6341..6a378a7e16ea 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -328,6 +328,27 @@ static ssize_t flush_store(struct device *d,
 }
 static DEVICE_ATTR_WO(flush);
 
+static ssize_t no_linklocal_learn_show(struct device *d,
+				       struct device_attribute *attr,
+				       char *buf)
+{
+	struct net_bridge *br = to_bridge(d);
+	return sprintf(buf, "%d\n", br_boolopt_get(br, BR_BOOLOPT_NO_LL_LEARN));
+}
+
+static int set_no_linklocal_learn(struct net_bridge *br, unsigned long val)
+{
+	return br_boolopt_toggle(br, BR_BOOLOPT_NO_LL_LEARN, !!val, NULL);
+}
+
+static ssize_t no_linklocal_learn_store(struct device *d,
+					struct device_attribute *attr,
+					const char *buf, size_t len)
+{
+	return store_bridge_parm(d, buf, len, set_no_linklocal_learn);
+}
+static DEVICE_ATTR_RW(no_linklocal_learn);
+
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 static ssize_t multicast_router_show(struct device *d,
 				     struct device_attribute *attr, char *buf)
@@ -841,6 +862,7 @@ static struct attribute *bridge_attrs[] = {
 	&dev_attr_gc_timer.attr,
 	&dev_attr_group_addr.attr,
 	&dev_attr_flush.attr,
+	&dev_attr_no_linklocal_learn.attr,
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 	&dev_attr_multicast_router.attr,
 	&dev_attr_multicast_snooping.attr,
-- 
cgit v1.2.3-59-g8ed1b


From cff478b9d9ccaee0de0e02700c63addf007b5d3c Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Mon, 26 Nov 2018 15:42:04 +0100
Subject: netns: add support of NETNSA_TARGET_NSID

Like it was done for link and address, add the ability to perform get/dump
in another netns by specifying a target nsid attribute.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/net_namespace.h |  1 +
 net/core/net_namespace.c           | 86 +++++++++++++++++++++++++++++++++-----
 2 files changed, 76 insertions(+), 11 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/net_namespace.h b/include/uapi/linux/net_namespace.h
index 0187c74d8889..0ed9dd61d32a 100644
--- a/include/uapi/linux/net_namespace.h
+++ b/include/uapi/linux/net_namespace.h
@@ -16,6 +16,7 @@ enum {
 	NETNSA_NSID,
 	NETNSA_PID,
 	NETNSA_FD,
+	NETNSA_TARGET_NSID,
 	__NETNSA_MAX,
 };
 
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index f8a5966b086c..885c54197e31 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -669,6 +669,7 @@ static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = {
 	[NETNSA_NSID]		= { .type = NLA_S32 },
 	[NETNSA_PID]		= { .type = NLA_U32 },
 	[NETNSA_FD]		= { .type = NLA_U32 },
+	[NETNSA_TARGET_NSID]	= { .type = NLA_S32 },
 };
 
 static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -780,9 +781,10 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh,
 		.seq = nlh->nlmsg_seq,
 		.cmd = RTM_NEWNSID,
 	};
+	struct net *peer, *target = net;
+	bool put_target = false;
 	struct nlattr *nla;
 	struct sk_buff *msg;
-	struct net *peer;
 	int err;
 
 	err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX,
@@ -806,13 +808,27 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh,
 		return PTR_ERR(peer);
 	}
 
+	if (tb[NETNSA_TARGET_NSID]) {
+		int id = nla_get_s32(tb[NETNSA_TARGET_NSID]);
+
+		target = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, id);
+		if (IS_ERR(target)) {
+			NL_SET_BAD_ATTR(extack, tb[NETNSA_TARGET_NSID]);
+			NL_SET_ERR_MSG(extack,
+				       "Target netns reference is invalid");
+			err = PTR_ERR(target);
+			goto out;
+		}
+		put_target = true;
+	}
+
 	msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL);
 	if (!msg) {
 		err = -ENOMEM;
 		goto out;
 	}
 
-	fillargs.nsid = peernet2id(net, peer);
+	fillargs.nsid = peernet2id(target, peer);
 	err = rtnl_net_fill(msg, &fillargs);
 	if (err < 0)
 		goto err_out;
@@ -823,15 +839,19 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh,
 err_out:
 	nlmsg_free(msg);
 out:
+	if (put_target)
+		put_net(target);
 	put_net(peer);
 	return err;
 }
 
 struct rtnl_net_dump_cb {
+	struct net *tgt_net;
 	struct sk_buff *skb;
 	struct net_fill_args fillargs;
 	int idx;
 	int s_idx;
+	bool put_tgt_net;
 };
 
 static int rtnl_net_dumpid_one(int id, void *peer, void *data)
@@ -852,10 +872,50 @@ cont:
 	return 0;
 }
 
+static int rtnl_valid_dump_net_req(const struct nlmsghdr *nlh, struct sock *sk,
+				   struct rtnl_net_dump_cb *net_cb,
+				   struct netlink_callback *cb)
+{
+	struct netlink_ext_ack *extack = cb->extack;
+	struct nlattr *tb[NETNSA_MAX + 1];
+	int err, i;
+
+	err = nlmsg_parse_strict(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX,
+				 rtnl_net_policy, extack);
+	if (err < 0)
+		return err;
+
+	for (i = 0; i <= NETNSA_MAX; i++) {
+		if (!tb[i])
+			continue;
+
+		if (i == NETNSA_TARGET_NSID) {
+			struct net *net;
+
+			net = rtnl_get_net_ns_capable(sk, nla_get_s32(tb[i]));
+			if (IS_ERR(net)) {
+				NL_SET_BAD_ATTR(extack, tb[i]);
+				NL_SET_ERR_MSG(extack,
+					       "Invalid target network namespace id");
+				return PTR_ERR(net);
+			}
+			net_cb->tgt_net = net;
+			net_cb->put_tgt_net = true;
+		} else {
+			NL_SET_BAD_ATTR(extack, tb[i]);
+			NL_SET_ERR_MSG(extack,
+				       "Unsupported attribute in dump request");
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
 static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
 {
-	struct net *net = sock_net(skb->sk);
 	struct rtnl_net_dump_cb net_cb = {
+		.tgt_net = sock_net(skb->sk),
 		.skb = skb,
 		.fillargs = {
 			.portid = NETLINK_CB(cb->skb).portid,
@@ -866,19 +926,23 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
 		.idx = 0,
 		.s_idx = cb->args[0],
 	};
+	int err = 0;
 
-	if (cb->strict_check &&
-	    nlmsg_attrlen(cb->nlh, sizeof(struct rtgenmsg))) {
-			NL_SET_ERR_MSG(cb->extack, "Unknown data in network namespace id dump request");
-			return -EINVAL;
+	if (cb->strict_check) {
+		err = rtnl_valid_dump_net_req(cb->nlh, skb->sk, &net_cb, cb);
+		if (err < 0)
+			goto end;
 	}
 
-	spin_lock_bh(&net->nsid_lock);
-	idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb);
-	spin_unlock_bh(&net->nsid_lock);
+	spin_lock_bh(&net_cb.tgt_net->nsid_lock);
+	idr_for_each(&net_cb.tgt_net->netns_ids, rtnl_net_dumpid_one, &net_cb);
+	spin_unlock_bh(&net_cb.tgt_net->nsid_lock);
 
 	cb->args[0] = net_cb.idx;
-	return skb->len;
+end:
+	if (net_cb.put_tgt_net)
+		put_net(net_cb.tgt_net);
+	return err < 0 ? err : skb->len;
 }
 
 static void rtnl_net_notifyid(struct net *net, int cmd, int id)
-- 
cgit v1.2.3-59-g8ed1b


From 288f06a001eb6265122c620295b68a0dd53d1482 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Mon, 26 Nov 2018 15:42:06 +0100
Subject: netns: enable to dump full nsid translation table

Like the previous patch, the goal is to ease to convert nsids from one
netns to another netns.
A new attribute (NETNSA_CURRENT_NSID) is added to the kernel answer when
NETNSA_TARGET_NSID is provided, thus the user can easily convert nsids.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/net_namespace.h |  1 +
 net/core/net_namespace.c           | 32 ++++++++++++++++++++++++++------
 2 files changed, 27 insertions(+), 6 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/net_namespace.h b/include/uapi/linux/net_namespace.h
index 0ed9dd61d32a..9f9956809565 100644
--- a/include/uapi/linux/net_namespace.h
+++ b/include/uapi/linux/net_namespace.h
@@ -17,6 +17,7 @@ enum {
 	NETNSA_PID,
 	NETNSA_FD,
 	NETNSA_TARGET_NSID,
+	NETNSA_CURRENT_NSID,
 	__NETNSA_MAX,
 };
 
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index dd25fb22ad45..05b23b285058 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -736,6 +736,7 @@ static int rtnl_net_get_size(void)
 {
 	return NLMSG_ALIGN(sizeof(struct rtgenmsg))
 	       + nla_total_size(sizeof(s32)) /* NETNSA_NSID */
+	       + nla_total_size(sizeof(s32)) /* NETNSA_CURRENT_NSID */
 	       ;
 }
 
@@ -745,6 +746,8 @@ struct net_fill_args {
 	int flags;
 	int cmd;
 	int nsid;
+	bool add_ref;
+	int ref_nsid;
 };
 
 static int rtnl_net_fill(struct sk_buff *skb, struct net_fill_args *args)
@@ -763,6 +766,10 @@ static int rtnl_net_fill(struct sk_buff *skb, struct net_fill_args *args)
 	if (nla_put_s32(skb, NETNSA_NSID, args->nsid))
 		goto nla_put_failure;
 
+	if (args->add_ref &&
+	    nla_put_s32(skb, NETNSA_CURRENT_NSID, args->ref_nsid))
+		goto nla_put_failure;
+
 	nlmsg_end(skb, nlh);
 	return 0;
 
@@ -782,7 +789,6 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh,
 		.cmd = RTM_NEWNSID,
 	};
 	struct net *peer, *target = net;
-	bool put_target = false;
 	struct nlattr *nla;
 	struct sk_buff *msg;
 	int err;
@@ -824,7 +830,8 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh,
 			err = PTR_ERR(target);
 			goto out;
 		}
-		put_target = true;
+		fillargs.add_ref = true;
+		fillargs.ref_nsid = peernet2id(net, peer);
 	}
 
 	msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL);
@@ -844,7 +851,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh,
 err_out:
 	nlmsg_free(msg);
 out:
-	if (put_target)
+	if (fillargs.add_ref)
 		put_net(target);
 	put_net(peer);
 	return err;
@@ -852,11 +859,11 @@ out:
 
 struct rtnl_net_dump_cb {
 	struct net *tgt_net;
+	struct net *ref_net;
 	struct sk_buff *skb;
 	struct net_fill_args fillargs;
 	int idx;
 	int s_idx;
-	bool put_tgt_net;
 };
 
 static int rtnl_net_dumpid_one(int id, void *peer, void *data)
@@ -868,6 +875,8 @@ static int rtnl_net_dumpid_one(int id, void *peer, void *data)
 		goto cont;
 
 	net_cb->fillargs.nsid = id;
+	if (net_cb->fillargs.add_ref)
+		net_cb->fillargs.ref_nsid = __peernet2id(net_cb->ref_net, peer);
 	ret = rtnl_net_fill(net_cb->skb, &net_cb->fillargs);
 	if (ret < 0)
 		return ret;
@@ -904,8 +913,9 @@ static int rtnl_valid_dump_net_req(const struct nlmsghdr *nlh, struct sock *sk,
 					       "Invalid target network namespace id");
 				return PTR_ERR(net);
 			}
+			net_cb->fillargs.add_ref = true;
+			net_cb->ref_net = net_cb->tgt_net;
 			net_cb->tgt_net = net;
-			net_cb->put_tgt_net = true;
 		} else {
 			NL_SET_BAD_ATTR(extack, tb[i]);
 			NL_SET_ERR_MSG(extack,
@@ -940,12 +950,22 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
 	}
 
 	spin_lock_bh(&net_cb.tgt_net->nsid_lock);
+	if (net_cb.fillargs.add_ref &&
+	    !net_eq(net_cb.ref_net, net_cb.tgt_net) &&
+	    !spin_trylock_bh(&net_cb.ref_net->nsid_lock)) {
+		spin_unlock_bh(&net_cb.tgt_net->nsid_lock);
+		err = -EAGAIN;
+		goto end;
+	}
 	idr_for_each(&net_cb.tgt_net->netns_ids, rtnl_net_dumpid_one, &net_cb);
+	if (net_cb.fillargs.add_ref &&
+	    !net_eq(net_cb.ref_net, net_cb.tgt_net))
+		spin_unlock_bh(&net_cb.ref_net->nsid_lock);
 	spin_unlock_bh(&net_cb.tgt_net->nsid_lock);
 
 	cb->args[0] = net_cb.idx;
 end:
-	if (net_cb.put_tgt_net)
+	if (net_cb.fillargs.add_ref)
 		put_net(net_cb.tgt_net);
 	return err < 0 ? err : skb->len;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 7246d8ed4dcce23f7509949a77be15fa9f0e3d28 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Mon, 26 Nov 2018 14:16:17 -0800
Subject: bpf: helper to pop data from messages

This adds a BPF SK_MSG program helper so that we can pop data from a
msg. We use this to pop metadata from a previous push data call.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h |  16 ++++-
 net/core/filter.c        | 171 +++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_bpf.c       |  17 ++++-
 net/tls/tls_sw.c         |  11 ++-
 4 files changed, 209 insertions(+), 6 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 23e2031a43d4..597afdbc1ab9 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2268,6 +2268,19 @@ union bpf_attr {
  *
  *	Return
  *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 pop, u64 flags)
+ *	 Description
+ *		Will remove *pop* bytes from a *msg* starting at byte *start*.
+ *		This may result in **ENOMEM** errors under certain situations if
+ *		an allocation and copy are required due to a full ring buffer.
+ *		However, the helper will try to avoid doing the allocation
+ *		if possible. Other errors can occur if input parameters are
+ *		invalid either due to *start* byte not being valid part of msg
+ *		payload and/or *pop* value being to large.
+ *
+ *	Return
+ *		0 on success, or a negative erro in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2360,7 +2373,8 @@ union bpf_attr {
 	FN(map_push_elem),		\
 	FN(map_pop_elem),		\
 	FN(map_peek_elem),		\
-	FN(msg_push_data),
+	FN(msg_push_data),		\
+	FN(msg_pop_data),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/net/core/filter.c b/net/core/filter.c
index f50ea971f7a9..bd0df75dc7b6 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2425,6 +2425,174 @@ static const struct bpf_func_proto bpf_msg_push_data_proto = {
 	.arg4_type	= ARG_ANYTHING,
 };
 
+static void sk_msg_shift_left(struct sk_msg *msg, int i)
+{
+	int prev;
+
+	do {
+		prev = i;
+		sk_msg_iter_var_next(i);
+		msg->sg.data[prev] = msg->sg.data[i];
+	} while (i != msg->sg.end);
+
+	sk_msg_iter_prev(msg, end);
+}
+
+static void sk_msg_shift_right(struct sk_msg *msg, int i)
+{
+	struct scatterlist tmp, sge;
+
+	sk_msg_iter_next(msg, end);
+	sge = sk_msg_elem_cpy(msg, i);
+	sk_msg_iter_var_next(i);
+	tmp = sk_msg_elem_cpy(msg, i);
+
+	while (i != msg->sg.end) {
+		msg->sg.data[i] = sge;
+		sk_msg_iter_var_next(i);
+		sge = tmp;
+		tmp = sk_msg_elem_cpy(msg, i);
+	}
+}
+
+BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
+	   u32, len, u64, flags)
+{
+	u32 i = 0, l, space, offset = 0;
+	u64 last = start + len;
+	int pop;
+
+	if (unlikely(flags))
+		return -EINVAL;
+
+	/* First find the starting scatterlist element */
+	i = msg->sg.start;
+	do {
+		l = sk_msg_elem(msg, i)->length;
+
+		if (start < offset + l)
+			break;
+		offset += l;
+		sk_msg_iter_var_next(i);
+	} while (i != msg->sg.end);
+
+	/* Bounds checks: start and pop must be inside message */
+	if (start >= offset + l || last >= msg->sg.size)
+		return -EINVAL;
+
+	space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);
+
+	pop = len;
+	/* --------------| offset
+	 * -| start      |-------- len -------|
+	 *
+	 *  |----- a ----|-------- pop -------|----- b ----|
+	 *  |______________________________________________| length
+	 *
+	 *
+	 * a:   region at front of scatter element to save
+	 * b:   region at back of scatter element to save when length > A + pop
+	 * pop: region to pop from element, same as input 'pop' here will be
+	 *      decremented below per iteration.
+	 *
+	 * Two top-level cases to handle when start != offset, first B is non
+	 * zero and second B is zero corresponding to when a pop includes more
+	 * than one element.
+	 *
+	 * Then if B is non-zero AND there is no space allocate space and
+	 * compact A, B regions into page. If there is space shift ring to
+	 * the rigth free'ing the next element in ring to place B, leaving
+	 * A untouched except to reduce length.
+	 */
+	if (start != offset) {
+		struct scatterlist *nsge, *sge = sk_msg_elem(msg, i);
+		int a = start;
+		int b = sge->length - pop - a;
+
+		sk_msg_iter_var_next(i);
+
+		if (pop < sge->length - a) {
+			if (space) {
+				sge->length = a;
+				sk_msg_shift_right(msg, i);
+				nsge = sk_msg_elem(msg, i);
+				get_page(sg_page(sge));
+				sg_set_page(nsge,
+					    sg_page(sge),
+					    b, sge->offset + pop + a);
+			} else {
+				struct page *page, *orig;
+				u8 *to, *from;
+
+				page = alloc_pages(__GFP_NOWARN |
+						   __GFP_COMP   | GFP_ATOMIC,
+						   get_order(a + b));
+				if (unlikely(!page))
+					return -ENOMEM;
+
+				sge->length = a;
+				orig = sg_page(sge);
+				from = sg_virt(sge);
+				to = page_address(page);
+				memcpy(to, from, a);
+				memcpy(to + a, from + a + pop, b);
+				sg_set_page(sge, page, a + b, 0);
+				put_page(orig);
+			}
+			pop = 0;
+		} else if (pop >= sge->length - a) {
+			sge->length = a;
+			pop -= (sge->length - a);
+		}
+	}
+
+	/* From above the current layout _must_ be as follows,
+	 *
+	 * -| offset
+	 * -| start
+	 *
+	 *  |---- pop ---|---------------- b ------------|
+	 *  |____________________________________________| length
+	 *
+	 * Offset and start of the current msg elem are equal because in the
+	 * previous case we handled offset != start and either consumed the
+	 * entire element and advanced to the next element OR pop == 0.
+	 *
+	 * Two cases to handle here are first pop is less than the length
+	 * leaving some remainder b above. Simply adjust the element's layout
+	 * in this case. Or pop >= length of the element so that b = 0. In this
+	 * case advance to next element decrementing pop.
+	 */
+	while (pop) {
+		struct scatterlist *sge = sk_msg_elem(msg, i);
+
+		if (pop < sge->length) {
+			sge->length -= pop;
+			sge->offset += pop;
+			pop = 0;
+		} else {
+			pop -= sge->length;
+			sk_msg_shift_left(msg, i);
+		}
+		sk_msg_iter_var_next(i);
+	}
+
+	sk_mem_uncharge(msg->sk, len - pop);
+	msg->sg.size -= (len - pop);
+	sk_msg_compute_data_pointers(msg);
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_msg_pop_data_proto = {
+	.func		= bpf_msg_pop_data,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
 {
 	return task_get_classid(skb);
@@ -5098,6 +5266,7 @@ bool bpf_helper_changes_pkt_data(void *func)
 	    func == bpf_xdp_adjust_meta ||
 	    func == bpf_msg_pull_data ||
 	    func == bpf_msg_push_data ||
+	    func == bpf_msg_pop_data ||
 	    func == bpf_xdp_adjust_tail ||
 #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
 	    func == bpf_lwt_seg6_store_bytes ||
@@ -5394,6 +5563,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_msg_pull_data_proto;
 	case BPF_FUNC_msg_push_data:
 		return &bpf_msg_push_data_proto;
+	case BPF_FUNC_msg_pop_data:
+		return &bpf_msg_pop_data_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 3b45fe530f91..a47c1cdf90fc 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -289,12 +289,23 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
 {
 	bool cork = false, enospc = msg->sg.start == msg->sg.end;
 	struct sock *sk_redir;
-	u32 tosend;
+	u32 tosend, delta = 0;
 	int ret;
 
 more_data:
-	if (psock->eval == __SK_NONE)
+	if (psock->eval == __SK_NONE) {
+		/* Track delta in msg size to add/subtract it on SK_DROP from
+		 * returned to user copied size. This ensures user doesn't
+		 * get a positive return code with msg_cut_data and SK_DROP
+		 * verdict.
+		 */
+		delta = msg->sg.size;
 		psock->eval = sk_psock_msg_verdict(sk, psock, msg);
+		if (msg->sg.size < delta)
+			delta -= msg->sg.size;
+		else
+			delta = 0;
+	}
 
 	if (msg->cork_bytes &&
 	    msg->cork_bytes > msg->sg.size && !enospc) {
@@ -350,7 +361,7 @@ more_data:
 	default:
 		sk_msg_free_partial(sk, msg, tosend);
 		sk_msg_apply_bytes(psock, tosend);
-		*copied -= tosend;
+		*copied -= (tosend + delta);
 		return -EACCES;
 	}
 
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 7b1af8b59cd2..d4ecc66464e6 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -687,6 +687,7 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk,
 	struct sock *sk_redir;
 	struct tls_rec *rec;
 	int err = 0, send;
+	u32 delta = 0;
 	bool enospc;
 
 	psock = sk_psock_get(sk);
@@ -694,8 +695,14 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk,
 		return tls_push_record(sk, flags, record_type);
 more_data:
 	enospc = sk_msg_full(msg);
-	if (psock->eval == __SK_NONE)
+	if (psock->eval == __SK_NONE) {
+		delta = msg->sg.size;
 		psock->eval = sk_psock_msg_verdict(sk, psock, msg);
+		if (delta < msg->sg.size)
+			delta -= msg->sg.size;
+		else
+			delta = 0;
+	}
 	if (msg->cork_bytes && msg->cork_bytes > msg->sg.size &&
 	    !enospc && !full_record) {
 		err = -ENOSPC;
@@ -743,7 +750,7 @@ more_data:
 			msg->apply_bytes -= send;
 		if (msg->sg.size == 0)
 			tls_free_open_rec(sk);
-		*copied -= send;
+		*copied -= (send + delta);
 		err = -EACCES;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From bfc5d839184f53cc16d551873f9254f2d4d493be Mon Sep 17 00:00:00 2001
From: Mark Bloch <markb@mellanox.com>
Date: Tue, 20 Nov 2018 20:31:08 +0200
Subject: RDMA/mlx5: Attach a DEVX counter via raw flow creation

Allow a user to attach a DEVX counter via mlx5 raw flow creation. In order
to attach a counter we introduce a new attribute:

MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX

A counter can be attached to multiple flow steering rules.

Signed-off-by: Mark Bloch <markb@mellanox.com>
Reviewed-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/devx.c        | 15 +++++++++++++++
 drivers/infiniband/hw/mlx5/flow.c        | 17 ++++++++++++++++-
 drivers/infiniband/hw/mlx5/main.c        | 29 ++++++++++++++++++++---------
 drivers/infiniband/hw/mlx5/mlx5_ib.h     |  6 ++++--
 include/uapi/rdma/mlx5_user_ioctl_cmds.h |  1 +
 5 files changed, 56 insertions(+), 12 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index 9dfc8593ad43..0aa2ee732eaa 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -107,6 +107,21 @@ bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id, int *dest_type)
 	}
 }
 
+bool mlx5_ib_devx_is_flow_counter(void *obj, u32 *counter_id)
+{
+	struct devx_obj *devx_obj = obj;
+	u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, devx_obj->dinbox, opcode);
+
+	if (opcode == MLX5_CMD_OP_DEALLOC_FLOW_COUNTER) {
+		*counter_id = MLX5_GET(dealloc_flow_counter_in,
+				       devx_obj->dinbox,
+				       flow_counter_id);
+		return true;
+	}
+
+	return false;
+}
+
 /*
  * As the obj_id in the firmware is not globally unique the object type
  * must be considered upon checking for a valid object id.
diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c
index 0144ac5fde1a..793afa404442 100644
--- a/drivers/infiniband/hw/mlx5/flow.c
+++ b/drivers/infiniband/hw/mlx5/flow.c
@@ -77,6 +77,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
 		uverbs_attr_get_uobject(attrs, MLX5_IB_ATTR_CREATE_FLOW_HANDLE);
 	struct mlx5_ib_dev *dev = to_mdev(uobj->context->device);
 	int len, ret, i;
+	u32 counter_id = 0;
 
 	if (!capable(CAP_NET_RAW))
 		return -EPERM;
@@ -128,6 +129,15 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
 		dest_type = MLX5_FLOW_DESTINATION_TYPE_PORT;
 	}
 
+	len = uverbs_attr_get_uobjs_arr(attrs,
+		MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX, &arr_flow_actions);
+	if (len) {
+		devx_obj = arr_flow_actions[0]->object;
+
+		if (!mlx5_ib_devx_is_flow_counter(devx_obj, &counter_id))
+			return -EINVAL;
+		flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
+	}
 	if (dev->rep)
 		return -ENOTSUPP;
 
@@ -164,6 +174,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
 	}
 
 	flow_handler = mlx5_ib_raw_fs_rule_add(dev, fs_matcher, &flow_act,
+					       counter_id,
 					       cmd_in, inlen,
 					       dest_id, dest_type);
 	if (IS_ERR(flow_handler)) {
@@ -524,7 +535,11 @@ DECLARE_UVERBS_NAMED_METHOD(
 			     UA_OPTIONAL),
 	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_FLOW_TAG,
 			   UVERBS_ATTR_TYPE(u32),
-			   UA_OPTIONAL));
+			   UA_OPTIONAL),
+	UVERBS_ATTR_IDRS_ARR(MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX,
+			     MLX5_IB_OBJECT_DEVX_OBJ,
+			     UVERBS_ACCESS_READ, 1, 1,
+			     UA_OPTIONAL));
 
 DECLARE_UVERBS_NAMED_METHOD_DESTROY(
 	MLX5_IB_METHOD_DESTROY_FLOW,
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 0707ede7dcdd..5236169c42d0 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -3713,7 +3713,8 @@ _create_raw_flow_rule(struct mlx5_ib_dev *dev,
 		      struct mlx5_flow_destination *dst,
 		      struct mlx5_ib_flow_matcher  *fs_matcher,
 		      struct mlx5_flow_act *flow_act,
-		      void *cmd_in, int inlen)
+		      void *cmd_in, int inlen,
+		      int dst_num)
 {
 	struct mlx5_ib_flow_handler *handler;
 	struct mlx5_flow_spec *spec;
@@ -3735,7 +3736,7 @@ _create_raw_flow_rule(struct mlx5_ib_dev *dev,
 	spec->match_criteria_enable = fs_matcher->match_criteria_enable;
 
 	handler->rule = mlx5_add_flow_rules(ft, spec,
-					    flow_act, dst, 1);
+					    flow_act, dst, dst_num);
 
 	if (IS_ERR(handler->rule)) {
 		err = PTR_ERR(handler->rule);
@@ -3798,12 +3799,14 @@ struct mlx5_ib_flow_handler *
 mlx5_ib_raw_fs_rule_add(struct mlx5_ib_dev *dev,
 			struct mlx5_ib_flow_matcher *fs_matcher,
 			struct mlx5_flow_act *flow_act,
+			u32 counter_id,
 			void *cmd_in, int inlen, int dest_id,
 			int dest_type)
 {
 	struct mlx5_flow_destination *dst;
 	struct mlx5_ib_flow_prio *ft_prio;
 	struct mlx5_ib_flow_handler *handler;
+	int dst_num = 0;
 	bool mcast;
 	int err;
 
@@ -3813,7 +3816,7 @@ mlx5_ib_raw_fs_rule_add(struct mlx5_ib_dev *dev,
 	if (fs_matcher->priority > MLX5_IB_FLOW_LAST_PRIO)
 		return ERR_PTR(-ENOMEM);
 
-	dst = kzalloc(sizeof(*dst), GFP_KERNEL);
+	dst = kzalloc(sizeof(*dst) * 2, GFP_KERNEL);
 	if (!dst)
 		return ERR_PTR(-ENOMEM);
 
@@ -3827,20 +3830,28 @@ mlx5_ib_raw_fs_rule_add(struct mlx5_ib_dev *dev,
 	}
 
 	if (dest_type == MLX5_FLOW_DESTINATION_TYPE_TIR) {
-		dst->type = dest_type;
-		dst->tir_num = dest_id;
+		dst[dst_num].type = dest_type;
+		dst[dst_num].tir_num = dest_id;
 		flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
 	} else if (dest_type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) {
-		dst->type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM;
-		dst->ft_num = dest_id;
+		dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM;
+		dst[dst_num].ft_num = dest_id;
 		flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
 	} else {
-		dst->type = MLX5_FLOW_DESTINATION_TYPE_PORT;
+		dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_PORT;
 		flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW;
 	}
 
+	dst_num++;
+
+	if (flow_act->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
+		dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
+		dst[dst_num].counter_id = counter_id;
+		dst_num++;
+	}
+
 	handler = _create_raw_flow_rule(dev, ft_prio, dst, fs_matcher, flow_act,
-					cmd_in, inlen);
+					cmd_in, inlen, dst_num);
 
 	if (IS_ERR(handler)) {
 		err = PTR_ERR(handler);
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 9b434246d4e3..a2b35a1a5031 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1266,9 +1266,11 @@ extern const struct uapi_definition mlx5_ib_devx_defs[];
 extern const struct uapi_definition mlx5_ib_flow_defs[];
 struct mlx5_ib_flow_handler *mlx5_ib_raw_fs_rule_add(
 	struct mlx5_ib_dev *dev, struct mlx5_ib_flow_matcher *fs_matcher,
-	struct mlx5_flow_act *flow_act, void *cmd_in, int inlen,
-	int dest_id, int dest_type);
+	struct mlx5_flow_act *flow_act, u32 counter_id,
+	void *cmd_in, int inlen, int dest_id, int dest_type);
 bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id, int *dest_type);
+bool mlx5_ib_devx_is_flow_counter(void *obj, u32 *counter_id);
+int mlx5_ib_get_flow_trees(const struct uverbs_object_tree_def **root);
 void mlx5_ib_destroy_flow_action_raw(struct mlx5_ib_flow_action *maction);
 #else
 static inline int
diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h
index 408e220034de..b8d121d457f1 100644
--- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h
+++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h
@@ -158,6 +158,7 @@ enum mlx5_ib_create_flow_attrs {
 	MLX5_IB_ATTR_CREATE_FLOW_MATCHER,
 	MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS,
 	MLX5_IB_ATTR_CREATE_FLOW_TAG,
+	MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX,
 };
 
 enum mlx5_ib_destoy_flow_attrs {
-- 
cgit v1.2.3-59-g8ed1b


From 1584f16ca96ef124aad79efa3303cff5f3530e2c Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Wed, 28 Nov 2018 15:09:25 -0800
Subject: drm/v3d: Add support for submitting jobs to the TFU.

The TFU can copy from raster, UIF, and SAND input images to UIF output
images, with optional mipmap generation.  This will certainly be
useful for media EGL image input, but is also useful immediately for
mipmap generation without bogging the V3D core down.

For now we only run the queue 1 job deep, and don't have any hang
recovery (though I don't think we should need it, with TFU).  Queuing
multiple jobs in the HW will require synchronizing the YUV coefficient
regs updates since they don't get FIFOed with the job.

v2: Change the ioctl to IOW instead of IOWR, always set COEF0, explain
    why TFU is AUTH, clarify the syncing docs, drop the unused TFU
    interrupt regs (you're expected to use the hub's), don't take
    &bo->base for NULL bos.
v3: Fix a little whitespace alignment (noticed by checkpatch), rebase
    on drm_sched_job_cleanup() changes.

Signed-off-by: Eric Anholt <eric@anholt.net>
Reviewed-by: Dave Emett <david.emett@broadcom.com> (v2)
Link: https://patchwork.freedesktop.org/patch/264607/
---
 drivers/gpu/drm/v3d/v3d_drv.c   |  15 +++-
 drivers/gpu/drm/v3d/v3d_drv.h   |  32 ++++++--
 drivers/gpu/drm/v3d/v3d_gem.c   | 178 +++++++++++++++++++++++++++++++++++-----
 drivers/gpu/drm/v3d/v3d_irq.c   |  12 ++-
 drivers/gpu/drm/v3d/v3d_regs.h  |  49 +++++++++++
 drivers/gpu/drm/v3d/v3d_sched.c | 147 ++++++++++++++++++++++++++++-----
 drivers/gpu/drm/v3d/v3d_trace.h |  20 +++++
 include/uapi/drm/v3d_drm.h      |  25 ++++++
 8 files changed, 427 insertions(+), 51 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/gpu/drm/v3d/v3d_drv.c b/drivers/gpu/drm/v3d/v3d_drv.c
index 2a85fa68ffea..f0afcec72c34 100644
--- a/drivers/gpu/drm/v3d/v3d_drv.c
+++ b/drivers/gpu/drm/v3d/v3d_drv.c
@@ -112,10 +112,15 @@ static int v3d_get_param_ioctl(struct drm_device *dev, void *data,
 		return 0;
 	}
 
-	/* Any params that aren't just register reads would go here. */
 
-	DRM_DEBUG("Unknown parameter %d\n", args->param);
-	return -EINVAL;
+	switch (args->param) {
+	case DRM_V3D_PARAM_SUPPORTS_TFU:
+		args->value = 1;
+		return 0;
+	default:
+		DRM_DEBUG("Unknown parameter %d\n", args->param);
+		return -EINVAL;
+	}
 }
 
 static int
@@ -170,7 +175,8 @@ static const struct file_operations v3d_drm_fops = {
 /* DRM_AUTH is required on SUBMIT_CL for now, while we don't have GMP
  * protection between clients.  Note that render nodes would be be
  * able to submit CLs that could access BOs from clients authenticated
- * with the master node.
+ * with the master node.  The TFU doesn't use the GMP, so it would
+ * need to stay DRM_AUTH until we do buffer size/offset validation.
  */
 static const struct drm_ioctl_desc v3d_drm_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(V3D_SUBMIT_CL, v3d_submit_cl_ioctl, DRM_RENDER_ALLOW | DRM_AUTH),
@@ -179,6 +185,7 @@ static const struct drm_ioctl_desc v3d_drm_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(V3D_MMAP_BO, v3d_mmap_bo_ioctl, DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(V3D_GET_PARAM, v3d_get_param_ioctl, DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(V3D_GET_BO_OFFSET, v3d_get_bo_offset_ioctl, DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(V3D_SUBMIT_TFU, v3d_submit_tfu_ioctl, DRM_RENDER_ALLOW | DRM_AUTH),
 };
 
 static const struct vm_operations_struct v3d_vm_ops = {
diff --git a/drivers/gpu/drm/v3d/v3d_drv.h b/drivers/gpu/drm/v3d/v3d_drv.h
index cbe5be0c47eb..dcb772a19191 100644
--- a/drivers/gpu/drm/v3d/v3d_drv.h
+++ b/drivers/gpu/drm/v3d/v3d_drv.h
@@ -7,19 +7,18 @@
 #include <drm/drm_encoder.h>
 #include <drm/drm_gem.h>
 #include <drm/gpu_scheduler.h>
+#include "uapi/drm/v3d_drm.h"
 
 #define GMP_GRANULARITY (128 * 1024)
 
-/* Enum for each of the V3D queues.  We maintain various queue
- * tracking as an array because at some point we'll want to support
- * the TFU (texture formatting unit) as another queue.
- */
+/* Enum for each of the V3D queues. */
 enum v3d_queue {
 	V3D_BIN,
 	V3D_RENDER,
+	V3D_TFU,
 };
 
-#define V3D_MAX_QUEUES (V3D_RENDER + 1)
+#define V3D_MAX_QUEUES (V3D_TFU + 1)
 
 struct v3d_queue_state {
 	struct drm_gpu_scheduler sched;
@@ -68,6 +67,7 @@ struct v3d_dev {
 
 	struct v3d_exec_info *bin_job;
 	struct v3d_exec_info *render_job;
+	struct v3d_tfu_job *tfu_job;
 
 	struct v3d_queue_state queue[V3D_MAX_QUEUES];
 
@@ -218,6 +218,25 @@ struct v3d_exec_info {
 	u32 qma, qms, qts;
 };
 
+struct v3d_tfu_job {
+	struct drm_sched_job base;
+
+	struct drm_v3d_submit_tfu args;
+
+	/* An optional fence userspace can pass in for the job to depend on. */
+	struct dma_fence *in_fence;
+
+	/* v3d fence to be signaled by IRQ handler when the job is complete. */
+	struct dma_fence *done_fence;
+
+	struct v3d_dev *v3d;
+
+	struct kref refcount;
+
+	/* This is the array of BOs that were looked up at the start of exec. */
+	struct v3d_bo *bo[4];
+};
+
 /**
  * _wait_for - magic (register) wait macro
  *
@@ -281,9 +300,12 @@ int v3d_gem_init(struct drm_device *dev);
 void v3d_gem_destroy(struct drm_device *dev);
 int v3d_submit_cl_ioctl(struct drm_device *dev, void *data,
 			struct drm_file *file_priv);
+int v3d_submit_tfu_ioctl(struct drm_device *dev, void *data,
+			 struct drm_file *file_priv);
 int v3d_wait_bo_ioctl(struct drm_device *dev, void *data,
 		      struct drm_file *file_priv);
 void v3d_exec_put(struct v3d_exec_info *exec);
+void v3d_tfu_job_put(struct v3d_tfu_job *exec);
 void v3d_reset(struct v3d_dev *v3d);
 void v3d_invalidate_caches(struct v3d_dev *v3d);
 void v3d_flush_caches(struct v3d_dev *v3d);
diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c
index 1e8947c7d954..6abe2fa43306 100644
--- a/drivers/gpu/drm/v3d/v3d_gem.c
+++ b/drivers/gpu/drm/v3d/v3d_gem.c
@@ -207,26 +207,27 @@ v3d_flush_caches(struct v3d_dev *v3d)
 }
 
 static void
-v3d_attach_object_fences(struct v3d_exec_info *exec)
+v3d_attach_object_fences(struct v3d_bo **bos, int bo_count,
+			 struct dma_fence *fence)
 {
-	struct dma_fence *out_fence = exec->render_done_fence;
 	int i;
 
-	for (i = 0; i < exec->bo_count; i++) {
+	for (i = 0; i < bo_count; i++) {
 		/* XXX: Use shared fences for read-only objects. */
-		reservation_object_add_excl_fence(exec->bo[i]->resv, out_fence);
+		reservation_object_add_excl_fence(bos[i]->resv, fence);
 	}
 }
 
 static void
 v3d_unlock_bo_reservations(struct drm_device *dev,
-			   struct v3d_exec_info *exec,
+			   struct v3d_bo **bos,
+			   int bo_count,
 			   struct ww_acquire_ctx *acquire_ctx)
 {
 	int i;
 
-	for (i = 0; i < exec->bo_count; i++)
-		ww_mutex_unlock(&exec->bo[i]->resv->lock);
+	for (i = 0; i < bo_count; i++)
+		ww_mutex_unlock(&bos[i]->resv->lock);
 
 	ww_acquire_fini(acquire_ctx);
 }
@@ -240,7 +241,8 @@ v3d_unlock_bo_reservations(struct drm_device *dev,
  */
 static int
 v3d_lock_bo_reservations(struct drm_device *dev,
-			 struct v3d_exec_info *exec,
+			 struct v3d_bo **bos,
+			 int bo_count,
 			 struct ww_acquire_ctx *acquire_ctx)
 {
 	int contended_lock = -1;
@@ -250,7 +252,7 @@ v3d_lock_bo_reservations(struct drm_device *dev,
 
 retry:
 	if (contended_lock != -1) {
-		struct v3d_bo *bo = exec->bo[contended_lock];
+		struct v3d_bo *bo = bos[contended_lock];
 
 		ret = ww_mutex_lock_slow_interruptible(&bo->resv->lock,
 						       acquire_ctx);
@@ -260,20 +262,20 @@ retry:
 		}
 	}
 
-	for (i = 0; i < exec->bo_count; i++) {
+	for (i = 0; i < bo_count; i++) {
 		if (i == contended_lock)
 			continue;
 
-		ret = ww_mutex_lock_interruptible(&exec->bo[i]->resv->lock,
+		ret = ww_mutex_lock_interruptible(&bos[i]->resv->lock,
 						  acquire_ctx);
 		if (ret) {
 			int j;
 
 			for (j = 0; j < i; j++)
-				ww_mutex_unlock(&exec->bo[j]->resv->lock);
+				ww_mutex_unlock(&bos[j]->resv->lock);
 
 			if (contended_lock != -1 && contended_lock >= i) {
-				struct v3d_bo *bo = exec->bo[contended_lock];
+				struct v3d_bo *bo = bos[contended_lock];
 
 				ww_mutex_unlock(&bo->resv->lock);
 			}
@@ -293,10 +295,11 @@ retry:
 	/* Reserve space for our shared (read-only) fence references,
 	 * before we commit the CL to the hardware.
 	 */
-	for (i = 0; i < exec->bo_count; i++) {
-		ret = reservation_object_reserve_shared(exec->bo[i]->resv, 1);
+	for (i = 0; i < bo_count; i++) {
+		ret = reservation_object_reserve_shared(bos[i]->resv, 1);
 		if (ret) {
-			v3d_unlock_bo_reservations(dev, exec, acquire_ctx);
+			v3d_unlock_bo_reservations(dev, bos, bo_count,
+						   acquire_ctx);
 			return ret;
 		}
 	}
@@ -419,6 +422,33 @@ void v3d_exec_put(struct v3d_exec_info *exec)
 	kref_put(&exec->refcount, v3d_exec_cleanup);
 }
 
+static void
+v3d_tfu_job_cleanup(struct kref *ref)
+{
+	struct v3d_tfu_job *job = container_of(ref, struct v3d_tfu_job,
+					       refcount);
+	struct v3d_dev *v3d = job->v3d;
+	unsigned int i;
+
+	dma_fence_put(job->in_fence);
+	dma_fence_put(job->done_fence);
+
+	for (i = 0; i < ARRAY_SIZE(job->bo); i++) {
+		if (job->bo[i])
+			drm_gem_object_put_unlocked(&job->bo[i]->base);
+	}
+
+	pm_runtime_mark_last_busy(v3d->dev);
+	pm_runtime_put_autosuspend(v3d->dev);
+
+	kfree(job);
+}
+
+void v3d_tfu_job_put(struct v3d_tfu_job *job)
+{
+	kref_put(&job->refcount, v3d_tfu_job_cleanup);
+}
+
 int
 v3d_wait_bo_ioctl(struct drm_device *dev, void *data,
 		  struct drm_file *file_priv)
@@ -536,7 +566,8 @@ v3d_submit_cl_ioctl(struct drm_device *dev, void *data,
 	if (ret)
 		goto fail;
 
-	ret = v3d_lock_bo_reservations(dev, exec, &acquire_ctx);
+	ret = v3d_lock_bo_reservations(dev, exec->bo, exec->bo_count,
+				       &acquire_ctx);
 	if (ret)
 		goto fail;
 
@@ -570,9 +601,10 @@ v3d_submit_cl_ioctl(struct drm_device *dev, void *data,
 				  &v3d_priv->sched_entity[V3D_RENDER]);
 	mutex_unlock(&v3d->sched_lock);
 
-	v3d_attach_object_fences(exec);
+	v3d_attach_object_fences(exec->bo, exec->bo_count,
+				 exec->render_done_fence);
 
-	v3d_unlock_bo_reservations(dev, exec, &acquire_ctx);
+	v3d_unlock_bo_reservations(dev, exec->bo, exec->bo_count, &acquire_ctx);
 
 	/* Update the return sync object for the */
 	sync_out = drm_syncobj_find(file_priv, args->out_sync);
@@ -588,13 +620,119 @@ v3d_submit_cl_ioctl(struct drm_device *dev, void *data,
 
 fail_unreserve:
 	mutex_unlock(&v3d->sched_lock);
-	v3d_unlock_bo_reservations(dev, exec, &acquire_ctx);
+	v3d_unlock_bo_reservations(dev, exec->bo, exec->bo_count, &acquire_ctx);
 fail:
 	v3d_exec_put(exec);
 
 	return ret;
 }
 
+/**
+ * v3d_submit_tfu_ioctl() - Submits a TFU (texture formatting) job to the V3D.
+ * @dev: DRM device
+ * @data: ioctl argument
+ * @file_priv: DRM file for this fd
+ *
+ * Userspace provides the register setup for the TFU, which we don't
+ * need to validate since the TFU is behind the MMU.
+ */
+int
+v3d_submit_tfu_ioctl(struct drm_device *dev, void *data,
+		     struct drm_file *file_priv)
+{
+	struct v3d_dev *v3d = to_v3d_dev(dev);
+	struct v3d_file_priv *v3d_priv = file_priv->driver_priv;
+	struct drm_v3d_submit_tfu *args = data;
+	struct v3d_tfu_job *job;
+	struct ww_acquire_ctx acquire_ctx;
+	struct drm_syncobj *sync_out;
+	struct dma_fence *sched_done_fence;
+	int ret = 0;
+	int bo_count;
+
+	job = kcalloc(1, sizeof(*job), GFP_KERNEL);
+	if (!job)
+		return -ENOMEM;
+
+	ret = pm_runtime_get_sync(v3d->dev);
+	if (ret < 0) {
+		kfree(job);
+		return ret;
+	}
+
+	kref_init(&job->refcount);
+
+	ret = drm_syncobj_find_fence(file_priv, args->in_sync,
+				     0, 0, &job->in_fence);
+	if (ret == -EINVAL)
+		goto fail;
+
+	job->args = *args;
+	job->v3d = v3d;
+
+	spin_lock(&file_priv->table_lock);
+	for (bo_count = 0; bo_count < ARRAY_SIZE(job->bo); bo_count++) {
+		struct drm_gem_object *bo;
+
+		if (!args->bo_handles[bo_count])
+			break;
+
+		bo = idr_find(&file_priv->object_idr,
+			      args->bo_handles[bo_count]);
+		if (!bo) {
+			DRM_DEBUG("Failed to look up GEM BO %d: %d\n",
+				  bo_count, args->bo_handles[bo_count]);
+			ret = -ENOENT;
+			spin_unlock(&file_priv->table_lock);
+			goto fail;
+		}
+		drm_gem_object_get(bo);
+		job->bo[bo_count] = to_v3d_bo(bo);
+	}
+	spin_unlock(&file_priv->table_lock);
+
+	ret = v3d_lock_bo_reservations(dev, job->bo, bo_count, &acquire_ctx);
+	if (ret)
+		goto fail;
+
+	mutex_lock(&v3d->sched_lock);
+	ret = drm_sched_job_init(&job->base,
+				 &v3d_priv->sched_entity[V3D_TFU],
+				 v3d_priv);
+	if (ret)
+		goto fail_unreserve;
+
+	sched_done_fence = dma_fence_get(&job->base.s_fence->finished);
+
+	kref_get(&job->refcount); /* put by scheduler job completion */
+	drm_sched_entity_push_job(&job->base, &v3d_priv->sched_entity[V3D_TFU]);
+	mutex_unlock(&v3d->sched_lock);
+
+	v3d_attach_object_fences(job->bo, bo_count, sched_done_fence);
+
+	v3d_unlock_bo_reservations(dev, job->bo, bo_count, &acquire_ctx);
+
+	/* Update the return sync object */
+	sync_out = drm_syncobj_find(file_priv, args->out_sync);
+	if (sync_out) {
+		drm_syncobj_replace_fence(sync_out, 0, sched_done_fence);
+		drm_syncobj_put(sync_out);
+	}
+	dma_fence_put(sched_done_fence);
+
+	v3d_tfu_job_put(job);
+
+	return 0;
+
+fail_unreserve:
+	mutex_unlock(&v3d->sched_lock);
+	v3d_unlock_bo_reservations(dev, job->bo, bo_count, &acquire_ctx);
+fail:
+	v3d_tfu_job_put(job);
+
+	return ret;
+}
+
 int
 v3d_gem_init(struct drm_device *dev)
 {
diff --git a/drivers/gpu/drm/v3d/v3d_irq.c b/drivers/gpu/drm/v3d/v3d_irq.c
index e07514eb11b5..dd7a7b0bd5a1 100644
--- a/drivers/gpu/drm/v3d/v3d_irq.c
+++ b/drivers/gpu/drm/v3d/v3d_irq.c
@@ -4,8 +4,8 @@
 /**
  * DOC: Interrupt management for the V3D engine
  *
- * When we take a binning or rendering flush done interrupt, we need
- * to signal the fence for that job so that the scheduler can queue up
+ * When we take a bin, render, or TFU done interrupt, we need to
+ * signal the fence for that job so that the scheduler can queue up
  * the next one and unblock any waiters.
  *
  * When we take the binner out of memory interrupt, we need to
@@ -23,7 +23,8 @@
 
 #define V3D_HUB_IRQS ((u32)(V3D_HUB_INT_MMU_WRV |	\
 			    V3D_HUB_INT_MMU_PTI |	\
-			    V3D_HUB_INT_MMU_CAP))
+			    V3D_HUB_INT_MMU_CAP |	\
+			    V3D_HUB_INT_TFUC))
 
 static void
 v3d_overflow_mem_work(struct work_struct *work)
@@ -117,6 +118,11 @@ v3d_hub_irq(int irq, void *arg)
 	/* Acknowledge the interrupts we're handling here. */
 	V3D_WRITE(V3D_HUB_INT_CLR, intsts);
 
+	if (intsts & V3D_HUB_INT_TFUC) {
+		dma_fence_signal(v3d->tfu_job->done_fence);
+		status = IRQ_HANDLED;
+	}
+
 	if (intsts & (V3D_HUB_INT_MMU_WRV |
 		      V3D_HUB_INT_MMU_PTI |
 		      V3D_HUB_INT_MMU_CAP)) {
diff --git a/drivers/gpu/drm/v3d/v3d_regs.h b/drivers/gpu/drm/v3d/v3d_regs.h
index c3a5e4e44f73..6ccdee9d47bd 100644
--- a/drivers/gpu/drm/v3d/v3d_regs.h
+++ b/drivers/gpu/drm/v3d/v3d_regs.h
@@ -86,6 +86,55 @@
 # define V3D_TOP_GR_BRIDGE_SW_INIT_1                   0x0000c
 # define V3D_TOP_GR_BRIDGE_SW_INIT_1_V3D_CLK_108_SW_INIT BIT(0)
 
+#define V3D_TFU_CS                                     0x00400
+/* Stops current job, empties input fifo. */
+# define V3D_TFU_CS_TFURST                             BIT(31)
+# define V3D_TFU_CS_CVTCT_MASK                         V3D_MASK(23, 16)
+# define V3D_TFU_CS_CVTCT_SHIFT                        16
+# define V3D_TFU_CS_NFREE_MASK                         V3D_MASK(13, 8)
+# define V3D_TFU_CS_NFREE_SHIFT                        8
+# define V3D_TFU_CS_BUSY                               BIT(0)
+
+#define V3D_TFU_SU                                     0x00404
+/* Interrupt when FINTTHR input slots are free (0 = disabled) */
+# define V3D_TFU_SU_FINTTHR_MASK                       V3D_MASK(13, 8)
+# define V3D_TFU_SU_FINTTHR_SHIFT                      8
+/* Skips resetting the CRC at the start of CRC generation. */
+# define V3D_TFU_SU_CRCCHAIN                           BIT(4)
+/* skips writes, computes CRC of the image.  miplevels must be 0. */
+# define V3D_TFU_SU_CRC                                BIT(3)
+# define V3D_TFU_SU_THROTTLE_MASK                      V3D_MASK(1, 0)
+# define V3D_TFU_SU_THROTTLE_SHIFT                     0
+
+#define V3D_TFU_ICFG                                   0x00408
+/* Interrupt when the conversion is complete. */
+# define V3D_TFU_ICFG_IOC                              BIT(0)
+
+/* Input Image Address */
+#define V3D_TFU_IIA                                    0x0040c
+/* Input Chroma Address */
+#define V3D_TFU_ICA                                    0x00410
+/* Input Image Stride */
+#define V3D_TFU_IIS                                    0x00414
+/* Input Image U-Plane Address */
+#define V3D_TFU_IUA                                    0x00418
+/* Output Image Address */
+#define V3D_TFU_IOA                                    0x0041c
+/* Image Output Size */
+#define V3D_TFU_IOS                                    0x00420
+/* TFU YUV Coefficient 0 */
+#define V3D_TFU_COEF0                                  0x00424
+/* Use these regs instead of the defaults. */
+# define V3D_TFU_COEF0_USECOEF                         BIT(31)
+/* TFU YUV Coefficient 1 */
+#define V3D_TFU_COEF1                                  0x00428
+/* TFU YUV Coefficient 2 */
+#define V3D_TFU_COEF2                                  0x0042c
+/* TFU YUV Coefficient 3 */
+#define V3D_TFU_COEF3                                  0x00430
+
+#define V3D_TFU_CRC                                    0x00434
+
 /* Per-MMU registers. */
 
 #define V3D_MMUC_CONTROL                               0x01000
diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
index c66d0ce21435..f7508e907536 100644
--- a/drivers/gpu/drm/v3d/v3d_sched.c
+++ b/drivers/gpu/drm/v3d/v3d_sched.c
@@ -30,6 +30,12 @@ to_v3d_job(struct drm_sched_job *sched_job)
 	return container_of(sched_job, struct v3d_job, base);
 }
 
+static struct v3d_tfu_job *
+to_tfu_job(struct drm_sched_job *sched_job)
+{
+	return container_of(sched_job, struct v3d_tfu_job, base);
+}
+
 static void
 v3d_job_free(struct drm_sched_job *sched_job)
 {
@@ -40,6 +46,16 @@ v3d_job_free(struct drm_sched_job *sched_job)
 	v3d_exec_put(job->exec);
 }
 
+static void
+v3d_tfu_job_free(struct drm_sched_job *sched_job)
+{
+	struct v3d_tfu_job *job = to_tfu_job(sched_job);
+
+	drm_sched_job_cleanup(sched_job);
+
+	v3d_tfu_job_put(job);
+}
+
 /**
  * Returns the fences that the bin or render job depends on, one by one.
  * v3d_job_run() won't be called until all of them have been signaled.
@@ -78,6 +94,27 @@ v3d_job_dependency(struct drm_sched_job *sched_job,
 	return fence;
 }
 
+/**
+ * Returns the fences that the TFU job depends on, one by one.
+ * v3d_tfu_job_run() won't be called until all of them have been
+ * signaled.
+ */
+static struct dma_fence *
+v3d_tfu_job_dependency(struct drm_sched_job *sched_job,
+		       struct drm_sched_entity *s_entity)
+{
+	struct v3d_tfu_job *job = to_tfu_job(sched_job);
+	struct dma_fence *fence;
+
+	fence = job->in_fence;
+	if (fence) {
+		job->in_fence = NULL;
+		return fence;
+	}
+
+	return NULL;
+}
+
 static struct dma_fence *v3d_job_run(struct drm_sched_job *sched_job)
 {
 	struct v3d_job *job = to_v3d_job(sched_job);
@@ -149,28 +186,47 @@ static struct dma_fence *v3d_job_run(struct drm_sched_job *sched_job)
 	return fence;
 }
 
-static void
-v3d_job_timedout(struct drm_sched_job *sched_job)
+static struct dma_fence *
+v3d_tfu_job_run(struct drm_sched_job *sched_job)
 {
-	struct v3d_job *job = to_v3d_job(sched_job);
-	struct v3d_exec_info *exec = job->exec;
-	struct v3d_dev *v3d = exec->v3d;
-	enum v3d_queue job_q = job == &exec->bin ? V3D_BIN : V3D_RENDER;
-	enum v3d_queue q;
-	u32 ctca = V3D_CORE_READ(0, V3D_CLE_CTNCA(job_q));
-	u32 ctra = V3D_CORE_READ(0, V3D_CLE_CTNRA(job_q));
+	struct v3d_tfu_job *job = to_tfu_job(sched_job);
+	struct v3d_dev *v3d = job->v3d;
+	struct drm_device *dev = &v3d->drm;
+	struct dma_fence *fence;
 
-	/* If the current address or return address have changed, then
-	 * the GPU has probably made progress and we should delay the
-	 * reset.  This could fail if the GPU got in an infinite loop
-	 * in the CL, but that is pretty unlikely outside of an i-g-t
-	 * testcase.
-	 */
-	if (job->timedout_ctca != ctca || job->timedout_ctra != ctra) {
-		job->timedout_ctca = ctca;
-		job->timedout_ctra = ctra;
-		return;
+	fence = v3d_fence_create(v3d, V3D_TFU);
+	if (IS_ERR(fence))
+		return NULL;
+
+	v3d->tfu_job = job;
+	if (job->done_fence)
+		dma_fence_put(job->done_fence);
+	job->done_fence = dma_fence_get(fence);
+
+	trace_v3d_submit_tfu(dev, to_v3d_fence(fence)->seqno);
+
+	V3D_WRITE(V3D_TFU_IIA, job->args.iia);
+	V3D_WRITE(V3D_TFU_IIS, job->args.iis);
+	V3D_WRITE(V3D_TFU_ICA, job->args.ica);
+	V3D_WRITE(V3D_TFU_IUA, job->args.iua);
+	V3D_WRITE(V3D_TFU_IOA, job->args.ioa);
+	V3D_WRITE(V3D_TFU_IOS, job->args.ios);
+	V3D_WRITE(V3D_TFU_COEF0, job->args.coef[0]);
+	if (job->args.coef[0] & V3D_TFU_COEF0_USECOEF) {
+		V3D_WRITE(V3D_TFU_COEF1, job->args.coef[1]);
+		V3D_WRITE(V3D_TFU_COEF2, job->args.coef[2]);
+		V3D_WRITE(V3D_TFU_COEF3, job->args.coef[3]);
 	}
+	/* ICFG kicks off the job. */
+	V3D_WRITE(V3D_TFU_ICFG, job->args.icfg | V3D_TFU_ICFG_IOC);
+
+	return fence;
+}
+
+static void
+v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job *sched_job)
+{
+	enum v3d_queue q;
 
 	mutex_lock(&v3d->reset_lock);
 
@@ -195,6 +251,39 @@ v3d_job_timedout(struct drm_sched_job *sched_job)
 	mutex_unlock(&v3d->reset_lock);
 }
 
+static void
+v3d_job_timedout(struct drm_sched_job *sched_job)
+{
+	struct v3d_job *job = to_v3d_job(sched_job);
+	struct v3d_exec_info *exec = job->exec;
+	struct v3d_dev *v3d = exec->v3d;
+	enum v3d_queue job_q = job == &exec->bin ? V3D_BIN : V3D_RENDER;
+	u32 ctca = V3D_CORE_READ(0, V3D_CLE_CTNCA(job_q));
+	u32 ctra = V3D_CORE_READ(0, V3D_CLE_CTNRA(job_q));
+
+	/* If the current address or return address have changed, then
+	 * the GPU has probably made progress and we should delay the
+	 * reset.  This could fail if the GPU got in an infinite loop
+	 * in the CL, but that is pretty unlikely outside of an i-g-t
+	 * testcase.
+	 */
+	if (job->timedout_ctca != ctca || job->timedout_ctra != ctra) {
+		job->timedout_ctca = ctca;
+		job->timedout_ctra = ctra;
+		return;
+	}
+
+	v3d_gpu_reset_for_timeout(v3d, sched_job);
+}
+
+static void
+v3d_tfu_job_timedout(struct drm_sched_job *sched_job)
+{
+	struct v3d_tfu_job *job = to_tfu_job(sched_job);
+
+	v3d_gpu_reset_for_timeout(job->v3d, sched_job);
+}
+
 static const struct drm_sched_backend_ops v3d_sched_ops = {
 	.dependency = v3d_job_dependency,
 	.run_job = v3d_job_run,
@@ -202,6 +291,13 @@ static const struct drm_sched_backend_ops v3d_sched_ops = {
 	.free_job = v3d_job_free
 };
 
+static const struct drm_sched_backend_ops v3d_tfu_sched_ops = {
+	.dependency = v3d_tfu_job_dependency,
+	.run_job = v3d_tfu_job_run,
+	.timedout_job = v3d_tfu_job_timedout,
+	.free_job = v3d_tfu_job_free
+};
+
 int
 v3d_sched_init(struct v3d_dev *v3d)
 {
@@ -232,6 +328,19 @@ v3d_sched_init(struct v3d_dev *v3d)
 		return ret;
 	}
 
+	ret = drm_sched_init(&v3d->queue[V3D_TFU].sched,
+			     &v3d_tfu_sched_ops,
+			     hw_jobs_limit, job_hang_limit,
+			     msecs_to_jiffies(hang_limit_ms),
+			     "v3d_tfu");
+	if (ret) {
+		dev_err(v3d->dev, "Failed to create TFU scheduler: %d.",
+			ret);
+		drm_sched_fini(&v3d->queue[V3D_RENDER].sched);
+		drm_sched_fini(&v3d->queue[V3D_BIN].sched);
+		return ret;
+	}
+
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/v3d/v3d_trace.h b/drivers/gpu/drm/v3d/v3d_trace.h
index 85dd351e1e09..f54ed9cd3444 100644
--- a/drivers/gpu/drm/v3d/v3d_trace.h
+++ b/drivers/gpu/drm/v3d/v3d_trace.h
@@ -42,6 +42,26 @@ TRACE_EVENT(v3d_submit_cl,
 		      __entry->ctnqea)
 );
 
+TRACE_EVENT(v3d_submit_tfu,
+	    TP_PROTO(struct drm_device *dev,
+		     uint64_t seqno),
+	    TP_ARGS(dev, seqno),
+
+	    TP_STRUCT__entry(
+			     __field(u32, dev)
+			     __field(u64, seqno)
+			     ),
+
+	    TP_fast_assign(
+			   __entry->dev = dev->primary->index;
+			   __entry->seqno = seqno;
+			   ),
+
+	    TP_printk("dev=%u, seqno=%llu",
+		      __entry->dev,
+		      __entry->seqno)
+);
+
 TRACE_EVENT(v3d_reset_begin,
 	    TP_PROTO(struct drm_device *dev),
 	    TP_ARGS(dev),
diff --git a/include/uapi/drm/v3d_drm.h b/include/uapi/drm/v3d_drm.h
index b1e5de076b0f..35c7d813c66e 100644
--- a/include/uapi/drm/v3d_drm.h
+++ b/include/uapi/drm/v3d_drm.h
@@ -36,6 +36,7 @@ extern "C" {
 #define DRM_V3D_MMAP_BO                           0x03
 #define DRM_V3D_GET_PARAM                         0x04
 #define DRM_V3D_GET_BO_OFFSET                     0x05
+#define DRM_V3D_SUBMIT_TFU                        0x06
 
 #define DRM_IOCTL_V3D_SUBMIT_CL           DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_CL, struct drm_v3d_submit_cl)
 #define DRM_IOCTL_V3D_WAIT_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_WAIT_BO, struct drm_v3d_wait_bo)
@@ -43,6 +44,7 @@ extern "C" {
 #define DRM_IOCTL_V3D_MMAP_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_MMAP_BO, struct drm_v3d_mmap_bo)
 #define DRM_IOCTL_V3D_GET_PARAM           DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_GET_PARAM, struct drm_v3d_get_param)
 #define DRM_IOCTL_V3D_GET_BO_OFFSET       DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_GET_BO_OFFSET, struct drm_v3d_get_bo_offset)
+#define DRM_IOCTL_V3D_SUBMIT_TFU          DRM_IOW(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_TFU, struct drm_v3d_submit_tfu)
 
 /**
  * struct drm_v3d_submit_cl - ioctl argument for submitting commands to the 3D
@@ -179,6 +181,7 @@ enum drm_v3d_param {
 	DRM_V3D_PARAM_V3D_CORE0_IDENT0,
 	DRM_V3D_PARAM_V3D_CORE0_IDENT1,
 	DRM_V3D_PARAM_V3D_CORE0_IDENT2,
+	DRM_V3D_PARAM_SUPPORTS_TFU,
 };
 
 struct drm_v3d_get_param {
@@ -197,6 +200,28 @@ struct drm_v3d_get_bo_offset {
 	__u32 offset;
 };
 
+struct drm_v3d_submit_tfu {
+	__u32 icfg;
+	__u32 iia;
+	__u32 iis;
+	__u32 ica;
+	__u32 iua;
+	__u32 ioa;
+	__u32 ios;
+	__u32 coef[4];
+	/* First handle is the output BO, following are other inputs.
+	 * 0 for unused.
+	 */
+	__u32 bo_handles[4];
+	/* sync object to block on before running the TFU job.  Each TFU
+	 * job will execute in the order submitted to its FD.  Synchronization
+	 * against rendering jobs requires using sync objects.
+	 */
+	__u32 in_sync;
+	/* Sync object to signal when the TFU job is done. */
+	__u32 out_sync;
+};
+
 #if defined(__cplusplus)
 }
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 4f693b55c3d2d2239b8a0094b518a1e533cf75d5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 27 Nov 2018 14:42:03 -0800
Subject: tcp: implement coalescing on backlog queue

In case GRO is not as efficient as it should be or disabled,
we might have a user thread trapped in __release_sock() while
softirq handler flood packets up to the point we have to drop.

This patch balances work done from user thread and softirq,
to give more chances to __release_sock() to complete its work
before new packets are added the the backlog.

This also helps if we receive many ACK packets, since GRO
does not aggregate them.

This patch brings ~60% throughput increase on a receiver
without GRO, but the spectacular gain is really on
1000x release_sock() latency reduction I have measured.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/snmp.h |  1 +
 net/ipv4/proc.c           |  1 +
 net/ipv4/tcp_ipv4.c       | 92 +++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 88 insertions(+), 6 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index f80135e5feaa..86dc24a96c90 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -243,6 +243,7 @@ enum
 	LINUX_MIB_TCPREQQFULLDROP,		/* TCPReqQFullDrop */
 	LINUX_MIB_TCPRETRANSFAIL,		/* TCPRetransFail */
 	LINUX_MIB_TCPRCVCOALESCE,		/* TCPRcvCoalesce */
+	LINUX_MIB_TCPBACKLOGCOALESCE,		/* TCPBacklogCoalesce */
 	LINUX_MIB_TCPOFOQUEUE,			/* TCPOFOQueue */
 	LINUX_MIB_TCPOFODROP,			/* TCPOFODrop */
 	LINUX_MIB_TCPOFOMERGE,			/* TCPOFOMerge */
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 70289682a670..c3610b37bb4c 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -219,6 +219,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
 	SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
 	SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED),
+	SNMP_MIB_ITEM("TCPBacklogCoalesce", LINUX_MIB_TCPBACKLOGCOALESCE),
 	SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT),
 	SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
 	SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV),
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 795605a23275..4904250a9aac 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1619,12 +1619,14 @@ int tcp_v4_early_demux(struct sk_buff *skb)
 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
 {
 	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
-
-	/* Only socket owner can try to collapse/prune rx queues
-	 * to reduce memory overhead, so add a little headroom here.
-	 * Few sockets backlog are possibly concurrently non empty.
-	 */
-	limit += 64*1024;
+	struct skb_shared_info *shinfo;
+	const struct tcphdr *th;
+	struct tcphdr *thtail;
+	struct sk_buff *tail;
+	unsigned int hdrlen;
+	bool fragstolen;
+	u32 gso_segs;
+	int delta;
 
 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
 	 * we can fix skb->truesize to its real value to avoid future drops.
@@ -1636,6 +1638,84 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
 
 	skb_dst_drop(skb);
 
+	if (unlikely(tcp_checksum_complete(skb))) {
+		bh_unlock_sock(sk);
+		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+		return true;
+	}
+
+	/* Attempt coalescing to last skb in backlog, even if we are
+	 * above the limits.
+	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
+	 */
+	th = (const struct tcphdr *)skb->data;
+	hdrlen = th->doff * 4;
+	shinfo = skb_shinfo(skb);
+
+	if (!shinfo->gso_size)
+		shinfo->gso_size = skb->len - hdrlen;
+
+	if (!shinfo->gso_segs)
+		shinfo->gso_segs = 1;
+
+	tail = sk->sk_backlog.tail;
+	if (!tail)
+		goto no_coalesce;
+	thtail = (struct tcphdr *)tail->data;
+
+	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
+	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
+	    ((TCP_SKB_CB(tail)->tcp_flags |
+	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_URG) ||
+	    ((TCP_SKB_CB(tail)->tcp_flags ^
+	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
+#ifdef CONFIG_TLS_DEVICE
+	    tail->decrypted != skb->decrypted ||
+#endif
+	    thtail->doff != th->doff ||
+	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
+		goto no_coalesce;
+
+	__skb_pull(skb, hdrlen);
+	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
+		thtail->window = th->window;
+
+		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
+
+		if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
+			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
+
+		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+
+		if (TCP_SKB_CB(skb)->has_rxtstamp) {
+			TCP_SKB_CB(tail)->has_rxtstamp = true;
+			tail->tstamp = skb->tstamp;
+			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
+		}
+
+		/* Not as strict as GRO. We only need to carry mss max value */
+		skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
+						 skb_shinfo(tail)->gso_size);
+
+		gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
+		skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
+
+		sk->sk_backlog.len += delta;
+		__NET_INC_STATS(sock_net(sk),
+				LINUX_MIB_TCPBACKLOGCOALESCE);
+		kfree_skb_partial(skb, fragstolen);
+		return false;
+	}
+	__skb_push(skb, hdrlen);
+
+no_coalesce:
+	/* Only socket owner can try to collapse/prune rx queues
+	 * to reduce memory overhead, so add a little headroom here.
+	 * Few sockets backlog are possibly concurrently non empty.
+	 */
+	limit += 64*1024;
+
 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
 		bh_unlock_sock(sk);
 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
-- 
cgit v1.2.3-59-g8ed1b


From 26d31925cd5ea4b5b168ed538b0326d63ccbb384 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Wed, 28 Nov 2018 19:12:56 +0100
Subject: tun: implement carrier change

The userspace may need to control the carrier state.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: Didier Pallard <didier.pallard@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c           | 27 ++++++++++++++++++++++++++-
 include/uapi/linux/if_tun.h |  1 +
 2 files changed, 27 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 56575f88d1fd..835c73f42ae7 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1254,6 +1254,21 @@ static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 	}
 }
 
+static int tun_net_change_carrier(struct net_device *dev, bool new_carrier)
+{
+	if (new_carrier) {
+		struct tun_struct *tun = netdev_priv(dev);
+
+		if (!tun->numqueues)
+			return -EPERM;
+
+		netif_carrier_on(dev);
+	} else {
+		netif_carrier_off(dev);
+	}
+	return 0;
+}
+
 static const struct net_device_ops tun_netdev_ops = {
 	.ndo_uninit		= tun_net_uninit,
 	.ndo_open		= tun_net_open,
@@ -1263,6 +1278,7 @@ static const struct net_device_ops tun_netdev_ops = {
 	.ndo_select_queue	= tun_select_queue,
 	.ndo_set_rx_headroom	= tun_set_headroom,
 	.ndo_get_stats64	= tun_net_get_stats64,
+	.ndo_change_carrier	= tun_net_change_carrier,
 };
 
 static void __tun_xdp_flush_tfile(struct tun_file *tfile)
@@ -1345,6 +1361,7 @@ static const struct net_device_ops tap_netdev_ops = {
 	.ndo_get_stats64	= tun_net_get_stats64,
 	.ndo_bpf		= tun_xdp,
 	.ndo_xdp_xmit		= tun_xdp_xmit,
+	.ndo_change_carrier	= tun_net_change_carrier,
 };
 
 static void tun_flow_init(struct tun_struct *tun)
@@ -3002,12 +3019,12 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 	struct net *net = sock_net(&tfile->sk);
 	struct tun_struct *tun;
 	void __user* argp = (void __user*)arg;
+	unsigned int ifindex, carrier;
 	struct ifreq ifr;
 	kuid_t owner;
 	kgid_t group;
 	int sndbuf;
 	int vnet_hdr_sz;
-	unsigned int ifindex;
 	int le;
 	int ret;
 	bool do_notify = false;
@@ -3291,6 +3308,14 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 		ret = tun_set_ebpf(tun, &tun->filter_prog, argp);
 		break;
 
+	case TUNSETCARRIER:
+		ret = -EFAULT;
+		if (copy_from_user(&carrier, argp, sizeof(carrier)))
+			goto unlock;
+
+		ret = tun_net_change_carrier(tun->dev, (bool)carrier);
+		break;
+
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index ee432cd3018c..23a6753b37df 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -59,6 +59,7 @@
 #define TUNGETVNETBE _IOR('T', 223, int)
 #define TUNSETSTEERINGEBPF _IOR('T', 224, int)
 #define TUNSETFILTEREBPF _IOR('T', 225, int)
+#define TUNSETCARRIER _IOW('T', 226, int)
 
 /* TUNSETIFF ifr flags */
 #define IFF_TUN		0x0001
-- 
cgit v1.2.3-59-g8ed1b


From e9ee9efc0d176512cdce9d27ff8549d7ffa2bfcd Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Fri, 30 Nov 2018 21:08:14 -0800
Subject: bpf: Add BPF_F_ANY_ALIGNMENT.

Often we want to write tests cases that check things like bad context
offset accesses.  And one way to do this is to use an odd offset on,
for example, a 32-bit load.

This unfortunately triggers the alignment checks first on platforms
that do not set CONFIG_EFFICIENT_UNALIGNED_ACCESS.  So the test
case see the alignment failure rather than what it was testing for.

It is often not completely possible to respect the original intention
of the test, or even test the same exact thing, while solving the
alignment issue.

Another option could have been to check the alignment after the
context and other validations are performed by the verifier, but
that is a non-trivial change to the verifier.

Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h                    | 14 ++++++++++++++
 kernel/bpf/syscall.c                        |  7 ++++++-
 kernel/bpf/verifier.c                       |  2 ++
 tools/include/uapi/linux/bpf.h              | 14 ++++++++++++++
 tools/lib/bpf/bpf.c                         |  8 ++++----
 tools/lib/bpf/bpf.h                         |  2 +-
 tools/testing/selftests/bpf/test_align.c    |  4 ++--
 tools/testing/selftests/bpf/test_verifier.c |  3 ++-
 8 files changed, 45 insertions(+), 9 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 597afdbc1ab9..8050caea7495 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -232,6 +232,20 @@ enum bpf_attach_type {
  */
 #define BPF_F_STRICT_ALIGNMENT	(1U << 0)
 
+/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROF_LOAD command, the
+ * verifier will allow any alignment whatsoever.  On platforms
+ * with strict alignment requirements for loads ands stores (such
+ * as sparc and mips) the verifier validates that all loads and
+ * stores provably follow this requirement.  This flag turns that
+ * checking and enforcement off.
+ *
+ * It is mostly used for testing when we want to validate the
+ * context and memory access aspects of the verifier, but because
+ * of an unaligned access the alignment check would trigger before
+ * the one we are interested in.
+ */
+#define BPF_F_ANY_ALIGNMENT	(1U << 1)
+
 /* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */
 #define BPF_PSEUDO_MAP_FD	1
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 85cbeec06e50..f9554d9a14e1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1452,9 +1452,14 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 	if (CHECK_ATTR(BPF_PROG_LOAD))
 		return -EINVAL;
 
-	if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT)
+	if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT))
 		return -EINVAL;
 
+	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
+	    (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
+	    !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
 	/* copy eBPF program license from user space */
 	if (strncpy_from_user(license, u64_to_user_ptr(attr->license),
 			      sizeof(license) - 1) < 0)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9584438fa2cc..71988337ac14 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6505,6 +6505,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
 	env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
 		env->strict_alignment = true;
+	if (attr->prog_flags & BPF_F_ANY_ALIGNMENT)
+		env->strict_alignment = false;
 
 	ret = replace_map_fd_with_map_ptr(env);
 	if (ret < 0)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 597afdbc1ab9..8050caea7495 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -232,6 +232,20 @@ enum bpf_attach_type {
  */
 #define BPF_F_STRICT_ALIGNMENT	(1U << 0)
 
+/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROF_LOAD command, the
+ * verifier will allow any alignment whatsoever.  On platforms
+ * with strict alignment requirements for loads ands stores (such
+ * as sparc and mips) the verifier validates that all loads and
+ * stores provably follow this requirement.  This flag turns that
+ * checking and enforcement off.
+ *
+ * It is mostly used for testing when we want to validate the
+ * context and memory access aspects of the verifier, but because
+ * of an unaligned access the alignment check would trigger before
+ * the one we are interested in.
+ */
+#define BPF_F_ANY_ALIGNMENT	(1U << 1)
+
 /* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */
 #define BPF_PSEUDO_MAP_FD	1
 
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index ce1822194590..c19226cccf39 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -279,9 +279,9 @@ int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns,
 }
 
 int bpf_verify_program(enum bpf_prog_type type, const struct bpf_insn *insns,
-		       size_t insns_cnt, int strict_alignment,
-		       const char *license, __u32 kern_version,
-		       char *log_buf, size_t log_buf_sz, int log_level)
+		       size_t insns_cnt, __u32 prog_flags, const char *license,
+		       __u32 kern_version, char *log_buf, size_t log_buf_sz,
+		       int log_level)
 {
 	union bpf_attr attr;
 
@@ -295,7 +295,7 @@ int bpf_verify_program(enum bpf_prog_type type, const struct bpf_insn *insns,
 	attr.log_level = log_level;
 	log_buf[0] = 0;
 	attr.kern_version = kern_version;
-	attr.prog_flags = strict_alignment ? BPF_F_STRICT_ALIGNMENT : 0;
+	attr.prog_flags = prog_flags;
 
 	return sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
 }
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 09e8bbe111d4..60392b70587c 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -98,7 +98,7 @@ LIBBPF_API int bpf_load_program(enum bpf_prog_type type,
 				char *log_buf, size_t log_buf_sz);
 LIBBPF_API int bpf_verify_program(enum bpf_prog_type type,
 				  const struct bpf_insn *insns,
-				  size_t insns_cnt, int strict_alignment,
+				  size_t insns_cnt, __u32 prog_flags,
 				  const char *license, __u32 kern_version,
 				  char *log_buf, size_t log_buf_sz,
 				  int log_level);
diff --git a/tools/testing/selftests/bpf/test_align.c b/tools/testing/selftests/bpf/test_align.c
index 5f377ec53f2f..3c789d03b629 100644
--- a/tools/testing/selftests/bpf/test_align.c
+++ b/tools/testing/selftests/bpf/test_align.c
@@ -620,8 +620,8 @@ static int do_test_single(struct bpf_align_test *test)
 
 	prog_len = probe_filter_length(prog);
 	fd_prog = bpf_verify_program(prog_type ? : BPF_PROG_TYPE_SOCKET_FILTER,
-				     prog, prog_len, 1, "GPL", 0,
-				     bpf_vlog, sizeof(bpf_vlog), 2);
+				     prog, prog_len, BPF_F_STRICT_ALIGNMENT,
+				     "GPL", 0, bpf_vlog, sizeof(bpf_vlog), 2);
 	if (fd_prog < 0 && test->result != REJECT) {
 		printf("Failed to load program.\n");
 		printf("%s", bpf_vlog);
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 5eace1f606fb..78e779c35869 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -14275,7 +14275,8 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
 	prog_len = probe_filter_length(prog);
 
 	fd_prog = bpf_verify_program(prog_type, prog, prog_len,
-				     test->flags & F_LOAD_WITH_STRICT_ALIGNMENT,
+				     test->flags & F_LOAD_WITH_STRICT_ALIGNMENT ?
+				     BPF_F_STRICT_ALIGNMENT : 0,
 				     "GPL", 0, bpf_vlog, sizeof(bpf_vlog), 1);
 
 	expected_ret = unpriv && test->result_unpriv != UNDEF ?
-- 
cgit v1.2.3-59-g8ed1b


From c3e9305983597a61083482581e83f0bd77ba306a Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 12 Nov 2018 16:26:44 +0100
Subject: netfilter: remove NFC_* cache bits

These are very very (for long time unused) caching infrastructure
definition, remove then. They have nothing to do with the NFC subsystem.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter.h        |  4 ----
 include/uapi/linux/netfilter_decnet.h | 10 ----------
 include/uapi/linux/netfilter_ipv4.h   | 28 ----------------------------
 include/uapi/linux/netfilter_ipv6.h   | 29 -----------------------------
 4 files changed, 71 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/netfilter.h b/include/uapi/linux/netfilter.h
index cca10e767cd8..ca9e63d6e0e4 100644
--- a/include/uapi/linux/netfilter.h
+++ b/include/uapi/linux/netfilter.h
@@ -34,10 +34,6 @@
 
 /* only for userspace compatibility */
 #ifndef __KERNEL__
-/* Generic cache responses from hook functions.
-   <= 0x2000 is used for protocol-flags. */
-#define NFC_UNKNOWN 0x4000
-#define NFC_ALTERED 0x8000
 
 /* NF_VERDICT_BITS should be 8 now, but userspace might break if this changes */
 #define NF_VERDICT_BITS 16
diff --git a/include/uapi/linux/netfilter_decnet.h b/include/uapi/linux/netfilter_decnet.h
index 61f1c7dfd033..3c77f54560f2 100644
--- a/include/uapi/linux/netfilter_decnet.h
+++ b/include/uapi/linux/netfilter_decnet.h
@@ -15,16 +15,6 @@
 
 #include <limits.h> /* for INT_MIN, INT_MAX */
 
-/* IP Cache bits. */
-/* Src IP address. */
-#define NFC_DN_SRC		0x0001
-/* Dest IP address. */
-#define NFC_DN_DST		0x0002
-/* Input device. */
-#define NFC_DN_IF_IN		0x0004
-/* Output device. */
-#define NFC_DN_IF_OUT		0x0008
-
 /* kernel define is in netfilter_defs.h */
 #define NF_DN_NUMHOOKS		7
 #endif /* ! __KERNEL__ */
diff --git a/include/uapi/linux/netfilter_ipv4.h b/include/uapi/linux/netfilter_ipv4.h
index c3b060775e13..155e77d6a42d 100644
--- a/include/uapi/linux/netfilter_ipv4.h
+++ b/include/uapi/linux/netfilter_ipv4.h
@@ -13,34 +13,6 @@
 
 #include <limits.h> /* for INT_MIN, INT_MAX */
 
-/* IP Cache bits. */
-/* Src IP address. */
-#define NFC_IP_SRC		0x0001
-/* Dest IP address. */
-#define NFC_IP_DST		0x0002
-/* Input device. */
-#define NFC_IP_IF_IN		0x0004
-/* Output device. */
-#define NFC_IP_IF_OUT		0x0008
-/* TOS. */
-#define NFC_IP_TOS		0x0010
-/* Protocol. */
-#define NFC_IP_PROTO		0x0020
-/* IP options. */
-#define NFC_IP_OPTIONS		0x0040
-/* Frag & flags. */
-#define NFC_IP_FRAG		0x0080
-
-/* Per-protocol information: only matters if proto match. */
-/* TCP flags. */
-#define NFC_IP_TCPFLAGS		0x0100
-/* Source port. */
-#define NFC_IP_SRC_PT		0x0200
-/* Dest port. */
-#define NFC_IP_DST_PT		0x0400
-/* Something else about the proto */
-#define NFC_IP_PROTO_UNKNOWN	0x2000
-
 /* IP Hooks */
 /* After promisc drops, checksum checks. */
 #define NF_IP_PRE_ROUTING	0
diff --git a/include/uapi/linux/netfilter_ipv6.h b/include/uapi/linux/netfilter_ipv6.h
index dc624fd24d25..80aa9b0799af 100644
--- a/include/uapi/linux/netfilter_ipv6.h
+++ b/include/uapi/linux/netfilter_ipv6.h
@@ -16,35 +16,6 @@
 
 #include <limits.h> /* for INT_MIN, INT_MAX */
 
-/* IP Cache bits. */
-/* Src IP address. */
-#define NFC_IP6_SRC              0x0001
-/* Dest IP address. */
-#define NFC_IP6_DST              0x0002
-/* Input device. */
-#define NFC_IP6_IF_IN            0x0004
-/* Output device. */
-#define NFC_IP6_IF_OUT           0x0008
-/* TOS. */
-#define NFC_IP6_TOS              0x0010
-/* Protocol. */
-#define NFC_IP6_PROTO            0x0020
-/* IP options. */
-#define NFC_IP6_OPTIONS          0x0040
-/* Frag & flags. */
-#define NFC_IP6_FRAG             0x0080
-
-
-/* Per-protocol information: only matters if proto match. */
-/* TCP flags. */
-#define NFC_IP6_TCPFLAGS         0x0100
-/* Source port. */
-#define NFC_IP6_SRC_PT           0x0200
-/* Dest port. */
-#define NFC_IP6_DST_PT           0x0400
-/* Something else about the proto */
-#define NFC_IP6_PROTO_UNKNOWN    0x2000
-
 /* IP6 Hooks */
 /* After promisc drops, checksum checks. */
 #define NF_IP6_PRE_ROUTING	0
-- 
cgit v1.2.3-59-g8ed1b


From 92799ef7209bfd4c8eadb88c2c8f6fcba544b367 Mon Sep 17 00:00:00 2001
From: Sergey Dorodnicov <sergey.dorodnicov@intel.com>
Date: Wed, 12 Sep 2018 02:42:06 -0400
Subject: media: v4l: Add 4bpp packed depth confidence format CNF4

Adding new fourcc CNF4 for 4 bit-per-pixel packed depth confidence
information provided by Intel RealSense cameras. Every two consecutive
pixels are packed into a single byte.

Signed-off-by: Sergey Dorodnicov <sergey.dorodnicov@intel.com>
Signed-off-by: Evgeni Raikhel <evgeni.raikhel@intel.com>
Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 Documentation/media/uapi/v4l/depth-formats.rst |  1 +
 Documentation/media/uapi/v4l/pixfmt-cnf4.rst   | 31 ++++++++++++++++++++++++++
 drivers/media/v4l2-core/v4l2-ioctl.c           |  1 +
 include/uapi/linux/videodev2.h                 |  1 +
 4 files changed, 34 insertions(+)
 create mode 100644 Documentation/media/uapi/v4l/pixfmt-cnf4.rst

(limited to 'include/uapi')

diff --git a/Documentation/media/uapi/v4l/depth-formats.rst b/Documentation/media/uapi/v4l/depth-formats.rst
index d1641e9687a6..9533348691d9 100644
--- a/Documentation/media/uapi/v4l/depth-formats.rst
+++ b/Documentation/media/uapi/v4l/depth-formats.rst
@@ -14,3 +14,4 @@ Depth data provides distance to points, mapped onto the image plane
 
     pixfmt-inzi
     pixfmt-z16
+    pixfmt-cnf4
diff --git a/Documentation/media/uapi/v4l/pixfmt-cnf4.rst b/Documentation/media/uapi/v4l/pixfmt-cnf4.rst
new file mode 100644
index 000000000000..8f469290c304
--- /dev/null
+++ b/Documentation/media/uapi/v4l/pixfmt-cnf4.rst
@@ -0,0 +1,31 @@
+.. -*- coding: utf-8; mode: rst -*-
+
+.. _V4L2-PIX-FMT-CNF4:
+
+******************************
+V4L2_PIX_FMT_CNF4 ('CNF4')
+******************************
+
+Depth sensor confidence information as a 4 bits per pixel packed array
+
+Description
+===========
+
+Proprietary format used by Intel RealSense Depth cameras containing depth
+confidence information in range 0-15 with 0 indicating that the sensor was
+unable to resolve any signal and 15 indicating maximum level of confidence for
+the specific sensor (actual error margins might change from sensor to sensor).
+
+Every two consecutive pixels are packed into a single byte.
+Bits 0-3 of byte n refer to confidence value of depth pixel 2*n,
+bits 4-7 to confidence value of depth pixel 2*n+1.
+
+**Bit-packed representation.**
+
+.. flat-table::
+    :header-rows:  0
+    :stub-columns: 0
+    :widths: 64 64
+
+    * - Y'\ :sub:`01[3:0]`\ (bits 7--4) Y'\ :sub:`00[3:0]`\ (bits 3--0)
+      - Y'\ :sub:`03[3:0]`\ (bits 7--4) Y'\ :sub:`02[3:0]`\ (bits 3--0)
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index e384142d2826..363be5bb7942 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -1189,6 +1189,7 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 	case V4L2_PIX_FMT_Y12I:		descr = "Interleaved 12-bit Greyscale"; break;
 	case V4L2_PIX_FMT_Z16:		descr = "16-bit Depth"; break;
 	case V4L2_PIX_FMT_INZI:		descr = "Planar 10:16 Greyscale Depth"; break;
+	case V4L2_PIX_FMT_CNF4:		descr = "4-bit Depth Confidence (Packed)"; break;
 	case V4L2_PIX_FMT_PAL8:		descr = "8-bit Palette"; break;
 	case V4L2_PIX_FMT_UV8:		descr = "8-bit Chrominance UV 4-4"; break;
 	case V4L2_PIX_FMT_YVU410:	descr = "Planar YVU 4:1:0"; break;
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 2a223835214c..2db1635de956 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -689,6 +689,7 @@ struct v4l2_pix_format {
 #define V4L2_PIX_FMT_MT21C    v4l2_fourcc('M', 'T', '2', '1') /* Mediatek compressed block mode  */
 #define V4L2_PIX_FMT_INZI     v4l2_fourcc('I', 'N', 'Z', 'I') /* Intel Planar Greyscale 10-bit and Depth 16-bit */
 #define V4L2_PIX_FMT_SUNXI_TILED_NV12 v4l2_fourcc('S', 'T', '1', '2') /* Sunxi Tiled NV12 Format */
+#define V4L2_PIX_FMT_CNF4     v4l2_fourcc('C', 'N', 'F', '4') /* Intel 4-bit packed depth confidence information */
 
 /* 10bit raw bayer packed, 32 bytes for every 25 pixels, last LSB 6 bits unused */
 #define V4L2_PIX_FMT_IPU3_SBGGR10	v4l2_fourcc('i', 'p', '3', 'b') /* IPU3 packed 10-bit BGGR bayer */
-- 
cgit v1.2.3-59-g8ed1b


From e3da08d057002f9d0831949d51666c3e15dc6b29 Mon Sep 17 00:00:00 2001
From: Petar Penkov <ppenkov@google.com>
Date: Sun, 2 Dec 2018 20:18:19 -0500
Subject: bpf: allow BPF read access to qdisc pkt_len

The pkt_len field in qdisc_skb_cb stores the skb length as it will
appear on the wire after segmentation. For byte accounting, this value
is more accurate than skb->len. It is computed on entry to the TC
layer, so only valid there.

Allow read access to this field from BPF tc classifier and action
programs. The implementation is analogous to tc_classid, aside from
restricting to read access.

To distinguish it from skb->len and self-describe export as wire_len.

Changes v1->v2
  - Rename pkt_len to wire_len

Signed-off-by: Petar Penkov <ppenkov@google.com>
Signed-off-by: Vlad Dumitrescu <vladum@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h                    |  1 +
 net/core/filter.c                           | 16 +++++++++++++++
 tools/include/uapi/linux/bpf.h              |  1 +
 tools/testing/selftests/bpf/test_verifier.c | 32 +++++++++++++++++++++++++++++
 4 files changed, 50 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 8050caea7495..0183b8e70a9e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2497,6 +2497,7 @@ struct __sk_buff {
 	__u32 data_meta;
 	struct bpf_flow_keys *flow_keys;
 	__u64 tstamp;
+	__u32 wire_len;
 };
 
 struct bpf_tunnel_key {
diff --git a/net/core/filter.c b/net/core/filter.c
index bd0df75dc7b6..3d54af4c363d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5773,6 +5773,7 @@ static bool sk_filter_is_valid_access(int off, int size,
 	case bpf_ctx_range(struct __sk_buff, flow_keys):
 	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
 	case bpf_ctx_range(struct __sk_buff, tstamp):
+	case bpf_ctx_range(struct __sk_buff, wire_len):
 		return false;
 	}
 
@@ -5797,6 +5798,7 @@ static bool cg_skb_is_valid_access(int off, int size,
 	case bpf_ctx_range(struct __sk_buff, tc_classid):
 	case bpf_ctx_range(struct __sk_buff, data_meta):
 	case bpf_ctx_range(struct __sk_buff, flow_keys):
+	case bpf_ctx_range(struct __sk_buff, wire_len):
 		return false;
 	case bpf_ctx_range(struct __sk_buff, data):
 	case bpf_ctx_range(struct __sk_buff, data_end):
@@ -5843,6 +5845,7 @@ static bool lwt_is_valid_access(int off, int size,
 	case bpf_ctx_range(struct __sk_buff, data_meta):
 	case bpf_ctx_range(struct __sk_buff, flow_keys):
 	case bpf_ctx_range(struct __sk_buff, tstamp):
+	case bpf_ctx_range(struct __sk_buff, wire_len):
 		return false;
 	}
 
@@ -6273,6 +6276,7 @@ static bool sk_skb_is_valid_access(int off, int size,
 	case bpf_ctx_range(struct __sk_buff, data_meta):
 	case bpf_ctx_range(struct __sk_buff, flow_keys):
 	case bpf_ctx_range(struct __sk_buff, tstamp):
+	case bpf_ctx_range(struct __sk_buff, wire_len):
 		return false;
 	}
 
@@ -6360,6 +6364,7 @@ static bool flow_dissector_is_valid_access(int off, int size,
 	case bpf_ctx_range(struct __sk_buff, data_meta):
 	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
 	case bpf_ctx_range(struct __sk_buff, tstamp):
+	case bpf_ctx_range(struct __sk_buff, wire_len):
 		return false;
 	}
 
@@ -6685,6 +6690,17 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
 					      bpf_target_off(struct sk_buff,
 							     tstamp, 8,
 							     target_size));
+		break;
+
+	case offsetof(struct __sk_buff, wire_len):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, pkt_len) != 4);
+
+		off = si->off;
+		off -= offsetof(struct __sk_buff, wire_len);
+		off += offsetof(struct sk_buff, cb);
+		off += offsetof(struct qdisc_skb_cb, pkt_len);
+		*target_size = 4;
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off);
 	}
 
 	return insn - insn_buf;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 8050caea7495..0183b8e70a9e 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2497,6 +2497,7 @@ struct __sk_buff {
 	__u32 data_meta;
 	struct bpf_flow_keys *flow_keys;
 	__u64 tstamp;
+	__u32 wire_len;
 };
 
 struct bpf_tunnel_key {
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index c3b038f26ece..b4b4a3f93639 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -14033,6 +14033,38 @@ static struct bpf_test tests[] = {
 		.result_unpriv = REJECT,
 		.result = ACCEPT,
 	},
+	{
+		"check wire_len is not readable by sockets",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, wire_len)),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "invalid bpf_context access",
+		.result = REJECT,
+	},
+	{
+		"check wire_len is readable by tc classifier",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, wire_len)),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+	},
+	{
+		"check wire_len is not writable by tc classifier",
+		.insns = {
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_1,
+				    offsetof(struct __sk_buff, wire_len)),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "invalid bpf_context access",
+		.errstr_unpriv = "R1 leaks addr",
+		.result = REJECT,
+	},
 };
 
 static int probe_filter_length(const struct bpf_insn *fp)
-- 
cgit v1.2.3-59-g8ed1b


From 90b1023f68c78b9b85e364250155776c8e421176 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Mon, 3 Dec 2018 12:13:35 +0000
Subject: bpf: fix documentation for eBPF helpers

The missing indentation on the "Return" sections for bpf_map_pop_elem()
and bpf_map_peek_elem() helpers break RST and man pages generation. This
patch fixes them, and moves the description of those two helpers towards
the end of the list (even though they are somehow related to the three
first helpers for maps, the man page explicitly states that the helpers
are sorted in chronological order).

While at it, bring other minor formatting edits for eBPF helpers
documentation: mostly blank lines removal, RST formatting, or other
small nits for consistency.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 90 ++++++++++++++++++++++++------------------------
 1 file changed, 45 insertions(+), 45 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0183b8e70a9e..572eb2d42768 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -496,18 +496,6 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_map_pop_elem(struct bpf_map *map, void *value)
- * 	Description
- * 		Pop an element from *map*.
- * Return
- * 		0 on success, or a negative error in case of failure.
- *
- * int bpf_map_peek_elem(struct bpf_map *map, void *value)
- * 	Description
- * 		Get an element from *map* without removing it.
- * Return
- * 		0 on success, or a negative error in case of failure.
- *
  * int bpf_probe_read(void *dst, u32 size, const void *src)
  * 	Description
  * 		For tracing programs, safely attempt to read *size* bytes from
@@ -1931,9 +1919,9 @@ union bpf_attr {
  *		is set to metric from route (IPv4/IPv6 only), and ifindex
  *		is set to the device index of the nexthop from the FIB lookup.
  *
- *             *plen* argument is the size of the passed in struct.
- *             *flags* argument can be a combination of one or more of the
- *             following values:
+ *		*plen* argument is the size of the passed in struct.
+ *		*flags* argument can be a combination of one or more of the
+ *		following values:
  *
  *		**BPF_FIB_LOOKUP_DIRECT**
  *			Do a direct table lookup vs full lookup using FIB
@@ -1942,9 +1930,9 @@ union bpf_attr {
  *			Perform lookup from an egress perspective (default is
  *			ingress).
  *
- *             *ctx* is either **struct xdp_md** for XDP programs or
- *             **struct sk_buff** tc cls_act programs.
- *     Return
+ *		*ctx* is either **struct xdp_md** for XDP programs or
+ *		**struct sk_buff** tc cls_act programs.
+ *	Return
  *		* < 0 if any input argument is invalid
  *		*   0 on success (packet is forwarded, nexthop neighbor exists)
  *		* > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the
@@ -2089,8 +2077,8 @@ union bpf_attr {
  *		translated to a keycode using the rc keymap, and reported as
  *		an input key down event. After a period a key up event is
  *		generated. This period can be extended by calling either
- *		**bpf_rc_keydown** () again with the same values, or calling
- *		**bpf_rc_repeat** ().
+ *		**bpf_rc_keydown**\ () again with the same values, or calling
+ *		**bpf_rc_repeat**\ ().
  *
  *		Some protocols include a toggle bit, in case the button	was
  *		released and pressed again between consecutive scancodes.
@@ -2173,21 +2161,22 @@ union bpf_attr {
  *		The *flags* meaning is specific for each map type,
  *		and has to be 0 for cgroup local storage.
  *
- *		Depending on the bpf program type, a local storage area
- *		can be shared between multiple instances of the bpf program,
+ *		Depending on the BPF program type, a local storage area
+ *		can be shared between multiple instances of the BPF program,
  *		running simultaneously.
  *
  *		A user should care about the synchronization by himself.
- *		For example, by using the BPF_STX_XADD instruction to alter
+ *		For example, by using the **BPF_STX_XADD** instruction to alter
  *		the shared data.
  *	Return
- *		Pointer to the local storage area.
+ *		A pointer to the local storage area.
  *
  * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags)
  *	Description
- *		Select a SO_REUSEPORT sk from a	BPF_MAP_TYPE_REUSEPORT_ARRAY map
- *		It checks the selected sk is matching the incoming
- *		request in the skb.
+ *		Select a **SO_REUSEPORT** socket from a
+ *		**BPF_MAP_TYPE_REUSEPORT_ARRAY** *map*.
+ *		It checks the selected socket is matching the incoming
+ *		request in the socket buffer.
  *	Return
  *		0 on success, or a negative error in case of failure.
  *
@@ -2195,7 +2184,7 @@ union bpf_attr {
  *	Description
  *		Look for TCP socket matching *tuple*, optionally in a child
  *		network namespace *netns*. The return value must be checked,
- *		and if non-NULL, released via **bpf_sk_release**\ ().
+ *		and if non-**NULL**, released via **bpf_sk_release**\ ().
  *
  *		The *ctx* should point to the context of the program, such as
  *		the skb or socket (depending on the hook in use). This is used
@@ -2221,15 +2210,15 @@ union bpf_attr {
  *		This helper is available only if the kernel was compiled with
  *		**CONFIG_NET** configuration option.
  *	Return
- *		Pointer to *struct bpf_sock*, or NULL in case of failure.
- *		For sockets with reuseport option, *struct bpf_sock*
- *		return is from reuse->socks[] using hash of the packet.
+ *		A pointer to *struct bpf_sock*, or **NULL** in case of failure.
+ *		For sockets with reuseport option, **struct bpf_sock**
+ *		return is from **reuse->socks**\ [] using hash of the packet.
  *
  * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags)
  *	Description
  *		Look for UDP socket matching *tuple*, optionally in a child
  *		network namespace *netns*. The return value must be checked,
- *		and if non-NULL, released via **bpf_sk_release**\ ().
+ *		and if non-**NULL**, released via **bpf_sk_release**\ ().
  *
  *		The *ctx* should point to the context of the program, such as
  *		the skb or socket (depending on the hook in use). This is used
@@ -2255,46 +2244,57 @@ union bpf_attr {
  *		This helper is available only if the kernel was compiled with
  *		**CONFIG_NET** configuration option.
  *	Return
- *		Pointer to *struct bpf_sock*, or NULL in case of failure.
- *		For sockets with reuseport option, *struct bpf_sock*
- *		return is from reuse->socks[] using hash of the packet.
+ *		A pointer to **struct bpf_sock**, or **NULL** in case of
+ *		failure. For sockets with reuseport option, **struct bpf_sock**
+ *		return is from **reuse->socks**\ [] using hash of the packet.
  *
- * int bpf_sk_release(struct bpf_sock *sk)
+ * int bpf_sk_release(struct bpf_sock *sock)
  *	Description
- *		Release the reference held by *sock*. *sock* must be a non-NULL
- *		pointer that was returned from bpf_sk_lookup_xxx\ ().
+ *		Release the reference held by *sock*. *sock* must be a
+ *		non-**NULL** pointer that was returned from
+ *		**bpf_sk_lookup_xxx**\ ().
  *	Return
  *		0 on success, or a negative error in case of failure.
  *
+ * int bpf_map_pop_elem(struct bpf_map *map, void *value)
+ * 	Description
+ * 		Pop an element from *map*.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_map_peek_elem(struct bpf_map *map, void *value)
+ * 	Description
+ * 		Get an element from *map* without removing it.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
  * int bpf_msg_push_data(struct sk_buff *skb, u32 start, u32 len, u64 flags)
  *	Description
- *		For socket policies, insert *len* bytes into msg at offset
+ *		For socket policies, insert *len* bytes into *msg* at offset
  *		*start*.
  *
  *		If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a
- *		*msg* it may want to insert metadata or options into the msg.
+ *		*msg* it may want to insert metadata or options into the *msg*.
  *		This can later be read and used by any of the lower layer BPF
  *		hooks.
  *
  *		This helper may fail if under memory pressure (a malloc
  *		fails) in these cases BPF programs will get an appropriate
  *		error and BPF programs will need to handle them.
- *
  *	Return
  *		0 on success, or a negative error in case of failure.
  *
  * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 pop, u64 flags)
- *	 Description
+ *	Description
  *		Will remove *pop* bytes from a *msg* starting at byte *start*.
  *		This may result in **ENOMEM** errors under certain situations if
  *		an allocation and copy are required due to a full ring buffer.
  *		However, the helper will try to avoid doing the allocation
  *		if possible. Other errors can occur if input parameters are
- *		invalid either due to *start* byte not being valid part of msg
+ *		invalid either due to *start* byte not being valid part of *msg*
  *		payload and/or *pop* value being to large.
- *
  *	Return
- *		0 on success, or a negative erro in case of failure.
+ *		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
-- 
cgit v1.2.3-59-g8ed1b


From 846e980a87fc30075517d6d979548294d5461bdb Mon Sep 17 00:00:00 2001
From: Shalom Toledo <shalomt@mellanox.com>
Date: Mon, 3 Dec 2018 07:58:59 +0000
Subject: devlink: Add 'fw_load_policy' generic parameter

Many drivers load the device's firmware image during the initialization
flow either from the flash or from the disk. Currently this option is not
controlled by the user and the driver decides from where to load the
firmware image.

'fw_load_policy' gives the ability to control this option which allows the
user to choose between different loading policies supported by the driver.

This parameter can be useful while testing and/or debugging the device. For
example, testing a firmware bug fix.

Signed-off-by: Shalom Toledo <shalomt@mellanox.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/devlink-params.txt | 9 +++++++++
 include/net/devlink.h                       | 4 ++++
 include/uapi/linux/devlink.h                | 5 +++++
 net/core/devlink.c                          | 5 +++++
 4 files changed, 23 insertions(+)

(limited to 'include/uapi')

diff --git a/Documentation/networking/devlink-params.txt b/Documentation/networking/devlink-params.txt
index ae444ffe73ac..2d26434ddcf8 100644
--- a/Documentation/networking/devlink-params.txt
+++ b/Documentation/networking/devlink-params.txt
@@ -40,3 +40,12 @@ msix_vec_per_pf_min	[DEVICE, GENERIC]
 			for the device initialization. Value is same across all
 			physical functions (PFs) in the device.
 			Type: u32
+
+fw_load_policy		[DEVICE, GENERIC]
+			Controls the device's firmware loading policy.
+			Valid values:
+			* DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_DRIVER (0)
+			  Load firmware version preferred by the driver.
+			* DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_FLASH (1)
+			  Load firmware currently stored in flash.
+			Type: u8
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 45db0c79462d..67f4293bc970 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -365,6 +365,7 @@ enum devlink_param_generic_id {
 	DEVLINK_PARAM_GENERIC_ID_IGNORE_ARI,
 	DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX,
 	DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN,
+	DEVLINK_PARAM_GENERIC_ID_FW_LOAD_POLICY,
 
 	/* add new param generic ids above here*/
 	__DEVLINK_PARAM_GENERIC_ID_MAX,
@@ -392,6 +393,9 @@ enum devlink_param_generic_id {
 #define DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_NAME "msix_vec_per_pf_min"
 #define DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_TYPE DEVLINK_PARAM_TYPE_U32
 
+#define DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_NAME "fw_load_policy"
+#define DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_TYPE DEVLINK_PARAM_TYPE_U8
+
 #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate)	\
 {									\
 	.id = DEVLINK_PARAM_GENERIC_ID_##_id,				\
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 79407bbd296d..6e52d3660654 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -163,6 +163,11 @@ enum devlink_param_cmode {
 	DEVLINK_PARAM_CMODE_MAX = __DEVLINK_PARAM_CMODE_MAX - 1
 };
 
+enum devlink_param_fw_load_policy_value {
+	DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_DRIVER,
+	DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_FLASH,
+};
+
 enum devlink_attr {
 	/* don't change the order or add anything between, this is ABI! */
 	DEVLINK_ATTR_UNSPEC,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 3a4b29a13d31..abb0da9d7b4b 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2692,6 +2692,11 @@ static const struct devlink_param devlink_param_generic[] = {
 		.name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_NAME,
 		.type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_TYPE,
 	},
+	{
+		.id = DEVLINK_PARAM_GENERIC_ID_FW_LOAD_POLICY,
+		.name = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_NAME,
+		.type = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_TYPE,
+	},
 };
 
 static int devlink_param_generic_verify(const struct devlink_param *param)
-- 
cgit v1.2.3-59-g8ed1b


From b5a36b1e1b138285ea0df34bf96c759e1e30fafd Mon Sep 17 00:00:00 2001
From: Lorenz Bauer <lmb@cloudflare.com>
Date: Mon, 3 Dec 2018 11:31:23 +0000
Subject: bpf: respect size hint to BPF_PROG_TEST_RUN if present

Use data_size_out as a size hint when copying test output to user space.
ENOSPC is returned if the output buffer is too small.
Callers which so far did not set data_size_out are not affected.

Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h |  7 +++++--
 net/bpf/test_run.c       | 15 +++++++++++++--
 2 files changed, 18 insertions(+), 4 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 572eb2d42768..c8e1eeee2c5f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -374,8 +374,11 @@ union bpf_attr {
 	struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */
 		__u32		prog_fd;
 		__u32		retval;
-		__u32		data_size_in;
-		__u32		data_size_out;
+		__u32		data_size_in;	/* input: len of data_in */
+		__u32		data_size_out;	/* input/output: len of data_out
+						 *   returns ENOSPC if data_out
+						 *   is too small.
+						 */
 		__aligned_u64	data_in;
 		__aligned_u64	data_out;
 		__u32		repeat;
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index c89c22c49015..7663e6a57280 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -74,8 +74,18 @@ static int bpf_test_finish(const union bpf_attr *kattr,
 {
 	void __user *data_out = u64_to_user_ptr(kattr->test.data_out);
 	int err = -EFAULT;
+	u32 copy_size = size;
+
+	/* Clamp copy if the user has provided a size hint, but copy the full
+	 * buffer if not to retain old behaviour.
+	 */
+	if (kattr->test.data_size_out &&
+	    copy_size > kattr->test.data_size_out) {
+		copy_size = kattr->test.data_size_out;
+		err = -ENOSPC;
+	}
 
-	if (data_out && copy_to_user(data_out, data, size))
+	if (data_out && copy_to_user(data_out, data, copy_size))
 		goto out;
 	if (copy_to_user(&uattr->test.data_size_out, &size, sizeof(size)))
 		goto out;
@@ -83,7 +93,8 @@ static int bpf_test_finish(const union bpf_attr *kattr,
 		goto out;
 	if (copy_to_user(&uattr->test.duration, &duration, sizeof(duration)))
 		goto out;
-	err = 0;
+	if (err != -ENOSPC)
+		err = 0;
 out:
 	return err;
 }
-- 
cgit v1.2.3-59-g8ed1b


From d3b21767821ed322a4024c99bc360cd0892f3d82 Mon Sep 17 00:00:00 2001
From: Lukasz Spintzyk <lukasz.spintzyk@displaylink.com>
Date: Wed, 23 May 2018 19:04:08 -0700
Subject: drm: Add a new plane property to send damage during plane update

FB_DAMAGE_CLIPS is an optional plane property to mark damaged regions
on the plane in framebuffer coordinates of the framebuffer attached to
the plane.

The layout of blob data is simply an array of "struct drm_mode_rect".
Unlike plane src coordinates, damage clips are not in 16.16 fixed point.
As plane src in framebuffer cannot be negative so are damage clips. In
damage clip, x1/y1 are inclusive and x2/y2 are exclusive.

This patch also exports the kernel internal drm_rect to userspace as
drm_mode_rect. This is because "struct drm_clip_rect" is not sufficient
to represent damage for current plane size.

Driver which are interested in enabling FB_DAMAGE_CLIPS property for a
plane should enable this property using drm_plane_enable_damage_clips.

v2:
- Input validation on damage clips against framebuffer size.
- Doc update, other minor changes.

Signed-off-by: Lukasz Spintzyk <lukasz.spintzyk@displaylink.com>
Signed-off-by: Deepak Rawat <drawat@vmware.com>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Reviewed-by: Thomas Hellstrom <thellstrom@vmware.com>
Signed-off-by: Thomas Hellstrom <thellstrom@vmware.com>
---
 Documentation/gpu/drm-kms.rst       | 12 ++++++
 drivers/gpu/drm/Makefile            |  2 +-
 drivers/gpu/drm/drm_atomic.c        | 22 ++++++++++
 drivers/gpu/drm/drm_atomic_uapi.c   | 13 ++++++
 drivers/gpu/drm/drm_damage_helper.c | 83 +++++++++++++++++++++++++++++++++++++
 drivers/gpu/drm/drm_mode_config.c   |  6 +++
 include/drm/drm_damage_helper.h     | 39 +++++++++++++++++
 include/drm/drm_mode_config.h       |  9 ++++
 include/drm/drm_plane.h             | 40 ++++++++++++++++++
 include/uapi/drm/drm_mode.h         | 19 +++++++++
 10 files changed, 244 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/drm_damage_helper.c
 create mode 100644 include/drm/drm_damage_helper.h

(limited to 'include/uapi')

diff --git a/Documentation/gpu/drm-kms.rst b/Documentation/gpu/drm-kms.rst
index 4b1501b4835b..6c3e89e324f8 100644
--- a/Documentation/gpu/drm-kms.rst
+++ b/Documentation/gpu/drm-kms.rst
@@ -554,6 +554,18 @@ Plane Composition Properties
 .. kernel-doc:: drivers/gpu/drm/drm_blend.c
    :export:
 
+FB_DAMAGE_CLIPS
+~~~~~~~~~~~~~~~
+
+.. kernel-doc:: drivers/gpu/drm/drm_damage_helper.c
+   :doc: overview
+
+.. kernel-doc:: drivers/gpu/drm/drm_damage_helper.c
+   :export:
+
+.. kernel-doc:: include/drm/drm_damage_helper.h
+   :internal:
+
 Color Management Properties
 ---------------------------
 
diff --git a/drivers/gpu/drm/Makefile b/drivers/gpu/drm/Makefile
index 1fafc2f8e8f9..e2336cd2059b 100644
--- a/drivers/gpu/drm/Makefile
+++ b/drivers/gpu/drm/Makefile
@@ -37,7 +37,7 @@ drm_kms_helper-y := drm_crtc_helper.o drm_dp_helper.o drm_probe_helper.o \
 		drm_kms_helper_common.o drm_dp_dual_mode_helper.o \
 		drm_simple_kms_helper.o drm_modeset_helper.o \
 		drm_scdc_helper.o drm_gem_framebuffer_helper.o \
-		drm_atomic_state_helper.o
+		drm_atomic_state_helper.o drm_damage_helper.o
 
 drm_kms_helper-$(CONFIG_DRM_PANEL_BRIDGE) += bridge/panel.o
 drm_kms_helper-$(CONFIG_DRM_FBDEV_EMULATION) += drm_fb_helper.o
diff --git a/drivers/gpu/drm/drm_atomic.c b/drivers/gpu/drm/drm_atomic.c
index 9ac26437051b..48ec378fb27e 100644
--- a/drivers/gpu/drm/drm_atomic.c
+++ b/drivers/gpu/drm/drm_atomic.c
@@ -531,6 +531,8 @@ static int drm_atomic_plane_check(const struct drm_plane_state *old_plane_state,
 	struct drm_crtc *crtc = new_plane_state->crtc;
 	const struct drm_framebuffer *fb = new_plane_state->fb;
 	unsigned int fb_width, fb_height;
+	struct drm_mode_rect *clips;
+	uint32_t num_clips;
 	int ret;
 
 	/* either *both* CRTC and FB must be set, or neither */
@@ -604,6 +606,26 @@ static int drm_atomic_plane_check(const struct drm_plane_state *old_plane_state,
 		return -ENOSPC;
 	}
 
+	clips = drm_plane_get_damage_clips(new_plane_state);
+	num_clips = drm_plane_get_damage_clips_count(new_plane_state);
+
+	/* Make sure damage clips are valid and inside the fb. */
+	while (num_clips > 0) {
+		if (clips->x1 >= clips->x2 ||
+		    clips->y1 >= clips->y2 ||
+		    clips->x1 < 0 ||
+		    clips->y1 < 0 ||
+		    clips->x2 > fb_width ||
+		    clips->y2 > fb_height) {
+			DRM_DEBUG_ATOMIC("[PLANE:%d:%s] invalid damage clip %d %d %d %d\n",
+					 plane->base.id, plane->name, clips->x1,
+					 clips->y1, clips->x2, clips->y2);
+			return -EINVAL;
+		}
+		clips++;
+		num_clips--;
+	}
+
 	if (plane_switching_crtc(old_plane_state, new_plane_state)) {
 		DRM_DEBUG_ATOMIC("[PLANE:%d:%s] switching CRTC directly\n",
 				 plane->base.id, plane->name);
diff --git a/drivers/gpu/drm/drm_atomic_uapi.c b/drivers/gpu/drm/drm_atomic_uapi.c
index 86ac33922b09..0876c6941f7a 100644
--- a/drivers/gpu/drm/drm_atomic_uapi.c
+++ b/drivers/gpu/drm/drm_atomic_uapi.c
@@ -513,6 +513,8 @@ static int drm_atomic_plane_set_property(struct drm_plane *plane,
 {
 	struct drm_device *dev = plane->dev;
 	struct drm_mode_config *config = &dev->mode_config;
+	bool replaced = false;
+	int ret;
 
 	if (property == config->prop_fb_id) {
 		struct drm_framebuffer *fb = drm_framebuffer_lookup(dev, NULL, val);
@@ -566,6 +568,14 @@ static int drm_atomic_plane_set_property(struct drm_plane *plane,
 		state->color_encoding = val;
 	} else if (property == plane->color_range_property) {
 		state->color_range = val;
+	} else if (property == config->prop_fb_damage_clips) {
+		ret = drm_atomic_replace_property_blob_from_id(dev,
+					&state->fb_damage_clips,
+					val,
+					-1,
+					sizeof(struct drm_rect),
+					&replaced);
+		return ret;
 	} else if (plane->funcs->atomic_set_property) {
 		return plane->funcs->atomic_set_property(plane, state,
 				property, val);
@@ -621,6 +631,9 @@ drm_atomic_plane_get_property(struct drm_plane *plane,
 		*val = state->color_encoding;
 	} else if (property == plane->color_range_property) {
 		*val = state->color_range;
+	} else if (property == config->prop_fb_damage_clips) {
+		*val = (state->fb_damage_clips) ?
+			state->fb_damage_clips->base.id : 0;
 	} else if (plane->funcs->atomic_get_property) {
 		return plane->funcs->atomic_get_property(plane, state, property, val);
 	} else {
diff --git a/drivers/gpu/drm/drm_damage_helper.c b/drivers/gpu/drm/drm_damage_helper.c
new file mode 100644
index 000000000000..8dc906a489a9
--- /dev/null
+++ b/drivers/gpu/drm/drm_damage_helper.c
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/**************************************************************************
+ *
+ * Copyright (c) 2018 VMware, Inc., Palo Alto, CA., USA
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Deepak Rawat <drawat@vmware.com>
+ *
+ **************************************************************************/
+
+#include <drm/drm_damage_helper.h>
+
+/**
+ * DOC: overview
+ *
+ * FB_DAMAGE_CLIPS is an optional plane property which provides a means to
+ * specify a list of damage rectangles on a plane in framebuffer coordinates of
+ * the framebuffer attached to the plane. In current context damage is the area
+ * of plane framebuffer that has changed since last plane update (also called
+ * page-flip), irrespective of whether currently attached framebuffer is same as
+ * framebuffer attached during last plane update or not.
+ *
+ * FB_DAMAGE_CLIPS is a hint to kernel which could be helpful for some drivers
+ * to optimize internally especially for virtual devices where each framebuffer
+ * change needs to be transmitted over network, usb, etc.
+ *
+ * Since FB_DAMAGE_CLIPS is a hint so it is an optional property. User-space can
+ * ignore damage clips property and in that case driver will do a full plane
+ * update. In case damage clips are provided then it is guaranteed that the area
+ * inside damage clips will be updated to plane. For efficiency driver can do
+ * full update or can update more than specified in damage clips. Since driver
+ * is free to read more, user-space must always render the entire visible
+ * framebuffer. Otherwise there can be corruptions. Also, if a user-space
+ * provides damage clips which doesn't encompass the actual damage to
+ * framebuffer (since last plane update) can result in incorrect rendering.
+ *
+ * FB_DAMAGE_CLIPS is a blob property with the layout of blob data is simply an
+ * array of &drm_mode_rect. Unlike plane &drm_plane_state.src coordinates,
+ * damage clips are not in 16.16 fixed point. Similar to plane src in
+ * framebuffer, damage clips cannot be negative. In damage clip, x1/y1 are
+ * inclusive and x2/y2 are exclusive. While kernel does not error for overlapped
+ * damage clips, it is strongly discouraged.
+ *
+ * Drivers that are interested in damage interface for plane should enable
+ * FB_DAMAGE_CLIPS property by calling drm_plane_enable_fb_damage_clips().
+ */
+
+/**
+ * drm_plane_enable_fb_damage_clips - Enables plane fb damage clips property.
+ * @plane: Plane on which to enable damage clips property.
+ *
+ * This function lets driver to enable the damage clips property on a plane.
+ */
+void drm_plane_enable_fb_damage_clips(struct drm_plane *plane)
+{
+	struct drm_device *dev = plane->dev;
+	struct drm_mode_config *config = &dev->mode_config;
+
+	drm_object_attach_property(&plane->base, config->prop_fb_damage_clips,
+				   0);
+}
+EXPORT_SYMBOL(drm_plane_enable_fb_damage_clips);
diff --git a/drivers/gpu/drm/drm_mode_config.c b/drivers/gpu/drm/drm_mode_config.c
index ee80788f2c40..05cd5e9857e4 100644
--- a/drivers/gpu/drm/drm_mode_config.c
+++ b/drivers/gpu/drm/drm_mode_config.c
@@ -297,6 +297,12 @@ static int drm_mode_create_standard_properties(struct drm_device *dev)
 		return -ENOMEM;
 	dev->mode_config.prop_crtc_id = prop;
 
+	prop = drm_property_create(dev, DRM_MODE_PROP_BLOB, "FB_DAMAGE_CLIPS",
+				   0);
+	if (!prop)
+		return -ENOMEM;
+	dev->mode_config.prop_fb_damage_clips = prop;
+
 	prop = drm_property_create_bool(dev, DRM_MODE_PROP_ATOMIC,
 			"ACTIVE");
 	if (!prop)
diff --git a/include/drm/drm_damage_helper.h b/include/drm/drm_damage_helper.h
new file mode 100644
index 000000000000..4947c614fff9
--- /dev/null
+++ b/include/drm/drm_damage_helper.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/**************************************************************************
+ *
+ * Copyright (c) 2018 VMware, Inc., Palo Alto, CA., USA
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Deepak Rawat <drawat@vmware.com>
+ *
+ **************************************************************************/
+
+#ifndef DRM_DAMAGE_HELPER_H_
+#define DRM_DAMAGE_HELPER_H_
+
+#include <drm/drm_atomic_helper.h>
+
+void drm_plane_enable_fb_damage_clips(struct drm_plane *plane);
+
+#endif
diff --git a/include/drm/drm_mode_config.h b/include/drm/drm_mode_config.h
index 5dbeabdbaf91..862767f9fddc 100644
--- a/include/drm/drm_mode_config.h
+++ b/include/drm/drm_mode_config.h
@@ -633,6 +633,15 @@ struct drm_mode_config {
 	 * &drm_crtc.
 	 */
 	struct drm_property *prop_crtc_id;
+	/**
+	 * @prop_fb_damage_clips: Optional plane property to mark damaged
+	 * regions on the plane in framebuffer coordinates of the framebuffer
+	 * attached to the plane.
+	 *
+	 * The layout of blob data is simply an array of &drm_mode_rect. Unlike
+	 * plane src coordinates, damage clips are not in 16.16 fixed point.
+	 */
+	struct drm_property *prop_fb_damage_clips;
 	/**
 	 * @prop_active: Default atomic CRTC property to control the active
 	 * state, which is the simplified implementation for DPMS in atomic
diff --git a/include/drm/drm_plane.h b/include/drm/drm_plane.h
index 3701f56c3362..87153ecb5f1f 100644
--- a/include/drm/drm_plane.h
+++ b/include/drm/drm_plane.h
@@ -173,6 +173,16 @@ struct drm_plane_state {
 	 */
 	enum drm_color_range color_range;
 
+	/**
+	 * @fb_damage_clips:
+	 *
+	 * Blob representing damage (area in plane framebuffer that changed
+	 * since last plane update) as an array of &drm_mode_rect in framebuffer
+	 * coodinates of the attached framebuffer. Note that unlike plane src,
+	 * damage clips are not in 16.16 fixed point.
+	 */
+	struct drm_property_blob *fb_damage_clips;
+
 	/** @src: clipped source coordinates of the plane (in 16.16) */
 	/** @dst: clipped destination coordinates of the plane */
 	struct drm_rect src, dst;
@@ -800,5 +810,35 @@ static inline struct drm_plane *drm_plane_find(struct drm_device *dev,
 
 bool drm_any_plane_has_format(struct drm_device *dev,
 			      u32 format, u64 modifier);
+/**
+ * drm_plane_get_damage_clips_count - Returns damage clips count.
+ * @state: Plane state.
+ *
+ * Simple helper to get the number of &drm_mode_rect clips set by user-space
+ * during plane update.
+ *
+ * Return: Number of clips in plane fb_damage_clips blob property.
+ */
+static inline unsigned int
+drm_plane_get_damage_clips_count(const struct drm_plane_state *state)
+{
+	return (state && state->fb_damage_clips) ?
+		state->fb_damage_clips->length/sizeof(struct drm_mode_rect) : 0;
+}
+
+/**
+ * drm_plane_get_damage_clips - Returns damage clips.
+ * @state: Plane state.
+ *
+ * Note that this function returns uapi type &drm_mode_rect.
+ *
+ * Return: Damage clips in plane fb_damage_clips blob property.
+ */
+static inline struct drm_mode_rect *
+drm_plane_get_damage_clips(const struct drm_plane_state *state)
+{
+	return (struct drm_mode_rect *)((state && state->fb_damage_clips) ?
+					state->fb_damage_clips->data : NULL);
+}
 
 #endif
diff --git a/include/uapi/drm/drm_mode.h b/include/uapi/drm/drm_mode.h
index d3e0fe31efc5..a439c2e67896 100644
--- a/include/uapi/drm/drm_mode.h
+++ b/include/uapi/drm/drm_mode.h
@@ -888,6 +888,25 @@ struct drm_mode_revoke_lease {
 	__u32 lessee_id;
 };
 
+/**
+ * struct drm_mode_rect - Two dimensional rectangle.
+ * @x1: Horizontal starting coordinate (inclusive).
+ * @y1: Vertical starting coordinate (inclusive).
+ * @x2: Horizontal ending coordinate (exclusive).
+ * @y2: Vertical ending coordinate (exclusive).
+ *
+ * With drm subsystem using struct drm_rect to manage rectangular area this
+ * export it to user-space.
+ *
+ * Currently used by drm_mode_atomic blob property FB_DAMAGE_CLIPS.
+ */
+struct drm_mode_rect {
+	__s32 x1;
+	__s32 y1;
+	__s32 x2;
+	__s32 y2;
+};
+
 #if defined(__cplusplus)
 }
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From cc1068eb6ad21a6cf54aa5f9ae25bf50fd5c9d4b Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Thu, 15 Nov 2018 15:25:04 -0800
Subject: uapi/nl80211: fix spelling errors

Spelling errors found by codespell

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 51bd85b7d839..2b53c0e949c7 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1734,7 +1734,7 @@ enum nl80211_commands {
  *	the values passed in @NL80211_ATTR_SCAN_SSIDS (eg. if an SSID
  *	is included in the probe request, but the match attributes
  *	will never let it go through), -EINVAL may be returned.
- *	If ommited, no filtering is done.
+ *	If omitted, no filtering is done.
  *
  * @NL80211_ATTR_INTERFACE_COMBINATIONS: Nested attribute listing the supported
  *	interface combinations. In each nested item, it contains attributes
@@ -1839,7 +1839,7 @@ enum nl80211_commands {
  *
  * @NL80211_ATTR_INACTIVITY_TIMEOUT: timeout value in seconds, this can be
  *	used by the drivers which has MLME in firmware and does not have support
- *	to report per station tx/rx activity to free up the staion entry from
+ *	to report per station tx/rx activity to free up the station entry from
  *	the list. This needs to be used when the driver advertises the
  *	capability to timeout the stations.
  *
@@ -2200,7 +2200,7 @@ enum nl80211_commands {
  *
  * @NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST: When present the RSSI level for BSSs in
  *	the specified band is to be adjusted before doing
- *	%NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI based comparision to figure out
+ *	%NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI based comparison to figure out
  *	better BSSs. The attribute value is a packed structure
  *	value as specified by &struct nl80211_bss_select_rssi_adjust.
  *
@@ -4910,7 +4910,7 @@ enum nl80211_iface_limit_attrs {
  *	numbers = [ #{STA} <= 1, #{P2P-client,P2P-GO} <= 3 ], max = 4
  *	=> allows a STA plus three P2P interfaces
  *
- * The list of these four possiblities could completely be contained
+ * The list of these four possibilities could completely be contained
  * within the %NL80211_ATTR_INTERFACE_COMBINATIONS attribute to indicate
  * that any of these groups must match.
  *
@@ -4940,7 +4940,7 @@ enum nl80211_if_combination_attrs {
  * enum nl80211_plink_state - state of a mesh peer link finite state machine
  *
  * @NL80211_PLINK_LISTEN: initial state, considered the implicit
- *	state of non existant mesh peer links
+ *	state of non existent mesh peer links
  * @NL80211_PLINK_OPN_SNT: mesh plink open frame has been sent to
  *	this mesh peer
  * @NL80211_PLINK_OPN_RCVD: mesh plink open frame has been received
@@ -5432,7 +5432,7 @@ enum nl80211_timeout_reason {
  *	request parameters IE in the probe request
  * @NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP: accept broadcast probe responses
  * @NL80211_SCAN_FLAG_OCE_PROBE_REQ_HIGH_TX_RATE: send probe request frames at
- *	rate of at least 5.5M. In case non OCE AP is dicovered in the channel,
+ *	rate of at least 5.5M. In case non OCE AP is discovered in the channel,
  *	only the first probe req in the channel will be sent in high rate.
  * @NL80211_SCAN_FLAG_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION: allow probe request
  *	tx deferral (dot11FILSProbeDelay shall be set to 15ms)
-- 
cgit v1.2.3-59-g8ed1b


From c7bcb13442e1bd8a4cc41c74540dd2e2f7058d16 Mon Sep 17 00:00:00 2001
From: Lijun Ou <oulijun@huawei.com>
Date: Sat, 24 Nov 2018 16:49:21 +0800
Subject: RDMA/hns: Add SRQ support for hip08 kernel mode

This patch implements the SRQ(Share Receive Queue) verbs
and update the poll cq verbs to deal with SRQ complentions.

Signed-off-by: Lijun Ou <oulijun@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_cmd.h    |   4 +
 drivers/infiniband/hw/hns/hns_roce_device.h |  58 +++-
 drivers/infiniband/hw/hns/hns_roce_hem.c    |  32 ++-
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c  | 384 ++++++++++++++++++++++++--
 drivers/infiniband/hw/hns/hns_roce_hw_v2.h  |  86 ++++++
 drivers/infiniband/hw/hns/hns_roce_main.c   |  15 +
 drivers/infiniband/hw/hns/hns_roce_mr.c     | 137 +++++++++-
 drivers/infiniband/hw/hns/hns_roce_qp.c     |  21 +-
 drivers/infiniband/hw/hns/hns_roce_srq.c    | 407 ++++++++++++++++++++++++++++
 include/uapi/rdma/hns-abi.h                 |   6 +
 10 files changed, 1098 insertions(+), 52 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/infiniband/hw/hns/hns_roce_cmd.h b/drivers/infiniband/hw/hns/hns_roce_cmd.h
index 9549ae51a0dd..927701df5eff 100644
--- a/drivers/infiniband/hw/hns/hns_roce_cmd.h
+++ b/drivers/infiniband/hw/hns/hns_roce_cmd.h
@@ -120,6 +120,10 @@ enum {
 	HNS_ROCE_CMD_SQD2RTS_QP		= 0x20,
 	HNS_ROCE_CMD_2RST_QP		= 0x21,
 	HNS_ROCE_CMD_QUERY_QP		= 0x22,
+	HNS_ROCE_CMD_SW2HW_SRQ		= 0x70,
+	HNS_ROCE_CMD_MODIFY_SRQC	= 0x72,
+	HNS_ROCE_CMD_QUERY_SRQC		= 0x73,
+	HNS_ROCE_CMD_HW2SW_SRQ		= 0x74,
 };
 
 int hns_roce_cmd_mbox(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param,
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 5d79d8070a62..5a40746cb2e3 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -111,6 +111,9 @@
 #define PAGES_SHIFT_24				24
 #define PAGES_SHIFT_32				32
 
+#define HNS_ROCE_IDX_QUE_ENTRY_SZ		4
+#define SRQ_DB_REG				0x230
+
 enum {
 	HNS_ROCE_SUPPORT_RQ_RECORD_DB = 1 << 0,
 	HNS_ROCE_SUPPORT_SQ_RECORD_DB = 1 << 1,
@@ -436,9 +439,37 @@ struct hns_roce_cq {
 	struct completion		free;
 };
 
+struct hns_roce_idx_que {
+	struct hns_roce_buf		idx_buf;
+	int				entry_sz;
+	u32				buf_size;
+	struct ib_umem			*umem;
+	struct hns_roce_mtt		mtt;
+	u64				*bitmap;
+};
+
 struct hns_roce_srq {
 	struct ib_srq		ibsrq;
-	int			srqn;
+	void (*event)(struct hns_roce_srq *srq, enum hns_roce_event event);
+	unsigned long		srqn;
+	int			max;
+	int			max_gs;
+	int			wqe_shift;
+	void __iomem		*db_reg_l;
+
+	atomic_t		refcount;
+	struct completion	free;
+
+	struct hns_roce_buf	buf;
+	u64		       *wrid;
+	struct ib_umem	       *umem;
+	struct hns_roce_mtt	mtt;
+	struct hns_roce_idx_que idx_que;
+	spinlock_t		lock;
+	int			head;
+	int			tail;
+	u16			wqe_ctr;
+	struct mutex		mutex;
 };
 
 struct hns_roce_uar_table {
@@ -761,6 +792,12 @@ struct hns_roce_caps {
 	u32		cqe_ba_pg_sz;
 	u32		cqe_buf_pg_sz;
 	u32		cqe_hop_num;
+	u32		srqwqe_ba_pg_sz;
+	u32		srqwqe_buf_pg_sz;
+	u32		srqwqe_hop_num;
+	u32		idx_ba_pg_sz;
+	u32		idx_buf_pg_sz;
+	u32		idx_hop_num;
 	u32		eqe_ba_pg_sz;
 	u32		eqe_buf_pg_sz;
 	u32		eqe_hop_num;
@@ -829,6 +866,17 @@ struct hns_roce_hw {
 	int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period);
 	int (*init_eq)(struct hns_roce_dev *hr_dev);
 	void (*cleanup_eq)(struct hns_roce_dev *hr_dev);
+	void (*write_srqc)(struct hns_roce_dev *hr_dev,
+			   struct hns_roce_srq *srq, u32 pdn, u16 xrcd, u32 cqn,
+			   void *mb_buf, u64 *mtts_wqe, u64 *mtts_idx,
+			   dma_addr_t dma_handle_wqe,
+			   dma_addr_t dma_handle_idx);
+	int (*modify_srq)(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr,
+		       enum ib_srq_attr_mask srq_attr_mask,
+		       struct ib_udata *udata);
+	int (*query_srq)(struct ib_srq *ibsrq, struct ib_srq_attr *attr);
+	int (*post_srq_recv)(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
+			     const struct ib_recv_wr **bad_wr);
 };
 
 struct hns_roce_dev {
@@ -1038,6 +1086,14 @@ int hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, u32 max_direct,
 int hns_roce_ib_umem_write_mtt(struct hns_roce_dev *hr_dev,
 			       struct hns_roce_mtt *mtt, struct ib_umem *umem);
 
+struct ib_srq *hns_roce_create_srq(struct ib_pd *pd,
+				   struct ib_srq_init_attr *srq_init_attr,
+				   struct ib_udata *udata);
+int hns_roce_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr,
+			enum ib_srq_attr_mask srq_attr_mask,
+			struct ib_udata *udata);
+int hns_roce_destroy_srq(struct ib_srq *ibsrq);
+
 struct ib_qp *hns_roce_create_qp(struct ib_pd *ib_pd,
 				 struct ib_qp_init_attr *init_attr,
 				 struct ib_udata *udata);
diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c
index 388b0406e41f..4cdbcafa5915 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hem.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hem.c
@@ -46,7 +46,9 @@ bool hns_roce_check_whether_mhop(struct hns_roce_dev *hr_dev, u32 type)
 	    (hr_dev->caps.cqc_hop_num && type == HEM_TYPE_CQC) ||
 	    (hr_dev->caps.srqc_hop_num && type == HEM_TYPE_SRQC) ||
 	    (hr_dev->caps.cqe_hop_num && type == HEM_TYPE_CQE) ||
-	    (hr_dev->caps.mtt_hop_num && type == HEM_TYPE_MTT))
+	    (hr_dev->caps.mtt_hop_num && type == HEM_TYPE_MTT) ||
+	    (hr_dev->caps.srqwqe_hop_num && type == HEM_TYPE_SRQWQE) ||
+	    (hr_dev->caps.idx_hop_num && type == HEM_TYPE_IDX))
 		return true;
 
 	return false;
@@ -147,6 +149,22 @@ int hns_roce_calc_hem_mhop(struct hns_roce_dev *hr_dev,
 		mhop->ba_l0_num = mhop->bt_chunk_size / 8;
 		mhop->hop_num = hr_dev->caps.cqe_hop_num;
 		break;
+	case HEM_TYPE_SRQWQE:
+		mhop->buf_chunk_size = 1 << (hr_dev->caps.srqwqe_buf_pg_sz
+					    + PAGE_SHIFT);
+		mhop->bt_chunk_size = 1 << (hr_dev->caps.srqwqe_ba_pg_sz
+					    + PAGE_SHIFT);
+		mhop->ba_l0_num = mhop->bt_chunk_size / 8;
+		mhop->hop_num = hr_dev->caps.srqwqe_hop_num;
+		break;
+	case HEM_TYPE_IDX:
+		mhop->buf_chunk_size = 1 << (hr_dev->caps.idx_buf_pg_sz
+				       + PAGE_SHIFT);
+		mhop->bt_chunk_size = 1 << (hr_dev->caps.idx_ba_pg_sz
+				       + PAGE_SHIFT);
+		mhop->ba_l0_num = mhop->bt_chunk_size / 8;
+		mhop->hop_num = hr_dev->caps.idx_hop_num;
+		break;
 	default:
 		dev_err(dev, "Table %d not support multi-hop addressing!\n",
 			 table->type);
@@ -906,6 +924,18 @@ int hns_roce_init_hem_table(struct hns_roce_dev *hr_dev,
 			bt_chunk_size = buf_chunk_size;
 			hop_num = hr_dev->caps.cqe_hop_num;
 			break;
+		case HEM_TYPE_SRQWQE:
+			buf_chunk_size = 1 << (hr_dev->caps.srqwqe_ba_pg_sz
+					+ PAGE_SHIFT);
+			bt_chunk_size = buf_chunk_size;
+			hop_num = hr_dev->caps.srqwqe_hop_num;
+			break;
+		case HEM_TYPE_IDX:
+			buf_chunk_size = 1 << (hr_dev->caps.idx_ba_pg_sz
+					+ PAGE_SHIFT);
+			bt_chunk_size = buf_chunk_size;
+			hop_num = hr_dev->caps.idx_hop_num;
+			break;
 		default:
 			dev_err(dev,
 			  "Table %d not support to init hem table here!\n",
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 215f5ca377cc..6c9baf99894e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -1339,6 +1339,12 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
 	caps->cqe_ba_pg_sz	= 0;
 	caps->cqe_buf_pg_sz	= 0;
 	caps->cqe_hop_num	= HNS_ROCE_CQE_HOP_NUM;
+	caps->srqwqe_ba_pg_sz	= 0;
+	caps->srqwqe_buf_pg_sz	= 0;
+	caps->srqwqe_hop_num	= HNS_ROCE_SRQWQE_HOP_NUM;
+	caps->idx_ba_pg_sz	= 0;
+	caps->idx_buf_pg_sz	= 0;
+	caps->idx_hop_num	= HNS_ROCE_IDX_HOP_NUM;
 	caps->eqe_ba_pg_sz	= 0;
 	caps->eqe_buf_pg_sz	= 0;
 	caps->eqe_hop_num	= HNS_ROCE_EQE_HOP_NUM;
@@ -2028,6 +2034,27 @@ static struct hns_roce_v2_cqe *next_cqe_sw_v2(struct hns_roce_cq *hr_cq)
 	return get_sw_cqe_v2(hr_cq, hr_cq->cons_index);
 }
 
+static void *get_srq_wqe(struct hns_roce_srq *srq, int n)
+{
+	return hns_roce_buf_offset(&srq->buf, n << srq->wqe_shift);
+}
+
+static void hns_roce_free_srq_wqe(struct hns_roce_srq *srq, int wqe_index)
+{
+	u32 bitmap_num;
+	int bit_num;
+
+	/* always called with interrupts disabled. */
+	spin_lock(&srq->lock);
+
+	bitmap_num = wqe_index / (sizeof(u64) * 8);
+	bit_num = wqe_index % (sizeof(u64) * 8);
+	srq->idx_que.bitmap[bitmap_num] |= (1ULL << bit_num);
+	srq->tail++;
+
+	spin_unlock(&srq->lock);
+}
+
 static void hns_roce_v2_cq_set_ci(struct hns_roce_cq *hr_cq, u32 cons_index)
 {
 	*hr_cq->set_ci_db = cons_index & 0xffffff;
@@ -2039,6 +2066,7 @@ static void __hns_roce_v2_cq_clean(struct hns_roce_cq *hr_cq, u32 qpn,
 	struct hns_roce_v2_cqe *cqe, *dest;
 	u32 prod_index;
 	int nfreed = 0;
+	int wqe_index;
 	u8 owner_bit;
 
 	for (prod_index = hr_cq->cons_index; get_sw_cqe_v2(hr_cq, prod_index);
@@ -2056,7 +2084,13 @@ static void __hns_roce_v2_cq_clean(struct hns_roce_cq *hr_cq, u32 qpn,
 		if ((roce_get_field(cqe->byte_16, V2_CQE_BYTE_16_LCL_QPN_M,
 				    V2_CQE_BYTE_16_LCL_QPN_S) &
 				    HNS_ROCE_V2_CQE_QPN_MASK) == qpn) {
-			/* In v1 engine, not support SRQ */
+			if (srq &&
+			    roce_get_bit(cqe->byte_4, V2_CQE_BYTE_4_S_R_S)) {
+				wqe_index = roce_get_field(cqe->byte_4,
+						     V2_CQE_BYTE_4_WQE_INDX_M,
+						     V2_CQE_BYTE_4_WQE_INDX_S);
+				hns_roce_free_srq_wqe(srq, wqe_index);
+			}
 			++nfreed;
 		} else if (nfreed) {
 			dest = get_cqe_v2(hr_cq, (prod_index + nfreed) &
@@ -2233,6 +2267,7 @@ static int hns_roce_handle_recv_inl_wqe(struct hns_roce_v2_cqe *cqe,
 static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq,
 				struct hns_roce_qp **cur_qp, struct ib_wc *wc)
 {
+	struct hns_roce_srq *srq = NULL;
 	struct hns_roce_dev *hr_dev;
 	struct hns_roce_v2_cqe *cqe;
 	struct hns_roce_qp *hr_qp;
@@ -2275,6 +2310,37 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq,
 	wc->qp = &(*cur_qp)->ibqp;
 	wc->vendor_err = 0;
 
+	if (is_send) {
+		wq = &(*cur_qp)->sq;
+		if ((*cur_qp)->sq_signal_bits) {
+			/*
+			 * If sg_signal_bit is 1,
+			 * firstly tail pointer updated to wqe
+			 * which current cqe correspond to
+			 */
+			wqe_ctr = (u16)roce_get_field(cqe->byte_4,
+						      V2_CQE_BYTE_4_WQE_INDX_M,
+						      V2_CQE_BYTE_4_WQE_INDX_S);
+			wq->tail += (wqe_ctr - (u16)wq->tail) &
+				    (wq->wqe_cnt - 1);
+		}
+
+		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+		++wq->tail;
+	} else if ((*cur_qp)->ibqp.srq) {
+		srq = to_hr_srq((*cur_qp)->ibqp.srq);
+		wqe_ctr = le16_to_cpu(roce_get_field(cqe->byte_4,
+						     V2_CQE_BYTE_4_WQE_INDX_M,
+						     V2_CQE_BYTE_4_WQE_INDX_S));
+		wc->wr_id = srq->wrid[wqe_ctr];
+		hns_roce_free_srq_wqe(srq, wqe_ctr);
+	} else {
+		/* Update tail pointer, record wr_id */
+		wq = &(*cur_qp)->rq;
+		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+		++wq->tail;
+	}
+
 	status = roce_get_field(cqe->byte_4, V2_CQE_BYTE_4_STATUS_M,
 				V2_CQE_BYTE_4_STATUS_S);
 	switch (status & HNS_ROCE_V2_CQE_STATUS_MASK) {
@@ -2394,23 +2460,6 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq,
 			wc->status = IB_WC_GENERAL_ERR;
 			break;
 		}
-
-		wq = &(*cur_qp)->sq;
-		if ((*cur_qp)->sq_signal_bits) {
-			/*
-			 * If sg_signal_bit is 1,
-			 * firstly tail pointer updated to wqe
-			 * which current cqe correspond to
-			 */
-			wqe_ctr = (u16)roce_get_field(cqe->byte_4,
-						      V2_CQE_BYTE_4_WQE_INDX_M,
-						      V2_CQE_BYTE_4_WQE_INDX_S);
-			wq->tail += (wqe_ctr - (u16)wq->tail) &
-				    (wq->wqe_cnt - 1);
-		}
-
-		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
-		++wq->tail;
 	} else {
 		/* RQ correspond to CQE */
 		wc->byte_len = le32_to_cpu(cqe->byte_cnt);
@@ -2455,11 +2504,6 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq,
 				return -EAGAIN;
 		}
 
-		/* Update tail pointer, record wr_id */
-		wq = &(*cur_qp)->rq;
-		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
-		++wq->tail;
-
 		wc->sl = (u8)roce_get_field(cqe->byte_32, V2_CQE_BYTE_32_SL_M,
 					    V2_CQE_BYTE_32_SL_S);
 		wc->src_qp = (u8)roce_get_field(cqe->byte_32,
@@ -2768,6 +2812,8 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp,
 
 	roce_set_field(context->byte_20_smac_sgid_idx,
 		       V2_QPC_BYTE_20_RQ_SHIFT_M, V2_QPC_BYTE_20_RQ_SHIFT_S,
+		       (hr_qp->ibqp.qp_type == IB_QPT_XRC_INI ||
+		       hr_qp->ibqp.qp_type == IB_QPT_XRC_TGT || ibqp->srq) ? 0 :
 		       ilog2((unsigned int)hr_qp->rq.wqe_cnt));
 	roce_set_field(qpc_mask->byte_20_smac_sgid_idx,
 		       V2_QPC_BYTE_20_RQ_SHIFT_M, V2_QPC_BYTE_20_RQ_SHIFT_S, 0);
@@ -3109,6 +3155,8 @@ static void modify_qp_init_to_init(struct ib_qp *ibqp,
 
 	roce_set_field(context->byte_20_smac_sgid_idx,
 		       V2_QPC_BYTE_20_RQ_SHIFT_M, V2_QPC_BYTE_20_RQ_SHIFT_S,
+		       (hr_qp->ibqp.qp_type == IB_QPT_XRC_INI ||
+		       hr_qp->ibqp.qp_type == IB_QPT_XRC_TGT || ibqp->srq) ? 0 :
 		       ilog2((unsigned int)hr_qp->rq.wqe_cnt));
 	roce_set_field(qpc_mask->byte_20_smac_sgid_idx,
 		       V2_QPC_BYTE_20_RQ_SHIFT_M, V2_QPC_BYTE_20_RQ_SHIFT_S, 0);
@@ -3810,6 +3858,11 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
 	if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC))
 		set_access_flags(hr_qp, context, qpc_mask, attr, attr_mask);
 
+	roce_set_bit(context->byte_108_rx_reqepsn, V2_QPC_BYTE_108_INV_CREDIT_S,
+		     ibqp->srq ? 1 : 0);
+	roce_set_bit(qpc_mask->byte_108_rx_reqepsn,
+		     V2_QPC_BYTE_108_INV_CREDIT_S, 0);
+
 	/* Every status migrate must change state */
 	roce_set_field(context->byte_60_qpst_tempid, V2_QPC_BYTE_60_QP_ST_M,
 		       V2_QPC_BYTE_60_QP_ST_S, new_state);
@@ -4095,7 +4148,8 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev,
 			hns_roce_free_db(hr_dev, &hr_qp->rdb);
 	}
 
-	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) {
+	if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) &&
+	     hr_qp->rq.wqe_cnt) {
 		kfree(hr_qp->rq_inl_buf.wqe_list[0].sg_list);
 		kfree(hr_qp->rq_inl_buf.wqe_list);
 	}
@@ -5350,6 +5404,284 @@ static void hns_roce_v2_cleanup_eq_table(struct hns_roce_dev *hr_dev)
 	destroy_workqueue(hr_dev->irq_workq);
 }
 
+static void hns_roce_v2_write_srqc(struct hns_roce_dev *hr_dev,
+				   struct hns_roce_srq *srq, u32 pdn, u16 xrcd,
+				   u32 cqn, void *mb_buf, u64 *mtts_wqe,
+				   u64 *mtts_idx, dma_addr_t dma_handle_wqe,
+				   dma_addr_t dma_handle_idx)
+{
+	struct hns_roce_srq_context *srq_context;
+
+	srq_context = mb_buf;
+	memset(srq_context, 0, sizeof(*srq_context));
+
+	roce_set_field(srq_context->byte_4_srqn_srqst, SRQC_BYTE_4_SRQ_ST_M,
+		       SRQC_BYTE_4_SRQ_ST_S, 1);
+
+	roce_set_field(srq_context->byte_4_srqn_srqst,
+		       SRQC_BYTE_4_SRQ_WQE_HOP_NUM_M,
+		       SRQC_BYTE_4_SRQ_WQE_HOP_NUM_S,
+		       (hr_dev->caps.srqwqe_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 :
+		       hr_dev->caps.srqwqe_hop_num));
+	roce_set_field(srq_context->byte_4_srqn_srqst,
+		       SRQC_BYTE_4_SRQ_SHIFT_M, SRQC_BYTE_4_SRQ_SHIFT_S,
+		       ilog2(srq->max));
+
+	roce_set_field(srq_context->byte_4_srqn_srqst, SRQC_BYTE_4_SRQN_M,
+		       SRQC_BYTE_4_SRQN_S, srq->srqn);
+
+	roce_set_field(srq_context->byte_8_limit_wl, SRQC_BYTE_8_SRQ_LIMIT_WL_M,
+		       SRQC_BYTE_8_SRQ_LIMIT_WL_S, 0);
+
+	roce_set_field(srq_context->byte_12_xrcd, SRQC_BYTE_12_SRQ_XRCD_M,
+		       SRQC_BYTE_12_SRQ_XRCD_S, xrcd);
+
+	srq_context->wqe_bt_ba = cpu_to_le32((u32)(dma_handle_wqe >> 3));
+
+	roce_set_field(srq_context->byte_24_wqe_bt_ba,
+		       SRQC_BYTE_24_SRQ_WQE_BT_BA_M,
+		       SRQC_BYTE_24_SRQ_WQE_BT_BA_S,
+		       cpu_to_le32(dma_handle_wqe >> 35));
+
+	roce_set_field(srq_context->byte_28_rqws_pd, SRQC_BYTE_28_PD_M,
+		       SRQC_BYTE_28_PD_S, pdn);
+	roce_set_field(srq_context->byte_28_rqws_pd, SRQC_BYTE_28_RQWS_M,
+		       SRQC_BYTE_28_RQWS_S, srq->max_gs <= 0 ? 0 :
+		       fls(srq->max_gs - 1));
+
+	srq_context->idx_bt_ba = (u32)(dma_handle_idx >> 3);
+	srq_context->idx_bt_ba = cpu_to_le32(srq_context->idx_bt_ba);
+	roce_set_field(srq_context->rsv_idx_bt_ba,
+		       SRQC_BYTE_36_SRQ_IDX_BT_BA_M,
+		       SRQC_BYTE_36_SRQ_IDX_BT_BA_S,
+		       cpu_to_le32(dma_handle_idx >> 35));
+
+	srq_context->idx_cur_blk_addr = (u32)(mtts_idx[0] >> PAGE_ADDR_SHIFT);
+	srq_context->idx_cur_blk_addr =
+				     cpu_to_le32(srq_context->idx_cur_blk_addr);
+	roce_set_field(srq_context->byte_44_idxbufpgsz_addr,
+		       SRQC_BYTE_44_SRQ_IDX_CUR_BLK_ADDR_M,
+		       SRQC_BYTE_44_SRQ_IDX_CUR_BLK_ADDR_S,
+		       cpu_to_le32((mtts_idx[0]) >> (32 + PAGE_ADDR_SHIFT)));
+	roce_set_field(srq_context->byte_44_idxbufpgsz_addr,
+		       SRQC_BYTE_44_SRQ_IDX_HOP_NUM_M,
+		       SRQC_BYTE_44_SRQ_IDX_HOP_NUM_S,
+		       hr_dev->caps.idx_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 :
+		       hr_dev->caps.idx_hop_num);
+
+	roce_set_field(srq_context->byte_44_idxbufpgsz_addr,
+		       SRQC_BYTE_44_SRQ_IDX_BA_PG_SZ_M,
+		       SRQC_BYTE_44_SRQ_IDX_BA_PG_SZ_S,
+		       hr_dev->caps.idx_ba_pg_sz);
+	roce_set_field(srq_context->byte_44_idxbufpgsz_addr,
+		       SRQC_BYTE_44_SRQ_IDX_BUF_PG_SZ_M,
+		       SRQC_BYTE_44_SRQ_IDX_BUF_PG_SZ_S,
+		       hr_dev->caps.idx_buf_pg_sz);
+
+	srq_context->idx_nxt_blk_addr = (u32)(mtts_idx[1] >> PAGE_ADDR_SHIFT);
+	srq_context->idx_nxt_blk_addr =
+				   cpu_to_le32(srq_context->idx_nxt_blk_addr);
+	roce_set_field(srq_context->rsv_idxnxtblkaddr,
+		       SRQC_BYTE_52_SRQ_IDX_NXT_BLK_ADDR_M,
+		       SRQC_BYTE_52_SRQ_IDX_NXT_BLK_ADDR_S,
+		       cpu_to_le32((mtts_idx[1]) >> (32 + PAGE_ADDR_SHIFT)));
+	roce_set_field(srq_context->byte_56_xrc_cqn,
+		       SRQC_BYTE_56_SRQ_XRC_CQN_M, SRQC_BYTE_56_SRQ_XRC_CQN_S,
+		       cqn);
+	roce_set_field(srq_context->byte_56_xrc_cqn,
+		       SRQC_BYTE_56_SRQ_WQE_BA_PG_SZ_M,
+		       SRQC_BYTE_56_SRQ_WQE_BA_PG_SZ_S,
+		       hr_dev->caps.srqwqe_ba_pg_sz + PG_SHIFT_OFFSET);
+	roce_set_field(srq_context->byte_56_xrc_cqn,
+		       SRQC_BYTE_56_SRQ_WQE_BUF_PG_SZ_M,
+		       SRQC_BYTE_56_SRQ_WQE_BUF_PG_SZ_S,
+		       hr_dev->caps.srqwqe_buf_pg_sz + PG_SHIFT_OFFSET);
+
+	roce_set_bit(srq_context->db_record_addr_record_en,
+		     SRQC_BYTE_60_SRQ_RECORD_EN_S, 0);
+}
+
+static int hns_roce_v2_modify_srq(struct ib_srq *ibsrq,
+				  struct ib_srq_attr *srq_attr,
+				  enum ib_srq_attr_mask srq_attr_mask,
+				  struct ib_udata *udata)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibsrq->device);
+	struct hns_roce_srq *srq = to_hr_srq(ibsrq);
+	struct hns_roce_srq_context *srq_context;
+	struct hns_roce_srq_context *srqc_mask;
+	struct hns_roce_cmd_mailbox *mailbox;
+	int ret;
+
+	if (srq_attr_mask & IB_SRQ_LIMIT) {
+		if (srq_attr->srq_limit >= srq->max)
+			return -EINVAL;
+
+		mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
+		if (IS_ERR(mailbox))
+			return PTR_ERR(mailbox);
+
+		srq_context = mailbox->buf;
+		srqc_mask = (struct hns_roce_srq_context *)mailbox->buf + 1;
+
+		memset(srqc_mask, 0xff, sizeof(*srqc_mask));
+
+		roce_set_field(srq_context->byte_8_limit_wl,
+			       SRQC_BYTE_8_SRQ_LIMIT_WL_M,
+			       SRQC_BYTE_8_SRQ_LIMIT_WL_S, srq_attr->srq_limit);
+		roce_set_field(srqc_mask->byte_8_limit_wl,
+			       SRQC_BYTE_8_SRQ_LIMIT_WL_M,
+			       SRQC_BYTE_8_SRQ_LIMIT_WL_S, 0);
+
+		ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, srq->srqn, 0,
+					HNS_ROCE_CMD_MODIFY_SRQC,
+					HNS_ROCE_CMD_TIMEOUT_MSECS);
+		hns_roce_free_cmd_mailbox(hr_dev, mailbox);
+		if (ret) {
+			dev_err(hr_dev->dev,
+				"MODIFY SRQ Failed to cmd mailbox.\n");
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+int hns_roce_v2_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibsrq->device);
+	struct hns_roce_srq *srq = to_hr_srq(ibsrq);
+	struct hns_roce_srq_context *srq_context;
+	struct hns_roce_cmd_mailbox *mailbox;
+	int limit_wl;
+	int ret;
+
+	mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	srq_context = mailbox->buf;
+	ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, srq->srqn, 0,
+				HNS_ROCE_CMD_QUERY_SRQC,
+				HNS_ROCE_CMD_TIMEOUT_MSECS);
+	if (ret) {
+		dev_err(hr_dev->dev, "QUERY SRQ cmd process error\n");
+		goto out;
+	}
+
+	limit_wl = roce_get_field(srq_context->byte_8_limit_wl,
+				  SRQC_BYTE_8_SRQ_LIMIT_WL_M,
+				  SRQC_BYTE_8_SRQ_LIMIT_WL_S);
+
+	attr->srq_limit = limit_wl;
+	attr->max_wr    = srq->max - 1;
+	attr->max_sge   = srq->max_gs;
+
+	memcpy(srq_context, mailbox->buf, sizeof(*srq_context));
+
+out:
+	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
+	return ret;
+}
+
+static int find_empty_entry(struct hns_roce_idx_que *idx_que)
+{
+	int bit_num;
+	int i;
+
+	/* bitmap[i] is set zero if all bits are allocated */
+	for (i = 0; idx_que->bitmap[i] == 0; ++i)
+		;
+	bit_num = ffs(idx_que->bitmap[i]);
+	idx_que->bitmap[i] &= ~(1ULL << (bit_num - 1));
+
+	return i * sizeof(u64) * 8 + (bit_num - 1);
+}
+
+static void fill_idx_queue(struct hns_roce_idx_que *idx_que,
+			   int cur_idx, int wqe_idx)
+{
+	unsigned int *addr;
+
+	addr = (unsigned int *)hns_roce_buf_offset(&idx_que->idx_buf,
+						   cur_idx * idx_que->entry_sz);
+	*addr = wqe_idx;
+}
+
+static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq,
+				     const struct ib_recv_wr *wr,
+				     const struct ib_recv_wr **bad_wr)
+{
+	struct hns_roce_srq *srq = to_hr_srq(ibsrq);
+	struct hns_roce_v2_wqe_data_seg *dseg;
+	struct hns_roce_v2_db srq_db;
+	unsigned long flags;
+	int ret = 0;
+	int wqe_idx;
+	void *wqe;
+	int nreq;
+	int ind;
+	int i;
+
+	spin_lock_irqsave(&srq->lock, flags);
+
+	ind = srq->head & (srq->max - 1);
+
+	for (nreq = 0; wr; ++nreq, wr = wr->next) {
+		if (unlikely(wr->num_sge > srq->max_gs)) {
+			ret = -EINVAL;
+			*bad_wr = wr;
+			break;
+		}
+
+		if (unlikely(srq->head == srq->tail)) {
+			ret = -ENOMEM;
+			*bad_wr = wr;
+			break;
+		}
+
+		wqe_idx = find_empty_entry(&srq->idx_que);
+		fill_idx_queue(&srq->idx_que, ind, wqe_idx);
+		wqe = get_srq_wqe(srq, wqe_idx);
+		dseg = (struct hns_roce_v2_wqe_data_seg *)wqe;
+
+		for (i = 0; i < wr->num_sge; ++i) {
+			dseg[i].len = cpu_to_le32(wr->sg_list[i].length);
+			dseg[i].lkey = cpu_to_le32(wr->sg_list[i].lkey);
+			dseg[i].addr = cpu_to_le64(wr->sg_list[i].addr);
+		}
+
+		if (i < srq->max_gs) {
+			dseg->len = 0;
+			dseg->lkey = cpu_to_le32(0x100);
+			dseg->addr = 0;
+		}
+
+		srq->wrid[wqe_idx] = wr->wr_id;
+		ind = (ind + 1) & (srq->max - 1);
+	}
+
+	if (likely(nreq)) {
+		srq->head += nreq;
+
+		/*
+		 * Make sure that descriptors are written before
+		 * doorbell record.
+		 */
+		wmb();
+
+		srq_db.byte_4 = HNS_ROCE_V2_SRQ_DB << 24 | srq->srqn;
+		srq_db.parameter = srq->head;
+
+		hns_roce_write64_k((__le32 *)&srq_db, srq->db_reg_l);
+
+	}
+
+	spin_unlock_irqrestore(&srq->lock, flags);
+
+	return ret;
+}
+
 static const struct hns_roce_hw hns_roce_hw_v2 = {
 	.cmq_init = hns_roce_v2_cmq_init,
 	.cmq_exit = hns_roce_v2_cmq_exit,
@@ -5377,6 +5709,10 @@ static const struct hns_roce_hw hns_roce_hw_v2 = {
 	.poll_cq = hns_roce_v2_poll_cq,
 	.init_eq = hns_roce_v2_init_eq_table,
 	.cleanup_eq = hns_roce_v2_cleanup_eq_table,
+	.write_srqc = hns_roce_v2_write_srqc,
+	.modify_srq = hns_roce_v2_modify_srq,
+	.query_srq = hns_roce_v2_query_srq,
+	.post_srq_recv = hns_roce_v2_post_srq_recv,
 };
 
 static const struct pci_device_id hns_roce_hw_v2_pci_tbl[] = {
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
index 383bab5e75ef..c48301c6fe5d 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
@@ -93,8 +93,10 @@
 #define HNS_ROCE_CONTEXT_HOP_NUM		1
 #define HNS_ROCE_MTT_HOP_NUM			1
 #define HNS_ROCE_CQE_HOP_NUM			1
+#define HNS_ROCE_SRQWQE_HOP_NUM			1
 #define HNS_ROCE_PBL_HOP_NUM			2
 #define HNS_ROCE_EQE_HOP_NUM			2
+#define HNS_ROCE_IDX_HOP_NUM			1
 
 #define HNS_ROCE_V2_GID_INDEX_NUM		256
 
@@ -334,6 +336,90 @@ struct hns_roce_v2_cq_context {
 #define	V2_CQC_BYTE_64_SE_CQE_IDX_S 0
 #define	V2_CQC_BYTE_64_SE_CQE_IDX_M GENMASK(23, 0)
 
+struct hns_roce_srq_context {
+	__le32	byte_4_srqn_srqst;
+	__le32	byte_8_limit_wl;
+	__le32	byte_12_xrcd;
+	__le32	byte_16_pi_ci;
+	__le32	wqe_bt_ba;
+	__le32	byte_24_wqe_bt_ba;
+	__le32	byte_28_rqws_pd;
+	__le32	idx_bt_ba;
+	__le32	rsv_idx_bt_ba;
+	__le32	idx_cur_blk_addr;
+	__le32	byte_44_idxbufpgsz_addr;
+	__le32	idx_nxt_blk_addr;
+	__le32	rsv_idxnxtblkaddr;
+	__le32	byte_56_xrc_cqn;
+	__le32	db_record_addr_record_en;
+	__le32	db_record_addr;
+};
+
+#define SRQC_BYTE_4_SRQ_ST_S 0
+#define SRQC_BYTE_4_SRQ_ST_M GENMASK(1, 0)
+
+#define SRQC_BYTE_4_SRQ_WQE_HOP_NUM_S 2
+#define SRQC_BYTE_4_SRQ_WQE_HOP_NUM_M GENMASK(3, 2)
+
+#define SRQC_BYTE_4_SRQ_SHIFT_S 4
+#define SRQC_BYTE_4_SRQ_SHIFT_M GENMASK(7, 4)
+
+#define SRQC_BYTE_4_SRQN_S 8
+#define SRQC_BYTE_4_SRQN_M GENMASK(31, 8)
+
+#define SRQC_BYTE_8_SRQ_LIMIT_WL_S 0
+#define SRQC_BYTE_8_SRQ_LIMIT_WL_M GENMASK(15, 0)
+
+#define SRQC_BYTE_12_SRQ_XRCD_S 0
+#define SRQC_BYTE_12_SRQ_XRCD_M GENMASK(23, 0)
+
+#define SRQC_BYTE_16_SRQ_PRODUCER_IDX_S 0
+#define SRQC_BYTE_16_SRQ_PRODUCER_IDX_M GENMASK(15, 0)
+
+#define SRQC_BYTE_16_SRQ_CONSUMER_IDX_S 0
+#define SRQC_BYTE_16_SRQ_CONSUMER_IDX_M GENMASK(31, 16)
+
+#define SRQC_BYTE_24_SRQ_WQE_BT_BA_S 0
+#define SRQC_BYTE_24_SRQ_WQE_BT_BA_M GENMASK(28, 0)
+
+#define SRQC_BYTE_28_PD_S 0
+#define SRQC_BYTE_28_PD_M GENMASK(23, 0)
+
+#define SRQC_BYTE_28_RQWS_S 24
+#define SRQC_BYTE_28_RQWS_M GENMASK(27, 24)
+
+#define SRQC_BYTE_36_SRQ_IDX_BT_BA_S 0
+#define SRQC_BYTE_36_SRQ_IDX_BT_BA_M GENMASK(28, 0)
+
+#define SRQC_BYTE_44_SRQ_IDX_CUR_BLK_ADDR_S 0
+#define SRQC_BYTE_44_SRQ_IDX_CUR_BLK_ADDR_M GENMASK(19, 0)
+
+#define SRQC_BYTE_44_SRQ_IDX_HOP_NUM_S 22
+#define SRQC_BYTE_44_SRQ_IDX_HOP_NUM_M GENMASK(23, 22)
+
+#define SRQC_BYTE_44_SRQ_IDX_BA_PG_SZ_S 24
+#define SRQC_BYTE_44_SRQ_IDX_BA_PG_SZ_M GENMASK(27, 24)
+
+#define SRQC_BYTE_44_SRQ_IDX_BUF_PG_SZ_S 28
+#define SRQC_BYTE_44_SRQ_IDX_BUF_PG_SZ_M GENMASK(31, 28)
+
+#define SRQC_BYTE_52_SRQ_IDX_NXT_BLK_ADDR_S 0
+#define SRQC_BYTE_52_SRQ_IDX_NXT_BLK_ADDR_M GENMASK(19, 0)
+
+#define SRQC_BYTE_56_SRQ_XRC_CQN_S 0
+#define SRQC_BYTE_56_SRQ_XRC_CQN_M GENMASK(23, 0)
+
+#define SRQC_BYTE_56_SRQ_WQE_BA_PG_SZ_S 24
+#define SRQC_BYTE_56_SRQ_WQE_BA_PG_SZ_M GENMASK(27, 24)
+
+#define SRQC_BYTE_56_SRQ_WQE_BUF_PG_SZ_S 28
+#define SRQC_BYTE_56_SRQ_WQE_BUF_PG_SZ_M GENMASK(31, 28)
+
+#define SRQC_BYTE_60_SRQ_RECORD_EN_S 0
+
+#define SRQC_BYTE_60_SRQ_DB_RECORD_ADDR_S 1
+#define SRQC_BYTE_60_SRQ_DB_RECORD_ADDR_M GENMASK(31, 1)
+
 enum{
 	V2_MPT_ST_VALID = 0x1,
 	V2_MPT_ST_FREE	= 0x2,
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index d183f13d206f..65ba43cee810 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -546,6 +546,21 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev)
 		ib_dev->map_mr_sg		= hns_roce_map_mr_sg;
 	}
 
+	/* SRQ */
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SRQ) {
+		ib_dev->create_srq = hns_roce_create_srq;
+		ib_dev->modify_srq = hr_dev->hw->modify_srq;
+		ib_dev->query_srq = hr_dev->hw->query_srq;
+		ib_dev->destroy_srq = hns_roce_destroy_srq;
+		ib_dev->post_srq_recv = hr_dev->hw->post_srq_recv;
+		ib_dev->uverbs_cmd_mask |=
+				(1ULL << IB_USER_VERBS_CMD_CREATE_SRQ) |
+				(1ULL << IB_USER_VERBS_CMD_MODIFY_SRQ) |
+				(1ULL << IB_USER_VERBS_CMD_QUERY_SRQ) |
+				(1ULL << IB_USER_VERBS_CMD_DESTROY_SRQ) |
+				(1ULL << IB_USER_VERBS_CMD_POST_SRQ_RECV);
+	}
+
 	/* OTHERS */
 	ib_dev->get_port_immutable	= hns_roce_port_immutable;
 	ib_dev->disassociate_ucontext	= hns_roce_disassociate_ucontext;
diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index 521ad2aa3a4e..fabc95dbdf97 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -184,12 +184,27 @@ static int hns_roce_alloc_mtt_range(struct hns_roce_dev *hr_dev, int order,
 	struct hns_roce_buddy *buddy;
 	int ret;
 
-	if (mtt_type == MTT_TYPE_WQE) {
+	switch (mtt_type) {
+	case MTT_TYPE_WQE:
 		buddy = &mr_table->mtt_buddy;
 		table = &mr_table->mtt_table;
-	} else {
+		break;
+	case MTT_TYPE_CQE:
 		buddy = &mr_table->mtt_cqe_buddy;
 		table = &mr_table->mtt_cqe_table;
+		break;
+	case MTT_TYPE_SRQWQE:
+		buddy = &mr_table->mtt_srqwqe_buddy;
+		table = &mr_table->mtt_srqwqe_table;
+		break;
+	case MTT_TYPE_IDX:
+		buddy = &mr_table->mtt_idx_buddy;
+		table = &mr_table->mtt_idx_table;
+		break;
+	default:
+		dev_err(hr_dev->dev, "Unsupport MTT table type: %d\n",
+			mtt_type);
+		return -EINVAL;
 	}
 
 	ret = hns_roce_buddy_alloc(buddy, order, seg);
@@ -242,18 +257,40 @@ void hns_roce_mtt_cleanup(struct hns_roce_dev *hr_dev, struct hns_roce_mtt *mtt)
 	if (mtt->order < 0)
 		return;
 
-	if (mtt->mtt_type == MTT_TYPE_WQE) {
+	switch (mtt->mtt_type) {
+	case MTT_TYPE_WQE:
 		hns_roce_buddy_free(&mr_table->mtt_buddy, mtt->first_seg,
 				    mtt->order);
 		hns_roce_table_put_range(hr_dev, &mr_table->mtt_table,
 					mtt->first_seg,
 					mtt->first_seg + (1 << mtt->order) - 1);
-	} else {
+		break;
+	case MTT_TYPE_CQE:
 		hns_roce_buddy_free(&mr_table->mtt_cqe_buddy, mtt->first_seg,
 				    mtt->order);
 		hns_roce_table_put_range(hr_dev, &mr_table->mtt_cqe_table,
 					mtt->first_seg,
 					mtt->first_seg + (1 << mtt->order) - 1);
+		break;
+	case MTT_TYPE_SRQWQE:
+		hns_roce_buddy_free(&mr_table->mtt_srqwqe_buddy, mtt->first_seg,
+				    mtt->order);
+		hns_roce_table_put_range(hr_dev, &mr_table->mtt_srqwqe_table,
+					mtt->first_seg,
+					mtt->first_seg + (1 << mtt->order) - 1);
+		break;
+	case MTT_TYPE_IDX:
+		hns_roce_buddy_free(&mr_table->mtt_idx_buddy, mtt->first_seg,
+				    mtt->order);
+		hns_roce_table_put_range(hr_dev, &mr_table->mtt_idx_table,
+					mtt->first_seg,
+					mtt->first_seg + (1 << mtt->order) - 1);
+		break;
+	default:
+		dev_err(hr_dev->dev,
+			"Unsupport mtt type %d, clean mtt failed\n",
+			mtt->mtt_type);
+		break;
 	}
 }
 EXPORT_SYMBOL_GPL(hns_roce_mtt_cleanup);
@@ -713,10 +750,26 @@ static int hns_roce_write_mtt_chunk(struct hns_roce_dev *hr_dev,
 	u32 bt_page_size;
 	u32 i;
 
-	if (mtt->mtt_type == MTT_TYPE_WQE)
+	switch (mtt->mtt_type) {
+	case MTT_TYPE_WQE:
+		table = &hr_dev->mr_table.mtt_table;
 		bt_page_size = 1 << (hr_dev->caps.mtt_ba_pg_sz + PAGE_SHIFT);
-	else
+		break;
+	case MTT_TYPE_CQE:
+		table = &hr_dev->mr_table.mtt_cqe_table;
 		bt_page_size = 1 << (hr_dev->caps.cqe_ba_pg_sz + PAGE_SHIFT);
+		break;
+	case MTT_TYPE_SRQWQE:
+		table = &hr_dev->mr_table.mtt_srqwqe_table;
+		bt_page_size = 1 << (hr_dev->caps.srqwqe_ba_pg_sz + PAGE_SHIFT);
+		break;
+	case MTT_TYPE_IDX:
+		table = &hr_dev->mr_table.mtt_idx_table;
+		bt_page_size = 1 << (hr_dev->caps.idx_ba_pg_sz + PAGE_SHIFT);
+		break;
+	default:
+		return -EINVAL;
+	}
 
 	/* All MTTs must fit in the same page */
 	if (start_index / (bt_page_size / sizeof(u64)) !=
@@ -726,11 +779,6 @@ static int hns_roce_write_mtt_chunk(struct hns_roce_dev *hr_dev,
 	if (start_index & (HNS_ROCE_MTT_ENTRY_PER_SEG - 1))
 		return -EINVAL;
 
-	if (mtt->mtt_type == MTT_TYPE_WQE)
-		table = &hr_dev->mr_table.mtt_table;
-	else
-		table = &hr_dev->mr_table.mtt_cqe_table;
-
 	mtts = hns_roce_table_find(hr_dev, table,
 				mtt->first_seg + s / hr_dev->caps.mtt_entry_sz,
 				&dma_handle);
@@ -759,10 +807,25 @@ static int hns_roce_write_mtt(struct hns_roce_dev *hr_dev,
 	if (mtt->order < 0)
 		return -EINVAL;
 
-	if (mtt->mtt_type == MTT_TYPE_WQE)
+	switch (mtt->mtt_type) {
+	case MTT_TYPE_WQE:
 		bt_page_size = 1 << (hr_dev->caps.mtt_ba_pg_sz + PAGE_SHIFT);
-	else
+		break;
+	case MTT_TYPE_CQE:
 		bt_page_size = 1 << (hr_dev->caps.cqe_ba_pg_sz + PAGE_SHIFT);
+		break;
+	case MTT_TYPE_SRQWQE:
+		bt_page_size = 1 << (hr_dev->caps.srqwqe_ba_pg_sz + PAGE_SHIFT);
+		break;
+	case MTT_TYPE_IDX:
+		bt_page_size = 1 << (hr_dev->caps.idx_ba_pg_sz + PAGE_SHIFT);
+		break;
+	default:
+		dev_err(hr_dev->dev,
+			"Unsupport mtt type %d, write mtt failed\n",
+			mtt->mtt_type);
+		return -EINVAL;
+	}
 
 	while (npages > 0) {
 		chunk = min_t(int, bt_page_size / sizeof(u64), npages);
@@ -828,8 +891,31 @@ int hns_roce_init_mr_table(struct hns_roce_dev *hr_dev)
 		if (ret)
 			goto err_buddy_cqe;
 	}
+
+	if (hr_dev->caps.num_srqwqe_segs) {
+		ret = hns_roce_buddy_init(&mr_table->mtt_srqwqe_buddy,
+					  ilog2(hr_dev->caps.num_srqwqe_segs));
+		if (ret)
+			goto err_buddy_srqwqe;
+	}
+
+	if (hr_dev->caps.num_idx_segs) {
+		ret = hns_roce_buddy_init(&mr_table->mtt_idx_buddy,
+					  ilog2(hr_dev->caps.num_idx_segs));
+		if (ret)
+			goto err_buddy_idx;
+	}
+
 	return 0;
 
+err_buddy_idx:
+	if (hr_dev->caps.num_srqwqe_segs)
+		hns_roce_buddy_cleanup(&mr_table->mtt_srqwqe_buddy);
+
+err_buddy_srqwqe:
+	if (hns_roce_check_whether_mhop(hr_dev, HEM_TYPE_CQE))
+		hns_roce_buddy_cleanup(&mr_table->mtt_cqe_buddy);
+
 err_buddy_cqe:
 	hns_roce_buddy_cleanup(&mr_table->mtt_buddy);
 
@@ -842,6 +928,10 @@ void hns_roce_cleanup_mr_table(struct hns_roce_dev *hr_dev)
 {
 	struct hns_roce_mr_table *mr_table = &hr_dev->mr_table;
 
+	if (hr_dev->caps.num_idx_segs)
+		hns_roce_buddy_cleanup(&mr_table->mtt_idx_buddy);
+	if (hr_dev->caps.num_srqwqe_segs)
+		hns_roce_buddy_cleanup(&mr_table->mtt_srqwqe_buddy);
 	hns_roce_buddy_cleanup(&mr_table->mtt_buddy);
 	if (hns_roce_check_whether_mhop(hr_dev, HEM_TYPE_CQE))
 		hns_roce_buddy_cleanup(&mr_table->mtt_cqe_buddy);
@@ -897,8 +987,25 @@ int hns_roce_ib_umem_write_mtt(struct hns_roce_dev *hr_dev,
 	u32 bt_page_size;
 	u32 n;
 
-	order = mtt->mtt_type == MTT_TYPE_WQE ? hr_dev->caps.mtt_ba_pg_sz :
-		hr_dev->caps.cqe_ba_pg_sz;
+	switch (mtt->mtt_type) {
+	case MTT_TYPE_WQE:
+		order = hr_dev->caps.mtt_ba_pg_sz;
+		break;
+	case MTT_TYPE_CQE:
+		order = hr_dev->caps.cqe_ba_pg_sz;
+		break;
+	case MTT_TYPE_SRQWQE:
+		order = hr_dev->caps.srqwqe_ba_pg_sz;
+		break;
+	case MTT_TYPE_IDX:
+		order = hr_dev->caps.idx_ba_pg_sz;
+		break;
+	default:
+		dev_err(dev, "Unsupport mtt type %d, write mtt failed\n",
+			mtt->mtt_type);
+		return -EINVAL;
+	}
+
 	bt_page_size = 1 << (order + PAGE_SHIFT);
 
 	pages = (u64 *) __get_free_pages(GFP_KERNEL, order);
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index 5ebf481a39d9..52d2b299b3be 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -280,7 +280,7 @@ void hns_roce_release_range_qp(struct hns_roce_dev *hr_dev, int base_qpn,
 EXPORT_SYMBOL_GPL(hns_roce_release_range_qp);
 
 static int hns_roce_set_rq_size(struct hns_roce_dev *hr_dev,
-				struct ib_qp_cap *cap, int is_user, int has_srq,
+				struct ib_qp_cap *cap, int is_user, int has_rq,
 				struct hns_roce_qp *hr_qp)
 {
 	struct device *dev = hr_dev->dev;
@@ -294,14 +294,12 @@ static int hns_roce_set_rq_size(struct hns_roce_dev *hr_dev,
 		return -EINVAL;
 	}
 
-	/* If srq exit, set zero for relative number of rq */
-	if (has_srq) {
-		if (cap->max_recv_wr) {
-			dev_dbg(dev, "srq no need config max_recv_wr\n");
-			return -EINVAL;
-		}
-
-		hr_qp->rq.wqe_cnt = hr_qp->rq.max_gs = 0;
+	/* If srq exist, set zero for relative number of rq */
+	if (!has_rq) {
+		hr_qp->rq.wqe_cnt = 0;
+		hr_qp->rq.max_gs = 0;
+		cap->max_recv_wr = 0;
+		cap->max_recv_sge = 0;
 	} else {
 		if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge)) {
 			dev_err(dev, "user space no need config max_recv_wr max_recv_sge\n");
@@ -563,13 +561,14 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 		hr_qp->sq_signal_bits = cpu_to_le32(IB_SIGNAL_REQ_WR);
 
 	ret = hns_roce_set_rq_size(hr_dev, &init_attr->cap, !!ib_pd->uobject,
-				   !!init_attr->srq, hr_qp);
+				   hns_roce_qp_has_rq(init_attr), hr_qp);
 	if (ret) {
 		dev_err(dev, "hns_roce_set_rq_size failed\n");
 		goto err_out;
 	}
 
-	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) {
+	if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) &&
+	    hns_roce_qp_has_rq(init_attr)) {
 		/* allocate recv inline buf */
 		hr_qp->rq_inl_buf.wqe_list = kcalloc(hr_qp->rq.wqe_cnt,
 					       sizeof(struct hns_roce_rinl_wqe),
diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c
index d8a86130febe..46732d25a8a8 100644
--- a/drivers/infiniband/hw/hns/hns_roce_srq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_srq.c
@@ -9,6 +9,413 @@
 #include "hns_roce_cmd.h"
 #include "hns_roce_hem.h"
 
+static void hns_roce_ib_srq_event(struct hns_roce_srq *srq,
+				  enum hns_roce_event event_type)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(srq->ibsrq.device);
+	struct ib_srq *ibsrq = &srq->ibsrq;
+	struct ib_event event;
+
+	if (ibsrq->event_handler) {
+		event.device      = ibsrq->device;
+		event.element.srq = ibsrq;
+		switch (event_type) {
+		case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH:
+			event.event = IB_EVENT_SRQ_LIMIT_REACHED;
+			break;
+		case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR:
+			event.event = IB_EVENT_SRQ_ERR;
+			break;
+		default:
+			dev_err(hr_dev->dev,
+			   "hns_roce:Unexpected event type 0x%x on SRQ %06lx\n",
+			   event_type, srq->srqn);
+			return;
+		}
+
+		ibsrq->event_handler(&event, ibsrq->srq_context);
+	}
+}
+
+static int hns_roce_sw2hw_srq(struct hns_roce_dev *dev,
+			      struct hns_roce_cmd_mailbox *mailbox,
+			      unsigned long srq_num)
+{
+	return hns_roce_cmd_mbox(dev, mailbox->dma, 0, srq_num, 0,
+				 HNS_ROCE_CMD_SW2HW_SRQ,
+				 HNS_ROCE_CMD_TIMEOUT_MSECS);
+}
+
+static int hns_roce_hw2sw_srq(struct hns_roce_dev *dev,
+			     struct hns_roce_cmd_mailbox *mailbox,
+			     unsigned long srq_num)
+{
+	return hns_roce_cmd_mbox(dev, 0, mailbox ? mailbox->dma : 0, srq_num,
+				 mailbox ? 0 : 1, HNS_ROCE_CMD_HW2SW_SRQ,
+				 HNS_ROCE_CMD_TIMEOUT_MSECS);
+}
+
+int hns_roce_srq_alloc(struct hns_roce_dev *hr_dev, u32 pdn, u32 cqn, u16 xrcd,
+		       struct hns_roce_mtt *hr_mtt, u64 db_rec_addr,
+		       struct hns_roce_srq *srq)
+{
+	struct hns_roce_srq_table *srq_table = &hr_dev->srq_table;
+	struct hns_roce_cmd_mailbox *mailbox;
+	dma_addr_t dma_handle_wqe;
+	dma_addr_t dma_handle_idx;
+	u64 *mtts_wqe;
+	u64 *mtts_idx;
+	int ret;
+
+	/* Get the physical address of srq buf */
+	mtts_wqe = hns_roce_table_find(hr_dev,
+				       &hr_dev->mr_table.mtt_srqwqe_table,
+				       srq->mtt.first_seg,
+				       &dma_handle_wqe);
+	if (!mtts_wqe) {
+		dev_err(hr_dev->dev,
+			"SRQ alloc.Failed to find srq buf addr.\n");
+		return -EINVAL;
+	}
+
+	/* Get physical address of idx que buf */
+	mtts_idx = hns_roce_table_find(hr_dev, &hr_dev->mr_table.mtt_idx_table,
+				       srq->idx_que.mtt.first_seg,
+				       &dma_handle_idx);
+	if (!mtts_idx) {
+		dev_err(hr_dev->dev,
+			"SRQ alloc.Failed to find idx que buf addr.\n");
+		return -EINVAL;
+	}
+
+	ret = hns_roce_bitmap_alloc(&srq_table->bitmap, &srq->srqn);
+	if (ret == -1) {
+		dev_err(hr_dev->dev, "SRQ alloc.Failed to alloc index.\n");
+		return -ENOMEM;
+	}
+
+	ret = hns_roce_table_get(hr_dev, &srq_table->table, srq->srqn);
+	if (ret)
+		goto err_out;
+
+	ret = xa_err(xa_store(&srq_table->xa, srq->srqn, srq, GFP_KERNEL));
+	if (ret)
+		goto err_put;
+
+	mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
+	if (IS_ERR(mailbox)) {
+		ret = PTR_ERR(mailbox);
+		goto err_xa;
+	}
+
+	hr_dev->hw->write_srqc(hr_dev, srq, pdn, xrcd, cqn, mailbox->buf,
+			       mtts_wqe, mtts_idx, dma_handle_wqe,
+			       dma_handle_idx);
+
+	ret = hns_roce_sw2hw_srq(hr_dev, mailbox, srq->srqn);
+	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
+	if (ret)
+		goto err_xa;
+
+	atomic_set(&srq->refcount, 1);
+	init_completion(&srq->free);
+	return ret;
+
+err_xa:
+	xa_erase(&srq_table->xa, srq->srqn);
+
+err_put:
+	hns_roce_table_put(hr_dev, &srq_table->table, srq->srqn);
+
+err_out:
+	hns_roce_bitmap_free(&srq_table->bitmap, srq->srqn, BITMAP_NO_RR);
+	return ret;
+}
+
+void hns_roce_srq_free(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq)
+{
+	struct hns_roce_srq_table *srq_table = &hr_dev->srq_table;
+	int ret;
+
+	ret = hns_roce_hw2sw_srq(hr_dev, NULL, srq->srqn);
+	if (ret)
+		dev_err(hr_dev->dev, "HW2SW_SRQ failed (%d) for CQN %06lx\n",
+			ret, srq->srqn);
+
+	xa_erase(&srq_table->xa, srq->srqn);
+
+	if (atomic_dec_and_test(&srq->refcount))
+		complete(&srq->free);
+	wait_for_completion(&srq->free);
+
+	hns_roce_table_put(hr_dev, &srq_table->table, srq->srqn);
+	hns_roce_bitmap_free(&srq_table->bitmap, srq->srqn, BITMAP_NO_RR);
+}
+
+static int hns_roce_create_idx_que(struct ib_pd *pd, struct hns_roce_srq *srq,
+				   u32 page_shift)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(pd->device);
+	struct hns_roce_idx_que *idx_que = &srq->idx_que;
+	u32 bitmap_num;
+	int i;
+
+	bitmap_num = HNS_ROCE_ALOGN_UP(srq->max, 8 * sizeof(u64));
+
+	idx_que->bitmap = kcalloc(1, bitmap_num / 8, GFP_KERNEL);
+	if (!idx_que->bitmap)
+		return -ENOMEM;
+
+	bitmap_num = bitmap_num / (8 * sizeof(u64));
+
+	idx_que->buf_size = srq->idx_que.buf_size;
+
+	if (hns_roce_buf_alloc(hr_dev, idx_que->buf_size, (1 << page_shift) * 2,
+			       &idx_que->idx_buf, page_shift)) {
+		kfree(idx_que->bitmap);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < bitmap_num; i++)
+		idx_que->bitmap[i] = ~(0UL);
+
+	return 0;
+}
+
+struct ib_srq *hns_roce_create_srq(struct ib_pd *pd,
+				   struct ib_srq_init_attr *srq_init_attr,
+				   struct ib_udata *udata)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(pd->device);
+	struct hns_roce_srq *srq;
+	int srq_desc_size;
+	int srq_buf_size;
+	u32 page_shift;
+	int ret = 0;
+	u32 npages;
+	u32 cqn;
+
+	/* Check the actual SRQ wqe and SRQ sge num */
+	if (srq_init_attr->attr.max_wr >= hr_dev->caps.max_srq_wrs ||
+	    srq_init_attr->attr.max_sge > hr_dev->caps.max_srq_sges)
+		return ERR_PTR(-EINVAL);
+
+	srq = kzalloc(sizeof(*srq), GFP_KERNEL);
+	if (!srq)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&srq->mutex);
+	spin_lock_init(&srq->lock);
+
+	srq->max = roundup_pow_of_two(srq_init_attr->attr.max_wr + 1);
+	srq->max_gs = srq_init_attr->attr.max_sge;
+
+	srq_desc_size = max(16, 16 * srq->max_gs);
+
+	srq->wqe_shift = ilog2(srq_desc_size);
+
+	srq_buf_size = srq->max * srq_desc_size;
+
+	srq->idx_que.entry_sz = HNS_ROCE_IDX_QUE_ENTRY_SZ;
+	srq->idx_que.buf_size = srq->max * srq->idx_que.entry_sz;
+	srq->mtt.mtt_type = MTT_TYPE_SRQWQE;
+	srq->idx_que.mtt.mtt_type = MTT_TYPE_IDX;
+
+	if (udata) {
+		struct hns_roce_ib_create_srq  ucmd;
+
+		if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) {
+			ret = -EFAULT;
+			goto err_srq;
+		}
+
+		srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr,
+					srq_buf_size, 0, 0);
+		if (IS_ERR(srq->umem)) {
+			ret = PTR_ERR(srq->umem);
+			goto err_srq;
+		}
+
+		if (hr_dev->caps.srqwqe_buf_pg_sz) {
+			npages = (ib_umem_page_count(srq->umem) +
+				  (1 << hr_dev->caps.srqwqe_buf_pg_sz) - 1) /
+				  (1 << hr_dev->caps.srqwqe_buf_pg_sz);
+			page_shift = PAGE_SHIFT + hr_dev->caps.srqwqe_buf_pg_sz;
+			ret = hns_roce_mtt_init(hr_dev, npages,
+						page_shift,
+						&srq->mtt);
+		} else
+			ret = hns_roce_mtt_init(hr_dev,
+						ib_umem_page_count(srq->umem),
+						srq->umem->page_shift,
+						&srq->mtt);
+		if (ret)
+			goto err_buf;
+
+		ret = hns_roce_ib_umem_write_mtt(hr_dev, &srq->mtt, srq->umem);
+		if (ret)
+			goto err_srq_mtt;
+
+		/* config index queue BA */
+		srq->idx_que.umem = ib_umem_get(pd->uobject->context,
+						ucmd.que_addr,
+						srq->idx_que.buf_size, 0, 0);
+		if (IS_ERR(srq->idx_que.umem)) {
+			dev_err(hr_dev->dev,
+				"ib_umem_get error for index queue\n");
+			goto err_srq_mtt;
+		}
+
+		if (hr_dev->caps.idx_buf_pg_sz) {
+			npages = (ib_umem_page_count(srq->idx_que.umem) +
+				  (1 << hr_dev->caps.idx_buf_pg_sz) - 1) /
+				  (1 << hr_dev->caps.idx_buf_pg_sz);
+			page_shift = PAGE_SHIFT + hr_dev->caps.idx_buf_pg_sz;
+			ret = hns_roce_mtt_init(hr_dev, npages,
+						page_shift, &srq->idx_que.mtt);
+		} else {
+			ret = hns_roce_mtt_init(hr_dev,
+				       ib_umem_page_count(srq->idx_que.umem),
+				       srq->idx_que.umem->page_shift,
+				       &srq->idx_que.mtt);
+		}
+
+		if (ret) {
+			dev_err(hr_dev->dev,
+				"hns_roce_mtt_init error for idx que\n");
+			goto err_idx_mtt;
+		}
+
+		ret = hns_roce_ib_umem_write_mtt(hr_dev, &srq->idx_que.mtt,
+						 srq->idx_que.umem);
+		if (ret) {
+			dev_err(hr_dev->dev,
+			      "hns_roce_ib_umem_write_mtt error for idx que\n");
+			goto err_idx_buf;
+		}
+	} else {
+		page_shift = PAGE_SHIFT + hr_dev->caps.srqwqe_buf_pg_sz;
+		if (hns_roce_buf_alloc(hr_dev, srq_buf_size,
+				      (1 << page_shift) * 2,
+				      &srq->buf, page_shift)) {
+			ret = -ENOMEM;
+			goto err_srq;
+		}
+
+		srq->head = 0;
+		srq->tail = srq->max - 1;
+
+		ret = hns_roce_mtt_init(hr_dev, srq->buf.npages,
+					srq->buf.page_shift, &srq->mtt);
+		if (ret)
+			goto err_buf;
+
+		ret = hns_roce_buf_write_mtt(hr_dev, &srq->mtt, &srq->buf);
+		if (ret)
+			goto err_srq_mtt;
+
+		page_shift = PAGE_SHIFT + hr_dev->caps.idx_buf_pg_sz;
+		ret = hns_roce_create_idx_que(pd, srq, page_shift);
+		if (ret) {
+			dev_err(hr_dev->dev, "Create idx queue fail(%d)!\n",
+				ret);
+			goto err_srq_mtt;
+		}
+
+		/* Init mtt table for idx_que */
+		ret = hns_roce_mtt_init(hr_dev, srq->idx_que.idx_buf.npages,
+					srq->idx_que.idx_buf.page_shift,
+					&srq->idx_que.mtt);
+		if (ret)
+			goto err_create_idx;
+
+		/* Write buffer address into the mtt table */
+		ret = hns_roce_buf_write_mtt(hr_dev, &srq->idx_que.mtt,
+					     &srq->idx_que.idx_buf);
+		if (ret)
+			goto err_idx_buf;
+
+		srq->wrid = kvmalloc_array(srq->max, sizeof(u64), GFP_KERNEL);
+		if (!srq->wrid) {
+			ret = -ENOMEM;
+			goto err_idx_buf;
+		}
+	}
+
+	cqn = ib_srq_has_cq(srq_init_attr->srq_type) ?
+	      to_hr_cq(srq_init_attr->ext.cq)->cqn : 0;
+
+	srq->db_reg_l = hr_dev->reg_base + SRQ_DB_REG;
+
+	ret = hns_roce_srq_alloc(hr_dev, to_hr_pd(pd)->pdn, cqn, 0,
+				 &srq->mtt, 0, srq);
+	if (ret)
+		goto err_wrid;
+
+	srq->event = hns_roce_ib_srq_event;
+	srq->ibsrq.ext.xrc.srq_num = srq->srqn;
+
+	if (pd->uobject) {
+		if (ib_copy_to_udata(udata, &srq->srqn, sizeof(__u32))) {
+			ret = -EFAULT;
+			goto err_wrid;
+		}
+	}
+
+	return &srq->ibsrq;
+
+err_wrid:
+	kvfree(srq->wrid);
+
+err_idx_buf:
+	hns_roce_mtt_cleanup(hr_dev, &srq->idx_que.mtt);
+
+err_idx_mtt:
+	if (udata)
+		ib_umem_release(srq->idx_que.umem);
+
+err_create_idx:
+	hns_roce_buf_free(hr_dev, srq->idx_que.buf_size,
+			  &srq->idx_que.idx_buf);
+	kfree(srq->idx_que.bitmap);
+
+err_srq_mtt:
+	hns_roce_mtt_cleanup(hr_dev, &srq->mtt);
+
+err_buf:
+	if (udata)
+		ib_umem_release(srq->umem);
+	else
+		hns_roce_buf_free(hr_dev, srq_buf_size, &srq->buf);
+
+err_srq:
+	kfree(srq);
+	return ERR_PTR(ret);
+}
+
+int hns_roce_destroy_srq(struct ib_srq *ibsrq)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibsrq->device);
+	struct hns_roce_srq *srq = to_hr_srq(ibsrq);
+
+	hns_roce_srq_free(hr_dev, srq);
+	hns_roce_mtt_cleanup(hr_dev, &srq->mtt);
+
+	if (ibsrq->uobject) {
+		hns_roce_mtt_cleanup(hr_dev, &srq->idx_que.mtt);
+		ib_umem_release(srq->idx_que.umem);
+		ib_umem_release(srq->umem);
+	} else {
+		kvfree(srq->wrid);
+		hns_roce_buf_free(hr_dev, srq->max << srq->wqe_shift,
+				  &srq->buf);
+	}
+
+	kfree(srq);
+
+	return 0;
+}
+
 int hns_roce_init_srq_table(struct hns_roce_dev *hr_dev)
 {
 	struct hns_roce_srq_table *srq_table = &hr_dev->srq_table;
diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h
index c1f87735514f..ef3c7ec793a7 100644
--- a/include/uapi/rdma/hns-abi.h
+++ b/include/uapi/rdma/hns-abi.h
@@ -46,6 +46,12 @@ struct hns_roce_ib_create_cq_resp {
 	__aligned_u64 cap_flags;
 };
 
+struct hns_roce_ib_create_srq {
+	__aligned_u64 buf_addr;
+	__aligned_u64 db_addr;
+	__aligned_u64 que_addr;
+};
+
 struct hns_roce_ib_create_qp {
 	__aligned_u64 buf_addr;
 	__aligned_u64 db_addr;
-- 
cgit v1.2.3-59-g8ed1b


From d30d42e08c76cb9323ec6121190eb026b07f773b Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 5 Dec 2018 17:35:44 -0800
Subject: bpf: Change insn_offset to insn_off in bpf_func_info

The later patch will introduce "struct bpf_line_info" which
has member "line_off" and "file_off" referring back to the
string section in btf.  The line_"off" and file_"off"
are more consistent to the naming convention in btf.h that
means "offset" (e.g. name_off in "struct btf_type").

The to-be-added "struct bpf_line_info" also has another
member, "insn_off" which is the same as the "insn_offset"
in "struct bpf_func_info".  Hence, this patch renames "insn_offset"
to "insn_off" for "struct bpf_func_info".

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h |  2 +-
 kernel/bpf/verifier.c    | 18 +++++++++---------
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c8e1eeee2c5f..a84fd232d934 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2991,7 +2991,7 @@ struct bpf_flow_keys {
 };
 
 struct bpf_func_info {
-	__u32	insn_offset;
+	__u32	insn_off;
 	__u32	type_id;
 };
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 71988337ac14..7658c61c1a88 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4707,24 +4707,24 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env,
 			goto free_btf;
 		}
 
-		/* check insn_offset */
+		/* check insn_off */
 		if (i == 0) {
-			if (krecord[i].insn_offset) {
+			if (krecord[i].insn_off) {
 				verbose(env,
-					"nonzero insn_offset %u for the first func info record",
-					krecord[i].insn_offset);
+					"nonzero insn_off %u for the first func info record",
+					krecord[i].insn_off);
 				ret = -EINVAL;
 				goto free_btf;
 			}
-		} else if (krecord[i].insn_offset <= prev_offset) {
+		} else if (krecord[i].insn_off <= prev_offset) {
 			verbose(env,
 				"same or smaller insn offset (%u) than previous func info record (%u)",
-				krecord[i].insn_offset, prev_offset);
+				krecord[i].insn_off, prev_offset);
 			ret = -EINVAL;
 			goto free_btf;
 		}
 
-		if (env->subprog_info[i].start != krecord[i].insn_offset) {
+		if (env->subprog_info[i].start != krecord[i].insn_off) {
 			verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n");
 			ret = -EINVAL;
 			goto free_btf;
@@ -4739,7 +4739,7 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env,
 			goto free_btf;
 		}
 
-		prev_offset = krecord[i].insn_offset;
+		prev_offset = krecord[i].insn_off;
 		urecord += urec_size;
 	}
 
@@ -4762,7 +4762,7 @@ static void adjust_btf_func(struct bpf_verifier_env *env)
 		return;
 
 	for (i = 0; i < env->subprog_cnt; i++)
-		env->prog->aux->func_info[i].insn_offset = env->subprog_info[i].start;
+		env->prog->aux->func_info[i].insn_off = env->subprog_info[i].start;
 }
 
 /* check %cur's range satisfies %old's */
-- 
cgit v1.2.3-59-g8ed1b


From 4e21565b7fd4d9045765f697887e74a704135fe2 Mon Sep 17 00:00:00 2001
From: AKASHI Takahiro <takahiro.akashi@linaro.org>
Date: Thu, 15 Nov 2018 14:52:40 +0900
Subject: asm-generic: add kexec_file_load system call to unistd.h

The initial user of this system call number is arm64.

Signed-off-by: AKASHI Takahiro <takahiro.akashi@linaro.org>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 include/uapi/asm-generic/unistd.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 538546edbfbd..3b7196295fa6 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -738,9 +738,11 @@ __SYSCALL(__NR_statx,     sys_statx)
 __SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents)
 #define __NR_rseq 293
 __SYSCALL(__NR_rseq, sys_rseq)
+#define __NR_kexec_file_load 294
+__SYSCALL(__NR_kexec_file_load,     sys_kexec_file_load)
 
 #undef __NR_syscalls
-#define __NR_syscalls 294
+#define __NR_syscalls 295
 
 /*
  * 32 bit systems traditionally used different
-- 
cgit v1.2.3-59-g8ed1b


From d2e9ace47aac92a465c4ad8e0cd1f5f8422a117e Mon Sep 17 00:00:00 2001
From: Kaike Wan <kaike.wan@intel.com>
Date: Wed, 28 Nov 2018 10:22:20 -0800
Subject: IB/hfi1: Add OPFN and TID RDMA capability bits

The OPFN and TID RDMA capability bits are added to allow users to control
which feature is enabled and disabled.

Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Mitko Haralanov <mitko.haralanov@intel.com>
Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hfi1/common.h | 19 +++++++++++--------
 include/uapi/rdma/hfi/hfi1_user.h   |  6 +++---
 2 files changed, 14 insertions(+), 11 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/infiniband/hw/hfi1/common.h b/drivers/infiniband/hw/hfi1/common.h
index 7108d4d92259..40d3cfb58bd1 100644
--- a/drivers/infiniband/hw/hfi1/common.h
+++ b/drivers/infiniband/hw/hfi1/common.h
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2015, 2016 Intel Corporation.
+ * Copyright(c) 2015 - 2018 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -136,18 +136,21 @@
 				  HFI1_CAP_ALLOW_PERM_JKEY |		\
 				  HFI1_CAP_STATIC_RATE_CTRL |		\
 				  HFI1_CAP_PRINT_UNIMPL |		\
-				  HFI1_CAP_TID_UNMAP)
+				  HFI1_CAP_TID_UNMAP |			\
+				  HFI1_CAP_OPFN)
 /*
  * A set of capability bits that are "global" and are not allowed to be
  * set in the user bitmask.
  */
 #define HFI1_CAP_RESERVED_MASK   ((HFI1_CAP_SDMA |			\
-				  HFI1_CAP_USE_SDMA_HEAD |		\
-				  HFI1_CAP_EXTENDED_PSN |		\
-				  HFI1_CAP_PRINT_UNIMPL |		\
-				  HFI1_CAP_NO_INTEGRITY |		\
-				  HFI1_CAP_PKEY_CHECK) <<		\
-				 HFI1_CAP_USER_SHIFT)
+				   HFI1_CAP_USE_SDMA_HEAD |		\
+				   HFI1_CAP_EXTENDED_PSN |		\
+				   HFI1_CAP_PRINT_UNIMPL |		\
+				   HFI1_CAP_NO_INTEGRITY |		\
+				   HFI1_CAP_PKEY_CHECK |		\
+				   HFI1_CAP_TID_RDMA |			\
+				   HFI1_CAP_OPFN) <<			\
+				  HFI1_CAP_USER_SHIFT)
 /*
  * Set of capabilities that need to be enabled for kernel context in
  * order to be allowed for user contexts, as well.
diff --git a/include/uapi/rdma/hfi/hfi1_user.h b/include/uapi/rdma/hfi/hfi1_user.h
index c6a984c0c881..01ac5853d9ac 100644
--- a/include/uapi/rdma/hfi/hfi1_user.h
+++ b/include/uapi/rdma/hfi/hfi1_user.h
@@ -6,7 +6,7 @@
  *
  * GPL LICENSE SUMMARY
  *
- * Copyright(c) 2015 Intel Corporation.
+ * Copyright(c) 2015 - 2018 Intel Corporation.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of version 2 of the GNU General Public License as
@@ -95,7 +95,7 @@
 #define HFI1_CAP_SDMA_AHG         (1UL <<  2) /* Enable SDMA AHG support */
 #define HFI1_CAP_EXTENDED_PSN     (1UL <<  3) /* Enable Extended PSN support */
 #define HFI1_CAP_HDRSUPP          (1UL <<  4) /* Enable Header Suppression */
-/* 1UL << 5 unused */
+#define HFI1_CAP_TID_RDMA         (1UL <<  5) /* Enable TID RDMA operations */
 #define HFI1_CAP_USE_SDMA_HEAD    (1UL <<  6) /* DMA Hdr Q tail vs. use CSR */
 #define HFI1_CAP_MULTI_PKT_EGR    (1UL <<  7) /* Enable multi-packet Egr buffs*/
 #define HFI1_CAP_NODROP_RHQ_FULL  (1UL <<  8) /* Don't drop on Hdr Q full */
@@ -106,7 +106,7 @@
 #define HFI1_CAP_NO_INTEGRITY     (1UL << 13) /* Enable ctxt integrity checks */
 #define HFI1_CAP_PKEY_CHECK       (1UL << 14) /* Enable ctxt PKey checking */
 #define HFI1_CAP_STATIC_RATE_CTRL (1UL << 15) /* Allow PBC.StaticRateControl */
-/* 1UL << 16 unused */
+#define HFI1_CAP_OPFN             (1UL << 16) /* Enable the OPFN protocol */
 #define HFI1_CAP_SDMA_HEAD_CHECK  (1UL << 17) /* SDMA head checking */
 #define HFI1_CAP_EARLY_CREDIT_RETURN (1UL << 18) /* early credit return */
 
-- 
cgit v1.2.3-59-g8ed1b


From 6e8e72cd206e2ba68801e4f2490f639d41808c8d Mon Sep 17 00:00:00 2001
From: Corentin Labbe <clabbe@baylibre.com>
Date: Thu, 29 Nov 2018 14:42:18 +0000
Subject: crypto: user - convert all stats from u32 to u64

All the 32-bit fields need to be 64-bit.  In some cases, UINT32_MAX crypto
operations can be done in seconds.

Reported-by: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Corentin Labbe <clabbe@baylibre.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/algapi.c                 |  10 ++--
 crypto/crypto_user_stat.c       | 114 +++++++++++++++++++---------------------
 include/crypto/acompress.h      |   8 +--
 include/crypto/aead.h           |   8 +--
 include/crypto/akcipher.h       |  16 +++---
 include/crypto/hash.h           |   6 +--
 include/crypto/kpp.h            |  12 ++---
 include/crypto/rng.h            |   8 +--
 include/crypto/skcipher.h       |   8 +--
 include/linux/crypto.h          |  46 ++++++++--------
 include/uapi/linux/cryptouser.h |  38 +++++++-------
 11 files changed, 133 insertions(+), 141 deletions(-)

(limited to 'include/uapi')

diff --git a/crypto/algapi.c b/crypto/algapi.c
index f5396c88e8cd..42fe316f80ee 100644
--- a/crypto/algapi.c
+++ b/crypto/algapi.c
@@ -259,13 +259,13 @@ static struct crypto_larval *__crypto_register_alg(struct crypto_alg *alg)
 	list_add(&larval->alg.cra_list, &crypto_alg_list);
 
 #ifdef CONFIG_CRYPTO_STATS
-	atomic_set(&alg->encrypt_cnt, 0);
-	atomic_set(&alg->decrypt_cnt, 0);
+	atomic64_set(&alg->encrypt_cnt, 0);
+	atomic64_set(&alg->decrypt_cnt, 0);
 	atomic64_set(&alg->encrypt_tlen, 0);
 	atomic64_set(&alg->decrypt_tlen, 0);
-	atomic_set(&alg->verify_cnt, 0);
-	atomic_set(&alg->cipher_err_cnt, 0);
-	atomic_set(&alg->sign_cnt, 0);
+	atomic64_set(&alg->verify_cnt, 0);
+	atomic64_set(&alg->cipher_err_cnt, 0);
+	atomic64_set(&alg->sign_cnt, 0);
 #endif
 
 out:
diff --git a/crypto/crypto_user_stat.c b/crypto/crypto_user_stat.c
index a6fb2e6f618d..352569f378a0 100644
--- a/crypto/crypto_user_stat.c
+++ b/crypto/crypto_user_stat.c
@@ -35,22 +35,21 @@ static int crypto_report_aead(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_stat raead;
 	u64 v64;
-	u32 v32;
 
 	memset(&raead, 0, sizeof(raead));
 
 	strscpy(raead.type, "aead", sizeof(raead.type));
 
-	v32 = atomic_read(&alg->encrypt_cnt);
-	raead.stat_encrypt_cnt = v32;
+	v64 = atomic64_read(&alg->encrypt_cnt);
+	raead.stat_encrypt_cnt = v64;
 	v64 = atomic64_read(&alg->encrypt_tlen);
 	raead.stat_encrypt_tlen = v64;
-	v32 = atomic_read(&alg->decrypt_cnt);
-	raead.stat_decrypt_cnt = v32;
+	v64 = atomic64_read(&alg->decrypt_cnt);
+	raead.stat_decrypt_cnt = v64;
 	v64 = atomic64_read(&alg->decrypt_tlen);
 	raead.stat_decrypt_tlen = v64;
-	v32 = atomic_read(&alg->aead_err_cnt);
-	raead.stat_aead_err_cnt = v32;
+	v64 = atomic64_read(&alg->aead_err_cnt);
+	raead.stat_aead_err_cnt = v64;
 
 	return nla_put(skb, CRYPTOCFGA_STAT_AEAD, sizeof(raead), &raead);
 }
@@ -59,22 +58,21 @@ static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_stat rcipher;
 	u64 v64;
-	u32 v32;
 
 	memset(&rcipher, 0, sizeof(rcipher));
 
 	strscpy(rcipher.type, "cipher", sizeof(rcipher.type));
 
-	v32 = atomic_read(&alg->encrypt_cnt);
-	rcipher.stat_encrypt_cnt = v32;
+	v64 = atomic64_read(&alg->encrypt_cnt);
+	rcipher.stat_encrypt_cnt = v64;
 	v64 = atomic64_read(&alg->encrypt_tlen);
 	rcipher.stat_encrypt_tlen = v64;
-	v32 = atomic_read(&alg->decrypt_cnt);
-	rcipher.stat_decrypt_cnt = v32;
+	v64 = atomic64_read(&alg->decrypt_cnt);
+	rcipher.stat_decrypt_cnt = v64;
 	v64 = atomic64_read(&alg->decrypt_tlen);
 	rcipher.stat_decrypt_tlen = v64;
-	v32 = atomic_read(&alg->cipher_err_cnt);
-	rcipher.stat_cipher_err_cnt = v32;
+	v64 = atomic64_read(&alg->cipher_err_cnt);
+	rcipher.stat_cipher_err_cnt = v64;
 
 	return nla_put(skb, CRYPTOCFGA_STAT_CIPHER, sizeof(rcipher), &rcipher);
 }
@@ -83,21 +81,20 @@ static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_stat rcomp;
 	u64 v64;
-	u32 v32;
 
 	memset(&rcomp, 0, sizeof(rcomp));
 
 	strscpy(rcomp.type, "compression", sizeof(rcomp.type));
-	v32 = atomic_read(&alg->compress_cnt);
-	rcomp.stat_compress_cnt = v32;
+	v64 = atomic64_read(&alg->compress_cnt);
+	rcomp.stat_compress_cnt = v64;
 	v64 = atomic64_read(&alg->compress_tlen);
 	rcomp.stat_compress_tlen = v64;
-	v32 = atomic_read(&alg->decompress_cnt);
-	rcomp.stat_decompress_cnt = v32;
+	v64 = atomic64_read(&alg->decompress_cnt);
+	rcomp.stat_decompress_cnt = v64;
 	v64 = atomic64_read(&alg->decompress_tlen);
 	rcomp.stat_decompress_tlen = v64;
-	v32 = atomic_read(&alg->cipher_err_cnt);
-	rcomp.stat_compress_err_cnt = v32;
+	v64 = atomic64_read(&alg->cipher_err_cnt);
+	rcomp.stat_compress_err_cnt = v64;
 
 	return nla_put(skb, CRYPTOCFGA_STAT_COMPRESS, sizeof(rcomp), &rcomp);
 }
@@ -106,21 +103,20 @@ static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_stat racomp;
 	u64 v64;
-	u32 v32;
 
 	memset(&racomp, 0, sizeof(racomp));
 
 	strscpy(racomp.type, "acomp", sizeof(racomp.type));
-	v32 = atomic_read(&alg->compress_cnt);
-	racomp.stat_compress_cnt = v32;
+	v64 = atomic64_read(&alg->compress_cnt);
+	racomp.stat_compress_cnt = v64;
 	v64 = atomic64_read(&alg->compress_tlen);
 	racomp.stat_compress_tlen = v64;
-	v32 = atomic_read(&alg->decompress_cnt);
-	racomp.stat_decompress_cnt = v32;
+	v64 = atomic64_read(&alg->decompress_cnt);
+	racomp.stat_decompress_cnt = v64;
 	v64 = atomic64_read(&alg->decompress_tlen);
 	racomp.stat_decompress_tlen = v64;
-	v32 = atomic_read(&alg->cipher_err_cnt);
-	racomp.stat_compress_err_cnt = v32;
+	v64 = atomic64_read(&alg->cipher_err_cnt);
+	racomp.stat_compress_err_cnt = v64;
 
 	return nla_put(skb, CRYPTOCFGA_STAT_ACOMP, sizeof(racomp), &racomp);
 }
@@ -129,25 +125,24 @@ static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_stat rakcipher;
 	u64 v64;
-	u32 v32;
 
 	memset(&rakcipher, 0, sizeof(rakcipher));
 
 	strscpy(rakcipher.type, "akcipher", sizeof(rakcipher.type));
-	v32 = atomic_read(&alg->encrypt_cnt);
-	rakcipher.stat_encrypt_cnt = v32;
+	v64 = atomic64_read(&alg->encrypt_cnt);
+	rakcipher.stat_encrypt_cnt = v64;
 	v64 = atomic64_read(&alg->encrypt_tlen);
 	rakcipher.stat_encrypt_tlen = v64;
-	v32 = atomic_read(&alg->decrypt_cnt);
-	rakcipher.stat_decrypt_cnt = v32;
+	v64 = atomic64_read(&alg->decrypt_cnt);
+	rakcipher.stat_decrypt_cnt = v64;
 	v64 = atomic64_read(&alg->decrypt_tlen);
 	rakcipher.stat_decrypt_tlen = v64;
-	v32 = atomic_read(&alg->sign_cnt);
-	rakcipher.stat_sign_cnt = v32;
-	v32 = atomic_read(&alg->verify_cnt);
-	rakcipher.stat_verify_cnt = v32;
-	v32 = atomic_read(&alg->akcipher_err_cnt);
-	rakcipher.stat_akcipher_err_cnt = v32;
+	v64 = atomic64_read(&alg->sign_cnt);
+	rakcipher.stat_sign_cnt = v64;
+	v64 = atomic64_read(&alg->verify_cnt);
+	rakcipher.stat_verify_cnt = v64;
+	v64 = atomic64_read(&alg->akcipher_err_cnt);
+	rakcipher.stat_akcipher_err_cnt = v64;
 
 	return nla_put(skb, CRYPTOCFGA_STAT_AKCIPHER,
 		       sizeof(rakcipher), &rakcipher);
@@ -156,19 +151,19 @@ static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg)
 static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_stat rkpp;
-	u32 v;
+	u64 v;
 
 	memset(&rkpp, 0, sizeof(rkpp));
 
 	strscpy(rkpp.type, "kpp", sizeof(rkpp.type));
 
-	v = atomic_read(&alg->setsecret_cnt);
+	v = atomic64_read(&alg->setsecret_cnt);
 	rkpp.stat_setsecret_cnt = v;
-	v = atomic_read(&alg->generate_public_key_cnt);
+	v = atomic64_read(&alg->generate_public_key_cnt);
 	rkpp.stat_generate_public_key_cnt = v;
-	v = atomic_read(&alg->compute_shared_secret_cnt);
+	v = atomic64_read(&alg->compute_shared_secret_cnt);
 	rkpp.stat_compute_shared_secret_cnt = v;
-	v = atomic_read(&alg->kpp_err_cnt);
+	v = atomic64_read(&alg->kpp_err_cnt);
 	rkpp.stat_kpp_err_cnt = v;
 
 	return nla_put(skb, CRYPTOCFGA_STAT_KPP, sizeof(rkpp), &rkpp);
@@ -178,18 +173,17 @@ static int crypto_report_ahash(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_stat rhash;
 	u64 v64;
-	u32 v32;
 
 	memset(&rhash, 0, sizeof(rhash));
 
 	strscpy(rhash.type, "ahash", sizeof(rhash.type));
 
-	v32 = atomic_read(&alg->hash_cnt);
-	rhash.stat_hash_cnt = v32;
+	v64 = atomic64_read(&alg->hash_cnt);
+	rhash.stat_hash_cnt = v64;
 	v64 = atomic64_read(&alg->hash_tlen);
 	rhash.stat_hash_tlen = v64;
-	v32 = atomic_read(&alg->hash_err_cnt);
-	rhash.stat_hash_err_cnt = v32;
+	v64 = atomic64_read(&alg->hash_err_cnt);
+	rhash.stat_hash_err_cnt = v64;
 
 	return nla_put(skb, CRYPTOCFGA_STAT_HASH, sizeof(rhash), &rhash);
 }
@@ -198,18 +192,17 @@ static int crypto_report_shash(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_stat rhash;
 	u64 v64;
-	u32 v32;
 
 	memset(&rhash, 0, sizeof(rhash));
 
 	strscpy(rhash.type, "shash", sizeof(rhash.type));
 
-	v32 = atomic_read(&alg->hash_cnt);
-	rhash.stat_hash_cnt = v32;
+	v64 = atomic64_read(&alg->hash_cnt);
+	rhash.stat_hash_cnt = v64;
 	v64 = atomic64_read(&alg->hash_tlen);
 	rhash.stat_hash_tlen = v64;
-	v32 = atomic_read(&alg->hash_err_cnt);
-	rhash.stat_hash_err_cnt = v32;
+	v64 = atomic64_read(&alg->hash_err_cnt);
+	rhash.stat_hash_err_cnt = v64;
 
 	return nla_put(skb, CRYPTOCFGA_STAT_HASH, sizeof(rhash), &rhash);
 }
@@ -218,20 +211,19 @@ static int crypto_report_rng(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_stat rrng;
 	u64 v64;
-	u32 v32;
 
 	memset(&rrng, 0, sizeof(rrng));
 
 	strscpy(rrng.type, "rng", sizeof(rrng.type));
 
-	v32 = atomic_read(&alg->generate_cnt);
-	rrng.stat_generate_cnt = v32;
+	v64 = atomic64_read(&alg->generate_cnt);
+	rrng.stat_generate_cnt = v64;
 	v64 = atomic64_read(&alg->generate_tlen);
 	rrng.stat_generate_tlen = v64;
-	v32 = atomic_read(&alg->seed_cnt);
-	rrng.stat_seed_cnt = v32;
-	v32 = atomic_read(&alg->hash_err_cnt);
-	rrng.stat_rng_err_cnt = v32;
+	v64 = atomic64_read(&alg->seed_cnt);
+	rrng.stat_seed_cnt = v64;
+	v64 = atomic64_read(&alg->hash_err_cnt);
+	rrng.stat_rng_err_cnt = v64;
 
 	return nla_put(skb, CRYPTOCFGA_STAT_RNG, sizeof(rrng), &rrng);
 }
diff --git a/include/crypto/acompress.h b/include/crypto/acompress.h
index 22e6f412c595..f79918196811 100644
--- a/include/crypto/acompress.h
+++ b/include/crypto/acompress.h
@@ -240,9 +240,9 @@ static inline void crypto_stat_compress(struct acomp_req *req, int ret)
 	struct crypto_acomp *tfm = crypto_acomp_reqtfm(req);
 
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&tfm->base.__crt_alg->compress_err_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->compress_err_cnt);
 	} else {
-		atomic_inc(&tfm->base.__crt_alg->compress_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->compress_cnt);
 		atomic64_add(req->slen, &tfm->base.__crt_alg->compress_tlen);
 	}
 #endif
@@ -254,9 +254,9 @@ static inline void crypto_stat_decompress(struct acomp_req *req, int ret)
 	struct crypto_acomp *tfm = crypto_acomp_reqtfm(req);
 
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&tfm->base.__crt_alg->compress_err_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->compress_err_cnt);
 	} else {
-		atomic_inc(&tfm->base.__crt_alg->decompress_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->decompress_cnt);
 		atomic64_add(req->slen, &tfm->base.__crt_alg->decompress_tlen);
 	}
 #endif
diff --git a/include/crypto/aead.h b/include/crypto/aead.h
index 0d765d7bfb82..99afd78c665d 100644
--- a/include/crypto/aead.h
+++ b/include/crypto/aead.h
@@ -312,9 +312,9 @@ static inline void crypto_stat_aead_encrypt(struct aead_request *req, int ret)
 	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
 
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&tfm->base.__crt_alg->aead_err_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->aead_err_cnt);
 	} else {
-		atomic_inc(&tfm->base.__crt_alg->encrypt_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->encrypt_cnt);
 		atomic64_add(req->cryptlen, &tfm->base.__crt_alg->encrypt_tlen);
 	}
 #endif
@@ -326,9 +326,9 @@ static inline void crypto_stat_aead_decrypt(struct aead_request *req, int ret)
 	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
 
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&tfm->base.__crt_alg->aead_err_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->aead_err_cnt);
 	} else {
-		atomic_inc(&tfm->base.__crt_alg->decrypt_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->decrypt_cnt);
 		atomic64_add(req->cryptlen, &tfm->base.__crt_alg->decrypt_tlen);
 	}
 #endif
diff --git a/include/crypto/akcipher.h b/include/crypto/akcipher.h
index afac71119396..3dc05cf7e0a9 100644
--- a/include/crypto/akcipher.h
+++ b/include/crypto/akcipher.h
@@ -278,9 +278,9 @@ static inline void crypto_stat_akcipher_encrypt(struct akcipher_request *req,
 	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
 
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&tfm->base.__crt_alg->akcipher_err_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->akcipher_err_cnt);
 	} else {
-		atomic_inc(&tfm->base.__crt_alg->encrypt_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->encrypt_cnt);
 		atomic64_add(req->src_len, &tfm->base.__crt_alg->encrypt_tlen);
 	}
 #endif
@@ -293,9 +293,9 @@ static inline void crypto_stat_akcipher_decrypt(struct akcipher_request *req,
 	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
 
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&tfm->base.__crt_alg->akcipher_err_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->akcipher_err_cnt);
 	} else {
-		atomic_inc(&tfm->base.__crt_alg->decrypt_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->decrypt_cnt);
 		atomic64_add(req->src_len, &tfm->base.__crt_alg->decrypt_tlen);
 	}
 #endif
@@ -308,9 +308,9 @@ static inline void crypto_stat_akcipher_sign(struct akcipher_request *req,
 	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
 
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
-		atomic_inc(&tfm->base.__crt_alg->akcipher_err_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->akcipher_err_cnt);
 	else
-		atomic_inc(&tfm->base.__crt_alg->sign_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->sign_cnt);
 #endif
 }
 
@@ -321,9 +321,9 @@ static inline void crypto_stat_akcipher_verify(struct akcipher_request *req,
 	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
 
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
-		atomic_inc(&tfm->base.__crt_alg->akcipher_err_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->akcipher_err_cnt);
 	else
-		atomic_inc(&tfm->base.__crt_alg->verify_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->verify_cnt);
 #endif
 }
 
diff --git a/include/crypto/hash.h b/include/crypto/hash.h
index bc7796600338..52920bed05ba 100644
--- a/include/crypto/hash.h
+++ b/include/crypto/hash.h
@@ -418,7 +418,7 @@ static inline void crypto_stat_ahash_update(struct ahash_request *req, int ret)
 	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
 
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
-		atomic_inc(&tfm->base.__crt_alg->hash_err_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->hash_err_cnt);
 	else
 		atomic64_add(req->nbytes, &tfm->base.__crt_alg->hash_tlen);
 #endif
@@ -430,9 +430,9 @@ static inline void crypto_stat_ahash_final(struct ahash_request *req, int ret)
 	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
 
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&tfm->base.__crt_alg->hash_err_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->hash_err_cnt);
 	} else {
-		atomic_inc(&tfm->base.__crt_alg->hash_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->hash_cnt);
 		atomic64_add(req->nbytes, &tfm->base.__crt_alg->hash_tlen);
 	}
 #endif
diff --git a/include/crypto/kpp.h b/include/crypto/kpp.h
index f517ba6d3a27..bd5103a80919 100644
--- a/include/crypto/kpp.h
+++ b/include/crypto/kpp.h
@@ -272,9 +272,9 @@ static inline void crypto_stat_kpp_set_secret(struct crypto_kpp *tfm, int ret)
 {
 #ifdef CONFIG_CRYPTO_STATS
 	if (ret)
-		atomic_inc(&tfm->base.__crt_alg->kpp_err_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->kpp_err_cnt);
 	else
-		atomic_inc(&tfm->base.__crt_alg->setsecret_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->setsecret_cnt);
 #endif
 }
 
@@ -285,9 +285,9 @@ static inline void crypto_stat_kpp_generate_public_key(struct kpp_request *req,
 	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
 
 	if (ret)
-		atomic_inc(&tfm->base.__crt_alg->kpp_err_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->kpp_err_cnt);
 	else
-		atomic_inc(&tfm->base.__crt_alg->generate_public_key_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->generate_public_key_cnt);
 #endif
 }
 
@@ -298,9 +298,9 @@ static inline void crypto_stat_kpp_compute_shared_secret(struct kpp_request *req
 	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
 
 	if (ret)
-		atomic_inc(&tfm->base.__crt_alg->kpp_err_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->kpp_err_cnt);
 	else
-		atomic_inc(&tfm->base.__crt_alg->compute_shared_secret_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->compute_shared_secret_cnt);
 #endif
 }
 
diff --git a/include/crypto/rng.h b/include/crypto/rng.h
index 6d258f5b68f1..966615bba45e 100644
--- a/include/crypto/rng.h
+++ b/include/crypto/rng.h
@@ -126,9 +126,9 @@ static inline void crypto_stat_rng_seed(struct crypto_rng *tfm, int ret)
 {
 #ifdef CONFIG_CRYPTO_STATS
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
-		atomic_inc(&tfm->base.__crt_alg->rng_err_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->rng_err_cnt);
 	else
-		atomic_inc(&tfm->base.__crt_alg->seed_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->seed_cnt);
 #endif
 }
 
@@ -137,9 +137,9 @@ static inline void crypto_stat_rng_generate(struct crypto_rng *tfm,
 {
 #ifdef CONFIG_CRYPTO_STATS
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&tfm->base.__crt_alg->rng_err_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->rng_err_cnt);
 	} else {
-		atomic_inc(&tfm->base.__crt_alg->generate_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->generate_cnt);
 		atomic64_add(dlen, &tfm->base.__crt_alg->generate_tlen);
 	}
 #endif
diff --git a/include/crypto/skcipher.h b/include/crypto/skcipher.h
index 925f547cdcfa..dff54731ddf4 100644
--- a/include/crypto/skcipher.h
+++ b/include/crypto/skcipher.h
@@ -491,9 +491,9 @@ static inline void crypto_stat_skcipher_encrypt(struct skcipher_request *req,
 {
 #ifdef CONFIG_CRYPTO_STATS
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&alg->cipher_err_cnt);
+		atomic64_inc(&alg->cipher_err_cnt);
 	} else {
-		atomic_inc(&alg->encrypt_cnt);
+		atomic64_inc(&alg->encrypt_cnt);
 		atomic64_add(req->cryptlen, &alg->encrypt_tlen);
 	}
 #endif
@@ -504,9 +504,9 @@ static inline void crypto_stat_skcipher_decrypt(struct skcipher_request *req,
 {
 #ifdef CONFIG_CRYPTO_STATS
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&alg->cipher_err_cnt);
+		atomic64_inc(&alg->cipher_err_cnt);
 	} else {
-		atomic_inc(&alg->decrypt_cnt);
+		atomic64_inc(&alg->decrypt_cnt);
 		atomic64_add(req->cryptlen, &alg->decrypt_tlen);
 	}
 #endif
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 3e05053b8d57..b109b50906e7 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -517,11 +517,11 @@ struct crypto_alg {
 
 #ifdef CONFIG_CRYPTO_STATS
 	union {
-		atomic_t encrypt_cnt;
-		atomic_t compress_cnt;
-		atomic_t generate_cnt;
-		atomic_t hash_cnt;
-		atomic_t setsecret_cnt;
+		atomic64_t encrypt_cnt;
+		atomic64_t compress_cnt;
+		atomic64_t generate_cnt;
+		atomic64_t hash_cnt;
+		atomic64_t setsecret_cnt;
 	};
 	union {
 		atomic64_t encrypt_tlen;
@@ -530,29 +530,29 @@ struct crypto_alg {
 		atomic64_t hash_tlen;
 	};
 	union {
-		atomic_t akcipher_err_cnt;
-		atomic_t cipher_err_cnt;
-		atomic_t compress_err_cnt;
-		atomic_t aead_err_cnt;
-		atomic_t hash_err_cnt;
-		atomic_t rng_err_cnt;
-		atomic_t kpp_err_cnt;
+		atomic64_t akcipher_err_cnt;
+		atomic64_t cipher_err_cnt;
+		atomic64_t compress_err_cnt;
+		atomic64_t aead_err_cnt;
+		atomic64_t hash_err_cnt;
+		atomic64_t rng_err_cnt;
+		atomic64_t kpp_err_cnt;
 	};
 	union {
-		atomic_t decrypt_cnt;
-		atomic_t decompress_cnt;
-		atomic_t seed_cnt;
-		atomic_t generate_public_key_cnt;
+		atomic64_t decrypt_cnt;
+		atomic64_t decompress_cnt;
+		atomic64_t seed_cnt;
+		atomic64_t generate_public_key_cnt;
 	};
 	union {
 		atomic64_t decrypt_tlen;
 		atomic64_t decompress_tlen;
 	};
 	union {
-		atomic_t verify_cnt;
-		atomic_t compute_shared_secret_cnt;
+		atomic64_t verify_cnt;
+		atomic64_t compute_shared_secret_cnt;
 	};
-	atomic_t sign_cnt;
+	atomic64_t sign_cnt;
 #endif /* CONFIG_CRYPTO_STATS */
 
 } CRYPTO_MINALIGN_ATTR;
@@ -983,9 +983,9 @@ static inline void crypto_stat_ablkcipher_encrypt(struct ablkcipher_request *req
 		crypto_ablkcipher_crt(crypto_ablkcipher_reqtfm(req));
 
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&crt->base->base.__crt_alg->cipher_err_cnt);
+		atomic64_inc(&crt->base->base.__crt_alg->cipher_err_cnt);
 	} else {
-		atomic_inc(&crt->base->base.__crt_alg->encrypt_cnt);
+		atomic64_inc(&crt->base->base.__crt_alg->encrypt_cnt);
 		atomic64_add(req->nbytes, &crt->base->base.__crt_alg->encrypt_tlen);
 	}
 #endif
@@ -999,9 +999,9 @@ static inline void crypto_stat_ablkcipher_decrypt(struct ablkcipher_request *req
 		crypto_ablkcipher_crt(crypto_ablkcipher_reqtfm(req));
 
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&crt->base->base.__crt_alg->cipher_err_cnt);
+		atomic64_inc(&crt->base->base.__crt_alg->cipher_err_cnt);
 	} else {
-		atomic_inc(&crt->base->base.__crt_alg->decrypt_cnt);
+		atomic64_inc(&crt->base->base.__crt_alg->decrypt_cnt);
 		atomic64_add(req->nbytes, &crt->base->base.__crt_alg->decrypt_tlen);
 	}
 #endif
diff --git a/include/uapi/linux/cryptouser.h b/include/uapi/linux/cryptouser.h
index 6dafbc3e4414..9f8187077ce4 100644
--- a/include/uapi/linux/cryptouser.h
+++ b/include/uapi/linux/cryptouser.h
@@ -79,11 +79,11 @@ struct crypto_user_alg {
 struct crypto_stat {
 	char type[CRYPTO_MAX_NAME];
 	union {
-		__u32 stat_encrypt_cnt;
-		__u32 stat_compress_cnt;
-		__u32 stat_generate_cnt;
-		__u32 stat_hash_cnt;
-		__u32 stat_setsecret_cnt;
+		__u64 stat_encrypt_cnt;
+		__u64 stat_compress_cnt;
+		__u64 stat_generate_cnt;
+		__u64 stat_hash_cnt;
+		__u64 stat_setsecret_cnt;
 	};
 	union {
 		__u64 stat_encrypt_tlen;
@@ -92,29 +92,29 @@ struct crypto_stat {
 		__u64 stat_hash_tlen;
 	};
 	union {
-		__u32 stat_akcipher_err_cnt;
-		__u32 stat_cipher_err_cnt;
-		__u32 stat_compress_err_cnt;
-		__u32 stat_aead_err_cnt;
-		__u32 stat_hash_err_cnt;
-		__u32 stat_rng_err_cnt;
-		__u32 stat_kpp_err_cnt;
+		__u64 stat_akcipher_err_cnt;
+		__u64 stat_cipher_err_cnt;
+		__u64 stat_compress_err_cnt;
+		__u64 stat_aead_err_cnt;
+		__u64 stat_hash_err_cnt;
+		__u64 stat_rng_err_cnt;
+		__u64 stat_kpp_err_cnt;
 	};
 	union {
-		__u32 stat_decrypt_cnt;
-		__u32 stat_decompress_cnt;
-		__u32 stat_seed_cnt;
-		__u32 stat_generate_public_key_cnt;
+		__u64 stat_decrypt_cnt;
+		__u64 stat_decompress_cnt;
+		__u64 stat_seed_cnt;
+		__u64 stat_generate_public_key_cnt;
 	};
 	union {
 		__u64 stat_decrypt_tlen;
 		__u64 stat_decompress_tlen;
 	};
 	union {
-		__u32 stat_verify_cnt;
-		__u32 stat_compute_shared_secret_cnt;
+		__u64 stat_verify_cnt;
+		__u64 stat_compute_shared_secret_cnt;
 	};
-	__u32 stat_sign_cnt;
+	__u64 stat_sign_cnt;
 };
 
 struct crypto_report_larval {
-- 
cgit v1.2.3-59-g8ed1b


From 7f0a9d5c9d1ba8ab3e5b144e52553744dc0d7471 Mon Sep 17 00:00:00 2001
From: Corentin Labbe <clabbe@baylibre.com>
Date: Thu, 29 Nov 2018 14:42:19 +0000
Subject: crypto: user - split user space crypto stat structures

It is cleaner to have each stat in their own structures.

Signed-off-by: Corentin Labbe <clabbe@baylibre.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/crypto_user_stat.c       |  20 ++++----
 include/uapi/linux/cryptouser.h | 100 +++++++++++++++++++++++++---------------
 2 files changed, 72 insertions(+), 48 deletions(-)

(limited to 'include/uapi')

diff --git a/crypto/crypto_user_stat.c b/crypto/crypto_user_stat.c
index 352569f378a0..3c14be2f7a1b 100644
--- a/crypto/crypto_user_stat.c
+++ b/crypto/crypto_user_stat.c
@@ -33,7 +33,7 @@ struct crypto_dump_info {
 
 static int crypto_report_aead(struct sk_buff *skb, struct crypto_alg *alg)
 {
-	struct crypto_stat raead;
+	struct crypto_stat_aead raead;
 	u64 v64;
 
 	memset(&raead, 0, sizeof(raead));
@@ -56,7 +56,7 @@ static int crypto_report_aead(struct sk_buff *skb, struct crypto_alg *alg)
 
 static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg)
 {
-	struct crypto_stat rcipher;
+	struct crypto_stat_cipher rcipher;
 	u64 v64;
 
 	memset(&rcipher, 0, sizeof(rcipher));
@@ -79,7 +79,7 @@ static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg)
 
 static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg)
 {
-	struct crypto_stat rcomp;
+	struct crypto_stat_compress rcomp;
 	u64 v64;
 
 	memset(&rcomp, 0, sizeof(rcomp));
@@ -101,7 +101,7 @@ static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg)
 
 static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg)
 {
-	struct crypto_stat racomp;
+	struct crypto_stat_compress racomp;
 	u64 v64;
 
 	memset(&racomp, 0, sizeof(racomp));
@@ -123,7 +123,7 @@ static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg)
 
 static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg)
 {
-	struct crypto_stat rakcipher;
+	struct crypto_stat_akcipher rakcipher;
 	u64 v64;
 
 	memset(&rakcipher, 0, sizeof(rakcipher));
@@ -150,7 +150,7 @@ static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg)
 
 static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg)
 {
-	struct crypto_stat rkpp;
+	struct crypto_stat_kpp rkpp;
 	u64 v;
 
 	memset(&rkpp, 0, sizeof(rkpp));
@@ -171,7 +171,7 @@ static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg)
 
 static int crypto_report_ahash(struct sk_buff *skb, struct crypto_alg *alg)
 {
-	struct crypto_stat rhash;
+	struct crypto_stat_hash rhash;
 	u64 v64;
 
 	memset(&rhash, 0, sizeof(rhash));
@@ -190,7 +190,7 @@ static int crypto_report_ahash(struct sk_buff *skb, struct crypto_alg *alg)
 
 static int crypto_report_shash(struct sk_buff *skb, struct crypto_alg *alg)
 {
-	struct crypto_stat rhash;
+	struct crypto_stat_hash rhash;
 	u64 v64;
 
 	memset(&rhash, 0, sizeof(rhash));
@@ -209,7 +209,7 @@ static int crypto_report_shash(struct sk_buff *skb, struct crypto_alg *alg)
 
 static int crypto_report_rng(struct sk_buff *skb, struct crypto_alg *alg)
 {
-	struct crypto_stat rrng;
+	struct crypto_stat_rng rrng;
 	u64 v64;
 
 	memset(&rrng, 0, sizeof(rrng));
@@ -248,7 +248,7 @@ static int crypto_reportstat_one(struct crypto_alg *alg,
 	if (nla_put_u32(skb, CRYPTOCFGA_PRIORITY_VAL, alg->cra_priority))
 		goto nla_put_failure;
 	if (alg->cra_flags & CRYPTO_ALG_LARVAL) {
-		struct crypto_stat rl;
+		struct crypto_stat_larval rl;
 
 		memset(&rl, 0, sizeof(rl));
 		strscpy(rl.type, "larval", sizeof(rl.type));
diff --git a/include/uapi/linux/cryptouser.h b/include/uapi/linux/cryptouser.h
index 9f8187077ce4..3a70f025e27d 100644
--- a/include/uapi/linux/cryptouser.h
+++ b/include/uapi/linux/cryptouser.h
@@ -76,45 +76,69 @@ struct crypto_user_alg {
 	__u32 cru_flags;
 };
 
-struct crypto_stat {
-	char type[CRYPTO_MAX_NAME];
-	union {
-		__u64 stat_encrypt_cnt;
-		__u64 stat_compress_cnt;
-		__u64 stat_generate_cnt;
-		__u64 stat_hash_cnt;
-		__u64 stat_setsecret_cnt;
-	};
-	union {
-		__u64 stat_encrypt_tlen;
-		__u64 stat_compress_tlen;
-		__u64 stat_generate_tlen;
-		__u64 stat_hash_tlen;
-	};
-	union {
-		__u64 stat_akcipher_err_cnt;
-		__u64 stat_cipher_err_cnt;
-		__u64 stat_compress_err_cnt;
-		__u64 stat_aead_err_cnt;
-		__u64 stat_hash_err_cnt;
-		__u64 stat_rng_err_cnt;
-		__u64 stat_kpp_err_cnt;
-	};
-	union {
-		__u64 stat_decrypt_cnt;
-		__u64 stat_decompress_cnt;
-		__u64 stat_seed_cnt;
-		__u64 stat_generate_public_key_cnt;
-	};
-	union {
-		__u64 stat_decrypt_tlen;
-		__u64 stat_decompress_tlen;
-	};
-	union {
-		__u64 stat_verify_cnt;
-		__u64 stat_compute_shared_secret_cnt;
-	};
+struct crypto_stat_aead {
+	char type[CRYPTO_MAX_NAME];
+	__u64 stat_encrypt_cnt;
+	__u64 stat_encrypt_tlen;
+	__u64 stat_decrypt_cnt;
+	__u64 stat_decrypt_tlen;
+	__u64 stat_aead_err_cnt;
+};
+
+struct crypto_stat_akcipher {
+	char type[CRYPTO_MAX_NAME];
+	__u64 stat_encrypt_cnt;
+	__u64 stat_encrypt_tlen;
+	__u64 stat_decrypt_cnt;
+	__u64 stat_decrypt_tlen;
+	__u64 stat_verify_cnt;
 	__u64 stat_sign_cnt;
+	__u64 stat_akcipher_err_cnt;
+};
+
+struct crypto_stat_cipher {
+	char type[CRYPTO_MAX_NAME];
+	__u64 stat_encrypt_cnt;
+	__u64 stat_encrypt_tlen;
+	__u64 stat_decrypt_cnt;
+	__u64 stat_decrypt_tlen;
+	__u64 stat_cipher_err_cnt;
+};
+
+struct crypto_stat_compress {
+	char type[CRYPTO_MAX_NAME];
+	__u64 stat_compress_cnt;
+	__u64 stat_compress_tlen;
+	__u64 stat_decompress_cnt;
+	__u64 stat_decompress_tlen;
+	__u64 stat_compress_err_cnt;
+};
+
+struct crypto_stat_hash {
+	char type[CRYPTO_MAX_NAME];
+	__u64 stat_hash_cnt;
+	__u64 stat_hash_tlen;
+	__u64 stat_hash_err_cnt;
+};
+
+struct crypto_stat_kpp {
+	char type[CRYPTO_MAX_NAME];
+	__u64 stat_setsecret_cnt;
+	__u64 stat_generate_public_key_cnt;
+	__u64 stat_compute_shared_secret_cnt;
+	__u64 stat_kpp_err_cnt;
+};
+
+struct crypto_stat_rng {
+	char type[CRYPTO_MAX_NAME];
+	__u64 stat_generate_cnt;
+	__u64 stat_generate_tlen;
+	__u64 stat_seed_cnt;
+	__u64 stat_rng_err_cnt;
+};
+
+struct crypto_stat_larval {
+	char type[CRYPTO_MAX_NAME];
 };
 
 struct crypto_report_larval {
-- 
cgit v1.2.3-59-g8ed1b


From 44f13133cb03ec32fc88a533673248ef5c0617e3 Mon Sep 17 00:00:00 2001
From: Corentin Labbe <clabbe@baylibre.com>
Date: Thu, 29 Nov 2018 14:42:25 +0000
Subject: crypto: user - rename err_cnt parameter

Since now all crypto stats are on their own structures, it is now
useless to have the algorithm name in the err_cnt member.

Signed-off-by: Corentin Labbe <clabbe@baylibre.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/algapi.c                 | 38 +++++++++++++++++++-------------------
 crypto/crypto_user_stat.c       | 18 +++++++++---------
 include/linux/crypto.h          | 28 ++++++++++++++--------------
 include/uapi/linux/cryptouser.h | 14 +++++++-------
 tools/crypto/getstat.c          | 18 +++++++++---------
 5 files changed, 58 insertions(+), 58 deletions(-)

(limited to 'include/uapi')

diff --git a/crypto/algapi.c b/crypto/algapi.c
index a8cb5aed0069..c0d4f9ef6b0f 100644
--- a/crypto/algapi.c
+++ b/crypto/algapi.c
@@ -1083,7 +1083,7 @@ void crypto_stats_ablkcipher_encrypt(unsigned int nbytes, int ret,
 				     struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->stats.cipher.cipher_err_cnt);
+		atomic64_inc(&alg->stats.cipher.err_cnt);
 	} else {
 		atomic64_inc(&alg->stats.cipher.encrypt_cnt);
 		atomic64_add(nbytes, &alg->stats.cipher.encrypt_tlen);
@@ -1096,7 +1096,7 @@ void crypto_stats_ablkcipher_decrypt(unsigned int nbytes, int ret,
 				     struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->stats.cipher.cipher_err_cnt);
+		atomic64_inc(&alg->stats.cipher.err_cnt);
 	} else {
 		atomic64_inc(&alg->stats.cipher.decrypt_cnt);
 		atomic64_add(nbytes, &alg->stats.cipher.decrypt_tlen);
@@ -1109,7 +1109,7 @@ void crypto_stats_aead_encrypt(unsigned int cryptlen, struct crypto_alg *alg,
 			       int ret)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->stats.aead.aead_err_cnt);
+		atomic64_inc(&alg->stats.aead.err_cnt);
 	} else {
 		atomic64_inc(&alg->stats.aead.encrypt_cnt);
 		atomic64_add(cryptlen, &alg->stats.aead.encrypt_tlen);
@@ -1122,7 +1122,7 @@ void crypto_stats_aead_decrypt(unsigned int cryptlen, struct crypto_alg *alg,
 			       int ret)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->stats.aead.aead_err_cnt);
+		atomic64_inc(&alg->stats.aead.err_cnt);
 	} else {
 		atomic64_inc(&alg->stats.aead.decrypt_cnt);
 		atomic64_add(cryptlen, &alg->stats.aead.decrypt_tlen);
@@ -1135,7 +1135,7 @@ void crypto_stats_akcipher_encrypt(unsigned int src_len, int ret,
 				   struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->stats.akcipher.akcipher_err_cnt);
+		atomic64_inc(&alg->stats.akcipher.err_cnt);
 	} else {
 		atomic64_inc(&alg->stats.akcipher.encrypt_cnt);
 		atomic64_add(src_len, &alg->stats.akcipher.encrypt_tlen);
@@ -1148,7 +1148,7 @@ void crypto_stats_akcipher_decrypt(unsigned int src_len, int ret,
 				   struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->stats.akcipher.akcipher_err_cnt);
+		atomic64_inc(&alg->stats.akcipher.err_cnt);
 	} else {
 		atomic64_inc(&alg->stats.akcipher.decrypt_cnt);
 		atomic64_add(src_len, &alg->stats.akcipher.decrypt_tlen);
@@ -1160,7 +1160,7 @@ EXPORT_SYMBOL_GPL(crypto_stats_akcipher_decrypt);
 void crypto_stats_akcipher_sign(int ret, struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
-		atomic64_inc(&alg->stats.akcipher.akcipher_err_cnt);
+		atomic64_inc(&alg->stats.akcipher.err_cnt);
 	else
 		atomic64_inc(&alg->stats.akcipher.sign_cnt);
 	crypto_alg_put(alg);
@@ -1170,7 +1170,7 @@ EXPORT_SYMBOL_GPL(crypto_stats_akcipher_sign);
 void crypto_stats_akcipher_verify(int ret, struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
-		atomic64_inc(&alg->stats.akcipher.akcipher_err_cnt);
+		atomic64_inc(&alg->stats.akcipher.err_cnt);
 	else
 		atomic64_inc(&alg->stats.akcipher.verify_cnt);
 	crypto_alg_put(alg);
@@ -1180,7 +1180,7 @@ EXPORT_SYMBOL_GPL(crypto_stats_akcipher_verify);
 void crypto_stats_compress(unsigned int slen, int ret, struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->stats.compress.compress_err_cnt);
+		atomic64_inc(&alg->stats.compress.err_cnt);
 	} else {
 		atomic64_inc(&alg->stats.compress.compress_cnt);
 		atomic64_add(slen, &alg->stats.compress.compress_tlen);
@@ -1192,7 +1192,7 @@ EXPORT_SYMBOL_GPL(crypto_stats_compress);
 void crypto_stats_decompress(unsigned int slen, int ret, struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->stats.compress.compress_err_cnt);
+		atomic64_inc(&alg->stats.compress.err_cnt);
 	} else {
 		atomic64_inc(&alg->stats.compress.decompress_cnt);
 		atomic64_add(slen, &alg->stats.compress.decompress_tlen);
@@ -1205,7 +1205,7 @@ void crypto_stats_ahash_update(unsigned int nbytes, int ret,
 			       struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
-		atomic64_inc(&alg->stats.hash.hash_err_cnt);
+		atomic64_inc(&alg->stats.hash.err_cnt);
 	else
 		atomic64_add(nbytes, &alg->stats.hash.hash_tlen);
 	crypto_alg_put(alg);
@@ -1216,7 +1216,7 @@ void crypto_stats_ahash_final(unsigned int nbytes, int ret,
 			      struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->stats.hash.hash_err_cnt);
+		atomic64_inc(&alg->stats.hash.err_cnt);
 	} else {
 		atomic64_inc(&alg->stats.hash.hash_cnt);
 		atomic64_add(nbytes, &alg->stats.hash.hash_tlen);
@@ -1228,7 +1228,7 @@ EXPORT_SYMBOL_GPL(crypto_stats_ahash_final);
 void crypto_stats_kpp_set_secret(struct crypto_alg *alg, int ret)
 {
 	if (ret)
-		atomic64_inc(&alg->stats.kpp.kpp_err_cnt);
+		atomic64_inc(&alg->stats.kpp.err_cnt);
 	else
 		atomic64_inc(&alg->stats.kpp.setsecret_cnt);
 	crypto_alg_put(alg);
@@ -1238,7 +1238,7 @@ EXPORT_SYMBOL_GPL(crypto_stats_kpp_set_secret);
 void crypto_stats_kpp_generate_public_key(struct crypto_alg *alg, int ret)
 {
 	if (ret)
-		atomic64_inc(&alg->stats.kpp.kpp_err_cnt);
+		atomic64_inc(&alg->stats.kpp.err_cnt);
 	else
 		atomic64_inc(&alg->stats.kpp.generate_public_key_cnt);
 	crypto_alg_put(alg);
@@ -1248,7 +1248,7 @@ EXPORT_SYMBOL_GPL(crypto_stats_kpp_generate_public_key);
 void crypto_stats_kpp_compute_shared_secret(struct crypto_alg *alg, int ret)
 {
 	if (ret)
-		atomic64_inc(&alg->stats.kpp.kpp_err_cnt);
+		atomic64_inc(&alg->stats.kpp.err_cnt);
 	else
 		atomic64_inc(&alg->stats.kpp.compute_shared_secret_cnt);
 	crypto_alg_put(alg);
@@ -1258,7 +1258,7 @@ EXPORT_SYMBOL_GPL(crypto_stats_kpp_compute_shared_secret);
 void crypto_stats_rng_seed(struct crypto_alg *alg, int ret)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
-		atomic64_inc(&alg->stats.rng.rng_err_cnt);
+		atomic64_inc(&alg->stats.rng.err_cnt);
 	else
 		atomic64_inc(&alg->stats.rng.seed_cnt);
 	crypto_alg_put(alg);
@@ -1269,7 +1269,7 @@ void crypto_stats_rng_generate(struct crypto_alg *alg, unsigned int dlen,
 			       int ret)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->stats.rng.rng_err_cnt);
+		atomic64_inc(&alg->stats.rng.err_cnt);
 	} else {
 		atomic64_inc(&alg->stats.rng.generate_cnt);
 		atomic64_add(dlen, &alg->stats.rng.generate_tlen);
@@ -1282,7 +1282,7 @@ void crypto_stats_skcipher_encrypt(unsigned int cryptlen, int ret,
 				   struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->stats.cipher.cipher_err_cnt);
+		atomic64_inc(&alg->stats.cipher.err_cnt);
 	} else {
 		atomic64_inc(&alg->stats.cipher.encrypt_cnt);
 		atomic64_add(cryptlen, &alg->stats.cipher.encrypt_tlen);
@@ -1295,7 +1295,7 @@ void crypto_stats_skcipher_decrypt(unsigned int cryptlen, int ret,
 				   struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->stats.cipher.cipher_err_cnt);
+		atomic64_inc(&alg->stats.cipher.err_cnt);
 	} else {
 		atomic64_inc(&alg->stats.cipher.decrypt_cnt);
 		atomic64_add(cryptlen, &alg->stats.cipher.decrypt_tlen);
diff --git a/crypto/crypto_user_stat.c b/crypto/crypto_user_stat.c
index 113bf1691560..0ba00aaeb810 100644
--- a/crypto/crypto_user_stat.c
+++ b/crypto/crypto_user_stat.c
@@ -43,7 +43,7 @@ static int crypto_report_aead(struct sk_buff *skb, struct crypto_alg *alg)
 	raead.stat_encrypt_tlen = atomic64_read(&alg->stats.aead.encrypt_tlen);
 	raead.stat_decrypt_cnt = atomic64_read(&alg->stats.aead.decrypt_cnt);
 	raead.stat_decrypt_tlen = atomic64_read(&alg->stats.aead.decrypt_tlen);
-	raead.stat_aead_err_cnt = atomic64_read(&alg->stats.aead.aead_err_cnt);
+	raead.stat_err_cnt = atomic64_read(&alg->stats.aead.err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_AEAD, sizeof(raead), &raead);
 }
@@ -60,7 +60,7 @@ static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg)
 	rcipher.stat_encrypt_tlen = atomic64_read(&alg->stats.cipher.encrypt_tlen);
 	rcipher.stat_decrypt_cnt =  atomic64_read(&alg->stats.cipher.decrypt_cnt);
 	rcipher.stat_decrypt_tlen = atomic64_read(&alg->stats.cipher.decrypt_tlen);
-	rcipher.stat_cipher_err_cnt =  atomic64_read(&alg->stats.cipher.cipher_err_cnt);
+	rcipher.stat_err_cnt =  atomic64_read(&alg->stats.cipher.err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_CIPHER, sizeof(rcipher), &rcipher);
 }
@@ -76,7 +76,7 @@ static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg)
 	rcomp.stat_compress_tlen = atomic64_read(&alg->stats.compress.compress_tlen);
 	rcomp.stat_decompress_cnt = atomic64_read(&alg->stats.compress.decompress_cnt);
 	rcomp.stat_decompress_tlen = atomic64_read(&alg->stats.compress.decompress_tlen);
-	rcomp.stat_compress_err_cnt = atomic64_read(&alg->stats.compress.compress_err_cnt);
+	rcomp.stat_err_cnt = atomic64_read(&alg->stats.compress.err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_COMPRESS, sizeof(rcomp), &rcomp);
 }
@@ -92,7 +92,7 @@ static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg)
 	racomp.stat_compress_tlen = atomic64_read(&alg->stats.compress.compress_tlen);
 	racomp.stat_decompress_cnt =  atomic64_read(&alg->stats.compress.decompress_cnt);
 	racomp.stat_decompress_tlen = atomic64_read(&alg->stats.compress.decompress_tlen);
-	racomp.stat_compress_err_cnt = atomic64_read(&alg->stats.compress.compress_err_cnt);
+	racomp.stat_err_cnt = atomic64_read(&alg->stats.compress.err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_ACOMP, sizeof(racomp), &racomp);
 }
@@ -110,7 +110,7 @@ static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg)
 	rakcipher.stat_decrypt_tlen = atomic64_read(&alg->stats.akcipher.decrypt_tlen);
 	rakcipher.stat_sign_cnt = atomic64_read(&alg->stats.akcipher.sign_cnt);
 	rakcipher.stat_verify_cnt = atomic64_read(&alg->stats.akcipher.verify_cnt);
-	rakcipher.stat_akcipher_err_cnt = atomic64_read(&alg->stats.akcipher.akcipher_err_cnt);
+	rakcipher.stat_err_cnt = atomic64_read(&alg->stats.akcipher.err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_AKCIPHER,
 		       sizeof(rakcipher), &rakcipher);
@@ -127,7 +127,7 @@ static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg)
 	rkpp.stat_setsecret_cnt = atomic64_read(&alg->stats.kpp.setsecret_cnt);
 	rkpp.stat_generate_public_key_cnt = atomic64_read(&alg->stats.kpp.generate_public_key_cnt);
 	rkpp.stat_compute_shared_secret_cnt = atomic64_read(&alg->stats.kpp.compute_shared_secret_cnt);
-	rkpp.stat_kpp_err_cnt = atomic64_read(&alg->stats.kpp.kpp_err_cnt);
+	rkpp.stat_err_cnt = atomic64_read(&alg->stats.kpp.err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_KPP, sizeof(rkpp), &rkpp);
 }
@@ -142,7 +142,7 @@ static int crypto_report_ahash(struct sk_buff *skb, struct crypto_alg *alg)
 
 	rhash.stat_hash_cnt = atomic64_read(&alg->stats.hash.hash_cnt);
 	rhash.stat_hash_tlen = atomic64_read(&alg->stats.hash.hash_tlen);
-	rhash.stat_hash_err_cnt = atomic64_read(&alg->stats.hash.hash_err_cnt);
+	rhash.stat_err_cnt = atomic64_read(&alg->stats.hash.err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_HASH, sizeof(rhash), &rhash);
 }
@@ -157,7 +157,7 @@ static int crypto_report_shash(struct sk_buff *skb, struct crypto_alg *alg)
 
 	rhash.stat_hash_cnt =  atomic64_read(&alg->stats.hash.hash_cnt);
 	rhash.stat_hash_tlen = atomic64_read(&alg->stats.hash.hash_tlen);
-	rhash.stat_hash_err_cnt = atomic64_read(&alg->stats.hash.hash_err_cnt);
+	rhash.stat_err_cnt = atomic64_read(&alg->stats.hash.err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_HASH, sizeof(rhash), &rhash);
 }
@@ -173,7 +173,7 @@ static int crypto_report_rng(struct sk_buff *skb, struct crypto_alg *alg)
 	rrng.stat_generate_cnt = atomic64_read(&alg->stats.rng.generate_cnt);
 	rrng.stat_generate_tlen = atomic64_read(&alg->stats.rng.generate_tlen);
 	rrng.stat_seed_cnt = atomic64_read(&alg->stats.rng.seed_cnt);
-	rrng.stat_rng_err_cnt = atomic64_read(&alg->stats.rng.rng_err_cnt);
+	rrng.stat_err_cnt = atomic64_read(&alg->stats.rng.err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_RNG, sizeof(rrng), &rrng);
 }
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 8a46ab35479e..a2967c1a08b1 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -376,14 +376,14 @@ struct compress_alg {
  * @encrypt_tlen:	total data size handled by encrypt requests
  * @decrypt_cnt:	number of decrypt requests
  * @decrypt_tlen:	total data size handled by decrypt requests
- * @aead_err_cnt:	number of error for AEAD requests
+ * @err_cnt:		number of error for AEAD requests
  */
 struct crypto_istat_aead {
 	atomic64_t encrypt_cnt;
 	atomic64_t encrypt_tlen;
 	atomic64_t decrypt_cnt;
 	atomic64_t decrypt_tlen;
-	atomic64_t aead_err_cnt;
+	atomic64_t err_cnt;
 };
 
 /*
@@ -394,7 +394,7 @@ struct crypto_istat_aead {
  * @decrypt_tlen:	total data size handled by decrypt requests
  * @verify_cnt:		number of verify operation
  * @sign_cnt:		number of sign requests
- * @akcipher_err_cnt:	number of error for akcipher requests
+ * @err_cnt:		number of error for akcipher requests
  */
 struct crypto_istat_akcipher {
 	atomic64_t encrypt_cnt;
@@ -403,7 +403,7 @@ struct crypto_istat_akcipher {
 	atomic64_t decrypt_tlen;
 	atomic64_t verify_cnt;
 	atomic64_t sign_cnt;
-	atomic64_t akcipher_err_cnt;
+	atomic64_t err_cnt;
 };
 
 /*
@@ -412,14 +412,14 @@ struct crypto_istat_akcipher {
  * @encrypt_tlen:	total data size handled by encrypt requests
  * @decrypt_cnt:	number of decrypt requests
  * @decrypt_tlen:	total data size handled by decrypt requests
- * @cipher_err_cnt:	number of error for cipher requests
+ * @err_cnt:		number of error for cipher requests
  */
 struct crypto_istat_cipher {
 	atomic64_t encrypt_cnt;
 	atomic64_t encrypt_tlen;
 	atomic64_t decrypt_cnt;
 	atomic64_t decrypt_tlen;
-	atomic64_t cipher_err_cnt;
+	atomic64_t err_cnt;
 };
 
 /*
@@ -428,26 +428,26 @@ struct crypto_istat_cipher {
  * @compress_tlen:	total data size handled by compress requests
  * @decompress_cnt:	number of decompress requests
  * @decompress_tlen:	total data size handled by decompress requests
- * @compress_err_cnt:	number of error for compress requests
+ * @err_cnt:		number of error for compress requests
  */
 struct crypto_istat_compress {
 	atomic64_t compress_cnt;
 	atomic64_t compress_tlen;
 	atomic64_t decompress_cnt;
 	atomic64_t decompress_tlen;
-	atomic64_t compress_err_cnt;
+	atomic64_t err_cnt;
 };
 
 /*
  * struct crypto_istat_hash - statistics for has algorithm
  * @hash_cnt:		number of hash requests
  * @hash_tlen:		total data size hashed
- * @hash_err_cnt:	number of error for hash requests
+ * @err_cnt:		number of error for hash requests
  */
 struct crypto_istat_hash {
 	atomic64_t hash_cnt;
 	atomic64_t hash_tlen;
-	atomic64_t hash_err_cnt;
+	atomic64_t err_cnt;
 };
 
 /*
@@ -455,13 +455,13 @@ struct crypto_istat_hash {
  * @setsecret_cnt:		number of setsecrey operation
  * @generate_public_key_cnt:	number of generate_public_key operation
  * @compute_shared_secret_cnt:	number of compute_shared_secret operation
- * @kpp_err_cnt:		number of error for KPP requests
+ * @err_cnt:			number of error for KPP requests
  */
 struct crypto_istat_kpp {
 	atomic64_t setsecret_cnt;
 	atomic64_t generate_public_key_cnt;
 	atomic64_t compute_shared_secret_cnt;
-	atomic64_t kpp_err_cnt;
+	atomic64_t err_cnt;
 };
 
 /*
@@ -469,13 +469,13 @@ struct crypto_istat_kpp {
  * @generate_cnt:	number of RNG generate requests
  * @generate_tlen:	total data size of generated data by the RNG
  * @seed_cnt:		number of times the RNG was seeded
- * @rng_err_cnt:	number of error for RNG requests
+ * @err_cnt:		number of error for RNG requests
  */
 struct crypto_istat_rng {
 	atomic64_t generate_cnt;
 	atomic64_t generate_tlen;
 	atomic64_t seed_cnt;
-	atomic64_t rng_err_cnt;
+	atomic64_t err_cnt;
 };
 #endif /* CONFIG_CRYPTO_STATS */
 
diff --git a/include/uapi/linux/cryptouser.h b/include/uapi/linux/cryptouser.h
index 3a70f025e27d..4dc1603919ce 100644
--- a/include/uapi/linux/cryptouser.h
+++ b/include/uapi/linux/cryptouser.h
@@ -82,7 +82,7 @@ struct crypto_stat_aead {
 	__u64 stat_encrypt_tlen;
 	__u64 stat_decrypt_cnt;
 	__u64 stat_decrypt_tlen;
-	__u64 stat_aead_err_cnt;
+	__u64 stat_err_cnt;
 };
 
 struct crypto_stat_akcipher {
@@ -93,7 +93,7 @@ struct crypto_stat_akcipher {
 	__u64 stat_decrypt_tlen;
 	__u64 stat_verify_cnt;
 	__u64 stat_sign_cnt;
-	__u64 stat_akcipher_err_cnt;
+	__u64 stat_err_cnt;
 };
 
 struct crypto_stat_cipher {
@@ -102,7 +102,7 @@ struct crypto_stat_cipher {
 	__u64 stat_encrypt_tlen;
 	__u64 stat_decrypt_cnt;
 	__u64 stat_decrypt_tlen;
-	__u64 stat_cipher_err_cnt;
+	__u64 stat_err_cnt;
 };
 
 struct crypto_stat_compress {
@@ -111,14 +111,14 @@ struct crypto_stat_compress {
 	__u64 stat_compress_tlen;
 	__u64 stat_decompress_cnt;
 	__u64 stat_decompress_tlen;
-	__u64 stat_compress_err_cnt;
+	__u64 stat_err_cnt;
 };
 
 struct crypto_stat_hash {
 	char type[CRYPTO_MAX_NAME];
 	__u64 stat_hash_cnt;
 	__u64 stat_hash_tlen;
-	__u64 stat_hash_err_cnt;
+	__u64 stat_err_cnt;
 };
 
 struct crypto_stat_kpp {
@@ -126,7 +126,7 @@ struct crypto_stat_kpp {
 	__u64 stat_setsecret_cnt;
 	__u64 stat_generate_public_key_cnt;
 	__u64 stat_compute_shared_secret_cnt;
-	__u64 stat_kpp_err_cnt;
+	__u64 stat_err_cnt;
 };
 
 struct crypto_stat_rng {
@@ -134,7 +134,7 @@ struct crypto_stat_rng {
 	__u64 stat_generate_cnt;
 	__u64 stat_generate_tlen;
 	__u64 stat_seed_cnt;
-	__u64 stat_rng_err_cnt;
+	__u64 stat_err_cnt;
 };
 
 struct crypto_stat_larval {
diff --git a/tools/crypto/getstat.c b/tools/crypto/getstat.c
index 57fbb94608d4..9e8ff76420fa 100644
--- a/tools/crypto/getstat.c
+++ b/tools/crypto/getstat.c
@@ -157,7 +157,7 @@ static int get_stat(const char *drivername)
 		printf("%s\tHash\n\tHash: %llu bytes: %llu\n\tErrors: %llu\n",
 			drivername,
 			rhash->stat_hash_cnt, rhash->stat_hash_tlen,
-			rhash->stat_hash_err_cnt);
+			rhash->stat_err_cnt);
 	} else if (tb[CRYPTOCFGA_STAT_COMPRESS]) {
 		struct rtattr *rta = tb[CRYPTOCFGA_STAT_COMPRESS];
 		struct crypto_stat_compress *rblk =
@@ -166,7 +166,7 @@ static int get_stat(const char *drivername)
 			drivername,
 			rblk->stat_compress_cnt, rblk->stat_compress_tlen,
 			rblk->stat_decompress_cnt, rblk->stat_decompress_tlen,
-			rblk->stat_compress_err_cnt);
+			rblk->stat_err_cnt);
 	} else if (tb[CRYPTOCFGA_STAT_ACOMP]) {
 		struct rtattr *rta = tb[CRYPTOCFGA_STAT_ACOMP];
 		struct crypto_stat_compress *rcomp =
@@ -175,7 +175,7 @@ static int get_stat(const char *drivername)
 			drivername,
 			rcomp->stat_compress_cnt, rcomp->stat_compress_tlen,
 			rcomp->stat_decompress_cnt, rcomp->stat_decompress_tlen,
-			rcomp->stat_compress_err_cnt);
+			rcomp->stat_err_cnt);
 	} else if (tb[CRYPTOCFGA_STAT_AEAD]) {
 		struct rtattr *rta = tb[CRYPTOCFGA_STAT_AEAD];
 		struct crypto_stat_aead *raead =
@@ -184,7 +184,7 @@ static int get_stat(const char *drivername)
 			drivername,
 			raead->stat_encrypt_cnt, raead->stat_encrypt_tlen,
 			raead->stat_decrypt_cnt, raead->stat_decrypt_tlen,
-			raead->stat_aead_err_cnt);
+			raead->stat_err_cnt);
 	} else if (tb[CRYPTOCFGA_STAT_BLKCIPHER]) {
 		struct rtattr *rta = tb[CRYPTOCFGA_STAT_BLKCIPHER];
 		struct crypto_stat_cipher *rblk =
@@ -193,7 +193,7 @@ static int get_stat(const char *drivername)
 			drivername,
 			rblk->stat_encrypt_cnt, rblk->stat_encrypt_tlen,
 			rblk->stat_decrypt_cnt, rblk->stat_decrypt_tlen,
-			rblk->stat_cipher_err_cnt);
+			rblk->stat_err_cnt);
 	} else if (tb[CRYPTOCFGA_STAT_AKCIPHER]) {
 		struct rtattr *rta = tb[CRYPTOCFGA_STAT_AKCIPHER];
 		struct crypto_stat_akcipher *rblk =
@@ -203,7 +203,7 @@ static int get_stat(const char *drivername)
 			rblk->stat_encrypt_cnt, rblk->stat_encrypt_tlen,
 			rblk->stat_decrypt_cnt, rblk->stat_decrypt_tlen,
 			rblk->stat_sign_cnt, rblk->stat_verify_cnt,
-			rblk->stat_akcipher_err_cnt);
+			rblk->stat_err_cnt);
 	} else if (tb[CRYPTOCFGA_STAT_CIPHER]) {
 		struct rtattr *rta = tb[CRYPTOCFGA_STAT_CIPHER];
 		struct crypto_stat_cipher *rblk =
@@ -212,7 +212,7 @@ static int get_stat(const char *drivername)
 			drivername,
 			rblk->stat_encrypt_cnt, rblk->stat_encrypt_tlen,
 			rblk->stat_decrypt_cnt, rblk->stat_decrypt_tlen,
-			rblk->stat_cipher_err_cnt);
+			rblk->stat_err_cnt);
 	} else if (tb[CRYPTOCFGA_STAT_RNG]) {
 		struct rtattr *rta = tb[CRYPTOCFGA_STAT_RNG];
 		struct crypto_stat_rng *rrng =
@@ -221,7 +221,7 @@ static int get_stat(const char *drivername)
 			drivername,
 			rrng->stat_seed_cnt,
 			rrng->stat_generate_cnt, rrng->stat_generate_tlen,
-			rrng->stat_rng_err_cnt);
+			rrng->stat_err_cnt);
 	} else if (tb[CRYPTOCFGA_STAT_KPP]) {
 		struct rtattr *rta = tb[CRYPTOCFGA_STAT_KPP];
 		struct crypto_stat_kpp *rkpp =
@@ -231,7 +231,7 @@ static int get_stat(const char *drivername)
 			rkpp->stat_setsecret_cnt,
 			rkpp->stat_generate_public_key_cnt,
 			rkpp->stat_compute_shared_secret_cnt,
-			rkpp->stat_kpp_err_cnt);
+			rkpp->stat_err_cnt);
 	} else {
 		fprintf(stderr, "%s is of an unknown algorithm\n", drivername);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 52ea899637c746984d657b508da6e3f2686adfca Mon Sep 17 00:00:00 2001
From: Peter Hutterer <peter.hutterer@who-t.net>
Date: Wed, 5 Dec 2018 10:42:21 +1000
Subject: Input: add `REL_WHEEL_HI_RES` and `REL_HWHEEL_HI_RES`

This event code represents scroll reports from high-resolution wheels and
is modelled after the approach Windows uses. The value 120 is one detent
(wheel click) of movement. Mice with higher-resolution scrolling can send
fractions of 120 which must be accumulated in userspace. Userspace can either
wait for a full 120 to accumulate or scroll by fractions of one logical scroll
movement as the events come in. 120 was picked as magic number because it has
a high number of integer fractions that can be used by high-resolution wheels.

For more information see
https://docs.microsoft.com/en-us/previous-versions/windows/hardware/design/dn613912(v=vs.85)

These new axes obsolete REL_WHEEL and REL_HWHEEL. The legacy axes are emulated
by the kernel but the most accurate (and most granular) data is available
through the new axes.

Signed-off-by: Peter Hutterer <peter.hutterer@who-t.net>
Acked-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Verified-by: Harry Cutts <hcutts@chromium.org>
Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
---
 Documentation/input/event-codes.rst    | 21 ++++++++++++++++++++-
 include/uapi/linux/input-event-codes.h |  2 ++
 2 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/Documentation/input/event-codes.rst b/Documentation/input/event-codes.rst
index a8c0873beb95..b24b5343f5eb 100644
--- a/Documentation/input/event-codes.rst
+++ b/Documentation/input/event-codes.rst
@@ -190,7 +190,26 @@ A few EV_REL codes have special meanings:
 * REL_WHEEL, REL_HWHEEL:
 
   - These codes are used for vertical and horizontal scroll wheels,
-    respectively.
+    respectively. The value is the number of detents moved on the wheel, the
+    physical size of which varies by device. For high-resolution wheels
+    this may be an approximation based on the high-resolution scroll events,
+    see REL_WHEEL_HI_RES. These event codes are legacy codes and
+    REL_WHEEL_HI_RES and REL_HWHEEL_HI_RES should be preferred where
+    available.
+
+* REL_WHEEL_HI_RES, REL_HWHEEL_HI_RES:
+
+  - High-resolution scroll wheel data. The accumulated value 120 represents
+    movement by one detent. For devices that do not provide high-resolution
+    scrolling, the value is always a multiple of 120. For devices with
+    high-resolution scrolling, the value may be a fraction of 120.
+
+    If a vertical scroll wheel supports high-resolution scrolling, this code
+    will be emitted in addition to REL_WHEEL or REL_HWHEEL. The REL_WHEEL
+    and REL_HWHEEL may be an approximation based on the high-resolution
+    scroll events. There is no guarantee that the high-resolution data
+    is a multiple of 120 at the time of an emulated REL_WHEEL or REL_HWHEEL
+    event.
 
 EV_ABS
 ------
diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h
index 3eb5a4c3d60a..265ef2028660 100644
--- a/include/uapi/linux/input-event-codes.h
+++ b/include/uapi/linux/input-event-codes.h
@@ -716,6 +716,8 @@
  * the situation described above.
  */
 #define REL_RESERVED		0x0a
+#define REL_WHEEL_HI_RES	0x0b
+#define REL_HWHEEL_HI_RES	0x0c
 #define REL_MAX			0x0f
 #define REL_CNT			(REL_MAX+1)
 
-- 
cgit v1.2.3-59-g8ed1b


From 36e7999dc19a1a6e76258020a49f7f7836018e46 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Mon, 3 Dec 2018 14:24:33 -0800
Subject: drm/v3d: Document cache flushing ABI.

Right now, userspace doesn't do any L2T writes, but we should lay out
our expectations for how it works.

v2: Explicitly mention the VCD cache flushing requirements and that
    we'll flush the other caches before each of the CLs.

Signed-off-by: Eric Anholt <eric@anholt.net>
Link: https://patchwork.freedesktop.org/patch/msgid/20181203222438.25417-1-eric@anholt.net
Reviewed-by: Dave Emett <david.emett@broadcom.com>
---
 include/uapi/drm/v3d_drm.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/drm/v3d_drm.h b/include/uapi/drm/v3d_drm.h
index 35c7d813c66e..ea70669d2138 100644
--- a/include/uapi/drm/v3d_drm.h
+++ b/include/uapi/drm/v3d_drm.h
@@ -52,6 +52,14 @@ extern "C" {
  *
  * This asks the kernel to have the GPU execute an optional binner
  * command list, and a render command list.
+ *
+ * The L1T, slice, L2C, L2T, and GCA caches will be flushed before
+ * each CL executes.  The VCD cache should be flushed (if necessary)
+ * by the submitted CLs.  The TLB writes are guaranteed to have been
+ * flushed by the time the render done IRQ happens, which is the
+ * trigger for out_sync.  Any dirtying of cachelines by the job (only
+ * possible using TMU writes) must be flushed by the caller using the
+ * CL's cache flush commands.
  */
 struct drm_v3d_submit_cl {
 	/* Pointer to the binner command list.
-- 
cgit v1.2.3-59-g8ed1b


From 569c665150156e44ecbd92af47a6d3fd4e2e4690 Mon Sep 17 00:00:00 2001
From: Danit Goldberg <danitg@mellanox.com>
Date: Fri, 30 Nov 2018 13:22:05 +0200
Subject: IB/mlx5: Add packet based credit mode support

The device can support two credit modes, message based (default) and
packet based. In order to enable packet based mode, the QP should be
created with special flag that indicates this.

This patch adds support for the new DV QP creation flag that can be used
for RC QPs in order to change the credit mode.

Signed-off-by: Danit Goldberg <danitg@mellanox.com>
Reviewed-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  1 +
 drivers/infiniband/hw/mlx5/qp.c      | 15 +++++++++++++--
 include/uapi/rdma/mlx5-abi.h         |  1 +
 3 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 861b68f2e330..3e034bc85bde 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -461,6 +461,7 @@ enum mlx5_ib_qp_flags {
 	MLX5_IB_QP_UNDERLAY			= 1 << 10,
 	MLX5_IB_QP_PCI_WRITE_END_PADDING	= 1 << 11,
 	MLX5_IB_QP_TUNNEL_OFFLOAD		= 1 << 12,
+	MLX5_IB_QP_PACKET_BASED_CREDIT		= 1 << 13,
 };
 
 struct mlx5_umr_wr {
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 6841c0f9237f..d5095fcd4cda 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -1889,7 +1889,8 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 					      MLX5_QP_FLAG_BFREG_INDEX |
 					      MLX5_QP_FLAG_TYPE_DCT |
 					      MLX5_QP_FLAG_TYPE_DCI |
-					      MLX5_QP_FLAG_ALLOW_SCATTER_CQE))
+					      MLX5_QP_FLAG_ALLOW_SCATTER_CQE |
+					      MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE))
 			return -EINVAL;
 
 		err = get_qp_user_index(to_mucontext(pd->uobject->context),
@@ -1925,6 +1926,15 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 			qp->flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC;
 		}
 
+		if (ucmd.flags & MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE) {
+			if (init_attr->qp_type != IB_QPT_RC ||
+				!MLX5_CAP_GEN(dev->mdev, qp_packet_based)) {
+				mlx5_ib_dbg(dev, "packet based credit mode isn't supported\n");
+				return -EOPNOTSUPP;
+			}
+			qp->flags |= MLX5_IB_QP_PACKET_BASED_CREDIT;
+		}
+
 		if (init_attr->create_flags & IB_QP_CREATE_SOURCE_QPN) {
 			if (init_attr->qp_type != IB_QPT_UD ||
 			    (MLX5_CAP_GEN(dev->mdev, port_type) !=
@@ -2021,7 +2031,8 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 		MLX5_SET(qpc, qpc, cd_slave_send, 1);
 	if (qp->flags & MLX5_IB_QP_MANAGED_RECV)
 		MLX5_SET(qpc, qpc, cd_slave_receive, 1);
-
+	if (qp->flags & MLX5_IB_QP_PACKET_BASED_CREDIT)
+		MLX5_SET(qpc, qpc, req_e2e_credit_mode, 1);
 	if (qp->scat_cqe && is_connected(init_attr->qp_type)) {
 		configure_responder_scat_cqe(init_attr, qpc);
 		configure_requester_scat_cqe(dev, init_attr,
diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h
index 8fa9f90e2bb1..4af581170f62 100644
--- a/include/uapi/rdma/mlx5-abi.h
+++ b/include/uapi/rdma/mlx5-abi.h
@@ -48,6 +48,7 @@ enum {
 	MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC = 1 << 6,
 	MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC = 1 << 7,
 	MLX5_QP_FLAG_ALLOW_SCATTER_CQE	= 1 << 8,
+	MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE	= 1 << 9,
 };
 
 enum {
-- 
cgit v1.2.3-59-g8ed1b


From 7e11b911b520de6a3189fafa94740f5fde2a2c98 Mon Sep 17 00:00:00 2001
From: Danit Goldberg <danitg@mellanox.com>
Date: Fri, 30 Nov 2018 13:22:06 +0200
Subject: IB/mlx5: Report packet based credit mode device capability

Report packet based credit mode capability via the mlx5 DV interface.

Signed-off-by: Danit Goldberg <danitg@mellanox.com>
Reviewed-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/main.c | 3 +++
 include/uapi/rdma/mlx5-abi.h      | 1 +
 2 files changed, 4 insertions(+)

(limited to 'include/uapi')

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 96515a8c9d2c..f985d0d9b883 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1018,6 +1018,9 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 
 		if (MLX5_CAP_GEN(mdev, cqe_128_always))
 			resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_PAD;
+		if (MLX5_CAP_GEN(mdev, qp_packet_based))
+			resp.flags |=
+				MLX5_IB_QUERY_DEV_RESP_PACKET_BASED_CREDIT_MODE;
 	}
 
 	if (field_avail(typeof(resp), sw_parsing_caps,
diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h
index 4af581170f62..87b3198f4b5d 100644
--- a/include/uapi/rdma/mlx5-abi.h
+++ b/include/uapi/rdma/mlx5-abi.h
@@ -237,6 +237,7 @@ enum mlx5_ib_query_dev_resp_flags {
 	/* Support 128B CQE compression */
 	MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_COMP = 1 << 0,
 	MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_PAD  = 1 << 1,
+	MLX5_IB_QUERY_DEV_RESP_PACKET_BASED_CREDIT_MODE = 1 << 2,
 };
 
 enum mlx5_ib_tunnel_offloads {
-- 
cgit v1.2.3-59-g8ed1b


From 1dde0ea95b782425b95455d487cb44991525a1d1 Mon Sep 17 00:00:00 2001
From: Felix Kuehling <Felix.Kuehling@amd.com>
Date: Tue, 20 Nov 2018 21:00:29 -0500
Subject: drm/amdkfd: Add DMABuf import functionality

This is used for interoperability between ROCm compute and graphics
APIs. It allows importing graphics driver BOs into the ROCm SVM
address space for zero-copy GPU access.

The API is split into two steps (query and import) to allow user mode
to manage the virtual address space allocation for the imported buffer.

Acked-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c       |  57 +++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h       |  11 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c |  55 +++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_gem.h          |   2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c        |   4 +-
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c         | 118 ++++++++++++++++++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h            |   1 +
 drivers/gpu/drm/amd/amdkfd/kfd_topology.c        |  18 ++++
 include/uapi/linux/kfd_ioctl.h                   |  26 ++++-
 9 files changed, 287 insertions(+), 5 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 68b29a210eaa..68e4cf1b655c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -26,6 +26,7 @@
 #include "amdgpu.h"
 #include "amdgpu_gfx.h"
 #include <linux/module.h>
+#include <linux/dma-buf.h>
 
 const struct kgd2kfd_calls *kgd2kfd;
 
@@ -433,6 +434,62 @@ void amdgpu_amdkfd_get_cu_info(struct kgd_dev *kgd, struct kfd_cu_info *cu_info)
 	cu_info->lds_size = acu_info.lds_size;
 }
 
+int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd,
+				  struct kgd_dev **dma_buf_kgd,
+				  uint64_t *bo_size, void *metadata_buffer,
+				  size_t buffer_size, uint32_t *metadata_size,
+				  uint32_t *flags)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
+	struct dma_buf *dma_buf;
+	struct drm_gem_object *obj;
+	struct amdgpu_bo *bo;
+	uint64_t metadata_flags;
+	int r = -EINVAL;
+
+	dma_buf = dma_buf_get(dma_buf_fd);
+	if (IS_ERR(dma_buf))
+		return PTR_ERR(dma_buf);
+
+	if (dma_buf->ops != &amdgpu_dmabuf_ops)
+		/* Can't handle non-graphics buffers */
+		goto out_put;
+
+	obj = dma_buf->priv;
+	if (obj->dev->driver != adev->ddev->driver)
+		/* Can't handle buffers from different drivers */
+		goto out_put;
+
+	adev = obj->dev->dev_private;
+	bo = gem_to_amdgpu_bo(obj);
+	if (!(bo->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM |
+				    AMDGPU_GEM_DOMAIN_GTT)))
+		/* Only VRAM and GTT BOs are supported */
+		goto out_put;
+
+	r = 0;
+	if (dma_buf_kgd)
+		*dma_buf_kgd = (struct kgd_dev *)adev;
+	if (bo_size)
+		*bo_size = amdgpu_bo_size(bo);
+	if (metadata_size)
+		*metadata_size = bo->metadata_size;
+	if (metadata_buffer)
+		r = amdgpu_bo_get_metadata(bo, metadata_buffer, buffer_size,
+					   metadata_size, &metadata_flags);
+	if (flags) {
+		*flags = (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) ?
+			ALLOC_MEM_FLAGS_VRAM : ALLOC_MEM_FLAGS_GTT;
+
+		if (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)
+			*flags |= ALLOC_MEM_FLAGS_PUBLIC;
+	}
+
+out_put:
+	dma_buf_put(dma_buf);
+	return r;
+}
+
 uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 131c6e5e6f10..70429f7aa9a8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -149,6 +149,11 @@ uint64_t amdgpu_amdkfd_get_gpu_clock_counter(struct kgd_dev *kgd);
 
 uint32_t amdgpu_amdkfd_get_max_engine_clock_in_mhz(struct kgd_dev *kgd);
 void amdgpu_amdkfd_get_cu_info(struct kgd_dev *kgd, struct kfd_cu_info *cu_info);
+int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd,
+				  struct kgd_dev **dmabuf_kgd,
+				  uint64_t *bo_size, void *metadata_buffer,
+				  size_t buffer_size, uint32_t *metadata_size,
+				  uint32_t *flags);
 uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd);
 uint64_t amdgpu_amdkfd_get_hive_id(struct kgd_dev *kgd);
 
@@ -200,6 +205,12 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,
 int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd,
 					      struct kfd_vm_fault_info *info);
 
+int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd,
+				      struct dma_buf *dmabuf,
+				      uint64_t va, void *vm,
+				      struct kgd_mem **mem, uint64_t *size,
+				      uint64_t *mmap_offset);
+
 void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
 void amdgpu_amdkfd_unreserve_memory_limit(struct amdgpu_bo *bo);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 5fb60e1d713a..a0a500d45886 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -25,6 +25,7 @@
 #include <linux/list.h>
 #include <linux/pagemap.h>
 #include <linux/sched/mm.h>
+#include <linux/dma-buf.h>
 #include <drm/drmP.h>
 #include "amdgpu_object.h"
 #include "amdgpu_vm.h"
@@ -1664,6 +1665,60 @@ int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd,
 	return 0;
 }
 
+int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd,
+				      struct dma_buf *dma_buf,
+				      uint64_t va, void *vm,
+				      struct kgd_mem **mem, uint64_t *size,
+				      uint64_t *mmap_offset)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
+	struct drm_gem_object *obj;
+	struct amdgpu_bo *bo;
+	struct amdgpu_vm *avm = (struct amdgpu_vm *)vm;
+
+	if (dma_buf->ops != &amdgpu_dmabuf_ops)
+		/* Can't handle non-graphics buffers */
+		return -EINVAL;
+
+	obj = dma_buf->priv;
+	if (obj->dev->dev_private != adev)
+		/* Can't handle buffers from other devices */
+		return -EINVAL;
+
+	bo = gem_to_amdgpu_bo(obj);
+	if (!(bo->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM |
+				    AMDGPU_GEM_DOMAIN_GTT)))
+		/* Only VRAM and GTT BOs are supported */
+		return -EINVAL;
+
+	*mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL);
+	if (!*mem)
+		return -ENOMEM;
+
+	if (size)
+		*size = amdgpu_bo_size(bo);
+
+	if (mmap_offset)
+		*mmap_offset = amdgpu_bo_mmap_offset(bo);
+
+	INIT_LIST_HEAD(&(*mem)->bo_va_list);
+	mutex_init(&(*mem)->lock);
+	(*mem)->mapping_flags =
+		AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE |
+		AMDGPU_VM_PAGE_EXECUTABLE | AMDGPU_VM_MTYPE_NC;
+
+	(*mem)->bo = amdgpu_bo_ref(bo);
+	(*mem)->va = va;
+	(*mem)->domain = (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) ?
+		AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT;
+	(*mem)->mapped_to_gpu_memory = 0;
+	(*mem)->process_info = avm->process_info;
+	add_kgd_mem_to_kfd_bo_list(*mem, avm->process_info, false);
+	amdgpu_sync_create(&(*mem)->sync);
+
+	return 0;
+}
+
 /* Evict a userptr BO by stopping the queues if necessary
  *
  * Runs in MMU notifier, may be in RECLAIM_FS context. This means it
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.h
index d63daba9b17c..f1ddfc50bcc7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.h
@@ -54,6 +54,8 @@ void *amdgpu_gem_prime_vmap(struct drm_gem_object *obj);
 void amdgpu_gem_prime_vunmap(struct drm_gem_object *obj, void *vaddr);
 int amdgpu_gem_prime_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma);
 
+extern const struct dma_buf_ops amdgpu_dmabuf_ops;
+
 /*
  * GEM objects.
  */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
index 3e44d889f7af..71913a18d142 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
@@ -39,8 +39,6 @@
 #include <drm/amdgpu_drm.h>
 #include <linux/dma-buf.h>
 
-static const struct dma_buf_ops amdgpu_dmabuf_ops;
-
 /**
  * amdgpu_gem_prime_get_sg_table - &drm_driver.gem_prime_get_sg_table
  * implementation
@@ -332,7 +330,7 @@ static int amdgpu_gem_begin_cpu_access(struct dma_buf *dma_buf,
 	return ret;
 }
 
-static const struct dma_buf_ops amdgpu_dmabuf_ops = {
+const struct dma_buf_ops amdgpu_dmabuf_ops = {
 	.attach = amdgpu_gem_map_attach,
 	.detach = amdgpu_gem_map_detach,
 	.map_dma_buf = drm_gem_map_dma_buf,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 5f4062b41add..ae3ae0fb2602 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -33,6 +33,7 @@
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
+#include <linux/dma-buf.h>
 #include <asm/processor.h>
 #include "kfd_priv.h"
 #include "kfd_device_queue_manager.h"
@@ -1550,6 +1551,115 @@ copy_from_user_failed:
 	return err;
 }
 
+static int kfd_ioctl_get_dmabuf_info(struct file *filep,
+		struct kfd_process *p, void *data)
+{
+	struct kfd_ioctl_get_dmabuf_info_args *args = data;
+	struct kfd_dev *dev = NULL;
+	struct kgd_dev *dma_buf_kgd;
+	void *metadata_buffer = NULL;
+	uint32_t flags;
+	unsigned int i;
+	int r;
+
+	/* Find a KFD GPU device that supports the get_dmabuf_info query */
+	for (i = 0; kfd_topology_enum_kfd_devices(i, &dev) == 0; i++)
+		if (dev)
+			break;
+	if (!dev)
+		return -EINVAL;
+
+	if (args->metadata_ptr) {
+		metadata_buffer = kzalloc(args->metadata_size, GFP_KERNEL);
+		if (!metadata_buffer)
+			return -ENOMEM;
+	}
+
+	/* Get dmabuf info from KGD */
+	r = amdgpu_amdkfd_get_dmabuf_info(dev->kgd, args->dmabuf_fd,
+					  &dma_buf_kgd, &args->size,
+					  metadata_buffer, args->metadata_size,
+					  &args->metadata_size, &flags);
+	if (r)
+		goto exit;
+
+	/* Reverse-lookup gpu_id from kgd pointer */
+	dev = kfd_device_by_kgd(dma_buf_kgd);
+	if (!dev) {
+		r = -EINVAL;
+		goto exit;
+	}
+	args->gpu_id = dev->id;
+	args->flags = flags;
+
+	/* Copy metadata buffer to user mode */
+	if (metadata_buffer) {
+		r = copy_to_user((void __user *)args->metadata_ptr,
+				 metadata_buffer, args->metadata_size);
+		if (r != 0)
+			r = -EFAULT;
+	}
+
+exit:
+	kfree(metadata_buffer);
+
+	return r;
+}
+
+static int kfd_ioctl_import_dmabuf(struct file *filep,
+				   struct kfd_process *p, void *data)
+{
+	struct kfd_ioctl_import_dmabuf_args *args = data;
+	struct kfd_process_device *pdd;
+	struct dma_buf *dmabuf;
+	struct kfd_dev *dev;
+	int idr_handle;
+	uint64_t size;
+	void *mem;
+	int r;
+
+	dev = kfd_device_by_id(args->gpu_id);
+	if (!dev)
+		return -EINVAL;
+
+	dmabuf = dma_buf_get(args->dmabuf_fd);
+	if (!dmabuf)
+		return -EINVAL;
+
+	mutex_lock(&p->mutex);
+
+	pdd = kfd_bind_process_to_device(dev, p);
+	if (IS_ERR(pdd)) {
+		r = PTR_ERR(pdd);
+		goto err_unlock;
+	}
+
+	r = amdgpu_amdkfd_gpuvm_import_dmabuf(dev->kgd, dmabuf,
+					      args->va_addr, pdd->vm,
+					      (struct kgd_mem **)&mem, &size,
+					      NULL);
+	if (r)
+		goto err_unlock;
+
+	idr_handle = kfd_process_device_create_obj_handle(pdd, mem);
+	if (idr_handle < 0) {
+		r = -EFAULT;
+		goto err_free;
+	}
+
+	mutex_unlock(&p->mutex);
+
+	args->handle = MAKE_HANDLE(args->gpu_id, idr_handle);
+
+	return 0;
+
+err_free:
+	amdgpu_amdkfd_gpuvm_free_memory_of_gpu(dev->kgd, (struct kgd_mem *)mem);
+err_unlock:
+	mutex_unlock(&p->mutex);
+	return r;
+}
+
 #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \
 	[_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \
 			    .cmd_drv = 0, .name = #ioctl}
@@ -1635,7 +1745,13 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
 			kfd_ioctl_set_cu_mask, 0),
 
 	AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_QUEUE_WAVE_STATE,
-			kfd_ioctl_get_queue_wave_state, 0)
+			kfd_ioctl_get_queue_wave_state, 0),
+
+	AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_DMABUF_INFO,
+				kfd_ioctl_get_dmabuf_info, 0),
+
+	AMDKFD_IOCTL_DEF(AMDKFD_IOC_IMPORT_DMABUF,
+				kfd_ioctl_import_dmabuf, 0),
 
 };
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index dec8e64f36bd..0689d4ccbbc0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -793,6 +793,7 @@ struct kfd_topology_device *kfd_topology_device_by_proximity_domain(
 struct kfd_topology_device *kfd_topology_device_by_id(uint32_t gpu_id);
 struct kfd_dev *kfd_device_by_id(uint32_t gpu_id);
 struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev);
+struct kfd_dev *kfd_device_by_kgd(const struct kgd_dev *kgd);
 int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev);
 int kfd_numa_node_to_apic_id(int numa_node_id);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index c5ed21ef2462..5f5b2acedbac 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -111,6 +111,24 @@ struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev)
 	return device;
 }
 
+struct kfd_dev *kfd_device_by_kgd(const struct kgd_dev *kgd)
+{
+	struct kfd_topology_device *top_dev;
+	struct kfd_dev *device = NULL;
+
+	down_read(&topology_lock);
+
+	list_for_each_entry(top_dev, &topology_device_list, list)
+		if (top_dev->gpu && top_dev->gpu->kgd == kgd) {
+			device = top_dev->gpu;
+			break;
+		}
+
+	up_read(&topology_lock);
+
+	return device;
+}
+
 /* Called with write topology_lock acquired */
 static void kfd_release_topology_device(struct kfd_topology_device *dev)
 {
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index b01eb502d49c..e622fd1fbd46 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -398,6 +398,24 @@ struct kfd_ioctl_unmap_memory_from_gpu_args {
 	__u32 n_success;		/* to/from KFD */
 };
 
+struct kfd_ioctl_get_dmabuf_info_args {
+	__u64 size;		/* from KFD */
+	__u64 metadata_ptr;	/* to KFD */
+	__u32 metadata_size;	/* to KFD (space allocated by user)
+				 * from KFD (actual metadata size)
+				 */
+	__u32 gpu_id;	/* from KFD */
+	__u32 flags;		/* from KFD (KFD_IOC_ALLOC_MEM_FLAGS) */
+	__u32 dmabuf_fd;	/* to KFD */
+};
+
+struct kfd_ioctl_import_dmabuf_args {
+	__u64 va_addr;	/* to KFD */
+	__u64 handle;	/* from KFD */
+	__u32 gpu_id;	/* to KFD */
+	__u32 dmabuf_fd;	/* to KFD */
+};
+
 #define AMDKFD_IOCTL_BASE 'K'
 #define AMDKFD_IO(nr)			_IO(AMDKFD_IOCTL_BASE, nr)
 #define AMDKFD_IOR(nr, type)		_IOR(AMDKFD_IOCTL_BASE, nr, type)
@@ -486,7 +504,13 @@ struct kfd_ioctl_unmap_memory_from_gpu_args {
 #define AMDKFD_IOC_GET_QUEUE_WAVE_STATE		\
 		AMDKFD_IOWR(0x1B, struct kfd_ioctl_get_queue_wave_state_args)
 
+#define AMDKFD_IOC_GET_DMABUF_INFO		\
+		AMDKFD_IOWR(0x1C, struct kfd_ioctl_get_dmabuf_info_args)
+
+#define AMDKFD_IOC_IMPORT_DMABUF		\
+		AMDKFD_IOWR(0x1D, struct kfd_ioctl_import_dmabuf_args)
+
 #define AMDKFD_COMMAND_START		0x01
-#define AMDKFD_COMMAND_END		0x1C
+#define AMDKFD_COMMAND_END		0x1E
 
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From c454a46b5efd8eff8880e88ece2976e60a26bf35 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Fri, 7 Dec 2018 16:42:25 -0800
Subject: bpf: Add bpf_line_info support

This patch adds bpf_line_info support.

It accepts an array of bpf_line_info objects during BPF_PROG_LOAD.
The "line_info", "line_info_cnt" and "line_info_rec_size" are added
to the "union bpf_attr".  The "line_info_rec_size" makes
bpf_line_info extensible in the future.

The new "check_btf_line()" ensures the userspace line_info is valid
for the kernel to use.

When the verifier is translating/patching the bpf_prog (through
"bpf_patch_insn_single()"), the line_infos' insn_off is also
adjusted by the newly added "bpf_adj_linfo()".

If the bpf_prog is jited, this patch also provides the jited addrs (in
aux->jited_linfo) for the corresponding line_info.insn_off.
"bpf_prog_fill_jited_linfo()" is added to fill the aux->jited_linfo.
It is currently called by the x86 jit.  Other jits can also use
"bpf_prog_fill_jited_linfo()" and it will be done in the followup patches.
In the future, if it deemed necessary, a particular jit could also provide
its own "bpf_prog_fill_jited_linfo()" implementation.

A few "*line_info*" fields are added to the bpf_prog_info such
that the user can get the xlated line_info back (i.e. the line_info
with its insn_off reflecting the translated prog).  The jited_line_info
is available if the prog is jited.  It is an array of __u64.
If the prog is not jited, jited_line_info_cnt is 0.

The verifier's verbose log with line_info will be done in
a follow up patch.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c  |   2 +
 include/linux/bpf.h          |  21 +++++
 include/linux/bpf_verifier.h |   1 +
 include/linux/btf.h          |   1 +
 include/linux/filter.h       |   7 ++
 include/uapi/linux/bpf.h     |  19 +++++
 kernel/bpf/btf.c             |   2 +-
 kernel/bpf/core.c            | 118 +++++++++++++++++++++++++-
 kernel/bpf/syscall.c         |  83 ++++++++++++++++--
 kernel/bpf/verifier.c        | 198 +++++++++++++++++++++++++++++++++++++------
 10 files changed, 419 insertions(+), 33 deletions(-)

(limited to 'include/uapi')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 2580cd2e98b1..5542303c43d9 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1181,6 +1181,8 @@ out_image:
 	}
 
 	if (!image || !prog->is_func || extra_pass) {
+		if (image)
+			bpf_prog_fill_jited_linfo(prog, addrs);
 out_addrs:
 		kfree(addrs);
 		kfree(jit_data);
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e82b7039fc66..0c992b86eb2c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -319,7 +319,28 @@ struct bpf_prog_aux {
 	struct bpf_prog_offload *offload;
 	struct btf *btf;
 	struct bpf_func_info *func_info;
+	/* bpf_line_info loaded from userspace.  linfo->insn_off
+	 * has the xlated insn offset.
+	 * Both the main and sub prog share the same linfo.
+	 * The subprog can access its first linfo by
+	 * using the linfo_idx.
+	 */
+	struct bpf_line_info *linfo;
+	/* jited_linfo is the jited addr of the linfo.  It has a
+	 * one to one mapping to linfo:
+	 * jited_linfo[i] is the jited addr for the linfo[i]->insn_off.
+	 * Both the main and sub prog share the same jited_linfo.
+	 * The subprog can access its first jited_linfo by
+	 * using the linfo_idx.
+	 */
+	void **jited_linfo;
 	u32 func_info_cnt;
+	u32 nr_linfo;
+	/* subprog can use linfo_idx to access its first linfo and
+	 * jited_linfo.
+	 * main prog always has linfo_idx == 0
+	 */
+	u32 linfo_idx;
 	union {
 		struct work_struct work;
 		struct rcu_head	rcu;
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 11f5df1092d9..c736945be7c5 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -203,6 +203,7 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
 
 struct bpf_subprog_info {
 	u32 start; /* insn idx of function entry point */
+	u32 linfo_idx; /* The idx to the main_prog->aux->linfo */
 	u16 stack_depth; /* max. stack depth used by this function */
 };
 
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 8c2199b5d250..b98405a56383 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -46,6 +46,7 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
 		       struct seq_file *m);
 int btf_get_fd_by_id(u32 id);
 u32 btf_id(const struct btf *btf);
+bool btf_name_offset_valid(const struct btf *btf, u32 offset);
 
 #ifdef CONFIG_BPF_SYSCALL
 const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index d16deead65c6..29f21f9d7f68 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -718,6 +718,13 @@ void bpf_prog_free(struct bpf_prog *fp);
 
 bool bpf_opcode_in_insntable(u8 code);
 
+void bpf_prog_free_linfo(struct bpf_prog *prog);
+void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
+			       const u32 *insn_to_jit_off);
+int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog);
+void bpf_prog_free_jited_linfo(struct bpf_prog *prog);
+void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog);
+
 struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags);
 struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
 				  gfp_t gfp_extra_flags);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a84fd232d934..7a66db8d15d5 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -356,6 +356,9 @@ union bpf_attr {
 		__u32		func_info_rec_size;	/* userspace bpf_func_info size */
 		__aligned_u64	func_info;	/* func info */
 		__u32		func_info_cnt;	/* number of bpf_func_info records */
+		__u32		line_info_rec_size;	/* userspace bpf_line_info size */
+		__aligned_u64	line_info;	/* line info */
+		__u32		line_info_cnt;	/* number of bpf_line_info records */
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -2679,6 +2682,12 @@ struct bpf_prog_info {
 	__u32 func_info_rec_size;
 	__aligned_u64 func_info;
 	__u32 func_info_cnt;
+	__u32 line_info_cnt;
+	__aligned_u64 line_info;
+	__aligned_u64 jited_line_info;
+	__u32 jited_line_info_cnt;
+	__u32 line_info_rec_size;
+	__u32 jited_line_info_rec_size;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
@@ -2995,4 +3004,14 @@ struct bpf_func_info {
 	__u32	type_id;
 };
 
+#define BPF_LINE_INFO_LINE_NUM(line_col)	((line_col) >> 10)
+#define BPF_LINE_INFO_LINE_COL(line_col)	((line_col) & 0x3ff)
+
+struct bpf_line_info {
+	__u32	insn_off;
+	__u32	file_name_off;
+	__u32	line_off;
+	__u32	line_col;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index a09b2f94ab25..e0a827f95e19 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -444,7 +444,7 @@ static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
 	return kind_ops[BTF_INFO_KIND(t->info)];
 }
 
-static bool btf_name_offset_valid(const struct btf *btf, u32 offset)
+bool btf_name_offset_valid(const struct btf *btf, u32 offset)
 {
 	return BTF_STR_OFFSET_VALID(offset) &&
 		offset < btf->hdr.str_len;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index a5b223ef7131..5cdd8da0e7f2 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -105,6 +105,91 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_alloc);
 
+int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog)
+{
+	if (!prog->aux->nr_linfo || !prog->jit_requested)
+		return 0;
+
+	prog->aux->jited_linfo = kcalloc(prog->aux->nr_linfo,
+					 sizeof(*prog->aux->jited_linfo),
+					 GFP_KERNEL | __GFP_NOWARN);
+	if (!prog->aux->jited_linfo)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void bpf_prog_free_jited_linfo(struct bpf_prog *prog)
+{
+	kfree(prog->aux->jited_linfo);
+	prog->aux->jited_linfo = NULL;
+}
+
+void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog)
+{
+	if (prog->aux->jited_linfo && !prog->aux->jited_linfo[0])
+		bpf_prog_free_jited_linfo(prog);
+}
+
+/* The jit engine is responsible to provide an array
+ * for insn_off to the jited_off mapping (insn_to_jit_off).
+ *
+ * The idx to this array is the insn_off.  Hence, the insn_off
+ * here is relative to the prog itself instead of the main prog.
+ * This array has one entry for each xlated bpf insn.
+ *
+ * jited_off is the byte off to the last byte of the jited insn.
+ *
+ * Hence, with
+ * insn_start:
+ *      The first bpf insn off of the prog.  The insn off
+ *      here is relative to the main prog.
+ *      e.g. if prog is a subprog, insn_start > 0
+ * linfo_idx:
+ *      The prog's idx to prog->aux->linfo and jited_linfo
+ *
+ * jited_linfo[linfo_idx] = prog->bpf_func
+ *
+ * For i > linfo_idx,
+ *
+ * jited_linfo[i] = prog->bpf_func +
+ *	insn_to_jit_off[linfo[i].insn_off - insn_start - 1]
+ */
+void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
+			       const u32 *insn_to_jit_off)
+{
+	u32 linfo_idx, insn_start, insn_end, nr_linfo, i;
+	const struct bpf_line_info *linfo;
+	void **jited_linfo;
+
+	if (!prog->aux->jited_linfo)
+		/* Userspace did not provide linfo */
+		return;
+
+	linfo_idx = prog->aux->linfo_idx;
+	linfo = &prog->aux->linfo[linfo_idx];
+	insn_start = linfo[0].insn_off;
+	insn_end = insn_start + prog->len;
+
+	jited_linfo = &prog->aux->jited_linfo[linfo_idx];
+	jited_linfo[0] = prog->bpf_func;
+
+	nr_linfo = prog->aux->nr_linfo - linfo_idx;
+
+	for (i = 1; i < nr_linfo && linfo[i].insn_off < insn_end; i++)
+		/* The verifier ensures that linfo[i].insn_off is
+		 * strictly increasing
+		 */
+		jited_linfo[i] = prog->bpf_func +
+			insn_to_jit_off[linfo[i].insn_off - insn_start - 1];
+}
+
+void bpf_prog_free_linfo(struct bpf_prog *prog)
+{
+	bpf_prog_free_jited_linfo(prog);
+	kvfree(prog->aux->linfo);
+}
+
 struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
 				  gfp_t gfp_extra_flags)
 {
@@ -294,6 +379,26 @@ static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta,
 	return ret;
 }
 
+static void bpf_adj_linfo(struct bpf_prog *prog, u32 off, u32 delta)
+{
+	struct bpf_line_info *linfo;
+	u32 i, nr_linfo;
+
+	nr_linfo = prog->aux->nr_linfo;
+	if (!nr_linfo || !delta)
+		return;
+
+	linfo = prog->aux->linfo;
+
+	for (i = 0; i < nr_linfo; i++)
+		if (off < linfo[i].insn_off)
+			break;
+
+	/* Push all off < linfo[i].insn_off by delta */
+	for (; i < nr_linfo; i++)
+		linfo[i].insn_off += delta;
+}
+
 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 				       const struct bpf_insn *patch, u32 len)
 {
@@ -349,6 +454,8 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 	 */
 	BUG_ON(bpf_adj_branches(prog_adj, off, insn_delta, false));
 
+	bpf_adj_linfo(prog_adj, off, insn_delta);
+
 	return prog_adj;
 }
 
@@ -1591,13 +1698,20 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 	 * be JITed, but falls back to the interpreter.
 	 */
 	if (!bpf_prog_is_dev_bound(fp->aux)) {
+		*err = bpf_prog_alloc_jited_linfo(fp);
+		if (*err)
+			return fp;
+
 		fp = bpf_int_jit_compile(fp);
-#ifdef CONFIG_BPF_JIT_ALWAYS_ON
 		if (!fp->jited) {
+			bpf_prog_free_jited_linfo(fp);
+#ifdef CONFIG_BPF_JIT_ALWAYS_ON
 			*err = -ENOTSUPP;
 			return fp;
-		}
 #endif
+		} else {
+			bpf_prog_free_unused_jited_linfo(fp);
+		}
 	} else {
 		*err = bpf_prog_offload_compile(fp);
 		if (*err)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index aa05aa38f4a8..19c88cff7880 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1215,6 +1215,7 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
 		bpf_prog_kallsyms_del_all(prog);
 		btf_put(prog->aux->btf);
 		kvfree(prog->aux->func_info);
+		bpf_prog_free_linfo(prog);
 
 		call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
 	}
@@ -1439,7 +1440,7 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
 }
 
 /* last field in 'union bpf_attr' used by this command */
-#define	BPF_PROG_LOAD_LAST_FIELD func_info_cnt
+#define	BPF_PROG_LOAD_LAST_FIELD line_info_cnt
 
 static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 {
@@ -1560,6 +1561,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 	return err;
 
 free_used_maps:
+	bpf_prog_free_linfo(prog);
 	kvfree(prog->aux->func_info);
 	btf_put(prog->aux->btf);
 	bpf_prog_kallsyms_del_subprogs(prog);
@@ -2041,6 +2043,37 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog)
 	return insns;
 }
 
+static int set_info_rec_size(struct bpf_prog_info *info)
+{
+	/*
+	 * Ensure info.*_rec_size is the same as kernel expected size
+	 *
+	 * or
+	 *
+	 * Only allow zero *_rec_size if both _rec_size and _cnt are
+	 * zero.  In this case, the kernel will set the expected
+	 * _rec_size back to the info.
+	 */
+
+	if ((info->func_info_cnt || info->func_info_rec_size) &&
+	    info->func_info_rec_size != sizeof(struct bpf_func_info))
+		return -EINVAL;
+
+	if ((info->line_info_cnt || info->line_info_rec_size) &&
+	    info->line_info_rec_size != sizeof(struct bpf_line_info))
+		return -EINVAL;
+
+	if ((info->jited_line_info_cnt || info->jited_line_info_rec_size) &&
+	    info->jited_line_info_rec_size != sizeof(__u64))
+		return -EINVAL;
+
+	info->func_info_rec_size = sizeof(struct bpf_func_info);
+	info->line_info_rec_size = sizeof(struct bpf_line_info);
+	info->jited_line_info_rec_size = sizeof(__u64);
+
+	return 0;
+}
+
 static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 				   const union bpf_attr *attr,
 				   union bpf_attr __user *uattr)
@@ -2083,11 +2116,9 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 				return -EFAULT;
 	}
 
-	if ((info.func_info_cnt || info.func_info_rec_size) &&
-	    info.func_info_rec_size != sizeof(struct bpf_func_info))
-		return -EINVAL;
-
-	info.func_info_rec_size = sizeof(struct bpf_func_info);
+	err = set_info_rec_size(&info);
+	if (err)
+		return err;
 
 	if (!capable(CAP_SYS_ADMIN)) {
 		info.jited_prog_len = 0;
@@ -2095,6 +2126,8 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 		info.nr_jited_ksyms = 0;
 		info.nr_jited_func_lens = 0;
 		info.func_info_cnt = 0;
+		info.line_info_cnt = 0;
+		info.jited_line_info_cnt = 0;
 		goto done;
 	}
 
@@ -2251,6 +2284,44 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 		}
 	}
 
+	ulen = info.line_info_cnt;
+	info.line_info_cnt = prog->aux->nr_linfo;
+	if (info.line_info_cnt && ulen) {
+		if (bpf_dump_raw_ok()) {
+			__u8 __user *user_linfo;
+
+			user_linfo = u64_to_user_ptr(info.line_info);
+			ulen = min_t(u32, info.line_info_cnt, ulen);
+			if (copy_to_user(user_linfo, prog->aux->linfo,
+					 info.line_info_rec_size * ulen))
+				return -EFAULT;
+		} else {
+			info.line_info = 0;
+		}
+	}
+
+	ulen = info.jited_line_info_cnt;
+	if (prog->aux->jited_linfo)
+		info.jited_line_info_cnt = prog->aux->nr_linfo;
+	else
+		info.jited_line_info_cnt = 0;
+	if (info.jited_line_info_cnt && ulen) {
+		if (bpf_dump_raw_ok()) {
+			__u64 __user *user_linfo;
+			u32 i;
+
+			user_linfo = u64_to_user_ptr(info.jited_line_info);
+			ulen = min_t(u32, info.jited_line_info_cnt, ulen);
+			for (i = 0; i < ulen; i++) {
+				if (put_user((__u64)(long)prog->aux->jited_linfo[i],
+					     &user_linfo[i]))
+					return -EFAULT;
+			}
+		} else {
+			info.jited_line_info = 0;
+		}
+	}
+
 done:
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2752d35ad073..9d25506bd55a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4640,15 +4640,17 @@ err_free:
 #define MIN_BPF_FUNCINFO_SIZE	8
 #define MAX_FUNCINFO_REC_SIZE	252
 
-static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env,
-			  union bpf_attr *attr, union bpf_attr __user *uattr)
+static int check_btf_func(struct bpf_verifier_env *env,
+			  const union bpf_attr *attr,
+			  union bpf_attr __user *uattr)
 {
 	u32 i, nfuncs, urec_size, min_size, prev_offset;
 	u32 krec_size = sizeof(struct bpf_func_info);
-	struct bpf_func_info *krecord = NULL;
+	struct bpf_func_info *krecord;
 	const struct btf_type *type;
+	struct bpf_prog *prog;
+	const struct btf *btf;
 	void __user *urecord;
-	struct btf *btf;
 	int ret = 0;
 
 	nfuncs = attr->func_info_cnt;
@@ -4668,20 +4670,15 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env,
 		return -EINVAL;
 	}
 
-	btf = btf_get_by_fd(attr->prog_btf_fd);
-	if (IS_ERR(btf)) {
-		verbose(env, "unable to get btf from fd\n");
-		return PTR_ERR(btf);
-	}
+	prog = env->prog;
+	btf = prog->aux->btf;
 
 	urecord = u64_to_user_ptr(attr->func_info);
 	min_size = min_t(u32, krec_size, urec_size);
 
 	krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN);
-	if (!krecord) {
-		ret = -ENOMEM;
-		goto free_btf;
-	}
+	if (!krecord)
+		return -ENOMEM;
 
 	for (i = 0; i < nfuncs; i++) {
 		ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size);
@@ -4694,12 +4691,12 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env,
 				if (put_user(min_size, &uattr->func_info_rec_size))
 					ret = -EFAULT;
 			}
-			goto free_btf;
+			goto err_free;
 		}
 
 		if (copy_from_user(&krecord[i], urecord, min_size)) {
 			ret = -EFAULT;
-			goto free_btf;
+			goto err_free;
 		}
 
 		/* check insn_off */
@@ -4709,20 +4706,20 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env,
 					"nonzero insn_off %u for the first func info record",
 					krecord[i].insn_off);
 				ret = -EINVAL;
-				goto free_btf;
+				goto err_free;
 			}
 		} else if (krecord[i].insn_off <= prev_offset) {
 			verbose(env,
 				"same or smaller insn offset (%u) than previous func info record (%u)",
 				krecord[i].insn_off, prev_offset);
 			ret = -EINVAL;
-			goto free_btf;
+			goto err_free;
 		}
 
 		if (env->subprog_info[i].start != krecord[i].insn_off) {
 			verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n");
 			ret = -EINVAL;
-			goto free_btf;
+			goto err_free;
 		}
 
 		/* check type_id */
@@ -4731,20 +4728,18 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env,
 			verbose(env, "invalid type id %d in func info",
 				krecord[i].type_id);
 			ret = -EINVAL;
-			goto free_btf;
+			goto err_free;
 		}
 
 		prev_offset = krecord[i].insn_off;
 		urecord += urec_size;
 	}
 
-	prog->aux->btf = btf;
 	prog->aux->func_info = krecord;
 	prog->aux->func_info_cnt = nfuncs;
 	return 0;
 
-free_btf:
-	btf_put(btf);
+err_free:
 	kvfree(krecord);
 	return ret;
 }
@@ -4760,6 +4755,150 @@ static void adjust_btf_func(struct bpf_verifier_env *env)
 		env->prog->aux->func_info[i].insn_off = env->subprog_info[i].start;
 }
 
+#define MIN_BPF_LINEINFO_SIZE	(offsetof(struct bpf_line_info, line_col) + \
+		sizeof(((struct bpf_line_info *)(0))->line_col))
+#define MAX_LINEINFO_REC_SIZE	MAX_FUNCINFO_REC_SIZE
+
+static int check_btf_line(struct bpf_verifier_env *env,
+			  const union bpf_attr *attr,
+			  union bpf_attr __user *uattr)
+{
+	u32 i, s, nr_linfo, ncopy, expected_size, rec_size, prev_offset = 0;
+	struct bpf_subprog_info *sub;
+	struct bpf_line_info *linfo;
+	struct bpf_prog *prog;
+	const struct btf *btf;
+	void __user *ulinfo;
+	int err;
+
+	nr_linfo = attr->line_info_cnt;
+	if (!nr_linfo)
+		return 0;
+
+	rec_size = attr->line_info_rec_size;
+	if (rec_size < MIN_BPF_LINEINFO_SIZE ||
+	    rec_size > MAX_LINEINFO_REC_SIZE ||
+	    rec_size & (sizeof(u32) - 1))
+		return -EINVAL;
+
+	/* Need to zero it in case the userspace may
+	 * pass in a smaller bpf_line_info object.
+	 */
+	linfo = kvcalloc(nr_linfo, sizeof(struct bpf_line_info),
+			 GFP_KERNEL | __GFP_NOWARN);
+	if (!linfo)
+		return -ENOMEM;
+
+	prog = env->prog;
+	btf = prog->aux->btf;
+
+	s = 0;
+	sub = env->subprog_info;
+	ulinfo = u64_to_user_ptr(attr->line_info);
+	expected_size = sizeof(struct bpf_line_info);
+	ncopy = min_t(u32, expected_size, rec_size);
+	for (i = 0; i < nr_linfo; i++) {
+		err = bpf_check_uarg_tail_zero(ulinfo, expected_size, rec_size);
+		if (err) {
+			if (err == -E2BIG) {
+				verbose(env, "nonzero tailing record in line_info");
+				if (put_user(expected_size,
+					     &uattr->line_info_rec_size))
+					err = -EFAULT;
+			}
+			goto err_free;
+		}
+
+		if (copy_from_user(&linfo[i], ulinfo, ncopy)) {
+			err = -EFAULT;
+			goto err_free;
+		}
+
+		/*
+		 * Check insn_off to ensure
+		 * 1) strictly increasing AND
+		 * 2) bounded by prog->len
+		 *
+		 * The linfo[0].insn_off == 0 check logically falls into
+		 * the later "missing bpf_line_info for func..." case
+		 * because the first linfo[0].insn_off must be the
+		 * first sub also and the first sub must have
+		 * subprog_info[0].start == 0.
+		 */
+		if ((i && linfo[i].insn_off <= prev_offset) ||
+		    linfo[i].insn_off >= prog->len) {
+			verbose(env, "Invalid line_info[%u].insn_off:%u (prev_offset:%u prog->len:%u)\n",
+				i, linfo[i].insn_off, prev_offset,
+				prog->len);
+			err = -EINVAL;
+			goto err_free;
+		}
+
+		if (!btf_name_offset_valid(btf, linfo[i].line_off) ||
+		    !btf_name_offset_valid(btf, linfo[i].file_name_off)) {
+			verbose(env, "Invalid line_info[%u].line_off or .file_name_off\n", i);
+			err = -EINVAL;
+			goto err_free;
+		}
+
+		if (s != env->subprog_cnt) {
+			if (linfo[i].insn_off == sub[s].start) {
+				sub[s].linfo_idx = i;
+				s++;
+			} else if (sub[s].start < linfo[i].insn_off) {
+				verbose(env, "missing bpf_line_info for func#%u\n", s);
+				err = -EINVAL;
+				goto err_free;
+			}
+		}
+
+		prev_offset = linfo[i].insn_off;
+		ulinfo += rec_size;
+	}
+
+	if (s != env->subprog_cnt) {
+		verbose(env, "missing bpf_line_info for %u funcs starting from func#%u\n",
+			env->subprog_cnt - s, s);
+		err = -EINVAL;
+		goto err_free;
+	}
+
+	prog->aux->linfo = linfo;
+	prog->aux->nr_linfo = nr_linfo;
+
+	return 0;
+
+err_free:
+	kvfree(linfo);
+	return err;
+}
+
+static int check_btf_info(struct bpf_verifier_env *env,
+			  const union bpf_attr *attr,
+			  union bpf_attr __user *uattr)
+{
+	struct btf *btf;
+	int err;
+
+	if (!attr->func_info_cnt && !attr->line_info_cnt)
+		return 0;
+
+	btf = btf_get_by_fd(attr->prog_btf_fd);
+	if (IS_ERR(btf))
+		return PTR_ERR(btf);
+	env->prog->aux->btf = btf;
+
+	err = check_btf_func(env, attr, uattr);
+	if (err)
+		return err;
+
+	err = check_btf_line(env, attr, uattr);
+	if (err)
+		return err;
+
+	return 0;
+}
+
 /* check %cur's range satisfies %old's */
 static bool range_within(struct bpf_reg_state *old,
 			 struct bpf_reg_state *cur)
@@ -6004,7 +6143,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	int i, j, subprog_start, subprog_end = 0, len, subprog;
 	struct bpf_insn *insn;
 	void *old_bpf_func;
-	int err = -ENOMEM;
+	int err;
 
 	if (env->subprog_cnt <= 1)
 		return 0;
@@ -6035,6 +6174,11 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		insn->imm = 1;
 	}
 
+	err = bpf_prog_alloc_jited_linfo(prog);
+	if (err)
+		goto out_undo_insn;
+
+	err = -ENOMEM;
 	func = kcalloc(env->subprog_cnt, sizeof(prog), GFP_KERNEL);
 	if (!func)
 		goto out_undo_insn;
@@ -6065,6 +6209,10 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		func[i]->aux->name[0] = 'F';
 		func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
 		func[i]->jit_requested = 1;
+		func[i]->aux->linfo = prog->aux->linfo;
+		func[i]->aux->nr_linfo = prog->aux->nr_linfo;
+		func[i]->aux->jited_linfo = prog->aux->jited_linfo;
+		func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx;
 		func[i] = bpf_int_jit_compile(func[i]);
 		if (!func[i]->jited) {
 			err = -ENOTSUPP;
@@ -6138,6 +6286,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	prog->bpf_func = func[0]->bpf_func;
 	prog->aux->func = func;
 	prog->aux->func_cnt = env->subprog_cnt;
+	bpf_prog_free_unused_jited_linfo(prog);
 	return 0;
 out_free:
 	for (i = 0; i < env->subprog_cnt; i++)
@@ -6154,6 +6303,7 @@ out_undo_insn:
 		insn->off = 0;
 		insn->imm = env->insn_aux_data[i].call_imm;
 	}
+	bpf_prog_free_jited_linfo(prog);
 	return err;
 }
 
@@ -6526,7 +6676,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
 	if (ret < 0)
 		goto skip_full_check;
 
-	ret = check_btf_func(env->prog, env, attr, uattr);
+	ret = check_btf_info(env, attr, uattr);
 	if (ret < 0)
 		goto skip_full_check;
 
-- 
cgit v1.2.3-59-g8ed1b


From 01d3240a04f4c09392e13c77b54d4423ebce2d72 Mon Sep 17 00:00:00 2001
From: Sean Young <sean@mess.org>
Date: Thu, 6 Dec 2018 13:01:03 +0000
Subject: media: bpf: add bpf function to report mouse movement

Some IR remotes have a directional pad or other pointer-like thing that
can be used as a mouse. Make it possible to decode these types of IR
protocols in BPF.

Cc: netdev@vger.kernel.org
Signed-off-by: Sean Young <sean@mess.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/media/rc/bpf-lirc.c                        | 24 ++++++++
 include/uapi/linux/bpf.h                           | 17 +++++-
 tools/include/uapi/linux/bpf.h                     | 18 +++++-
 tools/testing/selftests/bpf/bpf_helpers.h          |  2 +
 tools/testing/selftests/bpf/test_lirc_mode2.sh     |  3 +-
 tools/testing/selftests/bpf/test_lirc_mode2_kern.c |  3 +
 tools/testing/selftests/bpf/test_lirc_mode2_user.c | 65 +++++++++++++++-------
 7 files changed, 109 insertions(+), 23 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c
index 8b97fd1f0cea..390a722e6211 100644
--- a/drivers/media/rc/bpf-lirc.c
+++ b/drivers/media/rc/bpf-lirc.c
@@ -59,6 +59,28 @@ static const struct bpf_func_proto rc_keydown_proto = {
 	.arg4_type = ARG_ANYTHING,
 };
 
+BPF_CALL_3(bpf_rc_pointer_rel, u32*, sample, s32, rel_x, s32, rel_y)
+{
+	struct ir_raw_event_ctrl *ctrl;
+
+	ctrl = container_of(sample, struct ir_raw_event_ctrl, bpf_sample);
+
+	input_report_rel(ctrl->dev->input_dev, REL_X, rel_x);
+	input_report_rel(ctrl->dev->input_dev, REL_Y, rel_y);
+	input_sync(ctrl->dev->input_dev);
+
+	return 0;
+}
+
+static const struct bpf_func_proto rc_pointer_rel_proto = {
+	.func	   = bpf_rc_pointer_rel,
+	.gpl_only  = true,
+	.ret_type  = RET_INTEGER,
+	.arg1_type = ARG_PTR_TO_CTX,
+	.arg2_type = ARG_ANYTHING,
+	.arg3_type = ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -67,6 +89,8 @@ lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &rc_repeat_proto;
 	case BPF_FUNC_rc_keydown:
 		return &rc_keydown_proto;
+	case BPF_FUNC_rc_pointer_rel:
+		return &rc_pointer_rel_proto;
 	case BPF_FUNC_map_lookup_elem:
 		return &bpf_map_lookup_elem_proto;
 	case BPF_FUNC_map_update_elem:
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 7a66db8d15d5..1bee1135866a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2301,6 +2301,20 @@ union bpf_attr {
  *		payload and/or *pop* value being to large.
  *	Return
  *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_rc_pointer_rel(void *ctx, s32 rel_x, s32 rel_y)
+ *	Description
+ *		This helper is used in programs implementing IR decoding, to
+ *		report a successfully decoded pointer movement.
+ *
+ *		The *ctx* should point to the lirc sample as passed into
+ *		the program.
+ *
+ *		This helper is only available is the kernel was compiled with
+ *		the **CONFIG_BPF_LIRC_MODE2** configuration option set to
+ *		"**y**".
+ *	Return
+ *		0
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2394,7 +2408,8 @@ union bpf_attr {
 	FN(map_pop_elem),		\
 	FN(map_peek_elem),		\
 	FN(msg_push_data),		\
-	FN(msg_pop_data),
+	FN(msg_pop_data),		\
+	FN(rc_pointer_rel),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 7973c28b24a0..6ad50b6471d3 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2298,9 +2298,22 @@ union bpf_attr {
  *		if possible. Other errors can occur if input parameters are
  *		invalid either due to *start* byte not being valid part of msg
  *		payload and/or *pop* value being to large.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_rc_pointer_rel(void *ctx, s32 rel_x, s32 rel_y)
+ *	Description
+ *		This helper is used in programs implementing IR decoding, to
+ *		report a successfully decoded pointer movement.
+ *
+ *		The *ctx* should point to the lirc sample as passed into
+ *		the program.
  *
+ *		This helper is only available is the kernel was compiled with
+ *		the **CONFIG_BPF_LIRC_MODE2** configuration option set to
+ *		"**y**".
  *	Return
- *		0 on success, or a negative erro in case of failure.
+ *		0
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2394,7 +2407,8 @@ union bpf_attr {
 	FN(map_pop_elem),		\
 	FN(map_peek_elem),		\
 	FN(msg_push_data),		\
-	FN(msg_pop_data),
+	FN(msg_pop_data),		\
+	FN(rc_pointer_rel),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 7b69519a09b1..04c060e8f10a 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -170,6 +170,8 @@ static int (*bpf_skb_vlan_push)(void *ctx, __be16 vlan_proto, __u16 vlan_tci) =
 	(void *) BPF_FUNC_skb_vlan_push;
 static int (*bpf_skb_vlan_pop)(void *ctx) =
 	(void *) BPF_FUNC_skb_vlan_pop;
+static int (*bpf_rc_pointer_rel)(void *ctx, int rel_x, int rel_y) =
+	(void *) BPF_FUNC_rc_pointer_rel;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/tools/testing/selftests/bpf/test_lirc_mode2.sh b/tools/testing/selftests/bpf/test_lirc_mode2.sh
index 677686198df3..ec4e15948e40 100755
--- a/tools/testing/selftests/bpf/test_lirc_mode2.sh
+++ b/tools/testing/selftests/bpf/test_lirc_mode2.sh
@@ -21,13 +21,14 @@ do
 	if grep -q DRV_NAME=rc-loopback $i/uevent
 	then
 		LIRCDEV=$(grep DEVNAME= $i/lirc*/uevent | sed sQDEVNAME=Q/dev/Q)
+		INPUTDEV=$(grep DEVNAME= $i/input*/event*/uevent | sed sQDEVNAME=Q/dev/Q)
 	fi
 done
 
 if [ -n $LIRCDEV ];
 then
 	TYPE=lirc_mode2
-	./test_lirc_mode2_user $LIRCDEV
+	./test_lirc_mode2_user $LIRCDEV $INPUTDEV
 	ret=$?
 	if [ $ret -ne 0 ]; then
 		echo -e ${RED}"FAIL: $TYPE"${NC}
diff --git a/tools/testing/selftests/bpf/test_lirc_mode2_kern.c b/tools/testing/selftests/bpf/test_lirc_mode2_kern.c
index ba26855563a5..4147130cc3b7 100644
--- a/tools/testing/selftests/bpf/test_lirc_mode2_kern.c
+++ b/tools/testing/selftests/bpf/test_lirc_mode2_kern.c
@@ -15,6 +15,9 @@ int bpf_decoder(unsigned int *sample)
 
 		if (duration & 0x10000)
 			bpf_rc_keydown(sample, 0x40, duration & 0xffff, 0);
+		if (duration & 0x20000)
+			bpf_rc_pointer_rel(sample, (duration >> 8) & 0xff,
+					   duration & 0xff);
 	}
 
 	return 0;
diff --git a/tools/testing/selftests/bpf/test_lirc_mode2_user.c b/tools/testing/selftests/bpf/test_lirc_mode2_user.c
index d470d63c33db..fb5fd6841ef3 100644
--- a/tools/testing/selftests/bpf/test_lirc_mode2_user.c
+++ b/tools/testing/selftests/bpf/test_lirc_mode2_user.c
@@ -29,6 +29,7 @@
 
 #include <linux/bpf.h>
 #include <linux/lirc.h>
+#include <linux/input.h>
 #include <errno.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -47,12 +48,13 @@
 int main(int argc, char **argv)
 {
 	struct bpf_object *obj;
-	int ret, lircfd, progfd, mode;
-	int testir = 0x1dead;
+	int ret, lircfd, progfd, inputfd;
+	int testir1 = 0x1dead;
+	int testir2 = 0x20101;
 	u32 prog_ids[10], prog_flags[10], prog_cnt;
 
-	if (argc != 2) {
-		printf("Usage: %s /dev/lircN\n", argv[0]);
+	if (argc != 3) {
+		printf("Usage: %s /dev/lircN /dev/input/eventM\n", argv[0]);
 		return 2;
 	}
 
@@ -76,9 +78,9 @@ int main(int argc, char **argv)
 		return 1;
 	}
 
-	mode = LIRC_MODE_SCANCODE;
-	if (ioctl(lircfd, LIRC_SET_REC_MODE, &mode)) {
-		printf("failed to set rec mode: %m\n");
+	inputfd = open(argv[2], O_RDONLY | O_NONBLOCK);
+	if (inputfd == -1) {
+		printf("failed to open input device %s: %m\n", argv[1]);
 		return 1;
 	}
 
@@ -102,29 +104,54 @@ int main(int argc, char **argv)
 	}
 
 	/* Write raw IR */
-	ret = write(lircfd, &testir, sizeof(testir));
-	if (ret != sizeof(testir)) {
+	ret = write(lircfd, &testir1, sizeof(testir1));
+	if (ret != sizeof(testir1)) {
 		printf("Failed to send test IR message: %m\n");
 		return 1;
 	}
 
-	struct pollfd pfd = { .fd = lircfd, .events = POLLIN };
-	struct lirc_scancode lsc;
+	struct pollfd pfd = { .fd = inputfd, .events = POLLIN };
+	struct input_event event;
 
-	poll(&pfd, 1, 100);
+	for (;;) {
+		poll(&pfd, 1, 100);
 
-	/* Read decoded IR */
-	ret = read(lircfd, &lsc, sizeof(lsc));
-	if (ret != sizeof(lsc)) {
-		printf("Failed to read decoded IR: %m\n");
-		return 1;
+		/* Read decoded IR */
+		ret = read(inputfd, &event, sizeof(event));
+		if (ret != sizeof(event)) {
+			printf("Failed to read decoded IR: %m\n");
+			return 1;
+		}
+
+		if (event.type == EV_MSC && event.code == MSC_SCAN &&
+		    event.value == 0xdead) {
+			break;
+		}
 	}
 
-	if (lsc.scancode != 0xdead || lsc.rc_proto != 64) {
-		printf("Incorrect scancode decoded\n");
+	/* Write raw IR */
+	ret = write(lircfd, &testir2, sizeof(testir2));
+	if (ret != sizeof(testir2)) {
+		printf("Failed to send test IR message: %m\n");
 		return 1;
 	}
 
+	for (;;) {
+		poll(&pfd, 1, 100);
+
+		/* Read decoded IR */
+		ret = read(inputfd, &event, sizeof(event));
+		if (ret != sizeof(event)) {
+			printf("Failed to read decoded IR: %m\n");
+			return 1;
+		}
+
+		if (event.type == EV_REL && event.code == REL_Y &&
+		    event.value == 1 ) {
+			break;
+		}
+	}
+
 	prog_cnt = 10;
 	ret = bpf_prog_query(lircfd, BPF_LIRC_MODE2, 0, prog_flags, prog_ids,
 			     &prog_cnt);
-- 
cgit v1.2.3-59-g8ed1b


From 11d8b82d2222cade12caad2c125f23023777dcbc Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Mon, 10 Dec 2018 14:14:08 -0800
Subject: bpf: rename *_info_cnt to nr_*_info in bpf_prog_info

In uapi bpf.h, currently we have the following fields in
the struct bpf_prog_info:
	__u32 func_info_cnt;
	__u32 line_info_cnt;
	__u32 jited_line_info_cnt;
The above field names "func_info_cnt" and "line_info_cnt"
also appear in union bpf_attr for program loading.

The original intention is to keep the names the same
between bpf_prog_info and bpf_attr
so it will imply what we returned to user space will be
the same as what the user space passed to the kernel.

Such a naming convention in bpf_prog_info is not consistent
with other fields like:
        __u32 nr_jited_ksyms;
        __u32 nr_jited_func_lens;

This patch made this adjustment so in bpf_prog_info
newly introduced *_info_cnt becomes nr_*_info.

Acked-by: Song Liu <songliubraving@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h |  6 +++---
 kernel/bpf/syscall.c     | 38 +++++++++++++++++++-------------------
 2 files changed, 22 insertions(+), 22 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1bee1135866a..f943ed803309 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2696,11 +2696,11 @@ struct bpf_prog_info {
 	__u32 btf_id;
 	__u32 func_info_rec_size;
 	__aligned_u64 func_info;
-	__u32 func_info_cnt;
-	__u32 line_info_cnt;
+	__u32 nr_func_info;
+	__u32 nr_line_info;
 	__aligned_u64 line_info;
 	__aligned_u64 jited_line_info;
-	__u32 jited_line_info_cnt;
+	__u32 nr_jited_line_info;
 	__u32 line_info_rec_size;
 	__u32 jited_line_info_rec_size;
 } __attribute__((aligned(8)));
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a99a23bf5910..5745c7837621 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2055,15 +2055,15 @@ static int set_info_rec_size(struct bpf_prog_info *info)
 	 * _rec_size back to the info.
 	 */
 
-	if ((info->func_info_cnt || info->func_info_rec_size) &&
+	if ((info->nr_func_info || info->func_info_rec_size) &&
 	    info->func_info_rec_size != sizeof(struct bpf_func_info))
 		return -EINVAL;
 
-	if ((info->line_info_cnt || info->line_info_rec_size) &&
+	if ((info->nr_line_info || info->line_info_rec_size) &&
 	    info->line_info_rec_size != sizeof(struct bpf_line_info))
 		return -EINVAL;
 
-	if ((info->jited_line_info_cnt || info->jited_line_info_rec_size) &&
+	if ((info->nr_jited_line_info || info->jited_line_info_rec_size) &&
 	    info->jited_line_info_rec_size != sizeof(__u64))
 		return -EINVAL;
 
@@ -2125,9 +2125,9 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 		info.xlated_prog_len = 0;
 		info.nr_jited_ksyms = 0;
 		info.nr_jited_func_lens = 0;
-		info.func_info_cnt = 0;
-		info.line_info_cnt = 0;
-		info.jited_line_info_cnt = 0;
+		info.nr_func_info = 0;
+		info.nr_line_info = 0;
+		info.nr_jited_line_info = 0;
 		goto done;
 	}
 
@@ -2268,14 +2268,14 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	if (prog->aux->btf)
 		info.btf_id = btf_id(prog->aux->btf);
 
-	ulen = info.func_info_cnt;
-	info.func_info_cnt = prog->aux->func_info_cnt;
-	if (info.func_info_cnt && ulen) {
+	ulen = info.nr_func_info;
+	info.nr_func_info = prog->aux->func_info_cnt;
+	if (info.nr_func_info && ulen) {
 		if (bpf_dump_raw_ok()) {
 			char __user *user_finfo;
 
 			user_finfo = u64_to_user_ptr(info.func_info);
-			ulen = min_t(u32, info.func_info_cnt, ulen);
+			ulen = min_t(u32, info.nr_func_info, ulen);
 			if (copy_to_user(user_finfo, prog->aux->func_info,
 					 info.func_info_rec_size * ulen))
 				return -EFAULT;
@@ -2284,14 +2284,14 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 		}
 	}
 
-	ulen = info.line_info_cnt;
-	info.line_info_cnt = prog->aux->nr_linfo;
-	if (info.line_info_cnt && ulen) {
+	ulen = info.nr_line_info;
+	info.nr_line_info = prog->aux->nr_linfo;
+	if (info.nr_line_info && ulen) {
 		if (bpf_dump_raw_ok()) {
 			__u8 __user *user_linfo;
 
 			user_linfo = u64_to_user_ptr(info.line_info);
-			ulen = min_t(u32, info.line_info_cnt, ulen);
+			ulen = min_t(u32, info.nr_line_info, ulen);
 			if (copy_to_user(user_linfo, prog->aux->linfo,
 					 info.line_info_rec_size * ulen))
 				return -EFAULT;
@@ -2300,18 +2300,18 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 		}
 	}
 
-	ulen = info.jited_line_info_cnt;
+	ulen = info.nr_jited_line_info;
 	if (prog->aux->jited_linfo)
-		info.jited_line_info_cnt = prog->aux->nr_linfo;
+		info.nr_jited_line_info = prog->aux->nr_linfo;
 	else
-		info.jited_line_info_cnt = 0;
-	if (info.jited_line_info_cnt && ulen) {
+		info.nr_jited_line_info = 0;
+	if (info.nr_jited_line_info && ulen) {
 		if (bpf_dump_raw_ok()) {
 			__u64 __user *user_linfo;
 			u32 i;
 
 			user_linfo = u64_to_user_ptr(info.jited_line_info);
-			ulen = min_t(u32, info.jited_line_info_cnt, ulen);
+			ulen = min_t(u32, info.nr_jited_line_info, ulen);
 			for (i = 0; i < ulen; i++) {
 				if (put_user((__u64)(long)prog->aux->jited_linfo[i],
 					     &user_linfo[i]))
-- 
cgit v1.2.3-59-g8ed1b


From 7a93d5c38e5ee68376ca88d9e3f9841451b9efb1 Mon Sep 17 00:00:00 2001
From: Rob Clark <robdclark@gmail.com>
Date: Tue, 23 Oct 2018 14:42:37 -0400
Subject: drm/msm/gpu: add submit flag to hint which buffers should be dumped

To lower CPU  overhead, future userspace will be switching to pinning
iova and avoiding the use of relocs, and only include cmds table entries
for IB1 level cmdstream (but not IB2 or state-groups).

This leaves the kernel unsure what to dump for rd/hangrd cmdstream
dumping.  So add a MSM_SUBMIT_BO_DUMP flag so userspace can indicate
buffers that contain cmdstream (or are otherwise important to dump).

Signed-off-by: Rob Clark <robdclark@gmail.com>
---
 drivers/gpu/drm/msm/msm_gem_submit.c |  5 ++++-
 drivers/gpu/drm/msm/msm_rd.c         | 13 ++++++++++---
 include/uapi/drm/msm_drm.h           |  5 ++++-
 3 files changed, 18 insertions(+), 5 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c
index a43e91e70bd9..3cbed4acb0f4 100644
--- a/drivers/gpu/drm/msm/msm_gem_submit.c
+++ b/drivers/gpu/drm/msm/msm_gem_submit.c
@@ -114,8 +114,11 @@ static int submit_lookup_objects(struct msm_gem_submit *submit,
 			pagefault_disable();
 		}
 
+/* at least one of READ and/or WRITE flags should be set: */
+#define MANDATORY_FLAGS (MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE)
+
 		if ((submit_bo.flags & ~MSM_SUBMIT_BO_FLAGS) ||
-			!(submit_bo.flags & MSM_SUBMIT_BO_FLAGS)) {
+			!(submit_bo.flags & MANDATORY_FLAGS)) {
 			DRM_ERROR("invalid flags: %x\n", submit_bo.flags);
 			ret = -EINVAL;
 			goto out_unlock;
diff --git a/drivers/gpu/drm/msm/msm_rd.c b/drivers/gpu/drm/msm/msm_rd.c
index cca933458439..b5672061ae08 100644
--- a/drivers/gpu/drm/msm/msm_rd.c
+++ b/drivers/gpu/drm/msm/msm_rd.c
@@ -345,6 +345,12 @@ static void snapshot_buf(struct msm_rd_state *rd,
 	msm_gem_put_vaddr(&obj->base);
 }
 
+static bool
+should_dump(struct msm_gem_submit *submit, int idx)
+{
+	return rd_full || (submit->bos[idx].flags & MSM_SUBMIT_BO_DUMP);
+}
+
 /* called under struct_mutex */
 void msm_rd_dump_submit(struct msm_rd_state *rd, struct msm_gem_submit *submit,
 		const char *fmt, ...)
@@ -386,15 +392,16 @@ void msm_rd_dump_submit(struct msm_rd_state *rd, struct msm_gem_submit *submit,
 
 	rd_write_section(rd, RD_CMD, msg, ALIGN(n, 4));
 
-	for (i = 0; rd_full && i < submit->nr_bos; i++)
-		snapshot_buf(rd, submit, i, 0, 0);
+	for (i = 0; i < submit->nr_bos; i++)
+		if (should_dump(submit, i))
+			snapshot_buf(rd, submit, i, 0, 0);
 
 	for (i = 0; i < submit->nr_cmds; i++) {
 		uint64_t iova = submit->cmd[i].iova;
 		uint32_t szd  = submit->cmd[i].size; /* in dwords */
 
 		/* snapshot cmdstream bo's (if we haven't already): */
-		if (!rd_full) {
+		if (!should_dump(submit, i)) {
 			snapshot_buf(rd, submit, submit->cmd[i].idx,
 					submit->cmd[i].iova, szd * 4);
 		}
diff --git a/include/uapi/drm/msm_drm.h b/include/uapi/drm/msm_drm.h
index c06d0a5bdd80..3c3af92c4b3e 100644
--- a/include/uapi/drm/msm_drm.h
+++ b/include/uapi/drm/msm_drm.h
@@ -188,8 +188,11 @@ struct drm_msm_gem_submit_cmd {
  */
 #define MSM_SUBMIT_BO_READ             0x0001
 #define MSM_SUBMIT_BO_WRITE            0x0002
+#define MSM_SUBMIT_BO_DUMP             0x0004
 
-#define MSM_SUBMIT_BO_FLAGS            (MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE)
+#define MSM_SUBMIT_BO_FLAGS            (MSM_SUBMIT_BO_READ | \
+					MSM_SUBMIT_BO_WRITE | \
+					MSM_SUBMIT_BO_DUMP)
 
 struct drm_msm_gem_submit_bo {
 	__u32 flags;          /* in, mask of MSM_SUBMIT_BO_x */
-- 
cgit v1.2.3-59-g8ed1b


From 789d2e5a772ce312a7a2b81ffaf304946195beb5 Mon Sep 17 00:00:00 2001
From: Rob Clark <robdclark@gmail.com>
Date: Thu, 29 Nov 2018 09:54:42 -0500
Subject: drm/msm: rework GEM_INFO ioctl

Prep work to add a way to get/set the GEM objects debug name.

Signed-off-by: Rob Clark <robdclark@gmail.com>
---
 drivers/gpu/drm/msm/msm_drv.c | 28 +++++++++++++++++++---------
 include/uapi/drm/msm_drm.h    | 18 +++++++++++++-----
 2 files changed, 32 insertions(+), 14 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c
index a73187274b62..fe86c675d5b7 100644
--- a/drivers/gpu/drm/msm/msm_drv.c
+++ b/drivers/gpu/drm/msm/msm_drv.c
@@ -882,21 +882,31 @@ static int msm_ioctl_gem_info(struct drm_device *dev, void *data,
 	struct drm_gem_object *obj;
 	int ret = 0;
 
-	if (args->flags & ~MSM_INFO_FLAGS)
+	if (args->pad)
 		return -EINVAL;
 
+	switch (args->info) {
+	case MSM_INFO_GET_OFFSET:
+	case MSM_INFO_GET_IOVA:
+		/* value returned as immediate, not pointer, so len==0: */
+		if (args->len)
+			return -EINVAL;
+		break;
+	default:
+		return -EINVAL;
+	}
+
 	obj = drm_gem_object_lookup(file, args->handle);
 	if (!obj)
 		return -ENOENT;
 
-	if (args->flags & MSM_INFO_IOVA) {
-		uint64_t iova;
-
-		ret = msm_ioctl_gem_info_iova(dev, obj, &iova);
-		if (!ret)
-			args->offset = iova;
-	} else {
-		args->offset = msm_gem_mmap_offset(obj);
+	switch (args->info) {
+	case MSM_INFO_GET_OFFSET:
+		args->value = msm_gem_mmap_offset(obj);
+		break;
+	case MSM_INFO_GET_IOVA:
+		ret = msm_ioctl_gem_info_iova(dev, obj, &args->value);
+		break;
 	}
 
 	drm_gem_object_put_unlocked(obj);
diff --git a/include/uapi/drm/msm_drm.h b/include/uapi/drm/msm_drm.h
index 3c3af92c4b3e..7b372c1bcc7d 100644
--- a/include/uapi/drm/msm_drm.h
+++ b/include/uapi/drm/msm_drm.h
@@ -105,14 +105,22 @@ struct drm_msm_gem_new {
 	__u32 handle;         /* out */
 };
 
-#define MSM_INFO_IOVA	0x01
-
-#define MSM_INFO_FLAGS (MSM_INFO_IOVA)
+/* Get or set GEM buffer info.  The requested value can be passed
+ * directly in 'value', or for data larger than 64b 'value' is a
+ * pointer to userspace buffer, with 'len' specifying the number of
+ * bytes copied into that buffer.  For info returned by pointer,
+ * calling the GEM_INFO ioctl with null 'value' will return the
+ * required buffer size in 'len'
+ */
+#define MSM_INFO_GET_OFFSET	0x00   /* get mmap() offset, returned by value */
+#define MSM_INFO_GET_IOVA	0x01   /* get iova, returned by value */
 
 struct drm_msm_gem_info {
 	__u32 handle;         /* in */
-	__u32 flags;	      /* in - combination of MSM_INFO_* flags */
-	__u64 offset;         /* out, mmap() offset or iova */
+	__u32 info;           /* in - one of MSM_INFO_* */
+	__u64 value;          /* in or out */
+	__u32 len;            /* in or out */
+	__u32 pad;
 };
 
 #define MSM_PREP_READ        0x01
-- 
cgit v1.2.3-59-g8ed1b


From f05c83e7746088e6ada6ce83fd7840b4b7b52ffe Mon Sep 17 00:00:00 2001
From: Rob Clark <robdclark@gmail.com>
Date: Thu, 29 Nov 2018 10:27:22 -0500
Subject: drm/msm: add uapi to get/set debug name

Add UAPI to get/set GEM objects' debug name.

Signed-off-by: Rob Clark <robdclark@gmail.com>
---
 drivers/gpu/drm/msm/msm_drv.c | 36 +++++++++++++++++++++++++++++++++++-
 include/uapi/drm/msm_drm.h    |  2 ++
 2 files changed, 37 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c
index fe86c675d5b7..c08f83c7ca57 100644
--- a/drivers/gpu/drm/msm/msm_drv.c
+++ b/drivers/gpu/drm/msm/msm_drv.c
@@ -23,6 +23,7 @@
 #include "msm_drv.h"
 #include "msm_debugfs.h"
 #include "msm_fence.h"
+#include "msm_gem.h"
 #include "msm_gpu.h"
 #include "msm_kms.h"
 
@@ -880,7 +881,8 @@ static int msm_ioctl_gem_info(struct drm_device *dev, void *data,
 {
 	struct drm_msm_gem_info *args = data;
 	struct drm_gem_object *obj;
-	int ret = 0;
+	struct msm_gem_object *msm_obj;
+	int i, ret = 0;
 
 	if (args->pad)
 		return -EINVAL;
@@ -892,6 +894,9 @@ static int msm_ioctl_gem_info(struct drm_device *dev, void *data,
 		if (args->len)
 			return -EINVAL;
 		break;
+	case MSM_INFO_SET_NAME:
+	case MSM_INFO_GET_NAME:
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -900,6 +905,8 @@ static int msm_ioctl_gem_info(struct drm_device *dev, void *data,
 	if (!obj)
 		return -ENOENT;
 
+	msm_obj = to_msm_bo(obj);
+
 	switch (args->info) {
 	case MSM_INFO_GET_OFFSET:
 		args->value = msm_gem_mmap_offset(obj);
@@ -907,6 +914,33 @@ static int msm_ioctl_gem_info(struct drm_device *dev, void *data,
 	case MSM_INFO_GET_IOVA:
 		ret = msm_ioctl_gem_info_iova(dev, obj, &args->value);
 		break;
+	case MSM_INFO_SET_NAME:
+		/* length check should leave room for terminating null: */
+		if (args->len >= sizeof(msm_obj->name)) {
+			ret = -EINVAL;
+			break;
+		}
+		ret = copy_from_user(msm_obj->name,
+			u64_to_user_ptr(args->value), args->len);
+		msm_obj->name[args->len] = '\0';
+		for (i = 0; i < args->len; i++) {
+			if (!isprint(msm_obj->name[i])) {
+				msm_obj->name[i] = '\0';
+				break;
+			}
+		}
+		break;
+	case MSM_INFO_GET_NAME:
+		if (args->value && (args->len < strlen(msm_obj->name))) {
+			ret = -EINVAL;
+			break;
+		}
+		args->len = strlen(msm_obj->name);
+		if (args->value) {
+			ret = copy_to_user(u64_to_user_ptr(args->value),
+					msm_obj->name, args->len);
+		}
+		break;
 	}
 
 	drm_gem_object_put_unlocked(obj);
diff --git a/include/uapi/drm/msm_drm.h b/include/uapi/drm/msm_drm.h
index 7b372c1bcc7d..91a16b333c69 100644
--- a/include/uapi/drm/msm_drm.h
+++ b/include/uapi/drm/msm_drm.h
@@ -114,6 +114,8 @@ struct drm_msm_gem_new {
  */
 #define MSM_INFO_GET_OFFSET	0x00   /* get mmap() offset, returned by value */
 #define MSM_INFO_GET_IOVA	0x01   /* get iova, returned by value */
+#define MSM_INFO_SET_NAME	0x02   /* set the debug name (by pointer) */
+#define MSM_INFO_GET_NAME	0x03   /* get debug name, returned by pointer */
 
 struct drm_msm_gem_info {
 	__u32 handle;         /* in */
-- 
cgit v1.2.3-59-g8ed1b


From 0bd72117fba2dd51a65eaa7b480adc0eea9a4409 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 11 Dec 2018 10:26:33 +0100
Subject: bpf: fix up uapi helper description and sync bpf header with tools

Minor markup fixup from bpf-next into net-next merge in the BPF helper
description of bpf_sk_lookup_tcp() and bpf_sk_lookup_udp(). Also sync
up the copy of bpf.h from tooling infrastructure.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h       | 12 +++---
 tools/include/uapi/linux/bpf.h | 87 +++++++++++++++++++++---------------------
 2 files changed, 50 insertions(+), 49 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 92e962ba0c47..aa582cd5bfcf 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2218,9 +2218,9 @@ union bpf_attr {
  *		This helper is available only if the kernel was compiled with
  *		**CONFIG_NET** configuration option.
  *	Return
- *		Pointer to *struct bpf_sock*, or NULL in case of failure.
- *		For sockets with reuseport option, the *struct bpf_sock*
- *		result is from reuse->socks[] using the hash of the tuple.
+ *		Pointer to **struct bpf_sock**, or **NULL** in case of failure.
+ *		For sockets with reuseport option, the **struct bpf_sock**
+ *		result is from **reuse->socks**\ [] using the hash of the tuple.
  *
  * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
  *	Description
@@ -2254,9 +2254,9 @@ union bpf_attr {
  *		This helper is available only if the kernel was compiled with
  *		**CONFIG_NET** configuration option.
  *	Return
- *		Pointer to *struct bpf_sock*, or NULL in case of failure.
- *		For sockets with reuseport option, the *struct bpf_sock*
- *		result is from reuse->socks[] using the hash of the tuple.
+ *		Pointer to **struct bpf_sock**, or **NULL** in case of failure.
+ *		For sockets with reuseport option, the **struct bpf_sock**
+ *		result is from **reuse->socks**\ [] using the hash of the tuple.
  *
  * int bpf_sk_release(struct bpf_sock *sock)
  *	Description
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 94c002584068..aa582cd5bfcf 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -502,18 +502,6 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_map_pop_elem(struct bpf_map *map, void *value)
- * 	Description
- * 		Pop an element from *map*.
- * Return
- * 		0 on success, or a negative error in case of failure.
- *
- * int bpf_map_peek_elem(struct bpf_map *map, void *value)
- * 	Description
- * 		Get an element from *map* without removing it.
- * Return
- * 		0 on success, or a negative error in case of failure.
- *
  * int bpf_probe_read(void *dst, u32 size, const void *src)
  * 	Description
  * 		For tracing programs, safely attempt to read *size* bytes from
@@ -1937,9 +1925,9 @@ union bpf_attr {
  *		is set to metric from route (IPv4/IPv6 only), and ifindex
  *		is set to the device index of the nexthop from the FIB lookup.
  *
- *             *plen* argument is the size of the passed in struct.
- *             *flags* argument can be a combination of one or more of the
- *             following values:
+ *		*plen* argument is the size of the passed in struct.
+ *		*flags* argument can be a combination of one or more of the
+ *		following values:
  *
  *		**BPF_FIB_LOOKUP_DIRECT**
  *			Do a direct table lookup vs full lookup using FIB
@@ -1948,9 +1936,9 @@ union bpf_attr {
  *			Perform lookup from an egress perspective (default is
  *			ingress).
  *
- *             *ctx* is either **struct xdp_md** for XDP programs or
- *             **struct sk_buff** tc cls_act programs.
- *     Return
+ *		*ctx* is either **struct xdp_md** for XDP programs or
+ *		**struct sk_buff** tc cls_act programs.
+ *	Return
  *		* < 0 if any input argument is invalid
  *		*   0 on success (packet is forwarded, nexthop neighbor exists)
  *		* > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the
@@ -2095,8 +2083,8 @@ union bpf_attr {
  *		translated to a keycode using the rc keymap, and reported as
  *		an input key down event. After a period a key up event is
  *		generated. This period can be extended by calling either
- *		**bpf_rc_keydown** () again with the same values, or calling
- *		**bpf_rc_repeat** ().
+ *		**bpf_rc_keydown**\ () again with the same values, or calling
+ *		**bpf_rc_repeat**\ ().
  *
  *		Some protocols include a toggle bit, in case the button	was
  *		released and pressed again between consecutive scancodes.
@@ -2179,21 +2167,22 @@ union bpf_attr {
  *		The *flags* meaning is specific for each map type,
  *		and has to be 0 for cgroup local storage.
  *
- *		Depending on the bpf program type, a local storage area
- *		can be shared between multiple instances of the bpf program,
+ *		Depending on the BPF program type, a local storage area
+ *		can be shared between multiple instances of the BPF program,
  *		running simultaneously.
  *
  *		A user should care about the synchronization by himself.
- *		For example, by using the BPF_STX_XADD instruction to alter
+ *		For example, by using the **BPF_STX_XADD** instruction to alter
  *		the shared data.
  *	Return
- *		Pointer to the local storage area.
+ *		A pointer to the local storage area.
  *
  * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags)
  *	Description
- *		Select a SO_REUSEPORT sk from a	BPF_MAP_TYPE_REUSEPORT_ARRAY map
- *		It checks the selected sk is matching the incoming
- *		request in the skb.
+ *		Select a **SO_REUSEPORT** socket from a
+ *		**BPF_MAP_TYPE_REUSEPORT_ARRAY** *map*.
+ *		It checks the selected socket is matching the incoming
+ *		request in the socket buffer.
  *	Return
  *		0 on success, or a negative error in case of failure.
  *
@@ -2201,7 +2190,7 @@ union bpf_attr {
  *	Description
  *		Look for TCP socket matching *tuple*, optionally in a child
  *		network namespace *netns*. The return value must be checked,
- *		and if non-NULL, released via **bpf_sk_release**\ ().
+ *		and if non-**NULL**, released via **bpf_sk_release**\ ().
  *
  *		The *ctx* should point to the context of the program, such as
  *		the skb or socket (depending on the hook in use). This is used
@@ -2229,15 +2218,15 @@ union bpf_attr {
  *		This helper is available only if the kernel was compiled with
  *		**CONFIG_NET** configuration option.
  *	Return
- *		Pointer to *struct bpf_sock*, or NULL in case of failure.
- *		For sockets with reuseport option, the *struct bpf_sock*
- *		result is from reuse->socks[] using the hash of the tuple.
+ *		Pointer to **struct bpf_sock**, or **NULL** in case of failure.
+ *		For sockets with reuseport option, the **struct bpf_sock**
+ *		result is from **reuse->socks**\ [] using the hash of the tuple.
  *
  * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
  *	Description
  *		Look for UDP socket matching *tuple*, optionally in a child
  *		network namespace *netns*. The return value must be checked,
- *		and if non-NULL, released via **bpf_sk_release**\ ().
+ *		and if non-**NULL**, released via **bpf_sk_release**\ ().
  *
  *		The *ctx* should point to the context of the program, such as
  *		the skb or socket (depending on the hook in use). This is used
@@ -2265,42 +2254,54 @@ union bpf_attr {
  *		This helper is available only if the kernel was compiled with
  *		**CONFIG_NET** configuration option.
  *	Return
- *		Pointer to *struct bpf_sock*, or NULL in case of failure.
- *		For sockets with reuseport option, the *struct bpf_sock*
- *		result is from reuse->socks[] using the hash of the tuple.
+ *		Pointer to **struct bpf_sock**, or **NULL** in case of failure.
+ *		For sockets with reuseport option, the **struct bpf_sock**
+ *		result is from **reuse->socks**\ [] using the hash of the tuple.
  *
- * int bpf_sk_release(struct bpf_sock *sk)
+ * int bpf_sk_release(struct bpf_sock *sock)
  *	Description
- *		Release the reference held by *sock*. *sock* must be a non-NULL
- *		pointer that was returned from bpf_sk_lookup_xxx\ ().
+ *		Release the reference held by *sock*. *sock* must be a
+ *		non-**NULL** pointer that was returned from
+ *		**bpf_sk_lookup_xxx**\ ().
  *	Return
  *		0 on success, or a negative error in case of failure.
  *
+ * int bpf_map_pop_elem(struct bpf_map *map, void *value)
+ * 	Description
+ * 		Pop an element from *map*.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_map_peek_elem(struct bpf_map *map, void *value)
+ * 	Description
+ * 		Get an element from *map* without removing it.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
  * int bpf_msg_push_data(struct sk_buff *skb, u32 start, u32 len, u64 flags)
  *	Description
- *		For socket policies, insert *len* bytes into msg at offset
+ *		For socket policies, insert *len* bytes into *msg* at offset
  *		*start*.
  *
  *		If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a
- *		*msg* it may want to insert metadata or options into the msg.
+ *		*msg* it may want to insert metadata or options into the *msg*.
  *		This can later be read and used by any of the lower layer BPF
  *		hooks.
  *
  *		This helper may fail if under memory pressure (a malloc
  *		fails) in these cases BPF programs will get an appropriate
  *		error and BPF programs will need to handle them.
- *
  *	Return
  *		0 on success, or a negative error in case of failure.
  *
  * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 pop, u64 flags)
- *	 Description
+ *	Description
  *		Will remove *pop* bytes from a *msg* starting at byte *start*.
  *		This may result in **ENOMEM** errors under certain situations if
  *		an allocation and copy are required due to a full ring buffer.
  *		However, the helper will try to avoid doing the allocation
  *		if possible. Other errors can occur if input parameters are
- *		invalid either due to *start* byte not being valid part of msg
+ *		invalid either due to *start* byte not being valid part of *msg*
  *		payload and/or *pop* value being to large.
  *	Return
  *		0 on success, or a negative error in case of failure.
-- 
cgit v1.2.3-59-g8ed1b


From 6a21cc50f0c7f87dae5259f6cfefe024412313f6 Mon Sep 17 00:00:00 2001
From: Tycho Andersen <tycho@tycho.ws>
Date: Sun, 9 Dec 2018 11:24:13 -0700
Subject: seccomp: add a return code to trap to userspace

This patch introduces a means for syscalls matched in seccomp to notify
some other task that a particular filter has been triggered.

The motivation for this is primarily for use with containers. For example,
if a container does an init_module(), we obviously don't want to load this
untrusted code, which may be compiled for the wrong version of the kernel
anyway. Instead, we could parse the module image, figure out which module
the container is trying to load and load it on the host.

As another example, containers cannot mount() in general since various
filesystems assume a trusted image. However, if an orchestrator knows that
e.g. a particular block device has not been exposed to a container for
writing, it want to allow the container to mount that block device (that
is, handle the mount for it).

This patch adds functionality that is already possible via at least two
other means that I know about, both of which involve ptrace(): first, one
could ptrace attach, and then iterate through syscalls via PTRACE_SYSCALL.
Unfortunately this is slow, so a faster version would be to install a
filter that does SECCOMP_RET_TRACE, which triggers a PTRACE_EVENT_SECCOMP.
Since ptrace allows only one tracer, if the container runtime is that
tracer, users inside the container (or outside) trying to debug it will not
be able to use ptrace, which is annoying. It also means that older
distributions based on Upstart cannot boot inside containers using ptrace,
since upstart itself uses ptrace to monitor services while starting.

The actual implementation of this is fairly small, although getting the
synchronization right was/is slightly complex.

Finally, it's worth noting that the classic seccomp TOCTOU of reading
memory data from the task still applies here, but can be avoided with
careful design of the userspace handler: if the userspace handler reads all
of the task memory that is necessary before applying its security policy,
the tracee's subsequent memory edits will not be read by the tracer.

Signed-off-by: Tycho Andersen <tycho@tycho.ws>
CC: Kees Cook <keescook@chromium.org>
CC: Andy Lutomirski <luto@amacapital.net>
CC: Oleg Nesterov <oleg@redhat.com>
CC: Eric W. Biederman <ebiederm@xmission.com>
CC: "Serge E. Hallyn" <serge@hallyn.com>
Acked-by: Serge Hallyn <serge@hallyn.com>
CC: Christian Brauner <christian@brauner.io>
CC: Tyler Hicks <tyhicks@canonical.com>
CC: Akihiro Suda <suda.akihiro@lab.ntt.co.jp>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 Documentation/ioctl/ioctl-number.txt           |   1 +
 Documentation/userspace-api/seccomp_filter.rst |  84 +++++
 include/linux/seccomp.h                        |   7 +-
 include/uapi/linux/seccomp.h                   |  40 ++-
 kernel/seccomp.c                               | 448 ++++++++++++++++++++++++-
 tools/testing/selftests/seccomp/seccomp_bpf.c  | 447 +++++++++++++++++++++++-
 6 files changed, 1017 insertions(+), 10 deletions(-)

(limited to 'include/uapi')

diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index af6f6ba1fe80..c9558146ac58 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -79,6 +79,7 @@ Code  Seq#(hex)	Include File		Comments
 0x1b	all	InfiniBand Subsystem	<http://infiniband.sourceforge.net/>
 0x20	all	drivers/cdrom/cm206.h
 0x22	all	scsi/sg.h
+'!'	00-1F	uapi/linux/seccomp.h
 '#'	00-3F	IEEE 1394 Subsystem	Block for the entire subsystem
 '$'	00-0F	linux/perf_counter.h, linux/perf_event.h
 '%'	00-0F	include/uapi/linux/stm.h
diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst
index 82a468bc7560..b1b846d8a094 100644
--- a/Documentation/userspace-api/seccomp_filter.rst
+++ b/Documentation/userspace-api/seccomp_filter.rst
@@ -122,6 +122,11 @@ In precedence order, they are:
 	Results in the lower 16-bits of the return value being passed
 	to userland as the errno without executing the system call.
 
+``SECCOMP_RET_USER_NOTIF``:
+    Results in a ``struct seccomp_notif`` message sent on the userspace
+    notification fd, if it is attached, or ``-ENOSYS`` if it is not. See below
+    on discussion of how to handle user notifications.
+
 ``SECCOMP_RET_TRACE``:
 	When returned, this value will cause the kernel to attempt to
 	notify a ``ptrace()``-based tracer prior to executing the system
@@ -183,6 +188,85 @@ The ``samples/seccomp/`` directory contains both an x86-specific example
 and a more generic example of a higher level macro interface for BPF
 program generation.
 
+Userspace Notification
+======================
+
+The ``SECCOMP_RET_USER_NOTIF`` return code lets seccomp filters pass a
+particular syscall to userspace to be handled. This may be useful for
+applications like container managers, which wish to intercept particular
+syscalls (``mount()``, ``finit_module()``, etc.) and change their behavior.
+
+To acquire a notification FD, use the ``SECCOMP_FILTER_FLAG_NEW_LISTENER``
+argument to the ``seccomp()`` syscall:
+
+.. code-block:: c
+
+    fd = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog);
+
+which (on success) will return a listener fd for the filter, which can then be
+passed around via ``SCM_RIGHTS`` or similar. Note that filter fds correspond to
+a particular filter, and not a particular task. So if this task then forks,
+notifications from both tasks will appear on the same filter fd. Reads and
+writes to/from a filter fd are also synchronized, so a filter fd can safely
+have many readers.
+
+The interface for a seccomp notification fd consists of two structures:
+
+.. code-block:: c
+
+    struct seccomp_notif_sizes {
+        __u16 seccomp_notif;
+        __u16 seccomp_notif_resp;
+        __u16 seccomp_data;
+    };
+
+    struct seccomp_notif {
+        __u64 id;
+        __u32 pid;
+        __u32 flags;
+        struct seccomp_data data;
+    };
+
+    struct seccomp_notif_resp {
+        __u64 id;
+        __s64 val;
+        __s32 error;
+        __u32 flags;
+    };
+
+The ``struct seccomp_notif_sizes`` structure can be used to determine the size
+of the various structures used in seccomp notifications. The size of ``struct
+seccomp_data`` may change in the future, so code should use:
+
+.. code-block:: c
+
+    struct seccomp_notif_sizes sizes;
+    seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes);
+
+to determine the size of the various structures to allocate. See
+samples/seccomp/user-trap.c for an example.
+
+Users can read via ``ioctl(SECCOMP_IOCTL_NOTIF_RECV)``  (or ``poll()``) on a
+seccomp notification fd to receive a ``struct seccomp_notif``, which contains
+five members: the input length of the structure, a unique-per-filter ``id``,
+the ``pid`` of the task which triggered this request (which may be 0 if the
+task is in a pid ns not visible from the listener's pid namespace), a ``flags``
+member which for now only has ``SECCOMP_NOTIF_FLAG_SIGNALED``, representing
+whether or not the notification is a result of a non-fatal signal, and the
+``data`` passed to seccomp. Userspace can then make a decision based on this
+information about what to do, and ``ioctl(SECCOMP_IOCTL_NOTIF_SEND)`` a
+response, indicating what should be returned to userspace. The ``id`` member of
+``struct seccomp_notif_resp`` should be the same ``id`` as in ``struct
+seccomp_notif``.
+
+It is worth noting that ``struct seccomp_data`` contains the values of register
+arguments to the syscall, but does not contain pointers to memory. The task's
+memory is accessible to suitably privileged traces via ``ptrace()`` or
+``/proc/pid/mem``. However, care should be taken to avoid the TOCTOU mentioned
+above in this document: all arguments being read from the tracee's memory
+should be read into the tracer's memory before any policy decisions are made.
+This allows for an atomic decision on syscall arguments.
+
 Sysctls
 =======
 
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index b5103c019cf4..84868d37b35d 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -4,9 +4,10 @@
 
 #include <uapi/linux/seccomp.h>
 
-#define SECCOMP_FILTER_FLAG_MASK	(SECCOMP_FILTER_FLAG_TSYNC	| \
-					 SECCOMP_FILTER_FLAG_LOG	| \
-					 SECCOMP_FILTER_FLAG_SPEC_ALLOW)
+#define SECCOMP_FILTER_FLAG_MASK	(SECCOMP_FILTER_FLAG_TSYNC | \
+					 SECCOMP_FILTER_FLAG_LOG | \
+					 SECCOMP_FILTER_FLAG_SPEC_ALLOW | \
+					 SECCOMP_FILTER_FLAG_NEW_LISTENER)
 
 #ifdef CONFIG_SECCOMP
 
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index 9efc0e73d50b..90734aa5aa36 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -15,11 +15,13 @@
 #define SECCOMP_SET_MODE_STRICT		0
 #define SECCOMP_SET_MODE_FILTER		1
 #define SECCOMP_GET_ACTION_AVAIL	2
+#define SECCOMP_GET_NOTIF_SIZES		3
 
 /* Valid flags for SECCOMP_SET_MODE_FILTER */
-#define SECCOMP_FILTER_FLAG_TSYNC	(1UL << 0)
-#define SECCOMP_FILTER_FLAG_LOG		(1UL << 1)
-#define SECCOMP_FILTER_FLAG_SPEC_ALLOW	(1UL << 2)
+#define SECCOMP_FILTER_FLAG_TSYNC		(1UL << 0)
+#define SECCOMP_FILTER_FLAG_LOG			(1UL << 1)
+#define SECCOMP_FILTER_FLAG_SPEC_ALLOW		(1UL << 2)
+#define SECCOMP_FILTER_FLAG_NEW_LISTENER	(1UL << 3)
 
 /*
  * All BPF programs must return a 32-bit value.
@@ -35,6 +37,7 @@
 #define SECCOMP_RET_KILL	 SECCOMP_RET_KILL_THREAD
 #define SECCOMP_RET_TRAP	 0x00030000U /* disallow and force a SIGSYS */
 #define SECCOMP_RET_ERRNO	 0x00050000U /* returns an errno */
+#define SECCOMP_RET_USER_NOTIF	 0x7fc00000U /* notifies userspace */
 #define SECCOMP_RET_TRACE	 0x7ff00000U /* pass to a tracer or disallow */
 #define SECCOMP_RET_LOG		 0x7ffc0000U /* allow after logging */
 #define SECCOMP_RET_ALLOW	 0x7fff0000U /* allow */
@@ -60,4 +63,35 @@ struct seccomp_data {
 	__u64 args[6];
 };
 
+struct seccomp_notif_sizes {
+	__u16 seccomp_notif;
+	__u16 seccomp_notif_resp;
+	__u16 seccomp_data;
+};
+
+struct seccomp_notif {
+	__u64 id;
+	__u32 pid;
+	__u32 flags;
+	struct seccomp_data data;
+};
+
+struct seccomp_notif_resp {
+	__u64 id;
+	__s64 val;
+	__s32 error;
+	__u32 flags;
+};
+
+#define SECCOMP_IOC_MAGIC		'!'
+#define SECCOMP_IO(nr)			_IO(SECCOMP_IOC_MAGIC, nr)
+#define SECCOMP_IOR(nr, type)		_IOR(SECCOMP_IOC_MAGIC, nr, type)
+#define SECCOMP_IOW(nr, type)		_IOW(SECCOMP_IOC_MAGIC, nr, type)
+#define SECCOMP_IOWR(nr, type)		_IOWR(SECCOMP_IOC_MAGIC, nr, type)
+
+/* Flags for seccomp notification fd ioctl. */
+#define SECCOMP_IOCTL_NOTIF_RECV	SECCOMP_IOWR(0, struct seccomp_notif)
+#define SECCOMP_IOCTL_NOTIF_SEND	SECCOMP_IOWR(1,	\
+						struct seccomp_notif_resp)
+#define SECCOMP_IOCTL_NOTIF_ID_VALID	SECCOMP_IOR(2, __u64)
 #endif /* _UAPI_LINUX_SECCOMP_H */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 393e029f778a..15b6be97fc09 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -33,12 +33,74 @@
 #endif
 
 #ifdef CONFIG_SECCOMP_FILTER
+#include <linux/file.h>
 #include <linux/filter.h>
 #include <linux/pid.h>
 #include <linux/ptrace.h>
 #include <linux/security.h>
 #include <linux/tracehook.h>
 #include <linux/uaccess.h>
+#include <linux/anon_inodes.h>
+
+enum notify_state {
+	SECCOMP_NOTIFY_INIT,
+	SECCOMP_NOTIFY_SENT,
+	SECCOMP_NOTIFY_REPLIED,
+};
+
+struct seccomp_knotif {
+	/* The struct pid of the task whose filter triggered the notification */
+	struct task_struct *task;
+
+	/* The "cookie" for this request; this is unique for this filter. */
+	u64 id;
+
+	/*
+	 * The seccomp data. This pointer is valid the entire time this
+	 * notification is active, since it comes from __seccomp_filter which
+	 * eclipses the entire lifecycle here.
+	 */
+	const struct seccomp_data *data;
+
+	/*
+	 * Notification states. When SECCOMP_RET_USER_NOTIF is returned, a
+	 * struct seccomp_knotif is created and starts out in INIT. Once the
+	 * handler reads the notification off of an FD, it transitions to SENT.
+	 * If a signal is received the state transitions back to INIT and
+	 * another message is sent. When the userspace handler replies, state
+	 * transitions to REPLIED.
+	 */
+	enum notify_state state;
+
+	/* The return values, only valid when in SECCOMP_NOTIFY_REPLIED */
+	int error;
+	long val;
+
+	/* Signals when this has entered SECCOMP_NOTIFY_REPLIED */
+	struct completion ready;
+
+	struct list_head list;
+};
+
+/**
+ * struct notification - container for seccomp userspace notifications. Since
+ * most seccomp filters will not have notification listeners attached and this
+ * structure is fairly large, we store the notification-specific stuff in a
+ * separate structure.
+ *
+ * @request: A semaphore that users of this notification can wait on for
+ *           changes. Actual reads and writes are still controlled with
+ *           filter->notify_lock.
+ * @next_id: The id of the next request.
+ * @notifications: A list of struct seccomp_knotif elements.
+ * @wqh: A wait queue for poll.
+ */
+struct notification {
+	struct semaphore request;
+	u64 next_id;
+	struct list_head notifications;
+	wait_queue_head_t wqh;
+};
 
 /**
  * struct seccomp_filter - container for seccomp BPF programs
@@ -50,6 +112,8 @@
  * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
  * @prev: points to a previously installed, or inherited, filter
  * @prog: the BPF program to evaluate
+ * @notif: the struct that holds all notification related information
+ * @notify_lock: A lock for all notification-related accesses.
  *
  * seccomp_filter objects are organized in a tree linked via the @prev
  * pointer.  For any task, it appears to be a singly-linked list starting
@@ -66,6 +130,8 @@ struct seccomp_filter {
 	bool log;
 	struct seccomp_filter *prev;
 	struct bpf_prog *prog;
+	struct notification *notif;
+	struct mutex notify_lock;
 };
 
 /* Limit any path through the tree to 256KB worth of instructions. */
@@ -386,6 +452,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
 	if (!sfilter)
 		return ERR_PTR(-ENOMEM);
 
+	mutex_init(&sfilter->notify_lock);
 	ret = bpf_prog_create_from_user(&sfilter->prog, fprog,
 					seccomp_check_filter, save_orig);
 	if (ret < 0) {
@@ -479,7 +546,6 @@ static long seccomp_attach_filter(unsigned int flags,
 
 static void __get_seccomp_filter(struct seccomp_filter *filter)
 {
-	/* Reference count is bounded by the number of total processes. */
 	refcount_inc(&filter->usage);
 }
 
@@ -550,11 +616,13 @@ static void seccomp_send_sigsys(int syscall, int reason)
 #define SECCOMP_LOG_TRACE		(1 << 4)
 #define SECCOMP_LOG_LOG			(1 << 5)
 #define SECCOMP_LOG_ALLOW		(1 << 6)
+#define SECCOMP_LOG_USER_NOTIF		(1 << 7)
 
 static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_PROCESS |
 				    SECCOMP_LOG_KILL_THREAD  |
 				    SECCOMP_LOG_TRAP  |
 				    SECCOMP_LOG_ERRNO |
+				    SECCOMP_LOG_USER_NOTIF |
 				    SECCOMP_LOG_TRACE |
 				    SECCOMP_LOG_LOG;
 
@@ -575,6 +643,9 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
 	case SECCOMP_RET_TRACE:
 		log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE;
 		break;
+	case SECCOMP_RET_USER_NOTIF:
+		log = requested && seccomp_actions_logged & SECCOMP_LOG_USER_NOTIF;
+		break;
 	case SECCOMP_RET_LOG:
 		log = seccomp_actions_logged & SECCOMP_LOG_LOG;
 		break;
@@ -646,6 +717,68 @@ void secure_computing_strict(int this_syscall)
 #else
 
 #ifdef CONFIG_SECCOMP_FILTER
+static u64 seccomp_next_notify_id(struct seccomp_filter *filter)
+{
+	/*
+	 * Note: overflow is ok here, the id just needs to be unique per
+	 * filter.
+	 */
+	lockdep_assert_held(&filter->notify_lock);
+	return filter->notif->next_id++;
+}
+
+static void seccomp_do_user_notification(int this_syscall,
+					 struct seccomp_filter *match,
+					 const struct seccomp_data *sd)
+{
+	int err;
+	long ret = 0;
+	struct seccomp_knotif n = {};
+
+	mutex_lock(&match->notify_lock);
+	err = -ENOSYS;
+	if (!match->notif)
+		goto out;
+
+	n.task = current;
+	n.state = SECCOMP_NOTIFY_INIT;
+	n.data = sd;
+	n.id = seccomp_next_notify_id(match);
+	init_completion(&n.ready);
+	list_add(&n.list, &match->notif->notifications);
+
+	up(&match->notif->request);
+	wake_up_poll(&match->notif->wqh, EPOLLIN | EPOLLRDNORM);
+	mutex_unlock(&match->notify_lock);
+
+	/*
+	 * This is where we wait for a reply from userspace.
+	 */
+	err = wait_for_completion_interruptible(&n.ready);
+	mutex_lock(&match->notify_lock);
+	if (err == 0) {
+		ret = n.val;
+		err = n.error;
+	}
+
+	/*
+	 * Note that it's possible the listener died in between the time when
+	 * we were notified of a respons (or a signal) and when we were able to
+	 * re-acquire the lock, so only delete from the list if the
+	 * notification actually exists.
+	 *
+	 * Also note that this test is only valid because there's no way to
+	 * *reattach* to a notifier right now. If one is added, we'll need to
+	 * keep track of the notif itself and make sure they match here.
+	 */
+	if (match->notif)
+		list_del(&n.list);
+out:
+	mutex_unlock(&match->notify_lock);
+	syscall_set_return_value(current, task_pt_regs(current),
+				 err, ret);
+}
+
 static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
 			    const bool recheck_after_trace)
 {
@@ -728,6 +861,10 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
 
 		return 0;
 
+	case SECCOMP_RET_USER_NOTIF:
+		seccomp_do_user_notification(this_syscall, match, sd);
+		goto skip;
+
 	case SECCOMP_RET_LOG:
 		seccomp_log(this_syscall, 0, action, true);
 		return 0;
@@ -834,6 +971,263 @@ out:
 }
 
 #ifdef CONFIG_SECCOMP_FILTER
+static int seccomp_notify_release(struct inode *inode, struct file *file)
+{
+	struct seccomp_filter *filter = file->private_data;
+	struct seccomp_knotif *knotif;
+
+	mutex_lock(&filter->notify_lock);
+
+	/*
+	 * If this file is being closed because e.g. the task who owned it
+	 * died, let's wake everyone up who was waiting on us.
+	 */
+	list_for_each_entry(knotif, &filter->notif->notifications, list) {
+		if (knotif->state == SECCOMP_NOTIFY_REPLIED)
+			continue;
+
+		knotif->state = SECCOMP_NOTIFY_REPLIED;
+		knotif->error = -ENOSYS;
+		knotif->val = 0;
+
+		complete(&knotif->ready);
+	}
+
+	kfree(filter->notif);
+	filter->notif = NULL;
+	mutex_unlock(&filter->notify_lock);
+	__put_seccomp_filter(filter);
+	return 0;
+}
+
+static long seccomp_notify_recv(struct seccomp_filter *filter,
+				void __user *buf)
+{
+	struct seccomp_knotif *knotif = NULL, *cur;
+	struct seccomp_notif unotif;
+	ssize_t ret;
+
+	memset(&unotif, 0, sizeof(unotif));
+
+	ret = down_interruptible(&filter->notif->request);
+	if (ret < 0)
+		return ret;
+
+	mutex_lock(&filter->notify_lock);
+	list_for_each_entry(cur, &filter->notif->notifications, list) {
+		if (cur->state == SECCOMP_NOTIFY_INIT) {
+			knotif = cur;
+			break;
+		}
+	}
+
+	/*
+	 * If we didn't find a notification, it could be that the task was
+	 * interrupted by a fatal signal between the time we were woken and
+	 * when we were able to acquire the rw lock.
+	 */
+	if (!knotif) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	unotif.id = knotif->id;
+	unotif.pid = task_pid_vnr(knotif->task);
+	unotif.data = *(knotif->data);
+
+	knotif->state = SECCOMP_NOTIFY_SENT;
+	wake_up_poll(&filter->notif->wqh, EPOLLOUT | EPOLLWRNORM);
+	ret = 0;
+out:
+	mutex_unlock(&filter->notify_lock);
+
+	if (ret == 0 && copy_to_user(buf, &unotif, sizeof(unotif))) {
+		ret = -EFAULT;
+
+		/*
+		 * Userspace screwed up. To make sure that we keep this
+		 * notification alive, let's reset it back to INIT. It
+		 * may have died when we released the lock, so we need to make
+		 * sure it's still around.
+		 */
+		knotif = NULL;
+		mutex_lock(&filter->notify_lock);
+		list_for_each_entry(cur, &filter->notif->notifications, list) {
+			if (cur->id == unotif.id) {
+				knotif = cur;
+				break;
+			}
+		}
+
+		if (knotif) {
+			knotif->state = SECCOMP_NOTIFY_INIT;
+			up(&filter->notif->request);
+		}
+		mutex_unlock(&filter->notify_lock);
+	}
+
+	return ret;
+}
+
+static long seccomp_notify_send(struct seccomp_filter *filter,
+				void __user *buf)
+{
+	struct seccomp_notif_resp resp = {};
+	struct seccomp_knotif *knotif = NULL, *cur;
+	long ret;
+
+	if (copy_from_user(&resp, buf, sizeof(resp)))
+		return -EFAULT;
+
+	if (resp.flags)
+		return -EINVAL;
+
+	ret = mutex_lock_interruptible(&filter->notify_lock);
+	if (ret < 0)
+		return ret;
+
+	list_for_each_entry(cur, &filter->notif->notifications, list) {
+		if (cur->id == resp.id) {
+			knotif = cur;
+			break;
+		}
+	}
+
+	if (!knotif) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	/* Allow exactly one reply. */
+	if (knotif->state != SECCOMP_NOTIFY_SENT) {
+		ret = -EINPROGRESS;
+		goto out;
+	}
+
+	ret = 0;
+	knotif->state = SECCOMP_NOTIFY_REPLIED;
+	knotif->error = resp.error;
+	knotif->val = resp.val;
+	complete(&knotif->ready);
+out:
+	mutex_unlock(&filter->notify_lock);
+	return ret;
+}
+
+static long seccomp_notify_id_valid(struct seccomp_filter *filter,
+				    void __user *buf)
+{
+	struct seccomp_knotif *knotif = NULL;
+	u64 id;
+	long ret;
+
+	if (copy_from_user(&id, buf, sizeof(id)))
+		return -EFAULT;
+
+	ret = mutex_lock_interruptible(&filter->notify_lock);
+	if (ret < 0)
+		return ret;
+
+	ret = -ENOENT;
+	list_for_each_entry(knotif, &filter->notif->notifications, list) {
+		if (knotif->id == id) {
+			if (knotif->state == SECCOMP_NOTIFY_SENT)
+				ret = 0;
+			goto out;
+		}
+	}
+
+out:
+	mutex_unlock(&filter->notify_lock);
+	return ret;
+}
+
+static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
+				 unsigned long arg)
+{
+	struct seccomp_filter *filter = file->private_data;
+	void __user *buf = (void __user *)arg;
+
+	switch (cmd) {
+	case SECCOMP_IOCTL_NOTIF_RECV:
+		return seccomp_notify_recv(filter, buf);
+	case SECCOMP_IOCTL_NOTIF_SEND:
+		return seccomp_notify_send(filter, buf);
+	case SECCOMP_IOCTL_NOTIF_ID_VALID:
+		return seccomp_notify_id_valid(filter, buf);
+	default:
+		return -EINVAL;
+	}
+}
+
+static __poll_t seccomp_notify_poll(struct file *file,
+				    struct poll_table_struct *poll_tab)
+{
+	struct seccomp_filter *filter = file->private_data;
+	__poll_t ret = 0;
+	struct seccomp_knotif *cur;
+
+	poll_wait(file, &filter->notif->wqh, poll_tab);
+
+	ret = mutex_lock_interruptible(&filter->notify_lock);
+	if (ret < 0)
+		return EPOLLERR;
+
+	list_for_each_entry(cur, &filter->notif->notifications, list) {
+		if (cur->state == SECCOMP_NOTIFY_INIT)
+			ret |= EPOLLIN | EPOLLRDNORM;
+		if (cur->state == SECCOMP_NOTIFY_SENT)
+			ret |= EPOLLOUT | EPOLLWRNORM;
+		if ((ret & EPOLLIN) && (ret & EPOLLOUT))
+			break;
+	}
+
+	mutex_unlock(&filter->notify_lock);
+
+	return ret;
+}
+
+static const struct file_operations seccomp_notify_ops = {
+	.poll = seccomp_notify_poll,
+	.release = seccomp_notify_release,
+	.unlocked_ioctl = seccomp_notify_ioctl,
+};
+
+static struct file *init_listener(struct seccomp_filter *filter)
+{
+	struct file *ret = ERR_PTR(-EBUSY);
+	struct seccomp_filter *cur;
+
+	for (cur = current->seccomp.filter; cur; cur = cur->prev) {
+		if (cur->notif)
+			goto out;
+	}
+
+	ret = ERR_PTR(-ENOMEM);
+	filter->notif = kzalloc(sizeof(*(filter->notif)), GFP_KERNEL);
+	if (!filter->notif)
+		goto out;
+
+	sema_init(&filter->notif->request, 0);
+	filter->notif->next_id = get_random_u64();
+	INIT_LIST_HEAD(&filter->notif->notifications);
+	init_waitqueue_head(&filter->notif->wqh);
+
+	ret = anon_inode_getfile("seccomp notify", &seccomp_notify_ops,
+				 filter, O_RDWR);
+	if (IS_ERR(ret))
+		goto out_notif;
+
+	/* The file has a reference to it now */
+	__get_seccomp_filter(filter);
+
+out_notif:
+	if (IS_ERR(ret))
+		kfree(filter->notif);
+out:
+	return ret;
+}
+
 /**
  * seccomp_set_mode_filter: internal function for setting seccomp filter
  * @flags:  flags to change filter behavior
@@ -853,6 +1247,8 @@ static long seccomp_set_mode_filter(unsigned int flags,
 	const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
 	struct seccomp_filter *prepared = NULL;
 	long ret = -EINVAL;
+	int listener = -1;
+	struct file *listener_f = NULL;
 
 	/* Validate flags. */
 	if (flags & ~SECCOMP_FILTER_FLAG_MASK)
@@ -863,13 +1259,28 @@ static long seccomp_set_mode_filter(unsigned int flags,
 	if (IS_ERR(prepared))
 		return PTR_ERR(prepared);
 
+	if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
+		listener = get_unused_fd_flags(O_CLOEXEC);
+		if (listener < 0) {
+			ret = listener;
+			goto out_free;
+		}
+
+		listener_f = init_listener(prepared);
+		if (IS_ERR(listener_f)) {
+			put_unused_fd(listener);
+			ret = PTR_ERR(listener_f);
+			goto out_free;
+		}
+	}
+
 	/*
 	 * Make sure we cannot change seccomp or nnp state via TSYNC
 	 * while another thread is in the middle of calling exec.
 	 */
 	if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
 	    mutex_lock_killable(&current->signal->cred_guard_mutex))
-		goto out_free;
+		goto out_put_fd;
 
 	spin_lock_irq(&current->sighand->siglock);
 
@@ -887,6 +1298,16 @@ out:
 	spin_unlock_irq(&current->sighand->siglock);
 	if (flags & SECCOMP_FILTER_FLAG_TSYNC)
 		mutex_unlock(&current->signal->cred_guard_mutex);
+out_put_fd:
+	if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
+		if (ret < 0) {
+			fput(listener_f);
+			put_unused_fd(listener);
+		} else {
+			fd_install(listener, listener_f);
+			ret = listener;
+		}
+	}
 out_free:
 	seccomp_filter_free(prepared);
 	return ret;
@@ -911,6 +1332,7 @@ static long seccomp_get_action_avail(const char __user *uaction)
 	case SECCOMP_RET_KILL_THREAD:
 	case SECCOMP_RET_TRAP:
 	case SECCOMP_RET_ERRNO:
+	case SECCOMP_RET_USER_NOTIF:
 	case SECCOMP_RET_TRACE:
 	case SECCOMP_RET_LOG:
 	case SECCOMP_RET_ALLOW:
@@ -922,6 +1344,20 @@ static long seccomp_get_action_avail(const char __user *uaction)
 	return 0;
 }
 
+static long seccomp_get_notif_sizes(void __user *usizes)
+{
+	struct seccomp_notif_sizes sizes = {
+		.seccomp_notif = sizeof(struct seccomp_notif),
+		.seccomp_notif_resp = sizeof(struct seccomp_notif_resp),
+		.seccomp_data = sizeof(struct seccomp_data),
+	};
+
+	if (copy_to_user(usizes, &sizes, sizeof(sizes)))
+		return -EFAULT;
+
+	return 0;
+}
+
 /* Common entry point for both prctl and syscall. */
 static long do_seccomp(unsigned int op, unsigned int flags,
 		       void __user *uargs)
@@ -938,6 +1374,11 @@ static long do_seccomp(unsigned int op, unsigned int flags,
 			return -EINVAL;
 
 		return seccomp_get_action_avail(uargs);
+	case SECCOMP_GET_NOTIF_SIZES:
+		if (flags != 0)
+			return -EINVAL;
+
+		return seccomp_get_notif_sizes(uargs);
 	default:
 		return -EINVAL;
 	}
@@ -1111,6 +1552,7 @@ long seccomp_get_metadata(struct task_struct *task,
 #define SECCOMP_RET_KILL_THREAD_NAME	"kill_thread"
 #define SECCOMP_RET_TRAP_NAME		"trap"
 #define SECCOMP_RET_ERRNO_NAME		"errno"
+#define SECCOMP_RET_USER_NOTIF_NAME	"user_notif"
 #define SECCOMP_RET_TRACE_NAME		"trace"
 #define SECCOMP_RET_LOG_NAME		"log"
 #define SECCOMP_RET_ALLOW_NAME		"allow"
@@ -1120,6 +1562,7 @@ static const char seccomp_actions_avail[] =
 				SECCOMP_RET_KILL_THREAD_NAME	" "
 				SECCOMP_RET_TRAP_NAME		" "
 				SECCOMP_RET_ERRNO_NAME		" "
+				SECCOMP_RET_USER_NOTIF_NAME     " "
 				SECCOMP_RET_TRACE_NAME		" "
 				SECCOMP_RET_LOG_NAME		" "
 				SECCOMP_RET_ALLOW_NAME;
@@ -1134,6 +1577,7 @@ static const struct seccomp_log_name seccomp_log_names[] = {
 	{ SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME },
 	{ SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME },
 	{ SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME },
+	{ SECCOMP_LOG_USER_NOTIF, SECCOMP_RET_USER_NOTIF_NAME },
 	{ SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME },
 	{ SECCOMP_LOG_LOG, SECCOMP_RET_LOG_NAME },
 	{ SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME },
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index e1473234968d..5c9768a1b8cd 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -5,6 +5,7 @@
  * Test code for seccomp bpf.
  */
 
+#define _GNU_SOURCE
 #include <sys/types.h>
 
 /*
@@ -40,10 +41,12 @@
 #include <sys/fcntl.h>
 #include <sys/mman.h>
 #include <sys/times.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
 
-#define _GNU_SOURCE
 #include <unistd.h>
 #include <sys/syscall.h>
+#include <poll.h>
 
 #include "../kselftest_harness.h"
 
@@ -133,6 +136,10 @@ struct seccomp_data {
 #define SECCOMP_GET_ACTION_AVAIL 2
 #endif
 
+#ifndef SECCOMP_GET_NOTIF_SIZES
+#define SECCOMP_GET_NOTIF_SIZES 3
+#endif
+
 #ifndef SECCOMP_FILTER_FLAG_TSYNC
 #define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0)
 #endif
@@ -154,6 +161,44 @@ struct seccomp_metadata {
 };
 #endif
 
+#ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER
+#define SECCOMP_FILTER_FLAG_NEW_LISTENER	(1UL << 3)
+
+#define SECCOMP_RET_USER_NOTIF 0x7fc00000U
+
+#define SECCOMP_IOC_MAGIC		'!'
+#define SECCOMP_IO(nr)			_IO(SECCOMP_IOC_MAGIC, nr)
+#define SECCOMP_IOR(nr, type)		_IOR(SECCOMP_IOC_MAGIC, nr, type)
+#define SECCOMP_IOW(nr, type)		_IOW(SECCOMP_IOC_MAGIC, nr, type)
+#define SECCOMP_IOWR(nr, type)		_IOWR(SECCOMP_IOC_MAGIC, nr, type)
+
+/* Flags for seccomp notification fd ioctl. */
+#define SECCOMP_IOCTL_NOTIF_RECV	SECCOMP_IOWR(0, struct seccomp_notif)
+#define SECCOMP_IOCTL_NOTIF_SEND	SECCOMP_IOWR(1,	\
+						struct seccomp_notif_resp)
+#define SECCOMP_IOCTL_NOTIF_ID_VALID	SECCOMP_IOR(2, __u64)
+
+struct seccomp_notif {
+	__u64 id;
+	__u32 pid;
+	__u32 flags;
+	struct seccomp_data data;
+};
+
+struct seccomp_notif_resp {
+	__u64 id;
+	__s64 val;
+	__s32 error;
+	__u32 flags;
+};
+
+struct seccomp_notif_sizes {
+	__u16 seccomp_notif;
+	__u16 seccomp_notif_resp;
+	__u16 seccomp_data;
+};
+#endif
+
 #ifndef seccomp
 int seccomp(unsigned int op, unsigned int flags, void *args)
 {
@@ -2077,7 +2122,8 @@ TEST(detect_seccomp_filter_flags)
 {
 	unsigned int flags[] = { SECCOMP_FILTER_FLAG_TSYNC,
 				 SECCOMP_FILTER_FLAG_LOG,
-				 SECCOMP_FILTER_FLAG_SPEC_ALLOW };
+				 SECCOMP_FILTER_FLAG_SPEC_ALLOW,
+				 SECCOMP_FILTER_FLAG_NEW_LISTENER };
 	unsigned int flag, all_flags;
 	int i;
 	long ret;
@@ -2933,6 +2979,403 @@ skip:
 	ASSERT_EQ(0, kill(pid, SIGKILL));
 }
 
+static int user_trap_syscall(int nr, unsigned int flags)
+{
+	struct sock_filter filter[] = {
+		BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
+			offsetof(struct seccomp_data, nr)),
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, nr, 0, 1),
+		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_USER_NOTIF),
+		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
+	};
+
+	struct sock_fprog prog = {
+		.len = (unsigned short)ARRAY_SIZE(filter),
+		.filter = filter,
+	};
+
+	return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog);
+}
+
+#define USER_NOTIF_MAGIC 116983961184613L
+TEST(user_notification_basic)
+{
+	pid_t pid;
+	long ret;
+	int status, listener;
+	struct seccomp_notif req = {};
+	struct seccomp_notif_resp resp = {};
+	struct pollfd pollfd;
+
+	struct sock_filter filter[] = {
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+	};
+	struct sock_fprog prog = {
+		.len = (unsigned short)ARRAY_SIZE(filter),
+		.filter = filter,
+	};
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	/* Check that we get -ENOSYS with no listener attached */
+	if (pid == 0) {
+		if (user_trap_syscall(__NR_getpid, 0) < 0)
+			exit(1);
+		ret = syscall(__NR_getpid);
+		exit(ret >= 0 || errno != ENOSYS);
+	}
+
+	EXPECT_EQ(waitpid(pid, &status, 0), pid);
+	EXPECT_EQ(true, WIFEXITED(status));
+	EXPECT_EQ(0, WEXITSTATUS(status));
+
+	/* Add some no-op filters so for grins. */
+	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
+	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
+	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
+	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
+
+	/* Check that the basic notification machinery works */
+	listener = user_trap_syscall(__NR_getpid,
+				     SECCOMP_FILTER_FLAG_NEW_LISTENER);
+	EXPECT_GE(listener, 0);
+
+	/* Installing a second listener in the chain should EBUSY */
+	EXPECT_EQ(user_trap_syscall(__NR_getpid,
+				    SECCOMP_FILTER_FLAG_NEW_LISTENER),
+		  -1);
+	EXPECT_EQ(errno, EBUSY);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		ret = syscall(__NR_getpid);
+		exit(ret != USER_NOTIF_MAGIC);
+	}
+
+	pollfd.fd = listener;
+	pollfd.events = POLLIN | POLLOUT;
+
+	EXPECT_GT(poll(&pollfd, 1, -1), 0);
+	EXPECT_EQ(pollfd.revents, POLLIN);
+
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
+
+	pollfd.fd = listener;
+	pollfd.events = POLLIN | POLLOUT;
+
+	EXPECT_GT(poll(&pollfd, 1, -1), 0);
+	EXPECT_EQ(pollfd.revents, POLLOUT);
+
+	EXPECT_EQ(req.data.nr,  __NR_getpid);
+
+	resp.id = req.id;
+	resp.error = 0;
+	resp.val = USER_NOTIF_MAGIC;
+
+	/* check that we make sure flags == 0 */
+	resp.flags = 1;
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
+	EXPECT_EQ(errno, EINVAL);
+
+	resp.flags = 0;
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
+
+	EXPECT_EQ(waitpid(pid, &status, 0), pid);
+	EXPECT_EQ(true, WIFEXITED(status));
+	EXPECT_EQ(0, WEXITSTATUS(status));
+}
+
+TEST(user_notification_kill_in_middle)
+{
+	pid_t pid;
+	long ret;
+	int listener;
+	struct seccomp_notif req = {};
+	struct seccomp_notif_resp resp = {};
+
+	listener = user_trap_syscall(__NR_getpid,
+				     SECCOMP_FILTER_FLAG_NEW_LISTENER);
+	EXPECT_GE(listener, 0);
+
+	/*
+	 * Check that nothing bad happens when we kill the task in the middle
+	 * of a syscall.
+	 */
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		ret = syscall(__NR_getpid);
+		exit(ret != USER_NOTIF_MAGIC);
+	}
+
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id), 0);
+
+	EXPECT_EQ(kill(pid, SIGKILL), 0);
+	EXPECT_EQ(waitpid(pid, NULL, 0), pid);
+
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id), -1);
+
+	resp.id = req.id;
+	ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp);
+	EXPECT_EQ(ret, -1);
+	EXPECT_EQ(errno, ENOENT);
+}
+
+static int handled = -1;
+
+static void signal_handler(int signal)
+{
+	if (write(handled, "c", 1) != 1)
+		perror("write from signal");
+}
+
+TEST(user_notification_signal)
+{
+	pid_t pid;
+	long ret;
+	int status, listener, sk_pair[2];
+	struct seccomp_notif req = {};
+	struct seccomp_notif_resp resp = {};
+	char c;
+
+	ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);
+
+	listener = user_trap_syscall(__NR_gettid,
+				     SECCOMP_FILTER_FLAG_NEW_LISTENER);
+	EXPECT_GE(listener, 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		close(sk_pair[0]);
+		handled = sk_pair[1];
+		if (signal(SIGUSR1, signal_handler) == SIG_ERR) {
+			perror("signal");
+			exit(1);
+		}
+		/*
+		 * ERESTARTSYS behavior is a bit hard to test, because we need
+		 * to rely on a signal that has not yet been handled. Let's at
+		 * least check that the error code gets propagated through, and
+		 * hope that it doesn't break when there is actually a signal :)
+		 */
+		ret = syscall(__NR_gettid);
+		exit(!(ret == -1 && errno == 512));
+	}
+
+	close(sk_pair[1]);
+
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
+
+	EXPECT_EQ(kill(pid, SIGUSR1), 0);
+
+	/*
+	 * Make sure the signal really is delivered, which means we're not
+	 * stuck in the user notification code any more and the notification
+	 * should be dead.
+	 */
+	EXPECT_EQ(read(sk_pair[0], &c, 1), 1);
+
+	resp.id = req.id;
+	resp.error = -EPERM;
+	resp.val = 0;
+
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
+	EXPECT_EQ(errno, ENOENT);
+
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
+
+	resp.id = req.id;
+	resp.error = -512; /* -ERESTARTSYS */
+	resp.val = 0;
+
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
+
+	EXPECT_EQ(waitpid(pid, &status, 0), pid);
+	EXPECT_EQ(true, WIFEXITED(status));
+	EXPECT_EQ(0, WEXITSTATUS(status));
+}
+
+TEST(user_notification_closed_listener)
+{
+	pid_t pid;
+	long ret;
+	int status, listener;
+
+	listener = user_trap_syscall(__NR_getpid,
+				     SECCOMP_FILTER_FLAG_NEW_LISTENER);
+	EXPECT_GE(listener, 0);
+
+	/*
+	 * Check that we get an ENOSYS when the listener is closed.
+	 */
+	pid = fork();
+	ASSERT_GE(pid, 0);
+	if (pid == 0) {
+		close(listener);
+		ret = syscall(__NR_getpid);
+		exit(ret != -1 && errno != ENOSYS);
+	}
+
+	close(listener);
+
+	EXPECT_EQ(waitpid(pid, &status, 0), pid);
+	EXPECT_EQ(true, WIFEXITED(status));
+	EXPECT_EQ(0, WEXITSTATUS(status));
+}
+
+/*
+ * Check that a pid in a child namespace still shows up as valid in ours.
+ */
+TEST(user_notification_child_pid_ns)
+{
+	pid_t pid;
+	int status, listener;
+	struct seccomp_notif req = {};
+	struct seccomp_notif_resp resp = {};
+
+	ASSERT_EQ(unshare(CLONE_NEWPID), 0);
+
+	listener = user_trap_syscall(__NR_getpid, SECCOMP_FILTER_FLAG_NEW_LISTENER);
+	ASSERT_GE(listener, 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0)
+		exit(syscall(__NR_getpid) != USER_NOTIF_MAGIC);
+
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
+	EXPECT_EQ(req.pid, pid);
+
+	resp.id = req.id;
+	resp.error = 0;
+	resp.val = USER_NOTIF_MAGIC;
+
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
+
+	EXPECT_EQ(waitpid(pid, &status, 0), pid);
+	EXPECT_EQ(true, WIFEXITED(status));
+	EXPECT_EQ(0, WEXITSTATUS(status));
+	close(listener);
+}
+
+/*
+ * Check that a pid in a sibling (i.e. unrelated) namespace shows up as 0, i.e.
+ * invalid.
+ */
+TEST(user_notification_sibling_pid_ns)
+{
+	pid_t pid, pid2;
+	int status, listener;
+	struct seccomp_notif req = {};
+	struct seccomp_notif_resp resp = {};
+
+	listener = user_trap_syscall(__NR_getpid, SECCOMP_FILTER_FLAG_NEW_LISTENER);
+	ASSERT_GE(listener, 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		ASSERT_EQ(unshare(CLONE_NEWPID), 0);
+
+		pid2 = fork();
+		ASSERT_GE(pid2, 0);
+
+		if (pid2 == 0)
+			exit(syscall(__NR_getpid) != USER_NOTIF_MAGIC);
+
+		EXPECT_EQ(waitpid(pid2, &status, 0), pid2);
+		EXPECT_EQ(true, WIFEXITED(status));
+		EXPECT_EQ(0, WEXITSTATUS(status));
+		exit(WEXITSTATUS(status));
+	}
+
+	/* Create the sibling ns, and sibling in it. */
+	EXPECT_EQ(unshare(CLONE_NEWPID), 0);
+	EXPECT_EQ(errno, 0);
+
+	pid2 = fork();
+	EXPECT_GE(pid2, 0);
+
+	if (pid2 == 0) {
+		ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
+		/*
+		 * The pid should be 0, i.e. the task is in some namespace that
+		 * we can't "see".
+		 */
+		ASSERT_EQ(req.pid, 0);
+
+		resp.id = req.id;
+		resp.error = 0;
+		resp.val = USER_NOTIF_MAGIC;
+
+		ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
+		exit(0);
+	}
+
+	close(listener);
+
+	EXPECT_EQ(waitpid(pid, &status, 0), pid);
+	EXPECT_EQ(true, WIFEXITED(status));
+	EXPECT_EQ(0, WEXITSTATUS(status));
+
+	EXPECT_EQ(waitpid(pid2, &status, 0), pid2);
+	EXPECT_EQ(true, WIFEXITED(status));
+	EXPECT_EQ(0, WEXITSTATUS(status));
+}
+
+TEST(user_notification_fault_recv)
+{
+	pid_t pid;
+	int status, listener;
+	struct seccomp_notif req = {};
+	struct seccomp_notif_resp resp = {};
+
+	listener = user_trap_syscall(__NR_getpid, SECCOMP_FILTER_FLAG_NEW_LISTENER);
+	ASSERT_GE(listener, 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0)
+		exit(syscall(__NR_getpid) != USER_NOTIF_MAGIC);
+
+	/* Do a bad recv() */
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, NULL), -1);
+	EXPECT_EQ(errno, EFAULT);
+
+	/* We should still be able to receive this notification, though. */
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
+	EXPECT_EQ(req.pid, pid);
+
+	resp.id = req.id;
+	resp.error = 0;
+	resp.val = USER_NOTIF_MAGIC;
+
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
+
+	EXPECT_EQ(waitpid(pid, &status, 0), pid);
+	EXPECT_EQ(true, WIFEXITED(status));
+	EXPECT_EQ(0, WEXITSTATUS(status));
+}
+
+TEST(seccomp_get_notif_sizes)
+{
+	struct seccomp_notif_sizes sizes;
+
+	EXPECT_EQ(seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes), 0);
+	EXPECT_EQ(sizes.seccomp_notif, sizeof(struct seccomp_notif));
+	EXPECT_EQ(sizes.seccomp_notif_resp, sizeof(struct seccomp_notif_resp));
+}
+
 /*
  * TODO:
  * - add microbenchmarks
-- 
cgit v1.2.3-59-g8ed1b


From c872bdb38febb4c31ece3599c52cf1f833b89f4e Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Wed, 12 Dec 2018 09:37:46 -0800
Subject: bpf: include sub program tags in bpf_prog_info

Changes v2 -> v3:
1. remove check for bpf_dump_raw_ok().

Changes v1 -> v2:
1. Fix error path as Martin suggested.

This patch adds nr_prog_tags and prog_tags to bpf_prog_info. This is a
reliable way for user space to get tags of all sub programs. Before this
patch, user space need to find sub program tags via kallsyms.

This feature will be used in BPF introspection, where user space queries
information about BPF programs via sys_bpf.

Signed-off-by: Song Liu <songliubraving@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h |  2 ++
 kernel/bpf/syscall.c     | 22 ++++++++++++++++++++++
 2 files changed, 24 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index aa582cd5bfcf..e7d57e89f25f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2717,6 +2717,8 @@ struct bpf_prog_info {
 	__u32 nr_jited_line_info;
 	__u32 line_info_rec_size;
 	__u32 jited_line_info_rec_size;
+	__u32 nr_prog_tags;
+	__aligned_u64 prog_tags;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b7c585838c72..7f1410d6fbe9 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2315,6 +2315,28 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 		}
 	}
 
+	ulen = info.nr_prog_tags;
+	info.nr_prog_tags = prog->aux->func_cnt ? : 1;
+	if (ulen) {
+		__u8 __user (*user_prog_tags)[BPF_TAG_SIZE];
+		u32 i;
+
+		user_prog_tags = u64_to_user_ptr(info.prog_tags);
+		ulen = min_t(u32, info.nr_prog_tags, ulen);
+		if (prog->aux->func_cnt) {
+			for (i = 0; i < ulen; i++) {
+				if (copy_to_user(user_prog_tags[i],
+						 prog->aux->func[i]->tag,
+						 BPF_TAG_SIZE))
+					return -EFAULT;
+			}
+		} else {
+			if (copy_to_user(user_prog_tags[0],
+					 prog->tag, BPF_TAG_SIZE))
+				return -EFAULT;
+		}
+	}
+
 done:
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
-- 
cgit v1.2.3-59-g8ed1b


From ec6e822d1a22d0eef1d1fa260dff751dba9a4258 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Fri, 7 Dec 2018 18:39:26 +0000
Subject: arm64: expose user PAC bit positions via ptrace

When pointer authentication is in use, data/instruction pointers have a
number of PAC bits inserted into them. The number and position of these
bits depends on the configured TCR_ELx.TxSZ and whether tagging is
enabled. ARMv8.3 allows tagging to differ for instruction and data
pointers.

For userspace debuggers to unwind the stack and/or to follow pointer
chains, they need to be able to remove the PAC bits before attempting to
use a pointer.

This patch adds a new structure with masks describing the location of
the PAC bits in userspace instruction and data pointers (i.e. those
addressable via TTBR0), which userspace can query via PTRACE_GETREGSET.
By clearing these bits from pointers (and replacing them with the value
of bit 55), userspace can acquire the PAC-less versions.

This new regset is exposed when the kernel is built with (user) pointer
authentication support, and the address authentication feature is
enabled. Otherwise, the regset is hidden.

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Kristina Martsenko <kristina.martsenko@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Ramana Radhakrishnan <ramana.radhakrishnan@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
[will: Fix to use vabits_user instead of VA_BITS and rename macro]
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/include/asm/memory.h       |  3 +++
 arch/arm64/include/asm/pointer_auth.h |  8 ++++++++
 arch/arm64/include/asm/processor.h    |  2 --
 arch/arm64/include/uapi/asm/ptrace.h  |  7 +++++++
 arch/arm64/kernel/ptrace.c            | 38 +++++++++++++++++++++++++++++++++++
 include/uapi/linux/elf.h              |  1 +
 6 files changed, 57 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index 6747a3eddeb1..bd749033bc88 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -207,6 +207,9 @@ static inline unsigned long kaslr_offset(void)
 	return kimage_vaddr - KIMAGE_VADDR;
 }
 
+/* the actual size of a user virtual address */
+extern u64			vabits_user;
+
 /*
  * Allow all memory at the discovery stage. We will clip it later.
  */
diff --git a/arch/arm64/include/asm/pointer_auth.h b/arch/arm64/include/asm/pointer_auth.h
index 91c4185dda5b..2a22c03c1540 100644
--- a/arch/arm64/include/asm/pointer_auth.h
+++ b/arch/arm64/include/asm/pointer_auth.h
@@ -2,9 +2,11 @@
 #ifndef __ASM_POINTER_AUTH_H
 #define __ASM_POINTER_AUTH_H
 
+#include <linux/bitops.h>
 #include <linux/random.h>
 
 #include <asm/cpufeature.h>
+#include <asm/memory.h>
 #include <asm/sysreg.h>
 
 #ifdef CONFIG_ARM64_PTR_AUTH
@@ -61,6 +63,12 @@ static inline void ptrauth_keys_switch(struct ptrauth_keys *keys)
 		__ptrauth_key_install(APGA, keys->apga);
 }
 
+/*
+ * The EL0 pointer bits used by a pointer authentication code.
+ * This is dependent on TBI0 being enabled, or bits 63:56 would also apply.
+ */
+#define ptrauth_user_pac_mask()	GENMASK(54, vabits_user)
+
 #define ptrauth_thread_init_user(tsk)					\
 do {									\
 	struct task_struct *__ptiu_tsk = (tsk);				\
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index bbecc6fe3e5b..f4b8e09aff56 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -53,8 +53,6 @@
  */
 
 #define DEFAULT_MAP_WINDOW_64	(UL(1) << VA_BITS)
-
-extern u64 vabits_user;
 #define TASK_SIZE_64		(UL(1) << vabits_user)
 
 #ifdef CONFIG_COMPAT
diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h
index a36227fdb084..c2f249bcd829 100644
--- a/arch/arm64/include/uapi/asm/ptrace.h
+++ b/arch/arm64/include/uapi/asm/ptrace.h
@@ -229,6 +229,13 @@ struct user_sve_header {
 		  SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq, flags)	\
 		: SVE_PT_FPSIMD_OFFSET + SVE_PT_FPSIMD_SIZE(vq, flags))
 
+/* pointer authentication masks (NT_ARM_PAC_MASK) */
+
+struct user_pac_mask {
+	__u64		data_mask;
+	__u64		insn_mask;
+};
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _UAPI__ASM_PTRACE_H */
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 1710a2d01669..9dce33b0e260 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -46,6 +46,7 @@
 #include <asm/debug-monitors.h>
 #include <asm/fpsimd.h>
 #include <asm/pgtable.h>
+#include <asm/pointer_auth.h>
 #include <asm/stacktrace.h>
 #include <asm/syscall.h>
 #include <asm/traps.h>
@@ -956,6 +957,30 @@ out:
 
 #endif /* CONFIG_ARM64_SVE */
 
+#ifdef CONFIG_ARM64_PTR_AUTH
+static int pac_mask_get(struct task_struct *target,
+			const struct user_regset *regset,
+			unsigned int pos, unsigned int count,
+			void *kbuf, void __user *ubuf)
+{
+	/*
+	 * The PAC bits can differ across data and instruction pointers
+	 * depending on TCR_EL1.TBID*, which we may make use of in future, so
+	 * we expose separate masks.
+	 */
+	unsigned long mask = ptrauth_user_pac_mask();
+	struct user_pac_mask uregs = {
+		.data_mask = mask,
+		.insn_mask = mask,
+	};
+
+	if (!system_supports_address_auth())
+		return -EINVAL;
+
+	return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &uregs, 0, -1);
+}
+#endif /* CONFIG_ARM64_PTR_AUTH */
+
 enum aarch64_regset {
 	REGSET_GPR,
 	REGSET_FPR,
@@ -968,6 +993,9 @@ enum aarch64_regset {
 #ifdef CONFIG_ARM64_SVE
 	REGSET_SVE,
 #endif
+#ifdef CONFIG_ARM64_PTR_AUTH
+	REGSET_PAC_MASK,
+#endif
 };
 
 static const struct user_regset aarch64_regsets[] = {
@@ -1037,6 +1065,16 @@ static const struct user_regset aarch64_regsets[] = {
 		.get_size = sve_get_size,
 	},
 #endif
+#ifdef CONFIG_ARM64_PTR_AUTH
+	[REGSET_PAC_MASK] = {
+		.core_note_type = NT_ARM_PAC_MASK,
+		.n = sizeof(struct user_pac_mask) / sizeof(u64),
+		.size = sizeof(u64),
+		.align = sizeof(u64),
+		.get = pac_mask_get,
+		/* this cannot be set dynamically */
+	},
+#endif
 };
 
 static const struct user_regset_view user_aarch64_view = {
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index c5358e0ae7c5..3f23273d690c 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -420,6 +420,7 @@ typedef struct elf64_shdr {
 #define NT_ARM_HW_WATCH	0x403		/* ARM hardware watchpoint registers */
 #define NT_ARM_SYSTEM_CALL	0x404	/* ARM system call number */
 #define NT_ARM_SVE	0x405		/* ARM Scalable Vector Extension registers */
+#define NT_ARM_PAC_MASK		0x406	/* ARM pointer authentication code masks */
 #define NT_ARC_V2	0x600		/* ARCv2 accumulator/extra registers */
 #define NT_VMCOREDD	0x700		/* Vmcore Device Dump Note */
 #define NT_MIPS_DSP	0x800		/* MIPS DSP ASE registers */
-- 
cgit v1.2.3-59-g8ed1b


From ba830885656414101b2f8ca88786524d4bb5e8c1 Mon Sep 17 00:00:00 2001
From: Kristina Martsenko <kristina.martsenko@arm.com>
Date: Fri, 7 Dec 2018 18:39:28 +0000
Subject: arm64: add prctl control for resetting ptrauth keys

Add an arm64-specific prctl to allow a thread to reinitialize its
pointer authentication keys to random values. This can be useful when
exec() is not used for starting new processes, to ensure that different
processes still have different keys.

Signed-off-by: Kristina Martsenko <kristina.martsenko@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/include/asm/pointer_auth.h |  3 +++
 arch/arm64/include/asm/processor.h    |  4 +++
 arch/arm64/kernel/Makefile            |  1 +
 arch/arm64/kernel/pointer_auth.c      | 47 +++++++++++++++++++++++++++++++++++
 include/uapi/linux/prctl.h            |  8 ++++++
 kernel/sys.c                          |  8 ++++++
 6 files changed, 71 insertions(+)
 create mode 100644 arch/arm64/kernel/pointer_auth.c

(limited to 'include/uapi')

diff --git a/arch/arm64/include/asm/pointer_auth.h b/arch/arm64/include/asm/pointer_auth.h
index 5ccf49b4dac3..80eb03afd677 100644
--- a/arch/arm64/include/asm/pointer_auth.h
+++ b/arch/arm64/include/asm/pointer_auth.h
@@ -63,6 +63,8 @@ static inline void ptrauth_keys_switch(struct ptrauth_keys *keys)
 		__ptrauth_key_install(APGA, keys->apga);
 }
 
+extern int ptrauth_prctl_reset_keys(struct task_struct *tsk, unsigned long arg);
+
 /*
  * The EL0 pointer bits used by a pointer authentication code.
  * This is dependent on TBI0 being enabled, or bits 63:56 would also apply.
@@ -86,6 +88,7 @@ do {									\
 	ptrauth_keys_switch(&(tsk)->thread_info.keys_user)
 
 #else /* CONFIG_ARM64_PTR_AUTH */
+#define ptrauth_prctl_reset_keys(tsk, arg)	(-EINVAL)
 #define ptrauth_strip_insn_pac(lr)	(lr)
 #define ptrauth_thread_init_user(tsk)
 #define ptrauth_thread_switch(tsk)
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index f4b8e09aff56..142c708cb429 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -44,6 +44,7 @@
 #include <asm/hw_breakpoint.h>
 #include <asm/lse.h>
 #include <asm/pgtable-hwdef.h>
+#include <asm/pointer_auth.h>
 #include <asm/ptrace.h>
 #include <asm/types.h>
 
@@ -289,6 +290,9 @@ extern void __init minsigstksz_setup(void);
 #define SVE_SET_VL(arg)	sve_set_current_vl(arg)
 #define SVE_GET_VL()	sve_get_current_vl()
 
+/* PR_PAC_RESET_KEYS prctl */
+#define PAC_RESET_KEYS(tsk, arg)	ptrauth_prctl_reset_keys(tsk, arg)
+
 /*
  * For CONFIG_GCC_PLUGIN_STACKLEAK
  *
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 583334ce1c2c..df08d735b21d 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -58,6 +58,7 @@ arm64-obj-$(CONFIG_CRASH_DUMP)		+= crash_dump.o
 arm64-obj-$(CONFIG_CRASH_CORE)		+= crash_core.o
 arm64-obj-$(CONFIG_ARM_SDE_INTERFACE)	+= sdei.o
 arm64-obj-$(CONFIG_ARM64_SSBD)		+= ssbd.o
+arm64-obj-$(CONFIG_ARM64_PTR_AUTH)	+= pointer_auth.o
 
 obj-y					+= $(arm64-obj-y) vdso/ probes/
 obj-m					+= $(arm64-obj-m)
diff --git a/arch/arm64/kernel/pointer_auth.c b/arch/arm64/kernel/pointer_auth.c
new file mode 100644
index 000000000000..b9f6f5f3409a
--- /dev/null
+++ b/arch/arm64/kernel/pointer_auth.c
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/errno.h>
+#include <linux/prctl.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+#include <asm/cpufeature.h>
+#include <asm/pointer_auth.h>
+
+int ptrauth_prctl_reset_keys(struct task_struct *tsk, unsigned long arg)
+{
+	struct ptrauth_keys *keys = &tsk->thread_info.keys_user;
+	unsigned long addr_key_mask = PR_PAC_APIAKEY | PR_PAC_APIBKEY |
+				      PR_PAC_APDAKEY | PR_PAC_APDBKEY;
+	unsigned long key_mask = addr_key_mask | PR_PAC_APGAKEY;
+
+	if (!system_supports_address_auth() && !system_supports_generic_auth())
+		return -EINVAL;
+
+	if (!arg) {
+		ptrauth_keys_init(keys);
+		ptrauth_keys_switch(keys);
+		return 0;
+	}
+
+	if (arg & ~key_mask)
+		return -EINVAL;
+
+	if (((arg & addr_key_mask) && !system_supports_address_auth()) ||
+	    ((arg & PR_PAC_APGAKEY) && !system_supports_generic_auth()))
+		return -EINVAL;
+
+	if (arg & PR_PAC_APIAKEY)
+		get_random_bytes(&keys->apia, sizeof(keys->apia));
+	if (arg & PR_PAC_APIBKEY)
+		get_random_bytes(&keys->apib, sizeof(keys->apib));
+	if (arg & PR_PAC_APDAKEY)
+		get_random_bytes(&keys->apda, sizeof(keys->apda));
+	if (arg & PR_PAC_APDBKEY)
+		get_random_bytes(&keys->apdb, sizeof(keys->apdb));
+	if (arg & PR_PAC_APGAKEY)
+		get_random_bytes(&keys->apga, sizeof(keys->apga));
+
+	ptrauth_keys_switch(keys);
+
+	return 0;
+}
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index c0d7ea0bf5b6..0f535a501391 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -219,4 +219,12 @@ struct prctl_mm_map {
 # define PR_SPEC_DISABLE		(1UL << 2)
 # define PR_SPEC_FORCE_DISABLE		(1UL << 3)
 
+/* Reset arm64 pointer authentication keys */
+#define PR_PAC_RESET_KEYS		54
+# define PR_PAC_APIAKEY			(1UL << 0)
+# define PR_PAC_APIBKEY			(1UL << 1)
+# define PR_PAC_APDAKEY			(1UL << 2)
+# define PR_PAC_APDBKEY			(1UL << 3)
+# define PR_PAC_APGAKEY			(1UL << 4)
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 123bd73046ec..64b5a230f38d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -121,6 +121,9 @@
 #ifndef SVE_GET_VL
 # define SVE_GET_VL()		(-EINVAL)
 #endif
+#ifndef PAC_RESET_KEYS
+# define PAC_RESET_KEYS(a, b)	(-EINVAL)
+#endif
 
 /*
  * this is where the system-wide overflow UID and GID are defined, for
@@ -2476,6 +2479,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			return -EINVAL;
 		error = arch_prctl_spec_ctrl_set(me, arg2, arg3);
 		break;
+	case PR_PAC_RESET_KEYS:
+		if (arg3 || arg4 || arg5)
+			return -EINVAL;
+		error = PAC_RESET_KEYS(me, arg2);
+		break;
 	default:
 		error = -EINVAL;
 		break;
-- 
cgit v1.2.3-59-g8ed1b


From 72148d1a57e7c76745e68c94ad5d235240d26ac8 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Fri, 16 Jun 2017 08:38:31 -0400
Subject: media: v4l: Add support for V4L2_BUF_TYPE_META_OUTPUT

The V4L2_BUF_TYPE_META_OUTPUT mirrors the V4L2_BUF_TYPE_META_CAPTURE with
the exception that it is an OUTPUT type. The use case for this is to pass
buffers to the device that are not image data but metadata. The formats,
just as the metadata capture formats, are typically device specific and
highly structured.

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Acked-by: Hans Verkuil <hans.verkuil@cisco.com>
Reviewed-by: Tomasz Figa <tfiga@chromium.org>
Tested-by: Tian Shu Qiu <tian.shu.qiu@intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 drivers/media/common/videobuf2/videobuf2-v4l2.c |  1 +
 drivers/media/v4l2-core/v4l2-compat-ioctl32.c   |  2 ++
 drivers/media/v4l2-core/v4l2-dev.c              | 12 ++++++++----
 drivers/media/v4l2-core/v4l2-ioctl.c            | 23 +++++++++++++++++++++++
 include/media/v4l2-ioctl.h                      | 17 +++++++++++++++++
 include/uapi/linux/videodev2.h                  |  2 ++
 6 files changed, 53 insertions(+), 4 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/media/common/videobuf2/videobuf2-v4l2.c b/drivers/media/common/videobuf2/videobuf2-v4l2.c
index 1244c246d0c4..8e7ab0283f06 100644
--- a/drivers/media/common/videobuf2/videobuf2-v4l2.c
+++ b/drivers/media/common/videobuf2/videobuf2-v4l2.c
@@ -704,6 +704,7 @@ int vb2_create_bufs(struct vb2_queue *q, struct v4l2_create_buffers *create)
 		requested_sizes[0] = f->fmt.sdr.buffersize;
 		break;
 	case V4L2_BUF_TYPE_META_CAPTURE:
+	case V4L2_BUF_TYPE_META_OUTPUT:
 		requested_sizes[0] = f->fmt.meta.buffersize;
 		break;
 	default:
diff --git a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
index f4325329fbd6..fe4577a46869 100644
--- a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
+++ b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
@@ -323,6 +323,7 @@ static int __get_v4l2_format32(struct v4l2_format __user *p64,
 		return copy_in_user(&p64->fmt.sdr, &p32->fmt.sdr,
 				    sizeof(p64->fmt.sdr)) ? -EFAULT : 0;
 	case V4L2_BUF_TYPE_META_CAPTURE:
+	case V4L2_BUF_TYPE_META_OUTPUT:
 		return copy_in_user(&p64->fmt.meta, &p32->fmt.meta,
 				    sizeof(p64->fmt.meta)) ? -EFAULT : 0;
 	default:
@@ -392,6 +393,7 @@ static int __put_v4l2_format32(struct v4l2_format __user *p64,
 		return copy_in_user(&p32->fmt.sdr, &p64->fmt.sdr,
 				    sizeof(p64->fmt.sdr)) ? -EFAULT : 0;
 	case V4L2_BUF_TYPE_META_CAPTURE:
+	case V4L2_BUF_TYPE_META_OUTPUT:
 		return copy_in_user(&p32->fmt.meta, &p64->fmt.meta,
 				    sizeof(p64->fmt.meta)) ? -EFAULT : 0;
 	default:
diff --git a/drivers/media/v4l2-core/v4l2-dev.c b/drivers/media/v4l2-core/v4l2-dev.c
index 2130e3a0f4da..d7528f82a66a 100644
--- a/drivers/media/v4l2-core/v4l2-dev.c
+++ b/drivers/media/v4l2-core/v4l2-dev.c
@@ -597,7 +597,8 @@ static void determine_valid_ioctls(struct video_device *vdev)
 			       ops->vidioc_enum_fmt_vid_overlay ||
 			       ops->vidioc_enum_fmt_meta_cap)) ||
 		    (is_tx && (ops->vidioc_enum_fmt_vid_out ||
-			       ops->vidioc_enum_fmt_vid_out_mplane)))
+			       ops->vidioc_enum_fmt_vid_out_mplane ||
+			       ops->vidioc_enum_fmt_meta_out)))
 			set_bit(_IOC_NR(VIDIOC_ENUM_FMT), valid_ioctls);
 		if ((is_rx && (ops->vidioc_g_fmt_vid_cap ||
 			       ops->vidioc_g_fmt_vid_cap_mplane ||
@@ -605,7 +606,8 @@ static void determine_valid_ioctls(struct video_device *vdev)
 			       ops->vidioc_g_fmt_meta_cap)) ||
 		    (is_tx && (ops->vidioc_g_fmt_vid_out ||
 			       ops->vidioc_g_fmt_vid_out_mplane ||
-			       ops->vidioc_g_fmt_vid_out_overlay)))
+			       ops->vidioc_g_fmt_vid_out_overlay ||
+			       ops->vidioc_g_fmt_meta_out)))
 			 set_bit(_IOC_NR(VIDIOC_G_FMT), valid_ioctls);
 		if ((is_rx && (ops->vidioc_s_fmt_vid_cap ||
 			       ops->vidioc_s_fmt_vid_cap_mplane ||
@@ -613,7 +615,8 @@ static void determine_valid_ioctls(struct video_device *vdev)
 			       ops->vidioc_s_fmt_meta_cap)) ||
 		    (is_tx && (ops->vidioc_s_fmt_vid_out ||
 			       ops->vidioc_s_fmt_vid_out_mplane ||
-			       ops->vidioc_s_fmt_vid_out_overlay)))
+			       ops->vidioc_s_fmt_vid_out_overlay ||
+			       ops->vidioc_s_fmt_meta_out)))
 			 set_bit(_IOC_NR(VIDIOC_S_FMT), valid_ioctls);
 		if ((is_rx && (ops->vidioc_try_fmt_vid_cap ||
 			       ops->vidioc_try_fmt_vid_cap_mplane ||
@@ -621,7 +624,8 @@ static void determine_valid_ioctls(struct video_device *vdev)
 			       ops->vidioc_try_fmt_meta_cap)) ||
 		    (is_tx && (ops->vidioc_try_fmt_vid_out ||
 			       ops->vidioc_try_fmt_vid_out_mplane ||
-			       ops->vidioc_try_fmt_vid_out_overlay)))
+			       ops->vidioc_try_fmt_vid_out_overlay ||
+			       ops->vidioc_try_fmt_meta_out)))
 			 set_bit(_IOC_NR(VIDIOC_TRY_FMT), valid_ioctls);
 		SET_VALID_IOCTL(ops, VIDIOC_OVERLAY, vidioc_overlay);
 		SET_VALID_IOCTL(ops, VIDIOC_G_FBUF, vidioc_g_fbuf);
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index ef58bc31bfde..1441a73ce64c 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -194,6 +194,7 @@ const char *v4l2_type_names[] = {
 	[V4L2_BUF_TYPE_SDR_CAPTURE]        = "sdr-cap",
 	[V4L2_BUF_TYPE_SDR_OUTPUT]         = "sdr-out",
 	[V4L2_BUF_TYPE_META_CAPTURE]       = "meta-cap",
+	[V4L2_BUF_TYPE_META_OUTPUT]	   = "meta-out",
 };
 EXPORT_SYMBOL(v4l2_type_names);
 
@@ -366,6 +367,7 @@ static void v4l_print_format(const void *arg, bool write_only)
 			(sdr->pixelformat >> 24) & 0xff);
 		break;
 	case V4L2_BUF_TYPE_META_CAPTURE:
+	case V4L2_BUF_TYPE_META_OUTPUT:
 		meta = &p->fmt.meta;
 		pr_cont(", dataformat=%c%c%c%c, buffersize=%u\n",
 			(meta->dataformat >>  0) & 0xff,
@@ -999,6 +1001,10 @@ static int check_fmt(struct file *file, enum v4l2_buf_type type)
 		if (is_vid && is_rx && ops->vidioc_g_fmt_meta_cap)
 			return 0;
 		break;
+	case V4L2_BUF_TYPE_META_OUTPUT:
+		if (is_vid && is_tx && ops->vidioc_g_fmt_meta_out)
+			return 0;
+		break;
 	default:
 		break;
 	}
@@ -1410,6 +1416,11 @@ static int v4l_enum_fmt(const struct v4l2_ioctl_ops *ops,
 			break;
 		ret = ops->vidioc_enum_fmt_meta_cap(file, fh, arg);
 		break;
+	case V4L2_BUF_TYPE_META_OUTPUT:
+		if (unlikely(!ops->vidioc_enum_fmt_meta_out))
+			break;
+		ret = ops->vidioc_enum_fmt_meta_out(file, fh, arg);
+		break;
 	}
 	if (ret == 0)
 		v4l_fill_fmtdesc(p);
@@ -1488,6 +1499,8 @@ static int v4l_g_fmt(const struct v4l2_ioctl_ops *ops,
 		return ops->vidioc_g_fmt_sdr_out(file, fh, arg);
 	case V4L2_BUF_TYPE_META_CAPTURE:
 		return ops->vidioc_g_fmt_meta_cap(file, fh, arg);
+	case V4L2_BUF_TYPE_META_OUTPUT:
+		return ops->vidioc_g_fmt_meta_out(file, fh, arg);
 	}
 	return -EINVAL;
 }
@@ -1601,6 +1614,11 @@ static int v4l_s_fmt(const struct v4l2_ioctl_ops *ops,
 			break;
 		CLEAR_AFTER_FIELD(p, fmt.meta);
 		return ops->vidioc_s_fmt_meta_cap(file, fh, arg);
+	case V4L2_BUF_TYPE_META_OUTPUT:
+		if (unlikely(!ops->vidioc_s_fmt_meta_out))
+			break;
+		CLEAR_AFTER_FIELD(p, fmt.meta);
+		return ops->vidioc_s_fmt_meta_out(file, fh, arg);
 	}
 	return -EINVAL;
 }
@@ -1693,6 +1711,11 @@ static int v4l_try_fmt(const struct v4l2_ioctl_ops *ops,
 			break;
 		CLEAR_AFTER_FIELD(p, fmt.meta);
 		return ops->vidioc_try_fmt_meta_cap(file, fh, arg);
+	case V4L2_BUF_TYPE_META_OUTPUT:
+		if (unlikely(!ops->vidioc_try_fmt_meta_out))
+			break;
+		CLEAR_AFTER_FIELD(p, fmt.meta);
+		return ops->vidioc_try_fmt_meta_out(file, fh, arg);
 	}
 	return -EINVAL;
 }
diff --git a/include/media/v4l2-ioctl.h b/include/media/v4l2-ioctl.h
index aa4511aa5ffc..8533ece5026e 100644
--- a/include/media/v4l2-ioctl.h
+++ b/include/media/v4l2-ioctl.h
@@ -48,6 +48,9 @@ struct v4l2_fh;
  * @vidioc_enum_fmt_meta_cap: pointer to the function that implements
  *	:ref:`VIDIOC_ENUM_FMT <vidioc_enum_fmt>` ioctl logic
  *	for metadata capture
+ * @vidioc_enum_fmt_meta_out: pointer to the function that implements
+ *	:ref:`VIDIOC_ENUM_FMT <vidioc_enum_fmt>` ioctl logic
+ *	for metadata output
  * @vidioc_g_fmt_vid_cap: pointer to the function that implements
  *	:ref:`VIDIOC_G_FMT <vidioc_g_fmt>` ioctl logic for video capture
  *	in single plane mode
@@ -80,6 +83,8 @@ struct v4l2_fh;
  *	Radio output
  * @vidioc_g_fmt_meta_cap: pointer to the function that implements
  *	:ref:`VIDIOC_G_FMT <vidioc_g_fmt>` ioctl logic for metadata capture
+ * @vidioc_g_fmt_meta_out: pointer to the function that implements
+ *	:ref:`VIDIOC_G_FMT <vidioc_g_fmt>` ioctl logic for metadata output
  * @vidioc_s_fmt_vid_cap: pointer to the function that implements
  *	:ref:`VIDIOC_S_FMT <vidioc_g_fmt>` ioctl logic for video capture
  *	in single plane mode
@@ -112,6 +117,8 @@ struct v4l2_fh;
  *	Radio output
  * @vidioc_s_fmt_meta_cap: pointer to the function that implements
  *	:ref:`VIDIOC_S_FMT <vidioc_g_fmt>` ioctl logic for metadata capture
+ * @vidioc_s_fmt_meta_out: pointer to the function that implements
+ *	:ref:`VIDIOC_S_FMT <vidioc_g_fmt>` ioctl logic for metadata output
  * @vidioc_try_fmt_vid_cap: pointer to the function that implements
  *	:ref:`VIDIOC_TRY_FMT <vidioc_g_fmt>` ioctl logic for video capture
  *	in single plane mode
@@ -146,6 +153,8 @@ struct v4l2_fh;
  *	Radio output
  * @vidioc_try_fmt_meta_cap: pointer to the function that implements
  *	:ref:`VIDIOC_TRY_FMT <vidioc_g_fmt>` ioctl logic for metadata capture
+ * @vidioc_try_fmt_meta_out: pointer to the function that implements
+ *	:ref:`VIDIOC_TRY_FMT <vidioc_g_fmt>` ioctl logic for metadata output
  * @vidioc_reqbufs: pointer to the function that implements
  *	:ref:`VIDIOC_REQBUFS <vidioc_reqbufs>` ioctl
  * @vidioc_querybuf: pointer to the function that implements
@@ -314,6 +323,8 @@ struct v4l2_ioctl_ops {
 				       struct v4l2_fmtdesc *f);
 	int (*vidioc_enum_fmt_meta_cap)(struct file *file, void *fh,
 					struct v4l2_fmtdesc *f);
+	int (*vidioc_enum_fmt_meta_out)(struct file *file, void *fh,
+					struct v4l2_fmtdesc *f);
 
 	/* VIDIOC_G_FMT handlers */
 	int (*vidioc_g_fmt_vid_cap)(struct file *file, void *fh,
@@ -342,6 +353,8 @@ struct v4l2_ioctl_ops {
 				    struct v4l2_format *f);
 	int (*vidioc_g_fmt_meta_cap)(struct file *file, void *fh,
 				     struct v4l2_format *f);
+	int (*vidioc_g_fmt_meta_out)(struct file *file, void *fh,
+				     struct v4l2_format *f);
 
 	/* VIDIOC_S_FMT handlers */
 	int (*vidioc_s_fmt_vid_cap)(struct file *file, void *fh,
@@ -370,6 +383,8 @@ struct v4l2_ioctl_ops {
 				    struct v4l2_format *f);
 	int (*vidioc_s_fmt_meta_cap)(struct file *file, void *fh,
 				     struct v4l2_format *f);
+	int (*vidioc_s_fmt_meta_out)(struct file *file, void *fh,
+				     struct v4l2_format *f);
 
 	/* VIDIOC_TRY_FMT handlers */
 	int (*vidioc_try_fmt_vid_cap)(struct file *file, void *fh,
@@ -398,6 +413,8 @@ struct v4l2_ioctl_ops {
 				      struct v4l2_format *f);
 	int (*vidioc_try_fmt_meta_cap)(struct file *file, void *fh,
 				       struct v4l2_format *f);
+	int (*vidioc_try_fmt_meta_out)(struct file *file, void *fh,
+				       struct v4l2_format *f);
 
 	/* Buffer handlers */
 	int (*vidioc_reqbufs)(struct file *file, void *fh,
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 2db1635de956..a9d47b1b9437 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -145,6 +145,7 @@ enum v4l2_buf_type {
 	V4L2_BUF_TYPE_SDR_CAPTURE          = 11,
 	V4L2_BUF_TYPE_SDR_OUTPUT           = 12,
 	V4L2_BUF_TYPE_META_CAPTURE         = 13,
+	V4L2_BUF_TYPE_META_OUTPUT	   = 14,
 	/* Deprecated, do not use */
 	V4L2_BUF_TYPE_PRIVATE              = 0x80,
 };
@@ -469,6 +470,7 @@ struct v4l2_capability {
 #define V4L2_CAP_READWRITE              0x01000000  /* read/write systemcalls */
 #define V4L2_CAP_ASYNCIO                0x02000000  /* async I/O */
 #define V4L2_CAP_STREAMING              0x04000000  /* streaming I/O ioctls */
+#define V4L2_CAP_META_OUTPUT		0x08000000  /* Is a metadata output device */
 
 #define V4L2_CAP_TOUCH                  0x10000000  /* Is a touch device */
 
-- 
cgit v1.2.3-59-g8ed1b


From 2a31b9db153530df4aa02dac8c32837bf5f47019 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 23 Oct 2018 02:36:47 +0200
Subject: kvm: introduce manual dirty log reprotect

There are two problems with KVM_GET_DIRTY_LOG.  First, and less important,
it can take kvm->mmu_lock for an extended period of time.  Second, its user
can actually see many false positives in some cases.  The latter is due
to a benign race like this:

  1. KVM_GET_DIRTY_LOG returns a set of dirty pages and write protects
     them.
  2. The guest modifies the pages, causing them to be marked ditry.
  3. Userspace actually copies the pages.
  4. KVM_GET_DIRTY_LOG returns those pages as dirty again, even though
     they were not written to since (3).

This is especially a problem for large guests, where the time between
(1) and (3) can be substantial.  This patch introduces a new
capability which, when enabled, makes KVM_GET_DIRTY_LOG not
write-protect the pages it returns.  Instead, userspace has to
explicitly clear the dirty log bits just before using the content
of the page.  The new KVM_CLEAR_DIRTY_LOG ioctl can also operate on a
64-page granularity rather than requiring to sync a full memslot;
this way, the mmu_lock is taken for small amounts of time, and
only a small amount of time will pass between write protection
of pages and the sending of their content.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virtual/kvm/api.txt                  |  67 +++++++++++
 arch/mips/kvm/mips.c                               |  23 ++++
 arch/x86/kvm/x86.c                                 |  27 +++++
 include/linux/kvm_host.h                           |   5 +
 include/uapi/linux/kvm.h                           |  15 +++
 tools/testing/selftests/kvm/Makefile               |   2 +
 tools/testing/selftests/kvm/clear_dirty_log_test.c |   2 +
 tools/testing/selftests/kvm/dirty_log_test.c       |  19 +++
 tools/testing/selftests/kvm/include/kvm_util.h     |   2 +
 tools/testing/selftests/kvm/lib/kvm_util.c         |  13 ++
 virt/kvm/arm/arm.c                                 |  16 +++
 virt/kvm/kvm_main.c                                | 132 ++++++++++++++++++---
 12 files changed, 306 insertions(+), 17 deletions(-)
 create mode 100644 tools/testing/selftests/kvm/clear_dirty_log_test.c

(limited to 'include/uapi')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 1071c10cf1c7..f2c345f7b630 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -305,6 +305,9 @@ the address space for which you want to return the dirty bitmap.
 They must be less than the value that KVM_CHECK_EXTENSION returns for
 the KVM_CAP_MULTI_ADDRESS_SPACE capability.
 
+The bits in the dirty bitmap are cleared before the ioctl returns, unless
+KVM_CAP_MANUAL_DIRTY_LOG_PROTECT is enabled.  For more information,
+see the description of the capability.
 
 4.9 KVM_SET_MEMORY_ALIAS
 
@@ -3758,6 +3761,46 @@ Coalesced pio is based on coalesced mmio. There is little difference
 between coalesced mmio and pio except that coalesced pio records accesses
 to I/O ports.
 
+4.117 KVM_CLEAR_DIRTY_LOG (vm ioctl)
+
+Capability: KVM_CAP_MANUAL_DIRTY_LOG_PROTECT
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_dirty_log (in)
+Returns: 0 on success, -1 on error
+
+/* for KVM_CLEAR_DIRTY_LOG */
+struct kvm_clear_dirty_log {
+	__u32 slot;
+	__u32 num_pages;
+	__u64 first_page;
+	union {
+		void __user *dirty_bitmap; /* one bit per page */
+		__u64 padding;
+	};
+};
+
+The ioctl clears the dirty status of pages in a memory slot, according to
+the bitmap that is passed in struct kvm_clear_dirty_log's dirty_bitmap
+field.  Bit 0 of the bitmap corresponds to page "first_page" in the
+memory slot, and num_pages is the size in bits of the input bitmap.
+Both first_page and num_pages must be a multiple of 64.  For each bit
+that is set in the input bitmap, the corresponding page is marked "clean"
+in KVM's dirty bitmap, and dirty tracking is re-enabled for that page
+(for example via write-protection, or by clearing the dirty bit in
+a page table entry).
+
+If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 specifies
+the address space for which you want to return the dirty bitmap.
+They must be less than the value that KVM_CHECK_EXTENSION returns for
+the KVM_CAP_MULTI_ADDRESS_SPACE capability.
+
+This ioctl is mostly useful when KVM_CAP_MANUAL_DIRTY_LOG_PROTECT
+is enabled; for more information, see the description of the capability.
+However, it can always be used as long as KVM_CHECK_EXTENSION confirms
+that KVM_CAP_MANUAL_DIRTY_LOG_PROTECT is present.
+
+
 5. The kvm_run structure
 ------------------------
 
@@ -4652,6 +4695,30 @@ and injected exceptions.
 * For the new DR6 bits, note that bit 16 is set iff the #DB exception
   will clear DR6.RTM.
 
+7.18 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT
+
+Architectures: all
+Parameters: args[0] whether feature should be enabled or not
+
+With this capability enabled, KVM_GET_DIRTY_LOG will not automatically
+clear and write-protect all pages that are returned as dirty.
+Rather, userspace will have to do this operation separately using
+KVM_CLEAR_DIRTY_LOG.
+
+At the cost of a slightly more complicated operation, this provides better
+scalability and responsiveness for two reasons.  First,
+KVM_CLEAR_DIRTY_LOG ioctl can operate on a 64-page granularity rather
+than requiring to sync a full memslot; this ensures that KVM does not
+take spinlocks for an extended period of time.  Second, in some cases a
+large amount of time can pass between a call to KVM_GET_DIRTY_LOG and
+userspace actually using the data in the page.  Pages can be modified
+during this time, which is inefficint for both the guest and userspace:
+the guest will incur a higher penalty due to write protection faults,
+while userspace can see false reports of dirty pages.  Manual reprotection
+helps reducing this time, improving guest performance and reducing the
+number of dirty log false positives.
+
+
 8. Other capabilities.
 ----------------------
 
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 3898e657952e..3734cd58895e 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -1023,6 +1023,29 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 	return r;
 }
 
+int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log)
+{
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+	bool flush = false;
+	int r;
+
+	mutex_lock(&kvm->slots_lock);
+
+	r = kvm_clear_dirty_log_protect(kvm, log, &flush);
+
+	if (flush) {
+		slots = kvm_memslots(kvm);
+		memslot = id_to_memslot(slots, log->slot);
+
+		/* Let implementation handle TLB/GVA invalidation */
+		kvm_mips_callbacks->flush_shadow_memslot(kvm, memslot);
+	}
+
+	mutex_unlock(&kvm->slots_lock);
+	return r;
+}
+
 long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
 {
 	long r;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 448f011aa317..6af846c54660 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4418,6 +4418,33 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 	return r;
 }
 
+int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log)
+{
+	bool flush = false;
+	int r;
+
+	mutex_lock(&kvm->slots_lock);
+
+	/*
+	 * Flush potentially hardware-cached dirty pages to dirty_bitmap.
+	 */
+	if (kvm_x86_ops->flush_log_dirty)
+		kvm_x86_ops->flush_log_dirty(kvm);
+
+	r = kvm_clear_dirty_log_protect(kvm, log, &flush);
+
+	/*
+	 * All the TLBs can be flushed out of mmu lock, see the comments in
+	 * kvm_mmu_slot_remove_write_access().
+	 */
+	lockdep_assert_held(&kvm->slots_lock);
+	if (flush)
+		kvm_flush_remote_tlbs(kvm);
+
+	mutex_unlock(&kvm->slots_lock);
+	return r;
+}
+
 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
 			bool line_status)
 {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8c56b2873b13..e065aeaae29e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -449,6 +449,7 @@ struct kvm {
 #endif
 	long tlbs_dirty;
 	struct list_head devices;
+	bool manual_dirty_log_protect;
 	struct dentry *debugfs_dentry;
 	struct kvm_stat_data **debugfs_stat_data;
 	struct srcu_struct srcu;
@@ -754,6 +755,8 @@ int kvm_get_dirty_log(struct kvm *kvm,
 
 int kvm_get_dirty_log_protect(struct kvm *kvm,
 			      struct kvm_dirty_log *log, bool *flush);
+int kvm_clear_dirty_log_protect(struct kvm *kvm,
+				struct kvm_clear_dirty_log *log, bool *flush);
 
 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
 					struct kvm_memory_slot *slot,
@@ -762,6 +765,8 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
 
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 				struct kvm_dirty_log *log);
+int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
+				  struct kvm_clear_dirty_log *log);
 
 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
 			bool line_status);
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 2b7a652c9fa4..9fe35f1ac938 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -492,6 +492,17 @@ struct kvm_dirty_log {
 	};
 };
 
+/* for KVM_CLEAR_DIRTY_LOG */
+struct kvm_clear_dirty_log {
+	__u32 slot;
+	__u32 num_pages;
+	__u64 first_page;
+	union {
+		void __user *dirty_bitmap; /* one bit per page */
+		__u64 padding2;
+	};
+};
+
 /* for KVM_SET_SIGNAL_MASK */
 struct kvm_signal_mask {
 	__u32 len;
@@ -975,6 +986,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_HYPERV_ENLIGHTENED_VMCS 163
 #define KVM_CAP_EXCEPTION_PAYLOAD 164
 #define KVM_CAP_ARM_VM_IPA_SIZE 165
+#define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT 166
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1421,6 +1433,9 @@ struct kvm_enc_region {
 #define KVM_GET_NESTED_STATE         _IOWR(KVMIO, 0xbe, struct kvm_nested_state)
 #define KVM_SET_NESTED_STATE         _IOW(KVMIO,  0xbf, struct kvm_nested_state)
 
+/* Available with KVM_CAP_MANUAL_DIRTY_LOG_PROTECT */
+#define KVM_CLEAR_DIRTY_LOG          _IOWR(KVMIO, 0xc0, struct kvm_clear_dirty_log)
+
 /* Secure Encrypted Virtualization command */
 enum sev_cmd_id {
 	/* Guest initialization commands */
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 52bfe5e76907..caaa0d5eba92 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -16,8 +16,10 @@ TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test
 TEST_GEN_PROGS_x86_64 += x86_64/state_test
 TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test
 TEST_GEN_PROGS_x86_64 += dirty_log_test
+TEST_GEN_PROGS_x86_64 += clear_dirty_log_test
 
 TEST_GEN_PROGS_aarch64 += dirty_log_test
+TEST_GEN_PROGS_aarch64 += clear_dirty_log_test
 
 TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M))
 LIBKVM += $(LIBKVM_$(UNAME_M))
diff --git a/tools/testing/selftests/kvm/clear_dirty_log_test.c b/tools/testing/selftests/kvm/clear_dirty_log_test.c
new file mode 100644
index 000000000000..749336937d37
--- /dev/null
+++ b/tools/testing/selftests/kvm/clear_dirty_log_test.c
@@ -0,0 +1,2 @@
+#define USE_CLEAR_DIRTY_LOG
+#include "dirty_log_test.c"
diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c
index aeff95a91b15..4629c7ccfa28 100644
--- a/tools/testing/selftests/kvm/dirty_log_test.c
+++ b/tools/testing/selftests/kvm/dirty_log_test.c
@@ -275,6 +275,14 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations,
 
 	vm = create_vm(mode, VCPU_ID, guest_num_pages, guest_code);
 
+#ifdef USE_CLEAR_DIRTY_LOG
+	struct kvm_enable_cap cap = {};
+
+	cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT;
+	cap.args[0] = 1;
+	vm_enable_cap(vm, &cap);
+#endif
+
 	/* Add an extra memory slot for testing dirty logging */
 	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
 				    guest_test_mem,
@@ -316,6 +324,10 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations,
 		/* Give the vcpu thread some time to dirty some pages */
 		usleep(interval * 1000);
 		kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap);
+#ifdef USE_CLEAR_DIRTY_LOG
+		kvm_vm_clear_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap, 0,
+				       DIV_ROUND_UP(host_num_pages, 64) * 64);
+#endif
 		vm_dirty_log_verify(bmap);
 		iteration++;
 		sync_global_to_guest(vm, iteration);
@@ -392,6 +404,13 @@ int main(int argc, char *argv[])
 	unsigned int mode;
 	int opt, i;
 
+#ifdef USE_CLEAR_DIRTY_LOG
+	if (!kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT)) {
+		fprintf(stderr, "KVM_CLEAR_DIRTY_LOG not available, skipping tests\n");
+		exit(KSFT_SKIP);
+	}
+#endif
+
 	while ((opt = getopt(argc, argv, "hi:I:o:tm:")) != -1) {
 		switch (opt) {
 		case 'i':
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index a4e59e3b4826..c51bfaba017a 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -58,6 +58,8 @@ void kvm_vm_free(struct kvm_vm *vmp);
 void kvm_vm_restart(struct kvm_vm *vmp, int perm);
 void kvm_vm_release(struct kvm_vm *vmp);
 void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log);
+void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log,
+			    uint64_t first_page, uint32_t num_pages);
 
 int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, const vm_vaddr_t gva,
 		       size_t len);
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 1b41e71283d5..c9e94d6503af 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -231,6 +231,19 @@ void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log)
 		    strerror(-ret));
 }
 
+void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log,
+			    uint64_t first_page, uint32_t num_pages)
+{
+	struct kvm_clear_dirty_log args = { .dirty_bitmap = log, .slot = slot,
+		                            .first_page = first_page,
+	                                    .num_pages = num_pages };
+	int ret;
+
+	ret = ioctl(vm->fd, KVM_CLEAR_DIRTY_LOG, &args);
+	TEST_ASSERT(ret == 0, "%s: KVM_CLEAR_DIRTY_LOG failed: %s",
+		    strerror(-ret));
+}
+
 /*
  * Userspace Memory Region Find
  *
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 120a2663dab9..e91adf77d99a 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -1219,6 +1219,22 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 	return r;
 }
 
+int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log)
+{
+	bool flush = false;
+	int r;
+
+	mutex_lock(&kvm->slots_lock);
+
+	r = kvm_clear_dirty_log_protect(kvm, log, &flush);
+
+	if (flush)
+		kvm_flush_remote_tlbs(kvm);
+
+	mutex_unlock(&kvm->slots_lock);
+	return r;
+}
+
 static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
 					struct kvm_arm_device_addr *dev_addr)
 {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 54f0fcfd431e..0041947b7390 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1133,7 +1133,7 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
 /**
  * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages
- *	are dirty write protect them for next write.
+ *	and reenable dirty page tracking for the corresponding pages.
  * @kvm:	pointer to kvm instance
  * @log:	slot id and address to which we copy the log
  * @is_dirty:	flag set if any page is dirty
@@ -1176,37 +1176,114 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
 		return -ENOENT;
 
 	n = kvm_dirty_bitmap_bytes(memslot);
+	*flush = false;
+	if (kvm->manual_dirty_log_protect) {
+		/*
+		 * Unlike kvm_get_dirty_log, we always return false in *flush,
+		 * because no flush is needed until KVM_CLEAR_DIRTY_LOG.  There
+		 * is some code duplication between this function and
+		 * kvm_get_dirty_log, but hopefully all architecture
+		 * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log
+		 * can be eliminated.
+		 */
+		dirty_bitmap_buffer = dirty_bitmap;
+	} else {
+		dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
+		memset(dirty_bitmap_buffer, 0, n);
 
-	dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
-	memset(dirty_bitmap_buffer, 0, n);
+		spin_lock(&kvm->mmu_lock);
+		for (i = 0; i < n / sizeof(long); i++) {
+			unsigned long mask;
+			gfn_t offset;
 
-	spin_lock(&kvm->mmu_lock);
+			if (!dirty_bitmap[i])
+				continue;
+
+			*flush = true;
+			mask = xchg(&dirty_bitmap[i], 0);
+			dirty_bitmap_buffer[i] = mask;
+
+			if (mask) {
+				offset = i * BITS_PER_LONG;
+				kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
+									offset, mask);
+			}
+		}
+		spin_unlock(&kvm->mmu_lock);
+	}
+
+	if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
+		return -EFAULT;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
+
+/**
+ * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
+ *	and reenable dirty page tracking for the corresponding pages.
+ * @kvm:	pointer to kvm instance
+ * @log:	slot id and address from which to fetch the bitmap of dirty pages
+ */
+int kvm_clear_dirty_log_protect(struct kvm *kvm,
+				struct kvm_clear_dirty_log *log, bool *flush)
+{
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+	int as_id, id, n;
+	gfn_t offset;
+	unsigned long i;
+	unsigned long *dirty_bitmap;
+	unsigned long *dirty_bitmap_buffer;
+
+	as_id = log->slot >> 16;
+	id = (u16)log->slot;
+	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
+		return -EINVAL;
+
+	if ((log->first_page & 63) || (log->num_pages & 63))
+		return -EINVAL;
+
+	slots = __kvm_memslots(kvm, as_id);
+	memslot = id_to_memslot(slots, id);
+
+	dirty_bitmap = memslot->dirty_bitmap;
+	if (!dirty_bitmap)
+		return -ENOENT;
+
+	n = kvm_dirty_bitmap_bytes(memslot);
 	*flush = false;
-	for (i = 0; i < n / sizeof(long); i++) {
-		unsigned long mask;
-		gfn_t offset;
+	dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
+	if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
+		return -EFAULT;
 
-		if (!dirty_bitmap[i])
+	spin_lock(&kvm->mmu_lock);
+	for (offset = log->first_page,
+	     i = offset / BITS_PER_LONG, n = log->num_pages / BITS_PER_LONG; n--;
+	     i++, offset += BITS_PER_LONG) {
+		unsigned long mask = *dirty_bitmap_buffer++;
+		atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
+		if (!mask)
 			continue;
 
-		*flush = true;
-
-		mask = xchg(&dirty_bitmap[i], 0);
-		dirty_bitmap_buffer[i] = mask;
+		mask &= atomic_long_fetch_andnot(mask, p);
 
+		/*
+		 * mask contains the bits that really have been cleared.  This
+		 * never includes any bits beyond the length of the memslot (if
+		 * the length is not aligned to 64 pages), therefore it is not
+		 * a problem if userspace sets them in log->dirty_bitmap.
+		*/
 		if (mask) {
-			offset = i * BITS_PER_LONG;
+			*flush = true;
 			kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
 								offset, mask);
 		}
 	}
-
 	spin_unlock(&kvm->mmu_lock);
-	if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
-		return -EFAULT;
+
 	return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
+EXPORT_SYMBOL_GPL(kvm_clear_dirty_log_protect);
 #endif
 
 bool kvm_largepages_enabled(void)
@@ -2949,6 +3026,9 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 	case KVM_CAP_IOEVENTFD_ANY_LENGTH:
 	case KVM_CAP_CHECK_EXTENSION_VM:
 	case KVM_CAP_ENABLE_CAP_VM:
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+	case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT:
+#endif
 		return 1;
 #ifdef CONFIG_KVM_MMIO
 	case KVM_CAP_COALESCED_MMIO:
@@ -2982,6 +3062,13 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
 					   struct kvm_enable_cap *cap)
 {
 	switch (cap->cap) {
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+	case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT:
+		if (cap->flags || (cap->args[0] & ~1))
+			return -EINVAL;
+		kvm->manual_dirty_log_protect = cap->args[0];
+		return 0;
+#endif
 	default:
 		return kvm_vm_ioctl_enable_cap(kvm, cap);
 	}
@@ -3029,6 +3116,17 @@ static long kvm_vm_ioctl(struct file *filp,
 		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
 		break;
 	}
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+	case KVM_CLEAR_DIRTY_LOG: {
+		struct kvm_clear_dirty_log log;
+
+		r = -EFAULT;
+		if (copy_from_user(&log, argp, sizeof(log)))
+			goto out;
+		r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
+		break;
+	}
+#endif
 #ifdef CONFIG_KVM_MMIO
 	case KVM_REGISTER_COALESCED_MMIO: {
 		struct kvm_coalesced_mmio_zone zone;
-- 
cgit v1.2.3-59-g8ed1b


From 2bc39970e9327ceb06cb210f86ba35f81d00e350 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Mon, 10 Dec 2018 18:21:56 +0100
Subject: x86/kvm/hyper-v: Introduce KVM_GET_SUPPORTED_HV_CPUID

With every new Hyper-V Enlightenment we implement we're forced to add a
KVM_CAP_HYPERV_* capability. While this approach works it is fairly
inconvenient: the majority of the enlightenments we do have corresponding
CPUID feature bit(s) and userspace has to know this anyways to be able to
expose the feature to the guest.

Add KVM_GET_SUPPORTED_HV_CPUID ioctl (backed by KVM_CAP_HYPERV_CPUID, "one
cap to rule them all!") returning all Hyper-V CPUID feature leaves.

Using the existing KVM_GET_SUPPORTED_CPUID doesn't seem to be possible:
Hyper-V CPUID feature leaves intersect with KVM's (e.g. 0x40000000,
0x40000001) and we would probably confuse userspace in case we decide to
return these twice.

KVM_CAP_HYPERV_CPUID's number is interim: we're intended to drop
KVM_CAP_HYPERV_STIMER_DIRECT and use its number instead.

Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virtual/kvm/api.txt |  56 ++++++++++++++++++
 arch/x86/kvm/hyperv.c             | 121 ++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/hyperv.h             |   2 +
 arch/x86/kvm/x86.c                |  20 +++++++
 include/uapi/linux/kvm.h          |   4 ++
 5 files changed, 203 insertions(+)

(limited to 'include/uapi')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index f2c345f7b630..356156f5c52d 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3800,6 +3800,62 @@ is enabled; for more information, see the description of the capability.
 However, it can always be used as long as KVM_CHECK_EXTENSION confirms
 that KVM_CAP_MANUAL_DIRTY_LOG_PROTECT is present.
 
+4.118 KVM_GET_SUPPORTED_HV_CPUID
+
+Capability: KVM_CAP_HYPERV_CPUID
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_cpuid2 (in/out)
+Returns: 0 on success, -1 on error
+
+struct kvm_cpuid2 {
+	__u32 nent;
+	__u32 padding;
+	struct kvm_cpuid_entry2 entries[0];
+};
+
+struct kvm_cpuid_entry2 {
+	__u32 function;
+	__u32 index;
+	__u32 flags;
+	__u32 eax;
+	__u32 ebx;
+	__u32 ecx;
+	__u32 edx;
+	__u32 padding[3];
+};
+
+This ioctl returns x86 cpuid features leaves related to Hyper-V emulation in
+KVM.  Userspace can use the information returned by this ioctl to construct
+cpuid information presented to guests consuming Hyper-V enlightenments (e.g.
+Windows or Hyper-V guests).
+
+CPUID feature leaves returned by this ioctl are defined by Hyper-V Top Level
+Functional Specification (TLFS). These leaves can't be obtained with
+KVM_GET_SUPPORTED_CPUID ioctl because some of them intersect with KVM feature
+leaves (0x40000000, 0x40000001).
+
+Currently, the following list of CPUID leaves are returned:
+ HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS
+ HYPERV_CPUID_INTERFACE
+ HYPERV_CPUID_VERSION
+ HYPERV_CPUID_FEATURES
+ HYPERV_CPUID_ENLIGHTMENT_INFO
+ HYPERV_CPUID_IMPLEMENT_LIMITS
+ HYPERV_CPUID_NESTED_FEATURES
+
+HYPERV_CPUID_NESTED_FEATURES leaf is only exposed when Enlightened VMCS was
+enabled on the corresponding vCPU (KVM_CAP_HYPERV_ENLIGHTENED_VMCS).
+
+Userspace invokes KVM_GET_SUPPORTED_CPUID by passing a kvm_cpuid2 structure
+with the 'nent' field indicating the number of entries in the variable-size
+array 'entries'.  If the number of entries is too low to describe all Hyper-V
+feature leaves, an error (E2BIG) is returned. If the number is more or equal
+to the number of Hyper-V feature leaves, the 'nent' field is adjusted to the
+number of valid entries in the 'entries' array, which is then filled.
+
+'index' and 'flags' fields in 'struct kvm_cpuid_entry2' are currently reserved,
+userspace should not expect to get any particular value there.
 
 5. The kvm_run structure
 ------------------------
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index a9b5d6c4e3eb..e3c076020d4e 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1752,3 +1752,124 @@ int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args)
 		return kvm_hv_eventfd_deassign(kvm, args->conn_id);
 	return kvm_hv_eventfd_assign(kvm, args->conn_id, args->fd);
 }
+
+int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
+				struct kvm_cpuid_entry2 __user *entries)
+{
+	uint16_t evmcs_ver = kvm_x86_ops->nested_get_evmcs_version(vcpu);
+	struct kvm_cpuid_entry2 cpuid_entries[] = {
+		{ .function = HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS },
+		{ .function = HYPERV_CPUID_INTERFACE },
+		{ .function = HYPERV_CPUID_VERSION },
+		{ .function = HYPERV_CPUID_FEATURES },
+		{ .function = HYPERV_CPUID_ENLIGHTMENT_INFO },
+		{ .function = HYPERV_CPUID_IMPLEMENT_LIMITS },
+		{ .function = HYPERV_CPUID_NESTED_FEATURES },
+	};
+	int i, nent = ARRAY_SIZE(cpuid_entries);
+
+	/* Skip NESTED_FEATURES if eVMCS is not supported */
+	if (!evmcs_ver)
+		--nent;
+
+	if (cpuid->nent < nent)
+		return -E2BIG;
+
+	if (cpuid->nent > nent)
+		cpuid->nent = nent;
+
+	for (i = 0; i < nent; i++) {
+		struct kvm_cpuid_entry2 *ent = &cpuid_entries[i];
+		u32 signature[3];
+
+		switch (ent->function) {
+		case HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS:
+			memcpy(signature, "Linux KVM Hv", 12);
+
+			ent->eax = HYPERV_CPUID_NESTED_FEATURES;
+			ent->ebx = signature[0];
+			ent->ecx = signature[1];
+			ent->edx = signature[2];
+			break;
+
+		case HYPERV_CPUID_INTERFACE:
+			memcpy(signature, "Hv#1\0\0\0\0\0\0\0\0", 12);
+			ent->eax = signature[0];
+			break;
+
+		case HYPERV_CPUID_VERSION:
+			/*
+			 * We implement some Hyper-V 2016 functions so let's use
+			 * this version.
+			 */
+			ent->eax = 0x00003839;
+			ent->ebx = 0x000A0000;
+			break;
+
+		case HYPERV_CPUID_FEATURES:
+			ent->eax |= HV_X64_MSR_VP_RUNTIME_AVAILABLE;
+			ent->eax |= HV_MSR_TIME_REF_COUNT_AVAILABLE;
+			ent->eax |= HV_X64_MSR_SYNIC_AVAILABLE;
+			ent->eax |= HV_MSR_SYNTIMER_AVAILABLE;
+			ent->eax |= HV_X64_MSR_APIC_ACCESS_AVAILABLE;
+			ent->eax |= HV_X64_MSR_HYPERCALL_AVAILABLE;
+			ent->eax |= HV_X64_MSR_VP_INDEX_AVAILABLE;
+			ent->eax |= HV_X64_MSR_RESET_AVAILABLE;
+			ent->eax |= HV_MSR_REFERENCE_TSC_AVAILABLE;
+			ent->eax |= HV_X64_MSR_GUEST_IDLE_AVAILABLE;
+			ent->eax |= HV_X64_ACCESS_FREQUENCY_MSRS;
+			ent->eax |= HV_X64_ACCESS_REENLIGHTENMENT;
+
+			ent->ebx |= HV_X64_POST_MESSAGES;
+			ent->ebx |= HV_X64_SIGNAL_EVENTS;
+
+			ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE;
+			ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
+			ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE;
+
+			break;
+
+		case HYPERV_CPUID_ENLIGHTMENT_INFO:
+			ent->eax |= HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED;
+			ent->eax |= HV_X64_APIC_ACCESS_RECOMMENDED;
+			ent->eax |= HV_X64_SYSTEM_RESET_RECOMMENDED;
+			ent->eax |= HV_X64_RELAXED_TIMING_RECOMMENDED;
+			ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED;
+			ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED;
+			ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
+
+			/*
+			 * Default number of spinlock retry attempts, matches
+			 * HyperV 2016.
+			 */
+			ent->ebx = 0x00000FFF;
+
+			break;
+
+		case HYPERV_CPUID_IMPLEMENT_LIMITS:
+			/* Maximum number of virtual processors */
+			ent->eax = KVM_MAX_VCPUS;
+			/*
+			 * Maximum number of logical processors, matches
+			 * HyperV 2016.
+			 */
+			ent->ebx = 64;
+
+			break;
+
+		case HYPERV_CPUID_NESTED_FEATURES:
+			ent->eax = evmcs_ver;
+
+			break;
+
+		default:
+			break;
+		}
+	}
+
+	if (copy_to_user(entries, cpuid_entries,
+			 nent * sizeof(struct kvm_cpuid_entry2)))
+		return -EFAULT;
+
+	return 0;
+}
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 9c21c3479899..fd7cf13a2144 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -97,5 +97,7 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
 void kvm_hv_init_vm(struct kvm *kvm);
 void kvm_hv_destroy_vm(struct kvm *kvm);
 int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args);
+int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
+				struct kvm_cpuid_entry2 __user *entries);
 
 #endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 54ef79421c6b..18b7817af2bc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2997,6 +2997,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_HYPERV_TLBFLUSH:
 	case KVM_CAP_HYPERV_SEND_IPI:
 	case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
+	case KVM_CAP_HYPERV_CPUID:
 	case KVM_CAP_PCI_SEGMENT:
 	case KVM_CAP_DEBUGREGS:
 	case KVM_CAP_X86_ROBUST_SINGLESTEP:
@@ -4191,6 +4192,25 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state);
 		break;
 	}
+	case KVM_GET_SUPPORTED_HV_CPUID: {
+		struct kvm_cpuid2 __user *cpuid_arg = argp;
+		struct kvm_cpuid2 cpuid;
+
+		r = -EFAULT;
+		if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
+			goto out;
+
+		r = kvm_vcpu_ioctl_get_hv_cpuid(vcpu, &cpuid,
+						cpuid_arg->entries);
+		if (r)
+			goto out;
+
+		r = -EFAULT;
+		if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
+			goto out;
+		r = 0;
+		break;
+	}
 	default:
 		r = -EINVAL;
 	}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 9fe35f1ac938..6d4ea4b6c922 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -987,6 +987,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_EXCEPTION_PAYLOAD 164
 #define KVM_CAP_ARM_VM_IPA_SIZE 165
 #define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT 166
+#define KVM_CAP_HYPERV_CPUID 167
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1436,6 +1437,9 @@ struct kvm_enc_region {
 /* Available with KVM_CAP_MANUAL_DIRTY_LOG_PROTECT */
 #define KVM_CLEAR_DIRTY_LOG          _IOWR(KVMIO, 0xc0, struct kvm_clear_dirty_log)
 
+/* Available with KVM_CAP_HYPERV_CPUID */
+#define KVM_GET_SUPPORTED_HV_CPUID _IOWR(KVMIO, 0xc1, struct kvm_cpuid2)
+
 /* Secure Encrypted Virtualization command */
 enum sev_cmd_id {
 	/* Guest initialization commands */
-- 
cgit v1.2.3-59-g8ed1b


From 17abc9ec68b73ddeb262a507a62421016b9c54d5 Mon Sep 17 00:00:00 2001
From: Tomasz Duszynski <tduszyns@gmail.com>
Date: Fri, 14 Dec 2018 19:28:01 +0100
Subject: iio: add IIO_MASSCONCENTRATION channel type

Measuring particulate matter in ug / m3 (micro-grams per cubic meter)
is de facto standard. Existing air quality sensors usually follow
this convention and are capable of returning measurements using
this unit.

IIO currently does not offer suitable channel type for this
type of measurements hence this patch adds this.

In addition, extra modifiers are introduced used for distinguishing
between fine pm1, pm2p5 and coarse pm4, pm10 particle measurements, i.e
IIO_MOD_PM1, IIO_MOD_PM25 and IIO_MOD_PM4, IIO_MOD_PM10.

pmX consists of particles with aerodynamic diameter less or equal to
X micrometers.

Signed-off-by: Tomasz Duszynski <tduszyns@gmail.com>
Acked-by: Matt Ranostay <matt.ranostay@konsulko.com>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 Documentation/ABI/testing/sysfs-bus-iio | 17 ++++++++++++++++-
 drivers/iio/industrialio-core.c         |  5 +++++
 include/uapi/linux/iio/types.h          |  5 +++++
 tools/iio/iio_event_monitor.c           | 10 ++++++++++
 4 files changed, 36 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio
index 8127a08e366d..67fd88bf7910 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio
+++ b/Documentation/ABI/testing/sysfs-bus-iio
@@ -1684,4 +1684,19 @@ KernelVersion:	4.18
 Contact:	linux-iio@vger.kernel.org
 Description:
 		Raw (unscaled) phase difference reading from channel Y
-		that can be processed to radians.
\ No newline at end of file
+		that can be processed to radians.
+
+What:		/sys/bus/iio/devices/iio:deviceX/in_massconcentration_pm1_input
+What:		/sys/bus/iio/devices/iio:deviceX/in_massconcentrationY_pm1_input
+What:		/sys/bus/iio/devices/iio:deviceX/in_massconcentration_pm2p5_input
+What:		/sys/bus/iio/devices/iio:deviceX/in_massconcentrationY_pm2p5_input
+What:		/sys/bus/iio/devices/iio:deviceX/in_massconcentration_pm4_input
+What:		/sys/bus/iio/devices/iio:deviceX/in_massconcentrationY_pm4_input
+What:		/sys/bus/iio/devices/iio:deviceX/in_massconcentration_pm10_input
+What:		/sys/bus/iio/devices/iio:deviceX/in_massconcentrationY_pm10_input
+KernelVersion:	4.22
+Contact:	linux-iio@vger.kernel.org
+Description:
+		Mass concentration reading of particulate matter in ug / m3.
+		pmX consists of particles with aerodynamic diameter less or
+		equal to X micrometers.
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index 4f5cd9f60870..4700fd5d8c90 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -87,6 +87,7 @@ static const char * const iio_chan_type_name_spec[] = {
 	[IIO_GRAVITY]  = "gravity",
 	[IIO_POSITIONRELATIVE]  = "positionrelative",
 	[IIO_PHASE] = "phase",
+	[IIO_MASSCONCENTRATION] = "massconcentration",
 };
 
 static const char * const iio_modifier_names[] = {
@@ -127,6 +128,10 @@ static const char * const iio_modifier_names[] = {
 	[IIO_MOD_Q] = "q",
 	[IIO_MOD_CO2] = "co2",
 	[IIO_MOD_VOC] = "voc",
+	[IIO_MOD_PM1] = "pm1",
+	[IIO_MOD_PM2P5] = "pm2p5",
+	[IIO_MOD_PM4] = "pm4",
+	[IIO_MOD_PM10] = "pm10",
 };
 
 /* relies on pairs of these shared then separate */
diff --git a/include/uapi/linux/iio/types.h b/include/uapi/linux/iio/types.h
index 92baabc103ac..c59adac24b1c 100644
--- a/include/uapi/linux/iio/types.h
+++ b/include/uapi/linux/iio/types.h
@@ -46,6 +46,7 @@ enum iio_chan_type {
 	IIO_GRAVITY,
 	IIO_POSITIONRELATIVE,
 	IIO_PHASE,
+	IIO_MASSCONCENTRATION,
 };
 
 enum iio_modifier {
@@ -87,6 +88,10 @@ enum iio_modifier {
 	IIO_MOD_VOC,
 	IIO_MOD_LIGHT_UV,
 	IIO_MOD_LIGHT_DUV,
+	IIO_MOD_PM1,
+	IIO_MOD_PM2P5,
+	IIO_MOD_PM4,
+	IIO_MOD_PM10,
 };
 
 enum iio_event_type {
diff --git a/tools/iio/iio_event_monitor.c b/tools/iio/iio_event_monitor.c
index ac2de6b7e89f..f6b8003fbe3c 100644
--- a/tools/iio/iio_event_monitor.c
+++ b/tools/iio/iio_event_monitor.c
@@ -60,6 +60,7 @@ static const char * const iio_chan_type_name_spec[] = {
 	[IIO_GRAVITY] = "gravity",
 	[IIO_POSITIONRELATIVE] = "positionrelative",
 	[IIO_PHASE] = "phase",
+	[IIO_MASSCONCENTRATION] = "massconcentration",
 };
 
 static const char * const iio_ev_type_text[] = {
@@ -115,6 +116,10 @@ static const char * const iio_modifier_names[] = {
 	[IIO_MOD_Q] = "q",
 	[IIO_MOD_CO2] = "co2",
 	[IIO_MOD_VOC] = "voc",
+	[IIO_MOD_PM1] = "pm1",
+	[IIO_MOD_PM2P5] = "pm2p5",
+	[IIO_MOD_PM4] = "pm4",
+	[IIO_MOD_PM10] = "pm10",
 };
 
 static bool event_is_known(struct iio_event_data *event)
@@ -156,6 +161,7 @@ static bool event_is_known(struct iio_event_data *event)
 	case IIO_GRAVITY:
 	case IIO_POSITIONRELATIVE:
 	case IIO_PHASE:
+	case IIO_MASSCONCENTRATION:
 		break;
 	default:
 		return false;
@@ -200,6 +206,10 @@ static bool event_is_known(struct iio_event_data *event)
 	case IIO_MOD_Q:
 	case IIO_MOD_CO2:
 	case IIO_MOD_VOC:
+	case IIO_MOD_PM1:
+	case IIO_MOD_PM2P5:
+	case IIO_MOD_PM4:
+	case IIO_MOD_PM10:
 		break;
 	default:
 		return false;
-- 
cgit v1.2.3-59-g8ed1b


From b170f7d48443d1ea3e4ffbf409025b5e5b1146fe Mon Sep 17 00:00:00 2001
From: Andreas Brauchli <a.brauchli@elementarea.net>
Date: Thu, 13 Dec 2018 15:43:22 +0100
Subject: iio: Add modifiers for ethanol and H2 gases

Add ethanol and H2 gas modifiers:
* IIO_MOD_ETHANOL
* IIO_MOD_H2

Signed-off-by: Andreas Brauchli <andreas.brauchli@sensirion.com>
Acked-by: Matt Ranostay <matt.ranostay@konsulko.com>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 Documentation/ABI/testing/sysfs-bus-iio | 4 ++++
 include/uapi/linux/iio/types.h          | 2 ++
 tools/iio/iio_event_monitor.c           | 4 ++++
 3 files changed, 10 insertions(+)

(limited to 'include/uapi')

diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio
index 67fd88bf7910..864f8efd12e5 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio
+++ b/Documentation/ABI/testing/sysfs-bus-iio
@@ -1554,6 +1554,10 @@ What:		/sys/bus/iio/devices/iio:deviceX/in_concentration_raw
 What:		/sys/bus/iio/devices/iio:deviceX/in_concentrationX_raw
 What:		/sys/bus/iio/devices/iio:deviceX/in_concentration_co2_raw
 What:		/sys/bus/iio/devices/iio:deviceX/in_concentrationX_co2_raw
+What:		/sys/bus/iio/devices/iio:deviceX/in_concentration_ethanol_raw
+What:		/sys/bus/iio/devices/iio:deviceX/in_concentrationX_ethanol_raw
+What:		/sys/bus/iio/devices/iio:deviceX/in_concentration_h2_raw
+What:		/sys/bus/iio/devices/iio:deviceX/in_concentrationX_h2_raw
 What:		/sys/bus/iio/devices/iio:deviceX/in_concentration_voc_raw
 What:		/sys/bus/iio/devices/iio:deviceX/in_concentrationX_voc_raw
 KernelVersion:	4.3
diff --git a/include/uapi/linux/iio/types.h b/include/uapi/linux/iio/types.h
index c59adac24b1c..fdd81affca4b 100644
--- a/include/uapi/linux/iio/types.h
+++ b/include/uapi/linux/iio/types.h
@@ -92,6 +92,8 @@ enum iio_modifier {
 	IIO_MOD_PM2P5,
 	IIO_MOD_PM4,
 	IIO_MOD_PM10,
+	IIO_MOD_ETHANOL,
+	IIO_MOD_H2,
 };
 
 enum iio_event_type {
diff --git a/tools/iio/iio_event_monitor.c b/tools/iio/iio_event_monitor.c
index f6b8003fbe3c..7bf9bde28bcc 100644
--- a/tools/iio/iio_event_monitor.c
+++ b/tools/iio/iio_event_monitor.c
@@ -115,6 +115,8 @@ static const char * const iio_modifier_names[] = {
 	[IIO_MOD_I] = "i",
 	[IIO_MOD_Q] = "q",
 	[IIO_MOD_CO2] = "co2",
+	[IIO_MOD_ETHANOL] = "ethanol",
+	[IIO_MOD_H2] = "h2",
 	[IIO_MOD_VOC] = "voc",
 	[IIO_MOD_PM1] = "pm1",
 	[IIO_MOD_PM2P5] = "pm2p5",
@@ -205,6 +207,8 @@ static bool event_is_known(struct iio_event_data *event)
 	case IIO_MOD_I:
 	case IIO_MOD_Q:
 	case IIO_MOD_CO2:
+	case IIO_MOD_ETHANOL:
+	case IIO_MOD_H2:
 	case IIO_MOD_VOC:
 	case IIO_MOD_PM1:
 	case IIO_MOD_PM2P5:
-- 
cgit v1.2.3-59-g8ed1b


From df9b0e30d44c901ac27c0f38cd54511b3f130c6d Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sat, 15 Dec 2018 14:09:06 -0800
Subject: neighbor: Add protocol attribute

Similar to routes and rules, add protocol attribute to neighbor entries
for easier tracking of how each was created.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/neighbour.h        |  2 ++
 include/uapi/linux/neighbour.h |  1 +
 net/core/neighbour.c           | 24 +++++++++++++++++++++++-
 3 files changed, 26 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 30fd50adf234..66221f1991c0 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -149,6 +149,7 @@ struct neighbour {
 	__u8			nud_state;
 	__u8			type;
 	__u8			dead;
+	u8			protocol;
 	seqlock_t		ha_lock;
 	unsigned char		ha[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))] __aligned(8);
 	struct hh_cache		hh;
@@ -173,6 +174,7 @@ struct pneigh_entry {
 	possible_net_t		net;
 	struct net_device	*dev;
 	u8			flags;
+	u8			protocol;
 	u8			key[0];
 };
 
diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h
index 998155444e0d..cd144e3099a3 100644
--- a/include/uapi/linux/neighbour.h
+++ b/include/uapi/linux/neighbour.h
@@ -28,6 +28,7 @@ enum {
 	NDA_MASTER,
 	NDA_LINK_NETNSID,
 	NDA_SRC_VNI,
+	NDA_PROTOCOL,  /* Originator of entry */
 	__NDA_MAX
 };
 
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 42b413774370..fb4372cb1de1 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1828,6 +1828,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
 	struct net_device *dev = NULL;
 	struct neighbour *neigh;
 	void *dst, *lladdr;
+	u8 protocol = 0;
 	int err;
 
 	ASSERT_RTNL();
@@ -1867,6 +1868,14 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
 	dst = nla_data(tb[NDA_DST]);
 	lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL;
 
+	if (tb[NDA_PROTOCOL]) {
+		if (nla_len(tb[NDA_PROTOCOL]) != sizeof(u8)) {
+			NL_SET_ERR_MSG(extack, "Invalid protocol attribute");
+			goto out;
+		}
+		protocol = nla_get_u8(tb[NDA_PROTOCOL]);
+	}
+
 	if (ndm->ndm_flags & NTF_PROXY) {
 		struct pneigh_entry *pn;
 
@@ -1874,6 +1883,8 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
 		pn = pneigh_lookup(tbl, net, dst, dev, 1);
 		if (pn) {
 			pn->flags = ndm->ndm_flags;
+			if (protocol)
+				pn->protocol = protocol;
 			err = 0;
 		}
 		goto out;
@@ -1924,6 +1935,10 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
 	} else
 		err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags,
 				     NETLINK_CB(skb).portid, extack);
+
+	if (protocol)
+		neigh->protocol = protocol;
+
 	neigh_release(neigh);
 
 out:
@@ -2417,6 +2432,9 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
 	    nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
 		goto nla_put_failure;
 
+	if (neigh->protocol && nla_put_u8(skb, NDA_PROTOCOL, neigh->protocol))
+		goto nla_put_failure;
+
 	nlmsg_end(skb, nlh);
 	return 0;
 
@@ -2448,6 +2466,9 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn,
 	if (nla_put(skb, NDA_DST, tbl->key_len, pn->key))
 		goto nla_put_failure;
 
+	if (pn->protocol && nla_put_u8(skb, NDA_PROTOCOL, pn->protocol))
+		goto nla_put_failure;
+
 	nlmsg_end(skb, nlh);
 	return 0;
 
@@ -3103,7 +3124,8 @@ static inline size_t neigh_nlmsg_size(void)
 	       + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */
 	       + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */
 	       + nla_total_size(sizeof(struct nda_cacheinfo))
-	       + nla_total_size(4); /* NDA_PROBES */
+	       + nla_total_size(4)  /* NDA_PROBES */
+	       + nla_total_size(1); /* NDA_PROTOCOL */
 }
 
 static void __neigh_notify(struct neighbour *n, int type, int flags,
-- 
cgit v1.2.3-59-g8ed1b


From 20427e5db3f96fc054c6a6ad95606906b834deb1 Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Mon, 3 Dec 2018 20:56:47 +0100
Subject: mmc: document 'Reliable Write' bit in uapi header

If we use it this way, people should know about it. Also, replace
true/false with nonzero/zero because the flag is not strictly a bool
anymore.

Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 include/uapi/linux/mmc/ioctl.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/mmc/ioctl.h b/include/uapi/linux/mmc/ioctl.h
index 45f369dc0a42..00c08120f3ba 100644
--- a/include/uapi/linux/mmc/ioctl.h
+++ b/include/uapi/linux/mmc/ioctl.h
@@ -5,7 +5,10 @@
 #include <linux/types.h>
 
 struct mmc_ioc_cmd {
-	/* Implies direction of data.  true = write, false = read */
+	/*
+	 * Direction of data: nonzero = write, zero = read.
+	 * Bit 31 selects 'Reliable Write' for RPMB.
+	 */
 	int write_flag;
 
 	/* Application-specific command.  true = precede with CMD55 */
-- 
cgit v1.2.3-59-g8ed1b


From 7239ff4b2be8ec0c3160da7fdd1475785fdb4cb9 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Tue, 30 Oct 2018 16:43:23 +0200
Subject: btrfs: Introduce support for FSID change without metadata rewrite

This field is going to be used when the user wants to change the UUID
of the filesystem without having to rewrite all metadata blocks. This
field adds another level of indirection such that when the FSID is
changed what really happens is the current UUID (the one with which the
fs was created) is copied to the 'metadata_uuid' field in the superblock
as well as a new incompat flag is set METADATA_UUID. When the kernel
detects this flag is set it knows that the superblock in fact has 2
UUIDs:

1. Is the UUID which is user-visible, currently known as FSID.
2. Metadata UUID - this is the UUID which is stamped into all on-disk
   datastructures belonging to this file system.

When the new incompat flag is present device scanning checks whether
both fsid/metadata_uuid of the scanned device match any of the
registered filesystems. When the flag is not set then both UUIDs are
equal and only the FSID is retained on disk, metadata_uuid is set only
in-memory during mount.

Additionally a new metadata_uuid field is also added to the fs_info
struct. It's initialised either with the FSID in case METADATA_UUID
incompat flag is not set or with the metdata_uuid of the superblock
otherwise.

This commit introduces the new fields as well as the new incompat flag
and switches all users of the fsid to the new logic.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ minor updates in comments ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.c                |  4 +--
 fs/btrfs/ctree.h                | 16 ++++++---
 fs/btrfs/disk-io.c              | 33 ++++++++++++++----
 fs/btrfs/extent-tree.c          |  2 +-
 fs/btrfs/volumes.c              | 74 ++++++++++++++++++++++++++++++++---------
 fs/btrfs/volumes.h              |  1 +
 include/uapi/linux/btrfs.h      |  1 +
 include/uapi/linux/btrfs_tree.h |  1 +
 8 files changed, 104 insertions(+), 28 deletions(-)

(limited to 'include/uapi')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index a436259c71b1..3b7d80add475 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -224,7 +224,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	else
 		btrfs_set_header_owner(cow, new_root_objectid);
 
-	write_extent_buffer_fsid(cow, fs_info->fsid);
+	write_extent_buffer_fsid(cow, fs_info->metadata_fsid);
 
 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
 	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
@@ -1050,7 +1050,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	else
 		btrfs_set_header_owner(cow, root->root_key.objectid);
 
-	write_extent_buffer_fsid(cow, fs_info->fsid);
+	write_extent_buffer_fsid(cow, fs_info->metadata_fsid);
 
 	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
 	if (ret) {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 83f0d98c9d5d..f8429cd8afb6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -195,9 +195,10 @@ struct btrfs_root_backup {
  * it currently lacks any block count etc etc
  */
 struct btrfs_super_block {
-	u8 csum[BTRFS_CSUM_SIZE];
 	/* the first 4 fields must match struct btrfs_header */
-	u8 fsid[BTRFS_FSID_SIZE];    /* FS specific uuid */
+	u8 csum[BTRFS_CSUM_SIZE];
+	/* FS specific UUID, visible to user */
+	u8 fsid[BTRFS_FSID_SIZE];
 	__le64 bytenr; /* this block number */
 	__le64 flags;
 
@@ -234,8 +235,11 @@ struct btrfs_super_block {
 	__le64 cache_generation;
 	__le64 uuid_tree_generation;
 
+	/* the UUID written into btree blocks */
+	u8 metadata_uuid[BTRFS_FSID_SIZE];
+
 	/* future expansion */
-	__le64 reserved[30];
+	__le64 reserved[28];
 	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
 	struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
 } __attribute__ ((__packed__));
@@ -265,7 +269,8 @@ struct btrfs_super_block {
 	 BTRFS_FEATURE_INCOMPAT_RAID56 |		\
 	 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF |		\
 	 BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA |	\
-	 BTRFS_FEATURE_INCOMPAT_NO_HOLES)
+	 BTRFS_FEATURE_INCOMPAT_NO_HOLES	|	\
+	 BTRFS_FEATURE_INCOMPAT_METADATA_UUID)
 
 #define BTRFS_FEATURE_INCOMPAT_SAFE_SET			\
 	(BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
@@ -768,7 +773,10 @@ bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
 #define BTRFS_FS_BALANCE_RUNNING		18
 
 struct btrfs_fs_info {
+	/* User-visible fs UUID */
 	u8 fsid[BTRFS_FSID_SIZE];
+	/* UUID written to btree blocks */
+	u8 metadata_fsid[BTRFS_FSID_SIZE];
 	u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
 	unsigned long flags;
 	struct btrfs_root *extent_root;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c1d127decc8d..18bc678ae4e6 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -542,7 +542,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
 	if (WARN_ON(!PageUptodate(page)))
 		return -EUCLEAN;
 
-	ASSERT(memcmp_extent_buffer(eb, fs_info->fsid,
+	ASSERT(memcmp_extent_buffer(eb, fs_info->metadata_fsid,
 			btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0);
 
 	return csum_tree_block(fs_info, eb, 0);
@@ -557,7 +557,20 @@ static int check_tree_block_fsid(struct btrfs_fs_info *fs_info,
 
 	read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE);
 	while (fs_devices) {
-		if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
+		u8 *metadata_uuid;
+
+		/*
+		 * Checking the incompat flag is only valid for the current
+		 * fs. For seed devices it's forbidden to have their uuid
+		 * changed so reading ->fsid in this case is fine
+		 */
+		if (fs_devices == fs_info->fs_devices &&
+		    btrfs_fs_incompat(fs_info, METADATA_UUID))
+			metadata_uuid = fs_devices->metadata_uuid;
+		else
+			metadata_uuid = fs_devices->fsid;
+
+		if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) {
 			ret = 0;
 			break;
 		}
@@ -2443,10 +2456,11 @@ static int validate_super(struct btrfs_fs_info *fs_info,
 		ret = -EINVAL;
 	}
 
-	if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) {
+	if (memcmp(fs_info->metadata_fsid, sb->dev_item.fsid,
+		   BTRFS_FSID_SIZE) != 0) {
 		btrfs_err(fs_info,
-			   "dev_item UUID does not match fsid: %pU != %pU",
-			   fs_info->fsid, sb->dev_item.fsid);
+			"dev_item UUID does not match metadata fsid: %pU != %pU",
+			fs_info->metadata_fsid, sb->dev_item.fsid);
 		ret = -EINVAL;
 	}
 
@@ -2790,6 +2804,12 @@ int open_ctree(struct super_block *sb,
 	brelse(bh);
 
 	memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
+	if (btrfs_fs_incompat(fs_info, METADATA_UUID)) {
+		memcpy(fs_info->metadata_fsid,
+		       fs_info->super_copy->metadata_uuid, BTRFS_FSID_SIZE);
+	} else {
+		memcpy(fs_info->metadata_fsid, fs_info->fsid, BTRFS_FSID_SIZE);
+	}
 
 	ret = btrfs_validate_mount_super(fs_info);
 	if (ret) {
@@ -3728,7 +3748,8 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
 		btrfs_set_stack_device_io_width(dev_item, dev->io_width);
 		btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
 		memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
-		memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_FSID_SIZE);
+		memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid,
+		       BTRFS_FSID_SIZE);
 
 		flags = btrfs_super_flags(sb);
 		btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d242a1174e50..bd749eb0c5be 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -8312,7 +8312,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	btrfs_set_header_generation(buf, trans->transid);
 	btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
 	btrfs_set_header_owner(buf, owner);
-	write_extent_buffer_fsid(buf, fs_info->fsid);
+	write_extent_buffer_fsid(buf, fs_info->metadata_fsid);
 	write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
 	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
 		buf->log_index = root->log_transid % 2;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0839fae337f6..f5626671d61d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -245,13 +245,15 @@ struct list_head *btrfs_get_fs_uuids(void)
 
 /*
  * alloc_fs_devices - allocate struct btrfs_fs_devices
- * @fsid:	if not NULL, copy the uuid to fs_devices::fsid
+ * @fsid:		if not NULL, copy the UUID to fs_devices::fsid
+ * @metadata_fsid:	if not NULL, copy the UUID to fs_devices::metadata_fsid
  *
  * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
  * The returned struct is not linked onto any lists and can be destroyed with
  * kfree() right away.
  */
-static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
+static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
+						 const u8 *metadata_fsid)
 {
 	struct btrfs_fs_devices *fs_devs;
 
@@ -268,6 +270,11 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
 	if (fsid)
 		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
 
+	if (metadata_fsid)
+		memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
+	else if (fsid)
+		memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
+
 	return fs_devs;
 }
 
@@ -375,13 +382,23 @@ static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
 	return NULL;
 }
 
-static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
+static noinline struct btrfs_fs_devices *find_fsid(
+		const u8 *fsid, const u8 *metadata_fsid)
 {
 	struct btrfs_fs_devices *fs_devices;
 
+	ASSERT(fsid);
+
 	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
-		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
-			return fs_devices;
+		if (metadata_fsid) {
+			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
+			    && memcmp(metadata_fsid, fs_devices->metadata_uuid,
+				      BTRFS_FSID_SIZE) == 0)
+				return fs_devices;
+		} else {
+			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
+				return fs_devices;
+		}
 	}
 	return NULL;
 }
@@ -716,6 +733,13 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
 	device->generation = btrfs_super_generation(disk_super);
 
 	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
+		if (btrfs_super_incompat_flags(disk_super) &
+		    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
+			pr_err(
+		"BTRFS: Invalid seeding and uuid-changed device detected\n");
+			goto error_brelse;
+		}
+
 		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 		fs_devices->seeding = 1;
 	} else {
@@ -766,10 +790,21 @@ static noinline struct btrfs_device *device_list_add(const char *path,
 	struct rcu_string *name;
 	u64 found_transid = btrfs_super_generation(disk_super);
 	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
+	bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
+		BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
+
+	if (has_metadata_uuid)
+		fs_devices = find_fsid(disk_super->fsid, disk_super->metadata_uuid);
+	else
+		fs_devices = find_fsid(disk_super->fsid, NULL);
 
-	fs_devices = find_fsid(disk_super->fsid);
 	if (!fs_devices) {
-		fs_devices = alloc_fs_devices(disk_super->fsid);
+		if (has_metadata_uuid)
+			fs_devices = alloc_fs_devices(disk_super->fsid,
+						      disk_super->metadata_uuid);
+		else
+			fs_devices = alloc_fs_devices(disk_super->fsid, NULL);
+
 		if (IS_ERR(fs_devices))
 			return ERR_CAST(fs_devices);
 
@@ -920,7 +955,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 	struct btrfs_device *device;
 	struct btrfs_device *orig_dev;
 
-	fs_devices = alloc_fs_devices(orig->fsid);
+	fs_devices = alloc_fs_devices(orig->fsid, NULL);
 	if (IS_ERR(fs_devices))
 		return fs_devices;
 
@@ -1745,7 +1780,8 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
 	ptr = btrfs_device_uuid(dev_item);
 	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
 	ptr = btrfs_device_fsid(dev_item);
-	write_extent_buffer(leaf, trans->fs_info->fsid, ptr, BTRFS_FSID_SIZE);
+	write_extent_buffer(leaf, trans->fs_info->metadata_fsid, ptr,
+			    BTRFS_FSID_SIZE);
 	btrfs_mark_buffer_dirty(leaf);
 
 	ret = 0;
@@ -2176,7 +2212,13 @@ static struct btrfs_device *btrfs_find_device_by_path(
 	disk_super = (struct btrfs_super_block *)bh->b_data;
 	devid = btrfs_stack_device_id(&disk_super->dev_item);
 	dev_uuid = disk_super->dev_item.uuid;
-	device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
+	if (btrfs_fs_incompat(fs_info, METADATA_UUID))
+		device = btrfs_find_device(fs_info, devid, dev_uuid,
+				disk_super->metadata_uuid);
+	else
+		device = btrfs_find_device(fs_info, devid,
+				dev_uuid, disk_super->fsid);
+
 	brelse(bh);
 	if (!device)
 		device = ERR_PTR(-ENOENT);
@@ -2246,7 +2288,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
 	if (!fs_devices->seeding)
 		return -EINVAL;
 
-	seed_devices = alloc_fs_devices(NULL);
+	seed_devices = alloc_fs_devices(NULL, NULL);
 	if (IS_ERR(seed_devices))
 		return PTR_ERR(seed_devices);
 
@@ -2283,6 +2325,8 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
 
 	generate_random_uuid(fs_devices->fsid);
 	memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
+	memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
+	memcpy(fs_info->metadata_fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
 	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
 	mutex_unlock(&fs_devices->device_list_mutex);
 
@@ -6294,7 +6338,7 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
 	cur_devices = fs_info->fs_devices;
 	while (cur_devices) {
 		if (!fsid ||
-		    !memcmp(cur_devices->fsid, fsid, BTRFS_FSID_SIZE)) {
+		    !memcmp(cur_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
 			device = find_device(cur_devices, devid, uuid);
 			if (device)
 				return device;
@@ -6623,12 +6667,12 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
 		fs_devices = fs_devices->seed;
 	}
 
-	fs_devices = find_fsid(fsid);
+	fs_devices = find_fsid(fsid, NULL);
 	if (!fs_devices) {
 		if (!btrfs_test_opt(fs_info, DEGRADED))
 			return ERR_PTR(-ENOENT);
 
-		fs_devices = alloc_fs_devices(fsid);
+		fs_devices = alloc_fs_devices(fsid, NULL);
 		if (IS_ERR(fs_devices))
 			return fs_devices;
 
@@ -6678,7 +6722,7 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
 	read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
 			   BTRFS_FSID_SIZE);
 
-	if (memcmp(fs_uuid, fs_info->fsid, BTRFS_FSID_SIZE)) {
+	if (memcmp(fs_uuid, fs_info->metadata_fsid, BTRFS_FSID_SIZE)) {
 		fs_devices = open_seed_devices(fs_info, fs_uuid);
 		if (IS_ERR(fs_devices))
 			return PTR_ERR(fs_devices);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 40820e0ec5a4..bdee4b60e0ba 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -210,6 +210,7 @@ BTRFS_DEVICE_GETSET_FUNCS(bytes_used);
 
 struct btrfs_fs_devices {
 	u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
+	u8 metadata_uuid[BTRFS_FSID_SIZE];
 	struct list_head fs_list;
 
 	u64 num_devices;
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index 5ca1d21fc4a7..e0763bc4158e 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -269,6 +269,7 @@ struct btrfs_ioctl_fs_info_args {
 #define BTRFS_FEATURE_INCOMPAT_RAID56		(1ULL << 7)
 #define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA	(1ULL << 8)
 #define BTRFS_FEATURE_INCOMPAT_NO_HOLES		(1ULL << 9)
+#define BTRFS_FEATURE_INCOMPAT_METADATA_UUID	(1ULL << 10)
 
 struct btrfs_ioctl_feature_flags {
 	__u64 compat_flags;
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index aff1356c2bb8..e974f4bb5378 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -458,6 +458,7 @@ struct btrfs_free_space_header {
 #define BTRFS_SUPER_FLAG_METADUMP	(1ULL << 33)
 #define BTRFS_SUPER_FLAG_METADUMP_V2	(1ULL << 34)
 #define BTRFS_SUPER_FLAG_CHANGING_FSID	(1ULL << 35)
+#define BTRFS_SUPER_FLAG_CHANGING_FSID_V2 (1ULL << 36)
 
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From 6c4fc209fcf9d27efbaa48368773e4d2bfbd59aa Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sun, 16 Dec 2018 00:49:47 +0100
Subject: bpf: remove useless version check for prog load

Existing libraries and tracing frameworks work around this kernel
version check by automatically deriving the kernel version from
uname(3) or similar such that the user does not need to do it
manually; these workarounds also make the version check useless
at the same time.

Moreover, most other BPF tracing types enabling bpf_probe_read()-like
functionality have /not/ adapted this check, and in general these
days it is well understood anyway that all the tracing programs are
not stable with regards to future kernels as kernel internal data
structures are subject to change from release to release.

Back at last netconf we discussed [0] and agreed to remove this
check from bpf_prog_load() and instead document it here in the uapi
header that there is no such guarantee for stable API for these
programs.

  [0] http://vger.kernel.org/netconf2018_files/DanielBorkmann_netconf2018.pdf

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h       | 10 +++++++++-
 kernel/bpf/syscall.c           |  5 -----
 tools/include/uapi/linux/bpf.h | 10 +++++++++-
 3 files changed, 18 insertions(+), 7 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e7d57e89f25f..1d324c2cbca2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -133,6 +133,14 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_STACK,
 };
 
+/* Note that tracing related programs such as
+ * BPF_PROG_TYPE_{KPROBE,TRACEPOINT,PERF_EVENT,RAW_TRACEPOINT}
+ * are not subject to a stable API since kernel internal data
+ * structures can change from release to release and may
+ * therefore break existing tracing BPF programs. Tracing BPF
+ * programs correspond to /a/ specific kernel which is to be
+ * analyzed, and not /a/ specific kernel /and/ all future ones.
+ */
 enum bpf_prog_type {
 	BPF_PROG_TYPE_UNSPEC,
 	BPF_PROG_TYPE_SOCKET_FILTER,
@@ -343,7 +351,7 @@ union bpf_attr {
 		__u32		log_level;	/* verbosity level of verifier */
 		__u32		log_size;	/* size of user buffer */
 		__aligned_u64	log_buf;	/* user supplied buffer */
-		__u32		kern_version;	/* checked when prog_type=kprobe */
+		__u32		kern_version;	/* not used */
 		__u32		prog_flags;
 		char		prog_name[BPF_OBJ_NAME_LEN];
 		__u32		prog_ifindex;	/* ifindex of netdev to prep for */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 6ae062f1cf20..5db31067d85e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1473,11 +1473,6 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 
 	if (attr->insn_cnt == 0 || attr->insn_cnt > BPF_MAXINSNS)
 		return -E2BIG;
-
-	if (type == BPF_PROG_TYPE_KPROBE &&
-	    attr->kern_version != LINUX_VERSION_CODE)
-		return -EINVAL;
-
 	if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
 	    type != BPF_PROG_TYPE_CGROUP_SKB &&
 	    !capable(CAP_SYS_ADMIN))
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e7d57e89f25f..1d324c2cbca2 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -133,6 +133,14 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_STACK,
 };
 
+/* Note that tracing related programs such as
+ * BPF_PROG_TYPE_{KPROBE,TRACEPOINT,PERF_EVENT,RAW_TRACEPOINT}
+ * are not subject to a stable API since kernel internal data
+ * structures can change from release to release and may
+ * therefore break existing tracing BPF programs. Tracing BPF
+ * programs correspond to /a/ specific kernel which is to be
+ * analyzed, and not /a/ specific kernel /and/ all future ones.
+ */
 enum bpf_prog_type {
 	BPF_PROG_TYPE_UNSPEC,
 	BPF_PROG_TYPE_SOCKET_FILTER,
@@ -343,7 +351,7 @@ union bpf_attr {
 		__u32		log_level;	/* verbosity level of verifier */
 		__u32		log_size;	/* size of user buffer */
 		__aligned_u64	log_buf;	/* user supplied buffer */
-		__u32		kern_version;	/* checked when prog_type=kprobe */
+		__u32		kern_version;	/* not used */
 		__u32		prog_flags;
 		char		prog_name[BPF_OBJ_NAME_LEN];
 		__u32		prog_ifindex;	/* ifindex of netdev to prep for */
-- 
cgit v1.2.3-59-g8ed1b


From b61c41c28eb09ae1bb02479a8f65171c037124c6 Mon Sep 17 00:00:00 2001
From: "Dmitry V. Levin" <ldv@altlinux.org>
Date: Thu, 13 Dec 2018 20:23:26 +0300
Subject: Move EM_XTENSA to uapi/linux/elf-em.h

This should never have been defined in the arch tree to begin with,
and now uapi/linux/audit.h header is going to use EM_XTENSA
in order to define AUDIT_ARCH_XTENSA which is needed to implement
syscall_get_arch() which in turn is required to extend
the generic ptrace API with PTRACE_GET_SYSCALL_INFO request.

Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Elvira Khabirova <lineprinter@altlinux.org>
Cc: Eugene Syromyatnikov <esyr@redhat.com>
Cc: Chris Zankel <chris@zankel.net>
Cc: linux-xtensa@linux-xtensa.org
Signed-off-by: Dmitry V. Levin <ldv@altlinux.org>
Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
---
 arch/xtensa/include/asm/elf.h | 2 +-
 include/uapi/linux/elf-em.h   | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/arch/xtensa/include/asm/elf.h b/arch/xtensa/include/asm/elf.h
index 0cd01b0c00bd..f3291b110a0d 100644
--- a/arch/xtensa/include/asm/elf.h
+++ b/arch/xtensa/include/asm/elf.h
@@ -15,10 +15,10 @@
 
 #include <asm/ptrace.h>
 #include <asm/coprocessor.h>
+#include <linux/elf-em.h>
 
 /* Xtensa processor ELF architecture-magic number */
 
-#define EM_XTENSA	94
 #define EM_XTENSA_OLD	0xABC7
 
 /* Xtensa relocations defined by the ABIs */
diff --git a/include/uapi/linux/elf-em.h b/include/uapi/linux/elf-em.h
index 93722e60204c..d2fb964432f3 100644
--- a/include/uapi/linux/elf-em.h
+++ b/include/uapi/linux/elf-em.h
@@ -34,6 +34,7 @@
 #define EM_M32R		88	/* Renesas M32R */
 #define EM_MN10300	89	/* Panasonic/MEI MN10300, AM33 */
 #define EM_OPENRISC     92     /* OpenRISC 32-bit embedded processor */
+#define EM_XTENSA	94	/* Tensilica Xtensa Architecture */
 #define EM_BLACKFIN     106     /* ADI Blackfin Processor */
 #define EM_ALTERA_NIOS2	113	/* Altera Nios II soft-core processor */
 #define EM_TI_C6000	140	/* TI C6X DSPs */
-- 
cgit v1.2.3-59-g8ed1b


From 98c3115a4ec56f03056efd9295e0fcb4c5c57a85 Mon Sep 17 00:00:00 2001
From: "Dmitry V. Levin" <ldv@altlinux.org>
Date: Tue, 20 Nov 2018 03:17:01 +0300
Subject: xtensa: define syscall_get_arch()

syscall_get_arch() is required to be implemented on all architectures
in order to extend the generic ptrace API with PTRACE_GET_SYSCALL_INFO
request.

Signed-off-by: Dmitry V. Levin <ldv@altlinux.org>
Acked-by: Max Filippov <jcmvbkbc@gmail.com>
Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
---
 arch/xtensa/include/asm/syscall.h | 7 +++++++
 include/uapi/linux/audit.h        | 1 +
 2 files changed, 8 insertions(+)

(limited to 'include/uapi')

diff --git a/arch/xtensa/include/asm/syscall.h b/arch/xtensa/include/asm/syscall.h
index d98e0303917d..6969956d3f93 100644
--- a/arch/xtensa/include/asm/syscall.h
+++ b/arch/xtensa/include/asm/syscall.h
@@ -10,6 +10,13 @@
 #ifndef _ASM_SYSCALL_H
 #define _ASM_SYSCALL_H
 
+#include <uapi/linux/audit.h>
+
+static inline int syscall_get_arch(void)
+{
+	return AUDIT_ARCH_XTENSA;
+}
+
 struct pt_regs;
 
 asmlinkage long xtensa_rt_sigreturn(struct pt_regs*);
diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index 818ae690ab79..9e67fd359d58 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -411,6 +411,7 @@ enum {
 #define AUDIT_ARCH_TILEGX32	(EM_TILEGX|__AUDIT_ARCH_LE)
 #define AUDIT_ARCH_TILEPRO	(EM_TILEPRO|__AUDIT_ARCH_LE)
 #define AUDIT_ARCH_X86_64	(EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
+#define AUDIT_ARCH_XTENSA	(EM_XTENSA)
 
 #define AUDIT_PERM_EXEC		1
 #define AUDIT_PERM_WRITE	2
-- 
cgit v1.2.3-59-g8ed1b


From 9d5f9f701b1891466fb3dbb1806ad97716f95cc3 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Sat, 15 Dec 2018 22:13:51 -0800
Subject: bpf: btf: fix struct/union/fwd types with kind_flag

This patch fixed two issues with BTF. One is related to
struct/union bitfield encoding and the other is related to
forward type.

Issue #1 and solution:

======================

Current btf encoding of bitfield follows what pahole generates.
For each bitfield, pahole will duplicate the type chain and
put the bitfield size at the final int or enum type.
Since the BTF enum type cannot encode bit size,
pahole workarounds the issue by generating
an int type whenever the enum bit size is not 32.

For example,
  -bash-4.4$ cat t.c
  typedef int ___int;
  enum A { A1, A2, A3 };
  struct t {
    int a[5];
    ___int b:4;
    volatile enum A c:4;
  } g;
  -bash-4.4$ gcc -c -O2 -g t.c
The current kernel supports the following BTF encoding:
  $ pahole -JV t.o
  [1] TYPEDEF ___int type_id=2
  [2] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED
  [3] ENUM A size=4 vlen=3
        A1 val=0
        A2 val=1
        A3 val=2
  [4] STRUCT t size=24 vlen=3
        a type_id=5 bits_offset=0
        b type_id=9 bits_offset=160
        c type_id=11 bits_offset=164
  [5] ARRAY (anon) type_id=2 index_type_id=2 nr_elems=5
  [6] INT sizetype size=8 bit_offset=0 nr_bits=64 encoding=(none)
  [7] VOLATILE (anon) type_id=3
  [8] INT int size=1 bit_offset=0 nr_bits=4 encoding=(none)
  [9] TYPEDEF ___int type_id=8
  [10] INT (anon) size=1 bit_offset=0 nr_bits=4 encoding=SIGNED
  [11] VOLATILE (anon) type_id=10

Two issues are in the above:
  . by changing enum type to int, we lost the original
    type information and this will not be ideal later
    when we try to convert BTF to a header file.
  . the type duplication for bitfields will cause
    BTF bloat. Duplicated types cannot be deduplicated
    later if the bitfield size is different.

To fix this issue, this patch implemented a compatible
change for BTF struct type encoding:
  . the bit 31 of struct_type->info, previously reserved,
    now is used to indicate whether bitfield_size is
    encoded in btf_member or not.
  . if bit 31 of struct_type->info is set,
    btf_member->offset will encode like:
      bit 0 - 23: bit offset
      bit 24 - 31: bitfield size
    if bit 31 is not set, the old behavior is preserved:
      bit 0 - 31: bit offset

So if the struct contains a bit field, the maximum bit offset
will be reduced to (2^24 - 1) instead of MAX_UINT. The maximum
bitfield size will be 256 which is enough for today as maximum
bitfield in compiler can be 128 where int128 type is supported.

This kernel patch intends to support the new BTF encoding:
  $ pahole -JV t.o
  [1] TYPEDEF ___int type_id=2
  [2] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED
  [3] ENUM A size=4 vlen=3
        A1 val=0
        A2 val=1
        A3 val=2
  [4] STRUCT t kind_flag=1 size=24 vlen=3
        a type_id=5 bitfield_size=0 bits_offset=0
        b type_id=1 bitfield_size=4 bits_offset=160
        c type_id=7 bitfield_size=4 bits_offset=164
  [5] ARRAY (anon) type_id=2 index_type_id=2 nr_elems=5
  [6] INT sizetype size=8 bit_offset=0 nr_bits=64 encoding=(none)
  [7] VOLATILE (anon) type_id=3

Issue #2 and solution:
======================

Current forward type in BTF does not specify whether the original
type is struct or union. This will not work for type pretty print
and BTF-to-header-file conversion as struct/union must be specified.
  $ cat tt.c
  struct t;
  union u;
  int foo(struct t *t, union u *u) { return 0; }
  $ gcc -c -g -O2 tt.c
  $ pahole -JV tt.o
  [1] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED
  [2] FWD t type_id=0
  [3] PTR (anon) type_id=2
  [4] FWD u type_id=0
  [5] PTR (anon) type_id=4

To fix this issue, similar to issue #1, type->info bit 31
is used. If the bit is set, it is union type. Otherwise, it is
a struct type.

  $ pahole -JV tt.o
  [1] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED
  [2] FWD t kind_flag=0 type_id=0
  [3] PTR (anon) kind_flag=0 type_id=2
  [4] FWD u kind_flag=1 type_id=0
  [5] PTR (anon) kind_flag=0 type_id=4

Pahole/LLVM change:
===================

The new kind_flag functionality has been implemented in pahole
and llvm:
  https://github.com/yonghong-song/pahole/tree/bitfield
  https://github.com/yonghong-song/llvm/tree/bitfield

Note that pahole hasn't implemented func/func_proto kind
and .BTF.ext. So to print function signature with bpftool,
the llvm compiler should be used.

Fixes: 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)")
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/btf.h |  20 +++-
 kernel/bpf/btf.c         | 280 +++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 278 insertions(+), 22 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h
index 14f66948fc95..7b7475ef2f17 100644
--- a/include/uapi/linux/btf.h
+++ b/include/uapi/linux/btf.h
@@ -34,7 +34,9 @@ struct btf_type {
 	 * bits  0-15: vlen (e.g. # of struct's members)
 	 * bits 16-23: unused
 	 * bits 24-27: kind (e.g. int, ptr, array...etc)
-	 * bits 28-31: unused
+	 * bits 28-30: unused
+	 * bit     31: kind_flag, currently used by
+	 *             struct, union and fwd
 	 */
 	__u32 info;
 	/* "size" is used by INT, ENUM, STRUCT and UNION.
@@ -52,6 +54,7 @@ struct btf_type {
 
 #define BTF_INFO_KIND(info)	(((info) >> 24) & 0x0f)
 #define BTF_INFO_VLEN(info)	((info) & 0xffff)
+#define BTF_INFO_KFLAG(info)	((info) >> 31)
 
 #define BTF_KIND_UNKN		0	/* Unknown	*/
 #define BTF_KIND_INT		1	/* Integer	*/
@@ -110,9 +113,22 @@ struct btf_array {
 struct btf_member {
 	__u32	name_off;
 	__u32	type;
-	__u32	offset;	/* offset in bits */
+	/* If the type info kind_flag is set, the btf_member offset
+	 * contains both member bitfield size and bit offset. The
+	 * bitfield size is set for bitfield members. If the type
+	 * info kind_flag is not set, the offset contains only bit
+	 * offset.
+	 */
+	__u32	offset;
 };
 
+/* If the struct/union type info kind_flag is set, the
+ * following two macros are used to access bitfield_size
+ * and bit_offset from btf_member.offset.
+ */
+#define BTF_MEMBER_BITFIELD_SIZE(val)	((val) >> 24)
+#define BTF_MEMBER_BIT_OFFSET(val)	((val) & 0xffffff)
+
 /* BTF_KIND_FUNC_PROTO is followed by multiple "struct btf_param".
  * The exact number of btf_param is stored in the vlen (of the
  * info in "struct btf_type").
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 72caa799e82f..93b6905e3a9b 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -164,7 +164,7 @@
 #define BITS_ROUNDUP_BYTES(bits) \
 	(BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits))
 
-#define BTF_INFO_MASK 0x0f00ffff
+#define BTF_INFO_MASK 0x8f00ffff
 #define BTF_INT_MASK 0x0fffffff
 #define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE)
 #define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET)
@@ -274,6 +274,10 @@ struct btf_kind_operations {
 			    const struct btf_type *struct_type,
 			    const struct btf_member *member,
 			    const struct btf_type *member_type);
+	int (*check_kflag_member)(struct btf_verifier_env *env,
+				  const struct btf_type *struct_type,
+				  const struct btf_member *member,
+				  const struct btf_type *member_type);
 	void (*log_details)(struct btf_verifier_env *env,
 			    const struct btf_type *t);
 	void (*seq_show)(const struct btf *btf, const struct btf_type *t,
@@ -419,6 +423,25 @@ static u16 btf_type_vlen(const struct btf_type *t)
 	return BTF_INFO_VLEN(t->info);
 }
 
+static bool btf_type_kflag(const struct btf_type *t)
+{
+	return BTF_INFO_KFLAG(t->info);
+}
+
+static u32 btf_member_bit_offset(const struct btf_type *struct_type,
+			     const struct btf_member *member)
+{
+	return btf_type_kflag(struct_type) ? BTF_MEMBER_BIT_OFFSET(member->offset)
+					   : member->offset;
+}
+
+static u32 btf_member_bitfield_size(const struct btf_type *struct_type,
+				    const struct btf_member *member)
+{
+	return btf_type_kflag(struct_type) ? BTF_MEMBER_BITFIELD_SIZE(member->offset)
+					   : 0;
+}
+
 static u32 btf_type_int(const struct btf_type *t)
 {
 	return *(u32 *)(t + 1);
@@ -627,9 +650,17 @@ static void btf_verifier_log_member(struct btf_verifier_env *env,
 	if (env->phase != CHECK_META)
 		btf_verifier_log_type(env, struct_type, NULL);
 
-	__btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u",
-			   __btf_name_by_offset(btf, member->name_off),
-			   member->type, member->offset);
+	if (btf_type_kflag(struct_type))
+		__btf_verifier_log(log,
+				   "\t%s type_id=%u bitfield_size=%u bits_offset=%u",
+				   __btf_name_by_offset(btf, member->name_off),
+				   member->type,
+				   BTF_MEMBER_BITFIELD_SIZE(member->offset),
+				   BTF_MEMBER_BIT_OFFSET(member->offset));
+	else
+		__btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u",
+				   __btf_name_by_offset(btf, member->name_off),
+				   member->type, member->offset);
 
 	if (fmt && *fmt) {
 		__btf_verifier_log(log, " ");
@@ -945,6 +976,38 @@ static int btf_df_check_member(struct btf_verifier_env *env,
 	return -EINVAL;
 }
 
+static int btf_df_check_kflag_member(struct btf_verifier_env *env,
+				     const struct btf_type *struct_type,
+				     const struct btf_member *member,
+				     const struct btf_type *member_type)
+{
+	btf_verifier_log_basic(env, struct_type,
+			       "Unsupported check_kflag_member");
+	return -EINVAL;
+}
+
+/* Used for ptr, array and struct/union type members.
+ * int, enum and modifier types have their specific callback functions.
+ */
+static int btf_generic_check_kflag_member(struct btf_verifier_env *env,
+					  const struct btf_type *struct_type,
+					  const struct btf_member *member,
+					  const struct btf_type *member_type)
+{
+	if (BTF_MEMBER_BITFIELD_SIZE(member->offset)) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Invalid member bitfield_size");
+		return -EINVAL;
+	}
+
+	/* bitfield size is 0, so member->offset represents bit offset only.
+	 * It is safe to call non kflag check_member variants.
+	 */
+	return btf_type_ops(member_type)->check_member(env, struct_type,
+						       member,
+						       member_type);
+}
+
 static int btf_df_resolve(struct btf_verifier_env *env,
 			  const struct resolve_vertex *v)
 {
@@ -997,6 +1060,62 @@ static int btf_int_check_member(struct btf_verifier_env *env,
 	return 0;
 }
 
+static int btf_int_check_kflag_member(struct btf_verifier_env *env,
+				      const struct btf_type *struct_type,
+				      const struct btf_member *member,
+				      const struct btf_type *member_type)
+{
+	u32 struct_bits_off, nr_bits, nr_int_data_bits, bytes_offset;
+	u32 int_data = btf_type_int(member_type);
+	u32 struct_size = struct_type->size;
+	u32 nr_copy_bits;
+
+	/* a regular int type is required for the kflag int member */
+	if (!btf_type_int_is_regular(member_type)) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Invalid member base type");
+		return -EINVAL;
+	}
+
+	/* check sanity of bitfield size */
+	nr_bits = BTF_MEMBER_BITFIELD_SIZE(member->offset);
+	struct_bits_off = BTF_MEMBER_BIT_OFFSET(member->offset);
+	nr_int_data_bits = BTF_INT_BITS(int_data);
+	if (!nr_bits) {
+		/* Not a bitfield member, member offset must be at byte
+		 * boundary.
+		 */
+		if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+			btf_verifier_log_member(env, struct_type, member,
+						"Invalid member offset");
+			return -EINVAL;
+		}
+
+		nr_bits = nr_int_data_bits;
+	} else if (nr_bits > nr_int_data_bits) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Invalid member bitfield_size");
+		return -EINVAL;
+	}
+
+	bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+	nr_copy_bits = nr_bits + BITS_PER_BYTE_MASKED(struct_bits_off);
+	if (nr_copy_bits > BITS_PER_U64) {
+		btf_verifier_log_member(env, struct_type, member,
+					"nr_copy_bits exceeds 64");
+		return -EINVAL;
+	}
+
+	if (struct_size < bytes_offset ||
+	    struct_size - bytes_offset < BITS_ROUNDUP_BYTES(nr_copy_bits)) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Member exceeds struct_size");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static s32 btf_int_check_meta(struct btf_verifier_env *env,
 			      const struct btf_type *t,
 			      u32 meta_left)
@@ -1016,6 +1135,11 @@ static s32 btf_int_check_meta(struct btf_verifier_env *env,
 		return -EINVAL;
 	}
 
+	if (btf_type_kflag(t)) {
+		btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+		return -EINVAL;
+	}
+
 	int_data = btf_type_int(t);
 	if (int_data & ~BTF_INT_MASK) {
 		btf_verifier_log_basic(env, t, "Invalid int_data:%x",
@@ -1097,6 +1221,7 @@ static void btf_bitfield_seq_show(void *data, u8 bits_offset,
 	seq_printf(m, "0x%llx", print_num);
 }
 
+
 static void btf_int_bits_seq_show(const struct btf *btf,
 				  const struct btf_type *t,
 				  void *data, u8 bits_offset,
@@ -1163,6 +1288,7 @@ static const struct btf_kind_operations int_ops = {
 	.check_meta = btf_int_check_meta,
 	.resolve = btf_df_resolve,
 	.check_member = btf_int_check_member,
+	.check_kflag_member = btf_int_check_kflag_member,
 	.log_details = btf_int_log,
 	.seq_show = btf_int_seq_show,
 };
@@ -1192,6 +1318,31 @@ static int btf_modifier_check_member(struct btf_verifier_env *env,
 							 resolved_type);
 }
 
+static int btf_modifier_check_kflag_member(struct btf_verifier_env *env,
+					   const struct btf_type *struct_type,
+					   const struct btf_member *member,
+					   const struct btf_type *member_type)
+{
+	const struct btf_type *resolved_type;
+	u32 resolved_type_id = member->type;
+	struct btf_member resolved_member;
+	struct btf *btf = env->btf;
+
+	resolved_type = btf_type_id_size(btf, &resolved_type_id, NULL);
+	if (!resolved_type) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Invalid member");
+		return -EINVAL;
+	}
+
+	resolved_member = *member;
+	resolved_member.type = resolved_type_id;
+
+	return btf_type_ops(resolved_type)->check_kflag_member(env, struct_type,
+							       &resolved_member,
+							       resolved_type);
+}
+
 static int btf_ptr_check_member(struct btf_verifier_env *env,
 				const struct btf_type *struct_type,
 				const struct btf_member *member,
@@ -1227,6 +1378,11 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
 		return -EINVAL;
 	}
 
+	if (btf_type_kflag(t)) {
+		btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+		return -EINVAL;
+	}
+
 	if (!BTF_TYPE_ID_VALID(t->type)) {
 		btf_verifier_log_type(env, t, "Invalid type_id");
 		return -EINVAL;
@@ -1380,6 +1536,7 @@ static struct btf_kind_operations modifier_ops = {
 	.check_meta = btf_ref_type_check_meta,
 	.resolve = btf_modifier_resolve,
 	.check_member = btf_modifier_check_member,
+	.check_kflag_member = btf_modifier_check_kflag_member,
 	.log_details = btf_ref_type_log,
 	.seq_show = btf_modifier_seq_show,
 };
@@ -1388,6 +1545,7 @@ static struct btf_kind_operations ptr_ops = {
 	.check_meta = btf_ref_type_check_meta,
 	.resolve = btf_ptr_resolve,
 	.check_member = btf_ptr_check_member,
+	.check_kflag_member = btf_generic_check_kflag_member,
 	.log_details = btf_ref_type_log,
 	.seq_show = btf_ptr_seq_show,
 };
@@ -1422,6 +1580,7 @@ static struct btf_kind_operations fwd_ops = {
 	.check_meta = btf_fwd_check_meta,
 	.resolve = btf_df_resolve,
 	.check_member = btf_df_check_member,
+	.check_kflag_member = btf_df_check_kflag_member,
 	.log_details = btf_ref_type_log,
 	.seq_show = btf_df_seq_show,
 };
@@ -1480,6 +1639,11 @@ static s32 btf_array_check_meta(struct btf_verifier_env *env,
 		return -EINVAL;
 	}
 
+	if (btf_type_kflag(t)) {
+		btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+		return -EINVAL;
+	}
+
 	if (t->size) {
 		btf_verifier_log_type(env, t, "size != 0");
 		return -EINVAL;
@@ -1603,6 +1767,7 @@ static struct btf_kind_operations array_ops = {
 	.check_meta = btf_array_check_meta,
 	.resolve = btf_array_resolve,
 	.check_member = btf_array_check_member,
+	.check_kflag_member = btf_generic_check_kflag_member,
 	.log_details = btf_array_log,
 	.seq_show = btf_array_seq_show,
 };
@@ -1641,6 +1806,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
 	u32 meta_needed, last_offset;
 	struct btf *btf = env->btf;
 	u32 struct_size = t->size;
+	u32 offset;
 	u16 i;
 
 	meta_needed = btf_type_vlen(t) * sizeof(*member);
@@ -1682,7 +1848,8 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
 			return -EINVAL;
 		}
 
-		if (is_union && member->offset) {
+		offset = btf_member_bit_offset(t, member);
+		if (is_union && offset) {
 			btf_verifier_log_member(env, t, member,
 						"Invalid member bits_offset");
 			return -EINVAL;
@@ -1692,20 +1859,20 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
 		 * ">" instead of ">=" because the last member could be
 		 * "char a[0];"
 		 */
-		if (last_offset > member->offset) {
+		if (last_offset > offset) {
 			btf_verifier_log_member(env, t, member,
 						"Invalid member bits_offset");
 			return -EINVAL;
 		}
 
-		if (BITS_ROUNDUP_BYTES(member->offset) > struct_size) {
+		if (BITS_ROUNDUP_BYTES(offset) > struct_size) {
 			btf_verifier_log_member(env, t, member,
 						"Member bits_offset exceeds its struct size");
 			return -EINVAL;
 		}
 
 		btf_verifier_log_member(env, t, member, NULL);
-		last_offset = member->offset;
+		last_offset = offset;
 	}
 
 	return meta_needed;
@@ -1735,9 +1902,14 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
 
 		last_member_type = btf_type_by_id(env->btf,
 						  last_member_type_id);
-		err = btf_type_ops(last_member_type)->check_member(env, v->t,
-							last_member,
-							last_member_type);
+		if (btf_type_kflag(v->t))
+			err = btf_type_ops(last_member_type)->check_kflag_member(env, v->t,
+								last_member,
+								last_member_type);
+		else
+			err = btf_type_ops(last_member_type)->check_member(env, v->t,
+								last_member,
+								last_member_type);
 		if (err)
 			return err;
 	}
@@ -1759,9 +1931,14 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
 			return env_stack_push(env, member_type, member_type_id);
 		}
 
-		err = btf_type_ops(member_type)->check_member(env, v->t,
-							      member,
-							      member_type);
+		if (btf_type_kflag(v->t))
+			err = btf_type_ops(member_type)->check_kflag_member(env, v->t,
+									    member,
+									    member_type);
+		else
+			err = btf_type_ops(member_type)->check_member(env, v->t,
+								      member,
+								      member_type);
 		if (err)
 			return err;
 	}
@@ -1789,17 +1966,26 @@ static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t,
 	for_each_member(i, t, member) {
 		const struct btf_type *member_type = btf_type_by_id(btf,
 								member->type);
-		u32 member_offset = member->offset;
-		u32 bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset);
-		u8 bits8_offset = BITS_PER_BYTE_MASKED(member_offset);
 		const struct btf_kind_operations *ops;
+		u32 member_offset, bitfield_size;
+		u32 bytes_offset;
+		u8 bits8_offset;
 
 		if (i)
 			seq_puts(m, seq);
 
-		ops = btf_type_ops(member_type);
-		ops->seq_show(btf, member_type, member->type,
-			      data + bytes_offset, bits8_offset, m);
+		member_offset = btf_member_bit_offset(t, member);
+		bitfield_size = btf_member_bitfield_size(t, member);
+		if (bitfield_size) {
+			btf_bitfield_seq_show(data, member_offset,
+					      bitfield_size, m);
+		} else {
+			bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset);
+			bits8_offset = BITS_PER_BYTE_MASKED(member_offset);
+			ops = btf_type_ops(member_type);
+			ops->seq_show(btf, member_type, member->type,
+				      data + bytes_offset, bits8_offset, m);
+		}
 	}
 	seq_puts(m, "}");
 }
@@ -1808,6 +1994,7 @@ static struct btf_kind_operations struct_ops = {
 	.check_meta = btf_struct_check_meta,
 	.resolve = btf_struct_resolve,
 	.check_member = btf_struct_check_member,
+	.check_kflag_member = btf_generic_check_kflag_member,
 	.log_details = btf_struct_log,
 	.seq_show = btf_struct_seq_show,
 };
@@ -1837,6 +2024,41 @@ static int btf_enum_check_member(struct btf_verifier_env *env,
 	return 0;
 }
 
+static int btf_enum_check_kflag_member(struct btf_verifier_env *env,
+				       const struct btf_type *struct_type,
+				       const struct btf_member *member,
+				       const struct btf_type *member_type)
+{
+	u32 struct_bits_off, nr_bits, bytes_end, struct_size;
+	u32 int_bitsize = sizeof(int) * BITS_PER_BYTE;
+
+	struct_bits_off = BTF_MEMBER_BIT_OFFSET(member->offset);
+	nr_bits = BTF_MEMBER_BITFIELD_SIZE(member->offset);
+	if (!nr_bits) {
+		if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+			btf_verifier_log_member(env, struct_type, member,
+						"Member is not byte aligned");
+				return -EINVAL;
+		}
+
+		nr_bits = int_bitsize;
+	} else if (nr_bits > int_bitsize) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Invalid member bitfield_size");
+		return -EINVAL;
+	}
+
+	struct_size = struct_type->size;
+	bytes_end = BITS_ROUNDUP_BYTES(struct_bits_off + nr_bits);
+	if (struct_size < bytes_end) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Member exceeds struct_size");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static s32 btf_enum_check_meta(struct btf_verifier_env *env,
 			       const struct btf_type *t,
 			       u32 meta_left)
@@ -1856,6 +2078,11 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,
 		return -EINVAL;
 	}
 
+	if (btf_type_kflag(t)) {
+		btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+		return -EINVAL;
+	}
+
 	if (t->size != sizeof(int)) {
 		btf_verifier_log_type(env, t, "Expected size:%zu",
 				      sizeof(int));
@@ -1924,6 +2151,7 @@ static struct btf_kind_operations enum_ops = {
 	.check_meta = btf_enum_check_meta,
 	.resolve = btf_df_resolve,
 	.check_member = btf_enum_check_member,
+	.check_kflag_member = btf_enum_check_kflag_member,
 	.log_details = btf_enum_log,
 	.seq_show = btf_enum_seq_show,
 };
@@ -1946,6 +2174,11 @@ static s32 btf_func_proto_check_meta(struct btf_verifier_env *env,
 		return -EINVAL;
 	}
 
+	if (btf_type_kflag(t)) {
+		btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+		return -EINVAL;
+	}
+
 	btf_verifier_log_type(env, t, NULL);
 
 	return meta_needed;
@@ -2005,6 +2238,7 @@ static struct btf_kind_operations func_proto_ops = {
 	 * Hence, there is no btf_func_check_member().
 	 */
 	.check_member = btf_df_check_member,
+	.check_kflag_member = btf_df_check_kflag_member,
 	.log_details = btf_func_proto_log,
 	.seq_show = btf_df_seq_show,
 };
@@ -2024,6 +2258,11 @@ static s32 btf_func_check_meta(struct btf_verifier_env *env,
 		return -EINVAL;
 	}
 
+	if (btf_type_kflag(t)) {
+		btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+		return -EINVAL;
+	}
+
 	btf_verifier_log_type(env, t, NULL);
 
 	return 0;
@@ -2033,6 +2272,7 @@ static struct btf_kind_operations func_ops = {
 	.check_meta = btf_func_check_meta,
 	.resolve = btf_df_resolve,
 	.check_member = btf_df_check_member,
+	.check_kflag_member = btf_df_check_kflag_member,
 	.log_details = btf_ref_type_log,
 	.seq_show = btf_df_seq_show,
 };
-- 
cgit v1.2.3-59-g8ed1b


From 30db641ef4f68054db9b191b6c0200fb1a96d458 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Sat, 15 Dec 2018 11:03:23 +0200
Subject: cfg80211: clarify LCI/civic location documentation

The older code and current userspace assumed that this data
is the content of the Measurement Report element, starting
with the Measurement Token. Clarify this in the documentation.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  6 ++++--
 include/uapi/linux/nl80211.h | 16 ++++++++++++----
 2 files changed, 16 insertions(+), 6 deletions(-)

(limited to 'include/uapi')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 569b128aade8..e0c41eb1c860 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -777,8 +777,10 @@ struct cfg80211_crypto_settings {
  * @probe_resp: probe response template (AP mode only)
  * @ftm_responder: enable FTM responder functionality; -1 for no change
  *	(which also implies no change in LCI/civic location data)
- * @lci: LCI subelement content
- * @civicloc: Civic location subelement content
+ * @lci: Measurement Report element content, starting with Measurement Token
+ *	(measurement type 8)
+ * @civicloc: Measurement Report element content, starting with Measurement
+ *	Token (measurement type 11)
  * @lci_len: LCI data length
  * @civicloc_len: Civic location data length
  */
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 2b53c0e949c7..4625a8624ba2 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -5893,9 +5893,11 @@ enum nl80211_external_auth_action {
  * @__NL80211_FTM_RESP_ATTR_INVALID: Invalid
  * @NL80211_FTM_RESP_ATTR_ENABLED: FTM responder is enabled
  * @NL80211_FTM_RESP_ATTR_LCI: The content of Measurement Report Element
- *	(9.4.2.22 in 802.11-2016) with type 8 - LCI (9.4.2.22.10)
+ *	(9.4.2.22 in 802.11-2016) with type 8 - LCI (9.4.2.22.10),
+ *	i.e. starting with the measurement token
  * @NL80211_FTM_RESP_ATTR_CIVIC: The content of Measurement Report Element
- *	(9.4.2.22 in 802.11-2016) with type 11 - Civic (Section 9.4.2.22.13)
+ *	(9.4.2.22 in 802.11-2016) with type 11 - Civic (Section 9.4.2.22.13),
+ *	i.e. starting with the measurement token
  * @__NL80211_FTM_RESP_ATTR_LAST: Internal
  * @NL80211_FTM_RESP_ATTR_MAX: highest FTM responder attribute.
  */
@@ -6295,9 +6297,15 @@ enum nl80211_peer_measurement_ftm_failure_reasons {
  * @NL80211_PMSR_FTM_RESP_ATTR_DIST_VARIANCE: distance variance (u64, mm^2, note
  *	that standard deviation is the square root of variance, optional)
  * @NL80211_PMSR_FTM_RESP_ATTR_DIST_SPREAD: distance spread (u64, mm, optional)
- * @NL80211_PMSR_FTM_RESP_ATTR_LCI: LCI data from peer (binary, optional)
+ * @NL80211_PMSR_FTM_RESP_ATTR_LCI: LCI data from peer (binary, optional);
+ *	this is the contents of the Measurement Report Element (802.11-2016
+ *	9.4.2.22.1) starting with the Measurement Token, with Measurement
+ *	Type 8.
  * @NL80211_PMSR_FTM_RESP_ATTR_CIVICLOC: civic location data from peer
- *	(binary, optional)
+ *	(binary, optional);
+ *	this is the contents of the Measurement Report Element (802.11-2016
+ *	9.4.2.22.1) starting with the Measurement Token, with Measurement
+ *	Type 11.
  * @NL80211_PMSR_FTM_RESP_ATTR_PAD: ignore, for u64/s64 padding only
  *
  * @NUM_NL80211_PMSR_FTM_RESP_ATTR: internal
-- 
cgit v1.2.3-59-g8ed1b


From 30c63115e20b70f89b7cfb66b35e2a0ef4b0ef07 Mon Sep 17 00:00:00 2001
From: Sriram R <srirrama@codeaurora.org>
Date: Tue, 4 Dec 2018 17:46:52 +0530
Subject: nl80211: Add support to notify radar event info received from STA

Currently radar detection and corresponding channel switch is handled
at the AP device. STA ignores these detected radar events since the
radar signal can be seen mostly by the AP as well. But in scenarios where
a radar signal is seen only at STA, notifying this event to the AP which
can trigger a channel switch can be useful.
Stations can report such radar events autonomously through Spectrum
management (Measurement Report) action frame to its AP. The userspace on
processing the report can notify the kernel with the use of the added
NL80211_CMD_NOTIFY_RADAR to indicate the detected event and inturn adding
the reported channel to NOL.

Signed-off-by: Sriram R <srirrama@codeaurora.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h |  7 +++++
 net/wireless/nl80211.c       | 62 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 4625a8624ba2..31ae5c7f10e3 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1060,6 +1060,11 @@
  *	the measurement completed, using the measurement cookie
  *	(%NL80211_ATTR_COOKIE).
  *
+ * @NL80211_CMD_NOTIFY_RADAR: Notify the kernel that a radar signal was
+ *	detected and reported by a neighboring device on the channel
+ *	indicated by %NL80211_ATTR_WIPHY_FREQ and other attributes
+ *	determining the width and type.
+ *
  * @NL80211_CMD_MAX: highest used command number
  * @__NL80211_CMD_AFTER_LAST: internal use
  */
@@ -1278,6 +1283,8 @@ enum nl80211_commands {
 	NL80211_CMD_PEER_MEASUREMENT_RESULT,
 	NL80211_CMD_PEER_MEASUREMENT_COMPLETE,
 
+	NL80211_CMD_NOTIFY_RADAR,
+
 	/* add new commands above here */
 
 	/* used to define NL80211_CMD_MAX below */
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 4e9133e4587b..71a54ada377b 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -7990,6 +7990,60 @@ static int nl80211_start_radar_detection(struct sk_buff *skb,
 	return err;
 }
 
+static int nl80211_notify_radar_detection(struct sk_buff *skb,
+					  struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct wiphy *wiphy = wdev->wiphy;
+	struct cfg80211_chan_def chandef;
+	enum nl80211_dfs_regions dfs_region;
+	int err;
+
+	dfs_region = reg_get_dfs_region(wiphy);
+	if (dfs_region == NL80211_DFS_UNSET) {
+		GENL_SET_ERR_MSG(info,
+				 "DFS Region is not set. Unexpected Radar indication");
+		return -EINVAL;
+	}
+
+	err = nl80211_parse_chandef(rdev, info, &chandef);
+	if (err) {
+		GENL_SET_ERR_MSG(info, "Unable to extract chandef info");
+		return err;
+	}
+
+	err = cfg80211_chandef_dfs_required(wiphy, &chandef, wdev->iftype);
+	if (err < 0) {
+		GENL_SET_ERR_MSG(info, "chandef is invalid");
+		return err;
+	}
+
+	if (err == 0) {
+		GENL_SET_ERR_MSG(info,
+				 "Unexpected Radar indication for chandef/iftype");
+		return -EINVAL;
+	}
+
+	/* Do not process this notification if radar is already detected
+	 * by kernel on this channel, and return success.
+	 */
+	if (chandef.chan->dfs_state == NL80211_DFS_UNAVAILABLE)
+		return 0;
+
+	cfg80211_set_dfs_state(wiphy, &chandef, NL80211_DFS_UNAVAILABLE);
+
+	cfg80211_sched_dfs_chan_update(rdev);
+
+	memcpy(&rdev->radar_chandef, &chandef, sizeof(chandef));
+
+	/* Propagate this notification to other radios as well */
+	queue_work(cfg80211_wq, &rdev->propagate_radar_detect_wk);
+
+	return 0;
+}
+
 static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info)
 {
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
@@ -14074,6 +14128,14 @@ static const struct genl_ops nl80211_ops[] = {
 		.internal_flags = NL80211_FLAG_NEED_WDEV_UP |
 				  NL80211_FLAG_NEED_RTNL,
 	},
+	{
+		.cmd = NL80211_CMD_NOTIFY_RADAR,
+		.doit = nl80211_notify_radar_detection,
+		.policy = nl80211_policy,
+		.flags = GENL_UNS_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
 };
 
 static struct genl_family nl80211_fam __ro_after_init = {
-- 
cgit v1.2.3-59-g8ed1b


From 4785860e04bc8d7e244b25257168e1cf8a5529ab Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Fri, 30 Nov 2018 13:06:21 +0200
Subject: RDMA/uverbs: Implement an ioctl that can call write and write_ex
 handlers

Now that the handlers do not process their own udata we can make a
sensible ioctl that wrappers them. The ioctl follows the same format as
the write_ex() and has the user explicitly specify the core and driver
in/out opaque structures and a command number.

This works for all forms of write commands.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/core/Makefile                  |  2 +-
 drivers/infiniband/core/rdma_core.h               |  5 ++
 drivers/infiniband/core/uverbs.h                  |  1 -
 drivers/infiniband/core/uverbs_ioctl.c            | 40 ++++++++-------
 drivers/infiniband/core/uverbs_std_types.c        |  3 --
 drivers/infiniband/core/uverbs_std_types_device.c | 60 +++++++++++++++++++++++
 drivers/infiniband/core/uverbs_uapi.c             |  1 +
 include/uapi/rdma/ib_user_ioctl_cmds.h            | 10 ++++
 include/uapi/rdma/ib_user_verbs.h                 |  2 +-
 9 files changed, 100 insertions(+), 24 deletions(-)
 create mode 100644 drivers/infiniband/core/uverbs_std_types_device.c

(limited to 'include/uapi')

diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index 867cee5e27b2..69dee36e0e89 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -38,4 +38,4 @@ ib_uverbs-y :=			uverbs_main.o uverbs_cmd.o uverbs_marshall.o \
 				uverbs_std_types_cq.o \
 				uverbs_std_types_flow_action.o uverbs_std_types_dm.o \
 				uverbs_std_types_mr.o uverbs_std_types_counters.o \
-				uverbs_uapi.o
+				uverbs_uapi.o uverbs_std_types_device.o
diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h
index b3ca7457ac42..be6b8e1257d0 100644
--- a/drivers/infiniband/core/rdma_core.h
+++ b/drivers/infiniband/core/rdma_core.h
@@ -188,6 +188,7 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile);
 
 extern const struct uapi_definition uverbs_def_obj_counters[];
 extern const struct uapi_definition uverbs_def_obj_cq[];
+extern const struct uapi_definition uverbs_def_obj_device[];
 extern const struct uapi_definition uverbs_def_obj_dm[];
 extern const struct uapi_definition uverbs_def_obj_flow_action[];
 extern const struct uapi_definition uverbs_def_obj_intf[];
@@ -214,4 +215,8 @@ uapi_get_method(const struct uverbs_api *uapi, u32 command)
 	return uapi->write_methods[cmd_idx];
 }
 
+void uverbs_fill_udata(struct uverbs_attr_bundle *bundle,
+		       struct ib_udata *udata, unsigned int attr_in,
+		       unsigned int attr_out);
+
 #endif /* RDMA_CORE_H */
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index 8b41c95300c6..88029f3b6853 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -246,7 +246,6 @@ int uverbs_dealloc_mw(struct ib_mw *mw);
 void ib_uverbs_detach_umcast(struct ib_qp *qp,
 			     struct ib_uqp_object *uobj);
 
-void create_udata(struct uverbs_attr_bundle *ctx, struct ib_udata *udata);
 long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
 
 struct ib_uverbs_flow_spec {
diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c
index e643a43dce8d..3cc46447240e 100644
--- a/drivers/infiniband/core/uverbs_ioctl.c
+++ b/drivers/infiniband/core/uverbs_ioctl.c
@@ -436,7 +436,9 @@ static int ib_uverbs_run_method(struct bundle_priv *pbundle,
 		return -EINVAL;
 
 	if (pbundle->method_elm->has_udata)
-		create_udata(&pbundle->bundle, &pbundle->bundle.driver_udata);
+		uverbs_fill_udata(&pbundle->bundle,
+				  &pbundle->bundle.driver_udata,
+				  UVERBS_ATTR_UHW_IN, UVERBS_ATTR_UHW_OUT);
 
 	if (destroy_bkey != UVERBS_API_ATTR_BKEY_LEN) {
 		struct uverbs_obj_attr *destroy_attr =
@@ -664,35 +666,37 @@ int uverbs_get_flags32(u32 *to, const struct uverbs_attr_bundle *attrs_bundle,
 EXPORT_SYMBOL(uverbs_get_flags32);
 
 /*
- * This is for ease of conversion. The purpose is to convert all drivers to
- * use uverbs_attr_bundle instead of ib_udata.  Assume attr == 0 is input and
- * attr == 1 is output.
+ * Fill a ib_udata struct (core or uhw) using the given attribute IDs.
+ * This is primarily used to convert the UVERBS_ATTR_UHW() into the
+ * ib_udata format used by the drivers.
  */
-void create_udata(struct uverbs_attr_bundle *bundle, struct ib_udata *udata)
+void uverbs_fill_udata(struct uverbs_attr_bundle *bundle,
+		       struct ib_udata *udata, unsigned int attr_in,
+		       unsigned int attr_out)
 {
 	struct bundle_priv *pbundle =
 		container_of(bundle, struct bundle_priv, bundle);
-	const struct uverbs_attr *uhw_in =
-		uverbs_attr_get(bundle, UVERBS_ATTR_UHW_IN);
-	const struct uverbs_attr *uhw_out =
-		uverbs_attr_get(bundle, UVERBS_ATTR_UHW_OUT);
-
-	if (!IS_ERR(uhw_in)) {
-		udata->inlen = uhw_in->ptr_attr.len;
-		if (uverbs_attr_ptr_is_inline(uhw_in))
+	const struct uverbs_attr *in =
+		uverbs_attr_get(&pbundle->bundle, attr_in);
+	const struct uverbs_attr *out =
+		uverbs_attr_get(&pbundle->bundle, attr_out);
+
+	if (!IS_ERR(in)) {
+		udata->inlen = in->ptr_attr.len;
+		if (uverbs_attr_ptr_is_inline(in))
 			udata->inbuf =
-				&pbundle->user_attrs[uhw_in->ptr_attr.uattr_idx]
+				&pbundle->user_attrs[in->ptr_attr.uattr_idx]
 					 .data;
 		else
-			udata->inbuf = u64_to_user_ptr(uhw_in->ptr_attr.data);
+			udata->inbuf = u64_to_user_ptr(in->ptr_attr.data);
 	} else {
 		udata->inbuf = NULL;
 		udata->inlen = 0;
 	}
 
-	if (!IS_ERR(uhw_out)) {
-		udata->outbuf = u64_to_user_ptr(uhw_out->ptr_attr.data);
-		udata->outlen = uhw_out->ptr_attr.len;
+	if (!IS_ERR(out)) {
+		udata->outbuf = u64_to_user_ptr(out->ptr_attr.data);
+		udata->outlen = out->ptr_attr.len;
 	} else {
 		udata->outbuf = NULL;
 		udata->outlen = 0;
diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c
index 424f325f8cba..e3df1e342e2f 100644
--- a/drivers/infiniband/core/uverbs_std_types.c
+++ b/drivers/infiniband/core/uverbs_std_types.c
@@ -259,10 +259,7 @@ DECLARE_UVERBS_NAMED_OBJECT(
 DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_PD,
 			    UVERBS_TYPE_ALLOC_IDR(uverbs_free_pd));
 
-DECLARE_UVERBS_GLOBAL_METHODS(UVERBS_OBJECT_DEVICE);
-
 const struct uapi_definition uverbs_def_obj_intf[] = {
-	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DEVICE),
 	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_PD,
 				      UAPI_DEF_OBJ_NEEDS_FN(dealloc_pd)),
 	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_COMP_CHANNEL,
diff --git a/drivers/infiniband/core/uverbs_std_types_device.c b/drivers/infiniband/core/uverbs_std_types_device.c
new file mode 100644
index 000000000000..aafb251b7d37
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_std_types_device.c
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2018, Mellanox Technologies inc.  All rights reserved.
+ */
+
+#include <rdma/uverbs_std_types.h>
+#include "rdma_core.h"
+#include "uverbs.h"
+
+/*
+ * This ioctl method allows calling any defined write or write_ex
+ * handler. This essentially replaces the hdr/ex_hdr system with the ioctl
+ * marshalling, and brings the non-ex path into the same marshalling as the ex
+ * path.
+ */
+static int UVERBS_HANDLER(UVERBS_METHOD_INVOKE_WRITE)(
+	struct uverbs_attr_bundle *attrs)
+{
+	struct uverbs_api *uapi = attrs->ufile->device->uapi;
+	const struct uverbs_api_write_method *method_elm;
+	u32 cmd;
+	int rc;
+
+	rc = uverbs_get_const(&cmd, attrs, UVERBS_ATTR_WRITE_CMD);
+	if (rc)
+		return rc;
+
+	method_elm = uapi_get_method(uapi, cmd);
+	if (IS_ERR(method_elm))
+		return PTR_ERR(method_elm);
+
+	uverbs_fill_udata(attrs, &attrs->ucore, UVERBS_ATTR_CORE_IN,
+			  UVERBS_ATTR_CORE_OUT);
+
+	if (attrs->ucore.inlen < method_elm->req_size ||
+	    attrs->ucore.outlen < method_elm->resp_size)
+		return -ENOSPC;
+
+	return method_elm->handler(attrs);
+}
+
+DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_INVOKE_WRITE,
+			    UVERBS_ATTR_CONST_IN(UVERBS_ATTR_WRITE_CMD,
+						 enum ib_uverbs_write_cmds,
+						 UA_MANDATORY),
+			    UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CORE_IN,
+					       UVERBS_ATTR_MIN_SIZE(sizeof(u32)),
+					       UA_OPTIONAL),
+			    UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CORE_OUT,
+						UVERBS_ATTR_MIN_SIZE(0),
+						UA_OPTIONAL),
+			    UVERBS_ATTR_UHW());
+
+DECLARE_UVERBS_GLOBAL_METHODS(UVERBS_OBJECT_DEVICE,
+			      &UVERBS_METHOD(UVERBS_METHOD_INVOKE_WRITE));
+
+const struct uapi_definition uverbs_def_obj_device[] = {
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DEVICE),
+	{},
+};
diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c
index 830e48fe5e65..9ae08e4b78a3 100644
--- a/drivers/infiniband/core/uverbs_uapi.c
+++ b/drivers/infiniband/core/uverbs_uapi.c
@@ -621,6 +621,7 @@ void uverbs_destroy_api(struct uverbs_api *uapi)
 static const struct uapi_definition uverbs_core_api[] = {
 	UAPI_DEF_CHAIN(uverbs_def_obj_counters),
 	UAPI_DEF_CHAIN(uverbs_def_obj_cq),
+	UAPI_DEF_CHAIN(uverbs_def_obj_device),
 	UAPI_DEF_CHAIN(uverbs_def_obj_dm),
 	UAPI_DEF_CHAIN(uverbs_def_obj_flow_action),
 	UAPI_DEF_CHAIN(uverbs_def_obj_intf),
diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h
index 2c881aaf05c2..34e71994f4a5 100644
--- a/include/uapi/rdma/ib_user_ioctl_cmds.h
+++ b/include/uapi/rdma/ib_user_ioctl_cmds.h
@@ -63,6 +63,16 @@ enum {
 	UVERBS_ATTR_UHW_OUT,
 };
 
+enum uverbs_methods_device {
+	UVERBS_METHOD_INVOKE_WRITE,
+};
+
+enum uverbs_attrs_invoke_write_cmd_attr_ids {
+	UVERBS_ATTR_CORE_IN,
+	UVERBS_ATTR_CORE_OUT,
+	UVERBS_ATTR_WRITE_CMD,
+};
+
 enum uverbs_attrs_create_cq_cmd_attr_ids {
 	UVERBS_ATTR_CREATE_CQ_HANDLE,
 	UVERBS_ATTR_CREATE_CQ_CQE,
diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h
index c586fc43739c..480d9a60b68e 100644
--- a/include/uapi/rdma/ib_user_verbs.h
+++ b/include/uapi/rdma/ib_user_verbs.h
@@ -46,7 +46,7 @@
 #define IB_USER_VERBS_ABI_VERSION	6
 #define IB_USER_VERBS_CMD_THRESHOLD    50
 
-enum {
+enum ib_uverbs_write_cmds {
 	IB_USER_VERBS_CMD_GET_CONTEXT,
 	IB_USER_VERBS_CMD_QUERY_DEVICE,
 	IB_USER_VERBS_CMD_QUERY_PORT,
-- 
cgit v1.2.3-59-g8ed1b


From 149d3845f4a548dbc83932fab3491aeb0b070b3a Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Fri, 30 Nov 2018 13:16:47 +0200
Subject: RDMA/uverbs: Add a method to introspect handles in a context

Introduce a helper function gather_objects_handle() to copy object handles
under a spin lock.

Expose these objects handles via the uverbs ioctl interface.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/infiniband/core/uverbs_std_types_device.c | 89 ++++++++++++++++++++++-
 include/uapi/rdma/ib_user_ioctl_cmds.h            |  7 ++
 2 files changed, 95 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/drivers/infiniband/core/uverbs_std_types_device.c b/drivers/infiniband/core/uverbs_std_types_device.c
index aafb251b7d37..165ece1eb655 100644
--- a/drivers/infiniband/core/uverbs_std_types_device.c
+++ b/drivers/infiniband/core/uverbs_std_types_device.c
@@ -51,8 +51,95 @@ DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_INVOKE_WRITE,
 						UA_OPTIONAL),
 			    UVERBS_ATTR_UHW());
 
+static uint32_t *
+gather_objects_handle(struct ib_uverbs_file *ufile,
+		      const struct uverbs_api_object *uapi_object,
+		      struct uverbs_attr_bundle *attrs,
+		      ssize_t out_len,
+		      u64 *total)
+{
+	u64 max_count = out_len / sizeof(u32);
+	struct ib_uobject *obj;
+	u64 count = 0;
+	u32 *handles;
+
+	/* Allocated memory that cannot page out where we gather
+	 * all object ids under a spin_lock.
+	 */
+	handles = uverbs_zalloc(attrs, out_len);
+	if (IS_ERR(handles))
+		return handles;
+
+	spin_lock_irq(&ufile->uobjects_lock);
+	list_for_each_entry(obj, &ufile->uobjects, list) {
+		u32 obj_id = obj->id;
+
+		if (obj->uapi_object != uapi_object)
+			continue;
+
+		if (count >= max_count)
+			break;
+
+		handles[count] = obj_id;
+		count++;
+	}
+	spin_unlock_irq(&ufile->uobjects_lock);
+
+	*total = count;
+	return handles;
+}
+
+static int UVERBS_HANDLER(UVERBS_METHOD_INFO_HANDLES)(
+	struct uverbs_attr_bundle *attrs)
+{
+	const struct uverbs_api_object *uapi_object;
+	ssize_t out_len;
+	u64 total = 0;
+	u16 object_id;
+	u32 *handles;
+	int ret;
+
+	out_len = uverbs_attr_get_len(attrs, UVERBS_ATTR_INFO_HANDLES_LIST);
+	if (out_len <= 0 || (out_len % sizeof(u32) != 0))
+		return -EINVAL;
+
+	ret = uverbs_get_const(&object_id, attrs, UVERBS_ATTR_INFO_OBJECT_ID);
+	if (ret)
+		return ret;
+
+	uapi_object = uapi_get_object(attrs->ufile->device->uapi, object_id);
+	if (!uapi_object)
+		return -EINVAL;
+
+	handles = gather_objects_handle(attrs->ufile, uapi_object, attrs,
+					out_len, &total);
+	if (IS_ERR(handles))
+		return PTR_ERR(handles);
+
+	ret = uverbs_copy_to(attrs, UVERBS_ATTR_INFO_HANDLES_LIST, handles,
+			     sizeof(u32) * total);
+	if (ret)
+		goto err;
+
+	ret = uverbs_copy_to(attrs, UVERBS_ATTR_INFO_TOTAL_HANDLES, &total,
+			     sizeof(total));
+err:
+	return ret;
+}
+
+DECLARE_UVERBS_NAMED_METHOD(
+	UVERBS_METHOD_INFO_HANDLES,
+	/* Also includes any device specific object ids */
+	UVERBS_ATTR_CONST_IN(UVERBS_ATTR_INFO_OBJECT_ID,
+			     enum uverbs_default_objects, UA_MANDATORY),
+	UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_INFO_TOTAL_HANDLES,
+			    UVERBS_ATTR_TYPE(u32), UA_OPTIONAL),
+	UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_INFO_HANDLES_LIST,
+			    UVERBS_ATTR_MIN_SIZE(sizeof(u32)), UA_OPTIONAL));
+
 DECLARE_UVERBS_GLOBAL_METHODS(UVERBS_OBJECT_DEVICE,
-			      &UVERBS_METHOD(UVERBS_METHOD_INVOKE_WRITE));
+			      &UVERBS_METHOD(UVERBS_METHOD_INVOKE_WRITE),
+			      &UVERBS_METHOD(UVERBS_METHOD_INFO_HANDLES));
 
 const struct uapi_definition uverbs_def_obj_device[] = {
 	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DEVICE),
diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h
index 34e71994f4a5..cf6e5f575696 100644
--- a/include/uapi/rdma/ib_user_ioctl_cmds.h
+++ b/include/uapi/rdma/ib_user_ioctl_cmds.h
@@ -65,6 +65,7 @@ enum {
 
 enum uverbs_methods_device {
 	UVERBS_METHOD_INVOKE_WRITE,
+	UVERBS_METHOD_INFO_HANDLES,
 };
 
 enum uverbs_attrs_invoke_write_cmd_attr_ids {
@@ -167,4 +168,10 @@ enum uverbs_methods_actions_counters_ops {
 	UVERBS_METHOD_COUNTERS_READ,
 };
 
+enum uverbs_attrs_info_handles_id {
+	UVERBS_ATTR_INFO_OBJECT_ID,
+	UVERBS_ATTR_INFO_TOTAL_HANDLES,
+	UVERBS_ATTR_INFO_HANDLES_LIST,
+};
+
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From bbc13cda37711eb7baa4091017887a57074f5410 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Fri, 30 Nov 2018 13:16:48 +0200
Subject: RDMA/uverbs: Add an ioctl method to destroy an object

Add an ioctl method to destroy the PD, MR, MW, AH, flow, RWQ indirection
table and XRCD objects by handle which doesn't require any output response
during destruction.

Signed-off-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/uverbs_std_types.c    | 60 ++++++++++++++++++++++++---
 drivers/infiniband/core/uverbs_std_types_mr.c | 10 ++++-
 include/uapi/rdma/ib_user_ioctl_cmds.h        | 52 +++++++++++++++++++++++
 3 files changed, 115 insertions(+), 7 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c
index e3df1e342e2f..8e975f5b1f01 100644
--- a/drivers/infiniband/core/uverbs_std_types.c
+++ b/drivers/infiniband/core/uverbs_std_types.c
@@ -228,36 +228,84 @@ DECLARE_UVERBS_NAMED_OBJECT(
 	UVERBS_OBJECT_QP,
 	UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object), uverbs_free_qp));
 
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+	UVERBS_METHOD_MW_DESTROY,
+	UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_MW_HANDLE,
+			UVERBS_OBJECT_MW,
+			UVERBS_ACCESS_DESTROY,
+			UA_MANDATORY));
+
 DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_MW,
-			    UVERBS_TYPE_ALLOC_IDR(uverbs_free_mw));
+			    UVERBS_TYPE_ALLOC_IDR(uverbs_free_mw),
+			    &UVERBS_METHOD(UVERBS_METHOD_MW_DESTROY));
 
 DECLARE_UVERBS_NAMED_OBJECT(
 	UVERBS_OBJECT_SRQ,
 	UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_usrq_object),
 				 uverbs_free_srq));
 
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+	UVERBS_METHOD_AH_DESTROY,
+	UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_AH_HANDLE,
+			UVERBS_OBJECT_AH,
+			UVERBS_ACCESS_DESTROY,
+			UA_MANDATORY));
+
 DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_AH,
-			    UVERBS_TYPE_ALLOC_IDR(uverbs_free_ah));
+			    UVERBS_TYPE_ALLOC_IDR(uverbs_free_ah),
+			    &UVERBS_METHOD(UVERBS_METHOD_AH_DESTROY));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+	UVERBS_METHOD_FLOW_DESTROY,
+	UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_FLOW_HANDLE,
+			UVERBS_OBJECT_FLOW,
+			UVERBS_ACCESS_DESTROY,
+			UA_MANDATORY));
 
 DECLARE_UVERBS_NAMED_OBJECT(
 	UVERBS_OBJECT_FLOW,
 	UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uflow_object),
-				 uverbs_free_flow));
+				 uverbs_free_flow),
+			    &UVERBS_METHOD(UVERBS_METHOD_FLOW_DESTROY));
 
 DECLARE_UVERBS_NAMED_OBJECT(
 	UVERBS_OBJECT_WQ,
 	UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uwq_object), uverbs_free_wq));
 
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+	UVERBS_METHOD_RWQ_IND_TBL_DESTROY,
+	UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_RWQ_IND_TBL_HANDLE,
+			UVERBS_OBJECT_RWQ_IND_TBL,
+			UVERBS_ACCESS_DESTROY,
+			UA_MANDATORY));
+
 DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_RWQ_IND_TBL,
-			    UVERBS_TYPE_ALLOC_IDR(uverbs_free_rwq_ind_tbl));
+			    UVERBS_TYPE_ALLOC_IDR(uverbs_free_rwq_ind_tbl),
+			    &UVERBS_METHOD(UVERBS_METHOD_RWQ_IND_TBL_DESTROY));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+	UVERBS_METHOD_XRCD_DESTROY,
+	UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_XRCD_HANDLE,
+			UVERBS_OBJECT_XRCD,
+			UVERBS_ACCESS_DESTROY,
+			UA_MANDATORY));
 
 DECLARE_UVERBS_NAMED_OBJECT(
 	UVERBS_OBJECT_XRCD,
 	UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uxrcd_object),
-				 uverbs_free_xrcd));
+				 uverbs_free_xrcd),
+			    &UVERBS_METHOD(UVERBS_METHOD_XRCD_DESTROY));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+	UVERBS_METHOD_PD_DESTROY,
+	UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_PD_HANDLE,
+			UVERBS_OBJECT_PD,
+			UVERBS_ACCESS_DESTROY,
+			UA_MANDATORY));
 
 DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_PD,
-			    UVERBS_TYPE_ALLOC_IDR(uverbs_free_pd));
+			    UVERBS_TYPE_ALLOC_IDR(uverbs_free_pd),
+			    &UVERBS_METHOD(UVERBS_METHOD_PD_DESTROY));
 
 const struct uapi_definition uverbs_def_obj_intf[] = {
 	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_PD,
diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c
index cafb49a45515..a034352ff60f 100644
--- a/drivers/infiniband/core/uverbs_std_types_mr.c
+++ b/drivers/infiniband/core/uverbs_std_types_mr.c
@@ -143,10 +143,18 @@ DECLARE_UVERBS_NAMED_METHOD(
 			    UVERBS_ATTR_TYPE(u32),
 			    UA_MANDATORY));
 
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+	UVERBS_METHOD_MR_DESTROY,
+	UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_MR_HANDLE,
+			UVERBS_OBJECT_MR,
+			UVERBS_ACCESS_DESTROY,
+			UA_MANDATORY));
+
 DECLARE_UVERBS_NAMED_OBJECT(
 	UVERBS_OBJECT_MR,
 	UVERBS_TYPE_ALLOC_IDR(uverbs_free_mr),
-	&UVERBS_METHOD(UVERBS_METHOD_DM_MR_REG));
+	&UVERBS_METHOD(UVERBS_METHOD_DM_MR_REG),
+	&UVERBS_METHOD(UVERBS_METHOD_MR_DESTROY));
 
 const struct uapi_definition uverbs_def_obj_mr[] = {
 	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_MR,
diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h
index cf6e5f575696..fbc92d0f0dcd 100644
--- a/include/uapi/rdma/ib_user_ioctl_cmds.h
+++ b/include/uapi/rdma/ib_user_ioctl_cmds.h
@@ -146,6 +146,11 @@ enum uverbs_attrs_reg_dm_mr_cmd_attr_ids {
 
 enum uverbs_methods_mr {
 	UVERBS_METHOD_DM_MR_REG,
+	UVERBS_METHOD_MR_DESTROY,
+};
+
+enum uverbs_attrs_mr_destroy_ids {
+	UVERBS_ATTR_DESTROY_MR_HANDLE,
 };
 
 enum uverbs_attrs_create_counters_cmd_attr_ids {
@@ -174,4 +179,51 @@ enum uverbs_attrs_info_handles_id {
 	UVERBS_ATTR_INFO_HANDLES_LIST,
 };
 
+enum uverbs_methods_pd {
+	UVERBS_METHOD_PD_DESTROY,
+};
+
+enum uverbs_attrs_pd_destroy_ids {
+	UVERBS_ATTR_DESTROY_PD_HANDLE,
+};
+
+enum uverbs_methods_mw {
+	UVERBS_METHOD_MW_DESTROY,
+};
+
+enum uverbs_attrs_mw_destroy_ids {
+	UVERBS_ATTR_DESTROY_MW_HANDLE,
+};
+
+enum uverbs_methods_xrcd {
+	UVERBS_METHOD_XRCD_DESTROY,
+};
+
+enum uverbs_attrs_xrcd_destroy_ids {
+	UVERBS_ATTR_DESTROY_XRCD_HANDLE,
+};
+
+enum uverbs_methods_ah {
+	UVERBS_METHOD_AH_DESTROY,
+};
+
+enum uverbs_attrs_ah_destroy_ids {
+	UVERBS_ATTR_DESTROY_AH_HANDLE,
+};
+
+enum uverbs_methods_rwq_ind_tbl {
+	UVERBS_METHOD_RWQ_IND_TBL_DESTROY,
+};
+
+enum uverbs_attrs_rwq_ind_tbl_destroy_ids {
+	UVERBS_ATTR_DESTROY_RWQ_IND_TBL_HANDLE,
+};
+
+enum uverbs_methods_flow {
+	UVERBS_METHOD_FLOW_DESTROY,
+};
+
+enum uverbs_attrs_flow_destroy_ids {
+	UVERBS_ATTR_DESTROY_FLOW_HANDLE,
+};
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From ad8a4496757f6f7344011a20a07195bd27e3989c Mon Sep 17 00:00:00 2001
From: Moni Shoua <monis@mellanox.com>
Date: Tue, 11 Dec 2018 13:37:52 +0200
Subject: IB/uverbs: Add support to advise_mr

Add new ioctl method for the MR object - ADVISE_MR.

This command can be used by users to give an advice or directions to the
kernel about an address range that belongs to memory regions.

A new ib_device callback, advise_mr(), is introduced here to suupport the
new command. This command takes the following arguments:

- pd:		The protection domain to which all memory regions belong
- advice: 	The type of the advice
	  	* IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH - Pre-fetch a range of
		an on-demand paging MR
	  	* IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE - Pre-fetch a range
		of an on-demand paging MR with write intention
- flags:	The properties of the advice
		* IB_UVERBS_ADVISE_MR_FLAG_FLUSH - Operation must end before
		return to the caller
- sg_list:	The list of memory ranges
- num_sge:	The number of memory ranges in the list
- attrs:	More attributes to be parsed by the provider

Signed-off-by: Moni Shoua <monis@mellanox.com>
Reviewed-by: Guy Levi <guyle@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/uverbs_std_types_mr.c | 56 ++++++++++++++++++++++++++-
 include/rdma/ib_verbs.h                       |  4 ++
 include/uapi/rdma/ib_user_ioctl_cmds.h        |  8 ++++
 include/uapi/rdma/ib_user_ioctl_verbs.h       |  9 +++++
 4 files changed, 76 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c
index a034352ff60f..71dfa5e5938e 100644
--- a/drivers/infiniband/core/uverbs_std_types_mr.c
+++ b/drivers/infiniband/core/uverbs_std_types_mr.c
@@ -39,6 +39,42 @@ static int uverbs_free_mr(struct ib_uobject *uobject,
 	return ib_dereg_mr((struct ib_mr *)uobject->object);
 }
 
+static int UVERBS_HANDLER(UVERBS_METHOD_ADVISE_MR)(
+	struct uverbs_attr_bundle *attrs)
+{
+	struct ib_pd *pd =
+		uverbs_attr_get_obj(attrs, UVERBS_ATTR_ADVISE_MR_PD_HANDLE);
+	enum ib_uverbs_advise_mr_advice advice;
+	struct ib_device *ib_dev = pd->device;
+	struct ib_sge *sg_list;
+	u32 num_sge;
+	u32 flags;
+	int ret;
+
+	/* FIXME: Extend the UAPI_DEF_OBJ_NEEDS_FN stuff.. */
+	if (!ib_dev->ops.advise_mr)
+		return -EOPNOTSUPP;
+
+	ret = uverbs_get_const(&advice, attrs, UVERBS_ATTR_ADVISE_MR_ADVICE);
+	if (ret)
+		return ret;
+
+	ret = uverbs_get_flags32(&flags, attrs, UVERBS_ATTR_ADVISE_MR_FLAGS,
+				 IB_UVERBS_ADVISE_MR_FLAG_FLUSH);
+	if (ret)
+		return ret;
+
+	num_sge = uverbs_attr_ptr_get_array_size(
+		attrs, UVERBS_ATTR_ADVISE_MR_SGE_LIST, sizeof(struct ib_sge));
+	if (num_sge < 0)
+		return num_sge;
+
+	sg_list = uverbs_attr_get_alloced_ptr(attrs,
+					      UVERBS_ATTR_ADVISE_MR_SGE_LIST);
+	return ib_dev->ops.advise_mr(pd, advice, flags, sg_list, num_sge,
+				     attrs);
+}
+
 static int UVERBS_HANDLER(UVERBS_METHOD_DM_MR_REG)(
 	struct uverbs_attr_bundle *attrs)
 {
@@ -114,6 +150,23 @@ err_dereg:
 	return ret;
 }
 
+DECLARE_UVERBS_NAMED_METHOD(
+	UVERBS_METHOD_ADVISE_MR,
+	UVERBS_ATTR_IDR(UVERBS_ATTR_ADVISE_MR_PD_HANDLE,
+			UVERBS_OBJECT_PD,
+			UVERBS_ACCESS_READ,
+			UA_MANDATORY),
+	UVERBS_ATTR_CONST_IN(UVERBS_ATTR_ADVISE_MR_ADVICE,
+			     enum ib_uverbs_advise_mr_advice,
+			     UA_MANDATORY),
+	UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_ADVISE_MR_FLAGS,
+			     enum ib_uverbs_advise_mr_flag,
+			     UA_MANDATORY),
+	UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ADVISE_MR_SGE_LIST,
+			   UVERBS_ATTR_MIN_SIZE(sizeof(struct ib_uverbs_sge)),
+			   UA_MANDATORY,
+			   UA_ALLOC_AND_COPY));
+
 DECLARE_UVERBS_NAMED_METHOD(
 	UVERBS_METHOD_DM_MR_REG,
 	UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DM_MR_HANDLE,
@@ -154,7 +207,8 @@ DECLARE_UVERBS_NAMED_OBJECT(
 	UVERBS_OBJECT_MR,
 	UVERBS_TYPE_ALLOC_IDR(uverbs_free_mr),
 	&UVERBS_METHOD(UVERBS_METHOD_DM_MR_REG),
-	&UVERBS_METHOD(UVERBS_METHOD_MR_DESTROY));
+	&UVERBS_METHOD(UVERBS_METHOD_MR_DESTROY),
+	&UVERBS_METHOD(UVERBS_METHOD_ADVISE_MR));
 
 const struct uapi_definition uverbs_def_obj_mr[] = {
 	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_MR,
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 5b3b51f00f48..0ec15d673d92 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2415,6 +2415,10 @@ struct ib_device_ops {
 	int (*dereg_mr)(struct ib_mr *mr);
 	struct ib_mr *(*alloc_mr)(struct ib_pd *pd, enum ib_mr_type mr_type,
 				  u32 max_num_sg);
+	int (*advise_mr)(struct ib_pd *pd,
+			 enum ib_uverbs_advise_mr_advice advice, u32 flags,
+			 struct ib_sge *sg_list, u32 num_sge,
+			 struct uverbs_attr_bundle *attrs);
 	int (*map_mr_sg)(struct ib_mr *mr, struct scatterlist *sg, int sg_nents,
 			 unsigned int *sg_offset);
 	int (*check_mr_status)(struct ib_mr *mr, u32 check_mask,
diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h
index fbc92d0f0dcd..f04647852add 100644
--- a/include/uapi/rdma/ib_user_ioctl_cmds.h
+++ b/include/uapi/rdma/ib_user_ioctl_cmds.h
@@ -147,12 +147,20 @@ enum uverbs_attrs_reg_dm_mr_cmd_attr_ids {
 enum uverbs_methods_mr {
 	UVERBS_METHOD_DM_MR_REG,
 	UVERBS_METHOD_MR_DESTROY,
+	UVERBS_METHOD_ADVISE_MR,
 };
 
 enum uverbs_attrs_mr_destroy_ids {
 	UVERBS_ATTR_DESTROY_MR_HANDLE,
 };
 
+enum uverbs_attrs_advise_mr_cmd_attr_ids {
+	UVERBS_ATTR_ADVISE_MR_PD_HANDLE,
+	UVERBS_ATTR_ADVISE_MR_ADVICE,
+	UVERBS_ATTR_ADVISE_MR_FLAGS,
+	UVERBS_ATTR_ADVISE_MR_SGE_LIST,
+};
+
 enum uverbs_attrs_create_counters_cmd_attr_ids {
 	UVERBS_ATTR_CREATE_COUNTERS_HANDLE,
 };
diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h
index 6cdf192070a2..9dcd345b852a 100644
--- a/include/uapi/rdma/ib_user_ioctl_verbs.h
+++ b/include/uapi/rdma/ib_user_ioctl_verbs.h
@@ -157,4 +157,13 @@ enum ib_uverbs_read_counters_flags {
 	IB_UVERBS_READ_COUNTERS_PREFER_CACHED = 1 << 0,
 };
 
+enum ib_uverbs_advise_mr_advice {
+	IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH,
+	IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE,
+};
+
+enum ib_uverbs_advise_mr_flag {
+	IB_UVERBS_ADVISE_MR_FLAG_FLUSH = 1 << 0,
+};
+
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 3bdbd0228e7555ec745e08469b98e5a0966409d6 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Sun, 16 Dec 2018 15:47:04 -0800
Subject: bpf: sockmap, metadata support for reporting size of msg

This adds metadata to sk_msg_md for BPF programs to read the sk_msg
size.

When the SK_MSG program is running under an application that is using
sendfile the data is not copied into sk_msg buffers by default. Rather
the BPF program uses sk_msg_pull_data to read the bytes in. This
avoids doing the costly memcopy instructions when they are not in
fact needed. However, if we don't know the size of the sk_msg we
have to guess if needed bytes are available by doing a pull request
which may fail. By including the size of the sk_msg BPF programs can
check the size before issuing sk_msg_pull_data requests.

Additionally, the same applies for sendmsg calls when the application
provides multiple iovs. Here the BPF program needs to pull in data
to update data pointers but its not clear where the data ends without
a size parameter. In many cases "guessing" is not easy to do
and results in multiple calls to pull and without bounded loops
everything gets fairly tricky.

Clean this up by including a u32 size field. Note, all writes into
sk_msg_md are rejected already from sk_msg_is_valid_access so nothing
additional is needed there.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/skmsg.h    | 3 +++
 include/uapi/linux/bpf.h | 1 +
 net/core/filter.c        | 6 ++++++
 3 files changed, 10 insertions(+)

(limited to 'include/uapi')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 2a11e9d91dfa..eb8f6cb84c10 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -36,6 +36,9 @@ struct sk_msg_sg {
 	struct scatterlist		data[MAX_MSG_FRAGS + 1];
 };
 
+/* UAPI in filter.c depends on struct sk_msg_sg being first element. If
+ * this is moved filter.c also must be updated.
+ */
 struct sk_msg {
 	struct sk_msg_sg		sg;
 	void				*data;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1d324c2cbca2..91c43884f295 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2665,6 +2665,7 @@ struct sk_msg_md {
 	__u32 local_ip6[4];	/* Stored in network byte order */
 	__u32 remote_port;	/* Stored in network byte order */
 	__u32 local_port;	/* stored in host byte order */
+	__u32 size;		/* Total size of sk_msg */
 };
 
 struct sk_reuseport_md {
diff --git a/net/core/filter.c b/net/core/filter.c
index f9348806e843..3a3b21726fb5 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -7530,6 +7530,12 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
 		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
 				      offsetof(struct sock_common, skc_num));
 		break;
+
+	case offsetof(struct sk_msg_md, size):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_sg, size),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct sk_msg_sg, size));
+		break;
 	}
 
 	return insn - insn_buf;
-- 
cgit v1.2.3-59-g8ed1b


From 3ad20fe393b31025bebfc2d76964561f65df48aa Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian@brauner.io>
Date: Fri, 14 Dec 2018 13:11:14 +0100
Subject: binder: implement binderfs

As discussed at Linux Plumbers Conference 2018 in Vancouver [1] this is the
implementation of binderfs.

/* Abstract */
binderfs is a backwards-compatible filesystem for Android's binder ipc
mechanism. Each ipc namespace will mount a new binderfs instance. Mounting
binderfs multiple times at different locations in the same ipc namespace
will not cause a new super block to be allocated and hence it will be the
same filesystem instance.
Each new binderfs mount will have its own set of binder devices only
visible in the ipc namespace it has been mounted in. All devices in a new
binderfs mount will follow the scheme binder%d and numbering will always
start at 0.

/* Backwards compatibility */
Devices requested in the Kconfig via CONFIG_ANDROID_BINDER_DEVICES for the
initial ipc namespace will work as before. They will be registered via
misc_register() and appear in the devtmpfs mount. Specifically, the
standard devices binder, hwbinder, and vndbinder will all appear in their
standard locations in /dev. Mounting or unmounting the binderfs mount in
the initial ipc namespace will have no effect on these devices, i.e. they
will neither show up in the binderfs mount nor will they disappear when the
binderfs mount is gone.

/* binder-control */
Each new binderfs instance comes with a binder-control device. No other
devices will be present at first. The binder-control device can be used to
dynamically allocate binder devices. All requests operate on the binderfs
mount the binder-control device resides in.
Assuming a new instance of binderfs has been mounted at /dev/binderfs
via mount -t binderfs binderfs /dev/binderfs. Then a request to create a
new binder device can be made as illustrated in [2].
Binderfs devices can simply be removed via unlink().

/* Implementation details */
- dynamic major number allocation:
  When binderfs is registered as a new filesystem it will dynamically
  allocate a new major number. The allocated major number will be returned
  in struct binderfs_device when a new binder device is allocated.
- global minor number tracking:
  Minor are tracked in a global idr struct that is capped at
  BINDERFS_MAX_MINOR. The minor number tracker is protected by a global
  mutex. This is the only point of contention between binderfs mounts.
- struct binderfs_info:
  Each binderfs super block has its own struct binderfs_info that tracks
  specific details about a binderfs instance:
  - ipc namespace
  - dentry of the binder-control device
  - root uid and root gid of the user namespace the binderfs instance
    was mounted in
- mountable by user namespace root:
  binderfs can be mounted by user namespace root in a non-initial user
  namespace. The devices will be owned by user namespace root.
- binderfs binder devices without misc infrastructure:
  New binder devices associated with a binderfs mount do not use the
  full misc_register() infrastructure.
  The misc_register() infrastructure can only create new devices in the
  host's devtmpfs mount. binderfs does however only make devices appear
  under its own mountpoint and thus allocates new character device nodes
  from the inode of the root dentry of the super block. This will have
  the side-effect that binderfs specific device nodes do not appear in
  sysfs. This behavior is similar to devpts allocated pts devices and
  has no effect on the functionality of the ipc mechanism itself.

[1]: https://goo.gl/JL2tfX
[2]: program to allocate a new binderfs binder device:

     #define _GNU_SOURCE
     #include <errno.h>
     #include <fcntl.h>
     #include <stdio.h>
     #include <stdlib.h>
     #include <string.h>
     #include <sys/ioctl.h>
     #include <sys/stat.h>
     #include <sys/types.h>
     #include <unistd.h>
     #include <linux/android/binder_ctl.h>

     int main(int argc, char *argv[])
     {
             int fd, ret, saved_errno;
             size_t len;
             struct binderfs_device device = { 0 };

             if (argc < 2)
                     exit(EXIT_FAILURE);

             len = strlen(argv[1]);
             if (len > BINDERFS_MAX_NAME)
                     exit(EXIT_FAILURE);

             memcpy(device.name, argv[1], len);

             fd = open("/dev/binderfs/binder-control", O_RDONLY | O_CLOEXEC);
             if (fd < 0) {
                     printf("%s - Failed to open binder-control device\n",
                            strerror(errno));
                     exit(EXIT_FAILURE);
             }

             ret = ioctl(fd, BINDER_CTL_ADD, &device);
             saved_errno = errno;
             close(fd);
             errno = saved_errno;
             if (ret < 0) {
                     printf("%s - Failed to allocate new binder device\n",
                            strerror(errno));
                     exit(EXIT_FAILURE);
             }

             printf("Allocated new binder device with major %d, minor %d, and "
                    "name %s\n", device.major, device.minor,
                    device.name);

             exit(EXIT_SUCCESS);
     }

Cc: Martijn Coenen <maco@android.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Acked-by: Todd Kjos <tkjos@google.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/android/Kconfig                 |  12 +
 drivers/android/Makefile                |   1 +
 drivers/android/binder.c                |  25 +-
 drivers/android/binder_internal.h       |  49 +++
 drivers/android/binderfs.c              | 544 ++++++++++++++++++++++++++++++++
 include/uapi/linux/android/binder_ctl.h |  35 ++
 include/uapi/linux/magic.h              |   1 +
 7 files changed, 650 insertions(+), 17 deletions(-)
 create mode 100644 drivers/android/binder_internal.h
 create mode 100644 drivers/android/binderfs.c
 create mode 100644 include/uapi/linux/android/binder_ctl.h

(limited to 'include/uapi')

diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig
index 51e8250d113f..4c190f8d1f4c 100644
--- a/drivers/android/Kconfig
+++ b/drivers/android/Kconfig
@@ -20,6 +20,18 @@ config ANDROID_BINDER_IPC
 	  Android process, using Binder to identify, invoke and pass arguments
 	  between said processes.
 
+config ANDROID_BINDERFS
+	bool "Android Binderfs filesystem"
+	depends on ANDROID_BINDER_IPC
+	default n
+	---help---
+	  Binderfs is a pseudo-filesystem for the Android Binder IPC driver
+	  which can be mounted per-ipc namespace allowing to run multiple
+	  instances of Android.
+	  Each binderfs mount initially only contains a binder-control device.
+	  It can be used to dynamically allocate new binder IPC devices via
+	  ioctls.
+
 config ANDROID_BINDER_DEVICES
 	string "Android Binder devices"
 	depends on ANDROID_BINDER_IPC
diff --git a/drivers/android/Makefile b/drivers/android/Makefile
index a01254c43ee3..c7856e3200da 100644
--- a/drivers/android/Makefile
+++ b/drivers/android/Makefile
@@ -1,4 +1,5 @@
 ccflags-y += -I$(src)			# needed for trace events
 
+obj-$(CONFIG_ANDROID_BINDERFS)		+= binderfs.o
 obj-$(CONFIG_ANDROID_BINDER_IPC)	+= binder.o binder_alloc.o
 obj-$(CONFIG_ANDROID_BINDER_IPC_SELFTEST) += binder_alloc_selftest.o
diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 210940bd0457..cdfc87629efb 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -79,6 +79,7 @@
 #include <asm/cacheflush.h>
 
 #include "binder_alloc.h"
+#include "binder_internal.h"
 #include "binder_trace.h"
 
 static HLIST_HEAD(binder_deferred_list);
@@ -249,20 +250,6 @@ static struct binder_transaction_log_entry *binder_transaction_log_add(
 	return e;
 }
 
-struct binder_context {
-	struct binder_node *binder_context_mgr_node;
-	struct mutex context_mgr_node_lock;
-
-	kuid_t binder_context_mgr_uid;
-	const char *name;
-};
-
-struct binder_device {
-	struct hlist_node hlist;
-	struct miscdevice miscdev;
-	struct binder_context context;
-};
-
 /**
  * struct binder_work - work enqueued on a worklist
  * @entry:             node enqueued on list
@@ -5024,8 +5011,12 @@ static int binder_open(struct inode *nodp, struct file *filp)
 	proc->tsk = current->group_leader;
 	INIT_LIST_HEAD(&proc->todo);
 	proc->default_priority = task_nice(current);
-	binder_dev = container_of(filp->private_data, struct binder_device,
-				  miscdev);
+	/* binderfs stashes devices in i_private */
+	if (is_binderfs_device(nodp))
+		binder_dev = nodp->i_private;
+	else
+		binder_dev = container_of(filp->private_data,
+					  struct binder_device, miscdev);
 	proc->context = &binder_dev->context;
 	binder_alloc_init(&proc->alloc);
 
@@ -5816,7 +5807,7 @@ static int transaction_log_show(struct seq_file *m, void *unused)
 	return 0;
 }
 
-static const struct file_operations binder_fops = {
+const struct file_operations binder_fops = {
 	.owner = THIS_MODULE,
 	.poll = binder_poll,
 	.unlocked_ioctl = binder_ioctl,
diff --git a/drivers/android/binder_internal.h b/drivers/android/binder_internal.h
new file mode 100644
index 000000000000..7fb97f503ef2
--- /dev/null
+++ b/drivers/android/binder_internal.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _LINUX_BINDER_INTERNAL_H
+#define _LINUX_BINDER_INTERNAL_H
+
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/miscdevice.h>
+#include <linux/mutex.h>
+#include <linux/stddef.h>
+#include <linux/types.h>
+#include <linux/uidgid.h>
+
+struct binder_context {
+	struct binder_node *binder_context_mgr_node;
+	struct mutex context_mgr_node_lock;
+	kuid_t binder_context_mgr_uid;
+	const char *name;
+};
+
+/**
+ * struct binder_device - information about a binder device node
+ * @hlist:          list of binder devices (only used for devices requested via
+ *                  CONFIG_ANDROID_BINDER_DEVICES)
+ * @miscdev:        information about a binder character device node
+ * @context:        binder context information
+ * @binderfs_inode: This is the inode of the root dentry of the super block
+ *                  belonging to a binderfs mount.
+ */
+struct binder_device {
+	struct hlist_node hlist;
+	struct miscdevice miscdev;
+	struct binder_context context;
+	struct inode *binderfs_inode;
+};
+
+extern const struct file_operations binder_fops;
+
+#ifdef CONFIG_ANDROID_BINDERFS
+extern bool is_binderfs_device(const struct inode *inode);
+#else
+static inline bool is_binderfs_device(const struct inode *inode)
+{
+	return false;
+}
+#endif
+
+#endif /* _LINUX_BINDER_INTERNAL_H */
diff --git a/drivers/android/binderfs.c b/drivers/android/binderfs.c
new file mode 100644
index 000000000000..7496b10532aa
--- /dev/null
+++ b/drivers/android/binderfs.c
@@ -0,0 +1,544 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/compiler_types.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/fsnotify.h>
+#include <linux/gfp.h>
+#include <linux/idr.h>
+#include <linux/init.h>
+#include <linux/ipc_namespace.h>
+#include <linux/kdev_t.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/magic.h>
+#include <linux/major.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/mount.h>
+#include <linux/parser.h>
+#include <linux/radix-tree.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock_types.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/user_namespace.h>
+#include <linux/xarray.h>
+#include <uapi/asm-generic/errno-base.h>
+#include <uapi/linux/android/binder.h>
+#include <uapi/linux/android/binder_ctl.h>
+
+#include "binder_internal.h"
+
+#define FIRST_INODE 1
+#define SECOND_INODE 2
+#define INODE_OFFSET 3
+#define INTSTRLEN 21
+#define BINDERFS_MAX_MINOR (1U << MINORBITS)
+
+static struct vfsmount *binderfs_mnt;
+
+static dev_t binderfs_dev;
+static DEFINE_MUTEX(binderfs_minors_mutex);
+static DEFINE_IDA(binderfs_minors);
+
+/**
+ * binderfs_info - information about a binderfs mount
+ * @ipc_ns:         The ipc namespace the binderfs mount belongs to.
+ * @control_dentry: This records the dentry of this binderfs mount
+ *                  binder-control device.
+ * @root_uid:       uid that needs to be used when a new binder device is
+ *                  created.
+ * @root_gid:       gid that needs to be used when a new binder device is
+ *                  created.
+ */
+struct binderfs_info {
+	struct ipc_namespace *ipc_ns;
+	struct dentry *control_dentry;
+	kuid_t root_uid;
+	kgid_t root_gid;
+
+};
+
+static inline struct binderfs_info *BINDERFS_I(const struct inode *inode)
+{
+	return inode->i_sb->s_fs_info;
+}
+
+bool is_binderfs_device(const struct inode *inode)
+{
+	if (inode->i_sb->s_magic == BINDERFS_SUPER_MAGIC)
+		return true;
+
+	return false;
+}
+
+/**
+ * binderfs_binder_device_create - allocate inode from super block of a
+ *                                 binderfs mount
+ * @ref_inode: inode from wich the super block will be taken
+ * @userp:     buffer to copy information about new device for userspace to
+ * @req:       struct binderfs_device as copied from userspace
+ *
+ * This function allocated a new binder_device and reserves a new minor
+ * number for it.
+ * Minor numbers are limited and tracked globally in binderfs_minors. The
+ * function will stash a struct binder_device for the specific binder
+ * device in i_private of the inode.
+ * It will go on to allocate a new inode from the super block of the
+ * filesystem mount, stash a struct binder_device in its i_private field
+ * and attach a dentry to that inode.
+ *
+ * Return: 0 on success, negative errno on failure
+ */
+static int binderfs_binder_device_create(struct inode *ref_inode,
+					 struct binderfs_device __user *userp,
+					 struct binderfs_device *req)
+{
+	int minor, ret;
+	struct dentry *dentry, *dup, *root;
+	struct binder_device *device;
+	size_t name_len = BINDERFS_MAX_NAME + 1;
+	char *name = NULL;
+	struct inode *inode = NULL;
+	struct super_block *sb = ref_inode->i_sb;
+	struct binderfs_info *info = sb->s_fs_info;
+
+	/* Reserve new minor number for the new device. */
+	mutex_lock(&binderfs_minors_mutex);
+	minor = ida_alloc_max(&binderfs_minors, BINDERFS_MAX_MINOR, GFP_KERNEL);
+	mutex_unlock(&binderfs_minors_mutex);
+	if (minor < 0)
+		return minor;
+
+	ret = -ENOMEM;
+	device = kzalloc(sizeof(*device), GFP_KERNEL);
+	if (!device)
+		goto err;
+
+	inode = new_inode(sb);
+	if (!inode)
+		goto err;
+
+	inode->i_ino = minor + INODE_OFFSET;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+	init_special_inode(inode, S_IFCHR | 0600,
+			   MKDEV(MAJOR(binderfs_dev), minor));
+	inode->i_fop = &binder_fops;
+	inode->i_uid = info->root_uid;
+	inode->i_gid = info->root_gid;
+
+	name = kmalloc(name_len, GFP_KERNEL);
+	if (!name)
+		goto err;
+
+	strscpy(name, req->name, name_len);
+
+	device->binderfs_inode = inode;
+	device->context.binder_context_mgr_uid = INVALID_UID;
+	device->context.name = name;
+	device->miscdev.name = name;
+	device->miscdev.minor = minor;
+	mutex_init(&device->context.context_mgr_node_lock);
+
+	req->major = MAJOR(binderfs_dev);
+	req->minor = minor;
+
+	ret = copy_to_user(userp, req, sizeof(*req));
+	if (ret) {
+		ret = -EFAULT;
+		goto err;
+	}
+
+	root = sb->s_root;
+	inode_lock(d_inode(root));
+	dentry = d_alloc_name(root, name);
+	if (!dentry) {
+		inode_unlock(d_inode(root));
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	/* Verify that the name userspace gave us is not already in use. */
+	dup = d_lookup(root, &dentry->d_name);
+	if (dup) {
+		if (d_really_is_positive(dup)) {
+			dput(dup);
+			dput(dentry);
+			inode_unlock(d_inode(root));
+			ret = -EEXIST;
+			goto err;
+		}
+		dput(dup);
+	}
+
+	inode->i_private = device;
+	d_add(dentry, inode);
+	fsnotify_create(root->d_inode, dentry);
+	inode_unlock(d_inode(root));
+
+	return 0;
+
+err:
+	kfree(name);
+	kfree(device);
+	mutex_lock(&binderfs_minors_mutex);
+	ida_free(&binderfs_minors, minor);
+	mutex_unlock(&binderfs_minors_mutex);
+	iput(inode);
+
+	return ret;
+}
+
+/**
+ * binderfs_ctl_ioctl - handle binder device node allocation requests
+ *
+ * The request handler for the binder-control device. All requests operate on
+ * the binderfs mount the binder-control device resides in:
+ * - BINDER_CTL_ADD
+ *   Allocate a new binder device.
+ *
+ * Return: 0 on success, negative errno on failure
+ */
+static long binder_ctl_ioctl(struct file *file, unsigned int cmd,
+			     unsigned long arg)
+{
+	int ret = -EINVAL;
+	struct inode *inode = file_inode(file);
+	struct binderfs_device __user *device = (struct binderfs_device __user *)arg;
+	struct binderfs_device device_req;
+
+	switch (cmd) {
+	case BINDER_CTL_ADD:
+		ret = copy_from_user(&device_req, device, sizeof(device_req));
+		if (ret) {
+			ret = -EFAULT;
+			break;
+		}
+
+		ret = binderfs_binder_device_create(inode, device, &device_req);
+		break;
+	default:
+		break;
+	}
+
+	return ret;
+}
+
+static void binderfs_evict_inode(struct inode *inode)
+{
+	struct binder_device *device = inode->i_private;
+
+	clear_inode(inode);
+
+	if (!device)
+		return;
+
+	mutex_lock(&binderfs_minors_mutex);
+	ida_free(&binderfs_minors, device->miscdev.minor);
+	mutex_unlock(&binderfs_minors_mutex);
+
+	kfree(device->context.name);
+	kfree(device);
+}
+
+static const struct super_operations binderfs_super_ops = {
+	.statfs = simple_statfs,
+	.evict_inode = binderfs_evict_inode,
+};
+
+static int binderfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+			   struct inode *new_dir, struct dentry *new_dentry,
+			   unsigned int flags)
+{
+	struct inode *inode = d_inode(old_dentry);
+
+	/* binderfs doesn't support directories. */
+	if (d_is_dir(old_dentry))
+		return -EPERM;
+
+	if (flags & ~RENAME_NOREPLACE)
+		return -EINVAL;
+
+	if (!simple_empty(new_dentry))
+		return -ENOTEMPTY;
+
+	if (d_really_is_positive(new_dentry))
+		simple_unlink(new_dir, new_dentry);
+
+	old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime =
+		new_dir->i_mtime = inode->i_ctime = current_time(old_dir);
+
+	return 0;
+}
+
+static int binderfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	/*
+	 * The control dentry is only ever touched during mount so checking it
+	 * here should not require us to take lock.
+	 */
+	if (BINDERFS_I(dir)->control_dentry == dentry)
+		return -EPERM;
+
+	return simple_unlink(dir, dentry);
+}
+
+static const struct file_operations binder_ctl_fops = {
+	.owner		= THIS_MODULE,
+	.open		= nonseekable_open,
+	.unlocked_ioctl	= binder_ctl_ioctl,
+	.compat_ioctl	= binder_ctl_ioctl,
+	.llseek		= noop_llseek,
+};
+
+/**
+ * binderfs_binder_ctl_create - create a new binder-control device
+ * @sb: super block of the binderfs mount
+ *
+ * This function creates a new binder-control device node in the binderfs mount
+ * referred to by @sb.
+ *
+ * Return: 0 on success, negative errno on failure
+ */
+static int binderfs_binder_ctl_create(struct super_block *sb)
+{
+	int minor, ret;
+	struct dentry *dentry;
+	struct binder_device *device;
+	struct inode *inode = NULL;
+	struct dentry *root = sb->s_root;
+	struct binderfs_info *info = sb->s_fs_info;
+
+	device = kzalloc(sizeof(*device), GFP_KERNEL);
+	if (!device)
+		return -ENOMEM;
+
+	inode_lock(d_inode(root));
+
+	/* If we have already created a binder-control node, return. */
+	if (info->control_dentry) {
+		ret = 0;
+		goto out;
+	}
+
+	ret = -ENOMEM;
+	inode = new_inode(sb);
+	if (!inode)
+		goto out;
+
+	/* Reserve a new minor number for the new device. */
+	mutex_lock(&binderfs_minors_mutex);
+	minor = ida_alloc_max(&binderfs_minors, BINDERFS_MAX_MINOR, GFP_KERNEL);
+	mutex_unlock(&binderfs_minors_mutex);
+	if (minor < 0) {
+		ret = minor;
+		goto out;
+	}
+
+	inode->i_ino = SECOND_INODE;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+	init_special_inode(inode, S_IFCHR | 0600,
+			   MKDEV(MAJOR(binderfs_dev), minor));
+	inode->i_fop = &binder_ctl_fops;
+	inode->i_uid = info->root_uid;
+	inode->i_gid = info->root_gid;
+
+	device->binderfs_inode = inode;
+	device->miscdev.minor = minor;
+
+	dentry = d_alloc_name(root, "binder-control");
+	if (!dentry)
+		goto out;
+
+	inode->i_private = device;
+	info->control_dentry = dentry;
+	d_add(dentry, inode);
+	inode_unlock(d_inode(root));
+
+	return 0;
+
+out:
+	inode_unlock(d_inode(root));
+	kfree(device);
+	iput(inode);
+
+	return ret;
+}
+
+static const struct inode_operations binderfs_dir_inode_operations = {
+	.lookup = simple_lookup,
+	.rename = binderfs_rename,
+	.unlink = binderfs_unlink,
+};
+
+static int binderfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct binderfs_info *info;
+	int ret = -ENOMEM;
+	struct inode *inode = NULL;
+	struct ipc_namespace *ipc_ns = sb->s_fs_info;
+
+	get_ipc_ns(ipc_ns);
+
+	sb->s_blocksize = PAGE_SIZE;
+	sb->s_blocksize_bits = PAGE_SHIFT;
+
+	/*
+	 * The binderfs filesystem can be mounted by userns root in a
+	 * non-initial userns. By default such mounts have the SB_I_NODEV flag
+	 * set in s_iflags to prevent security issues where userns root can
+	 * just create random device nodes via mknod() since it owns the
+	 * filesystem mount. But binderfs does not allow to create any files
+	 * including devices nodes. The only way to create binder devices nodes
+	 * is through the binder-control device which userns root is explicitly
+	 * allowed to do. So removing the SB_I_NODEV flag from s_iflags is both
+	 * necessary and safe.
+	 */
+	sb->s_iflags &= ~SB_I_NODEV;
+	sb->s_iflags |= SB_I_NOEXEC;
+	sb->s_magic = BINDERFS_SUPER_MAGIC;
+	sb->s_op = &binderfs_super_ops;
+	sb->s_time_gran = 1;
+
+	info = kzalloc(sizeof(struct binderfs_info), GFP_KERNEL);
+	if (!info)
+		goto err_without_dentry;
+
+	info->ipc_ns = ipc_ns;
+	info->root_gid = make_kgid(sb->s_user_ns, 0);
+	if (!gid_valid(info->root_gid))
+		info->root_gid = GLOBAL_ROOT_GID;
+	info->root_uid = make_kuid(sb->s_user_ns, 0);
+	if (!uid_valid(info->root_uid))
+		info->root_uid = GLOBAL_ROOT_UID;
+
+	sb->s_fs_info = info;
+
+	inode = new_inode(sb);
+	if (!inode)
+		goto err_without_dentry;
+
+	inode->i_ino = FIRST_INODE;
+	inode->i_fop = &simple_dir_operations;
+	inode->i_mode = S_IFDIR | 0755;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+	inode->i_op = &binderfs_dir_inode_operations;
+	set_nlink(inode, 2);
+
+	sb->s_root = d_make_root(inode);
+	if (!sb->s_root)
+		goto err_without_dentry;
+
+	ret = binderfs_binder_ctl_create(sb);
+	if (ret)
+		goto err_with_dentry;
+
+	return 0;
+
+err_with_dentry:
+	dput(sb->s_root);
+	sb->s_root = NULL;
+
+err_without_dentry:
+	put_ipc_ns(ipc_ns);
+	iput(inode);
+	kfree(info);
+
+	return ret;
+}
+
+static int binderfs_test_super(struct super_block *sb, void *data)
+{
+	struct binderfs_info *info = sb->s_fs_info;
+
+	if (info)
+		return info->ipc_ns == data;
+
+	return 0;
+}
+
+static int binderfs_set_super(struct super_block *sb, void *data)
+{
+	sb->s_fs_info = data;
+	return set_anon_super(sb, NULL);
+}
+
+static struct dentry *binderfs_mount(struct file_system_type *fs_type,
+				     int flags, const char *dev_name,
+				     void *data)
+{
+	struct super_block *sb;
+	struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
+
+	if (!ns_capable(ipc_ns->user_ns, CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	sb = sget_userns(fs_type, binderfs_test_super, binderfs_set_super,
+			 flags, ipc_ns->user_ns, ipc_ns);
+	if (IS_ERR(sb))
+		return ERR_CAST(sb);
+
+	if (!sb->s_root) {
+		int ret = binderfs_fill_super(sb, data, flags & SB_SILENT ? 1 : 0);
+		if (ret) {
+			deactivate_locked_super(sb);
+			return ERR_PTR(ret);
+		}
+
+		sb->s_flags |= SB_ACTIVE;
+	}
+
+	return dget(sb->s_root);
+}
+
+static void binderfs_kill_super(struct super_block *sb)
+{
+	struct binderfs_info *info = sb->s_fs_info;
+
+	if (info && info->ipc_ns)
+		put_ipc_ns(info->ipc_ns);
+
+	kfree(info);
+	kill_litter_super(sb);
+}
+
+static struct file_system_type binder_fs_type = {
+	.name		= "binder",
+	.mount		= binderfs_mount,
+	.kill_sb	= binderfs_kill_super,
+	.fs_flags	= FS_USERNS_MOUNT,
+};
+
+static int __init init_binderfs(void)
+{
+	int ret;
+
+	/* Allocate new major number for binderfs. */
+	ret = alloc_chrdev_region(&binderfs_dev, 0, BINDERFS_MAX_MINOR,
+				  "binder");
+	if (ret)
+		return ret;
+
+	ret = register_filesystem(&binder_fs_type);
+	if (ret) {
+		unregister_chrdev_region(binderfs_dev, BINDERFS_MAX_MINOR);
+		return ret;
+	}
+
+	binderfs_mnt = kern_mount(&binder_fs_type);
+	if (IS_ERR(binderfs_mnt)) {
+		ret = PTR_ERR(binderfs_mnt);
+		binderfs_mnt = NULL;
+		unregister_filesystem(&binder_fs_type);
+		unregister_chrdev_region(binderfs_dev, BINDERFS_MAX_MINOR);
+	}
+
+	return ret;
+}
+
+device_initcall(init_binderfs);
diff --git a/include/uapi/linux/android/binder_ctl.h b/include/uapi/linux/android/binder_ctl.h
new file mode 100644
index 000000000000..65b2efd1a0a5
--- /dev/null
+++ b/include/uapi/linux/android/binder_ctl.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Copyright (C) 2018 Canonical Ltd.
+ *
+ */
+
+#ifndef _UAPI_LINUX_BINDER_CTL_H
+#define _UAPI_LINUX_BINDER_CTL_H
+
+#include <linux/android/binder.h>
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#define BINDERFS_MAX_NAME 255
+
+/**
+ * struct binderfs_device - retrieve information about a new binder device
+ * @name:   the name to use for the new binderfs binder device
+ * @major:  major number allocated for binderfs binder devices
+ * @minor:  minor number allocated for the new binderfs binder device
+ *
+ */
+struct binderfs_device {
+	char name[BINDERFS_MAX_NAME + 1];
+	__u8 major;
+	__u8 minor;
+};
+
+/**
+ * Allocate a new binder device.
+ */
+#define BINDER_CTL_ADD _IOWR('b', 1, struct binderfs_device)
+
+#endif /* _UAPI_LINUX_BINDER_CTL_H */
+
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index 96c24478d8ce..f8c00045d537 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -73,6 +73,7 @@
 #define DAXFS_MAGIC             0x64646178
 #define BINFMTFS_MAGIC          0x42494e4d
 #define DEVPTS_SUPER_MAGIC	0x1cd1
+#define BINDERFS_SUPER_MAGIC	0x6c6f6f70
 #define FUTEXFS_SUPER_MAGIC	0xBAD1DEA
 #define PIPEFS_MAGIC            0x50495045
 #define PROC_SUPER_MAGIC	0x9fa0
-- 
cgit v1.2.3-59-g8ed1b


From 1f23816b8eb8fdc39990abe166c10a18c16f6b21 Mon Sep 17 00:00:00 2001
From: Changpeng Liu <changpeng.liu@intel.com>
Date: Thu, 1 Nov 2018 15:40:35 -0700
Subject: virtio_blk: add discard and write zeroes support

In commit 88c85538, "virtio-blk: add discard and write zeroes features
to specification" (https://github.com/oasis-tcs/virtio-spec), the virtio
block specification has been extended to add VIRTIO_BLK_T_DISCARD and
VIRTIO_BLK_T_WRITE_ZEROES commands.  This patch enables support for
discard and write zeroes in the virtio-blk driver when the device
advertises the corresponding features, VIRTIO_BLK_F_DISCARD and
VIRTIO_BLK_F_WRITE_ZEROES.

Signed-off-by: Changpeng Liu <changpeng.liu@intel.com>
Signed-off-by: Daniel Verkamp <dverkamp@chromium.org>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 drivers/block/virtio_blk.c      | 83 ++++++++++++++++++++++++++++++++++++++++-
 include/uapi/linux/virtio_blk.h | 54 +++++++++++++++++++++++++++
 2 files changed, 135 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 086c6bb12baa..0f39efb4b3aa 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -18,6 +18,7 @@
 
 #define PART_BITS 4
 #define VQ_NAME_LEN 16
+#define MAX_DISCARD_SEGMENTS 256u
 
 static int major;
 static DEFINE_IDA(vd_index_ida);
@@ -172,10 +173,48 @@ static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr,
 	return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
 }
 
+static int virtblk_setup_discard_write_zeroes(struct request *req, bool unmap)
+{
+	unsigned short segments = blk_rq_nr_discard_segments(req);
+	unsigned short n = 0;
+	struct virtio_blk_discard_write_zeroes *range;
+	struct bio *bio;
+	u32 flags = 0;
+
+	if (unmap)
+		flags |= VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP;
+
+	range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC);
+	if (!range)
+		return -ENOMEM;
+
+	__rq_for_each_bio(bio, req) {
+		u64 sector = bio->bi_iter.bi_sector;
+		u32 num_sectors = bio->bi_iter.bi_size >> SECTOR_SHIFT;
+
+		range[n].flags = cpu_to_le32(flags);
+		range[n].num_sectors = cpu_to_le32(num_sectors);
+		range[n].sector = cpu_to_le64(sector);
+		n++;
+	}
+
+	req->special_vec.bv_page = virt_to_page(range);
+	req->special_vec.bv_offset = offset_in_page(range);
+	req->special_vec.bv_len = sizeof(*range) * segments;
+	req->rq_flags |= RQF_SPECIAL_PAYLOAD;
+
+	return 0;
+}
+
 static inline void virtblk_request_done(struct request *req)
 {
 	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
 
+	if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
+		kfree(page_address(req->special_vec.bv_page) +
+		      req->special_vec.bv_offset);
+	}
+
 	switch (req_op(req)) {
 	case REQ_OP_SCSI_IN:
 	case REQ_OP_SCSI_OUT:
@@ -225,6 +264,7 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
 	int qid = hctx->queue_num;
 	int err;
 	bool notify = false;
+	bool unmap = false;
 	u32 type;
 
 	BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
@@ -237,6 +277,13 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
 	case REQ_OP_FLUSH:
 		type = VIRTIO_BLK_T_FLUSH;
 		break;
+	case REQ_OP_DISCARD:
+		type = VIRTIO_BLK_T_DISCARD;
+		break;
+	case REQ_OP_WRITE_ZEROES:
+		type = VIRTIO_BLK_T_WRITE_ZEROES;
+		unmap = !(req->cmd_flags & REQ_NOUNMAP);
+		break;
 	case REQ_OP_SCSI_IN:
 	case REQ_OP_SCSI_OUT:
 		type = VIRTIO_BLK_T_SCSI_CMD;
@@ -256,6 +303,12 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
 
 	blk_mq_start_request(req);
 
+	if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES) {
+		err = virtblk_setup_discard_write_zeroes(req, unmap);
+		if (err)
+			return BLK_STS_RESOURCE;
+	}
+
 	num = blk_rq_map_sg(hctx->queue, req, vbr->sg);
 	if (num) {
 		if (rq_data_dir(req) == WRITE)
@@ -802,6 +855,32 @@ static int virtblk_probe(struct virtio_device *vdev)
 	if (!err && opt_io_size)
 		blk_queue_io_opt(q, blk_size * opt_io_size);
 
+	if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) {
+		q->limits.discard_granularity = blk_size;
+
+		virtio_cread(vdev, struct virtio_blk_config,
+			     discard_sector_alignment, &v);
+		q->limits.discard_alignment = v ? v << SECTOR_SHIFT : 0;
+
+		virtio_cread(vdev, struct virtio_blk_config,
+			     max_discard_sectors, &v);
+		blk_queue_max_discard_sectors(q, v ? v : UINT_MAX);
+
+		virtio_cread(vdev, struct virtio_blk_config, max_discard_seg,
+			     &v);
+		blk_queue_max_discard_segments(q,
+					       min_not_zero(v,
+							    MAX_DISCARD_SEGMENTS));
+
+		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
+	}
+
+	if (virtio_has_feature(vdev, VIRTIO_BLK_F_WRITE_ZEROES)) {
+		virtio_cread(vdev, struct virtio_blk_config,
+			     max_write_zeroes_sectors, &v);
+		blk_queue_max_write_zeroes_sectors(q, v ? v : UINT_MAX);
+	}
+
 	virtblk_update_capacity(vblk, false);
 	virtio_device_ready(vdev);
 
@@ -895,14 +974,14 @@ static unsigned int features_legacy[] = {
 	VIRTIO_BLK_F_SCSI,
 #endif
 	VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
-	VIRTIO_BLK_F_MQ,
+	VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES,
 }
 ;
 static unsigned int features[] = {
 	VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
 	VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
 	VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
-	VIRTIO_BLK_F_MQ,
+	VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES,
 };
 
 static struct virtio_driver virtio_blk = {
diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
index 9ebe4d968dd5..0f99d7b49ede 100644
--- a/include/uapi/linux/virtio_blk.h
+++ b/include/uapi/linux/virtio_blk.h
@@ -38,6 +38,8 @@
 #define VIRTIO_BLK_F_BLK_SIZE	6	/* Block size of disk is available*/
 #define VIRTIO_BLK_F_TOPOLOGY	10	/* Topology information is available */
 #define VIRTIO_BLK_F_MQ		12	/* support more than one vq */
+#define VIRTIO_BLK_F_DISCARD	13	/* DISCARD is supported */
+#define VIRTIO_BLK_F_WRITE_ZEROES	14	/* WRITE ZEROES is supported */
 
 /* Legacy feature bits */
 #ifndef VIRTIO_BLK_NO_LEGACY
@@ -86,6 +88,39 @@ struct virtio_blk_config {
 
 	/* number of vqs, only available when VIRTIO_BLK_F_MQ is set */
 	__u16 num_queues;
+
+	/* the next 3 entries are guarded by VIRTIO_BLK_F_DISCARD */
+	/*
+	 * The maximum discard sectors (in 512-byte sectors) for
+	 * one segment.
+	 */
+	__u32 max_discard_sectors;
+	/*
+	 * The maximum number of discard segments in a
+	 * discard command.
+	 */
+	__u32 max_discard_seg;
+	/* Discard commands must be aligned to this number of sectors. */
+	__u32 discard_sector_alignment;
+
+	/* the next 3 entries are guarded by VIRTIO_BLK_F_WRITE_ZEROES */
+	/*
+	 * The maximum number of write zeroes sectors (in 512-byte sectors) in
+	 * one segment.
+	 */
+	__u32 max_write_zeroes_sectors;
+	/*
+	 * The maximum number of segments in a write zeroes
+	 * command.
+	 */
+	__u32 max_write_zeroes_seg;
+	/*
+	 * Set if a VIRTIO_BLK_T_WRITE_ZEROES request may result in the
+	 * deallocation of one or more of the sectors.
+	 */
+	__u8 write_zeroes_may_unmap;
+
+	__u8 unused1[3];
 } __attribute__((packed));
 
 /*
@@ -114,6 +149,12 @@ struct virtio_blk_config {
 /* Get device ID command */
 #define VIRTIO_BLK_T_GET_ID    8
 
+/* Discard command */
+#define VIRTIO_BLK_T_DISCARD	11
+
+/* Write zeroes command */
+#define VIRTIO_BLK_T_WRITE_ZEROES	13
+
 #ifndef VIRTIO_BLK_NO_LEGACY
 /* Barrier before this op. */
 #define VIRTIO_BLK_T_BARRIER	0x80000000
@@ -133,6 +174,19 @@ struct virtio_blk_outhdr {
 	__virtio64 sector;
 };
 
+/* Unmap this range (only valid for write zeroes command) */
+#define VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP	0x00000001
+
+/* Discard/write zeroes range for each request. */
+struct virtio_blk_discard_write_zeroes {
+	/* discard/write zeroes start sector */
+	__le64 sector;
+	/* number of discard/write zeroes sectors */
+	__le32 num_sectors;
+	/* flags for this range */
+	__le32 flags;
+};
+
 #ifndef VIRTIO_BLK_NO_LEGACY
 struct virtio_scsi_inhdr {
 	__virtio32 errors;
-- 
cgit v1.2.3-59-g8ed1b


From 4b86713236e4bd6ea6c881a97711ae039fc4069b Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 17 Dec 2018 18:35:09 +0100
Subject: vhost: split structs into a separate header file

vhost structs are shared by vhost-kernel and vhost-user.  Split them
into a separate file to ease copying them into programs that implement
either the server or the client side of vhost-user.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/uapi/linux/vhost.h       | 113 +---------------------------------
 include/uapi/linux/vhost_types.h | 128 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 130 insertions(+), 111 deletions(-)
 create mode 100644 include/uapi/linux/vhost_types.h

(limited to 'include/uapi')

diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
index 84c3de89696a..40d028eed645 100644
--- a/include/uapi/linux/vhost.h
+++ b/include/uapi/linux/vhost.h
@@ -11,94 +11,9 @@
  * device configuration.
  */
 
+#include <linux/vhost_types.h>
 #include <linux/types.h>
-#include <linux/compiler.h>
 #include <linux/ioctl.h>
-#include <linux/virtio_config.h>
-#include <linux/virtio_ring.h>
-
-struct vhost_vring_state {
-	unsigned int index;
-	unsigned int num;
-};
-
-struct vhost_vring_file {
-	unsigned int index;
-	int fd; /* Pass -1 to unbind from file. */
-
-};
-
-struct vhost_vring_addr {
-	unsigned int index;
-	/* Option flags. */
-	unsigned int flags;
-	/* Flag values: */
-	/* Whether log address is valid. If set enables logging. */
-#define VHOST_VRING_F_LOG 0
-
-	/* Start of array of descriptors (virtually contiguous) */
-	__u64 desc_user_addr;
-	/* Used structure address. Must be 32 bit aligned */
-	__u64 used_user_addr;
-	/* Available structure address. Must be 16 bit aligned */
-	__u64 avail_user_addr;
-	/* Logging support. */
-	/* Log writes to used structure, at offset calculated from specified
-	 * address. Address must be 32 bit aligned. */
-	__u64 log_guest_addr;
-};
-
-/* no alignment requirement */
-struct vhost_iotlb_msg {
-	__u64 iova;
-	__u64 size;
-	__u64 uaddr;
-#define VHOST_ACCESS_RO      0x1
-#define VHOST_ACCESS_WO      0x2
-#define VHOST_ACCESS_RW      0x3
-	__u8 perm;
-#define VHOST_IOTLB_MISS           1
-#define VHOST_IOTLB_UPDATE         2
-#define VHOST_IOTLB_INVALIDATE     3
-#define VHOST_IOTLB_ACCESS_FAIL    4
-	__u8 type;
-};
-
-#define VHOST_IOTLB_MSG 0x1
-#define VHOST_IOTLB_MSG_V2 0x2
-
-struct vhost_msg {
-	int type;
-	union {
-		struct vhost_iotlb_msg iotlb;
-		__u8 padding[64];
-	};
-};
-
-struct vhost_msg_v2 {
-	__u32 type;
-	__u32 reserved;
-	union {
-		struct vhost_iotlb_msg iotlb;
-		__u8 padding[64];
-	};
-};
-
-struct vhost_memory_region {
-	__u64 guest_phys_addr;
-	__u64 memory_size; /* bytes */
-	__u64 userspace_addr;
-	__u64 flags_padding; /* No flags are currently specified. */
-};
-
-/* All region addresses and sizes must be 4K aligned. */
-#define VHOST_PAGE_SIZE 0x1000
-
-struct vhost_memory {
-	__u32 nregions;
-	__u32 padding;
-	struct vhost_memory_region regions[0];
-};
 
 /* ioctls */
 
@@ -186,31 +101,7 @@ struct vhost_memory {
  * device.  This can be used to stop the ring (e.g. for migration). */
 #define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file)
 
-/* Feature bits */
-/* Log all write descriptors. Can be changed while device is active. */
-#define VHOST_F_LOG_ALL 26
-/* vhost-net should add virtio_net_hdr for RX, and strip for TX packets. */
-#define VHOST_NET_F_VIRTIO_NET_HDR 27
-
-/* VHOST_SCSI specific definitions */
-
-/*
- * Used by QEMU userspace to ensure a consistent vhost-scsi ABI.
- *
- * ABI Rev 0: July 2012 version starting point for v3.6-rc merge candidate +
- *            RFC-v2 vhost-scsi userspace.  Add GET_ABI_VERSION ioctl usage
- * ABI Rev 1: January 2013. Ignore vhost_tpgt filed in struct vhost_scsi_target.
- *            All the targets under vhost_wwpn can be seen and used by guset.
- */
-
-#define VHOST_SCSI_ABI_VERSION	1
-
-struct vhost_scsi_target {
-	int abi_version;
-	char vhost_wwpn[224]; /* TRANSPORT_IQN_LEN */
-	unsigned short vhost_tpgt;
-	unsigned short reserved;
-};
+/* VHOST_SCSI specific defines */
 
 #define VHOST_SCSI_SET_ENDPOINT _IOW(VHOST_VIRTIO, 0x40, struct vhost_scsi_target)
 #define VHOST_SCSI_CLEAR_ENDPOINT _IOW(VHOST_VIRTIO, 0x41, struct vhost_scsi_target)
diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h
new file mode 100644
index 000000000000..c907290ff065
--- /dev/null
+++ b/include/uapi/linux/vhost_types.h
@@ -0,0 +1,128 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _LINUX_VHOST_TYPES_H
+#define _LINUX_VHOST_TYPES_H
+/* Userspace interface for in-kernel virtio accelerators. */
+
+/* vhost is used to reduce the number of system calls involved in virtio.
+ *
+ * Existing virtio net code is used in the guest without modification.
+ *
+ * This header includes interface used by userspace hypervisor for
+ * device configuration.
+ */
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_ring.h>
+
+struct vhost_vring_state {
+	unsigned int index;
+	unsigned int num;
+};
+
+struct vhost_vring_file {
+	unsigned int index;
+	int fd; /* Pass -1 to unbind from file. */
+
+};
+
+struct vhost_vring_addr {
+	unsigned int index;
+	/* Option flags. */
+	unsigned int flags;
+	/* Flag values: */
+	/* Whether log address is valid. If set enables logging. */
+#define VHOST_VRING_F_LOG 0
+
+	/* Start of array of descriptors (virtually contiguous) */
+	__u64 desc_user_addr;
+	/* Used structure address. Must be 32 bit aligned */
+	__u64 used_user_addr;
+	/* Available structure address. Must be 16 bit aligned */
+	__u64 avail_user_addr;
+	/* Logging support. */
+	/* Log writes to used structure, at offset calculated from specified
+	 * address. Address must be 32 bit aligned. */
+	__u64 log_guest_addr;
+};
+
+/* no alignment requirement */
+struct vhost_iotlb_msg {
+	__u64 iova;
+	__u64 size;
+	__u64 uaddr;
+#define VHOST_ACCESS_RO      0x1
+#define VHOST_ACCESS_WO      0x2
+#define VHOST_ACCESS_RW      0x3
+	__u8 perm;
+#define VHOST_IOTLB_MISS           1
+#define VHOST_IOTLB_UPDATE         2
+#define VHOST_IOTLB_INVALIDATE     3
+#define VHOST_IOTLB_ACCESS_FAIL    4
+	__u8 type;
+};
+
+#define VHOST_IOTLB_MSG 0x1
+#define VHOST_IOTLB_MSG_V2 0x2
+
+struct vhost_msg {
+	int type;
+	union {
+		struct vhost_iotlb_msg iotlb;
+		__u8 padding[64];
+	};
+};
+
+struct vhost_msg_v2 {
+	__u32 type;
+	__u32 reserved;
+	union {
+		struct vhost_iotlb_msg iotlb;
+		__u8 padding[64];
+	};
+};
+
+struct vhost_memory_region {
+	__u64 guest_phys_addr;
+	__u64 memory_size; /* bytes */
+	__u64 userspace_addr;
+	__u64 flags_padding; /* No flags are currently specified. */
+};
+
+/* All region addresses and sizes must be 4K aligned. */
+#define VHOST_PAGE_SIZE 0x1000
+
+struct vhost_memory {
+	__u32 nregions;
+	__u32 padding;
+	struct vhost_memory_region regions[0];
+};
+
+/* VHOST_SCSI specific definitions */
+
+/*
+ * Used by QEMU userspace to ensure a consistent vhost-scsi ABI.
+ *
+ * ABI Rev 0: July 2012 version starting point for v3.6-rc merge candidate +
+ *            RFC-v2 vhost-scsi userspace.  Add GET_ABI_VERSION ioctl usage
+ * ABI Rev 1: January 2013. Ignore vhost_tpgt field in struct vhost_scsi_target.
+ *            All the targets under vhost_wwpn can be seen and used by guset.
+ */
+
+#define VHOST_SCSI_ABI_VERSION	1
+
+struct vhost_scsi_target {
+	int abi_version;
+	char vhost_wwpn[224]; /* TRANSPORT_IQN_LEN */
+	unsigned short vhost_tpgt;
+	unsigned short reserved;
+};
+
+/* Feature bits */
+/* Log all write descriptors. Can be changed while device is active. */
+#define VHOST_F_LOG_ALL 26
+/* vhost-net should add virtio_net_hdr for RX, and strip for TX packets. */
+#define VHOST_NET_F_VIRTIO_NET_HDR 27
+
+#endif
-- 
cgit v1.2.3-59-g8ed1b


From e262e32d6bde0f77fb0c95d977482fc872c51996 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 1 Nov 2018 23:07:23 +0000
Subject: vfs: Suppress MS_* flag defs within the kernel unless explicitly
 enabled

Only the mount namespace code that implements mount(2) should be using the
MS_* flags.  Suppress them inside the kernel unless uapi/linux/mount.h is
included.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Reviewed-by: David Howells <dhowells@redhat.com>
---
 arch/arc/kernel/setup.c       |  1 +
 arch/arm/kernel/atags_parse.c |  1 +
 arch/sh/kernel/setup.c        |  1 +
 arch/sparc/kernel/setup_32.c  |  1 +
 arch/sparc/kernel/setup_64.c  |  1 +
 arch/x86/kernel/setup.c       |  1 +
 drivers/base/devtmpfs.c       |  1 +
 fs/namespace.c                |  1 +
 fs/pnode.c                    |  1 +
 fs/super.c                    |  1 +
 include/uapi/linux/fs.h       | 56 ++++-------------------------------------
 include/uapi/linux/mount.h    | 58 +++++++++++++++++++++++++++++++++++++++++++
 init/do_mounts.c              |  1 +
 init/do_mounts_initrd.c       |  1 +
 security/apparmor/lsm.c       |  1 +
 security/apparmor/mount.c     |  1 +
 security/selinux/hooks.c      |  1 +
 security/tomoyo/mount.c       |  1 +
 18 files changed, 79 insertions(+), 51 deletions(-)
 create mode 100644 include/uapi/linux/mount.h

(limited to 'include/uapi')

diff --git a/arch/arc/kernel/setup.c b/arch/arc/kernel/setup.c
index b2cae79a25d7..714dc5c2baf1 100644
--- a/arch/arc/kernel/setup.c
+++ b/arch/arc/kernel/setup.c
@@ -19,6 +19,7 @@
 #include <linux/of_fdt.h>
 #include <linux/of.h>
 #include <linux/cache.h>
+#include <uapi/linux/mount.h>
 #include <asm/sections.h>
 #include <asm/arcregs.h>
 #include <asm/tlb.h>
diff --git a/arch/arm/kernel/atags_parse.c b/arch/arm/kernel/atags_parse.c
index c10a3e8ee998..a8a4333929f5 100644
--- a/arch/arm/kernel/atags_parse.c
+++ b/arch/arm/kernel/atags_parse.c
@@ -24,6 +24,7 @@
 #include <linux/root_dev.h>
 #include <linux/screen_info.h>
 #include <linux/memblock.h>
+#include <uapi/linux/mount.h>
 
 #include <asm/setup.h>
 #include <asm/system_info.h>
diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c
index c286cf5da6e7..2c0e0f37a318 100644
--- a/arch/sh/kernel/setup.c
+++ b/arch/sh/kernel/setup.c
@@ -32,6 +32,7 @@
 #include <linux/of.h>
 #include <linux/of_fdt.h>
 #include <linux/uaccess.h>
+#include <uapi/linux/mount.h>
 #include <asm/io.h>
 #include <asm/page.h>
 #include <asm/elf.h>
diff --git a/arch/sparc/kernel/setup_32.c b/arch/sparc/kernel/setup_32.c
index 13664c377196..7df3d704284c 100644
--- a/arch/sparc/kernel/setup_32.c
+++ b/arch/sparc/kernel/setup_32.c
@@ -34,6 +34,7 @@
 #include <linux/kdebug.h>
 #include <linux/export.h>
 #include <linux/start_kernel.h>
+#include <uapi/linux/mount.h>
 
 #include <asm/io.h>
 #include <asm/processor.h>
diff --git a/arch/sparc/kernel/setup_64.c b/arch/sparc/kernel/setup_64.c
index cd2825cb8420..014390950333 100644
--- a/arch/sparc/kernel/setup_64.c
+++ b/arch/sparc/kernel/setup_64.c
@@ -33,6 +33,7 @@
 #include <linux/module.h>
 #include <linux/start_kernel.h>
 #include <linux/memblock.h>
+#include <uapi/linux/mount.h>
 
 #include <asm/io.h>
 #include <asm/processor.h>
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b74e7bfed6ab..25a9802fffec 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -50,6 +50,7 @@
 #include <linux/kvm_para.h>
 #include <linux/dma-contiguous.h>
 #include <xen/xen.h>
+#include <uapi/linux/mount.h>
 
 #include <linux/errno.h>
 #include <linux/kernel.h>
diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c
index b93fc862d365..0dbc43068eeb 100644
--- a/drivers/base/devtmpfs.c
+++ b/drivers/base/devtmpfs.c
@@ -25,6 +25,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/kthread.h>
+#include <uapi/linux/mount.h>
 #include "base.h"
 
 static struct task_struct *thread;
diff --git a/fs/namespace.c b/fs/namespace.c
index 98d27da43304..6ae784ece25c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -26,6 +26,7 @@
 #include <linux/memblock.h>
 #include <linux/task_work.h>
 #include <linux/sched/task.h>
+#include <uapi/linux/mount.h>
 
 #include "pnode.h"
 #include "internal.h"
diff --git a/fs/pnode.c b/fs/pnode.c
index 53d411a371ce..1100e810d855 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -10,6 +10,7 @@
 #include <linux/mount.h>
 #include <linux/fs.h>
 #include <linux/nsproxy.h>
+#include <uapi/linux/mount.h>
 #include "internal.h"
 #include "pnode.h"
 
diff --git a/fs/super.c b/fs/super.c
index ca53a08497ed..6654de035893 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -35,6 +35,7 @@
 #include <linux/fsnotify.h>
 #include <linux/lockdep.h>
 #include <linux/user_namespace.h>
+#include <uapi/linux/mount.h>
 #include "internal.h"
 
 static int thaw_super_locked(struct super_block *sb);
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index a441ea1bfe6d..53a22e8e0408 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -14,6 +14,11 @@
 #include <linux/ioctl.h>
 #include <linux/types.h>
 
+/* Use of MS_* flags within the kernel is restricted to core mount(2) code. */
+#if !defined(__KERNEL__)
+#include <linux/mount.h>
+#endif
+
 /*
  * It's silly to have NR_OPEN bigger than NR_FILE, but you can change
  * the file limit at runtime and only root can increase the per-process
@@ -101,57 +106,6 @@ struct inodes_stat_t {
 
 #define NR_FILE  8192	/* this can well be larger on a larger system */
 
-
-/*
- * These are the fs-independent mount-flags: up to 32 flags are supported
- */
-#define MS_RDONLY	 1	/* Mount read-only */
-#define MS_NOSUID	 2	/* Ignore suid and sgid bits */
-#define MS_NODEV	 4	/* Disallow access to device special files */
-#define MS_NOEXEC	 8	/* Disallow program execution */
-#define MS_SYNCHRONOUS	16	/* Writes are synced at once */
-#define MS_REMOUNT	32	/* Alter flags of a mounted FS */
-#define MS_MANDLOCK	64	/* Allow mandatory locks on an FS */
-#define MS_DIRSYNC	128	/* Directory modifications are synchronous */
-#define MS_NOATIME	1024	/* Do not update access times. */
-#define MS_NODIRATIME	2048	/* Do not update directory access times */
-#define MS_BIND		4096
-#define MS_MOVE		8192
-#define MS_REC		16384
-#define MS_VERBOSE	32768	/* War is peace. Verbosity is silence.
-				   MS_VERBOSE is deprecated. */
-#define MS_SILENT	32768
-#define MS_POSIXACL	(1<<16)	/* VFS does not apply the umask */
-#define MS_UNBINDABLE	(1<<17)	/* change to unbindable */
-#define MS_PRIVATE	(1<<18)	/* change to private */
-#define MS_SLAVE	(1<<19)	/* change to slave */
-#define MS_SHARED	(1<<20)	/* change to shared */
-#define MS_RELATIME	(1<<21)	/* Update atime relative to mtime/ctime. */
-#define MS_KERNMOUNT	(1<<22) /* this is a kern_mount call */
-#define MS_I_VERSION	(1<<23) /* Update inode I_version field */
-#define MS_STRICTATIME	(1<<24) /* Always perform atime updates */
-#define MS_LAZYTIME	(1<<25) /* Update the on-disk [acm]times lazily */
-
-/* These sb flags are internal to the kernel */
-#define MS_SUBMOUNT     (1<<26)
-#define MS_NOREMOTELOCK	(1<<27)
-#define MS_NOSEC	(1<<28)
-#define MS_BORN		(1<<29)
-#define MS_ACTIVE	(1<<30)
-#define MS_NOUSER	(1<<31)
-
-/*
- * Superblock flags that can be altered by MS_REMOUNT
- */
-#define MS_RMT_MASK	(MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION|\
-			 MS_LAZYTIME)
-
-/*
- * Old magic mount flag and mask
- */
-#define MS_MGC_VAL 0xC0ED0000
-#define MS_MGC_MSK 0xffff0000
-
 /*
  * Structure for FS_IOC_FSGETXATTR[A] and FS_IOC_FSSETXATTR.
  */
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
new file mode 100644
index 000000000000..3f9ec42510b0
--- /dev/null
+++ b/include/uapi/linux/mount.h
@@ -0,0 +1,58 @@
+#ifndef _UAPI_LINUX_MOUNT_H
+#define _UAPI_LINUX_MOUNT_H
+
+/*
+ * These are the fs-independent mount-flags: up to 32 flags are supported
+ *
+ * Usage of these is restricted within the kernel to core mount(2) code and
+ * callers of sys_mount() only.  Filesystems should be using the SB_*
+ * equivalent instead.
+ */
+#define MS_RDONLY	 1	/* Mount read-only */
+#define MS_NOSUID	 2	/* Ignore suid and sgid bits */
+#define MS_NODEV	 4	/* Disallow access to device special files */
+#define MS_NOEXEC	 8	/* Disallow program execution */
+#define MS_SYNCHRONOUS	16	/* Writes are synced at once */
+#define MS_REMOUNT	32	/* Alter flags of a mounted FS */
+#define MS_MANDLOCK	64	/* Allow mandatory locks on an FS */
+#define MS_DIRSYNC	128	/* Directory modifications are synchronous */
+#define MS_NOATIME	1024	/* Do not update access times. */
+#define MS_NODIRATIME	2048	/* Do not update directory access times */
+#define MS_BIND		4096
+#define MS_MOVE		8192
+#define MS_REC		16384
+#define MS_VERBOSE	32768	/* War is peace. Verbosity is silence.
+				   MS_VERBOSE is deprecated. */
+#define MS_SILENT	32768
+#define MS_POSIXACL	(1<<16)	/* VFS does not apply the umask */
+#define MS_UNBINDABLE	(1<<17)	/* change to unbindable */
+#define MS_PRIVATE	(1<<18)	/* change to private */
+#define MS_SLAVE	(1<<19)	/* change to slave */
+#define MS_SHARED	(1<<20)	/* change to shared */
+#define MS_RELATIME	(1<<21)	/* Update atime relative to mtime/ctime. */
+#define MS_KERNMOUNT	(1<<22) /* this is a kern_mount call */
+#define MS_I_VERSION	(1<<23) /* Update inode I_version field */
+#define MS_STRICTATIME	(1<<24) /* Always perform atime updates */
+#define MS_LAZYTIME	(1<<25) /* Update the on-disk [acm]times lazily */
+
+/* These sb flags are internal to the kernel */
+#define MS_SUBMOUNT     (1<<26)
+#define MS_NOREMOTELOCK	(1<<27)
+#define MS_NOSEC	(1<<28)
+#define MS_BORN		(1<<29)
+#define MS_ACTIVE	(1<<30)
+#define MS_NOUSER	(1<<31)
+
+/*
+ * Superblock flags that can be altered by MS_REMOUNT
+ */
+#define MS_RMT_MASK	(MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION|\
+			 MS_LAZYTIME)
+
+/*
+ * Old magic mount flag and mask
+ */
+#define MS_MGC_VAL 0xC0ED0000
+#define MS_MGC_MSK 0xffff0000
+
+#endif /* _UAPI_LINUX_MOUNT_H */
diff --git a/init/do_mounts.c b/init/do_mounts.c
index a754e3ba9831..f8c230c77035 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -22,6 +22,7 @@
 #include <linux/nfs_fs.h>
 #include <linux/nfs_fs_sb.h>
 #include <linux/nfs_mount.h>
+#include <uapi/linux/mount.h>
 
 #include "do_mounts.h"
 
diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c
index d1a5d885ce13..56a557403d39 100644
--- a/init/do_mounts_initrd.c
+++ b/init/do_mounts_initrd.c
@@ -8,6 +8,7 @@
 #include <linux/sched.h>
 #include <linux/freezer.h>
 #include <linux/kmod.h>
+#include <uapi/linux/mount.h>
 
 #include "do_mounts.h"
 
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index 42446a216f3b..2c010874329f 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -26,6 +26,7 @@
 #include <linux/netfilter_ipv4.h>
 #include <linux/netfilter_ipv6.h>
 #include <net/sock.h>
+#include <uapi/linux/mount.h>
 
 #include "include/apparmor.h"
 #include "include/apparmorfs.h"
diff --git a/security/apparmor/mount.c b/security/apparmor/mount.c
index c1da22482bfb..8c3787399356 100644
--- a/security/apparmor/mount.c
+++ b/security/apparmor/mount.c
@@ -15,6 +15,7 @@
 #include <linux/fs.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
+#include <uapi/linux/mount.h>
 
 #include "include/apparmor.h"
 #include "include/audit.h"
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 7ce683259357..f695438d985c 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -88,6 +88,7 @@
 #include <linux/msg.h>
 #include <linux/shm.h>
 #include <linux/bpf.h>
+#include <uapi/linux/mount.h>
 
 #include "avc.h"
 #include "objsec.h"
diff --git a/security/tomoyo/mount.c b/security/tomoyo/mount.c
index 807fd91dbb54..7dc7f59b7dde 100644
--- a/security/tomoyo/mount.c
+++ b/security/tomoyo/mount.c
@@ -6,6 +6,7 @@
  */
 
 #include <linux/slab.h>
+#include <uapi/linux/mount.h>
 #include "common.h"
 
 /* String table for special mount operations. */
-- 
cgit v1.2.3-59-g8ed1b


From b4a1ed0cd18b771e4279b4eb9cf39b565560eea6 Mon Sep 17 00:00:00 2001
From: Rob Clark <robdclark@gmail.com>
Date: Thu, 20 Dec 2018 19:13:07 +0100
Subject: fbdev: make FB_BACKLIGHT a tristate

BACKLIGHT_CLASS_DEVICE is already tristate, but a dependency
FB_BACKLIGHT prevents it from being built as a module.  There
doesn't seem to be any particularly good reason for this, so
switch FB_BACKLIGHT over to tristate.

Signed-off-by: Rob Clark <robdclark@gmail.com>
Tested-by: Arnd Bergmann <arnd@arndb.de>
Cc: Simon Horman <horms+renesas@verge.net.au>
Cc: Geert Uytterhoeven <geert+renesas@glider.be>
Cc: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Ulf Magnusson <ulfalizer@gmail.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Hans de Goede <j.w.r.degoede@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
---
 drivers/video/fbdev/Kconfig        | 2 +-
 drivers/video/fbdev/core/fbsysfs.c | 8 ++++----
 include/linux/fb.h                 | 2 +-
 include/uapi/linux/fb.h            | 2 --
 4 files changed, 6 insertions(+), 8 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/video/fbdev/Kconfig b/drivers/video/fbdev/Kconfig
index e413f54208f4..5d85965767e3 100644
--- a/drivers/video/fbdev/Kconfig
+++ b/drivers/video/fbdev/Kconfig
@@ -184,7 +184,7 @@ config FB_MACMODES
        depends on FB
 
 config FB_BACKLIGHT
-	bool
+	tristate
 	depends on FB
 	select BACKLIGHT_LCD_SUPPORT
 	select BACKLIGHT_CLASS_DEVICE
diff --git a/drivers/video/fbdev/core/fbsysfs.c b/drivers/video/fbdev/core/fbsysfs.c
index e31a182b42bf..44cca39f2b51 100644
--- a/drivers/video/fbdev/core/fbsysfs.c
+++ b/drivers/video/fbdev/core/fbsysfs.c
@@ -60,7 +60,7 @@ struct fb_info *framebuffer_alloc(size_t size, struct device *dev)
 	info->device = dev;
 	info->fbcon_rotate_hint = -1;
 
-#ifdef CONFIG_FB_BACKLIGHT
+#if IS_ENABLED(CONFIG_FB_BACKLIGHT)
 	mutex_init(&info->bl_curve_mutex);
 #endif
 
@@ -429,7 +429,7 @@ static ssize_t show_fbstate(struct device *device,
 	return snprintf(buf, PAGE_SIZE, "%d\n", fb_info->state);
 }
 
-#ifdef CONFIG_FB_BACKLIGHT
+#if IS_ENABLED(CONFIG_FB_BACKLIGHT)
 static ssize_t store_bl_curve(struct device *device,
 			      struct device_attribute *attr,
 			      const char *buf, size_t count)
@@ -510,7 +510,7 @@ static struct device_attribute device_attrs[] = {
 	__ATTR(stride, S_IRUGO, show_stride, NULL),
 	__ATTR(rotate, S_IRUGO|S_IWUSR, show_rotate, store_rotate),
 	__ATTR(state, S_IRUGO|S_IWUSR, show_fbstate, store_fbstate),
-#ifdef CONFIG_FB_BACKLIGHT
+#if IS_ENABLED(CONFIG_FB_BACKLIGHT)
 	__ATTR(bl_curve, S_IRUGO|S_IWUSR, show_bl_curve, store_bl_curve),
 #endif
 };
@@ -551,7 +551,7 @@ void fb_cleanup_device(struct fb_info *fb_info)
 	}
 }
 
-#ifdef CONFIG_FB_BACKLIGHT
+#if IS_ENABLED(CONFIG_FB_BACKLIGHT)
 /* This function generates a linear backlight curve
  *
  *     0: off
diff --git a/include/linux/fb.h b/include/linux/fb.h
index a3cab6dc9b44..7cdd31a69719 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -485,7 +485,7 @@ struct fb_info {
 	struct list_head modelist;      /* mode list */
 	struct fb_videomode *mode;	/* current mode */
 
-#ifdef CONFIG_FB_BACKLIGHT
+#if IS_ENABLED(CONFIG_FB_BACKLIGHT)
 	/* assigned backlight device */
 	/* set before framebuffer registration, 
 	   remove after unregister */
diff --git a/include/uapi/linux/fb.h b/include/uapi/linux/fb.h
index 6cd9b198b7c6..b6aac7ee1f67 100644
--- a/include/uapi/linux/fb.h
+++ b/include/uapi/linux/fb.h
@@ -393,11 +393,9 @@ struct fb_cursor {
 	struct fb_image	image;	/* Cursor image */
 };
 
-#ifdef CONFIG_FB_BACKLIGHT
 /* Settings for the generic backlight code */
 #define FB_BACKLIGHT_LEVELS	128
 #define FB_BACKLIGHT_MAX	0xFF
-#endif
 
 
 #endif /* _UAPI_LINUX_FB_H */
-- 
cgit v1.2.3-59-g8ed1b


From 4fa2813d26c82680216e535c18a690ca0c4e860a Mon Sep 17 00:00:00 2001
From: Michael Guralnik <michaelgur@mellanox.com>
Date: Sun, 9 Dec 2018 11:58:05 +0200
Subject: RDMA/nldev: Expose port_cap_flags2

port_cap_flags2 represents IBTA PortInfo:CapabilityMask2.

The field safely extends the RDMA_NLDEV_ATTR_CAP_FLAGS operand as it was
exported as 64 bit to allow this kind of extension.

Signed-off-by: Michael Guralnik <michaelgur@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/nldev.c  | 9 ++++++---
 include/uapi/rdma/rdma_netlink.h | 3 +++
 2 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index 093bbfcdc53b..e600fc23ae62 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -227,6 +227,7 @@ static int fill_port_info(struct sk_buff *msg,
 	struct net_device *netdev = NULL;
 	struct ib_port_attr attr;
 	int ret;
+	u64 cap_flags = 0;
 
 	if (fill_nldev_handle(msg, device))
 		return -EMSGSIZE;
@@ -239,10 +240,12 @@ static int fill_port_info(struct sk_buff *msg,
 		return ret;
 
 	if (rdma_protocol_ib(device, port)) {
-		BUILD_BUG_ON(sizeof(attr.port_cap_flags) > sizeof(u64));
+		BUILD_BUG_ON((sizeof(attr.port_cap_flags) +
+				sizeof(attr.port_cap_flags2)) > sizeof(u64));
+		cap_flags = attr.port_cap_flags |
+			((u64)attr.port_cap_flags2 << 32);
 		if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS,
-				      (u64)attr.port_cap_flags,
-				      RDMA_NLDEV_ATTR_PAD))
+				      cap_flags, RDMA_NLDEV_ATTR_PAD))
 			return -EMSGSIZE;
 		if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SUBNET_PREFIX,
 				      attr.subnet_prefix, RDMA_NLDEV_ATTR_PAD))
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index f9c41bf59efc..2e18b77a817f 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -283,6 +283,9 @@ enum rdma_nldev_attr {
 
 	/*
 	 * Device and port capabilities
+	 *
+	 * When used for port info, first 32-bits are CapabilityMask followed by
+	 * 16-bit CapabilityMask2.
 	 */
 	RDMA_NLDEV_ATTR_CAP_FLAGS,		/* u64 */
 
-- 
cgit v1.2.3-59-g8ed1b


From 641d1207d2ed0ef21ff5ad61e067d630028e2f62 Mon Sep 17 00:00:00 2001
From: Michael Guralnik <michaelgur@mellanox.com>
Date: Sun, 9 Dec 2018 11:58:06 +0200
Subject: IB/core: Move query port to ioctl

Add a method for query port under the uverbs global methods.  Current
ib_port_attr struct is passed as a single attribute and port_cap_flags2 is
added as a new attribute to the function.

Signed-off-by: Michael Guralnik <michaelgur@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/uverbs.h                  | 25 +++++++
 drivers/infiniband/core/uverbs_cmd.c              | 53 +--------------
 drivers/infiniband/core/uverbs_std_types_device.c | 79 ++++++++++++++++++++++-
 include/uapi/rdma/ib_user_ioctl_cmds.h            |  7 ++
 include/uapi/rdma/ib_user_ioctl_verbs.h           |  7 ++
 5 files changed, 118 insertions(+), 53 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index 88029f3b6853..ea0bc6885517 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -293,4 +293,29 @@ extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_FLOW_ACTION);
 extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_DM);
 extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_COUNTERS);
 
+/*
+ * ib_uverbs_query_port_resp.port_cap_flags started out as just a copy of the
+ * PortInfo CapabilityMask, but was extended with unique bits.
+ */
+static inline u32 make_port_cap_flags(const struct ib_port_attr *attr)
+{
+	u32 res;
+
+	/* All IBA CapabilityMask bits are passed through here, except bit 26,
+	 * which is overridden with IP_BASED_GIDS. This is due to a historical
+	 * mistake in the implementation of IP_BASED_GIDS. Otherwise all other
+	 * bits match the IBA definition across all kernel versions.
+	 */
+	res = attr->port_cap_flags & ~(u32)IB_UVERBS_PCF_IP_BASED_GIDS;
+
+	if (attr->ip_gids)
+		res |= IB_UVERBS_PCF_IP_BASED_GIDS;
+
+	return res;
+}
+
+
+void copy_port_attr_to_resp(struct ib_port_attr *attr,
+			    struct ib_uverbs_query_port_resp *resp,
+			    struct ib_device *ib_dev, u8 port_num);
 #endif /* UVERBS_H */
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 4dc454d35f32..6b12cc5f97b2 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -361,27 +361,6 @@ static int ib_uverbs_query_device(struct uverbs_attr_bundle *attrs)
 	return uverbs_response(attrs, &resp, sizeof(resp));
 }
 
-/*
- * ib_uverbs_query_port_resp.port_cap_flags started out as just a copy of the
- * PortInfo CapabilityMask, but was extended with unique bits.
- */
-static u32 make_port_cap_flags(const struct ib_port_attr *attr)
-{
-	u32 res;
-
-	/* All IBA CapabilityMask bits are passed through here, except bit 26,
-	 * which is overridden with IP_BASED_GIDS. This is due to a historical
-	 * mistake in the implementation of IP_BASED_GIDS. Otherwise all other
-	 * bits match the IBA definition across all kernel versions.
-	 */
-	res = attr->port_cap_flags & ~(u32)IB_UVERBS_PCF_IP_BASED_GIDS;
-
-	if (attr->ip_gids)
-		res |= IB_UVERBS_PCF_IP_BASED_GIDS;
-
-	return res;
-}
-
 static int ib_uverbs_query_port(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_query_port      cmd;
@@ -405,37 +384,7 @@ static int ib_uverbs_query_port(struct uverbs_attr_bundle *attrs)
 		return ret;
 
 	memset(&resp, 0, sizeof resp);
-
-	resp.state 	     = attr.state;
-	resp.max_mtu 	     = attr.max_mtu;
-	resp.active_mtu      = attr.active_mtu;
-	resp.gid_tbl_len     = attr.gid_tbl_len;
-	resp.port_cap_flags  = make_port_cap_flags(&attr);
-	resp.max_msg_sz      = attr.max_msg_sz;
-	resp.bad_pkey_cntr   = attr.bad_pkey_cntr;
-	resp.qkey_viol_cntr  = attr.qkey_viol_cntr;
-	resp.pkey_tbl_len    = attr.pkey_tbl_len;
-
-	if (rdma_is_grh_required(ib_dev, cmd.port_num))
-		resp.flags |= IB_UVERBS_QPF_GRH_REQUIRED;
-
-	if (rdma_cap_opa_ah(ib_dev, cmd.port_num)) {
-		resp.lid     = OPA_TO_IB_UCAST_LID(attr.lid);
-		resp.sm_lid  = OPA_TO_IB_UCAST_LID(attr.sm_lid);
-	} else {
-		resp.lid     = ib_lid_cpu16(attr.lid);
-		resp.sm_lid  = ib_lid_cpu16(attr.sm_lid);
-	}
-	resp.lmc 	     = attr.lmc;
-	resp.max_vl_num      = attr.max_vl_num;
-	resp.sm_sl 	     = attr.sm_sl;
-	resp.subnet_timeout  = attr.subnet_timeout;
-	resp.init_type_reply = attr.init_type_reply;
-	resp.active_width    = attr.active_width;
-	resp.active_speed    = attr.active_speed;
-	resp.phys_state      = attr.phys_state;
-	resp.link_layer      = rdma_port_get_link_layer(ib_dev,
-							cmd.port_num);
+	copy_port_attr_to_resp(&attr, &resp, ib_dev, cmd.port_num);
 
 	return uverbs_response(attrs, &resp, sizeof(resp));
 }
diff --git a/drivers/infiniband/core/uverbs_std_types_device.c b/drivers/infiniband/core/uverbs_std_types_device.c
index 165ece1eb655..5030ec480370 100644
--- a/drivers/infiniband/core/uverbs_std_types_device.c
+++ b/drivers/infiniband/core/uverbs_std_types_device.c
@@ -6,6 +6,8 @@
 #include <rdma/uverbs_std_types.h>
 #include "rdma_core.h"
 #include "uverbs.h"
+#include <rdma/uverbs_ioctl.h>
+#include <rdma/opa_addr.h>
 
 /*
  * This ioctl method allows calling any defined write or write_ex
@@ -127,6 +129,71 @@ err:
 	return ret;
 }
 
+void copy_port_attr_to_resp(struct ib_port_attr *attr,
+			    struct ib_uverbs_query_port_resp *resp,
+			    struct ib_device *ib_dev, u8 port_num)
+{
+	resp->state = attr->state;
+	resp->max_mtu = attr->max_mtu;
+	resp->active_mtu = attr->active_mtu;
+	resp->gid_tbl_len = attr->gid_tbl_len;
+	resp->port_cap_flags = make_port_cap_flags(attr);
+	resp->max_msg_sz = attr->max_msg_sz;
+	resp->bad_pkey_cntr = attr->bad_pkey_cntr;
+	resp->qkey_viol_cntr = attr->qkey_viol_cntr;
+	resp->pkey_tbl_len = attr->pkey_tbl_len;
+
+	if (rdma_is_grh_required(ib_dev, port_num))
+		resp->flags |= IB_UVERBS_QPF_GRH_REQUIRED;
+
+	if (rdma_cap_opa_ah(ib_dev, port_num)) {
+		resp->lid = OPA_TO_IB_UCAST_LID(attr->lid);
+		resp->sm_lid = OPA_TO_IB_UCAST_LID(attr->sm_lid);
+	} else {
+		resp->lid = ib_lid_cpu16(attr->lid);
+		resp->sm_lid = ib_lid_cpu16(attr->sm_lid);
+	}
+
+	resp->lmc = attr->lmc;
+	resp->max_vl_num = attr->max_vl_num;
+	resp->sm_sl = attr->sm_sl;
+	resp->subnet_timeout = attr->subnet_timeout;
+	resp->init_type_reply = attr->init_type_reply;
+	resp->active_width = attr->active_width;
+	resp->active_speed = attr->active_speed;
+	resp->phys_state = attr->phys_state;
+	resp->link_layer = rdma_port_get_link_layer(ib_dev, port_num);
+}
+
+static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_PORT)(
+	struct uverbs_attr_bundle *attrs)
+{
+	struct ib_device *ib_dev = attrs->ufile->device->ib_dev;
+	struct ib_port_attr attr = {};
+	struct ib_uverbs_query_port_resp_ex resp = {};
+	int ret;
+	u8 port_num;
+
+	/* FIXME: Extend the UAPI_DEF_OBJ_NEEDS_FN stuff.. */
+	if (!ib_dev->ops.query_port)
+		return -EOPNOTSUPP;
+
+	ret = uverbs_get_const(&port_num, attrs,
+			       UVERBS_ATTR_QUERY_PORT_PORT_NUM);
+	if (ret)
+		return ret;
+
+	ret = ib_query_port(ib_dev, port_num, &attr);
+	if (ret)
+		return ret;
+
+	copy_port_attr_to_resp(&attr, &resp.legacy_resp, ib_dev, port_num);
+	resp.port_cap_flags2 = attr.port_cap_flags2;
+
+	return uverbs_copy_to_struct_or_zero(attrs, UVERBS_ATTR_QUERY_PORT_RESP,
+					     &resp, sizeof(resp));
+}
+
 DECLARE_UVERBS_NAMED_METHOD(
 	UVERBS_METHOD_INFO_HANDLES,
 	/* Also includes any device specific object ids */
@@ -137,9 +204,19 @@ DECLARE_UVERBS_NAMED_METHOD(
 	UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_INFO_HANDLES_LIST,
 			    UVERBS_ATTR_MIN_SIZE(sizeof(u32)), UA_OPTIONAL));
 
+DECLARE_UVERBS_NAMED_METHOD(
+	UVERBS_METHOD_QUERY_PORT,
+	UVERBS_ATTR_CONST_IN(UVERBS_ATTR_QUERY_PORT_PORT_NUM, u8, UA_MANDATORY),
+	UVERBS_ATTR_PTR_OUT(
+		UVERBS_ATTR_QUERY_PORT_RESP,
+		UVERBS_ATTR_STRUCT(struct ib_uverbs_query_port_resp_ex,
+				   reserved),
+		UA_MANDATORY));
+
 DECLARE_UVERBS_GLOBAL_METHODS(UVERBS_OBJECT_DEVICE,
 			      &UVERBS_METHOD(UVERBS_METHOD_INVOKE_WRITE),
-			      &UVERBS_METHOD(UVERBS_METHOD_INFO_HANDLES));
+			      &UVERBS_METHOD(UVERBS_METHOD_INFO_HANDLES),
+			      &UVERBS_METHOD(UVERBS_METHOD_QUERY_PORT));
 
 const struct uapi_definition uverbs_def_obj_device[] = {
 	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DEVICE),
diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h
index f04647852add..64f0e3aacd3f 100644
--- a/include/uapi/rdma/ib_user_ioctl_cmds.h
+++ b/include/uapi/rdma/ib_user_ioctl_cmds.h
@@ -66,6 +66,7 @@ enum {
 enum uverbs_methods_device {
 	UVERBS_METHOD_INVOKE_WRITE,
 	UVERBS_METHOD_INFO_HANDLES,
+	UVERBS_METHOD_QUERY_PORT,
 };
 
 enum uverbs_attrs_invoke_write_cmd_attr_ids {
@@ -74,6 +75,11 @@ enum uverbs_attrs_invoke_write_cmd_attr_ids {
 	UVERBS_ATTR_WRITE_CMD,
 };
 
+enum uverbs_attrs_query_port_cmd_attr_ids {
+	UVERBS_ATTR_QUERY_PORT_PORT_NUM,
+	UVERBS_ATTR_QUERY_PORT_RESP,
+};
+
 enum uverbs_attrs_create_cq_cmd_attr_ids {
 	UVERBS_ATTR_CREATE_CQ_HANDLE,
 	UVERBS_ATTR_CREATE_CQ_CQE,
@@ -234,4 +240,5 @@ enum uverbs_methods_flow {
 enum uverbs_attrs_flow_destroy_ids {
 	UVERBS_ATTR_DESTROY_FLOW_HANDLE,
 };
+
 #endif
diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h
index 9dcd345b852a..72c7fc75f960 100644
--- a/include/uapi/rdma/ib_user_ioctl_verbs.h
+++ b/include/uapi/rdma/ib_user_ioctl_verbs.h
@@ -35,6 +35,7 @@
 #define IB_USER_IOCTL_VERBS_H
 
 #include <linux/types.h>
+#include <rdma/ib_user_verbs.h>
 
 #ifndef RDMA_UAPI_PTR
 #define RDMA_UAPI_PTR(_type, _name)	__aligned_u64 _name
@@ -166,4 +167,10 @@ enum ib_uverbs_advise_mr_flag {
 	IB_UVERBS_ADVISE_MR_FLAG_FLUSH = 1 << 0,
 };
 
+struct ib_uverbs_query_port_resp_ex {
+	struct ib_uverbs_query_port_resp legacy_resp;
+	__u16 port_cap_flags2;
+	__u8  reserved[6];
+};
+
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 7f92891778dff62303c070ac81de7b7d80de331a Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Thu, 20 Dec 2018 12:10:36 +1100
Subject: vfio_pci: Add NVIDIA GV100GL [Tesla V100 SXM2] subdriver

POWER9 Witherspoon machines come with 4 or 6 V100 GPUs which are not
pluggable PCIe devices but still have PCIe links which are used
for config space and MMIO. In addition to that the GPUs have 6 NVLinks
which are connected to other GPUs and the POWER9 CPU. POWER9 chips
have a special unit on a die called an NPU which is an NVLink2 host bus
adapter with p2p connections to 2 to 3 GPUs, 3 or 2 NVLinks to each.
These systems also support ATS (address translation services) which is
a part of the NVLink2 protocol. Such GPUs also share on-board RAM
(16GB or 32GB) to the system via the same NVLink2 so a CPU has
cache-coherent access to a GPU RAM.

This exports GPU RAM to the userspace as a new VFIO device region. This
preregisters the new memory as device memory as it might be used for DMA.
This inserts pfns from the fault handler as the GPU memory is not onlined
until the vendor driver is loaded and trained the NVLinks so doing this
earlier causes low level errors which we fence in the firmware so
it does not hurt the host system but still better be avoided; for the same
reason this does not map GPU RAM into the host kernel (usual thing for
emulated access otherwise).

This exports an ATSD (Address Translation Shootdown) register of NPU which
allows TLB invalidations inside GPU for an operating system. The register
conveniently occupies a single 64k page. It is also presented to
the userspace as a new VFIO device region. One NPU has 8 ATSD registers,
each of them can be used for TLB invalidation in a GPU linked to this NPU.
This allocates one ATSD register per an NVLink bridge allowing passing
up to 6 registers. Due to the host firmware bug (just recently fixed),
only 1 ATSD register per NPU was actually advertised to the host system
so this passes that alone register via the first NVLink bridge device in
the group which is still enough as QEMU collects them all back and
presents to the guest via vPHB to mimic the emulated NPU PHB on the host.

In order to provide the userspace with the information about GPU-to-NVLink
connections, this exports an additional capability called "tgt"
(which is an abbreviated host system bus address). The "tgt" property
tells the GPU its own system address and allows the guest driver to
conglomerate the routing information so each GPU knows how to get directly
to the other GPUs.

For ATS to work, the nest MMU (an NVIDIA block in a P9 CPU) needs to
know LPID (a logical partition ID or a KVM guest hardware ID in other
words) and PID (a memory context ID of a userspace process, not to be
confused with a linux pid). This assigns a GPU to LPID in the NPU and
this is why this adds a listener for KVM on an IOMMU group. A PID comes
via NVLink from a GPU and NPU uses a PID wildcard to pass it through.

This requires coherent memory and ATSD to be available on the host as
the GPU vendor only supports configurations with both features enabled
and other configurations are known not to work. Because of this and
because of the ways the features are advertised to the host system
(which is a device tree with very platform specific properties),
this requires enabled POWERNV platform.

The V100 GPUs do not advertise any of these capabilities via the config
space and there are more than just one device ID so this relies on
the platform to tell whether these GPUs have special abilities such as
NVLinks.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 drivers/vfio/pci/Kconfig            |   6 +
 drivers/vfio/pci/Makefile           |   1 +
 drivers/vfio/pci/trace.h            | 102 ++++++++
 drivers/vfio/pci/vfio_pci.c         |  27 +-
 drivers/vfio/pci/vfio_pci_nvlink2.c | 482 ++++++++++++++++++++++++++++++++++++
 drivers/vfio/pci/vfio_pci_private.h |  14 ++
 include/uapi/linux/vfio.h           |  42 ++++
 7 files changed, 672 insertions(+), 2 deletions(-)
 create mode 100644 drivers/vfio/pci/trace.h
 create mode 100644 drivers/vfio/pci/vfio_pci_nvlink2.c

(limited to 'include/uapi')

diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index 42dc1d3d71cf..d0f8e4f5a039 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -38,3 +38,9 @@ config VFIO_PCI_IGD
 	  and LPC bridge config space.
 
 	  To enable Intel IGD assignment through vfio-pci, say Y.
+
+config VFIO_PCI_NVLINK2
+	def_bool y
+	depends on VFIO_PCI && PPC_POWERNV
+	help
+	  VFIO PCI support for P9 Witherspoon machine with NVIDIA V100 GPUs
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index 76d8ec058edd..9662c063a6b1 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -1,5 +1,6 @@
 
 vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
 vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
+vfio-pci-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2.o
 
 obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
diff --git a/drivers/vfio/pci/trace.h b/drivers/vfio/pci/trace.h
new file mode 100644
index 000000000000..228ccdb8d1c8
--- /dev/null
+++ b/drivers/vfio/pci/trace.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * VFIO PCI mmap/mmap_fault tracepoints
+ *
+ * Copyright (C) 2018 IBM Corp.  All rights reserved.
+ *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM vfio_pci
+
+#if !defined(_TRACE_VFIO_PCI_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_VFIO_PCI_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(vfio_pci_nvgpu_mmap_fault,
+	TP_PROTO(struct pci_dev *pdev, unsigned long hpa, unsigned long ua,
+			vm_fault_t ret),
+	TP_ARGS(pdev, hpa, ua, ret),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(unsigned long, hpa)
+		__field(unsigned long, ua)
+		__field(int, ret)
+	),
+
+	TP_fast_assign(
+		__entry->name = dev_name(&pdev->dev),
+		__entry->hpa = hpa;
+		__entry->ua = ua;
+		__entry->ret = ret;
+	),
+
+	TP_printk("%s: %lx -> %lx ret=%d", __entry->name, __entry->hpa,
+			__entry->ua, __entry->ret)
+);
+
+TRACE_EVENT(vfio_pci_nvgpu_mmap,
+	TP_PROTO(struct pci_dev *pdev, unsigned long hpa, unsigned long ua,
+			unsigned long size, int ret),
+	TP_ARGS(pdev, hpa, ua, size, ret),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(unsigned long, hpa)
+		__field(unsigned long, ua)
+		__field(unsigned long, size)
+		__field(int, ret)
+	),
+
+	TP_fast_assign(
+		__entry->name = dev_name(&pdev->dev),
+		__entry->hpa = hpa;
+		__entry->ua = ua;
+		__entry->size = size;
+		__entry->ret = ret;
+	),
+
+	TP_printk("%s: %lx -> %lx size=%lx ret=%d", __entry->name, __entry->hpa,
+			__entry->ua, __entry->size, __entry->ret)
+);
+
+TRACE_EVENT(vfio_pci_npu2_mmap,
+	TP_PROTO(struct pci_dev *pdev, unsigned long hpa, unsigned long ua,
+			unsigned long size, int ret),
+	TP_ARGS(pdev, hpa, ua, size, ret),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(unsigned long, hpa)
+		__field(unsigned long, ua)
+		__field(unsigned long, size)
+		__field(int, ret)
+	),
+
+	TP_fast_assign(
+		__entry->name = dev_name(&pdev->dev),
+		__entry->hpa = hpa;
+		__entry->ua = ua;
+		__entry->size = size;
+		__entry->ret = ret;
+	),
+
+	TP_printk("%s: %lx -> %lx size=%lx ret=%d", __entry->name, __entry->hpa,
+			__entry->ua, __entry->size, __entry->ret)
+);
+
+#endif /* _TRACE_VFIO_PCI_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 0c94204e21f5..a89fa5d4e877 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -289,14 +289,37 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev)
 		if (ret) {
 			dev_warn(&vdev->pdev->dev,
 				 "Failed to setup Intel IGD regions\n");
-			vfio_pci_disable(vdev);
-			return ret;
+			goto disable_exit;
+		}
+	}
+
+	if (pdev->vendor == PCI_VENDOR_ID_NVIDIA &&
+	    IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) {
+		ret = vfio_pci_nvdia_v100_nvlink2_init(vdev);
+		if (ret && ret != -ENODEV) {
+			dev_warn(&vdev->pdev->dev,
+				 "Failed to setup NVIDIA NV2 RAM region\n");
+			goto disable_exit;
+		}
+	}
+
+	if (pdev->vendor == PCI_VENDOR_ID_IBM &&
+	    IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) {
+		ret = vfio_pci_ibm_npu2_init(vdev);
+		if (ret && ret != -ENODEV) {
+			dev_warn(&vdev->pdev->dev,
+					"Failed to setup NVIDIA NV2 ATSD region\n");
+			goto disable_exit;
 		}
 	}
 
 	vfio_pci_probe_mmaps(vdev);
 
 	return 0;
+
+disable_exit:
+	vfio_pci_disable(vdev);
+	return ret;
 }
 
 static void vfio_pci_disable(struct vfio_pci_device *vdev)
diff --git a/drivers/vfio/pci/vfio_pci_nvlink2.c b/drivers/vfio/pci/vfio_pci_nvlink2.c
new file mode 100644
index 000000000000..054a2cf9dd8e
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci_nvlink2.c
@@ -0,0 +1,482 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * VFIO PCI NVIDIA Whitherspoon GPU support a.k.a. NVLink2.
+ *
+ * Copyright (C) 2018 IBM Corp.  All rights reserved.
+ *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Register an on-GPU RAM region for cacheable access.
+ *
+ * Derived from original vfio_pci_igd.c:
+ * Copyright (C) 2016 Red Hat, Inc.  All rights reserved.
+ *	Author: Alex Williamson <alex.williamson@redhat.com>
+ */
+
+#include <linux/io.h>
+#include <linux/pci.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+#include <linux/sched/mm.h>
+#include <linux/mmu_context.h>
+#include <asm/kvm_ppc.h>
+#include "vfio_pci_private.h"
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap_fault);
+EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap);
+EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_npu2_mmap);
+
+struct vfio_pci_nvgpu_data {
+	unsigned long gpu_hpa; /* GPU RAM physical address */
+	unsigned long gpu_tgt; /* TGT address of corresponding GPU RAM */
+	unsigned long useraddr; /* GPU RAM userspace address */
+	unsigned long size; /* Size of the GPU RAM window (usually 128GB) */
+	struct mm_struct *mm;
+	struct mm_iommu_table_group_mem_t *mem; /* Pre-registered RAM descr. */
+	struct pci_dev *gpdev;
+	struct notifier_block group_notifier;
+};
+
+static size_t vfio_pci_nvgpu_rw(struct vfio_pci_device *vdev,
+		char __user *buf, size_t count, loff_t *ppos, bool iswrite)
+{
+	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
+	struct vfio_pci_nvgpu_data *data = vdev->region[i].data;
+	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+	loff_t posaligned = pos & PAGE_MASK, posoff = pos & ~PAGE_MASK;
+	size_t sizealigned;
+	void __iomem *ptr;
+
+	if (pos >= vdev->region[i].size)
+		return -EINVAL;
+
+	count = min(count, (size_t)(vdev->region[i].size - pos));
+
+	/*
+	 * We map only a bit of GPU RAM for a short time instead of mapping it
+	 * for the guest lifetime as:
+	 *
+	 * 1) we do not know GPU RAM size, only aperture which is 4-8 times
+	 *    bigger than actual RAM size (16/32GB RAM vs. 128GB aperture);
+	 * 2) mapping GPU RAM allows CPU to prefetch and if this happens
+	 *    before NVLink bridge is reset (which fences GPU RAM),
+	 *    hardware management interrupts (HMI) might happen, this
+	 *    will freeze NVLink bridge.
+	 *
+	 * This is not fast path anyway.
+	 */
+	sizealigned = _ALIGN_UP(posoff + count, PAGE_SIZE);
+	ptr = ioremap_cache(data->gpu_hpa + posaligned, sizealigned);
+	if (!ptr)
+		return -EFAULT;
+
+	if (iswrite) {
+		if (copy_from_user(ptr + posoff, buf, count))
+			count = -EFAULT;
+		else
+			*ppos += count;
+	} else {
+		if (copy_to_user(buf, ptr + posoff, count))
+			count = -EFAULT;
+		else
+			*ppos += count;
+	}
+
+	iounmap(ptr);
+
+	return count;
+}
+
+static void vfio_pci_nvgpu_release(struct vfio_pci_device *vdev,
+		struct vfio_pci_region *region)
+{
+	struct vfio_pci_nvgpu_data *data = region->data;
+	long ret;
+
+	/* If there were any mappings at all... */
+	if (data->mm) {
+		ret = mm_iommu_put(data->mm, data->mem);
+		WARN_ON(ret);
+
+		mmdrop(data->mm);
+	}
+
+	vfio_unregister_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY,
+			&data->group_notifier);
+
+	pnv_npu2_unmap_lpar_dev(data->gpdev);
+
+	kfree(data);
+}
+
+static vm_fault_t vfio_pci_nvgpu_mmap_fault(struct vm_fault *vmf)
+{
+	vm_fault_t ret;
+	struct vm_area_struct *vma = vmf->vma;
+	struct vfio_pci_region *region = vma->vm_private_data;
+	struct vfio_pci_nvgpu_data *data = region->data;
+	unsigned long vmf_off = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
+	unsigned long nv2pg = data->gpu_hpa >> PAGE_SHIFT;
+	unsigned long vm_pgoff = vma->vm_pgoff &
+		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+	unsigned long pfn = nv2pg + vm_pgoff + vmf_off;
+
+	ret = vmf_insert_pfn(vma, vmf->address, pfn);
+	trace_vfio_pci_nvgpu_mmap_fault(data->gpdev, pfn << PAGE_SHIFT,
+			vmf->address, ret);
+
+	return ret;
+}
+
+static const struct vm_operations_struct vfio_pci_nvgpu_mmap_vmops = {
+	.fault = vfio_pci_nvgpu_mmap_fault,
+};
+
+static int vfio_pci_nvgpu_mmap(struct vfio_pci_device *vdev,
+		struct vfio_pci_region *region, struct vm_area_struct *vma)
+{
+	int ret;
+	struct vfio_pci_nvgpu_data *data = region->data;
+
+	if (data->useraddr)
+		return -EPERM;
+
+	if (vma->vm_end - vma->vm_start > data->size)
+		return -EINVAL;
+
+	vma->vm_private_data = region;
+	vma->vm_flags |= VM_PFNMAP;
+	vma->vm_ops = &vfio_pci_nvgpu_mmap_vmops;
+
+	/*
+	 * Calling mm_iommu_newdev() here once as the region is not
+	 * registered yet and therefore right initialization will happen now.
+	 * Other places will use mm_iommu_find() which returns
+	 * registered @mem and does not go gup().
+	 */
+	data->useraddr = vma->vm_start;
+	data->mm = current->mm;
+
+	atomic_inc(&data->mm->mm_count);
+	ret = (int) mm_iommu_newdev(data->mm, data->useraddr,
+			(vma->vm_end - vma->vm_start) >> PAGE_SHIFT,
+			data->gpu_hpa, &data->mem);
+
+	trace_vfio_pci_nvgpu_mmap(vdev->pdev, data->gpu_hpa, data->useraddr,
+			vma->vm_end - vma->vm_start, ret);
+
+	return ret;
+}
+
+static int vfio_pci_nvgpu_add_capability(struct vfio_pci_device *vdev,
+		struct vfio_pci_region *region, struct vfio_info_cap *caps)
+{
+	struct vfio_pci_nvgpu_data *data = region->data;
+	struct vfio_region_info_cap_nvlink2_ssatgt cap = { 0 };
+
+	cap.header.id = VFIO_REGION_INFO_CAP_NVLINK2_SSATGT;
+	cap.header.version = 1;
+	cap.tgt = data->gpu_tgt;
+
+	return vfio_info_add_capability(caps, &cap.header, sizeof(cap));
+}
+
+static const struct vfio_pci_regops vfio_pci_nvgpu_regops = {
+	.rw = vfio_pci_nvgpu_rw,
+	.release = vfio_pci_nvgpu_release,
+	.mmap = vfio_pci_nvgpu_mmap,
+	.add_capability = vfio_pci_nvgpu_add_capability,
+};
+
+static int vfio_pci_nvgpu_group_notifier(struct notifier_block *nb,
+		unsigned long action, void *opaque)
+{
+	struct kvm *kvm = opaque;
+	struct vfio_pci_nvgpu_data *data = container_of(nb,
+			struct vfio_pci_nvgpu_data,
+			group_notifier);
+
+	if (action == VFIO_GROUP_NOTIFY_SET_KVM && kvm &&
+			pnv_npu2_map_lpar_dev(data->gpdev,
+				kvm->arch.lpid, MSR_DR | MSR_PR))
+		return NOTIFY_BAD;
+
+	return NOTIFY_OK;
+}
+
+int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev)
+{
+	int ret;
+	u64 reg[2];
+	u64 tgt = 0;
+	struct device_node *npu_node, *mem_node;
+	struct pci_dev *npu_dev;
+	struct vfio_pci_nvgpu_data *data;
+	uint32_t mem_phandle = 0;
+	unsigned long events = VFIO_GROUP_NOTIFY_SET_KVM;
+
+	/*
+	 * PCI config space does not tell us about NVLink presense but
+	 * platform does, use this.
+	 */
+	npu_dev = pnv_pci_get_npu_dev(vdev->pdev, 0);
+	if (!npu_dev)
+		return -ENODEV;
+
+	npu_node = pci_device_to_OF_node(npu_dev);
+	if (!npu_node)
+		return -EINVAL;
+
+	if (of_property_read_u32(npu_node, "memory-region", &mem_phandle))
+		return -EINVAL;
+
+	mem_node = of_find_node_by_phandle(mem_phandle);
+	if (!mem_node)
+		return -EINVAL;
+
+	if (of_property_read_variable_u64_array(mem_node, "reg", reg,
+				ARRAY_SIZE(reg), ARRAY_SIZE(reg)) !=
+			ARRAY_SIZE(reg))
+		return -EINVAL;
+
+	if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) {
+		dev_warn(&vdev->pdev->dev, "No ibm,device-tgt-addr found\n");
+		return -EFAULT;
+	}
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->gpu_hpa = reg[0];
+	data->gpu_tgt = tgt;
+	data->size = reg[1];
+
+	dev_dbg(&vdev->pdev->dev, "%lx..%lx\n", data->gpu_hpa,
+			data->gpu_hpa + data->size - 1);
+
+	data->gpdev = vdev->pdev;
+	data->group_notifier.notifier_call = vfio_pci_nvgpu_group_notifier;
+
+	ret = vfio_register_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY,
+			&events, &data->group_notifier);
+	if (ret)
+		goto free_exit;
+
+	/*
+	 * We have just set KVM, we do not need the listener anymore.
+	 * Also, keeping it registered means that if more than one GPU is
+	 * assigned, we will get several similar notifiers notifying about
+	 * the same device again which does not help with anything.
+	 */
+	vfio_unregister_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY,
+			&data->group_notifier);
+
+	ret = vfio_pci_register_dev_region(vdev,
+			PCI_VENDOR_ID_NVIDIA | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
+			VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM,
+			&vfio_pci_nvgpu_regops,
+			data->size,
+			VFIO_REGION_INFO_FLAG_READ |
+			VFIO_REGION_INFO_FLAG_WRITE |
+			VFIO_REGION_INFO_FLAG_MMAP,
+			data);
+	if (ret)
+		goto free_exit;
+
+	return 0;
+free_exit:
+	kfree(data);
+
+	return ret;
+}
+
+/*
+ * IBM NPU2 bridge
+ */
+struct vfio_pci_npu2_data {
+	void *base; /* ATSD register virtual address, for emulated access */
+	unsigned long mmio_atsd; /* ATSD physical address */
+	unsigned long gpu_tgt; /* TGT address of corresponding GPU RAM */
+	unsigned int link_speed; /* The link speed from DT's ibm,nvlink-speed */
+};
+
+static size_t vfio_pci_npu2_rw(struct vfio_pci_device *vdev,
+		char __user *buf, size_t count, loff_t *ppos, bool iswrite)
+{
+	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
+	struct vfio_pci_npu2_data *data = vdev->region[i].data;
+	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+
+	if (pos >= vdev->region[i].size)
+		return -EINVAL;
+
+	count = min(count, (size_t)(vdev->region[i].size - pos));
+
+	if (iswrite) {
+		if (copy_from_user(data->base + pos, buf, count))
+			return -EFAULT;
+	} else {
+		if (copy_to_user(buf, data->base + pos, count))
+			return -EFAULT;
+	}
+	*ppos += count;
+
+	return count;
+}
+
+static int vfio_pci_npu2_mmap(struct vfio_pci_device *vdev,
+		struct vfio_pci_region *region, struct vm_area_struct *vma)
+{
+	int ret;
+	struct vfio_pci_npu2_data *data = region->data;
+	unsigned long req_len = vma->vm_end - vma->vm_start;
+
+	if (req_len != PAGE_SIZE)
+		return -EINVAL;
+
+	vma->vm_flags |= VM_PFNMAP;
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+	ret = remap_pfn_range(vma, vma->vm_start, data->mmio_atsd >> PAGE_SHIFT,
+			req_len, vma->vm_page_prot);
+	trace_vfio_pci_npu2_mmap(vdev->pdev, data->mmio_atsd, vma->vm_start,
+			vma->vm_end - vma->vm_start, ret);
+
+	return ret;
+}
+
+static void vfio_pci_npu2_release(struct vfio_pci_device *vdev,
+		struct vfio_pci_region *region)
+{
+	struct vfio_pci_npu2_data *data = region->data;
+
+	memunmap(data->base);
+	kfree(data);
+}
+
+static int vfio_pci_npu2_add_capability(struct vfio_pci_device *vdev,
+		struct vfio_pci_region *region, struct vfio_info_cap *caps)
+{
+	struct vfio_pci_npu2_data *data = region->data;
+	struct vfio_region_info_cap_nvlink2_ssatgt captgt = { 0 };
+	struct vfio_region_info_cap_nvlink2_lnkspd capspd = { 0 };
+	int ret;
+
+	captgt.header.id = VFIO_REGION_INFO_CAP_NVLINK2_SSATGT;
+	captgt.header.version = 1;
+	captgt.tgt = data->gpu_tgt;
+
+	capspd.header.id = VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD;
+	capspd.header.version = 1;
+	capspd.link_speed = data->link_speed;
+
+	ret = vfio_info_add_capability(caps, &captgt.header, sizeof(captgt));
+	if (ret)
+		return ret;
+
+	return vfio_info_add_capability(caps, &capspd.header, sizeof(capspd));
+}
+
+static const struct vfio_pci_regops vfio_pci_npu2_regops = {
+	.rw = vfio_pci_npu2_rw,
+	.mmap = vfio_pci_npu2_mmap,
+	.release = vfio_pci_npu2_release,
+	.add_capability = vfio_pci_npu2_add_capability,
+};
+
+int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev)
+{
+	int ret;
+	struct vfio_pci_npu2_data *data;
+	struct device_node *nvlink_dn;
+	u32 nvlink_index = 0;
+	struct pci_dev *npdev = vdev->pdev;
+	struct device_node *npu_node = pci_device_to_OF_node(npdev);
+	struct pci_controller *hose = pci_bus_to_host(npdev->bus);
+	u64 mmio_atsd = 0;
+	u64 tgt = 0;
+	u32 link_speed = 0xff;
+
+	/*
+	 * PCI config space does not tell us about NVLink presense but
+	 * platform does, use this.
+	 */
+	if (!pnv_pci_get_gpu_dev(vdev->pdev))
+		return -ENODEV;
+
+	/*
+	 * NPU2 normally has 8 ATSD registers (for concurrency) and 6 links
+	 * so we can allocate one register per link, using nvlink index as
+	 * a key.
+	 * There is always at least one ATSD register so as long as at least
+	 * NVLink bridge #0 is passed to the guest, ATSD will be available.
+	 */
+	nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
+	if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
+			&nvlink_index)))
+		return -ENODEV;
+
+	if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", nvlink_index,
+			&mmio_atsd)) {
+		dev_warn(&vdev->pdev->dev, "No available ATSD found\n");
+		mmio_atsd = 0;
+	}
+
+	if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) {
+		dev_warn(&vdev->pdev->dev, "No ibm,device-tgt-addr found\n");
+		return -EFAULT;
+	}
+
+	if (of_property_read_u32(npu_node, "ibm,nvlink-speed", &link_speed)) {
+		dev_warn(&vdev->pdev->dev, "No ibm,nvlink-speed found\n");
+		return -EFAULT;
+	}
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->mmio_atsd = mmio_atsd;
+	data->gpu_tgt = tgt;
+	data->link_speed = link_speed;
+	if (data->mmio_atsd) {
+		data->base = memremap(data->mmio_atsd, SZ_64K, MEMREMAP_WT);
+		if (!data->base) {
+			ret = -ENOMEM;
+			goto free_exit;
+		}
+	}
+
+	/*
+	 * We want to expose the capability even if this specific NVLink
+	 * did not get its own ATSD register because capabilities
+	 * belong to VFIO regions and normally there will be ATSD register
+	 * assigned to the NVLink bridge.
+	 */
+	ret = vfio_pci_register_dev_region(vdev,
+			PCI_VENDOR_ID_IBM |
+			VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
+			VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD,
+			&vfio_pci_npu2_regops,
+			data->mmio_atsd ? PAGE_SIZE : 0,
+			VFIO_REGION_INFO_FLAG_READ |
+			VFIO_REGION_INFO_FLAG_WRITE |
+			VFIO_REGION_INFO_FLAG_MMAP,
+			data);
+	if (ret)
+		goto free_exit;
+
+	return 0;
+
+free_exit:
+	kfree(data);
+
+	return ret;
+}
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index 93c1738dae11..127071b84dd7 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -163,4 +163,18 @@ static inline int vfio_pci_igd_init(struct vfio_pci_device *vdev)
 	return -ENODEV;
 }
 #endif
+#ifdef CONFIG_VFIO_PCI_NVLINK2
+extern int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev);
+extern int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev);
+#else
+static inline int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev)
+{
+	return -ENODEV;
+}
+
+static inline int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev)
+{
+	return -ENODEV;
+}
+#endif
 #endif /* VFIO_PCI_PRIVATE_H */
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 813102810f53..02bb7ad6e986 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -353,6 +353,21 @@ struct vfio_region_gfx_edid {
 #define VFIO_DEVICE_GFX_LINK_STATE_DOWN  2
 };
 
+/*
+ * 10de vendor sub-type
+ *
+ * NVIDIA GPU NVlink2 RAM is coherent RAM mapped onto the host address space.
+ */
+#define VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM	(1)
+
+/*
+ * 1014 vendor sub-type
+ *
+ * IBM NPU NVlink2 ATSD (Address Translation Shootdown) register of NPU
+ * to do TLB invalidation on a GPU.
+ */
+#define VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD	(1)
+
 /*
  * The MSIX mappable capability informs that MSIX data of a BAR can be mmapped
  * which allows direct access to non-MSIX registers which happened to be within
@@ -363,6 +378,33 @@ struct vfio_region_gfx_edid {
  */
 #define VFIO_REGION_INFO_CAP_MSIX_MAPPABLE	3
 
+/*
+ * Capability with compressed real address (aka SSA - small system address)
+ * where GPU RAM is mapped on a system bus. Used by a GPU for DMA routing
+ * and by the userspace to associate a NVLink bridge with a GPU.
+ */
+#define VFIO_REGION_INFO_CAP_NVLINK2_SSATGT	4
+
+struct vfio_region_info_cap_nvlink2_ssatgt {
+	struct vfio_info_cap_header header;
+	__u64 tgt;
+};
+
+/*
+ * Capability with an NVLink link speed. The value is read by
+ * the NVlink2 bridge driver from the bridge's "ibm,nvlink-speed"
+ * property in the device tree. The value is fixed in the hardware
+ * and failing to provide the correct value results in the link
+ * not working with no indication from the driver why.
+ */
+#define VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD	5
+
+struct vfio_region_info_cap_nvlink2_lnkspd {
+	struct vfio_info_cap_header header;
+	__u32 link_speed;
+	__u32 __pad;
+};
+
 /**
  * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
  *				    struct vfio_irq_info)
-- 
cgit v1.2.3-59-g8ed1b


From 077b930adafead095cd38600539ec129f1379d8c Mon Sep 17 00:00:00 2001
From: "Dmitry V. Levin" <ldv@altlinux.org>
Date: Thu, 13 Dec 2018 20:22:00 +0300
Subject: elf-em.h: add EM_CSKY

The uapi/linux/audit.h header is going to use EM_CSKY in order
to define AUDIT_ARCH_CSKY which is needed to implement
syscall_get_arch() which in turn is required to extend
the generic ptrace API with PTRACE_GET_SYSCALL_INFO request.

The value for EM_CSKY has been taken from arch/csky/include/asm/elf.h
and confirmed by binutils:include/elf/common.h

Cc: Guo Ren <guoren@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Elvira Khabirova <lineprinter@altlinux.org>
Cc: Eugene Syromyatnikov <esyr@redhat.com>
Signed-off-by: Dmitry V. Levin <ldv@altlinux.org>
Signed-off-by: Guo Ren <guoren@kernel.org>
---
 arch/csky/include/asm/elf.h | 2 +-
 include/uapi/linux/elf-em.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/arch/csky/include/asm/elf.h b/arch/csky/include/asm/elf.h
index 773b133ca297..48b5366568ab 100644
--- a/arch/csky/include/asm/elf.h
+++ b/arch/csky/include/asm/elf.h
@@ -7,7 +7,7 @@
 #include <asm/ptrace.h>
 #include <abi/regdef.h>
 
-#define ELF_ARCH 252
+#define ELF_ARCH EM_CSKY
 
 /* CSKY Relocations */
 #define R_CSKY_NONE               0
diff --git a/include/uapi/linux/elf-em.h b/include/uapi/linux/elf-em.h
index 93722e60204c..d9544b8a7096 100644
--- a/include/uapi/linux/elf-em.h
+++ b/include/uapi/linux/elf-em.h
@@ -43,6 +43,7 @@
 #define EM_TILEGX	191	/* Tilera TILE-Gx */
 #define EM_RISCV	243	/* RISC-V */
 #define EM_BPF		247	/* Linux BPF - in-kernel virtual machine */
+#define EM_CSKY		252	/* C-SKY */
 #define EM_FRV		0x5441	/* Fujitsu FR-V */
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From d770b25653447b4c57303859e2ac04ebe9318f8e Mon Sep 17 00:00:00 2001
From: "Dmitry V. Levin" <ldv@altlinux.org>
Date: Thu, 13 Dec 2018 20:22:07 +0300
Subject: csky: define syscall_get_arch()

syscall_get_arch() is required to be implemented on all architectures
in order to extend the generic ptrace API with PTRACE_GET_SYSCALL_INFO
request.

Cc: Guo Ren <guoren@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Eric Paris <eparis@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Elvira Khabirova <lineprinter@altlinux.org>
Cc: Eugene Syromyatnikov <esyr@redhat.com>
Cc: linux-audit@redhat.com
Signed-off-by: Dmitry V. Levin <ldv@altlinux.org>
Signed-off-by: Guo Ren <guoren@kernel.org>

 arch/csky/include/asm/syscall.h | 7 +++++++
 include/uapi/linux/audit.h      | 1 +
 2 files changed, 8 insertions(+)
---
 arch/csky/include/asm/syscall.h | 7 +++++++
 include/uapi/linux/audit.h      | 1 +
 2 files changed, 8 insertions(+)

(limited to 'include/uapi')

diff --git a/arch/csky/include/asm/syscall.h b/arch/csky/include/asm/syscall.h
index 926a64a8b4ee..d637445737b7 100644
--- a/arch/csky/include/asm/syscall.h
+++ b/arch/csky/include/asm/syscall.h
@@ -6,6 +6,7 @@
 #include <linux/sched.h>
 #include <linux/err.h>
 #include <abi/regdef.h>
+#include <uapi/linux/audit.h>
 
 static inline int
 syscall_get_nr(struct task_struct *task, struct pt_regs *regs)
@@ -68,4 +69,10 @@ syscall_set_arguments(struct task_struct *task, struct pt_regs *regs,
 	memcpy(&regs->a1 + i * sizeof(regs->a1), args, n * sizeof(regs->a0));
 }
 
+static inline int
+syscall_get_arch(void)
+{
+	return AUDIT_ARCH_CSKY;
+}
+
 #endif	/* __ASM_SYSCALL_H */
diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index 818ae690ab79..f91729232f46 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -378,6 +378,7 @@ enum {
 #define AUDIT_ARCH_ARM		(EM_ARM|__AUDIT_ARCH_LE)
 #define AUDIT_ARCH_ARMEB	(EM_ARM)
 #define AUDIT_ARCH_CRIS		(EM_CRIS|__AUDIT_ARCH_LE)
+#define AUDIT_ARCH_CSKY		(EM_CSKY|__AUDIT_ARCH_LE)
 #define AUDIT_ARCH_FRV		(EM_FRV)
 #define AUDIT_ARCH_I386		(EM_386|__AUDIT_ARCH_LE)
 #define AUDIT_ARCH_IA64		(EM_IA_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
-- 
cgit v1.2.3-59-g8ed1b


From c10b13325ced237f6129e8ee73cd8c72e1bd10ed Mon Sep 17 00:00:00 2001
From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Date: Tue, 18 Dec 2018 20:32:37 +0530
Subject: tty: serial: Add RDA8810PL UART driver
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add UART driver for RDA Micro RDA8810PL SoC.

Signed-off-by: Andreas Färber <afaerber@suse.de>
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Olof Johansson <olof@lixom.net>
---
 Documentation/admin-guide/kernel-parameters.txt |   6 +
 drivers/tty/serial/Kconfig                      |  19 +
 drivers/tty/serial/Makefile                     |   1 +
 drivers/tty/serial/rda-uart.c                   | 831 ++++++++++++++++++++++++
 include/uapi/linux/serial_core.h                |   3 +
 5 files changed, 860 insertions(+)
 create mode 100644 drivers/tty/serial/rda-uart.c

(limited to 'include/uapi')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index aefd358a5ca3..98ee9eaa52be 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1021,6 +1021,12 @@
 			specified address. The serial port must already be
 			setup and configured. Options are not yet supported.
 
+		rda,<addr>
+			Start an early, polled-mode console on a serial port
+			of an RDA Micro SoC, such as RDA8810PL, at the
+			specified address. The serial port must already be
+			setup and configured. Options are not yet supported.
+
 		smh	Use ARM semihosting calls for early console.
 
 		s3c2410,<addr>
diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig
index 32886c304641..67b9bf3b500e 100644
--- a/drivers/tty/serial/Kconfig
+++ b/drivers/tty/serial/Kconfig
@@ -1529,6 +1529,25 @@ config SERIAL_OWL_CONSOLE
 	  Say 'Y' here if you wish to use Actions Semiconductor S500/S900 UART
 	  as the system console.
 
+config SERIAL_RDA
+	bool "RDA Micro serial port support"
+	depends on ARCH_RDA || COMPILE_TEST
+	select SERIAL_CORE
+	help
+	  This driver is for RDA8810PL SoC's UART.
+	  Say 'Y' here if you wish to use the on-board serial port.
+	  Otherwise, say 'N'.
+
+config SERIAL_RDA_CONSOLE
+	bool "Console on RDA Micro serial port"
+	depends on SERIAL_RDA=y
+	select SERIAL_CORE_CONSOLE
+	select SERIAL_EARLYCON
+	default y
+	help
+	  Say 'Y' here if you wish to use the RDA8810PL UART as the system
+	  console. Only earlycon is implemented currently.
+
 endmenu
 
 config SERIAL_MCTRL_GPIO
diff --git a/drivers/tty/serial/Makefile b/drivers/tty/serial/Makefile
index daac675612df..8c303736b7e8 100644
--- a/drivers/tty/serial/Makefile
+++ b/drivers/tty/serial/Makefile
@@ -89,6 +89,7 @@ obj-$(CONFIG_SERIAL_MVEBU_UART)	+= mvebu-uart.o
 obj-$(CONFIG_SERIAL_PIC32)	+= pic32_uart.o
 obj-$(CONFIG_SERIAL_MPS2_UART)	+= mps2-uart.o
 obj-$(CONFIG_SERIAL_OWL)	+= owl-uart.o
+obj-$(CONFIG_SERIAL_RDA)	+= rda-uart.o
 
 # GPIOLIB helpers for modem control lines
 obj-$(CONFIG_SERIAL_MCTRL_GPIO)	+= serial_mctrl_gpio.o
diff --git a/drivers/tty/serial/rda-uart.c b/drivers/tty/serial/rda-uart.c
new file mode 100644
index 000000000000..284623eefaeb
--- /dev/null
+++ b/drivers/tty/serial/rda-uart.c
@@ -0,0 +1,831 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * RDA8810PL serial device driver
+ *
+ * Copyright RDA Microelectronics Company Limited
+ * Copyright (c) 2017 Andreas Färber
+ * Copyright (c) 2018 Manivannan Sadhasivam
+ */
+
+#include <linux/clk.h>
+#include <linux/console.h>
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/serial.h>
+#include <linux/serial_core.h>
+#include <linux/tty.h>
+#include <linux/tty_flip.h>
+
+#define RDA_UART_PORT_NUM 3
+#define RDA_UART_DEV_NAME "ttyRDA"
+
+#define RDA_UART_CTRL		0x00
+#define RDA_UART_STATUS		0x04
+#define RDA_UART_RXTX_BUFFER	0x08
+#define RDA_UART_IRQ_MASK	0x0c
+#define RDA_UART_IRQ_CAUSE	0x10
+#define RDA_UART_IRQ_TRIGGERS	0x14
+#define RDA_UART_CMD_SET	0x18
+#define RDA_UART_CMD_CLR	0x1c
+
+/* UART_CTRL Bits */
+#define RDA_UART_ENABLE			BIT(0)
+#define RDA_UART_DBITS_8		BIT(1)
+#define RDA_UART_TX_SBITS_2		BIT(2)
+#define RDA_UART_PARITY_EN		BIT(3)
+#define RDA_UART_PARITY(x)		(((x) & 0x3) << 4)
+#define RDA_UART_PARITY_ODD		RDA_UART_PARITY(0)
+#define RDA_UART_PARITY_EVEN		RDA_UART_PARITY(1)
+#define RDA_UART_PARITY_SPACE		RDA_UART_PARITY(2)
+#define RDA_UART_PARITY_MARK		RDA_UART_PARITY(3)
+#define RDA_UART_DIV_MODE		BIT(20)
+#define RDA_UART_IRDA_EN		BIT(21)
+#define RDA_UART_DMA_EN			BIT(22)
+#define RDA_UART_FLOW_CNT_EN		BIT(23)
+#define RDA_UART_LOOP_BACK_EN		BIT(24)
+#define RDA_UART_RX_LOCK_ERR		BIT(25)
+#define RDA_UART_RX_BREAK_LEN(x)	(((x) & 0xf) << 28)
+
+/* UART_STATUS Bits */
+#define RDA_UART_RX_FIFO(x)		(((x) & 0x7f) << 0)
+#define RDA_UART_RX_FIFO_MASK		(0x7f << 0)
+#define RDA_UART_TX_FIFO(x)		(((x) & 0x1f) << 8)
+#define RDA_UART_TX_FIFO_MASK		(0x1f << 8)
+#define RDA_UART_TX_ACTIVE		BIT(14)
+#define RDA_UART_RX_ACTIVE		BIT(15)
+#define RDA_UART_RX_OVERFLOW_ERR	BIT(16)
+#define RDA_UART_TX_OVERFLOW_ERR	BIT(17)
+#define RDA_UART_RX_PARITY_ERR		BIT(18)
+#define RDA_UART_RX_FRAMING_ERR		BIT(19)
+#define RDA_UART_RX_BREAK_INT		BIT(20)
+#define RDA_UART_DCTS			BIT(24)
+#define RDA_UART_CTS			BIT(25)
+#define RDA_UART_DTR			BIT(28)
+#define RDA_UART_CLK_ENABLED		BIT(31)
+
+/* UART_RXTX_BUFFER Bits */
+#define RDA_UART_RX_DATA(x)		(((x) & 0xff) << 0)
+#define RDA_UART_TX_DATA(x)		(((x) & 0xff) << 0)
+
+/* UART_IRQ_MASK Bits */
+#define RDA_UART_TX_MODEM_STATUS	BIT(0)
+#define RDA_UART_RX_DATA_AVAILABLE	BIT(1)
+#define RDA_UART_TX_DATA_NEEDED		BIT(2)
+#define RDA_UART_RX_TIMEOUT		BIT(3)
+#define RDA_UART_RX_LINE_ERR		BIT(4)
+#define RDA_UART_TX_DMA_DONE		BIT(5)
+#define RDA_UART_RX_DMA_DONE		BIT(6)
+#define RDA_UART_RX_DMA_TIMEOUT		BIT(7)
+#define RDA_UART_DTR_RISE		BIT(8)
+#define RDA_UART_DTR_FALL		BIT(9)
+
+/* UART_IRQ_CAUSE Bits */
+#define RDA_UART_TX_MODEM_STATUS_U	BIT(16)
+#define RDA_UART_RX_DATA_AVAILABLE_U	BIT(17)
+#define RDA_UART_TX_DATA_NEEDED_U	BIT(18)
+#define RDA_UART_RX_TIMEOUT_U		BIT(19)
+#define RDA_UART_RX_LINE_ERR_U		BIT(20)
+#define RDA_UART_TX_DMA_DONE_U		BIT(21)
+#define RDA_UART_RX_DMA_DONE_U		BIT(22)
+#define RDA_UART_RX_DMA_TIMEOUT_U	BIT(23)
+#define RDA_UART_DTR_RISE_U		BIT(24)
+#define RDA_UART_DTR_FALL_U		BIT(25)
+
+/* UART_TRIGGERS Bits */
+#define RDA_UART_RX_TRIGGER(x)		(((x) & 0x1f) << 0)
+#define RDA_UART_TX_TRIGGER(x)		(((x) & 0xf) << 8)
+#define RDA_UART_AFC_LEVEL(x)		(((x) & 0x1f) << 16)
+
+/* UART_CMD_SET Bits */
+#define RDA_UART_RI			BIT(0)
+#define RDA_UART_DCD			BIT(1)
+#define RDA_UART_DSR			BIT(2)
+#define RDA_UART_TX_BREAK_CONTROL	BIT(3)
+#define RDA_UART_TX_FINISH_N_WAIT	BIT(4)
+#define RDA_UART_RTS			BIT(5)
+#define RDA_UART_RX_FIFO_RESET		BIT(6)
+#define RDA_UART_TX_FIFO_RESET		BIT(7)
+
+#define RDA_UART_TX_FIFO_SIZE	16
+
+static struct uart_driver rda_uart_driver;
+
+struct rda_uart_port {
+	struct uart_port port;
+	struct clk *clk;
+};
+
+#define to_rda_uart_port(port) container_of(port, struct rda_uart_port, port)
+
+static struct rda_uart_port *rda_uart_ports[RDA_UART_PORT_NUM];
+
+static inline void rda_uart_write(struct uart_port *port, u32 val,
+				  unsigned int off)
+{
+	writel(val, port->membase + off);
+}
+
+static inline u32 rda_uart_read(struct uart_port *port, unsigned int off)
+{
+	return readl(port->membase + off);
+}
+
+static unsigned int rda_uart_tx_empty(struct uart_port *port)
+{
+	unsigned long flags;
+	unsigned int ret;
+	u32 val;
+
+	spin_lock_irqsave(&port->lock, flags);
+
+	val = rda_uart_read(port, RDA_UART_STATUS);
+	ret = (val & RDA_UART_TX_FIFO_MASK) ? TIOCSER_TEMT : 0;
+
+	spin_unlock_irqrestore(&port->lock, flags);
+
+	return ret;
+}
+
+static unsigned int rda_uart_get_mctrl(struct uart_port *port)
+{
+	unsigned int mctrl = 0;
+	u32 cmd_set, status;
+
+	cmd_set = rda_uart_read(port, RDA_UART_CMD_SET);
+	status = rda_uart_read(port, RDA_UART_STATUS);
+	if (cmd_set & RDA_UART_RTS)
+		mctrl |= TIOCM_RTS;
+	if (!(status & RDA_UART_CTS))
+		mctrl |= TIOCM_CTS;
+
+	return mctrl;
+}
+
+static void rda_uart_set_mctrl(struct uart_port *port, unsigned int mctrl)
+{
+	u32 val;
+
+	if (mctrl & TIOCM_RTS) {
+		val = rda_uart_read(port, RDA_UART_CMD_SET);
+		rda_uart_write(port, (val | RDA_UART_RTS), RDA_UART_CMD_SET);
+	} else {
+		/* Clear RTS to stop to receive. */
+		val = rda_uart_read(port, RDA_UART_CMD_CLR);
+		rda_uart_write(port, (val | RDA_UART_RTS), RDA_UART_CMD_CLR);
+	}
+
+	val = rda_uart_read(port, RDA_UART_CTRL);
+
+	if (mctrl & TIOCM_LOOP)
+		val |= RDA_UART_LOOP_BACK_EN;
+	else
+		val &= ~RDA_UART_LOOP_BACK_EN;
+
+	rda_uart_write(port, val, RDA_UART_CTRL);
+}
+
+static void rda_uart_stop_tx(struct uart_port *port)
+{
+	u32 val;
+
+	val = rda_uart_read(port, RDA_UART_IRQ_MASK);
+	val &= ~RDA_UART_TX_DATA_NEEDED;
+	rda_uart_write(port, val, RDA_UART_IRQ_MASK);
+
+	val = rda_uart_read(port, RDA_UART_CMD_SET);
+	val |= RDA_UART_TX_FIFO_RESET;
+	rda_uart_write(port, val, RDA_UART_CMD_SET);
+}
+
+static void rda_uart_stop_rx(struct uart_port *port)
+{
+	u32 val;
+
+	val = rda_uart_read(port, RDA_UART_IRQ_MASK);
+	val &= ~(RDA_UART_RX_DATA_AVAILABLE | RDA_UART_RX_TIMEOUT);
+	rda_uart_write(port, val, RDA_UART_IRQ_MASK);
+
+	/* Read Rx buffer before reset to avoid Rx timeout interrupt */
+	val = rda_uart_read(port, RDA_UART_RXTX_BUFFER);
+
+	val = rda_uart_read(port, RDA_UART_CMD_SET);
+	val |= RDA_UART_RX_FIFO_RESET;
+	rda_uart_write(port, val, RDA_UART_CMD_SET);
+}
+
+static void rda_uart_start_tx(struct uart_port *port)
+{
+	u32 val;
+
+	if (uart_tx_stopped(port)) {
+		rda_uart_stop_tx(port);
+		return;
+	}
+
+	val = rda_uart_read(port, RDA_UART_IRQ_MASK);
+	val |= RDA_UART_TX_DATA_NEEDED;
+	rda_uart_write(port, val, RDA_UART_IRQ_MASK);
+}
+
+static void rda_uart_change_baudrate(struct rda_uart_port *rda_port,
+				     unsigned long baud)
+{
+	clk_set_rate(rda_port->clk, baud * 8);
+}
+
+static void rda_uart_set_termios(struct uart_port *port,
+				 struct ktermios *termios,
+				 struct ktermios *old)
+{
+	struct rda_uart_port *rda_port = to_rda_uart_port(port);
+	unsigned long flags;
+	unsigned int ctrl, cmd_set, cmd_clr, triggers;
+	unsigned int baud;
+	u32 irq_mask;
+
+	spin_lock_irqsave(&port->lock, flags);
+
+	baud = uart_get_baud_rate(port, termios, old, 9600, port->uartclk / 4);
+	rda_uart_change_baudrate(rda_port, baud);
+
+	ctrl = rda_uart_read(port, RDA_UART_CTRL);
+	cmd_set = rda_uart_read(port, RDA_UART_CMD_SET);
+	cmd_clr = rda_uart_read(port, RDA_UART_CMD_CLR);
+
+	switch (termios->c_cflag & CSIZE) {
+	case CS5:
+	case CS6:
+		dev_warn(port->dev, "bit size not supported, using 7 bits\n");
+		/* Fall through */
+	case CS7:
+		ctrl &= ~RDA_UART_DBITS_8;
+		break;
+	default:
+		ctrl |= RDA_UART_DBITS_8;
+		break;
+	}
+
+	/* stop bits */
+	if (termios->c_cflag & CSTOPB)
+		ctrl |= RDA_UART_TX_SBITS_2;
+	else
+		ctrl &= ~RDA_UART_TX_SBITS_2;
+
+	/* parity check */
+	if (termios->c_cflag & PARENB) {
+		ctrl |= RDA_UART_PARITY_EN;
+
+		/* Mark or Space parity */
+		if (termios->c_cflag & CMSPAR) {
+			if (termios->c_cflag & PARODD)
+				ctrl |= RDA_UART_PARITY_MARK;
+			else
+				ctrl |= RDA_UART_PARITY_SPACE;
+		} else if (termios->c_cflag & PARODD) {
+			ctrl |= RDA_UART_PARITY_ODD;
+		} else {
+			ctrl |= RDA_UART_PARITY_EVEN;
+		}
+	} else {
+		ctrl &= ~RDA_UART_PARITY_EN;
+	}
+
+	/* Hardware handshake (RTS/CTS) */
+	if (termios->c_cflag & CRTSCTS) {
+		ctrl   |= RDA_UART_FLOW_CNT_EN;
+		cmd_set |= RDA_UART_RTS;
+	} else {
+		ctrl   &= ~RDA_UART_FLOW_CNT_EN;
+		cmd_clr |= RDA_UART_RTS;
+	}
+
+	ctrl |= RDA_UART_ENABLE;
+	ctrl &= ~RDA_UART_DMA_EN;
+
+	triggers  = (RDA_UART_AFC_LEVEL(20) | RDA_UART_RX_TRIGGER(16));
+	irq_mask = rda_uart_read(port, RDA_UART_IRQ_MASK);
+	rda_uart_write(port, 0, RDA_UART_IRQ_MASK);
+
+	rda_uart_write(port, triggers, RDA_UART_IRQ_TRIGGERS);
+	rda_uart_write(port, ctrl, RDA_UART_CTRL);
+	rda_uart_write(port, cmd_set, RDA_UART_CMD_SET);
+	rda_uart_write(port, cmd_clr, RDA_UART_CMD_CLR);
+
+	rda_uart_write(port, irq_mask, RDA_UART_IRQ_MASK);
+
+	/* Don't rewrite B0 */
+	if (tty_termios_baud_rate(termios))
+		tty_termios_encode_baud_rate(termios, baud, baud);
+
+	/* update the per-port timeout */
+	uart_update_timeout(port, termios->c_cflag, baud);
+
+	spin_unlock_irqrestore(&port->lock, flags);
+}
+
+static void rda_uart_send_chars(struct uart_port *port)
+{
+	struct circ_buf *xmit = &port->state->xmit;
+	unsigned int ch;
+	u32 val;
+
+	if (uart_tx_stopped(port))
+		return;
+
+	if (port->x_char) {
+		while (!(rda_uart_read(port, RDA_UART_STATUS) &
+			 RDA_UART_TX_FIFO_MASK))
+			cpu_relax();
+
+		rda_uart_write(port, port->x_char, RDA_UART_RXTX_BUFFER);
+		port->icount.tx++;
+		port->x_char = 0;
+	}
+
+	while (rda_uart_read(port, RDA_UART_STATUS) & RDA_UART_TX_FIFO_MASK) {
+		if (uart_circ_empty(xmit))
+			break;
+
+		ch = xmit->buf[xmit->tail];
+		rda_uart_write(port, ch, RDA_UART_RXTX_BUFFER);
+		xmit->tail = (xmit->tail + 1) & (SERIAL_XMIT_SIZE - 1);
+		port->icount.tx++;
+	}
+
+	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+		uart_write_wakeup(port);
+
+	if (!uart_circ_empty(xmit)) {
+		/* Re-enable Tx FIFO interrupt */
+		val = rda_uart_read(port, RDA_UART_IRQ_MASK);
+		val |= RDA_UART_TX_DATA_NEEDED;
+		rda_uart_write(port, val, RDA_UART_IRQ_MASK);
+	}
+}
+
+static void rda_uart_receive_chars(struct uart_port *port)
+{
+	u32 status, val;
+
+	status = rda_uart_read(port, RDA_UART_STATUS);
+	while ((status & RDA_UART_RX_FIFO_MASK)) {
+		char flag = TTY_NORMAL;
+
+		if (status & RDA_UART_RX_PARITY_ERR) {
+			port->icount.parity++;
+			flag = TTY_PARITY;
+		}
+
+		if (status & RDA_UART_RX_FRAMING_ERR) {
+			port->icount.frame++;
+			flag = TTY_FRAME;
+		}
+
+		if (status & RDA_UART_RX_OVERFLOW_ERR) {
+			port->icount.overrun++;
+			flag = TTY_OVERRUN;
+		}
+
+		val = rda_uart_read(port, RDA_UART_RXTX_BUFFER);
+		val &= 0xff;
+
+		port->icount.rx++;
+		tty_insert_flip_char(&port->state->port, val, flag);
+
+		status = rda_uart_read(port, RDA_UART_STATUS);
+	}
+
+	spin_unlock(&port->lock);
+	tty_flip_buffer_push(&port->state->port);
+	spin_lock(&port->lock);
+}
+
+static irqreturn_t rda_interrupt(int irq, void *dev_id)
+{
+	struct uart_port *port = dev_id;
+	unsigned long flags;
+	u32 val, irq_mask;
+
+	spin_lock_irqsave(&port->lock, flags);
+
+	/* Clear IRQ cause */
+	val = rda_uart_read(port, RDA_UART_IRQ_CAUSE);
+	rda_uart_write(port, val, RDA_UART_IRQ_CAUSE);
+
+	if (val & (RDA_UART_RX_DATA_AVAILABLE | RDA_UART_RX_TIMEOUT))
+		rda_uart_receive_chars(port);
+
+	if (val & (RDA_UART_TX_DATA_NEEDED)) {
+		irq_mask = rda_uart_read(port, RDA_UART_IRQ_MASK);
+		irq_mask &= ~RDA_UART_TX_DATA_NEEDED;
+		rda_uart_write(port, irq_mask, RDA_UART_IRQ_MASK);
+
+		rda_uart_send_chars(port);
+	}
+
+	spin_unlock_irqrestore(&port->lock, flags);
+
+	return IRQ_HANDLED;
+}
+
+static int rda_uart_startup(struct uart_port *port)
+{
+	unsigned long flags;
+	int ret;
+	u32 val;
+
+	spin_lock_irqsave(&port->lock, flags);
+	rda_uart_write(port, 0, RDA_UART_IRQ_MASK);
+	spin_unlock_irqrestore(&port->lock, flags);
+
+	ret = request_irq(port->irq, rda_interrupt, IRQF_NO_SUSPEND,
+			  "rda-uart", port);
+	if (ret)
+		return ret;
+
+	spin_lock_irqsave(&port->lock, flags);
+
+	val = rda_uart_read(port, RDA_UART_CTRL);
+	val |= RDA_UART_ENABLE;
+	rda_uart_write(port, val, RDA_UART_CTRL);
+
+	/* enable rx interrupt */
+	val = rda_uart_read(port, RDA_UART_IRQ_MASK);
+	val |= (RDA_UART_RX_DATA_AVAILABLE | RDA_UART_RX_TIMEOUT);
+	rda_uart_write(port, val, RDA_UART_IRQ_MASK);
+
+	spin_unlock_irqrestore(&port->lock, flags);
+
+	return 0;
+}
+
+static void rda_uart_shutdown(struct uart_port *port)
+{
+	unsigned long flags;
+	u32 val;
+
+	spin_lock_irqsave(&port->lock, flags);
+
+	rda_uart_stop_tx(port);
+	rda_uart_stop_rx(port);
+
+	val = rda_uart_read(port, RDA_UART_CTRL);
+	val &= ~RDA_UART_ENABLE;
+	rda_uart_write(port, val, RDA_UART_CTRL);
+
+	spin_unlock_irqrestore(&port->lock, flags);
+}
+
+static const char *rda_uart_type(struct uart_port *port)
+{
+	return (port->type == PORT_RDA) ? "rda-uart" : NULL;
+}
+
+static int rda_uart_request_port(struct uart_port *port)
+{
+	struct platform_device *pdev = to_platform_device(port->dev);
+	struct resource *res;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res)
+		return -ENXIO;
+
+	if (!devm_request_mem_region(port->dev, port->mapbase,
+				     resource_size(res), dev_name(port->dev)))
+		return -EBUSY;
+
+	if (port->flags & UPF_IOREMAP) {
+		port->membase = devm_ioremap_nocache(port->dev, port->mapbase,
+						     resource_size(res));
+		if (!port->membase)
+			return -EBUSY;
+	}
+
+	return 0;
+}
+
+static void rda_uart_config_port(struct uart_port *port, int flags)
+{
+	unsigned long irq_flags;
+
+	if (flags & UART_CONFIG_TYPE) {
+		port->type = PORT_RDA;
+		rda_uart_request_port(port);
+	}
+
+	spin_lock_irqsave(&port->lock, irq_flags);
+
+	/* Clear mask, so no surprise interrupts. */
+	rda_uart_write(port, 0, RDA_UART_IRQ_MASK);
+
+	/* Clear status register */
+	rda_uart_write(port, 0, RDA_UART_STATUS);
+
+	spin_unlock_irqrestore(&port->lock, irq_flags);
+}
+
+static void rda_uart_release_port(struct uart_port *port)
+{
+	struct platform_device *pdev = to_platform_device(port->dev);
+	struct resource *res;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res)
+		return;
+
+	if (port->flags & UPF_IOREMAP) {
+		devm_release_mem_region(port->dev, port->mapbase,
+					resource_size(res));
+		devm_iounmap(port->dev, port->membase);
+		port->membase = NULL;
+	}
+}
+
+static int rda_uart_verify_port(struct uart_port *port,
+				struct serial_struct *ser)
+{
+	if (port->type != PORT_RDA)
+		return -EINVAL;
+
+	if (port->irq != ser->irq)
+		return -EINVAL;
+
+	return 0;
+}
+
+static const struct uart_ops rda_uart_ops = {
+	.tx_empty       = rda_uart_tx_empty,
+	.get_mctrl      = rda_uart_get_mctrl,
+	.set_mctrl      = rda_uart_set_mctrl,
+	.start_tx       = rda_uart_start_tx,
+	.stop_tx        = rda_uart_stop_tx,
+	.stop_rx        = rda_uart_stop_rx,
+	.startup        = rda_uart_startup,
+	.shutdown       = rda_uart_shutdown,
+	.set_termios    = rda_uart_set_termios,
+	.type           = rda_uart_type,
+	.request_port	= rda_uart_request_port,
+	.release_port	= rda_uart_release_port,
+	.config_port	= rda_uart_config_port,
+	.verify_port	= rda_uart_verify_port,
+};
+
+#ifdef CONFIG_SERIAL_RDA_CONSOLE
+
+static void rda_console_putchar(struct uart_port *port, int ch)
+{
+	if (!port->membase)
+		return;
+
+	while (!(rda_uart_read(port, RDA_UART_STATUS) & RDA_UART_TX_FIFO_MASK))
+		cpu_relax();
+
+	rda_uart_write(port, ch, RDA_UART_RXTX_BUFFER);
+}
+
+static void rda_uart_port_write(struct uart_port *port, const char *s,
+				u_int count)
+{
+	u32 old_irq_mask;
+	unsigned long flags;
+	int locked;
+
+	local_irq_save(flags);
+
+	if (port->sysrq) {
+		locked = 0;
+	} else if (oops_in_progress) {
+		locked = spin_trylock(&port->lock);
+	} else {
+		spin_lock(&port->lock);
+		locked = 1;
+	}
+
+	old_irq_mask = rda_uart_read(port, RDA_UART_IRQ_MASK);
+	rda_uart_write(port, 0, RDA_UART_IRQ_MASK);
+
+	uart_console_write(port, s, count, rda_console_putchar);
+
+	/* wait until all contents have been sent out */
+	while (!(rda_uart_read(port, RDA_UART_STATUS) & RDA_UART_TX_FIFO_MASK))
+		cpu_relax();
+
+	rda_uart_write(port, old_irq_mask, RDA_UART_IRQ_MASK);
+
+	if (locked)
+		spin_unlock(&port->lock);
+
+	local_irq_restore(flags);
+}
+
+static void rda_uart_console_write(struct console *co, const char *s,
+				   u_int count)
+{
+	struct rda_uart_port *rda_port;
+
+	rda_port = rda_uart_ports[co->index];
+	if (!rda_port)
+		return;
+
+	rda_uart_port_write(&rda_port->port, s, count);
+}
+
+static int rda_uart_console_setup(struct console *co, char *options)
+{
+	struct rda_uart_port *rda_port;
+	int baud = 921600;
+	int bits = 8;
+	int parity = 'n';
+	int flow = 'n';
+
+	if (co->index < 0 || co->index >= RDA_UART_PORT_NUM)
+		return -EINVAL;
+
+	rda_port = rda_uart_ports[co->index];
+	if (!rda_port || !rda_port->port.membase)
+		return -ENODEV;
+
+	if (options)
+		uart_parse_options(options, &baud, &parity, &bits, &flow);
+
+	return uart_set_options(&rda_port->port, co, baud, parity, bits, flow);
+}
+
+static struct console rda_uart_console = {
+	.name = RDA_UART_DEV_NAME,
+	.write = rda_uart_console_write,
+	.device = uart_console_device,
+	.setup = rda_uart_console_setup,
+	.flags = CON_PRINTBUFFER,
+	.index = -1,
+	.data = &rda_uart_driver,
+};
+
+static int __init rda_uart_console_init(void)
+{
+	register_console(&rda_uart_console);
+
+	return 0;
+}
+console_initcall(rda_uart_console_init);
+
+static void rda_uart_early_console_write(struct console *co,
+					 const char *s,
+					 u_int count)
+{
+	struct earlycon_device *dev = co->data;
+
+	rda_uart_port_write(&dev->port, s, count);
+}
+
+static int __init
+rda_uart_early_console_setup(struct earlycon_device *device, const char *opt)
+{
+	if (!device->port.membase)
+		return -ENODEV;
+
+	device->con->write = rda_uart_early_console_write;
+
+	return 0;
+}
+
+OF_EARLYCON_DECLARE(rda, "rda,8810pl-uart",
+		    rda_uart_early_console_setup);
+
+#define RDA_UART_CONSOLE (&rda_uart_console)
+#else
+#define RDA_UART_CONSOLE NULL
+#endif /* CONFIG_SERIAL_RDA_CONSOLE */
+
+static struct uart_driver rda_uart_driver = {
+	.owner = THIS_MODULE,
+	.driver_name = "rda-uart",
+	.dev_name = RDA_UART_DEV_NAME,
+	.nr = RDA_UART_PORT_NUM,
+	.cons = RDA_UART_CONSOLE,
+};
+
+static const struct of_device_id rda_uart_dt_matches[] = {
+	{ .compatible = "rda,8810pl-uart" },
+	{ }
+};
+MODULE_DEVICE_TABLE(of, rda_uart_dt_matches);
+
+static int rda_uart_probe(struct platform_device *pdev)
+{
+	struct resource *res_mem;
+	struct rda_uart_port *rda_port;
+	int ret, irq;
+
+	if (pdev->dev.of_node)
+		pdev->id = of_alias_get_id(pdev->dev.of_node, "serial");
+
+	if (pdev->id < 0 || pdev->id >= RDA_UART_PORT_NUM) {
+		dev_err(&pdev->dev, "id %d out of range\n", pdev->id);
+		return -EINVAL;
+	}
+
+	res_mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res_mem) {
+		dev_err(&pdev->dev, "could not get mem\n");
+		return -ENODEV;
+	}
+
+	irq = platform_get_irq(pdev, 0);
+	if (irq < 0) {
+		dev_err(&pdev->dev, "could not get irq\n");
+		return irq;
+	}
+
+	if (rda_uart_ports[pdev->id]) {
+		dev_err(&pdev->dev, "port %d already allocated\n", pdev->id);
+		return -EBUSY;
+	}
+
+	rda_port = devm_kzalloc(&pdev->dev, sizeof(*rda_port), GFP_KERNEL);
+	if (!rda_port)
+		return -ENOMEM;
+
+	rda_port->clk = devm_clk_get(&pdev->dev, NULL);
+	if (IS_ERR(rda_port->clk)) {
+		dev_err(&pdev->dev, "could not get clk\n");
+		return PTR_ERR(rda_port->clk);
+	}
+
+	rda_port->port.dev = &pdev->dev;
+	rda_port->port.regshift = 0;
+	rda_port->port.line = pdev->id;
+	rda_port->port.type = PORT_RDA;
+	rda_port->port.iotype = UPIO_MEM;
+	rda_port->port.mapbase = res_mem->start;
+	rda_port->port.irq = irq;
+	rda_port->port.uartclk = clk_get_rate(rda_port->clk);
+	if (rda_port->port.uartclk == 0) {
+		dev_err(&pdev->dev, "clock rate is zero\n");
+		return -EINVAL;
+	}
+	rda_port->port.flags = UPF_BOOT_AUTOCONF | UPF_IOREMAP |
+			       UPF_LOW_LATENCY;
+	rda_port->port.x_char = 0;
+	rda_port->port.fifosize = RDA_UART_TX_FIFO_SIZE;
+	rda_port->port.ops = &rda_uart_ops;
+
+	rda_uart_ports[pdev->id] = rda_port;
+	platform_set_drvdata(pdev, rda_port);
+
+	ret = uart_add_one_port(&rda_uart_driver, &rda_port->port);
+	if (ret)
+		rda_uart_ports[pdev->id] = NULL;
+
+	return ret;
+}
+
+static int rda_uart_remove(struct platform_device *pdev)
+{
+	struct rda_uart_port *rda_port = platform_get_drvdata(pdev);
+
+	uart_remove_one_port(&rda_uart_driver, &rda_port->port);
+	rda_uart_ports[pdev->id] = NULL;
+
+	return 0;
+}
+
+static struct platform_driver rda_uart_platform_driver = {
+	.probe = rda_uart_probe,
+	.remove = rda_uart_remove,
+	.driver = {
+		.name = "rda-uart",
+		.of_match_table = rda_uart_dt_matches,
+	},
+};
+
+static int __init rda_uart_init(void)
+{
+	int ret;
+
+	ret = uart_register_driver(&rda_uart_driver);
+	if (ret)
+		return ret;
+
+	ret = platform_driver_register(&rda_uart_platform_driver);
+	if (ret)
+		uart_unregister_driver(&rda_uart_driver);
+
+	return ret;
+}
+
+static void __init rda_uart_exit(void)
+{
+	platform_driver_unregister(&rda_uart_platform_driver);
+	uart_unregister_driver(&rda_uart_driver);
+}
+
+module_init(rda_uart_init);
+module_exit(rda_uart_exit);
+
+MODULE_AUTHOR("Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>");
+MODULE_DESCRIPTION("RDA8810PL serial device driver");
+MODULE_LICENSE("GPL");
diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h
index dce5f9dae121..df4a7534e239 100644
--- a/include/uapi/linux/serial_core.h
+++ b/include/uapi/linux/serial_core.h
@@ -281,4 +281,7 @@
 /* MediaTek BTIF */
 #define PORT_MTK_BTIF	117
 
+/* RDA UART */
+#define PORT_RDA	118
+
 #endif /* _UAPILINUX_SERIAL_CORE_H */
-- 
cgit v1.2.3-59-g8ed1b


From 7a79d717e0817610932ce3b7b6033ea06ee1d577 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Mon, 31 Dec 2018 23:59:59 +0100
Subject: batman-adv: Update copyright years for 2019

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 include/uapi/linux/batadv_packet.h     | 2 +-
 include/uapi/linux/batman_adv.h        | 2 +-
 net/batman-adv/Kconfig                 | 2 +-
 net/batman-adv/Makefile                | 2 +-
 net/batman-adv/bat_algo.c              | 2 +-
 net/batman-adv/bat_algo.h              | 2 +-
 net/batman-adv/bat_iv_ogm.c            | 2 +-
 net/batman-adv/bat_iv_ogm.h            | 2 +-
 net/batman-adv/bat_v.c                 | 2 +-
 net/batman-adv/bat_v.h                 | 2 +-
 net/batman-adv/bat_v_elp.c             | 2 +-
 net/batman-adv/bat_v_elp.h             | 2 +-
 net/batman-adv/bat_v_ogm.c             | 2 +-
 net/batman-adv/bat_v_ogm.h             | 2 +-
 net/batman-adv/bitarray.c              | 2 +-
 net/batman-adv/bitarray.h              | 2 +-
 net/batman-adv/bridge_loop_avoidance.c | 2 +-
 net/batman-adv/bridge_loop_avoidance.h | 2 +-
 net/batman-adv/debugfs.c               | 2 +-
 net/batman-adv/debugfs.h               | 2 +-
 net/batman-adv/distributed-arp-table.c | 2 +-
 net/batman-adv/distributed-arp-table.h | 2 +-
 net/batman-adv/fragmentation.c         | 2 +-
 net/batman-adv/fragmentation.h         | 2 +-
 net/batman-adv/gateway_client.c        | 2 +-
 net/batman-adv/gateway_client.h        | 2 +-
 net/batman-adv/gateway_common.c        | 2 +-
 net/batman-adv/gateway_common.h        | 2 +-
 net/batman-adv/hard-interface.c        | 2 +-
 net/batman-adv/hard-interface.h        | 2 +-
 net/batman-adv/hash.c                  | 2 +-
 net/batman-adv/hash.h                  | 2 +-
 net/batman-adv/icmp_socket.c           | 2 +-
 net/batman-adv/icmp_socket.h           | 2 +-
 net/batman-adv/log.c                   | 2 +-
 net/batman-adv/log.h                   | 2 +-
 net/batman-adv/main.c                  | 2 +-
 net/batman-adv/main.h                  | 2 +-
 net/batman-adv/multicast.c             | 2 +-
 net/batman-adv/multicast.h             | 2 +-
 net/batman-adv/netlink.c               | 2 +-
 net/batman-adv/netlink.h               | 2 +-
 net/batman-adv/network-coding.c        | 2 +-
 net/batman-adv/network-coding.h        | 2 +-
 net/batman-adv/originator.c            | 2 +-
 net/batman-adv/originator.h            | 2 +-
 net/batman-adv/routing.c               | 2 +-
 net/batman-adv/routing.h               | 2 +-
 net/batman-adv/send.c                  | 2 +-
 net/batman-adv/send.h                  | 2 +-
 net/batman-adv/soft-interface.c        | 2 +-
 net/batman-adv/soft-interface.h        | 2 +-
 net/batman-adv/sysfs.c                 | 2 +-
 net/batman-adv/sysfs.h                 | 2 +-
 net/batman-adv/tp_meter.c              | 2 +-
 net/batman-adv/tp_meter.h              | 2 +-
 net/batman-adv/trace.c                 | 2 +-
 net/batman-adv/trace.h                 | 2 +-
 net/batman-adv/translation-table.c     | 2 +-
 net/batman-adv/translation-table.h     | 2 +-
 net/batman-adv/tvlv.c                  | 2 +-
 net/batman-adv/tvlv.h                  | 2 +-
 net/batman-adv/types.h                 | 2 +-
 63 files changed, 63 insertions(+), 63 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/batadv_packet.h b/include/uapi/linux/batadv_packet.h
index 894d8d2f713d..7eb2936a8e22 100644
--- a/include/uapi/linux/batadv_packet.h
+++ b/include/uapi/linux/batadv_packet.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) */
-/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index 324a0e1143e7..a28e76a7e0a2 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: MIT */
-/* Copyright (C) 2016-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2016-2019  B.A.T.M.A.N. contributors:
  *
  * Matthias Schiffer
  *
diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
index c386e6981416..a31db5e9ac8e 100644
--- a/net/batman-adv/Kconfig
+++ b/net/batman-adv/Kconfig
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-# Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
+# Copyright (C) 2007-2019  B.A.T.M.A.N. contributors:
 #
 # Marek Lindner, Simon Wunderlich
 #
diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile
index 9b58160fe485..a887ecc3efa1 100644
--- a/net/batman-adv/Makefile
+++ b/net/batman-adv/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-# Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
+# Copyright (C) 2007-2019  B.A.T.M.A.N. contributors:
 #
 # Marek Lindner, Simon Wunderlich
 #
diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c
index ea309ad06175..7b7e15641fef 100644
--- a/net/batman-adv/bat_algo.c
+++ b/net/batman-adv/bat_algo.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h
index 534b790c3753..25e7bb51928c 100644
--- a/net/batman-adv/bat_algo.h
+++ b/net/batman-adv/bat_algo.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2011-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Linus Lüssing
  *
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index f97e566f0402..de61091af666 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h
index 3dc6a7a43eb7..785f6666273c 100644
--- a/net/batman-adv/bat_iv_ogm.h
+++ b/net/batman-adv/bat_iv_ogm.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index 90e33f84d37a..445594ed58af 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2013-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2019  B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing, Marek Lindner
  *
diff --git a/net/batman-adv/bat_v.h b/net/batman-adv/bat_v.h
index ec4a2a569750..465a4fc23354 100644
--- a/net/batman-adv/bat_v.h
+++ b/net/batman-adv/bat_v.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2011-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Linus Lüssing
  *
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index e8090f099eb8..7b80f6f8d4dc 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2011-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2019  B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing, Marek Lindner
  *
diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h
index e8c7b7fd290d..75f189ee4a1c 100644
--- a/net/batman-adv/bat_v_elp.h
+++ b/net/batman-adv/bat_v_elp.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2013-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2019  B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing, Marek Lindner
  *
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
index 2948b41b06d4..c9698ad41854 100644
--- a/net/batman-adv/bat_v_ogm.c
+++ b/net/batman-adv/bat_v_ogm.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2013-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2019  B.A.T.M.A.N. contributors:
  *
  * Antonio Quartulli
  *
diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h
index e5be14c908c6..f67cf7ee06b2 100644
--- a/net/batman-adv/bat_v_ogm.h
+++ b/net/batman-adv/bat_v_ogm.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2013-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2019  B.A.T.M.A.N. contributors:
  *
  * Antonio Quartulli
  *
diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c
index a296a4d851f5..63e134e763e3 100644
--- a/net/batman-adv/bitarray.c
+++ b/net/batman-adv/bitarray.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2006-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2019  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich, Marek Lindner
  *
diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h
index 48f683289531..f3a05ad9afad 100644
--- a/net/batman-adv/bitarray.h
+++ b/net/batman-adv/bitarray.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2006-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2019  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich, Marek Lindner
  *
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index 5fdde2947802..ef39aabdb694 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2011-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2019  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich
  *
diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h
index 71f95a3e4d3f..31771c751efb 100644
--- a/net/batman-adv/bridge_loop_avoidance.h
+++ b/net/batman-adv/bridge_loop_avoidance.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2011-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2019  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich
  *
diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
index d4a7702e48d8..3b9d1ad2f467 100644
--- a/net/batman-adv/debugfs.c
+++ b/net/batman-adv/debugfs.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2010-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h
index 8de018e5c577..c0b8694041ec 100644
--- a/net/batman-adv/debugfs.h
+++ b/net/batman-adv/debugfs.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2010-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index c7f3e2231233..899ab051ffce 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2011-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2019  B.A.T.M.A.N. contributors:
  *
  * Antonio Quartulli
  *
diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h
index e58d7a9bfc8d..68c0ff321acd 100644
--- a/net/batman-adv/distributed-arp-table.h
+++ b/net/batman-adv/distributed-arp-table.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2011-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2019  B.A.T.M.A.N. contributors:
  *
  * Antonio Quartulli
  *
diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index 5b71a289d04f..b506d15b8230 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2013-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2019  B.A.T.M.A.N. contributors:
  *
  * Martin Hundebøll <martin@hundeboll.net>
  *
diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h
index 944512e07782..abdac26579bf 100644
--- a/net/batman-adv/fragmentation.h
+++ b/net/batman-adv/fragmentation.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2013-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2019  B.A.T.M.A.N. contributors:
  *
  * Martin Hundebøll <martin@hundeboll.net>
  *
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index 9d8e5eda2314..0a6b9b30eabd 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2009-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h
index f0b86fcb2493..b5732c8be81a 100644
--- a/net/batman-adv/gateway_client.h
+++ b/net/batman-adv/gateway_client.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2009-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c
index 936c107f3199..53d03adc9bfc 100644
--- a/net/batman-adv/gateway_common.c
+++ b/net/batman-adv/gateway_common.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2009-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h
index 80afb2793687..89bae100a0b0 100644
--- a/net/batman-adv/gateway_common.h
+++ b/net/batman-adv/gateway_common.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2009-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 508f4416dfc9..28c1fb8d1af0 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h
index d1c0f6189301..48de28c83401 100644
--- a/net/batman-adv/hard-interface.h
+++ b/net/batman-adv/hard-interface.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c
index 9194f4d891b1..56a08ce193d5 100644
--- a/net/batman-adv/hash.c
+++ b/net/batman-adv/hash.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2006-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2019  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich, Marek Lindner
  *
diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h
index 0e36fa1c7c3e..37507b6d4006 100644
--- a/net/batman-adv/hash.h
+++ b/net/batman-adv/hash.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2006-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2019  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich, Marek Lindner
  *
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
index d70f363c52ae..d34bb79e0405 100644
--- a/net/batman-adv/icmp_socket.c
+++ b/net/batman-adv/icmp_socket.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h
index 958be22beda9..5f8926522ff0 100644
--- a/net/batman-adv/icmp_socket.h
+++ b/net/batman-adv/icmp_socket.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c
index 02e55b78132f..f97849a8380b 100644
--- a/net/batman-adv/log.c
+++ b/net/batman-adv/log.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2010-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h
index 35f4f397ed57..660e9bcc85a2 100644
--- a/net/batman-adv/log.h
+++ b/net/batman-adv/log.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index d1ed839fd32b..75750870cf04 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index f3ef2c725a41..3ed669d7dc6b 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index 69244e4598f5..5de6a37525f5 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2014-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2014-2019  B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing
  *
diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h
index 3b04ab13f0eb..466013fe88af 100644
--- a/net/batman-adv/multicast.h
+++ b/net/batman-adv/multicast.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2014-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2014-2019  B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing
  *
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 2dc3304cee54..5fe833cc9507 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2016-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2016-2019  B.A.T.M.A.N. contributors:
  *
  * Matthias Schiffer
  *
diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h
index 571d9a5ae7aa..216484b8b82d 100644
--- a/net/batman-adv/netlink.h
+++ b/net/batman-adv/netlink.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2016-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2016-2019  B.A.T.M.A.N. contributors:
  *
  * Matthias Schiffer
  *
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index 34caf129a9bf..278762bd94c6 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2012-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2012-2019  B.A.T.M.A.N. contributors:
  *
  * Martin Hundebøll, Jeppe Ledet-Pedersen
  *
diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h
index 65c346812bc1..96ef0a511fc7 100644
--- a/net/batman-adv/network-coding.h
+++ b/net/batman-adv/network-coding.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2012-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2012-2019  B.A.T.M.A.N. contributors:
  *
  * Martin Hundebøll, Jeppe Ledet-Pedersen
  *
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 56a981af5c92..e5cdf89ef63c 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2009-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index a8b4c7b667ec..dca1e4a34ec6 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index 469fa97d1add..cae0e5dd0768 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h
index db54c2d9b8bf..0102d69d345c 100644
--- a/net/batman-adv/routing.h
+++ b/net/batman-adv/routing.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index 4a35f5c2f52b..66a8b3e44501 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h
index 64cce07b8fe6..1f6132922e60 100644
--- a/net/batman-adv/send.h
+++ b/net/batman-adv/send.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 943071008670..b14fb3462af7 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h
index daf87f07fadd..538bb661878c 100644
--- a/net/batman-adv/soft-interface.h
+++ b/net/batman-adv/soft-interface.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index 09427fc6494a..e1b816262c53 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2010-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h
index c1e3fb69952d..705ffbe763f4 100644
--- a/net/batman-adv/sysfs.h
+++ b/net/batman-adv/sysfs.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2010-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c
index 11520de96ccb..500109bbd551 100644
--- a/net/batman-adv/tp_meter.c
+++ b/net/batman-adv/tp_meter.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2012-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2012-2019  B.A.T.M.A.N. contributors:
  *
  * Edo Monticelli, Antonio Quartulli
  *
diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h
index 68e600974759..6b4d0f733896 100644
--- a/net/batman-adv/tp_meter.h
+++ b/net/batman-adv/tp_meter.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2012-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2012-2019  B.A.T.M.A.N. contributors:
  *
  * Edo Monticelli, Antonio Quartulli
  *
diff --git a/net/batman-adv/trace.c b/net/batman-adv/trace.c
index 8e1024217cff..f77c917ed20d 100644
--- a/net/batman-adv/trace.c
+++ b/net/batman-adv/trace.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2010-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2019  B.A.T.M.A.N. contributors:
  *
  * Sven Eckelmann
  *
diff --git a/net/batman-adv/trace.h b/net/batman-adv/trace.h
index 104784be94d7..5e5579051400 100644
--- a/net/batman-adv/trace.h
+++ b/net/batman-adv/trace.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2010-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2019  B.A.T.M.A.N. contributors:
  *
  * Sven Eckelmann
  *
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 8dcd4968cde7..f73d79139ae7 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich, Antonio Quartulli
  *
diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h
index 01b6c8eafaf9..61bca75e5911 100644
--- a/net/batman-adv/translation-table.h
+++ b/net/batman-adv/translation-table.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich, Antonio Quartulli
  *
diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c
index 40e69c9346d2..7e947b01919d 100644
--- a/net/batman-adv/tvlv.c
+++ b/net/batman-adv/tvlv.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h
index ef5867f49824..c0f033b1acb8 100644
--- a/net/batman-adv/tvlv.h
+++ b/net/batman-adv/tvlv.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index cbe17da36fcb..a21b34ed6548 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
-- 
cgit v1.2.3-59-g8ed1b


From f5162216b7dab0c07e070b8b7f98891a85047f59 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Thu, 3 Jan 2019 15:27:43 -0800
Subject: autofs: add strictexpire mount option

Commit 092a53452bb7 ("autofs: take more care to not update last_used on
path walk") helped to (partially) resolve a problem where automounts
were not expiring due to aggressive accesses from user space.

This patch was later reverted because, for very large environments, it
meant more mount requests from clients and when there are a lot of
clients this caused a fairly significant increase in server load.

But there is a need for both types of expire check, depending on use
case, so add a mount option to allow for strict update of last use of
autofs dentrys (which just means not updating the last use on path walk
access).

Link: http://lkml.kernel.org/r/154296973880.9889.14085372741514507967.stgit@pluto-themaw-net
Signed-off-by: Ian Kent <raven@themaw.net>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/autofs/autofs_i.h         | 1 +
 fs/autofs/inode.c            | 8 +++++++-
 fs/autofs/root.c             | 5 ++++-
 include/uapi/linux/auto_fs.h | 2 +-
 4 files changed, 13 insertions(+), 3 deletions(-)

(limited to 'include/uapi')

diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 9b81c10ef251..3e59f0ed777b 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -104,6 +104,7 @@ struct autofs_wait_queue {
 #define AUTOFS_SBI_MAGIC 0x6d4a556d
 
 #define AUTOFS_SBI_CATATONIC	0x0001
+#define AUTOFS_SBI_STRICTEXPIRE 0x0002
 
 struct autofs_sb_info {
 	u32 magic;
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index 3a94dbda36dd..0e8ea2d9a2bb 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -87,6 +87,8 @@ static int autofs_show_options(struct seq_file *m, struct dentry *root)
 		seq_printf(m, ",direct");
 	else
 		seq_printf(m, ",indirect");
+	if (sbi->flags & AUTOFS_SBI_STRICTEXPIRE)
+		seq_printf(m, ",strictexpire");
 #ifdef CONFIG_CHECKPOINT_RESTORE
 	if (sbi->pipe)
 		seq_printf(m, ",pipe_ino=%ld", file_inode(sbi->pipe)->i_ino);
@@ -109,7 +111,7 @@ static const struct super_operations autofs_sops = {
 };
 
 enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto,
-	Opt_indirect, Opt_direct, Opt_offset};
+	Opt_indirect, Opt_direct, Opt_offset, Opt_strictexpire};
 
 static const match_table_t tokens = {
 	{Opt_fd, "fd=%u"},
@@ -121,6 +123,7 @@ static const match_table_t tokens = {
 	{Opt_indirect, "indirect"},
 	{Opt_direct, "direct"},
 	{Opt_offset, "offset"},
+	{Opt_strictexpire, "strictexpire"},
 	{Opt_err, NULL}
 };
 
@@ -200,6 +203,9 @@ static int parse_options(char *options,
 		case Opt_offset:
 			set_autofs_type_offset(&sbi->type);
 			break;
+		case Opt_strictexpire:
+			sbi->flags |= AUTOFS_SBI_STRICTEXPIRE;
+			break;
 		default:
 			return 1;
 		}
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 164ccd3402cf..1246f396bf0e 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -275,8 +275,11 @@ static int autofs_mount_wait(const struct path *path, bool rcu_walk)
 		pr_debug("waiting for mount name=%pd\n", path->dentry);
 		status = autofs_wait(sbi, path, NFY_MOUNT);
 		pr_debug("mount wait done status=%d\n", status);
+		ino->last_used = jiffies;
+		return status;
 	}
-	ino->last_used = jiffies;
+	if (!(sbi->flags & AUTOFS_SBI_STRICTEXPIRE))
+		ino->last_used = jiffies;
 	return status;
 }
 
diff --git a/include/uapi/linux/auto_fs.h b/include/uapi/linux/auto_fs.h
index df31aa9c9a8c..082119630b49 100644
--- a/include/uapi/linux/auto_fs.h
+++ b/include/uapi/linux/auto_fs.h
@@ -23,7 +23,7 @@
 #define AUTOFS_MIN_PROTO_VERSION	3
 #define AUTOFS_MAX_PROTO_VERSION	5
 
-#define AUTOFS_PROTO_SUBVERSION		3
+#define AUTOFS_PROTO_SUBVERSION		4
 
 /*
  * The wait_queue_token (autofs_wqt_t) is part of a structure which is passed
-- 
cgit v1.2.3-59-g8ed1b


From 9da22854761a76c45d78aa2ae2b4bbd504b4f171 Mon Sep 17 00:00:00 2001
From: Carmeli Tamir <carmeli.tamir@gmail.com>
Date: Thu, 3 Jan 2019 15:27:49 -0800
Subject: include/uapi/linux/msdos_fs.h: use MSDOS_NAME for volume label size

The FAT file system volume label file stored in the root directory
should match the volume label field in the FAT boot sector.  As
consequence, the max length of these fields ought to be the same.  This
patch replaces the magic '11' usef in the struct fat_boot_sector with
MSDOS_NAME, which is used in struct msdos_dir_entry.

Please check the following references:
1. Microsoft FAT specification 2005
(http://read.pudn.com/downloads77/ebook/294884/FAT32%20Spec%20%28SDA%20Contribution%29.pdf).
Search for 'volume label'.
2. Microsoft Extensible Firmware Initiative, FAT32 File System Specification
(https://staff.washington.edu/dittrich/misc/fatgen103.pdf).
Search for 'volume label'.
3. User space code that creates FAT filesystem
sometimes uses MSDOS_NAME for the label, sometimes not.
Search for 'if (memcmp(label, NO_NAME, MSDOS_NAME))'.
I consider to make the same patch there as well.
https://github.com/dosfstools/dosfstools/blob/master/src/mkfs.fat.c

Link: http://lkml.kernel.org/r/1543096879-82837-1-git-send-email-carmeli.tamir@gmail.com
Signed-off-by: Carmeli Tamir <carmeli.tamir@gmail.com>
Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Acked-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Bart Van Assche <bvanassche@acm.org>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/uapi/linux/msdos_fs.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/msdos_fs.h b/include/uapi/linux/msdos_fs.h
index fde753735aba..1216e6caf59b 100644
--- a/include/uapi/linux/msdos_fs.h
+++ b/include/uapi/linux/msdos_fs.h
@@ -135,7 +135,7 @@ struct fat_boot_sector {
 						   for mount state. */
 			__u8	signature;  /* extended boot signature */
 			__u8	vol_id[4];	/* volume ID */
-			__u8	vol_label[11];	/* volume label */
+			__u8	vol_label[MSDOS_NAME];	/* volume label */
 			__u8	fs_type[8];		/* file system type */
 			/* other fields are not added here */
 		} fat16;
@@ -158,7 +158,7 @@ struct fat_boot_sector {
 						   for mount state. */
 			__u8	signature;  /* extended boot signature */
 			__u8	vol_id[4];	/* volume ID */
-			__u8	vol_label[11];	/* volume label */
+			__u8	vol_label[MSDOS_NAME];	/* volume label */
 			__u8	fs_type[8];		/* file system type */
 			/* other fields are not added here */
 		} fat32;
-- 
cgit v1.2.3-59-g8ed1b


From b553337a57cf4f077464292520f4e975ea4cda83 Mon Sep 17 00:00:00 2001
From: Carmeli Tamir <carmeli.tamir@gmail.com>
Date: Thu, 3 Jan 2019 15:27:53 -0800
Subject: fat: remove FAT_FIRST_ENT macro

The comment edited in this patch was the only reference to the
FAT_FIRST_ENT macro, which is not used anymore.  Moreover, the commented
line of code does not compile with the current code.

Since the FAT_FIRST_ENT macro checks the FAT variant in a way that the
patch series changes, I removed it, and instead wrote a clear
explanation of what was checked.

I verified that the changed comment is correct according to Microsoft
FAT spec, search for "BPB_Media" in the following references:

1. Microsoft FAT specification 2005
(http://read.pudn.com/downloads77/ebook/294884/FAT32%20Spec%20%28SDA%20Contribution%29.pdf).
Search for 'volume label'.
2. Microsoft Extensible Firmware Initiative, FAT32 File System Specification
(https://staff.washington.edu/dittrich/misc/fatgen103.pdf).
Search for 'volume label'.

Link: http://lkml.kernel.org/r/1544990640-11604-2-git-send-email-carmeli.tamir@gmail.com
Signed-off-by: Carmeli Tamir <carmeli.tamir@gmail.com>
Acked-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Bart Van Assche <bvanassche@acm.org>
Cc: Johannes Thumshirn <jthumshirn@suse.de>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/inode.c                | 12 ++++++++----
 include/uapi/linux/msdos_fs.h |  3 ---
 2 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'include/uapi')

diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index c0b5b5c3373b..269e7b91b8b1 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1803,11 +1803,15 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
 	fat_ent_access_init(sb);
 
 	/*
-	 * The low byte of FAT's first entry must have same value with
-	 * media-field.  But in real world, too many devices is
-	 * writing wrong value.  So, removed that validity check.
+	 * The low byte of the first FAT entry must have the same value as
+	 * the media field of the boot sector. But in real world, too many
+	 * devices are writing wrong values. So, removed that validity check.
 	 *
-	 * if (FAT_FIRST_ENT(sb, media) != first)
+	 * The removed check compared the first FAT entry to a value dependent
+	 * on the media field like this:
+	 * == (0x0F00 | media), for FAT12
+	 * == (0XFF00 | media), for FAT16
+	 * == (0x0FFFFF | media), for FAT32
 	 */
 
 	error = -EINVAL;
diff --git a/include/uapi/linux/msdos_fs.h b/include/uapi/linux/msdos_fs.h
index 1216e6caf59b..833c7079a1e3 100644
--- a/include/uapi/linux/msdos_fs.h
+++ b/include/uapi/linux/msdos_fs.h
@@ -58,9 +58,6 @@
 #define MSDOS_DOT	".          "	/* ".", padded to MSDOS_NAME chars */
 #define MSDOS_DOTDOT	"..         "	/* "..", padded to MSDOS_NAME chars */
 
-#define FAT_FIRST_ENT(s, x)	((MSDOS_SB(s)->fat_bits == 32 ? 0x0FFFFF00 : \
-	MSDOS_SB(s)->fat_bits == 16 ? 0xFF00 : 0xF00) | (x))
-
 /* start of data cluster's entry (number of reserved clusters) */
 #define FAT_START_ENT	2
 
-- 
cgit v1.2.3-59-g8ed1b


From d19dc016187502dda6b8095e44eb46a18e89b2b3 Mon Sep 17 00:00:00 2001
From: Carmeli Tamir <carmeli.tamir@gmail.com>
Date: Thu, 3 Jan 2019 15:27:56 -0800
Subject: fat: move MAX_FAT to fat.h and change it to inline function

MAX_FAT is useless in msdos_fs.h, since it uses the MSDOS_SB function
that is defined in fat.h.  So really, this macro can be only called from
code that already includes fat.h.

Hence, this patch moves it to fat.h, right after MSDOS_SB is defined.  I
also changed it to an inline function in order to save the double call
to MSDOS_SB.  This was suggested by joe@perches.com in the previous
version.

This patch is required for the next in the series, in which the variant
(whether this is FAT12, FAT16 or FAT32) checks are replaced with new
macros.

Link: http://lkml.kernel.org/r/1544990640-11604-3-git-send-email-carmeli.tamir@gmail.com
Signed-off-by: Carmeli Tamir <carmeli.tamir@gmail.com>
Acked-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Bart Van Assche <bvanassche@acm.org>
Cc: Johannes Thumshirn <jthumshirn@suse.de>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/fat.h                  | 9 +++++++++
 fs/fat/inode.c                | 2 +-
 include/uapi/linux/msdos_fs.h | 2 --
 3 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'include/uapi')

diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 4e1b2f6df5e6..979bb11832f6 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -142,6 +142,15 @@ static inline struct msdos_sb_info *MSDOS_SB(struct super_block *sb)
 	return sb->s_fs_info;
 }
 
+/* Maximum number of clusters */
+static inline u32 max_fat(struct super_block *sb)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+
+	return sbi->fat_bits == 32 ? MAX_FAT32 :
+		sbi->fat_bits == 16 ? MAX_FAT16 : MAX_FAT12;
+}
+
 static inline struct msdos_inode_info *MSDOS_I(struct inode *inode)
 {
 	return container_of(inode, struct msdos_inode_info, vfs_inode);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 269e7b91b8b1..8d0de1aac4eb 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1781,7 +1781,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
 	/* check that FAT table does not overflow */
 	fat_clusters = calc_fat_clusters(sb);
 	total_clusters = min(total_clusters, fat_clusters - FAT_START_ENT);
-	if (total_clusters > MAX_FAT(sb)) {
+	if (total_clusters > max_fat(sb)) {
 		if (!silent)
 			fat_msg(sb, KERN_ERR, "count of clusters too big (%u)",
 			       total_clusters);
diff --git a/include/uapi/linux/msdos_fs.h b/include/uapi/linux/msdos_fs.h
index 833c7079a1e3..a5773899f4d9 100644
--- a/include/uapi/linux/msdos_fs.h
+++ b/include/uapi/linux/msdos_fs.h
@@ -65,8 +65,6 @@
 #define MAX_FAT12	0xFF4
 #define MAX_FAT16	0xFFF4
 #define MAX_FAT32	0x0FFFFFF6
-#define MAX_FAT(s)	(MSDOS_SB(s)->fat_bits == 32 ? MAX_FAT32 : \
-	MSDOS_SB(s)->fat_bits == 16 ? MAX_FAT16 : MAX_FAT12)
 
 /* bad cluster mark */
 #define BAD_FAT12	0xFF7
-- 
cgit v1.2.3-59-g8ed1b


From d1877155891020cb26ad4fba45bfee52d8da9951 Mon Sep 17 00:00:00 2001
From: Tigran Aivazian <aivazian.tigran@gmail.com>
Date: Thu, 3 Jan 2019 15:28:14 -0800
Subject: bfs: extra sanity checking and static inode bitmap

Strengthen validation of BFS superblock against corruption.  Make
in-core inode bitmap static part of superblock info structure.  Print a
warning when mounting a BFS filesystem created with "-N 512" option as
only 510 files can be created in the root directory.  Make the kernel
messages more uniform.  Update the 'prefix' passed to bfs_dump_imap() to
match the current naming of operations.  White space and comments
cleanup.

Link: http://lkml.kernel.org/r/CAK+_RLkFZMduoQF36wZFd3zLi-6ZutWKsydjeHFNdtRvZZEb4w@mail.gmail.com
Signed-off-by: Tigran Aivazian <aivazian.tigran@gmail.com>
Reported-by: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/bfs/bfs.h                | 11 ++++++--
 fs/bfs/dir.c                |  4 +--
 fs/bfs/file.c               |  2 +-
 fs/bfs/inode.c              | 65 +++++++++++++++++++--------------------------
 include/uapi/linux/bfs_fs.h |  2 +-
 5 files changed, 41 insertions(+), 43 deletions(-)

(limited to 'include/uapi')

diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h
index 67aef3bb89e4..606f9378b2f0 100644
--- a/fs/bfs/bfs.h
+++ b/fs/bfs/bfs.h
@@ -1,13 +1,20 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /*
  *	fs/bfs/bfs.h
- *	Copyright (C) 1999 Tigran Aivazian <tigran@veritas.com>
+ *	Copyright (C) 1999-2018 Tigran Aivazian <aivazian.tigran@gmail.com>
  */
 #ifndef _FS_BFS_BFS_H
 #define _FS_BFS_BFS_H
 
 #include <linux/bfs_fs.h>
 
+/* In theory BFS supports up to 512 inodes, numbered from 2 (for /) up to 513 inclusive.
+   In actual fact, attempting to create the 512th inode (i.e. inode No. 513 or file No. 511)
+   will fail with ENOSPC in bfs_add_entry(): the root directory cannot contain so many entries, counting '..'.
+   So, mkfs.bfs(8) should really limit its -N option to 511 and not 512. For now, we just print a warning
+   if a filesystem is mounted with such "impossible to fill up" number of inodes */
+#define BFS_MAX_LASTI	513
+
 /*
  * BFS file system in-core superblock info
  */
@@ -17,7 +24,7 @@ struct bfs_sb_info {
 	unsigned long si_freei;
 	unsigned long si_lf_eblk;
 	unsigned long si_lasti;
-	unsigned long *si_imap;
+	DECLARE_BITMAP(si_imap, BFS_MAX_LASTI+1);
 	struct mutex bfs_lock;
 };
 
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index f32f21c3bbc7..d8dfe3a0cb39 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -2,8 +2,8 @@
 /*
  *	fs/bfs/dir.c
  *	BFS directory operations.
- *	Copyright (C) 1999,2000  Tigran Aivazian <tigran@veritas.com>
- *      Made endianness-clean by Andrew Stribblehill <ads@wompom.org> 2005
+ *	Copyright (C) 1999-2018  Tigran Aivazian <aivazian.tigran@gmail.com>
+ *  Made endianness-clean by Andrew Stribblehill <ads@wompom.org> 2005
  */
 
 #include <linux/time.h>
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index 1476cdd90cfb..0dceefc54b48 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -2,7 +2,7 @@
 /*
  *	fs/bfs/file.c
  *	BFS file operations.
- *	Copyright (C) 1999,2000 Tigran Aivazian <tigran@veritas.com>
+ *	Copyright (C) 1999-2018 Tigran Aivazian <aivazian.tigran@gmail.com>
  *
  *	Make the file block allocation algorithm understand the size
  *	of the underlying block device.
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index d81c148682e7..d136b2aaafb3 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -1,10 +1,9 @@
 /*
  *	fs/bfs/inode.c
  *	BFS superblock and inode operations.
- *	Copyright (C) 1999-2006 Tigran Aivazian <aivazian.tigran@gmail.com>
+ *	Copyright (C) 1999-2018 Tigran Aivazian <aivazian.tigran@gmail.com>
  *	From fs/minix, Copyright (C) 1991, 1992 Linus Torvalds.
- *
- *      Made endianness-clean by Andrew Stribblehill <ads@wompom.org>, 2005.
+ *	Made endianness-clean by Andrew Stribblehill <ads@wompom.org>, 2005.
  */
 
 #include <linux/module.h>
@@ -118,12 +117,12 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct bfs_sb_info *info = BFS_SB(inode->i_sb);
 	unsigned int ino = (u16)inode->i_ino;
-        unsigned long i_sblock;
+	unsigned long i_sblock;
 	struct bfs_inode *di;
 	struct buffer_head *bh;
 	int err = 0;
 
-        dprintf("ino=%08x\n", ino);
+	dprintf("ino=%08x\n", ino);
 
 	di = find_inode(inode->i_sb, ino, &bh);
 	if (IS_ERR(di))
@@ -144,7 +143,7 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 	di->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
 	di->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
 	di->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
-        i_sblock = BFS_I(inode)->i_sblock;
+	i_sblock = BFS_I(inode)->i_sblock;
 	di->i_sblock = cpu_to_le32(i_sblock);
 	di->i_eblock = cpu_to_le32(BFS_I(inode)->i_eblock);
 	di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1);
@@ -188,13 +187,13 @@ static void bfs_evict_inode(struct inode *inode)
 	mark_buffer_dirty(bh);
 	brelse(bh);
 
-        if (bi->i_dsk_ino) {
+	if (bi->i_dsk_ino) {
 		if (bi->i_sblock)
 			info->si_freeb += bi->i_eblock + 1 - bi->i_sblock;
 		info->si_freei++;
 		clear_bit(ino, info->si_imap);
-		bfs_dump_imap("delete_inode", s);
-        }
+		bfs_dump_imap("evict_inode", s);
+	}
 
 	/*
 	 * If this was the last file, make the previous block
@@ -214,7 +213,6 @@ static void bfs_put_super(struct super_block *s)
 		return;
 
 	mutex_destroy(&info->bfs_lock);
-	kfree(info->si_imap);
 	kfree(info);
 	s->s_fs_info = NULL;
 }
@@ -311,8 +309,7 @@ void bfs_dump_imap(const char *prefix, struct super_block *s)
 		else
 			strcat(tmpbuf, "0");
 	}
-	printf("BFS-fs: %s: lasti=%08lx <%s>\n",
-				prefix, BFS_SB(s)->si_lasti, tmpbuf);
+	printf("%s: lasti=%08lx <%s>\n", prefix, BFS_SB(s)->si_lasti, tmpbuf);
 	free_page((unsigned long)tmpbuf);
 #endif
 }
@@ -322,7 +319,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 	struct buffer_head *bh, *sbh;
 	struct bfs_super_block *bfs_sb;
 	struct inode *inode;
-	unsigned i, imap_len;
+	unsigned i;
 	struct bfs_sb_info *info;
 	int ret = -EINVAL;
 	unsigned long i_sblock, i_eblock, i_eoff, s_size;
@@ -341,8 +338,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 	bfs_sb = (struct bfs_super_block *)sbh->b_data;
 	if (le32_to_cpu(bfs_sb->s_magic) != BFS_MAGIC) {
 		if (!silent)
-			printf("No BFS filesystem on %s (magic=%08x)\n", 
-				s->s_id,  le32_to_cpu(bfs_sb->s_magic));
+			printf("No BFS filesystem on %s (magic=%08x)\n", s->s_id,  le32_to_cpu(bfs_sb->s_magic));
 		goto out1;
 	}
 	if (BFS_UNCLEAN(bfs_sb, s) && !silent)
@@ -351,18 +347,16 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 	s->s_magic = BFS_MAGIC;
 
 	if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end) ||
-	    le32_to_cpu(bfs_sb->s_start) < BFS_BSIZE) {
-		printf("Superblock is corrupted\n");
+	    le32_to_cpu(bfs_sb->s_start) < sizeof(struct bfs_super_block) + sizeof(struct bfs_dirent)) {
+		printf("Superblock is corrupted on %s\n", s->s_id);
 		goto out1;
 	}
 
-	info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) /
-					sizeof(struct bfs_inode)
-					+ BFS_ROOT_INO - 1;
-	imap_len = (info->si_lasti / 8) + 1;
-	info->si_imap = kzalloc(imap_len, GFP_KERNEL | __GFP_NOWARN);
-	if (!info->si_imap) {
-		printf("Cannot allocate %u bytes\n", imap_len);
+	info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) / sizeof(struct bfs_inode) + BFS_ROOT_INO - 1;
+	if (info->si_lasti == BFS_MAX_LASTI)
+		printf("WARNING: filesystem %s was created with 512 inodes, the real maximum is 511, mounting anyway\n", s->s_id);
+	else if (info->si_lasti > BFS_MAX_LASTI) {
+		printf("Impossible last inode number %lu > %d on %s\n", info->si_lasti, BFS_MAX_LASTI, s->s_id);
 		goto out1;
 	}
 	for (i = 0; i < BFS_ROOT_INO; i++)
@@ -372,26 +366,25 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 	inode = bfs_iget(s, BFS_ROOT_INO);
 	if (IS_ERR(inode)) {
 		ret = PTR_ERR(inode);
-		goto out2;
+		goto out1;
 	}
 	s->s_root = d_make_root(inode);
 	if (!s->s_root) {
 		ret = -ENOMEM;
-		goto out2;
+		goto out1;
 	}
 
 	info->si_blocks = (le32_to_cpu(bfs_sb->s_end) + 1) >> BFS_BSIZE_BITS;
-	info->si_freeb = (le32_to_cpu(bfs_sb->s_end) + 1
-			- le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS;
+	info->si_freeb = (le32_to_cpu(bfs_sb->s_end) + 1 - le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS;
 	info->si_freei = 0;
 	info->si_lf_eblk = 0;
 
 	/* can we read the last block? */
 	bh = sb_bread(s, info->si_blocks - 1);
 	if (!bh) {
-		printf("Last block not available: %lu\n", info->si_blocks - 1);
+		printf("Last block not available on %s: %lu\n", s->s_id, info->si_blocks - 1);
 		ret = -EIO;
-		goto out3;
+		goto out2;
 	}
 	brelse(bh);
 
@@ -425,11 +418,11 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 			(i_eoff != le32_to_cpu(-1) && i_eoff > s_size) ||
 			i_sblock * BFS_BSIZE > i_eoff) {
 
-			printf("Inode 0x%08x corrupted\n", i);
+			printf("Inode 0x%08x corrupted on %s\n", i, s->s_id);
 
 			brelse(bh);
 			ret = -EIO;
-			goto out3;
+			goto out2;
 		}
 
 		if (!di->i_ino) {
@@ -445,14 +438,12 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 	}
 	brelse(bh);
 	brelse(sbh);
-	bfs_dump_imap("read_super", s);
+	bfs_dump_imap("fill_super", s);
 	return 0;
 
-out3:
+out2:
 	dput(s->s_root);
 	s->s_root = NULL;
-out2:
-	kfree(info->si_imap);
 out1:
 	brelse(sbh);
 out:
@@ -482,7 +473,7 @@ static int __init init_bfs_fs(void)
 	int err = init_inodecache();
 	if (err)
 		goto out1;
-        err = register_filesystem(&bfs_fs_type);
+	err = register_filesystem(&bfs_fs_type);
 	if (err)
 		goto out;
 	return 0;
diff --git a/include/uapi/linux/bfs_fs.h b/include/uapi/linux/bfs_fs.h
index 940b04772af8..08f6b4956359 100644
--- a/include/uapi/linux/bfs_fs.h
+++ b/include/uapi/linux/bfs_fs.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /*
  *	include/linux/bfs_fs.h - BFS data structures on disk.
- *	Copyright (C) 1999 Tigran Aivazian <tigran@veritas.com>
+ *	Copyright (C) 1999-2018 Tigran Aivazian <aivazian.tigran@gmail.com>
  */
 
 #ifndef _LINUX_BFS_FS_H
-- 
cgit v1.2.3-59-g8ed1b


From 81c9d43f94870be66146739c6e61df40dc17bb64 Mon Sep 17 00:00:00 2001
From: Feng Tang <feng.tang@intel.com>
Date: Thu, 3 Jan 2019 15:28:20 -0800
Subject: kernel/sysctl: add panic_print into sysctl

So that we can also runtime chose to print out the needed system info
for panic, other than setting the kernel cmdline.

Link: http://lkml.kernel.org/r/1543398842-19295-3-git-send-email-feng.tang@intel.com
Signed-off-by: Feng Tang <feng.tang@intel.com>
Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/sysctl/kernel.txt | 17 +++++++++++++++++
 include/linux/kernel.h          |  1 +
 include/uapi/linux/sysctl.h     |  1 +
 kernel/panic.c                  |  2 +-
 kernel/sysctl.c                 |  7 +++++++
 kernel/sysctl_binary.c          |  1 +
 6 files changed, 28 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 1b8775298cf7..c0527d8a468a 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -60,6 +60,7 @@ show up in /proc/sys/kernel:
 - panic_on_stackoverflow
 - panic_on_unrecovered_nmi
 - panic_on_warn
+- panic_print
 - panic_on_rcu_stall
 - perf_cpu_time_max_percent
 - perf_event_paranoid
@@ -654,6 +655,22 @@ a kernel rebuild when attempting to kdump at the location of a WARN().
 
 ==============================================================
 
+panic_print:
+
+Bitmask for printing system info when panic happens. User can chose
+combination of the following bits:
+
+bit 0: print all tasks info
+bit 1: print system memory info
+bit 2: print timer info
+bit 3: print locks info if CONFIG_LOCKDEP is on
+bit 4: print ftrace buffer
+
+So for example to print tasks and memory info on panic, user can:
+  echo 3 > /proc/sys/kernel/panic_print
+
+==============================================================
+
 panic_on_rcu_stall:
 
 When set to 1, calls panic() after RCU stall detection messages. This
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index d6aac75b51ba..8f0e68e250a7 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -527,6 +527,7 @@ static inline u32 int_sqrt64(u64 x)
 extern void bust_spinlocks(int yes);
 extern int oops_in_progress;		/* If set, an oops, panic(), BUG() or die() is in progress */
 extern int panic_timeout;
+extern unsigned long panic_print;
 extern int panic_on_oops;
 extern int panic_on_unrecovered_nmi;
 extern int panic_on_io_nmi;
diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h
index d71013fffaf6..87aa2a6d9125 100644
--- a/include/uapi/linux/sysctl.h
+++ b/include/uapi/linux/sysctl.h
@@ -153,6 +153,7 @@ enum
 	KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */
 	KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */
 	KERN_PANIC_ON_WARN=77, /* int: call panic() in WARN() functions */
+	KERN_PANIC_PRINT=78, /* ulong: bitmask to print system info on panic */
 };
 
 
diff --git a/kernel/panic.c b/kernel/panic.c
index 855f66738bc7..f121e6ba7e11 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -51,7 +51,7 @@ EXPORT_SYMBOL_GPL(panic_timeout);
 #define PANIC_PRINT_TIMER_INFO		0x00000004
 #define PANIC_PRINT_LOCK_INFO		0x00000008
 #define PANIC_PRINT_FTRACE_INFO		0x00000010
-static unsigned long panic_print;
+unsigned long panic_print;
 
 ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7f6c1a3b3485..ba4d9e85feb8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -807,6 +807,13 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "panic_print",
+		.data		= &panic_print,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax,
+	},
 #if defined CONFIG_PRINTK
 	{
 		.procname	= "printk",
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 07148b497451..73c132095a7b 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -140,6 +140,7 @@ static const struct bin_table bin_kern_table[] = {
 	{ CTL_INT,	KERN_MAX_LOCK_DEPTH,		"max_lock_depth" },
 	{ CTL_INT,	KERN_PANIC_ON_NMI,		"panic_on_unrecovered_nmi" },
 	{ CTL_INT,	KERN_PANIC_ON_WARN,		"panic_on_warn" },
+	{ CTL_ULONG,	KERN_PANIC_PRINT,		"panic_print" },
 	{}
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From d4ce5458ea1b7d8ca49c436d602095c4912777d3 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Thu, 3 Jan 2019 10:10:37 +0900
Subject: arch: remove stale comments "UAPI Header export list"

These comments are leftovers of commit fcc8487d477a ("uapi: export all
headers under uapi directories").

Prior to that commit, exported headers must be explicitly added to
header-y. Now, all headers under the uapi/ directories are exported.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---
 arch/alpha/include/uapi/asm/Kbuild      | 1 -
 arch/arc/include/uapi/asm/Kbuild        | 1 -
 arch/arm/include/uapi/asm/Kbuild        | 1 -
 arch/arm64/include/uapi/asm/Kbuild      | 1 -
 arch/c6x/include/uapi/asm/Kbuild        | 1 -
 arch/h8300/include/uapi/asm/Kbuild      | 1 -
 arch/hexagon/include/uapi/asm/Kbuild    | 1 -
 arch/ia64/include/uapi/asm/Kbuild       | 1 -
 arch/m68k/include/uapi/asm/Kbuild       | 1 -
 arch/microblaze/include/uapi/asm/Kbuild | 1 -
 arch/mips/include/uapi/asm/Kbuild       | 1 -
 arch/nds32/include/uapi/asm/Kbuild      | 1 -
 arch/nios2/include/uapi/asm/Kbuild      | 1 -
 arch/openrisc/include/uapi/asm/Kbuild   | 1 -
 arch/parisc/include/uapi/asm/Kbuild     | 1 -
 arch/powerpc/include/uapi/asm/Kbuild    | 1 -
 arch/riscv/include/uapi/asm/Kbuild      | 1 -
 arch/s390/include/uapi/asm/Kbuild       | 1 -
 arch/sh/include/uapi/asm/Kbuild         | 1 -
 arch/sparc/include/uapi/asm/Kbuild      | 1 -
 arch/unicore32/include/uapi/asm/Kbuild  | 1 -
 arch/x86/include/uapi/asm/Kbuild        | 1 -
 arch/xtensa/include/uapi/asm/Kbuild     | 1 -
 include/uapi/linux/Kbuild               | 2 --
 24 files changed, 25 deletions(-)

(limited to 'include/uapi')

diff --git a/arch/alpha/include/uapi/asm/Kbuild b/arch/alpha/include/uapi/asm/Kbuild
index 6a3a0ce0c61b..50dea7fa7590 100644
--- a/arch/alpha/include/uapi/asm/Kbuild
+++ b/arch/alpha/include/uapi/asm/Kbuild
@@ -1,4 +1,3 @@
-# UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
 generated-y += unistd_32.h
diff --git a/arch/arc/include/uapi/asm/Kbuild b/arch/arc/include/uapi/asm/Kbuild
index 170b5db64afe..42ac23efa318 100644
--- a/arch/arc/include/uapi/asm/Kbuild
+++ b/arch/arc/include/uapi/asm/Kbuild
@@ -1,4 +1,3 @@
-# UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += auxvec.h
diff --git a/arch/arm/include/uapi/asm/Kbuild b/arch/arm/include/uapi/asm/Kbuild
index 4d1cc1847edf..393fe326e607 100644
--- a/arch/arm/include/uapi/asm/Kbuild
+++ b/arch/arm/include/uapi/asm/Kbuild
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
-# UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
 generated-y += unistd-common.h
diff --git a/arch/arm64/include/uapi/asm/Kbuild b/arch/arm64/include/uapi/asm/Kbuild
index 6c5adf458690..eb435434055d 100644
--- a/arch/arm64/include/uapi/asm/Kbuild
+++ b/arch/arm64/include/uapi/asm/Kbuild
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
-# UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += errno.h
diff --git a/arch/c6x/include/uapi/asm/Kbuild b/arch/c6x/include/uapi/asm/Kbuild
index 26644e15d854..633f38651cb0 100644
--- a/arch/c6x/include/uapi/asm/Kbuild
+++ b/arch/c6x/include/uapi/asm/Kbuild
@@ -1,4 +1,3 @@
-# UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += auxvec.h
diff --git a/arch/h8300/include/uapi/asm/Kbuild b/arch/h8300/include/uapi/asm/Kbuild
index 2f65f78792cb..c1a994e3a37f 100644
--- a/arch/h8300/include/uapi/asm/Kbuild
+++ b/arch/h8300/include/uapi/asm/Kbuild
@@ -1,4 +1,3 @@
-# UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += auxvec.h
diff --git a/arch/hexagon/include/uapi/asm/Kbuild b/arch/hexagon/include/uapi/asm/Kbuild
index 41a176dbb53e..bcc65a237534 100644
--- a/arch/hexagon/include/uapi/asm/Kbuild
+++ b/arch/hexagon/include/uapi/asm/Kbuild
@@ -1,4 +1,3 @@
-# UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += auxvec.h
diff --git a/arch/ia64/include/uapi/asm/Kbuild b/arch/ia64/include/uapi/asm/Kbuild
index ccce0ea65e05..3536c789d105 100644
--- a/arch/ia64/include/uapi/asm/Kbuild
+++ b/arch/ia64/include/uapi/asm/Kbuild
@@ -1,4 +1,3 @@
-# UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
 generated-y += unistd_64.h
diff --git a/arch/m68k/include/uapi/asm/Kbuild b/arch/m68k/include/uapi/asm/Kbuild
index b6452910d7e1..2d75dbc49e25 100644
--- a/arch/m68k/include/uapi/asm/Kbuild
+++ b/arch/m68k/include/uapi/asm/Kbuild
@@ -1,4 +1,3 @@
-# UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
 generated-y += unistd_32.h
diff --git a/arch/microblaze/include/uapi/asm/Kbuild b/arch/microblaze/include/uapi/asm/Kbuild
index b6656d930a0e..c95da4111ba2 100644
--- a/arch/microblaze/include/uapi/asm/Kbuild
+++ b/arch/microblaze/include/uapi/asm/Kbuild
@@ -1,4 +1,3 @@
-# UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
 generated-y += unistd_32.h
diff --git a/arch/mips/include/uapi/asm/Kbuild b/arch/mips/include/uapi/asm/Kbuild
index ed4bd032f456..215d808560de 100644
--- a/arch/mips/include/uapi/asm/Kbuild
+++ b/arch/mips/include/uapi/asm/Kbuild
@@ -1,4 +1,3 @@
-# UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
 generated-y += unistd_n32.h
diff --git a/arch/nds32/include/uapi/asm/Kbuild b/arch/nds32/include/uapi/asm/Kbuild
index 40be972faf9e..ddd2a283b8b4 100644
--- a/arch/nds32/include/uapi/asm/Kbuild
+++ b/arch/nds32/include/uapi/asm/Kbuild
@@ -1,4 +1,3 @@
-# UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += bpf_perf_event.h
diff --git a/arch/nios2/include/uapi/asm/Kbuild b/arch/nios2/include/uapi/asm/Kbuild
index 13a3d77b4d7b..37bec5d0690f 100644
--- a/arch/nios2/include/uapi/asm/Kbuild
+++ b/arch/nios2/include/uapi/asm/Kbuild
@@ -1,4 +1,3 @@
-# UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += auxvec.h
diff --git a/arch/openrisc/include/uapi/asm/Kbuild b/arch/openrisc/include/uapi/asm/Kbuild
index 130c16ccba0a..bb850fa820c1 100644
--- a/arch/openrisc/include/uapi/asm/Kbuild
+++ b/arch/openrisc/include/uapi/asm/Kbuild
@@ -1,4 +1,3 @@
-# UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += auxvec.h
diff --git a/arch/parisc/include/uapi/asm/Kbuild b/arch/parisc/include/uapi/asm/Kbuild
index d31b4261cafc..99c0c73ded6d 100644
--- a/arch/parisc/include/uapi/asm/Kbuild
+++ b/arch/parisc/include/uapi/asm/Kbuild
@@ -1,4 +1,3 @@
-# UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
 generated-y += unistd_32.h
diff --git a/arch/powerpc/include/uapi/asm/Kbuild b/arch/powerpc/include/uapi/asm/Kbuild
index 8ab8ba1b71bc..6e7574ee7b9a 100644
--- a/arch/powerpc/include/uapi/asm/Kbuild
+++ b/arch/powerpc/include/uapi/asm/Kbuild
@@ -1,4 +1,3 @@
-# UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
 generated-y += unistd_32.h
diff --git a/arch/riscv/include/uapi/asm/Kbuild b/arch/riscv/include/uapi/asm/Kbuild
index 5511b9918131..5945a59bd523 100644
--- a/arch/riscv/include/uapi/asm/Kbuild
+++ b/arch/riscv/include/uapi/asm/Kbuild
@@ -1,4 +1,3 @@
-# UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += setup.h
diff --git a/arch/s390/include/uapi/asm/Kbuild b/arch/s390/include/uapi/asm/Kbuild
index dc38a90cf091..89b45823e2ce 100644
--- a/arch/s390/include/uapi/asm/Kbuild
+++ b/arch/s390/include/uapi/asm/Kbuild
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
-# UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
 generated-y += unistd_32.h
diff --git a/arch/sh/include/uapi/asm/Kbuild b/arch/sh/include/uapi/asm/Kbuild
index dcb93543f55d..7ef099eded36 100644
--- a/arch/sh/include/uapi/asm/Kbuild
+++ b/arch/sh/include/uapi/asm/Kbuild
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
-# UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
 generated-y += unistd_32.h
diff --git a/arch/sparc/include/uapi/asm/Kbuild b/arch/sparc/include/uapi/asm/Kbuild
index ae72977287e3..7064bfbd947a 100644
--- a/arch/sparc/include/uapi/asm/Kbuild
+++ b/arch/sparc/include/uapi/asm/Kbuild
@@ -1,4 +1,3 @@
-# UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
 generated-y += unistd_32.h
diff --git a/arch/unicore32/include/uapi/asm/Kbuild b/arch/unicore32/include/uapi/asm/Kbuild
index 8611ef980554..3620f6aa9dcc 100644
--- a/arch/unicore32/include/uapi/asm/Kbuild
+++ b/arch/unicore32/include/uapi/asm/Kbuild
@@ -1,4 +1,3 @@
-# UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += auxvec.h
diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild
index 322681622d1e..d15343a57da5 100644
--- a/arch/x86/include/uapi/asm/Kbuild
+++ b/arch/x86/include/uapi/asm/Kbuild
@@ -1,4 +1,3 @@
-# UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += bpf_perf_event.h
diff --git a/arch/xtensa/include/uapi/asm/Kbuild b/arch/xtensa/include/uapi/asm/Kbuild
index f95cad300369..3ae1bbda803a 100644
--- a/arch/xtensa/include/uapi/asm/Kbuild
+++ b/arch/xtensa/include/uapi/asm/Kbuild
@@ -1,4 +1,3 @@
-# UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
 generated-y += unistd_32.h
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index ca2787d9bf0f..5f24b50c9e88 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -1,5 +1,3 @@
-# UAPI Header export list
-
 ifeq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/uapi/asm/a.out.h),)
 no-export-headers += a.out.h
 endif
-- 
cgit v1.2.3-59-g8ed1b


From 8094c3ceb21ad93896fd4d238e8ba41911932eaf Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 6 Jan 2019 08:36:21 -0500
Subject: fscrypt: add Adiantum support

Add support for the Adiantum encryption mode to fscrypt.  Adiantum is a
tweakable, length-preserving encryption mode with security provably
reducible to that of XChaCha12 and AES-256, subject to a security bound.
It's also a true wide-block mode, unlike XTS.  See the paper
"Adiantum: length-preserving encryption for entry-level processors"
(https://eprint.iacr.org/2018/720.pdf) for more details.  Also see
commit 059c2a4d8e16 ("crypto: adiantum - add Adiantum support").

On sufficiently long messages, Adiantum's bottlenecks are XChaCha12 and
the NH hash function.  These algorithms are fast even on processors
without dedicated crypto instructions.  Adiantum makes it feasible to
enable storage encryption on low-end mobile devices that lack AES
instructions; currently such devices are unencrypted.  On ARM Cortex-A7,
on 4096-byte messages Adiantum encryption is about 4 times faster than
AES-256-XTS encryption; decryption is about 5 times faster.

In fscrypt, Adiantum is suitable for encrypting both file contents and
names.  With filenames, it fixes a known weakness: when two filenames in
a directory share a common prefix of >= 16 bytes, with CTS-CBC their
encrypted filenames share a common prefix too, leaking information.
Adiantum does not have this problem.

Since Adiantum also accepts long tweaks (IVs), it's also safe to use the
master key directly for Adiantum encryption rather than deriving
per-file keys, provided that the per-file nonce is included in the IVs
and the master key isn't used for any other encryption mode.  This
configuration saves memory and improves performance.  A new fscrypt
policy flag is added to allow users to opt-in to this configuration.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 Documentation/filesystems/fscrypt.rst | 179 +++++++++--------
 fs/crypto/crypto.c                    |  28 +--
 fs/crypto/fname.c                     |  22 ++-
 fs/crypto/fscrypt_private.h           |  67 ++++++-
 fs/crypto/keyinfo.c                   | 351 ++++++++++++++++++++++++++--------
 fs/crypto/policy.c                    |   5 +-
 include/uapi/linux/fs.h               |   4 +-
 7 files changed, 468 insertions(+), 188 deletions(-)

(limited to 'include/uapi')

diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst
index cfbc18f0d9c9..3a7b60521b94 100644
--- a/Documentation/filesystems/fscrypt.rst
+++ b/Documentation/filesystems/fscrypt.rst
@@ -132,47 +132,28 @@ designed for this purpose be used, such as scrypt, PBKDF2, or Argon2.
 Per-file keys
 -------------
 
-Master keys are not used to encrypt file contents or names directly.
-Instead, a unique key is derived for each encrypted file, including
-each regular file, directory, and symbolic link.  This has several
-advantages:
-
-- In cryptosystems, the same key material should never be used for
-  different purposes.  Using the master key as both an XTS key for
-  contents encryption and as a CTS-CBC key for filenames encryption
-  would violate this rule.
-- Per-file keys simplify the choice of IVs (Initialization Vectors)
-  for contents encryption.  Without per-file keys, to ensure IV
-  uniqueness both the inode and logical block number would need to be
-  encoded in the IVs.  This would make it impossible to renumber
-  inodes, which e.g. ``resize2fs`` can do when resizing an ext4
-  filesystem.  With per-file keys, it is sufficient to encode just the
-  logical block number in the IVs.
-- Per-file keys strengthen the encryption of filenames, where IVs are
-  reused out of necessity.  With a unique key per directory, IV reuse
-  is limited to within a single directory.
-- Per-file keys allow individual files to be securely erased simply by
-  securely erasing their keys.  (Not yet implemented.)
-
-A KDF (Key Derivation Function) is used to derive per-file keys from
-the master key.  This is done instead of wrapping a randomly-generated
-key for each file because it reduces the size of the encryption xattr,
-which for some filesystems makes the xattr more likely to fit in-line
-in the filesystem's inode table.  With a KDF, only a 16-byte nonce is
-required --- long enough to make key reuse extremely unlikely.  A
-wrapped key, on the other hand, would need to be up to 64 bytes ---
-the length of an AES-256-XTS key.  Furthermore, currently there is no
-requirement to support unlocking a file with multiple alternative
-master keys or to support rotating master keys.  Instead, the master
-keys may be wrapped in userspace, e.g. as done by the `fscrypt
-<https://github.com/google/fscrypt>`_ tool.
-
-The current KDF encrypts the master key using the 16-byte nonce as an
-AES-128-ECB key.  The output is used as the derived key.  If the
-output is longer than needed, then it is truncated to the needed
-length.  Truncation is the norm for directories and symlinks, since
-those use the CTS-CBC encryption mode which requires a key half as
-long as that required by the XTS encryption mode.
+Since each master key can protect many files, it is necessary to
+"tweak" the encryption of each file so that the same plaintext in two
+files doesn't map to the same ciphertext, or vice versa.  In most
+cases, fscrypt does this by deriving per-file keys.  When a new
+encrypted inode (regular file, directory, or symlink) is created,
+fscrypt randomly generates a 16-byte nonce and stores it in the
+inode's encryption xattr.  Then, it uses a KDF (Key Derivation
+Function) to derive the file's key from the master key and nonce.
+
+The Adiantum encryption mode (see `Encryption modes and usage`_) is
+special, since it accepts longer IVs and is suitable for both contents
+and filenames encryption.  For it, a "direct key" option is offered
+where the file's nonce is included in the IVs and the master key is
+used for encryption directly.  This improves performance; however,
+users must not use the same master key for any other encryption mode.
+
+Below, the KDF and design considerations are described in more detail.
+
+The current KDF works by encrypting the master key with AES-128-ECB,
+using the file's nonce as the AES key.  The output is used as the
+derived key.  If the output is longer than needed, then it is
+truncated to the needed length.
 
 Note: this KDF meets the primary security requirement, which is to
 produce unique derived keys that preserve the entropy of the master
@@ -181,6 +162,20 @@ However, it is nonstandard and has some problems such as being
 reversible, so it is generally considered to be a mistake!  It may be
 replaced with HKDF or another more standard KDF in the future.
 
+Key derivation was chosen over key wrapping because wrapped keys would
+require larger xattrs which would be less likely to fit in-line in the
+filesystem's inode table, and there didn't appear to be any
+significant advantages to key wrapping.  In particular, currently
+there is no requirement to support unlocking a file with multiple
+alternative master keys or to support rotating master keys.  Instead,
+the master keys may be wrapped in userspace, e.g. as is done by the
+`fscrypt <https://github.com/google/fscrypt>`_ tool.
+
+Including the inode number in the IVs was considered.  However, it was
+rejected as it would have prevented ext4 filesystems from being
+resized, and by itself still wouldn't have been sufficient to prevent
+the same key from being directly reused for both XTS and CTS-CBC.
+
 Encryption modes and usage
 ==========================
 
@@ -191,54 +186,80 @@ Currently, the following pairs of encryption modes are supported:
 
 - AES-256-XTS for contents and AES-256-CTS-CBC for filenames
 - AES-128-CBC for contents and AES-128-CTS-CBC for filenames
+- Adiantum for both contents and filenames
+
+If unsure, you should use the (AES-256-XTS, AES-256-CTS-CBC) pair.
 
-It is strongly recommended to use AES-256-XTS for contents encryption.
 AES-128-CBC was added only for low-powered embedded devices with
 crypto accelerators such as CAAM or CESA that do not support XTS.
 
+Adiantum is a (primarily) stream cipher-based mode that is fast even
+on CPUs without dedicated crypto instructions.  It's also a true
+wide-block mode, unlike XTS.  It can also eliminate the need to derive
+per-file keys.  However, it depends on the security of two primitives,
+XChaCha12 and AES-256, rather than just one.  See the paper
+"Adiantum: length-preserving encryption for entry-level processors"
+(https://eprint.iacr.org/2018/720.pdf) for more details.  To use
+Adiantum, CONFIG_CRYPTO_ADIANTUM must be enabled.  Also, fast
+implementations of ChaCha and NHPoly1305 should be enabled, e.g.
+CONFIG_CRYPTO_CHACHA20_NEON and CONFIG_CRYPTO_NHPOLY1305_NEON for ARM.
+
 New encryption modes can be added relatively easily, without changes
 to individual filesystems.  However, authenticated encryption (AE)
 modes are not currently supported because of the difficulty of dealing
 with ciphertext expansion.
 
+Contents encryption
+-------------------
+
 For file contents, each filesystem block is encrypted independently.
 Currently, only the case where the filesystem block size is equal to
-the system's page size (usually 4096 bytes) is supported.  With the
-XTS mode of operation (recommended), the logical block number within
-the file is used as the IV.  With the CBC mode of operation (not
-recommended), ESSIV is used; specifically, the IV for CBC is the
-logical block number encrypted with AES-256, where the AES-256 key is
-the SHA-256 hash of the inode's data encryption key.
-
-For filenames, the full filename is encrypted at once.  Because of the
-requirements to retain support for efficient directory lookups and
-filenames of up to 255 bytes, a constant initialization vector (IV) is
-used.  However, each encrypted directory uses a unique key, which
-limits IV reuse to within a single directory.  Note that IV reuse in
-the context of CTS-CBC encryption means that when the original
-filenames share a common prefix at least as long as the cipher block
-size (16 bytes for AES), the corresponding encrypted filenames will
-also share a common prefix.  This is undesirable; it may be fixed in
-the future by switching to an encryption mode that is a strong
-pseudorandom permutation on arbitrary-length messages, e.g. the HEH
-(Hash-Encrypt-Hash) mode.
-
-Since filenames are encrypted with the CTS-CBC mode of operation, the
-plaintext and ciphertext filenames need not be multiples of the AES
-block size, i.e. 16 bytes.  However, the minimum size that can be
-encrypted is 16 bytes, so shorter filenames are NUL-padded to 16 bytes
-before being encrypted.  In addition, to reduce leakage of filename
-lengths via their ciphertexts, all filenames are NUL-padded to the
-next 4, 8, 16, or 32-byte boundary (configurable).  32 is recommended
-since this provides the best confidentiality, at the cost of making
-directory entries consume slightly more space.  Note that since NUL
-(``\0``) is not otherwise a valid character in filenames, the padding
-will never produce duplicate plaintexts.
+the system's page size (usually 4096 bytes) is supported.
+
+Each block's IV is set to the logical block number within the file as
+a little endian number, except that:
+
+- With CBC mode encryption, ESSIV is also used.  Specifically, each IV
+  is encrypted with AES-256 where the AES-256 key is the SHA-256 hash
+  of the file's data encryption key.
+
+- In the "direct key" configuration (FS_POLICY_FLAG_DIRECT_KEY set in
+  the fscrypt_policy), the file's nonce is also appended to the IV.
+  Currently this is only allowed with the Adiantum encryption mode.
+
+Filenames encryption
+--------------------
+
+For filenames, each full filename is encrypted at once.  Because of
+the requirements to retain support for efficient directory lookups and
+filenames of up to 255 bytes, the same IV is used for every filename
+in a directory.
+
+However, each encrypted directory still uses a unique key; or
+alternatively (for the "direct key" configuration) has the file's
+nonce included in the IVs.  Thus, IV reuse is limited to within a
+single directory.
+
+With CTS-CBC, the IV reuse means that when the plaintext filenames
+share a common prefix at least as long as the cipher block size (16
+bytes for AES), the corresponding encrypted filenames will also share
+a common prefix.  This is undesirable.  Adiantum does not have this
+weakness, as it is a wide-block encryption mode.
+
+All supported filenames encryption modes accept any plaintext length
+>= 16 bytes; cipher block alignment is not required.  However,
+filenames shorter than 16 bytes are NUL-padded to 16 bytes before
+being encrypted.  In addition, to reduce leakage of filename lengths
+via their ciphertexts, all filenames are NUL-padded to the next 4, 8,
+16, or 32-byte boundary (configurable).  32 is recommended since this
+provides the best confidentiality, at the cost of making directory
+entries consume slightly more space.  Note that since NUL (``\0``) is
+not otherwise a valid character in filenames, the padding will never
+produce duplicate plaintexts.
 
 Symbolic link targets are considered a type of filename and are
-encrypted in the same way as filenames in directory entries.  Each
-symlink also uses a unique key; hence, the hardcoded IV is not a
-problem for symlinks.
+encrypted in the same way as filenames in directory entries, except
+that IV reuse is not a problem as each symlink has its own inode.
 
 User API
 ========
@@ -272,9 +293,13 @@ This structure must be initialized as follows:
   and FS_ENCRYPTION_MODE_AES_256_CTS (4) for
   ``filenames_encryption_mode``.
 
-- ``flags`` must be set to a value from ``<linux/fs.h>`` which
+- ``flags`` must contain a value from ``<linux/fs.h>`` which
   identifies the amount of NUL-padding to use when encrypting
   filenames.  If unsure, use FS_POLICY_FLAGS_PAD_32 (0x3).
+  In addition, if the chosen encryption modes are both
+  FS_ENCRYPTION_MODE_ADIANTUM, this can contain
+  FS_POLICY_FLAG_DIRECT_KEY to specify that the master key should be
+  used directly, without key derivation.
 
 - ``master_key_descriptor`` specifies how to find the master key in
   the keyring; see `Adding keys`_.  It is up to userspace to choose a
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 0f46cf550907..4dc788e3bc96 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -133,15 +133,25 @@ struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags)
 }
 EXPORT_SYMBOL(fscrypt_get_ctx);
 
+void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
+			 const struct fscrypt_info *ci)
+{
+	memset(iv, 0, ci->ci_mode->ivsize);
+	iv->lblk_num = cpu_to_le64(lblk_num);
+
+	if (ci->ci_flags & FS_POLICY_FLAG_DIRECT_KEY)
+		memcpy(iv->nonce, ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE);
+
+	if (ci->ci_essiv_tfm != NULL)
+		crypto_cipher_encrypt_one(ci->ci_essiv_tfm, iv->raw, iv->raw);
+}
+
 int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw,
 			   u64 lblk_num, struct page *src_page,
 			   struct page *dest_page, unsigned int len,
 			   unsigned int offs, gfp_t gfp_flags)
 {
-	struct {
-		__le64 index;
-		u8 padding[FS_IV_SIZE - sizeof(__le64)];
-	} iv;
+	union fscrypt_iv iv;
 	struct skcipher_request *req = NULL;
 	DECLARE_CRYPTO_WAIT(wait);
 	struct scatterlist dst, src;
@@ -151,15 +161,7 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw,
 
 	BUG_ON(len == 0);
 
-	BUILD_BUG_ON(sizeof(iv) != FS_IV_SIZE);
-	BUILD_BUG_ON(AES_BLOCK_SIZE != FS_IV_SIZE);
-	iv.index = cpu_to_le64(lblk_num);
-	memset(iv.padding, 0, sizeof(iv.padding));
-
-	if (ci->ci_essiv_tfm != NULL) {
-		crypto_cipher_encrypt_one(ci->ci_essiv_tfm, (u8 *)&iv,
-					  (u8 *)&iv);
-	}
+	fscrypt_generate_iv(&iv, lblk_num, ci);
 
 	req = skcipher_request_alloc(tfm, gfp_flags);
 	if (!req)
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index d7a0f682ca12..7ff40a73dbec 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -40,10 +40,11 @@ int fname_encrypt(struct inode *inode, const struct qstr *iname,
 {
 	struct skcipher_request *req = NULL;
 	DECLARE_CRYPTO_WAIT(wait);
-	struct crypto_skcipher *tfm = inode->i_crypt_info->ci_ctfm;
-	int res = 0;
-	char iv[FS_CRYPTO_BLOCK_SIZE];
+	struct fscrypt_info *ci = inode->i_crypt_info;
+	struct crypto_skcipher *tfm = ci->ci_ctfm;
+	union fscrypt_iv iv;
 	struct scatterlist sg;
+	int res;
 
 	/*
 	 * Copy the filename to the output buffer for encrypting in-place and
@@ -55,7 +56,7 @@ int fname_encrypt(struct inode *inode, const struct qstr *iname,
 	memset(out + iname->len, 0, olen - iname->len);
 
 	/* Initialize the IV */
-	memset(iv, 0, FS_CRYPTO_BLOCK_SIZE);
+	fscrypt_generate_iv(&iv, 0, ci);
 
 	/* Set up the encryption request */
 	req = skcipher_request_alloc(tfm, GFP_NOFS);
@@ -65,7 +66,7 @@ int fname_encrypt(struct inode *inode, const struct qstr *iname,
 			CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
 			crypto_req_done, &wait);
 	sg_init_one(&sg, out, olen);
-	skcipher_request_set_crypt(req, &sg, &sg, olen, iv);
+	skcipher_request_set_crypt(req, &sg, &sg, olen, &iv);
 
 	/* Do the encryption */
 	res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
@@ -94,9 +95,10 @@ static int fname_decrypt(struct inode *inode,
 	struct skcipher_request *req = NULL;
 	DECLARE_CRYPTO_WAIT(wait);
 	struct scatterlist src_sg, dst_sg;
-	struct crypto_skcipher *tfm = inode->i_crypt_info->ci_ctfm;
-	int res = 0;
-	char iv[FS_CRYPTO_BLOCK_SIZE];
+	struct fscrypt_info *ci = inode->i_crypt_info;
+	struct crypto_skcipher *tfm = ci->ci_ctfm;
+	union fscrypt_iv iv;
+	int res;
 
 	/* Allocate request */
 	req = skcipher_request_alloc(tfm, GFP_NOFS);
@@ -107,12 +109,12 @@ static int fname_decrypt(struct inode *inode,
 		crypto_req_done, &wait);
 
 	/* Initialize IV */
-	memset(iv, 0, FS_CRYPTO_BLOCK_SIZE);
+	fscrypt_generate_iv(&iv, 0, ci);
 
 	/* Create decryption request */
 	sg_init_one(&src_sg, iname->name, iname->len);
 	sg_init_one(&dst_sg, oname->name, oname->len);
-	skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv);
+	skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, &iv);
 	res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait);
 	skcipher_request_free(req);
 	if (res < 0) {
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 79debfc9cef9..7424f851eb5c 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -17,7 +17,6 @@
 #include <crypto/hash.h>
 
 /* Encryption parameters */
-#define FS_IV_SIZE			16
 #define FS_KEY_DERIVATION_NONCE_SIZE	16
 
 /**
@@ -52,16 +51,42 @@ struct fscrypt_symlink_data {
 } __packed;
 
 /*
- * A pointer to this structure is stored in the file system's in-core
- * representation of an inode.
+ * fscrypt_info - the "encryption key" for an inode
+ *
+ * When an encrypted file's key is made available, an instance of this struct is
+ * allocated and stored in ->i_crypt_info.  Once created, it remains until the
+ * inode is evicted.
  */
 struct fscrypt_info {
+
+	/* The actual crypto transform used for encryption and decryption */
+	struct crypto_skcipher *ci_ctfm;
+
+	/*
+	 * Cipher for ESSIV IV generation.  Only set for CBC contents
+	 * encryption, otherwise is NULL.
+	 */
+	struct crypto_cipher *ci_essiv_tfm;
+
+	/*
+	 * Encryption mode used for this inode.  It corresponds to either
+	 * ci_data_mode or ci_filename_mode, depending on the inode type.
+	 */
+	struct fscrypt_mode *ci_mode;
+
+	/*
+	 * If non-NULL, then this inode uses a master key directly rather than a
+	 * derived key, and ci_ctfm will equal ci_master_key->mk_ctfm.
+	 * Otherwise, this inode uses a derived key.
+	 */
+	struct fscrypt_master_key *ci_master_key;
+
+	/* fields from the fscrypt_context */
 	u8 ci_data_mode;
 	u8 ci_filename_mode;
 	u8 ci_flags;
-	struct crypto_skcipher *ci_ctfm;
-	struct crypto_cipher *ci_essiv_tfm;
-	u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE];
+	u8 ci_master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
+	u8 ci_nonce[FS_KEY_DERIVATION_NONCE_SIZE];
 };
 
 typedef enum {
@@ -83,6 +108,10 @@ static inline bool fscrypt_valid_enc_modes(u32 contents_mode,
 	    filenames_mode == FS_ENCRYPTION_MODE_AES_256_CTS)
 		return true;
 
+	if (contents_mode == FS_ENCRYPTION_MODE_ADIANTUM &&
+	    filenames_mode == FS_ENCRYPTION_MODE_ADIANTUM)
+		return true;
+
 	return false;
 }
 
@@ -107,6 +136,22 @@ fscrypt_msg(struct super_block *sb, const char *level, const char *fmt, ...);
 #define fscrypt_err(sb, fmt, ...)		\
 	fscrypt_msg(sb, KERN_ERR, fmt, ##__VA_ARGS__)
 
+#define FSCRYPT_MAX_IV_SIZE	32
+
+union fscrypt_iv {
+	struct {
+		/* logical block number within the file */
+		__le64 lblk_num;
+
+		/* per-file nonce; only set in DIRECT_KEY mode */
+		u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE];
+	};
+	u8 raw[FSCRYPT_MAX_IV_SIZE];
+};
+
+void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
+			 const struct fscrypt_info *ci);
+
 /* fname.c */
 extern int fname_encrypt(struct inode *inode, const struct qstr *iname,
 			 u8 *out, unsigned int olen);
@@ -115,6 +160,16 @@ extern bool fscrypt_fname_encrypted_size(const struct inode *inode,
 					 u32 *encrypted_len_ret);
 
 /* keyinfo.c */
+
+struct fscrypt_mode {
+	const char *friendly_name;
+	const char *cipher_str;
+	int keysize;
+	int ivsize;
+	bool logged_impl_name;
+	bool needs_essiv;
+};
+
 extern void __exit fscrypt_essiv_cleanup(void);
 
 #endif /* _FSCRYPT_PRIVATE_H */
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index 7874c9bb2fc5..1e11a683f63d 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -10,15 +10,21 @@
  */
 
 #include <keys/user-type.h>
+#include <linux/hashtable.h>
 #include <linux/scatterlist.h>
 #include <linux/ratelimit.h>
 #include <crypto/aes.h>
+#include <crypto/algapi.h>
 #include <crypto/sha.h>
 #include <crypto/skcipher.h>
 #include "fscrypt_private.h"
 
 static struct crypto_shash *essiv_hash_tfm;
 
+/* Table of keys referenced by FS_POLICY_FLAG_DIRECT_KEY policies */
+static DEFINE_HASHTABLE(fscrypt_master_keys, 6); /* 6 bits = 64 buckets */
+static DEFINE_SPINLOCK(fscrypt_master_keys_lock);
+
 /*
  * Key derivation function.  This generates the derived key by encrypting the
  * master key with AES-128-ECB using the inode's nonce as the AES key.
@@ -123,56 +129,37 @@ invalid:
 	return ERR_PTR(-ENOKEY);
 }
 
-/* Find the master key, then derive the inode's actual encryption key */
-static int find_and_derive_key(const struct inode *inode,
-			       const struct fscrypt_context *ctx,
-			       u8 *derived_key, unsigned int derived_keysize)
-{
-	struct key *key;
-	const struct fscrypt_key *payload;
-	int err;
-
-	key = find_and_lock_process_key(FS_KEY_DESC_PREFIX,
-					ctx->master_key_descriptor,
-					derived_keysize, &payload);
-	if (key == ERR_PTR(-ENOKEY) && inode->i_sb->s_cop->key_prefix) {
-		key = find_and_lock_process_key(inode->i_sb->s_cop->key_prefix,
-						ctx->master_key_descriptor,
-						derived_keysize, &payload);
-	}
-	if (IS_ERR(key))
-		return PTR_ERR(key);
-	err = derive_key_aes(payload->raw, ctx, derived_key, derived_keysize);
-	up_read(&key->sem);
-	key_put(key);
-	return err;
-}
-
-static struct fscrypt_mode {
-	const char *friendly_name;
-	const char *cipher_str;
-	int keysize;
-	bool logged_impl_name;
-} available_modes[] = {
+static struct fscrypt_mode available_modes[] = {
 	[FS_ENCRYPTION_MODE_AES_256_XTS] = {
 		.friendly_name = "AES-256-XTS",
 		.cipher_str = "xts(aes)",
 		.keysize = 64,
+		.ivsize = 16,
 	},
 	[FS_ENCRYPTION_MODE_AES_256_CTS] = {
 		.friendly_name = "AES-256-CTS-CBC",
 		.cipher_str = "cts(cbc(aes))",
 		.keysize = 32,
+		.ivsize = 16,
 	},
 	[FS_ENCRYPTION_MODE_AES_128_CBC] = {
 		.friendly_name = "AES-128-CBC",
 		.cipher_str = "cbc(aes)",
 		.keysize = 16,
+		.ivsize = 16,
+		.needs_essiv = true,
 	},
 	[FS_ENCRYPTION_MODE_AES_128_CTS] = {
 		.friendly_name = "AES-128-CTS-CBC",
 		.cipher_str = "cts(cbc(aes))",
 		.keysize = 16,
+		.ivsize = 16,
+	},
+	[FS_ENCRYPTION_MODE_ADIANTUM] = {
+		.friendly_name = "Adiantum",
+		.cipher_str = "adiantum(xchacha12,aes)",
+		.keysize = 32,
+		.ivsize = 32,
 	},
 };
 
@@ -198,14 +185,196 @@ select_encryption_mode(const struct fscrypt_info *ci, const struct inode *inode)
 	return ERR_PTR(-EINVAL);
 }
 
-static void put_crypt_info(struct fscrypt_info *ci)
+/* Find the master key, then derive the inode's actual encryption key */
+static int find_and_derive_key(const struct inode *inode,
+			       const struct fscrypt_context *ctx,
+			       u8 *derived_key, const struct fscrypt_mode *mode)
 {
-	if (!ci)
+	struct key *key;
+	const struct fscrypt_key *payload;
+	int err;
+
+	key = find_and_lock_process_key(FS_KEY_DESC_PREFIX,
+					ctx->master_key_descriptor,
+					mode->keysize, &payload);
+	if (key == ERR_PTR(-ENOKEY) && inode->i_sb->s_cop->key_prefix) {
+		key = find_and_lock_process_key(inode->i_sb->s_cop->key_prefix,
+						ctx->master_key_descriptor,
+						mode->keysize, &payload);
+	}
+	if (IS_ERR(key))
+		return PTR_ERR(key);
+
+	if (ctx->flags & FS_POLICY_FLAG_DIRECT_KEY) {
+		if (mode->ivsize < offsetofend(union fscrypt_iv, nonce)) {
+			fscrypt_warn(inode->i_sb,
+				     "direct key mode not allowed with %s",
+				     mode->friendly_name);
+			err = -EINVAL;
+		} else if (ctx->contents_encryption_mode !=
+			   ctx->filenames_encryption_mode) {
+			fscrypt_warn(inode->i_sb,
+				     "direct key mode not allowed with different contents and filenames modes");
+			err = -EINVAL;
+		} else {
+			memcpy(derived_key, payload->raw, mode->keysize);
+			err = 0;
+		}
+	} else {
+		err = derive_key_aes(payload->raw, ctx, derived_key,
+				     mode->keysize);
+	}
+	up_read(&key->sem);
+	key_put(key);
+	return err;
+}
+
+/* Allocate and key a symmetric cipher object for the given encryption mode */
+static struct crypto_skcipher *
+allocate_skcipher_for_mode(struct fscrypt_mode *mode, const u8 *raw_key,
+			   const struct inode *inode)
+{
+	struct crypto_skcipher *tfm;
+	int err;
+
+	tfm = crypto_alloc_skcipher(mode->cipher_str, 0, 0);
+	if (IS_ERR(tfm)) {
+		fscrypt_warn(inode->i_sb,
+			     "error allocating '%s' transform for inode %lu: %ld",
+			     mode->cipher_str, inode->i_ino, PTR_ERR(tfm));
+		return tfm;
+	}
+	if (unlikely(!mode->logged_impl_name)) {
+		/*
+		 * fscrypt performance can vary greatly depending on which
+		 * crypto algorithm implementation is used.  Help people debug
+		 * performance problems by logging the ->cra_driver_name the
+		 * first time a mode is used.  Note that multiple threads can
+		 * race here, but it doesn't really matter.
+		 */
+		mode->logged_impl_name = true;
+		pr_info("fscrypt: %s using implementation \"%s\"\n",
+			mode->friendly_name,
+			crypto_skcipher_alg(tfm)->base.cra_driver_name);
+	}
+	crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+	err = crypto_skcipher_setkey(tfm, raw_key, mode->keysize);
+	if (err)
+		goto err_free_tfm;
+
+	return tfm;
+
+err_free_tfm:
+	crypto_free_skcipher(tfm);
+	return ERR_PTR(err);
+}
+
+/* Master key referenced by FS_POLICY_FLAG_DIRECT_KEY policy */
+struct fscrypt_master_key {
+	struct hlist_node mk_node;
+	refcount_t mk_refcount;
+	const struct fscrypt_mode *mk_mode;
+	struct crypto_skcipher *mk_ctfm;
+	u8 mk_descriptor[FS_KEY_DESCRIPTOR_SIZE];
+	u8 mk_raw[FS_MAX_KEY_SIZE];
+};
+
+static void free_master_key(struct fscrypt_master_key *mk)
+{
+	if (mk) {
+		crypto_free_skcipher(mk->mk_ctfm);
+		kzfree(mk);
+	}
+}
+
+static void put_master_key(struct fscrypt_master_key *mk)
+{
+	if (!refcount_dec_and_lock(&mk->mk_refcount, &fscrypt_master_keys_lock))
 		return;
+	hash_del(&mk->mk_node);
+	spin_unlock(&fscrypt_master_keys_lock);
 
-	crypto_free_skcipher(ci->ci_ctfm);
-	crypto_free_cipher(ci->ci_essiv_tfm);
-	kmem_cache_free(fscrypt_info_cachep, ci);
+	free_master_key(mk);
+}
+
+/*
+ * Find/insert the given master key into the fscrypt_master_keys table.  If
+ * found, it is returned with elevated refcount, and 'to_insert' is freed if
+ * non-NULL.  If not found, 'to_insert' is inserted and returned if it's
+ * non-NULL; otherwise NULL is returned.
+ */
+static struct fscrypt_master_key *
+find_or_insert_master_key(struct fscrypt_master_key *to_insert,
+			  const u8 *raw_key, const struct fscrypt_mode *mode,
+			  const struct fscrypt_info *ci)
+{
+	unsigned long hash_key;
+	struct fscrypt_master_key *mk;
+
+	/*
+	 * Careful: to avoid potentially leaking secret key bytes via timing
+	 * information, we must key the hash table by descriptor rather than by
+	 * raw key, and use crypto_memneq() when comparing raw keys.
+	 */
+
+	BUILD_BUG_ON(sizeof(hash_key) > FS_KEY_DESCRIPTOR_SIZE);
+	memcpy(&hash_key, ci->ci_master_key_descriptor, sizeof(hash_key));
+
+	spin_lock(&fscrypt_master_keys_lock);
+	hash_for_each_possible(fscrypt_master_keys, mk, mk_node, hash_key) {
+		if (memcmp(ci->ci_master_key_descriptor, mk->mk_descriptor,
+			   FS_KEY_DESCRIPTOR_SIZE) != 0)
+			continue;
+		if (mode != mk->mk_mode)
+			continue;
+		if (crypto_memneq(raw_key, mk->mk_raw, mode->keysize))
+			continue;
+		/* using existing tfm with same (descriptor, mode, raw_key) */
+		refcount_inc(&mk->mk_refcount);
+		spin_unlock(&fscrypt_master_keys_lock);
+		free_master_key(to_insert);
+		return mk;
+	}
+	if (to_insert)
+		hash_add(fscrypt_master_keys, &to_insert->mk_node, hash_key);
+	spin_unlock(&fscrypt_master_keys_lock);
+	return to_insert;
+}
+
+/* Prepare to encrypt directly using the master key in the given mode */
+static struct fscrypt_master_key *
+fscrypt_get_master_key(const struct fscrypt_info *ci, struct fscrypt_mode *mode,
+		       const u8 *raw_key, const struct inode *inode)
+{
+	struct fscrypt_master_key *mk;
+	int err;
+
+	/* Is there already a tfm for this key? */
+	mk = find_or_insert_master_key(NULL, raw_key, mode, ci);
+	if (mk)
+		return mk;
+
+	/* Nope, allocate one. */
+	mk = kzalloc(sizeof(*mk), GFP_NOFS);
+	if (!mk)
+		return ERR_PTR(-ENOMEM);
+	refcount_set(&mk->mk_refcount, 1);
+	mk->mk_mode = mode;
+	mk->mk_ctfm = allocate_skcipher_for_mode(mode, raw_key, inode);
+	if (IS_ERR(mk->mk_ctfm)) {
+		err = PTR_ERR(mk->mk_ctfm);
+		mk->mk_ctfm = NULL;
+		goto err_free_mk;
+	}
+	memcpy(mk->mk_descriptor, ci->ci_master_key_descriptor,
+	       FS_KEY_DESCRIPTOR_SIZE);
+	memcpy(mk->mk_raw, raw_key, mode->keysize);
+
+	return find_or_insert_master_key(mk, raw_key, mode, ci);
+
+err_free_mk:
+	free_master_key(mk);
+	return ERR_PTR(err);
 }
 
 static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt)
@@ -275,11 +444,67 @@ void __exit fscrypt_essiv_cleanup(void)
 	crypto_free_shash(essiv_hash_tfm);
 }
 
+/*
+ * Given the encryption mode and key (normally the derived key, but for
+ * FS_POLICY_FLAG_DIRECT_KEY mode it's the master key), set up the inode's
+ * symmetric cipher transform object(s).
+ */
+static int setup_crypto_transform(struct fscrypt_info *ci,
+				  struct fscrypt_mode *mode,
+				  const u8 *raw_key, const struct inode *inode)
+{
+	struct fscrypt_master_key *mk;
+	struct crypto_skcipher *ctfm;
+	int err;
+
+	if (ci->ci_flags & FS_POLICY_FLAG_DIRECT_KEY) {
+		mk = fscrypt_get_master_key(ci, mode, raw_key, inode);
+		if (IS_ERR(mk))
+			return PTR_ERR(mk);
+		ctfm = mk->mk_ctfm;
+	} else {
+		mk = NULL;
+		ctfm = allocate_skcipher_for_mode(mode, raw_key, inode);
+		if (IS_ERR(ctfm))
+			return PTR_ERR(ctfm);
+	}
+	ci->ci_master_key = mk;
+	ci->ci_ctfm = ctfm;
+
+	if (mode->needs_essiv) {
+		/* ESSIV implies 16-byte IVs which implies !DIRECT_KEY */
+		WARN_ON(mode->ivsize != AES_BLOCK_SIZE);
+		WARN_ON(ci->ci_flags & FS_POLICY_FLAG_DIRECT_KEY);
+
+		err = init_essiv_generator(ci, raw_key, mode->keysize);
+		if (err) {
+			fscrypt_warn(inode->i_sb,
+				     "error initializing ESSIV generator for inode %lu: %d",
+				     inode->i_ino, err);
+			return err;
+		}
+	}
+	return 0;
+}
+
+static void put_crypt_info(struct fscrypt_info *ci)
+{
+	if (!ci)
+		return;
+
+	if (ci->ci_master_key) {
+		put_master_key(ci->ci_master_key);
+	} else {
+		crypto_free_skcipher(ci->ci_ctfm);
+		crypto_free_cipher(ci->ci_essiv_tfm);
+	}
+	kmem_cache_free(fscrypt_info_cachep, ci);
+}
+
 int fscrypt_get_encryption_info(struct inode *inode)
 {
 	struct fscrypt_info *crypt_info;
 	struct fscrypt_context ctx;
-	struct crypto_skcipher *ctfm;
 	struct fscrypt_mode *mode;
 	u8 *raw_key = NULL;
 	int res;
@@ -312,74 +537,42 @@ int fscrypt_get_encryption_info(struct inode *inode)
 	if (ctx.flags & ~FS_POLICY_FLAGS_VALID)
 		return -EINVAL;
 
-	crypt_info = kmem_cache_alloc(fscrypt_info_cachep, GFP_NOFS);
+	crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_NOFS);
 	if (!crypt_info)
 		return -ENOMEM;
 
 	crypt_info->ci_flags = ctx.flags;
 	crypt_info->ci_data_mode = ctx.contents_encryption_mode;
 	crypt_info->ci_filename_mode = ctx.filenames_encryption_mode;
-	crypt_info->ci_ctfm = NULL;
-	crypt_info->ci_essiv_tfm = NULL;
-	memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor,
-				sizeof(crypt_info->ci_master_key));
+	memcpy(crypt_info->ci_master_key_descriptor, ctx.master_key_descriptor,
+	       FS_KEY_DESCRIPTOR_SIZE);
+	memcpy(crypt_info->ci_nonce, ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE);
 
 	mode = select_encryption_mode(crypt_info, inode);
 	if (IS_ERR(mode)) {
 		res = PTR_ERR(mode);
 		goto out;
 	}
+	WARN_ON(mode->ivsize > FSCRYPT_MAX_IV_SIZE);
+	crypt_info->ci_mode = mode;
 
 	/*
-	 * This cannot be a stack buffer because it is passed to the scatterlist
-	 * crypto API as part of key derivation.
+	 * This cannot be a stack buffer because it may be passed to the
+	 * scatterlist crypto API as part of key derivation.
 	 */
 	res = -ENOMEM;
 	raw_key = kmalloc(mode->keysize, GFP_NOFS);
 	if (!raw_key)
 		goto out;
 
-	res = find_and_derive_key(inode, &ctx, raw_key, mode->keysize);
+	res = find_and_derive_key(inode, &ctx, raw_key, mode);
 	if (res)
 		goto out;
 
-	ctfm = crypto_alloc_skcipher(mode->cipher_str, 0, 0);
-	if (IS_ERR(ctfm)) {
-		res = PTR_ERR(ctfm);
-		fscrypt_warn(inode->i_sb,
-			     "error allocating '%s' transform for inode %lu: %d",
-			     mode->cipher_str, inode->i_ino, res);
-		goto out;
-	}
-	if (unlikely(!mode->logged_impl_name)) {
-		/*
-		 * fscrypt performance can vary greatly depending on which
-		 * crypto algorithm implementation is used.  Help people debug
-		 * performance problems by logging the ->cra_driver_name the
-		 * first time a mode is used.  Note that multiple threads can
-		 * race here, but it doesn't really matter.
-		 */
-		mode->logged_impl_name = true;
-		pr_info("fscrypt: %s using implementation \"%s\"\n",
-			mode->friendly_name,
-			crypto_skcipher_alg(ctfm)->base.cra_driver_name);
-	}
-	crypt_info->ci_ctfm = ctfm;
-	crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY);
-	res = crypto_skcipher_setkey(ctfm, raw_key, mode->keysize);
+	res = setup_crypto_transform(crypt_info, mode, raw_key, inode);
 	if (res)
 		goto out;
 
-	if (S_ISREG(inode->i_mode) &&
-	    crypt_info->ci_data_mode == FS_ENCRYPTION_MODE_AES_128_CBC) {
-		res = init_essiv_generator(crypt_info, raw_key, mode->keysize);
-		if (res) {
-			fscrypt_warn(inode->i_sb,
-				     "error initializing ESSIV generator for inode %lu: %d",
-				     inode->i_ino, res);
-			goto out;
-		}
-	}
 	if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) == NULL)
 		crypt_info = NULL;
 out:
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index c6d431a5cce9..f490de921ce8 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -199,7 +199,8 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
 	child_ci = child->i_crypt_info;
 
 	if (parent_ci && child_ci) {
-		return memcmp(parent_ci->ci_master_key, child_ci->ci_master_key,
+		return memcmp(parent_ci->ci_master_key_descriptor,
+			      child_ci->ci_master_key_descriptor,
 			      FS_KEY_DESCRIPTOR_SIZE) == 0 &&
 			(parent_ci->ci_data_mode == child_ci->ci_data_mode) &&
 			(parent_ci->ci_filename_mode ==
@@ -254,7 +255,7 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child,
 	ctx.contents_encryption_mode = ci->ci_data_mode;
 	ctx.filenames_encryption_mode = ci->ci_filename_mode;
 	ctx.flags = ci->ci_flags;
-	memcpy(ctx.master_key_descriptor, ci->ci_master_key,
+	memcpy(ctx.master_key_descriptor, ci->ci_master_key_descriptor,
 	       FS_KEY_DESCRIPTOR_SIZE);
 	get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE);
 	BUILD_BUG_ON(sizeof(ctx) != FSCRYPT_SET_CONTEXT_MAX_SIZE);
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index a441ea1bfe6d..086e7ee550df 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -269,7 +269,8 @@ struct fsxattr {
 #define FS_POLICY_FLAGS_PAD_16		0x02
 #define FS_POLICY_FLAGS_PAD_32		0x03
 #define FS_POLICY_FLAGS_PAD_MASK	0x03
-#define FS_POLICY_FLAGS_VALID		0x03
+#define FS_POLICY_FLAG_DIRECT_KEY	0x04	/* use master key directly */
+#define FS_POLICY_FLAGS_VALID		0x07
 
 /* Encryption algorithms */
 #define FS_ENCRYPTION_MODE_INVALID		0
@@ -281,6 +282,7 @@ struct fsxattr {
 #define FS_ENCRYPTION_MODE_AES_128_CTS		6
 #define FS_ENCRYPTION_MODE_SPECK128_256_XTS	7 /* Removed, do not use. */
 #define FS_ENCRYPTION_MODE_SPECK128_256_CTS	8 /* Removed, do not use. */
+#define FS_ENCRYPTION_MODE_ADIANTUM		9
 
 struct fscrypt_policy {
 	__u8 version;
-- 
cgit v1.2.3-59-g8ed1b


From efe75c494f57890900caf6c8a0667db35bfaf56a Mon Sep 17 00:00:00 2001
From: David Abdurachmanov <david.abdurachmanov@gmail.com>
Date: Mon, 29 Oct 2018 11:48:53 +0100
Subject: riscv: add audit support

On RISC-V (riscv) audit is supported through generic lib/audit.c.
The patch adds required arch specific definitions.

Signed-off-by: David Abdurachmanov <david.abdurachmanov@gmail.com>
Signed-off-by: Palmer Dabbelt <palmer@sifive.com>
---
 arch/riscv/Kconfig                   |  1 +
 arch/riscv/include/asm/ptrace.h      |  5 +++++
 arch/riscv/include/asm/syscall.h     | 10 ++++++++++
 arch/riscv/include/asm/thread_info.h |  6 ++++++
 arch/riscv/kernel/entry.S            |  4 ++--
 include/uapi/linux/audit.h           |  2 ++
 6 files changed, 26 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 55da93f4e818..38787c48d76c 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -28,6 +28,7 @@ config RISCV
 	select GENERIC_STRNLEN_USER
 	select GENERIC_SMP_IDLE_THREAD
 	select GENERIC_ATOMIC64 if !64BIT || !RISCV_ISA_A
+	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_MEMBLOCK_NODE_MAP
 	select HAVE_DMA_CONTIGUOUS
 	select HAVE_FUTEX_CMPXCHG if FUTEX
diff --git a/arch/riscv/include/asm/ptrace.h b/arch/riscv/include/asm/ptrace.h
index bbe1862e8f80..d35ec2f41381 100644
--- a/arch/riscv/include/asm/ptrace.h
+++ b/arch/riscv/include/asm/ptrace.h
@@ -113,6 +113,11 @@ static inline void frame_pointer_set(struct pt_regs *regs,
 	SET_FP(regs, val);
 }
 
+static inline unsigned long regs_return_value(struct pt_regs *regs)
+{
+	return regs->a0;
+}
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_RISCV_PTRACE_H */
diff --git a/arch/riscv/include/asm/syscall.h b/arch/riscv/include/asm/syscall.h
index 8d25f8904c00..bba3da6ef157 100644
--- a/arch/riscv/include/asm/syscall.h
+++ b/arch/riscv/include/asm/syscall.h
@@ -18,6 +18,7 @@
 #ifndef _ASM_RISCV_SYSCALL_H
 #define _ASM_RISCV_SYSCALL_H
 
+#include <uapi/linux/audit.h>
 #include <linux/sched.h>
 #include <linux/err.h>
 
@@ -99,4 +100,13 @@ static inline void syscall_set_arguments(struct task_struct *task,
 	memcpy(&regs->a1 + i * sizeof(regs->a1), args, n * sizeof(regs->a0));
 }
 
+static inline int syscall_get_arch(void)
+{
+#ifdef CONFIG_64BIT
+	return AUDIT_ARCH_RISCV64;
+#else
+	return AUDIT_ARCH_RISCV32;
+#endif
+}
+
 #endif	/* _ASM_RISCV_SYSCALL_H */
diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h
index f8fa1cd2dad9..1c9cc8389928 100644
--- a/arch/riscv/include/asm/thread_info.h
+++ b/arch/riscv/include/asm/thread_info.h
@@ -80,13 +80,19 @@ struct thread_info {
 #define TIF_RESTORE_SIGMASK	4	/* restore signal mask in do_signal() */
 #define TIF_MEMDIE		5	/* is terminating due to OOM killer */
 #define TIF_SYSCALL_TRACEPOINT  6       /* syscall tracepoint instrumentation */
+#define TIF_SYSCALL_AUDIT	7	/* syscall auditing */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
 #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
+#define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
+#define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 
 #define _TIF_WORK_MASK \
 	(_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | _TIF_NEED_RESCHED)
 
+#define _TIF_SYSCALL_WORK \
+	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT)
+
 #endif /* _ASM_RISCV_THREAD_INFO_H */
diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index 13d4826ab2a1..355166f57205 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -201,7 +201,7 @@ handle_syscall:
 	REG_S s2, PT_SEPC(sp)
 	/* Trace syscalls, but only if requested by the user. */
 	REG_L t0, TASK_TI_FLAGS(tp)
-	andi t0, t0, _TIF_SYSCALL_TRACE
+	andi t0, t0, _TIF_SYSCALL_WORK
 	bnez t0, handle_syscall_trace_enter
 check_syscall_nr:
 	/* Check to make sure we don't jump to a bogus syscall number. */
@@ -221,7 +221,7 @@ ret_from_syscall:
 	REG_S a0, PT_A0(sp)
 	/* Trace syscalls, but only if requested by the user. */
 	REG_L t0, TASK_TI_FLAGS(tp)
-	andi t0, t0, _TIF_SYSCALL_TRACE
+	andi t0, t0, _TIF_SYSCALL_WORK
 	bnez t0, handle_syscall_trace_exit
 
 ret_from_exception:
diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index 818ae690ab79..d0e037a96a7b 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -399,6 +399,8 @@ enum {
 /* do not define AUDIT_ARCH_PPCLE since it is not supported by audit */
 #define AUDIT_ARCH_PPC64	(EM_PPC64|__AUDIT_ARCH_64BIT)
 #define AUDIT_ARCH_PPC64LE	(EM_PPC64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
+#define AUDIT_ARCH_RISCV32	(EM_RISCV|__AUDIT_ARCH_LE)
+#define AUDIT_ARCH_RISCV64	(EM_RISCV|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
 #define AUDIT_ARCH_S390		(EM_S390)
 #define AUDIT_ARCH_S390X	(EM_S390|__AUDIT_ARCH_64BIT)
 #define AUDIT_ARCH_SH		(EM_SH)
-- 
cgit v1.2.3-59-g8ed1b


From c2eb8effb265ac5cdd960d8e61ecb931e9c767cd Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Date: Wed, 24 Oct 2018 06:50:34 -0400
Subject: media: videodev2.h: add v4l2_timeval_to_ns inline function

We want to be able to uniquely identify buffers for stateless
codecs. The internal timestamp (a u64) as stored internally in the
kernel is a suitable candidate for that, but in struct v4l2_buffer
it is represented as a struct timeval.

Add a v4l2_timeval_to_ns() function that converts the struct timeval
into a u64 in the same way that the kernel does. This makes it possible
to use this u64 elsewhere as a unique identifier of the buffer.

Since timestamps are also copied from the output buffer to the
corresponding capture buffer(s) by M2M devices, the u64 can be
used to refer to both output and capture buffers.

The plan is that in the future we redesign struct v4l2_buffer and use
u64 for the timestamp instead of a struct timeval (which has lots of
problems with 32 vs 64 bit and y2038 layout changes), and then there
is no more need to use this function.

Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 include/uapi/linux/videodev2.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index b5671ce2724f..d6eed479c3a6 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -973,6 +973,18 @@ struct v4l2_buffer {
 	};
 };
 
+/**
+ * v4l2_timeval_to_ns - Convert timeval to nanoseconds
+ * @ts:		pointer to the timeval variable to be converted
+ *
+ * Returns the scalar nanosecond representation of the timeval
+ * parameter.
+ */
+static inline __u64 v4l2_timeval_to_ns(const struct timeval *tv)
+{
+	return (__u64)tv->tv_sec * 1000000000ULL + tv->tv_usec * 1000;
+}
+
 /*  Flags for 'flags' field */
 /* Buffer is mapped (flag) */
 #define V4L2_BUF_FLAG_MAPPED			0x00000001
-- 
cgit v1.2.3-59-g8ed1b


From b7ea4894aa867aaf1c31bfb4b00a3c3e38eedf95 Mon Sep 17 00:00:00 2001
From: Eugene Syromiatnikov <esyr@redhat.com>
Date: Mon, 7 Jan 2019 16:22:38 +0100
Subject: ptp: uapi: change _IOW to IOWR in PTP_SYS_OFFSET_EXTENDED definition

The ioctl command is read/write (or just read, if the fact that user space
writes n_samples field is ignored).

Signed-off-by: Eugene Syromiatnikov <esyr@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ptp_clock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h
index d73d83950265..1bc794ad957a 100644
--- a/include/uapi/linux/ptp_clock.h
+++ b/include/uapi/linux/ptp_clock.h
@@ -147,7 +147,7 @@ struct ptp_pin_desc {
 #define PTP_SYS_OFFSET_PRECISE \
 	_IOWR(PTP_CLK_MAGIC, 8, struct ptp_sys_offset_precise)
 #define PTP_SYS_OFFSET_EXTENDED \
-	_IOW(PTP_CLK_MAGIC, 9, struct ptp_sys_offset_extended)
+	_IOWR(PTP_CLK_MAGIC, 9, struct ptp_sys_offset_extended)
 
 struct ptp_extts_event {
 	struct ptp_clock_time t; /* Time event occured. */
-- 
cgit v1.2.3-59-g8ed1b


From 6325e01b6cdf4636b721cf7259c1616e3cf28ce2 Mon Sep 17 00:00:00 2001
From: Adit Ranadive <aditr@vmware.com>
Date: Wed, 9 Jan 2019 23:08:49 +0000
Subject: RDMA/vmw_pvrdma: Return the correct opcode when creating WR

Since the IB_WR_REG_MR opcode value changed, let's set the PVRDMA device
opcodes explicitly.

Reported-by: Ruishuang Wang <ruishuangw@vmware.com>
Fixes: 9a59739bd01f ("IB/rxe: Revise the ib_wr_opcode enum")
Cc: stable@vger.kernel.org
Reviewed-by: Bryan Tan <bryantan@vmware.com>
Reviewed-by: Ruishuang Wang <ruishuangw@vmware.com>
Reviewed-by: Vishnu Dasa <vdasa@vmware.com>
Signed-off-by: Adit Ranadive <aditr@vmware.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/vmw_pvrdma/pvrdma.h    | 35 +++++++++++++++++++++++++++-
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c |  6 +++++
 include/uapi/rdma/vmw_pvrdma-abi.h           |  1 +
 3 files changed, 41 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h
index 42b8685c997e..3c633ab58052 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h
@@ -427,7 +427,40 @@ static inline enum ib_qp_state pvrdma_qp_state_to_ib(enum pvrdma_qp_state state)
 
 static inline enum pvrdma_wr_opcode ib_wr_opcode_to_pvrdma(enum ib_wr_opcode op)
 {
-	return (enum pvrdma_wr_opcode)op;
+	switch (op) {
+	case IB_WR_RDMA_WRITE:
+		return PVRDMA_WR_RDMA_WRITE;
+	case IB_WR_RDMA_WRITE_WITH_IMM:
+		return PVRDMA_WR_RDMA_WRITE_WITH_IMM;
+	case IB_WR_SEND:
+		return PVRDMA_WR_SEND;
+	case IB_WR_SEND_WITH_IMM:
+		return PVRDMA_WR_SEND_WITH_IMM;
+	case IB_WR_RDMA_READ:
+		return PVRDMA_WR_RDMA_READ;
+	case IB_WR_ATOMIC_CMP_AND_SWP:
+		return PVRDMA_WR_ATOMIC_CMP_AND_SWP;
+	case IB_WR_ATOMIC_FETCH_AND_ADD:
+		return PVRDMA_WR_ATOMIC_FETCH_AND_ADD;
+	case IB_WR_LSO:
+		return PVRDMA_WR_LSO;
+	case IB_WR_SEND_WITH_INV:
+		return PVRDMA_WR_SEND_WITH_INV;
+	case IB_WR_RDMA_READ_WITH_INV:
+		return PVRDMA_WR_RDMA_READ_WITH_INV;
+	case IB_WR_LOCAL_INV:
+		return PVRDMA_WR_LOCAL_INV;
+	case IB_WR_REG_MR:
+		return PVRDMA_WR_FAST_REG_MR;
+	case IB_WR_MASKED_ATOMIC_CMP_AND_SWP:
+		return PVRDMA_WR_MASKED_ATOMIC_CMP_AND_SWP;
+	case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD:
+		return PVRDMA_WR_MASKED_ATOMIC_FETCH_AND_ADD;
+	case IB_WR_REG_SIG_MR:
+		return PVRDMA_WR_REG_SIG_MR;
+	default:
+		return PVRDMA_WR_ERROR;
+	}
 }
 
 static inline enum ib_wc_status pvrdma_wc_status_to_ib(
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c
index 3acf74cbe266..1ec3646087ba 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c
@@ -721,6 +721,12 @@ int pvrdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
 		    wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM)
 			wqe_hdr->ex.imm_data = wr->ex.imm_data;
 
+		if (unlikely(wqe_hdr->opcode == PVRDMA_WR_ERROR)) {
+			*bad_wr = wr;
+			ret = -EINVAL;
+			goto out;
+		}
+
 		switch (qp->ibqp.qp_type) {
 		case IB_QPT_GSI:
 		case IB_QPT_UD:
diff --git a/include/uapi/rdma/vmw_pvrdma-abi.h b/include/uapi/rdma/vmw_pvrdma-abi.h
index d13fd490b66d..6e73f0274e41 100644
--- a/include/uapi/rdma/vmw_pvrdma-abi.h
+++ b/include/uapi/rdma/vmw_pvrdma-abi.h
@@ -78,6 +78,7 @@ enum pvrdma_wr_opcode {
 	PVRDMA_WR_MASKED_ATOMIC_FETCH_AND_ADD,
 	PVRDMA_WR_BIND_MW,
 	PVRDMA_WR_REG_SIG_MR,
+	PVRDMA_WR_ERROR,
 };
 
 enum pvrdma_wc_status {
-- 
cgit v1.2.3-59-g8ed1b


From c13295ad219d8bb0e47942d4cfc8251de449a67e Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian@brauner.io>
Date: Fri, 11 Jan 2019 00:25:41 +0100
Subject: binderfs: rename header to binderfs.h

It doesn't make sense to call the header binder_ctl.h when its sole
existence is tied to binderfs. So give it a sensible name. Users will far
more easily remember binderfs.h than binder_ctl.h.

Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/android/binderfs.c              |  2 +-
 include/uapi/linux/android/binder_ctl.h | 35 ---------------------------------
 include/uapi/linux/android/binderfs.h   | 35 +++++++++++++++++++++++++++++++++
 3 files changed, 36 insertions(+), 36 deletions(-)
 delete mode 100644 include/uapi/linux/android/binder_ctl.h
 create mode 100644 include/uapi/linux/android/binderfs.h

(limited to 'include/uapi')

diff --git a/drivers/android/binderfs.c b/drivers/android/binderfs.c
index 89788969bc04..f6341893b5ba 100644
--- a/drivers/android/binderfs.c
+++ b/drivers/android/binderfs.c
@@ -31,7 +31,7 @@
 #include <linux/xarray.h>
 #include <uapi/asm-generic/errno-base.h>
 #include <uapi/linux/android/binder.h>
-#include <uapi/linux/android/binder_ctl.h>
+#include <uapi/linux/android/binderfs.h>
 
 #include "binder_internal.h"
 
diff --git a/include/uapi/linux/android/binder_ctl.h b/include/uapi/linux/android/binder_ctl.h
deleted file mode 100644
index 65b2efd1a0a5..000000000000
--- a/include/uapi/linux/android/binder_ctl.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * Copyright (C) 2018 Canonical Ltd.
- *
- */
-
-#ifndef _UAPI_LINUX_BINDER_CTL_H
-#define _UAPI_LINUX_BINDER_CTL_H
-
-#include <linux/android/binder.h>
-#include <linux/types.h>
-#include <linux/ioctl.h>
-
-#define BINDERFS_MAX_NAME 255
-
-/**
- * struct binderfs_device - retrieve information about a new binder device
- * @name:   the name to use for the new binderfs binder device
- * @major:  major number allocated for binderfs binder devices
- * @minor:  minor number allocated for the new binderfs binder device
- *
- */
-struct binderfs_device {
-	char name[BINDERFS_MAX_NAME + 1];
-	__u8 major;
-	__u8 minor;
-};
-
-/**
- * Allocate a new binder device.
- */
-#define BINDER_CTL_ADD _IOWR('b', 1, struct binderfs_device)
-
-#endif /* _UAPI_LINUX_BINDER_CTL_H */
-
diff --git a/include/uapi/linux/android/binderfs.h b/include/uapi/linux/android/binderfs.h
new file mode 100644
index 000000000000..65b2efd1a0a5
--- /dev/null
+++ b/include/uapi/linux/android/binderfs.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Copyright (C) 2018 Canonical Ltd.
+ *
+ */
+
+#ifndef _UAPI_LINUX_BINDER_CTL_H
+#define _UAPI_LINUX_BINDER_CTL_H
+
+#include <linux/android/binder.h>
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#define BINDERFS_MAX_NAME 255
+
+/**
+ * struct binderfs_device - retrieve information about a new binder device
+ * @name:   the name to use for the new binderfs binder device
+ * @major:  major number allocated for binderfs binder devices
+ * @minor:  minor number allocated for the new binderfs binder device
+ *
+ */
+struct binderfs_device {
+	char name[BINDERFS_MAX_NAME + 1];
+	__u8 major;
+	__u8 minor;
+};
+
+/**
+ * Allocate a new binder device.
+ */
+#define BINDER_CTL_ADD _IOWR('b', 1, struct binderfs_device)
+
+#endif /* _UAPI_LINUX_BINDER_CTL_H */
+
-- 
cgit v1.2.3-59-g8ed1b


From 3affaa5a7ca3ca114e01049bc45e3ed4a3955505 Mon Sep 17 00:00:00 2001
From: Brian Starkey <brian.starkey@arm.com>
Date: Mon, 3 Dec 2018 11:31:57 +0000
Subject: drm/afbc: Add AFBC modifier usage documentation

AFBC is a flexible, proprietary, lossless compression protocol and
format, with a number of defined DRM format modifiers. To facilitate
consistency and compatibility between different AFBC producers and
consumers, document the expectations for usage of the AFBC DRM format
modifiers in a new .rst chapter.

Signed-off-by: Brian Starkey <brian.starkey@arm.com>
Reviewed-by: Liviu Dudau <liviu.dudau@arm.com>
[Updated MAINTAINERS entry to show that "Mali DP Maintainers" is
 actually a mailing list and added an SPDX-License-Identifier to
 the documentation]
Signed-off-by: Liviu Dudau <liviu.dudau@arm.com>
---
 Documentation/gpu/afbc.rst    | 235 ++++++++++++++++++++++++++++++++++++++++++
 Documentation/gpu/drivers.rst |   1 +
 MAINTAINERS                   |   3 +-
 include/uapi/drm/drm_fourcc.h |   3 +
 4 files changed, 241 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/gpu/afbc.rst

(limited to 'include/uapi')

diff --git a/Documentation/gpu/afbc.rst b/Documentation/gpu/afbc.rst
new file mode 100644
index 000000000000..4d38dc49d105
--- /dev/null
+++ b/Documentation/gpu/afbc.rst
@@ -0,0 +1,235 @@
+.. SPDX-License-Identifier: GPL-2.0+
+
+===================================
+ Arm Framebuffer Compression (AFBC)
+===================================
+
+AFBC is a proprietary lossless image compression protocol and format.
+It provides fine-grained random access and minimizes the amount of
+data transferred between IP blocks.
+
+AFBC can be enabled on drivers which support it via use of the AFBC
+format modifiers defined in drm_fourcc.h. See DRM_FORMAT_MOD_ARM_AFBC(*).
+
+All users of the AFBC modifiers must follow the usage guidelines laid
+out in this document, to ensure compatibility across different AFBC
+producers and consumers.
+
+Components and Ordering
+=======================
+
+AFBC streams can contain several components - where a component
+corresponds to a color channel (i.e. R, G, B, X, A, Y, Cb, Cr).
+The assignment of input/output color channels must be consistent
+between the encoder and the decoder for correct operation, otherwise
+the consumer will interpret the decoded data incorrectly.
+
+Furthermore, when the lossless colorspace transform is used
+(AFBC_FORMAT_MOD_YTR, which should be enabled for RGB buffers for
+maximum compression efficiency), the component order must be:
+
+ * Component 0: R
+ * Component 1: G
+ * Component 2: B
+
+The component ordering is communicated via the fourcc code in the
+fourcc:modifier pair. In general, component '0' is considered to
+reside in the least-significant bits of the corresponding linear
+format. For example, COMP(bits):
+
+ * DRM_FORMAT_ABGR8888
+
+   * Component 0: R(8)
+   * Component 1: G(8)
+   * Component 2: B(8)
+   * Component 3: A(8)
+
+ * DRM_FORMAT_BGR888
+
+   * Component 0: R(8)
+   * Component 1: G(8)
+   * Component 2: B(8)
+
+ * DRM_FORMAT_YUYV
+
+   * Component 0: Y(8)
+   * Component 1: Cb(8, 2x1 subsampled)
+   * Component 2: Cr(8, 2x1 subsampled)
+
+In AFBC, 'X' components are not treated any differently from any other
+component. Therefore, an AFBC buffer with fourcc DRM_FORMAT_XBGR8888
+encodes with 4 components, like so:
+
+ * DRM_FORMAT_XBGR8888
+
+   * Component 0: R(8)
+   * Component 1: G(8)
+   * Component 2: B(8)
+   * Component 3: X(8)
+
+Please note, however, that the inclusion of a "wasted" 'X' channel is
+bad for compression efficiency, and so it's recommended to avoid
+formats containing 'X' bits. If a fourth component is
+required/expected by the encoder/decoder, then it is recommended to
+instead use an equivalent format with alpha, setting all alpha bits to
+'1'. If there is no requirement for a fourth component, then a format
+which doesn't include alpha can be used, e.g. DRM_FORMAT_BGR888.
+
+Number of Planes
+================
+
+Formats which are typically multi-planar in linear layouts (e.g. YUV
+420), can be encoded into one, or multiple, AFBC planes. As with
+component order, the encoder and decoder must agree about the number
+of planes in order to correctly decode the buffer. The fourcc code is
+used to determine the number of encoded planes in an AFBC buffer,
+matching the number of planes for the linear (unmodified) format.
+Within each plane, the component ordering also follows the fourcc
+code:
+
+For example:
+
+ * DRM_FORMAT_YUYV: nplanes = 1
+
+   * Plane 0:
+
+     * Component 0: Y(8)
+     * Component 1: Cb(8, 2x1 subsampled)
+     * Component 2: Cr(8, 2x1 subsampled)
+
+ * DRM_FORMAT_NV12: nplanes = 2
+
+   * Plane 0:
+
+     * Component 0: Y(8)
+
+   * Plane 1:
+
+     * Component 0: Cb(8, 2x1 subsampled)
+     * Component 1: Cr(8, 2x1 subsampled)
+
+Cross-device interoperability
+=============================
+
+For maximum compatibility across devices, the table below defines
+canonical formats for use between AFBC-enabled devices. Formats which
+are listed here must be used exactly as specified when using the AFBC
+modifiers. Formats which are not listed should be avoided.
+
+.. flat-table:: AFBC formats
+
+   * - Fourcc code
+     - Description
+     - Planes/Components
+
+   * - DRM_FORMAT_ABGR2101010
+     - 10-bit per component RGB, with 2-bit alpha
+     - Plane 0: 4 components
+              * Component 0: R(10)
+              * Component 1: G(10)
+              * Component 2: B(10)
+              * Component 3: A(2)
+
+   * - DRM_FORMAT_ABGR8888
+     - 8-bit per component RGB, with 8-bit alpha
+     - Plane 0: 4 components
+              * Component 0: R(8)
+              * Component 1: G(8)
+              * Component 2: B(8)
+              * Component 3: A(8)
+
+   * - DRM_FORMAT_BGR888
+     - 8-bit per component RGB
+     - Plane 0: 3 components
+              * Component 0: R(8)
+              * Component 1: G(8)
+              * Component 2: B(8)
+
+   * - DRM_FORMAT_BGR565
+     - 5/6-bit per component RGB
+     - Plane 0: 3 components
+              * Component 0: R(5)
+              * Component 1: G(6)
+              * Component 2: B(5)
+
+   * - DRM_FORMAT_ABGR1555
+     - 5-bit per component RGB, with 1-bit alpha
+     - Plane 0: 4 components
+              * Component 0: R(5)
+              * Component 1: G(5)
+              * Component 2: B(5)
+              * Component 3: A(1)
+
+   * - DRM_FORMAT_VUY888
+     - 8-bit per component YCbCr 444, single plane
+     - Plane 0: 3 components
+              * Component 0: Y(8)
+              * Component 1: Cb(8)
+              * Component 2: Cr(8)
+
+   * - DRM_FORMAT_VUY101010
+     - 10-bit per component YCbCr 444, single plane
+     - Plane 0: 3 components
+              * Component 0: Y(10)
+              * Component 1: Cb(10)
+              * Component 2: Cr(10)
+
+   * - DRM_FORMAT_YUYV
+     - 8-bit per component YCbCr 422, single plane
+     - Plane 0: 3 components
+              * Component 0: Y(8)
+              * Component 1: Cb(8, 2x1 subsampled)
+              * Component 2: Cr(8, 2x1 subsampled)
+
+   * - DRM_FORMAT_NV16
+     - 8-bit per component YCbCr 422, two plane
+     - Plane 0: 1 component
+              * Component 0: Y(8)
+       Plane 1: 2 components
+              * Component 0: Cb(8, 2x1 subsampled)
+              * Component 1: Cr(8, 2x1 subsampled)
+
+   * - DRM_FORMAT_Y210
+     - 10-bit per component YCbCr 422, single plane
+     - Plane 0: 3 components
+              * Component 0: Y(10)
+              * Component 1: Cb(10, 2x1 subsampled)
+              * Component 2: Cr(10, 2x1 subsampled)
+
+   * - DRM_FORMAT_P210
+     - 10-bit per component YCbCr 422, two plane
+     - Plane 0: 1 component
+              * Component 0: Y(10)
+       Plane 1: 2 components
+              * Component 0: Cb(10, 2x1 subsampled)
+              * Component 1: Cr(10, 2x1 subsampled)
+
+   * - DRM_FORMAT_YUV420_8BIT
+     - 8-bit per component YCbCr 420, single plane
+     - Plane 0: 3 components
+              * Component 0: Y(8)
+              * Component 1: Cb(8, 2x2 subsampled)
+              * Component 2: Cr(8, 2x2 subsampled)
+
+   * - DRM_FORMAT_YUV420_10BIT
+     - 10-bit per component YCbCr 420, single plane
+     - Plane 0: 3 components
+              * Component 0: Y(10)
+              * Component 1: Cb(10, 2x2 subsampled)
+              * Component 2: Cr(10, 2x2 subsampled)
+
+   * - DRM_FORMAT_NV12
+     - 8-bit per component YCbCr 420, two plane
+     - Plane 0: 1 component
+              * Component 0: Y(8)
+       Plane 1: 2 components
+              * Component 0: Cb(8, 2x2 subsampled)
+              * Component 1: Cr(8, 2x2 subsampled)
+
+   * - DRM_FORMAT_P010
+     - 10-bit per component YCbCr 420, two plane
+     - Plane 0: 1 component
+              * Component 0: Y(10)
+       Plane 1: 2 components
+              * Component 0: Cb(10, 2x2 subsampled)
+              * Component 1: Cr(10, 2x2 subsampled)
diff --git a/Documentation/gpu/drivers.rst b/Documentation/gpu/drivers.rst
index 7c1672118a73..c176b34ae9c8 100644
--- a/Documentation/gpu/drivers.rst
+++ b/Documentation/gpu/drivers.rst
@@ -17,6 +17,7 @@ GPU Driver Documentation
    vkms
    bridge/dw-hdmi
    xen-front
+   afbc
 
 .. only::  subproject and html
 
diff --git a/MAINTAINERS b/MAINTAINERS
index 32d444476a90..e79eca22532d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1136,10 +1136,11 @@ F:	Documentation/devicetree/bindings/display/arm,hdlcd.txt
 ARM MALI-DP DRM DRIVER
 M:	Liviu Dudau <liviu.dudau@arm.com>
 M:	Brian Starkey <brian.starkey@arm.com>
-M:	Mali DP Maintainers <malidp@foss.arm.com>
+L:	Mali DP Maintainers <malidp@foss.arm.com>
 S:	Supported
 F:	drivers/gpu/drm/arm/
 F:	Documentation/devicetree/bindings/display/arm,malidp.txt
+F:	Documentation/gpu/afbc.rst
 
 ARM MFM AND FLOPPY DRIVERS
 M:	Ian Molton <spyro@f2s.com>
diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h
index 0b44260a5ee9..df014ede4e57 100644
--- a/include/uapi/drm/drm_fourcc.h
+++ b/include/uapi/drm/drm_fourcc.h
@@ -572,6 +572,9 @@ extern "C" {
  * AFBC has several features which may be supported and/or used, which are
  * represented using bits in the modifier. Not all combinations are valid,
  * and different devices or use-cases may support different combinations.
+ *
+ * Further information on the use of AFBC modifiers can be found in
+ * Documentation/gpu/afbc.rst
  */
 #define DRM_FORMAT_MOD_ARM_AFBC(__afbc_mode)	fourcc_mod_code(ARM, __afbc_mode)
 
-- 
cgit v1.2.3-59-g8ed1b


From 2e746942ebacf1565caa72cf980745e5ce297c48 Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Sun, 13 Jan 2019 22:28:05 -0800
Subject: Input: input_event - provide override for sparc64

The usec part of the timeval is defined as
__kernel_suseconds_t	tv_usec; /* microseconds */

Arnd noticed that sparc64 is the only architecture that defines
__kernel_suseconds_t as int rather than long.

This breaks the current y2038 fix for kernel as we only access and define
the timeval struct for non-kernel use cases.  But, this was hidden by an
another typo in the use of __KERNEL__ qualifier.

Fix the typo, and provide an override for sparc64.

Fixes: 152194fe9c3f ("Input: extend usable life of event timestamps to 2106 on 32 bit systems")
Reported-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Cc: stable@vger.kernel.org
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 include/uapi/linux/input.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h
index 7288a7c573cc..551866a4f658 100644
--- a/include/uapi/linux/input.h
+++ b/include/uapi/linux/input.h
@@ -26,13 +26,17 @@
  */
 
 struct input_event {
-#if (__BITS_PER_LONG != 32 || !defined(__USE_TIME_BITS64)) && !defined(__KERNEL)
+#if (__BITS_PER_LONG != 32 || !defined(__USE_TIME_BITS64)) && !defined(__KERNEL__)
 	struct timeval time;
 #define input_event_sec time.tv_sec
 #define input_event_usec time.tv_usec
 #else
 	__kernel_ulong_t __sec;
+#ifdef CONFIG_SPARC64
+	unsigned int __usec;
+#else
 	__kernel_ulong_t __usec;
+#endif
 #define input_event_sec  __sec
 #define input_event_usec __usec
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 2db8ebca1f6c158d0d439ebec4d4b9924c4c3d3c Mon Sep 17 00:00:00 2001
From: Matteo Franchin <matteo.franchin@arm.com>
Date: Wed, 9 Jan 2019 15:58:37 +0000
Subject: drm/fourcc: Add modifier defininitions for AFBC 1.3

This commit adds definitions of format modifiers for version 1.3 of the
Arm Framebuffer Compression (AFBC).

Signed-off-by: Matteo Franchin <matteo.franchin@arm.com>
Signed-off-by: Ayan Kumar Halder <ayan.halder@arm.com>
Reviewed-by: Brian Starkey <brian.starkey@arm.com>
Link: https://patchwork.freedesktop.org/patch/277333/
---
 include/uapi/drm/drm_fourcc.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h
index 0b44260a5ee9..41106c835747 100644
--- a/include/uapi/drm/drm_fourcc.h
+++ b/include/uapi/drm/drm_fourcc.h
@@ -581,10 +581,18 @@ extern "C" {
  * Indicates the superblock size(s) used for the AFBC buffer. The buffer
  * size (in pixels) must be aligned to a multiple of the superblock size.
  * Four lowest significant bits(LSBs) are reserved for block size.
+ *
+ * Where one superblock size is specified, it applies to all planes of the
+ * buffer (e.g. 16x16, 32x8). When multiple superblock sizes are specified,
+ * the first applies to the Luma plane and the second applies to the Chroma
+ * plane(s). e.g. (32x8_64x4 means 32x8 Luma, with 64x4 Chroma).
+ * Multiple superblock sizes are only valid for multi-plane YCbCr formats.
  */
 #define AFBC_FORMAT_MOD_BLOCK_SIZE_MASK      0xf
 #define AFBC_FORMAT_MOD_BLOCK_SIZE_16x16     (1ULL)
 #define AFBC_FORMAT_MOD_BLOCK_SIZE_32x8      (2ULL)
+#define AFBC_FORMAT_MOD_BLOCK_SIZE_64x4      (3ULL)
+#define AFBC_FORMAT_MOD_BLOCK_SIZE_32x8_64x4 (4ULL)
 
 /*
  * AFBC lossless colorspace transform
@@ -644,6 +652,21 @@ extern "C" {
  */
 #define AFBC_FORMAT_MOD_SC      (1ULL <<  9)
 
+/*
+ * AFBC double-buffer
+ *
+ * Indicates that the buffer is allocated in a layout safe for front-buffer
+ * rendering.
+ */
+#define AFBC_FORMAT_MOD_DB      (1ULL << 10)
+
+/*
+ * AFBC buffer content hints
+ *
+ * Indicates that the buffer includes per-superblock content hints.
+ */
+#define AFBC_FORMAT_MOD_BCH     (1ULL << 11)
+
 #if defined(__cplusplus)
 }
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From f275ee0fa3a06eb87edc229749cf1eb18f0663fa Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Thu, 10 Jan 2019 21:24:13 +0100
Subject: IN_BADCLASS: fix macro to actually work

Commit 65cab850f0ee ("net: Allow class-e address assignment via ifconfig
ioctl") modified the IN_BADCLASS macro a bit, but unfortunatly one too
many '(' characters were added to the line, making any code that used
it, not build properly.

Also, the macro now compares an unsigned with a signed value, which
isn't ok, so fix that up by making both types match properly.

Reported-by: Christopher Ferris <cferris@google.com>
Fixes: 65cab850f0ee ("net: Allow class-e address assignment via ifconfig ioctl")
Cc: Dave Taht <dave.taht@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/in.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h
index f6052e70bf40..a55cb8b10165 100644
--- a/include/uapi/linux/in.h
+++ b/include/uapi/linux/in.h
@@ -268,7 +268,7 @@ struct sockaddr_in {
 #define	IN_MULTICAST(a)		IN_CLASSD(a)
 #define	IN_MULTICAST_NET	0xe0000000
 
-#define	IN_BADCLASS(a)		((((long int) (a) ) == 0xffffffff)
+#define	IN_BADCLASS(a)		(((long int) (a) ) == (long int)0xffffffff)
 #define	IN_EXPERIMENTAL(a)	IN_BADCLASS((a))
 
 #define	IN_CLASSE(a)		((((long int) (a)) & 0xf0000000) == 0xf0000000)
-- 
cgit v1.2.3-59-g8ed1b


From 4b837c6d7ee771f68a30f362c9f68171a95be222 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Mon, 14 Jan 2019 09:01:54 -0500
Subject: media: v4l: uAPI: V4L2_BUF_TYPE_META_OUTPUT is an output buffer type

V4L2_BUF_TYPE_META_OUTPUT was added by commit 72148d1a57e7
("media: v4l: Add support for V4L2_BUF_TYPE_META_OUTPUT") but the patch
missed adding the type to the macro telling whether a given type is an
output type or not. Do that now. Getting this wrong leads to handling the
buffer as a capture buffer in a lot of places.

Fixes: 72148d1a57e7 ("media: v4l: Add support for V4L2_BUF_TYPE_META_OUTPUT")

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 include/uapi/linux/videodev2.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index d6eed479c3a6..c0c36c165bf4 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -161,7 +161,8 @@ enum v4l2_buf_type {
 	 || (type) == V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY	\
 	 || (type) == V4L2_BUF_TYPE_VBI_OUTPUT			\
 	 || (type) == V4L2_BUF_TYPE_SLICED_VBI_OUTPUT		\
-	 || (type) == V4L2_BUF_TYPE_SDR_OUTPUT)
+	 || (type) == V4L2_BUF_TYPE_SDR_OUTPUT			\
+	 || (type) == V4L2_BUF_TYPE_META_OUTPUT)
 
 enum v4l2_tuner_type {
 	V4L2_TUNER_RADIO	     = 1,
-- 
cgit v1.2.3-59-g8ed1b


From 50656bad786d001b294764e9f047c5d5b3e4db75 Mon Sep 17 00:00:00 2001
From: Philipp Zabel <p.zabel@pengutronix.de>
Date: Thu, 10 Jan 2019 11:56:09 -0500
Subject: media: v4l2-ctrl: Add control to enable h.264 constrained intra
 prediction

Allow to enable h.264 constrained intra prediction (macroblocks using
intra prediction modes are not allowed to use residual data and decoded
samples of neighboring macroblocks coded using inter prediction modes).
This control directly corresponds to the constrained_intra_pred_flag
field in the h.264 picture parameter set.

Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 Documentation/media/uapi/v4l/extended-controls.rst | 4 ++++
 drivers/media/v4l2-core/v4l2-ctrls.c               | 2 ++
 include/uapi/linux/v4l2-controls.h                 | 1 +
 3 files changed, 7 insertions(+)

(limited to 'include/uapi')

diff --git a/Documentation/media/uapi/v4l/extended-controls.rst b/Documentation/media/uapi/v4l/extended-controls.rst
index af4273aa5e85..235d0c293983 100644
--- a/Documentation/media/uapi/v4l/extended-controls.rst
+++ b/Documentation/media/uapi/v4l/extended-controls.rst
@@ -1154,6 +1154,10 @@ enum v4l2_mpeg_video_h264_entropy_mode -
 ``V4L2_CID_MPEG_VIDEO_H264_8X8_TRANSFORM (boolean)``
     Enable 8X8 transform for H264. Applicable to the H264 encoder.
 
+``V4L2_CID_MPEG_VIDEO_H264_CONSTRAINED_INTRA_PREDICTION (boolean)``
+    Enable constrained intra prediction for H264. Applicable to the H264
+    encoder.
+
 ``V4L2_CID_MPEG_VIDEO_CYCLIC_INTRA_REFRESH_MB (integer)``
     Cyclic intra macroblock refresh. This is the number of continuous
     macroblocks refreshed every frame. Each frame a successive set of
diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c
index 8c47d8f00429..50b56185c02e 100644
--- a/drivers/media/v4l2-core/v4l2-ctrls.c
+++ b/drivers/media/v4l2-core/v4l2-ctrls.c
@@ -825,6 +825,8 @@ const char *v4l2_ctrl_get_name(u32 id)
 	case V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_LAYER:return "H264 Number of HC Layers";
 	case V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_LAYER_QP:
 								return "H264 Set QP Value for HC Layers";
+	case V4L2_CID_MPEG_VIDEO_H264_CONSTRAINED_INTRA_PREDICTION:
+								return "H264 Constrained Intra Pred";
 	case V4L2_CID_MPEG_VIDEO_MPEG4_I_FRAME_QP:		return "MPEG4 I-Frame QP Value";
 	case V4L2_CID_MPEG_VIDEO_MPEG4_P_FRAME_QP:		return "MPEG4 P-Frame QP Value";
 	case V4L2_CID_MPEG_VIDEO_MPEG4_B_FRAME_QP:		return "MPEG4 B-Frame QP Value";
diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h
index 3dcfc6148f99..fd65c710b144 100644
--- a/include/uapi/linux/v4l2-controls.h
+++ b/include/uapi/linux/v4l2-controls.h
@@ -533,6 +533,7 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type {
 };
 #define V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_LAYER	(V4L2_CID_MPEG_BASE+381)
 #define V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_LAYER_QP	(V4L2_CID_MPEG_BASE+382)
+#define V4L2_CID_MPEG_VIDEO_H264_CONSTRAINED_INTRA_PREDICTION	(V4L2_CID_MPEG_BASE+383)
 #define V4L2_CID_MPEG_VIDEO_MPEG4_I_FRAME_QP	(V4L2_CID_MPEG_BASE+400)
 #define V4L2_CID_MPEG_VIDEO_MPEG4_P_FRAME_QP	(V4L2_CID_MPEG_BASE+401)
 #define V4L2_CID_MPEG_VIDEO_MPEG4_B_FRAME_QP	(V4L2_CID_MPEG_BASE+402)
-- 
cgit v1.2.3-59-g8ed1b


From d034696cbe5a6e00f76ca4b7869c6cdef66aebd5 Mon Sep 17 00:00:00 2001
From: Philipp Zabel <p.zabel@pengutronix.de>
Date: Thu, 10 Jan 2019 11:56:10 -0500
Subject: media: v4l2-ctrl: Add control for h.264 chroma qp offset

Allow to add fixed quantization parameter offset between luma and
chroma quantization parameters. This control directly corresponds
to the chroma_qp_index_offset field of the h.264 picture parameter
set.

Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 Documentation/media/uapi/v4l/extended-controls.rst | 5 +++++
 drivers/media/v4l2-core/v4l2-ctrls.c               | 1 +
 include/uapi/linux/v4l2-controls.h                 | 1 +
 3 files changed, 7 insertions(+)

(limited to 'include/uapi')

diff --git a/Documentation/media/uapi/v4l/extended-controls.rst b/Documentation/media/uapi/v4l/extended-controls.rst
index 235d0c293983..00934efdc9e4 100644
--- a/Documentation/media/uapi/v4l/extended-controls.rst
+++ b/Documentation/media/uapi/v4l/extended-controls.rst
@@ -1158,6 +1158,11 @@ enum v4l2_mpeg_video_h264_entropy_mode -
     Enable constrained intra prediction for H264. Applicable to the H264
     encoder.
 
+``V4L2_CID_MPEG_VIDEO_H264_CHROMA_QP_INDEX_OFFSET (integer)``
+    Specify the offset that should be added to the luma quantization
+    parameter to determine the chroma quantization parameter. Applicable
+    to the H264 encoder.
+
 ``V4L2_CID_MPEG_VIDEO_CYCLIC_INTRA_REFRESH_MB (integer)``
     Cyclic intra macroblock refresh. This is the number of continuous
     macroblocks refreshed every frame. Each frame a successive set of
diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c
index 50b56185c02e..99308dac2daa 100644
--- a/drivers/media/v4l2-core/v4l2-ctrls.c
+++ b/drivers/media/v4l2-core/v4l2-ctrls.c
@@ -827,6 +827,7 @@ const char *v4l2_ctrl_get_name(u32 id)
 								return "H264 Set QP Value for HC Layers";
 	case V4L2_CID_MPEG_VIDEO_H264_CONSTRAINED_INTRA_PREDICTION:
 								return "H264 Constrained Intra Pred";
+	case V4L2_CID_MPEG_VIDEO_H264_CHROMA_QP_INDEX_OFFSET:	return "H264 Chroma QP Index Offset";
 	case V4L2_CID_MPEG_VIDEO_MPEG4_I_FRAME_QP:		return "MPEG4 I-Frame QP Value";
 	case V4L2_CID_MPEG_VIDEO_MPEG4_P_FRAME_QP:		return "MPEG4 P-Frame QP Value";
 	case V4L2_CID_MPEG_VIDEO_MPEG4_B_FRAME_QP:		return "MPEG4 B-Frame QP Value";
diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h
index fd65c710b144..06479f2fb3ae 100644
--- a/include/uapi/linux/v4l2-controls.h
+++ b/include/uapi/linux/v4l2-controls.h
@@ -534,6 +534,7 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type {
 #define V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_LAYER	(V4L2_CID_MPEG_BASE+381)
 #define V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_LAYER_QP	(V4L2_CID_MPEG_BASE+382)
 #define V4L2_CID_MPEG_VIDEO_H264_CONSTRAINED_INTRA_PREDICTION	(V4L2_CID_MPEG_BASE+383)
+#define V4L2_CID_MPEG_VIDEO_H264_CHROMA_QP_INDEX_OFFSET		(V4L2_CID_MPEG_BASE+384)
 #define V4L2_CID_MPEG_VIDEO_MPEG4_I_FRAME_QP	(V4L2_CID_MPEG_BASE+400)
 #define V4L2_CID_MPEG_VIDEO_MPEG4_P_FRAME_QP	(V4L2_CID_MPEG_BASE+401)
 #define V4L2_CID_MPEG_VIDEO_MPEG4_B_FRAME_QP	(V4L2_CID_MPEG_BASE+402)
-- 
cgit v1.2.3-59-g8ed1b


From 1c3721b1f22286033abeda30b7e12439b083ed0f Mon Sep 17 00:00:00 2001
From: Steve Longerbeam <slongerbeam@gmail.com>
Date: Wed, 9 Jan 2019 13:30:04 -0500
Subject: media: videodev2.h: Add more field helper macros

Adds two helper macros:

V4L2_FIELD_IS_SEQUENTIAL: returns true if the given field type is
'sequential', that is a full frame is transmitted, or exists in
memory, as all top field lines followed by all bottom field lines,
or vice-versa.

V4L2_FIELD_IS_INTERLACED: returns true if the given field type is
'interlaced', that is a full frame is transmitted, or exists in
memory, as top field lines interlaced with bottom field lines.

Signed-off-by: Steve Longerbeam <slongerbeam@gmail.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 include/uapi/linux/videodev2.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index c0c36c165bf4..9a920f071ff9 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -130,6 +130,13 @@ enum v4l2_field {
 	((field) == V4L2_FIELD_BOTTOM ||\
 	 (field) == V4L2_FIELD_TOP ||\
 	 (field) == V4L2_FIELD_ALTERNATE)
+#define V4L2_FIELD_IS_INTERLACED(field) \
+	((field) == V4L2_FIELD_INTERLACED ||\
+	 (field) == V4L2_FIELD_INTERLACED_TB ||\
+	 (field) == V4L2_FIELD_INTERLACED_BT)
+#define V4L2_FIELD_IS_SEQUENTIAL(field) \
+	((field) == V4L2_FIELD_SEQ_TB ||\
+	 (field) == V4L2_FIELD_SEQ_BT)
 
 enum v4l2_buf_type {
 	V4L2_BUF_TYPE_VIDEO_CAPTURE        = 1,
-- 
cgit v1.2.3-59-g8ed1b


From f5dd3d0c9638a9d9a02b5964c4ad636f06cf7e2c Mon Sep 17 00:00:00 2001
From: David Herrmann <dh.herrmann@gmail.com>
Date: Tue, 15 Jan 2019 14:42:14 +0100
Subject: net: introduce SO_BINDTOIFINDEX sockopt

This introduces a new generic SOL_SOCKET-level socket option called
SO_BINDTOIFINDEX. It behaves similar to SO_BINDTODEVICE, but takes a
network interface index as argument, rather than the network interface
name.

User-space often refers to network-interfaces via their index, but has
to temporarily resolve it to a name for a call into SO_BINDTODEVICE.
This might pose problems when the network-device is renamed
asynchronously by other parts of the system. When this happens, the
SO_BINDTODEVICE might either fail, or worse, it might bind to the wrong
device.

In most cases user-space only ever operates on devices which they
either manage themselves, or otherwise have a guarantee that the device
name will not change (e.g., devices that are UP cannot be renamed).
However, particularly in libraries this guarantee is non-obvious and it
would be nice if that race-condition would simply not exist. It would
make it easier for those libraries to operate even in situations where
the device-name might change under the hood.

A real use-case that we recently hit is trying to start the network
stack early in the initrd but make it survive into the real system.
Existing distributions rename network-interfaces during the transition
from initrd into the real system. This, obviously, cannot affect
devices that are up and running (unless you also consider moving them
between network-namespaces). However, the network manager now has to
make sure its management engine for dormant devices will not run in
parallel to these renames. Particularly, when you offload operations
like DHCP into separate processes, these might setup their sockets
early, and thus have to resolve the device-name possibly running into
this race-condition.

By avoiding a call to resolve the device-name, we no longer depend on
the name and can run network setup of dormant devices in parallel to
the transition off the initrd. The SO_BINDTOIFINDEX ioctl plugs this
race.

Reviewed-by: Tom Gundersen <teg@jklm.no>
Signed-off-by: David Herrmann <dh.herrmann@gmail.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/alpha/include/uapi/asm/socket.h  |  2 ++
 arch/ia64/include/uapi/asm/socket.h   |  2 ++
 arch/mips/include/uapi/asm/socket.h   |  2 ++
 arch/parisc/include/uapi/asm/socket.h |  2 ++
 arch/s390/include/uapi/asm/socket.h   |  2 ++
 arch/sparc/include/uapi/asm/socket.h  |  2 ++
 arch/xtensa/include/uapi/asm/socket.h |  2 ++
 include/uapi/asm-generic/socket.h     |  2 ++
 net/core/sock.c                       | 46 +++++++++++++++++++++++++++--------
 9 files changed, 52 insertions(+), 10 deletions(-)

(limited to 'include/uapi')

diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index 065fb372e355..b1c9b542c021 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -115,4 +115,6 @@
 #define SO_TXTIME		61
 #define SCM_TXTIME		SO_TXTIME
 
+#define SO_BINDTOIFINDEX	62
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h
index c872c4e6bafb..ba0d245f9576 100644
--- a/arch/ia64/include/uapi/asm/socket.h
+++ b/arch/ia64/include/uapi/asm/socket.h
@@ -117,4 +117,6 @@
 #define SO_TXTIME		61
 #define SCM_TXTIME		SO_TXTIME
 
+#define SO_BINDTOIFINDEX	62
+
 #endif /* _ASM_IA64_SOCKET_H */
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index 71370fb3ceef..73e25e35d803 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -126,4 +126,6 @@
 #define SO_TXTIME		61
 #define SCM_TXTIME		SO_TXTIME
 
+#define SO_BINDTOIFINDEX	62
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index 061b9cf2a779..52bed5976cbe 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -107,4 +107,6 @@
 #define SO_TXTIME		0x4036
 #define SCM_TXTIME		SO_TXTIME
 
+#define SO_BINDTOIFINDEX	0x4037
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h
index 39d901476ee5..49c971587087 100644
--- a/arch/s390/include/uapi/asm/socket.h
+++ b/arch/s390/include/uapi/asm/socket.h
@@ -114,4 +114,6 @@
 #define SO_TXTIME		61
 #define SCM_TXTIME		SO_TXTIME
 
+#define SO_BINDTOIFINDEX	62
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index 7ea35e5601b6..bbdb81594dd4 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -104,6 +104,8 @@
 #define SO_TXTIME		0x003f
 #define SCM_TXTIME		SO_TXTIME
 
+#define SO_BINDTOIFINDEX	0x0041
+
 /* Security levels - as per NRL IPv6 - don't actually do anything */
 #define SO_SECURITY_AUTHENTICATION		0x5001
 #define SO_SECURITY_ENCRYPTION_TRANSPORT	0x5002
diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h
index 1de07a7f7680..b434217783d0 100644
--- a/arch/xtensa/include/uapi/asm/socket.h
+++ b/arch/xtensa/include/uapi/asm/socket.h
@@ -119,4 +119,6 @@
 #define SO_TXTIME		61
 #define SCM_TXTIME		SO_TXTIME
 
+#define SO_BINDTOIFINDEX	62
+
 #endif	/* _XTENSA_SOCKET_H */
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index a12692e5f7a8..3066ab3853a8 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -110,4 +110,6 @@
 #define SO_TXTIME		61
 #define SCM_TXTIME		SO_TXTIME
 
+#define SO_BINDTOIFINDEX	62
+
 #endif /* __ASM_GENERIC_SOCKET_H */
diff --git a/net/core/sock.c b/net/core/sock.c
index 6aa2e7e0b4fb..b53764ebb973 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -520,20 +520,43 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 }
 EXPORT_SYMBOL(sk_dst_check);
 
-static int sock_setbindtodevice(struct sock *sk, char __user *optval,
-				int optlen)
+static int sock_setbindtodevice_locked(struct sock *sk, int ifindex)
 {
 	int ret = -ENOPROTOOPT;
 #ifdef CONFIG_NETDEVICES
 	struct net *net = sock_net(sk);
-	char devname[IFNAMSIZ];
-	int index;
 
 	/* Sorry... */
 	ret = -EPERM;
 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
 		goto out;
 
+	ret = -EINVAL;
+	if (ifindex < 0)
+		goto out;
+
+	sk->sk_bound_dev_if = ifindex;
+	if (sk->sk_prot->rehash)
+		sk->sk_prot->rehash(sk);
+	sk_dst_reset(sk);
+
+	ret = 0;
+
+out:
+#endif
+
+	return ret;
+}
+
+static int sock_setbindtodevice(struct sock *sk, char __user *optval,
+				int optlen)
+{
+	int ret = -ENOPROTOOPT;
+#ifdef CONFIG_NETDEVICES
+	struct net *net = sock_net(sk);
+	char devname[IFNAMSIZ];
+	int index;
+
 	ret = -EINVAL;
 	if (optlen < 0)
 		goto out;
@@ -566,14 +589,9 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 	}
 
 	lock_sock(sk);
-	sk->sk_bound_dev_if = index;
-	if (sk->sk_prot->rehash)
-		sk->sk_prot->rehash(sk);
-	sk_dst_reset(sk);
+	ret = sock_setbindtodevice_locked(sk, index);
 	release_sock(sk);
 
-	ret = 0;
-
 out:
 #endif
 
@@ -1055,6 +1073,10 @@ set_rcvbuf:
 		}
 		break;
 
+	case SO_BINDTOIFINDEX:
+		ret = sock_setbindtodevice_locked(sk, val);
+		break;
+
 	default:
 		ret = -ENOPROTOOPT;
 		break;
@@ -1399,6 +1421,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 				  SOF_TXTIME_REPORT_ERRORS : 0;
 		break;
 
+	case SO_BINDTOIFINDEX:
+		v.val = sk->sk_bound_dev_if;
+		break;
+
 	default:
 		/* We implement the SO_SNDLOWAT etc to not be settable
 		 * (1003.1g 7).
-- 
cgit v1.2.3-59-g8ed1b


From 75dd48e2e420a3cbbe56dd7adfcc6f142c948272 Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Mon, 14 Jan 2019 18:41:35 +0100
Subject: netfilter: nf_tables: Support RULE_ID reference in new rule

To allow for a batch to contain rules in arbitrary ordering, introduce
NFTA_RULE_POSITION_ID attribute which works just like NFTA_RULE_POSITION
but contains the ID of another rule within the same batch. This helps
iptables-nft-restore handling dumps with mixed insert/append commands
correctly.

Note that NFTA_RULE_POSITION takes precedence over
NFTA_RULE_POSITION_ID, so if the former is present, the latter is
ignored.

Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h | 2 ++
 net/netfilter/nf_tables_api.c            | 9 +++++++++
 2 files changed, 11 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 7de4f1bdaf06..99ca95b830b6 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -219,6 +219,7 @@ enum nft_chain_attributes {
  * @NFTA_RULE_POSITION: numeric handle of the previous rule (NLA_U64)
  * @NFTA_RULE_USERDATA: user data (NLA_BINARY, NFT_USERDATA_MAXLEN)
  * @NFTA_RULE_ID: uniquely identifies a rule in a transaction (NLA_U32)
+ * @NFTA_RULE_POSITION_ID: transaction unique identifier of the previous rule (NLA_U32)
  */
 enum nft_rule_attributes {
 	NFTA_RULE_UNSPEC,
@@ -231,6 +232,7 @@ enum nft_rule_attributes {
 	NFTA_RULE_USERDATA,
 	NFTA_RULE_PAD,
 	NFTA_RULE_ID,
+	NFTA_RULE_POSITION_ID,
 	__NFTA_RULE_MAX
 };
 #define NFTA_RULE_MAX		(__NFTA_RULE_MAX - 1)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 621ff834d3a4..d88c86c5b433 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2610,6 +2610,9 @@ static int nft_table_validate(struct net *net, const struct nft_table *table)
 	return 0;
 }
 
+static struct nft_rule *nft_rule_lookup_byid(const struct net *net,
+					     const struct nlattr *nla);
+
 #define NFT_RULE_MAXEXPRS	128
 
 static int nf_tables_newrule(struct net *net, struct sock *nlsk,
@@ -2679,6 +2682,12 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 				NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION]);
 				return PTR_ERR(old_rule);
 			}
+		} else if (nla[NFTA_RULE_POSITION_ID]) {
+			old_rule = nft_rule_lookup_byid(net, nla[NFTA_RULE_POSITION_ID]);
+			if (IS_ERR(old_rule)) {
+				NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION_ID]);
+				return PTR_ERR(old_rule);
+			}
 		}
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 0fb4d21956f4a9af225594a46857ccf29bd747bc Mon Sep 17 00:00:00 2001
From: wenxu <wenxu@ucloud.cn>
Date: Wed, 16 Jan 2019 07:53:51 +0800
Subject: netfilter: nft_meta: Add NFT_META_I/OIFKIND meta type

In the ip_rcv the skb goes through the PREROUTING hook first, then kicks
in vrf device and go through the same hook again. When conntrack dnat
works with vrf, there will be some conflict with rules because the
packet goes through the hook twice with different nf status.

ip link add user1 type vrf table 1
ip link add user2 type vrf table 2
ip l set dev tun1 master user1
ip l set dev tun2 master user2

nft add table firewall
nft add chain firewall zones { type filter hook prerouting  priority - 300 \; }
nft add rule firewall zones counter ct zone set iif map { "tun1" : 1, "tun2" : 2 }
nft add chain firewall rule-1000-ingress
nft add rule firewall rule-1000-ingress ct zone 1 tcp dport 22 ct state new counter accept
nft add rule firewall rule-1000-ingress counter drop
nft add chain firewall rule-1000-egress
nft add rule firewall rule-1000-egress tcp dport 22 ct state new counter drop
nft add rule firewall rule-1000-egress counter accept

nft add chain firewall rules-all { type filter hook prerouting priority - 150 \; }
nft add rule firewall rules-all ip daddr vmap { "2.2.2.11" : jump rule-1000-ingress }
nft add rule firewall rules-all ct zone vmap { 1 : jump rule-1000-egress }

nft add rule firewall dnat-all ct zone vmap { 1 : jump dnat-1000 }
nft add rule firewall dnat-1000 ip daddr 2.2.2.11 counter dnat to 10.0.0.7

For a package with ip daddr 2.2.2.11 and tcp dport 22, first time accept in the
rule-1000-ingress and dnat to 10.0.0.7. Then second time the packet goto the wrong
chain rule-1000-egress which leads the packet drop

With this patch, userspace can add the 'don't re-do entire ruleset for
vrf' policy itself via:

nft add rule firewall rules-all meta iifkind "vrf" counter accept

Signed-off-by: wenxu <wenxu@ucloud.cn>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  4 ++++
 net/netfilter/nft_meta.c                 | 12 ++++++++++++
 2 files changed, 16 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 99ca95b830b6..0ba8f48bdf0b 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -791,6 +791,8 @@ enum nft_exthdr_attributes {
  * @NFT_META_CGROUP: socket control group (skb->sk->sk_classid)
  * @NFT_META_PRANDOM: a 32bit pseudo-random number
  * @NFT_META_SECPATH: boolean, secpath_exists (!!skb->sp)
+ * @NFT_META_IIFKIND: packet input interface kind name (dev->rtnl_link_ops->kind)
+ * @NFT_META_OIFKIND: packet output interface kind name (dev->rtnl_link_ops->kind)
  */
 enum nft_meta_keys {
 	NFT_META_LEN,
@@ -819,6 +821,8 @@ enum nft_meta_keys {
 	NFT_META_CGROUP,
 	NFT_META_PRANDOM,
 	NFT_META_SECPATH,
+	NFT_META_IIFKIND,
+	NFT_META_OIFKIND,
 };
 
 /**
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 6df486c5ebd3..987d2d6ce624 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -244,6 +244,16 @@ void nft_meta_get_eval(const struct nft_expr *expr,
 		strncpy((char *)dest, p->br->dev->name, IFNAMSIZ);
 		return;
 #endif
+	case NFT_META_IIFKIND:
+		if (in == NULL || in->rtnl_link_ops == NULL)
+			goto err;
+		strncpy((char *)dest, in->rtnl_link_ops->kind, IFNAMSIZ);
+		break;
+	case NFT_META_OIFKIND:
+		if (out == NULL || out->rtnl_link_ops == NULL)
+			goto err;
+		strncpy((char *)dest, out->rtnl_link_ops->kind, IFNAMSIZ);
+		break;
 	default:
 		WARN_ON(1);
 		goto err;
@@ -340,6 +350,8 @@ static int nft_meta_get_init(const struct nft_ctx *ctx,
 		break;
 	case NFT_META_IIFNAME:
 	case NFT_META_OIFNAME:
+	case NFT_META_IIFKIND:
+	case NFT_META_OIFKIND:
 		len = IFNAMSIZ;
 		break;
 	case NFT_META_PRANDOM:
-- 
cgit v1.2.3-59-g8ed1b


From 0123a75e1d57c3df31e536868339c98c02c14917 Mon Sep 17 00:00:00 2001
From: Laura Garcia Liebana <nevola@gmail.com>
Date: Fri, 18 Jan 2019 14:36:29 +0100
Subject: Revert "netfilter: nft_hash: add map lookups for hashing operations"

A better way to implement this from userspace has been found without
specific code in the kernel side, revert this.

Fixes: b9ccc07e3f31 ("netfilter: nft_hash: add map lookups for hashing operations")
Signed-off-by: Laura Garcia Liebana <nevola@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |   4 +-
 net/netfilter/nft_hash.c                 | 121 -------------------------------
 2 files changed, 2 insertions(+), 123 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 0ba8f48bdf0b..030302893d96 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -877,8 +877,8 @@ enum nft_hash_attributes {
 	NFTA_HASH_SEED,
 	NFTA_HASH_OFFSET,
 	NFTA_HASH_TYPE,
-	NFTA_HASH_SET_NAME,
-	NFTA_HASH_SET_ID,
+	NFTA_HASH_SET_NAME,	/* deprecated */
+	NFTA_HASH_SET_ID,	/* deprecated */
 	__NFTA_HASH_MAX,
 };
 #define NFTA_HASH_MAX	(__NFTA_HASH_MAX - 1)
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index c2d237144f74..ea658e6c53e3 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -25,7 +25,6 @@ struct nft_jhash {
 	u32			modulus;
 	u32			seed;
 	u32			offset;
-	struct nft_set		*map;
 };
 
 static void nft_jhash_eval(const struct nft_expr *expr,
@@ -42,33 +41,10 @@ static void nft_jhash_eval(const struct nft_expr *expr,
 	regs->data[priv->dreg] = h + priv->offset;
 }
 
-static void nft_jhash_map_eval(const struct nft_expr *expr,
-			       struct nft_regs *regs,
-			       const struct nft_pktinfo *pkt)
-{
-	struct nft_jhash *priv = nft_expr_priv(expr);
-	const void *data = &regs->data[priv->sreg];
-	const struct nft_set *map = priv->map;
-	const struct nft_set_ext *ext;
-	u32 result;
-	bool found;
-
-	result = reciprocal_scale(jhash(data, priv->len, priv->seed),
-					priv->modulus) + priv->offset;
-
-	found = map->ops->lookup(nft_net(pkt), map, &result, &ext);
-	if (!found)
-		return;
-
-	nft_data_copy(&regs->data[priv->dreg],
-		      nft_set_ext_data(ext), map->dlen);
-}
-
 struct nft_symhash {
 	enum nft_registers      dreg:8;
 	u32			modulus;
 	u32			offset;
-	struct nft_set		*map;
 };
 
 static void nft_symhash_eval(const struct nft_expr *expr,
@@ -84,28 +60,6 @@ static void nft_symhash_eval(const struct nft_expr *expr,
 	regs->data[priv->dreg] = h + priv->offset;
 }
 
-static void nft_symhash_map_eval(const struct nft_expr *expr,
-				 struct nft_regs *regs,
-				 const struct nft_pktinfo *pkt)
-{
-	struct nft_symhash *priv = nft_expr_priv(expr);
-	struct sk_buff *skb = pkt->skb;
-	const struct nft_set *map = priv->map;
-	const struct nft_set_ext *ext;
-	u32 result;
-	bool found;
-
-	result = reciprocal_scale(__skb_get_hash_symmetric(skb),
-				  priv->modulus) + priv->offset;
-
-	found = map->ops->lookup(nft_net(pkt), map, &result, &ext);
-	if (!found)
-		return;
-
-	nft_data_copy(&regs->data[priv->dreg],
-		      nft_set_ext_data(ext), map->dlen);
-}
-
 static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = {
 	[NFTA_HASH_SREG]	= { .type = NLA_U32 },
 	[NFTA_HASH_DREG]	= { .type = NLA_U32 },
@@ -114,9 +68,6 @@ static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = {
 	[NFTA_HASH_SEED]	= { .type = NLA_U32 },
 	[NFTA_HASH_OFFSET]	= { .type = NLA_U32 },
 	[NFTA_HASH_TYPE]	= { .type = NLA_U32 },
-	[NFTA_HASH_SET_NAME]	= { .type = NLA_STRING,
-				    .len = NFT_SET_MAXNAMELEN - 1 },
-	[NFTA_HASH_SET_ID]	= { .type = NLA_U32 },
 };
 
 static int nft_jhash_init(const struct nft_ctx *ctx,
@@ -166,20 +117,6 @@ static int nft_jhash_init(const struct nft_ctx *ctx,
 					   NFT_DATA_VALUE, sizeof(u32));
 }
 
-static int nft_jhash_map_init(const struct nft_ctx *ctx,
-			      const struct nft_expr *expr,
-			      const struct nlattr * const tb[])
-{
-	struct nft_jhash *priv = nft_expr_priv(expr);
-	u8 genmask = nft_genmask_next(ctx->net);
-
-	nft_jhash_init(ctx, expr, tb);
-	priv->map = nft_set_lookup_global(ctx->net, ctx->table,
-					  tb[NFTA_HASH_SET_NAME],
-					  tb[NFTA_HASH_SET_ID], genmask);
-	return PTR_ERR_OR_ZERO(priv->map);
-}
-
 static int nft_symhash_init(const struct nft_ctx *ctx,
 			    const struct nft_expr *expr,
 			    const struct nlattr * const tb[])
@@ -206,20 +143,6 @@ static int nft_symhash_init(const struct nft_ctx *ctx,
 					   NFT_DATA_VALUE, sizeof(u32));
 }
 
-static int nft_symhash_map_init(const struct nft_ctx *ctx,
-				const struct nft_expr *expr,
-				const struct nlattr * const tb[])
-{
-	struct nft_jhash *priv = nft_expr_priv(expr);
-	u8 genmask = nft_genmask_next(ctx->net);
-
-	nft_symhash_init(ctx, expr, tb);
-	priv->map = nft_set_lookup_global(ctx->net, ctx->table,
-					  tb[NFTA_HASH_SET_NAME],
-					  tb[NFTA_HASH_SET_ID], genmask);
-	return PTR_ERR_OR_ZERO(priv->map);
-}
-
 static int nft_jhash_dump(struct sk_buff *skb,
 			  const struct nft_expr *expr)
 {
@@ -247,18 +170,6 @@ nla_put_failure:
 	return -1;
 }
 
-static int nft_jhash_map_dump(struct sk_buff *skb,
-			       const struct nft_expr *expr)
-{
-	const struct nft_jhash *priv = nft_expr_priv(expr);
-
-	if (nft_jhash_dump(skb, expr) ||
-	    nla_put_string(skb, NFTA_HASH_SET_NAME, priv->map->name))
-		return -1;
-
-	return 0;
-}
-
 static int nft_symhash_dump(struct sk_buff *skb,
 			    const struct nft_expr *expr)
 {
@@ -279,18 +190,6 @@ nla_put_failure:
 	return -1;
 }
 
-static int nft_symhash_map_dump(struct sk_buff *skb,
-				const struct nft_expr *expr)
-{
-	const struct nft_symhash *priv = nft_expr_priv(expr);
-
-	if (nft_symhash_dump(skb, expr) ||
-	    nla_put_string(skb, NFTA_HASH_SET_NAME, priv->map->name))
-		return -1;
-
-	return 0;
-}
-
 static struct nft_expr_type nft_hash_type;
 static const struct nft_expr_ops nft_jhash_ops = {
 	.type		= &nft_hash_type,
@@ -300,14 +199,6 @@ static const struct nft_expr_ops nft_jhash_ops = {
 	.dump		= nft_jhash_dump,
 };
 
-static const struct nft_expr_ops nft_jhash_map_ops = {
-	.type		= &nft_hash_type,
-	.size		= NFT_EXPR_SIZE(sizeof(struct nft_jhash)),
-	.eval		= nft_jhash_map_eval,
-	.init		= nft_jhash_map_init,
-	.dump		= nft_jhash_map_dump,
-};
-
 static const struct nft_expr_ops nft_symhash_ops = {
 	.type		= &nft_hash_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_symhash)),
@@ -316,14 +207,6 @@ static const struct nft_expr_ops nft_symhash_ops = {
 	.dump		= nft_symhash_dump,
 };
 
-static const struct nft_expr_ops nft_symhash_map_ops = {
-	.type		= &nft_hash_type,
-	.size		= NFT_EXPR_SIZE(sizeof(struct nft_symhash)),
-	.eval		= nft_symhash_map_eval,
-	.init		= nft_symhash_map_init,
-	.dump		= nft_symhash_map_dump,
-};
-
 static const struct nft_expr_ops *
 nft_hash_select_ops(const struct nft_ctx *ctx,
 		    const struct nlattr * const tb[])
@@ -336,12 +219,8 @@ nft_hash_select_ops(const struct nft_ctx *ctx,
 	type = ntohl(nla_get_be32(tb[NFTA_HASH_TYPE]));
 	switch (type) {
 	case NFT_HASH_SYM:
-		if (tb[NFTA_HASH_SET_NAME])
-			return &nft_symhash_map_ops;
 		return &nft_symhash_ops;
 	case NFT_HASH_JENKINS:
-		if (tb[NFTA_HASH_SET_NAME])
-			return &nft_jhash_map_ops;
 		return &nft_jhash_ops;
 	default:
 		break;
-- 
cgit v1.2.3-59-g8ed1b


From 08cba016cdbe4f6bf0200366194a768cfef58edd Mon Sep 17 00:00:00 2001
From: Paul Kocialkowski <paul.kocialkowski@bootlin.com>
Date: Fri, 18 Jan 2019 15:51:21 +0100
Subject: drm/fourcc: Add definitions for Allwinner vendor and VPU tiled format

This introduces specific definitions for vendor Allwinner and its
associated tiled format modifier. This modifier is used for the output
format of the VPU, that can be imported directly with the display
engine hardware supported by the sun4i-drm driver.

Signed-off-by: Paul Kocialkowski <paul.kocialkowski@bootlin.com>
Reviewed-by: Maxime Ripard <maxime.ripard@bootlin.com>
Signed-off-by: Maxime Ripard <maxime.ripard@bootlin.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190118145133.21281-12-paul.kocialkowski@bootlin.com
---
 include/uapi/drm/drm_fourcc.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h
index 41106c835747..91d08a23f125 100644
--- a/include/uapi/drm/drm_fourcc.h
+++ b/include/uapi/drm/drm_fourcc.h
@@ -238,6 +238,8 @@ extern "C" {
 #define DRM_FORMAT_MOD_VENDOR_VIVANTE 0x06
 #define DRM_FORMAT_MOD_VENDOR_BROADCOM 0x07
 #define DRM_FORMAT_MOD_VENDOR_ARM     0x08
+#define DRM_FORMAT_MOD_VENDOR_ALLWINNER 0x09
+
 /* add more to the end as needed */
 
 #define DRM_FORMAT_RESERVED	      ((1ULL << 56) - 1)
@@ -667,6 +669,20 @@ extern "C" {
  */
 #define AFBC_FORMAT_MOD_BCH     (1ULL << 11)
 
+/*
+ * Allwinner tiled modifier
+ *
+ * This tiling mode is implemented by the VPU found on all Allwinner platforms,
+ * codenamed sunxi. It is associated with a YUV format that uses either 2 or 3
+ * planes.
+ *
+ * With this tiling, the luminance samples are disposed in tiles representing
+ * 32x32 pixels and the chrominance samples in tiles representing 32x64 pixels.
+ * The pixel order in each tile is linear and the tiles are disposed linearly,
+ * both in row-major order.
+ */
+#define DRM_FORMAT_MOD_ALLWINNER_TILED fourcc_mod_code(ALLWINNER, 1)
+
 #if defined(__cplusplus)
 }
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From f88c19aab5f34835f1ba467c5b508ec4f782f07f Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Thu, 17 Jan 2019 12:44:25 -0800
Subject: net_sched: add hit counter for matchall

Although matchall always matches packets, however, it still
relies on a protocol match first. So it is still useful to have
such a counter for matchall. Of course, unlike u32, every time
we hit a matchall filter, it is always a success, so we don't
have to distinguish them.

Sample output:

filter protocol 802.1Q pref 100 matchall chain 0
filter protocol 802.1Q pref 100 matchall chain 0 handle 0x1
  not_in_hw (rule hit 10)
	action order 1: vlan  pop continue
	 index 1 ref 1 bind 1 installed 40 sec used 1 sec
	Action statistics:
	Sent 836 bytes 10 pkt (dropped 0, overlimits 0 requeues 0)
	backlog 0b 0p requeues 0

Reported-by: Martin Olsson <martin.olsson+netdev@sentorsecurity.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_cls.h |  6 ++++++
 net/sched/cls_matchall.c     | 24 ++++++++++++++++++++++++
 2 files changed, 30 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 95d0db2a8350..32a3416b51c3 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -527,11 +527,17 @@ enum {
 
 /* Match-all classifier */
 
+struct tc_matchall_pcnt {
+	__u64 rhit;
+};
+
 enum {
 	TCA_MATCHALL_UNSPEC,
 	TCA_MATCHALL_CLASSID,
 	TCA_MATCHALL_ACT,
 	TCA_MATCHALL_FLAGS,
+	TCA_MATCHALL_PCNT,
+	TCA_MATCHALL_PAD,
 	__TCA_MATCHALL_MAX,
 };
 
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 0e408ee9dcec..a1b803fd372e 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -12,6 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/percpu.h>
 
 #include <net/sch_generic.h>
 #include <net/pkt_cls.h>
@@ -22,6 +23,7 @@ struct cls_mall_head {
 	u32 handle;
 	u32 flags;
 	unsigned int in_hw_count;
+	struct tc_matchall_pcnt __percpu *pf;
 	struct rcu_work rwork;
 };
 
@@ -34,6 +36,7 @@ static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 		return -1;
 
 	*res = head->res;
+	__this_cpu_inc(head->pf->rhit);
 	return tcf_exts_exec(skb, &head->exts, res);
 }
 
@@ -46,6 +49,7 @@ static void __mall_destroy(struct cls_mall_head *head)
 {
 	tcf_exts_destroy(&head->exts);
 	tcf_exts_put_net(&head->exts);
+	free_percpu(head->pf);
 	kfree(head);
 }
 
@@ -192,6 +196,11 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
 		handle = 1;
 	new->handle = handle;
 	new->flags = flags;
+	new->pf = alloc_percpu(struct tc_matchall_pcnt);
+	if (!new->pf) {
+		err = -ENOMEM;
+		goto err_alloc_percpu;
+	}
 
 	err = mall_set_parms(net, tp, new, base, tb, tca[TCA_RATE], ovr,
 			     extack);
@@ -214,6 +223,8 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
 
 err_replace_hw_filter:
 err_set_parms:
+	free_percpu(new->pf);
+err_alloc_percpu:
 	tcf_exts_destroy(&new->exts);
 err_exts_init:
 	kfree(new);
@@ -270,8 +281,10 @@ static int mall_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
 static int mall_dump(struct net *net, struct tcf_proto *tp, void *fh,
 		     struct sk_buff *skb, struct tcmsg *t)
 {
+	struct tc_matchall_pcnt gpf = {};
 	struct cls_mall_head *head = fh;
 	struct nlattr *nest;
+	int cpu;
 
 	if (!head)
 		return skb->len;
@@ -289,6 +302,17 @@ static int mall_dump(struct net *net, struct tcf_proto *tp, void *fh,
 	if (head->flags && nla_put_u32(skb, TCA_MATCHALL_FLAGS, head->flags))
 		goto nla_put_failure;
 
+	for_each_possible_cpu(cpu) {
+		struct tc_matchall_pcnt *pf = per_cpu_ptr(head->pf, cpu);
+
+		gpf.rhit += pf->rhit;
+	}
+
+	if (nla_put_64bit(skb, TCA_MATCHALL_PCNT,
+			  sizeof(struct tc_matchall_pcnt),
+			  &gpf, TCA_MATCHALL_PAD))
+		goto nla_put_failure;
+
 	if (tcf_exts_dump(skb, &head->exts))
 		goto nla_put_failure;
 
-- 
cgit v1.2.3-59-g8ed1b


From cb5ccfbe73b389470e1dc11061bb185ef4bc9aec Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Thu, 17 Jan 2019 23:59:10 +0200
Subject: devlink: Add health buffer support

Devlink health buffer is a mechanism to pass descriptors between drivers
and devlink. The API allows the driver to add objects, object pair,
value array (nested attributes), value and name.

Driver can use this API to fill the buffers in a format which can be
translated by the devlink to the netlink message.

In order to fulfill it, an internal buffer descriptor is defined. This
will hold the data and metadata per each attribute and by used to pass
actual commands to the netlink.

This mechanism will be later used in devlink health for dump and diagnose
data store by the drivers.

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Reviewed-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h        |  76 +++++++
 include/uapi/linux/devlink.h |   8 +
 net/core/devlink.c           | 501 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 585 insertions(+)

(limited to 'include/uapi')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 67f4293bc970..77c77319290a 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -423,6 +423,8 @@ struct devlink_region;
 
 typedef void devlink_snapshot_data_dest_t(const void *data);
 
+struct devlink_health_buffer;
+
 struct devlink_ops {
 	int (*reload)(struct devlink *devlink, struct netlink_ext_ack *extack);
 	int (*port_type_set)(struct devlink_port *devlink_port,
@@ -584,6 +586,22 @@ int devlink_region_snapshot_create(struct devlink_region *region, u64 data_len,
 				   u8 *data, u32 snapshot_id,
 				   devlink_snapshot_data_dest_t *data_destructor);
 
+int devlink_health_buffer_nest_start(struct devlink_health_buffer *buffer,
+				     int attrtype);
+void devlink_health_buffer_nest_end(struct devlink_health_buffer *buffer);
+void devlink_health_buffer_nest_cancel(struct devlink_health_buffer *buffer);
+int devlink_health_buffer_put_object_name(struct devlink_health_buffer *buffer,
+					  char *name);
+int devlink_health_buffer_put_value_u8(struct devlink_health_buffer *buffer,
+				       u8 value);
+int devlink_health_buffer_put_value_u32(struct devlink_health_buffer *buffer,
+					u32 value);
+int devlink_health_buffer_put_value_u64(struct devlink_health_buffer *buffer,
+					u64 value);
+int devlink_health_buffer_put_value_string(struct devlink_health_buffer *buffer,
+					   char *name);
+int devlink_health_buffer_put_value_data(struct devlink_health_buffer *buffer,
+					 void *data, int len);
 #else
 
 static inline struct devlink *devlink_alloc(const struct devlink_ops *ops,
@@ -844,6 +862,64 @@ devlink_region_snapshot_create(struct devlink_region *region, u64 data_len,
 	return 0;
 }
 
+static inline int
+devlink_health_buffer_nest_start(struct devlink_health_buffer *buffer,
+				 int attrtype)
+{
+	return 0;
+}
+
+static inline void
+devlink_health_buffer_nest_end(struct devlink_health_buffer *buffer)
+{
+}
+
+static inline void
+devlink_health_buffer_nest_cancel(struct devlink_health_buffer *buffer)
+{
+}
+
+static inline int
+devlink_health_buffer_put_object_name(struct devlink_health_buffer *buffer,
+				      char *name)
+{
+	return 0;
+}
+
+static inline int
+devlink_health_buffer_put_value_u8(struct devlink_health_buffer *buffer,
+				   u8 value)
+{
+	return 0;
+}
+
+static inline int
+devlink_health_buffer_put_value_u32(struct devlink_health_buffer *buffer,
+				    u32 value)
+{
+	return 0;
+}
+
+static inline int
+devlink_health_buffer_put_value_u64(struct devlink_health_buffer *buffer,
+				    u64 value)
+{
+	return 0;
+}
+
+static inline int
+devlink_health_buffer_put_value_string(struct devlink_health_buffer *buffer,
+				       char *name)
+{
+	return 0;
+}
+
+static inline int
+devlink_health_buffer_put_value_data(struct devlink_health_buffer *buffer,
+				     void *data, int len)
+{
+	return 0;
+}
 #endif
 
 #endif /* _NET_DEVLINK_H_ */
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 6e52d3660654..cff0e0cb5ac2 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -285,6 +285,14 @@ enum devlink_attr {
 	DEVLINK_ATTR_REGION_CHUNK_ADDR,         /* u64 */
 	DEVLINK_ATTR_REGION_CHUNK_LEN,          /* u64 */
 
+	DEVLINK_ATTR_HEALTH_BUFFER_OBJECT,		/* nested */
+	DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR,		/* nested */
+	DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_NAME,		/* string */
+	DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE,	/* nested */
+	DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_ARRAY,	/* nested */
+	DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE,	/* u8 */
+	DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA,	/* dynamic */
+
 	/* add new attributes above here, update the policy in devlink.c */
 
 	__DEVLINK_ATTR_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index abb0da9d7b4b..8984501edade 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -3597,6 +3597,507 @@ out:
 	return 0;
 }
 
+#define DEVLINK_HEALTH_BUFFER_SIZE (4096 - GENL_HDRLEN)
+#define DEVLINK_HEALTH_BUFFER_DATA_SIZE (DEVLINK_HEALTH_BUFFER_SIZE / 2)
+#define DEVLINK_HEALTH_SIZE_TO_BUFFERS(size) DIV_ROUND_UP(size, DEVLINK_HEALTH_BUFFER_DATA_SIZE)
+#define DEVLINK_HEALTH_BUFFER_MAX_CHUNK 1024
+
+struct devlink_health_buffer {
+	void *data;
+	u64 offset;
+	u64 bytes_left;
+	u64 bytes_left_metadata;
+	u64 max_nested_depth;
+	u64 curr_nest;
+};
+
+struct devlink_health_buffer_desc {
+	int attrtype;
+	u16 len;
+	u8 nla_type;
+	u8 nest_end;
+	int value[0];
+};
+
+static void
+devlink_health_buffers_reset(struct devlink_health_buffer **buffers_list,
+			     u64 num_of_buffers)
+{
+	u64 i;
+
+	for (i = 0; i < num_of_buffers; i++) {
+		memset(buffers_list[i]->data, 0, DEVLINK_HEALTH_BUFFER_SIZE);
+		buffers_list[i]->offset = 0;
+		buffers_list[i]->bytes_left = DEVLINK_HEALTH_BUFFER_DATA_SIZE;
+		buffers_list[i]->bytes_left_metadata =
+			DEVLINK_HEALTH_BUFFER_DATA_SIZE;
+		buffers_list[i]->max_nested_depth = 0;
+		buffers_list[i]->curr_nest = 0;
+	}
+}
+
+static void
+devlink_health_buffers_destroy(struct devlink_health_buffer **buffers_list,
+			       u64 size);
+
+static struct devlink_health_buffer **
+devlink_health_buffers_create(u64 size)
+{
+	struct devlink_health_buffer **buffers_list;
+	u64 num_of_buffers = DEVLINK_HEALTH_SIZE_TO_BUFFERS(size);
+	u64 i;
+
+	buffers_list = kcalloc(num_of_buffers,
+			       sizeof(struct devlink_health_buffer *),
+			       GFP_KERNEL);
+	if (!buffers_list)
+		return NULL;
+
+	for (i = 0; i < num_of_buffers; i++) {
+		struct devlink_health_buffer *buffer;
+		void *data;
+
+		buffer = kzalloc(sizeof(*buffer), GFP_KERNEL);
+		data = kzalloc(DEVLINK_HEALTH_BUFFER_SIZE, GFP_KERNEL);
+		if (!buffer || !data) {
+			kfree(buffer);
+			kfree(data);
+			goto buffers_cleanup;
+		}
+		buffers_list[i] = buffer;
+		buffer->data = data;
+	}
+	devlink_health_buffers_reset(buffers_list, num_of_buffers);
+
+	return buffers_list;
+
+buffers_cleanup:
+	devlink_health_buffers_destroy(buffers_list, --i);
+	kfree(buffers_list);
+	return NULL;
+}
+
+static void
+devlink_health_buffers_destroy(struct devlink_health_buffer **buffers_list,
+			       u64 num_of_buffers)
+{
+	u64 i;
+
+	for (i = 0; i < num_of_buffers; i++) {
+		kfree(buffers_list[i]->data);
+		kfree(buffers_list[i]);
+	}
+}
+
+void
+devlink_health_buffer_offset_inc(struct devlink_health_buffer *buffer,
+				 int len)
+{
+	buffer->offset += len;
+}
+
+/* In order to store a nest, need two descriptors, for start and end */
+#define DEVLINK_HEALTH_BUFFER_NEST_SIZE (sizeof(struct devlink_health_buffer_desc) * 2)
+
+int devlink_health_buffer_verify_len(struct devlink_health_buffer *buffer,
+				     int len, int metadata_len)
+{
+	if (len > DEVLINK_HEALTH_BUFFER_DATA_SIZE)
+		return -EINVAL;
+
+	if (buffer->bytes_left < len ||
+	    buffer->bytes_left_metadata < metadata_len)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static struct devlink_health_buffer_desc *
+devlink_health_buffer_get_desc_from_offset(struct devlink_health_buffer *buffer)
+{
+	return buffer->data + buffer->offset;
+}
+
+int
+devlink_health_buffer_nest_start(struct devlink_health_buffer *buffer,
+				 int attrtype)
+{
+	struct devlink_health_buffer_desc *desc;
+	int err;
+
+	err = devlink_health_buffer_verify_len(buffer, 0,
+					       DEVLINK_HEALTH_BUFFER_NEST_SIZE);
+	if (err)
+		return err;
+
+	if (attrtype != DEVLINK_ATTR_HEALTH_BUFFER_OBJECT &&
+	    attrtype != DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR &&
+	    attrtype != DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE &&
+	    attrtype != DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_ARRAY)
+		return -EINVAL;
+
+	desc = devlink_health_buffer_get_desc_from_offset(buffer);
+
+	desc->attrtype = attrtype;
+	buffer->bytes_left_metadata -= DEVLINK_HEALTH_BUFFER_NEST_SIZE;
+	devlink_health_buffer_offset_inc(buffer, sizeof(*desc));
+
+	buffer->curr_nest++;
+	buffer->max_nested_depth = max(buffer->max_nested_depth,
+				       buffer->curr_nest);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_health_buffer_nest_start);
+
+enum devlink_health_buffer_nest_end_cancel {
+	DEVLINK_HEALTH_BUFFER_NEST_END = 1,
+	DEVLINK_HEALTH_BUFFER_NEST_CANCEL,
+};
+
+static void
+devlink_health_buffer_nest_end_cancel(struct devlink_health_buffer *buffer,
+				      enum devlink_health_buffer_nest_end_cancel nest)
+{
+	struct devlink_health_buffer_desc *desc;
+
+	WARN_ON(!buffer->curr_nest);
+	buffer->curr_nest--;
+
+	desc = devlink_health_buffer_get_desc_from_offset(buffer);
+	desc->nest_end = nest;
+	devlink_health_buffer_offset_inc(buffer, sizeof(*desc));
+}
+
+void devlink_health_buffer_nest_end(struct devlink_health_buffer *buffer)
+{
+	devlink_health_buffer_nest_end_cancel(buffer,
+					      DEVLINK_HEALTH_BUFFER_NEST_END);
+}
+EXPORT_SYMBOL_GPL(devlink_health_buffer_nest_end);
+
+void devlink_health_buffer_nest_cancel(struct devlink_health_buffer *buffer)
+{
+	devlink_health_buffer_nest_end_cancel(buffer,
+					      DEVLINK_HEALTH_BUFFER_NEST_CANCEL);
+}
+EXPORT_SYMBOL_GPL(devlink_health_buffer_nest_cancel);
+
+int
+devlink_health_buffer_put_object_name(struct devlink_health_buffer *buffer,
+				      char *name)
+{
+	struct devlink_health_buffer_desc *desc;
+	int err;
+
+	err = devlink_health_buffer_verify_len(buffer, strlen(name) + 1,
+					       sizeof(*desc));
+	if (err)
+		return err;
+
+	desc = devlink_health_buffer_get_desc_from_offset(buffer);
+	desc->attrtype = DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_NAME;
+	desc->nla_type = NLA_NUL_STRING;
+	desc->len = strlen(name) + 1;
+	memcpy(&desc->value, name, desc->len);
+	devlink_health_buffer_offset_inc(buffer, sizeof(*desc) + desc->len);
+
+	buffer->bytes_left_metadata -= sizeof(*desc);
+	buffer->bytes_left -= (strlen(name) + 1);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_health_buffer_put_object_name);
+
+static int
+devlink_health_buffer_put_value(struct devlink_health_buffer *buffer,
+				u8 nla_type, void *value, int len)
+{
+	struct devlink_health_buffer_desc *desc;
+	int err;
+
+	err = devlink_health_buffer_verify_len(buffer, len, sizeof(*desc));
+	if (err)
+		return err;
+
+	desc = devlink_health_buffer_get_desc_from_offset(buffer);
+	desc->attrtype = DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA;
+	desc->nla_type = nla_type;
+	desc->len = len;
+	memcpy(&desc->value, value, len);
+	devlink_health_buffer_offset_inc(buffer, sizeof(*desc) + desc->len);
+
+	buffer->bytes_left_metadata -= sizeof(*desc);
+	buffer->bytes_left -= len;
+
+	return 0;
+}
+
+int
+devlink_health_buffer_put_value_u8(struct devlink_health_buffer *buffer,
+				   u8 value)
+{
+	int err;
+
+	err = devlink_health_buffer_put_value(buffer, NLA_U8, &value,
+					      sizeof(value));
+	if (err)
+		return err;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_health_buffer_put_value_u8);
+
+int
+devlink_health_buffer_put_value_u32(struct devlink_health_buffer *buffer,
+				    u32 value)
+{
+	int err;
+
+	err = devlink_health_buffer_put_value(buffer, NLA_U32, &value,
+					      sizeof(value));
+	if (err)
+		return err;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_health_buffer_put_value_u32);
+
+int
+devlink_health_buffer_put_value_u64(struct devlink_health_buffer *buffer,
+				    u64 value)
+{
+	int err;
+
+	err = devlink_health_buffer_put_value(buffer, NLA_U64, &value,
+					      sizeof(value));
+	if (err)
+		return err;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_health_buffer_put_value_u64);
+
+int
+devlink_health_buffer_put_value_string(struct devlink_health_buffer *buffer,
+				       char *name)
+{
+	int err;
+
+	if (strlen(name) + 1 > DEVLINK_HEALTH_BUFFER_MAX_CHUNK)
+		return -EINVAL;
+
+	err = devlink_health_buffer_put_value(buffer, NLA_NUL_STRING, name,
+					      strlen(name) + 1);
+	if (err)
+		return err;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_health_buffer_put_value_string);
+
+int
+devlink_health_buffer_put_value_data(struct devlink_health_buffer *buffer,
+				     void *data, int len)
+{
+	int err;
+
+	if (len > DEVLINK_HEALTH_BUFFER_MAX_CHUNK)
+		return -EINVAL;
+
+	err = devlink_health_buffer_put_value(buffer, NLA_BINARY, data, len);
+	if (err)
+		return err;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_health_buffer_put_value_data);
+
+static int
+devlink_health_buffer_fill_data(struct sk_buff *skb,
+				struct devlink_health_buffer_desc *desc)
+{
+	int err = -EINVAL;
+
+	switch (desc->nla_type) {
+	case NLA_U8:
+		err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA,
+				 *(u8 *)desc->value);
+		break;
+	case NLA_U32:
+		err = nla_put_u32(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA,
+				  *(u32 *)desc->value);
+		break;
+	case NLA_U64:
+		err = nla_put_u64_64bit(skb,
+					DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA,
+					*(u64 *)desc->value, DEVLINK_ATTR_PAD);
+		break;
+	case NLA_NUL_STRING:
+		err = nla_put_string(skb,
+				     DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA,
+				     (char *)&desc->value);
+		break;
+	case NLA_BINARY:
+		err = nla_put(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA,
+			      desc->len, (void *)&desc->value);
+		break;
+	}
+
+	return err;
+}
+
+static int
+devlink_health_buffer_fill_type(struct sk_buff *skb,
+				struct devlink_health_buffer_desc *desc)
+{
+	int err = -EINVAL;
+
+	switch (desc->nla_type) {
+	case NLA_U8:
+		err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE,
+				 NLA_U8);
+		break;
+	case NLA_U32:
+		err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE,
+				 NLA_U32);
+		break;
+	case NLA_U64:
+		err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE,
+				 NLA_U64);
+		break;
+	case NLA_NUL_STRING:
+		err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE,
+				 NLA_NUL_STRING);
+		break;
+	case NLA_BINARY:
+		err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE,
+				 NLA_BINARY);
+		break;
+	}
+
+	return err;
+}
+
+static inline struct devlink_health_buffer_desc *
+devlink_health_buffer_get_next_desc(struct devlink_health_buffer_desc *desc)
+{
+	return (void *)&desc->value + desc->len;
+}
+
+static int
+devlink_health_buffer_prepare_skb(struct sk_buff *skb,
+				  struct devlink_health_buffer *buffer)
+{
+	struct devlink_health_buffer_desc *last_desc, *desc;
+	struct nlattr **buffer_nlattr;
+	int err;
+	int i = 0;
+
+	buffer_nlattr = kcalloc(buffer->max_nested_depth,
+				sizeof(*buffer_nlattr), GFP_KERNEL);
+	if (!buffer_nlattr)
+		return -EINVAL;
+
+	last_desc = devlink_health_buffer_get_desc_from_offset(buffer);
+	desc = buffer->data;
+	while (desc != last_desc) {
+		switch (desc->attrtype) {
+		case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT:
+		case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR:
+		case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE:
+		case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_ARRAY:
+			buffer_nlattr[i] = nla_nest_start(skb, desc->attrtype);
+			if (!buffer_nlattr[i])
+				goto nla_put_failure;
+			i++;
+			break;
+		case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA:
+			err = devlink_health_buffer_fill_data(skb, desc);
+			if (err)
+				goto nla_put_failure;
+			err = devlink_health_buffer_fill_type(skb, desc);
+			if (err)
+				goto nla_put_failure;
+			break;
+		case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_NAME:
+			err = nla_put_string(skb, desc->attrtype,
+					     (char *)&desc->value);
+			if (err)
+				goto nla_put_failure;
+			break;
+		default:
+			WARN_ON(!desc->nest_end);
+			WARN_ON(i <= 0);
+			if (desc->nest_end == DEVLINK_HEALTH_BUFFER_NEST_END)
+				nla_nest_end(skb, buffer_nlattr[--i]);
+			else
+				nla_nest_cancel(skb, buffer_nlattr[--i]);
+			break;
+		}
+		desc = devlink_health_buffer_get_next_desc(desc);
+	}
+
+	return 0;
+
+nla_put_failure:
+	kfree(buffer_nlattr);
+	return err;
+}
+
+static int
+devlink_health_buffer_snd(struct genl_info *info,
+			  enum devlink_command cmd, int flags,
+			  struct devlink_health_buffer **buffers_array,
+			  u64 num_of_buffers)
+{
+	struct sk_buff *skb;
+	struct nlmsghdr *nlh;
+	void *hdr;
+	int err;
+	u64 i;
+
+	for (i = 0; i < num_of_buffers; i++) {
+		/* Skip buffer if driver did not fill it up with any data */
+		if (!buffers_array[i]->offset)
+			continue;
+
+		skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+		if (!skb)
+			return -ENOMEM;
+
+		hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
+				  &devlink_nl_family, NLM_F_MULTI, cmd);
+		if (!hdr)
+			goto nla_put_failure;
+
+		err = devlink_health_buffer_prepare_skb(skb, buffers_array[i]);
+		if (err)
+			goto nla_put_failure;
+
+		genlmsg_end(skb, hdr);
+		err = genlmsg_reply(skb, info);
+		if (err)
+			return err;
+	}
+
+	skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+	nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq,
+			NLMSG_DONE, 0, flags | NLM_F_MULTI);
+	err = genlmsg_reply(skb, info);
+	if (err)
+		return err;
+
+	return 0;
+
+nla_put_failure:
+	err = -EIO;
+	nlmsg_free(skb);
+	return err;
+}
+
 static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
-- 
cgit v1.2.3-59-g8ed1b


From ff253fedab961b22117a73ab808fcfa9e6852b50 Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Thu, 17 Jan 2019 23:59:13 +0200
Subject: devlink: Add health get command

Add devlink health get command to provide reporter/s data for user space.
Add the ability to get data per reporter or dump data from all available
reporters.

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Reviewed-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/devlink.h |  12 ++++
 net/core/devlink.c           | 152 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 164 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index cff0e0cb5ac2..c05470578b99 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -89,6 +89,8 @@ enum devlink_command {
 	DEVLINK_CMD_REGION_DEL,
 	DEVLINK_CMD_REGION_READ,
 
+	DEVLINK_CMD_HEALTH_REPORTER_GET,
+
 	/* add new commands above here */
 	__DEVLINK_CMD_MAX,
 	DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1
@@ -293,6 +295,16 @@ enum devlink_attr {
 	DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE,	/* u8 */
 	DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA,	/* dynamic */
 
+	DEVLINK_ATTR_HEALTH_REPORTER,			/* nested */
+	DEVLINK_ATTR_HEALTH_REPORTER_NAME,		/* string */
+	DEVLINK_ATTR_HEALTH_REPORTER_STATE,		/* u8 */
+	DEVLINK_ATTR_HEALTH_REPORTER_ERR,		/* u64 */
+	DEVLINK_ATTR_HEALTH_REPORTER_RECOVER,		/* u64 */
+	DEVLINK_ATTR_HEALTH_REPORTER_DUMP_AVAIL,	/* u8 */
+	DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS,		/* u64 */
+	DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD,	/* u64 */
+	DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER,	/* u8 */
+
 	/* add new attributes above here, update the policy in devlink.c */
 
 	__DEVLINK_ATTR_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 943d3e7dea6a..2ba9275449c2 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -4317,6 +4317,149 @@ int devlink_health_report(struct devlink_health_reporter *reporter,
 }
 EXPORT_SYMBOL_GPL(devlink_health_report);
 
+static struct devlink_health_reporter *
+devlink_health_reporter_get_from_info(struct devlink *devlink,
+				      struct genl_info *info)
+{
+	char *reporter_name;
+
+	if (!info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME])
+		return NULL;
+
+	reporter_name =
+		nla_data(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]);
+	return devlink_health_reporter_find_by_name(devlink, reporter_name);
+}
+
+static int
+devlink_nl_health_reporter_fill(struct sk_buff *msg,
+				struct devlink *devlink,
+				struct devlink_health_reporter *reporter,
+				enum devlink_command cmd, u32 portid,
+				u32 seq, int flags)
+{
+	struct nlattr *reporter_attr;
+	void *hdr;
+
+	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (devlink_nl_put_handle(msg, devlink))
+		goto genlmsg_cancel;
+
+	reporter_attr = nla_nest_start(msg, DEVLINK_ATTR_HEALTH_REPORTER);
+	if (!reporter_attr)
+		goto genlmsg_cancel;
+	if (nla_put_string(msg, DEVLINK_ATTR_HEALTH_REPORTER_NAME,
+			   reporter->ops->name))
+		goto reporter_nest_cancel;
+	if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_STATE,
+		       reporter->health_state))
+		goto reporter_nest_cancel;
+	if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_ERR,
+			      reporter->error_count, DEVLINK_ATTR_PAD))
+		goto reporter_nest_cancel;
+	if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_RECOVER,
+			      reporter->recovery_count, DEVLINK_ATTR_PAD))
+		goto reporter_nest_cancel;
+	if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD,
+			      reporter->graceful_period,
+			      DEVLINK_ATTR_PAD))
+		goto reporter_nest_cancel;
+	if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER,
+		       reporter->auto_recover))
+		goto reporter_nest_cancel;
+	if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_AVAIL,
+		       reporter->dump_avail))
+		goto reporter_nest_cancel;
+	if (reporter->dump_avail &&
+	    nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS,
+			      jiffies_to_msecs(reporter->dump_ts),
+			      DEVLINK_ATTR_PAD))
+		goto reporter_nest_cancel;
+
+	nla_nest_end(msg, reporter_attr);
+	genlmsg_end(msg, hdr);
+	return 0;
+
+reporter_nest_cancel:
+	nla_nest_end(msg, reporter_attr);
+genlmsg_cancel:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+static int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb,
+						   struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_health_reporter *reporter;
+	struct sk_buff *msg;
+	int err;
+
+	reporter = devlink_health_reporter_get_from_info(devlink, info);
+	if (!reporter)
+		return -EINVAL;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	err = devlink_nl_health_reporter_fill(msg, devlink, reporter,
+					      DEVLINK_CMD_HEALTH_REPORTER_GET,
+					      info->snd_portid, info->snd_seq,
+					      0);
+	if (err) {
+		nlmsg_free(msg);
+		return err;
+	}
+
+	return genlmsg_reply(msg, info);
+}
+
+static int
+devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg,
+					  struct netlink_callback *cb)
+{
+	struct devlink_health_reporter *reporter;
+	struct devlink *devlink;
+	int start = cb->args[0];
+	int idx = 0;
+	int err;
+
+	mutex_lock(&devlink_mutex);
+	list_for_each_entry(devlink, &devlink_list, list) {
+		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+			continue;
+		mutex_lock(&devlink->lock);
+		list_for_each_entry(reporter, &devlink->reporter_list,
+				    list) {
+			if (idx < start) {
+				idx++;
+				continue;
+			}
+			err = devlink_nl_health_reporter_fill(msg, devlink,
+							      reporter,
+							      DEVLINK_CMD_HEALTH_REPORTER_GET,
+							      NETLINK_CB(cb->skb).portid,
+							      cb->nlh->nlmsg_seq,
+							      NLM_F_MULTI);
+			if (err) {
+				mutex_unlock(&devlink->lock);
+				goto out;
+			}
+			idx++;
+		}
+		mutex_unlock(&devlink->lock);
+	}
+out:
+	mutex_unlock(&devlink_mutex);
+
+	cb->args[0] = idx;
+	return msg->len;
+}
+
 static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
@@ -4342,6 +4485,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_PARAM_VALUE_CMODE] = { .type = NLA_U8 },
 	[DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32 },
+	[DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING },
 };
 
 static const struct genl_ops devlink_nl_ops[] = {
@@ -4562,6 +4706,14 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
 	},
+	{
+		.cmd = DEVLINK_CMD_HEALTH_REPORTER_GET,
+		.doit = devlink_nl_cmd_health_reporter_get_doit,
+		.dumpit = devlink_nl_cmd_health_reporter_get_dumpit,
+		.policy = devlink_nl_policy,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+		/* can be retrieved by unprivileged users */
+	},
 };
 
 static struct genl_family devlink_nl_family __ro_after_init = {
-- 
cgit v1.2.3-59-g8ed1b


From 6f9d56132eb6d2603d4273cfc65bed914ec47acb Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Thu, 17 Jan 2019 23:59:14 +0200
Subject: devlink: Add health set command

Add devlink health set command, in order to set configuration parameters
for a specific reporter.
Supported parameters are:
- graceful_period: Time interval between auto recoveries (in msec)
- auto_recover: Determines if the devlink shall execute recover upon
		receiving error for the reporter

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Reviewed-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/devlink.h |  1 +
 net/core/devlink.c           | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index c05470578b99..49ad5a76b121 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -90,6 +90,7 @@ enum devlink_command {
 	DEVLINK_CMD_REGION_READ,
 
 	DEVLINK_CMD_HEALTH_REPORTER_GET,
+	DEVLINK_CMD_HEALTH_REPORTER_SET,
 
 	/* add new commands above here */
 	__DEVLINK_CMD_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 2ba9275449c2..a34414bf1c27 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -4460,6 +4460,33 @@ out:
 	return msg->len;
 }
 
+static int
+devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb,
+					struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_health_reporter *reporter;
+
+	reporter = devlink_health_reporter_get_from_info(devlink, info);
+	if (!reporter)
+		return -EINVAL;
+
+	if (!reporter->ops->recover &&
+	    (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] ||
+	     info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]))
+		return -EINVAL;
+
+	if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD])
+		reporter->graceful_period =
+			nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]);
+
+	if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])
+		reporter->auto_recover =
+			nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]);
+
+	return 0;
+}
+
 static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
@@ -4486,6 +4513,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32 },
 	[DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING },
+	[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64 },
+	[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 },
 };
 
 static const struct genl_ops devlink_nl_ops[] = {
@@ -4714,6 +4743,13 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
 		/* can be retrieved by unprivileged users */
 	},
+	{
+		.cmd = DEVLINK_CMD_HEALTH_REPORTER_SET,
+		.doit = devlink_nl_cmd_health_reporter_set_doit,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+	},
 };
 
 static struct genl_family devlink_nl_family __ro_after_init = {
-- 
cgit v1.2.3-59-g8ed1b


From fcd852c69d776c0f46c8f79e8e431e5cc6ddc7b7 Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Thu, 17 Jan 2019 23:59:15 +0200
Subject: devlink: Add health recover command

Add devlink health recover command to the uapi, in order to allow the user
to execute a recover operation over a specific reporter.

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Reviewed-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/devlink.h |  1 +
 net/core/devlink.c           | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 49ad5a76b121..1c186fd77343 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -91,6 +91,7 @@ enum devlink_command {
 
 	DEVLINK_CMD_HEALTH_REPORTER_GET,
 	DEVLINK_CMD_HEALTH_REPORTER_SET,
+	DEVLINK_CMD_HEALTH_REPORTER_RECOVER,
 
 	/* add new commands above here */
 	__DEVLINK_CMD_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index a34414bf1c27..b224d0d31c0c 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -4487,6 +4487,19 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb,
 	return 0;
 }
 
+static int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb,
+						       struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_health_reporter *reporter;
+
+	reporter = devlink_health_reporter_get_from_info(devlink, info);
+	if (!reporter)
+		return -EINVAL;
+
+	return devlink_health_reporter_recover(reporter, NULL);
+}
+
 static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
@@ -4750,6 +4763,13 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
 	},
+	{
+		.cmd = DEVLINK_CMD_HEALTH_REPORTER_RECOVER,
+		.doit = devlink_nl_cmd_health_reporter_recover_doit,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+	},
 };
 
 static struct genl_family devlink_nl_family __ro_after_init = {
-- 
cgit v1.2.3-59-g8ed1b


From 8a66704a13d9713593342e29b4f0c19762f5746b Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Thu, 17 Jan 2019 23:59:16 +0200
Subject: devlink: Add health diagnose command

Add devlink health diagnose command, in order to run a diagnose
operation over a specific reporter.

It is expected from driver's callback for diagnose command to fill it
via the buffer descriptors API. Devlink will parse it and convert it to
netlink nla API in order to pass it to the user.

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Reviewed-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/devlink.h |  1 +
 net/core/devlink.c           | 51 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 1c186fd77343..51b4d7612cf8 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -92,6 +92,7 @@ enum devlink_command {
 	DEVLINK_CMD_HEALTH_REPORTER_GET,
 	DEVLINK_CMD_HEALTH_REPORTER_SET,
 	DEVLINK_CMD_HEALTH_REPORTER_RECOVER,
+	DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE,
 
 	/* add new commands above here */
 	__DEVLINK_CMD_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index b224d0d31c0c..57252ca31e1e 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -4500,6 +4500,50 @@ static int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb,
 	return devlink_health_reporter_recover(reporter, NULL);
 }
 
+static int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb,
+							struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_health_reporter *reporter;
+	u64 num_of_buffers;
+	int err;
+
+	reporter = devlink_health_reporter_get_from_info(devlink, info);
+	if (!reporter)
+		return -EINVAL;
+
+	if (!reporter->ops->diagnose)
+		return -EOPNOTSUPP;
+
+	num_of_buffers =
+		DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->diagnose_size);
+
+	mutex_lock(&reporter->diagnose_lock);
+	devlink_health_buffers_reset(reporter->diagnose_buffers_array,
+				     num_of_buffers);
+
+	err = reporter->ops->diagnose(reporter,
+				      reporter->diagnose_buffers_array,
+				      DEVLINK_HEALTH_BUFFER_SIZE,
+				      num_of_buffers);
+	if (err)
+		goto out;
+
+	err = devlink_health_buffer_snd(info,
+					DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE,
+					0, reporter->diagnose_buffers_array,
+					num_of_buffers);
+	if (err)
+		goto out;
+
+	mutex_unlock(&reporter->diagnose_lock);
+	return 0;
+
+out:
+	mutex_unlock(&reporter->diagnose_lock);
+	return err;
+}
+
 static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
@@ -4770,6 +4814,13 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
 	},
+	{
+		.cmd = DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE,
+		.doit = devlink_nl_cmd_health_reporter_diagnose_doit,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+	},
 };
 
 static struct genl_family devlink_nl_family __ro_after_init = {
-- 
cgit v1.2.3-59-g8ed1b


From 12bd0dcefe88782ac1c9fff632958dd1b71d27e5 Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Thu, 17 Jan 2019 23:59:17 +0200
Subject: devlink: Add health dump {get,clear} commands

Add devlink health dump commands, in order to run an dump operation
over a specific reporter.

The supported operations are dump_get in order to get last saved
dump (if not exist, dump now) and dump_clear to clear last saved
dump.

It is expected from driver's callback for diagnose command to fill it
via the buffer descriptors API. Devlink will parse it and convert it to
netlink nla API in order to pass it to the user.

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Reviewed-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/devlink.h |  2 ++
 net/core/devlink.c           | 75 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 77 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 51b4d7612cf8..6b26bb2ce4dc 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -93,6 +93,8 @@ enum devlink_command {
 	DEVLINK_CMD_HEALTH_REPORTER_SET,
 	DEVLINK_CMD_HEALTH_REPORTER_RECOVER,
 	DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE,
+	DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET,
+	DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR,
 
 	/* add new commands above here */
 	__DEVLINK_CMD_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 57252ca31e1e..60248a53c0ad 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -4544,6 +4544,65 @@ out:
 	return err;
 }
 
+static void
+devlink_health_dump_clear(struct devlink_health_reporter *reporter)
+{
+	reporter->dump_avail = false;
+	reporter->dump_ts = 0;
+	devlink_health_buffers_reset(reporter->dump_buffers_array,
+				     DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->dump_size));
+}
+
+static int devlink_nl_cmd_health_reporter_dump_get_doit(struct sk_buff *skb,
+							struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_health_reporter *reporter;
+	u64 num_of_buffers;
+	int err;
+
+	reporter = devlink_health_reporter_get_from_info(devlink, info);
+	if (!reporter)
+		return -EINVAL;
+
+	if (!reporter->ops->dump)
+		return -EOPNOTSUPP;
+
+	num_of_buffers =
+		DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->dump_size);
+
+	mutex_lock(&reporter->dump_lock);
+	err = devlink_health_do_dump(reporter, NULL);
+	if (err)
+		goto out;
+
+	err = devlink_health_buffer_snd(info,
+					DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET,
+					0, reporter->dump_buffers_array,
+					num_of_buffers);
+
+out:
+	mutex_unlock(&reporter->dump_lock);
+	return err;
+}
+
+static int
+devlink_nl_cmd_health_reporter_dump_clear_doit(struct sk_buff *skb,
+					       struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_health_reporter *reporter;
+
+	reporter = devlink_health_reporter_get_from_info(devlink, info);
+	if (!reporter)
+		return -EINVAL;
+
+	mutex_lock(&reporter->dump_lock);
+	devlink_health_dump_clear(reporter);
+	mutex_unlock(&reporter->dump_lock);
+	return 0;
+}
+
 static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
@@ -4821,6 +4880,22 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
 	},
+	{
+		.cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET,
+		.doit = devlink_nl_cmd_health_reporter_dump_get_doit,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+				  DEVLINK_NL_FLAG_NO_LOCK,
+	},
+	{
+		.cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR,
+		.doit = devlink_nl_cmd_health_reporter_dump_clear_doit,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+				  DEVLINK_NL_FLAG_NO_LOCK,
+	},
 };
 
 static struct genl_family devlink_nl_family __ro_after_init = {
-- 
cgit v1.2.3-59-g8ed1b


From 36647055b37ec78e9068f470f14e7cd75c001c22 Mon Sep 17 00:00:00 2001
From: Toke Høiland-Jørgensen <toke@toke.dk>
Date: Tue, 18 Dec 2018 17:02:07 -0800
Subject: cfg80211: Add airtime statistics and settings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds TX airtime statistics to the cfg80211 station dump (to go along
with the RX info already present), and adds a new parameter to set the
airtime weight of each station. The latter allows userspace to implement
policies for different stations by varying their weights.

Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
[rmanohar@codeaurora.org: fixed checkpatch warnings]
Signed-off-by: Rajkumar Manoharan <rmanohar@codeaurora.org>
[move airtime weight != 0 check into policy]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 11 ++++++++++-
 include/uapi/linux/nl80211.h | 15 +++++++++++++++
 net/wireless/nl80211.c       | 24 ++++++++++++++++++++++++
 3 files changed, 49 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index e0c41eb1c860..1691f52fcc80 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1003,6 +1003,7 @@ enum station_parameters_apply_mask {
  * @support_p2p_ps: information if station supports P2P PS mechanism
  * @he_capa: HE capabilities of station
  * @he_capa_len: the length of the HE capabilities
+ * @airtime_weight: airtime scheduler weight for this station
  */
 struct station_parameters {
 	const u8 *supported_rates;
@@ -1032,6 +1033,7 @@ struct station_parameters {
 	int support_p2p_ps;
 	const struct ieee80211_he_cap_elem *he_capa;
 	u8 he_capa_len;
+	u16 airtime_weight;
 };
 
 /**
@@ -1300,6 +1302,8 @@ struct cfg80211_tid_stats {
  *	from this peer
  * @connected_to_gate: true if mesh STA has a path to mesh gate
  * @rx_duration: aggregate PPDU duration(usecs) for all the frames from a peer
+ * @tx_duration: aggregate PPDU duration(usecs) for all the frames to a peer
+ * @airtime_weight: current airtime scheduling weight
  * @pertid: per-TID statistics, see &struct cfg80211_tid_stats, using the last
  *	(IEEE80211_NUM_TIDS) index for MSDUs not encapsulated in QoS-MPDUs.
  *	Note that this doesn't use the @filled bit, but is used if non-NULL.
@@ -1350,8 +1354,9 @@ struct station_info {
 
 	u32 expected_throughput;
 
-	u64 rx_beacon;
+	u64 tx_duration;
 	u64 rx_duration;
+	u64 rx_beacon;
 	u8 rx_beacon_signal_avg;
 	u8 connected_to_gate;
 
@@ -1359,6 +1364,8 @@ struct station_info {
 	s8 ack_signal;
 	s8 avg_ack_signal;
 
+	u16 airtime_weight;
+
 	u32 rx_mpdu_count;
 	u32 fcs_err_count;
 };
@@ -2391,6 +2398,8 @@ enum wiphy_params_flags {
 	WIPHY_PARAM_TXQ_QUANTUM		= 1 << 8,
 };
 
+#define IEEE80211_DEFAULT_AIRTIME_WEIGHT	256
+
 /**
  * struct cfg80211_pmksa - PMK Security Association
  *
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 31ae5c7f10e3..ebe79e12c82e 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -2299,6 +2299,9 @@ enum nl80211_commands {
  *	This is also used for capability advertisement in the wiphy information,
  *	with the appropriate sub-attributes.
  *
+ * @NL80211_ATTR_AIRTIME_WEIGHT: Station's weight when scheduled by the airtime
+ *	scheduler.
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -2748,6 +2751,8 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_PEER_MEASUREMENTS,
 
+	NL80211_ATTR_AIRTIME_WEIGHT,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
@@ -3125,6 +3130,9 @@ enum nl80211_sta_bss_param {
  *	might not be fully accurate.
  * @NL80211_STA_INFO_CONNECTED_TO_GATE: set to true if STA has a path to a
  *	mesh gate (u8, 0 or 1)
+ * @NL80211_STA_INFO_TX_DURATION: aggregate PPDU duration for all frames
+ *	sent to the station (u64, usec)
+ * @NL80211_STA_INFO_AIRTIME_WEIGHT: current airtime weight for station (u16)
  * @__NL80211_STA_INFO_AFTER_LAST: internal
  * @NL80211_STA_INFO_MAX: highest possible station info attribute
  */
@@ -3168,6 +3176,8 @@ enum nl80211_sta_info {
 	NL80211_STA_INFO_RX_MPDUS,
 	NL80211_STA_INFO_FCS_ERROR_COUNT,
 	NL80211_STA_INFO_CONNECTED_TO_GATE,
+	NL80211_STA_INFO_TX_DURATION,
+	NL80211_STA_INFO_AIRTIME_WEIGHT,
 
 	/* keep last */
 	__NL80211_STA_INFO_AFTER_LAST,
@@ -5316,6 +5326,10 @@ enum nl80211_feature_flags {
  *      if this flag is not set. Ignoring this can leak clear text packets and/or
  *      freeze the connection.
  *
+ * @NL80211_EXT_FEATURE_AIRTIME_FAIRNESS: Driver supports getting airtime
+ *	fairness for transmitted packets and has enabled airtime fairness
+ *	scheduling.
+ *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
  */
@@ -5355,6 +5369,7 @@ enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT,
 	NL80211_EXT_FEATURE_CAN_REPLACE_PTK0,
 	NL80211_EXT_FEATURE_ENABLE_FTM_RESPONDER,
+	NL80211_EXT_FEATURE_AIRTIME_FAIRNESS,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 5e49492d5911..a89688929b16 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -557,6 +557,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_PEER_MEASUREMENTS] =
 		NLA_POLICY_NESTED(NL80211_PMSR_FTM_REQ_ATTR_MAX,
 				  nl80211_pmsr_attr_policy),
+	[NL80211_ATTR_AIRTIME_WEIGHT] = NLA_POLICY_MIN(NLA_U16, 1),
 };
 
 /* policy for the key attributes */
@@ -4851,6 +4852,11 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
 	PUT_SINFO(PLID, plid, u16);
 	PUT_SINFO(PLINK_STATE, plink_state, u8);
 	PUT_SINFO_U64(RX_DURATION, rx_duration);
+	PUT_SINFO_U64(TX_DURATION, tx_duration);
+
+	if (wiphy_ext_feature_isset(&rdev->wiphy,
+				    NL80211_EXT_FEATURE_AIRTIME_FAIRNESS))
+		PUT_SINFO(AIRTIME_WEIGHT, airtime_weight, u16);
 
 	switch (rdev->wiphy.signal_type) {
 	case CFG80211_SIGNAL_TYPE_MBM:
@@ -5470,6 +5476,15 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
 			nla_get_u8(info->attrs[NL80211_ATTR_OPMODE_NOTIF]);
 	}
 
+	if (info->attrs[NL80211_ATTR_AIRTIME_WEIGHT])
+		params.airtime_weight =
+			nla_get_u16(info->attrs[NL80211_ATTR_AIRTIME_WEIGHT]);
+
+	if (params.airtime_weight &&
+	    !wiphy_ext_feature_isset(&rdev->wiphy,
+				     NL80211_EXT_FEATURE_AIRTIME_FAIRNESS))
+		return -EOPNOTSUPP;
+
 	/* Include parameters for TDLS peer (will check later) */
 	err = nl80211_set_station_tdls(info, &params);
 	if (err)
@@ -5598,6 +5613,15 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
 		params.plink_action =
 			nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_ACTION]);
 
+	if (info->attrs[NL80211_ATTR_AIRTIME_WEIGHT])
+		params.airtime_weight =
+			nla_get_u16(info->attrs[NL80211_ATTR_AIRTIME_WEIGHT]);
+
+	if (params.airtime_weight &&
+	    !wiphy_ext_feature_isset(&rdev->wiphy,
+				     NL80211_EXT_FEATURE_AIRTIME_FAIRNESS))
+		return -EOPNOTSUPP;
+
 	err = nl80211_parse_sta_channel_info(info, &params);
 	if (err)
 		return err;
-- 
cgit v1.2.3-59-g8ed1b


From cc24163690997c685641d84e77ff6f1c592b06fe Mon Sep 17 00:00:00 2001
From: Julan Hsu <julanhsu@google.com>
Date: Tue, 15 Jan 2019 15:28:42 -0800
Subject: nl80211/mac80211: mesh: add hop count to mpath info

Expose hop count to destination information in mpath info

Signed-off-by: Julan Hsu <julanhsu@google.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 4 ++++
 include/uapi/linux/nl80211.h | 2 ++
 net/mac80211/cfg.c           | 4 +++-
 net/mac80211/mesh_hwmp.c     | 5 +++++
 net/wireless/nl80211.c       | 5 ++++-
 5 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 1691f52fcc80..37816786d3e1 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1429,6 +1429,7 @@ enum monitor_flags {
  * @MPATH_INFO_DISCOVERY_TIMEOUT: @discovery_timeout filled
  * @MPATH_INFO_DISCOVERY_RETRIES: @discovery_retries filled
  * @MPATH_INFO_FLAGS: @flags filled
+ * @MPATH_INFO_HOP_COUNT: @hop_count filled
  */
 enum mpath_info_flags {
 	MPATH_INFO_FRAME_QLEN		= BIT(0),
@@ -1438,6 +1439,7 @@ enum mpath_info_flags {
 	MPATH_INFO_DISCOVERY_TIMEOUT	= BIT(4),
 	MPATH_INFO_DISCOVERY_RETRIES	= BIT(5),
 	MPATH_INFO_FLAGS		= BIT(6),
+	MPATH_INFO_HOP_COUNT		= BIT(7)
 };
 
 /**
@@ -1457,6 +1459,7 @@ enum mpath_info_flags {
  *	This number should increase every time the list of mesh paths
  *	changes, i.e. when a station is added or removed, so that
  *	userspace can tell whether it got a consistent snapshot.
+ * @hop_count: hops to destination
  */
 struct mpath_info {
 	u32 filled;
@@ -1467,6 +1470,7 @@ struct mpath_info {
 	u32 discovery_timeout;
 	u8 discovery_retries;
 	u8 flags;
+	u8 hop_count;
 
 	int generation;
 };
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index ebe79e12c82e..213a1d7c1063 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -3287,6 +3287,7 @@ enum nl80211_mpath_flags {
  * 	&enum nl80211_mpath_flags;
  * @NL80211_MPATH_INFO_DISCOVERY_TIMEOUT: total path discovery timeout, in msec
  * @NL80211_MPATH_INFO_DISCOVERY_RETRIES: mesh path discovery retries
+ * @NL80211_MPATH_INFO_HOP_COUNT: hop count to destination
  * @NL80211_MPATH_INFO_MAX: highest mesh path information attribute number
  *	currently defind
  * @__NL80211_MPATH_INFO_AFTER_LAST: internal use
@@ -3300,6 +3301,7 @@ enum nl80211_mpath_info {
 	NL80211_MPATH_INFO_FLAGS,
 	NL80211_MPATH_INFO_DISCOVERY_TIMEOUT,
 	NL80211_MPATH_INFO_DISCOVERY_RETRIES,
+	NL80211_MPATH_INFO_HOP_COUNT,
 
 	/* keep last */
 	__NL80211_MPATH_INFO_AFTER_LAST,
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 83ee573b1804..52cbaaf5caea 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1745,7 +1745,8 @@ static void mpath_set_pinfo(struct mesh_path *mpath, u8 *next_hop,
 			MPATH_INFO_EXPTIME |
 			MPATH_INFO_DISCOVERY_TIMEOUT |
 			MPATH_INFO_DISCOVERY_RETRIES |
-			MPATH_INFO_FLAGS;
+			MPATH_INFO_FLAGS |
+			MPATH_INFO_HOP_COUNT;
 
 	pinfo->frame_qlen = mpath->frame_queue.qlen;
 	pinfo->sn = mpath->sn;
@@ -1765,6 +1766,7 @@ static void mpath_set_pinfo(struct mesh_path *mpath, u8 *next_hop,
 		pinfo->flags |= NL80211_MPATH_FLAG_FIXED;
 	if (mpath->flags & MESH_PATH_RESOLVED)
 		pinfo->flags |= NL80211_MPATH_FLAG_RESOLVED;
+	pinfo->hop_count = mpath->hop_count;
 }
 
 static int ieee80211_get_mpath(struct wiphy *wiphy, struct net_device *dev,
diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
index 6950cd0bf594..6d1190b3332f 100644
--- a/net/mac80211/mesh_hwmp.c
+++ b/net/mac80211/mesh_hwmp.c
@@ -386,6 +386,7 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata,
 	unsigned long orig_lifetime, exp_time;
 	u32 last_hop_metric, new_metric;
 	bool process = true;
+	u8 hopcount;
 
 	rcu_read_lock();
 	sta = sta_info_get(sdata, mgmt->sa);
@@ -404,6 +405,7 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata,
 		orig_sn = PREQ_IE_ORIG_SN(hwmp_ie);
 		orig_lifetime = PREQ_IE_LIFETIME(hwmp_ie);
 		orig_metric = PREQ_IE_METRIC(hwmp_ie);
+		hopcount = PREQ_IE_HOPCOUNT(hwmp_ie) + 1;
 		break;
 	case MPATH_PREP:
 		/* Originator here refers to the MP that was the target in the
@@ -415,6 +417,7 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata,
 		orig_sn = PREP_IE_TARGET_SN(hwmp_ie);
 		orig_lifetime = PREP_IE_LIFETIME(hwmp_ie);
 		orig_metric = PREP_IE_METRIC(hwmp_ie);
+		hopcount = PREP_IE_HOPCOUNT(hwmp_ie) + 1;
 		break;
 	default:
 		rcu_read_unlock();
@@ -482,6 +485,7 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata,
 			mpath->sn = orig_sn;
 			mpath->exp_time = time_after(mpath->exp_time, exp_time)
 					  ?  mpath->exp_time : exp_time;
+			mpath->hop_count = hopcount;
 			mesh_path_activate(mpath);
 			spin_unlock_bh(&mpath->state_lock);
 			ewma_mesh_fail_avg_init(&sta->mesh->fail_avg);
@@ -523,6 +527,7 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata,
 			mpath->metric = last_hop_metric;
 			mpath->exp_time = time_after(mpath->exp_time, exp_time)
 					  ?  mpath->exp_time : exp_time;
+			mpath->hop_count = 1;
 			mesh_path_activate(mpath);
 			spin_unlock_bh(&mpath->state_lock);
 			ewma_mesh_fail_avg_init(&sta->mesh->fail_avg);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index a89688929b16..159125e16c79 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -5827,7 +5827,10 @@ static int nl80211_send_mpath(struct sk_buff *msg, u32 portid, u32 seq,
 			 pinfo->discovery_timeout)) ||
 	    ((pinfo->filled & MPATH_INFO_DISCOVERY_RETRIES) &&
 	     nla_put_u8(msg, NL80211_MPATH_INFO_DISCOVERY_RETRIES,
-			pinfo->discovery_retries)))
+			pinfo->discovery_retries)) ||
+	    ((pinfo->filled & MPATH_INFO_HOP_COUNT) &&
+	     nla_put_u8(msg, NL80211_MPATH_INFO_HOP_COUNT,
+			pinfo->hop_count)))
 		goto nla_put_failure;
 
 	nla_nest_end(msg, pinfoattr);
-- 
cgit v1.2.3-59-g8ed1b


From 540bbcb930ed2fc9d6a57e0babea00027a7ecc67 Mon Sep 17 00:00:00 2001
From: Julan Hsu <julanhsu@google.com>
Date: Tue, 15 Jan 2019 15:28:43 -0800
Subject: nl80211/mac80211: mesh: add mesh path change count to mpath info

Expose path change count to destination in mpath info

Signed-off-by: Julan Hsu <julanhsu@google.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 5 ++++-
 include/uapi/linux/nl80211.h | 4 +++-
 net/mac80211/cfg.c           | 4 +++-
 net/mac80211/mesh.h          | 2 ++
 net/mac80211/mesh_hwmp.c     | 4 ++++
 net/wireless/nl80211.c       | 5 ++++-
 6 files changed, 20 insertions(+), 4 deletions(-)

(limited to 'include/uapi')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 37816786d3e1..9c1d7979c200 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1439,7 +1439,8 @@ enum mpath_info_flags {
 	MPATH_INFO_DISCOVERY_TIMEOUT	= BIT(4),
 	MPATH_INFO_DISCOVERY_RETRIES	= BIT(5),
 	MPATH_INFO_FLAGS		= BIT(6),
-	MPATH_INFO_HOP_COUNT		= BIT(7)
+	MPATH_INFO_HOP_COUNT		= BIT(7),
+	MPATH_INFO_PATH_CHANGE		= BIT(8),
 };
 
 /**
@@ -1460,6 +1461,7 @@ enum mpath_info_flags {
  *	changes, i.e. when a station is added or removed, so that
  *	userspace can tell whether it got a consistent snapshot.
  * @hop_count: hops to destination
+ * @path_change_count: total number of path changes to destination
  */
 struct mpath_info {
 	u32 filled;
@@ -1471,6 +1473,7 @@ struct mpath_info {
 	u8 discovery_retries;
 	u8 flags;
 	u8 hop_count;
+	u32 path_change_count;
 
 	int generation;
 };
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 213a1d7c1063..426db4d8f71c 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -3288,8 +3288,9 @@ enum nl80211_mpath_flags {
  * @NL80211_MPATH_INFO_DISCOVERY_TIMEOUT: total path discovery timeout, in msec
  * @NL80211_MPATH_INFO_DISCOVERY_RETRIES: mesh path discovery retries
  * @NL80211_MPATH_INFO_HOP_COUNT: hop count to destination
+ * @NL80211_MPATH_INFO_PATH_CHANGE: total number of path changes to destination
  * @NL80211_MPATH_INFO_MAX: highest mesh path information attribute number
- *	currently defind
+ *	currently defined
  * @__NL80211_MPATH_INFO_AFTER_LAST: internal use
  */
 enum nl80211_mpath_info {
@@ -3302,6 +3303,7 @@ enum nl80211_mpath_info {
 	NL80211_MPATH_INFO_DISCOVERY_TIMEOUT,
 	NL80211_MPATH_INFO_DISCOVERY_RETRIES,
 	NL80211_MPATH_INFO_HOP_COUNT,
+	NL80211_MPATH_INFO_PATH_CHANGE,
 
 	/* keep last */
 	__NL80211_MPATH_INFO_AFTER_LAST,
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 52cbaaf5caea..e5e0f100389c 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1746,7 +1746,8 @@ static void mpath_set_pinfo(struct mesh_path *mpath, u8 *next_hop,
 			MPATH_INFO_DISCOVERY_TIMEOUT |
 			MPATH_INFO_DISCOVERY_RETRIES |
 			MPATH_INFO_FLAGS |
-			MPATH_INFO_HOP_COUNT;
+			MPATH_INFO_HOP_COUNT |
+			MPATH_INFO_PATH_CHANGE;
 
 	pinfo->frame_qlen = mpath->frame_queue.qlen;
 	pinfo->sn = mpath->sn;
@@ -1767,6 +1768,7 @@ static void mpath_set_pinfo(struct mesh_path *mpath, u8 *next_hop,
 	if (mpath->flags & MESH_PATH_RESOLVED)
 		pinfo->flags |= NL80211_MPATH_FLAG_RESOLVED;
 	pinfo->hop_count = mpath->hop_count;
+	pinfo->path_change_count = mpath->path_change_count;
 }
 
 static int ieee80211_get_mpath(struct wiphy *wiphy, struct net_device *dev,
diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h
index cad6592c52a1..8b26858ab4d5 100644
--- a/net/mac80211/mesh.h
+++ b/net/mac80211/mesh.h
@@ -94,6 +94,7 @@ enum mesh_deferred_task_flags {
  * @last_preq_to_root: Timestamp of last PREQ sent to root
  * @is_root: the destination station of this path is a root node
  * @is_gate: the destination station of this path is a mesh gate
+ * @path_change_count: the number of path changes to destination
  *
  *
  * The dst address is unique in the mesh path table. Since the mesh_path is
@@ -124,6 +125,7 @@ struct mesh_path {
 	unsigned long last_preq_to_root;
 	bool is_root;
 	bool is_gate;
+	u32 path_change_count;
 };
 
 /**
diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
index 6d1190b3332f..a0aebf44493f 100644
--- a/net/mac80211/mesh_hwmp.c
+++ b/net/mac80211/mesh_hwmp.c
@@ -479,6 +479,8 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata,
 		}
 
 		if (fresh_info) {
+			if (rcu_access_pointer(mpath->next_hop) != sta)
+				mpath->path_change_count++;
 			mesh_path_assign_nexthop(mpath, sta);
 			mpath->flags |= MESH_PATH_SN_VALID;
 			mpath->metric = new_metric;
@@ -523,6 +525,8 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata,
 		}
 
 		if (fresh_info) {
+			if (rcu_access_pointer(mpath->next_hop) != sta)
+				mpath->path_change_count++;
 			mesh_path_assign_nexthop(mpath, sta);
 			mpath->metric = last_hop_metric;
 			mpath->exp_time = time_after(mpath->exp_time, exp_time)
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 159125e16c79..e5f9c9ceb6c9 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -5830,7 +5830,10 @@ static int nl80211_send_mpath(struct sk_buff *msg, u32 portid, u32 seq,
 			pinfo->discovery_retries)) ||
 	    ((pinfo->filled & MPATH_INFO_HOP_COUNT) &&
 	     nla_put_u8(msg, NL80211_MPATH_INFO_HOP_COUNT,
-			pinfo->hop_count)))
+			pinfo->hop_count)) ||
+	    ((pinfo->filled & MPATH_INFO_PATH_CHANGE) &&
+	     nla_put_u32(msg, NL80211_MPATH_INFO_PATH_CHANGE,
+			 pinfo->path_change_count)))
 		goto nla_put_failure;
 
 	nla_nest_end(msg, pinfoattr);
-- 
cgit v1.2.3-59-g8ed1b


From 5954894ba3723995fbeab77bef62bb4878a654bb Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Thu, 17 Jan 2019 17:14:01 -0800
Subject: net_sched: add performance counters for basic filter

Similar to u32 filter, it is useful to know how many times
we reach each basic filter and how many times we pass the
ematch attached to it.

Sample output:

filter protocol arp pref 49152 basic chain 0
filter protocol arp pref 49152 basic chain 0 handle 0x1  (rule hit 3 success 3)
	action order 1: gact action pass
	 random type none pass val 0
	 index 1 ref 1 bind 1 installed 81 sec used 4 sec
	Action statistics:
	Sent 126 bytes 3 pkt (dropped 0, overlimits 0 requeues 0)
	backlog 0b 0p requeues 0

Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_cls.h |  7 +++++++
 net/sched/cls_basic.c        | 25 +++++++++++++++++++++++++
 2 files changed, 32 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 32a3416b51c3..02ac251be8c4 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -333,12 +333,19 @@ enum {
 
 /* Basic filter */
 
+struct tc_basic_pcnt {
+	__u64 rcnt;
+	__u64 rhit;
+};
+
 enum {
 	TCA_BASIC_UNSPEC,
 	TCA_BASIC_CLASSID,
 	TCA_BASIC_EMATCHES,
 	TCA_BASIC_ACT,
 	TCA_BASIC_POLICE,
+	TCA_BASIC_PCNT,
+	TCA_BASIC_PAD,
 	__TCA_BASIC_MAX
 };
 
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index 6a5dce8baf19..4a57fec6f306 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -18,6 +18,7 @@
 #include <linux/rtnetlink.h>
 #include <linux/skbuff.h>
 #include <linux/idr.h>
+#include <linux/percpu.h>
 #include <net/netlink.h>
 #include <net/act_api.h>
 #include <net/pkt_cls.h>
@@ -35,6 +36,7 @@ struct basic_filter {
 	struct tcf_result	res;
 	struct tcf_proto	*tp;
 	struct list_head	link;
+	struct tc_basic_pcnt __percpu *pf;
 	struct rcu_work		rwork;
 };
 
@@ -46,8 +48,10 @@ static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 	struct basic_filter *f;
 
 	list_for_each_entry_rcu(f, &head->flist, link) {
+		__this_cpu_inc(f->pf->rcnt);
 		if (!tcf_em_tree_match(skb, &f->ematches, NULL))
 			continue;
+		__this_cpu_inc(f->pf->rhit);
 		*res = f->res;
 		r = tcf_exts_exec(skb, &f->exts, res);
 		if (r < 0)
@@ -89,6 +93,7 @@ static void __basic_delete_filter(struct basic_filter *f)
 	tcf_exts_destroy(&f->exts);
 	tcf_em_tree_destroy(&f->ematches);
 	tcf_exts_put_net(&f->exts);
+	free_percpu(f->pf);
 	kfree(f);
 }
 
@@ -208,6 +213,11 @@ static int basic_change(struct net *net, struct sk_buff *in_skb,
 	if (err)
 		goto errout;
 	fnew->handle = handle;
+	fnew->pf = alloc_percpu(struct tc_basic_pcnt);
+	if (!fnew->pf) {
+		err = -ENOMEM;
+		goto errout;
+	}
 
 	err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], ovr,
 			      extack);
@@ -231,6 +241,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb,
 
 	return 0;
 errout:
+	free_percpu(fnew->pf);
 	tcf_exts_destroy(&fnew->exts);
 	kfree(fnew);
 	return err;
@@ -265,8 +276,10 @@ static void basic_bind_class(void *fh, u32 classid, unsigned long cl)
 static int basic_dump(struct net *net, struct tcf_proto *tp, void *fh,
 		      struct sk_buff *skb, struct tcmsg *t)
 {
+	struct tc_basic_pcnt gpf = {};
 	struct basic_filter *f = fh;
 	struct nlattr *nest;
+	int cpu;
 
 	if (f == NULL)
 		return skb->len;
@@ -281,6 +294,18 @@ static int basic_dump(struct net *net, struct tcf_proto *tp, void *fh,
 	    nla_put_u32(skb, TCA_BASIC_CLASSID, f->res.classid))
 		goto nla_put_failure;
 
+	for_each_possible_cpu(cpu) {
+		struct tc_basic_pcnt *pf = per_cpu_ptr(f->pf, cpu);
+
+		gpf.rcnt += pf->rcnt;
+		gpf.rhit += pf->rhit;
+	}
+
+	if (nla_put_64bit(skb, TCA_BASIC_PCNT,
+			  sizeof(struct tc_basic_pcnt),
+			  &gpf, TCA_BASIC_PAD))
+		goto nla_put_failure;
+
 	if (tcf_exts_dump(skb, &f->exts) < 0 ||
 	    tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0)
 		goto nla_put_failure;
-- 
cgit v1.2.3-59-g8ed1b


From ad07c8ceb6631a83b62d405a61448bba92adac68 Mon Sep 17 00:00:00 2001
From: Andrew Murray <andrew.murray@arm.com>
Date: Thu, 10 Jan 2019 13:53:34 +0000
Subject: perf/core: Remove unused perf_flags

Now that perf_flags is not used we remove it.

Signed-off-by: Andrew Murray <andrew.murray@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Sascha Hauer <s.hauer@pengutronix.de>
Cc: Shawn Guo <shawnguo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: robin.murphy@arm.com
Cc: suzuki.poulose@arm.com
Link: https://lkml.kernel.org/r/1547128414-50693-13-git-send-email-andrew.murray@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/uapi/linux/perf_event.h       | 2 --
 tools/include/uapi/linux/perf_event.h | 2 --
 2 files changed, 4 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 9de8780ac8d9..ea19b5d491bf 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -445,8 +445,6 @@ struct perf_event_query_bpf {
 	__u32	ids[0];
 };
 
-#define perf_flags(attr)	(*(&(attr)->read_format + 1))
-
 /*
  * Ioctls that can be done on a perf event fd:
  */
diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index 9de8780ac8d9..ea19b5d491bf 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -445,8 +445,6 @@ struct perf_event_query_bpf {
 	__u32	ids[0];
 };
 
-#define perf_flags(attr)	(*(&(attr)->read_format + 1))
-
 /*
  * Ioctls that can be done on a perf event fd:
  */
-- 
cgit v1.2.3-59-g8ed1b


From 76193a94522f1d4edf2447a536f3f796ce56343b Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Thu, 17 Jan 2019 08:15:13 -0800
Subject: perf, bpf: Introduce PERF_RECORD_KSYMBOL

For better performance analysis of dynamically JITed and loaded kernel
functions, such as BPF programs, this patch introduces
PERF_RECORD_KSYMBOL, a new perf_event_type that exposes kernel symbol
register/unregister information to user space.

The following data structure is used for PERF_RECORD_KSYMBOL.

    /*
     * struct {
     *      struct perf_event_header        header;
     *      u64                             addr;
     *      u32                             len;
     *      u16                             ksym_type;
     *      u16                             flags;
     *      char                            name[];
     *      struct sample_id                sample_id;
     * };
     */

Signed-off-by: Song Liu <songliubraving@fb.com>
Reviewed-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: kernel-team@fb.com
Cc: netdev@vger.kernel.org
Link: http://lkml.kernel.org/r/20190117161521.1341602-2-songliubraving@fb.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 include/linux/perf_event.h      |  8 ++++
 include/uapi/linux/perf_event.h | 26 ++++++++++-
 kernel/events/core.c            | 98 ++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 130 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 4eb88065a9b5..136fe0495374 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1122,6 +1122,10 @@ static inline void perf_event_task_sched_out(struct task_struct *prev,
 }
 
 extern void perf_event_mmap(struct vm_area_struct *vma);
+
+extern void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len,
+			       bool unregister, const char *sym);
+
 extern struct perf_guest_info_callbacks *perf_guest_cbs;
 extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
 extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
@@ -1342,6 +1346,10 @@ static inline int perf_unregister_guest_info_callbacks
 (struct perf_guest_info_callbacks *callbacks)				{ return 0; }
 
 static inline void perf_event_mmap(struct vm_area_struct *vma)		{ }
+
+typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data);
+static inline void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len,
+				      bool unregister, const char *sym)	{ }
 static inline void perf_event_exec(void)				{ }
 static inline void perf_event_comm(struct task_struct *tsk, bool exec)	{ }
 static inline void perf_event_namespaces(struct task_struct *tsk)	{ }
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index ea19b5d491bf..1dee5c8f166b 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -372,7 +372,8 @@ struct perf_event_attr {
 				context_switch :  1, /* context switch data */
 				write_backward :  1, /* Write ring buffer from end to beginning */
 				namespaces     :  1, /* include namespaces data */
-				__reserved_1   : 35;
+				ksymbol        :  1, /* include ksymbol events */
+				__reserved_1   : 34;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
@@ -963,9 +964,32 @@ enum perf_event_type {
 	 */
 	PERF_RECORD_NAMESPACES			= 16,
 
+	/*
+	 * Record ksymbol register/unregister events:
+	 *
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u64				addr;
+	 *	u32				len;
+	 *	u16				ksym_type;
+	 *	u16				flags;
+	 *	char				name[];
+	 *	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_KSYMBOL			= 17,
+
 	PERF_RECORD_MAX,			/* non-ABI */
 };
 
+enum perf_record_ksymbol_type {
+	PERF_RECORD_KSYMBOL_TYPE_UNKNOWN	= 0,
+	PERF_RECORD_KSYMBOL_TYPE_BPF		= 1,
+	PERF_RECORD_KSYMBOL_TYPE_MAX		/* non-ABI */
+};
+
+#define PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER	(1 << 0)
+
 #define PERF_MAX_STACK_DEPTH		127
 #define PERF_MAX_CONTEXTS_PER_STACK	  8
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index bc525cd1615c..e04ab5f325cf 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -385,6 +385,7 @@ static atomic_t nr_namespaces_events __read_mostly;
 static atomic_t nr_task_events __read_mostly;
 static atomic_t nr_freq_events __read_mostly;
 static atomic_t nr_switch_events __read_mostly;
+static atomic_t nr_ksymbol_events __read_mostly;
 
 static LIST_HEAD(pmus);
 static DEFINE_MUTEX(pmus_lock);
@@ -4235,7 +4236,7 @@ static bool is_sb_event(struct perf_event *event)
 
 	if (attr->mmap || attr->mmap_data || attr->mmap2 ||
 	    attr->comm || attr->comm_exec ||
-	    attr->task ||
+	    attr->task || attr->ksymbol ||
 	    attr->context_switch)
 		return true;
 	return false;
@@ -4305,6 +4306,8 @@ static void unaccount_event(struct perf_event *event)
 		dec = true;
 	if (has_branch_stack(event))
 		dec = true;
+	if (event->attr.ksymbol)
+		atomic_dec(&nr_ksymbol_events);
 
 	if (dec) {
 		if (!atomic_add_unless(&perf_sched_count, -1, 1))
@@ -7653,6 +7656,97 @@ static void perf_log_throttle(struct perf_event *event, int enable)
 	perf_output_end(&handle);
 }
 
+/*
+ * ksymbol register/unregister tracking
+ */
+
+struct perf_ksymbol_event {
+	const char	*name;
+	int		name_len;
+	struct {
+		struct perf_event_header        header;
+		u64				addr;
+		u32				len;
+		u16				ksym_type;
+		u16				flags;
+	} event_id;
+};
+
+static int perf_event_ksymbol_match(struct perf_event *event)
+{
+	return event->attr.ksymbol;
+}
+
+static void perf_event_ksymbol_output(struct perf_event *event, void *data)
+{
+	struct perf_ksymbol_event *ksymbol_event = data;
+	struct perf_output_handle handle;
+	struct perf_sample_data sample;
+	int ret;
+
+	if (!perf_event_ksymbol_match(event))
+		return;
+
+	perf_event_header__init_id(&ksymbol_event->event_id.header,
+				   &sample, event);
+	ret = perf_output_begin(&handle, event,
+				ksymbol_event->event_id.header.size);
+	if (ret)
+		return;
+
+	perf_output_put(&handle, ksymbol_event->event_id);
+	__output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len);
+	perf_event__output_id_sample(event, &handle, &sample);
+
+	perf_output_end(&handle);
+}
+
+void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
+			const char *sym)
+{
+	struct perf_ksymbol_event ksymbol_event;
+	char name[KSYM_NAME_LEN];
+	u16 flags = 0;
+	int name_len;
+
+	if (!atomic_read(&nr_ksymbol_events))
+		return;
+
+	if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX ||
+	    ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
+		goto err;
+
+	strlcpy(name, sym, KSYM_NAME_LEN);
+	name_len = strlen(name) + 1;
+	while (!IS_ALIGNED(name_len, sizeof(u64)))
+		name[name_len++] = '\0';
+	BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));
+
+	if (unregister)
+		flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER;
+
+	ksymbol_event = (struct perf_ksymbol_event){
+		.name = name,
+		.name_len = name_len,
+		.event_id = {
+			.header = {
+				.type = PERF_RECORD_KSYMBOL,
+				.size = sizeof(ksymbol_event.event_id) +
+					name_len,
+			},
+			.addr = addr,
+			.len = len,
+			.ksym_type = ksym_type,
+			.flags = flags,
+		},
+	};
+
+	perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL);
+	return;
+err:
+	WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
+}
+
 void perf_event_itrace_started(struct perf_event *event)
 {
 	event->attach_state |= PERF_ATTACH_ITRACE;
@@ -9912,6 +10006,8 @@ static void account_event(struct perf_event *event)
 		inc = true;
 	if (is_cgroup_event(event))
 		inc = true;
+	if (event->attr.ksymbol)
+		atomic_inc(&nr_ksymbol_events);
 
 	if (inc) {
 		/*
-- 
cgit v1.2.3-59-g8ed1b


From 6ee52e2a3fe4ea35520720736e6791df1fb67106 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Thu, 17 Jan 2019 08:15:15 -0800
Subject: perf, bpf: Introduce PERF_RECORD_BPF_EVENT

For better performance analysis of BPF programs, this patch introduces
PERF_RECORD_BPF_EVENT, a new perf_event_type that exposes BPF program
load/unload information to user space.

Each BPF program may contain up to BPF_MAX_SUBPROGS (256) sub programs.
The following example shows kernel symbols for a BPF program with 7 sub
programs:

    ffffffffa0257cf9 t bpf_prog_b07ccb89267cf242_F
    ffffffffa02592e1 t bpf_prog_2dcecc18072623fc_F
    ffffffffa025b0e9 t bpf_prog_bb7a405ebaec5d5c_F
    ffffffffa025dd2c t bpf_prog_a7540d4a39ec1fc7_F
    ffffffffa025fcca t bpf_prog_05762d4ade0e3737_F
    ffffffffa026108f t bpf_prog_db4bd11e35df90d4_F
    ffffffffa0263f00 t bpf_prog_89d64e4abf0f0126_F
    ffffffffa0257cf9 t bpf_prog_ae31629322c4b018__dummy_tracepoi

When a bpf program is loaded, PERF_RECORD_KSYMBOL is generated for each
of these sub programs. Therefore, PERF_RECORD_BPF_EVENT is not needed
for simple profiling.

For annotation, user space need to listen to PERF_RECORD_BPF_EVENT and
gather more information about these (sub) programs via sys_bpf.

Signed-off-by: Song Liu <songliubraving@fb.com>
Reviewed-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradeaed.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: kernel-team@fb.com
Cc: netdev@vger.kernel.org
Link: http://lkml.kernel.org/r/20190117161521.1341602-4-songliubraving@fb.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 include/linux/filter.h          |   7 +++
 include/linux/perf_event.h      |   6 +++
 include/uapi/linux/perf_event.h |  29 +++++++++-
 kernel/bpf/core.c               |   2 +-
 kernel/bpf/syscall.c            |   2 +
 kernel/events/core.c            | 115 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 159 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index ad106d845b22..d531d4250bff 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -951,6 +951,7 @@ bpf_address_lookup(unsigned long addr, unsigned long *size,
 
 void bpf_prog_kallsyms_add(struct bpf_prog *fp);
 void bpf_prog_kallsyms_del(struct bpf_prog *fp);
+void bpf_get_prog_name(const struct bpf_prog *prog, char *sym);
 
 #else /* CONFIG_BPF_JIT */
 
@@ -1006,6 +1007,12 @@ static inline void bpf_prog_kallsyms_add(struct bpf_prog *fp)
 static inline void bpf_prog_kallsyms_del(struct bpf_prog *fp)
 {
 }
+
+static inline void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
+{
+	sym[0] = '\0';
+}
+
 #endif /* CONFIG_BPF_JIT */
 
 void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 136fe0495374..a79e59fc3b7d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1125,6 +1125,9 @@ extern void perf_event_mmap(struct vm_area_struct *vma);
 
 extern void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len,
 			       bool unregister, const char *sym);
+extern void perf_event_bpf_event(struct bpf_prog *prog,
+				 enum perf_bpf_event_type type,
+				 u16 flags);
 
 extern struct perf_guest_info_callbacks *perf_guest_cbs;
 extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
@@ -1350,6 +1353,9 @@ static inline void perf_event_mmap(struct vm_area_struct *vma)		{ }
 typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data);
 static inline void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len,
 				      bool unregister, const char *sym)	{ }
+static inline void perf_event_bpf_event(struct bpf_prog *prog,
+					enum perf_bpf_event_type type,
+					u16 flags)			{ }
 static inline void perf_event_exec(void)				{ }
 static inline void perf_event_comm(struct task_struct *tsk, bool exec)	{ }
 static inline void perf_event_namespaces(struct task_struct *tsk)	{ }
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 1dee5c8f166b..7198ddd0c6b1 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -373,7 +373,8 @@ struct perf_event_attr {
 				write_backward :  1, /* Write ring buffer from end to beginning */
 				namespaces     :  1, /* include namespaces data */
 				ksymbol        :  1, /* include ksymbol events */
-				__reserved_1   : 34;
+				bpf_event      :  1, /* include bpf events */
+				__reserved_1   : 33;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
@@ -979,6 +980,25 @@ enum perf_event_type {
 	 */
 	PERF_RECORD_KSYMBOL			= 17,
 
+	/*
+	 * Record bpf events:
+	 *  enum perf_bpf_event_type {
+	 *	PERF_BPF_EVENT_UNKNOWN		= 0,
+	 *	PERF_BPF_EVENT_PROG_LOAD	= 1,
+	 *	PERF_BPF_EVENT_PROG_UNLOAD	= 2,
+	 *  };
+	 *
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u16				type;
+	 *	u16				flags;
+	 *	u32				id;
+	 *	u8				tag[BPF_TAG_SIZE];
+	 *	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_BPF_EVENT			= 18,
+
 	PERF_RECORD_MAX,			/* non-ABI */
 };
 
@@ -990,6 +1010,13 @@ enum perf_record_ksymbol_type {
 
 #define PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER	(1 << 0)
 
+enum perf_bpf_event_type {
+	PERF_BPF_EVENT_UNKNOWN		= 0,
+	PERF_BPF_EVENT_PROG_LOAD	= 1,
+	PERF_BPF_EVENT_PROG_UNLOAD	= 2,
+	PERF_BPF_EVENT_MAX,		/* non-ABI */
+};
+
 #define PERF_MAX_STACK_DEPTH		127
 #define PERF_MAX_CONTEXTS_PER_STACK	  8
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index f908b9356025..19c49313c709 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -495,7 +495,7 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog,
 	*symbol_end   = addr + hdr->pages * PAGE_SIZE;
 }
 
-static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
+void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
 {
 	const char *end = sym + KSYM_NAME_LEN;
 	const struct btf_type *type;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b155cd17c1bd..30ebd085790b 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1211,6 +1211,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
 static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
 {
 	if (atomic_dec_and_test(&prog->aux->refcnt)) {
+		perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
 		/* bpf_prog_free_id() must be called first */
 		bpf_prog_free_id(prog, do_idr_lock);
 		bpf_prog_kallsyms_del_all(prog);
@@ -1554,6 +1555,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 	}
 
 	bpf_prog_kallsyms_add(prog);
+	perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
 	return err;
 
 free_used_maps:
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e04ab5f325cf..236bb8ddb7bc 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -386,6 +386,7 @@ static atomic_t nr_task_events __read_mostly;
 static atomic_t nr_freq_events __read_mostly;
 static atomic_t nr_switch_events __read_mostly;
 static atomic_t nr_ksymbol_events __read_mostly;
+static atomic_t nr_bpf_events __read_mostly;
 
 static LIST_HEAD(pmus);
 static DEFINE_MUTEX(pmus_lock);
@@ -4308,6 +4309,8 @@ static void unaccount_event(struct perf_event *event)
 		dec = true;
 	if (event->attr.ksymbol)
 		atomic_dec(&nr_ksymbol_events);
+	if (event->attr.bpf_event)
+		atomic_dec(&nr_bpf_events);
 
 	if (dec) {
 		if (!atomic_add_unless(&perf_sched_count, -1, 1))
@@ -7747,6 +7750,116 @@ err:
 	WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
 }
 
+/*
+ * bpf program load/unload tracking
+ */
+
+struct perf_bpf_event {
+	struct bpf_prog	*prog;
+	struct {
+		struct perf_event_header        header;
+		u16				type;
+		u16				flags;
+		u32				id;
+		u8				tag[BPF_TAG_SIZE];
+	} event_id;
+};
+
+static int perf_event_bpf_match(struct perf_event *event)
+{
+	return event->attr.bpf_event;
+}
+
+static void perf_event_bpf_output(struct perf_event *event, void *data)
+{
+	struct perf_bpf_event *bpf_event = data;
+	struct perf_output_handle handle;
+	struct perf_sample_data sample;
+	int ret;
+
+	if (!perf_event_bpf_match(event))
+		return;
+
+	perf_event_header__init_id(&bpf_event->event_id.header,
+				   &sample, event);
+	ret = perf_output_begin(&handle, event,
+				bpf_event->event_id.header.size);
+	if (ret)
+		return;
+
+	perf_output_put(&handle, bpf_event->event_id);
+	perf_event__output_id_sample(event, &handle, &sample);
+
+	perf_output_end(&handle);
+}
+
+static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
+					 enum perf_bpf_event_type type)
+{
+	bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
+	char sym[KSYM_NAME_LEN];
+	int i;
+
+	if (prog->aux->func_cnt == 0) {
+		bpf_get_prog_name(prog, sym);
+		perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
+				   (u64)(unsigned long)prog->bpf_func,
+				   prog->jited_len, unregister, sym);
+	} else {
+		for (i = 0; i < prog->aux->func_cnt; i++) {
+			struct bpf_prog *subprog = prog->aux->func[i];
+
+			bpf_get_prog_name(subprog, sym);
+			perf_event_ksymbol(
+				PERF_RECORD_KSYMBOL_TYPE_BPF,
+				(u64)(unsigned long)subprog->bpf_func,
+				subprog->jited_len, unregister, sym);
+		}
+	}
+}
+
+void perf_event_bpf_event(struct bpf_prog *prog,
+			  enum perf_bpf_event_type type,
+			  u16 flags)
+{
+	struct perf_bpf_event bpf_event;
+
+	if (type <= PERF_BPF_EVENT_UNKNOWN ||
+	    type >= PERF_BPF_EVENT_MAX)
+		return;
+
+	switch (type) {
+	case PERF_BPF_EVENT_PROG_LOAD:
+	case PERF_BPF_EVENT_PROG_UNLOAD:
+		if (atomic_read(&nr_ksymbol_events))
+			perf_event_bpf_emit_ksymbols(prog, type);
+		break;
+	default:
+		break;
+	}
+
+	if (!atomic_read(&nr_bpf_events))
+		return;
+
+	bpf_event = (struct perf_bpf_event){
+		.prog = prog,
+		.event_id = {
+			.header = {
+				.type = PERF_RECORD_BPF_EVENT,
+				.size = sizeof(bpf_event.event_id),
+			},
+			.type = type,
+			.flags = flags,
+			.id = prog->aux->id,
+		},
+	};
+
+	BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64));
+
+	memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE);
+	perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
+}
+
 void perf_event_itrace_started(struct perf_event *event)
 {
 	event->attach_state |= PERF_ATTACH_ITRACE;
@@ -10008,6 +10121,8 @@ static void account_event(struct perf_event *event)
 		inc = true;
 	if (event->attr.ksymbol)
 		atomic_inc(&nr_ksymbol_events);
+	if (event->attr.bpf_event)
+		atomic_inc(&nr_bpf_events);
 
 	if (inc) {
 		/*
-- 
cgit v1.2.3-59-g8ed1b


From aefcb7460e0b5f35f72601b7a98eec5ca1639cf2 Mon Sep 17 00:00:00 2001
From: Finn Thain <fthain@telegraphics.com.au>
Date: Tue, 15 Jan 2019 15:18:56 +1100
Subject: m68k/mac: Fix PRAM accessors

PMU-based m68k Macs pre-date PowerMac-style NVRAM. Use the appropriate
PMU commands. Also implement the missing XPRAM accessors for VIA-based
Macs.

Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Tested-by: Stan Johnson <userm57@yahoo.com>
Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/m68k/mac/misc.c     | 43 +++++++++++++++++++++++++++++++++----------
 include/uapi/linux/pmu.h |  2 ++
 2 files changed, 35 insertions(+), 10 deletions(-)

(limited to 'include/uapi')

diff --git a/arch/m68k/mac/misc.c b/arch/m68k/mac/misc.c
index af000a015f68..d016ca2e0d10 100644
--- a/arch/m68k/mac/misc.c
+++ b/arch/m68k/mac/misc.c
@@ -66,23 +66,22 @@ static unsigned char pmu_pram_read_byte(int offset)
 {
 	struct adb_request req;
 
-	if (pmu_request(&req, NULL, 3, PMU_READ_NVRAM,
-			(offset >> 8) & 0xFF, offset & 0xFF) < 0)
+	if (pmu_request(&req, NULL, 3, PMU_READ_XPRAM,
+	                offset & 0xFF, 1) < 0)
 		return 0;
-	while (!req.complete)
-		pmu_poll();
-	return req.reply[3];
+	pmu_wait_complete(&req);
+
+	return req.reply[0];
 }
 
 static void pmu_pram_write_byte(unsigned char data, int offset)
 {
 	struct adb_request req;
 
-	if (pmu_request(&req, NULL, 4, PMU_WRITE_NVRAM,
-			(offset >> 8) & 0xFF, offset & 0xFF, data) < 0)
+	if (pmu_request(&req, NULL, 4, PMU_WRITE_XPRAM,
+	                offset & 0xFF, 1, data) < 0)
 		return;
-	while (!req.complete)
-		pmu_poll();
+	pmu_wait_complete(&req);
 }
 #endif /* CONFIG_ADB_PMU */
 
@@ -151,6 +150,16 @@ static void via_rtc_send(__u8 data)
 #define RTC_REG_SECONDS_3       3
 #define RTC_REG_WRITE_PROTECT   13
 
+/*
+ * Inside Mac has no information about two-byte RTC commands but
+ * the MAME/MESS source code has the essentials.
+ */
+
+#define RTC_REG_XPRAM           14
+#define RTC_CMD_XPRAM_READ      (RTC_CMD_READ(RTC_REG_XPRAM) << 8)
+#define RTC_CMD_XPRAM_WRITE     (RTC_CMD_WRITE(RTC_REG_XPRAM) << 8)
+#define RTC_CMD_XPRAM_ARG(a)    (((a & 0xE0) << 3) | ((a & 0x1F) << 2))
+
 /*
  * Execute a VIA PRAM/RTC command. For read commands
  * data should point to a one-byte buffer for the
@@ -198,11 +207,25 @@ static void via_rtc_command(int command, __u8 *data)
 
 static unsigned char via_pram_read_byte(int offset)
 {
-	return 0;
+	unsigned char temp;
+
+	via_rtc_command(RTC_CMD_XPRAM_READ | RTC_CMD_XPRAM_ARG(offset), &temp);
+
+	return temp;
 }
 
 static void via_pram_write_byte(unsigned char data, int offset)
 {
+	unsigned char temp;
+
+	temp = 0x55;
+	via_rtc_command(RTC_CMD_WRITE(RTC_REG_WRITE_PROTECT), &temp);
+
+	temp = data;
+	via_rtc_command(RTC_CMD_XPRAM_WRITE | RTC_CMD_XPRAM_ARG(offset), &temp);
+
+	temp = 0x55 | RTC_FLG_WRITE_PROTECT;
+	via_rtc_command(RTC_CMD_WRITE(RTC_REG_WRITE_PROTECT), &temp);
 }
 
 /*
diff --git a/include/uapi/linux/pmu.h b/include/uapi/linux/pmu.h
index 97256f90e6df..f2fc1bd80017 100644
--- a/include/uapi/linux/pmu.h
+++ b/include/uapi/linux/pmu.h
@@ -19,7 +19,9 @@
 #define PMU_POWER_CTRL		0x11	/* control power of some devices */
 #define PMU_ADB_CMD		0x20	/* send ADB packet */
 #define PMU_ADB_POLL_OFF	0x21	/* disable ADB auto-poll */
+#define PMU_WRITE_XPRAM		0x32	/* write eXtended Parameter RAM */
 #define PMU_WRITE_NVRAM		0x33	/* write non-volatile RAM */
+#define PMU_READ_XPRAM		0x3a	/* read eXtended Parameter RAM */
 #define PMU_READ_NVRAM		0x3b	/* read non-volatile RAM */
 #define PMU_SET_RTC		0x30	/* set real-time clock */
 #define PMU_READ_RTC		0x38	/* read real-time clock */
-- 
cgit v1.2.3-59-g8ed1b


From 6fc23b6ed8fa0ba6cc47b2f8756df1199abc3a5c Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian@brauner.io>
Date: Mon, 21 Jan 2019 12:01:19 +0100
Subject: binderfs: use correct include guards in header

When we switched over from binder_ctl.h to binderfs.h we forgot to change
the include guards. It's minor but it's obviously correct.

Signed-off-by: Christian Brauner <christian@brauner.io>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/linux/android/binderfs.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/android/binderfs.h b/include/uapi/linux/android/binderfs.h
index 65b2efd1a0a5..b41628b77120 100644
--- a/include/uapi/linux/android/binderfs.h
+++ b/include/uapi/linux/android/binderfs.h
@@ -4,8 +4,8 @@
  *
  */
 
-#ifndef _UAPI_LINUX_BINDER_CTL_H
-#define _UAPI_LINUX_BINDER_CTL_H
+#ifndef _UAPI_LINUX_BINDERFS_H
+#define _UAPI_LINUX_BINDERFS_H
 
 #include <linux/android/binder.h>
 #include <linux/types.h>
@@ -31,5 +31,5 @@ struct binderfs_device {
  */
 #define BINDER_CTL_ADD _IOWR('b', 1, struct binderfs_device)
 
-#endif /* _UAPI_LINUX_BINDER_CTL_H */
+#endif /* _UAPI_LINUX_BINDERFS_H */
 
-- 
cgit v1.2.3-59-g8ed1b


From 7d0174065f4903fb0ce0bab3d5047284faa7226d Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian@brauner.io>
Date: Mon, 21 Jan 2019 12:01:20 +0100
Subject: binderfs: use __u32 for device numbers

We allow more then 255 binderfs binder devices to be created since there
are workloads that require more than that. If we use __u8 we'll overflow
after 255. So let's use a __u32.
Note that there's no released kernel with binderfs out there so this is
not a regression.

Signed-off-by: Christian Brauner <christian@brauner.io>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/linux/android/binderfs.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/android/binderfs.h b/include/uapi/linux/android/binderfs.h
index b41628b77120..87410477aea9 100644
--- a/include/uapi/linux/android/binderfs.h
+++ b/include/uapi/linux/android/binderfs.h
@@ -22,8 +22,8 @@
  */
 struct binderfs_device {
 	char name[BINDERFS_MAX_NAME + 1];
-	__u8 major;
-	__u8 minor;
+	__u32 major;
+	__u32 minor;
 };
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From ec74136ded792deed80780a2f8baf3521eeb72f9 Mon Sep 17 00:00:00 2001
From: Todd Kjos <tkjos@android.com>
Date: Mon, 14 Jan 2019 09:10:21 -0800
Subject: binder: create node flag to request sender's security context

To allow servers to verify client identity, allow a node
flag to be set that causes the sender's security context
to be delivered with the transaction. The BR_TRANSACTION
command is extended in BR_TRANSACTION_SEC_CTX to
contain a pointer to the security context string.

Signed-off-by: Todd Kjos <tkjos@google.com>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/android/binder.c            | 106 ++++++++++++++++++++++++++++--------
 include/uapi/linux/android/binder.h |  19 +++++++
 2 files changed, 102 insertions(+), 23 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index cdfc87629efb..5f6ef5e63b91 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -329,6 +329,8 @@ struct binder_error {
  *                        (invariant after initialized)
  * @min_priority:         minimum scheduling priority
  *                        (invariant after initialized)
+ * @txn_security_ctx:     require sender's security context
+ *                        (invariant after initialized)
  * @async_todo:           list of async work items
  *                        (protected by @proc->inner_lock)
  *
@@ -365,6 +367,7 @@ struct binder_node {
 		 * invariant after initialization
 		 */
 		u8 accept_fds:1;
+		u8 txn_security_ctx:1;
 		u8 min_priority;
 	};
 	bool has_async_transaction;
@@ -615,6 +618,7 @@ struct binder_transaction {
 	long	saved_priority;
 	kuid_t	sender_euid;
 	struct list_head fd_fixups;
+	binder_uintptr_t security_ctx;
 	/**
 	 * @lock:  protects @from, @to_proc, and @to_thread
 	 *
@@ -1152,6 +1156,7 @@ static struct binder_node *binder_init_node_ilocked(
 	node->work.type = BINDER_WORK_NODE;
 	node->min_priority = flags & FLAT_BINDER_FLAG_PRIORITY_MASK;
 	node->accept_fds = !!(flags & FLAT_BINDER_FLAG_ACCEPTS_FDS);
+	node->txn_security_ctx = !!(flags & FLAT_BINDER_FLAG_TXN_SECURITY_CTX);
 	spin_lock_init(&node->lock);
 	INIT_LIST_HEAD(&node->work.entry);
 	INIT_LIST_HEAD(&node->async_todo);
@@ -2778,6 +2783,8 @@ static void binder_transaction(struct binder_proc *proc,
 	binder_size_t last_fixup_min_off = 0;
 	struct binder_context *context = proc->context;
 	int t_debug_id = atomic_inc_return(&binder_last_id);
+	char *secctx = NULL;
+	u32 secctx_sz = 0;
 
 	e = binder_transaction_log_add(&binder_transaction_log);
 	e->debug_id = t_debug_id;
@@ -3020,6 +3027,20 @@ static void binder_transaction(struct binder_proc *proc,
 	t->flags = tr->flags;
 	t->priority = task_nice(current);
 
+	if (target_node && target_node->txn_security_ctx) {
+		u32 secid;
+
+		security_task_getsecid(proc->tsk, &secid);
+		ret = security_secid_to_secctx(secid, &secctx, &secctx_sz);
+		if (ret) {
+			return_error = BR_FAILED_REPLY;
+			return_error_param = ret;
+			return_error_line = __LINE__;
+			goto err_get_secctx_failed;
+		}
+		extra_buffers_size += ALIGN(secctx_sz, sizeof(u64));
+	}
+
 	trace_binder_transaction(reply, t, target_node);
 
 	t->buffer = binder_alloc_new_buf(&target_proc->alloc, tr->data_size,
@@ -3036,6 +3057,19 @@ static void binder_transaction(struct binder_proc *proc,
 		t->buffer = NULL;
 		goto err_binder_alloc_buf_failed;
 	}
+	if (secctx) {
+		size_t buf_offset = ALIGN(tr->data_size, sizeof(void *)) +
+				    ALIGN(tr->offsets_size, sizeof(void *)) +
+				    ALIGN(extra_buffers_size, sizeof(void *)) -
+				    ALIGN(secctx_sz, sizeof(u64));
+		char *kptr = t->buffer->data + buf_offset;
+
+		t->security_ctx = (uintptr_t)kptr +
+		    binder_alloc_get_user_buffer_offset(&target_proc->alloc);
+		memcpy(kptr, secctx, secctx_sz);
+		security_release_secctx(secctx, secctx_sz);
+		secctx = NULL;
+	}
 	t->buffer->debug_id = t->debug_id;
 	t->buffer->transaction = t;
 	t->buffer->target_node = target_node;
@@ -3305,6 +3339,9 @@ err_copy_data_failed:
 	t->buffer->transaction = NULL;
 	binder_alloc_free_buf(&target_proc->alloc, t->buffer);
 err_binder_alloc_buf_failed:
+	if (secctx)
+		security_release_secctx(secctx, secctx_sz);
+err_get_secctx_failed:
 	kfree(tcomplete);
 	binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE);
 err_alloc_tcomplete_failed:
@@ -4036,11 +4073,13 @@ retry:
 
 	while (1) {
 		uint32_t cmd;
-		struct binder_transaction_data tr;
+		struct binder_transaction_data_secctx tr;
+		struct binder_transaction_data *trd = &tr.transaction_data;
 		struct binder_work *w = NULL;
 		struct list_head *list = NULL;
 		struct binder_transaction *t = NULL;
 		struct binder_thread *t_from;
+		size_t trsize = sizeof(*trd);
 
 		binder_inner_proc_lock(proc);
 		if (!binder_worklist_empty_ilocked(&thread->todo))
@@ -4240,8 +4279,8 @@ retry:
 		if (t->buffer->target_node) {
 			struct binder_node *target_node = t->buffer->target_node;
 
-			tr.target.ptr = target_node->ptr;
-			tr.cookie =  target_node->cookie;
+			trd->target.ptr = target_node->ptr;
+			trd->cookie =  target_node->cookie;
 			t->saved_priority = task_nice(current);
 			if (t->priority < target_node->min_priority &&
 			    !(t->flags & TF_ONE_WAY))
@@ -4251,22 +4290,23 @@ retry:
 				binder_set_nice(target_node->min_priority);
 			cmd = BR_TRANSACTION;
 		} else {
-			tr.target.ptr = 0;
-			tr.cookie = 0;
+			trd->target.ptr = 0;
+			trd->cookie = 0;
 			cmd = BR_REPLY;
 		}
-		tr.code = t->code;
-		tr.flags = t->flags;
-		tr.sender_euid = from_kuid(current_user_ns(), t->sender_euid);
+		trd->code = t->code;
+		trd->flags = t->flags;
+		trd->sender_euid = from_kuid(current_user_ns(), t->sender_euid);
 
 		t_from = binder_get_txn_from(t);
 		if (t_from) {
 			struct task_struct *sender = t_from->proc->tsk;
 
-			tr.sender_pid = task_tgid_nr_ns(sender,
-							task_active_pid_ns(current));
+			trd->sender_pid =
+				task_tgid_nr_ns(sender,
+						task_active_pid_ns(current));
 		} else {
-			tr.sender_pid = 0;
+			trd->sender_pid = 0;
 		}
 
 		ret = binder_apply_fd_fixups(t);
@@ -4297,15 +4337,20 @@ retry:
 			}
 			continue;
 		}
-		tr.data_size = t->buffer->data_size;
-		tr.offsets_size = t->buffer->offsets_size;
-		tr.data.ptr.buffer = (binder_uintptr_t)
+		trd->data_size = t->buffer->data_size;
+		trd->offsets_size = t->buffer->offsets_size;
+		trd->data.ptr.buffer = (binder_uintptr_t)
 			((uintptr_t)t->buffer->data +
 			binder_alloc_get_user_buffer_offset(&proc->alloc));
-		tr.data.ptr.offsets = tr.data.ptr.buffer +
+		trd->data.ptr.offsets = trd->data.ptr.buffer +
 					ALIGN(t->buffer->data_size,
 					    sizeof(void *));
 
+		tr.secctx = t->security_ctx;
+		if (t->security_ctx) {
+			cmd = BR_TRANSACTION_SEC_CTX;
+			trsize = sizeof(tr);
+		}
 		if (put_user(cmd, (uint32_t __user *)ptr)) {
 			if (t_from)
 				binder_thread_dec_tmpref(t_from);
@@ -4316,7 +4361,7 @@ retry:
 			return -EFAULT;
 		}
 		ptr += sizeof(uint32_t);
-		if (copy_to_user(ptr, &tr, sizeof(tr))) {
+		if (copy_to_user(ptr, &tr, trsize)) {
 			if (t_from)
 				binder_thread_dec_tmpref(t_from);
 
@@ -4325,7 +4370,7 @@ retry:
 
 			return -EFAULT;
 		}
-		ptr += sizeof(tr);
+		ptr += trsize;
 
 		trace_binder_transaction_received(t);
 		binder_stat_br(proc, thread, cmd);
@@ -4333,16 +4378,18 @@ retry:
 			     "%d:%d %s %d %d:%d, cmd %d size %zd-%zd ptr %016llx-%016llx\n",
 			     proc->pid, thread->pid,
 			     (cmd == BR_TRANSACTION) ? "BR_TRANSACTION" :
-			     "BR_REPLY",
+				(cmd == BR_TRANSACTION_SEC_CTX) ?
+				     "BR_TRANSACTION_SEC_CTX" : "BR_REPLY",
 			     t->debug_id, t_from ? t_from->proc->pid : 0,
 			     t_from ? t_from->pid : 0, cmd,
 			     t->buffer->data_size, t->buffer->offsets_size,
-			     (u64)tr.data.ptr.buffer, (u64)tr.data.ptr.offsets);
+			     (u64)trd->data.ptr.buffer,
+			     (u64)trd->data.ptr.offsets);
 
 		if (t_from)
 			binder_thread_dec_tmpref(t_from);
 		t->buffer->allow_user_free = 1;
-		if (cmd == BR_TRANSACTION && !(t->flags & TF_ONE_WAY)) {
+		if (cmd != BR_REPLY && !(t->flags & TF_ONE_WAY)) {
 			binder_inner_proc_lock(thread->proc);
 			t->to_parent = thread->transaction_stack;
 			t->to_thread = thread;
@@ -4690,7 +4737,8 @@ out:
 	return ret;
 }
 
-static int binder_ioctl_set_ctx_mgr(struct file *filp)
+static int binder_ioctl_set_ctx_mgr(struct file *filp,
+				    struct flat_binder_object *fbo)
 {
 	int ret = 0;
 	struct binder_proc *proc = filp->private_data;
@@ -4719,7 +4767,7 @@ static int binder_ioctl_set_ctx_mgr(struct file *filp)
 	} else {
 		context->binder_context_mgr_uid = curr_euid;
 	}
-	new_node = binder_new_node(proc, NULL);
+	new_node = binder_new_node(proc, fbo);
 	if (!new_node) {
 		ret = -ENOMEM;
 		goto out;
@@ -4842,8 +4890,20 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		binder_inner_proc_unlock(proc);
 		break;
 	}
+	case BINDER_SET_CONTEXT_MGR_EXT: {
+		struct flat_binder_object fbo;
+
+		if (copy_from_user(&fbo, ubuf, sizeof(fbo))) {
+			ret = -EINVAL;
+			goto err;
+		}
+		ret = binder_ioctl_set_ctx_mgr(filp, &fbo);
+		if (ret)
+			goto err;
+		break;
+	}
 	case BINDER_SET_CONTEXT_MGR:
-		ret = binder_ioctl_set_ctx_mgr(filp);
+		ret = binder_ioctl_set_ctx_mgr(filp, NULL);
 		if (ret)
 			goto err;
 		break;
diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h
index b9ba520f7e4b..2832134e5397 100644
--- a/include/uapi/linux/android/binder.h
+++ b/include/uapi/linux/android/binder.h
@@ -41,6 +41,14 @@ enum {
 enum {
 	FLAT_BINDER_FLAG_PRIORITY_MASK = 0xff,
 	FLAT_BINDER_FLAG_ACCEPTS_FDS = 0x100,
+
+	/**
+	 * @FLAT_BINDER_FLAG_TXN_SECURITY_CTX: request security contexts
+	 *
+	 * Only when set, causes senders to include their security
+	 * context
+	 */
+	FLAT_BINDER_FLAG_TXN_SECURITY_CTX = 0x1000,
 };
 
 #ifdef BINDER_IPC_32BIT
@@ -218,6 +226,7 @@ struct binder_node_info_for_ref {
 #define BINDER_VERSION			_IOWR('b', 9, struct binder_version)
 #define BINDER_GET_NODE_DEBUG_INFO	_IOWR('b', 11, struct binder_node_debug_info)
 #define BINDER_GET_NODE_INFO_FOR_REF	_IOWR('b', 12, struct binder_node_info_for_ref)
+#define BINDER_SET_CONTEXT_MGR_EXT	_IOW('b', 13, struct flat_binder_object)
 
 /*
  * NOTE: Two special error codes you should check for when calling
@@ -276,6 +285,11 @@ struct binder_transaction_data {
 	} data;
 };
 
+struct binder_transaction_data_secctx {
+	struct binder_transaction_data transaction_data;
+	binder_uintptr_t secctx;
+};
+
 struct binder_transaction_data_sg {
 	struct binder_transaction_data transaction_data;
 	binder_size_t buffers_size;
@@ -311,6 +325,11 @@ enum binder_driver_return_protocol {
 	BR_OK = _IO('r', 1),
 	/* No parameters! */
 
+	BR_TRANSACTION_SEC_CTX = _IOR('r', 2,
+				      struct binder_transaction_data_secctx),
+	/*
+	 * binder_transaction_data_secctx: the received command.
+	 */
 	BR_TRANSACTION = _IOR('r', 2, struct binder_transaction_data),
 	BR_REPLY = _IOR('r', 3, struct binder_transaction_data),
 	/*
-- 
cgit v1.2.3-59-g8ed1b


From a258aeacd7f0dc10bb45caa7e92a3ea3ca1a76e9 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Fri, 18 Jan 2019 14:30:23 +0200
Subject: bonding: add support for xstats and export 3ad stats

This patch adds support for extended statistics (xstats) call to the
bonding. The first user would be the 3ad code which counts the following
events:
 - LACPDU Rx/Tx
 - LACPDU unknown type Rx
 - LACPDU illegal Rx
 - Marker Rx/Tx
 - Marker response Rx/Tx
 - Marker unknown type Rx

All of these are exported via netlink as separate attributes to be
easily extensible as we plan to add more in the future.
Similar to how the bridge and other xstats exports, the structure
inside is:
 [ IFLA_STATS_LINK_XSTATS ]
   -> [ LINK_XSTATS_TYPE_BOND ]
        -> [ BOND_XSTATS_3AD ]
             -> [ 3ad stats attributes ]

With this structure it's easy to add more stat types later.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_3ad.c     | 83 ++++++++++++++++++++++++++++++++++++++
 drivers/net/bonding/bond_netlink.c | 71 ++++++++++++++++++++++++++++++++
 include/net/bond_3ad.h             |  3 ++
 include/uapi/linux/if_bonding.h    | 24 +++++++++++
 include/uapi/linux/if_link.h       |  1 +
 5 files changed, 182 insertions(+)

(limited to 'include/uapi')

diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index d1d8cb6b8cdc..d30c21b34858 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -31,6 +31,7 @@
 #include <net/net_namespace.h>
 #include <net/bonding.h>
 #include <net/bond_3ad.h>
+#include <net/netlink.h>
 
 /* General definitions */
 #define AD_SHORT_TIMEOUT           1
@@ -2696,3 +2697,85 @@ void bond_3ad_update_lacp_rate(struct bonding *bond)
 	}
 	spin_unlock_bh(&bond->mode_lock);
 }
+
+void bond_3ad_stats_add(struct slave *slave, struct bond_3ad_stats *stats)
+{
+	struct bond_3ad_stats *rstats = &SLAVE_AD_INFO(slave)->stats;
+	u64 stat;
+
+	atomic64_add(atomic64_read(&rstats->lacpdu_rx), &stats->lacpdu_rx);
+	atomic64_add(atomic64_read(&rstats->lacpdu_tx), &stats->lacpdu_tx);
+
+	stat = atomic64_read(&rstats->lacpdu_unknown_rx);
+	atomic64_add(stat, &stats->lacpdu_unknown_rx);
+	stat = atomic64_read(&rstats->lacpdu_illegal_rx);
+	atomic64_add(stat, &stats->lacpdu_illegal_rx);
+
+	atomic64_add(atomic64_read(&rstats->marker_rx), &stats->marker_rx);
+	atomic64_add(atomic64_read(&rstats->marker_tx), &stats->marker_tx);
+
+	stat = atomic64_read(&rstats->marker_resp_rx);
+	atomic64_add(stat, &stats->marker_resp_rx);
+	stat = atomic64_read(&rstats->marker_resp_tx);
+	atomic64_add(stat, &stats->marker_resp_tx);
+	stat = atomic64_read(&rstats->marker_unknown_rx);
+	atomic64_add(stat, &stats->marker_unknown_rx);
+}
+
+size_t bond_3ad_stats_size(void)
+{
+	return nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_LACPDU_RX */
+	       nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_LACPDU_TX */
+	       nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_LACPDU_UNKNOWN_RX */
+	       nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_LACPDU_ILLEGAL_RX */
+	       nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_MARKER_RX */
+	       nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_MARKER_TX */
+	       nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_MARKER_RESP_RX */
+	       nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_MARKER_RESP_TX */
+	       nla_total_size_64bit(sizeof(u64)); /* BOND_3AD_STAT_MARKER_UNKNOWN_RX */
+}
+
+int bond_3ad_stats_fill(struct sk_buff *skb, struct bond_3ad_stats *stats)
+{
+	u64 val;
+
+	val = atomic64_read(&stats->lacpdu_rx);
+	if (nla_put_u64_64bit(skb, BOND_3AD_STAT_LACPDU_RX, val,
+			      BOND_3AD_STAT_PAD))
+		return -EMSGSIZE;
+	val = atomic64_read(&stats->lacpdu_tx);
+	if (nla_put_u64_64bit(skb, BOND_3AD_STAT_LACPDU_TX, val,
+			      BOND_3AD_STAT_PAD))
+		return -EMSGSIZE;
+	val = atomic64_read(&stats->lacpdu_unknown_rx);
+	if (nla_put_u64_64bit(skb, BOND_3AD_STAT_LACPDU_UNKNOWN_RX, val,
+			      BOND_3AD_STAT_PAD))
+		return -EMSGSIZE;
+	val = atomic64_read(&stats->lacpdu_illegal_rx);
+	if (nla_put_u64_64bit(skb, BOND_3AD_STAT_LACPDU_ILLEGAL_RX, val,
+			      BOND_3AD_STAT_PAD))
+		return -EMSGSIZE;
+
+	val = atomic64_read(&stats->marker_rx);
+	if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_RX, val,
+			      BOND_3AD_STAT_PAD))
+		return -EMSGSIZE;
+	val = atomic64_read(&stats->marker_tx);
+	if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_TX, val,
+			      BOND_3AD_STAT_PAD))
+		return -EMSGSIZE;
+	val = atomic64_read(&stats->marker_resp_rx);
+	if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_RESP_RX, val,
+			      BOND_3AD_STAT_PAD))
+		return -EMSGSIZE;
+	val = atomic64_read(&stats->marker_resp_tx);
+	if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_RESP_TX, val,
+			      BOND_3AD_STAT_PAD))
+		return -EMSGSIZE;
+	val = atomic64_read(&stats->marker_unknown_rx);
+	if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_UNKNOWN_RX, val,
+			      BOND_3AD_STAT_PAD))
+		return -EMSGSIZE;
+
+	return 0;
+}
diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c
index 6b9ad8673218..d1338fbe1830 100644
--- a/drivers/net/bonding/bond_netlink.c
+++ b/drivers/net/bonding/bond_netlink.c
@@ -675,6 +675,75 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
+static size_t bond_get_linkxstats_size(const struct net_device *dev, int attr)
+{
+	switch (attr) {
+	case IFLA_STATS_LINK_XSTATS:
+	case IFLA_STATS_LINK_XSTATS_SLAVE:
+		break;
+	default:
+		return 0;
+	}
+
+	return bond_3ad_stats_size() + nla_total_size(0);
+}
+
+static int bond_fill_linkxstats(struct sk_buff *skb,
+				const struct net_device *dev,
+				int *prividx, int attr)
+{
+	struct nlattr *nla __maybe_unused;
+	struct slave *slave = NULL;
+	struct nlattr *nest, *nest2;
+	struct bonding *bond;
+
+	switch (attr) {
+	case IFLA_STATS_LINK_XSTATS:
+		bond = netdev_priv(dev);
+		break;
+	case IFLA_STATS_LINK_XSTATS_SLAVE:
+		slave = bond_slave_get_rtnl(dev);
+		if (!slave)
+			return 0;
+		bond = slave->bond;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	nest = nla_nest_start(skb, LINK_XSTATS_TYPE_BOND);
+	if (!nest)
+		return -EMSGSIZE;
+	if (BOND_MODE(bond) == BOND_MODE_8023AD) {
+		struct bond_3ad_stats stats;
+		struct list_head *iter;
+
+		memset(&stats, 0, sizeof(stats));
+		if (slave) {
+			bond_3ad_stats_add(slave, &stats);
+		} else {
+			bond_for_each_slave(bond, slave, iter)
+				bond_3ad_stats_add(slave, &stats);
+		}
+
+		nest2 = nla_nest_start(skb, BOND_XSTATS_3AD);
+		if (!nest2) {
+			nla_nest_end(skb, nest);
+			return -EMSGSIZE;
+		}
+
+		if (bond_3ad_stats_fill(skb, &stats)) {
+			nla_nest_cancel(skb, nest2);
+			nla_nest_end(skb, nest);
+			return -EMSGSIZE;
+		}
+		nla_nest_end(skb, nest2);
+	}
+	nla_nest_end(skb, nest);
+
+	return 0;
+}
+
 struct rtnl_link_ops bond_link_ops __read_mostly = {
 	.kind			= "bond",
 	.priv_size		= sizeof(struct bonding),
@@ -689,6 +758,8 @@ struct rtnl_link_ops bond_link_ops __read_mostly = {
 	.get_num_tx_queues	= bond_get_num_tx_queues,
 	.get_num_rx_queues	= bond_get_num_tx_queues, /* Use the same number
 							     as for TX queues */
+	.fill_linkxstats        = bond_fill_linkxstats,
+	.get_linkxstats_size    = bond_get_linkxstats_size,
 	.slave_maxtype		= IFLA_BOND_SLAVE_MAX,
 	.slave_policy		= bond_slave_policy,
 	.slave_changelink	= bond_slave_changelink,
diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h
index 30e60dba1b2d..25aaf49d19be 100644
--- a/include/net/bond_3ad.h
+++ b/include/net/bond_3ad.h
@@ -321,5 +321,8 @@ int bond_3ad_lacpdu_recv(const struct sk_buff *skb, struct bonding *bond,
 int bond_3ad_set_carrier(struct bonding *bond);
 void bond_3ad_update_lacp_rate(struct bonding *bond);
 void bond_3ad_update_ad_actor_settings(struct bonding *bond);
+void bond_3ad_stats_add(struct slave *slave, struct bond_3ad_stats *stats);
+int bond_3ad_stats_fill(struct sk_buff *skb, struct bond_3ad_stats *stats);
+size_t bond_3ad_stats_size(void);
 #endif /* _NET_BOND_3AD_H */
 
diff --git a/include/uapi/linux/if_bonding.h b/include/uapi/linux/if_bonding.h
index 61a1bf6e865e..790585f0e61b 100644
--- a/include/uapi/linux/if_bonding.h
+++ b/include/uapi/linux/if_bonding.h
@@ -117,6 +117,30 @@ struct ad_info {
 	__u8 partner_system[ETH_ALEN];
 };
 
+/* Embedded inside LINK_XSTATS_TYPE_BOND */
+enum {
+	BOND_XSTATS_UNSPEC,
+	BOND_XSTATS_3AD,
+	__BOND_XSTATS_MAX
+};
+#define BOND_XSTATS_MAX (__BOND_XSTATS_MAX - 1)
+
+/* Embedded inside BOND_XSTATS_3AD */
+enum {
+	BOND_3AD_STAT_LACPDU_RX,
+	BOND_3AD_STAT_LACPDU_TX,
+	BOND_3AD_STAT_LACPDU_UNKNOWN_RX,
+	BOND_3AD_STAT_LACPDU_ILLEGAL_RX,
+	BOND_3AD_STAT_MARKER_RX,
+	BOND_3AD_STAT_MARKER_TX,
+	BOND_3AD_STAT_MARKER_RESP_RX,
+	BOND_3AD_STAT_MARKER_RESP_TX,
+	BOND_3AD_STAT_MARKER_UNKNOWN_RX,
+	BOND_3AD_STAT_PAD,
+	__BOND_3AD_STAT_MAX
+};
+#define BOND_3AD_STAT_MAX (__BOND_3AD_STAT_MAX - 1)
+
 #endif /* _LINUX_IF_BONDING_H */
 
 /*
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index d6533828123a..5b225ff63b48 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -925,6 +925,7 @@ enum {
 enum {
 	LINK_XSTATS_TYPE_UNSPEC,
 	LINK_XSTATS_TYPE_BRIDGE,
+	LINK_XSTATS_TYPE_BOND,
 	__LINK_XSTATS_TYPE_MAX
 };
 #define LINK_XSTATS_TYPE_MAX (__LINK_XSTATS_TYPE_MAX - 1)
-- 
cgit v1.2.3-59-g8ed1b


From 4effd28c1245303dce7fd290c501ac2c11052114 Mon Sep 17 00:00:00 2001
From: Linus Lüssing <linus.luessing@c0d3.blue>
Date: Mon, 21 Jan 2019 07:26:27 +0100
Subject: bridge: join all-snoopers multicast address
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Next to snooping IGMP/MLD queries RFC4541, section 2.1.1.a) recommends
to snoop multicast router advertisements to detect multicast routers.

Multicast router advertisements are sent to an "all-snoopers"
multicast address. To be able to receive them reliably, we need to
join this group.

Otherwise other snooping switches might refrain from forwarding these
advertisements to us.

Signed-off-by: Linus Lüssing <linus.luessing@c0d3.blue>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/in.h   |  9 +++---
 net/bridge/br_multicast.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++-
 net/ipv6/mcast.c          |  2 ++
 3 files changed, 78 insertions(+), 5 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h
index a55cb8b10165..e7ad9d350a28 100644
--- a/include/uapi/linux/in.h
+++ b/include/uapi/linux/in.h
@@ -292,10 +292,11 @@ struct sockaddr_in {
 #define	IN_LOOPBACK(a)		((((long int) (a)) & 0xff000000) == 0x7f000000)
 
 /* Defines for Multicast INADDR */
-#define INADDR_UNSPEC_GROUP   	0xe0000000U	/* 224.0.0.0   */
-#define INADDR_ALLHOSTS_GROUP 	0xe0000001U	/* 224.0.0.1   */
-#define INADDR_ALLRTRS_GROUP    0xe0000002U	/* 224.0.0.2 */
-#define INADDR_MAX_LOCAL_GROUP  0xe00000ffU	/* 224.0.0.255 */
+#define INADDR_UNSPEC_GROUP		0xe0000000U	/* 224.0.0.0   */
+#define INADDR_ALLHOSTS_GROUP		0xe0000001U	/* 224.0.0.1   */
+#define INADDR_ALLRTRS_GROUP		0xe0000002U	/* 224.0.0.2 */
+#define INADDR_ALLSNOOPERS_GROUP	0xe000006aU	/* 224.0.0.106 */
+#define INADDR_MAX_LOCAL_GROUP		0xe00000ffU	/* 224.0.0.255 */
 #endif
 
 /* <asm/byteorder.h> contains the htonl type stuff.. */
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 156c4905639e..2366f4a2780e 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -1780,6 +1780,68 @@ void br_multicast_init(struct net_bridge *br)
 	INIT_HLIST_HEAD(&br->mdb_list);
 }
 
+static void br_ip4_multicast_join_snoopers(struct net_bridge *br)
+{
+	struct in_device *in_dev = in_dev_get(br->dev);
+
+	if (!in_dev)
+		return;
+
+	ip_mc_inc_group(in_dev, htonl(INADDR_ALLSNOOPERS_GROUP));
+	in_dev_put(in_dev);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static void br_ip6_multicast_join_snoopers(struct net_bridge *br)
+{
+	struct in6_addr addr;
+
+	ipv6_addr_set(&addr, htonl(0xff020000), 0, 0, htonl(0x6a));
+	ipv6_dev_mc_inc(br->dev, &addr);
+}
+#else
+static inline void br_ip6_multicast_join_snoopers(struct net_bridge *br)
+{
+}
+#endif
+
+static void br_multicast_join_snoopers(struct net_bridge *br)
+{
+	br_ip4_multicast_join_snoopers(br);
+	br_ip6_multicast_join_snoopers(br);
+}
+
+static void br_ip4_multicast_leave_snoopers(struct net_bridge *br)
+{
+	struct in_device *in_dev = in_dev_get(br->dev);
+
+	if (WARN_ON(!in_dev))
+		return;
+
+	ip_mc_dec_group(in_dev, htonl(INADDR_ALLSNOOPERS_GROUP));
+	in_dev_put(in_dev);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static void br_ip6_multicast_leave_snoopers(struct net_bridge *br)
+{
+	struct in6_addr addr;
+
+	ipv6_addr_set(&addr, htonl(0xff020000), 0, 0, htonl(0x6a));
+	ipv6_dev_mc_dec(br->dev, &addr);
+}
+#else
+static inline void br_ip6_multicast_leave_snoopers(struct net_bridge *br)
+{
+}
+#endif
+
+static void br_multicast_leave_snoopers(struct net_bridge *br)
+{
+	br_ip4_multicast_leave_snoopers(br);
+	br_ip6_multicast_leave_snoopers(br);
+}
+
 static void __br_multicast_open(struct net_bridge *br,
 				struct bridge_mcast_own_query *query)
 {
@@ -1793,6 +1855,9 @@ static void __br_multicast_open(struct net_bridge *br,
 
 void br_multicast_open(struct net_bridge *br)
 {
+	if (br_opt_get(br, BROPT_MULTICAST_ENABLED))
+		br_multicast_join_snoopers(br);
+
 	__br_multicast_open(br, &br->ip4_own_query);
 #if IS_ENABLED(CONFIG_IPV6)
 	__br_multicast_open(br, &br->ip6_own_query);
@@ -1808,6 +1873,9 @@ void br_multicast_stop(struct net_bridge *br)
 	del_timer_sync(&br->ip6_other_query.timer);
 	del_timer_sync(&br->ip6_own_query.timer);
 #endif
+
+	if (br_opt_get(br, BROPT_MULTICAST_ENABLED))
+		br_multicast_leave_snoopers(br);
 }
 
 void br_multicast_dev_del(struct net_bridge *br)
@@ -1943,8 +2011,10 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val)
 
 	br_mc_disabled_update(br->dev, val);
 	br_opt_toggle(br, BROPT_MULTICAST_ENABLED, !!val);
-	if (!br_opt_get(br, BROPT_MULTICAST_ENABLED))
+	if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) {
+		br_multicast_leave_snoopers(br);
 		goto unlock;
+	}
 
 	if (!netif_running(br->dev))
 		goto unlock;
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 21f6deb2aec9..42f3f5cd349f 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -940,6 +940,7 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr)
 {
 	return __ipv6_dev_mc_inc(dev, addr, MCAST_EXCLUDE);
 }
+EXPORT_SYMBOL(ipv6_dev_mc_inc);
 
 /*
  *	device multicast group del
@@ -987,6 +988,7 @@ int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr)
 
 	return err;
 }
+EXPORT_SYMBOL(ipv6_dev_mc_dec);
 
 /*
  *	check if the interface/address pair is valid
-- 
cgit v1.2.3-59-g8ed1b


From 4b3087c7e37f9e499127201849e33960dc81da11 Mon Sep 17 00:00:00 2001
From: Linus Lüssing <linus.luessing@c0d3.blue>
Date: Mon, 21 Jan 2019 07:26:28 +0100
Subject: bridge: Snoop Multicast Router Advertisements
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When multiple multicast routers are present in a broadcast domain then
only one of them will be detectable via IGMP/MLD query snooping. The
multicast router with the lowest IP address will become the selected and
active querier while all other multicast routers will then refrain from
sending queries.

To detect such rather silent multicast routers, too, RFC4286
("Multicast Router Discovery") provides a standardized protocol to
detect multicast routers for multicast snooping switches.

This patch implements the necessary MRD Advertisement message parsing
and after successful processing adds such routers to the internal
multicast router list.

Signed-off-by: Linus Lüssing <linus.luessing@c0d3.blue>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/in.h          |  5 +++++
 include/net/addrconf.h      | 15 +++++++++++++
 include/uapi/linux/icmpv6.h |  2 ++
 include/uapi/linux/igmp.h   |  1 +
 net/bridge/br_multicast.c   | 55 +++++++++++++++++++++++++++++++++++++++++++++
 net/ipv6/mcast_snoop.c      |  5 ++++-
 6 files changed, 82 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/linux/in.h b/include/linux/in.h
index 31b493734763..435e7f2a513a 100644
--- a/include/linux/in.h
+++ b/include/linux/in.h
@@ -60,6 +60,11 @@ static inline bool ipv4_is_lbcast(__be32 addr)
 	return addr == htonl(INADDR_BROADCAST);
 }
 
+static inline bool ipv4_is_all_snoopers(__be32 addr)
+{
+	return addr == htonl(INADDR_ALLSNOOPERS_GROUP);
+}
+
 static inline bool ipv4_is_zeronet(__be32 addr)
 {
 	return (addr & htonl(0xff000000)) == htonl(0x00000000);
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index daf11dcb0f70..20d523ee2fec 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -229,6 +229,7 @@ void ipv6_mc_unmap(struct inet6_dev *idev);
 void ipv6_mc_remap(struct inet6_dev *idev);
 void ipv6_mc_init_dev(struct inet6_dev *idev);
 void ipv6_mc_destroy_dev(struct inet6_dev *idev);
+int ipv6_mc_check_icmpv6(struct sk_buff *skb);
 int ipv6_mc_check_mld(struct sk_buff *skb);
 void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp);
 
@@ -499,6 +500,20 @@ static inline bool ipv6_addr_is_solict_mult(const struct in6_addr *addr)
 #endif
 }
 
+static inline bool ipv6_addr_is_all_snoopers(const struct in6_addr *addr)
+{
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
+	__be64 *p = (__be64 *)addr;
+
+	return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) |
+		(p[1] ^ cpu_to_be64(0x6a))) == 0UL;
+#else
+	return ((addr->s6_addr32[0] ^ htonl(0xff020000)) |
+		addr->s6_addr32[1] | addr->s6_addr32[2] |
+		(addr->s6_addr32[3] ^ htonl(0x0000006a))) == 0;
+#endif
+}
+
 #ifdef CONFIG_PROC_FS
 int if6_proc_init(void);
 void if6_proc_exit(void);
diff --git a/include/uapi/linux/icmpv6.h b/include/uapi/linux/icmpv6.h
index caf8dc019250..325395f56bfa 100644
--- a/include/uapi/linux/icmpv6.h
+++ b/include/uapi/linux/icmpv6.h
@@ -108,6 +108,8 @@ struct icmp6hdr {
 #define ICMPV6_MOBILE_PREFIX_SOL	146
 #define ICMPV6_MOBILE_PREFIX_ADV	147
 
+#define ICMPV6_MRDISC_ADV		151
+
 /*
  *	Codes for Destination Unreachable
  */
diff --git a/include/uapi/linux/igmp.h b/include/uapi/linux/igmp.h
index 7e44ac02ca18..90c28bc466c6 100644
--- a/include/uapi/linux/igmp.h
+++ b/include/uapi/linux/igmp.h
@@ -93,6 +93,7 @@ struct igmpv3_query {
 #define IGMP_MTRACE_RESP		0x1e
 #define IGMP_MTRACE			0x1f
 
+#define IGMP_MRDISC_ADV			0x30	/* From RFC4286 */
 
 /*
  *	Use the BSD names for these for compatibility
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 2366f4a2780e..2c46c7aca571 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -14,6 +14,7 @@
 #include <linux/export.h>
 #include <linux/if_ether.h>
 #include <linux/igmp.h>
+#include <linux/in.h>
 #include <linux/jhash.h>
 #include <linux/kernel.h>
 #include <linux/log2.h>
@@ -29,10 +30,12 @@
 #include <net/ip.h>
 #include <net/switchdev.h>
 #if IS_ENABLED(CONFIG_IPV6)
+#include <linux/icmpv6.h>
 #include <net/ipv6.h>
 #include <net/mld.h>
 #include <net/ip6_checksum.h>
 #include <net/addrconf.h>
+#include <net/ipv6.h>
 #endif
 
 #include "br_private.h"
@@ -1583,6 +1586,19 @@ static void br_multicast_pim(struct net_bridge *br,
 	br_multicast_mark_router(br, port);
 }
 
+static int br_ip4_multicast_mrd_rcv(struct net_bridge *br,
+				    struct net_bridge_port *port,
+				    struct sk_buff *skb)
+{
+	if (ip_hdr(skb)->protocol != IPPROTO_IGMP ||
+	    igmp_hdr(skb)->type != IGMP_MRDISC_ADV)
+		return -ENOMSG;
+
+	br_multicast_mark_router(br, port);
+
+	return 0;
+}
+
 static int br_multicast_ipv4_rcv(struct net_bridge *br,
 				 struct net_bridge_port *port,
 				 struct sk_buff *skb,
@@ -1600,7 +1616,15 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
 		} else if (pim_ipv4_all_pim_routers(ip_hdr(skb)->daddr)) {
 			if (ip_hdr(skb)->protocol == IPPROTO_PIM)
 				br_multicast_pim(br, port, skb);
+		} else if (ipv4_is_all_snoopers(ip_hdr(skb)->daddr)) {
+			err = br_ip4_multicast_mrd_rcv(br, port, skb);
+
+			if (err < 0 && err != -ENOMSG) {
+				br_multicast_err_count(br, port, skb->protocol);
+				return err;
+			}
 		}
+
 		return 0;
 	} else if (err < 0) {
 		br_multicast_err_count(br, port, skb->protocol);
@@ -1635,6 +1659,27 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
+static int br_ip6_multicast_mrd_rcv(struct net_bridge *br,
+				    struct net_bridge_port *port,
+				    struct sk_buff *skb)
+{
+	int ret;
+
+	if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6)
+		return -ENOMSG;
+
+	ret = ipv6_mc_check_icmpv6(skb);
+	if (ret < 0)
+		return ret;
+
+	if (icmp6_hdr(skb)->icmp6_type != ICMPV6_MRDISC_ADV)
+		return -ENOMSG;
+
+	br_multicast_mark_router(br, port);
+
+	return 0;
+}
+
 static int br_multicast_ipv6_rcv(struct net_bridge *br,
 				 struct net_bridge_port *port,
 				 struct sk_buff *skb,
@@ -1649,6 +1694,16 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
 	if (err == -ENOMSG) {
 		if (!ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr))
 			BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
+
+		if (ipv6_addr_is_all_snoopers(&ipv6_hdr(skb)->daddr)) {
+			err = br_ip6_multicast_mrd_rcv(br, port, skb);
+
+			if (err < 0 && err != -ENOMSG) {
+				br_multicast_err_count(br, port, skb->protocol);
+				return err;
+			}
+		}
+
 		return 0;
 	} else if (err < 0) {
 		br_multicast_err_count(br, port, skb->protocol);
diff --git a/net/ipv6/mcast_snoop.c b/net/ipv6/mcast_snoop.c
index a72ddfc40eb3..55e2ac179f28 100644
--- a/net/ipv6/mcast_snoop.c
+++ b/net/ipv6/mcast_snoop.c
@@ -41,6 +41,8 @@ static int ipv6_mc_check_ip6hdr(struct sk_buff *skb)
 	if (skb->len < len || len <= offset)
 		return -EINVAL;
 
+	skb_set_transport_header(skb, offset);
+
 	return 0;
 }
 
@@ -142,7 +144,7 @@ static inline __sum16 ipv6_mc_validate_checksum(struct sk_buff *skb)
 	return skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo);
 }
 
-static int ipv6_mc_check_icmpv6(struct sk_buff *skb)
+int ipv6_mc_check_icmpv6(struct sk_buff *skb)
 {
 	unsigned int len = skb_transport_offset(skb) + sizeof(struct icmp6hdr);
 	unsigned int transport_len = ipv6_transport_len(skb);
@@ -161,6 +163,7 @@ static int ipv6_mc_check_icmpv6(struct sk_buff *skb)
 
 	return 0;
 }
+EXPORT_SYMBOL(ipv6_mc_check_icmpv6);
 
 /**
  * ipv6_mc_check_mld - checks whether this is a sane MLD packet
-- 
cgit v1.2.3-59-g8ed1b


From 141e5dcaa7356077028b4cd48ec351a38c70e5e5 Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Thu, 24 Jan 2019 00:29:20 -0800
Subject: Input: input_event - fix the CONFIG_SPARC64 mixup

Arnd Bergmann pointed out that CONFIG_* cannot be used in a uapi header.
Override with an equivalent conditional.

Fixes: 2e746942ebac ("Input: input_event - provide override for sparc64")
Fixes: 152194fe9c3f ("Input: extend usable life of event timestamps to 2106 on 32 bit systems")
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 include/uapi/linux/input.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h
index ffab958bc512..f056b2a00d5c 100644
--- a/include/uapi/linux/input.h
+++ b/include/uapi/linux/input.h
@@ -32,7 +32,7 @@ struct input_event {
 #define input_event_usec time.tv_usec
 #else
 	__kernel_ulong_t __sec;
-#ifdef CONFIG_SPARC64
+#if defined(__sparc__) && defined(__arch64__)
 	unsigned int __usec;
 #else
 	__kernel_ulong_t __usec;
-- 
cgit v1.2.3-59-g8ed1b


From d9ff286a0f59fa7843549e49bd240393dd7d8b87 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 23 Jan 2019 09:22:27 -0800
Subject: bpf: allow BPF programs access skb_shared_info->gso_segs field

This adds the ability to read gso_segs from a BPF program.

v3: Use BPF_REG_AX instead of BPF_REG_TMP for the temporary register,
    as suggested by Martin.

v2: refined Eddie Hao patch to address Alexei feedback.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Eddie Hao <eddieh@google.com>
Cc: Martin KaFai Lau <kafai@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h                    |  1 +
 net/core/filter.c                           | 21 +++++++++++++++++
 tools/include/uapi/linux/bpf.h              |  1 +
 tools/testing/selftests/bpf/test_verifier.c | 36 +++++++++++++++++++++++++++++
 4 files changed, 59 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 91c43884f295..2940a9854f6d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2540,6 +2540,7 @@ struct __sk_buff {
 	__bpf_md_ptr(struct bpf_flow_keys *, flow_keys);
 	__u64 tstamp;
 	__u32 wire_len;
+	__u32 gso_segs;
 };
 
 struct bpf_tunnel_key {
diff --git a/net/core/filter.c b/net/core/filter.c
index 2b3b436ef545..8e587dd1da20 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -6700,6 +6700,27 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
 							     target_size));
 		break;
 
+	case offsetof(struct __sk_buff, gso_segs):
+		/* si->dst_reg = skb_shinfo(SKB); */
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct sk_buff, head));
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
+				      BPF_REG_AX, si->src_reg,
+				      offsetof(struct sk_buff, end));
+		*insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX);
+#else
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct sk_buff, end));
+#endif
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs),
+				      si->dst_reg, si->dst_reg,
+				      bpf_target_off(struct skb_shared_info,
+						     gso_segs, 2,
+						     target_size));
+		break;
 	case offsetof(struct __sk_buff, wire_len):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, pkt_len) != 4);
 
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 91c43884f295..2940a9854f6d 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2540,6 +2540,7 @@ struct __sk_buff {
 	__bpf_md_ptr(struct bpf_flow_keys *, flow_keys);
 	__u64 tstamp;
 	__u32 wire_len;
+	__u32 gso_segs;
 };
 
 struct bpf_tunnel_key {
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 4f67afeec8d9..e4fef6ca8071 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -5663,6 +5663,42 @@ static struct bpf_test tests[] = {
 		.result = ACCEPT,
 		.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
 	},
+	{
+		"read gso_segs from CGROUP_SKB",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, gso_segs)),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+	},
+	{
+		"write gso_segs from CGROUP_SKB",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, gso_segs)),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.result_unpriv = REJECT,
+		.errstr = "invalid bpf_context access off=164 size=4",
+		.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+	},
+	{
+		"read gso_segs from CLS",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, gso_segs)),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	},
 	{
 		"multiple registers share map_lookup_elem result",
 		.insns = {
-- 
cgit v1.2.3-59-g8ed1b


From 45383fb0f42db3945ac6cc658704706cdae19528 Mon Sep 17 00:00:00 2001
From: Tiwei Bie <tiwei.bie@intel.com>
Date: Wed, 23 Jan 2019 17:50:26 +0800
Subject: virtio: support VIRTIO_F_ORDER_PLATFORM

This patch introduces the support for VIRTIO_F_ORDER_PLATFORM.
If this feature is negotiated, the driver must use the barriers
suitable for hardware devices. Otherwise, the device and driver
are assumed to be implemented in software, that is they can be
assumed to run on identical CPUs in an SMP configuration. Thus
a weaker form of memory barriers is sufficient to yield better
performance.

It is recommended that an add-in card based PCI device offers
this feature for portability. The device will fail to operate
further or will operate in a slower emulation mode if this
feature is offered but not accepted.

Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/virtio_ring.c       | 8 ++++++++
 include/uapi/linux/virtio_config.h | 6 ++++++
 2 files changed, 14 insertions(+)

(limited to 'include/uapi')

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index cd7e755484e3..27d3f057493e 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -1609,6 +1609,9 @@ static struct virtqueue *vring_create_virtqueue_packed(
 		!context;
 	vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
 
+	if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM))
+		vq->weak_barriers = false;
+
 	vq->packed.ring_dma_addr = ring_dma_addr;
 	vq->packed.driver_event_dma_addr = driver_event_dma_addr;
 	vq->packed.device_event_dma_addr = device_event_dma_addr;
@@ -2079,6 +2082,9 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
 		!context;
 	vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
 
+	if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM))
+		vq->weak_barriers = false;
+
 	vq->split.queue_dma_addr = 0;
 	vq->split.queue_size_in_bytes = 0;
 
@@ -2213,6 +2219,8 @@ void vring_transport_features(struct virtio_device *vdev)
 			break;
 		case VIRTIO_F_RING_PACKED:
 			break;
+		case VIRTIO_F_ORDER_PLATFORM:
+			break;
 		default:
 			/* We don't understand this bit. */
 			__virtio_clear_bit(vdev, i);
diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h
index 1196e1c1d4f6..ff8e7dc9d4dd 100644
--- a/include/uapi/linux/virtio_config.h
+++ b/include/uapi/linux/virtio_config.h
@@ -78,6 +78,12 @@
 /* This feature indicates support for the packed virtqueue layout. */
 #define VIRTIO_F_RING_PACKED		34
 
+/*
+ * This feature indicates that memory accesses by the driver and the
+ * device are ordered in a way described by the platform.
+ */
+#define VIRTIO_F_ORDER_PLATFORM		36
+
 /*
  * Does the device support Single Root I/O Virtualization?
  */
-- 
cgit v1.2.3-59-g8ed1b


From 745815f955f65f22d378d69822da11043d00aaff Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Thu, 24 Jan 2019 18:20:13 +0900
Subject: uapi: fix ioctl documentation

The description of the BLKGETNRZONES zoned block device ioctl was not
added as a comment together with this ioctl definition in commit
65e4e3eee83d7 ("block: Introduce BLKGETNRZONES ioctl"). Add its
description here.

Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/blkzoned.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h
index 6fa38d001d84..498eec813494 100644
--- a/include/uapi/linux/blkzoned.h
+++ b/include/uapi/linux/blkzoned.h
@@ -138,6 +138,7 @@ struct blk_zone_range {
  * @BLKRESETZONE: Reset the write pointer of the zones in the specified
  *                sector range. The sector range must be zone aligned.
  * @BLKGETZONESZ: Get the device zone size in number of 512 B sectors.
+ * @BLKGETNRZONES: Get the total number of zones of the device.
  */
 #define BLKREPORTZONE	_IOWR(0x12, 130, struct blk_zone_report)
 #define BLKRESETZONE	_IOW(0x12, 131, struct blk_zone_range)
-- 
cgit v1.2.3-59-g8ed1b


From a36b38aa2af61146ea80980a01cf6e952ab021c1 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Thu, 24 Jan 2019 19:59:39 +0100
Subject: xsk: add sock_diag interface for AF_XDP
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds the sock_diag interface for querying sockets from user
space. Tools like iproute2 ss(8) can use this interface to list open
AF_XDP sockets.

The user-space ABI is defined in linux/xdp_diag.h and includes netlink
request and response structs. The request can query sockets and the
response contains socket information about the rings, umems, inode and
more.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/xdp_diag.h |  72 ++++++++++++++++
 net/xdp/Kconfig               |   8 ++
 net/xdp/Makefile              |   1 +
 net/xdp/xsk.c                 |   6 +-
 net/xdp/xsk.h                 |  12 +++
 net/xdp/xsk_diag.c            | 191 ++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 285 insertions(+), 5 deletions(-)
 create mode 100644 include/uapi/linux/xdp_diag.h
 create mode 100644 net/xdp/xsk.h
 create mode 100644 net/xdp/xsk_diag.c

(limited to 'include/uapi')

diff --git a/include/uapi/linux/xdp_diag.h b/include/uapi/linux/xdp_diag.h
new file mode 100644
index 000000000000..78b2591a7782
--- /dev/null
+++ b/include/uapi/linux/xdp_diag.h
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * xdp_diag: interface for query/monitor XDP sockets
+ * Copyright(c) 2019 Intel Corporation.
+ */
+
+#ifndef _LINUX_XDP_DIAG_H
+#define _LINUX_XDP_DIAG_H
+
+#include <linux/types.h>
+
+struct xdp_diag_req {
+	__u8	sdiag_family;
+	__u8	sdiag_protocol;
+	__u16	pad;
+	__u32	xdiag_ino;
+	__u32	xdiag_show;
+	__u32	xdiag_cookie[2];
+};
+
+struct xdp_diag_msg {
+	__u8	xdiag_family;
+	__u8	xdiag_type;
+	__u16	pad;
+	__u32	xdiag_ino;
+	__u32	xdiag_cookie[2];
+};
+
+#define XDP_SHOW_INFO		(1 << 0) /* Basic information */
+#define XDP_SHOW_RING_CFG	(1 << 1)
+#define XDP_SHOW_UMEM		(1 << 2)
+#define XDP_SHOW_MEMINFO	(1 << 3)
+
+enum {
+	XDP_DIAG_NONE,
+	XDP_DIAG_INFO,
+	XDP_DIAG_UID,
+	XDP_DIAG_RX_RING,
+	XDP_DIAG_TX_RING,
+	XDP_DIAG_UMEM,
+	XDP_DIAG_UMEM_FILL_RING,
+	XDP_DIAG_UMEM_COMPLETION_RING,
+	XDP_DIAG_MEMINFO,
+	__XDP_DIAG_MAX,
+};
+
+#define XDP_DIAG_MAX (__XDP_DIAG_MAX - 1)
+
+struct xdp_diag_info {
+	__u32	ifindex;
+	__u32	queue_id;
+};
+
+struct xdp_diag_ring {
+	__u32	entries; /*num descs */
+};
+
+#define XDP_DU_F_ZEROCOPY (1 << 0)
+
+struct xdp_diag_umem {
+	__u64	size;
+	__u32	id;
+	__u32	num_pages;
+	__u32	chunk_size;
+	__u32	headroom;
+	__u32	ifindex;
+	__u32	queue_id;
+	__u32	flags;
+	__u32	refs;
+};
+
+#endif /* _LINUX_XDP_DIAG_H */
diff --git a/net/xdp/Kconfig b/net/xdp/Kconfig
index 90e4a7152854..0255b33cff4b 100644
--- a/net/xdp/Kconfig
+++ b/net/xdp/Kconfig
@@ -5,3 +5,11 @@ config XDP_SOCKETS
 	help
 	  XDP sockets allows a channel between XDP programs and
 	  userspace applications.
+
+config XDP_SOCKETS_DIAG
+	tristate "XDP sockets: monitoring interface"
+	depends on XDP_SOCKETS
+	default n
+	help
+	  Support for PF_XDP sockets monitoring interface used by the ss tool.
+	  If unsure, say Y.
diff --git a/net/xdp/Makefile b/net/xdp/Makefile
index 04f073146256..59dbfdf93dca 100644
--- a/net/xdp/Makefile
+++ b/net/xdp/Makefile
@@ -1 +1,2 @@
 obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o
+obj-$(CONFIG_XDP_SOCKETS_DIAG) += xsk_diag.o
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 80ca48cefc42..949d3bbccb2f 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -27,14 +27,10 @@
 
 #include "xsk_queue.h"
 #include "xdp_umem.h"
+#include "xsk.h"
 
 #define TX_BATCH_SIZE 16
 
-static struct xdp_sock *xdp_sk(struct sock *sk)
-{
-	return (struct xdp_sock *)sk;
-}
-
 bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
 {
 	return READ_ONCE(xs->rx) &&  READ_ONCE(xs->umem) &&
diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h
new file mode 100644
index 000000000000..ba8120610426
--- /dev/null
+++ b/net/xdp/xsk.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2019 Intel Corporation. */
+
+#ifndef XSK_H_
+#define XSK_H_
+
+static inline struct xdp_sock *xdp_sk(struct sock *sk)
+{
+	return (struct xdp_sock *)sk;
+}
+
+#endif /* XSK_H_ */
diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c
new file mode 100644
index 000000000000..661d007c3b28
--- /dev/null
+++ b/net/xdp/xsk_diag.c
@@ -0,0 +1,191 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XDP sockets monitoring support
+ *
+ * Copyright(c) 2019 Intel Corporation.
+ *
+ * Author: Björn Töpel <bjorn.topel@intel.com>
+ */
+
+#include <linux/module.h>
+#include <net/xdp_sock.h>
+#include <linux/xdp_diag.h>
+#include <linux/sock_diag.h>
+
+#include "xsk_queue.h"
+#include "xsk.h"
+
+static int xsk_diag_put_info(const struct xdp_sock *xs, struct sk_buff *nlskb)
+{
+	struct xdp_diag_info di = {};
+
+	di.ifindex = xs->dev ? xs->dev->ifindex : 0;
+	di.queue_id = xs->queue_id;
+	return nla_put(nlskb, XDP_DIAG_INFO, sizeof(di), &di);
+}
+
+static int xsk_diag_put_ring(const struct xsk_queue *queue, int nl_type,
+			     struct sk_buff *nlskb)
+{
+	struct xdp_diag_ring dr = {};
+
+	dr.entries = queue->nentries;
+	return nla_put(nlskb, nl_type, sizeof(dr), &dr);
+}
+
+static int xsk_diag_put_rings_cfg(const struct xdp_sock *xs,
+				  struct sk_buff *nlskb)
+{
+	int err = 0;
+
+	if (xs->rx)
+		err = xsk_diag_put_ring(xs->rx, XDP_DIAG_RX_RING, nlskb);
+	if (!err && xs->tx)
+		err = xsk_diag_put_ring(xs->tx, XDP_DIAG_TX_RING, nlskb);
+	return err;
+}
+
+static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb)
+{
+	struct xdp_umem *umem = xs->umem;
+	struct xdp_diag_umem du = {};
+	int err;
+
+	if (!umem)
+		return 0;
+
+	du.id = umem->id;
+	du.size = umem->size;
+	du.num_pages = umem->npgs;
+	du.chunk_size = (__u32)(~umem->chunk_mask + 1);
+	du.headroom = umem->headroom;
+	du.ifindex = umem->dev ? umem->dev->ifindex : 0;
+	du.queue_id = umem->queue_id;
+	du.flags = 0;
+	if (umem->zc)
+		du.flags |= XDP_DU_F_ZEROCOPY;
+	du.refs = refcount_read(&umem->users);
+
+	err = nla_put(nlskb, XDP_DIAG_UMEM, sizeof(du), &du);
+
+	if (!err && umem->fq)
+		err = xsk_diag_put_ring(xs->tx, XDP_DIAG_UMEM_FILL_RING, nlskb);
+	if (!err && umem->cq) {
+		err = xsk_diag_put_ring(xs->tx, XDP_DIAG_UMEM_COMPLETION_RING,
+					nlskb);
+	}
+	return err;
+}
+
+static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb,
+			 struct xdp_diag_req *req,
+			 struct user_namespace *user_ns,
+			 u32 portid, u32 seq, u32 flags, int sk_ino)
+{
+	struct xdp_sock *xs = xdp_sk(sk);
+	struct xdp_diag_msg *msg;
+	struct nlmsghdr *nlh;
+
+	nlh = nlmsg_put(nlskb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*msg),
+			flags);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	msg = nlmsg_data(nlh);
+	memset(msg, 0, sizeof(*msg));
+	msg->xdiag_family = AF_XDP;
+	msg->xdiag_type = sk->sk_type;
+	msg->xdiag_ino = sk_ino;
+	sock_diag_save_cookie(sk, msg->xdiag_cookie);
+
+	if ((req->xdiag_show & XDP_SHOW_INFO) && xsk_diag_put_info(xs, nlskb))
+		goto out_nlmsg_trim;
+
+	if ((req->xdiag_show & XDP_SHOW_INFO) &&
+	    nla_put_u32(nlskb, XDP_DIAG_UID,
+			from_kuid_munged(user_ns, sock_i_uid(sk))))
+		goto out_nlmsg_trim;
+
+	if ((req->xdiag_show & XDP_SHOW_RING_CFG) &&
+	    xsk_diag_put_rings_cfg(xs, nlskb))
+		goto out_nlmsg_trim;
+
+	if ((req->xdiag_show & XDP_SHOW_UMEM) &&
+	    xsk_diag_put_umem(xs, nlskb))
+		goto out_nlmsg_trim;
+
+	if ((req->xdiag_show & XDP_SHOW_MEMINFO) &&
+	    sock_diag_put_meminfo(sk, nlskb, XDP_DIAG_MEMINFO))
+		goto out_nlmsg_trim;
+
+	nlmsg_end(nlskb, nlh);
+	return 0;
+
+out_nlmsg_trim:
+	nlmsg_cancel(nlskb, nlh);
+	return -EMSGSIZE;
+}
+
+static int xsk_diag_dump(struct sk_buff *nlskb, struct netlink_callback *cb)
+{
+	struct xdp_diag_req *req = nlmsg_data(cb->nlh);
+	struct net *net = sock_net(nlskb->sk);
+	int num = 0, s_num = cb->args[0];
+	struct sock *sk;
+
+	mutex_lock(&net->xdp.lock);
+
+	sk_for_each(sk, &net->xdp.list) {
+		if (!net_eq(sock_net(sk), net))
+			continue;
+		if (num++ < s_num)
+			continue;
+
+		if (xsk_diag_fill(sk, nlskb, req,
+				  sk_user_ns(NETLINK_CB(cb->skb).sk),
+				  NETLINK_CB(cb->skb).portid,
+				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
+				  sock_i_ino(sk)) < 0) {
+			num--;
+			break;
+		}
+	}
+
+	mutex_unlock(&net->xdp.lock);
+	cb->args[0] = num;
+	return nlskb->len;
+}
+
+static int xsk_diag_handler_dump(struct sk_buff *nlskb, struct nlmsghdr *hdr)
+{
+	struct netlink_dump_control c = { .dump = xsk_diag_dump };
+	int hdrlen = sizeof(struct xdp_diag_req);
+	struct net *net = sock_net(nlskb->sk);
+
+	if (nlmsg_len(hdr) < hdrlen)
+		return -EINVAL;
+
+	if (!(hdr->nlmsg_flags & NLM_F_DUMP))
+		return -EOPNOTSUPP;
+
+	return netlink_dump_start(net->diag_nlsk, nlskb, hdr, &c);
+}
+
+static const struct sock_diag_handler xsk_diag_handler = {
+	.family = AF_XDP,
+	.dump = xsk_diag_handler_dump,
+};
+
+static int __init xsk_diag_init(void)
+{
+	return sock_diag_register(&xsk_diag_handler);
+}
+
+static void __exit xsk_diag_exit(void)
+{
+	sock_diag_unregister(&xsk_diag_handler);
+}
+
+module_init(xsk_diag_init);
+module_exit(xsk_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, AF_XDP);
-- 
cgit v1.2.3-59-g8ed1b


From ab4dfa20534e32e48de6b761b42d943518fb26f7 Mon Sep 17 00:00:00 2001
From: Veerendranath Jakkam <vjakkam@codeaurora.org>
Date: Wed, 19 Dec 2018 22:52:25 +0530
Subject: cfg80211: Allow drivers to advertise supported AKM suites

There was no such capability advertisement from the driver and thus the
current user space has to assume the driver to support all the AKMs. While
that may be the case with some drivers (e.g., mac80211-based ones), there
are cfg80211-based drivers that implement SME and have constraints on
which AKMs can be supported (e.g., such drivers may need an update to
support SAE AKM using NL80211_CMD_EXTERNAL_AUTH). Allow such drivers to
advertise the exact set of supported AKMs so that user space tools can
determine what network profile options should be allowed to be configured.

Signed-off-by: Veerendranath Jakkam <vjakkam@codeaurora.org>
[pmsr data might be big, start a new netlink message section]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 5 +++++
 include/uapi/linux/nl80211.h | 6 ++++++
 net/wireless/nl80211.c       | 9 +++++++++
 3 files changed, 20 insertions(+)

(limited to 'include/uapi')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 9c1d7979c200..b61ac6e9de08 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -4128,6 +4128,8 @@ struct cfg80211_pmsr_capabilities {
  * @signal_type: signal type reported in &struct cfg80211_bss.
  * @cipher_suites: supported cipher suites
  * @n_cipher_suites: number of supported cipher suites
+ * @akm_suites: supported AKM suites
+ * @n_akm_suites: number of supported AKM suites
  * @retry_short: Retry limit for short frames (dot11ShortRetryLimit)
  * @retry_long: Retry limit for long frames (dot11LongRetryLimit)
  * @frag_threshold: Fragmentation threshold (dot11FragmentationThreshold);
@@ -4326,6 +4328,9 @@ struct wiphy {
 	int n_cipher_suites;
 	const u32 *cipher_suites;
 
+	int n_akm_suites;
+	const u32 *akm_suites;
+
 	u8 retry_short;
 	u8 retry_long;
 	u32 frag_threshold;
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 426db4d8f71c..5f9d5cd458a1 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1565,6 +1565,12 @@ enum nl80211_commands {
  *	(a u32 with flags from &enum nl80211_wpa_versions).
  * @NL80211_ATTR_AKM_SUITES: Used with CONNECT, ASSOCIATE, and NEW_BEACON to
  *	indicate which key management algorithm(s) to use (an array of u32).
+ *	This attribute is also sent in response to @NL80211_CMD_GET_WIPHY,
+ *	indicating the supported AKM suites, intended for specific drivers which
+ *	implement SME and have constraints on which AKMs are supported and also
+ *	the cases where an AKM support is offloaded to the driver/firmware.
+ *	If there is no such notification from the driver, user space should
+ *	assume the driver supports all the AKM suites.
  *
  * @NL80211_ATTR_REQ_IE: (Re)association request information elements as
  *	sent out by the card, for ROAM and successful CONNECT events.
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index e5f9c9ceb6c9..eb4437fa0539 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -2279,6 +2279,15 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
 		if (nl80211_send_pmsr_capa(rdev, msg))
 			goto nla_put_failure;
 
+		state->split_start++;
+		break;
+	case 15:
+		if (rdev->wiphy.akm_suites &&
+		    nla_put(msg, NL80211_ATTR_AKM_SUITES,
+			    sizeof(u32) * rdev->wiphy.n_akm_suites,
+			    rdev->wiphy.akm_suites))
+			goto nla_put_failure;
+
 		/* done */
 		state->split_start = 0;
 		break;
-- 
cgit v1.2.3-59-g8ed1b


From 0d6040d4681735dfc47565de288525de405a5c99 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 31 Dec 2018 14:38:26 +0100
Subject: arch: add split IPC system calls where needed

The IPC system call handling is highly inconsistent across architectures,
some use sys_ipc, some use separate calls, and some use both.  We also
have some architectures that require passing IPC_64 in the flags, and
others that set it implicitly.

For the addition of a y2038 safe semtimedop() system call, I chose to only
support the separate entry points, but that requires first supporting
the regular ones with their own syscall numbers.

The IPC_64 is now implied by the new semctl/shmctl/msgctl system
calls even on the architectures that require passing it with the ipc()
multiplexer.

I'm not adding the new semtimedop() or semop() on 32-bit architectures,
those will get implemented using the new semtimedop_time64() version
that gets added along with the other time64 calls.
Three 64-bit architectures (powerpc, s390 and sparc) get semtimedop().

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 arch/m68k/kernel/syscalls/syscall.tbl     | 11 +++++++++++
 arch/mips/kernel/syscalls/syscall_o32.tbl | 11 +++++++++++
 arch/powerpc/kernel/syscalls/syscall.tbl  | 13 +++++++++++++
 arch/s390/kernel/syscalls/syscall.tbl     | 12 ++++++++++++
 arch/sh/kernel/syscalls/syscall.tbl       | 11 +++++++++++
 arch/sparc/kernel/syscalls/syscall.tbl    | 12 ++++++++++++
 arch/x86/entry/syscalls/syscall_32.tbl    | 11 +++++++++++
 arch/x86/entry/syscalls/syscall_64.tbl    |  2 ++
 include/uapi/asm-generic/unistd.h         |  1 +
 9 files changed, 84 insertions(+)

(limited to 'include/uapi')

diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 85779d6ef935..5354ba02eed2 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -388,3 +388,14 @@
 378	common	pwritev2			sys_pwritev2
 379	common	statx				sys_statx
 380	common	seccomp				sys_seccomp
+# room for arch specific calls
+393	common	semget				sys_semget
+394	common	semctl				sys_semctl
+395	common	shmget				sys_shmget
+396	common	shmctl				sys_shmctl
+397	common	shmat				sys_shmat
+398	common	shmdt				sys_shmdt
+399	common	msgget				sys_msgget
+400	common	msgsnd				sys_msgsnd
+401	common	msgrcv				sys_msgrcv
+402	common	msgctl				sys_msgctl
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 3d5a47b80d2b..fa47ea8cc6ef 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -380,3 +380,14 @@
 366	o32	statx				sys_statx
 367	o32	rseq				sys_rseq
 368	o32	io_pgetevents			sys_io_pgetevents		compat_sys_io_pgetevents
+# room for arch specific calls
+393	o32	semget				sys_semget
+394	o32	semctl				sys_semctl			compat_sys_semctl
+395	o32	shmget				sys_shmget
+396	o32	shmctl				sys_shmctl			compat_sys_shmctl
+397	o32	shmat				sys_shmat			compat_sys_shmat
+398	o32	shmdt				sys_shmdt
+399	o32	msgget				sys_msgget
+400	o32	msgsnd				sys_msgsnd			compat_sys_msgsnd
+401	o32	msgrcv				sys_msgrcv			compat_sys_msgrcv
+402	o32	msgctl				sys_msgctl			compat_sys_msgctl
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index db3bbb8744af..7555874ce39c 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -414,6 +414,7 @@
 363	spu	switch_endian			sys_ni_syscall
 364	common	userfaultfd			sys_userfaultfd
 365	common	membarrier			sys_membarrier
+# 366-377 originally left for IPC, now unused
 378	nospu	mlock2				sys_mlock2
 379	nospu	copy_file_range			sys_copy_file_range
 380	common	preadv2				sys_preadv2			compat_sys_preadv2
@@ -425,3 +426,15 @@
 386	nospu	pkey_mprotect			sys_pkey_mprotect
 387	nospu	rseq				sys_rseq
 388	nospu	io_pgetevents			sys_io_pgetevents		compat_sys_io_pgetevents
+# room for arch specific syscalls
+392	64	semtimedop			sys_semtimedop
+393	common	semget				sys_semget
+394	common	semctl				sys_semctl			compat_sys_semctl
+395	common	shmget				sys_shmget
+396	common	shmctl				sys_shmctl			compat_sys_shmctl
+397	common	shmat				sys_shmat			compat_sys_shmat
+398	common	shmdt				sys_shmdt
+399	common	msgget				sys_msgget
+400	common	msgsnd				sys_msgsnd			compat_sys_msgsnd
+401	common	msgrcv				sys_msgrcv			compat_sys_msgrcv
+402	common	msgctl				sys_msgctl			compat_sys_msgctl
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 7413fd318e2a..0bccb01c6202 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -391,3 +391,15 @@
 381  common	kexec_file_load		sys_kexec_file_load		sys_kexec_file_load
 382  common	io_pgetevents		sys_io_pgetevents		compat_sys_io_pgetevents
 383  common	rseq			sys_rseq			sys_rseq
+# room for arch specific syscalls
+392	64	semtimedop		sys_semtimedop			-
+393  common	semget			sys_semget			sys_semget
+394  common	semctl			sys_semctl			compat_sys_semctl
+395  common	shmget			sys_shmget			sys_shmget
+396  common	shmctl			sys_shmctl			compat_sys_shmctl
+397  common	shmat			sys_shmat			compat_sys_shmat
+398  common	shmdt			sys_shmdt 			sys_shmdt
+399  common	msgget			sys_msgget			sys_msgget
+400  common	msgsnd			sys_msgsnd			compat_sys_msgsnd
+401  common	msgrcv			sys_msgrcv			compat_sys_msgrcv
+402  common	msgctl			sys_msgctl			compat_sys_msgctl
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index a70db013dbc7..6d0b84e3ef2d 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -391,3 +391,14 @@
 381	common	preadv2				sys_preadv2
 382	common	pwritev2			sys_pwritev2
 383	common	statx				sys_statx
+# room for arch specific syscalls
+393	common	semget				sys_semget
+394	common	semctl				sys_semctl
+395	common	shmget				sys_shmget
+396	common	shmctl				sys_shmctl
+397	common	shmat				sys_shmat
+398	common	shmdt				sys_shmdt
+399	common	msgget				sys_msgget
+400	common	msgsnd				sys_msgsnd
+401	common	msgrcv				sys_msgrcv
+402	common	msgctl				sys_msgctl
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index c8c77c05ea97..8c9580302422 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -407,3 +407,15 @@
 359	common	pwritev2		sys_pwritev2			compat_sys_pwritev2
 360	common	statx			sys_statx
 361	common	io_pgetevents		sys_io_pgetevents		compat_sys_io_pgetevents
+# room for arch specific syscalls
+392	64	semtimedop			sys_semtimedop
+393	common	semget			sys_semget
+394	common	semctl			sys_semctl			compat_sys_semctl
+395	common	shmget			sys_shmget
+396	common	shmctl			sys_shmctl			compat_sys_shmctl
+397	common	shmat			sys_shmat			compat_sys_shmat
+398	common	shmdt			sys_shmdt
+399	common	msgget			sys_msgget
+400	common	msgsnd			sys_msgsnd			compat_sys_msgsnd
+401	common	msgrcv			sys_msgrcv			compat_sys_msgrcv
+402	common	msgctl			sys_msgctl			compat_sys_msgctl
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 3cf7b533b3d1..2be1d0eb7754 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -398,3 +398,14 @@
 384	i386	arch_prctl		sys_arch_prctl			__ia32_compat_sys_arch_prctl
 385	i386	io_pgetevents		sys_io_pgetevents		__ia32_compat_sys_io_pgetevents
 386	i386	rseq			sys_rseq			__ia32_sys_rseq
+# don't use numbers 387 through 392, add new calls at the end
+393	i386	semget			sys_semget    			__ia32_sys_semget
+394	i386	semctl			sys_semctl    			__ia32_compat_sys_semctl
+395	i386	shmget			sys_shmget    			__ia32_sys_shmget
+396	i386	shmctl			sys_shmctl    			__ia32_compat_sys_shmctl
+397	i386	shmat			sys_shmat     			__ia32_compat_sys_shmat
+398	i386	shmdt			sys_shmdt     			__ia32_sys_shmdt
+399	i386	msgget			sys_msgget    			__ia32_sys_msgget
+400	i386	msgsnd			sys_msgsnd    			__ia32_compat_sys_msgsnd
+401	i386	msgrcv			sys_msgrcv    			__ia32_compat_sys_msgrcv
+402	i386	msgctl			sys_msgctl    			__ia32_compat_sys_msgctl
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index f0b1709a5ffb..9d8128ef50a9 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -343,6 +343,8 @@
 332	common	statx			__x64_sys_statx
 333	common	io_pgetevents		__x64_sys_io_pgetevents
 334	common	rseq			__x64_sys_rseq
+# don't use numbers 387 through 423, add new calls after the last
+# 'common' entry
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index d90127298f12..509484dbfd5d 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -740,6 +740,7 @@ __SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents)
 __SYSCALL(__NR_rseq, sys_rseq)
 #define __NR_kexec_file_load 294
 __SYSCALL(__NR_kexec_file_load,     sys_kexec_file_load)
+/* 295 through 402 are unassigned to sync up with generic numbers, don't use */
 
 #undef __NR_syscalls
 #define __NR_syscalls 295
-- 
cgit v1.2.3-59-g8ed1b


From 9d9d4ff788845fad1626b80164e43a1f0f17ccbc Mon Sep 17 00:00:00 2001
From: Lijun Ou <oulijun@huawei.com>
Date: Wed, 23 Jan 2019 10:09:27 +0800
Subject: RDMA/hns: Update the kernel header file of hns

The hns_roce_ib_create_srq_resp is used to interact with the user for
data, this was open coded to use a u32 directly, instead use a properly
sized structure.

Fixes: c7bcb13442e1 ("RDMA/hns: Add SRQ support for hip08 kernel mode")
Signed-off-by: Lijun Ou <oulijun@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_srq.c | 10 ++++++++--
 include/uapi/rdma/hns-abi.h              |  5 +++++
 2 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c
index 960b1946c365..12deacf442cf 100644
--- a/drivers/infiniband/hw/hns/hns_roce_srq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_srq.c
@@ -210,6 +210,7 @@ struct ib_srq *hns_roce_create_srq(struct ib_pd *pd,
 				   struct ib_udata *udata)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(pd->device);
+	struct hns_roce_ib_create_srq_resp resp = {};
 	struct hns_roce_srq *srq;
 	int srq_desc_size;
 	int srq_buf_size;
@@ -378,16 +379,21 @@ struct ib_srq *hns_roce_create_srq(struct ib_pd *pd,
 
 	srq->event = hns_roce_ib_srq_event;
 	srq->ibsrq.ext.xrc.srq_num = srq->srqn;
+	resp.srqn = srq->srqn;
 
 	if (udata) {
-		if (ib_copy_to_udata(udata, &srq->srqn, sizeof(__u32))) {
+		if (ib_copy_to_udata(udata, &resp,
+				     min(udata->outlen, sizeof(resp)))) {
 			ret = -EFAULT;
-			goto err_wrid;
+			goto err_srqc_alloc;
 		}
 	}
 
 	return &srq->ibsrq;
 
+err_srqc_alloc:
+	hns_roce_srq_free(hr_dev, srq);
+
 err_wrid:
 	kvfree(srq->wrid);
 
diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h
index ef3c7ec793a7..eb76b38a00d4 100644
--- a/include/uapi/rdma/hns-abi.h
+++ b/include/uapi/rdma/hns-abi.h
@@ -52,6 +52,11 @@ struct hns_roce_ib_create_srq {
 	__aligned_u64 que_addr;
 };
 
+struct hns_roce_ib_create_srq_resp {
+	__u32	srqn;
+	__u32	reserved;
+};
+
 struct hns_roce_ib_create_qp {
 	__aligned_u64 buf_addr;
 	__aligned_u64 db_addr;
-- 
cgit v1.2.3-59-g8ed1b


From 30e5c2c6bf285d93dee4c45f23da95d7d50b125a Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Fri, 25 Jan 2019 10:53:23 -0800
Subject: net: Revert devlink health changes.

This reverts the devlink health changes from 9/17/2019,
Jiri wants things to be designed differently and it was
agreed that the easiest way to do this is start from the
beginning again.

Commits reverted:

cb5ccfbe73b389470e1dc11061bb185ef4bc9aec
880ee82f0313453ec5a6cb122866ac057263066b
c7af343b4e33578b7de91786a3f639c8cfa0d97b
ff253fedab961b22117a73ab808fcfa9e6852b50
6f9d56132eb6d2603d4273cfc65bed914ec47acb
fcd852c69d776c0f46c8f79e8e431e5cc6ddc7b7
8a66704a13d9713593342e29b4f0c19762f5746b
12bd0dcefe88782ac1c9fff632958dd1b71d27e5
aba25279c10094c5c97d09c3491ca86d00b4ad5e
ce019faa70f81555fa17ebc1d5a03651f2e7e15a
b8c45a033acc607201588f7665ba84207e5149e0

And the follow-on build fix:

o33a0efa4baecd689da9474ce0e8b673eb6931c60

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/devlink-health.txt        |   86 --
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |    2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |   18 +-
 .../net/ethernet/mellanox/mlx5/core/en/reporter.h  |   15 -
 .../ethernet/mellanox/mlx5/core/en/reporter_tx.c   |  356 -------
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  186 +++-
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c    |    2 +-
 include/net/devlink.h                              |  144 ---
 include/trace/events/devlink.h                     |   62 --
 include/uapi/linux/devlink.h                       |   25 -
 net/core/devlink.c                                 | 1058 --------------------
 11 files changed, 169 insertions(+), 1785 deletions(-)
 delete mode 100644 Documentation/networking/devlink-health.txt
 delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/reporter.h
 delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c

(limited to 'include/uapi')

diff --git a/Documentation/networking/devlink-health.txt b/Documentation/networking/devlink-health.txt
deleted file mode 100644
index 1db3fbea0831..000000000000
--- a/Documentation/networking/devlink-health.txt
+++ /dev/null
@@ -1,86 +0,0 @@
-The health mechanism is targeted for Real Time Alerting, in order to know when
-something bad had happened to a PCI device
-- Provide alert debug information
-- Self healing
-- If problem needs vendor support, provide a way to gather all needed debugging
-  information.
-
-The main idea is to unify and centralize driver health reports in the
-generic devlink instance and allow the user to set different
-attributes of the health reporting and recovery procedures.
-
-The devlink health reporter:
-Device driver creates a "health reporter" per each error/health type.
-Error/Health type can be a known/generic (eg pci error, fw error, rx/tx error)
-or unknown (driver specific).
-For each registered health reporter a driver can issue error/health reports
-asynchronously. All health reports handling is done by devlink.
-Device driver can provide specific callbacks for each "health reporter", e.g.
- - Recovery procedures
- - Diagnostics and object dump procedures
- - OOB initial parameters
-Different parts of the driver can register different types of health reporters
-with different handlers.
-
-Once an error is reported, devlink health will do the following actions:
-  * A log is being send to the kernel trace events buffer
-  * Health status and statistics are being updated for the reporter instance
-  * Object dump is being taken and saved at the reporter instance (as long as
-    there is no other dump which is already stored)
-  * Auto recovery attempt is being done. Depends on:
-    - Auto-recovery configuration
-    - Grace period vs. time passed since last recover
-
-The user interface:
-User can access/change each reporter's parameters and driver specific callbacks
-via devlink, e.g per error type (per health reporter)
- - Configure reporter's generic parameters (like: disable/enable auto recovery)
- - Invoke recovery procedure
- - Run diagnostics
- - Object dump
-
-The devlink health interface (via netlink):
-DEVLINK_CMD_HEALTH_REPORTER_GET
-  Retrieves status and configuration info per DEV and reporter.
-DEVLINK_CMD_HEALTH_REPORTER_SET
-  Allows reporter-related configuration setting.
-DEVLINK_CMD_HEALTH_REPORTER_RECOVER
-  Triggers a reporter's recovery procedure.
-DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE
-  Retrieves diagnostics data from a reporter on a device.
-DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET
-  Retrieves the last stored dump. Devlink health
-  saves a single dump. If an dump is not already stored by the devlink
-  for this reporter, devlink generates a new dump.
-  dump output is defined by the reporter.
-DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR
-  Clears the last saved dump file for the specified reporter.
-
-
-                                               netlink
-                                      +--------------------------+
-                                      |                          |
-                                      |            +             |
-                                      |            |             |
-                                      +--------------------------+
-                                                   |request for ops
-                                                   |(diagnose,
- mlx5_core                             devlink     |recover,
-                                                   |dump)
-+--------+                            +--------------------------+
-|        |                            |    reporter|             |
-|        |                            |  +---------v----------+  |
-|        |   ops execution            |  |                    |  |
-|     <----------------------------------+                    |  |
-|        |                            |  |                    |  |
-|        |                            |  + ^------------------+  |
-|        |                            |    | request for ops     |
-|        |                            |    | (recover, dump)     |
-|        |                            |    |                     |
-|        |                            |  +-+------------------+  |
-|        |     health report          |  | health handler     |  |
-|        +------------------------------->                    |  |
-|        |                            |  +--------------------+  |
-|        |     health reporter create |                          |
-|        +---------------------------->                          |
-+--------+                            +--------------------------+
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 6bb2a860b15b..9de9abacf7f6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -22,7 +22,7 @@ mlx5_core-y :=	main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
 #
 mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \
 		en_tx.o en_rx.o en_dim.o en_txrx.o en/xdp.o en_stats.o \
-		en_selftest.o en/port.o en/monitor_stats.o en/reporter_tx.o
+		en_selftest.o en/port.o en/monitor_stats.o
 
 #
 # Netdev extra
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 27e276c9bf84..8fa8fdd30b85 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -388,7 +388,10 @@ struct mlx5e_txqsq {
 	struct mlx5e_channel      *channel;
 	int                        txq_ix;
 	u32                        rate_limit;
-	struct work_struct         recover_work;
+	struct mlx5e_txqsq_recover {
+		struct work_struct         recover_work;
+		u64                        last_recover;
+	} recover;
 } ____cacheline_aligned_in_smp;
 
 struct mlx5e_dma_info {
@@ -679,13 +682,6 @@ struct mlx5e_rss_params {
 	u8	hfunc;
 };
 
-struct mlx5e_modify_sq_param {
-	int curr_state;
-	int next_state;
-	int rl_update;
-	int rl_index;
-};
-
 struct mlx5e_priv {
 	/* priv data path fields - start */
 	struct mlx5e_txqsq *txq2sq[MLX5E_MAX_NUM_CHANNELS * MLX5E_MAX_NUM_TC];
@@ -741,7 +737,6 @@ struct mlx5e_priv {
 #ifdef CONFIG_MLX5_EN_TLS
 	struct mlx5e_tls          *tls;
 #endif
-	struct devlink_health_reporter *tx_reporter;
 };
 
 struct mlx5e_profile {
@@ -871,11 +866,6 @@ void mlx5e_set_rq_type(struct mlx5_core_dev *mdev, struct mlx5e_params *params);
 void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev,
 			       struct mlx5e_params *params);
 
-int mlx5e_modify_sq(struct mlx5_core_dev *mdev, u32 sqn,
-		    struct mlx5e_modify_sq_param *p);
-void mlx5e_activate_txqsq(struct mlx5e_txqsq *sq);
-void mlx5e_tx_disable_queue(struct netdev_queue *txq);
-
 static inline bool mlx5e_tunnel_inner_ft_supported(struct mlx5_core_dev *mdev)
 {
 	return (MLX5_CAP_ETH(mdev, tunnel_stateless_gre) &&
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter.h b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter.h
deleted file mode 100644
index 2335c5b48820..000000000000
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
-/* Copyright (c) 2018 Mellanox Technologies. */
-
-#ifndef __MLX5E_EN_REPORTER_H
-#define __MLX5E_EN_REPORTER_H
-
-#include <linux/mlx5/driver.h>
-#include "en.h"
-
-int mlx5e_tx_reporter_create(struct mlx5e_priv *priv);
-void mlx5e_tx_reporter_destroy(struct mlx5e_priv *priv);
-void mlx5e_tx_reporter_err_cqe(struct mlx5e_txqsq *sq);
-void mlx5e_tx_reporter_timeout(struct mlx5e_txqsq *sq);
-
-#endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
deleted file mode 100644
index d9675afbb924..000000000000
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
+++ /dev/null
@@ -1,356 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
-/* Copyright (c) 2018 Mellanox Technologies. */
-
-#include <net/devlink.h>
-#include "reporter.h"
-#include "lib/eq.h"
-
-#define MLX5E_TX_REPORTER_PER_SQ_MAX_LEN 256
-
-struct mlx5e_tx_err_ctx {
-	int (*recover)(struct mlx5e_txqsq *sq);
-	struct mlx5e_txqsq *sq;
-};
-
-static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq)
-{
-	unsigned long exp_time = jiffies + msecs_to_jiffies(2000);
-
-	while (time_before(jiffies, exp_time)) {
-		if (sq->cc == sq->pc)
-			return 0;
-
-		msleep(20);
-	}
-
-	netdev_err(sq->channel->netdev,
-		   "Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n",
-		   sq->sqn, sq->cc, sq->pc);
-
-	return -ETIMEDOUT;
-}
-
-static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq)
-{
-	WARN_ONCE(sq->cc != sq->pc,
-		  "SQ 0x%x: cc (0x%x) != pc (0x%x)\n",
-		  sq->sqn, sq->cc, sq->pc);
-	sq->cc = 0;
-	sq->dma_fifo_cc = 0;
-	sq->pc = 0;
-}
-
-static int mlx5e_sq_to_ready(struct mlx5e_txqsq *sq, int curr_state)
-{
-	struct mlx5_core_dev *mdev = sq->channel->mdev;
-	struct net_device *dev = sq->channel->netdev;
-	struct mlx5e_modify_sq_param msp = {0};
-	int err;
-
-	msp.curr_state = curr_state;
-	msp.next_state = MLX5_SQC_STATE_RST;
-
-	err = mlx5e_modify_sq(mdev, sq->sqn, &msp);
-	if (err) {
-		netdev_err(dev, "Failed to move sq 0x%x to reset\n", sq->sqn);
-		return err;
-	}
-
-	memset(&msp, 0, sizeof(msp));
-	msp.curr_state = MLX5_SQC_STATE_RST;
-	msp.next_state = MLX5_SQC_STATE_RDY;
-
-	err = mlx5e_modify_sq(mdev, sq->sqn, &msp);
-	if (err) {
-		netdev_err(dev, "Failed to move sq 0x%x to ready\n", sq->sqn);
-		return err;
-	}
-
-	return 0;
-}
-
-static int mlx5e_tx_reporter_err_cqe_recover(struct mlx5e_txqsq *sq)
-{
-	struct mlx5_core_dev *mdev = sq->channel->mdev;
-	struct net_device *dev = sq->channel->netdev;
-	u8 state;
-	int err;
-
-	if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state))
-		return 0;
-
-	err = mlx5_core_query_sq_state(mdev, sq->sqn, &state);
-	if (err) {
-		netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n",
-			   sq->sqn, err);
-		return err;
-	}
-
-	if (state != MLX5_RQC_STATE_ERR) {
-		netdev_err(dev, "SQ 0x%x not in ERROR state\n", sq->sqn);
-		return -EINVAL;
-	}
-
-	mlx5e_tx_disable_queue(sq->txq);
-
-	err = mlx5e_wait_for_sq_flush(sq);
-	if (err)
-		return err;
-
-	/* At this point, no new packets will arrive from the stack as TXQ is
-	 * marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all
-	 * pending WQEs.  SQ can safely reset the SQ.
-	 */
-
-	err = mlx5e_sq_to_ready(sq, state);
-	if (err)
-		return err;
-
-	mlx5e_reset_txqsq_cc_pc(sq);
-	sq->stats->recover++;
-	mlx5e_activate_txqsq(sq);
-
-	return 0;
-}
-
-void mlx5e_tx_reporter_err_cqe(struct mlx5e_txqsq *sq)
-{
-	char err_str[MLX5E_TX_REPORTER_PER_SQ_MAX_LEN];
-	struct mlx5e_tx_err_ctx err_ctx = {0};
-
-	err_ctx.sq       = sq;
-	err_ctx.recover  = mlx5e_tx_reporter_err_cqe_recover;
-	sprintf(err_str, "ERR CQE on SQ: 0x%x", sq->sqn);
-
-	devlink_health_report(sq->channel->priv->tx_reporter, err_str,
-			      &err_ctx);
-}
-
-static int mlx5e_tx_reporter_timeout_recover(struct mlx5e_txqsq *sq)
-{
-	struct mlx5_eq_comp *eq = sq->cq.mcq.eq;
-	u32 eqe_count;
-
-	netdev_err(sq->channel->netdev, "EQ 0x%x: Cons = 0x%x, irqn = 0x%x\n",
-		   eq->core.eqn, eq->core.cons_index, eq->core.irqn);
-
-	eqe_count = mlx5_eq_poll_irq_disabled(eq);
-	if (!eqe_count) {
-		clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
-		return 1;
-	}
-
-	netdev_err(sq->channel->netdev, "Recover %d eqes on EQ 0x%x\n",
-		   eqe_count, eq->core.eqn);
-	sq->channel->stats->eq_rearm++;
-	return 0;
-}
-
-void mlx5e_tx_reporter_timeout(struct mlx5e_txqsq *sq)
-{
-	struct mlx5e_tx_err_ctx err_ctx;
-	char err_str[MLX5E_TX_REPORTER_PER_SQ_MAX_LEN];
-
-	err_ctx.sq       = sq;
-	err_ctx.recover  = mlx5e_tx_reporter_timeout_recover;
-	sprintf(err_str,
-		"TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u\n",
-		sq->channel->ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc,
-		jiffies_to_usecs(jiffies - sq->txq->trans_start));
-	devlink_health_report(sq->channel->priv->tx_reporter, err_str,
-			      &err_ctx);
-}
-
-/* state lock cannot be grabbed within this function.
- * It can cause a dead lock or a read-after-free.
- */
-int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_tx_err_ctx *err_ctx)
-{
-	return err_ctx->recover(err_ctx->sq);
-}
-
-static int mlx5e_tx_reporter_recover_all(struct mlx5e_priv *priv)
-{
-	int err;
-
-	mutex_lock(&priv->state_lock);
-	mlx5e_close_locked(priv->netdev);
-	err = mlx5e_open_locked(priv->netdev);
-	mutex_unlock(&priv->state_lock);
-
-	return err;
-}
-
-static int mlx5e_tx_reporter_recover(struct devlink_health_reporter *reporter,
-				     void *context)
-{
-	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
-	struct mlx5e_tx_err_ctx *err_ctx = context;
-
-	return err_ctx ? mlx5e_tx_reporter_recover_from_ctx(err_ctx) :
-			 mlx5e_tx_reporter_recover_all(priv);
-}
-
-static int
-mlx5e_tx_reporter_build_diagnose_output(struct devlink_health_buffer *buffer,
-					u32 sqn, u8 state, u8 stopped)
-{
-	int err, i;
-	int nest = 0;
-	char name[20];
-
-	err = devlink_health_buffer_nest_start(buffer,
-					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT);
-	if (err)
-		goto buffer_error;
-	nest++;
-
-	err = devlink_health_buffer_nest_start(buffer,
-					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR);
-	if (err)
-		goto buffer_error;
-	nest++;
-
-	sprintf(name, "SQ 0x%x", sqn);
-	err = devlink_health_buffer_put_object_name(buffer, name);
-	if (err)
-		goto buffer_error;
-
-	err = devlink_health_buffer_nest_start(buffer,
-					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE);
-	if (err)
-		goto buffer_error;
-	nest++;
-
-	err = devlink_health_buffer_nest_start(buffer,
-					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT);
-	if (err)
-		goto buffer_error;
-	nest++;
-
-	err = devlink_health_buffer_nest_start(buffer,
-					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR);
-	if (err)
-		goto buffer_error;
-	nest++;
-
-	err = devlink_health_buffer_put_object_name(buffer, "HW state");
-	if (err)
-		goto buffer_error;
-
-	err = devlink_health_buffer_nest_start(buffer,
-					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE);
-	if (err)
-		goto buffer_error;
-	nest++;
-
-	err = devlink_health_buffer_put_value_u8(buffer, state);
-	if (err)
-		goto buffer_error;
-
-	devlink_health_buffer_nest_end(buffer); /* DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE */
-	nest--;
-
-	devlink_health_buffer_nest_end(buffer); /* DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR */
-	nest--;
-
-	err = devlink_health_buffer_nest_start(buffer,
-					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR);
-	if (err)
-		goto buffer_error;
-	nest++;
-
-	err = devlink_health_buffer_put_object_name(buffer, "stopped");
-	if (err)
-		goto buffer_error;
-
-	err = devlink_health_buffer_nest_start(buffer,
-					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE);
-	if (err)
-		goto buffer_error;
-	nest++;
-
-	err = devlink_health_buffer_put_value_u8(buffer, stopped);
-	if (err)
-		goto buffer_error;
-
-	for (i = 0; i < nest; i++)
-		devlink_health_buffer_nest_end(buffer);
-
-	return 0;
-
-buffer_error:
-	for (i = 0; i < nest; i++)
-		devlink_health_buffer_nest_cancel(buffer);
-	return err;
-}
-
-static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter,
-				      struct devlink_health_buffer **buffers_array,
-				      unsigned int buffer_size,
-				      unsigned int num_buffers)
-{
-	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
-	unsigned int buff = 0;
-	int i = 0, err = 0;
-
-	if (buffer_size < MLX5E_TX_REPORTER_PER_SQ_MAX_LEN)
-		return -ENOMEM;
-
-	mutex_lock(&priv->state_lock);
-
-	if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
-		mutex_unlock(&priv->state_lock);
-		return 0;
-	}
-
-	while (i < priv->channels.num * priv->channels.params.num_tc) {
-		struct mlx5e_txqsq *sq = priv->txq2sq[i];
-		u8 state;
-
-		err = mlx5_core_query_sq_state(priv->mdev, sq->sqn, &state);
-		if (err)
-			break;
-
-		err = mlx5e_tx_reporter_build_diagnose_output(buffers_array[buff],
-							      sq->sqn, state,
-							      netif_xmit_stopped(sq->txq));
-		if (err) {
-			if (++buff == num_buffers)
-				break;
-		} else {
-			i++;
-		}
-	}
-
-	mutex_unlock(&priv->state_lock);
-	return err;
-}
-
-static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
-		.name = "TX",
-		.recover = mlx5e_tx_reporter_recover,
-		.diagnose_size = MLX5E_MAX_NUM_CHANNELS * MLX5E_MAX_NUM_TC *
-				 MLX5E_TX_REPORTER_PER_SQ_MAX_LEN,
-		.diagnose = mlx5e_tx_reporter_diagnose,
-		.dump_size = 0,
-		.dump = NULL,
-};
-
-#define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500
-int mlx5e_tx_reporter_create(struct mlx5e_priv *priv)
-{
-	struct mlx5_core_dev *mdev = priv->mdev;
-	struct devlink *devlink = priv_to_devlink(mdev);
-
-	priv->tx_reporter =
-		devlink_health_reporter_create(devlink, &mlx5_tx_reporter_ops,
-					       MLX5_REPORTER_TX_GRACEFUL_PERIOD,
-					       true, priv);
-	return PTR_ERR_OR_ZERO(priv->tx_reporter);
-}
-
-void mlx5e_tx_reporter_destroy(struct mlx5e_priv *priv)
-{
-	devlink_health_reporter_destroy(priv->tx_reporter);
-}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index dee0c8f3d4e9..8cfd2ec7c0a2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -51,7 +51,6 @@
 #include "en/xdp.h"
 #include "lib/eq.h"
 #include "en/monitor_stats.h"
-#include "en/reporter.h"
 
 struct mlx5e_rq_param {
 	u32			rqc[MLX5_ST_SZ_DW(rqc)];
@@ -1161,7 +1160,7 @@ static int mlx5e_alloc_txqsq_db(struct mlx5e_txqsq *sq, int numa)
 	return 0;
 }
 
-static void mlx5e_tx_err_cqe_work(struct work_struct *recover_work);
+static void mlx5e_sq_recover(struct work_struct *work);
 static int mlx5e_alloc_txqsq(struct mlx5e_channel *c,
 			     int txq_ix,
 			     struct mlx5e_params *params,
@@ -1183,7 +1182,7 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c,
 	sq->uar_map   = mdev->mlx5e_res.bfreg.map;
 	sq->min_inline_mode = params->tx_min_inline_mode;
 	sq->stats     = &c->priv->channel_stats[c->ix].sq[tc];
-	INIT_WORK(&sq->recover_work, mlx5e_tx_err_cqe_work);
+	INIT_WORK(&sq->recover.recover_work, mlx5e_sq_recover);
 	if (MLX5_IPSEC_DEV(c->priv->mdev))
 		set_bit(MLX5E_SQ_STATE_IPSEC, &sq->state);
 	if (mlx5_accel_is_tls_device(c->priv->mdev))
@@ -1271,8 +1270,15 @@ static int mlx5e_create_sq(struct mlx5_core_dev *mdev,
 	return err;
 }
 
-int mlx5e_modify_sq(struct mlx5_core_dev *mdev, u32 sqn,
-		    struct mlx5e_modify_sq_param *p)
+struct mlx5e_modify_sq_param {
+	int curr_state;
+	int next_state;
+	bool rl_update;
+	int rl_index;
+};
+
+static int mlx5e_modify_sq(struct mlx5_core_dev *mdev, u32 sqn,
+			   struct mlx5e_modify_sq_param *p)
 {
 	void *in;
 	void *sqc;
@@ -1370,7 +1376,17 @@ err_free_txqsq:
 	return err;
 }
 
-void mlx5e_activate_txqsq(struct mlx5e_txqsq *sq)
+static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq)
+{
+	WARN_ONCE(sq->cc != sq->pc,
+		  "SQ 0x%x: cc (0x%x) != pc (0x%x)\n",
+		  sq->sqn, sq->cc, sq->pc);
+	sq->cc = 0;
+	sq->dma_fifo_cc = 0;
+	sq->pc = 0;
+}
+
+static void mlx5e_activate_txqsq(struct mlx5e_txqsq *sq)
 {
 	sq->txq = netdev_get_tx_queue(sq->channel->netdev, sq->txq_ix);
 	clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
@@ -1379,7 +1395,7 @@ void mlx5e_activate_txqsq(struct mlx5e_txqsq *sq)
 	netif_tx_start_queue(sq->txq);
 }
 
-void mlx5e_tx_disable_queue(struct netdev_queue *txq)
+static inline void netif_tx_disable_queue(struct netdev_queue *txq)
 {
 	__netif_tx_lock_bh(txq);
 	netif_tx_stop_queue(txq);
@@ -1395,7 +1411,7 @@ static void mlx5e_deactivate_txqsq(struct mlx5e_txqsq *sq)
 	/* prevent netif_tx_wake_queue */
 	napi_synchronize(&c->napi);
 
-	mlx5e_tx_disable_queue(sq->txq);
+	netif_tx_disable_queue(sq->txq);
 
 	/* last doorbell out, godspeed .. */
 	if (mlx5e_wqc_has_room_for(wq, sq->cc, sq->pc, 1)) {
@@ -1415,7 +1431,6 @@ static void mlx5e_close_txqsq(struct mlx5e_txqsq *sq)
 	struct mlx5_rate_limit rl = {0};
 
 	cancel_work_sync(&sq->dim.work);
-	cancel_work_sync(&sq->recover_work);
 	mlx5e_destroy_sq(mdev, sq->sqn);
 	if (sq->rate_limit) {
 		rl.rate = sq->rate_limit;
@@ -1425,15 +1440,105 @@ static void mlx5e_close_txqsq(struct mlx5e_txqsq *sq)
 	mlx5e_free_txqsq(sq);
 }
 
-static void mlx5e_tx_err_cqe_work(struct work_struct *recover_work)
+static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq)
+{
+	unsigned long exp_time = jiffies + msecs_to_jiffies(2000);
+
+	while (time_before(jiffies, exp_time)) {
+		if (sq->cc == sq->pc)
+			return 0;
+
+		msleep(20);
+	}
+
+	netdev_err(sq->channel->netdev,
+		   "Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n",
+		   sq->sqn, sq->cc, sq->pc);
+
+	return -ETIMEDOUT;
+}
+
+static int mlx5e_sq_to_ready(struct mlx5e_txqsq *sq, int curr_state)
 {
-	struct mlx5e_txqsq *sq = container_of(recover_work, struct mlx5e_txqsq,
-					      recover_work);
+	struct mlx5_core_dev *mdev = sq->channel->mdev;
+	struct net_device *dev = sq->channel->netdev;
+	struct mlx5e_modify_sq_param msp = {0};
+	int err;
 
-	if (!sq->channel->priv->tx_reporter)
+	msp.curr_state = curr_state;
+	msp.next_state = MLX5_SQC_STATE_RST;
+
+	err = mlx5e_modify_sq(mdev, sq->sqn, &msp);
+	if (err) {
+		netdev_err(dev, "Failed to move sq 0x%x to reset\n", sq->sqn);
+		return err;
+	}
+
+	memset(&msp, 0, sizeof(msp));
+	msp.curr_state = MLX5_SQC_STATE_RST;
+	msp.next_state = MLX5_SQC_STATE_RDY;
+
+	err = mlx5e_modify_sq(mdev, sq->sqn, &msp);
+	if (err) {
+		netdev_err(dev, "Failed to move sq 0x%x to ready\n", sq->sqn);
+		return err;
+	}
+
+	return 0;
+}
+
+static void mlx5e_sq_recover(struct work_struct *work)
+{
+	struct mlx5e_txqsq_recover *recover =
+		container_of(work, struct mlx5e_txqsq_recover,
+			     recover_work);
+	struct mlx5e_txqsq *sq = container_of(recover, struct mlx5e_txqsq,
+					      recover);
+	struct mlx5_core_dev *mdev = sq->channel->mdev;
+	struct net_device *dev = sq->channel->netdev;
+	u8 state;
+	int err;
+
+	err = mlx5_core_query_sq_state(mdev, sq->sqn, &state);
+	if (err) {
+		netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n",
+			   sq->sqn, err);
+		return;
+	}
+
+	if (state != MLX5_RQC_STATE_ERR) {
+		netdev_err(dev, "SQ 0x%x not in ERROR state\n", sq->sqn);
+		return;
+	}
+
+	netif_tx_disable_queue(sq->txq);
+
+	if (mlx5e_wait_for_sq_flush(sq))
 		return;
 
-	mlx5e_tx_reporter_err_cqe(sq);
+	/* If the interval between two consecutive recovers per SQ is too
+	 * short, don't recover to avoid infinite loop of ERR_CQE -> recover.
+	 * If we reached this state, there is probably a bug that needs to be
+	 * fixed. let's keep the queue close and let tx timeout cleanup.
+	 */
+	if (jiffies_to_msecs(jiffies - recover->last_recover) <
+	    MLX5E_SQ_RECOVER_MIN_INTERVAL) {
+		netdev_err(dev, "Recover SQ 0x%x canceled, too many error CQEs\n",
+			   sq->sqn);
+		return;
+	}
+
+	/* At this point, no new packets will arrive from the stack as TXQ is
+	 * marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all
+	 * pending WQEs.  SQ can safely reset the SQ.
+	 */
+	if (mlx5e_sq_to_ready(sq, state))
+		return;
+
+	mlx5e_reset_txqsq_cc_pc(sq);
+	sq->stats->recover++;
+	recover->last_recover = jiffies;
+	mlx5e_activate_txqsq(sq);
 }
 
 static int mlx5e_open_icosq(struct mlx5e_channel *c,
@@ -3102,7 +3207,6 @@ static void mlx5e_cleanup_nic_tx(struct mlx5e_priv *priv)
 {
 	int tc;
 
-	mlx5e_tx_reporter_destroy(priv);
 	for (tc = 0; tc < priv->profile->max_tc; tc++)
 		mlx5e_destroy_tis(priv->mdev, priv->tisn[tc]);
 }
@@ -4074,14 +4178,31 @@ netdev_features_t mlx5e_features_check(struct sk_buff *skb,
 	return features;
 }
 
+static bool mlx5e_tx_timeout_eq_recover(struct net_device *dev,
+					struct mlx5e_txqsq *sq)
+{
+	struct mlx5_eq_comp *eq = sq->cq.mcq.eq;
+	u32 eqe_count;
+
+	netdev_err(dev, "EQ 0x%x: Cons = 0x%x, irqn = 0x%x\n",
+		   eq->core.eqn, eq->core.cons_index, eq->core.irqn);
+
+	eqe_count = mlx5_eq_poll_irq_disabled(eq);
+	if (!eqe_count)
+		return false;
+
+	netdev_err(dev, "Recover %d eqes on EQ 0x%x\n", eqe_count, eq->core.eqn);
+	sq->channel->stats->eq_rearm++;
+	return true;
+}
+
 static void mlx5e_tx_timeout_work(struct work_struct *work)
 {
 	struct mlx5e_priv *priv = container_of(work, struct mlx5e_priv,
 					       tx_timeout_work);
-	int i;
-
-	if (!priv->tx_reporter)
-		return;
+	struct net_device *dev = priv->netdev;
+	bool reopen_channels = false;
+	int i, err;
 
 	rtnl_lock();
 	mutex_lock(&priv->state_lock);
@@ -4090,16 +4211,36 @@ static void mlx5e_tx_timeout_work(struct work_struct *work)
 		goto unlock;
 
 	for (i = 0; i < priv->channels.num * priv->channels.params.num_tc; i++) {
-		struct netdev_queue *dev_queue =
-			netdev_get_tx_queue(priv->netdev, i);
+		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, i);
 		struct mlx5e_txqsq *sq = priv->txq2sq[i];
 
 		if (!netif_xmit_stopped(dev_queue))
 			continue;
 
-		mlx5e_tx_reporter_timeout(sq);
+		netdev_err(dev,
+			   "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u\n",
+			   i, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc,
+			   jiffies_to_usecs(jiffies - dev_queue->trans_start));
+
+		/* If we recover a lost interrupt, most likely TX timeout will
+		 * be resolved, skip reopening channels
+		 */
+		if (!mlx5e_tx_timeout_eq_recover(dev, sq)) {
+			clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
+			reopen_channels = true;
+		}
 	}
 
+	if (!reopen_channels)
+		goto unlock;
+
+	mlx5e_close_locked(dev);
+	err = mlx5e_open_locked(dev);
+	if (err)
+		netdev_err(priv->netdev,
+			   "mlx5e_open_locked failed recovering from a tx_timeout, err(%d).\n",
+			   err);
+
 unlock:
 	mutex_unlock(&priv->state_lock);
 	rtnl_unlock();
@@ -4767,7 +4908,6 @@ static int mlx5e_init_nic_tx(struct mlx5e_priv *priv)
 #ifdef CONFIG_MLX5_CORE_EN_DCB
 	mlx5e_dcbnl_initialize(priv);
 #endif
-	mlx5e_tx_reporter_create(priv);
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index a8e052a5ce36..598ad7e4d5c9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -514,7 +514,7 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget)
 				mlx5e_dump_error_cqe(sq,
 						     (struct mlx5_err_cqe *)cqe);
 				queue_work(cq->channel->priv->wq,
-					   &sq->recover_work);
+					   &sq->recover.recover_work);
 			}
 			stats->cqe_err++;
 		}
diff --git a/include/net/devlink.h b/include/net/devlink.h
index a81a1b7a67d7..67f4293bc970 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -30,7 +30,6 @@ struct devlink {
 	struct list_head param_list;
 	struct list_head region_list;
 	u32 snapshot_id;
-	struct list_head reporter_list;
 	struct devlink_dpipe_headers *dpipe_headers;
 	const struct devlink_ops *ops;
 	struct device *dev;
@@ -424,36 +423,6 @@ struct devlink_region;
 
 typedef void devlink_snapshot_data_dest_t(const void *data);
 
-struct devlink_health_buffer;
-struct devlink_health_reporter;
-
-/**
- * struct devlink_health_reporter_ops - Reporter operations
- * @name: reporter name
- * dump_size: dump buffer size allocated by the devlink
- * diagnose_size: diagnose buffer size allocated by the devlink
- * recover: callback to recover from reported error
- *          if priv_ctx is NULL, run a full recover
- * dump: callback to dump an object
- *       if priv_ctx is NULL, run a full dump
- * diagnose: callback to diagnose the current status
- */
-
-struct devlink_health_reporter_ops {
-	char *name;
-	unsigned int dump_size;
-	unsigned int diagnose_size;
-	int (*recover)(struct devlink_health_reporter *reporter,
-		       void *priv_ctx);
-	int (*dump)(struct devlink_health_reporter *reporter,
-		    struct devlink_health_buffer **buffers_array,
-		    unsigned int buffer_size, unsigned int num_buffers,
-		    void *priv_ctx);
-	int (*diagnose)(struct devlink_health_reporter *reporter,
-			struct devlink_health_buffer **buffers_array,
-			unsigned int buffer_size, unsigned int num_buffers);
-};
-
 struct devlink_ops {
 	int (*reload)(struct devlink *devlink, struct netlink_ext_ack *extack);
 	int (*port_type_set)(struct devlink_port *devlink_port,
@@ -615,34 +584,6 @@ int devlink_region_snapshot_create(struct devlink_region *region, u64 data_len,
 				   u8 *data, u32 snapshot_id,
 				   devlink_snapshot_data_dest_t *data_destructor);
 
-int devlink_health_buffer_nest_start(struct devlink_health_buffer *buffer,
-				     int attrtype);
-void devlink_health_buffer_nest_end(struct devlink_health_buffer *buffer);
-void devlink_health_buffer_nest_cancel(struct devlink_health_buffer *buffer);
-int devlink_health_buffer_put_object_name(struct devlink_health_buffer *buffer,
-					  char *name);
-int devlink_health_buffer_put_value_u8(struct devlink_health_buffer *buffer,
-				       u8 value);
-int devlink_health_buffer_put_value_u32(struct devlink_health_buffer *buffer,
-					u32 value);
-int devlink_health_buffer_put_value_u64(struct devlink_health_buffer *buffer,
-					u64 value);
-int devlink_health_buffer_put_value_string(struct devlink_health_buffer *buffer,
-					   char *name);
-int devlink_health_buffer_put_value_data(struct devlink_health_buffer *buffer,
-					 void *data, int len);
-struct devlink_health_reporter *
-devlink_health_reporter_create(struct devlink *devlink,
-			       const struct devlink_health_reporter_ops *ops,
-			       u64 graceful_period, bool auto_recover,
-			       void *priv);
-void
-devlink_health_reporter_destroy(struct devlink_health_reporter *reporter);
-
-void *
-devlink_health_reporter_priv(struct devlink_health_reporter *reporter);
-int devlink_health_report(struct devlink_health_reporter *reporter,
-			  const char *msg, void *priv_ctx);
 #else
 
 static inline struct devlink *devlink_alloc(const struct devlink_ops *ops,
@@ -903,91 +844,6 @@ devlink_region_snapshot_create(struct devlink_region *region, u64 data_len,
 	return 0;
 }
 
-static inline int
-devlink_health_buffer_nest_start(struct devlink_health_buffer *buffer,
-				 int attrtype)
-{
-	return 0;
-}
-
-static inline void
-devlink_health_buffer_nest_end(struct devlink_health_buffer *buffer)
-{
-}
-
-static inline void
-devlink_health_buffer_nest_cancel(struct devlink_health_buffer *buffer)
-{
-}
-
-static inline int
-devlink_health_buffer_put_object_name(struct devlink_health_buffer *buffer,
-				      char *name)
-{
-	return 0;
-}
-
-static inline int
-devlink_health_buffer_put_value_u8(struct devlink_health_buffer *buffer,
-				   u8 value)
-{
-	return 0;
-}
-
-static inline int
-devlink_health_buffer_put_value_u32(struct devlink_health_buffer *buffer,
-				    u32 value)
-{
-	return 0;
-}
-
-static inline int
-devlink_health_buffer_put_value_u64(struct devlink_health_buffer *buffer,
-				    u64 value)
-{
-	return 0;
-}
-
-static inline int
-devlink_health_buffer_put_value_string(struct devlink_health_buffer *buffer,
-				       char *name)
-{
-	return 0;
-}
-
-static inline int
-devlink_health_buffer_put_value_data(struct devlink_health_buffer *buffer,
-				     void *data, int len)
-{
-	return 0;
-}
-
-static inline struct devlink_health_reporter *
-devlink_health_reporter_create(struct devlink *devlink,
-			       const struct devlink_health_reporter_ops *ops,
-			       u64 graceful_period, bool auto_recover,
-			       void *priv)
-{
-	return NULL;
-}
-
-static inline void
-devlink_health_reporter_destroy(struct devlink_health_reporter *reporter)
-{
-}
-
-static inline void *
-devlink_health_reporter_priv(struct devlink_health_reporter *reporter)
-{
-	return NULL;
-}
-
-static inline int
-devlink_health_report(struct devlink_health_reporter *reporter,
-		      const char *msg, void *priv_ctx)
-{
-	return 0;
-}
 #endif
 
 #endif /* _NET_DEVLINK_H_ */
diff --git a/include/trace/events/devlink.h b/include/trace/events/devlink.h
index 7e39d2fc7c75..44acfbca1266 100644
--- a/include/trace/events/devlink.h
+++ b/include/trace/events/devlink.h
@@ -46,65 +46,6 @@ TRACE_EVENT(devlink_hwmsg,
 		  (int) __entry->len, __get_dynamic_array(buf), __entry->len)
 );
 
-TRACE_EVENT(devlink_health_report,
-	TP_PROTO(const struct devlink *devlink, const char *reporter_name,
-		 const char *msg),
-
-	TP_ARGS(devlink, reporter_name, msg),
-
-	TP_STRUCT__entry(
-		__string(bus_name, devlink->dev->bus->name)
-		__string(dev_name, dev_name(devlink->dev))
-		__string(driver_name, devlink->dev->driver->name)
-		__string(reporter_name, msg)
-		__string(msg, msg)
-	),
-
-	TP_fast_assign(
-		__assign_str(bus_name, devlink->dev->bus->name);
-		__assign_str(dev_name, dev_name(devlink->dev));
-		__assign_str(driver_name, devlink->dev->driver->name);
-		__assign_str(reporter_name, reporter_name);
-		__assign_str(msg, msg);
-	),
-
-	TP_printk("bus_name=%s dev_name=%s driver_name=%s reporter_name=%s: %s",
-		  __get_str(bus_name), __get_str(dev_name),
-		  __get_str(driver_name), __get_str(reporter_name),
-		  __get_str(msg))
-);
-
-TRACE_EVENT(devlink_health_recover_aborted,
-	TP_PROTO(const struct devlink *devlink, const char *reporter_name,
-		 bool health_state, u64 time_since_last_recover),
-
-	TP_ARGS(devlink, reporter_name, health_state, time_since_last_recover),
-
-	TP_STRUCT__entry(
-		__string(bus_name, devlink->dev->bus->name)
-		__string(dev_name, dev_name(devlink->dev))
-		__string(driver_name, devlink->dev->driver->name)
-		__string(reporter_name, reporter_name)
-		__field(bool, health_state)
-		__field(u64, time_since_last_recover)
-	),
-
-	TP_fast_assign(
-		__assign_str(bus_name, devlink->dev->bus->name);
-		__assign_str(dev_name, dev_name(devlink->dev));
-		__assign_str(driver_name, devlink->dev->driver->name);
-		__assign_str(reporter_name, reporter_name);
-		__entry->health_state = health_state;
-		__entry->time_since_last_recover = time_since_last_recover;
-	),
-
-	TP_printk("bus_name=%s dev_name=%s driver_name=%s reporter_name=%s: health_state=%d time_since_last_recover = %llu recover aborted",
-		  __get_str(bus_name), __get_str(dev_name),
-		  __get_str(driver_name), __get_str(reporter_name),
-		  __entry->health_state,
-		  __entry->time_since_last_recover)
-);
-
 #endif /* _TRACE_DEVLINK_H */
 
 /* This part must be outside protection */
@@ -123,9 +64,6 @@ static inline void trace_devlink_hwmsg(const struct devlink *devlink,
 {
 }
 
-static inline void trace_devlink_health(const char *msg)
-{
-}
 #endif /* _TRACE_DEVLINK_H */
 
 #endif
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 6b26bb2ce4dc..6e52d3660654 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -89,13 +89,6 @@ enum devlink_command {
 	DEVLINK_CMD_REGION_DEL,
 	DEVLINK_CMD_REGION_READ,
 
-	DEVLINK_CMD_HEALTH_REPORTER_GET,
-	DEVLINK_CMD_HEALTH_REPORTER_SET,
-	DEVLINK_CMD_HEALTH_REPORTER_RECOVER,
-	DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE,
-	DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET,
-	DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR,
-
 	/* add new commands above here */
 	__DEVLINK_CMD_MAX,
 	DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1
@@ -292,24 +285,6 @@ enum devlink_attr {
 	DEVLINK_ATTR_REGION_CHUNK_ADDR,         /* u64 */
 	DEVLINK_ATTR_REGION_CHUNK_LEN,          /* u64 */
 
-	DEVLINK_ATTR_HEALTH_BUFFER_OBJECT,		/* nested */
-	DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR,		/* nested */
-	DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_NAME,		/* string */
-	DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE,	/* nested */
-	DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_ARRAY,	/* nested */
-	DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE,	/* u8 */
-	DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA,	/* dynamic */
-
-	DEVLINK_ATTR_HEALTH_REPORTER,			/* nested */
-	DEVLINK_ATTR_HEALTH_REPORTER_NAME,		/* string */
-	DEVLINK_ATTR_HEALTH_REPORTER_STATE,		/* u8 */
-	DEVLINK_ATTR_HEALTH_REPORTER_ERR,		/* u64 */
-	DEVLINK_ATTR_HEALTH_REPORTER_RECOVER,		/* u64 */
-	DEVLINK_ATTR_HEALTH_REPORTER_DUMP_AVAIL,	/* u8 */
-	DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS,		/* u64 */
-	DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD,	/* u64 */
-	DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER,	/* u8 */
-
 	/* add new attributes above here, update the policy in devlink.c */
 
 	__DEVLINK_ATTR_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 24f266468ca5..abb0da9d7b4b 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -3597,1015 +3597,6 @@ out:
 	return 0;
 }
 
-#define DEVLINK_HEALTH_BUFFER_SIZE (4096 - GENL_HDRLEN)
-#define DEVLINK_HEALTH_BUFFER_DATA_SIZE (DEVLINK_HEALTH_BUFFER_SIZE / 2)
-#define DEVLINK_HEALTH_SIZE_TO_BUFFERS(size) DIV_ROUND_UP_ULL(size, DEVLINK_HEALTH_BUFFER_DATA_SIZE)
-#define DEVLINK_HEALTH_BUFFER_MAX_CHUNK 1024
-
-struct devlink_health_buffer {
-	void *data;
-	u64 offset;
-	u64 bytes_left;
-	u64 bytes_left_metadata;
-	u64 max_nested_depth;
-	u64 curr_nest;
-};
-
-struct devlink_health_buffer_desc {
-	int attrtype;
-	u16 len;
-	u8 nla_type;
-	u8 nest_end;
-	int value[0];
-};
-
-static void
-devlink_health_buffers_reset(struct devlink_health_buffer **buffers_list,
-			     u64 num_of_buffers)
-{
-	u64 i;
-
-	for (i = 0; i < num_of_buffers; i++) {
-		memset(buffers_list[i]->data, 0, DEVLINK_HEALTH_BUFFER_SIZE);
-		buffers_list[i]->offset = 0;
-		buffers_list[i]->bytes_left = DEVLINK_HEALTH_BUFFER_DATA_SIZE;
-		buffers_list[i]->bytes_left_metadata =
-			DEVLINK_HEALTH_BUFFER_DATA_SIZE;
-		buffers_list[i]->max_nested_depth = 0;
-		buffers_list[i]->curr_nest = 0;
-	}
-}
-
-static void
-devlink_health_buffers_destroy(struct devlink_health_buffer **buffers_list,
-			       u64 size);
-
-static struct devlink_health_buffer **
-devlink_health_buffers_create(u64 size)
-{
-	struct devlink_health_buffer **buffers_list;
-	u64 num_of_buffers = DEVLINK_HEALTH_SIZE_TO_BUFFERS(size);
-	u64 i;
-
-	buffers_list = kcalloc(num_of_buffers,
-			       sizeof(struct devlink_health_buffer *),
-			       GFP_KERNEL);
-	if (!buffers_list)
-		return NULL;
-
-	for (i = 0; i < num_of_buffers; i++) {
-		struct devlink_health_buffer *buffer;
-		void *data;
-
-		buffer = kzalloc(sizeof(*buffer), GFP_KERNEL);
-		data = kzalloc(DEVLINK_HEALTH_BUFFER_SIZE, GFP_KERNEL);
-		if (!buffer || !data) {
-			kfree(buffer);
-			kfree(data);
-			goto buffers_cleanup;
-		}
-		buffers_list[i] = buffer;
-		buffer->data = data;
-	}
-	devlink_health_buffers_reset(buffers_list, num_of_buffers);
-
-	return buffers_list;
-
-buffers_cleanup:
-	devlink_health_buffers_destroy(buffers_list, --i);
-	kfree(buffers_list);
-	return NULL;
-}
-
-static void
-devlink_health_buffers_destroy(struct devlink_health_buffer **buffers_list,
-			       u64 num_of_buffers)
-{
-	u64 i;
-
-	for (i = 0; i < num_of_buffers; i++) {
-		kfree(buffers_list[i]->data);
-		kfree(buffers_list[i]);
-	}
-}
-
-void
-devlink_health_buffer_offset_inc(struct devlink_health_buffer *buffer,
-				 int len)
-{
-	buffer->offset += len;
-}
-
-/* In order to store a nest, need two descriptors, for start and end */
-#define DEVLINK_HEALTH_BUFFER_NEST_SIZE (sizeof(struct devlink_health_buffer_desc) * 2)
-
-int devlink_health_buffer_verify_len(struct devlink_health_buffer *buffer,
-				     int len, int metadata_len)
-{
-	if (len > DEVLINK_HEALTH_BUFFER_DATA_SIZE)
-		return -EINVAL;
-
-	if (buffer->bytes_left < len ||
-	    buffer->bytes_left_metadata < metadata_len)
-		return -ENOMEM;
-
-	return 0;
-}
-
-static struct devlink_health_buffer_desc *
-devlink_health_buffer_get_desc_from_offset(struct devlink_health_buffer *buffer)
-{
-	return buffer->data + buffer->offset;
-}
-
-int
-devlink_health_buffer_nest_start(struct devlink_health_buffer *buffer,
-				 int attrtype)
-{
-	struct devlink_health_buffer_desc *desc;
-	int err;
-
-	err = devlink_health_buffer_verify_len(buffer, 0,
-					       DEVLINK_HEALTH_BUFFER_NEST_SIZE);
-	if (err)
-		return err;
-
-	if (attrtype != DEVLINK_ATTR_HEALTH_BUFFER_OBJECT &&
-	    attrtype != DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR &&
-	    attrtype != DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE &&
-	    attrtype != DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_ARRAY)
-		return -EINVAL;
-
-	desc = devlink_health_buffer_get_desc_from_offset(buffer);
-
-	desc->attrtype = attrtype;
-	buffer->bytes_left_metadata -= DEVLINK_HEALTH_BUFFER_NEST_SIZE;
-	devlink_health_buffer_offset_inc(buffer, sizeof(*desc));
-
-	buffer->curr_nest++;
-	buffer->max_nested_depth = max(buffer->max_nested_depth,
-				       buffer->curr_nest);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(devlink_health_buffer_nest_start);
-
-enum devlink_health_buffer_nest_end_cancel {
-	DEVLINK_HEALTH_BUFFER_NEST_END = 1,
-	DEVLINK_HEALTH_BUFFER_NEST_CANCEL,
-};
-
-static void
-devlink_health_buffer_nest_end_cancel(struct devlink_health_buffer *buffer,
-				      enum devlink_health_buffer_nest_end_cancel nest)
-{
-	struct devlink_health_buffer_desc *desc;
-
-	WARN_ON(!buffer->curr_nest);
-	buffer->curr_nest--;
-
-	desc = devlink_health_buffer_get_desc_from_offset(buffer);
-	desc->nest_end = nest;
-	devlink_health_buffer_offset_inc(buffer, sizeof(*desc));
-}
-
-void devlink_health_buffer_nest_end(struct devlink_health_buffer *buffer)
-{
-	devlink_health_buffer_nest_end_cancel(buffer,
-					      DEVLINK_HEALTH_BUFFER_NEST_END);
-}
-EXPORT_SYMBOL_GPL(devlink_health_buffer_nest_end);
-
-void devlink_health_buffer_nest_cancel(struct devlink_health_buffer *buffer)
-{
-	devlink_health_buffer_nest_end_cancel(buffer,
-					      DEVLINK_HEALTH_BUFFER_NEST_CANCEL);
-}
-EXPORT_SYMBOL_GPL(devlink_health_buffer_nest_cancel);
-
-int
-devlink_health_buffer_put_object_name(struct devlink_health_buffer *buffer,
-				      char *name)
-{
-	struct devlink_health_buffer_desc *desc;
-	int err;
-
-	err = devlink_health_buffer_verify_len(buffer, strlen(name) + 1,
-					       sizeof(*desc));
-	if (err)
-		return err;
-
-	desc = devlink_health_buffer_get_desc_from_offset(buffer);
-	desc->attrtype = DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_NAME;
-	desc->nla_type = NLA_NUL_STRING;
-	desc->len = strlen(name) + 1;
-	memcpy(&desc->value, name, desc->len);
-	devlink_health_buffer_offset_inc(buffer, sizeof(*desc) + desc->len);
-
-	buffer->bytes_left_metadata -= sizeof(*desc);
-	buffer->bytes_left -= (strlen(name) + 1);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(devlink_health_buffer_put_object_name);
-
-static int
-devlink_health_buffer_put_value(struct devlink_health_buffer *buffer,
-				u8 nla_type, void *value, int len)
-{
-	struct devlink_health_buffer_desc *desc;
-	int err;
-
-	err = devlink_health_buffer_verify_len(buffer, len, sizeof(*desc));
-	if (err)
-		return err;
-
-	desc = devlink_health_buffer_get_desc_from_offset(buffer);
-	desc->attrtype = DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA;
-	desc->nla_type = nla_type;
-	desc->len = len;
-	memcpy(&desc->value, value, len);
-	devlink_health_buffer_offset_inc(buffer, sizeof(*desc) + desc->len);
-
-	buffer->bytes_left_metadata -= sizeof(*desc);
-	buffer->bytes_left -= len;
-
-	return 0;
-}
-
-int
-devlink_health_buffer_put_value_u8(struct devlink_health_buffer *buffer,
-				   u8 value)
-{
-	int err;
-
-	err = devlink_health_buffer_put_value(buffer, NLA_U8, &value,
-					      sizeof(value));
-	if (err)
-		return err;
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(devlink_health_buffer_put_value_u8);
-
-int
-devlink_health_buffer_put_value_u32(struct devlink_health_buffer *buffer,
-				    u32 value)
-{
-	int err;
-
-	err = devlink_health_buffer_put_value(buffer, NLA_U32, &value,
-					      sizeof(value));
-	if (err)
-		return err;
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(devlink_health_buffer_put_value_u32);
-
-int
-devlink_health_buffer_put_value_u64(struct devlink_health_buffer *buffer,
-				    u64 value)
-{
-	int err;
-
-	err = devlink_health_buffer_put_value(buffer, NLA_U64, &value,
-					      sizeof(value));
-	if (err)
-		return err;
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(devlink_health_buffer_put_value_u64);
-
-int
-devlink_health_buffer_put_value_string(struct devlink_health_buffer *buffer,
-				       char *name)
-{
-	int err;
-
-	if (strlen(name) + 1 > DEVLINK_HEALTH_BUFFER_MAX_CHUNK)
-		return -EINVAL;
-
-	err = devlink_health_buffer_put_value(buffer, NLA_NUL_STRING, name,
-					      strlen(name) + 1);
-	if (err)
-		return err;
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(devlink_health_buffer_put_value_string);
-
-int
-devlink_health_buffer_put_value_data(struct devlink_health_buffer *buffer,
-				     void *data, int len)
-{
-	int err;
-
-	if (len > DEVLINK_HEALTH_BUFFER_MAX_CHUNK)
-		return -EINVAL;
-
-	err = devlink_health_buffer_put_value(buffer, NLA_BINARY, data, len);
-	if (err)
-		return err;
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(devlink_health_buffer_put_value_data);
-
-static int
-devlink_health_buffer_fill_data(struct sk_buff *skb,
-				struct devlink_health_buffer_desc *desc)
-{
-	int err = -EINVAL;
-
-	switch (desc->nla_type) {
-	case NLA_U8:
-		err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA,
-				 *(u8 *)desc->value);
-		break;
-	case NLA_U32:
-		err = nla_put_u32(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA,
-				  *(u32 *)desc->value);
-		break;
-	case NLA_U64:
-		err = nla_put_u64_64bit(skb,
-					DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA,
-					*(u64 *)desc->value, DEVLINK_ATTR_PAD);
-		break;
-	case NLA_NUL_STRING:
-		err = nla_put_string(skb,
-				     DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA,
-				     (char *)&desc->value);
-		break;
-	case NLA_BINARY:
-		err = nla_put(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA,
-			      desc->len, (void *)&desc->value);
-		break;
-	}
-
-	return err;
-}
-
-static int
-devlink_health_buffer_fill_type(struct sk_buff *skb,
-				struct devlink_health_buffer_desc *desc)
-{
-	int err = -EINVAL;
-
-	switch (desc->nla_type) {
-	case NLA_U8:
-		err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE,
-				 NLA_U8);
-		break;
-	case NLA_U32:
-		err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE,
-				 NLA_U32);
-		break;
-	case NLA_U64:
-		err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE,
-				 NLA_U64);
-		break;
-	case NLA_NUL_STRING:
-		err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE,
-				 NLA_NUL_STRING);
-		break;
-	case NLA_BINARY:
-		err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE,
-				 NLA_BINARY);
-		break;
-	}
-
-	return err;
-}
-
-static inline struct devlink_health_buffer_desc *
-devlink_health_buffer_get_next_desc(struct devlink_health_buffer_desc *desc)
-{
-	return (void *)&desc->value + desc->len;
-}
-
-static int
-devlink_health_buffer_prepare_skb(struct sk_buff *skb,
-				  struct devlink_health_buffer *buffer)
-{
-	struct devlink_health_buffer_desc *last_desc, *desc;
-	struct nlattr **buffer_nlattr;
-	int err;
-	int i = 0;
-
-	buffer_nlattr = kcalloc(buffer->max_nested_depth,
-				sizeof(*buffer_nlattr), GFP_KERNEL);
-	if (!buffer_nlattr)
-		return -EINVAL;
-
-	last_desc = devlink_health_buffer_get_desc_from_offset(buffer);
-	desc = buffer->data;
-	while (desc != last_desc) {
-		switch (desc->attrtype) {
-		case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT:
-		case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR:
-		case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE:
-		case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_ARRAY:
-			buffer_nlattr[i] = nla_nest_start(skb, desc->attrtype);
-			if (!buffer_nlattr[i])
-				goto nla_put_failure;
-			i++;
-			break;
-		case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA:
-			err = devlink_health_buffer_fill_data(skb, desc);
-			if (err)
-				goto nla_put_failure;
-			err = devlink_health_buffer_fill_type(skb, desc);
-			if (err)
-				goto nla_put_failure;
-			break;
-		case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_NAME:
-			err = nla_put_string(skb, desc->attrtype,
-					     (char *)&desc->value);
-			if (err)
-				goto nla_put_failure;
-			break;
-		default:
-			WARN_ON(!desc->nest_end);
-			WARN_ON(i <= 0);
-			if (desc->nest_end == DEVLINK_HEALTH_BUFFER_NEST_END)
-				nla_nest_end(skb, buffer_nlattr[--i]);
-			else
-				nla_nest_cancel(skb, buffer_nlattr[--i]);
-			break;
-		}
-		desc = devlink_health_buffer_get_next_desc(desc);
-	}
-
-	return 0;
-
-nla_put_failure:
-	kfree(buffer_nlattr);
-	return err;
-}
-
-static int
-devlink_health_buffer_snd(struct genl_info *info,
-			  enum devlink_command cmd, int flags,
-			  struct devlink_health_buffer **buffers_array,
-			  u64 num_of_buffers)
-{
-	struct sk_buff *skb;
-	struct nlmsghdr *nlh;
-	void *hdr;
-	int err;
-	u64 i;
-
-	for (i = 0; i < num_of_buffers; i++) {
-		/* Skip buffer if driver did not fill it up with any data */
-		if (!buffers_array[i]->offset)
-			continue;
-
-		skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
-		if (!skb)
-			return -ENOMEM;
-
-		hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
-				  &devlink_nl_family, NLM_F_MULTI, cmd);
-		if (!hdr)
-			goto nla_put_failure;
-
-		err = devlink_health_buffer_prepare_skb(skb, buffers_array[i]);
-		if (err)
-			goto nla_put_failure;
-
-		genlmsg_end(skb, hdr);
-		err = genlmsg_reply(skb, info);
-		if (err)
-			return err;
-	}
-
-	skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
-	if (!skb)
-		return -ENOMEM;
-	nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq,
-			NLMSG_DONE, 0, flags | NLM_F_MULTI);
-	if (!nlh)
-		goto nla_put_failure;
-
-	err = genlmsg_reply(skb, info);
-	if (err)
-		return err;
-
-	return 0;
-
-nla_put_failure:
-	err = -EIO;
-	nlmsg_free(skb);
-	return err;
-}
-
-struct devlink_health_reporter {
-	struct list_head list;
-	struct devlink_health_buffer **dump_buffers_array;
-	struct mutex dump_lock; /* lock parallel read/write from dump buffers */
-	struct devlink_health_buffer **diagnose_buffers_array;
-	struct mutex diagnose_lock; /* lock parallel read/write from diagnose buffers */
-	void *priv;
-	const struct devlink_health_reporter_ops *ops;
-	struct devlink *devlink;
-	u64 graceful_period;
-	bool auto_recover;
-	u8 health_state;
-	u8 dump_avail;
-	u64 dump_ts;
-	u64 error_count;
-	u64 recovery_count;
-	u64 last_recovery_ts;
-};
-
-enum devlink_health_reporter_state {
-	DEVLINK_HEALTH_REPORTER_STATE_HEALTHY,
-	DEVLINK_HEALTH_REPORTER_STATE_ERROR,
-};
-
-void *
-devlink_health_reporter_priv(struct devlink_health_reporter *reporter)
-{
-	return reporter->priv;
-}
-EXPORT_SYMBOL_GPL(devlink_health_reporter_priv);
-
-static struct devlink_health_reporter *
-devlink_health_reporter_find_by_name(struct devlink *devlink,
-				     const char *reporter_name)
-{
-	struct devlink_health_reporter *reporter;
-
-	list_for_each_entry(reporter, &devlink->reporter_list, list)
-		if (!strcmp(reporter->ops->name, reporter_name))
-			return reporter;
-	return NULL;
-}
-
-/**
- *	devlink_health_reporter_create - create devlink health reporter
- *
- *	@devlink: devlink
- *	@ops: ops
- *	@graceful_period: to avoid recovery loops, in msecs
- *	@auto_recover: auto recover when error occurs
- *	@priv: priv
- */
-struct devlink_health_reporter *
-devlink_health_reporter_create(struct devlink *devlink,
-			       const struct devlink_health_reporter_ops *ops,
-			       u64 graceful_period, bool auto_recover,
-			       void *priv)
-{
-	struct devlink_health_reporter *reporter;
-
-	mutex_lock(&devlink->lock);
-	if (devlink_health_reporter_find_by_name(devlink, ops->name)) {
-		reporter = ERR_PTR(-EEXIST);
-		goto unlock;
-	}
-
-	if (WARN_ON(ops->dump && !ops->dump_size) ||
-	    WARN_ON(ops->diagnose && !ops->diagnose_size) ||
-	    WARN_ON(auto_recover && !ops->recover) ||
-	    WARN_ON(graceful_period && !ops->recover)) {
-		reporter = ERR_PTR(-EINVAL);
-		goto unlock;
-	}
-
-	reporter = kzalloc(sizeof(*reporter), GFP_KERNEL);
-	if (!reporter) {
-		reporter = ERR_PTR(-ENOMEM);
-		goto unlock;
-	}
-
-	if (ops->dump) {
-		reporter->dump_buffers_array =
-			devlink_health_buffers_create(ops->dump_size);
-		if (!reporter->dump_buffers_array) {
-			kfree(reporter);
-			reporter = ERR_PTR(-ENOMEM);
-			goto unlock;
-		}
-	}
-
-	if (ops->diagnose) {
-		reporter->diagnose_buffers_array =
-			devlink_health_buffers_create(ops->diagnose_size);
-		if (!reporter->diagnose_buffers_array) {
-			devlink_health_buffers_destroy(reporter->dump_buffers_array,
-						       DEVLINK_HEALTH_SIZE_TO_BUFFERS(ops->dump_size));
-			kfree(reporter);
-			reporter = ERR_PTR(-ENOMEM);
-			goto unlock;
-		}
-	}
-
-	list_add_tail(&reporter->list, &devlink->reporter_list);
-	mutex_init(&reporter->dump_lock);
-	mutex_init(&reporter->diagnose_lock);
-
-	reporter->priv = priv;
-	reporter->ops = ops;
-	reporter->devlink = devlink;
-	reporter->graceful_period = graceful_period;
-	reporter->auto_recover = auto_recover;
-unlock:
-	mutex_unlock(&devlink->lock);
-	return reporter;
-}
-EXPORT_SYMBOL_GPL(devlink_health_reporter_create);
-
-/**
- *	devlink_health_reporter_destroy - destroy devlink health reporter
- *
- *	@reporter: devlink health reporter to destroy
- */
-void
-devlink_health_reporter_destroy(struct devlink_health_reporter *reporter)
-{
-	mutex_lock(&reporter->devlink->lock);
-	list_del(&reporter->list);
-	devlink_health_buffers_destroy(reporter->dump_buffers_array,
-				       DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->dump_size));
-	devlink_health_buffers_destroy(reporter->diagnose_buffers_array,
-				       DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->diagnose_size));
-	kfree(reporter);
-	mutex_unlock(&reporter->devlink->lock);
-}
-EXPORT_SYMBOL_GPL(devlink_health_reporter_destroy);
-
-static int
-devlink_health_reporter_recover(struct devlink_health_reporter *reporter,
-				void *priv_ctx)
-{
-	int err;
-
-	if (!reporter->ops->recover)
-		return -EOPNOTSUPP;
-
-	err = reporter->ops->recover(reporter, priv_ctx);
-	if (err)
-		return err;
-
-	reporter->recovery_count++;
-	reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_HEALTHY;
-	reporter->last_recovery_ts = jiffies;
-
-	return 0;
-}
-
-static int devlink_health_do_dump(struct devlink_health_reporter *reporter,
-				  void *priv_ctx)
-{
-	int err;
-
-	if (!reporter->ops->dump)
-		return 0;
-
-	if (reporter->dump_avail)
-		return 0;
-
-	devlink_health_buffers_reset(reporter->dump_buffers_array,
-				     DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->dump_size));
-	err = reporter->ops->dump(reporter, reporter->dump_buffers_array,
-				     DEVLINK_HEALTH_BUFFER_SIZE,
-				     DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->dump_size),
-				     priv_ctx);
-	if (!err) {
-		reporter->dump_avail = true;
-		reporter->dump_ts = jiffies;
-	}
-
-	return err;
-}
-
-int devlink_health_report(struct devlink_health_reporter *reporter,
-			  const char *msg, void *priv_ctx)
-{
-	struct devlink *devlink = reporter->devlink;
-	int err = 0;
-
-	/* write a log message of the current error */
-	WARN_ON(!msg);
-	trace_devlink_health_report(devlink, reporter->ops->name, msg);
-	reporter->error_count++;
-
-	/* abort if the previous error wasn't recovered */
-	if (reporter->auto_recover &&
-	    (reporter->health_state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY ||
-	     jiffies - reporter->last_recovery_ts <
-	     msecs_to_jiffies(reporter->graceful_period))) {
-		trace_devlink_health_recover_aborted(devlink,
-						     reporter->ops->name,
-						     reporter->health_state,
-						     jiffies -
-						     reporter->last_recovery_ts);
-		return -ECANCELED;
-	}
-
-	reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR;
-
-	mutex_lock(&reporter->dump_lock);
-	/* store current dump of current error, for later analysis */
-	devlink_health_do_dump(reporter, priv_ctx);
-	mutex_unlock(&reporter->dump_lock);
-
-	if (reporter->auto_recover)
-		err = devlink_health_reporter_recover(reporter, priv_ctx);
-
-	return err;
-}
-EXPORT_SYMBOL_GPL(devlink_health_report);
-
-static struct devlink_health_reporter *
-devlink_health_reporter_get_from_info(struct devlink *devlink,
-				      struct genl_info *info)
-{
-	char *reporter_name;
-
-	if (!info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME])
-		return NULL;
-
-	reporter_name =
-		nla_data(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]);
-	return devlink_health_reporter_find_by_name(devlink, reporter_name);
-}
-
-static int
-devlink_nl_health_reporter_fill(struct sk_buff *msg,
-				struct devlink *devlink,
-				struct devlink_health_reporter *reporter,
-				enum devlink_command cmd, u32 portid,
-				u32 seq, int flags)
-{
-	struct nlattr *reporter_attr;
-	void *hdr;
-
-	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
-	if (!hdr)
-		return -EMSGSIZE;
-
-	if (devlink_nl_put_handle(msg, devlink))
-		goto genlmsg_cancel;
-
-	reporter_attr = nla_nest_start(msg, DEVLINK_ATTR_HEALTH_REPORTER);
-	if (!reporter_attr)
-		goto genlmsg_cancel;
-	if (nla_put_string(msg, DEVLINK_ATTR_HEALTH_REPORTER_NAME,
-			   reporter->ops->name))
-		goto reporter_nest_cancel;
-	if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_STATE,
-		       reporter->health_state))
-		goto reporter_nest_cancel;
-	if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_ERR,
-			      reporter->error_count, DEVLINK_ATTR_PAD))
-		goto reporter_nest_cancel;
-	if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_RECOVER,
-			      reporter->recovery_count, DEVLINK_ATTR_PAD))
-		goto reporter_nest_cancel;
-	if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD,
-			      reporter->graceful_period,
-			      DEVLINK_ATTR_PAD))
-		goto reporter_nest_cancel;
-	if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER,
-		       reporter->auto_recover))
-		goto reporter_nest_cancel;
-	if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_AVAIL,
-		       reporter->dump_avail))
-		goto reporter_nest_cancel;
-	if (reporter->dump_avail &&
-	    nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS,
-			      jiffies_to_msecs(reporter->dump_ts),
-			      DEVLINK_ATTR_PAD))
-		goto reporter_nest_cancel;
-
-	nla_nest_end(msg, reporter_attr);
-	genlmsg_end(msg, hdr);
-	return 0;
-
-reporter_nest_cancel:
-	nla_nest_end(msg, reporter_attr);
-genlmsg_cancel:
-	genlmsg_cancel(msg, hdr);
-	return -EMSGSIZE;
-}
-
-static int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb,
-						   struct genl_info *info)
-{
-	struct devlink *devlink = info->user_ptr[0];
-	struct devlink_health_reporter *reporter;
-	struct sk_buff *msg;
-	int err;
-
-	reporter = devlink_health_reporter_get_from_info(devlink, info);
-	if (!reporter)
-		return -EINVAL;
-
-	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-	if (!msg)
-		return -ENOMEM;
-
-	err = devlink_nl_health_reporter_fill(msg, devlink, reporter,
-					      DEVLINK_CMD_HEALTH_REPORTER_GET,
-					      info->snd_portid, info->snd_seq,
-					      0);
-	if (err) {
-		nlmsg_free(msg);
-		return err;
-	}
-
-	return genlmsg_reply(msg, info);
-}
-
-static int
-devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg,
-					  struct netlink_callback *cb)
-{
-	struct devlink_health_reporter *reporter;
-	struct devlink *devlink;
-	int start = cb->args[0];
-	int idx = 0;
-	int err;
-
-	mutex_lock(&devlink_mutex);
-	list_for_each_entry(devlink, &devlink_list, list) {
-		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
-			continue;
-		mutex_lock(&devlink->lock);
-		list_for_each_entry(reporter, &devlink->reporter_list,
-				    list) {
-			if (idx < start) {
-				idx++;
-				continue;
-			}
-			err = devlink_nl_health_reporter_fill(msg, devlink,
-							      reporter,
-							      DEVLINK_CMD_HEALTH_REPORTER_GET,
-							      NETLINK_CB(cb->skb).portid,
-							      cb->nlh->nlmsg_seq,
-							      NLM_F_MULTI);
-			if (err) {
-				mutex_unlock(&devlink->lock);
-				goto out;
-			}
-			idx++;
-		}
-		mutex_unlock(&devlink->lock);
-	}
-out:
-	mutex_unlock(&devlink_mutex);
-
-	cb->args[0] = idx;
-	return msg->len;
-}
-
-static int
-devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb,
-					struct genl_info *info)
-{
-	struct devlink *devlink = info->user_ptr[0];
-	struct devlink_health_reporter *reporter;
-
-	reporter = devlink_health_reporter_get_from_info(devlink, info);
-	if (!reporter)
-		return -EINVAL;
-
-	if (!reporter->ops->recover &&
-	    (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] ||
-	     info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]))
-		return -EINVAL;
-
-	if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD])
-		reporter->graceful_period =
-			nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]);
-
-	if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])
-		reporter->auto_recover =
-			nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]);
-
-	return 0;
-}
-
-static int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb,
-						       struct genl_info *info)
-{
-	struct devlink *devlink = info->user_ptr[0];
-	struct devlink_health_reporter *reporter;
-
-	reporter = devlink_health_reporter_get_from_info(devlink, info);
-	if (!reporter)
-		return -EINVAL;
-
-	return devlink_health_reporter_recover(reporter, NULL);
-}
-
-static int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb,
-							struct genl_info *info)
-{
-	struct devlink *devlink = info->user_ptr[0];
-	struct devlink_health_reporter *reporter;
-	u64 num_of_buffers;
-	int err;
-
-	reporter = devlink_health_reporter_get_from_info(devlink, info);
-	if (!reporter)
-		return -EINVAL;
-
-	if (!reporter->ops->diagnose)
-		return -EOPNOTSUPP;
-
-	num_of_buffers =
-		DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->diagnose_size);
-
-	mutex_lock(&reporter->diagnose_lock);
-	devlink_health_buffers_reset(reporter->diagnose_buffers_array,
-				     num_of_buffers);
-
-	err = reporter->ops->diagnose(reporter,
-				      reporter->diagnose_buffers_array,
-				      DEVLINK_HEALTH_BUFFER_SIZE,
-				      num_of_buffers);
-	if (err)
-		goto out;
-
-	err = devlink_health_buffer_snd(info,
-					DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE,
-					0, reporter->diagnose_buffers_array,
-					num_of_buffers);
-	if (err)
-		goto out;
-
-	mutex_unlock(&reporter->diagnose_lock);
-	return 0;
-
-out:
-	mutex_unlock(&reporter->diagnose_lock);
-	return err;
-}
-
-static void
-devlink_health_dump_clear(struct devlink_health_reporter *reporter)
-{
-	reporter->dump_avail = false;
-	reporter->dump_ts = 0;
-	devlink_health_buffers_reset(reporter->dump_buffers_array,
-				     DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->dump_size));
-}
-
-static int devlink_nl_cmd_health_reporter_dump_get_doit(struct sk_buff *skb,
-							struct genl_info *info)
-{
-	struct devlink *devlink = info->user_ptr[0];
-	struct devlink_health_reporter *reporter;
-	u64 num_of_buffers;
-	int err;
-
-	reporter = devlink_health_reporter_get_from_info(devlink, info);
-	if (!reporter)
-		return -EINVAL;
-
-	if (!reporter->ops->dump)
-		return -EOPNOTSUPP;
-
-	num_of_buffers =
-		DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->dump_size);
-
-	mutex_lock(&reporter->dump_lock);
-	err = devlink_health_do_dump(reporter, NULL);
-	if (err)
-		goto out;
-
-	err = devlink_health_buffer_snd(info,
-					DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET,
-					0, reporter->dump_buffers_array,
-					num_of_buffers);
-
-out:
-	mutex_unlock(&reporter->dump_lock);
-	return err;
-}
-
-static int
-devlink_nl_cmd_health_reporter_dump_clear_doit(struct sk_buff *skb,
-					       struct genl_info *info)
-{
-	struct devlink *devlink = info->user_ptr[0];
-	struct devlink_health_reporter *reporter;
-
-	reporter = devlink_health_reporter_get_from_info(devlink, info);
-	if (!reporter)
-		return -EINVAL;
-
-	mutex_lock(&reporter->dump_lock);
-	devlink_health_dump_clear(reporter);
-	mutex_unlock(&reporter->dump_lock);
-	return 0;
-}
-
 static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
@@ -4631,9 +3622,6 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_PARAM_VALUE_CMODE] = { .type = NLA_U8 },
 	[DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32 },
-	[DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING },
-	[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64 },
-	[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 },
 };
 
 static const struct genl_ops devlink_nl_ops[] = {
@@ -4854,51 +3842,6 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
 	},
-	{
-		.cmd = DEVLINK_CMD_HEALTH_REPORTER_GET,
-		.doit = devlink_nl_cmd_health_reporter_get_doit,
-		.dumpit = devlink_nl_cmd_health_reporter_get_dumpit,
-		.policy = devlink_nl_policy,
-		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
-		/* can be retrieved by unprivileged users */
-	},
-	{
-		.cmd = DEVLINK_CMD_HEALTH_REPORTER_SET,
-		.doit = devlink_nl_cmd_health_reporter_set_doit,
-		.policy = devlink_nl_policy,
-		.flags = GENL_ADMIN_PERM,
-		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
-	},
-	{
-		.cmd = DEVLINK_CMD_HEALTH_REPORTER_RECOVER,
-		.doit = devlink_nl_cmd_health_reporter_recover_doit,
-		.policy = devlink_nl_policy,
-		.flags = GENL_ADMIN_PERM,
-		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
-	},
-	{
-		.cmd = DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE,
-		.doit = devlink_nl_cmd_health_reporter_diagnose_doit,
-		.policy = devlink_nl_policy,
-		.flags = GENL_ADMIN_PERM,
-		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
-	},
-	{
-		.cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET,
-		.doit = devlink_nl_cmd_health_reporter_dump_get_doit,
-		.policy = devlink_nl_policy,
-		.flags = GENL_ADMIN_PERM,
-		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
-				  DEVLINK_NL_FLAG_NO_LOCK,
-	},
-	{
-		.cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR,
-		.doit = devlink_nl_cmd_health_reporter_dump_clear_doit,
-		.policy = devlink_nl_policy,
-		.flags = GENL_ADMIN_PERM,
-		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
-				  DEVLINK_NL_FLAG_NO_LOCK,
-	},
 };
 
 static struct genl_family devlink_nl_family __ro_after_init = {
@@ -4939,7 +3882,6 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
 	INIT_LIST_HEAD(&devlink->resource_list);
 	INIT_LIST_HEAD(&devlink->param_list);
 	INIT_LIST_HEAD(&devlink->region_list);
-	INIT_LIST_HEAD(&devlink->reporter_list);
 	mutex_init(&devlink->lock);
 	return devlink;
 }
-- 
cgit v1.2.3-59-g8ed1b


From fe4943702c850fa07f963eaa6f1530d9d4c2da78 Mon Sep 17 00:00:00 2001
From: Srinivas Dasari <dasaris@codeaurora.org>
Date: Wed, 23 Jan 2019 18:06:56 +0530
Subject: cfg80211: Authentication offload to user space in AP mode

commit 40cbfa90218b ("cfg80211/nl80211: Optional authentication
offload to userspace")' introduced authentication offload to user
space by the host drivers in station mode. This commit extends
the same for the AP mode too.

Extend NL80211_ATTR_EXTERNAL_AUTH_SUPPORT to also claim the
support of external authentication from the user space in AP mode.
A new flag parameter is introduced in cfg80211_ap_settings to
intend the same while "start ap".

Host driver to use NL80211_CMD_FRAME interface to transmit and
receive the authentication frames to / from the user space.

Host driver to indicate the flag NL80211_RXMGMT_FLAG_EXTERNAL_AUTH
while sending the authentication frame to the user space. This
intends to the user space that the driver wishes it to process
the authentication frame for certain protocols, though it had
initially advertised the support for SME functionality.

User space shall accordingly do the authentication and indicate
its final status through the command NL80211_CMD_EXTERNAL_AUTH.
Allow the command even if userspace doesn't include the attribute
NL80211_ATTR_SSID for AP interface.

Host driver shall continue with the association sequence and
indicate the STA connection status through cfg80211_new_sta.

To facilitate the host drivers in AP mode for matching the pmkid
by the stations during the association, NL80211_CMD_EXTERNAL_AUTH
is also enhanced to include the pmkid to drivers after
the authentication.
This pmkid can also be used in the STA mode to include in the
association request.

Also modify nl80211_external_auth to not mandate SSID in AP mode.

Signed-off-by: Srinivas Dasari <dasaris@codeaurora.org>
[remove useless nla_get_flag() usage]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 15 +++++++++++++++
 include/uapi/linux/nl80211.h | 13 +++++++++----
 net/wireless/nl80211.c       | 25 ++++++++++++++++++-------
 3 files changed, 42 insertions(+), 11 deletions(-)

(limited to 'include/uapi')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index b61ac6e9de08..7033c90850b0 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -835,6 +835,17 @@ struct cfg80211_bitrate_mask {
 	} control[NUM_NL80211_BANDS];
 };
 
+/**
+ * enum cfg80211_ap_settings_flags - AP settings flags
+ *
+ * Used by cfg80211_ap_settings
+ *
+ * @AP_SETTINGS_EXTERNAL_AUTH_SUPPORT: AP supports external authentication
+ */
+enum cfg80211_ap_settings_flags {
+	AP_SETTINGS_EXTERNAL_AUTH_SUPPORT = BIT(0),
+};
+
 /**
  * struct cfg80211_ap_settings - AP configuration
  *
@@ -865,6 +876,7 @@ struct cfg80211_bitrate_mask {
  * @he_cap: HE capabilities (or %NULL if HE isn't enabled)
  * @ht_required: stations must support HT
  * @vht_required: stations must support VHT
+ * @flags: flags, as defined in enum cfg80211_ap_settings_flags
  */
 struct cfg80211_ap_settings {
 	struct cfg80211_chan_def chandef;
@@ -890,6 +902,7 @@ struct cfg80211_ap_settings {
 	const struct ieee80211_vht_cap *vht_cap;
 	const struct ieee80211_he_cap_elem *he_cap;
 	bool ht_required, vht_required;
+	u32 flags;
 };
 
 /**
@@ -2831,6 +2844,7 @@ struct cfg80211_pmk_conf {
  *	use %WLAN_STATUS_UNSPECIFIED_FAILURE if user space cannot give you
  *	the real status code for failures. Used only for the authentication
  *	response command interface (user space to driver).
+ * @pmkid: The identifier to refer a PMKSA.
  */
 struct cfg80211_external_auth_params {
 	enum nl80211_external_auth_action action;
@@ -2838,6 +2852,7 @@ struct cfg80211_external_auth_params {
 	struct cfg80211_ssid ssid;
 	unsigned int key_mgmt_suite;
 	u16 status;
+	const u8 *pmkid;
 };
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 5f9d5cd458a1..8b0fdb9e133b 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -2266,10 +2266,10 @@ enum nl80211_commands {
  *     &enum nl80211_external_auth_action value). This is used with the
  *     %NL80211_CMD_EXTERNAL_AUTH request event.
  * @NL80211_ATTR_EXTERNAL_AUTH_SUPPORT: Flag attribute indicating that the user
- *     space supports external authentication. This attribute shall be used
- *     only with %NL80211_CMD_CONNECT request. The driver may offload
- *     authentication processing to user space if this capability is indicated
- *     in NL80211_CMD_CONNECT requests from the user space.
+ *	space supports external authentication. This attribute shall be used
+ *	with %NL80211_CMD_CONNECT and %NL80211_CMD_START_AP request. The driver
+ *	may offload authentication processing to user space if this capability
+ *	is indicated in the respective requests from the user space.
  *
  * @NL80211_ATTR_NSS: Station's New/updated  RX_NSS value notified using this
  *	u8 attribute. This is used with %NL80211_CMD_STA_OPMODE_CHANGED.
@@ -5631,9 +5631,14 @@ enum nl80211_crit_proto_id {
  * Used by cfg80211_rx_mgmt()
  *
  * @NL80211_RXMGMT_FLAG_ANSWERED: frame was answered by device/driver.
+ * @NL80211_RXMGMT_FLAG_EXTERNAL_AUTH: Host driver intends to offload
+ *	the authentication. Exclusively defined for host drivers that
+ *	advertises the SME functionality but would like the userspace
+ *	to handle certain authentication algorithms (e.g. SAE).
  */
 enum nl80211_rxmgmt_flags {
 	NL80211_RXMGMT_FLAG_ANSWERED = 1 << 0,
+	NL80211_RXMGMT_FLAG_EXTERNAL_AUTH = 1 << 1,
 };
 
 /*
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index eb4437fa0539..dc96077afe5e 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -4550,6 +4550,9 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 
 	nl80211_calculate_ap_params(&params);
 
+	if (info->attrs[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT])
+		params.flags |= AP_SETTINGS_EXTERNAL_AUTH_SUPPORT;
+
 	wdev_lock(wdev);
 	err = rdev_start_ap(rdev, dev, &params);
 	if (!err) {
@@ -13086,7 +13089,9 @@ static int nl80211_external_auth(struct sk_buff *skb, struct genl_info *info)
 	if (!rdev->ops->external_auth)
 		return -EOPNOTSUPP;
 
-	if (!info->attrs[NL80211_ATTR_SSID])
+	if (!info->attrs[NL80211_ATTR_SSID] &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
 		return -EINVAL;
 
 	if (!info->attrs[NL80211_ATTR_BSSID])
@@ -13097,18 +13102,24 @@ static int nl80211_external_auth(struct sk_buff *skb, struct genl_info *info)
 
 	memset(&params, 0, sizeof(params));
 
-	params.ssid.ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]);
-	if (params.ssid.ssid_len == 0 ||
-	    params.ssid.ssid_len > IEEE80211_MAX_SSID_LEN)
-		return -EINVAL;
-	memcpy(params.ssid.ssid, nla_data(info->attrs[NL80211_ATTR_SSID]),
-	       params.ssid.ssid_len);
+	if (info->attrs[NL80211_ATTR_SSID]) {
+		params.ssid.ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]);
+		if (params.ssid.ssid_len == 0 ||
+		    params.ssid.ssid_len > IEEE80211_MAX_SSID_LEN)
+			return -EINVAL;
+		memcpy(params.ssid.ssid,
+		       nla_data(info->attrs[NL80211_ATTR_SSID]),
+		       params.ssid.ssid_len);
+	}
 
 	memcpy(params.bssid, nla_data(info->attrs[NL80211_ATTR_BSSID]),
 	       ETH_ALEN);
 
 	params.status = nla_get_u16(info->attrs[NL80211_ATTR_STATUS_CODE]);
 
+	if (info->attrs[NL80211_ATTR_PMKID])
+		params.pmkid = nla_data(info->attrs[NL80211_ATTR_PMKID]);
+
 	return rdev_external_auth(rdev, dev, &params);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 6c900360e7c0df6a4846ac97d7b548d72cd801b0 Mon Sep 17 00:00:00 2001
From: Liangwei Dong <liangwei@codeaurora.org>
Date: Fri, 18 Jan 2019 16:54:38 +0530
Subject: nl80211: Allow set/del pmksa operations for AP

Host drivers may offload authentication to the user space
through the commit ("cfg80211: Authentication offload to
user space in AP mode").

This interface can be used to implement SAE by having the
userspace do authentication/PMKID key derivation and driver
handle the association.

A step ahead, this interface can get further optimized if the
PMKID is passed to the host driver and also have it respond to
the association request by the STA on a valid PMKID.

This commit enables the userspace to pass the PMKID to the host
drivers through the set/del pmksa operations in AP mode.

Set/Del pmksa is now restricted to STA/P2P client mode only and
thus the drivers might not expect them in any other(AP) mode.

This commit also introduces a feature flag
NL80211_EXT_FEATURE_AP_PMKSA_CACHING (johannes: renamed) to
maintain the backward compatibility of such an expectation by
the host drivers. These operations are allowed in AP mode only
when the drivers advertize the capability through this flag.

Signed-off-by: Liangwei Dong <liangwei@codeaurora.org>
Signed-off-by: Srinivas Dasari <dasaris@codeaurora.org>
[rename flag to NL80211_EXT_FEATURE_AP_PMKSA_CACHING]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 4 ++++
 net/wireless/nl80211.c       | 5 ++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 8b0fdb9e133b..dd4f86ee286e 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -5340,6 +5340,9 @@ enum nl80211_feature_flags {
  *	fairness for transmitted packets and has enabled airtime fairness
  *	scheduling.
  *
+ * @NL80211_EXT_FEATURE_AP_PMKSA_CACHING: Driver/device supports PMKSA caching
+ *	(set/del PMKSA operations) in AP mode.
+ *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
  */
@@ -5380,6 +5383,7 @@ enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_CAN_REPLACE_PTK0,
 	NL80211_EXT_FEATURE_ENABLE_FTM_RESPONDER,
 	NL80211_EXT_FEATURE_AIRTIME_FAIRNESS,
+	NL80211_EXT_FEATURE_AP_PMKSA_CACHING,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index dc96077afe5e..af89e5c9fd0a 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -9899,7 +9899,10 @@ static int nl80211_setdel_pmksa(struct sk_buff *skb, struct genl_info *info)
 	}
 
 	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
-	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT &&
+	    !(dev->ieee80211_ptr->iftype == NL80211_IFTYPE_AP &&
+	      wiphy_ext_feature_isset(&rdev->wiphy,
+				      NL80211_EXT_FEATURE_AP_PMKSA_CACHING)))
 		return -EOPNOTSUPP;
 
 	switch (info->genlhdr->cmd) {
-- 
cgit v1.2.3-59-g8ed1b


From d405c7407a5468d4fc11724d76063e0647d80106 Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@netronome.com>
Date: Sat, 26 Jan 2019 12:25:59 -0500
Subject: bpf: allocate 0x06 to new eBPF instruction class JMP32

The new eBPF instruction class JMP32 uses the reserved class number 0x6.
Kernel BPF ISA documentation updated accordingly.

Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 Documentation/networking/filter.txt | 15 ++++++++-------
 include/uapi/linux/bpf.h            |  1 +
 tools/include/uapi/linux/bpf.h      |  1 +
 3 files changed, 10 insertions(+), 7 deletions(-)

(limited to 'include/uapi')

diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt
index 2196b824e96c..01603bc2eff1 100644
--- a/Documentation/networking/filter.txt
+++ b/Documentation/networking/filter.txt
@@ -865,7 +865,7 @@ Three LSB bits store instruction class which is one of:
   BPF_STX   0x03          BPF_STX   0x03
   BPF_ALU   0x04          BPF_ALU   0x04
   BPF_JMP   0x05          BPF_JMP   0x05
-  BPF_RET   0x06          [ class 6 unused, for future if needed ]
+  BPF_RET   0x06          BPF_JMP32 0x06
   BPF_MISC  0x07          BPF_ALU64 0x07
 
 When BPF_CLASS(code) == BPF_ALU or BPF_JMP, 4th bit encodes source operand ...
@@ -902,9 +902,9 @@ If BPF_CLASS(code) == BPF_ALU or BPF_ALU64 [ in eBPF ], BPF_OP(code) is one of:
   BPF_ARSH  0xc0  /* eBPF only: sign extending shift right */
   BPF_END   0xd0  /* eBPF only: endianness conversion */
 
-If BPF_CLASS(code) == BPF_JMP, BPF_OP(code) is one of:
+If BPF_CLASS(code) == BPF_JMP or BPF_JMP32 [ in eBPF ], BPF_OP(code) is one of:
 
-  BPF_JA    0x00
+  BPF_JA    0x00  /* BPF_JMP only */
   BPF_JEQ   0x10
   BPF_JGT   0x20
   BPF_JGE   0x30
@@ -912,8 +912,8 @@ If BPF_CLASS(code) == BPF_JMP, BPF_OP(code) is one of:
   BPF_JNE   0x50  /* eBPF only: jump != */
   BPF_JSGT  0x60  /* eBPF only: signed '>' */
   BPF_JSGE  0x70  /* eBPF only: signed '>=' */
-  BPF_CALL  0x80  /* eBPF only: function call */
-  BPF_EXIT  0x90  /* eBPF only: function return */
+  BPF_CALL  0x80  /* eBPF BPF_JMP only: function call */
+  BPF_EXIT  0x90  /* eBPF BPF_JMP only: function return */
   BPF_JLT   0xa0  /* eBPF only: unsigned '<' */
   BPF_JLE   0xb0  /* eBPF only: unsigned '<=' */
   BPF_JSLT  0xc0  /* eBPF only: signed '<' */
@@ -936,8 +936,9 @@ Classic BPF wastes the whole BPF_RET class to represent a single 'ret'
 operation. Classic BPF_RET | BPF_K means copy imm32 into return register
 and perform function exit. eBPF is modeled to match CPU, so BPF_JMP | BPF_EXIT
 in eBPF means function exit only. The eBPF program needs to store return
-value into register R0 before doing a BPF_EXIT. Class 6 in eBPF is currently
-unused and reserved for future use.
+value into register R0 before doing a BPF_EXIT. Class 6 in eBPF is used as
+BPF_JMP32 to mean exactly the same operations as BPF_JMP, but with 32-bit wide
+operands for the comparisons instead.
 
 For load and store instructions the 8-bit 'code' field is divided as:
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2940a9854f6d..60b99b730a41 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -14,6 +14,7 @@
 /* Extended instruction set based on top of classic BPF */
 
 /* instruction classes */
+#define BPF_JMP32	0x06	/* jmp mode in word width */
 #define BPF_ALU64	0x07	/* alu mode in double word width */
 
 /* ld/ldx fields */
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 2940a9854f6d..60b99b730a41 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -14,6 +14,7 @@
 /* Extended instruction set based on top of classic BPF */
 
 /* instruction classes */
+#define BPF_JMP32	0x06	/* jmp mode in word width */
 #define BPF_ALU64	0x07	/* alu mode in double word width */
 
 /* ld/ldx fields */
-- 
cgit v1.2.3-59-g8ed1b


From 6bf8f22aea0ddd93af822aed8afeeee4acdf7694 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Tue, 22 Jan 2019 08:29:56 +0200
Subject: IB/mlx5: Introduce MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD

Introduce MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD and its initial implementation.

This object is from type class FD and will be used to read DEVX async
commands completion.

The core layer should allow the driver to set object from type FD in a
safe mode, this option was added with a matching comment in place.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/rdma_core.c      |  1 +
 drivers/infiniband/core/uverbs_uapi.c    | 15 ++++--
 drivers/infiniband/hw/mlx5/devx.c        | 83 ++++++++++++++++++++++++++++++++
 include/rdma/uverbs_types.h              |  1 +
 include/uapi/rdma/mlx5_user_ioctl_cmds.h |  9 ++++
 5 files changed, 104 insertions(+), 5 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
index 6c4747e61d2b..a260d2f8e0b7 100644
--- a/drivers/infiniband/core/rdma_core.c
+++ b/drivers/infiniband/core/rdma_core.c
@@ -801,6 +801,7 @@ void uverbs_close_fd(struct file *f)
 	/* Pairs with filp->private_data in alloc_begin_fd_uobject */
 	uverbs_uobject_put(uobj);
 }
+EXPORT_SYMBOL(uverbs_close_fd);
 
 /*
  * Drop the ucontext off the ufile and completely disconnect it from the
diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c
index 9ae08e4b78a3..7a987acf0c0b 100644
--- a/drivers/infiniband/core/uverbs_uapi.c
+++ b/drivers/infiniband/core/uverbs_uapi.c
@@ -188,13 +188,18 @@ static int uapi_merge_obj_tree(struct uverbs_api *uapi,
 		obj_elm->type_attrs = obj->type_attrs;
 		obj_elm->type_class = obj->type_attrs->type_class;
 		/*
-		 * Today drivers are only permitted to use idr_class
-		 * types. They cannot use FD types because we currently have
-		 * no way to revoke the fops pointer after device
-		 * disassociation.
+		 * Today drivers are only permitted to use idr_class and
+		 * fd_class types. We can revoke the IDR types during
+		 * disassociation, and the FD types require the driver to use
+		 * struct file_operations.owner to prevent the driver module
+		 * code from unloading while the file is open. This provides
+		 * enough safety that uverbs_close_fd() will continue to work.
+		 * Drivers using FD are responsible to handle disassociation of
+		 * the device on their own.
 		 */
 		if (WARN_ON(is_driver &&
-			    obj->type_attrs->type_class != &uverbs_idr_class))
+			    obj->type_attrs->type_class != &uverbs_idr_class &&
+			    obj->type_attrs->type_class != &uverbs_fd_class))
 			return -EINVAL;
 	}
 
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index 5a588f3cfb1b..9933bcf83a6b 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -1168,6 +1168,38 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_QUERY)(
 			      cmd_out, cmd_out_len);
 }
 
+struct devx_async_event_queue {
+	spinlock_t		lock;
+	wait_queue_head_t	poll_wait;
+	struct list_head	event_list;
+};
+
+struct devx_async_cmd_event_file {
+	struct ib_uobject		uobj;
+	struct devx_async_event_queue	ev_queue;
+};
+
+static void devx_init_event_queue(struct devx_async_event_queue *ev_queue)
+{
+	spin_lock_init(&ev_queue->lock);
+	INIT_LIST_HEAD(&ev_queue->event_list);
+	init_waitqueue_head(&ev_queue->poll_wait);
+}
+
+static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC)(
+	struct uverbs_attr_bundle *attrs)
+{
+	struct devx_async_cmd_event_file *ev_file;
+
+	struct ib_uobject *uobj = uverbs_attr_get_uobject(
+		attrs, MLX5_IB_ATTR_DEVX_ASYNC_CMD_FD_ALLOC_HANDLE);
+
+	ev_file = container_of(uobj, struct devx_async_cmd_event_file,
+			       uobj);
+	devx_init_event_queue(&ev_file->ev_queue);
+	return 0;
+}
+
 static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext,
 			 struct uverbs_attr_bundle *attrs,
 			 struct devx_umem *obj)
@@ -1313,6 +1345,38 @@ static int devx_umem_cleanup(struct ib_uobject *uobject,
 	return 0;
 }
 
+static ssize_t devx_async_cmd_event_read(struct file *filp, char __user *buf,
+					 size_t count, loff_t *pos)
+{
+	return -EINVAL;
+}
+
+static int devx_async_cmd_event_close(struct inode *inode, struct file *filp)
+{
+	uverbs_close_fd(filp);
+	return 0;
+}
+
+static __poll_t devx_async_cmd_event_poll(struct file *filp,
+					      struct poll_table_struct *wait)
+{
+	return 0;
+}
+
+const struct file_operations devx_async_cmd_event_fops = {
+	.owner	 = THIS_MODULE,
+	.read	 = devx_async_cmd_event_read,
+	.poll    = devx_async_cmd_event_poll,
+	.release = devx_async_cmd_event_close,
+	.llseek	 = no_llseek,
+};
+
+static int devx_hot_unplug_async_cmd_event_file(struct ib_uobject *uobj,
+						   enum rdma_remove_reason why)
+{
+	return 0;
+};
+
 DECLARE_UVERBS_NAMED_METHOD(
 	MLX5_IB_METHOD_DEVX_UMEM_REG,
 	UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_UMEM_REG_HANDLE,
@@ -1440,6 +1504,22 @@ DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_DEVX_UMEM,
 			    &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_UMEM_REG),
 			    &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_UMEM_DEREG));
 
+
+DECLARE_UVERBS_NAMED_METHOD(
+	MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC,
+	UVERBS_ATTR_FD(MLX5_IB_ATTR_DEVX_ASYNC_CMD_FD_ALLOC_HANDLE,
+			MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD,
+			UVERBS_ACCESS_NEW,
+			UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(
+	MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD,
+	UVERBS_TYPE_ALLOC_FD(sizeof(struct devx_async_cmd_event_file),
+			     devx_hot_unplug_async_cmd_event_file,
+			     &devx_async_cmd_event_fops, "[devx_async_cmd]",
+			     O_RDONLY),
+	&UVERBS_METHOD(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC));
+
 static bool devx_is_supported(struct ib_device *device)
 {
 	struct mlx5_ib_dev *dev = to_mdev(device);
@@ -1457,5 +1537,8 @@ const struct uapi_definition mlx5_ib_devx_defs[] = {
 	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
 		MLX5_IB_OBJECT_DEVX_UMEM,
 		UAPI_DEF_IS_OBJ_SUPPORTED(devx_is_supported)),
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
+		MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD,
+		UAPI_DEF_IS_OBJ_SUPPORTED(devx_is_supported)),
 	{},
 };
diff --git a/include/rdma/uverbs_types.h b/include/rdma/uverbs_types.h
index acb1bfa3cc99..175d761695e1 100644
--- a/include/rdma/uverbs_types.h
+++ b/include/rdma/uverbs_types.h
@@ -157,6 +157,7 @@ struct uverbs_obj_fd_type {
 
 extern const struct uverbs_obj_type_class uverbs_idr_class;
 extern const struct uverbs_obj_type_class uverbs_fd_class;
+void uverbs_close_fd(struct file *f);
 
 #define UVERBS_BUILD_BUG_ON(cond) (sizeof(char[1 - 2 * !!(cond)]) -	\
 				   sizeof(char))
diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h
index b8d121d457f1..6ceae29d77cd 100644
--- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h
+++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h
@@ -113,11 +113,20 @@ enum mlx5_ib_devx_umem_methods {
 	MLX5_IB_METHOD_DEVX_UMEM_DEREG,
 };
 
+enum mlx5_ib_devx_async_cmd_fd_alloc_attrs {
+	MLX5_IB_ATTR_DEVX_ASYNC_CMD_FD_ALLOC_HANDLE = (1U << UVERBS_ID_NS_SHIFT),
+};
+
+enum mlx5_ib_devx_async_cmd_fd_methods {
+	MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC = (1U << UVERBS_ID_NS_SHIFT),
+};
+
 enum mlx5_ib_objects {
 	MLX5_IB_OBJECT_DEVX = (1U << UVERBS_ID_NS_SHIFT),
 	MLX5_IB_OBJECT_DEVX_OBJ,
 	MLX5_IB_OBJECT_DEVX_UMEM,
 	MLX5_IB_OBJECT_FLOW_MATCHER,
+	MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD,
 };
 
 enum mlx5_ib_flow_matcher_create_attrs {
-- 
cgit v1.2.3-59-g8ed1b


From a124edba26270697540f1058bfcd490c1c65b116 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Tue, 22 Jan 2019 08:29:57 +0200
Subject: IB/mlx5: Introduce async DEVX obj query API

Introduce async DEVX obj query API to get the command response back to
user space once it's ready without blocking when calling the firmware.

The event's data includes a header with some meta data then the firmware
output command data.

The header includes:
- The input 'wr_id' to let application recognizing the response.

The input FD attribute is used to have the event data ready on.
Downstream patches from this series will implement the file ops to let
application read it.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/devx.c         | 164 +++++++++++++++++++++++++++++-
 include/uapi/rdma/mlx5_user_ioctl_cmds.h  |   9 ++
 include/uapi/rdma/mlx5_user_ioctl_verbs.h |   5 +
 3 files changed, 177 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index 9933bcf83a6b..9ca116155f9c 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -8,6 +8,7 @@
 #include <rdma/uverbs_types.h>
 #include <rdma/uverbs_ioctl.h>
 #include <rdma/mlx5_user_ioctl_cmds.h>
+#include <rdma/mlx5_user_ioctl_verbs.h>
 #include <rdma/ib_umem.h>
 #include <rdma/uverbs_std_types.h>
 #include <linux/mlx5/driver.h>
@@ -17,6 +18,16 @@
 #define UVERBS_MODULE_NAME mlx5_ib
 #include <rdma/uverbs_named_ioctl.h>
 
+struct devx_async_data {
+	struct mlx5_ib_dev *mdev;
+	struct list_head list;
+	struct ib_uobject *fd_uobj;
+	struct mlx5_async_work cb_work;
+	u16 cmd_out_len;
+	/* must be last field in this structure */
+	struct mlx5_ib_uapi_devx_async_cmd_hdr hdr;
+};
+
 #define MLX5_MAX_DESTROY_INBOX_SIZE_DW MLX5_ST_SZ_DW(delete_fte_in)
 struct devx_obj {
 	struct mlx5_core_dev	*mdev;
@@ -1172,11 +1183,13 @@ struct devx_async_event_queue {
 	spinlock_t		lock;
 	wait_queue_head_t	poll_wait;
 	struct list_head	event_list;
+	atomic_t		bytes_in_use;
 };
 
 struct devx_async_cmd_event_file {
 	struct ib_uobject		uobj;
 	struct devx_async_event_queue	ev_queue;
+	struct mlx5_async_ctx		async_ctx;
 };
 
 static void devx_init_event_queue(struct devx_async_event_queue *ev_queue)
@@ -1184,6 +1197,7 @@ static void devx_init_event_queue(struct devx_async_event_queue *ev_queue)
 	spin_lock_init(&ev_queue->lock);
 	INIT_LIST_HEAD(&ev_queue->event_list);
 	init_waitqueue_head(&ev_queue->poll_wait);
+	atomic_set(&ev_queue->bytes_in_use, 0);
 }
 
 static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC)(
@@ -1193,13 +1207,123 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC)(
 
 	struct ib_uobject *uobj = uverbs_attr_get_uobject(
 		attrs, MLX5_IB_ATTR_DEVX_ASYNC_CMD_FD_ALLOC_HANDLE);
+	struct mlx5_ib_dev *mdev = to_mdev(uobj->context->device);
 
 	ev_file = container_of(uobj, struct devx_async_cmd_event_file,
 			       uobj);
 	devx_init_event_queue(&ev_file->ev_queue);
+	mlx5_cmd_init_async_ctx(mdev->mdev, &ev_file->async_ctx);
 	return 0;
 }
 
+static void devx_query_callback(int status, struct mlx5_async_work *context)
+{
+	struct devx_async_data *async_data =
+		container_of(context, struct devx_async_data, cb_work);
+	struct ib_uobject *fd_uobj = async_data->fd_uobj;
+	struct devx_async_cmd_event_file *ev_file;
+	struct devx_async_event_queue *ev_queue;
+	unsigned long flags;
+
+	ev_file = container_of(fd_uobj, struct devx_async_cmd_event_file,
+			       uobj);
+	ev_queue = &ev_file->ev_queue;
+
+	spin_lock_irqsave(&ev_queue->lock, flags);
+	list_add_tail(&async_data->list, &ev_queue->event_list);
+	spin_unlock_irqrestore(&ev_queue->lock, flags);
+
+	wake_up_interruptible(&ev_queue->poll_wait);
+	fput(fd_uobj->object);
+}
+
+#define MAX_ASYNC_BYTES_IN_USE (1024 * 1024) /* 1MB */
+
+static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY)(
+	struct uverbs_attr_bundle *attrs)
+{
+	void *cmd_in = uverbs_attr_get_alloced_ptr(attrs,
+				MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_CMD_IN);
+	struct ib_uobject *uobj = uverbs_attr_get_uobject(
+				attrs,
+				MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_HANDLE);
+	u16 cmd_out_len;
+	struct mlx5_ib_ucontext *c = to_mucontext(uobj->context);
+	struct ib_uobject *fd_uobj;
+	int err;
+	int uid;
+	struct mlx5_ib_dev *mdev = to_mdev(uobj->context->device);
+	struct devx_async_cmd_event_file *ev_file;
+	struct devx_async_data *async_data;
+
+	uid = devx_get_uid(c, cmd_in);
+	if (uid < 0)
+		return uid;
+
+	if (!devx_is_obj_query_cmd(cmd_in))
+		return -EINVAL;
+
+	err = uverbs_get_const(&cmd_out_len, attrs,
+			       MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_OUT_LEN);
+	if (err)
+		return err;
+
+	if (!devx_is_valid_obj_id(uobj, cmd_in))
+		return -EINVAL;
+
+	fd_uobj = uverbs_attr_get_uobject(attrs,
+				MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_FD);
+	if (IS_ERR(fd_uobj))
+		return PTR_ERR(fd_uobj);
+
+	ev_file = container_of(fd_uobj, struct devx_async_cmd_event_file,
+			       uobj);
+
+	if (atomic_add_return(cmd_out_len, &ev_file->ev_queue.bytes_in_use) >
+			MAX_ASYNC_BYTES_IN_USE) {
+		atomic_sub(cmd_out_len, &ev_file->ev_queue.bytes_in_use);
+		return -EAGAIN;
+	}
+
+	async_data = kvzalloc(struct_size(async_data, hdr.out_data,
+					  cmd_out_len), GFP_KERNEL);
+	if (!async_data) {
+		err = -ENOMEM;
+		goto sub_bytes;
+	}
+
+	err = uverbs_copy_from(&async_data->hdr.wr_id, attrs,
+			       MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_WR_ID);
+	if (err)
+		goto free_async;
+
+	async_data->cmd_out_len = cmd_out_len;
+	async_data->mdev = mdev;
+	async_data->fd_uobj = fd_uobj;
+
+	get_file(fd_uobj->object);
+	MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, uid);
+	err = mlx5_cmd_exec_cb(&ev_file->async_ctx, cmd_in,
+		    uverbs_attr_get_len(attrs,
+				MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_CMD_IN),
+		    async_data->hdr.out_data,
+		    async_data->cmd_out_len,
+		    devx_query_callback, &async_data->cb_work);
+
+	if (err)
+		goto cb_err;
+
+	return 0;
+
+cb_err:
+	fput(fd_uobj->object);
+free_async:
+	kvfree(async_data);
+sub_bytes:
+	atomic_sub(cmd_out_len, &ev_file->ev_queue.bytes_in_use);
+	return err;
+}
+
 static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext,
 			 struct uverbs_attr_bundle *attrs,
 			 struct devx_umem *obj)
@@ -1353,6 +1477,17 @@ static ssize_t devx_async_cmd_event_read(struct file *filp, char __user *buf,
 
 static int devx_async_cmd_event_close(struct inode *inode, struct file *filp)
 {
+	struct ib_uobject *uobj = filp->private_data;
+	struct devx_async_cmd_event_file *comp_ev_file = container_of(
+		uobj, struct devx_async_cmd_event_file, uobj);
+	struct devx_async_data *entry, *tmp;
+
+	spin_lock_irq(&comp_ev_file->ev_queue.lock);
+	list_for_each_entry_safe(entry, tmp,
+				 &comp_ev_file->ev_queue.event_list, list)
+		kvfree(entry);
+	spin_unlock_irq(&comp_ev_file->ev_queue.lock);
+
 	uverbs_close_fd(filp);
 	return 0;
 }
@@ -1374,6 +1509,11 @@ const struct file_operations devx_async_cmd_event_fops = {
 static int devx_hot_unplug_async_cmd_event_file(struct ib_uobject *uobj,
 						   enum rdma_remove_reason why)
 {
+	struct devx_async_cmd_event_file *comp_ev_file =
+		container_of(uobj, struct devx_async_cmd_event_file,
+			     uobj);
+
+	mlx5_cmd_cleanup_async_ctx(&comp_ev_file->async_ctx);
 	return 0;
 };
 
@@ -1487,6 +1627,27 @@ DECLARE_UVERBS_NAMED_METHOD(
 		UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_out_cmd_hdr)),
 		UA_MANDATORY));
 
+DECLARE_UVERBS_NAMED_METHOD(
+	MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY,
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_OBJ_QUERY_HANDLE,
+			UVERBS_IDR_ANY_OBJECT,
+			UVERBS_ACCESS_READ,
+			UA_MANDATORY),
+	UVERBS_ATTR_PTR_IN(
+		MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_IN,
+		UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_in_cmd_hdr)),
+		UA_MANDATORY,
+		UA_ALLOC_AND_COPY),
+	UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_OUT_LEN,
+		u16, UA_MANDATORY),
+	UVERBS_ATTR_FD(MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_FD,
+		MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD,
+		UVERBS_ACCESS_READ,
+		UA_MANDATORY),
+	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_WR_ID,
+		UVERBS_ATTR_TYPE(u64),
+		UA_MANDATORY));
+
 DECLARE_UVERBS_GLOBAL_METHODS(MLX5_IB_OBJECT_DEVX,
 			      &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OTHER),
 			      &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_QUERY_UAR),
@@ -1497,7 +1658,8 @@ DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_DEVX_OBJ,
 			    &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_CREATE),
 			    &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_DESTROY),
 			    &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_MODIFY),
-			    &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_QUERY));
+			    &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_QUERY),
+			    &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY));
 
 DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_DEVX_UMEM,
 			    UVERBS_TYPE_ALLOC_IDR(devx_umem_cleanup),
diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h
index 6ceae29d77cd..8149d224030b 100644
--- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h
+++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h
@@ -84,6 +84,14 @@ enum mlx5_ib_devx_obj_query_attrs {
 	MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_OUT,
 };
 
+enum mlx5_ib_devx_obj_query_async_attrs {
+	MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_HANDLE = (1U << UVERBS_ID_NS_SHIFT),
+	MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_CMD_IN,
+	MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_FD,
+	MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_WR_ID,
+	MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_OUT_LEN,
+};
+
 enum  mlx5_ib_devx_query_eqn_attrs {
 	MLX5_IB_ATTR_DEVX_QUERY_EQN_USER_VEC = (1U << UVERBS_ID_NS_SHIFT),
 	MLX5_IB_ATTR_DEVX_QUERY_EQN_DEV_EQN,
@@ -94,6 +102,7 @@ enum mlx5_ib_devx_obj_methods {
 	MLX5_IB_METHOD_DEVX_OBJ_DESTROY,
 	MLX5_IB_METHOD_DEVX_OBJ_MODIFY,
 	MLX5_IB_METHOD_DEVX_OBJ_QUERY,
+	MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY,
 };
 
 enum mlx5_ib_devx_umem_reg_attrs {
diff --git a/include/uapi/rdma/mlx5_user_ioctl_verbs.h b/include/uapi/rdma/mlx5_user_ioctl_verbs.h
index 4ef62c0e8452..4a701033b93f 100644
--- a/include/uapi/rdma/mlx5_user_ioctl_verbs.h
+++ b/include/uapi/rdma/mlx5_user_ioctl_verbs.h
@@ -51,5 +51,10 @@ enum mlx5_ib_uapi_flow_action_packet_reformat_type {
 	MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL = 0x3,
 };
 
+struct mlx5_ib_uapi_devx_async_cmd_hdr {
+	__aligned_u64	wr_id;
+	__u8		out_data[];
+};
+
 #endif
 
-- 
cgit v1.2.3-59-g8ed1b


From 71368af9027f18fe5d1c6f372cfdff7e4bde8b48 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Wed, 16 Jan 2019 17:01:36 -0500
Subject: x86/speculation: Add PR_SPEC_DISABLE_NOEXEC

With the default SPEC_STORE_BYPASS_SECCOMP/SPEC_STORE_BYPASS_PRCTL mode,
the TIF_SSBD bit will be inherited when a new task is fork'ed or cloned.
It will also remain when a new program is execve'ed.

Only certain class of applications (like Java) that can run on behalf of
multiple users on a single thread will require disabling speculative store
bypass for security purposes. Those applications will call prctl(2) at
startup time to disable SSB. They won't rely on the fact the SSB might have
been disabled. Other applications that don't need SSBD will just move on
without checking if SSBD has been turned on or not.

The fact that the TIF_SSBD is inherited across execve(2) boundary will
cause performance of applications that don't need SSBD but their
predecessors have SSBD on to be unwittingly impacted especially if they
write to memory a lot.

To remedy this problem, a new PR_SPEC_DISABLE_NOEXEC argument for the
PR_SET_SPECULATION_CTRL option of prctl(2) is added to allow applications
to specify that the SSBD feature bit on the task structure should be
cleared whenever a new program is being execve'ed.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: linux-doc@vger.kernel.org
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: David Woodhouse <dwmw@amazon.co.uk>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: KarimAllah Ahmed <karahmed@amazon.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Link: https://lkml.kernel.org/r/1547676096-3281-1-git-send-email-longman@redhat.com
---
 Documentation/userspace-api/spec_ctrl.rst | 27 +++++++++++++++------------
 arch/x86/kernel/cpu/bugs.c                | 12 ++++++++++++
 arch/x86/kernel/process.c                 | 12 ++++++++++++
 include/linux/sched.h                     |  5 +++++
 include/uapi/linux/prctl.h                |  1 +
 tools/include/uapi/linux/prctl.h          |  1 +
 6 files changed, 46 insertions(+), 12 deletions(-)

(limited to 'include/uapi')

diff --git a/Documentation/userspace-api/spec_ctrl.rst b/Documentation/userspace-api/spec_ctrl.rst
index c4dbe6f7cdae..1129c7550a48 100644
--- a/Documentation/userspace-api/spec_ctrl.rst
+++ b/Documentation/userspace-api/spec_ctrl.rst
@@ -28,18 +28,20 @@ PR_GET_SPECULATION_CTRL returns the state of the speculation misfeature
 which is selected with arg2 of prctl(2). The return value uses bits 0-3 with
 the following meaning:
 
-==== ===================== ===================================================
-Bit  Define                Description
-==== ===================== ===================================================
-0    PR_SPEC_PRCTL         Mitigation can be controlled per task by
-                           PR_SET_SPECULATION_CTRL.
-1    PR_SPEC_ENABLE        The speculation feature is enabled, mitigation is
-                           disabled.
-2    PR_SPEC_DISABLE       The speculation feature is disabled, mitigation is
-                           enabled.
-3    PR_SPEC_FORCE_DISABLE Same as PR_SPEC_DISABLE, but cannot be undone. A
-                           subsequent prctl(..., PR_SPEC_ENABLE) will fail.
-==== ===================== ===================================================
+==== ====================== ==================================================
+Bit  Define                 Description
+==== ====================== ==================================================
+0    PR_SPEC_PRCTL          Mitigation can be controlled per task by
+                            PR_SET_SPECULATION_CTRL.
+1    PR_SPEC_ENABLE         The speculation feature is enabled, mitigation is
+                            disabled.
+2    PR_SPEC_DISABLE        The speculation feature is disabled, mitigation is
+                            enabled.
+3    PR_SPEC_FORCE_DISABLE  Same as PR_SPEC_DISABLE, but cannot be undone. A
+                            subsequent prctl(..., PR_SPEC_ENABLE) will fail.
+4    PR_SPEC_DISABLE_NOEXEC Same as PR_SPEC_DISABLE, but the state will be
+                            cleared on :manpage:`execve(2)`.
+==== ====================== ==================================================
 
 If all bits are 0 the CPU is not affected by the speculation misfeature.
 
@@ -92,6 +94,7 @@ Speculation misfeature controls
    * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_ENABLE, 0, 0);
    * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_DISABLE, 0, 0);
    * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_FORCE_DISABLE, 0, 0);
+   * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_DISABLE_NOEXEC, 0, 0);
 
 - PR_SPEC_INDIR_BRANCH: Indirect Branch Speculation in User Processes
                         (Mitigate Spectre V2 style attacks against user processes)
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 1de0f4170178..2faeaf46347a 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -798,15 +798,25 @@ static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)
 		if (task_spec_ssb_force_disable(task))
 			return -EPERM;
 		task_clear_spec_ssb_disable(task);
+		task_clear_spec_ssb_noexec(task);
 		task_update_spec_tif(task);
 		break;
 	case PR_SPEC_DISABLE:
 		task_set_spec_ssb_disable(task);
+		task_clear_spec_ssb_noexec(task);
 		task_update_spec_tif(task);
 		break;
 	case PR_SPEC_FORCE_DISABLE:
 		task_set_spec_ssb_disable(task);
 		task_set_spec_ssb_force_disable(task);
+		task_clear_spec_ssb_noexec(task);
+		task_update_spec_tif(task);
+		break;
+	case PR_SPEC_DISABLE_NOEXEC:
+		if (task_spec_ssb_force_disable(task))
+			return -EPERM;
+		task_set_spec_ssb_disable(task);
+		task_set_spec_ssb_noexec(task);
 		task_update_spec_tif(task);
 		break;
 	default:
@@ -885,6 +895,8 @@ static int ssb_prctl_get(struct task_struct *task)
 	case SPEC_STORE_BYPASS_PRCTL:
 		if (task_spec_ssb_force_disable(task))
 			return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
+		if (task_spec_ssb_noexec(task))
+			return PR_SPEC_PRCTL | PR_SPEC_DISABLE_NOEXEC;
 		if (task_spec_ssb_disable(task))
 			return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
 		return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 90ae0ca51083..58ac7be52c7a 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -255,6 +255,18 @@ void arch_setup_new_exec(void)
 	/* If cpuid was previously disabled for this task, re-enable it. */
 	if (test_thread_flag(TIF_NOCPUID))
 		enable_cpuid();
+
+	/*
+	 * Don't inherit TIF_SSBD across exec boundary when
+	 * PR_SPEC_DISABLE_NOEXEC is used.
+	 */
+	if (test_thread_flag(TIF_SSBD) &&
+	    task_spec_ssb_noexec(current)) {
+		clear_thread_flag(TIF_SSBD);
+		task_clear_spec_ssb_disable(current);
+		task_clear_spec_ssb_noexec(current);
+		speculation_ctrl_update(task_thread_info(current)->flags);
+	}
 }
 
 static inline void switch_to_bitmap(struct thread_struct *prev,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d2f90fa92468..fc836dc71bba 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1459,6 +1459,7 @@ static inline bool is_percpu_thread(void)
 #define PFA_SPEC_SSB_FORCE_DISABLE	4	/* Speculative Store Bypass force disabled*/
 #define PFA_SPEC_IB_DISABLE		5	/* Indirect branch speculation restricted */
 #define PFA_SPEC_IB_FORCE_DISABLE	6	/* Indirect branch speculation permanently restricted */
+#define PFA_SPEC_SSB_NOEXEC		7	/* Speculative Store Bypass clear on execve() */
 
 #define TASK_PFA_TEST(name, func)					\
 	static inline bool task_##func(struct task_struct *p)		\
@@ -1487,6 +1488,10 @@ TASK_PFA_TEST(SPEC_SSB_DISABLE, spec_ssb_disable)
 TASK_PFA_SET(SPEC_SSB_DISABLE, spec_ssb_disable)
 TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable)
 
+TASK_PFA_TEST(SPEC_SSB_NOEXEC, spec_ssb_noexec)
+TASK_PFA_SET(SPEC_SSB_NOEXEC, spec_ssb_noexec)
+TASK_PFA_CLEAR(SPEC_SSB_NOEXEC, spec_ssb_noexec)
+
 TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable)
 TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable)
 
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index b4875a93363a..094bb03b9cc2 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -219,6 +219,7 @@ struct prctl_mm_map {
 # define PR_SPEC_ENABLE			(1UL << 1)
 # define PR_SPEC_DISABLE		(1UL << 2)
 # define PR_SPEC_FORCE_DISABLE		(1UL << 3)
+# define PR_SPEC_DISABLE_NOEXEC		(1UL << 4)
 
 /* Reset arm64 pointer authentication keys */
 #define PR_PAC_RESET_KEYS		54
diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h
index b4875a93363a..094bb03b9cc2 100644
--- a/tools/include/uapi/linux/prctl.h
+++ b/tools/include/uapi/linux/prctl.h
@@ -219,6 +219,7 @@ struct prctl_mm_map {
 # define PR_SPEC_ENABLE			(1UL << 1)
 # define PR_SPEC_DISABLE		(1UL << 2)
 # define PR_SPEC_FORCE_DISABLE		(1UL << 3)
+# define PR_SPEC_DISABLE_NOEXEC		(1UL << 4)
 
 /* Reset arm64 pointer authentication keys */
 #define PR_PAC_RESET_KEYS		54
-- 
cgit v1.2.3-59-g8ed1b


From 1194c4133195dfcb6c5fc0935d54bbed872a5285 Mon Sep 17 00:00:00 2001
From: Dexuan Cui <decui@microsoft.com>
Date: Tue, 29 Jan 2019 00:56:17 +0000
Subject: nfit: Add Hyper-V NVDIMM DSM command set to white list

Add the Hyper-V _DSM command set to the white list of NVDIMM command
sets.

This command set is documented at http://www.uefi.org/RFIC_LIST
(see "Virtual NVDIMM 0x1901").

Signed-off-by: Dexuan Cui <decui@microsoft.com>
Reviewed-by: Michael Kelley <mikelley@microsoft.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c   | 17 ++++++++++++++---
 drivers/acpi/nfit/nfit.h   |  6 +++++-
 include/uapi/linux/ndctl.h |  1 +
 3 files changed, 20 insertions(+), 4 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 1598e3a121a6..4a7e8b1fa43b 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -1846,9 +1846,17 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
 	dev_set_drvdata(&adev_dimm->dev, nfit_mem);
 
 	/*
-	 * Until standardization materializes we need to consider 4
-	 * different command sets.  Note, that checking for function0 (bit0)
-	 * tells us if any commands are reachable through this GUID.
+	 * There are 4 "legacy" NVDIMM command sets
+	 * (NVDIMM_FAMILY_{INTEL,MSFT,HPE1,HPE2}) that were created before
+	 * an EFI working group was established to constrain this
+	 * proliferation. The nfit driver probes for the supported command
+	 * set by GUID. Note, if you're a platform developer looking to add
+	 * a new command set to this probe, consider using an existing set,
+	 * or otherwise seek approval to publish the command set at
+	 * http://www.uefi.org/RFIC_LIST.
+	 *
+	 * Note, that checking for function0 (bit0) tells us if any commands
+	 * are reachable through this GUID.
 	 */
 	for (i = 0; i <= NVDIMM_FAMILY_MAX; i++)
 		if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1))
@@ -1871,6 +1879,8 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
 			dsm_mask &= ~(1 << 8);
 	} else if (nfit_mem->family == NVDIMM_FAMILY_MSFT) {
 		dsm_mask = 0xffffffff;
+	} else if (nfit_mem->family == NVDIMM_FAMILY_HYPERV) {
+		dsm_mask = 0x1f;
 	} else {
 		dev_dbg(dev, "unknown dimm command family\n");
 		nfit_mem->family = -1;
@@ -3714,6 +3724,7 @@ static __init int nfit_init(void)
 	guid_parse(UUID_NFIT_DIMM_N_HPE1, &nfit_uuid[NFIT_DEV_DIMM_N_HPE1]);
 	guid_parse(UUID_NFIT_DIMM_N_HPE2, &nfit_uuid[NFIT_DEV_DIMM_N_HPE2]);
 	guid_parse(UUID_NFIT_DIMM_N_MSFT, &nfit_uuid[NFIT_DEV_DIMM_N_MSFT]);
+	guid_parse(UUID_NFIT_DIMM_N_HYPERV, &nfit_uuid[NFIT_DEV_DIMM_N_HYPERV]);
 
 	nfit_wq = create_singlethread_workqueue("nfit");
 	if (!nfit_wq)
diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h
index 33691aecfcee..4de167b4f76f 100644
--- a/drivers/acpi/nfit/nfit.h
+++ b/drivers/acpi/nfit/nfit.h
@@ -34,11 +34,14 @@
 /* https://msdn.microsoft.com/library/windows/hardware/mt604741 */
 #define UUID_NFIT_DIMM_N_MSFT "1ee68b36-d4bd-4a1a-9a16-4f8e53d46e05"
 
+/* http://www.uefi.org/RFIC_LIST (see "Virtual NVDIMM 0x1901") */
+#define UUID_NFIT_DIMM_N_HYPERV "5746c5f2-a9a2-4264-ad0e-e4ddc9e09e80"
+
 #define ACPI_NFIT_MEM_FAILED_MASK (ACPI_NFIT_MEM_SAVE_FAILED \
 		| ACPI_NFIT_MEM_RESTORE_FAILED | ACPI_NFIT_MEM_FLUSH_FAILED \
 		| ACPI_NFIT_MEM_NOT_ARMED | ACPI_NFIT_MEM_MAP_FAILED)
 
-#define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_MSFT
+#define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_HYPERV
 
 #define NVDIMM_STANDARD_CMDMASK \
 (1 << ND_CMD_SMART | 1 << ND_CMD_SMART_THRESHOLD | 1 << ND_CMD_DIMM_FLAGS \
@@ -94,6 +97,7 @@ enum nfit_uuids {
 	NFIT_DEV_DIMM_N_HPE1 = NVDIMM_FAMILY_HPE1,
 	NFIT_DEV_DIMM_N_HPE2 = NVDIMM_FAMILY_HPE2,
 	NFIT_DEV_DIMM_N_MSFT = NVDIMM_FAMILY_MSFT,
+	NFIT_DEV_DIMM_N_HYPERV = NVDIMM_FAMILY_HYPERV,
 	NFIT_SPA_VOLATILE,
 	NFIT_SPA_PM,
 	NFIT_SPA_DCR,
diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h
index f57c9e434d2d..de5d90212409 100644
--- a/include/uapi/linux/ndctl.h
+++ b/include/uapi/linux/ndctl.h
@@ -243,6 +243,7 @@ struct nd_cmd_pkg {
 #define NVDIMM_FAMILY_HPE1 1
 #define NVDIMM_FAMILY_HPE2 2
 #define NVDIMM_FAMILY_MSFT 3
+#define NVDIMM_FAMILY_HYPERV 4
 
 #define ND_IOCTL_CALL			_IOWR(ND_IOCTL, ND_CMD_CALL,\
 					struct nd_cmd_pkg)
-- 
cgit v1.2.3-59-g8ed1b


From f4601dee25d5fe8010023552b10879f3d62e45ce Mon Sep 17 00:00:00 2001
From: Vasundhara Volam <vasundhara-v.volam@broadcom.com>
Date: Mon, 28 Jan 2019 18:00:21 +0530
Subject: devlink: Add port param get command

Add port param get command which gets data per parameter.
It also has option to dump the parameters data per port.

v7->v8: Append "Acked-by: Jiri Pirko <jiri@mellanox.com>"

Cc: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Vasundhara Volam <vasundhara-v.volam@broadcom.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/devlink.h |   2 +
 net/core/devlink.c           | 102 ++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 97 insertions(+), 7 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 6e52d3660654..448973beac9d 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -89,6 +89,8 @@ enum devlink_command {
 	DEVLINK_CMD_REGION_DEL,
 	DEVLINK_CMD_REGION_READ,
 
+	DEVLINK_CMD_PORT_PARAM_GET,	/* can dump */
+
 	/* add new commands above here */
 	__DEVLINK_CMD_MAX,
 	DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 371481ca2afd..66313dcdbdca 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2843,6 +2843,7 @@ nla_put_failure:
 }
 
 static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink,
+				 unsigned int port_index,
 				 struct devlink_param_item *param_item,
 				 enum devlink_command cmd,
 				 u32 portid, u32 seq, int flags)
@@ -2880,6 +2881,11 @@ static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink,
 
 	if (devlink_nl_put_handle(msg, devlink))
 		goto genlmsg_cancel;
+
+	if (cmd == DEVLINK_CMD_PORT_PARAM_GET)
+		if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, port_index))
+			goto genlmsg_cancel;
+
 	param_attr = nla_nest_start(msg, DEVLINK_ATTR_PARAM);
 	if (!param_attr)
 		goto genlmsg_cancel;
@@ -2933,7 +2939,7 @@ static void devlink_param_notify(struct devlink *devlink,
 	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
 	if (!msg)
 		return;
-	err = devlink_nl_param_fill(msg, devlink, param_item, cmd, 0, 0, 0);
+	err = devlink_nl_param_fill(msg, devlink, 0, param_item, cmd, 0, 0, 0);
 	if (err) {
 		nlmsg_free(msg);
 		return;
@@ -2962,7 +2968,7 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg,
 				idx++;
 				continue;
 			}
-			err = devlink_nl_param_fill(msg, devlink, param_item,
+			err = devlink_nl_param_fill(msg, devlink, 0, param_item,
 						    DEVLINK_CMD_PARAM_GET,
 						    NETLINK_CB(cb->skb).portid,
 						    cb->nlh->nlmsg_seq,
@@ -3051,7 +3057,7 @@ devlink_param_value_get_from_info(const struct devlink_param *param,
 }
 
 static struct devlink_param_item *
-devlink_param_get_from_info(struct devlink *devlink,
+devlink_param_get_from_info(struct list_head *param_list,
 			    struct genl_info *info)
 {
 	char *param_name;
@@ -3060,7 +3066,7 @@ devlink_param_get_from_info(struct devlink *devlink,
 		return NULL;
 
 	param_name = nla_data(info->attrs[DEVLINK_ATTR_PARAM_NAME]);
-	return devlink_param_find_by_name(&devlink->param_list, param_name);
+	return devlink_param_find_by_name(param_list, param_name);
 }
 
 static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb,
@@ -3071,7 +3077,7 @@ static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb,
 	struct sk_buff *msg;
 	int err;
 
-	param_item = devlink_param_get_from_info(devlink, info);
+	param_item = devlink_param_get_from_info(&devlink->param_list, info);
 	if (!param_item)
 		return -EINVAL;
 
@@ -3079,7 +3085,7 @@ static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb,
 	if (!msg)
 		return -ENOMEM;
 
-	err = devlink_nl_param_fill(msg, devlink, param_item,
+	err = devlink_nl_param_fill(msg, devlink, 0, param_item,
 				    DEVLINK_CMD_PARAM_GET,
 				    info->snd_portid, info->snd_seq, 0);
 	if (err) {
@@ -3102,7 +3108,7 @@ static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb,
 	union devlink_param_value value;
 	int err = 0;
 
-	param_item = devlink_param_get_from_info(devlink, info);
+	param_item = devlink_param_get_from_info(&devlink->param_list, info);
 	if (!param_item)
 		return -EINVAL;
 	param = param_item->param;
@@ -3183,6 +3189,80 @@ static void devlink_param_unregister_one(struct devlink *devlink,
 	kfree(param_item);
 }
 
+static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg,
+						struct netlink_callback *cb)
+{
+	struct devlink_param_item *param_item;
+	struct devlink_port *devlink_port;
+	struct devlink *devlink;
+	int start = cb->args[0];
+	int idx = 0;
+	int err;
+
+	mutex_lock(&devlink_mutex);
+	list_for_each_entry(devlink, &devlink_list, list) {
+		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+			continue;
+		mutex_lock(&devlink->lock);
+		list_for_each_entry(devlink_port, &devlink->port_list, list) {
+			list_for_each_entry(param_item,
+					    &devlink_port->param_list, list) {
+				if (idx < start) {
+					idx++;
+					continue;
+				}
+				err = devlink_nl_param_fill(msg,
+						devlink_port->devlink,
+						devlink_port->index, param_item,
+						DEVLINK_CMD_PORT_PARAM_GET,
+						NETLINK_CB(cb->skb).portid,
+						cb->nlh->nlmsg_seq,
+						NLM_F_MULTI);
+				if (err) {
+					mutex_unlock(&devlink->lock);
+					goto out;
+				}
+				idx++;
+			}
+		}
+		mutex_unlock(&devlink->lock);
+	}
+out:
+	mutex_unlock(&devlink_mutex);
+
+	cb->args[0] = idx;
+	return msg->len;
+}
+
+static int devlink_nl_cmd_port_param_get_doit(struct sk_buff *skb,
+					      struct genl_info *info)
+{
+	struct devlink_port *devlink_port = info->user_ptr[0];
+	struct devlink_param_item *param_item;
+	struct sk_buff *msg;
+	int err;
+
+	param_item = devlink_param_get_from_info(&devlink_port->param_list,
+						 info);
+	if (!param_item)
+		return -EINVAL;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	err = devlink_nl_param_fill(msg, devlink_port->devlink,
+				    devlink_port->index, param_item,
+				    DEVLINK_CMD_PORT_PARAM_GET,
+				    info->snd_portid, info->snd_seq, 0);
+	if (err) {
+		nlmsg_free(msg);
+		return err;
+	}
+
+	return genlmsg_reply(msg, info);
+}
+
 static int devlink_nl_region_snapshot_id_put(struct sk_buff *msg,
 					     struct devlink *devlink,
 					     struct devlink_snapshot *snapshot)
@@ -3820,6 +3900,14 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
 	},
+	{
+		.cmd = DEVLINK_CMD_PORT_PARAM_GET,
+		.doit = devlink_nl_cmd_port_param_get_doit,
+		.dumpit = devlink_nl_cmd_port_param_get_dumpit,
+		.policy = devlink_nl_policy,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
+		/* can be retrieved by unprivileged users */
+	},
 	{
 		.cmd = DEVLINK_CMD_REGION_GET,
 		.doit = devlink_nl_cmd_region_get_doit,
-- 
cgit v1.2.3-59-g8ed1b


From 9c54873b4e2ee22507627b1adac9e3a8407741bd Mon Sep 17 00:00:00 2001
From: Vasundhara Volam <vasundhara-v.volam@broadcom.com>
Date: Mon, 28 Jan 2019 18:00:22 +0530
Subject: devlink: Add port param set command

Add port param set command to set the value for a parameter.
Value can be set to any of the supported configuration modes.

v7->v8: Append "Acked-by: Jiri Pirko <jiri@mellanox.com>"

Cc: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Vasundhara Volam <vasundhara-v.volam@broadcom.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/devlink.h |  1 +
 net/core/devlink.c           | 37 ++++++++++++++++++++++++++++++++-----
 2 files changed, 33 insertions(+), 5 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 448973beac9d..3658fb20b190 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -90,6 +90,7 @@ enum devlink_command {
 	DEVLINK_CMD_REGION_READ,
 
 	DEVLINK_CMD_PORT_PARAM_GET,	/* can dump */
+	DEVLINK_CMD_PORT_PARAM_SET,
 
 	/* add new commands above here */
 	__DEVLINK_CMD_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 66313dcdbdca..113ad9f529e1 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -3096,10 +3096,11 @@ static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb,
 	return genlmsg_reply(msg, info);
 }
 
-static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb,
-					 struct genl_info *info)
+static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink,
+					   struct list_head *param_list,
+					   struct genl_info *info,
+					   enum devlink_command cmd)
 {
-	struct devlink *devlink = info->user_ptr[0];
 	enum devlink_param_type param_type;
 	struct devlink_param_gset_ctx ctx;
 	enum devlink_param_cmode cmode;
@@ -3108,7 +3109,7 @@ static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb,
 	union devlink_param_value value;
 	int err = 0;
 
-	param_item = devlink_param_get_from_info(&devlink->param_list, info);
+	param_item = devlink_param_get_from_info(param_list, info);
 	if (!param_item)
 		return -EINVAL;
 	param = param_item->param;
@@ -3148,10 +3149,19 @@ static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb,
 			return err;
 	}
 
-	devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW);
+	devlink_param_notify(devlink, param_item, cmd);
 	return 0;
 }
 
+static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb,
+					 struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+
+	return __devlink_nl_cmd_param_set_doit(devlink, &devlink->param_list,
+					       info, DEVLINK_CMD_PARAM_NEW);
+}
+
 static int devlink_param_register_one(struct devlink *devlink,
 				      struct list_head *param_list,
 				      const struct devlink_param *param)
@@ -3263,6 +3273,16 @@ static int devlink_nl_cmd_port_param_get_doit(struct sk_buff *skb,
 	return genlmsg_reply(msg, info);
 }
 
+static int devlink_nl_cmd_port_param_set_doit(struct sk_buff *skb,
+					      struct genl_info *info)
+{
+	struct devlink_port *devlink_port = info->user_ptr[0];
+
+	return __devlink_nl_cmd_param_set_doit(devlink_port->devlink,
+					       &devlink_port->param_list,
+					       info, 0);
+}
+
 static int devlink_nl_region_snapshot_id_put(struct sk_buff *msg,
 					     struct devlink *devlink,
 					     struct devlink_snapshot *snapshot)
@@ -3908,6 +3928,13 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
 		/* can be retrieved by unprivileged users */
 	},
+	{
+		.cmd = DEVLINK_CMD_PORT_PARAM_SET,
+		.doit = devlink_nl_cmd_port_param_set_doit,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
+	},
 	{
 		.cmd = DEVLINK_CMD_REGION_GET,
 		.doit = devlink_nl_cmd_region_get_doit,
-- 
cgit v1.2.3-59-g8ed1b


From c1e5786d6771c67fe044c3bcaa23e631e0503261 Mon Sep 17 00:00:00 2001
From: Vasundhara Volam <vasundhara-v.volam@broadcom.com>
Date: Mon, 28 Jan 2019 18:00:25 +0530
Subject: devlink: Add devlink notifications support for port params

Add notification call for devlink port param set, register and unregister
functions.
Add devlink_port_param_value_changed() function to enable the driver notify
devlink on value change. Driver should use this function after value was
changed on any configuration mode part to driverinit.

v7->v8:
Order devlink_port_param_value_changed() definitions followed by
devlink_param_value_changed()

Cc: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Vasundhara Volam <vasundhara-v.volam@broadcom.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h        |   8 ++++
 include/uapi/linux/devlink.h |   2 +
 net/core/devlink.c           | 111 ++++++++++++++++++++++++++++++++-----------
 3 files changed, 94 insertions(+), 27 deletions(-)

(limited to 'include/uapi')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index ae2ccf297946..ceb5e89d74d6 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -586,6 +586,8 @@ int devlink_port_param_driverinit_value_set(struct devlink_port *devlink_port,
 					    u32 param_id,
 					    union devlink_param_value init_val);
 void devlink_param_value_changed(struct devlink *devlink, u32 param_id);
+void devlink_port_param_value_changed(struct devlink_port *devlink_port,
+				      u32 param_id);
 void devlink_param_value_str_fill(union devlink_param_value *dst_val,
 				  const char *src);
 struct devlink_region *devlink_region_create(struct devlink *devlink,
@@ -855,6 +857,12 @@ devlink_param_value_changed(struct devlink *devlink, u32 param_id)
 {
 }
 
+static inline void
+devlink_port_param_value_changed(struct devlink_port *devlink_port,
+				 u32 param_id)
+{
+}
+
 static inline void
 devlink_param_value_str_fill(union devlink_param_value *dst_val,
 			     const char *src)
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 3658fb20b190..61b4447a6c5b 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -91,6 +91,8 @@ enum devlink_command {
 
 	DEVLINK_CMD_PORT_PARAM_GET,	/* can dump */
 	DEVLINK_CMD_PORT_PARAM_SET,
+	DEVLINK_CMD_PORT_PARAM_NEW,
+	DEVLINK_CMD_PORT_PARAM_DEL,
 
 	/* add new commands above here */
 	__DEVLINK_CMD_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 55456cc36833..451ab4725340 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2882,7 +2882,9 @@ static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink,
 	if (devlink_nl_put_handle(msg, devlink))
 		goto genlmsg_cancel;
 
-	if (cmd == DEVLINK_CMD_PORT_PARAM_GET)
+	if (cmd == DEVLINK_CMD_PORT_PARAM_GET ||
+	    cmd == DEVLINK_CMD_PORT_PARAM_NEW ||
+	    cmd == DEVLINK_CMD_PORT_PARAM_DEL)
 		if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, port_index))
 			goto genlmsg_cancel;
 
@@ -2928,18 +2930,22 @@ genlmsg_cancel:
 }
 
 static void devlink_param_notify(struct devlink *devlink,
+				 unsigned int port_index,
 				 struct devlink_param_item *param_item,
 				 enum devlink_command cmd)
 {
 	struct sk_buff *msg;
 	int err;
 
-	WARN_ON(cmd != DEVLINK_CMD_PARAM_NEW && cmd != DEVLINK_CMD_PARAM_DEL);
+	WARN_ON(cmd != DEVLINK_CMD_PARAM_NEW && cmd != DEVLINK_CMD_PARAM_DEL &&
+		cmd != DEVLINK_CMD_PORT_PARAM_NEW &&
+		cmd != DEVLINK_CMD_PORT_PARAM_DEL);
 
 	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
 	if (!msg)
 		return;
-	err = devlink_nl_param_fill(msg, devlink, 0, param_item, cmd, 0, 0, 0);
+	err = devlink_nl_param_fill(msg, devlink, port_index, param_item, cmd,
+				    0, 0, 0);
 	if (err) {
 		nlmsg_free(msg);
 		return;
@@ -3097,6 +3103,7 @@ static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb,
 }
 
 static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink,
+					   unsigned int port_index,
 					   struct list_head *param_list,
 					   struct genl_info *info,
 					   enum devlink_command cmd)
@@ -3149,7 +3156,7 @@ static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink,
 			return err;
 	}
 
-	devlink_param_notify(devlink, param_item, cmd);
+	devlink_param_notify(devlink, port_index, param_item, cmd);
 	return 0;
 }
 
@@ -3158,13 +3165,15 @@ static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb,
 {
 	struct devlink *devlink = info->user_ptr[0];
 
-	return __devlink_nl_cmd_param_set_doit(devlink, &devlink->param_list,
+	return __devlink_nl_cmd_param_set_doit(devlink, 0, &devlink->param_list,
 					       info, DEVLINK_CMD_PARAM_NEW);
 }
 
 static int devlink_param_register_one(struct devlink *devlink,
+				      unsigned int port_index,
 				      struct list_head *param_list,
-				      const struct devlink_param *param)
+				      const struct devlink_param *param,
+				      enum devlink_command cmd)
 {
 	struct devlink_param_item *param_item;
 
@@ -3182,19 +3191,21 @@ static int devlink_param_register_one(struct devlink *devlink,
 	param_item->param = param;
 
 	list_add_tail(&param_item->list, param_list);
-	devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW);
+	devlink_param_notify(devlink, port_index, param_item, cmd);
 	return 0;
 }
 
 static void devlink_param_unregister_one(struct devlink *devlink,
+					 unsigned int port_index,
 					 struct list_head *param_list,
-					 const struct devlink_param *param)
+					 const struct devlink_param *param,
+					 enum devlink_command cmd)
 {
 	struct devlink_param_item *param_item;
 
 	param_item = devlink_param_find_by_name(param_list, param->name);
 	WARN_ON(!param_item);
-	devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_DEL);
+	devlink_param_notify(devlink, port_index, param_item, cmd);
 	list_del(&param_item->list);
 	kfree(param_item);
 }
@@ -3279,8 +3290,9 @@ static int devlink_nl_cmd_port_param_set_doit(struct sk_buff *skb,
 	struct devlink_port *devlink_port = info->user_ptr[0];
 
 	return __devlink_nl_cmd_param_set_doit(devlink_port->devlink,
-					       &devlink_port->param_list,
-					       info, 0);
+					       devlink_port->index,
+					       &devlink_port->param_list, info,
+					       DEVLINK_CMD_PORT_PARAM_NEW);
 }
 
 static int devlink_nl_region_snapshot_id_put(struct sk_buff *msg,
@@ -4598,9 +4610,12 @@ static int devlink_param_verify(const struct devlink_param *param)
 }
 
 static int __devlink_params_register(struct devlink *devlink,
+				     unsigned int port_index,
 				     struct list_head *param_list,
 				     const struct devlink_param *params,
-				     size_t params_count)
+				     size_t params_count,
+				     enum devlink_command reg_cmd,
+				     enum devlink_command unreg_cmd)
 {
 	const struct devlink_param *param = params;
 	int i;
@@ -4612,7 +4627,8 @@ static int __devlink_params_register(struct devlink *devlink,
 		if (err)
 			goto rollback;
 
-		err = devlink_param_register_one(devlink, param_list, param);
+		err = devlink_param_register_one(devlink, port_index,
+						 param_list, param, reg_cmd);
 		if (err)
 			goto rollback;
 	}
@@ -4624,23 +4640,27 @@ rollback:
 	if (!i)
 		goto unlock;
 	for (param--; i > 0; i--, param--)
-		devlink_param_unregister_one(devlink, param_list, param);
+		devlink_param_unregister_one(devlink, port_index, param_list,
+					     param, unreg_cmd);
 unlock:
 	mutex_unlock(&devlink->lock);
 	return err;
 }
 
 static void __devlink_params_unregister(struct devlink *devlink,
+					unsigned int port_index,
 					struct list_head *param_list,
 					const struct devlink_param *params,
-					size_t params_count)
+					size_t params_count,
+					enum devlink_command cmd)
 {
 	const struct devlink_param *param = params;
 	int i;
 
 	mutex_lock(&devlink->lock);
 	for (i = 0; i < params_count; i++, param++)
-		devlink_param_unregister_one(devlink, param_list, param);
+		devlink_param_unregister_one(devlink, 0, param_list, param,
+					     cmd);
 	mutex_unlock(&devlink->lock);
 }
 
@@ -4657,8 +4677,10 @@ int devlink_params_register(struct devlink *devlink,
 			    const struct devlink_param *params,
 			    size_t params_count)
 {
-	return __devlink_params_register(devlink, &devlink->param_list, params,
-					 params_count);
+	return __devlink_params_register(devlink, 0, &devlink->param_list,
+					 params, params_count,
+					 DEVLINK_CMD_PARAM_NEW,
+					 DEVLINK_CMD_PARAM_DEL);
 }
 EXPORT_SYMBOL_GPL(devlink_params_register);
 
@@ -4672,8 +4694,9 @@ void devlink_params_unregister(struct devlink *devlink,
 			       const struct devlink_param *params,
 			       size_t params_count)
 {
-	return __devlink_params_unregister(devlink, &devlink->param_list,
-					   params, params_count);
+	return __devlink_params_unregister(devlink, 0, &devlink->param_list,
+					   params, params_count,
+					   DEVLINK_CMD_PARAM_DEL);
 }
 EXPORT_SYMBOL_GPL(devlink_params_unregister);
 
@@ -4691,8 +4714,11 @@ int devlink_port_params_register(struct devlink_port *devlink_port,
 				 size_t params_count)
 {
 	return __devlink_params_register(devlink_port->devlink,
+					 devlink_port->index,
 					 &devlink_port->param_list, params,
-					 params_count);
+					 params_count,
+					 DEVLINK_CMD_PORT_PARAM_NEW,
+					 DEVLINK_CMD_PORT_PARAM_DEL);
 }
 EXPORT_SYMBOL_GPL(devlink_port_params_register);
 
@@ -4709,8 +4735,10 @@ void devlink_port_params_unregister(struct devlink_port *devlink_port,
 				    size_t params_count)
 {
 	return __devlink_params_unregister(devlink_port->devlink,
+					   devlink_port->index,
 					   &devlink_port->param_list,
-					   params, params_count);
+					   params, params_count,
+					   DEVLINK_CMD_PORT_PARAM_DEL);
 }
 EXPORT_SYMBOL_GPL(devlink_port_params_unregister);
 
@@ -4739,6 +4767,7 @@ __devlink_param_driverinit_value_get(struct list_head *param_list, u32 param_id,
 
 static int
 __devlink_param_driverinit_value_set(struct devlink *devlink,
+				     unsigned int port_index,
 				     struct list_head *param_list, u32 param_id,
 				     union devlink_param_value init_val,
 				     enum devlink_command cmd)
@@ -4759,7 +4788,7 @@ __devlink_param_driverinit_value_set(struct devlink *devlink,
 		param_item->driverinit_value = init_val;
 	param_item->driverinit_value_valid = true;
 
-	devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW);
+	devlink_param_notify(devlink, port_index, param_item, cmd);
 	return 0;
 }
 
@@ -4800,7 +4829,7 @@ EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_get);
 int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id,
 				       union devlink_param_value init_val)
 {
-	return __devlink_param_driverinit_value_set(devlink,
+	return __devlink_param_driverinit_value_set(devlink, 0,
 						    &devlink->param_list,
 						    param_id, init_val,
 						    DEVLINK_CMD_PARAM_NEW);
@@ -4849,8 +4878,10 @@ int devlink_port_param_driverinit_value_set(struct devlink_port *devlink_port,
 					    union devlink_param_value init_val)
 {
 	return __devlink_param_driverinit_value_set(devlink_port->devlink,
+						    devlink_port->index,
 						    &devlink_port->param_list,
-						    param_id, init_val, 0);
+						    param_id, init_val,
+						    DEVLINK_CMD_PORT_PARAM_NEW);
 }
 EXPORT_SYMBOL_GPL(devlink_port_param_driverinit_value_set);
 
@@ -4865,7 +4896,6 @@ EXPORT_SYMBOL_GPL(devlink_port_param_driverinit_value_set);
  *	This function should be used by the driver to notify devlink on value
  *	change, excluding driverinit configuration mode.
  *	For driverinit configuration mode driver should use the function
- *	devlink_param_driverinit_value_set() instead.
  */
 void devlink_param_value_changed(struct devlink *devlink, u32 param_id)
 {
@@ -4874,10 +4904,37 @@ void devlink_param_value_changed(struct devlink *devlink, u32 param_id)
 	param_item = devlink_param_find_by_id(&devlink->param_list, param_id);
 	WARN_ON(!param_item);
 
-	devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW);
+	devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW);
 }
 EXPORT_SYMBOL_GPL(devlink_param_value_changed);
 
+/**
+ *     devlink_port_param_value_changed - notify devlink on a parameter's value
+ *                                      change. Should be called by the driver
+ *                                      right after the change.
+ *
+ *     @devlink_port: devlink_port
+ *     @param_id: parameter ID
+ *
+ *     This function should be used by the driver to notify devlink on value
+ *     change, excluding driverinit configuration mode.
+ *     For driverinit configuration mode driver should use the function
+ *     devlink_port_param_driverinit_value_set() instead.
+ */
+void devlink_port_param_value_changed(struct devlink_port *devlink_port,
+				      u32 param_id)
+{
+	struct devlink_param_item *param_item;
+
+	param_item = devlink_param_find_by_id(&devlink_port->param_list,
+					      param_id);
+	WARN_ON(!param_item);
+
+	devlink_param_notify(devlink_port->devlink, devlink_port->index,
+			     param_item, DEVLINK_CMD_PORT_PARAM_NEW);
+}
+EXPORT_SYMBOL_GPL(devlink_port_param_value_changed);
+
 /**
  *	devlink_param_value_str_fill - Safely fill-up the string preventing
  *				       from overflow of the preallocated buffer
-- 
cgit v1.2.3-59-g8ed1b


From 2d908b38d40921a03225d42fd6e48eb51bffd606 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Wed, 23 Jan 2019 11:28:19 +0100
Subject: serial: Add Tegra Combined UART driver

The Tegra Combined UART (TCU) is a mailbox-based mechanism that allows
multiplexing multiple "virtual UARTs" into a single hardware serial
port. The TCU is the primary serial port on Tegra194 devices.

Add a TCU driver utilizing the mailbox framework, as the used mailboxes
are part of Tegra HSP blocks that are already controlled by the Tegra
HSP mailbox driver.

Based on work by  Mikko Perttunen <mperttunen@nvidia.com>.

Signed-off-by: Thierry Reding <treding@nvidia.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/Kconfig       |  22 +++
 drivers/tty/serial/Makefile      |   1 +
 drivers/tty/serial/tegra-tcu.c   | 298 +++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/serial_core.h |   3 +
 4 files changed, 324 insertions(+)
 create mode 100644 drivers/tty/serial/tegra-tcu.c

(limited to 'include/uapi')

diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig
index 089a6f285d5e..72966bc0ac76 100644
--- a/drivers/tty/serial/Kconfig
+++ b/drivers/tty/serial/Kconfig
@@ -335,6 +335,28 @@ config SERIAL_TEGRA
 	  are enabled). This driver uses the APB DMA to achieve higher baudrate
 	  and better performance.
 
+config SERIAL_TEGRA_TCU
+	tristate "NVIDIA Tegra Combined UART"
+	depends on ARCH_TEGRA && TEGRA_HSP_MBOX
+	select SERIAL_CORE
+	help
+	  Support for the mailbox-based TCU (Tegra Combined UART) serial port.
+	  TCU is a virtual serial port that allows multiplexing multiple data
+	  streams into a single hardware serial port.
+
+config SERIAL_TEGRA_TCU_CONSOLE
+	bool "Support for console on a Tegra TCU serial port"
+	depends on SERIAL_TEGRA_TCU=y
+	select SERIAL_CORE_CONSOLE
+	default y
+	---help---
+	  If you say Y here, it will be possible to use a the Tegra TCU as the
+	  system console (the system console is the device which receives all
+	  kernel messages and warnings and which allows logins in single user
+	  mode).
+
+	  If unsure, say Y.
+
 config SERIAL_MAX3100
 	tristate "MAX3100 support"
 	depends on SPI
diff --git a/drivers/tty/serial/Makefile b/drivers/tty/serial/Makefile
index 1511e8a9f856..40b702aaa85e 100644
--- a/drivers/tty/serial/Makefile
+++ b/drivers/tty/serial/Makefile
@@ -77,6 +77,7 @@ obj-$(CONFIG_SERIAL_LANTIQ)	+= lantiq.o
 obj-$(CONFIG_SERIAL_XILINX_PS_UART) += xilinx_uartps.o
 obj-$(CONFIG_SERIAL_SIRFSOC) += sirfsoc_uart.o
 obj-$(CONFIG_SERIAL_TEGRA) += serial-tegra.o
+obj-$(CONFIG_SERIAL_TEGRA_TCU) += tegra-tcu.o
 obj-$(CONFIG_SERIAL_AR933X)   += ar933x_uart.o
 obj-$(CONFIG_SERIAL_EFM32_UART) += efm32-uart.o
 obj-$(CONFIG_SERIAL_ARC)	+= arc_uart.o
diff --git a/drivers/tty/serial/tegra-tcu.c b/drivers/tty/serial/tegra-tcu.c
new file mode 100644
index 000000000000..aaf8748a6147
--- /dev/null
+++ b/drivers/tty/serial/tegra-tcu.c
@@ -0,0 +1,298 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+ */
+
+#include <linux/console.h>
+#include <linux/mailbox_client.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/serial.h>
+#include <linux/serial_core.h>
+#include <linux/slab.h>
+#include <linux/tty.h>
+#include <linux/tty_flip.h>
+
+#define TCU_MBOX_BYTE(i, x)			((x) << (i * 8))
+#define TCU_MBOX_BYTE_V(x, i)			(((x) >> (i * 8)) & 0xff)
+#define TCU_MBOX_NUM_BYTES(x)			((x) << 24)
+#define TCU_MBOX_NUM_BYTES_V(x)			(((x) >> 24) & 0x3)
+
+struct tegra_tcu {
+	struct uart_driver driver;
+#if IS_ENABLED(CONFIG_SERIAL_TEGRA_TCU_CONSOLE)
+	struct console console;
+#endif
+	struct uart_port port;
+
+	struct mbox_client tx_client, rx_client;
+	struct mbox_chan *tx, *rx;
+};
+
+static unsigned int tegra_tcu_uart_tx_empty(struct uart_port *port)
+{
+	return TIOCSER_TEMT;
+}
+
+static void tegra_tcu_uart_set_mctrl(struct uart_port *port, unsigned int mctrl)
+{
+}
+
+static unsigned int tegra_tcu_uart_get_mctrl(struct uart_port *port)
+{
+	return 0;
+}
+
+static void tegra_tcu_uart_stop_tx(struct uart_port *port)
+{
+}
+
+static void tegra_tcu_write_one(struct tegra_tcu *tcu, u32 value,
+				unsigned int count)
+{
+	void *msg;
+
+	value |= TCU_MBOX_NUM_BYTES(count);
+	msg = (void *)(unsigned long)value;
+	mbox_send_message(tcu->tx, msg);
+	mbox_flush(tcu->tx, 1000);
+}
+
+static void tegra_tcu_write(struct tegra_tcu *tcu, const char *s,
+			    unsigned int count)
+{
+	unsigned int written = 0, i = 0;
+	bool insert_nl = false;
+	u32 value = 0;
+
+	while (i < count) {
+		if (insert_nl) {
+			value |= TCU_MBOX_BYTE(written++, '\n');
+			insert_nl = false;
+			i++;
+		} else if (s[i] == '\n') {
+			value |= TCU_MBOX_BYTE(written++, '\r');
+			insert_nl = true;
+		} else {
+			value |= TCU_MBOX_BYTE(written++, s[i++]);
+		}
+
+		if (written == 3) {
+			tegra_tcu_write_one(tcu, value, 3);
+			value = written = 0;
+		}
+	}
+
+	if (written)
+		tegra_tcu_write_one(tcu, value, written);
+}
+
+static void tegra_tcu_uart_start_tx(struct uart_port *port)
+{
+	struct tegra_tcu *tcu = port->private_data;
+	struct circ_buf *xmit = &port->state->xmit;
+	unsigned long count;
+
+	for (;;) {
+		count = CIRC_CNT_TO_END(xmit->head, xmit->tail, UART_XMIT_SIZE);
+		if (!count)
+			break;
+
+		tegra_tcu_write(tcu, &xmit->buf[xmit->tail], count);
+		xmit->tail = (xmit->tail + count) & (UART_XMIT_SIZE - 1);
+	}
+
+	uart_write_wakeup(port);
+}
+
+static void tegra_tcu_uart_stop_rx(struct uart_port *port)
+{
+}
+
+static void tegra_tcu_uart_break_ctl(struct uart_port *port, int ctl)
+{
+}
+
+static int tegra_tcu_uart_startup(struct uart_port *port)
+{
+	return 0;
+}
+
+static void tegra_tcu_uart_shutdown(struct uart_port *port)
+{
+}
+
+static void tegra_tcu_uart_set_termios(struct uart_port *port,
+				       struct ktermios *new,
+				       struct ktermios *old)
+{
+}
+
+static const struct uart_ops tegra_tcu_uart_ops = {
+	.tx_empty = tegra_tcu_uart_tx_empty,
+	.set_mctrl = tegra_tcu_uart_set_mctrl,
+	.get_mctrl = tegra_tcu_uart_get_mctrl,
+	.stop_tx = tegra_tcu_uart_stop_tx,
+	.start_tx = tegra_tcu_uart_start_tx,
+	.stop_rx = tegra_tcu_uart_stop_rx,
+	.break_ctl = tegra_tcu_uart_break_ctl,
+	.startup = tegra_tcu_uart_startup,
+	.shutdown = tegra_tcu_uart_shutdown,
+	.set_termios = tegra_tcu_uart_set_termios,
+};
+
+#if IS_ENABLED(CONFIG_SERIAL_TEGRA_TCU_CONSOLE)
+static void tegra_tcu_console_write(struct console *cons, const char *s,
+				    unsigned int count)
+{
+	struct tegra_tcu *tcu = container_of(cons, struct tegra_tcu, console);
+
+	tegra_tcu_write(tcu, s, count);
+}
+
+static int tegra_tcu_console_setup(struct console *cons, char *options)
+{
+	return 0;
+}
+#endif
+
+static void tegra_tcu_receive(struct mbox_client *cl, void *msg)
+{
+	struct tegra_tcu *tcu = container_of(cl, struct tegra_tcu, rx_client);
+	struct tty_port *port = &tcu->port.state->port;
+	u32 value = (u32)(unsigned long)msg;
+	unsigned int num_bytes, i;
+
+	num_bytes = TCU_MBOX_NUM_BYTES_V(value);
+
+	for (i = 0; i < num_bytes; i++)
+		tty_insert_flip_char(port, TCU_MBOX_BYTE_V(value, i),
+				     TTY_NORMAL);
+
+	tty_flip_buffer_push(port);
+}
+
+static int tegra_tcu_probe(struct platform_device *pdev)
+{
+	struct uart_port *port;
+	struct tegra_tcu *tcu;
+	int err;
+
+	tcu = devm_kzalloc(&pdev->dev, sizeof(*tcu), GFP_KERNEL);
+	if (!tcu)
+		return -ENOMEM;
+
+	tcu->tx_client.dev = &pdev->dev;
+	tcu->rx_client.dev = &pdev->dev;
+	tcu->rx_client.rx_callback = tegra_tcu_receive;
+
+	tcu->tx = mbox_request_channel_byname(&tcu->tx_client, "tx");
+	if (IS_ERR(tcu->tx)) {
+		err = PTR_ERR(tcu->tx);
+		dev_err(&pdev->dev, "failed to get tx mailbox: %d\n", err);
+		return err;
+	}
+
+	tcu->rx = mbox_request_channel_byname(&tcu->rx_client, "rx");
+	if (IS_ERR(tcu->rx)) {
+		err = PTR_ERR(tcu->rx);
+		dev_err(&pdev->dev, "failed to get rx mailbox: %d\n", err);
+		goto free_tx;
+	}
+
+#if IS_ENABLED(CONFIG_SERIAL_TEGRA_TCU_CONSOLE)
+	/* setup the console */
+	strcpy(tcu->console.name, "ttyTCU");
+	tcu->console.device = uart_console_device;
+	tcu->console.flags = CON_PRINTBUFFER | CON_ANYTIME;
+	tcu->console.index = -1;
+	tcu->console.write = tegra_tcu_console_write;
+	tcu->console.setup = tegra_tcu_console_setup;
+	tcu->console.data = &tcu->driver;
+#endif
+
+	/* setup the driver */
+	tcu->driver.owner = THIS_MODULE;
+	tcu->driver.driver_name = "tegra-tcu";
+	tcu->driver.dev_name = "ttyTCU";
+#if IS_ENABLED(CONFIG_SERIAL_TEGRA_TCU_CONSOLE)
+	tcu->driver.cons = &tcu->console;
+#endif
+	tcu->driver.nr = 1;
+
+	err = uart_register_driver(&tcu->driver);
+	if (err) {
+		dev_err(&pdev->dev, "failed to register UART driver: %d\n",
+			err);
+		goto free_rx;
+	}
+
+	/* setup the port */
+	port = &tcu->port;
+	spin_lock_init(&port->lock);
+	port->dev = &pdev->dev;
+	port->type = PORT_TEGRA_TCU;
+	port->ops = &tegra_tcu_uart_ops;
+	port->fifosize = 1;
+	port->iotype = UPIO_MEM;
+	port->flags = UPF_BOOT_AUTOCONF;
+	port->private_data = tcu;
+
+	err = uart_add_one_port(&tcu->driver, port);
+	if (err) {
+		dev_err(&pdev->dev, "failed to add UART port: %d\n", err);
+		goto unregister_uart;
+	}
+
+	platform_set_drvdata(pdev, tcu);
+#if IS_ENABLED(CONFIG_SERIAL_TEGRA_TCU_CONSOLE)
+	register_console(&tcu->console);
+#endif
+
+	return 0;
+
+unregister_uart:
+	uart_unregister_driver(&tcu->driver);
+free_rx:
+	mbox_free_channel(tcu->rx);
+free_tx:
+	mbox_free_channel(tcu->tx);
+
+	return err;
+}
+
+static int tegra_tcu_remove(struct platform_device *pdev)
+{
+	struct tegra_tcu *tcu = platform_get_drvdata(pdev);
+
+#if IS_ENABLED(CONFIG_SERIAL_TEGRA_TCU_CONSOLE)
+	unregister_console(&tcu->console);
+#endif
+	uart_remove_one_port(&tcu->driver, &tcu->port);
+	uart_unregister_driver(&tcu->driver);
+	mbox_free_channel(tcu->rx);
+	mbox_free_channel(tcu->tx);
+
+	return 0;
+}
+
+static const struct of_device_id tegra_tcu_match[] = {
+	{ .compatible = "nvidia,tegra194-tcu" },
+	{ }
+};
+
+static struct platform_driver tegra_tcu_driver = {
+	.driver = {
+		.name = "tegra-tcu",
+		.of_match_table = tegra_tcu_match,
+	},
+	.probe = tegra_tcu_probe,
+	.remove = tegra_tcu_remove,
+};
+module_platform_driver(tegra_tcu_driver);
+
+MODULE_AUTHOR("Mikko Perttunen <mperttunen@nvidia.com>");
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("NVIDIA Tegra Combined UART driver");
diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h
index df4a7534e239..6009ee2c2e99 100644
--- a/include/uapi/linux/serial_core.h
+++ b/include/uapi/linux/serial_core.h
@@ -79,6 +79,9 @@
 /* Nuvoton UART */
 #define PORT_NPCM	40
 
+/* NVIDIA Tegra Combined UART */
+#define PORT_TEGRA_TCU	41
+
 /* Intel EG20 */
 #define PORT_PCH_8LINE	44
 #define PORT_PCH_2LINE	45
-- 
cgit v1.2.3-59-g8ed1b


From 80df2704a375bb4b3c9c5cce9c00052361b16d61 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 28 Jan 2019 15:08:23 +0800
Subject: sctp: introduce SCTP_FUTURE/CURRENT/ALL_ASSOC

This patch is to add 3 constants SCTP_FUTURE_ASSOC,
SCTP_CURRENT_ASSOC and SCTP_ALL_ASSOC for reserved
assoc_ids, as defined in rfc6458#section-7.2.

And add the process for them when doing lookup and
inserting in sctp_id2assoc and sctp_assoc_set_id.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/sctp.h | 4 ++++
 net/sctp/associola.c      | 7 +++++--
 net/sctp/socket.c         | 2 +-
 3 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index d584073532b8..b8f2c4d56532 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -59,6 +59,10 @@
 
 typedef __s32 sctp_assoc_t;
 
+#define SCTP_FUTURE_ASSOC	0
+#define SCTP_CURRENT_ASSOC	1
+#define SCTP_ALL_ASSOC		2
+
 /* The following symbols come from the Sockets API Extensions for
  * SCTP <draft-ietf-tsvwg-sctpsocket-07.txt>.
  */
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 201c888604e4..b99f163e33ac 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -1651,8 +1651,11 @@ int sctp_assoc_set_id(struct sctp_association *asoc, gfp_t gfp)
 	if (preload)
 		idr_preload(gfp);
 	spin_lock_bh(&sctp_assocs_id_lock);
-	/* 0 is not a valid assoc_id, must be >= 1 */
-	ret = idr_alloc_cyclic(&sctp_assocs_id, asoc, 1, 0, GFP_NOWAIT);
+	/* 0, 1, 2 are used as SCTP_FUTURE_ASSOC, SCTP_CURRENT_ASSOC and
+	 * SCTP_ALL_ASSOC, so an available id must be > SCTP_ALL_ASSOC.
+	 */
+	ret = idr_alloc_cyclic(&sctp_assocs_id, asoc, SCTP_ALL_ASSOC + 1, 0,
+			       GFP_NOWAIT);
 	spin_unlock_bh(&sctp_assocs_id_lock);
 	if (preload)
 		idr_preload_end();
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index f93c3cf9e567..a52d132470ea 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -248,7 +248,7 @@ struct sctp_association *sctp_id2assoc(struct sock *sk, sctp_assoc_t id)
 	}
 
 	/* Otherwise this is a UDP-style socket. */
-	if (!id || (id == (sctp_assoc_t)-1))
+	if (id <= SCTP_ALL_ASSOC)
 		return NULL;
 
 	spin_lock_bh(&sctp_assocs_id_lock);
-- 
cgit v1.2.3-59-g8ed1b


From d0a060be573bfbf8753a15dca35497db5e968bb0 Mon Sep 17 00:00:00 2001
From: Kristina Martsenko <kristina.martsenko@arm.com>
Date: Wed, 30 Jan 2019 12:02:44 +0000
Subject: arm64: add ptrace regsets for ptrauth key management

Add two new ptrace regsets, which can be used to request and change the
pointer authentication keys of a thread. NT_ARM_PACA_KEYS gives access
to the instruction/data address keys, and NT_ARM_PACG_KEYS to the
generic authentication key. The keys are also part of the core dump file
of the process.

The regsets are only exposed if the kernel is compiled with
CONFIG_CHECKPOINT_RESTORE=y, as the only intended use case is
checkpointing and restoring processes that are using pointer
authentication. (This can be changed later if there are other use
cases.)

Reviewed-by: Dave Martin <Dave.Martin@arm.com>
Signed-off-by: Kristina Martsenko <kristina.martsenko@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 Documentation/arm64/pointer-authentication.txt |   5 +
 arch/arm64/include/uapi/asm/ptrace.h           |  13 +++
 arch/arm64/kernel/ptrace.c                     | 147 +++++++++++++++++++++++++
 include/uapi/linux/elf.h                       |   2 +
 4 files changed, 167 insertions(+)

(limited to 'include/uapi')

diff --git a/Documentation/arm64/pointer-authentication.txt b/Documentation/arm64/pointer-authentication.txt
index a25cd21290e9..5baca42ba146 100644
--- a/Documentation/arm64/pointer-authentication.txt
+++ b/Documentation/arm64/pointer-authentication.txt
@@ -78,6 +78,11 @@ bits can vary between the two. Note that the masks apply to TTBR0
 addresses, and are not valid to apply to TTBR1 addresses (e.g. kernel
 pointers).
 
+Additionally, when CONFIG_CHECKPOINT_RESTORE is also set, the kernel
+will expose the NT_ARM_PACA_KEYS and NT_ARM_PACG_KEYS regsets (struct
+user_pac_address_keys and struct user_pac_generic_keys). These can be
+used to get and set the keys for a thread.
+
 
 Virtualization
 --------------
diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h
index 28d77c9ed531..d78623acb649 100644
--- a/arch/arm64/include/uapi/asm/ptrace.h
+++ b/arch/arm64/include/uapi/asm/ptrace.h
@@ -233,6 +233,19 @@ struct user_pac_mask {
 	__u64		insn_mask;
 };
 
+/* pointer authentication keys (NT_ARM_PACA_KEYS, NT_ARM_PACG_KEYS) */
+
+struct user_pac_address_keys {
+	__uint128_t	apiakey;
+	__uint128_t	apibkey;
+	__uint128_t	apdakey;
+	__uint128_t	apdbkey;
+};
+
+struct user_pac_generic_keys {
+	__uint128_t	apgakey;
+};
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _UAPI__ASM_PTRACE_H */
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 9dce33b0e260..a86413be5a2d 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -979,6 +979,131 @@ static int pac_mask_get(struct task_struct *target,
 
 	return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &uregs, 0, -1);
 }
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
+static __uint128_t pac_key_to_user(const struct ptrauth_key *key)
+{
+	return (__uint128_t)key->hi << 64 | key->lo;
+}
+
+static struct ptrauth_key pac_key_from_user(__uint128_t ukey)
+{
+	struct ptrauth_key key = {
+		.lo = (unsigned long)ukey,
+		.hi = (unsigned long)(ukey >> 64),
+	};
+
+	return key;
+}
+
+static void pac_address_keys_to_user(struct user_pac_address_keys *ukeys,
+				     const struct ptrauth_keys *keys)
+{
+	ukeys->apiakey = pac_key_to_user(&keys->apia);
+	ukeys->apibkey = pac_key_to_user(&keys->apib);
+	ukeys->apdakey = pac_key_to_user(&keys->apda);
+	ukeys->apdbkey = pac_key_to_user(&keys->apdb);
+}
+
+static void pac_address_keys_from_user(struct ptrauth_keys *keys,
+				       const struct user_pac_address_keys *ukeys)
+{
+	keys->apia = pac_key_from_user(ukeys->apiakey);
+	keys->apib = pac_key_from_user(ukeys->apibkey);
+	keys->apda = pac_key_from_user(ukeys->apdakey);
+	keys->apdb = pac_key_from_user(ukeys->apdbkey);
+}
+
+static int pac_address_keys_get(struct task_struct *target,
+				const struct user_regset *regset,
+				unsigned int pos, unsigned int count,
+				void *kbuf, void __user *ubuf)
+{
+	struct ptrauth_keys *keys = &target->thread.keys_user;
+	struct user_pac_address_keys user_keys;
+
+	if (!system_supports_address_auth())
+		return -EINVAL;
+
+	pac_address_keys_to_user(&user_keys, keys);
+
+	return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				   &user_keys, 0, -1);
+}
+
+static int pac_address_keys_set(struct task_struct *target,
+				const struct user_regset *regset,
+				unsigned int pos, unsigned int count,
+				const void *kbuf, const void __user *ubuf)
+{
+	struct ptrauth_keys *keys = &target->thread.keys_user;
+	struct user_pac_address_keys user_keys;
+	int ret;
+
+	if (!system_supports_address_auth())
+		return -EINVAL;
+
+	pac_address_keys_to_user(&user_keys, keys);
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 &user_keys, 0, -1);
+	if (ret)
+		return ret;
+	pac_address_keys_from_user(keys, &user_keys);
+
+	return 0;
+}
+
+static void pac_generic_keys_to_user(struct user_pac_generic_keys *ukeys,
+				     const struct ptrauth_keys *keys)
+{
+	ukeys->apgakey = pac_key_to_user(&keys->apga);
+}
+
+static void pac_generic_keys_from_user(struct ptrauth_keys *keys,
+				       const struct user_pac_generic_keys *ukeys)
+{
+	keys->apga = pac_key_from_user(ukeys->apgakey);
+}
+
+static int pac_generic_keys_get(struct task_struct *target,
+				const struct user_regset *regset,
+				unsigned int pos, unsigned int count,
+				void *kbuf, void __user *ubuf)
+{
+	struct ptrauth_keys *keys = &target->thread.keys_user;
+	struct user_pac_generic_keys user_keys;
+
+	if (!system_supports_generic_auth())
+		return -EINVAL;
+
+	pac_generic_keys_to_user(&user_keys, keys);
+
+	return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				   &user_keys, 0, -1);
+}
+
+static int pac_generic_keys_set(struct task_struct *target,
+				const struct user_regset *regset,
+				unsigned int pos, unsigned int count,
+				const void *kbuf, const void __user *ubuf)
+{
+	struct ptrauth_keys *keys = &target->thread.keys_user;
+	struct user_pac_generic_keys user_keys;
+	int ret;
+
+	if (!system_supports_generic_auth())
+		return -EINVAL;
+
+	pac_generic_keys_to_user(&user_keys, keys);
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 &user_keys, 0, -1);
+	if (ret)
+		return ret;
+	pac_generic_keys_from_user(keys, &user_keys);
+
+	return 0;
+}
+#endif /* CONFIG_CHECKPOINT_RESTORE */
 #endif /* CONFIG_ARM64_PTR_AUTH */
 
 enum aarch64_regset {
@@ -995,6 +1120,10 @@ enum aarch64_regset {
 #endif
 #ifdef CONFIG_ARM64_PTR_AUTH
 	REGSET_PAC_MASK,
+#ifdef CONFIG_CHECKPOINT_RESTORE
+	REGSET_PACA_KEYS,
+	REGSET_PACG_KEYS,
+#endif
 #endif
 };
 
@@ -1074,6 +1203,24 @@ static const struct user_regset aarch64_regsets[] = {
 		.get = pac_mask_get,
 		/* this cannot be set dynamically */
 	},
+#ifdef CONFIG_CHECKPOINT_RESTORE
+	[REGSET_PACA_KEYS] = {
+		.core_note_type = NT_ARM_PACA_KEYS,
+		.n = sizeof(struct user_pac_address_keys) / sizeof(__uint128_t),
+		.size = sizeof(__uint128_t),
+		.align = sizeof(__uint128_t),
+		.get = pac_address_keys_get,
+		.set = pac_address_keys_set,
+	},
+	[REGSET_PACG_KEYS] = {
+		.core_note_type = NT_ARM_PACG_KEYS,
+		.n = sizeof(struct user_pac_generic_keys) / sizeof(__uint128_t),
+		.size = sizeof(__uint128_t),
+		.align = sizeof(__uint128_t),
+		.get = pac_generic_keys_get,
+		.set = pac_generic_keys_set,
+	},
+#endif
 #endif
 };
 
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index e4d6ddd93567..34c02e4290fe 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -421,6 +421,8 @@ typedef struct elf64_shdr {
 #define NT_ARM_SYSTEM_CALL	0x404	/* ARM system call number */
 #define NT_ARM_SVE	0x405		/* ARM Scalable Vector Extension registers */
 #define NT_ARM_PAC_MASK		0x406	/* ARM pointer authentication code masks */
+#define NT_ARM_PACA_KEYS	0x407	/* ARM pointer authentication address keys */
+#define NT_ARM_PACG_KEYS	0x408	/* ARM pointer authentication generic key */
 #define NT_ARC_V2	0x600		/* ARCv2 accumulator/extra registers */
 #define NT_VMCOREDD	0x700		/* Vmcore Device Dump Note */
 #define NT_MIPS_DSP	0x800		/* MIPS DSP ASE registers */
-- 
cgit v1.2.3-59-g8ed1b


From d83525ca62cf8ebe3271d14c36fb900c294274a2 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Thu, 31 Jan 2019 15:40:04 -0800
Subject: bpf: introduce bpf_spin_lock

Introduce 'struct bpf_spin_lock' and bpf_spin_lock/unlock() helpers to let
bpf program serialize access to other variables.

Example:
struct hash_elem {
    int cnt;
    struct bpf_spin_lock lock;
};
struct hash_elem * val = bpf_map_lookup_elem(&hash_map, &key);
if (val) {
    bpf_spin_lock(&val->lock);
    val->cnt++;
    bpf_spin_unlock(&val->lock);
}

Restrictions and safety checks:
- bpf_spin_lock is only allowed inside HASH and ARRAY maps.
- BTF description of the map is mandatory for safety analysis.
- bpf program can take one bpf_spin_lock at a time, since two or more can
  cause dead locks.
- only one 'struct bpf_spin_lock' is allowed per map element.
  It drastically simplifies implementation yet allows bpf program to use
  any number of bpf_spin_locks.
- when bpf_spin_lock is taken the calls (either bpf2bpf or helpers) are not allowed.
- bpf program must bpf_spin_unlock() before return.
- bpf program can access 'struct bpf_spin_lock' only via
  bpf_spin_lock()/bpf_spin_unlock() helpers.
- load/store into 'struct bpf_spin_lock lock;' field is not allowed.
- to use bpf_spin_lock() helper the BTF description of map value must be
  a struct and have 'struct bpf_spin_lock anyname;' field at the top level.
  Nested lock inside another struct is not allowed.
- syscall map_lookup doesn't copy bpf_spin_lock field to user space.
- syscall map_update and program map_update do not update bpf_spin_lock field.
- bpf_spin_lock cannot be on the stack or inside networking packet.
  bpf_spin_lock can only be inside HASH or ARRAY map value.
- bpf_spin_lock is available to root only and to all program types.
- bpf_spin_lock is not allowed in inner maps of map-in-map.
- ld_abs is not allowed inside spin_lock-ed region.
- tracing progs and socket filter progs cannot use bpf_spin_lock due to
  insufficient preemption checks

Implementation details:
- cgroup-bpf class of programs can nest with xdp/tc programs.
  Hence bpf_spin_lock is equivalent to spin_lock_irqsave.
  Other solutions to avoid nested bpf_spin_lock are possible.
  Like making sure that all networking progs run with softirq disabled.
  spin_lock_irqsave is the simplest and doesn't add overhead to the
  programs that don't use it.
- arch_spinlock_t is used when its implemented as queued_spin_lock
- archs can force their own arch_spinlock_t
- on architectures where queued_spin_lock is not available and
  sizeof(arch_spinlock_t) != sizeof(__u32) trivial lock is used.
- presence of bpf_spin_lock inside map value could have been indicated via
  extra flag during map_create, but specifying it via BTF is cleaner.
  It provides introspection for map key/value and reduces user mistakes.

Next steps:
- allow bpf_spin_lock in other map types (like cgroup local storage)
- introduce BPF_F_LOCK flag for bpf_map_update() syscall and helper
  to request kernel to grab bpf_spin_lock before rewriting the value.
  That will serialize access to map elements.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h          |  37 +++++++++-
 include/linux/bpf_verifier.h |   1 +
 include/linux/btf.h          |   1 +
 include/uapi/linux/bpf.h     |   7 +-
 kernel/Kconfig.locks         |   3 +
 kernel/bpf/arraymap.c        |   7 +-
 kernel/bpf/btf.c             |  42 +++++++++++
 kernel/bpf/core.c            |   2 +
 kernel/bpf/hashtab.c         |  21 +++---
 kernel/bpf/helpers.c         |  80 ++++++++++++++++++++
 kernel/bpf/map_in_map.c      |   5 ++
 kernel/bpf/syscall.c         |  21 +++++-
 kernel/bpf/verifier.c        | 169 ++++++++++++++++++++++++++++++++++++++++++-
 net/core/filter.c            |  16 +++-
 14 files changed, 386 insertions(+), 26 deletions(-)

(limited to 'include/uapi')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0394f1f9213b..2ae615b48bb8 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -72,14 +72,15 @@ struct bpf_map {
 	u32 value_size;
 	u32 max_entries;
 	u32 map_flags;
-	u32 pages;
+	int spin_lock_off; /* >=0 valid offset, <0 error */
 	u32 id;
 	int numa_node;
 	u32 btf_key_type_id;
 	u32 btf_value_type_id;
 	struct btf *btf;
+	u32 pages;
 	bool unpriv_array;
-	/* 55 bytes hole */
+	/* 51 bytes hole */
 
 	/* The 3rd and 4th cacheline with misc members to avoid false sharing
 	 * particularly with refcounting.
@@ -91,6 +92,34 @@ struct bpf_map {
 	char name[BPF_OBJ_NAME_LEN];
 };
 
+static inline bool map_value_has_spin_lock(const struct bpf_map *map)
+{
+	return map->spin_lock_off >= 0;
+}
+
+static inline void check_and_init_map_lock(struct bpf_map *map, void *dst)
+{
+	if (likely(!map_value_has_spin_lock(map)))
+		return;
+	*(struct bpf_spin_lock *)(dst + map->spin_lock_off) =
+		(struct bpf_spin_lock){};
+}
+
+/* copy everything but bpf_spin_lock */
+static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
+{
+	if (unlikely(map_value_has_spin_lock(map))) {
+		u32 off = map->spin_lock_off;
+
+		memcpy(dst, src, off);
+		memcpy(dst + off + sizeof(struct bpf_spin_lock),
+		       src + off + sizeof(struct bpf_spin_lock),
+		       map->value_size - off - sizeof(struct bpf_spin_lock));
+	} else {
+		memcpy(dst, src, map->value_size);
+	}
+}
+
 struct bpf_offload_dev;
 struct bpf_offloaded_map;
 
@@ -162,6 +191,7 @@ enum bpf_arg_type {
 	ARG_PTR_TO_CTX,		/* pointer to context */
 	ARG_ANYTHING,		/* any (initialized) argument is ok */
 	ARG_PTR_TO_SOCKET,	/* pointer to bpf_sock */
+	ARG_PTR_TO_SPIN_LOCK,	/* pointer to bpf_spin_lock */
 };
 
 /* type of values returned from helper functions */
@@ -879,7 +909,8 @@ extern const struct bpf_func_proto bpf_msg_redirect_hash_proto;
 extern const struct bpf_func_proto bpf_msg_redirect_map_proto;
 extern const struct bpf_func_proto bpf_sk_redirect_hash_proto;
 extern const struct bpf_func_proto bpf_sk_redirect_map_proto;
-
+extern const struct bpf_func_proto bpf_spin_lock_proto;
+extern const struct bpf_func_proto bpf_spin_unlock_proto;
 extern const struct bpf_func_proto bpf_get_local_storage_proto;
 
 /* Shared helpers among cBPF and eBPF. */
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 0620e418dde5..69f7a3449eda 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -148,6 +148,7 @@ struct bpf_verifier_state {
 	/* call stack tracking */
 	struct bpf_func_state *frame[MAX_CALL_FRAMES];
 	u32 curframe;
+	u32 active_spin_lock;
 	bool speculative;
 };
 
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 12502e25e767..455d31b55828 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -50,6 +50,7 @@ u32 btf_id(const struct btf *btf);
 bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
 			   const struct btf_member *m,
 			   u32 expected_offset, u32 expected_size);
+int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t);
 
 #ifdef CONFIG_BPF_SYSCALL
 const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 60b99b730a41..86f7c438d40f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2422,7 +2422,9 @@ union bpf_attr {
 	FN(map_peek_elem),		\
 	FN(msg_push_data),		\
 	FN(msg_pop_data),		\
-	FN(rc_pointer_rel),
+	FN(rc_pointer_rel),		\
+	FN(spin_lock),			\
+	FN(spin_unlock),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -3056,4 +3058,7 @@ struct bpf_line_info {
 	__u32	line_col;
 };
 
+struct bpf_spin_lock {
+	__u32	val;
+};
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 84d882f3e299..fbba478ae522 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -242,6 +242,9 @@ config QUEUED_SPINLOCKS
 	def_bool y if ARCH_USE_QUEUED_SPINLOCKS
 	depends on SMP
 
+config BPF_ARCH_SPINLOCK
+	bool
+
 config ARCH_USE_QUEUED_RWLOCKS
 	bool
 
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 25632a75d630..d6d979910a2a 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -270,9 +270,10 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
 		memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
 		       value, map->value_size);
 	else
-		memcpy(array->value +
-		       array->elem_size * (index & array->index_mask),
-		       value, map->value_size);
+		copy_map_value(map,
+			       array->value +
+			       array->elem_size * (index & array->index_mask),
+			       value);
 	return 0;
 }
 
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 3d661f0606fe..7019c1f05cab 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -355,6 +355,11 @@ static bool btf_type_is_struct(const struct btf_type *t)
 	return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION;
 }
 
+static bool __btf_type_is_struct(const struct btf_type *t)
+{
+	return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT;
+}
+
 static bool btf_type_is_array(const struct btf_type *t)
 {
 	return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY;
@@ -2045,6 +2050,43 @@ static void btf_struct_log(struct btf_verifier_env *env,
 	btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
 }
 
+/* find 'struct bpf_spin_lock' in map value.
+ * return >= 0 offset if found
+ * and < 0 in case of error
+ */
+int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
+{
+	const struct btf_member *member;
+	u32 i, off = -ENOENT;
+
+	if (!__btf_type_is_struct(t))
+		return -EINVAL;
+
+	for_each_member(i, t, member) {
+		const struct btf_type *member_type = btf_type_by_id(btf,
+								    member->type);
+		if (!__btf_type_is_struct(member_type))
+			continue;
+		if (member_type->size != sizeof(struct bpf_spin_lock))
+			continue;
+		if (strcmp(__btf_name_by_offset(btf, member_type->name_off),
+			   "bpf_spin_lock"))
+			continue;
+		if (off != -ENOENT)
+			/* only one 'struct bpf_spin_lock' is allowed */
+			return -E2BIG;
+		off = btf_member_bit_offset(t, member);
+		if (off % 8)
+			/* valid C code cannot generate such BTF */
+			return -EINVAL;
+		off /= 8;
+		if (off % __alignof__(struct bpf_spin_lock))
+			/* valid struct bpf_spin_lock will be 4 byte aligned */
+			return -EINVAL;
+	}
+	return off;
+}
+
 static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t,
 				u32 type_id, void *data, u8 bits_offset,
 				struct seq_file *m)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index f13c543b7b36..ef88b167959d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2002,6 +2002,8 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
 const struct bpf_func_proto bpf_map_push_elem_proto __weak;
 const struct bpf_func_proto bpf_map_pop_elem_proto __weak;
 const struct bpf_func_proto bpf_map_peek_elem_proto __weak;
+const struct bpf_func_proto bpf_spin_lock_proto __weak;
+const struct bpf_func_proto bpf_spin_unlock_proto __weak;
 
 const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
 const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 4b7c76765d9d..6d3b22c5ad68 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -718,21 +718,12 @@ static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
 	       BITS_PER_LONG == 64;
 }
 
-static u32 htab_size_value(const struct bpf_htab *htab, bool percpu)
-{
-	u32 size = htab->map.value_size;
-
-	if (percpu || fd_htab_map_needs_adjust(htab))
-		size = round_up(size, 8);
-	return size;
-}
-
 static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 					 void *value, u32 key_size, u32 hash,
 					 bool percpu, bool onallcpus,
 					 struct htab_elem *old_elem)
 {
-	u32 size = htab_size_value(htab, percpu);
+	u32 size = htab->map.value_size;
 	bool prealloc = htab_is_prealloc(htab);
 	struct htab_elem *l_new, **pl_new;
 	void __percpu *pptr;
@@ -770,10 +761,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 			l_new = ERR_PTR(-ENOMEM);
 			goto dec_count;
 		}
+		check_and_init_map_lock(&htab->map,
+					l_new->key + round_up(key_size, 8));
 	}
 
 	memcpy(l_new->key, key, key_size);
 	if (percpu) {
+		size = round_up(size, 8);
 		if (prealloc) {
 			pptr = htab_elem_get_ptr(l_new, key_size);
 		} else {
@@ -791,8 +785,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 
 		if (!prealloc)
 			htab_elem_set_ptr(l_new, key_size, pptr);
-	} else {
+	} else if (fd_htab_map_needs_adjust(htab)) {
+		size = round_up(size, 8);
 		memcpy(l_new->key + round_up(key_size, 8), value, size);
+	} else {
+		copy_map_value(&htab->map,
+			       l_new->key + round_up(key_size, 8),
+			       value);
 	}
 
 	l_new->hash = hash;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index a74972b07e74..fbe544761628 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -221,6 +221,86 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {
 	.arg2_type	= ARG_CONST_SIZE,
 };
 
+#if defined(CONFIG_QUEUED_SPINLOCKS) || defined(CONFIG_BPF_ARCH_SPINLOCK)
+
+static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
+{
+	arch_spinlock_t *l = (void *)lock;
+	union {
+		__u32 val;
+		arch_spinlock_t lock;
+	} u = { .lock = __ARCH_SPIN_LOCK_UNLOCKED };
+
+	compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0");
+	BUILD_BUG_ON(sizeof(*l) != sizeof(__u32));
+	BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32));
+	arch_spin_lock(l);
+}
+
+static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
+{
+	arch_spinlock_t *l = (void *)lock;
+
+	arch_spin_unlock(l);
+}
+
+#else
+
+static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
+{
+	atomic_t *l = (void *)lock;
+
+	BUILD_BUG_ON(sizeof(*l) != sizeof(*lock));
+	do {
+		atomic_cond_read_relaxed(l, !VAL);
+	} while (atomic_xchg(l, 1));
+}
+
+static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
+{
+	atomic_t *l = (void *)lock;
+
+	atomic_set_release(l, 0);
+}
+
+#endif
+
+static DEFINE_PER_CPU(unsigned long, irqsave_flags);
+
+notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__bpf_spin_lock(lock);
+	__this_cpu_write(irqsave_flags, flags);
+	return 0;
+}
+
+const struct bpf_func_proto bpf_spin_lock_proto = {
+	.func		= bpf_spin_lock,
+	.gpl_only	= false,
+	.ret_type	= RET_VOID,
+	.arg1_type	= ARG_PTR_TO_SPIN_LOCK,
+};
+
+notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
+{
+	unsigned long flags;
+
+	flags = __this_cpu_read(irqsave_flags);
+	__bpf_spin_unlock(lock);
+	local_irq_restore(flags);
+	return 0;
+}
+
+const struct bpf_func_proto bpf_spin_unlock_proto = {
+	.func		= bpf_spin_unlock,
+	.gpl_only	= false,
+	.ret_type	= RET_VOID,
+	.arg1_type	= ARG_PTR_TO_SPIN_LOCK,
+};
+
 #ifdef CONFIG_CGROUPS
 BPF_CALL_0(bpf_get_current_cgroup_id)
 {
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 52378d3e34b3..583346a0ab29 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -37,6 +37,11 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
 		return ERR_PTR(-EINVAL);
 	}
 
+	if (map_value_has_spin_lock(inner_map)) {
+		fdput(f);
+		return ERR_PTR(-ENOTSUPP);
+	}
+
 	inner_map_meta_size = sizeof(*inner_map_meta);
 	/* In some cases verifier needs to access beyond just base map. */
 	if (inner_map->ops == &array_map_ops)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b155cd17c1bd..ebf0a673cb83 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -463,7 +463,7 @@ int map_check_no_btf(const struct bpf_map *map,
 	return -ENOTSUPP;
 }
 
-static int map_check_btf(const struct bpf_map *map, const struct btf *btf,
+static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 			 u32 btf_key_id, u32 btf_value_id)
 {
 	const struct btf_type *key_type, *value_type;
@@ -478,6 +478,21 @@ static int map_check_btf(const struct bpf_map *map, const struct btf *btf,
 	if (!value_type || value_size != map->value_size)
 		return -EINVAL;
 
+	map->spin_lock_off = btf_find_spin_lock(btf, value_type);
+
+	if (map_value_has_spin_lock(map)) {
+		if (map->map_type != BPF_MAP_TYPE_HASH &&
+		    map->map_type != BPF_MAP_TYPE_ARRAY)
+			return -ENOTSUPP;
+		if (map->spin_lock_off + sizeof(struct bpf_spin_lock) >
+		    map->value_size) {
+			WARN_ONCE(1,
+				  "verifier bug spin_lock_off %d value_size %d\n",
+				  map->spin_lock_off, map->value_size);
+			return -EFAULT;
+		}
+	}
+
 	if (map->ops->map_check_btf)
 		ret = map->ops->map_check_btf(map, btf, key_type, value_type);
 
@@ -542,6 +557,8 @@ static int map_create(union bpf_attr *attr)
 		map->btf = btf;
 		map->btf_key_type_id = attr->btf_key_type_id;
 		map->btf_value_type_id = attr->btf_value_type_id;
+	} else {
+		map->spin_lock_off = -EINVAL;
 	}
 
 	err = security_bpf_map_alloc(map);
@@ -740,7 +757,7 @@ static int map_lookup_elem(union bpf_attr *attr)
 			err = -ENOENT;
 		} else {
 			err = 0;
-			memcpy(value, ptr, value_size);
+			copy_map_value(map, value, ptr);
 		}
 		rcu_read_unlock();
 	}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8c1c21cd50b4..38892bdee651 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -213,6 +213,7 @@ struct bpf_call_arg_meta {
 	s64 msize_smax_value;
 	u64 msize_umax_value;
 	int ptr_id;
+	int func_id;
 };
 
 static DEFINE_MUTEX(bpf_verifier_lock);
@@ -351,6 +352,12 @@ static bool reg_is_refcounted(const struct bpf_reg_state *reg)
 	return type_is_refcounted(reg->type);
 }
 
+static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
+{
+	return reg->type == PTR_TO_MAP_VALUE &&
+		map_value_has_spin_lock(reg->map_ptr);
+}
+
 static bool reg_is_refcounted_or_null(const struct bpf_reg_state *reg)
 {
 	return type_is_refcounted_or_null(reg->type);
@@ -712,6 +719,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	}
 	dst_state->speculative = src->speculative;
 	dst_state->curframe = src->curframe;
+	dst_state->active_spin_lock = src->active_spin_lock;
 	for (i = 0; i <= src->curframe; i++) {
 		dst = dst_state->frame[i];
 		if (!dst) {
@@ -1483,6 +1491,21 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
 	if (err)
 		verbose(env, "R%d max value is outside of the array range\n",
 			regno);
+
+	if (map_value_has_spin_lock(reg->map_ptr)) {
+		u32 lock = reg->map_ptr->spin_lock_off;
+
+		/* if any part of struct bpf_spin_lock can be touched by
+		 * load/store reject this program.
+		 * To check that [x1, x2) overlaps with [y1, y2)
+		 * it is sufficient to check x1 < y2 && y1 < x2.
+		 */
+		if (reg->smin_value + off < lock + sizeof(struct bpf_spin_lock) &&
+		     lock < reg->umax_value + off + size) {
+			verbose(env, "bpf_spin_lock cannot be accessed directly by load/store\n");
+			return -EACCES;
+		}
+	}
 	return err;
 }
 
@@ -2192,6 +2215,91 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 	}
 }
 
+/* Implementation details:
+ * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL
+ * Two bpf_map_lookups (even with the same key) will have different reg->id.
+ * For traditional PTR_TO_MAP_VALUE the verifier clears reg->id after
+ * value_or_null->value transition, since the verifier only cares about
+ * the range of access to valid map value pointer and doesn't care about actual
+ * address of the map element.
+ * For maps with 'struct bpf_spin_lock' inside map value the verifier keeps
+ * reg->id > 0 after value_or_null->value transition. By doing so
+ * two bpf_map_lookups will be considered two different pointers that
+ * point to different bpf_spin_locks.
+ * The verifier allows taking only one bpf_spin_lock at a time to avoid
+ * dead-locks.
+ * Since only one bpf_spin_lock is allowed the checks are simpler than
+ * reg_is_refcounted() logic. The verifier needs to remember only
+ * one spin_lock instead of array of acquired_refs.
+ * cur_state->active_spin_lock remembers which map value element got locked
+ * and clears it after bpf_spin_unlock.
+ */
+static int process_spin_lock(struct bpf_verifier_env *env, int regno,
+			     bool is_lock)
+{
+	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+	struct bpf_verifier_state *cur = env->cur_state;
+	bool is_const = tnum_is_const(reg->var_off);
+	struct bpf_map *map = reg->map_ptr;
+	u64 val = reg->var_off.value;
+
+	if (reg->type != PTR_TO_MAP_VALUE) {
+		verbose(env, "R%d is not a pointer to map_value\n", regno);
+		return -EINVAL;
+	}
+	if (!is_const) {
+		verbose(env,
+			"R%d doesn't have constant offset. bpf_spin_lock has to be at the constant offset\n",
+			regno);
+		return -EINVAL;
+	}
+	if (!map->btf) {
+		verbose(env,
+			"map '%s' has to have BTF in order to use bpf_spin_lock\n",
+			map->name);
+		return -EINVAL;
+	}
+	if (!map_value_has_spin_lock(map)) {
+		if (map->spin_lock_off == -E2BIG)
+			verbose(env,
+				"map '%s' has more than one 'struct bpf_spin_lock'\n",
+				map->name);
+		else if (map->spin_lock_off == -ENOENT)
+			verbose(env,
+				"map '%s' doesn't have 'struct bpf_spin_lock'\n",
+				map->name);
+		else
+			verbose(env,
+				"map '%s' is not a struct type or bpf_spin_lock is mangled\n",
+				map->name);
+		return -EINVAL;
+	}
+	if (map->spin_lock_off != val + reg->off) {
+		verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock'\n",
+			val + reg->off);
+		return -EINVAL;
+	}
+	if (is_lock) {
+		if (cur->active_spin_lock) {
+			verbose(env,
+				"Locking two bpf_spin_locks are not allowed\n");
+			return -EINVAL;
+		}
+		cur->active_spin_lock = reg->id;
+	} else {
+		if (!cur->active_spin_lock) {
+			verbose(env, "bpf_spin_unlock without taking a lock\n");
+			return -EINVAL;
+		}
+		if (cur->active_spin_lock != reg->id) {
+			verbose(env, "bpf_spin_unlock of different lock\n");
+			return -EINVAL;
+		}
+		cur->active_spin_lock = 0;
+	}
+	return 0;
+}
+
 static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
 {
 	return type == ARG_PTR_TO_MEM ||
@@ -2268,6 +2376,17 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			return -EFAULT;
 		}
 		meta->ptr_id = reg->id;
+	} else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {
+		if (meta->func_id == BPF_FUNC_spin_lock) {
+			if (process_spin_lock(env, regno, true))
+				return -EACCES;
+		} else if (meta->func_id == BPF_FUNC_spin_unlock) {
+			if (process_spin_lock(env, regno, false))
+				return -EACCES;
+		} else {
+			verbose(env, "verifier internal error\n");
+			return -EFAULT;
+		}
 	} else if (arg_type_is_mem_ptr(arg_type)) {
 		expected_type = PTR_TO_STACK;
 		/* One exception here. In case function allows for NULL to be
@@ -2887,6 +3006,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 		return err;
 	}
 
+	meta.func_id = func_id;
 	/* check args */
 	err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta);
 	if (err)
@@ -4473,7 +4593,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
 		} else if (reg->type == PTR_TO_SOCKET_OR_NULL) {
 			reg->type = PTR_TO_SOCKET;
 		}
-		if (is_null || !reg_is_refcounted(reg)) {
+		if (is_null || !(reg_is_refcounted(reg) ||
+				 reg_may_point_to_spin_lock(reg))) {
 			/* We don't need id from this point onwards anymore,
 			 * thus we should better reset it, so that state
 			 * pruning has chances to take effect.
@@ -4871,6 +4992,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 		return err;
 	}
 
+	if (env->cur_state->active_spin_lock) {
+		verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n");
+		return -EINVAL;
+	}
+
 	if (regs[BPF_REG_6].type != PTR_TO_CTX) {
 		verbose(env,
 			"at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
@@ -5607,8 +5733,11 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 	case PTR_TO_MAP_VALUE:
 		/* If the new min/max/var_off satisfy the old ones and
 		 * everything else matches, we are OK.
-		 * We don't care about the 'id' value, because nothing
-		 * uses it for PTR_TO_MAP_VALUE (only for ..._OR_NULL)
+		 * 'id' is not compared, since it's only used for maps with
+		 * bpf_spin_lock inside map element and in such cases if
+		 * the rest of the prog is valid for one map element then
+		 * it's valid for all map elements regardless of the key
+		 * used in bpf_map_lookup()
 		 */
 		return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
 		       range_within(rold, rcur) &&
@@ -5811,6 +5940,9 @@ static bool states_equal(struct bpf_verifier_env *env,
 	if (old->speculative && !cur->speculative)
 		return false;
 
+	if (old->active_spin_lock != cur->active_spin_lock)
+		return false;
+
 	/* for states to be equal callsites have to be the same
 	 * and all frame states need to be equivalent
 	 */
@@ -6229,6 +6361,12 @@ static int do_check(struct bpf_verifier_env *env)
 					return -EINVAL;
 				}
 
+				if (env->cur_state->active_spin_lock &&
+				    (insn->src_reg == BPF_PSEUDO_CALL ||
+				     insn->imm != BPF_FUNC_spin_unlock)) {
+					verbose(env, "function calls are not allowed while holding a lock\n");
+					return -EINVAL;
+				}
 				if (insn->src_reg == BPF_PSEUDO_CALL)
 					err = check_func_call(env, insn, &env->insn_idx);
 				else
@@ -6259,6 +6397,11 @@ static int do_check(struct bpf_verifier_env *env)
 					return -EINVAL;
 				}
 
+				if (env->cur_state->active_spin_lock) {
+					verbose(env, "bpf_spin_unlock is missing\n");
+					return -EINVAL;
+				}
+
 				if (state->curframe) {
 					/* exit from nested function */
 					env->prev_insn_idx = env->insn_idx;
@@ -6356,6 +6499,19 @@ static int check_map_prealloc(struct bpf_map *map)
 		!(map->map_flags & BPF_F_NO_PREALLOC);
 }
 
+static bool is_tracing_prog_type(enum bpf_prog_type type)
+{
+	switch (type) {
+	case BPF_PROG_TYPE_KPROBE:
+	case BPF_PROG_TYPE_TRACEPOINT:
+	case BPF_PROG_TYPE_PERF_EVENT:
+	case BPF_PROG_TYPE_RAW_TRACEPOINT:
+		return true;
+	default:
+		return false;
+	}
+}
+
 static int check_map_prog_compatibility(struct bpf_verifier_env *env,
 					struct bpf_map *map,
 					struct bpf_prog *prog)
@@ -6378,6 +6534,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
 		}
 	}
 
+	if ((is_tracing_prog_type(prog->type) ||
+	     prog->type == BPF_PROG_TYPE_SOCKET_FILTER) &&
+	    map_value_has_spin_lock(map)) {
+		verbose(env, "tracing progs cannot use bpf_spin_lock yet\n");
+		return -EINVAL;
+	}
+
 	if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&
 	    !bpf_offload_prog_map_match(prog, map)) {
 		verbose(env, "offload device mismatch between prog and map\n");
diff --git a/net/core/filter.c b/net/core/filter.c
index 41984ad4b9b4..3a49f68eda10 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5314,10 +5314,20 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		return &bpf_tail_call_proto;
 	case BPF_FUNC_ktime_get_ns:
 		return &bpf_ktime_get_ns_proto;
+	default:
+		break;
+	}
+
+	if (!capable(CAP_SYS_ADMIN))
+		return NULL;
+
+	switch (func_id) {
+	case BPF_FUNC_spin_lock:
+		return &bpf_spin_lock_proto;
+	case BPF_FUNC_spin_unlock:
+		return &bpf_spin_unlock_proto;
 	case BPF_FUNC_trace_printk:
-		if (capable(CAP_SYS_ADMIN))
-			return bpf_get_trace_printk_proto();
-		/* else, fall through */
+		return bpf_get_trace_printk_proto();
 	default:
 		return NULL;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 96049f3afd50fe8db69fa0068cdca822e747b1e4 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Thu, 31 Jan 2019 15:40:09 -0800
Subject: bpf: introduce BPF_F_LOCK flag

Introduce BPF_F_LOCK flag for map_lookup and map_update syscall commands
and for map_update() helper function.
In all these cases take a lock of existing element (which was provided
in BTF description) before copying (in or out) the rest of map value.

Implementation details that are part of uapi:

Array:
The array map takes the element lock for lookup/update.

Hash:
hash map also takes the lock for lookup/update and tries to avoid the bucket lock.
If old element exists it takes the element lock and updates the element in place.
If element doesn't exist it allocates new one and inserts into hash table
while holding the bucket lock.
In rare case the hashmap has to take both the bucket lock and the element lock
to update old value in place.

Cgroup local storage:
It is similar to array. update in place and lookup are done with lock taken.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h        |  2 ++
 include/uapi/linux/bpf.h   |  1 +
 kernel/bpf/arraymap.c      | 24 ++++++++++++++++--------
 kernel/bpf/hashtab.c       | 42 +++++++++++++++++++++++++++++++++++++++---
 kernel/bpf/helpers.c       | 16 ++++++++++++++++
 kernel/bpf/local_storage.c | 14 +++++++++++++-
 kernel/bpf/syscall.c       | 25 +++++++++++++++++++++++--
 7 files changed, 110 insertions(+), 14 deletions(-)

(limited to 'include/uapi')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2ae615b48bb8..bd169a7bcc93 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -119,6 +119,8 @@ static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
 		memcpy(dst, src, map->value_size);
 	}
 }
+void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
+			   bool lock_src);
 
 struct bpf_offload_dev;
 struct bpf_offloaded_map;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 86f7c438d40f..1777fa0c61e4 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -267,6 +267,7 @@ enum bpf_attach_type {
 #define BPF_ANY		0 /* create new element or update existing */
 #define BPF_NOEXIST	1 /* create new element if it didn't exist */
 #define BPF_EXIST	2 /* update existing element */
+#define BPF_F_LOCK	4 /* spin_lock-ed map_lookup/map_update */
 
 /* flags for BPF_MAP_CREATE command */
 #define BPF_F_NO_PREALLOC	(1U << 0)
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index d6d979910a2a..c72e0d8e1e65 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -253,8 +253,9 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	u32 index = *(u32 *)key;
+	char *val;
 
-	if (unlikely(map_flags > BPF_EXIST))
+	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
 		/* unknown flags */
 		return -EINVAL;
 
@@ -262,18 +263,25 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
 		/* all elements were pre-allocated, cannot insert a new one */
 		return -E2BIG;
 
-	if (unlikely(map_flags == BPF_NOEXIST))
+	if (unlikely(map_flags & BPF_NOEXIST))
 		/* all elements already exist */
 		return -EEXIST;
 
-	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+	if (unlikely((map_flags & BPF_F_LOCK) &&
+		     !map_value_has_spin_lock(map)))
+		return -EINVAL;
+
+	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
 		memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
 		       value, map->value_size);
-	else
-		copy_map_value(map,
-			       array->value +
-			       array->elem_size * (index & array->index_mask),
-			       value);
+	} else {
+		val = array->value +
+			array->elem_size * (index & array->index_mask);
+		if (map_flags & BPF_F_LOCK)
+			copy_map_value_locked(map, val, value, false);
+		else
+			copy_map_value(map, val, value);
+	}
 	return 0;
 }
 
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 6d3b22c5ad68..937776531998 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -804,11 +804,11 @@ dec_count:
 static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
 		       u64 map_flags)
 {
-	if (l_old && map_flags == BPF_NOEXIST)
+	if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
 		/* elem already exists */
 		return -EEXIST;
 
-	if (!l_old && map_flags == BPF_EXIST)
+	if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
 		/* elem doesn't exist, cannot update it */
 		return -ENOENT;
 
@@ -827,7 +827,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	u32 key_size, hash;
 	int ret;
 
-	if (unlikely(map_flags > BPF_EXIST))
+	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
 		/* unknown flags */
 		return -EINVAL;
 
@@ -840,6 +840,28 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	b = __select_bucket(htab, hash);
 	head = &b->head;
 
+	if (unlikely(map_flags & BPF_F_LOCK)) {
+		if (unlikely(!map_value_has_spin_lock(map)))
+			return -EINVAL;
+		/* find an element without taking the bucket lock */
+		l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
+					      htab->n_buckets);
+		ret = check_flags(htab, l_old, map_flags);
+		if (ret)
+			return ret;
+		if (l_old) {
+			/* grab the element lock and update value in place */
+			copy_map_value_locked(map,
+					      l_old->key + round_up(key_size, 8),
+					      value, false);
+			return 0;
+		}
+		/* fall through, grab the bucket lock and lookup again.
+		 * 99.9% chance that the element won't be found,
+		 * but second lookup under lock has to be done.
+		 */
+	}
+
 	/* bpf_map_update_elem() can be called in_irq() */
 	raw_spin_lock_irqsave(&b->lock, flags);
 
@@ -849,6 +871,20 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	if (ret)
 		goto err;
 
+	if (unlikely(l_old && (map_flags & BPF_F_LOCK))) {
+		/* first lookup without the bucket lock didn't find the element,
+		 * but second lookup with the bucket lock found it.
+		 * This case is highly unlikely, but has to be dealt with:
+		 * grab the element lock in addition to the bucket lock
+		 * and update element in place
+		 */
+		copy_map_value_locked(map,
+				      l_old->key + round_up(key_size, 8),
+				      value, false);
+		ret = 0;
+		goto err;
+	}
+
 	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
 				l_old);
 	if (IS_ERR(l_new)) {
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index fbe544761628..a411fc17d265 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -301,6 +301,22 @@ const struct bpf_func_proto bpf_spin_unlock_proto = {
 	.arg1_type	= ARG_PTR_TO_SPIN_LOCK,
 };
 
+void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
+			   bool lock_src)
+{
+	struct bpf_spin_lock *lock;
+
+	if (lock_src)
+		lock = src + map->spin_lock_off;
+	else
+		lock = dst + map->spin_lock_off;
+	preempt_disable();
+	____bpf_spin_lock(lock);
+	copy_map_value(map, dst, src);
+	____bpf_spin_unlock(lock);
+	preempt_enable();
+}
+
 #ifdef CONFIG_CGROUPS
 BPF_CALL_0(bpf_get_current_cgroup_id)
 {
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 0295427f06e2..6b572e2de7fb 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -131,7 +131,14 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
 	struct bpf_cgroup_storage *storage;
 	struct bpf_storage_buffer *new;
 
-	if (flags != BPF_ANY && flags != BPF_EXIST)
+	if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST | BPF_NOEXIST)))
+		return -EINVAL;
+
+	if (unlikely(flags & BPF_NOEXIST))
+		return -EINVAL;
+
+	if (unlikely((flags & BPF_F_LOCK) &&
+		     !map_value_has_spin_lock(map)))
 		return -EINVAL;
 
 	storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map,
@@ -139,6 +146,11 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
 	if (!storage)
 		return -ENOENT;
 
+	if (flags & BPF_F_LOCK) {
+		copy_map_value_locked(map, storage->buf->data, value, false);
+		return 0;
+	}
+
 	new = kmalloc_node(sizeof(struct bpf_storage_buffer) +
 			   map->value_size,
 			   __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b29e6dc44650..0834958f1dc4 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -682,7 +682,7 @@ static void *__bpf_copy_key(void __user *ukey, u64 key_size)
 }
 
 /* last field in 'union bpf_attr' used by this command */
-#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
+#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags
 
 static int map_lookup_elem(union bpf_attr *attr)
 {
@@ -698,6 +698,9 @@ static int map_lookup_elem(union bpf_attr *attr)
 	if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
 		return -EINVAL;
 
+	if (attr->flags & ~BPF_F_LOCK)
+		return -EINVAL;
+
 	f = fdget(ufd);
 	map = __bpf_map_get(f);
 	if (IS_ERR(map))
@@ -708,6 +711,12 @@ static int map_lookup_elem(union bpf_attr *attr)
 		goto err_put;
 	}
 
+	if ((attr->flags & BPF_F_LOCK) &&
+	    !map_value_has_spin_lock(map)) {
+		err = -EINVAL;
+		goto err_put;
+	}
+
 	key = __bpf_copy_key(ukey, map->key_size);
 	if (IS_ERR(key)) {
 		err = PTR_ERR(key);
@@ -758,7 +767,13 @@ static int map_lookup_elem(union bpf_attr *attr)
 			err = -ENOENT;
 		} else {
 			err = 0;
-			copy_map_value(map, value, ptr);
+			if (attr->flags & BPF_F_LOCK)
+				/* lock 'ptr' and copy everything but lock */
+				copy_map_value_locked(map, value, ptr, true);
+			else
+				copy_map_value(map, value, ptr);
+			/* mask lock, since value wasn't zero inited */
+			check_and_init_map_lock(map, value);
 		}
 		rcu_read_unlock();
 	}
@@ -818,6 +833,12 @@ static int map_update_elem(union bpf_attr *attr)
 		goto err_put;
 	}
 
+	if ((attr->flags & BPF_F_LOCK) &&
+	    !map_value_has_spin_lock(map)) {
+		err = -EINVAL;
+		goto err_put;
+	}
+
 	key = __bpf_copy_key(ukey, map->key_size);
 	if (IS_ERR(key)) {
 		err = PTR_ERR(key);
-- 
cgit v1.2.3-59-g8ed1b


From fb99bce7120014307dde57b3d7def6977a9a62a1 Mon Sep 17 00:00:00 2001
From: Dave Watson <davejwatson@fb.com>
Date: Wed, 30 Jan 2019 21:58:05 +0000
Subject: net: tls: Support 256 bit keys

Wire up support for 256 bit keys from the setsockopt to the crypto
framework

Signed-off-by: Dave Watson <davejwatson@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h                 |  5 +++-
 include/uapi/linux/tls.h          | 15 ++++++++++
 net/tls/tls_main.c                | 33 +++++++++++++++++++--
 net/tls/tls_sw.c                  | 29 +++++++++++++++---
 tools/testing/selftests/net/tls.c | 62 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 137 insertions(+), 7 deletions(-)

(limited to 'include/uapi')

diff --git a/include/net/tls.h b/include/net/tls.h
index 4592606e136a..da616db48413 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -206,7 +206,10 @@ struct cipher_context {
 
 union tls_crypto_context {
 	struct tls_crypto_info info;
-	struct tls12_crypto_info_aes_gcm_128 aes_gcm_128;
+	union {
+		struct tls12_crypto_info_aes_gcm_128 aes_gcm_128;
+		struct tls12_crypto_info_aes_gcm_256 aes_gcm_256;
+	};
 };
 
 struct tls_context {
diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h
index ff02287495ac..9affceaa3db4 100644
--- a/include/uapi/linux/tls.h
+++ b/include/uapi/linux/tls.h
@@ -59,6 +59,13 @@
 #define TLS_CIPHER_AES_GCM_128_TAG_SIZE		16
 #define TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE		8
 
+#define TLS_CIPHER_AES_GCM_256				52
+#define TLS_CIPHER_AES_GCM_256_IV_SIZE			8
+#define TLS_CIPHER_AES_GCM_256_KEY_SIZE		32
+#define TLS_CIPHER_AES_GCM_256_SALT_SIZE		4
+#define TLS_CIPHER_AES_GCM_256_TAG_SIZE		16
+#define TLS_CIPHER_AES_GCM_256_REC_SEQ_SIZE		8
+
 #define TLS_SET_RECORD_TYPE	1
 #define TLS_GET_RECORD_TYPE	2
 
@@ -75,4 +82,12 @@ struct tls12_crypto_info_aes_gcm_128 {
 	unsigned char rec_seq[TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE];
 };
 
+struct tls12_crypto_info_aes_gcm_256 {
+	struct tls_crypto_info info;
+	unsigned char iv[TLS_CIPHER_AES_GCM_256_IV_SIZE];
+	unsigned char key[TLS_CIPHER_AES_GCM_256_KEY_SIZE];
+	unsigned char salt[TLS_CIPHER_AES_GCM_256_SALT_SIZE];
+	unsigned char rec_seq[TLS_CIPHER_AES_GCM_256_REC_SEQ_SIZE];
+};
+
 #endif /* _UAPI_LINUX_TLS_H */
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index d36d095cbcf0..0f028cfdf835 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -372,6 +372,30 @@ static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval,
 			rc = -EFAULT;
 		break;
 	}
+	case TLS_CIPHER_AES_GCM_256: {
+		struct tls12_crypto_info_aes_gcm_256 *
+		  crypto_info_aes_gcm_256 =
+		  container_of(crypto_info,
+			       struct tls12_crypto_info_aes_gcm_256,
+			       info);
+
+		if (len != sizeof(*crypto_info_aes_gcm_256)) {
+			rc = -EINVAL;
+			goto out;
+		}
+		lock_sock(sk);
+		memcpy(crypto_info_aes_gcm_256->iv,
+		       ctx->tx.iv + TLS_CIPHER_AES_GCM_256_SALT_SIZE,
+		       TLS_CIPHER_AES_GCM_256_IV_SIZE);
+		memcpy(crypto_info_aes_gcm_256->rec_seq, ctx->tx.rec_seq,
+		       TLS_CIPHER_AES_GCM_256_REC_SEQ_SIZE);
+		release_sock(sk);
+		if (copy_to_user(optval,
+				 crypto_info_aes_gcm_256,
+				 sizeof(*crypto_info_aes_gcm_256)))
+			rc = -EFAULT;
+		break;
+	}
 	default:
 		rc = -EINVAL;
 	}
@@ -412,6 +436,7 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval,
 {
 	struct tls_crypto_info *crypto_info;
 	struct tls_context *ctx = tls_get_ctx(sk);
+	size_t optsize;
 	int rc = 0;
 	int conf;
 
@@ -444,8 +469,12 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval,
 	}
 
 	switch (crypto_info->cipher_type) {
-	case TLS_CIPHER_AES_GCM_128: {
-		if (optlen != sizeof(struct tls12_crypto_info_aes_gcm_128)) {
+	case TLS_CIPHER_AES_GCM_128:
+	case TLS_CIPHER_AES_GCM_256: {
+		optsize = crypto_info->cipher_type == TLS_CIPHER_AES_GCM_128 ?
+			sizeof(struct tls12_crypto_info_aes_gcm_128) :
+			sizeof(struct tls12_crypto_info_aes_gcm_256);
+		if (optlen != optsize) {
 			rc = -EINVAL;
 			goto err_crypto_info;
 		}
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 3f2a6af27e62..9326c06c2ffe 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1999,6 +1999,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 {
 	struct tls_crypto_info *crypto_info;
 	struct tls12_crypto_info_aes_gcm_128 *gcm_128_info;
+	struct tls12_crypto_info_aes_gcm_256 *gcm_256_info;
 	struct tls_sw_context_tx *sw_ctx_tx = NULL;
 	struct tls_sw_context_rx *sw_ctx_rx = NULL;
 	struct cipher_context *cctx;
@@ -2006,7 +2007,8 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 	struct strp_callbacks cb;
 	u16 nonce_size, tag_size, iv_size, rec_seq_size;
 	struct crypto_tfm *tfm;
-	char *iv, *rec_seq;
+	char *iv, *rec_seq, *key, *salt;
+	size_t keysize;
 	int rc = 0;
 
 	if (!ctx) {
@@ -2067,6 +2069,24 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 		 ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->rec_seq;
 		gcm_128_info =
 			(struct tls12_crypto_info_aes_gcm_128 *)crypto_info;
+		keysize = TLS_CIPHER_AES_GCM_128_KEY_SIZE;
+		key = gcm_128_info->key;
+		salt = gcm_128_info->salt;
+		break;
+	}
+	case TLS_CIPHER_AES_GCM_256: {
+		nonce_size = TLS_CIPHER_AES_GCM_256_IV_SIZE;
+		tag_size = TLS_CIPHER_AES_GCM_256_TAG_SIZE;
+		iv_size = TLS_CIPHER_AES_GCM_256_IV_SIZE;
+		iv = ((struct tls12_crypto_info_aes_gcm_256 *)crypto_info)->iv;
+		rec_seq_size = TLS_CIPHER_AES_GCM_256_REC_SEQ_SIZE;
+		rec_seq =
+		 ((struct tls12_crypto_info_aes_gcm_256 *)crypto_info)->rec_seq;
+		gcm_256_info =
+			(struct tls12_crypto_info_aes_gcm_256 *)crypto_info;
+		keysize = TLS_CIPHER_AES_GCM_256_KEY_SIZE;
+		key = gcm_256_info->key;
+		salt = gcm_256_info->salt;
 		break;
 	}
 	default:
@@ -2090,7 +2110,8 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 		rc = -ENOMEM;
 		goto free_priv;
 	}
-	memcpy(cctx->iv, gcm_128_info->salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE);
+	/* Note: 128 & 256 bit salt are the same size */
+	memcpy(cctx->iv, salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE);
 	memcpy(cctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size);
 	cctx->rec_seq_size = rec_seq_size;
 	cctx->rec_seq = kmemdup(rec_seq, rec_seq_size, GFP_KERNEL);
@@ -2110,8 +2131,8 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 
 	ctx->push_pending_record = tls_sw_push_pending_record;
 
-	rc = crypto_aead_setkey(*aead, gcm_128_info->key,
-				TLS_CIPHER_AES_GCM_128_KEY_SIZE);
+	rc = crypto_aead_setkey(*aead, key, keysize);
+
 	if (rc)
 		goto free_aead;
 
diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c
index ff68ed19c0ef..c356f481de79 100644
--- a/tools/testing/selftests/net/tls.c
+++ b/tools/testing/selftests/net/tls.c
@@ -763,4 +763,66 @@ TEST_F(tls, control_msg)
 	EXPECT_EQ(memcmp(buf, test_str, send_len), 0);
 }
 
+TEST(keysizes) {
+	struct tls12_crypto_info_aes_gcm_256 tls12;
+	struct sockaddr_in addr;
+	int sfd, ret, fd, cfd;
+	socklen_t len;
+	bool notls;
+
+	notls = false;
+	len = sizeof(addr);
+
+	memset(&tls12, 0, sizeof(tls12));
+	tls12.info.version = TLS_1_2_VERSION;
+	tls12.info.cipher_type = TLS_CIPHER_AES_GCM_256;
+
+	addr.sin_family = AF_INET;
+	addr.sin_addr.s_addr = htonl(INADDR_ANY);
+	addr.sin_port = 0;
+
+	fd = socket(AF_INET, SOCK_STREAM, 0);
+	sfd = socket(AF_INET, SOCK_STREAM, 0);
+
+	ret = bind(sfd, &addr, sizeof(addr));
+	ASSERT_EQ(ret, 0);
+	ret = listen(sfd, 10);
+	ASSERT_EQ(ret, 0);
+
+	ret = getsockname(sfd, &addr, &len);
+	ASSERT_EQ(ret, 0);
+
+	ret = connect(fd, &addr, sizeof(addr));
+	ASSERT_EQ(ret, 0);
+
+	ret = setsockopt(fd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls"));
+	if (ret != 0) {
+		notls = true;
+		printf("Failure setting TCP_ULP, testing without tls\n");
+	}
+
+	if (!notls) {
+		ret = setsockopt(fd, SOL_TLS, TLS_TX, &tls12,
+				 sizeof(tls12));
+		EXPECT_EQ(ret, 0);
+	}
+
+	cfd = accept(sfd, &addr, &len);
+	ASSERT_GE(cfd, 0);
+
+	if (!notls) {
+		ret = setsockopt(cfd, IPPROTO_TCP, TCP_ULP, "tls",
+				 sizeof("tls"));
+		EXPECT_EQ(ret, 0);
+
+		ret = setsockopt(cfd, SOL_TLS, TLS_RX, &tls12,
+				 sizeof(tls12));
+		EXPECT_EQ(ret, 0);
+	}
+
+	close(sfd);
+	close(fd);
+	close(cfd);
+}
+
 TEST_HARNESS_MAIN
-- 
cgit v1.2.3-59-g8ed1b


From 130b392c6cd6b2aed1b7eb32253d4920babb4891 Mon Sep 17 00:00:00 2001
From: Dave Watson <davejwatson@fb.com>
Date: Wed, 30 Jan 2019 21:58:31 +0000
Subject: net: tls: Add tls 1.3 support

TLS 1.3 has minor changes from TLS 1.2 at the record layer.

* Header now hardcodes the same version and application content type in
  the header.
* The real content type is appended after the data, before encryption (or
  after decryption).
* The IV is xored with the sequence number, instead of concatinating four
  bytes of IV with the explicit IV.
* Zero-padding:  No exlicit length is given, we search backwards from the
  end of the decrypted data for the first non-zero byte, which is the
  content type.  Currently recv supports reading zero-padding, but there
  is no way for send to add zero padding.

Signed-off-by: Dave Watson <davejwatson@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h             |  66 +++++++++++++++++-------
 include/uapi/linux/tls.h      |   4 ++
 net/tls/tls_device.c          |   5 +-
 net/tls/tls_device_fallback.c |   3 +-
 net/tls/tls_main.c            |   3 +-
 net/tls/tls_sw.c              | 116 ++++++++++++++++++++++++++++++++++--------
 6 files changed, 154 insertions(+), 43 deletions(-)

(limited to 'include/uapi')

diff --git a/include/net/tls.h b/include/net/tls.h
index 754b130672f0..004bf01ce868 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -119,6 +119,9 @@ struct tls_rec {
 	/* AAD | msg_encrypted.sg.data (data contains overhead for hdr & iv & tag) */
 	struct scatterlist sg_aead_out[2];
 
+	char content_type;
+	struct scatterlist sg_content_type;
+
 	char aad_space[TLS_AAD_SPACE_SIZE];
 	u8 iv_data[TLS_CIPHER_AES_GCM_128_IV_SIZE +
 		   TLS_CIPHER_AES_GCM_128_SALT_SIZE];
@@ -203,6 +206,7 @@ struct cipher_context {
 	u16 rec_seq_size;
 	char *rec_seq;
 	u16 aad_size;
+	u16 tail_size;
 };
 
 union tls_crypto_context {
@@ -397,49 +401,77 @@ static inline bool tls_bigint_increment(unsigned char *seq, int len)
 }
 
 static inline void tls_advance_record_sn(struct sock *sk,
-					 struct cipher_context *ctx)
+					 struct cipher_context *ctx,
+					 int version)
 {
 	if (tls_bigint_increment(ctx->rec_seq, ctx->rec_seq_size))
 		tls_err_abort(sk, EBADMSG);
-	tls_bigint_increment(ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
-			     ctx->iv_size);
+
+	if (version != TLS_1_3_VERSION) {
+		tls_bigint_increment(ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
+				     ctx->iv_size);
+	}
 }
 
 static inline void tls_fill_prepend(struct tls_context *ctx,
 			     char *buf,
 			     size_t plaintext_len,
-			     unsigned char record_type)
+			     unsigned char record_type,
+			     int version)
 {
 	size_t pkt_len, iv_size = ctx->tx.iv_size;
 
-	pkt_len = plaintext_len + iv_size + ctx->tx.tag_size;
+	pkt_len = plaintext_len + ctx->tx.tag_size;
+	if (version != TLS_1_3_VERSION) {
+		pkt_len += iv_size;
+
+		memcpy(buf + TLS_NONCE_OFFSET,
+		       ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv_size);
+	}
 
 	/* we cover nonce explicit here as well, so buf should be of
 	 * size KTLS_DTLS_HEADER_SIZE + KTLS_DTLS_NONCE_EXPLICIT_SIZE
 	 */
-	buf[0] = record_type;
-	buf[1] = TLS_VERSION_MINOR(ctx->crypto_send.info.version);
-	buf[2] = TLS_VERSION_MAJOR(ctx->crypto_send.info.version);
+	buf[0] = version == TLS_1_3_VERSION ?
+		   TLS_RECORD_TYPE_DATA : record_type;
+	/* Note that VERSION must be TLS_1_2 for both TLS1.2 and TLS1.3 */
+	buf[1] = TLS_1_2_VERSION_MINOR;
+	buf[2] = TLS_1_2_VERSION_MAJOR;
 	/* we can use IV for nonce explicit according to spec */
 	buf[3] = pkt_len >> 8;
 	buf[4] = pkt_len & 0xFF;
-	memcpy(buf + TLS_NONCE_OFFSET,
-	       ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv_size);
 }
 
 static inline void tls_make_aad(char *buf,
 				size_t size,
 				char *record_sequence,
 				int record_sequence_size,
-				unsigned char record_type)
+				unsigned char record_type,
+				int version)
+{
+	if (version != TLS_1_3_VERSION) {
+		memcpy(buf, record_sequence, record_sequence_size);
+		buf += 8;
+	} else {
+		size += TLS_CIPHER_AES_GCM_128_TAG_SIZE;
+	}
+
+	buf[0] = version == TLS_1_3_VERSION ?
+		  TLS_RECORD_TYPE_DATA : record_type;
+	buf[1] = TLS_1_2_VERSION_MAJOR;
+	buf[2] = TLS_1_2_VERSION_MINOR;
+	buf[3] = size >> 8;
+	buf[4] = size & 0xFF;
+}
+
+static inline void xor_iv_with_seq(int version, char *iv, char *seq)
 {
-	memcpy(buf, record_sequence, record_sequence_size);
+	int i;
 
-	buf[8] = record_type;
-	buf[9] = TLS_1_2_VERSION_MAJOR;
-	buf[10] = TLS_1_2_VERSION_MINOR;
-	buf[11] = size >> 8;
-	buf[12] = size & 0xFF;
+	if (version == TLS_1_3_VERSION) {
+		for (i = 0; i < 8; i++)
+			iv[i + 4] ^= seq[i];
+	}
 }
 
 static inline struct tls_context *tls_get_ctx(const struct sock *sk)
diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h
index 9affceaa3db4..401d6f01de6a 100644
--- a/include/uapi/linux/tls.h
+++ b/include/uapi/linux/tls.h
@@ -51,6 +51,10 @@
 #define TLS_1_2_VERSION_MINOR	0x3
 #define TLS_1_2_VERSION		TLS_VERSION_NUMBER(TLS_1_2)
 
+#define TLS_1_3_VERSION_MAJOR	0x3
+#define TLS_1_3_VERSION_MINOR	0x4
+#define TLS_1_3_VERSION		TLS_VERSION_NUMBER(TLS_1_3)
+
 /* Supported ciphers */
 #define TLS_CIPHER_AES_GCM_128				51
 #define TLS_CIPHER_AES_GCM_128_IV_SIZE			8
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index d753e362d2d9..7ee9008b2187 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -257,7 +257,8 @@ static int tls_push_record(struct sock *sk,
 	tls_fill_prepend(ctx,
 			 skb_frag_address(frag),
 			 record->len - ctx->tx.prepend_size,
-			 record_type);
+			 record_type,
+			 ctx->crypto_send.info.version);
 
 	/* HW doesn't care about the data in the tag, because it fills it. */
 	dummy_tag_frag.page = skb_frag_page(frag);
@@ -270,7 +271,7 @@ static int tls_push_record(struct sock *sk,
 	spin_unlock_irq(&offload_ctx->lock);
 	offload_ctx->open_record = NULL;
 	set_bit(TLS_PENDING_CLOSED_RECORD, &ctx->flags);
-	tls_advance_record_sn(sk, &ctx->tx);
+	tls_advance_record_sn(sk, &ctx->tx, ctx->crypto_send.info.version);
 
 	for (i = 0; i < record->num_frags; i++) {
 		frag = &record->frags[i];
diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c
index 450a6dbc5a88..54c3a758f2a7 100644
--- a/net/tls/tls_device_fallback.c
+++ b/net/tls/tls_device_fallback.c
@@ -73,7 +73,8 @@ static int tls_enc_record(struct aead_request *aead_req,
 	len -= TLS_CIPHER_AES_GCM_128_IV_SIZE;
 
 	tls_make_aad(aad, len - TLS_CIPHER_AES_GCM_128_TAG_SIZE,
-		     (char *)&rcd_sn, sizeof(rcd_sn), buf[0]);
+		(char *)&rcd_sn, sizeof(rcd_sn), buf[0],
+		TLS_1_2_VERSION);
 
 	memcpy(iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, buf + TLS_HEADER_SIZE,
 	       TLS_CIPHER_AES_GCM_128_IV_SIZE);
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 0f028cfdf835..d1c2fd9a3f63 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -463,7 +463,8 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval,
 	}
 
 	/* check version */
-	if (crypto_info->version != TLS_1_2_VERSION) {
+	if (crypto_info->version != TLS_1_2_VERSION &&
+	    crypto_info->version != TLS_1_3_VERSION) {
 		rc = -ENOTSUPP;
 		goto err_crypto_info;
 	}
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 34f3523f668e..06d7ae97b929 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -120,6 +120,34 @@ static int skb_nsg(struct sk_buff *skb, int offset, int len)
         return __skb_nsg(skb, offset, len, 0);
 }
 
+static int padding_length(struct tls_sw_context_rx *ctx,
+			  struct tls_context *tls_ctx, struct sk_buff *skb)
+{
+	struct strp_msg *rxm = strp_msg(skb);
+	int sub = 0;
+
+	/* Determine zero-padding length */
+	if (tls_ctx->crypto_recv.info.version == TLS_1_3_VERSION) {
+		char content_type = 0;
+		int err;
+		int back = 17;
+
+		while (content_type == 0) {
+			if (back > rxm->full_len)
+				return -EBADMSG;
+			err = skb_copy_bits(skb,
+					    rxm->offset + rxm->full_len - back,
+					    &content_type, 1);
+			if (content_type)
+				break;
+			sub++;
+			back++;
+		}
+		ctx->control = content_type;
+	}
+	return sub;
+}
+
 static void tls_decrypt_done(struct crypto_async_request *req, int err)
 {
 	struct aead_request *aead_req = (struct aead_request *)req;
@@ -142,7 +170,7 @@ static void tls_decrypt_done(struct crypto_async_request *req, int err)
 		tls_err_abort(skb->sk, err);
 	} else {
 		struct strp_msg *rxm = strp_msg(skb);
-
+		rxm->full_len -= padding_length(ctx, tls_ctx, skb);
 		rxm->offset += tls_ctx->rx.prepend_size;
 		rxm->full_len -= tls_ctx->rx.overhead_size;
 	}
@@ -448,6 +476,8 @@ static int tls_do_encryption(struct sock *sk,
 	int rc;
 
 	memcpy(rec->iv_data, tls_ctx->tx.iv, sizeof(rec->iv_data));
+	xor_iv_with_seq(tls_ctx->crypto_send.info.version, rec->iv_data,
+			tls_ctx->tx.rec_seq);
 
 	sge->offset += tls_ctx->tx.prepend_size;
 	sge->length -= tls_ctx->tx.prepend_size;
@@ -483,7 +513,8 @@ static int tls_do_encryption(struct sock *sk,
 
 	/* Unhook the record from context if encryption is not failure */
 	ctx->open_rec = NULL;
-	tls_advance_record_sn(sk, &tls_ctx->tx);
+	tls_advance_record_sn(sk, &tls_ctx->tx,
+			      tls_ctx->crypto_send.info.version);
 	return rc;
 }
 
@@ -640,7 +671,17 @@ static int tls_push_record(struct sock *sk, int flags,
 
 	i = msg_pl->sg.end;
 	sk_msg_iter_var_prev(i);
-	sg_mark_end(sk_msg_elem(msg_pl, i));
+
+	rec->content_type = record_type;
+	if (tls_ctx->crypto_send.info.version == TLS_1_3_VERSION) {
+		/* Add content type to end of message.  No padding added */
+		sg_set_buf(&rec->sg_content_type, &rec->content_type, 1);
+		sg_mark_end(&rec->sg_content_type);
+		sg_chain(msg_pl->sg.data, msg_pl->sg.end + 1,
+			 &rec->sg_content_type);
+	} else {
+		sg_mark_end(sk_msg_elem(msg_pl, i));
+	}
 
 	i = msg_pl->sg.start;
 	sg_chain(rec->sg_aead_in, 2, rec->inplace_crypto ?
@@ -653,18 +694,22 @@ static int tls_push_record(struct sock *sk, int flags,
 	i = msg_en->sg.start;
 	sg_chain(rec->sg_aead_out, 2, &msg_en->sg.data[i]);
 
-	tls_make_aad(rec->aad_space, msg_pl->sg.size,
+	tls_make_aad(rec->aad_space, msg_pl->sg.size + tls_ctx->tx.tail_size,
 		     tls_ctx->tx.rec_seq, tls_ctx->tx.rec_seq_size,
-		     record_type);
+		     record_type,
+		     tls_ctx->crypto_send.info.version);
 
 	tls_fill_prepend(tls_ctx,
 			 page_address(sg_page(&msg_en->sg.data[i])) +
-			 msg_en->sg.data[i].offset, msg_pl->sg.size,
-			 record_type);
+			 msg_en->sg.data[i].offset,
+			 msg_pl->sg.size + tls_ctx->tx.tail_size,
+			 record_type,
+			 tls_ctx->crypto_send.info.version);
 
 	tls_ctx->pending_open_record_frags = false;
 
-	rc = tls_do_encryption(sk, tls_ctx, ctx, req, msg_pl->sg.size, i);
+	rc = tls_do_encryption(sk, tls_ctx, ctx, req,
+			       msg_pl->sg.size + tls_ctx->tx.tail_size, i);
 	if (rc < 0) {
 		if (rc != -EINPROGRESS) {
 			tls_err_abort(sk, EBADMSG);
@@ -1292,7 +1337,8 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
 	u8 *aad, *iv, *mem = NULL;
 	struct scatterlist *sgin = NULL;
 	struct scatterlist *sgout = NULL;
-	const int data_len = rxm->full_len - tls_ctx->rx.overhead_size;
+	const int data_len = rxm->full_len - tls_ctx->rx.overhead_size +
+		tls_ctx->rx.tail_size;
 
 	if (*zc && (out_iov || out_sg)) {
 		if (out_iov)
@@ -1343,12 +1389,20 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
 		kfree(mem);
 		return err;
 	}
-	memcpy(iv, tls_ctx->rx.iv, TLS_CIPHER_AES_GCM_128_SALT_SIZE);
+	if (tls_ctx->crypto_recv.info.version == TLS_1_3_VERSION)
+		memcpy(iv, tls_ctx->rx.iv, crypto_aead_ivsize(ctx->aead_recv));
+	else
+		memcpy(iv, tls_ctx->rx.iv, TLS_CIPHER_AES_GCM_128_SALT_SIZE);
+
+	xor_iv_with_seq(tls_ctx->crypto_recv.info.version, iv,
+			tls_ctx->rx.rec_seq);
 
 	/* Prepare AAD */
-	tls_make_aad(aad, rxm->full_len - tls_ctx->rx.overhead_size,
+	tls_make_aad(aad, rxm->full_len - tls_ctx->rx.overhead_size +
+		     tls_ctx->rx.tail_size,
 		     tls_ctx->rx.rec_seq, tls_ctx->rx.rec_seq_size,
-		     ctx->control);
+		     ctx->control,
+		     tls_ctx->crypto_recv.info.version);
 
 	/* Prepare sgin */
 	sg_init_table(sgin, n_sgin);
@@ -1405,6 +1459,7 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+	int version = tls_ctx->crypto_recv.info.version;
 	struct strp_msg *rxm = strp_msg(skb);
 	int err = 0;
 
@@ -1417,13 +1472,17 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
 		err = decrypt_internal(sk, skb, dest, NULL, chunk, zc, async);
 		if (err < 0) {
 			if (err == -EINPROGRESS)
-				tls_advance_record_sn(sk, &tls_ctx->rx);
+				tls_advance_record_sn(sk, &tls_ctx->rx,
+						      version);
 
 			return err;
 		}
+
+		rxm->full_len -= padding_length(ctx, tls_ctx, skb);
+
 		rxm->offset += tls_ctx->rx.prepend_size;
 		rxm->full_len -= tls_ctx->rx.overhead_size;
-		tls_advance_record_sn(sk, &tls_ctx->rx);
+		tls_advance_record_sn(sk, &tls_ctx->rx, version);
 		ctx->decrypted = true;
 		ctx->saved_data_ready(sk);
 	} else {
@@ -1611,7 +1670,8 @@ int tls_sw_recvmsg(struct sock *sk,
 		to_decrypt = rxm->full_len - tls_ctx->rx.overhead_size;
 
 		if (to_decrypt <= len && !is_kvec && !is_peek &&
-		    ctx->control == TLS_RECORD_TYPE_DATA)
+		    ctx->control == TLS_RECORD_TYPE_DATA &&
+		    tls_ctx->crypto_recv.info.version != TLS_1_3_VERSION)
 			zc = true;
 
 		err = decrypt_skb_update(sk, skb, &msg->msg_iter,
@@ -1835,9 +1895,12 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
 
 	data_len = ((header[4] & 0xFF) | (header[3] << 8));
 
-	cipher_overhead = tls_ctx->rx.tag_size + tls_ctx->rx.iv_size;
+	cipher_overhead = tls_ctx->rx.tag_size;
+	if (tls_ctx->crypto_recv.info.version != TLS_1_3_VERSION)
+		cipher_overhead += tls_ctx->rx.iv_size;
 
-	if (data_len > TLS_MAX_PAYLOAD_SIZE + cipher_overhead) {
+	if (data_len > TLS_MAX_PAYLOAD_SIZE + cipher_overhead +
+	    tls_ctx->rx.tail_size) {
 		ret = -EMSGSIZE;
 		goto read_failure;
 	}
@@ -1846,12 +1909,12 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
 		goto read_failure;
 	}
 
-	if (header[1] != TLS_VERSION_MINOR(tls_ctx->crypto_recv.info.version) ||
-	    header[2] != TLS_VERSION_MAJOR(tls_ctx->crypto_recv.info.version)) {
+	/* Note that both TLS1.3 and TLS1.2 use TLS_1_2 version here */
+	if (header[1] != TLS_1_2_VERSION_MINOR ||
+	    header[2] != TLS_1_2_VERSION_MAJOR) {
 		ret = -EINVAL;
 		goto read_failure;
 	}
-
 #ifdef CONFIG_TLS_DEVICE
 	handle_device_resync(strp->sk, TCP_SKB_CB(skb)->seq + rxm->offset,
 			     *(u64*)tls_ctx->rx.rec_seq);
@@ -2100,10 +2163,19 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 		goto free_priv;
 	}
 
-	cctx->aad_size = TLS_AAD_SPACE_SIZE;
+	if (crypto_info->version == TLS_1_3_VERSION) {
+		nonce_size = 0;
+		cctx->aad_size = TLS_HEADER_SIZE;
+		cctx->tail_size = 1;
+	} else {
+		cctx->aad_size = TLS_AAD_SPACE_SIZE;
+		cctx->tail_size = 0;
+	}
+
 	cctx->prepend_size = TLS_HEADER_SIZE + nonce_size;
 	cctx->tag_size = tag_size;
-	cctx->overhead_size = cctx->prepend_size + cctx->tag_size;
+	cctx->overhead_size = cctx->prepend_size + cctx->tag_size +
+		cctx->tail_size;
 	cctx->iv_size = iv_size;
 	cctx->iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
 			   GFP_KERNEL);
-- 
cgit v1.2.3-59-g8ed1b


From f9cf22882c606f3ffe06f620bb6d03b9eff18d3d Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 31 Jan 2019 10:50:40 -0800
Subject: devlink: add device information API

ethtool -i has served us well for a long time, but its showing
its limitations more and more. The device information should
also be reported per device not per-netdev.

Lay foundation for a simple devlink-based way of reading device
info. Add driver name and device serial number as initial pieces
of information exposed via this new API.

v3:
 - rename helpers (Jiri);
 - rename driver name attr (Jiri);
 - remove double spacing in commit message (Jiri).
RFC v2:
 - wrap the skb into an opaque structure (Jiri);
 - allow the serial number of be any length (Jiri & Andrew);
 - add driver name (Jonathan).

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h        |  18 +++++++
 include/uapi/linux/devlink.h |   5 ++
 net/core/devlink.c           | 112 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 135 insertions(+)

(limited to 'include/uapi')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 85c9eabaf056..a6d0a530483d 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -429,6 +429,7 @@ enum devlink_param_wol_types {
 }
 
 struct devlink_region;
+struct devlink_info_req;
 
 typedef void devlink_snapshot_data_dest_t(const void *data);
 
@@ -484,6 +485,8 @@ struct devlink_ops {
 	int (*eswitch_encap_mode_get)(struct devlink *devlink, u8 *p_encap_mode);
 	int (*eswitch_encap_mode_set)(struct devlink *devlink, u8 encap_mode,
 				      struct netlink_ext_ack *extack);
+	int (*info_get)(struct devlink *devlink, struct devlink_info_req *req,
+			struct netlink_ext_ack *extack);
 };
 
 static inline void *devlink_priv(struct devlink *devlink)
@@ -607,6 +610,10 @@ u32 devlink_region_shapshot_id_get(struct devlink *devlink);
 int devlink_region_snapshot_create(struct devlink_region *region, u64 data_len,
 				   u8 *data, u32 snapshot_id,
 				   devlink_snapshot_data_dest_t *data_destructor);
+int devlink_info_serial_number_put(struct devlink_info_req *req,
+				   const char *sn);
+int devlink_info_driver_name_put(struct devlink_info_req *req,
+				 const char *name);
 
 #else
 
@@ -905,6 +912,17 @@ devlink_region_snapshot_create(struct devlink_region *region, u64 data_len,
 	return 0;
 }
 
+static inline int
+devlink_info_driver_name_put(struct devlink_info_req *req, const char *name)
+{
+	return 0;
+}
+
+static inline int
+devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn)
+{
+	return 0;
+}
 #endif
 
 #endif /* _NET_DEVLINK_H_ */
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 61b4447a6c5b..142710d45093 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -94,6 +94,8 @@ enum devlink_command {
 	DEVLINK_CMD_PORT_PARAM_NEW,
 	DEVLINK_CMD_PORT_PARAM_DEL,
 
+	DEVLINK_CMD_INFO_GET,		/* can dump */
+
 	/* add new commands above here */
 	__DEVLINK_CMD_MAX,
 	DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1
@@ -290,6 +292,9 @@ enum devlink_attr {
 	DEVLINK_ATTR_REGION_CHUNK_ADDR,         /* u64 */
 	DEVLINK_ATTR_REGION_CHUNK_LEN,          /* u64 */
 
+	DEVLINK_ATTR_INFO_DRIVER_NAME,		/* string */
+	DEVLINK_ATTR_INFO_SERIAL_NUMBER,	/* string */
+
 	/* add new attributes above here, update the policy in devlink.c */
 
 	__DEVLINK_ATTR_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index e6f170caf449..f456f6aa3d40 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -3714,6 +3714,110 @@ out:
 	return 0;
 }
 
+struct devlink_info_req {
+	struct sk_buff *msg;
+};
+
+int devlink_info_driver_name_put(struct devlink_info_req *req, const char *name)
+{
+	return nla_put_string(req->msg, DEVLINK_ATTR_INFO_DRIVER_NAME, name);
+}
+EXPORT_SYMBOL_GPL(devlink_info_driver_name_put);
+
+int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn)
+{
+	return nla_put_string(req->msg, DEVLINK_ATTR_INFO_SERIAL_NUMBER, sn);
+}
+EXPORT_SYMBOL_GPL(devlink_info_serial_number_put);
+
+static int
+devlink_nl_info_fill(struct sk_buff *msg, struct devlink *devlink,
+		     enum devlink_command cmd, u32 portid,
+		     u32 seq, int flags, struct netlink_ext_ack *extack)
+{
+	struct devlink_info_req req;
+	void *hdr;
+	int err;
+
+	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	err = -EMSGSIZE;
+	if (devlink_nl_put_handle(msg, devlink))
+		goto err_cancel_msg;
+
+	req.msg = msg;
+	err = devlink->ops->info_get(devlink, &req, extack);
+	if (err)
+		goto err_cancel_msg;
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+err_cancel_msg:
+	genlmsg_cancel(msg, hdr);
+	return err;
+}
+
+static int devlink_nl_cmd_info_get_doit(struct sk_buff *skb,
+					struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct sk_buff *msg;
+	int err;
+
+	if (!devlink->ops || !devlink->ops->info_get)
+		return -EOPNOTSUPP;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET,
+				   info->snd_portid, info->snd_seq, 0,
+				   info->extack);
+	if (err) {
+		nlmsg_free(msg);
+		return err;
+	}
+
+	return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg,
+					  struct netlink_callback *cb)
+{
+	struct devlink *devlink;
+	int start = cb->args[0];
+	int idx = 0;
+	int err;
+
+	mutex_lock(&devlink_mutex);
+	list_for_each_entry(devlink, &devlink_list, list) {
+		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+			continue;
+		if (idx < start) {
+			idx++;
+			continue;
+		}
+
+		mutex_lock(&devlink->lock);
+		err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET,
+					   NETLINK_CB(cb->skb).portid,
+					   cb->nlh->nlmsg_seq, NLM_F_MULTI,
+					   cb->extack);
+		mutex_unlock(&devlink->lock);
+		if (err)
+			break;
+		idx++;
+	}
+	mutex_unlock(&devlink_mutex);
+
+	cb->args[0] = idx;
+	return msg->len;
+}
+
 static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
@@ -3974,6 +4078,14 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
 	},
+	{
+		.cmd = DEVLINK_CMD_INFO_GET,
+		.doit = devlink_nl_cmd_info_get_doit,
+		.dumpit = devlink_nl_cmd_info_get_dumpit,
+		.policy = devlink_nl_policy,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+		/* can be retrieved by unprivileged users */
+	},
 };
 
 static struct genl_family devlink_nl_family __ro_after_init = {
-- 
cgit v1.2.3-59-g8ed1b


From fc6fae7dd987dccce3f322c32dc26b52d69ad00e Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 31 Jan 2019 10:50:41 -0800
Subject: devlink: add version reporting to devlink info API

ethtool -i has a few fixed-size fields which can be used to report
firmware version and expansion ROM version. Unfortunately, modern
hardware has more firmware components. There is usually some
datapath microcode, management controller, PXE drivers, and a
CPLD load. Running ethtool -i on modern controllers reveals the
fact that vendors cram multiple values into firmware version field.

Here are some examples from systems I could lay my hands on quickly:

tg3:  "FFV20.2.17 bc 5720-v1.39"
i40e: "6.01 0x800034a4 1.1747.0"
nfp:  "0.0.3.5 0.25 sriov-2.1.16 nic"

Add a new devlink API to allow retrieving multiple versions, and
provide user-readable name for those versions.

While at it break down the versions into three categories:
 - fixed - this is the board/fixed component version, usually vendors
           report information like the board version in the PCI VPD,
           but it will benefit from naming and common API as well;
 - running - this is the running firmware version;
 - stored - this is firmware in the flash, after firmware update
            this value will reflect the flashed version, while the
            running version may only be updated after reboot.

v3:
 - add per-type helpers instead of using the special argument (Jiri).
RFCv2:
 - remove the nesting in attr DEVLINK_ATTR_INFO_VERSIONS (now
   versions are mixed with other info attrs)l
 - have the driver report versions from the same callback as
   other info.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h        | 33 +++++++++++++++++++++++++
 include/uapi/linux/devlink.h |  5 ++++
 net/core/devlink.c           | 57 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 95 insertions(+)

(limited to 'include/uapi')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index a6d0a530483d..6dc0ef964392 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -614,6 +614,15 @@ int devlink_info_serial_number_put(struct devlink_info_req *req,
 				   const char *sn);
 int devlink_info_driver_name_put(struct devlink_info_req *req,
 				 const char *name);
+int devlink_info_version_fixed_put(struct devlink_info_req *req,
+				   const char *version_name,
+				   const char *version_value);
+int devlink_info_version_stored_put(struct devlink_info_req *req,
+				    const char *version_name,
+				    const char *version_value);
+int devlink_info_version_running_put(struct devlink_info_req *req,
+				     const char *version_name,
+				     const char *version_value);
 
 #else
 
@@ -923,6 +932,30 @@ devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn)
 {
 	return 0;
 }
+
+static inline int
+devlink_info_version_fixed_put(struct devlink_info_req *req,
+			       const char *version_name,
+			       const char *version_value)
+{
+	return 0;
+}
+
+static inline int
+devlink_info_version_stored_put(struct devlink_info_req *req,
+				const char *version_name,
+				const char *version_value)
+{
+	return 0;
+}
+
+static inline int
+devlink_info_version_running_put(struct devlink_info_req *req,
+				 const char *version_name,
+				 const char *version_value)
+{
+	return 0;
+}
 #endif
 
 #endif /* _NET_DEVLINK_H_ */
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 142710d45093..7fffd879c328 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -294,6 +294,11 @@ enum devlink_attr {
 
 	DEVLINK_ATTR_INFO_DRIVER_NAME,		/* string */
 	DEVLINK_ATTR_INFO_SERIAL_NUMBER,	/* string */
+	DEVLINK_ATTR_INFO_VERSION_FIXED,	/* nested */
+	DEVLINK_ATTR_INFO_VERSION_RUNNING,	/* nested */
+	DEVLINK_ATTR_INFO_VERSION_STORED,	/* nested */
+	DEVLINK_ATTR_INFO_VERSION_NAME,		/* string */
+	DEVLINK_ATTR_INFO_VERSION_VALUE,	/* string */
 
 	/* add new attributes above here, update the policy in devlink.c */
 
diff --git a/net/core/devlink.c b/net/core/devlink.c
index f456f6aa3d40..e31b6d617837 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -3730,6 +3730,63 @@ int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn)
 }
 EXPORT_SYMBOL_GPL(devlink_info_serial_number_put);
 
+static int devlink_info_version_put(struct devlink_info_req *req, int attr,
+				    const char *version_name,
+				    const char *version_value)
+{
+	struct nlattr *nest;
+	int err;
+
+	nest = nla_nest_start(req->msg, attr);
+	if (!nest)
+		return -EMSGSIZE;
+
+	err = nla_put_string(req->msg, DEVLINK_ATTR_INFO_VERSION_NAME,
+			     version_name);
+	if (err)
+		goto nla_put_failure;
+
+	err = nla_put_string(req->msg, DEVLINK_ATTR_INFO_VERSION_VALUE,
+			     version_value);
+	if (err)
+		goto nla_put_failure;
+
+	nla_nest_end(req->msg, nest);
+
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(req->msg, nest);
+	return err;
+}
+
+int devlink_info_version_fixed_put(struct devlink_info_req *req,
+				   const char *version_name,
+				   const char *version_value)
+{
+	return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_FIXED,
+					version_name, version_value);
+}
+EXPORT_SYMBOL_GPL(devlink_info_version_fixed_put);
+
+int devlink_info_version_stored_put(struct devlink_info_req *req,
+				    const char *version_name,
+				    const char *version_value)
+{
+	return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_STORED,
+					version_name, version_value);
+}
+EXPORT_SYMBOL_GPL(devlink_info_version_stored_put);
+
+int devlink_info_version_running_put(struct devlink_info_req *req,
+				     const char *version_name,
+				     const char *version_value)
+{
+	return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_RUNNING,
+					version_name, version_value);
+}
+EXPORT_SYMBOL_GPL(devlink_info_version_running_put);
+
 static int
 devlink_nl_info_fill(struct sk_buff *msg, struct devlink *devlink,
 		     enum devlink_command cmd, u32 portid,
-- 
cgit v1.2.3-59-g8ed1b


From 7f1bc6e95d7840d4305595b3e4025cddda88cee5 Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Sat, 2 Feb 2019 07:34:46 -0800
Subject: sockopt: Rename SO_TIMESTAMP* to SO_TIMESTAMP*_OLD

SO_TIMESTAMP, SO_TIMESTAMPNS and SO_TIMESTAMPING options, the
way they are currently defined, are not y2038 safe.
Subsequent patches in the series add new y2038 safe versions
of these options which provide 64 bit timestamps on all
architectures uniformly.
Hence, rename existing options with OLD tag suffixes.

Also note that kernel will not use the untagged SO_TIMESTAMP*
and SCM_TIMESTAMP* options internally anymore.

Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Cc: deller@gmx.de
Cc: dhowells@redhat.com
Cc: jejb@parisc-linux.org
Cc: ralf@linux-mips.org
Cc: rth@twiddle.net
Cc: linux-afs@lists.infradead.org
Cc: linux-alpha@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-mips@linux-mips.org
Cc: linux-parisc@vger.kernel.org
Cc: linux-rdma@vger.kernel.org
Cc: sparclinux@vger.kernel.org
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/alpha/include/uapi/asm/socket.h  | 23 ++++++++++++++++-------
 arch/mips/include/uapi/asm/socket.h   | 23 ++++++++++++++++-------
 arch/parisc/include/uapi/asm/socket.h | 23 ++++++++++++++++-------
 arch/sparc/include/uapi/asm/socket.h  | 24 ++++++++++++++++--------
 include/uapi/asm-generic/socket.h     | 23 ++++++++++++++++-------
 net/compat.c                          |  6 +++---
 net/core/sock.c                       | 16 ++++++++--------
 net/ipv4/tcp.c                        |  6 +++---
 net/rds/af_rds.c                      |  2 +-
 net/rds/recv.c                        |  2 +-
 net/rxrpc/local_object.c              |  2 +-
 net/socket.c                          |  8 ++++----
 12 files changed, 101 insertions(+), 57 deletions(-)

(limited to 'include/uapi')

diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index b1c9b542c021..992a0a6dcea1 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -51,13 +51,9 @@
 #define SO_GET_FILTER		SO_ATTACH_FILTER
 
 #define SO_PEERNAME		28
-#define SO_TIMESTAMP		29
-#define SCM_TIMESTAMP		SO_TIMESTAMP
 
 #define SO_PEERSEC		30
 #define SO_PASSSEC		34
-#define SO_TIMESTAMPNS		35
-#define SCM_TIMESTAMPNS		SO_TIMESTAMPNS
 
 /* Security levels - as per NRL IPv6 - don't actually do anything */
 #define SO_SECURITY_AUTHENTICATION		19
@@ -66,9 +62,6 @@
 
 #define SO_MARK			36
 
-#define SO_TIMESTAMPING		37
-#define SCM_TIMESTAMPING	SO_TIMESTAMPING
-
 #define SO_RXQ_OVFL             40
 
 #define SO_WIFI_STATUS		41
@@ -117,4 +110,20 @@
 
 #define SO_BINDTOIFINDEX	62
 
+#define SO_TIMESTAMP_OLD        29
+#define SO_TIMESTAMPNS_OLD      35
+#define SO_TIMESTAMPING_OLD     37
+
+#if !defined(__KERNEL__)
+
+#define SO_TIMESTAMP            SO_TIMESTAMP_OLD
+#define SO_TIMESTAMPNS          SO_TIMESTAMPNS_OLD
+#define SO_TIMESTAMPING         SO_TIMESTAMPING_OLD
+
+#define SCM_TIMESTAMP           SO_TIMESTAMP
+#define SCM_TIMESTAMPNS         SO_TIMESTAMPNS
+#define SCM_TIMESTAMPING        SO_TIMESTAMPING
+
+#endif
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index 73e25e35d803..0f4516c34df2 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -65,21 +65,14 @@
 #define SO_GET_FILTER		SO_ATTACH_FILTER
 
 #define SO_PEERNAME		28
-#define SO_TIMESTAMP		29
-#define SCM_TIMESTAMP		SO_TIMESTAMP
 
 #define SO_PEERSEC		30
 #define SO_SNDBUFFORCE		31
 #define SO_RCVBUFFORCE		33
 #define SO_PASSSEC		34
-#define SO_TIMESTAMPNS		35
-#define SCM_TIMESTAMPNS		SO_TIMESTAMPNS
 
 #define SO_MARK			36
 
-#define SO_TIMESTAMPING		37
-#define SCM_TIMESTAMPING	SO_TIMESTAMPING
-
 #define SO_RXQ_OVFL		40
 
 #define SO_WIFI_STATUS		41
@@ -128,4 +121,20 @@
 
 #define SO_BINDTOIFINDEX	62
 
+#define SO_TIMESTAMP_OLD        29
+#define SO_TIMESTAMPNS_OLD      35
+#define SO_TIMESTAMPING_OLD     37
+
+#if !defined(__KERNEL__)
+
+#define SO_TIMESTAMP            SO_TIMESTAMP_OLD
+#define SO_TIMESTAMPNS          SO_TIMESTAMPNS_OLD
+#define SO_TIMESTAMPING         SO_TIMESTAMPING_OLD
+
+#define SCM_TIMESTAMP           SO_TIMESTAMP
+#define SCM_TIMESTAMPNS         SO_TIMESTAMPNS
+#define SCM_TIMESTAMPING        SO_TIMESTAMPING
+
+#endif
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index 52bed5976cbe..7c180321ebd6 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -34,10 +34,6 @@
 #define SO_BSDCOMPAT	0x400e
 #define SO_PASSCRED	0x4010
 #define SO_PEERCRED	0x4011
-#define SO_TIMESTAMP	0x4012
-#define SCM_TIMESTAMP	SO_TIMESTAMP
-#define SO_TIMESTAMPNS	0x4013
-#define SCM_TIMESTAMPNS	SO_TIMESTAMPNS
 
 /* Security levels - as per NRL IPv6 - don't actually do anything */
 #define SO_SECURITY_AUTHENTICATION		0x4016
@@ -58,9 +54,6 @@
 
 #define SO_MARK			0x401f
 
-#define SO_TIMESTAMPING		0x4020
-#define SCM_TIMESTAMPING	SO_TIMESTAMPING
-
 #define SO_RXQ_OVFL             0x4021
 
 #define SO_WIFI_STATUS		0x4022
@@ -109,4 +102,20 @@
 
 #define SO_BINDTOIFINDEX	0x4037
 
+#define SO_TIMESTAMP_OLD        0x4012
+#define SO_TIMESTAMPNS_OLD      0x4013
+#define SO_TIMESTAMPING_OLD     0x4020
+
+#if !defined(__KERNEL__)
+
+#define SO_TIMESTAMP            SO_TIMESTAMP_OLD
+#define SO_TIMESTAMPNS          SO_TIMESTAMPNS_OLD
+#define SO_TIMESTAMPING         SO_TIMESTAMPING_OLD
+
+#define SCM_TIMESTAMP           SO_TIMESTAMP
+#define SCM_TIMESTAMPNS         SO_TIMESTAMPNS
+#define SCM_TIMESTAMPING        SO_TIMESTAMPING
+
+#endif
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index bbdb81594dd4..d8a1bbc3e6c4 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -33,7 +33,6 @@
 #define SO_PROTOCOL	0x1028
 #define SO_DOMAIN	0x1029
 
-
 /* Linux specific, keep the same. */
 #define SO_NO_CHECK	0x000b
 #define SO_PRIORITY	0x000c
@@ -45,19 +44,12 @@
 #define SO_GET_FILTER		SO_ATTACH_FILTER
 
 #define SO_PEERNAME		0x001c
-#define SO_TIMESTAMP		0x001d
-#define SCM_TIMESTAMP		SO_TIMESTAMP
 
 #define SO_PEERSEC		0x001e
 #define SO_PASSSEC		0x001f
-#define SO_TIMESTAMPNS		0x0021
-#define SCM_TIMESTAMPNS		SO_TIMESTAMPNS
 
 #define SO_MARK			0x0022
 
-#define SO_TIMESTAMPING		0x0023
-#define SCM_TIMESTAMPING	SO_TIMESTAMPING
-
 #define SO_RXQ_OVFL             0x0024
 
 #define SO_WIFI_STATUS		0x0025
@@ -111,4 +103,20 @@
 #define SO_SECURITY_ENCRYPTION_TRANSPORT	0x5002
 #define SO_SECURITY_ENCRYPTION_NETWORK		0x5004
 
+#define SO_TIMESTAMP_OLD         0x001d
+#define SO_TIMESTAMPNS_OLD       0x0021
+#define SO_TIMESTAMPING_OLD      0x0023
+
+#if !defined(__KERNEL__)
+
+#define SO_TIMESTAMP           SO_TIMESTAMP_OLD
+#define SO_TIMESTAMPNS         SO_TIMESTAMPNS_OLD
+#define SO_TIMESTAMPING        SO_TIMESTAMPING_OLD
+
+#define SCM_TIMESTAMP          SO_TIMESTAMP
+#define SCM_TIMESTAMPNS        SO_TIMESTAMPNS
+#define SCM_TIMESTAMPING       SO_TIMESTAMPING
+
+#endif
+
 #endif /* _ASM_SOCKET_H */
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 3066ab3853a8..4ef3aed31fb7 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -46,21 +46,14 @@
 #define SO_GET_FILTER		SO_ATTACH_FILTER
 
 #define SO_PEERNAME		28
-#define SO_TIMESTAMP		29
-#define SCM_TIMESTAMP		SO_TIMESTAMP
 
 #define SO_ACCEPTCONN		30
 
 #define SO_PEERSEC		31
 #define SO_PASSSEC		34
-#define SO_TIMESTAMPNS		35
-#define SCM_TIMESTAMPNS		SO_TIMESTAMPNS
 
 #define SO_MARK			36
 
-#define SO_TIMESTAMPING		37
-#define SCM_TIMESTAMPING	SO_TIMESTAMPING
-
 #define SO_PROTOCOL		38
 #define SO_DOMAIN		39
 
@@ -112,4 +105,20 @@
 
 #define SO_BINDTOIFINDEX	62
 
+#define SO_TIMESTAMP_OLD        29
+#define SO_TIMESTAMPNS_OLD      35
+#define SO_TIMESTAMPING_OLD     37
+
+#if !defined(__KERNEL__)
+
+#define SO_TIMESTAMP            SO_TIMESTAMP_OLD
+#define SO_TIMESTAMPNS          SO_TIMESTAMPNS_OLD
+#define SO_TIMESTAMPING         SO_TIMESTAMPING_OLD
+
+#define SCM_TIMESTAMP           SO_TIMESTAMP
+#define SCM_TIMESTAMPNS         SO_TIMESTAMPNS
+#define SCM_TIMESTAMPING        SO_TIMESTAMPING
+
+#endif
+
 #endif /* __ASM_GENERIC_SOCKET_H */
diff --git a/net/compat.c b/net/compat.c
index ce8f6e8cdcd2..ccf93cd0e49b 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -219,7 +219,7 @@ int put_cmsg_compat(struct msghdr *kmsg, int level, int type, int len, void *dat
 	}
 
 	if (!COMPAT_USE_64BIT_TIME) {
-		if (level == SOL_SOCKET && type == SCM_TIMESTAMP) {
+		if (level == SOL_SOCKET && type == SO_TIMESTAMP_OLD) {
 			struct timeval *tv = (struct timeval *)data;
 			ctv.tv_sec = tv->tv_sec;
 			ctv.tv_usec = tv->tv_usec;
@@ -227,8 +227,8 @@ int put_cmsg_compat(struct msghdr *kmsg, int level, int type, int len, void *dat
 			len = sizeof(ctv);
 		}
 		if (level == SOL_SOCKET &&
-		    (type == SCM_TIMESTAMPNS || type == SCM_TIMESTAMPING)) {
-			int count = type == SCM_TIMESTAMPNS ? 1 : 3;
+		    (type == SO_TIMESTAMPNS_OLD || type == SO_TIMESTAMPING_OLD)) {
+			int count = type == SO_TIMESTAMPNS_OLD ? 1 : 3;
 			int i;
 			struct timespec *ts = (struct timespec *)data;
 			for (i = 0; i < count; i++) {
diff --git a/net/core/sock.c b/net/core/sock.c
index 29c0028df5ae..d5ca8641968f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -867,10 +867,10 @@ set_rcvbuf:
 			clear_bit(SOCK_PASSCRED, &sock->flags);
 		break;
 
-	case SO_TIMESTAMP:
-	case SO_TIMESTAMPNS:
+	case SO_TIMESTAMP_OLD:
+	case SO_TIMESTAMPNS_OLD:
 		if (valbool)  {
-			if (optname == SO_TIMESTAMP)
+			if (optname == SO_TIMESTAMP_OLD)
 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 			else
 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
@@ -882,7 +882,7 @@ set_rcvbuf:
 		}
 		break;
 
-	case SO_TIMESTAMPING:
+	case SO_TIMESTAMPING_OLD:
 		if (val & ~SOF_TIMESTAMPING_MASK) {
 			ret = -EINVAL;
 			break;
@@ -1243,16 +1243,16 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 		sock_warn_obsolete_bsdism("getsockopt");
 		break;
 
-	case SO_TIMESTAMP:
+	case SO_TIMESTAMP_OLD:
 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
 		break;
 
-	case SO_TIMESTAMPNS:
+	case SO_TIMESTAMPNS_OLD:
 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
 		break;
 
-	case SO_TIMESTAMPING:
+	case SO_TIMESTAMPING_OLD:
 		v.val = sk->sk_tsflags;
 		break;
 
@@ -2168,7 +2168,7 @@ int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
 			return -EINVAL;
 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
 		break;
-	case SO_TIMESTAMPING:
+	case SO_TIMESTAMPING_OLD:
 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
 			return -EINVAL;
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 6f8d292ad501..e29aec59cad1 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1867,13 +1867,13 @@ static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
 	if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) {
 		if (sock_flag(sk, SOCK_RCVTSTAMP)) {
 			if (sock_flag(sk, SOCK_RCVTSTAMPNS)) {
-				put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS,
+				put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
 					 sizeof(tss->ts[0]), &tss->ts[0]);
 			} else {
 				tv.tv_sec = tss->ts[0].tv_sec;
 				tv.tv_usec = tss->ts[0].tv_nsec / 1000;
 
-				put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP,
+				put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
 					 sizeof(tv), &tv);
 			}
 		}
@@ -1893,7 +1893,7 @@ static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
 
 	if (has_timestamping) {
 		tss->ts[1] = (struct timespec) {0};
-		put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING,
+		put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPING_OLD,
 			 sizeof(*tss), tss);
 	}
 }
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 65387e1e6964..eeb4639adbe5 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -430,7 +430,7 @@ static int rds_setsockopt(struct socket *sock, int level, int optname,
 		ret = rds_set_transport(rs, optval, optlen);
 		release_sock(sock->sk);
 		break;
-	case SO_TIMESTAMP:
+	case SO_TIMESTAMP_OLD:
 		lock_sock(sock->sk);
 		ret = rds_enable_recvtstamp(sock->sk, optval, optlen);
 		release_sock(sock->sk);
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 727639dac8a7..04e30d63a159 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -550,7 +550,7 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
 	if ((inc->i_rx_tstamp != 0) &&
 	    sock_flag(rds_rs_to_sk(rs), SOCK_RCVTSTAMP)) {
 		struct timeval tv = ktime_to_timeval(inc->i_rx_tstamp);
-		ret = put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP,
+		ret = put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
 			       sizeof(tv), &tv);
 		if (ret)
 			goto out;
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 0906e51d3cfb..15cf42d5b53a 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -202,7 +202,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net)
 
 		/* We want receive timestamps. */
 		opt = 1;
-		ret = kernel_setsockopt(local->socket, SOL_SOCKET, SO_TIMESTAMPNS,
+		ret = kernel_setsockopt(local->socket, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
 					(char *)&opt, sizeof(opt));
 		if (ret < 0) {
 			_debug("setsockopt failed");
diff --git a/net/socket.c b/net/socket.c
index e89884e2197b..5087f9e40f3a 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -669,7 +669,7 @@ static bool skb_is_err_queue(const struct sk_buff *skb)
  * before the software timestamp is received, a hardware TX timestamp may be
  * returned only if there is no software TX timestamp. Ignore false software
  * timestamps, which may be made in the __sock_recv_timestamp() call when the
- * option SO_TIMESTAMP(NS) is enabled on the socket, even when the skb has a
+ * option SO_TIMESTAMP_OLD(NS) is enabled on the socket, even when the skb has a
  * hardware timestamp.
  */
 static bool skb_is_swtx_tstamp(const struct sk_buff *skb, int false_tstamp)
@@ -721,12 +721,12 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
 		if (!sock_flag(sk, SOCK_RCVTSTAMPNS)) {
 			struct timeval tv;
 			skb_get_timestamp(skb, &tv);
-			put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP,
+			put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
 				 sizeof(tv), &tv);
 		} else {
 			struct timespec ts;
 			skb_get_timestampns(skb, &ts);
-			put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS,
+			put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
 				 sizeof(ts), &ts);
 		}
 	}
@@ -746,7 +746,7 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
 	}
 	if (!empty) {
 		put_cmsg(msg, SOL_SOCKET,
-			 SCM_TIMESTAMPING, sizeof(tss), &tss);
+			 SO_TIMESTAMPING_OLD, sizeof(tss), &tss);
 
 		if (skb_is_err_queue(skb) && skb->len &&
 		    SKB_EXT_ERR(skb)->opt_stats)
-- 
cgit v1.2.3-59-g8ed1b


From bcb3fc3247e5a7ceb467ca0cfdaa4c1b830dd8f9 Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Sat, 2 Feb 2019 07:34:47 -0800
Subject: arch: sparc: Override struct __kernel_old_timeval

struct __kernel_old_timeval is supposed to have the same
layout as struct timeval. But, it was inadvarently missed
that __kernel_suseconds has a different definition for
sparc64.
Provide an asm-specific override that fixes it.

Reported-by: Arnd Bergmann <arnd@arndb.de>
Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Cc: sparclinux@vger.kernel.org
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/include/uapi/asm/posix_types.h | 10 ++++++++++
 include/uapi/linux/time.h                 |  2 ++
 2 files changed, 12 insertions(+)

(limited to 'include/uapi')

diff --git a/arch/sparc/include/uapi/asm/posix_types.h b/arch/sparc/include/uapi/asm/posix_types.h
index fec499d6efb7..f139e0048628 100644
--- a/arch/sparc/include/uapi/asm/posix_types.h
+++ b/arch/sparc/include/uapi/asm/posix_types.h
@@ -19,6 +19,16 @@ typedef unsigned short         __kernel_old_gid_t;
 typedef int		       __kernel_suseconds_t;
 #define __kernel_suseconds_t __kernel_suseconds_t
 
+typedef long		__kernel_long_t;
+typedef unsigned long	__kernel_ulong_t;
+#define __kernel_long_t __kernel_long_t
+
+struct __kernel_old_timeval {
+	__kernel_long_t tv_sec;
+	__kernel_suseconds_t tv_usec;
+};
+#define __kernel_old_timeval __kernel_old_timeval
+
 #else
 /* sparc 32 bit */
 
diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h
index 6b56a2208be7..04d5587f30d3 100644
--- a/include/uapi/linux/time.h
+++ b/include/uapi/linux/time.h
@@ -63,10 +63,12 @@ struct __kernel_itimerspec {
  * here, this is probably because it is not y2038 safe and needs to
  * be changed to use another interface.
  */
+#ifndef __kernel_old_timeval
 struct __kernel_old_timeval {
 	__kernel_long_t tv_sec;
 	__kernel_long_t tv_usec;
 };
+#endif
 
 /*
  * The IDs of the various system clocks (for POSIX.1b interval timers):
-- 
cgit v1.2.3-59-g8ed1b


From 98bb03c865d7ddf7a9a2eb80f9a0dd5f261c56ad Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Sat, 2 Feb 2019 07:34:49 -0800
Subject: socket: Add struct __kernel_sock_timeval

The new type is meant to be used as a y2038 safe structure
to be used as part of cmsg data.
Presently the SO_TIMESTAMP socket option uses struct timeval
for timestamps. This is not y2038 safe.
Subsequent patches in the series add new y2038 safe socket
option to be used in the place of SO_TIMESTAMP_OLD.
struct __kernel_sock_timeval will be used as the timestamp
format at that time.

struct __kernel_sock_timeval also maintains the same layout
across 32 bit and 64 bit ABIs.

Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/time.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h
index 04d5587f30d3..b8ad1b86b942 100644
--- a/include/uapi/linux/time.h
+++ b/include/uapi/linux/time.h
@@ -70,6 +70,11 @@ struct __kernel_old_timeval {
 };
 #endif
 
+struct __kernel_sock_timeval {
+	__s64 tv_sec;
+	__s64 tv_usec;
+};
+
 /*
  * The IDs of the various system clocks (for POSIX.1b interval timers):
  */
-- 
cgit v1.2.3-59-g8ed1b


From 887feae36aee6c08e0dafcdaa5ba921abbb2c56b Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Sat, 2 Feb 2019 07:34:50 -0800
Subject: socket: Add SO_TIMESTAMP[NS]_NEW

Add SO_TIMESTAMP_NEW and SO_TIMESTAMPNS_NEW variants of
socket timestamp options.
These are the y2038 safe versions of the SO_TIMESTAMP_OLD
and SO_TIMESTAMPNS_OLD for all architectures.

Note that the format of scm_timestamping.ts[0] is not changed
in this patch.

Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Cc: jejb@parisc-linux.org
Cc: ralf@linux-mips.org
Cc: rth@twiddle.net
Cc: linux-alpha@vger.kernel.org
Cc: linux-mips@linux-mips.org
Cc: linux-parisc@vger.kernel.org
Cc: linux-rdma@vger.kernel.org
Cc: netdev@vger.kernel.org
Cc: sparclinux@vger.kernel.org
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/alpha/include/uapi/asm/socket.h  | 14 ++++++++++++--
 arch/mips/include/uapi/asm/socket.h   | 14 ++++++++++++--
 arch/parisc/include/uapi/asm/socket.h | 14 ++++++++++++--
 arch/sparc/include/uapi/asm/socket.h  | 14 ++++++++++++--
 include/linux/skbuff.h                | 18 ++++++++++++++++++
 include/net/sock.h                    |  1 +
 include/uapi/asm-generic/socket.h     | 15 +++++++++++++--
 net/core/sock.c                       | 21 +++++++++++++++++++--
 net/ipv4/tcp.c                        | 33 +++++++++++++++++++++++++--------
 net/rds/af_rds.c                      |  8 ++++++--
 net/rds/recv.c                        | 16 ++++++++++++++--
 net/socket.c                          | 35 +++++++++++++++++++++++++++--------
 12 files changed, 171 insertions(+), 32 deletions(-)

(limited to 'include/uapi')

diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index 992a0a6dcea1..aab11eec7c22 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -3,6 +3,7 @@
 #define _UAPI_ASM_SOCKET_H
 
 #include <asm/sockios.h>
+#include <asm/bitsperlong.h>
 
 /* For setsockopt(2) */
 /*
@@ -114,10 +115,19 @@
 #define SO_TIMESTAMPNS_OLD      35
 #define SO_TIMESTAMPING_OLD     37
 
+#define SO_TIMESTAMP_NEW        63
+#define SO_TIMESTAMPNS_NEW      64
+
 #if !defined(__KERNEL__)
 
-#define SO_TIMESTAMP            SO_TIMESTAMP_OLD
-#define SO_TIMESTAMPNS          SO_TIMESTAMPNS_OLD
+#if __BITS_PER_LONG == 64
+#define SO_TIMESTAMP		SO_TIMESTAMP_OLD
+#define SO_TIMESTAMPNS		SO_TIMESTAMPNS_OLD
+#else
+#define SO_TIMESTAMP (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMP_OLD : SO_TIMESTAMP_NEW)
+#define SO_TIMESTAMPNS (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPNS_OLD : SO_TIMESTAMPNS_NEW)
+#endif
+
 #define SO_TIMESTAMPING         SO_TIMESTAMPING_OLD
 
 #define SCM_TIMESTAMP           SO_TIMESTAMP
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index 0f4516c34df2..11014f684d9f 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -11,6 +11,7 @@
 #define _UAPI_ASM_SOCKET_H
 
 #include <asm/sockios.h>
+#include <asm/bitsperlong.h>
 
 /*
  * For setsockopt(2)
@@ -125,10 +126,19 @@
 #define SO_TIMESTAMPNS_OLD      35
 #define SO_TIMESTAMPING_OLD     37
 
+#define SO_TIMESTAMP_NEW        63
+#define SO_TIMESTAMPNS_NEW      64
+
 #if !defined(__KERNEL__)
 
-#define SO_TIMESTAMP            SO_TIMESTAMP_OLD
-#define SO_TIMESTAMPNS          SO_TIMESTAMPNS_OLD
+#if __BITS_PER_LONG == 64
+#define SO_TIMESTAMP		SO_TIMESTAMP_OLD
+#define SO_TIMESTAMPNS		SO_TIMESTAMPNS_OLD
+#else
+#define SO_TIMESTAMP (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMP_OLD : SO_TIMESTAMP_NEW)
+#define SO_TIMESTAMPNS (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPNS_OLD : SO_TIMESTAMPNS_NEW)
+#endif
+
 #define SO_TIMESTAMPING         SO_TIMESTAMPING_OLD
 
 #define SCM_TIMESTAMP           SO_TIMESTAMP
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index 7c180321ebd6..cbc4b89c2fe4 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -3,6 +3,7 @@
 #define _UAPI_ASM_SOCKET_H
 
 #include <asm/sockios.h>
+#include <asm/bitsperlong.h>
 
 /* For setsockopt(2) */
 #define SOL_SOCKET	0xffff
@@ -106,10 +107,19 @@
 #define SO_TIMESTAMPNS_OLD      0x4013
 #define SO_TIMESTAMPING_OLD     0x4020
 
+#define SO_TIMESTAMP_NEW        0x4038
+#define SO_TIMESTAMPNS_NEW      0x4039
+
 #if !defined(__KERNEL__)
 
-#define SO_TIMESTAMP            SO_TIMESTAMP_OLD
-#define SO_TIMESTAMPNS          SO_TIMESTAMPNS_OLD
+#if __BITS_PER_LONG == 64
+#define SO_TIMESTAMP		SO_TIMESTAMP_OLD
+#define SO_TIMESTAMPNS		SO_TIMESTAMPNS_OLD
+#else
+#define SO_TIMESTAMP (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMP_OLD : SO_TIMESTAMP_NEW)
+#define SO_TIMESTAMPNS (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPNS_OLD : SO_TIMESTAMPNS_NEW)
+#endif
+
 #define SO_TIMESTAMPING         SO_TIMESTAMPING_OLD
 
 #define SCM_TIMESTAMP           SO_TIMESTAMP
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index d8a1bbc3e6c4..85127425b294 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -3,6 +3,7 @@
 #define _ASM_SOCKET_H
 
 #include <asm/sockios.h>
+#include <asm/bitsperlong.h>
 
 /* For setsockopt(2) */
 #define SOL_SOCKET	0xffff
@@ -107,10 +108,19 @@
 #define SO_TIMESTAMPNS_OLD       0x0021
 #define SO_TIMESTAMPING_OLD      0x0023
 
+#define SO_TIMESTAMP_NEW         0x0041
+#define SO_TIMESTAMPNS_NEW       0x0042
+
 #if !defined(__KERNEL__)
 
-#define SO_TIMESTAMP           SO_TIMESTAMP_OLD
-#define SO_TIMESTAMPNS         SO_TIMESTAMPNS_OLD
+#if __BITS_PER_LONG == 64
+#define SO_TIMESTAMP		SO_TIMESTAMP_OLD
+#define SO_TIMESTAMPNS		SO_TIMESTAMPNS_OLD
+#else
+#define SO_TIMESTAMP (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMP_OLD : SO_TIMESTAMP_NEW)
+#define SO_TIMESTAMPNS (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPNS_OLD : SO_TIMESTAMPNS_NEW)
+#endif
+
 #define SO_TIMESTAMPING        SO_TIMESTAMPING_OLD
 
 #define SCM_TIMESTAMP          SO_TIMESTAMP
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4001611a4c9f..831846617d07 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3498,12 +3498,30 @@ static inline void skb_get_timestamp(const struct sk_buff *skb,
 	*stamp = ns_to_kernel_old_timeval(skb->tstamp);
 }
 
+static inline void skb_get_new_timestamp(const struct sk_buff *skb,
+					 struct __kernel_sock_timeval *stamp)
+{
+	struct timespec64 ts = ktime_to_timespec64(skb->tstamp);
+
+	stamp->tv_sec = ts.tv_sec;
+	stamp->tv_usec = ts.tv_nsec / 1000;
+}
+
 static inline void skb_get_timestampns(const struct sk_buff *skb,
 				       struct timespec *stamp)
 {
 	*stamp = ktime_to_timespec(skb->tstamp);
 }
 
+static inline void skb_get_new_timestampns(const struct sk_buff *skb,
+					   struct __kernel_timespec *stamp)
+{
+	struct timespec64 ts = ktime_to_timespec64(skb->tstamp);
+
+	stamp->tv_sec = ts.tv_sec;
+	stamp->tv_nsec = ts.tv_nsec;
+}
+
 static inline void __net_timestamp(struct sk_buff *skb)
 {
 	skb->tstamp = ktime_get_real();
diff --git a/include/net/sock.h b/include/net/sock.h
index 2b229f7be8eb..6679f3c120b0 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -805,6 +805,7 @@ enum sock_flags {
 	SOCK_RCU_FREE, /* wait rcu grace period in sk_destruct() */
 	SOCK_TXTIME,
 	SOCK_XDP, /* XDP is attached */
+	SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */
 };
 
 #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 4ef3aed31fb7..f22d3f7162f8 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -3,6 +3,7 @@
 #define __ASM_GENERIC_SOCKET_H
 
 #include <asm/sockios.h>
+#include <asm/bitsperlong.h>
 
 /* For setsockopt(2) */
 #define SOL_SOCKET	1
@@ -109,10 +110,20 @@
 #define SO_TIMESTAMPNS_OLD      35
 #define SO_TIMESTAMPING_OLD     37
 
+#define SO_TIMESTAMP_NEW        63
+#define SO_TIMESTAMPNS_NEW      64
+
 #if !defined(__KERNEL__)
 
-#define SO_TIMESTAMP            SO_TIMESTAMP_OLD
-#define SO_TIMESTAMPNS          SO_TIMESTAMPNS_OLD
+#if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
+/* on 64-bit and x32, avoid the ?: operator */
+#define SO_TIMESTAMP		SO_TIMESTAMP_OLD
+#define SO_TIMESTAMPNS		SO_TIMESTAMPNS_OLD
+#else
+#define SO_TIMESTAMP (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMP_OLD : SO_TIMESTAMP_NEW)
+#define SO_TIMESTAMPNS (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPNS_OLD : SO_TIMESTAMPNS_NEW)
+#endif
+
 #define SO_TIMESTAMPING         SO_TIMESTAMPING_OLD
 
 #define SCM_TIMESTAMP           SO_TIMESTAMP
diff --git a/net/core/sock.c b/net/core/sock.c
index d5ca8641968f..14b987eab10c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -868,9 +868,16 @@ set_rcvbuf:
 		break;
 
 	case SO_TIMESTAMP_OLD:
+	case SO_TIMESTAMP_NEW:
 	case SO_TIMESTAMPNS_OLD:
+	case SO_TIMESTAMPNS_NEW:
 		if (valbool)  {
-			if (optname == SO_TIMESTAMP_OLD)
+			if (optname == SO_TIMESTAMP_NEW || optname == SO_TIMESTAMPNS_NEW)
+				sock_set_flag(sk, SOCK_TSTAMP_NEW);
+			else
+				sock_reset_flag(sk, SOCK_TSTAMP_NEW);
+
+			if (optname == SO_TIMESTAMP_OLD || optname == SO_TIMESTAMP_NEW)
 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 			else
 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
@@ -879,6 +886,7 @@ set_rcvbuf:
 		} else {
 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
+			sock_reset_flag(sk, SOCK_TSTAMP_NEW);
 		}
 		break;
 
@@ -1245,11 +1253,20 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 
 	case SO_TIMESTAMP_OLD:
 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
+				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
 		break;
 
 	case SO_TIMESTAMPNS_OLD:
-		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
+		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
+		break;
+
+	case SO_TIMESTAMP_NEW:
+		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
+		break;
+
+	case SO_TIMESTAMPNS_NEW:
+		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
 		break;
 
 	case SO_TIMESTAMPING_OLD:
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3ce41b04c0f0..4e9388bf104a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1861,20 +1861,37 @@ static void tcp_update_recv_tstamps(struct sk_buff *skb,
 static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
 			       struct scm_timestamping *tss)
 {
-	struct __kernel_old_timeval tv;
+	int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
 	bool has_timestamping = false;
 
 	if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) {
 		if (sock_flag(sk, SOCK_RCVTSTAMP)) {
 			if (sock_flag(sk, SOCK_RCVTSTAMPNS)) {
-				put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
-					 sizeof(tss->ts[0]), &tss->ts[0]);
+				if (new_tstamp) {
+					struct __kernel_timespec kts = {tss->ts[0].tv_sec, tss->ts[0].tv_nsec};
+
+					put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
+						 sizeof(kts), &kts);
+				} else {
+					put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
+						 sizeof(tss->ts[0]), &tss->ts[0]);
+				}
 			} else {
-				tv.tv_sec = tss->ts[0].tv_sec;
-				tv.tv_usec = tss->ts[0].tv_nsec / 1000;
-
-				put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
-					 sizeof(tv), &tv);
+				if (new_tstamp) {
+					struct __kernel_sock_timeval stv;
+
+					stv.tv_sec = tss->ts[0].tv_sec;
+					stv.tv_usec = tss->ts[0].tv_nsec / 1000;
+					put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
+						 sizeof(stv), &stv);
+				} else {
+					struct __kernel_old_timeval tv;
+
+					tv.tv_sec = tss->ts[0].tv_sec;
+					tv.tv_usec = tss->ts[0].tv_nsec / 1000;
+					put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
+						 sizeof(tv), &tv);
+				}
 			}
 		}
 
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index eeb4639adbe5..65571a6273c3 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -348,7 +348,7 @@ static int rds_set_transport(struct rds_sock *rs, char __user *optval,
 }
 
 static int rds_enable_recvtstamp(struct sock *sk, char __user *optval,
-				 int optlen)
+				 int optlen, int optname)
 {
 	int val, valbool;
 
@@ -360,6 +360,9 @@ static int rds_enable_recvtstamp(struct sock *sk, char __user *optval,
 
 	valbool = val ? 1 : 0;
 
+	if (optname == SO_TIMESTAMP_NEW)
+		sock_set_flag(sk, SOCK_TSTAMP_NEW);
+
 	if (valbool)
 		sock_set_flag(sk, SOCK_RCVTSTAMP);
 	else
@@ -431,8 +434,9 @@ static int rds_setsockopt(struct socket *sock, int level, int optname,
 		release_sock(sock->sk);
 		break;
 	case SO_TIMESTAMP_OLD:
+	case SO_TIMESTAMP_NEW:
 		lock_sock(sock->sk);
-		ret = rds_enable_recvtstamp(sock->sk, optval, optlen);
+		ret = rds_enable_recvtstamp(sock->sk, optval, optlen, optname);
 		release_sock(sock->sk);
 		break;
 	case SO_RDS_MSG_RXPATH_LATENCY:
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 435bf2320cd3..6bb6b16ca270 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -550,8 +550,20 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
 	if ((inc->i_rx_tstamp != 0) &&
 	    sock_flag(rds_rs_to_sk(rs), SOCK_RCVTSTAMP)) {
 		struct __kernel_old_timeval tv = ns_to_kernel_old_timeval(inc->i_rx_tstamp);
-		ret = put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
-			       sizeof(tv), &tv);
+
+		if (!sock_flag(rds_rs_to_sk(rs), SOCK_TSTAMP_NEW)) {
+			ret = put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
+				       sizeof(tv), &tv);
+		} else {
+			struct __kernel_sock_timeval sk_tv;
+
+			sk_tv.tv_sec = tv.tv_sec;
+			sk_tv.tv_usec = tv.tv_usec;
+
+			ret = put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
+				       sizeof(sk_tv), &sk_tv);
+		}
+
 		if (ret)
 			goto out;
 	}
diff --git a/net/socket.c b/net/socket.c
index 9cc281cdb9d9..1de96abd78d3 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -705,6 +705,7 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
 	struct sk_buff *skb)
 {
 	int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP);
+	int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
 	struct scm_timestamping tss;
 	int empty = 1, false_tstamp = 0;
 	struct skb_shared_hwtstamps *shhwtstamps =
@@ -719,15 +720,33 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
 
 	if (need_software_tstamp) {
 		if (!sock_flag(sk, SOCK_RCVTSTAMPNS)) {
-			struct __kernel_old_timeval tv;
-			skb_get_timestamp(skb, &tv);
-			put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
-				 sizeof(tv), &tv);
+			if (new_tstamp) {
+				struct __kernel_sock_timeval tv;
+
+				skb_get_new_timestamp(skb, &tv);
+				put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
+					 sizeof(tv), &tv);
+			} else {
+				struct __kernel_old_timeval tv;
+
+				skb_get_timestamp(skb, &tv);
+				put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
+					 sizeof(tv), &tv);
+			}
 		} else {
-			struct timespec ts;
-			skb_get_timestampns(skb, &ts);
-			put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
-				 sizeof(ts), &ts);
+			if (new_tstamp) {
+				struct __kernel_timespec ts;
+
+				skb_get_new_timestampns(skb, &ts);
+				put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
+					 sizeof(ts), &ts);
+			} else {
+				struct timespec ts;
+
+				skb_get_timestampns(skb, &ts);
+				put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
+					 sizeof(ts), &ts);
+			}
 		}
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 9718475e69084de15c3930ce35672a7dc6da866b Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Sat, 2 Feb 2019 07:34:51 -0800
Subject: socket: Add SO_TIMESTAMPING_NEW

Add SO_TIMESTAMPING_NEW variant of socket timestamp options.
This is the y2038 safe versions of the SO_TIMESTAMPING_OLD
for all architectures.

Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Cc: chris@zankel.net
Cc: fenghua.yu@intel.com
Cc: rth@twiddle.net
Cc: tglx@linutronix.de
Cc: ubraun@linux.ibm.com
Cc: linux-alpha@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-ia64@vger.kernel.org
Cc: linux-mips@linux-mips.org
Cc: linux-s390@vger.kernel.org
Cc: linux-xtensa@linux-xtensa.org
Cc: sparclinux@vger.kernel.org
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/alpha/include/uapi/asm/socket.h  |  5 +++--
 arch/mips/include/uapi/asm/socket.h   |  5 +++--
 arch/parisc/include/uapi/asm/socket.h |  5 +++--
 arch/sparc/include/uapi/asm/socket.h  |  5 +++--
 include/linux/socket.h                |  8 ++++++++
 include/uapi/asm-generic/socket.h     |  5 +++--
 include/uapi/linux/errqueue.h         |  4 ++++
 net/core/scm.c                        | 27 +++++++++++++++++++++++++++
 net/core/sock.c                       |  8 +++++++-
 net/ipv4/tcp.c                        | 30 +++++++++++++++++-------------
 net/smc/af_smc.c                      |  3 ++-
 net/socket.c                          | 13 ++++++++-----
 12 files changed, 88 insertions(+), 30 deletions(-)

(limited to 'include/uapi')

diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index aab11eec7c22..934ea6268f1a 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -117,19 +117,20 @@
 
 #define SO_TIMESTAMP_NEW        63
 #define SO_TIMESTAMPNS_NEW      64
+#define SO_TIMESTAMPING_NEW     65
 
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
 #define SO_TIMESTAMP		SO_TIMESTAMP_OLD
 #define SO_TIMESTAMPNS		SO_TIMESTAMPNS_OLD
+#define SO_TIMESTAMPING         SO_TIMESTAMPING_OLD
 #else
 #define SO_TIMESTAMP (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMP_OLD : SO_TIMESTAMP_NEW)
 #define SO_TIMESTAMPNS (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPNS_OLD : SO_TIMESTAMPNS_NEW)
+#define SO_TIMESTAMPING (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPING_OLD : SO_TIMESTAMPING_NEW)
 #endif
 
-#define SO_TIMESTAMPING         SO_TIMESTAMPING_OLD
-
 #define SCM_TIMESTAMP           SO_TIMESTAMP
 #define SCM_TIMESTAMPNS         SO_TIMESTAMPNS
 #define SCM_TIMESTAMPING        SO_TIMESTAMPING
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index 11014f684d9f..110f9506d64f 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -128,19 +128,20 @@
 
 #define SO_TIMESTAMP_NEW        63
 #define SO_TIMESTAMPNS_NEW      64
+#define SO_TIMESTAMPING_NEW     65
 
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
 #define SO_TIMESTAMP		SO_TIMESTAMP_OLD
 #define SO_TIMESTAMPNS		SO_TIMESTAMPNS_OLD
+#define SO_TIMESTAMPING		SO_TIMESTAMPING_OLD
 #else
 #define SO_TIMESTAMP (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMP_OLD : SO_TIMESTAMP_NEW)
 #define SO_TIMESTAMPNS (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPNS_OLD : SO_TIMESTAMPNS_NEW)
+#define SO_TIMESTAMPING (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPING_OLD : SO_TIMESTAMPING_NEW)
 #endif
 
-#define SO_TIMESTAMPING         SO_TIMESTAMPING_OLD
-
 #define SCM_TIMESTAMP           SO_TIMESTAMP
 #define SCM_TIMESTAMPNS         SO_TIMESTAMPNS
 #define SCM_TIMESTAMPING        SO_TIMESTAMPING
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index cbc4b89c2fe4..bee2a9dde656 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -109,19 +109,20 @@
 
 #define SO_TIMESTAMP_NEW        0x4038
 #define SO_TIMESTAMPNS_NEW      0x4039
+#define SO_TIMESTAMPING_NEW     0x403A
 
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
 #define SO_TIMESTAMP		SO_TIMESTAMP_OLD
 #define SO_TIMESTAMPNS		SO_TIMESTAMPNS_OLD
+#define SO_TIMESTAMPING         SO_TIMESTAMPING_OLD
 #else
 #define SO_TIMESTAMP (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMP_OLD : SO_TIMESTAMP_NEW)
 #define SO_TIMESTAMPNS (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPNS_OLD : SO_TIMESTAMPNS_NEW)
+#define SO_TIMESTAMPING (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPING_OLD : SO_TIMESTAMPING_NEW)
 #endif
 
-#define SO_TIMESTAMPING         SO_TIMESTAMPING_OLD
-
 #define SCM_TIMESTAMP           SO_TIMESTAMP
 #define SCM_TIMESTAMPNS         SO_TIMESTAMPNS
 #define SCM_TIMESTAMPING        SO_TIMESTAMPING
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index 85127425b294..2b38dda51426 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -110,19 +110,20 @@
 
 #define SO_TIMESTAMP_NEW         0x0041
 #define SO_TIMESTAMPNS_NEW       0x0042
+#define SO_TIMESTAMPING_NEW      0x0043
 
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
 #define SO_TIMESTAMP		SO_TIMESTAMP_OLD
 #define SO_TIMESTAMPNS		SO_TIMESTAMPNS_OLD
+#define SO_TIMESTAMPING		SO_TIMESTAMPING_OLD
 #else
 #define SO_TIMESTAMP (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMP_OLD : SO_TIMESTAMP_NEW)
 #define SO_TIMESTAMPNS (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPNS_OLD : SO_TIMESTAMPNS_NEW)
+#define SO_TIMESTAMPING (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPING_OLD : SO_TIMESTAMPING_NEW)
 #endif
 
-#define SO_TIMESTAMPING        SO_TIMESTAMPING_OLD
-
 #define SCM_TIMESTAMP          SO_TIMESTAMP
 #define SCM_TIMESTAMPNS        SO_TIMESTAMPNS
 #define SCM_TIMESTAMPING       SO_TIMESTAMPING
diff --git a/include/linux/socket.h b/include/linux/socket.h
index ab2041a00e01..6016daeecee4 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -349,9 +349,17 @@ struct ucred {
 extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr);
 extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data);
 
+struct timespec64;
 struct __kernel_timespec;
 struct old_timespec32;
 
+struct scm_timestamping_internal {
+	struct timespec64 ts[3];
+};
+
+extern void put_cmsg_scm_timestamping64(struct msghdr *msg, struct scm_timestamping_internal *tss);
+extern void put_cmsg_scm_timestamping(struct msghdr *msg, struct scm_timestamping_internal *tss);
+
 /* The __sys_...msg variants allow MSG_CMSG_COMPAT iff
  * forbid_cmsg_compat==false
  */
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index f22d3f7162f8..2713e0fa68ef 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -112,6 +112,7 @@
 
 #define SO_TIMESTAMP_NEW        63
 #define SO_TIMESTAMPNS_NEW      64
+#define SO_TIMESTAMPING_NEW     65
 
 #if !defined(__KERNEL__)
 
@@ -119,13 +120,13 @@
 /* on 64-bit and x32, avoid the ?: operator */
 #define SO_TIMESTAMP		SO_TIMESTAMP_OLD
 #define SO_TIMESTAMPNS		SO_TIMESTAMPNS_OLD
+#define SO_TIMESTAMPING		SO_TIMESTAMPING_OLD
 #else
 #define SO_TIMESTAMP (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMP_OLD : SO_TIMESTAMP_NEW)
 #define SO_TIMESTAMPNS (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPNS_OLD : SO_TIMESTAMPNS_NEW)
+#define SO_TIMESTAMPING (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPING_OLD : SO_TIMESTAMPING_NEW)
 #endif
 
-#define SO_TIMESTAMPING         SO_TIMESTAMPING_OLD
-
 #define SCM_TIMESTAMP           SO_TIMESTAMP
 #define SCM_TIMESTAMPNS         SO_TIMESTAMPNS
 #define SCM_TIMESTAMPING        SO_TIMESTAMPING
diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h
index c0151200f7d1..d955b9e32288 100644
--- a/include/uapi/linux/errqueue.h
+++ b/include/uapi/linux/errqueue.h
@@ -41,6 +41,10 @@ struct scm_timestamping {
 	struct timespec ts[3];
 };
 
+struct scm_timestamping64 {
+	struct __kernel_timespec ts[3];
+};
+
 /* The type of scm_timestamping, passed in sock_extended_err ee_info.
  * This defines the type of ts[0]. For SCM_TSTAMP_SND only, if ts[0]
  * is zero, then this is a hardware timestamp and recorded in ts[2].
diff --git a/net/core/scm.c b/net/core/scm.c
index b1ff8a441748..52ef219cf6df 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -29,6 +29,7 @@
 #include <linux/pid.h>
 #include <linux/nsproxy.h>
 #include <linux/slab.h>
+#include <linux/errqueue.h>
 
 #include <linux/uaccess.h>
 
@@ -252,6 +253,32 @@ out:
 }
 EXPORT_SYMBOL(put_cmsg);
 
+void put_cmsg_scm_timestamping64(struct msghdr *msg, struct scm_timestamping_internal *tss_internal)
+{
+	struct scm_timestamping64 tss;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(tss.ts); i++) {
+		tss.ts[i].tv_sec = tss_internal->ts[i].tv_sec;
+		tss.ts[i].tv_nsec = tss_internal->ts[i].tv_nsec;
+	}
+
+	put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPING_NEW, sizeof(tss), &tss);
+}
+EXPORT_SYMBOL(put_cmsg_scm_timestamping64);
+
+void put_cmsg_scm_timestamping(struct msghdr *msg, struct scm_timestamping_internal *tss_internal)
+{
+	struct scm_timestamping tss;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(tss.ts); i++)
+		tss.ts[i] = timespec64_to_timespec(tss_internal->ts[i]);
+
+	put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPING_OLD, sizeof(tss), &tss);
+}
+EXPORT_SYMBOL(put_cmsg_scm_timestamping);
+
 void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
 {
 	struct cmsghdr __user *cm
diff --git a/net/core/sock.c b/net/core/sock.c
index 14b987eab10c..a9d1ecce96e5 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -890,6 +890,8 @@ set_rcvbuf:
 		}
 		break;
 
+	case SO_TIMESTAMPING_NEW:
+		sock_set_flag(sk, SOCK_TSTAMP_NEW);
 	case SO_TIMESTAMPING_OLD:
 		if (val & ~SOF_TIMESTAMPING_MASK) {
 			ret = -EINVAL;
@@ -921,9 +923,13 @@ set_rcvbuf:
 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 			sock_enable_timestamp(sk,
 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
-		else
+		else {
+			if (optname == SO_TIMESTAMPING_NEW)
+				sock_reset_flag(sk, SOCK_TSTAMP_NEW);
+
 			sock_disable_timestamp(sk,
 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
+		}
 		break;
 
 	case SO_RCVLOWAT:
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4e9388bf104a..cab6b2f2f61d 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1844,22 +1844,22 @@ out:
 #endif
 
 static void tcp_update_recv_tstamps(struct sk_buff *skb,
-				    struct scm_timestamping *tss)
+				    struct scm_timestamping_internal *tss)
 {
 	if (skb->tstamp)
-		tss->ts[0] = ktime_to_timespec(skb->tstamp);
+		tss->ts[0] = ktime_to_timespec64(skb->tstamp);
 	else
-		tss->ts[0] = (struct timespec) {0};
+		tss->ts[0] = (struct timespec64) {0};
 
 	if (skb_hwtstamps(skb)->hwtstamp)
-		tss->ts[2] = ktime_to_timespec(skb_hwtstamps(skb)->hwtstamp);
+		tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp);
 	else
-		tss->ts[2] = (struct timespec) {0};
+		tss->ts[2] = (struct timespec64) {0};
 }
 
 /* Similar to __sock_recv_timestamp, but does not require an skb */
 static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
-			       struct scm_timestamping *tss)
+			       struct scm_timestamping_internal *tss)
 {
 	int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
 	bool has_timestamping = false;
@@ -1873,8 +1873,10 @@ static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
 					put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
 						 sizeof(kts), &kts);
 				} else {
+					struct timespec ts_old = timespec64_to_timespec(tss->ts[0]);
+
 					put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
-						 sizeof(tss->ts[0]), &tss->ts[0]);
+						 sizeof(ts_old), &ts_old);
 				}
 			} else {
 				if (new_tstamp) {
@@ -1898,20 +1900,22 @@ static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
 		if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE)
 			has_timestamping = true;
 		else
-			tss->ts[0] = (struct timespec) {0};
+			tss->ts[0] = (struct timespec64) {0};
 	}
 
 	if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) {
 		if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)
 			has_timestamping = true;
 		else
-			tss->ts[2] = (struct timespec) {0};
+			tss->ts[2] = (struct timespec64) {0};
 	}
 
 	if (has_timestamping) {
-		tss->ts[1] = (struct timespec) {0};
-		put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPING_OLD,
-			 sizeof(*tss), tss);
+		tss->ts[1] = (struct timespec64) {0};
+		if (sock_flag(sk, SOCK_TSTAMP_NEW))
+			put_cmsg_scm_timestamping64(msg, tss);
+		else
+			put_cmsg_scm_timestamping(msg, tss);
 	}
 }
 
@@ -1952,7 +1956,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 	long timeo;
 	struct sk_buff *skb, *last;
 	u32 urg_hole = 0;
-	struct scm_timestamping tss;
+	struct scm_timestamping_internal tss;
 	bool has_tss = false;
 	bool has_cmsg;
 
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index c4e56602e0c6..369870b0ef79 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -291,7 +291,8 @@ static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
 			     (1UL << SOCK_RXQ_OVFL) | \
 			     (1UL << SOCK_WIFI_STATUS) | \
 			     (1UL << SOCK_NOFCS) | \
-			     (1UL << SOCK_FILTER_LOCKED))
+			     (1UL << SOCK_FILTER_LOCKED) | \
+			     (1UL << SOCK_TSTAMP_NEW))
 /* copy only relevant settings and flags of SOL_SOCKET level from smc to
  * clc socket (since smc is not called for these options from net/core)
  */
diff --git a/net/socket.c b/net/socket.c
index 1de96abd78d3..d51930689b98 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -706,7 +706,8 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
 {
 	int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP);
 	int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
-	struct scm_timestamping tss;
+	struct scm_timestamping_internal tss;
+
 	int empty = 1, false_tstamp = 0;
 	struct skb_shared_hwtstamps *shhwtstamps =
 		skb_hwtstamps(skb);
@@ -752,20 +753,22 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
 
 	memset(&tss, 0, sizeof(tss));
 	if ((sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) &&
-	    ktime_to_timespec_cond(skb->tstamp, tss.ts + 0))
+	    ktime_to_timespec64_cond(skb->tstamp, tss.ts + 0))
 		empty = 0;
 	if (shhwtstamps &&
 	    (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
 	    !skb_is_swtx_tstamp(skb, false_tstamp) &&
-	    ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2)) {
+	    ktime_to_timespec64_cond(shhwtstamps->hwtstamp, tss.ts + 2)) {
 		empty = 0;
 		if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) &&
 		    !skb_is_err_queue(skb))
 			put_ts_pktinfo(msg, skb);
 	}
 	if (!empty) {
-		put_cmsg(msg, SOL_SOCKET,
-			 SO_TIMESTAMPING_OLD, sizeof(tss), &tss);
+		if (sock_flag(sk, SOCK_TSTAMP_NEW))
+			put_cmsg_scm_timestamping64(msg, &tss);
+		else
+			put_cmsg_scm_timestamping(msg, &tss);
 
 		if (skb_is_err_queue(skb) && skb->len &&
 		    SKB_EXT_ERR(skb)->opt_stats)
-- 
cgit v1.2.3-59-g8ed1b


From 45bdc66159d49bfc7f75fe02d25bc74f5d2660cf Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Sat, 2 Feb 2019 07:34:53 -0800
Subject: socket: Rename SO_RCVTIMEO/ SO_SNDTIMEO with _OLD suffixes

SO_RCVTIMEO and SO_SNDTIMEO socket options use struct timeval
as the time format. struct timeval is not y2038 safe.
The subsequent patches in the series add support for new socket
timeout options with _NEW suffix that will use y2038 safe
data structures. Although the existing struct timeval layout
is sufficiently wide to represent timeouts, because of the way
libc will interpret time_t based on user defined flag, these
new flags provide a way of having a structure that is the same
for all architectures consistently.
Rename the existing options with _OLD suffix forms so that the
right option is enabled for userspace applications according
to the architecture and time_t definition of libc.

Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Cc: ccaulfie@redhat.com
Cc: deller@gmx.de
Cc: paulus@samba.org
Cc: ralf@linux-mips.org
Cc: rth@twiddle.net
Cc: cluster-devel@redhat.com
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-alpha@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-mips@vger.kernel.org
Cc: linux-parisc@vger.kernel.org
Cc: sparclinux@vger.kernel.org
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/alpha/include/uapi/asm/socket.h   | 7 +++++--
 arch/mips/include/uapi/asm/socket.h    | 6 ++++--
 arch/parisc/include/uapi/asm/socket.h  | 6 ++++--
 arch/powerpc/include/uapi/asm/socket.h | 4 ++--
 arch/sparc/include/uapi/asm/socket.h   | 7 +++++--
 fs/dlm/lowcomms.c                      | 4 ++--
 include/uapi/asm-generic/socket.h      | 6 ++++--
 net/core/sock.c                        | 4 ++--
 8 files changed, 28 insertions(+), 16 deletions(-)

(limited to 'include/uapi')

diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index 934ea6268f1a..9826d1db71d0 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -31,8 +31,8 @@
 #define SO_RCVBUFFORCE	0x100b
 #define	SO_RCVLOWAT	0x1010
 #define	SO_SNDLOWAT	0x1011
-#define	SO_RCVTIMEO	0x1012
-#define	SO_SNDTIMEO	0x1013
+#define	SO_RCVTIMEO_OLD	0x1012
+#define	SO_SNDTIMEO_OLD	0x1013
 #define SO_ACCEPTCONN	0x1014
 #define SO_PROTOCOL	0x1028
 #define SO_DOMAIN	0x1029
@@ -121,6 +121,9 @@
 
 #if !defined(__KERNEL__)
 
+#define	SO_RCVTIMEO SO_RCVTIMEO_OLD
+#define	SO_SNDTIMEO SO_SNDTIMEO_OLD
+
 #if __BITS_PER_LONG == 64
 #define SO_TIMESTAMP		SO_TIMESTAMP_OLD
 #define SO_TIMESTAMPNS		SO_TIMESTAMPNS_OLD
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index 110f9506d64f..96cc0e907f12 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -39,8 +39,8 @@
 #define SO_RCVBUF	0x1002	/* Receive buffer. */
 #define SO_SNDLOWAT	0x1003	/* send low-water mark */
 #define SO_RCVLOWAT	0x1004	/* receive low-water mark */
-#define SO_SNDTIMEO	0x1005	/* send timeout */
-#define SO_RCVTIMEO	0x1006	/* receive timeout */
+#define SO_SNDTIMEO_OLD	0x1005	/* send timeout */
+#define SO_RCVTIMEO_OLD	0x1006	/* receive timeout */
 #define SO_ACCEPTCONN	0x1009
 #define SO_PROTOCOL	0x1028	/* protocol type */
 #define SO_DOMAIN	0x1029	/* domain/socket family */
@@ -132,6 +132,8 @@
 
 #if !defined(__KERNEL__)
 
+#define	SO_RCVTIMEO SO_RCVTIMEO_OLD
+#define	SO_SNDTIMEO SO_SNDTIMEO_OLD
 #if __BITS_PER_LONG == 64
 #define SO_TIMESTAMP		SO_TIMESTAMP_OLD
 #define SO_TIMESTAMPNS		SO_TIMESTAMPNS_OLD
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index bee2a9dde656..046f0cd9cce4 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -22,8 +22,8 @@
 #define SO_RCVBUFFORCE	0x100b
 #define SO_SNDLOWAT	0x1003
 #define SO_RCVLOWAT	0x1004
-#define SO_SNDTIMEO	0x1005
-#define SO_RCVTIMEO	0x1006
+#define SO_SNDTIMEO_OLD	0x1005
+#define SO_RCVTIMEO_OLD	0x1006
 #define SO_ERROR	0x1007
 #define SO_TYPE		0x1008
 #define SO_PROTOCOL	0x1028
@@ -113,6 +113,8 @@
 
 #if !defined(__KERNEL__)
 
+#define	SO_RCVTIMEO SO_RCVTIMEO_OLD
+#define	SO_SNDTIMEO SO_SNDTIMEO_OLD
 #if __BITS_PER_LONG == 64
 #define SO_TIMESTAMP		SO_TIMESTAMP_OLD
 #define SO_TIMESTAMPNS		SO_TIMESTAMPNS_OLD
diff --git a/arch/powerpc/include/uapi/asm/socket.h b/arch/powerpc/include/uapi/asm/socket.h
index 94de465e0920..12aa0c43e775 100644
--- a/arch/powerpc/include/uapi/asm/socket.h
+++ b/arch/powerpc/include/uapi/asm/socket.h
@@ -11,8 +11,8 @@
 
 #define SO_RCVLOWAT	16
 #define SO_SNDLOWAT	17
-#define SO_RCVTIMEO	18
-#define SO_SNDTIMEO	19
+#define SO_RCVTIMEO_OLD	18
+#define SO_SNDTIMEO_OLD	19
 #define SO_PASSCRED	20
 #define SO_PEERCRED	21
 
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index 2b38dda51426..342ffdc3b424 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -21,8 +21,8 @@
 #define SO_BSDCOMPAT    0x0400
 #define SO_RCVLOWAT     0x0800
 #define SO_SNDLOWAT     0x1000
-#define SO_RCVTIMEO     0x2000
-#define SO_SNDTIMEO     0x4000
+#define SO_RCVTIMEO_OLD     0x2000
+#define SO_SNDTIMEO_OLD     0x4000
 #define SO_ACCEPTCONN	0x8000
 
 #define SO_SNDBUF	0x1001
@@ -114,6 +114,9 @@
 
 #if !defined(__KERNEL__)
 
+#define SO_RCVTIMEO              SO_RCVTIMEO_OLD
+#define SO_SNDTIMEO              SO_SNDTIMEO_OLD
+
 #if __BITS_PER_LONG == 64
 #define SO_TIMESTAMP		SO_TIMESTAMP_OLD
 #define SO_TIMESTAMPNS		SO_TIMESTAMPNS_OLD
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 76976d6e50f9..c98ad9777ad9 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1089,12 +1089,12 @@ static void sctp_connect_to_sock(struct connection *con)
 	 * since O_NONBLOCK argument in connect() function does not work here,
 	 * then, we should restore the default value of this attribute.
 	 */
-	kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv,
+	kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_OLD, (char *)&tv,
 			  sizeof(tv));
 	result = sock->ops->connect(sock, (struct sockaddr *)&daddr, addr_len,
 				   0);
 	memset(&tv, 0, sizeof(tv));
-	kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv,
+	kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_OLD, (char *)&tv,
 			  sizeof(tv));
 
 	if (result == -EINPROGRESS)
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 2713e0fa68ef..c56b8b487c12 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -30,8 +30,8 @@
 #define SO_PEERCRED	17
 #define SO_RCVLOWAT	18
 #define SO_SNDLOWAT	19
-#define SO_RCVTIMEO	20
-#define SO_SNDTIMEO	21
+#define SO_RCVTIMEO_OLD	20
+#define SO_SNDTIMEO_OLD	21
 #endif
 
 /* Security levels - as per NRL IPv6 - don't actually do anything */
@@ -116,6 +116,8 @@
 
 #if !defined(__KERNEL__)
 
+#define	SO_RCVTIMEO SO_RCVTIMEO_OLD
+#define	SO_SNDTIMEO SO_SNDTIMEO_OLD
 #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
 /* on 64-bit and x32, avoid the ?: operator */
 #define SO_TIMESTAMP		SO_TIMESTAMP_OLD
diff --git a/net/core/sock.c b/net/core/sock.c
index a9d1ecce96e5..27b40002b780 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -941,11 +941,11 @@ set_rcvbuf:
 			sk->sk_rcvlowat = val ? : 1;
 		break;
 
-	case SO_RCVTIMEO:
+	case SO_RCVTIMEO_OLD:
 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 		break;
 
-	case SO_SNDTIMEO:
+	case SO_SNDTIMEO_OLD:
 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 		break;
 
-- 
cgit v1.2.3-59-g8ed1b


From a9beb86ae6e55bd92f38453c8623de60b8e5a308 Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Sat, 2 Feb 2019 07:34:54 -0800
Subject: sock: Add SO_RCVTIMEO_NEW and SO_SNDTIMEO_NEW

Add new socket timeout options that are y2038 safe.

Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Cc: ccaulfie@redhat.com
Cc: davem@davemloft.net
Cc: deller@gmx.de
Cc: paulus@samba.org
Cc: ralf@linux-mips.org
Cc: rth@twiddle.net
Cc: cluster-devel@redhat.com
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-alpha@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-mips@vger.kernel.org
Cc: linux-parisc@vger.kernel.org
Cc: sparclinux@vger.kernel.org
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/alpha/include/uapi/asm/socket.h  | 12 ++++++--
 arch/mips/include/uapi/asm/socket.h   | 11 ++++++--
 arch/parisc/include/uapi/asm/socket.h | 10 +++++--
 arch/sparc/include/uapi/asm/socket.h  | 11 ++++++--
 include/uapi/asm-generic/socket.h     | 11 ++++++--
 net/core/sock.c                       | 53 ++++++++++++++++++++++++++---------
 6 files changed, 83 insertions(+), 25 deletions(-)

(limited to 'include/uapi')

diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index 9826d1db71d0..0d0fddb7e738 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -119,19 +119,25 @@
 #define SO_TIMESTAMPNS_NEW      64
 #define SO_TIMESTAMPING_NEW     65
 
-#if !defined(__KERNEL__)
+#define SO_RCVTIMEO_NEW         66
+#define SO_SNDTIMEO_NEW         67
 
-#define	SO_RCVTIMEO SO_RCVTIMEO_OLD
-#define	SO_SNDTIMEO SO_SNDTIMEO_OLD
+#if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
 #define SO_TIMESTAMP		SO_TIMESTAMP_OLD
 #define SO_TIMESTAMPNS		SO_TIMESTAMPNS_OLD
 #define SO_TIMESTAMPING         SO_TIMESTAMPING_OLD
+
+#define SO_RCVTIMEO		SO_RCVTIMEO_OLD
+#define SO_SNDTIMEO		SO_SNDTIMEO_OLD
 #else
 #define SO_TIMESTAMP (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMP_OLD : SO_TIMESTAMP_NEW)
 #define SO_TIMESTAMPNS (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPNS_OLD : SO_TIMESTAMPNS_NEW)
 #define SO_TIMESTAMPING (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPING_OLD : SO_TIMESTAMPING_NEW)
+
+#define SO_RCVTIMEO (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_RCVTIMEO_OLD : SO_RCVTIMEO_NEW)
+#define SO_SNDTIMEO (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_SNDTIMEO_OLD : SO_SNDTIMEO_NEW)
 #endif
 
 #define SCM_TIMESTAMP           SO_TIMESTAMP
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index 96cc0e907f12..eb9f33f8a8b3 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -130,18 +130,25 @@
 #define SO_TIMESTAMPNS_NEW      64
 #define SO_TIMESTAMPING_NEW     65
 
+#define SO_RCVTIMEO_NEW         66
+#define SO_SNDTIMEO_NEW         67
+
 #if !defined(__KERNEL__)
 
-#define	SO_RCVTIMEO SO_RCVTIMEO_OLD
-#define	SO_SNDTIMEO SO_SNDTIMEO_OLD
 #if __BITS_PER_LONG == 64
 #define SO_TIMESTAMP		SO_TIMESTAMP_OLD
 #define SO_TIMESTAMPNS		SO_TIMESTAMPNS_OLD
 #define SO_TIMESTAMPING		SO_TIMESTAMPING_OLD
+
+#define SO_RCVTIMEO             SO_RCVTIMEO_OLD
+#define SO_SNDTIMEO             SO_SNDTIMEO_OLD
 #else
 #define SO_TIMESTAMP (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMP_OLD : SO_TIMESTAMP_NEW)
 #define SO_TIMESTAMPNS (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPNS_OLD : SO_TIMESTAMPNS_NEW)
 #define SO_TIMESTAMPING (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPING_OLD : SO_TIMESTAMPING_NEW)
+
+#define SO_RCVTIMEO (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_RCVTIMEO_OLD : SO_RCVTIMEO_NEW)
+#define SO_SNDTIMEO (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_SNDTIMEO_OLD : SO_SNDTIMEO_NEW)
 #endif
 
 #define SCM_TIMESTAMP           SO_TIMESTAMP
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index 046f0cd9cce4..16e428f03526 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -111,18 +111,24 @@
 #define SO_TIMESTAMPNS_NEW      0x4039
 #define SO_TIMESTAMPING_NEW     0x403A
 
+#define SO_RCVTIMEO_NEW         0x4040
+#define SO_SNDTIMEO_NEW         0x4041
+
 #if !defined(__KERNEL__)
 
-#define	SO_RCVTIMEO SO_RCVTIMEO_OLD
-#define	SO_SNDTIMEO SO_SNDTIMEO_OLD
 #if __BITS_PER_LONG == 64
 #define SO_TIMESTAMP		SO_TIMESTAMP_OLD
 #define SO_TIMESTAMPNS		SO_TIMESTAMPNS_OLD
 #define SO_TIMESTAMPING         SO_TIMESTAMPING_OLD
+#define SO_RCVTIMEO		SO_RCVTIMEO_OLD
+#define SO_SNDTIMEO		SO_SNDTIMEO_OLD
 #else
 #define SO_TIMESTAMP (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMP_OLD : SO_TIMESTAMP_NEW)
 #define SO_TIMESTAMPNS (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPNS_OLD : SO_TIMESTAMPNS_NEW)
 #define SO_TIMESTAMPING (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPING_OLD : SO_TIMESTAMPING_NEW)
+
+#define SO_RCVTIMEO (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_RCVTIMEO_OLD : SO_RCVTIMEO_NEW)
+#define SO_SNDTIMEO (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_SNDTIMEO_OLD : SO_SNDTIMEO_NEW)
 #endif
 
 #define SCM_TIMESTAMP           SO_TIMESTAMP
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index 342ffdc3b424..8c9f74a66b55 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -112,19 +112,26 @@
 #define SO_TIMESTAMPNS_NEW       0x0042
 #define SO_TIMESTAMPING_NEW      0x0043
 
+#define SO_RCVTIMEO_NEW          0x0044
+#define SO_SNDTIMEO_NEW          0x0045
+
 #if !defined(__KERNEL__)
 
-#define SO_RCVTIMEO              SO_RCVTIMEO_OLD
-#define SO_SNDTIMEO              SO_SNDTIMEO_OLD
 
 #if __BITS_PER_LONG == 64
 #define SO_TIMESTAMP		SO_TIMESTAMP_OLD
 #define SO_TIMESTAMPNS		SO_TIMESTAMPNS_OLD
 #define SO_TIMESTAMPING		SO_TIMESTAMPING_OLD
+
+#define SO_RCVTIMEO		SO_RCVTIMEO_OLD
+#define SO_SNDTIMEO		SO_SNDTIMEO_OLD
 #else
 #define SO_TIMESTAMP (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMP_OLD : SO_TIMESTAMP_NEW)
 #define SO_TIMESTAMPNS (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPNS_OLD : SO_TIMESTAMPNS_NEW)
 #define SO_TIMESTAMPING (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPING_OLD : SO_TIMESTAMPING_NEW)
+
+#define SO_RCVTIMEO (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_RCVTIMEO_OLD : SO_RCVTIMEO_NEW)
+#define SO_SNDTIMEO (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_SNDTIMEO_OLD : SO_SNDTIMEO_NEW)
 #endif
 
 #define SCM_TIMESTAMP          SO_TIMESTAMP
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index c56b8b487c12..c8b430cb6dc4 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -114,19 +114,26 @@
 #define SO_TIMESTAMPNS_NEW      64
 #define SO_TIMESTAMPING_NEW     65
 
+#define SO_RCVTIMEO_NEW         66
+#define SO_SNDTIMEO_NEW         67
+
 #if !defined(__KERNEL__)
 
-#define	SO_RCVTIMEO SO_RCVTIMEO_OLD
-#define	SO_SNDTIMEO SO_SNDTIMEO_OLD
 #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
 /* on 64-bit and x32, avoid the ?: operator */
 #define SO_TIMESTAMP		SO_TIMESTAMP_OLD
 #define SO_TIMESTAMPNS		SO_TIMESTAMPNS_OLD
 #define SO_TIMESTAMPING		SO_TIMESTAMPING_OLD
+
+#define SO_RCVTIMEO		SO_RCVTIMEO_OLD
+#define SO_SNDTIMEO		SO_SNDTIMEO_OLD
 #else
 #define SO_TIMESTAMP (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMP_OLD : SO_TIMESTAMP_NEW)
 #define SO_TIMESTAMPNS (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPNS_OLD : SO_TIMESTAMPNS_NEW)
 #define SO_TIMESTAMPING (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPING_OLD : SO_TIMESTAMPING_NEW)
+
+#define SO_RCVTIMEO (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_RCVTIMEO_OLD : SO_RCVTIMEO_NEW)
+#define SO_SNDTIMEO (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_SNDTIMEO_OLD : SO_SNDTIMEO_NEW)
 #endif
 
 #define SCM_TIMESTAMP           SO_TIMESTAMP
diff --git a/net/core/sock.c b/net/core/sock.c
index 27b40002b780..a8904ae40713 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -335,9 +335,10 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(__sk_backlog_rcv);
 
-static int sock_get_timeout(long timeo, void *optval)
+static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
 {
-	struct __kernel_old_timeval tv;
+	struct __kernel_sock_timeval tv;
+	int size;
 
 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
 		tv.tv_sec = 0;
@@ -353,13 +354,23 @@ static int sock_get_timeout(long timeo, void *optval)
 		return sizeof(tv32);
 	}
 
-	*(struct __kernel_old_timeval *)optval = tv;
-	return sizeof(tv);
+	if (old_timeval) {
+		struct __kernel_old_timeval old_tv;
+		old_tv.tv_sec = tv.tv_sec;
+		old_tv.tv_usec = tv.tv_usec;
+		*(struct __kernel_old_timeval *)optval = old_tv;
+		size = sizeof(old_tv);
+	} else {
+		*(struct __kernel_sock_timeval *)optval = tv;
+		size = sizeof(tv);
+	}
+
+	return size;
 }
 
-static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
+static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool old_timeval)
 {
-	struct __kernel_old_timeval tv;
+	struct __kernel_sock_timeval tv;
 
 	if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 		struct old_timeval32 tv32;
@@ -371,6 +382,15 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 			return -EFAULT;
 		tv.tv_sec = tv32.tv_sec;
 		tv.tv_usec = tv32.tv_usec;
+	} else if (old_timeval) {
+		struct __kernel_old_timeval old_tv;
+
+		if (optlen < sizeof(old_tv))
+			return -EINVAL;
+		if (copy_from_user(&old_tv, optval, sizeof(old_tv)))
+			return -EFAULT;
+		tv.tv_sec = old_tv.tv_sec;
+		tv.tv_usec = old_tv.tv_usec;
 	} else {
 		if (optlen < sizeof(tv))
 			return -EINVAL;
@@ -394,8 +414,8 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
 		return 0;
-	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
-		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
+	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
+		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
 	return 0;
 }
 
@@ -942,11 +962,13 @@ set_rcvbuf:
 		break;
 
 	case SO_RCVTIMEO_OLD:
-		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
+	case SO_RCVTIMEO_NEW:
+		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen, optname == SO_RCVTIMEO_OLD);
 		break;
 
 	case SO_SNDTIMEO_OLD:
-		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
+	case SO_SNDTIMEO_NEW:
+		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen, optname == SO_SNDTIMEO_OLD);
 		break;
 
 	case SO_ATTACH_FILTER:
@@ -1171,6 +1193,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 		struct linger ling;
 		struct old_timeval32 tm32;
 		struct __kernel_old_timeval tm;
+		struct  __kernel_sock_timeval stm;
 		struct sock_txtime txtime;
 	} v;
 
@@ -1279,12 +1302,14 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 		v.val = sk->sk_tsflags;
 		break;
 
-	case SO_RCVTIMEO:
-		lv = sock_get_timeout(sk->sk_rcvtimeo, &v);
+	case SO_RCVTIMEO_OLD:
+	case SO_RCVTIMEO_NEW:
+		lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
 		break;
 
-	case SO_SNDTIMEO:
-		lv = sock_get_timeout(sk->sk_sndtimeo, &v);
+	case SO_SNDTIMEO_OLD:
+	case SO_SNDTIMEO_NEW:
+		lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
 		break;
 
 	case SO_RCVLOWAT:
-- 
cgit v1.2.3-59-g8ed1b


From bff5731d43efbdf0bbd2d73cab32fe6435ea1046 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 1 Feb 2019 17:56:28 -0800
Subject: net: devlink: report cell size of shared buffers

Shared buffer allocation is usually done in cell increments.
Drivers will either round up the allocation or refuse the
configuration if it's not an exact multiple of cell size.
Drivers know exactly the cell size of shared buffer, so help
out users by providing this information in dumps.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c | 1 +
 drivers/net/ethernet/netronome/nfp/nfp_shared_buf.c    | 1 +
 include/net/devlink.h                                  | 1 +
 include/uapi/linux/devlink.h                           | 2 ++
 net/core/devlink.c                                     | 3 +++
 5 files changed, 8 insertions(+)

(limited to 'include/uapi')

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c
index 12c61e0cc570..80066f437a65 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c
@@ -713,6 +713,7 @@ int mlxsw_sp_sb_pool_get(struct mlxsw_core *mlxsw_core,
 	pool_info->pool_type = (enum devlink_sb_pool_type) dir;
 	pool_info->size = mlxsw_sp_cells_bytes(mlxsw_sp, pr->size);
 	pool_info->threshold_type = (enum devlink_sb_threshold_type) pr->mode;
+	pool_info->cell_size = mlxsw_sp->sb->cell_size;
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_shared_buf.c b/drivers/net/ethernet/netronome/nfp/nfp_shared_buf.c
index 814360ed3a20..ea2e3f829aba 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_shared_buf.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_shared_buf.c
@@ -48,6 +48,7 @@ int nfp_shared_buf_pool_get(struct nfp_pf *pf, unsigned int sb, u16 pool_index,
 	pool_info->pool_type = le32_to_cpu(get_data.pool_type);
 	pool_info->threshold_type = le32_to_cpu(get_data.threshold_type);
 	pool_info->size = le32_to_cpu(get_data.size) * unit_size;
+	pool_info->cell_size = unit_size;
 
 	return 0;
 }
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 1c8523920f66..74d992a68a06 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -62,6 +62,7 @@ struct devlink_sb_pool_info {
 	enum devlink_sb_pool_type pool_type;
 	u32 size;
 	enum devlink_sb_threshold_type threshold_type;
+	u32 cell_size;
 };
 
 /**
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 7fffd879c328..054b2d1a4537 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -300,6 +300,8 @@ enum devlink_attr {
 	DEVLINK_ATTR_INFO_VERSION_NAME,		/* string */
 	DEVLINK_ATTR_INFO_VERSION_VALUE,	/* string */
 
+	DEVLINK_ATTR_SB_POOL_CELL_SIZE,		/* u32 */
+
 	/* add new attributes above here, update the policy in devlink.c */
 
 	__DEVLINK_ATTR_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index eb839d74bcc0..52bf27491fb8 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -932,6 +932,9 @@ static int devlink_nl_sb_pool_fill(struct sk_buff *msg, struct devlink *devlink,
 	if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE,
 		       pool_info.threshold_type))
 		goto nla_put_failure;
+	if (nla_put_u32(msg, DEVLINK_ATTR_SB_POOL_CELL_SIZE,
+			pool_info.cell_size))
+		goto nla_put_failure;
 
 	genlmsg_end(msg, hdr);
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From a46c52d9f2659498f0c0871f7f2333a692c243fe Mon Sep 17 00:00:00 2001
From: wenxu <wenxu@ucloud.cn>
Date: Tue, 29 Jan 2019 15:51:17 +0800
Subject: netfilter: nft_tunnel: Add NFTA_TUNNEL_MODE options

nft "tunnel" expr match both the tun_info of RX and TX. This patch
provide the NFTA_TUNNEL_MODE to individually match the tun_info of
RX or TX.

Signed-off-by: wenxu <wenxu@ucloud.cn>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  9 +++++++++
 net/netfilter/nft_tunnel.c               | 34 ++++++++++++++++++++++++++++++--
 2 files changed, 41 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 030302893d96..a66c8de006cc 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1727,10 +1727,19 @@ enum nft_tunnel_keys {
 };
 #define NFT_TUNNEL_MAX	(__NFT_TUNNEL_MAX - 1)
 
+enum nft_tunnel_mode {
+	NFT_TUNNEL_MODE_NONE,
+	NFT_TUNNEL_MODE_RX,
+	NFT_TUNNEL_MODE_TX,
+	__NFT_TUNNEL_MODE_MAX
+};
+#define NFT_TUNNEL_MODE_MAX	(__NFT_TUNNEL_MODE_MAX - 1)
+
 enum nft_tunnel_attributes {
 	NFTA_TUNNEL_UNSPEC,
 	NFTA_TUNNEL_KEY,
 	NFTA_TUNNEL_DREG,
+	NFTA_TUNNEL_MODE,
 	__NFTA_TUNNEL_MAX
 };
 #define NFTA_TUNNEL_MAX	(__NFTA_TUNNEL_MAX - 1)
diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c
index 3a15f219e4e7..ea28588c5eed 100644
--- a/net/netfilter/nft_tunnel.c
+++ b/net/netfilter/nft_tunnel.c
@@ -15,6 +15,7 @@
 struct nft_tunnel {
 	enum nft_tunnel_keys	key:8;
 	enum nft_registers	dreg:8;
+	enum nft_tunnel_mode	mode:8;
 };
 
 static void nft_tunnel_get_eval(const struct nft_expr *expr,
@@ -29,14 +30,32 @@ static void nft_tunnel_get_eval(const struct nft_expr *expr,
 
 	switch (priv->key) {
 	case NFT_TUNNEL_PATH:
-		nft_reg_store8(dest, !!tun_info);
+		if (!tun_info) {
+			nft_reg_store8(dest, false);
+			return;
+		}
+		if (priv->mode == NFT_TUNNEL_MODE_NONE ||
+		    (priv->mode == NFT_TUNNEL_MODE_RX &&
+		     !(tun_info->mode & IP_TUNNEL_INFO_TX)) ||
+		    (priv->mode == NFT_TUNNEL_MODE_TX &&
+		     (tun_info->mode & IP_TUNNEL_INFO_TX)))
+			nft_reg_store8(dest, true);
+		else
+			nft_reg_store8(dest, false);
 		break;
 	case NFT_TUNNEL_ID:
 		if (!tun_info) {
 			regs->verdict.code = NFT_BREAK;
 			return;
 		}
-		*dest = ntohl(tunnel_id_to_key32(tun_info->key.tun_id));
+		if (priv->mode == NFT_TUNNEL_MODE_NONE ||
+		    (priv->mode == NFT_TUNNEL_MODE_RX &&
+		     !(tun_info->mode & IP_TUNNEL_INFO_TX)) ||
+		    (priv->mode == NFT_TUNNEL_MODE_TX &&
+		     (tun_info->mode & IP_TUNNEL_INFO_TX)))
+			*dest = ntohl(tunnel_id_to_key32(tun_info->key.tun_id));
+		else
+			regs->verdict.code = NFT_BREAK;
 		break;
 	default:
 		WARN_ON(1);
@@ -47,6 +66,7 @@ static void nft_tunnel_get_eval(const struct nft_expr *expr,
 static const struct nla_policy nft_tunnel_policy[NFTA_TUNNEL_MAX + 1] = {
 	[NFTA_TUNNEL_KEY]	= { .type = NLA_U32 },
 	[NFTA_TUNNEL_DREG]	= { .type = NLA_U32 },
+	[NFTA_TUNNEL_MODE]	= { .type = NLA_U32 },
 };
 
 static int nft_tunnel_get_init(const struct nft_ctx *ctx,
@@ -74,6 +94,14 @@ static int nft_tunnel_get_init(const struct nft_ctx *ctx,
 
 	priv->dreg = nft_parse_register(tb[NFTA_TUNNEL_DREG]);
 
+	if (tb[NFTA_TUNNEL_MODE]) {
+		priv->mode = ntohl(nla_get_be32(tb[NFTA_TUNNEL_MODE]));
+		if (priv->mode > NFT_TUNNEL_MODE_MAX)
+			return -EOPNOTSUPP;
+	} else {
+		priv->mode = NFT_TUNNEL_MODE_NONE;
+	}
+
 	return nft_validate_register_store(ctx, priv->dreg, NULL,
 					   NFT_DATA_VALUE, len);
 }
@@ -87,6 +115,8 @@ static int nft_tunnel_get_dump(struct sk_buff *skb,
 		goto nla_put_failure;
 	if (nft_dump_register(skb, NFTA_TUNNEL_DREG, priv->dreg))
 		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_TUNNEL_MODE, htonl(priv->mode)))
+		goto nla_put_failure;
 	return 0;
 
 nla_put_failure:
-- 
cgit v1.2.3-59-g8ed1b


From 52a72e2a395fa3c5ab5df41058a8511e87215730 Mon Sep 17 00:00:00 2001
From: Moni Shoua <monis@mellanox.com>
Date: Tue, 22 Jan 2019 08:48:42 +0200
Subject: IB/uverbs: Expose XRC ODP device capabilities

Expose XRC ODP capabilities as part of the extended device capabilities.

Signed-off-by: Moni Shoua <monis@mellanox.com>
Reviewed-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/uverbs_cmd.c | 1 +
 include/rdma/ib_verbs.h              | 1 +
 include/uapi/rdma/ib_user_verbs.h    | 2 ++
 3 files changed, 4 insertions(+)

(limited to 'include/uapi')

diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index d4f1a2ef5015..68c4ea514faf 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -3613,6 +3613,7 @@ static int ib_uverbs_ex_query_device(struct uverbs_attr_bundle *attrs)
 		attr.odp_caps.per_transport_caps.uc_odp_caps;
 	resp.odp_caps.per_transport_caps.ud_odp_caps =
 		attr.odp_caps.per_transport_caps.ud_odp_caps;
+	resp.xrc_odp_caps = attr.odp_caps.per_transport_caps.xrc_odp_caps;
 
 	resp.timestamp_mask = attr.timestamp_mask;
 	resp.hca_core_clock = attr.hca_core_clock;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 5eefdea62831..8219c07340a9 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -277,6 +277,7 @@ struct ib_odp_caps {
 		uint32_t  rc_odp_caps;
 		uint32_t  uc_odp_caps;
 		uint32_t  ud_odp_caps;
+		uint32_t  xrc_odp_caps;
 	} per_transport_caps;
 };
 
diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h
index 480d9a60b68e..0474c7400268 100644
--- a/include/uapi/rdma/ib_user_verbs.h
+++ b/include/uapi/rdma/ib_user_verbs.h
@@ -270,6 +270,8 @@ struct ib_uverbs_ex_query_device_resp {
 	struct ib_uverbs_tm_caps tm_caps;
 	struct ib_uverbs_cq_moderation_caps cq_moderation_caps;
 	__aligned_u64 max_dm_size;
+	__u32 xrc_odp_caps;
+	__u32 reserved;
 };
 
 struct ib_uverbs_query_port {
-- 
cgit v1.2.3-59-g8ed1b


From 668aa15b5bf87f156ec805cb7348c785c56b82ab Mon Sep 17 00:00:00 2001
From: Kamal Heib <kamalheib1@gmail.com>
Date: Tue, 29 Jan 2019 12:08:50 +0200
Subject: RDMA/rxe: Improve loopback marking

Currently a packet is marked for loopback only if the source and
destination addresses equals. This is not enough when multiple gids are
present in rxe device's gid table and the traffic is from one gid to
another. Fix it by marking the packet for loopback if the destination MAC
address is equal to the source MAC address.

Signed-off-by: Kamal Heib <kamalheib1@gmail.com>
Reviewed-by: Yuval Shaia <yuval.shaia@oracle.com>
Tested-by: Yuval Shaia <yuval.shaia@oracle.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/sw/rxe/rxe_av.c  | 1 +
 drivers/infiniband/sw/rxe/rxe_net.c | 9 +++------
 include/uapi/rdma/rdma_user_rxe.h   | 3 +--
 3 files changed, 5 insertions(+), 8 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/infiniband/sw/rxe/rxe_av.c b/drivers/infiniband/sw/rxe/rxe_av.c
index 27a7dec18874..81ee756c19b8 100644
--- a/drivers/infiniband/sw/rxe/rxe_av.c
+++ b/drivers/infiniband/sw/rxe/rxe_av.c
@@ -38,6 +38,7 @@ void rxe_init_av(struct rdma_ah_attr *attr, struct rxe_av *av)
 {
 	rxe_av_from_attr(rdma_ah_get_port_num(attr), av, attr);
 	rxe_av_fill_ip_info(av, attr);
+	memcpy(av->dmac, attr->roce.dmac, ETH_ALEN);
 }
 
 int rxe_av_chk_attr(struct rxe_dev *rxe, struct rdma_ah_attr *attr)
diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
index 19f3c69916b1..3b162e92e8e8 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.c
+++ b/drivers/infiniband/sw/rxe/rxe_net.c
@@ -384,9 +384,6 @@ static int prepare4(struct rxe_pkt_info *pkt, struct sk_buff *skb,
 		return -EHOSTUNREACH;
 	}
 
-	if (!memcmp(saddr, daddr, sizeof(*daddr)))
-		pkt->mask |= RXE_LOOPBACK_MASK;
-
 	prepare_udp_hdr(skb, cpu_to_be16(qp->src_port),
 			cpu_to_be16(ROCE_V2_UDP_DPORT));
 
@@ -411,9 +408,6 @@ static int prepare6(struct rxe_pkt_info *pkt, struct sk_buff *skb,
 		return -EHOSTUNREACH;
 	}
 
-	if (!memcmp(saddr, daddr, sizeof(*daddr)))
-		pkt->mask |= RXE_LOOPBACK_MASK;
-
 	prepare_udp_hdr(skb, cpu_to_be16(qp->src_port),
 			cpu_to_be16(ROCE_V2_UDP_DPORT));
 
@@ -437,6 +431,9 @@ int rxe_prepare(struct rxe_pkt_info *pkt, struct sk_buff *skb, u32 *crc)
 
 	*crc = rxe_icrc_hdr(pkt, skb);
 
+	if (ether_addr_equal(skb->dev->dev_addr, av->dmac))
+		pkt->mask |= RXE_LOOPBACK_MASK;
+
 	return err;
 }
 
diff --git a/include/uapi/rdma/rdma_user_rxe.h b/include/uapi/rdma/rdma_user_rxe.h
index 44ef6a3b7afc..aae2e696bb38 100644
--- a/include/uapi/rdma/rdma_user_rxe.h
+++ b/include/uapi/rdma/rdma_user_rxe.h
@@ -58,8 +58,7 @@ struct rxe_global_route {
 struct rxe_av {
 	__u8			port_num;
 	__u8			network_type;
-	__u16			reserved1;
-	__u32			reserved2;
+	__u8			dmac[6];
 	struct rxe_global_route	grh;
 	union {
 		struct sockaddr_in	_sockaddr_in;
-- 
cgit v1.2.3-59-g8ed1b


From 3eb450367d0823226515ee24712ed08eccb33eb9 Mon Sep 17 00:00:00 2001
From: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Date: Tue, 23 Oct 2018 23:21:14 -0400
Subject: rds: add type of service(tos) infrastructure

RDS Service type (TOS) is user-defined and needs to be configured
via RDS IOCTL interface. It must be set before initiating any
traffic and once set the TOS can not be changed. All out-going
traffic from the socket will be associated with its TOS.

Reviewed-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
[yanjun.zhu@oracle.com: Adapted original patch with ipv6 changes]
Signed-off-by: Zhu Yanjun <yanjun.zhu@oracle.com>
---
 include/uapi/linux/rds.h | 11 +++++++++++
 net/rds/af_rds.c         | 35 ++++++++++++++++++++++++++++++++++-
 net/rds/connection.c     | 20 +++++++++++---------
 net/rds/ib.c             |  1 +
 net/rds/ib_cm.c          |  2 +-
 net/rds/rdma_transport.c |  1 +
 net/rds/rds.h            |  9 +++++++--
 net/rds/recv.c           |  1 +
 net/rds/send.c           |  6 +++---
 net/rds/tcp.c            |  1 +
 net/rds/tcp_listen.c     |  2 +-
 11 files changed, 72 insertions(+), 17 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h
index 8b73cb603c5f..5d0f76c780e5 100644
--- a/include/uapi/linux/rds.h
+++ b/include/uapi/linux/rds.h
@@ -69,6 +69,12 @@
 #define RDS_TRANS_COUNT	3
 #define	RDS_TRANS_NONE	(~0)
 
+/* IOCTLS commands for SOL_RDS */
+#define SIOCRDSSETTOS		(SIOCPROTOPRIVATE)
+#define SIOCRDSGETTOS		(SIOCPROTOPRIVATE + 1)
+
+typedef __u8	rds_tos_t;
+
 /*
  * Control message types for SOL_RDS.
  *
@@ -149,6 +155,7 @@ struct rds_info_connection {
 	__be32		faddr;
 	__u8		transport[TRANSNAMSIZ];		/* null term ascii */
 	__u8		flags;
+	__u8		tos;
 } __attribute__((packed));
 
 struct rds6_info_connection {
@@ -171,6 +178,7 @@ struct rds_info_message {
 	__be16		lport;
 	__be16		fport;
 	__u8		flags;
+	__u8		tos;
 } __attribute__((packed));
 
 struct rds6_info_message {
@@ -214,6 +222,7 @@ struct rds_info_tcp_socket {
 	__u32           last_sent_nxt;
 	__u32           last_expected_una;
 	__u32           last_seen_una;
+	__u8		tos;
 } __attribute__((packed));
 
 struct rds6_info_tcp_socket {
@@ -240,6 +249,7 @@ struct rds_info_rdma_connection {
 	__u32		max_send_sge;
 	__u32		rdma_mr_max;
 	__u32		rdma_mr_size;
+	__u8		tos;
 };
 
 struct rds6_info_rdma_connection {
@@ -253,6 +263,7 @@ struct rds6_info_rdma_connection {
 	__u32		max_send_sge;
 	__u32		rdma_mr_max;
 	__u32		rdma_mr_size;
+	__u8		tos;
 };
 
 /* RDS message Receive Path Latency points */
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 65571a6273c3..904515858037 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -254,7 +254,38 @@ static __poll_t rds_poll(struct file *file, struct socket *sock,
 
 static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 {
-	return -ENOIOCTLCMD;
+	struct rds_sock *rs = rds_sk_to_rs(sock->sk);
+	rds_tos_t tos;
+
+	switch (cmd) {
+	case SIOCRDSSETTOS:
+		if (get_user(tos, (rds_tos_t __user *)arg))
+			return -EFAULT;
+
+		if (rs->rs_transport &&
+		    rs->rs_transport->t_type == RDS_TRANS_TCP)
+			tos = 0;
+
+		spin_lock_bh(&rds_sock_lock);
+		if (rs->rs_tos || rs->rs_conn) {
+			spin_unlock_bh(&rds_sock_lock);
+			return -EINVAL;
+		}
+		rs->rs_tos = tos;
+		spin_unlock_bh(&rds_sock_lock);
+		break;
+	case SIOCRDSGETTOS:
+		spin_lock_bh(&rds_sock_lock);
+		tos = rs->rs_tos;
+		spin_unlock_bh(&rds_sock_lock);
+		if (put_user(tos, (rds_tos_t __user *)arg))
+			return -EFAULT;
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+
+	return 0;
 }
 
 static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
@@ -650,6 +681,8 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
 	spin_lock_init(&rs->rs_rdma_lock);
 	rs->rs_rdma_keys = RB_ROOT;
 	rs->rs_rx_traces = 0;
+	rs->rs_tos = 0;
+	rs->rs_conn = NULL;
 
 	spin_lock_bh(&rds_sock_lock);
 	list_add_tail(&rs->rs_item, &rds_sock_list);
diff --git a/net/rds/connection.c b/net/rds/connection.c
index 1ab14b68ecc8..7ea134f9a825 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -84,7 +84,7 @@ static struct rds_connection *rds_conn_lookup(struct net *net,
 					      const struct in6_addr *laddr,
 					      const struct in6_addr *faddr,
 					      struct rds_transport *trans,
-					      int dev_if)
+					      u8 tos, int dev_if)
 {
 	struct rds_connection *conn, *ret = NULL;
 
@@ -92,6 +92,7 @@ static struct rds_connection *rds_conn_lookup(struct net *net,
 		if (ipv6_addr_equal(&conn->c_faddr, faddr) &&
 		    ipv6_addr_equal(&conn->c_laddr, laddr) &&
 		    conn->c_trans == trans &&
+		    conn->c_tos == tos &&
 		    net == rds_conn_net(conn) &&
 		    conn->c_dev_if == dev_if) {
 			ret = conn;
@@ -160,7 +161,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
 						const struct in6_addr *laddr,
 						const struct in6_addr *faddr,
 						struct rds_transport *trans,
-						gfp_t gfp,
+						gfp_t gfp, u8 tos,
 						int is_outgoing,
 						int dev_if)
 {
@@ -172,7 +173,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
 	int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1);
 
 	rcu_read_lock();
-	conn = rds_conn_lookup(net, head, laddr, faddr, trans, dev_if);
+	conn = rds_conn_lookup(net, head, laddr, faddr, trans, tos, dev_if);
 	if (conn &&
 	    conn->c_loopback &&
 	    conn->c_trans != &rds_loop_transport &&
@@ -206,6 +207,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
 	conn->c_isv6 = !ipv6_addr_v4mapped(laddr);
 	conn->c_faddr = *faddr;
 	conn->c_dev_if = dev_if;
+	conn->c_tos = tos;
 
 #if IS_ENABLED(CONFIG_IPV6)
 	/* If the local address is link local, set c_bound_if to be the
@@ -298,7 +300,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
 		struct rds_connection *found;
 
 		found = rds_conn_lookup(net, head, laddr, faddr, trans,
-					dev_if);
+					tos, dev_if);
 		if (found) {
 			struct rds_conn_path *cp;
 			int i;
@@ -333,10 +335,10 @@ out:
 struct rds_connection *rds_conn_create(struct net *net,
 				       const struct in6_addr *laddr,
 				       const struct in6_addr *faddr,
-				       struct rds_transport *trans, gfp_t gfp,
-				       int dev_if)
+				       struct rds_transport *trans, u8 tos,
+				       gfp_t gfp, int dev_if)
 {
-	return __rds_conn_create(net, laddr, faddr, trans, gfp, 0, dev_if);
+	return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 0, dev_if);
 }
 EXPORT_SYMBOL_GPL(rds_conn_create);
 
@@ -344,9 +346,9 @@ struct rds_connection *rds_conn_create_outgoing(struct net *net,
 						const struct in6_addr *laddr,
 						const struct in6_addr *faddr,
 						struct rds_transport *trans,
-						gfp_t gfp, int dev_if)
+						u8 tos, gfp_t gfp, int dev_if)
 {
-	return __rds_conn_create(net, laddr, faddr, trans, gfp, 1, dev_if);
+	return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 1, dev_if);
 }
 EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
 
diff --git a/net/rds/ib.c b/net/rds/ib.c
index 9d7b7586f240..21b6588b71ca 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -301,6 +301,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
 
 	iinfo->src_addr = conn->c_laddr.s6_addr32[3];
 	iinfo->dst_addr = conn->c_faddr.s6_addr32[3];
+	iinfo->tos = conn->c_tos;
 
 	memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
 	memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index a1c3ad380ec8..70518e329a9e 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -786,7 +786,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
 
 	/* RDS/IB is not currently netns aware, thus init_net */
 	conn = rds_conn_create(&init_net, daddr6, saddr6,
-			       &rds_ib_transport, GFP_KERNEL, ifindex);
+			       &rds_ib_transport, 0, GFP_KERNEL, ifindex);
 	if (IS_ERR(conn)) {
 		rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
 		conn = NULL;
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index 63cbc6b8560c..e37f91537d29 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -115,6 +115,7 @@ static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
 			pr_warn("RDS/RDMA: conn <%pI6c, %pI6c> rejected, dropping connection\n",
 				&conn->c_laddr, &conn->c_faddr);
 			conn->c_proposed_version = RDS_PROTOCOL_COMPAT_VERSION;
+			conn->c_tos = 0;
 			rds_conn_drop(conn);
 		}
 		rdsdebug("Connection rejected: %s\n",
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 660023f08553..7e52b92092d7 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -158,6 +158,9 @@ struct rds_connection {
 	unsigned int		c_version;
 	possible_net_t		c_net;
 
+	/* TOS */
+	u8			c_tos;
+
 	struct list_head	c_map_item;
 	unsigned long		c_map_queued;
 
@@ -652,6 +655,7 @@ struct rds_sock {
 	u8			rs_rx_traces;
 	u8			rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX];
 	struct rds_msg_zcopy_queue rs_zcookie_queue;
+	u8			rs_tos;
 };
 
 static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk)
@@ -760,13 +764,14 @@ void rds_conn_exit(void);
 struct rds_connection *rds_conn_create(struct net *net,
 				       const struct in6_addr *laddr,
 				       const struct in6_addr *faddr,
-				       struct rds_transport *trans, gfp_t gfp,
+				       struct rds_transport *trans,
+				       u8 tos, gfp_t gfp,
 				       int dev_if);
 struct rds_connection *rds_conn_create_outgoing(struct net *net,
 						const struct in6_addr *laddr,
 						const struct in6_addr *faddr,
 						struct rds_transport *trans,
-						gfp_t gfp, int dev_if);
+						u8 tos, gfp_t gfp, int dev_if);
 void rds_conn_shutdown(struct rds_conn_path *cpath);
 void rds_conn_destroy(struct rds_connection *conn);
 void rds_conn_drop(struct rds_connection *conn);
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 6bb6b16ca270..853de4876088 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -782,6 +782,7 @@ void rds_inc_info_copy(struct rds_incoming *inc,
 
 	minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence);
 	minfo.len = be32_to_cpu(inc->i_hdr.h_len);
+	minfo.tos = inc->i_conn->c_tos;
 
 	if (flip) {
 		minfo.laddr = daddr;
diff --git a/net/rds/send.c b/net/rds/send.c
index fd8b687d5c05..c555e121b908 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -1277,12 +1277,12 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 
 	/* rds_conn_create has a spinlock that runs with IRQ off.
 	 * Caching the conn in the socket helps a lot. */
-	if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr))
+	if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr)) {
 		conn = rs->rs_conn;
-	else {
+	} else {
 		conn = rds_conn_create_outgoing(sock_net(sock->sk),
 						&rs->rs_bound_addr, &daddr,
-						rs->rs_transport,
+						rs->rs_transport, 0,
 						sock->sk->sk_allocation,
 						scope_id);
 		if (IS_ERR(conn)) {
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index c16f0a362c32..eb6851952cbf 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -267,6 +267,7 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len,
 		tsinfo.last_sent_nxt = tc->t_last_sent_nxt;
 		tsinfo.last_expected_una = tc->t_last_expected_una;
 		tsinfo.last_seen_una = tc->t_last_seen_una;
+		tsinfo.tos = tc->t_cpath->cp_conn->c_tos;
 
 		rds_info_copy(iter, &tsinfo, sizeof(tsinfo));
 	}
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index c12203f646da..810a3a49e947 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -200,7 +200,7 @@ int rds_tcp_accept_one(struct socket *sock)
 
 	conn = rds_conn_create(sock_net(sock->sk),
 			       my_addr, peer_addr,
-			       &rds_tcp_transport, GFP_KERNEL, dev_if);
+			       &rds_tcp_transport, 0, GFP_KERNEL, dev_if);
 
 	if (IS_ERR(conn)) {
 		ret = PTR_ERR(conn);
-- 
cgit v1.2.3-59-g8ed1b


From f76903d574b26bc596951a5c5e757eb02c67abbd Mon Sep 17 00:00:00 2001
From: Steve Wise <swise@opengridcomputing.com>
Date: Tue, 29 Jan 2019 13:33:11 -0800
Subject: RDMA/IWPM: refactor the IWPM message attribute names

In order to add new IWPM_NL attributes, the enums for the IWPM commands
attributes are refactored such that a new attribute can be added without
breaking ABI version 3. Instead of sharing nl attribute enums for both
request and response messages, we create separate enums for each IWPM
message request and reply.  This allows us to extend any given IWPM
message by adding new attributes for just that message.  These new enums
are created, though, in a way to avoid breaking ABI version 3.

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Tatyana Nikolova <Tatyana.E.Nikolova@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/iwpm_msg.c | 40 ++++++++++++++++++++++----------------
 include/uapi/rdma/rdma_netlink.h   | 21 +++++++++++++++++---
 2 files changed, 41 insertions(+), 20 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c
index 8861c052155a..3a8753595c8f 100644
--- a/drivers/infiniband/core/iwpm_msg.c
+++ b/drivers/infiniband/core/iwpm_msg.c
@@ -403,10 +403,12 @@ register_pid_response_exit:
 
 /* netlink attribute policy for the received response to add mapping request */
 static const struct nla_policy resp_add_policy[IWPM_NLA_RMANAGE_MAPPING_MAX] = {
-	[IWPM_NLA_MANAGE_MAPPING_SEQ]     = { .type = NLA_U32 },
-	[IWPM_NLA_MANAGE_ADDR]            = { .len = sizeof(struct sockaddr_storage) },
-	[IWPM_NLA_MANAGE_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) },
-	[IWPM_NLA_RMANAGE_MAPPING_ERR]	  = { .type = NLA_U16 }
+	[IWPM_NLA_RMANAGE_MAPPING_SEQ]     = { .type = NLA_U32 },
+	[IWPM_NLA_RMANAGE_ADDR]            = {
+				.len = sizeof(struct sockaddr_storage) },
+	[IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR] = {
+				.len = sizeof(struct sockaddr_storage) },
+	[IWPM_NLA_RMANAGE_MAPPING_ERR]	   = { .type = NLA_U16 }
 };
 
 /*
@@ -430,7 +432,7 @@ int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb)
 
 	atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
 
-	msg_seq = nla_get_u32(nltb[IWPM_NLA_MANAGE_MAPPING_SEQ]);
+	msg_seq = nla_get_u32(nltb[IWPM_NLA_RMANAGE_MAPPING_SEQ]);
 	nlmsg_request = iwpm_find_nlmsg_request(msg_seq);
 	if (!nlmsg_request) {
 		pr_info("%s: Could not find a matching request (seq = %u)\n",
@@ -439,9 +441,9 @@ int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb)
 	}
 	pm_msg = nlmsg_request->req_buffer;
 	local_sockaddr = (struct sockaddr_storage *)
-			nla_data(nltb[IWPM_NLA_MANAGE_ADDR]);
+			nla_data(nltb[IWPM_NLA_RMANAGE_ADDR]);
 	mapped_sockaddr = (struct sockaddr_storage *)
-			nla_data(nltb[IWPM_NLA_MANAGE_MAPPED_LOC_ADDR]);
+			nla_data(nltb[IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR]);
 
 	if (iwpm_compare_sockaddr(local_sockaddr, &pm_msg->loc_addr)) {
 		nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR;
@@ -472,11 +474,15 @@ add_mapping_response_exit:
 /* netlink attribute policy for the response to add and query mapping request
  * and response with remote address info */
 static const struct nla_policy resp_query_policy[IWPM_NLA_RQUERY_MAPPING_MAX] = {
-	[IWPM_NLA_QUERY_MAPPING_SEQ]      = { .type = NLA_U32 },
-	[IWPM_NLA_QUERY_LOCAL_ADDR]       = { .len = sizeof(struct sockaddr_storage) },
-	[IWPM_NLA_QUERY_REMOTE_ADDR]      = { .len = sizeof(struct sockaddr_storage) },
-	[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) },
-	[IWPM_NLA_RQUERY_MAPPED_REM_ADDR] = { .len = sizeof(struct sockaddr_storage) },
+	[IWPM_NLA_RQUERY_MAPPING_SEQ]     = { .type = NLA_U32 },
+	[IWPM_NLA_RQUERY_LOCAL_ADDR]      = {
+				.len = sizeof(struct sockaddr_storage) },
+	[IWPM_NLA_RQUERY_REMOTE_ADDR]     = {
+				.len = sizeof(struct sockaddr_storage) },
+	[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR] = {
+				.len = sizeof(struct sockaddr_storage) },
+	[IWPM_NLA_RQUERY_MAPPED_REM_ADDR] = {
+				.len = sizeof(struct sockaddr_storage) },
 	[IWPM_NLA_RQUERY_MAPPING_ERR]	  = { .type = NLA_U16 }
 };
 
@@ -502,7 +508,7 @@ int iwpm_add_and_query_mapping_cb(struct sk_buff *skb,
 		return -EINVAL;
 	atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
 
-	msg_seq = nla_get_u32(nltb[IWPM_NLA_QUERY_MAPPING_SEQ]);
+	msg_seq = nla_get_u32(nltb[IWPM_NLA_RQUERY_MAPPING_SEQ]);
 	nlmsg_request = iwpm_find_nlmsg_request(msg_seq);
 	if (!nlmsg_request) {
 		pr_info("%s: Could not find a matching request (seq = %u)\n",
@@ -511,9 +517,9 @@ int iwpm_add_and_query_mapping_cb(struct sk_buff *skb,
 	}
 	pm_msg = nlmsg_request->req_buffer;
 	local_sockaddr = (struct sockaddr_storage *)
-			nla_data(nltb[IWPM_NLA_QUERY_LOCAL_ADDR]);
+			nla_data(nltb[IWPM_NLA_RQUERY_LOCAL_ADDR]);
 	remote_sockaddr = (struct sockaddr_storage *)
-			nla_data(nltb[IWPM_NLA_QUERY_REMOTE_ADDR]);
+			nla_data(nltb[IWPM_NLA_RQUERY_REMOTE_ADDR]);
 	mapped_loc_sockaddr = (struct sockaddr_storage *)
 			nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]);
 	mapped_rem_sockaddr = (struct sockaddr_storage *)
@@ -588,9 +594,9 @@ int iwpm_remote_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
 	atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
 
 	local_sockaddr = (struct sockaddr_storage *)
-			nla_data(nltb[IWPM_NLA_QUERY_LOCAL_ADDR]);
+			nla_data(nltb[IWPM_NLA_RQUERY_LOCAL_ADDR]);
 	remote_sockaddr = (struct sockaddr_storage *)
-			nla_data(nltb[IWPM_NLA_QUERY_REMOTE_ADDR]);
+			nla_data(nltb[IWPM_NLA_RQUERY_REMOTE_ADDR]);
 	mapped_loc_sockaddr = (struct sockaddr_storage *)
 			nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]);
 	mapped_rem_sockaddr = (struct sockaddr_storage *)
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index 2e18b77a817f..42d53e182d5f 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -83,13 +83,20 @@ enum {
 	IWPM_NLA_MANAGE_MAPPING_UNSPEC = 0,
 	IWPM_NLA_MANAGE_MAPPING_SEQ,
 	IWPM_NLA_MANAGE_ADDR,
-	IWPM_NLA_MANAGE_MAPPED_LOC_ADDR,
+	IWPM_NLA_MANAGE_MAPPING_MAX
+};
+
+enum {
+	IWPM_NLA_RMANAGE_MAPPING_UNSPEC = 0,
+	IWPM_NLA_RMANAGE_MAPPING_SEQ,
+	IWPM_NLA_RMANAGE_ADDR,
+	IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR,
+	/* The following maintains bisectability of rdma-core */
+	IWPM_NLA_MANAGE_MAPPED_LOC_ADDR = IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR,
 	IWPM_NLA_RMANAGE_MAPPING_ERR,
 	IWPM_NLA_RMANAGE_MAPPING_MAX
 };
 
-#define IWPM_NLA_MANAGE_MAPPING_MAX 3
-#define IWPM_NLA_QUERY_MAPPING_MAX  4
 #define IWPM_NLA_MAPINFO_SEND_MAX   3
 
 enum {
@@ -97,6 +104,14 @@ enum {
 	IWPM_NLA_QUERY_MAPPING_SEQ,
 	IWPM_NLA_QUERY_LOCAL_ADDR,
 	IWPM_NLA_QUERY_REMOTE_ADDR,
+	IWPM_NLA_QUERY_MAPPING_MAX,
+};
+
+enum {
+	IWPM_NLA_RQUERY_MAPPING_UNSPEC = 0,
+	IWPM_NLA_RQUERY_MAPPING_SEQ,
+	IWPM_NLA_RQUERY_LOCAL_ADDR,
+	IWPM_NLA_RQUERY_REMOTE_ADDR,
 	IWPM_NLA_RQUERY_MAPPED_LOC_ADDR,
 	IWPM_NLA_RQUERY_MAPPED_REM_ADDR,
 	IWPM_NLA_RQUERY_MAPPING_ERR,
-- 
cgit v1.2.3-59-g8ed1b


From b0bad9ad514fc1dd8890f1749f5d2425a73270e3 Mon Sep 17 00:00:00 2001
From: Steve Wise <swise@opengridcomputing.com>
Date: Tue, 29 Jan 2019 13:33:16 -0800
Subject: RDMA/IWPM: Support no port mapping requirements

A soft iwarp driver that uses the host TCP stack via a kernel mode socket
does not need port mapping.  In fact, if the port map daemon, iwpmd, is
running, then iwpmd must not try and create/bind a socket to the actual
port for a soft iwarp connection, since the driver already has that socket
bound.

Yet if the soft iwarp driver wants to interoperate with hard iwarp devices
that -are- using port mapping, then the soft iwarp driver's mappings still
need to be maintained and advertised by the iwpm protocol.

This patch enhances the rdma driver<->iwcm interface to allow an iwarp
driver to specify that it does not want port mapping.  The iwpm
kernel<->iwpmd interface is also enhanced to pass up this information on
map requests.

Care is taken to interoperate with the current iwpmd version (ABI version
3) and only use the new NL attributes if iwpmd supports ABI version 4.

The ABI version define has also been created in rdma_netlink.h so both
kernel and user code can share it.  The iwcm and iwpmd negotiate the ABI
version to use with a new HELLO netlink message.

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Tatyana Nikolova <Tatyana.E.Nikolova@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/iwcm.c      |  7 +++-
 drivers/infiniband/core/iwpm_msg.c  | 80 +++++++++++++++++++++++++++++++++++--
 drivers/infiniband/core/iwpm_util.c | 48 +++++++++++++++++++++-
 drivers/infiniband/core/iwpm_util.h | 12 ++++++
 include/rdma/iw_cm.h                | 13 ++++++
 include/rdma/iw_portmap.h           | 15 ++++++-
 include/uapi/rdma/rdma_netlink.h    | 24 +++++++++++
 7 files changed, 192 insertions(+), 7 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c
index 476abc74178e..350ea2bab78a 100644
--- a/drivers/infiniband/core/iwcm.c
+++ b/drivers/infiniband/core/iwcm.c
@@ -87,7 +87,8 @@ static struct rdma_nl_cbs iwcm_nl_cb_table[RDMA_NL_IWPM_NUM_OPS] = {
 	[RDMA_NL_IWPM_REMOTE_INFO] = {.dump = iwpm_remote_info_cb},
 	[RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb},
 	[RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb},
-	[RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb}
+	[RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb},
+	[RDMA_NL_IWPM_HELLO] = {.dump = iwpm_hello_cb}
 };
 
 static struct workqueue_struct *iwcm_wq;
@@ -525,6 +526,8 @@ static int iw_cm_map(struct iw_cm_id *cm_id, bool active)
 	cm_id->mapped = true;
 	pm_msg.loc_addr = cm_id->local_addr;
 	pm_msg.rem_addr = cm_id->remote_addr;
+	pm_msg.flags = (cm_id->device->iwcm->driver_flags & IW_F_NO_PORT_MAP) ?
+		       IWPM_FLAGS_NO_PORT_MAP : 0;
 	if (active)
 		status = iwpm_add_and_query_mapping(&pm_msg,
 						    RDMA_NL_IWCM);
@@ -543,7 +546,7 @@ static int iw_cm_map(struct iw_cm_id *cm_id, bool active)
 
 	return iwpm_create_mapinfo(&cm_id->local_addr,
 				   &cm_id->m_local_addr,
-				   RDMA_NL_IWCM);
+				   RDMA_NL_IWCM, pm_msg.flags);
 }
 
 /*
diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c
index 3a8753595c8f..2e30e65b0816 100644
--- a/drivers/infiniband/core/iwpm_msg.c
+++ b/drivers/infiniband/core/iwpm_msg.c
@@ -34,7 +34,7 @@
 #include "iwpm_util.h"
 
 static const char iwpm_ulib_name[IWPM_ULIBNAME_SIZE] = "iWarpPortMapperUser";
-static int iwpm_ulib_version = 3;
+u16 iwpm_ulib_version = IWPM_UABI_VERSION_MIN;
 static int iwpm_user_pid = IWPM_PID_UNDEFINED;
 static atomic_t echo_nlmsg_seq;
 
@@ -130,6 +130,7 @@ pid_query_error:
  * nlmsg attributes:
  *	[IWPM_NLA_MANAGE_MAPPING_SEQ]
  *	[IWPM_NLA_MANAGE_ADDR]
+ *	[IWPM_NLA_MANAGE_FLAGS]
  */
 int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
 {
@@ -173,6 +174,18 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
 	if (ret)
 		goto add_mapping_error;
 
+	/* If flags are required and we're not V4, then return a quiet error */
+	if (pm_msg->flags && iwpm_ulib_version == IWPM_UABI_VERSION_MIN) {
+		ret = -EINVAL;
+		goto add_mapping_error_nowarn;
+	}
+	if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) {
+		ret = ibnl_put_attr(skb, nlh, sizeof(u32), &pm_msg->flags,
+				IWPM_NLA_MANAGE_FLAGS);
+		if (ret)
+			goto add_mapping_error;
+	}
+
 	nlmsg_end(skb, nlh);
 	nlmsg_request->req_buffer = pm_msg;
 
@@ -187,6 +200,7 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
 	return ret;
 add_mapping_error:
 	pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client);
+add_mapping_error_nowarn:
 	if (skb)
 		dev_kfree_skb(skb);
 	if (nlmsg_request)
@@ -201,6 +215,7 @@ add_mapping_error:
  *	[IWPM_NLA_QUERY_MAPPING_SEQ]
  *	[IWPM_NLA_QUERY_LOCAL_ADDR]
  *	[IWPM_NLA_QUERY_REMOTE_ADDR]
+ *	[IWPM_NLA_QUERY_FLAGS]
  */
 int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
 {
@@ -251,6 +266,18 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
 	if (ret)
 		goto query_mapping_error;
 
+	/* If flags are required and we're not V4, then return a quite error */
+	if (pm_msg->flags && iwpm_ulib_version == IWPM_UABI_VERSION_MIN) {
+		ret = -EINVAL;
+		goto query_mapping_error_nowarn;
+	}
+	if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) {
+		ret = ibnl_put_attr(skb, nlh, sizeof(u32), &pm_msg->flags,
+				IWPM_NLA_QUERY_FLAGS);
+		if (ret)
+			goto query_mapping_error;
+	}
+
 	nlmsg_end(skb, nlh);
 	nlmsg_request->req_buffer = pm_msg;
 
@@ -264,6 +291,7 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
 	return ret;
 query_mapping_error:
 	pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client);
+query_mapping_error_nowarn:
 	if (skb)
 		dev_kfree_skb(skb);
 	if (nlmsg_request)
@@ -379,7 +407,7 @@ int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb)
 	/* check device name, ulib name and version */
 	if (strcmp(pm_msg->dev_name, dev_name) ||
 			strcmp(iwpm_ulib_name, iwpm_name) ||
-			iwpm_version != iwpm_ulib_version) {
+			iwpm_version < IWPM_UABI_VERSION_MIN) {
 
 		pr_info("%s: Incorrect info (dev = %s name = %s version = %d)\n",
 				__func__, dev_name, iwpm_name, iwpm_version);
@@ -387,6 +415,10 @@ int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb)
 		goto register_pid_response_exit;
 	}
 	iwpm_user_pid = cb->nlh->nlmsg_pid;
+	iwpm_ulib_version = iwpm_version;
+	if (iwpm_ulib_version < IWPM_UABI_VERSION)
+		pr_warn_once("%s: Down level iwpmd/pid %u.  Continuing...",
+			__func__, iwpm_user_pid);
 	atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
 	pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n",
 			__func__, iwpm_user_pid);
@@ -661,7 +693,7 @@ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
 	iwpm_name = (char *)nla_data(nltb[IWPM_NLA_MAPINFO_ULIB_NAME]);
 	iwpm_version = nla_get_u16(nltb[IWPM_NLA_MAPINFO_ULIB_VER]);
 	if (strcmp(iwpm_ulib_name, iwpm_name) ||
-			iwpm_version != iwpm_ulib_version) {
+			iwpm_version < IWPM_UABI_VERSION_MIN) {
 		pr_info("%s: Invalid port mapper name = %s version = %d\n",
 				__func__, iwpm_name, iwpm_version);
 		return ret;
@@ -675,6 +707,11 @@ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
 	iwpm_set_registration(nl_client, IWPM_REG_INCOMPL);
 	atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
 	iwpm_user_pid = cb->nlh->nlmsg_pid;
+
+	if (iwpm_ulib_version < IWPM_UABI_VERSION)
+		pr_warn_once("%s: Down level iwpmd/pid %u.  Continuing...",
+			__func__, iwpm_user_pid);
+
 	if (!iwpm_mapinfo_available())
 		return 0;
 	pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n",
@@ -754,3 +791,40 @@ int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb)
 	up(&nlmsg_request->sem);
 	return 0;
 }
+
+/* netlink attribute policy for the received hello request */
+static const struct nla_policy hello_policy[IWPM_NLA_HELLO_MAX] = {
+	[IWPM_NLA_HELLO_ABI_VERSION]     = { .type = NLA_U16 }
+};
+
+/*
+ * iwpm_hello_cb - Process a port mapper hello request
+ */
+int iwpm_hello_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct nlattr *nltb[IWPM_NLA_HELLO_MAX];
+	const char *msg_type = "Hello request";
+	u8 nl_client;
+	u16 abi_version;
+	int ret = -EINVAL;
+
+	if (iwpm_parse_nlmsg(cb, IWPM_NLA_HELLO_MAX, hello_policy, nltb,
+			     msg_type)) {
+		pr_info("%s: Unable to parse nlmsg\n", __func__);
+		return ret;
+	}
+	abi_version = nla_get_u16(nltb[IWPM_NLA_HELLO_ABI_VERSION]);
+	nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type);
+	if (!iwpm_valid_client(nl_client)) {
+		pr_info("%s: Invalid port mapper client = %d\n",
+				__func__, nl_client);
+		return ret;
+	}
+	iwpm_set_registration(nl_client, IWPM_REG_INCOMPL);
+	atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+	iwpm_ulib_version = min_t(u16, IWPM_UABI_VERSION, abi_version);
+	pr_debug("Using ABI version %u\n", iwpm_ulib_version);
+	iwpm_user_pid = cb->nlh->nlmsg_pid;
+	ret = iwpm_send_hello(nl_client, iwpm_user_pid, iwpm_ulib_version);
+	return ret;
+}
diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c
index cdb63f3f4de7..363938435476 100644
--- a/drivers/infiniband/core/iwpm_util.c
+++ b/drivers/infiniband/core/iwpm_util.c
@@ -114,7 +114,7 @@ static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage *,
 
 int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr,
 			struct sockaddr_storage *mapped_sockaddr,
-			u8 nl_client)
+			u8 nl_client, u32 map_flags)
 {
 	struct hlist_head *hash_bucket_head = NULL;
 	struct iwpm_mapping_info *map_info;
@@ -132,6 +132,7 @@ int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr,
 	memcpy(&map_info->mapped_sockaddr, mapped_sockaddr,
 	       sizeof(struct sockaddr_storage));
 	map_info->nl_client = nl_client;
+	map_info->map_flags = map_flags;
 
 	spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
 	if (iwpm_hash_bucket) {
@@ -686,6 +687,14 @@ int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid)
 			if (ret)
 				goto send_mapping_info_unlock;
 
+			if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) {
+				ret = ibnl_put_attr(skb, nlh, sizeof(u32),
+						&map_info->map_flags,
+						IWPM_NLA_MAPINFO_FLAGS);
+				if (ret)
+					goto send_mapping_info_unlock;
+			}
+
 			nlmsg_end(skb, nlh);
 
 			iwpm_print_sockaddr(&map_info->local_sockaddr,
@@ -754,3 +763,40 @@ int iwpm_mapinfo_available(void)
 	spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags);
 	return full_bucket;
 }
+
+int iwpm_send_hello(u8 nl_client, int iwpm_pid, u16 abi_version)
+{
+	struct sk_buff *skb = NULL;
+	struct nlmsghdr *nlh;
+	u32 msg_seq;
+	const char *err_str = "";
+	int ret = -EINVAL;
+
+	skb = iwpm_create_nlmsg(RDMA_NL_IWPM_HELLO, &nlh, nl_client);
+	if (!skb) {
+		err_str = "Unable to create a nlmsg";
+		goto hello_num_error;
+	}
+	nlh->nlmsg_seq = iwpm_get_nlmsg_seq();
+	msg_seq = 0;
+	err_str = "Unable to put attribute of abi_version into nlmsg";
+	ret = ibnl_put_attr(skb, nlh, sizeof(u16), &abi_version,
+			    IWPM_NLA_HELLO_ABI_VERSION);
+	if (ret)
+		goto hello_num_error;
+	nlmsg_end(skb, nlh);
+
+	ret = rdma_nl_unicast(skb, iwpm_pid);
+	if (ret) {
+		skb = NULL;
+		err_str = "Unable to send a nlmsg";
+		goto hello_num_error;
+	}
+	pr_debug("%s: Sent hello abi_version = %u\n", __func__, abi_version);
+	return 0;
+hello_num_error:
+	pr_info("%s: %s\n", __func__, err_str);
+	if (skb)
+		dev_kfree_skb(skb);
+	return ret;
+}
diff --git a/drivers/infiniband/core/iwpm_util.h b/drivers/infiniband/core/iwpm_util.h
index af1fc14a0d3d..7e2bcc72f66c 100644
--- a/drivers/infiniband/core/iwpm_util.h
+++ b/drivers/infiniband/core/iwpm_util.h
@@ -78,6 +78,7 @@ struct iwpm_mapping_info {
 	struct sockaddr_storage local_sockaddr;
 	struct sockaddr_storage mapped_sockaddr;
 	u8     nl_client;
+	u32    map_flags;
 };
 
 struct iwpm_remote_info {
@@ -266,4 +267,15 @@ int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max,
  * @msg: Message to print
  */
 void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg);
+
+/**
+ * iwpm_send_hello - Send hello response to iwpmd
+ *
+ * @nl_client: The index of the netlink client
+ * @abi_version: The kernel's abi_version
+ *
+ * Returns 0 on success or a negative error code
+ */
+int iwpm_send_hello(u8 nl_client, int iwpm_pid, u16 abi_version);
+extern u16 iwpm_ulib_version;
 #endif
diff --git a/include/rdma/iw_cm.h b/include/rdma/iw_cm.h
index 5cd7701db148..48512abd3162 100644
--- a/include/rdma/iw_cm.h
+++ b/include/rdma/iw_cm.h
@@ -105,6 +105,18 @@ struct iw_cm_conn_param {
 	u32 qpn;
 };
 
+enum iw_flags {
+
+	/*
+	 * This flag allows the iwcm and iwpmd to still advertise
+	 * mappings but the real and mapped port numbers are the
+	 * same.  Further, iwpmd will not bind any user socket to
+	 * reserve the port.  This is required for soft iwarp
+	 * to play in the port mapped iwarp space.
+	 */
+	IW_F_NO_PORT_MAP = (1 << 0),
+};
+
 struct iw_cm_verbs {
 	void		(*add_ref)(struct ib_qp *qp);
 
@@ -127,6 +139,7 @@ struct iw_cm_verbs {
 
 	int		(*destroy_listen)(struct iw_cm_id *cm_id);
 	char		ifname[IFNAMSIZ];
+	enum iw_flags	driver_flags;
 };
 
 /**
diff --git a/include/rdma/iw_portmap.h b/include/rdma/iw_portmap.h
index fda31673a562..84fac196ef80 100644
--- a/include/rdma/iw_portmap.h
+++ b/include/rdma/iw_portmap.h
@@ -58,6 +58,7 @@ struct iwpm_sa_data {
 	struct sockaddr_storage mapped_loc_addr;
 	struct sockaddr_storage rem_addr;
 	struct sockaddr_storage mapped_rem_addr;
+	u32 flags;
 };
 
 /**
@@ -205,9 +206,11 @@ int iwpm_get_remote_info(struct sockaddr_storage *mapped_loc_addr,
  * @local_addr: Local ip/tcp address
  * @mapped_addr: Mapped local ip/tcp address
  * @nl_client: The index of the netlink client
+ * @map_flags: IWPM mapping flags
  */
 int iwpm_create_mapinfo(struct sockaddr_storage *local_addr,
-			struct sockaddr_storage *mapped_addr, u8 nl_client);
+			struct sockaddr_storage *mapped_addr, u8 nl_client,
+			u32 map_flags);
 
 /**
  * iwpm_remove_mapinfo - Remove local and mapped IPv4/IPv6 address
@@ -221,4 +224,14 @@ int iwpm_create_mapinfo(struct sockaddr_storage *local_addr,
 int iwpm_remove_mapinfo(struct sockaddr_storage *local_addr,
 			struct sockaddr_storage *mapped_addr);
 
+/**
+ * iwpm_hello_cb - Process a hello message from iwpmd
+ *
+ * @skb:
+ * @cb: Contains the received message (payload and netlink header)
+ *
+ * Using the received port mapper pid, send the kernel's abi_version
+ * after adjusting it to support the iwpmd version.
+ */
+int iwpm_hello_cb(struct sk_buff *skb, struct netlink_callback *cb);
 #endif /* _IW_PORTMAP_H */
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index 42d53e182d5f..0f5263767fb4 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -35,6 +35,19 @@ enum {
 	RDMA_NL_RDMA_CM_NUM_ATTR,
 };
 
+/* The minimum version that the iwpm kernel supports */
+#define IWPM_UABI_VERSION_MIN	3
+
+/* The latest version that the iwpm kernel supports */
+#define IWPM_UABI_VERSION	4
+
+/* iwarp port mapper message flags */
+enum {
+
+	/* Do not map the port for this IWPM request */
+	IWPM_FLAGS_NO_PORT_MAP = (1 << 0),
+};
+
 /* iwarp port mapper op-codes */
 enum {
 	RDMA_NL_IWPM_REG_PID = 0,
@@ -45,6 +58,7 @@ enum {
 	RDMA_NL_IWPM_HANDLE_ERR,
 	RDMA_NL_IWPM_MAPINFO,
 	RDMA_NL_IWPM_MAPINFO_NUM,
+	RDMA_NL_IWPM_HELLO,
 	RDMA_NL_IWPM_NUM_OPS
 };
 
@@ -83,6 +97,7 @@ enum {
 	IWPM_NLA_MANAGE_MAPPING_UNSPEC = 0,
 	IWPM_NLA_MANAGE_MAPPING_SEQ,
 	IWPM_NLA_MANAGE_ADDR,
+	IWPM_NLA_MANAGE_FLAGS,
 	IWPM_NLA_MANAGE_MAPPING_MAX
 };
 
@@ -98,12 +113,14 @@ enum {
 };
 
 #define IWPM_NLA_MAPINFO_SEND_MAX   3
+#define IWPM_NLA_REMOVE_MAPPING_MAX 3
 
 enum {
 	IWPM_NLA_QUERY_MAPPING_UNSPEC = 0,
 	IWPM_NLA_QUERY_MAPPING_SEQ,
 	IWPM_NLA_QUERY_LOCAL_ADDR,
 	IWPM_NLA_QUERY_REMOTE_ADDR,
+	IWPM_NLA_QUERY_FLAGS,
 	IWPM_NLA_QUERY_MAPPING_MAX,
 };
 
@@ -129,6 +146,7 @@ enum {
 	IWPM_NLA_MAPINFO_UNSPEC = 0,
 	IWPM_NLA_MAPINFO_LOCAL_ADDR,
 	IWPM_NLA_MAPINFO_MAPPED_ADDR,
+	IWPM_NLA_MAPINFO_FLAGS,
 	IWPM_NLA_MAPINFO_MAX
 };
 
@@ -147,6 +165,12 @@ enum {
 	IWPM_NLA_ERR_MAX
 };
 
+enum {
+	IWPM_NLA_HELLO_UNSPEC = 0,
+	IWPM_NLA_HELLO_ABI_VERSION,
+	IWPM_NLA_HELLO_MAX
+};
+
 /*
  * Local service operations:
  *   RESOLVE - The client requests the local service to resolve a path.
-- 
cgit v1.2.3-59-g8ed1b


From e46c2e99f60043bfdb63f18fe775db1f688906d9 Mon Sep 17 00:00:00 2001
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Date: Tue, 5 Feb 2019 09:50:31 +0000
Subject: drm/i915: Expose RPCS (SSEU) configuration to userspace (Gen11 only)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We want to allow userspace to reconfigure the subslice configuration on a
per context basis.

This is required for the functional requirement of shutting down non-VME
enabled sub-slices on Gen11 parts.

To do so, we expose a context parameter to allow adjustment of the RPCS
register stored within the context image (and currently not accessible via
LRI).

If the context is adjusted before first use or whilst idle, the adjustment
is for "free"; otherwise if the context is active we queue a request to do
so (using the kernel context), following all other activity by that
context, which is also marked as barrier for all following submission
against the same context.

Since the overhead of device re-configuration during context switching can
be significant, especially in multi-context workloads, we limit this new
uAPI to only support the Gen11 VME use case. In this use case either the
device is fully enabled, and exactly one slice and half of the subslices
are enabled.

Example usage:

	struct drm_i915_gem_context_param_sseu sseu = { };
	struct drm_i915_gem_context_param arg = {
		.param = I915_CONTEXT_PARAM_SSEU,
		.ctx_id = gem_context_create(fd),
		.size = sizeof(sseu),
		.value = to_user_pointer(&sseu)
	};

	/* Query device defaults. */
	gem_context_get_param(fd, &arg);

	/* Set VME configuration on a 1x6x8 part. */
	sseu.slice_mask = 0x1;
	sseu.subslice_mask = 0xe0;
	gem_context_set_param(fd, &arg);

v2: Fix offset of CTX_R_PWR_CLK_STATE in intel_lr_context_set_sseu()
    (Lionel)

v3: Add ability to program this per engine (Chris)

v4: Move most get_sseu() into i915_gem_context.c (Lionel)

v5: Validate sseu configuration against the device's capabilities (Lionel)

v6: Change context powergating settings through MI_SDM on kernel context
    (Chris)

v7: Synchronize the requests following a powergating setting change using
    a global dependency (Chris)
    Iterate timelines through dev_priv.gt.active_rings (Tvrtko)
    Disable RPCS configuration setting for non capable users
    (Lionel/Tvrtko)

v8: s/union intel_sseu/struct intel_sseu/ (Lionel)
    s/dev_priv/i915/ (Tvrtko)
    Change uapi class/instance fields to u16 (Tvrtko)
    Bump mask fields to 64bits (Lionel)
    Don't return EPERM when dynamic sseu is disabled (Tvrtko)

v9: Import context image into kernel context's ppgtt only when
    reconfiguring powergated slice/subslices (Chris)
    Use aliasing ppgtt when needed (Michel)

Tvrtko Ursulin:

v10:
 * Update for upstream changes.
 * Request submit needs a RPM reference.
 * Reject on !FULL_PPGTT for simplicity.
 * Pull out get/set param to helpers for readability and less indent.
 * Use i915_request_await_dma_fence in add_global_barrier to skip waits
   on the same timeline and avoid GEM_BUG_ON.
 * No need to explicitly assign a NULL pointer to engine in legacy mode.
 * No need to move gen8_make_rpcs up.
 * Factored out global barrier as prep patch.
 * Allow to only CAP_SYS_ADMIN if !Gen11.

v11:
 * Remove engine vfunc in favour of local helper. (Chris Wilson)
 * Stop retiring requests before updates since it is not needed
   (Chris Wilson)
 * Implement direct CPU update path for idle contexts. (Chris Wilson)
 * Left side dependency needs only be on the same context timeline.
   (Chris Wilson)
 * It is sufficient to order the timeline. (Chris Wilson)
 * Reject !RCS configuration attempts with -ENODEV for now.

v12:
 * Rebase for make_rpcs.

v13:
 * Centralize SSEU normalization to make_rpcs.
 * Type width checking (uAPI <-> implementation).
 * Gen11 restrictions uAPI checks.
 * Gen11 subslice count differences handling.
 Chris Wilson:
 * args->size handling fixes.
 * Update context image from GGTT.
 * Postpone context image update to pinning.
 * Use i915_gem_active_raw instead of last_request_on_engine.

v14:
 * Add activity tracker on intel_context to fix the lifetime issues
   and simplify the code. (Chris Wilson)

v15:
 * Fix context pin leak if no space in ring by simplifying the
   context pinning sequence.

v16:
 * Rebase for context get/set param locking changes.
 * Just -ENODEV on !Gen11. (Joonas)

v17:
 * Fix one Gen11 subslice enablement rule.
 * Handle error from i915_sw_fence_await_sw_fence_gfp. (Chris Wilson)

v18:
 * Update commit message. (Joonas)
 * Restrict uAPI to VME use case. (Joonas)

v19:
 * Rebase.

v20:
 * Rebase for ce->active_tracker.

v21:
 * Rebase for IS_GEN changes.

v22:
 * Reserve uAPI for flags straight away. (Chris Wilson)

v23:
 * Rebase for RUNTIME_INFO.

v24:
 * Added some headline docs for the uapi usage. (Joonas/Chris)

v25:
 * Renamed class/instance to engine_class/engine_instance to avoid clash
   with C++ keyword. (Tony Ye)

v26:
 * Rebased for runtime pm api changes.

v27:
 * Rebased for intel_context_init.
 * Wrap commit msg to 75.

v28:
 (Chris Wilson)
 * Use i915_gem_ggtt.
 * Use i915_request_await_dma_fence to show a better example.

v29:
 * i915_timeline_set_barrier can now fail. (Chris Wilson)

v30:
 * Capture some acks.

v31:
 * Drop the WARN_ON from use controllable paths. (Chris Wilson)
 * Use overflows_type for all checks.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=100899
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=107634
Issue: https://github.com/intel/media-driver/issues/267
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Zhipeng Gong <zhipeng.gong@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Tony Ye <tony.ye@intel.com>
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Acked-by: Timo Aaltonen <timo.aaltonen@canonical.com>
Acked-by: Takashi Iwai <tiwai@suse.de>
Acked-by: Stéphane Marchesin <marcheu@chromium.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20190205095032.22673-4-tvrtko.ursulin@linux.intel.com
---
 drivers/gpu/drm/i915/i915_gem_context.c | 340 +++++++++++++++++++++++++++++++-
 drivers/gpu/drm/i915/i915_gem_context.h |   6 +
 drivers/gpu/drm/i915/intel_lrc.c        |   4 +-
 include/uapi/drm/i915_drm.h             |  64 ++++++
 4 files changed, 411 insertions(+), 3 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index d3887c27c3ba..2d3e1ce9cc76 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -89,6 +89,7 @@
 #include <drm/i915_drm.h>
 #include "i915_drv.h"
 #include "i915_trace.h"
+#include "intel_lrc_reg.h"
 #include "intel_workarounds.h"
 
 #define ALL_L3_SLICES(dev) (1 << NUM_L3_SLICES(dev)) - 1
@@ -321,6 +322,15 @@ static u32 default_desc_template(const struct drm_i915_private *i915,
 	return desc;
 }
 
+static void intel_context_retire(struct i915_gem_active *active,
+				 struct i915_request *rq)
+{
+	struct intel_context *ce =
+		container_of(active, typeof(*ce), active_tracker);
+
+	intel_context_unpin(ce);
+}
+
 void
 intel_context_init(struct intel_context *ce,
 		   struct i915_gem_context *ctx,
@@ -333,6 +343,8 @@ intel_context_init(struct intel_context *ce,
 
 	/* Use the whole device by default */
 	ce->sseu = intel_device_default_sseu(ctx->i915);
+
+	init_request_active(&ce->active_tracker, intel_context_retire);
 }
 
 static struct i915_gem_context *
@@ -850,6 +862,56 @@ out:
 	return 0;
 }
 
+static int get_sseu(struct i915_gem_context *ctx,
+		    struct drm_i915_gem_context_param *args)
+{
+	struct drm_i915_gem_context_param_sseu user_sseu;
+	struct intel_engine_cs *engine;
+	struct intel_context *ce;
+	int ret;
+
+	if (args->size == 0)
+		goto out;
+	else if (args->size < sizeof(user_sseu))
+		return -EINVAL;
+
+	if (copy_from_user(&user_sseu, u64_to_user_ptr(args->value),
+			   sizeof(user_sseu)))
+		return -EFAULT;
+
+	if (user_sseu.flags || user_sseu.rsvd)
+		return -EINVAL;
+
+	engine = intel_engine_lookup_user(ctx->i915,
+					  user_sseu.engine_class,
+					  user_sseu.engine_instance);
+	if (!engine)
+		return -EINVAL;
+
+	/* Only use for mutex here is to serialize get_param and set_param. */
+	ret = mutex_lock_interruptible(&ctx->i915->drm.struct_mutex);
+	if (ret)
+		return ret;
+
+	ce = to_intel_context(ctx, engine);
+
+	user_sseu.slice_mask = ce->sseu.slice_mask;
+	user_sseu.subslice_mask = ce->sseu.subslice_mask;
+	user_sseu.min_eus_per_subslice = ce->sseu.min_eus_per_subslice;
+	user_sseu.max_eus_per_subslice = ce->sseu.max_eus_per_subslice;
+
+	mutex_unlock(&ctx->i915->drm.struct_mutex);
+
+	if (copy_to_user(u64_to_user_ptr(args->value), &user_sseu,
+			 sizeof(user_sseu)))
+		return -EFAULT;
+
+out:
+	args->size = sizeof(user_sseu);
+
+	return 0;
+}
+
 int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 				    struct drm_file *file)
 {
@@ -862,15 +924,17 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 	if (!ctx)
 		return -ENOENT;
 
-	args->size = 0;
 	switch (args->param) {
 	case I915_CONTEXT_PARAM_BAN_PERIOD:
 		ret = -EINVAL;
 		break;
 	case I915_CONTEXT_PARAM_NO_ZEROMAP:
+		args->size = 0;
 		args->value = test_bit(UCONTEXT_NO_ZEROMAP, &ctx->user_flags);
 		break;
 	case I915_CONTEXT_PARAM_GTT_SIZE:
+		args->size = 0;
+
 		if (ctx->ppgtt)
 			args->value = ctx->ppgtt->vm.total;
 		else if (to_i915(dev)->mm.aliasing_ppgtt)
@@ -879,14 +943,20 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 			args->value = to_i915(dev)->ggtt.vm.total;
 		break;
 	case I915_CONTEXT_PARAM_NO_ERROR_CAPTURE:
+		args->size = 0;
 		args->value = i915_gem_context_no_error_capture(ctx);
 		break;
 	case I915_CONTEXT_PARAM_BANNABLE:
+		args->size = 0;
 		args->value = i915_gem_context_is_bannable(ctx);
 		break;
 	case I915_CONTEXT_PARAM_PRIORITY:
+		args->size = 0;
 		args->value = ctx->sched.priority >> I915_USER_PRIORITY_SHIFT;
 		break;
+	case I915_CONTEXT_PARAM_SSEU:
+		ret = get_sseu(ctx, args);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -896,6 +966,270 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 	return ret;
 }
 
+static int gen8_emit_rpcs_config(struct i915_request *rq,
+				 struct intel_context *ce,
+				 struct intel_sseu sseu)
+{
+	u64 offset;
+	u32 *cs;
+
+	cs = intel_ring_begin(rq, 4);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
+
+	offset = i915_ggtt_offset(ce->state) +
+		 LRC_STATE_PN * PAGE_SIZE +
+		 (CTX_R_PWR_CLK_STATE + 1) * 4;
+
+	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+	*cs++ = lower_32_bits(offset);
+	*cs++ = upper_32_bits(offset);
+	*cs++ = gen8_make_rpcs(rq->i915, &sseu);
+
+	intel_ring_advance(rq, cs);
+
+	return 0;
+}
+
+static int
+gen8_modify_rpcs_gpu(struct intel_context *ce,
+		     struct intel_engine_cs *engine,
+		     struct intel_sseu sseu)
+{
+	struct drm_i915_private *i915 = engine->i915;
+	struct i915_request *rq, *prev;
+	intel_wakeref_t wakeref;
+	int ret;
+
+	GEM_BUG_ON(!ce->pin_count);
+
+	lockdep_assert_held(&i915->drm.struct_mutex);
+
+	/* Submitting requests etc needs the hw awake. */
+	wakeref = intel_runtime_pm_get(i915);
+
+	rq = i915_request_alloc(engine, i915->kernel_context);
+	if (IS_ERR(rq)) {
+		ret = PTR_ERR(rq);
+		goto out_put;
+	}
+
+	/* Queue this switch after all other activity by this context. */
+	prev = i915_gem_active_raw(&ce->ring->timeline->last_request,
+				   &i915->drm.struct_mutex);
+	if (prev && !i915_request_completed(prev)) {
+		ret = i915_request_await_dma_fence(rq, &prev->fence);
+		if (ret < 0)
+			goto out_add;
+	}
+
+	/* Order all following requests to be after. */
+	ret = i915_timeline_set_barrier(ce->ring->timeline, rq);
+	if (ret)
+		goto out_add;
+
+	ret = gen8_emit_rpcs_config(rq, ce, sseu);
+	if (ret)
+		goto out_add;
+
+	/*
+	 * Guarantee context image and the timeline remains pinned until the
+	 * modifying request is retired by setting the ce activity tracker.
+	 *
+	 * But we only need to take one pin on the account of it. Or in other
+	 * words transfer the pinned ce object to tracked active request.
+	 */
+	if (!i915_gem_active_isset(&ce->active_tracker))
+		__intel_context_pin(ce);
+	i915_gem_active_set(&ce->active_tracker, rq);
+
+out_add:
+	i915_request_add(rq);
+out_put:
+	intel_runtime_pm_put(i915, wakeref);
+
+	return ret;
+}
+
+static int
+i915_gem_context_reconfigure_sseu(struct i915_gem_context *ctx,
+				  struct intel_engine_cs *engine,
+				  struct intel_sseu sseu)
+{
+	struct intel_context *ce = to_intel_context(ctx, engine);
+	int ret;
+
+	GEM_BUG_ON(INTEL_GEN(ctx->i915) < 8);
+	GEM_BUG_ON(engine->id != RCS);
+
+	ret = mutex_lock_interruptible(&ctx->i915->drm.struct_mutex);
+	if (ret)
+		return ret;
+
+	/* Nothing to do if unmodified. */
+	if (!memcmp(&ce->sseu, &sseu, sizeof(sseu)))
+		goto out;
+
+	/*
+	 * If context is not idle we have to submit an ordered request to modify
+	 * its context image via the kernel context. Pristine and idle contexts
+	 * will be configured on pinning.
+	 */
+	if (ce->pin_count)
+		ret = gen8_modify_rpcs_gpu(ce, engine, sseu);
+
+	if (!ret)
+		ce->sseu = sseu;
+
+out:
+	mutex_unlock(&ctx->i915->drm.struct_mutex);
+
+	return ret;
+}
+
+static int
+user_to_context_sseu(struct drm_i915_private *i915,
+		     const struct drm_i915_gem_context_param_sseu *user,
+		     struct intel_sseu *context)
+{
+	const struct sseu_dev_info *device = &RUNTIME_INFO(i915)->sseu;
+
+	/* No zeros in any field. */
+	if (!user->slice_mask || !user->subslice_mask ||
+	    !user->min_eus_per_subslice || !user->max_eus_per_subslice)
+		return -EINVAL;
+
+	/* Max > min. */
+	if (user->max_eus_per_subslice < user->min_eus_per_subslice)
+		return -EINVAL;
+
+	/*
+	 * Some future proofing on the types since the uAPI is wider than the
+	 * current internal implementation.
+	 */
+	if (overflows_type(user->slice_mask, context->slice_mask) ||
+	    overflows_type(user->subslice_mask, context->subslice_mask) ||
+	    overflows_type(user->min_eus_per_subslice,
+			   context->min_eus_per_subslice) ||
+	    overflows_type(user->max_eus_per_subslice,
+			   context->max_eus_per_subslice))
+		return -EINVAL;
+
+	/* Check validity against hardware. */
+	if (user->slice_mask & ~device->slice_mask)
+		return -EINVAL;
+
+	if (user->subslice_mask & ~device->subslice_mask[0])
+		return -EINVAL;
+
+	if (user->max_eus_per_subslice > device->max_eus_per_subslice)
+		return -EINVAL;
+
+	context->slice_mask = user->slice_mask;
+	context->subslice_mask = user->subslice_mask;
+	context->min_eus_per_subslice = user->min_eus_per_subslice;
+	context->max_eus_per_subslice = user->max_eus_per_subslice;
+
+	/* Part specific restrictions. */
+	if (IS_GEN(i915, 11)) {
+		unsigned int hw_s = hweight8(device->slice_mask);
+		unsigned int hw_ss_per_s = hweight8(device->subslice_mask[0]);
+		unsigned int req_s = hweight8(context->slice_mask);
+		unsigned int req_ss = hweight8(context->subslice_mask);
+
+		/*
+		 * Only full subslice enablement is possible if more than one
+		 * slice is turned on.
+		 */
+		if (req_s > 1 && req_ss != hw_ss_per_s)
+			return -EINVAL;
+
+		/*
+		 * If more than four (SScount bitfield limit) subslices are
+		 * requested then the number has to be even.
+		 */
+		if (req_ss > 4 && (req_ss & 1))
+			return -EINVAL;
+
+		/*
+		 * If only one slice is enabled and subslice count is below the
+		 * device full enablement, it must be at most half of the all
+		 * available subslices.
+		 */
+		if (req_s == 1 && req_ss < hw_ss_per_s &&
+		    req_ss > (hw_ss_per_s / 2))
+			return -EINVAL;
+
+		/* ABI restriction - VME use case only. */
+
+		/* All slices or one slice only. */
+		if (req_s != 1 && req_s != hw_s)
+			return -EINVAL;
+
+		/*
+		 * Half subslices or full enablement only when one slice is
+		 * enabled.
+		 */
+		if (req_s == 1 &&
+		    (req_ss != hw_ss_per_s && req_ss != (hw_ss_per_s / 2)))
+			return -EINVAL;
+
+		/* No EU configuration changes. */
+		if ((user->min_eus_per_subslice !=
+		     device->max_eus_per_subslice) ||
+		    (user->max_eus_per_subslice !=
+		     device->max_eus_per_subslice))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int set_sseu(struct i915_gem_context *ctx,
+		    struct drm_i915_gem_context_param *args)
+{
+	struct drm_i915_private *i915 = ctx->i915;
+	struct drm_i915_gem_context_param_sseu user_sseu;
+	struct intel_engine_cs *engine;
+	struct intel_sseu sseu;
+	int ret;
+
+	if (args->size < sizeof(user_sseu))
+		return -EINVAL;
+
+	if (!IS_GEN(i915, 11))
+		return -ENODEV;
+
+	if (copy_from_user(&user_sseu, u64_to_user_ptr(args->value),
+			   sizeof(user_sseu)))
+		return -EFAULT;
+
+	if (user_sseu.flags || user_sseu.rsvd)
+		return -EINVAL;
+
+	engine = intel_engine_lookup_user(i915,
+					  user_sseu.engine_class,
+					  user_sseu.engine_instance);
+	if (!engine)
+		return -EINVAL;
+
+	/* Only render engine supports RPCS configuration. */
+	if (engine->class != RENDER_CLASS)
+		return -ENODEV;
+
+	ret = user_to_context_sseu(i915, &user_sseu, &sseu);
+	if (ret)
+		return ret;
+
+	ret = i915_gem_context_reconfigure_sseu(ctx, engine, sseu);
+	if (ret)
+		return ret;
+
+	args->size = sizeof(user_sseu);
+
+	return 0;
+}
+
 int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 				    struct drm_file *file)
 {
@@ -958,7 +1292,9 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 					I915_USER_PRIORITY(priority);
 		}
 		break;
-
+	case I915_CONTEXT_PARAM_SSEU:
+		ret = set_sseu(ctx, args);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
index 919f6f0a0f7a..92ad5272e57f 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/i915_gem_context.h
@@ -183,6 +183,12 @@ struct i915_gem_context {
 		u64 lrc_desc;
 		int pin_count;
 
+		/**
+		 * active_tracker: Active tracker for the external rq activity
+		 * on this intel_context object.
+		 */
+		struct i915_gem_active active_tracker;
+
 		const struct intel_context_ops *ops;
 
 		/** sseu: Control eu/slice partitioning */
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index d99c462e2c09..5e98fd79bd9d 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -2498,7 +2498,9 @@ u32 gen8_make_rpcs(struct drm_i915_private *i915, struct intel_sseu *req_sseu)
 	 * subslices are enabled, or a count between one and four on the first
 	 * slice.
 	 */
-	if (IS_GEN(i915, 11) && slices == 1 && subslices >= 4) {
+	if (IS_GEN(i915, 11) &&
+	    slices == 1 &&
+	    subslices > min_t(u8, 4, hweight8(sseu->subslice_mask[0]) / 2)) {
 		GEM_BUG_ON(subslices & 1);
 
 		subslice_pg = false;
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 298b2e197744..397810fa2d33 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1486,9 +1486,73 @@ struct drm_i915_gem_context_param {
 #define   I915_CONTEXT_MAX_USER_PRIORITY	1023 /* inclusive */
 #define   I915_CONTEXT_DEFAULT_PRIORITY		0
 #define   I915_CONTEXT_MIN_USER_PRIORITY	-1023 /* inclusive */
+	/*
+	 * When using the following param, value should be a pointer to
+	 * drm_i915_gem_context_param_sseu.
+	 */
+#define I915_CONTEXT_PARAM_SSEU		0x7
 	__u64 value;
 };
 
+/**
+ * Context SSEU programming
+ *
+ * It may be necessary for either functional or performance reason to configure
+ * a context to run with a reduced number of SSEU (where SSEU stands for Slice/
+ * Sub-slice/EU).
+ *
+ * This is done by configuring SSEU configuration using the below
+ * @struct drm_i915_gem_context_param_sseu for every supported engine which
+ * userspace intends to use.
+ *
+ * Not all GPUs or engines support this functionality in which case an error
+ * code -ENODEV will be returned.
+ *
+ * Also, flexibility of possible SSEU configuration permutations varies between
+ * GPU generations and software imposed limitations. Requesting such a
+ * combination will return an error code of -EINVAL.
+ *
+ * NOTE: When perf/OA is active the context's SSEU configuration is ignored in
+ * favour of a single global setting.
+ */
+struct drm_i915_gem_context_param_sseu {
+	/*
+	 * Engine class & instance to be configured or queried.
+	 */
+	__u16 engine_class;
+	__u16 engine_instance;
+
+	/*
+	 * Unused for now. Must be cleared to zero.
+	 */
+	__u32 flags;
+
+	/*
+	 * Mask of slices to enable for the context. Valid values are a subset
+	 * of the bitmask value returned for I915_PARAM_SLICE_MASK.
+	 */
+	__u64 slice_mask;
+
+	/*
+	 * Mask of subslices to enable for the context. Valid values are a
+	 * subset of the bitmask value return by I915_PARAM_SUBSLICE_MASK.
+	 */
+	__u64 subslice_mask;
+
+	/*
+	 * Minimum/Maximum number of EUs to enable per subslice for the
+	 * context. min_eus_per_subslice must be inferior or equal to
+	 * max_eus_per_subslice.
+	 */
+	__u16 min_eus_per_subslice;
+	__u16 max_eus_per_subslice;
+
+	/*
+	 * Unused for now. Must be cleared to zero.
+	 */
+	__u32 rsvd;
+};
+
 enum drm_i915_oa_format {
 	I915_OA_FORMAT_A13 = 1,	    /* HSW only */
 	I915_OA_FORMAT_A29,	    /* HSW only */
-- 
cgit v1.2.3-59-g8ed1b


From 9c0644ee4aa8792f1e60a2b014b4710faaddafeb Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Fri, 1 Feb 2019 17:13:57 -0500
Subject: virtio: drop internal struct from UAPI

There's no reason to expose struct vring_packed in UAPI - if we do we
won't be able to change or drop it, and it's not part of any interface.

Let's move it to virtio_ring.c

Cc: Tiwei Bie <tiwei.bie@intel.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/virtio_ring.c     |  7 ++++++-
 include/uapi/linux/virtio_ring.h | 10 ----------
 2 files changed, 6 insertions(+), 11 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 27d3f057493e..a0b07c331255 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -152,7 +152,12 @@ struct vring_virtqueue {
 		/* Available for packed ring */
 		struct {
 			/* Actual memory layout for this queue. */
-			struct vring_packed vring;
+			struct {
+				unsigned int num;
+				struct vring_packed_desc *desc;
+				struct vring_packed_desc_event *driver;
+				struct vring_packed_desc_event *device;
+			} vring;
 
 			/* Driver ring wrap counter. */
 			bool avail_wrap_counter;
diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h
index 2414f8af26b3..4c4e24c291a5 100644
--- a/include/uapi/linux/virtio_ring.h
+++ b/include/uapi/linux/virtio_ring.h
@@ -213,14 +213,4 @@ struct vring_packed_desc {
 	__le16 flags;
 };
 
-struct vring_packed {
-	unsigned int num;
-
-	struct vring_packed_desc *desc;
-
-	struct vring_packed_desc_event *driver;
-
-	struct vring_packed_desc_event *device;
-};
-
 #endif /* _UAPI_LINUX_VIRTIO_RING_H */
-- 
cgit v1.2.3-59-g8ed1b


From a78e8723a50530d15faa25cc0b6f009bcd251c20 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Wed, 16 Jan 2019 09:55:41 +0200
Subject: RDMA/cma: Remove CM_ID statistics provided by rdma-cm module

Netlink statistics exported by rdma-cm never had any working user space
component published to the mailing list or to any open source
project. Canvassing various proprietary users, and the original requester,
we find that there are no real users of this interface.

This patch simply removes all occurrences of RDMA CM netlink in favour of
modern nldev implementation, which provides the same information and
accompanied by widely used user space component.

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Reviewed-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cma.c     | 83 ---------------------------------------
 drivers/infiniband/core/netlink.c |  4 +-
 include/uapi/rdma/rdma_netlink.h  | 17 +-------
 3 files changed, 3 insertions(+), 101 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 81bded0d37d1..e15546ae4d0f 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -4616,85 +4616,6 @@ static void cma_remove_one(struct ib_device *device, void *client_data)
 	kfree(cma_dev);
 }
 
-static int cma_get_id_stats(struct sk_buff *skb, struct netlink_callback *cb)
-{
-	struct nlmsghdr *nlh;
-	struct rdma_cm_id_stats *id_stats;
-	struct rdma_id_private *id_priv;
-	struct rdma_cm_id *id = NULL;
-	struct cma_device *cma_dev;
-	int i_dev = 0, i_id = 0;
-
-	/*
-	 * We export all of the IDs as a sequence of messages.  Each
-	 * ID gets its own netlink message.
-	 */
-	mutex_lock(&lock);
-
-	list_for_each_entry(cma_dev, &dev_list, list) {
-		if (i_dev < cb->args[0]) {
-			i_dev++;
-			continue;
-		}
-
-		i_id = 0;
-		list_for_each_entry(id_priv, &cma_dev->id_list, list) {
-			if (i_id < cb->args[1]) {
-				i_id++;
-				continue;
-			}
-
-			id_stats = ibnl_put_msg(skb, &nlh, cb->nlh->nlmsg_seq,
-						sizeof *id_stats, RDMA_NL_RDMA_CM,
-						RDMA_NL_RDMA_CM_ID_STATS,
-						NLM_F_MULTI);
-			if (!id_stats)
-				goto out;
-
-			memset(id_stats, 0, sizeof *id_stats);
-			id = &id_priv->id;
-			id_stats->node_type = id->route.addr.dev_addr.dev_type;
-			id_stats->port_num = id->port_num;
-			id_stats->bound_dev_if =
-				id->route.addr.dev_addr.bound_dev_if;
-
-			if (ibnl_put_attr(skb, nlh,
-					  rdma_addr_size(cma_src_addr(id_priv)),
-					  cma_src_addr(id_priv),
-					  RDMA_NL_RDMA_CM_ATTR_SRC_ADDR))
-				goto out;
-			if (ibnl_put_attr(skb, nlh,
-					  rdma_addr_size(cma_dst_addr(id_priv)),
-					  cma_dst_addr(id_priv),
-					  RDMA_NL_RDMA_CM_ATTR_DST_ADDR))
-				goto out;
-
-			id_stats->pid	= task_pid_vnr(id_priv->res.task);
-			id_stats->port_space	= id->ps;
-			id_stats->cm_state	= id_priv->state;
-			id_stats->qp_num	= id_priv->qp_num;
-			id_stats->qp_type	= id->qp_type;
-
-			i_id++;
-			nlmsg_end(skb, nlh);
-		}
-
-		cb->args[1] = 0;
-		i_dev++;
-	}
-
-out:
-	mutex_unlock(&lock);
-	cb->args[0] = i_dev;
-	cb->args[1] = i_id;
-
-	return skb->len;
-}
-
-static const struct rdma_nl_cbs cma_cb_table[RDMA_NL_RDMA_CM_NUM_OPS] = {
-	[RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats},
-};
-
 static int cma_init_net(struct net *net)
 {
 	struct cma_pernet *pernet = cma_pernet(net);
@@ -4743,7 +4664,6 @@ static int __init cma_init(void)
 	if (ret)
 		goto err;
 
-	rdma_nl_register(RDMA_NL_RDMA_CM, cma_cb_table);
 	cma_configfs_init();
 
 	return 0;
@@ -4759,7 +4679,6 @@ err_wq:
 static void __exit cma_cleanup(void)
 {
 	cma_configfs_exit();
-	rdma_nl_unregister(RDMA_NL_RDMA_CM);
 	ib_unregister_client(&cma_client);
 	unregister_netdevice_notifier(&cma_nb);
 	ib_sa_unregister_client(&sa_client);
@@ -4767,7 +4686,5 @@ static void __exit cma_cleanup(void)
 	destroy_workqueue(cma_wq);
 }
 
-MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_RDMA_CM, 1);
-
 module_init(cma_init);
 module_exit(cma_cleanup);
diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c
index 724f5a62e82f..eecfc0b377c9 100644
--- a/drivers/infiniband/core/netlink.c
+++ b/drivers/infiniband/core/netlink.c
@@ -56,7 +56,6 @@ EXPORT_SYMBOL(rdma_nl_chk_listeners);
 static bool is_nl_msg_valid(unsigned int type, unsigned int op)
 {
 	static const unsigned int max_num_ops[RDMA_NL_NUM_CLIENTS] = {
-		[RDMA_NL_RDMA_CM] = RDMA_NL_RDMA_CM_NUM_OPS,
 		[RDMA_NL_IWCM] = RDMA_NL_IWPM_NUM_OPS,
 		[RDMA_NL_LS] = RDMA_NL_LS_NUM_OPS,
 		[RDMA_NL_NLDEV] = RDMA_NLDEV_NUM_OPS,
@@ -181,8 +180,7 @@ static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 		return -EINVAL;
 	}
 	/* FIXME: Convert IWCM to properly handle doit callbacks */
-	if ((nlh->nlmsg_flags & NLM_F_DUMP) || index == RDMA_NL_RDMA_CM ||
-	    index == RDMA_NL_IWCM) {
+	if ((nlh->nlmsg_flags & NLM_F_DUMP) || index == RDMA_NL_IWCM) {
 		struct netlink_dump_control c = {
 			.dump = cb_table[op].dump,
 		};
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index 0f5263767fb4..3a9e681e4257 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -5,8 +5,7 @@
 #include <linux/types.h>
 
 enum {
-	RDMA_NL_RDMA_CM = 1,
-	RDMA_NL_IWCM,
+	RDMA_NL_IWCM = 2,
 	RDMA_NL_RSVD,
 	RDMA_NL_LS,	/* RDMA Local Services */
 	RDMA_NL_NLDEV,	/* RDMA device interface */
@@ -14,8 +13,7 @@ enum {
 };
 
 enum {
-	RDMA_NL_GROUP_CM = 1,
-	RDMA_NL_GROUP_IWPM,
+	RDMA_NL_GROUP_IWPM = 2,
 	RDMA_NL_GROUP_LS,
 	RDMA_NL_NUM_GROUPS
 };
@@ -24,17 +22,6 @@ enum {
 #define RDMA_NL_GET_OP(type) (type & ((1 << 10) - 1))
 #define RDMA_NL_GET_TYPE(client, op) ((client << 10) + op)
 
-enum {
-	RDMA_NL_RDMA_CM_ID_STATS = 0,
-	RDMA_NL_RDMA_CM_NUM_OPS
-};
-
-enum {
-	RDMA_NL_RDMA_CM_ATTR_SRC_ADDR = 1,
-	RDMA_NL_RDMA_CM_ATTR_DST_ADDR,
-	RDMA_NL_RDMA_CM_NUM_ATTR,
-};
-
 /* The minimum version that the iwpm kernel supports */
 #define IWPM_UABI_VERSION_MIN	3
 
-- 
cgit v1.2.3-59-g8ed1b


From 67dd1a36334ffce82bebeb2d633e152aa436d370 Mon Sep 17 00:00:00 2001
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Date: Thu, 31 Jan 2019 15:44:22 -0500
Subject: drm/amdgpu: Add AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

New chunk for dependency on start of job's execution instead on
the end. This is used for GPU deadlock prevention when
userspace uses mid-IB fences to wait for mid-IB work on other rings.

v2: Fix typo in AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES
v3: Bump KMS version
v4: put old fence AFTER acquiring the scheduled fence.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Suggested-by: Christian Koenig <Christian.Koenig@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 13 ++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  3 ++-
 include/uapi/drm/amdgpu_drm.h           |  1 +
 3 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 1c49b8266d69..52a5e4fdc95b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -214,6 +214,7 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
 		case AMDGPU_CHUNK_ID_DEPENDENCIES:
 		case AMDGPU_CHUNK_ID_SYNCOBJ_IN:
 		case AMDGPU_CHUNK_ID_SYNCOBJ_OUT:
+		case AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES:
 			break;
 
 		default:
@@ -1090,6 +1091,15 @@ static int amdgpu_cs_process_fence_dep(struct amdgpu_cs_parser *p,
 
 		fence = amdgpu_ctx_get_fence(ctx, entity,
 					     deps[i].handle);
+
+		if (chunk->chunk_id == AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES) {
+			struct drm_sched_fence *s_fence = to_drm_sched_fence(fence);
+			struct dma_fence *old = fence;
+
+			fence = dma_fence_get(&s_fence->scheduled);
+			dma_fence_put(old);
+		}
+
 		if (IS_ERR(fence)) {
 			r = PTR_ERR(fence);
 			amdgpu_ctx_put(ctx);
@@ -1177,7 +1187,8 @@ static int amdgpu_cs_dependencies(struct amdgpu_device *adev,
 
 		chunk = &p->chunks[i];
 
-		if (chunk->chunk_id == AMDGPU_CHUNK_ID_DEPENDENCIES) {
+		if (chunk->chunk_id == AMDGPU_CHUNK_ID_DEPENDENCIES ||
+		    chunk->chunk_id == AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES) {
 			r = amdgpu_cs_process_fence_dep(p, chunk);
 			if (r)
 				return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index c806f984bcc5..1158a6f4eec6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -71,9 +71,10 @@
  * - 3.25.0 - Add support for sensor query info (stable pstate sclk/mclk).
  * - 3.26.0 - GFX9: Process AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE.
  * - 3.27.0 - Add new chunk to to AMDGPU_CS to enable BO_LIST creation.
+ * - 3.28.0 - Add AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES
  */
 #define KMS_DRIVER_MAJOR	3
-#define KMS_DRIVER_MINOR	27
+#define KMS_DRIVER_MINOR	28
 #define KMS_DRIVER_PATCHLEVEL	0
 
 int amdgpu_vram_limit = 0;
diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index be84e43c1e19..47296f2ec3fa 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -523,6 +523,7 @@ struct drm_amdgpu_gem_va {
 #define AMDGPU_CHUNK_ID_SYNCOBJ_IN      0x04
 #define AMDGPU_CHUNK_ID_SYNCOBJ_OUT     0x05
 #define AMDGPU_CHUNK_ID_BO_HANDLES      0x06
+#define AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES	0x07
 
 struct drm_amdgpu_cs_chunk {
 	__u32		chunk_id;
-- 
cgit v1.2.3-59-g8ed1b


From 41cca166cc57e75e94d888595a428d23a3bf4e36 Mon Sep 17 00:00:00 2001
From: Marek Olšák <marek.olsak@amd.com>
Date: Mon, 21 Jan 2019 17:22:55 -0500
Subject: drm/amdgpu: add a workaround for GDS ordered append hangs with
 compute queues
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I'm not increasing the DRM version because GDS isn't totally without bugs yet.

v2: update emit_ib_size

Signed-off-by: Marek Olšák <marek.olsak@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h |  2 ++
 drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c   | 19 +++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   | 21 +++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   | 40 +++++++++++++++++++++++++++++++--
 include/uapi/drm/amdgpu_drm.h           |  5 +++++
 6 files changed, 84 insertions(+), 6 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 1158a6f4eec6..2f0ea380c031 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -72,9 +72,10 @@
  * - 3.26.0 - GFX9: Process AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE.
  * - 3.27.0 - Add new chunk to to AMDGPU_CS to enable BO_LIST creation.
  * - 3.28.0 - Add AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES
+ * - 3.29.0 - Add AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID
  */
 #define KMS_DRIVER_MAJOR	3
-#define KMS_DRIVER_MINOR	28
+#define KMS_DRIVER_MINOR	29
 #define KMS_DRIVER_PATCHLEVEL	0
 
 int amdgpu_vram_limit = 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
index ecbcefe49a98..f89f5734d985 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
@@ -37,6 +37,8 @@ struct amdgpu_gds {
 	struct amdgpu_gds_asic_info	mem;
 	struct amdgpu_gds_asic_info	gws;
 	struct amdgpu_gds_asic_info	oa;
+	uint32_t			gds_compute_max_wave_id;
+
 	/* At present, GDS, GWS and OA resources for gfx (graphics)
 	 * is always pre-allocated and available for graphics operation.
 	 * Such resource is shared between all gfx clients.
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
index 7984292f9282..a59e0fdf5a97 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
@@ -2264,6 +2264,22 @@ static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 	u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
 
+	/* Currently, there is a high possibility to get wave ID mismatch
+	 * between ME and GDS, leading to a hw deadlock, because ME generates
+	 * different wave IDs than the GDS expects. This situation happens
+	 * randomly when at least 5 compute pipes use GDS ordered append.
+	 * The wave IDs generated by ME are also wrong after suspend/resume.
+	 * Those are probably bugs somewhere else in the kernel driver.
+	 *
+	 * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and
+	 * GDS to 0 for this ring (me/pipe).
+	 */
+	if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
+		amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
+		amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID - PACKET3_SET_CONFIG_REG_START);
+		amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
+	}
+
 	amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
 	amdgpu_ring_write(ring,
 #ifdef __BIG_ENDIAN
@@ -5000,7 +5016,7 @@ static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute = {
 		7 + /* gfx_v7_0_ring_emit_pipeline_sync */
 		CIK_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v7_0_ring_emit_vm_flush */
 		7 + 7 + 7, /* gfx_v7_0_ring_emit_fence_compute x3 for user fence, vm fence */
-	.emit_ib_size =	4, /* gfx_v7_0_ring_emit_ib_compute */
+	.emit_ib_size =	7, /* gfx_v7_0_ring_emit_ib_compute */
 	.emit_ib = gfx_v7_0_ring_emit_ib_compute,
 	.emit_fence = gfx_v7_0_ring_emit_fence_compute,
 	.emit_pipeline_sync = gfx_v7_0_ring_emit_pipeline_sync,
@@ -5057,6 +5073,7 @@ static void gfx_v7_0_set_gds_init(struct amdgpu_device *adev)
 	adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
 	adev->gds.gws.total_size = 64;
 	adev->gds.oa.total_size = 16;
+	adev->gds.gds_compute_max_wave_id = RREG32(mmGDS_COMPUTE_MAX_WAVE_ID);
 
 	if (adev->gds.mem.total_size == 64 * 1024) {
 		adev->gds.mem.gfx_partition_size = 4096;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index a26747681ed6..b8e50a34bdb3 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -6084,6 +6084,22 @@ static void gfx_v8_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 	u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
 
+	/* Currently, there is a high possibility to get wave ID mismatch
+	 * between ME and GDS, leading to a hw deadlock, because ME generates
+	 * different wave IDs than the GDS expects. This situation happens
+	 * randomly when at least 5 compute pipes use GDS ordered append.
+	 * The wave IDs generated by ME are also wrong after suspend/resume.
+	 * Those are probably bugs somewhere else in the kernel driver.
+	 *
+	 * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and
+	 * GDS to 0 for this ring (me/pipe).
+	 */
+	if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
+		amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
+		amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID - PACKET3_SET_CONFIG_REG_START);
+		amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
+	}
+
 	amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
 	amdgpu_ring_write(ring,
 #ifdef __BIG_ENDIAN
@@ -6890,7 +6906,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
 		7 + /* gfx_v8_0_ring_emit_pipeline_sync */
 		VI_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v8_0_ring_emit_vm_flush */
 		7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_compute x3 for user fence, vm fence */
-	.emit_ib_size =	4, /* gfx_v8_0_ring_emit_ib_compute */
+	.emit_ib_size =	7, /* gfx_v8_0_ring_emit_ib_compute */
 	.emit_ib = gfx_v8_0_ring_emit_ib_compute,
 	.emit_fence = gfx_v8_0_ring_emit_fence_compute,
 	.emit_pipeline_sync = gfx_v8_0_ring_emit_pipeline_sync,
@@ -6920,7 +6936,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_kiq = {
 		7 + /* gfx_v8_0_ring_emit_pipeline_sync */
 		17 + /* gfx_v8_0_ring_emit_vm_flush */
 		7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_kiq x3 for user fence, vm fence */
-	.emit_ib_size =	4, /* gfx_v8_0_ring_emit_ib_compute */
+	.emit_ib_size =	7, /* gfx_v8_0_ring_emit_ib_compute */
 	.emit_fence = gfx_v8_0_ring_emit_fence_kiq,
 	.test_ring = gfx_v8_0_ring_test_ring,
 	.insert_nop = amdgpu_ring_insert_nop,
@@ -6996,6 +7012,7 @@ static void gfx_v8_0_set_gds_init(struct amdgpu_device *adev)
 	adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
 	adev->gds.gws.total_size = 64;
 	adev->gds.oa.total_size = 16;
+	adev->gds.gds_compute_max_wave_id = RREG32(mmGDS_COMPUTE_MAX_WAVE_ID);
 
 	if (adev->gds.mem.total_size == 64 * 1024) {
 		adev->gds.mem.gfx_partition_size = 4096;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 262ee3cf6f1c..5533f6e4f4a4 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4010,6 +4010,22 @@ static void gfx_v9_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 	u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
 
+	/* Currently, there is a high possibility to get wave ID mismatch
+	 * between ME and GDS, leading to a hw deadlock, because ME generates
+	 * different wave IDs than the GDS expects. This situation happens
+	 * randomly when at least 5 compute pipes use GDS ordered append.
+	 * The wave IDs generated by ME are also wrong after suspend/resume.
+	 * Those are probably bugs somewhere else in the kernel driver.
+	 *
+	 * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and
+	 * GDS to 0 for this ring (me/pipe).
+	 */
+	if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
+		amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
+		amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID);
+		amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
+	}
+
 	amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
 	BUG_ON(ib->gpu_addr & 0x3); /* Dword align */
 	amdgpu_ring_write(ring,
@@ -4729,7 +4745,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
 		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
 		2 + /* gfx_v9_0_ring_emit_vm_flush */
 		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
-	.emit_ib_size =	4, /* gfx_v9_0_ring_emit_ib_compute */
+	.emit_ib_size =	7, /* gfx_v9_0_ring_emit_ib_compute */
 	.emit_ib = gfx_v9_0_ring_emit_ib_compute,
 	.emit_fence = gfx_v9_0_ring_emit_fence,
 	.emit_pipeline_sync = gfx_v9_0_ring_emit_pipeline_sync,
@@ -4764,7 +4780,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
 		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
 		2 + /* gfx_v9_0_ring_emit_vm_flush */
 		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence_kiq x3 for user fence, vm fence */
-	.emit_ib_size =	4, /* gfx_v9_0_ring_emit_ib_compute */
+	.emit_ib_size =	7, /* gfx_v9_0_ring_emit_ib_compute */
 	.emit_fence = gfx_v9_0_ring_emit_fence_kiq,
 	.test_ring = gfx_v9_0_ring_test_ring,
 	.insert_nop = amdgpu_ring_insert_nop,
@@ -4846,6 +4862,26 @@ static void gfx_v9_0_set_gds_init(struct amdgpu_device *adev)
 		break;
 	}
 
+	switch (adev->asic_type) {
+	case CHIP_VEGA10:
+	case CHIP_VEGA20:
+		adev->gds.gds_compute_max_wave_id = 0x7ff;
+		break;
+	case CHIP_VEGA12:
+		adev->gds.gds_compute_max_wave_id = 0x27f;
+		break;
+	case CHIP_RAVEN:
+		if (adev->rev_id >= 0x8)
+			adev->gds.gds_compute_max_wave_id = 0x77; /* raven2 */
+		else
+			adev->gds.gds_compute_max_wave_id = 0x15f; /* raven1 */
+		break;
+	default:
+		/* this really depends on the chip */
+		adev->gds.gds_compute_max_wave_id = 0x7ff;
+		break;
+	}
+
 	adev->gds.gws.total_size = 64;
 	adev->gds.oa.total_size = 16;
 
diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index 47296f2ec3fa..2acbc8bc465b 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -566,6 +566,11 @@ union drm_amdgpu_cs {
  * caches (L2/vL1/sL1/I$). */
 #define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3)
 
+/* Set GDS_COMPUTE_MAX_WAVE_ID = DEFAULT before PACKET3_INDIRECT_BUFFER.
+ * This will reset wave ID counters for the IB.
+ */
+#define AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID (1 << 4)
+
 struct drm_amdgpu_cs_chunk_ib {
 	__u32 _pad;
 	/** AMDGPU_IB_FLAG_* */
-- 
cgit v1.2.3-59-g8ed1b


From 2c620ff93d9fbd5d644760d4c21d389078ec1080 Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Mon, 2 Jul 2018 22:44:20 -0700
Subject: time: Add struct __kernel_timex

struct timex uses struct timeval internally.
struct timeval is not y2038 safe.
Introduce a new UAPI type struct __kernel_timex
that is y2038 safe.

struct __kernel_timex uses a timeval type that is
similar to struct __kernel_timespec which preserves the
same structure size across 32 bit and 64 bit ABIs.
struct __kernel_timex also restructures other members of the
structure to make the structure the same on 64 bit and 32 bit
architectures.
Note that struct __kernel_timex is the same as struct timex
on a 64 bit architecture.

The above solution is similar to other new y2038 syscalls
that are being introduced: both 32 bit and 64 bit ABIs
have a common entry, and the compat entry supports the old 32 bit
syscall interface.

Alternatives considered were:
1. Add new time type to struct timex that makes use of padded
   bits. This time type could be based on the struct __kernel_timespec.
   modes will use a flag to notify which time structure should be
   used internally.
   This needs some application level changes on both 64 bit and 32 bit
   architectures. Although 64 bit machines could continue to use the
   older timeval structure without any changes.

2. Add a new u8 type to struct timex that makes use of padded bits. This
   can be used to save higher order tv_sec bits. modes will use a flag to
   notify presence of such a type.
   This will need some application level changes on 32 bit architectures.

3. Add a new compat_timex structure that differs in only the size of the
   time type; keep rest of struct timex the same.
   This requires extra syscalls to manage all 3 cases on 64 bit
   architectures. This will not need any application level changes but will
   add more complexity from kernel side.

Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 include/linux/timex.h      |  7 +++++++
 include/uapi/linux/timex.h | 41 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+)

(limited to 'include/uapi')

diff --git a/include/linux/timex.h b/include/linux/timex.h
index 39c25dbebfe8..7f40e9e42ecc 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -53,6 +53,13 @@
 #ifndef _LINUX_TIMEX_H
 #define _LINUX_TIMEX_H
 
+/* CONFIG_64BIT_TIME enables new 64 bit time_t syscalls in the compat path
+ * and 32-bit emulation.
+ */
+#ifndef CONFIG_64BIT_TIME
+#define __kernel_timex timex
+#endif
+
 #include <uapi/linux/timex.h>
 
 #define ADJ_ADJTIME		0x8000	/* switch between adjtime/adjtimex modes */
diff --git a/include/uapi/linux/timex.h b/include/uapi/linux/timex.h
index 92685d826444..a1c6b73016a5 100644
--- a/include/uapi/linux/timex.h
+++ b/include/uapi/linux/timex.h
@@ -92,6 +92,47 @@ struct timex {
 	int  :32; int  :32; int  :32;
 };
 
+struct __kernel_timex_timeval {
+	__kernel_time64_t       tv_sec;
+	long long		tv_usec;
+};
+
+#ifndef __kernel_timex
+struct __kernel_timex {
+	unsigned int modes;	/* mode selector */
+	int :32;            /* pad */
+	long long offset;	/* time offset (usec) */
+	long long freq;	/* frequency offset (scaled ppm) */
+	long long maxerror;/* maximum error (usec) */
+	long long esterror;/* estimated error (usec) */
+	int status;		/* clock command/status */
+	int :32;            /* pad */
+	long long constant;/* pll time constant */
+	long long precision;/* clock precision (usec) (read only) */
+	long long tolerance;/* clock frequency tolerance (ppm)
+				   * (read only)
+				   */
+	struct __kernel_timex_timeval time;	/* (read only, except for ADJ_SETOFFSET) */
+	long long tick;	/* (modified) usecs between clock ticks */
+
+	long long ppsfreq;/* pps frequency (scaled ppm) (ro) */
+	long long jitter; /* pps jitter (us) (ro) */
+	int shift;              /* interval duration (s) (shift) (ro) */
+	int :32;            /* pad */
+	long long stabil;            /* pps stability (scaled ppm) (ro) */
+	long long jitcnt; /* jitter limit exceeded (ro) */
+	long long calcnt; /* calibration intervals (ro) */
+	long long errcnt; /* calibration errors (ro) */
+	long long stbcnt; /* stability limit exceeded (ro) */
+
+	int tai;		/* TAI offset (ro) */
+
+	int  :32; int  :32; int  :32; int  :32;
+	int  :32; int  :32; int  :32; int  :32;
+	int  :32; int  :32; int  :32;
+};
+#endif
+
 /*
  * Mode codes (timex.mode)
  */
-- 
cgit v1.2.3-59-g8ed1b


From 8dabe7245bbc134f2cfcc12cde75c019dab924cc Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 7 Jan 2019 00:33:08 +0100
Subject: y2038: syscalls: rename y2038 compat syscalls

A lot of system calls that pass a time_t somewhere have an implementation
using a COMPAT_SYSCALL_DEFINEx() on 64-bit architectures, and have
been reworked so that this implementation can now be used on 32-bit
architectures as well.

The missing step is to redefine them using the regular SYSCALL_DEFINEx()
to get them out of the compat namespace and make it possible to build them
on 32-bit architectures.

Any system call that ends in 'time' gets a '32' suffix on its name for
that version, while the others get a '_time32' suffix, to distinguish
them from the normal version, which takes a 64-bit time argument in the
future.

In this step, only 64-bit architectures are changed, doing this rename
first lets us avoid touching the 32-bit architectures twice.

Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/arm64/include/asm/unistd32.h         | 48 ++++++++++----------
 arch/mips/kernel/syscalls/syscall_n32.tbl | 50 ++++++++++-----------
 arch/mips/kernel/syscalls/syscall_o32.tbl | 52 +++++++++++-----------
 arch/parisc/kernel/syscalls/syscall.tbl   | 54 +++++++++++------------
 arch/powerpc/kernel/syscalls/syscall.tbl  | 52 +++++++++++-----------
 arch/s390/kernel/syscalls/syscall.tbl     | 52 +++++++++++-----------
 arch/sparc/kernel/syscalls/syscall.tbl    | 52 +++++++++++-----------
 arch/x86/entry/syscalls/syscall_32.tbl    | 52 +++++++++++-----------
 fs/aio.c                                  | 10 ++---
 fs/select.c                               |  4 +-
 fs/timerfd.c                              |  4 +-
 fs/utimes.c                               | 10 ++---
 include/linux/compat.h                    | 73 ++-----------------------------
 include/linux/syscalls.h                  | 57 ++++++++++++++++++++++++
 include/uapi/asm-generic/unistd.h         | 44 +++++++++----------
 ipc/mqueue.c                              | 16 +++----
 ipc/sem.c                                 |  2 +-
 kernel/futex.c                            |  2 +-
 kernel/sched/core.c                       |  5 +--
 kernel/signal.c                           |  2 +-
 kernel/sys_ni.c                           | 18 ++++----
 kernel/time/hrtimer.c                     |  2 +-
 kernel/time/posix-stubs.c                 | 25 ++++++-----
 kernel/time/posix-timers.c                | 32 +++++++-------
 kernel/time/time.c                        |  8 ++--
 net/compat.c                              |  2 +-
 26 files changed, 361 insertions(+), 367 deletions(-)

(limited to 'include/uapi')

diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index d10cce69a4b0..1ded82857161 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -270,7 +270,7 @@ __SYSCALL(__NR_uname, sys_newuname)
 			/* 123 was sys_modify_ldt */
 __SYSCALL(123, sys_ni_syscall)
 #define __NR_adjtimex 124
-__SYSCALL(__NR_adjtimex, compat_sys_adjtimex)
+__SYSCALL(__NR_adjtimex, sys_adjtimex_time32)
 #define __NR_mprotect 125
 __SYSCALL(__NR_mprotect, sys_mprotect)
 #define __NR_sigprocmask 126
@@ -344,9 +344,9 @@ __SYSCALL(__NR_sched_get_priority_max, sys_sched_get_priority_max)
 #define __NR_sched_get_priority_min 160
 __SYSCALL(__NR_sched_get_priority_min, sys_sched_get_priority_min)
 #define __NR_sched_rr_get_interval 161
-__SYSCALL(__NR_sched_rr_get_interval, compat_sys_sched_rr_get_interval)
+__SYSCALL(__NR_sched_rr_get_interval, sys_sched_rr_get_interval_time32)
 #define __NR_nanosleep 162
-__SYSCALL(__NR_nanosleep, compat_sys_nanosleep)
+__SYSCALL(__NR_nanosleep, sys_nanosleep_time32)
 #define __NR_mremap 163
 __SYSCALL(__NR_mremap, sys_mremap)
 #define __NR_setresuid 164
@@ -376,7 +376,7 @@ __SYSCALL(__NR_rt_sigprocmask, compat_sys_rt_sigprocmask)
 #define __NR_rt_sigpending 176
 __SYSCALL(__NR_rt_sigpending, compat_sys_rt_sigpending)
 #define __NR_rt_sigtimedwait 177
-__SYSCALL(__NR_rt_sigtimedwait, compat_sys_rt_sigtimedwait)
+__SYSCALL(__NR_rt_sigtimedwait, compat_sys_rt_sigtimedwait_time32)
 #define __NR_rt_sigqueueinfo 178
 __SYSCALL(__NR_rt_sigqueueinfo, compat_sys_rt_sigqueueinfo)
 #define __NR_rt_sigsuspend 179
@@ -502,7 +502,7 @@ __SYSCALL(__NR_tkill, sys_tkill)
 #define __NR_sendfile64 239
 __SYSCALL(__NR_sendfile64, sys_sendfile64)
 #define __NR_futex 240
-__SYSCALL(__NR_futex, compat_sys_futex)
+__SYSCALL(__NR_futex, sys_futex_time32)
 #define __NR_sched_setaffinity 241
 __SYSCALL(__NR_sched_setaffinity, compat_sys_sched_setaffinity)
 #define __NR_sched_getaffinity 242
@@ -512,7 +512,7 @@ __SYSCALL(__NR_io_setup, compat_sys_io_setup)
 #define __NR_io_destroy 244
 __SYSCALL(__NR_io_destroy, sys_io_destroy)
 #define __NR_io_getevents 245
-__SYSCALL(__NR_io_getevents, compat_sys_io_getevents)
+__SYSCALL(__NR_io_getevents, sys_io_getevents_time32)
 #define __NR_io_submit 246
 __SYSCALL(__NR_io_submit, compat_sys_io_submit)
 #define __NR_io_cancel 247
@@ -538,21 +538,21 @@ __SYSCALL(__NR_set_tid_address, sys_set_tid_address)
 #define __NR_timer_create 257
 __SYSCALL(__NR_timer_create, compat_sys_timer_create)
 #define __NR_timer_settime 258
-__SYSCALL(__NR_timer_settime, compat_sys_timer_settime)
+__SYSCALL(__NR_timer_settime, sys_timer_settime32)
 #define __NR_timer_gettime 259
-__SYSCALL(__NR_timer_gettime, compat_sys_timer_gettime)
+__SYSCALL(__NR_timer_gettime, sys_timer_gettime32)
 #define __NR_timer_getoverrun 260
 __SYSCALL(__NR_timer_getoverrun, sys_timer_getoverrun)
 #define __NR_timer_delete 261
 __SYSCALL(__NR_timer_delete, sys_timer_delete)
 #define __NR_clock_settime 262
-__SYSCALL(__NR_clock_settime, compat_sys_clock_settime)
+__SYSCALL(__NR_clock_settime, sys_clock_settime32)
 #define __NR_clock_gettime 263
-__SYSCALL(__NR_clock_gettime, compat_sys_clock_gettime)
+__SYSCALL(__NR_clock_gettime, sys_clock_gettime32)
 #define __NR_clock_getres 264
-__SYSCALL(__NR_clock_getres, compat_sys_clock_getres)
+__SYSCALL(__NR_clock_getres, sys_clock_getres_time32)
 #define __NR_clock_nanosleep 265
-__SYSCALL(__NR_clock_nanosleep, compat_sys_clock_nanosleep)
+__SYSCALL(__NR_clock_nanosleep, sys_clock_nanosleep_time32)
 #define __NR_statfs64 266
 __SYSCALL(__NR_statfs64, compat_sys_aarch32_statfs64)
 #define __NR_fstatfs64 267
@@ -560,7 +560,7 @@ __SYSCALL(__NR_fstatfs64, compat_sys_aarch32_fstatfs64)
 #define __NR_tgkill 268
 __SYSCALL(__NR_tgkill, sys_tgkill)
 #define __NR_utimes 269
-__SYSCALL(__NR_utimes, compat_sys_utimes)
+__SYSCALL(__NR_utimes, sys_utimes_time32)
 #define __NR_arm_fadvise64_64 270
 __SYSCALL(__NR_arm_fadvise64_64, compat_sys_aarch32_fadvise64_64)
 #define __NR_pciconfig_iobase 271
@@ -574,9 +574,9 @@ __SYSCALL(__NR_mq_open, compat_sys_mq_open)
 #define __NR_mq_unlink 275
 __SYSCALL(__NR_mq_unlink, sys_mq_unlink)
 #define __NR_mq_timedsend 276
-__SYSCALL(__NR_mq_timedsend, compat_sys_mq_timedsend)
+__SYSCALL(__NR_mq_timedsend, sys_mq_timedsend_time32)
 #define __NR_mq_timedreceive 277
-__SYSCALL(__NR_mq_timedreceive, compat_sys_mq_timedreceive)
+__SYSCALL(__NR_mq_timedreceive, sys_mq_timedreceive_time32)
 #define __NR_mq_notify 278
 __SYSCALL(__NR_mq_notify, compat_sys_mq_notify)
 #define __NR_mq_getsetattr 279
@@ -646,7 +646,7 @@ __SYSCALL(__NR_request_key, sys_request_key)
 #define __NR_keyctl 311
 __SYSCALL(__NR_keyctl, compat_sys_keyctl)
 #define __NR_semtimedop 312
-__SYSCALL(__NR_semtimedop, compat_sys_semtimedop)
+__SYSCALL(__NR_semtimedop, sys_semtimedop_time32)
 #define __NR_vserver 313
 __SYSCALL(__NR_vserver, sys_ni_syscall)
 #define __NR_ioprio_set 314
@@ -674,7 +674,7 @@ __SYSCALL(__NR_mknodat, sys_mknodat)
 #define __NR_fchownat 325
 __SYSCALL(__NR_fchownat, sys_fchownat)
 #define __NR_futimesat 326
-__SYSCALL(__NR_futimesat, compat_sys_futimesat)
+__SYSCALL(__NR_futimesat, sys_futimesat_time32)
 #define __NR_fstatat64 327
 __SYSCALL(__NR_fstatat64, sys_fstatat64)
 #define __NR_unlinkat 328
@@ -692,9 +692,9 @@ __SYSCALL(__NR_fchmodat, sys_fchmodat)
 #define __NR_faccessat 334
 __SYSCALL(__NR_faccessat, sys_faccessat)
 #define __NR_pselect6 335
-__SYSCALL(__NR_pselect6, compat_sys_pselect6)
+__SYSCALL(__NR_pselect6, compat_sys_pselect6_time32)
 #define __NR_ppoll 336
-__SYSCALL(__NR_ppoll, compat_sys_ppoll)
+__SYSCALL(__NR_ppoll, compat_sys_ppoll_time32)
 #define __NR_unshare 337
 __SYSCALL(__NR_unshare, sys_unshare)
 #define __NR_set_robust_list 338
@@ -718,7 +718,7 @@ __SYSCALL(__NR_epoll_pwait, compat_sys_epoll_pwait)
 #define __NR_kexec_load 347
 __SYSCALL(__NR_kexec_load, compat_sys_kexec_load)
 #define __NR_utimensat 348
-__SYSCALL(__NR_utimensat, compat_sys_utimensat)
+__SYSCALL(__NR_utimensat, sys_utimensat_time32)
 #define __NR_signalfd 349
 __SYSCALL(__NR_signalfd, compat_sys_signalfd)
 #define __NR_timerfd_create 350
@@ -728,9 +728,9 @@ __SYSCALL(__NR_eventfd, sys_eventfd)
 #define __NR_fallocate 352
 __SYSCALL(__NR_fallocate, compat_sys_aarch32_fallocate)
 #define __NR_timerfd_settime 353
-__SYSCALL(__NR_timerfd_settime, compat_sys_timerfd_settime)
+__SYSCALL(__NR_timerfd_settime, sys_timerfd_settime32)
 #define __NR_timerfd_gettime 354
-__SYSCALL(__NR_timerfd_gettime, compat_sys_timerfd_gettime)
+__SYSCALL(__NR_timerfd_gettime, sys_timerfd_gettime32)
 #define __NR_signalfd4 355
 __SYSCALL(__NR_signalfd4, compat_sys_signalfd4)
 #define __NR_eventfd2 356
@@ -752,7 +752,7 @@ __SYSCALL(__NR_rt_tgsigqueueinfo, compat_sys_rt_tgsigqueueinfo)
 #define __NR_perf_event_open 364
 __SYSCALL(__NR_perf_event_open, sys_perf_event_open)
 #define __NR_recvmmsg 365
-__SYSCALL(__NR_recvmmsg, compat_sys_recvmmsg)
+__SYSCALL(__NR_recvmmsg, compat_sys_recvmmsg_time32)
 #define __NR_accept4 366
 __SYSCALL(__NR_accept4, sys_accept4)
 #define __NR_fanotify_init 367
@@ -766,7 +766,7 @@ __SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at)
 #define __NR_open_by_handle_at 371
 __SYSCALL(__NR_open_by_handle_at, compat_sys_open_by_handle_at)
 #define __NR_clock_adjtime 372
-__SYSCALL(__NR_clock_adjtime, compat_sys_clock_adjtime)
+__SYSCALL(__NR_clock_adjtime, sys_clock_adjtime32)
 #define __NR_syncfs 373
 __SYSCALL(__NR_syncfs, sys_syncfs)
 #define __NR_sendmmsg 374
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index cc134b1211aa..6d1e019817c8 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -41,7 +41,7 @@
 31	n32	dup				sys_dup
 32	n32	dup2				sys_dup2
 33	n32	pause				sys_pause
-34	n32	nanosleep			compat_sys_nanosleep
+34	n32	nanosleep			sys_nanosleep_time32
 35	n32	getitimer			compat_sys_getitimer
 36	n32	setitimer			compat_sys_setitimer
 37	n32	alarm				sys_alarm
@@ -133,11 +133,11 @@
 123	n32	capget				sys_capget
 124	n32	capset				sys_capset
 125	n32	rt_sigpending			compat_sys_rt_sigpending
-126	n32	rt_sigtimedwait			compat_sys_rt_sigtimedwait
+126	n32	rt_sigtimedwait			compat_sys_rt_sigtimedwait_time32
 127	n32	rt_sigqueueinfo			compat_sys_rt_sigqueueinfo
 128	n32	rt_sigsuspend			compat_sys_rt_sigsuspend
 129	n32	sigaltstack			compat_sys_sigaltstack
-130	n32	utime				compat_sys_utime
+130	n32	utime				sys_utime32
 131	n32	mknod				sys_mknod
 132	n32	personality			sys_32_personality
 133	n32	ustat				compat_sys_ustat
@@ -152,7 +152,7 @@
 142	n32	sched_getscheduler		sys_sched_getscheduler
 143	n32	sched_get_priority_max		sys_sched_get_priority_max
 144	n32	sched_get_priority_min		sys_sched_get_priority_min
-145	n32	sched_rr_get_interval		compat_sys_sched_rr_get_interval
+145	n32	sched_rr_get_interval		sys_sched_rr_get_interval_time32
 146	n32	mlock				sys_mlock
 147	n32	munlock				sys_munlock
 148	n32	mlockall			sys_mlockall
@@ -161,7 +161,7 @@
 151	n32	pivot_root			sys_pivot_root
 152	n32	_sysctl				compat_sys_sysctl
 153	n32	prctl				sys_prctl
-154	n32	adjtimex			compat_sys_adjtimex
+154	n32	adjtimex			sys_adjtimex_time32
 155	n32	setrlimit			compat_sys_setrlimit
 156	n32	chroot				sys_chroot
 157	n32	sync				sys_sync
@@ -202,7 +202,7 @@
 191	n32	fremovexattr			sys_fremovexattr
 192	n32	tkill				sys_tkill
 193	n32	reserved193			sys_ni_syscall
-194	n32	futex				compat_sys_futex
+194	n32	futex				sys_futex_time32
 195	n32	sched_setaffinity		compat_sys_sched_setaffinity
 196	n32	sched_getaffinity		compat_sys_sched_getaffinity
 197	n32	cacheflush			sys_cacheflush
@@ -210,7 +210,7 @@
 199	n32	sysmips				__sys_sysmips
 200	n32	io_setup			compat_sys_io_setup
 201	n32	io_destroy			sys_io_destroy
-202	n32	io_getevents			compat_sys_io_getevents
+202	n32	io_getevents			sys_io_getevents_time32
 203	n32	io_submit			compat_sys_io_submit
 204	n32	io_cancel			sys_io_cancel
 205	n32	exit_group			sys_exit_group
@@ -223,29 +223,29 @@
 212	n32	fcntl64				compat_sys_fcntl64
 213	n32	set_tid_address			sys_set_tid_address
 214	n32	restart_syscall			sys_restart_syscall
-215	n32	semtimedop			compat_sys_semtimedop
+215	n32	semtimedop			sys_semtimedop_time32
 216	n32	fadvise64			sys_fadvise64_64
 217	n32	statfs64			compat_sys_statfs64
 218	n32	fstatfs64			compat_sys_fstatfs64
 219	n32	sendfile64			sys_sendfile64
 220	n32	timer_create			compat_sys_timer_create
-221	n32	timer_settime			compat_sys_timer_settime
-222	n32	timer_gettime			compat_sys_timer_gettime
+221	n32	timer_settime			sys_timer_settime32
+222	n32	timer_gettime			sys_timer_gettime32
 223	n32	timer_getoverrun		sys_timer_getoverrun
 224	n32	timer_delete			sys_timer_delete
-225	n32	clock_settime			compat_sys_clock_settime
-226	n32	clock_gettime			compat_sys_clock_gettime
-227	n32	clock_getres			compat_sys_clock_getres
-228	n32	clock_nanosleep			compat_sys_clock_nanosleep
+225	n32	clock_settime			sys_clock_settime32
+226	n32	clock_gettime			sys_clock_gettime32
+227	n32	clock_getres			sys_clock_getres_time32
+228	n32	clock_nanosleep			sys_clock_nanosleep_time32
 229	n32	tgkill				sys_tgkill
-230	n32	utimes				compat_sys_utimes
+230	n32	utimes				sys_utimes_time32
 231	n32	mbind				compat_sys_mbind
 232	n32	get_mempolicy			compat_sys_get_mempolicy
 233	n32	set_mempolicy			compat_sys_set_mempolicy
 234	n32	mq_open				compat_sys_mq_open
 235	n32	mq_unlink			sys_mq_unlink
-236	n32	mq_timedsend			compat_sys_mq_timedsend
-237	n32	mq_timedreceive			compat_sys_mq_timedreceive
+236	n32	mq_timedsend			sys_mq_timedsend_time32
+237	n32	mq_timedreceive			sys_mq_timedreceive_time32
 238	n32	mq_notify			compat_sys_mq_notify
 239	n32	mq_getsetattr			compat_sys_mq_getsetattr
 240	n32	vserver				sys_ni_syscall
@@ -263,7 +263,7 @@
 252	n32	mkdirat				sys_mkdirat
 253	n32	mknodat				sys_mknodat
 254	n32	fchownat			sys_fchownat
-255	n32	futimesat			compat_sys_futimesat
+255	n32	futimesat			sys_futimesat_time32
 256	n32	newfstatat			sys_newfstatat
 257	n32	unlinkat			sys_unlinkat
 258	n32	renameat			sys_renameat
@@ -272,8 +272,8 @@
 261	n32	readlinkat			sys_readlinkat
 262	n32	fchmodat			sys_fchmodat
 263	n32	faccessat			sys_faccessat
-264	n32	pselect6			compat_sys_pselect6
-265	n32	ppoll				compat_sys_ppoll
+264	n32	pselect6			compat_sys_pselect6_time32
+265	n32	ppoll				compat_sys_ppoll_time32
 266	n32	unshare				sys_unshare
 267	n32	splice				sys_splice
 268	n32	sync_file_range			sys_sync_file_range
@@ -287,14 +287,14 @@
 276	n32	epoll_pwait			compat_sys_epoll_pwait
 277	n32	ioprio_set			sys_ioprio_set
 278	n32	ioprio_get			sys_ioprio_get
-279	n32	utimensat			compat_sys_utimensat
+279	n32	utimensat			sys_utimensat_time32
 280	n32	signalfd			compat_sys_signalfd
 281	n32	timerfd				sys_ni_syscall
 282	n32	eventfd				sys_eventfd
 283	n32	fallocate			sys_fallocate
 284	n32	timerfd_create			sys_timerfd_create
-285	n32	timerfd_gettime			compat_sys_timerfd_gettime
-286	n32	timerfd_settime			compat_sys_timerfd_settime
+285	n32	timerfd_gettime			sys_timerfd_gettime32
+286	n32	timerfd_settime			sys_timerfd_settime32
 287	n32	signalfd4			compat_sys_signalfd4
 288	n32	eventfd2			sys_eventfd2
 289	n32	epoll_create1			sys_epoll_create1
@@ -306,14 +306,14 @@
 295	n32	rt_tgsigqueueinfo		compat_sys_rt_tgsigqueueinfo
 296	n32	perf_event_open			sys_perf_event_open
 297	n32	accept4				sys_accept4
-298	n32	recvmmsg			compat_sys_recvmmsg
+298	n32	recvmmsg			compat_sys_recvmmsg_time32
 299	n32	getdents64			sys_getdents64
 300	n32	fanotify_init			sys_fanotify_init
 301	n32	fanotify_mark			sys_fanotify_mark
 302	n32	prlimit64			sys_prlimit64
 303	n32	name_to_handle_at		sys_name_to_handle_at
 304	n32	open_by_handle_at		sys_open_by_handle_at
-305	n32	clock_adjtime			compat_sys_clock_adjtime
+305	n32	clock_adjtime			sys_clock_adjtime32
 306	n32	syncfs				sys_syncfs
 307	n32	sendmmsg			compat_sys_sendmmsg
 308	n32	setns				sys_setns
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index fa47ea8cc6ef..e9fec7bac5a9 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -20,7 +20,7 @@
 10	o32	unlink				sys_unlink
 11	o32	execve				sys_execve			compat_sys_execve
 12	o32	chdir				sys_chdir
-13	o32	time				sys_time			compat_sys_time
+13	o32	time				sys_time			sys_time32
 14	o32	mknod				sys_mknod
 15	o32	chmod				sys_chmod
 16	o32	lchown				sys_lchown
@@ -33,13 +33,13 @@
 22	o32	umount				sys_oldumount
 23	o32	setuid				sys_setuid
 24	o32	getuid				sys_getuid
-25	o32	stime				sys_stime			compat_sys_stime
+25	o32	stime				sys_stime			sys_stime32
 26	o32	ptrace				sys_ptrace			compat_sys_ptrace
 27	o32	alarm				sys_alarm
 # 28 was sys_fstat
 28	o32	unused28			sys_ni_syscall
 29	o32	pause				sys_pause
-30	o32	utime				sys_utime			compat_sys_utime
+30	o32	utime				sys_utime			sys_utime32
 31	o32	stty				sys_ni_syscall
 32	o32	gtty				sys_ni_syscall
 33	o32	access				sys_access
@@ -135,7 +135,7 @@
 121	o32	setdomainname			sys_setdomainname
 122	o32	uname				sys_newuname
 123	o32	modify_ldt			sys_ni_syscall
-124	o32	adjtimex			sys_adjtimex			compat_sys_adjtimex
+124	o32	adjtimex			sys_adjtimex			sys_adjtimex_time32
 125	o32	mprotect			sys_mprotect
 126	o32	sigprocmask			sys_sigprocmask			compat_sys_sigprocmask
 127	o32	create_module			sys_ni_syscall
@@ -176,8 +176,8 @@
 162	o32	sched_yield			sys_sched_yield
 163	o32	sched_get_priority_max		sys_sched_get_priority_max
 164	o32	sched_get_priority_min		sys_sched_get_priority_min
-165	o32	sched_rr_get_interval		sys_sched_rr_get_interval	compat_sys_sched_rr_get_interval
-166	o32	nanosleep			sys_nanosleep			compat_sys_nanosleep
+165	o32	sched_rr_get_interval		sys_sched_rr_get_interval	sys_sched_rr_get_interval_time32
+166	o32	nanosleep			sys_nanosleep			sys_nanosleep_time32
 167	o32	mremap				sys_mremap
 168	o32	accept				sys_accept
 169	o32	bind				sys_bind
@@ -208,7 +208,7 @@
 194	o32	rt_sigaction			sys_rt_sigaction		compat_sys_rt_sigaction
 195	o32	rt_sigprocmask			sys_rt_sigprocmask		compat_sys_rt_sigprocmask
 196	o32	rt_sigpending			sys_rt_sigpending		compat_sys_rt_sigpending
-197	o32	rt_sigtimedwait			sys_rt_sigtimedwait		compat_sys_rt_sigtimedwait
+197	o32	rt_sigtimedwait			sys_rt_sigtimedwait		compat_sys_rt_sigtimedwait_time32
 198	o32	rt_sigqueueinfo			sys_rt_sigqueueinfo		compat_sys_rt_sigqueueinfo
 199	o32	rt_sigsuspend			sys_rt_sigsuspend		compat_sys_rt_sigsuspend
 200	o32	pread64				sys_pread64			sys_32_pread
@@ -249,12 +249,12 @@
 235	o32	fremovexattr			sys_fremovexattr
 236	o32	tkill				sys_tkill
 237	o32	sendfile64			sys_sendfile64
-238	o32	futex				sys_futex			compat_sys_futex
+238	o32	futex				sys_futex			sys_futex_time32
 239	o32	sched_setaffinity		sys_sched_setaffinity		compat_sys_sched_setaffinity
 240	o32	sched_getaffinity		sys_sched_getaffinity		compat_sys_sched_getaffinity
 241	o32	io_setup			sys_io_setup			compat_sys_io_setup
 242	o32	io_destroy			sys_io_destroy
-243	o32	io_getevents			sys_io_getevents		compat_sys_io_getevents
+243	o32	io_getevents			sys_io_getevents		sys_io_getevents_time32
 244	o32	io_submit			sys_io_submit			compat_sys_io_submit
 245	o32	io_cancel			sys_io_cancel
 246	o32	exit_group			sys_exit_group
@@ -269,23 +269,23 @@
 255	o32	statfs64			sys_statfs64			compat_sys_statfs64
 256	o32	fstatfs64			sys_fstatfs64			compat_sys_fstatfs64
 257	o32	timer_create			sys_timer_create		compat_sys_timer_create
-258	o32	timer_settime			sys_timer_settime		compat_sys_timer_settime
-259	o32	timer_gettime			sys_timer_gettime		compat_sys_timer_gettime
+258	o32	timer_settime			sys_timer_settime		sys_timer_settime32
+259	o32	timer_gettime			sys_timer_gettime		sys_timer_gettime32
 260	o32	timer_getoverrun		sys_timer_getoverrun
 261	o32	timer_delete			sys_timer_delete
-262	o32	clock_settime			sys_clock_settime		compat_sys_clock_settime
-263	o32	clock_gettime			sys_clock_gettime		compat_sys_clock_gettime
-264	o32	clock_getres			sys_clock_getres		compat_sys_clock_getres
-265	o32	clock_nanosleep			sys_clock_nanosleep		compat_sys_clock_nanosleep
+262	o32	clock_settime			sys_clock_settime		sys_clock_settime32
+263	o32	clock_gettime			sys_clock_gettime		sys_clock_gettime32
+264	o32	clock_getres			sys_clock_getres		sys_clock_getres_time32
+265	o32	clock_nanosleep			sys_clock_nanosleep		sys_clock_nanosleep_time32
 266	o32	tgkill				sys_tgkill
-267	o32	utimes				sys_utimes			compat_sys_utimes
+267	o32	utimes				sys_utimes			sys_utimes_time32
 268	o32	mbind				sys_mbind			compat_sys_mbind
 269	o32	get_mempolicy			sys_get_mempolicy		compat_sys_get_mempolicy
 270	o32	set_mempolicy			sys_set_mempolicy		compat_sys_set_mempolicy
 271	o32	mq_open				sys_mq_open			compat_sys_mq_open
 272	o32	mq_unlink			sys_mq_unlink
-273	o32	mq_timedsend			sys_mq_timedsend		compat_sys_mq_timedsend
-274	o32	mq_timedreceive			sys_mq_timedreceive		compat_sys_mq_timedreceive
+273	o32	mq_timedsend			sys_mq_timedsend		sys_mq_timedsend_time32
+274	o32	mq_timedreceive			sys_mq_timedreceive		sys_mq_timedreceive_time32
 275	o32	mq_notify			sys_mq_notify			compat_sys_mq_notify
 276	o32	mq_getsetattr			sys_mq_getsetattr		compat_sys_mq_getsetattr
 277	o32	vserver				sys_ni_syscall
@@ -303,7 +303,7 @@
 289	o32	mkdirat				sys_mkdirat
 290	o32	mknodat				sys_mknodat
 291	o32	fchownat			sys_fchownat
-292	o32	futimesat			sys_futimesat			compat_sys_futimesat
+292	o32	futimesat			sys_futimesat			sys_futimesat_time32
 293	o32	fstatat64			sys_fstatat64			sys_newfstatat
 294	o32	unlinkat			sys_unlinkat
 295	o32	renameat			sys_renameat
@@ -312,8 +312,8 @@
 298	o32	readlinkat			sys_readlinkat
 299	o32	fchmodat			sys_fchmodat
 300	o32	faccessat			sys_faccessat
-301	o32	pselect6			sys_pselect6			compat_sys_pselect6
-302	o32	ppoll				sys_ppoll			compat_sys_ppoll
+301	o32	pselect6			sys_pselect6			compat_sys_pselect6_time32
+302	o32	ppoll				sys_ppoll			compat_sys_ppoll_time32
 303	o32	unshare				sys_unshare
 304	o32	splice				sys_splice
 305	o32	sync_file_range			sys_sync_file_range		sys32_sync_file_range
@@ -327,14 +327,14 @@
 313	o32	epoll_pwait			sys_epoll_pwait			compat_sys_epoll_pwait
 314	o32	ioprio_set			sys_ioprio_set
 315	o32	ioprio_get			sys_ioprio_get
-316	o32	utimensat			sys_utimensat			compat_sys_utimensat
+316	o32	utimensat			sys_utimensat			sys_utimensat_time32
 317	o32	signalfd			sys_signalfd			compat_sys_signalfd
 318	o32	timerfd				sys_ni_syscall
 319	o32	eventfd				sys_eventfd
 320	o32	fallocate			sys_fallocate			sys32_fallocate
 321	o32	timerfd_create			sys_timerfd_create
-322	o32	timerfd_gettime			sys_timerfd_gettime		compat_sys_timerfd_gettime
-323	o32	timerfd_settime			sys_timerfd_settime		compat_sys_timerfd_settime
+322	o32	timerfd_gettime			sys_timerfd_gettime		sys_timerfd_gettime32
+323	o32	timerfd_settime			sys_timerfd_settime		sys_timerfd_settime32
 324	o32	signalfd4			sys_signalfd4			compat_sys_signalfd4
 325	o32	eventfd2			sys_eventfd2
 326	o32	epoll_create1			sys_epoll_create1
@@ -346,13 +346,13 @@
 332	o32	rt_tgsigqueueinfo		sys_rt_tgsigqueueinfo		compat_sys_rt_tgsigqueueinfo
 333	o32	perf_event_open			sys_perf_event_open
 334	o32	accept4				sys_accept4
-335	o32	recvmmsg			sys_recvmmsg			compat_sys_recvmmsg
+335	o32	recvmmsg			sys_recvmmsg			compat_sys_recvmmsg_time32
 336	o32	fanotify_init			sys_fanotify_init
 337	o32	fanotify_mark			sys_fanotify_mark		compat_sys_fanotify_mark
 338	o32	prlimit64			sys_prlimit64
 339	o32	name_to_handle_at		sys_name_to_handle_at
 340	o32	open_by_handle_at		sys_open_by_handle_at		compat_sys_open_by_handle_at
-341	o32	clock_adjtime			sys_clock_adjtime		compat_sys_clock_adjtime
+341	o32	clock_adjtime			sys_clock_adjtime		sys_clock_adjtime32
 342	o32	syncfs				sys_syncfs
 343	o32	sendmmsg			sys_sendmmsg			compat_sys_sendmmsg
 344	o32	setns				sys_setns
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 71873bb72782..f7440427d459 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -20,7 +20,7 @@
 10	common	unlink			sys_unlink
 11	common	execve			sys_execve			compat_sys_execve
 12	common	chdir			sys_chdir
-13	common	time			sys_time			compat_sys_time
+13	common	time			sys_time			sys_time32
 14	common	mknod			sys_mknod
 15	common	chmod			sys_chmod
 16	common	lchown			sys_lchown
@@ -32,12 +32,12 @@
 22	common	bind			sys_bind
 23	common	setuid			sys_setuid
 24	common	getuid			sys_getuid
-25	common	stime			sys_stime			compat_sys_stime
+25	common	stime			sys_stime			sys_stime32
 26	common	ptrace			sys_ptrace			compat_sys_ptrace
 27	common	alarm			sys_alarm
 28	common	fstat			sys_newfstat			compat_sys_newfstat
 29	common	pause			sys_pause
-30	common	utime			sys_utime			compat_sys_utime
+30	common	utime			sys_utime			sys_utime32
 31	common	connect			sys_connect
 32	common	listen			sys_listen
 33	common	access			sys_access
@@ -133,7 +133,7 @@
 121	common	setdomainname		sys_setdomainname
 122	common	sendfile		sys_sendfile			compat_sys_sendfile
 123	common	recvfrom		sys_recvfrom
-124	common	adjtimex		sys_adjtimex			compat_sys_adjtimex
+124	common	adjtimex		sys_adjtimex			sys_adjtimex_time32
 125	common	mprotect		sys_mprotect
 126	common	sigprocmask		sys_sigprocmask			compat_sys_sigprocmask
 # 127 was create_module
@@ -171,8 +171,8 @@
 158	common	sched_yield		sys_sched_yield
 159	common	sched_get_priority_max	sys_sched_get_priority_max
 160	common	sched_get_priority_min	sys_sched_get_priority_min
-161	common	sched_rr_get_interval	sys_sched_rr_get_interval	compat_sys_sched_rr_get_interval
-162	common	nanosleep		sys_nanosleep			compat_sys_nanosleep
+161	common	sched_rr_get_interval	sys_sched_rr_get_interval	sys_sched_rr_get_interval_time32
+162	common	nanosleep		sys_nanosleep			sys_nanosleep_time32
 163	common	mremap			sys_mremap
 164	common	setresuid		sys_setresuid
 165	common	getresuid		sys_getresuid
@@ -187,7 +187,7 @@
 174	common	rt_sigaction		sys_rt_sigaction		compat_sys_rt_sigaction
 175	common	rt_sigprocmask		sys_rt_sigprocmask		compat_sys_rt_sigprocmask
 176	common	rt_sigpending		sys_rt_sigpending		compat_sys_rt_sigpending
-177	common	rt_sigtimedwait		sys_rt_sigtimedwait		compat_sys_rt_sigtimedwait
+177	common	rt_sigtimedwait		sys_rt_sigtimedwait		compat_sys_rt_sigtimedwait_time32
 178	common	rt_sigqueueinfo		sys_rt_sigqueueinfo		compat_sys_rt_sigqueueinfo
 179	common	rt_sigsuspend		sys_rt_sigsuspend		compat_sys_rt_sigsuspend
 180	common	chown			sys_chown
@@ -223,14 +223,14 @@
 207	64	readahead		sys_readahead
 208	common	tkill			sys_tkill
 209	common	sendfile64		sys_sendfile64			compat_sys_sendfile64
-210	common	futex			sys_futex			compat_sys_futex
+210	common	futex			sys_futex			sys_futex_time32
 211	common	sched_setaffinity	sys_sched_setaffinity		compat_sys_sched_setaffinity
 212	common	sched_getaffinity	sys_sched_getaffinity		compat_sys_sched_getaffinity
 # 213 was set_thread_area
 # 214 was get_thread_area
 215	common	io_setup		sys_io_setup			compat_sys_io_setup
 216	common	io_destroy		sys_io_destroy
-217	common	io_getevents		sys_io_getevents		compat_sys_io_getevents
+217	common	io_getevents		sys_io_getevents		sys_io_getevents_time32
 218	common	io_submit		sys_io_submit			compat_sys_io_submit
 219	common	io_cancel		sys_io_cancel
 # 220 was alloc_hugepages
@@ -241,11 +241,11 @@
 225	common	epoll_ctl		sys_epoll_ctl
 226	common	epoll_wait		sys_epoll_wait
 227	common	remap_file_pages	sys_remap_file_pages
-228	common	semtimedop		sys_semtimedop			compat_sys_semtimedop
+228	common	semtimedop		sys_semtimedop			sys_semtimedop_time32
 229	common	mq_open			sys_mq_open			compat_sys_mq_open
 230	common	mq_unlink		sys_mq_unlink
-231	common	mq_timedsend		sys_mq_timedsend		compat_sys_mq_timedsend
-232	common	mq_timedreceive		sys_mq_timedreceive		compat_sys_mq_timedreceive
+231	common	mq_timedsend		sys_mq_timedsend		sys_mq_timedsend_time32
+232	common	mq_timedreceive		sys_mq_timedreceive		sys_mq_timedreceive_time32
 233	common	mq_notify		sys_mq_notify			compat_sys_mq_notify
 234	common	mq_getsetattr		sys_mq_getsetattr		compat_sys_mq_getsetattr
 235	common	waitid			sys_waitid			compat_sys_waitid
@@ -265,14 +265,14 @@
 248	common	lremovexattr		sys_lremovexattr
 249	common	fremovexattr		sys_fremovexattr
 250	common	timer_create		sys_timer_create		compat_sys_timer_create
-251	common	timer_settime		sys_timer_settime		compat_sys_timer_settime
-252	common	timer_gettime		sys_timer_gettime		compat_sys_timer_gettime
+251	common	timer_settime		sys_timer_settime		sys_timer_settime32
+252	common	timer_gettime		sys_timer_gettime		sys_timer_gettime32
 253	common	timer_getoverrun	sys_timer_getoverrun
 254	common	timer_delete		sys_timer_delete
-255	common	clock_settime		sys_clock_settime		compat_sys_clock_settime
-256	common	clock_gettime		sys_clock_gettime		compat_sys_clock_gettime
-257	common	clock_getres		sys_clock_getres		compat_sys_clock_getres
-258	common	clock_nanosleep		sys_clock_nanosleep		compat_sys_clock_nanosleep
+255	common	clock_settime		sys_clock_settime		sys_clock_settime32
+256	common	clock_gettime		sys_clock_gettime		sys_clock_gettime32
+257	common	clock_getres		sys_clock_getres		sys_clock_getres_time32
+258	common	clock_nanosleep		sys_clock_nanosleep		sys_clock_nanosleep_time32
 259	common	tgkill			sys_tgkill
 260	common	mbind			sys_mbind			compat_sys_mbind
 261	common	get_mempolicy		sys_get_mempolicy		compat_sys_get_mempolicy
@@ -287,13 +287,13 @@
 270	common	inotify_add_watch	sys_inotify_add_watch
 271	common	inotify_rm_watch	sys_inotify_rm_watch
 272	common	migrate_pages		sys_migrate_pages
-273	common	pselect6		sys_pselect6			compat_sys_pselect6
-274	common	ppoll			sys_ppoll			compat_sys_ppoll
+273	common	pselect6		sys_pselect6			compat_sys_pselect6_time32
+274	common	ppoll			sys_ppoll			compat_sys_ppoll_time32
 275	common	openat			sys_openat			compat_sys_openat
 276	common	mkdirat			sys_mkdirat
 277	common	mknodat			sys_mknodat
 278	common	fchownat		sys_fchownat
-279	common	futimesat		sys_futimesat			compat_sys_futimesat
+279	common	futimesat		sys_futimesat			sys_futimesat_time32
 280	common	fstatat64		sys_fstatat64
 281	common	unlinkat		sys_unlinkat
 282	common	renameat		sys_renameat
@@ -316,15 +316,15 @@
 298	common	statfs64		sys_statfs64			compat_sys_statfs64
 299	common	fstatfs64		sys_fstatfs64			compat_sys_fstatfs64
 300	common	kexec_load		sys_kexec_load			compat_sys_kexec_load
-301	common	utimensat		sys_utimensat			compat_sys_utimensat
+301	common	utimensat		sys_utimensat			sys_utimensat_time32
 302	common	signalfd		sys_signalfd			compat_sys_signalfd
 # 303 was timerfd
 304	common	eventfd			sys_eventfd
 305	32	fallocate		parisc_fallocate
 305	64	fallocate		sys_fallocate
 306	common	timerfd_create		sys_timerfd_create
-307	common	timerfd_settime		sys_timerfd_settime		compat_sys_timerfd_settime
-308	common	timerfd_gettime		sys_timerfd_gettime		compat_sys_timerfd_gettime
+307	common	timerfd_settime		sys_timerfd_settime		sys_timerfd_settime32
+308	common	timerfd_gettime		sys_timerfd_gettime		sys_timerfd_gettime32
 309	common	signalfd4		sys_signalfd4			compat_sys_signalfd4
 310	common	eventfd2		sys_eventfd2
 311	common	epoll_create1		sys_epoll_create1
@@ -335,12 +335,12 @@
 316	common	pwritev	sys_pwritev	compat_sys_pwritev
 317	common	rt_tgsigqueueinfo	sys_rt_tgsigqueueinfo		compat_sys_rt_tgsigqueueinfo
 318	common	perf_event_open		sys_perf_event_open
-319	common	recvmmsg		sys_recvmmsg			compat_sys_recvmmsg
+319	common	recvmmsg		sys_recvmmsg			compat_sys_recvmmsg_time32
 320	common	accept4			sys_accept4
 321	common	prlimit64		sys_prlimit64
 322	common	fanotify_init		sys_fanotify_init
 323	common	fanotify_mark		sys_fanotify_mark		sys32_fanotify_mark
-324	common	clock_adjtime		sys_clock_adjtime		compat_sys_clock_adjtime
+324	common	clock_adjtime		sys_clock_adjtime		sys_clock_adjtime32
 325	common	name_to_handle_at	sys_name_to_handle_at
 326	common	open_by_handle_at	sys_open_by_handle_at		compat_sys_open_by_handle_at
 327	common	syncfs			sys_syncfs
@@ -352,7 +352,7 @@
 333	common	finit_module		sys_finit_module
 334	common	sched_setattr		sys_sched_setattr
 335	common	sched_getattr		sys_sched_getattr
-336	common	utimes			sys_utimes			compat_sys_utimes
+336	common	utimes			sys_utimes			sys_utimes_time32
 337	common	renameat2		sys_renameat2
 338	common	seccomp			sys_seccomp
 339	common	getrandom		sys_getrandom
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index 7555874ce39c..86650dcd2185 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -20,7 +20,7 @@
 10	common	unlink				sys_unlink
 11	nospu	execve				sys_execve			compat_sys_execve
 12	common	chdir				sys_chdir
-13	common	time				sys_time			compat_sys_time
+13	common	time				sys_time			sys_time32
 14	common	mknod				sys_mknod
 15	common	chmod				sys_chmod
 16	common	lchown				sys_lchown
@@ -36,14 +36,14 @@
 22	spu	umount				sys_ni_syscall
 23	common	setuid				sys_setuid
 24	common	getuid				sys_getuid
-25	common	stime				sys_stime			compat_sys_stime
+25	common	stime				sys_stime			sys_stime32
 26	nospu	ptrace				sys_ptrace			compat_sys_ptrace
 27	common	alarm				sys_alarm
 28	32	oldfstat			sys_fstat			sys_ni_syscall
 28	64	oldfstat			sys_ni_syscall
 28	spu	oldfstat			sys_ni_syscall
 29	nospu	pause				sys_pause
-30	nospu	utime				sys_utime			compat_sys_utime
+30	nospu	utime				sys_utime			sys_utime32
 31	common	stty				sys_ni_syscall
 32	common	gtty				sys_ni_syscall
 33	common	access				sys_access
@@ -157,7 +157,7 @@
 121	common	setdomainname			sys_setdomainname
 122	common	uname				sys_newuname
 123	common	modify_ldt			sys_ni_syscall
-124	common	adjtimex			sys_adjtimex			compat_sys_adjtimex
+124	common	adjtimex			sys_adjtimex			sys_adjtimex_time32
 125	common	mprotect			sys_mprotect
 126	32	sigprocmask			sys_sigprocmask			compat_sys_sigprocmask
 126	64	sigprocmask			sys_ni_syscall
@@ -198,8 +198,8 @@
 158	common	sched_yield			sys_sched_yield
 159	common	sched_get_priority_max		sys_sched_get_priority_max
 160	common	sched_get_priority_min		sys_sched_get_priority_min
-161	common	sched_rr_get_interval		sys_sched_rr_get_interval	compat_sys_sched_rr_get_interval
-162	common	nanosleep			sys_nanosleep			compat_sys_nanosleep
+161	common	sched_rr_get_interval		sys_sched_rr_get_interval	sys_sched_rr_get_interval_time32
+162	common	nanosleep			sys_nanosleep			sys_nanosleep_time32
 163	common	mremap				sys_mremap
 164	common	setresuid			sys_setresuid
 165	common	getresuid			sys_getresuid
@@ -213,7 +213,7 @@
 173	nospu	rt_sigaction			sys_rt_sigaction		compat_sys_rt_sigaction
 174	nospu	rt_sigprocmask			sys_rt_sigprocmask		compat_sys_rt_sigprocmask
 175	nospu	rt_sigpending			sys_rt_sigpending		compat_sys_rt_sigpending
-176	nospu	rt_sigtimedwait			sys_rt_sigtimedwait		compat_sys_rt_sigtimedwait
+176	nospu	rt_sigtimedwait			sys_rt_sigtimedwait		compat_sys_rt_sigtimedwait_time32
 177	nospu 	rt_sigqueueinfo			sys_rt_sigqueueinfo		compat_sys_rt_sigqueueinfo
 178	nospu 	rt_sigsuspend			sys_rt_sigsuspend		compat_sys_rt_sigsuspend
 179	common	pread64				sys_pread64			compat_sys_pread64
@@ -260,7 +260,7 @@
 218	common	removexattr			sys_removexattr
 219	common	lremovexattr			sys_lremovexattr
 220	common	fremovexattr			sys_fremovexattr
-221	common	futex				sys_futex			compat_sys_futex
+221	common	futex				sys_futex			sys_futex_time32
 222	common	sched_setaffinity		sys_sched_setaffinity		compat_sys_sched_setaffinity
 223	common	sched_getaffinity		sys_sched_getaffinity		compat_sys_sched_getaffinity
 # 224 unused
@@ -268,7 +268,7 @@
 226	32	sendfile64			sys_sendfile64			compat_sys_sendfile64
 227	common	io_setup			sys_io_setup			compat_sys_io_setup
 228	common	io_destroy			sys_io_destroy
-229	common	io_getevents			sys_io_getevents		compat_sys_io_getevents
+229	common	io_getevents			sys_io_getevents		sys_io_getevents_time32
 230	common	io_submit			sys_io_submit			compat_sys_io_submit
 231	common	io_cancel			sys_io_cancel
 232	nospu	set_tid_address			sys_set_tid_address
@@ -280,19 +280,19 @@
 238	common	epoll_wait			sys_epoll_wait
 239	common	remap_file_pages		sys_remap_file_pages
 240	common	timer_create			sys_timer_create		compat_sys_timer_create
-241	common	timer_settime			sys_timer_settime		compat_sys_timer_settime
-242	common	timer_gettime			sys_timer_gettime		compat_sys_timer_gettime
+241	common	timer_settime			sys_timer_settime		sys_timer_settime32
+242	common	timer_gettime			sys_timer_gettime		sys_timer_gettime32
 243	common	timer_getoverrun		sys_timer_getoverrun
 244	common	timer_delete			sys_timer_delete
-245	common	clock_settime			sys_clock_settime		compat_sys_clock_settime
-246	common	clock_gettime			sys_clock_gettime		compat_sys_clock_gettime
-247	common	clock_getres			sys_clock_getres		compat_sys_clock_getres
-248	common	clock_nanosleep			sys_clock_nanosleep		compat_sys_clock_nanosleep
+245	common	clock_settime			sys_clock_settime		sys_clock_settime32
+246	common	clock_gettime			sys_clock_gettime		sys_clock_gettime32
+247	common	clock_getres			sys_clock_getres		sys_clock_getres_time32
+248	common	clock_nanosleep			sys_clock_nanosleep		sys_clock_nanosleep_time32
 249	32	swapcontext			ppc_swapcontext			ppc32_swapcontext
 249	64	swapcontext			ppc64_swapcontext
 249	spu	swapcontext			sys_ni_syscall
 250	common	tgkill				sys_tgkill
-251	common	utimes				sys_utimes			compat_sys_utimes
+251	common	utimes				sys_utimes			sys_utimes_time32
 252	common	statfs64			sys_statfs64			compat_sys_statfs64
 253	common	fstatfs64			sys_fstatfs64			compat_sys_fstatfs64
 254	32	fadvise64_64			ppc_fadvise64_64
@@ -308,8 +308,8 @@
 261	nospu	set_mempolicy			sys_set_mempolicy		compat_sys_set_mempolicy
 262	nospu	mq_open				sys_mq_open			compat_sys_mq_open
 263	nospu	mq_unlink			sys_mq_unlink
-264	nospu	mq_timedsend			sys_mq_timedsend		compat_sys_mq_timedsend
-265	nospu	mq_timedreceive			sys_mq_timedreceive		compat_sys_mq_timedreceive
+264	nospu	mq_timedsend			sys_mq_timedsend		sys_mq_timedsend_time32
+265	nospu	mq_timedreceive			sys_mq_timedreceive		sys_mq_timedreceive_time32
 266	nospu	mq_notify			sys_mq_notify			compat_sys_mq_notify
 267	nospu	mq_getsetattr			sys_mq_getsetattr		compat_sys_mq_getsetattr
 268	nospu	kexec_load			sys_kexec_load			compat_sys_kexec_load
@@ -324,8 +324,8 @@
 277	nospu	inotify_rm_watch		sys_inotify_rm_watch
 278	nospu	spu_run				sys_spu_run
 279	nospu	spu_create			sys_spu_create
-280	nospu	pselect6			sys_pselect6			compat_sys_pselect6
-281	nospu	ppoll				sys_ppoll			compat_sys_ppoll
+280	nospu	pselect6			sys_pselect6			compat_sys_pselect6_time32
+281	nospu	ppoll				sys_ppoll			compat_sys_ppoll_time32
 282	common	unshare				sys_unshare
 283	common	splice				sys_splice
 284	common	tee				sys_tee
@@ -334,7 +334,7 @@
 287	common	mkdirat				sys_mkdirat
 288	common	mknodat				sys_mknodat
 289	common	fchownat			sys_fchownat
-290	common	futimesat			sys_futimesat			compat_sys_futimesat
+290	common	futimesat			sys_futimesat			sys_futimesat_time32
 291	32	fstatat64			sys_fstatat64
 291	64	newfstatat			sys_newfstatat
 291	spu	newfstatat			sys_newfstatat
@@ -350,15 +350,15 @@
 301	common	move_pages			sys_move_pages			compat_sys_move_pages
 302	common	getcpu				sys_getcpu
 303	nospu	epoll_pwait			sys_epoll_pwait			compat_sys_epoll_pwait
-304	common	utimensat			sys_utimensat			compat_sys_utimensat
+304	common	utimensat			sys_utimensat			sys_utimensat_time32
 305	common	signalfd			sys_signalfd			compat_sys_signalfd
 306	common	timerfd_create			sys_timerfd_create
 307	common	eventfd				sys_eventfd
 308	common	sync_file_range2		sys_sync_file_range2		compat_sys_sync_file_range2
 309	nospu	fallocate			sys_fallocate			compat_sys_fallocate
 310	nospu	subpage_prot			sys_subpage_prot
-311	common	timerfd_settime			sys_timerfd_settime		compat_sys_timerfd_settime
-312	common	timerfd_gettime			sys_timerfd_gettime		compat_sys_timerfd_gettime
+311	common	timerfd_settime			sys_timerfd_settime		sys_timerfd_settime32
+312	common	timerfd_gettime			sys_timerfd_gettime		sys_timerfd_gettime32
 313	common	signalfd4			sys_signalfd4			compat_sys_signalfd4
 314	common	eventfd2			sys_eventfd2
 315	common	epoll_create1			sys_epoll_create1
@@ -389,11 +389,11 @@
 340	common	getsockopt			sys_getsockopt			compat_sys_getsockopt
 341	common	sendmsg				sys_sendmsg			compat_sys_sendmsg
 342	common	recvmsg				sys_recvmsg			compat_sys_recvmsg
-343	common	recvmmsg			sys_recvmmsg			compat_sys_recvmmsg
+343	common	recvmmsg			sys_recvmmsg			compat_sys_recvmmsg_time32
 344	common	accept4				sys_accept4
 345	common	name_to_handle_at		sys_name_to_handle_at
 346	common	open_by_handle_at		sys_open_by_handle_at		compat_sys_open_by_handle_at
-347	common	clock_adjtime			sys_clock_adjtime		compat_sys_clock_adjtime
+347	common	clock_adjtime			sys_clock_adjtime		sys_clock_adjtime32
 348	common	syncfs				sys_syncfs
 349	common	sendmmsg			sys_sendmmsg			compat_sys_sendmmsg
 350	common	setns				sys_setns
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 620e222003ca..285201cf1f83 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -20,7 +20,7 @@
 10   common	unlink			sys_unlink			sys_unlink
 11   common	execve			sys_execve			compat_sys_execve
 12   common	chdir			sys_chdir			sys_chdir
-13   32		time			-				compat_sys_time
+13   32		time			-				sys_time32
 14   common	mknod			sys_mknod			sys_mknod
 15   common	chmod			sys_chmod			sys_chmod
 16   32		lchown			-				sys_lchown16
@@ -30,11 +30,11 @@
 22   common	umount			sys_oldumount			sys_oldumount
 23   32		setuid			-				sys_setuid16
 24   32		getuid			-				sys_getuid16
-25   32		stime			-				compat_sys_stime
+25   32		stime			-				sys_stime32
 26   common	ptrace			sys_ptrace			compat_sys_ptrace
 27   common	alarm			sys_alarm			sys_alarm
 29   common	pause			sys_pause			sys_pause
-30   common	utime			sys_utime			compat_sys_utime
+30   common	utime			sys_utime			sys_utime32
 33   common	access			sys_access			sys_access
 34   common	nice			sys_nice			sys_nice
 36   common	sync			sys_sync			sys_sync
@@ -112,7 +112,7 @@
 120  common	clone			sys_clone			sys_clone
 121  common	setdomainname		sys_setdomainname		sys_setdomainname
 122  common	uname			sys_newuname			sys_newuname
-124  common	adjtimex		sys_adjtimex			compat_sys_adjtimex
+124  common	adjtimex		sys_adjtimex			sys_adjtimex_time32
 125  common	mprotect		sys_mprotect			sys_mprotect
 126  common	sigprocmask		sys_sigprocmask			compat_sys_sigprocmask
 127  common	create_module		-				-
@@ -150,8 +150,8 @@
 158  common	sched_yield		sys_sched_yield			sys_sched_yield
 159  common	sched_get_priority_max	sys_sched_get_priority_max	sys_sched_get_priority_max
 160  common	sched_get_priority_min	sys_sched_get_priority_min	sys_sched_get_priority_min
-161  common	sched_rr_get_interval	sys_sched_rr_get_interval	compat_sys_sched_rr_get_interval
-162  common	nanosleep		sys_nanosleep			compat_sys_nanosleep
+161  common	sched_rr_get_interval	sys_sched_rr_get_interval	sys_sched_rr_get_interval_time32
+162  common	nanosleep		sys_nanosleep			sys_nanosleep_time32
 163  common	mremap			sys_mremap			sys_mremap
 164  32		setresuid		-				sys_setresuid16
 165  32		getresuid		-				sys_getresuid16
@@ -165,7 +165,7 @@
 174  common	rt_sigaction		sys_rt_sigaction		compat_sys_rt_sigaction
 175  common	rt_sigprocmask		sys_rt_sigprocmask		compat_sys_rt_sigprocmask
 176  common	rt_sigpending		sys_rt_sigpending		compat_sys_rt_sigpending
-177  common	rt_sigtimedwait		sys_rt_sigtimedwait		compat_sys_rt_sigtimedwait
+177  common	rt_sigtimedwait		sys_rt_sigtimedwait		compat_sys_rt_sigtimedwait_time32
 178  common	rt_sigqueueinfo		sys_rt_sigqueueinfo		compat_sys_rt_sigqueueinfo
 179  common	rt_sigsuspend		sys_rt_sigsuspend		compat_sys_rt_sigsuspend
 180  common	pread64			sys_pread64			compat_sys_s390_pread64
@@ -246,13 +246,13 @@
 235  common	fremovexattr		sys_fremovexattr		sys_fremovexattr
 236  common	gettid			sys_gettid			sys_gettid
 237  common	tkill			sys_tkill			sys_tkill
-238  common	futex			sys_futex			compat_sys_futex
+238  common	futex			sys_futex			sys_futex_time32
 239  common	sched_setaffinity	sys_sched_setaffinity		compat_sys_sched_setaffinity
 240  common	sched_getaffinity	sys_sched_getaffinity		compat_sys_sched_getaffinity
 241  common	tgkill			sys_tgkill			sys_tgkill
 243  common	io_setup		sys_io_setup			compat_sys_io_setup
 244  common	io_destroy		sys_io_destroy			sys_io_destroy
-245  common	io_getevents		sys_io_getevents		compat_sys_io_getevents
+245  common	io_getevents		sys_io_getevents		sys_io_getevents_time32
 246  common	io_submit		sys_io_submit			compat_sys_io_submit
 247  common	io_cancel		sys_io_cancel			sys_io_cancel
 248  common	exit_group		sys_exit_group			sys_exit_group
@@ -262,14 +262,14 @@
 252  common	set_tid_address		sys_set_tid_address		sys_set_tid_address
 253  common	fadvise64		sys_fadvise64_64		compat_sys_s390_fadvise64
 254  common	timer_create		sys_timer_create		compat_sys_timer_create
-255  common	timer_settime		sys_timer_settime		compat_sys_timer_settime
-256  common	timer_gettime		sys_timer_gettime		compat_sys_timer_gettime
+255  common	timer_settime		sys_timer_settime		sys_timer_settime32
+256  common	timer_gettime		sys_timer_gettime		sys_timer_gettime32
 257  common	timer_getoverrun	sys_timer_getoverrun		sys_timer_getoverrun
 258  common	timer_delete		sys_timer_delete		sys_timer_delete
-259  common	clock_settime		sys_clock_settime		compat_sys_clock_settime
-260  common	clock_gettime		sys_clock_gettime		compat_sys_clock_gettime
-261  common	clock_getres		sys_clock_getres		compat_sys_clock_getres
-262  common	clock_nanosleep		sys_clock_nanosleep		compat_sys_clock_nanosleep
+259  common	clock_settime		sys_clock_settime		sys_clock_settime32
+260  common	clock_gettime		sys_clock_gettime		sys_clock_gettime32
+261  common	clock_getres		sys_clock_getres		sys_clock_getres_time32
+262  common	clock_nanosleep		sys_clock_nanosleep		sys_clock_nanosleep_time32
 264  32		fadvise64_64		-				compat_sys_s390_fadvise64_64
 265  common	statfs64		sys_statfs64			compat_sys_statfs64
 266  common	fstatfs64		sys_fstatfs64			compat_sys_fstatfs64
@@ -279,8 +279,8 @@
 270  common	set_mempolicy		sys_set_mempolicy		compat_sys_set_mempolicy
 271  common	mq_open			sys_mq_open			compat_sys_mq_open
 272  common	mq_unlink		sys_mq_unlink			sys_mq_unlink
-273  common	mq_timedsend		sys_mq_timedsend		compat_sys_mq_timedsend
-274  common	mq_timedreceive		sys_mq_timedreceive		compat_sys_mq_timedreceive
+273  common	mq_timedsend		sys_mq_timedsend		sys_mq_timedsend_time32
+274  common	mq_timedreceive		sys_mq_timedreceive		sys_mq_timedreceive_time32
 275  common	mq_notify		sys_mq_notify			compat_sys_mq_notify
 276  common	mq_getsetattr		sys_mq_getsetattr		compat_sys_mq_getsetattr
 277  common	kexec_load		sys_kexec_load			compat_sys_kexec_load
@@ -298,7 +298,7 @@
 289  common	mkdirat			sys_mkdirat			sys_mkdirat
 290  common	mknodat			sys_mknodat			sys_mknodat
 291  common	fchownat		sys_fchownat			sys_fchownat
-292  common	futimesat		sys_futimesat			compat_sys_futimesat
+292  common	futimesat		sys_futimesat			sys_futimesat_time32
 293  32		fstatat64		-				compat_sys_s390_fstatat64
 293  64		newfstatat		sys_newfstatat			-
 294  common	unlinkat		sys_unlinkat			sys_unlinkat
@@ -308,8 +308,8 @@
 298  common	readlinkat		sys_readlinkat			sys_readlinkat
 299  common	fchmodat		sys_fchmodat			sys_fchmodat
 300  common	faccessat		sys_faccessat			sys_faccessat
-301  common	pselect6		sys_pselect6			compat_sys_pselect6
-302  common	ppoll			sys_ppoll			compat_sys_ppoll
+301  common	pselect6		sys_pselect6			compat_sys_pselect6_time32
+302  common	ppoll			sys_ppoll			compat_sys_ppoll_time32
 303  common	unshare			sys_unshare			sys_unshare
 304  common	set_robust_list		sys_set_robust_list		compat_sys_set_robust_list
 305  common	get_robust_list		sys_get_robust_list		compat_sys_get_robust_list
@@ -320,15 +320,15 @@
 310  common	move_pages		sys_move_pages			compat_sys_move_pages
 311  common	getcpu			sys_getcpu			sys_getcpu
 312  common	epoll_pwait		sys_epoll_pwait			compat_sys_epoll_pwait
-313  common	utimes			sys_utimes			compat_sys_utimes
+313  common	utimes			sys_utimes			sys_utimes_time32
 314  common	fallocate		sys_fallocate			compat_sys_s390_fallocate
-315  common	utimensat		sys_utimensat			compat_sys_utimensat
+315  common	utimensat		sys_utimensat			sys_utimensat_time32
 316  common	signalfd		sys_signalfd			compat_sys_signalfd
 317  common	timerfd			-				-
 318  common	eventfd			sys_eventfd			sys_eventfd
 319  common	timerfd_create		sys_timerfd_create		sys_timerfd_create
-320  common	timerfd_settime		sys_timerfd_settime		compat_sys_timerfd_settime
-321  common	timerfd_gettime		sys_timerfd_gettime		compat_sys_timerfd_gettime
+320  common	timerfd_settime		sys_timerfd_settime		sys_timerfd_settime32
+321  common	timerfd_gettime		sys_timerfd_gettime		sys_timerfd_gettime32
 322  common	signalfd4		sys_signalfd4			compat_sys_signalfd4
 323  common	eventfd2		sys_eventfd2			sys_eventfd2
 324  common	inotify_init1		sys_inotify_init1		sys_inotify_init1
@@ -344,7 +344,7 @@
 334  common	prlimit64		sys_prlimit64			sys_prlimit64
 335  common	name_to_handle_at	sys_name_to_handle_at		sys_name_to_handle_at
 336  common	open_by_handle_at	sys_open_by_handle_at		compat_sys_open_by_handle_at
-337  common	clock_adjtime		sys_clock_adjtime		compat_sys_clock_adjtime
+337  common	clock_adjtime		sys_clock_adjtime		sys_clock_adjtime32
 338  common	syncfs			sys_syncfs			sys_syncfs
 339  common	setns			sys_setns			sys_setns
 340  common	process_vm_readv	sys_process_vm_readv		compat_sys_process_vm_readv
@@ -364,7 +364,7 @@
 354  common	execveat		sys_execveat			compat_sys_execveat
 355  common	userfaultfd		sys_userfaultfd			sys_userfaultfd
 356  common	membarrier		sys_membarrier			sys_membarrier
-357  common	recvmmsg		sys_recvmmsg			compat_sys_recvmmsg
+357  common	recvmmsg		sys_recvmmsg			compat_sys_recvmmsg_time32
 358  common	sendmmsg		sys_sendmmsg			compat_sys_sendmmsg
 359  common	socket			sys_socket			sys_socket
 360  common	socketpair		sys_socketpair			sys_socketpair
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index e63cd013cc77..7cb05b50aeaa 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -44,7 +44,7 @@
 28	common	sigaltstack		sys_sigaltstack			compat_sys_sigaltstack
 29	32    	pause			sys_pause
 29	64    	pause			sys_nis_syscall
-30	common	utime			sys_utime			compat_sys_utime
+30	common	utime			sys_utime			sys_utime32
 31	32    	lchown32		sys_lchown
 32	32    	fchown32		sys_fchown
 33	common	access			sys_access
@@ -128,7 +128,7 @@
 102	common	rt_sigaction		sys_rt_sigaction		compat_sys_rt_sigaction
 103	common	rt_sigprocmask		sys_rt_sigprocmask		compat_sys_rt_sigprocmask
 104	common	rt_sigpending		sys_rt_sigpending		compat_sys_rt_sigpending
-105	common	rt_sigtimedwait		sys_rt_sigtimedwait		compat_sys_rt_sigtimedwait
+105	common	rt_sigtimedwait		sys_rt_sigtimedwait		compat_sys_rt_sigtimedwait_time32
 106	common	rt_sigqueueinfo		sys_rt_sigqueueinfo		compat_sys_rt_sigqueueinfo
 107	common	rt_sigsuspend		sys_rt_sigsuspend		compat_sys_rt_sigsuspend
 108	32	setresuid32		sys_setresuid
@@ -168,11 +168,11 @@
 135	common	socketpair		sys_socketpair
 136	common	mkdir			sys_mkdir
 137	common	rmdir			sys_rmdir
-138	common	utimes			sys_utimes			compat_sys_utimes
+138	common	utimes			sys_utimes			sys_utimes_time32
 139	common	stat64			sys_stat64			compat_sys_stat64
 140	common	sendfile64		sys_sendfile64
 141	common	getpeername		sys_getpeername
-142	common	futex			sys_futex			compat_sys_futex
+142	common	futex			sys_futex			sys_futex_time32
 143	common	gettid			sys_gettid
 144	common	getrlimit		sys_getrlimit			compat_sys_getrlimit
 145	common	setrlimit		sys_setrlimit			compat_sys_setrlimit
@@ -258,7 +258,7 @@
 216	64	sigreturn		sys_nis_syscall
 217	common	clone			sys_clone
 218	common	ioprio_get		sys_ioprio_get
-219	32	adjtimex		sys_adjtimex			compat_sys_adjtimex
+219	32	adjtimex		sys_adjtimex			sys_adjtimex_time32
 219	64	adjtimex		sys_sparc_adjtimex
 220	32	sigprocmask		sys_sigprocmask			compat_sys_sigprocmask
 220	64	sigprocmask		sys_nis_syscall
@@ -272,9 +272,9 @@
 228	common	setfsuid		sys_setfsuid16
 229	common	setfsgid		sys_setfsgid16
 230	common	_newselect		sys_select			compat_sys_select
-231	32	time			sys_time			compat_sys_time
+231	32	time			sys_time			sys_time32
 232	common	splice			sys_splice
-233	common	stime			sys_stime			compat_sys_stime
+233	common	stime			sys_stime			sys_stime32
 234	common	statfs64		sys_statfs64			compat_sys_statfs64
 235	common	fstatfs64		sys_fstatfs64			compat_sys_fstatfs64
 236	common	_llseek			sys_llseek
@@ -289,8 +289,8 @@
 245	common	sched_yield		sys_sched_yield
 246	common	sched_get_priority_max	sys_sched_get_priority_max
 247	common	sched_get_priority_min	sys_sched_get_priority_min
-248	common	sched_rr_get_interval	sys_sched_rr_get_interval	compat_sys_sched_rr_get_interval
-249	common	nanosleep		sys_nanosleep			compat_sys_nanosleep
+248	common	sched_rr_get_interval	sys_sched_rr_get_interval	sys_sched_rr_get_interval_time32
+249	common	nanosleep		sys_nanosleep			sys_nanosleep_time32
 250	32	mremap			sys_mremap
 250	64	mremap			sys_64_mremap
 251	common	_sysctl			sys_sysctl			compat_sys_sysctl
@@ -299,14 +299,14 @@
 254	32	nfsservctl		sys_ni_syscall			sys_nis_syscall
 254	64	nfsservctl		sys_nis_syscall
 255	common	sync_file_range		sys_sync_file_range		compat_sys_sync_file_range
-256	common	clock_settime		sys_clock_settime		compat_sys_clock_settime
-257	common	clock_gettime		sys_clock_gettime		compat_sys_clock_gettime
-258	common	clock_getres		sys_clock_getres		compat_sys_clock_getres
-259	common	clock_nanosleep		sys_clock_nanosleep		compat_sys_clock_nanosleep
+256	common	clock_settime		sys_clock_settime		sys_clock_settime32
+257	common	clock_gettime		sys_clock_gettime		sys_clock_gettime32
+258	common	clock_getres		sys_clock_getres		sys_clock_getres_time32
+259	common	clock_nanosleep		sys_clock_nanosleep		sys_clock_nanosleep_time32
 260	common	sched_getaffinity	sys_sched_getaffinity		compat_sys_sched_getaffinity
 261	common	sched_setaffinity	sys_sched_setaffinity		compat_sys_sched_setaffinity
-262	common	timer_settime		sys_timer_settime		compat_sys_timer_settime
-263	common	timer_gettime		sys_timer_gettime		compat_sys_timer_gettime
+262	common	timer_settime		sys_timer_settime		sys_timer_settime32
+263	common	timer_gettime		sys_timer_gettime		sys_timer_gettime32
 264	common	timer_getoverrun	sys_timer_getoverrun
 265	common	timer_delete		sys_timer_delete
 266	common	timer_create		sys_timer_create		compat_sys_timer_create
@@ -316,11 +316,11 @@
 269	common	io_destroy		sys_io_destroy
 270	common	io_submit		sys_io_submit			compat_sys_io_submit
 271	common	io_cancel		sys_io_cancel
-272	common	io_getevents		sys_io_getevents		compat_sys_io_getevents
+272	common	io_getevents		sys_io_getevents		sys_io_getevents_time32
 273	common	mq_open			sys_mq_open			compat_sys_mq_open
 274	common	mq_unlink		sys_mq_unlink
-275	common	mq_timedsend		sys_mq_timedsend		compat_sys_mq_timedsend
-276	common	mq_timedreceive		sys_mq_timedreceive		compat_sys_mq_timedreceive
+275	common	mq_timedsend		sys_mq_timedsend		sys_mq_timedsend_time32
+276	common	mq_timedreceive		sys_mq_timedreceive		sys_mq_timedreceive_time32
 277	common	mq_notify		sys_mq_notify			compat_sys_mq_notify
 278	common	mq_getsetattr		sys_mq_getsetattr		compat_sys_mq_getsetattr
 279	common	waitid			sys_waitid			compat_sys_waitid
@@ -332,7 +332,7 @@
 285	common	mkdirat			sys_mkdirat
 286	common	mknodat			sys_mknodat
 287	common	fchownat		sys_fchownat
-288	common	futimesat		sys_futimesat			compat_sys_futimesat
+288	common	futimesat		sys_futimesat			sys_futimesat_time32
 289	common	fstatat64		sys_fstatat64			compat_sys_fstatat64
 290	common	unlinkat		sys_unlinkat
 291	common	renameat		sys_renameat
@@ -341,8 +341,8 @@
 294	common	readlinkat		sys_readlinkat
 295	common	fchmodat		sys_fchmodat
 296	common	faccessat		sys_faccessat
-297	common	pselect6		sys_pselect6			compat_sys_pselect6
-298	common	ppoll			sys_ppoll			compat_sys_ppoll
+297	common	pselect6		sys_pselect6			compat_sys_pselect6_time32
+298	common	ppoll			sys_ppoll			compat_sys_ppoll_time32
 299	common	unshare			sys_unshare
 300	common	set_robust_list		sys_set_robust_list		compat_sys_set_robust_list
 301	common	get_robust_list		sys_get_robust_list		compat_sys_get_robust_list
@@ -354,13 +354,13 @@
 307	common	move_pages		sys_move_pages			compat_sys_move_pages
 308	common	getcpu			sys_getcpu
 309	common	epoll_pwait		sys_epoll_pwait			compat_sys_epoll_pwait
-310	common	utimensat		sys_utimensat			compat_sys_utimensat
+310	common	utimensat		sys_utimensat			sys_utimensat_time32
 311	common	signalfd		sys_signalfd			compat_sys_signalfd
 312	common	timerfd_create		sys_timerfd_create
 313	common	eventfd			sys_eventfd
 314	common	fallocate		sys_fallocate			compat_sys_fallocate
-315	common	timerfd_settime		sys_timerfd_settime		compat_sys_timerfd_settime
-316	common	timerfd_gettime		sys_timerfd_gettime		compat_sys_timerfd_gettime
+315	common	timerfd_settime		sys_timerfd_settime		sys_timerfd_settime32
+316	common	timerfd_gettime		sys_timerfd_gettime		sys_timerfd_gettime32
 317	common	signalfd4		sys_signalfd4			compat_sys_signalfd4
 318	common	eventfd2		sys_eventfd2
 319	common	epoll_create1		sys_epoll_create1
@@ -372,13 +372,13 @@
 325	common	pwritev			sys_pwritev			compat_sys_pwritev
 326	common	rt_tgsigqueueinfo	sys_rt_tgsigqueueinfo		compat_sys_rt_tgsigqueueinfo
 327	common	perf_event_open		sys_perf_event_open
-328	common	recvmmsg		sys_recvmmsg			compat_sys_recvmmsg
+328	common	recvmmsg		sys_recvmmsg			compat_sys_recvmmsg_time32
 329	common	fanotify_init		sys_fanotify_init
 330	common	fanotify_mark		sys_fanotify_mark		compat_sys_fanotify_mark
 331	common	prlimit64		sys_prlimit64
 332	common	name_to_handle_at	sys_name_to_handle_at
 333	common	open_by_handle_at	sys_open_by_handle_at		compat_sys_open_by_handle_at
-334	32	clock_adjtime		sys_clock_adjtime		compat_sys_clock_adjtime
+334	32	clock_adjtime		sys_clock_adjtime		sys_clock_adjtime32
 334	64	clock_adjtime		sys_sparc_clock_adjtime
 335	common	syncfs			sys_syncfs
 336	common	sendmmsg		sys_sendmmsg			compat_sys_sendmmsg
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 2be1d0eb7754..7705d5ecad25 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -24,7 +24,7 @@
 10	i386	unlink			sys_unlink			__ia32_sys_unlink
 11	i386	execve			sys_execve			__ia32_compat_sys_execve
 12	i386	chdir			sys_chdir			__ia32_sys_chdir
-13	i386	time			sys_time			__ia32_compat_sys_time
+13	i386	time			sys_time			__ia32_sys_time32
 14	i386	mknod			sys_mknod			__ia32_sys_mknod
 15	i386	chmod			sys_chmod			__ia32_sys_chmod
 16	i386	lchown			sys_lchown16			__ia32_sys_lchown16
@@ -36,12 +36,12 @@
 22	i386	umount			sys_oldumount			__ia32_sys_oldumount
 23	i386	setuid			sys_setuid16			__ia32_sys_setuid16
 24	i386	getuid			sys_getuid16			__ia32_sys_getuid16
-25	i386	stime			sys_stime			__ia32_compat_sys_stime
+25	i386	stime			sys_stime			__ia32_sys_stime32
 26	i386	ptrace			sys_ptrace			__ia32_compat_sys_ptrace
 27	i386	alarm			sys_alarm			__ia32_sys_alarm
 28	i386	oldfstat		sys_fstat			__ia32_sys_fstat
 29	i386	pause			sys_pause			__ia32_sys_pause
-30	i386	utime			sys_utime			__ia32_compat_sys_utime
+30	i386	utime			sys_utime			__ia32_sys_utime32
 31	i386	stty
 32	i386	gtty
 33	i386	access			sys_access			__ia32_sys_access
@@ -135,7 +135,7 @@
 121	i386	setdomainname		sys_setdomainname		__ia32_sys_setdomainname
 122	i386	uname			sys_newuname			__ia32_sys_newuname
 123	i386	modify_ldt		sys_modify_ldt			__ia32_sys_modify_ldt
-124	i386	adjtimex		sys_adjtimex			__ia32_compat_sys_adjtimex
+124	i386	adjtimex		sys_adjtimex			__ia32_sys_adjtimex_time32
 125	i386	mprotect		sys_mprotect			__ia32_sys_mprotect
 126	i386	sigprocmask		sys_sigprocmask			__ia32_compat_sys_sigprocmask
 127	i386	create_module
@@ -172,8 +172,8 @@
 158	i386	sched_yield		sys_sched_yield			__ia32_sys_sched_yield
 159	i386	sched_get_priority_max	sys_sched_get_priority_max	__ia32_sys_sched_get_priority_max
 160	i386	sched_get_priority_min	sys_sched_get_priority_min	__ia32_sys_sched_get_priority_min
-161	i386	sched_rr_get_interval	sys_sched_rr_get_interval	__ia32_compat_sys_sched_rr_get_interval
-162	i386	nanosleep		sys_nanosleep			__ia32_compat_sys_nanosleep
+161	i386	sched_rr_get_interval	sys_sched_rr_get_interval	__ia32_sys_sched_rr_get_interval_time32
+162	i386	nanosleep		sys_nanosleep			__ia32_sys_nanosleep_time32
 163	i386	mremap			sys_mremap			__ia32_sys_mremap
 164	i386	setresuid		sys_setresuid16			__ia32_sys_setresuid16
 165	i386	getresuid		sys_getresuid16			__ia32_sys_getresuid16
@@ -188,7 +188,7 @@
 174	i386	rt_sigaction		sys_rt_sigaction		__ia32_compat_sys_rt_sigaction
 175	i386	rt_sigprocmask		sys_rt_sigprocmask		__ia32_sys_rt_sigprocmask
 176	i386	rt_sigpending		sys_rt_sigpending		__ia32_compat_sys_rt_sigpending
-177	i386	rt_sigtimedwait		sys_rt_sigtimedwait		__ia32_compat_sys_rt_sigtimedwait
+177	i386	rt_sigtimedwait		sys_rt_sigtimedwait		__ia32_compat_sys_rt_sigtimedwait_time32
 178	i386	rt_sigqueueinfo		sys_rt_sigqueueinfo		__ia32_compat_sys_rt_sigqueueinfo
 179	i386	rt_sigsuspend		sys_rt_sigsuspend		__ia32_sys_rt_sigsuspend
 180	i386	pread64			sys_pread64			__ia32_compat_sys_x86_pread
@@ -251,14 +251,14 @@
 237	i386	fremovexattr		sys_fremovexattr		__ia32_sys_fremovexattr
 238	i386	tkill			sys_tkill			__ia32_sys_tkill
 239	i386	sendfile64		sys_sendfile64			__ia32_sys_sendfile64
-240	i386	futex			sys_futex			__ia32_compat_sys_futex
+240	i386	futex			sys_futex			__ia32_sys_futex_time32
 241	i386	sched_setaffinity	sys_sched_setaffinity		__ia32_compat_sys_sched_setaffinity
 242	i386	sched_getaffinity	sys_sched_getaffinity		__ia32_compat_sys_sched_getaffinity
 243	i386	set_thread_area		sys_set_thread_area		__ia32_sys_set_thread_area
 244	i386	get_thread_area		sys_get_thread_area		__ia32_sys_get_thread_area
 245	i386	io_setup		sys_io_setup			__ia32_compat_sys_io_setup
 246	i386	io_destroy		sys_io_destroy			__ia32_sys_io_destroy
-247	i386	io_getevents		sys_io_getevents		__ia32_compat_sys_io_getevents
+247	i386	io_getevents		sys_io_getevents		__ia32_sys_io_getevents_time32
 248	i386	io_submit		sys_io_submit			__ia32_compat_sys_io_submit
 249	i386	io_cancel		sys_io_cancel			__ia32_sys_io_cancel
 250	i386	fadvise64		sys_fadvise64			__ia32_compat_sys_x86_fadvise64
@@ -271,18 +271,18 @@
 257	i386	remap_file_pages	sys_remap_file_pages		__ia32_sys_remap_file_pages
 258	i386	set_tid_address		sys_set_tid_address		__ia32_sys_set_tid_address
 259	i386	timer_create		sys_timer_create		__ia32_compat_sys_timer_create
-260	i386	timer_settime		sys_timer_settime		__ia32_compat_sys_timer_settime
-261	i386	timer_gettime		sys_timer_gettime		__ia32_compat_sys_timer_gettime
+260	i386	timer_settime		sys_timer_settime		__ia32_sys_timer_settime32
+261	i386	timer_gettime		sys_timer_gettime		__ia32_sys_timer_gettime32
 262	i386	timer_getoverrun	sys_timer_getoverrun		__ia32_sys_timer_getoverrun
 263	i386	timer_delete		sys_timer_delete		__ia32_sys_timer_delete
-264	i386	clock_settime		sys_clock_settime		__ia32_compat_sys_clock_settime
-265	i386	clock_gettime		sys_clock_gettime		__ia32_compat_sys_clock_gettime
-266	i386	clock_getres		sys_clock_getres		__ia32_compat_sys_clock_getres
-267	i386	clock_nanosleep		sys_clock_nanosleep		__ia32_compat_sys_clock_nanosleep
+264	i386	clock_settime		sys_clock_settime		__ia32_sys_clock_settime32
+265	i386	clock_gettime		sys_clock_gettime		__ia32_sys_clock_gettime32
+266	i386	clock_getres		sys_clock_getres		__ia32_sys_clock_getres_time32
+267	i386	clock_nanosleep		sys_clock_nanosleep		__ia32_sys_clock_nanosleep_time32
 268	i386	statfs64		sys_statfs64			__ia32_compat_sys_statfs64
 269	i386	fstatfs64		sys_fstatfs64			__ia32_compat_sys_fstatfs64
 270	i386	tgkill			sys_tgkill			__ia32_sys_tgkill
-271	i386	utimes			sys_utimes			__ia32_compat_sys_utimes
+271	i386	utimes			sys_utimes			__ia32_sys_utimes_time32
 272	i386	fadvise64_64		sys_fadvise64_64		__ia32_compat_sys_x86_fadvise64_64
 273	i386	vserver
 274	i386	mbind			sys_mbind			__ia32_sys_mbind
@@ -290,8 +290,8 @@
 276	i386	set_mempolicy		sys_set_mempolicy		__ia32_sys_set_mempolicy
 277	i386	mq_open			sys_mq_open			__ia32_compat_sys_mq_open
 278	i386	mq_unlink		sys_mq_unlink			__ia32_sys_mq_unlink
-279	i386	mq_timedsend		sys_mq_timedsend		__ia32_compat_sys_mq_timedsend
-280	i386	mq_timedreceive		sys_mq_timedreceive		__ia32_compat_sys_mq_timedreceive
+279	i386	mq_timedsend		sys_mq_timedsend		__ia32_sys_mq_timedsend_time32
+280	i386	mq_timedreceive		sys_mq_timedreceive		__ia32_sys_mq_timedreceive_time32
 281	i386	mq_notify		sys_mq_notify			__ia32_compat_sys_mq_notify
 282	i386	mq_getsetattr		sys_mq_getsetattr		__ia32_compat_sys_mq_getsetattr
 283	i386	kexec_load		sys_kexec_load			__ia32_compat_sys_kexec_load
@@ -310,7 +310,7 @@
 296	i386	mkdirat			sys_mkdirat			__ia32_sys_mkdirat
 297	i386	mknodat			sys_mknodat			__ia32_sys_mknodat
 298	i386	fchownat		sys_fchownat			__ia32_sys_fchownat
-299	i386	futimesat		sys_futimesat			__ia32_compat_sys_futimesat
+299	i386	futimesat		sys_futimesat			__ia32_sys_futimesat_time32
 300	i386	fstatat64		sys_fstatat64			__ia32_compat_sys_x86_fstatat
 301	i386	unlinkat		sys_unlinkat			__ia32_sys_unlinkat
 302	i386	renameat		sys_renameat			__ia32_sys_renameat
@@ -319,8 +319,8 @@
 305	i386	readlinkat		sys_readlinkat			__ia32_sys_readlinkat
 306	i386	fchmodat		sys_fchmodat			__ia32_sys_fchmodat
 307	i386	faccessat		sys_faccessat			__ia32_sys_faccessat
-308	i386	pselect6		sys_pselect6			__ia32_compat_sys_pselect6
-309	i386	ppoll			sys_ppoll			__ia32_compat_sys_ppoll
+308	i386	pselect6		sys_pselect6			__ia32_compat_sys_pselect6_time32
+309	i386	ppoll			sys_ppoll			__ia32_compat_sys_ppoll_time32
 310	i386	unshare			sys_unshare			__ia32_sys_unshare
 311	i386	set_robust_list		sys_set_robust_list		__ia32_compat_sys_set_robust_list
 312	i386	get_robust_list		sys_get_robust_list		__ia32_compat_sys_get_robust_list
@@ -331,13 +331,13 @@
 317	i386	move_pages		sys_move_pages			__ia32_compat_sys_move_pages
 318	i386	getcpu			sys_getcpu			__ia32_sys_getcpu
 319	i386	epoll_pwait		sys_epoll_pwait			__ia32_sys_epoll_pwait
-320	i386	utimensat		sys_utimensat			__ia32_compat_sys_utimensat
+320	i386	utimensat		sys_utimensat			__ia32_sys_utimensat_time32
 321	i386	signalfd		sys_signalfd			__ia32_compat_sys_signalfd
 322	i386	timerfd_create		sys_timerfd_create		__ia32_sys_timerfd_create
 323	i386	eventfd			sys_eventfd			__ia32_sys_eventfd
 324	i386	fallocate		sys_fallocate			__ia32_compat_sys_x86_fallocate
-325	i386	timerfd_settime		sys_timerfd_settime		__ia32_compat_sys_timerfd_settime
-326	i386	timerfd_gettime		sys_timerfd_gettime		__ia32_compat_sys_timerfd_gettime
+325	i386	timerfd_settime		sys_timerfd_settime		__ia32_sys_timerfd_settime32
+326	i386	timerfd_gettime		sys_timerfd_gettime		__ia32_sys_timerfd_gettime32
 327	i386	signalfd4		sys_signalfd4			__ia32_compat_sys_signalfd4
 328	i386	eventfd2		sys_eventfd2			__ia32_sys_eventfd2
 329	i386	epoll_create1		sys_epoll_create1		__ia32_sys_epoll_create1
@@ -348,13 +348,13 @@
 334	i386	pwritev			sys_pwritev			__ia32_compat_sys_pwritev
 335	i386	rt_tgsigqueueinfo	sys_rt_tgsigqueueinfo		__ia32_compat_sys_rt_tgsigqueueinfo
 336	i386	perf_event_open		sys_perf_event_open		__ia32_sys_perf_event_open
-337	i386	recvmmsg		sys_recvmmsg			__ia32_compat_sys_recvmmsg
+337	i386	recvmmsg		sys_recvmmsg			__ia32_compat_sys_recvmmsg_time32
 338	i386	fanotify_init		sys_fanotify_init		__ia32_sys_fanotify_init
 339	i386	fanotify_mark		sys_fanotify_mark		__ia32_compat_sys_fanotify_mark
 340	i386	prlimit64		sys_prlimit64			__ia32_sys_prlimit64
 341	i386	name_to_handle_at	sys_name_to_handle_at		__ia32_sys_name_to_handle_at
 342	i386	open_by_handle_at	sys_open_by_handle_at		__ia32_compat_sys_open_by_handle_at
-343	i386	clock_adjtime		sys_clock_adjtime		__ia32_compat_sys_clock_adjtime
+343	i386	clock_adjtime		sys_clock_adjtime		__ia32_sys_clock_adjtime32
 344	i386	syncfs			sys_syncfs			__ia32_sys_syncfs
 345	i386	sendmmsg		sys_sendmmsg			__ia32_compat_sys_sendmmsg
 346	i386	setns			sys_setns			__ia32_sys_setns
diff --git a/fs/aio.c b/fs/aio.c
index b906ff70c90f..4394d3fe116a 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -2198,11 +2198,11 @@ SYSCALL_DEFINE6(io_pgetevents_time32,
 
 #if defined(CONFIG_COMPAT_32BIT_TIME)
 
-COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id,
-		       compat_long_t, min_nr,
-		       compat_long_t, nr,
-		       struct io_event __user *, events,
-		       struct old_timespec32 __user *, timeout)
+SYSCALL_DEFINE5(io_getevents_time32, __u32, ctx_id,
+		__s32, min_nr,
+		__s32, nr,
+		struct io_event __user *, events,
+		struct old_timespec32 __user *, timeout)
 {
 	struct timespec64 t;
 	int ret;
diff --git a/fs/select.c b/fs/select.c
index d0f35dbc0e8f..6cbc9ff56ba0 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -1379,7 +1379,7 @@ COMPAT_SYSCALL_DEFINE6(pselect6_time64, int, n, compat_ulong_t __user *, inp,
 
 #if defined(CONFIG_COMPAT_32BIT_TIME)
 
-COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp,
+COMPAT_SYSCALL_DEFINE6(pselect6_time32, int, n, compat_ulong_t __user *, inp,
 	compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
 	struct old_timespec32 __user *, tsp, void __user *, sig)
 {
@@ -1402,7 +1402,7 @@ COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp,
 #endif
 
 #if defined(CONFIG_COMPAT_32BIT_TIME)
-COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
+COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds,
 	unsigned int,  nfds, struct old_timespec32 __user *, tsp,
 	const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
 {
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 803ca070d42e..6a6fc8aa1de7 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -560,7 +560,7 @@ SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct __kernel_itimerspec __user *,
 }
 
 #ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
+SYSCALL_DEFINE4(timerfd_settime32, int, ufd, int, flags,
 		const struct old_itimerspec32 __user *, utmr,
 		struct old_itimerspec32 __user *, otmr)
 {
@@ -577,7 +577,7 @@ COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
 	return ret;
 }
 
-COMPAT_SYSCALL_DEFINE2(timerfd_gettime, int, ufd,
+SYSCALL_DEFINE2(timerfd_gettime32, int, ufd,
 		struct old_itimerspec32 __user *, otmr)
 {
 	struct itimerspec64 kotmr;
diff --git a/fs/utimes.c b/fs/utimes.c
index bdcf2daf39c1..350c9c16ace1 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -224,8 +224,8 @@ SYSCALL_DEFINE2(utime, char __user *, filename, struct utimbuf __user *, times)
  * of sys_utimes.
  */
 #ifdef __ARCH_WANT_SYS_UTIME32
-COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename,
-		       struct old_utimbuf32 __user *, t)
+SYSCALL_DEFINE2(utime32, const char __user *, filename,
+		struct old_utimbuf32 __user *, t)
 {
 	struct timespec64 tv[2];
 
@@ -240,7 +240,7 @@ COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename,
 }
 #endif
 
-COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct old_timespec32 __user *, t, int, flags)
+SYSCALL_DEFINE4(utimensat_time32, unsigned int, dfd, const char __user *, filename, struct old_timespec32 __user *, t, int, flags)
 {
 	struct timespec64 tv[2];
 
@@ -276,14 +276,14 @@ static long do_compat_futimesat(unsigned int dfd, const char __user *filename,
 	return do_utimes(dfd, filename, t ? tv : NULL, 0);
 }
 
-COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd,
+SYSCALL_DEFINE3(futimesat_time32, unsigned int, dfd,
 		       const char __user *, filename,
 		       struct old_timeval32 __user *, t)
 {
 	return do_compat_futimesat(dfd, filename, t);
 }
 
-COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct old_timeval32 __user *, t)
+SYSCALL_DEFINE2(utimes_time32, const char __user *, filename, struct old_timeval32 __user *, t)
 {
 	return do_compat_futimesat(AT_FDCWD, filename, t);
 }
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 657ca6abd855..ebddcb6cfcf8 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -520,11 +520,6 @@ int __compat_save_altstack(compat_stack_t __user *, unsigned long);
 asmlinkage long compat_sys_io_setup(unsigned nr_reqs, u32 __user *ctx32p);
 asmlinkage long compat_sys_io_submit(compat_aio_context_t ctx_id, int nr,
 				     u32 __user *iocb);
-asmlinkage long compat_sys_io_getevents(compat_aio_context_t ctx_id,
-					compat_long_t min_nr,
-					compat_long_t nr,
-					struct io_event __user *events,
-					struct old_timespec32 __user *timeout);
 asmlinkage long compat_sys_io_pgetevents(compat_aio_context_t ctx_id,
 					compat_long_t min_nr,
 					compat_long_t nr,
@@ -617,7 +612,7 @@ asmlinkage long compat_sys_sendfile64(int out_fd, int in_fd,
 				    compat_loff_t __user *offset, compat_size_t count);
 
 /* fs/select.c */
-asmlinkage long compat_sys_pselect6(int n, compat_ulong_t __user *inp,
+asmlinkage long compat_sys_pselect6_time32(int n, compat_ulong_t __user *inp,
 				    compat_ulong_t __user *outp,
 				    compat_ulong_t __user *exp,
 				    struct old_timespec32 __user *tsp,
@@ -627,7 +622,7 @@ asmlinkage long compat_sys_pselect6_time64(int n, compat_ulong_t __user *inp,
 				    compat_ulong_t __user *exp,
 				    struct __kernel_timespec __user *tsp,
 				    void __user *sig);
-asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
+asmlinkage long compat_sys_ppoll_time32(struct pollfd __user *ufds,
 				 unsigned int nfds,
 				 struct old_timespec32 __user *tsp,
 				 const compat_sigset_t __user *sigmask,
@@ -657,19 +652,6 @@ asmlinkage long compat_sys_newfstat(unsigned int fd,
 
 /* fs/sync.c: No generic prototype for sync_file_range and sync_file_range2 */
 
-/* fs/timerfd.c */
-asmlinkage long compat_sys_timerfd_gettime(int ufd,
-				   struct old_itimerspec32 __user *otmr);
-asmlinkage long compat_sys_timerfd_settime(int ufd, int flags,
-				   const struct old_itimerspec32 __user *utmr,
-				   struct old_itimerspec32 __user *otmr);
-
-/* fs/utimes.c */
-asmlinkage long compat_sys_utimensat(unsigned int dfd,
-				     const char __user *filename,
-				     struct old_timespec32 __user *t,
-				     int flags);
-
 /* kernel/exit.c */
 asmlinkage long compat_sys_waitid(int, compat_pid_t,
 		struct compat_siginfo __user *, int,
@@ -678,9 +660,6 @@ asmlinkage long compat_sys_waitid(int, compat_pid_t,
 
 
 /* kernel/futex.c */
-asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
-		struct old_timespec32 __user *utime, u32 __user *uaddr2,
-		u32 val3);
 asmlinkage long
 compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
 			   compat_size_t len);
@@ -688,10 +667,6 @@ asmlinkage long
 compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
 			   compat_size_t __user *len_ptr);
 
-/* kernel/hrtimer.c */
-asmlinkage long compat_sys_nanosleep(struct old_timespec32 __user *rqtp,
-				     struct old_timespec32 __user *rmtp);
-
 /* kernel/itimer.c */
 asmlinkage long compat_sys_getitimer(int which,
 				     struct compat_itimerval __user *it);
@@ -709,20 +684,6 @@ asmlinkage long compat_sys_kexec_load(compat_ulong_t entry,
 asmlinkage long compat_sys_timer_create(clockid_t which_clock,
 			struct compat_sigevent __user *timer_event_spec,
 			timer_t __user *created_timer_id);
-asmlinkage long compat_sys_timer_gettime(timer_t timer_id,
-				 struct old_itimerspec32 __user *setting);
-asmlinkage long compat_sys_timer_settime(timer_t timer_id, int flags,
-					 struct old_itimerspec32 __user *new,
-					 struct old_itimerspec32 __user *old);
-asmlinkage long compat_sys_clock_settime(clockid_t which_clock,
-					 struct old_timespec32 __user *tp);
-asmlinkage long compat_sys_clock_gettime(clockid_t which_clock,
-					 struct old_timespec32 __user *tp);
-asmlinkage long compat_sys_clock_getres(clockid_t which_clock,
-					struct old_timespec32 __user *tp);
-asmlinkage long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
-					   struct old_timespec32 __user *rqtp,
-					   struct old_timespec32 __user *rmtp);
 
 /* kernel/ptrace.c */
 asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
@@ -735,8 +696,6 @@ asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid,
 asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid,
 				     unsigned int len,
 				     compat_ulong_t __user *user_mask_ptr);
-asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid,
-						 struct old_timespec32 __user *interval);
 
 /* kernel/signal.c */
 asmlinkage long compat_sys_sigaltstack(const compat_stack_t __user *uss_ptr,
@@ -754,7 +713,7 @@ asmlinkage long compat_sys_rt_sigprocmask(int how, compat_sigset_t __user *set,
 					  compat_size_t sigsetsize);
 asmlinkage long compat_sys_rt_sigpending(compat_sigset_t __user *uset,
 					 compat_size_t sigsetsize);
-asmlinkage long compat_sys_rt_sigtimedwait(compat_sigset_t __user *uthese,
+asmlinkage long compat_sys_rt_sigtimedwait_time32(compat_sigset_t __user *uthese,
 		struct compat_siginfo __user *uinfo,
 		struct old_timespec32 __user *uts, compat_size_t sigsetsize);
 asmlinkage long compat_sys_rt_sigtimedwait_time64(compat_sigset_t __user *uthese,
@@ -777,7 +736,6 @@ asmlinkage long compat_sys_gettimeofday(struct old_timeval32 __user *tv,
 		struct timezone __user *tz);
 asmlinkage long compat_sys_settimeofday(struct old_timeval32 __user *tv,
 		struct timezone __user *tz);
-asmlinkage long compat_sys_adjtimex(struct old_timex32 __user *utp);
 
 /* kernel/timer.c */
 asmlinkage long compat_sys_sysinfo(struct compat_sysinfo __user *info);
@@ -786,14 +744,6 @@ asmlinkage long compat_sys_sysinfo(struct compat_sysinfo __user *info);
 asmlinkage long compat_sys_mq_open(const char __user *u_name,
 			int oflag, compat_mode_t mode,
 			struct compat_mq_attr __user *u_attr);
-asmlinkage long compat_sys_mq_timedsend(mqd_t mqdes,
-			const char __user *u_msg_ptr,
-			compat_size_t msg_len, unsigned int msg_prio,
-			const struct old_timespec32 __user *u_abs_timeout);
-asmlinkage ssize_t compat_sys_mq_timedreceive(mqd_t mqdes,
-			char __user *u_msg_ptr,
-			compat_size_t msg_len, unsigned int __user *u_msg_prio,
-			const struct old_timespec32 __user *u_abs_timeout);
 asmlinkage long compat_sys_mq_notify(mqd_t mqdes,
 			const struct compat_sigevent __user *u_notification);
 asmlinkage long compat_sys_mq_getsetattr(mqd_t mqdes,
@@ -809,8 +759,6 @@ asmlinkage long compat_sys_msgsnd(int msqid, compat_uptr_t msgp,
 
 /* ipc/sem.c */
 asmlinkage long compat_sys_semctl(int semid, int semnum, int cmd, int arg);
-asmlinkage long compat_sys_semtimedop(int semid, struct sembuf __user *tsems,
-		unsigned nsems, const struct old_timespec32 __user *timeout);
 
 /* ipc/shm.c */
 asmlinkage long compat_sys_shmctl(int first, int second, void __user *uptr);
@@ -868,7 +816,7 @@ asmlinkage long compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid,
 asmlinkage long compat_sys_recvmmsg_time64(int fd, struct compat_mmsghdr __user *mmsg,
 				    unsigned vlen, unsigned int flags,
 				    struct __kernel_timespec __user *timeout);
-asmlinkage long compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg,
+asmlinkage long compat_sys_recvmmsg_time32(int fd, struct compat_mmsghdr __user *mmsg,
 				    unsigned vlen, unsigned int flags,
 				    struct old_timespec32 __user *timeout);
 asmlinkage long compat_sys_wait4(compat_pid_t pid,
@@ -879,8 +827,6 @@ asmlinkage long compat_sys_fanotify_mark(int, unsigned int, __u32, __u32,
 asmlinkage long compat_sys_open_by_handle_at(int mountdirfd,
 					     struct file_handle __user *handle,
 					     int flags);
-asmlinkage long compat_sys_clock_adjtime(clockid_t which_clock,
-					 struct old_timex32 __user *tp);
 asmlinkage long compat_sys_sendmmsg(int fd, struct compat_mmsghdr __user *mmsg,
 				    unsigned vlen, unsigned int flags);
 asmlinkage ssize_t compat_sys_process_vm_readv(compat_pid_t pid,
@@ -921,8 +867,6 @@ asmlinkage long compat_sys_pwritev64v2(unsigned long fd,
 /* __ARCH_WANT_SYSCALL_NO_AT */
 asmlinkage long compat_sys_open(const char __user *filename, int flags,
 				umode_t mode);
-asmlinkage long compat_sys_utimes(const char __user *filename,
-				  struct old_timeval32 __user *t);
 
 /* __ARCH_WANT_SYSCALL_NO_FLAGS */
 asmlinkage long compat_sys_signalfd(int ufd,
@@ -936,12 +880,6 @@ asmlinkage long compat_sys_newlstat(const char __user *filename,
 				    struct compat_stat __user *statbuf);
 
 /* __ARCH_WANT_SYSCALL_DEPRECATED */
-asmlinkage long compat_sys_time(old_time32_t __user *tloc);
-asmlinkage long compat_sys_utime(const char __user *filename,
-				 struct old_utimbuf32 __user *t);
-asmlinkage long compat_sys_futimesat(unsigned int dfd,
-				     const char __user *filename,
-				     struct old_timeval32 __user *t);
 asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
 		compat_ulong_t __user *outp, compat_ulong_t __user *exp,
 		struct old_timeval32 __user *tvp);
@@ -976,9 +914,6 @@ asmlinkage long compat_sys_sigaction(int sig,
                                    struct compat_old_sigaction __user *oact);
 #endif
 
-/* obsolete: kernel/time/time.c */
-asmlinkage long compat_sys_stime(old_time32_t __user *tptr);
-
 /* obsolete: net/socket.c */
 asmlinkage long compat_sys_socketcall(int call, u32 __user *args);
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 09330d5bda0c..94369f5bd8e5 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -297,6 +297,11 @@ asmlinkage long sys_io_getevents(aio_context_t ctx_id,
 				long nr,
 				struct io_event __user *events,
 				struct __kernel_timespec __user *timeout);
+asmlinkage long sys_io_getevents_time32(__u32 ctx_id,
+				__s32 min_nr,
+				__s32 nr,
+				struct io_event __user *events,
+				struct old_timespec32 __user *timeout);
 asmlinkage long sys_io_pgetevents(aio_context_t ctx_id,
 				long min_nr,
 				long nr,
@@ -522,11 +527,19 @@ asmlinkage long sys_timerfd_settime(int ufd, int flags,
 				    const struct __kernel_itimerspec __user *utmr,
 				    struct __kernel_itimerspec __user *otmr);
 asmlinkage long sys_timerfd_gettime(int ufd, struct __kernel_itimerspec __user *otmr);
+asmlinkage long sys_timerfd_gettime32(int ufd,
+				   struct old_itimerspec32 __user *otmr);
+asmlinkage long sys_timerfd_settime32(int ufd, int flags,
+				   const struct old_itimerspec32 __user *utmr,
+				   struct old_itimerspec32 __user *otmr);
 
 /* fs/utimes.c */
 asmlinkage long sys_utimensat(int dfd, const char __user *filename,
 				struct __kernel_timespec __user *utimes,
 				int flags);
+asmlinkage long sys_utimensat_time32(unsigned int dfd,
+				const char __user *filename,
+				struct old_timespec32 __user *t, int flags);
 
 /* kernel/acct.c */
 asmlinkage long sys_acct(const char __user *name);
@@ -555,6 +568,9 @@ asmlinkage long sys_unshare(unsigned long unshare_flags);
 asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
 			struct __kernel_timespec __user *utime, u32 __user *uaddr2,
 			u32 val3);
+asmlinkage long sys_futex_time32(u32 __user *uaddr, int op, u32 val,
+			struct old_timespec32 __user *utime, u32 __user *uaddr2,
+			u32 val3);
 asmlinkage long sys_get_robust_list(int pid,
 				    struct robust_list_head __user * __user *head_ptr,
 				    size_t __user *len_ptr);
@@ -564,6 +580,8 @@ asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
 /* kernel/hrtimer.c */
 asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp,
 			      struct __kernel_timespec __user *rmtp);
+asmlinkage long sys_nanosleep_time32(struct old_timespec32 __user *rqtp,
+				     struct old_timespec32 __user *rmtp);
 
 /* kernel/itimer.c */
 asmlinkage long sys_getitimer(int which, struct itimerval __user *value);
@@ -602,6 +620,20 @@ asmlinkage long sys_clock_getres(clockid_t which_clock,
 asmlinkage long sys_clock_nanosleep(clockid_t which_clock, int flags,
 				const struct __kernel_timespec __user *rqtp,
 				struct __kernel_timespec __user *rmtp);
+asmlinkage long sys_timer_gettime32(timer_t timer_id,
+				 struct old_itimerspec32 __user *setting);
+asmlinkage long sys_timer_settime32(timer_t timer_id, int flags,
+					 struct old_itimerspec32 __user *new,
+					 struct old_itimerspec32 __user *old);
+asmlinkage long sys_clock_settime32(clockid_t which_clock,
+				struct old_timespec32 __user *tp);
+asmlinkage long sys_clock_gettime32(clockid_t which_clock,
+				struct old_timespec32 __user *tp);
+asmlinkage long sys_clock_getres_time32(clockid_t which_clock,
+				struct old_timespec32 __user *tp);
+asmlinkage long sys_clock_nanosleep_time32(clockid_t which_clock, int flags,
+				struct old_timespec32 __user *rqtp,
+				struct old_timespec32 __user *rmtp);
 
 /* kernel/printk.c */
 asmlinkage long sys_syslog(int type, char __user *buf, int len);
@@ -627,6 +659,8 @@ asmlinkage long sys_sched_get_priority_max(int policy);
 asmlinkage long sys_sched_get_priority_min(int policy);
 asmlinkage long sys_sched_rr_get_interval(pid_t pid,
 				struct __kernel_timespec __user *interval);
+asmlinkage long sys_sched_rr_get_interval_time32(pid_t pid,
+						 struct old_timespec32 __user *interval);
 
 /* kernel/signal.c */
 asmlinkage long sys_restart_syscall(void);
@@ -696,6 +730,7 @@ asmlinkage long sys_gettimeofday(struct timeval __user *tv,
 asmlinkage long sys_settimeofday(struct timeval __user *tv,
 				struct timezone __user *tz);
 asmlinkage long sys_adjtimex(struct __kernel_timex __user *txc_p);
+asmlinkage long sys_adjtimex_time32(struct old_timex32 __user *txc_p);
 
 /* kernel/timer.c */
 asmlinkage long sys_getpid(void);
@@ -714,6 +749,14 @@ asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *msg_ptr, size_t
 asmlinkage long sys_mq_timedreceive(mqd_t mqdes, char __user *msg_ptr, size_t msg_len, unsigned int __user *msg_prio, const struct __kernel_timespec __user *abs_timeout);
 asmlinkage long sys_mq_notify(mqd_t mqdes, const struct sigevent __user *notification);
 asmlinkage long sys_mq_getsetattr(mqd_t mqdes, const struct mq_attr __user *mqstat, struct mq_attr __user *omqstat);
+asmlinkage long sys_mq_timedreceive_time32(mqd_t mqdes,
+			char __user *u_msg_ptr,
+			unsigned int msg_len, unsigned int __user *u_msg_prio,
+			const struct old_timespec32 __user *u_abs_timeout);
+asmlinkage long sys_mq_timedsend_time32(mqd_t mqdes,
+			const char __user *u_msg_ptr,
+			unsigned int msg_len, unsigned int msg_prio,
+			const struct old_timespec32 __user *u_abs_timeout);
 
 /* ipc/msg.c */
 asmlinkage long sys_msgget(key_t key, int msgflg);
@@ -731,6 +774,9 @@ asmlinkage long sys_old_semctl(int semid, int semnum, int cmd, unsigned long arg
 asmlinkage long sys_semtimedop(int semid, struct sembuf __user *sops,
 				unsigned nsops,
 				const struct __kernel_timespec __user *timeout);
+asmlinkage long sys_semtimedop_time32(int semid, struct sembuf __user *sops,
+				unsigned nsops,
+				const struct old_timespec32 __user *timeout);
 asmlinkage long sys_semop(int semid, struct sembuf __user *sops,
 				unsigned nsops);
 
@@ -871,6 +917,8 @@ asmlinkage long sys_open_by_handle_at(int mountdirfd,
 				      int flags);
 asmlinkage long sys_clock_adjtime(clockid_t which_clock,
 				struct __kernel_timex __user *tx);
+asmlinkage long sys_clock_adjtime32(clockid_t which_clock,
+				struct old_timex32 __user *tx);
 asmlinkage long sys_syncfs(int fd);
 asmlinkage long sys_setns(int fd, int nstype);
 asmlinkage long sys_sendmmsg(int fd, struct mmsghdr __user *msg,
@@ -1006,6 +1054,7 @@ asmlinkage long sys_alarm(unsigned int seconds);
 asmlinkage long sys_getpgrp(void);
 asmlinkage long sys_pause(void);
 asmlinkage long sys_time(time_t __user *tloc);
+asmlinkage long sys_time32(old_time32_t __user *tloc);
 #ifdef __ARCH_WANT_SYS_UTIME
 asmlinkage long sys_utime(char __user *filename,
 				struct utimbuf __user *times);
@@ -1014,6 +1063,13 @@ asmlinkage long sys_utimes(char __user *filename,
 asmlinkage long sys_futimesat(int dfd, const char __user *filename,
 			      struct timeval __user *utimes);
 #endif
+asmlinkage long sys_futimesat_time32(unsigned int dfd,
+				     const char __user *filename,
+				     struct old_timeval32 __user *t);
+asmlinkage long sys_utime32(const char __user *filename,
+				 struct old_utimbuf32 __user *t);
+asmlinkage long sys_utimes_time32(const char __user *filename,
+				  struct old_timeval32 __user *t);
 asmlinkage long sys_creat(const char __user *pathname, umode_t mode);
 asmlinkage long sys_getdents(unsigned int fd,
 				struct linux_dirent __user *dirent,
@@ -1038,6 +1094,7 @@ asmlinkage long sys_fork(void);
 
 /* obsolete: kernel/time/time.c */
 asmlinkage long sys_stime(time_t __user *tptr);
+asmlinkage long sys_stime32(old_time32_t __user *tptr);
 
 /* obsolete: kernel/signal.c */
 asmlinkage long sys_sigpending(old_sigset_t __user *uset);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 509484dbfd5d..153b55b94234 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -39,7 +39,7 @@ __SC_COMP(__NR_io_submit, sys_io_submit, compat_sys_io_submit)
 #define __NR_io_cancel 3
 __SYSCALL(__NR_io_cancel, sys_io_cancel)
 #define __NR_io_getevents 4
-__SC_COMP(__NR_io_getevents, sys_io_getevents, compat_sys_io_getevents)
+__SC_COMP(__NR_io_getevents, sys_io_getevents, sys_io_getevents_time32)
 
 /* fs/xattr.c */
 #define __NR_setxattr 5
@@ -223,9 +223,9 @@ __SYSCALL(__NR3264_sendfile, sys_sendfile64)
 
 /* fs/select.c */
 #define __NR_pselect6 72
-__SC_COMP(__NR_pselect6, sys_pselect6, compat_sys_pselect6)
+__SC_COMP(__NR_pselect6, sys_pselect6, compat_sys_pselect6_time32)
 #define __NR_ppoll 73
-__SC_COMP(__NR_ppoll, sys_ppoll, compat_sys_ppoll)
+__SC_COMP(__NR_ppoll, sys_ppoll, compat_sys_ppoll_time32)
 
 /* fs/signalfd.c */
 #define __NR_signalfd4 74
@@ -271,14 +271,14 @@ __SC_COMP(__NR_sync_file_range, sys_sync_file_range, \
 __SYSCALL(__NR_timerfd_create, sys_timerfd_create)
 #define __NR_timerfd_settime 86
 __SC_COMP(__NR_timerfd_settime, sys_timerfd_settime, \
-	  compat_sys_timerfd_settime)
+	  sys_timerfd_settime32)
 #define __NR_timerfd_gettime 87
 __SC_COMP(__NR_timerfd_gettime, sys_timerfd_gettime, \
-	  compat_sys_timerfd_gettime)
+	  sys_timerfd_gettime32)
 
 /* fs/utimes.c */
 #define __NR_utimensat 88
-__SC_COMP(__NR_utimensat, sys_utimensat, compat_sys_utimensat)
+__SC_COMP(__NR_utimensat, sys_utimensat, sys_utimensat_time32)
 
 /* kernel/acct.c */
 #define __NR_acct 89
@@ -310,7 +310,7 @@ __SYSCALL(__NR_unshare, sys_unshare)
 
 /* kernel/futex.c */
 #define __NR_futex 98
-__SC_COMP(__NR_futex, sys_futex, compat_sys_futex)
+__SC_COMP(__NR_futex, sys_futex, sys_futex_time32)
 #define __NR_set_robust_list 99
 __SC_COMP(__NR_set_robust_list, sys_set_robust_list, \
 	  compat_sys_set_robust_list)
@@ -320,7 +320,7 @@ __SC_COMP(__NR_get_robust_list, sys_get_robust_list, \
 
 /* kernel/hrtimer.c */
 #define __NR_nanosleep 101
-__SC_COMP(__NR_nanosleep, sys_nanosleep, compat_sys_nanosleep)
+__SC_COMP(__NR_nanosleep, sys_nanosleep, sys_nanosleep_time32)
 
 /* kernel/itimer.c */
 #define __NR_getitimer 102
@@ -342,22 +342,22 @@ __SYSCALL(__NR_delete_module, sys_delete_module)
 #define __NR_timer_create 107
 __SC_COMP(__NR_timer_create, sys_timer_create, compat_sys_timer_create)
 #define __NR_timer_gettime 108
-__SC_COMP(__NR_timer_gettime, sys_timer_gettime, compat_sys_timer_gettime)
+__SC_COMP(__NR_timer_gettime, sys_timer_gettime, sys_timer_gettime32)
 #define __NR_timer_getoverrun 109
 __SYSCALL(__NR_timer_getoverrun, sys_timer_getoverrun)
 #define __NR_timer_settime 110
-__SC_COMP(__NR_timer_settime, sys_timer_settime, compat_sys_timer_settime)
+__SC_COMP(__NR_timer_settime, sys_timer_settime, sys_timer_settime32)
 #define __NR_timer_delete 111
 __SYSCALL(__NR_timer_delete, sys_timer_delete)
 #define __NR_clock_settime 112
-__SC_COMP(__NR_clock_settime, sys_clock_settime, compat_sys_clock_settime)
+__SC_COMP(__NR_clock_settime, sys_clock_settime, sys_clock_settime32)
 #define __NR_clock_gettime 113
-__SC_COMP(__NR_clock_gettime, sys_clock_gettime, compat_sys_clock_gettime)
+__SC_COMP(__NR_clock_gettime, sys_clock_gettime, sys_clock_gettime32)
 #define __NR_clock_getres 114
-__SC_COMP(__NR_clock_getres, sys_clock_getres, compat_sys_clock_getres)
+__SC_COMP(__NR_clock_getres, sys_clock_getres, sys_clock_getres_time32)
 #define __NR_clock_nanosleep 115
 __SC_COMP(__NR_clock_nanosleep, sys_clock_nanosleep, \
-	  compat_sys_clock_nanosleep)
+	  sys_clock_nanosleep_time32)
 
 /* kernel/printk.c */
 #define __NR_syslog 116
@@ -390,7 +390,7 @@ __SYSCALL(__NR_sched_get_priority_max, sys_sched_get_priority_max)
 __SYSCALL(__NR_sched_get_priority_min, sys_sched_get_priority_min)
 #define __NR_sched_rr_get_interval 127
 __SC_COMP(__NR_sched_rr_get_interval, sys_sched_rr_get_interval, \
-	  compat_sys_sched_rr_get_interval)
+	  sys_sched_rr_get_interval_time32)
 
 /* kernel/signal.c */
 #define __NR_restart_syscall 128
@@ -413,7 +413,7 @@ __SC_COMP(__NR_rt_sigprocmask, sys_rt_sigprocmask, compat_sys_rt_sigprocmask)
 __SC_COMP(__NR_rt_sigpending, sys_rt_sigpending, compat_sys_rt_sigpending)
 #define __NR_rt_sigtimedwait 137
 __SC_COMP(__NR_rt_sigtimedwait, sys_rt_sigtimedwait, \
-	  compat_sys_rt_sigtimedwait)
+	  compat_sys_rt_sigtimedwait_time32)
 #define __NR_rt_sigqueueinfo 138
 __SC_COMP(__NR_rt_sigqueueinfo, sys_rt_sigqueueinfo, \
 	  compat_sys_rt_sigqueueinfo)
@@ -486,7 +486,7 @@ __SC_COMP(__NR_gettimeofday, sys_gettimeofday, compat_sys_gettimeofday)
 #define __NR_settimeofday 170
 __SC_COMP(__NR_settimeofday, sys_settimeofday, compat_sys_settimeofday)
 #define __NR_adjtimex 171
-__SC_COMP(__NR_adjtimex, sys_adjtimex, compat_sys_adjtimex)
+__SC_COMP(__NR_adjtimex, sys_adjtimex, sys_adjtimex_time32)
 
 /* kernel/timer.c */
 #define __NR_getpid 172
@@ -512,10 +512,10 @@ __SC_COMP(__NR_mq_open, sys_mq_open, compat_sys_mq_open)
 #define __NR_mq_unlink 181
 __SYSCALL(__NR_mq_unlink, sys_mq_unlink)
 #define __NR_mq_timedsend 182
-__SC_COMP(__NR_mq_timedsend, sys_mq_timedsend, compat_sys_mq_timedsend)
+__SC_COMP(__NR_mq_timedsend, sys_mq_timedsend, sys_mq_timedsend_time32)
 #define __NR_mq_timedreceive 183
 __SC_COMP(__NR_mq_timedreceive, sys_mq_timedreceive, \
-	  compat_sys_mq_timedreceive)
+	  sys_mq_timedreceive_time32)
 #define __NR_mq_notify 184
 __SC_COMP(__NR_mq_notify, sys_mq_notify, compat_sys_mq_notify)
 #define __NR_mq_getsetattr 185
@@ -537,7 +537,7 @@ __SYSCALL(__NR_semget, sys_semget)
 #define __NR_semctl 191
 __SC_COMP(__NR_semctl, sys_semctl, compat_sys_semctl)
 #define __NR_semtimedop 192
-__SC_COMP(__NR_semtimedop, sys_semtimedop, compat_sys_semtimedop)
+__SC_COMP(__NR_semtimedop, sys_semtimedop, sys_semtimedop_time32)
 #define __NR_semop 193
 __SYSCALL(__NR_semop, sys_semop)
 
@@ -659,7 +659,7 @@ __SYSCALL(__NR_perf_event_open, sys_perf_event_open)
 #define __NR_accept4 242
 __SYSCALL(__NR_accept4, sys_accept4)
 #define __NR_recvmmsg 243
-__SC_COMP(__NR_recvmmsg, sys_recvmmsg, compat_sys_recvmmsg)
+__SC_COMP(__NR_recvmmsg, sys_recvmmsg, compat_sys_recvmmsg_time32)
 
 /*
  * Architectures may provide up to 16 syscalls of their own
@@ -681,7 +681,7 @@ __SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at)
 __SC_COMP(__NR_open_by_handle_at, sys_open_by_handle_at, \
 	  compat_sys_open_by_handle_at)
 #define __NR_clock_adjtime 266
-__SC_COMP(__NR_clock_adjtime, sys_clock_adjtime, compat_sys_clock_adjtime)
+__SC_COMP(__NR_clock_adjtime, sys_clock_adjtime, sys_clock_adjtime32)
 #define __NR_syncfs 267
 __SYSCALL(__NR_syncfs, sys_syncfs)
 #define __NR_setns 268
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index c595bed7bfcb..c839bf83231d 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -1471,10 +1471,10 @@ static int compat_prepare_timeout(const struct old_timespec32 __user *p,
 	return 0;
 }
 
-COMPAT_SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes,
-		       const char __user *, u_msg_ptr,
-		       compat_size_t, msg_len, unsigned int, msg_prio,
-		       const struct old_timespec32 __user *, u_abs_timeout)
+SYSCALL_DEFINE5(mq_timedsend_time32, mqd_t, mqdes,
+		const char __user *, u_msg_ptr,
+		unsigned int, msg_len, unsigned int, msg_prio,
+		const struct old_timespec32 __user *, u_abs_timeout)
 {
 	struct timespec64 ts, *p = NULL;
 	if (u_abs_timeout) {
@@ -1486,10 +1486,10 @@ COMPAT_SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes,
 	return do_mq_timedsend(mqdes, u_msg_ptr, msg_len, msg_prio, p);
 }
 
-COMPAT_SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes,
-		       char __user *, u_msg_ptr,
-		       compat_size_t, msg_len, unsigned int __user *, u_msg_prio,
-		       const struct old_timespec32 __user *, u_abs_timeout)
+SYSCALL_DEFINE5(mq_timedreceive_time32, mqd_t, mqdes,
+		char __user *, u_msg_ptr,
+		unsigned int, msg_len, unsigned int __user *, u_msg_prio,
+		const struct old_timespec32 __user *, u_abs_timeout)
 {
 	struct timespec64 ts, *p = NULL;
 	if (u_abs_timeout) {
diff --git a/ipc/sem.c b/ipc/sem.c
index d1efff3a81bb..80909464acff 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -2250,7 +2250,7 @@ long compat_ksys_semtimedop(int semid, struct sembuf __user *tsems,
 	return do_semtimedop(semid, tsems, nsops, NULL);
 }
 
-COMPAT_SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsems,
+SYSCALL_DEFINE4(semtimedop_time32, int, semid, struct sembuf __user *, tsems,
 		       unsigned int, nsops,
 		       const struct old_timespec32 __user *, timeout)
 {
diff --git a/kernel/futex.c b/kernel/futex.c
index be3bff2315ff..caead6c113d4 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -3812,7 +3812,7 @@ err_unlock:
 #endif /* CONFIG_COMPAT */
 
 #ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
+SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
 		struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
 		u32, val3)
 {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a674c7db2f29..62862419cd05 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5252,9 +5252,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
 }
 
 #ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
-		       compat_pid_t, pid,
-		       struct old_timespec32 __user *, interval)
+SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
+		struct old_timespec32 __user *, interval)
 {
 	struct timespec64 t;
 	int retval = sched_rr_get_interval(pid, &t);
diff --git a/kernel/signal.c b/kernel/signal.c
index e1d7ad8e6ab1..af27629918cf 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3397,7 +3397,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time64, compat_sigset_t __user *, uthese,
 }
 
 #ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
+COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time32, compat_sigset_t __user *, uthese,
 		struct compat_siginfo __user *, uinfo,
 		struct old_timespec32 __user *, uts, compat_size_t, sigsetsize)
 {
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index ce04431a40d1..85e5ccec0955 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -42,9 +42,11 @@ COND_SYSCALL(io_destroy);
 COND_SYSCALL(io_submit);
 COND_SYSCALL_COMPAT(io_submit);
 COND_SYSCALL(io_cancel);
+COND_SYSCALL(io_getevents_time32);
 COND_SYSCALL(io_getevents);
+COND_SYSCALL(io_pgetevents_time32);
 COND_SYSCALL(io_pgetevents);
-COND_SYSCALL_COMPAT(io_getevents);
+COND_SYSCALL_COMPAT(io_pgetevents_time32);
 COND_SYSCALL_COMPAT(io_pgetevents);
 
 /* fs/xattr.c */
@@ -114,9 +116,9 @@ COND_SYSCALL_COMPAT(signalfd4);
 /* fs/timerfd.c */
 COND_SYSCALL(timerfd_create);
 COND_SYSCALL(timerfd_settime);
-COND_SYSCALL_COMPAT(timerfd_settime);
+COND_SYSCALL(timerfd_settime32);
 COND_SYSCALL(timerfd_gettime);
-COND_SYSCALL_COMPAT(timerfd_gettime);
+COND_SYSCALL(timerfd_gettime32);
 
 /* fs/utimes.c */
 
@@ -135,7 +137,7 @@ COND_SYSCALL(capset);
 
 /* kernel/futex.c */
 COND_SYSCALL(futex);
-COND_SYSCALL_COMPAT(futex);
+COND_SYSCALL(futex_time32);
 COND_SYSCALL(set_robust_list);
 COND_SYSCALL_COMPAT(set_robust_list);
 COND_SYSCALL(get_robust_list);
@@ -187,9 +189,9 @@ COND_SYSCALL(mq_open);
 COND_SYSCALL_COMPAT(mq_open);
 COND_SYSCALL(mq_unlink);
 COND_SYSCALL(mq_timedsend);
-COND_SYSCALL_COMPAT(mq_timedsend);
+COND_SYSCALL(mq_timedsend_time32);
 COND_SYSCALL(mq_timedreceive);
-COND_SYSCALL_COMPAT(mq_timedreceive);
+COND_SYSCALL(mq_timedreceive_time32);
 COND_SYSCALL(mq_notify);
 COND_SYSCALL_COMPAT(mq_notify);
 COND_SYSCALL(mq_getsetattr);
@@ -211,7 +213,7 @@ COND_SYSCALL(old_semctl);
 COND_SYSCALL(semctl);
 COND_SYSCALL_COMPAT(semctl);
 COND_SYSCALL(semtimedop);
-COND_SYSCALL_COMPAT(semtimedop);
+COND_SYSCALL(semtimedop_time32);
 COND_SYSCALL(semop);
 
 /* ipc/shm.c */
@@ -288,7 +290,7 @@ COND_SYSCALL(perf_event_open);
 COND_SYSCALL(accept4);
 COND_SYSCALL(recvmmsg);
 COND_SYSCALL(recvmmsg_time32);
-COND_SYSCALL_COMPAT(recvmmsg);
+COND_SYSCALL_COMPAT(recvmmsg_time32);
 COND_SYSCALL_COMPAT(recvmmsg_time64);
 
 /*
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index f5cfa1b73d6f..0f5f96075110 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1771,7 +1771,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
 
 #ifdef CONFIG_COMPAT_32BIT_TIME
 
-COMPAT_SYSCALL_DEFINE2(nanosleep, struct old_timespec32 __user *, rqtp,
+SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
 		       struct old_timespec32 __user *, rmtp)
 {
 	struct timespec64 tu;
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index a51895486e5e..67df65f887ac 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -45,6 +45,7 @@ SYS_NI(timer_delete);
 SYS_NI(clock_adjtime);
 SYS_NI(getitimer);
 SYS_NI(setitimer);
+SYS_NI(clock_adjtime32);
 #ifdef __ARCH_WANT_SYS_ALARM
 SYS_NI(alarm);
 #endif
@@ -150,16 +151,16 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
 
 #ifdef CONFIG_COMPAT
 COMPAT_SYS_NI(timer_create);
-COMPAT_SYS_NI(clock_adjtime);
-COMPAT_SYS_NI(timer_settime);
-COMPAT_SYS_NI(timer_gettime);
 COMPAT_SYS_NI(getitimer);
 COMPAT_SYS_NI(setitimer);
 #endif
 
 #ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
-		       struct old_timespec32 __user *, tp)
+SYS_NI(timer_settime32);
+SYS_NI(timer_gettime32);
+
+SYSCALL_DEFINE2(clock_settime32, const clockid_t, which_clock,
+		struct old_timespec32 __user *, tp)
 {
 	struct timespec64 new_tp;
 
@@ -171,8 +172,8 @@ COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
 	return do_sys_settimeofday64(&new_tp, NULL);
 }
 
-COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
-		       struct old_timespec32 __user *, tp)
+SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock,
+		struct old_timespec32 __user *, tp)
 {
 	int ret;
 	struct timespec64 kernel_tp;
@@ -186,8 +187,8 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
 	return 0;
 }
 
-COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
-		       struct old_timespec32 __user *, tp)
+SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock,
+		struct old_timespec32 __user *, tp)
 {
 	struct timespec64 rtn_tp = {
 		.tv_sec = 0,
@@ -206,9 +207,9 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
 	}
 }
 
-COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
-		       struct old_timespec32 __user *, rqtp,
-		       struct old_timespec32 __user *, rmtp)
+SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
+		struct old_timespec32 __user *, rqtp,
+		struct old_timespec32 __user *, rmtp)
 {
 	struct timespec64 t;
 
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index de79f85ae14f..29176635991f 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -730,8 +730,8 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
 
 #ifdef CONFIG_COMPAT_32BIT_TIME
 
-COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
-		       struct old_itimerspec32 __user *, setting)
+SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id,
+		struct old_itimerspec32 __user *, setting)
 {
 	struct itimerspec64 cur_setting;
 
@@ -903,9 +903,9 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
 }
 
 #ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
-		       struct old_itimerspec32 __user *, new,
-		       struct old_itimerspec32 __user *, old)
+SYSCALL_DEFINE4(timer_settime32, timer_t, timer_id, int, flags,
+		struct old_itimerspec32 __user *, new,
+		struct old_itimerspec32 __user *, old)
 {
 	struct itimerspec64 new_spec, old_spec;
 	struct itimerspec64 *rtn = old ? &old_spec : NULL;
@@ -1096,8 +1096,8 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
 
 #ifdef CONFIG_COMPAT_32BIT_TIME
 
-COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock,
-		       struct old_timespec32 __user *, tp)
+SYSCALL_DEFINE2(clock_settime32, clockid_t, which_clock,
+		struct old_timespec32 __user *, tp)
 {
 	const struct k_clock *kc = clockid_to_kclock(which_clock);
 	struct timespec64 ts;
@@ -1111,8 +1111,8 @@ COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock,
 	return kc->clock_set(which_clock, &ts);
 }
 
-COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
-		       struct old_timespec32 __user *, tp)
+SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock,
+		struct old_timespec32 __user *, tp)
 {
 	const struct k_clock *kc = clockid_to_kclock(which_clock);
 	struct timespec64 ts;
@@ -1129,8 +1129,8 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
 	return err;
 }
 
-COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock,
-		       struct old_timex32 __user *, utp)
+SYSCALL_DEFINE2(clock_adjtime32, clockid_t, which_clock,
+		struct old_timex32 __user *, utp)
 {
 	struct __kernel_timex ktx;
 	int err;
@@ -1147,8 +1147,8 @@ COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock,
 	return err;
 }
 
-COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
-		       struct old_timespec32 __user *, tp)
+SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock,
+		struct old_timespec32 __user *, tp)
 {
 	const struct k_clock *kc = clockid_to_kclock(which_clock);
 	struct timespec64 ts;
@@ -1204,9 +1204,9 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
 
 #ifdef CONFIG_COMPAT_32BIT_TIME
 
-COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
-		       struct old_timespec32 __user *, rqtp,
-		       struct old_timespec32 __user *, rmtp)
+SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
+		struct old_timespec32 __user *, rqtp,
+		struct old_timespec32 __user *, rmtp)
 {
 	const struct k_clock *kc = clockid_to_kclock(which_clock);
 	struct timespec64 t;
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 78b5c8f1495a..6261f969dcb7 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -98,11 +98,11 @@ SYSCALL_DEFINE1(stime, time_t __user *, tptr)
 
 #endif /* __ARCH_WANT_SYS_TIME */
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_COMPAT_32BIT_TIME
 #ifdef __ARCH_WANT_COMPAT_SYS_TIME
 
 /* old_time32_t is a 32 bit "long" and needs to get converted. */
-COMPAT_SYSCALL_DEFINE1(time, old_time32_t __user *, tloc)
+SYSCALL_DEFINE1(time32, old_time32_t __user *, tloc)
 {
 	old_time32_t i;
 
@@ -116,7 +116,7 @@ COMPAT_SYSCALL_DEFINE1(time, old_time32_t __user *, tloc)
 	return i;
 }
 
-COMPAT_SYSCALL_DEFINE1(stime, old_time32_t __user *, tptr)
+SYSCALL_DEFINE1(stime32, old_time32_t __user *, tptr)
 {
 	struct timespec64 tv;
 	int err;
@@ -344,7 +344,7 @@ int put_old_timex32(struct old_timex32 __user *utp, const struct __kernel_timex
 	return 0;
 }
 
-COMPAT_SYSCALL_DEFINE1(adjtimex, struct old_timex32 __user *, utp)
+SYSCALL_DEFINE1(adjtimex_time32, struct old_timex32 __user *, utp)
 {
 	struct __kernel_timex txc;
 	int err, ret;
diff --git a/net/compat.c b/net/compat.c
index 959d1c51826d..2fef7b9db434 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -822,7 +822,7 @@ COMPAT_SYSCALL_DEFINE5(recvmmsg_time64, int, fd, struct compat_mmsghdr __user *,
 }
 
 #ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE5(recvmmsg, int, fd, struct compat_mmsghdr __user *, mmsg,
+COMPAT_SYSCALL_DEFINE5(recvmmsg_time32, int, fd, struct compat_mmsghdr __user *, mmsg,
 		       unsigned int, vlen, unsigned int, flags,
 		       struct old_timespec32 __user *, timeout)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 00bf25d693e7f69497cb7f61d46ef99fe295a8a5 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 1 Jan 2019 01:13:32 +0100
Subject: y2038: use time32 syscall names on 32-bit

This is the big flip, where all 32-bit architectures set COMPAT_32BIT_TIME
and use the _time32 system calls from the former compat layer instead
of the system calls that take __kernel_timespec and similar arguments.

The temporary redirects for __kernel_timespec, __kernel_itimerspec
and __kernel_timex can get removed with this.

It would be easy to split this commit by architecture, but with the new
generated system call tables, it's easy enough to do it all at once,
which makes it a little easier to check that the changes are the same
in each table.

Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/Kconfig                                |  2 +-
 arch/arm/kernel/sys_oabi-compat.c           |  8 +--
 arch/arm/tools/syscall.tbl                  | 46 ++++++++--------
 arch/m68k/kernel/syscalls/syscall.tbl       | 42 +++++++--------
 arch/microblaze/kernel/syscalls/syscall.tbl | 46 ++++++++--------
 arch/mips/kernel/syscalls/syscall_o32.tbl   | 44 ++++++++--------
 arch/parisc/kernel/syscalls/syscall.tbl     | 69 ++++++++++++++++--------
 arch/powerpc/kernel/syscalls/syscall.tbl    | 82 +++++++++++++++++++++--------
 arch/sh/kernel/syscalls/syscall.tbl         | 42 +++++++--------
 arch/sparc/kernel/syscalls/syscall.tbl      | 64 ++++++++++++++--------
 arch/x86/entry/syscalls/syscall_32.tbl      | 44 ++++++++--------
 arch/xtensa/kernel/syscalls/syscall.tbl     | 44 ++++++++--------
 include/uapi/asm-generic/unistd.h           | 56 ++++++++++----------
 13 files changed, 335 insertions(+), 254 deletions(-)

(limited to 'include/uapi')

diff --git a/arch/Kconfig b/arch/Kconfig
index 4cfb6de48f79..46db715a7f42 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -759,7 +759,7 @@ config 64BIT_TIME
 	  handling.
 
 config COMPAT_32BIT_TIME
-	def_bool (!64BIT && 64BIT_TIME) || COMPAT
+	def_bool !64BIT || COMPAT
 	help
 	  This enables 32 bit time_t support in addition to 64 bit time_t support.
 	  This is relevant on all 32-bit architectures, and 64-bit architectures
diff --git a/arch/arm/kernel/sys_oabi-compat.c b/arch/arm/kernel/sys_oabi-compat.c
index 92ab36f38795..acd054a42ba2 100644
--- a/arch/arm/kernel/sys_oabi-compat.c
+++ b/arch/arm/kernel/sys_oabi-compat.c
@@ -317,10 +317,10 @@ struct oabi_sembuf {
 asmlinkage long sys_oabi_semtimedop(int semid,
 				    struct oabi_sembuf __user *tsops,
 				    unsigned nsops,
-				    const struct timespec __user *timeout)
+				    const struct old_timespec32 __user *timeout)
 {
 	struct sembuf *sops;
-	struct timespec local_timeout;
+	struct old_timespec32 local_timeout;
 	long err;
 	int i;
 
@@ -350,7 +350,7 @@ asmlinkage long sys_oabi_semtimedop(int semid,
 	} else {
 		mm_segment_t fs = get_fs();
 		set_fs(KERNEL_DS);
-		err = sys_semtimedop(semid, sops, nsops, timeout);
+		err = sys_semtimedop_time32(semid, sops, nsops, timeout);
 		set_fs(fs);
 	}
 	kfree(sops);
@@ -375,7 +375,7 @@ asmlinkage int sys_oabi_ipc(uint call, int first, int second, int third,
 		return  sys_oabi_semtimedop(first,
 					    (struct oabi_sembuf __user *)ptr,
 					    second,
-					    (const struct timespec __user *)fifth);
+					    (const struct old_timespec32 __user *)fifth);
 	default:
 		return sys_ipc(call, first, second, third, ptr, fifth);
 	}
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index b54b7f2bc24a..200f4b878a46 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -137,7 +137,7 @@
 121	common	setdomainname		sys_setdomainname
 122	common	uname			sys_newuname
 # 123 was sys_modify_ldt
-124	common	adjtimex		sys_adjtimex
+124	common	adjtimex		sys_adjtimex_time32
 125	common	mprotect		sys_mprotect
 126	common	sigprocmask		sys_sigprocmask
 # 127 was sys_create_module
@@ -174,8 +174,8 @@
 158	common	sched_yield		sys_sched_yield
 159	common	sched_get_priority_max	sys_sched_get_priority_max
 160	common	sched_get_priority_min	sys_sched_get_priority_min
-161	common	sched_rr_get_interval	sys_sched_rr_get_interval
-162	common	nanosleep		sys_nanosleep
+161	common	sched_rr_get_interval	sys_sched_rr_get_interval_time32
+162	common	nanosleep		sys_nanosleep_time32
 163	common	mremap			sys_mremap
 164	common	setresuid		sys_setresuid16
 165	common	getresuid		sys_getresuid16
@@ -190,7 +190,7 @@
 174	common	rt_sigaction		sys_rt_sigaction
 175	common	rt_sigprocmask		sys_rt_sigprocmask
 176	common	rt_sigpending		sys_rt_sigpending
-177	common	rt_sigtimedwait		sys_rt_sigtimedwait
+177	common	rt_sigtimedwait		sys_rt_sigtimedwait_time32
 178	common	rt_sigqueueinfo		sys_rt_sigqueueinfo
 179	common	rt_sigsuspend		sys_rt_sigsuspend
 180	common	pread64			sys_pread64		sys_oabi_pread64
@@ -254,12 +254,12 @@
 237	common	fremovexattr		sys_fremovexattr
 238	common	tkill			sys_tkill
 239	common	sendfile64		sys_sendfile64
-240	common	futex			sys_futex
+240	common	futex			sys_futex_time32
 241	common	sched_setaffinity	sys_sched_setaffinity
 242	common	sched_getaffinity	sys_sched_getaffinity
 243	common	io_setup		sys_io_setup
 244	common	io_destroy		sys_io_destroy
-245	common	io_getevents		sys_io_getevents
+245	common	io_getevents		sys_io_getevents_time32
 246	common	io_submit		sys_io_submit
 247	common	io_cancel		sys_io_cancel
 248	common	exit_group		sys_exit_group
@@ -272,14 +272,14 @@
 # 255 for get_thread_area
 256	common	set_tid_address		sys_set_tid_address
 257	common	timer_create		sys_timer_create
-258	common	timer_settime		sys_timer_settime
-259	common	timer_gettime		sys_timer_gettime
+258	common	timer_settime		sys_timer_settime32
+259	common	timer_gettime		sys_timer_gettime32
 260	common	timer_getoverrun	sys_timer_getoverrun
 261	common	timer_delete		sys_timer_delete
-262	common	clock_settime		sys_clock_settime
-263	common	clock_gettime		sys_clock_gettime
-264	common	clock_getres		sys_clock_getres
-265	common	clock_nanosleep		sys_clock_nanosleep
+262	common	clock_settime		sys_clock_settime32
+263	common	clock_gettime		sys_clock_gettime32
+264	common	clock_getres		sys_clock_getres_time32
+265	common	clock_nanosleep		sys_clock_nanosleep_time32
 266	common	statfs64		sys_statfs64_wrapper
 267	common	fstatfs64		sys_fstatfs64_wrapper
 268	common	tgkill			sys_tgkill
@@ -290,8 +290,8 @@
 273	common	pciconfig_write		sys_pciconfig_write
 274	common	mq_open			sys_mq_open
 275	common	mq_unlink		sys_mq_unlink
-276	common	mq_timedsend		sys_mq_timedsend
-277	common	mq_timedreceive		sys_mq_timedreceive
+276	common	mq_timedsend		sys_mq_timedsend_time32
+277	common	mq_timedreceive		sys_mq_timedreceive_time32
 278	common	mq_notify		sys_mq_notify
 279	common	mq_getsetattr		sys_mq_getsetattr
 280	common	waitid			sys_waitid
@@ -326,7 +326,7 @@
 309	common	add_key			sys_add_key
 310	common	request_key		sys_request_key
 311	common	keyctl			sys_keyctl
-312	common	semtimedop		sys_semtimedop		sys_oabi_semtimedop
+312	common	semtimedop		sys_semtimedop_time32	sys_oabi_semtimedop
 313	common	vserver
 314	common	ioprio_set		sys_ioprio_set
 315	common	ioprio_get		sys_ioprio_get
@@ -349,8 +349,8 @@
 332	common	readlinkat		sys_readlinkat
 333	common	fchmodat		sys_fchmodat
 334	common	faccessat		sys_faccessat
-335	common	pselect6		sys_pselect6
-336	common	ppoll			sys_ppoll
+335	common	pselect6		sys_pselect6_time32
+336	common	ppoll			sys_ppoll_time32
 337	common	unshare			sys_unshare
 338	common	set_robust_list		sys_set_robust_list
 339	common	get_robust_list		sys_get_robust_list
@@ -362,13 +362,13 @@
 345	common	getcpu			sys_getcpu
 346	common	epoll_pwait		sys_epoll_pwait
 347	common	kexec_load		sys_kexec_load
-348	common	utimensat		sys_utimensat
+348	common	utimensat		sys_utimensat_time32
 349	common	signalfd		sys_signalfd
 350	common	timerfd_create		sys_timerfd_create
 351	common	eventfd			sys_eventfd
 352	common	fallocate		sys_fallocate
-353	common	timerfd_settime		sys_timerfd_settime
-354	common	timerfd_gettime		sys_timerfd_gettime
+353	common	timerfd_settime		sys_timerfd_settime32
+354	common	timerfd_gettime		sys_timerfd_gettime32
 355	common	signalfd4		sys_signalfd4
 356	common	eventfd2		sys_eventfd2
 357	common	epoll_create1		sys_epoll_create1
@@ -379,14 +379,14 @@
 362	common	pwritev			sys_pwritev
 363	common	rt_tgsigqueueinfo	sys_rt_tgsigqueueinfo
 364	common	perf_event_open		sys_perf_event_open
-365	common	recvmmsg		sys_recvmmsg
+365	common	recvmmsg		sys_recvmmsg_time32
 366	common	accept4			sys_accept4
 367	common	fanotify_init		sys_fanotify_init
 368	common	fanotify_mark		sys_fanotify_mark
 369	common	prlimit64		sys_prlimit64
 370	common	name_to_handle_at	sys_name_to_handle_at
 371	common	open_by_handle_at	sys_open_by_handle_at
-372	common	clock_adjtime		sys_clock_adjtime
+372	common	clock_adjtime		sys_clock_adjtime32
 373	common	syncfs			sys_syncfs
 374	common	sendmmsg		sys_sendmmsg
 375	common	setns			sys_setns
@@ -413,6 +413,6 @@
 396	common	pkey_free		sys_pkey_free
 397	common	statx			sys_statx
 398	common	rseq			sys_rseq
-399	common	io_pgetevents		sys_io_pgetevents
+399	common	io_pgetevents		sys_io_pgetevents_time32
 400	common	migrate_pages		sys_migrate_pages
 401	common	kexec_file_load		sys_kexec_file_load
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index bffffb202f8a..01529dce1d2e 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -131,7 +131,7 @@
 121	common	setdomainname			sys_setdomainname
 122	common	uname				sys_newuname
 123	common	cacheflush			sys_cacheflush
-124	common	adjtimex			sys_adjtimex
+124	common	adjtimex			sys_adjtimex_time32
 125	common	mprotect			sys_mprotect
 126	common	sigprocmask			sys_sigprocmask
 127	common	create_module			sys_ni_syscall
@@ -168,8 +168,8 @@
 158	common	sched_yield			sys_sched_yield
 159	common	sched_get_priority_max		sys_sched_get_priority_max
 160	common	sched_get_priority_min		sys_sched_get_priority_min
-161	common	sched_rr_get_interval		sys_sched_rr_get_interval
-162	common	nanosleep			sys_nanosleep
+161	common	sched_rr_get_interval		sys_sched_rr_get_interval_time32
+162	common	nanosleep			sys_nanosleep_time32
 163	common	mremap				sys_mremap
 164	common	setresuid			sys_setresuid16
 165	common	getresuid			sys_getresuid16
@@ -184,7 +184,7 @@
 174	common	rt_sigaction			sys_rt_sigaction
 175	common	rt_sigprocmask			sys_rt_sigprocmask
 176	common	rt_sigpending			sys_rt_sigpending
-177	common	rt_sigtimedwait			sys_rt_sigtimedwait
+177	common	rt_sigtimedwait			sys_rt_sigtimedwait_time32
 178	common	rt_sigqueueinfo			sys_rt_sigqueueinfo
 179	common	rt_sigsuspend			sys_rt_sigsuspend
 180	common	pread64				sys_pread64
@@ -242,7 +242,7 @@
 232	common	removexattr			sys_removexattr
 233	common	lremovexattr			sys_lremovexattr
 234	common	fremovexattr			sys_fremovexattr
-235	common	futex				sys_futex
+235	common	futex				sys_futex_time32
 236	common	sendfile64			sys_sendfile64
 237	common	mincore				sys_mincore
 238	common	madvise				sys_madvise
@@ -250,7 +250,7 @@
 240	common	readahead			sys_readahead
 241	common	io_setup			sys_io_setup
 242	common	io_destroy			sys_io_destroy
-243	common	io_getevents			sys_io_getevents
+243	common	io_getevents			sys_io_getevents_time32
 244	common	io_submit			sys_io_submit
 245	common	io_cancel			sys_io_cancel
 246	common	fadvise64			sys_fadvise64
@@ -262,14 +262,14 @@
 252	common	remap_file_pages		sys_remap_file_pages
 253	common	set_tid_address			sys_set_tid_address
 254	common	timer_create			sys_timer_create
-255	common	timer_settime			sys_timer_settime
-256	common	timer_gettime			sys_timer_gettime
+255	common	timer_settime			sys_timer_settime32
+256	common	timer_gettime			sys_timer_gettime32
 257	common	timer_getoverrun		sys_timer_getoverrun
 258	common	timer_delete			sys_timer_delete
-259	common	clock_settime			sys_clock_settime
-260	common	clock_gettime			sys_clock_gettime
-261	common	clock_getres			sys_clock_getres
-262	common	clock_nanosleep			sys_clock_nanosleep
+259	common	clock_settime			sys_clock_settime32
+260	common	clock_gettime			sys_clock_gettime32
+261	common	clock_getres			sys_clock_getres_time32
+262	common	clock_nanosleep			sys_clock_nanosleep_time32
 263	common	statfs64			sys_statfs64
 264	common	fstatfs64			sys_fstatfs64
 265	common	tgkill				sys_tgkill
@@ -280,8 +280,8 @@
 270	common	set_mempolicy			sys_set_mempolicy
 271	common	mq_open				sys_mq_open
 272	common	mq_unlink			sys_mq_unlink
-273	common	mq_timedsend			sys_mq_timedsend
-274	common	mq_timedreceive			sys_mq_timedreceive
+273	common	mq_timedsend			sys_mq_timedsend_time32
+274	common	mq_timedreceive			sys_mq_timedreceive_time32
 275	common	mq_notify			sys_mq_notify
 276	common	mq_getsetattr			sys_mq_getsetattr
 277	common	waitid				sys_waitid
@@ -308,8 +308,8 @@
 298	common	readlinkat			sys_readlinkat
 299	common	fchmodat			sys_fchmodat
 300	common	faccessat			sys_faccessat
-301	common	pselect6			sys_pselect6
-302	common	ppoll				sys_ppoll
+301	common	pselect6			sys_pselect6_time32
+302	common	ppoll				sys_ppoll_time32
 303	common	unshare				sys_unshare
 304	common	set_robust_list			sys_set_robust_list
 305	common	get_robust_list			sys_get_robust_list
@@ -323,13 +323,13 @@
 313	common	kexec_load			sys_kexec_load
 314	common	getcpu				sys_getcpu
 315	common	epoll_pwait			sys_epoll_pwait
-316	common	utimensat			sys_utimensat
+316	common	utimensat			sys_utimensat_time32
 317	common	signalfd			sys_signalfd
 318	common	timerfd_create			sys_timerfd_create
 319	common	eventfd				sys_eventfd
 320	common	fallocate			sys_fallocate
-321	common	timerfd_settime			sys_timerfd_settime
-322	common	timerfd_gettime			sys_timerfd_gettime
+321	common	timerfd_settime			sys_timerfd_settime32
+322	common	timerfd_gettime			sys_timerfd_gettime32
 323	common	signalfd4			sys_signalfd4
 324	common	eventfd2			sys_eventfd2
 325	common	epoll_create1			sys_epoll_create1
@@ -349,7 +349,7 @@
 339	common	prlimit64			sys_prlimit64
 340	common	name_to_handle_at		sys_name_to_handle_at
 341	common	open_by_handle_at		sys_open_by_handle_at
-342	common	clock_adjtime			sys_clock_adjtime
+342	common	clock_adjtime			sys_clock_adjtime32
 343	common	syncfs				sys_syncfs
 344	common	setns				sys_setns
 345	common	process_vm_readv		sys_process_vm_readv
@@ -378,7 +378,7 @@
 368	common	recvfrom			sys_recvfrom
 369	common	recvmsg				sys_recvmsg
 370	common	shutdown			sys_shutdown
-371	common	recvmmsg			sys_recvmmsg
+371	common	recvmmsg			sys_recvmmsg_time32
 372	common	sendmmsg			sys_sendmmsg
 373	common	userfaultfd			sys_userfaultfd
 374	common	membarrier			sys_membarrier
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 7cc0f9554da3..492ff5c35b68 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -131,7 +131,7 @@
 121	common	setdomainname			sys_setdomainname
 122	common	uname				sys_newuname
 123	common	modify_ldt			sys_ni_syscall
-124	common	adjtimex			sys_adjtimex
+124	common	adjtimex			sys_adjtimex_time32
 125	common	mprotect			sys_mprotect
 126	common	sigprocmask			sys_sigprocmask
 127	common	create_module			sys_ni_syscall
@@ -168,8 +168,8 @@
 158	common	sched_yield			sys_sched_yield
 159	common	sched_get_priority_max		sys_sched_get_priority_max
 160	common	sched_get_priority_min		sys_sched_get_priority_min
-161	common	sched_rr_get_interval		sys_sched_rr_get_interval
-162	common	nanosleep			sys_nanosleep
+161	common	sched_rr_get_interval		sys_sched_rr_get_interval_time32
+162	common	nanosleep			sys_nanosleep_time32
 163	common	mremap				sys_mremap
 164	common	setresuid			sys_setresuid
 165	common	getresuid			sys_getresuid
@@ -184,7 +184,7 @@
 174	common	rt_sigaction			sys_rt_sigaction
 175	common	rt_sigprocmask			sys_rt_sigprocmask
 176	common	rt_sigpending			sys_rt_sigpending
-177	common	rt_sigtimedwait			sys_rt_sigtimedwait
+177	common	rt_sigtimedwait			sys_rt_sigtimedwait_time32
 178	common	rt_sigqueueinfo			sys_rt_sigqueueinfo
 179	common	rt_sigsuspend			sys_rt_sigsuspend
 180	common	pread64				sys_pread64
@@ -247,14 +247,14 @@
 237	common	fremovexattr			sys_fremovexattr
 238	common	tkill				sys_tkill
 239	common	sendfile64			sys_sendfile64
-240	common	futex				sys_futex
+240	common	futex				sys_futex_time32
 241	common	sched_setaffinity		sys_sched_setaffinity
 242	common	sched_getaffinity		sys_sched_getaffinity
 243	common	set_thread_area			sys_ni_syscall
 244	common	get_thread_area			sys_ni_syscall
 245	common	io_setup			sys_io_setup
 246	common	io_destroy			sys_io_destroy
-247	common	io_getevents			sys_io_getevents
+247	common	io_getevents			sys_io_getevents_time32
 248	common	io_submit			sys_io_submit
 249	common	io_cancel			sys_io_cancel
 250	common	fadvise64			sys_fadvise64
@@ -267,14 +267,14 @@
 257	common	remap_file_pages		sys_remap_file_pages
 258	common	set_tid_address			sys_set_tid_address
 259	common	timer_create			sys_timer_create
-260	common	timer_settime			sys_timer_settime
-261	common	timer_gettime			sys_timer_gettime
+260	common	timer_settime			sys_timer_settime32
+261	common	timer_gettime			sys_timer_gettime32
 262	common	timer_getoverrun		sys_timer_getoverrun
 263	common	timer_delete			sys_timer_delete
-264	common	clock_settime			sys_clock_settime
-265	common	clock_gettime			sys_clock_gettime
-266	common	clock_getres			sys_clock_getres
-267	common	clock_nanosleep			sys_clock_nanosleep
+264	common	clock_settime			sys_clock_settime32
+265	common	clock_gettime			sys_clock_gettime32
+266	common	clock_getres			sys_clock_getres_time32
+267	common	clock_nanosleep			sys_clock_nanosleep_time32
 268	common	statfs64			sys_statfs64
 269	common	fstatfs64			sys_fstatfs64
 270	common	tgkill				sys_tgkill
@@ -286,8 +286,8 @@
 276	common	set_mempolicy			sys_set_mempolicy
 277	common	mq_open				sys_mq_open
 278	common	mq_unlink			sys_mq_unlink
-279	common	mq_timedsend			sys_mq_timedsend
-280	common	mq_timedreceive			sys_mq_timedreceive
+279	common	mq_timedsend			sys_mq_timedsend_time32
+280	common	mq_timedreceive			sys_mq_timedreceive_time32
 281	common	mq_notify			sys_mq_notify
 282	common	mq_getsetattr			sys_mq_getsetattr
 283	common	kexec_load			sys_kexec_load
@@ -315,8 +315,8 @@
 305	common	readlinkat			sys_readlinkat
 306	common	fchmodat			sys_fchmodat
 307	common	faccessat			sys_faccessat
-308	common	pselect6			sys_pselect6
-309	common	ppoll				sys_ppoll
+308	common	pselect6			sys_pselect6_time32
+309	common	ppoll				sys_ppoll_time32
 310	common	unshare				sys_unshare
 311	common	set_robust_list			sys_set_robust_list
 312	common	get_robust_list			sys_get_robust_list
@@ -327,14 +327,14 @@
 317	common	move_pages			sys_move_pages
 318	common	getcpu				sys_getcpu
 319	common	epoll_pwait			sys_epoll_pwait
-320	common	utimensat			sys_utimensat
+320	common	utimensat			sys_utimensat_time32
 321	common	signalfd			sys_signalfd
 322	common	timerfd_create			sys_timerfd_create
 323	common	eventfd				sys_eventfd
 324	common	fallocate			sys_fallocate
-325	common	semtimedop			sys_semtimedop
-326	common	timerfd_settime			sys_timerfd_settime
-327	common	timerfd_gettime			sys_timerfd_gettime
+325	common	semtimedop			sys_semtimedop_time32
+326	common	timerfd_settime			sys_timerfd_settime32
+327	common	timerfd_gettime			sys_timerfd_gettime32
 328	common	semctl				sys_old_semctl
 329	common	semget				sys_semget
 330	common	semop				sys_semop
@@ -374,13 +374,13 @@
 364	common	pwritev				sys_pwritev
 365	common	rt_tgsigqueueinfo		sys_rt_tgsigqueueinfo
 366	common	perf_event_open			sys_perf_event_open
-367	common	recvmmsg			sys_recvmmsg
+367	common	recvmmsg			sys_recvmmsg_time32
 368	common	fanotify_init			sys_fanotify_init
 369	common	fanotify_mark			sys_fanotify_mark
 370	common	prlimit64			sys_prlimit64
 371	common	name_to_handle_at		sys_name_to_handle_at
 372	common	open_by_handle_at		sys_open_by_handle_at
-373	common	clock_adjtime			sys_clock_adjtime
+373	common	clock_adjtime			sys_clock_adjtime32
 374	common	syncfs				sys_syncfs
 375	common	setns				sys_setns
 376	common	sendmmsg			sys_sendmmsg
@@ -406,5 +406,5 @@
 396	common	pkey_alloc			sys_pkey_alloc
 397	common	pkey_free			sys_pkey_free
 398	common	statx				sys_statx
-399	common	io_pgetevents			sys_io_pgetevents
+399	common	io_pgetevents			sys_io_pgetevents_time32
 400	common	rseq				sys_rseq
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index e9fec7bac5a9..5642d93b64c0 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -135,7 +135,7 @@
 121	o32	setdomainname			sys_setdomainname
 122	o32	uname				sys_newuname
 123	o32	modify_ldt			sys_ni_syscall
-124	o32	adjtimex			sys_adjtimex			sys_adjtimex_time32
+124	o32	adjtimex			sys_adjtimex_time32
 125	o32	mprotect			sys_mprotect
 126	o32	sigprocmask			sys_sigprocmask			compat_sys_sigprocmask
 127	o32	create_module			sys_ni_syscall
@@ -176,8 +176,8 @@
 162	o32	sched_yield			sys_sched_yield
 163	o32	sched_get_priority_max		sys_sched_get_priority_max
 164	o32	sched_get_priority_min		sys_sched_get_priority_min
-165	o32	sched_rr_get_interval		sys_sched_rr_get_interval	sys_sched_rr_get_interval_time32
-166	o32	nanosleep			sys_nanosleep			sys_nanosleep_time32
+165	o32	sched_rr_get_interval		sys_sched_rr_get_interval_time32
+166	o32	nanosleep			sys_nanosleep_time32
 167	o32	mremap				sys_mremap
 168	o32	accept				sys_accept
 169	o32	bind				sys_bind
@@ -208,7 +208,7 @@
 194	o32	rt_sigaction			sys_rt_sigaction		compat_sys_rt_sigaction
 195	o32	rt_sigprocmask			sys_rt_sigprocmask		compat_sys_rt_sigprocmask
 196	o32	rt_sigpending			sys_rt_sigpending		compat_sys_rt_sigpending
-197	o32	rt_sigtimedwait			sys_rt_sigtimedwait		compat_sys_rt_sigtimedwait_time32
+197	o32	rt_sigtimedwait			sys_rt_sigtimedwait_time32	compat_sys_rt_sigtimedwait_time32
 198	o32	rt_sigqueueinfo			sys_rt_sigqueueinfo		compat_sys_rt_sigqueueinfo
 199	o32	rt_sigsuspend			sys_rt_sigsuspend		compat_sys_rt_sigsuspend
 200	o32	pread64				sys_pread64			sys_32_pread
@@ -249,12 +249,12 @@
 235	o32	fremovexattr			sys_fremovexattr
 236	o32	tkill				sys_tkill
 237	o32	sendfile64			sys_sendfile64
-238	o32	futex				sys_futex			sys_futex_time32
+238	o32	futex				sys_futex_time32
 239	o32	sched_setaffinity		sys_sched_setaffinity		compat_sys_sched_setaffinity
 240	o32	sched_getaffinity		sys_sched_getaffinity		compat_sys_sched_getaffinity
 241	o32	io_setup			sys_io_setup			compat_sys_io_setup
 242	o32	io_destroy			sys_io_destroy
-243	o32	io_getevents			sys_io_getevents		sys_io_getevents_time32
+243	o32	io_getevents			sys_io_getevents_time32
 244	o32	io_submit			sys_io_submit			compat_sys_io_submit
 245	o32	io_cancel			sys_io_cancel
 246	o32	exit_group			sys_exit_group
@@ -269,14 +269,14 @@
 255	o32	statfs64			sys_statfs64			compat_sys_statfs64
 256	o32	fstatfs64			sys_fstatfs64			compat_sys_fstatfs64
 257	o32	timer_create			sys_timer_create		compat_sys_timer_create
-258	o32	timer_settime			sys_timer_settime		sys_timer_settime32
-259	o32	timer_gettime			sys_timer_gettime		sys_timer_gettime32
+258	o32	timer_settime			sys_timer_settime32
+259	o32	timer_gettime			sys_timer_gettime32
 260	o32	timer_getoverrun		sys_timer_getoverrun
 261	o32	timer_delete			sys_timer_delete
-262	o32	clock_settime			sys_clock_settime		sys_clock_settime32
-263	o32	clock_gettime			sys_clock_gettime		sys_clock_gettime32
-264	o32	clock_getres			sys_clock_getres		sys_clock_getres_time32
-265	o32	clock_nanosleep			sys_clock_nanosleep		sys_clock_nanosleep_time32
+262	o32	clock_settime			sys_clock_settime32
+263	o32	clock_gettime			sys_clock_gettime32
+264	o32	clock_getres			sys_clock_getres_time32
+265	o32	clock_nanosleep			sys_clock_nanosleep_time32
 266	o32	tgkill				sys_tgkill
 267	o32	utimes				sys_utimes			sys_utimes_time32
 268	o32	mbind				sys_mbind			compat_sys_mbind
@@ -284,8 +284,8 @@
 270	o32	set_mempolicy			sys_set_mempolicy		compat_sys_set_mempolicy
 271	o32	mq_open				sys_mq_open			compat_sys_mq_open
 272	o32	mq_unlink			sys_mq_unlink
-273	o32	mq_timedsend			sys_mq_timedsend		sys_mq_timedsend_time32
-274	o32	mq_timedreceive			sys_mq_timedreceive		sys_mq_timedreceive_time32
+273	o32	mq_timedsend			sys_mq_timedsend_time32
+274	o32	mq_timedreceive			sys_mq_timedreceive_time32
 275	o32	mq_notify			sys_mq_notify			compat_sys_mq_notify
 276	o32	mq_getsetattr			sys_mq_getsetattr		compat_sys_mq_getsetattr
 277	o32	vserver				sys_ni_syscall
@@ -312,8 +312,8 @@
 298	o32	readlinkat			sys_readlinkat
 299	o32	fchmodat			sys_fchmodat
 300	o32	faccessat			sys_faccessat
-301	o32	pselect6			sys_pselect6			compat_sys_pselect6_time32
-302	o32	ppoll				sys_ppoll			compat_sys_ppoll_time32
+301	o32	pselect6			sys_pselect6_time32		compat_sys_pselect6_time32
+302	o32	ppoll				sys_ppoll_time32		compat_sys_ppoll_time32
 303	o32	unshare				sys_unshare
 304	o32	splice				sys_splice
 305	o32	sync_file_range			sys_sync_file_range		sys32_sync_file_range
@@ -327,14 +327,14 @@
 313	o32	epoll_pwait			sys_epoll_pwait			compat_sys_epoll_pwait
 314	o32	ioprio_set			sys_ioprio_set
 315	o32	ioprio_get			sys_ioprio_get
-316	o32	utimensat			sys_utimensat			sys_utimensat_time32
+316	o32	utimensat			sys_utimensat_time32
 317	o32	signalfd			sys_signalfd			compat_sys_signalfd
 318	o32	timerfd				sys_ni_syscall
 319	o32	eventfd				sys_eventfd
 320	o32	fallocate			sys_fallocate			sys32_fallocate
 321	o32	timerfd_create			sys_timerfd_create
-322	o32	timerfd_gettime			sys_timerfd_gettime		sys_timerfd_gettime32
-323	o32	timerfd_settime			sys_timerfd_settime		sys_timerfd_settime32
+322	o32	timerfd_gettime			sys_timerfd_gettime32
+323	o32	timerfd_settime			sys_timerfd_settime32
 324	o32	signalfd4			sys_signalfd4			compat_sys_signalfd4
 325	o32	eventfd2			sys_eventfd2
 326	o32	epoll_create1			sys_epoll_create1
@@ -346,13 +346,13 @@
 332	o32	rt_tgsigqueueinfo		sys_rt_tgsigqueueinfo		compat_sys_rt_tgsigqueueinfo
 333	o32	perf_event_open			sys_perf_event_open
 334	o32	accept4				sys_accept4
-335	o32	recvmmsg			sys_recvmmsg			compat_sys_recvmmsg_time32
+335	o32	recvmmsg			sys_recvmmsg_time32		compat_sys_recvmmsg_time32
 336	o32	fanotify_init			sys_fanotify_init
 337	o32	fanotify_mark			sys_fanotify_mark		compat_sys_fanotify_mark
 338	o32	prlimit64			sys_prlimit64
 339	o32	name_to_handle_at		sys_name_to_handle_at
 340	o32	open_by_handle_at		sys_open_by_handle_at		compat_sys_open_by_handle_at
-341	o32	clock_adjtime			sys_clock_adjtime		sys_clock_adjtime32
+341	o32	clock_adjtime			sys_clock_adjtime32
 342	o32	syncfs				sys_syncfs
 343	o32	sendmmsg			sys_sendmmsg			compat_sys_sendmmsg
 344	o32	setns				sys_setns
@@ -379,7 +379,7 @@
 365	o32	pkey_free			sys_pkey_free
 366	o32	statx				sys_statx
 367	o32	rseq				sys_rseq
-368	o32	io_pgetevents			sys_io_pgetevents		compat_sys_io_pgetevents
+368	o32	io_pgetevents			sys_io_pgetevents_time32	compat_sys_io_pgetevents
 # room for arch specific calls
 393	o32	semget				sys_semget
 394	o32	semctl				sys_semctl			compat_sys_semctl
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index f7440427d459..b110c7371b08 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -133,7 +133,8 @@
 121	common	setdomainname		sys_setdomainname
 122	common	sendfile		sys_sendfile			compat_sys_sendfile
 123	common	recvfrom		sys_recvfrom
-124	common	adjtimex		sys_adjtimex			sys_adjtimex_time32
+124	32	adjtimex		sys_adjtimex_time32
+124	64	adjtimex		sys_adjtimex
 125	common	mprotect		sys_mprotect
 126	common	sigprocmask		sys_sigprocmask			compat_sys_sigprocmask
 # 127 was create_module
@@ -171,8 +172,10 @@
 158	common	sched_yield		sys_sched_yield
 159	common	sched_get_priority_max	sys_sched_get_priority_max
 160	common	sched_get_priority_min	sys_sched_get_priority_min
-161	common	sched_rr_get_interval	sys_sched_rr_get_interval	sys_sched_rr_get_interval_time32
-162	common	nanosleep		sys_nanosleep			sys_nanosleep_time32
+161	32	sched_rr_get_interval	sys_sched_rr_get_interval_time32
+161	64	sched_rr_get_interval	sys_sched_rr_get_interval
+162	32	nanosleep		sys_nanosleep_time32
+162	64	nanosleep		sys_nanosleep
 163	common	mremap			sys_mremap
 164	common	setresuid		sys_setresuid
 165	common	getresuid		sys_getresuid
@@ -187,7 +190,8 @@
 174	common	rt_sigaction		sys_rt_sigaction		compat_sys_rt_sigaction
 175	common	rt_sigprocmask		sys_rt_sigprocmask		compat_sys_rt_sigprocmask
 176	common	rt_sigpending		sys_rt_sigpending		compat_sys_rt_sigpending
-177	common	rt_sigtimedwait		sys_rt_sigtimedwait		compat_sys_rt_sigtimedwait_time32
+177	32	rt_sigtimedwait		sys_rt_sigtimedwait_time32	compat_sys_rt_sigtimedwait_time32
+177	64	rt_sigtimedwait		sys_rt_sigtimedwait
 178	common	rt_sigqueueinfo		sys_rt_sigqueueinfo		compat_sys_rt_sigqueueinfo
 179	common	rt_sigsuspend		sys_rt_sigsuspend		compat_sys_rt_sigsuspend
 180	common	chown			sys_chown
@@ -223,14 +227,16 @@
 207	64	readahead		sys_readahead
 208	common	tkill			sys_tkill
 209	common	sendfile64		sys_sendfile64			compat_sys_sendfile64
-210	common	futex			sys_futex			sys_futex_time32
+210	32	futex			sys_futex_time32
+210	64	futex			sys_futex
 211	common	sched_setaffinity	sys_sched_setaffinity		compat_sys_sched_setaffinity
 212	common	sched_getaffinity	sys_sched_getaffinity		compat_sys_sched_getaffinity
 # 213 was set_thread_area
 # 214 was get_thread_area
 215	common	io_setup		sys_io_setup			compat_sys_io_setup
 216	common	io_destroy		sys_io_destroy
-217	common	io_getevents		sys_io_getevents		sys_io_getevents_time32
+217	32	io_getevents		sys_io_getevents_time32
+217	64	io_getevents		sys_io_getevents
 218	common	io_submit		sys_io_submit			compat_sys_io_submit
 219	common	io_cancel		sys_io_cancel
 # 220 was alloc_hugepages
@@ -241,11 +247,14 @@
 225	common	epoll_ctl		sys_epoll_ctl
 226	common	epoll_wait		sys_epoll_wait
 227	common	remap_file_pages	sys_remap_file_pages
-228	common	semtimedop		sys_semtimedop			sys_semtimedop_time32
+228	32	semtimedop		sys_semtimedop_time32
+228	64	semtimedop		sys_semtimedop
 229	common	mq_open			sys_mq_open			compat_sys_mq_open
 230	common	mq_unlink		sys_mq_unlink
-231	common	mq_timedsend		sys_mq_timedsend		sys_mq_timedsend_time32
-232	common	mq_timedreceive		sys_mq_timedreceive		sys_mq_timedreceive_time32
+231	32	mq_timedsend		sys_mq_timedsend_time32
+231	64	mq_timedsend		sys_mq_timedsend
+232	32	mq_timedreceive		sys_mq_timedreceive_time32
+232	64	mq_timedreceive		sys_mq_timedreceive
 233	common	mq_notify		sys_mq_notify			compat_sys_mq_notify
 234	common	mq_getsetattr		sys_mq_getsetattr		compat_sys_mq_getsetattr
 235	common	waitid			sys_waitid			compat_sys_waitid
@@ -265,14 +274,20 @@
 248	common	lremovexattr		sys_lremovexattr
 249	common	fremovexattr		sys_fremovexattr
 250	common	timer_create		sys_timer_create		compat_sys_timer_create
-251	common	timer_settime		sys_timer_settime		sys_timer_settime32
-252	common	timer_gettime		sys_timer_gettime		sys_timer_gettime32
+251	32	timer_settime		sys_timer_settime32
+251	64	timer_settime		sys_timer_settime
+252	32	timer_gettime		sys_timer_gettime32
+252	64	timer_gettime		sys_timer_gettime
 253	common	timer_getoverrun	sys_timer_getoverrun
 254	common	timer_delete		sys_timer_delete
-255	common	clock_settime		sys_clock_settime		sys_clock_settime32
-256	common	clock_gettime		sys_clock_gettime		sys_clock_gettime32
-257	common	clock_getres		sys_clock_getres		sys_clock_getres_time32
-258	common	clock_nanosleep		sys_clock_nanosleep		sys_clock_nanosleep_time32
+255	32	clock_settime		sys_clock_settime32
+255	64	clock_settime		sys_clock_settime
+256	32	clock_gettime		sys_clock_gettime32
+256	64	clock_gettime		sys_clock_gettime
+257	32	clock_getres		sys_clock_getres_time32
+257	64	clock_getres		sys_clock_getres
+258	32	clock_nanosleep		sys_clock_nanosleep_time32
+258	64	clock_nanosleep		sys_clock_nanosleep
 259	common	tgkill			sys_tgkill
 260	common	mbind			sys_mbind			compat_sys_mbind
 261	common	get_mempolicy		sys_get_mempolicy		compat_sys_get_mempolicy
@@ -287,8 +302,10 @@
 270	common	inotify_add_watch	sys_inotify_add_watch
 271	common	inotify_rm_watch	sys_inotify_rm_watch
 272	common	migrate_pages		sys_migrate_pages
-273	common	pselect6		sys_pselect6			compat_sys_pselect6_time32
-274	common	ppoll			sys_ppoll			compat_sys_ppoll_time32
+273	32	pselect6		sys_pselect6_time32		compat_sys_pselect6_time32
+273	64	pselect6		sys_pselect6
+274	32	ppoll			sys_ppoll_time32		compat_sys_ppoll_time32
+274	64	ppoll			sys_ppoll
 275	common	openat			sys_openat			compat_sys_openat
 276	common	mkdirat			sys_mkdirat
 277	common	mknodat			sys_mknodat
@@ -316,15 +333,18 @@
 298	common	statfs64		sys_statfs64			compat_sys_statfs64
 299	common	fstatfs64		sys_fstatfs64			compat_sys_fstatfs64
 300	common	kexec_load		sys_kexec_load			compat_sys_kexec_load
-301	common	utimensat		sys_utimensat			sys_utimensat_time32
+301	32	utimensat		sys_utimensat_time32
+301	64	utimensat		sys_utimensat
 302	common	signalfd		sys_signalfd			compat_sys_signalfd
 # 303 was timerfd
 304	common	eventfd			sys_eventfd
 305	32	fallocate		parisc_fallocate
 305	64	fallocate		sys_fallocate
 306	common	timerfd_create		sys_timerfd_create
-307	common	timerfd_settime		sys_timerfd_settime		sys_timerfd_settime32
-308	common	timerfd_gettime		sys_timerfd_gettime		sys_timerfd_gettime32
+307	32	timerfd_settime		sys_timerfd_settime32
+307	64	timerfd_settime		sys_timerfd_settime
+308	32	timerfd_gettime		sys_timerfd_gettime32
+308	64	timerfd_gettime		sys_timerfd_gettime
 309	common	signalfd4		sys_signalfd4			compat_sys_signalfd4
 310	common	eventfd2		sys_eventfd2
 311	common	epoll_create1		sys_epoll_create1
@@ -335,12 +355,14 @@
 316	common	pwritev	sys_pwritev	compat_sys_pwritev
 317	common	rt_tgsigqueueinfo	sys_rt_tgsigqueueinfo		compat_sys_rt_tgsigqueueinfo
 318	common	perf_event_open		sys_perf_event_open
-319	common	recvmmsg		sys_recvmmsg			compat_sys_recvmmsg_time32
+319	32	recvmmsg		sys_recvmmsg_time32		compat_sys_recvmmsg_time32
+319	64	recvmmsg		sys_recvmmsg
 320	common	accept4			sys_accept4
 321	common	prlimit64		sys_prlimit64
 322	common	fanotify_init		sys_fanotify_init
 323	common	fanotify_mark		sys_fanotify_mark		sys32_fanotify_mark
-324	common	clock_adjtime		sys_clock_adjtime		sys_clock_adjtime32
+324	32	clock_adjtime		sys_clock_adjtime32
+324	64	clock_adjtime		sys_clock_adjtime
 325	common	name_to_handle_at	sys_name_to_handle_at
 326	common	open_by_handle_at	sys_open_by_handle_at		compat_sys_open_by_handle_at
 327	common	syncfs			sys_syncfs
@@ -366,7 +388,8 @@
 347	common	preadv2			sys_preadv2			compat_sys_preadv2
 348	common	pwritev2		sys_pwritev2			compat_sys_pwritev2
 349	common	statx			sys_statx
-350	common	io_pgetevents		sys_io_pgetevents		compat_sys_io_pgetevents
+350	32	io_pgetevents		sys_io_pgetevents_time32	compat_sys_io_pgetevents
+350	64	io_pgetevents		sys_io_pgetevents
 351	common	pkey_mprotect		sys_pkey_mprotect
 352	common	pkey_alloc		sys_pkey_alloc
 353	common	pkey_free		sys_pkey_free
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index 86650dcd2185..2a8b060f73b3 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -157,7 +157,9 @@
 121	common	setdomainname			sys_setdomainname
 122	common	uname				sys_newuname
 123	common	modify_ldt			sys_ni_syscall
-124	common	adjtimex			sys_adjtimex			sys_adjtimex_time32
+124	32	adjtimex			sys_adjtimex_time32
+124	64	adjtimex			sys_adjtimex
+124	spu	adjtimex			sys_adjtimex
 125	common	mprotect			sys_mprotect
 126	32	sigprocmask			sys_sigprocmask			compat_sys_sigprocmask
 126	64	sigprocmask			sys_ni_syscall
@@ -198,8 +200,12 @@
 158	common	sched_yield			sys_sched_yield
 159	common	sched_get_priority_max		sys_sched_get_priority_max
 160	common	sched_get_priority_min		sys_sched_get_priority_min
-161	common	sched_rr_get_interval		sys_sched_rr_get_interval	sys_sched_rr_get_interval_time32
-162	common	nanosleep			sys_nanosleep			sys_nanosleep_time32
+161	32	sched_rr_get_interval		sys_sched_rr_get_interval_time32
+161	64	sched_rr_get_interval		sys_sched_rr_get_interval
+161	spu	sched_rr_get_interval		sys_sched_rr_get_interval
+162	32	nanosleep			sys_nanosleep_time32
+162	64	nanosleep			sys_nanosleep
+162	spu	nanosleep			sys_nanosleep
 163	common	mremap				sys_mremap
 164	common	setresuid			sys_setresuid
 165	common	getresuid			sys_getresuid
@@ -213,7 +219,8 @@
 173	nospu	rt_sigaction			sys_rt_sigaction		compat_sys_rt_sigaction
 174	nospu	rt_sigprocmask			sys_rt_sigprocmask		compat_sys_rt_sigprocmask
 175	nospu	rt_sigpending			sys_rt_sigpending		compat_sys_rt_sigpending
-176	nospu	rt_sigtimedwait			sys_rt_sigtimedwait		compat_sys_rt_sigtimedwait_time32
+176	32	rt_sigtimedwait			sys_rt_sigtimedwait_time32	compat_sys_rt_sigtimedwait_time32
+176	64	rt_sigtimedwait			sys_rt_sigtimedwait
 177	nospu 	rt_sigqueueinfo			sys_rt_sigqueueinfo		compat_sys_rt_sigqueueinfo
 178	nospu 	rt_sigsuspend			sys_rt_sigsuspend		compat_sys_rt_sigsuspend
 179	common	pread64				sys_pread64			compat_sys_pread64
@@ -260,7 +267,9 @@
 218	common	removexattr			sys_removexattr
 219	common	lremovexattr			sys_lremovexattr
 220	common	fremovexattr			sys_fremovexattr
-221	common	futex				sys_futex			sys_futex_time32
+221	32	futex				sys_futex_time32
+221	64	futex				sys_futex
+221	spu	futex				sys_futex
 222	common	sched_setaffinity		sys_sched_setaffinity		compat_sys_sched_setaffinity
 223	common	sched_getaffinity		sys_sched_getaffinity		compat_sys_sched_getaffinity
 # 224 unused
@@ -268,7 +277,9 @@
 226	32	sendfile64			sys_sendfile64			compat_sys_sendfile64
 227	common	io_setup			sys_io_setup			compat_sys_io_setup
 228	common	io_destroy			sys_io_destroy
-229	common	io_getevents			sys_io_getevents		sys_io_getevents_time32
+229	32	io_getevents			sys_io_getevents_time32
+229	64	io_getevents			sys_io_getevents
+229	spu	io_getevents			sys_io_getevents
 230	common	io_submit			sys_io_submit			compat_sys_io_submit
 231	common	io_cancel			sys_io_cancel
 232	nospu	set_tid_address			sys_set_tid_address
@@ -280,14 +291,26 @@
 238	common	epoll_wait			sys_epoll_wait
 239	common	remap_file_pages		sys_remap_file_pages
 240	common	timer_create			sys_timer_create		compat_sys_timer_create
-241	common	timer_settime			sys_timer_settime		sys_timer_settime32
-242	common	timer_gettime			sys_timer_gettime		sys_timer_gettime32
+241	32	timer_settime			sys_timer_settime32
+241	64	timer_settime			sys_timer_settime
+241	spu	timer_settime			sys_timer_settime
+242	32	timer_gettime			sys_timer_gettime32
+242	64	timer_gettime			sys_timer_gettime
+242	spu	timer_gettime			sys_timer_gettime
 243	common	timer_getoverrun		sys_timer_getoverrun
 244	common	timer_delete			sys_timer_delete
-245	common	clock_settime			sys_clock_settime		sys_clock_settime32
-246	common	clock_gettime			sys_clock_gettime		sys_clock_gettime32
-247	common	clock_getres			sys_clock_getres		sys_clock_getres_time32
-248	common	clock_nanosleep			sys_clock_nanosleep		sys_clock_nanosleep_time32
+245	32	clock_settime			sys_clock_settime32
+245	64	clock_settime			sys_clock_settime
+245	spu	clock_settime			sys_clock_settime
+246	32	clock_gettime			sys_clock_gettime32
+246	64	clock_gettime			sys_clock_gettime
+246	spu	clock_gettime			sys_clock_gettime
+247	32	clock_getres			sys_clock_getres_time32
+247	64	clock_getres			sys_clock_getres
+247	spu	clock_getres			sys_clock_getres
+248	32	clock_nanosleep			sys_clock_nanosleep_time32
+248	64	clock_nanosleep			sys_clock_nanosleep
+248	spu	clock_nanosleep			sys_clock_nanosleep
 249	32	swapcontext			ppc_swapcontext			ppc32_swapcontext
 249	64	swapcontext			ppc64_swapcontext
 249	spu	swapcontext			sys_ni_syscall
@@ -308,8 +331,10 @@
 261	nospu	set_mempolicy			sys_set_mempolicy		compat_sys_set_mempolicy
 262	nospu	mq_open				sys_mq_open			compat_sys_mq_open
 263	nospu	mq_unlink			sys_mq_unlink
-264	nospu	mq_timedsend			sys_mq_timedsend		sys_mq_timedsend_time32
-265	nospu	mq_timedreceive			sys_mq_timedreceive		sys_mq_timedreceive_time32
+264	32	mq_timedsend			sys_mq_timedsend_time32
+264	64	mq_timedsend			sys_mq_timedsend
+265	32	mq_timedreceive			sys_mq_timedreceive_time32
+265	64	mq_timedreceive			sys_mq_timedreceive
 266	nospu	mq_notify			sys_mq_notify			compat_sys_mq_notify
 267	nospu	mq_getsetattr			sys_mq_getsetattr		compat_sys_mq_getsetattr
 268	nospu	kexec_load			sys_kexec_load			compat_sys_kexec_load
@@ -324,8 +349,10 @@
 277	nospu	inotify_rm_watch		sys_inotify_rm_watch
 278	nospu	spu_run				sys_spu_run
 279	nospu	spu_create			sys_spu_create
-280	nospu	pselect6			sys_pselect6			compat_sys_pselect6_time32
-281	nospu	ppoll				sys_ppoll			compat_sys_ppoll_time32
+280	32	pselect6			sys_pselect6_time32		compat_sys_pselect6_time32
+280	64	pselect6			sys_pselect6
+281	32	ppoll				sys_ppoll_time32		compat_sys_ppoll_time32
+281	64	ppoll				sys_ppoll
 282	common	unshare				sys_unshare
 283	common	splice				sys_splice
 284	common	tee				sys_tee
@@ -350,15 +377,21 @@
 301	common	move_pages			sys_move_pages			compat_sys_move_pages
 302	common	getcpu				sys_getcpu
 303	nospu	epoll_pwait			sys_epoll_pwait			compat_sys_epoll_pwait
-304	common	utimensat			sys_utimensat			sys_utimensat_time32
+304	32	utimensat			sys_utimensat_time32
+304	64	utimensat			sys_utimensat
+304	spu	utimensat			sys_utimensat
 305	common	signalfd			sys_signalfd			compat_sys_signalfd
 306	common	timerfd_create			sys_timerfd_create
 307	common	eventfd				sys_eventfd
 308	common	sync_file_range2		sys_sync_file_range2		compat_sys_sync_file_range2
 309	nospu	fallocate			sys_fallocate			compat_sys_fallocate
 310	nospu	subpage_prot			sys_subpage_prot
-311	common	timerfd_settime			sys_timerfd_settime		sys_timerfd_settime32
-312	common	timerfd_gettime			sys_timerfd_gettime		sys_timerfd_gettime32
+311	32	timerfd_settime			sys_timerfd_settime32
+311	64	timerfd_settime			sys_timerfd_settime
+311	spu	timerfd_settime			sys_timerfd_settime
+312	32	timerfd_gettime			sys_timerfd_gettime32
+312	64	timerfd_gettime			sys_timerfd_gettime
+312	spu	timerfd_gettime			sys_timerfd_gettime
 313	common	signalfd4			sys_signalfd4			compat_sys_signalfd4
 314	common	eventfd2			sys_eventfd2
 315	common	epoll_create1			sys_epoll_create1
@@ -389,11 +422,15 @@
 340	common	getsockopt			sys_getsockopt			compat_sys_getsockopt
 341	common	sendmsg				sys_sendmsg			compat_sys_sendmsg
 342	common	recvmsg				sys_recvmsg			compat_sys_recvmsg
-343	common	recvmmsg			sys_recvmmsg			compat_sys_recvmmsg_time32
+343	32	recvmmsg			sys_recvmmsg_time32		compat_sys_recvmmsg_time32
+343	64	recvmmsg			sys_recvmmsg
+343	spu	recvmmsg			sys_recvmmsg
 344	common	accept4				sys_accept4
 345	common	name_to_handle_at		sys_name_to_handle_at
 346	common	open_by_handle_at		sys_open_by_handle_at		compat_sys_open_by_handle_at
-347	common	clock_adjtime			sys_clock_adjtime		sys_clock_adjtime32
+347	32	clock_adjtime			sys_clock_adjtime32
+347	64	clock_adjtime			sys_clock_adjtime
+347	spu	clock_adjtime			sys_clock_adjtime
 348	common	syncfs				sys_syncfs
 349	common	sendmmsg			sys_sendmmsg			compat_sys_sendmmsg
 350	common	setns				sys_setns
@@ -425,7 +462,8 @@
 385	nospu	pkey_free			sys_pkey_free
 386	nospu	pkey_mprotect			sys_pkey_mprotect
 387	nospu	rseq				sys_rseq
-388	nospu	io_pgetevents			sys_io_pgetevents		compat_sys_io_pgetevents
+388	32	io_pgetevents			sys_io_pgetevents_time32	compat_sys_io_pgetevents
+388	64	io_pgetevents			sys_io_pgetevents
 # room for arch specific syscalls
 392	64	semtimedop			sys_semtimedop
 393	common	semget				sys_semget
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index cafa63c6a932..a08a5312cd12 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -131,7 +131,7 @@
 121	common	setdomainname			sys_setdomainname
 122	common	uname				sys_newuname
 123	common	cacheflush			sys_cacheflush
-124	common	adjtimex			sys_adjtimex
+124	common	adjtimex			sys_adjtimex_time32
 125	common	mprotect			sys_mprotect
 126	common	sigprocmask			sys_sigprocmask
 # 127 was create_module
@@ -168,8 +168,8 @@
 158	common	sched_yield			sys_sched_yield
 159	common	sched_get_priority_max		sys_sched_get_priority_max
 160	common	sched_get_priority_min		sys_sched_get_priority_min
-161	common	sched_rr_get_interval		sys_sched_rr_get_interval
-162	common	nanosleep			sys_nanosleep
+161	common	sched_rr_get_interval		sys_sched_rr_get_interval_time32
+162	common	nanosleep			sys_nanosleep_time32
 163	common	mremap				sys_mremap
 164	common	setresuid			sys_setresuid16
 165	common	getresuid			sys_getresuid16
@@ -184,7 +184,7 @@
 174	common	rt_sigaction			sys_rt_sigaction
 175	common	rt_sigprocmask			sys_rt_sigprocmask
 176	common	rt_sigpending			sys_rt_sigpending
-177	common	rt_sigtimedwait			sys_rt_sigtimedwait
+177	common	rt_sigtimedwait			sys_rt_sigtimedwait_time32
 178	common	rt_sigqueueinfo			sys_rt_sigqueueinfo
 179	common	rt_sigsuspend			sys_rt_sigsuspend
 180	common	pread64				sys_pread_wrapper
@@ -247,14 +247,14 @@
 237	common	fremovexattr			sys_fremovexattr
 238	common	tkill				sys_tkill
 239	common	sendfile64			sys_sendfile64
-240	common	futex				sys_futex
+240	common	futex				sys_futex_time32
 241	common	sched_setaffinity		sys_sched_setaffinity
 242	common	sched_getaffinity		sys_sched_getaffinity
 # 243 is reserved for set_thread_area
 # 244 is reserved for get_thread_area
 245	common	io_setup			sys_io_setup
 246	common	io_destroy			sys_io_destroy
-247	common	io_getevents			sys_io_getevents
+247	common	io_getevents			sys_io_getevents_time32
 248	common	io_submit			sys_io_submit
 249	common	io_cancel			sys_io_cancel
 250	common	fadvise64			sys_fadvise64
@@ -267,14 +267,14 @@
 257	common	remap_file_pages		sys_remap_file_pages
 258	common	set_tid_address			sys_set_tid_address
 259	common	timer_create			sys_timer_create
-260	common	timer_settime			sys_timer_settime
-261	common	timer_gettime			sys_timer_gettime
+260	common	timer_settime			sys_timer_settime32
+261	common	timer_gettime			sys_timer_gettime32
 262	common	timer_getoverrun		sys_timer_getoverrun
 263	common	timer_delete			sys_timer_delete
-264	common	clock_settime			sys_clock_settime
-265	common	clock_gettime			sys_clock_gettime
-266	common	clock_getres			sys_clock_getres
-267	common	clock_nanosleep			sys_clock_nanosleep
+264	common	clock_settime			sys_clock_settime32
+265	common	clock_gettime			sys_clock_gettime32
+266	common	clock_getres			sys_clock_getres_time32
+267	common	clock_nanosleep			sys_clock_nanosleep_time32
 268	common	statfs64			sys_statfs64
 269	common	fstatfs64			sys_fstatfs64
 270	common	tgkill				sys_tgkill
@@ -286,8 +286,8 @@
 276	common	set_mempolicy			sys_set_mempolicy
 277	common	mq_open				sys_mq_open
 278	common	mq_unlink			sys_mq_unlink
-279	common	mq_timedsend			sys_mq_timedsend
-280	common	mq_timedreceive			sys_mq_timedreceive
+279	common	mq_timedsend			sys_mq_timedsend_time32
+280	common	mq_timedreceive			sys_mq_timedreceive_time32
 281	common	mq_notify			sys_mq_notify
 282	common	mq_getsetattr			sys_mq_getsetattr
 283	common	kexec_load			sys_kexec_load
@@ -315,8 +315,8 @@
 305	common	readlinkat			sys_readlinkat
 306	common	fchmodat			sys_fchmodat
 307	common	faccessat			sys_faccessat
-308	common	pselect6			sys_pselect6
-309	common	ppoll				sys_ppoll
+308	common	pselect6			sys_pselect6_time32
+309	common	ppoll				sys_ppoll_time32
 310	common	unshare				sys_unshare
 311	common	set_robust_list			sys_set_robust_list
 312	common	get_robust_list			sys_get_robust_list
@@ -327,13 +327,13 @@
 317	common	move_pages			sys_move_pages
 318	common	getcpu				sys_getcpu
 319	common	epoll_pwait			sys_epoll_pwait
-320	common	utimensat			sys_utimensat
+320	common	utimensat			sys_utimensat_time32
 321	common	signalfd			sys_signalfd
 322	common	timerfd_create			sys_timerfd_create
 323	common	eventfd				sys_eventfd
 324	common	fallocate			sys_fallocate
-325	common	timerfd_settime			sys_timerfd_settime
-326	common	timerfd_gettime			sys_timerfd_gettime
+325	common	timerfd_settime			sys_timerfd_settime32
+326	common	timerfd_gettime			sys_timerfd_gettime32
 327	common	signalfd4			sys_signalfd4
 328	common	eventfd2			sys_eventfd2
 329	common	epoll_create1			sys_epoll_create1
@@ -364,11 +364,11 @@
 354	common	getsockopt			sys_getsockopt
 355	common	sendmsg				sys_sendmsg
 356	common	recvmsg				sys_recvmsg
-357	common	recvmmsg			sys_recvmmsg
+357	common	recvmmsg			sys_recvmmsg_time32
 358	common	accept4				sys_accept4
 359	common	name_to_handle_at		sys_name_to_handle_at
 360	common	open_by_handle_at		sys_open_by_handle_at
-361	common	clock_adjtime			sys_clock_adjtime
+361	common	clock_adjtime			sys_clock_adjtime32
 362	common	syncfs				sys_syncfs
 363	common	sendmmsg			sys_sendmmsg
 364	common	setns				sys_setns
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index 7cb05b50aeaa..dc1e08040b39 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -128,7 +128,8 @@
 102	common	rt_sigaction		sys_rt_sigaction		compat_sys_rt_sigaction
 103	common	rt_sigprocmask		sys_rt_sigprocmask		compat_sys_rt_sigprocmask
 104	common	rt_sigpending		sys_rt_sigpending		compat_sys_rt_sigpending
-105	common	rt_sigtimedwait		sys_rt_sigtimedwait		compat_sys_rt_sigtimedwait_time32
+105	32	rt_sigtimedwait		sys_rt_sigtimedwait_time32	compat_sys_rt_sigtimedwait_time32
+105	64	rt_sigtimedwait		sys_rt_sigtimedwait
 106	common	rt_sigqueueinfo		sys_rt_sigqueueinfo		compat_sys_rt_sigqueueinfo
 107	common	rt_sigsuspend		sys_rt_sigsuspend		compat_sys_rt_sigsuspend
 108	32	setresuid32		sys_setresuid
@@ -172,7 +173,8 @@
 139	common	stat64			sys_stat64			compat_sys_stat64
 140	common	sendfile64		sys_sendfile64
 141	common	getpeername		sys_getpeername
-142	common	futex			sys_futex			sys_futex_time32
+142	32	futex			sys_futex_time32
+142	64	futex			sys_futex
 143	common	gettid			sys_gettid
 144	common	getrlimit		sys_getrlimit			compat_sys_getrlimit
 145	common	setrlimit		sys_setrlimit			compat_sys_setrlimit
@@ -258,7 +260,7 @@
 216	64	sigreturn		sys_nis_syscall
 217	common	clone			sys_clone
 218	common	ioprio_get		sys_ioprio_get
-219	32	adjtimex		sys_adjtimex			sys_adjtimex_time32
+219	32	adjtimex		sys_adjtimex_time32
 219	64	adjtimex		sys_sparc_adjtimex
 220	32	sigprocmask		sys_sigprocmask			compat_sys_sigprocmask
 220	64	sigprocmask		sys_nis_syscall
@@ -289,8 +291,10 @@
 245	common	sched_yield		sys_sched_yield
 246	common	sched_get_priority_max	sys_sched_get_priority_max
 247	common	sched_get_priority_min	sys_sched_get_priority_min
-248	common	sched_rr_get_interval	sys_sched_rr_get_interval	sys_sched_rr_get_interval_time32
-249	common	nanosleep		sys_nanosleep			sys_nanosleep_time32
+248	32	sched_rr_get_interval	sys_sched_rr_get_interval_time32
+248	64	sched_rr_get_interval	sys_sched_rr_get_interval
+249	32	nanosleep		sys_nanosleep_time32
+249	64	nanosleep		sys_nanosleep
 250	32	mremap			sys_mremap
 250	64	mremap			sys_64_mremap
 251	common	_sysctl			sys_sysctl			compat_sys_sysctl
@@ -299,14 +303,20 @@
 254	32	nfsservctl		sys_ni_syscall			sys_nis_syscall
 254	64	nfsservctl		sys_nis_syscall
 255	common	sync_file_range		sys_sync_file_range		compat_sys_sync_file_range
-256	common	clock_settime		sys_clock_settime		sys_clock_settime32
-257	common	clock_gettime		sys_clock_gettime		sys_clock_gettime32
-258	common	clock_getres		sys_clock_getres		sys_clock_getres_time32
-259	common	clock_nanosleep		sys_clock_nanosleep		sys_clock_nanosleep_time32
+256	32	clock_settime		sys_clock_settime32
+256	64	clock_settime		sys_clock_settime
+257	32	clock_gettime		sys_clock_gettime32
+257	64	clock_gettime		sys_clock_gettime
+258	32	clock_getres		sys_clock_getres_time32
+258	64	clock_getres		sys_clock_getres
+259	32	clock_nanosleep		sys_clock_nanosleep_time32
+259	64	clock_nanosleep		sys_clock_nanosleep
 260	common	sched_getaffinity	sys_sched_getaffinity		compat_sys_sched_getaffinity
 261	common	sched_setaffinity	sys_sched_setaffinity		compat_sys_sched_setaffinity
-262	common	timer_settime		sys_timer_settime		sys_timer_settime32
-263	common	timer_gettime		sys_timer_gettime		sys_timer_gettime32
+262	32	timer_settime		sys_timer_settime32
+262	64	timer_settime		sys_timer_settime
+263	32	timer_gettime		sys_timer_gettime32
+263	64	timer_gettime		sys_timer_gettime
 264	common	timer_getoverrun	sys_timer_getoverrun
 265	common	timer_delete		sys_timer_delete
 266	common	timer_create		sys_timer_create		compat_sys_timer_create
@@ -316,11 +326,14 @@
 269	common	io_destroy		sys_io_destroy
 270	common	io_submit		sys_io_submit			compat_sys_io_submit
 271	common	io_cancel		sys_io_cancel
-272	common	io_getevents		sys_io_getevents		sys_io_getevents_time32
+272	32	io_getevents		sys_io_getevents_time32
+272	64	io_getevents		sys_io_getevents
 273	common	mq_open			sys_mq_open			compat_sys_mq_open
 274	common	mq_unlink		sys_mq_unlink
-275	common	mq_timedsend		sys_mq_timedsend		sys_mq_timedsend_time32
-276	common	mq_timedreceive		sys_mq_timedreceive		sys_mq_timedreceive_time32
+275	32	mq_timedsend		sys_mq_timedsend_time32
+275	64	mq_timedsend		sys_mq_timedsend
+276	32	mq_timedreceive		sys_mq_timedreceive_time32
+276	64	mq_timedreceive		sys_mq_timedreceive
 277	common	mq_notify		sys_mq_notify			compat_sys_mq_notify
 278	common	mq_getsetattr		sys_mq_getsetattr		compat_sys_mq_getsetattr
 279	common	waitid			sys_waitid			compat_sys_waitid
@@ -341,8 +354,10 @@
 294	common	readlinkat		sys_readlinkat
 295	common	fchmodat		sys_fchmodat
 296	common	faccessat		sys_faccessat
-297	common	pselect6		sys_pselect6			compat_sys_pselect6_time32
-298	common	ppoll			sys_ppoll			compat_sys_ppoll_time32
+297	32	pselect6		sys_pselect6_time32		compat_sys_pselect6_time32
+297	64	pselect6		sys_pselect6
+298	32	ppoll			sys_ppoll_time32		compat_sys_ppoll_time32
+298	64	ppoll			sys_ppoll
 299	common	unshare			sys_unshare
 300	common	set_robust_list		sys_set_robust_list		compat_sys_set_robust_list
 301	common	get_robust_list		sys_get_robust_list		compat_sys_get_robust_list
@@ -354,13 +369,16 @@
 307	common	move_pages		sys_move_pages			compat_sys_move_pages
 308	common	getcpu			sys_getcpu
 309	common	epoll_pwait		sys_epoll_pwait			compat_sys_epoll_pwait
-310	common	utimensat		sys_utimensat			sys_utimensat_time32
+310	32	utimensat		sys_utimensat_time32
+310	64	utimensat		sys_utimensat
 311	common	signalfd		sys_signalfd			compat_sys_signalfd
 312	common	timerfd_create		sys_timerfd_create
 313	common	eventfd			sys_eventfd
 314	common	fallocate		sys_fallocate			compat_sys_fallocate
-315	common	timerfd_settime		sys_timerfd_settime		sys_timerfd_settime32
-316	common	timerfd_gettime		sys_timerfd_gettime		sys_timerfd_gettime32
+315	32	timerfd_settime		sys_timerfd_settime32
+315	64	timerfd_settime		sys_timerfd_settime
+316	32	timerfd_gettime		sys_timerfd_gettime32
+316	64	timerfd_gettime		sys_timerfd_gettime
 317	common	signalfd4		sys_signalfd4			compat_sys_signalfd4
 318	common	eventfd2		sys_eventfd2
 319	common	epoll_create1		sys_epoll_create1
@@ -372,13 +390,14 @@
 325	common	pwritev			sys_pwritev			compat_sys_pwritev
 326	common	rt_tgsigqueueinfo	sys_rt_tgsigqueueinfo		compat_sys_rt_tgsigqueueinfo
 327	common	perf_event_open		sys_perf_event_open
-328	common	recvmmsg		sys_recvmmsg			compat_sys_recvmmsg_time32
+328	32	recvmmsg		sys_recvmmsg_time32		compat_sys_recvmmsg_time32
+328	64	recvmmsg		sys_recvmmsg
 329	common	fanotify_init		sys_fanotify_init
 330	common	fanotify_mark		sys_fanotify_mark		compat_sys_fanotify_mark
 331	common	prlimit64		sys_prlimit64
 332	common	name_to_handle_at	sys_name_to_handle_at
 333	common	open_by_handle_at	sys_open_by_handle_at		compat_sys_open_by_handle_at
-334	32	clock_adjtime		sys_clock_adjtime		sys_clock_adjtime32
+334	32	clock_adjtime		sys_clock_adjtime32
 334	64	clock_adjtime		sys_sparc_clock_adjtime
 335	common	syncfs			sys_syncfs
 336	common	sendmmsg		sys_sendmmsg			compat_sys_sendmmsg
@@ -408,7 +427,8 @@
 358	common	preadv2			sys_preadv2			compat_sys_preadv2
 359	common	pwritev2		sys_pwritev2			compat_sys_pwritev2
 360	common	statx			sys_statx
-361	common	io_pgetevents		sys_io_pgetevents		compat_sys_io_pgetevents
+361	32	io_pgetevents		sys_io_pgetevents_time32	compat_sys_io_pgetevents
+361	64	io_pgetevents		sys_io_pgetevents
 362	common	pkey_mprotect		sys_pkey_mprotect
 363	common	pkey_alloc		sys_pkey_alloc
 364	common	pkey_free		sys_pkey_free
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 7705d5ecad25..28888d6b7f61 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -135,7 +135,7 @@
 121	i386	setdomainname		sys_setdomainname		__ia32_sys_setdomainname
 122	i386	uname			sys_newuname			__ia32_sys_newuname
 123	i386	modify_ldt		sys_modify_ldt			__ia32_sys_modify_ldt
-124	i386	adjtimex		sys_adjtimex			__ia32_sys_adjtimex_time32
+124	i386	adjtimex		sys_adjtimex_time32			__ia32_sys_adjtimex_time32
 125	i386	mprotect		sys_mprotect			__ia32_sys_mprotect
 126	i386	sigprocmask		sys_sigprocmask			__ia32_compat_sys_sigprocmask
 127	i386	create_module
@@ -172,8 +172,8 @@
 158	i386	sched_yield		sys_sched_yield			__ia32_sys_sched_yield
 159	i386	sched_get_priority_max	sys_sched_get_priority_max	__ia32_sys_sched_get_priority_max
 160	i386	sched_get_priority_min	sys_sched_get_priority_min	__ia32_sys_sched_get_priority_min
-161	i386	sched_rr_get_interval	sys_sched_rr_get_interval	__ia32_sys_sched_rr_get_interval_time32
-162	i386	nanosleep		sys_nanosleep			__ia32_sys_nanosleep_time32
+161	i386	sched_rr_get_interval	sys_sched_rr_get_interval_time32	__ia32_sys_sched_rr_get_interval_time32
+162	i386	nanosleep		sys_nanosleep_time32		__ia32_sys_nanosleep_time32
 163	i386	mremap			sys_mremap			__ia32_sys_mremap
 164	i386	setresuid		sys_setresuid16			__ia32_sys_setresuid16
 165	i386	getresuid		sys_getresuid16			__ia32_sys_getresuid16
@@ -188,7 +188,7 @@
 174	i386	rt_sigaction		sys_rt_sigaction		__ia32_compat_sys_rt_sigaction
 175	i386	rt_sigprocmask		sys_rt_sigprocmask		__ia32_sys_rt_sigprocmask
 176	i386	rt_sigpending		sys_rt_sigpending		__ia32_compat_sys_rt_sigpending
-177	i386	rt_sigtimedwait		sys_rt_sigtimedwait		__ia32_compat_sys_rt_sigtimedwait_time32
+177	i386	rt_sigtimedwait		sys_rt_sigtimedwait_time32	__ia32_compat_sys_rt_sigtimedwait_time32
 178	i386	rt_sigqueueinfo		sys_rt_sigqueueinfo		__ia32_compat_sys_rt_sigqueueinfo
 179	i386	rt_sigsuspend		sys_rt_sigsuspend		__ia32_sys_rt_sigsuspend
 180	i386	pread64			sys_pread64			__ia32_compat_sys_x86_pread
@@ -251,14 +251,14 @@
 237	i386	fremovexattr		sys_fremovexattr		__ia32_sys_fremovexattr
 238	i386	tkill			sys_tkill			__ia32_sys_tkill
 239	i386	sendfile64		sys_sendfile64			__ia32_sys_sendfile64
-240	i386	futex			sys_futex			__ia32_sys_futex_time32
+240	i386	futex			sys_futex_time32		__ia32_sys_futex_time32
 241	i386	sched_setaffinity	sys_sched_setaffinity		__ia32_compat_sys_sched_setaffinity
 242	i386	sched_getaffinity	sys_sched_getaffinity		__ia32_compat_sys_sched_getaffinity
 243	i386	set_thread_area		sys_set_thread_area		__ia32_sys_set_thread_area
 244	i386	get_thread_area		sys_get_thread_area		__ia32_sys_get_thread_area
 245	i386	io_setup		sys_io_setup			__ia32_compat_sys_io_setup
 246	i386	io_destroy		sys_io_destroy			__ia32_sys_io_destroy
-247	i386	io_getevents		sys_io_getevents		__ia32_sys_io_getevents_time32
+247	i386	io_getevents		sys_io_getevents_time32		__ia32_sys_io_getevents_time32
 248	i386	io_submit		sys_io_submit			__ia32_compat_sys_io_submit
 249	i386	io_cancel		sys_io_cancel			__ia32_sys_io_cancel
 250	i386	fadvise64		sys_fadvise64			__ia32_compat_sys_x86_fadvise64
@@ -271,14 +271,14 @@
 257	i386	remap_file_pages	sys_remap_file_pages		__ia32_sys_remap_file_pages
 258	i386	set_tid_address		sys_set_tid_address		__ia32_sys_set_tid_address
 259	i386	timer_create		sys_timer_create		__ia32_compat_sys_timer_create
-260	i386	timer_settime		sys_timer_settime		__ia32_sys_timer_settime32
-261	i386	timer_gettime		sys_timer_gettime		__ia32_sys_timer_gettime32
+260	i386	timer_settime		sys_timer_settime32		__ia32_sys_timer_settime32
+261	i386	timer_gettime		sys_timer_gettime32		__ia32_sys_timer_gettime32
 262	i386	timer_getoverrun	sys_timer_getoverrun		__ia32_sys_timer_getoverrun
 263	i386	timer_delete		sys_timer_delete		__ia32_sys_timer_delete
-264	i386	clock_settime		sys_clock_settime		__ia32_sys_clock_settime32
-265	i386	clock_gettime		sys_clock_gettime		__ia32_sys_clock_gettime32
-266	i386	clock_getres		sys_clock_getres		__ia32_sys_clock_getres_time32
-267	i386	clock_nanosleep		sys_clock_nanosleep		__ia32_sys_clock_nanosleep_time32
+264	i386	clock_settime		sys_clock_settime32		__ia32_sys_clock_settime32
+265	i386	clock_gettime		sys_clock_gettime32		__ia32_sys_clock_gettime32
+266	i386	clock_getres		sys_clock_getres_time32		__ia32_sys_clock_getres_time32
+267	i386	clock_nanosleep		sys_clock_nanosleep_time32	__ia32_sys_clock_nanosleep_time32
 268	i386	statfs64		sys_statfs64			__ia32_compat_sys_statfs64
 269	i386	fstatfs64		sys_fstatfs64			__ia32_compat_sys_fstatfs64
 270	i386	tgkill			sys_tgkill			__ia32_sys_tgkill
@@ -290,8 +290,8 @@
 276	i386	set_mempolicy		sys_set_mempolicy		__ia32_sys_set_mempolicy
 277	i386	mq_open			sys_mq_open			__ia32_compat_sys_mq_open
 278	i386	mq_unlink		sys_mq_unlink			__ia32_sys_mq_unlink
-279	i386	mq_timedsend		sys_mq_timedsend		__ia32_sys_mq_timedsend_time32
-280	i386	mq_timedreceive		sys_mq_timedreceive		__ia32_sys_mq_timedreceive_time32
+279	i386	mq_timedsend		sys_mq_timedsend_time32		__ia32_sys_mq_timedsend_time32
+280	i386	mq_timedreceive		sys_mq_timedreceive_time32	__ia32_sys_mq_timedreceive_time32
 281	i386	mq_notify		sys_mq_notify			__ia32_compat_sys_mq_notify
 282	i386	mq_getsetattr		sys_mq_getsetattr		__ia32_compat_sys_mq_getsetattr
 283	i386	kexec_load		sys_kexec_load			__ia32_compat_sys_kexec_load
@@ -319,8 +319,8 @@
 305	i386	readlinkat		sys_readlinkat			__ia32_sys_readlinkat
 306	i386	fchmodat		sys_fchmodat			__ia32_sys_fchmodat
 307	i386	faccessat		sys_faccessat			__ia32_sys_faccessat
-308	i386	pselect6		sys_pselect6			__ia32_compat_sys_pselect6_time32
-309	i386	ppoll			sys_ppoll			__ia32_compat_sys_ppoll_time32
+308	i386	pselect6		sys_pselect6_time32		__ia32_compat_sys_pselect6_time32
+309	i386	ppoll			sys_ppoll_time32		__ia32_compat_sys_ppoll_time32
 310	i386	unshare			sys_unshare			__ia32_sys_unshare
 311	i386	set_robust_list		sys_set_robust_list		__ia32_compat_sys_set_robust_list
 312	i386	get_robust_list		sys_get_robust_list		__ia32_compat_sys_get_robust_list
@@ -331,13 +331,13 @@
 317	i386	move_pages		sys_move_pages			__ia32_compat_sys_move_pages
 318	i386	getcpu			sys_getcpu			__ia32_sys_getcpu
 319	i386	epoll_pwait		sys_epoll_pwait			__ia32_sys_epoll_pwait
-320	i386	utimensat		sys_utimensat			__ia32_sys_utimensat_time32
+320	i386	utimensat		sys_utimensat_time32		__ia32_sys_utimensat_time32
 321	i386	signalfd		sys_signalfd			__ia32_compat_sys_signalfd
 322	i386	timerfd_create		sys_timerfd_create		__ia32_sys_timerfd_create
 323	i386	eventfd			sys_eventfd			__ia32_sys_eventfd
 324	i386	fallocate		sys_fallocate			__ia32_compat_sys_x86_fallocate
-325	i386	timerfd_settime		sys_timerfd_settime		__ia32_sys_timerfd_settime32
-326	i386	timerfd_gettime		sys_timerfd_gettime		__ia32_sys_timerfd_gettime32
+325	i386	timerfd_settime		sys_timerfd_settime32		__ia32_sys_timerfd_settime32
+326	i386	timerfd_gettime		sys_timerfd_gettime32		__ia32_sys_timerfd_gettime32
 327	i386	signalfd4		sys_signalfd4			__ia32_compat_sys_signalfd4
 328	i386	eventfd2		sys_eventfd2			__ia32_sys_eventfd2
 329	i386	epoll_create1		sys_epoll_create1		__ia32_sys_epoll_create1
@@ -348,13 +348,13 @@
 334	i386	pwritev			sys_pwritev			__ia32_compat_sys_pwritev
 335	i386	rt_tgsigqueueinfo	sys_rt_tgsigqueueinfo		__ia32_compat_sys_rt_tgsigqueueinfo
 336	i386	perf_event_open		sys_perf_event_open		__ia32_sys_perf_event_open
-337	i386	recvmmsg		sys_recvmmsg			__ia32_compat_sys_recvmmsg_time32
+337	i386	recvmmsg		sys_recvmmsg_time32		__ia32_compat_sys_recvmmsg_time32
 338	i386	fanotify_init		sys_fanotify_init		__ia32_sys_fanotify_init
 339	i386	fanotify_mark		sys_fanotify_mark		__ia32_compat_sys_fanotify_mark
 340	i386	prlimit64		sys_prlimit64			__ia32_sys_prlimit64
 341	i386	name_to_handle_at	sys_name_to_handle_at		__ia32_sys_name_to_handle_at
 342	i386	open_by_handle_at	sys_open_by_handle_at		__ia32_compat_sys_open_by_handle_at
-343	i386	clock_adjtime		sys_clock_adjtime		__ia32_sys_clock_adjtime32
+343	i386	clock_adjtime		sys_clock_adjtime32		__ia32_sys_clock_adjtime32
 344	i386	syncfs			sys_syncfs			__ia32_sys_syncfs
 345	i386	sendmmsg		sys_sendmmsg			__ia32_compat_sys_sendmmsg
 346	i386	setns			sys_setns			__ia32_sys_setns
@@ -396,7 +396,7 @@
 382	i386	pkey_free		sys_pkey_free			__ia32_sys_pkey_free
 383	i386	statx			sys_statx			__ia32_sys_statx
 384	i386	arch_prctl		sys_arch_prctl			__ia32_compat_sys_arch_prctl
-385	i386	io_pgetevents		sys_io_pgetevents		__ia32_compat_sys_io_pgetevents
+385	i386	io_pgetevents		sys_io_pgetevents_time32	__ia32_compat_sys_io_pgetevents
 386	i386	rseq			sys_rseq			__ia32_sys_rseq
 # don't use numbers 387 through 392, add new calls at the end
 393	i386	semget			sys_semget    			__ia32_sys_semget
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index c699e014e0dd..6f05bc8f015a 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -174,7 +174,7 @@
 158	common	capget				sys_capget
 159	common	capset				sys_capset
 160	common	ptrace				sys_ptrace
-161	common	semtimedop			sys_semtimedop
+161	common	semtimedop			sys_semtimedop_time32
 162	common	semget				sys_semget
 163	common	semop				sys_semop
 164	common	semctl				sys_old_semctl
@@ -206,11 +206,11 @@
 188	common	setrlimit			sys_setrlimit
 189	common	getrlimit			sys_getrlimit
 190	common	getrusage			sys_getrusage
-191	common	futex				sys_futex
+191	common	futex				sys_futex_time32
 192	common	gettimeofday			sys_gettimeofday
 193	common	settimeofday			sys_settimeofday
-194	common	adjtimex			sys_adjtimex
-195	common	nanosleep			sys_nanosleep
+194	common	adjtimex			sys_adjtimex_time32
+195	common	nanosleep			sys_nanosleep_time32
 196	common	getgroups			sys_getgroups
 197	common	setgroups			sys_setgroups
 198	common	sethostname			sys_sethostname
@@ -234,7 +234,7 @@
 215	common	sched_getscheduler		sys_sched_getscheduler
 216	common	sched_get_priority_max		sys_sched_get_priority_max
 217	common	sched_get_priority_min		sys_sched_get_priority_min
-218	common	sched_rr_get_interval		sys_sched_rr_get_interval
+218	common	sched_rr_get_interval		sys_sched_rr_get_interval_time32
 219	common	sched_yield			sys_sched_yield
 222	common	available222			sys_ni_syscall
 # Signal Handling
@@ -244,14 +244,14 @@
 226	common	rt_sigaction			sys_rt_sigaction
 227	common	rt_sigprocmask			sys_rt_sigprocmask
 228	common	rt_sigpending			sys_rt_sigpending
-229	common	rt_sigtimedwait			sys_rt_sigtimedwait
+229	common	rt_sigtimedwait			sys_rt_sigtimedwait_time32
 230	common	rt_sigqueueinfo			sys_rt_sigqueueinfo
 231	common	rt_sigsuspend			sys_rt_sigsuspend
 # Message
 232	common	mq_open				sys_mq_open
 233	common	mq_unlink			sys_mq_unlink
-234	common	mq_timedsend			sys_mq_timedsend
-235	common	mq_timedreceive			sys_mq_timedreceive
+234	common	mq_timedsend			sys_mq_timedsend_time32
+235	common	mq_timedreceive			sys_mq_timedreceive_time32
 236	common	mq_notify			sys_mq_notify
 237	common	mq_getsetattr			sys_mq_getsetattr
 238	common	available238			sys_ni_syscall
@@ -259,17 +259,17 @@
 # IO
 240	common	io_destroy			sys_io_destroy
 241	common	io_submit			sys_io_submit
-242	common	io_getevents			sys_io_getevents
+242	common	io_getevents			sys_io_getevents_time32
 243	common	io_cancel			sys_io_cancel
-244	common	clock_settime			sys_clock_settime
-245	common	clock_gettime			sys_clock_gettime
-246	common	clock_getres			sys_clock_getres
-247	common	clock_nanosleep			sys_clock_nanosleep
+244	common	clock_settime			sys_clock_settime32
+245	common	clock_gettime			sys_clock_gettime32
+246	common	clock_getres			sys_clock_getres_time32
+247	common	clock_nanosleep			sys_clock_nanosleep_time32
 # Timer
 248	common	timer_create			sys_timer_create
 249	common	timer_delete			sys_timer_delete
-250	common	timer_settime			sys_timer_settime
-251	common	timer_gettime			sys_timer_gettime
+250	common	timer_settime			sys_timer_settime32
+251	common	timer_gettime			sys_timer_gettime32
 252	common	timer_getoverrun		sys_timer_getoverrun
 # System
 253	common	reserved253			sys_ni_syscall
@@ -291,8 +291,8 @@
 269	common	tee				sys_tee
 270	common	vmsplice			sys_vmsplice
 271	common	available271			sys_ni_syscall
-272	common	pselect6			sys_pselect6
-273	common	ppoll				sys_ppoll
+272	common	pselect6			sys_pselect6_time32
+273	common	ppoll				sys_ppoll_time32
 274	common	epoll_pwait			sys_epoll_pwait
 275	common	epoll_create1			sys_epoll_create1
 276	common	inotify_init			sys_inotify_init
@@ -316,7 +316,7 @@
 293	common	linkat				sys_linkat
 294	common	symlinkat			sys_symlinkat
 295	common	readlinkat			sys_readlinkat
-296	common	utimensat			sys_utimensat
+296	common	utimensat			sys_utimensat_time32
 297	common	fchownat			sys_fchownat
 298	common	futimesat			sys_futimesat
 299	common	fstatat64			sys_fstatat64
@@ -327,14 +327,14 @@
 304	common	signalfd			sys_signalfd
 # 305 was timerfd
 306	common	eventfd				sys_eventfd
-307	common	recvmmsg			sys_recvmmsg
+307	common	recvmmsg			sys_recvmmsg_time32
 308	common	setns				sys_setns
 309	common	signalfd4			sys_signalfd4
 310	common	dup3				sys_dup3
 311	common	pipe2				sys_pipe2
 312	common	timerfd_create			sys_timerfd_create
-313	common	timerfd_settime			sys_timerfd_settime
-314	common	timerfd_gettime			sys_timerfd_gettime
+313	common	timerfd_settime			sys_timerfd_settime32
+314	common	timerfd_gettime			sys_timerfd_gettime32
 315	common	available315			sys_ni_syscall
 316	common	eventfd2			sys_eventfd2
 317	common	preadv				sys_preadv
@@ -349,7 +349,7 @@
 326	common	sync_file_range2		sys_sync_file_range2
 327	common	perf_event_open			sys_perf_event_open
 328	common	rt_tgsigqueueinfo		sys_rt_tgsigqueueinfo
-329	common	clock_adjtime			sys_clock_adjtime
+329	common	clock_adjtime			sys_clock_adjtime32
 330	common	prlimit64			sys_prlimit64
 331	common	kcmp				sys_kcmp
 332	common	finit_module			sys_finit_module
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 153b55b94234..ab1831769030 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -39,7 +39,7 @@ __SC_COMP(__NR_io_submit, sys_io_submit, compat_sys_io_submit)
 #define __NR_io_cancel 3
 __SYSCALL(__NR_io_cancel, sys_io_cancel)
 #define __NR_io_getevents 4
-__SC_COMP(__NR_io_getevents, sys_io_getevents, sys_io_getevents_time32)
+__SC_3264(__NR_io_getevents, sys_io_getevents_time32, sys_io_getevents)
 
 /* fs/xattr.c */
 #define __NR_setxattr 5
@@ -223,9 +223,9 @@ __SYSCALL(__NR3264_sendfile, sys_sendfile64)
 
 /* fs/select.c */
 #define __NR_pselect6 72
-__SC_COMP(__NR_pselect6, sys_pselect6, compat_sys_pselect6_time32)
+__SC_COMP_3264(__NR_pselect6, sys_pselect6_time32, sys_pselect6, compat_sys_pselect6_time32)
 #define __NR_ppoll 73
-__SC_COMP(__NR_ppoll, sys_ppoll, compat_sys_ppoll_time32)
+__SC_COMP_3264(__NR_ppoll, sys_ppoll_time32, sys_ppoll, compat_sys_ppoll_time32)
 
 /* fs/signalfd.c */
 #define __NR_signalfd4 74
@@ -270,15 +270,15 @@ __SC_COMP(__NR_sync_file_range, sys_sync_file_range, \
 #define __NR_timerfd_create 85
 __SYSCALL(__NR_timerfd_create, sys_timerfd_create)
 #define __NR_timerfd_settime 86
-__SC_COMP(__NR_timerfd_settime, sys_timerfd_settime, \
-	  sys_timerfd_settime32)
+__SC_3264(__NR_timerfd_settime, sys_timerfd_settime32, \
+	  sys_timerfd_settime)
 #define __NR_timerfd_gettime 87
-__SC_COMP(__NR_timerfd_gettime, sys_timerfd_gettime, \
-	  sys_timerfd_gettime32)
+__SC_3264(__NR_timerfd_gettime, sys_timerfd_gettime32, \
+	  sys_timerfd_gettime)
 
 /* fs/utimes.c */
 #define __NR_utimensat 88
-__SC_COMP(__NR_utimensat, sys_utimensat, sys_utimensat_time32)
+__SC_3264(__NR_utimensat, sys_utimensat_time32, sys_utimensat)
 
 /* kernel/acct.c */
 #define __NR_acct 89
@@ -310,7 +310,7 @@ __SYSCALL(__NR_unshare, sys_unshare)
 
 /* kernel/futex.c */
 #define __NR_futex 98
-__SC_COMP(__NR_futex, sys_futex, sys_futex_time32)
+__SC_3264(__NR_futex, sys_futex_time32, sys_futex)
 #define __NR_set_robust_list 99
 __SC_COMP(__NR_set_robust_list, sys_set_robust_list, \
 	  compat_sys_set_robust_list)
@@ -320,7 +320,7 @@ __SC_COMP(__NR_get_robust_list, sys_get_robust_list, \
 
 /* kernel/hrtimer.c */
 #define __NR_nanosleep 101
-__SC_COMP(__NR_nanosleep, sys_nanosleep, sys_nanosleep_time32)
+__SC_3264(__NR_nanosleep, sys_nanosleep_time32, sys_nanosleep)
 
 /* kernel/itimer.c */
 #define __NR_getitimer 102
@@ -342,22 +342,22 @@ __SYSCALL(__NR_delete_module, sys_delete_module)
 #define __NR_timer_create 107
 __SC_COMP(__NR_timer_create, sys_timer_create, compat_sys_timer_create)
 #define __NR_timer_gettime 108
-__SC_COMP(__NR_timer_gettime, sys_timer_gettime, sys_timer_gettime32)
+__SC_3264(__NR_timer_gettime, sys_timer_gettime32, sys_timer_gettime)
 #define __NR_timer_getoverrun 109
 __SYSCALL(__NR_timer_getoverrun, sys_timer_getoverrun)
 #define __NR_timer_settime 110
-__SC_COMP(__NR_timer_settime, sys_timer_settime, sys_timer_settime32)
+__SC_3264(__NR_timer_settime, sys_timer_settime32, sys_timer_settime)
 #define __NR_timer_delete 111
 __SYSCALL(__NR_timer_delete, sys_timer_delete)
 #define __NR_clock_settime 112
-__SC_COMP(__NR_clock_settime, sys_clock_settime, sys_clock_settime32)
+__SC_3264(__NR_clock_settime, sys_clock_settime32, sys_clock_settime)
 #define __NR_clock_gettime 113
-__SC_COMP(__NR_clock_gettime, sys_clock_gettime, sys_clock_gettime32)
+__SC_3264(__NR_clock_gettime, sys_clock_gettime32, sys_clock_gettime)
 #define __NR_clock_getres 114
-__SC_COMP(__NR_clock_getres, sys_clock_getres, sys_clock_getres_time32)
+__SC_3264(__NR_clock_getres, sys_clock_getres_time32, sys_clock_getres)
 #define __NR_clock_nanosleep 115
-__SC_COMP(__NR_clock_nanosleep, sys_clock_nanosleep, \
-	  sys_clock_nanosleep_time32)
+__SC_3264(__NR_clock_nanosleep, sys_clock_nanosleep_time32, \
+	  sys_clock_nanosleep)
 
 /* kernel/printk.c */
 #define __NR_syslog 116
@@ -389,8 +389,8 @@ __SYSCALL(__NR_sched_get_priority_max, sys_sched_get_priority_max)
 #define __NR_sched_get_priority_min 126
 __SYSCALL(__NR_sched_get_priority_min, sys_sched_get_priority_min)
 #define __NR_sched_rr_get_interval 127
-__SC_COMP(__NR_sched_rr_get_interval, sys_sched_rr_get_interval, \
-	  sys_sched_rr_get_interval_time32)
+__SC_3264(__NR_sched_rr_get_interval, sys_sched_rr_get_interval_time32, \
+	  sys_sched_rr_get_interval)
 
 /* kernel/signal.c */
 #define __NR_restart_syscall 128
@@ -412,8 +412,8 @@ __SC_COMP(__NR_rt_sigprocmask, sys_rt_sigprocmask, compat_sys_rt_sigprocmask)
 #define __NR_rt_sigpending 136
 __SC_COMP(__NR_rt_sigpending, sys_rt_sigpending, compat_sys_rt_sigpending)
 #define __NR_rt_sigtimedwait 137
-__SC_COMP(__NR_rt_sigtimedwait, sys_rt_sigtimedwait, \
-	  compat_sys_rt_sigtimedwait_time32)
+__SC_COMP_3264(__NR_rt_sigtimedwait, sys_rt_sigtimedwait_time32, \
+	  sys_rt_sigtimedwait, compat_sys_rt_sigtimedwait_time32)
 #define __NR_rt_sigqueueinfo 138
 __SC_COMP(__NR_rt_sigqueueinfo, sys_rt_sigqueueinfo, \
 	  compat_sys_rt_sigqueueinfo)
@@ -486,7 +486,7 @@ __SC_COMP(__NR_gettimeofday, sys_gettimeofday, compat_sys_gettimeofday)
 #define __NR_settimeofday 170
 __SC_COMP(__NR_settimeofday, sys_settimeofday, compat_sys_settimeofday)
 #define __NR_adjtimex 171
-__SC_COMP(__NR_adjtimex, sys_adjtimex, sys_adjtimex_time32)
+__SC_3264(__NR_adjtimex, sys_adjtimex_time32, sys_adjtimex)
 
 /* kernel/timer.c */
 #define __NR_getpid 172
@@ -512,10 +512,10 @@ __SC_COMP(__NR_mq_open, sys_mq_open, compat_sys_mq_open)
 #define __NR_mq_unlink 181
 __SYSCALL(__NR_mq_unlink, sys_mq_unlink)
 #define __NR_mq_timedsend 182
-__SC_COMP(__NR_mq_timedsend, sys_mq_timedsend, sys_mq_timedsend_time32)
+__SC_3264(__NR_mq_timedsend, sys_mq_timedsend_time32, sys_mq_timedsend)
 #define __NR_mq_timedreceive 183
-__SC_COMP(__NR_mq_timedreceive, sys_mq_timedreceive, \
-	  sys_mq_timedreceive_time32)
+__SC_3264(__NR_mq_timedreceive, sys_mq_timedreceive_time32, \
+	  sys_mq_timedreceive)
 #define __NR_mq_notify 184
 __SC_COMP(__NR_mq_notify, sys_mq_notify, compat_sys_mq_notify)
 #define __NR_mq_getsetattr 185
@@ -659,7 +659,7 @@ __SYSCALL(__NR_perf_event_open, sys_perf_event_open)
 #define __NR_accept4 242
 __SYSCALL(__NR_accept4, sys_accept4)
 #define __NR_recvmmsg 243
-__SC_COMP(__NR_recvmmsg, sys_recvmmsg, compat_sys_recvmmsg_time32)
+__SC_COMP_3264(__NR_recvmmsg, sys_recvmmsg_time32, sys_recvmmsg, compat_sys_recvmmsg_time32)
 
 /*
  * Architectures may provide up to 16 syscalls of their own
@@ -681,7 +681,7 @@ __SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at)
 __SC_COMP(__NR_open_by_handle_at, sys_open_by_handle_at, \
 	  compat_sys_open_by_handle_at)
 #define __NR_clock_adjtime 266
-__SC_COMP(__NR_clock_adjtime, sys_clock_adjtime, sys_clock_adjtime32)
+__SC_3264(__NR_clock_adjtime, sys_clock_adjtime32, sys_clock_adjtime)
 #define __NR_syncfs 267
 __SYSCALL(__NR_syncfs, sys_syncfs)
 #define __NR_setns 268
@@ -735,7 +735,7 @@ __SYSCALL(__NR_pkey_free,     sys_pkey_free)
 #define __NR_statx 291
 __SYSCALL(__NR_statx,     sys_statx)
 #define __NR_io_pgetevents 292
-__SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents)
+__SC_COMP_3264(__NR_io_pgetevents, sys_io_pgetevents_time32, sys_io_pgetevents, compat_sys_io_pgetevents)
 #define __NR_rseq 293
 __SYSCALL(__NR_rseq, sys_rseq)
 #define __NR_kexec_file_load 294
-- 
cgit v1.2.3-59-g8ed1b


From c70a772fda11570ebddecbce1543a3fda008db4a Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 7 Jan 2019 00:00:34 +0100
Subject: y2038: remove struct definition redirects

We now use 64-bit time_t on all architectures, so the __kernel_timex,
__kernel_timeval and __kernel_timespec redirects can be removed
after having served their purpose.

This makes it all much less confusing, as the __kernel_* types
now always refer to the same layout based on 64-bit time_t across
all 32-bit and 64-bit architectures.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 include/linux/time64.h     | 8 --------
 include/linux/timex.h      | 7 -------
 include/uapi/linux/time.h  | 4 ----
 include/uapi/linux/timex.h | 2 --
 4 files changed, 21 deletions(-)

(limited to 'include/uapi')

diff --git a/include/linux/time64.h b/include/linux/time64.h
index 05634afba0db..f38d382ffec1 100644
--- a/include/linux/time64.h
+++ b/include/linux/time64.h
@@ -7,14 +7,6 @@
 typedef __s64 time64_t;
 typedef __u64 timeu64_t;
 
-/* CONFIG_64BIT_TIME enables new 64 bit time_t syscalls in the compat path
- * and 32-bit emulation.
- */
-#ifndef CONFIG_64BIT_TIME
-#define __kernel_timespec timespec
-#define __kernel_itimerspec itimerspec
-#endif
-
 #include <uapi/linux/time.h>
 
 struct timespec64 {
diff --git a/include/linux/timex.h b/include/linux/timex.h
index 4aff9f0d1367..ce0859763670 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -53,13 +53,6 @@
 #ifndef _LINUX_TIMEX_H
 #define _LINUX_TIMEX_H
 
-/* CONFIG_64BIT_TIME enables new 64 bit time_t syscalls in the compat path
- * and 32-bit emulation.
- */
-#ifndef CONFIG_64BIT_TIME
-#define __kernel_timex timex
-#endif
-
 #include <uapi/linux/timex.h>
 
 #define ADJ_ADJTIME		0x8000	/* switch between adjtime/adjtimex modes */
diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h
index 6b56a2208be7..b03f8717c312 100644
--- a/include/uapi/linux/time.h
+++ b/include/uapi/linux/time.h
@@ -42,19 +42,15 @@ struct itimerval {
 	struct timeval it_value;	/* current value */
 };
 
-#ifndef __kernel_timespec
 struct __kernel_timespec {
 	__kernel_time64_t       tv_sec;                 /* seconds */
 	long long               tv_nsec;                /* nanoseconds */
 };
-#endif
 
-#ifndef __kernel_itimerspec
 struct __kernel_itimerspec {
 	struct __kernel_timespec it_interval;    /* timer period */
 	struct __kernel_timespec it_value;       /* timer expiration */
 };
-#endif
 
 /*
  * legacy timeval structure, only embedded in structures that
diff --git a/include/uapi/linux/timex.h b/include/uapi/linux/timex.h
index a1c6b73016a5..9f517f9010bb 100644
--- a/include/uapi/linux/timex.h
+++ b/include/uapi/linux/timex.h
@@ -97,7 +97,6 @@ struct __kernel_timex_timeval {
 	long long		tv_usec;
 };
 
-#ifndef __kernel_timex
 struct __kernel_timex {
 	unsigned int modes;	/* mode selector */
 	int :32;            /* pad */
@@ -131,7 +130,6 @@ struct __kernel_timex {
 	int  :32; int  :32; int  :32; int  :32;
 	int  :32; int  :32; int  :32;
 };
-#endif
 
 /*
  * Mode codes (timex.mode)
-- 
cgit v1.2.3-59-g8ed1b


From 48166e6ea47d23984f0b481ca199250e1ce0730a Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 10 Jan 2019 12:45:11 +0100
Subject: y2038: add 64-bit time_t syscalls to all 32-bit architectures

This adds 21 new system calls on each ABI that has 32-bit time_t
today. All of these have the exact same semantics as their existing
counterparts, and the new ones all have macro names that end in 'time64'
for clarification.

This gets us to the point of being able to safely use a C library
that has 64-bit time_t in user space. There are still a couple of
loose ends to tie up in various areas of the code, but this is the
big one, and should be entirely uncontroversial at this point.

In particular, there are four system calls (getitimer, setitimer,
waitid, and getrusage) that don't have a 64-bit counterpart yet,
but these can all be safely implemented in the C library by wrapping
around the existing system calls because the 32-bit time_t they
pass only counts elapsed time, not time since the epoch. They
will be dealt with later.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/alpha/kernel/syscalls/syscall.tbl      |  2 ++
 arch/arm/tools/syscall.tbl                  | 21 ++++++++++++++
 arch/arm64/include/asm/unistd.h             |  2 +-
 arch/arm64/include/asm/unistd32.h           | 41 +++++++++++++++++++++++++++
 arch/ia64/kernel/syscalls/syscall.tbl       |  1 +
 arch/m68k/kernel/syscalls/syscall.tbl       | 20 +++++++++++++
 arch/microblaze/kernel/syscalls/syscall.tbl | 21 ++++++++++++++
 arch/mips/kernel/syscalls/syscall_n32.tbl   | 21 ++++++++++++++
 arch/mips/kernel/syscalls/syscall_n64.tbl   |  1 +
 arch/mips/kernel/syscalls/syscall_o32.tbl   | 20 +++++++++++++
 arch/parisc/kernel/syscalls/syscall.tbl     | 21 ++++++++++++++
 arch/powerpc/kernel/syscalls/syscall.tbl    | 20 +++++++++++++
 arch/s390/kernel/syscalls/syscall.tbl       | 20 +++++++++++++
 arch/sh/kernel/syscalls/syscall.tbl         | 20 +++++++++++++
 arch/sparc/kernel/syscalls/syscall.tbl      | 20 +++++++++++++
 arch/x86/entry/syscalls/syscall_32.tbl      | 20 +++++++++++++
 arch/xtensa/kernel/syscalls/syscall.tbl     | 21 ++++++++++++++
 include/uapi/asm-generic/unistd.h           | 44 ++++++++++++++++++++++++++++-
 scripts/checksyscalls.sh                    | 40 ++++++++++++++++++++++++++
 19 files changed, 374 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 340b88dd397e..63ed39cbd3bd 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -461,3 +461,5 @@
 530	common	getegid				sys_getegid
 531	common	geteuid				sys_geteuid
 532	common	getppid				sys_getppid
+# all other architectures have common numbers for new syscall, alpha
+# is the exception.
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index a96d9b5ee04e..9016f4081bb9 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -416,3 +416,24 @@
 399	common	io_pgetevents		sys_io_pgetevents_time32
 400	common	migrate_pages		sys_migrate_pages
 401	common	kexec_file_load		sys_kexec_file_load
+# 402 is unused
+403	common	clock_gettime64			sys_clock_gettime
+404	common	clock_settime64			sys_clock_settime
+405	common	clock_adjtime64			sys_clock_adjtime
+406	common	clock_getres_time64		sys_clock_getres
+407	common	clock_nanosleep_time64		sys_clock_nanosleep
+408	common	timer_gettime64			sys_timer_gettime
+409	common	timer_settime64			sys_timer_settime
+410	common	timerfd_gettime64		sys_timerfd_gettime
+411	common	timerfd_settime64		sys_timerfd_settime
+412	common	utimensat_time64		sys_utimensat
+413	common	pselect6_time64			sys_pselect6
+414	common	ppoll_time64			sys_ppoll
+416	common	io_pgetevents_time64		sys_io_pgetevents
+417	common	recvmmsg_time64			sys_recvmmsg
+418	common	mq_timedsend_time64		sys_mq_timedsend
+419	common	mq_timedreceive_time64		sys_mq_timedreceive
+420	common	semtimedop_time64		sys_semtimedop
+421	common	rt_sigtimedwait_time64		sys_rt_sigtimedwait
+422	common	futex_time64			sys_futex
+423	common	sched_rr_get_interval_time64	sys_sched_rr_get_interval
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 2c30e6f145ff..d1dd93436e1e 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -44,7 +44,7 @@
 #define __ARM_NR_compat_set_tls		(__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END		(__ARM_NR_COMPAT_BASE + 0x800)
 
-#define __NR_compat_syscalls		402
+#define __NR_compat_syscalls		424
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 1ded82857161..5590f2623690 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -825,6 +825,47 @@ __SYSCALL(__NR_io_pgetevents, compat_sys_io_pgetevents)
 __SYSCALL(__NR_migrate_pages, compat_sys_migrate_pages)
 #define __NR_kexec_file_load 401
 __SYSCALL(__NR_kexec_file_load, sys_kexec_file_load)
+/* 402 is unused */
+#define __NR_clock_gettime64 403
+__SYSCALL(__NR_clock_gettime64, sys_clock_gettime)
+#define __NR_clock_settime64 404
+__SYSCALL(__NR_clock_settime64, sys_clock_settime)
+#define __NR_clock_adjtime64 405
+__SYSCALL(__NR_clock_adjtime64, sys_clock_adjtime)
+#define __NR_clock_getres_time64 406
+__SYSCALL(__NR_clock_getres_time64, sys_clock_getres)
+#define __NR_clock_nanosleep_time64 407
+__SYSCALL(__NR_clock_nanosleep_time64, sys_clock_nanosleep)
+#define __NR_timer_gettime64 408
+__SYSCALL(__NR_timer_gettime64, sys_timer_gettime)
+#define __NR_timer_settime64 409
+__SYSCALL(__NR_timer_settime64, sys_timer_settime)
+#define __NR_timerfd_gettime64 410
+__SYSCALL(__NR_timerfd_gettime64, sys_timerfd_gettime)
+#define __NR_timerfd_settime64 411
+__SYSCALL(__NR_timerfd_settime64, sys_timerfd_settime)
+#define __NR_utimensat_time64 412
+__SYSCALL(__NR_utimensat_time64, sys_utimensat)
+#define __NR_pselect6_time64 413
+__SYSCALL(__NR_pselect6_time64, compat_sys_pselect6_time64)
+#define __NR_ppoll_time64 414
+__SYSCALL(__NR_ppoll_time64, compat_sys_ppoll_time64)
+#define __NR_io_pgetevents_time64 416
+__SYSCALL(__NR_io_pgetevents_time64, sys_io_pgetevents)
+#define __NR_recvmmsg_time64 417
+__SYSCALL(__NR_recvmmsg_time64, compat_sys_recvmmsg_time64)
+#define __NR_mq_timedsend_time64 418
+__SYSCALL(__NR_mq_timedsend_time64, sys_mq_timedsend)
+#define __NR_mq_timedreceive_time64 419
+__SYSCALL(__NR_mq_timedreceive_time64, sys_mq_timedreceive)
+#define __NR_semtimedop_time64 420
+__SYSCALL(__NR_semtimedop_time64, sys_semtimedop)
+#define __NR_rt_sigtimedwait_time64 421
+__SYSCALL(__NR_rt_sigtimedwait_time64, compat_sys_rt_sigtimedwait_time64)
+#define __NR_futex_time64 422
+__SYSCALL(__NR_futex_time64, sys_futex)
+#define __NR_sched_rr_get_interval_time64 423
+__SYSCALL(__NR_sched_rr_get_interval_time64, sys_sched_rr_get_interval)
 
 /*
  * Please add new compat syscalls above this comment and update
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index 52319006de0d..ab9cda5f6136 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -343,3 +343,4 @@
 331	common	pkey_alloc			sys_pkey_alloc
 332	common	pkey_free			sys_pkey_free
 333	common	rseq				sys_rseq
+# 334 through 423 are reserved to sync up with other architectures
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 253bd2a069bd..125c14178979 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -403,3 +403,23 @@
 400	common	msgsnd				sys_msgsnd
 401	common	msgrcv				sys_msgrcv
 402	common	msgctl				sys_msgctl
+403	common	clock_gettime64			sys_clock_gettime
+404	common	clock_settime64			sys_clock_settime
+405	common	clock_adjtime64			sys_clock_adjtime
+406	common	clock_getres_time64		sys_clock_getres
+407	common	clock_nanosleep_time64		sys_clock_nanosleep
+408	common	timer_gettime64			sys_timer_gettime
+409	common	timer_settime64			sys_timer_settime
+410	common	timerfd_gettime64		sys_timerfd_gettime
+411	common	timerfd_settime64		sys_timerfd_settime
+412	common	utimensat_time64		sys_utimensat
+413	common	pselect6_time64			sys_pselect6
+414	common	ppoll_time64			sys_ppoll
+416	common	io_pgetevents_time64		sys_io_pgetevents
+417	common	recvmmsg_time64			sys_recvmmsg
+418	common	mq_timedsend_time64		sys_mq_timedsend
+419	common	mq_timedreceive_time64		sys_mq_timedreceive
+420	common	semtimedop_time64		sys_semtimedop
+421	common	rt_sigtimedwait_time64		sys_rt_sigtimedwait
+422	common	futex_time64			sys_futex
+423	common	sched_rr_get_interval_time64	sys_sched_rr_get_interval
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 44a87649d681..8ee3a8c18498 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -408,3 +408,24 @@
 398	common	statx				sys_statx
 399	common	io_pgetevents			sys_io_pgetevents_time32
 400	common	rseq				sys_rseq
+# 401 and 402 are unused
+403	common	clock_gettime64			sys_clock_gettime
+404	common	clock_settime64			sys_clock_settime
+405	common	clock_adjtime64			sys_clock_adjtime
+406	common	clock_getres_time64		sys_clock_getres
+407	common	clock_nanosleep_time64		sys_clock_nanosleep
+408	common	timer_gettime64			sys_timer_gettime
+409	common	timer_settime64			sys_timer_settime
+410	common	timerfd_gettime64		sys_timerfd_gettime
+411	common	timerfd_settime64		sys_timerfd_settime
+412	common	utimensat_time64		sys_utimensat
+413	common	pselect6_time64			sys_pselect6
+414	common	ppoll_time64			sys_ppoll
+416	common	io_pgetevents_time64		sys_io_pgetevents
+417	common	recvmmsg_time64			sys_recvmmsg
+418	common	mq_timedsend_time64		sys_mq_timedsend
+419	common	mq_timedreceive_time64		sys_mq_timedreceive
+420	common	semtimedop_time64		sys_semtimedop
+421	common	rt_sigtimedwait_time64		sys_rt_sigtimedwait
+422	common	futex_time64			sys_futex
+423	common	sched_rr_get_interval_time64	sys_sched_rr_get_interval
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 6d1e019817c8..15f4117900ee 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -341,3 +341,24 @@
 330	n32	statx				sys_statx
 331	n32	rseq				sys_rseq
 332	n32	io_pgetevents			compat_sys_io_pgetevents
+# 333 through 402 are unassigned to sync up with generic numbers
+403	n32	clock_gettime64			sys_clock_gettime
+404	n32	clock_settime64			sys_clock_settime
+405	n32	clock_adjtime64			sys_clock_adjtime
+406	n32	clock_getres_time64		sys_clock_getres
+407	n32	clock_nanosleep_time64		sys_clock_nanosleep
+408	n32	timer_gettime64			sys_timer_gettime
+409	n32	timer_settime64			sys_timer_settime
+410	n32	timerfd_gettime64		sys_timerfd_gettime
+411	n32	timerfd_settime64		sys_timerfd_settime
+412	n32	utimensat_time64		sys_utimensat
+413	n32	pselect6_time64			compat_sys_pselect6_time64
+414	n32	ppoll_time64			compat_sys_ppoll_time64
+416	n32	io_pgetevents_time64		sys_io_pgetevents
+417	n32	recvmmsg_time64			compat_sys_recvmmsg_time64
+418	n32	mq_timedsend_time64		sys_mq_timedsend
+419	n32	mq_timedreceive_time64		sys_mq_timedreceive
+420	n32	semtimedop_time64		sys_semtimedop
+421	n32	rt_sigtimedwait_time64		compat_sys_rt_sigtimedwait_time64
+422	n32	futex_time64			sys_futex
+423	n32	sched_rr_get_interval_time64	sys_sched_rr_get_interval
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index af0da757a7b2..c85502e67b44 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -337,3 +337,4 @@
 326	n64	statx				sys_statx
 327	n64	rseq				sys_rseq
 328	n64	io_pgetevents			sys_io_pgetevents
+# 329 through 423 are reserved to sync up with other architectures
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 54312c5b5343..2e063d0f837e 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -391,3 +391,23 @@
 400	o32	msgsnd				sys_msgsnd			compat_sys_msgsnd
 401	o32	msgrcv				sys_msgrcv			compat_sys_msgrcv
 402	o32	msgctl				sys_msgctl			compat_sys_msgctl
+403	o32	clock_gettime64			sys_clock_gettime		sys_clock_gettime
+404	o32	clock_settime64			sys_clock_settime		sys_clock_settime
+405	o32	clock_adjtime64			sys_clock_adjtime		sys_clock_adjtime
+406	o32	clock_getres_time64		sys_clock_getres		sys_clock_getres
+407	o32	clock_nanosleep_time64		sys_clock_nanosleep		sys_clock_nanosleep
+408	o32	timer_gettime64			sys_timer_gettime		sys_timer_gettime
+409	o32	timer_settime64			sys_timer_settime		sys_timer_settime
+410	o32	timerfd_gettime64		sys_timerfd_gettime		sys_timerfd_gettime
+411	o32	timerfd_settime64		sys_timerfd_settime		sys_timerfd_settime
+412	o32	utimensat_time64		sys_utimensat			sys_utimensat
+413	o32	pselect6_time64			sys_pselect6			compat_sys_pselect6_time64
+414	o32	ppoll_time64			sys_ppoll			compat_sys_ppoll_time64
+416	o32	io_pgetevents_time64		sys_io_pgetevents		sys_io_pgetevents
+417	o32	recvmmsg_time64			sys_recvmmsg			compat_sys_recvmmsg_time64
+418	o32	mq_timedsend_time64		sys_mq_timedsend		sys_mq_timedsend
+419	o32	mq_timedreceive_time64		sys_mq_timedreceive		sys_mq_timedreceive
+420	o32	semtimedop_time64		sys_semtimedop			sys_semtimedop
+421	o32	rt_sigtimedwait_time64		sys_rt_sigtimedwait		compat_sys_rt_sigtimedwait_time64
+422	o32	futex_time64			sys_futex			sys_futex
+423	o32	sched_rr_get_interval_time64	sys_sched_rr_get_interval	sys_sched_rr_get_interval
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 7eff3dc3d613..b26766c6647d 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -399,3 +399,24 @@
 352	common	pkey_alloc		sys_pkey_alloc
 353	common	pkey_free		sys_pkey_free
 354	common	rseq			sys_rseq
+# 355 through 402 are unassigned to sync up with generic numbers
+403	32	clock_gettime64			sys_clock_gettime		sys_clock_gettime
+404	32	clock_settime64			sys_clock_settime		sys_clock_settime
+405	32	clock_adjtime64			sys_clock_adjtime		sys_clock_adjtime
+406	32	clock_getres_time64		sys_clock_getres		sys_clock_getres
+407	32	clock_nanosleep_time64		sys_clock_nanosleep		sys_clock_nanosleep
+408	32	timer_gettime64			sys_timer_gettime		sys_timer_gettime
+409	32	timer_settime64			sys_timer_settime		sys_timer_settime
+410	32	timerfd_gettime64		sys_timerfd_gettime		sys_timerfd_gettime
+411	32	timerfd_settime64		sys_timerfd_settime		sys_timerfd_settime
+412	32	utimensat_time64		sys_utimensat			sys_utimensat
+413	32	pselect6_time64			sys_pselect6			compat_sys_pselect6_time64
+414	32	ppoll_time64			sys_ppoll			compat_sys_ppoll_time64
+416	32	io_pgetevents_time64		sys_io_pgetevents		sys_io_pgetevents
+417	32	recvmmsg_time64			sys_recvmmsg			compat_sys_recvmmsg_time64
+418	32	mq_timedsend_time64		sys_mq_timedsend		sys_mq_timedsend
+419	32	mq_timedreceive_time64		sys_mq_timedreceive		sys_mq_timedreceive
+420	32	semtimedop_time64		sys_semtimedop			sys_semtimedop
+421	32	rt_sigtimedwait_time64		sys_rt_sigtimedwait		compat_sys_rt_sigtimedwait_time64
+422	32	futex_time64			sys_futex			sys_futex
+423	32	sched_rr_get_interval_time64	sys_sched_rr_get_interval	sys_sched_rr_get_interval
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index 500edbf9e8a6..b18abb0c3dae 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -485,3 +485,23 @@
 400	common	msgsnd				sys_msgsnd			compat_sys_msgsnd
 401	common	msgrcv				sys_msgrcv			compat_sys_msgrcv
 402	common	msgctl				sys_msgctl			compat_sys_msgctl
+403	32	clock_gettime64			sys_clock_gettime		sys_clock_gettime
+404	32	clock_settime64			sys_clock_settime		sys_clock_settime
+405	32	clock_adjtime64			sys_clock_adjtime		sys_clock_adjtime
+406	32	clock_getres_time64		sys_clock_getres		sys_clock_getres
+407	32	clock_nanosleep_time64		sys_clock_nanosleep		sys_clock_nanosleep
+408	32	timer_gettime64			sys_timer_gettime		sys_timer_gettime
+409	32	timer_settime64			sys_timer_settime		sys_timer_settime
+410	32	timerfd_gettime64		sys_timerfd_gettime		sys_timerfd_gettime
+411	32	timerfd_settime64		sys_timerfd_settime		sys_timerfd_settime
+412	32	utimensat_time64		sys_utimensat			sys_utimensat
+413	32	pselect6_time64			sys_pselect6			compat_sys_pselect6_time64
+414	32	ppoll_time64			sys_ppoll			compat_sys_ppoll_time64
+416	32	io_pgetevents_time64		sys_io_pgetevents		sys_io_pgetevents
+417	32	recvmmsg_time64			sys_recvmmsg			compat_sys_recvmmsg_time64
+418	32	mq_timedsend_time64		sys_mq_timedsend		sys_mq_timedsend
+419	32	mq_timedreceive_time64		sys_mq_timedreceive		sys_mq_timedreceive
+420	32	semtimedop_time64		sys_semtimedop			sys_semtimedop
+421	32	rt_sigtimedwait_time64		sys_rt_sigtimedwait		compat_sys_rt_sigtimedwait_time64
+422	32	futex_time64			sys_futex			sys_futex
+423	32	sched_rr_get_interval_time64	sys_sched_rr_get_interval	sys_sched_rr_get_interval
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 285201cf1f83..02579f95f391 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -406,3 +406,23 @@
 400  common	msgsnd			sys_msgsnd			compat_sys_msgsnd
 401  common	msgrcv			sys_msgrcv			compat_sys_msgrcv
 402  common	msgctl			sys_msgctl			compat_sys_msgctl
+403	32	clock_gettime64		-				sys_clock_gettime
+404	32	clock_settime64		-				sys_clock_settime
+405	32	clock_adjtime64		-				sys_clock_adjtime
+406	32	clock_getres_time64	-				sys_clock_getres
+407	32	clock_nanosleep_time64	-				sys_clock_nanosleep
+408	32	timer_gettime64		-				sys_timer_gettime
+409	32	timer_settime64		-				sys_timer_settime
+410	32	timerfd_gettime64	-				sys_timerfd_gettime
+411	32	timerfd_settime64	-				sys_timerfd_settime
+412	32	utimensat_time64	-				sys_utimensat
+413	32	pselect6_time64		-				compat_sys_pselect6_time64
+414	32	ppoll_time64		-				compat_sys_ppoll_time64
+416	32	io_pgetevents_time64	-				sys_io_pgetevents
+417	32	recvmmsg_time64		-				compat_sys_recvmmsg_time64
+418	32	mq_timedsend_time64	-				sys_mq_timedsend
+419	32	mq_timedreceive_time64	-				sys_mq_timedreceive
+420	32	semtimedop_time64	-				sys_semtimedop
+421	32	rt_sigtimedwait_time64	-				compat_sys_rt_sigtimedwait_time64
+422	32	futex_time64		-				sys_futex
+423	32	sched_rr_get_interval_time64	-			sys_sched_rr_get_interval
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index 06d768c3cc4c..bfda678576e4 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -406,3 +406,23 @@
 400	common	msgsnd				sys_msgsnd
 401	common	msgrcv				sys_msgrcv
 402	common	msgctl				sys_msgctl
+403	common	clock_gettime64			sys_clock_gettime
+404	common	clock_settime64			sys_clock_settime
+405	common	clock_adjtime64			sys_clock_adjtime
+406	common	clock_getres_time64		sys_clock_getres
+407	common	clock_nanosleep_time64		sys_clock_nanosleep
+408	common	timer_gettime64			sys_timer_gettime
+409	common	timer_settime64			sys_timer_settime
+410	common	timerfd_gettime64		sys_timerfd_gettime
+411	common	timerfd_settime64		sys_timerfd_settime
+412	common	utimensat_time64		sys_utimensat
+413	common	pselect6_time64			sys_pselect6
+414	common	ppoll_time64			sys_ppoll
+416	common	io_pgetevents_time64		sys_io_pgetevents
+417	common	recvmmsg_time64			sys_recvmmsg
+418	common	mq_timedsend_time64		sys_mq_timedsend
+419	common	mq_timedreceive_time64		sys_mq_timedreceive
+420	common	semtimedop_time64		sys_semtimedop
+421	common	rt_sigtimedwait_time64		sys_rt_sigtimedwait
+422	common	futex_time64			sys_futex
+423	common	sched_rr_get_interval_time64	sys_sched_rr_get_interval
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index 99c40abd8878..b9a5a04b2d2c 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -449,3 +449,23 @@
 400	common	msgsnd			sys_msgsnd			compat_sys_msgsnd
 401	common	msgrcv			sys_msgrcv			compat_sys_msgrcv
 402	common	msgctl			sys_msgctl			compat_sys_msgctl
+403	32	clock_gettime64			sys_clock_gettime		sys_clock_gettime
+404	32	clock_settime64			sys_clock_settime		sys_clock_settime
+405	32	clock_adjtime64			sys_clock_adjtime		sys_clock_adjtime
+406	32	clock_getres_time64		sys_clock_getres		sys_clock_getres
+407	32	clock_nanosleep_time64		sys_clock_nanosleep		sys_clock_nanosleep
+408	32	timer_gettime64			sys_timer_gettime		sys_timer_gettime
+409	32	timer_settime64			sys_timer_settime		sys_timer_settime
+410	32	timerfd_gettime64		sys_timerfd_gettime		sys_timerfd_gettime
+411	32	timerfd_settime64		sys_timerfd_settime		sys_timerfd_settime
+412	32	utimensat_time64		sys_utimensat			sys_utimensat
+413	32	pselect6_time64			sys_pselect6			compat_sys_pselect6_time64
+414	32	ppoll_time64			sys_ppoll			compat_sys_ppoll_time64
+416	32	io_pgetevents_time64		sys_io_pgetevents		sys_io_pgetevents
+417	32	recvmmsg_time64			sys_recvmmsg			compat_sys_recvmmsg_time64
+418	32	mq_timedsend_time64		sys_mq_timedsend		sys_mq_timedsend
+419	32	mq_timedreceive_time64		sys_mq_timedreceive		sys_mq_timedreceive
+420	32	semtimedop_time64		sys_semtimedop			sys_semtimedop
+421	32	rt_sigtimedwait_time64		sys_rt_sigtimedwait		compat_sys_rt_sigtimedwait_time64
+422	32	futex_time64			sys_futex			sys_futex
+423	32	sched_rr_get_interval_time64	sys_sched_rr_get_interval	sys_sched_rr_get_interval
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 8c47c1191a53..955ab6a3b61f 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -409,3 +409,23 @@
 400	i386	msgsnd			sys_msgsnd    			__ia32_compat_sys_msgsnd
 401	i386	msgrcv			sys_msgrcv    			__ia32_compat_sys_msgrcv
 402	i386	msgctl			sys_msgctl    			__ia32_compat_sys_msgctl
+403	i386	clock_gettime64		sys_clock_gettime		__ia32_sys_clock_gettime
+404	i386	clock_settime64		sys_clock_settime		__ia32_sys_clock_settime
+405	i386	clock_adjtime64		sys_clock_adjtime		__ia32_sys_clock_adjtime
+406	i386	clock_getres_time64	sys_clock_getres		__ia32_sys_clock_getres
+407	i386	clock_nanosleep_time64	sys_clock_nanosleep		__ia32_sys_clock_nanosleep
+408	i386	timer_gettime64		sys_timer_gettime		__ia32_sys_timer_gettime
+409	i386	timer_settime64		sys_timer_settime		__ia32_sys_timer_settime
+410	i386	timerfd_gettime64	sys_timerfd_gettime		__ia32_sys_timerfd_gettime
+411	i386	timerfd_settime64	sys_timerfd_settime		__ia32_sys_timerfd_settime
+412	i386	utimensat_time64	sys_utimensat			__ia32_sys_utimensat
+413	i386	pselect6_time64		sys_pselect6			__ia32_compat_sys_pselect6_time64
+414	i386	ppoll_time64		sys_ppoll			__ia32_compat_sys_ppoll_time64
+416	i386	io_pgetevents_time64	sys_io_pgetevents		__ia32_sys_io_pgetevents
+417	i386	recvmmsg_time64		sys_recvmmsg			__ia32_compat_sys_recvmmsg_time64
+418	i386	mq_timedsend_time64	sys_mq_timedsend		__ia32_sys_mq_timedsend
+419	i386	mq_timedreceive_time64	sys_mq_timedreceive		__ia32_sys_mq_timedreceive
+420	i386	semtimedop_time64	sys_semtimedop			__ia32_sys_semtimedop
+421	i386	rt_sigtimedwait_time64	sys_rt_sigtimedwait		__ia32_compat_sys_rt_sigtimedwait_time64
+422	i386	futex_time64		sys_futex			__ia32_sys_futex
+423	i386	sched_rr_get_interval_time64	sys_sched_rr_get_interval	__ia32_sys_sched_rr_get_interval
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 482673389e21..6af49929de85 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -373,3 +373,24 @@
 350	common	pkey_free			sys_pkey_free
 351	common	statx				sys_statx
 352	common	rseq				sys_rseq
+# 353 through 402 are unassigned to sync up with generic numbers
+403	common	clock_gettime64			sys_clock_gettime
+404	common	clock_settime64			sys_clock_settime
+405	common	clock_adjtime64			sys_clock_adjtime
+406	common	clock_getres_time64		sys_clock_getres
+407	common	clock_nanosleep_time64		sys_clock_nanosleep
+408	common	timer_gettime64			sys_timer_gettime
+409	common	timer_settime64			sys_timer_settime
+410	common	timerfd_gettime64		sys_timerfd_gettime
+411	common	timerfd_settime64		sys_timerfd_settime
+412	common	utimensat_time64		sys_utimensat
+413	common	pselect6_time64			sys_pselect6
+414	common	ppoll_time64			sys_ppoll
+416	common	io_pgetevents_time64		sys_io_pgetevents
+417	common	recvmmsg_time64			sys_recvmmsg
+418	common	mq_timedsend_time64		sys_mq_timedsend
+419	common	mq_timedreceive_time64		sys_mq_timedreceive
+420	common	semtimedop_time64		sys_semtimedop
+421	common	rt_sigtimedwait_time64		sys_rt_sigtimedwait
+422	common	futex_time64			sys_futex
+423	common	sched_rr_get_interval_time64	sys_sched_rr_get_interval
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index ab1831769030..acf9a07ab2ff 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -741,9 +741,51 @@ __SYSCALL(__NR_rseq, sys_rseq)
 #define __NR_kexec_file_load 294
 __SYSCALL(__NR_kexec_file_load,     sys_kexec_file_load)
 /* 295 through 402 are unassigned to sync up with generic numbers, don't use */
+#if __BITS_PER_LONG == 32
+#define __NR_clock_gettime64 403
+__SYSCALL(__NR_clock_gettime64, sys_clock_gettime)
+#define __NR_clock_settime64 404
+__SYSCALL(__NR_clock_settime64, sys_clock_settime)
+#define __NR_clock_adjtime64 405
+__SYSCALL(__NR_clock_adjtime64, sys_clock_adjtime)
+#define __NR_clock_getres_time64 406
+__SYSCALL(__NR_clock_getres_time64, sys_clock_getres)
+#define __NR_clock_nanosleep_time64 407
+__SYSCALL(__NR_clock_nanosleep_time64, sys_clock_nanosleep)
+#define __NR_timer_gettime64 408
+__SYSCALL(__NR_timer_gettime64, sys_timer_gettime)
+#define __NR_timer_settime64 409
+__SYSCALL(__NR_timer_settime64, sys_timer_settime)
+#define __NR_timerfd_gettime64 410
+__SYSCALL(__NR_timerfd_gettime64, sys_timerfd_gettime)
+#define __NR_timerfd_settime64 411
+__SYSCALL(__NR_timerfd_settime64, sys_timerfd_settime)
+#define __NR_utimensat_time64 412
+__SYSCALL(__NR_utimensat_time64, sys_utimensat)
+#define __NR_pselect6_time64 413
+__SC_COMP(__NR_pselect6_time64, sys_pselect6, compat_sys_pselect6_time64)
+#define __NR_ppoll_time64 414
+__SC_COMP(__NR_ppoll_time64, sys_ppoll, compat_sys_ppoll_time64)
+#define __NR_io_pgetevents_time64 416
+__SYSCALL(__NR_io_pgetevents_time64, sys_io_pgetevents)
+#define __NR_recvmmsg_time64 417
+__SC_COMP(__NR_recvmmsg_time64, sys_recvmmsg, compat_sys_recvmmsg_time64)
+#define __NR_mq_timedsend_time64 418
+__SYSCALL(__NR_mq_timedsend_time64, sys_mq_timedsend)
+#define __NR_mq_timedreceive_time64 419
+__SYSCALL(__NR_mq_timedreceive_time64, sys_mq_timedreceive)
+#define __NR_semtimedop_time64 420
+__SYSCALL(__NR_semtimedop_time64, sys_semtimedop)
+#define __NR_rt_sigtimedwait_time64 421
+__SC_COMP(__NR_rt_sigtimedwait_time64, sys_rt_sigtimedwait, compat_sys_rt_sigtimedwait_time64)
+#define __NR_futex_time64 422
+__SYSCALL(__NR_futex_time64, sys_futex)
+#define __NR_sched_rr_get_interval_time64 423
+__SYSCALL(__NR_sched_rr_get_interval_time64, sys_sched_rr_get_interval)
+#endif
 
 #undef __NR_syscalls
-#define __NR_syscalls 295
+#define __NR_syscalls 424
 
 /*
  * 32 bit systems traditionally used different
diff --git a/scripts/checksyscalls.sh b/scripts/checksyscalls.sh
index cf931003395f..cc70a64fa81f 100755
--- a/scripts/checksyscalls.sh
+++ b/scripts/checksyscalls.sh
@@ -84,6 +84,26 @@ cat << EOF
 #define __IGNORE_statfs64
 #define __IGNORE_llseek
 #define __IGNORE_mmap2
+#define __IGNORE_clock_gettime64
+#define __IGNORE_clock_settime64
+#define __IGNORE_clock_adjtime64
+#define __IGNORE_clock_getres_time64
+#define __IGNORE_clock_nanosleep_time64
+#define __IGNORE_timer_gettime64
+#define __IGNORE_timer_settime64
+#define __IGNORE_timerfd_gettime64
+#define __IGNORE_timerfd_settime64
+#define __IGNORE_utimensat_time64
+#define __IGNORE_pselect6_time64
+#define __IGNORE_ppoll_time64
+#define __IGNORE_io_pgetevents_time64
+#define __IGNORE_recvmmsg_time64
+#define __IGNORE_mq_timedsend_time64
+#define __IGNORE_mq_timedreceive_time64
+#define __IGNORE_semtimedop_time64
+#define __IGNORE_rt_sigtimedwait_time64
+#define __IGNORE_futex_time64
+#define __IGNORE_sched_rr_get_interval_time64
 #else
 #define __IGNORE_sendfile
 #define __IGNORE_ftruncate
@@ -98,6 +118,26 @@ cat << EOF
 #define __IGNORE_statfs
 #define __IGNORE_lseek
 #define __IGNORE_mmap
+#define __IGNORE_clock_gettime
+#define __IGNORE_clock_settime
+#define __IGNORE_clock_adjtime
+#define __IGNORE_clock_getres
+#define __IGNORE_clock_nanosleep
+#define __IGNORE_timer_gettime
+#define __IGNORE_timer_settime
+#define __IGNORE_timerfd_gettime
+#define __IGNORE_timerfd_settime
+#define __IGNORE_utimensat
+#define __IGNORE_pselect6
+#define __IGNORE_ppoll
+#define __IGNORE_io_pgetevents
+#define __IGNORE_recvmmsg
+#define __IGNORE_mq_timedsend
+#define __IGNORE_mq_timedreceiv
+#define __IGNORE_semtimedop
+#define __IGNORE_rt_sigtimedwait
+#define __IGNORE_futex
+#define __IGNORE_sched_rr_get_interval
 #endif
 
 /* i386-specific or historical system calls */
-- 
cgit v1.2.3-59-g8ed1b


From e9e0c8903009477b630e37a8b6364b26a00720da Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Thu, 10 Jan 2019 19:04:34 +0200
Subject: fanotify: encode file identifier for FAN_REPORT_FID

When user requests the flag FAN_REPORT_FID in fanotify_init(),
a unique file identifier of the event target object will be reported
with the event.

The file identifier includes the filesystem's fsid (i.e. from statfs(2))
and an NFS file handle of the file (i.e. from name_to_handle_at(2)).

The file identifier makes holding the path reference and passing a file
descriptor to user redundant, so those are disabled in a group with
FAN_REPORT_FID.

Encode fid and store it in event for a group with FAN_REPORT_FID.
Up to 12 bytes of file handle on 32bit arch (16 bytes on 64bit arch)
are stored inline in fanotify_event struct. Larger file handles are
stored in an external allocated buffer.

On failure to encode fid, we print a warning and queue the event
without the fid information.

[JK: Fold part of later patched into this one to use
exportfs_encode_inode_fh() right away]

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/notify/fanotify/fanotify.c      | 84 +++++++++++++++++++++++++++++++++++---
 fs/notify/fanotify/fanotify.h      | 78 +++++++++++++++++++++++++++++++++--
 fs/notify/fanotify/fanotify_user.c |  6 +--
 include/uapi/linux/fanotify.h      |  1 +
 4 files changed, 156 insertions(+), 13 deletions(-)

(limited to 'include/uapi')

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index d8e3b6e50844..dd33227e518a 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -13,6 +13,7 @@
 #include <linux/wait.h>
 #include <linux/audit.h>
 #include <linux/sched/mm.h>
+#include <linux/statfs.h>
 
 #include "fanotify.h"
 
@@ -25,10 +26,18 @@ static bool should_merge(struct fsnotify_event *old_fsn,
 	old = FANOTIFY_E(old_fsn);
 	new = FANOTIFY_E(new_fsn);
 
-	if (old_fsn->inode == new_fsn->inode && old->pid == new->pid &&
-	    old->path.mnt == new->path.mnt &&
-	    old->path.dentry == new->path.dentry)
-		return true;
+	if (old_fsn->inode != new_fsn->inode || old->pid != new->pid ||
+	    old->fh_type != new->fh_type || old->fh_len != new->fh_len)
+		return false;
+
+	if (fanotify_event_has_path(old)) {
+		return old->path.mnt == new->path.mnt &&
+			old->path.dentry == new->path.dentry;
+	} else if (fanotify_event_has_fid(old)) {
+		return fanotify_fid_equal(&old->fid, &new->fid, old->fh_len);
+	}
+
+	/* Do not merge events if we failed to encode fid */
 	return false;
 }
 
@@ -143,6 +152,60 @@ static u32 fanotify_group_event_mask(struct fsnotify_iter_info *iter_info,
 		~marks_ignored_mask;
 }
 
+static int fanotify_encode_fid(struct fanotify_event *event,
+			       const struct path *path, gfp_t gfp)
+{
+	struct fanotify_fid *fid = &event->fid;
+	int dwords, bytes = 0;
+	struct kstatfs stat;
+	int err, type;
+
+	stat.f_fsid.val[0] = stat.f_fsid.val[1] = 0;
+	fid->ext_fh = NULL;
+	dwords = 0;
+	err = -ENOENT;
+	type = exportfs_encode_inode_fh(d_inode(path->dentry), NULL, &dwords,
+					NULL);
+	if (!dwords)
+		goto out_err;
+
+	err = vfs_statfs(path, &stat);
+	if (err)
+		goto out_err;
+
+	bytes = dwords << 2;
+	if (bytes > FANOTIFY_INLINE_FH_LEN) {
+		/* Treat failure to allocate fh as failure to allocate event */
+		err = -ENOMEM;
+		fid->ext_fh = kmalloc(bytes, gfp);
+		if (!fid->ext_fh)
+			goto out_err;
+	}
+
+	type = exportfs_encode_inode_fh(d_inode(path->dentry),
+					fanotify_fid_fh(fid, bytes), &dwords,
+					NULL);
+	err = -EINVAL;
+	if (!type || type == FILEID_INVALID || bytes != dwords << 2)
+		goto out_err;
+
+	fid->fsid = stat.f_fsid;
+	event->fh_len = bytes;
+
+	return type;
+
+out_err:
+	pr_warn_ratelimited("fanotify: failed to encode fid (fsid=%x.%x, "
+			    "type=%d, bytes=%d, err=%i)\n",
+			    stat.f_fsid.val[0], stat.f_fsid.val[1],
+			    type, bytes, err);
+	kfree(fid->ext_fh);
+	fid->ext_fh = NULL;
+	event->fh_len = 0;
+
+	return FILEID_INVALID;
+}
+
 struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
 						 struct inode *inode, u32 mask,
 						 const struct path *path)
@@ -181,10 +244,16 @@ init: __maybe_unused
 		event->pid = get_pid(task_pid(current));
 	else
 		event->pid = get_pid(task_tgid(current));
-	if (path) {
+	event->fh_len = 0;
+	if (path && FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+		/* Report the event without a file identifier on encode error */
+		event->fh_type = fanotify_encode_fid(event, path, gfp);
+	} else if (path) {
+		event->fh_type = FILEID_ROOT;
 		event->path = *path;
 		path_get(&event->path);
 	} else {
+		event->fh_type = FILEID_INVALID;
 		event->path.mnt = NULL;
 		event->path.dentry = NULL;
 	}
@@ -281,7 +350,10 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event)
 	struct fanotify_event *event;
 
 	event = FANOTIFY_E(fsn_event);
-	path_put(&event->path);
+	if (fanotify_event_has_path(event))
+		path_put(&event->path);
+	else if (fanotify_event_has_ext_fh(event))
+		kfree(event->fid.ext_fh);
 	put_pid(event->pid);
 	if (fanotify_is_perm_event(event->mask)) {
 		kmem_cache_free(fanotify_perm_event_cachep,
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 898b5b2bc1c7..271482fb9611 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -2,11 +2,49 @@
 #include <linux/fsnotify_backend.h>
 #include <linux/path.h>
 #include <linux/slab.h>
+#include <linux/exportfs.h>
 
 extern struct kmem_cache *fanotify_mark_cache;
 extern struct kmem_cache *fanotify_event_cachep;
 extern struct kmem_cache *fanotify_perm_event_cachep;
 
+/*
+ * 3 dwords are sufficient for most local fs (64bit ino, 32bit generation).
+ * For 32bit arch, fid increases the size of fanotify_event by 12 bytes and
+ * fh_* fields increase the size of fanotify_event by another 4 bytes.
+ * For 64bit arch, fid increases the size of fanotify_fid by 8 bytes and
+ * fh_* fields are packed in a hole after mask.
+ */
+#if BITS_PER_LONG == 32
+#define FANOTIFY_INLINE_FH_LEN	(3 << 2)
+#else
+#define FANOTIFY_INLINE_FH_LEN	(4 << 2)
+#endif
+
+struct fanotify_fid {
+	__kernel_fsid_t fsid;
+	union {
+		unsigned char fh[FANOTIFY_INLINE_FH_LEN];
+		unsigned char *ext_fh;
+	};
+};
+
+static inline void *fanotify_fid_fh(struct fanotify_fid *fid,
+				    unsigned int fh_len)
+{
+	return fh_len <= FANOTIFY_INLINE_FH_LEN ? fid->fh : fid->ext_fh;
+}
+
+static inline bool fanotify_fid_equal(struct fanotify_fid *fid1,
+				      struct fanotify_fid *fid2,
+				      unsigned int fh_len)
+{
+	return fid1->fsid.val[0] == fid2->fsid.val[0] &&
+		fid1->fsid.val[1] == fid2->fsid.val[1] &&
+		!memcmp(fanotify_fid_fh(fid1, fh_len),
+			fanotify_fid_fh(fid2, fh_len), fh_len);
+}
+
 /*
  * Structure for normal fanotify events. It gets allocated in
  * fanotify_handle_event() and freed when the information is retrieved by
@@ -16,13 +54,47 @@ struct fanotify_event {
 	struct fsnotify_event fse;
 	u32 mask;
 	/*
-	 * We hold ref to this path so it may be dereferenced at any point
-	 * during this object's lifetime
+	 * Those fields are outside fanotify_fid to pack fanotify_event nicely
+	 * on 64bit arch and to use fh_type as an indication of whether path
+	 * or fid are used in the union:
+	 * FILEID_ROOT (0) for path, > 0 for fid, FILEID_INVALID for neither.
 	 */
-	struct path path;
+	u8 fh_type;
+	u8 fh_len;
+	u16 pad;
+	union {
+		/*
+		 * We hold ref to this path so it may be dereferenced at any
+		 * point during this object's lifetime
+		 */
+		struct path path;
+		/*
+		 * With FAN_REPORT_FID, we do not hold any reference on the
+		 * victim object. Instead we store its NFS file handle and its
+		 * filesystem's fsid as a unique identifier.
+		 */
+		struct fanotify_fid fid;
+	};
 	struct pid *pid;
 };
 
+static inline bool fanotify_event_has_path(struct fanotify_event *event)
+{
+	return event->fh_type == FILEID_ROOT;
+}
+
+static inline bool fanotify_event_has_fid(struct fanotify_event *event)
+{
+	return event->fh_type != FILEID_ROOT &&
+		event->fh_type != FILEID_INVALID;
+}
+
+static inline bool fanotify_event_has_ext_fh(struct fanotify_event *event)
+{
+	return fanotify_event_has_fid(event) &&
+		event->fh_len > FANOTIFY_INLINE_FH_LEN;
+}
+
 /*
  * Structure for permission fanotify events. It gets allocated and freed in
  * fanotify_handle_event() since we wait there for user response. When the
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 096503bd0edb..c965fcf4979e 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -181,7 +181,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 	struct fanotify_event_metadata metadata;
 	struct fanotify_event *event;
 	struct file *f = NULL;
-	int fd, ret;
+	int ret, fd = FAN_NOFD;
 
 	pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event);
 
@@ -193,9 +193,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 	metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS;
 	metadata.pid = pid_vnr(event->pid);
 
-	if (unlikely(event->mask & FAN_Q_OVERFLOW)) {
-		fd = FAN_NOFD;
-	} else {
+	if (fanotify_event_has_path(event)) {
 		fd = create_fd(group, event, &f);
 		if (fd < 0)
 			return fd;
diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
index 909c98fcace2..d07f3cbc2786 100644
--- a/include/uapi/linux/fanotify.h
+++ b/include/uapi/linux/fanotify.h
@@ -44,6 +44,7 @@
 
 /* Flags to determine fanotify event format */
 #define FAN_REPORT_TID		0x00000100	/* event->pid is thread id */
+#define FAN_REPORT_FID		0x00000200	/* Report unique file id */
 
 /* Deprecated - do not use this in programs and do not add new flags here! */
 #define FAN_ALL_INIT_FLAGS	(FAN_CLOEXEC | FAN_NONBLOCK | \
-- 
cgit v1.2.3-59-g8ed1b


From 5e469c830fdb5a1ebaa69b375b87f583326fd296 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Thu, 10 Jan 2019 19:04:35 +0200
Subject: fanotify: copy event fid info to user

If group requested FAN_REPORT_FID and event has file identifier,
copy that information to user reading the event after event metadata.

fid information is formatted as struct fanotify_event_info_fid
that includes a generic header struct fanotify_event_info_header,
so that other info types could be defined in the future using the
same header.

metadata->event_len includes the length of the fid information.

The fid information includes the filesystem's fsid (see statfs(2))
followed by an NFS file handle of the file that could be passed as
an argument to open_by_handle_at(2).

Cc: <linux-api@vger.kernel.org>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/notify/fanotify/fanotify.h      |  5 +++
 fs/notify/fanotify/fanotify_user.c | 82 +++++++++++++++++++++++++++++++++++---
 include/uapi/linux/fanotify.h      | 20 ++++++++++
 3 files changed, 102 insertions(+), 5 deletions(-)

(limited to 'include/uapi')

diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 271482fb9611..4aafc7144c3d 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -95,6 +95,11 @@ static inline bool fanotify_event_has_ext_fh(struct fanotify_event *event)
 		event->fh_len > FANOTIFY_INLINE_FH_LEN;
 }
 
+static inline void *fanotify_event_fh(struct fanotify_event *event)
+{
+	return fanotify_fid_fh(&event->fid, event->fh_len);
+}
+
 /*
  * Structure for permission fanotify events. It gets allocated and freed in
  * fanotify_handle_event() since we wait there for user response. When the
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index c965fcf4979e..cd82dd713c91 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -47,6 +47,18 @@ struct kmem_cache *fanotify_mark_cache __read_mostly;
 struct kmem_cache *fanotify_event_cachep __read_mostly;
 struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
 
+#define FANOTIFY_EVENT_ALIGN 4
+
+static int fanotify_event_info_len(struct fanotify_event *event)
+{
+	if (!fanotify_event_has_fid(event))
+		return 0;
+
+	return roundup(sizeof(struct fanotify_event_info_fid) +
+		       sizeof(struct file_handle) + event->fh_len,
+		       FANOTIFY_EVENT_ALIGN);
+}
+
 /*
  * Get an fsnotify notification event if one exists and is small
  * enough to fit in "count". Return an error pointer if the count
@@ -57,6 +69,9 @@ struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
 static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
 					    size_t count)
 {
+	size_t event_size = FAN_EVENT_METADATA_LEN;
+	struct fanotify_event *event;
+
 	assert_spin_locked(&group->notification_lock);
 
 	pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
@@ -64,11 +79,18 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
 	if (fsnotify_notify_queue_is_empty(group))
 		return NULL;
 
-	if (FAN_EVENT_METADATA_LEN > count)
+	if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+		event = FANOTIFY_E(fsnotify_peek_first_event(group));
+		event_size += fanotify_event_info_len(event);
+	}
+
+	if (event_size > count)
 		return ERR_PTR(-EINVAL);
 
-	/* held the notification_lock the whole time, so this is the
-	 * same event we peeked above */
+	/*
+	 * Held the notification_lock the whole time, so this is the
+	 * same event we peeked above
+	 */
 	return fsnotify_remove_first_event(group);
 }
 
@@ -174,6 +196,48 @@ static int process_access_response(struct fsnotify_group *group,
 	return 0;
 }
 
+static int copy_fid_to_user(struct fanotify_event *event, char __user *buf)
+{
+	struct fanotify_event_info_fid info = { };
+	struct file_handle handle = { };
+	size_t fh_len = event->fh_len;
+	size_t len = fanotify_event_info_len(event);
+
+	if (!len)
+		return 0;
+
+	if (WARN_ON_ONCE(len < sizeof(info) + sizeof(handle) + fh_len))
+		return -EFAULT;
+
+	/* Copy event info fid header followed by vaiable sized file handle */
+	info.hdr.info_type = FAN_EVENT_INFO_TYPE_FID;
+	info.hdr.len = len;
+	info.fsid = event->fid.fsid;
+	if (copy_to_user(buf, &info, sizeof(info)))
+		return -EFAULT;
+
+	buf += sizeof(info);
+	len -= sizeof(info);
+	handle.handle_type = event->fh_type;
+	handle.handle_bytes = fh_len;
+	if (copy_to_user(buf, &handle, sizeof(handle)))
+		return -EFAULT;
+
+	buf += sizeof(handle);
+	len -= sizeof(handle);
+	if (copy_to_user(buf, fanotify_event_fh(event), fh_len))
+		return -EFAULT;
+
+	/* Pad with 0's */
+	buf += fh_len;
+	len -= fh_len;
+	WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN);
+	if (len > 0 && clear_user(buf, len))
+		return -EFAULT;
+
+	return 0;
+}
+
 static ssize_t copy_event_to_user(struct fsnotify_group *group,
 				  struct fsnotify_event *fsn_event,
 				  char __user *buf, size_t count)
@@ -197,6 +261,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 		fd = create_fd(group, event, &f);
 		if (fd < 0)
 			return fd;
+	} else if (fanotify_event_has_fid(event)) {
+		metadata.event_len += fanotify_event_info_len(event);
 	}
 	metadata.fd = fd;
 
@@ -208,14 +274,20 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 	if (WARN_ON_ONCE(metadata.event_len > count))
 		goto out_close_fd;
 
-	if (copy_to_user(buf, &metadata, metadata.event_len))
+	if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN))
 		goto out_close_fd;
 
 	if (fanotify_is_perm_event(event->mask))
 		FANOTIFY_PE(fsn_event)->fd = fd;
 
-	if (fd != FAN_NOFD)
+	if (fanotify_event_has_path(event)) {
 		fd_install(fd, f);
+	} else if (fanotify_event_has_fid(event)) {
+		ret = copy_fid_to_user(event, buf + FAN_EVENT_METADATA_LEN);
+		if (ret < 0)
+			return ret;
+	}
+
 	return metadata.event_len;
 
 out_close_fd:
diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
index d07f3cbc2786..959ae2bdc7ca 100644
--- a/include/uapi/linux/fanotify.h
+++ b/include/uapi/linux/fanotify.h
@@ -107,6 +107,26 @@ struct fanotify_event_metadata {
 	__s32 pid;
 };
 
+#define FAN_EVENT_INFO_TYPE_FID		1
+
+/* Variable length info record following event metadata */
+struct fanotify_event_info_header {
+	__u8 info_type;
+	__u8 pad;
+	__u16 len;
+};
+
+/* Unique file identifier info record */
+struct fanotify_event_info_fid {
+	struct fanotify_event_info_header hdr;
+	__kernel_fsid_t fsid;
+	/*
+	 * Following is an opaque struct file_handle that can be passed as
+	 * an argument to open_by_handle_at(2).
+	 */
+	unsigned char handle[0];
+};
+
 struct fanotify_response {
 	__s32 fd;
 	__u32 response;
-- 
cgit v1.2.3-59-g8ed1b


From 235328d1fa4251c6dcb32351219bb553a58838d2 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Thu, 10 Jan 2019 19:04:43 +0200
Subject: fanotify: add support for create/attrib/move/delete events

Add support for events with data type FSNOTIFY_EVENT_INODE
(e.g. create/attrib/move/delete) for inode and filesystem mark types.

The "inode" events do not carry enough information (i.e. path) to
report event->fd, so we do not allow setting a mask for those events
unless group supports reporting fid.

The "inode" events are not supported on a mount mark, because they do
not carry enough information (i.e. path) to be filtered by mount point.

The "dirent" events (create/move/delete) report the fid of the parent
directory where events took place without specifying the filename of the
child. In the future, fanotify may get support for reporting filename
information for those events.

Cc: <linux-api@vger.kernel.org>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/notify/fanotify/fanotify.c      |  9 ++++++++-
 fs/notify/fanotify/fanotify_user.c | 12 ++++++++++++
 include/linux/fanotify.h           | 22 ++++++++++++++++++++--
 include/uapi/linux/fanotify.h      |  8 ++++++++
 4 files changed, 48 insertions(+), 3 deletions(-)

(limited to 'include/uapi')

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 974239b03442..158c69acb04d 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -313,9 +313,16 @@ static int fanotify_handle_event(struct fsnotify_group *group,
 
 	BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
 	BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
+	BUILD_BUG_ON(FAN_ATTRIB != FS_ATTRIB);
 	BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
 	BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE);
 	BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
+	BUILD_BUG_ON(FAN_MOVED_TO != FS_MOVED_TO);
+	BUILD_BUG_ON(FAN_MOVED_FROM != FS_MOVED_FROM);
+	BUILD_BUG_ON(FAN_CREATE != FS_CREATE);
+	BUILD_BUG_ON(FAN_DELETE != FS_DELETE);
+	BUILD_BUG_ON(FAN_DELETE_SELF != FS_DELETE_SELF);
+	BUILD_BUG_ON(FAN_MOVE_SELF != FS_MOVE_SELF);
 	BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD);
 	BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
 	BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
@@ -324,7 +331,7 @@ static int fanotify_handle_event(struct fsnotify_group *group,
 	BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC);
 	BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM);
 
-	BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 12);
+	BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 19);
 
 	mask = fanotify_group_event_mask(group, iter_info, mask, data,
 					 data_type);
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index bf06fd6ef761..6c61a06d0ef5 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -976,6 +976,18 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
 	    group->priority == FS_PRIO_0)
 		goto fput_and_out;
 
+	/*
+	 * Events with data type inode do not carry enough information to report
+	 * event->fd, so we do not allow setting a mask for inode events unless
+	 * group supports reporting fid.
+	 * inode events are not supported on a mount mark, because they do not
+	 * carry enough information (i.e. path) to be filtered by mount point.
+	 */
+	if (mask & FANOTIFY_INODE_EVENTS &&
+	    (!FAN_GROUP_FLAG(group, FAN_REPORT_FID) ||
+	     mark_type == FAN_MARK_MOUNT))
+		goto fput_and_out;
+
 	if (flags & FAN_MARK_FLUSH) {
 		ret = 0;
 		if (mark_type == FAN_MARK_MOUNT)
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index f59be967f72b..e9d45387089f 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -35,10 +35,28 @@
 				 FAN_MARK_IGNORED_SURV_MODIFY | \
 				 FAN_MARK_FLUSH)
 
-/* Events that user can request to be notified on */
-#define FANOTIFY_EVENTS		(FAN_ACCESS | FAN_MODIFY | \
+/*
+ * Events that can be reported with data type FSNOTIFY_EVENT_PATH.
+ * Note that FAN_MODIFY can also be reported with data type
+ * FSNOTIFY_EVENT_INODE.
+ */
+#define FANOTIFY_PATH_EVENTS	(FAN_ACCESS | FAN_MODIFY | \
 				 FAN_CLOSE | FAN_OPEN | FAN_OPEN_EXEC)
 
+/*
+ * Directory entry modification events - reported only to directory
+ * where entry is modified and not to a watching parent.
+ */
+#define FANOTIFY_DIRENT_EVENTS	(FAN_MOVE | FAN_CREATE | FAN_DELETE)
+
+/* Events that can only be reported with data type FSNOTIFY_EVENT_INODE */
+#define FANOTIFY_INODE_EVENTS	(FANOTIFY_DIRENT_EVENTS | \
+				 FAN_ATTRIB | FAN_MOVE_SELF | FAN_DELETE_SELF)
+
+/* Events that user can request to be notified on */
+#define FANOTIFY_EVENTS		(FANOTIFY_PATH_EVENTS | \
+				 FANOTIFY_INODE_EVENTS)
+
 /* Events that require a permission response from user */
 #define FANOTIFY_PERM_EVENTS	(FAN_OPEN_PERM | FAN_ACCESS_PERM | \
 				 FAN_OPEN_EXEC_PERM)
diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
index 959ae2bdc7ca..b9effa6f8503 100644
--- a/include/uapi/linux/fanotify.h
+++ b/include/uapi/linux/fanotify.h
@@ -7,9 +7,16 @@
 /* the following events that user-space can register for */
 #define FAN_ACCESS		0x00000001	/* File was accessed */
 #define FAN_MODIFY		0x00000002	/* File was modified */
+#define FAN_ATTRIB		0x00000004	/* Metadata changed */
 #define FAN_CLOSE_WRITE		0x00000008	/* Writtable file closed */
 #define FAN_CLOSE_NOWRITE	0x00000010	/* Unwrittable file closed */
 #define FAN_OPEN		0x00000020	/* File was opened */
+#define FAN_MOVED_FROM		0x00000040	/* File was moved from X */
+#define FAN_MOVED_TO		0x00000080	/* File was moved to Y */
+#define FAN_CREATE		0x00000100	/* Subfile was created */
+#define FAN_DELETE		0x00000200	/* Subfile was deleted */
+#define FAN_DELETE_SELF		0x00000400	/* Self was deleted */
+#define FAN_MOVE_SELF		0x00000800	/* Self was moved */
 #define FAN_OPEN_EXEC		0x00001000	/* File was opened for exec */
 
 #define FAN_Q_OVERFLOW		0x00004000	/* Event queued overflowed */
@@ -24,6 +31,7 @@
 
 /* helper events */
 #define FAN_CLOSE		(FAN_CLOSE_WRITE | FAN_CLOSE_NOWRITE) /* close */
+#define FAN_MOVE		(FAN_MOVED_FROM | FAN_MOVED_TO) /* moves */
 
 /* flags used for fanotify_init() */
 #define FAN_CLOEXEC		0x00000001
-- 
cgit v1.2.3-59-g8ed1b


From 1db64e8733f653814f041ffe1428524494ef6123 Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Thu, 7 Feb 2019 11:36:32 +0200
Subject: devlink: Add devlink formatted message (fmsg) API

Devlink fmsg is a mechanism to pass descriptors between drivers and
devlink, in json-like format. The API allows the driver to add nested
attributes such as object, object pair and value array, in addition to
attributes such as name and value.

Driver can use this API to fill the fmsg context in a format which will be
translated by the devlink to the netlink message later.
There is no memory allocation in advance (other than the initial list
head), and it dynamically allocates messages descriptors and add them to
the list on the fly.

When it needs to send the data using SKBs to the netlink layer, it
fragments the data between different SKBs. In order to do this
fragmentation, it uses virtual nests attributes, to avoid actual
nesting use which cannot be divided between different SKBs.

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Reviewed-by: Moshe Shemesh <moshe@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h        | 149 +++++++++++++
 include/uapi/linux/devlink.h |   8 +
 net/core/devlink.c           | 483 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 640 insertions(+)

(limited to 'include/uapi')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 74d992a68a06..7c5722e816aa 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -448,6 +448,8 @@ struct devlink_info_req;
 
 typedef void devlink_snapshot_data_dest_t(const void *data);
 
+struct devlink_fmsg;
+
 struct devlink_ops {
 	int (*reload)(struct devlink *devlink, struct netlink_ext_ack *extack);
 	int (*port_type_set)(struct devlink_port *devlink_port,
@@ -639,6 +641,37 @@ int devlink_info_version_running_put(struct devlink_info_req *req,
 				     const char *version_name,
 				     const char *version_value);
 
+int devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg);
+int devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg);
+
+int devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name);
+int devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg);
+
+int devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg,
+				     const char *name);
+int devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg);
+
+int devlink_fmsg_bool_put(struct devlink_fmsg *fmsg, bool value);
+int devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value);
+int devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value);
+int devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value);
+int devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value);
+int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value,
+			    u16 value_len);
+
+int devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name,
+			       bool value);
+int devlink_fmsg_u8_pair_put(struct devlink_fmsg *fmsg, const char *name,
+			     u8 value);
+int devlink_fmsg_u32_pair_put(struct devlink_fmsg *fmsg, const char *name,
+			      u32 value);
+int devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name,
+			      u64 value);
+int devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name,
+				 const char *value);
+int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name,
+				 const void *value, u16 value_len);
+
 #else
 
 static inline struct devlink *devlink_alloc(const struct devlink_ops *ops,
@@ -971,6 +1004,122 @@ devlink_info_version_running_put(struct devlink_info_req *req,
 {
 	return 0;
 }
+
+static inline int
+devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg)
+{
+	return 0;
+}
+
+static inline int
+devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg)
+{
+	return 0;
+}
+
+static inline int
+devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name)
+{
+	return 0;
+}
+
+static inline int
+devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg)
+{
+	return 0;
+}
+
+static inline int
+devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg,
+				 const char *name)
+{
+	return 0;
+}
+
+static inline int
+devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg)
+{
+	return 0;
+}
+
+static inline int
+devlink_fmsg_bool_put(struct devlink_fmsg *fmsg, bool value)
+{
+	return 0;
+}
+
+static inline int
+devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value)
+{
+	return 0;
+}
+
+static inline int
+devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value)
+{
+	return 0;
+}
+
+static inline int
+devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value)
+{
+	return 0;
+}
+
+static inline int
+devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value)
+{
+	return 0;
+}
+
+static inline int
+devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value,
+			u16 value_len)
+{
+	return 0;
+}
+
+static inline int
+devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name,
+			   bool value)
+{
+	return 0;
+}
+
+static inline int
+devlink_fmsg_u8_pair_put(struct devlink_fmsg *fmsg, const char *name,
+			 u8 value)
+{
+	return 0;
+}
+
+static inline int
+devlink_fmsg_u32_pair_put(struct devlink_fmsg *fmsg, const char *name,
+			  u32 value)
+{
+	return 0;
+}
+
+static inline int
+devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name,
+			  u64 value)
+{
+	return 0;
+}
+
+static inline int
+devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name,
+			     const char *value)
+{
+	return 0;
+}
+
+static inline int
+devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name,
+			     const void *value, u16 value_len)
+{
+	return 0;
+}
 #endif
 
 #if IS_REACHABLE(CONFIG_NET_DEVLINK)
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 054b2d1a4537..076692209a9b 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -302,6 +302,14 @@ enum devlink_attr {
 
 	DEVLINK_ATTR_SB_POOL_CELL_SIZE,		/* u32 */
 
+	DEVLINK_ATTR_FMSG,			/* nested */
+	DEVLINK_ATTR_FMSG_OBJ_NEST_START,	/* flag */
+	DEVLINK_ATTR_FMSG_PAIR_NEST_START,	/* flag */
+	DEVLINK_ATTR_FMSG_ARR_NEST_START,	/* flag */
+	DEVLINK_ATTR_FMSG_NEST_END,		/* flag */
+	DEVLINK_ATTR_FMSG_OBJ_NAME,		/* string */
+	DEVLINK_ATTR_FMSG_OBJ_VALUE_TYPE,	/* u8 */
+	DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA,	/* dynamic */
 	/* add new attributes above here, update the policy in devlink.c */
 
 	__DEVLINK_ATTR_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index cd0d393bc62d..03883697fcf0 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -3879,6 +3879,489 @@ static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg,
 	return msg->len;
 }
 
+struct devlink_fmsg_item {
+	struct list_head list;
+	int attrtype;
+	u8 nla_type;
+	u16 len;
+	int value[0];
+};
+
+struct devlink_fmsg {
+	struct list_head item_list;
+};
+
+static struct devlink_fmsg *devlink_fmsg_alloc(void)
+{
+	struct devlink_fmsg *fmsg;
+
+	fmsg = kzalloc(sizeof(*fmsg), GFP_KERNEL);
+	if (!fmsg)
+		return NULL;
+
+	INIT_LIST_HEAD(&fmsg->item_list);
+
+	return fmsg;
+}
+
+static void devlink_fmsg_free(struct devlink_fmsg *fmsg)
+{
+	struct devlink_fmsg_item *item, *tmp;
+
+	list_for_each_entry_safe(item, tmp, &fmsg->item_list, list) {
+		list_del(&item->list);
+		kfree(item);
+	}
+	kfree(fmsg);
+}
+
+static int devlink_fmsg_nest_common(struct devlink_fmsg *fmsg,
+				    int attrtype)
+{
+	struct devlink_fmsg_item *item;
+
+	item = kzalloc(sizeof(*item), GFP_KERNEL);
+	if (!item)
+		return -ENOMEM;
+
+	item->attrtype = attrtype;
+	list_add_tail(&item->list, &fmsg->item_list);
+
+	return 0;
+}
+
+int devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg)
+{
+	return devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_OBJ_NEST_START);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_start);
+
+static int devlink_fmsg_nest_end(struct devlink_fmsg *fmsg)
+{
+	return devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_NEST_END);
+}
+
+int devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg)
+{
+	return devlink_fmsg_nest_end(fmsg);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_end);
+
+#define DEVLINK_FMSG_MAX_SIZE (GENLMSG_DEFAULT_SIZE - GENL_HDRLEN - NLA_HDRLEN)
+
+static int devlink_fmsg_put_name(struct devlink_fmsg *fmsg, const char *name)
+{
+	struct devlink_fmsg_item *item;
+
+	if (strlen(name) + 1 > DEVLINK_FMSG_MAX_SIZE)
+		return -EMSGSIZE;
+
+	item = kzalloc(sizeof(*item) + strlen(name) + 1, GFP_KERNEL);
+	if (!item)
+		return -ENOMEM;
+
+	item->nla_type = NLA_NUL_STRING;
+	item->len = strlen(name) + 1;
+	item->attrtype = DEVLINK_ATTR_FMSG_OBJ_NAME;
+	memcpy(&item->value, name, item->len);
+	list_add_tail(&item->list, &fmsg->item_list);
+
+	return 0;
+}
+
+int devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name)
+{
+	int err;
+
+	err = devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_PAIR_NEST_START);
+	if (err)
+		return err;
+
+	err = devlink_fmsg_put_name(fmsg, name);
+	if (err)
+		return err;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_start);
+
+int devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg)
+{
+	return devlink_fmsg_nest_end(fmsg);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_end);
+
+int devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg,
+				     const char *name)
+{
+	int err;
+
+	err = devlink_fmsg_pair_nest_start(fmsg, name);
+	if (err)
+		return err;
+
+	err = devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_ARR_NEST_START);
+	if (err)
+		return err;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_arr_pair_nest_start);
+
+int devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg)
+{
+	int err;
+
+	err = devlink_fmsg_nest_end(fmsg);
+	if (err)
+		return err;
+
+	err = devlink_fmsg_nest_end(fmsg);
+	if (err)
+		return err;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_arr_pair_nest_end);
+
+static int devlink_fmsg_put_value(struct devlink_fmsg *fmsg,
+				  const void *value, u16 value_len,
+				  u8 value_nla_type)
+{
+	struct devlink_fmsg_item *item;
+
+	if (value_len > DEVLINK_FMSG_MAX_SIZE)
+		return -EMSGSIZE;
+
+	item = kzalloc(sizeof(*item) + value_len, GFP_KERNEL);
+	if (!item)
+		return -ENOMEM;
+
+	item->nla_type = value_nla_type;
+	item->len = value_len;
+	item->attrtype = DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA;
+	memcpy(&item->value, value, item->len);
+	list_add_tail(&item->list, &fmsg->item_list);
+
+	return 0;
+}
+
+int devlink_fmsg_bool_put(struct devlink_fmsg *fmsg, bool value)
+{
+	return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_FLAG);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_bool_put);
+
+int devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value)
+{
+	return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U8);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_u8_put);
+
+int devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value)
+{
+	return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U32);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_u32_put);
+
+int devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value)
+{
+	return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U64);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_u64_put);
+
+int devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value)
+{
+	return devlink_fmsg_put_value(fmsg, value, strlen(value) + 1,
+				      NLA_NUL_STRING);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_string_put);
+
+int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value,
+			    u16 value_len)
+{
+	return devlink_fmsg_put_value(fmsg, value, value_len, NLA_BINARY);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_binary_put);
+
+int devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name,
+			       bool value)
+{
+	int err;
+
+	err = devlink_fmsg_pair_nest_start(fmsg, name);
+	if (err)
+		return err;
+
+	err = devlink_fmsg_bool_put(fmsg, value);
+	if (err)
+		return err;
+
+	err = devlink_fmsg_pair_nest_end(fmsg);
+	if (err)
+		return err;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_bool_pair_put);
+
+int devlink_fmsg_u8_pair_put(struct devlink_fmsg *fmsg, const char *name,
+			     u8 value)
+{
+	int err;
+
+	err = devlink_fmsg_pair_nest_start(fmsg, name);
+	if (err)
+		return err;
+
+	err = devlink_fmsg_u8_put(fmsg, value);
+	if (err)
+		return err;
+
+	err = devlink_fmsg_pair_nest_end(fmsg);
+	if (err)
+		return err;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_u8_pair_put);
+
+int devlink_fmsg_u32_pair_put(struct devlink_fmsg *fmsg, const char *name,
+			      u32 value)
+{
+	int err;
+
+	err = devlink_fmsg_pair_nest_start(fmsg, name);
+	if (err)
+		return err;
+
+	err = devlink_fmsg_u32_put(fmsg, value);
+	if (err)
+		return err;
+
+	err = devlink_fmsg_pair_nest_end(fmsg);
+	if (err)
+		return err;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_u32_pair_put);
+
+int devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name,
+			      u64 value)
+{
+	int err;
+
+	err = devlink_fmsg_pair_nest_start(fmsg, name);
+	if (err)
+		return err;
+
+	err = devlink_fmsg_u64_put(fmsg, value);
+	if (err)
+		return err;
+
+	err = devlink_fmsg_pair_nest_end(fmsg);
+	if (err)
+		return err;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_u64_pair_put);
+
+int devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name,
+				 const char *value)
+{
+	int err;
+
+	err = devlink_fmsg_pair_nest_start(fmsg, name);
+	if (err)
+		return err;
+
+	err = devlink_fmsg_string_put(fmsg, value);
+	if (err)
+		return err;
+
+	err = devlink_fmsg_pair_nest_end(fmsg);
+	if (err)
+		return err;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_string_pair_put);
+
+int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name,
+				 const void *value, u16 value_len)
+{
+	int err;
+
+	err = devlink_fmsg_pair_nest_start(fmsg, name);
+	if (err)
+		return err;
+
+	err = devlink_fmsg_binary_put(fmsg, value, value_len);
+	if (err)
+		return err;
+
+	err = devlink_fmsg_pair_nest_end(fmsg);
+	if (err)
+		return err;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_put);
+
+static int
+devlink_fmsg_item_fill_type(struct devlink_fmsg_item *msg, struct sk_buff *skb)
+{
+	switch (msg->nla_type) {
+	case NLA_FLAG:
+	case NLA_U8:
+	case NLA_U32:
+	case NLA_U64:
+	case NLA_NUL_STRING:
+	case NLA_BINARY:
+		return nla_put_u8(skb, DEVLINK_ATTR_FMSG_OBJ_VALUE_TYPE,
+				  msg->nla_type);
+	default:
+		return -EINVAL;
+	}
+}
+
+static int
+devlink_fmsg_item_fill_data(struct devlink_fmsg_item *msg, struct sk_buff *skb)
+{
+	int attrtype = DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA;
+	u8 tmp;
+
+	switch (msg->nla_type) {
+	case NLA_FLAG:
+		/* Always provide flag data, regardless of its value */
+		tmp = *(bool *) msg->value;
+
+		return nla_put_u8(skb, attrtype, tmp);
+	case NLA_U8:
+		return nla_put_u8(skb, attrtype, *(u8 *) msg->value);
+	case NLA_U32:
+		return nla_put_u32(skb, attrtype, *(u32 *) msg->value);
+	case NLA_U64:
+		return nla_put_u64_64bit(skb, attrtype, *(u64 *) msg->value,
+					 DEVLINK_ATTR_PAD);
+	case NLA_NUL_STRING:
+		return nla_put_string(skb, attrtype, (char *) &msg->value);
+	case NLA_BINARY:
+		return nla_put(skb, attrtype, msg->len, (void *) &msg->value);
+	default:
+		return -EINVAL;
+	}
+}
+
+static int
+devlink_fmsg_prepare_skb(struct devlink_fmsg *fmsg, struct sk_buff *skb,
+			 int *start)
+{
+	struct devlink_fmsg_item *item;
+	struct nlattr *fmsg_nlattr;
+	int i = 0;
+	int err;
+
+	fmsg_nlattr = nla_nest_start(skb, DEVLINK_ATTR_FMSG);
+	if (!fmsg_nlattr)
+		return -EMSGSIZE;
+
+	list_for_each_entry(item, &fmsg->item_list, list) {
+		if (i < *start) {
+			i++;
+			continue;
+		}
+
+		switch (item->attrtype) {
+		case DEVLINK_ATTR_FMSG_OBJ_NEST_START:
+		case DEVLINK_ATTR_FMSG_PAIR_NEST_START:
+		case DEVLINK_ATTR_FMSG_ARR_NEST_START:
+		case DEVLINK_ATTR_FMSG_NEST_END:
+			err = nla_put_flag(skb, item->attrtype);
+			break;
+		case DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA:
+			err = devlink_fmsg_item_fill_type(item, skb);
+			if (err)
+				break;
+			err = devlink_fmsg_item_fill_data(item, skb);
+			break;
+		case DEVLINK_ATTR_FMSG_OBJ_NAME:
+			err = nla_put_string(skb, item->attrtype,
+					     (char *) &item->value);
+			break;
+		default:
+			err = -EINVAL;
+			break;
+		}
+		if (!err)
+			*start = ++i;
+		else
+			break;
+	}
+
+	nla_nest_end(skb, fmsg_nlattr);
+	return err;
+}
+
+static int devlink_fmsg_snd(struct devlink_fmsg *fmsg,
+			    struct genl_info *info,
+			    enum devlink_command cmd, int flags)
+{
+	struct nlmsghdr *nlh;
+	struct sk_buff *skb;
+	bool last = false;
+	int index = 0;
+	void *hdr;
+	int err;
+
+	while (!last) {
+		int tmp_index = index;
+
+		skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+		if (!skb)
+			return -ENOMEM;
+
+		hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
+				  &devlink_nl_family, flags | NLM_F_MULTI, cmd);
+		if (!hdr) {
+			err = -EMSGSIZE;
+			goto nla_put_failure;
+		}
+
+		err = devlink_fmsg_prepare_skb(fmsg, skb, &index);
+		if (!err)
+			last = true;
+		else if (err != -EMSGSIZE || tmp_index == index)
+			goto nla_put_failure;
+
+		genlmsg_end(skb, hdr);
+		err = genlmsg_reply(skb, info);
+		if (err)
+			return err;
+	}
+
+	skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+	nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq,
+			NLMSG_DONE, 0, flags | NLM_F_MULTI);
+	if (!nlh) {
+		err = -EMSGSIZE;
+		goto nla_put_failure;
+	}
+	err = genlmsg_reply(skb, info);
+	if (err)
+		return err;
+
+	return 0;
+
+nla_put_failure:
+	nlmsg_free(skb);
+	return err;
+}
+
 static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
-- 
cgit v1.2.3-59-g8ed1b


From 7afe335a8bede4e2839b0e0fa36ef629fe4a0206 Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Thu, 7 Feb 2019 11:36:35 +0200
Subject: devlink: Add health get command

Add devlink health get command to provide reporter/s data for user space.
Add the ability to get data per reporter or dump data from all available
reporters.

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Reviewed-by: Moshe Shemesh <moshe@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/devlink.h |  11 ++++
 net/core/devlink.c           | 149 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 160 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 076692209a9b..d8f20d6ce139 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -96,6 +96,8 @@ enum devlink_command {
 
 	DEVLINK_CMD_INFO_GET,		/* can dump */
 
+	DEVLINK_CMD_HEALTH_REPORTER_GET,
+
 	/* add new commands above here */
 	__DEVLINK_CMD_MAX,
 	DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1
@@ -310,6 +312,15 @@ enum devlink_attr {
 	DEVLINK_ATTR_FMSG_OBJ_NAME,		/* string */
 	DEVLINK_ATTR_FMSG_OBJ_VALUE_TYPE,	/* u8 */
 	DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA,	/* dynamic */
+
+	DEVLINK_ATTR_HEALTH_REPORTER,			/* nested */
+	DEVLINK_ATTR_HEALTH_REPORTER_NAME,		/* string */
+	DEVLINK_ATTR_HEALTH_REPORTER_STATE,		/* u8 */
+	DEVLINK_ATTR_HEALTH_REPORTER_ERR,		/* u64 */
+	DEVLINK_ATTR_HEALTH_REPORTER_RECOVER,		/* u64 */
+	DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS,		/* u64 */
+	DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD,	/* u64 */
+	DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER,	/* u8 */
 	/* add new attributes above here, update the policy in devlink.c */
 
 	__DEVLINK_ATTR_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 3eaa290831aa..86f7c0e5d4bc 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -4572,6 +4572,146 @@ int devlink_health_report(struct devlink_health_reporter *reporter,
 }
 EXPORT_SYMBOL_GPL(devlink_health_report);
 
+static struct devlink_health_reporter *
+devlink_health_reporter_get_from_info(struct devlink *devlink,
+				      struct genl_info *info)
+{
+	char *reporter_name;
+
+	if (!info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME])
+		return NULL;
+
+	reporter_name =
+		nla_data(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]);
+	return devlink_health_reporter_find_by_name(devlink, reporter_name);
+}
+
+static int
+devlink_nl_health_reporter_fill(struct sk_buff *msg,
+				struct devlink *devlink,
+				struct devlink_health_reporter *reporter,
+				enum devlink_command cmd, u32 portid,
+				u32 seq, int flags)
+{
+	struct nlattr *reporter_attr;
+	void *hdr;
+
+	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (devlink_nl_put_handle(msg, devlink))
+		goto genlmsg_cancel;
+
+	reporter_attr = nla_nest_start(msg, DEVLINK_ATTR_HEALTH_REPORTER);
+	if (!reporter_attr)
+		goto genlmsg_cancel;
+	if (nla_put_string(msg, DEVLINK_ATTR_HEALTH_REPORTER_NAME,
+			   reporter->ops->name))
+		goto reporter_nest_cancel;
+	if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_STATE,
+		       reporter->health_state))
+		goto reporter_nest_cancel;
+	if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_ERR,
+			      reporter->error_count, DEVLINK_ATTR_PAD))
+		goto reporter_nest_cancel;
+	if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_RECOVER,
+			      reporter->recovery_count, DEVLINK_ATTR_PAD))
+		goto reporter_nest_cancel;
+	if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD,
+			      reporter->graceful_period,
+			      DEVLINK_ATTR_PAD))
+		goto reporter_nest_cancel;
+	if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER,
+		       reporter->auto_recover))
+		goto reporter_nest_cancel;
+	if (reporter->dump_fmsg &&
+	    nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS,
+			      jiffies_to_msecs(reporter->dump_ts),
+			      DEVLINK_ATTR_PAD))
+		goto reporter_nest_cancel;
+
+	nla_nest_end(msg, reporter_attr);
+	genlmsg_end(msg, hdr);
+	return 0;
+
+reporter_nest_cancel:
+	nla_nest_end(msg, reporter_attr);
+genlmsg_cancel:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+static int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb,
+						   struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_health_reporter *reporter;
+	struct sk_buff *msg;
+	int err;
+
+	reporter = devlink_health_reporter_get_from_info(devlink, info);
+	if (!reporter)
+		return -EINVAL;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	err = devlink_nl_health_reporter_fill(msg, devlink, reporter,
+					      DEVLINK_CMD_HEALTH_REPORTER_GET,
+					      info->snd_portid, info->snd_seq,
+					      0);
+	if (err) {
+		nlmsg_free(msg);
+		return err;
+	}
+
+	return genlmsg_reply(msg, info);
+}
+
+static int
+devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg,
+					  struct netlink_callback *cb)
+{
+	struct devlink_health_reporter *reporter;
+	struct devlink *devlink;
+	int start = cb->args[0];
+	int idx = 0;
+	int err;
+
+	mutex_lock(&devlink_mutex);
+	list_for_each_entry(devlink, &devlink_list, list) {
+		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+			continue;
+		mutex_lock(&devlink->lock);
+		list_for_each_entry(reporter, &devlink->reporter_list,
+				    list) {
+			if (idx < start) {
+				idx++;
+				continue;
+			}
+			err = devlink_nl_health_reporter_fill(msg, devlink,
+							      reporter,
+							      DEVLINK_CMD_HEALTH_REPORTER_GET,
+							      NETLINK_CB(cb->skb).portid,
+							      cb->nlh->nlmsg_seq,
+							      NLM_F_MULTI);
+			if (err) {
+				mutex_unlock(&devlink->lock);
+				goto out;
+			}
+			idx++;
+		}
+		mutex_unlock(&devlink->lock);
+	}
+out:
+	mutex_unlock(&devlink_mutex);
+
+	cb->args[0] = idx;
+	return msg->len;
+}
+
 static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
@@ -4597,6 +4737,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_PARAM_VALUE_CMODE] = { .type = NLA_U8 },
 	[DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32 },
+	[DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING },
 };
 
 static const struct genl_ops devlink_nl_ops[] = {
@@ -4840,6 +4981,14 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
 		/* can be retrieved by unprivileged users */
 	},
+	{
+		.cmd = DEVLINK_CMD_HEALTH_REPORTER_GET,
+		.doit = devlink_nl_cmd_health_reporter_get_doit,
+		.dumpit = devlink_nl_cmd_health_reporter_get_dumpit,
+		.policy = devlink_nl_policy,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+		/* can be retrieved by unprivileged users */
+	},
 };
 
 static struct genl_family devlink_nl_family __ro_after_init = {
-- 
cgit v1.2.3-59-g8ed1b


From a1e55ec0a0c6969cb7e9d9080a84041bb7b2b6e6 Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Thu, 7 Feb 2019 11:36:36 +0200
Subject: devlink: Add health set command

Add devlink health set command, in order to set configuration parameters
for a specific reporter.
Supported parameters are:
- graceful_period: Time interval between auto recoveries (in msec)
- auto_recover: Determines if the devlink shall execute recover upon
		receiving error for the reporter

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Reviewed-by: Moshe Shemesh <moshe@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/devlink.h |  1 +
 net/core/devlink.c           | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index d8f20d6ce139..b03065a99884 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -97,6 +97,7 @@ enum devlink_command {
 	DEVLINK_CMD_INFO_GET,		/* can dump */
 
 	DEVLINK_CMD_HEALTH_REPORTER_GET,
+	DEVLINK_CMD_HEALTH_REPORTER_SET,
 
 	/* add new commands above here */
 	__DEVLINK_CMD_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 86f7c0e5d4bc..0b231fb76e59 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -4712,6 +4712,33 @@ out:
 	return msg->len;
 }
 
+static int
+devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb,
+					struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_health_reporter *reporter;
+
+	reporter = devlink_health_reporter_get_from_info(devlink, info);
+	if (!reporter)
+		return -EINVAL;
+
+	if (!reporter->ops->recover &&
+	    (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] ||
+	     info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]))
+		return -EOPNOTSUPP;
+
+	if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD])
+		reporter->graceful_period =
+			nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]);
+
+	if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])
+		reporter->auto_recover =
+			nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]);
+
+	return 0;
+}
+
 static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
@@ -4738,6 +4765,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32 },
 	[DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING },
+	[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64 },
+	[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 },
 };
 
 static const struct genl_ops devlink_nl_ops[] = {
@@ -4989,6 +5018,13 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
 		/* can be retrieved by unprivileged users */
 	},
+	{
+		.cmd = DEVLINK_CMD_HEALTH_REPORTER_SET,
+		.doit = devlink_nl_cmd_health_reporter_set_doit,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+	},
 };
 
 static struct genl_family devlink_nl_family __ro_after_init = {
-- 
cgit v1.2.3-59-g8ed1b


From 20a0943a5b237f7d59dc581e9e3637f5c87f1fde Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Thu, 7 Feb 2019 11:36:37 +0200
Subject: devlink: Add health recover command

Add devlink health recover command to the uapi, in order to allow the user
to execute a recover operation over a specific reporter.

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Reviewed-by: Moshe Shemesh <moshe@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/devlink.h |  1 +
 net/core/devlink.c           | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index b03065a99884..a3a97e6edad8 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -98,6 +98,7 @@ enum devlink_command {
 
 	DEVLINK_CMD_HEALTH_REPORTER_GET,
 	DEVLINK_CMD_HEALTH_REPORTER_SET,
+	DEVLINK_CMD_HEALTH_REPORTER_RECOVER,
 
 	/* add new commands above here */
 	__DEVLINK_CMD_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 0b231fb76e59..0e6b0e034863 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -4739,6 +4739,19 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb,
 	return 0;
 }
 
+static int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb,
+						       struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_health_reporter *reporter;
+
+	reporter = devlink_health_reporter_get_from_info(devlink, info);
+	if (!reporter)
+		return -EINVAL;
+
+	return devlink_health_reporter_recover(reporter, NULL);
+}
+
 static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
@@ -5025,6 +5038,13 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
 	},
+	{
+		.cmd = DEVLINK_CMD_HEALTH_REPORTER_RECOVER,
+		.doit = devlink_nl_cmd_health_reporter_recover_doit,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+	},
 };
 
 static struct genl_family devlink_nl_family __ro_after_init = {
-- 
cgit v1.2.3-59-g8ed1b


From fca42a2794e31379855c7d687055da43a6e05eef Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Thu, 7 Feb 2019 11:36:38 +0200
Subject: devlink: Add health diagnose command

Add devlink health diagnose command, in order to run a diagnose
operation over a specific reporter.

It is expected from driver's callback for diagnose command to fill it
via the devlink fmsg API. Devlink will parse it and convert it to
netlink nla API in order to pass it to the user.

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Reviewed-by: Moshe Shemesh <moshe@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/devlink.h |  1 +
 net/core/devlink.c           | 46 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index a3a97e6edad8..09be37137841 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -99,6 +99,7 @@ enum devlink_command {
 	DEVLINK_CMD_HEALTH_REPORTER_GET,
 	DEVLINK_CMD_HEALTH_REPORTER_SET,
 	DEVLINK_CMD_HEALTH_REPORTER_RECOVER,
+	DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE,
 
 	/* add new commands above here */
 	__DEVLINK_CMD_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 0e6b0e034863..1e8613db2bf7 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -4752,6 +4752,45 @@ static int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb,
 	return devlink_health_reporter_recover(reporter, NULL);
 }
 
+static int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb,
+							struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_health_reporter *reporter;
+	struct devlink_fmsg *fmsg;
+	int err;
+
+	reporter = devlink_health_reporter_get_from_info(devlink, info);
+	if (!reporter)
+		return -EINVAL;
+
+	if (!reporter->ops->diagnose)
+		return -EOPNOTSUPP;
+
+	fmsg = devlink_fmsg_alloc();
+	if (!fmsg)
+		return -ENOMEM;
+
+	err = devlink_fmsg_obj_nest_start(fmsg);
+	if (err)
+		goto out;
+
+	err = reporter->ops->diagnose(reporter, fmsg);
+	if (err)
+		goto out;
+
+	err = devlink_fmsg_obj_nest_end(fmsg);
+	if (err)
+		goto out;
+
+	err = devlink_fmsg_snd(fmsg, info,
+			       DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, 0);
+
+out:
+	devlink_fmsg_free(fmsg);
+	return err;
+}
+
 static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
@@ -5045,6 +5084,13 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
 	},
+	{
+		.cmd = DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE,
+		.doit = devlink_nl_cmd_health_reporter_diagnose_doit,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+	},
 };
 
 static struct genl_family devlink_nl_family __ro_after_init = {
-- 
cgit v1.2.3-59-g8ed1b


From 35455e23e6f3cffe20e2b948e57597a8dc240b1e Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Thu, 7 Feb 2019 11:36:39 +0200
Subject: devlink: Add health dump {get,clear} commands

Add devlink health dump commands, in order to run an dump operation
over a specific reporter.

The supported operations are dump_get in order to get last saved
dump (if not exist, dump now) and dump_clear to clear last saved
dump.

It is expected from driver's callback for diagnose command to fill it
via the devlink fmsg API. Devlink will parse it and convert it to
netlink nla API in order to pass it to the user.

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Reviewed-by: Moshe Shemesh <moshe@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/devlink.h |  2 ++
 net/core/devlink.c           | 63 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 09be37137841..72d9f7c89190 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -100,6 +100,8 @@ enum devlink_command {
 	DEVLINK_CMD_HEALTH_REPORTER_SET,
 	DEVLINK_CMD_HEALTH_REPORTER_RECOVER,
 	DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE,
+	DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET,
+	DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR,
 
 	/* add new commands above here */
 	__DEVLINK_CMD_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 1e8613db2bf7..7fbdba547d4f 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -4791,6 +4791,53 @@ out:
 	return err;
 }
 
+static int devlink_nl_cmd_health_reporter_dump_get_doit(struct sk_buff *skb,
+							struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_health_reporter *reporter;
+	int err;
+
+	reporter = devlink_health_reporter_get_from_info(devlink, info);
+	if (!reporter)
+		return -EINVAL;
+
+	if (!reporter->ops->dump)
+		return -EOPNOTSUPP;
+
+	mutex_lock(&reporter->dump_lock);
+	err = devlink_health_do_dump(reporter, NULL);
+	if (err)
+		goto out;
+
+	err = devlink_fmsg_snd(reporter->dump_fmsg, info,
+			       DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, 0);
+
+out:
+	mutex_unlock(&reporter->dump_lock);
+	return err;
+}
+
+static int
+devlink_nl_cmd_health_reporter_dump_clear_doit(struct sk_buff *skb,
+					       struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_health_reporter *reporter;
+
+	reporter = devlink_health_reporter_get_from_info(devlink, info);
+	if (!reporter)
+		return -EINVAL;
+
+	if (!reporter->ops->dump)
+		return -EOPNOTSUPP;
+
+	mutex_lock(&reporter->dump_lock);
+	devlink_health_dump_clear(reporter);
+	mutex_unlock(&reporter->dump_lock);
+	return 0;
+}
+
 static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
@@ -5091,6 +5138,22 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
 	},
+	{
+		.cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET,
+		.doit = devlink_nl_cmd_health_reporter_dump_get_doit,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+				  DEVLINK_NL_FLAG_NO_LOCK,
+	},
+	{
+		.cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR,
+		.doit = devlink_nl_cmd_health_reporter_dump_clear_doit,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+				  DEVLINK_NL_FLAG_NO_LOCK,
+	},
 };
 
 static struct genl_family devlink_nl_family __ro_after_init = {
-- 
cgit v1.2.3-59-g8ed1b


From 95b86d1c91ad3b19f882d9e70aa37c8e99e8dc17 Mon Sep 17 00:00:00 2001
From: Devesh Sharma <devesh.sharma@broadcom.com>
Date: Thu, 7 Feb 2019 01:31:27 -0500
Subject: RDMA/bnxt_re: Update kernel user abi to pass chip context

User space verbs provider library would need chip context.  Changing the
ABI to add chip version details in structure.  Furthermore, changing the
kernel driver ucontext allocation code to initialize the abi structure
with appropriate values.

As suggested by community, appended the new fields at the bottom of the
ABI structure and retaining to older fields as those were in the older
versions.

Keeping the ABI version at 1 and adding a new field in the ucontext
response structure to hold the component mask.  The user space library
should check pre-defined flags to figure out if a certain feature is
supported on not.

Signed-off-by: Devesh Sharma <devesh.sharma@broadcom.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/bnxt_re/ib_verbs.c | 17 ++++++++++++++---
 include/uapi/rdma/bnxt_re-abi.h          | 11 +++++++++++
 2 files changed, 25 insertions(+), 3 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index 08c1725f371a..1d7469e23cde 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -3692,9 +3692,10 @@ struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev,
 					   struct ib_udata *udata)
 {
 	struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev);
+	struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr;
 	struct bnxt_re_uctx_resp resp;
 	struct bnxt_re_ucontext *uctx;
-	struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr;
+	u32 chip_met_rev_num = 0;
 	int rc;
 
 	dev_dbg(rdev_to_dev(rdev), "ABI version requested %d",
@@ -3719,14 +3720,24 @@ struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev,
 	}
 	spin_lock_init(&uctx->sh_lock);
 
-	resp.dev_id = rdev->en_dev->pdev->devfn; /*Temp, Use idr_alloc instead*/
+	resp.comp_mask |= BNXT_RE_UCNTX_CMASK_HAVE_CCTX;
+	chip_met_rev_num = rdev->chip_ctx.chip_num;
+	chip_met_rev_num |= ((u32)rdev->chip_ctx.chip_rev & 0xFF) <<
+			     BNXT_RE_CHIP_ID0_CHIP_REV_SFT;
+	chip_met_rev_num |= ((u32)rdev->chip_ctx.chip_metal & 0xFF) <<
+			     BNXT_RE_CHIP_ID0_CHIP_MET_SFT;
+	resp.chip_id0 = chip_met_rev_num;
+	/* Future extension of chip info */
+	resp.chip_id1 = 0;
+	/*Temp, Use idr_alloc instead */
+	resp.dev_id = rdev->en_dev->pdev->devfn;
 	resp.max_qp = rdev->qplib_ctx.qpc_count;
 	resp.pg_size = PAGE_SIZE;
 	resp.cqe_sz = sizeof(struct cq_base);
 	resp.max_cqd = dev_attr->max_cq_wqes;
 	resp.rsvd    = 0;
 
-	rc = ib_copy_to_udata(udata, &resp, sizeof(resp));
+	rc = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp)));
 	if (rc) {
 		dev_err(rdev_to_dev(rdev), "Failed to copy user context");
 		rc = -EFAULT;
diff --git a/include/uapi/rdma/bnxt_re-abi.h b/include/uapi/rdma/bnxt_re-abi.h
index a7a6111e50c7..dc52e3cf574c 100644
--- a/include/uapi/rdma/bnxt_re-abi.h
+++ b/include/uapi/rdma/bnxt_re-abi.h
@@ -44,6 +44,14 @@
 
 #define BNXT_RE_ABI_VERSION	1
 
+#define BNXT_RE_CHIP_ID0_CHIP_NUM_SFT		0x00
+#define BNXT_RE_CHIP_ID0_CHIP_REV_SFT		0x10
+#define BNXT_RE_CHIP_ID0_CHIP_MET_SFT		0x18
+
+enum {
+	BNXT_RE_UCNTX_CMASK_HAVE_CCTX = 0x1ULL
+};
+
 struct bnxt_re_uctx_resp {
 	__u32 dev_id;
 	__u32 max_qp;
@@ -51,6 +59,9 @@ struct bnxt_re_uctx_resp {
 	__u32 cqe_sz;
 	__u32 max_cqd;
 	__u32 rsvd;
+	__aligned_u64 comp_mask;
+	__u32 chip_id0;
+	__u32 chip_id1;
 };
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From 998a8a8387ff5f65da456d1fc448dbb926fb5d78 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Thu, 7 Feb 2019 21:41:46 +0100
Subject: net: phy: let genphy_c45_read_link manage the devices to check

Let genphy_c45_read_link manage the devices to check, this removes
overhead from callers. Add C22EXT to the list of excluded devices
because it doesn't implement the status register. According to the
802.3 clause 45 spec registers 29.0 - 29.4 are reserved.

At the moment we have very few clause 45 PHY drivers, so we are
lacking experience whether other drivers will have to exclude further
devices, or may need to check PHY XS. If we should figure out that
list of devices to check needs to be configurable, I think best will
be to add a device list member to struct phy_driver.

v2:
- adjusted commit message
- exclude also device C22EXT from link checking

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/marvell10g.c | 10 +---------
 drivers/net/phy/phy-c45.c    | 18 ++++++++++--------
 include/linux/phy.h          |  2 +-
 include/uapi/linux/mdio.h    |  2 ++
 4 files changed, 14 insertions(+), 18 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/net/phy/marvell10g.c b/drivers/net/phy/marvell10g.c
index 296a537cdfcb..96a79c6c7810 100644
--- a/drivers/net/phy/marvell10g.c
+++ b/drivers/net/phy/marvell10g.c
@@ -428,16 +428,8 @@ static int mv3310_read_10gbr_status(struct phy_device *phydev)
 
 static int mv3310_read_status(struct phy_device *phydev)
 {
-	u32 mmd_mask = phydev->c45_ids.devices_in_package;
 	int val;
 
-	/* The vendor devads do not report link status.  Avoid the PHYXS
-	 * instance as there are three, and its status depends on the MAC
-	 * being appropriately configured for the negotiated speed.
-	 */
-	mmd_mask &= ~(BIT(MDIO_MMD_VEND1) | BIT(MDIO_MMD_VEND2) |
-		      BIT(MDIO_MMD_PHYXS));
-
 	phydev->speed = SPEED_UNKNOWN;
 	phydev->duplex = DUPLEX_UNKNOWN;
 	linkmode_zero(phydev->lp_advertising);
@@ -453,7 +445,7 @@ static int mv3310_read_status(struct phy_device *phydev)
 	if (val & MDIO_STAT1_LSTATUS)
 		return mv3310_read_10gbr_status(phydev);
 
-	val = genphy_c45_read_link(phydev, mmd_mask);
+	val = genphy_c45_read_link(phydev);
 	if (val < 0)
 		return val;
 
diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c
index c92d0fb7ec4f..6adfe1f6319e 100644
--- a/drivers/net/phy/phy-c45.c
+++ b/drivers/net/phy/phy-c45.c
@@ -118,17 +118,24 @@ EXPORT_SYMBOL_GPL(genphy_c45_aneg_done);
 /**
  * genphy_c45_read_link - read the overall link status from the MMDs
  * @phydev: target phy_device struct
- * @mmd_mask: MMDs to read status from
  *
  * Read the link status from the specified MMDs, and if they all indicate
  * that the link is up, set phydev->link to 1.  If an error is encountered,
  * a negative errno will be returned, otherwise zero.
  */
-int genphy_c45_read_link(struct phy_device *phydev, u32 mmd_mask)
+int genphy_c45_read_link(struct phy_device *phydev)
 {
+	u32 mmd_mask = phydev->c45_ids.devices_in_package;
 	int val, devad;
 	bool link = true;
 
+	/* The vendor devads and C22EXT do not report link status. Avoid the
+	 * PHYXS instance as its status may depend on the MAC being
+	 * appropriately configured for the negotiated speed.
+	 */
+	mmd_mask &= ~(MDIO_DEVS_VEND1 | MDIO_DEVS_VEND2 | MDIO_DEVS_C22EXT |
+		      MDIO_DEVS_PHYXS);
+
 	while (mmd_mask && link) {
 		devad = __ffs(mmd_mask);
 		mmd_mask &= ~BIT(devad);
@@ -266,16 +273,11 @@ EXPORT_SYMBOL_GPL(gen10g_config_aneg);
 
 int gen10g_read_status(struct phy_device *phydev)
 {
-	u32 mmd_mask = phydev->c45_ids.devices_in_package;
-
 	/* For now just lie and say it's 10G all the time */
 	phydev->speed = SPEED_10000;
 	phydev->duplex = DUPLEX_FULL;
 
-	/* Avoid reading the vendor MMDs */
-	mmd_mask &= ~(BIT(MDIO_MMD_VEND1) | BIT(MDIO_MMD_VEND2));
-
-	return genphy_c45_read_link(phydev, mmd_mask);
+	return genphy_c45_read_link(phydev);
 }
 EXPORT_SYMBOL_GPL(gen10g_read_status);
 
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 237dd035858a..f41bf651f6a0 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1094,7 +1094,7 @@ int genphy_write_mmd_unsupported(struct phy_device *phdev, int devnum,
 /* Clause 45 PHY */
 int genphy_c45_restart_aneg(struct phy_device *phydev);
 int genphy_c45_aneg_done(struct phy_device *phydev);
-int genphy_c45_read_link(struct phy_device *phydev, u32 mmd_mask);
+int genphy_c45_read_link(struct phy_device *phydev);
 int genphy_c45_read_lpa(struct phy_device *phydev);
 int genphy_c45_read_pma(struct phy_device *phydev);
 int genphy_c45_pma_setup_forced(struct phy_device *phydev);
diff --git a/include/uapi/linux/mdio.h b/include/uapi/linux/mdio.h
index d435b00d64ad..2e6e309f0847 100644
--- a/include/uapi/linux/mdio.h
+++ b/include/uapi/linux/mdio.h
@@ -123,6 +123,8 @@
 #define MDIO_DEVS_TC			MDIO_DEVS_PRESENT(MDIO_MMD_TC)
 #define MDIO_DEVS_AN			MDIO_DEVS_PRESENT(MDIO_MMD_AN)
 #define MDIO_DEVS_C22EXT		MDIO_DEVS_PRESENT(MDIO_MMD_C22EXT)
+#define MDIO_DEVS_VEND1			MDIO_DEVS_PRESENT(MDIO_MMD_VEND1)
+#define MDIO_DEVS_VEND2			MDIO_DEVS_PRESENT(MDIO_MMD_VEND2)
 
 /* Control register 2. */
 #define MDIO_PMA_CTRL2_TYPE		0x000f	/* PMA/PMD type selection */
-- 
cgit v1.2.3-59-g8ed1b


From 05f8bc82fc428dbaf41764e95167dd759769f33d Mon Sep 17 00:00:00 2001
From: Randy Li <ayaka@soulik.info>
Date: Thu, 10 Jan 2019 03:57:09 +0800
Subject: drm/fourcc: Add new P010, P016 video format
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

P010 is a planar 4:2:0 YUV with interleaved UV plane, 10 bits per
channel video format.

P012 is a planar 4:2:0 YUV 12 bits per channel

P016 is a planar 4:2:0 YUV with interleaved UV plane, 16 bits per
channel video format.

V3: Added P012 and fixed cpp for P010.
V4: format definition refined per review.
V5: Format comment block for each new pixel format.
V6: reversed Cb/Cr order in comments.
v7: reversed Cb/Cr order in comments of header files, remove
the wrong part of commit message.
V8: reversed V7 changes except commit message and rebased.
v9: used the new properties to describe those format and
rebased.

Cc: Daniel Stone <daniel@fooishbar.org>
Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>

Signed-off-by: Randy Li <ayaka@soulik.info>
Signed-off-by: Clint Taylor <clinton.a.taylor@intel.com>
Reviewed-by: Ayan Kumar Halder <ayan.halder@arm.com>
Reviewed-by: Neil Armstrong <narmstrong@baylibre.com>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: Neil Armstrong <narmstrong@baylibre.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190109195710.28501-2-ayaka@soulik.info
---
 drivers/gpu/drm/drm_fourcc.c  |  9 +++++++++
 include/uapi/drm/drm_fourcc.h | 21 +++++++++++++++++++++
 2 files changed, 30 insertions(+)

(limited to 'include/uapi')

diff --git a/drivers/gpu/drm/drm_fourcc.c b/drivers/gpu/drm/drm_fourcc.c
index d90ee03a84c6..ba7e19d4336c 100644
--- a/drivers/gpu/drm/drm_fourcc.c
+++ b/drivers/gpu/drm/drm_fourcc.c
@@ -238,6 +238,15 @@ const struct drm_format_info *__drm_format_info(u32 format)
 		{ .format = DRM_FORMAT_X0L2,		.depth = 0,  .num_planes = 1,
 		  .char_per_block = { 8, 0, 0 }, .block_w = { 2, 0, 0 }, .block_h = { 2, 0, 0 },
 		  .hsub = 2, .vsub = 2, .is_yuv = true },
+		{ .format = DRM_FORMAT_P010,            .depth = 0,  .num_planes = 2,
+		  .char_per_block = { 2, 4, 0 }, .block_w = { 1, 0, 0 }, .block_h = { 1, 0, 0 },
+		  .hsub = 2, .vsub = 2, .is_yuv = true},
+		{ .format = DRM_FORMAT_P012,		.depth = 0,  .num_planes = 2,
+		  .char_per_block = { 2, 4, 0 }, .block_w = { 1, 0, 0 }, .block_h = { 1, 0, 0 },
+		   .hsub = 2, .vsub = 2, .is_yuv = true},
+		{ .format = DRM_FORMAT_P016,		.depth = 0,  .num_planes = 2,
+		  .char_per_block = { 2, 4, 0 }, .block_w = { 1, 0, 0 }, .block_h = { 1, 0, 0 },
+		  .hsub = 2, .vsub = 2, .is_yuv = true},
 	};
 
 	unsigned int i;
diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h
index 91d08a23f125..0fa360223255 100644
--- a/include/uapi/drm/drm_fourcc.h
+++ b/include/uapi/drm/drm_fourcc.h
@@ -195,6 +195,27 @@ extern "C" {
 #define DRM_FORMAT_NV24		fourcc_code('N', 'V', '2', '4') /* non-subsampled Cr:Cb plane */
 #define DRM_FORMAT_NV42		fourcc_code('N', 'V', '4', '2') /* non-subsampled Cb:Cr plane */
 
+/*
+ * 2 plane YCbCr MSB aligned
+ * index 0 = Y plane, [15:0] Y:x [10:6] little endian
+ * index 1 = Cr:Cb plane, [31:0] Cr:x:Cb:x [10:6:10:6] little endian
+ */
+#define DRM_FORMAT_P010		fourcc_code('P', '0', '1', '0') /* 2x2 subsampled Cr:Cb plane 10 bits per channel */
+
+/*
+ * 2 plane YCbCr MSB aligned
+ * index 0 = Y plane, [15:0] Y:x [12:4] little endian
+ * index 1 = Cr:Cb plane, [31:0] Cr:x:Cb:x [12:4:12:4] little endian
+ */
+#define DRM_FORMAT_P012		fourcc_code('P', '0', '1', '2') /* 2x2 subsampled Cr:Cb plane 12 bits per channel */
+
+/*
+ * 2 plane YCbCr MSB aligned
+ * index 0 = Y plane, [15:0] Y little endian
+ * index 1 = Cr:Cb plane, [31:0] Cr:Cb [16:16] little endian
+ */
+#define DRM_FORMAT_P016		fourcc_code('P', '0', '1', '6') /* 2x2 subsampled Cr:Cb plane 16 bits per channel */
+
 /*
  * 3 plane YCbCr
  * index 0: Y plane, [7:0] Y
-- 
cgit v1.2.3-59-g8ed1b


From 2c1619edef61a03cb516efaa81750784c3071d10 Mon Sep 17 00:00:00 2001
From: Danit Goldberg <danitg@mellanox.com>
Date: Thu, 24 Jan 2019 14:18:15 +0200
Subject: IB/cma: Define option to set ack timeout and pack tos_set

Define new option in 'rdma_set_option' to override calculated QP timeout
when requested to provide QP attributes to modify a QP.

At the same time, pack tos_set to be bitfield.

Signed-off-by: Danit Goldberg <danitg@mellanox.com>
Reviewed-by: Moni Shoua <monis@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cma.c      | 32 ++++++++++++++++++++++++++++++++
 drivers/infiniband/core/cma_priv.h |  4 +++-
 drivers/infiniband/core/ucma.c     |  7 +++++++
 include/rdma/rdma_cm.h             |  1 +
 include/uapi/rdma/rdma_user_cm.h   |  4 ++++
 5 files changed, 47 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index e15546ae4d0f..83aa2ad0c27e 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -888,6 +888,7 @@ struct rdma_cm_id *__rdma_create_id(struct net *net,
 	id_priv->id.ps = ps;
 	id_priv->id.qp_type = qp_type;
 	id_priv->tos_set = false;
+	id_priv->timeout_set = false;
 	id_priv->gid_type = IB_GID_TYPE_IB;
 	spin_lock_init(&id_priv->lock);
 	mutex_init(&id_priv->qp_mutex);
@@ -1130,6 +1131,9 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr,
 	} else
 		ret = -ENOSYS;
 
+	if ((*qp_attr_mask & IB_QP_TIMEOUT) && id_priv->timeout_set)
+		qp_attr->timeout = id_priv->timeout;
+
 	return ret;
 }
 EXPORT_SYMBOL(rdma_init_qp_attr);
@@ -2490,6 +2494,34 @@ void rdma_set_service_type(struct rdma_cm_id *id, int tos)
 }
 EXPORT_SYMBOL(rdma_set_service_type);
 
+/**
+ * rdma_set_ack_timeout() - Set the ack timeout of QP associated
+ *                          with a connection identifier.
+ * @id: Communication identifier to associated with service type.
+ * @timeout: Ack timeout to set a QP, expressed as 4.096 * 2^(timeout) usec.
+ *
+ * This function should be called before rdma_connect() on active side,
+ * and on passive side before rdma_accept(). It is applicable to primary
+ * path only. The timeout will affect the local side of the QP, it is not
+ * negotiated with remote side and zero disables the timer.
+ *
+ * Return: 0 for success
+ */
+int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout)
+{
+	struct rdma_id_private *id_priv;
+
+	if (id->qp_type != IB_QPT_RC)
+		return -EINVAL;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	id_priv->timeout = timeout;
+	id_priv->timeout_set = true;
+
+	return 0;
+}
+EXPORT_SYMBOL(rdma_set_ack_timeout);
+
 static void cma_query_handler(int status, struct sa_path_rec *path_rec,
 			      void *context)
 {
diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h
index cf47c69436a7..ca7307277518 100644
--- a/drivers/infiniband/core/cma_priv.h
+++ b/drivers/infiniband/core/cma_priv.h
@@ -84,9 +84,11 @@ struct rdma_id_private {
 	u32			options;
 	u8			srq;
 	u8			tos;
-	bool			tos_set;
+	u8			tos_set:1;
+	u8                      timeout_set:1;
 	u8			reuseaddr;
 	u8			afonly;
+	u8			timeout;
 	enum ib_gid_type	gid_type;
 
 	/*
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 01d68ed46c1b..7468b26b8a01 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -1236,6 +1236,13 @@ static int ucma_set_option_id(struct ucma_context *ctx, int optname,
 		}
 		ret = rdma_set_afonly(ctx->cm_id, *((int *) optval) ? 1 : 0);
 		break;
+	case RDMA_OPTION_ID_ACK_TIMEOUT:
+		if (optlen != sizeof(u8)) {
+			ret = -EINVAL;
+			break;
+		}
+		ret = rdma_set_ack_timeout(ctx->cm_id, *((u8 *)optval));
+		break;
 	default:
 		ret = -ENOSYS;
 	}
diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h
index 60987a5903b7..71f48cfdc24c 100644
--- a/include/rdma/rdma_cm.h
+++ b/include/rdma/rdma_cm.h
@@ -374,6 +374,7 @@ int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse);
  */
 int rdma_set_afonly(struct rdma_cm_id *id, int afonly);
 
+int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout);
  /**
  * rdma_get_service_id - Return the IB service ID for a specified address.
  * @id: Communication identifier associated with the address.
diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h
index 0d1e78ebad05..e42940a215a3 100644
--- a/include/uapi/rdma/rdma_user_cm.h
+++ b/include/uapi/rdma/rdma_user_cm.h
@@ -300,6 +300,10 @@ enum {
 	RDMA_OPTION_ID_TOS	 = 0,
 	RDMA_OPTION_ID_REUSEADDR = 1,
 	RDMA_OPTION_ID_AFONLY	 = 2,
+	RDMA_OPTION_ID_ACK_TIMEOUT = 3
+};
+
+enum {
 	RDMA_OPTION_IB_PATH	 = 1
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 3b5e74e0afe3382f9354b657714ac40673b7c597 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Fri, 8 Feb 2019 19:25:22 +0100
Subject: net: phy: disregard "Clause 22 registers present" bit in
 get_phy_c45_devs_in_pkg

Bit 0 in register 1.5 doesn't represent a device but is a flag that
Clause 22 registers are present. Therefore disregard this bit when
populating the device list. If code needs this information it
should read register 1.5 directly instead of accessing the device
list.
Because this bit doesn't represent a device don't define a
MDIO_MMD_XYZ constant, just define a MDIO_DEVS_XYZ constant for
the flag in the device list bitmap.

v2:
- make masking of bit 0 more explicit
- improve commit message

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy_device.c | 3 +++
 include/uapi/linux/mdio.h    | 1 +
 2 files changed, 4 insertions(+)

(limited to 'include/uapi')

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 9369e1323c39..d4fc1fd8af41 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -684,6 +684,9 @@ static int get_phy_c45_devs_in_pkg(struct mii_bus *bus, int addr, int dev_addr,
 		return -EIO;
 	*devices_in_package |= (phy_reg & 0xffff);
 
+	/* Bit 0 doesn't represent a device, it indicates c22 regs presence */
+	*devices_in_package &= ~BIT(0);
+
 	return 0;
 }
 
diff --git a/include/uapi/linux/mdio.h b/include/uapi/linux/mdio.h
index 2e6e309f0847..0e012b168e4d 100644
--- a/include/uapi/linux/mdio.h
+++ b/include/uapi/linux/mdio.h
@@ -115,6 +115,7 @@
 
 /* Device present registers. */
 #define MDIO_DEVS_PRESENT(devad)	(1 << (devad))
+#define MDIO_DEVS_C22PRESENT		MDIO_DEVS_PRESENT(0)
 #define MDIO_DEVS_PMAPMD		MDIO_DEVS_PRESENT(MDIO_MMD_PMAPMD)
 #define MDIO_DEVS_WIS			MDIO_DEVS_PRESENT(MDIO_MMD_WIS)
 #define MDIO_DEVS_PCS			MDIO_DEVS_PRESENT(MDIO_MMD_PCS)
-- 
cgit v1.2.3-59-g8ed1b


From 180cf62cec0418a64dade18b3575047af46c6335 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sat, 9 Feb 2019 00:05:36 +0100
Subject: batman-adv: Fix typo "reseved" -> "reserved"

checkpatch.pl complains since commit 45e417022023 ("scripts/spelling.txt:
add more spellings to spelling.txt") about an additional spelling mistake
in batman-adv:`

  CHECK: 'reseved' may be misspelled - perhaps 'reserved'?
  #232: FILE: include/uapi/linux/batadv_packet.h:232:
  + * @flags: reseved for routing relevant flags - currently always 0

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 include/uapi/linux/batadv_packet.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/batadv_packet.h b/include/uapi/linux/batadv_packet.h
index 7eb2936a8e22..c99336f4eefe 100644
--- a/include/uapi/linux/batadv_packet.h
+++ b/include/uapi/linux/batadv_packet.h
@@ -229,7 +229,7 @@ struct batadv_ogm_packet {
  * @packet_type: batman-adv packet type, part of the general header
  * @version: batman-adv protocol version, part of the general header
  * @ttl: time to live for this packet, part of the general header
- * @flags: reseved for routing relevant flags - currently always 0
+ * @flags: reserved for routing relevant flags - currently always 0
  * @seqno: sequence number
  * @orig: originator mac address
  * @tvlv_len: length of the appended tvlv buffer (in bytes)
-- 
cgit v1.2.3-59-g8ed1b


From 60040513536097584c3d55b39acdfa7080645d80 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Fri, 23 Nov 2018 12:14:56 +0100
Subject: batman-adv: Prepare framework for mesh genl config

The batman-adv configuration interface was implemented solely using sysfs.
This approach was condemned by non-batadv developers as "huge mistake".
Instead a netlink/genl based implementation was suggested.

The main objects for this configuration is the mesh/soft-interface object.
Its actual object in memory already contains most of the available
configuration settings. The genl interface reflects this by allowing to
get/set it using the mesh specific commands.

The BATADV_CMD_GET_MESH_INFO (or short version BATADV_CMD_GET_MESH) is
reused as get command because it already provides the content of other
information from the mesh/soft-interface which are not yet configuration
specific.

The set command BATADV_CMD_SET_MESH will also notify interested userspace
listeners of the "config" mcast group using the BATADV_CMD_SET_MESH command
message type that settings might have been changed and what the current
values are.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 include/uapi/linux/batman_adv.h |  16 +++-
 net/batman-adv/netlink.c        | 160 ++++++++++++++++++++++++----------------
 2 files changed, 111 insertions(+), 65 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index a28e76a7e0a2..b3394aac9a88 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -27,6 +27,7 @@
 
 #define BATADV_NL_NAME "batadv"
 
+#define BATADV_NL_MCAST_GROUP_CONFIG	"config"
 #define BATADV_NL_MCAST_GROUP_TPMETER	"tpmeter"
 
 /**
@@ -372,10 +373,14 @@ enum batadv_nl_commands {
 	BATADV_CMD_UNSPEC,
 
 	/**
-	 * @BATADV_CMD_GET_MESH_INFO: Query basic information about batman-adv
-	 * device
+	 * @BATADV_CMD_GET_MESH: Get attributes from softif/mesh
 	 */
-	BATADV_CMD_GET_MESH_INFO,
+	BATADV_CMD_GET_MESH,
+
+	/**
+	 * @BATADV_CMD_GET_MESH_INFO: Alias for @BATADV_CMD_GET_MESH
+	 */
+	BATADV_CMD_GET_MESH_INFO = BATADV_CMD_GET_MESH,
 
 	/**
 	 * @BATADV_CMD_TP_METER: Start a tp meter session
@@ -443,6 +448,11 @@ enum batadv_nl_commands {
 	 */
 	BATADV_CMD_GET_MCAST_FLAGS,
 
+	/**
+	 * @BATADV_CMD_SET_MESH: Set attributes for softif/mesh
+	 */
+	BATADV_CMD_SET_MESH,
+
 	/* add new commands above here */
 
 	/**
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index e56c40d1bfe0..179cc7cfcb70 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -62,6 +62,7 @@ struct genl_family batadv_netlink_family;
 
 /* multicast groups */
 enum batadv_netlink_multicast_groups {
+	BATADV_NL_MCGRP_CONFIG,
 	BATADV_NL_MCGRP_TPMETER,
 };
 
@@ -78,6 +79,7 @@ enum batadv_genl_ops_flags {
 };
 
 static const struct genl_multicast_group batadv_netlink_mcgrps[] = {
+	[BATADV_NL_MCGRP_CONFIG] = { .name = BATADV_NL_MCAST_GROUP_CONFIG },
 	[BATADV_NL_MCGRP_TPMETER] = { .name = BATADV_NL_MCAST_GROUP_TPMETER },
 };
 
@@ -138,20 +140,29 @@ batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype)
 }
 
 /**
- * batadv_netlink_mesh_info_put() - fill in generic information about mesh
- *  interface
- * @msg: netlink message to be sent back
- * @soft_iface: interface for which the data should be taken
+ * batadv_netlink_mesh_fill() - Fill message with mesh attributes
+ * @msg: Netlink message to dump into
+ * @bat_priv: the bat priv with all the soft interface information
+ * @cmd: type of message to generate
+ * @portid: Port making netlink request
+ * @seq: sequence number for message
+ * @flags: Additional flags for message
  *
- * Return: 0 on success, < 0 on error
+ * Return: 0 on success or negative error number in case of failure
  */
-static int
-batadv_netlink_mesh_info_put(struct sk_buff *msg, struct net_device *soft_iface)
+static int batadv_netlink_mesh_fill(struct sk_buff *msg,
+				    struct batadv_priv *bat_priv,
+				    enum batadv_nl_commands cmd,
+				    u32 portid, u32 seq, int flags)
 {
-	struct batadv_priv *bat_priv = netdev_priv(soft_iface);
+	struct net_device *soft_iface = bat_priv->soft_iface;
 	struct batadv_hard_iface *primary_if = NULL;
 	struct net_device *hard_iface;
-	int ret = -ENOBUFS;
+	void *hdr;
+
+	hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, flags, cmd);
+	if (!hdr)
+		return -ENOBUFS;
 
 	if (nla_put_string(msg, BATADV_ATTR_VERSION, BATADV_SOURCE_VERSION) ||
 	    nla_put_string(msg, BATADV_ATTR_ALGO_NAME,
@@ -162,16 +173,16 @@ batadv_netlink_mesh_info_put(struct sk_buff *msg, struct net_device *soft_iface)
 		    soft_iface->dev_addr) ||
 	    nla_put_u8(msg, BATADV_ATTR_TT_TTVN,
 		       (u8)atomic_read(&bat_priv->tt.vn)))
-		goto out;
+		goto nla_put_failure;
 
 #ifdef CONFIG_BATMAN_ADV_BLA
 	if (nla_put_u16(msg, BATADV_ATTR_BLA_CRC,
 			ntohs(bat_priv->bla.claim_dest.group)))
-		goto out;
+		goto nla_put_failure;
 #endif
 
 	if (batadv_mcast_mesh_info_put(msg, bat_priv))
-		goto out;
+		goto nla_put_failure;
 
 	primary_if = batadv_primary_if_get_selected(bat_priv);
 	if (primary_if && primary_if->if_status == BATADV_IF_ACTIVE) {
@@ -183,77 +194,95 @@ batadv_netlink_mesh_info_put(struct sk_buff *msg, struct net_device *soft_iface)
 				   hard_iface->name) ||
 		    nla_put(msg, BATADV_ATTR_HARD_ADDRESS, ETH_ALEN,
 			    hard_iface->dev_addr))
-			goto out;
+			goto nla_put_failure;
 	}
 
-	ret = 0;
+	if (primary_if)
+		batadv_hardif_put(primary_if);
 
- out:
+	genlmsg_end(msg, hdr);
+	return 0;
+
+nla_put_failure:
 	if (primary_if)
 		batadv_hardif_put(primary_if);
 
-	return ret;
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
 }
 
 /**
- * batadv_netlink_get_mesh_info() - handle incoming BATADV_CMD_GET_MESH_INFO
- *  netlink request
- * @skb: received netlink message
- * @info: receiver information
+ * batadv_netlink_notify_mesh() - send softif attributes to listener
+ * @bat_priv: the bat priv with all the soft interface information
  *
  * Return: 0 on success, < 0 on error
  */
-static int
-batadv_netlink_get_mesh_info(struct sk_buff *skb, struct genl_info *info)
+static int batadv_netlink_notify_mesh(struct batadv_priv *bat_priv)
 {
-	struct net *net = genl_info_net(info);
-	struct net_device *soft_iface;
-	struct sk_buff *msg = NULL;
-	void *msg_head;
-	int ifindex;
+	struct sk_buff *msg;
 	int ret;
 
-	if (!info->attrs[BATADV_ATTR_MESH_IFINDEX])
-		return -EINVAL;
-
-	ifindex = nla_get_u32(info->attrs[BATADV_ATTR_MESH_IFINDEX]);
-	if (!ifindex)
-		return -EINVAL;
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
 
-	soft_iface = dev_get_by_index(net, ifindex);
-	if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
-		ret = -ENODEV;
-		goto out;
+	ret = batadv_netlink_mesh_fill(msg, bat_priv, BATADV_CMD_SET_MESH,
+				       0, 0, 0);
+	if (ret < 0) {
+		nlmsg_free(msg);
+		return ret;
 	}
 
-	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-	if (!msg) {
-		ret = -ENOMEM;
-		goto out;
-	}
+	genlmsg_multicast_netns(&batadv_netlink_family,
+				dev_net(bat_priv->soft_iface), msg, 0,
+				BATADV_NL_MCGRP_CONFIG, GFP_KERNEL);
 
-	msg_head = genlmsg_put(msg, info->snd_portid, info->snd_seq,
-			       &batadv_netlink_family, 0,
-			       BATADV_CMD_GET_MESH_INFO);
-	if (!msg_head) {
-		ret = -ENOBUFS;
-		goto out;
-	}
+	return 0;
+}
 
-	ret = batadv_netlink_mesh_info_put(msg, soft_iface);
+/**
+ * batadv_netlink_get_mesh() - Get softif attributes
+ * @skb: Netlink message with request data
+ * @info: receiver information
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+static int batadv_netlink_get_mesh(struct sk_buff *skb, struct genl_info *info)
+{
+	struct batadv_priv *bat_priv = info->user_ptr[0];
+	struct sk_buff *msg;
+	int ret;
 
- out:
-	if (soft_iface)
-		dev_put(soft_iface);
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
 
-	if (ret) {
-		if (msg)
-			nlmsg_free(msg);
+	ret = batadv_netlink_mesh_fill(msg, bat_priv, BATADV_CMD_GET_MESH,
+				       info->snd_portid, info->snd_seq, 0);
+	if (ret < 0) {
+		nlmsg_free(msg);
 		return ret;
 	}
 
-	genlmsg_end(msg, msg_head);
-	return genlmsg_reply(msg, info);
+	ret = genlmsg_reply(msg, info);
+
+	return ret;
+}
+
+/**
+ * batadv_netlink_set_mesh() - Set softif attributes
+ * @skb: Netlink message with request data
+ * @info: receiver information
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info)
+{
+	struct batadv_priv *bat_priv = info->user_ptr[0];
+
+	batadv_netlink_notify_mesh(bat_priv);
+
+	return 0;
 }
 
 /**
@@ -600,10 +629,11 @@ static void batadv_post_doit(const struct genl_ops *ops, struct sk_buff *skb,
 
 static const struct genl_ops batadv_netlink_ops[] = {
 	{
-		.cmd = BATADV_CMD_GET_MESH_INFO,
-		.flags = GENL_ADMIN_PERM,
+		.cmd = BATADV_CMD_GET_MESH,
+		/* can be retrieved by unprivileged users */
 		.policy = batadv_netlink_policy,
-		.doit = batadv_netlink_get_mesh_info,
+		.doit = batadv_netlink_get_mesh,
+		.internal_flags = BATADV_FLAG_NEED_MESH,
 	},
 	{
 		.cmd = BATADV_CMD_TP_METER,
@@ -685,7 +715,13 @@ static const struct genl_ops batadv_netlink_ops[] = {
 		.policy = batadv_netlink_policy,
 		.dumpit = batadv_mcast_flags_dump,
 	},
-
+	{
+		.cmd = BATADV_CMD_SET_MESH,
+		.flags = GENL_ADMIN_PERM,
+		.policy = batadv_netlink_policy,
+		.doit = batadv_netlink_set_mesh,
+		.internal_flags = BATADV_FLAG_NEED_MESH,
+	},
 };
 
 struct genl_family batadv_netlink_family __ro_after_init = {
-- 
cgit v1.2.3-59-g8ed1b


From 5c55a40fa801df2d807141319c0fdbb3939c3947 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Fri, 23 Nov 2018 12:33:17 +0100
Subject: batman-adv: Prepare framework for hardif genl config

The batman-adv configuration interface was implemented solely using sysfs.
This approach was condemned by non-batadv developers as "huge mistake".
Instead a netlink/genl based implementation was suggested.

Beside the mesh/soft-interface specific configuration, the
slave/hard-interface have B.A.T.M.A.N. V specific configuration settings.
The genl interface reflects this by allowing to get/set it using the
hard-interface specific commands.

The BATADV_CMD_GET_HARDIFS (or short version BATADV_CMD_GET_HARDIF) is
reused as get command because it already allow sto dump the content of
other information from the slave/hard-interface which are not yet
configuration specific.

The set command BATADV_CMD_SET_HARDIF will also notify interested userspace
listeners of the "config" mcast group using the BATADV_CMD_SET_HARDIF
command message type that settings might have been changed and what the
current values are.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 include/uapi/linux/batman_adv.h |  16 ++-
 net/batman-adv/netlink.c        | 240 ++++++++++++++++++++++++++++++++++++----
 2 files changed, 233 insertions(+), 23 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index b3394aac9a88..62456a087ef6 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -398,9 +398,15 @@ enum batadv_nl_commands {
 	BATADV_CMD_GET_ROUTING_ALGOS,
 
 	/**
-	 * @BATADV_CMD_GET_HARDIFS: Query list of hard interfaces
+	 * @BATADV_CMD_GET_HARDIF: Get attributes from a hardif of the
+	 *  current softif
 	 */
-	BATADV_CMD_GET_HARDIFS,
+	BATADV_CMD_GET_HARDIF,
+
+	/**
+	 * @BATADV_CMD_GET_HARDIFS: Alias for @BATADV_CMD_GET_HARDIF
+	 */
+	BATADV_CMD_GET_HARDIFS = BATADV_CMD_GET_HARDIF,
 
 	/**
 	 * @BATADV_CMD_GET_TRANSTABLE_LOCAL: Query list of local translations
@@ -453,6 +459,12 @@ enum batadv_nl_commands {
 	 */
 	BATADV_CMD_SET_MESH,
 
+	/**
+	 * @BATADV_CMD_SET_HARDIF: Set attributes for hardif of the
+	 *  current softif
+	 */
+	BATADV_CMD_SET_HARDIF,
+
 	/* add new commands above here */
 
 	/**
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 179cc7cfcb70..a184e4225fb5 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -21,6 +21,7 @@
 
 #include <linux/atomic.h>
 #include <linux/bitops.h>
+#include <linux/bug.h>
 #include <linux/byteorder/generic.h>
 #include <linux/cache.h>
 #include <linux/err.h>
@@ -76,6 +77,13 @@ enum batadv_genl_ops_flags {
 	 *  saved in info->user_ptr[0]
 	 */
 	BATADV_FLAG_NEED_MESH = BIT(0),
+
+	/**
+	 * @BATADV_FLAG_NEED_HARDIF: request requires valid hard interface in
+	 *  attribute BATADV_ATTR_HARD_IFINDEX and expects a pointer to it to be
+	 *  saved in info->user_ptr[1]
+	 */
+	BATADV_FLAG_NEED_HARDIF = BIT(1),
 };
 
 static const struct genl_multicast_group batadv_netlink_mcgrps[] = {
@@ -446,29 +454,38 @@ batadv_netlink_tp_meter_cancel(struct sk_buff *skb, struct genl_info *info)
 }
 
 /**
- * batadv_netlink_dump_hardif_entry() - Dump one hard interface into a message
+ * batadv_netlink_hardif_fill() - Fill message with hardif attributes
  * @msg: Netlink message to dump into
+ * @bat_priv: the bat priv with all the soft interface information
+ * @hard_iface: hard interface which was modified
+ * @cmd: type of message to generate
  * @portid: Port making netlink request
+ * @seq: sequence number for message
+ * @flags: Additional flags for message
  * @cb: Control block containing additional options
- * @hard_iface: Hard interface to dump
  *
- * Return: error code, or 0 on success
+ * Return: 0 on success or negative error number in case of failure
  */
-static int
-batadv_netlink_dump_hardif_entry(struct sk_buff *msg, u32 portid,
-				 struct netlink_callback *cb,
-				 struct batadv_hard_iface *hard_iface)
+static int batadv_netlink_hardif_fill(struct sk_buff *msg,
+				      struct batadv_priv *bat_priv,
+				      struct batadv_hard_iface *hard_iface,
+				      enum batadv_nl_commands cmd,
+				      u32 portid, u32 seq, int flags,
+				      struct netlink_callback *cb)
 {
 	struct net_device *net_dev = hard_iface->net_dev;
 	void *hdr;
 
-	hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq,
-			  &batadv_netlink_family, NLM_F_MULTI,
-			  BATADV_CMD_GET_HARDIFS);
+	hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, flags, cmd);
 	if (!hdr)
-		return -EMSGSIZE;
+		return -ENOBUFS;
+
+	if (cb)
+		genl_dump_check_consistent(cb, hdr);
 
-	genl_dump_check_consistent(cb, hdr);
+	if (nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX,
+			bat_priv->soft_iface->ifindex))
+		goto nla_put_failure;
 
 	if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX,
 			net_dev->ifindex) ||
@@ -486,24 +503,107 @@ batadv_netlink_dump_hardif_entry(struct sk_buff *msg, u32 portid,
 	genlmsg_end(msg, hdr);
 	return 0;
 
- nla_put_failure:
+nla_put_failure:
 	genlmsg_cancel(msg, hdr);
 	return -EMSGSIZE;
 }
 
 /**
- * batadv_netlink_dump_hardifs() - Dump all hard interface into a messages
+ * batadv_netlink_notify_hardif() - send hardif attributes to listener
+ * @bat_priv: the bat priv with all the soft interface information
+ * @hard_iface: hard interface which was modified
+ *
+ * Return: 0 on success, < 0 on error
+ */
+static int batadv_netlink_notify_hardif(struct batadv_priv *bat_priv,
+					struct batadv_hard_iface *hard_iface)
+{
+	struct sk_buff *msg;
+	int ret;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	ret = batadv_netlink_hardif_fill(msg, bat_priv, hard_iface,
+					 BATADV_CMD_SET_HARDIF, 0, 0, 0, NULL);
+	if (ret < 0) {
+		nlmsg_free(msg);
+		return ret;
+	}
+
+	genlmsg_multicast_netns(&batadv_netlink_family,
+				dev_net(bat_priv->soft_iface), msg, 0,
+				BATADV_NL_MCGRP_CONFIG, GFP_KERNEL);
+
+	return 0;
+}
+
+/**
+ * batadv_netlink_get_hardif() - Get hardif attributes
+ * @skb: Netlink message with request data
+ * @info: receiver information
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+static int batadv_netlink_get_hardif(struct sk_buff *skb,
+				     struct genl_info *info)
+{
+	struct batadv_hard_iface *hard_iface = info->user_ptr[1];
+	struct batadv_priv *bat_priv = info->user_ptr[0];
+	struct sk_buff *msg;
+	int ret;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	ret = batadv_netlink_hardif_fill(msg, bat_priv, hard_iface,
+					 BATADV_CMD_GET_HARDIF,
+					 info->snd_portid, info->snd_seq, 0,
+					 NULL);
+	if (ret < 0) {
+		nlmsg_free(msg);
+		return ret;
+	}
+
+	ret = genlmsg_reply(msg, info);
+
+	return ret;
+}
+
+/**
+ * batadv_netlink_set_hardif() - Set hardif attributes
+ * @skb: Netlink message with request data
+ * @info: receiver information
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+static int batadv_netlink_set_hardif(struct sk_buff *skb,
+				     struct genl_info *info)
+{
+	struct batadv_hard_iface *hard_iface = info->user_ptr[1];
+	struct batadv_priv *bat_priv = info->user_ptr[0];
+
+	batadv_netlink_notify_hardif(bat_priv, hard_iface);
+
+	return 0;
+}
+
+/**
+ * batadv_netlink_dump_hardif() - Dump all hard interface into a messages
  * @msg: Netlink message to dump into
  * @cb: Parameters from query
  *
  * Return: error code, or length of reply message on success
  */
 static int
-batadv_netlink_dump_hardifs(struct sk_buff *msg, struct netlink_callback *cb)
+batadv_netlink_dump_hardif(struct sk_buff *msg, struct netlink_callback *cb)
 {
 	struct net *net = sock_net(cb->skb->sk);
 	struct net_device *soft_iface;
 	struct batadv_hard_iface *hard_iface;
+	struct batadv_priv *bat_priv;
 	int ifindex;
 	int portid = NETLINK_CB(cb->skb).portid;
 	int skip = cb->args[0];
@@ -523,6 +623,8 @@ batadv_netlink_dump_hardifs(struct sk_buff *msg, struct netlink_callback *cb)
 		return -ENODEV;
 	}
 
+	bat_priv = netdev_priv(soft_iface);
+
 	rtnl_lock();
 	cb->seq = batadv_hardif_generation << 1 | 1;
 
@@ -533,8 +635,10 @@ batadv_netlink_dump_hardifs(struct sk_buff *msg, struct netlink_callback *cb)
 		if (i++ < skip)
 			continue;
 
-		if (batadv_netlink_dump_hardif_entry(msg, portid, cb,
-						     hard_iface)) {
+		if (batadv_netlink_hardif_fill(msg, bat_priv, hard_iface,
+					       BATADV_CMD_GET_HARDIF,
+					       portid, cb->nlh->nlmsg_seq,
+					       NLM_F_MULTI, cb)) {
 			i--;
 			break;
 		}
@@ -583,6 +687,52 @@ err_put_softif:
 	return ERR_PTR(-EINVAL);
 }
 
+/**
+ * batadv_get_hardif_from_info() - Retrieve hardif from genl attributes
+ * @bat_priv: the bat priv with all the soft interface information
+ * @net: the applicable net namespace
+ * @info: receiver information
+ *
+ * Return: Pointer to hard interface (with increased refcnt) on success, error
+ *  pointer on error
+ */
+static struct batadv_hard_iface *
+batadv_get_hardif_from_info(struct batadv_priv *bat_priv, struct net *net,
+			    struct genl_info *info)
+{
+	struct batadv_hard_iface *hard_iface;
+	struct net_device *hard_dev;
+	unsigned int hardif_index;
+
+	if (!info->attrs[BATADV_ATTR_HARD_IFINDEX])
+		return ERR_PTR(-EINVAL);
+
+	hardif_index = nla_get_u32(info->attrs[BATADV_ATTR_HARD_IFINDEX]);
+
+	hard_dev = dev_get_by_index(net, hardif_index);
+	if (!hard_dev)
+		return ERR_PTR(-ENODEV);
+
+	hard_iface = batadv_hardif_get_by_netdev(hard_dev);
+	if (!hard_iface)
+		goto err_put_harddev;
+
+	if (hard_iface->soft_iface != bat_priv->soft_iface)
+		goto err_put_hardif;
+
+	/* hard_dev is referenced by hard_iface and not needed here */
+	dev_put(hard_dev);
+
+	return hard_iface;
+
+err_put_hardif:
+	batadv_hardif_put(hard_iface);
+err_put_harddev:
+	dev_put(hard_dev);
+
+	return ERR_PTR(-EINVAL);
+}
+
 /**
  * batadv_pre_doit() - Prepare batman-adv genl doit request
  * @ops: requested netlink operation
@@ -595,8 +745,21 @@ static int batadv_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
 			   struct genl_info *info)
 {
 	struct net *net = genl_info_net(info);
+	struct batadv_hard_iface *hard_iface;
+	struct batadv_priv *bat_priv = NULL;
 	struct net_device *soft_iface;
-	struct batadv_priv *bat_priv;
+	u8 user_ptr1_flags;
+	u8 mesh_dep_flags;
+	int ret;
+
+	user_ptr1_flags = BATADV_FLAG_NEED_HARDIF;
+	if (WARN_ON(hweight8(ops->internal_flags & user_ptr1_flags) > 1))
+		return -EINVAL;
+
+	mesh_dep_flags = BATADV_FLAG_NEED_HARDIF;
+	if (WARN_ON((ops->internal_flags & mesh_dep_flags) &&
+		    (~ops->internal_flags & BATADV_FLAG_NEED_MESH)))
+		return -EINVAL;
 
 	if (ops->internal_flags & BATADV_FLAG_NEED_MESH) {
 		soft_iface = batadv_get_softif_from_info(net, info);
@@ -607,7 +770,23 @@ static int batadv_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
 		info->user_ptr[0] = bat_priv;
 	}
 
+	if (ops->internal_flags & BATADV_FLAG_NEED_HARDIF) {
+		hard_iface = batadv_get_hardif_from_info(bat_priv, net, info);
+		if (IS_ERR(hard_iface)) {
+			ret = PTR_ERR(hard_iface);
+			goto err_put_softif;
+		}
+
+		info->user_ptr[1] = hard_iface;
+	}
+
 	return 0;
+
+err_put_softif:
+	if (bat_priv)
+		dev_put(bat_priv->soft_iface);
+
+	return ret;
 }
 
 /**
@@ -619,8 +798,16 @@ static int batadv_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
 static void batadv_post_doit(const struct genl_ops *ops, struct sk_buff *skb,
 			     struct genl_info *info)
 {
+	struct batadv_hard_iface *hard_iface;
 	struct batadv_priv *bat_priv;
 
+	if (ops->internal_flags & BATADV_FLAG_NEED_HARDIF &&
+	    info->user_ptr[1]) {
+		hard_iface = info->user_ptr[1];
+
+		batadv_hardif_put(hard_iface);
+	}
+
 	if (ops->internal_flags & BATADV_FLAG_NEED_MESH && info->user_ptr[0]) {
 		bat_priv = info->user_ptr[0];
 		dev_put(bat_priv->soft_iface);
@@ -656,10 +843,13 @@ static const struct genl_ops batadv_netlink_ops[] = {
 		.dumpit = batadv_algo_dump,
 	},
 	{
-		.cmd = BATADV_CMD_GET_HARDIFS,
-		.flags = GENL_ADMIN_PERM,
+		.cmd = BATADV_CMD_GET_HARDIF,
+		/* can be retrieved by unprivileged users */
 		.policy = batadv_netlink_policy,
-		.dumpit = batadv_netlink_dump_hardifs,
+		.dumpit = batadv_netlink_dump_hardif,
+		.doit = batadv_netlink_get_hardif,
+		.internal_flags = BATADV_FLAG_NEED_MESH |
+				  BATADV_FLAG_NEED_HARDIF,
 	},
 	{
 		.cmd = BATADV_CMD_GET_TRANSTABLE_LOCAL,
@@ -722,6 +912,14 @@ static const struct genl_ops batadv_netlink_ops[] = {
 		.doit = batadv_netlink_set_mesh,
 		.internal_flags = BATADV_FLAG_NEED_MESH,
 	},
+	{
+		.cmd = BATADV_CMD_SET_HARDIF,
+		.flags = GENL_ADMIN_PERM,
+		.policy = batadv_netlink_policy,
+		.doit = batadv_netlink_set_hardif,
+		.internal_flags = BATADV_FLAG_NEED_MESH |
+				  BATADV_FLAG_NEED_HARDIF,
+	},
 };
 
 struct genl_family batadv_netlink_family __ro_after_init = {
-- 
cgit v1.2.3-59-g8ed1b


From 49e7e37cd98122126e8da58df2fe2261c6e83df2 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Fri, 23 Nov 2018 12:41:08 +0100
Subject: batman-adv: Prepare framework for vlan genl config

The batman-adv configuration interface was implemented solely using sysfs.
This approach was condemned by non-batadv developers as "huge mistake".
Instead a netlink/genl based implementation was suggested.

Beside the mesh/soft-interface specific configuration, the VLANs on top of
the mesh/soft-interface have configuration settings. The genl interface
reflects this by allowing to get/set it using the vlan specific commands
BATADV_CMD_GET_VLAN/BATADV_CMD_SET_VLAN.

The set command BATADV_CMD_SET_MESH will also notify interested userspace
listeners of the "config" mcast group using the BATADV_CMD_SET_VLAN command
message type that settings might have been changed and what the current
values are.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 include/uapi/linux/batman_adv.h |  17 ++++
 net/batman-adv/netlink.c        | 191 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 206 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index 62456a087ef6..10f3cc34fe16 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -345,6 +345,11 @@ enum batadv_nl_attrs {
 	 */
 	BATADV_ATTR_MCAST_FLAGS_PRIV,
 
+	/**
+	 * @BATADV_ATTR_VLANID: VLAN id on top of soft interface
+	 */
+	BATADV_ATTR_VLANID,
+
 	/* add attributes above here, update the policy in netlink.c */
 
 	/**
@@ -465,6 +470,18 @@ enum batadv_nl_commands {
 	 */
 	BATADV_CMD_SET_HARDIF,
 
+	/**
+	 * @BATADV_CMD_GET_VLAN: Get attributes from a VLAN of the
+	 *  current softif
+	 */
+	BATADV_CMD_GET_VLAN,
+
+	/**
+	 * @BATADV_CMD_SET_VLAN: Set attributes for VLAN of the
+	 *  current softif
+	 */
+	BATADV_CMD_SET_VLAN,
+
 	/* add new commands above here */
 
 	/**
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index a184e4225fb5..d40913bb9031 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -30,6 +30,7 @@
 #include <linux/genetlink.h>
 #include <linux/gfp.h>
 #include <linux/if_ether.h>
+#include <linux/if_vlan.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/list.h>
@@ -84,6 +85,13 @@ enum batadv_genl_ops_flags {
 	 *  saved in info->user_ptr[1]
 	 */
 	BATADV_FLAG_NEED_HARDIF = BIT(1),
+
+	/**
+	 * @BATADV_FLAG_NEED_VLAN: request requires valid vlan in
+	 *  attribute BATADV_ATTR_VLANID and expects a pointer to it to be
+	 *  saved in info->user_ptr[1]
+	 */
+	BATADV_FLAG_NEED_VLAN = BIT(2),
 };
 
 static const struct genl_multicast_group batadv_netlink_mcgrps[] = {
@@ -130,6 +138,7 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
 	[BATADV_ATTR_DAT_CACHE_VID]		= { .type = NLA_U16 },
 	[BATADV_ATTR_MCAST_FLAGS]		= { .type = NLA_U32 },
 	[BATADV_ATTR_MCAST_FLAGS_PRIV]		= { .type = NLA_U32 },
+	[BATADV_ATTR_VLANID]			= { .type = NLA_U16 },
 };
 
 /**
@@ -653,6 +662,123 @@ batadv_netlink_dump_hardif(struct sk_buff *msg, struct netlink_callback *cb)
 	return msg->len;
 }
 
+/**
+ * batadv_netlink_vlan_fill() - Fill message with vlan attributes
+ * @msg: Netlink message to dump into
+ * @bat_priv: the bat priv with all the soft interface information
+ * @vlan: vlan which was modified
+ * @cmd: type of message to generate
+ * @portid: Port making netlink request
+ * @seq: sequence number for message
+ * @flags: Additional flags for message
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+static int batadv_netlink_vlan_fill(struct sk_buff *msg,
+				    struct batadv_priv *bat_priv,
+				    struct batadv_softif_vlan *vlan,
+				    enum batadv_nl_commands cmd,
+				    u32 portid, u32 seq, int flags)
+{
+	void *hdr;
+
+	hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, flags, cmd);
+	if (!hdr)
+		return -ENOBUFS;
+
+	if (nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX,
+			bat_priv->soft_iface->ifindex))
+		goto nla_put_failure;
+
+	if (nla_put_u32(msg, BATADV_ATTR_VLANID, vlan->vid & VLAN_VID_MASK))
+		goto nla_put_failure;
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+/**
+ * batadv_netlink_notify_vlan() - send vlan attributes to listener
+ * @bat_priv: the bat priv with all the soft interface information
+ * @vlan: vlan which was modified
+ *
+ * Return: 0 on success, < 0 on error
+ */
+static int batadv_netlink_notify_vlan(struct batadv_priv *bat_priv,
+				      struct batadv_softif_vlan *vlan)
+{
+	struct sk_buff *msg;
+	int ret;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	ret = batadv_netlink_vlan_fill(msg, bat_priv, vlan,
+				       BATADV_CMD_SET_VLAN, 0, 0, 0);
+	if (ret < 0) {
+		nlmsg_free(msg);
+		return ret;
+	}
+
+	genlmsg_multicast_netns(&batadv_netlink_family,
+				dev_net(bat_priv->soft_iface), msg, 0,
+				BATADV_NL_MCGRP_CONFIG, GFP_KERNEL);
+
+	return 0;
+}
+
+/**
+ * batadv_netlink_get_vlan() - Get vlan attributes
+ * @skb: Netlink message with request data
+ * @info: receiver information
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+static int batadv_netlink_get_vlan(struct sk_buff *skb, struct genl_info *info)
+{
+	struct batadv_softif_vlan *vlan = info->user_ptr[1];
+	struct batadv_priv *bat_priv = info->user_ptr[0];
+	struct sk_buff *msg;
+	int ret;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	ret = batadv_netlink_vlan_fill(msg, bat_priv, vlan, BATADV_CMD_GET_VLAN,
+				       info->snd_portid, info->snd_seq, 0);
+	if (ret < 0) {
+		nlmsg_free(msg);
+		return ret;
+	}
+
+	ret = genlmsg_reply(msg, info);
+
+	return ret;
+}
+
+/**
+ * batadv_netlink_set_vlan() - Get vlan attributes
+ * @skb: Netlink message with request data
+ * @info: receiver information
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+static int batadv_netlink_set_vlan(struct sk_buff *skb, struct genl_info *info)
+{
+	struct batadv_softif_vlan *vlan = info->user_ptr[1];
+	struct batadv_priv *bat_priv = info->user_ptr[0];
+
+	batadv_netlink_notify_vlan(bat_priv, vlan);
+
+	return 0;
+}
+
 /**
  * batadv_get_softif_from_info() - Retrieve soft interface from genl attributes
  * @net: the applicable net namespace
@@ -733,6 +859,34 @@ err_put_harddev:
 	return ERR_PTR(-EINVAL);
 }
 
+/**
+ * batadv_get_vlan_from_info() - Retrieve vlan from genl attributes
+ * @bat_priv: the bat priv with all the soft interface information
+ * @net: the applicable net namespace
+ * @info: receiver information
+ *
+ * Return: Pointer to vlan on success (with increased refcnt), error pointer
+ *  on error
+ */
+static struct batadv_softif_vlan *
+batadv_get_vlan_from_info(struct batadv_priv *bat_priv, struct net *net,
+			  struct genl_info *info)
+{
+	struct batadv_softif_vlan *vlan;
+	u16 vid;
+
+	if (!info->attrs[BATADV_ATTR_VLANID])
+		return ERR_PTR(-EINVAL);
+
+	vid = nla_get_u16(info->attrs[BATADV_ATTR_VLANID]);
+
+	vlan = batadv_softif_vlan_get(bat_priv, vid | BATADV_VLAN_HAS_TAG);
+	if (!vlan)
+		return ERR_PTR(-ENOENT);
+
+	return vlan;
+}
+
 /**
  * batadv_pre_doit() - Prepare batman-adv genl doit request
  * @ops: requested netlink operation
@@ -747,16 +901,17 @@ static int batadv_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
 	struct net *net = genl_info_net(info);
 	struct batadv_hard_iface *hard_iface;
 	struct batadv_priv *bat_priv = NULL;
+	struct batadv_softif_vlan *vlan;
 	struct net_device *soft_iface;
 	u8 user_ptr1_flags;
 	u8 mesh_dep_flags;
 	int ret;
 
-	user_ptr1_flags = BATADV_FLAG_NEED_HARDIF;
+	user_ptr1_flags = BATADV_FLAG_NEED_HARDIF | BATADV_FLAG_NEED_VLAN;
 	if (WARN_ON(hweight8(ops->internal_flags & user_ptr1_flags) > 1))
 		return -EINVAL;
 
-	mesh_dep_flags = BATADV_FLAG_NEED_HARDIF;
+	mesh_dep_flags = BATADV_FLAG_NEED_HARDIF | BATADV_FLAG_NEED_VLAN;
 	if (WARN_ON((ops->internal_flags & mesh_dep_flags) &&
 		    (~ops->internal_flags & BATADV_FLAG_NEED_MESH)))
 		return -EINVAL;
@@ -780,6 +935,16 @@ static int batadv_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
 		info->user_ptr[1] = hard_iface;
 	}
 
+	if (ops->internal_flags & BATADV_FLAG_NEED_VLAN) {
+		vlan = batadv_get_vlan_from_info(bat_priv, net, info);
+		if (IS_ERR(vlan)) {
+			ret = PTR_ERR(vlan);
+			goto err_put_softif;
+		}
+
+		info->user_ptr[1] = vlan;
+	}
+
 	return 0;
 
 err_put_softif:
@@ -799,6 +964,7 @@ static void batadv_post_doit(const struct genl_ops *ops, struct sk_buff *skb,
 			     struct genl_info *info)
 {
 	struct batadv_hard_iface *hard_iface;
+	struct batadv_softif_vlan *vlan;
 	struct batadv_priv *bat_priv;
 
 	if (ops->internal_flags & BATADV_FLAG_NEED_HARDIF &&
@@ -808,6 +974,11 @@ static void batadv_post_doit(const struct genl_ops *ops, struct sk_buff *skb,
 		batadv_hardif_put(hard_iface);
 	}
 
+	if (ops->internal_flags & BATADV_FLAG_NEED_VLAN && info->user_ptr[1]) {
+		vlan = info->user_ptr[1];
+		batadv_softif_vlan_put(vlan);
+	}
+
 	if (ops->internal_flags & BATADV_FLAG_NEED_MESH && info->user_ptr[0]) {
 		bat_priv = info->user_ptr[0];
 		dev_put(bat_priv->soft_iface);
@@ -920,6 +1091,22 @@ static const struct genl_ops batadv_netlink_ops[] = {
 		.internal_flags = BATADV_FLAG_NEED_MESH |
 				  BATADV_FLAG_NEED_HARDIF,
 	},
+	{
+		.cmd = BATADV_CMD_GET_VLAN,
+		/* can be retrieved by unprivileged users */
+		.policy = batadv_netlink_policy,
+		.doit = batadv_netlink_get_vlan,
+		.internal_flags = BATADV_FLAG_NEED_MESH |
+				  BATADV_FLAG_NEED_VLAN,
+	},
+	{
+		.cmd = BATADV_CMD_SET_VLAN,
+		.flags = GENL_ADMIN_PERM,
+		.policy = batadv_netlink_policy,
+		.doit = batadv_netlink_set_vlan,
+		.internal_flags = BATADV_FLAG_NEED_MESH |
+				  BATADV_FLAG_NEED_VLAN,
+	},
 };
 
 struct genl_family batadv_netlink_family __ro_after_init = {
-- 
cgit v1.2.3-59-g8ed1b


From 9ab4cee5ced970019a5d0f3a43cf85671ca7b38f Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Fri, 23 Nov 2018 12:46:14 +0100
Subject: batman-adv: Add aggregated_ogms mesh genl configuration

The mesh interface can delay OGM messages to aggregate different ogms
together in a single OGM packet.

The BATADV_CMD_SET_MESH/BATADV_CMD_GET_MESH commands allow to set/get the
configuration of this feature using the BATADV_ATTR_AGGREGATED_OGMS_ENABLED
attribute. Setting the u8 to zero will disable this feature and setting it
to something else is enabling this feature.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 include/uapi/linux/batman_adv.h |  6 ++++++
 net/batman-adv/netlink.c        | 12 ++++++++++++
 2 files changed, 18 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index 10f3cc34fe16..f8941e80d6b4 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -350,6 +350,12 @@ enum batadv_nl_attrs {
 	 */
 	BATADV_ATTR_VLANID,
 
+	/**
+	 * @BATADV_ATTR_AGGREGATED_OGMS_ENABLED: whether the batman protocol
+	 *  messages of the mesh interface shall be aggregated or not.
+	 */
+	BATADV_ATTR_AGGREGATED_OGMS_ENABLED,
+
 	/* add attributes above here, update the policy in netlink.c */
 
 	/**
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index d40913bb9031..82665dc2eb2b 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -139,6 +139,7 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
 	[BATADV_ATTR_MCAST_FLAGS]		= { .type = NLA_U32 },
 	[BATADV_ATTR_MCAST_FLAGS_PRIV]		= { .type = NLA_U32 },
 	[BATADV_ATTR_VLANID]			= { .type = NLA_U16 },
+	[BATADV_ATTR_AGGREGATED_OGMS_ENABLED]	= { .type = NLA_U8 },
 };
 
 /**
@@ -214,6 +215,10 @@ static int batadv_netlink_mesh_fill(struct sk_buff *msg,
 			goto nla_put_failure;
 	}
 
+	if (nla_put_u8(msg, BATADV_ATTR_AGGREGATED_OGMS_ENABLED,
+		       !!atomic_read(&bat_priv->aggregated_ogms)))
+		goto nla_put_failure;
+
 	if (primary_if)
 		batadv_hardif_put(primary_if);
 
@@ -296,6 +301,13 @@ static int batadv_netlink_get_mesh(struct sk_buff *skb, struct genl_info *info)
 static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info)
 {
 	struct batadv_priv *bat_priv = info->user_ptr[0];
+	struct nlattr *attr;
+
+	if (info->attrs[BATADV_ATTR_AGGREGATED_OGMS_ENABLED]) {
+		attr = info->attrs[BATADV_ATTR_AGGREGATED_OGMS_ENABLED];
+
+		atomic_set(&bat_priv->aggregated_ogms, !!nla_get_u8(attr));
+	}
 
 	batadv_netlink_notify_mesh(bat_priv);
 
-- 
cgit v1.2.3-59-g8ed1b


From e43d16b87dc2cad18799cfd1142f4acae4135ea4 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Fri, 23 Nov 2018 12:51:55 +0100
Subject: batman-adv: Add ap_isolation mesh/vlan genl configuration

The mesh interface can drop messages between clients to implement a
mesh-wide AP isolation.

The BATADV_CMD_SET_MESH/BATADV_CMD_GET_MESH and
BATADV_CMD_SET_VLAN/BATADV_CMD_GET_VLAN commands allow to set/get the
configuration of this feature using the BATADV_ATTR_AP_ISOLATION_ENABLED
attribute. Setting the u8 to zero will disable this feature and setting it
to something else is enabling this feature.

This feature also requires that skbuff which should be handled as isolated
are marked. The BATADV_CMD_SET_MESH/BATADV_CMD_GET_MESH commands allow to
set/get the mark/mask using the u32 attributes BATADV_ATTR_ISOLATION_MARK
and BATADV_ATTR_ISOLATION_MASK.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 include/uapi/linux/batman_adv.h | 19 +++++++++
 net/batman-adv/netlink.c        | 89 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 108 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index f8941e80d6b4..a4dadafe08dd 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -356,6 +356,25 @@ enum batadv_nl_attrs {
 	 */
 	BATADV_ATTR_AGGREGATED_OGMS_ENABLED,
 
+	/**
+	 * @BATADV_ATTR_AP_ISOLATION_ENABLED: whether the data traffic going
+	 *  from a wireless client to another wireless client will be silently
+	 *  dropped.
+	 */
+	BATADV_ATTR_AP_ISOLATION_ENABLED,
+
+	/**
+	 * @BATADV_ATTR_ISOLATION_MARK: the isolation mark which is used to
+	 *  classify clients as "isolated" by the Extended Isolation feature.
+	 */
+	BATADV_ATTR_ISOLATION_MARK,
+
+	/**
+	 * @BATADV_ATTR_ISOLATION_MASK: the isolation (bit)mask which is used to
+	 *  classify clients as "isolated" by the Extended Isolation feature.
+	 */
+	BATADV_ATTR_ISOLATION_MASK,
+
 	/* add attributes above here, update the policy in netlink.c */
 
 	/**
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 82665dc2eb2b..fc80f003ed6e 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -140,6 +140,9 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
 	[BATADV_ATTR_MCAST_FLAGS_PRIV]		= { .type = NLA_U32 },
 	[BATADV_ATTR_VLANID]			= { .type = NLA_U16 },
 	[BATADV_ATTR_AGGREGATED_OGMS_ENABLED]	= { .type = NLA_U8 },
+	[BATADV_ATTR_AP_ISOLATION_ENABLED]	= { .type = NLA_U8 },
+	[BATADV_ATTR_ISOLATION_MARK]		= { .type = NLA_U32 },
+	[BATADV_ATTR_ISOLATION_MASK]		= { .type = NLA_U32 },
 };
 
 /**
@@ -157,6 +160,52 @@ batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype)
 	return attr ? nla_get_u32(attr) : 0;
 }
 
+/**
+ * batadv_netlink_mesh_fill_ap_isolation() - Add ap_isolation softif attribute
+ * @msg: Netlink message to dump into
+ * @bat_priv: the bat priv with all the soft interface information
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+static int batadv_netlink_mesh_fill_ap_isolation(struct sk_buff *msg,
+						 struct batadv_priv *bat_priv)
+{
+	struct batadv_softif_vlan *vlan;
+	u8 ap_isolation;
+
+	vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS);
+	if (!vlan)
+		return 0;
+
+	ap_isolation = atomic_read(&vlan->ap_isolation);
+	batadv_softif_vlan_put(vlan);
+
+	return nla_put_u8(msg, BATADV_ATTR_AP_ISOLATION_ENABLED,
+			  !!ap_isolation);
+}
+
+/**
+ * batadv_option_set_ap_isolation() - Set ap_isolation from genl msg
+ * @attr: parsed BATADV_ATTR_AP_ISOLATION_ENABLED attribute
+ * @bat_priv: the bat priv with all the soft interface information
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+static int batadv_netlink_set_mesh_ap_isolation(struct nlattr *attr,
+						struct batadv_priv *bat_priv)
+{
+	struct batadv_softif_vlan *vlan;
+
+	vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS);
+	if (!vlan)
+		return -ENOENT;
+
+	atomic_set(&vlan->ap_isolation, !!nla_get_u8(attr));
+	batadv_softif_vlan_put(vlan);
+
+	return 0;
+}
+
 /**
  * batadv_netlink_mesh_fill() - Fill message with mesh attributes
  * @msg: Netlink message to dump into
@@ -219,6 +268,17 @@ static int batadv_netlink_mesh_fill(struct sk_buff *msg,
 		       !!atomic_read(&bat_priv->aggregated_ogms)))
 		goto nla_put_failure;
 
+	if (batadv_netlink_mesh_fill_ap_isolation(msg, bat_priv))
+		goto nla_put_failure;
+
+	if (nla_put_u32(msg, BATADV_ATTR_ISOLATION_MARK,
+			bat_priv->isolation_mark))
+		goto nla_put_failure;
+
+	if (nla_put_u32(msg, BATADV_ATTR_ISOLATION_MASK,
+			bat_priv->isolation_mark_mask))
+		goto nla_put_failure;
+
 	if (primary_if)
 		batadv_hardif_put(primary_if);
 
@@ -309,6 +369,24 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info)
 		atomic_set(&bat_priv->aggregated_ogms, !!nla_get_u8(attr));
 	}
 
+	if (info->attrs[BATADV_ATTR_AP_ISOLATION_ENABLED]) {
+		attr = info->attrs[BATADV_ATTR_AP_ISOLATION_ENABLED];
+
+		batadv_netlink_set_mesh_ap_isolation(attr, bat_priv);
+	}
+
+	if (info->attrs[BATADV_ATTR_ISOLATION_MARK]) {
+		attr = info->attrs[BATADV_ATTR_ISOLATION_MARK];
+
+		bat_priv->isolation_mark = nla_get_u32(attr);
+	}
+
+	if (info->attrs[BATADV_ATTR_ISOLATION_MASK]) {
+		attr = info->attrs[BATADV_ATTR_ISOLATION_MASK];
+
+		bat_priv->isolation_mark_mask = nla_get_u32(attr);
+	}
+
 	batadv_netlink_notify_mesh(bat_priv);
 
 	return 0;
@@ -705,6 +783,10 @@ static int batadv_netlink_vlan_fill(struct sk_buff *msg,
 	if (nla_put_u32(msg, BATADV_ATTR_VLANID, vlan->vid & VLAN_VID_MASK))
 		goto nla_put_failure;
 
+	if (nla_put_u8(msg, BATADV_ATTR_AP_ISOLATION_ENABLED,
+		       !!atomic_read(&vlan->ap_isolation)))
+		goto nla_put_failure;
+
 	genlmsg_end(msg, hdr);
 	return 0;
 
@@ -785,6 +867,13 @@ static int batadv_netlink_set_vlan(struct sk_buff *skb, struct genl_info *info)
 {
 	struct batadv_softif_vlan *vlan = info->user_ptr[1];
 	struct batadv_priv *bat_priv = info->user_ptr[0];
+	struct nlattr *attr;
+
+	if (info->attrs[BATADV_ATTR_AP_ISOLATION_ENABLED]) {
+		attr = info->attrs[BATADV_ATTR_AP_ISOLATION_ENABLED];
+
+		atomic_set(&vlan->ap_isolation, !!nla_get_u8(attr));
+	}
 
 	batadv_netlink_notify_vlan(bat_priv, vlan);
 
-- 
cgit v1.2.3-59-g8ed1b


From d7e52506b680826d6ff7ce73e6a90a3b9defc741 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Fri, 23 Nov 2018 12:55:44 +0100
Subject: batman-adv: Add bonding mesh genl configuration

The mesh interface can use multiple slave/hard-interface ports at the same
time to transport the traffic to other nodes.

The BATADV_CMD_SET_MESH/BATADV_CMD_GET_MESH commands allow to set/get the
configuration of this feature using the BATADV_ATTR_BONDING_ENABLED
attribute. Setting the u8 to zero will disable this feature and setting it
to something else is enabling this feature.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 include/uapi/linux/batman_adv.h |  6 ++++++
 net/batman-adv/netlink.c        | 11 +++++++++++
 2 files changed, 17 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index a4dadafe08dd..f74ff261ec8f 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -375,6 +375,12 @@ enum batadv_nl_attrs {
 	 */
 	BATADV_ATTR_ISOLATION_MASK,
 
+	/**
+	 * @BATADV_ATTR_BONDING_ENABLED: whether the data traffic going through
+	 *  the mesh will be sent using multiple interfaces at the same time.
+	 */
+	BATADV_ATTR_BONDING_ENABLED,
+
 	/* add attributes above here, update the policy in netlink.c */
 
 	/**
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index fc80f003ed6e..310a2c339fd1 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -143,6 +143,7 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
 	[BATADV_ATTR_AP_ISOLATION_ENABLED]	= { .type = NLA_U8 },
 	[BATADV_ATTR_ISOLATION_MARK]		= { .type = NLA_U32 },
 	[BATADV_ATTR_ISOLATION_MASK]		= { .type = NLA_U32 },
+	[BATADV_ATTR_BONDING_ENABLED]		= { .type = NLA_U8 },
 };
 
 /**
@@ -279,6 +280,10 @@ static int batadv_netlink_mesh_fill(struct sk_buff *msg,
 			bat_priv->isolation_mark_mask))
 		goto nla_put_failure;
 
+	if (nla_put_u8(msg, BATADV_ATTR_BONDING_ENABLED,
+		       !!atomic_read(&bat_priv->bonding)))
+		goto nla_put_failure;
+
 	if (primary_if)
 		batadv_hardif_put(primary_if);
 
@@ -387,6 +392,12 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info)
 		bat_priv->isolation_mark_mask = nla_get_u32(attr);
 	}
 
+	if (info->attrs[BATADV_ATTR_BONDING_ENABLED]) {
+		attr = info->attrs[BATADV_ATTR_BONDING_ENABLED];
+
+		atomic_set(&bat_priv->bonding, !!nla_get_u8(attr));
+	}
+
 	batadv_netlink_notify_mesh(bat_priv);
 
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From 43ff6105a527aaa1a8e00163b0b0aedbbc0c4522 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Fri, 23 Nov 2018 13:03:39 +0100
Subject: batman-adv: Add bridge_loop_avoidance mesh genl configuration

The mesh interface can try to detect loops in the same mesh caused by
(indirectly) bridged mesh/soft-interfaces of different nodes. Some of the
loops can also be resolved without breaking the mesh.

The BATADV_CMD_SET_MESH/BATADV_CMD_GET_MESH commands allow to set/get the
configuration of this feature using the
BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED attribute. Setting the u8 to zero
will disable this feature and setting it to something else is enabling this
feature.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 include/uapi/linux/batman_adv.h |  7 +++++++
 net/batman-adv/netlink.c        | 17 +++++++++++++++++
 2 files changed, 24 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index f74ff261ec8f..3cb35c661056 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -381,6 +381,13 @@ enum batadv_nl_attrs {
 	 */
 	BATADV_ATTR_BONDING_ENABLED,
 
+	/**
+	 * @BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED: whether the bridge loop
+	 *  avoidance feature is enabled. This feature detects and avoids loops
+	 *  between the mesh and devices bridged with the soft interface
+	 */
+	BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED,
+
 	/* add attributes above here, update the policy in netlink.c */
 
 	/**
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 310a2c339fd1..40c940da9498 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -144,6 +144,7 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
 	[BATADV_ATTR_ISOLATION_MARK]		= { .type = NLA_U32 },
 	[BATADV_ATTR_ISOLATION_MASK]		= { .type = NLA_U32 },
 	[BATADV_ATTR_BONDING_ENABLED]		= { .type = NLA_U8 },
+	[BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED]	= { .type = NLA_U8 },
 };
 
 /**
@@ -284,6 +285,12 @@ static int batadv_netlink_mesh_fill(struct sk_buff *msg,
 		       !!atomic_read(&bat_priv->bonding)))
 		goto nla_put_failure;
 
+#ifdef CONFIG_BATMAN_ADV_BLA
+	if (nla_put_u8(msg, BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED,
+		       !!atomic_read(&bat_priv->bridge_loop_avoidance)))
+		goto nla_put_failure;
+#endif /* CONFIG_BATMAN_ADV_BLA */
+
 	if (primary_if)
 		batadv_hardif_put(primary_if);
 
@@ -398,6 +405,16 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info)
 		atomic_set(&bat_priv->bonding, !!nla_get_u8(attr));
 	}
 
+#ifdef CONFIG_BATMAN_ADV_BLA
+	if (info->attrs[BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED]) {
+		attr = info->attrs[BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED];
+
+		atomic_set(&bat_priv->bridge_loop_avoidance,
+			   !!nla_get_u8(attr));
+		batadv_bla_status_update(bat_priv->soft_iface);
+	}
+#endif /* CONFIG_BATMAN_ADV_BLA */
+
 	batadv_netlink_notify_mesh(bat_priv);
 
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From a1c8de80329609ba68ff860074070efb1e14ade4 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Fri, 23 Nov 2018 13:06:42 +0100
Subject: batman-adv: Add distributed_arp_table mesh genl configuration

The mesh interface can use a distributed hash table to answer ARP requests
without flooding the request through the whole mesh.

The BATADV_CMD_SET_MESH/BATADV_CMD_GET_MESH commands allow to set/get the
configuration of this feature using the
BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED attribute. Setting the u8 to zero
will disable this feature and setting it to something else is enabling this
feature.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 include/uapi/linux/batman_adv.h |  8 ++++++++
 net/batman-adv/netlink.c        | 17 +++++++++++++++++
 2 files changed, 25 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index 3cb35c661056..f303a1496476 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -388,6 +388,14 @@ enum batadv_nl_attrs {
 	 */
 	BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED,
 
+	/**
+	 * @BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED: whether the distributed
+	 *  arp table feature is enabled. This feature uses a distributed hash
+	 *  table to answer ARP requests without flooding the request through
+	 *  the whole mesh.
+	 */
+	BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED,
+
 	/* add attributes above here, update the policy in netlink.c */
 
 	/**
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 40c940da9498..3028c2a5c782 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -145,6 +145,7 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
 	[BATADV_ATTR_ISOLATION_MASK]		= { .type = NLA_U32 },
 	[BATADV_ATTR_BONDING_ENABLED]		= { .type = NLA_U8 },
 	[BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED]	= { .type = NLA_U8 },
+	[BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED]	= { .type = NLA_U8 },
 };
 
 /**
@@ -291,6 +292,12 @@ static int batadv_netlink_mesh_fill(struct sk_buff *msg,
 		goto nla_put_failure;
 #endif /* CONFIG_BATMAN_ADV_BLA */
 
+#ifdef CONFIG_BATMAN_ADV_DAT
+	if (nla_put_u8(msg, BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED,
+		       !!atomic_read(&bat_priv->distributed_arp_table)))
+		goto nla_put_failure;
+#endif /* CONFIG_BATMAN_ADV_DAT */
+
 	if (primary_if)
 		batadv_hardif_put(primary_if);
 
@@ -415,6 +422,16 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info)
 	}
 #endif /* CONFIG_BATMAN_ADV_BLA */
 
+#ifdef CONFIG_BATMAN_ADV_DAT
+	if (info->attrs[BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED]) {
+		attr = info->attrs[BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED];
+
+		atomic_set(&bat_priv->distributed_arp_table,
+			   !!nla_get_u8(attr));
+		batadv_dat_status_update(bat_priv->soft_iface);
+	}
+#endif /* CONFIG_BATMAN_ADV_DAT */
+
 	batadv_netlink_notify_mesh(bat_priv);
 
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From 3e15b06eb7e410ef9f1b9673be094b3e10eacf93 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Fri, 23 Nov 2018 13:09:49 +0100
Subject: batman-adv: Add fragmentation mesh genl configuration

The mesh interface can fragment unicast packets when the packet size
exceeds the outgoing slave/hard-interface MTU.

The BATADV_CMD_SET_MESH/BATADV_CMD_GET_MESH commands allow to set/get the
configuration of this feature using the BATADV_ATTR_FRAGMENTATION_ENABLED
attribute. Setting the u8 to zero will disable this feature and setting it
to something else is enabling this feature.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 include/uapi/linux/batman_adv.h |  7 +++++++
 net/batman-adv/netlink.c        | 12 ++++++++++++
 2 files changed, 19 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index f303a1496476..847841b8de5d 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -396,6 +396,13 @@ enum batadv_nl_attrs {
 	 */
 	BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED,
 
+	/**
+	 * @BATADV_ATTR_FRAGMENTATION_ENABLED: whether the data traffic going
+	 *  through the mesh will be fragmented or silently discarded if the
+	 *  packet size exceeds the outgoing interface MTU.
+	 */
+	BATADV_ATTR_FRAGMENTATION_ENABLED,
+
 	/* add attributes above here, update the policy in netlink.c */
 
 	/**
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 3028c2a5c782..5b2160d2f482 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -146,6 +146,7 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
 	[BATADV_ATTR_BONDING_ENABLED]		= { .type = NLA_U8 },
 	[BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED]	= { .type = NLA_U8 },
 	[BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED]	= { .type = NLA_U8 },
+	[BATADV_ATTR_FRAGMENTATION_ENABLED]	= { .type = NLA_U8 },
 };
 
 /**
@@ -298,6 +299,10 @@ static int batadv_netlink_mesh_fill(struct sk_buff *msg,
 		goto nla_put_failure;
 #endif /* CONFIG_BATMAN_ADV_DAT */
 
+	if (nla_put_u8(msg, BATADV_ATTR_FRAGMENTATION_ENABLED,
+		       !!atomic_read(&bat_priv->fragmentation)))
+		goto nla_put_failure;
+
 	if (primary_if)
 		batadv_hardif_put(primary_if);
 
@@ -432,6 +437,13 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info)
 	}
 #endif /* CONFIG_BATMAN_ADV_DAT */
 
+	if (info->attrs[BATADV_ATTR_FRAGMENTATION_ENABLED]) {
+		attr = info->attrs[BATADV_ATTR_FRAGMENTATION_ENABLED];
+
+		atomic_set(&bat_priv->fragmentation, !!nla_get_u8(attr));
+		batadv_update_min_mtu(bat_priv->soft_iface);
+	}
+
 	batadv_netlink_notify_mesh(bat_priv);
 
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From e2d0d35b5b0ce420505e88255fd5922ed035bb8d Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Fri, 23 Nov 2018 13:15:00 +0100
Subject: batman-adv: Add gateway mesh genl configuration

The mesh/soft-interface can optimize the handling of DHCP packets. Instead
of flooding them through the whole mesh, it can be forwarded as unicast to
a specific gateway server. The originator which injects the packets in the
mesh has to select (based on sel_class thresholds) a responsible gateway
server. This is done by switching this originator to the gw_mode client.
The servers announce their forwarding bandwidth (download/upload) when the
gw_mode server was selected.

The BATADV_CMD_SET_MESH/BATADV_CMD_GET_MESH commands allow to set/get the
configuration of this feature using the attributes:

* u8 BATADV_ATTR_GW_MODE (0 == off, 1 == client, 2 == server)
* u32 BATADV_ATTR_GW_BANDWIDTH_DOWN (in 100 kbit/s steps)
* u32 BATADV_ATTR_GW_BANDWIDTH_UP (in 100 kbit/s steps)
* u32 BATADV_ATTR_GW_SEL_CLASS

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 include/uapi/linux/batman_adv.h | 40 ++++++++++++++++++
 net/batman-adv/gateway_client.c |  1 -
 net/batman-adv/gateway_common.c |  1 +
 net/batman-adv/gateway_common.h |  6 ---
 net/batman-adv/netlink.c        | 92 +++++++++++++++++++++++++++++++++++++++++
 net/batman-adv/soft-interface.c |  2 +-
 net/batman-adv/sysfs.c          |  1 +
 7 files changed, 135 insertions(+), 8 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index 847841b8de5d..165272be6878 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -139,6 +139,20 @@ enum batadv_mcast_flags_priv {
 	BATADV_MCAST_FLAGS_QUERIER_IPV6_SHADOWING	= (1 << 4),
 };
 
+/**
+ * enum batadv_gw_modes - gateway mode of node
+ */
+enum batadv_gw_modes {
+	/** @BATADV_GW_MODE_OFF: gw mode disabled */
+	BATADV_GW_MODE_OFF,
+
+	/** @BATADV_GW_MODE_CLIENT: send DHCP requests to gw servers */
+	BATADV_GW_MODE_CLIENT,
+
+	/** @BATADV_GW_MODE_SERVER: announce itself as gatway server */
+	BATADV_GW_MODE_SERVER,
+};
+
 /**
  * enum batadv_nl_attrs - batman-adv netlink attributes
  */
@@ -403,6 +417,32 @@ enum batadv_nl_attrs {
 	 */
 	BATADV_ATTR_FRAGMENTATION_ENABLED,
 
+	/**
+	 * @BATADV_ATTR_GW_BANDWIDTH_DOWN: defines the download bandwidth which
+	 *  is propagated by this node if %BATADV_ATTR_GW_BANDWIDTH_MODE was set
+	 *  to 'server'.
+	 */
+	BATADV_ATTR_GW_BANDWIDTH_DOWN,
+
+	/**
+	 * @BATADV_ATTR_GW_BANDWIDTH_UP: defines the upload bandwidth which
+	 *  is propagated by this node if %BATADV_ATTR_GW_BANDWIDTH_MODE was set
+	 *  to 'server'.
+	 */
+	BATADV_ATTR_GW_BANDWIDTH_UP,
+
+	/**
+	 * @BATADV_ATTR_GW_MODE: defines the state of the gateway features.
+	 * Possible values are specified in enum batadv_gw_modes
+	 */
+	BATADV_ATTR_GW_MODE,
+
+	/**
+	 * @BATADV_ATTR_GW_SEL_CLASS: defines the selection criteria this node
+	 *  will use to choose a gateway if gw_mode was set to 'client'.
+	 */
+	BATADV_ATTR_GW_SEL_CLASS,
+
 	/* add attributes above here, update the policy in netlink.c */
 
 	/**
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index 0a6b9b30eabd..f5811f61aa92 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -47,7 +47,6 @@
 #include <uapi/linux/batadv_packet.h>
 #include <uapi/linux/batman_adv.h>
 
-#include "gateway_common.h"
 #include "hard-interface.h"
 #include "log.h"
 #include "netlink.h"
diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c
index 53d03adc9bfc..e064de45e22c 100644
--- a/net/batman-adv/gateway_common.c
+++ b/net/batman-adv/gateway_common.c
@@ -28,6 +28,7 @@
 #include <linux/stddef.h>
 #include <linux/string.h>
 #include <uapi/linux/batadv_packet.h>
+#include <uapi/linux/batman_adv.h>
 
 #include "gateway_client.h"
 #include "log.h"
diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h
index 89bae100a0b0..128467a0fb89 100644
--- a/net/batman-adv/gateway_common.h
+++ b/net/batman-adv/gateway_common.h
@@ -25,12 +25,6 @@
 
 struct net_device;
 
-enum batadv_gw_modes {
-	BATADV_GW_MODE_OFF,
-	BATADV_GW_MODE_CLIENT,
-	BATADV_GW_MODE_SERVER,
-};
-
 /**
  * enum batadv_bandwidth_units - bandwidth unit types
  */
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 5b2160d2f482..473467afea91 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -51,6 +51,7 @@
 #include "bridge_loop_avoidance.h"
 #include "distributed-arp-table.h"
 #include "gateway_client.h"
+#include "gateway_common.h"
 #include "hard-interface.h"
 #include "multicast.h"
 #include "originator.h"
@@ -147,6 +148,10 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
 	[BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED]	= { .type = NLA_U8 },
 	[BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED]	= { .type = NLA_U8 },
 	[BATADV_ATTR_FRAGMENTATION_ENABLED]	= { .type = NLA_U8 },
+	[BATADV_ATTR_GW_BANDWIDTH_DOWN]		= { .type = NLA_U32 },
+	[BATADV_ATTR_GW_BANDWIDTH_UP]		= { .type = NLA_U32 },
+	[BATADV_ATTR_GW_MODE]			= { .type = NLA_U8 },
+	[BATADV_ATTR_GW_SEL_CLASS]		= { .type = NLA_U32 },
 };
 
 /**
@@ -303,6 +308,28 @@ static int batadv_netlink_mesh_fill(struct sk_buff *msg,
 		       !!atomic_read(&bat_priv->fragmentation)))
 		goto nla_put_failure;
 
+	if (nla_put_u32(msg, BATADV_ATTR_GW_BANDWIDTH_DOWN,
+			atomic_read(&bat_priv->gw.bandwidth_down)))
+		goto nla_put_failure;
+
+	if (nla_put_u32(msg, BATADV_ATTR_GW_BANDWIDTH_UP,
+			atomic_read(&bat_priv->gw.bandwidth_up)))
+		goto nla_put_failure;
+
+	if (nla_put_u8(msg, BATADV_ATTR_GW_MODE,
+		       atomic_read(&bat_priv->gw.mode)))
+		goto nla_put_failure;
+
+	if (bat_priv->algo_ops->gw.get_best_gw_node &&
+	    bat_priv->algo_ops->gw.is_eligible) {
+		/* GW selection class is not available if the routing algorithm
+		 * in use does not implement the GW API
+		 */
+		if (nla_put_u32(msg, BATADV_ATTR_GW_SEL_CLASS,
+				atomic_read(&bat_priv->gw.sel_class)))
+			goto nla_put_failure;
+	}
+
 	if (primary_if)
 		batadv_hardif_put(primary_if);
 
@@ -444,6 +471,71 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info)
 		batadv_update_min_mtu(bat_priv->soft_iface);
 	}
 
+	if (info->attrs[BATADV_ATTR_GW_BANDWIDTH_DOWN]) {
+		attr = info->attrs[BATADV_ATTR_GW_BANDWIDTH_DOWN];
+
+		atomic_set(&bat_priv->gw.bandwidth_down, nla_get_u32(attr));
+		batadv_gw_tvlv_container_update(bat_priv);
+	}
+
+	if (info->attrs[BATADV_ATTR_GW_BANDWIDTH_UP]) {
+		attr = info->attrs[BATADV_ATTR_GW_BANDWIDTH_UP];
+
+		atomic_set(&bat_priv->gw.bandwidth_up, nla_get_u32(attr));
+		batadv_gw_tvlv_container_update(bat_priv);
+	}
+
+	if (info->attrs[BATADV_ATTR_GW_MODE]) {
+		u8 gw_mode;
+
+		attr = info->attrs[BATADV_ATTR_GW_MODE];
+		gw_mode = nla_get_u8(attr);
+
+		if (gw_mode <= BATADV_GW_MODE_SERVER) {
+			/* Invoking batadv_gw_reselect() is not enough to really
+			 * de-select the current GW. It will only instruct the
+			 * gateway client code to perform a re-election the next
+			 * time that this is needed.
+			 *
+			 * When gw client mode is being switched off the current
+			 * GW must be de-selected explicitly otherwise no GW_ADD
+			 * uevent is thrown on client mode re-activation. This
+			 * is operation is performed in
+			 * batadv_gw_check_client_stop().
+			 */
+			batadv_gw_reselect(bat_priv);
+
+			/* always call batadv_gw_check_client_stop() before
+			 * changing the gateway state
+			 */
+			batadv_gw_check_client_stop(bat_priv);
+			atomic_set(&bat_priv->gw.mode, gw_mode);
+			batadv_gw_tvlv_container_update(bat_priv);
+		}
+	}
+
+	if (info->attrs[BATADV_ATTR_GW_SEL_CLASS] &&
+	    bat_priv->algo_ops->gw.get_best_gw_node &&
+	    bat_priv->algo_ops->gw.is_eligible) {
+		/* setting the GW selection class is allowed only if the routing
+		 * algorithm in use implements the GW API
+		 */
+
+		u32 sel_class_max = 0xffffffffu;
+		u32 sel_class;
+
+		attr = info->attrs[BATADV_ATTR_GW_SEL_CLASS];
+		sel_class = nla_get_u32(attr);
+
+		if (!bat_priv->algo_ops->gw.store_sel_class)
+			sel_class_max = BATADV_TQ_MAX_VALUE;
+
+		if (sel_class >= 1 && sel_class <= sel_class_max) {
+			atomic_set(&bat_priv->gw.sel_class, sel_class);
+			batadv_gw_reselect(bat_priv);
+		}
+	}
+
 	batadv_netlink_notify_mesh(bat_priv);
 
 	return 0;
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index b14fb3462af7..12028c287de5 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -50,13 +50,13 @@
 #include <linux/string.h>
 #include <linux/types.h>
 #include <uapi/linux/batadv_packet.h>
+#include <uapi/linux/batman_adv.h>
 
 #include "bat_algo.h"
 #include "bridge_loop_avoidance.h"
 #include "debugfs.h"
 #include "distributed-arp-table.h"
 #include "gateway_client.h"
-#include "gateway_common.h"
 #include "hard-interface.h"
 #include "multicast.h"
 #include "network-coding.h"
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index e1b816262c53..64fd5932a555 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -40,6 +40,7 @@
 #include <linux/stringify.h>
 #include <linux/workqueue.h>
 #include <uapi/linux/batadv_packet.h>
+#include <uapi/linux/batman_adv.h>
 
 #include "bridge_loop_avoidance.h"
 #include "distributed-arp-table.h"
-- 
cgit v1.2.3-59-g8ed1b


From bfc7f1be57b8a5ea738ce5db62b82234e4901abf Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Fri, 23 Nov 2018 13:19:38 +0100
Subject: batman-adv: Add hop_penalty mesh genl configuration

The TQ (B.A.T.M.A.N. IV) and throughput values (B.A.T.M.A.N. V) are reduced
when they are forwarded. One of the reductions is the penalty for
traversing an additional hop. This hop_penalty (0-255) defines the
percentage of reduction (0-100%).

The BATADV_CMD_SET_MESH/BATADV_CMD_GET_MESH commands allow to set/get the
configuration of this feature using the u8 BATADV_ATTR_HOP_PENALTY
attribute.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 include/uapi/linux/batman_adv.h |  6 ++++++
 net/batman-adv/netlink.c        | 11 +++++++++++
 2 files changed, 17 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index 165272be6878..b37cb923332e 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -443,6 +443,12 @@ enum batadv_nl_attrs {
 	 */
 	BATADV_ATTR_GW_SEL_CLASS,
 
+	/**
+	 * @BATADV_ATTR_HOP_PENALTY: defines the penalty which will be applied
+	 *  to an originator message's tq-field on every hop.
+	 */
+	BATADV_ATTR_HOP_PENALTY,
+
 	/* add attributes above here, update the policy in netlink.c */
 
 	/**
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 473467afea91..ce6e6f078765 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -152,6 +152,7 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
 	[BATADV_ATTR_GW_BANDWIDTH_UP]		= { .type = NLA_U32 },
 	[BATADV_ATTR_GW_MODE]			= { .type = NLA_U8 },
 	[BATADV_ATTR_GW_SEL_CLASS]		= { .type = NLA_U32 },
+	[BATADV_ATTR_HOP_PENALTY]		= { .type = NLA_U8 },
 };
 
 /**
@@ -330,6 +331,10 @@ static int batadv_netlink_mesh_fill(struct sk_buff *msg,
 			goto nla_put_failure;
 	}
 
+	if (nla_put_u8(msg, BATADV_ATTR_HOP_PENALTY,
+		       atomic_read(&bat_priv->hop_penalty)))
+		goto nla_put_failure;
+
 	if (primary_if)
 		batadv_hardif_put(primary_if);
 
@@ -536,6 +541,12 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info)
 		}
 	}
 
+	if (info->attrs[BATADV_ATTR_HOP_PENALTY]) {
+		attr = info->attrs[BATADV_ATTR_HOP_PENALTY];
+
+		atomic_set(&bat_priv->hop_penalty, nla_get_u8(attr));
+	}
+
 	batadv_netlink_notify_mesh(bat_priv);
 
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From b85bd091098a52f7bf00d2725b536455f82ba0d0 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Fri, 23 Nov 2018 13:22:33 +0100
Subject: batman-adv: Add log_level mesh genl configuration

In contrast to other modules, batman-adv allows to set the debug message
verbosity per mesh/soft-interface and not per module (via modparam).

The BATADV_CMD_SET_MESH/BATADV_CMD_GET_MESH commands allow to set/get the
configuration of this feature using the u32 (bitmask) BATADV_ATTR_LOG_LEVEL
attribute.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 include/uapi/linux/batman_adv.h |  6 ++++++
 net/batman-adv/netlink.c        | 17 +++++++++++++++++
 2 files changed, 23 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index b37cb923332e..6d36e4b47eb4 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -449,6 +449,12 @@ enum batadv_nl_attrs {
 	 */
 	BATADV_ATTR_HOP_PENALTY,
 
+	/**
+	 * @BATADV_ATTR_LOG_LEVEL: bitmask with to define which debug messages
+	 *  should be send to the debug log/trace ring buffer
+	 */
+	BATADV_ATTR_LOG_LEVEL,
+
 	/* add attributes above here, update the policy in netlink.c */
 
 	/**
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index ce6e6f078765..8c019d46815c 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -53,6 +53,7 @@
 #include "gateway_client.h"
 #include "gateway_common.h"
 #include "hard-interface.h"
+#include "log.h"
 #include "multicast.h"
 #include "originator.h"
 #include "soft-interface.h"
@@ -153,6 +154,7 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
 	[BATADV_ATTR_GW_MODE]			= { .type = NLA_U8 },
 	[BATADV_ATTR_GW_SEL_CLASS]		= { .type = NLA_U32 },
 	[BATADV_ATTR_HOP_PENALTY]		= { .type = NLA_U8 },
+	[BATADV_ATTR_LOG_LEVEL]			= { .type = NLA_U32 },
 };
 
 /**
@@ -335,6 +337,12 @@ static int batadv_netlink_mesh_fill(struct sk_buff *msg,
 		       atomic_read(&bat_priv->hop_penalty)))
 		goto nla_put_failure;
 
+#ifdef CONFIG_BATMAN_ADV_DEBUG
+	if (nla_put_u32(msg, BATADV_ATTR_LOG_LEVEL,
+			atomic_read(&bat_priv->log_level)))
+		goto nla_put_failure;
+#endif /* CONFIG_BATMAN_ADV_DEBUG */
+
 	if (primary_if)
 		batadv_hardif_put(primary_if);
 
@@ -547,6 +555,15 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info)
 		atomic_set(&bat_priv->hop_penalty, nla_get_u8(attr));
 	}
 
+#ifdef CONFIG_BATMAN_ADV_DEBUG
+	if (info->attrs[BATADV_ATTR_LOG_LEVEL]) {
+		attr = info->attrs[BATADV_ATTR_LOG_LEVEL];
+
+		atomic_set(&bat_priv->log_level,
+			   nla_get_u32(attr) & BATADV_DBG_ALL);
+	}
+#endif /* CONFIG_BATMAN_ADV_DEBUG */
+
 	batadv_netlink_notify_mesh(bat_priv);
 
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From f75b56bc91122e2934e2cb458f98727c41d535c7 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Fri, 23 Nov 2018 13:25:05 +0100
Subject: batman-adv: Add multicast forceflood mesh genl configuration

The mesh interface can optimize the flooding of multicast packets based on
the content of the global translation tables. To disable this behavior and
use the broadcast-like flooding of the packets, forceflood has to be
enabled.

The BATADV_CMD_SET_MESH/BATADV_CMD_GET_MESH commands allow to set/get the
configuration of this feature using the
BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED attribute. Setting the u8 to zero
will disable this feature (allowing multicast optimizations) and setting it
to something else is enabling this feature (forcing simple flooding).

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 include/uapi/linux/batman_adv.h |  9 +++++++++
 net/batman-adv/netlink.c        | 15 +++++++++++++++
 2 files changed, 24 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index 6d36e4b47eb4..38caaaae8a05 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -455,6 +455,15 @@ enum batadv_nl_attrs {
 	 */
 	BATADV_ATTR_LOG_LEVEL,
 
+	/**
+	 * @BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED: whether multicast
+	 *  optimizations should be replaced by simple broadcast-like flooding
+	 *  of multicast packets. If set to non-zero then all nodes in the mesh
+	 *  are going to use classic flooding for any multicast packet with no
+	 *  optimizations.
+	 */
+	BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED,
+
 	/* add attributes above here, update the policy in netlink.c */
 
 	/**
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 8c019d46815c..475bd15f806c 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -155,6 +155,7 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
 	[BATADV_ATTR_GW_SEL_CLASS]		= { .type = NLA_U32 },
 	[BATADV_ATTR_HOP_PENALTY]		= { .type = NLA_U8 },
 	[BATADV_ATTR_LOG_LEVEL]			= { .type = NLA_U32 },
+	[BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED]	= { .type = NLA_U8 },
 };
 
 /**
@@ -343,6 +344,12 @@ static int batadv_netlink_mesh_fill(struct sk_buff *msg,
 		goto nla_put_failure;
 #endif /* CONFIG_BATMAN_ADV_DEBUG */
 
+#ifdef CONFIG_BATMAN_ADV_MCAST
+	if (nla_put_u8(msg, BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED,
+		       !atomic_read(&bat_priv->multicast_mode)))
+		goto nla_put_failure;
+#endif /* CONFIG_BATMAN_ADV_MCAST */
+
 	if (primary_if)
 		batadv_hardif_put(primary_if);
 
@@ -564,6 +571,14 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info)
 	}
 #endif /* CONFIG_BATMAN_ADV_DEBUG */
 
+#ifdef CONFIG_BATMAN_ADV_MCAST
+	if (info->attrs[BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED]) {
+		attr = info->attrs[BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED];
+
+		atomic_set(&bat_priv->multicast_mode, !nla_get_u8(attr));
+	}
+#endif /* CONFIG_BATMAN_ADV_MCAST */
+
 	batadv_netlink_notify_mesh(bat_priv);
 
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From 6c57cde6800bae2361b8ac14a5924ffc592b3a90 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Fri, 23 Nov 2018 13:26:14 +0100
Subject: batman-adv: Add network_coding mesh genl configuration

The mesh interface can use (in an homogeneous mesh) network coding, a
mechanism that aims to increase the overall network throughput by fusing
multiple packets in one transmission.

The BATADV_CMD_SET_MESH/BATADV_CMD_GET_MESH commands allow to set/get the
configuration of this feature using the BATADV_ATTR_NETWORK_CODING_ENABLED
attribute. Setting the u8 to zero will disable this feature and setting it
to something else is enabling this feature.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 include/uapi/linux/batman_adv.h |  7 +++++++
 net/batman-adv/netlink.c        | 17 +++++++++++++++++
 2 files changed, 24 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index 38caaaae8a05..a4239c147bde 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -464,6 +464,13 @@ enum batadv_nl_attrs {
 	 */
 	BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED,
 
+	/**
+	 * @BATADV_ATTR_NETWORK_CODING_ENABLED: whether Network Coding (using
+	 *  some magic to send fewer wifi packets but still the same content) is
+	 *  enabled or not.
+	 */
+	BATADV_ATTR_NETWORK_CODING_ENABLED,
+
 	/* add attributes above here, update the policy in netlink.c */
 
 	/**
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 475bd15f806c..6f01f92e6ab3 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -55,6 +55,7 @@
 #include "hard-interface.h"
 #include "log.h"
 #include "multicast.h"
+#include "network-coding.h"
 #include "originator.h"
 #include "soft-interface.h"
 #include "tp_meter.h"
@@ -156,6 +157,7 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
 	[BATADV_ATTR_HOP_PENALTY]		= { .type = NLA_U8 },
 	[BATADV_ATTR_LOG_LEVEL]			= { .type = NLA_U32 },
 	[BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED]	= { .type = NLA_U8 },
+	[BATADV_ATTR_NETWORK_CODING_ENABLED]	= { .type = NLA_U8 },
 };
 
 /**
@@ -350,6 +352,12 @@ static int batadv_netlink_mesh_fill(struct sk_buff *msg,
 		goto nla_put_failure;
 #endif /* CONFIG_BATMAN_ADV_MCAST */
 
+#ifdef CONFIG_BATMAN_ADV_NC
+	if (nla_put_u8(msg, BATADV_ATTR_NETWORK_CODING_ENABLED,
+		       !!atomic_read(&bat_priv->network_coding)))
+		goto nla_put_failure;
+#endif /* CONFIG_BATMAN_ADV_NC */
+
 	if (primary_if)
 		batadv_hardif_put(primary_if);
 
@@ -579,6 +587,15 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info)
 	}
 #endif /* CONFIG_BATMAN_ADV_MCAST */
 
+#ifdef CONFIG_BATMAN_ADV_NC
+	if (info->attrs[BATADV_ATTR_NETWORK_CODING_ENABLED]) {
+		attr = info->attrs[BATADV_ATTR_NETWORK_CODING_ENABLED];
+
+		atomic_set(&bat_priv->network_coding, !!nla_get_u8(attr));
+		batadv_nc_status_update(bat_priv->soft_iface);
+	}
+#endif /* CONFIG_BATMAN_ADV_NC */
+
 	batadv_netlink_notify_mesh(bat_priv);
 
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From 7b751b39f018a828a04692e199c044087102e96c Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Fri, 23 Nov 2018 13:28:02 +0100
Subject: batman-adv: Add orig_interval mesh genl configuration

The OGM packets are transmitted every orig_interval milliseconds. This
value can be changed using the configuration interface.

The BATADV_CMD_SET_MESH/BATADV_CMD_GET_MESH commands allow to set/get the
configuration of this feature using the u32 BATADV_ATTR_ORIG_INTERVAL
attribute.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 include/uapi/linux/batman_adv.h |  6 ++++++
 net/batman-adv/netlink.c        | 17 +++++++++++++++++
 2 files changed, 23 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index a4239c147bde..6bedd4889c37 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -471,6 +471,12 @@ enum batadv_nl_attrs {
 	 */
 	BATADV_ATTR_NETWORK_CODING_ENABLED,
 
+	/**
+	 * @BATADV_ATTR_ORIG_INTERVAL: defines the interval in milliseconds in
+	 *  which batman sends its protocol messages.
+	 */
+	BATADV_ATTR_ORIG_INTERVAL,
+
 	/* add attributes above here, update the policy in netlink.c */
 
 	/**
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 6f01f92e6ab3..e25c139ba0e1 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -158,6 +158,7 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
 	[BATADV_ATTR_LOG_LEVEL]			= { .type = NLA_U32 },
 	[BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED]	= { .type = NLA_U8 },
 	[BATADV_ATTR_NETWORK_CODING_ENABLED]	= { .type = NLA_U8 },
+	[BATADV_ATTR_ORIG_INTERVAL]		= { .type = NLA_U32 },
 };
 
 /**
@@ -358,6 +359,10 @@ static int batadv_netlink_mesh_fill(struct sk_buff *msg,
 		goto nla_put_failure;
 #endif /* CONFIG_BATMAN_ADV_NC */
 
+	if (nla_put_u32(msg, BATADV_ATTR_ORIG_INTERVAL,
+			atomic_read(&bat_priv->orig_interval)))
+		goto nla_put_failure;
+
 	if (primary_if)
 		batadv_hardif_put(primary_if);
 
@@ -596,6 +601,18 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info)
 	}
 #endif /* CONFIG_BATMAN_ADV_NC */
 
+	if (info->attrs[BATADV_ATTR_ORIG_INTERVAL]) {
+		u32 orig_interval;
+
+		attr = info->attrs[BATADV_ATTR_ORIG_INTERVAL];
+		orig_interval = nla_get_u32(attr);
+
+		orig_interval = min_t(u32, orig_interval, INT_MAX);
+		orig_interval = max_t(u32, orig_interval, 2 * BATADV_JITTER);
+
+		atomic_set(&bat_priv->orig_interval, orig_interval);
+	}
+
 	batadv_netlink_notify_mesh(bat_priv);
 
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From a108008290405545b43b9c7975344bc59af2341b Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Fri, 23 Nov 2018 13:30:04 +0100
Subject: batman-adv: Add elp_interval hardif genl configuration

The ELP packets are transmitted every elp_interval milliseconds on an
slave/hard-interface. This value can be changed using the configuration
interface.

The BATADV_CMD_SET_HARDIF/BATADV_CMD_GET_HARDIF commands allow to set/get
the configuration of this feature using the u32 BATADV_ATTR_ELP_INTERVAL
attribute.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 include/uapi/linux/batman_adv.h |  6 ++++++
 net/batman-adv/netlink.c        | 17 +++++++++++++++++
 2 files changed, 23 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index 6bedd4889c37..f966e497361b 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -477,6 +477,12 @@ enum batadv_nl_attrs {
 	 */
 	BATADV_ATTR_ORIG_INTERVAL,
 
+	/**
+	 * @BATADV_ATTR_ELP_INTERVAL: defines the interval in milliseconds in
+	 *  which batman emits probing packets for neighbor sensing (ELP).
+	 */
+	BATADV_ATTR_ELP_INTERVAL,
+
 	/* add attributes above here, update the policy in netlink.c */
 
 	/**
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index e25c139ba0e1..09187e6e92a3 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -159,6 +159,7 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
 	[BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED]	= { .type = NLA_U8 },
 	[BATADV_ATTR_NETWORK_CODING_ENABLED]	= { .type = NLA_U8 },
 	[BATADV_ATTR_ORIG_INTERVAL]		= { .type = NLA_U32 },
+	[BATADV_ATTR_ELP_INTERVAL]		= { .type = NLA_U32 },
 };
 
 /**
@@ -825,6 +826,12 @@ static int batadv_netlink_hardif_fill(struct sk_buff *msg,
 			goto nla_put_failure;
 	}
 
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+	if (nla_put_u32(msg, BATADV_ATTR_ELP_INTERVAL,
+			atomic_read(&hard_iface->bat_v.elp_interval)))
+		goto nla_put_failure;
+#endif /* CONFIG_BATMAN_ADV_BATMAN_V */
+
 	genlmsg_end(msg, hdr);
 	return 0;
 
@@ -910,6 +917,16 @@ static int batadv_netlink_set_hardif(struct sk_buff *skb,
 	struct batadv_hard_iface *hard_iface = info->user_ptr[1];
 	struct batadv_priv *bat_priv = info->user_ptr[0];
 
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+	struct nlattr *attr;
+
+	if (info->attrs[BATADV_ATTR_ELP_INTERVAL]) {
+		attr = info->attrs[BATADV_ATTR_ELP_INTERVAL];
+
+		atomic_set(&hard_iface->bat_v.elp_interval, nla_get_u32(attr));
+	}
+#endif /* CONFIG_BATMAN_ADV_BATMAN_V */
+
 	batadv_netlink_notify_hardif(bat_priv, hard_iface);
 
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From 9a182242f17c06fad620663e6fdf992e97661e66 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Fri, 23 Nov 2018 13:31:23 +0100
Subject: batman-adv: Add throughput_override hardif genl configuration

The B.A.T.M.A.N. V implementation tries to estimate the link throughput of
an interface to an originator using different automatic methods. It is
still possible to overwrite it the link throughput for all reachable
originators via this interface.

The BATADV_CMD_SET_HARDIF/BATADV_CMD_GET_HARDIF commands allow to set/get
the configuration of this feature using the u32
BATADV_ATTR_THROUGHPUT_OVERRIDE attribute. The used unit is in 100 Kbit/s.
If the value is set to 0 then batman-adv will try to estimate the
throughput by itself.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 include/uapi/linux/batman_adv.h |  8 ++++++++
 net/batman-adv/netlink.c        | 12 ++++++++++++
 2 files changed, 20 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index f966e497361b..305bf316dd03 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -483,6 +483,14 @@ enum batadv_nl_attrs {
 	 */
 	BATADV_ATTR_ELP_INTERVAL,
 
+	/**
+	 * @BATADV_ATTR_THROUGHPUT_OVERRIDE: defines the throughput value to be
+	 *  used by B.A.T.M.A.N. V when estimating the link throughput using
+	 *  this interface. If the value is set to 0 then batman-adv will try to
+	 *  estimate the throughput by itself.
+	 */
+	BATADV_ATTR_THROUGHPUT_OVERRIDE,
+
 	/* add attributes above here, update the policy in netlink.c */
 
 	/**
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 09187e6e92a3..476b4c6017c9 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -160,6 +160,7 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
 	[BATADV_ATTR_NETWORK_CODING_ENABLED]	= { .type = NLA_U8 },
 	[BATADV_ATTR_ORIG_INTERVAL]		= { .type = NLA_U32 },
 	[BATADV_ATTR_ELP_INTERVAL]		= { .type = NLA_U32 },
+	[BATADV_ATTR_THROUGHPUT_OVERRIDE]	= { .type = NLA_U32 },
 };
 
 /**
@@ -830,6 +831,10 @@ static int batadv_netlink_hardif_fill(struct sk_buff *msg,
 	if (nla_put_u32(msg, BATADV_ATTR_ELP_INTERVAL,
 			atomic_read(&hard_iface->bat_v.elp_interval)))
 		goto nla_put_failure;
+
+	if (nla_put_u32(msg, BATADV_ATTR_THROUGHPUT_OVERRIDE,
+			atomic_read(&hard_iface->bat_v.throughput_override)))
+		goto nla_put_failure;
 #endif /* CONFIG_BATMAN_ADV_BATMAN_V */
 
 	genlmsg_end(msg, hdr);
@@ -925,6 +930,13 @@ static int batadv_netlink_set_hardif(struct sk_buff *skb,
 
 		atomic_set(&hard_iface->bat_v.elp_interval, nla_get_u32(attr));
 	}
+
+	if (info->attrs[BATADV_ATTR_THROUGHPUT_OVERRIDE]) {
+		attr = info->attrs[BATADV_ATTR_THROUGHPUT_OVERRIDE];
+
+		atomic_set(&hard_iface->bat_v.throughput_override,
+			   nla_get_u32(attr));
+	}
 #endif /* CONFIG_BATMAN_ADV_BATMAN_V */
 
 	batadv_netlink_notify_hardif(bat_priv, hard_iface);
-- 
cgit v1.2.3-59-g8ed1b


From 257eeded20b34219d5484cfc415b3e39093f37b8 Mon Sep 17 00:00:00 2001
From: Eli Cohen <eli@mellanox.com>
Date: Sun, 10 Feb 2019 14:24:59 +0200
Subject: net: Move all TC actions identifiers to one place

Move all the TC identifiers to one place, to the same enum that defines
the identifier of police action. This makes it easier choose numbers for
new actions since they are now defined in one place. We preserve the
original values for binary compatibility. New IDs should be added inside
the enum.

Signed-off-by: Eli Cohen <eli@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_cls.h              | 43 ++++++++++++++++++++++++++++---
 include/uapi/linux/tc_act/tc_bpf.h        |  2 --
 include/uapi/linux/tc_act/tc_connmark.h   |  2 --
 include/uapi/linux/tc_act/tc_csum.h       |  2 --
 include/uapi/linux/tc_act/tc_gact.h       |  1 -
 include/uapi/linux/tc_act/tc_ife.h        |  1 -
 include/uapi/linux/tc_act/tc_ipt.h        |  3 ---
 include/uapi/linux/tc_act/tc_mirred.h     |  1 -
 include/uapi/linux/tc_act/tc_nat.h        |  2 --
 include/uapi/linux/tc_act/tc_pedit.h      |  2 --
 include/uapi/linux/tc_act/tc_sample.h     |  2 --
 include/uapi/linux/tc_act/tc_skbedit.h    |  2 --
 include/uapi/linux/tc_act/tc_skbmod.h     |  2 --
 include/uapi/linux/tc_act/tc_tunnel_key.h |  2 --
 include/uapi/linux/tc_act/tc_vlan.h       |  2 --
 net/sched/act_simple.c                    |  2 --
 tools/include/uapi/linux/tc_act/tc_bpf.h  |  2 --
 17 files changed, 40 insertions(+), 33 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 02ac251be8c4..7ab55f97e7c4 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -63,12 +63,49 @@ enum {
 #define TC_ACT_GOTO_CHAIN __TC_ACT_EXT(2)
 #define TC_ACT_EXT_OPCODE_MAX	TC_ACT_GOTO_CHAIN
 
+/* These macros are put here for binary compatibility with userspace apps that
+ * make use of them. For kernel code and new userspace apps, use the TCA_ID_*
+ * versions.
+ */
+#define TCA_ACT_GACT 5
+#define TCA_ACT_IPT 6
+#define TCA_ACT_PEDIT 7
+#define TCA_ACT_MIRRED 8
+#define TCA_ACT_NAT 9
+#define TCA_ACT_XT 10
+#define TCA_ACT_SKBEDIT 11
+#define TCA_ACT_VLAN 12
+#define TCA_ACT_BPF 13
+#define TCA_ACT_CONNMARK 14
+#define TCA_ACT_SKBMOD 15
+#define TCA_ACT_CSUM 16
+#define TCA_ACT_TUNNEL_KEY 17
+#define TCA_ACT_SIMP 22
+#define TCA_ACT_IFE 25
+#define TCA_ACT_SAMPLE 26
+
 /* Action type identifiers*/
 enum {
-	TCA_ID_UNSPEC=0,
-	TCA_ID_POLICE=1,
+	TCA_ID_UNSPEC = 0,
+	TCA_ID_POLICE = 1,
+	TCA_ID_GACT = TCA_ACT_GACT,
+	TCA_ID_IPT = TCA_ACT_IPT,
+	TCA_ID_PEDIT = TCA_ACT_PEDIT,
+	TCA_ID_MIRRED = TCA_ACT_MIRRED,
+	TCA_ID_NAT = TCA_ACT_NAT,
+	TCA_ID_XT = TCA_ACT_XT,
+	TCA_ID_SKBEDIT = TCA_ACT_SKBEDIT,
+	TCA_ID_VLAN = TCA_ACT_VLAN,
+	TCA_ID_BPF = TCA_ACT_BPF,
+	TCA_ID_CONNMARK = TCA_ACT_CONNMARK,
+	TCA_ID_SKBMOD = TCA_ACT_SKBMOD,
+	TCA_ID_CSUM = TCA_ACT_CSUM,
+	TCA_ID_TUNNEL_KEY = TCA_ACT_TUNNEL_KEY,
+	TCA_ID_SIMP = TCA_ACT_SIMP,
+	TCA_ID_IFE = TCA_ACT_IFE,
+	TCA_ID_SAMPLE = TCA_ACT_SAMPLE,
 	/* other actions go here */
-	__TCA_ID_MAX=255
+	__TCA_ID_MAX = 255
 };
 
 #define TCA_ID_MAX __TCA_ID_MAX
diff --git a/include/uapi/linux/tc_act/tc_bpf.h b/include/uapi/linux/tc_act/tc_bpf.h
index 6e89a5df49a4..653c4f94f76e 100644
--- a/include/uapi/linux/tc_act/tc_bpf.h
+++ b/include/uapi/linux/tc_act/tc_bpf.h
@@ -13,8 +13,6 @@
 
 #include <linux/pkt_cls.h>
 
-#define TCA_ACT_BPF 13
-
 struct tc_act_bpf {
 	tc_gen;
 };
diff --git a/include/uapi/linux/tc_act/tc_connmark.h b/include/uapi/linux/tc_act/tc_connmark.h
index 80caa47b1933..9f8f6f709feb 100644
--- a/include/uapi/linux/tc_act/tc_connmark.h
+++ b/include/uapi/linux/tc_act/tc_connmark.h
@@ -5,8 +5,6 @@
 #include <linux/types.h>
 #include <linux/pkt_cls.h>
 
-#define TCA_ACT_CONNMARK 14
-
 struct tc_connmark {
 	tc_gen;
 	__u16 zone;
diff --git a/include/uapi/linux/tc_act/tc_csum.h b/include/uapi/linux/tc_act/tc_csum.h
index 0ecf4d29e2f3..94b2044929de 100644
--- a/include/uapi/linux/tc_act/tc_csum.h
+++ b/include/uapi/linux/tc_act/tc_csum.h
@@ -5,8 +5,6 @@
 #include <linux/types.h>
 #include <linux/pkt_cls.h>
 
-#define TCA_ACT_CSUM 16
-
 enum {
 	TCA_CSUM_UNSPEC,
 	TCA_CSUM_PARMS,
diff --git a/include/uapi/linux/tc_act/tc_gact.h b/include/uapi/linux/tc_act/tc_gact.h
index 94273c3b81b0..37e5392e02c7 100644
--- a/include/uapi/linux/tc_act/tc_gact.h
+++ b/include/uapi/linux/tc_act/tc_gact.h
@@ -5,7 +5,6 @@
 #include <linux/types.h>
 #include <linux/pkt_cls.h>
 
-#define TCA_ACT_GACT 5
 struct tc_gact {
 	tc_gen;
 
diff --git a/include/uapi/linux/tc_act/tc_ife.h b/include/uapi/linux/tc_act/tc_ife.h
index 2f48490ef386..8c401f185675 100644
--- a/include/uapi/linux/tc_act/tc_ife.h
+++ b/include/uapi/linux/tc_act/tc_ife.h
@@ -6,7 +6,6 @@
 #include <linux/pkt_cls.h>
 #include <linux/ife.h>
 
-#define TCA_ACT_IFE 25
 /* Flag bits for now just encoding/decoding; mutually exclusive */
 #define IFE_ENCODE 1
 #define IFE_DECODE 0
diff --git a/include/uapi/linux/tc_act/tc_ipt.h b/include/uapi/linux/tc_act/tc_ipt.h
index b743c8bddd13..c48d7da6750d 100644
--- a/include/uapi/linux/tc_act/tc_ipt.h
+++ b/include/uapi/linux/tc_act/tc_ipt.h
@@ -4,9 +4,6 @@
 
 #include <linux/pkt_cls.h>
 
-#define TCA_ACT_IPT 6
-#define TCA_ACT_XT 10
-
 enum {
 	TCA_IPT_UNSPEC,
 	TCA_IPT_TABLE,
diff --git a/include/uapi/linux/tc_act/tc_mirred.h b/include/uapi/linux/tc_act/tc_mirred.h
index 5dd671cf5776..2500a0005d05 100644
--- a/include/uapi/linux/tc_act/tc_mirred.h
+++ b/include/uapi/linux/tc_act/tc_mirred.h
@@ -5,7 +5,6 @@
 #include <linux/types.h>
 #include <linux/pkt_cls.h>
 
-#define TCA_ACT_MIRRED 8
 #define TCA_EGRESS_REDIR 1  /* packet redirect to EGRESS*/
 #define TCA_EGRESS_MIRROR 2 /* mirror packet to EGRESS */
 #define TCA_INGRESS_REDIR 3  /* packet redirect to INGRESS*/
diff --git a/include/uapi/linux/tc_act/tc_nat.h b/include/uapi/linux/tc_act/tc_nat.h
index 086be842587b..21399c2c6130 100644
--- a/include/uapi/linux/tc_act/tc_nat.h
+++ b/include/uapi/linux/tc_act/tc_nat.h
@@ -5,8 +5,6 @@
 #include <linux/pkt_cls.h>
 #include <linux/types.h>
 
-#define TCA_ACT_NAT 9
-
 enum {
 	TCA_NAT_UNSPEC,
 	TCA_NAT_PARMS,
diff --git a/include/uapi/linux/tc_act/tc_pedit.h b/include/uapi/linux/tc_act/tc_pedit.h
index 24ec792dacc1..f3e61b04fa01 100644
--- a/include/uapi/linux/tc_act/tc_pedit.h
+++ b/include/uapi/linux/tc_act/tc_pedit.h
@@ -5,8 +5,6 @@
 #include <linux/types.h>
 #include <linux/pkt_cls.h>
 
-#define TCA_ACT_PEDIT 7
-
 enum {
 	TCA_PEDIT_UNSPEC,
 	TCA_PEDIT_TM,
diff --git a/include/uapi/linux/tc_act/tc_sample.h b/include/uapi/linux/tc_act/tc_sample.h
index bd7e9f03abd2..fee1bcc20793 100644
--- a/include/uapi/linux/tc_act/tc_sample.h
+++ b/include/uapi/linux/tc_act/tc_sample.h
@@ -6,8 +6,6 @@
 #include <linux/pkt_cls.h>
 #include <linux/if_ether.h>
 
-#define TCA_ACT_SAMPLE 26
-
 struct tc_sample {
 	tc_gen;
 };
diff --git a/include/uapi/linux/tc_act/tc_skbedit.h b/include/uapi/linux/tc_act/tc_skbedit.h
index 6de6071ebed6..800e93377218 100644
--- a/include/uapi/linux/tc_act/tc_skbedit.h
+++ b/include/uapi/linux/tc_act/tc_skbedit.h
@@ -23,8 +23,6 @@
 
 #include <linux/pkt_cls.h>
 
-#define TCA_ACT_SKBEDIT 11
-
 #define SKBEDIT_F_PRIORITY		0x1
 #define SKBEDIT_F_QUEUE_MAPPING		0x2
 #define SKBEDIT_F_MARK			0x4
diff --git a/include/uapi/linux/tc_act/tc_skbmod.h b/include/uapi/linux/tc_act/tc_skbmod.h
index 38c072f66f2f..c525b3503797 100644
--- a/include/uapi/linux/tc_act/tc_skbmod.h
+++ b/include/uapi/linux/tc_act/tc_skbmod.h
@@ -13,8 +13,6 @@
 
 #include <linux/pkt_cls.h>
 
-#define TCA_ACT_SKBMOD 15
-
 #define SKBMOD_F_DMAC	0x1
 #define SKBMOD_F_SMAC	0x2
 #define SKBMOD_F_ETYPE	0x4
diff --git a/include/uapi/linux/tc_act/tc_tunnel_key.h b/include/uapi/linux/tc_act/tc_tunnel_key.h
index be384d63e1b5..41c8b462c177 100644
--- a/include/uapi/linux/tc_act/tc_tunnel_key.h
+++ b/include/uapi/linux/tc_act/tc_tunnel_key.h
@@ -14,8 +14,6 @@
 
 #include <linux/pkt_cls.h>
 
-#define TCA_ACT_TUNNEL_KEY 17
-
 #define TCA_TUNNEL_KEY_ACT_SET	    1
 #define TCA_TUNNEL_KEY_ACT_RELEASE  2
 
diff --git a/include/uapi/linux/tc_act/tc_vlan.h b/include/uapi/linux/tc_act/tc_vlan.h
index 0d7b5fd6605b..168995b54a70 100644
--- a/include/uapi/linux/tc_act/tc_vlan.h
+++ b/include/uapi/linux/tc_act/tc_vlan.h
@@ -13,8 +13,6 @@
 
 #include <linux/pkt_cls.h>
 
-#define TCA_ACT_VLAN 12
-
 #define TCA_VLAN_ACT_POP	1
 #define TCA_VLAN_ACT_PUSH	2
 #define TCA_VLAN_ACT_MODIFY	3
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 902957beceb3..b2b16d440154 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -19,8 +19,6 @@
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
 
-#define TCA_ACT_SIMP 22
-
 #include <linux/tc_act/tc_defact.h>
 #include <net/tc_act/tc_defact.h>
 
diff --git a/tools/include/uapi/linux/tc_act/tc_bpf.h b/tools/include/uapi/linux/tc_act/tc_bpf.h
index 6e89a5df49a4..653c4f94f76e 100644
--- a/tools/include/uapi/linux/tc_act/tc_bpf.h
+++ b/tools/include/uapi/linux/tc_act/tc_bpf.h
@@ -13,8 +13,6 @@
 
 #include <linux/pkt_cls.h>
 
-#define TCA_ACT_BPF 13
-
 struct tc_act_bpf {
 	tc_gen;
 };
-- 
cgit v1.2.3-59-g8ed1b


From eddd2cf195d6fb5e4bbc91a0fe4be55110f559ab Mon Sep 17 00:00:00 2001
From: Eli Cohen <eli@mellanox.com>
Date: Sun, 10 Feb 2019 14:25:00 +0200
Subject: net: Change TCA_ACT_* to TCA_ID_* to match that of TCA_ID_POLICE

Modify the kernel users of the TCA_ACT_* macros to use TCA_ID_*. For
example, use TCA_ID_GACT instead of TCA_ACT_GACT. This will align with
TCA_ID_POLICE and also differentiates these identifier, used in struct
tc_action_ops type field, from other macros starting with TCA_ACT_.

To make things clearer, we name the enum defining the TCA_ID_*
identifiers and also change the "type" field of struct tc_action to
id.

Signed-off-by: Eli Cohen <eli@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h              | 2 +-
 include/net/tc_act/tc_csum.h       | 2 +-
 include/net/tc_act/tc_gact.h       | 2 +-
 include/net/tc_act/tc_mirred.h     | 4 ++--
 include/net/tc_act/tc_pedit.h      | 2 +-
 include/net/tc_act/tc_sample.h     | 2 +-
 include/net/tc_act/tc_skbedit.h    | 2 +-
 include/net/tc_act/tc_tunnel_key.h | 4 ++--
 include/net/tc_act/tc_vlan.h       | 2 +-
 include/uapi/linux/pkt_cls.h       | 2 +-
 net/sched/act_api.c                | 2 +-
 net/sched/act_bpf.c                | 2 +-
 net/sched/act_connmark.c           | 2 +-
 net/sched/act_csum.c               | 2 +-
 net/sched/act_gact.c               | 2 +-
 net/sched/act_ife.c                | 2 +-
 net/sched/act_ipt.c                | 4 ++--
 net/sched/act_mirred.c             | 2 +-
 net/sched/act_nat.c                | 2 +-
 net/sched/act_pedit.c              | 2 +-
 net/sched/act_police.c             | 2 +-
 net/sched/act_sample.c             | 2 +-
 net/sched/act_simple.c             | 2 +-
 net/sched/act_skbedit.c            | 2 +-
 net/sched/act_skbmod.c             | 2 +-
 net/sched/act_tunnel_key.c         | 2 +-
 net/sched/act_vlan.c               | 2 +-
 27 files changed, 30 insertions(+), 30 deletions(-)

(limited to 'include/uapi')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index dbc795ec659e..c745e9ccfab2 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -80,7 +80,7 @@ static inline void tcf_tm_dump(struct tcf_t *dtm, const struct tcf_t *stm)
 struct tc_action_ops {
 	struct list_head head;
 	char    kind[IFNAMSIZ];
-	__u32   type; /* TBD to match kind */
+	enum tca_id  id; /* identifier should match kind */
 	size_t	size;
 	struct module		*owner;
 	int     (*act)(struct sk_buff *, const struct tc_action *,
diff --git a/include/net/tc_act/tc_csum.h b/include/net/tc_act/tc_csum.h
index 32d2454c0479..68269e4581b7 100644
--- a/include/net/tc_act/tc_csum.h
+++ b/include/net/tc_act/tc_csum.h
@@ -21,7 +21,7 @@ struct tcf_csum {
 static inline bool is_tcf_csum(const struct tc_action *a)
 {
 #ifdef CONFIG_NET_CLS_ACT
-	if (a->ops && a->ops->type == TCA_ACT_CSUM)
+	if (a->ops && a->ops->id == TCA_ID_CSUM)
 		return true;
 #endif
 	return false;
diff --git a/include/net/tc_act/tc_gact.h b/include/net/tc_act/tc_gact.h
index ef8dd0db70ce..ee8d005f56fc 100644
--- a/include/net/tc_act/tc_gact.h
+++ b/include/net/tc_act/tc_gact.h
@@ -22,7 +22,7 @@ static inline bool __is_tcf_gact_act(const struct tc_action *a, int act,
 #ifdef CONFIG_NET_CLS_ACT
 	struct tcf_gact *gact;
 
-	if (a->ops && a->ops->type != TCA_ACT_GACT)
+	if (a->ops && a->ops->id != TCA_ID_GACT)
 		return false;
 
 	gact = to_gact(a);
diff --git a/include/net/tc_act/tc_mirred.h b/include/net/tc_act/tc_mirred.h
index a2e9cbca5c9e..c757585a05b0 100644
--- a/include/net/tc_act/tc_mirred.h
+++ b/include/net/tc_act/tc_mirred.h
@@ -17,7 +17,7 @@ struct tcf_mirred {
 static inline bool is_tcf_mirred_egress_redirect(const struct tc_action *a)
 {
 #ifdef CONFIG_NET_CLS_ACT
-	if (a->ops && a->ops->type == TCA_ACT_MIRRED)
+	if (a->ops && a->ops->id == TCA_ID_MIRRED)
 		return to_mirred(a)->tcfm_eaction == TCA_EGRESS_REDIR;
 #endif
 	return false;
@@ -26,7 +26,7 @@ static inline bool is_tcf_mirred_egress_redirect(const struct tc_action *a)
 static inline bool is_tcf_mirred_egress_mirror(const struct tc_action *a)
 {
 #ifdef CONFIG_NET_CLS_ACT
-	if (a->ops && a->ops->type == TCA_ACT_MIRRED)
+	if (a->ops && a->ops->id == TCA_ID_MIRRED)
 		return to_mirred(a)->tcfm_eaction == TCA_EGRESS_MIRROR;
 #endif
 	return false;
diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h
index fac3ad4a86de..748cf87a4d7e 100644
--- a/include/net/tc_act/tc_pedit.h
+++ b/include/net/tc_act/tc_pedit.h
@@ -23,7 +23,7 @@ struct tcf_pedit {
 static inline bool is_tcf_pedit(const struct tc_action *a)
 {
 #ifdef CONFIG_NET_CLS_ACT
-	if (a->ops && a->ops->type == TCA_ACT_PEDIT)
+	if (a->ops && a->ops->id == TCA_ID_PEDIT)
 		return true;
 #endif
 	return false;
diff --git a/include/net/tc_act/tc_sample.h b/include/net/tc_act/tc_sample.h
index 01dbfea32672..0a559d4b6f0f 100644
--- a/include/net/tc_act/tc_sample.h
+++ b/include/net/tc_act/tc_sample.h
@@ -20,7 +20,7 @@ struct tcf_sample {
 static inline bool is_tcf_sample(const struct tc_action *a)
 {
 #ifdef CONFIG_NET_CLS_ACT
-	return a->ops && a->ops->type == TCA_ACT_SAMPLE;
+	return a->ops && a->ops->id == TCA_ID_SAMPLE;
 #else
 	return false;
 #endif
diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h
index 911bbac838a2..85c5c4756d92 100644
--- a/include/net/tc_act/tc_skbedit.h
+++ b/include/net/tc_act/tc_skbedit.h
@@ -44,7 +44,7 @@ static inline bool is_tcf_skbedit_mark(const struct tc_action *a)
 #ifdef CONFIG_NET_CLS_ACT
 	u32 flags;
 
-	if (a->ops && a->ops->type == TCA_ACT_SKBEDIT) {
+	if (a->ops && a->ops->id == TCA_ID_SKBEDIT) {
 		rcu_read_lock();
 		flags = rcu_dereference(to_skbedit(a)->params)->flags;
 		rcu_read_unlock();
diff --git a/include/net/tc_act/tc_tunnel_key.h b/include/net/tc_act/tc_tunnel_key.h
index 46b8c7f1c8d5..23d5b8b19f3e 100644
--- a/include/net/tc_act/tc_tunnel_key.h
+++ b/include/net/tc_act/tc_tunnel_key.h
@@ -34,7 +34,7 @@ static inline bool is_tcf_tunnel_set(const struct tc_action *a)
 	struct tcf_tunnel_key *t = to_tunnel_key(a);
 	struct tcf_tunnel_key_params *params = rtnl_dereference(t->params);
 
-	if (a->ops && a->ops->type == TCA_ACT_TUNNEL_KEY)
+	if (a->ops && a->ops->id == TCA_ID_TUNNEL_KEY)
 		return params->tcft_action == TCA_TUNNEL_KEY_ACT_SET;
 #endif
 	return false;
@@ -46,7 +46,7 @@ static inline bool is_tcf_tunnel_release(const struct tc_action *a)
 	struct tcf_tunnel_key *t = to_tunnel_key(a);
 	struct tcf_tunnel_key_params *params = rtnl_dereference(t->params);
 
-	if (a->ops && a->ops->type == TCA_ACT_TUNNEL_KEY)
+	if (a->ops && a->ops->id == TCA_ID_TUNNEL_KEY)
 		return params->tcft_action == TCA_TUNNEL_KEY_ACT_RELEASE;
 #endif
 	return false;
diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h
index 22ae260d6869..fe39ed502bef 100644
--- a/include/net/tc_act/tc_vlan.h
+++ b/include/net/tc_act/tc_vlan.h
@@ -30,7 +30,7 @@ struct tcf_vlan {
 static inline bool is_tcf_vlan(const struct tc_action *a)
 {
 #ifdef CONFIG_NET_CLS_ACT
-	if (a->ops && a->ops->type == TCA_ACT_VLAN)
+	if (a->ops && a->ops->id == TCA_ID_VLAN)
 		return true;
 #endif
 	return false;
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 7ab55f97e7c4..51a0496f78ea 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -85,7 +85,7 @@ enum {
 #define TCA_ACT_SAMPLE 26
 
 /* Action type identifiers*/
-enum {
+enum tca_id {
 	TCA_ID_UNSPEC = 0,
 	TCA_ID_POLICE = 1,
 	TCA_ID_GACT = TCA_ACT_GACT,
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index d4b8355737d8..aecf1bf233c8 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -543,7 +543,7 @@ int tcf_register_action(struct tc_action_ops *act,
 
 	write_lock(&act_mod_lock);
 	list_for_each_entry(a, &act_base, head) {
-		if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) {
+		if (act->id == a->id || (strcmp(act->kind, a->kind) == 0)) {
 			write_unlock(&act_mod_lock);
 			unregister_pernet_subsys(ops);
 			return -EEXIST;
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index c7633843e223..aa5c38d11a30 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -396,7 +396,7 @@ static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index)
 
 static struct tc_action_ops act_bpf_ops __read_mostly = {
 	.kind		=	"bpf",
-	.type		=	TCA_ACT_BPF,
+	.id		=	TCA_ID_BPF,
 	.owner		=	THIS_MODULE,
 	.act		=	tcf_bpf_act,
 	.dump		=	tcf_bpf_dump,
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index 8475913f2070..5d24993cccfe 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -204,7 +204,7 @@ static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index)
 
 static struct tc_action_ops act_connmark_ops = {
 	.kind		=	"connmark",
-	.type		=	TCA_ACT_CONNMARK,
+	.id		=	TCA_ID_CONNMARK,
 	.owner		=	THIS_MODULE,
 	.act		=	tcf_connmark_act,
 	.dump		=	tcf_connmark_dump,
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 3dc25b7806d7..945fb34ae721 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -660,7 +660,7 @@ static size_t tcf_csum_get_fill_size(const struct tc_action *act)
 
 static struct tc_action_ops act_csum_ops = {
 	.kind		= "csum",
-	.type		= TCA_ACT_CSUM,
+	.id		= TCA_ID_CSUM,
 	.owner		= THIS_MODULE,
 	.act		= tcf_csum_act,
 	.dump		= tcf_csum_dump,
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index b61c20ebb314..93da0004e9f4 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -253,7 +253,7 @@ static size_t tcf_gact_get_fill_size(const struct tc_action *act)
 
 static struct tc_action_ops act_gact_ops = {
 	.kind		=	"gact",
-	.type		=	TCA_ACT_GACT,
+	.id		=	TCA_ID_GACT,
 	.owner		=	THIS_MODULE,
 	.act		=	tcf_gact_act,
 	.stats_update	=	tcf_gact_stats_update,
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 30b63fa23ee2..9b1f2b3990ee 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -864,7 +864,7 @@ static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index)
 
 static struct tc_action_ops act_ife_ops = {
 	.kind = "ife",
-	.type = TCA_ACT_IFE,
+	.id = TCA_ID_IFE,
 	.owner = THIS_MODULE,
 	.act = tcf_ife_act,
 	.dump = tcf_ife_dump,
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 8af6c11d2482..1bad190710ad 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -338,7 +338,7 @@ static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index)
 
 static struct tc_action_ops act_ipt_ops = {
 	.kind		=	"ipt",
-	.type		=	TCA_ACT_IPT,
+	.id		=	TCA_ID_IPT,
 	.owner		=	THIS_MODULE,
 	.act		=	tcf_ipt_act,
 	.dump		=	tcf_ipt_dump,
@@ -387,7 +387,7 @@ static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index)
 
 static struct tc_action_ops act_xt_ops = {
 	.kind		=	"xt",
-	.type		=	TCA_ACT_XT,
+	.id		=	TCA_ID_XT,
 	.owner		=	THIS_MODULE,
 	.act		=	tcf_ipt_act,
 	.dump		=	tcf_ipt_dump,
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index c8cf4d10c435..6692fd054617 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -400,7 +400,7 @@ static void tcf_mirred_put_dev(struct net_device *dev)
 
 static struct tc_action_ops act_mirred_ops = {
 	.kind		=	"mirred",
-	.type		=	TCA_ACT_MIRRED,
+	.id		=	TCA_ID_MIRRED,
 	.owner		=	THIS_MODULE,
 	.act		=	tcf_mirred_act,
 	.stats_update	=	tcf_stats_update,
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index c5c1e23add77..543eab9193f1 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -304,7 +304,7 @@ static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index)
 
 static struct tc_action_ops act_nat_ops = {
 	.kind		=	"nat",
-	.type		=	TCA_ACT_NAT,
+	.id		=	TCA_ID_NAT,
 	.owner		=	THIS_MODULE,
 	.act		=	tcf_nat_act,
 	.dump		=	tcf_nat_dump,
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 3663d3b615a4..a80373878df7 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -470,7 +470,7 @@ static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index)
 
 static struct tc_action_ops act_pedit_ops = {
 	.kind		=	"pedit",
-	.type		=	TCA_ACT_PEDIT,
+	.id		=	TCA_ID_PEDIT,
 	.owner		=	THIS_MODULE,
 	.act		=	tcf_pedit_act,
 	.dump		=	tcf_pedit_dump,
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index ec8ec55e0fe8..8271a6263824 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -366,7 +366,7 @@ MODULE_LICENSE("GPL");
 
 static struct tc_action_ops act_police_ops = {
 	.kind		=	"police",
-	.type		=	TCA_ID_POLICE,
+	.id		=	TCA_ID_POLICE,
 	.owner		=	THIS_MODULE,
 	.act		=	tcf_police_act,
 	.dump		=	tcf_police_dump,
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 1a0c682fd734..203e399e5c85 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -233,7 +233,7 @@ static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index)
 
 static struct tc_action_ops act_sample_ops = {
 	.kind	  = "sample",
-	.type	  = TCA_ACT_SAMPLE,
+	.id	  = TCA_ID_SAMPLE,
 	.owner	  = THIS_MODULE,
 	.act	  = tcf_sample_act,
 	.dump	  = tcf_sample_dump,
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index b2b16d440154..d54cb608dbaf 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -195,7 +195,7 @@ static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index)
 
 static struct tc_action_ops act_simp_ops = {
 	.kind		=	"simple",
-	.type		=	TCA_ACT_SIMP,
+	.id		=	TCA_ID_SIMP,
 	.owner		=	THIS_MODULE,
 	.act		=	tcf_simp_act,
 	.dump		=	tcf_simp_dump,
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 64dba3708fce..39f8a67ea940 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -305,7 +305,7 @@ static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index)
 
 static struct tc_action_ops act_skbedit_ops = {
 	.kind		=	"skbedit",
-	.type		=	TCA_ACT_SKBEDIT,
+	.id		=	TCA_ID_SKBEDIT,
 	.owner		=	THIS_MODULE,
 	.act		=	tcf_skbedit_act,
 	.dump		=	tcf_skbedit_dump,
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index 59710a183bd3..7bac1d78e7a3 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -260,7 +260,7 @@ static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index)
 
 static struct tc_action_ops act_skbmod_ops = {
 	.kind		=	"skbmod",
-	.type		=	TCA_ACT_SKBMOD,
+	.id		=	TCA_ACT_SKBMOD,
 	.owner		=	THIS_MODULE,
 	.act		=	tcf_skbmod_act,
 	.dump		=	tcf_skbmod_dump,
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 8b43fe0130f7..9104b8e36482 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -563,7 +563,7 @@ static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index)
 
 static struct tc_action_ops act_tunnel_key_ops = {
 	.kind		=	"tunnel_key",
-	.type		=	TCA_ACT_TUNNEL_KEY,
+	.id		=	TCA_ID_TUNNEL_KEY,
 	.owner		=	THIS_MODULE,
 	.act		=	tunnel_key_act,
 	.dump		=	tunnel_key_dump,
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 93fdaf707313..ac0061599225 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -297,7 +297,7 @@ static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index)
 
 static struct tc_action_ops act_vlan_ops = {
 	.kind		=	"vlan",
-	.type		=	TCA_ACT_VLAN,
+	.id		=	TCA_ID_VLAN,
 	.owner		=	THIS_MODULE,
 	.act		=	tcf_vlan_act,
 	.dump		=	tcf_vlan_dump,
-- 
cgit v1.2.3-59-g8ed1b


From 46f8bc92758c6259bcf945e9216098661c1587cd Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Sat, 9 Feb 2019 23:22:20 -0800
Subject: bpf: Add a bpf_sock pointer to __sk_buff and a bpf_sk_fullsock helper

In kernel, it is common to check "skb->sk && sk_fullsock(skb->sk)"
before accessing the fields in sock.  For example, in __netdev_pick_tx:

static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
			    struct net_device *sb_dev)
{
	/* ... */

	struct sock *sk = skb->sk;

		if (queue_index != new_index && sk &&
		    sk_fullsock(sk) &&
		    rcu_access_pointer(sk->sk_dst_cache))
			sk_tx_queue_set(sk, new_index);

	/* ... */

	return queue_index;
}

This patch adds a "struct bpf_sock *sk" pointer to the "struct __sk_buff"
where a few of the convert_ctx_access() in filter.c has already been
accessing the skb->sk sock_common's fields,
e.g. sock_ops_convert_ctx_access().

"__sk_buff->sk" is a PTR_TO_SOCK_COMMON_OR_NULL in the verifier.
Some of the fileds in "bpf_sock" will not be directly
accessible through the "__sk_buff->sk" pointer.  It is limited
by the new "bpf_sock_common_is_valid_access()".
e.g. The existing "type", "protocol", "mark" and "priority" in bpf_sock
     are not allowed.

The newly added "struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk)"
can be used to get a sk with all accessible fields in "bpf_sock".
This helper is added to both cg_skb and sched_(cls|act).

int cg_skb_foo(struct __sk_buff *skb) {
	struct bpf_sock *sk;

	sk = skb->sk;
	if (!sk)
		return 1;

	sk = bpf_sk_fullsock(sk);
	if (!sk)
		return 1;

	if (sk->family != AF_INET6 || sk->protocol != IPPROTO_TCP)
		return 1;

	/* some_traffic_shaping(); */

	return 1;
}

(1) The sk is read only

(2) There is no new "struct bpf_sock_common" introduced.

(3) Future kernel sock's members could be added to bpf_sock only
    instead of repeatedly adding at multiple places like currently
    in bpf_sock_ops_md, bpf_sock_addr_md, sk_reuseport_md...etc.

(4) After "sk = skb->sk", the reg holding sk is in type
    PTR_TO_SOCK_COMMON_OR_NULL.

(5) After bpf_sk_fullsock(), the return type will be in type
    PTR_TO_SOCKET_OR_NULL which is the same as the return type of
    bpf_sk_lookup_xxx().

    However, bpf_sk_fullsock() does not take refcnt.  The
    acquire_reference_state() is only depending on the return type now.
    To avoid it, a new is_acquire_function() is checked before calling
    acquire_reference_state().

(6) The WARN_ON in "release_reference_state()" is no longer an
    internal verifier bug.

    When reg->id is not found in state->refs[], it means the
    bpf_prog does something wrong like
    "bpf_sk_release(bpf_sk_fullsock(skb->sk))" where reference has
    never been acquired by calling "bpf_sk_fullsock(skb->sk)".

    A -EINVAL and a verbose are done instead of WARN_ON.  A test is
    added to the test_verifier in a later patch.

    Since the WARN_ON in "release_reference_state()" is no longer
    needed, "__release_reference_state()" is folded into
    "release_reference_state()" also.

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h      |  12 +++++
 include/uapi/linux/bpf.h |  12 ++++-
 kernel/bpf/verifier.c    | 132 +++++++++++++++++++++++++++++++++--------------
 net/core/filter.c        |  42 +++++++++++++++
 4 files changed, 157 insertions(+), 41 deletions(-)

(limited to 'include/uapi')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index bd169a7bcc93..a60463b45b54 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -194,6 +194,7 @@ enum bpf_arg_type {
 	ARG_ANYTHING,		/* any (initialized) argument is ok */
 	ARG_PTR_TO_SOCKET,	/* pointer to bpf_sock */
 	ARG_PTR_TO_SPIN_LOCK,	/* pointer to bpf_spin_lock */
+	ARG_PTR_TO_SOCK_COMMON,	/* pointer to sock_common */
 };
 
 /* type of values returned from helper functions */
@@ -256,6 +257,8 @@ enum bpf_reg_type {
 	PTR_TO_FLOW_KEYS,	 /* reg points to bpf_flow_keys */
 	PTR_TO_SOCKET,		 /* reg points to struct bpf_sock */
 	PTR_TO_SOCKET_OR_NULL,	 /* reg points to struct bpf_sock or NULL */
+	PTR_TO_SOCK_COMMON,	 /* reg points to sock_common */
+	PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */
 };
 
 /* The information passed from prog-specific *_is_valid_access
@@ -920,6 +923,9 @@ void bpf_user_rnd_init_once(void);
 u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
 #if defined(CONFIG_NET)
+bool bpf_sock_common_is_valid_access(int off, int size,
+				     enum bpf_access_type type,
+				     struct bpf_insn_access_aux *info);
 bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
 			      struct bpf_insn_access_aux *info);
 u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
@@ -928,6 +934,12 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
 				struct bpf_prog *prog,
 				u32 *target_size);
 #else
+static inline bool bpf_sock_common_is_valid_access(int off, int size,
+						   enum bpf_access_type type,
+						   struct bpf_insn_access_aux *info)
+{
+	return false;
+}
 static inline bool bpf_sock_is_valid_access(int off, int size,
 					    enum bpf_access_type type,
 					    struct bpf_insn_access_aux *info)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1777fa0c61e4..5d79cba74ddc 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2329,6 +2329,14 @@ union bpf_attr {
  *		"**y**".
  *	Return
  *		0
+ *
+ * struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk)
+ *	Description
+ *		This helper gets a **struct bpf_sock** pointer such
+ *		that all the fields in bpf_sock can be accessed.
+ *	Return
+ *		A **struct bpf_sock** pointer on success, or NULL in
+ *		case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2425,7 +2433,8 @@ union bpf_attr {
 	FN(msg_pop_data),		\
 	FN(rc_pointer_rel),		\
 	FN(spin_lock),			\
-	FN(spin_unlock),
+	FN(spin_unlock),		\
+	FN(sk_fullsock),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2545,6 +2554,7 @@ struct __sk_buff {
 	__u64 tstamp;
 	__u32 wire_len;
 	__u32 gso_segs;
+	__bpf_md_ptr(struct bpf_sock *, sk);
 };
 
 struct bpf_tunnel_key {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 516dfc6d78de..b755d55a3791 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -331,10 +331,17 @@ static bool type_is_pkt_pointer(enum bpf_reg_type type)
 	       type == PTR_TO_PACKET_META;
 }
 
+static bool type_is_sk_pointer(enum bpf_reg_type type)
+{
+	return type == PTR_TO_SOCKET ||
+		type == PTR_TO_SOCK_COMMON;
+}
+
 static bool reg_type_may_be_null(enum bpf_reg_type type)
 {
 	return type == PTR_TO_MAP_VALUE_OR_NULL ||
-	       type == PTR_TO_SOCKET_OR_NULL;
+	       type == PTR_TO_SOCKET_OR_NULL ||
+	       type == PTR_TO_SOCK_COMMON_OR_NULL;
 }
 
 static bool type_is_refcounted(enum bpf_reg_type type)
@@ -377,6 +384,12 @@ static bool is_release_function(enum bpf_func_id func_id)
 	return func_id == BPF_FUNC_sk_release;
 }
 
+static bool is_acquire_function(enum bpf_func_id func_id)
+{
+	return func_id == BPF_FUNC_sk_lookup_tcp ||
+		func_id == BPF_FUNC_sk_lookup_udp;
+}
+
 /* string representation of 'enum bpf_reg_type' */
 static const char * const reg_type_str[] = {
 	[NOT_INIT]		= "?",
@@ -392,6 +405,8 @@ static const char * const reg_type_str[] = {
 	[PTR_TO_FLOW_KEYS]	= "flow_keys",
 	[PTR_TO_SOCKET]		= "sock",
 	[PTR_TO_SOCKET_OR_NULL] = "sock_or_null",
+	[PTR_TO_SOCK_COMMON]	= "sock_common",
+	[PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null",
 };
 
 static char slot_type_char[] = {
@@ -618,13 +633,10 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
 }
 
 /* release function corresponding to acquire_reference_state(). Idempotent. */
-static int __release_reference_state(struct bpf_func_state *state, int ptr_id)
+static int release_reference_state(struct bpf_func_state *state, int ptr_id)
 {
 	int i, last_idx;
 
-	if (!ptr_id)
-		return -EFAULT;
-
 	last_idx = state->acquired_refs - 1;
 	for (i = 0; i < state->acquired_refs; i++) {
 		if (state->refs[i].id == ptr_id) {
@@ -636,21 +648,7 @@ static int __release_reference_state(struct bpf_func_state *state, int ptr_id)
 			return 0;
 		}
 	}
-	return -EFAULT;
-}
-
-/* variation on the above for cases where we expect that there must be an
- * outstanding reference for the specified ptr_id.
- */
-static int release_reference_state(struct bpf_verifier_env *env, int ptr_id)
-{
-	struct bpf_func_state *state = cur_func(env);
-	int err;
-
-	err = __release_reference_state(state, ptr_id);
-	if (WARN_ON_ONCE(err != 0))
-		verbose(env, "verifier internal error: can't release reference\n");
-	return err;
+	return -EINVAL;
 }
 
 static int transfer_reference_state(struct bpf_func_state *dst,
@@ -1209,6 +1207,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 	case CONST_PTR_TO_MAP:
 	case PTR_TO_SOCKET:
 	case PTR_TO_SOCKET_OR_NULL:
+	case PTR_TO_SOCK_COMMON:
+	case PTR_TO_SOCK_COMMON_OR_NULL:
 		return true;
 	default:
 		return false;
@@ -1647,6 +1647,7 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
 	struct bpf_reg_state *regs = cur_regs(env);
 	struct bpf_reg_state *reg = &regs[regno];
 	struct bpf_insn_access_aux info = {};
+	bool valid;
 
 	if (reg->smin_value < 0) {
 		verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
@@ -1654,15 +1655,28 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
 		return -EACCES;
 	}
 
-	if (!bpf_sock_is_valid_access(off, size, t, &info)) {
-		verbose(env, "invalid bpf_sock access off=%d size=%d\n",
-			off, size);
-		return -EACCES;
+	switch (reg->type) {
+	case PTR_TO_SOCK_COMMON:
+		valid = bpf_sock_common_is_valid_access(off, size, t, &info);
+		break;
+	case PTR_TO_SOCKET:
+		valid = bpf_sock_is_valid_access(off, size, t, &info);
+		break;
+	default:
+		valid = false;
 	}
 
-	env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
 
-	return 0;
+	if (valid) {
+		env->insn_aux_data[insn_idx].ctx_field_size =
+			info.ctx_field_size;
+		return 0;
+	}
+
+	verbose(env, "R%d invalid %s access off=%d size=%d\n",
+		regno, reg_type_str[reg->type], off, size);
+
+	return -EACCES;
 }
 
 static bool __is_pointer_value(bool allow_ptr_leaks,
@@ -1688,8 +1702,14 @@ static bool is_ctx_reg(struct bpf_verifier_env *env, int regno)
 {
 	const struct bpf_reg_state *reg = reg_state(env, regno);
 
-	return reg->type == PTR_TO_CTX ||
-	       reg->type == PTR_TO_SOCKET;
+	return reg->type == PTR_TO_CTX;
+}
+
+static bool is_sk_reg(struct bpf_verifier_env *env, int regno)
+{
+	const struct bpf_reg_state *reg = reg_state(env, regno);
+
+	return type_is_sk_pointer(reg->type);
 }
 
 static bool is_pkt_reg(struct bpf_verifier_env *env, int regno)
@@ -1800,6 +1820,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 	case PTR_TO_SOCKET:
 		pointer_desc = "sock ";
 		break;
+	case PTR_TO_SOCK_COMMON:
+		pointer_desc = "sock_common ";
+		break;
 	default:
 		break;
 	}
@@ -2003,11 +2026,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 			 * PTR_TO_PACKET[_META,_END]. In the latter
 			 * case, we know the offset is zero.
 			 */
-			if (reg_type == SCALAR_VALUE)
+			if (reg_type == SCALAR_VALUE) {
 				mark_reg_unknown(env, regs, value_regno);
-			else
+			} else {
 				mark_reg_known_zero(env, regs,
 						    value_regno);
+				if (reg_type_may_be_null(reg_type))
+					regs[value_regno].id = ++env->id_gen;
+			}
 			regs[value_regno].type = reg_type;
 		}
 
@@ -2053,9 +2079,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		err = check_flow_keys_access(env, off, size);
 		if (!err && t == BPF_READ && value_regno >= 0)
 			mark_reg_unknown(env, regs, value_regno);
-	} else if (reg->type == PTR_TO_SOCKET) {
+	} else if (type_is_sk_pointer(reg->type)) {
 		if (t == BPF_WRITE) {
-			verbose(env, "cannot write into socket\n");
+			verbose(env, "R%d cannot write into %s\n",
+				regno, reg_type_str[reg->type]);
 			return -EACCES;
 		}
 		err = check_sock_access(env, insn_idx, regno, off, size, t);
@@ -2102,7 +2129,8 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
 
 	if (is_ctx_reg(env, insn->dst_reg) ||
 	    is_pkt_reg(env, insn->dst_reg) ||
-	    is_flow_key_reg(env, insn->dst_reg)) {
+	    is_flow_key_reg(env, insn->dst_reg) ||
+	    is_sk_reg(env, insn->dst_reg)) {
 		verbose(env, "BPF_XADD stores into R%d %s is not allowed\n",
 			insn->dst_reg,
 			reg_type_str[reg_state(env, insn->dst_reg)->type]);
@@ -2369,6 +2397,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		err = check_ctx_reg(env, reg, regno);
 		if (err < 0)
 			return err;
+	} else if (arg_type == ARG_PTR_TO_SOCK_COMMON) {
+		expected_type = PTR_TO_SOCK_COMMON;
+		/* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */
+		if (!type_is_sk_pointer(type))
+			goto err_type;
 	} else if (arg_type == ARG_PTR_TO_SOCKET) {
 		expected_type = PTR_TO_SOCKET;
 		if (type != expected_type)
@@ -2783,7 +2816,7 @@ static int release_reference(struct bpf_verifier_env *env,
 	for (i = 0; i <= vstate->curframe; i++)
 		release_reg_references(env, vstate->frame[i], meta->ptr_id);
 
-	return release_reference_state(env, meta->ptr_id);
+	return release_reference_state(cur_func(env), meta->ptr_id);
 }
 
 static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
@@ -3049,8 +3082,11 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 		}
 	} else if (is_release_function(func_id)) {
 		err = release_reference(env, &meta);
-		if (err)
+		if (err) {
+			verbose(env, "func %s#%d reference has not been acquired before\n",
+				func_id_name(func_id), func_id);
 			return err;
+		}
 	}
 
 	regs = cur_regs(env);
@@ -3099,12 +3135,19 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 			regs[BPF_REG_0].id = ++env->id_gen;
 		}
 	} else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) {
-		int id = acquire_reference_state(env, insn_idx);
-		if (id < 0)
-			return id;
 		mark_reg_known_zero(env, regs, BPF_REG_0);
 		regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL;
-		regs[BPF_REG_0].id = id;
+		if (is_acquire_function(func_id)) {
+			int id = acquire_reference_state(env, insn_idx);
+
+			if (id < 0)
+				return id;
+			/* For release_reference() */
+			regs[BPF_REG_0].id = id;
+		} else {
+			/* For mark_ptr_or_null_reg() */
+			regs[BPF_REG_0].id = ++env->id_gen;
+		}
 	} else {
 		verbose(env, "unknown return type %d of func %s#%d\n",
 			fn->ret_type, func_id_name(func_id), func_id);
@@ -3364,6 +3407,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 	case PTR_TO_PACKET_END:
 	case PTR_TO_SOCKET:
 	case PTR_TO_SOCKET_OR_NULL:
+	case PTR_TO_SOCK_COMMON:
+	case PTR_TO_SOCK_COMMON_OR_NULL:
 		verbose(env, "R%d pointer arithmetic on %s prohibited\n",
 			dst, reg_type_str[ptr_reg->type]);
 		return -EACCES;
@@ -4597,6 +4642,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
 			}
 		} else if (reg->type == PTR_TO_SOCKET_OR_NULL) {
 			reg->type = PTR_TO_SOCKET;
+		} else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) {
+			reg->type = PTR_TO_SOCK_COMMON;
 		}
 		if (is_null || !(reg_is_refcounted(reg) ||
 				 reg_may_point_to_spin_lock(reg))) {
@@ -4621,7 +4668,7 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
 	int i, j;
 
 	if (reg_is_refcounted_or_null(&regs[regno]) && is_null)
-		__release_reference_state(state, id);
+		release_reference_state(state, id);
 
 	for (i = 0; i < MAX_BPF_REG; i++)
 		mark_ptr_or_null_reg(state, &regs[i], id, is_null);
@@ -5790,6 +5837,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 	case PTR_TO_FLOW_KEYS:
 	case PTR_TO_SOCKET:
 	case PTR_TO_SOCKET_OR_NULL:
+	case PTR_TO_SOCK_COMMON:
+	case PTR_TO_SOCK_COMMON_OR_NULL:
 		/* Only valid matches are exact, which memcmp() above
 		 * would have accepted
 		 */
@@ -6110,6 +6159,8 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type)
 	case PTR_TO_CTX:
 	case PTR_TO_SOCKET:
 	case PTR_TO_SOCKET_OR_NULL:
+	case PTR_TO_SOCK_COMMON:
+	case PTR_TO_SOCK_COMMON_OR_NULL:
 		return false;
 	default:
 		return true;
@@ -7112,6 +7163,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 			convert_ctx_access = ops->convert_ctx_access;
 			break;
 		case PTR_TO_SOCKET:
+		case PTR_TO_SOCK_COMMON:
 			convert_ctx_access = bpf_sock_convert_ctx_access;
 			break;
 		default:
diff --git a/net/core/filter.c b/net/core/filter.c
index 3a49f68eda10..401d2e0aebf8 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1793,6 +1793,20 @@ static const struct bpf_func_proto bpf_skb_pull_data_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_1(bpf_sk_fullsock, struct sock *, sk)
+{
+	sk = sk_to_full_sk(sk);
+
+	return sk_fullsock(sk) ? (unsigned long)sk : (unsigned long)NULL;
+}
+
+static const struct bpf_func_proto bpf_sk_fullsock_proto = {
+	.func		= bpf_sk_fullsock,
+	.gpl_only	= false,
+	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,
+	.arg1_type	= ARG_PTR_TO_SOCK_COMMON,
+};
+
 static inline int sk_skb_try_make_writable(struct sk_buff *skb,
 					   unsigned int write_len)
 {
@@ -5406,6 +5420,8 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	switch (func_id) {
 	case BPF_FUNC_get_local_storage:
 		return &bpf_get_local_storage_proto;
+	case BPF_FUNC_sk_fullsock:
+		return &bpf_sk_fullsock_proto;
 	default:
 		return sk_filter_func_proto(func_id, prog);
 	}
@@ -5477,6 +5493,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_socket_uid_proto;
 	case BPF_FUNC_fib_lookup:
 		return &bpf_skb_fib_lookup_proto;
+	case BPF_FUNC_sk_fullsock:
+		return &bpf_sk_fullsock_proto;
 #ifdef CONFIG_XFRM
 	case BPF_FUNC_skb_get_xfrm_state:
 		return &bpf_skb_get_xfrm_state_proto;
@@ -5764,6 +5782,11 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
 		if (size != sizeof(__u64))
 			return false;
 		break;
+	case offsetof(struct __sk_buff, sk):
+		if (type == BPF_WRITE || size != sizeof(__u64))
+			return false;
+		info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
+		break;
 	default:
 		/* Only narrow read access allowed for now. */
 		if (type == BPF_WRITE) {
@@ -5950,6 +5973,18 @@ static bool __sock_filter_check_size(int off, int size,
 	return size == size_default;
 }
 
+bool bpf_sock_common_is_valid_access(int off, int size,
+				     enum bpf_access_type type,
+				     struct bpf_insn_access_aux *info)
+{
+	switch (off) {
+	case bpf_ctx_range_till(struct bpf_sock, type, priority):
+		return false;
+	default:
+		return bpf_sock_is_valid_access(off, size, type, info);
+	}
+}
+
 bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
 			      struct bpf_insn_access_aux *info)
 {
@@ -6748,6 +6783,13 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
 		off += offsetof(struct qdisc_skb_cb, pkt_len);
 		*target_size = 4;
 		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off);
+		break;
+
+	case offsetof(struct __sk_buff, sk):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct sk_buff, sk));
+		break;
 	}
 
 	return insn - insn_buf;
-- 
cgit v1.2.3-59-g8ed1b


From aa65d6960a98fc15a96ce361b26e9fd55c9bccc5 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Sat, 9 Feb 2019 23:22:21 -0800
Subject: bpf: Add state, dst_ip4, dst_ip6 and dst_port to bpf_sock

This patch adds "state", "dst_ip4", "dst_ip6" and "dst_port" to the
bpf_sock.  The userspace has already been using "state",
e.g. inet_diag (ss -t) and getsockopt(TCP_INFO).

This patch also allows narrow load on the following existing fields:
"family", "type", "protocol" and "src_port".  Unlike IP address,
the load offset is resticted to the first byte for them but it
can be relaxed later if there is a use case.

This patch also folds __sock_filter_check_size() into
bpf_sock_is_valid_access() since it is not called
by any where else.  All bpf_sock checking is in
one place.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 17 ++++-----
 net/core/filter.c        | 99 +++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 85 insertions(+), 31 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 5d79cba74ddc..d8f91777c5b6 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2606,15 +2606,14 @@ struct bpf_sock {
 	__u32 protocol;
 	__u32 mark;
 	__u32 priority;
-	__u32 src_ip4;		/* Allows 1,2,4-byte read.
-				 * Stored in network byte order.
-				 */
-	__u32 src_ip6[4];	/* Allows 1,2,4-byte read.
-				 * Stored in network byte order.
-				 */
-	__u32 src_port;		/* Allows 4-byte read.
-				 * Stored in host byte order
-				 */
+	/* IP address also allows 1 and 2 bytes access */
+	__u32 src_ip4;
+	__u32 src_ip6[4];
+	__u32 src_port;		/* host byte order */
+	__u32 dst_port;		/* network byte order */
+	__u32 dst_ip4;
+	__u32 dst_ip6[4];
+	__u32 state;
 };
 
 struct bpf_sock_tuple {
diff --git a/net/core/filter.c b/net/core/filter.c
index 401d2e0aebf8..01bb64bf2b5e 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5958,21 +5958,6 @@ full_access:
 	return true;
 }
 
-static bool __sock_filter_check_size(int off, int size,
-				     struct bpf_insn_access_aux *info)
-{
-	const int size_default = sizeof(__u32);
-
-	switch (off) {
-	case bpf_ctx_range(struct bpf_sock, src_ip4):
-	case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
-		bpf_ctx_record_field_size(info, size_default);
-		return bpf_ctx_narrow_access_ok(off, size, size_default);
-	}
-
-	return size == size_default;
-}
-
 bool bpf_sock_common_is_valid_access(int off, int size,
 				     enum bpf_access_type type,
 				     struct bpf_insn_access_aux *info)
@@ -5988,13 +5973,29 @@ bool bpf_sock_common_is_valid_access(int off, int size,
 bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
 			      struct bpf_insn_access_aux *info)
 {
+	const int size_default = sizeof(__u32);
+
 	if (off < 0 || off >= sizeof(struct bpf_sock))
 		return false;
 	if (off % size != 0)
 		return false;
-	if (!__sock_filter_check_size(off, size, info))
-		return false;
-	return true;
+
+	switch (off) {
+	case offsetof(struct bpf_sock, state):
+	case offsetof(struct bpf_sock, family):
+	case offsetof(struct bpf_sock, type):
+	case offsetof(struct bpf_sock, protocol):
+	case offsetof(struct bpf_sock, dst_port):
+	case offsetof(struct bpf_sock, src_port):
+	case bpf_ctx_range(struct bpf_sock, src_ip4):
+	case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
+	case bpf_ctx_range(struct bpf_sock, dst_ip4):
+	case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]):
+		bpf_ctx_record_field_size(info, size_default);
+		return bpf_ctx_narrow_access_ok(off, size, size_default);
+	}
+
+	return size == size_default;
 }
 
 static bool sock_filter_is_valid_access(int off, int size,
@@ -6838,24 +6839,32 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
 		break;
 
 	case offsetof(struct bpf_sock, family):
-		BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2);
-
-		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
-				      offsetof(struct sock, sk_family));
+		*insn++ = BPF_LDX_MEM(
+			BPF_FIELD_SIZEOF(struct sock_common, skc_family),
+			si->dst_reg, si->src_reg,
+			bpf_target_off(struct sock_common,
+				       skc_family,
+				       FIELD_SIZEOF(struct sock_common,
+						    skc_family),
+				       target_size));
 		break;
 
 	case offsetof(struct bpf_sock, type):
+		BUILD_BUG_ON(HWEIGHT32(SK_FL_TYPE_MASK) != BITS_PER_BYTE * 2);
 		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
 				      offsetof(struct sock, __sk_flags_offset));
 		*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK);
 		*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT);
+		*target_size = 2;
 		break;
 
 	case offsetof(struct bpf_sock, protocol):
+		BUILD_BUG_ON(HWEIGHT32(SK_FL_PROTO_MASK) != BITS_PER_BYTE);
 		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
 				      offsetof(struct sock, __sk_flags_offset));
 		*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK);
 		*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT);
+		*target_size = 1;
 		break;
 
 	case offsetof(struct bpf_sock, src_ip4):
@@ -6867,6 +6876,15 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
 				       target_size));
 		break;
 
+	case offsetof(struct bpf_sock, dst_ip4):
+		*insn++ = BPF_LDX_MEM(
+			BPF_SIZE(si->code), si->dst_reg, si->src_reg,
+			bpf_target_off(struct sock_common, skc_daddr,
+				       FIELD_SIZEOF(struct sock_common,
+						    skc_daddr),
+				       target_size));
+		break;
+
 	case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
 #if IS_ENABLED(CONFIG_IPV6)
 		off = si->off;
@@ -6885,6 +6903,23 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
 #endif
 		break;
 
+	case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+		off = si->off;
+		off -= offsetof(struct bpf_sock, dst_ip6[0]);
+		*insn++ = BPF_LDX_MEM(
+			BPF_SIZE(si->code), si->dst_reg, si->src_reg,
+			bpf_target_off(struct sock_common,
+				       skc_v6_daddr.s6_addr32[0],
+				       FIELD_SIZEOF(struct sock_common,
+						    skc_v6_daddr.s6_addr32[0]),
+				       target_size) + off);
+#else
+		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+		*target_size = 4;
+#endif
+		break;
+
 	case offsetof(struct bpf_sock, src_port):
 		*insn++ = BPF_LDX_MEM(
 			BPF_FIELD_SIZEOF(struct sock_common, skc_num),
@@ -6894,6 +6929,26 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
 						    skc_num),
 				       target_size));
 		break;
+
+	case offsetof(struct bpf_sock, dst_port):
+		*insn++ = BPF_LDX_MEM(
+			BPF_FIELD_SIZEOF(struct sock_common, skc_dport),
+			si->dst_reg, si->src_reg,
+			bpf_target_off(struct sock_common, skc_dport,
+				       FIELD_SIZEOF(struct sock_common,
+						    skc_dport),
+				       target_size));
+		break;
+
+	case offsetof(struct bpf_sock, state):
+		*insn++ = BPF_LDX_MEM(
+			BPF_FIELD_SIZEOF(struct sock_common, skc_state),
+			si->dst_reg, si->src_reg,
+			bpf_target_off(struct sock_common, skc_state,
+				       FIELD_SIZEOF(struct sock_common,
+						    skc_state),
+				       target_size));
+		break;
 	}
 
 	return insn - insn_buf;
-- 
cgit v1.2.3-59-g8ed1b


From 655a51e536c09d15ffa3603b1b6fce2b45b85a1f Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Sat, 9 Feb 2019 23:22:24 -0800
Subject: bpf: Add struct bpf_tcp_sock and BPF_FUNC_tcp_sock

This patch adds a helper function BPF_FUNC_tcp_sock and it
is currently available for cg_skb and sched_(cls|act):

struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk);

int cg_skb_foo(struct __sk_buff *skb) {
	struct bpf_tcp_sock *tp;
	struct bpf_sock *sk;
	__u32 snd_cwnd;

	sk = skb->sk;
	if (!sk)
		return 1;

	tp = bpf_tcp_sock(sk);
	if (!tp)
		return 1;

	snd_cwnd = tp->snd_cwnd;
	/* ... */

	return 1;
}

A 'struct bpf_tcp_sock' is also added to the uapi bpf.h to provide
read-only access.  bpf_tcp_sock has all the existing tcp_sock's fields
that has already been exposed by the bpf_sock_ops.
i.e. no new tcp_sock's fields are exposed in bpf.h.

This helper returns a pointer to the tcp_sock.  If it is not a tcp_sock
or it cannot be traced back to a tcp_sock by sk_to_full_sk(), it
returns NULL.  Hence, the caller needs to check for NULL before
accessing it.

The current use case is to expose members from tcp_sock
to allow a cg_skb_bpf_prog to provide per cgroup traffic
policing/shaping.

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h      | 30 ++++++++++++++++++
 include/uapi/linux/bpf.h | 51 ++++++++++++++++++++++++++++++-
 kernel/bpf/verifier.c    | 31 +++++++++++++++++--
 net/core/filter.c        | 79 ++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 188 insertions(+), 3 deletions(-)

(limited to 'include/uapi')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a60463b45b54..7f58828755fd 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -204,6 +204,7 @@ enum bpf_return_type {
 	RET_PTR_TO_MAP_VALUE,		/* returns a pointer to map elem value */
 	RET_PTR_TO_MAP_VALUE_OR_NULL,	/* returns a pointer to map elem value or NULL */
 	RET_PTR_TO_SOCKET_OR_NULL,	/* returns a pointer to a socket or NULL */
+	RET_PTR_TO_TCP_SOCK_OR_NULL,	/* returns a pointer to a tcp_sock or NULL */
 };
 
 /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs
@@ -259,6 +260,8 @@ enum bpf_reg_type {
 	PTR_TO_SOCKET_OR_NULL,	 /* reg points to struct bpf_sock or NULL */
 	PTR_TO_SOCK_COMMON,	 /* reg points to sock_common */
 	PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */
+	PTR_TO_TCP_SOCK,	 /* reg points to struct tcp_sock */
+	PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */
 };
 
 /* The information passed from prog-specific *_is_valid_access
@@ -956,4 +959,31 @@ static inline u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
 }
 #endif
 
+#ifdef CONFIG_INET
+bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
+				  struct bpf_insn_access_aux *info);
+
+u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
+				    const struct bpf_insn *si,
+				    struct bpf_insn *insn_buf,
+				    struct bpf_prog *prog,
+				    u32 *target_size);
+#else
+static inline bool bpf_tcp_sock_is_valid_access(int off, int size,
+						enum bpf_access_type type,
+						struct bpf_insn_access_aux *info)
+{
+	return false;
+}
+
+static inline u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
+						  const struct bpf_insn *si,
+						  struct bpf_insn *insn_buf,
+						  struct bpf_prog *prog,
+						  u32 *target_size)
+{
+	return 0;
+}
+#endif /* CONFIG_INET */
+
 #endif /* _LINUX_BPF_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d8f91777c5b6..25c8c0e62ecf 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2337,6 +2337,15 @@ union bpf_attr {
  *	Return
  *		A **struct bpf_sock** pointer on success, or NULL in
  *		case of failure.
+ *
+ * struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk)
+ *	Description
+ *		This helper gets a **struct bpf_tcp_sock** pointer from a
+ *		**struct bpf_sock** pointer.
+ *
+ *	Return
+ *		A **struct bpf_tcp_sock** pointer on success, or NULL in
+ *		case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2434,7 +2443,8 @@ union bpf_attr {
 	FN(rc_pointer_rel),		\
 	FN(spin_lock),			\
 	FN(spin_unlock),		\
-	FN(sk_fullsock),
+	FN(sk_fullsock),		\
+	FN(tcp_sock),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2616,6 +2626,45 @@ struct bpf_sock {
 	__u32 state;
 };
 
+struct bpf_tcp_sock {
+	__u32 snd_cwnd;		/* Sending congestion window		*/
+	__u32 srtt_us;		/* smoothed round trip time << 3 in usecs */
+	__u32 rtt_min;
+	__u32 snd_ssthresh;	/* Slow start size threshold		*/
+	__u32 rcv_nxt;		/* What we want to receive next		*/
+	__u32 snd_nxt;		/* Next sequence we send		*/
+	__u32 snd_una;		/* First byte we want an ack for	*/
+	__u32 mss_cache;	/* Cached effective mss, not including SACKS */
+	__u32 ecn_flags;	/* ECN status bits.			*/
+	__u32 rate_delivered;	/* saved rate sample: packets delivered */
+	__u32 rate_interval_us;	/* saved rate sample: time elapsed */
+	__u32 packets_out;	/* Packets which are "in flight"	*/
+	__u32 retrans_out;	/* Retransmitted packets out		*/
+	__u32 total_retrans;	/* Total retransmits for entire connection */
+	__u32 segs_in;		/* RFC4898 tcpEStatsPerfSegsIn
+				 * total number of segments in.
+				 */
+	__u32 data_segs_in;	/* RFC4898 tcpEStatsPerfDataSegsIn
+				 * total number of data segments in.
+				 */
+	__u32 segs_out;		/* RFC4898 tcpEStatsPerfSegsOut
+				 * The total number of segments sent.
+				 */
+	__u32 data_segs_out;	/* RFC4898 tcpEStatsPerfDataSegsOut
+				 * total number of data segments sent.
+				 */
+	__u32 lost_out;		/* Lost packets			*/
+	__u32 sacked_out;	/* SACK'd packets			*/
+	__u64 bytes_received;	/* RFC4898 tcpEStatsAppHCThruOctetsReceived
+				 * sum(delta(rcv_nxt)), or how many bytes
+				 * were acked.
+				 */
+	__u64 bytes_acked;	/* RFC4898 tcpEStatsAppHCThruOctetsAcked
+				 * sum(delta(snd_una)), or how many bytes
+				 * were acked.
+				 */
+};
+
 struct bpf_sock_tuple {
 	union {
 		struct {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b755d55a3791..1b9496c41383 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -334,14 +334,16 @@ static bool type_is_pkt_pointer(enum bpf_reg_type type)
 static bool type_is_sk_pointer(enum bpf_reg_type type)
 {
 	return type == PTR_TO_SOCKET ||
-		type == PTR_TO_SOCK_COMMON;
+		type == PTR_TO_SOCK_COMMON ||
+		type == PTR_TO_TCP_SOCK;
 }
 
 static bool reg_type_may_be_null(enum bpf_reg_type type)
 {
 	return type == PTR_TO_MAP_VALUE_OR_NULL ||
 	       type == PTR_TO_SOCKET_OR_NULL ||
-	       type == PTR_TO_SOCK_COMMON_OR_NULL;
+	       type == PTR_TO_SOCK_COMMON_OR_NULL ||
+	       type == PTR_TO_TCP_SOCK_OR_NULL;
 }
 
 static bool type_is_refcounted(enum bpf_reg_type type)
@@ -407,6 +409,8 @@ static const char * const reg_type_str[] = {
 	[PTR_TO_SOCKET_OR_NULL] = "sock_or_null",
 	[PTR_TO_SOCK_COMMON]	= "sock_common",
 	[PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null",
+	[PTR_TO_TCP_SOCK]	= "tcp_sock",
+	[PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null",
 };
 
 static char slot_type_char[] = {
@@ -1209,6 +1213,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 	case PTR_TO_SOCKET_OR_NULL:
 	case PTR_TO_SOCK_COMMON:
 	case PTR_TO_SOCK_COMMON_OR_NULL:
+	case PTR_TO_TCP_SOCK:
+	case PTR_TO_TCP_SOCK_OR_NULL:
 		return true;
 	default:
 		return false;
@@ -1662,6 +1668,9 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
 	case PTR_TO_SOCKET:
 		valid = bpf_sock_is_valid_access(off, size, t, &info);
 		break;
+	case PTR_TO_TCP_SOCK:
+		valid = bpf_tcp_sock_is_valid_access(off, size, t, &info);
+		break;
 	default:
 		valid = false;
 	}
@@ -1823,6 +1832,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 	case PTR_TO_SOCK_COMMON:
 		pointer_desc = "sock_common ";
 		break;
+	case PTR_TO_TCP_SOCK:
+		pointer_desc = "tcp_sock ";
+		break;
 	default:
 		break;
 	}
@@ -3148,6 +3160,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 			/* For mark_ptr_or_null_reg() */
 			regs[BPF_REG_0].id = ++env->id_gen;
 		}
+	} else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) {
+		mark_reg_known_zero(env, regs, BPF_REG_0);
+		regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL;
+		regs[BPF_REG_0].id = ++env->id_gen;
 	} else {
 		verbose(env, "unknown return type %d of func %s#%d\n",
 			fn->ret_type, func_id_name(func_id), func_id);
@@ -3409,6 +3425,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 	case PTR_TO_SOCKET_OR_NULL:
 	case PTR_TO_SOCK_COMMON:
 	case PTR_TO_SOCK_COMMON_OR_NULL:
+	case PTR_TO_TCP_SOCK:
+	case PTR_TO_TCP_SOCK_OR_NULL:
 		verbose(env, "R%d pointer arithmetic on %s prohibited\n",
 			dst, reg_type_str[ptr_reg->type]);
 		return -EACCES;
@@ -4644,6 +4662,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
 			reg->type = PTR_TO_SOCKET;
 		} else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) {
 			reg->type = PTR_TO_SOCK_COMMON;
+		} else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) {
+			reg->type = PTR_TO_TCP_SOCK;
 		}
 		if (is_null || !(reg_is_refcounted(reg) ||
 				 reg_may_point_to_spin_lock(reg))) {
@@ -5839,6 +5859,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 	case PTR_TO_SOCKET_OR_NULL:
 	case PTR_TO_SOCK_COMMON:
 	case PTR_TO_SOCK_COMMON_OR_NULL:
+	case PTR_TO_TCP_SOCK:
+	case PTR_TO_TCP_SOCK_OR_NULL:
 		/* Only valid matches are exact, which memcmp() above
 		 * would have accepted
 		 */
@@ -6161,6 +6183,8 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type)
 	case PTR_TO_SOCKET_OR_NULL:
 	case PTR_TO_SOCK_COMMON:
 	case PTR_TO_SOCK_COMMON_OR_NULL:
+	case PTR_TO_TCP_SOCK:
+	case PTR_TO_TCP_SOCK_OR_NULL:
 		return false;
 	default:
 		return true;
@@ -7166,6 +7190,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		case PTR_TO_SOCK_COMMON:
 			convert_ctx_access = bpf_sock_convert_ctx_access;
 			break;
+		case PTR_TO_TCP_SOCK:
+			convert_ctx_access = bpf_tcp_sock_convert_ctx_access;
+			break;
 		default:
 			continue;
 		}
diff --git a/net/core/filter.c b/net/core/filter.c
index c0d7b9ef279f..353735575204 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5315,6 +5315,79 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
 	.arg5_type	= ARG_ANYTHING,
 };
 
+bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
+				  struct bpf_insn_access_aux *info)
+{
+	if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, bytes_acked))
+		return false;
+
+	if (off % size != 0)
+		return false;
+
+	switch (off) {
+	case offsetof(struct bpf_tcp_sock, bytes_received):
+	case offsetof(struct bpf_tcp_sock, bytes_acked):
+		return size == sizeof(__u64);
+	default:
+		return size == sizeof(__u32);
+	}
+}
+
+u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
+				    const struct bpf_insn *si,
+				    struct bpf_insn *insn_buf,
+				    struct bpf_prog *prog, u32 *target_size)
+{
+	struct bpf_insn *insn = insn_buf;
+
+#define BPF_TCP_SOCK_GET_COMMON(FIELD)					\
+	do {								\
+		BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD) >	\
+			     FIELD_SIZEOF(struct bpf_tcp_sock, FIELD));	\
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_sock, FIELD),\
+				      si->dst_reg, si->src_reg,		\
+				      offsetof(struct tcp_sock, FIELD)); \
+	} while (0)
+
+	CONVERT_COMMON_TCP_SOCK_FIELDS(struct bpf_tcp_sock,
+				       BPF_TCP_SOCK_GET_COMMON);
+
+	if (insn > insn_buf)
+		return insn - insn_buf;
+
+	switch (si->off) {
+	case offsetof(struct bpf_tcp_sock, rtt_min):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) !=
+			     sizeof(struct minmax));
+		BUILD_BUG_ON(sizeof(struct minmax) <
+			     sizeof(struct minmax_sample));
+
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+				      offsetof(struct tcp_sock, rtt_min) +
+				      offsetof(struct minmax_sample, v));
+		break;
+	}
+
+	return insn - insn_buf;
+}
+
+BPF_CALL_1(bpf_tcp_sock, struct sock *, sk)
+{
+	sk = sk_to_full_sk(sk);
+
+	if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP)
+		return (unsigned long)sk;
+
+	return (unsigned long)NULL;
+}
+
+static const struct bpf_func_proto bpf_tcp_sock_proto = {
+	.func		= bpf_tcp_sock,
+	.gpl_only	= false,
+	.ret_type	= RET_PTR_TO_TCP_SOCK_OR_NULL,
+	.arg1_type	= ARG_PTR_TO_SOCK_COMMON,
+};
+
 #endif /* CONFIG_INET */
 
 bool bpf_helper_changes_pkt_data(void *func)
@@ -5470,6 +5543,10 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_local_storage_proto;
 	case BPF_FUNC_sk_fullsock:
 		return &bpf_sk_fullsock_proto;
+#ifdef CONFIG_INET
+	case BPF_FUNC_tcp_sock:
+		return &bpf_tcp_sock_proto;
+#endif
 	default:
 		return sk_filter_func_proto(func_id, prog);
 	}
@@ -5560,6 +5637,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_sk_lookup_udp_proto;
 	case BPF_FUNC_sk_release:
 		return &bpf_sk_release_proto;
+	case BPF_FUNC_tcp_sock:
+		return &bpf_tcp_sock_proto;
 #endif
 	default:
 		return bpf_base_func_proto(func_id);
-- 
cgit v1.2.3-59-g8ed1b


From c68cfb718c8f97b7f7a50ed66be5feb42d0c8988 Mon Sep 17 00:00:00 2001
From: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Date: Fri, 8 Feb 2019 17:11:25 +0000
Subject: misc: fastrpc: Add support for context Invoke method

This patch adds support to compute context invoke method on the
remote processor (DSP).
This involves setting up the functions input and output arguments,
input and output handles and mapping the dmabuf fd for the
argument/handle buffers.

The below diagram depicts invocation of a single method where the
client and objects reside on different processors. An object could
expose multiple methods which can be grouped together and referred
to as an interface.

,--------,        ,------,  ,-----------,  ,------,        ,--------,
|        | method |      |  |           |  |      | method |        |
| Client |------->| Stub |->| Transport |->| Skel |------->| Object |
|        |        |      |  |           |  |      |        |        |
`--------`        `------`  `-----------`  `------`        `--------`

Client:    Linux user mode process that initiates the remote invocation
Stub:      Auto generated code linked in with the user mode process that
           takes care of marshaling parameters
Transport: Involved in carrying an invocation from a client to an
           object. This involves two portions: 1) FastRPC Linux
           kernel driver that receives the remote invocation, queues
           them up and then waits for the response after signaling the
           remote side. 2) Service running on the remote side that
           dequeues the messages from the queue and dispatches them for
           processing.
Skel:      Auto generated code that takes care of un-marshaling
           parameters
Object:    Method implementation

Most of the work is derived from various downstream Qualcomm kernels.
Credits to various Qualcomm authors who have contributed to this code.
Specially Tharun Kumar Merugu <mtharu@codeaurora.org>

Co-developed-by: Thierry Escande <thierry.escande@linaro.org>
Signed-off-by: Thierry Escande <thierry.escande@linaro.org>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/fastrpc.c      | 730 ++++++++++++++++++++++++++++++++++++++++++++
 include/uapi/misc/fastrpc.h |  23 ++
 2 files changed, 753 insertions(+)
 create mode 100644 include/uapi/misc/fastrpc.h

(limited to 'include/uapi')

diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c
index 10b93fd5659a..cd69f8b308f6 100644
--- a/drivers/misc/fastrpc.c
+++ b/drivers/misc/fastrpc.c
@@ -2,7 +2,9 @@
 // Copyright (c) 2011-2018, The Linux Foundation. All rights reserved.
 // Copyright (c) 2018, Linaro Limited
 
+#include <linux/completion.h>
 #include <linux/device.h>
+#include <linux/dma-buf.h>
 #include <linux/dma-mapping.h>
 #include <linux/idr.h>
 #include <linux/list.h>
@@ -14,6 +16,7 @@
 #include <linux/rpmsg.h>
 #include <linux/scatterlist.h>
 #include <linux/slab.h>
+#include <uapi/misc/fastrpc.h>
 
 #define ADSP_DOMAIN_ID (0)
 #define MDSP_DOMAIN_ID (1)
@@ -21,14 +24,118 @@
 #define CDSP_DOMAIN_ID (3)
 #define FASTRPC_DEV_MAX		4 /* adsp, mdsp, slpi, cdsp*/
 #define FASTRPC_MAX_SESSIONS	9 /*8 compute, 1 cpz*/
+#define FASTRPC_ALIGN		128
+#define FASTRPC_MAX_FDLIST	16
+#define FASTRPC_MAX_CRCLIST	64
+#define FASTRPC_PHYS(p)	((p) & 0xffffffff)
 #define FASTRPC_CTX_MAX (256)
 #define FASTRPC_CTXID_MASK (0xFF0)
 #define FASTRPC_DEVICE_NAME	"fastrpc"
 
+/* Retrives number of input buffers from the scalars parameter */
+#define REMOTE_SCALARS_INBUFS(sc)	(((sc) >> 16) & 0x0ff)
+
+/* Retrives number of output buffers from the scalars parameter */
+#define REMOTE_SCALARS_OUTBUFS(sc)	(((sc) >> 8) & 0x0ff)
+
+/* Retrives number of input handles from the scalars parameter */
+#define REMOTE_SCALARS_INHANDLES(sc)	(((sc) >> 4) & 0x0f)
+
+/* Retrives number of output handles from the scalars parameter */
+#define REMOTE_SCALARS_OUTHANDLES(sc)	((sc) & 0x0f)
+
+#define REMOTE_SCALARS_LENGTH(sc)	(REMOTE_SCALARS_INBUFS(sc) +   \
+					 REMOTE_SCALARS_OUTBUFS(sc) +  \
+					 REMOTE_SCALARS_INHANDLES(sc)+ \
+					 REMOTE_SCALARS_OUTHANDLES(sc))
+#define FASTRPC_BUILD_SCALARS(attr, method, in, out, oin, oout)  \
+				(((attr & 0x07) << 29) |		\
+				((method & 0x1f) << 24) |	\
+				((in & 0xff) << 16) |		\
+				((out & 0xff) <<  8) |		\
+				((oin & 0x0f) <<  4) |		\
+				(oout & 0x0f))
+
+#define FASTRPC_SCALARS(method, in, out) \
+		FASTRPC_BUILD_SCALARS(0, method, in, out, 0, 0)
+
 #define miscdev_to_cctx(d) container_of(d, struct fastrpc_channel_ctx, miscdev)
 
 static const char *domains[FASTRPC_DEV_MAX] = { "adsp", "mdsp",
 						"sdsp", "cdsp"};
+struct fastrpc_phy_page {
+	u64 addr;		/* physical address */
+	u64 size;		/* size of contiguous region */
+};
+
+struct fastrpc_invoke_buf {
+	u32 num;		/* number of contiguous regions */
+	u32 pgidx;		/* index to start of contiguous region */
+};
+
+struct fastrpc_remote_arg {
+	u64 pv;
+	u64 len;
+};
+
+struct fastrpc_msg {
+	int pid;		/* process group id */
+	int tid;		/* thread id */
+	u64 ctx;		/* invoke caller context */
+	u32 handle;	/* handle to invoke */
+	u32 sc;		/* scalars structure describing the data */
+	u64 addr;		/* physical address */
+	u64 size;		/* size of contiguous region */
+};
+
+struct fastrpc_invoke_rsp {
+	u64 ctx;		/* invoke caller context */
+	int retval;		/* invoke return value */
+};
+
+struct fastrpc_buf {
+	struct fastrpc_user *fl;
+	struct device *dev;
+	void *virt;
+	u64 phys;
+	u64 size;
+};
+
+struct fastrpc_map {
+	struct list_head node;
+	struct fastrpc_user *fl;
+	int fd;
+	struct dma_buf *buf;
+	struct sg_table *table;
+	struct dma_buf_attachment *attach;
+	u64 phys;
+	u64 size;
+	void *va;
+	u64 len;
+	struct kref refcount;
+};
+
+struct fastrpc_invoke_ctx {
+	int nscalars;
+	int nbufs;
+	int retval;
+	int pid;
+	int tgid;
+	u32 sc;
+	u32 *crc;
+	u64 ctxid;
+	u64 msg_sz;
+	struct kref refcount;
+	struct list_head node; /* list of ctxs */
+	struct completion work;
+	struct fastrpc_msg msg;
+	struct fastrpc_user *fl;
+	struct fastrpc_remote_arg *rpra;
+	struct fastrpc_map **maps;
+	struct fastrpc_buf *buf;
+	struct fastrpc_invoke_args *args;
+	struct fastrpc_channel_ctx *cctx;
+};
 
 struct fastrpc_session_ctx {
 	struct device *dev;
@@ -55,6 +162,7 @@ struct fastrpc_user {
 
 	struct fastrpc_channel_ctx *cctx;
 	struct fastrpc_session_ctx *sctx;
+	struct fastrpc_buf *init_mem;
 
 	int tgid;
 	int pd;
@@ -64,6 +172,522 @@ struct fastrpc_user {
 	struct mutex mutex;
 };
 
+static void fastrpc_free_map(struct kref *ref)
+{
+	struct fastrpc_map *map;
+
+	map = container_of(ref, struct fastrpc_map, refcount);
+
+	if (map->table) {
+		dma_buf_unmap_attachment(map->attach, map->table,
+					 DMA_BIDIRECTIONAL);
+		dma_buf_detach(map->buf, map->attach);
+		dma_buf_put(map->buf);
+	}
+
+	kfree(map);
+}
+
+static void fastrpc_map_put(struct fastrpc_map *map)
+{
+	if (map)
+		kref_put(&map->refcount, fastrpc_free_map);
+}
+
+static void fastrpc_map_get(struct fastrpc_map *map)
+{
+	if (map)
+		kref_get(&map->refcount);
+}
+
+static int fastrpc_map_find(struct fastrpc_user *fl, int fd,
+			    struct fastrpc_map **ppmap)
+{
+	struct fastrpc_map *map = NULL;
+
+	mutex_lock(&fl->mutex);
+	list_for_each_entry(map, &fl->maps, node) {
+		if (map->fd == fd) {
+			fastrpc_map_get(map);
+			*ppmap = map;
+			mutex_unlock(&fl->mutex);
+			return 0;
+		}
+	}
+	mutex_unlock(&fl->mutex);
+
+	return -ENOENT;
+}
+
+static void fastrpc_buf_free(struct fastrpc_buf *buf)
+{
+	dma_free_coherent(buf->dev, buf->size, buf->virt,
+			  FASTRPC_PHYS(buf->phys));
+	kfree(buf);
+}
+
+static int fastrpc_buf_alloc(struct fastrpc_user *fl, struct device *dev,
+			     u64 size, struct fastrpc_buf **obuf)
+{
+	struct fastrpc_buf *buf;
+
+	buf = kzalloc(sizeof(*buf), GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	buf->fl = fl;
+	buf->virt = NULL;
+	buf->phys = 0;
+	buf->size = size;
+	buf->dev = dev;
+
+	buf->virt = dma_alloc_coherent(dev, buf->size, (dma_addr_t *)&buf->phys,
+				       GFP_KERNEL);
+	if (!buf->virt)
+		return -ENOMEM;
+
+	if (fl->sctx && fl->sctx->sid)
+		buf->phys += ((u64)fl->sctx->sid << 32);
+
+	*obuf = buf;
+
+	return 0;
+}
+
+static void fastrpc_context_free(struct kref *ref)
+{
+	struct fastrpc_invoke_ctx *ctx;
+	struct fastrpc_channel_ctx *cctx;
+	int i;
+
+	ctx = container_of(ref, struct fastrpc_invoke_ctx, refcount);
+	cctx = ctx->cctx;
+
+	for (i = 0; i < ctx->nscalars; i++)
+		fastrpc_map_put(ctx->maps[i]);
+
+	if (ctx->buf)
+		fastrpc_buf_free(ctx->buf);
+
+	spin_lock(&cctx->lock);
+	idr_remove(&cctx->ctx_idr, ctx->ctxid >> 4);
+	spin_unlock(&cctx->lock);
+
+	kfree(ctx->maps);
+	kfree(ctx);
+}
+
+static void fastrpc_context_get(struct fastrpc_invoke_ctx *ctx)
+{
+	kref_get(&ctx->refcount);
+}
+
+static void fastrpc_context_put(struct fastrpc_invoke_ctx *ctx)
+{
+	kref_put(&ctx->refcount, fastrpc_context_free);
+}
+
+static struct fastrpc_invoke_ctx *fastrpc_context_alloc(
+			struct fastrpc_user *user, u32 kernel, u32 sc,
+			struct fastrpc_invoke_args *args)
+{
+	struct fastrpc_channel_ctx *cctx = user->cctx;
+	struct fastrpc_invoke_ctx *ctx = NULL;
+	int ret;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&ctx->node);
+	ctx->fl = user;
+	ctx->nscalars = REMOTE_SCALARS_LENGTH(sc);
+	ctx->nbufs = REMOTE_SCALARS_INBUFS(sc) +
+		     REMOTE_SCALARS_OUTBUFS(sc);
+
+	if (ctx->nscalars) {
+		ctx->maps = kcalloc(ctx->nscalars,
+				    sizeof(*ctx->maps), GFP_KERNEL);
+		if (!ctx->maps) {
+			kfree(ctx);
+			return ERR_PTR(-ENOMEM);
+		}
+		ctx->args = args;
+	}
+
+	ctx->sc = sc;
+	ctx->retval = -1;
+	ctx->pid = current->pid;
+	ctx->tgid = user->tgid;
+	ctx->cctx = cctx;
+	init_completion(&ctx->work);
+
+	spin_lock(&user->lock);
+	list_add_tail(&ctx->node, &user->pending);
+	spin_unlock(&user->lock);
+
+	spin_lock(&cctx->lock);
+	ret = idr_alloc_cyclic(&cctx->ctx_idr, ctx, 1,
+			       FASTRPC_CTX_MAX, GFP_ATOMIC);
+	if (ret < 0) {
+		spin_unlock(&cctx->lock);
+		goto err_idr;
+	}
+	ctx->ctxid = ret << 4;
+	spin_unlock(&cctx->lock);
+
+	kref_init(&ctx->refcount);
+
+	return ctx;
+err_idr:
+	spin_lock(&user->lock);
+	list_del(&ctx->node);
+	spin_unlock(&user->lock);
+	kfree(ctx->maps);
+	kfree(ctx);
+
+	return ERR_PTR(ret);
+}
+
+static int fastrpc_map_create(struct fastrpc_user *fl, int fd,
+			      u64 len, struct fastrpc_map **ppmap)
+{
+	struct fastrpc_session_ctx *sess = fl->sctx;
+	struct fastrpc_map *map = NULL;
+	int err = 0;
+
+	if (!fastrpc_map_find(fl, fd, ppmap))
+		return 0;
+
+	map = kzalloc(sizeof(*map), GFP_KERNEL);
+	if (!map)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&map->node);
+	map->fl = fl;
+	map->fd = fd;
+	map->buf = dma_buf_get(fd);
+	if (!map->buf) {
+		err = -EINVAL;
+		goto get_err;
+	}
+
+	map->attach = dma_buf_attach(map->buf, sess->dev);
+	if (IS_ERR(map->attach)) {
+		dev_err(sess->dev, "Failed to attach dmabuf\n");
+		err = PTR_ERR(map->attach);
+		goto attach_err;
+	}
+
+	map->table = dma_buf_map_attachment(map->attach, DMA_BIDIRECTIONAL);
+	if (IS_ERR(map->table)) {
+		err = PTR_ERR(map->table);
+		goto map_err;
+	}
+
+	map->phys = sg_dma_address(map->table->sgl);
+	map->phys += ((u64)fl->sctx->sid << 32);
+	map->size = len;
+	map->va = sg_virt(map->table->sgl);
+	map->len = len;
+	kref_init(&map->refcount);
+
+	spin_lock(&fl->lock);
+	list_add_tail(&map->node, &fl->maps);
+	spin_unlock(&fl->lock);
+	*ppmap = map;
+
+	return 0;
+
+map_err:
+	dma_buf_detach(map->buf, map->attach);
+attach_err:
+	dma_buf_put(map->buf);
+get_err:
+	kfree(map);
+
+	return err;
+}
+
+/*
+ * Fastrpc payload buffer with metadata looks like:
+ *
+ * >>>>>>  START of METADATA <<<<<<<<<
+ * +---------------------------------+
+ * |           Arguments             |
+ * | type:(struct fastrpc_remote_arg)|
+ * |             (0 - N)             |
+ * +---------------------------------+
+ * |         Invoke Buffer list      |
+ * | type:(struct fastrpc_invoke_buf)|
+ * |           (0 - N)               |
+ * +---------------------------------+
+ * |         Page info list          |
+ * | type:(struct fastrpc_phy_page)  |
+ * |             (0 - N)             |
+ * +---------------------------------+
+ * |         Optional info           |
+ * |(can be specific to SoC/Firmware)|
+ * +---------------------------------+
+ * >>>>>>>>  END of METADATA <<<<<<<<<
+ * +---------------------------------+
+ * |         Inline ARGS             |
+ * |            (0-N)                |
+ * +---------------------------------+
+ */
+
+static int fastrpc_get_meta_size(struct fastrpc_invoke_ctx *ctx)
+{
+	int size = 0;
+
+	size = (sizeof(struct fastrpc_remote_arg) +
+		sizeof(struct fastrpc_invoke_buf) +
+		sizeof(struct fastrpc_phy_page)) * ctx->nscalars +
+		sizeof(u64) * FASTRPC_MAX_FDLIST +
+		sizeof(u32) * FASTRPC_MAX_CRCLIST;
+
+	return size;
+}
+
+static u64 fastrpc_get_payload_size(struct fastrpc_invoke_ctx *ctx, int metalen)
+{
+	u64 size = 0;
+	int i;
+
+	size = ALIGN(metalen, FASTRPC_ALIGN);
+	for (i = 0; i < ctx->nscalars; i++) {
+		if (ctx->args[i].fd == 0 || ctx->args[i].fd == -1) {
+			size = ALIGN(size, FASTRPC_ALIGN);
+			size += ctx->args[i].length;
+		}
+	}
+
+	return size;
+}
+
+static int fastrpc_create_maps(struct fastrpc_invoke_ctx *ctx)
+{
+	struct device *dev = ctx->fl->sctx->dev;
+	int i, err;
+
+	for (i = 0; i < ctx->nscalars; ++i) {
+		/* Make sure reserved field is set to 0 */
+		if (ctx->args[i].reserved)
+			return -EINVAL;
+
+		if (ctx->args[i].fd == 0 || ctx->args[i].fd == -1 ||
+		    ctx->args[i].length == 0)
+			continue;
+
+		err = fastrpc_map_create(ctx->fl, ctx->args[i].fd,
+					 ctx->args[i].length, &ctx->maps[i]);
+		if (err) {
+			dev_err(dev, "Error Creating map %d\n", err);
+			return -EINVAL;
+		}
+
+	}
+	return 0;
+}
+
+static int fastrpc_get_args(u32 kernel, struct fastrpc_invoke_ctx *ctx)
+{
+	struct device *dev = ctx->fl->sctx->dev;
+	struct fastrpc_remote_arg *rpra;
+	struct fastrpc_invoke_buf *list;
+	struct fastrpc_phy_page *pages;
+	int inbufs, i, err = 0;
+	u64 rlen, pkt_size;
+	uintptr_t args;
+	int metalen;
+
+
+	inbufs = REMOTE_SCALARS_INBUFS(ctx->sc);
+	metalen = fastrpc_get_meta_size(ctx);
+	pkt_size = fastrpc_get_payload_size(ctx, metalen);
+
+	err = fastrpc_create_maps(ctx);
+	if (err)
+		return err;
+
+	ctx->msg_sz = pkt_size;
+
+	err = fastrpc_buf_alloc(ctx->fl, dev, pkt_size, &ctx->buf);
+	if (err)
+		return err;
+
+	rpra = ctx->buf->virt;
+	list = ctx->buf->virt + ctx->nscalars * sizeof(*rpra);
+	pages = ctx->buf->virt + ctx->nscalars * (sizeof(*list) +
+		sizeof(*rpra));
+	args = (uintptr_t)ctx->buf->virt + metalen;
+	rlen = pkt_size - metalen;
+	ctx->rpra = rpra;
+
+	for (i = 0; i < ctx->nbufs; ++i) {
+		u64 len = ctx->args[i].length;
+
+		rpra[i].pv = 0;
+		rpra[i].len = len;
+		list[i].num = len ? 1 : 0;
+		list[i].pgidx = i;
+
+		if (!len)
+			continue;
+
+		pages[i].size = roundup(len, PAGE_SIZE);
+
+		if (ctx->maps[i]) {
+			rpra[i].pv = (u64) ctx->args[i].ptr;
+			pages[i].addr = ctx->maps[i]->phys;
+		} else {
+			rlen -= ALIGN(args, FASTRPC_ALIGN) - args;
+			args = ALIGN(args, FASTRPC_ALIGN);
+			if (rlen < len)
+				goto bail;
+
+			rpra[i].pv = args;
+			pages[i].addr = ctx->buf->phys + (pkt_size - rlen);
+			pages[i].addr = pages[i].addr &	PAGE_MASK;
+			args = args + len;
+			rlen -= len;
+		}
+
+		if (i < inbufs && !ctx->maps[i]) {
+			void *dst = (void *)(uintptr_t)rpra[i].pv;
+			void *src = (void *)(uintptr_t)ctx->args[i].ptr;
+
+			if (!kernel) {
+				if (copy_from_user(dst, (void __user *)src,
+						   len)) {
+					err = -EFAULT;
+					goto bail;
+				}
+			} else {
+				memcpy(dst, src, len);
+			}
+		}
+	}
+
+	for (i = ctx->nbufs; i < ctx->nscalars; ++i) {
+		rpra[i].pv = (u64) ctx->args[i].ptr;
+		rpra[i].len = ctx->args[i].length;
+		list[i].num = ctx->args[i].length ? 1 : 0;
+		list[i].pgidx = i;
+		pages[i].addr = ctx->maps[i]->phys;
+		pages[i].size = ctx->maps[i]->size;
+	}
+
+bail:
+	if (err)
+		dev_err(dev, "Error: get invoke args failed:%d\n", err);
+
+	return err;
+}
+
+static int fastrpc_put_args(struct fastrpc_invoke_ctx *ctx,
+			    u32 kernel)
+{
+	struct fastrpc_remote_arg *rpra = ctx->rpra;
+	int i, inbufs;
+
+	inbufs = REMOTE_SCALARS_INBUFS(ctx->sc);
+
+	for (i = inbufs; i < ctx->nbufs; ++i) {
+		void *src = (void *)(uintptr_t)rpra[i].pv;
+		void *dst = (void *)(uintptr_t)ctx->args[i].ptr;
+		u64 len = rpra[i].len;
+
+		if (!kernel) {
+			if (copy_to_user((void __user *)dst, src, len))
+				return -EFAULT;
+		} else {
+			memcpy(dst, src, len);
+		}
+	}
+
+	return 0;
+}
+
+static int fastrpc_invoke_send(struct fastrpc_session_ctx *sctx,
+			       struct fastrpc_invoke_ctx *ctx,
+			       u32 kernel, uint32_t handle)
+{
+	struct fastrpc_channel_ctx *cctx;
+	struct fastrpc_user *fl = ctx->fl;
+	struct fastrpc_msg *msg = &ctx->msg;
+
+	cctx = fl->cctx;
+	msg->pid = fl->tgid;
+	msg->tid = current->pid;
+
+	if (kernel)
+		msg->pid = 0;
+
+	msg->ctx = ctx->ctxid | fl->pd;
+	msg->handle = handle;
+	msg->sc = ctx->sc;
+	msg->addr = ctx->buf ? ctx->buf->phys : 0;
+	msg->size = roundup(ctx->msg_sz, PAGE_SIZE);
+	fastrpc_context_get(ctx);
+
+	return rpmsg_send(cctx->rpdev->ept, (void *)msg, sizeof(*msg));
+}
+
+static int fastrpc_internal_invoke(struct fastrpc_user *fl,  u32 kernel,
+				   u32 handle, u32 sc,
+				   struct fastrpc_invoke_args *args)
+{
+	struct fastrpc_invoke_ctx *ctx = NULL;
+	int err = 0;
+
+	if (!fl->sctx)
+		return -EINVAL;
+
+	ctx = fastrpc_context_alloc(fl, kernel, sc, args);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	if (ctx->nscalars) {
+		err = fastrpc_get_args(kernel, ctx);
+		if (err)
+			goto bail;
+	}
+	/* Send invoke buffer to remote dsp */
+	err = fastrpc_invoke_send(fl->sctx, ctx, kernel, handle);
+	if (err)
+		goto bail;
+
+	/* Wait for remote dsp to respond or time out */
+	err = wait_for_completion_interruptible(&ctx->work);
+	if (err)
+		goto bail;
+
+	/* Check the response from remote dsp */
+	err = ctx->retval;
+	if (err)
+		goto bail;
+
+	if (ctx->nscalars) {
+		/* populate all the output buffers with results */
+		err = fastrpc_put_args(ctx, kernel);
+		if (err)
+			goto bail;
+	}
+
+bail:
+	/* We are done with this compute context, remove it from pending list */
+	spin_lock(&fl->lock);
+	list_del(&ctx->node);
+	spin_unlock(&fl->lock);
+	fastrpc_context_put(ctx);
+
+	if (err)
+		dev_dbg(fl->sctx->dev, "Error: Invoke Failed %d\n", err);
+
+	return err;
+}
+
 static struct fastrpc_session_ctx *fastrpc_session_alloc(
 					struct fastrpc_channel_ctx *cctx)
 {
@@ -95,11 +719,26 @@ static int fastrpc_device_release(struct inode *inode, struct file *file)
 {
 	struct fastrpc_user *fl = (struct fastrpc_user *)file->private_data;
 	struct fastrpc_channel_ctx *cctx = fl->cctx;
+	struct fastrpc_invoke_ctx *ctx, *n;
+	struct fastrpc_map *map, *m;
 
 	spin_lock(&cctx->lock);
 	list_del(&fl->user);
 	spin_unlock(&cctx->lock);
 
+	if (fl->init_mem)
+		fastrpc_buf_free(fl->init_mem);
+
+	list_for_each_entry_safe(ctx, n, &fl->pending, node) {
+		list_del(&ctx->node);
+		fastrpc_context_put(ctx);
+	}
+
+	list_for_each_entry_safe(map, m, &fl->maps, node) {
+		list_del(&map->node);
+		fastrpc_map_put(map);
+	}
+
 	fastrpc_session_free(cctx, fl->sctx);
 
 	mutex_destroy(&fl->mutex);
@@ -134,9 +773,60 @@ static int fastrpc_device_open(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+static int fastrpc_invoke(struct fastrpc_user *fl, char __user *argp)
+{
+	struct fastrpc_invoke_args *args = NULL;
+	struct fastrpc_invoke inv;
+	u32 nscalars;
+	int err;
+
+	if (copy_from_user(&inv, argp, sizeof(inv)))
+		return -EFAULT;
+
+	/* nscalars is truncated here to max supported value */
+	nscalars = REMOTE_SCALARS_LENGTH(inv.sc);
+	if (nscalars) {
+		args = kcalloc(nscalars, sizeof(*args), GFP_KERNEL);
+		if (!args)
+			return -ENOMEM;
+
+		if (copy_from_user(args, (void __user *)(uintptr_t)inv.args,
+				   nscalars * sizeof(*args))) {
+			kfree(args);
+			return -EFAULT;
+		}
+	}
+
+	err = fastrpc_internal_invoke(fl, false, inv.handle, inv.sc, args);
+	kfree(args);
+
+	return err;
+}
+
+static long fastrpc_device_ioctl(struct file *file, unsigned int cmd,
+				 unsigned long arg)
+{
+	struct fastrpc_user *fl = (struct fastrpc_user *)file->private_data;
+	char __user *argp = (char __user *)arg;
+	int err;
+
+	switch (cmd) {
+	case FASTRPC_IOCTL_INVOKE:
+		err = fastrpc_invoke(fl, argp);
+		break;
+	default:
+		err = -ENOTTY;
+		break;
+	}
+
+	return err;
+}
+
 static const struct file_operations fastrpc_fops = {
 	.open = fastrpc_device_open,
 	.release = fastrpc_device_release,
+	.unlocked_ioctl = fastrpc_device_ioctl,
+	.compat_ioctl = fastrpc_device_ioctl,
 };
 
 static int fastrpc_cb_probe(struct platform_device *pdev)
@@ -260,9 +950,25 @@ static int fastrpc_rpmsg_probe(struct rpmsg_device *rpdev)
 	return of_platform_populate(rdev->of_node, NULL, NULL, rdev);
 }
 
+static void fastrpc_notify_users(struct fastrpc_user *user)
+{
+	struct fastrpc_invoke_ctx *ctx;
+
+	spin_lock(&user->lock);
+	list_for_each_entry(ctx, &user->pending, node)
+		complete(&ctx->work);
+	spin_unlock(&user->lock);
+}
+
 static void fastrpc_rpmsg_remove(struct rpmsg_device *rpdev)
 {
 	struct fastrpc_channel_ctx *cctx = dev_get_drvdata(&rpdev->dev);
+	struct fastrpc_user *user;
+
+	spin_lock(&cctx->lock);
+	list_for_each_entry(user, &cctx->users, user)
+		fastrpc_notify_users(user);
+	spin_unlock(&cctx->lock);
 
 	misc_deregister(&cctx->miscdev);
 	of_platform_depopulate(&rpdev->dev);
@@ -272,6 +978,30 @@ static void fastrpc_rpmsg_remove(struct rpmsg_device *rpdev)
 static int fastrpc_rpmsg_callback(struct rpmsg_device *rpdev, void *data,
 				  int len, void *priv, u32 addr)
 {
+	struct fastrpc_channel_ctx *cctx = dev_get_drvdata(&rpdev->dev);
+	struct fastrpc_invoke_rsp *rsp = data;
+	struct fastrpc_invoke_ctx *ctx;
+	unsigned long flags;
+	unsigned long ctxid;
+
+	if (len < sizeof(*rsp))
+		return -EINVAL;
+
+	ctxid = ((rsp->ctx & FASTRPC_CTXID_MASK) >> 4);
+
+	spin_lock_irqsave(&cctx->lock, flags);
+	ctx = idr_find(&cctx->ctx_idr, ctxid);
+	spin_unlock_irqrestore(&cctx->lock, flags);
+
+	if (!ctx) {
+		dev_err(&rpdev->dev, "No context ID matches response\n");
+		return -ENOENT;
+	}
+
+	ctx->retval = rsp->retval;
+	complete(&ctx->work);
+	fastrpc_context_put(ctx);
+
 	return 0;
 }
 
diff --git a/include/uapi/misc/fastrpc.h b/include/uapi/misc/fastrpc.h
new file mode 100644
index 000000000000..a69ef33dc37e
--- /dev/null
+++ b/include/uapi/misc/fastrpc.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __QCOM_FASTRPC_H__
+#define __QCOM_FASTRPC_H__
+
+#include <linux/types.h>
+
+#define FASTRPC_IOCTL_INVOKE		_IOWR('R', 3, struct fastrpc_invoke)
+
+struct fastrpc_invoke_args {
+	__u64 ptr;
+	__u64 length;
+	__s32 fd;
+	__u32 reserved;
+};
+
+struct fastrpc_invoke {
+	__u32 handle;
+	__u32 sc;
+	__u64 args;
+};
+
+#endif /* __QCOM_FASTRPC_H__ */
-- 
cgit v1.2.3-59-g8ed1b


From d73f71c7c6ee1583c08c214c8f7b20d841490b36 Mon Sep 17 00:00:00 2001
From: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Date: Fri, 8 Feb 2019 17:11:26 +0000
Subject: misc: fastrpc: Add support for create remote init process

This patch adds support to create or attach remote shell process.
The shell process called fastrpc_shell_0 is usually loaded on the DSP
when a user process is spawned.

Most of the work is derived from various downstream Qualcomm kernels.
Credits to various Qualcomm authors who have contributed to this code.
Specially Tharun Kumar Merugu <mtharu@codeaurora.org>

Co-developed-by: Thierry Escande <thierry.escande@linaro.org>
Signed-off-by: Thierry Escande <thierry.escande@linaro.org>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/fastrpc.c      | 156 ++++++++++++++++++++++++++++++++++++++++++++
 include/uapi/misc/fastrpc.h |  10 +++
 2 files changed, 166 insertions(+)

(limited to 'include/uapi')

diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c
index cd69f8b308f6..ceb498487569 100644
--- a/drivers/misc/fastrpc.c
+++ b/drivers/misc/fastrpc.c
@@ -29,7 +29,10 @@
 #define FASTRPC_MAX_CRCLIST	64
 #define FASTRPC_PHYS(p)	((p) & 0xffffffff)
 #define FASTRPC_CTX_MAX (256)
+#define FASTRPC_INIT_HANDLE	1
 #define FASTRPC_CTXID_MASK (0xFF0)
+#define INIT_FILELEN_MAX (2 * 1024 * 1024)
+#define INIT_MEMLEN_MAX  (8 * 1024 * 1024)
 #define FASTRPC_DEVICE_NAME	"fastrpc"
 
 /* Retrives number of input buffers from the scalars parameter */
@@ -59,6 +62,14 @@
 #define FASTRPC_SCALARS(method, in, out) \
 		FASTRPC_BUILD_SCALARS(0, method, in, out, 0, 0)
 
+#define FASTRPC_CREATE_PROCESS_NARGS	6
+/* Remote Method id table */
+#define FASTRPC_RMID_INIT_ATTACH	0
+#define FASTRPC_RMID_INIT_RELEASE	1
+#define FASTRPC_RMID_INIT_CREATE	6
+#define FASTRPC_RMID_INIT_CREATE_ATTR	7
+#define FASTRPC_RMID_INIT_CREATE_STATIC	8
+
 #define miscdev_to_cctx(d) container_of(d, struct fastrpc_channel_ctx, miscdev)
 
 static const char *domains[FASTRPC_DEV_MAX] = { "adsp", "mdsp",
@@ -688,6 +699,109 @@ bail:
 	return err;
 }
 
+static int fastrpc_init_create_process(struct fastrpc_user *fl,
+					char __user *argp)
+{
+	struct fastrpc_init_create init;
+	struct fastrpc_invoke_args *args;
+	struct fastrpc_phy_page pages[1];
+	struct fastrpc_map *map = NULL;
+	struct fastrpc_buf *imem = NULL;
+	int memlen;
+	int err;
+	struct {
+		int pgid;
+		u32 namelen;
+		u32 filelen;
+		u32 pageslen;
+		u32 attrs;
+		u32 siglen;
+	} inbuf;
+	u32 sc;
+
+	args = kcalloc(FASTRPC_CREATE_PROCESS_NARGS, sizeof(*args), GFP_KERNEL);
+	if (!args)
+		return -ENOMEM;
+
+	if (copy_from_user(&init, argp, sizeof(init))) {
+		err = -EFAULT;
+		goto bail;
+	}
+
+	if (init.filelen > INIT_FILELEN_MAX) {
+		err = -EINVAL;
+		goto bail;
+	}
+
+	inbuf.pgid = fl->tgid;
+	inbuf.namelen = strlen(current->comm) + 1;
+	inbuf.filelen = init.filelen;
+	inbuf.pageslen = 1;
+	inbuf.attrs = init.attrs;
+	inbuf.siglen = init.siglen;
+	fl->pd = 1;
+
+	if (init.filelen && init.filefd) {
+		err = fastrpc_map_create(fl, init.filefd, init.filelen, &map);
+		if (err)
+			goto bail;
+	}
+
+	memlen = ALIGN(max(INIT_FILELEN_MAX, (int)init.filelen * 4),
+		       1024 * 1024);
+	err = fastrpc_buf_alloc(fl, fl->sctx->dev, memlen,
+				&imem);
+	if (err) {
+		fastrpc_map_put(map);
+		goto bail;
+	}
+
+	fl->init_mem = imem;
+	args[0].ptr = (u64)(uintptr_t)&inbuf;
+	args[0].length = sizeof(inbuf);
+	args[0].fd = -1;
+
+	args[1].ptr = (u64)(uintptr_t)current->comm;
+	args[1].length = inbuf.namelen;
+	args[1].fd = -1;
+
+	args[2].ptr = (u64) init.file;
+	args[2].length = inbuf.filelen;
+	args[2].fd = init.filefd;
+
+	pages[0].addr = imem->phys;
+	pages[0].size = imem->size;
+
+	args[3].ptr = (u64)(uintptr_t) pages;
+	args[3].length = 1 * sizeof(*pages);
+	args[3].fd = -1;
+
+	args[4].ptr = (u64)(uintptr_t)&inbuf.attrs;
+	args[4].length = sizeof(inbuf.attrs);
+	args[4].fd = -1;
+
+	args[5].ptr = (u64)(uintptr_t) &inbuf.siglen;
+	args[5].length = sizeof(inbuf.siglen);
+	args[5].fd = -1;
+
+	sc = FASTRPC_SCALARS(FASTRPC_RMID_INIT_CREATE, 4, 0);
+	if (init.attrs)
+		sc = FASTRPC_SCALARS(FASTRPC_RMID_INIT_CREATE_ATTR, 6, 0);
+
+	err = fastrpc_internal_invoke(fl, true, FASTRPC_INIT_HANDLE,
+				      sc, args);
+
+	if (err) {
+		fastrpc_map_put(map);
+		fastrpc_buf_free(imem);
+	}
+
+bail:
+	kfree(args);
+
+	return err;
+}
+
 static struct fastrpc_session_ctx *fastrpc_session_alloc(
 					struct fastrpc_channel_ctx *cctx)
 {
@@ -715,6 +829,23 @@ static void fastrpc_session_free(struct fastrpc_channel_ctx *cctx,
 	spin_unlock(&cctx->lock);
 }
 
+static int fastrpc_release_current_dsp_process(struct fastrpc_user *fl)
+{
+	struct fastrpc_invoke_args args[1];
+	int tgid = 0;
+	u32 sc;
+
+	tgid = fl->tgid;
+	args[0].ptr = (u64)(uintptr_t) &tgid;
+	args[0].length = sizeof(tgid);
+	args[0].fd = -1;
+	args[0].reserved = 0;
+	sc = FASTRPC_SCALARS(FASTRPC_RMID_INIT_RELEASE, 1, 0);
+
+	return fastrpc_internal_invoke(fl, true, FASTRPC_INIT_HANDLE,
+				       sc, &args[0]);
+}
+
 static int fastrpc_device_release(struct inode *inode, struct file *file)
 {
 	struct fastrpc_user *fl = (struct fastrpc_user *)file->private_data;
@@ -722,6 +853,8 @@ static int fastrpc_device_release(struct inode *inode, struct file *file)
 	struct fastrpc_invoke_ctx *ctx, *n;
 	struct fastrpc_map *map, *m;
 
+	fastrpc_release_current_dsp_process(fl);
+
 	spin_lock(&cctx->lock);
 	list_del(&fl->user);
 	spin_unlock(&cctx->lock);
@@ -773,6 +906,23 @@ static int fastrpc_device_open(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+static int fastrpc_init_attach(struct fastrpc_user *fl)
+{
+	struct fastrpc_invoke_args args[1];
+	int tgid = fl->tgid;
+	u32 sc;
+
+	args[0].ptr = (u64)(uintptr_t) &tgid;
+	args[0].length = sizeof(tgid);
+	args[0].fd = -1;
+	args[0].reserved = 0;
+	sc = FASTRPC_SCALARS(FASTRPC_RMID_INIT_ATTACH, 1, 0);
+	fl->pd = 0;
+
+	return fastrpc_internal_invoke(fl, true, FASTRPC_INIT_HANDLE,
+				       sc, &args[0]);
+}
+
 static int fastrpc_invoke(struct fastrpc_user *fl, char __user *argp)
 {
 	struct fastrpc_invoke_args *args = NULL;
@@ -814,6 +964,12 @@ static long fastrpc_device_ioctl(struct file *file, unsigned int cmd,
 	case FASTRPC_IOCTL_INVOKE:
 		err = fastrpc_invoke(fl, argp);
 		break;
+	case FASTRPC_IOCTL_INIT_ATTACH:
+		err = fastrpc_init_attach(fl);
+		break;
+	case FASTRPC_IOCTL_INIT_CREATE:
+		err = fastrpc_init_create_process(fl, argp);
+		break;
 	default:
 		err = -ENOTTY;
 		break;
diff --git a/include/uapi/misc/fastrpc.h b/include/uapi/misc/fastrpc.h
index a69ef33dc37e..32d191c3b7bc 100644
--- a/include/uapi/misc/fastrpc.h
+++ b/include/uapi/misc/fastrpc.h
@@ -6,6 +6,8 @@
 #include <linux/types.h>
 
 #define FASTRPC_IOCTL_INVOKE		_IOWR('R', 3, struct fastrpc_invoke)
+#define FASTRPC_IOCTL_INIT_ATTACH	_IO('R', 4)
+#define FASTRPC_IOCTL_INIT_CREATE	_IOWR('R', 5, struct fastrpc_init_create)
 
 struct fastrpc_invoke_args {
 	__u64 ptr;
@@ -20,4 +22,12 @@ struct fastrpc_invoke {
 	__u64 args;
 };
 
+struct fastrpc_init_create {
+	__u32 filelen;	/* elf file length */
+	__s32 filefd;	/* fd for the file */
+	__u32 attrs;
+	__u32 siglen;
+	__u64 file;	/* pointer to elf file */
+};
+
 #endif /* __QCOM_FASTRPC_H__ */
-- 
cgit v1.2.3-59-g8ed1b


From 6cffd79504ce040f460831030d3069fa1c99bb71 Mon Sep 17 00:00:00 2001
From: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Date: Fri, 8 Feb 2019 17:11:27 +0000
Subject: misc: fastrpc: Add support for dmabuf exporter

User process can involve dealing with big buffer sizes, and also passing
buffers from one compute context bank to other compute context bank for
complex dsp algorithms.

This patch adds support to fastrpc to make it a proper dmabuf exporter
to avoid making copies of buffers.

Co-developed-by: Thierry Escande <thierry.escande@linaro.org>
Signed-off-by: Thierry Escande <thierry.escande@linaro.org>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/fastrpc.c      | 184 ++++++++++++++++++++++++++++++++++++++++++++
 include/uapi/misc/fastrpc.h |   8 ++
 2 files changed, 192 insertions(+)

(limited to 'include/uapi')

diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c
index ceb498487569..4b0db33896df 100644
--- a/drivers/misc/fastrpc.c
+++ b/drivers/misc/fastrpc.c
@@ -106,10 +106,20 @@ struct fastrpc_invoke_rsp {
 
 struct fastrpc_buf {
 	struct fastrpc_user *fl;
+	struct dma_buf *dmabuf;
 	struct device *dev;
 	void *virt;
 	u64 phys;
 	u64 size;
+	/* Lock for dma buf attachments */
+	struct mutex lock;
+	struct list_head attachments;
+};
+
+struct fastrpc_dma_buf_attachment {
+	struct device *dev;
+	struct sg_table sgt;
+	struct list_head node;
 };
 
 struct fastrpc_map {
@@ -246,6 +256,9 @@ static int fastrpc_buf_alloc(struct fastrpc_user *fl, struct device *dev,
 	if (!buf)
 		return -ENOMEM;
 
+	INIT_LIST_HEAD(&buf->attachments);
+	mutex_init(&buf->lock);
+
 	buf->fl = fl;
 	buf->virt = NULL;
 	buf->phys = 0;
@@ -360,6 +373,111 @@ err_idr:
 	return ERR_PTR(ret);
 }
 
+static struct sg_table *
+fastrpc_map_dma_buf(struct dma_buf_attachment *attachment,
+		    enum dma_data_direction dir)
+{
+	struct fastrpc_dma_buf_attachment *a = attachment->priv;
+	struct sg_table *table;
+
+	table = &a->sgt;
+
+	if (!dma_map_sg(attachment->dev, table->sgl, table->nents, dir))
+		return ERR_PTR(-ENOMEM);
+
+	return table;
+}
+
+static void fastrpc_unmap_dma_buf(struct dma_buf_attachment *attach,
+				  struct sg_table *table,
+				  enum dma_data_direction dir)
+{
+	dma_unmap_sg(attach->dev, table->sgl, table->nents, dir);
+}
+
+static void fastrpc_release(struct dma_buf *dmabuf)
+{
+	struct fastrpc_buf *buffer = dmabuf->priv;
+
+	fastrpc_buf_free(buffer);
+}
+
+static int fastrpc_dma_buf_attach(struct dma_buf *dmabuf,
+				  struct dma_buf_attachment *attachment)
+{
+	struct fastrpc_dma_buf_attachment *a;
+	struct fastrpc_buf *buffer = dmabuf->priv;
+	int ret;
+
+	a = kzalloc(sizeof(*a), GFP_KERNEL);
+	if (!a)
+		return -ENOMEM;
+
+	ret = dma_get_sgtable(buffer->dev, &a->sgt, buffer->virt,
+			      FASTRPC_PHYS(buffer->phys), buffer->size);
+	if (ret < 0) {
+		dev_err(buffer->dev, "failed to get scatterlist from DMA API\n");
+		return -EINVAL;
+	}
+
+	a->dev = attachment->dev;
+	INIT_LIST_HEAD(&a->node);
+	attachment->priv = a;
+
+	mutex_lock(&buffer->lock);
+	list_add(&a->node, &buffer->attachments);
+	mutex_unlock(&buffer->lock);
+
+	return 0;
+}
+
+static void fastrpc_dma_buf_detatch(struct dma_buf *dmabuf,
+				    struct dma_buf_attachment *attachment)
+{
+	struct fastrpc_dma_buf_attachment *a = attachment->priv;
+	struct fastrpc_buf *buffer = dmabuf->priv;
+
+	mutex_lock(&buffer->lock);
+	list_del(&a->node);
+	mutex_unlock(&buffer->lock);
+	kfree(a);
+}
+
+static void *fastrpc_kmap(struct dma_buf *dmabuf, unsigned long pgnum)
+{
+	struct fastrpc_buf *buf = dmabuf->priv;
+
+	return buf->virt ? buf->virt + pgnum * PAGE_SIZE : NULL;
+}
+
+static void *fastrpc_vmap(struct dma_buf *dmabuf)
+{
+	struct fastrpc_buf *buf = dmabuf->priv;
+
+	return buf->virt;
+}
+
+static int fastrpc_mmap(struct dma_buf *dmabuf,
+			struct vm_area_struct *vma)
+{
+	struct fastrpc_buf *buf = dmabuf->priv;
+	size_t size = vma->vm_end - vma->vm_start;
+
+	return dma_mmap_coherent(buf->dev, vma, buf->virt,
+				 FASTRPC_PHYS(buf->phys), size);
+}
+
+static const struct dma_buf_ops fastrpc_dma_buf_ops = {
+	.attach = fastrpc_dma_buf_attach,
+	.detach = fastrpc_dma_buf_detatch,
+	.map_dma_buf = fastrpc_map_dma_buf,
+	.unmap_dma_buf = fastrpc_unmap_dma_buf,
+	.mmap = fastrpc_mmap,
+	.map = fastrpc_kmap,
+	.vmap = fastrpc_vmap,
+	.release = fastrpc_release,
+};
+
 static int fastrpc_map_create(struct fastrpc_user *fl, int fd,
 			      u64 len, struct fastrpc_map **ppmap)
 {
@@ -906,6 +1024,66 @@ static int fastrpc_device_open(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+static int fastrpc_dmabuf_free(struct fastrpc_user *fl, char __user *argp)
+{
+	struct dma_buf *buf;
+	int info;
+
+	if (copy_from_user(&info, argp, sizeof(info)))
+		return -EFAULT;
+
+	buf = dma_buf_get(info);
+	if (IS_ERR_OR_NULL(buf))
+		return -EINVAL;
+	/*
+	 * one for the last get and other for the ALLOC_DMA_BUFF ioctl
+	 */
+	dma_buf_put(buf);
+	dma_buf_put(buf);
+
+	return 0;
+}
+
+static int fastrpc_dmabuf_alloc(struct fastrpc_user *fl, char __user *argp)
+{
+	struct fastrpc_alloc_dma_buf bp;
+	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
+	struct fastrpc_buf *buf = NULL;
+	int err;
+
+	if (copy_from_user(&bp, argp, sizeof(bp)))
+		return -EFAULT;
+
+	err = fastrpc_buf_alloc(fl, fl->sctx->dev, bp.size, &buf);
+	if (err)
+		return err;
+	exp_info.ops = &fastrpc_dma_buf_ops;
+	exp_info.size = bp.size;
+	exp_info.flags = O_RDWR;
+	exp_info.priv = buf;
+	buf->dmabuf = dma_buf_export(&exp_info);
+	if (IS_ERR(buf->dmabuf)) {
+		err = PTR_ERR(buf->dmabuf);
+		fastrpc_buf_free(buf);
+		return err;
+	}
+
+	bp.fd = dma_buf_fd(buf->dmabuf, O_ACCMODE);
+	if (bp.fd < 0) {
+		dma_buf_put(buf->dmabuf);
+		return -EINVAL;
+	}
+
+	if (copy_to_user(argp, &bp, sizeof(bp))) {
+		dma_buf_put(buf->dmabuf);
+		return -EFAULT;
+	}
+
+	get_dma_buf(buf->dmabuf);
+
+	return 0;
+}
+
 static int fastrpc_init_attach(struct fastrpc_user *fl)
 {
 	struct fastrpc_invoke_args args[1];
@@ -970,6 +1148,12 @@ static long fastrpc_device_ioctl(struct file *file, unsigned int cmd,
 	case FASTRPC_IOCTL_INIT_CREATE:
 		err = fastrpc_init_create_process(fl, argp);
 		break;
+	case FASTRPC_IOCTL_FREE_DMA_BUFF:
+		err = fastrpc_dmabuf_free(fl, argp);
+		break;
+	case FASTRPC_IOCTL_ALLOC_DMA_BUFF:
+		err = fastrpc_dmabuf_alloc(fl, argp);
+		break;
 	default:
 		err = -ENOTTY;
 		break;
diff --git a/include/uapi/misc/fastrpc.h b/include/uapi/misc/fastrpc.h
index 32d191c3b7bc..6d701af9fc42 100644
--- a/include/uapi/misc/fastrpc.h
+++ b/include/uapi/misc/fastrpc.h
@@ -5,6 +5,8 @@
 
 #include <linux/types.h>
 
+#define FASTRPC_IOCTL_ALLOC_DMA_BUFF	_IOWR('R', 1, struct fastrpc_alloc_dma_buf)
+#define FASTRPC_IOCTL_FREE_DMA_BUFF	_IOWR('R', 2, __u32)
 #define FASTRPC_IOCTL_INVOKE		_IOWR('R', 3, struct fastrpc_invoke)
 #define FASTRPC_IOCTL_INIT_ATTACH	_IO('R', 4)
 #define FASTRPC_IOCTL_INIT_CREATE	_IOWR('R', 5, struct fastrpc_init_create)
@@ -30,4 +32,10 @@ struct fastrpc_init_create {
 	__u64 file;	/* pointer to elf file */
 };
 
+struct fastrpc_alloc_dma_buf {
+	__s32 fd;	/* fd */
+	__u32 flags;	/* flags to map with */
+	__u64 size;	/* size */
+};
+
 #endif /* __QCOM_FASTRPC_H__ */
-- 
cgit v1.2.3-59-g8ed1b


From 1ec17dbd90f8b638f41ee650558609c1af63dfa0 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Date: Sat, 9 Feb 2019 13:35:52 +0300
Subject: inet_diag: fix reporting cgroup classid and fallback to priority

Field idiag_ext in struct inet_diag_req_v2 used as bitmap of requested
extensions has only 8 bits. Thus extensions starting from DCTCPINFO
cannot be requested directly. Some of them included into response
unconditionally or hook into some of lower 8 bits.

Extension INET_DIAG_CLASS_ID has not way to request from the beginning.

This patch bundle it with INET_DIAG_TCLASS (ipv6 tos), fixes space
reservation, and documents behavior for other extensions.

Also this patch adds fallback to reporting socket priority. This filed
is more widely used for traffic classification because ipv4 sockets
automatically maps TOS to priority and default qdisc pfifo_fast knows
about that. But priority could be changed via setsockopt SO_PRIORITY so
INET_DIAG_TOS isn't enough for predicting class.

Also cgroup2 obsoletes net_cls classid (it always zero), but we cannot
reuse this field for reporting cgroup2 id because it is 64-bit (ino+gen).

So, after this patch INET_DIAG_CLASS_ID will report socket priority
for most common setup when net_cls isn't set and/or cgroup2 in use.

Fixes: 0888e372c37f ("net: inet: diag: expose sockets cgroup classid")
Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/inet_diag.h | 16 +++++++++++-----
 net/ipv4/inet_diag.c           | 10 +++++++++-
 net/sctp/diag.c                |  1 +
 3 files changed, 21 insertions(+), 6 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index 14565d703291..e8baca85bac6 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -137,15 +137,21 @@ enum {
 	INET_DIAG_TCLASS,
 	INET_DIAG_SKMEMINFO,
 	INET_DIAG_SHUTDOWN,
-	INET_DIAG_DCTCPINFO,
-	INET_DIAG_PROTOCOL,  /* response attribute only */
+
+	/*
+	 * Next extenstions cannot be requested in struct inet_diag_req_v2:
+	 * its field idiag_ext has only 8 bits.
+	 */
+
+	INET_DIAG_DCTCPINFO,	/* request as INET_DIAG_VEGASINFO */
+	INET_DIAG_PROTOCOL,	/* response attribute only */
 	INET_DIAG_SKV6ONLY,
 	INET_DIAG_LOCALS,
 	INET_DIAG_PEERS,
 	INET_DIAG_PAD,
-	INET_DIAG_MARK,
-	INET_DIAG_BBRINFO,
-	INET_DIAG_CLASS_ID,
+	INET_DIAG_MARK,		/* only with CAP_NET_ADMIN */
+	INET_DIAG_BBRINFO,	/* request as INET_DIAG_VEGASINFO */
+	INET_DIAG_CLASS_ID,	/* request as INET_DIAG_TCLASS */
 	INET_DIAG_MD5SIG,
 	__INET_DIAG_MAX,
 };
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 1a4e9ff02762..5731670c560b 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -108,6 +108,7 @@ static size_t inet_sk_attr_size(struct sock *sk,
 		+ nla_total_size(1) /* INET_DIAG_TOS */
 		+ nla_total_size(1) /* INET_DIAG_TCLASS */
 		+ nla_total_size(4) /* INET_DIAG_MARK */
+		+ nla_total_size(4) /* INET_DIAG_CLASS_ID */
 		+ nla_total_size(sizeof(struct inet_diag_meminfo))
 		+ nla_total_size(sizeof(struct inet_diag_msg))
 		+ nla_total_size(SK_MEMINFO_VARS * sizeof(u32))
@@ -287,12 +288,19 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 			goto errout;
 	}
 
-	if (ext & (1 << (INET_DIAG_CLASS_ID - 1))) {
+	if (ext & (1 << (INET_DIAG_CLASS_ID - 1)) ||
+	    ext & (1 << (INET_DIAG_TCLASS - 1))) {
 		u32 classid = 0;
 
 #ifdef CONFIG_SOCK_CGROUP_DATA
 		classid = sock_cgroup_classid(&sk->sk_cgrp_data);
 #endif
+		/* Fallback to socket priority if class id isn't set.
+		 * Classful qdiscs use it as direct reference to class.
+		 * For cgroup2 classid is always zero.
+		 */
+		if (!classid)
+			classid = sk->sk_priority;
 
 		if (nla_put_u32(skb, INET_DIAG_CLASS_ID, classid))
 			goto errout;
diff --git a/net/sctp/diag.c b/net/sctp/diag.c
index 078f01a8d582..435847d98b51 100644
--- a/net/sctp/diag.c
+++ b/net/sctp/diag.c
@@ -256,6 +256,7 @@ static size_t inet_assoc_attr_size(struct sctp_association *asoc)
 		+ nla_total_size(1) /* INET_DIAG_TOS */
 		+ nla_total_size(1) /* INET_DIAG_TCLASS */
 		+ nla_total_size(4) /* INET_DIAG_MARK */
+		+ nla_total_size(4) /* INET_DIAG_CLASS_ID */
 		+ nla_total_size(addrlen * asoc->peer.transport_count)
 		+ nla_total_size(addrlen * addrcnt)
 		+ nla_total_size(sizeof(struct inet_diag_meminfo))
-- 
cgit v1.2.3-59-g8ed1b


From 76ce2a80a28ef5caa8cc0cd41ad04138fb7ffeed Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Tue, 12 Feb 2019 15:36:06 -0800
Subject: Rename include/{uapi => }/asm-generic/shmparam.h really

Commit 36c0f7f0f899 ("arch: unexport asm/shmparam.h for all
architectures") is different from the patch I submitted.

My patch is this:

  https://lore.kernel.org/lkml/1546904307-11124-1-git-send-email-yamada.masahiro@socionext.com/T/#u

The file renaming part:

  rename include/{uapi => }/asm-generic/shmparam.h (100%)

was lost when it was picked up.

I think it was an accident because Andrew did not say anything.

Link: http://lkml.kernel.org/r/1549158277-24558-1-git-send-email-yamada.masahiro@socionext.com
Fixes: 36c0f7f0f899 ("arch: unexport asm/shmparam.h for all architectures")
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/asm-generic/shmparam.h      | 7 +++++++
 include/uapi/asm-generic/shmparam.h | 7 -------
 2 files changed, 7 insertions(+), 7 deletions(-)
 create mode 100644 include/asm-generic/shmparam.h
 delete mode 100644 include/uapi/asm-generic/shmparam.h

(limited to 'include/uapi')

diff --git a/include/asm-generic/shmparam.h b/include/asm-generic/shmparam.h
new file mode 100644
index 000000000000..8b78c0ba08b1
--- /dev/null
+++ b/include/asm-generic/shmparam.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef __ASM_GENERIC_SHMPARAM_H
+#define __ASM_GENERIC_SHMPARAM_H
+
+#define SHMLBA PAGE_SIZE	 /* attach addr a multiple of this */
+
+#endif /* _ASM_GENERIC_SHMPARAM_H */
diff --git a/include/uapi/asm-generic/shmparam.h b/include/uapi/asm-generic/shmparam.h
deleted file mode 100644
index 8b78c0ba08b1..000000000000
--- a/include/uapi/asm-generic/shmparam.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef __ASM_GENERIC_SHMPARAM_H
-#define __ASM_GENERIC_SHMPARAM_H
-
-#define SHMLBA PAGE_SIZE	 /* attach addr a multiple of this */
-
-#endif /* _ASM_GENERIC_SHMPARAM_H */
-- 
cgit v1.2.3-59-g8ed1b


From d9a9ea94f748f47b1d75c6c5e33edcf74476c445 Mon Sep 17 00:00:00 2001
From: Chad Austin <chadaustin@fb.com>
Date: Mon, 7 Jan 2019 16:53:17 -0800
Subject: fuse: support clients that don't implement 'opendir'

Allow filesystems to return ENOSYS from opendir, preventing the kernel from
sending opendir and releasedir messages in the future. This avoids
userspace transitions when filesystems don't need to keep track of state
per directory handle.

A new capability flag, FUSE_NO_OPENDIR_SUPPORT, parallels
FUSE_NO_OPEN_SUPPORT, indicating the new semantics for returning ENOSYS
from opendir.

Signed-off-by: Chad Austin <chadaustin@fb.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 fs/fuse/file.c            | 11 +++++++----
 fs/fuse/fuse_i.h          |  3 +++
 fs/fuse/inode.c           |  3 ++-
 include/uapi/linux/fuse.h |  7 ++++++-
 4 files changed, 18 insertions(+), 6 deletions(-)

(limited to 'include/uapi')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 8ee0446a8322..cc6ffd23b80f 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -90,7 +90,7 @@ static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir)
 	if (refcount_dec_and_test(&ff->count)) {
 		struct fuse_req *req = ff->reserved_req;
 
-		if (ff->fc->no_open && !isdir) {
+		if (isdir ? ff->fc->no_opendir : ff->fc->no_open) {
 			/*
 			 * Drop the release request when client does not
 			 * implement 'open'
@@ -125,7 +125,7 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
 
 	ff->fh = 0;
 	ff->open_flags = FOPEN_KEEP_CACHE; /* Default for no-open */
-	if (!fc->no_open || isdir) {
+	if (isdir ? !fc->no_opendir : !fc->no_open) {
 		struct fuse_open_out outarg;
 		int err;
 
@@ -134,11 +134,14 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
 			ff->fh = outarg.fh;
 			ff->open_flags = outarg.open_flags;
 
-		} else if (err != -ENOSYS || isdir) {
+		} else if (err != -ENOSYS) {
 			fuse_file_free(ff);
 			return err;
 		} else {
-			fc->no_open = 1;
+			if (isdir)
+				fc->no_opendir = 1;
+			else
+				fc->no_open = 1;
 		}
 	}
 
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 033e30af519f..0920c0c032a0 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -630,6 +630,9 @@ struct fuse_conn {
 	/** Is open/release not implemented by fs? */
 	unsigned no_open:1;
 
+	/** Is opendir/releasedir not implemented by fs? */
+	unsigned no_opendir:1;
+
 	/** Is fsync not implemented by fs? */
 	unsigned no_fsync:1;
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 2bbb7c59d6da..1b3f3b67d9f0 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -972,7 +972,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
 		FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO |
 		FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT |
 		FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL |
-		FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS;
+		FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS |
+		FUSE_NO_OPENDIR_SUPPORT;
 	req->in.h.opcode = FUSE_INIT;
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(*arg);
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index b4967d48bfda..2ac598614a8f 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -122,6 +122,9 @@
  *  - add FOPEN_CACHE_DIR
  *  - add FUSE_MAX_PAGES, add max_pages to init_out
  *  - add FUSE_CACHE_SYMLINKS
+ *
+ *  7.29
+ *  - add FUSE_NO_OPENDIR_SUPPORT flag
  */
 
 #ifndef _LINUX_FUSE_H
@@ -157,7 +160,7 @@
 #define FUSE_KERNEL_VERSION 7
 
 /** Minor version number of this interface */
-#define FUSE_KERNEL_MINOR_VERSION 28
+#define FUSE_KERNEL_MINOR_VERSION 29
 
 /** The node ID of the root inode */
 #define FUSE_ROOT_ID 1
@@ -259,6 +262,7 @@ struct fuse_file_lock {
  * FUSE_ABORT_ERROR: reading the device after abort returns ECONNABORTED
  * FUSE_MAX_PAGES: init_out.max_pages contains the max number of req pages
  * FUSE_CACHE_SYMLINKS: cache READLINK responses
+ * FUSE_NO_OPENDIR_SUPPORT: kernel supports zero-message opendir
  */
 #define FUSE_ASYNC_READ		(1 << 0)
 #define FUSE_POSIX_LOCKS	(1 << 1)
@@ -284,6 +288,7 @@ struct fuse_file_lock {
 #define FUSE_ABORT_ERROR	(1 << 21)
 #define FUSE_MAX_PAGES		(1 << 22)
 #define FUSE_CACHE_SYMLINKS	(1 << 23)
+#define FUSE_NO_OPENDIR_SUPPORT (1 << 24)
 
 /**
  * CUSE INIT request/reply flags
-- 
cgit v1.2.3-59-g8ed1b


From 7fd8afa8933a095a97995885740999f174e61b60 Mon Sep 17 00:00:00 2001
From: Maxime Chevallier <maxime.chevallier@bootlin.com>
Date: Mon, 11 Feb 2019 15:25:29 +0100
Subject: net: phy: Add generic support for 2.5GBaseT and 5GBaseT

The 802.3bz specification, based on previous by the NBASET alliance,
defines the 2.5GBaseT and 5GBaseT link modes for ethernet traffic on
cat5e, cat6 and cat7 cables.

These mode integrate with the already defined C45 MDIO PMA/PMD registers
set that added 10G support, by defining some previously reserved bits,
and adding a new register (2.5G/5G Extended abilities).

This commit adds the required definitions in include/uapi/linux/mdio.h
to support these modes, and detect when a link-partner advertises them.

It also adds support for these mode in the generic C45 PHY
infrastructure.

Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Reviewed-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy-c45.c | 37 +++++++++++++++++++++++++++++++++++++
 include/uapi/linux/mdio.h | 16 ++++++++++++++++
 2 files changed, 53 insertions(+)

(limited to 'include/uapi')

diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c
index 6f028de4dae1..7af5fa81daf6 100644
--- a/drivers/net/phy/phy-c45.c
+++ b/drivers/net/phy/phy-c45.c
@@ -47,6 +47,16 @@ int genphy_c45_pma_setup_forced(struct phy_device *phydev)
 		/* Assume 1000base-T */
 		ctrl2 |= MDIO_PMA_CTRL2_1000BT;
 		break;
+	case SPEED_2500:
+		ctrl1 |= MDIO_CTRL1_SPEED2_5G;
+		/* Assume 2.5Gbase-T */
+		ctrl2 |= MDIO_PMA_CTRL2_2_5GBT;
+		break;
+	case SPEED_5000:
+		ctrl1 |= MDIO_CTRL1_SPEED5G;
+		/* Assume 5Gbase-T */
+		ctrl2 |= MDIO_PMA_CTRL2_5GBT;
+		break;
 	case SPEED_10000:
 		ctrl1 |= MDIO_CTRL1_SPEED10G;
 		/* Assume 10Gbase-T */
@@ -194,6 +204,12 @@ int genphy_c45_read_lpa(struct phy_device *phydev)
 	if (val < 0)
 		return val;
 
+	if (val & MDIO_AN_10GBT_STAT_LP2_5G)
+		linkmode_set_bit(ETHTOOL_LINK_MODE_2500baseT_Full_BIT,
+				 phydev->lp_advertising);
+	if (val & MDIO_AN_10GBT_STAT_LP5G)
+		linkmode_set_bit(ETHTOOL_LINK_MODE_5000baseT_Full_BIT,
+				 phydev->lp_advertising);
 	if (val & MDIO_AN_10GBT_STAT_LP10G)
 		linkmode_set_bit(ETHTOOL_LINK_MODE_10000baseT_Full_BIT,
 				 phydev->lp_advertising);
@@ -224,6 +240,12 @@ int genphy_c45_read_pma(struct phy_device *phydev)
 	case MDIO_PMA_CTRL1_SPEED1000:
 		phydev->speed = SPEED_1000;
 		break;
+	case MDIO_CTRL1_SPEED2_5G:
+		phydev->speed = SPEED_2500;
+		break;
+	case MDIO_CTRL1_SPEED5G:
+		phydev->speed = SPEED_5000;
+		break;
 	case MDIO_CTRL1_SPEED10G:
 		phydev->speed = SPEED_10000;
 		break;
@@ -339,6 +361,21 @@ int genphy_c45_pma_read_abilities(struct phy_device *phydev)
 		linkmode_mod_bit(ETHTOOL_LINK_MODE_10baseT_Half_BIT,
 				 phydev->supported,
 				 val & MDIO_PMA_EXTABLE_10BT);
+
+		if (val & MDIO_PMA_EXTABLE_NBT) {
+			val = phy_read_mmd(phydev, MDIO_MMD_PMAPMD,
+					   MDIO_PMA_NG_EXTABLE);
+			if (val < 0)
+				return val;
+
+			linkmode_mod_bit(ETHTOOL_LINK_MODE_2500baseT_Full_BIT,
+					 phydev->supported,
+					 val & MDIO_PMA_NG_EXTABLE_2_5GBT);
+
+			linkmode_mod_bit(ETHTOOL_LINK_MODE_5000baseT_Full_BIT,
+					 phydev->supported,
+					 val & MDIO_PMA_NG_EXTABLE_5GBT);
+		}
 	}
 
 	return 0;
diff --git a/include/uapi/linux/mdio.h b/include/uapi/linux/mdio.h
index 0e012b168e4d..0a552061ff1c 100644
--- a/include/uapi/linux/mdio.h
+++ b/include/uapi/linux/mdio.h
@@ -45,6 +45,7 @@
 #define MDIO_AN_ADVERTISE	16	/* AN advertising (base page) */
 #define MDIO_AN_LPA		19	/* AN LP abilities (base page) */
 #define MDIO_PCS_EEE_ABLE	20	/* EEE Capability register */
+#define MDIO_PMA_NG_EXTABLE	21	/* 2.5G/5G PMA/PMD extended ability */
 #define MDIO_PCS_EEE_WK_ERR	22	/* EEE wake error counter */
 #define MDIO_PHYXS_LNSTAT	24	/* PHY XGXS lane state */
 #define MDIO_AN_EEE_ADV		60	/* EEE advertisement */
@@ -92,6 +93,10 @@
 #define MDIO_CTRL1_SPEED10G		(MDIO_CTRL1_SPEEDSELEXT | 0x00)
 /* 10PASS-TS/2BASE-TL */
 #define MDIO_CTRL1_SPEED10P2B		(MDIO_CTRL1_SPEEDSELEXT | 0x04)
+/* 2.5 Gb/s */
+#define MDIO_CTRL1_SPEED2_5G		(MDIO_CTRL1_SPEEDSELEXT | 0x18)
+/* 5 Gb/s */
+#define MDIO_CTRL1_SPEED5G		(MDIO_CTRL1_SPEEDSELEXT | 0x1c)
 
 /* Status register 1. */
 #define MDIO_STAT1_LPOWERABLE		0x0002	/* Low-power ability */
@@ -145,6 +150,8 @@
 #define MDIO_PMA_CTRL2_1000BKX		0x000d	/* 1000BASE-KX type */
 #define MDIO_PMA_CTRL2_100BTX		0x000e	/* 100BASE-TX type */
 #define MDIO_PMA_CTRL2_10BT		0x000f	/* 10BASE-T type */
+#define MDIO_PMA_CTRL2_2_5GBT		0x0030  /* 2.5GBaseT type */
+#define MDIO_PMA_CTRL2_5GBT		0x0031  /* 5GBaseT type */
 #define MDIO_PCS_CTRL2_TYPE		0x0003	/* PCS type selection */
 #define MDIO_PCS_CTRL2_10GBR		0x0000	/* 10GBASE-R type */
 #define MDIO_PCS_CTRL2_10GBX		0x0001	/* 10GBASE-X type */
@@ -198,6 +205,7 @@
 #define MDIO_PMA_EXTABLE_1000BKX	0x0040	/* 1000BASE-KX ability */
 #define MDIO_PMA_EXTABLE_100BTX		0x0080	/* 100BASE-TX ability */
 #define MDIO_PMA_EXTABLE_10BT		0x0100	/* 10BASE-T ability */
+#define MDIO_PMA_EXTABLE_NBT		0x4000  /* 2.5/5GBASE-T ability */
 
 /* PHY XGXS lane state register. */
 #define MDIO_PHYXS_LNSTAT_SYNC0		0x0001
@@ -234,9 +242,13 @@
 #define MDIO_PCS_10GBRT_STAT2_BER	0x3f00
 
 /* AN 10GBASE-T control register. */
+#define MDIO_AN_10GBT_CTRL_ADV2_5G	0x0080	/* Advertise 2.5GBASE-T */
+#define MDIO_AN_10GBT_CTRL_ADV5G	0x0100	/* Advertise 5GBASE-T */
 #define MDIO_AN_10GBT_CTRL_ADV10G	0x1000	/* Advertise 10GBASE-T */
 
 /* AN 10GBASE-T status register. */
+#define MDIO_AN_10GBT_STAT_LP2_5G	0x0020  /* LP is 2.5GBT capable */
+#define MDIO_AN_10GBT_STAT_LP5G		0x0040  /* LP is 5GBT capable */
 #define MDIO_AN_10GBT_STAT_LPTRR	0x0200	/* LP training reset req. */
 #define MDIO_AN_10GBT_STAT_LPLTABLE	0x0400	/* LP loop timing ability */
 #define MDIO_AN_10GBT_STAT_LP10G	0x0800	/* LP is 10GBT capable */
@@ -265,6 +277,10 @@
 #define MDIO_EEE_10GKX4		0x0020	/* 10G KX4 EEE cap */
 #define MDIO_EEE_10GKR		0x0040	/* 10G KR EEE cap */
 
+/* 2.5G/5G Extended abilities register. */
+#define MDIO_PMA_NG_EXTABLE_2_5GBT	0x0001	/* 2.5GBASET ability */
+#define MDIO_PMA_NG_EXTABLE_5GBT	0x0002	/* 5GBASET ability */
+
 /* LASI RX_ALARM control/status registers. */
 #define MDIO_PMA_LASI_RX_PHYXSLFLT	0x0001	/* PHY XS RX local fault */
 #define MDIO_PMA_LASI_RX_PCSLFLT	0x0008	/* PCS RX local fault */
-- 
cgit v1.2.3-59-g8ed1b


From 3e0bd37ce0e4a574df6d87a901e13bcb46e10301 Mon Sep 17 00:00:00 2001
From: Peter Oskolkov <posk@google.com>
Date: Wed, 13 Feb 2019 11:53:35 -0800
Subject: bpf: add plumbing for BPF_LWT_ENCAP_IP in bpf_lwt_push_encap

This patch adds all needed plumbing in preparation to allowing
bpf programs to do IP encapping via bpf_lwt_push_encap. Actual
implementation is added in the next patch in the patchset.

Of note:
- bpf_lwt_push_encap can now be called from BPF_PROG_TYPE_LWT_XMIT
  prog types in addition to BPF_PROG_TYPE_LWT_IN;
- if the skb being encapped has GSO set, encapsulation is limited
  to IPIP/IP+GRE/IP+GUE (both IPv4 and IPv6);
- as route lookups are different for ingress vs egress, the single
  external bpf_lwt_push_encap BPF helper is routed internally to
  either bpf_lwt_in_push_encap or bpf_lwt_xmit_push_encap BPF_CALLs,
  depending on prog type.

v8 changes: fixed a typo.

Signed-off-by: Peter Oskolkov <posk@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 26 ++++++++++++++++++++++++--
 net/core/filter.c        | 48 +++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 67 insertions(+), 7 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 25c8c0e62ecf..bcdd2474eee7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2016,6 +2016,19 @@ union bpf_attr {
  *			Only works if *skb* contains an IPv6 packet. Insert a
  *			Segment Routing Header (**struct ipv6_sr_hdr**) inside
  *			the IPv6 header.
+ *		**BPF_LWT_ENCAP_IP**
+ *			IP encapsulation (GRE/GUE/IPIP/etc). The outer header
+ *			must be IPv4 or IPv6, followed by zero or more
+ *			additional headers, up to LWT_BPF_MAX_HEADROOM total
+ *			bytes in all prepended headers. Please note that
+ *			if skb_is_gso(skb) is true, no more than two headers
+ *			can be prepended, and the inner header, if present,
+ *			should be either GRE or UDP/GUE.
+ *
+ *		BPF_LWT_ENCAP_SEG6*** types can be called by bpf programs of
+ *		type BPF_PROG_TYPE_LWT_IN; BPF_LWT_ENCAP_IP type can be called
+ *		by bpf programs of types BPF_PROG_TYPE_LWT_IN and
+ *		BPF_PROG_TYPE_LWT_XMIT.
  *
  * 		A call to this helper is susceptible to change the underlaying
  * 		packet buffer. Therefore, at load time, all checks on pointers
@@ -2517,7 +2530,8 @@ enum bpf_hdr_start_off {
 /* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */
 enum bpf_lwt_encap_mode {
 	BPF_LWT_ENCAP_SEG6,
-	BPF_LWT_ENCAP_SEG6_INLINE
+	BPF_LWT_ENCAP_SEG6_INLINE,
+	BPF_LWT_ENCAP_IP,
 };
 
 #define __bpf_md_ptr(type, name)	\
@@ -2606,7 +2620,15 @@ enum bpf_ret_code {
 	BPF_DROP = 2,
 	/* 3-6 reserved */
 	BPF_REDIRECT = 7,
-	/* >127 are reserved for prog type specific return codes */
+	/* >127 are reserved for prog type specific return codes.
+	 *
+	 * BPF_LWT_REROUTE: used by BPF_PROG_TYPE_LWT_IN and
+	 *    BPF_PROG_TYPE_LWT_XMIT to indicate that skb had been
+	 *    changed and should be routed based on its new L3 header.
+	 *    (This is an L3 redirect, as opposed to L2 redirect
+	 *    represented by BPF_REDIRECT above).
+	 */
+	BPF_LWT_REROUTE = 128,
 };
 
 struct bpf_sock {
diff --git a/net/core/filter.c b/net/core/filter.c
index 353735575204..12c88c21b6b8 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4815,7 +4815,15 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len
 }
 #endif /* CONFIG_IPV6_SEG6_BPF */
 
-BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
+#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
+static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
+			     bool ingress)
+{
+	return -EINVAL;  /* Implemented in the next patch. */
+}
+#endif
+
+BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
 	   u32, len)
 {
 	switch (type) {
@@ -4823,14 +4831,41 @@ BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
 	case BPF_LWT_ENCAP_SEG6:
 	case BPF_LWT_ENCAP_SEG6_INLINE:
 		return bpf_push_seg6_encap(skb, type, hdr, len);
+#endif
+#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
+	case BPF_LWT_ENCAP_IP:
+		return bpf_push_ip_encap(skb, hdr, len, true /* ingress */);
+#endif
+	default:
+		return -EINVAL;
+	}
+}
+
+BPF_CALL_4(bpf_lwt_xmit_push_encap, struct sk_buff *, skb, u32, type,
+	   void *, hdr, u32, len)
+{
+	switch (type) {
+#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
+	case BPF_LWT_ENCAP_IP:
+		return bpf_push_ip_encap(skb, hdr, len, false /* egress */);
 #endif
 	default:
 		return -EINVAL;
 	}
 }
 
-static const struct bpf_func_proto bpf_lwt_push_encap_proto = {
-	.func		= bpf_lwt_push_encap,
+static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = {
+	.func		= bpf_lwt_in_push_encap,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_MEM,
+	.arg4_type	= ARG_CONST_SIZE
+};
+
+static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = {
+	.func		= bpf_lwt_xmit_push_encap,
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_CTX,
@@ -5417,7 +5452,8 @@ bool bpf_helper_changes_pkt_data(void *func)
 	    func == bpf_lwt_seg6_adjust_srh ||
 	    func == bpf_lwt_seg6_action ||
 #endif
-	    func == bpf_lwt_push_encap)
+	    func == bpf_lwt_in_push_encap ||
+	    func == bpf_lwt_xmit_push_encap)
 		return true;
 
 	return false;
@@ -5815,7 +5851,7 @@ lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	switch (func_id) {
 	case BPF_FUNC_lwt_push_encap:
-		return &bpf_lwt_push_encap_proto;
+		return &bpf_lwt_in_push_encap_proto;
 	default:
 		return lwt_out_func_proto(func_id, prog);
 	}
@@ -5851,6 +5887,8 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_l4_csum_replace_proto;
 	case BPF_FUNC_set_hash_invalid:
 		return &bpf_set_hash_invalid_proto;
+	case BPF_FUNC_lwt_push_encap:
+		return &bpf_lwt_xmit_push_encap_proto;
 	default:
 		return lwt_out_func_proto(func_id, prog);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From ca5e9aba753ed15d173c7a7b88e4d402b7ca8121 Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Tue, 12 Feb 2019 19:26:03 -0800
Subject: time: Add time_types.h

sys/time.h is the mandated include for many time related
defines. However, linux/time.h overlaps sys/time.h
significantly and this makes including both from userspace
or one from the other impossible.

This also means that userspace can get away with including
sys/time.h whenever it needs linux/time.h and this is what's
been happening in the user world usually.

But, we have new data types that we plan to use in the uapi time
interfaces also defined in the linux/time.h. But, we are unable
to use these types when sys/time.h is included.

Hence, move the new types to a new header, time_types.h.
We intend to eventually have all the uapi defines that the kernel
uses defined in this header.
Note that the plan is to replace uapi interfaces with timeval to
use __kernel_old_timeval, timespec to use __kernel_old_timespec etc.

Reported-by: Ran Rozenstein <ranro@mellanox.com>
Fixes: 9718475e6908 ("socket: Add SO_TIMESTAMPING_NEW")
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/time.h       | 36 +-----------------------------------
 include/uapi/linux/time_types.h | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 35 deletions(-)
 create mode 100644 include/uapi/linux/time_types.h

(limited to 'include/uapi')

diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h
index b8ad1b86b942..958932effc5e 100644
--- a/include/uapi/linux/time.h
+++ b/include/uapi/linux/time.h
@@ -3,7 +3,7 @@
 #define _UAPI_LINUX_TIME_H
 
 #include <linux/types.h>
-
+#include <linux/time_types.h>
 
 #ifndef _STRUCT_TIMESPEC
 #define _STRUCT_TIMESPEC
@@ -23,7 +23,6 @@ struct timezone {
 	int	tz_dsttime;	/* type of dst correction */
 };
 
-
 /*
  * Names of the interval timers, and structure
  * defining a timer setting:
@@ -42,39 +41,6 @@ struct itimerval {
 	struct timeval it_value;	/* current value */
 };
 
-#ifndef __kernel_timespec
-struct __kernel_timespec {
-	__kernel_time64_t       tv_sec;                 /* seconds */
-	long long               tv_nsec;                /* nanoseconds */
-};
-#endif
-
-#ifndef __kernel_itimerspec
-struct __kernel_itimerspec {
-	struct __kernel_timespec it_interval;    /* timer period */
-	struct __kernel_timespec it_value;       /* timer expiration */
-};
-#endif
-
-/*
- * legacy timeval structure, only embedded in structures that
- * traditionally used 'timeval' to pass time intervals (not absolute
- * times). Do not add new users. If user space fails to compile
- * here, this is probably because it is not y2038 safe and needs to
- * be changed to use another interface.
- */
-#ifndef __kernel_old_timeval
-struct __kernel_old_timeval {
-	__kernel_long_t tv_sec;
-	__kernel_long_t tv_usec;
-};
-#endif
-
-struct __kernel_sock_timeval {
-	__s64 tv_sec;
-	__s64 tv_usec;
-};
-
 /*
  * The IDs of the various system clocks (for POSIX.1b interval timers):
  */
diff --git a/include/uapi/linux/time_types.h b/include/uapi/linux/time_types.h
new file mode 100644
index 000000000000..459070c61d47
--- /dev/null
+++ b/include/uapi/linux/time_types.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_TIME_TYPES_H
+#define _UAPI_LINUX_TIME_TYPES_H
+
+#include <linux/types.h>
+
+#ifndef __kernel_timespec
+struct __kernel_timespec {
+	__kernel_time64_t       tv_sec;                 /* seconds */
+	long long               tv_nsec;                /* nanoseconds */
+};
+#endif
+
+#ifndef __kernel_itimerspec
+struct __kernel_itimerspec {
+	struct __kernel_timespec it_interval;    /* timer period */
+	struct __kernel_timespec it_value;       /* timer expiration */
+};
+#endif
+
+/*
+ * legacy timeval structure, only embedded in structures that
+ * traditionally used 'timeval' to pass time intervals (not absolute
+ * times). Do not add new users. If user space fails to compile
+ * here, this is probably because it is not y2038 safe and needs to
+ * be changed to use another interface.
+ */
+#ifndef __kernel_old_timeval
+struct __kernel_old_timeval {
+	__kernel_long_t tv_sec;
+	__kernel_long_t tv_usec;
+};
+#endif
+
+struct __kernel_sock_timeval {
+	__s64 tv_sec;
+	__s64 tv_usec;
+};
+
+#endif /* _UAPI_LINUX_TIME_TYPES_H */
-- 
cgit v1.2.3-59-g8ed1b


From 460a2db0273efe15488a3e4b88da2678eb403178 Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Tue, 12 Feb 2019 19:26:04 -0800
Subject: errqueue.h: Include time_types.h

Now that we have a separate header for struct __kernel_timespec,
include it directly without relying on userspace to do it.

Reported-by: Ran Rozenstein <ranro@mellanox.com>
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/errqueue.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h
index d955b9e32288..28491dac074b 100644
--- a/include/uapi/linux/errqueue.h
+++ b/include/uapi/linux/errqueue.h
@@ -3,6 +3,7 @@
 #define _UAPI_LINUX_ERRQUEUE_H
 
 #include <linux/types.h>
+#include <linux/time_types.h>
 
 struct sock_extended_err {
 	__u32	ee_errno;	
-- 
cgit v1.2.3-59-g8ed1b


From b5bb37eddb63b16b7ab959598d108b1c444be77d Mon Sep 17 00:00:00 2001
From: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Date: Wed, 30 Jan 2019 02:53:22 +0100
Subject: drm/amdgpu: Add command to override the context priority.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Given a master fd we can then override the priority of the context
in another fd.

Using these overrides was recommended by Christian instead of trying
to submit from a master fd, and I am adding a way to override a
single context instead of the entire process so we can only upgrade
a single Vulkan queue and not effectively the entire process.

Reused the flags field as it was checked to be 0 anyways, so nothing
used it. This is source-incompatible (due to the name change), but
ABI compatible.

Signed-off-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c | 41 ++++++++++++++++++++++++++++++-
 include/uapi/drm/amdgpu_drm.h             |  3 ++-
 2 files changed, 42 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c
index 0b70410488b6..0767a93e4d91 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c
@@ -76,6 +76,39 @@ static int amdgpu_sched_process_priority_override(struct amdgpu_device *adev,
 	return 0;
 }
 
+static int amdgpu_sched_context_priority_override(struct amdgpu_device *adev,
+						  int fd,
+						  unsigned ctx_id,
+						  enum drm_sched_priority priority)
+{
+	struct file *filp = fget(fd);
+	struct amdgpu_fpriv *fpriv;
+	struct amdgpu_ctx *ctx;
+	int r;
+
+	if (!filp)
+		return -EINVAL;
+
+	r = amdgpu_file_to_fpriv(filp, &fpriv);
+	if (r) {
+		fput(filp);
+		return r;
+	}
+
+	ctx = amdgpu_ctx_get(fpriv, ctx_id);
+
+	if (!ctx) {
+		fput(filp);
+		return -EINVAL;
+	}
+
+	amdgpu_ctx_priority_override(ctx, priority);
+	amdgpu_ctx_put(ctx);
+	fput(filp);
+
+	return 0;
+}
+
 int amdgpu_sched_ioctl(struct drm_device *dev, void *data,
 		       struct drm_file *filp)
 {
@@ -85,7 +118,7 @@ int amdgpu_sched_ioctl(struct drm_device *dev, void *data,
 	int r;
 
 	priority = amdgpu_to_sched_priority(args->in.priority);
-	if (args->in.flags || priority == DRM_SCHED_PRIORITY_INVALID)
+	if (priority == DRM_SCHED_PRIORITY_INVALID)
 		return -EINVAL;
 
 	switch (args->in.op) {
@@ -94,6 +127,12 @@ int amdgpu_sched_ioctl(struct drm_device *dev, void *data,
 							   args->in.fd,
 							   priority);
 		break;
+	case AMDGPU_SCHED_OP_CONTEXT_PRIORITY_OVERRIDE:
+		r = amdgpu_sched_context_priority_override(adev,
+							   args->in.fd,
+							   args->in.ctx_id,
+							   priority);
+		break;
 	default:
 		DRM_ERROR("Invalid sched op specified: %d\n", args->in.op);
 		r = -EINVAL;
diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index 2acbc8bc465b..4a53f6cfa034 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -272,13 +272,14 @@ union drm_amdgpu_vm {
 
 /* sched ioctl */
 #define AMDGPU_SCHED_OP_PROCESS_PRIORITY_OVERRIDE	1
+#define AMDGPU_SCHED_OP_CONTEXT_PRIORITY_OVERRIDE	2
 
 struct drm_amdgpu_sched_in {
 	/* AMDGPU_SCHED_OP_* */
 	__u32	op;
 	__u32	fd;
 	__s32	priority;
-	__u32	flags;
+	__u32   ctx_id;
 };
 
 union drm_amdgpu_sched {
-- 
cgit v1.2.3-59-g8ed1b


From 76726ccb7f461c83040e7082cf95fe1dea2afd1f Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 14 Feb 2019 13:40:44 -0800
Subject: devlink: add flash update command

Add devlink flash update command. Advanced NICs have firmware
stored in flash and often cryptographically secured. Updating
that flash is handled by management firmware. Ethtool has a
flash update command which served us well, however, it has two
shortcomings:
 - it takes rtnl_lock unnecessarily - really flash update has
   nothing to do with networking, so using a networking device
   as a handle is suboptimal, which leads us to the second one:
 - it requires a functioning netdev - in case device enters an
   error state and can't spawn a netdev (e.g. communication
   with the device fails) there is no netdev to use as a handle
   for flashing.

Devlink already has the ability to report the firmware versions,
now with the ability to update the firmware/flash we will be
able to recover devices in bad state.

To enable updates of sub-components of the FW allow passing
component name.  This name should correspond to one of the
versions reported in devlink info.

v1: - replace target id with component name (Jiri).

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h        |  3 +++
 include/uapi/linux/devlink.h |  6 ++++++
 net/core/devlink.c           | 30 ++++++++++++++++++++++++++++++
 3 files changed, 39 insertions(+)

(limited to 'include/uapi')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index c6d88759b7d5..18d7a051f412 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -521,6 +521,9 @@ struct devlink_ops {
 				      struct netlink_ext_ack *extack);
 	int (*info_get)(struct devlink *devlink, struct devlink_info_req *req,
 			struct netlink_ext_ack *extack);
+	int (*flash_update)(struct devlink *devlink, const char *file_name,
+			    const char *component,
+			    struct netlink_ext_ack *extack);
 };
 
 static inline void *devlink_priv(struct devlink *devlink)
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 72d9f7c89190..53de8802a000 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -103,6 +103,8 @@ enum devlink_command {
 	DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET,
 	DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR,
 
+	DEVLINK_CMD_FLASH_UPDATE,
+
 	/* add new commands above here */
 	__DEVLINK_CMD_MAX,
 	DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1
@@ -326,6 +328,10 @@ enum devlink_attr {
 	DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS,		/* u64 */
 	DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD,	/* u64 */
 	DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER,	/* u8 */
+
+	DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME,	/* string */
+	DEVLINK_ATTR_FLASH_UPDATE_COMPONENT,	/* string */
+
 	/* add new attributes above here, update the policy in devlink.c */
 
 	__DEVLINK_ATTR_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 1d7502a5a651..4a1ad0b13e52 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2662,6 +2662,27 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
 	return devlink->ops->reload(devlink, info->extack);
 }
 
+static int devlink_nl_cmd_flash_update(struct sk_buff *skb,
+				       struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	const char *file_name, *component;
+	struct nlattr *nla_component;
+
+	if (!devlink->ops->flash_update)
+		return -EOPNOTSUPP;
+
+	if (!info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME])
+		return -EINVAL;
+	file_name = nla_data(info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME]);
+
+	nla_component = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT];
+	component = nla_component ? nla_data(nla_component) : NULL;
+
+	return devlink->ops->flash_update(devlink, file_name, component,
+					  info->extack);
+}
+
 static const struct devlink_param devlink_param_generic[] = {
 	{
 		.id = DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET,
@@ -4883,6 +4904,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64 },
 	[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 },
+	[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME] = { .type = NLA_NUL_STRING },
+	[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT] = { .type = NLA_NUL_STRING },
 };
 
 static const struct genl_ops devlink_nl_ops[] = {
@@ -5171,6 +5194,13 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
 				  DEVLINK_NL_FLAG_NO_LOCK,
 	},
+	{
+		.cmd = DEVLINK_CMD_FLASH_UPDATE,
+		.doit = devlink_nl_cmd_flash_update,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+	},
 };
 
 static struct genl_family devlink_nl_family __ro_after_init = {
-- 
cgit v1.2.3-59-g8ed1b


From 99b9d7b4970cf131fd17a8f4ad4870049bd7a365 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Sat, 16 Feb 2019 00:39:13 +0200
Subject: habanalabs: add basic Goya support

This patch adds a basic support for the Goya device. The code initializes
the device's PCI controller and PCI bars. It also initializes various S/W
structures and adds some basic helper functions.

Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/habanalabs/Makefile            |   3 +
 drivers/misc/habanalabs/device.c            |  71 ++++
 drivers/misc/habanalabs/goya/Makefile       |   3 +
 drivers/misc/habanalabs/goya/goya.c         | 632 ++++++++++++++++++++++++++++
 drivers/misc/habanalabs/goya/goyaP.h        | 152 +++++++
 drivers/misc/habanalabs/habanalabs.h        | 137 ++++++
 drivers/misc/habanalabs/habanalabs_drv.c    |   3 +
 drivers/misc/habanalabs/include/goya/goya.h |  45 ++
 include/uapi/misc/habanalabs.h              |  20 +
 9 files changed, 1066 insertions(+)
 create mode 100644 drivers/misc/habanalabs/goya/Makefile
 create mode 100644 drivers/misc/habanalabs/goya/goya.c
 create mode 100644 drivers/misc/habanalabs/goya/goyaP.h
 create mode 100644 drivers/misc/habanalabs/include/goya/goya.h
 create mode 100644 include/uapi/misc/habanalabs.h

(limited to 'include/uapi')

diff --git a/drivers/misc/habanalabs/Makefile b/drivers/misc/habanalabs/Makefile
index 0910f7fa34ec..6f1ead69bd77 100644
--- a/drivers/misc/habanalabs/Makefile
+++ b/drivers/misc/habanalabs/Makefile
@@ -5,3 +5,6 @@
 obj-m	:= habanalabs.o
 
 habanalabs-y := habanalabs_drv.o device.o
+
+include $(src)/goya/Makefile
+habanalabs-y += $(HL_GOYA_FILES)
diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c
index 6189fd0e2ccd..a4feaa784db3 100644
--- a/drivers/misc/habanalabs/device.c
+++ b/drivers/misc/habanalabs/device.c
@@ -120,8 +120,11 @@ err_cdev_add:
  */
 static int device_early_init(struct hl_device *hdev)
 {
+	int rc;
+
 	switch (hdev->asic_type) {
 	case ASIC_GOYA:
+		goya_set_asic_funcs(hdev);
 		strlcpy(hdev->asic_name, "GOYA", sizeof(hdev->asic_name));
 		break;
 	default:
@@ -130,6 +133,10 @@ static int device_early_init(struct hl_device *hdev)
 		return -EINVAL;
 	}
 
+	rc = hdev->asic_funcs->early_init(hdev);
+	if (rc)
+		return rc;
+
 	return 0;
 }
 
@@ -141,6 +148,10 @@ static int device_early_init(struct hl_device *hdev)
  */
 static void device_early_fini(struct hl_device *hdev)
 {
+
+	if (hdev->asic_funcs->early_fini)
+		hdev->asic_funcs->early_fini(hdev);
+
 }
 
 /*
@@ -154,8 +165,15 @@ static void device_early_fini(struct hl_device *hdev)
  */
 int hl_device_suspend(struct hl_device *hdev)
 {
+	int rc;
+
 	pci_save_state(hdev->pdev);
 
+	rc = hdev->asic_funcs->suspend(hdev);
+	if (rc)
+		dev_err(hdev->dev,
+			"Failed to disable PCI access of device CPU\n");
+
 	/* Shut down the device */
 	pci_disable_device(hdev->pdev);
 	pci_set_power_state(hdev->pdev, PCI_D3hot);
@@ -185,6 +203,13 @@ int hl_device_resume(struct hl_device *hdev)
 		return rc;
 	}
 
+	rc = hdev->asic_funcs->resume(hdev);
+	if (rc) {
+		dev_err(hdev->dev,
+			"Failed to enable PCI access from device CPU\n");
+		return rc;
+	}
+
 	return 0;
 }
 
@@ -212,11 +237,21 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
 	if (rc)
 		goto release_device;
 
+	/*
+	 * Start calling ASIC initialization. First S/W then H/W and finally
+	 * late init
+	 */
+	rc = hdev->asic_funcs->sw_init(hdev);
+	if (rc)
+		goto early_fini;
+
 	dev_notice(hdev->dev,
 		"Successfully added device to habanalabs driver\n");
 
 	return 0;
 
+early_fini:
+	device_early_fini(hdev);
 release_device:
 	device_destroy(hclass, hdev->dev->devt);
 	cdev_del(&hdev->cdev);
@@ -247,6 +282,9 @@ void hl_device_fini(struct hl_device *hdev)
 	/* Mark device as disabled */
 	hdev->disabled = true;
 
+	/* Call ASIC S/W finalize function */
+	hdev->asic_funcs->sw_fini(hdev);
+
 	device_early_fini(hdev);
 
 	/* Hide device from user */
@@ -338,3 +376,36 @@ int hl_poll_timeout_device_memory(struct hl_device *hdev, void __iomem *addr,
 
 	return *val ? 0 : -ETIMEDOUT;
 }
+
+/*
+ * MMIO register access helper functions.
+ */
+
+/*
+ * hl_rreg - Read an MMIO register
+ *
+ * @hdev: pointer to habanalabs device structure
+ * @reg: MMIO register offset (in bytes)
+ *
+ * Returns the value of the MMIO register we are asked to read
+ *
+ */
+inline u32 hl_rreg(struct hl_device *hdev, u32 reg)
+{
+	return readl(hdev->rmmio + reg);
+}
+
+/*
+ * hl_wreg - Write to an MMIO register
+ *
+ * @hdev: pointer to habanalabs device structure
+ * @reg: MMIO register offset (in bytes)
+ * @val: 32-bit value
+ *
+ * Writes the 32-bit value into the MMIO register
+ *
+ */
+inline void hl_wreg(struct hl_device *hdev, u32 reg, u32 val)
+{
+	writel(val, hdev->rmmio + reg);
+}
diff --git a/drivers/misc/habanalabs/goya/Makefile b/drivers/misc/habanalabs/goya/Makefile
new file mode 100644
index 000000000000..38d43006386d
--- /dev/null
+++ b/drivers/misc/habanalabs/goya/Makefile
@@ -0,0 +1,3 @@
+subdir-ccflags-y += -I$(src)
+
+HL_GOYA_FILES :=  goya/goya.o
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
new file mode 100644
index 000000000000..ceaffa9afd83
--- /dev/null
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -0,0 +1,632 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright 2016-2019 HabanaLabs, Ltd.
+ * All Rights Reserved.
+ */
+
+#include "goyaP.h"
+#include "include/goya/asic_reg/goya_masks.h"
+
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/genalloc.h>
+
+/*
+ * GOYA security scheme:
+ *
+ * 1. Host is protected by:
+ *        - Range registers (When MMU is enabled, DMA RR does NOT protect host)
+ *        - MMU
+ *
+ * 2. DRAM is protected by:
+ *        - Range registers (protect the first 512MB)
+ *        - MMU (isolation between users)
+ *
+ * 3. Configuration is protected by:
+ *        - Range registers
+ *        - Protection bits
+ *
+ * When MMU is disabled:
+ *
+ * QMAN DMA: PQ, CQ, CP, DMA are secured.
+ * PQ, CB and the data are on the host.
+ *
+ * QMAN TPC/MME:
+ * PQ, CQ and CP are not secured.
+ * PQ, CB and the data are on the SRAM/DRAM.
+ *
+ * Since QMAN DMA is secured, KMD is parsing the DMA CB:
+ *     - KMD checks DMA pointer
+ *     - WREG, MSG_PROT are not allowed.
+ *     - MSG_LONG/SHORT are allowed.
+ *
+ * A read/write transaction by the QMAN to a protected area will succeed if
+ * and only if the QMAN's CP is secured and MSG_PROT is used
+ *
+ *
+ * When MMU is enabled:
+ *
+ * QMAN DMA: PQ, CQ and CP are secured.
+ * MMU is set to bypass on the Secure props register of the QMAN.
+ * The reasons we don't enable MMU for PQ, CQ and CP are:
+ *     - PQ entry is in kernel address space and KMD doesn't map it.
+ *     - CP writes to MSIX register and to kernel address space (completion
+ *       queue).
+ *
+ * DMA is not secured but because CP is secured, KMD still needs to parse the
+ * CB, but doesn't need to check the DMA addresses.
+ *
+ * For QMAN DMA 0, DMA is also secured because only KMD uses this DMA and KMD
+ * doesn't map memory in MMU.
+ *
+ * QMAN TPC/MME: PQ, CQ and CP aren't secured (no change from MMU disabled mode)
+ *
+ * DMA RR does NOT protect host because DMA is not secured
+ *
+ */
+
+#define GOYA_MMU_REGS_NUM		61
+
+#define GOYA_DMA_POOL_BLK_SIZE		0x100		/* 256 bytes */
+
+#define GOYA_RESET_TIMEOUT_MSEC		500		/* 500ms */
+#define GOYA_PLDM_RESET_TIMEOUT_MSEC	20000		/* 20s */
+#define GOYA_RESET_WAIT_MSEC		1		/* 1ms */
+#define GOYA_CPU_RESET_WAIT_MSEC	100		/* 100ms */
+#define GOYA_PLDM_RESET_WAIT_MSEC	1000		/* 1s */
+#define GOYA_CPU_TIMEOUT_USEC		10000000	/* 10s */
+#define GOYA_TEST_QUEUE_WAIT_USEC	100000		/* 100ms */
+
+#define GOYA_QMAN0_FENCE_VAL		0xD169B243
+
+#define GOYA_MAX_INITIATORS		20
+
+static void goya_get_fixed_properties(struct hl_device *hdev)
+{
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+
+	prop->completion_queues_count = NUMBER_OF_CMPLT_QUEUES;
+
+	prop->dram_base_address = DRAM_PHYS_BASE;
+	prop->dram_size = DRAM_PHYS_DEFAULT_SIZE;
+	prop->dram_end_address = prop->dram_base_address + prop->dram_size;
+	prop->dram_user_base_address = DRAM_BASE_ADDR_USER;
+
+	prop->sram_base_address = SRAM_BASE_ADDR;
+	prop->sram_size = SRAM_SIZE;
+	prop->sram_end_address = prop->sram_base_address + prop->sram_size;
+	prop->sram_user_base_address = prop->sram_base_address +
+						SRAM_USER_BASE_OFFSET;
+
+	prop->host_phys_base_address = HOST_PHYS_BASE;
+	prop->va_space_host_start_address = VA_HOST_SPACE_START;
+	prop->va_space_host_end_address = VA_HOST_SPACE_END;
+	prop->va_space_dram_start_address = VA_DDR_SPACE_START;
+	prop->va_space_dram_end_address = VA_DDR_SPACE_END;
+	prop->cfg_size = CFG_SIZE;
+	prop->max_asid = MAX_ASID;
+	prop->tpc_enabled_mask = TPC_ENABLED_MASK;
+
+	prop->high_pll = PLL_HIGH_DEFAULT;
+}
+
+/*
+ * goya_pci_bars_map - Map PCI BARS of Goya device
+ *
+ * @hdev: pointer to hl_device structure
+ *
+ * Request PCI regions and map them to kernel virtual addresses.
+ * Returns 0 on success
+ *
+ */
+int goya_pci_bars_map(struct hl_device *hdev)
+{
+	struct pci_dev *pdev = hdev->pdev;
+	int rc;
+
+	rc = pci_request_regions(pdev, HL_NAME);
+	if (rc) {
+		dev_err(hdev->dev, "Cannot obtain PCI resources\n");
+		return rc;
+	}
+
+	hdev->pcie_bar[SRAM_CFG_BAR_ID] =
+			pci_ioremap_bar(pdev, SRAM_CFG_BAR_ID);
+	if (!hdev->pcie_bar[SRAM_CFG_BAR_ID]) {
+		dev_err(hdev->dev, "pci_ioremap_bar failed for CFG\n");
+		rc = -ENODEV;
+		goto err_release_regions;
+	}
+
+	hdev->pcie_bar[MSIX_BAR_ID] = pci_ioremap_bar(pdev, MSIX_BAR_ID);
+	if (!hdev->pcie_bar[MSIX_BAR_ID]) {
+		dev_err(hdev->dev, "pci_ioremap_bar failed for MSIX\n");
+		rc = -ENODEV;
+		goto err_unmap_sram_cfg;
+	}
+
+	hdev->pcie_bar[DDR_BAR_ID] = pci_ioremap_wc_bar(pdev, DDR_BAR_ID);
+	if (!hdev->pcie_bar[DDR_BAR_ID]) {
+		dev_err(hdev->dev, "pci_ioremap_bar failed for DDR\n");
+		rc = -ENODEV;
+		goto err_unmap_msix;
+	}
+
+	hdev->rmmio = hdev->pcie_bar[SRAM_CFG_BAR_ID] +
+				(CFG_BASE - SRAM_BASE_ADDR);
+
+	return 0;
+
+err_unmap_msix:
+	iounmap(hdev->pcie_bar[MSIX_BAR_ID]);
+err_unmap_sram_cfg:
+	iounmap(hdev->pcie_bar[SRAM_CFG_BAR_ID]);
+err_release_regions:
+	pci_release_regions(pdev);
+
+	return rc;
+}
+
+/*
+ * goya_pci_bars_unmap - Unmap PCI BARS of Goya device
+ *
+ * @hdev: pointer to hl_device structure
+ *
+ * Release all PCI BARS and unmap their virtual addresses
+ *
+ */
+static void goya_pci_bars_unmap(struct hl_device *hdev)
+{
+	struct pci_dev *pdev = hdev->pdev;
+
+	iounmap(hdev->pcie_bar[DDR_BAR_ID]);
+	iounmap(hdev->pcie_bar[MSIX_BAR_ID]);
+	iounmap(hdev->pcie_bar[SRAM_CFG_BAR_ID]);
+	pci_release_regions(pdev);
+}
+
+/*
+ * goya_elbi_write - Write through the ELBI interface
+ *
+ * @hdev: pointer to hl_device structure
+ *
+ * return 0 on success, -1 on failure
+ *
+ */
+static int goya_elbi_write(struct hl_device *hdev, u64 addr, u32 data)
+{
+	struct pci_dev *pdev = hdev->pdev;
+	ktime_t timeout;
+	u32 val;
+
+	/* Clear previous status */
+	pci_write_config_dword(pdev, mmPCI_CONFIG_ELBI_STS, 0);
+
+	pci_write_config_dword(pdev, mmPCI_CONFIG_ELBI_ADDR, (u32) addr);
+	pci_write_config_dword(pdev, mmPCI_CONFIG_ELBI_DATA, data);
+	pci_write_config_dword(pdev, mmPCI_CONFIG_ELBI_CTRL,
+				PCI_CONFIG_ELBI_CTRL_WRITE);
+
+	timeout = ktime_add_ms(ktime_get(), 10);
+	for (;;) {
+		pci_read_config_dword(pdev, mmPCI_CONFIG_ELBI_STS, &val);
+		if (val & PCI_CONFIG_ELBI_STS_MASK)
+			break;
+		if (ktime_compare(ktime_get(), timeout) > 0) {
+			pci_read_config_dword(pdev, mmPCI_CONFIG_ELBI_STS,
+						&val);
+			break;
+		}
+		usleep_range(300, 500);
+	}
+
+	if ((val & PCI_CONFIG_ELBI_STS_MASK) == PCI_CONFIG_ELBI_STS_DONE)
+		return 0;
+
+	if (val & PCI_CONFIG_ELBI_STS_ERR) {
+		dev_err(hdev->dev, "Error writing to ELBI\n");
+		return -EIO;
+	}
+
+	if (!(val & PCI_CONFIG_ELBI_STS_MASK)) {
+		dev_err(hdev->dev, "ELBI write didn't finish in time\n");
+		return -EIO;
+	}
+
+	dev_err(hdev->dev, "ELBI write has undefined bits in status\n");
+	return -EIO;
+}
+
+/*
+ * goya_iatu_write - iatu write routine
+ *
+ * @hdev: pointer to hl_device structure
+ *
+ */
+static int goya_iatu_write(struct hl_device *hdev, u32 addr, u32 data)
+{
+	u32 dbi_offset;
+	int rc;
+
+	dbi_offset = addr & 0xFFF;
+
+	rc = goya_elbi_write(hdev, CFG_BASE + mmPCIE_AUX_DBI, 0x00300000);
+	rc |= goya_elbi_write(hdev, mmPCIE_DBI_BASE + dbi_offset, data);
+
+	if (rc)
+		return -EIO;
+
+	return 0;
+}
+
+void goya_reset_link_through_bridge(struct hl_device *hdev)
+{
+	struct pci_dev *pdev = hdev->pdev;
+	struct pci_dev *parent_port;
+	u16 val;
+
+	parent_port = pdev->bus->self;
+	pci_read_config_word(parent_port, PCI_BRIDGE_CONTROL, &val);
+	val |= PCI_BRIDGE_CTL_BUS_RESET;
+	pci_write_config_word(parent_port, PCI_BRIDGE_CONTROL, val);
+	ssleep(1);
+
+	val &= ~(PCI_BRIDGE_CTL_BUS_RESET);
+	pci_write_config_word(parent_port, PCI_BRIDGE_CONTROL, val);
+	ssleep(3);
+}
+
+/*
+ * goya_set_ddr_bar_base - set DDR bar to map specific device address
+ *
+ * @hdev: pointer to hl_device structure
+ * @addr: address in DDR. Must be aligned to DDR bar size
+ *
+ * This function configures the iATU so that the DDR bar will start at the
+ * specified addr.
+ *
+ */
+static int goya_set_ddr_bar_base(struct hl_device *hdev, u64 addr)
+{
+	struct goya_device *goya = hdev->asic_specific;
+	int rc;
+
+	if ((goya) && (goya->ddr_bar_cur_addr == addr))
+		return 0;
+
+	/* Inbound Region 1 - Bar 4 - Point to DDR */
+	rc = goya_iatu_write(hdev, 0x314, lower_32_bits(addr));
+	rc |= goya_iatu_write(hdev, 0x318, upper_32_bits(addr));
+	rc |= goya_iatu_write(hdev, 0x300, 0);
+	/* Enable + Bar match + match enable + Bar 4 */
+	rc |= goya_iatu_write(hdev, 0x304, 0xC0080400);
+
+	/* Return the DBI window to the default location */
+	rc |= goya_elbi_write(hdev, CFG_BASE + mmPCIE_AUX_DBI, 0);
+	rc |= goya_elbi_write(hdev, CFG_BASE + mmPCIE_AUX_DBI_32, 0);
+
+	if (rc) {
+		dev_err(hdev->dev, "failed to map DDR bar to 0x%08llx\n", addr);
+		return -EIO;
+	}
+
+	if (goya)
+		goya->ddr_bar_cur_addr = addr;
+
+	return 0;
+}
+
+/*
+ * goya_init_iatu - Initialize the iATU unit inside the PCI controller
+ *
+ * @hdev: pointer to hl_device structure
+ *
+ * This is needed in case the firmware doesn't initialize the iATU
+ *
+ */
+static int goya_init_iatu(struct hl_device *hdev)
+{
+	int rc;
+
+	/* Inbound Region 0 - Bar 0 - Point to SRAM_BASE_ADDR */
+	rc  = goya_iatu_write(hdev, 0x114, lower_32_bits(SRAM_BASE_ADDR));
+	rc |= goya_iatu_write(hdev, 0x118, upper_32_bits(SRAM_BASE_ADDR));
+	rc |= goya_iatu_write(hdev, 0x100, 0);
+	/* Enable + Bar match + match enable */
+	rc |= goya_iatu_write(hdev, 0x104, 0xC0080000);
+
+	/* Inbound Region 1 - Bar 4 - Point to DDR */
+	rc |= goya_set_ddr_bar_base(hdev, DRAM_PHYS_BASE);
+
+	/* Outbound Region 0 - Point to Host */
+	rc |= goya_iatu_write(hdev, 0x008, lower_32_bits(HOST_PHYS_BASE));
+	rc |= goya_iatu_write(hdev, 0x00C, upper_32_bits(HOST_PHYS_BASE));
+	rc |= goya_iatu_write(hdev, 0x010,
+		lower_32_bits(HOST_PHYS_BASE + HOST_PHYS_SIZE - 1));
+	rc |= goya_iatu_write(hdev, 0x014, 0);
+	rc |= goya_iatu_write(hdev, 0x018, 0);
+	rc |= goya_iatu_write(hdev, 0x020,
+		upper_32_bits(HOST_PHYS_BASE + HOST_PHYS_SIZE - 1));
+	/* Increase region size */
+	rc |= goya_iatu_write(hdev, 0x000, 0x00002000);
+	/* Enable */
+	rc |= goya_iatu_write(hdev, 0x004, 0x80000000);
+
+	/* Return the DBI window to the default location */
+	rc |= goya_elbi_write(hdev, CFG_BASE + mmPCIE_AUX_DBI, 0);
+	rc |= goya_elbi_write(hdev, CFG_BASE + mmPCIE_AUX_DBI_32, 0);
+
+	if (rc)
+		return -EIO;
+
+	return 0;
+}
+
+/*
+ * goya_early_init - GOYA early initialization code
+ *
+ * @hdev: pointer to hl_device structure
+ *
+ * Verify PCI bars
+ * Set DMA masks
+ * PCI controller initialization
+ * Map PCI bars
+ *
+ */
+static int goya_early_init(struct hl_device *hdev)
+{
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	struct pci_dev *pdev = hdev->pdev;
+	u32 val;
+	int rc;
+
+	goya_get_fixed_properties(hdev);
+
+	/* Check BAR sizes */
+	if (pci_resource_len(pdev, SRAM_CFG_BAR_ID) != CFG_BAR_SIZE) {
+		dev_err(hdev->dev,
+			"Not " HL_NAME "? BAR %d size %llu, expecting %llu\n",
+			SRAM_CFG_BAR_ID,
+			(unsigned long long) pci_resource_len(pdev,
+							SRAM_CFG_BAR_ID),
+			CFG_BAR_SIZE);
+		return -ENODEV;
+	}
+
+	if (pci_resource_len(pdev, MSIX_BAR_ID) != MSIX_BAR_SIZE) {
+		dev_err(hdev->dev,
+			"Not " HL_NAME "? BAR %d size %llu, expecting %llu\n",
+			MSIX_BAR_ID,
+			(unsigned long long) pci_resource_len(pdev,
+								MSIX_BAR_ID),
+			MSIX_BAR_SIZE);
+		return -ENODEV;
+	}
+
+	prop->dram_pci_bar_size = pci_resource_len(pdev, DDR_BAR_ID);
+
+	/* set DMA mask for GOYA */
+	rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(39));
+	if (rc) {
+		dev_warn(hdev->dev, "Unable to set pci dma mask to 39 bits\n");
+		rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (rc) {
+			dev_err(hdev->dev,
+				"Unable to set pci dma mask to 32 bits\n");
+			return rc;
+		}
+	}
+
+	rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(39));
+	if (rc) {
+		dev_warn(hdev->dev,
+			"Unable to set pci consistent dma mask to 39 bits\n");
+		rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (rc) {
+			dev_err(hdev->dev,
+				"Unable to set pci consistent dma mask to 32 bits\n");
+			return rc;
+		}
+	}
+
+	if (hdev->reset_pcilink)
+		goya_reset_link_through_bridge(hdev);
+
+	rc = pci_enable_device_mem(pdev);
+	if (rc) {
+		dev_err(hdev->dev, "can't enable PCI device\n");
+		return rc;
+	}
+
+	pci_set_master(pdev);
+
+	rc = goya_init_iatu(hdev);
+	if (rc) {
+		dev_err(hdev->dev, "Failed to initialize iATU\n");
+		goto disable_device;
+	}
+
+	rc = goya_pci_bars_map(hdev);
+	if (rc) {
+		dev_err(hdev->dev, "Failed to initialize PCI BARS\n");
+		goto disable_device;
+	}
+
+	val = RREG32(mmPSOC_GLOBAL_CONF_BOOT_STRAP_PINS);
+	if (val & PSOC_GLOBAL_CONF_BOOT_STRAP_PINS_SRIOV_EN_MASK)
+		dev_warn(hdev->dev,
+			"PCI strap is not configured correctly, PCI bus errors may occur\n");
+
+	return 0;
+
+disable_device:
+	pci_clear_master(pdev);
+	pci_disable_device(pdev);
+
+	return rc;
+}
+
+/*
+ * goya_early_fini - GOYA early finalization code
+ *
+ * @hdev: pointer to hl_device structure
+ *
+ * Unmap PCI bars
+ *
+ */
+int goya_early_fini(struct hl_device *hdev)
+{
+	goya_pci_bars_unmap(hdev);
+
+	pci_clear_master(hdev->pdev);
+	pci_disable_device(hdev->pdev);
+
+	return 0;
+}
+
+/*
+ * goya_sw_init - Goya software initialization code
+ *
+ * @hdev: pointer to hl_device structure
+ *
+ */
+static int goya_sw_init(struct hl_device *hdev)
+{
+	struct goya_device *goya;
+	int rc;
+
+	/* Allocate device structure */
+	goya = kzalloc(sizeof(*goya), GFP_KERNEL);
+	if (!goya)
+		return -ENOMEM;
+
+	/* according to goya_init_iatu */
+	goya->ddr_bar_cur_addr = DRAM_PHYS_BASE;
+	hdev->asic_specific = goya;
+
+	/* Create DMA pool for small allocations */
+	hdev->dma_pool = dma_pool_create(dev_name(hdev->dev),
+			&hdev->pdev->dev, GOYA_DMA_POOL_BLK_SIZE, 8, 0);
+	if (!hdev->dma_pool) {
+		dev_err(hdev->dev, "failed to create DMA pool\n");
+		rc = -ENOMEM;
+		goto free_goya_device;
+	}
+
+	hdev->cpu_accessible_dma_mem =
+			hdev->asic_funcs->dma_alloc_coherent(hdev,
+					CPU_ACCESSIBLE_MEM_SIZE,
+					&hdev->cpu_accessible_dma_address,
+					GFP_KERNEL | __GFP_ZERO);
+
+	if (!hdev->cpu_accessible_dma_mem) {
+		dev_err(hdev->dev,
+			"failed to allocate %d of dma memory for CPU accessible memory space\n",
+			CPU_ACCESSIBLE_MEM_SIZE);
+		rc = -ENOMEM;
+		goto free_dma_pool;
+	}
+
+	hdev->cpu_accessible_dma_pool = gen_pool_create(CPU_PKT_SHIFT, -1);
+	if (!hdev->cpu_accessible_dma_pool) {
+		dev_err(hdev->dev,
+			"Failed to create CPU accessible DMA pool\n");
+		rc = -ENOMEM;
+		goto free_cpu_pq_dma_mem;
+	}
+
+	rc = gen_pool_add(hdev->cpu_accessible_dma_pool,
+				(uintptr_t) hdev->cpu_accessible_dma_mem,
+				CPU_ACCESSIBLE_MEM_SIZE, -1);
+	if (rc) {
+		dev_err(hdev->dev,
+			"Failed to add memory to CPU accessible DMA pool\n");
+		rc = -EFAULT;
+		goto free_cpu_pq_pool;
+	}
+
+	spin_lock_init(&goya->hw_queues_lock);
+
+	return 0;
+
+free_cpu_pq_pool:
+	gen_pool_destroy(hdev->cpu_accessible_dma_pool);
+free_cpu_pq_dma_mem:
+	hdev->asic_funcs->dma_free_coherent(hdev, CPU_ACCESSIBLE_MEM_SIZE,
+			hdev->cpu_accessible_dma_mem,
+			hdev->cpu_accessible_dma_address);
+free_dma_pool:
+	dma_pool_destroy(hdev->dma_pool);
+free_goya_device:
+	kfree(goya);
+
+	return rc;
+}
+
+/*
+ * goya_sw_fini - Goya software tear-down code
+ *
+ * @hdev: pointer to hl_device structure
+ *
+ */
+int goya_sw_fini(struct hl_device *hdev)
+{
+	struct goya_device *goya = hdev->asic_specific;
+
+	gen_pool_destroy(hdev->cpu_accessible_dma_pool);
+
+	hdev->asic_funcs->dma_free_coherent(hdev, CPU_ACCESSIBLE_MEM_SIZE,
+			hdev->cpu_accessible_dma_mem,
+			hdev->cpu_accessible_dma_address);
+
+	dma_pool_destroy(hdev->dma_pool);
+
+	kfree(goya);
+
+	return 0;
+}
+
+int goya_suspend(struct hl_device *hdev)
+{
+	return 0;
+}
+
+int goya_resume(struct hl_device *hdev)
+{
+	return 0;
+}
+
+void *goya_dma_alloc_coherent(struct hl_device *hdev, size_t size,
+					dma_addr_t *dma_handle, gfp_t flags)
+{
+	return dma_alloc_coherent(&hdev->pdev->dev, size, dma_handle, flags);
+}
+
+void goya_dma_free_coherent(struct hl_device *hdev, size_t size, void *cpu_addr,
+				dma_addr_t dma_handle)
+{
+	dma_free_coherent(&hdev->pdev->dev, size, cpu_addr, dma_handle);
+}
+
+static const struct hl_asic_funcs goya_funcs = {
+	.early_init = goya_early_init,
+	.early_fini = goya_early_fini,
+	.sw_init = goya_sw_init,
+	.sw_fini = goya_sw_fini,
+	.suspend = goya_suspend,
+	.resume = goya_resume,
+	.dma_alloc_coherent = goya_dma_alloc_coherent,
+	.dma_free_coherent = goya_dma_free_coherent,
+};
+
+/*
+ * goya_set_asic_funcs - set Goya function pointers
+ *
+ * @*hdev: pointer to hl_device structure
+ *
+ */
+void goya_set_asic_funcs(struct hl_device *hdev)
+{
+	hdev->asic_funcs = &goya_funcs;
+}
diff --git a/drivers/misc/habanalabs/goya/goyaP.h b/drivers/misc/habanalabs/goya/goyaP.h
new file mode 100644
index 000000000000..6a78976e2098
--- /dev/null
+++ b/drivers/misc/habanalabs/goya/goyaP.h
@@ -0,0 +1,152 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright 2016-2019 HabanaLabs, Ltd.
+ * All Rights Reserved.
+ *
+ */
+
+#ifndef GOYAP_H_
+#define GOYAP_H_
+
+#include <uapi/misc/habanalabs.h>
+#include "habanalabs.h"
+#include "include/goya/goya.h"
+
+#define NUMBER_OF_CMPLT_QUEUES		5
+#define NUMBER_OF_EXT_HW_QUEUES		5
+#define NUMBER_OF_CPU_HW_QUEUES		1
+#define NUMBER_OF_INT_HW_QUEUES		9
+#define NUMBER_OF_HW_QUEUES		(NUMBER_OF_EXT_HW_QUEUES + \
+					NUMBER_OF_CPU_HW_QUEUES + \
+					NUMBER_OF_INT_HW_QUEUES)
+
+/*
+ * Number of MSIX interrupts IDS:
+ * Each completion queue has 1 ID
+ * The event queue has 1 ID
+ */
+#define NUMBER_OF_INTERRUPTS		(NUMBER_OF_CMPLT_QUEUES + 1)
+
+#if (NUMBER_OF_HW_QUEUES >= HL_MAX_QUEUES)
+#error "Number of H/W queues must be smaller than HL_MAX_QUEUES"
+#endif
+
+#if (NUMBER_OF_INTERRUPTS > GOYA_MSIX_ENTRIES)
+#error "Number of MSIX interrupts must be smaller or equal to GOYA_MSIX_ENTRIES"
+#endif
+
+#define QMAN_FENCE_TIMEOUT_USEC		10000	/* 10 ms */
+
+#define QMAN_STOP_TIMEOUT_USEC		100000	/* 100 ms */
+
+#define TPC_ENABLED_MASK		0xFF
+
+#define PLL_HIGH_DEFAULT		1575000000	/* 1.575 GHz */
+
+#define GOYA_ARMCP_INFO_TIMEOUT		10000000	/* 10s */
+
+#define DRAM_PHYS_DEFAULT_SIZE		0x100000000ull	/* 4GB */
+
+/* DRAM Memory Map */
+
+#define CPU_FW_IMAGE_SIZE	0x10000000	/* 256MB */
+#define MMU_PAGE_TABLES_SIZE	0x0E000000	/* 224MB */
+#define MMU_CACHE_MNG_SIZE	0x00001000	/* 4KB */
+#define CPU_PQ_PKT_SIZE		0x00001000	/* 4KB */
+#define CPU_PQ_DATA_SIZE	0x01FFE000	/* 32MB - 8KB  */
+
+#define CPU_FW_IMAGE_ADDR	DRAM_PHYS_BASE
+#define MMU_PAGE_TABLES_ADDR	(CPU_FW_IMAGE_ADDR + CPU_FW_IMAGE_SIZE)
+#define MMU_CACHE_MNG_ADDR	(MMU_PAGE_TABLES_ADDR + MMU_PAGE_TABLES_SIZE)
+#define CPU_PQ_PKT_ADDR		(MMU_CACHE_MNG_ADDR + MMU_CACHE_MNG_SIZE)
+#define CPU_PQ_DATA_ADDR	(CPU_PQ_PKT_ADDR + CPU_PQ_PKT_SIZE)
+#define DRAM_BASE_ADDR_USER	(CPU_PQ_DATA_ADDR + CPU_PQ_DATA_SIZE)
+
+#if (DRAM_BASE_ADDR_USER != 0x20000000)
+#error "KMD must reserve 512MB"
+#endif
+
+/*
+ * SRAM Memory Map for KMD
+ *
+ * KMD occupies KMD_SRAM_SIZE bytes from the start of SRAM. It is used for
+ * MME/TPC QMANs
+ *
+ */
+
+#define MME_QMAN_BASE_OFFSET	0x000000	/* Must be 0 */
+#define MME_QMAN_LENGTH		64
+#define TPC_QMAN_LENGTH		64
+
+#define TPC0_QMAN_BASE_OFFSET	(MME_QMAN_BASE_OFFSET + \
+				(MME_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE))
+#define TPC1_QMAN_BASE_OFFSET	(TPC0_QMAN_BASE_OFFSET + \
+				(TPC_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE))
+#define TPC2_QMAN_BASE_OFFSET	(TPC1_QMAN_BASE_OFFSET + \
+				(TPC_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE))
+#define TPC3_QMAN_BASE_OFFSET	(TPC2_QMAN_BASE_OFFSET + \
+				(TPC_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE))
+#define TPC4_QMAN_BASE_OFFSET	(TPC3_QMAN_BASE_OFFSET + \
+				(TPC_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE))
+#define TPC5_QMAN_BASE_OFFSET	(TPC4_QMAN_BASE_OFFSET + \
+				(TPC_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE))
+#define TPC6_QMAN_BASE_OFFSET	(TPC5_QMAN_BASE_OFFSET + \
+				(TPC_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE))
+#define TPC7_QMAN_BASE_OFFSET	(TPC6_QMAN_BASE_OFFSET + \
+				(TPC_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE))
+
+#define SRAM_KMD_RES_OFFSET	(TPC7_QMAN_BASE_OFFSET + \
+				(TPC_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE))
+
+#if (SRAM_KMD_RES_OFFSET >= GOYA_KMD_SRAM_RESERVED_SIZE_FROM_START)
+#error "MME/TPC QMANs SRAM space exceeds limit"
+#endif
+
+#define SRAM_USER_BASE_OFFSET	GOYA_KMD_SRAM_RESERVED_SIZE_FROM_START
+
+/* Virtual address space */
+#define VA_HOST_SPACE_START	0x1000000000000ull	/* 256TB */
+#define VA_HOST_SPACE_END	0x3FF8000000000ull	/* 1PB - 1TB */
+#define VA_HOST_SPACE_SIZE	(VA_HOST_SPACE_END - \
+					VA_HOST_SPACE_START) /* 767TB */
+
+#define VA_DDR_SPACE_START	0x800000000ull		/* 32GB */
+#define VA_DDR_SPACE_END	0x2000000000ull		/* 128GB */
+#define VA_DDR_SPACE_SIZE	(VA_DDR_SPACE_END - \
+					VA_DDR_SPACE_START)	/* 128GB */
+
+#define DMA_MAX_TRANSFER_SIZE	0xFFFFFFFF
+
+#define HW_CAP_PLL		0x00000001
+#define HW_CAP_DDR_0		0x00000002
+#define HW_CAP_DDR_1		0x00000004
+#define HW_CAP_MME		0x00000008
+#define HW_CAP_CPU		0x00000010
+#define HW_CAP_DMA		0x00000020
+#define HW_CAP_MSIX		0x00000040
+#define HW_CAP_CPU_Q		0x00000080
+#define HW_CAP_MMU		0x00000100
+#define HW_CAP_TPC_MBIST	0x00000200
+#define HW_CAP_GOLDEN		0x00000400
+#define HW_CAP_TPC		0x00000800
+
+#define CPU_PKT_SHIFT		5
+#define CPU_PKT_SIZE		(1 << CPU_PKT_SHIFT)
+#define CPU_PKT_MASK		(~((1 << CPU_PKT_SHIFT) - 1))
+#define CPU_MAX_PKTS_IN_CB	32
+#define CPU_CB_SIZE		(CPU_PKT_SIZE * CPU_MAX_PKTS_IN_CB)
+#define CPU_ACCESSIBLE_MEM_SIZE	(HL_QUEUE_LENGTH * CPU_CB_SIZE)
+
+enum goya_fw_component {
+	FW_COMP_UBOOT,
+	FW_COMP_PREBOOT
+};
+
+struct goya_device {
+	/* TODO: remove hw_queues_lock after moving to scheduler code */
+	spinlock_t	hw_queues_lock;
+	u64		ddr_bar_cur_addr;
+	u32		hw_cap_initialized;
+};
+
+#endif /* GOYAP_H_ */
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index fa628b05db13..5076e680a73c 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -14,9 +14,62 @@
 
 #define HL_NAME				"habanalabs"
 
+#define HL_MAX_QUEUES			128
+
 struct hl_device;
 
 
+/**
+ * struct asic_fixed_properties - ASIC specific immutable properties.
+ * @sram_base_address: SRAM physical start address.
+ * @sram_end_address: SRAM physical end address.
+ * @sram_user_base_address - SRAM physical start address for user access.
+ * @dram_base_address: DRAM physical start address.
+ * @dram_end_address: DRAM physical end address.
+ * @dram_user_base_address: DRAM physical start address for user access.
+ * @dram_size: DRAM total size.
+ * @dram_pci_bar_size: size of PCI bar towards DRAM.
+ * @host_phys_base_address: base physical address of host memory for
+ *				transactions that the device generates.
+ * @va_space_host_start_address: base address of virtual memory range for
+ *                               mapping host memory.
+ * @va_space_host_end_address: end address of virtual memory range for
+ *                             mapping host memory.
+ * @va_space_dram_start_address: base address of virtual memory range for
+ *                               mapping DRAM memory.
+ * @va_space_dram_end_address: end address of virtual memory range for
+ *                             mapping DRAM memory.
+ * @cfg_size: configuration space size on SRAM.
+ * @sram_size: total size of SRAM.
+ * @max_asid: maximum number of open contexts (ASIDs).
+ * @completion_queues_count: number of completion queues.
+ * @high_pll: high PLL frequency used by the device.
+ * @tpc_enabled_mask: which TPCs are enabled.
+ */
+struct asic_fixed_properties {
+	u64			sram_base_address;
+	u64			sram_end_address;
+	u64			sram_user_base_address;
+	u64			dram_base_address;
+	u64			dram_end_address;
+	u64			dram_user_base_address;
+	u64			dram_size;
+	u64			dram_pci_bar_size;
+	u64			host_phys_base_address;
+	u64			va_space_host_start_address;
+	u64			va_space_host_end_address;
+	u64			va_space_dram_start_address;
+	u64			va_space_dram_end_address;
+	u32			cfg_size;
+	u32			sram_size;
+	u32			max_asid;
+	u32			high_pll;
+	u8			completion_queues_count;
+	u8			tpc_enabled_mask;
+};
+
+
+#define HL_QUEUE_LENGTH			256
 /*
  * ASICs
  */
@@ -33,6 +86,36 @@ enum hl_asic_type {
 	ASIC_INVALID
 };
 
+/**
+ * struct hl_asic_funcs - ASIC specific functions that are can be called from
+ *                        common code.
+ * @early_init: sets up early driver state (pre sw_init), doesn't configure H/W.
+ * @early_fini: tears down what was done in early_init.
+ * @sw_init: sets up driver state, does not configure H/W.
+ * @sw_fini: tears down driver state, does not configure H/W.
+ * @suspend: handles IP specific H/W or SW changes for suspend.
+ * @resume: handles IP specific H/W or SW changes for resume.
+ * @dma_alloc_coherent: Allocate coherent DMA memory by calling
+ *                      dma_alloc_coherent(). This is ASIC function because its
+ *                      implementation is not trivial when the driver is loaded
+ *                      in simulation mode (not upstreamed).
+ * @dma_free_coherent: Free coherent DMA memory by calling dma_free_coherent().
+ *                     This is ASIC function because its implementation is not
+ *                     trivial when the driver is loaded in simulation mode
+ *                     (not upstreamed).
+ */
+struct hl_asic_funcs {
+	int (*early_init)(struct hl_device *hdev);
+	int (*early_fini)(struct hl_device *hdev);
+	int (*sw_init)(struct hl_device *hdev);
+	int (*sw_fini)(struct hl_device *hdev);
+	int (*suspend)(struct hl_device *hdev);
+	int (*resume)(struct hl_device *hdev);
+	void* (*dma_alloc_coherent)(struct hl_device *hdev, size_t size,
+					dma_addr_t *dma_handle, gfp_t flag);
+	void (*dma_free_coherent)(struct hl_device *hdev, size_t size,
+					void *cpu_addr, dma_addr_t dma_handle);
+};
 
 /*
  * FILE PRIVATE STRUCTURE
@@ -62,26 +145,78 @@ struct hl_fpriv {
  */
 #define HL_MAX_MINORS	256
 
+/*
+ * Registers read & write functions.
+ */
+
+u32 hl_rreg(struct hl_device *hdev, u32 reg);
+void hl_wreg(struct hl_device *hdev, u32 reg, u32 val);
+
+#define hl_poll_timeout(hdev, addr, val, cond, sleep_us, timeout_us) \
+	readl_poll_timeout(hdev->rmmio + addr, val, cond, sleep_us, timeout_us)
+
+#define RREG32(reg) hl_rreg(hdev, (reg))
+#define WREG32(reg, v) hl_wreg(hdev, (reg), (v))
+#define DREG32(reg) pr_info("REGISTER: " #reg " : 0x%08X\n",	\
+				hl_rreg(hdev, (reg)))
+
+#define WREG32_P(reg, val, mask)				\
+	do {							\
+		u32 tmp_ = RREG32(reg);				\
+		tmp_ &= (mask);					\
+		tmp_ |= ((val) & ~(mask));			\
+		WREG32(reg, tmp_);				\
+	} while (0)
+#define WREG32_AND(reg, and) WREG32_P(reg, 0, and)
+#define WREG32_OR(reg, or) WREG32_P(reg, or, ~(or))
+
+#define REG_FIELD_SHIFT(reg, field) reg##_##field##_SHIFT
+#define REG_FIELD_MASK(reg, field) reg##_##field##_MASK
+#define WREG32_FIELD(reg, field, val)	\
+	WREG32(mm##reg, (RREG32(mm##reg) & ~REG_FIELD_MASK(reg, field)) | \
+			(val) << REG_FIELD_SHIFT(reg, field))
+
 /**
  * struct hl_device - habanalabs device structure.
  * @pdev: pointer to PCI device, can be NULL in case of simulator device.
+ * @pcie_bar: array of available PCIe bars.
+ * @rmmio: configuration area address on SRAM.
  * @cdev: related char device.
  * @dev: realted kernel basic device structure.
  * @asic_name: ASIC specific nmae.
  * @asic_type: ASIC specific type.
+ * @dma_pool: DMA pool for small allocations.
+ * @cpu_accessible_dma_mem: KMD <-> ArmCP shared memory CPU address.
+ * @cpu_accessible_dma_address: KMD <-> ArmCP shared memory DMA address.
+ * @cpu_accessible_dma_pool: KMD <-> ArmCP shared memory pool.
+ * @asic_prop: ASIC specific immutable properties.
+ * @asic_funcs: ASIC specific functions.
+ * @asic_specific: ASIC specific information to use only from ASIC files.
  * @major: habanalabs KMD major.
  * @id: device minor.
  * @disabled: is device disabled.
  */
 struct hl_device {
 	struct pci_dev			*pdev;
+	void __iomem			*pcie_bar[6];
+	void __iomem			*rmmio;
 	struct cdev			cdev;
 	struct device			*dev;
 	char				asic_name[16];
 	enum hl_asic_type		asic_type;
+	struct dma_pool			*dma_pool;
+	void				*cpu_accessible_dma_mem;
+	dma_addr_t			cpu_accessible_dma_address;
+	struct gen_pool			*cpu_accessible_dma_pool;
+	struct asic_fixed_properties	asic_prop;
+	const struct hl_asic_funcs	*asic_funcs;
+	void				*asic_specific;
 	u32				major;
 	u16				id;
 	u8				disabled;
+
+	/* Parameters for bring-up */
+	u8				reset_pcilink;
 };
 
 
@@ -128,4 +263,6 @@ void hl_device_fini(struct hl_device *hdev);
 int hl_device_suspend(struct hl_device *hdev);
 int hl_device_resume(struct hl_device *hdev);
 
+void goya_set_asic_funcs(struct hl_device *hdev);
+
 #endif /* HABANALABSP_H_ */
diff --git a/drivers/misc/habanalabs/habanalabs_drv.c b/drivers/misc/habanalabs/habanalabs_drv.c
index a5251ed277d1..edf626b2b7b1 100644
--- a/drivers/misc/habanalabs/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/habanalabs_drv.c
@@ -122,6 +122,9 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
 
 	hdev->major = hl_major;
 
+	/* Parameters for bring-up - set them to defaults */
+	hdev->reset_pcilink = 0;
+
 	hdev->disabled = true;
 	hdev->pdev = pdev; /* can be NULL in case of simulator device */
 
diff --git a/drivers/misc/habanalabs/include/goya/goya.h b/drivers/misc/habanalabs/include/goya/goya.h
new file mode 100644
index 000000000000..614149efa412
--- /dev/null
+++ b/drivers/misc/habanalabs/include/goya/goya.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright 2016-2019 HabanaLabs, Ltd.
+ * All Rights Reserved.
+ *
+ */
+
+#ifndef GOYA_H
+#define GOYA_H
+
+#include "asic_reg/goya_regs.h"
+
+#include <linux/types.h>
+
+#define SRAM_CFG_BAR_ID		0
+#define MSIX_BAR_ID		2
+#define DDR_BAR_ID		4
+
+#define CFG_BAR_SIZE		0x10000000ull		/* 256MB */
+#define MSIX_BAR_SIZE		0x1000ull		/* 4KB */
+
+#define CFG_BASE		0x7FFC000000ull
+#define CFG_SIZE		0x4000000		/* 32MB CFG + 32MB DBG*/
+
+#define SRAM_BASE_ADDR		0x7FF0000000ull
+#define SRAM_SIZE		0x32A0000		/* 50.625MB */
+
+#define DRAM_PHYS_BASE		0x0ull
+
+#define HOST_PHYS_BASE		0x8000000000ull		/* 0.5TB */
+#define HOST_PHYS_SIZE		0x1000000000000ull	/* 0.25PB (48 bits) */
+
+#define GOYA_MSIX_ENTRIES	8
+
+#define QMAN_PQ_ENTRY_SIZE	16			/* Bytes */
+
+#define MAX_ASID		1024
+
+#define PROT_BITS_OFFS		0xF80
+
+#define DMA_MAX_NUM		5
+
+#define TPC_MAX_NUM		8
+
+#endif /* GOYA_H */
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
new file mode 100644
index 000000000000..a0ec23adf8f5
--- /dev/null
+++ b/include/uapi/misc/habanalabs.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
+ *
+ * Copyright 2016-2018 HabanaLabs, Ltd.
+ * All Rights Reserved.
+ *
+ */
+
+#ifndef HABANALABS_H_
+#define HABANALABS_H_
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+/*
+ * Defines that are asic-specific but constitutes as ABI between kernel driver
+ * and userspace
+ */
+#define GOYA_KMD_SRAM_RESERVED_SIZE_FROM_START	0x8000	/* 32KB */
+
+#endif /* HABANALABS_H_ */
-- 
cgit v1.2.3-59-g8ed1b


From be5d926b5c10430671ae975b80efb7a5652e3f9a Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Sat, 16 Feb 2019 00:39:15 +0200
Subject: habanalabs: add command buffer module

This patch adds the command buffer (CB) module, which allows the user to
create and destroy CBs and to map them to the user's process
address-space.

A command buffer is a memory blocks that reside in DMA-able address-space
and is physically contiguous so it can be accessed by the device without
MMU translation. The command buffer memory is allocated using the
coherent DMA API.

When creating a new CB, the IOCTL returns a handle of it, and the
user-space process needs to use that handle to mmap the buffer to get a VA
in the user's address-space.

Before destroying (freeing) a CB, the user must unmap the CB's VA using the
CB handle.

Each CB has a reference counter, which tracks its usage in command
submissions and also its mmaps (only a single mmap is allowed).

The driver maintains a pool of pre-allocated CBs in order to reduce
latency during command submissions. In case the pool is empty, the driver
will go to the slow-path of allocating a new CB, i.e. calling
dma_alloc_coherent.

Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/habanalabs/Makefile           |   3 +-
 drivers/misc/habanalabs/command_buffer.c   | 433 +++++++++++++++++++++++++++++
 drivers/misc/habanalabs/device.c           |  43 ++-
 drivers/misc/habanalabs/goya/goya.c        |  28 ++
 drivers/misc/habanalabs/habanalabs.h       |  86 ++++++
 drivers/misc/habanalabs/habanalabs_drv.c   |   2 +
 drivers/misc/habanalabs/habanalabs_ioctl.c |  99 +++++++
 include/uapi/misc/habanalabs.h             |  46 +++
 8 files changed, 738 insertions(+), 2 deletions(-)
 create mode 100644 drivers/misc/habanalabs/command_buffer.c
 create mode 100644 drivers/misc/habanalabs/habanalabs_ioctl.c

(limited to 'include/uapi')

diff --git a/drivers/misc/habanalabs/Makefile b/drivers/misc/habanalabs/Makefile
index 3ffbadc2ca01..2530c9b78ca4 100644
--- a/drivers/misc/habanalabs/Makefile
+++ b/drivers/misc/habanalabs/Makefile
@@ -4,7 +4,8 @@
 
 obj-m	:= habanalabs.o
 
-habanalabs-y := habanalabs_drv.o device.o context.o asid.o
+habanalabs-y := habanalabs_drv.o device.o context.o asid.o habanalabs_ioctl.o \
+		command_buffer.o
 
 include $(src)/goya/Makefile
 habanalabs-y += $(HL_GOYA_FILES)
diff --git a/drivers/misc/habanalabs/command_buffer.c b/drivers/misc/habanalabs/command_buffer.c
new file mode 100644
index 000000000000..e659ca3035e4
--- /dev/null
+++ b/drivers/misc/habanalabs/command_buffer.c
@@ -0,0 +1,433 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright 2016-2019 HabanaLabs, Ltd.
+ * All Rights Reserved.
+ */
+
+#include <uapi/misc/habanalabs.h>
+#include "habanalabs.h"
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+static void cb_fini(struct hl_device *hdev, struct hl_cb *cb)
+{
+	hdev->asic_funcs->dma_free_coherent(hdev, cb->size,
+			(void *) (uintptr_t) cb->kernel_address,
+			cb->bus_address);
+	kfree(cb);
+}
+
+static void cb_do_release(struct hl_device *hdev, struct hl_cb *cb)
+{
+	if (cb->is_pool) {
+		spin_lock(&hdev->cb_pool_lock);
+		list_add(&cb->pool_list, &hdev->cb_pool);
+		spin_unlock(&hdev->cb_pool_lock);
+	} else {
+		cb_fini(hdev, cb);
+	}
+}
+
+static void cb_release(struct kref *ref)
+{
+	struct hl_device *hdev;
+	struct hl_cb *cb;
+
+	cb = container_of(ref, struct hl_cb, refcount);
+	hdev = cb->hdev;
+
+	cb_do_release(hdev, cb);
+}
+
+static struct hl_cb *hl_cb_alloc(struct hl_device *hdev, u32 cb_size,
+					int ctx_id)
+{
+	struct hl_cb *cb;
+	void *p;
+
+	/*
+	 * We use of GFP_ATOMIC here because this function can be called from
+	 * the latency-sensitive code path for command submission. Due to H/W
+	 * limitations in some of the ASICs, the kernel must copy the user CB
+	 * that is designated for an external queue and actually enqueue
+	 * the kernel's copy. Hence, we must never sleep in this code section
+	 * and must use GFP_ATOMIC for all memory allocations.
+	 */
+	if (ctx_id == HL_KERNEL_ASID_ID)
+		cb = kzalloc(sizeof(*cb), GFP_ATOMIC);
+	else
+		cb = kzalloc(sizeof(*cb), GFP_KERNEL);
+
+	if (!cb)
+		return NULL;
+
+	if (ctx_id == HL_KERNEL_ASID_ID)
+		p = hdev->asic_funcs->dma_alloc_coherent(hdev, cb_size,
+						&cb->bus_address, GFP_ATOMIC);
+	else
+		p = hdev->asic_funcs->dma_alloc_coherent(hdev, cb_size,
+						&cb->bus_address,
+						GFP_USER | __GFP_ZERO);
+	if (!p) {
+		dev_err(hdev->dev,
+			"failed to allocate %d of dma memory for CB\n",
+			cb_size);
+		kfree(cb);
+		return NULL;
+	}
+
+	cb->kernel_address = (u64) (uintptr_t) p;
+	cb->size = cb_size;
+
+	return cb;
+}
+
+int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
+			u32 cb_size, u64 *handle, int ctx_id)
+{
+	struct hl_cb *cb;
+	bool alloc_new_cb = true;
+	int rc;
+
+	if (hdev->disabled) {
+		dev_warn_ratelimited(hdev->dev,
+			"Device is disabled. Can't create new CBs\n");
+		rc = -EBUSY;
+		goto out_err;
+	}
+
+	if (cb_size > HL_MAX_CB_SIZE) {
+		dev_err(hdev->dev,
+			"CB size %d must be less then %d\n",
+			cb_size, HL_MAX_CB_SIZE);
+		rc = -EINVAL;
+		goto out_err;
+	}
+
+	/* Minimum allocation must be PAGE SIZE */
+	if (cb_size < PAGE_SIZE)
+		cb_size = PAGE_SIZE;
+
+	if (ctx_id == HL_KERNEL_ASID_ID &&
+			cb_size <= hdev->asic_prop.cb_pool_cb_size) {
+
+		spin_lock(&hdev->cb_pool_lock);
+		if (!list_empty(&hdev->cb_pool)) {
+			cb = list_first_entry(&hdev->cb_pool, typeof(*cb),
+					pool_list);
+			list_del(&cb->pool_list);
+			spin_unlock(&hdev->cb_pool_lock);
+			alloc_new_cb = false;
+		} else {
+			spin_unlock(&hdev->cb_pool_lock);
+			dev_dbg(hdev->dev, "CB pool is empty\n");
+		}
+	}
+
+	if (alloc_new_cb) {
+		cb = hl_cb_alloc(hdev, cb_size, ctx_id);
+		if (!cb) {
+			rc = -ENOMEM;
+			goto out_err;
+		}
+	}
+
+	cb->hdev = hdev;
+	cb->ctx_id = ctx_id;
+
+	spin_lock(&mgr->cb_lock);
+	rc = idr_alloc(&mgr->cb_handles, cb, 1, 0, GFP_ATOMIC);
+	spin_unlock(&mgr->cb_lock);
+
+	if (rc < 0) {
+		dev_err(hdev->dev, "Failed to allocate IDR for a new CB\n");
+		goto release_cb;
+	}
+
+	cb->id = rc;
+
+	kref_init(&cb->refcount);
+	spin_lock_init(&cb->lock);
+
+	/*
+	 * idr is 32-bit so we can safely OR it with a mask that is above
+	 * 32 bit
+	 */
+	*handle = cb->id | HL_MMAP_CB_MASK;
+	*handle <<= PAGE_SHIFT;
+
+	return 0;
+
+release_cb:
+	cb_do_release(hdev, cb);
+out_err:
+	*handle = 0;
+
+	return rc;
+}
+
+int hl_cb_destroy(struct hl_device *hdev, struct hl_cb_mgr *mgr, u64 cb_handle)
+{
+	struct hl_cb *cb;
+	u32 handle;
+	int rc = 0;
+
+	/*
+	 * handle was given to user to do mmap, I need to shift it back to
+	 * how the idr module gave it to me
+	 */
+	cb_handle >>= PAGE_SHIFT;
+	handle = (u32) cb_handle;
+
+	spin_lock(&mgr->cb_lock);
+
+	cb = idr_find(&mgr->cb_handles, handle);
+	if (cb) {
+		idr_remove(&mgr->cb_handles, handle);
+		spin_unlock(&mgr->cb_lock);
+		kref_put(&cb->refcount, cb_release);
+	} else {
+		spin_unlock(&mgr->cb_lock);
+		dev_err(hdev->dev,
+			"CB destroy failed, no match to handle 0x%x\n", handle);
+		rc = -EINVAL;
+	}
+
+	return rc;
+}
+
+int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data)
+{
+	union hl_cb_args *args = data;
+	struct hl_device *hdev = hpriv->hdev;
+	u64 handle;
+	int rc;
+
+	switch (args->in.op) {
+	case HL_CB_OP_CREATE:
+		rc = hl_cb_create(hdev, &hpriv->cb_mgr, args->in.cb_size,
+					&handle, hpriv->ctx->asid);
+		memset(args, 0, sizeof(*args));
+		args->out.cb_handle = handle;
+		break;
+	case HL_CB_OP_DESTROY:
+		rc = hl_cb_destroy(hdev, &hpriv->cb_mgr,
+					args->in.cb_handle);
+		break;
+	default:
+		rc = -ENOTTY;
+		break;
+	}
+
+	return rc;
+}
+
+static void cb_vm_close(struct vm_area_struct *vma)
+{
+	struct hl_cb *cb = (struct hl_cb *) vma->vm_private_data;
+
+	cb->mmap_size -= vma->vm_end - vma->vm_start;
+
+	if (cb->mmap_size)
+		return;
+
+	spin_lock(&cb->lock);
+	cb->mmap = false;
+	spin_unlock(&cb->lock);
+
+	hl_cb_put(cb);
+	vma->vm_private_data = NULL;
+}
+
+static const struct vm_operations_struct cb_vm_ops = {
+	.close = cb_vm_close
+};
+
+int hl_cb_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma)
+{
+	struct hl_device *hdev = hpriv->hdev;
+	struct hl_cb *cb;
+	phys_addr_t address;
+	u32 handle;
+	int rc;
+
+	handle = vma->vm_pgoff;
+
+	/* reference was taken here */
+	cb = hl_cb_get(hdev, &hpriv->cb_mgr, handle);
+	if (!cb) {
+		dev_err(hdev->dev,
+			"CB mmap failed, no match to handle %d\n", handle);
+		return -EINVAL;
+	}
+
+	/* Validation check */
+	if ((vma->vm_end - vma->vm_start) != cb->size) {
+		dev_err(hdev->dev,
+			"CB mmap failed, mmap size 0x%lx != 0x%x cb size\n",
+			vma->vm_end - vma->vm_start, cb->size);
+		rc = -EINVAL;
+		goto put_cb;
+	}
+
+	spin_lock(&cb->lock);
+
+	if (cb->mmap) {
+		dev_err(hdev->dev,
+			"CB mmap failed, CB already mmaped to user\n");
+		rc = -EINVAL;
+		goto release_lock;
+	}
+
+	cb->mmap = true;
+
+	spin_unlock(&cb->lock);
+
+	vma->vm_ops = &cb_vm_ops;
+
+	/*
+	 * Note: We're transferring the cb reference to
+	 * vma->vm_private_data here.
+	 */
+
+	vma->vm_private_data = cb;
+
+	/* Calculate address for CB */
+	address = virt_to_phys((void *) (uintptr_t) cb->kernel_address);
+
+	rc = hdev->asic_funcs->cb_mmap(hdev, vma, cb->kernel_address,
+					address, cb->size);
+
+	if (rc) {
+		spin_lock(&cb->lock);
+		cb->mmap = false;
+		goto release_lock;
+	}
+
+	cb->mmap_size = cb->size;
+
+	return 0;
+
+release_lock:
+	spin_unlock(&cb->lock);
+put_cb:
+	hl_cb_put(cb);
+	return rc;
+}
+
+struct hl_cb *hl_cb_get(struct hl_device *hdev, struct hl_cb_mgr *mgr,
+			u32 handle)
+{
+	struct hl_cb *cb;
+
+	spin_lock(&mgr->cb_lock);
+	cb = idr_find(&mgr->cb_handles, handle);
+
+	if (!cb) {
+		spin_unlock(&mgr->cb_lock);
+		dev_warn(hdev->dev,
+			"CB get failed, no match to handle %d\n", handle);
+		return NULL;
+	}
+
+	kref_get(&cb->refcount);
+
+	spin_unlock(&mgr->cb_lock);
+
+	return cb;
+
+}
+
+void hl_cb_put(struct hl_cb *cb)
+{
+	kref_put(&cb->refcount, cb_release);
+}
+
+void hl_cb_mgr_init(struct hl_cb_mgr *mgr)
+{
+	spin_lock_init(&mgr->cb_lock);
+	idr_init(&mgr->cb_handles);
+}
+
+void hl_cb_mgr_fini(struct hl_device *hdev, struct hl_cb_mgr *mgr)
+{
+	struct hl_cb *cb;
+	struct idr *idp;
+	u32 id;
+
+	idp = &mgr->cb_handles;
+
+	idr_for_each_entry(idp, cb, id) {
+		if (kref_put(&cb->refcount, cb_release) != 1)
+			dev_err(hdev->dev,
+				"CB %d for CTX ID %d is still alive\n",
+				id, cb->ctx_id);
+	}
+
+	idr_destroy(&mgr->cb_handles);
+}
+
+struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size)
+{
+	u64 cb_handle;
+	struct hl_cb *cb;
+	int rc;
+
+	rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, cb_size, &cb_handle,
+			HL_KERNEL_ASID_ID);
+	if (rc) {
+		dev_err(hdev->dev, "Failed to allocate CB for KMD %d\n", rc);
+		return NULL;
+	}
+
+	cb_handle >>= PAGE_SHIFT;
+	cb = hl_cb_get(hdev, &hdev->kernel_cb_mgr, (u32) cb_handle);
+	/* hl_cb_get should never fail here so use kernel WARN */
+	WARN(!cb, "Kernel CB handle invalid 0x%x\n", (u32) cb_handle);
+	if (!cb)
+		goto destroy_cb;
+
+	return cb;
+
+destroy_cb:
+	hl_cb_destroy(hdev, &hdev->kernel_cb_mgr, cb_handle << PAGE_SHIFT);
+
+	return NULL;
+}
+
+int hl_cb_pool_init(struct hl_device *hdev)
+{
+	struct hl_cb *cb;
+	int i;
+
+	INIT_LIST_HEAD(&hdev->cb_pool);
+	spin_lock_init(&hdev->cb_pool_lock);
+
+	for (i = 0 ; i < hdev->asic_prop.cb_pool_cb_cnt ; i++) {
+		cb = hl_cb_alloc(hdev, hdev->asic_prop.cb_pool_cb_size,
+				HL_KERNEL_ASID_ID);
+		if (cb) {
+			cb->is_pool = true;
+			list_add(&cb->pool_list, &hdev->cb_pool);
+		} else {
+			hl_cb_pool_fini(hdev);
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
+int hl_cb_pool_fini(struct hl_device *hdev)
+{
+	struct hl_cb *cb, *tmp;
+
+	list_for_each_entry_safe(cb, tmp, &hdev->cb_pool, pool_list) {
+		list_del(&cb->pool_list);
+		cb_fini(hdev, cb);
+	}
+
+	return 0;
+}
diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c
index 2423588ecf22..77f0018e7aa9 100644
--- a/drivers/misc/habanalabs/device.c
+++ b/drivers/misc/habanalabs/device.c
@@ -52,6 +52,7 @@ static int hl_device_release(struct inode *inode, struct file *filp)
 {
 	struct hl_fpriv *hpriv = filp->private_data;
 
+	hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr);
 	hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
 
 	filp->private_data = NULL;
@@ -61,10 +62,34 @@ static int hl_device_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+/*
+ * hl_mmap - mmap function for habanalabs device
+ *
+ * @*filp: pointer to file structure
+ * @*vma: pointer to vm_area_struct of the process
+ *
+ * Called when process does an mmap on habanalabs device. Call the device's mmap
+ * function at the end of the common code.
+ */
+static int hl_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct hl_fpriv *hpriv = filp->private_data;
+
+	if ((vma->vm_pgoff & HL_MMAP_CB_MASK) == HL_MMAP_CB_MASK) {
+		vma->vm_pgoff ^= HL_MMAP_CB_MASK;
+		return hl_cb_mmap(hpriv, vma);
+	}
+
+	return hpriv->hdev->asic_funcs->mmap(hpriv, vma);
+}
+
 static const struct file_operations hl_ops = {
 	.owner = THIS_MODULE,
 	.open = hl_device_open,
-	.release = hl_device_release
+	.release = hl_device_release,
+	.mmap = hl_mmap,
+	.unlocked_ioctl = hl_ioctl,
+	.compat_ioctl = hl_ioctl
 };
 
 /*
@@ -149,6 +174,8 @@ static int device_early_init(struct hl_device *hdev)
 	if (rc)
 		goto early_fini;
 
+	hl_cb_mgr_init(&hdev->kernel_cb_mgr);
+
 	mutex_init(&hdev->fd_open_cnt_lock);
 	atomic_set(&hdev->fd_open_cnt, 0);
 
@@ -170,6 +197,8 @@ early_fini:
 static void device_early_fini(struct hl_device *hdev)
 {
 
+	hl_cb_mgr_fini(hdev, &hdev->kernel_cb_mgr);
+
 	hl_asid_fini(hdev);
 
 	if (hdev->asic_funcs->early_fini)
@@ -284,11 +313,21 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
 		goto free_ctx;
 	}
 
+	rc = hl_cb_pool_init(hdev);
+	if (rc) {
+		dev_err(hdev->dev, "failed to initialize CB pool\n");
+		goto release_ctx;
+	}
+
 	dev_notice(hdev->dev,
 		"Successfully added device to habanalabs driver\n");
 
 	return 0;
 
+release_ctx:
+	if (hl_ctx_put(hdev->kernel_ctx) != 1)
+		dev_err(hdev->dev,
+			"kernel ctx is still alive on initialization failure\n");
 free_ctx:
 	kfree(hdev->kernel_ctx);
 sw_fini:
@@ -325,6 +364,8 @@ void hl_device_fini(struct hl_device *hdev)
 	/* Mark device as disabled */
 	hdev->disabled = true;
 
+	hl_cb_pool_fini(hdev);
+
 	/* Release kernel context */
 	if ((hdev->kernel_ctx) && (hl_ctx_put(hdev->kernel_ctx) != 1))
 		dev_err(hdev->dev, "kernel ctx is still alive\n");
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index ceaffa9afd83..88b047f7d85d 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -82,6 +82,9 @@
 
 #define GOYA_MAX_INITIATORS		20
 
+#define GOYA_CB_POOL_CB_CNT		512
+#define GOYA_CB_POOL_CB_SIZE		0x20000		/* 128KB */
+
 static void goya_get_fixed_properties(struct hl_device *hdev)
 {
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
@@ -109,6 +112,8 @@ static void goya_get_fixed_properties(struct hl_device *hdev)
 	prop->tpc_enabled_mask = TPC_ENABLED_MASK;
 
 	prop->high_pll = PLL_HIGH_DEFAULT;
+	prop->cb_pool_cb_cnt = GOYA_CB_POOL_CB_CNT;
+	prop->cb_pool_cb_size = GOYA_CB_POOL_CB_SIZE;
 }
 
 /*
@@ -597,6 +602,27 @@ int goya_resume(struct hl_device *hdev)
 	return 0;
 }
 
+int goya_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma)
+{
+	return -EINVAL;
+}
+
+int goya_cb_mmap(struct hl_device *hdev, struct vm_area_struct *vma,
+		u64 kaddress, phys_addr_t paddress, u32 size)
+{
+	int rc;
+
+	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP |
+			VM_DONTCOPY | VM_NORESERVE;
+
+	rc = remap_pfn_range(vma, vma->vm_start, paddress >> PAGE_SHIFT,
+				size, vma->vm_page_prot);
+	if (rc)
+		dev_err(hdev->dev, "remap_pfn_range error %d", rc);
+
+	return rc;
+}
+
 void *goya_dma_alloc_coherent(struct hl_device *hdev, size_t size,
 					dma_addr_t *dma_handle, gfp_t flags)
 {
@@ -616,6 +642,8 @@ static const struct hl_asic_funcs goya_funcs = {
 	.sw_fini = goya_sw_fini,
 	.suspend = goya_suspend,
 	.resume = goya_resume,
+	.mmap = goya_mmap,
+	.cb_mmap = goya_cb_mmap,
 	.dma_alloc_coherent = goya_dma_alloc_coherent,
 	.dma_free_coherent = goya_dma_free_coherent,
 };
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index ca8171ca3a04..63741c7224b6 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -14,9 +14,12 @@
 
 #define HL_NAME				"habanalabs"
 
+#define HL_MMAP_CB_MASK			(0x8000000000000000ull >> PAGE_SHIFT)
+
 #define HL_MAX_QUEUES			128
 
 struct hl_device;
+struct hl_fpriv;
 
 
 /**
@@ -44,6 +47,8 @@ struct hl_device;
  * @max_asid: maximum number of open contexts (ASIDs).
  * @completion_queues_count: number of completion queues.
  * @high_pll: high PLL frequency used by the device.
+ * @cb_pool_cb_cnt: number of CBs in the CB pool.
+ * @cb_pool_cb_size: size of each CB in the CB pool.
  * @tpc_enabled_mask: which TPCs are enabled.
  */
 struct asic_fixed_properties {
@@ -64,11 +69,60 @@ struct asic_fixed_properties {
 	u32			sram_size;
 	u32			max_asid;
 	u32			high_pll;
+	u32			cb_pool_cb_cnt;
+	u32			cb_pool_cb_size;
 	u8			completion_queues_count;
 	u8			tpc_enabled_mask;
 };
 
 
+/*
+ * Command Buffers
+ */
+
+#define HL_MAX_CB_SIZE		0x200000	/* 2MB */
+
+/**
+ * struct hl_cb_mgr - describes a Command Buffer Manager.
+ * @cb_lock: protects cb_handles.
+ * @cb_handles: an idr to hold all command buffer handles.
+ */
+struct hl_cb_mgr {
+	spinlock_t		cb_lock;
+	struct idr		cb_handles; /* protected by cb_lock */
+};
+
+/**
+ * struct hl_cb - describes a Command Buffer.
+ * @refcount: reference counter for usage of the CB.
+ * @hdev: pointer to device this CB belongs to.
+ * @lock: spinlock to protect mmap/cs flows.
+ * @pool_list: node in pool list of command buffers.
+ * @kernel_address: Holds the CB's kernel virtual address.
+ * @bus_address: Holds the CB's DMA address.
+ * @mmap_size: Holds the CB's size that was mmaped.
+ * @size: holds the CB's size.
+ * @id: the CB's ID.
+ * @ctx_id: holds the ID of the owner's context.
+ * @mmap: true if the CB is currently mmaped to user.
+ * @is_pool: true if CB was acquired from the pool, false otherwise.
+ */
+struct hl_cb {
+	struct kref		refcount;
+	struct hl_device	*hdev;
+	spinlock_t		lock;
+	struct list_head	pool_list;
+	u64			kernel_address;
+	dma_addr_t		bus_address;
+	u32			mmap_size;
+	u32			size;
+	u32			id;
+	u32			ctx_id;
+	u8			mmap;
+	u8			is_pool;
+};
+
+
 #define HL_QUEUE_LENGTH			256
 
 
@@ -97,6 +151,8 @@ enum hl_asic_type {
  * @sw_fini: tears down driver state, does not configure H/W.
  * @suspend: handles IP specific H/W or SW changes for suspend.
  * @resume: handles IP specific H/W or SW changes for resume.
+ * @mmap: mmap function, does nothing.
+ * @cb_mmap: maps a CB.
  * @dma_alloc_coherent: Allocate coherent DMA memory by calling
  *                      dma_alloc_coherent(). This is ASIC function because its
  *                      implementation is not trivial when the driver is loaded
@@ -113,6 +169,9 @@ struct hl_asic_funcs {
 	int (*sw_fini)(struct hl_device *hdev);
 	int (*suspend)(struct hl_device *hdev);
 	int (*resume)(struct hl_device *hdev);
+	int (*mmap)(struct hl_fpriv *hpriv, struct vm_area_struct *vma);
+	int (*cb_mmap)(struct hl_device *hdev, struct vm_area_struct *vma,
+			u64 kaddress, phys_addr_t paddress, u32 size);
 	void* (*dma_alloc_coherent)(struct hl_device *hdev, size_t size,
 					dma_addr_t *dma_handle, gfp_t flag);
 	void (*dma_free_coherent)(struct hl_device *hdev, size_t size,
@@ -163,6 +222,7 @@ struct hl_ctx_mgr {
  * @taskpid: current process ID.
  * @ctx: current executing context.
  * @ctx_mgr: context manager to handle multiple context for this FD.
+ * @cb_mgr: command buffer manager to handle multiple buffers for this FD.
  * @refcount: number of related contexts.
  */
 struct hl_fpriv {
@@ -171,6 +231,7 @@ struct hl_fpriv {
 	struct pid		*taskpid;
 	struct hl_ctx		*ctx; /* TODO: remove for multiple ctx */
 	struct hl_ctx_mgr	ctx_mgr;
+	struct hl_cb_mgr	cb_mgr;
 	struct kref		refcount;
 };
 
@@ -225,6 +286,7 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val);
  * @asic_name: ASIC specific nmae.
  * @asic_type: ASIC specific type.
  * @kernel_ctx: KMD context structure.
+ * @kernel_cb_mgr: command buffer manager for creating/destroying/handling CGs.
  * @dma_pool: DMA pool for small allocations.
  * @cpu_accessible_dma_mem: KMD <-> ArmCP shared memory CPU address.
  * @cpu_accessible_dma_address: KMD <-> ArmCP shared memory DMA address.
@@ -240,6 +302,8 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val);
  * @asic_prop: ASIC specific immutable properties.
  * @asic_funcs: ASIC specific functions.
  * @asic_specific: ASIC specific information to use only from ASIC files.
+ * @cb_pool: list of preallocated CBs.
+ * @cb_pool_lock: protects the CB pool.
  * @user_ctx: current user context executing.
  * @fd_open_cnt: number of open user processes.
  * @major: habanalabs KMD major.
@@ -255,6 +319,7 @@ struct hl_device {
 	char				asic_name[16];
 	enum hl_asic_type		asic_type;
 	struct hl_ctx			*kernel_ctx;
+	struct hl_cb_mgr		kernel_cb_mgr;
 	struct dma_pool			*dma_pool;
 	void				*cpu_accessible_dma_mem;
 	dma_addr_t			cpu_accessible_dma_address;
@@ -266,6 +331,10 @@ struct hl_device {
 	struct asic_fixed_properties	asic_prop;
 	const struct hl_asic_funcs	*asic_funcs;
 	void				*asic_specific;
+
+	struct list_head		cb_pool;
+	spinlock_t			cb_pool_lock;
+
 	/* TODO: remove user_ctx for multiple process support */
 	struct hl_ctx			*user_ctx;
 	atomic_t			fd_open_cnt;
@@ -334,6 +403,23 @@ int hl_device_resume(struct hl_device *hdev);
 void hl_hpriv_get(struct hl_fpriv *hpriv);
 void hl_hpriv_put(struct hl_fpriv *hpriv);
 
+int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr, u32 cb_size,
+		u64 *handle, int ctx_id);
+int hl_cb_destroy(struct hl_device *hdev, struct hl_cb_mgr *mgr, u64 cb_handle);
+int hl_cb_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma);
+struct hl_cb *hl_cb_get(struct hl_device *hdev,	struct hl_cb_mgr *mgr,
+			u32 handle);
+void hl_cb_put(struct hl_cb *cb);
+void hl_cb_mgr_init(struct hl_cb_mgr *mgr);
+void hl_cb_mgr_fini(struct hl_device *hdev, struct hl_cb_mgr *mgr);
+struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size);
+int hl_cb_pool_init(struct hl_device *hdev);
+int hl_cb_pool_fini(struct hl_device *hdev);
+
 void goya_set_asic_funcs(struct hl_device *hdev);
 
+/* IOCTLs */
+long hl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
+int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data);
+
 #endif /* HABANALABSP_H_ */
diff --git a/drivers/misc/habanalabs/habanalabs_drv.c b/drivers/misc/habanalabs/habanalabs_drv.c
index 6fddd801aca3..8628d1d8f037 100644
--- a/drivers/misc/habanalabs/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/habanalabs_drv.c
@@ -116,6 +116,7 @@ int hl_device_open(struct inode *inode, struct file *filp)
 	kref_init(&hpriv->refcount);
 	nonseekable_open(inode, filp);
 
+	hl_cb_mgr_init(&hpriv->cb_mgr);
 	hl_ctx_mgr_init(&hpriv->ctx_mgr);
 
 	rc = hl_ctx_create(hdev, hpriv);
@@ -131,6 +132,7 @@ int hl_device_open(struct inode *inode, struct file *filp)
 out_err:
 	filp->private_data = NULL;
 	hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
+	hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr);
 	kfree(hpriv);
 
 close_device:
diff --git a/drivers/misc/habanalabs/habanalabs_ioctl.c b/drivers/misc/habanalabs/habanalabs_ioctl.c
new file mode 100644
index 000000000000..e53265fe9543
--- /dev/null
+++ b/drivers/misc/habanalabs/habanalabs_ioctl.c
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright 2016-2019 HabanaLabs, Ltd.
+ * All Rights Reserved.
+ */
+
+#include <uapi/misc/habanalabs.h>
+#include "habanalabs.h"
+
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+
+#define HL_IOCTL_DEF(ioctl, _func) \
+	[_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func}
+
+static const struct hl_ioctl_desc hl_ioctls[] = {
+	HL_IOCTL_DEF(HL_IOCTL_CB, hl_cb_ioctl)
+};
+
+#define HL_CORE_IOCTL_COUNT	ARRAY_SIZE(hl_ioctls)
+
+long hl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
+{
+	struct hl_fpriv *hpriv = filep->private_data;
+	struct hl_device *hdev = hpriv->hdev;
+	hl_ioctl_t *func;
+	const struct hl_ioctl_desc *ioctl = NULL;
+	unsigned int nr = _IOC_NR(cmd);
+	char stack_kdata[128] = {0};
+	char *kdata = NULL;
+	unsigned int usize, asize;
+	int retcode;
+
+	if ((nr >= HL_COMMAND_START) && (nr < HL_COMMAND_END)) {
+		u32 hl_size;
+
+		ioctl = &hl_ioctls[nr];
+
+		hl_size = _IOC_SIZE(ioctl->cmd);
+		usize = asize = _IOC_SIZE(cmd);
+		if (hl_size > asize)
+			asize = hl_size;
+
+		cmd = ioctl->cmd;
+	} else {
+		dev_err(hdev->dev, "invalid ioctl: pid=%d, nr=0x%02x\n",
+			  task_pid_nr(current), nr);
+		return -ENOTTY;
+	}
+
+	/* Do not trust userspace, use our own definition */
+	func = ioctl->func;
+
+	if (unlikely(!func)) {
+		dev_dbg(hdev->dev, "no function\n");
+		retcode = -ENOTTY;
+		goto out_err;
+	}
+
+	if (cmd & (IOC_IN | IOC_OUT)) {
+		if (asize <= sizeof(stack_kdata)) {
+			kdata = stack_kdata;
+		} else {
+			kdata = kzalloc(asize, GFP_KERNEL);
+			if (!kdata) {
+				retcode = -ENOMEM;
+				goto out_err;
+			}
+		}
+	}
+
+	if (cmd & IOC_IN) {
+		if (copy_from_user(kdata, (void __user *)arg, usize)) {
+			retcode = -EFAULT;
+			goto out_err;
+		}
+	} else if (cmd & IOC_OUT) {
+		memset(kdata, 0, usize);
+	}
+
+	retcode = func(hpriv, kdata);
+
+	if (cmd & IOC_OUT)
+		if (copy_to_user((void __user *)arg, kdata, usize))
+			retcode = -EFAULT;
+
+out_err:
+	if (retcode)
+		dev_dbg(hdev->dev,
+			"error in ioctl: pid=%d, cmd=0x%02x, nr=0x%02x\n",
+			  task_pid_nr(current), cmd, nr);
+
+	if (kdata != stack_kdata)
+		kfree(kdata);
+
+	return retcode;
+}
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index a0ec23adf8f5..a8edfd3e9c95 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -17,4 +17,50 @@
  */
 #define GOYA_KMD_SRAM_RESERVED_SIZE_FROM_START	0x8000	/* 32KB */
 
+/* Opcode to create a new command buffer */
+#define HL_CB_OP_CREATE		0
+/* Opcode to destroy previously created command buffer */
+#define HL_CB_OP_DESTROY	1
+
+struct hl_cb_in {
+	/* Handle of CB or 0 if we want to create one */
+	__u64 cb_handle;
+	/* HL_CB_OP_* */
+	__u32 op;
+	/* Size of CB. Minimum requested size must be PAGE_SIZE */
+	__u32 cb_size;
+	/* Context ID - Currently not in use */
+	__u32 ctx_id;
+	__u32 pad;
+};
+
+struct hl_cb_out {
+	/* Handle of CB */
+	__u64 cb_handle;
+};
+
+union hl_cb_args {
+	struct hl_cb_in in;
+	struct hl_cb_out out;
+};
+
+/*
+ * Command Buffer
+ * - Request a Command Buffer
+ * - Destroy a Command Buffer
+ *
+ * The command buffers are memory blocks that reside in DMA-able address
+ * space and are physically contiguous so they can be accessed by the device
+ * directly. They are allocated using the coherent DMA API.
+ *
+ * When creating a new CB, the IOCTL returns a handle of it, and the user-space
+ * process needs to use that handle to mmap the buffer so it can access them.
+ *
+ */
+#define HL_IOCTL_CB		\
+		_IOWR('H', 0x02, union hl_cb_args)
+
+#define HL_COMMAND_START	0x02
+#define HL_COMMAND_END		0x03
+
 #endif /* HABANALABS_H_ */
-- 
cgit v1.2.3-59-g8ed1b


From 9494a8dd8d22cbff8ce358aaa223fffe1b070cb0 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Sat, 16 Feb 2019 00:39:17 +0200
Subject: habanalabs: add h/w queues module

This patch adds the H/W queues module and the code to initialize Goya's
various compute and DMA engines and their queues.

Goya has 5 DMA channels, 8 TPC engines and a single MME engine. For each
channel/engine, there is a H/W queue logic which is used to pass commands
from the user to the H/W. That logic is called QMAN.

There are two types of QMANs: external and internal. The DMA QMANs are
considered external while the TPC and MME QMANs are considered internal.
For each external queue there is a completion queue, which is located on
the Host memory.

The differences between external and internal QMANs are:

1. The location of the queue's memory. External QMANs are located on the
   Host memory while internal QMANs are located on the on-chip memory.

2. The external QMAN write an entry to a completion queue and sends an
   MSI-X interrupt upon completion of a command buffer that was given to
   it. The internal QMAN doesn't do that.

Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/habanalabs/Makefile                   |    2 +-
 drivers/misc/habanalabs/device.c                   |   75 +-
 drivers/misc/habanalabs/goya/goya.c                | 1527 ++++++++++++++++++--
 drivers/misc/habanalabs/goya/goyaP.h               |    7 +
 drivers/misc/habanalabs/habanalabs.h               |  174 ++-
 drivers/misc/habanalabs/habanalabs_drv.c           |    5 +
 drivers/misc/habanalabs/hw_queue.c                 |  400 +++++
 drivers/misc/habanalabs/include/armcp_if.h         |  292 ++++
 .../habanalabs/include/goya/goya_async_events.h    |  186 +++
 .../misc/habanalabs/include/goya/goya_packets.h    |  129 ++
 drivers/misc/habanalabs/include/qman_if.h          |   56 +
 drivers/misc/habanalabs/irq.c                      |  149 ++
 include/uapi/misc/habanalabs.h                     |   29 +
 13 files changed, 2915 insertions(+), 116 deletions(-)
 create mode 100644 drivers/misc/habanalabs/hw_queue.c
 create mode 100644 drivers/misc/habanalabs/include/goya/goya_async_events.h
 create mode 100644 drivers/misc/habanalabs/include/goya/goya_packets.h
 create mode 100644 drivers/misc/habanalabs/include/qman_if.h
 create mode 100644 drivers/misc/habanalabs/irq.c

(limited to 'include/uapi')

diff --git a/drivers/misc/habanalabs/Makefile b/drivers/misc/habanalabs/Makefile
index 2530c9b78ca4..c07f3ccb57dc 100644
--- a/drivers/misc/habanalabs/Makefile
+++ b/drivers/misc/habanalabs/Makefile
@@ -5,7 +5,7 @@
 obj-m	:= habanalabs.o
 
 habanalabs-y := habanalabs_drv.o device.o context.o asid.o habanalabs_ioctl.o \
-		command_buffer.o
+		command_buffer.o hw_queue.o irq.o
 
 include $(src)/goya/Makefile
 habanalabs-y += $(HL_GOYA_FILES)
diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c
index 5eefc2952be5..06e2b7f32499 100644
--- a/drivers/misc/habanalabs/device.c
+++ b/drivers/misc/habanalabs/device.c
@@ -174,13 +174,23 @@ static int device_early_init(struct hl_device *hdev)
 	if (rc)
 		goto early_fini;
 
+	hdev->cq_wq = alloc_workqueue("hl-free-jobs", WQ_UNBOUND, 0);
+	if (hdev->cq_wq == NULL) {
+		dev_err(hdev->dev, "Failed to allocate CQ workqueue\n");
+		rc = -ENOMEM;
+		goto asid_fini;
+	}
+
 	hl_cb_mgr_init(&hdev->kernel_cb_mgr);
 
 	mutex_init(&hdev->fd_open_cnt_lock);
+	mutex_init(&hdev->send_cpu_message_lock);
 	atomic_set(&hdev->fd_open_cnt, 0);
 
 	return 0;
 
+asid_fini:
+	hl_asid_fini(hdev);
 early_fini:
 	if (hdev->asic_funcs->early_fini)
 		hdev->asic_funcs->early_fini(hdev);
@@ -196,9 +206,12 @@ early_fini:
  */
 static void device_early_fini(struct hl_device *hdev)
 {
+	mutex_destroy(&hdev->send_cpu_message_lock);
 
 	hl_cb_mgr_fini(hdev, &hdev->kernel_cb_mgr);
 
+	destroy_workqueue(hdev->cq_wq);
+
 	hl_asid_fini(hdev);
 
 	if (hdev->asic_funcs->early_fini)
@@ -277,7 +290,7 @@ int hl_device_resume(struct hl_device *hdev)
  */
 int hl_device_init(struct hl_device *hdev, struct class *hclass)
 {
-	int rc;
+	int i, rc, cq_ready_cnt;
 
 	/* Create device */
 	rc = device_setup_cdev(hdev, hclass, hdev->id, &hl_ops);
@@ -298,11 +311,48 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
 	if (rc)
 		goto early_fini;
 
+	/*
+	 * Initialize the H/W queues. Must be done before hw_init, because
+	 * there the addresses of the kernel queue are being written to the
+	 * registers of the device
+	 */
+	rc = hl_hw_queues_create(hdev);
+	if (rc) {
+		dev_err(hdev->dev, "failed to initialize kernel queues\n");
+		goto sw_fini;
+	}
+
+	/*
+	 * Initialize the completion queues. Must be done before hw_init,
+	 * because there the addresses of the completion queues are being
+	 * passed as arguments to request_irq
+	 */
+	hdev->completion_queue =
+			kcalloc(hdev->asic_prop.completion_queues_count,
+				sizeof(*hdev->completion_queue), GFP_KERNEL);
+
+	if (!hdev->completion_queue) {
+		dev_err(hdev->dev, "failed to allocate completion queues\n");
+		rc = -ENOMEM;
+		goto hw_queues_destroy;
+	}
+
+	for (i = 0, cq_ready_cnt = 0;
+			i < hdev->asic_prop.completion_queues_count;
+			i++, cq_ready_cnt++) {
+		rc = hl_cq_init(hdev, &hdev->completion_queue[i], i);
+		if (rc) {
+			dev_err(hdev->dev,
+				"failed to initialize completion queue\n");
+			goto cq_fini;
+		}
+	}
+
 	/* Allocate the kernel context */
 	hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx), GFP_KERNEL);
 	if (!hdev->kernel_ctx) {
 		rc = -ENOMEM;
-		goto sw_fini;
+		goto cq_fini;
 	}
 
 	hdev->user_ctx = NULL;
@@ -328,6 +378,14 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
 
 	hdev->disabled = false;
 
+	/* Check that the communication with the device is working */
+	rc = hdev->asic_funcs->test_queues(hdev);
+	if (rc) {
+		dev_err(hdev->dev, "Failed to detect if device is alive\n");
+		rc = 0;
+		goto out_disabled;
+	}
+
 	dev_notice(hdev->dev,
 		"Successfully added device to habanalabs driver\n");
 
@@ -339,6 +397,12 @@ release_ctx:
 			"kernel ctx is still alive on initialization failure\n");
 free_ctx:
 	kfree(hdev->kernel_ctx);
+cq_fini:
+	for (i = 0 ; i < cq_ready_cnt ; i++)
+		hl_cq_fini(hdev, &hdev->completion_queue[i]);
+	kfree(hdev->completion_queue);
+hw_queues_destroy:
+	hl_hw_queues_destroy(hdev);
 sw_fini:
 	hdev->asic_funcs->sw_fini(hdev);
 early_fini:
@@ -368,6 +432,7 @@ out_disabled:
  */
 void hl_device_fini(struct hl_device *hdev)
 {
+	int i;
 	dev_info(hdev->dev, "Removing device\n");
 
 	/* Mark device as disabled */
@@ -382,6 +447,12 @@ void hl_device_fini(struct hl_device *hdev)
 	/* Reset the H/W. It will be in idle state after this returns */
 	hdev->asic_funcs->hw_fini(hdev, true);
 
+	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
+		hl_cq_fini(hdev, &hdev->completion_queue[i]);
+	kfree(hdev->completion_queue);
+
+	hl_hw_queues_destroy(hdev);
+
 	/* Call ASIC S/W finalize function */
 	hdev->asic_funcs->sw_fini(hdev);
 
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 1fa5b91fd703..a45a183d4e5c 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -90,6 +90,26 @@
 static void goya_get_fixed_properties(struct hl_device *hdev)
 {
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	int i;
+
+	for (i = 0 ; i < NUMBER_OF_EXT_HW_QUEUES ; i++) {
+		prop->hw_queues_props[i].type = QUEUE_TYPE_EXT;
+		prop->hw_queues_props[i].kmd_only = 0;
+	}
+
+	for (; i < NUMBER_OF_EXT_HW_QUEUES + NUMBER_OF_CPU_HW_QUEUES ; i++) {
+		prop->hw_queues_props[i].type = QUEUE_TYPE_CPU;
+		prop->hw_queues_props[i].kmd_only = 1;
+	}
+
+	for (; i < NUMBER_OF_EXT_HW_QUEUES + NUMBER_OF_CPU_HW_QUEUES +
+			NUMBER_OF_INT_HW_QUEUES; i++) {
+		prop->hw_queues_props[i].type = QUEUE_TYPE_INT;
+		prop->hw_queues_props[i].kmd_only = 0;
+	}
+
+	for (; i < HL_MAX_QUEUES; i++)
+		prop->hw_queues_props[i].type = QUEUE_TYPE_NA;
 
 	prop->completion_queues_count = NUMBER_OF_CMPLT_QUEUES;
 
@@ -118,6 +138,18 @@ static void goya_get_fixed_properties(struct hl_device *hdev)
 	prop->high_pll = PLL_HIGH_DEFAULT;
 }
 
+int goya_send_pci_access_msg(struct hl_device *hdev, u32 opcode)
+{
+	struct armcp_packet pkt;
+
+	memset(&pkt, 0, sizeof(pkt));
+
+	pkt.ctl = opcode << ARMCP_PKT_CTL_OPCODE_SHIFT;
+
+	return hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt,
+			sizeof(pkt), HL_DEVICE_TIMEOUT_USEC, NULL);
+}
+
 /*
  * goya_pci_bars_map - Map PCI BARS of Goya device
  *
@@ -510,6 +542,8 @@ static int goya_sw_init(struct hl_device *hdev)
 	if (!goya)
 		return -ENOMEM;
 
+	goya->test_cpu_queue = goya_test_cpu_queue;
+
 	/* according to goya_init_iatu */
 	goya->ddr_bar_cur_addr = DRAM_PHYS_BASE;
 	hdev->asic_specific = goya;
@@ -596,6 +630,311 @@ int goya_sw_fini(struct hl_device *hdev)
 	return 0;
 }
 
+static void goya_init_dma_qman(struct hl_device *hdev, int dma_id,
+		dma_addr_t bus_address)
+{
+	struct goya_device *goya = hdev->asic_specific;
+	u32 mtr_base_lo, mtr_base_hi;
+	u32 so_base_lo, so_base_hi;
+	u32 gic_base_lo, gic_base_hi;
+	u32 reg_off = dma_id * (mmDMA_QM_1_PQ_PI - mmDMA_QM_0_PQ_PI);
+
+	mtr_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0);
+	mtr_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0);
+	so_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0);
+	so_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0);
+
+	gic_base_lo =
+		lower_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR);
+	gic_base_hi =
+		upper_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR);
+
+	WREG32(mmDMA_QM_0_PQ_BASE_LO + reg_off, lower_32_bits(bus_address));
+	WREG32(mmDMA_QM_0_PQ_BASE_HI + reg_off, upper_32_bits(bus_address));
+
+	WREG32(mmDMA_QM_0_PQ_SIZE + reg_off, ilog2(HL_QUEUE_LENGTH));
+	WREG32(mmDMA_QM_0_PQ_PI + reg_off, 0);
+	WREG32(mmDMA_QM_0_PQ_CI + reg_off, 0);
+
+	WREG32(mmDMA_QM_0_CP_MSG_BASE0_ADDR_LO + reg_off, mtr_base_lo);
+	WREG32(mmDMA_QM_0_CP_MSG_BASE0_ADDR_HI + reg_off, mtr_base_hi);
+	WREG32(mmDMA_QM_0_CP_MSG_BASE1_ADDR_LO + reg_off, so_base_lo);
+	WREG32(mmDMA_QM_0_CP_MSG_BASE1_ADDR_HI + reg_off, so_base_hi);
+	WREG32(mmDMA_QM_0_GLBL_ERR_ADDR_LO + reg_off, gic_base_lo);
+	WREG32(mmDMA_QM_0_GLBL_ERR_ADDR_HI + reg_off, gic_base_hi);
+	WREG32(mmDMA_QM_0_GLBL_ERR_WDATA + reg_off,
+			GOYA_ASYNC_EVENT_ID_DMA0_QM + dma_id);
+
+	/* PQ has buffer of 2 cache lines, while CQ has 8 lines */
+	WREG32(mmDMA_QM_0_PQ_CFG1 + reg_off, 0x00020002);
+	WREG32(mmDMA_QM_0_CQ_CFG1 + reg_off, 0x00080008);
+
+	if (dma_id == 0)
+		WREG32(mmDMA_QM_0_GLBL_PROT + reg_off, QMAN_DMA_FULLY_TRUSTED);
+	else
+		if (goya->hw_cap_initialized & HW_CAP_MMU)
+			WREG32(mmDMA_QM_0_GLBL_PROT + reg_off,
+					QMAN_DMA_PARTLY_TRUSTED);
+		else
+			WREG32(mmDMA_QM_0_GLBL_PROT + reg_off,
+					QMAN_DMA_FULLY_TRUSTED);
+
+	WREG32(mmDMA_QM_0_GLBL_ERR_CFG + reg_off, QMAN_DMA_ERR_MSG_EN);
+	WREG32(mmDMA_QM_0_GLBL_CFG0 + reg_off, QMAN_DMA_ENABLE);
+}
+
+static void goya_init_dma_ch(struct hl_device *hdev, int dma_id)
+{
+	u32 gic_base_lo, gic_base_hi;
+	u64 sob_addr;
+	u32 reg_off = dma_id * (mmDMA_CH_1_CFG1 - mmDMA_CH_0_CFG1);
+
+	gic_base_lo =
+		lower_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR);
+	gic_base_hi =
+		upper_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR);
+
+	WREG32(mmDMA_CH_0_ERRMSG_ADDR_LO + reg_off, gic_base_lo);
+	WREG32(mmDMA_CH_0_ERRMSG_ADDR_HI + reg_off, gic_base_hi);
+	WREG32(mmDMA_CH_0_ERRMSG_WDATA + reg_off,
+			GOYA_ASYNC_EVENT_ID_DMA0_CH + dma_id);
+
+	if (dma_id) {
+		sob_addr = CFG_BASE + mmSYNC_MNGR_SOB_OBJ_1000 +
+				(dma_id - 1) * 4;
+		WREG32(mmDMA_CH_0_WR_COMP_ADDR_LO + reg_off,
+				lower_32_bits(sob_addr));
+		WREG32(mmDMA_CH_0_WR_COMP_ADDR_HI + reg_off,
+				upper_32_bits(sob_addr));
+		WREG32(mmDMA_CH_0_WR_COMP_WDATA + reg_off, 0x80000001);
+	}
+}
+
+/*
+ * goya_init_dma_qmans - Initialize QMAN DMA registers
+ *
+ * @hdev: pointer to hl_device structure
+ *
+ * Initialize the H/W registers of the QMAN DMA channels
+ *
+ */
+static void goya_init_dma_qmans(struct hl_device *hdev)
+{
+	struct goya_device *goya = hdev->asic_specific;
+	struct hl_hw_queue *q;
+	dma_addr_t bus_address;
+	int i;
+
+	if (goya->hw_cap_initialized & HW_CAP_DMA)
+		return;
+
+	q = &hdev->kernel_queues[0];
+
+	for (i = 0 ; i < NUMBER_OF_EXT_HW_QUEUES ; i++, q++) {
+		bus_address = q->bus_address +
+				hdev->asic_prop.host_phys_base_address;
+
+		goya_init_dma_qman(hdev, i, bus_address);
+		goya_init_dma_ch(hdev, i);
+	}
+
+	goya->hw_cap_initialized |= HW_CAP_DMA;
+}
+
+/*
+ * goya_disable_external_queues - Disable external queues
+ *
+ * @hdev: pointer to hl_device structure
+ *
+ */
+static void goya_disable_external_queues(struct hl_device *hdev)
+{
+	WREG32(mmDMA_QM_0_GLBL_CFG0, 0);
+	WREG32(mmDMA_QM_1_GLBL_CFG0, 0);
+	WREG32(mmDMA_QM_2_GLBL_CFG0, 0);
+	WREG32(mmDMA_QM_3_GLBL_CFG0, 0);
+	WREG32(mmDMA_QM_4_GLBL_CFG0, 0);
+}
+
+static int goya_stop_queue(struct hl_device *hdev, u32 cfg_reg,
+				u32 cp_sts_reg, u32 glbl_sts0_reg)
+{
+	int rc;
+	u32 status;
+
+	/* use the values of TPC0 as they are all the same*/
+
+	WREG32(cfg_reg, 1 << TPC0_QM_GLBL_CFG1_CP_STOP_SHIFT);
+
+	status = RREG32(cp_sts_reg);
+	if (status & TPC0_QM_CP_STS_FENCE_IN_PROGRESS_MASK) {
+		rc = hl_poll_timeout(
+			hdev,
+			cp_sts_reg,
+			status,
+			!(status & TPC0_QM_CP_STS_FENCE_IN_PROGRESS_MASK),
+			1000,
+			QMAN_FENCE_TIMEOUT_USEC);
+
+		/* if QMAN is stuck in fence no need to check for stop */
+		if (rc)
+			return 0;
+	}
+
+	rc = hl_poll_timeout(
+		hdev,
+		glbl_sts0_reg,
+		status,
+		(status & TPC0_QM_GLBL_STS0_CP_IS_STOP_MASK),
+		1000,
+		QMAN_STOP_TIMEOUT_USEC);
+
+	if (rc) {
+		dev_err(hdev->dev,
+			"Timeout while waiting for QMAN to stop\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * goya_stop_external_queues - Stop external queues
+ *
+ * @hdev: pointer to hl_device structure
+ *
+ * Returns 0 on success
+ *
+ */
+static int goya_stop_external_queues(struct hl_device *hdev)
+{
+	int rc, retval = 0;
+
+	rc = goya_stop_queue(hdev,
+			mmDMA_QM_0_GLBL_CFG1,
+			mmDMA_QM_0_CP_STS,
+			mmDMA_QM_0_GLBL_STS0);
+
+	if (rc) {
+		dev_err(hdev->dev, "failed to stop DMA QMAN 0\n");
+		retval = -EIO;
+	}
+
+	rc = goya_stop_queue(hdev,
+			mmDMA_QM_1_GLBL_CFG1,
+			mmDMA_QM_1_CP_STS,
+			mmDMA_QM_1_GLBL_STS0);
+
+	if (rc) {
+		dev_err(hdev->dev, "failed to stop DMA QMAN 1\n");
+		retval = -EIO;
+	}
+
+	rc = goya_stop_queue(hdev,
+			mmDMA_QM_2_GLBL_CFG1,
+			mmDMA_QM_2_CP_STS,
+			mmDMA_QM_2_GLBL_STS0);
+
+	if (rc) {
+		dev_err(hdev->dev, "failed to stop DMA QMAN 2\n");
+		retval = -EIO;
+	}
+
+	rc = goya_stop_queue(hdev,
+			mmDMA_QM_3_GLBL_CFG1,
+			mmDMA_QM_3_CP_STS,
+			mmDMA_QM_3_GLBL_STS0);
+
+	if (rc) {
+		dev_err(hdev->dev, "failed to stop DMA QMAN 3\n");
+		retval = -EIO;
+	}
+
+	rc = goya_stop_queue(hdev,
+			mmDMA_QM_4_GLBL_CFG1,
+			mmDMA_QM_4_CP_STS,
+			mmDMA_QM_4_GLBL_STS0);
+
+	if (rc) {
+		dev_err(hdev->dev, "failed to stop DMA QMAN 4\n");
+		retval = -EIO;
+	}
+
+	return retval;
+}
+
+static void goya_resume_external_queues(struct hl_device *hdev)
+{
+	WREG32(mmDMA_QM_0_GLBL_CFG1, 0);
+	WREG32(mmDMA_QM_1_GLBL_CFG1, 0);
+	WREG32(mmDMA_QM_2_GLBL_CFG1, 0);
+	WREG32(mmDMA_QM_3_GLBL_CFG1, 0);
+	WREG32(mmDMA_QM_4_GLBL_CFG1, 0);
+}
+
+/*
+ * goya_init_cpu_queues - Initialize PQ/CQ/EQ of CPU
+ *
+ * @hdev: pointer to hl_device structure
+ *
+ * Returns 0 on success
+ *
+ */
+int goya_init_cpu_queues(struct hl_device *hdev)
+{
+	struct goya_device *goya = hdev->asic_specific;
+	dma_addr_t bus_address;
+	u32 status;
+	struct hl_hw_queue *cpu_pq = &hdev->kernel_queues[GOYA_QUEUE_ID_CPU_PQ];
+	int err;
+
+	if (!hdev->cpu_queues_enable)
+		return 0;
+
+	if (goya->hw_cap_initialized & HW_CAP_CPU_Q)
+		return 0;
+
+	bus_address = cpu_pq->bus_address +
+			hdev->asic_prop.host_phys_base_address;
+	WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_0, lower_32_bits(bus_address));
+	WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_1, upper_32_bits(bus_address));
+
+	bus_address = hdev->cpu_accessible_dma_address +
+			hdev->asic_prop.host_phys_base_address;
+	WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_8, lower_32_bits(bus_address));
+	WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_9, upper_32_bits(bus_address));
+
+	WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_5, HL_QUEUE_SIZE_IN_BYTES);
+	WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_10, CPU_ACCESSIBLE_MEM_SIZE);
+
+	/* Used for EQ CI */
+	WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_6, 0);
+
+	WREG32(mmCPU_IF_PF_PQ_PI, 0);
+
+	WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_7, PQ_INIT_STATUS_READY_FOR_CP);
+
+	WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR,
+			GOYA_ASYNC_EVENT_ID_PI_UPDATE);
+
+	err = hl_poll_timeout(
+		hdev,
+		mmPSOC_GLOBAL_CONF_SCRATCHPAD_7,
+		status,
+		(status == PQ_INIT_STATUS_READY_FOR_HOST),
+		1000,
+		GOYA_CPU_TIMEOUT_USEC);
+
+	if (err) {
+		dev_err(hdev->dev,
+			"Failed to communicate with ARM CPU (ArmCP timeout)\n");
+		return -EIO;
+	}
+
+	goya->hw_cap_initialized |= HW_CAP_CPU_Q;
+	return 0;
+}
+
 static void goya_set_pll_refclk(struct hl_device *hdev)
 {
 	WREG32(mmCPU_PLL_DIV_SEL_0, 0x0);
@@ -1023,144 +1362,644 @@ static void goya_init_golden_registers(struct hl_device *hdev)
 	goya->hw_cap_initialized |= HW_CAP_GOLDEN;
 }
 
-
-/*
- * goya_push_fw_to_device - Push FW code to device
- *
- * @hdev: pointer to hl_device structure
- *
- * Copy fw code from firmware file to device memory.
- * Returns 0 on success
- *
- */
-static int goya_push_fw_to_device(struct hl_device *hdev, const char *fw_name,
-					void __iomem *dst)
+static void goya_init_mme_qman(struct hl_device *hdev)
 {
-	const struct firmware *fw;
-	const u64 *fw_data;
-	size_t fw_size, i;
-	int rc;
+	u32 mtr_base_lo, mtr_base_hi;
+	u32 so_base_lo, so_base_hi;
+	u32 gic_base_lo, gic_base_hi;
+	u64 qman_base_addr;
 
-	rc = request_firmware(&fw, fw_name, hdev->dev);
+	mtr_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0);
+	mtr_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0);
+	so_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0);
+	so_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0);
 
-	if (rc) {
-		dev_err(hdev->dev, "Failed to request %s\n", fw_name);
-		goto out;
-	}
+	gic_base_lo =
+		lower_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR);
+	gic_base_hi =
+		upper_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR);
 
-	fw_size = fw->size;
-	if ((fw_size % 4) != 0) {
-		dev_err(hdev->dev, "illegal %s firmware size %zu\n",
-			fw_name, fw_size);
-		rc = -EINVAL;
-		goto out;
-	}
+	qman_base_addr = hdev->asic_prop.sram_base_address +
+				MME_QMAN_BASE_OFFSET;
 
-	dev_dbg(hdev->dev, "%s firmware size == %zu\n", fw_name, fw_size);
+	WREG32(mmMME_QM_PQ_BASE_LO, lower_32_bits(qman_base_addr));
+	WREG32(mmMME_QM_PQ_BASE_HI, upper_32_bits(qman_base_addr));
+	WREG32(mmMME_QM_PQ_SIZE, ilog2(MME_QMAN_LENGTH));
+	WREG32(mmMME_QM_PQ_PI, 0);
+	WREG32(mmMME_QM_PQ_CI, 0);
+	WREG32(mmMME_QM_CP_LDMA_SRC_BASE_LO_OFFSET, 0x10C0);
+	WREG32(mmMME_QM_CP_LDMA_SRC_BASE_HI_OFFSET, 0x10C4);
+	WREG32(mmMME_QM_CP_LDMA_TSIZE_OFFSET, 0x10C8);
+	WREG32(mmMME_QM_CP_LDMA_COMMIT_OFFSET, 0x10CC);
 
-	fw_data = (const u64 *) fw->data;
+	WREG32(mmMME_QM_CP_MSG_BASE0_ADDR_LO, mtr_base_lo);
+	WREG32(mmMME_QM_CP_MSG_BASE0_ADDR_HI, mtr_base_hi);
+	WREG32(mmMME_QM_CP_MSG_BASE1_ADDR_LO, so_base_lo);
+	WREG32(mmMME_QM_CP_MSG_BASE1_ADDR_HI, so_base_hi);
 
-	if ((fw->size % 8) != 0)
-		fw_size -= 8;
+	/* QMAN CQ has 8 cache lines */
+	WREG32(mmMME_QM_CQ_CFG1, 0x00080008);
 
-	for (i = 0 ; i < fw_size ; i += 8, fw_data++, dst += 8) {
-		if (!(i & (0x80000 - 1))) {
-			dev_dbg(hdev->dev,
-				"copied so far %zu out of %zu for %s firmware",
-				i, fw_size, fw_name);
-			usleep_range(20, 100);
-		}
+	WREG32(mmMME_QM_GLBL_ERR_ADDR_LO, gic_base_lo);
+	WREG32(mmMME_QM_GLBL_ERR_ADDR_HI, gic_base_hi);
 
-		writeq(*fw_data, dst);
-	}
+	WREG32(mmMME_QM_GLBL_ERR_WDATA, GOYA_ASYNC_EVENT_ID_MME_QM);
 
-	if ((fw->size % 8) != 0)
-		writel(*(const u32 *) fw_data, dst);
+	WREG32(mmMME_QM_GLBL_ERR_CFG, QMAN_MME_ERR_MSG_EN);
 
-out:
-	release_firmware(fw);
-	return rc;
+	WREG32(mmMME_QM_GLBL_PROT, QMAN_MME_ERR_PROT);
+
+	WREG32(mmMME_QM_GLBL_CFG0, QMAN_MME_ENABLE);
 }
 
-static int goya_pldm_init_cpu(struct hl_device *hdev)
+static void goya_init_mme_cmdq(struct hl_device *hdev)
 {
-	char fw_name[200];
-	void __iomem *dst;
-	u32 val, unit_rst_val;
-	int rc;
+	u32 mtr_base_lo, mtr_base_hi;
+	u32 so_base_lo, so_base_hi;
+	u32 gic_base_lo, gic_base_hi;
+	u64 qman_base_addr;
 
-	/* Must initialize SRAM scrambler before pushing u-boot to SRAM */
-	goya_init_golden_registers(hdev);
+	mtr_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0);
+	mtr_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0);
+	so_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0);
+	so_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0);
 
-	/* Put ARM cores into reset */
-	WREG32(mmCPU_CA53_CFG_ARM_RST_CONTROL, CPU_RESET_ASSERT);
-	val = RREG32(mmCPU_CA53_CFG_ARM_RST_CONTROL);
+	gic_base_lo =
+		lower_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR);
+	gic_base_hi =
+		upper_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR);
 
-	/* Reset the CA53 MACRO */
-	unit_rst_val = RREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N);
-	WREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N, CA53_RESET);
-	val = RREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N);
-	WREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N, unit_rst_val);
-	val = RREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N);
+	qman_base_addr = hdev->asic_prop.sram_base_address +
+				MME_QMAN_BASE_OFFSET;
 
-	snprintf(fw_name, sizeof(fw_name), "habanalabs/goya/goya-u-boot.bin");
-	dst = hdev->pcie_bar[SRAM_CFG_BAR_ID] + UBOOT_FW_OFFSET;
-	rc = goya_push_fw_to_device(hdev, fw_name, dst);
-	if (rc)
-		return rc;
+	WREG32(mmMME_CMDQ_CP_MSG_BASE0_ADDR_LO, mtr_base_lo);
+	WREG32(mmMME_CMDQ_CP_MSG_BASE0_ADDR_HI, mtr_base_hi);
+	WREG32(mmMME_CMDQ_CP_MSG_BASE1_ADDR_LO,	so_base_lo);
+	WREG32(mmMME_CMDQ_CP_MSG_BASE1_ADDR_HI, so_base_hi);
 
-	snprintf(fw_name, sizeof(fw_name), "habanalabs/goya/goya-fit.itb");
-	dst = hdev->pcie_bar[DDR_BAR_ID] + LINUX_FW_OFFSET;
-	rc = goya_push_fw_to_device(hdev, fw_name, dst);
-	if (rc)
-		return rc;
+	/* CMDQ CQ has 20 cache lines */
+	WREG32(mmMME_CMDQ_CQ_CFG1, 0x00140014);
 
-	WREG32(mmPSOC_GLOBAL_CONF_UBOOT_MAGIC, KMD_MSG_FIT_RDY);
-	WREG32(mmPSOC_GLOBAL_CONF_WARM_REBOOT, CPU_BOOT_STATUS_NA);
+	WREG32(mmMME_CMDQ_GLBL_ERR_ADDR_LO, gic_base_lo);
+	WREG32(mmMME_CMDQ_GLBL_ERR_ADDR_HI, gic_base_hi);
 
-	WREG32(mmCPU_CA53_CFG_RST_ADDR_LSB_0,
-		lower_32_bits(SRAM_BASE_ADDR + UBOOT_FW_OFFSET));
-	WREG32(mmCPU_CA53_CFG_RST_ADDR_MSB_0,
-		upper_32_bits(SRAM_BASE_ADDR + UBOOT_FW_OFFSET));
+	WREG32(mmMME_CMDQ_GLBL_ERR_WDATA, GOYA_ASYNC_EVENT_ID_MME_CMDQ);
 
-	/* Release ARM core 0 from reset */
-	WREG32(mmCPU_CA53_CFG_ARM_RST_CONTROL,
-					CPU_RESET_CORE0_DEASSERT);
-	val = RREG32(mmCPU_CA53_CFG_ARM_RST_CONTROL);
+	WREG32(mmMME_CMDQ_GLBL_ERR_CFG, CMDQ_MME_ERR_MSG_EN);
 
-	return 0;
+	WREG32(mmMME_CMDQ_GLBL_PROT, CMDQ_MME_ERR_PROT);
+
+	WREG32(mmMME_CMDQ_GLBL_CFG0, CMDQ_MME_ENABLE);
 }
 
-/*
- * FW component passes an offset from SRAM_BASE_ADDR in SCRATCHPAD_xx.
- * The version string should be located by that offset.
- */
-static void goya_read_device_fw_version(struct hl_device *hdev,
-					enum goya_fw_component fwc)
+static void goya_init_mme_qmans(struct hl_device *hdev)
 {
-	const char *name;
-	u32 ver_off;
-	char *dest;
+	struct goya_device *goya = hdev->asic_specific;
+	u32 so_base_lo, so_base_hi;
 
-	switch (fwc) {
-	case FW_COMP_UBOOT:
-		ver_off = RREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_29);
-		dest = hdev->asic_prop.uboot_ver;
-		name = "U-Boot";
-		break;
-	case FW_COMP_PREBOOT:
-		ver_off = RREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_28);
-		dest = hdev->asic_prop.preboot_ver;
-		name = "Preboot";
-		break;
-	default:
-		dev_warn(hdev->dev, "Undefined FW component: %d\n", fwc);
+	if (goya->hw_cap_initialized & HW_CAP_MME)
 		return;
-	}
 
-	ver_off &= ~((u32)SRAM_BASE_ADDR);
+	so_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0);
+	so_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0);
 
-	if (ver_off < SRAM_SIZE - VERSION_MAX_LEN) {
+	WREG32(mmMME_SM_BASE_ADDRESS_LOW, so_base_lo);
+	WREG32(mmMME_SM_BASE_ADDRESS_HIGH, so_base_hi);
+
+	goya_init_mme_qman(hdev);
+	goya_init_mme_cmdq(hdev);
+
+	goya->hw_cap_initialized |= HW_CAP_MME;
+}
+
+static void goya_init_tpc_qman(struct hl_device *hdev, u32 base_off, int tpc_id)
+{
+	u32 mtr_base_lo, mtr_base_hi;
+	u32 so_base_lo, so_base_hi;
+	u32 gic_base_lo, gic_base_hi;
+	u64 qman_base_addr;
+	u32 reg_off = tpc_id * (mmTPC1_QM_PQ_PI - mmTPC0_QM_PQ_PI);
+
+	mtr_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0);
+	mtr_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0);
+	so_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0);
+	so_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0);
+
+	gic_base_lo =
+		lower_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR);
+	gic_base_hi =
+		upper_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR);
+
+	qman_base_addr = hdev->asic_prop.sram_base_address + base_off;
+
+	WREG32(mmTPC0_QM_PQ_BASE_LO + reg_off, lower_32_bits(qman_base_addr));
+	WREG32(mmTPC0_QM_PQ_BASE_HI + reg_off, upper_32_bits(qman_base_addr));
+	WREG32(mmTPC0_QM_PQ_SIZE + reg_off, ilog2(TPC_QMAN_LENGTH));
+	WREG32(mmTPC0_QM_PQ_PI + reg_off, 0);
+	WREG32(mmTPC0_QM_PQ_CI + reg_off, 0);
+	WREG32(mmTPC0_QM_CP_LDMA_SRC_BASE_LO_OFFSET + reg_off, 0x10C0);
+	WREG32(mmTPC0_QM_CP_LDMA_SRC_BASE_HI_OFFSET + reg_off, 0x10C4);
+	WREG32(mmTPC0_QM_CP_LDMA_TSIZE_OFFSET + reg_off, 0x10C8);
+	WREG32(mmTPC0_QM_CP_LDMA_COMMIT_OFFSET + reg_off, 0x10CC);
+
+	WREG32(mmTPC0_QM_CP_MSG_BASE0_ADDR_LO + reg_off, mtr_base_lo);
+	WREG32(mmTPC0_QM_CP_MSG_BASE0_ADDR_HI + reg_off, mtr_base_hi);
+	WREG32(mmTPC0_QM_CP_MSG_BASE1_ADDR_LO + reg_off, so_base_lo);
+	WREG32(mmTPC0_QM_CP_MSG_BASE1_ADDR_HI + reg_off, so_base_hi);
+
+	WREG32(mmTPC0_QM_CQ_CFG1 + reg_off, 0x00080008);
+
+	WREG32(mmTPC0_QM_GLBL_ERR_ADDR_LO + reg_off, gic_base_lo);
+	WREG32(mmTPC0_QM_GLBL_ERR_ADDR_HI + reg_off, gic_base_hi);
+
+	WREG32(mmTPC0_QM_GLBL_ERR_WDATA + reg_off,
+			GOYA_ASYNC_EVENT_ID_TPC0_QM + tpc_id);
+
+	WREG32(mmTPC0_QM_GLBL_ERR_CFG + reg_off, QMAN_TPC_ERR_MSG_EN);
+
+	WREG32(mmTPC0_QM_GLBL_PROT + reg_off, QMAN_TPC_ERR_PROT);
+
+	WREG32(mmTPC0_QM_GLBL_CFG0 + reg_off, QMAN_TPC_ENABLE);
+}
+
+static void goya_init_tpc_cmdq(struct hl_device *hdev, int tpc_id)
+{
+	u32 mtr_base_lo, mtr_base_hi;
+	u32 so_base_lo, so_base_hi;
+	u32 gic_base_lo, gic_base_hi;
+	u32 reg_off = tpc_id * (mmTPC1_CMDQ_CQ_CFG1 - mmTPC0_CMDQ_CQ_CFG1);
+
+	mtr_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0);
+	mtr_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0);
+	so_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0);
+	so_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0);
+
+	gic_base_lo =
+		lower_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR);
+	gic_base_hi =
+		upper_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR);
+
+	WREG32(mmTPC0_CMDQ_CP_MSG_BASE0_ADDR_LO + reg_off, mtr_base_lo);
+	WREG32(mmTPC0_CMDQ_CP_MSG_BASE0_ADDR_HI + reg_off, mtr_base_hi);
+	WREG32(mmTPC0_CMDQ_CP_MSG_BASE1_ADDR_LO + reg_off, so_base_lo);
+	WREG32(mmTPC0_CMDQ_CP_MSG_BASE1_ADDR_HI + reg_off, so_base_hi);
+
+	WREG32(mmTPC0_CMDQ_CQ_CFG1 + reg_off, 0x00140014);
+
+	WREG32(mmTPC0_CMDQ_GLBL_ERR_ADDR_LO + reg_off, gic_base_lo);
+	WREG32(mmTPC0_CMDQ_GLBL_ERR_ADDR_HI + reg_off, gic_base_hi);
+
+	WREG32(mmTPC0_CMDQ_GLBL_ERR_WDATA + reg_off,
+			GOYA_ASYNC_EVENT_ID_TPC0_CMDQ + tpc_id);
+
+	WREG32(mmTPC0_CMDQ_GLBL_ERR_CFG + reg_off, CMDQ_TPC_ERR_MSG_EN);
+
+	WREG32(mmTPC0_CMDQ_GLBL_PROT + reg_off, CMDQ_TPC_ERR_PROT);
+
+	WREG32(mmTPC0_CMDQ_GLBL_CFG0 + reg_off, CMDQ_TPC_ENABLE);
+}
+
+static void goya_init_tpc_qmans(struct hl_device *hdev)
+{
+	struct goya_device *goya = hdev->asic_specific;
+	u32 so_base_lo, so_base_hi;
+	u32 cfg_off = mmTPC1_CFG_SM_BASE_ADDRESS_LOW -
+			mmTPC0_CFG_SM_BASE_ADDRESS_LOW;
+	int i;
+
+	if (goya->hw_cap_initialized & HW_CAP_TPC)
+		return;
+
+	so_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0);
+	so_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0);
+
+	for (i = 0 ; i < TPC_MAX_NUM ; i++) {
+		WREG32(mmTPC0_CFG_SM_BASE_ADDRESS_LOW + i * cfg_off,
+				so_base_lo);
+		WREG32(mmTPC0_CFG_SM_BASE_ADDRESS_HIGH + i * cfg_off,
+				so_base_hi);
+	}
+
+	goya_init_tpc_qman(hdev, TPC0_QMAN_BASE_OFFSET, 0);
+	goya_init_tpc_qman(hdev, TPC1_QMAN_BASE_OFFSET, 1);
+	goya_init_tpc_qman(hdev, TPC2_QMAN_BASE_OFFSET, 2);
+	goya_init_tpc_qman(hdev, TPC3_QMAN_BASE_OFFSET, 3);
+	goya_init_tpc_qman(hdev, TPC4_QMAN_BASE_OFFSET, 4);
+	goya_init_tpc_qman(hdev, TPC5_QMAN_BASE_OFFSET, 5);
+	goya_init_tpc_qman(hdev, TPC6_QMAN_BASE_OFFSET, 6);
+	goya_init_tpc_qman(hdev, TPC7_QMAN_BASE_OFFSET, 7);
+
+	for (i = 0 ; i < TPC_MAX_NUM ; i++)
+		goya_init_tpc_cmdq(hdev, i);
+
+	goya->hw_cap_initialized |= HW_CAP_TPC;
+}
+
+/*
+ * goya_disable_internal_queues - Disable internal queues
+ *
+ * @hdev: pointer to hl_device structure
+ *
+ */
+static void goya_disable_internal_queues(struct hl_device *hdev)
+{
+	WREG32(mmMME_QM_GLBL_CFG0, 0);
+	WREG32(mmMME_CMDQ_GLBL_CFG0, 0);
+
+	WREG32(mmTPC0_QM_GLBL_CFG0, 0);
+	WREG32(mmTPC0_CMDQ_GLBL_CFG0, 0);
+
+	WREG32(mmTPC1_QM_GLBL_CFG0, 0);
+	WREG32(mmTPC1_CMDQ_GLBL_CFG0, 0);
+
+	WREG32(mmTPC2_QM_GLBL_CFG0, 0);
+	WREG32(mmTPC2_CMDQ_GLBL_CFG0, 0);
+
+	WREG32(mmTPC3_QM_GLBL_CFG0, 0);
+	WREG32(mmTPC3_CMDQ_GLBL_CFG0, 0);
+
+	WREG32(mmTPC4_QM_GLBL_CFG0, 0);
+	WREG32(mmTPC4_CMDQ_GLBL_CFG0, 0);
+
+	WREG32(mmTPC5_QM_GLBL_CFG0, 0);
+	WREG32(mmTPC5_CMDQ_GLBL_CFG0, 0);
+
+	WREG32(mmTPC6_QM_GLBL_CFG0, 0);
+	WREG32(mmTPC6_CMDQ_GLBL_CFG0, 0);
+
+	WREG32(mmTPC7_QM_GLBL_CFG0, 0);
+	WREG32(mmTPC7_CMDQ_GLBL_CFG0, 0);
+}
+
+/*
+ * goya_stop_internal_queues - Stop internal queues
+ *
+ * @hdev: pointer to hl_device structure
+ *
+ * Returns 0 on success
+ *
+ */
+static int goya_stop_internal_queues(struct hl_device *hdev)
+{
+	int rc, retval = 0;
+
+	/*
+	 * Each queue (QMAN) is a separate H/W logic. That means that each
+	 * QMAN can be stopped independently and failure to stop one does NOT
+	 * mandate we should not try to stop other QMANs
+	 */
+
+	rc = goya_stop_queue(hdev,
+			mmMME_QM_GLBL_CFG1,
+			mmMME_QM_CP_STS,
+			mmMME_QM_GLBL_STS0);
+
+	if (rc) {
+		dev_err(hdev->dev, "failed to stop MME QMAN\n");
+		retval = -EIO;
+	}
+
+	rc = goya_stop_queue(hdev,
+			mmMME_CMDQ_GLBL_CFG1,
+			mmMME_CMDQ_CP_STS,
+			mmMME_CMDQ_GLBL_STS0);
+
+	if (rc) {
+		dev_err(hdev->dev, "failed to stop MME CMDQ\n");
+		retval = -EIO;
+	}
+
+	rc = goya_stop_queue(hdev,
+			mmTPC0_QM_GLBL_CFG1,
+			mmTPC0_QM_CP_STS,
+			mmTPC0_QM_GLBL_STS0);
+
+	if (rc) {
+		dev_err(hdev->dev, "failed to stop TPC 0 QMAN\n");
+		retval = -EIO;
+	}
+
+	rc = goya_stop_queue(hdev,
+			mmTPC0_CMDQ_GLBL_CFG1,
+			mmTPC0_CMDQ_CP_STS,
+			mmTPC0_CMDQ_GLBL_STS0);
+
+	if (rc) {
+		dev_err(hdev->dev, "failed to stop TPC 0 CMDQ\n");
+		retval = -EIO;
+	}
+
+	rc = goya_stop_queue(hdev,
+			mmTPC1_QM_GLBL_CFG1,
+			mmTPC1_QM_CP_STS,
+			mmTPC1_QM_GLBL_STS0);
+
+	if (rc) {
+		dev_err(hdev->dev, "failed to stop TPC 1 QMAN\n");
+		retval = -EIO;
+	}
+
+	rc = goya_stop_queue(hdev,
+			mmTPC1_CMDQ_GLBL_CFG1,
+			mmTPC1_CMDQ_CP_STS,
+			mmTPC1_CMDQ_GLBL_STS0);
+
+	if (rc) {
+		dev_err(hdev->dev, "failed to stop TPC 1 CMDQ\n");
+		retval = -EIO;
+	}
+
+	rc = goya_stop_queue(hdev,
+			mmTPC2_QM_GLBL_CFG1,
+			mmTPC2_QM_CP_STS,
+			mmTPC2_QM_GLBL_STS0);
+
+	if (rc) {
+		dev_err(hdev->dev, "failed to stop TPC 2 QMAN\n");
+		retval = -EIO;
+	}
+
+	rc = goya_stop_queue(hdev,
+			mmTPC2_CMDQ_GLBL_CFG1,
+			mmTPC2_CMDQ_CP_STS,
+			mmTPC2_CMDQ_GLBL_STS0);
+
+	if (rc) {
+		dev_err(hdev->dev, "failed to stop TPC 2 CMDQ\n");
+		retval = -EIO;
+	}
+
+	rc = goya_stop_queue(hdev,
+			mmTPC3_QM_GLBL_CFG1,
+			mmTPC3_QM_CP_STS,
+			mmTPC3_QM_GLBL_STS0);
+
+	if (rc) {
+		dev_err(hdev->dev, "failed to stop TPC 3 QMAN\n");
+		retval = -EIO;
+	}
+
+	rc = goya_stop_queue(hdev,
+			mmTPC3_CMDQ_GLBL_CFG1,
+			mmTPC3_CMDQ_CP_STS,
+			mmTPC3_CMDQ_GLBL_STS0);
+
+	if (rc) {
+		dev_err(hdev->dev, "failed to stop TPC 3 CMDQ\n");
+		retval = -EIO;
+	}
+
+	rc = goya_stop_queue(hdev,
+			mmTPC4_QM_GLBL_CFG1,
+			mmTPC4_QM_CP_STS,
+			mmTPC4_QM_GLBL_STS0);
+
+	if (rc) {
+		dev_err(hdev->dev, "failed to stop TPC 4 QMAN\n");
+		retval = -EIO;
+	}
+
+	rc = goya_stop_queue(hdev,
+			mmTPC4_CMDQ_GLBL_CFG1,
+			mmTPC4_CMDQ_CP_STS,
+			mmTPC4_CMDQ_GLBL_STS0);
+
+	if (rc) {
+		dev_err(hdev->dev, "failed to stop TPC 4 CMDQ\n");
+		retval = -EIO;
+	}
+
+	rc = goya_stop_queue(hdev,
+			mmTPC5_QM_GLBL_CFG1,
+			mmTPC5_QM_CP_STS,
+			mmTPC5_QM_GLBL_STS0);
+
+	if (rc) {
+		dev_err(hdev->dev, "failed to stop TPC 5 QMAN\n");
+		retval = -EIO;
+	}
+
+	rc = goya_stop_queue(hdev,
+			mmTPC5_CMDQ_GLBL_CFG1,
+			mmTPC5_CMDQ_CP_STS,
+			mmTPC5_CMDQ_GLBL_STS0);
+
+	if (rc) {
+		dev_err(hdev->dev, "failed to stop TPC 5 CMDQ\n");
+		retval = -EIO;
+	}
+
+	rc = goya_stop_queue(hdev,
+			mmTPC6_QM_GLBL_CFG1,
+			mmTPC6_QM_CP_STS,
+			mmTPC6_QM_GLBL_STS0);
+
+	if (rc) {
+		dev_err(hdev->dev, "failed to stop TPC 6 QMAN\n");
+		retval = -EIO;
+	}
+
+	rc = goya_stop_queue(hdev,
+			mmTPC6_CMDQ_GLBL_CFG1,
+			mmTPC6_CMDQ_CP_STS,
+			mmTPC6_CMDQ_GLBL_STS0);
+
+	if (rc) {
+		dev_err(hdev->dev, "failed to stop TPC 6 CMDQ\n");
+		retval = -EIO;
+	}
+
+	rc = goya_stop_queue(hdev,
+			mmTPC7_QM_GLBL_CFG1,
+			mmTPC7_QM_CP_STS,
+			mmTPC7_QM_GLBL_STS0);
+
+	if (rc) {
+		dev_err(hdev->dev, "failed to stop TPC 7 QMAN\n");
+		retval = -EIO;
+	}
+
+	rc = goya_stop_queue(hdev,
+			mmTPC7_CMDQ_GLBL_CFG1,
+			mmTPC7_CMDQ_CP_STS,
+			mmTPC7_CMDQ_GLBL_STS0);
+
+	if (rc) {
+		dev_err(hdev->dev, "failed to stop TPC 7 CMDQ\n");
+		retval = -EIO;
+	}
+
+	return retval;
+}
+
+static void goya_resume_internal_queues(struct hl_device *hdev)
+{
+	WREG32(mmMME_QM_GLBL_CFG1, 0);
+	WREG32(mmMME_CMDQ_GLBL_CFG1, 0);
+
+	WREG32(mmTPC0_QM_GLBL_CFG1, 0);
+	WREG32(mmTPC0_CMDQ_GLBL_CFG1, 0);
+
+	WREG32(mmTPC1_QM_GLBL_CFG1, 0);
+	WREG32(mmTPC1_CMDQ_GLBL_CFG1, 0);
+
+	WREG32(mmTPC2_QM_GLBL_CFG1, 0);
+	WREG32(mmTPC2_CMDQ_GLBL_CFG1, 0);
+
+	WREG32(mmTPC3_QM_GLBL_CFG1, 0);
+	WREG32(mmTPC3_CMDQ_GLBL_CFG1, 0);
+
+	WREG32(mmTPC4_QM_GLBL_CFG1, 0);
+	WREG32(mmTPC4_CMDQ_GLBL_CFG1, 0);
+
+	WREG32(mmTPC5_QM_GLBL_CFG1, 0);
+	WREG32(mmTPC5_CMDQ_GLBL_CFG1, 0);
+
+	WREG32(mmTPC6_QM_GLBL_CFG1, 0);
+	WREG32(mmTPC6_CMDQ_GLBL_CFG1, 0);
+
+	WREG32(mmTPC7_QM_GLBL_CFG1, 0);
+	WREG32(mmTPC7_CMDQ_GLBL_CFG1, 0);
+}
+
+
+/*
+ * goya_push_fw_to_device - Push FW code to device
+ *
+ * @hdev: pointer to hl_device structure
+ *
+ * Copy fw code from firmware file to device memory.
+ * Returns 0 on success
+ *
+ */
+static int goya_push_fw_to_device(struct hl_device *hdev, const char *fw_name,
+					void __iomem *dst)
+{
+	const struct firmware *fw;
+	const u64 *fw_data;
+	size_t fw_size, i;
+	int rc;
+
+	rc = request_firmware(&fw, fw_name, hdev->dev);
+
+	if (rc) {
+		dev_err(hdev->dev, "Failed to request %s\n", fw_name);
+		goto out;
+	}
+
+	fw_size = fw->size;
+	if ((fw_size % 4) != 0) {
+		dev_err(hdev->dev, "illegal %s firmware size %zu\n",
+			fw_name, fw_size);
+		rc = -EINVAL;
+		goto out;
+	}
+
+	dev_dbg(hdev->dev, "%s firmware size == %zu\n", fw_name, fw_size);
+
+	fw_data = (const u64 *) fw->data;
+
+	if ((fw->size % 8) != 0)
+		fw_size -= 8;
+
+	for (i = 0 ; i < fw_size ; i += 8, fw_data++, dst += 8) {
+		if (!(i & (0x80000 - 1))) {
+			dev_dbg(hdev->dev,
+				"copied so far %zu out of %zu for %s firmware",
+				i, fw_size, fw_name);
+			usleep_range(20, 100);
+		}
+
+		writeq(*fw_data, dst);
+	}
+
+	if ((fw->size % 8) != 0)
+		writel(*(const u32 *) fw_data, dst);
+
+out:
+	release_firmware(fw);
+	return rc;
+}
+
+static int goya_pldm_init_cpu(struct hl_device *hdev)
+{
+	char fw_name[200];
+	void __iomem *dst;
+	u32 val, unit_rst_val;
+	int rc;
+
+	/* Must initialize SRAM scrambler before pushing u-boot to SRAM */
+	goya_init_golden_registers(hdev);
+
+	/* Put ARM cores into reset */
+	WREG32(mmCPU_CA53_CFG_ARM_RST_CONTROL, CPU_RESET_ASSERT);
+	val = RREG32(mmCPU_CA53_CFG_ARM_RST_CONTROL);
+
+	/* Reset the CA53 MACRO */
+	unit_rst_val = RREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N);
+	WREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N, CA53_RESET);
+	val = RREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N);
+	WREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N, unit_rst_val);
+	val = RREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N);
+
+	snprintf(fw_name, sizeof(fw_name), "habanalabs/goya/goya-u-boot.bin");
+	dst = hdev->pcie_bar[SRAM_CFG_BAR_ID] + UBOOT_FW_OFFSET;
+	rc = goya_push_fw_to_device(hdev, fw_name, dst);
+	if (rc)
+		return rc;
+
+	snprintf(fw_name, sizeof(fw_name), "habanalabs/goya/goya-fit.itb");
+	dst = hdev->pcie_bar[DDR_BAR_ID] + LINUX_FW_OFFSET;
+	rc = goya_push_fw_to_device(hdev, fw_name, dst);
+	if (rc)
+		return rc;
+
+	WREG32(mmPSOC_GLOBAL_CONF_UBOOT_MAGIC, KMD_MSG_FIT_RDY);
+	WREG32(mmPSOC_GLOBAL_CONF_WARM_REBOOT, CPU_BOOT_STATUS_NA);
+
+	WREG32(mmCPU_CA53_CFG_RST_ADDR_LSB_0,
+		lower_32_bits(SRAM_BASE_ADDR + UBOOT_FW_OFFSET));
+	WREG32(mmCPU_CA53_CFG_RST_ADDR_MSB_0,
+		upper_32_bits(SRAM_BASE_ADDR + UBOOT_FW_OFFSET));
+
+	/* Release ARM core 0 from reset */
+	WREG32(mmCPU_CA53_CFG_ARM_RST_CONTROL,
+					CPU_RESET_CORE0_DEASSERT);
+	val = RREG32(mmCPU_CA53_CFG_ARM_RST_CONTROL);
+
+	return 0;
+}
+
+/*
+ * FW component passes an offset from SRAM_BASE_ADDR in SCRATCHPAD_xx.
+ * The version string should be located by that offset.
+ */
+static void goya_read_device_fw_version(struct hl_device *hdev,
+					enum goya_fw_component fwc)
+{
+	const char *name;
+	u32 ver_off;
+	char *dest;
+
+	switch (fwc) {
+	case FW_COMP_UBOOT:
+		ver_off = RREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_29);
+		dest = hdev->asic_prop.uboot_ver;
+		name = "U-Boot";
+		break;
+	case FW_COMP_PREBOOT:
+		ver_off = RREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_28);
+		dest = hdev->asic_prop.preboot_ver;
+		name = "Preboot";
+		break;
+	default:
+		dev_warn(hdev->dev, "Undefined FW component: %d\n", fwc);
+		return;
+	}
+
+	ver_off &= ~((u32)SRAM_BASE_ADDR);
+
+	if (ver_off < SRAM_SIZE - VERSION_MAX_LEN) {
 		memcpy_fromio(dest, hdev->pcie_bar[SRAM_CFG_BAR_ID] + ver_off,
 							VERSION_MAX_LEN);
 	} else {
@@ -1344,6 +2183,19 @@ static int goya_hw_init(struct hl_device *hdev)
 
 	goya_init_security(hdev);
 
+	goya_init_dma_qmans(hdev);
+
+	goya_init_mme_qmans(hdev);
+
+	goya_init_tpc_qmans(hdev);
+
+	rc = goya_init_cpu_queues(hdev);
+	if (rc) {
+		dev_err(hdev->dev, "failed to initialize CPU H/W queues %d\n",
+			rc);
+		goto disable_queues;
+	}
+
 	/* CPU initialization is finished, we can now move to 48 bit DMA mask */
 	rc = pci_set_dma_mask(hdev->pdev, DMA_BIT_MASK(48));
 	if (rc) {
@@ -1352,7 +2204,7 @@ static int goya_hw_init(struct hl_device *hdev)
 		if (rc) {
 			dev_err(hdev->dev,
 				"Unable to set pci dma mask to 32 bits\n");
-			return rc;
+			goto disable_pci_access;
 		}
 	}
 
@@ -1364,7 +2216,7 @@ static int goya_hw_init(struct hl_device *hdev)
 		if (rc) {
 			dev_err(hdev->dev,
 				"Unable to set pci consistent dma mask to 32 bits\n");
-			return rc;
+			goto disable_pci_access;
 		}
 	}
 
@@ -1372,6 +2224,14 @@ static int goya_hw_init(struct hl_device *hdev)
 	val = RREG32(mmPCIE_DBI_DEVICE_ID_VENDOR_ID_REG);
 
 	return 0;
+
+disable_pci_access:
+	goya_send_pci_access_msg(hdev, ARMCP_PACKET_DISABLE_PCI_ACCESS);
+disable_queues:
+	goya_disable_internal_queues(hdev);
+	goya_disable_external_queues(hdev);
+
+	return rc;
 }
 
 /*
@@ -1460,12 +2320,40 @@ static void goya_hw_fini(struct hl_device *hdev, bool hard_reset)
 
 int goya_suspend(struct hl_device *hdev)
 {
-	return 0;
+	int rc;
+
+	rc = goya_stop_internal_queues(hdev);
+
+	if (rc) {
+		dev_err(hdev->dev, "failed to stop internal queues\n");
+		return rc;
+	}
+
+	rc = goya_stop_external_queues(hdev);
+
+	if (rc) {
+		dev_err(hdev->dev, "failed to stop external queues\n");
+		return rc;
+	}
+
+	rc = goya_send_pci_access_msg(hdev, ARMCP_PACKET_DISABLE_PCI_ACCESS);
+	if (rc)
+		dev_err(hdev->dev, "Failed to disable PCI access from CPU\n");
+
+	return rc;
 }
 
 int goya_resume(struct hl_device *hdev)
 {
-	return 0;
+	int rc;
+
+	goya_resume_external_queues(hdev);
+	goya_resume_internal_queues(hdev);
+
+	rc = goya_send_pci_access_msg(hdev, ARMCP_PACKET_ENABLE_PCI_ACCESS);
+	if (rc)
+		dev_err(hdev->dev, "Failed to enable PCI access from CPU\n");
+	return rc;
 }
 
 int goya_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma)
@@ -1489,6 +2377,101 @@ int goya_cb_mmap(struct hl_device *hdev, struct vm_area_struct *vma,
 	return rc;
 }
 
+void goya_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi)
+{
+	u32 db_reg_offset, db_value;
+	bool invalid_queue = false;
+
+	switch (hw_queue_id) {
+	case GOYA_QUEUE_ID_DMA_0:
+		db_reg_offset = mmDMA_QM_0_PQ_PI;
+		break;
+
+	case GOYA_QUEUE_ID_DMA_1:
+		db_reg_offset = mmDMA_QM_1_PQ_PI;
+		break;
+
+	case GOYA_QUEUE_ID_DMA_2:
+		db_reg_offset = mmDMA_QM_2_PQ_PI;
+		break;
+
+	case GOYA_QUEUE_ID_DMA_3:
+		db_reg_offset = mmDMA_QM_3_PQ_PI;
+		break;
+
+	case GOYA_QUEUE_ID_DMA_4:
+		db_reg_offset = mmDMA_QM_4_PQ_PI;
+		break;
+
+	case GOYA_QUEUE_ID_CPU_PQ:
+		if (hdev->cpu_queues_enable)
+			db_reg_offset = mmCPU_IF_PF_PQ_PI;
+		else
+			invalid_queue = true;
+		break;
+
+	case GOYA_QUEUE_ID_MME:
+		db_reg_offset = mmMME_QM_PQ_PI;
+		break;
+
+	case GOYA_QUEUE_ID_TPC0:
+		db_reg_offset = mmTPC0_QM_PQ_PI;
+		break;
+
+	case GOYA_QUEUE_ID_TPC1:
+		db_reg_offset = mmTPC1_QM_PQ_PI;
+		break;
+
+	case GOYA_QUEUE_ID_TPC2:
+		db_reg_offset = mmTPC2_QM_PQ_PI;
+		break;
+
+	case GOYA_QUEUE_ID_TPC3:
+		db_reg_offset = mmTPC3_QM_PQ_PI;
+		break;
+
+	case GOYA_QUEUE_ID_TPC4:
+		db_reg_offset = mmTPC4_QM_PQ_PI;
+		break;
+
+	case GOYA_QUEUE_ID_TPC5:
+		db_reg_offset = mmTPC5_QM_PQ_PI;
+		break;
+
+	case GOYA_QUEUE_ID_TPC6:
+		db_reg_offset = mmTPC6_QM_PQ_PI;
+		break;
+
+	case GOYA_QUEUE_ID_TPC7:
+		db_reg_offset = mmTPC7_QM_PQ_PI;
+		break;
+
+	default:
+		invalid_queue = true;
+	}
+
+	if (invalid_queue) {
+		/* Should never get here */
+		dev_err(hdev->dev, "h/w queue %d is invalid. Can't set pi\n",
+			hw_queue_id);
+		return;
+	}
+
+	db_value = pi;
+
+	/* ring the doorbell */
+	WREG32(db_reg_offset, db_value);
+
+	if (hw_queue_id == GOYA_QUEUE_ID_CPU_PQ)
+		WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR,
+				GOYA_ASYNC_EVENT_ID_PI_UPDATE);
+}
+
+void goya_flush_pq_write(struct hl_device *hdev, u64 *pq, u64 exp_val)
+{
+	/* Not needed in Goya */
+}
+
 void *goya_dma_alloc_coherent(struct hl_device *hdev, size_t size,
 					dma_addr_t *dma_handle, gfp_t flags)
 {
@@ -1501,6 +2484,315 @@ void goya_dma_free_coherent(struct hl_device *hdev, size_t size, void *cpu_addr,
 	dma_free_coherent(&hdev->pdev->dev, size, cpu_addr, dma_handle);
 }
 
+void *goya_get_int_queue_base(struct hl_device *hdev, u32 queue_id,
+				dma_addr_t *dma_handle,	u16 *queue_len)
+{
+	void *base;
+	u32 offset;
+
+	*dma_handle = hdev->asic_prop.sram_base_address;
+
+	base = hdev->pcie_bar[SRAM_CFG_BAR_ID];
+
+	switch (queue_id) {
+	case GOYA_QUEUE_ID_MME:
+		offset = MME_QMAN_BASE_OFFSET;
+		*queue_len = MME_QMAN_LENGTH;
+		break;
+	case GOYA_QUEUE_ID_TPC0:
+		offset = TPC0_QMAN_BASE_OFFSET;
+		*queue_len = TPC_QMAN_LENGTH;
+		break;
+	case GOYA_QUEUE_ID_TPC1:
+		offset = TPC1_QMAN_BASE_OFFSET;
+		*queue_len = TPC_QMAN_LENGTH;
+		break;
+	case GOYA_QUEUE_ID_TPC2:
+		offset = TPC2_QMAN_BASE_OFFSET;
+		*queue_len = TPC_QMAN_LENGTH;
+		break;
+	case GOYA_QUEUE_ID_TPC3:
+		offset = TPC3_QMAN_BASE_OFFSET;
+		*queue_len = TPC_QMAN_LENGTH;
+		break;
+	case GOYA_QUEUE_ID_TPC4:
+		offset = TPC4_QMAN_BASE_OFFSET;
+		*queue_len = TPC_QMAN_LENGTH;
+		break;
+	case GOYA_QUEUE_ID_TPC5:
+		offset = TPC5_QMAN_BASE_OFFSET;
+		*queue_len = TPC_QMAN_LENGTH;
+		break;
+	case GOYA_QUEUE_ID_TPC6:
+		offset = TPC6_QMAN_BASE_OFFSET;
+		*queue_len = TPC_QMAN_LENGTH;
+		break;
+	case GOYA_QUEUE_ID_TPC7:
+		offset = TPC7_QMAN_BASE_OFFSET;
+		*queue_len = TPC_QMAN_LENGTH;
+		break;
+	default:
+		dev_err(hdev->dev, "Got invalid queue id %d\n", queue_id);
+		return NULL;
+	}
+
+	base += offset;
+	*dma_handle += offset;
+
+	return base;
+}
+
+int goya_send_cpu_message(struct hl_device *hdev, u32 *msg, u16 len,
+				u32 timeout, long *result)
+{
+	struct goya_device *goya = hdev->asic_specific;
+	struct armcp_packet *pkt;
+	dma_addr_t pkt_dma_addr;
+	u32 tmp;
+	int rc = 0;
+
+	if (!(goya->hw_cap_initialized & HW_CAP_CPU_Q)) {
+		if (result)
+			*result = 0;
+		return 0;
+	}
+
+	if (len > CPU_CB_SIZE) {
+		dev_err(hdev->dev, "Invalid CPU message size of %d bytes\n",
+			len);
+		return -ENOMEM;
+	}
+
+	pkt = hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, len,
+								&pkt_dma_addr);
+	if (!pkt) {
+		dev_err(hdev->dev,
+			"Failed to allocate DMA memory for packet to CPU\n");
+		return -ENOMEM;
+	}
+
+	memcpy(pkt, msg, len);
+
+	mutex_lock(&hdev->send_cpu_message_lock);
+
+	if (hdev->disabled)
+		goto out;
+
+	rc = hl_hw_queue_send_cb_no_cmpl(hdev, GOYA_QUEUE_ID_CPU_PQ, len,
+			pkt_dma_addr);
+	if (rc) {
+		dev_err(hdev->dev, "Failed to send CB on CPU PQ (%d)\n", rc);
+		goto out;
+	}
+
+	rc = hl_poll_timeout_memory(hdev, (u64) (uintptr_t) &pkt->fence,
+					timeout, &tmp);
+
+	hl_hw_queue_inc_ci_kernel(hdev, GOYA_QUEUE_ID_CPU_PQ);
+
+	if (rc == -ETIMEDOUT) {
+		dev_err(hdev->dev,
+			"Timeout while waiting for CPU packet fence\n");
+		goto out;
+	}
+
+	if (tmp == ARMCP_PACKET_FENCE_VAL) {
+		rc = (pkt->ctl & ARMCP_PKT_CTL_RC_MASK) >>
+						ARMCP_PKT_CTL_RC_SHIFT;
+		if (rc) {
+			dev_err(hdev->dev,
+				"F/W ERROR %d for CPU packet %d\n",
+				rc, (pkt->ctl & ARMCP_PKT_CTL_OPCODE_MASK)
+						>> ARMCP_PKT_CTL_OPCODE_SHIFT);
+			rc = -EINVAL;
+		} else if (result) {
+			*result = pkt->result;
+		}
+	} else {
+		dev_err(hdev->dev, "CPU packet wrong fence value\n");
+		rc = -EINVAL;
+	}
+
+out:
+	mutex_unlock(&hdev->send_cpu_message_lock);
+
+	hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev, len, pkt);
+
+	return rc;
+}
+
+int goya_test_queue(struct hl_device *hdev, u32 hw_queue_id)
+{
+	struct packet_msg_prot *fence_pkt;
+	dma_addr_t pkt_dma_addr;
+	u32 fence_val, tmp;
+	dma_addr_t fence_dma_addr;
+	u32 *fence_ptr;
+	int rc;
+
+	fence_val = GOYA_QMAN0_FENCE_VAL;
+
+	fence_ptr = hdev->asic_funcs->dma_pool_zalloc(hdev, 4, GFP_KERNEL,
+							&fence_dma_addr);
+	if (!fence_ptr) {
+		dev_err(hdev->dev,
+			"Failed to allocate memory for queue testing\n");
+		return -ENOMEM;
+	}
+
+	*fence_ptr = 0;
+
+	fence_pkt = hdev->asic_funcs->dma_pool_zalloc(hdev,
+					sizeof(struct packet_msg_prot),
+					GFP_KERNEL, &pkt_dma_addr);
+	if (!fence_pkt) {
+		dev_err(hdev->dev,
+			"Failed to allocate packet for queue testing\n");
+		rc = -ENOMEM;
+		goto free_fence_ptr;
+	}
+
+	fence_pkt->ctl = (PACKET_MSG_PROT << GOYA_PKT_CTL_OPCODE_SHIFT) |
+			(1 << GOYA_PKT_CTL_EB_SHIFT) |
+			(1 << GOYA_PKT_CTL_MB_SHIFT);
+	fence_pkt->value = fence_val;
+	fence_pkt->addr = fence_dma_addr +
+				hdev->asic_prop.host_phys_base_address;
+
+	rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id,
+					sizeof(struct packet_msg_prot),
+					pkt_dma_addr);
+	if (rc) {
+		dev_err(hdev->dev,
+			"Failed to send fence packet\n");
+		goto free_pkt;
+	}
+
+	rc = hl_poll_timeout_memory(hdev, (u64) (uintptr_t) fence_ptr,
+					GOYA_TEST_QUEUE_WAIT_USEC, &tmp);
+
+	hl_hw_queue_inc_ci_kernel(hdev, hw_queue_id);
+
+	if ((!rc) && (tmp == fence_val)) {
+		dev_info(hdev->dev,
+			"queue test on H/W queue %d succeeded\n",
+			hw_queue_id);
+	} else {
+		dev_err(hdev->dev,
+			"H/W queue %d test failed (scratch(0x%08llX) == 0x%08X)\n",
+			hw_queue_id, (unsigned long long) fence_dma_addr, tmp);
+		rc = -EINVAL;
+	}
+
+free_pkt:
+	hdev->asic_funcs->dma_pool_free(hdev, (void *) fence_pkt,
+					pkt_dma_addr);
+free_fence_ptr:
+	hdev->asic_funcs->dma_pool_free(hdev, (void *) fence_ptr,
+					fence_dma_addr);
+	return rc;
+}
+
+int goya_test_cpu_queue(struct hl_device *hdev)
+{
+	struct armcp_packet test_pkt;
+	long result;
+	int rc;
+
+	/* cpu_queues_enable flag is always checked in send cpu message */
+
+	memset(&test_pkt, 0, sizeof(test_pkt));
+
+	test_pkt.ctl = ARMCP_PACKET_TEST << ARMCP_PKT_CTL_OPCODE_SHIFT;
+	test_pkt.value = ARMCP_PACKET_FENCE_VAL;
+
+	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &test_pkt,
+			sizeof(test_pkt), HL_DEVICE_TIMEOUT_USEC, &result);
+
+	if (!rc)
+		dev_info(hdev->dev, "queue test on CPU queue succeeded\n");
+	else
+		dev_err(hdev->dev, "CPU queue test failed (0x%08lX)\n", result);
+
+	return rc;
+}
+
+static int goya_test_queues(struct hl_device *hdev)
+{
+	struct goya_device *goya = hdev->asic_specific;
+	int i, rc, ret_val = 0;
+
+	for (i = 0 ; i < NUMBER_OF_EXT_HW_QUEUES ; i++) {
+		rc = goya_test_queue(hdev, i);
+		if (rc)
+			ret_val = -EINVAL;
+	}
+
+	if (hdev->cpu_queues_enable) {
+		rc = goya->test_cpu_queue(hdev);
+		if (rc)
+			ret_val = -EINVAL;
+	}
+
+	return ret_val;
+}
+
+void *goya_dma_pool_zalloc(struct hl_device *hdev, size_t size, gfp_t mem_flags,
+				dma_addr_t *dma_handle)
+{
+	if (size > GOYA_DMA_POOL_BLK_SIZE)
+		return NULL;
+
+	return dma_pool_zalloc(hdev->dma_pool, mem_flags, dma_handle);
+}
+
+void goya_dma_pool_free(struct hl_device *hdev, void *vaddr,
+			dma_addr_t dma_addr)
+{
+	dma_pool_free(hdev->dma_pool, vaddr, dma_addr);
+}
+
+void *goya_cpu_accessible_dma_pool_alloc(struct hl_device *hdev, size_t size,
+			dma_addr_t *dma_handle)
+{
+	u64 kernel_addr;
+
+	/* roundup to CPU_PKT_SIZE */
+	size = (size + (CPU_PKT_SIZE - 1)) & CPU_PKT_MASK;
+
+	kernel_addr = gen_pool_alloc(hdev->cpu_accessible_dma_pool, size);
+
+	*dma_handle = hdev->cpu_accessible_dma_address +
+		(kernel_addr - (u64) (uintptr_t) hdev->cpu_accessible_dma_mem);
+
+	return (void *) (uintptr_t) kernel_addr;
+}
+
+void goya_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
+			void *vaddr)
+{
+	/* roundup to CPU_PKT_SIZE */
+	size = (size + (CPU_PKT_SIZE - 1)) & CPU_PKT_MASK;
+
+	gen_pool_free(hdev->cpu_accessible_dma_pool, (u64) (uintptr_t) vaddr,
+			size);
+}
+
+
+static void goya_hw_queues_lock(struct hl_device *hdev)
+{
+	struct goya_device *goya = hdev->asic_specific;
+
+	spin_lock(&goya->hw_queues_lock);
+}
+
+static void goya_hw_queues_unlock(struct hl_device *hdev)
+{
+	struct goya_device *goya = hdev->asic_specific;
+
+	spin_unlock(&goya->hw_queues_lock);
+}
+
 static const struct hl_asic_funcs goya_funcs = {
 	.early_init = goya_early_init,
 	.early_fini = goya_early_fini,
@@ -1512,8 +2804,19 @@ static const struct hl_asic_funcs goya_funcs = {
 	.resume = goya_resume,
 	.mmap = goya_mmap,
 	.cb_mmap = goya_cb_mmap,
+	.ring_doorbell = goya_ring_doorbell,
+	.flush_pq_write = goya_flush_pq_write,
 	.dma_alloc_coherent = goya_dma_alloc_coherent,
 	.dma_free_coherent = goya_dma_free_coherent,
+	.get_int_queue_base = goya_get_int_queue_base,
+	.test_queues = goya_test_queues,
+	.dma_pool_zalloc = goya_dma_pool_zalloc,
+	.dma_pool_free = goya_dma_pool_free,
+	.cpu_accessible_dma_pool_alloc = goya_cpu_accessible_dma_pool_alloc,
+	.cpu_accessible_dma_pool_free = goya_cpu_accessible_dma_pool_free,
+	.hw_queues_lock = goya_hw_queues_lock,
+	.hw_queues_unlock = goya_hw_queues_unlock,
+	.send_cpu_message = goya_send_cpu_message
 };
 
 /*
diff --git a/drivers/misc/habanalabs/goya/goyaP.h b/drivers/misc/habanalabs/goya/goyaP.h
index 65cbb45d7083..791605cbecfe 100644
--- a/drivers/misc/habanalabs/goya/goyaP.h
+++ b/drivers/misc/habanalabs/goya/goyaP.h
@@ -11,7 +11,9 @@
 #include <uapi/misc/habanalabs.h>
 #include "habanalabs.h"
 #include "include/hl_boot_if.h"
+#include "include/goya/goya_packets.h"
 #include "include/goya/goya.h"
+#include "include/goya/goya_async_events.h"
 #include "include/goya/goya_fw_if.h"
 
 #define NUMBER_OF_CMPLT_QUEUES		5
@@ -145,12 +147,17 @@ enum goya_fw_component {
 };
 
 struct goya_device {
+	int (*test_cpu_queue)(struct hl_device *hdev);
+
 	/* TODO: remove hw_queues_lock after moving to scheduler code */
 	spinlock_t	hw_queues_lock;
 	u64		ddr_bar_cur_addr;
 	u32		hw_cap_initialized;
 };
 
+int goya_test_cpu_queue(struct hl_device *hdev);
+int goya_send_cpu_message(struct hl_device *hdev, u32 *msg, u16 len,
+				u32 timeout, long *result);
 void goya_init_security(struct hl_device *hdev);
 
 #endif /* GOYAP_H_ */
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index e099f7a9dac2..2121babbebdc 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -9,6 +9,7 @@
 #define HABANALABSP_H_
 
 #include "include/armcp_if.h"
+#include "include/qman_if.h"
 
 #define pr_fmt(fmt)			"habanalabs: " fmt
 
@@ -26,9 +27,36 @@
 struct hl_device;
 struct hl_fpriv;
 
+/**
+ * enum hl_queue_type - Supported QUEUE types.
+ * @QUEUE_TYPE_NA: queue is not available.
+ * @QUEUE_TYPE_EXT: external queue which is a DMA channel that may access the
+ *                  host.
+ * @QUEUE_TYPE_INT: internal queue that performs DMA inside the device's
+ *			memories and/or operates the compute engines.
+ * @QUEUE_TYPE_CPU: S/W queue for communication with the device's CPU.
+ */
+enum hl_queue_type {
+	QUEUE_TYPE_NA,
+	QUEUE_TYPE_EXT,
+	QUEUE_TYPE_INT,
+	QUEUE_TYPE_CPU
+};
+
+/**
+ * struct hw_queue_properties - queue information.
+ * @type: queue type.
+ * @kmd_only: true if only KMD is allowed to send a job to this queue, false
+ *            otherwise.
+ */
+struct hw_queue_properties {
+	enum hl_queue_type	type;
+	u8			kmd_only;
+};
 
 /**
  * struct asic_fixed_properties - ASIC specific immutable properties.
+ * @hw_queues_props: H/W queues properties.
  * @uboot_ver: F/W U-boot version.
  * @preboot_ver: F/W Preboot version.
  * @sram_base_address: SRAM physical start address.
@@ -59,6 +87,7 @@ struct hl_fpriv;
  * @tpc_enabled_mask: which TPCs are enabled.
  */
 struct asic_fixed_properties {
+	struct hw_queue_properties	hw_queues_props[HL_MAX_QUEUES];
 	char			uboot_ver[VERSION_MAX_LEN];
 	char			preboot_ver[VERSION_MAX_LEN];
 	u64			sram_base_address;
@@ -132,7 +161,89 @@ struct hl_cb {
 };
 
 
+/*
+ * QUEUES
+ */
+
+struct hl_cs_job;
+
+/*
+ * Currently, there are two limitations on the maximum length of a queue:
+ *
+ * 1. The memory footprint of the queue. The current allocated space for the
+ *    queue is PAGE_SIZE. Because each entry in the queue is HL_BD_SIZE,
+ *    the maximum length of the queue can be PAGE_SIZE / HL_BD_SIZE,
+ *    which currently is 4096/16 = 256 entries.
+ *
+ *    To increase that, we need either to decrease the size of the
+ *    BD (difficult), or allocate more than a single page (easier).
+ *
+ * 2. Because the size of the JOB handle field in the BD CTL / completion queue
+ *    is 10-bit, we can have up to 1024 open jobs per hardware queue.
+ *    Therefore, each queue can hold up to 1024 entries.
+ *
+ * HL_QUEUE_LENGTH is in units of struct hl_bd.
+ * HL_QUEUE_LENGTH * sizeof(struct hl_bd) should be <= HL_PAGE_SIZE
+ */
+
+#define HL_PAGE_SIZE			4096 /* minimum page size */
+/* Must be power of 2 (HL_PAGE_SIZE / HL_BD_SIZE) */
 #define HL_QUEUE_LENGTH			256
+#define HL_QUEUE_SIZE_IN_BYTES		(HL_QUEUE_LENGTH * HL_BD_SIZE)
+
+/*
+ * HL_CQ_LENGTH is in units of struct hl_cq_entry.
+ * HL_CQ_LENGTH should be <= HL_PAGE_SIZE
+ */
+#define HL_CQ_LENGTH			HL_QUEUE_LENGTH
+#define HL_CQ_SIZE_IN_BYTES		(HL_CQ_LENGTH * HL_CQ_ENTRY_SIZE)
+
+
+
+/**
+ * struct hl_hw_queue - describes a H/W transport queue.
+ * @shadow_queue: pointer to a shadow queue that holds pointers to jobs.
+ * @queue_type: type of queue.
+ * @kernel_address: holds the queue's kernel virtual address.
+ * @bus_address: holds the queue's DMA address.
+ * @pi: holds the queue's pi value.
+ * @ci: holds the queue's ci value, AS CALCULATED BY THE DRIVER (not real ci).
+ * @hw_queue_id: the id of the H/W queue.
+ * @int_queue_len: length of internal queue (number of entries).
+ * @valid: is the queue valid (we have array of 32 queues, not all of them
+ *		exists).
+ */
+struct hl_hw_queue {
+	struct hl_cs_job	**shadow_queue;
+	enum hl_queue_type	queue_type;
+	u64			kernel_address;
+	dma_addr_t		bus_address;
+	u32			pi;
+	u32			ci;
+	u32			hw_queue_id;
+	u16			int_queue_len;
+	u8			valid;
+};
+
+/**
+ * struct hl_cq - describes a completion queue
+ * @hdev: pointer to the device structure
+ * @kernel_address: holds the queue's kernel virtual address
+ * @bus_address: holds the queue's DMA address
+ * @hw_queue_id: the id of the matching H/W queue
+ * @ci: ci inside the queue
+ * @pi: pi inside the queue
+ * @free_slots_cnt: counter of free slots in queue
+ */
+struct hl_cq {
+	struct hl_device	*hdev;
+	u64			kernel_address;
+	dma_addr_t		bus_address;
+	u32			hw_queue_id;
+	u32			ci;
+	u32			pi;
+	atomic_t		free_slots_cnt;
+};
 
 
 /*
@@ -164,6 +275,8 @@ enum hl_asic_type {
  * @resume: handles IP specific H/W or SW changes for resume.
  * @mmap: mmap function, does nothing.
  * @cb_mmap: maps a CB.
+ * @ring_doorbell: increment PI on a given QMAN.
+ * @flush_pq_write: flush PQ entry write if necessary, WARN if flushing failed.
  * @dma_alloc_coherent: Allocate coherent DMA memory by calling
  *                      dma_alloc_coherent(). This is ASIC function because its
  *                      implementation is not trivial when the driver is loaded
@@ -172,6 +285,16 @@ enum hl_asic_type {
  *                     This is ASIC function because its implementation is not
  *                     trivial when the driver is loaded in simulation mode
  *                     (not upstreamed).
+ * @get_int_queue_base: get the internal queue base address.
+ * @test_queues: run simple test on all queues for sanity check.
+ * @dma_pool_zalloc: small DMA allocation of coherent memory from DMA pool.
+ *                   size of allocation is HL_DMA_POOL_BLK_SIZE.
+ * @dma_pool_free: free small DMA allocation from pool.
+ * @cpu_accessible_dma_pool_alloc: allocate CPU PQ packet from DMA pool.
+ * @cpu_accessible_dma_pool_free: free CPU PQ packet from DMA pool.
+ * @hw_queues_lock: acquire H/W queues lock.
+ * @hw_queues_unlock: release H/W queues lock.
+ * @send_cpu_message: send buffer to ArmCP.
  */
 struct hl_asic_funcs {
 	int (*early_init)(struct hl_device *hdev);
@@ -185,10 +308,27 @@ struct hl_asic_funcs {
 	int (*mmap)(struct hl_fpriv *hpriv, struct vm_area_struct *vma);
 	int (*cb_mmap)(struct hl_device *hdev, struct vm_area_struct *vma,
 			u64 kaddress, phys_addr_t paddress, u32 size);
+	void (*ring_doorbell)(struct hl_device *hdev, u32 hw_queue_id, u32 pi);
+	void (*flush_pq_write)(struct hl_device *hdev, u64 *pq, u64 exp_val);
 	void* (*dma_alloc_coherent)(struct hl_device *hdev, size_t size,
 					dma_addr_t *dma_handle, gfp_t flag);
 	void (*dma_free_coherent)(struct hl_device *hdev, size_t size,
 					void *cpu_addr, dma_addr_t dma_handle);
+	void* (*get_int_queue_base)(struct hl_device *hdev, u32 queue_id,
+				dma_addr_t *dma_handle, u16 *queue_len);
+	int (*test_queues)(struct hl_device *hdev);
+	void* (*dma_pool_zalloc)(struct hl_device *hdev, size_t size,
+				gfp_t mem_flags, dma_addr_t *dma_handle);
+	void (*dma_pool_free)(struct hl_device *hdev, void *vaddr,
+				dma_addr_t dma_addr);
+	void* (*cpu_accessible_dma_pool_alloc)(struct hl_device *hdev,
+				size_t size, dma_addr_t *dma_handle);
+	void (*cpu_accessible_dma_pool_free)(struct hl_device *hdev,
+				size_t size, void *vaddr);
+	void (*hw_queues_lock)(struct hl_device *hdev);
+	void (*hw_queues_unlock)(struct hl_device *hdev);
+	int (*send_cpu_message)(struct hl_device *hdev, u32 *msg,
+				u16 len, u32 timeout, long *result);
 };
 
 
@@ -224,6 +364,17 @@ struct hl_ctx_mgr {
 };
 
 
+
+
+/**
+ * struct hl_cs_job - command submission job.
+ * @finish_work: workqueue object to run when job is completed.
+ * @id: the id of this job inside a CS.
+ */
+struct hl_cs_job {
+	struct work_struct	finish_work;
+	u32			id;
+};
 /*
  * FILE PRIVATE STRUCTURE
  */
@@ -298,7 +449,11 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val);
  * @dev: realted kernel basic device structure.
  * @asic_name: ASIC specific nmae.
  * @asic_type: ASIC specific type.
+ * @completion_queue: array of hl_cq.
+ * @cq_wq: work queue of completion queues for executing work in process context
+ * @eq_wq: work queue of event queue for executing work in process context.
  * @kernel_ctx: KMD context structure.
+ * @kernel_queues: array of hl_hw_queue.
  * @kernel_cb_mgr: command buffer manager for creating/destroying/handling CGs.
  * @dma_pool: DMA pool for small allocations.
  * @cpu_accessible_dma_mem: KMD <-> ArmCP shared memory CPU address.
@@ -312,6 +467,7 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val);
  *                    only a single process at a time. In addition, we need a
  *                    lock here so we can flush user processes which are opening
  *                    the device while we are trying to hard reset it
+ * @send_cpu_message_lock: enforces only one message in KMD <-> ArmCP queue.
  * @asic_prop: ASIC specific immutable properties.
  * @asic_funcs: ASIC specific functions.
  * @asic_specific: ASIC specific information to use only from ASIC files.
@@ -331,7 +487,10 @@ struct hl_device {
 	struct device			*dev;
 	char				asic_name[16];
 	enum hl_asic_type		asic_type;
+	struct hl_cq			*completion_queue;
+	struct workqueue_struct		*cq_wq;
 	struct hl_ctx			*kernel_ctx;
+	struct hl_hw_queue		*kernel_queues;
 	struct hl_cb_mgr		kernel_cb_mgr;
 	struct dma_pool			*dma_pool;
 	void				*cpu_accessible_dma_mem;
@@ -341,6 +500,7 @@ struct hl_device {
 	struct mutex			asid_mutex;
 	/* TODO: remove fd_open_cnt_lock for multiple process support */
 	struct mutex			fd_open_cnt_lock;
+	struct mutex			send_cpu_message_lock;
 	struct asic_fixed_properties	asic_prop;
 	const struct hl_asic_funcs	*asic_funcs;
 	void				*asic_specific;
@@ -358,6 +518,7 @@ struct hl_device {
 	/* Parameters for bring-up */
 	u8				cpu_enable;
 	u8				reset_pcilink;
+	u8				cpu_queues_enable;
 	u8				fw_loading;
 	u8				pldm;
 };
@@ -400,7 +561,18 @@ int hl_poll_timeout_memory(struct hl_device *hdev, u64 addr, u32 timeout_us,
 				u32 *val);
 int hl_poll_timeout_device_memory(struct hl_device *hdev, void __iomem *addr,
 				u32 timeout_us, u32 *val);
-
+int hl_hw_queues_create(struct hl_device *hdev);
+void hl_hw_queues_destroy(struct hl_device *hdev);
+int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
+				u32 cb_size, u64 cb_ptr);
+u32 hl_hw_queue_add_ptr(u32 ptr, u16 val);
+void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id);
+
+#define hl_queue_inc_ptr(p)		hl_hw_queue_add_ptr(p, 1)
+#define hl_pi_2_offset(pi)		((pi) & (HL_QUEUE_LENGTH - 1))
+
+int hl_cq_init(struct hl_device *hdev, struct hl_cq *q, u32 hw_queue_id);
+void hl_cq_fini(struct hl_device *hdev, struct hl_cq *q);
 int hl_asid_init(struct hl_device *hdev);
 void hl_asid_fini(struct hl_device *hdev);
 unsigned long hl_asid_alloc(struct hl_device *hdev);
diff --git a/drivers/misc/habanalabs/habanalabs_drv.c b/drivers/misc/habanalabs/habanalabs_drv.c
index 59c2fd196659..93576249307b 100644
--- a/drivers/misc/habanalabs/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/habanalabs_drv.c
@@ -169,6 +169,7 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
 	/* Parameters for bring-up - set them to defaults */
 	hdev->cpu_enable = 1;
 	hdev->reset_pcilink = 0;
+	hdev->cpu_queues_enable = 1;
 	hdev->fw_loading = 1;
 	hdev->pldm = 0;
 
@@ -176,6 +177,10 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
 	if (!hdev->cpu_enable)
 		hdev->fw_loading = 0;
 
+	/* If we don't load FW, no need to initialize CPU queues */
+	if (!hdev->fw_loading)
+		hdev->cpu_queues_enable = 0;
+
 	hdev->disabled = true;
 	hdev->pdev = pdev; /* can be NULL in case of simulator device */
 
diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c
new file mode 100644
index 000000000000..2ec43f36cdb8
--- /dev/null
+++ b/drivers/misc/habanalabs/hw_queue.c
@@ -0,0 +1,400 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright 2016-2019 HabanaLabs, Ltd.
+ * All Rights Reserved.
+ */
+
+#include "habanalabs.h"
+
+#include <linux/slab.h>
+
+/*
+ * hl_queue_add_ptr - add to pi or ci and checks if it wraps around
+ *
+ * @ptr: the current pi/ci value
+ * @val: the amount to add
+ *
+ * Add val to ptr. It can go until twice the queue length.
+ */
+inline u32 hl_hw_queue_add_ptr(u32 ptr, u16 val)
+{
+	ptr += val;
+	ptr &= ((HL_QUEUE_LENGTH << 1) - 1);
+	return ptr;
+}
+
+static inline int queue_free_slots(struct hl_hw_queue *q, u32 queue_len)
+{
+	int delta = (q->pi - q->ci);
+
+	if (delta >= 0)
+		return (queue_len - delta);
+	else
+		return (abs(delta) - queue_len);
+}
+
+/*
+ * ext_queue_submit_bd - Submit a buffer descriptor to an external queue
+ *
+ * @hdev: pointer to habanalabs device structure
+ * @q: pointer to habanalabs queue structure
+ * @ctl: BD's control word
+ * @len: BD's length
+ * @ptr: BD's pointer
+ *
+ * This function assumes there is enough space on the queue to submit a new
+ * BD to it. It initializes the next BD and calls the device specific
+ * function to set the pi (and doorbell)
+ *
+ * This function must be called when the scheduler mutex is taken
+ *
+ */
+static void ext_queue_submit_bd(struct hl_device *hdev, struct hl_hw_queue *q,
+				u32 ctl, u32 len, u64 ptr)
+{
+	struct hl_bd *bd;
+
+	bd = (struct hl_bd *) (uintptr_t) q->kernel_address;
+	bd += hl_pi_2_offset(q->pi);
+	bd->ctl = ctl;
+	bd->len = len;
+	bd->ptr = ptr + hdev->asic_prop.host_phys_base_address;
+
+	q->pi = hl_queue_inc_ptr(q->pi);
+	hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi);
+}
+
+/*
+ * ext_queue_sanity_checks - perform some sanity checks on external queue
+ *
+ * @hdev              : pointer to hl_device structure
+ * @q                 :	pointer to hl_hw_queue structure
+ * @num_of_entries    : how many entries to check for space
+ * @reserve_cq_entry  :	whether to reserve an entry in the cq
+ *
+ * H/W queues spinlock should be taken before calling this function
+ *
+ * Perform the following:
+ * - Make sure we have enough space in the h/w queue
+ * - Make sure we have enough space in the completion queue
+ * - Reserve space in the completion queue (needs to be reversed if there
+ *   is a failure down the road before the actual submission of work). Only
+ *   do this action if reserve_cq_entry is true
+ *
+ */
+static int ext_queue_sanity_checks(struct hl_device *hdev,
+				struct hl_hw_queue *q, int num_of_entries,
+				bool reserve_cq_entry)
+{
+	atomic_t *free_slots =
+			&hdev->completion_queue[q->hw_queue_id].free_slots_cnt;
+	int free_slots_cnt;
+
+	/* Check we have enough space in the queue */
+	free_slots_cnt = queue_free_slots(q, HL_QUEUE_LENGTH);
+
+	if (free_slots_cnt < num_of_entries) {
+		dev_dbg(hdev->dev, "Queue %d doesn't have room for %d CBs\n",
+			q->hw_queue_id, num_of_entries);
+		return -EAGAIN;
+	}
+
+	if (reserve_cq_entry) {
+		/*
+		 * Check we have enough space in the completion queue
+		 * Add -1 to counter (decrement) unless counter was already 0
+		 * In that case, CQ is full so we can't submit a new CB because
+		 * we won't get ack on its completion
+		 * atomic_add_unless will return 0 if counter was already 0
+		 */
+		if (atomic_add_negative(num_of_entries * -1, free_slots)) {
+			dev_dbg(hdev->dev, "No space for %d on CQ %d\n",
+				num_of_entries, q->hw_queue_id);
+			atomic_add(num_of_entries, free_slots);
+			return -EAGAIN;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * hl_hw_queue_send_cb_no_cmpl - send a single CB (not a JOB) without completion
+ *
+ * @hdev: pointer to hl_device structure
+ * @hw_queue_id: Queue's type
+ * @cb_size: size of CB
+ * @cb_ptr: pointer to CB location
+ *
+ * This function sends a single CB, that must NOT generate a completion entry
+ *
+ */
+int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
+				u32 cb_size, u64 cb_ptr)
+{
+	struct hl_hw_queue *q = &hdev->kernel_queues[hw_queue_id];
+	int rc;
+
+	/*
+	 * The CPU queue is a synchronous queue with an effective depth of
+	 * a single entry (although it is allocated with room for multiple
+	 * entries). Therefore, there is a different lock, called
+	 * send_cpu_message_lock, that serializes accesses to the CPU queue.
+	 * As a result, we don't need to lock the access to the entire H/W
+	 * queues module when submitting a JOB to the CPU queue
+	 */
+	if (q->queue_type != QUEUE_TYPE_CPU)
+		hdev->asic_funcs->hw_queues_lock(hdev);
+
+	if (hdev->disabled) {
+		rc = -EPERM;
+		goto out;
+	}
+
+	rc = ext_queue_sanity_checks(hdev, q, 1, false);
+	if (rc)
+		goto out;
+
+	ext_queue_submit_bd(hdev, q, 0, cb_size, cb_ptr);
+
+out:
+	if (q->queue_type != QUEUE_TYPE_CPU)
+		hdev->asic_funcs->hw_queues_unlock(hdev);
+
+	return rc;
+}
+
+/*
+ * hl_hw_queue_inc_ci_kernel - increment ci for kernel's queue
+ *
+ * @hdev: pointer to hl_device structure
+ * @hw_queue_id: which queue to increment its ci
+ */
+void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id)
+{
+	struct hl_hw_queue *q = &hdev->kernel_queues[hw_queue_id];
+
+	q->ci = hl_queue_inc_ptr(q->ci);
+}
+
+static int ext_and_cpu_hw_queue_init(struct hl_device *hdev,
+					struct hl_hw_queue *q)
+{
+	void *p;
+	int rc;
+
+	p = hdev->asic_funcs->dma_alloc_coherent(hdev,
+				HL_QUEUE_SIZE_IN_BYTES,
+				&q->bus_address, GFP_KERNEL | __GFP_ZERO);
+	if (!p)
+		return -ENOMEM;
+
+	q->kernel_address = (u64) (uintptr_t) p;
+
+	q->shadow_queue = kmalloc_array(HL_QUEUE_LENGTH,
+					sizeof(*q->shadow_queue),
+					GFP_KERNEL);
+	if (!q->shadow_queue) {
+		dev_err(hdev->dev,
+			"Failed to allocate shadow queue for H/W queue %d\n",
+			q->hw_queue_id);
+		rc = -ENOMEM;
+		goto free_queue;
+	}
+
+	/* Make sure read/write pointers are initialized to start of queue */
+	q->ci = 0;
+	q->pi = 0;
+
+	return 0;
+
+free_queue:
+	hdev->asic_funcs->dma_free_coherent(hdev, HL_QUEUE_SIZE_IN_BYTES,
+			(void *) (uintptr_t) q->kernel_address, q->bus_address);
+
+	return rc;
+}
+
+static int int_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
+{
+	void *p;
+
+	p = hdev->asic_funcs->get_int_queue_base(hdev, q->hw_queue_id,
+					&q->bus_address, &q->int_queue_len);
+	if (!p) {
+		dev_err(hdev->dev,
+			"Failed to get base address for internal queue %d\n",
+			q->hw_queue_id);
+		return -EFAULT;
+	}
+
+	q->kernel_address = (u64) (uintptr_t) p;
+	q->pi = 0;
+	q->ci = 0;
+
+	return 0;
+}
+
+static int cpu_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
+{
+	return ext_and_cpu_hw_queue_init(hdev, q);
+}
+
+static int ext_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
+{
+	return ext_and_cpu_hw_queue_init(hdev, q);
+}
+
+/*
+ * hw_queue_init - main initialization function for H/W queue object
+ *
+ * @hdev: pointer to hl_device device structure
+ * @q: pointer to hl_hw_queue queue structure
+ * @hw_queue_id: The id of the H/W queue
+ *
+ * Allocate dma-able memory for the queue and initialize fields
+ * Returns 0 on success
+ */
+static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
+			u32 hw_queue_id)
+{
+	int rc;
+
+	BUILD_BUG_ON(HL_QUEUE_SIZE_IN_BYTES > HL_PAGE_SIZE);
+
+	q->hw_queue_id = hw_queue_id;
+
+	switch (q->queue_type) {
+	case QUEUE_TYPE_EXT:
+		rc = ext_hw_queue_init(hdev, q);
+		break;
+
+	case QUEUE_TYPE_INT:
+		rc = int_hw_queue_init(hdev, q);
+		break;
+
+	case QUEUE_TYPE_CPU:
+		rc = cpu_hw_queue_init(hdev, q);
+		break;
+
+	case QUEUE_TYPE_NA:
+		q->valid = 0;
+		return 0;
+
+	default:
+		dev_crit(hdev->dev, "wrong queue type %d during init\n",
+			q->queue_type);
+		rc = -EINVAL;
+		break;
+	}
+
+	if (rc)
+		return rc;
+
+	q->valid = 1;
+
+	return 0;
+}
+
+/*
+ * hw_queue_fini - destroy queue
+ *
+ * @hdev: pointer to hl_device device structure
+ * @q: pointer to hl_hw_queue queue structure
+ *
+ * Free the queue memory
+ */
+static void hw_queue_fini(struct hl_device *hdev, struct hl_hw_queue *q)
+{
+	if (!q->valid)
+		return;
+
+	/*
+	 * If we arrived here, there are no jobs waiting on this queue
+	 * so we can safely remove it.
+	 * This is because this function can only called when:
+	 * 1. Either a context is deleted, which only can occur if all its
+	 *    jobs were finished
+	 * 2. A context wasn't able to be created due to failure or timeout,
+	 *    which means there are no jobs on the queue yet
+	 *
+	 * The only exception are the queues of the kernel context, but
+	 * if they are being destroyed, it means that the entire module is
+	 * being removed. If the module is removed, it means there is no open
+	 * user context. It also means that if a job was submitted by
+	 * the kernel driver (e.g. context creation), the job itself was
+	 * released by the kernel driver when a timeout occurred on its
+	 * Completion. Thus, we don't need to release it again.
+	 */
+
+	if (q->queue_type == QUEUE_TYPE_INT)
+		return;
+
+	kfree(q->shadow_queue);
+
+	hdev->asic_funcs->dma_free_coherent(hdev, HL_QUEUE_SIZE_IN_BYTES,
+			(void *) (uintptr_t) q->kernel_address, q->bus_address);
+}
+
+int hl_hw_queues_create(struct hl_device *hdev)
+{
+	struct asic_fixed_properties *asic = &hdev->asic_prop;
+	struct hl_hw_queue *q;
+	int i, rc, q_ready_cnt;
+
+	hdev->kernel_queues = kcalloc(HL_MAX_QUEUES,
+				sizeof(*hdev->kernel_queues), GFP_KERNEL);
+
+	if (!hdev->kernel_queues) {
+		dev_err(hdev->dev, "Not enough memory for H/W queues\n");
+		return -ENOMEM;
+	}
+
+	/* Initialize the H/W queues */
+	for (i = 0, q_ready_cnt = 0, q = hdev->kernel_queues;
+			i < HL_MAX_QUEUES ; i++, q_ready_cnt++, q++) {
+
+		q->queue_type = asic->hw_queues_props[i].type;
+		rc = hw_queue_init(hdev, q, i);
+		if (rc) {
+			dev_err(hdev->dev,
+				"failed to initialize queue %d\n", i);
+			goto release_queues;
+		}
+	}
+
+	return 0;
+
+release_queues:
+	for (i = 0, q = hdev->kernel_queues ; i < q_ready_cnt ; i++, q++)
+		hw_queue_fini(hdev, q);
+
+	kfree(hdev->kernel_queues);
+
+	return rc;
+}
+
+void hl_hw_queues_destroy(struct hl_device *hdev)
+{
+	struct hl_hw_queue *q;
+	int i;
+
+	for (i = 0, q = hdev->kernel_queues ; i < HL_MAX_QUEUES ; i++, q++)
+		hw_queue_fini(hdev, q);
+
+	kfree(hdev->kernel_queues);
+}
+
+void hl_hw_queue_reset(struct hl_device *hdev, bool hard_reset)
+{
+	struct hl_hw_queue *q;
+	int i;
+
+	for (i = 0, q = hdev->kernel_queues ; i < HL_MAX_QUEUES ; i++, q++) {
+		if ((!q->valid) ||
+			((!hard_reset) && (q->queue_type == QUEUE_TYPE_CPU)))
+			continue;
+		q->pi = q->ci = 0;
+	}
+}
diff --git a/drivers/misc/habanalabs/include/armcp_if.h b/drivers/misc/habanalabs/include/armcp_if.h
index 85fc2efe144b..cc37003aa6b7 100644
--- a/drivers/misc/habanalabs/include/armcp_if.h
+++ b/drivers/misc/habanalabs/include/armcp_if.h
@@ -10,10 +10,302 @@
 
 #include <linux/types.h>
 
+enum pq_init_status {
+	PQ_INIT_STATUS_NA = 0,
+	PQ_INIT_STATUS_READY_FOR_CP,
+	PQ_INIT_STATUS_READY_FOR_HOST
+};
+
+/*
+ * ArmCP Primary Queue Packets
+ *
+ * During normal operation, KMD needs to send various messages to ArmCP,
+ * usually either to SET some value into a H/W periphery or to GET the current
+ * value of some H/W periphery. For example, SET the frequency of MME/TPC and
+ * GET the value of the thermal sensor.
+ *
+ * These messages can be initiated either by the User application or by KMD
+ * itself, e.g. power management code. In either case, the communication from
+ * KMD to ArmCP will *always* be in synchronous mode, meaning that KMD will
+ * send a single message and poll until the message was acknowledged and the
+ * results are ready (if results are needed).
+ *
+ * This means that only a single message can be sent at a time and KMD must
+ * wait for its result before sending the next message. Having said that,
+ * because these are control messages which are sent in a relatively low
+ * frequency, this limitation seems acceptable. It's important to note that
+ * in case of multiple devices, messages to different devices *can* be sent
+ * at the same time.
+ *
+ * The message, inputs/outputs (if relevant) and fence object will be located
+ * on the device DDR at an address that will be determined by KMD. During
+ * device initialization phase, KMD will pass to ArmCP that address.  Most of
+ * the message types will contain inputs/outputs inside the message itself.
+ * The common part of each message will contain the opcode of the message (its
+ * type) and a field representing a fence object.
+ *
+ * When KMD wishes to send a message to ArmCP, it will write the message
+ * contents to the device DDR, clear the fence object and then write the
+ * value 484 to the mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR register to issue
+ * the 484 interrupt-id to the ARM core.
+ *
+ * Upon receiving the 484 interrupt-id, ArmCP will read the message from the
+ * DDR. In case the message is a SET operation, ArmCP will first perform the
+ * operation and then write to the fence object on the device DDR. In case the
+ * message is a GET operation, ArmCP will first fill the results section on the
+ * device DDR and then write to the fence object. If an error occurred, ArmCP
+ * will fill the rc field with the right error code.
+ *
+ * In the meantime, KMD will poll on the fence object. Once KMD sees that the
+ * fence object is signaled, it will read the results from the device DDR
+ * (if relevant) and resume the code execution in KMD.
+ *
+ * To use QMAN packets, the opcode must be the QMAN opcode, shifted by 8
+ * so the value being put by the KMD matches the value read by ArmCP
+ *
+ * Non-QMAN packets should be limited to values 1 through (2^8 - 1)
+ *
+ * Detailed description:
+ *
+ * ARMCP_PACKET_DISABLE_PCI_ACCESS -
+ *       After receiving this packet the embedded CPU must NOT issue PCI
+ *       transactions (read/write) towards the Host CPU. This also include
+ *       sending MSI-X interrupts.
+ *       This packet is usually sent before the device is moved to D3Hot state.
+ *
+ * ARMCP_PACKET_ENABLE_PCI_ACCESS -
+ *       After receiving this packet the embedded CPU is allowed to issue PCI
+ *       transactions towards the Host CPU, including sending MSI-X interrupts.
+ *       This packet is usually send after the device is moved to D0 state.
+ *
+ * ARMCP_PACKET_TEMPERATURE_GET -
+ *       Fetch the current temperature / Max / Max Hyst / Critical /
+ *       Critical Hyst of a specified thermal sensor. The packet's
+ *       arguments specify the desired sensor and the field to get.
+ *
+ * ARMCP_PACKET_VOLTAGE_GET -
+ *       Fetch the voltage / Max / Min of a specified sensor. The packet's
+ *       arguments specify the sensor and type.
+ *
+ * ARMCP_PACKET_CURRENT_GET -
+ *       Fetch the current / Max / Min of a specified sensor. The packet's
+ *       arguments specify the sensor and type.
+ *
+ * ARMCP_PACKET_FAN_SPEED_GET -
+ *       Fetch the speed / Max / Min of a specified fan. The packet's
+ *       arguments specify the sensor and type.
+ *
+ * ARMCP_PACKET_PWM_GET -
+ *       Fetch the pwm value / mode of a specified pwm. The packet's
+ *       arguments specify the sensor and type.
+ *
+ * ARMCP_PACKET_PWM_SET -
+ *       Set the pwm value / mode of a specified pwm. The packet's
+ *       arguments specify the sensor, type and value.
+ *
+ * ARMCP_PACKET_FREQUENCY_SET -
+ *       Set the frequency of a specified PLL. The packet's arguments specify
+ *       the PLL and the desired frequency. The actual frequency in the device
+ *       might differ from the requested frequency.
+ *
+ * ARMCP_PACKET_FREQUENCY_GET -
+ *       Fetch the frequency of a specified PLL. The packet's arguments specify
+ *       the PLL.
+ *
+ * ARMCP_PACKET_LED_SET -
+ *       Set the state of a specified led. The packet's arguments
+ *       specify the led and the desired state.
+ *
+ * ARMCP_PACKET_I2C_WR -
+ *       Write 32-bit value to I2C device. The packet's arguments specify the
+ *       I2C bus, address and value.
+ *
+ * ARMCP_PACKET_I2C_RD -
+ *       Read 32-bit value from I2C device. The packet's arguments specify the
+ *       I2C bus and address.
+ *
+ * ARMCP_PACKET_INFO_GET -
+ *       Fetch information from the device as specified in the packet's
+ *       structure. KMD passes the max size it allows the ArmCP to write to
+ *       the structure, to prevent data corruption in case of mismatched
+ *       KMD/FW versions.
+ *
+ * ARMCP_PACKET_FLASH_PROGRAM_REMOVED - this packet was removed
+ *
+ * ARMCP_PACKET_UNMASK_RAZWI_IRQ -
+ *       Unmask the given IRQ. The IRQ number is specified in the value field.
+ *       The packet is sent after receiving an interrupt and printing its
+ *       relevant information.
+ *
+ * ARMCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY -
+ *       Unmask the given IRQs. The IRQs numbers are specified in an array right
+ *       after the armcp_packet structure, where its first element is the array
+ *       length. The packet is sent after a soft reset was done in order to
+ *       handle any interrupts that were sent during the reset process.
+ *
+ * ARMCP_PACKET_TEST -
+ *       Test packet for ArmCP connectivity. The CPU will put the fence value
+ *       in the result field.
+ *
+ * ARMCP_PACKET_FREQUENCY_CURR_GET -
+ *       Fetch the current frequency of a specified PLL. The packet's arguments
+ *       specify the PLL.
+ *
+ * ARMCP_PACKET_MAX_POWER_GET -
+ *       Fetch the maximal power of the device.
+ *
+ * ARMCP_PACKET_MAX_POWER_SET -
+ *       Set the maximal power of the device. The packet's arguments specify
+ *       the power.
+ *
+ * ARMCP_PACKET_EEPROM_DATA_GET -
+ *       Get EEPROM data from the ArmCP kernel. The buffer is specified in the
+ *       addr field. The CPU will put the returned data size in the result
+ *       field. In addition, KMD passes the max size it allows the ArmCP to
+ *       write to the structure, to prevent data corruption in case of
+ *       mismatched KMD/FW versions.
+ *
+ */
+
+enum armcp_packet_id {
+	ARMCP_PACKET_DISABLE_PCI_ACCESS = 1,	/* internal */
+	ARMCP_PACKET_ENABLE_PCI_ACCESS,		/* internal */
+	ARMCP_PACKET_TEMPERATURE_GET,		/* sysfs */
+	ARMCP_PACKET_VOLTAGE_GET,		/* sysfs */
+	ARMCP_PACKET_CURRENT_GET,		/* sysfs */
+	ARMCP_PACKET_FAN_SPEED_GET,		/* sysfs */
+	ARMCP_PACKET_PWM_GET,			/* sysfs */
+	ARMCP_PACKET_PWM_SET,			/* sysfs */
+	ARMCP_PACKET_FREQUENCY_SET,		/* sysfs */
+	ARMCP_PACKET_FREQUENCY_GET,		/* sysfs */
+	ARMCP_PACKET_LED_SET,			/* debugfs */
+	ARMCP_PACKET_I2C_WR,			/* debugfs */
+	ARMCP_PACKET_I2C_RD,			/* debugfs */
+	ARMCP_PACKET_INFO_GET,			/* IOCTL */
+	ARMCP_PACKET_FLASH_PROGRAM_REMOVED,
+	ARMCP_PACKET_UNMASK_RAZWI_IRQ,		/* internal */
+	ARMCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY,	/* internal */
+	ARMCP_PACKET_TEST,			/* internal */
+	ARMCP_PACKET_FREQUENCY_CURR_GET,	/* sysfs */
+	ARMCP_PACKET_MAX_POWER_GET,		/* sysfs */
+	ARMCP_PACKET_MAX_POWER_SET,		/* sysfs */
+	ARMCP_PACKET_EEPROM_DATA_GET,		/* sysfs */
+};
+
+#define ARMCP_PACKET_FENCE_VAL	0xFE8CE7A5
+
+#define ARMCP_PKT_CTL_RC_SHIFT		12
+#define ARMCP_PKT_CTL_RC_MASK		0x0000F000
+
+#define ARMCP_PKT_CTL_OPCODE_SHIFT	16
+#define ARMCP_PKT_CTL_OPCODE_MASK	0x1FFF0000
+
+struct armcp_packet {
+	union {
+		__le64 value;	/* For SET packets */
+		__le64 result;	/* For GET packets */
+		__le64 addr;	/* For PQ */
+	};
+
+	__le32 ctl;
+
+	__le32 fence;		/* Signal to KMD that message is completed */
+
+	union {
+		struct {/* For temperature/current/voltage/fan/pwm get/set */
+			__le16 sensor_index;
+			__le16 type;
+		};
+
+		struct {	/* For I2C read/write */
+			__u8 i2c_bus;
+			__u8 i2c_addr;
+			__u8 i2c_reg;
+			__u8 pad; /* unused */
+		};
+
+		/* For frequency get/set */
+		__le32 pll_index;
+
+		/* For led set */
+		__le32 led_index;
+
+		/* For get Armcp info/EEPROM data */
+		__le32 data_max_size;
+	};
+};
+
+struct armcp_unmask_irq_arr_packet {
+	struct armcp_packet armcp_pkt;
+	__le32 length;
+	__le32 irqs[0];
+};
+
+enum armcp_packet_rc {
+	armcp_packet_success,
+	armcp_packet_invalid,
+	armcp_packet_fault
+};
+
+enum armcp_temp_type {
+	armcp_temp_input,
+	armcp_temp_max = 6,
+	armcp_temp_max_hyst,
+	armcp_temp_crit,
+	armcp_temp_crit_hyst
+};
+
+enum armcp_in_attributes {
+	armcp_in_input,
+	armcp_in_min,
+	armcp_in_max
+};
+
+enum armcp_curr_attributes {
+	armcp_curr_input,
+	armcp_curr_min,
+	armcp_curr_max
+};
+
+enum armcp_fan_attributes {
+	armcp_fan_input,
+	armcp_fan_min = 2,
+	armcp_fan_max
+};
+
+enum armcp_pwm_attributes {
+	armcp_pwm_input,
+	armcp_pwm_enable
+};
+
+/* Event Queue Packets */
+
+struct eq_generic_event {
+	__le64 data[7];
+};
+
 /*
  * ArmCP info
  */
 
 #define VERSION_MAX_LEN			128
+#define ARMCP_MAX_SENSORS		128
+
+struct armcp_sensor {
+	__le32 type;
+	__le32 flags;
+};
+
+struct armcp_info {
+	struct armcp_sensor sensors[ARMCP_MAX_SENSORS];
+	__u8 kernel_version[VERSION_MAX_LEN];
+	__le32 reserved[3];
+	__le32 cpld_version;
+	__le32 infineon_version;
+	__u8 fuse_version[VERSION_MAX_LEN];
+	__u8 thermal_version[VERSION_MAX_LEN];
+	__u8 armcp_version[VERSION_MAX_LEN];
+	__le64 dram_size;
+};
 
 #endif /* ARMCP_IF_H */
diff --git a/drivers/misc/habanalabs/include/goya/goya_async_events.h b/drivers/misc/habanalabs/include/goya/goya_async_events.h
new file mode 100644
index 000000000000..497937a17ee9
--- /dev/null
+++ b/drivers/misc/habanalabs/include/goya/goya_async_events.h
@@ -0,0 +1,186 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright 2018 HabanaLabs, Ltd.
+ * All Rights Reserved.
+ *
+ */
+
+#ifndef __GOYA_ASYNC_EVENTS_H_
+#define __GOYA_ASYNC_EVENTS_H_
+
+enum goya_async_event_id {
+	GOYA_ASYNC_EVENT_ID_PCIE_IF = 33,
+	GOYA_ASYNC_EVENT_ID_TPC0_ECC = 36,
+	GOYA_ASYNC_EVENT_ID_TPC1_ECC = 39,
+	GOYA_ASYNC_EVENT_ID_TPC2_ECC = 42,
+	GOYA_ASYNC_EVENT_ID_TPC3_ECC = 45,
+	GOYA_ASYNC_EVENT_ID_TPC4_ECC = 48,
+	GOYA_ASYNC_EVENT_ID_TPC5_ECC = 51,
+	GOYA_ASYNC_EVENT_ID_TPC6_ECC = 54,
+	GOYA_ASYNC_EVENT_ID_TPC7_ECC = 57,
+	GOYA_ASYNC_EVENT_ID_MME_ECC = 60,
+	GOYA_ASYNC_EVENT_ID_MME_ECC_EXT = 61,
+	GOYA_ASYNC_EVENT_ID_MMU_ECC = 63,
+	GOYA_ASYNC_EVENT_ID_DMA_MACRO = 64,
+	GOYA_ASYNC_EVENT_ID_DMA_ECC = 66,
+	GOYA_ASYNC_EVENT_ID_CPU_IF_ECC = 75,
+	GOYA_ASYNC_EVENT_ID_PSOC_MEM = 78,
+	GOYA_ASYNC_EVENT_ID_PSOC_CORESIGHT = 79,
+	GOYA_ASYNC_EVENT_ID_SRAM0 = 81,
+	GOYA_ASYNC_EVENT_ID_SRAM1 = 82,
+	GOYA_ASYNC_EVENT_ID_SRAM2 = 83,
+	GOYA_ASYNC_EVENT_ID_SRAM3 = 84,
+	GOYA_ASYNC_EVENT_ID_SRAM4 = 85,
+	GOYA_ASYNC_EVENT_ID_SRAM5 = 86,
+	GOYA_ASYNC_EVENT_ID_SRAM6 = 87,
+	GOYA_ASYNC_EVENT_ID_SRAM7 = 88,
+	GOYA_ASYNC_EVENT_ID_SRAM8 = 89,
+	GOYA_ASYNC_EVENT_ID_SRAM9 = 90,
+	GOYA_ASYNC_EVENT_ID_SRAM10 = 91,
+	GOYA_ASYNC_EVENT_ID_SRAM11 = 92,
+	GOYA_ASYNC_EVENT_ID_SRAM12 = 93,
+	GOYA_ASYNC_EVENT_ID_SRAM13 = 94,
+	GOYA_ASYNC_EVENT_ID_SRAM14 = 95,
+	GOYA_ASYNC_EVENT_ID_SRAM15 = 96,
+	GOYA_ASYNC_EVENT_ID_SRAM16 = 97,
+	GOYA_ASYNC_EVENT_ID_SRAM17 = 98,
+	GOYA_ASYNC_EVENT_ID_SRAM18 = 99,
+	GOYA_ASYNC_EVENT_ID_SRAM19 = 100,
+	GOYA_ASYNC_EVENT_ID_SRAM20 = 101,
+	GOYA_ASYNC_EVENT_ID_SRAM21 = 102,
+	GOYA_ASYNC_EVENT_ID_SRAM22 = 103,
+	GOYA_ASYNC_EVENT_ID_SRAM23 = 104,
+	GOYA_ASYNC_EVENT_ID_SRAM24 = 105,
+	GOYA_ASYNC_EVENT_ID_SRAM25 = 106,
+	GOYA_ASYNC_EVENT_ID_SRAM26 = 107,
+	GOYA_ASYNC_EVENT_ID_SRAM27 = 108,
+	GOYA_ASYNC_EVENT_ID_SRAM28 = 109,
+	GOYA_ASYNC_EVENT_ID_SRAM29 = 110,
+	GOYA_ASYNC_EVENT_ID_GIC500 = 112,
+	GOYA_ASYNC_EVENT_ID_PCIE_DEC = 115,
+	GOYA_ASYNC_EVENT_ID_TPC0_DEC = 117,
+	GOYA_ASYNC_EVENT_ID_TPC1_DEC = 120,
+	GOYA_ASYNC_EVENT_ID_TPC2_DEC = 123,
+	GOYA_ASYNC_EVENT_ID_TPC3_DEC = 126,
+	GOYA_ASYNC_EVENT_ID_TPC4_DEC = 129,
+	GOYA_ASYNC_EVENT_ID_TPC5_DEC = 132,
+	GOYA_ASYNC_EVENT_ID_TPC6_DEC = 135,
+	GOYA_ASYNC_EVENT_ID_TPC7_DEC = 138,
+	GOYA_ASYNC_EVENT_ID_AXI_ECC = 139,
+	GOYA_ASYNC_EVENT_ID_L2_RAM_ECC = 140,
+	GOYA_ASYNC_EVENT_ID_MME_WACS = 141,
+	GOYA_ASYNC_EVENT_ID_MME_WACSD = 142,
+	GOYA_ASYNC_EVENT_ID_PLL0 = 143,
+	GOYA_ASYNC_EVENT_ID_PLL1 = 144,
+	GOYA_ASYNC_EVENT_ID_PLL3 = 146,
+	GOYA_ASYNC_EVENT_ID_PLL4 = 147,
+	GOYA_ASYNC_EVENT_ID_PLL5 = 148,
+	GOYA_ASYNC_EVENT_ID_PLL6 = 149,
+	GOYA_ASYNC_EVENT_ID_CPU_AXI_SPLITTER = 155,
+	GOYA_ASYNC_EVENT_ID_PSOC_AXI_DEC = 159,
+	GOYA_ASYNC_EVENT_ID_PSOC = 160,
+	GOYA_ASYNC_EVENT_ID_PCIE_FLR = 171,
+	GOYA_ASYNC_EVENT_ID_PCIE_HOT_RESET = 172,
+	GOYA_ASYNC_EVENT_ID_PCIE_QID0_ENG0 = 174,
+	GOYA_ASYNC_EVENT_ID_PCIE_QID0_ENG1 = 175,
+	GOYA_ASYNC_EVENT_ID_PCIE_QID0_ENG2 = 176,
+	GOYA_ASYNC_EVENT_ID_PCIE_QID0_ENG3 = 177,
+	GOYA_ASYNC_EVENT_ID_PCIE_QID1_ENG0 = 178,
+	GOYA_ASYNC_EVENT_ID_PCIE_QID1_ENG1 = 179,
+	GOYA_ASYNC_EVENT_ID_PCIE_QID1_ENG2 = 180,
+	GOYA_ASYNC_EVENT_ID_PCIE_QID1_ENG3 = 181,
+	GOYA_ASYNC_EVENT_ID_PCIE_APB = 182,
+	GOYA_ASYNC_EVENT_ID_PCIE_QDB = 183,
+	GOYA_ASYNC_EVENT_ID_PCIE_BM_D_P_WR = 184,
+	GOYA_ASYNC_EVENT_ID_PCIE_BM_D_RD = 185,
+	GOYA_ASYNC_EVENT_ID_PCIE_BM_U_P_WR = 186,
+	GOYA_ASYNC_EVENT_ID_PCIE_BM_U_RD = 187,
+	GOYA_ASYNC_EVENT_ID_TPC0_BMON_SPMU = 190,
+	GOYA_ASYNC_EVENT_ID_TPC0_KRN_ERR = 191,
+	GOYA_ASYNC_EVENT_ID_TPC1_BMON_SPMU = 200,
+	GOYA_ASYNC_EVENT_ID_TPC1_KRN_ERR = 201,
+	GOYA_ASYNC_EVENT_ID_TPC2_BMON_SPMU = 210,
+	GOYA_ASYNC_EVENT_ID_TPC2_KRN_ERR = 211,
+	GOYA_ASYNC_EVENT_ID_TPC3_BMON_SPMU = 220,
+	GOYA_ASYNC_EVENT_ID_TPC3_KRN_ERR = 221,
+	GOYA_ASYNC_EVENT_ID_TPC4_BMON_SPMU = 230,
+	GOYA_ASYNC_EVENT_ID_TPC4_KRN_ERR = 231,
+	GOYA_ASYNC_EVENT_ID_TPC5_BMON_SPMU = 240,
+	GOYA_ASYNC_EVENT_ID_TPC5_KRN_ERR = 241,
+	GOYA_ASYNC_EVENT_ID_TPC6_BMON_SPMU = 250,
+	GOYA_ASYNC_EVENT_ID_TPC6_KRN_ERR = 251,
+	GOYA_ASYNC_EVENT_ID_TPC7_BMON_SPMU = 260,
+	GOYA_ASYNC_EVENT_ID_TPC7_KRN_ERR = 261,
+	GOYA_ASYNC_EVENT_ID_MMU_SBA_SPMU0 = 270,
+	GOYA_ASYNC_EVENT_ID_MMU_SBA_SPMU1 = 271,
+	GOYA_ASYNC_EVENT_ID_MME_WACS_UP = 272,
+	GOYA_ASYNC_EVENT_ID_MME_WACS_DOWN = 273,
+	GOYA_ASYNC_EVENT_ID_MMU_PAGE_FAULT = 280,
+	GOYA_ASYNC_EVENT_ID_MMU_WR_PERM = 281,
+	GOYA_ASYNC_EVENT_ID_MMU_DBG_BM = 282,
+	GOYA_ASYNC_EVENT_ID_DMA_BM_CH0 = 290,
+	GOYA_ASYNC_EVENT_ID_DMA_BM_CH1 = 291,
+	GOYA_ASYNC_EVENT_ID_DMA_BM_CH2 = 292,
+	GOYA_ASYNC_EVENT_ID_DMA_BM_CH3 = 293,
+	GOYA_ASYNC_EVENT_ID_DMA_BM_CH4 = 294,
+	GOYA_ASYNC_EVENT_ID_DDR0_PHY_DFI = 300,
+	GOYA_ASYNC_EVENT_ID_DDR0_ECC_SCRUB = 301,
+	GOYA_ASYNC_EVENT_ID_DDR0_DB_ECC = 302,
+	GOYA_ASYNC_EVENT_ID_DDR0_SB_ECC = 303,
+	GOYA_ASYNC_EVENT_ID_DDR0_SB_ECC_MC = 304,
+	GOYA_ASYNC_EVENT_ID_DDR0_AXI_RD = 305,
+	GOYA_ASYNC_EVENT_ID_DDR0_AXI_WR = 306,
+	GOYA_ASYNC_EVENT_ID_DDR1_PHY_DFI = 310,
+	GOYA_ASYNC_EVENT_ID_DDR1_ECC_SCRUB = 311,
+	GOYA_ASYNC_EVENT_ID_DDR1_DB_ECC = 312,
+	GOYA_ASYNC_EVENT_ID_DDR1_SB_ECC = 313,
+	GOYA_ASYNC_EVENT_ID_DDR1_SB_ECC_MC = 314,
+	GOYA_ASYNC_EVENT_ID_DDR1_AXI_RD = 315,
+	GOYA_ASYNC_EVENT_ID_DDR1_AXI_WR = 316,
+	GOYA_ASYNC_EVENT_ID_CPU_BMON = 320,
+	GOYA_ASYNC_EVENT_ID_TS_EAST = 322,
+	GOYA_ASYNC_EVENT_ID_TS_WEST = 323,
+	GOYA_ASYNC_EVENT_ID_TS_NORTH = 324,
+	GOYA_ASYNC_EVENT_ID_PSOC_GPIO_U16_0 = 330,
+	GOYA_ASYNC_EVENT_ID_PSOC_GPIO_U16_1 = 331,
+	GOYA_ASYNC_EVENT_ID_PSOC_GPIO_U16_2 = 332,
+	GOYA_ASYNC_EVENT_ID_PSOC_GPIO_05_SW_RESET = 356,
+	GOYA_ASYNC_EVENT_ID_PSOC_GPIO_10_VRHOT_ICRIT = 361,
+	GOYA_ASYNC_EVENT_ID_TPC0_CMDQ = 430,
+	GOYA_ASYNC_EVENT_ID_TPC1_CMDQ = 431,
+	GOYA_ASYNC_EVENT_ID_TPC2_CMDQ = 432,
+	GOYA_ASYNC_EVENT_ID_TPC3_CMDQ = 433,
+	GOYA_ASYNC_EVENT_ID_TPC4_CMDQ = 434,
+	GOYA_ASYNC_EVENT_ID_TPC5_CMDQ = 435,
+	GOYA_ASYNC_EVENT_ID_TPC6_CMDQ = 436,
+	GOYA_ASYNC_EVENT_ID_TPC7_CMDQ = 437,
+	GOYA_ASYNC_EVENT_ID_TPC0_QM = 438,
+	GOYA_ASYNC_EVENT_ID_TPC1_QM = 439,
+	GOYA_ASYNC_EVENT_ID_TPC2_QM = 440,
+	GOYA_ASYNC_EVENT_ID_TPC3_QM = 441,
+	GOYA_ASYNC_EVENT_ID_TPC4_QM = 442,
+	GOYA_ASYNC_EVENT_ID_TPC5_QM = 443,
+	GOYA_ASYNC_EVENT_ID_TPC6_QM = 444,
+	GOYA_ASYNC_EVENT_ID_TPC7_QM = 445,
+	GOYA_ASYNC_EVENT_ID_MME_QM = 447,
+	GOYA_ASYNC_EVENT_ID_MME_CMDQ = 448,
+	GOYA_ASYNC_EVENT_ID_DMA0_QM = 449,
+	GOYA_ASYNC_EVENT_ID_DMA1_QM = 450,
+	GOYA_ASYNC_EVENT_ID_DMA2_QM = 451,
+	GOYA_ASYNC_EVENT_ID_DMA3_QM = 452,
+	GOYA_ASYNC_EVENT_ID_DMA4_QM = 453,
+	GOYA_ASYNC_EVENT_ID_DMA_ON_HBW = 454,
+	GOYA_ASYNC_EVENT_ID_DMA0_CH = 455,
+	GOYA_ASYNC_EVENT_ID_DMA1_CH = 456,
+	GOYA_ASYNC_EVENT_ID_DMA2_CH = 457,
+	GOYA_ASYNC_EVENT_ID_DMA3_CH = 458,
+	GOYA_ASYNC_EVENT_ID_DMA4_CH = 459,
+	GOYA_ASYNC_EVENT_ID_PI_UPDATE = 484,
+	GOYA_ASYNC_EVENT_ID_HALT_MACHINE = 485,
+	GOYA_ASYNC_EVENT_ID_INTS_REGISTER = 486,
+	GOYA_ASYNC_EVENT_ID_SOFT_RESET = 487,
+	GOYA_ASYNC_EVENT_ID_LAST_VALID_ID = 1023,
+	GOYA_ASYNC_EVENT_ID_SIZE
+};
+
+#endif /* __GOYA_ASYNC_EVENTS_H_ */
diff --git a/drivers/misc/habanalabs/include/goya/goya_packets.h b/drivers/misc/habanalabs/include/goya/goya_packets.h
new file mode 100644
index 000000000000..a14407b975e4
--- /dev/null
+++ b/drivers/misc/habanalabs/include/goya/goya_packets.h
@@ -0,0 +1,129 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright 2017-2018 HabanaLabs, Ltd.
+ * All Rights Reserved.
+ *
+ */
+
+#ifndef GOYA_PACKETS_H
+#define GOYA_PACKETS_H
+
+#include <linux/types.h>
+
+#define PACKET_HEADER_PACKET_ID_SHIFT		56
+#define PACKET_HEADER_PACKET_ID_MASK		0x1F00000000000000ull
+
+enum packet_id {
+	PACKET_WREG_32 = 0x1,
+	PACKET_WREG_BULK = 0x2,
+	PACKET_MSG_LONG = 0x3,
+	PACKET_MSG_SHORT = 0x4,
+	PACKET_CP_DMA = 0x5,
+	PACKET_MSG_PROT = 0x7,
+	PACKET_FENCE = 0x8,
+	PACKET_LIN_DMA = 0x9,
+	PACKET_NOP = 0xA,
+	PACKET_STOP = 0xB,
+	MAX_PACKET_ID = (PACKET_HEADER_PACKET_ID_MASK >>
+				PACKET_HEADER_PACKET_ID_SHIFT) + 1
+};
+
+enum goya_dma_direction {
+	DMA_HOST_TO_DRAM,
+	DMA_HOST_TO_SRAM,
+	DMA_DRAM_TO_SRAM,
+	DMA_SRAM_TO_DRAM,
+	DMA_SRAM_TO_HOST,
+	DMA_DRAM_TO_HOST,
+	DMA_DRAM_TO_DRAM,
+	DMA_SRAM_TO_SRAM,
+	DMA_ENUM_MAX
+};
+
+#define GOYA_PKT_CTL_OPCODE_SHIFT	24
+#define GOYA_PKT_CTL_OPCODE_MASK	0x1F000000
+
+#define GOYA_PKT_CTL_EB_SHIFT		29
+#define GOYA_PKT_CTL_EB_MASK		0x20000000
+
+#define GOYA_PKT_CTL_RB_SHIFT		30
+#define GOYA_PKT_CTL_RB_MASK		0x40000000
+
+#define GOYA_PKT_CTL_MB_SHIFT		31
+#define GOYA_PKT_CTL_MB_MASK		0x80000000
+
+struct packet_nop {
+	__le32 reserved;
+	__le32 ctl;
+};
+
+struct packet_stop {
+	__le32 reserved;
+	__le32 ctl;
+};
+
+#define GOYA_PKT_WREG32_CTL_REG_OFFSET_SHIFT	0
+#define GOYA_PKT_WREG32_CTL_REG_OFFSET_MASK	0x0000FFFF
+
+struct packet_wreg32 {
+	__le32 value;
+	__le32 ctl;
+};
+
+struct packet_wreg_bulk {
+	__le32 size64;
+	__le32 ctl;
+	__le64 values[0]; /* data starts here */
+};
+
+struct packet_msg_long {
+	__le32 value;
+	__le32 ctl;
+	__le64 addr;
+};
+
+struct packet_msg_short {
+	__le32 value;
+	__le32 ctl;
+};
+
+struct packet_msg_prot {
+	__le32 value;
+	__le32 ctl;
+	__le64 addr;
+};
+
+struct packet_fence {
+	__le32 cfg;
+	__le32 ctl;
+};
+
+#define GOYA_PKT_LIN_DMA_CTL_WO_SHIFT		0
+#define GOYA_PKT_LIN_DMA_CTL_WO_MASK		0x00000001
+
+#define GOYA_PKT_LIN_DMA_CTL_RDCOMP_SHIFT	1
+#define GOYA_PKT_LIN_DMA_CTL_RDCOMP_MASK	0x00000002
+
+#define GOYA_PKT_LIN_DMA_CTL_WRCOMP_SHIFT	2
+#define GOYA_PKT_LIN_DMA_CTL_WRCOMP_MASK	0x00000004
+
+#define GOYA_PKT_LIN_DMA_CTL_MEMSET_SHIFT	6
+#define GOYA_PKT_LIN_DMA_CTL_MEMSET_MASK	0x00000040
+
+#define GOYA_PKT_LIN_DMA_CTL_DMA_DIR_SHIFT	20
+#define GOYA_PKT_LIN_DMA_CTL_DMA_DIR_MASK	0x00700000
+
+struct packet_lin_dma {
+	__le32 tsize;
+	__le32 ctl;
+	__le64 src_addr;
+	__le64 dst_addr;
+};
+
+struct packet_cp_dma {
+	__le32 tsize;
+	__le32 ctl;
+	__le64 src_addr;
+};
+
+#endif /* GOYA_PACKETS_H */
diff --git a/drivers/misc/habanalabs/include/qman_if.h b/drivers/misc/habanalabs/include/qman_if.h
new file mode 100644
index 000000000000..bf59bbe27fdc
--- /dev/null
+++ b/drivers/misc/habanalabs/include/qman_if.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright 2016-2018 HabanaLabs, Ltd.
+ * All Rights Reserved.
+ *
+ */
+
+#ifndef QMAN_IF_H
+#define QMAN_IF_H
+
+#include <linux/types.h>
+
+/*
+ * PRIMARY QUEUE
+ */
+
+struct hl_bd {
+	__le64	ptr;
+	__le32	len;
+	__le32	ctl;
+};
+
+#define HL_BD_SIZE			sizeof(struct hl_bd)
+
+/*
+ * BD_CTL_REPEAT_VALID tells the CP whether the repeat field in the BD CTL is
+ * valid. 1 means the repeat field is valid, 0 means not-valid,
+ * i.e. repeat == 1
+ */
+#define BD_CTL_REPEAT_VALID_SHIFT	24
+#define BD_CTL_REPEAT_VALID_MASK	0x01000000
+
+#define BD_CTL_SHADOW_INDEX_SHIFT	0
+#define BD_CTL_SHADOW_INDEX_MASK	0x00000FFF
+
+/*
+ * COMPLETION QUEUE
+ */
+
+struct hl_cq_entry {
+	__le32	data;
+};
+
+#define HL_CQ_ENTRY_SIZE		sizeof(struct hl_cq_entry)
+
+#define CQ_ENTRY_READY_SHIFT			31
+#define CQ_ENTRY_READY_MASK			0x80000000
+
+#define CQ_ENTRY_SHADOW_INDEX_VALID_SHIFT	30
+#define CQ_ENTRY_SHADOW_INDEX_VALID_MASK	0x40000000
+
+#define CQ_ENTRY_SHADOW_INDEX_SHIFT		BD_CTL_SHADOW_INDEX_SHIFT
+#define CQ_ENTRY_SHADOW_INDEX_MASK		BD_CTL_SHADOW_INDEX_MASK
+
+
+#endif /* QMAN_IF_H */
diff --git a/drivers/misc/habanalabs/irq.c b/drivers/misc/habanalabs/irq.c
new file mode 100644
index 000000000000..6b7d35f6af08
--- /dev/null
+++ b/drivers/misc/habanalabs/irq.c
@@ -0,0 +1,149 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright 2016-2019 HabanaLabs, Ltd.
+ * All Rights Reserved.
+ */
+
+#include "habanalabs.h"
+
+#include <linux/irqreturn.h>
+
+/*
+ * hl_cq_inc_ptr - increment ci or pi of cq
+ *
+ * @ptr: the current ci or pi value of the completion queue
+ *
+ * Increment ptr by 1. If it reaches the number of completion queue
+ * entries, set it to 0
+ */
+inline u32 hl_cq_inc_ptr(u32 ptr)
+{
+	ptr++;
+	if (unlikely(ptr == HL_CQ_LENGTH))
+		ptr = 0;
+	return ptr;
+}
+
+/*
+ * hl_irq_handler_cq - irq handler for completion queue
+ *
+ * @irq: irq number
+ * @arg: pointer to completion queue structure
+ *
+ */
+irqreturn_t hl_irq_handler_cq(int irq, void *arg)
+{
+	struct hl_cq *cq = arg;
+	struct hl_device *hdev = cq->hdev;
+	struct hl_hw_queue *queue;
+	struct hl_cs_job *job;
+	bool shadow_index_valid;
+	u16 shadow_index;
+	u32 *cq_entry;
+	u32 *cq_base;
+
+	if (hdev->disabled) {
+		dev_dbg(hdev->dev,
+			"Device disabled but received IRQ %d for CQ %d\n",
+			irq, cq->hw_queue_id);
+		return IRQ_HANDLED;
+	}
+
+	cq_base = (u32 *) (uintptr_t) cq->kernel_address;
+
+	while (1) {
+		bool entry_ready = ((cq_base[cq->ci] & CQ_ENTRY_READY_MASK)
+						>> CQ_ENTRY_READY_SHIFT);
+
+		if (!entry_ready)
+			break;
+
+		cq_entry = (u32 *) &cq_base[cq->ci];
+
+		/*
+		 * Make sure we read CQ entry contents after we've
+		 * checked the ownership bit.
+		 */
+		dma_rmb();
+
+		shadow_index_valid =
+			((*cq_entry & CQ_ENTRY_SHADOW_INDEX_VALID_MASK)
+					>> CQ_ENTRY_SHADOW_INDEX_VALID_SHIFT);
+
+		shadow_index = (u16)
+			((*cq_entry & CQ_ENTRY_SHADOW_INDEX_MASK)
+					>> CQ_ENTRY_SHADOW_INDEX_SHIFT);
+
+		queue = &hdev->kernel_queues[cq->hw_queue_id];
+
+		if ((shadow_index_valid) && (!hdev->disabled)) {
+			job = queue->shadow_queue[hl_pi_2_offset(shadow_index)];
+			queue_work(hdev->cq_wq, &job->finish_work);
+		}
+
+		/*
+		 * Update ci of the context's queue. There is no
+		 * need to protect it with spinlock because this update is
+		 * done only inside IRQ and there is a different IRQ per
+		 * queue
+		 */
+		queue->ci = hl_queue_inc_ptr(queue->ci);
+
+		/* Clear CQ entry ready bit */
+		cq_base[cq->ci] &= ~CQ_ENTRY_READY_MASK;
+
+		cq->ci = hl_cq_inc_ptr(cq->ci);
+
+		/* Increment free slots */
+		atomic_inc(&cq->free_slots_cnt);
+	}
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * hl_cq_init - main initialization function for an cq object
+ *
+ * @hdev: pointer to device structure
+ * @q: pointer to cq structure
+ * @hw_queue_id: The H/W queue ID this completion queue belongs to
+ *
+ * Allocate dma-able memory for the completion queue and initialize fields
+ * Returns 0 on success
+ */
+int hl_cq_init(struct hl_device *hdev, struct hl_cq *q, u32 hw_queue_id)
+{
+	void *p;
+
+	BUILD_BUG_ON(HL_CQ_SIZE_IN_BYTES > HL_PAGE_SIZE);
+
+	p = hdev->asic_funcs->dma_alloc_coherent(hdev, HL_CQ_SIZE_IN_BYTES,
+				&q->bus_address, GFP_KERNEL | __GFP_ZERO);
+	if (!p)
+		return -ENOMEM;
+
+	q->hdev = hdev;
+	q->kernel_address = (u64) (uintptr_t) p;
+	q->hw_queue_id = hw_queue_id;
+	q->ci = 0;
+	q->pi = 0;
+
+	atomic_set(&q->free_slots_cnt, HL_CQ_LENGTH);
+
+	return 0;
+}
+
+/*
+ * hl_cq_fini - destroy completion queue
+ *
+ * @hdev: pointer to device structure
+ * @q: pointer to cq structure
+ *
+ * Free the completion queue memory
+ */
+void hl_cq_fini(struct hl_device *hdev, struct hl_cq *q)
+{
+	hdev->asic_funcs->dma_free_coherent(hdev, HL_CQ_SIZE_IN_BYTES,
+			(void *) (uintptr_t) q->kernel_address, q->bus_address);
+}
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index a8edfd3e9c95..756266cf0416 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -17,6 +17,35 @@
  */
 #define GOYA_KMD_SRAM_RESERVED_SIZE_FROM_START	0x8000	/* 32KB */
 
+/*
+ * Queue Numbering
+ *
+ * The external queues (DMA channels + CPU) MUST be before the internal queues
+ * and each group (DMA channels + CPU and internal) must be contiguous inside
+ * itself but there can be a gap between the two groups (although not
+ * recommended)
+ */
+
+enum goya_queue_id {
+	GOYA_QUEUE_ID_DMA_0 = 0,
+	GOYA_QUEUE_ID_DMA_1,
+	GOYA_QUEUE_ID_DMA_2,
+	GOYA_QUEUE_ID_DMA_3,
+	GOYA_QUEUE_ID_DMA_4,
+	GOYA_QUEUE_ID_CPU_PQ,
+	GOYA_QUEUE_ID_MME,
+	GOYA_QUEUE_ID_TPC0,
+	GOYA_QUEUE_ID_TPC1,
+	GOYA_QUEUE_ID_TPC2,
+	GOYA_QUEUE_ID_TPC3,
+	GOYA_QUEUE_ID_TPC4,
+	GOYA_QUEUE_ID_TPC5,
+	GOYA_QUEUE_ID_TPC6,
+	GOYA_QUEUE_ID_TPC7,
+	GOYA_QUEUE_ID_SIZE
+};
+
+
 /* Opcode to create a new command buffer */
 #define HL_CB_OP_CREATE		0
 /* Opcode to destroy previously created command buffer */
-- 
cgit v1.2.3-59-g8ed1b


From eff6f4a0e70b7bcf4674f471a768860a74e638a6 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Sat, 16 Feb 2019 00:39:21 +0200
Subject: habanalabs: add command submission module

This patch adds the main flow for the user to submit work to the device.

Each work is described by a command submission object (CS). The CS contains
3 arrays of command buffers: One for execution, and two for context-switch
(store and restore).

For each CB, the user specifies on which queue to put that CB. In case of
an internal queue, the entry doesn't contain a pointer to the CB but the
address in the on-chip memory that the CB resides at.

The driver parses some of the CBs to enforce security restrictions.

The user receives a sequence number that represents the CS object. The user
can then query the driver regarding the status of the CS, using that
sequence number.

In case the CS doesn't finish before the timeout expires, the driver will
perform a soft-reset of the device.

Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/habanalabs/Makefile             |    3 +-
 drivers/misc/habanalabs/command_submission.c |  766 ++++++++++++++++++
 drivers/misc/habanalabs/context.c            |   52 +-
 drivers/misc/habanalabs/device.c             |   16 +
 drivers/misc/habanalabs/goya/goya.c          | 1126 ++++++++++++++++++++++++++
 drivers/misc/habanalabs/habanalabs.h         |  275 +++++++
 drivers/misc/habanalabs/habanalabs_drv.c     |   20 +
 drivers/misc/habanalabs/habanalabs_ioctl.c   |    4 +-
 drivers/misc/habanalabs/hw_queue.c           |  232 ++++++
 drivers/misc/habanalabs/memory.c             |  198 +++++
 include/uapi/misc/habanalabs.h               |  158 +++-
 11 files changed, 2843 insertions(+), 7 deletions(-)
 create mode 100644 drivers/misc/habanalabs/command_submission.c
 create mode 100644 drivers/misc/habanalabs/memory.c

(limited to 'include/uapi')

diff --git a/drivers/misc/habanalabs/Makefile b/drivers/misc/habanalabs/Makefile
index b5607233d216..d2fd0e18b1eb 100644
--- a/drivers/misc/habanalabs/Makefile
+++ b/drivers/misc/habanalabs/Makefile
@@ -5,7 +5,8 @@
 obj-m	:= habanalabs.o
 
 habanalabs-y := habanalabs_drv.o device.o context.o asid.o habanalabs_ioctl.o \
-		command_buffer.o hw_queue.o irq.o sysfs.o hwmon.o
+		command_buffer.o hw_queue.o irq.o sysfs.o hwmon.o memory.o \
+		command_submission.o
 
 include $(src)/goya/Makefile
 habanalabs-y += $(HL_GOYA_FILES)
diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c
new file mode 100644
index 000000000000..ae68b97e428d
--- /dev/null
+++ b/drivers/misc/habanalabs/command_submission.c
@@ -0,0 +1,766 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright 2016-2019 HabanaLabs, Ltd.
+ * All Rights Reserved.
+ */
+
+#include <uapi/misc/habanalabs.h>
+#include "habanalabs.h"
+
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+
+static void job_wq_completion(struct work_struct *work);
+static long _hl_cs_wait_ioctl(struct hl_device *hdev,
+		struct hl_ctx *ctx, u64 timeout_us, u64 seq);
+static void cs_do_release(struct kref *ref);
+
+static const char *hl_fence_get_driver_name(struct dma_fence *fence)
+{
+	return "HabanaLabs";
+}
+
+static const char *hl_fence_get_timeline_name(struct dma_fence *fence)
+{
+	struct hl_dma_fence *hl_fence =
+		container_of(fence, struct hl_dma_fence, base_fence);
+
+	return dev_name(hl_fence->hdev->dev);
+}
+
+static bool hl_fence_enable_signaling(struct dma_fence *fence)
+{
+	return true;
+}
+
+static void hl_fence_release(struct dma_fence *fence)
+{
+	struct hl_dma_fence *hl_fence =
+		container_of(fence, struct hl_dma_fence, base_fence);
+
+	kfree_rcu(hl_fence, base_fence.rcu);
+}
+
+static const struct dma_fence_ops hl_fence_ops = {
+	.get_driver_name = hl_fence_get_driver_name,
+	.get_timeline_name = hl_fence_get_timeline_name,
+	.enable_signaling = hl_fence_enable_signaling,
+	.wait = dma_fence_default_wait,
+	.release = hl_fence_release
+};
+
+static void cs_get(struct hl_cs *cs)
+{
+	kref_get(&cs->refcount);
+}
+
+static int cs_get_unless_zero(struct hl_cs *cs)
+{
+	return kref_get_unless_zero(&cs->refcount);
+}
+
+static void cs_put(struct hl_cs *cs)
+{
+	kref_put(&cs->refcount, cs_do_release);
+}
+
+/*
+ * cs_parser - parse the user command submission
+ *
+ * @hpriv	: pointer to the private data of the fd
+ * @job        : pointer to the job that holds the command submission info
+ *
+ * The function parses the command submission of the user. It calls the
+ * ASIC specific parser, which returns a list of memory blocks to send
+ * to the device as different command buffers
+ *
+ */
+static int cs_parser(struct hl_fpriv *hpriv, struct hl_cs_job *job)
+{
+	struct hl_device *hdev = hpriv->hdev;
+	struct hl_cs_parser parser;
+	int rc;
+
+	parser.ctx_id = job->cs->ctx->asid;
+	parser.cs_sequence = job->cs->sequence;
+	parser.job_id = job->id;
+
+	parser.hw_queue_id = job->hw_queue_id;
+	parser.job_userptr_list = &job->userptr_list;
+	parser.patched_cb = NULL;
+	parser.user_cb = job->user_cb;
+	parser.user_cb_size = job->user_cb_size;
+	parser.ext_queue = job->ext_queue;
+	job->patched_cb = NULL;
+	parser.use_virt_addr = hdev->mmu_enable;
+
+	rc = hdev->asic_funcs->cs_parser(hdev, &parser);
+	if (job->ext_queue) {
+		if (!rc) {
+			job->patched_cb = parser.patched_cb;
+			job->job_cb_size = parser.patched_cb_size;
+
+			spin_lock(&job->patched_cb->lock);
+			job->patched_cb->cs_cnt++;
+			spin_unlock(&job->patched_cb->lock);
+		}
+
+		/*
+		 * Whether the parsing worked or not, we don't need the
+		 * original CB anymore because it was already parsed and
+		 * won't be accessed again for this CS
+		 */
+		spin_lock(&job->user_cb->lock);
+		job->user_cb->cs_cnt--;
+		spin_unlock(&job->user_cb->lock);
+		hl_cb_put(job->user_cb);
+		job->user_cb = NULL;
+	}
+
+	return rc;
+}
+
+static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
+{
+	struct hl_cs *cs = job->cs;
+
+	if (job->ext_queue) {
+		hl_userptr_delete_list(hdev, &job->userptr_list);
+
+		/*
+		 * We might arrive here from rollback and patched CB wasn't
+		 * created, so we need to check it's not NULL
+		 */
+		if (job->patched_cb) {
+			spin_lock(&job->patched_cb->lock);
+			job->patched_cb->cs_cnt--;
+			spin_unlock(&job->patched_cb->lock);
+
+			hl_cb_put(job->patched_cb);
+		}
+	}
+
+	/*
+	 * This is the only place where there can be multiple threads
+	 * modifying the list at the same time
+	 */
+	spin_lock(&cs->job_lock);
+	list_del(&job->cs_node);
+	spin_unlock(&cs->job_lock);
+
+	if (job->ext_queue)
+		cs_put(cs);
+
+	kfree(job);
+}
+
+static void cs_do_release(struct kref *ref)
+{
+	struct hl_cs *cs = container_of(ref, struct hl_cs,
+						refcount);
+	struct hl_device *hdev = cs->ctx->hdev;
+	struct hl_cs_job *job, *tmp;
+
+	cs->completed = true;
+
+	/*
+	 * Although if we reached here it means that all external jobs have
+	 * finished, because each one of them took refcnt to CS, we still
+	 * need to go over the internal jobs and free them. Otherwise, we
+	 * will have leaked memory and what's worse, the CS object (and
+	 * potentially the CTX object) could be released, while the JOB
+	 * still holds a pointer to them (but no reference).
+	 */
+	list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
+		free_job(hdev, job);
+
+	/* We also need to update CI for internal queues */
+	if (cs->submitted) {
+		hl_int_hw_queue_update_ci(cs);
+
+		spin_lock(&hdev->hw_queues_mirror_lock);
+		/* remove CS from hw_queues mirror list */
+		list_del_init(&cs->mirror_node);
+		spin_unlock(&hdev->hw_queues_mirror_lock);
+
+		/*
+		 * Don't cancel TDR in case this CS was timedout because we
+		 * might be running from the TDR context
+		 */
+		if ((!cs->timedout) &&
+			(hdev->timeout_jiffies != MAX_SCHEDULE_TIMEOUT)) {
+			struct hl_cs *next;
+
+			if (cs->tdr_active)
+				cancel_delayed_work_sync(&cs->work_tdr);
+
+			spin_lock(&hdev->hw_queues_mirror_lock);
+
+			/* queue TDR for next CS */
+			next = list_first_entry_or_null(
+					&hdev->hw_queues_mirror_list,
+					struct hl_cs, mirror_node);
+
+			if ((next) && (!next->tdr_active)) {
+				next->tdr_active = true;
+				schedule_delayed_work(&next->work_tdr,
+							hdev->timeout_jiffies);
+			}
+
+			spin_unlock(&hdev->hw_queues_mirror_lock);
+		}
+	}
+
+	hl_ctx_put(cs->ctx);
+
+	if (cs->timedout)
+		dma_fence_set_error(cs->fence, -ETIMEDOUT);
+	else if (cs->aborted)
+		dma_fence_set_error(cs->fence, -EIO);
+
+	dma_fence_signal(cs->fence);
+	dma_fence_put(cs->fence);
+
+	kfree(cs);
+}
+
+static void cs_timedout(struct work_struct *work)
+{
+	struct hl_device *hdev;
+	int ctx_asid, rc;
+	struct hl_cs *cs = container_of(work, struct hl_cs,
+						 work_tdr.work);
+	rc = cs_get_unless_zero(cs);
+	if (!rc)
+		return;
+
+	if ((!cs->submitted) || (cs->completed)) {
+		cs_put(cs);
+		return;
+	}
+
+	/* Mark the CS is timed out so we won't try to cancel its TDR */
+	cs->timedout = true;
+
+	hdev = cs->ctx->hdev;
+	ctx_asid = cs->ctx->asid;
+
+	/* TODO: add information about last signaled seq and last emitted seq */
+	dev_err(hdev->dev, "CS %d.%llu got stuck!\n", ctx_asid, cs->sequence);
+
+	cs_put(cs);
+
+	if (hdev->reset_on_lockup)
+		hl_device_reset(hdev, false, false);
+}
+
+static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
+			struct hl_cs **cs_new)
+{
+	struct hl_dma_fence *fence;
+	struct dma_fence *other = NULL;
+	struct hl_cs *cs;
+	int rc;
+
+	cs = kzalloc(sizeof(*cs), GFP_ATOMIC);
+	if (!cs)
+		return -ENOMEM;
+
+	cs->ctx = ctx;
+	cs->submitted = false;
+	cs->completed = false;
+	INIT_LIST_HEAD(&cs->job_list);
+	INIT_DELAYED_WORK(&cs->work_tdr, cs_timedout);
+	kref_init(&cs->refcount);
+	spin_lock_init(&cs->job_lock);
+
+	fence = kmalloc(sizeof(*fence), GFP_ATOMIC);
+	if (!fence) {
+		rc = -ENOMEM;
+		goto free_cs;
+	}
+
+	fence->hdev = hdev;
+	spin_lock_init(&fence->lock);
+	cs->fence = &fence->base_fence;
+
+	spin_lock(&ctx->cs_lock);
+
+	fence->cs_seq = ctx->cs_sequence;
+	other = ctx->cs_pending[fence->cs_seq & (HL_MAX_PENDING_CS - 1)];
+	if ((other) && (!dma_fence_is_signaled(other))) {
+		spin_unlock(&ctx->cs_lock);
+		rc = -EAGAIN;
+		goto free_fence;
+	}
+
+	dma_fence_init(&fence->base_fence, &hl_fence_ops, &fence->lock,
+			ctx->asid, ctx->cs_sequence);
+
+	cs->sequence = fence->cs_seq;
+
+	ctx->cs_pending[fence->cs_seq & (HL_MAX_PENDING_CS - 1)] =
+							&fence->base_fence;
+	ctx->cs_sequence++;
+
+	dma_fence_get(&fence->base_fence);
+
+	dma_fence_put(other);
+
+	spin_unlock(&ctx->cs_lock);
+
+	*cs_new = cs;
+
+	return 0;
+
+free_fence:
+	kfree(fence);
+free_cs:
+	kfree(cs);
+	return rc;
+}
+
+static void cs_rollback(struct hl_device *hdev, struct hl_cs *cs)
+{
+	struct hl_cs_job *job, *tmp;
+
+	list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
+		free_job(hdev, job);
+}
+
+void hl_cs_rollback_all(struct hl_device *hdev)
+{
+	struct hl_cs *cs, *tmp;
+
+	/* flush all completions */
+	flush_workqueue(hdev->cq_wq);
+
+	/* Make sure we don't have leftovers in the H/W queues mirror list */
+	list_for_each_entry_safe(cs, tmp, &hdev->hw_queues_mirror_list,
+				mirror_node) {
+		cs_get(cs);
+		cs->aborted = true;
+		dev_warn_ratelimited(hdev->dev, "Killing CS %d.%llu\n",
+					cs->ctx->asid, cs->sequence);
+		cs_rollback(hdev, cs);
+		cs_put(cs);
+	}
+}
+
+static void job_wq_completion(struct work_struct *work)
+{
+	struct hl_cs_job *job = container_of(work, struct hl_cs_job,
+						finish_work);
+	struct hl_cs *cs = job->cs;
+	struct hl_device *hdev = cs->ctx->hdev;
+
+	/* job is no longer needed */
+	free_job(hdev, job);
+}
+
+static struct hl_cb *validate_queue_index(struct hl_device *hdev,
+					struct hl_cb_mgr *cb_mgr,
+					struct hl_cs_chunk *chunk,
+					bool *ext_queue)
+{
+	struct asic_fixed_properties *asic = &hdev->asic_prop;
+	struct hw_queue_properties *hw_queue_prop;
+	u32 cb_handle;
+	struct hl_cb *cb;
+
+	/* Assume external queue */
+	*ext_queue = true;
+
+	hw_queue_prop = &asic->hw_queues_props[chunk->queue_index];
+
+	if ((chunk->queue_index >= HL_MAX_QUEUES) ||
+			(hw_queue_prop->type == QUEUE_TYPE_NA)) {
+		dev_err(hdev->dev, "Queue index %d is invalid\n",
+			chunk->queue_index);
+		return NULL;
+	}
+
+	if (hw_queue_prop->kmd_only) {
+		dev_err(hdev->dev, "Queue index %d is restricted for KMD\n",
+			chunk->queue_index);
+		return NULL;
+	} else if (hw_queue_prop->type == QUEUE_TYPE_INT) {
+		*ext_queue = false;
+		return (struct hl_cb *) (uintptr_t) chunk->cb_handle;
+	}
+
+	/* Retrieve CB object */
+	cb_handle = (u32) (chunk->cb_handle >> PAGE_SHIFT);
+
+	cb = hl_cb_get(hdev, cb_mgr, cb_handle);
+	if (!cb) {
+		dev_err(hdev->dev, "CB handle 0x%x invalid\n", cb_handle);
+		return NULL;
+	}
+
+	if ((chunk->cb_size < 8) || (chunk->cb_size > cb->size)) {
+		dev_err(hdev->dev, "CB size %u invalid\n", chunk->cb_size);
+		goto release_cb;
+	}
+
+	spin_lock(&cb->lock);
+	cb->cs_cnt++;
+	spin_unlock(&cb->lock);
+
+	return cb;
+
+release_cb:
+	hl_cb_put(cb);
+	return NULL;
+}
+
+struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, bool ext_queue)
+{
+	struct hl_cs_job *job;
+
+	job = kzalloc(sizeof(*job), GFP_ATOMIC);
+	if (!job)
+		return NULL;
+
+	job->ext_queue = ext_queue;
+
+	if (job->ext_queue) {
+		INIT_LIST_HEAD(&job->userptr_list);
+		INIT_WORK(&job->finish_work, job_wq_completion);
+	}
+
+	return job;
+}
+
+static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
+			u32 num_chunks, u64 *cs_seq)
+{
+	struct hl_device *hdev = hpriv->hdev;
+	struct hl_cs_chunk *cs_chunk_array;
+	struct hl_cs_job *job;
+	struct hl_cs *cs;
+	struct hl_cb *cb;
+	bool ext_queue_present = false;
+	u32 size_to_copy;
+	int rc, i, parse_cnt;
+
+	*cs_seq = ULLONG_MAX;
+
+	if (num_chunks > HL_MAX_JOBS_PER_CS) {
+		dev_err(hdev->dev,
+			"Number of chunks can NOT be larger than %d\n",
+			HL_MAX_JOBS_PER_CS);
+		rc = -EINVAL;
+		goto out;
+	}
+
+	cs_chunk_array = kmalloc_array(num_chunks, sizeof(*cs_chunk_array),
+					GFP_ATOMIC);
+	if (!cs_chunk_array) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	size_to_copy = num_chunks * sizeof(struct hl_cs_chunk);
+	if (copy_from_user(cs_chunk_array, chunks, size_to_copy)) {
+		dev_err(hdev->dev, "Failed to copy cs chunk array from user\n");
+		rc = -EFAULT;
+		goto free_cs_chunk_array;
+	}
+
+	/* increment refcnt for context */
+	hl_ctx_get(hdev, hpriv->ctx);
+
+	rc = allocate_cs(hdev, hpriv->ctx, &cs);
+	if (rc) {
+		hl_ctx_put(hpriv->ctx);
+		goto free_cs_chunk_array;
+	}
+
+	*cs_seq = cs->sequence;
+
+	/* Validate ALL the CS chunks before submitting the CS */
+	for (i = 0, parse_cnt = 0 ; i < num_chunks ; i++, parse_cnt++) {
+		struct hl_cs_chunk *chunk = &cs_chunk_array[i];
+		bool ext_queue;
+
+		cb = validate_queue_index(hdev, &hpriv->cb_mgr, chunk,
+					&ext_queue);
+		if (ext_queue) {
+			ext_queue_present = true;
+			if (!cb) {
+				rc = -EINVAL;
+				goto free_cs_object;
+			}
+		}
+
+		job = hl_cs_allocate_job(hdev, ext_queue);
+		if (!job) {
+			dev_err(hdev->dev, "Failed to allocate a new job\n");
+			rc = -ENOMEM;
+			if (ext_queue)
+				goto release_cb;
+			else
+				goto free_cs_object;
+		}
+
+		job->id = i + 1;
+		job->cs = cs;
+		job->user_cb = cb;
+		job->user_cb_size = chunk->cb_size;
+		if (job->ext_queue)
+			job->job_cb_size = cb->size;
+		else
+			job->job_cb_size = chunk->cb_size;
+		job->hw_queue_id = chunk->queue_index;
+
+		cs->jobs_in_queue_cnt[job->hw_queue_id]++;
+
+		list_add_tail(&job->cs_node, &cs->job_list);
+
+		/*
+		 * Increment CS reference. When CS reference is 0, CS is
+		 * done and can be signaled to user and free all its resources
+		 * Only increment for JOB on external queues, because only
+		 * for those JOBs we get completion
+		 */
+		if (job->ext_queue)
+			cs_get(cs);
+
+		rc = cs_parser(hpriv, job);
+		if (rc) {
+			dev_err(hdev->dev,
+				"Failed to parse JOB %d.%llu.%d, err %d, rejecting the CS\n",
+				cs->ctx->asid, cs->sequence, job->id, rc);
+			goto free_cs_object;
+		}
+	}
+
+	if (!ext_queue_present) {
+		dev_err(hdev->dev,
+			"Reject CS %d.%llu because no external queues jobs\n",
+			cs->ctx->asid, cs->sequence);
+		rc = -EINVAL;
+		goto free_cs_object;
+	}
+
+	rc = hl_hw_queue_schedule_cs(cs);
+	if (rc) {
+		dev_err(hdev->dev,
+			"Failed to submit CS %d.%llu to H/W queues, error %d\n",
+			cs->ctx->asid, cs->sequence, rc);
+		goto free_cs_object;
+	}
+
+	rc = HL_CS_STATUS_SUCCESS;
+	goto put_cs;
+
+release_cb:
+	spin_lock(&cb->lock);
+	cb->cs_cnt--;
+	spin_unlock(&cb->lock);
+	hl_cb_put(cb);
+free_cs_object:
+	cs_rollback(hdev, cs);
+	*cs_seq = ULLONG_MAX;
+	/* The path below is both for good and erroneous exits */
+put_cs:
+	/* We finished with the CS in this function, so put the ref */
+	cs_put(cs);
+free_cs_chunk_array:
+	kfree(cs_chunk_array);
+out:
+	return rc;
+}
+
+int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
+{
+	struct hl_device *hdev = hpriv->hdev;
+	union hl_cs_args *args = data;
+	struct hl_ctx *ctx = hpriv->ctx;
+	void __user *chunks;
+	u32 num_chunks;
+	u64 cs_seq = ULONG_MAX;
+	int rc, do_restore;
+	bool need_soft_reset = false;
+
+	if (hl_device_disabled_or_in_reset(hdev)) {
+		dev_warn(hdev->dev,
+			"Device is %s. Can't submit new CS\n",
+			atomic_read(&hdev->in_reset) ? "in_reset" : "disabled");
+		rc = -EBUSY;
+		goto out;
+	}
+
+	do_restore = atomic_cmpxchg(&ctx->thread_restore_token, 1, 0);
+
+	if (do_restore || (args->in.cs_flags & HL_CS_FLAGS_FORCE_RESTORE)) {
+		long ret;
+
+		chunks = (void __user *)(uintptr_t)args->in.chunks_restore;
+		num_chunks = args->in.num_chunks_restore;
+
+		mutex_lock(&hpriv->restore_phase_mutex);
+
+		if (do_restore) {
+			rc = hdev->asic_funcs->context_switch(hdev, ctx->asid);
+			if (rc) {
+				dev_err_ratelimited(hdev->dev,
+					"Failed to switch to context %d, rejecting CS! %d\n",
+					ctx->asid, rc);
+				/*
+				 * If we timedout, we need to soft-reset because
+				 * QMAN is probably stuck. However, we can't
+				 * call to reset here directly because of
+				 * deadlock, so need to do it at the very end
+				 * of this function
+				 */
+				if (rc == -ETIMEDOUT)
+					need_soft_reset = true;
+				mutex_unlock(&hpriv->restore_phase_mutex);
+				goto out;
+			}
+		}
+
+		hdev->asic_funcs->restore_phase_topology(hdev);
+
+		if (num_chunks == 0) {
+			dev_dbg(hdev->dev,
+			"Need to run restore phase but restore CS is empty\n");
+			rc = 0;
+		} else {
+			rc = _hl_cs_ioctl(hpriv, chunks, num_chunks,
+						&cs_seq);
+		}
+
+		mutex_unlock(&hpriv->restore_phase_mutex);
+
+		if (rc) {
+			dev_err(hdev->dev,
+				"Failed to submit restore CS for context %d (%d)\n",
+				ctx->asid, rc);
+			goto out;
+		}
+
+		/* Need to wait for restore completion before execution phase */
+		if (num_chunks > 0) {
+			ret = _hl_cs_wait_ioctl(hdev, ctx,
+					jiffies_to_usecs(hdev->timeout_jiffies),
+					cs_seq);
+			if (ret <= 0) {
+				dev_err(hdev->dev,
+					"Restore CS for context %d failed to complete %ld\n",
+					ctx->asid, ret);
+				rc = -ENOEXEC;
+				goto out;
+			}
+		}
+
+		ctx->thread_restore_wait_token = 1;
+	} else if (!ctx->thread_restore_wait_token) {
+		u32 tmp;
+
+		rc = hl_poll_timeout_memory(hdev,
+			(u64) (uintptr_t) &ctx->thread_restore_wait_token,
+			jiffies_to_usecs(hdev->timeout_jiffies),
+			&tmp);
+
+		if (rc || !tmp) {
+			dev_err(hdev->dev,
+				"restore phase hasn't finished in time\n");
+			rc = -ETIMEDOUT;
+			goto out;
+		}
+	}
+
+	chunks = (void __user *)(uintptr_t)args->in.chunks_execute;
+	num_chunks = args->in.num_chunks_execute;
+
+	if (num_chunks == 0) {
+		dev_err(hdev->dev,
+			"Got execute CS with 0 chunks, context %d\n",
+			ctx->asid);
+		rc = -EINVAL;
+		goto out;
+	}
+
+	rc = _hl_cs_ioctl(hpriv, chunks, num_chunks, &cs_seq);
+
+out:
+	if (rc != -EAGAIN) {
+		memset(args, 0, sizeof(*args));
+		args->out.status = rc;
+		args->out.seq = cs_seq;
+	}
+
+	if ((rc == -ETIMEDOUT) && (need_soft_reset))
+		hl_device_reset(hdev, false, false);
+
+	return rc;
+}
+
+static long _hl_cs_wait_ioctl(struct hl_device *hdev,
+		struct hl_ctx *ctx, u64 timeout_us, u64 seq)
+{
+	struct dma_fence *fence;
+	unsigned long timeout;
+	long rc;
+
+	if (timeout_us == MAX_SCHEDULE_TIMEOUT)
+		timeout = timeout_us;
+	else
+		timeout = usecs_to_jiffies(timeout_us);
+
+	hl_ctx_get(hdev, ctx);
+
+	fence = hl_ctx_get_fence(ctx, seq);
+	if (IS_ERR(fence)) {
+		rc = PTR_ERR(fence);
+	} else if (fence) {
+		rc = dma_fence_wait_timeout(fence, true, timeout);
+		if (fence->error == -ETIMEDOUT)
+			rc = -ETIMEDOUT;
+		else if (fence->error == -EIO)
+			rc = -EIO;
+		dma_fence_put(fence);
+	} else
+		rc = 1;
+
+	hl_ctx_put(ctx);
+
+	return rc;
+}
+
+int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
+{
+	struct hl_device *hdev = hpriv->hdev;
+	union hl_wait_cs_args *args = data;
+	u64 seq = args->in.seq;
+	long rc;
+
+	rc = _hl_cs_wait_ioctl(hdev, hpriv->ctx, args->in.timeout_us, seq);
+
+	memset(args, 0, sizeof(*args));
+
+	if (rc < 0) {
+		dev_err(hdev->dev, "Error %ld on waiting for CS handle %llu\n",
+			rc, seq);
+		if (rc == -ERESTARTSYS) {
+			args->out.status = HL_WAIT_CS_STATUS_INTERRUPTED;
+			rc = -EINTR;
+		} else if (rc == -ETIMEDOUT) {
+			args->out.status = HL_WAIT_CS_STATUS_TIMEDOUT;
+		} else if (rc == -EIO) {
+			args->out.status = HL_WAIT_CS_STATUS_ABORTED;
+		}
+		return rc;
+	}
+
+	if (rc == 0)
+		args->out.status = HL_WAIT_CS_STATUS_BUSY;
+	else
+		args->out.status = HL_WAIT_CS_STATUS_COMPLETED;
+
+	return 0;
+}
diff --git a/drivers/misc/habanalabs/context.c b/drivers/misc/habanalabs/context.c
index de1258e7a6e6..c3854714b46c 100644
--- a/drivers/misc/habanalabs/context.c
+++ b/drivers/misc/habanalabs/context.c
@@ -12,6 +12,18 @@
 static void hl_ctx_fini(struct hl_ctx *ctx)
 {
 	struct hl_device *hdev = ctx->hdev;
+	int i;
+
+	/*
+	 * If we arrived here, there are no jobs waiting for this context
+	 * on its queues so we can safely remove it.
+	 * This is because for each CS, we increment the ref count and for
+	 * every CS that was finished we decrement it and we won't arrive
+	 * to this function unless the ref count is 0
+	 */
+
+	for (i = 0 ; i < HL_MAX_PENDING_CS ; i++)
+		dma_fence_put(ctx->cs_pending[i]);
 
 	if (ctx->asid != HL_KERNEL_ASID_ID)
 		hl_asid_free(hdev, ctx->asid);
@@ -23,8 +35,6 @@ void hl_ctx_do_release(struct kref *ref)
 
 	ctx = container_of(ref, struct hl_ctx, refcount);
 
-	dev_dbg(ctx->hdev->dev, "Now really releasing context %d\n", ctx->asid);
-
 	hl_ctx_fini(ctx);
 
 	if (ctx->hpriv)
@@ -90,6 +100,11 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
 
 	kref_init(&ctx->refcount);
 
+	ctx->cs_sequence = 1;
+	spin_lock_init(&ctx->cs_lock);
+	atomic_set(&ctx->thread_restore_token, 1);
+	ctx->thread_restore_wait_token = 0;
+
 	if (is_kernel_ctx) {
 		ctx->asid = HL_KERNEL_ASID_ID; /* KMD gets ASID 0 */
 	} else {
@@ -100,8 +115,6 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
 		}
 	}
 
-	dev_dbg(hdev->dev, "Created context with ASID %u\n", ctx->asid);
-
 	return 0;
 }
 
@@ -115,6 +128,37 @@ int hl_ctx_put(struct hl_ctx *ctx)
 	return kref_put(&ctx->refcount, hl_ctx_do_release);
 }
 
+struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
+{
+	struct hl_device *hdev = ctx->hdev;
+	struct dma_fence *fence;
+
+	spin_lock(&ctx->cs_lock);
+
+	if (seq >= ctx->cs_sequence) {
+		dev_notice(hdev->dev,
+			"Can't wait on seq %llu because current CS is at seq %llu\n",
+			seq, ctx->cs_sequence);
+		spin_unlock(&ctx->cs_lock);
+		return ERR_PTR(-EINVAL);
+	}
+
+
+	if (seq + HL_MAX_PENDING_CS < ctx->cs_sequence) {
+		dev_dbg(hdev->dev,
+			"Can't wait on seq %llu because current CS is at seq %llu (Fence is gone)\n",
+			seq, ctx->cs_sequence);
+		spin_unlock(&ctx->cs_lock);
+		return NULL;
+	}
+
+	fence = dma_fence_get(
+			ctx->cs_pending[seq & (HL_MAX_PENDING_CS - 1)]);
+	spin_unlock(&ctx->cs_lock);
+
+	return fence;
+}
+
 /*
  * hl_ctx_mgr_init - initialize the context manager
  *
diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c
index 2aa8a68cdf76..cc5f068df597 100644
--- a/drivers/misc/habanalabs/device.c
+++ b/drivers/misc/habanalabs/device.c
@@ -30,6 +30,8 @@ static void hpriv_release(struct kref *ref)
 
 	put_pid(hpriv->taskpid);
 
+	mutex_destroy(&hpriv->restore_phase_mutex);
+
 	kfree(hpriv);
 
 	/* Now the FD is really closed */
@@ -208,6 +210,8 @@ static int device_early_init(struct hl_device *hdev)
 
 	mutex_init(&hdev->fd_open_cnt_lock);
 	mutex_init(&hdev->send_cpu_message_lock);
+	INIT_LIST_HEAD(&hdev->hw_queues_mirror_list);
+	spin_lock_init(&hdev->hw_queues_mirror_lock);
 	atomic_set(&hdev->in_reset, 0);
 	atomic_set(&hdev->fd_open_cnt, 0);
 
@@ -593,6 +597,9 @@ again:
 	 */
 	hdev->asic_funcs->halt_engines(hdev, hard_reset);
 
+	/* Go over all the queues, release all CS and their jobs */
+	hl_cs_rollback_all(hdev);
+
 	if (hard_reset) {
 		/* Release kernel context */
 		if (hl_ctx_put(hdev->kernel_ctx) != 1) {
@@ -616,6 +623,12 @@ again:
 	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
 		hl_cq_reset(hdev, &hdev->completion_queue[i]);
 
+	/* Make sure the setup phase for the user context will run again */
+	if (hdev->user_ctx) {
+		atomic_set(&hdev->user_ctx->thread_restore_token, 1);
+		hdev->user_ctx->thread_restore_wait_token = 0;
+	}
+
 	/* Finished tear-down, starting to re-initialize */
 
 	if (hard_reset) {
@@ -952,6 +965,9 @@ void hl_device_fini(struct hl_device *hdev)
 	 */
 	hdev->asic_funcs->halt_engines(hdev, true);
 
+	/* Go over all the queues, release all CS and their jobs */
+	hl_cs_rollback_all(hdev);
+
 	hl_cb_pool_fini(hdev);
 
 	/* Release kernel context */
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 1fe1d6a1ff9e..e3878fd7dc94 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -95,6 +95,19 @@ static const char goya_irq_name[GOYA_MSIX_ENTRIES][GOYA_MAX_STRING_LEN] = {
 		"goya cq 4", "goya cpu eq"
 };
 
+static u16 goya_packet_sizes[MAX_PACKET_ID] = {
+	[PACKET_WREG_32]	= sizeof(struct packet_wreg32),
+	[PACKET_WREG_BULK]	= sizeof(struct packet_wreg_bulk),
+	[PACKET_MSG_LONG]	= sizeof(struct packet_msg_long),
+	[PACKET_MSG_SHORT]	= sizeof(struct packet_msg_short),
+	[PACKET_CP_DMA]		= sizeof(struct packet_cp_dma),
+	[PACKET_MSG_PROT]	= sizeof(struct packet_msg_prot),
+	[PACKET_FENCE]		= sizeof(struct packet_fence),
+	[PACKET_LIN_DMA]	= sizeof(struct packet_lin_dma),
+	[PACKET_NOP]		= sizeof(struct packet_nop),
+	[PACKET_STOP]		= sizeof(struct packet_stop)
+};
+
 static const char *goya_axi_name[GOYA_MAX_INITIATORS] = {
 	"MME0",
 	"MME1",
@@ -2978,6 +2991,84 @@ void *goya_get_int_queue_base(struct hl_device *hdev, u32 queue_id,
 	return base;
 }
 
+int goya_send_job_on_qman0(struct hl_device *hdev, struct hl_cs_job *job)
+{
+	struct goya_device *goya = hdev->asic_specific;
+	struct packet_msg_prot *fence_pkt;
+	u32 *fence_ptr;
+	dma_addr_t fence_dma_addr;
+	struct hl_cb *cb;
+	u32 tmp;
+	int rc;
+
+	if (!hdev->asic_funcs->is_device_idle(hdev)) {
+		dev_err_ratelimited(hdev->dev,
+			"Can't send KMD job on QMAN0 if device is not idle\n");
+		return -EFAULT;
+	}
+
+	fence_ptr = hdev->asic_funcs->dma_pool_zalloc(hdev, 4, GFP_KERNEL,
+							&fence_dma_addr);
+	if (!fence_ptr) {
+		dev_err(hdev->dev,
+			"Failed to allocate fence memory for QMAN0\n");
+		return -ENOMEM;
+	}
+
+	*fence_ptr = 0;
+
+	if (goya->hw_cap_initialized & HW_CAP_MMU) {
+		WREG32(mmDMA_QM_0_GLBL_PROT, QMAN_DMA_FULLY_TRUSTED);
+		RREG32(mmDMA_QM_0_GLBL_PROT);
+	}
+
+	/*
+	 * goya cs parser saves space for 2xpacket_msg_prot at end of CB. For
+	 * synchronized kernel jobs we only need space for 1 packet_msg_prot
+	 */
+	job->job_cb_size -= sizeof(struct packet_msg_prot);
+
+	cb = job->patched_cb;
+
+	fence_pkt = (struct packet_msg_prot *) (uintptr_t) (cb->kernel_address +
+			job->job_cb_size - sizeof(struct packet_msg_prot));
+
+	fence_pkt->ctl = (PACKET_MSG_PROT << GOYA_PKT_CTL_OPCODE_SHIFT) |
+			(1 << GOYA_PKT_CTL_EB_SHIFT) |
+			(1 << GOYA_PKT_CTL_MB_SHIFT);
+	fence_pkt->value = GOYA_QMAN0_FENCE_VAL;
+	fence_pkt->addr = fence_dma_addr +
+			hdev->asic_prop.host_phys_base_address;
+
+	rc = hl_hw_queue_send_cb_no_cmpl(hdev, GOYA_QUEUE_ID_DMA_0,
+					job->job_cb_size, cb->bus_address);
+	if (rc) {
+		dev_err(hdev->dev, "Failed to send CB on QMAN0, %d\n", rc);
+		goto free_fence_ptr;
+	}
+
+	rc = hl_poll_timeout_memory(hdev, (u64) (uintptr_t) fence_ptr,
+					HL_DEVICE_TIMEOUT_USEC, &tmp);
+
+	hl_hw_queue_inc_ci_kernel(hdev, GOYA_QUEUE_ID_DMA_0);
+
+	if ((rc) || (tmp != GOYA_QMAN0_FENCE_VAL)) {
+		dev_err(hdev->dev, "QMAN0 Job hasn't finished in time\n");
+		rc = -ETIMEDOUT;
+	}
+
+free_fence_ptr:
+	hdev->asic_funcs->dma_pool_free(hdev, (void *) fence_ptr,
+					fence_dma_addr);
+
+	if (goya->hw_cap_initialized & HW_CAP_MMU) {
+		WREG32(mmDMA_QM_0_GLBL_PROT, QMAN_DMA_PARTLY_TRUSTED);
+		RREG32(mmDMA_QM_0_GLBL_PROT);
+	}
+
+	return rc;
+}
+
 int goya_send_cpu_message(struct hl_device *hdev, u32 *msg, u16 len,
 				u32 timeout, long *result)
 {
@@ -3214,11 +3305,985 @@ void goya_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
 			size);
 }
 
+int goya_dma_map_sg(struct hl_device *hdev, struct scatterlist *sg, int nents,
+			enum dma_data_direction dir)
+{
+	if (!dma_map_sg(&hdev->pdev->dev, sg, nents, dir))
+		return -ENOMEM;
+
+	return 0;
+}
+
+void goya_dma_unmap_sg(struct hl_device *hdev, struct scatterlist *sg,
+			int nents, enum dma_data_direction dir)
+{
+	dma_unmap_sg(&hdev->pdev->dev, sg, nents, dir);
+}
+
+u32 goya_get_dma_desc_list_size(struct hl_device *hdev,
+					struct sg_table *sgt)
+{
+	struct scatterlist *sg, *sg_next_iter;
+	u32 count, len, dma_desc_cnt, len_next;
+	dma_addr_t addr, addr_next;
+
+	dma_desc_cnt = 0;
+
+	for_each_sg(sgt->sgl, sg, sgt->nents, count) {
+
+		len = sg_dma_len(sg);
+		addr = sg_dma_address(sg);
+
+		if (len == 0)
+			break;
+
+		while ((count + 1) < sgt->nents) {
+			sg_next_iter = sg_next(sg);
+			len_next = sg_dma_len(sg_next_iter);
+			addr_next = sg_dma_address(sg_next_iter);
+
+			if (len_next == 0)
+				break;
+
+			if ((addr + len == addr_next) &&
+				(len + len_next <= DMA_MAX_TRANSFER_SIZE)) {
+				len += len_next;
+				count++;
+				sg = sg_next_iter;
+			} else {
+				break;
+			}
+		}
+
+		dma_desc_cnt++;
+	}
+
+	return dma_desc_cnt * sizeof(struct packet_lin_dma);
+}
+
+static int goya_pin_memory_before_cs(struct hl_device *hdev,
+				struct hl_cs_parser *parser,
+				struct packet_lin_dma *user_dma_pkt,
+				u64 addr, enum dma_data_direction dir)
+{
+	struct hl_userptr *userptr;
+	int rc;
+
+	if (hl_userptr_is_pinned(hdev, addr, user_dma_pkt->tsize,
+			parser->job_userptr_list, &userptr))
+		goto already_pinned;
+
+	userptr = kzalloc(sizeof(*userptr), GFP_ATOMIC);
+	if (!userptr)
+		return -ENOMEM;
+
+	rc = hl_pin_host_memory(hdev, addr, user_dma_pkt->tsize, userptr);
+	if (rc)
+		goto free_userptr;
+
+	list_add_tail(&userptr->job_node, parser->job_userptr_list);
+
+	rc = hdev->asic_funcs->asic_dma_map_sg(hdev, userptr->sgt->sgl,
+					userptr->sgt->nents, dir);
+	if (rc) {
+		dev_err(hdev->dev, "failed to map sgt with DMA region\n");
+		goto unpin_memory;
+	}
+
+	userptr->dma_mapped = true;
+	userptr->dir = dir;
+
+already_pinned:
+	parser->patched_cb_size +=
+			goya_get_dma_desc_list_size(hdev, userptr->sgt);
+
+	return 0;
+
+unpin_memory:
+	hl_unpin_host_memory(hdev, userptr);
+free_userptr:
+	kfree(userptr);
+	return rc;
+}
+
+static int goya_validate_dma_pkt_host(struct hl_device *hdev,
+				struct hl_cs_parser *parser,
+				struct packet_lin_dma *user_dma_pkt)
+{
+	u64 device_memory_addr, addr;
+	enum dma_data_direction dir;
+	enum goya_dma_direction user_dir;
+	bool sram_addr = true;
+	bool skip_host_mem_pin = false;
+	bool user_memset;
+	int rc = 0;
+
+	user_dir = (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_DMA_DIR_MASK) >>
+			GOYA_PKT_LIN_DMA_CTL_DMA_DIR_SHIFT;
+
+	user_memset = (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_MEMSET_MASK) >>
+			GOYA_PKT_LIN_DMA_CTL_MEMSET_SHIFT;
+
+	switch (user_dir) {
+	case DMA_HOST_TO_DRAM:
+		dev_dbg(hdev->dev, "DMA direction is HOST --> DRAM\n");
+		dir = DMA_TO_DEVICE;
+		sram_addr = false;
+		addr = user_dma_pkt->src_addr;
+		device_memory_addr = user_dma_pkt->dst_addr;
+		if (user_memset)
+			skip_host_mem_pin = true;
+		break;
+
+	case DMA_DRAM_TO_HOST:
+		dev_dbg(hdev->dev, "DMA direction is DRAM --> HOST\n");
+		dir = DMA_FROM_DEVICE;
+		sram_addr = false;
+		addr = user_dma_pkt->dst_addr;
+		device_memory_addr = user_dma_pkt->src_addr;
+		break;
+
+	case DMA_HOST_TO_SRAM:
+		dev_dbg(hdev->dev, "DMA direction is HOST --> SRAM\n");
+		dir = DMA_TO_DEVICE;
+		addr = user_dma_pkt->src_addr;
+		device_memory_addr = user_dma_pkt->dst_addr;
+		if (user_memset)
+			skip_host_mem_pin = true;
+		break;
+
+	case DMA_SRAM_TO_HOST:
+		dev_dbg(hdev->dev, "DMA direction is SRAM --> HOST\n");
+		dir = DMA_FROM_DEVICE;
+		addr = user_dma_pkt->dst_addr;
+		device_memory_addr = user_dma_pkt->src_addr;
+		break;
+	default:
+		dev_err(hdev->dev, "DMA direction is undefined\n");
+		return -EFAULT;
+	}
+
+	if (parser->ctx_id != HL_KERNEL_ASID_ID) {
+		if (sram_addr) {
+			if (!hl_mem_area_inside_range(device_memory_addr,
+					user_dma_pkt->tsize,
+					hdev->asic_prop.sram_user_base_address,
+					hdev->asic_prop.sram_end_address)) {
+
+				dev_err(hdev->dev,
+					"SRAM address 0x%llx + 0x%x is invalid\n",
+					device_memory_addr,
+					user_dma_pkt->tsize);
+				return -EFAULT;
+			}
+		} else {
+			if (!hl_mem_area_inside_range(device_memory_addr,
+					user_dma_pkt->tsize,
+					hdev->asic_prop.dram_user_base_address,
+					hdev->asic_prop.dram_end_address)) {
+
+				dev_err(hdev->dev,
+					"DRAM address 0x%llx + 0x%x is invalid\n",
+					device_memory_addr,
+					user_dma_pkt->tsize);
+				return -EFAULT;
+			}
+		}
+	}
+
+	if (skip_host_mem_pin)
+		parser->patched_cb_size += sizeof(*user_dma_pkt);
+	else {
+		if ((dir == DMA_TO_DEVICE) &&
+				(parser->hw_queue_id > GOYA_QUEUE_ID_DMA_1)) {
+			dev_err(hdev->dev,
+				"Can't DMA from host on queue other then 1\n");
+			return -EFAULT;
+		}
+
+		rc = goya_pin_memory_before_cs(hdev, parser, user_dma_pkt,
+						addr, dir);
+	}
+
+	return rc;
+}
+
+static int goya_validate_dma_pkt_no_host(struct hl_device *hdev,
+				struct hl_cs_parser *parser,
+				struct packet_lin_dma *user_dma_pkt)
+{
+	u64 sram_memory_addr, dram_memory_addr;
+	enum goya_dma_direction user_dir;
+
+	user_dir = (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_DMA_DIR_MASK) >>
+			GOYA_PKT_LIN_DMA_CTL_DMA_DIR_SHIFT;
+
+	if (user_dir == DMA_DRAM_TO_SRAM) {
+		dev_dbg(hdev->dev, "DMA direction is DRAM --> SRAM\n");
+		dram_memory_addr = user_dma_pkt->src_addr;
+		sram_memory_addr = user_dma_pkt->dst_addr;
+	} else {
+		dev_dbg(hdev->dev, "DMA direction is SRAM --> DRAM\n");
+		sram_memory_addr = user_dma_pkt->src_addr;
+		dram_memory_addr = user_dma_pkt->dst_addr;
+	}
+
+	if (!hl_mem_area_inside_range(sram_memory_addr, user_dma_pkt->tsize,
+				hdev->asic_prop.sram_user_base_address,
+				hdev->asic_prop.sram_end_address)) {
+		dev_err(hdev->dev, "SRAM address 0x%llx + 0x%x is invalid\n",
+			sram_memory_addr, user_dma_pkt->tsize);
+		return -EFAULT;
+	}
+
+	if (!hl_mem_area_inside_range(dram_memory_addr, user_dma_pkt->tsize,
+				hdev->asic_prop.dram_user_base_address,
+				hdev->asic_prop.dram_end_address)) {
+		dev_err(hdev->dev, "DRAM address 0x%llx + 0x%x is invalid\n",
+			dram_memory_addr, user_dma_pkt->tsize);
+		return -EFAULT;
+	}
+
+	parser->patched_cb_size += sizeof(*user_dma_pkt);
+
+	return 0;
+}
+
+static int goya_validate_dma_pkt_no_mmu(struct hl_device *hdev,
+				struct hl_cs_parser *parser,
+				struct packet_lin_dma *user_dma_pkt)
+{
+	enum goya_dma_direction user_dir;
+	int rc;
+
+	dev_dbg(hdev->dev, "DMA packet details:\n");
+	dev_dbg(hdev->dev, "source == 0x%llx\n", user_dma_pkt->src_addr);
+	dev_dbg(hdev->dev, "destination == 0x%llx\n", user_dma_pkt->dst_addr);
+	dev_dbg(hdev->dev, "size == %u\n", user_dma_pkt->tsize);
+
+	user_dir = (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_DMA_DIR_MASK) >>
+			GOYA_PKT_LIN_DMA_CTL_DMA_DIR_SHIFT;
+
+	/*
+	 * Special handling for DMA with size 0. The H/W has a bug where
+	 * this can cause the QMAN DMA to get stuck, so block it here.
+	 */
+	if (user_dma_pkt->tsize == 0) {
+		dev_err(hdev->dev,
+			"Got DMA with size 0, might reset the device\n");
+		return -EINVAL;
+	}
+
+	if ((user_dir == DMA_DRAM_TO_SRAM) || (user_dir == DMA_SRAM_TO_DRAM))
+		rc = goya_validate_dma_pkt_no_host(hdev, parser, user_dma_pkt);
+	else
+		rc = goya_validate_dma_pkt_host(hdev, parser, user_dma_pkt);
+
+	return rc;
+}
+
+static int goya_validate_dma_pkt_mmu(struct hl_device *hdev,
+				struct hl_cs_parser *parser,
+				struct packet_lin_dma *user_dma_pkt)
+{
+	dev_dbg(hdev->dev, "DMA packet details:\n");
+	dev_dbg(hdev->dev, "source == 0x%llx\n", user_dma_pkt->src_addr);
+	dev_dbg(hdev->dev, "destination == 0x%llx\n", user_dma_pkt->dst_addr);
+	dev_dbg(hdev->dev, "size == %u\n", user_dma_pkt->tsize);
+
+	/*
+	 * WA for HW-23.
+	 * We can't allow user to read from Host using QMANs other than 1.
+	 */
+	if (parser->hw_queue_id > GOYA_QUEUE_ID_DMA_1 &&
+		hl_mem_area_inside_range(user_dma_pkt->src_addr,
+				user_dma_pkt->tsize,
+				hdev->asic_prop.va_space_host_start_address,
+				hdev->asic_prop.va_space_host_end_address)) {
+		dev_err(hdev->dev,
+			"Can't DMA from host on queue other then 1\n");
+		return -EFAULT;
+	}
+
+	if (user_dma_pkt->tsize == 0) {
+		dev_err(hdev->dev,
+			"Got DMA with size 0, might reset the device\n");
+		return -EINVAL;
+	}
+
+	parser->patched_cb_size += sizeof(*user_dma_pkt);
+
+	return 0;
+}
+
+static int goya_validate_wreg32(struct hl_device *hdev,
+				struct hl_cs_parser *parser,
+				struct packet_wreg32 *wreg_pkt)
+{
+	struct goya_device *goya = hdev->asic_specific;
+	u32 sob_start_addr, sob_end_addr;
+	u16 reg_offset;
+
+	reg_offset = wreg_pkt->ctl & GOYA_PKT_WREG32_CTL_REG_OFFSET_MASK;
+
+	dev_dbg(hdev->dev, "WREG32 packet details:\n");
+	dev_dbg(hdev->dev, "reg_offset == 0x%x\n", reg_offset);
+	dev_dbg(hdev->dev, "value      == 0x%x\n", wreg_pkt->value);
+
+	if (reg_offset != (mmDMA_CH_1_WR_COMP_ADDR_LO & 0xFFFF)) {
+		dev_err(hdev->dev, "WREG32 packet with illegal address 0x%x\n",
+			reg_offset);
+		return -EPERM;
+	}
+
+	/*
+	 * With MMU, DMA channels are not secured, so it doesn't matter where
+	 * the WR COMP will be written to because it will go out with
+	 * non-secured property
+	 */
+	if (goya->hw_cap_initialized & HW_CAP_MMU)
+		return 0;
+
+	sob_start_addr = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0);
+	sob_end_addr = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_1023);
+
+	if ((wreg_pkt->value < sob_start_addr) ||
+			(wreg_pkt->value > sob_end_addr)) {
+
+		dev_err(hdev->dev, "WREG32 packet with illegal value 0x%x\n",
+			wreg_pkt->value);
+		return -EPERM;
+	}
+
+	return 0;
+}
+
+static int goya_validate_cb(struct hl_device *hdev,
+			struct hl_cs_parser *parser, bool is_mmu)
+{
+	u32 cb_parsed_length = 0;
+	int rc = 0;
+
+	parser->patched_cb_size = 0;
+
+	/* cb_user_size is more than 0 so loop will always be executed */
+	while (cb_parsed_length < parser->user_cb_size) {
+		enum packet_id pkt_id;
+		u16 pkt_size;
+		void *user_pkt;
+
+		user_pkt = (void *) (uintptr_t)
+			(parser->user_cb->kernel_address + cb_parsed_length);
+
+		pkt_id = (enum packet_id) (((*(u64 *) user_pkt) &
+				PACKET_HEADER_PACKET_ID_MASK) >>
+					PACKET_HEADER_PACKET_ID_SHIFT);
+
+		pkt_size = goya_packet_sizes[pkt_id];
+		cb_parsed_length += pkt_size;
+		if (cb_parsed_length > parser->user_cb_size) {
+			dev_err(hdev->dev,
+				"packet 0x%x is out of CB boundary\n", pkt_id);
+			rc = -EINVAL;
+			break;
+		}
+
+		switch (pkt_id) {
+		case PACKET_WREG_32:
+			/*
+			 * Although it is validated after copy in patch_cb(),
+			 * need to validate here as well because patch_cb() is
+			 * not called in MMU path while this function is called
+			 */
+			rc = goya_validate_wreg32(hdev, parser, user_pkt);
+			break;
+
+		case PACKET_WREG_BULK:
+			dev_err(hdev->dev,
+				"User not allowed to use WREG_BULK\n");
+			rc = -EPERM;
+			break;
+
+		case PACKET_MSG_PROT:
+			dev_err(hdev->dev,
+				"User not allowed to use MSG_PROT\n");
+			rc = -EPERM;
+			break;
+
+		case PACKET_CP_DMA:
+			dev_err(hdev->dev, "User not allowed to use CP_DMA\n");
+			rc = -EPERM;
+			break;
+
+		case PACKET_STOP:
+			dev_err(hdev->dev, "User not allowed to use STOP\n");
+			rc = -EPERM;
+			break;
+
+		case PACKET_LIN_DMA:
+			if (is_mmu)
+				rc = goya_validate_dma_pkt_mmu(hdev, parser,
+						user_pkt);
+			else
+				rc = goya_validate_dma_pkt_no_mmu(hdev, parser,
+						user_pkt);
+			break;
+
+		case PACKET_MSG_LONG:
+		case PACKET_MSG_SHORT:
+		case PACKET_FENCE:
+		case PACKET_NOP:
+			parser->patched_cb_size += pkt_size;
+			break;
+
+		default:
+			dev_err(hdev->dev, "Invalid packet header 0x%x\n",
+				pkt_id);
+			rc = -EINVAL;
+			break;
+		}
+
+		if (rc)
+			break;
+	}
+
+	/*
+	 * The new CB should have space at the end for two MSG_PROT packets:
+	 * 1. A packet that will act as a completion packet
+	 * 2. A packet that will generate MSI-X interrupt
+	 */
+	parser->patched_cb_size += sizeof(struct packet_msg_prot) * 2;
+
+	return rc;
+}
+
+static int goya_patch_dma_packet(struct hl_device *hdev,
+				struct hl_cs_parser *parser,
+				struct packet_lin_dma *user_dma_pkt,
+				struct packet_lin_dma *new_dma_pkt,
+				u32 *new_dma_pkt_size)
+{
+	struct hl_userptr *userptr;
+	struct scatterlist *sg, *sg_next_iter;
+	u32 count, len, dma_desc_cnt, len_next;
+	dma_addr_t dma_addr, dma_addr_next;
+	enum goya_dma_direction user_dir;
+	u64 device_memory_addr, addr;
+	enum dma_data_direction dir;
+	struct sg_table *sgt;
+	bool skip_host_mem_pin = false;
+	bool user_memset;
+	u32 user_rdcomp_mask, user_wrcomp_mask;
+
+	user_dir = (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_DMA_DIR_MASK) >>
+			GOYA_PKT_LIN_DMA_CTL_DMA_DIR_SHIFT;
+
+	user_memset = (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_MEMSET_MASK) >>
+			GOYA_PKT_LIN_DMA_CTL_MEMSET_SHIFT;
+
+	if ((user_dir == DMA_DRAM_TO_SRAM) || (user_dir == DMA_SRAM_TO_DRAM) ||
+			(user_dma_pkt->tsize == 0)) {
+		memcpy(new_dma_pkt, user_dma_pkt, sizeof(*new_dma_pkt));
+		*new_dma_pkt_size = sizeof(*new_dma_pkt);
+		return 0;
+	}
+
+	if ((user_dir == DMA_HOST_TO_DRAM) || (user_dir == DMA_HOST_TO_SRAM)) {
+		addr = user_dma_pkt->src_addr;
+		device_memory_addr = user_dma_pkt->dst_addr;
+		dir = DMA_TO_DEVICE;
+		if (user_memset)
+			skip_host_mem_pin = true;
+	} else {
+		addr = user_dma_pkt->dst_addr;
+		device_memory_addr = user_dma_pkt->src_addr;
+		dir = DMA_FROM_DEVICE;
+	}
+
+	if ((!skip_host_mem_pin) &&
+		(hl_userptr_is_pinned(hdev, addr, user_dma_pkt->tsize,
+			parser->job_userptr_list, &userptr) == false)) {
+		dev_err(hdev->dev, "Userptr 0x%llx + 0x%x NOT mapped\n",
+				addr, user_dma_pkt->tsize);
+		return -EFAULT;
+	}
+
+	if ((user_memset) && (dir == DMA_TO_DEVICE)) {
+		memcpy(new_dma_pkt, user_dma_pkt, sizeof(*user_dma_pkt));
+		*new_dma_pkt_size = sizeof(*user_dma_pkt);
+		return 0;
+	}
+
+	user_rdcomp_mask =
+			(user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_RDCOMP_MASK);
+
+	user_wrcomp_mask =
+			(user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_WRCOMP_MASK);
+
+	sgt = userptr->sgt;
+	dma_desc_cnt = 0;
+
+	for_each_sg(sgt->sgl, sg, sgt->nents, count) {
+		len = sg_dma_len(sg);
+		dma_addr = sg_dma_address(sg);
+
+		if (len == 0)
+			break;
+
+		while ((count + 1) < sgt->nents) {
+			sg_next_iter = sg_next(sg);
+			len_next = sg_dma_len(sg_next_iter);
+			dma_addr_next = sg_dma_address(sg_next_iter);
+
+			if (len_next == 0)
+				break;
+
+			if ((dma_addr + len == dma_addr_next) &&
+				(len + len_next <= DMA_MAX_TRANSFER_SIZE)) {
+				len += len_next;
+				count++;
+				sg = sg_next_iter;
+			} else {
+				break;
+			}
+		}
+
+		new_dma_pkt->ctl = user_dma_pkt->ctl;
+		if (likely(dma_desc_cnt))
+			new_dma_pkt->ctl &= ~GOYA_PKT_CTL_EB_MASK;
+		new_dma_pkt->ctl &= ~(GOYA_PKT_LIN_DMA_CTL_RDCOMP_MASK |
+					GOYA_PKT_LIN_DMA_CTL_WRCOMP_MASK);
+		new_dma_pkt->tsize = len;
+
+		dma_addr += hdev->asic_prop.host_phys_base_address;
+
+		if (dir == DMA_TO_DEVICE) {
+			new_dma_pkt->src_addr = dma_addr;
+			new_dma_pkt->dst_addr = device_memory_addr;
+		} else {
+			new_dma_pkt->src_addr = device_memory_addr;
+			new_dma_pkt->dst_addr = dma_addr;
+		}
+
+		if (!user_memset)
+			device_memory_addr += len;
+		dma_desc_cnt++;
+		new_dma_pkt++;
+	}
+
+	if (!dma_desc_cnt) {
+		dev_err(hdev->dev,
+			"Error of 0 SG entries when patching DMA packet\n");
+		return -EFAULT;
+	}
+
+	/* Fix the last dma packet - rdcomp/wrcomp must be as user set them */
+	new_dma_pkt--;
+	new_dma_pkt->ctl |= (user_rdcomp_mask | user_wrcomp_mask);
+
+	*new_dma_pkt_size = dma_desc_cnt * sizeof(struct packet_lin_dma);
+
+	return 0;
+}
+
+static int goya_patch_cb(struct hl_device *hdev,
+				struct hl_cs_parser *parser)
+{
+	u32 cb_parsed_length = 0;
+	u32 cb_patched_cur_length = 0;
+	int rc = 0;
+
+	/* cb_user_size is more than 0 so loop will always be executed */
+	while (cb_parsed_length < parser->user_cb_size) {
+		enum packet_id pkt_id;
+		u16 pkt_size;
+		u32 new_pkt_size = 0;
+		void *user_pkt, *kernel_pkt;
+
+		user_pkt = (void *) (uintptr_t)
+			(parser->user_cb->kernel_address + cb_parsed_length);
+		kernel_pkt = (void *) (uintptr_t)
+			(parser->patched_cb->kernel_address +
+					cb_patched_cur_length);
+
+		pkt_id = (enum packet_id) (((*(u64 *) user_pkt) &
+				PACKET_HEADER_PACKET_ID_MASK) >>
+					PACKET_HEADER_PACKET_ID_SHIFT);
+
+		pkt_size = goya_packet_sizes[pkt_id];
+		cb_parsed_length += pkt_size;
+		if (cb_parsed_length > parser->user_cb_size) {
+			dev_err(hdev->dev,
+				"packet 0x%x is out of CB boundary\n", pkt_id);
+			rc = -EINVAL;
+			break;
+		}
+
+		switch (pkt_id) {
+		case PACKET_LIN_DMA:
+			rc = goya_patch_dma_packet(hdev, parser, user_pkt,
+						kernel_pkt, &new_pkt_size);
+			cb_patched_cur_length += new_pkt_size;
+			break;
+
+		case PACKET_WREG_32:
+			memcpy(kernel_pkt, user_pkt, pkt_size);
+			cb_patched_cur_length += pkt_size;
+			rc = goya_validate_wreg32(hdev, parser, kernel_pkt);
+			break;
+
+		case PACKET_WREG_BULK:
+			dev_err(hdev->dev,
+				"User not allowed to use WREG_BULK\n");
+			rc = -EPERM;
+			break;
+
+		case PACKET_MSG_PROT:
+			dev_err(hdev->dev,
+				"User not allowed to use MSG_PROT\n");
+			rc = -EPERM;
+			break;
+
+		case PACKET_CP_DMA:
+			dev_err(hdev->dev, "User not allowed to use CP_DMA\n");
+			rc = -EPERM;
+			break;
+
+		case PACKET_STOP:
+			dev_err(hdev->dev, "User not allowed to use STOP\n");
+			rc = -EPERM;
+			break;
+
+		case PACKET_MSG_LONG:
+		case PACKET_MSG_SHORT:
+		case PACKET_FENCE:
+		case PACKET_NOP:
+			memcpy(kernel_pkt, user_pkt, pkt_size);
+			cb_patched_cur_length += pkt_size;
+			break;
+
+		default:
+			dev_err(hdev->dev, "Invalid packet header 0x%x\n",
+				pkt_id);
+			rc = -EINVAL;
+			break;
+		}
+
+		if (rc)
+			break;
+	}
+
+	return rc;
+}
+
+static int goya_parse_cb_mmu(struct hl_device *hdev,
+		struct hl_cs_parser *parser)
+{
+	u64 patched_cb_handle;
+	u32 patched_cb_size;
+	struct hl_cb *user_cb;
+	int rc;
+
+	/*
+	 * The new CB should have space at the end for two MSG_PROT pkt:
+	 * 1. A packet that will act as a completion packet
+	 * 2. A packet that will generate MSI-X interrupt
+	 */
+	parser->patched_cb_size = parser->user_cb_size +
+			sizeof(struct packet_msg_prot) * 2;
+
+	rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr,
+				parser->patched_cb_size,
+				&patched_cb_handle, HL_KERNEL_ASID_ID);
+
+	if (rc) {
+		dev_err(hdev->dev,
+			"Failed to allocate patched CB for DMA CS %d\n",
+			rc);
+		return rc;
+	}
+
+	patched_cb_handle >>= PAGE_SHIFT;
+	parser->patched_cb = hl_cb_get(hdev, &hdev->kernel_cb_mgr,
+				(u32) patched_cb_handle);
+	/* hl_cb_get should never fail here so use kernel WARN */
+	WARN(!parser->patched_cb, "DMA CB handle invalid 0x%x\n",
+			(u32) patched_cb_handle);
+	if (!parser->patched_cb) {
+		rc = -EFAULT;
+		goto out;
+	}
+
+	/*
+	 * The check that parser->user_cb_size <= parser->user_cb->size was done
+	 * in validate_queue_index().
+	 */
+	memcpy((void *) (uintptr_t) parser->patched_cb->kernel_address,
+		(void *) (uintptr_t) parser->user_cb->kernel_address,
+		parser->user_cb_size);
+
+	patched_cb_size = parser->patched_cb_size;
+
+	/* validate patched CB instead of user CB */
+	user_cb = parser->user_cb;
+	parser->user_cb = parser->patched_cb;
+	rc = goya_validate_cb(hdev, parser, true);
+	parser->user_cb = user_cb;
+
+	if (rc) {
+		hl_cb_put(parser->patched_cb);
+		goto out;
+	}
+
+	if (patched_cb_size != parser->patched_cb_size) {
+		dev_err(hdev->dev, "user CB size mismatch\n");
+		hl_cb_put(parser->patched_cb);
+		rc = -EINVAL;
+		goto out;
+	}
+
+out:
+	/*
+	 * Always call cb destroy here because we still have 1 reference
+	 * to it by calling cb_get earlier. After the job will be completed,
+	 * cb_put will release it, but here we want to remove it from the
+	 * idr
+	 */
+	hl_cb_destroy(hdev, &hdev->kernel_cb_mgr,
+					patched_cb_handle << PAGE_SHIFT);
+
+	return rc;
+}
+
+int goya_parse_cb_no_mmu(struct hl_device *hdev, struct hl_cs_parser *parser)
+{
+	u64 patched_cb_handle;
+	int rc;
+
+	rc = goya_validate_cb(hdev, parser, false);
+
+	if (rc)
+		goto free_userptr;
+
+	rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr,
+				parser->patched_cb_size,
+				&patched_cb_handle, HL_KERNEL_ASID_ID);
+	if (rc) {
+		dev_err(hdev->dev,
+			"Failed to allocate patched CB for DMA CS %d\n", rc);
+		goto free_userptr;
+	}
+
+	patched_cb_handle >>= PAGE_SHIFT;
+	parser->patched_cb = hl_cb_get(hdev, &hdev->kernel_cb_mgr,
+				(u32) patched_cb_handle);
+	/* hl_cb_get should never fail here so use kernel WARN */
+	WARN(!parser->patched_cb, "DMA CB handle invalid 0x%x\n",
+			(u32) patched_cb_handle);
+	if (!parser->patched_cb) {
+		rc = -EFAULT;
+		goto out;
+	}
+
+	rc = goya_patch_cb(hdev, parser);
+
+	if (rc)
+		hl_cb_put(parser->patched_cb);
+
+out:
+	/*
+	 * Always call cb destroy here because we still have 1 reference
+	 * to it by calling cb_get earlier. After the job will be completed,
+	 * cb_put will release it, but here we want to remove it from the
+	 * idr
+	 */
+	hl_cb_destroy(hdev, &hdev->kernel_cb_mgr,
+				patched_cb_handle << PAGE_SHIFT);
+
+free_userptr:
+	if (rc)
+		hl_userptr_delete_list(hdev, parser->job_userptr_list);
+	return rc;
+}
+
+int goya_parse_cb_no_ext_quque(struct hl_device *hdev,
+		struct hl_cs_parser *parser)
+{
+	struct asic_fixed_properties *asic_prop = &hdev->asic_prop;
+	struct goya_device *goya = hdev->asic_specific;
+
+	if (!(goya->hw_cap_initialized & HW_CAP_MMU)) {
+		/* For internal queue jobs, just check if cb address is valid */
+		if (hl_mem_area_inside_range(
+				(u64) (uintptr_t) parser->user_cb,
+				parser->user_cb_size,
+				asic_prop->sram_user_base_address,
+				asic_prop->sram_end_address))
+			return 0;
+
+		if (hl_mem_area_inside_range(
+				(u64) (uintptr_t) parser->user_cb,
+				parser->user_cb_size,
+				asic_prop->dram_user_base_address,
+				asic_prop->dram_end_address))
+			return 0;
+
+		dev_err(hdev->dev,
+			"Internal CB address 0x%llx + 0x%x is not in SRAM nor in DRAM\n",
+			(u64) (uintptr_t) parser->user_cb,
+			parser->user_cb_size);
+
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
+int goya_cs_parser(struct hl_device *hdev, struct hl_cs_parser *parser)
+{
+	struct goya_device *goya = hdev->asic_specific;
+
+	if (!parser->ext_queue)
+		return goya_parse_cb_no_ext_quque(hdev, parser);
+
+	if ((goya->hw_cap_initialized & HW_CAP_MMU) && parser->use_virt_addr)
+		return goya_parse_cb_mmu(hdev, parser);
+	else
+		return goya_parse_cb_no_mmu(hdev, parser);
+}
+
+void goya_add_end_of_cb_packets(u64 kernel_address, u32 len, u64 cq_addr,
+				u32 cq_val, u32 msix_vec)
+{
+	struct packet_msg_prot *cq_pkt;
+
+	cq_pkt = (struct packet_msg_prot *) (uintptr_t)
+		(kernel_address + len - (sizeof(struct packet_msg_prot) * 2));
+
+	cq_pkt->ctl = (PACKET_MSG_PROT << GOYA_PKT_CTL_OPCODE_SHIFT) |
+			(1 << GOYA_PKT_CTL_EB_SHIFT) |
+			(1 << GOYA_PKT_CTL_MB_SHIFT);
+	cq_pkt->value = cq_val;
+	cq_pkt->addr = cq_addr;
+
+	cq_pkt++;
+
+	cq_pkt->ctl = (PACKET_MSG_PROT << GOYA_PKT_CTL_OPCODE_SHIFT) |
+			(1 << GOYA_PKT_CTL_MB_SHIFT);
+	cq_pkt->value = msix_vec & 0x7FF;
+	cq_pkt->addr = CFG_BASE + mmPCIE_DBI_MSIX_DOORBELL_OFF;
+}
+
 static void goya_update_eq_ci(struct hl_device *hdev, u32 val)
 {
 	WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_6, val);
 }
 
+int goya_context_switch(struct hl_device *hdev, u32 asid)
+{
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	struct packet_lin_dma *clear_sram_pkt;
+	struct hl_cs_parser parser;
+	struct hl_cs_job *job;
+	u32 cb_size;
+	struct hl_cb *cb;
+	int rc;
+
+	cb = hl_cb_kernel_create(hdev, PAGE_SIZE);
+	if (!cb)
+		return -EFAULT;
+
+	clear_sram_pkt = (struct packet_lin_dma *)
+					(uintptr_t) cb->kernel_address;
+
+	memset(clear_sram_pkt, 0, sizeof(*clear_sram_pkt));
+	cb_size = sizeof(*clear_sram_pkt);
+
+	clear_sram_pkt->ctl = ((PACKET_LIN_DMA << GOYA_PKT_CTL_OPCODE_SHIFT) |
+		(DMA_HOST_TO_SRAM << GOYA_PKT_LIN_DMA_CTL_DMA_DIR_SHIFT) |
+		(1 << GOYA_PKT_LIN_DMA_CTL_MEMSET_SHIFT) |
+		(1 << GOYA_PKT_LIN_DMA_CTL_WO_SHIFT) |
+		(1 << GOYA_PKT_CTL_RB_SHIFT) |
+		(1 << GOYA_PKT_CTL_MB_SHIFT));
+
+	clear_sram_pkt->src_addr = 0x7777777777777777ull;
+	clear_sram_pkt->dst_addr = prop->sram_base_address;
+	if (hdev->pldm)
+		clear_sram_pkt->tsize = 0x10000;
+	else
+		clear_sram_pkt->tsize = prop->sram_size;
+
+	job = hl_cs_allocate_job(hdev, true);
+	if (!job) {
+		dev_err(hdev->dev, "Failed to allocate a new job\n");
+		rc = -ENOMEM;
+		goto release_cb;
+	}
+
+	job->id = 0;
+	job->user_cb = cb;
+	job->user_cb->cs_cnt++;
+	job->user_cb_size = cb_size;
+	job->hw_queue_id = GOYA_QUEUE_ID_DMA_0;
+
+	parser.ctx_id = HL_KERNEL_ASID_ID;
+	parser.cs_sequence = 0;
+	parser.job_id = job->id;
+	parser.hw_queue_id = job->hw_queue_id;
+	parser.job_userptr_list = &job->userptr_list;
+	parser.user_cb = job->user_cb;
+	parser.user_cb_size = job->user_cb_size;
+	parser.ext_queue = job->ext_queue;
+	parser.use_virt_addr = hdev->mmu_enable;
+
+	rc = hdev->asic_funcs->cs_parser(hdev, &parser);
+	if (rc) {
+		dev_err(hdev->dev,
+			"Failed to parse kernel CB during context switch\n");
+		goto free_job;
+	}
+
+	job->patched_cb = parser.patched_cb;
+	job->job_cb_size = parser.patched_cb_size;
+	job->patched_cb->cs_cnt++;
+
+	rc = goya_send_job_on_qman0(hdev, job);
+
+	job->patched_cb->cs_cnt--;
+	hl_cb_put(job->patched_cb);
+
+free_job:
+	hl_userptr_delete_list(hdev, &job->userptr_list);
+	kfree(job);
+	cb->cs_cnt--;
+
+release_cb:
+	hl_cb_put(cb);
+	hl_cb_destroy(hdev, &hdev->kernel_cb_mgr, cb->id << PAGE_SHIFT);
+
+	return rc;
+}
+
+void goya_restore_phase_topology(struct hl_device *hdev)
+{
+	int i, num_of_sob_in_longs, num_of_mon_in_longs;
+
+	num_of_sob_in_longs =
+		((mmSYNC_MNGR_SOB_OBJ_1023 - mmSYNC_MNGR_SOB_OBJ_0) + 4);
+
+	num_of_mon_in_longs =
+		((mmSYNC_MNGR_MON_STATUS_255 - mmSYNC_MNGR_MON_STATUS_0) + 4);
+
+	for (i = 0 ; i < num_of_sob_in_longs ; i += 4)
+		WREG32(mmSYNC_MNGR_SOB_OBJ_0 + i, 0);
+
+	for (i = 0 ; i < num_of_mon_in_longs ; i += 4)
+		WREG32(mmSYNC_MNGR_MON_STATUS_0 + i, 0);
+
+	/* Flush all WREG to prevent race */
+	i = RREG32(mmSYNC_MNGR_SOB_OBJ_0);
+}
+
 static void goya_get_axi_name(struct hl_device *hdev, u32 agent_id,
 		u16 event_type, char *axi_name, int len)
 {
@@ -3608,6 +4673,59 @@ static void goya_disable_clock_gating(struct hl_device *hdev)
 
 }
 
+static bool goya_is_device_idle(struct hl_device *hdev)
+{
+	u64 offset, dma_qm_reg, tpc_qm_reg, tpc_cmdq_reg, tpc_cfg_reg;
+	int i;
+
+	offset = mmDMA_QM_1_GLBL_STS0 - mmDMA_QM_0_GLBL_STS0;
+
+	for (i = 0 ; i < DMA_MAX_NUM ; i++) {
+		dma_qm_reg = mmDMA_QM_0_GLBL_STS0 + i * offset;
+
+		if ((RREG32(dma_qm_reg) & DMA_QM_IDLE_MASK) !=
+				DMA_QM_IDLE_MASK)
+			return false;
+	}
+
+	offset = mmTPC1_QM_GLBL_STS0 - mmTPC0_QM_GLBL_STS0;
+
+	for (i = 0 ; i < TPC_MAX_NUM ; i++) {
+		tpc_qm_reg = mmTPC0_QM_GLBL_STS0 + i * offset;
+		tpc_cmdq_reg = mmTPC0_CMDQ_GLBL_STS0 + i * offset;
+		tpc_cfg_reg = mmTPC0_CFG_STATUS + i * offset;
+
+		if ((RREG32(tpc_qm_reg) & TPC_QM_IDLE_MASK) !=
+				TPC_QM_IDLE_MASK)
+			return false;
+
+		if ((RREG32(tpc_cmdq_reg) & TPC_CMDQ_IDLE_MASK) !=
+				TPC_CMDQ_IDLE_MASK)
+			return false;
+
+		if ((RREG32(tpc_cfg_reg) & TPC_CFG_IDLE_MASK) !=
+				TPC_CFG_IDLE_MASK)
+			return false;
+	}
+
+	if ((RREG32(mmMME_QM_GLBL_STS0) & MME_QM_IDLE_MASK) !=
+			MME_QM_IDLE_MASK)
+		return false;
+
+	if ((RREG32(mmMME_CMDQ_GLBL_STS0) & MME_CMDQ_IDLE_MASK) !=
+			MME_CMDQ_IDLE_MASK)
+		return false;
+
+	if ((RREG32(mmMME_ARCH_STATUS) & MME_ARCH_IDLE_MASK) !=
+			MME_ARCH_IDLE_MASK)
+		return false;
+
+	if (RREG32(mmMME_SHADOW_0_STATUS) & MME_SHADOW_IDLE_MASK)
+		return false;
+
+	return true;
+}
+
 static void goya_hw_queues_lock(struct hl_device *hdev)
 {
 	struct goya_device *goya = hdev->asic_specific;
@@ -3700,7 +4818,14 @@ static const struct hl_asic_funcs goya_funcs = {
 	.dma_pool_free = goya_dma_pool_free,
 	.cpu_accessible_dma_pool_alloc = goya_cpu_accessible_dma_pool_alloc,
 	.cpu_accessible_dma_pool_free = goya_cpu_accessible_dma_pool_free,
+	.hl_dma_unmap_sg = goya_dma_unmap_sg,
+	.cs_parser = goya_cs_parser,
+	.asic_dma_map_sg = goya_dma_map_sg,
+	.get_dma_desc_list_size = goya_get_dma_desc_list_size,
+	.add_end_of_cb_packets = goya_add_end_of_cb_packets,
 	.update_eq_ci = goya_update_eq_ci,
+	.context_switch = goya_context_switch,
+	.restore_phase_topology = goya_restore_phase_topology,
 	.add_device_attr = goya_add_device_attr,
 	.handle_eqe = goya_handle_eqe,
 	.set_pll_profile = goya_set_pll_profile,
@@ -3708,6 +4833,7 @@ static const struct hl_asic_funcs goya_funcs = {
 	.send_heartbeat = goya_send_heartbeat,
 	.enable_clock_gating = goya_init_clock_gating,
 	.disable_clock_gating = goya_disable_clock_gating,
+	.is_device_idle = goya_is_device_idle,
 	.soft_reset_late_init = goya_soft_reset_late_init,
 	.hw_queues_lock = goya_hw_queues_lock,
 	.hw_queues_unlock = goya_hw_queues_unlock,
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 744e37bbc2a6..9adc7c6ec08b 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -16,6 +16,9 @@
 #include <linux/cdev.h>
 #include <linux/iopoll.h>
 #include <linux/irqreturn.h>
+#include <linux/dma-fence.h>
+#include <linux/dma-direction.h>
+#include <linux/scatterlist.h>
 
 #define HL_NAME				"habanalabs"
 
@@ -31,6 +34,11 @@
 
 #define HL_MAX_QUEUES			128
 
+#define HL_MAX_JOBS_PER_CS		64
+
+/* MUST BE POWER OF 2 and larger than 1 */
+#define HL_MAX_PENDING_CS		64
+
 struct hl_device;
 struct hl_fpriv;
 
@@ -61,6 +69,16 @@ struct hw_queue_properties {
 	u8			kmd_only;
 };
 
+/**
+ * enum vm_type_t - virtual memory mapping request information.
+ * @VM_TYPE_USERPTR: mapping of user memory to device virtual address.
+ * @VM_TYPE_PHYS_LIST: mapping of DRAM memory to device virtual address.
+ */
+enum vm_type_t {
+	VM_TYPE_USERPTR,
+	VM_TYPE_PHYS_LIST
+};
+
 /**
  * enum hl_device_hw_state - H/W device state. use this to understand whether
  *                           to do reset before hw_init or not
@@ -147,6 +165,19 @@ struct asic_fixed_properties {
 	u8			tpc_enabled_mask;
 };
 
+/**
+ * struct hl_dma_fence - wrapper for fence object used by command submissions.
+ * @base_fence: kernel fence object.
+ * @lock: spinlock to protect fence.
+ * @hdev: habanalabs device structure.
+ * @cs_seq: command submission sequence number.
+ */
+struct hl_dma_fence {
+	struct dma_fence	base_fence;
+	spinlock_t		lock;
+	struct hl_device	*hdev;
+	u64			cs_seq;
+};
 
 /*
  * Command Buffers
@@ -175,6 +206,7 @@ struct hl_cb_mgr {
  * @mmap_size: Holds the CB's size that was mmaped.
  * @size: holds the CB's size.
  * @id: the CB's ID.
+ * @cs_cnt: holds number of CS that this CB participates in.
  * @ctx_id: holds the ID of the owner's context.
  * @mmap: true if the CB is currently mmaped to user.
  * @is_pool: true if CB was acquired from the pool, false otherwise.
@@ -189,6 +221,7 @@ struct hl_cb {
 	u32			mmap_size;
 	u32			size;
 	u32			id;
+	u32			cs_cnt;
 	u32			ctx_id;
 	u8			mmap;
 	u8			is_pool;
@@ -313,6 +346,8 @@ enum hl_asic_type {
 	ASIC_INVALID
 };
 
+struct hl_cs_parser;
+
 /**
  * enum hl_pm_mng_profile - power management profile.
  * @PM_AUTO: internal clock is set by KMD.
@@ -372,7 +407,14 @@ enum hl_pll_frequency {
  * @dma_pool_free: free small DMA allocation from pool.
  * @cpu_accessible_dma_pool_alloc: allocate CPU PQ packet from DMA pool.
  * @cpu_accessible_dma_pool_free: free CPU PQ packet from DMA pool.
+ * @hl_dma_unmap_sg: DMA unmap scatter-gather list.
+ * @cs_parser: parse Command Submission.
+ * @asic_dma_map_sg: DMA map scatter-gather list.
+ * @get_dma_desc_list_size: get number of LIN_DMA packets required for CB.
+ * @add_end_of_cb_packets: Add packets to the end of CB, if device requires it.
  * @update_eq_ci: update event queue CI.
+ * @context_switch: called upon ASID context switch.
+ * @restore_phase_topology: clear all SOBs amd MONs.
  * @add_device_attr: add ASIC specific device attributes.
  * @handle_eqe: handle event queue entry (IRQ) from ArmCP.
  * @set_pll_profile: change PLL profile (manual/automatic).
@@ -380,6 +422,7 @@ enum hl_pll_frequency {
  * @send_heartbeat: send is-alive packet to ArmCP and verify response.
  * @enable_clock_gating: enable clock gating for reducing power consumption.
  * @disable_clock_gating: disable clock for accessing registers on HBW.
+ * @is_device_idle: return true if device is idle, false otherwise.
  * @soft_reset_late_init: perform certain actions needed after soft reset.
  * @hw_queues_lock: acquire H/W queues lock.
  * @hw_queues_unlock: release H/W queues lock.
@@ -419,7 +462,20 @@ struct hl_asic_funcs {
 				size_t size, dma_addr_t *dma_handle);
 	void (*cpu_accessible_dma_pool_free)(struct hl_device *hdev,
 				size_t size, void *vaddr);
+	void (*hl_dma_unmap_sg)(struct hl_device *hdev,
+				struct scatterlist *sg, int nents,
+				enum dma_data_direction dir);
+	int (*cs_parser)(struct hl_device *hdev, struct hl_cs_parser *parser);
+	int (*asic_dma_map_sg)(struct hl_device *hdev,
+				struct scatterlist *sg, int nents,
+				enum dma_data_direction dir);
+	u32 (*get_dma_desc_list_size)(struct hl_device *hdev,
+					struct sg_table *sgt);
+	void (*add_end_of_cb_packets)(u64 kernel_address, u32 len, u64 cq_addr,
+					u32 cq_val, u32 msix_num);
 	void (*update_eq_ci)(struct hl_device *hdev, u32 val);
+	int (*context_switch)(struct hl_device *hdev, u32 asid);
+	void (*restore_phase_topology)(struct hl_device *hdev);
 	void (*add_device_attr)(struct hl_device *hdev,
 				struct attribute_group *dev_attr_grp);
 	void (*handle_eqe)(struct hl_device *hdev,
@@ -430,6 +486,7 @@ struct hl_asic_funcs {
 	int (*send_heartbeat)(struct hl_device *hdev);
 	void (*enable_clock_gating)(struct hl_device *hdev);
 	void (*disable_clock_gating)(struct hl_device *hdev);
+	bool (*is_device_idle)(struct hl_device *hdev);
 	int (*soft_reset_late_init)(struct hl_device *hdev);
 	void (*hw_queues_lock)(struct hl_device *hdev);
 	void (*hw_queues_unlock)(struct hl_device *hdev);
@@ -453,12 +510,28 @@ struct hl_asic_funcs {
  * @hdev: pointer to the device structure.
  * @refcount: reference counter for the context. Context is released only when
  *		this hits 0l. It is incremented on CS and CS_WAIT.
+ * @cs_pending: array of DMA fence objects representing pending CS.
+ * @cs_sequence: sequence number for CS. Value is assigned to a CS and passed
+ *			to user so user could inquire about CS. It is used as
+ *			index to cs_pending array.
+ * @cs_lock: spinlock to protect cs_sequence.
+ * @thread_restore_token: token to prevent multiple threads of the same context
+ *				from running the restore phase. Only one thread
+ *				should run it.
+ * @thread_restore_wait_token: token to prevent the threads that didn't run
+ *				the restore phase from moving to their execution
+ *				phase before the restore phase has finished.
  * @asid: context's unique address space ID in the device's MMU.
  */
 struct hl_ctx {
 	struct hl_fpriv		*hpriv;
 	struct hl_device	*hdev;
 	struct kref		refcount;
+	struct dma_fence	*cs_pending[HL_MAX_PENDING_CS];
+	u64			cs_sequence;
+	spinlock_t		cs_lock;
+	atomic_t		thread_restore_token;
+	u32			thread_restore_wait_token;
 	u32			asid;
 };
 
@@ -473,14 +546,129 @@ struct hl_ctx_mgr {
 };
 
 
+
+/*
+ * COMMAND SUBMISSIONS
+ */
+
+/**
+ * struct hl_userptr - memory mapping chunk information
+ * @vm_type: type of the VM.
+ * @job_node: linked-list node for hanging the object on the Job's list.
+ * @vec: pointer to the frame vector.
+ * @sgt: pointer to the scatter-gather table that holds the pages.
+ * @dir: for DMA unmapping, the direction must be supplied, so save it.
+ * @debugfs_list: node in debugfs list of command submissions.
+ * @addr: user-space virtual pointer to the start of the memory area.
+ * @size: size of the memory area to pin & map.
+ * @dma_mapped: true if the SG was mapped to DMA addresses, false otherwise.
+ */
+struct hl_userptr {
+	enum vm_type_t		vm_type; /* must be first */
+	struct list_head	job_node;
+	struct frame_vector	*vec;
+	struct sg_table		*sgt;
+	enum dma_data_direction dir;
+	struct list_head	debugfs_list;
+	u64			addr;
+	u32			size;
+	u8			dma_mapped;
+};
+
+/**
+ * struct hl_cs - command submission.
+ * @jobs_in_queue_cnt: per each queue, maintain counter of submitted jobs.
+ * @ctx: the context this CS belongs to.
+ * @job_list: list of the CS's jobs in the various queues.
+ * @job_lock: spinlock for the CS's jobs list. Needed for free_job.
+ * @refcount: reference counter for usage of the CS.
+ * @fence: pointer to the fence object of this CS.
+ * @work_tdr: delayed work node for TDR.
+ * @mirror_node : node in device mirror list of command submissions.
+ * @sequence: the sequence number of this CS.
+ * @submitted: true if CS was submitted to H/W.
+ * @completed: true if CS was completed by device.
+ * @timedout : true if CS was timedout.
+ * @tdr_active: true if TDR was activated for this CS (to prevent
+ *		double TDR activation).
+ * @aborted: true if CS was aborted due to some device error.
+ */
+struct hl_cs {
+	u8			jobs_in_queue_cnt[HL_MAX_QUEUES];
+	struct hl_ctx		*ctx;
+	struct list_head	job_list;
+	spinlock_t		job_lock;
+	struct kref		refcount;
+	struct dma_fence	*fence;
+	struct delayed_work	work_tdr;
+	struct list_head	mirror_node;
+	u64			sequence;
+	u8			submitted;
+	u8			completed;
+	u8			timedout;
+	u8			tdr_active;
+	u8			aborted;
+};
+
 /**
  * struct hl_cs_job - command submission job.
+ * @cs_node: the node to hang on the CS jobs list.
+ * @cs: the CS this job belongs to.
+ * @user_cb: the CB we got from the user.
+ * @patched_cb: in case of patching, this is internal CB which is submitted on
+ *		the queue instead of the CB we got from the IOCTL.
  * @finish_work: workqueue object to run when job is completed.
+ * @userptr_list: linked-list of userptr mappings that belong to this job and
+ *			wait for completion.
  * @id: the id of this job inside a CS.
+ * @hw_queue_id: the id of the H/W queue this job is submitted to.
+ * @user_cb_size: the actual size of the CB we got from the user.
+ * @job_cb_size: the actual size of the CB that we put on the queue.
+ * @ext_queue: whether the job is for external queue or internal queue.
  */
 struct hl_cs_job {
+	struct list_head	cs_node;
+	struct hl_cs		*cs;
+	struct hl_cb		*user_cb;
+	struct hl_cb		*patched_cb;
 	struct work_struct	finish_work;
+	struct list_head	userptr_list;
 	u32			id;
+	u32			hw_queue_id;
+	u32			user_cb_size;
+	u32			job_cb_size;
+	u8			ext_queue;
+};
+
+/**
+ * struct hl_cs_parser - command submission paerser properties.
+ * @user_cb: the CB we got from the user.
+ * @patched_cb: in case of patching, this is internal CB which is submitted on
+ *		the queue instead of the CB we got from the IOCTL.
+ * @job_userptr_list: linked-list of userptr mappings that belong to the related
+ *			job and wait for completion.
+ * @cs_sequence: the sequence number of the related CS.
+ * @ctx_id: the ID of the context the related CS belongs to.
+ * @hw_queue_id: the id of the H/W queue this job is submitted to.
+ * @user_cb_size: the actual size of the CB we got from the user.
+ * @patched_cb_size: the size of the CB after parsing.
+ * @ext_queue: whether the job is for external queue or internal queue.
+ * @job_id: the id of the related job inside the related CS.
+ * @use_virt_addr: whether to treat the addresses in the CB as virtual during
+ *			parsing.
+ */
+struct hl_cs_parser {
+	struct hl_cb		*user_cb;
+	struct hl_cb		*patched_cb;
+	struct list_head	*job_userptr_list;
+	u64			cs_sequence;
+	u32			ctx_id;
+	u32			hw_queue_id;
+	u32			user_cb_size;
+	u32			patched_cb_size;
+	u8			ext_queue;
+	u8			job_id;
+	u8			use_virt_addr;
 };
 
 
@@ -497,6 +685,7 @@ struct hl_cs_job {
  * @ctx_mgr: context manager to handle multiple context for this FD.
  * @cb_mgr: command buffer manager to handle multiple buffers for this FD.
  * @refcount: number of related contexts.
+ * @restore_phase_mutex: lock for context switch and restore phase.
  */
 struct hl_fpriv {
 	struct hl_device	*hdev;
@@ -506,6 +695,7 @@ struct hl_fpriv {
 	struct hl_ctx_mgr	ctx_mgr;
 	struct hl_cb_mgr	cb_mgr;
 	struct kref		refcount;
+	struct mutex		restore_phase_mutex;
 };
 
 
@@ -577,6 +767,8 @@ struct hl_device_reset_work {
  * @eq_wq: work queue of event queue for executing work in process context.
  * @kernel_ctx: KMD context structure.
  * @kernel_queues: array of hl_hw_queue.
+ * @hw_queues_mirror_list: CS mirror list for TDR.
+ * @hw_queues_mirror_lock: protects hw_queues_mirror_list.
  * @kernel_cb_mgr: command buffer manager for creating/destroying/handling CGs.
  * @event_queue: event queue for IRQ from ArmCP.
  * @dma_pool: DMA pool for small allocations.
@@ -604,6 +796,7 @@ struct hl_device_reset_work {
  * @in_reset: is device in reset flow.
  * @curr_pll_profile: current PLL profile.
  * @fd_open_cnt: number of open user processes.
+ * @timeout_jiffies: device CS timeout value.
  * @max_power: the max power of the device, as configured by the sysadmin. This
  *             value is saved so in case of hard-reset, KMD will restore this
  *             value and update the F/W after the re-initialization
@@ -617,7 +810,10 @@ struct hl_device_reset_work {
  * @hwmon_initialized: is H/W monitor sensors was initialized.
  * @hard_reset_pending: is there a hard reset work pending.
  * @heartbeat: is heartbeat sanity check towards ArmCP enabled.
+ * @reset_on_lockup: true if a reset should be done in case of stuck CS, false
+ *                   otherwise.
  * @init_done: is the initialization of the device done.
+ * @mmu_enable: is MMU enabled.
  */
 struct hl_device {
 	struct pci_dev			*pdev;
@@ -634,6 +830,8 @@ struct hl_device {
 	struct workqueue_struct		*eq_wq;
 	struct hl_ctx			*kernel_ctx;
 	struct hl_hw_queue		*kernel_queues;
+	struct list_head		hw_queues_mirror_list;
+	spinlock_t			hw_queues_mirror_lock;
 	struct hl_cb_mgr		kernel_cb_mgr;
 	struct hl_eq			event_queue;
 	struct dma_pool			*dma_pool;
@@ -661,6 +859,7 @@ struct hl_device {
 	atomic_t			in_reset;
 	atomic_t			curr_pll_profile;
 	atomic_t			fd_open_cnt;
+	u64				timeout_jiffies;
 	u64				max_power;
 	u32				major;
 	u32				high_pll;
@@ -672,9 +871,11 @@ struct hl_device {
 	u8				hwmon_initialized;
 	u8				hard_reset_pending;
 	u8				heartbeat;
+	u8				reset_on_lockup;
 	u8				init_done;
 
 	/* Parameters for bring-up */
+	u8				mmu_enable;
 	u8				cpu_enable;
 	u8				reset_pcilink;
 	u8				cpu_queues_enable;
@@ -712,6 +913,58 @@ struct hl_ioctl_desc {
  * Kernel module functions that can be accessed by entire module
  */
 
+/**
+ * hl_mem_area_inside_range() - Checks whether address+size are inside a range.
+ * @address: The start address of the area we want to validate.
+ * @size: The size in bytes of the area we want to validate.
+ * @range_start_address: The start address of the valid range.
+ * @range_end_address: The end address of the valid range.
+ *
+ * Return: true if the area is inside the valid range, false otherwise.
+ */
+static inline bool hl_mem_area_inside_range(u64 address, u32 size,
+				u64 range_start_address, u64 range_end_address)
+{
+	u64 end_address = address + size;
+
+	if ((address >= range_start_address) &&
+			(end_address <= range_end_address) &&
+			(end_address > address))
+		return true;
+
+	return false;
+}
+
+/**
+ * hl_mem_area_crosses_range() - Checks whether address+size crossing a range.
+ * @address: The start address of the area we want to validate.
+ * @size: The size in bytes of the area we want to validate.
+ * @range_start_address: The start address of the valid range.
+ * @range_end_address: The end address of the valid range.
+ *
+ * Return: true if the area overlaps part or all of the valid range,
+ *		false otherwise.
+ */
+static inline bool hl_mem_area_crosses_range(u64 address, u32 size,
+				u64 range_start_address, u64 range_end_address)
+{
+	u64 end_address = address + size;
+
+	if ((address >= range_start_address) &&
+			(address < range_end_address))
+		return true;
+
+	if ((end_address >= range_start_address) &&
+			(end_address < range_end_address))
+		return true;
+
+	if ((address < range_start_address) &&
+			(end_address >= range_end_address))
+		return true;
+
+	return false;
+}
+
 int hl_device_open(struct inode *inode, struct file *filp);
 bool hl_device_disabled_or_in_reset(struct hl_device *hdev);
 int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
@@ -725,8 +978,10 @@ int hl_hw_queues_create(struct hl_device *hdev);
 void hl_hw_queues_destroy(struct hl_device *hdev);
 int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
 				u32 cb_size, u64 cb_ptr);
+int hl_hw_queue_schedule_cs(struct hl_cs *cs);
 u32 hl_hw_queue_add_ptr(u32 ptr, u16 val);
 void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id);
+void hl_int_hw_queue_update_ci(struct hl_cs *cs);
 void hl_hw_queue_reset(struct hl_device *hdev, bool hard_reset);
 
 #define hl_queue_inc_ptr(p)		hl_hw_queue_add_ptr(p, 1)
@@ -740,6 +995,8 @@ void hl_cq_reset(struct hl_device *hdev, struct hl_cq *q);
 void hl_eq_reset(struct hl_device *hdev, struct hl_eq *q);
 irqreturn_t hl_irq_handler_cq(int irq, void *arg);
 irqreturn_t hl_irq_handler_eq(int irq, void *arg);
+u32 hl_cq_inc_ptr(u32 ptr);
+
 int hl_asid_init(struct hl_device *hdev);
 void hl_asid_fini(struct hl_device *hdev);
 unsigned long hl_asid_alloc(struct hl_device *hdev);
@@ -748,9 +1005,13 @@ void hl_asid_free(struct hl_device *hdev, unsigned long asid);
 int hl_ctx_create(struct hl_device *hdev, struct hl_fpriv *hpriv);
 void hl_ctx_free(struct hl_device *hdev, struct hl_ctx *ctx);
 int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx);
+void hl_ctx_do_release(struct kref *ref);
+void hl_ctx_get(struct hl_device *hdev,	struct hl_ctx *ctx);
 int hl_ctx_put(struct hl_ctx *ctx);
+struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq);
 void hl_ctx_mgr_init(struct hl_ctx_mgr *mgr);
 void hl_ctx_mgr_fini(struct hl_device *hdev, struct hl_ctx_mgr *mgr);
+
 int hl_device_init(struct hl_device *hdev, struct class *hclass);
 void hl_device_fini(struct hl_device *hdev);
 int hl_device_suspend(struct hl_device *hdev);
@@ -782,8 +1043,20 @@ struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size);
 int hl_cb_pool_init(struct hl_device *hdev);
 int hl_cb_pool_fini(struct hl_device *hdev);
 
+void hl_cs_rollback_all(struct hl_device *hdev);
+struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, bool ext_queue);
+
 void goya_set_asic_funcs(struct hl_device *hdev);
 
+int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u32 size,
+			struct hl_userptr *userptr);
+int hl_unpin_host_memory(struct hl_device *hdev, struct hl_userptr *userptr);
+void hl_userptr_delete_list(struct hl_device *hdev,
+				struct list_head *userptr_list);
+bool hl_userptr_is_pinned(struct hl_device *hdev, u64 addr, u32 size,
+				struct list_head *userptr_list,
+				struct hl_userptr **userptr);
+
 long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr);
 void hl_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq);
 long hl_get_temperature(struct hl_device *hdev, int sensor_index, u32 attr);
@@ -799,5 +1072,7 @@ void hl_set_max_power(struct hl_device *hdev, u64 value);
 /* IOCTLs */
 long hl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data);
+int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data);
+int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data);
 
 #endif /* HABANALABSP_H_ */
diff --git a/drivers/misc/habanalabs/habanalabs_drv.c b/drivers/misc/habanalabs/habanalabs_drv.c
index b0bf77af1e40..77a1cc85e530 100644
--- a/drivers/misc/habanalabs/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/habanalabs_drv.c
@@ -24,6 +24,17 @@ static struct class *hl_class;
 DEFINE_IDR(hl_devs_idr);
 DEFINE_MUTEX(hl_devs_idr_lock);
 
+static int timeout_locked = 5;
+static int reset_on_lockup = 1;
+
+module_param(timeout_locked, int, 0444);
+MODULE_PARM_DESC(timeout_locked,
+	"Device lockup timeout in seconds (0 = disabled, default 5s)");
+
+module_param(reset_on_lockup, int, 0444);
+MODULE_PARM_DESC(reset_on_lockup,
+	"Do device reset on lockup (0 = no, 1 = yes, default yes)");
+
 #define PCI_VENDOR_ID_HABANALABS	0x1da3
 
 #define PCI_IDS_GOYA			0x0001
@@ -113,6 +124,7 @@ int hl_device_open(struct inode *inode, struct file *filp)
 	hpriv->hdev = hdev;
 	filp->private_data = hpriv;
 	hpriv->filp = filp;
+	mutex_init(&hpriv->restore_phase_mutex);
 	kref_init(&hpriv->refcount);
 	nonseekable_open(inode, filp);
 
@@ -140,6 +152,7 @@ out_err:
 	filp->private_data = NULL;
 	hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
 	hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr);
+	mutex_destroy(&hpriv->restore_phase_mutex);
 	kfree(hpriv);
 
 close_device:
@@ -172,8 +185,10 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
 		return -ENOMEM;
 
 	hdev->major = hl_major;
+	hdev->reset_on_lockup = reset_on_lockup;
 
 	/* Parameters for bring-up - set them to defaults */
+	hdev->mmu_enable = 0;
 	hdev->cpu_enable = 1;
 	hdev->reset_pcilink = 0;
 	hdev->cpu_queues_enable = 1;
@@ -193,6 +208,11 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
 	if (!hdev->cpu_queues_enable)
 		hdev->heartbeat = 0;
 
+	if (timeout_locked)
+		hdev->timeout_jiffies = msecs_to_jiffies(timeout_locked * 1000);
+	else
+		hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
+
 	hdev->disabled = true;
 	hdev->pdev = pdev; /* can be NULL in case of simulator device */
 
diff --git a/drivers/misc/habanalabs/habanalabs_ioctl.c b/drivers/misc/habanalabs/habanalabs_ioctl.c
index e56a51f6bab6..481db1a5e97e 100644
--- a/drivers/misc/habanalabs/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/habanalabs_ioctl.c
@@ -16,7 +16,9 @@
 	[_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func}
 
 static const struct hl_ioctl_desc hl_ioctls[] = {
-	HL_IOCTL_DEF(HL_IOCTL_CB, hl_cb_ioctl)
+	HL_IOCTL_DEF(HL_IOCTL_CB, hl_cb_ioctl),
+	HL_IOCTL_DEF(HL_IOCTL_CS, hl_cs_ioctl),
+	HL_IOCTL_DEF(HL_IOCTL_WAIT_CS, hl_cs_wait_ioctl)
 };
 
 #define HL_CORE_IOCTL_COUNT	ARRAY_SIZE(hl_ioctls)
diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c
index 2ec43f36cdb8..68dfda59a875 100644
--- a/drivers/misc/habanalabs/hw_queue.c
+++ b/drivers/misc/habanalabs/hw_queue.c
@@ -34,6 +34,29 @@ static inline int queue_free_slots(struct hl_hw_queue *q, u32 queue_len)
 		return (abs(delta) - queue_len);
 }
 
+void hl_int_hw_queue_update_ci(struct hl_cs *cs)
+{
+	struct hl_device *hdev = cs->ctx->hdev;
+	struct hl_hw_queue *q;
+	int i;
+
+	hdev->asic_funcs->hw_queues_lock(hdev);
+
+	if (hdev->disabled)
+		goto out;
+
+	q = &hdev->kernel_queues[0];
+	for (i = 0 ; i < HL_MAX_QUEUES ; i++, q++) {
+		if (q->queue_type == QUEUE_TYPE_INT) {
+			q->ci += cs->jobs_in_queue_cnt[i];
+			q->ci &= ((q->int_queue_len << 1) - 1);
+		}
+	}
+
+out:
+	hdev->asic_funcs->hw_queues_unlock(hdev);
+}
+
 /*
  * ext_queue_submit_bd - Submit a buffer descriptor to an external queue
  *
@@ -119,6 +142,37 @@ static int ext_queue_sanity_checks(struct hl_device *hdev,
 	return 0;
 }
 
+/*
+ * int_queue_sanity_checks - perform some sanity checks on internal queue
+ *
+ * @hdev              : pointer to hl_device structure
+ * @q                 :	pointer to hl_hw_queue structure
+ * @num_of_entries    : how many entries to check for space
+ *
+ * H/W queues spinlock should be taken before calling this function
+ *
+ * Perform the following:
+ * - Make sure we have enough space in the h/w queue
+ *
+ */
+static int int_queue_sanity_checks(struct hl_device *hdev,
+					struct hl_hw_queue *q,
+					int num_of_entries)
+{
+	int free_slots_cnt;
+
+	/* Check we have enough space in the queue */
+	free_slots_cnt = queue_free_slots(q, q->int_queue_len);
+
+	if (free_slots_cnt < num_of_entries) {
+		dev_dbg(hdev->dev, "Queue %d doesn't have room for %d CBs\n",
+			q->hw_queue_id, num_of_entries);
+		return -EAGAIN;
+	}
+
+	return 0;
+}
+
 /*
  * hl_hw_queue_send_cb_no_cmpl - send a single CB (not a JOB) without completion
  *
@@ -165,6 +219,184 @@ out:
 	return rc;
 }
 
+/*
+ * ext_hw_queue_schedule_job - submit an JOB to an external queue
+ *
+ * @job: pointer to the job that needs to be submitted to the queue
+ *
+ * This function must be called when the scheduler mutex is taken
+ *
+ */
+static void ext_hw_queue_schedule_job(struct hl_cs_job *job)
+{
+	struct hl_device *hdev = job->cs->ctx->hdev;
+	struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id];
+	struct hl_cq_entry cq_pkt;
+	struct hl_cq *cq;
+	u64 cq_addr;
+	struct hl_cb *cb;
+	u32 ctl;
+	u32 len;
+	u64 ptr;
+
+	/*
+	 * Update the JOB ID inside the BD CTL so the device would know what
+	 * to write in the completion queue
+	 */
+	ctl = ((q->pi << BD_CTL_SHADOW_INDEX_SHIFT) & BD_CTL_SHADOW_INDEX_MASK);
+
+	cb = job->patched_cb;
+	len = job->job_cb_size;
+	ptr = cb->bus_address;
+
+	cq_pkt.data = (q->pi << CQ_ENTRY_SHADOW_INDEX_SHIFT)
+					& CQ_ENTRY_SHADOW_INDEX_MASK;
+	cq_pkt.data |= 1 << CQ_ENTRY_SHADOW_INDEX_VALID_SHIFT;
+	cq_pkt.data |= 1 << CQ_ENTRY_READY_SHIFT;
+
+	/*
+	 * No need to protect pi_offset because scheduling to the
+	 * H/W queues is done under the scheduler mutex
+	 *
+	 * No need to check if CQ is full because it was already
+	 * checked in hl_queue_sanity_checks
+	 */
+	cq = &hdev->completion_queue[q->hw_queue_id];
+	cq_addr = cq->bus_address +
+			hdev->asic_prop.host_phys_base_address;
+	cq_addr += cq->pi * sizeof(struct hl_cq_entry);
+
+	hdev->asic_funcs->add_end_of_cb_packets(cb->kernel_address, len,
+				cq_addr, cq_pkt.data, q->hw_queue_id);
+
+	q->shadow_queue[hl_pi_2_offset(q->pi)] = job;
+
+	cq->pi = hl_cq_inc_ptr(cq->pi);
+
+	ext_queue_submit_bd(hdev, q, ctl, len, ptr);
+}
+
+/*
+ * int_hw_queue_schedule_job - submit an JOB to an internal queue
+ *
+ * @job: pointer to the job that needs to be submitted to the queue
+ *
+ * This function must be called when the scheduler mutex is taken
+ *
+ */
+static void int_hw_queue_schedule_job(struct hl_cs_job *job)
+{
+	struct hl_device *hdev = job->cs->ctx->hdev;
+	struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id];
+	struct hl_bd bd;
+	u64 *pi, *pbd = (u64 *) &bd;
+
+	bd.ctl = 0;
+	bd.len = job->job_cb_size;
+	bd.ptr = (u64) (uintptr_t) job->user_cb;
+
+	pi = (u64 *) (uintptr_t) (q->kernel_address +
+		((q->pi & (q->int_queue_len - 1)) * sizeof(bd)));
+
+	pi[0] = pbd[0];
+	pi[1] = pbd[1];
+
+	q->pi++;
+	q->pi &= ((q->int_queue_len << 1) - 1);
+
+	/* Flush PQ entry write. Relevant only for specific ASICs */
+	hdev->asic_funcs->flush_pq_write(hdev, pi, pbd[0]);
+
+	hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi);
+}
+
+/*
+ * hl_hw_queue_schedule_cs - schedule a command submission
+ *
+ * @job        : pointer to the CS
+ *
+ */
+int hl_hw_queue_schedule_cs(struct hl_cs *cs)
+{
+	struct hl_device *hdev = cs->ctx->hdev;
+	struct hl_cs_job *job, *tmp;
+	struct hl_hw_queue *q;
+	int rc = 0, i, cq_cnt;
+
+	hdev->asic_funcs->hw_queues_lock(hdev);
+
+	if (hl_device_disabled_or_in_reset(hdev)) {
+		dev_err(hdev->dev,
+			"device is disabled or in reset, CS rejected!\n");
+		rc = -EPERM;
+		goto out;
+	}
+
+	q = &hdev->kernel_queues[0];
+	/* This loop assumes all external queues are consecutive */
+	for (i = 0, cq_cnt = 0 ; i < HL_MAX_QUEUES ; i++, q++) {
+		if (q->queue_type == QUEUE_TYPE_EXT) {
+			if (cs->jobs_in_queue_cnt[i]) {
+				rc = ext_queue_sanity_checks(hdev, q,
+					cs->jobs_in_queue_cnt[i], true);
+				if (rc)
+					goto unroll_cq_resv;
+				cq_cnt++;
+			}
+		} else if (q->queue_type == QUEUE_TYPE_INT) {
+			if (cs->jobs_in_queue_cnt[i]) {
+				rc = int_queue_sanity_checks(hdev, q,
+					cs->jobs_in_queue_cnt[i]);
+				if (rc)
+					goto unroll_cq_resv;
+			}
+		}
+	}
+
+	spin_lock(&hdev->hw_queues_mirror_lock);
+	list_add_tail(&cs->mirror_node, &hdev->hw_queues_mirror_list);
+
+	/* Queue TDR if the CS is the first entry and if timeout is wanted */
+	if ((hdev->timeout_jiffies != MAX_SCHEDULE_TIMEOUT) &&
+			(list_first_entry(&hdev->hw_queues_mirror_list,
+					struct hl_cs, mirror_node) == cs)) {
+		cs->tdr_active = true;
+		schedule_delayed_work(&cs->work_tdr, hdev->timeout_jiffies);
+		spin_unlock(&hdev->hw_queues_mirror_lock);
+	} else {
+		spin_unlock(&hdev->hw_queues_mirror_lock);
+	}
+
+	list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node) {
+		if (job->ext_queue)
+			ext_hw_queue_schedule_job(job);
+		else
+			int_hw_queue_schedule_job(job);
+	}
+
+	cs->submitted = true;
+
+	goto out;
+
+unroll_cq_resv:
+	/* This loop assumes all external queues are consecutive */
+	q = &hdev->kernel_queues[0];
+	for (i = 0 ; (i < HL_MAX_QUEUES) && (cq_cnt > 0) ; i++, q++) {
+		if ((q->queue_type == QUEUE_TYPE_EXT) &&
+				(cs->jobs_in_queue_cnt[i])) {
+			atomic_t *free_slots =
+				&hdev->completion_queue[i].free_slots_cnt;
+			atomic_add(cs->jobs_in_queue_cnt[i], free_slots);
+			cq_cnt--;
+		}
+	}
+
+out:
+	hdev->asic_funcs->hw_queues_unlock(hdev);
+
+	return rc;
+}
+
 /*
  * hl_hw_queue_inc_ci_kernel - increment ci for kernel's queue
  *
diff --git a/drivers/misc/habanalabs/memory.c b/drivers/misc/habanalabs/memory.c
new file mode 100644
index 000000000000..ad14376a1c25
--- /dev/null
+++ b/drivers/misc/habanalabs/memory.c
@@ -0,0 +1,198 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright 2016-2019 HabanaLabs, Ltd.
+ * All Rights Reserved.
+ */
+
+#include "habanalabs.h"
+
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+
+/*
+ * hl_pin_host_memory - pins a chunk of host memory
+ *
+ * @hdev                : pointer to the habanalabs device structure
+ * @addr                : the user-space virtual address of the memory area
+ * @size                : the size of the memory area
+ * @userptr	        : pointer to hl_userptr structure
+ *
+ * This function does the following:
+ * - Pins the physical pages
+ * - Create a SG list from those pages
+ */
+int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u32 size,
+			struct hl_userptr *userptr)
+{
+	u64 start, end;
+	u32 npages, offset;
+	int rc;
+
+	if (!size) {
+		dev_err(hdev->dev, "size to pin is invalid - %d\n",
+			size);
+		return -EINVAL;
+	}
+
+	if (!access_ok((void __user *) (uintptr_t) addr, size)) {
+		dev_err(hdev->dev, "user pointer is invalid - 0x%llx\n",
+			addr);
+		return -EFAULT;
+	}
+
+	/*
+	 * If the combination of the address and size requested for this memory
+	 * region causes an integer overflow, return error.
+	 */
+	if (((addr + size) < addr) ||
+			PAGE_ALIGN(addr + size) < (addr + size)) {
+		dev_err(hdev->dev,
+			"user pointer 0x%llx + %u causes integer overflow\n",
+			addr, size);
+		return -EINVAL;
+	}
+
+	start = addr & PAGE_MASK;
+	offset = addr & ~PAGE_MASK;
+	end = PAGE_ALIGN(addr + size);
+	npages = (end - start) >> PAGE_SHIFT;
+
+	userptr->size = size;
+	userptr->addr = addr;
+	userptr->dma_mapped = false;
+	INIT_LIST_HEAD(&userptr->job_node);
+
+	userptr->vec = frame_vector_create(npages);
+	if (!userptr->vec) {
+		dev_err(hdev->dev, "Failed to create frame vector\n");
+		return -ENOMEM;
+	}
+
+	rc = get_vaddr_frames(start, npages, FOLL_FORCE | FOLL_WRITE,
+				userptr->vec);
+
+	if (rc != npages) {
+		dev_err(hdev->dev,
+			"Failed to map host memory, user ptr probably wrong\n");
+		if (rc < 0)
+			goto destroy_framevec;
+		rc = -EFAULT;
+		goto put_framevec;
+	}
+
+	if (frame_vector_to_pages(userptr->vec) < 0) {
+		dev_err(hdev->dev,
+			"Failed to translate frame vector to pages\n");
+		rc = -EFAULT;
+		goto put_framevec;
+	}
+
+	userptr->sgt = kzalloc(sizeof(*userptr->sgt), GFP_ATOMIC);
+	if (!userptr->sgt) {
+		rc = -ENOMEM;
+		goto put_framevec;
+	}
+
+	rc = sg_alloc_table_from_pages(userptr->sgt,
+					frame_vector_pages(userptr->vec),
+					npages, offset, size, GFP_ATOMIC);
+	if (rc < 0) {
+		dev_err(hdev->dev, "failed to create SG table from pages\n");
+		goto free_sgt;
+	}
+
+	return 0;
+
+free_sgt:
+	kfree(userptr->sgt);
+put_framevec:
+	put_vaddr_frames(userptr->vec);
+destroy_framevec:
+	frame_vector_destroy(userptr->vec);
+	return rc;
+}
+
+/*
+ * hl_unpin_host_memory - unpins a chunk of host memory
+ *
+ * @hdev                : pointer to the habanalabs device structure
+ * @userptr             : pointer to hl_userptr structure
+ *
+ * This function does the following:
+ * - Unpins the physical pages related to the host memory
+ * - Free the SG list
+ */
+int hl_unpin_host_memory(struct hl_device *hdev, struct hl_userptr *userptr)
+{
+	struct page **pages;
+
+	if (userptr->dma_mapped)
+		hdev->asic_funcs->hl_dma_unmap_sg(hdev,
+				userptr->sgt->sgl,
+				userptr->sgt->nents,
+				userptr->dir);
+
+	pages = frame_vector_pages(userptr->vec);
+	if (!IS_ERR(pages)) {
+		int i;
+
+		for (i = 0; i < frame_vector_count(userptr->vec); i++)
+			set_page_dirty_lock(pages[i]);
+	}
+	put_vaddr_frames(userptr->vec);
+	frame_vector_destroy(userptr->vec);
+
+	list_del(&userptr->job_node);
+
+	sg_free_table(userptr->sgt);
+	kfree(userptr->sgt);
+
+	return 0;
+}
+
+/*
+ * hl_userptr_delete_list - clear userptr list
+ *
+ * @hdev                : pointer to the habanalabs device structure
+ * @userptr_list        : pointer to the list to clear
+ *
+ * This function does the following:
+ * - Iterates over the list and unpins the host memory and frees the userptr
+ *   structure.
+ */
+void hl_userptr_delete_list(struct hl_device *hdev,
+				struct list_head *userptr_list)
+{
+	struct hl_userptr *userptr, *tmp;
+
+	list_for_each_entry_safe(userptr, tmp, userptr_list, job_node) {
+		hl_unpin_host_memory(hdev, userptr);
+		kfree(userptr);
+	}
+
+	INIT_LIST_HEAD(userptr_list);
+}
+
+/*
+ * hl_userptr_is_pinned - returns whether the given userptr is pinned
+ *
+ * @hdev                : pointer to the habanalabs device structure
+ * @userptr_list        : pointer to the list to clear
+ * @userptr             : pointer to userptr to check
+ *
+ * This function does the following:
+ * - Iterates over the list and checks if the given userptr is in it, means is
+ *   pinned. If so, returns true, otherwise returns false.
+ */
+bool hl_userptr_is_pinned(struct hl_device *hdev, u64 addr,
+				u32 size, struct list_head *userptr_list,
+				struct hl_userptr **userptr)
+{
+	list_for_each_entry((*userptr), userptr_list, job_node) {
+		if ((addr == (*userptr)->addr) && (size == (*userptr)->size))
+			return true;
+	}
+
+	return false;
+}
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 756266cf0416..fba49417f607 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -73,6 +73,95 @@ union hl_cb_args {
 	struct hl_cb_out out;
 };
 
+/*
+ * This structure size must always be fixed to 64-bytes for backward
+ * compatibility
+ */
+struct hl_cs_chunk {
+	/*
+	 * For external queue, this represents a Handle of CB on the Host
+	 * For internal queue, this represents an SRAM or DRAM address of the
+	 * internal CB
+	 */
+	__u64 cb_handle;
+	/* Index of queue to put the CB on */
+	__u32 queue_index;
+	/*
+	 * Size of command buffer with valid packets
+	 * Can be smaller then actual CB size
+	 */
+	__u32 cb_size;
+	/* HL_CS_CHUNK_FLAGS_* */
+	__u32 cs_chunk_flags;
+	/* Align structure to 64 bytes */
+	__u32 pad[11];
+};
+
+#define HL_CS_FLAGS_FORCE_RESTORE	0x1
+
+#define HL_CS_STATUS_SUCCESS		0
+
+struct hl_cs_in {
+	/* this holds address of array of hl_cs_chunk for restore phase */
+	__u64 chunks_restore;
+	/* this holds address of array of hl_cs_chunk for execution phase */
+	__u64 chunks_execute;
+	/* this holds address of array of hl_cs_chunk for store phase -
+	 * Currently not in use
+	 */
+	__u64 chunks_store;
+	/* Number of chunks in restore phase array */
+	__u32 num_chunks_restore;
+	/* Number of chunks in execution array */
+	__u32 num_chunks_execute;
+	/* Number of chunks in restore phase array - Currently not in use */
+	__u32 num_chunks_store;
+	/* HL_CS_FLAGS_* */
+	__u32 cs_flags;
+	/* Context ID - Currently not in use */
+	__u32 ctx_id;
+};
+
+struct hl_cs_out {
+	/* this holds the sequence number of the CS to pass to wait ioctl */
+	__u64 seq;
+	/* HL_CS_STATUS_* */
+	__u32 status;
+	__u32 pad;
+};
+
+union hl_cs_args {
+	struct hl_cs_in in;
+	struct hl_cs_out out;
+};
+
+struct hl_wait_cs_in {
+	/* Command submission sequence number */
+	__u64 seq;
+	/* Absolute timeout to wait in microseconds */
+	__u64 timeout_us;
+	/* Context ID - Currently not in use */
+	__u32 ctx_id;
+	__u32 pad;
+};
+
+#define HL_WAIT_CS_STATUS_COMPLETED	0
+#define HL_WAIT_CS_STATUS_BUSY		1
+#define HL_WAIT_CS_STATUS_TIMEDOUT	2
+#define HL_WAIT_CS_STATUS_ABORTED	3
+#define HL_WAIT_CS_STATUS_INTERRUPTED	4
+
+struct hl_wait_cs_out {
+	/* HL_WAIT_CS_STATUS_* */
+	__u32 status;
+	__u32 pad;
+};
+
+union hl_wait_cs_args {
+	struct hl_wait_cs_in in;
+	struct hl_wait_cs_out out;
+};
+
 /*
  * Command Buffer
  * - Request a Command Buffer
@@ -89,7 +178,74 @@ union hl_cb_args {
 #define HL_IOCTL_CB		\
 		_IOWR('H', 0x02, union hl_cb_args)
 
+/*
+ * Command Submission
+ *
+ * To submit work to the device, the user need to call this IOCTL with a set
+ * of JOBS. That set of JOBS constitutes a CS object.
+ * Each JOB will be enqueued on a specific queue, according to the user's input.
+ * There can be more then one JOB per queue.
+ *
+ * There are two types of queues - external and internal. External queues
+ * are DMA queues which transfer data from/to the Host. All other queues are
+ * internal. The driver will get completion notifications from the device only
+ * on JOBS which are enqueued in the external queues.
+ *
+ * This IOCTL is asynchronous in regard to the actual execution of the CS. This
+ * means it returns immediately after ALL the JOBS were enqueued on their
+ * relevant queues. Therefore, the user mustn't assume the CS has been completed
+ * or has even started to execute.
+ *
+ * Upon successful enqueue, the IOCTL returns an opaque handle which the user
+ * can use with the "Wait for CS" IOCTL to check whether the handle's CS
+ * external JOBS have been completed. Note that if the CS has internal JOBS
+ * which can execute AFTER the external JOBS have finished, the driver might
+ * report that the CS has finished executing BEFORE the internal JOBS have
+ * actually finish executing.
+ *
+ * The CS IOCTL will receive three sets of JOBS. One set is for "restore" phase,
+ * a second set is for "execution" phase and a third set is for "store" phase.
+ * The JOBS on the "restore" phase are enqueued only after context-switch
+ * (or if its the first CS for this context). The user can also order the
+ * driver to run the "restore" phase explicitly
+ *
+ */
+#define HL_IOCTL_CS			\
+		_IOWR('H', 0x03, union hl_cs_args)
+
+/*
+ * Wait for Command Submission
+ *
+ * The user can call this IOCTL with a handle it received from the CS IOCTL
+ * to wait until the handle's CS has finished executing. The user will wait
+ * inside the kernel until the CS has finished or until the user-requeusted
+ * timeout has expired.
+ *
+ * The return value of the IOCTL is a standard Linux error code. The possible
+ * values are:
+ *
+ * EINTR     - Kernel waiting has been interrupted, e.g. due to OS signal
+ *             that the user process received
+ * ETIMEDOUT - The CS has caused a timeout on the device
+ * EIO       - The CS was aborted (usually because the device was reset)
+ * ENODEV    - The device wants to do hard-reset (so user need to close FD)
+ *
+ * The driver also returns a custom define inside the IOCTL which can be:
+ *
+ * HL_WAIT_CS_STATUS_COMPLETED   - The CS has been completed successfully (0)
+ * HL_WAIT_CS_STATUS_BUSY        - The CS is still executing (0)
+ * HL_WAIT_CS_STATUS_TIMEDOUT    - The CS has caused a timeout on the device
+ *                                 (ETIMEDOUT)
+ * HL_WAIT_CS_STATUS_ABORTED     - The CS was aborted, usually because the
+ *                                 device was reset (EIO)
+ * HL_WAIT_CS_STATUS_INTERRUPTED - Waiting for the CS was interrupted (EINTR)
+ *
+ */
+
+#define HL_IOCTL_WAIT_CS			\
+		_IOWR('H', 0x04, union hl_wait_cs_args)
+
 #define HL_COMMAND_START	0x02
-#define HL_COMMAND_END		0x03
+#define HL_COMMAND_END		0x05
 
 #endif /* HABANALABS_H_ */
-- 
cgit v1.2.3-59-g8ed1b


From 0feaf86d4e69507ab9b2af7dcc63a6886352d5db Mon Sep 17 00:00:00 2001
From: Omer Shpigelman <oshpigelman@habana.ai>
Date: Sat, 16 Feb 2019 00:39:22 +0200
Subject: habanalabs: add virtual memory and MMU modules

This patch adds the Virtual Memory and MMU modules.

Goya has an internal MMU which provides process isolation on the internal
DDR. The internal MMU also performs translations for transactions that go
from Goya to the Host.

The driver is responsible for allocating and freeing memory on the DDR
upon user request. It also provides an interface to map and unmap DDR and
Host memory to the device address space.

The MMU in Goya supports 3-level and 4-level page tables. With 3-level, the
size of each page is 2MB, while with 4-level the size of each page is 4KB.

In the DDR, the physical pages are always 2MB.

Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
Signed-off-by: Omer Shpigelman <oshpigelman@habana.ai>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/habanalabs/Makefile                   |    2 +-
 drivers/misc/habanalabs/context.c                  |   19 +-
 drivers/misc/habanalabs/device.c                   |   20 +-
 drivers/misc/habanalabs/goya/goya.c                |  395 +++++
 drivers/misc/habanalabs/habanalabs.h               |  189 ++-
 drivers/misc/habanalabs/habanalabs_drv.c           |    2 +-
 drivers/misc/habanalabs/habanalabs_ioctl.c         |    3 +-
 .../habanalabs/include/hw_ip/mmu/mmu_general.h     |   46 +
 .../misc/habanalabs/include/hw_ip/mmu/mmu_v1_0.h   |   15 +
 drivers/misc/habanalabs/memory.c                   | 1517 ++++++++++++++++++++
 drivers/misc/habanalabs/mmu.c                      |  691 +++++++++
 include/uapi/misc/habanalabs.h                     |  122 +-
 12 files changed, 3013 insertions(+), 8 deletions(-)
 create mode 100644 drivers/misc/habanalabs/include/hw_ip/mmu/mmu_general.h
 create mode 100644 drivers/misc/habanalabs/include/hw_ip/mmu/mmu_v1_0.h
 create mode 100644 drivers/misc/habanalabs/mmu.c

(limited to 'include/uapi')

diff --git a/drivers/misc/habanalabs/Makefile b/drivers/misc/habanalabs/Makefile
index d2fd0e18b1eb..fd46f8b48bab 100644
--- a/drivers/misc/habanalabs/Makefile
+++ b/drivers/misc/habanalabs/Makefile
@@ -6,7 +6,7 @@ obj-m	:= habanalabs.o
 
 habanalabs-y := habanalabs_drv.o device.o context.o asid.o habanalabs_ioctl.o \
 		command_buffer.o hw_queue.o irq.o sysfs.o hwmon.o memory.o \
-		command_submission.o
+		command_submission.o mmu.o
 
 include $(src)/goya/Makefile
 habanalabs-y += $(HL_GOYA_FILES)
diff --git a/drivers/misc/habanalabs/context.c b/drivers/misc/habanalabs/context.c
index c3854714b46c..619ace1c4ef7 100644
--- a/drivers/misc/habanalabs/context.c
+++ b/drivers/misc/habanalabs/context.c
@@ -25,8 +25,10 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
 	for (i = 0 ; i < HL_MAX_PENDING_CS ; i++)
 		dma_fence_put(ctx->cs_pending[i]);
 
-	if (ctx->asid != HL_KERNEL_ASID_ID)
+	if (ctx->asid != HL_KERNEL_ASID_ID) {
+		hl_vm_ctx_fini(ctx);
 		hl_asid_free(hdev, ctx->asid);
+	}
 }
 
 void hl_ctx_do_release(struct kref *ref)
@@ -96,6 +98,8 @@ void hl_ctx_free(struct hl_device *hdev, struct hl_ctx *ctx)
 
 int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
 {
+	int rc = 0;
+
 	ctx->hdev = hdev;
 
 	kref_init(&ctx->refcount);
@@ -113,9 +117,22 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
 			dev_err(hdev->dev, "No free ASID, failed to create context\n");
 			return -ENOMEM;
 		}
+
+		rc = hl_vm_ctx_init(ctx);
+		if (rc) {
+			dev_err(hdev->dev, "Failed to init mem ctx module\n");
+			rc = -ENOMEM;
+			goto mem_ctx_err;
+		}
 	}
 
 	return 0;
+
+mem_ctx_err:
+	if (ctx->asid != HL_KERNEL_ASID_ID)
+		hl_asid_free(hdev, ctx->asid);
+
+	return rc;
 }
 
 void hl_ctx_get(struct hl_device *hdev, struct hl_ctx *ctx)
diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c
index cc5f068df597..d0929022655b 100644
--- a/drivers/misc/habanalabs/device.c
+++ b/drivers/misc/habanalabs/device.c
@@ -615,8 +615,10 @@ again:
 	/* Reset the H/W. It will be in idle state after this returns */
 	hdev->asic_funcs->hw_fini(hdev, hard_reset);
 
-	if (hard_reset)
+	if (hard_reset) {
+		hl_vm_fini(hdev);
 		hl_eq_reset(hdev, &hdev->event_queue);
+	}
 
 	/* Re-initialize PI,CI to 0 in all queues (hw queue, cq) */
 	hl_hw_queue_reset(hdev, hard_reset);
@@ -677,6 +679,13 @@ again:
 			goto out_err;
 		}
 
+		rc = hl_vm_init(hdev);
+		if (rc) {
+			dev_err(hdev->dev,
+				"Failed to init memory module after hard reset\n");
+			goto out_err;
+		}
+
 		hl_set_max_power(hdev, hdev->max_power);
 
 		hdev->hard_reset_pending = false;
@@ -861,6 +870,13 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
 		hdev->asic_name,
 		hdev->asic_prop.dram_size / 1024 / 1024 / 1024);
 
+	rc = hl_vm_init(hdev);
+	if (rc) {
+		dev_err(hdev->dev, "Failed to initialize memory module\n");
+		rc = 0;
+		goto out_disabled;
+	}
+
 	/*
 	 * hl_hwmon_init must be called after device_late_init, because only
 	 * there we get the information from the device about which
@@ -977,6 +993,8 @@ void hl_device_fini(struct hl_device *hdev)
 	/* Reset the H/W. It will be in idle state after this returns */
 	hdev->asic_funcs->hw_fini(hdev, true);
 
+	hl_vm_fini(hdev);
+
 	hl_eq_fini(hdev, &hdev->event_queue);
 
 	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index e3878fd7dc94..89b82b989966 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -6,6 +6,8 @@
  */
 
 #include "goyaP.h"
+#include "include/hw_ip/mmu/mmu_general.h"
+#include "include/hw_ip/mmu/mmu_v1_0.h"
 #include "include/goya/asic_reg/goya_masks.h"
 
 #include <linux/pci.h>
@@ -80,6 +82,7 @@
 #define GOYA_PLDM_RESET_WAIT_MSEC	1000		/* 1s */
 #define GOYA_CPU_TIMEOUT_USEC		10000000	/* 10s */
 #define GOYA_TEST_QUEUE_WAIT_USEC	100000		/* 100ms */
+#define GOYA_PLDM_MMU_TIMEOUT_USEC	(MMU_CONFIG_TIMEOUT_USEC * 100)
 
 #define GOYA_QMAN0_FENCE_VAL		0xD169B243
 
@@ -131,6 +134,70 @@ static const char *goya_axi_name[GOYA_MAX_INITIATORS] = {
 	"MMU"
 };
 
+static u64 goya_mmu_regs[GOYA_MMU_REGS_NUM] = {
+	mmDMA_QM_0_GLBL_NON_SECURE_PROPS,
+	mmDMA_QM_1_GLBL_NON_SECURE_PROPS,
+	mmDMA_QM_2_GLBL_NON_SECURE_PROPS,
+	mmDMA_QM_3_GLBL_NON_SECURE_PROPS,
+	mmDMA_QM_4_GLBL_NON_SECURE_PROPS,
+	mmTPC0_QM_GLBL_SECURE_PROPS,
+	mmTPC0_QM_GLBL_NON_SECURE_PROPS,
+	mmTPC0_CMDQ_GLBL_SECURE_PROPS,
+	mmTPC0_CMDQ_GLBL_NON_SECURE_PROPS,
+	mmTPC0_CFG_ARUSER,
+	mmTPC0_CFG_AWUSER,
+	mmTPC1_QM_GLBL_SECURE_PROPS,
+	mmTPC1_QM_GLBL_NON_SECURE_PROPS,
+	mmTPC1_CMDQ_GLBL_SECURE_PROPS,
+	mmTPC1_CMDQ_GLBL_NON_SECURE_PROPS,
+	mmTPC1_CFG_ARUSER,
+	mmTPC1_CFG_AWUSER,
+	mmTPC2_QM_GLBL_SECURE_PROPS,
+	mmTPC2_QM_GLBL_NON_SECURE_PROPS,
+	mmTPC2_CMDQ_GLBL_SECURE_PROPS,
+	mmTPC2_CMDQ_GLBL_NON_SECURE_PROPS,
+	mmTPC2_CFG_ARUSER,
+	mmTPC2_CFG_AWUSER,
+	mmTPC3_QM_GLBL_SECURE_PROPS,
+	mmTPC3_QM_GLBL_NON_SECURE_PROPS,
+	mmTPC3_CMDQ_GLBL_SECURE_PROPS,
+	mmTPC3_CMDQ_GLBL_NON_SECURE_PROPS,
+	mmTPC3_CFG_ARUSER,
+	mmTPC3_CFG_AWUSER,
+	mmTPC4_QM_GLBL_SECURE_PROPS,
+	mmTPC4_QM_GLBL_NON_SECURE_PROPS,
+	mmTPC4_CMDQ_GLBL_SECURE_PROPS,
+	mmTPC4_CMDQ_GLBL_NON_SECURE_PROPS,
+	mmTPC4_CFG_ARUSER,
+	mmTPC4_CFG_AWUSER,
+	mmTPC5_QM_GLBL_SECURE_PROPS,
+	mmTPC5_QM_GLBL_NON_SECURE_PROPS,
+	mmTPC5_CMDQ_GLBL_SECURE_PROPS,
+	mmTPC5_CMDQ_GLBL_NON_SECURE_PROPS,
+	mmTPC5_CFG_ARUSER,
+	mmTPC5_CFG_AWUSER,
+	mmTPC6_QM_GLBL_SECURE_PROPS,
+	mmTPC6_QM_GLBL_NON_SECURE_PROPS,
+	mmTPC6_CMDQ_GLBL_SECURE_PROPS,
+	mmTPC6_CMDQ_GLBL_NON_SECURE_PROPS,
+	mmTPC6_CFG_ARUSER,
+	mmTPC6_CFG_AWUSER,
+	mmTPC7_QM_GLBL_SECURE_PROPS,
+	mmTPC7_QM_GLBL_NON_SECURE_PROPS,
+	mmTPC7_CMDQ_GLBL_SECURE_PROPS,
+	mmTPC7_CMDQ_GLBL_NON_SECURE_PROPS,
+	mmTPC7_CFG_ARUSER,
+	mmTPC7_CFG_AWUSER,
+	mmMME_QM_GLBL_SECURE_PROPS,
+	mmMME_QM_GLBL_NON_SECURE_PROPS,
+	mmMME_CMDQ_GLBL_SECURE_PROPS,
+	mmMME_CMDQ_GLBL_NON_SECURE_PROPS,
+	mmMME_SBA_CONTROL_DATA,
+	mmMME_SBB_CONTROL_DATA,
+	mmMME_SBC_CONTROL_DATA,
+	mmMME_WBC_CONTROL_DATA
+};
+
 #define GOYA_ASYC_EVENT_GROUP_NON_FATAL_SIZE 121
 
 static u32 goya_non_fatal_events[GOYA_ASYC_EVENT_GROUP_NON_FATAL_SIZE] = {
@@ -258,6 +325,10 @@ static u32 goya_non_fatal_events[GOYA_ASYC_EVENT_GROUP_NON_FATAL_SIZE] = {
 };
 
 static int goya_armcp_info_get(struct hl_device *hdev);
+static void goya_mmu_prepare(struct hl_device *hdev, u32 asid);
+static int goya_mmu_clear_pgt_range(struct hl_device *hdev);
+static int goya_mmu_update_asid_hop0_addr(struct hl_device *hdev, u32 asid,
+					u64 phys_addr);
 
 static void goya_get_fixed_properties(struct hl_device *hdev)
 {
@@ -296,6 +367,16 @@ static void goya_get_fixed_properties(struct hl_device *hdev)
 	prop->sram_user_base_address = prop->sram_base_address +
 						SRAM_USER_BASE_OFFSET;
 
+	prop->mmu_pgt_addr = MMU_PAGE_TABLES_ADDR;
+	if (hdev->pldm)
+		prop->mmu_pgt_size = 0x800000; /* 8MB */
+	else
+		prop->mmu_pgt_size = MMU_PAGE_TABLES_SIZE;
+	prop->mmu_pte_size = HL_PTE_SIZE;
+	prop->mmu_hop_table_size = HOP_TABLE_SIZE;
+	prop->mmu_hop0_tables_total_size = HOP0_TABLES_TOTAL_SIZE;
+	prop->dram_page_size = PAGE_SIZE_2MB;
+
 	prop->host_phys_base_address = HOST_PHYS_BASE;
 	prop->va_space_host_start_address = VA_HOST_SPACE_START;
 	prop->va_space_host_end_address = VA_HOST_SPACE_END;
@@ -752,7 +833,18 @@ static int goya_late_init(struct hl_device *hdev)
 
 	goya_fetch_psoc_frequency(hdev);
 
+	rc = goya_mmu_clear_pgt_range(hdev);
+	if (rc) {
+		dev_err(hdev->dev, "Failed to clear MMU page tables range\n");
+		goto disable_pci_access;
+	}
+
 	return 0;
+
+disable_pci_access:
+	goya_send_pci_access_msg(hdev, ARMCP_PACKET_DISABLE_PCI_ACCESS);
+
+	return rc;
 }
 
 /*
@@ -2565,6 +2657,54 @@ out:
 	return 0;
 }
 
+static int goya_mmu_init(struct hl_device *hdev)
+{
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	struct goya_device *goya = hdev->asic_specific;
+	u64 hop0_addr;
+	int rc, i;
+
+	if (!hdev->mmu_enable)
+		return 0;
+
+	if (goya->hw_cap_initialized & HW_CAP_MMU)
+		return 0;
+
+	hdev->dram_supports_virtual_memory = true;
+
+	for (i = 0 ; i < prop->max_asid ; i++) {
+		hop0_addr = prop->mmu_pgt_addr +
+				(i * prop->mmu_hop_table_size);
+
+		rc = goya_mmu_update_asid_hop0_addr(hdev, i, hop0_addr);
+		if (rc) {
+			dev_err(hdev->dev,
+				"failed to set hop0 addr for asid %d\n", i);
+			goto err;
+		}
+	}
+
+	goya->hw_cap_initialized |= HW_CAP_MMU;
+
+	/* init MMU cache manage page */
+	WREG32(mmSTLB_CACHE_INV_BASE_39_8, MMU_CACHE_MNG_ADDR >> 8);
+	WREG32(mmSTLB_CACHE_INV_BASE_49_40, MMU_CACHE_MNG_ADDR << 40);
+
+	/* Remove follower feature due to performance bug */
+	WREG32_AND(mmSTLB_STLB_FEATURE_EN,
+			(~STLB_STLB_FEATURE_EN_FOLLOWER_EN_MASK));
+
+	hdev->asic_funcs->mmu_invalidate_cache(hdev, true);
+
+	WREG32(mmMMU_MMU_ENABLE, 1);
+	WREG32(mmMMU_SPI_MASK, 0xF);
+
+	return 0;
+
+err:
+	return rc;
+}
+
 /*
  * goya_hw_init - Goya hardware initialization code
  *
@@ -2614,6 +2754,10 @@ static int goya_hw_init(struct hl_device *hdev)
 		return rc;
 	}
 
+	rc = goya_mmu_init(hdev);
+	if (rc)
+		return rc;
+
 	goya_init_security(hdev);
 
 	goya_init_dma_qmans(hdev);
@@ -4249,6 +4393,10 @@ int goya_context_switch(struct hl_device *hdev, u32 asid)
 
 	rc = goya_send_job_on_qman0(hdev, job);
 
+	/* no point in setting the asid in case of failure */
+	if (!rc)
+		goya_mmu_prepare(hdev, asid);
+
 	job->patched_cb->cs_cnt--;
 	hl_cb_put(job->patched_cb);
 
@@ -4284,6 +4432,22 @@ void goya_restore_phase_topology(struct hl_device *hdev)
 	i = RREG32(mmSYNC_MNGR_SOB_OBJ_0);
 }
 
+static u64 goya_read_pte(struct hl_device *hdev, u64 addr)
+{
+	struct goya_device *goya = hdev->asic_specific;
+
+	return readq(hdev->pcie_bar[DDR_BAR_ID] +
+			(addr - goya->ddr_bar_cur_addr));
+}
+
+static void goya_write_pte(struct hl_device *hdev, u64 addr, u64 val)
+{
+	struct goya_device *goya = hdev->asic_specific;
+
+	writeq(val, hdev->pcie_bar[DDR_BAR_ID] +
+			(addr - goya->ddr_bar_cur_addr));
+}
+
 static void goya_get_axi_name(struct hl_device *hdev, u32 agent_id,
 		u16 event_type, char *axi_name, int len)
 {
@@ -4567,6 +4731,233 @@ void *goya_get_events_stat(struct hl_device *hdev, u32 *size)
 	return goya->events_stat;
 }
 
+static int goya_mmu_clear_pgt_range(struct hl_device *hdev)
+{
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	struct goya_device *goya = hdev->asic_specific;
+	struct packet_lin_dma *clear_pgt_range_pkt;
+	struct hl_cs_parser parser;
+	struct hl_cs_job *job;
+	u32 cb_size;
+	struct hl_cb *cb;
+	int rc;
+
+	if (!(goya->hw_cap_initialized & HW_CAP_MMU))
+		return 0;
+
+	cb = hl_cb_kernel_create(hdev, PAGE_SIZE);
+	if (!cb)
+		return -EFAULT;
+
+	clear_pgt_range_pkt = (struct packet_lin_dma *)
+					(uintptr_t) cb->kernel_address;
+
+	memset(clear_pgt_range_pkt, 0, sizeof(*clear_pgt_range_pkt));
+	cb_size = sizeof(*clear_pgt_range_pkt);
+
+	clear_pgt_range_pkt->ctl =
+		((PACKET_LIN_DMA << GOYA_PKT_CTL_OPCODE_SHIFT) |
+		(DMA_HOST_TO_DRAM << GOYA_PKT_LIN_DMA_CTL_DMA_DIR_SHIFT) |
+		(1 << GOYA_PKT_LIN_DMA_CTL_MEMSET_SHIFT) |
+		(1 << GOYA_PKT_LIN_DMA_CTL_WO_SHIFT) |
+		(1 << GOYA_PKT_CTL_RB_SHIFT) |
+		(1 << GOYA_PKT_CTL_MB_SHIFT));
+
+	clear_pgt_range_pkt->src_addr = 0;
+	clear_pgt_range_pkt->dst_addr = prop->mmu_pgt_addr;
+	clear_pgt_range_pkt->tsize = prop->mmu_pgt_size + MMU_CACHE_MNG_SIZE;
+
+	job = hl_cs_allocate_job(hdev, true);
+	if (!job) {
+		dev_err(hdev->dev, "Failed to allocate a new job\n");
+		rc = -ENOMEM;
+		goto release_cb;
+	}
+
+	job->id = 0;
+	job->user_cb = cb;
+	job->user_cb->cs_cnt++;
+	job->user_cb_size = cb_size;
+	job->hw_queue_id = GOYA_QUEUE_ID_DMA_0;
+
+	parser.ctx_id = HL_KERNEL_ASID_ID;
+	parser.cs_sequence = 0;
+	parser.job_id = job->id;
+	parser.hw_queue_id = job->hw_queue_id;
+	parser.job_userptr_list = &job->userptr_list;
+	parser.user_cb = job->user_cb;
+	parser.user_cb_size = job->user_cb_size;
+	parser.ext_queue = job->ext_queue;
+	parser.use_virt_addr = hdev->mmu_enable;
+
+	rc = hdev->asic_funcs->cs_parser(hdev, &parser);
+	if (rc) {
+		dev_err(hdev->dev,
+			"Failed to parse kernel CB when clearing pgt\n");
+		goto free_job;
+	}
+
+	job->patched_cb = parser.patched_cb;
+	job->job_cb_size = parser.patched_cb_size;
+	job->patched_cb->cs_cnt++;
+
+	rc = goya_send_job_on_qman0(hdev, job);
+
+	job->patched_cb->cs_cnt--;
+	hl_cb_put(job->patched_cb);
+
+free_job:
+	hl_userptr_delete_list(hdev, &job->userptr_list);
+	kfree(job);
+	cb->cs_cnt--;
+
+release_cb:
+	hl_cb_put(cb);
+	hl_cb_destroy(hdev, &hdev->kernel_cb_mgr, cb->id << PAGE_SHIFT);
+
+	return rc;
+}
+
+static void goya_mmu_prepare(struct hl_device *hdev, u32 asid)
+{
+	struct goya_device *goya = hdev->asic_specific;
+	int i;
+
+	if (!(goya->hw_cap_initialized & HW_CAP_MMU))
+		return;
+
+	if (asid & ~MME_QM_GLBL_SECURE_PROPS_ASID_MASK) {
+		WARN(1, "asid %u is too big\n", asid);
+		return;
+	}
+
+	/* zero the MMBP and ASID bits and then set the ASID */
+	for (i = 0 ; i < GOYA_MMU_REGS_NUM ; i++) {
+		WREG32_AND(goya_mmu_regs[i], ~0x7FF);
+		WREG32_OR(goya_mmu_regs[i], asid);
+	}
+}
+
+static void goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard)
+{
+	struct goya_device *goya = hdev->asic_specific;
+	u32 status, timeout_usec;
+	int rc;
+
+	if (!(goya->hw_cap_initialized & HW_CAP_MMU))
+		return;
+
+	/* no need in L1 only invalidation in Goya */
+	if (!is_hard)
+		return;
+
+	if (hdev->pldm)
+		timeout_usec = GOYA_PLDM_MMU_TIMEOUT_USEC;
+	else
+		timeout_usec = MMU_CONFIG_TIMEOUT_USEC;
+
+	mutex_lock(&hdev->mmu_cache_lock);
+
+	/* L0 & L1 invalidation */
+	WREG32(mmSTLB_INV_ALL_START, 1);
+
+	rc = hl_poll_timeout(
+		hdev,
+		mmSTLB_INV_ALL_START,
+		status,
+		!status,
+		1000,
+		timeout_usec);
+
+	mutex_unlock(&hdev->mmu_cache_lock);
+
+	if (rc)
+		dev_notice_ratelimited(hdev->dev,
+			"Timeout when waiting for MMU cache invalidation\n");
+}
+
+static void goya_mmu_invalidate_cache_range(struct hl_device *hdev,
+		bool is_hard, u32 asid, u64 va, u64 size)
+{
+	struct goya_device *goya = hdev->asic_specific;
+	u32 status, timeout_usec, inv_data, pi;
+	int rc;
+
+	if (!(goya->hw_cap_initialized & HW_CAP_MMU))
+		return;
+
+	/* no need in L1 only invalidation in Goya */
+	if (!is_hard)
+		return;
+
+	if (hdev->pldm)
+		timeout_usec = GOYA_PLDM_MMU_TIMEOUT_USEC;
+	else
+		timeout_usec = MMU_CONFIG_TIMEOUT_USEC;
+
+	mutex_lock(&hdev->mmu_cache_lock);
+
+	/*
+	 * TODO: currently invalidate entire L0 & L1 as in regular hard
+	 * invalidation. Need to apply invalidation of specific cache lines with
+	 * mask of ASID & VA & size.
+	 * Note that L1 with be flushed entirely in any case.
+	 */
+
+	/* L0 & L1 invalidation */
+	inv_data = RREG32(mmSTLB_CACHE_INV);
+	/* PI is 8 bit */
+	pi = ((inv_data & STLB_CACHE_INV_PRODUCER_INDEX_MASK) + 1) & 0xFF;
+	WREG32(mmSTLB_CACHE_INV,
+			(inv_data & STLB_CACHE_INV_INDEX_MASK_MASK) | pi);
+
+	rc = hl_poll_timeout(
+		hdev,
+		mmSTLB_INV_CONSUMER_INDEX,
+		status,
+		status == pi,
+		1000,
+		timeout_usec);
+
+	mutex_unlock(&hdev->mmu_cache_lock);
+
+	if (rc)
+		dev_notice_ratelimited(hdev->dev,
+			"Timeout when waiting for MMU cache invalidation\n");
+}
+
+static int goya_mmu_update_asid_hop0_addr(struct hl_device *hdev, u32 asid,
+						u64 phys_addr)
+{
+	u32 status, timeout_usec;
+	int rc;
+
+	if (hdev->pldm)
+		timeout_usec = GOYA_PLDM_MMU_TIMEOUT_USEC;
+	else
+		timeout_usec = MMU_CONFIG_TIMEOUT_USEC;
+
+	WREG32(MMU_HOP0_PA43_12, phys_addr >> MMU_HOP0_PA43_12_SHIFT);
+	WREG32(MMU_HOP0_PA49_44, phys_addr >> MMU_HOP0_PA49_44_SHIFT);
+	WREG32(MMU_ASID_BUSY, 0x80000000 | asid);
+
+	rc = hl_poll_timeout(
+		hdev,
+		MMU_ASID_BUSY,
+		status,
+		!(status & 0x80000000),
+		1000,
+		timeout_usec);
+
+	if (rc) {
+		dev_err(hdev->dev,
+			"Timeout during MMU hop0 config of asid %d\n", asid);
+		return rc;
+	}
+
+	return 0;
+}
+
 int goya_send_heartbeat(struct hl_device *hdev)
 {
 	struct goya_device *goya = hdev->asic_specific;
@@ -4830,6 +5221,10 @@ static const struct hl_asic_funcs goya_funcs = {
 	.handle_eqe = goya_handle_eqe,
 	.set_pll_profile = goya_set_pll_profile,
 	.get_events_stat = goya_get_events_stat,
+	.read_pte = goya_read_pte,
+	.write_pte = goya_write_pte,
+	.mmu_invalidate_cache = goya_mmu_invalidate_cache,
+	.mmu_invalidate_cache_range = goya_mmu_invalidate_cache_range,
 	.send_heartbeat = goya_send_heartbeat,
 	.enable_clock_gating = goya_init_clock_gating,
 	.disable_clock_gating = goya_disable_clock_gating,
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 9adc7c6ec08b..03085e7a12dd 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -19,6 +19,7 @@
 #include <linux/dma-fence.h>
 #include <linux/dma-direction.h>
 #include <linux/scatterlist.h>
+#include <linux/hashtable.h>
 
 #define HL_NAME				"habanalabs"
 
@@ -39,6 +40,31 @@
 /* MUST BE POWER OF 2 and larger than 1 */
 #define HL_MAX_PENDING_CS		64
 
+/* Memory */
+#define MEM_HASH_TABLE_BITS		7 /* 1 << 7 buckets */
+
+/* MMU */
+#define MMU_HASH_TABLE_BITS		7 /* 1 << 7 buckets */
+
+/**
+ * struct pgt_info - MMU hop page info.
+ * @node: hash linked-list node for the pgts hash of pgts.
+ * @addr: physical address of the pgt.
+ * @ctx: pointer to the owner ctx.
+ * @num_of_ptes: indicates how many ptes are used in the pgt.
+ *
+ * The MMU page tables hierarchy is placed on the DRAM. When a new level (hop)
+ * is needed during mapping, a new page is allocated and this structure holds
+ * its essential information. During unmapping, if no valid PTEs remained in the
+ * page, it is freed with its pgt_info structure.
+ */
+struct pgt_info {
+	struct hlist_node node;
+	u64 addr;
+	struct hl_ctx *ctx;
+	int num_of_ptes;
+};
+
 struct hl_device;
 struct hl_fpriv;
 
@@ -72,11 +98,11 @@ struct hw_queue_properties {
 /**
  * enum vm_type_t - virtual memory mapping request information.
  * @VM_TYPE_USERPTR: mapping of user memory to device virtual address.
- * @VM_TYPE_PHYS_LIST: mapping of DRAM memory to device virtual address.
+ * @VM_TYPE_PHYS_PACK: mapping of DRAM memory to device virtual address.
  */
 enum vm_type_t {
 	VM_TYPE_USERPTR,
-	VM_TYPE_PHYS_LIST
+	VM_TYPE_PHYS_PACK
 };
 
 /**
@@ -117,6 +143,12 @@ enum hl_device_hw_state {
  *                               mapping DRAM memory.
  * @va_space_dram_end_address: end address of virtual memory range for
  *                             mapping DRAM memory.
+ * @mmu_pgt_addr: base physical address in DRAM of MMU page tables.
+ * @mmu_pgt_size: MMU page tables total size.
+ * @mmu_pte_size: PTE size in MMU page tables.
+ * @mmu_hop_table_size: MMU hop table size.
+ * @mmu_hop0_tables_total_size: total size of MMU hop0 tables.
+ * @dram_page_size: page size for MMU DRAM allocation.
  * @cfg_size: configuration space size on SRAM.
  * @sram_size: total size of SRAM.
  * @max_asid: maximum number of open contexts (ASIDs).
@@ -150,6 +182,12 @@ struct asic_fixed_properties {
 	u64			va_space_host_end_address;
 	u64			va_space_dram_start_address;
 	u64			va_space_dram_end_address;
+	u64			mmu_pgt_addr;
+	u32			mmu_pgt_size;
+	u32			mmu_pte_size;
+	u32			mmu_hop_table_size;
+	u32			mmu_hop0_tables_total_size;
+	u32			dram_page_size;
 	u32			cfg_size;
 	u32			sram_size;
 	u32			max_asid;
@@ -419,6 +457,12 @@ enum hl_pll_frequency {
  * @handle_eqe: handle event queue entry (IRQ) from ArmCP.
  * @set_pll_profile: change PLL profile (manual/automatic).
  * @get_events_stat: retrieve event queue entries histogram.
+ * @read_pte: read MMU page table entry from DRAM.
+ * @write_pte: write MMU page table entry to DRAM.
+ * @mmu_invalidate_cache: flush MMU STLB cache, either with soft (L1 only) or
+ *                        hard (L0 & L1) flush.
+ * @mmu_invalidate_cache_range: flush specific MMU STLB cache lines with
+ *                              ASID-VA-size mask.
  * @send_heartbeat: send is-alive packet to ArmCP and verify response.
  * @enable_clock_gating: enable clock gating for reducing power consumption.
  * @disable_clock_gating: disable clock for accessing registers on HBW.
@@ -483,6 +527,11 @@ struct hl_asic_funcs {
 	void (*set_pll_profile)(struct hl_device *hdev,
 			enum hl_pll_frequency freq);
 	void* (*get_events_stat)(struct hl_device *hdev, u32 *size);
+	u64 (*read_pte)(struct hl_device *hdev, u64 addr);
+	void (*write_pte)(struct hl_device *hdev, u64 addr, u64 val);
+	void (*mmu_invalidate_cache)(struct hl_device *hdev, bool is_hard);
+	void (*mmu_invalidate_cache_range)(struct hl_device *hdev, bool is_hard,
+			u32 asid, u64 va, u64 size);
 	int (*send_heartbeat)(struct hl_device *hdev);
 	void (*enable_clock_gating)(struct hl_device *hdev);
 	void (*disable_clock_gating)(struct hl_device *hdev);
@@ -504,17 +553,40 @@ struct hl_asic_funcs {
 
 #define HL_KERNEL_ASID_ID	0
 
+/**
+ * struct hl_va_range - virtual addresses range.
+ * @lock: protects the virtual addresses list.
+ * @list: list of virtual addresses blocks available for mappings.
+ * @start_addr: range start address.
+ * @end_addr: range end address.
+ */
+struct hl_va_range {
+	struct mutex		lock;
+	struct list_head	list;
+	u64			start_addr;
+	u64			end_addr;
+};
+
 /**
  * struct hl_ctx - user/kernel context.
+ * @mem_hash: holds mapping from virtual address to virtual memory area
+ *		descriptor (hl_vm_phys_pg_list or hl_userptr).
+ * @mmu_hash: holds a mapping from virtual address to pgt_info structure.
  * @hpriv: pointer to the private (KMD) data of the process (fd).
  * @hdev: pointer to the device structure.
  * @refcount: reference counter for the context. Context is released only when
  *		this hits 0l. It is incremented on CS and CS_WAIT.
  * @cs_pending: array of DMA fence objects representing pending CS.
+ * @host_va_range: holds available virtual addresses for host mappings.
+ * @dram_va_range: holds available virtual addresses for DRAM mappings.
+ * @mem_hash_lock: protects the mem_hash.
+ * @mmu_lock: protects the MMU page tables. Any change to the PGT, modifing the
+ *            MMU hash or walking the PGT requires talking this lock
  * @cs_sequence: sequence number for CS. Value is assigned to a CS and passed
  *			to user so user could inquire about CS. It is used as
  *			index to cs_pending array.
  * @cs_lock: spinlock to protect cs_sequence.
+ * @dram_phys_mem: amount of used physical DRAM memory by this context.
  * @thread_restore_token: token to prevent multiple threads of the same context
  *				from running the restore phase. Only one thread
  *				should run it.
@@ -524,12 +596,19 @@ struct hl_asic_funcs {
  * @asid: context's unique address space ID in the device's MMU.
  */
 struct hl_ctx {
+	DECLARE_HASHTABLE(mem_hash, MEM_HASH_TABLE_BITS);
+	DECLARE_HASHTABLE(mmu_hash, MMU_HASH_TABLE_BITS);
 	struct hl_fpriv		*hpriv;
 	struct hl_device	*hdev;
 	struct kref		refcount;
 	struct dma_fence	*cs_pending[HL_MAX_PENDING_CS];
+	struct hl_va_range	host_va_range;
+	struct hl_va_range	dram_va_range;
+	struct mutex		mem_hash_lock;
+	struct mutex		mmu_lock;
 	u64			cs_sequence;
 	spinlock_t		cs_lock;
+	atomic64_t		dram_phys_mem;
 	atomic_t		thread_restore_token;
 	u32			thread_restore_wait_token;
 	u32			asid;
@@ -672,6 +751,85 @@ struct hl_cs_parser {
 };
 
 
+/*
+ * MEMORY STRUCTURE
+ */
+
+/**
+ * struct hl_vm_hash_node - hash element from virtual address to virtual
+ *				memory area descriptor (hl_vm_phys_pg_list or
+ *				hl_userptr).
+ * @node: node to hang on the hash table in context object.
+ * @vaddr: key virtual address.
+ * @ptr: value pointer (hl_vm_phys_pg_list or hl_userptr).
+ */
+struct hl_vm_hash_node {
+	struct hlist_node	node;
+	u64			vaddr;
+	void			*ptr;
+};
+
+/**
+ * struct hl_vm_phys_pg_pack - physical page pack.
+ * @vm_type: describes the type of the virtual area descriptor.
+ * @pages: the physical page array.
+ * @mapping_cnt: number of shared mappings.
+ * @asid: the context related to this list.
+ * @npages: num physical pages in the pack.
+ * @page_size: size of each page in the pack.
+ * @total_size: total size of all the pages in this list.
+ * @flags: HL_MEM_* flags related to this list.
+ * @handle: the provided handle related to this list.
+ * @offset: offset from the first page.
+ * @contiguous: is contiguous physical memory.
+ * @created_from_userptr: is product of host virtual address.
+ */
+struct hl_vm_phys_pg_pack {
+	enum vm_type_t		vm_type; /* must be first */
+	u64			*pages;
+	atomic_t		mapping_cnt;
+	u32			asid;
+	u32			npages;
+	u32			page_size;
+	u32			total_size;
+	u32			flags;
+	u32			handle;
+	u32			offset;
+	u8			contiguous;
+	u8			created_from_userptr;
+};
+
+/**
+ * struct hl_vm_va_block - virtual range block information.
+ * @node: node to hang on the virtual range list in context object.
+ * @start: virtual range start address.
+ * @end: virtual range end address.
+ * @size: virtual range size.
+ */
+struct hl_vm_va_block {
+	struct list_head	node;
+	u64			start;
+	u64			end;
+	u64			size;
+};
+
+/**
+ * struct hl_vm - virtual memory manager for MMU.
+ * @dram_pg_pool: pool for DRAM physical pages of 2MB.
+ * @dram_pg_pool_refcount: reference counter for the pool usage.
+ * @idr_lock: protects the phys_pg_list_handles.
+ * @phys_pg_pack_handles: idr to hold all device allocations handles.
+ * @init_done: whether initialization was done. We need this because VM
+ *		initialization might be skipped during device initialization.
+ */
+struct hl_vm {
+	struct gen_pool		*dram_pg_pool;
+	struct kref		dram_pg_pool_refcount;
+	spinlock_t		idr_lock;
+	struct idr		phys_pg_pack_handles;
+	u8			init_done;
+};
+
 /*
  * FILE PRIVATE STRUCTURE
  */
@@ -787,12 +945,16 @@ struct hl_device_reset_work {
  * @asic_prop: ASIC specific immutable properties.
  * @asic_funcs: ASIC specific functions.
  * @asic_specific: ASIC specific information to use only from ASIC files.
+ * @mmu_pgt_pool: pool of available MMU hops.
+ * @vm: virtual memory manager for MMU.
+ * @mmu_cache_lock: protects MMU cache invalidation as it can serve one context
  * @hwmon_dev: H/W monitor device.
  * @pm_mng_profile: current power management profile.
  * @hl_chip_info: ASIC's sensors information.
  * @cb_pool: list of preallocated CBs.
  * @cb_pool_lock: protects the CB pool.
  * @user_ctx: current user context executing.
+ * @dram_used_mem: current DRAM memory consumption.
  * @in_reset: is device in reset flow.
  * @curr_pll_profile: current PLL profile.
  * @fd_open_cnt: number of open user processes.
@@ -812,6 +974,7 @@ struct hl_device_reset_work {
  * @heartbeat: is heartbeat sanity check towards ArmCP enabled.
  * @reset_on_lockup: true if a reset should be done in case of stuck CS, false
  *                   otherwise.
+ * @dram_supports_virtual_memory: is MMU enabled towards DRAM.
  * @init_done: is the initialization of the device done.
  * @mmu_enable: is MMU enabled.
  */
@@ -846,6 +1009,9 @@ struct hl_device {
 	struct asic_fixed_properties	asic_prop;
 	const struct hl_asic_funcs	*asic_funcs;
 	void				*asic_specific;
+	struct gen_pool			*mmu_pgt_pool;
+	struct hl_vm			vm;
+	struct mutex			mmu_cache_lock;
 	struct device			*hwmon_dev;
 	enum hl_pm_mng_profile		pm_mng_profile;
 	struct hwmon_chip_info		*hl_chip_info;
@@ -856,6 +1022,7 @@ struct hl_device {
 	/* TODO: remove user_ctx for multiple process support */
 	struct hl_ctx			*user_ctx;
 
+	atomic64_t			dram_used_mem;
 	atomic_t			in_reset;
 	atomic_t			curr_pll_profile;
 	atomic_t			fd_open_cnt;
@@ -872,6 +1039,7 @@ struct hl_device {
 	u8				hard_reset_pending;
 	u8				heartbeat;
 	u8				reset_on_lockup;
+	u8				dram_supports_virtual_memory;
 	u8				init_done;
 
 	/* Parameters for bring-up */
@@ -1021,6 +1189,7 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
 void hl_hpriv_get(struct hl_fpriv *hpriv);
 void hl_hpriv_put(struct hl_fpriv *hpriv);
 int hl_device_set_frequency(struct hl_device *hdev, enum hl_pll_frequency freq);
+
 int hl_build_hwmon_channel_info(struct hl_device *hdev,
 		struct armcp_sensor *sensors_arr);
 
@@ -1048,6 +1217,12 @@ struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, bool ext_queue);
 
 void goya_set_asic_funcs(struct hl_device *hdev);
 
+int hl_vm_ctx_init(struct hl_ctx *ctx);
+void hl_vm_ctx_fini(struct hl_ctx *ctx);
+
+int hl_vm_init(struct hl_device *hdev);
+void hl_vm_fini(struct hl_device *hdev);
+
 int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u32 size,
 			struct hl_userptr *userptr);
 int hl_unpin_host_memory(struct hl_device *hdev, struct hl_userptr *userptr);
@@ -1057,6 +1232,15 @@ bool hl_userptr_is_pinned(struct hl_device *hdev, u64 addr, u32 size,
 				struct list_head *userptr_list,
 				struct hl_userptr **userptr);
 
+int hl_mmu_init(struct hl_device *hdev);
+void hl_mmu_fini(struct hl_device *hdev);
+void hl_mmu_ctx_init(struct hl_ctx *ctx);
+void hl_mmu_ctx_fini(struct hl_ctx *ctx);
+int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size);
+int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size);
+void hl_mmu_swap_out(struct hl_ctx *ctx);
+void hl_mmu_swap_in(struct hl_ctx *ctx);
+
 long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr);
 void hl_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq);
 long hl_get_temperature(struct hl_device *hdev, int sensor_index, u32 attr);
@@ -1074,5 +1258,6 @@ long hl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data);
 int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data);
 int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data);
+int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data);
 
 #endif /* HABANALABSP_H_ */
diff --git a/drivers/misc/habanalabs/habanalabs_drv.c b/drivers/misc/habanalabs/habanalabs_drv.c
index 77a1cc85e530..436ccae0989d 100644
--- a/drivers/misc/habanalabs/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/habanalabs_drv.c
@@ -188,7 +188,7 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
 	hdev->reset_on_lockup = reset_on_lockup;
 
 	/* Parameters for bring-up - set them to defaults */
-	hdev->mmu_enable = 0;
+	hdev->mmu_enable = 1;
 	hdev->cpu_enable = 1;
 	hdev->reset_pcilink = 0;
 	hdev->cpu_queues_enable = 1;
diff --git a/drivers/misc/habanalabs/habanalabs_ioctl.c b/drivers/misc/habanalabs/habanalabs_ioctl.c
index 481db1a5e97e..6e4dc5b5e696 100644
--- a/drivers/misc/habanalabs/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/habanalabs_ioctl.c
@@ -18,7 +18,8 @@
 static const struct hl_ioctl_desc hl_ioctls[] = {
 	HL_IOCTL_DEF(HL_IOCTL_CB, hl_cb_ioctl),
 	HL_IOCTL_DEF(HL_IOCTL_CS, hl_cs_ioctl),
-	HL_IOCTL_DEF(HL_IOCTL_WAIT_CS, hl_cs_wait_ioctl)
+	HL_IOCTL_DEF(HL_IOCTL_WAIT_CS, hl_cs_wait_ioctl),
+	HL_IOCTL_DEF(HL_IOCTL_MEMORY, hl_mem_ioctl)
 };
 
 #define HL_CORE_IOCTL_COUNT	ARRAY_SIZE(hl_ioctls)
diff --git a/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_general.h b/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_general.h
new file mode 100644
index 000000000000..1bc36aba1426
--- /dev/null
+++ b/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_general.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright 2016-2018 HabanaLabs, Ltd.
+ * All Rights Reserved.
+ *
+ */
+
+#ifndef INCLUDE_MMU_GENERAL_H_
+#define INCLUDE_MMU_GENERAL_H_
+
+#define PAGE_SHIFT_4KB			12
+#define PAGE_SHIFT_2MB			21
+#define PAGE_SIZE_2MB			(_AC(1, UL) << PAGE_SHIFT_2MB)
+#define PAGE_SIZE_4KB			(_AC(1, UL) << PAGE_SHIFT_4KB)
+#define PAGE_MASK_2MB			(~(PAGE_SIZE_2MB - 1))
+
+#define PAGE_PRESENT_MASK		0x0000000000001
+#define SWAP_OUT_MASK			0x0000000000004
+#define LAST_MASK			0x0000000000800
+#define PHYS_ADDR_MASK			0x3FFFFFFFFF000ull
+#define HOP0_MASK			0x3000000000000ull
+#define HOP1_MASK			0x0FF8000000000ull
+#define HOP2_MASK			0x0007FC0000000ull
+#define HOP3_MASK			0x000003FE00000
+#define HOP4_MASK			0x00000001FF000
+#define OFFSET_MASK			0x0000000000FFF
+
+#define HOP0_SHIFT			48
+#define HOP1_SHIFT			39
+#define HOP2_SHIFT			30
+#define HOP3_SHIFT			21
+#define HOP4_SHIFT			12
+
+#define PTE_PHYS_ADDR_SHIFT		12
+#define PTE_PHYS_ADDR_MASK		~0xFFF
+
+#define HL_PTE_SIZE			sizeof(u64)
+#define HOP_TABLE_SIZE			PAGE_SIZE_4KB
+#define HOP0_TABLES_TOTAL_SIZE		(HOP_TABLE_SIZE * MAX_ASID)
+
+#define MMU_HOP0_PA43_12_SHIFT		12
+#define MMU_HOP0_PA49_44_SHIFT		(12 + 32)
+
+#define MMU_CONFIG_TIMEOUT_USEC		2000 /* 2 ms */
+
+#endif /* INCLUDE_MMU_GENERAL_H_ */
diff --git a/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_v1_0.h b/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_v1_0.h
new file mode 100644
index 000000000000..8539dd041f2c
--- /dev/null
+++ b/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_v1_0.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright 2016-2018 HabanaLabs, Ltd.
+ * All Rights Reserved.
+ *
+ */
+
+#ifndef INCLUDE_MMU_V1_0_H_
+#define INCLUDE_MMU_V1_0_H_
+
+#define MMU_HOP0_PA43_12	0x490004
+#define MMU_HOP0_PA49_44	0x490008
+#define MMU_ASID_BUSY		0x490000
+
+#endif /* INCLUDE_MMU_V1_0_H_ */
diff --git a/drivers/misc/habanalabs/memory.c b/drivers/misc/habanalabs/memory.c
index ad14376a1c25..6650c8085fc6 100644
--- a/drivers/misc/habanalabs/memory.c
+++ b/drivers/misc/habanalabs/memory.c
@@ -5,10 +5,1198 @@
  * All Rights Reserved.
  */
 
+#include <uapi/misc/habanalabs.h>
 #include "habanalabs.h"
+#include "include/hw_ip/mmu/mmu_general.h"
 
 #include <linux/uaccess.h>
 #include <linux/slab.h>
+#include <linux/genalloc.h>
+
+#define PGS_IN_2MB_PAGE	(PAGE_SIZE_2MB >> PAGE_SHIFT)
+#define HL_MMU_DEBUG	0
+
+/*
+ * The va ranges in context object contain a list with the available chunks of
+ * device virtual memory.
+ * There is one range for host allocations and one for DRAM allocations.
+ *
+ * On initialization each range contains one chunk of all of its available
+ * virtual range which is a half of the total device virtual range.
+ *
+ * On each mapping of physical pages, a suitable virtual range chunk (with a
+ * minimum size) is selected from the list. If the chunk size equals the
+ * requested size, the chunk is returned. Otherwise, the chunk is split into
+ * two chunks - one to return as result and a remainder to stay in the list.
+ *
+ * On each Unmapping of a virtual address, the relevant virtual chunk is
+ * returned to the list. The chunk is added to the list and if its edges match
+ * the edges of the adjacent chunks (means a contiguous chunk can be created),
+ * the chunks are merged.
+ *
+ * On finish, the list is checked to have only one chunk of all the relevant
+ * virtual range (which is a half of the device total virtual range).
+ * If not (means not all mappings were unmapped), a warning is printed.
+ */
+
+/*
+ * alloc_device_memory - allocate device memory
+ *
+ * @ctx                 : current context
+ * @args                : host parameters containing the requested size
+ * @ret_handle          : result handle
+ *
+ * This function does the following:
+ * - Allocate the requested size rounded up to 2MB pages
+ * - Return unique handle
+ */
+static int alloc_device_memory(struct hl_ctx *ctx, struct hl_mem_in *args,
+				u32 *ret_handle)
+{
+	struct hl_device *hdev = ctx->hdev;
+	struct hl_vm *vm = &hdev->vm;
+	struct hl_vm_phys_pg_pack *phys_pg_pack;
+	u64 paddr = 0;
+	u32 total_size, num_pgs, num_curr_pgs, page_size, page_shift;
+	int handle, rc, i;
+	bool contiguous;
+
+	num_curr_pgs = 0;
+	page_size = hdev->asic_prop.dram_page_size;
+	page_shift = __ffs(page_size);
+	num_pgs = (args->alloc.mem_size + (page_size - 1)) >> page_shift;
+	total_size = num_pgs << page_shift;
+
+	contiguous = args->flags & HL_MEM_CONTIGUOUS;
+
+	if (contiguous) {
+		paddr = (u64) gen_pool_alloc(vm->dram_pg_pool, total_size);
+		if (!paddr) {
+			dev_err(hdev->dev,
+				"failed to allocate %u huge contiguous pages\n",
+				num_pgs);
+			return -ENOMEM;
+		}
+	}
+
+	phys_pg_pack = kzalloc(sizeof(*phys_pg_pack), GFP_KERNEL);
+	if (!phys_pg_pack) {
+		rc = -ENOMEM;
+		goto pages_pack_err;
+	}
+
+	phys_pg_pack->vm_type = VM_TYPE_PHYS_PACK;
+	phys_pg_pack->asid = ctx->asid;
+	phys_pg_pack->npages = num_pgs;
+	phys_pg_pack->page_size = page_size;
+	phys_pg_pack->total_size = total_size;
+	phys_pg_pack->flags = args->flags;
+	phys_pg_pack->contiguous = contiguous;
+
+	phys_pg_pack->pages = kcalloc(num_pgs, sizeof(u64), GFP_KERNEL);
+	if (!phys_pg_pack->pages) {
+		rc = -ENOMEM;
+		goto pages_arr_err;
+	}
+
+	if (phys_pg_pack->contiguous) {
+		for (i = 0 ; i < num_pgs ; i++)
+			phys_pg_pack->pages[i] = paddr + i * page_size;
+	} else {
+		for (i = 0 ; i < num_pgs ; i++) {
+			phys_pg_pack->pages[i] = (u64) gen_pool_alloc(
+							vm->dram_pg_pool,
+							page_size);
+			if (!phys_pg_pack->pages[i]) {
+				dev_err(hdev->dev,
+					"ioctl failed to allocate page\n");
+				rc = -ENOMEM;
+				goto page_err;
+			}
+
+			num_curr_pgs++;
+		}
+	}
+
+	spin_lock(&vm->idr_lock);
+	handle = idr_alloc(&vm->phys_pg_pack_handles, phys_pg_pack, 1, 0,
+				GFP_KERNEL);
+	spin_unlock(&vm->idr_lock);
+
+	if (handle < 0) {
+		dev_err(hdev->dev, "Failed to get handle for page\n");
+		rc = -EFAULT;
+		goto idr_err;
+	}
+
+	for (i = 0 ; i < num_pgs ; i++)
+		kref_get(&vm->dram_pg_pool_refcount);
+
+	phys_pg_pack->handle = handle;
+
+	atomic64_add(phys_pg_pack->total_size, &ctx->dram_phys_mem);
+	atomic64_add(phys_pg_pack->total_size, &hdev->dram_used_mem);
+
+	*ret_handle = handle;
+
+	return 0;
+
+idr_err:
+page_err:
+	if (!phys_pg_pack->contiguous)
+		for (i = 0 ; i < num_curr_pgs ; i++)
+			gen_pool_free(vm->dram_pg_pool, phys_pg_pack->pages[i],
+					page_size);
+
+	kfree(phys_pg_pack->pages);
+pages_arr_err:
+	kfree(phys_pg_pack);
+pages_pack_err:
+	if (contiguous)
+		gen_pool_free(vm->dram_pg_pool, paddr, total_size);
+
+	return rc;
+}
+
+/*
+ * get_userptr_from_host_va - initialize userptr structure from given host
+ *                            virtual address
+ *
+ * @hdev                : habanalabs device structure
+ * @args                : parameters containing the virtual address and size
+ * @p_userptr           : pointer to result userptr structure
+ *
+ * This function does the following:
+ * - Allocate userptr structure
+ * - Pin the given host memory using the userptr structure
+ * - Perform DMA mapping to have the DMA addresses of the pages
+ */
+static int get_userptr_from_host_va(struct hl_device *hdev,
+		struct hl_mem_in *args, struct hl_userptr **p_userptr)
+{
+	struct hl_userptr *userptr;
+	int rc;
+
+	userptr = kzalloc(sizeof(*userptr), GFP_KERNEL);
+	if (!userptr) {
+		rc = -ENOMEM;
+		goto userptr_err;
+	}
+
+	rc = hl_pin_host_memory(hdev, args->map_host.host_virt_addr,
+			args->map_host.mem_size, userptr);
+	if (rc) {
+		dev_err(hdev->dev, "Failed to pin host memory\n");
+		goto pin_err;
+	}
+
+	rc = hdev->asic_funcs->asic_dma_map_sg(hdev, userptr->sgt->sgl,
+					userptr->sgt->nents, DMA_BIDIRECTIONAL);
+	if (rc) {
+		dev_err(hdev->dev, "failed to map sgt with DMA region\n");
+		goto dma_map_err;
+	}
+
+	userptr->dma_mapped = true;
+	userptr->dir = DMA_BIDIRECTIONAL;
+	userptr->vm_type = VM_TYPE_USERPTR;
+
+	*p_userptr = userptr;
+
+	return 0;
+
+dma_map_err:
+	hl_unpin_host_memory(hdev, userptr);
+pin_err:
+	kfree(userptr);
+userptr_err:
+
+	return rc;
+}
+
+/*
+ * free_userptr - free userptr structure
+ *
+ * @hdev                : habanalabs device structure
+ * @userptr             : userptr to free
+ *
+ * This function does the following:
+ * - Unpins the physical pages
+ * - Frees the userptr structure
+ */
+static void free_userptr(struct hl_device *hdev, struct hl_userptr *userptr)
+{
+	hl_unpin_host_memory(hdev, userptr);
+	kfree(userptr);
+}
+
+/*
+ * dram_pg_pool_do_release - free DRAM pages pool
+ *
+ * @ref                 : pointer to reference object
+ *
+ * This function does the following:
+ * - Frees the idr structure of physical pages handles
+ * - Frees the generic pool of DRAM physical pages
+ */
+static void dram_pg_pool_do_release(struct kref *ref)
+{
+	struct hl_vm *vm = container_of(ref, struct hl_vm,
+			dram_pg_pool_refcount);
+
+	/*
+	 * free the idr here as only here we know for sure that there are no
+	 * allocated physical pages and hence there are no handles in use
+	 */
+	idr_destroy(&vm->phys_pg_pack_handles);
+	gen_pool_destroy(vm->dram_pg_pool);
+}
+
+/*
+ * free_phys_pg_pack   - free physical page pack
+ *
+ * @hdev               : habanalabs device structure
+ * @phys_pg_pack       : physical page pack to free
+ *
+ * This function does the following:
+ * - For DRAM memory only, iterate over the pack and free each physical block
+ *   structure by returning it to the general pool
+ * - Free the hl_vm_phys_pg_pack structure
+ */
+static void free_phys_pg_pack(struct hl_device *hdev,
+		struct hl_vm_phys_pg_pack *phys_pg_pack)
+{
+	struct hl_vm *vm = &hdev->vm;
+	int i;
+
+	if (!phys_pg_pack->created_from_userptr) {
+		if (phys_pg_pack->contiguous) {
+			gen_pool_free(vm->dram_pg_pool, phys_pg_pack->pages[0],
+					phys_pg_pack->total_size);
+
+			for (i = 0; i < phys_pg_pack->npages ; i++)
+				kref_put(&vm->dram_pg_pool_refcount,
+					dram_pg_pool_do_release);
+		} else {
+			for (i = 0 ; i < phys_pg_pack->npages ; i++) {
+				gen_pool_free(vm->dram_pg_pool,
+						phys_pg_pack->pages[i],
+						phys_pg_pack->page_size);
+				kref_put(&vm->dram_pg_pool_refcount,
+					dram_pg_pool_do_release);
+			}
+		}
+	}
+
+	kfree(phys_pg_pack->pages);
+	kfree(phys_pg_pack);
+}
+
+/*
+ * free_device_memory - free device memory
+ *
+ * @ctx                  : current context
+ * @handle              : handle of the memory chunk to free
+ *
+ * This function does the following:
+ * - Free the device memory related to the given handle
+ */
+static int free_device_memory(struct hl_ctx *ctx, u32 handle)
+{
+	struct hl_device *hdev = ctx->hdev;
+	struct hl_vm *vm = &hdev->vm;
+	struct hl_vm_phys_pg_pack *phys_pg_pack;
+
+	spin_lock(&vm->idr_lock);
+	phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, handle);
+	if (phys_pg_pack) {
+		if (atomic_read(&phys_pg_pack->mapping_cnt) > 0) {
+			dev_err(hdev->dev, "handle %u is mapped, cannot free\n",
+				handle);
+			spin_unlock(&vm->idr_lock);
+			return -EINVAL;
+		}
+
+		/*
+		 * must remove from idr before the freeing of the physical
+		 * pages as the refcount of the pool is also the trigger of the
+		 * idr destroy
+		 */
+		idr_remove(&vm->phys_pg_pack_handles, handle);
+		spin_unlock(&vm->idr_lock);
+
+		atomic64_sub(phys_pg_pack->total_size, &ctx->dram_phys_mem);
+		atomic64_sub(phys_pg_pack->total_size, &hdev->dram_used_mem);
+
+		free_phys_pg_pack(hdev, phys_pg_pack);
+	} else {
+		spin_unlock(&vm->idr_lock);
+		dev_err(hdev->dev,
+			"free device memory failed, no match for handle %u\n",
+			handle);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * clear_va_list_locked - free virtual addresses list
+ *
+ * @hdev                : habanalabs device structure
+ * @va_list             : list of virtual addresses to free
+ *
+ * This function does the following:
+ * - Iterate over the list and free each virtual addresses block
+ *
+ * This function should be called only when va_list lock is taken
+ */
+static void clear_va_list_locked(struct hl_device *hdev,
+		struct list_head *va_list)
+{
+	struct hl_vm_va_block *va_block, *tmp;
+
+	list_for_each_entry_safe(va_block, tmp, va_list, node) {
+		list_del(&va_block->node);
+		kfree(va_block);
+	}
+}
+
+/*
+ * print_va_list_locked    - print virtual addresses list
+ *
+ * @hdev                : habanalabs device structure
+ * @va_list             : list of virtual addresses to print
+ *
+ * This function does the following:
+ * - Iterate over the list and print each virtual addresses block
+ *
+ * This function should be called only when va_list lock is taken
+ */
+static void print_va_list_locked(struct hl_device *hdev,
+		struct list_head *va_list)
+{
+#if HL_MMU_DEBUG
+	struct hl_vm_va_block *va_block;
+
+	dev_dbg(hdev->dev, "print va list:\n");
+
+	list_for_each_entry(va_block, va_list, node)
+		dev_dbg(hdev->dev,
+			"va block, start: 0x%llx, end: 0x%llx, size: %llu\n",
+			va_block->start, va_block->end, va_block->size);
+#endif
+}
+
+/*
+ * merge_va_blocks_locked - merge a virtual block if possible
+ *
+ * @hdev                : pointer to the habanalabs device structure
+ * @va_list             : pointer to the virtual addresses block list
+ * @va_block            : virtual block to merge with adjacent blocks
+ *
+ * This function does the following:
+ * - Merge the given blocks with the adjacent blocks if their virtual ranges
+ *   create a contiguous virtual range
+ *
+ * This Function should be called only when va_list lock is taken
+ */
+static void merge_va_blocks_locked(struct hl_device *hdev,
+		struct list_head *va_list, struct hl_vm_va_block *va_block)
+{
+	struct hl_vm_va_block *prev, *next;
+
+	prev = list_prev_entry(va_block, node);
+	if (&prev->node != va_list && prev->end + 1 == va_block->start) {
+		prev->end = va_block->end;
+		prev->size = prev->end - prev->start;
+		list_del(&va_block->node);
+		kfree(va_block);
+		va_block = prev;
+	}
+
+	next = list_next_entry(va_block, node);
+	if (&next->node != va_list && va_block->end + 1 == next->start) {
+		next->start = va_block->start;
+		next->size = next->end - next->start;
+		list_del(&va_block->node);
+		kfree(va_block);
+	}
+}
+
+/*
+ * add_va_block_locked - add a virtual block to the virtual addresses list
+ *
+ * @hdev                : pointer to the habanalabs device structure
+ * @va_list             : pointer to the virtual addresses block list
+ * @start               : start virtual address
+ * @end                 : end virtual address
+ *
+ * This function does the following:
+ * - Add the given block to the virtual blocks list and merge with other
+ * blocks if a contiguous virtual block can be created
+ *
+ * This Function should be called only when va_list lock is taken
+ */
+static int add_va_block_locked(struct hl_device *hdev,
+		struct list_head *va_list, u64 start, u64 end)
+{
+	struct hl_vm_va_block *va_block, *res = NULL;
+	u64 size = end - start;
+
+	print_va_list_locked(hdev, va_list);
+
+	list_for_each_entry(va_block, va_list, node) {
+		/* TODO: remove upon matureness */
+		if (hl_mem_area_crosses_range(start, size, va_block->start,
+				va_block->end)) {
+			dev_err(hdev->dev,
+				"block crossing ranges at start 0x%llx, end 0x%llx\n",
+				va_block->start, va_block->end);
+			return -EINVAL;
+		}
+
+		if (va_block->end < start)
+			res = va_block;
+	}
+
+	va_block = kmalloc(sizeof(*va_block), GFP_KERNEL);
+	if (!va_block)
+		return -ENOMEM;
+
+	va_block->start = start;
+	va_block->end = end;
+	va_block->size = size;
+
+	if (!res)
+		list_add(&va_block->node, va_list);
+	else
+		list_add(&va_block->node, &res->node);
+
+	merge_va_blocks_locked(hdev, va_list, va_block);
+
+	print_va_list_locked(hdev, va_list);
+
+	return 0;
+}
+
+/*
+ * add_va_block - wrapper for add_va_block_locked
+ *
+ * @hdev                : pointer to the habanalabs device structure
+ * @va_list             : pointer to the virtual addresses block list
+ * @start               : start virtual address
+ * @end                 : end virtual address
+ *
+ * This function does the following:
+ * - Takes the list lock and calls add_va_block_locked
+ */
+static inline int add_va_block(struct hl_device *hdev,
+		struct hl_va_range *va_range, u64 start, u64 end)
+{
+	int rc;
+
+	mutex_lock(&va_range->lock);
+	rc = add_va_block_locked(hdev, &va_range->list, start, end);
+	mutex_unlock(&va_range->lock);
+
+	return rc;
+}
+
+/*
+ * get_va_block - get a virtual block with the requested size
+ *
+ * @hdev            : pointer to the habanalabs device structure
+ * @va_range        : pointer to the virtual addresses range
+ * @size            : requested block size
+ * @hint_addr       : hint for request address by the user
+ * @is_userptr      : is host or DRAM memory
+ *
+ * This function does the following:
+ * - Iterate on the virtual block list to find a suitable virtual block for the
+ *   requested size
+ * - Reserve the requested block and update the list
+ * - Return the start address of the virtual block
+ */
+static u64 get_va_block(struct hl_device *hdev,
+		struct hl_va_range *va_range, u32 size, u64 hint_addr,
+		bool is_userptr)
+{
+	struct hl_vm_va_block *va_block, *new_va_block = NULL;
+	u64 valid_start, valid_size, prev_start, prev_end, page_mask,
+		res_valid_start = 0, res_valid_size = 0;
+	u32 page_size;
+	bool add_prev = false;
+
+	if (is_userptr) {
+		/*
+		 * We cannot know if the user allocated memory with huge pages
+		 * or not, hence we continue with the biggest possible
+		 * granularity.
+		 */
+		page_size = PAGE_SIZE_2MB;
+		page_mask = PAGE_MASK_2MB;
+	} else {
+		page_size = hdev->asic_prop.dram_page_size;
+		page_mask = ~((u64)page_size - 1);
+	}
+
+	mutex_lock(&va_range->lock);
+
+	print_va_list_locked(hdev, &va_range->list);
+
+	list_for_each_entry(va_block, &va_range->list, node) {
+		/* calc the first possible aligned addr */
+		valid_start = va_block->start;
+
+
+		if (valid_start & (page_size - 1)) {
+			valid_start &= page_mask;
+			valid_start += page_size;
+			if (valid_start > va_block->end)
+				continue;
+		}
+
+		valid_size = va_block->end - valid_start;
+
+		if (valid_size >= size &&
+			(!new_va_block || valid_size < res_valid_size)) {
+
+			new_va_block = va_block;
+			res_valid_start = valid_start;
+			res_valid_size = valid_size;
+		}
+
+		if (hint_addr && hint_addr >= valid_start &&
+				((hint_addr + size) <= va_block->end)) {
+			new_va_block = va_block;
+			res_valid_start = hint_addr;
+			res_valid_size = valid_size;
+			break;
+		}
+	}
+
+	if (!new_va_block) {
+		dev_err(hdev->dev, "no available va block for size %u\n", size);
+		goto out;
+	}
+
+	if (res_valid_start > new_va_block->start) {
+		prev_start = new_va_block->start;
+		prev_end = res_valid_start - 1;
+
+		new_va_block->start = res_valid_start;
+		new_va_block->size = res_valid_size;
+
+		add_prev = true;
+	}
+
+	if (new_va_block->size > size) {
+		new_va_block->start += size;
+		new_va_block->size = new_va_block->end - new_va_block->start;
+	} else {
+		list_del(&new_va_block->node);
+		kfree(new_va_block);
+	}
+
+	if (add_prev)
+		add_va_block_locked(hdev, &va_range->list, prev_start,
+				prev_end);
+
+	print_va_list_locked(hdev, &va_range->list);
+out:
+	mutex_unlock(&va_range->lock);
+
+	return res_valid_start;
+}
+
+/*
+ * get_sg_info - get number of pages and the DMA address from SG list
+ *
+ * @sg                 : the SG list
+ * @dma_addr           : pointer to DMA address to return
+ *
+ * Calculate the number of consecutive pages described by the SG list. Take the
+ * offset of the address in the first page, add to it the length and round it up
+ * to the number of needed pages.
+ */
+static u32 get_sg_info(struct scatterlist *sg, dma_addr_t *dma_addr)
+{
+	*dma_addr = sg_dma_address(sg);
+
+	return ((((*dma_addr) & (PAGE_SIZE - 1)) + sg_dma_len(sg)) +
+			(PAGE_SIZE - 1)) >> PAGE_SHIFT;
+}
+
+/*
+ * init_phys_pg_pack_from_userptr - initialize physical page pack from host
+ *                                   memory
+ *
+ * @ctx                : current context
+ * @userptr            : userptr to initialize from
+ * @pphys_pg_pack      : res pointer
+ *
+ * This function does the following:
+ * - Pin the physical pages related to the given virtual block
+ * - Create a physical page pack from the physical pages related to the given
+ *   virtual block
+ */
+static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx,
+		struct hl_userptr *userptr,
+		struct hl_vm_phys_pg_pack **pphys_pg_pack)
+{
+	struct hl_vm_phys_pg_pack *phys_pg_pack;
+	struct scatterlist *sg;
+	dma_addr_t dma_addr;
+	u64 page_mask;
+	u32 npages, total_npages, page_size = PAGE_SIZE;
+	bool first = true, is_huge_page_opt = true;
+	int rc, i, j;
+
+	phys_pg_pack = kzalloc(sizeof(*phys_pg_pack), GFP_KERNEL);
+	if (!phys_pg_pack)
+		return -ENOMEM;
+
+	phys_pg_pack->vm_type = userptr->vm_type;
+	phys_pg_pack->created_from_userptr = true;
+	phys_pg_pack->asid = ctx->asid;
+	atomic_set(&phys_pg_pack->mapping_cnt, 1);
+
+	/* Only if all dma_addrs are aligned to 2MB and their
+	 * sizes is at least 2MB, we can use huge page mapping.
+	 * We limit the 2MB optimization to this condition,
+	 * since later on we acquire the related VA range as one
+	 * consecutive block.
+	 */
+	total_npages = 0;
+	for_each_sg(userptr->sgt->sgl, sg, userptr->sgt->nents, i) {
+		npages = get_sg_info(sg, &dma_addr);
+
+		total_npages += npages;
+
+		if (first) {
+			first = false;
+			dma_addr &= PAGE_MASK_2MB;
+		}
+
+		if ((npages % PGS_IN_2MB_PAGE) ||
+					(dma_addr & (PAGE_SIZE_2MB - 1)))
+			is_huge_page_opt = false;
+	}
+
+	if (is_huge_page_opt) {
+		page_size = PAGE_SIZE_2MB;
+		total_npages /= PGS_IN_2MB_PAGE;
+	}
+
+	page_mask = ~(((u64) page_size) - 1);
+
+	phys_pg_pack->pages = kcalloc(total_npages, sizeof(u64), GFP_KERNEL);
+	if (!phys_pg_pack->pages) {
+		rc = -ENOMEM;
+		goto page_pack_arr_mem_err;
+	}
+
+	phys_pg_pack->npages = total_npages;
+	phys_pg_pack->page_size = page_size;
+	phys_pg_pack->total_size = total_npages * page_size;
+
+	j = 0;
+	first = true;
+	for_each_sg(userptr->sgt->sgl, sg, userptr->sgt->nents, i) {
+		npages = get_sg_info(sg, &dma_addr);
+
+		/* align down to physical page size and save the offset */
+		if (first) {
+			first = false;
+			phys_pg_pack->offset = dma_addr & (page_size - 1);
+			dma_addr &= page_mask;
+		}
+
+		while (npages) {
+			phys_pg_pack->pages[j++] = dma_addr;
+			dma_addr += page_size;
+
+			if (is_huge_page_opt)
+				npages -= PGS_IN_2MB_PAGE;
+			else
+				npages--;
+		}
+	}
+
+	*pphys_pg_pack = phys_pg_pack;
+
+	return 0;
+
+page_pack_arr_mem_err:
+	kfree(phys_pg_pack);
+
+	return rc;
+}
+
+/*
+ * map_phys_page_pack - maps the physical page pack
+ *
+ * @ctx                : current context
+ * @vaddr              : start address of the virtual area to map from
+ * @phys_pg_pack       : the pack of physical pages to map to
+ *
+ * This function does the following:
+ * - Maps each chunk of virtual memory to matching physical chunk
+ * - Stores number of successful mappings in the given argument
+ * - Returns 0 on success, error code otherwise.
+ */
+static int map_phys_page_pack(struct hl_ctx *ctx, u64 vaddr,
+		struct hl_vm_phys_pg_pack *phys_pg_pack)
+{
+	struct hl_device *hdev = ctx->hdev;
+	u64 next_vaddr = vaddr, paddr;
+	u32 page_size = phys_pg_pack->page_size;
+	int i, rc = 0, mapped_pg_cnt = 0;
+
+	for (i = 0 ; i < phys_pg_pack->npages ; i++) {
+		paddr = phys_pg_pack->pages[i];
+
+		/* For accessing the host we need to turn on bit 39 */
+		if (phys_pg_pack->created_from_userptr)
+			paddr += hdev->asic_prop.host_phys_base_address;
+
+		rc = hl_mmu_map(ctx, next_vaddr, paddr, page_size);
+		if (rc) {
+			dev_err(hdev->dev,
+				"map failed for handle %u, npages: %d, mapped: %d",
+				phys_pg_pack->handle, phys_pg_pack->npages,
+				mapped_pg_cnt);
+			goto err;
+		}
+
+		mapped_pg_cnt++;
+		next_vaddr += page_size;
+	}
+
+	return 0;
+
+err:
+	next_vaddr = vaddr;
+	for (i = 0 ; i < mapped_pg_cnt ; i++) {
+		if (hl_mmu_unmap(ctx, next_vaddr, page_size))
+			dev_warn_ratelimited(hdev->dev,
+				"failed to unmap handle %u, va: 0x%llx, pa: 0x%llx, page size: %u\n",
+					phys_pg_pack->handle, next_vaddr,
+					phys_pg_pack->pages[i], page_size);
+
+		next_vaddr += page_size;
+	}
+
+	return rc;
+}
+
+static int get_paddr_from_handle(struct hl_ctx *ctx, struct hl_mem_in *args,
+				u64 *paddr)
+{
+	struct hl_device *hdev = ctx->hdev;
+	struct hl_vm *vm = &hdev->vm;
+	struct hl_vm_phys_pg_pack *phys_pg_pack;
+	u32 handle;
+
+	handle = lower_32_bits(args->map_device.handle);
+	spin_lock(&vm->idr_lock);
+	phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, handle);
+	if (!phys_pg_pack) {
+		spin_unlock(&vm->idr_lock);
+		dev_err(hdev->dev, "no match for handle %u\n", handle);
+		return -EINVAL;
+	}
+
+	*paddr = phys_pg_pack->pages[0];
+
+	spin_unlock(&vm->idr_lock);
+
+	return 0;
+}
+
+/*
+ * map_device_va - map the given memory
+ *
+ * @ctx	         : current context
+ * @args         : host parameters with handle/host virtual address
+ * @device_addr	 : pointer to result device virtual address
+ *
+ * This function does the following:
+ * - If given a physical device memory handle, map to a device virtual block
+ *   and return the start address of this block
+ * - If given a host virtual address and size, find the related physical pages,
+ *   map a device virtual block to this pages and return the start address of
+ *   this block
+ */
+static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
+		u64 *device_addr)
+{
+	struct hl_device *hdev = ctx->hdev;
+	struct hl_vm *vm = &hdev->vm;
+	struct hl_vm_phys_pg_pack *phys_pg_pack;
+	struct hl_userptr *userptr = NULL;
+	struct hl_vm_hash_node *hnode;
+	enum vm_type_t *vm_type;
+	u64 ret_vaddr, hint_addr;
+	u32 handle = 0;
+	int rc;
+	bool is_userptr = args->flags & HL_MEM_USERPTR;
+
+	/* Assume failure */
+	*device_addr = 0;
+
+	if (is_userptr) {
+		rc = get_userptr_from_host_va(hdev, args, &userptr);
+		if (rc) {
+			dev_err(hdev->dev, "failed to get userptr from va\n");
+			return rc;
+		}
+
+		rc = init_phys_pg_pack_from_userptr(ctx, userptr,
+				&phys_pg_pack);
+		if (rc) {
+			dev_err(hdev->dev,
+				"unable to init page pack for vaddr 0x%llx\n",
+				args->map_host.host_virt_addr);
+			goto init_page_pack_err;
+		}
+
+		vm_type = (enum vm_type_t *) userptr;
+		hint_addr = args->map_host.hint_addr;
+	} else {
+		handle = lower_32_bits(args->map_device.handle);
+
+		spin_lock(&vm->idr_lock);
+		phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, handle);
+		if (!phys_pg_pack) {
+			spin_unlock(&vm->idr_lock);
+			dev_err(hdev->dev,
+				"no match for handle %u\n", handle);
+			return -EINVAL;
+		}
+
+		/* increment now to avoid freeing device memory while mapping */
+		atomic_inc(&phys_pg_pack->mapping_cnt);
+
+		spin_unlock(&vm->idr_lock);
+
+		vm_type = (enum vm_type_t *) phys_pg_pack;
+
+		hint_addr = args->map_device.hint_addr;
+	}
+
+	/*
+	 * relevant for mapping device physical memory only, as host memory is
+	 * implicitly shared
+	 */
+	if (!is_userptr && !(phys_pg_pack->flags & HL_MEM_SHARED) &&
+			phys_pg_pack->asid != ctx->asid) {
+		dev_err(hdev->dev,
+			"Failed to map memory, handle %u is not shared\n",
+			handle);
+		rc = -EPERM;
+		goto shared_err;
+	}
+
+	hnode = kzalloc(sizeof(*hnode), GFP_KERNEL);
+	if (!hnode) {
+		rc = -ENOMEM;
+		goto hnode_err;
+	}
+
+	ret_vaddr = get_va_block(hdev,
+			is_userptr ? &ctx->host_va_range : &ctx->dram_va_range,
+			phys_pg_pack->total_size, hint_addr, is_userptr);
+	if (!ret_vaddr) {
+		dev_err(hdev->dev, "no available va block for handle %u\n",
+				handle);
+		rc = -ENOMEM;
+		goto va_block_err;
+	}
+
+	mutex_lock(&ctx->mmu_lock);
+
+	rc = map_phys_page_pack(ctx, ret_vaddr, phys_pg_pack);
+	if (rc) {
+		mutex_unlock(&ctx->mmu_lock);
+		dev_err(hdev->dev, "mapping page pack failed for handle %u\n",
+				handle);
+		goto map_err;
+	}
+
+	hdev->asic_funcs->mmu_invalidate_cache_range(hdev, false, ctx->asid,
+			ret_vaddr, phys_pg_pack->total_size);
+
+	mutex_unlock(&ctx->mmu_lock);
+
+	ret_vaddr += phys_pg_pack->offset;
+
+	hnode->ptr = vm_type;
+	hnode->vaddr = ret_vaddr;
+
+	mutex_lock(&ctx->mem_hash_lock);
+	hash_add(ctx->mem_hash, &hnode->node, ret_vaddr);
+	mutex_unlock(&ctx->mem_hash_lock);
+
+	*device_addr = ret_vaddr;
+
+	if (is_userptr)
+		free_phys_pg_pack(hdev, phys_pg_pack);
+
+	return 0;
+
+map_err:
+	if (add_va_block(hdev,
+			is_userptr ? &ctx->host_va_range : &ctx->dram_va_range,
+			ret_vaddr,
+			ret_vaddr + phys_pg_pack->total_size - 1))
+		dev_warn(hdev->dev,
+			"release va block failed for handle 0x%x, vaddr: 0x%llx\n",
+				handle, ret_vaddr);
+
+va_block_err:
+	kfree(hnode);
+hnode_err:
+shared_err:
+	atomic_dec(&phys_pg_pack->mapping_cnt);
+	if (is_userptr)
+		free_phys_pg_pack(hdev, phys_pg_pack);
+init_page_pack_err:
+	if (is_userptr)
+		free_userptr(hdev, userptr);
+
+	return rc;
+}
+
+/*
+ * unmap_device_va      - unmap the given device virtual address
+ *
+ * @ctx                 : current context
+ * @vaddr               : device virtual address to unmap
+ *
+ * This function does the following:
+ * - Unmap the physical pages related to the given virtual address
+ * - return the device virtual block to the virtual block list
+ */
+static int unmap_device_va(struct hl_ctx *ctx, u64 vaddr)
+{
+	struct hl_device *hdev = ctx->hdev;
+	struct hl_vm_phys_pg_pack *phys_pg_pack = NULL;
+	struct hl_vm_hash_node *hnode = NULL;
+	struct hl_userptr *userptr = NULL;
+	enum vm_type_t *vm_type;
+	u64 next_vaddr;
+	u32 page_size;
+	bool is_userptr;
+	int i, rc;
+
+	/* protect from double entrance */
+	mutex_lock(&ctx->mem_hash_lock);
+	hash_for_each_possible(ctx->mem_hash, hnode, node, (unsigned long)vaddr)
+		if (vaddr == hnode->vaddr)
+			break;
+
+	if (!hnode) {
+		mutex_unlock(&ctx->mem_hash_lock);
+		dev_err(hdev->dev,
+			"unmap failed, no mem hnode for vaddr 0x%llx\n",
+			vaddr);
+		return -EINVAL;
+	}
+
+	hash_del(&hnode->node);
+	mutex_unlock(&ctx->mem_hash_lock);
+
+	vm_type = hnode->ptr;
+
+	if (*vm_type == VM_TYPE_USERPTR) {
+		is_userptr = true;
+		userptr = hnode->ptr;
+		rc = init_phys_pg_pack_from_userptr(ctx, userptr,
+				&phys_pg_pack);
+		if (rc) {
+			dev_err(hdev->dev,
+				"unable to init page pack for vaddr 0x%llx\n",
+				vaddr);
+			goto vm_type_err;
+		}
+	} else if (*vm_type == VM_TYPE_PHYS_PACK) {
+		is_userptr = false;
+		phys_pg_pack = hnode->ptr;
+	} else {
+		dev_warn(hdev->dev,
+			"unmap failed, unknown vm desc for vaddr 0x%llx\n",
+				vaddr);
+		rc = -EFAULT;
+		goto vm_type_err;
+	}
+
+	if (atomic_read(&phys_pg_pack->mapping_cnt) == 0) {
+		dev_err(hdev->dev, "vaddr 0x%llx is not mapped\n", vaddr);
+		rc = -EINVAL;
+		goto mapping_cnt_err;
+	}
+
+	page_size = phys_pg_pack->page_size;
+	vaddr &= ~(((u64) page_size) - 1);
+
+	next_vaddr = vaddr;
+
+	mutex_lock(&ctx->mmu_lock);
+
+	for (i = 0 ; i < phys_pg_pack->npages ; i++, next_vaddr += page_size)
+		if (hl_mmu_unmap(ctx, next_vaddr, page_size))
+			dev_warn_ratelimited(hdev->dev,
+				"unmap failed for vaddr: 0x%llx\n", next_vaddr);
+
+	hdev->asic_funcs->mmu_invalidate_cache_range(hdev, true, ctx->asid,
+			vaddr, phys_pg_pack->total_size);
+
+	mutex_unlock(&ctx->mmu_lock);
+
+	if (add_va_block(hdev,
+			is_userptr ? &ctx->host_va_range : &ctx->dram_va_range,
+			vaddr,
+			vaddr + phys_pg_pack->total_size - 1))
+		dev_warn(hdev->dev, "add va block failed for vaddr: 0x%llx\n",
+				vaddr);
+
+	atomic_dec(&phys_pg_pack->mapping_cnt);
+	kfree(hnode);
+
+	if (is_userptr) {
+		free_phys_pg_pack(hdev, phys_pg_pack);
+		free_userptr(hdev, userptr);
+	}
+
+	return 0;
+
+mapping_cnt_err:
+	if (is_userptr)
+		free_phys_pg_pack(hdev, phys_pg_pack);
+vm_type_err:
+	mutex_lock(&ctx->mem_hash_lock);
+	hash_add(ctx->mem_hash, &hnode->node, vaddr);
+	mutex_unlock(&ctx->mem_hash_lock);
+
+	return rc;
+}
+
+int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data)
+{
+	union hl_mem_args *args = data;
+	struct hl_device *hdev = hpriv->hdev;
+	struct hl_ctx *ctx = hpriv->ctx;
+	u64 device_addr = 0;
+	u32 handle = 0;
+	int rc;
+
+	if (hl_device_disabled_or_in_reset(hdev)) {
+		dev_warn_ratelimited(hdev->dev,
+			"Device is disabled or in reset. Can't execute memory IOCTL\n");
+		return -EBUSY;
+	}
+
+	if (hdev->mmu_enable) {
+		switch (args->in.op) {
+		case HL_MEM_OP_ALLOC:
+			if (!hdev->dram_supports_virtual_memory) {
+				dev_err(hdev->dev,
+					"DRAM alloc is not supported\n");
+				rc = -EINVAL;
+				goto out;
+			}
+			if (args->in.alloc.mem_size == 0) {
+				dev_err(hdev->dev,
+					"alloc size must be larger than 0\n");
+				rc = -EINVAL;
+				goto out;
+			}
+			rc = alloc_device_memory(ctx, &args->in, &handle);
+
+			memset(args, 0, sizeof(*args));
+			args->out.handle = (__u64) handle;
+			break;
+
+		case HL_MEM_OP_FREE:
+			if (!hdev->dram_supports_virtual_memory) {
+				dev_err(hdev->dev,
+					"DRAM free is not supported\n");
+				rc = -EINVAL;
+				goto out;
+			}
+			rc = free_device_memory(ctx, args->in.free.handle);
+			break;
+
+		case HL_MEM_OP_MAP:
+			rc = map_device_va(ctx, &args->in, &device_addr);
+
+			memset(args, 0, sizeof(*args));
+			args->out.device_virt_addr = device_addr;
+			break;
+
+		case HL_MEM_OP_UNMAP:
+			rc = unmap_device_va(ctx,
+					args->in.unmap.device_virt_addr);
+			break;
+
+		default:
+			dev_err(hdev->dev, "Unknown opcode for memory IOCTL\n");
+			rc = -ENOTTY;
+			break;
+		}
+	} else {
+		switch (args->in.op) {
+		case HL_MEM_OP_ALLOC:
+			if (args->in.alloc.mem_size == 0) {
+				dev_err(hdev->dev,
+					"alloc size must be larger than 0\n");
+				rc = -EINVAL;
+				goto out;
+			}
+
+			/* Force contiguous as there are no real MMU
+			 * translations to overcome physical memory gaps
+			 */
+			args->in.flags |= HL_MEM_CONTIGUOUS;
+			rc = alloc_device_memory(ctx, &args->in, &handle);
+
+			memset(args, 0, sizeof(*args));
+			args->out.handle = (__u64) handle;
+			break;
+
+		case HL_MEM_OP_FREE:
+			rc = free_device_memory(ctx, args->in.free.handle);
+			break;
+
+		case HL_MEM_OP_MAP:
+			if (args->in.flags & HL_MEM_USERPTR) {
+				device_addr = args->in.map_host.host_virt_addr;
+				rc = 0;
+			} else {
+				rc = get_paddr_from_handle(ctx, &args->in,
+						&device_addr);
+			}
+
+			memset(args, 0, sizeof(*args));
+			args->out.device_virt_addr = device_addr;
+			break;
+
+		case HL_MEM_OP_UNMAP:
+			rc = 0;
+			break;
+
+		default:
+			dev_err(hdev->dev, "Unknown opcode for memory IOCTL\n");
+			rc = -ENOTTY;
+			break;
+		}
+	}
+
+out:
+	return rc;
+}
 
 /*
  * hl_pin_host_memory - pins a chunk of host memory
@@ -196,3 +1384,332 @@ bool hl_userptr_is_pinned(struct hl_device *hdev, u64 addr,
 
 	return false;
 }
+
+/*
+ * hl_va_range_init - initialize virtual addresses range
+ *
+ * @hdev                : pointer to the habanalabs device structure
+ * @va_range            : pointer to the range to initialize
+ * @start               : range start address
+ * @end                 : range end address
+ *
+ * This function does the following:
+ * - Initializes the virtual addresses list of the given range with the given
+ *   addresses.
+ */
+static int hl_va_range_init(struct hl_device *hdev,
+		struct hl_va_range *va_range, u64 start, u64 end)
+{
+	int rc;
+
+	INIT_LIST_HEAD(&va_range->list);
+
+	/* PAGE_SIZE alignment */
+
+	if (start & (PAGE_SIZE - 1)) {
+		start &= PAGE_MASK;
+		start += PAGE_SIZE;
+	}
+
+	if (end & (PAGE_SIZE - 1))
+		end &= PAGE_MASK;
+
+	if (start >= end) {
+		dev_err(hdev->dev, "too small vm range for va list\n");
+		return -EFAULT;
+	}
+
+	rc = add_va_block(hdev, va_range, start, end);
+
+	if (rc) {
+		dev_err(hdev->dev, "Failed to init host va list\n");
+		return rc;
+	}
+
+	va_range->start_addr = start;
+	va_range->end_addr = end;
+
+	return 0;
+}
+
+/*
+ * hl_vm_ctx_init_with_ranges - initialize virtual memory for context
+ *
+ * @ctx                 : pointer to the habanalabs context structure
+ * @host_range_start    : host virtual addresses range start
+ * @host_range_end      : host virtual addresses range end
+ * @dram_range_start    : dram virtual addresses range start
+ * @dram_range_end      : dram virtual addresses range end
+ *
+ * This function initializes the following:
+ * - MMU for context
+ * - Virtual address to area descriptor hashtable
+ * - Virtual block list of available virtual memory
+ */
+int hl_vm_ctx_init_with_ranges(struct hl_ctx *ctx, u64 host_range_start,
+				u64 host_range_end, u64 dram_range_start,
+				u64 dram_range_end)
+{
+	struct hl_device *hdev = ctx->hdev;
+	int rc;
+
+	hl_mmu_ctx_init(ctx);
+
+	mutex_init(&ctx->mem_hash_lock);
+	hash_init(ctx->mem_hash);
+
+	mutex_init(&ctx->host_va_range.lock);
+
+	rc = hl_va_range_init(hdev, &ctx->host_va_range, host_range_start,
+			host_range_end);
+	if (rc) {
+		dev_err(hdev->dev, "failed to init host vm range\n");
+		goto host_vm_err;
+	}
+
+	mutex_init(&ctx->dram_va_range.lock);
+
+	rc = hl_va_range_init(hdev, &ctx->dram_va_range, dram_range_start,
+			dram_range_end);
+	if (rc) {
+		dev_err(hdev->dev, "failed to init dram vm range\n");
+		goto dram_vm_err;
+	}
+
+	return 0;
+
+dram_vm_err:
+	mutex_destroy(&ctx->dram_va_range.lock);
+
+	mutex_lock(&ctx->host_va_range.lock);
+	clear_va_list_locked(hdev, &ctx->host_va_range.list);
+	mutex_unlock(&ctx->host_va_range.lock);
+host_vm_err:
+	mutex_destroy(&ctx->host_va_range.lock);
+	mutex_destroy(&ctx->mem_hash_lock);
+	hl_mmu_ctx_fini(ctx);
+
+	return rc;
+}
+
+int hl_vm_ctx_init(struct hl_ctx *ctx)
+{
+	struct asic_fixed_properties *prop = &ctx->hdev->asic_prop;
+	u64 host_range_start, host_range_end, dram_range_start,
+		dram_range_end;
+
+	atomic64_set(&ctx->dram_phys_mem, 0);
+
+	/*
+	 * - If MMU is enabled, init the ranges as usual.
+	 * - If MMU is disabled, in case of host mapping, the returned address
+	 *   is the given one.
+	 *   In case of DRAM mapping, the returned address is the physical
+	 *   address of the memory related to the given handle.
+	 */
+	if (ctx->hdev->mmu_enable) {
+		dram_range_start = prop->va_space_dram_start_address;
+		dram_range_end = prop->va_space_dram_end_address;
+		host_range_start = prop->va_space_host_start_address;
+		host_range_end = prop->va_space_host_end_address;
+	} else {
+		dram_range_start = prop->dram_user_base_address;
+		dram_range_end = prop->dram_end_address;
+		host_range_start = prop->dram_user_base_address;
+		host_range_end = prop->dram_end_address;
+	}
+
+	return hl_vm_ctx_init_with_ranges(ctx, host_range_start, host_range_end,
+			dram_range_start, dram_range_end);
+}
+
+/*
+ * hl_va_range_fini     - clear a virtual addresses range
+ *
+ * @hdev                : pointer to the habanalabs structure
+ * va_range             : pointer to virtual addresses range
+ *
+ * This function initializes the following:
+ * - Checks that the given range contains the whole initial range
+ * - Frees the virtual addresses block list and its lock
+ */
+static void hl_va_range_fini(struct hl_device *hdev,
+		struct hl_va_range *va_range)
+{
+	struct hl_vm_va_block *va_block;
+
+	if (list_empty(&va_range->list)) {
+		dev_warn(hdev->dev,
+				"va list should not be empty on cleanup!\n");
+		goto out;
+	}
+
+	if (!list_is_singular(&va_range->list)) {
+		dev_warn(hdev->dev,
+			"va list should not contain multiple blocks on cleanup!\n");
+		goto free_va_list;
+	}
+
+	va_block = list_first_entry(&va_range->list, typeof(*va_block), node);
+
+	if (va_block->start != va_range->start_addr ||
+		va_block->end != va_range->end_addr) {
+		dev_warn(hdev->dev,
+			"wrong va block on cleanup, from 0x%llx to 0x%llx\n",
+				va_block->start, va_block->end);
+		goto free_va_list;
+	}
+
+free_va_list:
+	mutex_lock(&va_range->lock);
+	clear_va_list_locked(hdev, &va_range->list);
+	mutex_unlock(&va_range->lock);
+
+out:
+	mutex_destroy(&va_range->lock);
+}
+
+/*
+ * hl_vm_ctx_fini       - virtual memory teardown of context
+ *
+ * @ctx                 : pointer to the habanalabs context structure
+ *
+ * This function perform teardown the following:
+ * - Virtual block list of available virtual memory
+ * - Virtual address to area descriptor hashtable
+ * - MMU for context
+ *
+ * In addition this function does the following:
+ * - Unmaps the existing hashtable nodes if the hashtable is not empty. The
+ *   hashtable should be empty as no valid mappings should exist at this
+ *   point.
+ * - Frees any existing physical page list from the idr which relates to the
+ *   current context asid.
+ * - This function checks the virtual block list for correctness. At this point
+ *   the list should contain one element which describes the whole virtual
+ *   memory range of the context. Otherwise, a warning is printed.
+ */
+void hl_vm_ctx_fini(struct hl_ctx *ctx)
+{
+	struct hl_device *hdev = ctx->hdev;
+	struct hl_vm *vm = &hdev->vm;
+	struct hl_vm_phys_pg_pack *phys_pg_list;
+	struct hl_vm_hash_node *hnode;
+	struct hlist_node *tmp_node;
+	int i;
+
+	if (!hash_empty(ctx->mem_hash))
+		dev_notice(hdev->dev, "ctx is freed while it has va in use\n");
+
+	hash_for_each_safe(ctx->mem_hash, i, tmp_node, hnode, node) {
+		dev_dbg(hdev->dev,
+			"hl_mem_hash_node of vaddr 0x%llx of asid %d is still alive\n",
+			hnode->vaddr, ctx->asid);
+		unmap_device_va(ctx, hnode->vaddr);
+	}
+
+	spin_lock(&vm->idr_lock);
+	idr_for_each_entry(&vm->phys_pg_pack_handles, phys_pg_list, i)
+		if (phys_pg_list->asid == ctx->asid) {
+			dev_dbg(hdev->dev,
+				"page list 0x%p of asid %d is still alive\n",
+				phys_pg_list, ctx->asid);
+			free_phys_pg_pack(hdev, phys_pg_list);
+			idr_remove(&vm->phys_pg_pack_handles, i);
+		}
+	spin_unlock(&vm->idr_lock);
+
+	hl_va_range_fini(hdev, &ctx->dram_va_range);
+	hl_va_range_fini(hdev, &ctx->host_va_range);
+
+	mutex_destroy(&ctx->mem_hash_lock);
+	hl_mmu_ctx_fini(ctx);
+}
+
+/*
+ * hl_vm_init           - initialize virtual memory module
+ *
+ * @hdev                : pointer to the habanalabs device structure
+ *
+ * This function initializes the following:
+ * - MMU module
+ * - DRAM physical pages pool of 2MB
+ * - Idr for device memory allocation handles
+ */
+int hl_vm_init(struct hl_device *hdev)
+{
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	struct hl_vm *vm = &hdev->vm;
+	int rc;
+
+	rc = hl_mmu_init(hdev);
+	if (rc) {
+		dev_err(hdev->dev, "Failed to init MMU\n");
+		return rc;
+	}
+
+	vm->dram_pg_pool = gen_pool_create(__ffs(prop->dram_page_size), -1);
+	if (!vm->dram_pg_pool) {
+		dev_err(hdev->dev, "Failed to create dram page pool\n");
+		rc = -ENOMEM;
+		goto pool_create_err;
+	}
+
+	kref_init(&vm->dram_pg_pool_refcount);
+
+	rc = gen_pool_add(vm->dram_pg_pool, prop->dram_user_base_address,
+			prop->dram_end_address - prop->dram_user_base_address,
+			-1);
+
+	if (rc) {
+		dev_err(hdev->dev,
+			"Failed to add memory to dram page pool %d\n", rc);
+		goto pool_add_err;
+	}
+
+	spin_lock_init(&vm->idr_lock);
+	idr_init(&vm->phys_pg_pack_handles);
+
+	atomic64_set(&hdev->dram_used_mem, 0);
+
+	vm->init_done = true;
+
+	return 0;
+
+pool_add_err:
+	gen_pool_destroy(vm->dram_pg_pool);
+pool_create_err:
+	hl_mmu_fini(hdev);
+
+	return rc;
+}
+
+/*
+ * hl_vm_fini           - virtual memory module teardown
+ *
+ * @hdev                : pointer to the habanalabs device structure
+ *
+ * This function perform teardown to the following:
+ * - Idr for device memory allocation handles
+ * - DRAM physical pages pool of 2MB
+ * - MMU module
+ */
+void hl_vm_fini(struct hl_device *hdev)
+{
+	struct hl_vm *vm = &hdev->vm;
+
+	if (!vm->init_done)
+		return;
+
+	/*
+	 * At this point all the contexts should be freed and hence no DRAM
+	 * memory should be in use. Hence the DRAM pool should be freed here.
+	 */
+	if (kref_put(&vm->dram_pg_pool_refcount, dram_pg_pool_do_release) != 1)
+		dev_warn(hdev->dev, "dram_pg_pool was not destroyed on %s\n",
+				__func__);
+
+	hl_mmu_fini(hdev);
+
+	vm->init_done = false;
+}
diff --git a/drivers/misc/habanalabs/mmu.c b/drivers/misc/habanalabs/mmu.c
new file mode 100644
index 000000000000..79c70d92e74b
--- /dev/null
+++ b/drivers/misc/habanalabs/mmu.c
@@ -0,0 +1,691 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright 2016-2019 HabanaLabs, Ltd.
+ * All Rights Reserved.
+ */
+
+#include "habanalabs.h"
+#include "include/hw_ip/mmu/mmu_general.h"
+
+#include <linux/genalloc.h>
+#include <linux/slab.h>
+
+static struct pgt_info *get_pgt_info(struct hl_ctx *ctx, u64 addr)
+{
+	struct pgt_info *pgt_info = NULL;
+
+	hash_for_each_possible(ctx->mmu_hash, pgt_info, node,
+				(unsigned long) addr)
+		if (addr == pgt_info->addr)
+			break;
+
+	return pgt_info;
+}
+
+static void free_hop(struct hl_ctx *ctx, u64 hop_addr)
+{
+	struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr);
+
+	gen_pool_free(pgt_info->ctx->hdev->mmu_pgt_pool, pgt_info->addr,
+			ctx->hdev->asic_prop.mmu_hop_table_size);
+	hash_del(&pgt_info->node);
+
+	kfree(pgt_info);
+}
+
+static u64 alloc_hop(struct hl_ctx *ctx)
+{
+	struct hl_device *hdev = ctx->hdev;
+	struct pgt_info *pgt_info;
+	u64 addr;
+
+	pgt_info = kmalloc(sizeof(*pgt_info), GFP_KERNEL);
+	if (!pgt_info)
+		return ULLONG_MAX;
+
+	addr = (u64) gen_pool_alloc(hdev->mmu_pgt_pool,
+			hdev->asic_prop.mmu_hop_table_size);
+	if (!addr) {
+		dev_err(hdev->dev, "failed to allocate page\n");
+		kfree(pgt_info);
+		return ULLONG_MAX;
+	}
+
+	pgt_info->addr = addr;
+	pgt_info->ctx = ctx;
+	pgt_info->num_of_ptes = 0;
+	hash_add(ctx->mmu_hash, &pgt_info->node, addr);
+
+	return addr;
+}
+
+static inline void clear_pte(struct hl_device *hdev, u64 pte_addr)
+{
+	/* clear the last and present bits */
+	hdev->asic_funcs->write_pte(hdev, pte_addr, 0);
+}
+
+static inline void get_pte(struct hl_ctx *ctx, u64 hop_addr)
+{
+	get_pgt_info(ctx, hop_addr)->num_of_ptes++;
+}
+
+/*
+ * put_pte - decrement the num of ptes and free the hop if possible
+ *
+ * @ctx: pointer to the context structure
+ * @hop_addr: addr of the hop
+ *
+ * This function returns the number of ptes left on this hop. If the number is
+ * 0, it means the pte was freed.
+ */
+static inline int put_pte(struct hl_ctx *ctx, u64 hop_addr)
+{
+	struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr);
+	int num_of_ptes_left;
+
+	pgt_info->num_of_ptes--;
+
+	/*
+	 * Need to save the number of ptes left because free_hop might free
+	 * the pgt_info
+	 */
+	num_of_ptes_left = pgt_info->num_of_ptes;
+	if (!num_of_ptes_left)
+		free_hop(ctx, hop_addr);
+
+	return num_of_ptes_left;
+}
+
+static inline u64 get_hop0_addr(struct hl_ctx *ctx)
+{
+	return ctx->hdev->asic_prop.mmu_pgt_addr +
+			(ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size);
+}
+
+static inline u64 get_hopN_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
+					u64 virt_addr, u64 mask, u64 shift)
+{
+	return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
+			((virt_addr & mask) >> shift);
+}
+
+static inline u64 get_hop0_pte_addr(struct hl_ctx *ctx, u64 hop_addr, u64 vaddr)
+{
+	return get_hopN_pte_addr(ctx, hop_addr, vaddr, HOP0_MASK, HOP0_SHIFT);
+}
+
+static inline u64 get_hop1_pte_addr(struct hl_ctx *ctx, u64 hop_addr, u64 vaddr)
+{
+	return get_hopN_pte_addr(ctx, hop_addr, vaddr, HOP1_MASK, HOP1_SHIFT);
+}
+
+static inline u64 get_hop2_pte_addr(struct hl_ctx *ctx, u64 hop_addr, u64 vaddr)
+{
+	return get_hopN_pte_addr(ctx, hop_addr, vaddr, HOP2_MASK, HOP2_SHIFT);
+}
+
+static inline u64 get_hop3_pte_addr(struct hl_ctx *ctx, u64 hop_addr, u64 vaddr)
+{
+	return get_hopN_pte_addr(ctx, hop_addr, vaddr, HOP3_MASK, HOP3_SHIFT);
+}
+
+static inline u64 get_hop4_pte_addr(struct hl_ctx *ctx, u64 hop_addr, u64 vaddr)
+{
+	return get_hopN_pte_addr(ctx, hop_addr, vaddr, HOP4_MASK, HOP4_SHIFT);
+}
+
+static inline u64 get_next_hop_addr(u64 curr_pte)
+{
+	if (curr_pte & PAGE_PRESENT_MASK)
+		return curr_pte & PHYS_ADDR_MASK;
+	else
+		return ULLONG_MAX;
+}
+
+static inline u64 get_alloc_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte,
+						bool *is_new_hop)
+{
+	u64 hop_addr = get_next_hop_addr(curr_pte);
+
+	if (hop_addr == ULLONG_MAX) {
+		hop_addr = alloc_hop(ctx);
+		*is_new_hop = true;
+	}
+
+	return hop_addr;
+}
+
+/*
+ * hl_mmu_init - init the mmu module
+ *
+ * @hdev: pointer to the habanalabs device structure
+ *
+ * This function does the following:
+ * - Allocate max_asid zeroed hop0 pgts so no mapping is available
+ * - Enable mmu in hw
+ * - Invalidate the mmu cache
+ * - Create a pool of pages for pgts
+ * - Returns 0 on success
+ *
+ * This function depends on DMA QMAN to be working!
+ */
+int hl_mmu_init(struct hl_device *hdev)
+{
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	int rc;
+
+	if (!hdev->mmu_enable)
+		return 0;
+
+	/* MMU HW init was already done in device hw_init() */
+
+	mutex_init(&hdev->mmu_cache_lock);
+
+	hdev->mmu_pgt_pool =
+			gen_pool_create(__ffs(prop->mmu_hop_table_size), -1);
+
+	if (!hdev->mmu_pgt_pool) {
+		dev_err(hdev->dev, "Failed to create page gen pool\n");
+		rc = -ENOMEM;
+		goto err_pool_create;
+	}
+
+	rc = gen_pool_add(hdev->mmu_pgt_pool, prop->mmu_pgt_addr +
+			prop->mmu_hop0_tables_total_size,
+			prop->mmu_pgt_size - prop->mmu_hop0_tables_total_size,
+			-1);
+	if (rc) {
+		dev_err(hdev->dev, "Failed to add memory to page gen pool\n");
+		goto err_pool_add;
+	}
+
+	return 0;
+
+err_pool_add:
+	gen_pool_destroy(hdev->mmu_pgt_pool);
+err_pool_create:
+	mutex_destroy(&hdev->mmu_cache_lock);
+
+	return rc;
+}
+
+/*
+ * hl_mmu_fini - release the mmu module.
+ *
+ * @hdev: pointer to the habanalabs device structure
+ *
+ * This function does the following:
+ * - Disable mmu in hw
+ * - free the pgts pool
+ *
+ * All ctxs should be freed before calling this func
+ */
+void hl_mmu_fini(struct hl_device *hdev)
+{
+	if (!hdev->mmu_enable)
+		return;
+
+	gen_pool_destroy(hdev->mmu_pgt_pool);
+
+	mutex_destroy(&hdev->mmu_cache_lock);
+
+	/* MMU HW fini will be done in device hw_fini() */
+}
+
+/*
+ * hl_mmu_ctx_init - init a ctx for using the mmu module
+ *
+ * @ctx: pointer to the context structure
+ *
+ * This function does the following:
+ * - Init a mutex to protect the concurrent mapping flow
+ * - Init a hash to hold all pgts related to this ctx
+ */
+void hl_mmu_ctx_init(struct hl_ctx *ctx)
+{
+	if (!ctx->hdev->mmu_enable)
+		return;
+
+	mutex_init(&ctx->mmu_lock);
+	hash_init(ctx->mmu_hash);
+}
+
+/*
+ * hl_mmu_ctx_fini - disable a ctx from using the mmu module
+ *
+ * @ctx: pointer to the context structure
+ *
+ * This function does the following:
+ * - Free any pgts which were not freed yet
+ * - Free the mutex
+ */
+void hl_mmu_ctx_fini(struct hl_ctx *ctx)
+{
+	struct pgt_info *pgt_info;
+	struct hlist_node *tmp;
+	int i;
+
+	if (!ctx->hdev->mmu_enable)
+		return;
+
+	if (!hash_empty(ctx->mmu_hash))
+		dev_err(ctx->hdev->dev,
+				"ctx is freed while it has pgts in use\n");
+
+	hash_for_each_safe(ctx->mmu_hash, i, tmp, pgt_info, node) {
+		dev_err(ctx->hdev->dev,
+			"pgt_info of addr 0x%llx of asid %d was not destroyed, num_ptes: %d\n",
+			pgt_info->addr, ctx->asid, pgt_info->num_of_ptes);
+		free_hop(ctx, pgt_info->addr);
+	}
+
+	mutex_destroy(&ctx->mmu_lock);
+}
+
+static int _hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr)
+{
+	struct hl_device *hdev = ctx->hdev;
+	u64 hop0_addr = 0, hop0_pte_addr = 0,
+		hop1_addr = 0, hop1_pte_addr = 0,
+		hop2_addr = 0, hop2_pte_addr = 0,
+		hop3_addr = 0, hop3_pte_addr = 0,
+		hop4_addr = 0, hop4_pte_addr = 0,
+		curr_pte;
+	int clear_hop3 = 1;
+
+	hop0_addr = get_hop0_addr(ctx);
+
+	hop0_pte_addr = get_hop0_pte_addr(ctx, hop0_addr, virt_addr);
+
+	curr_pte = hdev->asic_funcs->read_pte(hdev, hop0_pte_addr);
+
+	hop1_addr = get_next_hop_addr(curr_pte);
+
+	if (hop1_addr == ULLONG_MAX)
+		goto not_mapped;
+
+	hop1_pte_addr = get_hop1_pte_addr(ctx, hop1_addr, virt_addr);
+
+	curr_pte = hdev->asic_funcs->read_pte(hdev, hop1_pte_addr);
+
+	hop2_addr = get_next_hop_addr(curr_pte);
+
+	if (hop2_addr == ULLONG_MAX)
+		goto not_mapped;
+
+	hop2_pte_addr = get_hop2_pte_addr(ctx, hop2_addr, virt_addr);
+
+	curr_pte = hdev->asic_funcs->read_pte(hdev, hop2_pte_addr);
+
+	hop3_addr = get_next_hop_addr(curr_pte);
+
+	if (hop3_addr == ULLONG_MAX)
+		goto not_mapped;
+
+	hop3_pte_addr = get_hop3_pte_addr(ctx, hop3_addr, virt_addr);
+
+	curr_pte = hdev->asic_funcs->read_pte(hdev, hop3_pte_addr);
+
+	if (!(curr_pte & LAST_MASK)) {
+		hop4_addr = get_next_hop_addr(curr_pte);
+
+		if (hop4_addr == ULLONG_MAX)
+			goto not_mapped;
+
+		hop4_pte_addr = get_hop4_pte_addr(ctx, hop4_addr, virt_addr);
+
+		curr_pte = hdev->asic_funcs->read_pte(hdev, hop4_pte_addr);
+
+		clear_hop3 = 0;
+	}
+
+	if (!(curr_pte & PAGE_PRESENT_MASK))
+		goto not_mapped;
+
+	clear_pte(hdev, hop4_addr ? hop4_pte_addr : hop3_pte_addr);
+
+	if (hop4_addr && !put_pte(ctx, hop4_addr))
+		clear_hop3 = 1;
+
+	if (!clear_hop3)
+		goto flush;
+	clear_pte(hdev, hop3_pte_addr);
+
+	if (put_pte(ctx, hop3_addr))
+		goto flush;
+	clear_pte(hdev, hop2_pte_addr);
+
+	if (put_pte(ctx, hop2_addr))
+		goto flush;
+	clear_pte(hdev, hop1_pte_addr);
+
+	if (put_pte(ctx, hop1_addr))
+		goto flush;
+	clear_pte(hdev, hop0_pte_addr);
+
+flush:
+	/* flush all writes from all cores to reach PCI */
+	mb();
+
+	hdev->asic_funcs->read_pte(hdev,
+				hop4_addr ? hop4_pte_addr : hop3_pte_addr);
+
+	return 0;
+
+not_mapped:
+	dev_err(hdev->dev, "virt addr 0x%llx is not mapped to phys addr\n",
+		virt_addr);
+
+	return -EINVAL;
+}
+
+/*
+ * hl_mmu_unmap - unmaps a virtual addr
+ *
+ * @ctx: pointer to the context structure
+ * @virt_addr: virt addr to map from
+ * @page_size: size of the page to unmap
+ *
+ * This function does the following:
+ * - Check that the virt addr is mapped
+ * - Unmap the virt addr and frees pgts if possible
+ * - Returns 0 on success, -EINVAL if the given addr is not mapped
+ *
+ * Because this function changes the page tables in the device and because it
+ * changes the MMU hash, it must be protected by a lock.
+ * However, because it maps only a single page, the lock should be implemented
+ * in a higher level in order to protect the entire mapping of the memory area
+ */
+int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size)
+{
+	struct hl_device *hdev = ctx->hdev;
+	u64 real_virt_addr;
+	u32 real_page_size, npages;
+	int i, rc;
+
+	if (!hdev->mmu_enable)
+		return 0;
+
+	/*
+	 * The H/W handles mapping of 4KB/2MB page. Hence if the host page size
+	 * is bigger, we break it to sub-pages and unmap them separately.
+	 */
+	if ((page_size % PAGE_SIZE_2MB) == 0) {
+		real_page_size = PAGE_SIZE_2MB;
+	} else if ((page_size % PAGE_SIZE_4KB) == 0) {
+		real_page_size = PAGE_SIZE_4KB;
+	} else {
+		dev_err(hdev->dev,
+			"page size of %u is not 4KB nor 2MB aligned, can't unmap\n",
+				page_size);
+
+		return -EFAULT;
+	}
+
+	npages = page_size / real_page_size;
+	real_virt_addr = virt_addr;
+
+	for (i = 0 ; i < npages ; i++) {
+		rc = _hl_mmu_unmap(ctx, real_virt_addr);
+		if (rc)
+			return rc;
+
+		real_virt_addr += real_page_size;
+	}
+
+	return 0;
+}
+
+static int _hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
+		u32 page_size)
+{
+	struct hl_device *hdev = ctx->hdev;
+	u64 hop0_addr = 0, hop0_pte_addr = 0,
+		hop1_addr = 0, hop1_pte_addr = 0,
+		hop2_addr = 0, hop2_pte_addr = 0,
+		hop3_addr = 0, hop3_pte_addr = 0,
+		hop4_addr = 0, hop4_pte_addr = 0,
+		curr_pte = 0;
+	bool hop1_new = false, hop2_new = false, hop3_new = false,
+		hop4_new = false, is_huge;
+	int rc = -ENOMEM;
+
+	/*
+	 * This mapping function can map a 4KB/2MB page. For 2MB page there are
+	 * only 3 hops rather than 4. Currently the DRAM allocation uses 2MB
+	 * pages only but user memory could have been allocated with one of the
+	 * two page sizes. Since this is a common code for all the three cases,
+	 * we need this hugs page check.
+	 */
+	is_huge = page_size == PAGE_SIZE_2MB;
+
+	hop0_addr = get_hop0_addr(ctx);
+
+	hop0_pte_addr = get_hop0_pte_addr(ctx, hop0_addr, virt_addr);
+
+	curr_pte = hdev->asic_funcs->read_pte(hdev, hop0_pte_addr);
+
+	hop1_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop1_new);
+
+	if (hop1_addr == ULLONG_MAX)
+		goto err;
+
+	hop1_pte_addr = get_hop1_pte_addr(ctx, hop1_addr, virt_addr);
+
+	curr_pte = hdev->asic_funcs->read_pte(hdev, hop1_pte_addr);
+
+	hop2_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop2_new);
+
+	if (hop2_addr == ULLONG_MAX)
+		goto err;
+
+	hop2_pte_addr = get_hop2_pte_addr(ctx, hop2_addr, virt_addr);
+
+	curr_pte = hdev->asic_funcs->read_pte(hdev, hop2_pte_addr);
+
+	hop3_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop3_new);
+
+	if (hop3_addr == ULLONG_MAX)
+		goto err;
+
+	hop3_pte_addr = get_hop3_pte_addr(ctx, hop3_addr, virt_addr);
+
+	curr_pte = hdev->asic_funcs->read_pte(hdev, hop3_pte_addr);
+
+	if (!is_huge) {
+		hop4_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop4_new);
+
+		if (hop4_addr == ULLONG_MAX)
+			goto err;
+
+		hop4_pte_addr = get_hop4_pte_addr(ctx, hop4_addr, virt_addr);
+
+		curr_pte = hdev->asic_funcs->read_pte(hdev, hop4_pte_addr);
+	}
+
+	if (curr_pte & PAGE_PRESENT_MASK) {
+		dev_err(hdev->dev,
+				"mapping already exists for virt_addr 0x%llx\n",
+					virt_addr);
+
+		dev_dbg(hdev->dev, "hop0 pte: 0x%llx (0x%llx)\n",
+				hdev->asic_funcs->read_pte(hdev, hop0_pte_addr),
+				hop0_pte_addr);
+		dev_dbg(hdev->dev, "hop1 pte: 0x%llx (0x%llx)\n",
+				hdev->asic_funcs->read_pte(hdev, hop1_pte_addr),
+				hop1_pte_addr);
+		dev_dbg(hdev->dev, "hop2 pte: 0x%llx (0x%llx)\n",
+				hdev->asic_funcs->read_pte(hdev, hop2_pte_addr),
+				hop2_pte_addr);
+		dev_dbg(hdev->dev, "hop3 pte: 0x%llx (0x%llx)\n",
+				hdev->asic_funcs->read_pte(hdev, hop3_pte_addr),
+				hop3_pte_addr);
+
+		if (!is_huge)
+			dev_dbg(hdev->dev, "hop4 pte: 0x%llx (0x%llx)\n",
+				hdev->asic_funcs->read_pte(hdev,
+							hop4_pte_addr),
+							hop4_pte_addr);
+
+		rc = EINVAL;
+		goto err;
+	}
+
+	curr_pte = (phys_addr & PTE_PHYS_ADDR_MASK) | LAST_MASK
+			| PAGE_PRESENT_MASK;
+
+	hdev->asic_funcs->write_pte(hdev,
+				is_huge ? hop3_pte_addr : hop4_pte_addr,
+				curr_pte);
+
+	if (hop1_new) {
+		curr_pte = (hop1_addr & PTE_PHYS_ADDR_MASK) |
+				PAGE_PRESENT_MASK;
+		ctx->hdev->asic_funcs->write_pte(ctx->hdev, hop0_pte_addr,
+				curr_pte);
+	}
+	if (hop2_new) {
+		curr_pte = (hop2_addr & PTE_PHYS_ADDR_MASK) |
+				PAGE_PRESENT_MASK;
+		ctx->hdev->asic_funcs->write_pte(ctx->hdev, hop1_pte_addr,
+				curr_pte);
+		get_pte(ctx, hop1_addr);
+	}
+	if (hop3_new) {
+		curr_pte = (hop3_addr & PTE_PHYS_ADDR_MASK) |
+				PAGE_PRESENT_MASK;
+		ctx->hdev->asic_funcs->write_pte(ctx->hdev, hop2_pte_addr,
+				curr_pte);
+		get_pte(ctx, hop2_addr);
+	}
+
+	if (!is_huge) {
+		if (hop4_new) {
+			curr_pte = (hop4_addr & PTE_PHYS_ADDR_MASK) |
+					PAGE_PRESENT_MASK;
+			ctx->hdev->asic_funcs->write_pte(ctx->hdev,
+					hop3_pte_addr, curr_pte);
+			get_pte(ctx, hop3_addr);
+		}
+
+		get_pte(ctx, hop4_addr);
+	} else {
+		get_pte(ctx, hop3_addr);
+	}
+
+	/* flush all writes from all cores to reach PCI */
+	mb();
+
+	hdev->asic_funcs->read_pte(hdev,
+				is_huge ? hop3_pte_addr : hop4_pte_addr);
+
+	return 0;
+
+err:
+	if (hop4_new)
+		free_hop(ctx, hop4_addr);
+	if (hop3_new)
+		free_hop(ctx, hop3_addr);
+	if (hop2_new)
+		free_hop(ctx, hop2_addr);
+	if (hop1_new)
+		free_hop(ctx, hop1_addr);
+
+	return rc;
+}
+
+/*
+ * hl_mmu_map - maps a virtual addr to physical addr
+ *
+ * @ctx: pointer to the context structure
+ * @virt_addr: virt addr to map from
+ * @phys_addr: phys addr to map to
+ * @page_size: physical page size
+ *
+ * This function does the following:
+ * - Check that the virt addr is not mapped
+ * - Allocate pgts as necessary in order to map the virt addr to the phys
+ * - Returns 0 on success, -EINVAL if addr is already mapped, or -ENOMEM.
+ *
+ * Because this function changes the page tables in the device and because it
+ * changes the MMU hash, it must be protected by a lock.
+ * However, because it maps only a single page, the lock should be implemented
+ * in a higher level in order to protect the entire mapping of the memory area
+ */
+int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size)
+{
+	struct hl_device *hdev = ctx->hdev;
+	u64 real_virt_addr;
+	u32 real_page_size, npages;
+	int i, rc, mapped_cnt = 0;
+
+	if (!hdev->mmu_enable)
+		return 0;
+
+	/*
+	 * The H/W handles mapping of 4KB/2MB page. Hence if the host page size
+	 * is bigger, we break it to sub-pages and map them separately.
+	 */
+	if ((page_size % PAGE_SIZE_2MB) == 0) {
+		real_page_size = PAGE_SIZE_2MB;
+	} else if ((page_size % PAGE_SIZE_4KB) == 0) {
+		real_page_size = PAGE_SIZE_4KB;
+	} else {
+		dev_err(hdev->dev,
+			"page size of %u is not 4KB nor 2MB aligned, can't map\n",
+				page_size);
+
+		return -EFAULT;
+	}
+
+	npages = page_size / real_page_size;
+	real_virt_addr = virt_addr;
+
+	for (i = 0 ; i < npages ; i++) {
+		rc = _hl_mmu_map(ctx, real_virt_addr, phys_addr,
+				real_page_size);
+		if (rc)
+			goto err;
+
+		real_virt_addr += real_page_size;
+		mapped_cnt++;
+	}
+
+	return 0;
+
+err:
+	real_virt_addr = virt_addr;
+	for (i = 0 ; i < mapped_cnt ; i++) {
+		if (_hl_mmu_unmap(ctx, real_virt_addr))
+			dev_warn_ratelimited(hdev->dev,
+				"failed to unmap va: 0x%llx\n", real_virt_addr);
+
+		real_virt_addr += real_page_size;
+	}
+
+	return rc;
+}
+
+/*
+ * hl_mmu_swap_out - marks all mapping of the given ctx as swapped out
+ *
+ * @ctx: pointer to the context structure
+ *
+ */
+void hl_mmu_swap_out(struct hl_ctx *ctx)
+{
+
+}
+
+/*
+ * hl_mmu_swap_in - marks all mapping of the given ctx as swapped in
+ *
+ * @ctx: pointer to the context structure
+ *
+ */
+void hl_mmu_swap_in(struct hl_ctx *ctx)
+{
+
+}
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index fba49417f607..9015043887d1 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -162,6 +162,108 @@ union hl_wait_cs_args {
 	struct hl_wait_cs_out out;
 };
 
+/* Opcode to alloc device memory */
+#define HL_MEM_OP_ALLOC			0
+/* Opcode to free previously allocated device memory */
+#define HL_MEM_OP_FREE			1
+/* Opcode to map host memory */
+#define HL_MEM_OP_MAP			2
+/* Opcode to unmap previously mapped host memory */
+#define HL_MEM_OP_UNMAP			3
+
+/* Memory flags */
+#define HL_MEM_CONTIGUOUS	0x1
+#define HL_MEM_SHARED		0x2
+#define HL_MEM_USERPTR		0x4
+
+struct hl_mem_in {
+	union {
+		/* HL_MEM_OP_ALLOC- allocate device memory */
+		struct {
+			/* Size to alloc */
+			__u32 mem_size;
+			__u32 pad;
+		} alloc;
+
+		/* HL_MEM_OP_FREE - free device memory */
+		struct {
+			/* Handle returned from HL_MEM_OP_ALLOC */
+			__u64 handle;
+		} free;
+
+		/* HL_MEM_OP_MAP - map device memory */
+		struct {
+			/*
+			 * Requested virtual address of mapped memory.
+			 * KMD will try to map the requested region to this
+			 * hint address, as long as the address is valid and
+			 * not already mapped. The user should check the
+			 * returned address of the IOCTL to make sure he got
+			 * the hint address. Passing 0 here means that KMD
+			 * will choose the address itself.
+			 */
+			__u64 hint_addr;
+			/* Handle returned from HL_MEM_OP_ALLOC */
+			__u64 handle;
+		} map_device;
+
+		/* HL_MEM_OP_MAP - map host memory */
+		struct {
+			/* Address of allocated host memory */
+			__u64 host_virt_addr;
+			/*
+			 * Requested virtual address of mapped memory.
+			 * KMD will try to map the requested region to this
+			 * hint address, as long as the address is valid and
+			 * not already mapped. The user should check the
+			 * returned address of the IOCTL to make sure he got
+			 * the hint address. Passing 0 here means that KMD
+			 * will choose the address itself.
+			 */
+			__u64 hint_addr;
+			/* Size of allocated host memory */
+			__u32 mem_size;
+			__u32 pad;
+		} map_host;
+
+		/* HL_MEM_OP_UNMAP - unmap host memory */
+		struct {
+			/* Virtual address returned from HL_MEM_OP_MAP */
+			__u64 device_virt_addr;
+		} unmap;
+	};
+
+	/* HL_MEM_OP_* */
+	__u32 op;
+	/* HL_MEM_* flags */
+	__u32 flags;
+	/* Context ID - Currently not in use */
+	__u32 ctx_id;
+	__u32 pad;
+};
+
+struct hl_mem_out {
+	union {
+		/*
+		 * Used for HL_MEM_OP_MAP as the virtual address that was
+		 * assigned in the device VA space.
+		 * A value of 0 means the requested operation failed.
+		 */
+		__u64 device_virt_addr;
+
+		/*
+		 * Used for HL_MEM_OP_ALLOC. This is the assigned
+		 * handle for the allocated memory
+		 */
+		__u64 handle;
+	};
+};
+
+union hl_mem_args {
+	struct hl_mem_in in;
+	struct hl_mem_out out;
+};
+
 /*
  * Command Buffer
  * - Request a Command Buffer
@@ -245,7 +347,25 @@ union hl_wait_cs_args {
 #define HL_IOCTL_WAIT_CS			\
 		_IOWR('H', 0x04, union hl_wait_cs_args)
 
+/*
+ * Memory
+ * - Map host memory to device MMU
+ * - Unmap host memory from device MMU
+ *
+ * This IOCTL allows the user to map host memory to the device MMU
+ *
+ * For host memory, the IOCTL doesn't allocate memory. The user is supposed
+ * to allocate the memory in user-space (malloc/new). The driver pins the
+ * physical pages (up to the allowed limit by the OS), assigns a virtual
+ * address in the device VA space and initializes the device MMU.
+ *
+ * There is an option for the user to specify the requested virtual address.
+ *
+ */
+#define HL_IOCTL_MEMORY		\
+		_IOWR('H', 0x05, union hl_mem_args)
+
 #define HL_COMMAND_START	0x02
-#define HL_COMMAND_END		0x05
+#define HL_COMMAND_END		0x06
 
 #endif /* HABANALABS_H_ */
-- 
cgit v1.2.3-59-g8ed1b


From d8dd7b0a81cc192ef5d30ec76ed6f6d35a1a7cf5 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Sat, 16 Feb 2019 00:39:23 +0200
Subject: habanalabs: implement INFO IOCTL

This patch implements the INFO IOCTL. That IOCTL is used by the user to
query information that is relevant/needed by the user in order to submit
deep learning jobs to Goya.

The information is divided into several categories, such as H/W IP, Events
that happened, DDR usage and more.

Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/habanalabs/goya/goya.c        |   6 ++
 drivers/misc/habanalabs/habanalabs.h       |   2 +
 drivers/misc/habanalabs/habanalabs_ioctl.c | 126 +++++++++++++++++++++++++++++
 include/uapi/misc/habanalabs.h             |  75 ++++++++++++++++-
 4 files changed, 208 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 89b82b989966..bf3f76f1aeae 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -5131,6 +5131,11 @@ static void goya_hw_queues_unlock(struct hl_device *hdev)
 	spin_unlock(&goya->hw_queues_lock);
 }
 
+static u32 goya_get_pci_id(struct hl_device *hdev)
+{
+	return hdev->pdev->device;
+}
+
 int goya_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size)
 {
 	struct goya_device *goya = hdev->asic_specific;
@@ -5232,6 +5237,7 @@ static const struct hl_asic_funcs goya_funcs = {
 	.soft_reset_late_init = goya_soft_reset_late_init,
 	.hw_queues_lock = goya_hw_queues_lock,
 	.hw_queues_unlock = goya_hw_queues_unlock,
+	.get_pci_id = goya_get_pci_id,
 	.get_eeprom_data = goya_get_eeprom_data,
 	.send_cpu_message = goya_send_cpu_message,
 	.get_hw_state = goya_get_hw_state
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 03085e7a12dd..02de4a2cab27 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -470,6 +470,7 @@ enum hl_pll_frequency {
  * @soft_reset_late_init: perform certain actions needed after soft reset.
  * @hw_queues_lock: acquire H/W queues lock.
  * @hw_queues_unlock: release H/W queues lock.
+ * @get_pci_id: retrieve PCI ID.
  * @get_eeprom_data: retrieve EEPROM data from F/W.
  * @send_cpu_message: send buffer to ArmCP.
  * @get_hw_state: retrieve the H/W state
@@ -539,6 +540,7 @@ struct hl_asic_funcs {
 	int (*soft_reset_late_init)(struct hl_device *hdev);
 	void (*hw_queues_lock)(struct hl_device *hdev);
 	void (*hw_queues_unlock)(struct hl_device *hdev);
+	u32 (*get_pci_id)(struct hl_device *hdev);
 	int (*get_eeprom_data)(struct hl_device *hdev, void *data,
 				size_t max_size);
 	int (*send_cpu_message)(struct hl_device *hdev, u32 *msg,
diff --git a/drivers/misc/habanalabs/habanalabs_ioctl.c b/drivers/misc/habanalabs/habanalabs_ioctl.c
index 6e4dc5b5e696..12408d3302e9 100644
--- a/drivers/misc/habanalabs/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/habanalabs_ioctl.c
@@ -12,10 +12,136 @@
 #include <linux/uaccess.h>
 #include <linux/slab.h>
 
+static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args)
+{
+	struct hl_info_hw_ip_info hw_ip = {0};
+	u32 size = args->return_size;
+	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	u64 sram_kmd_size, dram_kmd_size;
+
+	if ((!size) || (!out))
+		return -EINVAL;
+
+	sram_kmd_size = (prop->sram_user_base_address -
+				prop->sram_base_address);
+	dram_kmd_size = (prop->dram_user_base_address -
+				prop->dram_base_address);
+
+	hw_ip.device_id = hdev->asic_funcs->get_pci_id(hdev);
+	hw_ip.sram_base_address = prop->sram_user_base_address;
+	hw_ip.dram_base_address = prop->dram_user_base_address;
+	hw_ip.tpc_enabled_mask = prop->tpc_enabled_mask;
+	hw_ip.sram_size = prop->sram_size - sram_kmd_size;
+	hw_ip.dram_size = prop->dram_size - dram_kmd_size;
+	if (hw_ip.dram_size > 0)
+		hw_ip.dram_enabled = 1;
+	hw_ip.num_of_events = prop->num_of_events;
+	memcpy(hw_ip.armcp_version,
+		prop->armcp_info.armcp_version, VERSION_MAX_LEN);
+	hw_ip.armcp_cpld_version = prop->armcp_info.cpld_version;
+	hw_ip.psoc_pci_pll_nr = prop->psoc_pci_pll_nr;
+	hw_ip.psoc_pci_pll_nf = prop->psoc_pci_pll_nf;
+	hw_ip.psoc_pci_pll_od = prop->psoc_pci_pll_od;
+	hw_ip.psoc_pci_pll_div_factor = prop->psoc_pci_pll_div_factor;
+
+	return copy_to_user(out, &hw_ip,
+		min((size_t)size, sizeof(hw_ip))) ? -EFAULT : 0;
+}
+
+static int hw_events_info(struct hl_device *hdev, struct hl_info_args *args)
+{
+	u32 size, max_size = args->return_size;
+	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+	void *arr;
+
+	if ((!max_size) || (!out))
+		return -EINVAL;
+
+	arr = hdev->asic_funcs->get_events_stat(hdev, &size);
+
+	return copy_to_user(out, arr, min(max_size, size)) ? -EFAULT : 0;
+}
+
+static int dram_usage_info(struct hl_device *hdev, struct hl_info_args *args)
+{
+	struct hl_info_dram_usage dram_usage = {0};
+	u32 max_size = args->return_size;
+	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	u64 dram_kmd_size;
+
+	if ((!max_size) || (!out))
+		return -EINVAL;
+
+	dram_kmd_size = (prop->dram_user_base_address -
+				prop->dram_base_address);
+	dram_usage.dram_free_mem = (prop->dram_size - dram_kmd_size) -
+					atomic64_read(&hdev->dram_used_mem);
+	dram_usage.ctx_dram_mem = atomic64_read(&hdev->user_ctx->dram_phys_mem);
+
+	return copy_to_user(out, &dram_usage,
+		min((size_t) max_size, sizeof(dram_usage))) ? -EFAULT : 0;
+}
+
+static int hw_idle(struct hl_device *hdev, struct hl_info_args *args)
+{
+	struct hl_info_hw_idle hw_idle = {0};
+	u32 max_size = args->return_size;
+	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+
+	if ((!max_size) || (!out))
+		return -EINVAL;
+
+	hw_idle.is_idle = hdev->asic_funcs->is_device_idle(hdev);
+
+	return copy_to_user(out, &hw_idle,
+		min((size_t) max_size, sizeof(hw_idle))) ? -EFAULT : 0;
+}
+
+static int hl_info_ioctl(struct hl_fpriv *hpriv, void *data)
+{
+	struct hl_info_args *args = data;
+	struct hl_device *hdev = hpriv->hdev;
+	int rc;
+
+	if (hl_device_disabled_or_in_reset(hdev)) {
+		dev_err(hdev->dev,
+			"Device is disabled or in reset. Can't execute INFO IOCTL\n");
+		return -EBUSY;
+	}
+
+	switch (args->op) {
+	case HL_INFO_HW_IP_INFO:
+		rc = hw_ip_info(hdev, args);
+		break;
+
+	case HL_INFO_HW_EVENTS:
+		rc = hw_events_info(hdev, args);
+		break;
+
+	case HL_INFO_DRAM_USAGE:
+		rc = dram_usage_info(hdev, args);
+		break;
+
+	case HL_INFO_HW_IDLE:
+		rc = hw_idle(hdev, args);
+		break;
+
+	default:
+		dev_err(hdev->dev, "Invalid request %d\n", args->op);
+		rc = -ENOTTY;
+		break;
+	}
+
+	return rc;
+}
+
 #define HL_IOCTL_DEF(ioctl, _func) \
 	[_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func}
 
 static const struct hl_ioctl_desc hl_ioctls[] = {
+	HL_IOCTL_DEF(HL_IOCTL_INFO, hl_info_ioctl),
 	HL_IOCTL_DEF(HL_IOCTL_CB, hl_cb_ioctl),
 	HL_IOCTL_DEF(HL_IOCTL_CS, hl_cs_ioctl),
 	HL_IOCTL_DEF(HL_IOCTL_WAIT_CS, hl_cs_wait_ioctl),
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 9015043887d1..4afc1891ece8 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -45,6 +45,62 @@ enum goya_queue_id {
 	GOYA_QUEUE_ID_SIZE
 };
 
+/* Opcode for management ioctl */
+#define HL_INFO_HW_IP_INFO	0
+#define HL_INFO_HW_EVENTS	1
+#define HL_INFO_DRAM_USAGE	2
+#define HL_INFO_HW_IDLE		3
+
+#define HL_INFO_VERSION_MAX_LEN	128
+
+struct hl_info_hw_ip_info {
+	__u64 sram_base_address;
+	__u64 dram_base_address;
+	__u64 dram_size;
+	__u32 sram_size;
+	__u32 num_of_events;
+	__u32 device_id; /* PCI Device ID */
+	__u32 reserved[3];
+	__u32 armcp_cpld_version;
+	__u32 psoc_pci_pll_nr;
+	__u32 psoc_pci_pll_nf;
+	__u32 psoc_pci_pll_od;
+	__u32 psoc_pci_pll_div_factor;
+	__u8 tpc_enabled_mask;
+	__u8 dram_enabled;
+	__u8 pad[2];
+	__u8 armcp_version[HL_INFO_VERSION_MAX_LEN];
+};
+
+struct hl_info_dram_usage {
+	__u64 dram_free_mem;
+	__u64 ctx_dram_mem;
+};
+
+struct hl_info_hw_idle {
+	__u32 is_idle;
+	__u32 pad;
+};
+
+struct hl_info_args {
+	/* Location of relevant struct in userspace */
+	__u64 return_pointer;
+	/*
+	 * The size of the return value. Just like "size" in "snprintf",
+	 * it limits how many bytes the kernel can write
+	 *
+	 * For hw_events array, the size should be
+	 * hl_info_hw_ip_info.num_of_events * sizeof(__u32)
+	 */
+	__u32 return_size;
+
+	/* HL_INFO_* */
+	__u32 op;
+
+	/* Context ID - Currently not in use */
+	__u32 ctx_id;
+	__u32 pad;
+};
 
 /* Opcode to create a new command buffer */
 #define HL_CB_OP_CREATE		0
@@ -264,6 +320,23 @@ union hl_mem_args {
 	struct hl_mem_out out;
 };
 
+/*
+ * Various information operations such as:
+ * - H/W IP information
+ * - Current dram usage
+ *
+ * The user calls this IOCTL with an opcode that describes the required
+ * information. The user should supply a pointer to a user-allocated memory
+ * chunk, which will be filled by the driver with the requested information.
+ *
+ * The user supplies the maximum amount of size to copy into the user's memory,
+ * in order to prevent data corruption in case of differences between the
+ * definitions of structures in kernel and userspace, e.g. in case of old
+ * userspace and new kernel driver
+ */
+#define HL_IOCTL_INFO	\
+		_IOWR('H', 0x01, struct hl_info_args)
+
 /*
  * Command Buffer
  * - Request a Command Buffer
@@ -365,7 +438,7 @@ union hl_mem_args {
 #define HL_IOCTL_MEMORY		\
 		_IOWR('H', 0x05, union hl_mem_args)
 
-#define HL_COMMAND_START	0x02
+#define HL_COMMAND_START	0x01
 #define HL_COMMAND_END		0x06
 
 #endif /* HABANALABS_H_ */
-- 
cgit v1.2.3-59-g8ed1b


From 0d0216c03a7a14e121abb2e3eb38e491767c36e8 Mon Sep 17 00:00:00 2001
From: Yury Norov <ynorov@caviumnetworks.com>
Date: Wed, 16 May 2018 11:18:48 +0300
Subject: compat ABI: use non-compat openat and open_by_handle_at variants

The only difference between native and compat openat and open_by_handle_at
is that non-compat version forces O_LARGEFILE, and it should be the
default behaviour for all architectures, as we are going to drop the
support of 32-bit userspace off_t.

Signed-off-by: Yury Norov <ynorov@caviumnetworks.com>
Signed-off-by: Yury Norov <ynorov@marvell.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 include/uapi/asm-generic/unistd.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index acf9a07ab2ff..b928eff3bf92 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -179,7 +179,7 @@ __SYSCALL(__NR_fchownat, sys_fchownat)
 #define __NR_fchown 55
 __SYSCALL(__NR_fchown, sys_fchown)
 #define __NR_openat 56
-__SC_COMP(__NR_openat, sys_openat, compat_sys_openat)
+__SYSCALL(__NR_openat, sys_openat)
 #define __NR_close 57
 __SYSCALL(__NR_close, sys_close)
 #define __NR_vhangup 58
@@ -678,8 +678,7 @@ __SYSCALL(__NR_fanotify_mark, sys_fanotify_mark)
 #define __NR_name_to_handle_at         264
 __SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at)
 #define __NR_open_by_handle_at         265
-__SC_COMP(__NR_open_by_handle_at, sys_open_by_handle_at, \
-	  compat_sys_open_by_handle_at)
+__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at)
 #define __NR_clock_adjtime 266
 __SC_3264(__NR_clock_adjtime, sys_clock_adjtime32, sys_clock_adjtime)
 #define __NR_syncfs 267
-- 
cgit v1.2.3-59-g8ed1b


From 746c9398f5ac2b3f5730da4ed09e99ef4bb50b4a Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Fri, 8 Feb 2019 01:02:55 -0500
Subject: arch: move common mmap flags to linux/mman.h

Now that we have 3 mmap flags shared by all architectures,
let's move them into the common header.

This will help discourage future architectures from duplicating code.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/alpha/include/uapi/asm/mman.h     | 4 +---
 arch/mips/include/uapi/asm/mman.h      | 4 +---
 arch/parisc/include/uapi/asm/mman.h    | 4 +---
 arch/xtensa/include/uapi/asm/mman.h    | 4 +---
 include/uapi/asm-generic/mman-common.h | 4 +---
 include/uapi/linux/mman.h              | 4 ++++
 6 files changed, 9 insertions(+), 15 deletions(-)

(limited to 'include/uapi')

diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
index f9d4e6b6d4bd..ac23379b7a87 100644
--- a/arch/alpha/include/uapi/asm/mman.h
+++ b/arch/alpha/include/uapi/asm/mman.h
@@ -10,9 +10,7 @@
 #define PROT_GROWSDOWN	0x01000000	/* mprotect flag: extend change to start of growsdown vma */
 #define PROT_GROWSUP	0x02000000	/* mprotect flag: extend change to end of growsup vma */
 
-#define MAP_SHARED	0x01		/* Share changes */
-#define MAP_PRIVATE	0x02		/* Changes are private */
-#define MAP_SHARED_VALIDATE 0x03	/* share + validate extension flags */
+/* 0x01 - 0x03 are defined in linux/mman.h */
 #define MAP_TYPE	0x0f		/* Mask for type of mapping (OSF/1 is _wrong_) */
 #define MAP_FIXED	0x100		/* Interpret addr exactly */
 #define MAP_ANONYMOUS	0x10		/* don't use a file */
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h
index 3035ca499cd8..c2b40969eb1f 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -27,9 +27,7 @@
 /*
  * Flags for mmap
  */
-#define MAP_SHARED	0x001		/* Share changes */
-#define MAP_PRIVATE	0x002		/* Changes are private */
-#define MAP_SHARED_VALIDATE 0x003	/* share + validate extension flags */
+/* 0x01 - 0x03 are defined in linux/mman.h */
 #define MAP_TYPE	0x00f		/* Mask for type of mapping */
 #define MAP_FIXED	0x010		/* Interpret addr exactly */
 
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h
index 870fbf8c7088..c98162f494db 100644
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -10,9 +10,7 @@
 #define PROT_GROWSDOWN	0x01000000	/* mprotect flag: extend change to start of growsdown vma */
 #define PROT_GROWSUP	0x02000000	/* mprotect flag: extend change to end of growsup vma */
 
-#define MAP_SHARED	0x01		/* Share changes */
-#define MAP_PRIVATE	0x02		/* Changes are private */
-#define MAP_SHARED_VALIDATE 0x03	/* share + validate extension flags */
+/* 0x01 - 0x03 are defined in linux/mman.h */
 #define MAP_TYPE	0x2b		/* Mask for type of mapping, includes bits 0x08 and 0x20 */
 #define MAP_FIXED	0x04		/* Interpret addr exactly */
 #define MAP_ANONYMOUS	0x10		/* don't use a file */
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h
index 58f29a9d895d..be726062412b 100644
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -34,9 +34,7 @@
 /*
  * Flags for mmap
  */
-#define MAP_SHARED	0x001		/* Share changes */
-#define MAP_PRIVATE	0x002		/* Changes are private */
-#define MAP_SHARED_VALIDATE 0x003	/* share + validate extension flags */
+/* 0x01 - 0x03 are defined in linux/mman.h */
 #define MAP_TYPE	0x00f		/* Mask for type of mapping */
 #define MAP_FIXED	0x010		/* Interpret addr exactly */
 
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index e7ee32861d51..abd238d0f7a4 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -15,9 +15,7 @@
 #define PROT_GROWSDOWN	0x01000000	/* mprotect flag: extend change to start of growsdown vma */
 #define PROT_GROWSUP	0x02000000	/* mprotect flag: extend change to end of growsup vma */
 
-#define MAP_SHARED	0x01		/* Share changes */
-#define MAP_PRIVATE	0x02		/* Changes are private */
-#define MAP_SHARED_VALIDATE 0x03	/* share + validate extension flags */
+/* 0x01 - 0x03 are defined in linux/mman.h */
 #define MAP_TYPE	0x0f		/* Mask for type of mapping */
 #define MAP_FIXED	0x10		/* Interpret addr exactly */
 #define MAP_ANONYMOUS	0x20		/* don't use a file */
diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
index d0f515d53299..fc1a64c3447b 100644
--- a/include/uapi/linux/mman.h
+++ b/include/uapi/linux/mman.h
@@ -12,6 +12,10 @@
 #define OVERCOMMIT_ALWAYS		1
 #define OVERCOMMIT_NEVER		2
 
+#define MAP_SHARED	0x01		/* Share changes */
+#define MAP_PRIVATE	0x02		/* Changes are private */
+#define MAP_SHARED_VALIDATE 0x03	/* share + validate extension flags */
+
 /*
  * Huge page size encoding when MAP_HUGETLB is specified, and a huge page
  * size other than the default is desired.  See hugetlb_encode.h.
-- 
cgit v1.2.3-59-g8ed1b


From a7fe4ca72b1fe7877de5672640d0b4e023d0fdca Mon Sep 17 00:00:00 2001
From: Vivek Kasireddy <vivek.kasireddy@intel.com>
Date: Thu, 7 Feb 2019 22:18:43 -0500
Subject: media: v4l: Add 32-bit packed YUV formats

The formats added in this patch include:
 V4L2_PIX_FMT_AYUV32
 V4L2_PIX_FMT_XYUV32
 V4L2_PIX_FMT_VUYA32
 V4L2_PIX_FMT_VUYX32

These formats enable the trasmission of alpha channel data to other
drivers and userspace applications in addition to YUV data. For
example, buffers generated by drivers in one of these formats
can be used by the Weston compositor to display as a texture or
flipped directly onto the overlay planes with the help of a DRM
driver.

Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 Documentation/media/uapi/v4l/pixfmt-packed-yuv.rst | 170 ++++++++++++++++++++-
 drivers/media/v4l2-core/v4l2-ioctl.c               |   4 +
 include/uapi/linux/videodev2.h                     |   4 +
 3 files changed, 177 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/Documentation/media/uapi/v4l/pixfmt-packed-yuv.rst b/Documentation/media/uapi/v4l/pixfmt-packed-yuv.rst
index f53e8f57a003..7fcee1c11ac4 100644
--- a/Documentation/media/uapi/v4l/pixfmt-packed-yuv.rst
+++ b/Documentation/media/uapi/v4l/pixfmt-packed-yuv.rst
@@ -190,6 +190,170 @@ component of each pixel in one 16 or 32 bit word.
       - Cr\ :sub:`2`
       - Cr\ :sub:`1`
       - Cr\ :sub:`0`
+      -
+    * .. _V4L2-PIX-FMT-AYUV32:
+
+      - ``V4L2_PIX_FMT_AYUV32``
+      - 'AYUV'
+
+      - a\ :sub:`7`
+      - a\ :sub:`6`
+      - a\ :sub:`5`
+      - a\ :sub:`4`
+      - a\ :sub:`3`
+      - a\ :sub:`2`
+      - a\ :sub:`1`
+      - a\ :sub:`0`
+
+      - Y'\ :sub:`7`
+      - Y'\ :sub:`6`
+      - Y'\ :sub:`5`
+      - Y'\ :sub:`4`
+      - Y'\ :sub:`3`
+      - Y'\ :sub:`2`
+      - Y'\ :sub:`1`
+      - Y'\ :sub:`0`
+
+      - Cb\ :sub:`7`
+      - Cb\ :sub:`6`
+      - Cb\ :sub:`5`
+      - Cb\ :sub:`4`
+      - Cb\ :sub:`3`
+      - Cb\ :sub:`2`
+      - Cb\ :sub:`1`
+      - Cb\ :sub:`0`
+
+      - Cr\ :sub:`7`
+      - Cr\ :sub:`6`
+      - Cr\ :sub:`5`
+      - Cr\ :sub:`4`
+      - Cr\ :sub:`3`
+      - Cr\ :sub:`2`
+      - Cr\ :sub:`1`
+      - Cr\ :sub:`0`
+      -
+    * .. _V4L2-PIX-FMT-XYUV32:
+
+      - ``V4L2_PIX_FMT_XYUV32``
+      - 'XYUV'
+
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+
+      - Y'\ :sub:`7`
+      - Y'\ :sub:`6`
+      - Y'\ :sub:`5`
+      - Y'\ :sub:`4`
+      - Y'\ :sub:`3`
+      - Y'\ :sub:`2`
+      - Y'\ :sub:`1`
+      - Y'\ :sub:`0`
+
+      - Cb\ :sub:`7`
+      - Cb\ :sub:`6`
+      - Cb\ :sub:`5`
+      - Cb\ :sub:`4`
+      - Cb\ :sub:`3`
+      - Cb\ :sub:`2`
+      - Cb\ :sub:`1`
+      - Cb\ :sub:`0`
+
+      - Cr\ :sub:`7`
+      - Cr\ :sub:`6`
+      - Cr\ :sub:`5`
+      - Cr\ :sub:`4`
+      - Cr\ :sub:`3`
+      - Cr\ :sub:`2`
+      - Cr\ :sub:`1`
+      - Cr\ :sub:`0`
+      -
+    * .. _V4L2-PIX-FMT-VUYA32:
+
+      - ``V4L2_PIX_FMT_VUYA32``
+      - 'VUYA'
+
+      - Cr\ :sub:`7`
+      - Cr\ :sub:`6`
+      - Cr\ :sub:`5`
+      - Cr\ :sub:`4`
+      - Cr\ :sub:`3`
+      - Cr\ :sub:`2`
+      - Cr\ :sub:`1`
+      - Cr\ :sub:`0`
+
+      - Cb\ :sub:`7`
+      - Cb\ :sub:`6`
+      - Cb\ :sub:`5`
+      - Cb\ :sub:`4`
+      - Cb\ :sub:`3`
+      - Cb\ :sub:`2`
+      - Cb\ :sub:`1`
+      - Cb\ :sub:`0`
+
+      - Y'\ :sub:`7`
+      - Y'\ :sub:`6`
+      - Y'\ :sub:`5`
+      - Y'\ :sub:`4`
+      - Y'\ :sub:`3`
+      - Y'\ :sub:`2`
+      - Y'\ :sub:`1`
+      - Y'\ :sub:`0`
+
+      - a\ :sub:`7`
+      - a\ :sub:`6`
+      - a\ :sub:`5`
+      - a\ :sub:`4`
+      - a\ :sub:`3`
+      - a\ :sub:`2`
+      - a\ :sub:`1`
+      - a\ :sub:`0`
+      -
+    * .. _V4L2-PIX-FMT-VUYX32:
+
+      - ``V4L2_PIX_FMT_VUYX32``
+      - 'VUYX'
+
+      - Cr\ :sub:`7`
+      - Cr\ :sub:`6`
+      - Cr\ :sub:`5`
+      - Cr\ :sub:`4`
+      - Cr\ :sub:`3`
+      - Cr\ :sub:`2`
+      - Cr\ :sub:`1`
+      - Cr\ :sub:`0`
+
+      - Cb\ :sub:`7`
+      - Cb\ :sub:`6`
+      - Cb\ :sub:`5`
+      - Cb\ :sub:`4`
+      - Cb\ :sub:`3`
+      - Cb\ :sub:`2`
+      - Cb\ :sub:`1`
+      - Cb\ :sub:`0`
+
+      - Y'\ :sub:`7`
+      - Y'\ :sub:`6`
+      - Y'\ :sub:`5`
+      - Y'\ :sub:`4`
+      - Y'\ :sub:`3`
+      - Y'\ :sub:`2`
+      - Y'\ :sub:`1`
+      - Y'\ :sub:`0`
+
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
 
 .. raw:: latex
 
@@ -202,4 +366,8 @@ component of each pixel in one 16 or 32 bit word.
     #) The value of a = alpha bits is undefined when reading from the driver,
        ignored when writing to the driver, except when alpha blending has
        been negotiated for a :ref:`Video Overlay <overlay>` or
-       :ref:`Video Output Overlay <osd>`.
+       :ref:`Video Output Overlay <osd>` for the formats Y444, YUV555 and
+       YUV4. However, for formats AYUV32 and VUYA32, the alpha component is
+       expected to contain a meaningful value that can be used by drivers
+       and applications. And, the formats XYUV32 and VUYX32 contain undefined
+       alpha values that must be ignored by all applications and drivers.
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index 206b7348797e..6f707466b5d2 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -1220,6 +1220,10 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 	case V4L2_PIX_FMT_YUV555:	descr = "16-bit A/XYUV 1-5-5-5"; break;
 	case V4L2_PIX_FMT_YUV565:	descr = "16-bit YUV 5-6-5"; break;
 	case V4L2_PIX_FMT_YUV32:	descr = "32-bit A/XYUV 8-8-8-8"; break;
+	case V4L2_PIX_FMT_AYUV32:	descr = "32-bit AYUV 8-8-8-8"; break;
+	case V4L2_PIX_FMT_XYUV32:	descr = "32-bit XYUV 8-8-8-8"; break;
+	case V4L2_PIX_FMT_VUYA32:	descr = "32-bit VUYA 8-8-8-8"; break;
+	case V4L2_PIX_FMT_VUYX32:	descr = "32-bit VUYX 8-8-8-8"; break;
 	case V4L2_PIX_FMT_YUV410:	descr = "Planar YUV 4:1:0"; break;
 	case V4L2_PIX_FMT_YUV420:	descr = "Planar YUV 4:2:0"; break;
 	case V4L2_PIX_FMT_HI240:	descr = "8-bit Dithered RGB (BTTV)"; break;
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 9a920f071ff9..1db220da3bcc 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -562,6 +562,10 @@ struct v4l2_pix_format {
 #define V4L2_PIX_FMT_YUV555  v4l2_fourcc('Y', 'U', 'V', 'O') /* 16  YUV-5-5-5     */
 #define V4L2_PIX_FMT_YUV565  v4l2_fourcc('Y', 'U', 'V', 'P') /* 16  YUV-5-6-5     */
 #define V4L2_PIX_FMT_YUV32   v4l2_fourcc('Y', 'U', 'V', '4') /* 32  YUV-8-8-8-8   */
+#define V4L2_PIX_FMT_AYUV32  v4l2_fourcc('A', 'Y', 'U', 'V') /* 32  AYUV-8-8-8-8  */
+#define V4L2_PIX_FMT_XYUV32  v4l2_fourcc('X', 'Y', 'U', 'V') /* 32  XYUV-8-8-8-8  */
+#define V4L2_PIX_FMT_VUYA32  v4l2_fourcc('V', 'U', 'Y', 'A') /* 32  VUYA-8-8-8-8  */
+#define V4L2_PIX_FMT_VUYX32  v4l2_fourcc('V', 'U', 'Y', 'X') /* 32  VUYX-8-8-8-8  */
 #define V4L2_PIX_FMT_HI240   v4l2_fourcc('H', 'I', '2', '4') /*  8  8-bit color   */
 #define V4L2_PIX_FMT_HM12    v4l2_fourcc('H', 'M', '1', '2') /*  8  YUV 4:2:0 16x16 macroblocks */
 #define V4L2_PIX_FMT_M420    v4l2_fourcc('M', '4', '2', '0') /* 12  YUV 4:2:0 2 lines y, 1 line uv interleaved */
-- 
cgit v1.2.3-59-g8ed1b


From 721074b03411327e7bf41555d4cc7c18f49313f7 Mon Sep 17 00:00:00 2001
From: Patrick Lerda <patrick9876@free.fr>
Date: Thu, 17 Jan 2019 03:50:13 -0500
Subject: media: rc: rcmm decoder and encoder

media: add support for RCMM infrared remote controls.

Signed-off-by: Patrick Lerda <patrick9876@free.fr>
Signed-off-by: Sean Young <sean@mess.org>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 Documentation/media/lirc.h.rst.exceptions |   3 +
 MAINTAINERS                               |   5 +
 drivers/media/rc/Kconfig                  |  13 ++
 drivers/media/rc/Makefile                 |   1 +
 drivers/media/rc/ir-rcmm-decoder.c        | 254 ++++++++++++++++++++++++++++++
 drivers/media/rc/rc-core-priv.h           |   5 +
 drivers/media/rc/rc-main.c                |   9 ++
 include/media/rc-map.h                    |  14 +-
 include/uapi/linux/lirc.h                 |   6 +
 tools/include/uapi/linux/lirc.h           |  12 ++
 tools/testing/selftests/ir/ir_loopback.c  |   9 ++
 11 files changed, 328 insertions(+), 3 deletions(-)
 create mode 100644 drivers/media/rc/ir-rcmm-decoder.c

(limited to 'include/uapi')

diff --git a/Documentation/media/lirc.h.rst.exceptions b/Documentation/media/lirc.h.rst.exceptions
index 379b9e7df5d0..7a8b8ff4f076 100644
--- a/Documentation/media/lirc.h.rst.exceptions
+++ b/Documentation/media/lirc.h.rst.exceptions
@@ -60,6 +60,9 @@ ignore symbol RC_PROTO_SHARP
 ignore symbol RC_PROTO_XMP
 ignore symbol RC_PROTO_CEC
 ignore symbol RC_PROTO_IMON
+ignore symbol RC_PROTO_RCMM12
+ignore symbol RC_PROTO_RCMM24
+ignore symbol RC_PROTO_RCMM32
 
 # Undocumented macros
 
diff --git a/MAINTAINERS b/MAINTAINERS
index 914d333ee73d..9662ad6c58e4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16536,6 +16536,11 @@ M:	David Härdeman <david@hardeman.nu>
 S:	Maintained
 F:	drivers/media/rc/winbond-cir.c
 
+RCMM REMOTE CONTROLS DECODER
+M:	Patrick Lerda <patrick9876@free.fr>
+S:	Maintained
+F:	drivers/media/rc/ir-rcmm-decoder.c
+
 WINSYSTEMS EBC-C384 WATCHDOG DRIVER
 M:	William Breathitt Gray <vilhelm.gray@gmail.com>
 L:	linux-watchdog@vger.kernel.org
diff --git a/drivers/media/rc/Kconfig b/drivers/media/rc/Kconfig
index 8a216068a35a..8e95f6eac89d 100644
--- a/drivers/media/rc/Kconfig
+++ b/drivers/media/rc/Kconfig
@@ -133,6 +133,19 @@ config IR_IMON_DECODER
 	   remote control and you would like to use it with a raw IR
 	   receiver, or if you wish to use an encoder to transmit this IR.
 
+config IR_RCMM_DECODER
+	tristate "Enable IR raw decoder for the RC-MM protocol"
+	depends on RC_CORE
+	help
+	   Enable this option when you have IR with RC-MM protocol, and
+	   you need the software decoder. The driver supports 12,
+	   24 and 32 bits RC-MM variants. You can enable or disable the
+	   different modes using the following RC protocol keywords:
+	   'rc-mm-12', 'rc-mm-24' and 'rc-mm-32'.
+
+	   To compile this driver as a module, choose M here: the module
+	   will be called ir-rcmm-decoder.
+
 endif #RC_DECODERS
 
 menuconfig RC_DEVICES
diff --git a/drivers/media/rc/Makefile b/drivers/media/rc/Makefile
index 92c163816849..48d23433b3c0 100644
--- a/drivers/media/rc/Makefile
+++ b/drivers/media/rc/Makefile
@@ -16,6 +16,7 @@ obj-$(CONFIG_IR_SHARP_DECODER) += ir-sharp-decoder.o
 obj-$(CONFIG_IR_MCE_KBD_DECODER) += ir-mce_kbd-decoder.o
 obj-$(CONFIG_IR_XMP_DECODER) += ir-xmp-decoder.o
 obj-$(CONFIG_IR_IMON_DECODER) += ir-imon-decoder.o
+obj-$(CONFIG_IR_RCMM_DECODER) += ir-rcmm-decoder.o
 
 # stand-alone IR receivers/transmitters
 obj-$(CONFIG_RC_ATI_REMOTE) += ati_remote.o
diff --git a/drivers/media/rc/ir-rcmm-decoder.c b/drivers/media/rc/ir-rcmm-decoder.c
new file mode 100644
index 000000000000..f1096ac1e5c5
--- /dev/null
+++ b/drivers/media/rc/ir-rcmm-decoder.c
@@ -0,0 +1,254 @@
+// SPDX-License-Identifier: GPL-2.0+
+// ir-rcmm-decoder.c - A decoder for the RCMM IR protocol
+//
+// Copyright (C) 2018 by Patrick Lerda <patrick9876@free.fr>
+
+#include "rc-core-priv.h"
+#include <linux/module.h>
+#include <linux/version.h>
+
+#define RCMM_UNIT		166667	/* nanosecs */
+#define RCMM_PREFIX_PULSE	416666  /* 166666.666666666*2.5 */
+#define RCMM_PULSE_0            277777  /* 166666.666666666*(1+2/3) */
+#define RCMM_PULSE_1            444444  /* 166666.666666666*(2+2/3) */
+#define RCMM_PULSE_2            611111  /* 166666.666666666*(3+2/3) */
+#define RCMM_PULSE_3            777778  /* 166666.666666666*(4+2/3) */
+
+enum rcmm_state {
+	STATE_INACTIVE,
+	STATE_LOW,
+	STATE_BUMP,
+	STATE_VALUE,
+	STATE_FINISHED,
+};
+
+static bool rcmm_mode(const struct rcmm_dec *data)
+{
+	return !((0x000c0000 & data->bits) == 0x000c0000);
+}
+
+static int rcmm_miscmode(struct rc_dev *dev, struct rcmm_dec *data)
+{
+	switch (data->count) {
+	case 24:
+		if (dev->enabled_protocols & RC_PROTO_BIT_RCMM24) {
+			rc_keydown(dev, RC_PROTO_RCMM24, data->bits, 0);
+			data->state = STATE_INACTIVE;
+			return 0;
+		}
+		return -1;
+
+	case 12:
+		if (dev->enabled_protocols & RC_PROTO_BIT_RCMM12) {
+			rc_keydown(dev, RC_PROTO_RCMM12, data->bits, 0);
+			data->state = STATE_INACTIVE;
+			return 0;
+		}
+		return -1;
+	}
+
+	return -1;
+}
+
+/**
+ * ir_rcmm_decode() - Decode one RCMM pulse or space
+ * @dev:	the struct rc_dev descriptor of the device
+ * @ev:		the struct ir_raw_event descriptor of the pulse/space
+ *
+ * This function returns -EINVAL if the pulse violates the state machine
+ */
+static int ir_rcmm_decode(struct rc_dev *dev, struct ir_raw_event ev)
+{
+	struct rcmm_dec *data = &dev->raw->rcmm;
+	u32 scancode;
+	u8 toggle;
+	int value;
+
+	if (!(dev->enabled_protocols & (RC_PROTO_BIT_RCMM32 |
+							RC_PROTO_BIT_RCMM24 |
+							RC_PROTO_BIT_RCMM12)))
+		return 0;
+
+	if (!is_timing_event(ev)) {
+		if (ev.reset)
+			data->state = STATE_INACTIVE;
+		return 0;
+	}
+
+	switch (data->state) {
+	case STATE_INACTIVE:
+		if (!ev.pulse)
+			break;
+
+		if (!eq_margin(ev.duration, RCMM_PREFIX_PULSE, RCMM_UNIT / 2))
+			break;
+
+		data->state = STATE_LOW;
+		data->count = 0;
+		data->bits  = 0;
+		return 0;
+
+	case STATE_LOW:
+		if (ev.pulse)
+			break;
+
+		if (!eq_margin(ev.duration, RCMM_PULSE_0, RCMM_UNIT / 2))
+			break;
+
+		data->state = STATE_BUMP;
+		return 0;
+
+	case STATE_BUMP:
+		if (!ev.pulse)
+			break;
+
+		if (!eq_margin(ev.duration, RCMM_UNIT, RCMM_UNIT / 2))
+			break;
+
+		data->state = STATE_VALUE;
+		return 0;
+
+	case STATE_VALUE:
+		if (ev.pulse)
+			break;
+
+		if (eq_margin(ev.duration, RCMM_PULSE_0, RCMM_UNIT / 2))
+			value = 0;
+		else if (eq_margin(ev.duration, RCMM_PULSE_1, RCMM_UNIT / 2))
+			value = 1;
+		else if (eq_margin(ev.duration, RCMM_PULSE_2, RCMM_UNIT / 2))
+			value = 2;
+		else if (eq_margin(ev.duration, RCMM_PULSE_3, RCMM_UNIT / 2))
+			value = 3;
+		else
+			value = -1;
+
+		if (value == -1) {
+			if (!rcmm_miscmode(dev, data))
+				return 0;
+			break;
+		}
+
+		data->bits <<= 2;
+		data->bits |= value;
+
+		data->count += 2;
+
+		if (data->count < 32)
+			data->state = STATE_BUMP;
+		else
+			data->state = STATE_FINISHED;
+
+		return 0;
+
+	case STATE_FINISHED:
+		if (!ev.pulse)
+			break;
+
+		if (!eq_margin(ev.duration, RCMM_UNIT, RCMM_UNIT / 2))
+			break;
+
+		if (rcmm_mode(data)) {
+			toggle = !!(0x8000 & data->bits);
+			scancode = data->bits & ~0x8000;
+		} else {
+			toggle = 0;
+			scancode = data->bits;
+		}
+
+		if (dev->enabled_protocols & RC_PROTO_BIT_RCMM32) {
+			rc_keydown(dev, RC_PROTO_RCMM32, scancode, toggle);
+			data->state = STATE_INACTIVE;
+			return 0;
+		}
+
+		break;
+	}
+
+	data->state = STATE_INACTIVE;
+	return -EINVAL;
+}
+
+static const int rcmmspace[] = {
+	RCMM_PULSE_0,
+	RCMM_PULSE_1,
+	RCMM_PULSE_2,
+	RCMM_PULSE_3,
+};
+
+static int ir_rcmm_rawencoder(struct ir_raw_event **ev, unsigned int max,
+			      unsigned int n, u32 data)
+{
+	int i;
+	int ret;
+
+	ret = ir_raw_gen_pulse_space(ev, &max, RCMM_PREFIX_PULSE, RCMM_PULSE_0);
+	if (ret)
+		return ret;
+
+	for (i = n - 2; i >= 0; i -= 2) {
+		const unsigned int space = rcmmspace[(data >> i) & 3];
+
+		ret = ir_raw_gen_pulse_space(ev, &max, RCMM_UNIT, space);
+		if (ret)
+			return ret;
+	}
+
+	return ir_raw_gen_pulse_space(ev, &max, RCMM_UNIT, RCMM_PULSE_3 * 2);
+}
+
+static int ir_rcmm_encode(enum rc_proto protocol, u32 scancode,
+			  struct ir_raw_event *events, unsigned int max)
+{
+	struct ir_raw_event *e = events;
+	int ret;
+
+	switch (protocol) {
+	case RC_PROTO_RCMM32:
+		ret = ir_rcmm_rawencoder(&e, max, 32, scancode);
+		break;
+	case RC_PROTO_RCMM24:
+		ret = ir_rcmm_rawencoder(&e, max, 24, scancode);
+		break;
+	case RC_PROTO_RCMM12:
+		ret = ir_rcmm_rawencoder(&e, max, 12, scancode);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+	if (ret < 0)
+		return ret;
+
+	return e - events;
+}
+
+static struct ir_raw_handler rcmm_handler = {
+	.protocols	= RC_PROTO_BIT_RCMM32 |
+			  RC_PROTO_BIT_RCMM24 |
+			  RC_PROTO_BIT_RCMM12,
+	.decode		= ir_rcmm_decode,
+	.encode         = ir_rcmm_encode,
+	.carrier        = 36000,
+	.min_timeout	= RCMM_PULSE_3 + RCMM_UNIT,
+};
+
+static int __init ir_rcmm_decode_init(void)
+{
+	ir_raw_handler_register(&rcmm_handler);
+
+	pr_info("IR RCMM protocol handler initialized\n");
+	return 0;
+}
+
+static void __exit ir_rcmm_decode_exit(void)
+{
+	ir_raw_handler_unregister(&rcmm_handler);
+}
+
+module_init(ir_rcmm_decode_init);
+module_exit(ir_rcmm_decode_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick Lerda");
+MODULE_DESCRIPTION("RCMM IR protocol decoder");
diff --git a/drivers/media/rc/rc-core-priv.h b/drivers/media/rc/rc-core-priv.h
index c2cbe7f6266c..9f21b3e8b377 100644
--- a/drivers/media/rc/rc-core-priv.h
+++ b/drivers/media/rc/rc-core-priv.h
@@ -131,6 +131,11 @@ struct ir_raw_event_ctrl {
 		unsigned int bits;
 		bool stick_keyboard;
 	} imon;
+	struct rcmm_dec {
+		int state;
+		unsigned int count;
+		u32 bits;
+	} rcmm;
 };
 
 /* Mutex for locking raw IR processing and handler change */
diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c
index 66a174979b3c..dc38e9c0a2ff 100644
--- a/drivers/media/rc/rc-main.c
+++ b/drivers/media/rc/rc-main.c
@@ -70,6 +70,12 @@ static const struct {
 	[RC_PROTO_CEC] = { .name = "cec", .repeat_period = 0 },
 	[RC_PROTO_IMON] = { .name = "imon",
 		.scancode_bits = 0x7fffffff, .repeat_period = 114 },
+	[RC_PROTO_RCMM12] = { .name = "rc-mm-12",
+		.scancode_bits = 0x00000fff, .repeat_period = 114 },
+	[RC_PROTO_RCMM24] = { .name = "rc-mm-24",
+		.scancode_bits = 0x00ffffff, .repeat_period = 114 },
+	[RC_PROTO_RCMM32] = { .name = "rc-mm-32",
+		.scancode_bits = 0xffffffff, .repeat_period = 114 },
 };
 
 /* Used to keep track of known keymaps */
@@ -1006,6 +1012,9 @@ static const struct {
 	{ RC_PROTO_BIT_XMP,	"xmp",		"ir-xmp-decoder"	},
 	{ RC_PROTO_BIT_CEC,	"cec",		NULL			},
 	{ RC_PROTO_BIT_IMON,	"imon",		"ir-imon-decoder"	},
+	{ RC_PROTO_BIT_RCMM12 |
+	  RC_PROTO_BIT_RCMM24 |
+	  RC_PROTO_BIT_RCMM32,	"rc-mm",	"ir-rcmm-decoder"	},
 };
 
 /**
diff --git a/include/media/rc-map.h b/include/media/rc-map.h
index d621acadfbf3..e5e86d595645 100644
--- a/include/media/rc-map.h
+++ b/include/media/rc-map.h
@@ -37,6 +37,9 @@
 #define RC_PROTO_BIT_XMP		BIT_ULL(RC_PROTO_XMP)
 #define RC_PROTO_BIT_CEC		BIT_ULL(RC_PROTO_CEC)
 #define RC_PROTO_BIT_IMON		BIT_ULL(RC_PROTO_IMON)
+#define RC_PROTO_BIT_RCMM12		BIT_ULL(RC_PROTO_RCMM12)
+#define RC_PROTO_BIT_RCMM24		BIT_ULL(RC_PROTO_RCMM24)
+#define RC_PROTO_BIT_RCMM32		BIT_ULL(RC_PROTO_RCMM32)
 
 #define RC_PROTO_BIT_ALL \
 			(RC_PROTO_BIT_UNKNOWN | RC_PROTO_BIT_OTHER | \
@@ -51,7 +54,8 @@
 			 RC_PROTO_BIT_RC6_6A_24 | RC_PROTO_BIT_RC6_6A_32 | \
 			 RC_PROTO_BIT_RC6_MCE | RC_PROTO_BIT_SHARP | \
 			 RC_PROTO_BIT_XMP | RC_PROTO_BIT_CEC | \
-			 RC_PROTO_BIT_IMON)
+			 RC_PROTO_BIT_IMON | RC_PROTO_BIT_RCMM12 | \
+			 RC_PROTO_BIT_RCMM24 | RC_PROTO_BIT_RCMM32)
 /* All rc protocols for which we have decoders */
 #define RC_PROTO_BIT_ALL_IR_DECODER \
 			(RC_PROTO_BIT_RC5 | RC_PROTO_BIT_RC5X_20 | \
@@ -64,7 +68,9 @@
 			 RC_PROTO_BIT_RC6_0 | RC_PROTO_BIT_RC6_6A_20 | \
 			 RC_PROTO_BIT_RC6_6A_24 |  RC_PROTO_BIT_RC6_6A_32 | \
 			 RC_PROTO_BIT_RC6_MCE | RC_PROTO_BIT_SHARP | \
-			 RC_PROTO_BIT_XMP | RC_PROTO_BIT_IMON)
+			 RC_PROTO_BIT_XMP | RC_PROTO_BIT_IMON | \
+			 RC_PROTO_BIT_RCMM12 | RC_PROTO_BIT_RCMM24 | \
+			 RC_PROTO_BIT_RCMM32)
 
 #define RC_PROTO_BIT_ALL_IR_ENCODER \
 			(RC_PROTO_BIT_RC5 | RC_PROTO_BIT_RC5X_20 | \
@@ -77,7 +83,9 @@
 			 RC_PROTO_BIT_RC6_0 | RC_PROTO_BIT_RC6_6A_20 | \
 			 RC_PROTO_BIT_RC6_6A_24 | \
 			 RC_PROTO_BIT_RC6_6A_32 | RC_PROTO_BIT_RC6_MCE | \
-			 RC_PROTO_BIT_SHARP | RC_PROTO_BIT_IMON)
+			 RC_PROTO_BIT_SHARP | RC_PROTO_BIT_IMON | \
+			 RC_PROTO_BIT_RCMM12 | RC_PROTO_BIT_RCMM24 | \
+			 RC_PROTO_BIT_RCMM32)
 
 #define RC_SCANCODE_UNKNOWN(x)			(x)
 #define RC_SCANCODE_OTHER(x)			(x)
diff --git a/include/uapi/linux/lirc.h b/include/uapi/linux/lirc.h
index 6b319581882f..45fcbf99d72e 100644
--- a/include/uapi/linux/lirc.h
+++ b/include/uapi/linux/lirc.h
@@ -192,6 +192,9 @@ struct lirc_scancode {
  * @RC_PROTO_XMP: XMP protocol
  * @RC_PROTO_CEC: CEC protocol
  * @RC_PROTO_IMON: iMon Pad protocol
+ * @RC_PROTO_RCMM12: RC-MM protocol 12 bits
+ * @RC_PROTO_RCMM24: RC-MM protocol 24 bits
+ * @RC_PROTO_RCMM32: RC-MM protocol 32 bits
  */
 enum rc_proto {
 	RC_PROTO_UNKNOWN	= 0,
@@ -218,6 +221,9 @@ enum rc_proto {
 	RC_PROTO_XMP		= 21,
 	RC_PROTO_CEC		= 22,
 	RC_PROTO_IMON		= 23,
+	RC_PROTO_RCMM12		= 24,
+	RC_PROTO_RCMM24		= 25,
+	RC_PROTO_RCMM32		= 26,
 };
 
 #endif
diff --git a/tools/include/uapi/linux/lirc.h b/tools/include/uapi/linux/lirc.h
index f189931042a7..45fcbf99d72e 100644
--- a/tools/include/uapi/linux/lirc.h
+++ b/tools/include/uapi/linux/lirc.h
@@ -133,6 +133,12 @@
 
 #define LIRC_SET_WIDEBAND_RECEIVER     _IOW('i', 0x00000023, __u32)
 
+/*
+ * Return the recording timeout, which is either set by
+ * the ioctl LIRC_SET_REC_TIMEOUT or by the kernel after setting the protocols.
+ */
+#define LIRC_GET_REC_TIMEOUT	       _IOR('i', 0x00000024, __u32)
+
 /*
  * struct lirc_scancode - decoded scancode with protocol for use with
  *	LIRC_MODE_SCANCODE
@@ -186,6 +192,9 @@ struct lirc_scancode {
  * @RC_PROTO_XMP: XMP protocol
  * @RC_PROTO_CEC: CEC protocol
  * @RC_PROTO_IMON: iMon Pad protocol
+ * @RC_PROTO_RCMM12: RC-MM protocol 12 bits
+ * @RC_PROTO_RCMM24: RC-MM protocol 24 bits
+ * @RC_PROTO_RCMM32: RC-MM protocol 32 bits
  */
 enum rc_proto {
 	RC_PROTO_UNKNOWN	= 0,
@@ -212,6 +221,9 @@ enum rc_proto {
 	RC_PROTO_XMP		= 21,
 	RC_PROTO_CEC		= 22,
 	RC_PROTO_IMON		= 23,
+	RC_PROTO_RCMM12		= 24,
+	RC_PROTO_RCMM24		= 25,
+	RC_PROTO_RCMM32		= 26,
 };
 
 #endif
diff --git a/tools/testing/selftests/ir/ir_loopback.c b/tools/testing/selftests/ir/ir_loopback.c
index 858c19caf224..570a7358942c 100644
--- a/tools/testing/selftests/ir/ir_loopback.c
+++ b/tools/testing/selftests/ir/ir_loopback.c
@@ -51,6 +51,10 @@ static const struct {
 	{ RC_PROTO_RC6_6A_32, "rc-6-6a-32", 0xffffffff, "rc-6" },
 	{ RC_PROTO_RC6_MCE, "rc-6-mce", 0x00007fff, "rc-6" },
 	{ RC_PROTO_SHARP, "sharp", 0x1fff, "sharp" },
+	{ RC_PROTO_IMON, "imon", 0x7fffffff, "imon" },
+	{ RC_PROTO_RCMM12, "rcmm-12", 0x00000fff, "rcmm" },
+	{ RC_PROTO_RCMM24, "rcmm-24", 0x00ffffff, "rcmm" },
+	{ RC_PROTO_RCMM32, "rcmm-32", 0xffffffff, "rcmm" },
 };
 
 int lirc_open(const char *rc)
@@ -139,6 +143,11 @@ int main(int argc, char **argv)
 			    (((scancode >> 8) ^ ~scancode) & 0xff) == 0)
 				continue;
 
+			if (rc_proto == RC_PROTO_RCMM32 &&
+			    (scancode & 0x000c0000) != 0x000c0000 &&
+			    scancode & 0x00008000)
+				continue;
+
 			struct lirc_scancode lsc = {
 				.rc_proto = rc_proto,
 				.scancode = scancode
-- 
cgit v1.2.3-59-g8ed1b


From 80d7da1cac62f28b3df4880e8143b39cabb4b59a Mon Sep 17 00:00:00 2001
From: Yury Norov <ynorov@caviumnetworks.com>
Date: Wed, 16 May 2018 11:18:50 +0300
Subject: asm-generic: Drop getrlimit and setrlimit syscalls from default list

The newer prlimit64 syscall provides all the functionality of getrlimit
and setrlimit syscalls and adds the pid of target process, so future
architectures won't need to include getrlimit and setrlimit.

Therefore drop getrlimit and setrlimit syscalls from the generic syscall
list unless __ARCH_WANT_SET_GET_RLIMIT is defined by the architecture's
unistd.h prior to including asm-generic/unistd.h, and adjust all
architectures using the generic syscall list to define it so that no
in-tree architectures are affected.

Cc: Arnd Bergmann <arnd@arndb.de>
Cc: linux-arch@vger.kernel.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-hexagon@vger.kernel.org
Cc: uclinux-h8-devel@lists.sourceforge.jp
Signed-off-by: Yury Norov <ynorov@caviumnetworks.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Mark Salter <msalter@redhat.com> [c6x]
Acked-by: James Hogan <james.hogan@imgtec.com> [metag]
Acked-by: Ley Foon Tan <lftan@altera.com> [nios2]
Acked-by: Stafford Horne <shorne@gmail.com> [openrisc]
Acked-by: Will Deacon <will.deacon@arm.com> [arm64]
Acked-by: Vineet Gupta <vgupta@synopsys.com> #arch/arc bits
Signed-off-by: Yury Norov <ynorov@marvell.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/arc/include/uapi/asm/unistd.h       | 1 +
 arch/arm64/include/uapi/asm/unistd.h     | 1 +
 arch/c6x/include/uapi/asm/unistd.h       | 1 +
 arch/csky/include/uapi/asm/unistd.h      | 1 +
 arch/h8300/include/uapi/asm/unistd.h     | 1 +
 arch/hexagon/include/uapi/asm/unistd.h   | 1 +
 arch/nds32/include/uapi/asm/unistd.h     | 1 +
 arch/nios2/include/uapi/asm/unistd.h     | 1 +
 arch/openrisc/include/uapi/asm/unistd.h  | 1 +
 arch/riscv/include/uapi/asm/unistd.h     | 1 +
 arch/unicore32/include/uapi/asm/unistd.h | 1 +
 include/uapi/asm-generic/unistd.h        | 5 +++++
 scripts/checksyscalls.sh                 | 5 +++++
 13 files changed, 21 insertions(+)

(limited to 'include/uapi')

diff --git a/arch/arc/include/uapi/asm/unistd.h b/arch/arc/include/uapi/asm/unistd.h
index 3b3543fd151c..6a1a62a979dd 100644
--- a/arch/arc/include/uapi/asm/unistd.h
+++ b/arch/arc/include/uapi/asm/unistd.h
@@ -18,6 +18,7 @@
 
 #define __ARCH_WANT_RENAMEAT
 #define __ARCH_WANT_STAT64
+#define __ARCH_WANT_SET_GET_RLIMIT
 #define __ARCH_WANT_SYS_EXECVE
 #define __ARCH_WANT_SYS_CLONE
 #define __ARCH_WANT_SYS_VFORK
diff --git a/arch/arm64/include/uapi/asm/unistd.h b/arch/arm64/include/uapi/asm/unistd.h
index dae1584cf017..79937de2a0cc 100644
--- a/arch/arm64/include/uapi/asm/unistd.h
+++ b/arch/arm64/include/uapi/asm/unistd.h
@@ -17,5 +17,6 @@
 
 #define __ARCH_WANT_RENAMEAT
 #define __ARCH_WANT_NEW_STAT
+#define __ARCH_WANT_SET_GET_RLIMIT
 
 #include <asm-generic/unistd.h>
diff --git a/arch/c6x/include/uapi/asm/unistd.h b/arch/c6x/include/uapi/asm/unistd.h
index 6b2fe792de9d..e3721b2cfd6a 100644
--- a/arch/c6x/include/uapi/asm/unistd.h
+++ b/arch/c6x/include/uapi/asm/unistd.h
@@ -17,6 +17,7 @@
 
 #define __ARCH_WANT_RENAMEAT
 #define __ARCH_WANT_STAT64
+#define __ARCH_WANT_SET_GET_RLIMIT
 #define __ARCH_WANT_SYS_CLONE
 
 /* Use the standard ABI for syscalls. */
diff --git a/arch/csky/include/uapi/asm/unistd.h b/arch/csky/include/uapi/asm/unistd.h
index 224c9a9ab45b..f5c83492136f 100644
--- a/arch/csky/include/uapi/asm/unistd.h
+++ b/arch/csky/include/uapi/asm/unistd.h
@@ -2,6 +2,7 @@
 // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
 
 #define __ARCH_WANT_SYS_CLONE
+#define __ARCH_WANT_SET_GET_RLIMIT
 #include <asm-generic/unistd.h>
 
 #define __NR_set_thread_area	(__NR_arch_specific_syscall + 0)
diff --git a/arch/h8300/include/uapi/asm/unistd.h b/arch/h8300/include/uapi/asm/unistd.h
index 628195823816..b9e9352f2328 100644
--- a/arch/h8300/include/uapi/asm/unistd.h
+++ b/arch/h8300/include/uapi/asm/unistd.h
@@ -2,5 +2,6 @@
 
 #define __ARCH_WANT_RENAMEAT
 #define __ARCH_WANT_STAT64
+#define __ARCH_WANT_SET_GET_RLIMIT
 
 #include <asm-generic/unistd.h>
diff --git a/arch/hexagon/include/uapi/asm/unistd.h b/arch/hexagon/include/uapi/asm/unistd.h
index c91ca7d02461..6bb392a33c35 100644
--- a/arch/hexagon/include/uapi/asm/unistd.h
+++ b/arch/hexagon/include/uapi/asm/unistd.h
@@ -30,6 +30,7 @@
 #define sys_mmap2 sys_mmap_pgoff
 #define __ARCH_WANT_RENAMEAT
 #define __ARCH_WANT_STAT64
+#define __ARCH_WANT_SET_GET_RLIMIT
 #define __ARCH_WANT_SYS_EXECVE
 #define __ARCH_WANT_SYS_CLONE
 #define __ARCH_WANT_SYS_VFORK
diff --git a/arch/nds32/include/uapi/asm/unistd.h b/arch/nds32/include/uapi/asm/unistd.h
index c2c3a3e34083..eb98d24d3190 100644
--- a/arch/nds32/include/uapi/asm/unistd.h
+++ b/arch/nds32/include/uapi/asm/unistd.h
@@ -3,6 +3,7 @@
 
 #define __ARCH_WANT_STAT64
 #define __ARCH_WANT_SYNC_FILE_RANGE2
+#define __ARCH_WANT_SET_GET_RLIMIT
 
 /* Use the standard ABI for syscalls */
 #include <asm-generic/unistd.h>
diff --git a/arch/nios2/include/uapi/asm/unistd.h b/arch/nios2/include/uapi/asm/unistd.h
index d9948d88790b..fa68e68bc26d 100644
--- a/arch/nios2/include/uapi/asm/unistd.h
+++ b/arch/nios2/include/uapi/asm/unistd.h
@@ -20,6 +20,7 @@
 
 #define __ARCH_WANT_RENAMEAT
 #define __ARCH_WANT_STAT64
+#define __ARCH_WANT_SET_GET_RLIMIT
 
 /* Use the standard ABI for syscalls */
 #include <asm-generic/unistd.h>
diff --git a/arch/openrisc/include/uapi/asm/unistd.h b/arch/openrisc/include/uapi/asm/unistd.h
index ec37df18d8ed..2e0bc0ff9f31 100644
--- a/arch/openrisc/include/uapi/asm/unistd.h
+++ b/arch/openrisc/include/uapi/asm/unistd.h
@@ -21,6 +21,7 @@
 
 #define __ARCH_WANT_RENAMEAT
 #define __ARCH_WANT_STAT64
+#define __ARCH_WANT_SET_GET_RLIMIT
 #define __ARCH_WANT_SYS_FORK
 #define __ARCH_WANT_SYS_CLONE
 
diff --git a/arch/riscv/include/uapi/asm/unistd.h b/arch/riscv/include/uapi/asm/unistd.h
index 1f3bd3ebbb0d..d9340c52e7ad 100644
--- a/arch/riscv/include/uapi/asm/unistd.h
+++ b/arch/riscv/include/uapi/asm/unistd.h
@@ -18,6 +18,7 @@
 #ifdef __LP64__
 #define __ARCH_WANT_NEW_STAT
 #endif /* __LP64__ */
+#define __ARCH_WANT_SET_GET_RLIMIT
 
 #include <asm-generic/unistd.h>
 
diff --git a/arch/unicore32/include/uapi/asm/unistd.h b/arch/unicore32/include/uapi/asm/unistd.h
index 1e8fe5941b8a..2b575c0cf177 100644
--- a/arch/unicore32/include/uapi/asm/unistd.h
+++ b/arch/unicore32/include/uapi/asm/unistd.h
@@ -12,6 +12,7 @@
  */
 
 #define __ARCH_WANT_RENAMEAT
+#define __ARCH_WANT_SET_GET_RLIMIT
 
 /* Use the standard ABI for syscalls. */
 #include <asm-generic/unistd.h>
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index b928eff3bf92..2cdf600b05fa 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -467,10 +467,15 @@ __SYSCALL(__NR_uname, sys_newuname)
 __SYSCALL(__NR_sethostname, sys_sethostname)
 #define __NR_setdomainname 162
 __SYSCALL(__NR_setdomainname, sys_setdomainname)
+
+#ifdef __ARCH_WANT_SET_GET_RLIMIT
+/* getrlimit and setrlimit are superseded with prlimit64 */
 #define __NR_getrlimit 163
 __SC_COMP(__NR_getrlimit, sys_getrlimit, compat_sys_getrlimit)
 #define __NR_setrlimit 164
 __SC_COMP(__NR_setrlimit, sys_setrlimit, compat_sys_setrlimit)
+#endif
+
 #define __NR_getrusage 165
 __SC_COMP(__NR_getrusage, sys_getrusage, compat_sys_getrusage)
 #define __NR_umask 166
diff --git a/scripts/checksyscalls.sh b/scripts/checksyscalls.sh
index cc70a64fa81f..53c5677d7e82 100755
--- a/scripts/checksyscalls.sh
+++ b/scripts/checksyscalls.sh
@@ -38,6 +38,11 @@ cat << EOF
 #define __IGNORE_lstat64	/* fstatat64 */
 #endif
 
+#ifndef __ARCH_WANT_SET_GET_RLIMIT
+#define __IGNORE_getrlimit	/* getrlimit */
+#define __IGNORE_setrlimit	/* setrlimit */
+#endif
+
 /* Missing flags argument */
 #define __IGNORE_renameat	/* renameat2 */
 
-- 
cgit v1.2.3-59-g8ed1b


From 517b773e0f612d608cbc62a08c55601bd56f73f6 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Mon, 18 Feb 2019 22:25:49 +0200
Subject: RDMA/nldev: Share with user-space object IDs

Give to the user space tools unique identifier for PD, MR, CQ and CM_ID
objects, so they can be able to query on them with .doit callbacks.

QP .doit is not supported yet, till all drivers will be updated to provide
their LQPN to be equal to their restrack ID.

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/nldev.c  | 20 ++++++++++++++++++++
 include/uapi/rdma/rdma_netlink.h |  9 +++++++++
 2 files changed, 29 insertions(+)

(limited to 'include/uapi')

diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index 9b4f891771c4..81d7ee3dcb20 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -108,6 +108,10 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
 	[RDMA_NLDEV_ATTR_DRIVER_U32]		= { .type = NLA_U32 },
 	[RDMA_NLDEV_ATTR_DRIVER_S64]		= { .type = NLA_S64 },
 	[RDMA_NLDEV_ATTR_DRIVER_U64]		= { .type = NLA_U64 },
+	[RDMA_NLDEV_ATTR_RES_PDN]		= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_RES_CQN]               = { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_RES_MRN]               = { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_RES_CM_IDN]            = { .type = NLA_U32 },
 };
 
 static int put_driver_name_print_type(struct sk_buff *msg, const char *name,
@@ -466,6 +470,9 @@ static int fill_res_cm_id_entry(struct sk_buff *msg, bool has_cap_net_admin,
 		    &cm_id->route.addr.dst_addr))
 		goto err;
 
+	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CM_IDN, res->id))
+		goto err;
+
 	if (fill_res_name_pid(msg, res))
 		goto err;
 
@@ -494,6 +501,9 @@ static int fill_res_cq_entry(struct sk_buff *msg, bool has_cap_net_admin,
 	    nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_POLL_CTX, cq->poll_ctx))
 		goto err;
 
+	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, res->id))
+		goto err;
+
 	if (fill_res_name_pid(msg, res))
 		goto err;
 
@@ -522,6 +532,9 @@ static int fill_res_mr_entry(struct sk_buff *msg, bool has_cap_net_admin,
 			      RDMA_NLDEV_ATTR_PAD))
 		goto err;
 
+	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_MRN, res->id))
+		goto err;
+
 	if (fill_res_name_pid(msg, res))
 		goto err;
 
@@ -552,6 +565,9 @@ static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin,
 			      atomic_read(&pd->usecnt), RDMA_NLDEV_ATTR_PAD))
 		goto err;
 
+	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, res->id))
+		goto err;
+
 	if (fill_res_name_pid(msg, res))
 		goto err;
 
@@ -893,6 +909,7 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = {
 		.nldev_cmd = RDMA_NLDEV_CMD_RES_CM_ID_GET,
 		.nldev_attr = RDMA_NLDEV_ATTR_RES_CM_ID,
 		.entry = RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY,
+		.id = RDMA_NLDEV_ATTR_RES_CM_IDN,
 	},
 	[RDMA_RESTRACK_CQ] = {
 		.fill_res_func = fill_res_cq_entry,
@@ -900,6 +917,7 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = {
 		.nldev_attr = RDMA_NLDEV_ATTR_RES_CQ,
 		.flags = NLDEV_PER_DEV,
 		.entry = RDMA_NLDEV_ATTR_RES_CQ_ENTRY,
+		.id = RDMA_NLDEV_ATTR_RES_CQN,
 	},
 	[RDMA_RESTRACK_MR] = {
 		.fill_res_func = fill_res_mr_entry,
@@ -907,6 +925,7 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = {
 		.nldev_attr = RDMA_NLDEV_ATTR_RES_MR,
 		.flags = NLDEV_PER_DEV,
 		.entry = RDMA_NLDEV_ATTR_RES_MR_ENTRY,
+		.id = RDMA_NLDEV_ATTR_RES_MRN,
 	},
 	[RDMA_RESTRACK_PD] = {
 		.fill_res_func = fill_res_pd_entry,
@@ -914,6 +933,7 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = {
 		.nldev_attr = RDMA_NLDEV_ATTR_RES_PD,
 		.flags = NLDEV_PER_DEV,
 		.entry = RDMA_NLDEV_ATTR_RES_PD_ENTRY,
+		.id = RDMA_NLDEV_ATTR_RES_PDN,
 	},
 };
 
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index 3a9e681e4257..43362132e0d7 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -456,6 +456,15 @@ enum rdma_nldev_attr {
 	RDMA_NLDEV_ATTR_DRIVER_S64,		/* s64 */
 	RDMA_NLDEV_ATTR_DRIVER_U64,		/* u64 */
 
+	/*
+	 * Indexes to get/set secific entry,
+	 * for QP use RDMA_NLDEV_ATTR_RES_LQPN
+	 */
+	RDMA_NLDEV_ATTR_RES_PDN,               /* u32 */
+	RDMA_NLDEV_ATTR_RES_CQN,               /* u32 */
+	RDMA_NLDEV_ATTR_RES_MRN,               /* u32 */
+	RDMA_NLDEV_ATTR_RES_CM_IDN,            /* u32 */
+
 	/*
 	 * Always the end
 	 */
-- 
cgit v1.2.3-59-g8ed1b


From c3d02788b45ab4a2d8f243b98c04b549c8193af6 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Mon, 18 Feb 2019 22:25:50 +0200
Subject: RDMA/nldev: Provide parent IDs for PD, MR and QP objects

PD, MR and QP objects have parents objects: contexts and PDs.  The exposed
parent IDs allow to correlate various objects and simplify debug
investigation.

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/nldev.c  | 18 ++++++++++++++++++
 include/uapi/rdma/rdma_netlink.h |  1 +
 2 files changed, 19 insertions(+)

(limited to 'include/uapi')

diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index 81d7ee3dcb20..e6c7cc510556 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -112,6 +112,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
 	[RDMA_NLDEV_ATTR_RES_CQN]               = { .type = NLA_U32 },
 	[RDMA_NLDEV_ATTR_RES_MRN]               = { .type = NLA_U32 },
 	[RDMA_NLDEV_ATTR_RES_CM_IDN]            = { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_RES_CTXN]              = { .type = NLA_U32 },
 };
 
 static int put_driver_name_print_type(struct sk_buff *msg, const char *name,
@@ -420,6 +421,10 @@ static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin,
 	if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, qp_attr.qp_state))
 		goto err;
 
+	if (!rdma_is_kernel_res(res) &&
+	    nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, qp->pd->res.id))
+		goto err;
+
 	if (fill_res_name_pid(msg, res))
 		goto err;
 
@@ -503,6 +508,10 @@ static int fill_res_cq_entry(struct sk_buff *msg, bool has_cap_net_admin,
 
 	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, res->id))
 		goto err;
+	if (!rdma_is_kernel_res(res) &&
+	    nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN,
+			cq->uobject->context->res.id))
+		goto err;
 
 	if (fill_res_name_pid(msg, res))
 		goto err;
@@ -535,6 +544,10 @@ static int fill_res_mr_entry(struct sk_buff *msg, bool has_cap_net_admin,
 	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_MRN, res->id))
 		goto err;
 
+	if (!rdma_is_kernel_res(res) &&
+	    nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, mr->pd->res.id))
+		goto err;
+
 	if (fill_res_name_pid(msg, res))
 		goto err;
 
@@ -568,6 +581,11 @@ static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin,
 	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, res->id))
 		goto err;
 
+	if (!rdma_is_kernel_res(res) &&
+	    nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN,
+			pd->uobject->context->res.id))
+		goto err;
+
 	if (fill_res_name_pid(msg, res))
 		goto err;
 
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index 43362132e0d7..4ebbcfb2c6ef 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -464,6 +464,7 @@ enum rdma_nldev_attr {
 	RDMA_NLDEV_ATTR_RES_CQN,               /* u32 */
 	RDMA_NLDEV_ATTR_RES_MRN,               /* u32 */
 	RDMA_NLDEV_ATTR_RES_CM_IDN,            /* u32 */
+	RDMA_NLDEV_ATTR_RES_CTXN,	       /* u32 */
 
 	/*
 	 * Always the end
-- 
cgit v1.2.3-59-g8ed1b


From c8ce48f06503eee20f189eed5b2aa736272b7344 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 18 Feb 2019 17:30:06 +0100
Subject: asm-generic: Make time32 syscall numbers optional

We don't want new architectures to even provide the old 32-bit time_t
based system calls any more, or define the syscall number macros.

Add a new __ARCH_WANT_TIME32_SYSCALLS macro that gets enabled for all
existing 32-bit architectures using the generic system call table,
so we don't change any current behavior.
Since this symbol is evaluated in user space as well, we cannot use
a Kconfig CONFIG_* macro but have to define it in uapi/asm/unistd.h.

On 64-bit architectures, the same system call numbers mostly refer to
the system calls we want to keep, as they already pass 64-bit time_t.

As new architectures no longer provide these, we need new exceptions
in checksyscalls.sh.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/arc/include/uapi/asm/unistd.h       |  1 +
 arch/arm64/include/uapi/asm/unistd.h     |  1 +
 arch/c6x/include/uapi/asm/unistd.h       |  1 +
 arch/csky/include/uapi/asm/unistd.h      |  1 +
 arch/h8300/include/uapi/asm/unistd.h     |  1 +
 arch/hexagon/include/uapi/asm/unistd.h   |  1 +
 arch/nds32/include/uapi/asm/unistd.h     |  1 +
 arch/nios2/include/uapi/asm/unistd.h     |  1 +
 arch/openrisc/include/uapi/asm/unistd.h  |  1 +
 arch/riscv/include/uapi/asm/unistd.h     |  3 +++
 arch/unicore32/include/uapi/asm/unistd.h |  1 +
 include/uapi/asm-generic/unistd.h        | 36 ++++++++++++++++++++++++++++++++
 scripts/checksyscalls.sh                 |  7 +++++++
 13 files changed, 56 insertions(+)

(limited to 'include/uapi')

diff --git a/arch/arc/include/uapi/asm/unistd.h b/arch/arc/include/uapi/asm/unistd.h
index 6a1a62a979dd..5eafa1115162 100644
--- a/arch/arc/include/uapi/asm/unistd.h
+++ b/arch/arc/include/uapi/asm/unistd.h
@@ -23,6 +23,7 @@
 #define __ARCH_WANT_SYS_CLONE
 #define __ARCH_WANT_SYS_VFORK
 #define __ARCH_WANT_SYS_FORK
+#define __ARCH_WANT_TIME32_SYSCALLS
 
 #define sys_mmap2 sys_mmap_pgoff
 
diff --git a/arch/arm64/include/uapi/asm/unistd.h b/arch/arm64/include/uapi/asm/unistd.h
index 79937de2a0cc..4703d218663a 100644
--- a/arch/arm64/include/uapi/asm/unistd.h
+++ b/arch/arm64/include/uapi/asm/unistd.h
@@ -18,5 +18,6 @@
 #define __ARCH_WANT_RENAMEAT
 #define __ARCH_WANT_NEW_STAT
 #define __ARCH_WANT_SET_GET_RLIMIT
+#define __ARCH_WANT_TIME32_SYSCALLS
 
 #include <asm-generic/unistd.h>
diff --git a/arch/c6x/include/uapi/asm/unistd.h b/arch/c6x/include/uapi/asm/unistd.h
index e3721b2cfd6a..79b724c39d9b 100644
--- a/arch/c6x/include/uapi/asm/unistd.h
+++ b/arch/c6x/include/uapi/asm/unistd.h
@@ -19,6 +19,7 @@
 #define __ARCH_WANT_STAT64
 #define __ARCH_WANT_SET_GET_RLIMIT
 #define __ARCH_WANT_SYS_CLONE
+#define __ARCH_WANT_TIME32_SYSCALLS
 
 /* Use the standard ABI for syscalls. */
 #include <asm-generic/unistd.h>
diff --git a/arch/csky/include/uapi/asm/unistd.h b/arch/csky/include/uapi/asm/unistd.h
index f5c83492136f..ec60e49cea66 100644
--- a/arch/csky/include/uapi/asm/unistd.h
+++ b/arch/csky/include/uapi/asm/unistd.h
@@ -3,6 +3,7 @@
 
 #define __ARCH_WANT_SYS_CLONE
 #define __ARCH_WANT_SET_GET_RLIMIT
+#define __ARCH_WANT_TIME32_SYSCALLS
 #include <asm-generic/unistd.h>
 
 #define __NR_set_thread_area	(__NR_arch_specific_syscall + 0)
diff --git a/arch/h8300/include/uapi/asm/unistd.h b/arch/h8300/include/uapi/asm/unistd.h
index b9e9352f2328..eb7bc0012af5 100644
--- a/arch/h8300/include/uapi/asm/unistd.h
+++ b/arch/h8300/include/uapi/asm/unistd.h
@@ -3,5 +3,6 @@
 #define __ARCH_WANT_RENAMEAT
 #define __ARCH_WANT_STAT64
 #define __ARCH_WANT_SET_GET_RLIMIT
+#define __ARCH_WANT_TIME32_SYSCALLS
 
 #include <asm-generic/unistd.h>
diff --git a/arch/hexagon/include/uapi/asm/unistd.h b/arch/hexagon/include/uapi/asm/unistd.h
index 6bb392a33c35..432c4db1b623 100644
--- a/arch/hexagon/include/uapi/asm/unistd.h
+++ b/arch/hexagon/include/uapi/asm/unistd.h
@@ -35,5 +35,6 @@
 #define __ARCH_WANT_SYS_CLONE
 #define __ARCH_WANT_SYS_VFORK
 #define __ARCH_WANT_SYS_FORK
+#define __ARCH_WANT_TIME32_SYSCALLS
 
 #include <asm-generic/unistd.h>
diff --git a/arch/nds32/include/uapi/asm/unistd.h b/arch/nds32/include/uapi/asm/unistd.h
index eb98d24d3190..4ec8f543103f 100644
--- a/arch/nds32/include/uapi/asm/unistd.h
+++ b/arch/nds32/include/uapi/asm/unistd.h
@@ -4,6 +4,7 @@
 #define __ARCH_WANT_STAT64
 #define __ARCH_WANT_SYNC_FILE_RANGE2
 #define __ARCH_WANT_SET_GET_RLIMIT
+#define __ARCH_WANT_TIME32_SYSCALLS
 
 /* Use the standard ABI for syscalls */
 #include <asm-generic/unistd.h>
diff --git a/arch/nios2/include/uapi/asm/unistd.h b/arch/nios2/include/uapi/asm/unistd.h
index fa68e68bc26d..0b4bb1d41b28 100644
--- a/arch/nios2/include/uapi/asm/unistd.h
+++ b/arch/nios2/include/uapi/asm/unistd.h
@@ -21,6 +21,7 @@
 #define __ARCH_WANT_RENAMEAT
 #define __ARCH_WANT_STAT64
 #define __ARCH_WANT_SET_GET_RLIMIT
+#define __ARCH_WANT_TIME32_SYSCALLS
 
 /* Use the standard ABI for syscalls */
 #include <asm-generic/unistd.h>
diff --git a/arch/openrisc/include/uapi/asm/unistd.h b/arch/openrisc/include/uapi/asm/unistd.h
index 2e0bc0ff9f31..566f8c4f8047 100644
--- a/arch/openrisc/include/uapi/asm/unistd.h
+++ b/arch/openrisc/include/uapi/asm/unistd.h
@@ -24,6 +24,7 @@
 #define __ARCH_WANT_SET_GET_RLIMIT
 #define __ARCH_WANT_SYS_FORK
 #define __ARCH_WANT_SYS_CLONE
+#define __ARCH_WANT_TIME32_SYSCALLS
 
 #include <asm-generic/unistd.h>
 
diff --git a/arch/riscv/include/uapi/asm/unistd.h b/arch/riscv/include/uapi/asm/unistd.h
index d9340c52e7ad..486a288b454c 100644
--- a/arch/riscv/include/uapi/asm/unistd.h
+++ b/arch/riscv/include/uapi/asm/unistd.h
@@ -19,6 +19,9 @@
 #define __ARCH_WANT_NEW_STAT
 #endif /* __LP64__ */
 #define __ARCH_WANT_SET_GET_RLIMIT
+#ifndef __LP64__
+#define __ARCH_WANT_TIME32_SYSCALLS
+#endif
 
 #include <asm-generic/unistd.h>
 
diff --git a/arch/unicore32/include/uapi/asm/unistd.h b/arch/unicore32/include/uapi/asm/unistd.h
index 2b575c0cf177..4e5e624f5d7e 100644
--- a/arch/unicore32/include/uapi/asm/unistd.h
+++ b/arch/unicore32/include/uapi/asm/unistd.h
@@ -13,6 +13,7 @@
 
 #define __ARCH_WANT_RENAMEAT
 #define __ARCH_WANT_SET_GET_RLIMIT
+#define __ARCH_WANT_TIME32_SYSCALLS
 
 /* Use the standard ABI for syscalls. */
 #include <asm-generic/unistd.h>
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 2cdf600b05fa..12cdf611d217 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -38,8 +38,10 @@ __SYSCALL(__NR_io_destroy, sys_io_destroy)
 __SC_COMP(__NR_io_submit, sys_io_submit, compat_sys_io_submit)
 #define __NR_io_cancel 3
 __SYSCALL(__NR_io_cancel, sys_io_cancel)
+#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
 #define __NR_io_getevents 4
 __SC_3264(__NR_io_getevents, sys_io_getevents_time32, sys_io_getevents)
+#endif
 
 /* fs/xattr.c */
 #define __NR_setxattr 5
@@ -222,10 +224,12 @@ __SC_COMP(__NR_pwritev, sys_pwritev, compat_sys_pwritev)
 __SYSCALL(__NR3264_sendfile, sys_sendfile64)
 
 /* fs/select.c */
+#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
 #define __NR_pselect6 72
 __SC_COMP_3264(__NR_pselect6, sys_pselect6_time32, sys_pselect6, compat_sys_pselect6_time32)
 #define __NR_ppoll 73
 __SC_COMP_3264(__NR_ppoll, sys_ppoll_time32, sys_ppoll, compat_sys_ppoll_time32)
+#endif
 
 /* fs/signalfd.c */
 #define __NR_signalfd4 74
@@ -269,16 +273,20 @@ __SC_COMP(__NR_sync_file_range, sys_sync_file_range, \
 /* fs/timerfd.c */
 #define __NR_timerfd_create 85
 __SYSCALL(__NR_timerfd_create, sys_timerfd_create)
+#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
 #define __NR_timerfd_settime 86
 __SC_3264(__NR_timerfd_settime, sys_timerfd_settime32, \
 	  sys_timerfd_settime)
 #define __NR_timerfd_gettime 87
 __SC_3264(__NR_timerfd_gettime, sys_timerfd_gettime32, \
 	  sys_timerfd_gettime)
+#endif
 
 /* fs/utimes.c */
+#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
 #define __NR_utimensat 88
 __SC_3264(__NR_utimensat, sys_utimensat_time32, sys_utimensat)
+#endif
 
 /* kernel/acct.c */
 #define __NR_acct 89
@@ -309,8 +317,10 @@ __SYSCALL(__NR_set_tid_address, sys_set_tid_address)
 __SYSCALL(__NR_unshare, sys_unshare)
 
 /* kernel/futex.c */
+#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
 #define __NR_futex 98
 __SC_3264(__NR_futex, sys_futex_time32, sys_futex)
+#endif
 #define __NR_set_robust_list 99
 __SC_COMP(__NR_set_robust_list, sys_set_robust_list, \
 	  compat_sys_set_robust_list)
@@ -319,8 +329,10 @@ __SC_COMP(__NR_get_robust_list, sys_get_robust_list, \
 	  compat_sys_get_robust_list)
 
 /* kernel/hrtimer.c */
+#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
 #define __NR_nanosleep 101
 __SC_3264(__NR_nanosleep, sys_nanosleep_time32, sys_nanosleep)
+#endif
 
 /* kernel/itimer.c */
 #define __NR_getitimer 102
@@ -341,14 +353,19 @@ __SYSCALL(__NR_delete_module, sys_delete_module)
 /* kernel/posix-timers.c */
 #define __NR_timer_create 107
 __SC_COMP(__NR_timer_create, sys_timer_create, compat_sys_timer_create)
+#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
 #define __NR_timer_gettime 108
 __SC_3264(__NR_timer_gettime, sys_timer_gettime32, sys_timer_gettime)
+#endif
 #define __NR_timer_getoverrun 109
 __SYSCALL(__NR_timer_getoverrun, sys_timer_getoverrun)
+#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
 #define __NR_timer_settime 110
 __SC_3264(__NR_timer_settime, sys_timer_settime32, sys_timer_settime)
+#endif
 #define __NR_timer_delete 111
 __SYSCALL(__NR_timer_delete, sys_timer_delete)
+#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
 #define __NR_clock_settime 112
 __SC_3264(__NR_clock_settime, sys_clock_settime32, sys_clock_settime)
 #define __NR_clock_gettime 113
@@ -358,6 +375,7 @@ __SC_3264(__NR_clock_getres, sys_clock_getres_time32, sys_clock_getres)
 #define __NR_clock_nanosleep 115
 __SC_3264(__NR_clock_nanosleep, sys_clock_nanosleep_time32, \
 	  sys_clock_nanosleep)
+#endif
 
 /* kernel/printk.c */
 #define __NR_syslog 116
@@ -388,9 +406,11 @@ __SYSCALL(__NR_sched_yield, sys_sched_yield)
 __SYSCALL(__NR_sched_get_priority_max, sys_sched_get_priority_max)
 #define __NR_sched_get_priority_min 126
 __SYSCALL(__NR_sched_get_priority_min, sys_sched_get_priority_min)
+#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
 #define __NR_sched_rr_get_interval 127
 __SC_3264(__NR_sched_rr_get_interval, sys_sched_rr_get_interval_time32, \
 	  sys_sched_rr_get_interval)
+#endif
 
 /* kernel/signal.c */
 #define __NR_restart_syscall 128
@@ -411,9 +431,11 @@ __SC_COMP(__NR_rt_sigaction, sys_rt_sigaction, compat_sys_rt_sigaction)
 __SC_COMP(__NR_rt_sigprocmask, sys_rt_sigprocmask, compat_sys_rt_sigprocmask)
 #define __NR_rt_sigpending 136
 __SC_COMP(__NR_rt_sigpending, sys_rt_sigpending, compat_sys_rt_sigpending)
+#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
 #define __NR_rt_sigtimedwait 137
 __SC_COMP_3264(__NR_rt_sigtimedwait, sys_rt_sigtimedwait_time32, \
 	  sys_rt_sigtimedwait, compat_sys_rt_sigtimedwait_time32)
+#endif
 #define __NR_rt_sigqueueinfo 138
 __SC_COMP(__NR_rt_sigqueueinfo, sys_rt_sigqueueinfo, \
 	  compat_sys_rt_sigqueueinfo)
@@ -486,12 +508,14 @@ __SYSCALL(__NR_prctl, sys_prctl)
 __SYSCALL(__NR_getcpu, sys_getcpu)
 
 /* kernel/time.c */
+#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
 #define __NR_gettimeofday 169
 __SC_COMP(__NR_gettimeofday, sys_gettimeofday, compat_sys_gettimeofday)
 #define __NR_settimeofday 170
 __SC_COMP(__NR_settimeofday, sys_settimeofday, compat_sys_settimeofday)
 #define __NR_adjtimex 171
 __SC_3264(__NR_adjtimex, sys_adjtimex_time32, sys_adjtimex)
+#endif
 
 /* kernel/timer.c */
 #define __NR_getpid 172
@@ -516,11 +540,13 @@ __SC_COMP(__NR_sysinfo, sys_sysinfo, compat_sys_sysinfo)
 __SC_COMP(__NR_mq_open, sys_mq_open, compat_sys_mq_open)
 #define __NR_mq_unlink 181
 __SYSCALL(__NR_mq_unlink, sys_mq_unlink)
+#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
 #define __NR_mq_timedsend 182
 __SC_3264(__NR_mq_timedsend, sys_mq_timedsend_time32, sys_mq_timedsend)
 #define __NR_mq_timedreceive 183
 __SC_3264(__NR_mq_timedreceive, sys_mq_timedreceive_time32, \
 	  sys_mq_timedreceive)
+#endif
 #define __NR_mq_notify 184
 __SC_COMP(__NR_mq_notify, sys_mq_notify, compat_sys_mq_notify)
 #define __NR_mq_getsetattr 185
@@ -541,8 +567,10 @@ __SC_COMP(__NR_msgsnd, sys_msgsnd, compat_sys_msgsnd)
 __SYSCALL(__NR_semget, sys_semget)
 #define __NR_semctl 191
 __SC_COMP(__NR_semctl, sys_semctl, compat_sys_semctl)
+#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
 #define __NR_semtimedop 192
 __SC_COMP(__NR_semtimedop, sys_semtimedop, sys_semtimedop_time32)
+#endif
 #define __NR_semop 193
 __SYSCALL(__NR_semop, sys_semop)
 
@@ -663,8 +691,10 @@ __SC_COMP(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo, \
 __SYSCALL(__NR_perf_event_open, sys_perf_event_open)
 #define __NR_accept4 242
 __SYSCALL(__NR_accept4, sys_accept4)
+#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
 #define __NR_recvmmsg 243
 __SC_COMP_3264(__NR_recvmmsg, sys_recvmmsg_time32, sys_recvmmsg, compat_sys_recvmmsg_time32)
+#endif
 
 /*
  * Architectures may provide up to 16 syscalls of their own
@@ -672,8 +702,10 @@ __SC_COMP_3264(__NR_recvmmsg, sys_recvmmsg_time32, sys_recvmmsg, compat_sys_recv
  */
 #define __NR_arch_specific_syscall 244
 
+#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
 #define __NR_wait4 260
 __SC_COMP(__NR_wait4, sys_wait4, compat_sys_wait4)
+#endif
 #define __NR_prlimit64 261
 __SYSCALL(__NR_prlimit64, sys_prlimit64)
 #define __NR_fanotify_init 262
@@ -684,8 +716,10 @@ __SYSCALL(__NR_fanotify_mark, sys_fanotify_mark)
 __SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at)
 #define __NR_open_by_handle_at         265
 __SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at)
+#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
 #define __NR_clock_adjtime 266
 __SC_3264(__NR_clock_adjtime, sys_clock_adjtime32, sys_clock_adjtime)
+#endif
 #define __NR_syncfs 267
 __SYSCALL(__NR_syncfs, sys_syncfs)
 #define __NR_setns 268
@@ -738,8 +772,10 @@ __SYSCALL(__NR_pkey_alloc,    sys_pkey_alloc)
 __SYSCALL(__NR_pkey_free,     sys_pkey_free)
 #define __NR_statx 291
 __SYSCALL(__NR_statx,     sys_statx)
+#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
 #define __NR_io_pgetevents 292
 __SC_COMP_3264(__NR_io_pgetevents, sys_io_pgetevents_time32, sys_io_pgetevents, compat_sys_io_pgetevents)
+#endif
 #define __NR_rseq 293
 __SYSCALL(__NR_rseq, sys_rseq)
 #define __NR_kexec_file_load 294
diff --git a/scripts/checksyscalls.sh b/scripts/checksyscalls.sh
index 53c5677d7e82..ffd635efbdca 100755
--- a/scripts/checksyscalls.sh
+++ b/scripts/checksyscalls.sh
@@ -143,6 +143,13 @@ cat << EOF
 #define __IGNORE_rt_sigtimedwait
 #define __IGNORE_futex
 #define __IGNORE_sched_rr_get_interval
+#define __IGNORE_gettimeofday
+#define __IGNORE_settimeofday
+#define __IGNORE_wait4
+#define __IGNORE_adjtimex
+#define __IGNORE_nanosleep
+#define __IGNORE_io_getevents
+#define __IGNORE_recvmmsg
 #endif
 
 /* i386-specific or historical system calls */
-- 
cgit v1.2.3-59-g8ed1b


From 2736d94f351b92749c07efef01f7c10548e39ad8 Mon Sep 17 00:00:00 2001
From: Aya Levin <ayal@mellanox.com>
Date: Tue, 22 Jan 2019 10:50:10 +0200
Subject: ethtool: Added support for 50Gbps per lane link modes

Added support for 50Gbps per lane link modes. Define various 50G, 100G
and 200G link modes using it.

Signed-off-by: Aya Levin <ayal@mellanox.com>
Reviewed-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/uapi/linux/ethtool.h | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 17be76aeb468..378c52308d89 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1453,6 +1453,21 @@ enum ethtool_link_mode_bit_indices {
 	ETHTOOL_LINK_MODE_FEC_NONE_BIT	= 49,
 	ETHTOOL_LINK_MODE_FEC_RS_BIT	= 50,
 	ETHTOOL_LINK_MODE_FEC_BASER_BIT	= 51,
+	ETHTOOL_LINK_MODE_50000baseKR_Full_BIT		 = 52,
+	ETHTOOL_LINK_MODE_50000baseSR_Full_BIT		 = 53,
+	ETHTOOL_LINK_MODE_50000baseCR_Full_BIT		 = 54,
+	ETHTOOL_LINK_MODE_50000baseLR_ER_FR_Full_BIT	 = 55,
+	ETHTOOL_LINK_MODE_50000baseDR_Full_BIT		 = 56,
+	ETHTOOL_LINK_MODE_100000baseKR2_Full_BIT	 = 57,
+	ETHTOOL_LINK_MODE_100000baseSR2_Full_BIT	 = 58,
+	ETHTOOL_LINK_MODE_100000baseCR2_Full_BIT	 = 59,
+	ETHTOOL_LINK_MODE_100000baseLR2_ER2_FR2_Full_BIT = 60,
+	ETHTOOL_LINK_MODE_100000baseDR2_Full_BIT	 = 61,
+	ETHTOOL_LINK_MODE_200000baseKR4_Full_BIT	 = 62,
+	ETHTOOL_LINK_MODE_200000baseSR4_Full_BIT	 = 63,
+	ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT = 64,
+	ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT	 = 65,
+	ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT	 = 66,
 
 	/* Last allowed bit for __ETHTOOL_LINK_MODE_LEGACY_MASK is bit
 	 * 31. Please do NOT define any SUPPORTED_* or ADVERTISED_*
@@ -1461,7 +1476,7 @@ enum ethtool_link_mode_bit_indices {
 	 */
 
 	__ETHTOOL_LINK_MODE_LAST
-	  = ETHTOOL_LINK_MODE_FEC_BASER_BIT,
+	  = ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT,
 };
 
 #define __ETHTOOL_LINK_MODE_LEGACY_MASK(base_name)	\
@@ -1569,6 +1584,7 @@ enum ethtool_link_mode_bit_indices {
 #define SPEED_50000		50000
 #define SPEED_56000		56000
 #define SPEED_100000		100000
+#define SPEED_200000		200000
 
 #define SPEED_UNKNOWN		-1
 
-- 
cgit v1.2.3-59-g8ed1b


From eeaf06ac1a5584e41cf289f8351e446bb131374b Mon Sep 17 00:00:00 2001
From: Ben Skeggs <bskeggs@redhat.com>
Date: Thu, 5 Jul 2018 12:57:12 +1000
Subject: drm/nouveau/svm: initial support for shared virtual memory

This uses HMM to mirror a process' CPU page tables into a channel's page
tables, and keep them synchronised so that both the CPU and GPU are able
to access the same memory at the same virtual address.

While this code also supports Volta/Turing, it's only enabled for Pascal
GPUs currently due to channel recovery being unreliable right now on the
later GPUs.

Signed-off-by: Ben Skeggs <bskeggs@redhat.com>
---
 drivers/gpu/drm/nouveau/Kbuild         |   1 +
 drivers/gpu/drm/nouveau/Kconfig        |  11 +
 drivers/gpu/drm/nouveau/nouveau_chan.c |   9 +
 drivers/gpu/drm/nouveau/nouveau_drm.c  |   7 +-
 drivers/gpu/drm/nouveau/nouveau_drv.h  |   2 +
 drivers/gpu/drm/nouveau/nouveau_svm.c  | 737 +++++++++++++++++++++++++++++++++
 drivers/gpu/drm/nouveau/nouveau_svm.h  |  41 ++
 drivers/gpu/drm/nouveau/nouveau_vmm.c  |   2 +
 drivers/gpu/drm/nouveau/nouveau_vmm.h  |   1 +
 include/uapi/drm/nouveau_drm.h         |   8 +
 10 files changed, 818 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/nouveau/nouveau_svm.c
 create mode 100644 drivers/gpu/drm/nouveau/nouveau_svm.h

(limited to 'include/uapi')

diff --git a/drivers/gpu/drm/nouveau/Kbuild b/drivers/gpu/drm/nouveau/Kbuild
index b17843dd050d..71bd48aafe64 100644
--- a/drivers/gpu/drm/nouveau/Kbuild
+++ b/drivers/gpu/drm/nouveau/Kbuild
@@ -30,6 +30,7 @@ nouveau-y += nouveau_vga.o
 # DRM - memory management
 nouveau-y += nouveau_bo.o
 nouveau-y += nouveau_gem.o
+nouveau-$(CONFIG_DRM_NOUVEAU_SVM) += nouveau_svm.o
 nouveau-y += nouveau_mem.o
 nouveau-y += nouveau_prime.o
 nouveau-y += nouveau_sgdma.o
diff --git a/drivers/gpu/drm/nouveau/Kconfig b/drivers/gpu/drm/nouveau/Kconfig
index 432c440223bb..e7b26a6f386f 100644
--- a/drivers/gpu/drm/nouveau/Kconfig
+++ b/drivers/gpu/drm/nouveau/Kconfig
@@ -71,3 +71,14 @@ config DRM_NOUVEAU_BACKLIGHT
 	help
 	  Say Y here if you want to control the backlight of your display
 	  (e.g. a laptop panel).
+
+config DRM_NOUVEAU_SVM
+	bool "(EXPERIMENTAL) Enable SVM (Shared Virtual Memory) support"
+	depends on ARCH_HAS_HMM
+	depends on DRM_NOUVEAU
+	depends on STAGING
+	select HMM_MIRROR
+	default n
+	help
+	  Say Y here if you want to enable experimental support for
+	  Shared Virtual Memory (SVM).
diff --git a/drivers/gpu/drm/nouveau/nouveau_chan.c b/drivers/gpu/drm/nouveau/nouveau_chan.c
index 5160a0a9c77d..282fd90b65e1 100644
--- a/drivers/gpu/drm/nouveau/nouveau_chan.c
+++ b/drivers/gpu/drm/nouveau/nouveau_chan.c
@@ -42,6 +42,7 @@
 #include "nouveau_fence.h"
 #include "nouveau_abi16.h"
 #include "nouveau_vmm.h"
+#include "nouveau_svm.h"
 
 MODULE_PARM_DESC(vram_pushbuf, "Create DMA push buffers in VRAM");
 int nouveau_vram_pushbuf;
@@ -95,6 +96,10 @@ nouveau_channel_del(struct nouveau_channel **pchan)
 
 		if (chan->fence)
 			nouveau_fence(chan->drm)->context_del(chan);
+
+		if (cli)
+			nouveau_svmm_part(chan->vmm->svmm, chan->inst);
+
 		nvif_object_fini(&chan->nvsw);
 		nvif_object_fini(&chan->gart);
 		nvif_object_fini(&chan->vram);
@@ -494,6 +499,10 @@ nouveau_channel_new(struct nouveau_drm *drm, struct nvif_device *device,
 		nouveau_channel_del(pchan);
 	}
 
+	ret = nouveau_svmm_join((*pchan)->vmm->svmm, (*pchan)->inst);
+	if (ret)
+		nouveau_channel_del(pchan);
+
 done:
 	cli->base.super = super;
 	return ret;
diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c
index 72755d4c3983..53fdfa283ee2 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_drm.c
@@ -62,6 +62,7 @@
 #include "nouveau_usif.h"
 #include "nouveau_connector.h"
 #include "nouveau_platform.h"
+#include "nouveau_svm.h"
 
 MODULE_PARM_DESC(config, "option string to pass to driver core");
 static char *nouveau_config;
@@ -549,6 +550,7 @@ nouveau_drm_device_init(struct drm_device *dev)
 
 	nouveau_debugfs_init(drm);
 	nouveau_hwmon_init(dev);
+	nouveau_svm_init(drm);
 	nouveau_fbcon_init(dev);
 	nouveau_led_init(dev);
 
@@ -592,6 +594,7 @@ nouveau_drm_device_fini(struct drm_device *dev)
 
 	nouveau_led_fini(dev);
 	nouveau_fbcon_fini(dev);
+	nouveau_svm_fini(drm);
 	nouveau_hwmon_fini(dev);
 	nouveau_debugfs_fini(drm);
 
@@ -737,6 +740,7 @@ nouveau_do_suspend(struct drm_device *dev, bool runtime)
 	struct nouveau_drm *drm = nouveau_drm(dev);
 	int ret;
 
+	nouveau_svm_suspend(drm);
 	nouveau_led_suspend(dev);
 
 	if (dev->mode_config.num_crtc) {
@@ -813,7 +817,7 @@ nouveau_do_resume(struct drm_device *dev, bool runtime)
 	}
 
 	nouveau_led_resume(dev);
-
+	nouveau_svm_resume(drm);
 	return 0;
 }
 
@@ -1033,6 +1037,7 @@ nouveau_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(NOUVEAU_GROBJ_ALLOC, nouveau_abi16_ioctl_grobj_alloc, DRM_AUTH|DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(NOUVEAU_NOTIFIEROBJ_ALLOC, nouveau_abi16_ioctl_notifierobj_alloc, DRM_AUTH|DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(NOUVEAU_GPUOBJ_FREE, nouveau_abi16_ioctl_gpuobj_free, DRM_AUTH|DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(NOUVEAU_SVM_INIT, nouveau_svmm_init, DRM_AUTH|DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_NEW, nouveau_gem_ioctl_new, DRM_AUTH|DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_PUSHBUF, nouveau_gem_ioctl_pushbuf, DRM_AUTH|DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_CPU_PREP, nouveau_gem_ioctl_cpu_prep, DRM_AUTH|DRM_RENDER_ALLOW),
diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h
index 8fcff0a4800e..1326b42f75e6 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.h
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
@@ -210,6 +210,8 @@ struct nouveau_drm {
 	bool have_disp_power_ref;
 
 	struct dev_pm_domain vga_pm_domain;
+
+	struct nouveau_svm *svm;
 };
 
 static inline struct nouveau_drm *
diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c
new file mode 100644
index 000000000000..6158e99b1dc3
--- /dev/null
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -0,0 +1,737 @@
+/*
+ * Copyright 2018 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include "nouveau_svm.h"
+#include "nouveau_drv.h"
+#include "nouveau_chan.h"
+
+#include <nvif/notify.h>
+#include <nvif/object.h>
+#include <nvif/vmm.h>
+
+#include <nvif/class.h>
+#include <nvif/clb069.h>
+#include <nvif/ifc00d.h>
+
+#include <linux/sched/mm.h>
+#include <linux/sort.h>
+#include <linux/hmm.h>
+
+struct nouveau_svm {
+	struct nouveau_drm *drm;
+	struct mutex mutex;
+	struct list_head inst;
+
+	struct nouveau_svm_fault_buffer {
+		int id;
+		struct nvif_object object;
+		u32 entries;
+		u32 getaddr;
+		u32 putaddr;
+		u32 get;
+		u32 put;
+		struct nvif_notify notify;
+
+		struct nouveau_svm_fault {
+			u64 inst;
+			u64 addr;
+			u64 time;
+			u32 engine;
+			u8  gpc;
+			u8  hub;
+			u8  access;
+			u8  client;
+			u8  fault;
+			struct nouveau_svmm *svmm;
+		} **fault;
+		int fault_nr;
+	} buffer[1];
+};
+
+#define SVM_DBG(s,f,a...) NV_DEBUG((s)->drm, "svm: "f"\n", ##a)
+#define SVM_ERR(s,f,a...) NV_WARN((s)->drm, "svm: "f"\n", ##a)
+
+struct nouveau_ivmm {
+	struct nouveau_svmm *svmm;
+	u64 inst;
+	struct list_head head;
+};
+
+static struct nouveau_ivmm *
+nouveau_ivmm_find(struct nouveau_svm *svm, u64 inst)
+{
+	struct nouveau_ivmm *ivmm;
+	list_for_each_entry(ivmm, &svm->inst, head) {
+		if (ivmm->inst == inst)
+			return ivmm;
+	}
+	return NULL;
+}
+
+struct nouveau_svmm {
+	struct nouveau_vmm *vmm;
+	struct {
+		unsigned long start;
+		unsigned long limit;
+	} unmanaged;
+
+	struct mutex mutex;
+
+	struct mm_struct *mm;
+	struct hmm_mirror mirror;
+};
+
+#define SVMM_DBG(s,f,a...)                                                     \
+	NV_DEBUG((s)->vmm->cli->drm, "svm-%p: "f"\n", (s), ##a)
+#define SVMM_ERR(s,f,a...)                                                     \
+	NV_WARN((s)->vmm->cli->drm, "svm-%p: "f"\n", (s), ##a)
+
+/* Unlink channel instance from SVMM. */
+void
+nouveau_svmm_part(struct nouveau_svmm *svmm, u64 inst)
+{
+	struct nouveau_ivmm *ivmm;
+	if (svmm) {
+		mutex_lock(&svmm->vmm->cli->drm->svm->mutex);
+		ivmm = nouveau_ivmm_find(svmm->vmm->cli->drm->svm, inst);
+		if (ivmm) {
+			list_del(&ivmm->head);
+			kfree(ivmm);
+		}
+		mutex_unlock(&svmm->vmm->cli->drm->svm->mutex);
+	}
+}
+
+/* Link channel instance to SVMM. */
+int
+nouveau_svmm_join(struct nouveau_svmm *svmm, u64 inst)
+{
+	struct nouveau_ivmm *ivmm;
+	if (svmm) {
+		if (!(ivmm = kmalloc(sizeof(*ivmm), GFP_KERNEL)))
+			return -ENOMEM;
+		ivmm->svmm = svmm;
+		ivmm->inst = inst;
+
+		mutex_lock(&svmm->vmm->cli->drm->svm->mutex);
+		list_add(&ivmm->head, &svmm->vmm->cli->drm->svm->inst);
+		mutex_unlock(&svmm->vmm->cli->drm->svm->mutex);
+	}
+	return 0;
+}
+
+/* Invalidate SVMM address-range on GPU. */
+static void
+nouveau_svmm_invalidate(struct nouveau_svmm *svmm, u64 start, u64 limit)
+{
+	if (limit > start) {
+		bool super = svmm->vmm->vmm.object.client->super;
+		svmm->vmm->vmm.object.client->super = true;
+		nvif_object_mthd(&svmm->vmm->vmm.object, NVIF_VMM_V0_PFNCLR,
+				 &(struct nvif_vmm_pfnclr_v0) {
+					.addr = start,
+					.size = limit - start,
+				 }, sizeof(struct nvif_vmm_pfnclr_v0));
+		svmm->vmm->vmm.object.client->super = super;
+	}
+}
+
+static int
+nouveau_svmm_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
+					const struct hmm_update *update)
+{
+	struct nouveau_svmm *svmm = container_of(mirror, typeof(*svmm), mirror);
+	unsigned long start = update->start;
+	unsigned long limit = update->end;
+
+	if (!update->blockable)
+		return -EAGAIN;
+
+	SVMM_DBG(svmm, "invalidate %016lx-%016lx", start, limit);
+
+	mutex_lock(&svmm->mutex);
+	if (limit > svmm->unmanaged.start && start < svmm->unmanaged.limit) {
+		if (start < svmm->unmanaged.start) {
+			nouveau_svmm_invalidate(svmm, start,
+						svmm->unmanaged.limit);
+		}
+		start = svmm->unmanaged.limit;
+	}
+
+	nouveau_svmm_invalidate(svmm, start, limit);
+	mutex_unlock(&svmm->mutex);
+	return 0;
+}
+
+static void
+nouveau_svmm_release(struct hmm_mirror *mirror)
+{
+}
+
+static const struct hmm_mirror_ops
+nouveau_svmm = {
+	.sync_cpu_device_pagetables = nouveau_svmm_sync_cpu_device_pagetables,
+	.release = nouveau_svmm_release,
+};
+
+void
+nouveau_svmm_fini(struct nouveau_svmm **psvmm)
+{
+	struct nouveau_svmm *svmm = *psvmm;
+	if (svmm) {
+		hmm_mirror_unregister(&svmm->mirror);
+		kfree(*psvmm);
+		*psvmm = NULL;
+	}
+}
+
+int
+nouveau_svmm_init(struct drm_device *dev, void *data,
+		  struct drm_file *file_priv)
+{
+	struct nouveau_cli *cli = nouveau_cli(file_priv);
+	struct nouveau_svmm *svmm;
+	struct drm_nouveau_svm_init *args = data;
+	int ret;
+
+	/* Allocate tracking for SVM-enabled VMM. */
+	if (!(svmm = kzalloc(sizeof(*svmm), GFP_KERNEL)))
+		return -ENOMEM;
+	svmm->vmm = &cli->svm;
+	svmm->unmanaged.start = args->unmanaged_addr;
+	svmm->unmanaged.limit = args->unmanaged_addr + args->unmanaged_size;
+	mutex_init(&svmm->mutex);
+
+	/* Check that SVM isn't already enabled for the client. */
+	mutex_lock(&cli->mutex);
+	if (cli->svm.cli) {
+		ret = -EBUSY;
+		goto done;
+	}
+
+	/* Allocate a new GPU VMM that can support SVM (managed by the
+	 * client, with replayable faults enabled).
+	 *
+	 * All future channel/memory allocations will make use of this
+	 * VMM instead of the standard one.
+	 */
+	ret = nvif_vmm_init(&cli->mmu, cli->vmm.vmm.object.oclass, true,
+			    args->unmanaged_addr, args->unmanaged_size,
+			    &(struct gp100_vmm_v0) {
+				.fault_replay = true,
+			    }, sizeof(struct gp100_vmm_v0), &cli->svm.vmm);
+	if (ret)
+		goto done;
+
+	/* Enable HMM mirroring of CPU address-space to VMM. */
+	svmm->mm = get_task_mm(current);
+	down_write(&svmm->mm->mmap_sem);
+	svmm->mirror.ops = &nouveau_svmm;
+	ret = hmm_mirror_register(&svmm->mirror, svmm->mm);
+	if (ret == 0) {
+		cli->svm.svmm = svmm;
+		cli->svm.cli = cli;
+	}
+	up_write(&svmm->mm->mmap_sem);
+	mmput(svmm->mm);
+
+done:
+	if (ret)
+		nouveau_svmm_fini(&svmm);
+	mutex_unlock(&cli->mutex);
+	return ret;
+}
+
+static const u64
+nouveau_svm_pfn_flags[HMM_PFN_FLAG_MAX] = {
+	[HMM_PFN_VALID         ] = NVIF_VMM_PFNMAP_V0_V,
+	[HMM_PFN_WRITE         ] = NVIF_VMM_PFNMAP_V0_W,
+	[HMM_PFN_DEVICE_PRIVATE] = NVIF_VMM_PFNMAP_V0_VRAM,
+};
+
+static const u64
+nouveau_svm_pfn_values[HMM_PFN_VALUE_MAX] = {
+	[HMM_PFN_ERROR  ] = ~NVIF_VMM_PFNMAP_V0_V,
+	[HMM_PFN_NONE   ] =  NVIF_VMM_PFNMAP_V0_NONE,
+	[HMM_PFN_SPECIAL] = ~NVIF_VMM_PFNMAP_V0_V,
+};
+
+/* Issue fault replay for GPU to retry accesses that faulted previously. */
+static void
+nouveau_svm_fault_replay(struct nouveau_svm *svm)
+{
+	SVM_DBG(svm, "replay");
+	WARN_ON(nvif_object_mthd(&svm->drm->client.vmm.vmm.object,
+				 GP100_VMM_VN_FAULT_REPLAY,
+				 &(struct gp100_vmm_fault_replay_vn) {},
+				 sizeof(struct gp100_vmm_fault_replay_vn)));
+}
+
+/* Cancel a replayable fault that could not be handled.
+ *
+ * Cancelling the fault will trigger recovery to reset the engine
+ * and kill the offending channel (ie. GPU SIGSEGV).
+ */
+static void
+nouveau_svm_fault_cancel(struct nouveau_svm *svm,
+			 u64 inst, u8 hub, u8 gpc, u8 client)
+{
+	SVM_DBG(svm, "cancel %016llx %d %02x %02x", inst, hub, gpc, client);
+	WARN_ON(nvif_object_mthd(&svm->drm->client.vmm.vmm.object,
+				 GP100_VMM_VN_FAULT_CANCEL,
+				 &(struct gp100_vmm_fault_cancel_v0) {
+					.hub = hub,
+					.gpc = gpc,
+					.client = client,
+					.inst = inst,
+				 }, sizeof(struct gp100_vmm_fault_cancel_v0)));
+}
+
+static void
+nouveau_svm_fault_cancel_fault(struct nouveau_svm *svm,
+			       struct nouveau_svm_fault *fault)
+{
+	nouveau_svm_fault_cancel(svm, fault->inst,
+				      fault->hub,
+				      fault->gpc,
+				      fault->client);
+}
+
+static int
+nouveau_svm_fault_cmp(const void *a, const void *b)
+{
+	const struct nouveau_svm_fault *fa = *(struct nouveau_svm_fault **)a;
+	const struct nouveau_svm_fault *fb = *(struct nouveau_svm_fault **)b;
+	int ret;
+	if ((ret = (s64)fa->inst - fb->inst))
+		return ret;
+	if ((ret = (s64)fa->addr - fb->addr))
+		return ret;
+	/*XXX: atomic? */
+	return (fa->access == 0 || fa->access == 3) -
+	       (fb->access == 0 || fb->access == 3);
+}
+
+static void
+nouveau_svm_fault_cache(struct nouveau_svm *svm,
+			struct nouveau_svm_fault_buffer *buffer, u32 offset)
+{
+	struct nvif_object *memory = &buffer->object;
+	const u32 instlo = nvif_rd32(memory, offset + 0x00);
+	const u32 insthi = nvif_rd32(memory, offset + 0x04);
+	const u32 addrlo = nvif_rd32(memory, offset + 0x08);
+	const u32 addrhi = nvif_rd32(memory, offset + 0x0c);
+	const u32 timelo = nvif_rd32(memory, offset + 0x10);
+	const u32 timehi = nvif_rd32(memory, offset + 0x14);
+	const u32 engine = nvif_rd32(memory, offset + 0x18);
+	const u32   info = nvif_rd32(memory, offset + 0x1c);
+	const u64   inst = (u64)insthi << 32 | instlo;
+	const u8     gpc = (info & 0x1f000000) >> 24;
+	const u8     hub = (info & 0x00100000) >> 20;
+	const u8  client = (info & 0x00007f00) >> 8;
+	struct nouveau_svm_fault *fault;
+
+	//XXX: i think we're supposed to spin waiting */
+	if (WARN_ON(!(info & 0x80000000)))
+		return;
+
+	nvif_mask(memory, offset + 0x1c, 0x80000000, 0x00000000);
+
+	if (!buffer->fault[buffer->fault_nr]) {
+		fault = kmalloc(sizeof(*fault), GFP_KERNEL);
+		if (WARN_ON(!fault)) {
+			nouveau_svm_fault_cancel(svm, inst, hub, gpc, client);
+			return;
+		}
+		buffer->fault[buffer->fault_nr] = fault;
+	}
+
+	fault = buffer->fault[buffer->fault_nr++];
+	fault->inst   = inst;
+	fault->addr   = (u64)addrhi << 32 | addrlo;
+	fault->time   = (u64)timehi << 32 | timelo;
+	fault->engine = engine;
+	fault->gpc    = gpc;
+	fault->hub    = hub;
+	fault->access = (info & 0x000f0000) >> 16;
+	fault->client = client;
+	fault->fault  = (info & 0x0000001f);
+
+	SVM_DBG(svm, "fault %016llx %016llx %02x",
+		fault->inst, fault->addr, fault->access);
+}
+
+static int
+nouveau_svm_fault(struct nvif_notify *notify)
+{
+	struct nouveau_svm_fault_buffer *buffer =
+		container_of(notify, typeof(*buffer), notify);
+	struct nouveau_svm *svm =
+		container_of(buffer, typeof(*svm), buffer[buffer->id]);
+	struct nvif_object *device = &svm->drm->client.device.object;
+	struct nouveau_svmm *svmm;
+	struct {
+		struct {
+			struct nvif_ioctl_v0 i;
+			struct nvif_ioctl_mthd_v0 m;
+			struct nvif_vmm_pfnmap_v0 p;
+		} i;
+		u64 phys[16];
+	} args;
+	struct hmm_range range;
+	struct vm_area_struct *vma;
+	u64 inst, start, limit;
+	int fi, fn, pi, fill;
+	int replay = 0, ret;
+
+	/* Parse available fault buffer entries into a cache, and update
+	 * the GET pointer so HW can reuse the entries.
+	 */
+	SVM_DBG(svm, "fault handler");
+	if (buffer->get == buffer->put) {
+		buffer->put = nvif_rd32(device, buffer->putaddr);
+		buffer->get = nvif_rd32(device, buffer->getaddr);
+		if (buffer->get == buffer->put)
+			return NVIF_NOTIFY_KEEP;
+	}
+	buffer->fault_nr = 0;
+
+	SVM_DBG(svm, "get %08x put %08x", buffer->get, buffer->put);
+	while (buffer->get != buffer->put) {
+		nouveau_svm_fault_cache(svm, buffer, buffer->get * 0x20);
+		if (++buffer->get == buffer->entries)
+			buffer->get = 0;
+	}
+	nvif_wr32(device, buffer->getaddr, buffer->get);
+	SVM_DBG(svm, "%d fault(s) pending", buffer->fault_nr);
+
+	/* Sort parsed faults by instance pointer to prevent unnecessary
+	 * instance to SVMM translations, followed by address and access
+	 * type to reduce the amount of work when handling the faults.
+	 */
+	sort(buffer->fault, buffer->fault_nr, sizeof(*buffer->fault),
+	     nouveau_svm_fault_cmp, NULL);
+
+	/* Lookup SVMM structure for each unique instance pointer. */
+	mutex_lock(&svm->mutex);
+	for (fi = 0, svmm = NULL; fi < buffer->fault_nr; fi++) {
+		if (!svmm || buffer->fault[fi]->inst != inst) {
+			struct nouveau_ivmm *ivmm =
+				nouveau_ivmm_find(svm, buffer->fault[fi]->inst);
+			svmm = ivmm ? ivmm->svmm : NULL;
+			inst = buffer->fault[fi]->inst;
+			SVM_DBG(svm, "inst %016llx -> svm-%p", inst, svmm);
+		}
+		buffer->fault[fi]->svmm = svmm;
+	}
+	mutex_unlock(&svm->mutex);
+
+	/* Process list of faults. */
+	args.i.i.version = 0;
+	args.i.i.type = NVIF_IOCTL_V0_MTHD;
+	args.i.m.version = 0;
+	args.i.m.method = NVIF_VMM_V0_PFNMAP;
+	args.i.p.version = 0;
+
+	for (fi = 0; fn = fi + 1, fi < buffer->fault_nr; fi = fn) {
+		/* Cancel any faults from non-SVM channels. */
+		if (!(svmm = buffer->fault[fi]->svmm)) {
+			nouveau_svm_fault_cancel_fault(svm, buffer->fault[fi]);
+			continue;
+		}
+		SVMM_DBG(svmm, "addr %016llx", buffer->fault[fi]->addr);
+
+		/* We try and group handling of faults within a small
+		 * window into a single update.
+		 */
+		start = buffer->fault[fi]->addr;
+		limit = start + (ARRAY_SIZE(args.phys) << PAGE_SHIFT);
+		if (start < svmm->unmanaged.limit)
+			limit = min_t(u64, limit, svmm->unmanaged.start);
+		else
+		if (limit > svmm->unmanaged.start)
+			start = max_t(u64, start, svmm->unmanaged.limit);
+		SVMM_DBG(svmm, "wndw %016llx-%016llx", start, limit);
+
+		/* Intersect fault window with the CPU VMA, cancelling
+		 * the fault if the address is invalid.
+		 */
+		down_read(&svmm->mm->mmap_sem);
+		vma = find_vma_intersection(svmm->mm, start, limit);
+		if (!vma) {
+			SVMM_ERR(svmm, "wndw %016llx-%016llx", start, limit);
+			up_read(&svmm->mm->mmap_sem);
+			nouveau_svm_fault_cancel_fault(svm, buffer->fault[fi]);
+			continue;
+		}
+		start = max_t(u64, start, vma->vm_start);
+		limit = min_t(u64, limit, vma->vm_end);
+		SVMM_DBG(svmm, "wndw %016llx-%016llx", start, limit);
+
+		if (buffer->fault[fi]->addr != start) {
+			SVMM_ERR(svmm, "addr %016llx", buffer->fault[fi]->addr);
+			up_read(&svmm->mm->mmap_sem);
+			nouveau_svm_fault_cancel_fault(svm, buffer->fault[fi]);
+			continue;
+		}
+
+		/* Prepare the GPU-side update of all pages within the
+		 * fault window, determining required pages and access
+		 * permissions based on pending faults.
+		 */
+		args.i.p.page = PAGE_SHIFT;
+		args.i.p.addr = start;
+		for (fn = fi, pi = 0;;) {
+			/* Determine required permissions based on GPU fault
+			 * access flags.
+			 *XXX: atomic?
+			 */
+			if (buffer->fault[fn]->access != 0 /* READ. */ &&
+			    buffer->fault[fn]->access != 3 /* PREFETCH. */) {
+				args.phys[pi++] = NVIF_VMM_PFNMAP_V0_V |
+						  NVIF_VMM_PFNMAP_V0_W;
+			} else {
+				args.phys[pi++] = NVIF_VMM_PFNMAP_V0_V;
+			}
+			args.i.p.size = pi << PAGE_SHIFT;
+
+			/* It's okay to skip over duplicate addresses from the
+			 * same SVMM as faults are ordered by access type such
+			 * that only the first one needs to be handled.
+			 *
+			 * ie. WRITE faults appear first, thus any handling of
+			 * pending READ faults will already be satisfied.
+			 */
+			while (++fn < buffer->fault_nr &&
+			       buffer->fault[fn]->svmm == svmm &&
+			       buffer->fault[fn    ]->addr ==
+			       buffer->fault[fn - 1]->addr);
+
+			/* If the next fault is outside the window, or all GPU
+			 * faults have been dealt with, we're done here.
+			 */
+			if (fn >= buffer->fault_nr ||
+			    buffer->fault[fn]->svmm != svmm ||
+			    buffer->fault[fn]->addr >= limit)
+				break;
+
+			/* Fill in the gap between this fault and the next. */
+			fill = (buffer->fault[fn    ]->addr -
+				buffer->fault[fn - 1]->addr) >> PAGE_SHIFT;
+			while (--fill)
+				args.phys[pi++] = NVIF_VMM_PFNMAP_V0_NONE;
+		}
+
+		SVMM_DBG(svmm, "wndw %016llx-%016llx covering %d fault(s)",
+			 args.i.p.addr,
+			 args.i.p.addr + args.i.p.size, fn - fi);
+
+		/* Have HMM fault pages within the fault window to the GPU. */
+		range.vma = vma;
+		range.start = args.i.p.addr;
+		range.end = args.i.p.addr + args.i.p.size;
+		range.pfns = args.phys;
+		range.flags = nouveau_svm_pfn_flags;
+		range.values = nouveau_svm_pfn_values;
+		range.pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT;
+again:
+		ret = hmm_vma_fault(&range, true);
+		if (ret == 0) {
+			mutex_lock(&svmm->mutex);
+			if (!hmm_vma_range_done(&range)) {
+				mutex_unlock(&svmm->mutex);
+				goto again;
+			}
+
+			svmm->vmm->vmm.object.client->super = true;
+			ret = nvif_object_ioctl(&svmm->vmm->vmm.object,
+						&args, sizeof(args.i) +
+						pi * sizeof(args.phys[0]),
+						NULL);
+			svmm->vmm->vmm.object.client->super = false;
+			mutex_unlock(&svmm->mutex);
+		}
+		up_read(&svmm->mm->mmap_sem);
+
+		/* Cancel any faults in the window whose pages didn't manage
+		 * to keep their valid bit, or stay writeable when required.
+		 *
+		 * If handling failed completely, cancel all faults.
+		 */
+		while (fi < fn) {
+			struct nouveau_svm_fault *fault = buffer->fault[fi++];
+			pi = (fault->addr - range.start) >> PAGE_SHIFT;
+			if (ret ||
+			     !(range.pfns[pi] & NVIF_VMM_PFNMAP_V0_V) ||
+			    (!(range.pfns[pi] & NVIF_VMM_PFNMAP_V0_W) &&
+			     fault->access != 0 && fault->access != 3)) {
+				nouveau_svm_fault_cancel_fault(svm, fault);
+				continue;
+			}
+			replay++;
+		}
+	}
+
+	/* Issue fault replay to the GPU. */
+	if (replay)
+		nouveau_svm_fault_replay(svm);
+	return NVIF_NOTIFY_KEEP;
+}
+
+static void
+nouveau_svm_fault_buffer_fini(struct nouveau_svm *svm, int id)
+{
+	struct nouveau_svm_fault_buffer *buffer = &svm->buffer[id];
+	nvif_notify_put(&buffer->notify);
+}
+
+static int
+nouveau_svm_fault_buffer_init(struct nouveau_svm *svm, int id)
+{
+	struct nouveau_svm_fault_buffer *buffer = &svm->buffer[id];
+	struct nvif_object *device = &svm->drm->client.device.object;
+	buffer->get = nvif_rd32(device, buffer->getaddr);
+	buffer->put = nvif_rd32(device, buffer->putaddr);
+	SVM_DBG(svm, "get %08x put %08x (init)", buffer->get, buffer->put);
+	return nvif_notify_get(&buffer->notify);
+}
+
+static void
+nouveau_svm_fault_buffer_dtor(struct nouveau_svm *svm, int id)
+{
+	struct nouveau_svm_fault_buffer *buffer = &svm->buffer[id];
+	int i;
+
+	if (buffer->fault) {
+		for (i = 0; buffer->fault[i] && i < buffer->entries; i++)
+			kfree(buffer->fault[i]);
+		kvfree(buffer->fault);
+	}
+
+	nouveau_svm_fault_buffer_fini(svm, id);
+
+	nvif_notify_fini(&buffer->notify);
+	nvif_object_fini(&buffer->object);
+}
+
+static int
+nouveau_svm_fault_buffer_ctor(struct nouveau_svm *svm, s32 oclass, int id)
+{
+	struct nouveau_svm_fault_buffer *buffer = &svm->buffer[id];
+	struct nouveau_drm *drm = svm->drm;
+	struct nvif_object *device = &drm->client.device.object;
+	struct nvif_clb069_v0 args = {};
+	int ret;
+
+	buffer->id = id;
+
+	ret = nvif_object_init(device, 0, oclass, &args, sizeof(args),
+			       &buffer->object);
+	if (ret < 0) {
+		SVM_ERR(svm, "Fault buffer allocation failed: %d", ret);
+		return ret;
+	}
+
+	nvif_object_map(&buffer->object, NULL, 0);
+	buffer->entries = args.entries;
+	buffer->getaddr = args.get;
+	buffer->putaddr = args.put;
+
+	ret = nvif_notify_init(&buffer->object, nouveau_svm_fault, true,
+			       NVB069_V0_NTFY_FAULT, NULL, 0, 0,
+			       &buffer->notify);
+	if (ret)
+		return ret;
+
+	buffer->fault = kvzalloc(sizeof(*buffer->fault) * buffer->entries, GFP_KERNEL);
+	if (!buffer->fault)
+		return -ENOMEM;
+
+	return nouveau_svm_fault_buffer_init(svm, id);
+}
+
+void
+nouveau_svm_resume(struct nouveau_drm *drm)
+{
+	struct nouveau_svm *svm = drm->svm;
+	if (svm)
+		nouveau_svm_fault_buffer_init(svm, 0);
+}
+
+void
+nouveau_svm_suspend(struct nouveau_drm *drm)
+{
+	struct nouveau_svm *svm = drm->svm;
+	if (svm)
+		nouveau_svm_fault_buffer_fini(svm, 0);
+}
+
+void
+nouveau_svm_fini(struct nouveau_drm *drm)
+{
+	struct nouveau_svm *svm = drm->svm;
+	if (svm) {
+		nouveau_svm_fault_buffer_dtor(svm, 0);
+		kfree(drm->svm);
+		drm->svm = NULL;
+	}
+}
+
+void
+nouveau_svm_init(struct nouveau_drm *drm)
+{
+	static const struct nvif_mclass buffers[] = {
+		{   VOLTA_FAULT_BUFFER_A, 0 },
+		{ MAXWELL_FAULT_BUFFER_A, 0 },
+		{}
+	};
+	struct nouveau_svm *svm;
+	int ret;
+
+	/* Disable on Volta and newer until channel recovery is fixed,
+	 * otherwise clients will have a trivial way to trash the GPU
+	 * for everyone.
+	 */
+	if (drm->client.device.info.family > NV_DEVICE_INFO_V0_PASCAL)
+		return;
+
+	if (!(drm->svm = svm = kzalloc(sizeof(*drm->svm), GFP_KERNEL)))
+		return;
+
+	drm->svm->drm = drm;
+	mutex_init(&drm->svm->mutex);
+	INIT_LIST_HEAD(&drm->svm->inst);
+
+	ret = nvif_mclass(&drm->client.device.object, buffers);
+	if (ret < 0) {
+		SVM_DBG(svm, "No supported fault buffer class");
+		nouveau_svm_fini(drm);
+		return;
+	}
+
+	ret = nouveau_svm_fault_buffer_ctor(svm, buffers[ret].oclass, 0);
+	if (ret) {
+		nouveau_svm_fini(drm);
+		return;
+	}
+
+	SVM_DBG(svm, "Initialised");
+}
diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.h b/drivers/gpu/drm/nouveau/nouveau_svm.h
new file mode 100644
index 000000000000..0379a1c316ca
--- /dev/null
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.h
@@ -0,0 +1,41 @@
+#ifndef __NOUVEAU_SVM_H__
+#define __NOUVEAU_SVM_H__
+#include <nvif/os.h>
+struct drm_device;
+struct drm_file;
+struct nouveau_drm;
+
+struct nouveau_svmm;
+
+#if IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM)
+void nouveau_svm_init(struct nouveau_drm *);
+void nouveau_svm_fini(struct nouveau_drm *);
+void nouveau_svm_suspend(struct nouveau_drm *);
+void nouveau_svm_resume(struct nouveau_drm *);
+
+int nouveau_svmm_init(struct drm_device *, void *, struct drm_file *);
+void nouveau_svmm_fini(struct nouveau_svmm **);
+int nouveau_svmm_join(struct nouveau_svmm *, u64 inst);
+void nouveau_svmm_part(struct nouveau_svmm *, u64 inst);
+#else /* IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM) */
+static inline void nouveau_svm_init(struct nouveau_drm *drm) {}
+static inline void nouveau_svm_fini(struct nouveau_drm *drm) {}
+static inline void nouveau_svm_suspend(struct nouveau_drm *drm) {}
+static inline void nouveau_svm_resume(struct nouveau_drm *drm) {}
+
+static inline int nouveau_svmm_init(struct drm_device *device, void *p,
+				    struct drm_file *file)
+{
+	return -ENOSYS;
+}
+
+static inline void nouveau_svmm_fini(struct nouveau_svmm **svmmp) {}
+
+static inline int nouveau_svmm_join(struct nouveau_svmm *svmm, u64 inst)
+{
+	return 0;
+}
+
+static inline void nouveau_svmm_part(struct nouveau_svmm *svmm, u64 inst) {}
+#endif /* IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM) */
+#endif
diff --git a/drivers/gpu/drm/nouveau/nouveau_vmm.c b/drivers/gpu/drm/nouveau/nouveau_vmm.c
index 724d02d7c049..77061182a1cf 100644
--- a/drivers/gpu/drm/nouveau/nouveau_vmm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_vmm.c
@@ -22,6 +22,7 @@
 #include "nouveau_vmm.h"
 #include "nouveau_drv.h"
 #include "nouveau_bo.h"
+#include "nouveau_svm.h"
 #include "nouveau_mem.h"
 
 void
@@ -119,6 +120,7 @@ done:
 void
 nouveau_vmm_fini(struct nouveau_vmm *vmm)
 {
+	nouveau_svmm_fini(&vmm->svmm);
 	nvif_vmm_fini(&vmm->vmm);
 	vmm->cli = NULL;
 }
diff --git a/drivers/gpu/drm/nouveau/nouveau_vmm.h b/drivers/gpu/drm/nouveau/nouveau_vmm.h
index ede872f6f668..2b98d975f37e 100644
--- a/drivers/gpu/drm/nouveau/nouveau_vmm.h
+++ b/drivers/gpu/drm/nouveau/nouveau_vmm.h
@@ -25,6 +25,7 @@ void nouveau_vma_unmap(struct nouveau_vma *);
 struct nouveau_vmm {
 	struct nouveau_cli *cli;
 	struct nvif_vmm vmm;
+	struct nouveau_svmm *svmm;
 };
 
 int nouveau_vmm_init(struct nouveau_cli *, s32 oclass, struct nouveau_vmm *);
diff --git a/include/uapi/drm/nouveau_drm.h b/include/uapi/drm/nouveau_drm.h
index 259588a4b61b..afac182c80d8 100644
--- a/include/uapi/drm/nouveau_drm.h
+++ b/include/uapi/drm/nouveau_drm.h
@@ -133,12 +133,20 @@ struct drm_nouveau_gem_cpu_fini {
 #define DRM_NOUVEAU_NOTIFIEROBJ_ALLOC  0x05 /* deprecated */
 #define DRM_NOUVEAU_GPUOBJ_FREE        0x06 /* deprecated */
 #define DRM_NOUVEAU_NVIF               0x07
+#define DRM_NOUVEAU_SVM_INIT           0x08
 #define DRM_NOUVEAU_GEM_NEW            0x40
 #define DRM_NOUVEAU_GEM_PUSHBUF        0x41
 #define DRM_NOUVEAU_GEM_CPU_PREP       0x42
 #define DRM_NOUVEAU_GEM_CPU_FINI       0x43
 #define DRM_NOUVEAU_GEM_INFO           0x44
 
+struct drm_nouveau_svm_init {
+	__u64 unmanaged_addr;
+	__u64 unmanaged_size;
+};
+
+#define DRM_IOCTL_NOUVEAU_SVM_INIT           DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_SVM_INIT, struct drm_nouveau_svm_init)
+
 #define DRM_IOCTL_NOUVEAU_GEM_NEW            DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_GEM_NEW, struct drm_nouveau_gem_new)
 #define DRM_IOCTL_NOUVEAU_GEM_PUSHBUF        DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_GEM_PUSHBUF, struct drm_nouveau_gem_pushbuf)
 #define DRM_IOCTL_NOUVEAU_GEM_CPU_PREP       DRM_IOW (DRM_COMMAND_BASE + DRM_NOUVEAU_GEM_CPU_PREP, struct drm_nouveau_gem_cpu_prep)
-- 
cgit v1.2.3-59-g8ed1b


From f180bf12ac061f093abb9247505f661817973cae Mon Sep 17 00:00:00 2001
From: Jérôme Glisse <jglisse@redhat.com>
Date: Tue, 7 Aug 2018 16:13:16 -0400
Subject: drm/nouveau/svm: new ioctl to migrate process memory to GPU memory
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This add an ioctl to migrate a range of process address space to the
device memory. On platform without cache coherent bus (x86, ARM, ...)
this means that CPU can not access that range directly, instead CPU
will fault which will migrate the memory back to system memory.

This is behind a staging flag so that we can evolve the API.

Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
---
 drivers/gpu/drm/nouveau/nouveau_drm.c |  1 +
 drivers/gpu/drm/nouveau/nouveau_svm.c | 96 +++++++++++++++++++++++++++++++++++
 drivers/gpu/drm/nouveau/nouveau_svm.h |  7 +++
 include/uapi/drm/nouveau_drm.h        | 43 ++++++++++++++++
 4 files changed, 147 insertions(+)

(limited to 'include/uapi')

diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c
index ee2c4d498d7d..5020265bfbd9 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_drm.c
@@ -1043,6 +1043,7 @@ nouveau_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(NOUVEAU_NOTIFIEROBJ_ALLOC, nouveau_abi16_ioctl_notifierobj_alloc, DRM_AUTH|DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(NOUVEAU_GPUOBJ_FREE, nouveau_abi16_ioctl_gpuobj_free, DRM_AUTH|DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(NOUVEAU_SVM_INIT, nouveau_svmm_init, DRM_AUTH|DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(NOUVEAU_SVM_BIND, nouveau_svmm_bind, DRM_AUTH|DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_NEW, nouveau_gem_ioctl_new, DRM_AUTH|DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_PUSHBUF, nouveau_gem_ioctl_pushbuf, DRM_AUTH|DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_CPU_PREP, nouveau_gem_ioctl_cpu_prep, DRM_AUTH|DRM_RENDER_ALLOW),
diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c
index 3ba980d80a1a..93ed43c413f0 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -22,6 +22,7 @@
 #include "nouveau_svm.h"
 #include "nouveau_drv.h"
 #include "nouveau_chan.h"
+#include "nouveau_dmem.h"
 
 #include <nvif/notify.h>
 #include <nvif/object.h>
@@ -104,6 +105,101 @@ struct nouveau_svmm {
 #define SVMM_ERR(s,f,a...)                                                     \
 	NV_WARN((s)->vmm->cli->drm, "svm-%p: "f"\n", (s), ##a)
 
+int
+nouveau_svmm_bind(struct drm_device *dev, void *data,
+		  struct drm_file *file_priv)
+{
+	struct nouveau_cli *cli = nouveau_cli(file_priv);
+	struct drm_nouveau_svm_bind *args = data;
+	unsigned target, cmd, priority;
+	unsigned long addr, end, size;
+	struct mm_struct *mm;
+
+	args->va_start &= PAGE_MASK;
+	args->va_end &= PAGE_MASK;
+
+	/* Sanity check arguments */
+	if (args->reserved0 || args->reserved1)
+		return -EINVAL;
+	if (args->header & (~NOUVEAU_SVM_BIND_VALID_MASK))
+		return -EINVAL;
+	if (args->va_start >= args->va_end)
+		return -EINVAL;
+	if (!args->npages)
+		return -EINVAL;
+
+	cmd = args->header >> NOUVEAU_SVM_BIND_COMMAND_SHIFT;
+	cmd &= NOUVEAU_SVM_BIND_COMMAND_MASK;
+	switch (cmd) {
+	case NOUVEAU_SVM_BIND_COMMAND__MIGRATE:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	priority = args->header >> NOUVEAU_SVM_BIND_PRIORITY_SHIFT;
+	priority &= NOUVEAU_SVM_BIND_PRIORITY_MASK;
+
+	/* FIXME support CPU target ie all target value < GPU_VRAM */
+	target = args->header >> NOUVEAU_SVM_BIND_TARGET_SHIFT;
+	target &= NOUVEAU_SVM_BIND_TARGET_MASK;
+	switch (target) {
+	case NOUVEAU_SVM_BIND_TARGET__GPU_VRAM:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/*
+	 * FIXME: For now refuse non 0 stride, we need to change the migrate
+	 * kernel function to handle stride to avoid to create a mess within
+	 * each device driver.
+	 */
+	if (args->stride)
+		return -EINVAL;
+
+	size = ((unsigned long)args->npages) << PAGE_SHIFT;
+	if ((args->va_start + size) <= args->va_start)
+		return -EINVAL;
+	if ((args->va_start + size) > args->va_end)
+		return -EINVAL;
+
+	/*
+	 * Ok we are ask to do something sane, for now we only support migrate
+	 * commands but we will add things like memory policy (what to do on
+	 * page fault) and maybe some other commands.
+	 */
+
+	mm = get_task_mm(current);
+	down_read(&mm->mmap_sem);
+
+	for (addr = args->va_start, end = args->va_start + size; addr < end;) {
+		struct vm_area_struct *vma;
+		unsigned long next;
+
+		vma = find_vma_intersection(mm, addr, end);
+		if (!vma)
+			break;
+
+		next = min(vma->vm_end, end);
+		/* This is a best effort so we ignore errors */
+		nouveau_dmem_migrate_vma(cli->drm, vma, addr, next);
+		addr = next;
+	}
+
+	/*
+	 * FIXME Return the number of page we have migrated, again we need to
+	 * update the migrate API to return that information so that we can
+	 * report it to user space.
+	 */
+	args->result = 0;
+
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+
+	return 0;
+}
+
 /* Unlink channel instance from SVMM. */
 void
 nouveau_svmm_part(struct nouveau_svmm *svmm, u64 inst)
diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.h b/drivers/gpu/drm/nouveau/nouveau_svm.h
index 0379a1c316ca..e839d8189461 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.h
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.h
@@ -17,6 +17,7 @@ int nouveau_svmm_init(struct drm_device *, void *, struct drm_file *);
 void nouveau_svmm_fini(struct nouveau_svmm **);
 int nouveau_svmm_join(struct nouveau_svmm *, u64 inst);
 void nouveau_svmm_part(struct nouveau_svmm *, u64 inst);
+int nouveau_svmm_bind(struct drm_device *, void *, struct drm_file *);
 #else /* IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM) */
 static inline void nouveau_svm_init(struct nouveau_drm *drm) {}
 static inline void nouveau_svm_fini(struct nouveau_drm *drm) {}
@@ -37,5 +38,11 @@ static inline int nouveau_svmm_join(struct nouveau_svmm *svmm, u64 inst)
 }
 
 static inline void nouveau_svmm_part(struct nouveau_svmm *svmm, u64 inst) {}
+
+static inline int nouveau_svmm_bind(struct drm_device *device, void *p,
+				    struct drm_file *file)
+{
+	return -ENOSYS;
+}
 #endif /* IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM) */
 #endif
diff --git a/include/uapi/drm/nouveau_drm.h b/include/uapi/drm/nouveau_drm.h
index afac182c80d8..9459a6e3bc1f 100644
--- a/include/uapi/drm/nouveau_drm.h
+++ b/include/uapi/drm/nouveau_drm.h
@@ -134,6 +134,7 @@ struct drm_nouveau_gem_cpu_fini {
 #define DRM_NOUVEAU_GPUOBJ_FREE        0x06 /* deprecated */
 #define DRM_NOUVEAU_NVIF               0x07
 #define DRM_NOUVEAU_SVM_INIT           0x08
+#define DRM_NOUVEAU_SVM_BIND           0x09
 #define DRM_NOUVEAU_GEM_NEW            0x40
 #define DRM_NOUVEAU_GEM_PUSHBUF        0x41
 #define DRM_NOUVEAU_GEM_CPU_PREP       0x42
@@ -145,7 +146,49 @@ struct drm_nouveau_svm_init {
 	__u64 unmanaged_size;
 };
 
+struct drm_nouveau_svm_bind {
+	__u64 header;
+	__u64 va_start;
+	__u64 va_end;
+	__u64 npages;
+	__u64 stride;
+	__u64 result;
+	__u64 reserved0;
+	__u64 reserved1;
+};
+
+#define NOUVEAU_SVM_BIND_COMMAND_SHIFT          0
+#define NOUVEAU_SVM_BIND_COMMAND_BITS           8
+#define NOUVEAU_SVM_BIND_COMMAND_MASK           ((1 << 8) - 1)
+#define NOUVEAU_SVM_BIND_PRIORITY_SHIFT         8
+#define NOUVEAU_SVM_BIND_PRIORITY_BITS          8
+#define NOUVEAU_SVM_BIND_PRIORITY_MASK          ((1 << 8) - 1)
+#define NOUVEAU_SVM_BIND_TARGET_SHIFT           16
+#define NOUVEAU_SVM_BIND_TARGET_BITS            32
+#define NOUVEAU_SVM_BIND_TARGET_MASK            0xffffffff
+
+/*
+ * Below is use to validate ioctl argument, userspace can also use it to make
+ * sure that no bit are set beyond known fields for a given kernel version.
+ */
+#define NOUVEAU_SVM_BIND_VALID_BITS     48
+#define NOUVEAU_SVM_BIND_VALID_MASK     ((1ULL << NOUVEAU_SVM_BIND_VALID_BITS) - 1)
+
+
+/*
+ * NOUVEAU_BIND_COMMAND__MIGRATE: synchronous migrate to target memory.
+ * result: number of page successfuly migrate to the target memory.
+ */
+#define NOUVEAU_SVM_BIND_COMMAND__MIGRATE               0
+
+/*
+ * NOUVEAU_SVM_BIND_HEADER_TARGET__GPU_VRAM: target the GPU VRAM memory.
+ */
+#define NOUVEAU_SVM_BIND_TARGET__GPU_VRAM               (1UL << 31)
+
+
 #define DRM_IOCTL_NOUVEAU_SVM_INIT           DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_SVM_INIT, struct drm_nouveau_svm_init)
+#define DRM_IOCTL_NOUVEAU_SVM_BIND           DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_SVM_BIND, struct drm_nouveau_svm_bind)
 
 #define DRM_IOCTL_NOUVEAU_GEM_NEW            DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_GEM_NEW, struct drm_nouveau_gem_new)
 #define DRM_IOCTL_NOUVEAU_GEM_PUSHBUF        DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_GEM_PUSHBUF, struct drm_nouveau_gem_pushbuf)
-- 
cgit v1.2.3-59-g8ed1b


From 3856ec4b93c9463d36ee39098dde1fbbd29ec6dd Mon Sep 17 00:00:00 2001
From: Steve Wise <swise@opengridcomputing.com>
Date: Fri, 15 Feb 2019 11:03:53 -0800
Subject: RDMA/core: Add RDMA_NLDEV_CMD_NEWLINK/DELLINK support

Add support for new LINK messages to allow adding and deleting rdma
interfaces.  This will be used initially for soft rdma drivers which
instantiate device instances dynamically by the admin specifying a netdev
device to use.  The rdma_rxe module will be the first user of these
messages.

The design is modeled after RTNL_NEWLINK/DELLINK: rdma drivers register
with the rdma core if they provide link add/delete functions.  Each driver
registers with a unique "type" string, that is used to dispatch messages
coming from user space.  A new RDMA_NLDEV_ATTR is defined for the "type"
string.  User mode will pass 3 attributes in a NEWLINK message:
RDMA_NLDEV_ATTR_DEV_NAME for the desired rdma device name to be created,
RDMA_NLDEV_ATTR_LINK_TYPE for the "type" of link being added, and
RDMA_NLDEV_ATTR_NDEV_NAME for the net_device interface to use for this
link.  The DELLINK message will contain the RDMA_NLDEV_ATTR_DEV_INDEX of
the device to delete.

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/nldev.c  | 122 +++++++++++++++++++++++++++++++++++++++
 include/rdma/ib_verbs.h          |   3 +
 include/rdma/rdma_netlink.h      |  11 ++++
 include/uapi/rdma/rdma_netlink.h |  10 +++-
 4 files changed, 144 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index 1980ddc5f7bc..5e94dc87f04f 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -33,6 +33,7 @@
 #include <linux/module.h>
 #include <linux/pid.h>
 #include <linux/pid_namespace.h>
+#include <linux/mutex.h>
 #include <net/netlink.h>
 #include <rdma/rdma_cm.h>
 #include <rdma/rdma_netlink.h>
@@ -113,6 +114,8 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
 	[RDMA_NLDEV_ATTR_RES_MRN]               = { .type = NLA_U32 },
 	[RDMA_NLDEV_ATTR_RES_CM_IDN]            = { .type = NLA_U32 },
 	[RDMA_NLDEV_ATTR_RES_CTXN]              = { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_LINK_TYPE]		= { .type = NLA_NUL_STRING,
+				    .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN },
 };
 
 static int put_driver_name_print_type(struct sk_buff *msg, const char *name,
@@ -1200,6 +1203,117 @@ RES_GET_FUNCS(cq, RDMA_RESTRACK_CQ);
 RES_GET_FUNCS(pd, RDMA_RESTRACK_PD);
 RES_GET_FUNCS(mr, RDMA_RESTRACK_MR);
 
+static LIST_HEAD(link_ops);
+static DECLARE_RWSEM(link_ops_rwsem);
+
+static const struct rdma_link_ops *link_ops_get(const char *type)
+{
+	const struct rdma_link_ops *ops;
+
+	list_for_each_entry(ops, &link_ops, list) {
+		if (!strcmp(ops->type, type))
+			goto out;
+	}
+	ops = NULL;
+out:
+	return ops;
+}
+
+void rdma_link_register(struct rdma_link_ops *ops)
+{
+	down_write(&link_ops_rwsem);
+	if (link_ops_get(ops->type)) {
+		WARN_ONCE("Duplicate rdma_link_ops! %s\n", ops->type);
+		goto out;
+	}
+	list_add(&ops->list, &link_ops);
+out:
+	up_write(&link_ops_rwsem);
+}
+EXPORT_SYMBOL(rdma_link_register);
+
+void rdma_link_unregister(struct rdma_link_ops *ops)
+{
+	down_write(&link_ops_rwsem);
+	list_del(&ops->list);
+	up_write(&link_ops_rwsem);
+}
+EXPORT_SYMBOL(rdma_link_unregister);
+
+static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
+			  struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	char ibdev_name[IB_DEVICE_NAME_MAX];
+	const struct rdma_link_ops *ops;
+	char ndev_name[IFNAMSIZ];
+	struct net_device *ndev;
+	char type[IFNAMSIZ];
+	int err;
+
+	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+			  nldev_policy, extack);
+	if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] ||
+	    !tb[RDMA_NLDEV_ATTR_LINK_TYPE] || !tb[RDMA_NLDEV_ATTR_NDEV_NAME])
+		return -EINVAL;
+
+	nla_strlcpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME],
+		    sizeof(ibdev_name));
+	if (strchr(ibdev_name, '%'))
+		return -EINVAL;
+
+	nla_strlcpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type));
+	nla_strlcpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME],
+		    sizeof(ndev_name));
+
+	ndev = dev_get_by_name(&init_net, ndev_name);
+	if (!ndev)
+		return -ENODEV;
+
+	down_read(&link_ops_rwsem);
+	ops = link_ops_get(type);
+#ifdef CONFIG_MODULES
+	if (!ops) {
+		up_read(&link_ops_rwsem);
+		request_module("rdma-link-%s", type);
+		down_read(&link_ops_rwsem);
+		ops = link_ops_get(type);
+	}
+#endif
+	err = ops ? ops->newlink(ibdev_name, ndev) : -EINVAL;
+	up_read(&link_ops_rwsem);
+	dev_put(ndev);
+
+	return err;
+}
+
+static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
+			  struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	struct ib_device *device;
+	u32 index;
+	int err;
+
+	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+			  nldev_policy, extack);
+	if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
+		return -EINVAL;
+
+	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+	device = ib_device_get_by_index(index);
+	if (!device)
+		return -EINVAL;
+
+	if (!(device->attrs.device_cap_flags & IB_DEVICE_ALLOW_USER_UNREG)) {
+		ib_device_put(device);
+		return -EINVAL;
+	}
+
+	ib_unregister_device_and_put(device);
+	return 0;
+}
+
 static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
 	[RDMA_NLDEV_CMD_GET] = {
 		.doit = nldev_get_doit,
@@ -1209,6 +1323,14 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
 		.doit = nldev_set_doit,
 		.flags = RDMA_NL_ADMIN_PERM,
 	},
+	[RDMA_NLDEV_CMD_NEWLINK] = {
+		.doit = nldev_newlink,
+		.flags = RDMA_NL_ADMIN_PERM,
+	},
+	[RDMA_NLDEV_CMD_DELLINK] = {
+		.doit = nldev_dellink,
+		.flags = RDMA_NL_ADMIN_PERM,
+	},
 	[RDMA_NLDEV_CMD_PORT_GET] = {
 		.doit = nldev_port_get_doit,
 		.dump = nldev_port_get_dumpit,
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 640263289ab9..225cb76d469f 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -238,6 +238,7 @@ enum ib_device_cap_flags {
 	IB_DEVICE_RDMA_NETDEV_OPA_VNIC		= (1ULL << 35),
 	/* The device supports padding incoming writes to cacheline. */
 	IB_DEVICE_PCI_WRITE_END_PADDING		= (1ULL << 36),
+	IB_DEVICE_ALLOW_USER_UNREG		= (1ULL << 37),
 };
 
 enum ib_signature_prot_cap {
@@ -2622,6 +2623,8 @@ struct ib_device {
 	refcount_t refcount;
 	struct completion unreg_completion;
 	struct work_struct unregistration_work;
+
+	const struct rdma_link_ops *link_ops;
 };
 
 struct ib_client {
diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h
index 70218e6b5187..10732ab31ba2 100644
--- a/include/rdma/rdma_netlink.h
+++ b/include/rdma/rdma_netlink.h
@@ -99,4 +99,15 @@ int rdma_nl_multicast(struct sk_buff *skb, unsigned int group, gfp_t flags);
  * Returns true on success or false if no listeners.
  */
 bool rdma_nl_chk_listeners(unsigned int group);
+
+struct rdma_link_ops {
+	struct list_head list;
+	const char *type;
+	int (*newlink)(const char *ibdev_name, struct net_device *ndev);
+};
+
+void rdma_link_register(struct rdma_link_ops *ops);
+void rdma_link_unregister(struct rdma_link_ops *ops);
+
+#define MODULE_ALIAS_RDMA_LINK(type) MODULE_ALIAS("rdma-link-" type)
 #endif /* _RDMA_NETLINK_H */
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index 4ebbcfb2c6ef..5cc592728071 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -255,9 +255,11 @@ enum rdma_nldev_command {
 	RDMA_NLDEV_CMD_GET, /* can dump */
 	RDMA_NLDEV_CMD_SET,
 
-	/* 3 - 4 are free to use */
+	RDMA_NLDEV_CMD_NEWLINK,
 
-	RDMA_NLDEV_CMD_PORT_GET = 5, /* can dump */
+	RDMA_NLDEV_CMD_DELLINK,
+
+	RDMA_NLDEV_CMD_PORT_GET, /* can dump */
 
 	/* 6 - 8 are free to use */
 
@@ -465,6 +467,10 @@ enum rdma_nldev_attr {
 	RDMA_NLDEV_ATTR_RES_MRN,               /* u32 */
 	RDMA_NLDEV_ATTR_RES_CM_IDN,            /* u32 */
 	RDMA_NLDEV_ATTR_RES_CTXN,	       /* u32 */
+	/*
+	 * Identifies the rdma driver. eg: "rxe" or "siw"
+	 */
+	RDMA_NLDEV_ATTR_LINK_TYPE,		/* string */
 
 	/*
 	 * Always the end
-- 
cgit v1.2.3-59-g8ed1b


From fadccd8fc2d06cf7fd222245d7e04b00fae946cf Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 18 Feb 2019 09:37:13 +0100
Subject: nvme_ioctl.h: remove duplicate GPL boilerplate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We already have a ЅPDX header, so no need to duplicate the information.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
---
 include/linux/nvme.h            | 10 +---------
 include/uapi/linux/nvme_ioctl.h |  9 ---------
 2 files changed, 1 insertion(+), 18 deletions(-)

(limited to 'include/uapi')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index bbcc83886899..baa49e6a23cc 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -1,15 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Definitions for the NVM Express interface
  * Copyright (c) 2011-2014, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
  */
 
 #ifndef _LINUX_NVME_H
diff --git a/include/uapi/linux/nvme_ioctl.h b/include/uapi/linux/nvme_ioctl.h
index 6e74b1eaf541..1c215ea1798e 100644
--- a/include/uapi/linux/nvme_ioctl.h
+++ b/include/uapi/linux/nvme_ioctl.h
@@ -2,15 +2,6 @@
 /*
  * Definitions for the NVM Express ioctl interface
  * Copyright (c) 2011-2014, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
  */
 
 #ifndef _UAPI_LINUX_NVME_IOCTL_H
-- 
cgit v1.2.3-59-g8ed1b


From 61697a6abd24acba941359c6268a94f4afe4a53d Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Fri, 18 Jan 2019 14:19:26 -0500
Subject: dm: eliminate 'split_discard_bios' flag from DM target interface

There is no need to have DM core split discards on behalf of a DM target
now that blk_queue_split() handles splitting discards based on the
queue_limits.  A DM target just needs to set max_discard_sectors,
discard_granularity, etc, in queue_limits.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-target.c  |  1 -
 drivers/md/dm-raid.c          | 14 +++++++++-----
 drivers/md/dm-thin.c          |  1 -
 drivers/md/dm-zoned-target.c  |  1 -
 drivers/md/dm.c               | 25 ++++++-------------------
 include/linux/device-mapper.h |  6 ------
 include/uapi/linux/dm-ioctl.h |  4 ++--
 7 files changed, 17 insertions(+), 35 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index b29a8327eed1..adc529f12b6b 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -2496,7 +2496,6 @@ static int cache_create(struct cache_args *ca, struct cache **result)
 
 	ti->num_discard_bios = 1;
 	ti->discards_supported = true;
-	ti->split_discard_bios = false;
 
 	ti->per_io_data_size = sizeof(struct per_bio_data);
 
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index adcfe8ae10aa..9fdef6897316 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -2986,11 +2986,6 @@ static void configure_discard_support(struct raid_set *rs)
 		}
 	}
 
-	/*
-	 * RAID1 and RAID10 personalities require bio splitting,
-	 * RAID0/4/5/6 don't and process large discard bios properly.
-	 */
-	ti->split_discard_bios = !!(rs_is_raid1(rs) || rs_is_raid10(rs));
 	ti->num_discard_bios = 1;
 }
 
@@ -3747,6 +3742,15 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
 	blk_limits_io_min(limits, chunk_size);
 	blk_limits_io_opt(limits, chunk_size * mddev_data_stripes(rs));
+
+	/*
+	 * RAID1 and RAID10 personalities require bio splitting,
+	 * RAID0/4/5/6 don't and process large discard bios properly.
+	 */
+	if (rs_is_raid1(rs) || rs_is_raid10(rs)) {
+		limits->discard_granularity = chunk_size;
+		limits->max_discard_sectors = chunk_size;
+	}
 }
 
 static void raid_postsuspend(struct dm_target *ti)
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index e83b63608262..0d9ded0f5e50 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -4227,7 +4227,6 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	if (tc->pool->pf.discard_enabled) {
 		ti->discards_supported = true;
 		ti->num_discard_bios = 1;
-		ti->split_discard_bios = false;
 	}
 
 	mutex_unlock(&dm_thin_pool_table.mutex);
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index 6af5babe6837..8865c1709e16 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -727,7 +727,6 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	ti->per_io_data_size = sizeof(struct dmz_bioctx);
 	ti->flush_supported = true;
 	ti->discards_supported = true;
-	ti->split_discard_bios = true;
 
 	/* The exposed capacity is the number of chunks that can be mapped */
 	ti->len = (sector_t)dmz_nr_chunks(dmz->metadata) << dev->zone_nr_sectors_shift;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 7a774fcd0194..55f12df3589d 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1478,17 +1478,10 @@ static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
 	return ti->num_write_zeroes_bios;
 }
 
-typedef bool (*is_split_required_fn)(struct dm_target *ti);
-
-static bool is_split_required_for_discard(struct dm_target *ti)
-{
-	return ti->split_discard_bios;
-}
-
 static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
-				       unsigned num_bios, bool is_split_required)
+				       unsigned num_bios)
 {
-	unsigned len;
+	unsigned len = ci->sector_count;
 
 	/*
 	 * Even though the device advertised support for this type of
@@ -1499,11 +1492,6 @@ static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *
 	if (!num_bios)
 		return -EOPNOTSUPP;
 
-	if (!is_split_required)
-		len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
-	else
-		len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));
-
 	__send_duplicate_bios(ci, ti, num_bios, &len);
 
 	ci->sector += len;
@@ -1514,23 +1502,22 @@ static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *
 
 static int __send_discard(struct clone_info *ci, struct dm_target *ti)
 {
-	return __send_changing_extent_only(ci, ti, get_num_discard_bios(ti),
-					   is_split_required_for_discard(ti));
+	return __send_changing_extent_only(ci, ti, get_num_discard_bios(ti));
 }
 
 static int __send_secure_erase(struct clone_info *ci, struct dm_target *ti)
 {
-	return __send_changing_extent_only(ci, ti, get_num_secure_erase_bios(ti), false);
+	return __send_changing_extent_only(ci, ti, get_num_secure_erase_bios(ti));
 }
 
 static int __send_write_same(struct clone_info *ci, struct dm_target *ti)
 {
-	return __send_changing_extent_only(ci, ti, get_num_write_same_bios(ti), false);
+	return __send_changing_extent_only(ci, ti, get_num_write_same_bios(ti));
 }
 
 static int __send_write_zeroes(struct clone_info *ci, struct dm_target *ti)
 {
-	return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios(ti), false);
+	return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios(ti));
 }
 
 static bool is_abnormal_io(struct bio *bio)
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index e528baebad69..0f5b3d7c6cb3 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -315,12 +315,6 @@ struct dm_target {
 	 * whether or not its underlying devices have support.
 	 */
 	bool discards_supported:1;
-
-	/*
-	 * Set if the target required discard bios to be split
-	 * on max_io_len boundary.
-	 */
-	bool split_discard_bios:1;
 };
 
 /* Each target can link one of these into the table */
diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h
index d1e49514977b..f396a82dfd3e 100644
--- a/include/uapi/linux/dm-ioctl.h
+++ b/include/uapi/linux/dm-ioctl.h
@@ -270,9 +270,9 @@ enum {
 #define DM_DEV_SET_GEOMETRY	_IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
 
 #define DM_VERSION_MAJOR	4
-#define DM_VERSION_MINOR	39
+#define DM_VERSION_MINOR	40
 #define DM_VERSION_PATCHLEVEL	0
-#define DM_VERSION_EXTRA	"-ioctl (2018-04-03)"
+#define DM_VERSION_EXTRA	"-ioctl (2019-01-18)"
 
 /* Status bits */
 #define DM_READONLY_FLAG	(1 << 0) /* In/Out */
-- 
cgit v1.2.3-59-g8ed1b


From 54719527fd06e80fce52b98537414035cd21e8d4 Mon Sep 17 00:00:00 2001
From: Aya Levin <ayal@mellanox.com>
Date: Thu, 21 Feb 2019 14:12:01 +0200
Subject: devlink: Rename devlink health attributes

Rename devlink health attributes for better reflect the attributes use.
Add COUNT prefix on error counter attribute and recovery counter
attribute.

Fixes: 7afe335a8bed ("devlink: Add health get command")
Signed-off-by: Aya Levin <ayal@mellanox.com>
Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/devlink.h | 4 ++--
 net/core/devlink.c           | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 53de8802a000..5bb4ea67d84f 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -323,8 +323,8 @@ enum devlink_attr {
 	DEVLINK_ATTR_HEALTH_REPORTER,			/* nested */
 	DEVLINK_ATTR_HEALTH_REPORTER_NAME,		/* string */
 	DEVLINK_ATTR_HEALTH_REPORTER_STATE,		/* u8 */
-	DEVLINK_ATTR_HEALTH_REPORTER_ERR,		/* u64 */
-	DEVLINK_ATTR_HEALTH_REPORTER_RECOVER,		/* u64 */
+	DEVLINK_ATTR_HEALTH_REPORTER_ERR_COUNT,		/* u64 */
+	DEVLINK_ATTR_HEALTH_REPORTER_RECOVER_COUNT,	/* u64 */
 	DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS,		/* u64 */
 	DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD,	/* u64 */
 	DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER,	/* u8 */
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 04d98550c78c..5135997ecbe7 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -4650,10 +4650,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg,
 	if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_STATE,
 		       reporter->health_state))
 		goto reporter_nest_cancel;
-	if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_ERR,
+	if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_ERR_COUNT,
 			      reporter->error_count, DEVLINK_ATTR_PAD))
 		goto reporter_nest_cancel;
-	if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_RECOVER,
+	if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_RECOVER_COUNT,
 			      reporter->recovery_count, DEVLINK_ATTR_PAD))
 		goto reporter_nest_cancel;
 	if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD,
-- 
cgit v1.2.3-59-g8ed1b


From ca8d4794f669e721fb5198f6d142e42dd8080239 Mon Sep 17 00:00:00 2001
From: Callum Sinclair <callum.sinclair@alliedtelesis.co.nz>
Date: Mon, 18 Feb 2019 10:07:52 +1300
Subject: ipmr: ip6mr: Create new sockopt to clear mfc cache or vifs

Currently the only way to clear the forwarding cache was to delete the
entries one by one using the MRT_DEL_MFC socket option or to destroy and
recreate the socket.

Create a new socket option which with the use of optional flags can
clear any combination of multicast entries (static or not static) and
multicast vifs (static or not static).

Calling the new socket option MRT_FLUSH with the flags MRT_FLUSH_MFC and
MRT_FLUSH_VIFS will clear all entries and vifs on the socket except for
static entries.

Signed-off-by: Callum Sinclair <callum.sinclair@alliedtelesis.co.nz>
Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/mroute.h  |  9 ++++-
 include/uapi/linux/mroute6.h |  9 ++++-
 net/ipv4/ipmr.c              | 75 +++++++++++++++++++++++++++---------------
 net/ipv6/ip6mr.c             | 78 +++++++++++++++++++++++++++++---------------
 4 files changed, 115 insertions(+), 56 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/mroute.h b/include/uapi/linux/mroute.h
index 5d37a9ccce63..11c8c1fc1124 100644
--- a/include/uapi/linux/mroute.h
+++ b/include/uapi/linux/mroute.h
@@ -28,12 +28,19 @@
 #define MRT_TABLE	(MRT_BASE+9)	/* Specify mroute table ID		*/
 #define MRT_ADD_MFC_PROXY	(MRT_BASE+10)	/* Add a (*,*|G) mfc entry	*/
 #define MRT_DEL_MFC_PROXY	(MRT_BASE+11)	/* Del a (*,*|G) mfc entry	*/
-#define MRT_MAX		(MRT_BASE+11)
+#define MRT_FLUSH	(MRT_BASE+12)	/* Flush all mfc entries and/or vifs	*/
+#define MRT_MAX		(MRT_BASE+12)
 
 #define SIOCGETVIFCNT	SIOCPROTOPRIVATE	/* IP protocol privates */
 #define SIOCGETSGCNT	(SIOCPROTOPRIVATE+1)
 #define SIOCGETRPF	(SIOCPROTOPRIVATE+2)
 
+/* MRT_FLUSH optional flags */
+#define MRT_FLUSH_MFC	1	/* Flush multicast entries */
+#define MRT_FLUSH_MFC_STATIC	2	/* Flush static multicast entries */
+#define MRT_FLUSH_VIFS	4	/* Flush multicast vifs */
+#define MRT_FLUSH_VIFS_STATIC	8	/* Flush static multicast vifs */
+
 #define MAXVIFS		32
 typedef unsigned long vifbitmap_t;	/* User mode code depends on this lot */
 typedef unsigned short vifi_t;
diff --git a/include/uapi/linux/mroute6.h b/include/uapi/linux/mroute6.h
index 9999cc006390..c36177a86516 100644
--- a/include/uapi/linux/mroute6.h
+++ b/include/uapi/linux/mroute6.h
@@ -31,12 +31,19 @@
 #define MRT6_TABLE	(MRT6_BASE+9)	/* Specify mroute table ID		*/
 #define MRT6_ADD_MFC_PROXY	(MRT6_BASE+10)	/* Add a (*,*|G) mfc entry	*/
 #define MRT6_DEL_MFC_PROXY	(MRT6_BASE+11)	/* Del a (*,*|G) mfc entry	*/
-#define MRT6_MAX	(MRT6_BASE+11)
+#define MRT6_FLUSH	(MRT6_BASE+12)	/* Flush all mfc entries and/or vifs	*/
+#define MRT6_MAX	(MRT6_BASE+12)
 
 #define SIOCGETMIFCNT_IN6	SIOCPROTOPRIVATE	/* IP protocol privates */
 #define SIOCGETSGCNT_IN6	(SIOCPROTOPRIVATE+1)
 #define SIOCGETRPF	(SIOCPROTOPRIVATE+2)
 
+/* MRT6_FLUSH optional flags */
+#define MRT6_FLUSH_MFC	1	/* Flush multicast entries */
+#define MRT6_FLUSH_MFC_STATIC	2	/* Flush static multicast entries */
+#define MRT6_FLUSH_MIFS	4	/* Flushing multicast vifs */
+#define MRT6_FLUSH_MIFS_STATIC	8	/* Flush static multicast vifs */
+
 #define MAXMIFS		32
 typedef unsigned long mifbitmap_t;	/* User mode code depends on this lot */
 typedef unsigned short mifi_t;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index e536970557dd..2c931120c494 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -110,7 +110,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
 static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
 				 int cmd);
 static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt);
-static void mroute_clean_tables(struct mr_table *mrt, bool all);
+static void mroute_clean_tables(struct mr_table *mrt, int flags);
 static void ipmr_expire_process(struct timer_list *t);
 
 #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
@@ -415,7 +415,8 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
 static void ipmr_free_table(struct mr_table *mrt)
 {
 	del_timer_sync(&mrt->ipmr_expire_timer);
-	mroute_clean_tables(mrt, true);
+	mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC |
+				 MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC);
 	rhltable_destroy(&mrt->mfc_hash);
 	kfree(mrt);
 }
@@ -1296,7 +1297,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
 }
 
 /* Close the multicast socket, and clear the vif tables etc */
-static void mroute_clean_tables(struct mr_table *mrt, bool all)
+static void mroute_clean_tables(struct mr_table *mrt, int flags)
 {
 	struct net *net = read_pnet(&mrt->net);
 	struct mr_mfc *c, *tmp;
@@ -1305,35 +1306,44 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all)
 	int i;
 
 	/* Shut down all active vif entries */
-	for (i = 0; i < mrt->maxvif; i++) {
-		if (!all && (mrt->vif_table[i].flags & VIFF_STATIC))
-			continue;
-		vif_delete(mrt, i, 0, &list);
+	if (flags & (MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC)) {
+		for (i = 0; i < mrt->maxvif; i++) {
+			if (((mrt->vif_table[i].flags & VIFF_STATIC) &&
+			     !(flags & MRT_FLUSH_VIFS_STATIC)) ||
+			    (!(mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT_FLUSH_VIFS)))
+				continue;
+			vif_delete(mrt, i, 0, &list);
+		}
+		unregister_netdevice_many(&list);
 	}
-	unregister_netdevice_many(&list);
 
 	/* Wipe the cache */
-	list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) {
-		if (!all && (c->mfc_flags & MFC_STATIC))
-			continue;
-		rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
-		list_del_rcu(&c->list);
-		cache = (struct mfc_cache *)c;
-		call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, cache,
-					      mrt->id);
-		mroute_netlink_event(mrt, cache, RTM_DELROUTE);
-		mr_cache_put(c);
-	}
-
-	if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
-		spin_lock_bh(&mfc_unres_lock);
-		list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
-			list_del(&c->list);
+	if (flags & (MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC)) {
+		list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) {
+			if (((c->mfc_flags & MFC_STATIC) && !(flags & MRT_FLUSH_MFC_STATIC)) ||
+			    (!(c->mfc_flags & MFC_STATIC) && !(flags & MRT_FLUSH_MFC)))
+				continue;
+			rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
+			list_del_rcu(&c->list);
 			cache = (struct mfc_cache *)c;
+			call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, cache,
+						      mrt->id);
 			mroute_netlink_event(mrt, cache, RTM_DELROUTE);
-			ipmr_destroy_unres(mrt, cache);
+			mr_cache_put(c);
+		}
+	}
+
+	if (flags & MRT_FLUSH_MFC) {
+		if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
+			spin_lock_bh(&mfc_unres_lock);
+			list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
+				list_del(&c->list);
+				cache = (struct mfc_cache *)c;
+				mroute_netlink_event(mrt, cache, RTM_DELROUTE);
+				ipmr_destroy_unres(mrt, cache);
+			}
+			spin_unlock_bh(&mfc_unres_lock);
 		}
-		spin_unlock_bh(&mfc_unres_lock);
 	}
 }
 
@@ -1354,7 +1364,7 @@ static void mrtsock_destruct(struct sock *sk)
 						    NETCONFA_IFINDEX_ALL,
 						    net->ipv4.devconf_all);
 			RCU_INIT_POINTER(mrt->mroute_sk, NULL);
-			mroute_clean_tables(mrt, false);
+			mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_MFC);
 		}
 	}
 	rtnl_unlock();
@@ -1479,6 +1489,17 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
 					   sk == rtnl_dereference(mrt->mroute_sk),
 					   parent);
 		break;
+	case MRT_FLUSH:
+		if (optlen != sizeof(val)) {
+			ret = -EINVAL;
+			break;
+		}
+		if (get_user(val, (int __user *)optval)) {
+			ret = -EFAULT;
+			break;
+		}
+		mroute_clean_tables(mrt, val);
+		break;
 	/* Control PIM assert. */
 	case MRT_ASSERT:
 		if (optlen != sizeof(val)) {
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index cc01aa3f2b5e..3594f1d9c68c 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -97,7 +97,7 @@ static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc,
 static void mrt6msg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt);
 static int ip6mr_rtm_dumproute(struct sk_buff *skb,
 			       struct netlink_callback *cb);
-static void mroute_clean_tables(struct mr_table *mrt, bool all);
+static void mroute_clean_tables(struct mr_table *mrt, int flags);
 static void ipmr_expire_process(struct timer_list *t);
 
 #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
@@ -393,7 +393,8 @@ static struct mr_table *ip6mr_new_table(struct net *net, u32 id)
 static void ip6mr_free_table(struct mr_table *mrt)
 {
 	del_timer_sync(&mrt->ipmr_expire_timer);
-	mroute_clean_tables(mrt, true);
+	mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC |
+				 MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC);
 	rhltable_destroy(&mrt->mfc_hash);
 	kfree(mrt);
 }
@@ -1496,42 +1497,51 @@ static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt,
  *	Close the multicast socket, and clear the vif tables etc
  */
 
-static void mroute_clean_tables(struct mr_table *mrt, bool all)
+static void mroute_clean_tables(struct mr_table *mrt, int flags)
 {
 	struct mr_mfc *c, *tmp;
 	LIST_HEAD(list);
 	int i;
 
 	/* Shut down all active vif entries */
-	for (i = 0; i < mrt->maxvif; i++) {
-		if (!all && (mrt->vif_table[i].flags & VIFF_STATIC))
-			continue;
-		mif6_delete(mrt, i, 0, &list);
+	if (flags & (MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC)) {
+		for (i = 0; i < mrt->maxvif; i++) {
+			if (((mrt->vif_table[i].flags & VIFF_STATIC) &&
+			     !(flags & MRT6_FLUSH_MIFS_STATIC)) ||
+			    (!(mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT6_FLUSH_MIFS)))
+				continue;
+			mif6_delete(mrt, i, 0, &list);
+		}
+		unregister_netdevice_many(&list);
 	}
-	unregister_netdevice_many(&list);
 
 	/* Wipe the cache */
-	list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) {
-		if (!all && (c->mfc_flags & MFC_STATIC))
-			continue;
-		rhltable_remove(&mrt->mfc_hash, &c->mnode, ip6mr_rht_params);
-		list_del_rcu(&c->list);
-		call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net),
-					       FIB_EVENT_ENTRY_DEL,
-					       (struct mfc6_cache *)c, mrt->id);
-		mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE);
-		mr_cache_put(c);
+	if (flags & (MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC)) {
+		list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) {
+			if (((c->mfc_flags & MFC_STATIC) && !(flags & MRT6_FLUSH_MFC_STATIC)) ||
+			    (!(c->mfc_flags & MFC_STATIC) && !(flags & MRT6_FLUSH_MFC)))
+				continue;
+			rhltable_remove(&mrt->mfc_hash, &c->mnode, ip6mr_rht_params);
+			list_del_rcu(&c->list);
+			call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net),
+						       FIB_EVENT_ENTRY_DEL,
+						       (struct mfc6_cache *)c, mrt->id);
+			mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE);
+			mr_cache_put(c);
+		}
 	}
 
-	if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
-		spin_lock_bh(&mfc_unres_lock);
-		list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
-			list_del(&c->list);
-			mr6_netlink_event(mrt, (struct mfc6_cache *)c,
-					  RTM_DELROUTE);
-			ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c);
+	if (flags & MRT6_FLUSH_MFC) {
+		if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
+			spin_lock_bh(&mfc_unres_lock);
+			list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
+				list_del(&c->list);
+				mr6_netlink_event(mrt, (struct mfc6_cache *)c,
+						  RTM_DELROUTE);
+				ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c);
+			}
+			spin_unlock_bh(&mfc_unres_lock);
 		}
-		spin_unlock_bh(&mfc_unres_lock);
 	}
 }
 
@@ -1587,7 +1597,7 @@ int ip6mr_sk_done(struct sock *sk)
 						     NETCONFA_IFINDEX_ALL,
 						     net->ipv6.devconf_all);
 
-			mroute_clean_tables(mrt, false);
+			mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MFC);
 			err = 0;
 			break;
 		}
@@ -1703,6 +1713,20 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
 		rtnl_unlock();
 		return ret;
 
+	case MRT6_FLUSH:
+	{
+		int flags;
+
+		if (optlen != sizeof(flags))
+			return -EINVAL;
+		if (get_user(flags, (int __user *)optval))
+			return -EFAULT;
+		rtnl_lock();
+		mroute_clean_tables(mrt, flags);
+		rtnl_unlock();
+		return 0;
+	}
+
 	/*
 	 *	Control PIM assert (to activate pim will activate assert)
 	 */
-- 
cgit v1.2.3-59-g8ed1b


From 663586c0a8929db81e617c775823efb9d65f2bc2 Mon Sep 17 00:00:00 2001
From: Richard Weinberger <richard@nod.at>
Date: Wed, 7 Nov 2018 23:16:19 +0100
Subject: ubi: Expose the bitrot interface

Using UBI_IOCRPEB and UBI_IOCSPEB userspace can force
reading and scrubbing of PEBs.

In case of bitflips UBI will automatically take action
and move data to a different PEB.
This interface allows a daemon to foster your NAND.

Signed-off-by: Richard Weinberger <richard@nod.at>
---
 drivers/mtd/ubi/cdev.c      |  30 +++++++++
 drivers/mtd/ubi/ubi.h       |   1 +
 drivers/mtd/ubi/wl.c        | 144 ++++++++++++++++++++++++++++++++++++++++++++
 include/uapi/mtd/ubi-user.h |   5 ++
 4 files changed, 180 insertions(+)

(limited to 'include/uapi')

diff --git a/drivers/mtd/ubi/cdev.c b/drivers/mtd/ubi/cdev.c
index 22547d7a84ea..947a8adbc799 100644
--- a/drivers/mtd/ubi/cdev.c
+++ b/drivers/mtd/ubi/cdev.c
@@ -974,6 +974,36 @@ static long ubi_cdev_ioctl(struct file *file, unsigned int cmd,
 		break;
 	}
 
+	/* Check a specific PEB for bitflips and scrub it if needed */
+	case UBI_IOCRPEB:
+	{
+		int pnum;
+
+		err = get_user(pnum, (__user int32_t *)argp);
+		if (err) {
+			err = -EFAULT;
+			break;
+		}
+
+		err = ubi_bitflip_check(ubi, pnum, 0);
+		break;
+	}
+
+	/* Force scrubbing for a specific PEB */
+	case UBI_IOCSPEB:
+	{
+		int pnum;
+
+		err = get_user(pnum, (__user int32_t *)argp);
+		if (err) {
+			err = -EFAULT;
+			break;
+		}
+
+		err = ubi_bitflip_check(ubi, pnum, 1);
+		break;
+	}
+
 	default:
 		err = -ENOTTY;
 		break;
diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h
index d47b9e436e67..a1b9e764d489 100644
--- a/drivers/mtd/ubi/ubi.h
+++ b/drivers/mtd/ubi/ubi.h
@@ -929,6 +929,7 @@ int ubi_wl_put_fm_peb(struct ubi_device *ubi, struct ubi_wl_entry *used_e,
 int ubi_is_erase_work(struct ubi_work *wrk);
 void ubi_refill_pools(struct ubi_device *ubi);
 int ubi_ensure_anchor_pebs(struct ubi_device *ubi);
+int ubi_bitflip_check(struct ubi_device *ubi, int pnum, int force_scrub);
 
 /* io.c */
 int ubi_io_read(const struct ubi_device *ubi, void *buf, int pnum, int offset,
diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c
index ca1b31385eb5..40f838d54b0f 100644
--- a/drivers/mtd/ubi/wl.c
+++ b/drivers/mtd/ubi/wl.c
@@ -1440,6 +1440,150 @@ int ubi_wl_flush(struct ubi_device *ubi, int vol_id, int lnum)
 	return err;
 }
 
+static bool scrub_possible(struct ubi_device *ubi, struct ubi_wl_entry *e)
+{
+	if (in_wl_tree(e, &ubi->scrub))
+		return false;
+	else if (in_wl_tree(e, &ubi->erroneous))
+		return false;
+	else if (ubi->move_from == e)
+		return false;
+	else if (ubi->move_to == e)
+		return false;
+
+	return true;
+}
+
+/**
+ * ubi_bitflip_check - Check an eraseblock for bitflips and scrub it if needed.
+ * @ubi: UBI device description object
+ * @pnum: the physical eraseblock to schedule
+ * @force: dont't read the block, assume bitflips happened and take action.
+ *
+ * This function reads the given eraseblock and checks if bitflips occured.
+ * In case of bitflips, the eraseblock is scheduled for scrubbing.
+ * If scrubbing is forced with @force, the eraseblock is not read,
+ * but scheduled for scrubbing right away.
+ *
+ * Returns:
+ * %EINVAL, PEB is out of range
+ * %ENOENT, PEB is no longer used by UBI
+ * %EBUSY, PEB cannot be checked now or a check is currently running on it
+ * %EAGAIN, bit flips happened but scrubbing is currently not possible
+ * %EUCLEAN, bit flips happened and PEB is scheduled for scrubbing
+ * %0, no bit flips detected
+ */
+int ubi_bitflip_check(struct ubi_device *ubi, int pnum, int force)
+{
+	int err;
+	struct ubi_wl_entry *e;
+
+	if (pnum < 0 || pnum >= ubi->peb_count) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	/*
+	 * Pause all parallel work, otherwise it can happen that the
+	 * erase worker frees a wl entry under us.
+	 */
+	down_write(&ubi->work_sem);
+
+	/*
+	 * Make sure that the wl entry does not change state while
+	 * inspecting it.
+	 */
+	spin_lock(&ubi->wl_lock);
+	e = ubi->lookuptbl[pnum];
+	if (!e) {
+		spin_unlock(&ubi->wl_lock);
+		err = -ENOENT;
+		goto out_resume;
+	}
+
+	/*
+	 * Does it make sense to check this PEB?
+	 */
+	if (!scrub_possible(ubi, e)) {
+		spin_unlock(&ubi->wl_lock);
+		err = -EBUSY;
+		goto out_resume;
+	}
+	spin_unlock(&ubi->wl_lock);
+
+	if (!force) {
+		mutex_lock(&ubi->buf_mutex);
+		err = ubi_io_read(ubi, ubi->peb_buf, pnum, 0, ubi->peb_size);
+		mutex_unlock(&ubi->buf_mutex);
+	}
+
+	if (err == UBI_IO_BITFLIPS || force) {
+		/*
+		 * Okay, bit flip happened, let's figure out what we can do.
+		 */
+		spin_lock(&ubi->wl_lock);
+
+		/*
+		 * Recheck. We released wl_lock, UBI might have killed the
+		 * wl entry under us.
+		 */
+		e = ubi->lookuptbl[pnum];
+		if (!e) {
+			spin_unlock(&ubi->wl_lock);
+			err = -ENOENT;
+			goto out_resume;
+		}
+
+		/*
+		 * Need to re-check state
+		 */
+		if (!scrub_possible(ubi, e)) {
+			spin_unlock(&ubi->wl_lock);
+			err = -EBUSY;
+			goto out_resume;
+		}
+
+		if (in_pq(ubi, e)) {
+			prot_queue_del(ubi, e->pnum);
+			wl_tree_add(e, &ubi->scrub);
+			spin_unlock(&ubi->wl_lock);
+
+			err = ensure_wear_leveling(ubi, 1);
+		} else if (in_wl_tree(e, &ubi->used)) {
+			rb_erase(&e->u.rb, &ubi->used);
+			wl_tree_add(e, &ubi->scrub);
+			spin_unlock(&ubi->wl_lock);
+
+			err = ensure_wear_leveling(ubi, 1);
+		} else if (in_wl_tree(e, &ubi->free)) {
+			rb_erase(&e->u.rb, &ubi->free);
+			ubi->free_count--;
+			spin_unlock(&ubi->wl_lock);
+
+			/*
+			 * This PEB is empty we can schedule it for
+			 * erasure right away. No wear leveling needed.
+			 */
+			err = schedule_erase(ubi, e, UBI_UNKNOWN, UBI_UNKNOWN,
+					     force ? 0 : 1, true);
+		} else {
+			spin_unlock(&ubi->wl_lock);
+			err = -EAGAIN;
+		}
+
+		if (!err && !force)
+			err = -EUCLEAN;
+	} else {
+		err = 0;
+	}
+
+out_resume:
+	up_write(&ubi->work_sem);
+out:
+
+	return err;
+}
+
 /**
  * tree_destroy - destroy an RB-tree.
  * @ubi: UBI device description object
diff --git a/include/uapi/mtd/ubi-user.h b/include/uapi/mtd/ubi-user.h
index aad3b6201fc0..b69e9ba6742b 100644
--- a/include/uapi/mtd/ubi-user.h
+++ b/include/uapi/mtd/ubi-user.h
@@ -171,6 +171,11 @@
 /* Re-name volumes */
 #define UBI_IOCRNVOL _IOW(UBI_IOC_MAGIC, 3, struct ubi_rnvol_req)
 
+/* Read the specified PEB and scrub it if there are bitflips */
+#define UBI_IOCRPEB _IOW(UBI_IOC_MAGIC, 4, __s32)
+/* Force scrubbing on the specified PEB */
+#define UBI_IOCSPEB _IOW(UBI_IOC_MAGIC, 5, __s32)
+
 /* ioctl commands of the UBI control character device */
 
 #define UBI_CTRL_IOC_MAGIC 'o'
-- 
cgit v1.2.3-59-g8ed1b


From e728fdf0628971d43cb4e48860defc6e8a553761 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Fri, 22 Feb 2019 19:25:59 +0100
Subject: net: phy: improve definition of __ETHTOOL_LINK_MODE_MASK_NBITS

The way to define __ETHTOOL_LINK_MODE_MASK_NBITS seems to be overly
complicated, go with a standard approach instead.
Whilst we're at it, move the comment to the right place.

v2:
- rebased

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h      |  4 ----
 include/uapi/linux/ethtool.h | 17 +++++++++--------
 2 files changed, 9 insertions(+), 12 deletions(-)

(limited to 'include/uapi')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 19a8de5326fb..e6ebc9761822 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -98,10 +98,6 @@ static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings)
 	return index % n_rx_rings;
 }
 
-/* number of link mode bits/ulongs handled internally by kernel */
-#define __ETHTOOL_LINK_MODE_MASK_NBITS			\
-	(__ETHTOOL_LINK_MODE_LAST + 1)
-
 /* declare a link mode bitmap */
 #define __ETHTOOL_DECLARE_LINK_MODE_MASK(name)		\
 	DECLARE_BITMAP(name, __ETHTOOL_LINK_MODE_MASK_NBITS)
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 378c52308d89..3652b239dad1 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1432,6 +1432,13 @@ enum ethtool_link_mode_bit_indices {
 	ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT	= 29,
 	ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT	= 30,
 	ETHTOOL_LINK_MODE_25000baseCR_Full_BIT	= 31,
+
+	/* Last allowed bit for __ETHTOOL_LINK_MODE_LEGACY_MASK is bit
+	 * 31. Please do NOT define any SUPPORTED_* or ADVERTISED_*
+	 * macro for bits > 31. The only way to use indices > 31 is to
+	 * use the new ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API.
+	 */
+
 	ETHTOOL_LINK_MODE_25000baseKR_Full_BIT	= 32,
 	ETHTOOL_LINK_MODE_25000baseSR_Full_BIT	= 33,
 	ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT	= 34,
@@ -1469,14 +1476,8 @@ enum ethtool_link_mode_bit_indices {
 	ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT	 = 65,
 	ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT	 = 66,
 
-	/* Last allowed bit for __ETHTOOL_LINK_MODE_LEGACY_MASK is bit
-	 * 31. Please do NOT define any SUPPORTED_* or ADVERTISED_*
-	 * macro for bits > 31. The only way to use indices > 31 is to
-	 * use the new ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API.
-	 */
-
-	__ETHTOOL_LINK_MODE_LAST
-	  = ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT,
+	/* must be last entry */
+	__ETHTOOL_LINK_MODE_MASK_NBITS
 };
 
 #define __ETHTOOL_LINK_MODE_LEGACY_MASK(base_name)	\
-- 
cgit v1.2.3-59-g8ed1b


From 228a73abde5c04428678e917b271f8526cfd90ed Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Fri, 4 Jan 2019 13:31:54 +0800
Subject: btrfs: introduce new ioctl to unregister a btrfs device

Support for a new command that can be used eg. as a command

  $ btrfs device scan --forget [dev]'
(the final name may change though)

to undo the effects of 'btrfs device scan [dev]'. For this purpose
this patch proposes to use ioctl #5 as it was empty and is next to the
SCAN ioctl.

The new ioctl BTRFS_IOC_FORGET_DEV works only on the control device
(/dev/btrfs-control) to unregister one or all devices, devices that are
not mounted.

The argument is struct btrfs_ioctl_vol_args, ::name specifies the device
path. To unregister all device, the path is an empty string.

Again, the devices are removed only if they aren't part of a mounte
filesystem.

This new ioctl provides:

- release of unwanted btrfs_fs_devices and btrfs_devices structures
  from memory if the device is not going to be mounted

- ability to mount filesystem in degraded mode, when one devices is
  corrupted like in split brain raid1

- running test cases which would require reloading the kernel module
  but this is not possible eg. due to mounted filesystem or built-in

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ update changelog ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/super.c           |  3 +++
 fs/btrfs/volumes.c         | 11 +++++++++++
 fs/btrfs/volumes.h         |  1 +
 include/uapi/linux/btrfs.h |  2 ++
 4 files changed, 17 insertions(+)

(limited to 'include/uapi')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0a3f122dd61f..f9d13a30aa8a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2190,6 +2190,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 		ret = PTR_ERR_OR_ZERO(device);
 		mutex_unlock(&uuid_mutex);
 		break;
+	case BTRFS_IOC_FORGET_DEV:
+		ret = btrfs_forget_devices(vol->name);
+		break;
 	case BTRFS_IOC_DEVICES_READY:
 		mutex_lock(&uuid_mutex);
 		device = btrfs_scan_one_device(vol->name, FMODE_READ,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 7ca31c83e730..fe122e6099ae 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1446,6 +1446,17 @@ static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
 	return 0;
 }
 
+int btrfs_forget_devices(const char *path)
+{
+	int ret;
+
+	mutex_lock(&uuid_mutex);
+	ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
+	mutex_unlock(&uuid_mutex);
+
+	return ret;
+}
+
 /*
  * Look for a btrfs signature on a device. This may be called out of the mount path
  * and we are not allowed to call set_blocksize during the scan. The superblock
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 656ea8d85770..3ad9d58d1b66 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -416,6 +416,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		       fmode_t flags, void *holder);
 struct btrfs_device *btrfs_scan_one_device(const char *path,
 					   fmode_t flags, void *holder);
+int btrfs_forget_devices(const char *path);
 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
 void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step);
 void btrfs_assign_next_active_device(struct btrfs_device *device,
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index e0763bc4158e..c195896d478f 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -837,6 +837,8 @@ enum btrfs_err_code {
 				   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
 				   struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_FORGET_DEV _IOW(BTRFS_IOCTL_MAGIC, 5, \
+				   struct btrfs_ioctl_vol_args)
 /* trans start and trans end are dangerous, and only for
  * use by applications that know how to avoid the
  * resulting deadlocks
-- 
cgit v1.2.3-59-g8ed1b


From 3f7ae5f3dc5295ac17d6521130ed8a8f8a723fbf Mon Sep 17 00:00:00 2001
From: "Mohit P. Tahiliani" <tahiliani@nitk.edu.in>
Date: Tue, 26 Feb 2019 00:39:59 +0530
Subject: net: sched: pie: add more cases to auto-tune alpha and beta

The current implementation scales the local alpha and beta
variables in the calculate_probability function by the same
amount for all values of drop probability below 1%.

RFC 8033 suggests using additional cases for auto-tuning
alpha and beta when the drop probability is less than 1%.

In order to add more auto-tuning cases, MAX_PROB must be
scaled by u64 instead of u32 to prevent underflow when
scaling the local alpha and beta variables in the
calculate_probability function.

Signed-off-by: Mohit P. Tahiliani <tahiliani@nitk.edu.in>
Signed-off-by: Dhaval Khandla <dhavaljkhandla26@gmail.com>
Signed-off-by: Hrishikesh Hiraskar <hrishihiraskar@gmail.com>
Signed-off-by: Manish Kumar B <bmanish15597@gmail.com>
Signed-off-by: Sachin D. Patil <sdp.sachin@gmail.com>
Signed-off-by: Leslie Monis <lesliemonis@gmail.com>
Acked-by: Dave Taht <dave.taht@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h |  2 +-
 net/sched/sch_pie.c            | 65 +++++++++++++++++++++---------------------
 2 files changed, 33 insertions(+), 34 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 0d18b1d1fbbc..1eb572ef3f27 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -954,7 +954,7 @@ enum {
 #define TCA_PIE_MAX   (__TCA_PIE_MAX - 1)
 
 struct tc_pie_xstats {
-	__u32 prob;             /* current probability */
+	__u64 prob;             /* current probability */
 	__u32 delay;            /* current delay in ms */
 	__u32 avg_dq_rate;      /* current average dq_rate in bits/pie_time */
 	__u32 packets_in;       /* total number of packets enqueued */
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index d88ab53593b3..30f158582499 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -33,7 +33,7 @@
 
 #define QUEUE_THRESHOLD 16384
 #define DQCOUNT_INVALID -1
-#define MAX_PROB  0xffffffff
+#define MAX_PROB 0xffffffffffffffff
 #define PIE_SCALE 8
 
 /* parameters used */
@@ -49,7 +49,7 @@ struct pie_params {
 
 /* variables used */
 struct pie_vars {
-	u32 prob;		/* probability but scaled by u32 limit. */
+	u64 prob;		/* probability but scaled by u64 limit. */
 	psched_time_t burst_time;
 	psched_time_t qdelay;
 	psched_time_t qdelay_old;
@@ -99,8 +99,8 @@ static void pie_vars_init(struct pie_vars *vars)
 static bool drop_early(struct Qdisc *sch, u32 packet_size)
 {
 	struct pie_sched_data *q = qdisc_priv(sch);
-	u32 rnd;
-	u32 local_prob = q->vars.prob;
+	u64 rnd;
+	u64 local_prob = q->vars.prob;
 	u32 mtu = psched_mtu(qdisc_dev(sch));
 
 	/* If there is still burst allowance left skip random early drop */
@@ -124,11 +124,11 @@ static bool drop_early(struct Qdisc *sch, u32 packet_size)
 	 * probablity. Smaller packets will have lower drop prob in this case
 	 */
 	if (q->params.bytemode && packet_size <= mtu)
-		local_prob = (local_prob / mtu) * packet_size;
+		local_prob = (u64)packet_size * div_u64(local_prob, mtu);
 	else
 		local_prob = q->vars.prob;
 
-	rnd = prandom_u32();
+	prandom_bytes(&rnd, 8);
 	if (rnd < local_prob)
 		return true;
 
@@ -317,9 +317,10 @@ static void calculate_probability(struct Qdisc *sch)
 	u32 qlen = sch->qstats.backlog;	/* queue size in bytes */
 	psched_time_t qdelay = 0;	/* in pschedtime */
 	psched_time_t qdelay_old = q->vars.qdelay;	/* in pschedtime */
-	s32 delta = 0;		/* determines the change in probability */
-	u32 oldprob;
-	u32 alpha, beta;
+	s64 delta = 0;		/* determines the change in probability */
+	u64 oldprob;
+	u64 alpha, beta;
+	u32 power;
 	bool update_prob = true;
 
 	q->vars.qdelay_old = q->vars.qdelay;
@@ -339,38 +340,36 @@ static void calculate_probability(struct Qdisc *sch)
 	 * value for alpha as 0.125. In this implementation, we use values 0-32
 	 * passed from user space to represent this. Also, alpha and beta have
 	 * unit of HZ and need to be scaled before they can used to update
-	 * probability. alpha/beta are updated locally below by 1) scaling them
-	 * appropriately 2) scaling down by 16 to come to 0-2 range.
-	 * Please see paper for details.
-	 *
-	 * We scale alpha and beta differently depending on whether we are in
-	 * light, medium or high dropping mode.
+	 * probability. alpha/beta are updated locally below by scaling down
+	 * by 16 to come to 0-2 range.
 	 */
-	if (q->vars.prob < MAX_PROB / 100) {
-		alpha =
-		    (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7;
-		beta =
-		    (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7;
-	} else if (q->vars.prob < MAX_PROB / 10) {
-		alpha =
-		    (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5;
-		beta =
-		    (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5;
-	} else {
-		alpha =
-		    (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
-		beta =
-		    (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
+	alpha = ((u64)q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
+	beta = ((u64)q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
+
+	/* We scale alpha and beta differently depending on how heavy the
+	 * congestion is. Please see RFC 8033 for details.
+	 */
+	if (q->vars.prob < MAX_PROB / 10) {
+		alpha >>= 1;
+		beta >>= 1;
+
+		power = 100;
+		while (q->vars.prob < div_u64(MAX_PROB, power) &&
+		       power <= 1000000) {
+			alpha >>= 2;
+			beta >>= 2;
+			power *= 10;
+		}
 	}
 
 	/* alpha and beta should be between 0 and 32, in multiples of 1/16 */
-	delta += alpha * ((qdelay - q->params.target));
-	delta += beta * ((qdelay - qdelay_old));
+	delta += alpha * (u64)(qdelay - q->params.target);
+	delta += beta * (u64)(qdelay - qdelay_old);
 
 	oldprob = q->vars.prob;
 
 	/* to ensure we increase probability in steps of no more than 2% */
-	if (delta > (s32)(MAX_PROB / (100 / 2)) &&
+	if (delta > (s64)(MAX_PROB / (100 / 2)) &&
 	    q->vars.prob >= MAX_PROB / 10)
 		delta = (MAX_PROB / 100) * 2;
 
-- 
cgit v1.2.3-59-g8ed1b


From e5567f5f67621877726f99be040af9fbedda37dc Mon Sep 17 00:00:00 2001
From: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Date: Tue, 19 Feb 2019 11:04:51 -0800
Subject: PCI/ATS: Add pci_prg_resp_pasid_required() interface.

Return the PRG Response PASID Required bit in the Page Request
Status Register.

As per PCIe spec r4.0, sec 10.5.2.3, if this bit is Set, the device
expects a PASID TLP Prefix on PRG Response Messages when the
corresponding Page Requests had a PASID TLP Prefix. If Clear, the device
does not expect PASID TLP Prefixes on any PRG Response Message, and the
device behavior is undefined if the device receives a PRG Response Message
with a PASID TLP Prefix. Also the device behavior is undefined if this
bit is Set and the device receives a PRG Response Message with no PASID TLP
Prefix when the corresponding Page Requests had a PASID TLP Prefix.

This function will be used by drivers like IOMMU, if it is required to
check the status of the PRG Response PASID Required bit before enabling
the PASID support of the device.

Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Jacob Pan <jacob.jun.pan@linux.intel.com>
Cc: Keith Busch <keith.busch@intel.com>
Suggested-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/pci/ats.c             | 30 ++++++++++++++++++++++++++++++
 include/linux/pci-ats.h       |  5 +++++
 include/uapi/linux/pci_regs.h |  1 +
 3 files changed, 36 insertions(+)

(limited to 'include/uapi')

diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c
index 5b78f3b1b918..420cd0a578d0 100644
--- a/drivers/pci/ats.c
+++ b/drivers/pci/ats.c
@@ -368,6 +368,36 @@ int pci_pasid_features(struct pci_dev *pdev)
 }
 EXPORT_SYMBOL_GPL(pci_pasid_features);
 
+/**
+ * pci_prg_resp_pasid_required - Return PRG Response PASID Required bit
+ *				 status.
+ * @pdev: PCI device structure
+ *
+ * Returns 1 if PASID is required in PRG Response Message, 0 otherwise.
+ *
+ * Even though the PRG response PASID status is read from PRI Status
+ * Register, since this API will mainly be used by PASID users, this
+ * function is defined within #ifdef CONFIG_PCI_PASID instead of
+ * CONFIG_PCI_PRI.
+ */
+int pci_prg_resp_pasid_required(struct pci_dev *pdev)
+{
+	u16 status;
+	int pos;
+
+	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
+	if (!pos)
+		return 0;
+
+	pci_read_config_word(pdev, pos + PCI_PRI_STATUS, &status);
+
+	if (status & PCI_PRI_STATUS_PASID)
+		return 1;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pci_prg_resp_pasid_required);
+
 #define PASID_NUMBER_SHIFT	8
 #define PASID_NUMBER_MASK	(0x1f << PASID_NUMBER_SHIFT)
 /**
diff --git a/include/linux/pci-ats.h b/include/linux/pci-ats.h
index 7c4b8e27268c..facfd6a18fe1 100644
--- a/include/linux/pci-ats.h
+++ b/include/linux/pci-ats.h
@@ -40,6 +40,7 @@ void pci_disable_pasid(struct pci_dev *pdev);
 void pci_restore_pasid_state(struct pci_dev *pdev);
 int pci_pasid_features(struct pci_dev *pdev);
 int pci_max_pasids(struct pci_dev *pdev);
+int pci_prg_resp_pasid_required(struct pci_dev *pdev);
 
 #else  /* CONFIG_PCI_PASID */
 
@@ -66,6 +67,10 @@ static inline int pci_max_pasids(struct pci_dev *pdev)
 	return -EINVAL;
 }
 
+static int pci_prg_resp_pasid_required(struct pci_dev *pdev)
+{
+	return 0;
+}
 #endif /* CONFIG_PCI_PASID */
 
 
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index e1e9888c85e6..898be572b010 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -880,6 +880,7 @@
 #define  PCI_PRI_STATUS_RF	0x001	/* Response Failure */
 #define  PCI_PRI_STATUS_UPRGI	0x002	/* Unexpected PRG index */
 #define  PCI_PRI_STATUS_STOPPED	0x100	/* PRI Stopped */
+#define  PCI_PRI_STATUS_PASID	0x8000	/* PRG Response PASID Required */
 #define PCI_PRI_MAX_REQ		0x08	/* PRI max reqs supported */
 #define PCI_PRI_ALLOC_REQ	0x0c	/* PRI max reqs allowed */
 #define PCI_EXT_CAP_PRI_SIZEOF	16
-- 
cgit v1.2.3-59-g8ed1b


From 8c938ddc6df3bbe72809db1be6c9f3af83f5d7a9 Mon Sep 17 00:00:00 2001
From: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Date: Tue, 19 Feb 2019 11:06:09 -0800
Subject: PCI/ATS: Add pci_ats_page_aligned() interface

Return the Page Aligned Request bit in the ATS Capability Register.

As per PCIe spec r4.0, sec 10.5.1.2, if the Page Aligned Request bit is
set, it indicates the Untranslated Addresses generated by the device are
always aligned to a 4096 byte boundary.

An IOMMU that can only translate page-aligned addresses can only be used
with devices that always produce aligned Untranslated Addresses. This
interface will be used by drivers for such IOMMUs to determine whether
devices can use the ATS service.

Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Jacob Pan <jacob.jun.pan@linux.intel.com>
Cc: Keith Busch <keith.busch@intel.com>
Suggested-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/pci/ats.c             | 27 +++++++++++++++++++++++++++
 include/linux/pci.h           |  2 ++
 include/uapi/linux/pci_regs.h |  1 +
 3 files changed, 30 insertions(+)

(limited to 'include/uapi')

diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c
index 420cd0a578d0..97c08146534a 100644
--- a/drivers/pci/ats.c
+++ b/drivers/pci/ats.c
@@ -142,6 +142,33 @@ int pci_ats_queue_depth(struct pci_dev *dev)
 }
 EXPORT_SYMBOL_GPL(pci_ats_queue_depth);
 
+/**
+ * pci_ats_page_aligned - Return Page Aligned Request bit status.
+ * @pdev: the PCI device
+ *
+ * Returns 1, if the Untranslated Addresses generated by the device
+ * are always aligned or 0 otherwise.
+ *
+ * Per PCIe spec r4.0, sec 10.5.1.2, if the Page Aligned Request bit
+ * is set, it indicates the Untranslated Addresses generated by the
+ * device are always aligned to a 4096 byte boundary.
+ */
+int pci_ats_page_aligned(struct pci_dev *pdev)
+{
+	u16 cap;
+
+	if (!pdev->ats_cap)
+		return 0;
+
+	pci_read_config_word(pdev, pdev->ats_cap + PCI_ATS_CAP, &cap);
+
+	if (cap & PCI_ATS_CAP_PAGE_ALIGNED)
+		return 1;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pci_ats_page_aligned);
+
 #ifdef CONFIG_PCI_PRI
 /**
  * pci_enable_pri - Enable PRI capability
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 65f1d8c2f082..9724a8c0496b 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1524,11 +1524,13 @@ void pci_ats_init(struct pci_dev *dev);
 int pci_enable_ats(struct pci_dev *dev, int ps);
 void pci_disable_ats(struct pci_dev *dev);
 int pci_ats_queue_depth(struct pci_dev *dev);
+int pci_ats_page_aligned(struct pci_dev *dev);
 #else
 static inline void pci_ats_init(struct pci_dev *d) { }
 static inline int pci_enable_ats(struct pci_dev *d, int ps) { return -ENODEV; }
 static inline void pci_disable_ats(struct pci_dev *d) { }
 static inline int pci_ats_queue_depth(struct pci_dev *d) { return -ENODEV; }
+static inline int pci_ats_page_aligned(struct pci_dev *dev) { return 0; }
 #endif
 
 #ifdef CONFIG_PCIE_PTM
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 898be572b010..5c98133f2c94 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -866,6 +866,7 @@
 #define PCI_ATS_CAP		0x04	/* ATS Capability Register */
 #define  PCI_ATS_CAP_QDEP(x)	((x) & 0x1f)	/* Invalidate Queue Depth */
 #define  PCI_ATS_MAX_QDEP	32	/* Max Invalidate Queue Depth */
+#define  PCI_ATS_CAP_PAGE_ALIGNED	0x0020 /* Page Aligned Request */
 #define PCI_ATS_CTRL		0x06	/* ATS Control Register */
 #define  PCI_ATS_CTRL_ENABLE	0x8000	/* ATS Enable */
 #define  PCI_ATS_CTRL_STU(x)	((x) & 0x1f)	/* Smallest Translation Unit */
-- 
cgit v1.2.3-59-g8ed1b


From 230afe74d139f37ba5e344ad4e53d65911d12188 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Wed, 27 Feb 2019 00:19:18 +0200
Subject: habanalabs: allow memory allocations larger than 4GB

This patch increase the size field in the uapi structure of the Memory
IOCTL from 32-bit to 64-bit. This is to allow the user to allocate and/or
map memory in chunks that are larger then 4GB.

Goya's device memory (DRAM) can be up to 16GB, and for certain
topologies, the user may want an allocation that is larger than 4GB.

This change doesn't break current user-space because there was a "pad"
field in the uapi structure right after the size field. Changing the size
field to be 64-bit and removing the pad field maintains compatibility with
current user-space.

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/habanalabs/habanalabs.h |  2 +-
 drivers/misc/habanalabs/memory.c     | 10 ++++------
 include/uapi/misc/habanalabs.h       |  6 ++----
 3 files changed, 7 insertions(+), 11 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 901542d685e8..fdf517448599 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -1320,7 +1320,7 @@ void hl_vm_ctx_fini(struct hl_ctx *ctx);
 int hl_vm_init(struct hl_device *hdev);
 void hl_vm_fini(struct hl_device *hdev);
 
-int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u32 size,
+int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u64 size,
 			struct hl_userptr *userptr);
 int hl_unpin_host_memory(struct hl_device *hdev, struct hl_userptr *userptr);
 void hl_userptr_delete_list(struct hl_device *hdev,
diff --git a/drivers/misc/habanalabs/memory.c b/drivers/misc/habanalabs/memory.c
index 9e3491dc3b55..4b57d7ce50dd 100644
--- a/drivers/misc/habanalabs/memory.c
+++ b/drivers/misc/habanalabs/memory.c
@@ -1210,7 +1210,7 @@ out:
  * - Pins the physical pages
  * - Create a SG list from those pages
  */
-int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u32 size,
+int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u64 size,
 			struct hl_userptr *userptr)
 {
 	u64 start, end;
@@ -1218,14 +1218,12 @@ int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u32 size,
 	int rc;
 
 	if (!size) {
-		dev_err(hdev->dev, "size to pin is invalid - %d\n",
-			size);
+		dev_err(hdev->dev, "size to pin is invalid - %llu\n", size);
 		return -EINVAL;
 	}
 
 	if (!access_ok((void __user *) (uintptr_t) addr, size)) {
-		dev_err(hdev->dev, "user pointer is invalid - 0x%llx\n",
-			addr);
+		dev_err(hdev->dev, "user pointer is invalid - 0x%llx\n", addr);
 		return -EFAULT;
 	}
 
@@ -1236,7 +1234,7 @@ int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u32 size,
 	if (((addr + size) < addr) ||
 			PAGE_ALIGN(addr + size) < (addr + size)) {
 		dev_err(hdev->dev,
-			"user pointer 0x%llx + %u causes integer overflow\n",
+			"user pointer 0x%llx + %llu causes integer overflow\n",
 			addr, size);
 		return -EINVAL;
 	}
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 4afc1891ece8..23d6ad3459cb 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -237,8 +237,7 @@ struct hl_mem_in {
 		/* HL_MEM_OP_ALLOC- allocate device memory */
 		struct {
 			/* Size to alloc */
-			__u32 mem_size;
-			__u32 pad;
+			__u64 mem_size;
 		} alloc;
 
 		/* HL_MEM_OP_FREE - free device memory */
@@ -278,8 +277,7 @@ struct hl_mem_in {
 			 */
 			__u64 hint_addr;
 			/* Size of allocated host memory */
-			__u32 mem_size;
-			__u32 pad;
+			__u64 mem_size;
 		} map_host;
 
 		/* HL_MEM_OP_UNMAP - unmap host memory */
-- 
cgit v1.2.3-59-g8ed1b


From 5f8f8b93aeb8371c54af08bece2bd04bc2d48707 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Mon, 25 Feb 2019 14:28:40 -0800
Subject: bpf: expose program stats via bpf_prog_info

Return bpf program run_time_ns and run_cnt via bpf_prog_info

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 2 ++
 kernel/bpf/syscall.c     | 5 +++++
 2 files changed, 7 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index bcdd2474eee7..2e308e90ffea 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2813,6 +2813,8 @@ struct bpf_prog_info {
 	__u32 jited_line_info_rec_size;
 	__u32 nr_prog_tags;
 	__aligned_u64 prog_tags;
+	__u64 run_time_ns;
+	__u64 run_cnt;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 31cf66fc3f5c..174581dfe225 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2152,6 +2152,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
 	struct bpf_prog_info info = {};
 	u32 info_len = attr->info.info_len;
+	struct bpf_prog_stats stats;
 	char __user *uinsns;
 	u32 ulen;
 	int err;
@@ -2191,6 +2192,10 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	if (err)
 		return err;
 
+	bpf_prog_get_stats(prog, &stats);
+	info.run_time_ns = stats.nsecs;
+	info.run_cnt = stats.cnt;
+
 	if (!capable(CAP_SYS_ADMIN)) {
 		info.jited_prog_len = 0;
 		info.xlated_prog_len = 0;
-- 
cgit v1.2.3-59-g8ed1b


From 541664d360d1fdaa116473410265b2cb8a806b50 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Thu, 28 Feb 2019 11:55:44 +0200
Subject: habanalabs: add comments in uapi/misc/habanalabs.h

Add comment about minimum and maximum size of command buffer.
Add some text about the expected input of CS IOCTL.

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/misc/habanalabs.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 23d6ad3459cb..7fd6f633534c 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -112,7 +112,9 @@ struct hl_cb_in {
 	__u64 cb_handle;
 	/* HL_CB_OP_* */
 	__u32 op;
-	/* Size of CB. Minimum requested size must be PAGE_SIZE */
+	/* Size of CB. Maximum size is 2MB. The minimum size that will be
+	 * allocated, regardless of this parameter's value, is PAGE_SIZE
+	 */
 	__u32 cb_size;
 	/* Context ID - Currently not in use */
 	__u32 ctx_id;
@@ -364,6 +366,12 @@ union hl_mem_args {
  * internal. The driver will get completion notifications from the device only
  * on JOBS which are enqueued in the external queues.
  *
+ * For jobs on external queues, the user needs to create command buffers
+ * through the CB ioctl and give the CB's handle to the CS ioctl. For jobs on
+ * internal queues, the user needs to prepare a "command buffer" with packets
+ * on either the SRAM or DRAM, and give the device address of that buffer to
+ * the CS ioctl.
+ *
  * This IOCTL is asynchronous in regard to the actual execution of the CS. This
  * means it returns immediately after ALL the JOBS were enqueued on their
  * relevant queues. Therefore, the user mustn't assume the CS has been completed
-- 
cgit v1.2.3-59-g8ed1b


From 2b188cc1bb857a9d4701ae59aa7768b5124e262e Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 7 Jan 2019 10:46:33 -0700
Subject: Add io_uring IO interface

The submission queue (SQ) and completion queue (CQ) rings are shared
between the application and the kernel. This eliminates the need to
copy data back and forth to submit and complete IO.

IO submissions use the io_uring_sqe data structure, and completions
are generated in the form of io_uring_cqe data structures. The SQ
ring is an index into the io_uring_sqe array, which makes it possible
to submit a batch of IOs without them being contiguous in the ring.
The CQ ring is always contiguous, as completion events are inherently
unordered, and hence any io_uring_cqe entry can point back to an
arbitrary submission.

Two new system calls are added for this:

io_uring_setup(entries, params)
	Sets up an io_uring instance for doing async IO. On success,
	returns a file descriptor that the application can mmap to
	gain access to the SQ ring, CQ ring, and io_uring_sqes.

io_uring_enter(fd, to_submit, min_complete, flags, sigset, sigsetsize)
	Initiates IO against the rings mapped to this fd, or waits for
	them to complete, or both. The behavior is controlled by the
	parameters passed in. If 'to_submit' is non-zero, then we'll
	try and submit new IO. If IORING_ENTER_GETEVENTS is set, the
	kernel will wait for 'min_complete' events, if they aren't
	already available. It's valid to set IORING_ENTER_GETEVENTS
	and 'min_complete' == 0 at the same time, this allows the
	kernel to return already completed events without waiting
	for them. This is useful only for polling, as for IRQ
	driven IO, the application can just check the CQ ring
	without entering the kernel.

With this setup, it's possible to do async IO with a single system
call. Future developments will enable polled IO with this interface,
and polled submission as well. The latter will enable an application
to do IO without doing ANY system calls at all.

For IRQ driven IO, an application only needs to enter the kernel for
completions if it wants to wait for them to occur.

Each io_uring is backed by a workqueue, to support buffered async IO
as well. We will only punt to an async context if the command would
need to wait for IO on the device side. Any data that can be accessed
directly in the page cache is done inline. This avoids the slowness
issue of usual threadpools, since cached data is accessed as quickly
as a sync interface.

Sample application: http://git.kernel.dk/cgit/fio/plain/t/io_uring.c

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 arch/x86/entry/syscalls/syscall_32.tbl |    2 +
 arch/x86/entry/syscalls/syscall_64.tbl |    2 +
 fs/Makefile                            |    1 +
 fs/io_uring.c                          | 1255 ++++++++++++++++++++++++++++++++
 include/linux/fs.h                     |    9 +
 include/linux/sched/user.h             |    2 +-
 include/linux/syscalls.h               |    6 +
 include/uapi/asm-generic/unistd.h      |    6 +-
 include/uapi/linux/io_uring.h          |   95 +++
 init/Kconfig                           |    9 +
 kernel/sys_ni.c                        |    2 +
 net/unix/garbage.c                     |    3 +
 12 files changed, 1390 insertions(+), 2 deletions(-)
 create mode 100644 fs/io_uring.c
 create mode 100644 include/uapi/linux/io_uring.h

(limited to 'include/uapi')

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 3cf7b533b3d1..481c126259e9 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -398,3 +398,5 @@
 384	i386	arch_prctl		sys_arch_prctl			__ia32_compat_sys_arch_prctl
 385	i386	io_pgetevents		sys_io_pgetevents		__ia32_compat_sys_io_pgetevents
 386	i386	rseq			sys_rseq			__ia32_sys_rseq
+425	i386	io_uring_setup		sys_io_uring_setup		__ia32_sys_io_uring_setup
+426	i386	io_uring_enter		sys_io_uring_enter		__ia32_sys_io_uring_enter
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index f0b1709a5ffb..6a32a430c8e0 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -343,6 +343,8 @@
 332	common	statx			__x64_sys_statx
 333	common	io_pgetevents		__x64_sys_io_pgetevents
 334	common	rseq			__x64_sys_rseq
+425	common	io_uring_setup		__x64_sys_io_uring_setup
+426	common	io_uring_enter		__x64_sys_io_uring_enter
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/Makefile b/fs/Makefile
index 293733f61594..8e15d6fc4340 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -30,6 +30,7 @@ obj-$(CONFIG_TIMERFD)		+= timerfd.o
 obj-$(CONFIG_EVENTFD)		+= eventfd.o
 obj-$(CONFIG_USERFAULTFD)	+= userfaultfd.o
 obj-$(CONFIG_AIO)               += aio.o
+obj-$(CONFIG_IO_URING)		+= io_uring.o
 obj-$(CONFIG_FS_DAX)		+= dax.o
 obj-$(CONFIG_FS_ENCRYPTION)	+= crypto/
 obj-$(CONFIG_FILE_LOCKING)      += locks.o
diff --git a/fs/io_uring.c b/fs/io_uring.c
new file mode 100644
index 000000000000..f68052290426
--- /dev/null
+++ b/fs/io_uring.c
@@ -0,0 +1,1255 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shared application/kernel submission and completion ring pairs, for
+ * supporting fast/efficient IO.
+ *
+ * A note on the read/write ordering memory barriers that are matched between
+ * the application and kernel side. When the application reads the CQ ring
+ * tail, it must use an appropriate smp_rmb() to order with the smp_wmb()
+ * the kernel uses after writing the tail. Failure to do so could cause a
+ * delay in when the application notices that completion events available.
+ * This isn't a fatal condition. Likewise, the application must use an
+ * appropriate smp_wmb() both before writing the SQ tail, and after writing
+ * the SQ tail. The first one orders the sqe writes with the tail write, and
+ * the latter is paired with the smp_rmb() the kernel will issue before
+ * reading the SQ tail on submission.
+ *
+ * Also see the examples in the liburing library:
+ *
+ *	git://git.kernel.dk/liburing
+ *
+ * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
+ * from data shared between the kernel and application. This is done both
+ * for ordering purposes, but also to ensure that once a value is loaded from
+ * data that the application could potentially modify, it remains stable.
+ *
+ * Copyright (C) 2018-2019 Jens Axboe
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/syscalls.h>
+#include <linux/compat.h>
+#include <linux/refcount.h>
+#include <linux/uio.h>
+
+#include <linux/sched/signal.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/mmu_context.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/blkdev.h>
+#include <linux/net.h>
+#include <net/sock.h>
+#include <net/af_unix.h>
+#include <linux/anon_inodes.h>
+#include <linux/sched/mm.h>
+#include <linux/uaccess.h>
+#include <linux/nospec.h>
+
+#include <uapi/linux/io_uring.h>
+
+#include "internal.h"
+
+#define IORING_MAX_ENTRIES	4096
+
+struct io_uring {
+	u32 head ____cacheline_aligned_in_smp;
+	u32 tail ____cacheline_aligned_in_smp;
+};
+
+struct io_sq_ring {
+	struct io_uring		r;
+	u32			ring_mask;
+	u32			ring_entries;
+	u32			dropped;
+	u32			flags;
+	u32			array[];
+};
+
+struct io_cq_ring {
+	struct io_uring		r;
+	u32			ring_mask;
+	u32			ring_entries;
+	u32			overflow;
+	struct io_uring_cqe	cqes[];
+};
+
+struct io_ring_ctx {
+	struct {
+		struct percpu_ref	refs;
+	} ____cacheline_aligned_in_smp;
+
+	struct {
+		unsigned int		flags;
+		bool			compat;
+		bool			account_mem;
+
+		/* SQ ring */
+		struct io_sq_ring	*sq_ring;
+		unsigned		cached_sq_head;
+		unsigned		sq_entries;
+		unsigned		sq_mask;
+		struct io_uring_sqe	*sq_sqes;
+	} ____cacheline_aligned_in_smp;
+
+	/* IO offload */
+	struct workqueue_struct	*sqo_wq;
+	struct mm_struct	*sqo_mm;
+
+	struct {
+		/* CQ ring */
+		struct io_cq_ring	*cq_ring;
+		unsigned		cached_cq_tail;
+		unsigned		cq_entries;
+		unsigned		cq_mask;
+		struct wait_queue_head	cq_wait;
+		struct fasync_struct	*cq_fasync;
+	} ____cacheline_aligned_in_smp;
+
+	struct user_struct	*user;
+
+	struct completion	ctx_done;
+
+	struct {
+		struct mutex		uring_lock;
+		wait_queue_head_t	wait;
+	} ____cacheline_aligned_in_smp;
+
+	struct {
+		spinlock_t		completion_lock;
+	} ____cacheline_aligned_in_smp;
+
+#if defined(CONFIG_UNIX)
+	struct socket		*ring_sock;
+#endif
+};
+
+struct sqe_submit {
+	const struct io_uring_sqe	*sqe;
+	unsigned short			index;
+	bool				has_user;
+};
+
+struct io_kiocb {
+	struct kiocb		rw;
+
+	struct sqe_submit	submit;
+
+	struct io_ring_ctx	*ctx;
+	struct list_head	list;
+	unsigned int		flags;
+#define REQ_F_FORCE_NONBLOCK	1	/* inline submission attempt */
+	u64			user_data;
+
+	struct work_struct	work;
+};
+
+#define IO_PLUG_THRESHOLD		2
+
+static struct kmem_cache *req_cachep;
+
+static const struct file_operations io_uring_fops;
+
+struct sock *io_uring_get_socket(struct file *file)
+{
+#if defined(CONFIG_UNIX)
+	if (file->f_op == &io_uring_fops) {
+		struct io_ring_ctx *ctx = file->private_data;
+
+		return ctx->ring_sock->sk;
+	}
+#endif
+	return NULL;
+}
+EXPORT_SYMBOL(io_uring_get_socket);
+
+static void io_ring_ctx_ref_free(struct percpu_ref *ref)
+{
+	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
+
+	complete(&ctx->ctx_done);
+}
+
+static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
+{
+	struct io_ring_ctx *ctx;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return NULL;
+
+	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) {
+		kfree(ctx);
+		return NULL;
+	}
+
+	ctx->flags = p->flags;
+	init_waitqueue_head(&ctx->cq_wait);
+	init_completion(&ctx->ctx_done);
+	mutex_init(&ctx->uring_lock);
+	init_waitqueue_head(&ctx->wait);
+	spin_lock_init(&ctx->completion_lock);
+	return ctx;
+}
+
+static void io_commit_cqring(struct io_ring_ctx *ctx)
+{
+	struct io_cq_ring *ring = ctx->cq_ring;
+
+	if (ctx->cached_cq_tail != READ_ONCE(ring->r.tail)) {
+		/* order cqe stores with ring update */
+		smp_store_release(&ring->r.tail, ctx->cached_cq_tail);
+
+		/*
+		 * Write sider barrier of tail update, app has read side. See
+		 * comment at the top of this file.
+		 */
+		smp_wmb();
+
+		if (wq_has_sleeper(&ctx->cq_wait)) {
+			wake_up_interruptible(&ctx->cq_wait);
+			kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
+		}
+	}
+}
+
+static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
+{
+	struct io_cq_ring *ring = ctx->cq_ring;
+	unsigned tail;
+
+	tail = ctx->cached_cq_tail;
+	/* See comment at the top of the file */
+	smp_rmb();
+	if (tail + 1 == READ_ONCE(ring->r.head))
+		return NULL;
+
+	ctx->cached_cq_tail++;
+	return &ring->cqes[tail & ctx->cq_mask];
+}
+
+static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
+				 long res, unsigned ev_flags)
+{
+	struct io_uring_cqe *cqe;
+
+	/*
+	 * If we can't get a cq entry, userspace overflowed the
+	 * submission (by quite a lot). Increment the overflow count in
+	 * the ring.
+	 */
+	cqe = io_get_cqring(ctx);
+	if (cqe) {
+		WRITE_ONCE(cqe->user_data, ki_user_data);
+		WRITE_ONCE(cqe->res, res);
+		WRITE_ONCE(cqe->flags, ev_flags);
+	} else {
+		unsigned overflow = READ_ONCE(ctx->cq_ring->overflow);
+
+		WRITE_ONCE(ctx->cq_ring->overflow, overflow + 1);
+	}
+}
+
+static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 ki_user_data,
+				long res, unsigned ev_flags)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ctx->completion_lock, flags);
+	io_cqring_fill_event(ctx, ki_user_data, res, ev_flags);
+	io_commit_cqring(ctx);
+	spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+	if (waitqueue_active(&ctx->wait))
+		wake_up(&ctx->wait);
+}
+
+static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
+{
+	percpu_ref_put_many(&ctx->refs, refs);
+
+	if (waitqueue_active(&ctx->wait))
+		wake_up(&ctx->wait);
+}
+
+static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx)
+{
+	struct io_kiocb *req;
+
+	if (!percpu_ref_tryget(&ctx->refs))
+		return NULL;
+
+	req = kmem_cache_alloc(req_cachep, __GFP_NOWARN);
+	if (req) {
+		req->ctx = ctx;
+		req->flags = 0;
+		return req;
+	}
+
+	io_ring_drop_ctx_refs(ctx, 1);
+	return NULL;
+}
+
+static void io_free_req(struct io_kiocb *req)
+{
+	io_ring_drop_ctx_refs(req->ctx, 1);
+	kmem_cache_free(req_cachep, req);
+}
+
+static void kiocb_end_write(struct kiocb *kiocb)
+{
+	if (kiocb->ki_flags & IOCB_WRITE) {
+		struct inode *inode = file_inode(kiocb->ki_filp);
+
+		/*
+		 * Tell lockdep we inherited freeze protection from submission
+		 * thread.
+		 */
+		if (S_ISREG(inode->i_mode))
+			__sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
+		file_end_write(kiocb->ki_filp);
+	}
+}
+
+static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
+{
+	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
+
+	kiocb_end_write(kiocb);
+
+	fput(kiocb->ki_filp);
+	io_cqring_add_event(req->ctx, req->user_data, res, 0);
+	io_free_req(req);
+}
+
+/*
+ * If we tracked the file through the SCM inflight mechanism, we could support
+ * any file. For now, just ensure that anything potentially problematic is done
+ * inline.
+ */
+static bool io_file_supports_async(struct file *file)
+{
+	umode_t mode = file_inode(file)->i_mode;
+
+	if (S_ISBLK(mode) || S_ISCHR(mode))
+		return true;
+	if (S_ISREG(mode) && file->f_op != &io_uring_fops)
+		return true;
+
+	return false;
+}
+
+static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+		      bool force_nonblock)
+{
+	struct kiocb *kiocb = &req->rw;
+	unsigned ioprio;
+	int fd, ret;
+
+	/* For -EAGAIN retry, everything is already prepped */
+	if (kiocb->ki_filp)
+		return 0;
+
+	fd = READ_ONCE(sqe->fd);
+	kiocb->ki_filp = fget(fd);
+	if (unlikely(!kiocb->ki_filp))
+		return -EBADF;
+	if (force_nonblock && !io_file_supports_async(kiocb->ki_filp))
+		force_nonblock = false;
+	kiocb->ki_pos = READ_ONCE(sqe->off);
+	kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
+	kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
+
+	ioprio = READ_ONCE(sqe->ioprio);
+	if (ioprio) {
+		ret = ioprio_check_cap(ioprio);
+		if (ret)
+			goto out_fput;
+
+		kiocb->ki_ioprio = ioprio;
+	} else
+		kiocb->ki_ioprio = get_current_ioprio();
+
+	ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
+	if (unlikely(ret))
+		goto out_fput;
+	if (force_nonblock) {
+		kiocb->ki_flags |= IOCB_NOWAIT;
+		req->flags |= REQ_F_FORCE_NONBLOCK;
+	}
+	if (kiocb->ki_flags & IOCB_HIPRI) {
+		ret = -EINVAL;
+		goto out_fput;
+	}
+
+	kiocb->ki_complete = io_complete_rw;
+	return 0;
+out_fput:
+	fput(kiocb->ki_filp);
+	return ret;
+}
+
+static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
+{
+	switch (ret) {
+	case -EIOCBQUEUED:
+		break;
+	case -ERESTARTSYS:
+	case -ERESTARTNOINTR:
+	case -ERESTARTNOHAND:
+	case -ERESTART_RESTARTBLOCK:
+		/*
+		 * We can't just restart the syscall, since previously
+		 * submitted sqes may already be in progress. Just fail this
+		 * IO with EINTR.
+		 */
+		ret = -EINTR;
+		/* fall through */
+	default:
+		kiocb->ki_complete(kiocb, ret, 0);
+	}
+}
+
+static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
+			   const struct sqe_submit *s, struct iovec **iovec,
+			   struct iov_iter *iter)
+{
+	const struct io_uring_sqe *sqe = s->sqe;
+	void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	size_t sqe_len = READ_ONCE(sqe->len);
+
+	if (!s->has_user)
+		return -EFAULT;
+
+#ifdef CONFIG_COMPAT
+	if (ctx->compat)
+		return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
+						iovec, iter);
+#endif
+
+	return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
+}
+
+static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s,
+		       bool force_nonblock)
+{
+	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+	struct kiocb *kiocb = &req->rw;
+	struct iov_iter iter;
+	struct file *file;
+	ssize_t ret;
+
+	ret = io_prep_rw(req, s->sqe, force_nonblock);
+	if (ret)
+		return ret;
+	file = kiocb->ki_filp;
+
+	ret = -EBADF;
+	if (unlikely(!(file->f_mode & FMODE_READ)))
+		goto out_fput;
+	ret = -EINVAL;
+	if (unlikely(!file->f_op->read_iter))
+		goto out_fput;
+
+	ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
+	if (ret)
+		goto out_fput;
+
+	ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_iter_count(&iter));
+	if (!ret) {
+		ssize_t ret2;
+
+		/* Catch -EAGAIN return for forced non-blocking submission */
+		ret2 = call_read_iter(file, kiocb, &iter);
+		if (!force_nonblock || ret2 != -EAGAIN)
+			io_rw_done(kiocb, ret2);
+		else
+			ret = -EAGAIN;
+	}
+	kfree(iovec);
+out_fput:
+	/* Hold on to the file for -EAGAIN */
+	if (unlikely(ret && ret != -EAGAIN))
+		fput(file);
+	return ret;
+}
+
+static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s,
+			bool force_nonblock)
+{
+	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+	struct kiocb *kiocb = &req->rw;
+	struct iov_iter iter;
+	struct file *file;
+	ssize_t ret;
+
+	ret = io_prep_rw(req, s->sqe, force_nonblock);
+	if (ret)
+		return ret;
+	/* Hold on to the file for -EAGAIN */
+	if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT))
+		return -EAGAIN;
+
+	ret = -EBADF;
+	file = kiocb->ki_filp;
+	if (unlikely(!(file->f_mode & FMODE_WRITE)))
+		goto out_fput;
+	ret = -EINVAL;
+	if (unlikely(!file->f_op->write_iter))
+		goto out_fput;
+
+	ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
+	if (ret)
+		goto out_fput;
+
+	ret = rw_verify_area(WRITE, file, &kiocb->ki_pos,
+				iov_iter_count(&iter));
+	if (!ret) {
+		/*
+		 * Open-code file_start_write here to grab freeze protection,
+		 * which will be released by another thread in
+		 * io_complete_rw().  Fool lockdep by telling it the lock got
+		 * released so that it doesn't complain about the held lock when
+		 * we return to userspace.
+		 */
+		if (S_ISREG(file_inode(file)->i_mode)) {
+			__sb_start_write(file_inode(file)->i_sb,
+						SB_FREEZE_WRITE, true);
+			__sb_writers_release(file_inode(file)->i_sb,
+						SB_FREEZE_WRITE);
+		}
+		kiocb->ki_flags |= IOCB_WRITE;
+		io_rw_done(kiocb, call_write_iter(file, kiocb, &iter));
+	}
+	kfree(iovec);
+out_fput:
+	if (unlikely(ret))
+		fput(file);
+	return ret;
+}
+
+/*
+ * IORING_OP_NOP just posts a completion event, nothing else.
+ */
+static int io_nop(struct io_kiocb *req, u64 user_data)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	long err = 0;
+
+	/*
+	 * Twilight zone - it's possible that someone issued an opcode that
+	 * has a file attached, then got -EAGAIN on submission, and changed
+	 * the sqe before we retried it from async context. Avoid dropping
+	 * a file reference for this malicious case, and flag the error.
+	 */
+	if (req->rw.ki_filp) {
+		err = -EBADF;
+		fput(req->rw.ki_filp);
+	}
+	io_cqring_add_event(ctx, user_data, err, 0);
+	io_free_req(req);
+	return 0;
+}
+
+static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
+			   const struct sqe_submit *s, bool force_nonblock)
+{
+	ssize_t ret;
+	int opcode;
+
+	if (unlikely(s->index >= ctx->sq_entries))
+		return -EINVAL;
+	req->user_data = READ_ONCE(s->sqe->user_data);
+
+	opcode = READ_ONCE(s->sqe->opcode);
+	switch (opcode) {
+	case IORING_OP_NOP:
+		ret = io_nop(req, req->user_data);
+		break;
+	case IORING_OP_READV:
+		ret = io_read(req, s, force_nonblock);
+		break;
+	case IORING_OP_WRITEV:
+		ret = io_write(req, s, force_nonblock);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+static void io_sq_wq_submit_work(struct work_struct *work)
+{
+	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+	struct sqe_submit *s = &req->submit;
+	const struct io_uring_sqe *sqe = s->sqe;
+	struct io_ring_ctx *ctx = req->ctx;
+	mm_segment_t old_fs = get_fs();
+	int ret;
+
+	 /* Ensure we clear previously set forced non-block flag */
+	req->flags &= ~REQ_F_FORCE_NONBLOCK;
+	req->rw.ki_flags &= ~IOCB_NOWAIT;
+
+	if (!mmget_not_zero(ctx->sqo_mm)) {
+		ret = -EFAULT;
+		goto err;
+	}
+
+	use_mm(ctx->sqo_mm);
+	set_fs(USER_DS);
+	s->has_user = true;
+
+	ret = __io_submit_sqe(ctx, req, s, false);
+
+	set_fs(old_fs);
+	unuse_mm(ctx->sqo_mm);
+	mmput(ctx->sqo_mm);
+err:
+	if (ret) {
+		io_cqring_add_event(ctx, sqe->user_data, ret, 0);
+		io_free_req(req);
+	}
+
+	/* async context always use a copy of the sqe */
+	kfree(sqe);
+}
+
+static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s)
+{
+	struct io_kiocb *req;
+	ssize_t ret;
+
+	/* enforce forwards compatibility on users */
+	if (unlikely(s->sqe->flags))
+		return -EINVAL;
+
+	req = io_get_req(ctx);
+	if (unlikely(!req))
+		return -EAGAIN;
+
+	req->rw.ki_filp = NULL;
+
+	ret = __io_submit_sqe(ctx, req, s, true);
+	if (ret == -EAGAIN) {
+		struct io_uring_sqe *sqe_copy;
+
+		sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
+		if (sqe_copy) {
+			memcpy(sqe_copy, s->sqe, sizeof(*sqe_copy));
+			s->sqe = sqe_copy;
+
+			memcpy(&req->submit, s, sizeof(*s));
+			INIT_WORK(&req->work, io_sq_wq_submit_work);
+			queue_work(ctx->sqo_wq, &req->work);
+			ret = 0;
+		}
+	}
+	if (ret)
+		io_free_req(req);
+
+	return ret;
+}
+
+static void io_commit_sqring(struct io_ring_ctx *ctx)
+{
+	struct io_sq_ring *ring = ctx->sq_ring;
+
+	if (ctx->cached_sq_head != READ_ONCE(ring->r.head)) {
+		/*
+		 * Ensure any loads from the SQEs are done at this point,
+		 * since once we write the new head, the application could
+		 * write new data to them.
+		 */
+		smp_store_release(&ring->r.head, ctx->cached_sq_head);
+
+		/*
+		 * write side barrier of head update, app has read side. See
+		 * comment at the top of this file
+		 */
+		smp_wmb();
+	}
+}
+
+/*
+ * Undo last io_get_sqring()
+ */
+static void io_drop_sqring(struct io_ring_ctx *ctx)
+{
+	ctx->cached_sq_head--;
+}
+
+/*
+ * Fetch an sqe, if one is available. Note that s->sqe will point to memory
+ * that is mapped by userspace. This means that care needs to be taken to
+ * ensure that reads are stable, as we cannot rely on userspace always
+ * being a good citizen. If members of the sqe are validated and then later
+ * used, it's important that those reads are done through READ_ONCE() to
+ * prevent a re-load down the line.
+ */
+static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
+{
+	struct io_sq_ring *ring = ctx->sq_ring;
+	unsigned head;
+
+	/*
+	 * The cached sq head (or cq tail) serves two purposes:
+	 *
+	 * 1) allows us to batch the cost of updating the user visible
+	 *    head updates.
+	 * 2) allows the kernel side to track the head on its own, even
+	 *    though the application is the one updating it.
+	 */
+	head = ctx->cached_sq_head;
+	/* See comment at the top of this file */
+	smp_rmb();
+	if (head == READ_ONCE(ring->r.tail))
+		return false;
+
+	head = READ_ONCE(ring->array[head & ctx->sq_mask]);
+	if (head < ctx->sq_entries) {
+		s->index = head;
+		s->sqe = &ctx->sq_sqes[head];
+		ctx->cached_sq_head++;
+		return true;
+	}
+
+	/* drop invalid entries */
+	ctx->cached_sq_head++;
+	ring->dropped++;
+	/* See comment at the top of this file */
+	smp_wmb();
+	return false;
+}
+
+static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
+{
+	int i, ret = 0, submit = 0;
+	struct blk_plug plug;
+
+	if (to_submit > IO_PLUG_THRESHOLD)
+		blk_start_plug(&plug);
+
+	for (i = 0; i < to_submit; i++) {
+		struct sqe_submit s;
+
+		if (!io_get_sqring(ctx, &s))
+			break;
+
+		s.has_user = true;
+		ret = io_submit_sqe(ctx, &s);
+		if (ret) {
+			io_drop_sqring(ctx);
+			break;
+		}
+
+		submit++;
+	}
+	io_commit_sqring(ctx);
+
+	if (to_submit > IO_PLUG_THRESHOLD)
+		blk_finish_plug(&plug);
+
+	return submit ? submit : ret;
+}
+
+static unsigned io_cqring_events(struct io_cq_ring *ring)
+{
+	return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head);
+}
+
+/*
+ * Wait until events become available, if we don't already have some. The
+ * application must reap them itself, as they reside on the shared cq ring.
+ */
+static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
+			  const sigset_t __user *sig, size_t sigsz)
+{
+	struct io_cq_ring *ring = ctx->cq_ring;
+	sigset_t ksigmask, sigsaved;
+	DEFINE_WAIT(wait);
+	int ret;
+
+	/* See comment at the top of this file */
+	smp_rmb();
+	if (io_cqring_events(ring) >= min_events)
+		return 0;
+
+	if (sig) {
+		ret = set_user_sigmask(sig, &ksigmask, &sigsaved, sigsz);
+		if (ret)
+			return ret;
+	}
+
+	do {
+		prepare_to_wait(&ctx->wait, &wait, TASK_INTERRUPTIBLE);
+
+		ret = 0;
+		/* See comment at the top of this file */
+		smp_rmb();
+		if (io_cqring_events(ring) >= min_events)
+			break;
+
+		schedule();
+
+		ret = -EINTR;
+		if (signal_pending(current))
+			break;
+	} while (1);
+
+	finish_wait(&ctx->wait, &wait);
+
+	if (sig)
+		restore_user_sigmask(sig, &sigsaved);
+
+	return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0;
+}
+
+static int io_sq_offload_start(struct io_ring_ctx *ctx)
+{
+	int ret;
+
+	mmgrab(current->mm);
+	ctx->sqo_mm = current->mm;
+
+	/* Do QD, or 2 * CPUS, whatever is smallest */
+	ctx->sqo_wq = alloc_workqueue("io_ring-wq", WQ_UNBOUND | WQ_FREEZABLE,
+			min(ctx->sq_entries - 1, 2 * num_online_cpus()));
+	if (!ctx->sqo_wq) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	return 0;
+err:
+	mmdrop(ctx->sqo_mm);
+	ctx->sqo_mm = NULL;
+	return ret;
+}
+
+static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
+{
+	atomic_long_sub(nr_pages, &user->locked_vm);
+}
+
+static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
+{
+	unsigned long page_limit, cur_pages, new_pages;
+
+	/* Don't allow more pages than we can safely lock */
+	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+	do {
+		cur_pages = atomic_long_read(&user->locked_vm);
+		new_pages = cur_pages + nr_pages;
+		if (new_pages > page_limit)
+			return -ENOMEM;
+	} while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
+					new_pages) != cur_pages);
+
+	return 0;
+}
+
+static void io_mem_free(void *ptr)
+{
+	struct page *page = virt_to_head_page(ptr);
+
+	if (put_page_testzero(page))
+		free_compound_page(page);
+}
+
+static void *io_mem_alloc(size_t size)
+{
+	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
+				__GFP_NORETRY;
+
+	return (void *) __get_free_pages(gfp_flags, get_order(size));
+}
+
+static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
+{
+	struct io_sq_ring *sq_ring;
+	struct io_cq_ring *cq_ring;
+	size_t bytes;
+
+	bytes = struct_size(sq_ring, array, sq_entries);
+	bytes += array_size(sizeof(struct io_uring_sqe), sq_entries);
+	bytes += struct_size(cq_ring, cqes, cq_entries);
+
+	return (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+}
+
+static void io_ring_ctx_free(struct io_ring_ctx *ctx)
+{
+	if (ctx->sqo_wq)
+		destroy_workqueue(ctx->sqo_wq);
+	if (ctx->sqo_mm)
+		mmdrop(ctx->sqo_mm);
+#if defined(CONFIG_UNIX)
+	if (ctx->ring_sock)
+		sock_release(ctx->ring_sock);
+#endif
+
+	io_mem_free(ctx->sq_ring);
+	io_mem_free(ctx->sq_sqes);
+	io_mem_free(ctx->cq_ring);
+
+	percpu_ref_exit(&ctx->refs);
+	if (ctx->account_mem)
+		io_unaccount_mem(ctx->user,
+				ring_pages(ctx->sq_entries, ctx->cq_entries));
+	free_uid(ctx->user);
+	kfree(ctx);
+}
+
+static __poll_t io_uring_poll(struct file *file, poll_table *wait)
+{
+	struct io_ring_ctx *ctx = file->private_data;
+	__poll_t mask = 0;
+
+	poll_wait(file, &ctx->cq_wait, wait);
+	/* See comment at the top of this file */
+	smp_rmb();
+	if (READ_ONCE(ctx->sq_ring->r.tail) + 1 != ctx->cached_sq_head)
+		mask |= EPOLLOUT | EPOLLWRNORM;
+	if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail)
+		mask |= EPOLLIN | EPOLLRDNORM;
+
+	return mask;
+}
+
+static int io_uring_fasync(int fd, struct file *file, int on)
+{
+	struct io_ring_ctx *ctx = file->private_data;
+
+	return fasync_helper(fd, file, on, &ctx->cq_fasync);
+}
+
+static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
+{
+	mutex_lock(&ctx->uring_lock);
+	percpu_ref_kill(&ctx->refs);
+	mutex_unlock(&ctx->uring_lock);
+
+	wait_for_completion(&ctx->ctx_done);
+	io_ring_ctx_free(ctx);
+}
+
+static int io_uring_release(struct inode *inode, struct file *file)
+{
+	struct io_ring_ctx *ctx = file->private_data;
+
+	file->private_data = NULL;
+	io_ring_ctx_wait_and_kill(ctx);
+	return 0;
+}
+
+static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT;
+	unsigned long sz = vma->vm_end - vma->vm_start;
+	struct io_ring_ctx *ctx = file->private_data;
+	unsigned long pfn;
+	struct page *page;
+	void *ptr;
+
+	switch (offset) {
+	case IORING_OFF_SQ_RING:
+		ptr = ctx->sq_ring;
+		break;
+	case IORING_OFF_SQES:
+		ptr = ctx->sq_sqes;
+		break;
+	case IORING_OFF_CQ_RING:
+		ptr = ctx->cq_ring;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	page = virt_to_head_page(ptr);
+	if (sz > (PAGE_SIZE << compound_order(page)))
+		return -EINVAL;
+
+	pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
+	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
+}
+
+SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
+		u32, min_complete, u32, flags, const sigset_t __user *, sig,
+		size_t, sigsz)
+{
+	struct io_ring_ctx *ctx;
+	long ret = -EBADF;
+	int submitted = 0;
+	struct fd f;
+
+	if (flags & ~IORING_ENTER_GETEVENTS)
+		return -EINVAL;
+
+	f = fdget(fd);
+	if (!f.file)
+		return -EBADF;
+
+	ret = -EOPNOTSUPP;
+	if (f.file->f_op != &io_uring_fops)
+		goto out_fput;
+
+	ret = -ENXIO;
+	ctx = f.file->private_data;
+	if (!percpu_ref_tryget(&ctx->refs))
+		goto out_fput;
+
+	ret = 0;
+	if (to_submit) {
+		to_submit = min(to_submit, ctx->sq_entries);
+
+		mutex_lock(&ctx->uring_lock);
+		submitted = io_ring_submit(ctx, to_submit);
+		mutex_unlock(&ctx->uring_lock);
+
+		if (submitted < 0)
+			goto out_ctx;
+	}
+	if (flags & IORING_ENTER_GETEVENTS) {
+		min_complete = min(min_complete, ctx->cq_entries);
+
+		/*
+		 * The application could have included the 'to_submit' count
+		 * in how many events it wanted to wait for. If we failed to
+		 * submit the desired count, we may need to adjust the number
+		 * of events to poll/wait for.
+		 */
+		if (submitted < to_submit)
+			min_complete = min_t(unsigned, submitted, min_complete);
+
+		ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
+	}
+
+out_ctx:
+	io_ring_drop_ctx_refs(ctx, 1);
+out_fput:
+	fdput(f);
+	return submitted ? submitted : ret;
+}
+
+static const struct file_operations io_uring_fops = {
+	.release	= io_uring_release,
+	.mmap		= io_uring_mmap,
+	.poll		= io_uring_poll,
+	.fasync		= io_uring_fasync,
+};
+
+static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
+				  struct io_uring_params *p)
+{
+	struct io_sq_ring *sq_ring;
+	struct io_cq_ring *cq_ring;
+	size_t size;
+
+	sq_ring = io_mem_alloc(struct_size(sq_ring, array, p->sq_entries));
+	if (!sq_ring)
+		return -ENOMEM;
+
+	ctx->sq_ring = sq_ring;
+	sq_ring->ring_mask = p->sq_entries - 1;
+	sq_ring->ring_entries = p->sq_entries;
+	ctx->sq_mask = sq_ring->ring_mask;
+	ctx->sq_entries = sq_ring->ring_entries;
+
+	size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
+	if (size == SIZE_MAX)
+		return -EOVERFLOW;
+
+	ctx->sq_sqes = io_mem_alloc(size);
+	if (!ctx->sq_sqes) {
+		io_mem_free(ctx->sq_ring);
+		return -ENOMEM;
+	}
+
+	cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries));
+	if (!cq_ring) {
+		io_mem_free(ctx->sq_ring);
+		io_mem_free(ctx->sq_sqes);
+		return -ENOMEM;
+	}
+
+	ctx->cq_ring = cq_ring;
+	cq_ring->ring_mask = p->cq_entries - 1;
+	cq_ring->ring_entries = p->cq_entries;
+	ctx->cq_mask = cq_ring->ring_mask;
+	ctx->cq_entries = cq_ring->ring_entries;
+	return 0;
+}
+
+/*
+ * Allocate an anonymous fd, this is what constitutes the application
+ * visible backing of an io_uring instance. The application mmaps this
+ * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
+ * we have to tie this fd to a socket for file garbage collection purposes.
+ */
+static int io_uring_get_fd(struct io_ring_ctx *ctx)
+{
+	struct file *file;
+	int ret;
+
+#if defined(CONFIG_UNIX)
+	ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
+				&ctx->ring_sock);
+	if (ret)
+		return ret;
+#endif
+
+	ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+	if (ret < 0)
+		goto err;
+
+	file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
+					O_RDWR | O_CLOEXEC);
+	if (IS_ERR(file)) {
+		put_unused_fd(ret);
+		ret = PTR_ERR(file);
+		goto err;
+	}
+
+#if defined(CONFIG_UNIX)
+	ctx->ring_sock->file = file;
+#endif
+	fd_install(ret, file);
+	return ret;
+err:
+#if defined(CONFIG_UNIX)
+	sock_release(ctx->ring_sock);
+	ctx->ring_sock = NULL;
+#endif
+	return ret;
+}
+
+static int io_uring_create(unsigned entries, struct io_uring_params *p)
+{
+	struct user_struct *user = NULL;
+	struct io_ring_ctx *ctx;
+	bool account_mem;
+	int ret;
+
+	if (!entries || entries > IORING_MAX_ENTRIES)
+		return -EINVAL;
+
+	/*
+	 * Use twice as many entries for the CQ ring. It's possible for the
+	 * application to drive a higher depth than the size of the SQ ring,
+	 * since the sqes are only used at submission time. This allows for
+	 * some flexibility in overcommitting a bit.
+	 */
+	p->sq_entries = roundup_pow_of_two(entries);
+	p->cq_entries = 2 * p->sq_entries;
+
+	user = get_uid(current_user());
+	account_mem = !capable(CAP_IPC_LOCK);
+
+	if (account_mem) {
+		ret = io_account_mem(user,
+				ring_pages(p->sq_entries, p->cq_entries));
+		if (ret) {
+			free_uid(user);
+			return ret;
+		}
+	}
+
+	ctx = io_ring_ctx_alloc(p);
+	if (!ctx) {
+		if (account_mem)
+			io_unaccount_mem(user, ring_pages(p->sq_entries,
+								p->cq_entries));
+		free_uid(user);
+		return -ENOMEM;
+	}
+	ctx->compat = in_compat_syscall();
+	ctx->account_mem = account_mem;
+	ctx->user = user;
+
+	ret = io_allocate_scq_urings(ctx, p);
+	if (ret)
+		goto err;
+
+	ret = io_sq_offload_start(ctx);
+	if (ret)
+		goto err;
+
+	ret = io_uring_get_fd(ctx);
+	if (ret < 0)
+		goto err;
+
+	memset(&p->sq_off, 0, sizeof(p->sq_off));
+	p->sq_off.head = offsetof(struct io_sq_ring, r.head);
+	p->sq_off.tail = offsetof(struct io_sq_ring, r.tail);
+	p->sq_off.ring_mask = offsetof(struct io_sq_ring, ring_mask);
+	p->sq_off.ring_entries = offsetof(struct io_sq_ring, ring_entries);
+	p->sq_off.flags = offsetof(struct io_sq_ring, flags);
+	p->sq_off.dropped = offsetof(struct io_sq_ring, dropped);
+	p->sq_off.array = offsetof(struct io_sq_ring, array);
+
+	memset(&p->cq_off, 0, sizeof(p->cq_off));
+	p->cq_off.head = offsetof(struct io_cq_ring, r.head);
+	p->cq_off.tail = offsetof(struct io_cq_ring, r.tail);
+	p->cq_off.ring_mask = offsetof(struct io_cq_ring, ring_mask);
+	p->cq_off.ring_entries = offsetof(struct io_cq_ring, ring_entries);
+	p->cq_off.overflow = offsetof(struct io_cq_ring, overflow);
+	p->cq_off.cqes = offsetof(struct io_cq_ring, cqes);
+	return ret;
+err:
+	io_ring_ctx_wait_and_kill(ctx);
+	return ret;
+}
+
+/*
+ * Sets up an aio uring context, and returns the fd. Applications asks for a
+ * ring size, we return the actual sq/cq ring sizes (among other things) in the
+ * params structure passed in.
+ */
+static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
+{
+	struct io_uring_params p;
+	long ret;
+	int i;
+
+	if (copy_from_user(&p, params, sizeof(p)))
+		return -EFAULT;
+	for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
+		if (p.resv[i])
+			return -EINVAL;
+	}
+
+	if (p.flags)
+		return -EINVAL;
+
+	ret = io_uring_create(entries, &p);
+	if (ret < 0)
+		return ret;
+
+	if (copy_to_user(params, &p, sizeof(p)))
+		return -EFAULT;
+
+	return ret;
+}
+
+SYSCALL_DEFINE2(io_uring_setup, u32, entries,
+		struct io_uring_params __user *, params)
+{
+	return io_uring_setup(entries, params);
+}
+
+static int __init io_uring_init(void)
+{
+	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+	return 0;
+};
+__initcall(io_uring_init);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index dedcc2e9265c..61aa210f0c2b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3517,4 +3517,13 @@ extern void inode_nohighmem(struct inode *inode);
 extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len,
 		       int advice);
 
+#if defined(CONFIG_IO_URING)
+extern struct sock *io_uring_get_socket(struct file *file);
+#else
+static inline struct sock *io_uring_get_socket(struct file *file)
+{
+	return NULL;
+}
+#endif
+
 #endif /* _LINUX_FS_H */
diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h
index 39ad98c09c58..c7b5f86b91a1 100644
--- a/include/linux/sched/user.h
+++ b/include/linux/sched/user.h
@@ -40,7 +40,7 @@ struct user_struct {
 	kuid_t uid;
 
 #if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) || \
-    defined(CONFIG_NET)
+    defined(CONFIG_NET) || defined(CONFIG_IO_URING)
 	atomic_long_t locked_vm;
 #endif
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 257cccba3062..3072dbaa7869 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -69,6 +69,7 @@ struct file_handle;
 struct sigaltstack;
 struct rseq;
 union bpf_attr;
+struct io_uring_params;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -309,6 +310,11 @@ asmlinkage long sys_io_pgetevents_time32(aio_context_t ctx_id,
 				struct io_event __user *events,
 				struct old_timespec32 __user *timeout,
 				const struct __aio_sigset *sig);
+asmlinkage long sys_io_uring_setup(u32 entries,
+				struct io_uring_params __user *p);
+asmlinkage long sys_io_uring_enter(unsigned int fd, u32 to_submit,
+				u32 min_complete, u32 flags,
+				const sigset_t __user *sig, size_t sigsz);
 
 /* fs/xattr.c */
 asmlinkage long sys_setxattr(const char __user *path, const char __user *name,
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index d90127298f12..87871e7b7ea7 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -740,9 +740,13 @@ __SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents)
 __SYSCALL(__NR_rseq, sys_rseq)
 #define __NR_kexec_file_load 294
 __SYSCALL(__NR_kexec_file_load,     sys_kexec_file_load)
+#define __NR_io_uring_setup 425
+__SYSCALL(__NR_io_uring_setup, sys_io_uring_setup)
+#define __NR_io_uring_enter 426
+__SYSCALL(__NR_io_uring_enter, sys_io_uring_enter)
 
 #undef __NR_syscalls
-#define __NR_syscalls 295
+#define __NR_syscalls 427
 
 /*
  * 32 bit systems traditionally used different
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
new file mode 100644
index 000000000000..ac692823d6f4
--- /dev/null
+++ b/include/uapi/linux/io_uring.h
@@ -0,0 +1,95 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Header file for the io_uring interface.
+ *
+ * Copyright (C) 2019 Jens Axboe
+ * Copyright (C) 2019 Christoph Hellwig
+ */
+#ifndef LINUX_IO_URING_H
+#define LINUX_IO_URING_H
+
+#include <linux/fs.h>
+#include <linux/types.h>
+
+/*
+ * IO submission data structure (Submission Queue Entry)
+ */
+struct io_uring_sqe {
+	__u8	opcode;		/* type of operation for this sqe */
+	__u8	flags;		/* as of now unused */
+	__u16	ioprio;		/* ioprio for the request */
+	__s32	fd;		/* file descriptor to do IO on */
+	__u64	off;		/* offset into file */
+	__u64	addr;		/* pointer to buffer or iovecs */
+	__u32	len;		/* buffer size or number of iovecs */
+	union {
+		__kernel_rwf_t	rw_flags;
+		__u32		__resv;
+	};
+	__u64	user_data;	/* data to be passed back at completion time */
+	__u64	__pad2[3];
+};
+
+#define IORING_OP_NOP		0
+#define IORING_OP_READV		1
+#define IORING_OP_WRITEV	2
+
+/*
+ * IO completion data structure (Completion Queue Entry)
+ */
+struct io_uring_cqe {
+	__u64	user_data;	/* sqe->data submission passed back */
+	__s32	res;		/* result code for this event */
+	__u32	flags;
+};
+
+/*
+ * Magic offsets for the application to mmap the data it needs
+ */
+#define IORING_OFF_SQ_RING		0ULL
+#define IORING_OFF_CQ_RING		0x8000000ULL
+#define IORING_OFF_SQES			0x10000000ULL
+
+/*
+ * Filled with the offset for mmap(2)
+ */
+struct io_sqring_offsets {
+	__u32 head;
+	__u32 tail;
+	__u32 ring_mask;
+	__u32 ring_entries;
+	__u32 flags;
+	__u32 dropped;
+	__u32 array;
+	__u32 resv1;
+	__u64 resv2;
+};
+
+struct io_cqring_offsets {
+	__u32 head;
+	__u32 tail;
+	__u32 ring_mask;
+	__u32 ring_entries;
+	__u32 overflow;
+	__u32 cqes;
+	__u64 resv[2];
+};
+
+/*
+ * io_uring_enter(2) flags
+ */
+#define IORING_ENTER_GETEVENTS	(1U << 0)
+
+/*
+ * Passed in for io_uring_setup(2). Copied back with updated info on success
+ */
+struct io_uring_params {
+	__u32 sq_entries;
+	__u32 cq_entries;
+	__u32 flags;
+	__u32 resv[7];
+	struct io_sqring_offsets sq_off;
+	struct io_cqring_offsets cq_off;
+};
+
+#endif
diff --git a/init/Kconfig b/init/Kconfig
index c9386a365eea..53b54214a36e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1414,6 +1414,15 @@ config AIO
 	  by some high performance threaded applications. Disabling
 	  this option saves about 7k.
 
+config IO_URING
+	bool "Enable IO uring support" if EXPERT
+	select ANON_INODES
+	default y
+	help
+	  This option enables support for the io_uring interface, enabling
+	  applications to submit and complete IO through submission and
+	  completion rings that are shared between the kernel and application.
+
 config ADVISE_SYSCALLS
 	bool "Enable madvise/fadvise syscalls" if EXPERT
 	default y
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index ab9d0e3c6d50..ee5e523564bb 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -46,6 +46,8 @@ COND_SYSCALL(io_getevents);
 COND_SYSCALL(io_pgetevents);
 COND_SYSCALL_COMPAT(io_getevents);
 COND_SYSCALL_COMPAT(io_pgetevents);
+COND_SYSCALL(io_uring_setup);
+COND_SYSCALL(io_uring_enter);
 
 /* fs/xattr.c */
 
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index c36757e72844..f81854d74c7d 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -108,6 +108,9 @@ struct sock *unix_get_socket(struct file *filp)
 		/* PF_UNIX ? */
 		if (s && sock->ops && sock->ops->family == PF_UNIX)
 			u_sock = s;
+	} else {
+		/* Could be an io_uring instance */
+		u_sock = io_uring_get_socket(filp);
 	}
 	return u_sock;
 }
-- 
cgit v1.2.3-59-g8ed1b


From c992fe2925d776be066d9f6cc13f9ea11d78b657 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 11 Jan 2019 09:43:02 -0700
Subject: io_uring: add fsync support

Add a new fsync opcode, which either syncs a range if one is passed,
or the whole file if the offset and length fields are both cleared
to zero.  A flag is provided to use fdatasync semantics, that is only
force out metadata which is required to retrieve the file data, but
not others like metadata.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 54 +++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/io_uring.h |  8 ++++++-
 2 files changed, 61 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index f68052290426..745413d92712 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -24,6 +24,7 @@
  * data that the application could potentially modify, it remains stable.
  *
  * Copyright (C) 2018-2019 Jens Axboe
+ * Copyright (c) 2018-2019 Christoph Hellwig
  */
 #include <linux/kernel.h>
 #include <linux/init.h>
@@ -557,6 +558,56 @@ static int io_nop(struct io_kiocb *req, u64 user_data)
 	return 0;
 }
 
+static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	int fd;
+
+	/* Prep already done */
+	if (req->rw.ki_filp)
+		return 0;
+
+	if (unlikely(sqe->addr || sqe->ioprio))
+		return -EINVAL;
+
+	fd = READ_ONCE(sqe->fd);
+	req->rw.ki_filp = fget(fd);
+	if (unlikely(!req->rw.ki_filp))
+		return -EBADF;
+
+	return 0;
+}
+
+static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+		    bool force_nonblock)
+{
+	loff_t sqe_off = READ_ONCE(sqe->off);
+	loff_t sqe_len = READ_ONCE(sqe->len);
+	loff_t end = sqe_off + sqe_len;
+	unsigned fsync_flags;
+	int ret;
+
+	fsync_flags = READ_ONCE(sqe->fsync_flags);
+	if (unlikely(fsync_flags & ~IORING_FSYNC_DATASYNC))
+		return -EINVAL;
+
+	ret = io_prep_fsync(req, sqe);
+	if (ret)
+		return ret;
+
+	/* fsync always requires a blocking context */
+	if (force_nonblock)
+		return -EAGAIN;
+
+	ret = vfs_fsync_range(req->rw.ki_filp, sqe_off,
+				end > 0 ? end : LLONG_MAX,
+				fsync_flags & IORING_FSYNC_DATASYNC);
+
+	fput(req->rw.ki_filp);
+	io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
+	io_free_req(req);
+	return 0;
+}
+
 static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 			   const struct sqe_submit *s, bool force_nonblock)
 {
@@ -578,6 +629,9 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	case IORING_OP_WRITEV:
 		ret = io_write(req, s, force_nonblock);
 		break;
+	case IORING_OP_FSYNC:
+		ret = io_fsync(req, s->sqe, force_nonblock);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index ac692823d6f4..4589d56d0b68 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -24,7 +24,7 @@ struct io_uring_sqe {
 	__u32	len;		/* buffer size or number of iovecs */
 	union {
 		__kernel_rwf_t	rw_flags;
-		__u32		__resv;
+		__u32		fsync_flags;
 	};
 	__u64	user_data;	/* data to be passed back at completion time */
 	__u64	__pad2[3];
@@ -33,6 +33,12 @@ struct io_uring_sqe {
 #define IORING_OP_NOP		0
 #define IORING_OP_READV		1
 #define IORING_OP_WRITEV	2
+#define IORING_OP_FSYNC		3
+
+/*
+ * sqe->fsync_flags
+ */
+#define IORING_FSYNC_DATASYNC	(1U << 0)
 
 /*
  * IO completion data structure (Completion Queue Entry)
-- 
cgit v1.2.3-59-g8ed1b


From def596e9557c91d9846fc4d84d26f2c564644416 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 9 Jan 2019 08:59:42 -0700
Subject: io_uring: support for IO polling

Add support for a polled io_uring instance. When a read or write is
submitted to a polled io_uring, the application must poll for
completions on the CQ ring through io_uring_enter(2). Polled IO may not
generate IRQ completions, hence they need to be actively found by the
application itself.

To use polling, io_uring_setup() must be used with the
IORING_SETUP_IOPOLL flag being set. It is illegal to mix and match
polled and non-polled IO on an io_uring.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 275 ++++++++++++++++++++++++++++++++++++++++--
 include/uapi/linux/io_uring.h |   5 +
 2 files changed, 271 insertions(+), 9 deletions(-)

(limited to 'include/uapi')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 745413d92712..578797a4f318 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -124,6 +124,14 @@ struct io_ring_ctx {
 
 	struct {
 		spinlock_t		completion_lock;
+		bool			poll_multi_file;
+		/*
+		 * ->poll_list is protected by the ctx->uring_lock for
+		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
+		 * For SQPOLL, only the single threaded io_sq_thread() will
+		 * manipulate the list, hence no extra locking is needed there.
+		 */
+		struct list_head	poll_list;
 	} ____cacheline_aligned_in_smp;
 
 #if defined(CONFIG_UNIX)
@@ -135,6 +143,7 @@ struct sqe_submit {
 	const struct io_uring_sqe	*sqe;
 	unsigned short			index;
 	bool				has_user;
+	bool				needs_lock;
 };
 
 struct io_kiocb {
@@ -146,12 +155,15 @@ struct io_kiocb {
 	struct list_head	list;
 	unsigned int		flags;
 #define REQ_F_FORCE_NONBLOCK	1	/* inline submission attempt */
+#define REQ_F_IOPOLL_COMPLETED	2	/* polled IO has completed */
 	u64			user_data;
+	u64			error;
 
 	struct work_struct	work;
 };
 
 #define IO_PLUG_THRESHOLD		2
+#define IO_IOPOLL_BATCH			8
 
 static struct kmem_cache *req_cachep;
 
@@ -196,6 +208,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	mutex_init(&ctx->uring_lock);
 	init_waitqueue_head(&ctx->wait);
 	spin_lock_init(&ctx->completion_lock);
+	INIT_LIST_HEAD(&ctx->poll_list);
 	return ctx;
 }
 
@@ -297,12 +310,153 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx)
 	return NULL;
 }
 
+static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
+{
+	if (*nr) {
+		kmem_cache_free_bulk(req_cachep, *nr, reqs);
+		io_ring_drop_ctx_refs(ctx, *nr);
+		*nr = 0;
+	}
+}
+
 static void io_free_req(struct io_kiocb *req)
 {
 	io_ring_drop_ctx_refs(req->ctx, 1);
 	kmem_cache_free(req_cachep, req);
 }
 
+/*
+ * Find and free completed poll iocbs
+ */
+static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
+			       struct list_head *done)
+{
+	void *reqs[IO_IOPOLL_BATCH];
+	struct io_kiocb *req;
+	int to_free = 0;
+
+	while (!list_empty(done)) {
+		req = list_first_entry(done, struct io_kiocb, list);
+		list_del(&req->list);
+
+		io_cqring_fill_event(ctx, req->user_data, req->error, 0);
+
+		reqs[to_free++] = req;
+		(*nr_events)++;
+
+		fput(req->rw.ki_filp);
+		if (to_free == ARRAY_SIZE(reqs))
+			io_free_req_many(ctx, reqs, &to_free);
+	}
+	io_commit_cqring(ctx);
+
+	io_free_req_many(ctx, reqs, &to_free);
+}
+
+static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
+			long min)
+{
+	struct io_kiocb *req, *tmp;
+	LIST_HEAD(done);
+	bool spin;
+	int ret;
+
+	/*
+	 * Only spin for completions if we don't have multiple devices hanging
+	 * off our complete list, and we're under the requested amount.
+	 */
+	spin = !ctx->poll_multi_file && *nr_events < min;
+
+	ret = 0;
+	list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
+		struct kiocb *kiocb = &req->rw;
+
+		/*
+		 * Move completed entries to our local list. If we find a
+		 * request that requires polling, break out and complete
+		 * the done list first, if we have entries there.
+		 */
+		if (req->flags & REQ_F_IOPOLL_COMPLETED) {
+			list_move_tail(&req->list, &done);
+			continue;
+		}
+		if (!list_empty(&done))
+			break;
+
+		ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
+		if (ret < 0)
+			break;
+
+		if (ret && spin)
+			spin = false;
+		ret = 0;
+	}
+
+	if (!list_empty(&done))
+		io_iopoll_complete(ctx, nr_events, &done);
+
+	return ret;
+}
+
+/*
+ * Poll for a mininum of 'min' events. Note that if min == 0 we consider that a
+ * non-spinning poll check - we'll still enter the driver poll loop, but only
+ * as a non-spinning completion check.
+ */
+static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
+				long min)
+{
+	while (!list_empty(&ctx->poll_list)) {
+		int ret;
+
+		ret = io_do_iopoll(ctx, nr_events, min);
+		if (ret < 0)
+			return ret;
+		if (!min || *nr_events >= min)
+			return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * We can't just wait for polled events to come to us, we have to actively
+ * find and complete them.
+ */
+static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
+{
+	if (!(ctx->flags & IORING_SETUP_IOPOLL))
+		return;
+
+	mutex_lock(&ctx->uring_lock);
+	while (!list_empty(&ctx->poll_list)) {
+		unsigned int nr_events = 0;
+
+		io_iopoll_getevents(ctx, &nr_events, 1);
+	}
+	mutex_unlock(&ctx->uring_lock);
+}
+
+static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
+			   long min)
+{
+	int ret = 0;
+
+	do {
+		int tmin = 0;
+
+		if (*nr_events < min)
+			tmin = min - *nr_events;
+
+		ret = io_iopoll_getevents(ctx, nr_events, tmin);
+		if (ret <= 0)
+			break;
+		ret = 0;
+	} while (min && !*nr_events && !need_resched());
+
+	return ret;
+}
+
 static void kiocb_end_write(struct kiocb *kiocb)
 {
 	if (kiocb->ki_flags & IOCB_WRITE) {
@@ -329,6 +483,53 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
 	io_free_req(req);
 }
 
+static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
+{
+	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
+
+	kiocb_end_write(kiocb);
+
+	req->error = res;
+	if (res != -EAGAIN)
+		req->flags |= REQ_F_IOPOLL_COMPLETED;
+}
+
+/*
+ * After the iocb has been issued, it's safe to be found on the poll list.
+ * Adding the kiocb to the list AFTER submission ensures that we don't
+ * find it from a io_iopoll_getevents() thread before the issuer is done
+ * accessing the kiocb cookie.
+ */
+static void io_iopoll_req_issued(struct io_kiocb *req)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+
+	/*
+	 * Track whether we have multiple files in our lists. This will impact
+	 * how we do polling eventually, not spinning if we're on potentially
+	 * different devices.
+	 */
+	if (list_empty(&ctx->poll_list)) {
+		ctx->poll_multi_file = false;
+	} else if (!ctx->poll_multi_file) {
+		struct io_kiocb *list_req;
+
+		list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
+						list);
+		if (list_req->rw.ki_filp != req->rw.ki_filp)
+			ctx->poll_multi_file = true;
+	}
+
+	/*
+	 * For fast devices, IO may have already completed. If it has, add
+	 * it to the front so we find it first.
+	 */
+	if (req->flags & REQ_F_IOPOLL_COMPLETED)
+		list_add(&req->list, &ctx->poll_list);
+	else
+		list_add_tail(&req->list, &ctx->poll_list);
+}
+
 /*
  * If we tracked the file through the SCM inflight mechanism, we could support
  * any file. For now, just ensure that anything potentially problematic is done
@@ -349,6 +550,7 @@ static bool io_file_supports_async(struct file *file)
 static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		      bool force_nonblock)
 {
+	struct io_ring_ctx *ctx = req->ctx;
 	struct kiocb *kiocb = &req->rw;
 	unsigned ioprio;
 	int fd, ret;
@@ -384,12 +586,22 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		kiocb->ki_flags |= IOCB_NOWAIT;
 		req->flags |= REQ_F_FORCE_NONBLOCK;
 	}
-	if (kiocb->ki_flags & IOCB_HIPRI) {
-		ret = -EINVAL;
-		goto out_fput;
-	}
+	if (ctx->flags & IORING_SETUP_IOPOLL) {
+		ret = -EOPNOTSUPP;
+		if (!(kiocb->ki_flags & IOCB_DIRECT) ||
+		    !kiocb->ki_filp->f_op->iopoll)
+			goto out_fput;
 
-	kiocb->ki_complete = io_complete_rw;
+		req->error = 0;
+		kiocb->ki_flags |= IOCB_HIPRI;
+		kiocb->ki_complete = io_complete_rw_iopoll;
+	} else {
+		if (kiocb->ki_flags & IOCB_HIPRI) {
+			ret = -EINVAL;
+			goto out_fput;
+		}
+		kiocb->ki_complete = io_complete_rw;
+	}
 	return 0;
 out_fput:
 	fput(kiocb->ki_filp);
@@ -543,6 +755,9 @@ static int io_nop(struct io_kiocb *req, u64 user_data)
 	struct io_ring_ctx *ctx = req->ctx;
 	long err = 0;
 
+	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
+
 	/*
 	 * Twilight zone - it's possible that someone issued an opcode that
 	 * has a file attached, then got -EAGAIN on submission, and changed
@@ -566,6 +781,8 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (req->rw.ki_filp)
 		return 0;
 
+	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
 	if (unlikely(sqe->addr || sqe->ioprio))
 		return -EINVAL;
 
@@ -637,7 +854,22 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 		break;
 	}
 
-	return ret;
+	if (ret)
+		return ret;
+
+	if (ctx->flags & IORING_SETUP_IOPOLL) {
+		if (req->error == -EAGAIN)
+			return -EAGAIN;
+
+		/* workqueue context doesn't hold uring_lock, grab it now */
+		if (s->needs_lock)
+			mutex_lock(&ctx->uring_lock);
+		io_iopoll_req_issued(req);
+		if (s->needs_lock)
+			mutex_unlock(&ctx->uring_lock);
+	}
+
+	return 0;
 }
 
 static void io_sq_wq_submit_work(struct work_struct *work)
@@ -661,8 +893,19 @@ static void io_sq_wq_submit_work(struct work_struct *work)
 	use_mm(ctx->sqo_mm);
 	set_fs(USER_DS);
 	s->has_user = true;
+	s->needs_lock = true;
 
-	ret = __io_submit_sqe(ctx, req, s, false);
+	do {
+		ret = __io_submit_sqe(ctx, req, s, false);
+		/*
+		 * We can get EAGAIN for polled IO even though we're forcing
+		 * a sync submission from here, since we can't wait for
+		 * request slots on the block side.
+		 */
+		if (ret != -EAGAIN)
+			break;
+		cond_resched();
+	} while (1);
 
 	set_fs(old_fs);
 	unuse_mm(ctx->sqo_mm);
@@ -799,6 +1042,8 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
 			break;
 
 		s.has_user = true;
+		s.needs_lock = false;
+
 		ret = io_submit_sqe(ctx, &s);
 		if (ret) {
 			io_drop_sqring(ctx);
@@ -947,6 +1192,9 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 		destroy_workqueue(ctx->sqo_wq);
 	if (ctx->sqo_mm)
 		mmdrop(ctx->sqo_mm);
+
+	io_iopoll_reap_events(ctx);
+
 #if defined(CONFIG_UNIX)
 	if (ctx->ring_sock)
 		sock_release(ctx->ring_sock);
@@ -993,6 +1241,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 	percpu_ref_kill(&ctx->refs);
 	mutex_unlock(&ctx->uring_lock);
 
+	io_iopoll_reap_events(ctx);
 	wait_for_completion(&ctx->ctx_done);
 	io_ring_ctx_free(ctx);
 }
@@ -1074,6 +1323,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 			goto out_ctx;
 	}
 	if (flags & IORING_ENTER_GETEVENTS) {
+		unsigned nr_events = 0;
+
 		min_complete = min(min_complete, ctx->cq_entries);
 
 		/*
@@ -1085,7 +1336,13 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 		if (submitted < to_submit)
 			min_complete = min_t(unsigned, submitted, min_complete);
 
-		ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
+		if (ctx->flags & IORING_SETUP_IOPOLL) {
+			mutex_lock(&ctx->uring_lock);
+			ret = io_iopoll_check(ctx, &nr_events, min_complete);
+			mutex_unlock(&ctx->uring_lock);
+		} else {
+			ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
+		}
 	}
 
 out_ctx:
@@ -1282,7 +1539,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
 			return -EINVAL;
 	}
 
-	if (p.flags)
+	if (p.flags & ~IORING_SETUP_IOPOLL)
 		return -EINVAL;
 
 	ret = io_uring_create(entries, &p);
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 4589d56d0b68..5c457ea396e6 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -30,6 +30,11 @@ struct io_uring_sqe {
 	__u64	__pad2[3];
 };
 
+/*
+ * io_uring_setup() flags
+ */
+#define IORING_SETUP_IOPOLL	(1U << 0)	/* io_context is polled */
+
 #define IORING_OP_NOP		0
 #define IORING_OP_READV		1
 #define IORING_OP_WRITEV	2
-- 
cgit v1.2.3-59-g8ed1b


From edafccee56ff31678a091ddb7219aba9b28bc3cb Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 9 Jan 2019 09:16:05 -0700
Subject: io_uring: add support for pre-mapped user IO buffers

If we have fixed user buffers, we can map them into the kernel when we
setup the io_uring. That avoids the need to do get_user_pages() for
each and every IO.

To utilize this feature, the application must call io_uring_register()
after having setup an io_uring instance, passing in
IORING_REGISTER_BUFFERS as the opcode. The argument must be a pointer to
an iovec array, and the nr_args should contain how many iovecs the
application wishes to map.

If successful, these buffers are now mapped into the kernel, eligible
for IO. To use these fixed buffers, the application must use the
IORING_OP_READ_FIXED and IORING_OP_WRITE_FIXED opcodes, and then
set sqe->index to the desired buffer index. sqe->addr..sqe->addr+seq->len
must point to somewhere inside the indexed buffer.

The application may register buffers throughout the lifetime of the
io_uring instance. It can call io_uring_register() with
IORING_UNREGISTER_BUFFERS as the opcode to unregister the current set of
buffers, and then register a new set. The application need not
unregister buffers explicitly before shutting down the io_uring
instance.

It's perfectly valid to setup a larger buffer, and then sometimes only
use parts of it for an IO. As long as the range is within the originally
mapped region, it will work just fine.

For now, buffers must not be file backed. If file backed buffers are
passed in, the registration will fail with -1/EOPNOTSUPP. This
restriction may be relaxed in the future.

RLIMIT_MEMLOCK is used to check how much memory we can pin. A somewhat
arbitrary 1G per buffer size is also imposed.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 arch/x86/entry/syscalls/syscall_32.tbl |   1 +
 arch/x86/entry/syscalls/syscall_64.tbl |   1 +
 fs/io_uring.c                          | 374 +++++++++++++++++++++++++++++++--
 include/linux/syscalls.h               |   2 +
 include/uapi/asm-generic/unistd.h      |   4 +-
 include/uapi/linux/io_uring.h          |  13 +-
 kernel/sys_ni.c                        |   1 +
 7 files changed, 381 insertions(+), 15 deletions(-)

(limited to 'include/uapi')

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 481c126259e9..2eefd2a7c1ce 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -400,3 +400,4 @@
 386	i386	rseq			sys_rseq			__ia32_sys_rseq
 425	i386	io_uring_setup		sys_io_uring_setup		__ia32_sys_io_uring_setup
 426	i386	io_uring_enter		sys_io_uring_enter		__ia32_sys_io_uring_enter
+427	i386	io_uring_register	sys_io_uring_register		__ia32_sys_io_uring_register
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 6a32a430c8e0..65c026185e61 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -345,6 +345,7 @@
 334	common	rseq			__x64_sys_rseq
 425	common	io_uring_setup		__x64_sys_io_uring_setup
 426	common	io_uring_enter		__x64_sys_io_uring_enter
+427	common	io_uring_register	__x64_sys_io_uring_register
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 31f43ed894ba..c0c0f68568b5 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -45,6 +45,7 @@
 #include <linux/slab.h>
 #include <linux/workqueue.h>
 #include <linux/blkdev.h>
+#include <linux/bvec.h>
 #include <linux/net.h>
 #include <net/sock.h>
 #include <net/af_unix.h>
@@ -52,6 +53,8 @@
 #include <linux/sched/mm.h>
 #include <linux/uaccess.h>
 #include <linux/nospec.h>
+#include <linux/sizes.h>
+#include <linux/hugetlb.h>
 
 #include <uapi/linux/io_uring.h>
 
@@ -81,6 +84,13 @@ struct io_cq_ring {
 	struct io_uring_cqe	cqes[];
 };
 
+struct io_mapped_ubuf {
+	u64		ubuf;
+	size_t		len;
+	struct		bio_vec *bvec;
+	unsigned int	nr_bvecs;
+};
+
 struct io_ring_ctx {
 	struct {
 		struct percpu_ref	refs;
@@ -113,6 +123,10 @@ struct io_ring_ctx {
 		struct fasync_struct	*cq_fasync;
 	} ____cacheline_aligned_in_smp;
 
+	/* if used, fixed mapped user buffers */
+	unsigned		nr_user_bufs;
+	struct io_mapped_ubuf	*user_bufs;
+
 	struct user_struct	*user;
 
 	struct completion	ctx_done;
@@ -732,6 +746,46 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
 	}
 }
 
+static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
+			   const struct io_uring_sqe *sqe,
+			   struct iov_iter *iter)
+{
+	size_t len = READ_ONCE(sqe->len);
+	struct io_mapped_ubuf *imu;
+	unsigned index, buf_index;
+	size_t offset;
+	u64 buf_addr;
+
+	/* attempt to use fixed buffers without having provided iovecs */
+	if (unlikely(!ctx->user_bufs))
+		return -EFAULT;
+
+	buf_index = READ_ONCE(sqe->buf_index);
+	if (unlikely(buf_index >= ctx->nr_user_bufs))
+		return -EFAULT;
+
+	index = array_index_nospec(buf_index, ctx->nr_user_bufs);
+	imu = &ctx->user_bufs[index];
+	buf_addr = READ_ONCE(sqe->addr);
+
+	/* overflow */
+	if (buf_addr + len < buf_addr)
+		return -EFAULT;
+	/* not inside the mapped region */
+	if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
+		return -EFAULT;
+
+	/*
+	 * May not be a start of buffer, set size appropriately
+	 * and advance us to the beginning.
+	 */
+	offset = buf_addr - imu->ubuf;
+	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
+	if (offset)
+		iov_iter_advance(iter, offset);
+	return 0;
+}
+
 static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
 			   const struct sqe_submit *s, struct iovec **iovec,
 			   struct iov_iter *iter)
@@ -739,6 +793,23 @@ static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
 	const struct io_uring_sqe *sqe = s->sqe;
 	void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
 	size_t sqe_len = READ_ONCE(sqe->len);
+	u8 opcode;
+
+	/*
+	 * We're reading ->opcode for the second time, but the first read
+	 * doesn't care whether it's _FIXED or not, so it doesn't matter
+	 * whether ->opcode changes concurrently. The first read does care
+	 * about whether it is a READ or a WRITE, so we don't trust this read
+	 * for that purpose and instead let the caller pass in the read/write
+	 * flag.
+	 */
+	opcode = READ_ONCE(sqe->opcode);
+	if (opcode == IORING_OP_READ_FIXED ||
+	    opcode == IORING_OP_WRITE_FIXED) {
+		ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
+		*iovec = NULL;
+		return ret;
+	}
 
 	if (!s->has_user)
 		return -EFAULT;
@@ -886,7 +957,7 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
-	if (unlikely(sqe->addr || sqe->ioprio))
+	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
 		return -EINVAL;
 
 	fd = READ_ONCE(sqe->fd);
@@ -945,9 +1016,19 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 		ret = io_nop(req, req->user_data);
 		break;
 	case IORING_OP_READV:
+		if (unlikely(s->sqe->buf_index))
+			return -EINVAL;
 		ret = io_read(req, s, force_nonblock, state);
 		break;
 	case IORING_OP_WRITEV:
+		if (unlikely(s->sqe->buf_index))
+			return -EINVAL;
+		ret = io_write(req, s, force_nonblock, state);
+		break;
+	case IORING_OP_READ_FIXED:
+		ret = io_read(req, s, force_nonblock, state);
+		break;
+	case IORING_OP_WRITE_FIXED:
 		ret = io_write(req, s, force_nonblock, state);
 		break;
 	case IORING_OP_FSYNC:
@@ -976,28 +1057,46 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	return 0;
 }
 
+static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
+{
+	u8 opcode = READ_ONCE(sqe->opcode);
+
+	return !(opcode == IORING_OP_READ_FIXED ||
+		 opcode == IORING_OP_WRITE_FIXED);
+}
+
 static void io_sq_wq_submit_work(struct work_struct *work)
 {
 	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 	struct sqe_submit *s = &req->submit;
 	const struct io_uring_sqe *sqe = s->sqe;
 	struct io_ring_ctx *ctx = req->ctx;
-	mm_segment_t old_fs = get_fs();
+	mm_segment_t old_fs;
+	bool needs_user;
 	int ret;
 
 	 /* Ensure we clear previously set forced non-block flag */
 	req->flags &= ~REQ_F_FORCE_NONBLOCK;
 	req->rw.ki_flags &= ~IOCB_NOWAIT;
 
-	if (!mmget_not_zero(ctx->sqo_mm)) {
-		ret = -EFAULT;
-		goto err;
-	}
-
-	use_mm(ctx->sqo_mm);
-	set_fs(USER_DS);
-	s->has_user = true;
 	s->needs_lock = true;
+	s->has_user = false;
+
+	/*
+	 * If we're doing IO to fixed buffers, we don't need to get/set
+	 * user context
+	 */
+	needs_user = io_sqe_needs_user(s->sqe);
+	if (needs_user) {
+		if (!mmget_not_zero(ctx->sqo_mm)) {
+			ret = -EFAULT;
+			goto err;
+		}
+		use_mm(ctx->sqo_mm);
+		old_fs = get_fs();
+		set_fs(USER_DS);
+		s->has_user = true;
+	}
 
 	do {
 		ret = __io_submit_sqe(ctx, req, s, false, NULL);
@@ -1011,9 +1110,11 @@ static void io_sq_wq_submit_work(struct work_struct *work)
 		cond_resched();
 	} while (1);
 
-	set_fs(old_fs);
-	unuse_mm(ctx->sqo_mm);
-	mmput(ctx->sqo_mm);
+	if (needs_user) {
+		set_fs(old_fs);
+		unuse_mm(ctx->sqo_mm);
+		mmput(ctx->sqo_mm);
+	}
 err:
 	if (ret) {
 		io_cqring_add_event(ctx, sqe->user_data, ret, 0);
@@ -1317,6 +1418,198 @@ static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
 	return (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
 }
 
+static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
+{
+	int i, j;
+
+	if (!ctx->user_bufs)
+		return -ENXIO;
+
+	for (i = 0; i < ctx->nr_user_bufs; i++) {
+		struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
+
+		for (j = 0; j < imu->nr_bvecs; j++)
+			put_page(imu->bvec[j].bv_page);
+
+		if (ctx->account_mem)
+			io_unaccount_mem(ctx->user, imu->nr_bvecs);
+		kfree(imu->bvec);
+		imu->nr_bvecs = 0;
+	}
+
+	kfree(ctx->user_bufs);
+	ctx->user_bufs = NULL;
+	ctx->nr_user_bufs = 0;
+	return 0;
+}
+
+static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
+		       void __user *arg, unsigned index)
+{
+	struct iovec __user *src;
+
+#ifdef CONFIG_COMPAT
+	if (ctx->compat) {
+		struct compat_iovec __user *ciovs;
+		struct compat_iovec ciov;
+
+		ciovs = (struct compat_iovec __user *) arg;
+		if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
+			return -EFAULT;
+
+		dst->iov_base = (void __user *) (unsigned long) ciov.iov_base;
+		dst->iov_len = ciov.iov_len;
+		return 0;
+	}
+#endif
+	src = (struct iovec __user *) arg;
+	if (copy_from_user(dst, &src[index], sizeof(*dst)))
+		return -EFAULT;
+	return 0;
+}
+
+static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
+				  unsigned nr_args)
+{
+	struct vm_area_struct **vmas = NULL;
+	struct page **pages = NULL;
+	int i, j, got_pages = 0;
+	int ret = -EINVAL;
+
+	if (ctx->user_bufs)
+		return -EBUSY;
+	if (!nr_args || nr_args > UIO_MAXIOV)
+		return -EINVAL;
+
+	ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
+					GFP_KERNEL);
+	if (!ctx->user_bufs)
+		return -ENOMEM;
+
+	for (i = 0; i < nr_args; i++) {
+		struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
+		unsigned long off, start, end, ubuf;
+		int pret, nr_pages;
+		struct iovec iov;
+		size_t size;
+
+		ret = io_copy_iov(ctx, &iov, arg, i);
+		if (ret)
+			break;
+
+		/*
+		 * Don't impose further limits on the size and buffer
+		 * constraints here, we'll -EINVAL later when IO is
+		 * submitted if they are wrong.
+		 */
+		ret = -EFAULT;
+		if (!iov.iov_base || !iov.iov_len)
+			goto err;
+
+		/* arbitrary limit, but we need something */
+		if (iov.iov_len > SZ_1G)
+			goto err;
+
+		ubuf = (unsigned long) iov.iov_base;
+		end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		start = ubuf >> PAGE_SHIFT;
+		nr_pages = end - start;
+
+		if (ctx->account_mem) {
+			ret = io_account_mem(ctx->user, nr_pages);
+			if (ret)
+				goto err;
+		}
+
+		ret = 0;
+		if (!pages || nr_pages > got_pages) {
+			kfree(vmas);
+			kfree(pages);
+			pages = kmalloc_array(nr_pages, sizeof(struct page *),
+						GFP_KERNEL);
+			vmas = kmalloc_array(nr_pages,
+					sizeof(struct vm_area_struct *),
+					GFP_KERNEL);
+			if (!pages || !vmas) {
+				ret = -ENOMEM;
+				if (ctx->account_mem)
+					io_unaccount_mem(ctx->user, nr_pages);
+				goto err;
+			}
+			got_pages = nr_pages;
+		}
+
+		imu->bvec = kmalloc_array(nr_pages, sizeof(struct bio_vec),
+						GFP_KERNEL);
+		ret = -ENOMEM;
+		if (!imu->bvec) {
+			if (ctx->account_mem)
+				io_unaccount_mem(ctx->user, nr_pages);
+			goto err;
+		}
+
+		ret = 0;
+		down_read(&current->mm->mmap_sem);
+		pret = get_user_pages_longterm(ubuf, nr_pages, FOLL_WRITE,
+						pages, vmas);
+		if (pret == nr_pages) {
+			/* don't support file backed memory */
+			for (j = 0; j < nr_pages; j++) {
+				struct vm_area_struct *vma = vmas[j];
+
+				if (vma->vm_file &&
+				    !is_file_hugepages(vma->vm_file)) {
+					ret = -EOPNOTSUPP;
+					break;
+				}
+			}
+		} else {
+			ret = pret < 0 ? pret : -EFAULT;
+		}
+		up_read(&current->mm->mmap_sem);
+		if (ret) {
+			/*
+			 * if we did partial map, or found file backed vmas,
+			 * release any pages we did get
+			 */
+			if (pret > 0) {
+				for (j = 0; j < pret; j++)
+					put_page(pages[j]);
+			}
+			if (ctx->account_mem)
+				io_unaccount_mem(ctx->user, nr_pages);
+			goto err;
+		}
+
+		off = ubuf & ~PAGE_MASK;
+		size = iov.iov_len;
+		for (j = 0; j < nr_pages; j++) {
+			size_t vec_len;
+
+			vec_len = min_t(size_t, size, PAGE_SIZE - off);
+			imu->bvec[j].bv_page = pages[j];
+			imu->bvec[j].bv_len = vec_len;
+			imu->bvec[j].bv_offset = off;
+			off = 0;
+			size -= vec_len;
+		}
+		/* store original address for later verification */
+		imu->ubuf = ubuf;
+		imu->len = iov.iov_len;
+		imu->nr_bvecs = nr_pages;
+
+		ctx->nr_user_bufs++;
+	}
+	kfree(pages);
+	kfree(vmas);
+	return 0;
+err:
+	kfree(pages);
+	kfree(vmas);
+	io_sqe_buffer_unregister(ctx);
+	return ret;
+}
+
 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 {
 	if (ctx->sqo_wq)
@@ -1325,6 +1618,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 		mmdrop(ctx->sqo_mm);
 
 	io_iopoll_reap_events(ctx);
+	io_sqe_buffer_unregister(ctx);
 
 #if defined(CONFIG_UNIX)
 	if (ctx->ring_sock)
@@ -1689,6 +1983,60 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries,
 	return io_uring_setup(entries, params);
 }
 
+static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
+			       void __user *arg, unsigned nr_args)
+{
+	int ret;
+
+	percpu_ref_kill(&ctx->refs);
+	wait_for_completion(&ctx->ctx_done);
+
+	switch (opcode) {
+	case IORING_REGISTER_BUFFERS:
+		ret = io_sqe_buffer_register(ctx, arg, nr_args);
+		break;
+	case IORING_UNREGISTER_BUFFERS:
+		ret = -EINVAL;
+		if (arg || nr_args)
+			break;
+		ret = io_sqe_buffer_unregister(ctx);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	/* bring the ctx back to life */
+	reinit_completion(&ctx->ctx_done);
+	percpu_ref_reinit(&ctx->refs);
+	return ret;
+}
+
+SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
+		void __user *, arg, unsigned int, nr_args)
+{
+	struct io_ring_ctx *ctx;
+	long ret = -EBADF;
+	struct fd f;
+
+	f = fdget(fd);
+	if (!f.file)
+		return -EBADF;
+
+	ret = -EOPNOTSUPP;
+	if (f.file->f_op != &io_uring_fops)
+		goto out_fput;
+
+	ctx = f.file->private_data;
+
+	mutex_lock(&ctx->uring_lock);
+	ret = __io_uring_register(ctx, opcode, arg, nr_args);
+	mutex_unlock(&ctx->uring_lock);
+out_fput:
+	fdput(f);
+	return ret;
+}
+
 static int __init io_uring_init(void)
 {
 	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 3072dbaa7869..3681c05ac538 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -315,6 +315,8 @@ asmlinkage long sys_io_uring_setup(u32 entries,
 asmlinkage long sys_io_uring_enter(unsigned int fd, u32 to_submit,
 				u32 min_complete, u32 flags,
 				const sigset_t __user *sig, size_t sigsz);
+asmlinkage long sys_io_uring_register(unsigned int fd, unsigned int op,
+				void __user *arg, unsigned int nr_args);
 
 /* fs/xattr.c */
 asmlinkage long sys_setxattr(const char __user *path, const char __user *name,
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 87871e7b7ea7..d346229a1eb0 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -744,9 +744,11 @@ __SYSCALL(__NR_kexec_file_load,     sys_kexec_file_load)
 __SYSCALL(__NR_io_uring_setup, sys_io_uring_setup)
 #define __NR_io_uring_enter 426
 __SYSCALL(__NR_io_uring_enter, sys_io_uring_enter)
+#define __NR_io_uring_register 427
+__SYSCALL(__NR_io_uring_register, sys_io_uring_register)
 
 #undef __NR_syscalls
-#define __NR_syscalls 427
+#define __NR_syscalls 428
 
 /*
  * 32 bit systems traditionally used different
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 5c457ea396e6..cf28f7a11f12 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -27,7 +27,10 @@ struct io_uring_sqe {
 		__u32		fsync_flags;
 	};
 	__u64	user_data;	/* data to be passed back at completion time */
-	__u64	__pad2[3];
+	union {
+		__u16	buf_index;	/* index into fixed buffers, if used */
+		__u64	__pad2[3];
+	};
 };
 
 /*
@@ -39,6 +42,8 @@ struct io_uring_sqe {
 #define IORING_OP_READV		1
 #define IORING_OP_WRITEV	2
 #define IORING_OP_FSYNC		3
+#define IORING_OP_READ_FIXED	4
+#define IORING_OP_WRITE_FIXED	5
 
 /*
  * sqe->fsync_flags
@@ -103,4 +108,10 @@ struct io_uring_params {
 	struct io_cqring_offsets cq_off;
 };
 
+/*
+ * io_uring_register(2) opcodes and arguments
+ */
+#define IORING_REGISTER_BUFFERS		0
+#define IORING_UNREGISTER_BUFFERS	1
+
 #endif
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index ee5e523564bb..1bb6604dc19f 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -48,6 +48,7 @@ COND_SYSCALL_COMPAT(io_getevents);
 COND_SYSCALL_COMPAT(io_pgetevents);
 COND_SYSCALL(io_uring_setup);
 COND_SYSCALL(io_uring_enter);
+COND_SYSCALL(io_uring_register);
 
 /* fs/xattr.c */
 
-- 
cgit v1.2.3-59-g8ed1b


From 6b06314c47e141031be043539900d80d2c7ba10f Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 10 Jan 2019 22:13:58 -0700
Subject: io_uring: add file set registration

We normally have to fget/fput for each IO we do on a file. Even with
the batching we do, the cost of the atomic inc/dec of the file usage
count adds up.

This adds IORING_REGISTER_FILES, and IORING_UNREGISTER_FILES opcodes
for the io_uring_register(2) system call. The arguments passed in must
be an array of __s32 holding file descriptors, and nr_args should hold
the number of file descriptors the application wishes to pin for the
duration of the io_uring instance (or until IORING_UNREGISTER_FILES is
called).

When used, the application must set IOSQE_FIXED_FILE in the sqe->flags
member. Then, instead of setting sqe->fd to the real fd, it sets sqe->fd
to the index in the array passed in to IORING_REGISTER_FILES.

Files are automatically unregistered when the io_uring instance is torn
down. An application need only unregister if it wishes to register a new
set of fds.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 311 +++++++++++++++++++++++++++++++++++++-----
 include/uapi/linux/io_uring.h |   9 +-
 2 files changed, 288 insertions(+), 32 deletions(-)

(limited to 'include/uapi')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index c0c0f68568b5..5eaa1c4a3f7c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -49,6 +49,7 @@
 #include <linux/net.h>
 #include <net/sock.h>
 #include <net/af_unix.h>
+#include <net/scm.h>
 #include <linux/anon_inodes.h>
 #include <linux/sched/mm.h>
 #include <linux/uaccess.h>
@@ -61,6 +62,7 @@
 #include "internal.h"
 
 #define IORING_MAX_ENTRIES	4096
+#define IORING_MAX_FIXED_FILES	1024
 
 struct io_uring {
 	u32 head ____cacheline_aligned_in_smp;
@@ -123,6 +125,14 @@ struct io_ring_ctx {
 		struct fasync_struct	*cq_fasync;
 	} ____cacheline_aligned_in_smp;
 
+	/*
+	 * If used, fixed file set. Writers must ensure that ->refs is dead,
+	 * readers must ensure that ->refs is alive as long as the file* is
+	 * used. Only updated through io_uring_register(2).
+	 */
+	struct file		**user_files;
+	unsigned		nr_user_files;
+
 	/* if used, fixed mapped user buffers */
 	unsigned		nr_user_bufs;
 	struct io_mapped_ubuf	*user_bufs;
@@ -170,6 +180,7 @@ struct io_kiocb {
 	unsigned int		flags;
 #define REQ_F_FORCE_NONBLOCK	1	/* inline submission attempt */
 #define REQ_F_IOPOLL_COMPLETED	2	/* polled IO has completed */
+#define REQ_F_FIXED_FILE	4	/* ctx owns file */
 	u64			user_data;
 	u64			error;
 
@@ -404,15 +415,17 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 		 * Batched puts of the same file, to avoid dirtying the
 		 * file usage count multiple times, if avoidable.
 		 */
-		if (!file) {
-			file = req->rw.ki_filp;
-			file_count = 1;
-		} else if (file == req->rw.ki_filp) {
-			file_count++;
-		} else {
-			fput_many(file, file_count);
-			file = req->rw.ki_filp;
-			file_count = 1;
+		if (!(req->flags & REQ_F_FIXED_FILE)) {
+			if (!file) {
+				file = req->rw.ki_filp;
+				file_count = 1;
+			} else if (file == req->rw.ki_filp) {
+				file_count++;
+			} else {
+				fput_many(file, file_count);
+				file = req->rw.ki_filp;
+				file_count = 1;
+			}
 		}
 
 		if (to_free == ARRAY_SIZE(reqs))
@@ -544,13 +557,19 @@ static void kiocb_end_write(struct kiocb *kiocb)
 	}
 }
 
+static void io_fput(struct io_kiocb *req)
+{
+	if (!(req->flags & REQ_F_FIXED_FILE))
+		fput(req->rw.ki_filp);
+}
+
 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
 
 	kiocb_end_write(kiocb);
 
-	fput(kiocb->ki_filp);
+	io_fput(req);
 	io_cqring_add_event(req->ctx, req->user_data, res, 0);
 	io_free_req(req);
 }
@@ -666,19 +685,29 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	struct kiocb *kiocb = &req->rw;
-	unsigned ioprio;
+	unsigned ioprio, flags;
 	int fd, ret;
 
 	/* For -EAGAIN retry, everything is already prepped */
 	if (kiocb->ki_filp)
 		return 0;
 
+	flags = READ_ONCE(sqe->flags);
 	fd = READ_ONCE(sqe->fd);
-	kiocb->ki_filp = io_file_get(state, fd);
-	if (unlikely(!kiocb->ki_filp))
-		return -EBADF;
-	if (force_nonblock && !io_file_supports_async(kiocb->ki_filp))
-		force_nonblock = false;
+
+	if (flags & IOSQE_FIXED_FILE) {
+		if (unlikely(!ctx->user_files ||
+		    (unsigned) fd >= ctx->nr_user_files))
+			return -EBADF;
+		kiocb->ki_filp = ctx->user_files[fd];
+		req->flags |= REQ_F_FIXED_FILE;
+	} else {
+		kiocb->ki_filp = io_file_get(state, fd);
+		if (unlikely(!kiocb->ki_filp))
+			return -EBADF;
+		if (force_nonblock && !io_file_supports_async(kiocb->ki_filp))
+			force_nonblock = false;
+	}
 	kiocb->ki_pos = READ_ONCE(sqe->off);
 	kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
 	kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
@@ -718,10 +747,14 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	}
 	return 0;
 out_fput:
-	/* in case of error, we didn't use this file reference. drop it. */
-	if (state)
-		state->used_refs--;
-	io_file_put(state, kiocb->ki_filp);
+	if (!(flags & IOSQE_FIXED_FILE)) {
+		/*
+		 * in case of error, we didn't use this file reference. drop it.
+		 */
+		if (state)
+			state->used_refs--;
+		io_file_put(state, kiocb->ki_filp);
+	}
 	return ret;
 }
 
@@ -863,7 +896,7 @@ static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s,
 out_fput:
 	/* Hold on to the file for -EAGAIN */
 	if (unlikely(ret && ret != -EAGAIN))
-		fput(file);
+		io_fput(req);
 	return ret;
 }
 
@@ -917,7 +950,7 @@ static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s,
 	kfree(iovec);
 out_fput:
 	if (unlikely(ret))
-		fput(file);
+		io_fput(req);
 	return ret;
 }
 
@@ -940,7 +973,7 @@ static int io_nop(struct io_kiocb *req, u64 user_data)
 	 */
 	if (req->rw.ki_filp) {
 		err = -EBADF;
-		fput(req->rw.ki_filp);
+		io_fput(req);
 	}
 	io_cqring_add_event(ctx, user_data, err, 0);
 	io_free_req(req);
@@ -949,21 +982,32 @@ static int io_nop(struct io_kiocb *req, u64 user_data)
 
 static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
+	struct io_ring_ctx *ctx = req->ctx;
+	unsigned flags;
 	int fd;
 
 	/* Prep already done */
 	if (req->rw.ki_filp)
 		return 0;
 
-	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
 	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
 		return -EINVAL;
 
 	fd = READ_ONCE(sqe->fd);
-	req->rw.ki_filp = fget(fd);
-	if (unlikely(!req->rw.ki_filp))
-		return -EBADF;
+	flags = READ_ONCE(sqe->flags);
+
+	if (flags & IOSQE_FIXED_FILE) {
+		if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
+			return -EBADF;
+		req->rw.ki_filp = ctx->user_files[fd];
+		req->flags |= REQ_F_FIXED_FILE;
+	} else {
+		req->rw.ki_filp = fget(fd);
+		if (unlikely(!req->rw.ki_filp))
+			return -EBADF;
+	}
 
 	return 0;
 }
@@ -993,7 +1037,7 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 				end > 0 ? end : LLONG_MAX,
 				fsync_flags & IORING_FSYNC_DATASYNC);
 
-	fput(req->rw.ki_filp);
+	io_fput(req);
 	io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
 	io_free_req(req);
 	return 0;
@@ -1132,7 +1176,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
 	ssize_t ret;
 
 	/* enforce forwards compatibility on users */
-	if (unlikely(s->sqe->flags))
+	if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE))
 		return -EINVAL;
 
 	req = io_get_req(ctx, state);
@@ -1344,6 +1388,201 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 	return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0;
 }
 
+static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
+{
+#if defined(CONFIG_UNIX)
+	if (ctx->ring_sock) {
+		struct sock *sock = ctx->ring_sock->sk;
+		struct sk_buff *skb;
+
+		while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
+			kfree_skb(skb);
+	}
+#else
+	int i;
+
+	for (i = 0; i < ctx->nr_user_files; i++)
+		fput(ctx->user_files[i]);
+#endif
+}
+
+static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
+{
+	if (!ctx->user_files)
+		return -ENXIO;
+
+	__io_sqe_files_unregister(ctx);
+	kfree(ctx->user_files);
+	ctx->user_files = NULL;
+	ctx->nr_user_files = 0;
+	return 0;
+}
+
+static void io_finish_async(struct io_ring_ctx *ctx)
+{
+	if (ctx->sqo_wq) {
+		destroy_workqueue(ctx->sqo_wq);
+		ctx->sqo_wq = NULL;
+	}
+}
+
+#if defined(CONFIG_UNIX)
+static void io_destruct_skb(struct sk_buff *skb)
+{
+	struct io_ring_ctx *ctx = skb->sk->sk_user_data;
+
+	io_finish_async(ctx);
+	unix_destruct_scm(skb);
+}
+
+/*
+ * Ensure the UNIX gc is aware of our file set, so we are certain that
+ * the io_uring can be safely unregistered on process exit, even if we have
+ * loops in the file referencing.
+ */
+static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
+{
+	struct sock *sk = ctx->ring_sock->sk;
+	struct scm_fp_list *fpl;
+	struct sk_buff *skb;
+	int i;
+
+	if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
+		unsigned long inflight = ctx->user->unix_inflight + nr;
+
+		if (inflight > task_rlimit(current, RLIMIT_NOFILE))
+			return -EMFILE;
+	}
+
+	fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
+	if (!fpl)
+		return -ENOMEM;
+
+	skb = alloc_skb(0, GFP_KERNEL);
+	if (!skb) {
+		kfree(fpl);
+		return -ENOMEM;
+	}
+
+	skb->sk = sk;
+	skb->destructor = io_destruct_skb;
+
+	fpl->user = get_uid(ctx->user);
+	for (i = 0; i < nr; i++) {
+		fpl->fp[i] = get_file(ctx->user_files[i + offset]);
+		unix_inflight(fpl->user, fpl->fp[i]);
+	}
+
+	fpl->max = fpl->count = nr;
+	UNIXCB(skb).fp = fpl;
+	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
+	skb_queue_head(&sk->sk_receive_queue, skb);
+
+	for (i = 0; i < nr; i++)
+		fput(fpl->fp[i]);
+
+	return 0;
+}
+
+/*
+ * If UNIX sockets are enabled, fd passing can cause a reference cycle which
+ * causes regular reference counting to break down. We rely on the UNIX
+ * garbage collection to take care of this problem for us.
+ */
+static int io_sqe_files_scm(struct io_ring_ctx *ctx)
+{
+	unsigned left, total;
+	int ret = 0;
+
+	total = 0;
+	left = ctx->nr_user_files;
+	while (left) {
+		unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
+		int ret;
+
+		ret = __io_sqe_files_scm(ctx, this_files, total);
+		if (ret)
+			break;
+		left -= this_files;
+		total += this_files;
+	}
+
+	if (!ret)
+		return 0;
+
+	while (total < ctx->nr_user_files) {
+		fput(ctx->user_files[total]);
+		total++;
+	}
+
+	return ret;
+}
+#else
+static int io_sqe_files_scm(struct io_ring_ctx *ctx)
+{
+	return 0;
+}
+#endif
+
+static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
+				 unsigned nr_args)
+{
+	__s32 __user *fds = (__s32 __user *) arg;
+	int fd, ret = 0;
+	unsigned i;
+
+	if (ctx->user_files)
+		return -EBUSY;
+	if (!nr_args)
+		return -EINVAL;
+	if (nr_args > IORING_MAX_FIXED_FILES)
+		return -EMFILE;
+
+	ctx->user_files = kcalloc(nr_args, sizeof(struct file *), GFP_KERNEL);
+	if (!ctx->user_files)
+		return -ENOMEM;
+
+	for (i = 0; i < nr_args; i++) {
+		ret = -EFAULT;
+		if (copy_from_user(&fd, &fds[i], sizeof(fd)))
+			break;
+
+		ctx->user_files[i] = fget(fd);
+
+		ret = -EBADF;
+		if (!ctx->user_files[i])
+			break;
+		/*
+		 * Don't allow io_uring instances to be registered. If UNIX
+		 * isn't enabled, then this causes a reference cycle and this
+		 * instance can never get freed. If UNIX is enabled we'll
+		 * handle it just fine, but there's still no point in allowing
+		 * a ring fd as it doesn't support regular read/write anyway.
+		 */
+		if (ctx->user_files[i]->f_op == &io_uring_fops) {
+			fput(ctx->user_files[i]);
+			break;
+		}
+		ctx->nr_user_files++;
+		ret = 0;
+	}
+
+	if (ret) {
+		for (i = 0; i < ctx->nr_user_files; i++)
+			fput(ctx->user_files[i]);
+
+		kfree(ctx->user_files);
+		ctx->nr_user_files = 0;
+		return ret;
+	}
+
+	ret = io_sqe_files_scm(ctx);
+	if (ret)
+		io_sqe_files_unregister(ctx);
+
+	return ret;
+}
+
 static int io_sq_offload_start(struct io_ring_ctx *ctx)
 {
 	int ret;
@@ -1612,13 +1851,13 @@ err:
 
 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 {
-	if (ctx->sqo_wq)
-		destroy_workqueue(ctx->sqo_wq);
+	io_finish_async(ctx);
 	if (ctx->sqo_mm)
 		mmdrop(ctx->sqo_mm);
 
 	io_iopoll_reap_events(ctx);
 	io_sqe_buffer_unregister(ctx);
+	io_sqe_files_unregister(ctx);
 
 #if defined(CONFIG_UNIX)
 	if (ctx->ring_sock)
@@ -1858,6 +2097,7 @@ static int io_uring_get_fd(struct io_ring_ctx *ctx)
 
 #if defined(CONFIG_UNIX)
 	ctx->ring_sock->file = file;
+	ctx->ring_sock->sk->sk_user_data = ctx;
 #endif
 	fd_install(ret, file);
 	return ret;
@@ -2001,6 +2241,15 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_sqe_buffer_unregister(ctx);
 		break;
+	case IORING_REGISTER_FILES:
+		ret = io_sqe_files_register(ctx, arg, nr_args);
+		break;
+	case IORING_UNREGISTER_FILES:
+		ret = -EINVAL;
+		if (arg || nr_args)
+			break;
+		ret = io_sqe_files_unregister(ctx);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index cf28f7a11f12..6257478d55e9 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -16,7 +16,7 @@
  */
 struct io_uring_sqe {
 	__u8	opcode;		/* type of operation for this sqe */
-	__u8	flags;		/* as of now unused */
+	__u8	flags;		/* IOSQE_ flags */
 	__u16	ioprio;		/* ioprio for the request */
 	__s32	fd;		/* file descriptor to do IO on */
 	__u64	off;		/* offset into file */
@@ -33,6 +33,11 @@ struct io_uring_sqe {
 	};
 };
 
+/*
+ * sqe->flags
+ */
+#define IOSQE_FIXED_FILE	(1U << 0)	/* use fixed fileset */
+
 /*
  * io_uring_setup() flags
  */
@@ -113,5 +118,7 @@ struct io_uring_params {
  */
 #define IORING_REGISTER_BUFFERS		0
 #define IORING_UNREGISTER_BUFFERS	1
+#define IORING_REGISTER_FILES		2
+#define IORING_UNREGISTER_FILES		3
 
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 6c271ce2f1d572f7fa225700a13cfe7ced492434 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 10 Jan 2019 11:22:30 -0700
Subject: io_uring: add submission polling

This enables an application to do IO, without ever entering the kernel.
By using the SQ ring to fill in new sqes and watching for completions
on the CQ ring, we can submit and reap IOs without doing a single system
call. The kernel side thread will poll for new submissions, and in case
of HIPRI/polled IO, it'll also poll for completions.

By default, we allow 1 second of active spinning. This can by changed
by passing in a different grace period at io_uring_register(2) time.
If the thread exceeds this idle time without having any work to do, it
will set:

sq_ring->flags |= IORING_SQ_NEED_WAKEUP.

The application will have to call io_uring_enter() to start things back
up again. If IO is kept busy, that will never be needed. Basically an
application that has this feature enabled will guard it's
io_uring_enter(2) call with:

read_barrier();
if (*sq_ring->flags & IORING_SQ_NEED_WAKEUP)
	io_uring_enter(fd, 0, 0, IORING_ENTER_SQ_WAKEUP);

instead of calling it unconditionally.

It's mandatory to use fixed files with this feature. Failure to do so
will result in the application getting an -EBADF CQ entry when
submitting IO.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 249 ++++++++++++++++++++++++++++++++++++++++--
 include/uapi/linux/io_uring.h |  12 +-
 2 files changed, 253 insertions(+), 8 deletions(-)

(limited to 'include/uapi')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5eaa1c4a3f7c..0a4caedf82c1 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -44,6 +44,7 @@
 #include <linux/percpu.h>
 #include <linux/slab.h>
 #include <linux/workqueue.h>
+#include <linux/kthread.h>
 #include <linux/blkdev.h>
 #include <linux/bvec.h>
 #include <linux/net.h>
@@ -108,12 +109,16 @@ struct io_ring_ctx {
 		unsigned		cached_sq_head;
 		unsigned		sq_entries;
 		unsigned		sq_mask;
+		unsigned		sq_thread_idle;
 		struct io_uring_sqe	*sq_sqes;
 	} ____cacheline_aligned_in_smp;
 
 	/* IO offload */
 	struct workqueue_struct	*sqo_wq;
+	struct task_struct	*sqo_thread;	/* if using sq thread polling */
 	struct mm_struct	*sqo_mm;
+	wait_queue_head_t	sqo_wait;
+	unsigned		sqo_stop;
 
 	struct {
 		/* CQ ring */
@@ -168,6 +173,7 @@ struct sqe_submit {
 	unsigned short			index;
 	bool				has_user;
 	bool				needs_lock;
+	bool				needs_fixed_file;
 };
 
 struct io_kiocb {
@@ -327,6 +333,8 @@ static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 ki_user_data,
 
 	if (waitqueue_active(&ctx->wait))
 		wake_up(&ctx->wait);
+	if (waitqueue_active(&ctx->sqo_wait))
+		wake_up(&ctx->sqo_wait);
 }
 
 static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
@@ -680,9 +688,10 @@ static bool io_file_supports_async(struct file *file)
 	return false;
 }
 
-static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
 		      bool force_nonblock, struct io_submit_state *state)
 {
+	const struct io_uring_sqe *sqe = s->sqe;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct kiocb *kiocb = &req->rw;
 	unsigned ioprio, flags;
@@ -702,6 +711,8 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		kiocb->ki_filp = ctx->user_files[fd];
 		req->flags |= REQ_F_FIXED_FILE;
 	} else {
+		if (s->needs_fixed_file)
+			return -EBADF;
 		kiocb->ki_filp = io_file_get(state, fd);
 		if (unlikely(!kiocb->ki_filp))
 			return -EBADF;
@@ -865,7 +876,7 @@ static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s,
 	struct file *file;
 	ssize_t ret;
 
-	ret = io_prep_rw(req, s->sqe, force_nonblock, state);
+	ret = io_prep_rw(req, s, force_nonblock, state);
 	if (ret)
 		return ret;
 	file = kiocb->ki_filp;
@@ -909,7 +920,7 @@ static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s,
 	struct file *file;
 	ssize_t ret;
 
-	ret = io_prep_rw(req, s->sqe, force_nonblock, state);
+	ret = io_prep_rw(req, s, force_nonblock, state);
 	if (ret)
 		return ret;
 	/* Hold on to the file for -EAGAIN */
@@ -1301,6 +1312,169 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
 	return false;
 }
 
+static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
+			  unsigned int nr, bool has_user, bool mm_fault)
+{
+	struct io_submit_state state, *statep = NULL;
+	int ret, i, submitted = 0;
+
+	if (nr > IO_PLUG_THRESHOLD) {
+		io_submit_state_start(&state, ctx, nr);
+		statep = &state;
+	}
+
+	for (i = 0; i < nr; i++) {
+		if (unlikely(mm_fault)) {
+			ret = -EFAULT;
+		} else {
+			sqes[i].has_user = has_user;
+			sqes[i].needs_lock = true;
+			sqes[i].needs_fixed_file = true;
+			ret = io_submit_sqe(ctx, &sqes[i], statep);
+		}
+		if (!ret) {
+			submitted++;
+			continue;
+		}
+
+		io_cqring_add_event(ctx, sqes[i].sqe->user_data, ret, 0);
+	}
+
+	if (statep)
+		io_submit_state_end(&state);
+
+	return submitted;
+}
+
+static int io_sq_thread(void *data)
+{
+	struct sqe_submit sqes[IO_IOPOLL_BATCH];
+	struct io_ring_ctx *ctx = data;
+	struct mm_struct *cur_mm = NULL;
+	mm_segment_t old_fs;
+	DEFINE_WAIT(wait);
+	unsigned inflight;
+	unsigned long timeout;
+
+	old_fs = get_fs();
+	set_fs(USER_DS);
+
+	timeout = inflight = 0;
+	while (!kthread_should_stop() && !ctx->sqo_stop) {
+		bool all_fixed, mm_fault = false;
+		int i;
+
+		if (inflight) {
+			unsigned nr_events = 0;
+
+			if (ctx->flags & IORING_SETUP_IOPOLL) {
+				/*
+				 * We disallow the app entering submit/complete
+				 * with polling, but we still need to lock the
+				 * ring to prevent racing with polled issue
+				 * that got punted to a workqueue.
+				 */
+				mutex_lock(&ctx->uring_lock);
+				io_iopoll_check(ctx, &nr_events, 0);
+				mutex_unlock(&ctx->uring_lock);
+			} else {
+				/*
+				 * Normal IO, just pretend everything completed.
+				 * We don't have to poll completions for that.
+				 */
+				nr_events = inflight;
+			}
+
+			inflight -= nr_events;
+			if (!inflight)
+				timeout = jiffies + ctx->sq_thread_idle;
+		}
+
+		if (!io_get_sqring(ctx, &sqes[0])) {
+			/*
+			 * We're polling. If we're within the defined idle
+			 * period, then let us spin without work before going
+			 * to sleep.
+			 */
+			if (inflight || !time_after(jiffies, timeout)) {
+				cpu_relax();
+				continue;
+			}
+
+			/*
+			 * Drop cur_mm before scheduling, we can't hold it for
+			 * long periods (or over schedule()). Do this before
+			 * adding ourselves to the waitqueue, as the unuse/drop
+			 * may sleep.
+			 */
+			if (cur_mm) {
+				unuse_mm(cur_mm);
+				mmput(cur_mm);
+				cur_mm = NULL;
+			}
+
+			prepare_to_wait(&ctx->sqo_wait, &wait,
+						TASK_INTERRUPTIBLE);
+
+			/* Tell userspace we may need a wakeup call */
+			ctx->sq_ring->flags |= IORING_SQ_NEED_WAKEUP;
+			smp_wmb();
+
+			if (!io_get_sqring(ctx, &sqes[0])) {
+				if (kthread_should_stop()) {
+					finish_wait(&ctx->sqo_wait, &wait);
+					break;
+				}
+				if (signal_pending(current))
+					flush_signals(current);
+				schedule();
+				finish_wait(&ctx->sqo_wait, &wait);
+
+				ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
+				smp_wmb();
+				continue;
+			}
+			finish_wait(&ctx->sqo_wait, &wait);
+
+			ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
+			smp_wmb();
+		}
+
+		i = 0;
+		all_fixed = true;
+		do {
+			if (all_fixed && io_sqe_needs_user(sqes[i].sqe))
+				all_fixed = false;
+
+			i++;
+			if (i == ARRAY_SIZE(sqes))
+				break;
+		} while (io_get_sqring(ctx, &sqes[i]));
+
+		/* Unless all new commands are FIXED regions, grab mm */
+		if (!all_fixed && !cur_mm) {
+			mm_fault = !mmget_not_zero(ctx->sqo_mm);
+			if (!mm_fault) {
+				use_mm(ctx->sqo_mm);
+				cur_mm = ctx->sqo_mm;
+			}
+		}
+
+		inflight += io_submit_sqes(ctx, sqes, i, cur_mm != NULL,
+						mm_fault);
+
+		/* Commit SQ ring head once we've consumed all SQEs */
+		io_commit_sqring(ctx);
+	}
+
+	set_fs(old_fs);
+	if (cur_mm) {
+		unuse_mm(cur_mm);
+		mmput(cur_mm);
+	}
+	return 0;
+}
+
 static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
 {
 	struct io_submit_state state, *statep = NULL;
@@ -1319,6 +1493,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
 
 		s.has_user = true;
 		s.needs_lock = false;
+		s.needs_fixed_file = false;
 
 		ret = io_submit_sqe(ctx, &s, statep);
 		if (ret) {
@@ -1418,8 +1593,20 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 	return 0;
 }
 
+static void io_sq_thread_stop(struct io_ring_ctx *ctx)
+{
+	if (ctx->sqo_thread) {
+		ctx->sqo_stop = 1;
+		mb();
+		kthread_stop(ctx->sqo_thread);
+		ctx->sqo_thread = NULL;
+	}
+}
+
 static void io_finish_async(struct io_ring_ctx *ctx)
 {
+	io_sq_thread_stop(ctx);
+
 	if (ctx->sqo_wq) {
 		destroy_workqueue(ctx->sqo_wq);
 		ctx->sqo_wq = NULL;
@@ -1583,13 +1770,47 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 	return ret;
 }
 
-static int io_sq_offload_start(struct io_ring_ctx *ctx)
+static int io_sq_offload_start(struct io_ring_ctx *ctx,
+			       struct io_uring_params *p)
 {
 	int ret;
 
+	init_waitqueue_head(&ctx->sqo_wait);
 	mmgrab(current->mm);
 	ctx->sqo_mm = current->mm;
 
+	ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
+	if (!ctx->sq_thread_idle)
+		ctx->sq_thread_idle = HZ;
+
+	ret = -EINVAL;
+	if (!cpu_possible(p->sq_thread_cpu))
+		goto err;
+
+	if (ctx->flags & IORING_SETUP_SQPOLL) {
+		if (p->flags & IORING_SETUP_SQ_AFF) {
+			int cpu;
+
+			cpu = array_index_nospec(p->sq_thread_cpu, NR_CPUS);
+			ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
+							ctx, cpu,
+							"io_uring-sq");
+		} else {
+			ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
+							"io_uring-sq");
+		}
+		if (IS_ERR(ctx->sqo_thread)) {
+			ret = PTR_ERR(ctx->sqo_thread);
+			ctx->sqo_thread = NULL;
+			goto err;
+		}
+		wake_up_process(ctx->sqo_thread);
+	} else if (p->flags & IORING_SETUP_SQ_AFF) {
+		/* Can't have SQ_AFF without SQPOLL */
+		ret = -EINVAL;
+		goto err;
+	}
+
 	/* Do QD, or 2 * CPUS, whatever is smallest */
 	ctx->sqo_wq = alloc_workqueue("io_ring-wq", WQ_UNBOUND | WQ_FREEZABLE,
 			min(ctx->sq_entries - 1, 2 * num_online_cpus()));
@@ -1600,6 +1821,7 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx)
 
 	return 0;
 err:
+	io_sq_thread_stop(ctx);
 	mmdrop(ctx->sqo_mm);
 	ctx->sqo_mm = NULL;
 	return ret;
@@ -1959,7 +2181,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 	int submitted = 0;
 	struct fd f;
 
-	if (flags & ~IORING_ENTER_GETEVENTS)
+	if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
 		return -EINVAL;
 
 	f = fdget(fd);
@@ -1975,6 +2197,18 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 	if (!percpu_ref_tryget(&ctx->refs))
 		goto out_fput;
 
+	/*
+	 * For SQ polling, the thread will do all submissions and completions.
+	 * Just return the requested submit count, and wake the thread if
+	 * we were asked to.
+	 */
+	if (ctx->flags & IORING_SETUP_SQPOLL) {
+		if (flags & IORING_ENTER_SQ_WAKEUP)
+			wake_up(&ctx->sqo_wait);
+		submitted = to_submit;
+		goto out_ctx;
+	}
+
 	ret = 0;
 	if (to_submit) {
 		to_submit = min(to_submit, ctx->sq_entries);
@@ -2156,7 +2390,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
 	if (ret)
 		goto err;
 
-	ret = io_sq_offload_start(ctx);
+	ret = io_sq_offload_start(ctx, p);
 	if (ret)
 		goto err;
 
@@ -2204,7 +2438,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
 			return -EINVAL;
 	}
 
-	if (p.flags & ~IORING_SETUP_IOPOLL)
+	if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
+			IORING_SETUP_SQ_AFF))
 		return -EINVAL;
 
 	ret = io_uring_create(entries, &p);
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 6257478d55e9..0ec74bab8dbe 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -42,6 +42,8 @@ struct io_uring_sqe {
  * io_uring_setup() flags
  */
 #define IORING_SETUP_IOPOLL	(1U << 0)	/* io_context is polled */
+#define IORING_SETUP_SQPOLL	(1U << 1)	/* SQ poll thread */
+#define IORING_SETUP_SQ_AFF	(1U << 2)	/* sq_thread_cpu is valid */
 
 #define IORING_OP_NOP		0
 #define IORING_OP_READV		1
@@ -86,6 +88,11 @@ struct io_sqring_offsets {
 	__u64 resv2;
 };
 
+/*
+ * sq_ring->flags
+ */
+#define IORING_SQ_NEED_WAKEUP	(1U << 0) /* needs io_uring_enter wakeup */
+
 struct io_cqring_offsets {
 	__u32 head;
 	__u32 tail;
@@ -100,6 +107,7 @@ struct io_cqring_offsets {
  * io_uring_enter(2) flags
  */
 #define IORING_ENTER_GETEVENTS	(1U << 0)
+#define IORING_ENTER_SQ_WAKEUP	(1U << 1)
 
 /*
  * Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -108,7 +116,9 @@ struct io_uring_params {
 	__u32 sq_entries;
 	__u32 cq_entries;
 	__u32 flags;
-	__u32 resv[7];
+	__u32 sq_thread_cpu;
+	__u32 sq_thread_idle;
+	__u32 resv[5];
 	struct io_sqring_offsets sq_off;
 	struct io_cqring_offsets cq_off;
 };
-- 
cgit v1.2.3-59-g8ed1b


From f7c917ba11a67632a8452ea99fe132f626a7a2cc Mon Sep 17 00:00:00 2001
From: brakmo <brakmo@fb.com>
Date: Fri, 1 Mar 2019 12:38:46 -0800
Subject: bpf: add bpf helper bpf_skb_ecn_set_ce

This patch adds a new bpf helper BPF_FUNC_skb_ecn_set_ce
"int bpf_skb_ecn_set_ce(struct sk_buff *skb)". It is added to
BPF_PROG_TYPE_CGROUP_SKB typed bpf_prog which currently can
be attached to the ingress and egress path. The helper is needed
because his type of bpf_prog cannot modify the skb directly.

This helper is used to set the ECN field of ECN capable IP packets to ce
(congestion encountered) in the IPv6 or IPv4 header of the skb. It can be
used by a bpf_prog to manage egress or ingress network bandwdith limit
per cgroupv2 by inducing an ECN response in the TCP sender.
This works best when using DCTCP.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 10 +++++++++-
 net/core/filter.c        | 28 ++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2e308e90ffea..3c38ac9a92a7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2359,6 +2359,13 @@ union bpf_attr {
  *	Return
  *		A **struct bpf_tcp_sock** pointer on success, or NULL in
  *		case of failure.
+ *
+ * int bpf_skb_ecn_set_ce(struct sk_buf *skb)
+ *     Description
+ *             Sets ECN of IP header to ce (congestion encountered) if
+ *             current value is ect (ECN capable). Works with IPv6 and IPv4.
+ *     Return
+ *             1 if set, 0 if not set.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2457,7 +2464,8 @@ union bpf_attr {
 	FN(spin_lock),			\
 	FN(spin_unlock),		\
 	FN(sk_fullsock),		\
-	FN(tcp_sock),
+	FN(tcp_sock),			\
+	FN(skb_ecn_set_ce),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/net/core/filter.c b/net/core/filter.c
index 85749f6ec789..558ca72f2254 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5426,6 +5426,32 @@ static const struct bpf_func_proto bpf_tcp_sock_proto = {
 	.arg1_type	= ARG_PTR_TO_SOCK_COMMON,
 };
 
+BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb)
+{
+	unsigned int iphdr_len;
+
+	if (skb->protocol == cpu_to_be16(ETH_P_IP))
+		iphdr_len = sizeof(struct iphdr);
+	else if (skb->protocol == cpu_to_be16(ETH_P_IPV6))
+		iphdr_len = sizeof(struct ipv6hdr);
+	else
+		return 0;
+
+	if (skb_headlen(skb) < iphdr_len)
+		return 0;
+
+	if (skb_cloned(skb) && !skb_clone_writable(skb, iphdr_len))
+		return 0;
+
+	return INET_ECN_set_ce(skb);
+}
+
+static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = {
+	.func           = bpf_skb_ecn_set_ce,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type      = ARG_PTR_TO_CTX,
+};
 #endif /* CONFIG_INET */
 
 bool bpf_helper_changes_pkt_data(void *func)
@@ -5585,6 +5611,8 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 #ifdef CONFIG_INET
 	case BPF_FUNC_tcp_sock:
 		return &bpf_tcp_sock_proto;
+	case BPF_FUNC_skb_ecn_set_ce:
+		return &bpf_skb_ecn_set_ce_proto;
 #endif
 	default:
 		return sk_filter_func_proto(func_id, prog);
-- 
cgit v1.2.3-59-g8ed1b


From 0b5c7efdfc6e389ec6840579fe90bdb6f42b08dc Mon Sep 17 00:00:00 2001
From: Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>
Date: Fri, 1 Mar 2019 16:04:05 +0100
Subject: sch_cake: Permit use of connmarks as tin classifiers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add flag 'FWMARK' to enable use of firewall connmarks as tin selector.
The connmark (skbuff->mark) needs to be in the range 1->tin_cnt ie.
for diffserv3 the mark needs to be 1->3.

Background

Typically CAKE uses DSCP as the basis for tin selection.  DSCP values
are relatively easily changed as part of the egress path, usually with
iptables & the mangle table, ingress is more challenging.  CAKE is often
used on the WAN interface of a residential gateway where passthrough of
DSCP from the ISP is either missing or set to unhelpful values thus use
of ingress DSCP values for tin selection isn't helpful in that
environment.

An approach to solving the ingress tin selection problem is to use
CAKE's understanding of tc filters.  Naive tc filters could match on
source/destination port numbers and force tin selection that way, but
multiple filters don't scale particularly well as each filter must be
traversed whether it matches or not. e.g. a simple example to map 3
firewall marks to tins:

MAJOR=$( tc qdisc show dev $DEV | head -1 | awk '{print $3}' )
tc filter add dev $DEV parent $MAJOR protocol all handle 0x01 fw action skbedit priority ${MAJOR}1
tc filter add dev $DEV parent $MAJOR protocol all handle 0x02 fw action skbedit priority ${MAJOR}2
tc filter add dev $DEV parent $MAJOR protocol all handle 0x03 fw action skbedit priority ${MAJOR}3

Another option is to use eBPF cls_act with tc filters e.g.

MAJOR=$( tc qdisc show dev $DEV | head -1 | awk '{print $3}' )
tc filter add dev $DEV parent $MAJOR bpf da obj my-bpf-fwmark-to-class.o

This has the disadvantages of a) needing someone to write & maintain
the bpf program, b) a bpf toolchain to compile it and c) needing to
hardcode the major number in the bpf program so it matches the cake
instance (or forcing the cake instance to a particular major number)
since the major number cannot be passed to the bpf program via tc
command line.

As already hinted at by the previous examples, it would be helpful
to associate tins with something that survives the Internet path and
ideally allows tin selection on both egress and ingress.  Netfilter's
conntrack permits setting an identifying mark on a connection which
can also be restored to an ingress packet with tc action connmark e.g.

tc filter add dev eth0 parent ffff: protocol all prio 10 u32 \
	match u32 0 0 flowid 1:1 action connmark action mirred egress redirect dev ifb1

Since tc's connmark action has restored any connmark into skb->mark,
any of the previous solutions are based upon it and in one form or
another copy that mark to the skb->priority field where again CAKE
picks this up.

This change cuts out at least one of the (less intuitive &
non-scalable) middlemen and permit direct access to skb->mark.

Signed-off-by: Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h |  1 +
 net/sched/sch_cake.c           | 34 +++++++++++++++++++++++++++-------
 2 files changed, 28 insertions(+), 7 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 1eb572ef3f27..7ee74c3474bf 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -1021,6 +1021,7 @@ enum {
 	TCA_CAKE_INGRESS,
 	TCA_CAKE_ACK_FILTER,
 	TCA_CAKE_SPLIT_GSO,
+	TCA_CAKE_FWMARK,
 	__TCA_CAKE_MAX
 };
 #define TCA_CAKE_MAX	(__TCA_CAKE_MAX - 1)
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index 4d688b3b471b..f1cc4779699b 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -258,7 +258,8 @@ enum {
 	CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
 	CAKE_FLAG_INGRESS	   = BIT(2),
 	CAKE_FLAG_WASH		   = BIT(3),
-	CAKE_FLAG_SPLIT_GSO	   = BIT(4)
+	CAKE_FLAG_SPLIT_GSO	   = BIT(4),
+	CAKE_FLAG_FWMARK	   = BIT(5)
 };
 
 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to
@@ -1566,12 +1567,20 @@ static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
 		if (q->rate_flags & CAKE_FLAG_WASH)
 			cake_wash_diffserv(skb);
 	} else if (q->tin_mode != CAKE_DIFFSERV_BESTEFFORT) {
-		/* extract the Diffserv Precedence field, if it exists */
-		/* and clear DSCP bits if washing */
-		tin = q->tin_index[cake_handle_diffserv(skb,
-				q->rate_flags & CAKE_FLAG_WASH)];
-		if (unlikely(tin >= q->tin_cnt))
-			tin = 0;
+		if (q->rate_flags & CAKE_FLAG_FWMARK && /* use fw mark */
+		    skb->mark &&
+		    skb->mark <= q->tin_cnt) {
+			tin = q->tin_order[skb->mark - 1];
+			if (q->rate_flags & CAKE_FLAG_WASH)
+				cake_wash_diffserv(skb);
+		} else {
+			/* extract the Diffserv Precedence field, if it exists */
+			/* and clear DSCP bits if washing */
+			tin = q->tin_index[cake_handle_diffserv(skb,
+					q->rate_flags & CAKE_FLAG_WASH)];
+			if (unlikely(tin >= q->tin_cnt))
+				tin = 0;
+		}
 	} else {
 		tin = 0;
 		if (q->rate_flags & CAKE_FLAG_WASH)
@@ -2624,6 +2633,13 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt,
 			q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
 	}
 
+	if (tb[TCA_CAKE_FWMARK]) {
+		if (!!nla_get_u32(tb[TCA_CAKE_FWMARK]))
+			q->rate_flags |= CAKE_FLAG_FWMARK;
+		else
+			q->rate_flags &= ~CAKE_FLAG_FWMARK;
+	}
+
 	if (q->tins) {
 		sch_tree_lock(sch);
 		cake_reconfigure(sch);
@@ -2783,6 +2799,10 @@ static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
 			!!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
 		goto nla_put_failure;
 
+	if (nla_put_u32(skb, TCA_CAKE_FWMARK,
+			!!(q->rate_flags & CAKE_FLAG_FWMARK)))
+		goto nla_put_failure;
+
 	return nla_nest_end(skb, opts);
 
 nla_put_failure:
-- 
cgit v1.2.3-59-g8ed1b


From 9036b2fe092a107856edd1a3bad48b83f2b45000 Mon Sep 17 00:00:00 2001
From: Francesco Ruggeri <fruggeri@arista.com>
Date: Fri, 1 Mar 2019 15:31:03 -0800
Subject: net: ipv6: add socket option IPV6_ROUTER_ALERT_ISOLATE

By default IPv6 socket with IPV6_ROUTER_ALERT socket option set will
receive all IPv6 RA packets from all namespaces.
IPV6_ROUTER_ALERT_ISOLATE socket option restricts packets received by
the socket to be only from the socket's namespace.

Signed-off-by: Maxim Martynov <maxim@arista.com>
Signed-off-by: Francesco Ruggeri <fruggeri@arista.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h     |  3 ++-
 include/uapi/linux/in6.h |  1 +
 net/ipv6/ip6_output.c    |  6 ++++++
 net/ipv6/ipv6_sockglue.c | 10 ++++++++++
 4 files changed, 19 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 6d45ce784bea..ea7c7906591e 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -281,7 +281,8 @@ struct ipv6_pinfo {
 				dontfrag:1,
 				autoflowlabel:1,
 				autoflowlabel_set:1,
-				mc_all:1;
+				mc_all:1,
+				rtalert_isolate:1;
 	__u8			min_hopcount;
 	__u8			tclass;
 	__be32			rcv_flowinfo;
diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h
index 71d82fe15b03..9f2273a08356 100644
--- a/include/uapi/linux/in6.h
+++ b/include/uapi/linux/in6.h
@@ -178,6 +178,7 @@ struct in6_flowlabel_req {
 #define IPV6_JOIN_ANYCAST	27
 #define IPV6_LEAVE_ANYCAST	28
 #define IPV6_MULTICAST_ALL	29
+#define IPV6_ROUTER_ALERT_ISOLATE	30
 
 /* IPV6_MTU_DISCOVER values */
 #define IPV6_PMTUDISC_DONT		0
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 5f9fa0302b5a..edbd12067170 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -300,6 +300,12 @@ static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 		if (sk && ra->sel == sel &&
 		    (!sk->sk_bound_dev_if ||
 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
+			struct ipv6_pinfo *np = inet6_sk(sk);
+
+			if (np && np->rtalert_isolate &&
+			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
+				continue;
+			}
 			if (last) {
 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 				if (skb2)
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 973e215c3114..40f21fef25ff 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -787,6 +787,12 @@ done:
 			goto e_inval;
 		retv = ip6_ra_control(sk, val);
 		break;
+	case IPV6_ROUTER_ALERT_ISOLATE:
+		if (optlen < sizeof(int))
+			goto e_inval;
+		np->rtalert_isolate = valbool;
+		retv = 0;
+		break;
 	case IPV6_MTU_DISCOVER:
 		if (optlen < sizeof(int))
 			goto e_inval;
@@ -1358,6 +1364,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		val = np->rxopt.bits.recvfragsize;
 		break;
 
+	case IPV6_ROUTER_ALERT_ISOLATE:
+		val = np->rtalert_isolate;
+		break;
+
 	default:
 		return -ENOPROTOOPT;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 3eb39f47934f9d5a3027fe00d906a45fe3a15fad Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian@brauner.io>
Date: Mon, 19 Nov 2018 00:51:56 +0100
Subject: signal: add pidfd_send_signal() syscall

The kill() syscall operates on process identifiers (pid). After a process
has exited its pid can be reused by another process. If a caller sends a
signal to a reused pid it will end up signaling the wrong process. This
issue has often surfaced and there has been a push to address this problem [1].

This patch uses file descriptors (fd) from proc/<pid> as stable handles on
struct pid. Even if a pid is recycled the handle will not change. The fd
can be used to send signals to the process it refers to.
Thus, the new syscall pidfd_send_signal() is introduced to solve this
problem. Instead of pids it operates on process fds (pidfd).

/* prototype and argument /*
long pidfd_send_signal(int pidfd, int sig, siginfo_t *info, unsigned int flags);

/* syscall number 424 */
The syscall number was chosen to be 424 to align with Arnd's rework in his
y2038 to minimize merge conflicts (cf. [25]).

In addition to the pidfd and signal argument it takes an additional
siginfo_t and flags argument. If the siginfo_t argument is NULL then
pidfd_send_signal() is equivalent to kill(<positive-pid>, <signal>). If it
is not NULL pidfd_send_signal() is equivalent to rt_sigqueueinfo().
The flags argument is added to allow for future extensions of this syscall.
It currently needs to be passed as 0. Failing to do so will cause EINVAL.

/* pidfd_send_signal() replaces multiple pid-based syscalls */
The pidfd_send_signal() syscall currently takes on the job of
rt_sigqueueinfo(2) and parts of the functionality of kill(2), Namely, when a
positive pid is passed to kill(2). It will however be possible to also
replace tgkill(2) and rt_tgsigqueueinfo(2) if this syscall is extended.

/* sending signals to threads (tid) and process groups (pgid) */
Specifically, the pidfd_send_signal() syscall does currently not operate on
process groups or threads. This is left for future extensions.
In order to extend the syscall to allow sending signal to threads and
process groups appropriately named flags (e.g. PIDFD_TYPE_PGID, and
PIDFD_TYPE_TID) should be added. This implies that the flags argument will
determine what is signaled and not the file descriptor itself. Put in other
words, grouping in this api is a property of the flags argument not a
property of the file descriptor (cf. [13]). Clarification for this has been
requested by Eric (cf. [19]).
When appropriate extensions through the flags argument are added then
pidfd_send_signal() can additionally replace the part of kill(2) which
operates on process groups as well as the tgkill(2) and
rt_tgsigqueueinfo(2) syscalls.
How such an extension could be implemented has been very roughly sketched
in [14], [15], and [16]. However, this should not be taken as a commitment
to a particular implementation. There might be better ways to do it.
Right now this is intentionally left out to keep this patchset as simple as
possible (cf. [4]).

/* naming */
The syscall had various names throughout iterations of this patchset:
- procfd_signal()
- procfd_send_signal()
- taskfd_send_signal()
In the last round of reviews it was pointed out that given that if the
flags argument decides the scope of the signal instead of different types
of fds it might make sense to either settle for "procfd_" or "pidfd_" as
prefix. The community was willing to accept either (cf. [17] and [18]).
Given that one developer expressed strong preference for the "pidfd_"
prefix (cf. [13]) and with other developers less opinionated about the name
we should settle for "pidfd_" to avoid further bikeshedding.

The  "_send_signal" suffix was chosen to reflect the fact that the syscall
takes on the job of multiple syscalls. It is therefore intentional that the
name is not reminiscent of neither kill(2) nor rt_sigqueueinfo(2). Not the
fomer because it might imply that pidfd_send_signal() is a replacement for
kill(2), and not the latter because it is a hassle to remember the correct
spelling - especially for non-native speakers - and because it is not
descriptive enough of what the syscall actually does. The name
"pidfd_send_signal" makes it very clear that its job is to send signals.

/* zombies */
Zombies can be signaled just as any other process. No special error will be
reported since a zombie state is an unreliable state (cf. [3]). However,
this can be added as an extension through the @flags argument if the need
ever arises.

/* cross-namespace signals */
The patch currently enforces that the signaler and signalee either are in
the same pid namespace or that the signaler's pid namespace is an ancestor
of the signalee's pid namespace. This is done for the sake of simplicity
and because it is unclear to what values certain members of struct
siginfo_t would need to be set to (cf. [5], [6]).

/* compat syscalls */
It became clear that we would like to avoid adding compat syscalls
(cf. [7]).  The compat syscall handling is now done in kernel/signal.c
itself by adding __copy_siginfo_from_user_generic() which lets us avoid
compat syscalls (cf. [8]). It should be noted that the addition of
__copy_siginfo_from_user_any() is caused by a bug in the original
implementation of rt_sigqueueinfo(2) (cf. 12).
With upcoming rework for syscall handling things might improve
significantly (cf. [11]) and __copy_siginfo_from_user_any() will not gain
any additional callers.

/* testing */
This patch was tested on x64 and x86.

/* userspace usage */
An asciinema recording for the basic functionality can be found under [9].
With this patch a process can be killed via:

 #define _GNU_SOURCE
 #include <errno.h>
 #include <fcntl.h>
 #include <signal.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/stat.h>
 #include <sys/syscall.h>
 #include <sys/types.h>
 #include <unistd.h>

 static inline int do_pidfd_send_signal(int pidfd, int sig, siginfo_t *info,
                                         unsigned int flags)
 {
 #ifdef __NR_pidfd_send_signal
         return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags);
 #else
         return -ENOSYS;
 #endif
 }

 int main(int argc, char *argv[])
 {
         int fd, ret, saved_errno, sig;

         if (argc < 3)
                 exit(EXIT_FAILURE);

         fd = open(argv[1], O_DIRECTORY | O_CLOEXEC);
         if (fd < 0) {
                 printf("%s - Failed to open \"%s\"\n", strerror(errno), argv[1]);
                 exit(EXIT_FAILURE);
         }

         sig = atoi(argv[2]);

         printf("Sending signal %d to process %s\n", sig, argv[1]);
         ret = do_pidfd_send_signal(fd, sig, NULL, 0);

         saved_errno = errno;
         close(fd);
         errno = saved_errno;

         if (ret < 0) {
                 printf("%s - Failed to send signal %d to process %s\n",
                        strerror(errno), sig, argv[1]);
                 exit(EXIT_FAILURE);
         }

         exit(EXIT_SUCCESS);
 }

/* Q&A
 * Given that it seems the same questions get asked again by people who are
 * late to the party it makes sense to add a Q&A section to the commit
 * message so it's hopefully easier to avoid duplicate threads.
 *
 * For the sake of progress please consider these arguments settled unless
 * there is a new point that desperately needs to be addressed. Please make
 * sure to check the links to the threads in this commit message whether
 * this has not already been covered.
 */
Q-01: (Florian Weimer [20], Andrew Morton [21])
      What happens when the target process has exited?
A-01: Sending the signal will fail with ESRCH (cf. [22]).

Q-02:  (Andrew Morton [21])
       Is the task_struct pinned by the fd?
A-02:  No. A reference to struct pid is kept. struct pid - as far as I
       understand - was created exactly for the reason to not require to
       pin struct task_struct (cf. [22]).

Q-03: (Andrew Morton [21])
      Does the entire procfs directory remain visible? Just one entry
      within it?
A-03: The same thing that happens right now when you hold a file descriptor
      to /proc/<pid> open (cf. [22]).

Q-04: (Andrew Morton [21])
      Does the pid remain reserved?
A-04: No. This patchset guarantees a stable handle not that pids are not
      recycled (cf. [22]).

Q-05: (Andrew Morton [21])
      Do attempts to signal that fd return errors?
A-05: See {Q,A}-01.

Q-06: (Andrew Morton [22])
      Is there a cleaner way of obtaining the fd? Another syscall perhaps.
A-06: Userspace can already trivially retrieve file descriptors from procfs
      so this is something that we will need to support anyway. Hence,
      there's no immediate need to add another syscalls just to make
      pidfd_send_signal() not dependent on the presence of procfs. However,
      adding a syscalls to get such file descriptors is planned for a
      future patchset (cf. [22]).

Q-07: (Andrew Morton [21] and others)
      This fd-for-a-process sounds like a handy thing and people may well
      think up other uses for it in the future, probably unrelated to
      signals. Are the code and the interface designed to permit such
      future applications?
A-07: Yes (cf. [22]).

Q-08: (Andrew Morton [21] and others)
      Now I think about it, why a new syscall? This thing is looking
      rather like an ioctl?
A-08: This has been extensively discussed. It was agreed that a syscall is
      preferred for a variety or reasons. Here are just a few taken from
      prior threads. Syscalls are safer than ioctl()s especially when
      signaling to fds. Processes are a core kernel concept so a syscall
      seems more appropriate. The layout of the syscall with its four
      arguments would require the addition of a custom struct for the
      ioctl() thereby causing at least the same amount or even more
      complexity for userspace than a simple syscall. The new syscall will
      replace multiple other pid-based syscalls (see description above).
      The file-descriptors-for-processes concept introduced with this
      syscall will be extended with other syscalls in the future. See also
      [22], [23] and various other threads already linked in here.

Q-09: (Florian Weimer [24])
      What happens if you use the new interface with an O_PATH descriptor?
A-09:
      pidfds opened as O_PATH fds cannot be used to send signals to a
      process (cf. [2]). Signaling processes through pidfds is the
      equivalent of writing to a file. Thus, this is not an operation that
      operates "purely at the file descriptor level" as required by the
      open(2) manpage. See also [4].

/* References */
[1]:  https://lore.kernel.org/lkml/20181029221037.87724-1-dancol@google.com/
[2]:  https://lore.kernel.org/lkml/874lbtjvtd.fsf@oldenburg2.str.redhat.com/
[3]:  https://lore.kernel.org/lkml/20181204132604.aspfupwjgjx6fhva@brauner.io/
[4]:  https://lore.kernel.org/lkml/20181203180224.fkvw4kajtbvru2ku@brauner.io/
[5]:  https://lore.kernel.org/lkml/20181121213946.GA10795@mail.hallyn.com/
[6]:  https://lore.kernel.org/lkml/20181120103111.etlqp7zop34v6nv4@brauner.io/
[7]:  https://lore.kernel.org/lkml/36323361-90BD-41AF-AB5B-EE0D7BA02C21@amacapital.net/
[8]:  https://lore.kernel.org/lkml/87tvjxp8pc.fsf@xmission.com/
[9]:  https://asciinema.org/a/IQjuCHew6bnq1cr78yuMv16cy
[11]: https://lore.kernel.org/lkml/F53D6D38-3521-4C20-9034-5AF447DF62FF@amacapital.net/
[12]: https://lore.kernel.org/lkml/87zhtjn8ck.fsf@xmission.com/
[13]: https://lore.kernel.org/lkml/871s6u9z6u.fsf@xmission.com/
[14]: https://lore.kernel.org/lkml/20181206231742.xxi4ghn24z4h2qki@brauner.io/
[15]: https://lore.kernel.org/lkml/20181207003124.GA11160@mail.hallyn.com/
[16]: https://lore.kernel.org/lkml/20181207015423.4miorx43l3qhppfz@brauner.io/
[17]: https://lore.kernel.org/lkml/CAGXu5jL8PciZAXvOvCeCU3wKUEB_dU-O3q0tDw4uB_ojMvDEew@mail.gmail.com/
[18]: https://lore.kernel.org/lkml/20181206222746.GB9224@mail.hallyn.com/
[19]: https://lore.kernel.org/lkml/20181208054059.19813-1-christian@brauner.io/
[20]: https://lore.kernel.org/lkml/8736rebl9s.fsf@oldenburg.str.redhat.com/
[21]: https://lore.kernel.org/lkml/20181228152012.dbf0508c2508138efc5f2bbe@linux-foundation.org/
[22]: https://lore.kernel.org/lkml/20181228233725.722tdfgijxcssg76@brauner.io/
[23]: https://lwn.net/Articles/773459/
[24]: https://lore.kernel.org/lkml/8736rebl9s.fsf@oldenburg.str.redhat.com/
[25]: https://lore.kernel.org/lkml/CAK8P3a0ej9NcJM8wXNPbcGUyOUZYX+VLoDFdbenW3s3114oQZw@mail.gmail.com/

Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Jann Horn <jannh@google.com>
Cc: Andy Lutomirsky <luto@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Florian Weimer <fweimer@redhat.com>
Signed-off-by: Christian Brauner <christian@brauner.io>
Reviewed-by: Tycho Andersen <tycho@tycho.ws>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Serge Hallyn <serge@hallyn.com>
Acked-by: Aleksa Sarai <cyphar@cyphar.com>
---
 arch/x86/entry/syscalls/syscall_32.tbl |   1 +
 arch/x86/entry/syscalls/syscall_64.tbl |   1 +
 fs/proc/base.c                         |   9 +++
 include/linux/proc_fs.h                |   6 ++
 include/linux/syscalls.h               |   3 +
 include/uapi/asm-generic/unistd.h      |   4 +-
 kernel/signal.c                        | 133 +++++++++++++++++++++++++++++++--
 kernel/sys_ni.c                        |   1 +
 8 files changed, 151 insertions(+), 7 deletions(-)

(limited to 'include/uapi')

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 3cf7b533b3d1..234d91df8ca6 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -398,3 +398,4 @@
 384	i386	arch_prctl		sys_arch_prctl			__ia32_compat_sys_arch_prctl
 385	i386	io_pgetevents		sys_io_pgetevents		__ia32_compat_sys_io_pgetevents
 386	i386	rseq			sys_rseq			__ia32_sys_rseq
+424	i386	pidfd_send_signal	sys_pidfd_send_signal		__ia32_sys_pidfd_send_signal
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index f0b1709a5ffb..58f4b3ad4fe0 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -343,6 +343,7 @@
 332	common	statx			__x64_sys_statx
 333	common	io_pgetevents		__x64_sys_io_pgetevents
 334	common	rseq			__x64_sys_rseq
+424	common	pidfd_send_signal	__x64_sys_pidfd_send_signal
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 633a63462573..b6627c471078 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3046,6 +3046,15 @@ static const struct file_operations proc_tgid_base_operations = {
 	.llseek		= generic_file_llseek,
 };
 
+struct pid *tgid_pidfd_to_pid(const struct file *file)
+{
+	if (!d_is_dir(file->f_path.dentry) ||
+	    (file->f_op != &proc_tgid_base_operations))
+		return ERR_PTR(-EBADF);
+
+	return proc_pid(file_inode(file));
+}
+
 static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 {
 	return proc_pident_lookup(dir, dentry,
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index d0e1f1522a78..52a283ba0465 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -73,6 +73,7 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo
 						    int (*show)(struct seq_file *, void *),
 						    proc_write_t write,
 						    void *data);
+extern struct pid *tgid_pidfd_to_pid(const struct file *file);
 
 #else /* CONFIG_PROC_FS */
 
@@ -114,6 +115,11 @@ static inline int remove_proc_subtree(const char *name, struct proc_dir_entry *p
 #define proc_create_net(name, mode, parent, state_size, ops) ({NULL;})
 #define proc_create_net_single(name, mode, parent, show, data) ({NULL;})
 
+static inline struct pid *tgid_pidfd_to_pid(const struct file *file)
+{
+	return ERR_PTR(-EBADF);
+}
+
 #endif /* CONFIG_PROC_FS */
 
 struct net;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 257cccba3062..5eb2e351675e 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -926,6 +926,9 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
 			  unsigned mask, struct statx __user *buffer);
 asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len,
 			 int flags, uint32_t sig);
+asmlinkage long sys_pidfd_send_signal(int pidfd, int sig,
+				       siginfo_t __user *info,
+				       unsigned int flags);
 
 /*
  * Architecture-specific system calls
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index d90127298f12..c861e7d1053b 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -740,9 +740,11 @@ __SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents)
 __SYSCALL(__NR_rseq, sys_rseq)
 #define __NR_kexec_file_load 294
 __SYSCALL(__NR_kexec_file_load,     sys_kexec_file_load)
+#define __NR_pidfd_send_signal 424
+__SYSCALL(__NR_pidfd_send_signal, sys_pidfd_send_signal)
 
 #undef __NR_syscalls
-#define __NR_syscalls 295
+#define __NR_syscalls 425
 
 /*
  * 32 bit systems traditionally used different
diff --git a/kernel/signal.c b/kernel/signal.c
index e1d7ad8e6ab1..268bed80244f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -19,7 +19,9 @@
 #include <linux/sched/task.h>
 #include <linux/sched/task_stack.h>
 #include <linux/sched/cputime.h>
+#include <linux/file.h>
 #include <linux/fs.h>
+#include <linux/proc_fs.h>
 #include <linux/tty.h>
 #include <linux/binfmts.h>
 #include <linux/coredump.h>
@@ -3429,6 +3431,16 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
 #endif
 #endif
 
+static inline void prepare_kill_siginfo(int sig, struct kernel_siginfo *info)
+{
+	clear_siginfo(info);
+	info->si_signo = sig;
+	info->si_errno = 0;
+	info->si_code = SI_USER;
+	info->si_pid = task_tgid_vnr(current);
+	info->si_uid = from_kuid_munged(current_user_ns(), current_uid());
+}
+
 /**
  *  sys_kill - send a signal to a process
  *  @pid: the PID of the process
@@ -3438,16 +3450,125 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
 {
 	struct kernel_siginfo info;
 
-	clear_siginfo(&info);
-	info.si_signo = sig;
-	info.si_errno = 0;
-	info.si_code = SI_USER;
-	info.si_pid = task_tgid_vnr(current);
-	info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
+	prepare_kill_siginfo(sig, &info);
 
 	return kill_something_info(sig, &info, pid);
 }
 
+#ifdef CONFIG_PROC_FS
+/*
+ * Verify that the signaler and signalee either are in the same pid namespace
+ * or that the signaler's pid namespace is an ancestor of the signalee's pid
+ * namespace.
+ */
+static bool access_pidfd_pidns(struct pid *pid)
+{
+	struct pid_namespace *active = task_active_pid_ns(current);
+	struct pid_namespace *p = ns_of_pid(pid);
+
+	for (;;) {
+		if (!p)
+			return false;
+		if (p == active)
+			break;
+		p = p->parent;
+	}
+
+	return true;
+}
+
+static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info)
+{
+#ifdef CONFIG_COMPAT
+	/*
+	 * Avoid hooking up compat syscalls and instead handle necessary
+	 * conversions here. Note, this is a stop-gap measure and should not be
+	 * considered a generic solution.
+	 */
+	if (in_compat_syscall())
+		return copy_siginfo_from_user32(
+			kinfo, (struct compat_siginfo __user *)info);
+#endif
+	return copy_siginfo_from_user(kinfo, info);
+}
+
+/**
+ * sys_pidfd_send_signal - send a signal to a process through a task file
+ *                          descriptor
+ * @pidfd:  the file descriptor of the process
+ * @sig:    signal to be sent
+ * @info:   the signal info
+ * @flags:  future flags to be passed
+ *
+ * The syscall currently only signals via PIDTYPE_PID which covers
+ * kill(<positive-pid>, <signal>. It does not signal threads or process
+ * groups.
+ * In order to extend the syscall to threads and process groups the @flags
+ * argument should be used. In essence, the @flags argument will determine
+ * what is signaled and not the file descriptor itself. Put in other words,
+ * grouping is a property of the flags argument not a property of the file
+ * descriptor.
+ *
+ * Return: 0 on success, negative errno on failure
+ */
+SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
+		siginfo_t __user *, info, unsigned int, flags)
+{
+	int ret;
+	struct fd f;
+	struct pid *pid;
+	kernel_siginfo_t kinfo;
+
+	/* Enforce flags be set to 0 until we add an extension. */
+	if (flags)
+		return -EINVAL;
+
+	f = fdget_raw(pidfd);
+	if (!f.file)
+		return -EBADF;
+
+	/* Is this a pidfd? */
+	pid = tgid_pidfd_to_pid(f.file);
+	if (IS_ERR(pid)) {
+		ret = PTR_ERR(pid);
+		goto err;
+	}
+
+	ret = -EINVAL;
+	if (!access_pidfd_pidns(pid))
+		goto err;
+
+	if (info) {
+		ret = copy_siginfo_from_user_any(&kinfo, info);
+		if (unlikely(ret))
+			goto err;
+
+		ret = -EINVAL;
+		if (unlikely(sig != kinfo.si_signo))
+			goto err;
+
+		if ((task_pid(current) != pid) &&
+		    (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) {
+			/* Only allow sending arbitrary signals to yourself. */
+			ret = -EPERM;
+			if (kinfo.si_code != SI_USER)
+				goto err;
+
+			/* Turn this into a regular kill signal. */
+			prepare_kill_siginfo(sig, &kinfo);
+		}
+	} else {
+		prepare_kill_siginfo(sig, &kinfo);
+	}
+
+	ret = kill_pid_info(sig, &kinfo, pid);
+
+err:
+	fdput(f);
+	return ret;
+}
+#endif /* CONFIG_PROC_FS */
+
 static int
 do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info)
 {
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index ab9d0e3c6d50..f905f4f9f677 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -163,6 +163,7 @@ COND_SYSCALL(syslog);
 /* kernel/sched/core.c */
 
 /* kernel/signal.c */
+COND_SYSCALL(pidfd_send_signal);
 
 /* kernel/sys.c */
 COND_SYSCALL(setregid);
-- 
cgit v1.2.3-59-g8ed1b


From ca215086b14b89a0e70fc211314944aa6ce50020 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 5 Mar 2019 15:42:23 -0800
Subject: mm: convert PG_balloon to PG_offline

PG_balloon was introduced to implement page migration/compaction for
pages inflated in virtio-balloon.  Nowadays, it is only a marker that a
page is part of virtio-balloon and therefore logically offline.

We also want to make use of this flag in other balloon drivers - for
inflated pages or when onlining a section but keeping some pages offline
(e.g.  used right now by XEN and Hyper-V via set_online_page_callback()).

We are going to expose this flag to dump tools like makedumpfile.  But
instead of exposing PG_balloon, let's generalize the concept of marking
pages as logically offline, so it can be reused for other purposes later
on.

Rename PG_balloon to PG_offline.  This is an indicator that the page is
logically offline, the content stale and that it should not be touched
(e.g.  a hypervisor would have to allocate backing storage in order for
the guest to dump an unused page).  We can then e.g.  exclude such pages
from dumps.

We replace and reuse KPF_BALLOON (23), as this shouldn't really harm
(and for now the semantics stay the same).  In following patches, we
will make use of this bit also in other balloon drivers.  While at it,
document PGTABLE.

[akpm@linux-foundation.org: fix comment text, per David]
Link: http://lkml.kernel.org/r/20181119101616.8901-3-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Acked-by: Konstantin Khlebnikov <koct9i@gmail.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Pankaj gupta <pagupta@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Christian Hansen <chansen3@cisco.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Pavel Tatashin <pasha.tatashin@oracle.com>
Cc: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Miles Chen <miles.chen@mediatek.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Kazuhito Hagio <k-hagio@ab.jp.nec.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baoquan He <bhe@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Julien Freche <jfreche@vmware.com>
Cc: Kairui Song <kasong@redhat.com>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Lianbo Jiang <lijiang@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Nadav Amit <namit@vmware.com>
Cc: Omar Sandoval <osandov@fb.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: Xavier Deguillard <xdeguillard@vmware.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/admin-guide/mm/pagemap.rst |  9 ++++++---
 fs/proc/page.c                           |  4 ++--
 include/linux/balloon_compaction.h       |  8 ++++----
 include/linux/page-flags.h               | 11 +++++++----
 include/uapi/linux/kernel-page-flags.h   |  2 +-
 tools/vm/page-types.c                    |  2 +-
 6 files changed, 21 insertions(+), 15 deletions(-)

(limited to 'include/uapi')

diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst
index 3f7bade2c231..340a5aee9b80 100644
--- a/Documentation/admin-guide/mm/pagemap.rst
+++ b/Documentation/admin-guide/mm/pagemap.rst
@@ -75,9 +75,10 @@ number of times a page is mapped.
     20. NOPAGE
     21. KSM
     22. THP
-    23. BALLOON
+    23. OFFLINE
     24. ZERO_PAGE
     25. IDLE
+    26. PGTABLE
 
  * ``/proc/kpagecgroup``.  This file contains a 64-bit inode number of the
    memory cgroup each page is charged to, indexed by PFN. Only available when
@@ -118,8 +119,8 @@ Short descriptions to the page flags
     identical memory pages dynamically shared between one or more processes
 22 - THP
     contiguous pages which construct transparent hugepages
-23 - BALLOON
-    balloon compaction page
+23 - OFFLINE
+    page is logically offline
 24 - ZERO_PAGE
     zero page for pfn_zero or huge_zero page
 25 - IDLE
@@ -128,6 +129,8 @@ Short descriptions to the page flags
     Note that this flag may be stale in case the page was accessed via
     a PTE. To make sure the flag is up-to-date one has to read
     ``/sys/kernel/mm/page_idle/bitmap`` first.
+26 - PGTABLE
+    page is in use as a page table
 
 IO related page flags
 ---------------------
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 40b05e0d4274..544d1ee15aee 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -152,8 +152,8 @@ u64 stable_page_flags(struct page *page)
 	else if (page_count(page) == 0 && is_free_buddy_page(page))
 		u |= 1 << KPF_BUDDY;
 
-	if (PageBalloon(page))
-		u |= 1 << KPF_BALLOON;
+	if (PageOffline(page))
+		u |= 1 << KPF_OFFLINE;
 	if (PageTable(page))
 		u |= 1 << KPF_PGTABLE;
 
diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
index cbe50da5a59d..f111c780ef1d 100644
--- a/include/linux/balloon_compaction.h
+++ b/include/linux/balloon_compaction.h
@@ -95,7 +95,7 @@ extern int balloon_page_migrate(struct address_space *mapping,
 static inline void balloon_page_insert(struct balloon_dev_info *balloon,
 				       struct page *page)
 {
-	__SetPageBalloon(page);
+	__SetPageOffline(page);
 	__SetPageMovable(page, balloon->inode->i_mapping);
 	set_page_private(page, (unsigned long)balloon);
 	list_add(&page->lru, &balloon->pages);
@@ -111,7 +111,7 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon,
  */
 static inline void balloon_page_delete(struct page *page)
 {
-	__ClearPageBalloon(page);
+	__ClearPageOffline(page);
 	__ClearPageMovable(page);
 	set_page_private(page, 0);
 	/*
@@ -141,13 +141,13 @@ static inline gfp_t balloon_mapping_gfp_mask(void)
 static inline void balloon_page_insert(struct balloon_dev_info *balloon,
 				       struct page *page)
 {
-	__SetPageBalloon(page);
+	__SetPageOffline(page);
 	list_add(&page->lru, &balloon->pages);
 }
 
 static inline void balloon_page_delete(struct page *page)
 {
-	__ClearPageBalloon(page);
+	__ClearPageOffline(page);
 	list_del(&page->lru);
 }
 
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 39b4494e29f1..808b4183e30d 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -671,7 +671,7 @@ PAGEFLAG_FALSE(DoubleMap)
 /* Reserve		0x0000007f to catch underflows of page_mapcount */
 #define PAGE_MAPCOUNT_RESERVE	-128
 #define PG_buddy	0x00000080
-#define PG_balloon	0x00000100
+#define PG_offline	0x00000100
 #define PG_kmemcg	0x00000200
 #define PG_table	0x00000400
 
@@ -706,10 +706,13 @@ static __always_inline void __ClearPage##uname(struct page *page)	\
 PAGE_TYPE_OPS(Buddy, buddy)
 
 /*
- * PageBalloon() is true for pages that are on the balloon page list
- * (see mm/balloon_compaction.c).
+ * PageOffline() indicates that the page is logically offline although the
+ * containing section is online. (e.g. inflated in a balloon driver or
+ * not onlined when onlining the section).
+ * The content of these pages is effectively stale. Such pages should not
+ * be touched (read/write/dump/save) except by their owner.
  */
-PAGE_TYPE_OPS(Balloon, balloon)
+PAGE_TYPE_OPS(Offline, offline)
 
 /*
  * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on
diff --git a/include/uapi/linux/kernel-page-flags.h b/include/uapi/linux/kernel-page-flags.h
index 21b9113c69da..6f2f2720f3ac 100644
--- a/include/uapi/linux/kernel-page-flags.h
+++ b/include/uapi/linux/kernel-page-flags.h
@@ -32,7 +32,7 @@
 
 #define KPF_KSM			21
 #define KPF_THP			22
-#define KPF_BALLOON		23
+#define KPF_OFFLINE		23
 #define KPF_ZERO_PAGE		24
 #define KPF_IDLE		25
 #define KPF_PGTABLE		26
diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c
index 1ff3a6c0367b..6f64b2b93234 100644
--- a/tools/vm/page-types.c
+++ b/tools/vm/page-types.c
@@ -133,7 +133,7 @@ static const char * const page_flag_names[] = {
 	[KPF_NOPAGE]		= "n:nopage",
 	[KPF_KSM]		= "x:ksm",
 	[KPF_THP]		= "t:thp",
-	[KPF_BALLOON]		= "o:balloon",
+	[KPF_OFFLINE]		= "o:offline",
 	[KPF_PGTABLE]		= "g:pgtable",
 	[KPF_ZERO_PAGE]		= "z:zero_page",
 	[KPF_IDLE]              = "i:idle_page",
-- 
cgit v1.2.3-59-g8ed1b


From ab3948f58ff841e51feb845720624665ef5b7ef3 Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Tue, 5 Mar 2019 15:47:54 -0800
Subject: mm/memfd: add an F_SEAL_FUTURE_WRITE seal to memfd

Android uses ashmem for sharing memory regions.  We are looking forward
to migrating all usecases of ashmem to memfd so that we can possibly
remove the ashmem driver in the future from staging while also
benefiting from using memfd and contributing to it.  Note staging
drivers are also not ABI and generally can be removed at anytime.

One of the main usecases Android has is the ability to create a region
and mmap it as writeable, then add protection against making any
"future" writes while keeping the existing already mmap'ed
writeable-region active.  This allows us to implement a usecase where
receivers of the shared memory buffer can get a read-only view, while
the sender continues to write to the buffer.  See CursorWindow
documentation in Android for more details:

  https://developer.android.com/reference/android/database/CursorWindow

This usecase cannot be implemented with the existing F_SEAL_WRITE seal.
To support the usecase, this patch adds a new F_SEAL_FUTURE_WRITE seal
which prevents any future mmap and write syscalls from succeeding while
keeping the existing mmap active.

A better way to do F_SEAL_FUTURE_WRITE seal was discussed [1] last week
where we don't need to modify core VFS structures to get the same
behavior of the seal.  This solves several side-effects pointed by Andy.
self-tests are provided in later patch to verify the expected semantics.

[1] https://lore.kernel.org/lkml/20181111173650.GA256781@google.com/

Thanks a lot to Andy for suggestions to improve code.

Link: http://lkml.kernel.org/r/20190112203816.85534-2-joel@joelfernandes.org
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Acked-by: John Stultz <john.stultz@linaro.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Jann Horn <jannh@google.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: J. Bruce Fields <bfields@fieldses.org>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Marc-Andr Lureau <marcandre.lureau@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hugetlbfs/inode.c       |  2 +-
 include/uapi/linux/fcntl.h |  1 +
 mm/memfd.c                 |  3 ++-
 mm/shmem.c                 | 25 ++++++++++++++++++++++---
 4 files changed, 26 insertions(+), 5 deletions(-)

(limited to 'include/uapi')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a7fa037b876b..b0eef008de67 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -530,7 +530,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 		inode_lock(inode);
 
 		/* protected by i_mutex */
-		if (info->seals & F_SEAL_WRITE) {
+		if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
 			inode_unlock(inode);
 			return -EPERM;
 		}
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 6448cdd9a350..a2f8658f1c55 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -41,6 +41,7 @@
 #define F_SEAL_SHRINK	0x0002	/* prevent file from shrinking */
 #define F_SEAL_GROW	0x0004	/* prevent file from growing */
 #define F_SEAL_WRITE	0x0008	/* prevent writes */
+#define F_SEAL_FUTURE_WRITE	0x0010  /* prevent future writes while mapped */
 /* (1U << 31) is reserved for signed error codes */
 
 /*
diff --git a/mm/memfd.c b/mm/memfd.c
index 97264c79d2cd..650e65a46b9c 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -131,7 +131,8 @@ static unsigned int *memfd_file_seals_ptr(struct file *file)
 #define F_ALL_SEALS (F_SEAL_SEAL | \
 		     F_SEAL_SHRINK | \
 		     F_SEAL_GROW | \
-		     F_SEAL_WRITE)
+		     F_SEAL_WRITE | \
+		     F_SEAL_FUTURE_WRITE)
 
 static int memfd_add_seals(struct file *file, unsigned int seals)
 {
diff --git a/mm/shmem.c b/mm/shmem.c
index 283a1833dafc..b3db3779a30a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2190,6 +2190,24 @@ out_nomem:
 
 static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
 {
+	struct shmem_inode_info *info = SHMEM_I(file_inode(file));
+
+	if (info->seals & F_SEAL_FUTURE_WRITE) {
+		/*
+		 * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
+		 * "future write" seal active.
+		 */
+		if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
+			return -EPERM;
+
+		/*
+		 * Since the F_SEAL_FUTURE_WRITE seals allow for a MAP_SHARED
+		 * read-only mapping, take care to not allow mprotect to revert
+		 * protections.
+		 */
+		vma->vm_flags &= ~(VM_MAYWRITE);
+	}
+
 	file_accessed(file);
 	vma->vm_ops = &shmem_vm_ops;
 	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
@@ -2440,8 +2458,9 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
 	pgoff_t index = pos >> PAGE_SHIFT;
 
 	/* i_mutex is held by caller */
-	if (unlikely(info->seals & (F_SEAL_WRITE | F_SEAL_GROW))) {
-		if (info->seals & F_SEAL_WRITE)
+	if (unlikely(info->seals & (F_SEAL_GROW |
+				   F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
+		if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
 			return -EPERM;
 		if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
 			return -EPERM;
@@ -2704,7 +2723,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 		DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
 
 		/* protected by i_mutex */
-		if (info->seals & F_SEAL_WRITE) {
+		if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
 			error = -EPERM;
 			goto out;
 		}
-- 
cgit v1.2.3-59-g8ed1b


From 221c5eb2338232f7340386de1c43decc32682e58 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 17 Jan 2019 09:41:58 -0700
Subject: io_uring: add support for IORING_OP_POLL

This is basically a direct port of bfe4037e722e, which implements a
one-shot poll command through aio. Description below is based on that
commit as well. However, instead of adding a POLL command and relying
on io_cancel(2) to remove it, we mimic the epoll(2) interface of
having a command to add a poll notification, IORING_OP_POLL_ADD,
and one to remove it again, IORING_OP_POLL_REMOVE.

To poll for a file descriptor the application should submit an sqe of
type IORING_OP_POLL. It will poll the fd for the events specified in the
poll_events field.

Unlike poll or epoll without EPOLLONESHOT this interface always works in
one shot mode, that is once the sqe is completed, it will have to be
resubmitted.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Based-on-code-from: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 263 +++++++++++++++++++++++++++++++++++++++++-
 include/uapi/linux/io_uring.h |   3 +
 2 files changed, 265 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index d2a3a1bc85cc..85b914bd823c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -161,6 +161,7 @@ struct io_ring_ctx {
 		 * manipulate the list, hence no extra locking is needed there.
 		 */
 		struct list_head	poll_list;
+		struct list_head	cancel_list;
 	} ____cacheline_aligned_in_smp;
 
 #if defined(CONFIG_UNIX)
@@ -176,8 +177,20 @@ struct sqe_submit {
 	bool				needs_fixed_file;
 };
 
+struct io_poll_iocb {
+	struct file			*file;
+	struct wait_queue_head		*head;
+	__poll_t			events;
+	bool				woken;
+	bool				canceled;
+	struct wait_queue_entry		wait;
+};
+
 struct io_kiocb {
-	struct kiocb		rw;
+	union {
+		struct kiocb		rw;
+		struct io_poll_iocb	poll;
+	};
 
 	struct sqe_submit	submit;
 
@@ -261,6 +274,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	init_waitqueue_head(&ctx->wait);
 	spin_lock_init(&ctx->completion_lock);
 	INIT_LIST_HEAD(&ctx->poll_list);
+	INIT_LIST_HEAD(&ctx->cancel_list);
 	return ctx;
 }
 
@@ -1058,6 +1072,246 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	return 0;
 }
 
+static void io_poll_remove_one(struct io_kiocb *req)
+{
+	struct io_poll_iocb *poll = &req->poll;
+
+	spin_lock(&poll->head->lock);
+	WRITE_ONCE(poll->canceled, true);
+	if (!list_empty(&poll->wait.entry)) {
+		list_del_init(&poll->wait.entry);
+		queue_work(req->ctx->sqo_wq, &req->work);
+	}
+	spin_unlock(&poll->head->lock);
+
+	list_del_init(&req->list);
+}
+
+static void io_poll_remove_all(struct io_ring_ctx *ctx)
+{
+	struct io_kiocb *req;
+
+	spin_lock_irq(&ctx->completion_lock);
+	while (!list_empty(&ctx->cancel_list)) {
+		req = list_first_entry(&ctx->cancel_list, struct io_kiocb,list);
+		io_poll_remove_one(req);
+	}
+	spin_unlock_irq(&ctx->completion_lock);
+}
+
+/*
+ * Find a running poll command that matches one specified in sqe->addr,
+ * and remove it if found.
+ */
+static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	struct io_kiocb *poll_req, *next;
+	int ret = -ENOENT;
+
+	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
+	if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
+	    sqe->poll_events)
+		return -EINVAL;
+
+	spin_lock_irq(&ctx->completion_lock);
+	list_for_each_entry_safe(poll_req, next, &ctx->cancel_list, list) {
+		if (READ_ONCE(sqe->addr) == poll_req->user_data) {
+			io_poll_remove_one(poll_req);
+			ret = 0;
+			break;
+		}
+	}
+	spin_unlock_irq(&ctx->completion_lock);
+
+	io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
+	io_free_req(req);
+	return 0;
+}
+
+static void io_poll_complete(struct io_kiocb *req, __poll_t mask)
+{
+	io_cqring_add_event(req->ctx, req->user_data, mangle_poll(mask), 0);
+	io_fput(req);
+	io_free_req(req);
+}
+
+static void io_poll_complete_work(struct work_struct *work)
+{
+	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+	struct io_poll_iocb *poll = &req->poll;
+	struct poll_table_struct pt = { ._key = poll->events };
+	struct io_ring_ctx *ctx = req->ctx;
+	__poll_t mask = 0;
+
+	if (!READ_ONCE(poll->canceled))
+		mask = vfs_poll(poll->file, &pt) & poll->events;
+
+	/*
+	 * Note that ->ki_cancel callers also delete iocb from active_reqs after
+	 * calling ->ki_cancel.  We need the ctx_lock roundtrip here to
+	 * synchronize with them.  In the cancellation case the list_del_init
+	 * itself is not actually needed, but harmless so we keep it in to
+	 * avoid further branches in the fast path.
+	 */
+	spin_lock_irq(&ctx->completion_lock);
+	if (!mask && !READ_ONCE(poll->canceled)) {
+		add_wait_queue(poll->head, &poll->wait);
+		spin_unlock_irq(&ctx->completion_lock);
+		return;
+	}
+	list_del_init(&req->list);
+	spin_unlock_irq(&ctx->completion_lock);
+
+	io_poll_complete(req, mask);
+}
+
+static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
+			void *key)
+{
+	struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
+							wait);
+	struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
+	struct io_ring_ctx *ctx = req->ctx;
+	__poll_t mask = key_to_poll(key);
+
+	poll->woken = true;
+
+	/* for instances that support it check for an event match first: */
+	if (mask) {
+		unsigned long flags;
+
+		if (!(mask & poll->events))
+			return 0;
+
+		/* try to complete the iocb inline if we can: */
+		if (spin_trylock_irqsave(&ctx->completion_lock, flags)) {
+			list_del(&req->list);
+			spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+			list_del_init(&poll->wait.entry);
+			io_poll_complete(req, mask);
+			return 1;
+		}
+	}
+
+	list_del_init(&poll->wait.entry);
+	queue_work(ctx->sqo_wq, &req->work);
+	return 1;
+}
+
+struct io_poll_table {
+	struct poll_table_struct pt;
+	struct io_kiocb *req;
+	int error;
+};
+
+static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
+			       struct poll_table_struct *p)
+{
+	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
+
+	if (unlikely(pt->req->poll.head)) {
+		pt->error = -EINVAL;
+		return;
+	}
+
+	pt->error = 0;
+	pt->req->poll.head = head;
+	add_wait_queue(head, &pt->req->poll.wait);
+}
+
+static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_poll_iocb *poll = &req->poll;
+	struct io_ring_ctx *ctx = req->ctx;
+	struct io_poll_table ipt;
+	unsigned flags;
+	__poll_t mask;
+	u16 events;
+	int fd;
+
+	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
+	if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
+		return -EINVAL;
+
+	INIT_WORK(&req->work, io_poll_complete_work);
+	events = READ_ONCE(sqe->poll_events);
+	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
+
+	flags = READ_ONCE(sqe->flags);
+	fd = READ_ONCE(sqe->fd);
+
+	if (flags & IOSQE_FIXED_FILE) {
+		if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
+			return -EBADF;
+		poll->file = ctx->user_files[fd];
+		req->flags |= REQ_F_FIXED_FILE;
+	} else {
+		poll->file = fget(fd);
+	}
+	if (unlikely(!poll->file))
+		return -EBADF;
+
+	poll->head = NULL;
+	poll->woken = false;
+	poll->canceled = false;
+
+	ipt.pt._qproc = io_poll_queue_proc;
+	ipt.pt._key = poll->events;
+	ipt.req = req;
+	ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
+
+	/* initialized the list so that we can do list_empty checks */
+	INIT_LIST_HEAD(&poll->wait.entry);
+	init_waitqueue_func_entry(&poll->wait, io_poll_wake);
+
+	/* one for removal from waitqueue, one for this function */
+	refcount_set(&req->refs, 2);
+
+	mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
+	if (unlikely(!poll->head)) {
+		/* we did not manage to set up a waitqueue, done */
+		goto out;
+	}
+
+	spin_lock_irq(&ctx->completion_lock);
+	spin_lock(&poll->head->lock);
+	if (poll->woken) {
+		/* wake_up context handles the rest */
+		mask = 0;
+		ipt.error = 0;
+	} else if (mask || ipt.error) {
+		/* if we get an error or a mask we are done */
+		WARN_ON_ONCE(list_empty(&poll->wait.entry));
+		list_del_init(&poll->wait.entry);
+	} else {
+		/* actually waiting for an event */
+		list_add_tail(&req->list, &ctx->cancel_list);
+	}
+	spin_unlock(&poll->head->lock);
+	spin_unlock_irq(&ctx->completion_lock);
+
+out:
+	if (unlikely(ipt.error)) {
+		if (!(flags & IOSQE_FIXED_FILE))
+			fput(poll->file);
+		/*
+		 * Drop one of our refs to this req, __io_submit_sqe() will
+		 * drop the other one since we're returning an error.
+		 */
+		io_free_req(req);
+		return ipt.error;
+	}
+
+	if (mask)
+		io_poll_complete(req, mask);
+	io_free_req(req);
+	return 0;
+}
+
 static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 			   const struct sqe_submit *s, bool force_nonblock,
 			   struct io_submit_state *state)
@@ -1093,6 +1347,12 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	case IORING_OP_FSYNC:
 		ret = io_fsync(req, s->sqe, force_nonblock);
 		break;
+	case IORING_OP_POLL_ADD:
+		ret = io_poll_add(req, s->sqe);
+		break;
+	case IORING_OP_POLL_REMOVE:
+		ret = io_poll_remove(req, s->sqe);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -2131,6 +2391,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 	percpu_ref_kill(&ctx->refs);
 	mutex_unlock(&ctx->uring_lock);
 
+	io_poll_remove_all(ctx);
 	io_iopoll_reap_events(ctx);
 	wait_for_completion(&ctx->ctx_done);
 	io_ring_ctx_free(ctx);
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 0ec74bab8dbe..e23408692118 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -25,6 +25,7 @@ struct io_uring_sqe {
 	union {
 		__kernel_rwf_t	rw_flags;
 		__u32		fsync_flags;
+		__u16		poll_events;
 	};
 	__u64	user_data;	/* data to be passed back at completion time */
 	union {
@@ -51,6 +52,8 @@ struct io_uring_sqe {
 #define IORING_OP_FSYNC		3
 #define IORING_OP_READ_FIXED	4
 #define IORING_OP_WRITE_FIXED	5
+#define IORING_OP_POLL_ADD	6
+#define IORING_OP_POLL_REMOVE	7
 
 /*
  * sqe->fsync_flags
-- 
cgit v1.2.3-59-g8ed1b


From 54d50897d544c874562253e2a8f70dfcad22afe8 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Thu, 7 Mar 2019 16:27:14 -0800
Subject: linux/kernel.h: split *_MAX and *_MIN macros into <linux/limits.h>

<linux/kernel.h> tends to be cluttered because we often put various sort
of unrelated stuff in it.  So, we have split out a sensible chunk of
code into a separate header from time to time.

This commit splits out the *_MAX and *_MIN defines.

The standard header <limits.h> contains various MAX, MIN constants
including numerial limits.  [1]

I think it makes sense to move in-kernel MAX, MIN constants into
include/linux/limits.h.

We already have include/uapi/linux/limits.h to contain some user-space
constants.  I changed its include guard to _UAPI_LINUX_LIMITS_H.  This
change has no impact to the user-space because
scripts/headers_install.sh rips off the '_UAPI' prefix from the include
guards of exported headers.

[1] http://pubs.opengroup.org/onlinepubs/009604499/basedefs/limits.h.html

Link: http://lkml.kernel.org/r/1549156242-20806-2-git-send-email-yamada.masahiro@socionext.com
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Alex Elder <elder@linaro.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Zhang Yanmin <yanmin.zhang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kernel.h      | 29 +----------------------------
 include/linux/limits.h      | 36 ++++++++++++++++++++++++++++++++++++
 include/uapi/linux/limits.h |  4 ++--
 3 files changed, 39 insertions(+), 30 deletions(-)
 create mode 100644 include/linux/limits.h

(limited to 'include/uapi')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index a9ff66977e10..34a5036debd3 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -4,6 +4,7 @@
 
 
 #include <stdarg.h>
+#include <linux/limits.h>
 #include <linux/linkage.h>
 #include <linux/stddef.h>
 #include <linux/types.h>
@@ -17,34 +18,6 @@
 #include <asm/div64.h>
 #include <uapi/linux/kernel.h>
 
-#define USHRT_MAX	((unsigned short)~0U)
-#define SHRT_MAX	((short)(USHRT_MAX>>1))
-#define SHRT_MIN	((short)(-SHRT_MAX - 1))
-#define INT_MAX		((int)(~0U>>1))
-#define INT_MIN		(-INT_MAX - 1)
-#define UINT_MAX	(~0U)
-#define LONG_MAX	((long)(~0UL>>1))
-#define LONG_MIN	(-LONG_MAX - 1)
-#define ULONG_MAX	(~0UL)
-#define LLONG_MAX	((long long)(~0ULL>>1))
-#define LLONG_MIN	(-LLONG_MAX - 1)
-#define ULLONG_MAX	(~0ULL)
-#define SIZE_MAX	(~(size_t)0)
-#define PHYS_ADDR_MAX	(~(phys_addr_t)0)
-
-#define U8_MAX		((u8)~0U)
-#define S8_MAX		((s8)(U8_MAX>>1))
-#define S8_MIN		((s8)(-S8_MAX - 1))
-#define U16_MAX		((u16)~0U)
-#define S16_MAX		((s16)(U16_MAX>>1))
-#define S16_MIN		((s16)(-S16_MAX - 1))
-#define U32_MAX		((u32)~0U)
-#define S32_MAX		((s32)(U32_MAX>>1))
-#define S32_MIN		((s32)(-S32_MAX - 1))
-#define U64_MAX		((u64)~0ULL)
-#define S64_MAX		((s64)(U64_MAX>>1))
-#define S64_MIN		((s64)(-S64_MAX - 1))
-
 #define STACK_MAGIC	0xdeadbeef
 
 /**
diff --git a/include/linux/limits.h b/include/linux/limits.h
new file mode 100644
index 000000000000..76afcd24ff8c
--- /dev/null
+++ b/include/linux/limits.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_LIMITS_H
+#define _LINUX_LIMITS_H
+
+#include <uapi/linux/limits.h>
+#include <linux/types.h>
+
+#define USHRT_MAX	((unsigned short)~0U)
+#define SHRT_MAX	((short)(USHRT_MAX >> 1))
+#define SHRT_MIN	((short)(-SHRT_MAX - 1))
+#define INT_MAX		((int)(~0U >> 1))
+#define INT_MIN		(-INT_MAX - 1)
+#define UINT_MAX	(~0U)
+#define LONG_MAX	((long)(~0UL >> 1))
+#define LONG_MIN	(-LONG_MAX - 1)
+#define ULONG_MAX	(~0UL)
+#define LLONG_MAX	((long long)(~0ULL >> 1))
+#define LLONG_MIN	(-LLONG_MAX - 1)
+#define ULLONG_MAX	(~0ULL)
+#define SIZE_MAX	(~(size_t)0)
+#define PHYS_ADDR_MAX	(~(phys_addr_t)0)
+
+#define U8_MAX		((u8)~0U)
+#define S8_MAX		((s8)(U8_MAX >> 1))
+#define S8_MIN		((s8)(-S8_MAX - 1))
+#define U16_MAX		((u16)~0U)
+#define S16_MAX		((s16)(U16_MAX >> 1))
+#define S16_MIN		((s16)(-S16_MAX - 1))
+#define U32_MAX		((u32)~0U)
+#define S32_MAX		((s32)(U32_MAX >> 1))
+#define S32_MIN		((s32)(-S32_MAX - 1))
+#define U64_MAX		((u64)~0ULL)
+#define S64_MAX		((s64)(U64_MAX >> 1))
+#define S64_MIN		((s64)(-S64_MAX - 1))
+
+#endif /* _LINUX_LIMITS_H */
diff --git a/include/uapi/linux/limits.h b/include/uapi/linux/limits.h
index c3547f07605c..6bcbe3068761 100644
--- a/include/uapi/linux/limits.h
+++ b/include/uapi/linux/limits.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _LINUX_LIMITS_H
-#define _LINUX_LIMITS_H
+#ifndef _UAPI_LINUX_LIMITS_H
+#define _UAPI_LINUX_LIMITS_H
 
 #define NR_OPEN	        1024
 
-- 
cgit v1.2.3-59-g8ed1b


From 60d6d04ca3abb34d5e89f030dbea440d9715a168 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Thu, 7 Mar 2019 16:29:09 -0800
Subject: autofs: add ignore mount option

Add an autofs file system mount option that can be used to provide a
generic indicator to applications that the mount entry should be ignored
when displaying mount information.

In other OSes that provide autofs and that provide a mount list to user
space based on the kernel mount list a no-op mount option ("ignore" is
the one use on the most common OS) is allowed so that autofs file system
users can optionally use it.

The idea is that it be used by user space programs to exclude autofs
mounts from consideration when reading the mounts list.

Prior to the change to link /etc/mtab to /proc/self/mounts all I needed
to do to achieve this was to use mount(2) and not update the mtab but
now that no longer works.

I know the symlinking happened a long time ago and I considered doing
this then but, at the time I couldn't remember the commonly used option
name and thought persuading the various utility maintainers would be too
hard.

But now I have a RHEL request to do this for compatibility for a widely
used product so I want to go ahead with it and try and enlist the help
of some utility package maintainers.

Clearly, without the option nothing can be done so it's at least a
start.

Link: http://lkml.kernel.org/r/154725123970.11260.6113771566924907275.stgit@pluto-themaw-net
Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/autofs/autofs_i.h         | 1 +
 fs/autofs/inode.c            | 9 ++++++++-
 include/uapi/linux/auto_fs.h | 2 +-
 3 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 3e59f0ed777b..b735f2b1e462 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -105,6 +105,7 @@ struct autofs_wait_queue {
 
 #define AUTOFS_SBI_CATATONIC	0x0001
 #define AUTOFS_SBI_STRICTEXPIRE 0x0002
+#define AUTOFS_SBI_IGNORE	0x0004
 
 struct autofs_sb_info {
 	u32 magic;
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index 078992eee299..8647ecaa89fc 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -89,6 +89,8 @@ static int autofs_show_options(struct seq_file *m, struct dentry *root)
 		seq_printf(m, ",indirect");
 	if (sbi->flags & AUTOFS_SBI_STRICTEXPIRE)
 		seq_printf(m, ",strictexpire");
+	if (sbi->flags & AUTOFS_SBI_IGNORE)
+		seq_printf(m, ",ignore");
 #ifdef CONFIG_CHECKPOINT_RESTORE
 	if (sbi->pipe)
 		seq_printf(m, ",pipe_ino=%ld", file_inode(sbi->pipe)->i_ino);
@@ -111,7 +113,8 @@ static const struct super_operations autofs_sops = {
 };
 
 enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto,
-	Opt_indirect, Opt_direct, Opt_offset, Opt_strictexpire};
+	Opt_indirect, Opt_direct, Opt_offset, Opt_strictexpire,
+	Opt_ignore};
 
 static const match_table_t tokens = {
 	{Opt_fd, "fd=%u"},
@@ -124,6 +127,7 @@ static const match_table_t tokens = {
 	{Opt_direct, "direct"},
 	{Opt_offset, "offset"},
 	{Opt_strictexpire, "strictexpire"},
+	{Opt_ignore, "ignore"},
 	{Opt_err, NULL}
 };
 
@@ -206,6 +210,9 @@ static int parse_options(char *options,
 		case Opt_strictexpire:
 			sbi->flags |= AUTOFS_SBI_STRICTEXPIRE;
 			break;
+		case Opt_ignore:
+			sbi->flags |= AUTOFS_SBI_IGNORE;
+			break;
 		default:
 			return 1;
 		}
diff --git a/include/uapi/linux/auto_fs.h b/include/uapi/linux/auto_fs.h
index 082119630b49..1f7925afad2d 100644
--- a/include/uapi/linux/auto_fs.h
+++ b/include/uapi/linux/auto_fs.h
@@ -23,7 +23,7 @@
 #define AUTOFS_MIN_PROTO_VERSION	3
 #define AUTOFS_MAX_PROTO_VERSION	5
 
-#define AUTOFS_PROTO_SUBVERSION		4
+#define AUTOFS_PROTO_SUBVERSION		5
 
 /*
  * The wait_queue_token (autofs_wqt_t) is part of a structure which is passed
-- 
cgit v1.2.3-59-g8ed1b


From 6eb3c3d0a52dca337e327ae8868ca1f44a712e02 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 7 Mar 2019 16:29:26 -0800
Subject: exec: increase BINPRM_BUF_SIZE to 256

Large enterprise clients often run applications out of networked file
systems where the IT mandated layout of project volumes can end up
leading to paths that are longer than 128 characters.  Bumping this up
to the next order of two solves this problem in all but the most
egregious case while still fitting into a 512b slab.

[oleg@redhat.com: update comment, per Kees]
Link: http://lkml.kernel.org/r/20181112160956.GA28472@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reported-by: Ben Woodard <woodard@redhat.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c                    | 2 +-
 include/uapi/linux/binfmts.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/fs/exec.c b/fs/exec.c
index bf0ace3841ad..2e0033348d8e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1563,7 +1563,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm)
 
 /*
  * Fill the binprm structure from the inode.
- * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
+ * Check permissions, then read the first BINPRM_BUF_SIZE bytes
  *
  * This may be called multiple times for binary chains (scripts for example).
  */
diff --git a/include/uapi/linux/binfmts.h b/include/uapi/linux/binfmts.h
index 4abad03a8853..689025d9c185 100644
--- a/include/uapi/linux/binfmts.h
+++ b/include/uapi/linux/binfmts.h
@@ -16,6 +16,6 @@ struct pt_regs;
 #define MAX_ARG_STRINGS 0x7FFFFFFF
 
 /* sizeof(linux_binprm->buf) */
-#define BINPRM_BUF_SIZE 128
+#define BINPRM_BUF_SIZE 256
 
 #endif /* _UAPI_LINUX_BINFMTS_H */
-- 
cgit v1.2.3-59-g8ed1b


From a623a7a1a5670c25a16881f5078072d272d96b71 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 11 Mar 2019 16:38:17 +0100
Subject: y2038: fix socket.h header inclusion

Referencing the __kernel_long_t type caused some user space applications
to stop compiling when they had not already included linux/posix_types.h,
e.g.

s/multicast.c -o ext/sockets/multicast.lo
In file included from /builddir/build/BUILD/php-7.3.3/main/php.h:468,
                 from /builddir/build/BUILD/php-7.3.3/ext/sockets/sockets.c:27:
/builddir/build/BUILD/php-7.3.3/ext/sockets/sockets.c: In function 'zm_startup_sockets':
/builddir/build/BUILD/php-7.3.3/ext/sockets/sockets.c:776:40: error: '__kernel_long_t' undeclared (first use in this function)
  776 |  REGISTER_LONG_CONSTANT("SO_SNDTIMEO", SO_SNDTIMEO, CONST_CS | CONST_PERSISTENT);

It is safe to include that header here, since it only contains kernel
internal types that do not conflict with other user space types.

It's still possible that some related build failures remain, but those
are likely to be for code that is not already y2038 safe.

Reported-by: Laura Abbott <labbott@redhat.com>
Fixes: a9beb86ae6e5 ("sock: Add SO_RCVTIMEO_NEW and SO_SNDTIMEO_NEW")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/alpha/include/uapi/asm/socket.h  | 2 +-
 arch/mips/include/uapi/asm/socket.h   | 2 +-
 arch/parisc/include/uapi/asm/socket.h | 2 +-
 arch/sparc/include/uapi/asm/socket.h  | 2 +-
 include/uapi/asm-generic/socket.h     | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/uapi')

diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index 0d0fddb7e738..976e89b116e5 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -2,8 +2,8 @@
 #ifndef _UAPI_ASM_SOCKET_H
 #define _UAPI_ASM_SOCKET_H
 
+#include <linux/posix_types.h>
 #include <asm/sockios.h>
-#include <asm/bitsperlong.h>
 
 /* For setsockopt(2) */
 /*
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index eb9f33f8a8b3..d41765cfbc6e 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -10,8 +10,8 @@
 #ifndef _UAPI_ASM_SOCKET_H
 #define _UAPI_ASM_SOCKET_H
 
+#include <linux/posix_types.h>
 #include <asm/sockios.h>
-#include <asm/bitsperlong.h>
 
 /*
  * For setsockopt(2)
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index 16e428f03526..66c5dd245ac7 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -2,8 +2,8 @@
 #ifndef _UAPI_ASM_SOCKET_H
 #define _UAPI_ASM_SOCKET_H
 
+#include <linux/posix_types.h>
 #include <asm/sockios.h>
-#include <asm/bitsperlong.h>
 
 /* For setsockopt(2) */
 #define SOL_SOCKET	0xffff
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index 88fe4f978aca..9265a9eece15 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -2,8 +2,8 @@
 #ifndef _ASM_SOCKET_H
 #define _ASM_SOCKET_H
 
+#include <linux/posix_types.h>
 #include <asm/sockios.h>
-#include <asm/bitsperlong.h>
 
 /* For setsockopt(2) */
 #define SOL_SOCKET	0xffff
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index c8b430cb6dc4..8c1391c89171 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -2,8 +2,8 @@
 #ifndef __ASM_GENERIC_SOCKET_H
 #define __ASM_GENERIC_SOCKET_H
 
+#include <linux/posix_types.h>
 #include <asm/sockios.h>
-#include <asm/bitsperlong.h>
 
 /* For setsockopt(2) */
 #define SOL_SOCKET	1
-- 
cgit v1.2.3-59-g8ed1b


From dbafd7ddd62369b2f3926ab847cbf8fc40e800b7 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Tue, 12 Mar 2019 10:23:04 -0700
Subject: bpf: Add bpf_get_listener_sock(struct bpf_sock *sk) helper

Add a new helper "struct bpf_sock *bpf_get_listener_sock(struct bpf_sock *sk)"
which returns a bpf_sock in TCP_LISTEN state.  It will trace back to
the listener sk from a request_sock if possible.  It returns NULL
for all other cases.

No reference is taken because the helper ensures the sk is
in SOCK_RCU_FREE (where the TCP_LISTEN sock should be in).
Hence, bpf_sk_release() is unnecessary and the verifier does not
allow bpf_sk_release(listen_sk) to be called either.

The following is also allowed because the bpf_prog is run under
rcu_read_lock():

	sk = bpf_sk_lookup_tcp();
	/* if (!sk) { ... } */
	listen_sk = bpf_get_listener_sock(sk);
	/* if (!listen_sk) { ... } */
	bpf_sk_release(sk);
	src_port = listen_sk->src_port; /* Allowed */

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 11 ++++++++++-
 net/core/filter.c        | 21 +++++++++++++++++++++
 2 files changed, 31 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 3c38ac9a92a7..983b25cb608d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2366,6 +2366,14 @@ union bpf_attr {
  *             current value is ect (ECN capable). Works with IPv6 and IPv4.
  *     Return
  *             1 if set, 0 if not set.
+ *
+ * struct bpf_sock *bpf_get_listener_sock(struct bpf_sock *sk)
+ *	Description
+ *		Return a **struct bpf_sock** pointer in TCP_LISTEN state.
+ *		bpf_sk_release() is unnecessary and not allowed.
+ *	Return
+ *		A **struct bpf_sock** pointer on success, or NULL in
+ *		case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2465,7 +2473,8 @@ union bpf_attr {
 	FN(spin_unlock),		\
 	FN(sk_fullsock),		\
 	FN(tcp_sock),			\
-	FN(skb_ecn_set_ce),
+	FN(skb_ecn_set_ce),		\
+	FN(get_listener_sock),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/net/core/filter.c b/net/core/filter.c
index 36b6afacf83c..647c63a7b25b 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5418,6 +5418,23 @@ static const struct bpf_func_proto bpf_tcp_sock_proto = {
 	.arg1_type	= ARG_PTR_TO_SOCK_COMMON,
 };
 
+BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk)
+{
+	sk = sk_to_full_sk(sk);
+
+	if (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE))
+		return (unsigned long)sk;
+
+	return (unsigned long)NULL;
+}
+
+static const struct bpf_func_proto bpf_get_listener_sock_proto = {
+	.func		= bpf_get_listener_sock,
+	.gpl_only	= false,
+	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,
+	.arg1_type	= ARG_PTR_TO_SOCK_COMMON,
+};
+
 BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb)
 {
 	unsigned int iphdr_len;
@@ -5603,6 +5620,8 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 #ifdef CONFIG_INET
 	case BPF_FUNC_tcp_sock:
 		return &bpf_tcp_sock_proto;
+	case BPF_FUNC_get_listener_sock:
+		return &bpf_get_listener_sock_proto;
 	case BPF_FUNC_skb_ecn_set_ce:
 		return &bpf_skb_ecn_set_ce_proto;
 #endif
@@ -5698,6 +5717,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_sk_release_proto;
 	case BPF_FUNC_tcp_sock:
 		return &bpf_tcp_sock_proto;
+	case BPF_FUNC_get_listener_sock:
+		return &bpf_get_listener_sock_proto;
 #endif
 	default:
 		return bpf_base_func_proto(func_id);
-- 
cgit v1.2.3-59-g8ed1b


From 62369db2df8d1edfa040878203b446e023a16802 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Thu, 14 Mar 2019 12:38:39 +0000
Subject: bpf: fix documentation for eBPF helpers

Another round of minor fixes for the documentation of the BPF helpers
located in the UAPI bpf.h header file. Changes include:

- Moving around description of some helpers, to keep the descriptions in
  the same order as helpers are declared (bpf_map_push_elem(), leftover
  from commit 90b1023f68c7 ("bpf: fix documentation for eBPF helpers"),
  bpf_rc_keydown(), and bpf_skb_ancestor_cgroup_id()).
- Fixing typos ("contex" -> "context").
- Harmonising return types ("void* " -> "void *", "uint64_t" -> "u64").
- Addition of the "bpf_" prefix to bpf_get_storage().
- Light additions of RST markup on some keywords.
- Empty line deletion between description and return value for
  bpf_tcp_sock().
- Edit for the description for bpf_skb_ecn_set_ce() (capital letters,
  acronym expansion, no effect if ECT not set, more details on return
  value).

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 128 ++++++++++++++++++++++++-----------------------
 1 file changed, 65 insertions(+), 63 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 983b25cb608d..4465d00d3493 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -502,16 +502,6 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags)
- * 	Description
- * 		Push an element *value* in *map*. *flags* is one of:
- *
- * 		**BPF_EXIST**
- * 		If the queue/stack is full, the oldest element is removed to
- * 		make room for this.
- * 	Return
- * 		0 on success, or a negative error in case of failure.
- *
  * int bpf_probe_read(void *dst, u32 size, const void *src)
  * 	Description
  * 		For tracing programs, safely attempt to read *size* bytes from
@@ -1435,14 +1425,14 @@ union bpf_attr {
  * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx)
  * 	Description
  * 		Equivalent to bpf_get_socket_cookie() helper that accepts
- * 		*skb*, but gets socket from **struct bpf_sock_addr** contex.
+ * 		*skb*, but gets socket from **struct bpf_sock_addr** context.
  * 	Return
  * 		A 8-byte long non-decreasing number.
  *
  * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx)
  * 	Description
  * 		Equivalent to bpf_get_socket_cookie() helper that accepts
- * 		*skb*, but gets socket from **struct bpf_sock_ops** contex.
+ * 		*skb*, but gets socket from **struct bpf_sock_ops** context.
  * 	Return
  * 		A 8-byte long non-decreasing number.
  *
@@ -2098,52 +2088,52 @@ union bpf_attr {
  *	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle)
+ * int bpf_rc_repeat(void *ctx)
  *	Description
  *		This helper is used in programs implementing IR decoding, to
- *		report a successfully decoded key press with *scancode*,
- *		*toggle* value in the given *protocol*. The scancode will be
- *		translated to a keycode using the rc keymap, and reported as
- *		an input key down event. After a period a key up event is
- *		generated. This period can be extended by calling either
- *		**bpf_rc_keydown**\ () again with the same values, or calling
- *		**bpf_rc_repeat**\ ().
+ *		report a successfully decoded repeat key message. This delays
+ *		the generation of a key up event for previously generated
+ *		key down event.
  *
- *		Some protocols include a toggle bit, in case the button	was
- *		released and pressed again between consecutive scancodes.
+ *		Some IR protocols like NEC have a special IR message for
+ *		repeating last button, for when a button is held down.
  *
  *		The *ctx* should point to the lirc sample as passed into
  *		the program.
  *
- *		The *protocol* is the decoded protocol number (see
- *		**enum rc_proto** for some predefined values).
- *
  *		This helper is only available is the kernel was compiled with
  *		the **CONFIG_BPF_LIRC_MODE2** configuration option set to
  *		"**y**".
  *	Return
  *		0
  *
- * int bpf_rc_repeat(void *ctx)
+ * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle)
  *	Description
  *		This helper is used in programs implementing IR decoding, to
- *		report a successfully decoded repeat key message. This delays
- *		the generation of a key up event for previously generated
- *		key down event.
+ *		report a successfully decoded key press with *scancode*,
+ *		*toggle* value in the given *protocol*. The scancode will be
+ *		translated to a keycode using the rc keymap, and reported as
+ *		an input key down event. After a period a key up event is
+ *		generated. This period can be extended by calling either
+ *		**bpf_rc_keydown**\ () again with the same values, or calling
+ *		**bpf_rc_repeat**\ ().
  *
- *		Some IR protocols like NEC have a special IR message for
- *		repeating last button, for when a button is held down.
+ *		Some protocols include a toggle bit, in case the button	was
+ *		released and pressed again between consecutive scancodes.
  *
  *		The *ctx* should point to the lirc sample as passed into
  *		the program.
  *
+ *		The *protocol* is the decoded protocol number (see
+ *		**enum rc_proto** for some predefined values).
+ *
  *		This helper is only available is the kernel was compiled with
  *		the **CONFIG_BPF_LIRC_MODE2** configuration option set to
  *		"**y**".
  *	Return
  *		0
  *
- * uint64_t bpf_skb_cgroup_id(struct sk_buff *skb)
+ * u64 bpf_skb_cgroup_id(struct sk_buff *skb)
  * 	Description
  * 		Return the cgroup v2 id of the socket associated with the *skb*.
  * 		This is roughly similar to the **bpf_get_cgroup_classid**\ ()
@@ -2159,30 +2149,12 @@ union bpf_attr {
  * 	Return
  * 		The id is returned or 0 in case the id could not be retrieved.
  *
- * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level)
- *	Description
- *		Return id of cgroup v2 that is ancestor of cgroup associated
- *		with the *skb* at the *ancestor_level*.  The root cgroup is at
- *		*ancestor_level* zero and each step down the hierarchy
- *		increments the level. If *ancestor_level* == level of cgroup
- *		associated with *skb*, then return value will be same as that
- *		of **bpf_skb_cgroup_id**\ ().
- *
- *		The helper is useful to implement policies based on cgroups
- *		that are upper in hierarchy than immediate cgroup associated
- *		with *skb*.
- *
- *		The format of returned id and helper limitations are same as in
- *		**bpf_skb_cgroup_id**\ ().
- *	Return
- *		The id is returned or 0 in case the id could not be retrieved.
- *
  * u64 bpf_get_current_cgroup_id(void)
  * 	Return
  * 		A 64-bit integer containing the current cgroup id based
  * 		on the cgroup within which the current task is running.
  *
- * void* get_local_storage(void *map, u64 flags)
+ * void *bpf_get_local_storage(void *map, u64 flags)
  *	Description
  *		Get the pointer to the local storage area.
  *		The type and the size of the local storage is defined
@@ -2209,6 +2181,24 @@ union bpf_attr {
  *	Return
  *		0 on success, or a negative error in case of failure.
  *
+ * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level)
+ *	Description
+ *		Return id of cgroup v2 that is ancestor of cgroup associated
+ *		with the *skb* at the *ancestor_level*.  The root cgroup is at
+ *		*ancestor_level* zero and each step down the hierarchy
+ *		increments the level. If *ancestor_level* == level of cgroup
+ *		associated with *skb*, then return value will be same as that
+ *		of **bpf_skb_cgroup_id**\ ().
+ *
+ *		The helper is useful to implement policies based on cgroups
+ *		that are upper in hierarchy than immediate cgroup associated
+ *		with *skb*.
+ *
+ *		The format of returned id and helper limitations are same as in
+ *		**bpf_skb_cgroup_id**\ ().
+ *	Return
+ *		The id is returned or 0 in case the id could not be retrieved.
+ *
  * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
  *	Description
  *		Look for TCP socket matching *tuple*, optionally in a child
@@ -2289,6 +2279,16 @@ union bpf_attr {
  *	Return
  *		0 on success, or a negative error in case of failure.
  *
+ * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags)
+ * 	Description
+ * 		Push an element *value* in *map*. *flags* is one of:
+ *
+ * 		**BPF_EXIST**
+ * 			If the queue/stack is full, the oldest element is
+ * 			removed to make room for this.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
  * int bpf_map_pop_elem(struct bpf_map *map, void *value)
  * 	Description
  * 		Pop an element from *map*.
@@ -2346,33 +2346,35 @@ union bpf_attr {
  * struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk)
  *	Description
  *		This helper gets a **struct bpf_sock** pointer such
- *		that all the fields in bpf_sock can be accessed.
+ *		that all the fields in this **bpf_sock** can be accessed.
  *	Return
- *		A **struct bpf_sock** pointer on success, or NULL in
+ *		A **struct bpf_sock** pointer on success, or **NULL** in
  *		case of failure.
  *
  * struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk)
  *	Description
  *		This helper gets a **struct bpf_tcp_sock** pointer from a
  *		**struct bpf_sock** pointer.
- *
  *	Return
- *		A **struct bpf_tcp_sock** pointer on success, or NULL in
+ *		A **struct bpf_tcp_sock** pointer on success, or **NULL** in
  *		case of failure.
  *
  * int bpf_skb_ecn_set_ce(struct sk_buf *skb)
- *     Description
- *             Sets ECN of IP header to ce (congestion encountered) if
- *             current value is ect (ECN capable). Works with IPv6 and IPv4.
- *     Return
- *             1 if set, 0 if not set.
+ *	Description
+ *		Set ECN (Explicit Congestion Notification) field of IP header
+ *		to **CE** (Congestion Encountered) if current value is **ECT**
+ *		(ECN Capable Transport). Otherwise, do nothing. Works with IPv6
+ *		and IPv4.
+ *	Return
+ *		1 if the **CE** flag is set (either by the current helper call
+ *		or because it was already present), 0 if it is not set.
  *
  * struct bpf_sock *bpf_get_listener_sock(struct bpf_sock *sk)
  *	Description
- *		Return a **struct bpf_sock** pointer in TCP_LISTEN state.
- *		bpf_sk_release() is unnecessary and not allowed.
+ *		Return a **struct bpf_sock** pointer in **TCP_LISTEN** state.
+ *		**bpf_sk_release**\ () is unnecessary and not allowed.
  *	Return
- *		A **struct bpf_sock** pointer on success, or NULL in
+ *		A **struct bpf_sock** pointer on success, or **NULL** in
  *		case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
-- 
cgit v1.2.3-59-g8ed1b


From 0eb0978528d47699edd091dc2c337952ad8da436 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Thu, 14 Mar 2019 12:38:40 +0000
Subject: bpf: add documentation for helpers bpf_spin_lock(), bpf_spin_unlock()

Add documentation for the BPF spinlock-related helpers to the doc in
bpf.h. I added the constraints and restrictions coming with the use of
spinlocks for BPF: not all of it is directly related to the use of the
helper, but I thought it would be nice for users to find them in the man
page.

This list of restrictions is nearly a verbatim copy of the list in
Alexei's commit log for those helpers.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4465d00d3493..929c8e537a14 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2343,6 +2343,61 @@ union bpf_attr {
  *	Return
  *		0
  *
+ * int bpf_spin_lock(struct bpf_spin_lock *lock)
+ *	Description
+ *		Acquire a spinlock represented by the pointer *lock*, which is
+ *		stored as part of a value of a map. Taking the lock allows to
+ *		safely update the rest of the fields in that value. The
+ *		spinlock can (and must) later be released with a call to
+ *		**bpf_spin_unlock**\ (\ *lock*\ ).
+ *
+ *		Spinlocks in BPF programs come with a number of restrictions
+ *		and constraints:
+ *
+ *		* **bpf_spin_lock** objects are only allowed inside maps of
+ *		  types **BPF_MAP_TYPE_HASH** and **BPF_MAP_TYPE_ARRAY** (this
+ *		  list could be extended in the future).
+ *		* BTF description of the map is mandatory.
+ *		* The BPF program can take ONE lock at a time, since taking two
+ *		  or more could cause dead locks.
+ *		* Only one **struct bpf_spin_lock** is allowed per map element.
+ *		* When the lock is taken, calls (either BPF to BPF or helpers)
+ *		  are not allowed.
+ *		* The **BPF_LD_ABS** and **BPF_LD_IND** instructions are not
+ *		  allowed inside a spinlock-ed region.
+ *		* The BPF program MUST call **bpf_spin_unlock**\ () to release
+ *		  the lock, on all execution paths, before it returns.
+ *		* The BPF program can access **struct bpf_spin_lock** only via
+ *		  the **bpf_spin_lock**\ () and **bpf_spin_unlock**\ ()
+ *		  helpers. Loading or storing data into the **struct
+ *		  bpf_spin_lock** *lock*\ **;** field of a map is not allowed.
+ *		* To use the **bpf_spin_lock**\ () helper, the BTF description
+ *		  of the map value must be a struct and have **struct
+ *		  bpf_spin_lock** *anyname*\ **;** field at the top level.
+ *		  Nested lock inside another struct is not allowed.
+ *		* The **struct bpf_spin_lock** *lock* field in a map value must
+ *		  be aligned on a multiple of 4 bytes in that value.
+ *		* Syscall with command **BPF_MAP_LOOKUP_ELEM** does not copy
+ *		  the **bpf_spin_lock** field to user space.
+ *		* Syscall with command **BPF_MAP_UPDATE_ELEM**, or update from
+ *		  a BPF program, do not update the **bpf_spin_lock** field.
+ *		* **bpf_spin_lock** cannot be on the stack or inside a
+ *		  networking packet (it can only be inside of a map values).
+ *		* **bpf_spin_lock** is available to root only.
+ *		* Tracing programs and socket filter programs cannot use
+ *		  **bpf_spin_lock**\ () due to insufficient preemption checks
+ *		  (but this may change in the future).
+ *		* **bpf_spin_lock** is not allowed in inner maps of map-in-map.
+ *	Return
+ *		0
+ *
+ * int bpf_spin_unlock(struct bpf_spin_lock *lock)
+ *	Description
+ *		Release the *lock* previously locked by a call to
+ *		**bpf_spin_lock**\ (\ *lock*\ ).
+ *	Return
+ *		0
+ *
  * struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk)
  *	Description
  *		This helper gets a **struct bpf_sock** pointer such
-- 
cgit v1.2.3-59-g8ed1b


From 037fc3368be46dc1a2a90f6e50c8cbce49d75fd6 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Sun, 17 Mar 2019 11:01:09 +0900
Subject: kbuild: force all architectures except um to include mandatory-y

Currently, every arch/*/include/uapi/asm/Kbuild explicitly includes
the common Kbuild.asm file. Factor out the duplicated include directives
to scripts/Makefile.asm-generic so that no architecture would opt out
of the mandatory-y mechanism.

um is not forced to include mandatory-y since it is a very exceptional
case which does not support UAPI.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---
 Documentation/kbuild/makefiles.txt      |  2 +-
 Makefile                                |  6 ++++--
 arch/alpha/include/uapi/asm/Kbuild      |  2 --
 arch/arc/include/uapi/asm/Kbuild        |  2 --
 arch/arm/include/uapi/asm/Kbuild        |  1 -
 arch/arm64/include/uapi/asm/Kbuild      |  1 -
 arch/c6x/include/uapi/asm/Kbuild        |  2 --
 arch/csky/include/uapi/asm/Kbuild       |  2 --
 arch/h8300/include/uapi/asm/Kbuild      |  2 --
 arch/hexagon/include/uapi/asm/Kbuild    |  2 --
 arch/ia64/include/uapi/asm/Kbuild       |  2 --
 arch/m68k/include/uapi/asm/Kbuild       |  2 --
 arch/microblaze/include/uapi/asm/Kbuild |  2 --
 arch/mips/include/uapi/asm/Kbuild       |  2 --
 arch/nds32/include/uapi/asm/Kbuild      |  2 --
 arch/nios2/include/uapi/asm/Kbuild      |  2 --
 arch/openrisc/include/uapi/asm/Kbuild   |  2 --
 arch/parisc/include/uapi/asm/Kbuild     |  2 --
 arch/powerpc/include/uapi/asm/Kbuild    |  2 --
 arch/riscv/include/uapi/asm/Kbuild      |  1 -
 arch/s390/include/uapi/asm/Kbuild       |  1 -
 arch/sh/include/uapi/asm/Kbuild         |  1 -
 arch/sparc/include/uapi/asm/Kbuild      |  2 --
 arch/unicore32/include/uapi/asm/Kbuild  |  2 --
 arch/x86/include/uapi/asm/Kbuild        |  2 --
 arch/xtensa/include/uapi/asm/Kbuild     |  2 --
 include/asm-generic/Kbuild              |  5 +++++
 include/uapi/asm-generic/Kbuild         | 36 +++++++++++++++++++++++++++++++++
 include/uapi/asm-generic/Kbuild.asm     | 34 -------------------------------
 scripts/Makefile.asm-generic            |  5 +++++
 30 files changed, 51 insertions(+), 80 deletions(-)
 create mode 100644 include/asm-generic/Kbuild
 create mode 100644 include/uapi/asm-generic/Kbuild
 delete mode 100644 include/uapi/asm-generic/Kbuild.asm

(limited to 'include/uapi')

diff --git a/Documentation/kbuild/makefiles.txt b/Documentation/kbuild/makefiles.txt
index f124be6e4c3a..03c065855eaf 100644
--- a/Documentation/kbuild/makefiles.txt
+++ b/Documentation/kbuild/makefiles.txt
@@ -1274,7 +1274,7 @@ See subsequent chapter for the syntax of the Kbuild file.
 
 --- 7.4 mandatory-y
 
-	mandatory-y is essentially used by include/(uapi/)asm-generic/Kbuild.asm
+	mandatory-y is essentially used by include/(uapi/)asm-generic/Kbuild
 	to define the minimum set of ASM headers that all architectures must have.
 
 	This works like optional generic-y. If a mandatory header is missing
diff --git a/Makefile b/Makefile
index 08a01add09a6..4f45ae628f54 100644
--- a/Makefile
+++ b/Makefile
@@ -1098,9 +1098,11 @@ asm-generic := -f $(srctree)/scripts/Makefile.asm-generic obj
 
 PHONY += asm-generic uapi-asm-generic
 asm-generic: uapi-asm-generic
-	$(Q)$(MAKE) $(asm-generic)=arch/$(SRCARCH)/include/generated/asm
+	$(Q)$(MAKE) $(asm-generic)=arch/$(SRCARCH)/include/generated/asm \
+	generic=include/asm-generic
 uapi-asm-generic:
-	$(Q)$(MAKE) $(asm-generic)=arch/$(SRCARCH)/include/generated/uapi/asm
+	$(Q)$(MAKE) $(asm-generic)=arch/$(SRCARCH)/include/generated/uapi/asm \
+	generic=include/uapi/asm-generic
 
 PHONY += prepare-objtool
 prepare-objtool: $(objtool_target)
diff --git a/arch/alpha/include/uapi/asm/Kbuild b/arch/alpha/include/uapi/asm/Kbuild
index 439f5157aa35..7417847dc438 100644
--- a/arch/alpha/include/uapi/asm/Kbuild
+++ b/arch/alpha/include/uapi/asm/Kbuild
@@ -1,3 +1 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generated-y += unistd_32.h
diff --git a/arch/arc/include/uapi/asm/Kbuild b/arch/arc/include/uapi/asm/Kbuild
index 0febf1a07c30..755bb11323d8 100644
--- a/arch/arc/include/uapi/asm/Kbuild
+++ b/arch/arc/include/uapi/asm/Kbuild
@@ -1,4 +1,2 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/arm/include/uapi/asm/Kbuild b/arch/arm/include/uapi/asm/Kbuild
index eee8f7d23899..23b4464c0995 100644
--- a/arch/arm/include/uapi/asm/Kbuild
+++ b/arch/arm/include/uapi/asm/Kbuild
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
-include include/uapi/asm-generic/Kbuild.asm
 
 generated-y += unistd-common.h
 generated-y += unistd-oabi.h
diff --git a/arch/arm64/include/uapi/asm/Kbuild b/arch/arm64/include/uapi/asm/Kbuild
index 87eea29b24ab..602d137932dc 100644
--- a/arch/arm64/include/uapi/asm/Kbuild
+++ b/arch/arm64/include/uapi/asm/Kbuild
@@ -1,4 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0
-include include/uapi/asm-generic/Kbuild.asm
 
 generic-y += kvm_para.h
diff --git a/arch/c6x/include/uapi/asm/Kbuild b/arch/c6x/include/uapi/asm/Kbuild
index 0febf1a07c30..755bb11323d8 100644
--- a/arch/c6x/include/uapi/asm/Kbuild
+++ b/arch/c6x/include/uapi/asm/Kbuild
@@ -1,4 +1,2 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/csky/include/uapi/asm/Kbuild b/arch/csky/include/uapi/asm/Kbuild
index c1b06dcf6cf8..1c72f04ff75d 100644
--- a/arch/csky/include/uapi/asm/Kbuild
+++ b/arch/csky/include/uapi/asm/Kbuild
@@ -1,3 +1 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generic-y += ucontext.h
diff --git a/arch/h8300/include/uapi/asm/Kbuild b/arch/h8300/include/uapi/asm/Kbuild
index 0febf1a07c30..755bb11323d8 100644
--- a/arch/h8300/include/uapi/asm/Kbuild
+++ b/arch/h8300/include/uapi/asm/Kbuild
@@ -1,4 +1,2 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/hexagon/include/uapi/asm/Kbuild b/arch/hexagon/include/uapi/asm/Kbuild
index c1b06dcf6cf8..1c72f04ff75d 100644
--- a/arch/hexagon/include/uapi/asm/Kbuild
+++ b/arch/hexagon/include/uapi/asm/Kbuild
@@ -1,3 +1 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generic-y += ucontext.h
diff --git a/arch/ia64/include/uapi/asm/Kbuild b/arch/ia64/include/uapi/asm/Kbuild
index 5b819e53c397..20018cb883a9 100644
--- a/arch/ia64/include/uapi/asm/Kbuild
+++ b/arch/ia64/include/uapi/asm/Kbuild
@@ -1,4 +1,2 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generated-y += unistd_64.h
 generic-y += kvm_para.h
diff --git a/arch/m68k/include/uapi/asm/Kbuild b/arch/m68k/include/uapi/asm/Kbuild
index 960bf1e4be53..8a7ad40be463 100644
--- a/arch/m68k/include/uapi/asm/Kbuild
+++ b/arch/m68k/include/uapi/asm/Kbuild
@@ -1,4 +1,2 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generated-y += unistd_32.h
 generic-y += kvm_para.h
diff --git a/arch/microblaze/include/uapi/asm/Kbuild b/arch/microblaze/include/uapi/asm/Kbuild
index 97823ec46e97..3ce84fbb2678 100644
--- a/arch/microblaze/include/uapi/asm/Kbuild
+++ b/arch/microblaze/include/uapi/asm/Kbuild
@@ -1,5 +1,3 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generated-y += unistd_32.h
 generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/mips/include/uapi/asm/Kbuild b/arch/mips/include/uapi/asm/Kbuild
index 0851c103a8ce..c3798bfe0486 100644
--- a/arch/mips/include/uapi/asm/Kbuild
+++ b/arch/mips/include/uapi/asm/Kbuild
@@ -1,5 +1,3 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generated-y += unistd_n32.h
 generated-y += unistd_n64.h
 generated-y += unistd_o32.h
diff --git a/arch/nds32/include/uapi/asm/Kbuild b/arch/nds32/include/uapi/asm/Kbuild
index c1b06dcf6cf8..1c72f04ff75d 100644
--- a/arch/nds32/include/uapi/asm/Kbuild
+++ b/arch/nds32/include/uapi/asm/Kbuild
@@ -1,3 +1 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generic-y += ucontext.h
diff --git a/arch/nios2/include/uapi/asm/Kbuild b/arch/nios2/include/uapi/asm/Kbuild
index 0febf1a07c30..755bb11323d8 100644
--- a/arch/nios2/include/uapi/asm/Kbuild
+++ b/arch/nios2/include/uapi/asm/Kbuild
@@ -1,4 +1,2 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/openrisc/include/uapi/asm/Kbuild b/arch/openrisc/include/uapi/asm/Kbuild
index 0febf1a07c30..755bb11323d8 100644
--- a/arch/openrisc/include/uapi/asm/Kbuild
+++ b/arch/openrisc/include/uapi/asm/Kbuild
@@ -1,4 +1,2 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/parisc/include/uapi/asm/Kbuild b/arch/parisc/include/uapi/asm/Kbuild
index c54353d390ff..22fdbd08cdc8 100644
--- a/arch/parisc/include/uapi/asm/Kbuild
+++ b/arch/parisc/include/uapi/asm/Kbuild
@@ -1,5 +1,3 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generated-y += unistd_32.h
 generated-y += unistd_64.h
 generic-y += kvm_para.h
diff --git a/arch/powerpc/include/uapi/asm/Kbuild b/arch/powerpc/include/uapi/asm/Kbuild
index 214a39acdf25..2bd5b392277c 100644
--- a/arch/powerpc/include/uapi/asm/Kbuild
+++ b/arch/powerpc/include/uapi/asm/Kbuild
@@ -1,4 +1,2 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generated-y += unistd_32.h
 generated-y += unistd_64.h
diff --git a/arch/riscv/include/uapi/asm/Kbuild b/arch/riscv/include/uapi/asm/Kbuild
index d2ee86b4c091..e69de29bb2d1 100644
--- a/arch/riscv/include/uapi/asm/Kbuild
+++ b/arch/riscv/include/uapi/asm/Kbuild
@@ -1 +0,0 @@
-include include/uapi/asm-generic/Kbuild.asm
diff --git a/arch/s390/include/uapi/asm/Kbuild b/arch/s390/include/uapi/asm/Kbuild
index da3e0d48abbc..46c1ff0b842a 100644
--- a/arch/s390/include/uapi/asm/Kbuild
+++ b/arch/s390/include/uapi/asm/Kbuild
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
-include include/uapi/asm-generic/Kbuild.asm
 
 generated-y += unistd_32.h
 generated-y += unistd_64.h
diff --git a/arch/sh/include/uapi/asm/Kbuild b/arch/sh/include/uapi/asm/Kbuild
index eaa30bcd93bf..ecfbd40924dd 100644
--- a/arch/sh/include/uapi/asm/Kbuild
+++ b/arch/sh/include/uapi/asm/Kbuild
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
-include include/uapi/asm-generic/Kbuild.asm
 
 generated-y += unistd_32.h
 generic-y += kvm_para.h
diff --git a/arch/sparc/include/uapi/asm/Kbuild b/arch/sparc/include/uapi/asm/Kbuild
index 214a39acdf25..2bd5b392277c 100644
--- a/arch/sparc/include/uapi/asm/Kbuild
+++ b/arch/sparc/include/uapi/asm/Kbuild
@@ -1,4 +1,2 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generated-y += unistd_32.h
 generated-y += unistd_64.h
diff --git a/arch/unicore32/include/uapi/asm/Kbuild b/arch/unicore32/include/uapi/asm/Kbuild
index 0febf1a07c30..755bb11323d8 100644
--- a/arch/unicore32/include/uapi/asm/Kbuild
+++ b/arch/unicore32/include/uapi/asm/Kbuild
@@ -1,4 +1,2 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild
index f6648e9928b3..59b5ad310f78 100644
--- a/arch/x86/include/uapi/asm/Kbuild
+++ b/arch/x86/include/uapi/asm/Kbuild
@@ -1,5 +1,3 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generated-y += unistd_32.h
 generated-y += unistd_64.h
 generated-y += unistd_x32.h
diff --git a/arch/xtensa/include/uapi/asm/Kbuild b/arch/xtensa/include/uapi/asm/Kbuild
index 960bf1e4be53..8a7ad40be463 100644
--- a/arch/xtensa/include/uapi/asm/Kbuild
+++ b/arch/xtensa/include/uapi/asm/Kbuild
@@ -1,4 +1,2 @@
-include include/uapi/asm-generic/Kbuild.asm
-
 generated-y += unistd_32.h
 generic-y += kvm_para.h
diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild
new file mode 100644
index 000000000000..6f4536d70b8e
--- /dev/null
+++ b/include/asm-generic/Kbuild
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# asm headers that all architectures except um should have
+# (This file is not included when SRCARCH=um since UML borrows several
+# asm headers from the host architecutre.)
diff --git a/include/uapi/asm-generic/Kbuild b/include/uapi/asm-generic/Kbuild
new file mode 100644
index 000000000000..ebb180aac74e
--- /dev/null
+++ b/include/uapi/asm-generic/Kbuild
@@ -0,0 +1,36 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Headers that are mandatory in usr/include/asm/
+# (This file is not included when SRCARCH=um since UML does not support UAPI.)
+
+mandatory-y += auxvec.h
+mandatory-y += bitsperlong.h
+mandatory-y += bpf_perf_event.h
+mandatory-y += byteorder.h
+mandatory-y += errno.h
+mandatory-y += fcntl.h
+mandatory-y += ioctl.h
+mandatory-y += ioctls.h
+mandatory-y += ipcbuf.h
+mandatory-y += mman.h
+mandatory-y += msgbuf.h
+mandatory-y += param.h
+mandatory-y += poll.h
+mandatory-y += posix_types.h
+mandatory-y += ptrace.h
+mandatory-y += resource.h
+mandatory-y += sembuf.h
+mandatory-y += setup.h
+mandatory-y += shmbuf.h
+mandatory-y += sigcontext.h
+mandatory-y += siginfo.h
+mandatory-y += signal.h
+mandatory-y += socket.h
+mandatory-y += sockios.h
+mandatory-y += stat.h
+mandatory-y += statfs.h
+mandatory-y += swab.h
+mandatory-y += termbits.h
+mandatory-y += termios.h
+mandatory-y += types.h
+mandatory-y += unistd.h
diff --git a/include/uapi/asm-generic/Kbuild.asm b/include/uapi/asm-generic/Kbuild.asm
deleted file mode 100644
index 355c4ac2c0b0..000000000000
--- a/include/uapi/asm-generic/Kbuild.asm
+++ /dev/null
@@ -1,34 +0,0 @@
-#
-# Headers that are mandatory in usr/include/asm/
-#
-mandatory-y += auxvec.h
-mandatory-y += bitsperlong.h
-mandatory-y += bpf_perf_event.h
-mandatory-y += byteorder.h
-mandatory-y += errno.h
-mandatory-y += fcntl.h
-mandatory-y += ioctl.h
-mandatory-y += ioctls.h
-mandatory-y += ipcbuf.h
-mandatory-y += mman.h
-mandatory-y += msgbuf.h
-mandatory-y += param.h
-mandatory-y += poll.h
-mandatory-y += posix_types.h
-mandatory-y += ptrace.h
-mandatory-y += resource.h
-mandatory-y += sembuf.h
-mandatory-y += setup.h
-mandatory-y += shmbuf.h
-mandatory-y += sigcontext.h
-mandatory-y += siginfo.h
-mandatory-y += signal.h
-mandatory-y += socket.h
-mandatory-y += sockios.h
-mandatory-y += stat.h
-mandatory-y += statfs.h
-mandatory-y += swab.h
-mandatory-y += termbits.h
-mandatory-y += termios.h
-mandatory-y += types.h
-mandatory-y += unistd.h
diff --git a/scripts/Makefile.asm-generic b/scripts/Makefile.asm-generic
index c7d2b7acad26..82ad63dcd62b 100644
--- a/scripts/Makefile.asm-generic
+++ b/scripts/Makefile.asm-generic
@@ -12,6 +12,11 @@ all:
 src := $(subst /generated,,$(obj))
 -include $(src)/Kbuild
 
+# $(generic)/Kbuild lists mandatory-y. Exclude um since it is a special case.
+ifneq ($(SRCARCH),um)
+include $(generic)/Kbuild
+endif
+
 include scripts/Kbuild.include
 
 redundant := $(filter $(mandatory-y) $(generated-y), $(generic-y))
-- 
cgit v1.2.3-59-g8ed1b


From 07ba9e7be423423043c5090a2f395c0da26e1b3d Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Fri, 18 Jan 2019 11:18:17 -0800
Subject: Input: document meanings of KEY_SCREEN and KEY_ZOOM

It is hard to say what KEY_SCREEN and KEY_ZOOM mean, but historically DVB
folks have used them to indicate switch to full screen mode. Later, they
converged on using KEY_ZOOM to switch into full screen mode and KEY)SCREEN
to control aspect ratio (see Documentation/media/uapi/rc/rc-tables.rst).

Let's commit to these uses, and define:

- KEY_FULL_SCREEN (and make KEY_ZOOM its alias)
- KEY_ASPECT_RATIO (and make KEY_SCREEN its alias)

Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 include/uapi/linux/input-event-codes.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h
index ae366b87426a..bc5054e51bef 100644
--- a/include/uapi/linux/input-event-codes.h
+++ b/include/uapi/linux/input-event-codes.h
@@ -439,10 +439,12 @@
 #define KEY_TITLE		0x171
 #define KEY_SUBTITLE		0x172
 #define KEY_ANGLE		0x173
-#define KEY_ZOOM		0x174
+#define KEY_FULL_SCREEN		0x174	/* AC View Toggle */
+#define KEY_ZOOM		KEY_FULL_SCREEN
 #define KEY_MODE		0x175
 #define KEY_KEYBOARD		0x176
-#define KEY_SCREEN		0x177
+#define KEY_ASPECT_RATIO	0x177	/* HUTRR37: Aspect */
+#define KEY_SCREEN		KEY_ASPECT_RATIO
 #define KEY_PC			0x178	/* Media Select Computer */
 #define KEY_TV			0x179	/* Media Select TV */
 #define KEY_TV2			0x17a	/* Media Select Cable */
-- 
cgit v1.2.3-59-g8ed1b


From 0532a1b0d045115521a93acf28f1270df89ad806 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Fri, 22 Mar 2019 09:19:34 +0100
Subject: virt: vbox: Implement passing requestor info to the host for
 VirtualBox 6.0.x

VirtualBox 6.0.x has a new feature where the guest kernel driver passes
info about the origin of the request (e.g. userspace or kernelspace) to
the hypervisor.

If we do not pass this information then when running the 6.0.x userspace
guest-additions tools on a 6.0.x host, some requests will get denied
with a VERR_VERSION_MISMATCH error, breaking vboxservice.service and
the mounting of shared folders marked to be auto-mounted.

This commit implements passing the requestor info to the host, fixing this.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/virt/vboxguest/vboxguest_core.c    | 106 ++++++++++++++++++++---------
 drivers/virt/vboxguest/vboxguest_core.h    |  15 ++--
 drivers/virt/vboxguest/vboxguest_linux.c   |  26 ++++++-
 drivers/virt/vboxguest/vboxguest_utils.c   |  32 +++++----
 drivers/virt/vboxguest/vboxguest_version.h |   9 ++-
 drivers/virt/vboxguest/vmmdev.h            |   8 ++-
 include/linux/vbox_utils.h                 |  12 ++--
 include/uapi/linux/vbox_vmmdev_types.h     |  60 ++++++++++++++++
 8 files changed, 197 insertions(+), 71 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/virt/vboxguest/vboxguest_core.c b/drivers/virt/vboxguest/vboxguest_core.c
index df7d09409efe..8ca333f21292 100644
--- a/drivers/virt/vboxguest/vboxguest_core.c
+++ b/drivers/virt/vboxguest/vboxguest_core.c
@@ -27,6 +27,10 @@
 
 #define GUEST_MAPPINGS_TRIES	5
 
+#define VBG_KERNEL_REQUEST \
+	(VMMDEV_REQUESTOR_KERNEL | VMMDEV_REQUESTOR_USR_DRV | \
+	 VMMDEV_REQUESTOR_CON_DONT_KNOW | VMMDEV_REQUESTOR_TRUST_NOT_GIVEN)
+
 /**
  * Reserves memory in which the VMM can relocate any guest mappings
  * that are floating around.
@@ -48,7 +52,8 @@ static void vbg_guest_mappings_init(struct vbg_dev *gdev)
 	int i, rc;
 
 	/* Query the required space. */
-	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_GET_HYPERVISOR_INFO);
+	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_GET_HYPERVISOR_INFO,
+			    VBG_KERNEL_REQUEST);
 	if (!req)
 		return;
 
@@ -135,7 +140,8 @@ static void vbg_guest_mappings_exit(struct vbg_dev *gdev)
 	 * Tell the host that we're going to free the memory we reserved for
 	 * it, the free it up. (Leak the memory if anything goes wrong here.)
 	 */
-	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_SET_HYPERVISOR_INFO);
+	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_SET_HYPERVISOR_INFO,
+			    VBG_KERNEL_REQUEST);
 	if (!req)
 		return;
 
@@ -172,8 +178,10 @@ static int vbg_report_guest_info(struct vbg_dev *gdev)
 	struct vmmdev_guest_info2 *req2 = NULL;
 	int rc, ret = -ENOMEM;
 
-	req1 = vbg_req_alloc(sizeof(*req1), VMMDEVREQ_REPORT_GUEST_INFO);
-	req2 = vbg_req_alloc(sizeof(*req2), VMMDEVREQ_REPORT_GUEST_INFO2);
+	req1 = vbg_req_alloc(sizeof(*req1), VMMDEVREQ_REPORT_GUEST_INFO,
+			     VBG_KERNEL_REQUEST);
+	req2 = vbg_req_alloc(sizeof(*req2), VMMDEVREQ_REPORT_GUEST_INFO2,
+			     VBG_KERNEL_REQUEST);
 	if (!req1 || !req2)
 		goto out_free;
 
@@ -187,8 +195,8 @@ static int vbg_report_guest_info(struct vbg_dev *gdev)
 	req2->additions_minor = VBG_VERSION_MINOR;
 	req2->additions_build = VBG_VERSION_BUILD;
 	req2->additions_revision = VBG_SVN_REV;
-	/* (no features defined yet) */
-	req2->additions_features = 0;
+	req2->additions_features =
+		VMMDEV_GUEST_INFO2_ADDITIONS_FEATURES_REQUESTOR_INFO;
 	strlcpy(req2->name, VBG_VERSION_STRING,
 		sizeof(req2->name));
 
@@ -230,7 +238,8 @@ static int vbg_report_driver_status(struct vbg_dev *gdev, bool active)
 	struct vmmdev_guest_status *req;
 	int rc;
 
-	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_REPORT_GUEST_STATUS);
+	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_REPORT_GUEST_STATUS,
+			    VBG_KERNEL_REQUEST);
 	if (!req)
 		return -ENOMEM;
 
@@ -423,7 +432,8 @@ static int vbg_heartbeat_host_config(struct vbg_dev *gdev, bool enabled)
 	struct vmmdev_heartbeat *req;
 	int rc;
 
-	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_HEARTBEAT_CONFIGURE);
+	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_HEARTBEAT_CONFIGURE,
+			    VBG_KERNEL_REQUEST);
 	if (!req)
 		return -ENOMEM;
 
@@ -457,7 +467,8 @@ static int vbg_heartbeat_init(struct vbg_dev *gdev)
 
 	gdev->guest_heartbeat_req = vbg_req_alloc(
 					sizeof(*gdev->guest_heartbeat_req),
-					VMMDEVREQ_GUEST_HEARTBEAT);
+					VMMDEVREQ_GUEST_HEARTBEAT,
+					VBG_KERNEL_REQUEST);
 	if (!gdev->guest_heartbeat_req)
 		return -ENOMEM;
 
@@ -528,7 +539,8 @@ static int vbg_reset_host_event_filter(struct vbg_dev *gdev,
 	struct vmmdev_mask *req;
 	int rc;
 
-	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_CTL_GUEST_FILTER_MASK);
+	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_CTL_GUEST_FILTER_MASK,
+			    VBG_KERNEL_REQUEST);
 	if (!req)
 		return -ENOMEM;
 
@@ -567,8 +579,14 @@ static int vbg_set_session_event_filter(struct vbg_dev *gdev,
 	u32 changed, previous;
 	int rc, ret = 0;
 
-	/* Allocate a request buffer before taking the spinlock */
-	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_CTL_GUEST_FILTER_MASK);
+	/*
+	 * Allocate a request buffer before taking the spinlock, when
+	 * the session is being terminated the requestor is the kernel,
+	 * as we're cleaning up.
+	 */
+	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_CTL_GUEST_FILTER_MASK,
+			    session_termination ? VBG_KERNEL_REQUEST :
+						  session->requestor);
 	if (!req) {
 		if (!session_termination)
 			return -ENOMEM;
@@ -627,7 +645,8 @@ static int vbg_reset_host_capabilities(struct vbg_dev *gdev)
 	struct vmmdev_mask *req;
 	int rc;
 
-	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_SET_GUEST_CAPABILITIES);
+	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_SET_GUEST_CAPABILITIES,
+			    VBG_KERNEL_REQUEST);
 	if (!req)
 		return -ENOMEM;
 
@@ -662,8 +681,14 @@ static int vbg_set_session_capabilities(struct vbg_dev *gdev,
 	u32 changed, previous;
 	int rc, ret = 0;
 
-	/* Allocate a request buffer before taking the spinlock */
-	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_SET_GUEST_CAPABILITIES);
+	/*
+	 * Allocate a request buffer before taking the spinlock, when
+	 * the session is being terminated the requestor is the kernel,
+	 * as we're cleaning up.
+	 */
+	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_SET_GUEST_CAPABILITIES,
+			    session_termination ? VBG_KERNEL_REQUEST :
+						  session->requestor);
 	if (!req) {
 		if (!session_termination)
 			return -ENOMEM;
@@ -722,7 +747,8 @@ static int vbg_query_host_version(struct vbg_dev *gdev)
 	struct vmmdev_host_version *req;
 	int rc, ret;
 
-	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_GET_HOST_VERSION);
+	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_GET_HOST_VERSION,
+			    VBG_KERNEL_REQUEST);
 	if (!req)
 		return -ENOMEM;
 
@@ -783,19 +809,24 @@ int vbg_core_init(struct vbg_dev *gdev, u32 fixed_events)
 
 	gdev->mem_balloon.get_req =
 		vbg_req_alloc(sizeof(*gdev->mem_balloon.get_req),
-			      VMMDEVREQ_GET_MEMBALLOON_CHANGE_REQ);
+			      VMMDEVREQ_GET_MEMBALLOON_CHANGE_REQ,
+			      VBG_KERNEL_REQUEST);
 	gdev->mem_balloon.change_req =
 		vbg_req_alloc(sizeof(*gdev->mem_balloon.change_req),
-			      VMMDEVREQ_CHANGE_MEMBALLOON);
+			      VMMDEVREQ_CHANGE_MEMBALLOON,
+			      VBG_KERNEL_REQUEST);
 	gdev->cancel_req =
 		vbg_req_alloc(sizeof(*(gdev->cancel_req)),
-			      VMMDEVREQ_HGCM_CANCEL2);
+			      VMMDEVREQ_HGCM_CANCEL2,
+			      VBG_KERNEL_REQUEST);
 	gdev->ack_events_req =
 		vbg_req_alloc(sizeof(*gdev->ack_events_req),
-			      VMMDEVREQ_ACKNOWLEDGE_EVENTS);
+			      VMMDEVREQ_ACKNOWLEDGE_EVENTS,
+			      VBG_KERNEL_REQUEST);
 	gdev->mouse_status_req =
 		vbg_req_alloc(sizeof(*gdev->mouse_status_req),
-			      VMMDEVREQ_GET_MOUSE_STATUS);
+			      VMMDEVREQ_GET_MOUSE_STATUS,
+			      VBG_KERNEL_REQUEST);
 
 	if (!gdev->mem_balloon.get_req || !gdev->mem_balloon.change_req ||
 	    !gdev->cancel_req || !gdev->ack_events_req ||
@@ -892,9 +923,9 @@ void vbg_core_exit(struct vbg_dev *gdev)
  * vboxguest_linux.c calls this when userspace opens the char-device.
  * Return: A pointer to the new session or an ERR_PTR on error.
  * @gdev:		The Guest extension device.
- * @user:		Set if this is a session for the vboxuser device.
+ * @requestor:		VMMDEV_REQUESTOR_* flags
  */
-struct vbg_session *vbg_core_open_session(struct vbg_dev *gdev, bool user)
+struct vbg_session *vbg_core_open_session(struct vbg_dev *gdev, u32 requestor)
 {
 	struct vbg_session *session;
 
@@ -903,7 +934,7 @@ struct vbg_session *vbg_core_open_session(struct vbg_dev *gdev, bool user)
 		return ERR_PTR(-ENOMEM);
 
 	session->gdev = gdev;
-	session->user_session = user;
+	session->requestor = requestor;
 
 	return session;
 }
@@ -924,7 +955,9 @@ void vbg_core_close_session(struct vbg_session *session)
 		if (!session->hgcm_client_ids[i])
 			continue;
 
-		vbg_hgcm_disconnect(gdev, session->hgcm_client_ids[i], &rc);
+		/* requestor is kernel here, as we're cleaning up. */
+		vbg_hgcm_disconnect(gdev, VBG_KERNEL_REQUEST,
+				    session->hgcm_client_ids[i], &rc);
 	}
 
 	kfree(session);
@@ -1152,7 +1185,8 @@ static int vbg_req_allowed(struct vbg_dev *gdev, struct vbg_session *session,
 		return -EPERM;
 	}
 
-	if (trusted_apps_only && session->user_session) {
+	if (trusted_apps_only &&
+	    (session->requestor & VMMDEV_REQUESTOR_USER_DEVICE)) {
 		vbg_err("Denying userspace vmm call type %#08x through vboxuser device node\n",
 			req->request_type);
 		return -EPERM;
@@ -1209,8 +1243,8 @@ static int vbg_ioctl_hgcm_connect(struct vbg_dev *gdev,
 	if (i >= ARRAY_SIZE(session->hgcm_client_ids))
 		return -EMFILE;
 
-	ret = vbg_hgcm_connect(gdev, &conn->u.in.loc, &client_id,
-			       &conn->hdr.rc);
+	ret = vbg_hgcm_connect(gdev, session->requestor, &conn->u.in.loc,
+			       &client_id, &conn->hdr.rc);
 
 	mutex_lock(&gdev->session_mutex);
 	if (ret == 0 && conn->hdr.rc >= 0) {
@@ -1251,7 +1285,8 @@ static int vbg_ioctl_hgcm_disconnect(struct vbg_dev *gdev,
 	if (i >= ARRAY_SIZE(session->hgcm_client_ids))
 		return -EINVAL;
 
-	ret = vbg_hgcm_disconnect(gdev, client_id, &disconn->hdr.rc);
+	ret = vbg_hgcm_disconnect(gdev, session->requestor, client_id,
+				  &disconn->hdr.rc);
 
 	mutex_lock(&gdev->session_mutex);
 	if (ret == 0 && disconn->hdr.rc >= 0)
@@ -1313,12 +1348,12 @@ static int vbg_ioctl_hgcm_call(struct vbg_dev *gdev,
 	}
 
 	if (IS_ENABLED(CONFIG_COMPAT) && f32bit)
-		ret = vbg_hgcm_call32(gdev, client_id,
+		ret = vbg_hgcm_call32(gdev, session->requestor, client_id,
 				      call->function, call->timeout_ms,
 				      VBG_IOCTL_HGCM_CALL_PARMS32(call),
 				      call->parm_count, &call->hdr.rc);
 	else
-		ret = vbg_hgcm_call(gdev, client_id,
+		ret = vbg_hgcm_call(gdev, session->requestor, client_id,
 				    call->function, call->timeout_ms,
 				    VBG_IOCTL_HGCM_CALL_PARMS(call),
 				    call->parm_count, &call->hdr.rc);
@@ -1408,6 +1443,7 @@ static int vbg_ioctl_check_balloon(struct vbg_dev *gdev,
 }
 
 static int vbg_ioctl_write_core_dump(struct vbg_dev *gdev,
+				     struct vbg_session *session,
 				     struct vbg_ioctl_write_coredump *dump)
 {
 	struct vmmdev_write_core_dump *req;
@@ -1415,7 +1451,8 @@ static int vbg_ioctl_write_core_dump(struct vbg_dev *gdev,
 	if (vbg_ioctl_chk(&dump->hdr, sizeof(dump->u.in), 0))
 		return -EINVAL;
 
-	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_WRITE_COREDUMP);
+	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_WRITE_COREDUMP,
+			    session->requestor);
 	if (!req)
 		return -ENOMEM;
 
@@ -1476,7 +1513,7 @@ int vbg_core_ioctl(struct vbg_session *session, unsigned int req, void *data)
 	case VBG_IOCTL_CHECK_BALLOON:
 		return vbg_ioctl_check_balloon(gdev, data);
 	case VBG_IOCTL_WRITE_CORE_DUMP:
-		return vbg_ioctl_write_core_dump(gdev, data);
+		return vbg_ioctl_write_core_dump(gdev, session, data);
 	}
 
 	/* Variable sized requests. */
@@ -1508,7 +1545,8 @@ int vbg_core_set_mouse_status(struct vbg_dev *gdev, u32 features)
 	struct vmmdev_mouse_status *req;
 	int rc;
 
-	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_SET_MOUSE_STATUS);
+	req = vbg_req_alloc(sizeof(*req), VMMDEVREQ_SET_MOUSE_STATUS,
+			    VBG_KERNEL_REQUEST);
 	if (!req)
 		return -ENOMEM;
 
diff --git a/drivers/virt/vboxguest/vboxguest_core.h b/drivers/virt/vboxguest/vboxguest_core.h
index 7ad9ec45bfa9..4188c12b839f 100644
--- a/drivers/virt/vboxguest/vboxguest_core.h
+++ b/drivers/virt/vboxguest/vboxguest_core.h
@@ -154,15 +154,15 @@ struct vbg_session {
 	 * host. Protected by vbg_gdev.session_mutex.
 	 */
 	u32 guest_caps;
-	/** Does this session belong to a root process or a user one? */
-	bool user_session;
+	/** VMMDEV_REQUESTOR_* flags */
+	u32 requestor;
 	/** Set on CANCEL_ALL_WAITEVENTS, protected by vbg_devevent_spinlock. */
 	bool cancel_waiters;
 };
 
 int  vbg_core_init(struct vbg_dev *gdev, u32 fixed_events);
 void vbg_core_exit(struct vbg_dev *gdev);
-struct vbg_session *vbg_core_open_session(struct vbg_dev *gdev, bool user);
+struct vbg_session *vbg_core_open_session(struct vbg_dev *gdev, u32 requestor);
 void vbg_core_close_session(struct vbg_session *session);
 int  vbg_core_ioctl(struct vbg_session *session, unsigned int req, void *data);
 int  vbg_core_set_mouse_status(struct vbg_dev *gdev, u32 features);
@@ -172,12 +172,13 @@ irqreturn_t vbg_core_isr(int irq, void *dev_id);
 void vbg_linux_mouse_event(struct vbg_dev *gdev);
 
 /* Private (non exported) functions form vboxguest_utils.c */
-void *vbg_req_alloc(size_t len, enum vmmdev_request_type req_type);
+void *vbg_req_alloc(size_t len, enum vmmdev_request_type req_type,
+		    u32 requestor);
 void vbg_req_free(void *req, size_t len);
 int vbg_req_perform(struct vbg_dev *gdev, void *req);
 int vbg_hgcm_call32(
-	struct vbg_dev *gdev, u32 client_id, u32 function, u32 timeout_ms,
-	struct vmmdev_hgcm_function_parameter32 *parm32, u32 parm_count,
-	int *vbox_status);
+	struct vbg_dev *gdev, u32 requestor, u32 client_id, u32 function,
+	u32 timeout_ms, struct vmmdev_hgcm_function_parameter32 *parm32,
+	u32 parm_count, int *vbox_status);
 
 #endif
diff --git a/drivers/virt/vboxguest/vboxguest_linux.c b/drivers/virt/vboxguest/vboxguest_linux.c
index 6e2a9619192d..6e8c0f1c1056 100644
--- a/drivers/virt/vboxguest/vboxguest_linux.c
+++ b/drivers/virt/vboxguest/vboxguest_linux.c
@@ -5,6 +5,7 @@
  * Copyright (C) 2006-2016 Oracle Corporation
  */
 
+#include <linux/cred.h>
 #include <linux/input.h>
 #include <linux/kernel.h>
 #include <linux/miscdevice.h>
@@ -28,6 +29,23 @@ static DEFINE_MUTEX(vbg_gdev_mutex);
 /** Global vbg_gdev pointer used by vbg_get/put_gdev. */
 static struct vbg_dev *vbg_gdev;
 
+static u32 vbg_misc_device_requestor(struct inode *inode)
+{
+	u32 requestor = VMMDEV_REQUESTOR_USERMODE |
+			VMMDEV_REQUESTOR_CON_DONT_KNOW |
+			VMMDEV_REQUESTOR_TRUST_NOT_GIVEN;
+
+	if (from_kuid(current_user_ns(), current->cred->uid) == 0)
+		requestor |= VMMDEV_REQUESTOR_USR_ROOT;
+	else
+		requestor |= VMMDEV_REQUESTOR_USR_USER;
+
+	if (in_egroup_p(inode->i_gid))
+		requestor |= VMMDEV_REQUESTOR_GRP_VBOX;
+
+	return requestor;
+}
+
 static int vbg_misc_device_open(struct inode *inode, struct file *filp)
 {
 	struct vbg_session *session;
@@ -36,7 +54,7 @@ static int vbg_misc_device_open(struct inode *inode, struct file *filp)
 	/* misc_open sets filp->private_data to our misc device */
 	gdev = container_of(filp->private_data, struct vbg_dev, misc_device);
 
-	session = vbg_core_open_session(gdev, false);
+	session = vbg_core_open_session(gdev, vbg_misc_device_requestor(inode));
 	if (IS_ERR(session))
 		return PTR_ERR(session);
 
@@ -53,7 +71,8 @@ static int vbg_misc_device_user_open(struct inode *inode, struct file *filp)
 	gdev = container_of(filp->private_data, struct vbg_dev,
 			    misc_device_user);
 
-	session = vbg_core_open_session(gdev, false);
+	session = vbg_core_open_session(gdev, vbg_misc_device_requestor(inode) |
+					      VMMDEV_REQUESTOR_USER_DEVICE);
 	if (IS_ERR(session))
 		return PTR_ERR(session);
 
@@ -115,7 +134,8 @@ static long vbg_misc_device_ioctl(struct file *filp, unsigned int req,
 			 req == VBG_IOCTL_VMMDEV_REQUEST_BIG;
 
 	if (is_vmmdev_req)
-		buf = vbg_req_alloc(size, VBG_IOCTL_HDR_TYPE_DEFAULT);
+		buf = vbg_req_alloc(size, VBG_IOCTL_HDR_TYPE_DEFAULT,
+				    session->requestor);
 	else
 		buf = kmalloc(size, GFP_KERNEL);
 	if (!buf)
diff --git a/drivers/virt/vboxguest/vboxguest_utils.c b/drivers/virt/vboxguest/vboxguest_utils.c
index bf4474214b4d..75fd140b02ff 100644
--- a/drivers/virt/vboxguest/vboxguest_utils.c
+++ b/drivers/virt/vboxguest/vboxguest_utils.c
@@ -62,7 +62,8 @@ VBG_LOG(vbg_err, pr_err);
 VBG_LOG(vbg_debug, pr_debug);
 #endif
 
-void *vbg_req_alloc(size_t len, enum vmmdev_request_type req_type)
+void *vbg_req_alloc(size_t len, enum vmmdev_request_type req_type,
+		    u32 requestor)
 {
 	struct vmmdev_request_header *req;
 	int order = get_order(PAGE_ALIGN(len));
@@ -78,7 +79,7 @@ void *vbg_req_alloc(size_t len, enum vmmdev_request_type req_type)
 	req->request_type = req_type;
 	req->rc = VERR_GENERAL_FAILURE;
 	req->reserved1 = 0;
-	req->reserved2 = 0;
+	req->requestor = requestor;
 
 	return req;
 }
@@ -119,7 +120,7 @@ static bool hgcm_req_done(struct vbg_dev *gdev,
 	return done;
 }
 
-int vbg_hgcm_connect(struct vbg_dev *gdev,
+int vbg_hgcm_connect(struct vbg_dev *gdev, u32 requestor,
 		     struct vmmdev_hgcm_service_location *loc,
 		     u32 *client_id, int *vbox_status)
 {
@@ -127,7 +128,7 @@ int vbg_hgcm_connect(struct vbg_dev *gdev,
 	int rc;
 
 	hgcm_connect = vbg_req_alloc(sizeof(*hgcm_connect),
-				     VMMDEVREQ_HGCM_CONNECT);
+				     VMMDEVREQ_HGCM_CONNECT, requestor);
 	if (!hgcm_connect)
 		return -ENOMEM;
 
@@ -153,13 +154,15 @@ int vbg_hgcm_connect(struct vbg_dev *gdev,
 }
 EXPORT_SYMBOL(vbg_hgcm_connect);
 
-int vbg_hgcm_disconnect(struct vbg_dev *gdev, u32 client_id, int *vbox_status)
+int vbg_hgcm_disconnect(struct vbg_dev *gdev, u32 requestor,
+			u32 client_id, int *vbox_status)
 {
 	struct vmmdev_hgcm_disconnect *hgcm_disconnect = NULL;
 	int rc;
 
 	hgcm_disconnect = vbg_req_alloc(sizeof(*hgcm_disconnect),
-					VMMDEVREQ_HGCM_DISCONNECT);
+					VMMDEVREQ_HGCM_DISCONNECT,
+					requestor);
 	if (!hgcm_disconnect)
 		return -ENOMEM;
 
@@ -593,9 +596,10 @@ static int hgcm_call_copy_back_result(
 	return 0;
 }
 
-int vbg_hgcm_call(struct vbg_dev *gdev, u32 client_id, u32 function,
-		  u32 timeout_ms, struct vmmdev_hgcm_function_parameter *parms,
-		  u32 parm_count, int *vbox_status)
+int vbg_hgcm_call(struct vbg_dev *gdev, u32 requestor, u32 client_id,
+		  u32 function, u32 timeout_ms,
+		  struct vmmdev_hgcm_function_parameter *parms, u32 parm_count,
+		  int *vbox_status)
 {
 	struct vmmdev_hgcm_call *call;
 	void **bounce_bufs = NULL;
@@ -615,7 +619,7 @@ int vbg_hgcm_call(struct vbg_dev *gdev, u32 client_id, u32 function,
 		goto free_bounce_bufs;
 	}
 
-	call = vbg_req_alloc(size, VMMDEVREQ_HGCM_CALL);
+	call = vbg_req_alloc(size, VMMDEVREQ_HGCM_CALL, requestor);
 	if (!call) {
 		ret = -ENOMEM;
 		goto free_bounce_bufs;
@@ -647,9 +651,9 @@ EXPORT_SYMBOL(vbg_hgcm_call);
 
 #ifdef CONFIG_COMPAT
 int vbg_hgcm_call32(
-	struct vbg_dev *gdev, u32 client_id, u32 function, u32 timeout_ms,
-	struct vmmdev_hgcm_function_parameter32 *parm32, u32 parm_count,
-	int *vbox_status)
+	struct vbg_dev *gdev, u32 requestor, u32 client_id, u32 function,
+	u32 timeout_ms, struct vmmdev_hgcm_function_parameter32 *parm32,
+	u32 parm_count, int *vbox_status)
 {
 	struct vmmdev_hgcm_function_parameter *parm64 = NULL;
 	u32 i, size;
@@ -689,7 +693,7 @@ int vbg_hgcm_call32(
 			goto out_free;
 	}
 
-	ret = vbg_hgcm_call(gdev, client_id, function, timeout_ms,
+	ret = vbg_hgcm_call(gdev, requestor, client_id, function, timeout_ms,
 			    parm64, parm_count, vbox_status);
 	if (ret < 0)
 		goto out_free;
diff --git a/drivers/virt/vboxguest/vboxguest_version.h b/drivers/virt/vboxguest/vboxguest_version.h
index 77f0c8f8a231..84834dad38d5 100644
--- a/drivers/virt/vboxguest/vboxguest_version.h
+++ b/drivers/virt/vboxguest/vboxguest_version.h
@@ -9,11 +9,10 @@
 #ifndef __VBOX_VERSION_H__
 #define __VBOX_VERSION_H__
 
-/* Last synced October 4th 2017 */
-#define VBG_VERSION_MAJOR 5
-#define VBG_VERSION_MINOR 2
+#define VBG_VERSION_MAJOR 6
+#define VBG_VERSION_MINOR 0
 #define VBG_VERSION_BUILD 0
-#define VBG_SVN_REV 68940
-#define VBG_VERSION_STRING "5.2.0"
+#define VBG_SVN_REV 127566
+#define VBG_VERSION_STRING "6.0.0"
 
 #endif
diff --git a/drivers/virt/vboxguest/vmmdev.h b/drivers/virt/vboxguest/vmmdev.h
index 5e2ae978935d..6337b8d75d96 100644
--- a/drivers/virt/vboxguest/vmmdev.h
+++ b/drivers/virt/vboxguest/vmmdev.h
@@ -98,8 +98,8 @@ struct vmmdev_request_header {
 	s32 rc;
 	/** Reserved field no.1. MBZ. */
 	u32 reserved1;
-	/** Reserved field no.2. MBZ. */
-	u32 reserved2;
+	/** IN: Requestor information (VMMDEV_REQUESTOR_*) */
+	u32 requestor;
 };
 VMMDEV_ASSERT_SIZE(vmmdev_request_header, 24);
 
@@ -247,6 +247,8 @@ struct vmmdev_guest_info {
 };
 VMMDEV_ASSERT_SIZE(vmmdev_guest_info, 24 + 8);
 
+#define VMMDEV_GUEST_INFO2_ADDITIONS_FEATURES_REQUESTOR_INFO	BIT(0)
+
 /** struct vmmdev_guestinfo2 - Guest information report, version 2. */
 struct vmmdev_guest_info2 {
 	/** Header. */
@@ -259,7 +261,7 @@ struct vmmdev_guest_info2 {
 	u32 additions_build;
 	/** SVN revision. */
 	u32 additions_revision;
-	/** Feature mask, currently unused. */
+	/** Feature mask. */
 	u32 additions_features;
 	/**
 	 * The intentional meaning of this field was:
diff --git a/include/linux/vbox_utils.h b/include/linux/vbox_utils.h
index a240ed2a0372..ff56c443180c 100644
--- a/include/linux/vbox_utils.h
+++ b/include/linux/vbox_utils.h
@@ -24,15 +24,17 @@ __printf(1, 2) void vbg_debug(const char *fmt, ...);
 #define vbg_debug pr_debug
 #endif
 
-int vbg_hgcm_connect(struct vbg_dev *gdev,
+int vbg_hgcm_connect(struct vbg_dev *gdev, u32 requestor,
 		     struct vmmdev_hgcm_service_location *loc,
 		     u32 *client_id, int *vbox_status);
 
-int vbg_hgcm_disconnect(struct vbg_dev *gdev, u32 client_id, int *vbox_status);
+int vbg_hgcm_disconnect(struct vbg_dev *gdev, u32 requestor,
+			u32 client_id, int *vbox_status);
 
-int vbg_hgcm_call(struct vbg_dev *gdev, u32 client_id, u32 function,
-		  u32 timeout_ms, struct vmmdev_hgcm_function_parameter *parms,
-		  u32 parm_count, int *vbox_status);
+int vbg_hgcm_call(struct vbg_dev *gdev, u32 requestor, u32 client_id,
+		  u32 function, u32 timeout_ms,
+		  struct vmmdev_hgcm_function_parameter *parms, u32 parm_count,
+		  int *vbox_status);
 
 /**
  * Convert a VirtualBox status code to a standard Linux kernel return value.
diff --git a/include/uapi/linux/vbox_vmmdev_types.h b/include/uapi/linux/vbox_vmmdev_types.h
index 0e68024f36c7..26f39816af14 100644
--- a/include/uapi/linux/vbox_vmmdev_types.h
+++ b/include/uapi/linux/vbox_vmmdev_types.h
@@ -102,6 +102,66 @@ enum vmmdev_request_type {
 #define VMMDEVREQ_HGCM_CALL VMMDEVREQ_HGCM_CALL32
 #endif
 
+/* vmmdev_request_header.requestor defines */
+
+/* Requestor user not given. */
+#define VMMDEV_REQUESTOR_USR_NOT_GIVEN                      0x00000000
+/* The kernel driver (vboxguest) is the requestor. */
+#define VMMDEV_REQUESTOR_USR_DRV                            0x00000001
+/* Some other kernel driver is the requestor. */
+#define VMMDEV_REQUESTOR_USR_DRV_OTHER                      0x00000002
+/* The root or a admin user is the requestor. */
+#define VMMDEV_REQUESTOR_USR_ROOT                           0x00000003
+/* Regular joe user is making the request. */
+#define VMMDEV_REQUESTOR_USR_USER                           0x00000006
+/* User classification mask. */
+#define VMMDEV_REQUESTOR_USR_MASK                           0x00000007
+
+/* Kernel mode request. Note this is 0, check for !USERMODE instead. */
+#define VMMDEV_REQUESTOR_KERNEL                             0x00000000
+/* User mode request. */
+#define VMMDEV_REQUESTOR_USERMODE                           0x00000008
+/* User or kernel mode classification mask. */
+#define VMMDEV_REQUESTOR_MODE_MASK                          0x00000008
+
+/* Don't know the physical console association of the requestor. */
+#define VMMDEV_REQUESTOR_CON_DONT_KNOW                      0x00000000
+/*
+ * The request originates with a process that is NOT associated with the
+ * physical console.
+ */
+#define VMMDEV_REQUESTOR_CON_NO                             0x00000010
+/* Requestor process is associated with the physical console. */
+#define VMMDEV_REQUESTOR_CON_YES                            0x00000020
+/* Console classification mask. */
+#define VMMDEV_REQUESTOR_CON_MASK                           0x00000030
+
+/* Requestor is member of special VirtualBox user group. */
+#define VMMDEV_REQUESTOR_GRP_VBOX                           0x00000080
+
+/* Note: trust level is for windows guests only, linux always uses not-given */
+/* Requestor trust level: Unspecified */
+#define VMMDEV_REQUESTOR_TRUST_NOT_GIVEN                    0x00000000
+/* Requestor trust level: Untrusted (SID S-1-16-0) */
+#define VMMDEV_REQUESTOR_TRUST_UNTRUSTED                    0x00001000
+/* Requestor trust level: Untrusted (SID S-1-16-4096) */
+#define VMMDEV_REQUESTOR_TRUST_LOW                          0x00002000
+/* Requestor trust level: Medium (SID S-1-16-8192) */
+#define VMMDEV_REQUESTOR_TRUST_MEDIUM                       0x00003000
+/* Requestor trust level: Medium plus (SID S-1-16-8448) */
+#define VMMDEV_REQUESTOR_TRUST_MEDIUM_PLUS                  0x00004000
+/* Requestor trust level: High (SID S-1-16-12288) */
+#define VMMDEV_REQUESTOR_TRUST_HIGH                         0x00005000
+/* Requestor trust level: System (SID S-1-16-16384) */
+#define VMMDEV_REQUESTOR_TRUST_SYSTEM                       0x00006000
+/* Requestor trust level >= Protected (SID S-1-16-20480, S-1-16-28672) */
+#define VMMDEV_REQUESTOR_TRUST_PROTECTED                    0x00007000
+/* Requestor trust level mask */
+#define VMMDEV_REQUESTOR_TRUST_MASK                         0x00007000
+
+/* Requestor is using the less trusted user device node (/dev/vboxuser) */
+#define VMMDEV_REQUESTOR_USER_DEVICE                        0x00008000
+
 /** HGCM service location types. */
 enum vmmdev_hgcm_service_location_type {
 	VMMDEV_HGCM_LOC_INVALID    = 0,
-- 
cgit v1.2.3-59-g8ed1b


From 3d9683cf3bfb6d4e4605a153958dfca7e18b52f2 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Mon, 18 Mar 2019 18:08:12 +0900
Subject: KVM: export <linux/kvm_para.h> and <asm/kvm_para.h> iif KVM is
 supported

I do not see any consistency about headers_install of <linux/kvm_para.h>
and <asm/kvm_para.h>.

According to my analysis of Linux 5.1-rc1, there are 3 groups:

 [1] Both <linux/kvm_para.h> and <asm/kvm_para.h> are exported

    alpha, arm, hexagon, mips, powerpc, s390, sparc, x86

 [2] <asm/kvm_para.h> is exported, but <linux/kvm_para.h> is not

    arc, arm64, c6x, h8300, ia64, m68k, microblaze, nios2, openrisc,
    parisc, sh, unicore32, xtensa

 [3] Neither <linux/kvm_para.h> nor <asm/kvm_para.h> is exported

    csky, nds32, riscv

This does not match to the actual KVM support. At least, [2] is
half-baked.

Nor do arch maintainers look like they care about this. For example,
commit 0add53713b1c ("microblaze: Add missing kvm_para.h to Kbuild")
exported <asm/kvm_para.h> to user-space in order to fix an in-kernel
build error.

We have two ways to make this consistent:

 [A] export both <linux/kvm_para.h> and <asm/kvm_para.h> for all
     architectures, irrespective of the KVM support

 [B] Match the header export of <linux/kvm_para.h> and <asm/kvm_para.h>
     to the KVM support

My first attempt was [A] because the code looks cleaner, but Paolo
suggested [B].

So, this commit goes with [B].

For most architectures, <asm/kvm_para.h> was moved to the kernel-space.
I changed include/uapi/linux/Kbuild so that it checks generated
asm/kvm_para.h as well as check-in ones.

After this commit, there will be two groups:

 [1] Both <linux/kvm_para.h> and <asm/kvm_para.h> are exported

    arm, arm64, mips, powerpc, s390, x86

 [2] Neither <linux/kvm_para.h> nor <asm/kvm_para.h> is exported

    alpha, arc, c6x, csky, h8300, hexagon, ia64, m68k, microblaze,
    nds32, nios2, openrisc, parisc, riscv, sh, sparc, unicore32, xtensa

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Acked-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/alpha/include/asm/Kbuild            | 1 +
 arch/alpha/include/uapi/asm/kvm_para.h   | 2 --
 arch/arc/include/asm/Kbuild              | 1 +
 arch/arc/include/uapi/asm/Kbuild         | 1 -
 arch/arm/include/uapi/asm/Kbuild         | 1 +
 arch/arm/include/uapi/asm/kvm_para.h     | 2 --
 arch/c6x/include/asm/Kbuild              | 1 +
 arch/c6x/include/uapi/asm/Kbuild         | 1 -
 arch/h8300/include/asm/Kbuild            | 1 +
 arch/h8300/include/uapi/asm/Kbuild       | 1 -
 arch/hexagon/include/asm/Kbuild          | 1 +
 arch/hexagon/include/uapi/asm/kvm_para.h | 2 --
 arch/ia64/include/asm/Kbuild             | 1 +
 arch/ia64/include/uapi/asm/Kbuild        | 1 -
 arch/m68k/include/asm/Kbuild             | 1 +
 arch/m68k/include/uapi/asm/Kbuild        | 1 -
 arch/microblaze/include/asm/Kbuild       | 1 +
 arch/microblaze/include/uapi/asm/Kbuild  | 1 -
 arch/nios2/include/asm/Kbuild            | 1 +
 arch/nios2/include/uapi/asm/Kbuild       | 1 -
 arch/openrisc/include/asm/Kbuild         | 1 +
 arch/openrisc/include/uapi/asm/Kbuild    | 1 -
 arch/parisc/include/asm/Kbuild           | 1 +
 arch/parisc/include/uapi/asm/Kbuild      | 1 -
 arch/sh/include/asm/Kbuild               | 1 +
 arch/sh/include/uapi/asm/Kbuild          | 1 -
 arch/sparc/include/asm/Kbuild            | 1 +
 arch/sparc/include/uapi/asm/kvm_para.h   | 2 --
 arch/unicore32/include/asm/Kbuild        | 1 +
 arch/unicore32/include/uapi/asm/Kbuild   | 1 -
 arch/xtensa/include/asm/Kbuild           | 1 +
 arch/xtensa/include/uapi/asm/Kbuild      | 1 -
 include/uapi/linux/Kbuild                | 2 ++
 33 files changed, 18 insertions(+), 20 deletions(-)
 delete mode 100644 arch/alpha/include/uapi/asm/kvm_para.h
 delete mode 100644 arch/arm/include/uapi/asm/kvm_para.h
 delete mode 100644 arch/hexagon/include/uapi/asm/kvm_para.h
 delete mode 100644 arch/sparc/include/uapi/asm/kvm_para.h

(limited to 'include/uapi')

diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild
index dc0ab28baca1..70b783333965 100644
--- a/arch/alpha/include/asm/Kbuild
+++ b/arch/alpha/include/asm/Kbuild
@@ -6,6 +6,7 @@ generic-y += exec.h
 generic-y += export.h
 generic-y += fb.h
 generic-y += irq_work.h
+generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
 generic-y += preempt.h
diff --git a/arch/alpha/include/uapi/asm/kvm_para.h b/arch/alpha/include/uapi/asm/kvm_para.h
deleted file mode 100644
index baacc4996d18..000000000000
--- a/arch/alpha/include/uapi/asm/kvm_para.h
+++ /dev/null
@@ -1,2 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#include <asm-generic/kvm_para.h>
diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild
index b41f8881ecc8..decc306a3b52 100644
--- a/arch/arc/include/asm/Kbuild
+++ b/arch/arc/include/asm/Kbuild
@@ -11,6 +11,7 @@ generic-y += hardirq.h
 generic-y += hw_irq.h
 generic-y += irq_regs.h
 generic-y += irq_work.h
+generic-y += kvm_para.h
 generic-y += local.h
 generic-y += local64.h
 generic-y += mcs_spinlock.h
diff --git a/arch/arc/include/uapi/asm/Kbuild b/arch/arc/include/uapi/asm/Kbuild
index 755bb11323d8..1c72f04ff75d 100644
--- a/arch/arc/include/uapi/asm/Kbuild
+++ b/arch/arc/include/uapi/asm/Kbuild
@@ -1,2 +1 @@
-generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/arm/include/uapi/asm/Kbuild b/arch/arm/include/uapi/asm/Kbuild
index 23b4464c0995..ce8573157774 100644
--- a/arch/arm/include/uapi/asm/Kbuild
+++ b/arch/arm/include/uapi/asm/Kbuild
@@ -3,3 +3,4 @@
 generated-y += unistd-common.h
 generated-y += unistd-oabi.h
 generated-y += unistd-eabi.h
+generic-y += kvm_para.h
diff --git a/arch/arm/include/uapi/asm/kvm_para.h b/arch/arm/include/uapi/asm/kvm_para.h
deleted file mode 100644
index baacc4996d18..000000000000
--- a/arch/arm/include/uapi/asm/kvm_para.h
+++ /dev/null
@@ -1,2 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#include <asm-generic/kvm_para.h>
diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild
index 63b4a1705182..249c9f6f26dc 100644
--- a/arch/c6x/include/asm/Kbuild
+++ b/arch/c6x/include/asm/Kbuild
@@ -19,6 +19,7 @@ generic-y += irq_work.h
 generic-y += kdebug.h
 generic-y += kmap_types.h
 generic-y += kprobes.h
+generic-y += kvm_para.h
 generic-y += local.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
diff --git a/arch/c6x/include/uapi/asm/Kbuild b/arch/c6x/include/uapi/asm/Kbuild
index 755bb11323d8..1c72f04ff75d 100644
--- a/arch/c6x/include/uapi/asm/Kbuild
+++ b/arch/c6x/include/uapi/asm/Kbuild
@@ -1,2 +1 @@
-generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild
index 3e7c8ecf151e..e3dead402e5f 100644
--- a/arch/h8300/include/asm/Kbuild
+++ b/arch/h8300/include/asm/Kbuild
@@ -23,6 +23,7 @@ generic-y += irq_work.h
 generic-y += kdebug.h
 generic-y += kmap_types.h
 generic-y += kprobes.h
+generic-y += kvm_para.h
 generic-y += linkage.h
 generic-y += local.h
 generic-y += local64.h
diff --git a/arch/h8300/include/uapi/asm/Kbuild b/arch/h8300/include/uapi/asm/Kbuild
index 755bb11323d8..1c72f04ff75d 100644
--- a/arch/h8300/include/uapi/asm/Kbuild
+++ b/arch/h8300/include/uapi/asm/Kbuild
@@ -1,2 +1 @@
-generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild
index b25fd42aa0f4..d046e8ccdf78 100644
--- a/arch/hexagon/include/asm/Kbuild
+++ b/arch/hexagon/include/asm/Kbuild
@@ -19,6 +19,7 @@ generic-y += irq_work.h
 generic-y += kdebug.h
 generic-y += kmap_types.h
 generic-y += kprobes.h
+generic-y += kvm_para.h
 generic-y += local.h
 generic-y += local64.h
 generic-y += mcs_spinlock.h
diff --git a/arch/hexagon/include/uapi/asm/kvm_para.h b/arch/hexagon/include/uapi/asm/kvm_para.h
deleted file mode 100644
index baacc4996d18..000000000000
--- a/arch/hexagon/include/uapi/asm/kvm_para.h
+++ /dev/null
@@ -1,2 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#include <asm-generic/kvm_para.h>
diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild
index 43e21fe3499c..11f191689c9e 100644
--- a/arch/ia64/include/asm/Kbuild
+++ b/arch/ia64/include/asm/Kbuild
@@ -2,6 +2,7 @@ generated-y += syscall_table.h
 generic-y += compat.h
 generic-y += exec.h
 generic-y += irq_work.h
+generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
 generic-y += preempt.h
diff --git a/arch/ia64/include/uapi/asm/Kbuild b/arch/ia64/include/uapi/asm/Kbuild
index 20018cb883a9..62a9522af51e 100644
--- a/arch/ia64/include/uapi/asm/Kbuild
+++ b/arch/ia64/include/uapi/asm/Kbuild
@@ -1,2 +1 @@
 generated-y += unistd_64.h
-generic-y += kvm_para.h
diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild
index 95f8f631c4df..2c359d9e80f6 100644
--- a/arch/m68k/include/asm/Kbuild
+++ b/arch/m68k/include/asm/Kbuild
@@ -13,6 +13,7 @@ generic-y += irq_work.h
 generic-y += kdebug.h
 generic-y += kmap_types.h
 generic-y += kprobes.h
+generic-y += kvm_para.h
 generic-y += local.h
 generic-y += local64.h
 generic-y += mcs_spinlock.h
diff --git a/arch/m68k/include/uapi/asm/Kbuild b/arch/m68k/include/uapi/asm/Kbuild
index 8a7ad40be463..7417847dc438 100644
--- a/arch/m68k/include/uapi/asm/Kbuild
+++ b/arch/m68k/include/uapi/asm/Kbuild
@@ -1,2 +1 @@
 generated-y += unistd_32.h
-generic-y += kvm_para.h
diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild
index 791cc8d54d0a..1a8285c3f693 100644
--- a/arch/microblaze/include/asm/Kbuild
+++ b/arch/microblaze/include/asm/Kbuild
@@ -17,6 +17,7 @@ generic-y += irq_work.h
 generic-y += kdebug.h
 generic-y += kmap_types.h
 generic-y += kprobes.h
+generic-y += kvm_para.h
 generic-y += linkage.h
 generic-y += local.h
 generic-y += local64.h
diff --git a/arch/microblaze/include/uapi/asm/Kbuild b/arch/microblaze/include/uapi/asm/Kbuild
index 3ce84fbb2678..13f59631c576 100644
--- a/arch/microblaze/include/uapi/asm/Kbuild
+++ b/arch/microblaze/include/uapi/asm/Kbuild
@@ -1,3 +1,2 @@
 generated-y += unistd_32.h
-generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/nios2/include/asm/Kbuild b/arch/nios2/include/asm/Kbuild
index 8fde4fa2c34f..88a667d12aaa 100644
--- a/arch/nios2/include/asm/Kbuild
+++ b/arch/nios2/include/asm/Kbuild
@@ -23,6 +23,7 @@ generic-y += irq_work.h
 generic-y += kdebug.h
 generic-y += kmap_types.h
 generic-y += kprobes.h
+generic-y += kvm_para.h
 generic-y += local.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
diff --git a/arch/nios2/include/uapi/asm/Kbuild b/arch/nios2/include/uapi/asm/Kbuild
index 755bb11323d8..1c72f04ff75d 100644
--- a/arch/nios2/include/uapi/asm/Kbuild
+++ b/arch/nios2/include/uapi/asm/Kbuild
@@ -1,2 +1 @@
-generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild
index 5a73e2956ac4..22aa97136c01 100644
--- a/arch/openrisc/include/asm/Kbuild
+++ b/arch/openrisc/include/asm/Kbuild
@@ -20,6 +20,7 @@ generic-y += irq_work.h
 generic-y += kdebug.h
 generic-y += kmap_types.h
 generic-y += kprobes.h
+generic-y += kvm_para.h
 generic-y += local.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
diff --git a/arch/openrisc/include/uapi/asm/Kbuild b/arch/openrisc/include/uapi/asm/Kbuild
index 755bb11323d8..1c72f04ff75d 100644
--- a/arch/openrisc/include/uapi/asm/Kbuild
+++ b/arch/openrisc/include/uapi/asm/Kbuild
@@ -1,2 +1 @@
-generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild
index 6f49e77d82a2..9bcd0c903dbb 100644
--- a/arch/parisc/include/asm/Kbuild
+++ b/arch/parisc/include/asm/Kbuild
@@ -11,6 +11,7 @@ generic-y += irq_regs.h
 generic-y += irq_work.h
 generic-y += kdebug.h
 generic-y += kprobes.h
+generic-y += kvm_para.h
 generic-y += local.h
 generic-y += local64.h
 generic-y += mcs_spinlock.h
diff --git a/arch/parisc/include/uapi/asm/Kbuild b/arch/parisc/include/uapi/asm/Kbuild
index 22fdbd08cdc8..2bd5b392277c 100644
--- a/arch/parisc/include/uapi/asm/Kbuild
+++ b/arch/parisc/include/uapi/asm/Kbuild
@@ -1,3 +1,2 @@
 generated-y += unistd_32.h
 generated-y += unistd_64.h
-generic-y += kvm_para.h
diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild
index a6ef3fee5f85..7bf2cb680d32 100644
--- a/arch/sh/include/asm/Kbuild
+++ b/arch/sh/include/asm/Kbuild
@@ -9,6 +9,7 @@ generic-y += emergency-restart.h
 generic-y += exec.h
 generic-y += irq_regs.h
 generic-y += irq_work.h
+generic-y += kvm_para.h
 generic-y += local.h
 generic-y += local64.h
 generic-y += mcs_spinlock.h
diff --git a/arch/sh/include/uapi/asm/Kbuild b/arch/sh/include/uapi/asm/Kbuild
index ecfbd40924dd..b8812c74c1de 100644
--- a/arch/sh/include/uapi/asm/Kbuild
+++ b/arch/sh/include/uapi/asm/Kbuild
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
 
 generated-y += unistd_32.h
-generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild
index b82f64e28f55..a22cfd5c0ee8 100644
--- a/arch/sparc/include/asm/Kbuild
+++ b/arch/sparc/include/asm/Kbuild
@@ -9,6 +9,7 @@ generic-y += exec.h
 generic-y += export.h
 generic-y += irq_regs.h
 generic-y += irq_work.h
+generic-y += kvm_para.h
 generic-y += linkage.h
 generic-y += local.h
 generic-y += local64.h
diff --git a/arch/sparc/include/uapi/asm/kvm_para.h b/arch/sparc/include/uapi/asm/kvm_para.h
deleted file mode 100644
index baacc4996d18..000000000000
--- a/arch/sparc/include/uapi/asm/kvm_para.h
+++ /dev/null
@@ -1,2 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#include <asm-generic/kvm_para.h>
diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild
index 1d1544b6ca74..d77d953c04c1 100644
--- a/arch/unicore32/include/asm/Kbuild
+++ b/arch/unicore32/include/asm/Kbuild
@@ -18,6 +18,7 @@ generic-y += irq_work.h
 generic-y += kdebug.h
 generic-y += kmap_types.h
 generic-y += kprobes.h
+generic-y += kvm_para.h
 generic-y += local.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
diff --git a/arch/unicore32/include/uapi/asm/Kbuild b/arch/unicore32/include/uapi/asm/Kbuild
index 755bb11323d8..1c72f04ff75d 100644
--- a/arch/unicore32/include/uapi/asm/Kbuild
+++ b/arch/unicore32/include/uapi/asm/Kbuild
@@ -1,2 +1 @@
-generic-y += kvm_para.h
 generic-y += ucontext.h
diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild
index 42b6cb3d16f7..3843198e03d4 100644
--- a/arch/xtensa/include/asm/Kbuild
+++ b/arch/xtensa/include/asm/Kbuild
@@ -15,6 +15,7 @@ generic-y += irq_work.h
 generic-y += kdebug.h
 generic-y += kmap_types.h
 generic-y += kprobes.h
+generic-y += kvm_para.h
 generic-y += local.h
 generic-y += local64.h
 generic-y += mcs_spinlock.h
diff --git a/arch/xtensa/include/uapi/asm/Kbuild b/arch/xtensa/include/uapi/asm/Kbuild
index 8a7ad40be463..7417847dc438 100644
--- a/arch/xtensa/include/uapi/asm/Kbuild
+++ b/arch/xtensa/include/uapi/asm/Kbuild
@@ -1,2 +1 @@
 generated-y += unistd_32.h
-generic-y += kvm_para.h
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index 5f24b50c9e88..059dc2bedaf6 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -7,5 +7,7 @@ no-export-headers += kvm.h
 endif
 
 ifeq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/uapi/asm/kvm_para.h),)
+ifeq ($(wildcard $(objtree)/arch/$(SRCARCH)/include/generated/uapi/asm/kvm_para.h),)
 no-export-headers += kvm_para.h
 endif
+endif
-- 
cgit v1.2.3-59-g8ed1b


From b5bdbb6ccd1117896bf4fba6bff75336ca423e8c Mon Sep 17 00:00:00 2001
From: Daniel Mentz <danielmentz@google.com>
Date: Fri, 29 Mar 2019 15:48:54 -0700
Subject: ALSA: uapi: #include <time.h> in asound.h

The uapi header asound.h defines types based on struct timespec. We need
to #include <time.h> to get access to the definition of this struct.

Previously, we encountered the following error message when building
applications with a clang/bionic toolchain:

kernel-headers/sound/asound.h:350:19: error: field has incomplete type 'struct timespec'
  struct timespec trigger_tstamp;
                  ^

The absence of the time.h #include statement does not cause build errors
with glibc, because its version of stdlib.h indirectly includes time.h.

Signed-off-by: Daniel Mentz <danielmentz@google.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/uapi/sound/asound.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/sound/asound.h b/include/uapi/sound/asound.h
index 404d4b9ffe76..df1153cea0b7 100644
--- a/include/uapi/sound/asound.h
+++ b/include/uapi/sound/asound.h
@@ -32,6 +32,7 @@
 
 #ifndef __KERNEL__
 #include <stdlib.h>
+#include <time.h>
 #endif
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From afe64245af9f58267e7fa8fb76ad5650ee7ec25f Mon Sep 17 00:00:00 2001
From: Michael Zhivich <mzhivich@akamai.com>
Date: Mon, 8 Apr 2019 10:48:45 -0400
Subject: ethtool: avoid signed-unsigned comparison in ethtool_validate_speed()

When building C++ userspace code that includes ethtool.h
with "-Werror -Wall", g++ complains about signed-unsigned comparison in
ethtool_validate_speed() due to definition of SPEED_UNKNOWN as -1.

Explicitly cast SPEED_UNKNOWN to __u32 to match type of
ethtool_validate_speed() argument.

Signed-off-by: Michael Zhivich <mzhivich@akamai.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 3652b239dad1..d473e5ed044c 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1591,7 +1591,7 @@ enum ethtool_link_mode_bit_indices {
 
 static inline int ethtool_validate_speed(__u32 speed)
 {
-	return speed <= INT_MAX || speed == SPEED_UNKNOWN;
+	return speed <= INT_MAX || speed == (__u32)SPEED_UNKNOWN;
 }
 
 /* Duplex, half or full. */
-- 
cgit v1.2.3-59-g8ed1b


From 7249c8ea227a582c14f63e9e8853eb7369122f10 Mon Sep 17 00:00:00 2001
From: Guy Levi <guyle@mellanox.com>
Date: Wed, 10 Apr 2019 10:59:45 +0300
Subject: IB/mlx5: Fix scatter to CQE in DCT QP creation

When scatter to CQE is enabled on a DCT QP it corrupts the mailbox command
since it tried to treat it as as QP create mailbox command instead of a
DCT create command.

The corrupted mailbox command causes userspace to malfunction as the
device doesn't create the QP as expected.

A new mlx5 capability is exposed to user-space which ensures that it will
not enable the feature on DCT without this fix in the kernel.

Fixes: 5d6ff1babe78 ("IB/mlx5: Support scatter to CQE for DC transport type")
Signed-off-by: Guy Levi <guyle@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/main.c |  2 ++
 drivers/infiniband/hw/mlx5/qp.c   | 11 +++++++----
 include/uapi/rdma/mlx5-abi.h      |  1 +
 3 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 531ff20b32ad..241d9ead79eb 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1119,6 +1119,8 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 		if (MLX5_CAP_GEN(mdev, qp_packet_based))
 			resp.flags |=
 				MLX5_IB_QUERY_DEV_RESP_PACKET_BASED_CREDIT_MODE;
+
+		resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT;
 	}
 
 	if (field_avail(typeof(resp), sw_parsing_caps,
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 7cd006da1dae..8870c350fda0 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -1818,13 +1818,16 @@ static void configure_responder_scat_cqe(struct ib_qp_init_attr *init_attr,
 
 	rcqe_sz = mlx5_ib_get_cqe_size(init_attr->recv_cq);
 
-	if (rcqe_sz == 128) {
-		MLX5_SET(qpc, qpc, cs_res, MLX5_RES_SCAT_DATA64_CQE);
+	if (init_attr->qp_type == MLX5_IB_QPT_DCT) {
+		if (rcqe_sz == 128)
+			MLX5_SET(dctc, qpc, cs_res, MLX5_RES_SCAT_DATA64_CQE);
+
 		return;
 	}
 
-	if (init_attr->qp_type != MLX5_IB_QPT_DCT)
-		MLX5_SET(qpc, qpc, cs_res, MLX5_RES_SCAT_DATA32_CQE);
+	MLX5_SET(qpc, qpc, cs_res,
+		 rcqe_sz == 128 ? MLX5_RES_SCAT_DATA64_CQE :
+				  MLX5_RES_SCAT_DATA32_CQE);
 }
 
 static void configure_requester_scat_cqe(struct mlx5_ib_dev *dev,
diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h
index 87b3198f4b5d..f4d4010b7e3e 100644
--- a/include/uapi/rdma/mlx5-abi.h
+++ b/include/uapi/rdma/mlx5-abi.h
@@ -238,6 +238,7 @@ enum mlx5_ib_query_dev_resp_flags {
 	MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_COMP = 1 << 0,
 	MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_PAD  = 1 << 1,
 	MLX5_IB_QUERY_DEV_RESP_PACKET_BASED_CREDIT_MODE = 1 << 2,
+	MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT = 1 << 3,
 };
 
 enum mlx5_ib_tunnel_offloads {
-- 
cgit v1.2.3-59-g8ed1b