From 36e7999dc19a1a6e76258020a49f7f7836018e46 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Mon, 3 Dec 2018 14:24:33 -0800 Subject: drm/v3d: Document cache flushing ABI. Right now, userspace doesn't do any L2T writes, but we should lay out our expectations for how it works. v2: Explicitly mention the VCD cache flushing requirements and that we'll flush the other caches before each of the CLs. Signed-off-by: Eric Anholt Link: https://patchwork.freedesktop.org/patch/msgid/20181203222438.25417-1-eric@anholt.net Reviewed-by: Dave Emett --- include/uapi/drm/v3d_drm.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/drm/v3d_drm.h b/include/uapi/drm/v3d_drm.h index 35c7d813c66e..ea70669d2138 100644 --- a/include/uapi/drm/v3d_drm.h +++ b/include/uapi/drm/v3d_drm.h @@ -52,6 +52,14 @@ extern "C" { * * This asks the kernel to have the GPU execute an optional binner * command list, and a render command list. + * + * The L1T, slice, L2C, L2T, and GCA caches will be flushed before + * each CL executes. The VCD cache should be flushed (if necessary) + * by the submitted CLs. The TLB writes are guaranteed to have been + * flushed by the time the render done IRQ happens, which is the + * trigger for out_sync. Any dirtying of cachelines by the job (only + * possible using TMU writes) must be flushed by the caller using the + * CL's cache flush commands. */ struct drm_v3d_submit_cl { /* Pointer to the binner command list. -- cgit v1.3-7-g2ca7 From 17abc9ec68b73ddeb262a507a62421016b9c54d5 Mon Sep 17 00:00:00 2001 From: Tomasz Duszynski Date: Fri, 14 Dec 2018 19:28:01 +0100 Subject: iio: add IIO_MASSCONCENTRATION channel type Measuring particulate matter in ug / m3 (micro-grams per cubic meter) is de facto standard. Existing air quality sensors usually follow this convention and are capable of returning measurements using this unit. IIO currently does not offer suitable channel type for this type of measurements hence this patch adds this. In addition, extra modifiers are introduced used for distinguishing between fine pm1, pm2p5 and coarse pm4, pm10 particle measurements, i.e IIO_MOD_PM1, IIO_MOD_PM25 and IIO_MOD_PM4, IIO_MOD_PM10. pmX consists of particles with aerodynamic diameter less or equal to X micrometers. Signed-off-by: Tomasz Duszynski Acked-by: Matt Ranostay Signed-off-by: Jonathan Cameron --- Documentation/ABI/testing/sysfs-bus-iio | 17 ++++++++++++++++- drivers/iio/industrialio-core.c | 5 +++++ include/uapi/linux/iio/types.h | 5 +++++ tools/iio/iio_event_monitor.c | 10 ++++++++++ 4 files changed, 36 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio index 8127a08e366d..67fd88bf7910 100644 --- a/Documentation/ABI/testing/sysfs-bus-iio +++ b/Documentation/ABI/testing/sysfs-bus-iio @@ -1684,4 +1684,19 @@ KernelVersion: 4.18 Contact: linux-iio@vger.kernel.org Description: Raw (unscaled) phase difference reading from channel Y - that can be processed to radians. \ No newline at end of file + that can be processed to radians. + +What: /sys/bus/iio/devices/iio:deviceX/in_massconcentration_pm1_input +What: /sys/bus/iio/devices/iio:deviceX/in_massconcentrationY_pm1_input +What: /sys/bus/iio/devices/iio:deviceX/in_massconcentration_pm2p5_input +What: /sys/bus/iio/devices/iio:deviceX/in_massconcentrationY_pm2p5_input +What: /sys/bus/iio/devices/iio:deviceX/in_massconcentration_pm4_input +What: /sys/bus/iio/devices/iio:deviceX/in_massconcentrationY_pm4_input +What: /sys/bus/iio/devices/iio:deviceX/in_massconcentration_pm10_input +What: /sys/bus/iio/devices/iio:deviceX/in_massconcentrationY_pm10_input +KernelVersion: 4.22 +Contact: linux-iio@vger.kernel.org +Description: + Mass concentration reading of particulate matter in ug / m3. + pmX consists of particles with aerodynamic diameter less or + equal to X micrometers. diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 4f5cd9f60870..4700fd5d8c90 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -87,6 +87,7 @@ static const char * const iio_chan_type_name_spec[] = { [IIO_GRAVITY] = "gravity", [IIO_POSITIONRELATIVE] = "positionrelative", [IIO_PHASE] = "phase", + [IIO_MASSCONCENTRATION] = "massconcentration", }; static const char * const iio_modifier_names[] = { @@ -127,6 +128,10 @@ static const char * const iio_modifier_names[] = { [IIO_MOD_Q] = "q", [IIO_MOD_CO2] = "co2", [IIO_MOD_VOC] = "voc", + [IIO_MOD_PM1] = "pm1", + [IIO_MOD_PM2P5] = "pm2p5", + [IIO_MOD_PM4] = "pm4", + [IIO_MOD_PM10] = "pm10", }; /* relies on pairs of these shared then separate */ diff --git a/include/uapi/linux/iio/types.h b/include/uapi/linux/iio/types.h index 92baabc103ac..c59adac24b1c 100644 --- a/include/uapi/linux/iio/types.h +++ b/include/uapi/linux/iio/types.h @@ -46,6 +46,7 @@ enum iio_chan_type { IIO_GRAVITY, IIO_POSITIONRELATIVE, IIO_PHASE, + IIO_MASSCONCENTRATION, }; enum iio_modifier { @@ -87,6 +88,10 @@ enum iio_modifier { IIO_MOD_VOC, IIO_MOD_LIGHT_UV, IIO_MOD_LIGHT_DUV, + IIO_MOD_PM1, + IIO_MOD_PM2P5, + IIO_MOD_PM4, + IIO_MOD_PM10, }; enum iio_event_type { diff --git a/tools/iio/iio_event_monitor.c b/tools/iio/iio_event_monitor.c index ac2de6b7e89f..f6b8003fbe3c 100644 --- a/tools/iio/iio_event_monitor.c +++ b/tools/iio/iio_event_monitor.c @@ -60,6 +60,7 @@ static const char * const iio_chan_type_name_spec[] = { [IIO_GRAVITY] = "gravity", [IIO_POSITIONRELATIVE] = "positionrelative", [IIO_PHASE] = "phase", + [IIO_MASSCONCENTRATION] = "massconcentration", }; static const char * const iio_ev_type_text[] = { @@ -115,6 +116,10 @@ static const char * const iio_modifier_names[] = { [IIO_MOD_Q] = "q", [IIO_MOD_CO2] = "co2", [IIO_MOD_VOC] = "voc", + [IIO_MOD_PM1] = "pm1", + [IIO_MOD_PM2P5] = "pm2p5", + [IIO_MOD_PM4] = "pm4", + [IIO_MOD_PM10] = "pm10", }; static bool event_is_known(struct iio_event_data *event) @@ -156,6 +161,7 @@ static bool event_is_known(struct iio_event_data *event) case IIO_GRAVITY: case IIO_POSITIONRELATIVE: case IIO_PHASE: + case IIO_MASSCONCENTRATION: break; default: return false; @@ -200,6 +206,10 @@ static bool event_is_known(struct iio_event_data *event) case IIO_MOD_Q: case IIO_MOD_CO2: case IIO_MOD_VOC: + case IIO_MOD_PM1: + case IIO_MOD_PM2P5: + case IIO_MOD_PM4: + case IIO_MOD_PM10: break; default: return false; -- cgit v1.3-7-g2ca7 From b170f7d48443d1ea3e4ffbf409025b5e5b1146fe Mon Sep 17 00:00:00 2001 From: Andreas Brauchli Date: Thu, 13 Dec 2018 15:43:22 +0100 Subject: iio: Add modifiers for ethanol and H2 gases Add ethanol and H2 gas modifiers: * IIO_MOD_ETHANOL * IIO_MOD_H2 Signed-off-by: Andreas Brauchli Acked-by: Matt Ranostay Signed-off-by: Jonathan Cameron --- Documentation/ABI/testing/sysfs-bus-iio | 4 ++++ include/uapi/linux/iio/types.h | 2 ++ tools/iio/iio_event_monitor.c | 4 ++++ 3 files changed, 10 insertions(+) (limited to 'include/uapi') diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio index 67fd88bf7910..864f8efd12e5 100644 --- a/Documentation/ABI/testing/sysfs-bus-iio +++ b/Documentation/ABI/testing/sysfs-bus-iio @@ -1554,6 +1554,10 @@ What: /sys/bus/iio/devices/iio:deviceX/in_concentration_raw What: /sys/bus/iio/devices/iio:deviceX/in_concentrationX_raw What: /sys/bus/iio/devices/iio:deviceX/in_concentration_co2_raw What: /sys/bus/iio/devices/iio:deviceX/in_concentrationX_co2_raw +What: /sys/bus/iio/devices/iio:deviceX/in_concentration_ethanol_raw +What: /sys/bus/iio/devices/iio:deviceX/in_concentrationX_ethanol_raw +What: /sys/bus/iio/devices/iio:deviceX/in_concentration_h2_raw +What: /sys/bus/iio/devices/iio:deviceX/in_concentrationX_h2_raw What: /sys/bus/iio/devices/iio:deviceX/in_concentration_voc_raw What: /sys/bus/iio/devices/iio:deviceX/in_concentrationX_voc_raw KernelVersion: 4.3 diff --git a/include/uapi/linux/iio/types.h b/include/uapi/linux/iio/types.h index c59adac24b1c..fdd81affca4b 100644 --- a/include/uapi/linux/iio/types.h +++ b/include/uapi/linux/iio/types.h @@ -92,6 +92,8 @@ enum iio_modifier { IIO_MOD_PM2P5, IIO_MOD_PM4, IIO_MOD_PM10, + IIO_MOD_ETHANOL, + IIO_MOD_H2, }; enum iio_event_type { diff --git a/tools/iio/iio_event_monitor.c b/tools/iio/iio_event_monitor.c index f6b8003fbe3c..7bf9bde28bcc 100644 --- a/tools/iio/iio_event_monitor.c +++ b/tools/iio/iio_event_monitor.c @@ -115,6 +115,8 @@ static const char * const iio_modifier_names[] = { [IIO_MOD_I] = "i", [IIO_MOD_Q] = "q", [IIO_MOD_CO2] = "co2", + [IIO_MOD_ETHANOL] = "ethanol", + [IIO_MOD_H2] = "h2", [IIO_MOD_VOC] = "voc", [IIO_MOD_PM1] = "pm1", [IIO_MOD_PM2P5] = "pm2p5", @@ -205,6 +207,8 @@ static bool event_is_known(struct iio_event_data *event) case IIO_MOD_I: case IIO_MOD_Q: case IIO_MOD_CO2: + case IIO_MOD_ETHANOL: + case IIO_MOD_H2: case IIO_MOD_VOC: case IIO_MOD_PM1: case IIO_MOD_PM2P5: -- cgit v1.3-7-g2ca7 From c2eb8effb265ac5cdd960d8e61ecb931e9c767cd Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Wed, 24 Oct 2018 06:50:34 -0400 Subject: media: videodev2.h: add v4l2_timeval_to_ns inline function We want to be able to uniquely identify buffers for stateless codecs. The internal timestamp (a u64) as stored internally in the kernel is a suitable candidate for that, but in struct v4l2_buffer it is represented as a struct timeval. Add a v4l2_timeval_to_ns() function that converts the struct timeval into a u64 in the same way that the kernel does. This makes it possible to use this u64 elsewhere as a unique identifier of the buffer. Since timestamps are also copied from the output buffer to the corresponding capture buffer(s) by M2M devices, the u64 can be used to refer to both output and capture buffers. The plan is that in the future we redesign struct v4l2_buffer and use u64 for the timestamp instead of a struct timeval (which has lots of problems with 32 vs 64 bit and y2038 layout changes), and then there is no more need to use this function. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/videodev2.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index b5671ce2724f..d6eed479c3a6 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -973,6 +973,18 @@ struct v4l2_buffer { }; }; +/** + * v4l2_timeval_to_ns - Convert timeval to nanoseconds + * @ts: pointer to the timeval variable to be converted + * + * Returns the scalar nanosecond representation of the timeval + * parameter. + */ +static inline __u64 v4l2_timeval_to_ns(const struct timeval *tv) +{ + return (__u64)tv->tv_sec * 1000000000ULL + tv->tv_usec * 1000; +} + /* Flags for 'flags' field */ /* Buffer is mapped (flag) */ #define V4L2_BUF_FLAG_MAPPED 0x00000001 -- cgit v1.3-7-g2ca7 From 3affaa5a7ca3ca114e01049bc45e3ed4a3955505 Mon Sep 17 00:00:00 2001 From: Brian Starkey Date: Mon, 3 Dec 2018 11:31:57 +0000 Subject: drm/afbc: Add AFBC modifier usage documentation AFBC is a flexible, proprietary, lossless compression protocol and format, with a number of defined DRM format modifiers. To facilitate consistency and compatibility between different AFBC producers and consumers, document the expectations for usage of the AFBC DRM format modifiers in a new .rst chapter. Signed-off-by: Brian Starkey Reviewed-by: Liviu Dudau [Updated MAINTAINERS entry to show that "Mali DP Maintainers" is actually a mailing list and added an SPDX-License-Identifier to the documentation] Signed-off-by: Liviu Dudau --- Documentation/gpu/afbc.rst | 235 ++++++++++++++++++++++++++++++++++++++++++ Documentation/gpu/drivers.rst | 1 + MAINTAINERS | 3 +- include/uapi/drm/drm_fourcc.h | 3 + 4 files changed, 241 insertions(+), 1 deletion(-) create mode 100644 Documentation/gpu/afbc.rst (limited to 'include/uapi') diff --git a/Documentation/gpu/afbc.rst b/Documentation/gpu/afbc.rst new file mode 100644 index 000000000000..4d38dc49d105 --- /dev/null +++ b/Documentation/gpu/afbc.rst @@ -0,0 +1,235 @@ +.. SPDX-License-Identifier: GPL-2.0+ + +=================================== + Arm Framebuffer Compression (AFBC) +=================================== + +AFBC is a proprietary lossless image compression protocol and format. +It provides fine-grained random access and minimizes the amount of +data transferred between IP blocks. + +AFBC can be enabled on drivers which support it via use of the AFBC +format modifiers defined in drm_fourcc.h. See DRM_FORMAT_MOD_ARM_AFBC(*). + +All users of the AFBC modifiers must follow the usage guidelines laid +out in this document, to ensure compatibility across different AFBC +producers and consumers. + +Components and Ordering +======================= + +AFBC streams can contain several components - where a component +corresponds to a color channel (i.e. R, G, B, X, A, Y, Cb, Cr). +The assignment of input/output color channels must be consistent +between the encoder and the decoder for correct operation, otherwise +the consumer will interpret the decoded data incorrectly. + +Furthermore, when the lossless colorspace transform is used +(AFBC_FORMAT_MOD_YTR, which should be enabled for RGB buffers for +maximum compression efficiency), the component order must be: + + * Component 0: R + * Component 1: G + * Component 2: B + +The component ordering is communicated via the fourcc code in the +fourcc:modifier pair. In general, component '0' is considered to +reside in the least-significant bits of the corresponding linear +format. For example, COMP(bits): + + * DRM_FORMAT_ABGR8888 + + * Component 0: R(8) + * Component 1: G(8) + * Component 2: B(8) + * Component 3: A(8) + + * DRM_FORMAT_BGR888 + + * Component 0: R(8) + * Component 1: G(8) + * Component 2: B(8) + + * DRM_FORMAT_YUYV + + * Component 0: Y(8) + * Component 1: Cb(8, 2x1 subsampled) + * Component 2: Cr(8, 2x1 subsampled) + +In AFBC, 'X' components are not treated any differently from any other +component. Therefore, an AFBC buffer with fourcc DRM_FORMAT_XBGR8888 +encodes with 4 components, like so: + + * DRM_FORMAT_XBGR8888 + + * Component 0: R(8) + * Component 1: G(8) + * Component 2: B(8) + * Component 3: X(8) + +Please note, however, that the inclusion of a "wasted" 'X' channel is +bad for compression efficiency, and so it's recommended to avoid +formats containing 'X' bits. If a fourth component is +required/expected by the encoder/decoder, then it is recommended to +instead use an equivalent format with alpha, setting all alpha bits to +'1'. If there is no requirement for a fourth component, then a format +which doesn't include alpha can be used, e.g. DRM_FORMAT_BGR888. + +Number of Planes +================ + +Formats which are typically multi-planar in linear layouts (e.g. YUV +420), can be encoded into one, or multiple, AFBC planes. As with +component order, the encoder and decoder must agree about the number +of planes in order to correctly decode the buffer. The fourcc code is +used to determine the number of encoded planes in an AFBC buffer, +matching the number of planes for the linear (unmodified) format. +Within each plane, the component ordering also follows the fourcc +code: + +For example: + + * DRM_FORMAT_YUYV: nplanes = 1 + + * Plane 0: + + * Component 0: Y(8) + * Component 1: Cb(8, 2x1 subsampled) + * Component 2: Cr(8, 2x1 subsampled) + + * DRM_FORMAT_NV12: nplanes = 2 + + * Plane 0: + + * Component 0: Y(8) + + * Plane 1: + + * Component 0: Cb(8, 2x1 subsampled) + * Component 1: Cr(8, 2x1 subsampled) + +Cross-device interoperability +============================= + +For maximum compatibility across devices, the table below defines +canonical formats for use between AFBC-enabled devices. Formats which +are listed here must be used exactly as specified when using the AFBC +modifiers. Formats which are not listed should be avoided. + +.. flat-table:: AFBC formats + + * - Fourcc code + - Description + - Planes/Components + + * - DRM_FORMAT_ABGR2101010 + - 10-bit per component RGB, with 2-bit alpha + - Plane 0: 4 components + * Component 0: R(10) + * Component 1: G(10) + * Component 2: B(10) + * Component 3: A(2) + + * - DRM_FORMAT_ABGR8888 + - 8-bit per component RGB, with 8-bit alpha + - Plane 0: 4 components + * Component 0: R(8) + * Component 1: G(8) + * Component 2: B(8) + * Component 3: A(8) + + * - DRM_FORMAT_BGR888 + - 8-bit per component RGB + - Plane 0: 3 components + * Component 0: R(8) + * Component 1: G(8) + * Component 2: B(8) + + * - DRM_FORMAT_BGR565 + - 5/6-bit per component RGB + - Plane 0: 3 components + * Component 0: R(5) + * Component 1: G(6) + * Component 2: B(5) + + * - DRM_FORMAT_ABGR1555 + - 5-bit per component RGB, with 1-bit alpha + - Plane 0: 4 components + * Component 0: R(5) + * Component 1: G(5) + * Component 2: B(5) + * Component 3: A(1) + + * - DRM_FORMAT_VUY888 + - 8-bit per component YCbCr 444, single plane + - Plane 0: 3 components + * Component 0: Y(8) + * Component 1: Cb(8) + * Component 2: Cr(8) + + * - DRM_FORMAT_VUY101010 + - 10-bit per component YCbCr 444, single plane + - Plane 0: 3 components + * Component 0: Y(10) + * Component 1: Cb(10) + * Component 2: Cr(10) + + * - DRM_FORMAT_YUYV + - 8-bit per component YCbCr 422, single plane + - Plane 0: 3 components + * Component 0: Y(8) + * Component 1: Cb(8, 2x1 subsampled) + * Component 2: Cr(8, 2x1 subsampled) + + * - DRM_FORMAT_NV16 + - 8-bit per component YCbCr 422, two plane + - Plane 0: 1 component + * Component 0: Y(8) + Plane 1: 2 components + * Component 0: Cb(8, 2x1 subsampled) + * Component 1: Cr(8, 2x1 subsampled) + + * - DRM_FORMAT_Y210 + - 10-bit per component YCbCr 422, single plane + - Plane 0: 3 components + * Component 0: Y(10) + * Component 1: Cb(10, 2x1 subsampled) + * Component 2: Cr(10, 2x1 subsampled) + + * - DRM_FORMAT_P210 + - 10-bit per component YCbCr 422, two plane + - Plane 0: 1 component + * Component 0: Y(10) + Plane 1: 2 components + * Component 0: Cb(10, 2x1 subsampled) + * Component 1: Cr(10, 2x1 subsampled) + + * - DRM_FORMAT_YUV420_8BIT + - 8-bit per component YCbCr 420, single plane + - Plane 0: 3 components + * Component 0: Y(8) + * Component 1: Cb(8, 2x2 subsampled) + * Component 2: Cr(8, 2x2 subsampled) + + * - DRM_FORMAT_YUV420_10BIT + - 10-bit per component YCbCr 420, single plane + - Plane 0: 3 components + * Component 0: Y(10) + * Component 1: Cb(10, 2x2 subsampled) + * Component 2: Cr(10, 2x2 subsampled) + + * - DRM_FORMAT_NV12 + - 8-bit per component YCbCr 420, two plane + - Plane 0: 1 component + * Component 0: Y(8) + Plane 1: 2 components + * Component 0: Cb(8, 2x2 subsampled) + * Component 1: Cr(8, 2x2 subsampled) + + * - DRM_FORMAT_P010 + - 10-bit per component YCbCr 420, two plane + - Plane 0: 1 component + * Component 0: Y(10) + Plane 1: 2 components + * Component 0: Cb(10, 2x2 subsampled) + * Component 1: Cr(10, 2x2 subsampled) diff --git a/Documentation/gpu/drivers.rst b/Documentation/gpu/drivers.rst index 7c1672118a73..c176b34ae9c8 100644 --- a/Documentation/gpu/drivers.rst +++ b/Documentation/gpu/drivers.rst @@ -17,6 +17,7 @@ GPU Driver Documentation vkms bridge/dw-hdmi xen-front + afbc .. only:: subproject and html diff --git a/MAINTAINERS b/MAINTAINERS index 32d444476a90..e79eca22532d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1136,10 +1136,11 @@ F: Documentation/devicetree/bindings/display/arm,hdlcd.txt ARM MALI-DP DRM DRIVER M: Liviu Dudau M: Brian Starkey -M: Mali DP Maintainers +L: Mali DP Maintainers S: Supported F: drivers/gpu/drm/arm/ F: Documentation/devicetree/bindings/display/arm,malidp.txt +F: Documentation/gpu/afbc.rst ARM MFM AND FLOPPY DRIVERS M: Ian Molton diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h index 0b44260a5ee9..df014ede4e57 100644 --- a/include/uapi/drm/drm_fourcc.h +++ b/include/uapi/drm/drm_fourcc.h @@ -572,6 +572,9 @@ extern "C" { * AFBC has several features which may be supported and/or used, which are * represented using bits in the modifier. Not all combinations are valid, * and different devices or use-cases may support different combinations. + * + * Further information on the use of AFBC modifiers can be found in + * Documentation/gpu/afbc.rst */ #define DRM_FORMAT_MOD_ARM_AFBC(__afbc_mode) fourcc_mod_code(ARM, __afbc_mode) -- cgit v1.3-7-g2ca7 From 2db8ebca1f6c158d0d439ebec4d4b9924c4c3d3c Mon Sep 17 00:00:00 2001 From: Matteo Franchin Date: Wed, 9 Jan 2019 15:58:37 +0000 Subject: drm/fourcc: Add modifier defininitions for AFBC 1.3 This commit adds definitions of format modifiers for version 1.3 of the Arm Framebuffer Compression (AFBC). Signed-off-by: Matteo Franchin Signed-off-by: Ayan Kumar Halder Reviewed-by: Brian Starkey Link: https://patchwork.freedesktop.org/patch/277333/ --- include/uapi/drm/drm_fourcc.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h index 0b44260a5ee9..41106c835747 100644 --- a/include/uapi/drm/drm_fourcc.h +++ b/include/uapi/drm/drm_fourcc.h @@ -581,10 +581,18 @@ extern "C" { * Indicates the superblock size(s) used for the AFBC buffer. The buffer * size (in pixels) must be aligned to a multiple of the superblock size. * Four lowest significant bits(LSBs) are reserved for block size. + * + * Where one superblock size is specified, it applies to all planes of the + * buffer (e.g. 16x16, 32x8). When multiple superblock sizes are specified, + * the first applies to the Luma plane and the second applies to the Chroma + * plane(s). e.g. (32x8_64x4 means 32x8 Luma, with 64x4 Chroma). + * Multiple superblock sizes are only valid for multi-plane YCbCr formats. */ #define AFBC_FORMAT_MOD_BLOCK_SIZE_MASK 0xf #define AFBC_FORMAT_MOD_BLOCK_SIZE_16x16 (1ULL) #define AFBC_FORMAT_MOD_BLOCK_SIZE_32x8 (2ULL) +#define AFBC_FORMAT_MOD_BLOCK_SIZE_64x4 (3ULL) +#define AFBC_FORMAT_MOD_BLOCK_SIZE_32x8_64x4 (4ULL) /* * AFBC lossless colorspace transform @@ -644,6 +652,21 @@ extern "C" { */ #define AFBC_FORMAT_MOD_SC (1ULL << 9) +/* + * AFBC double-buffer + * + * Indicates that the buffer is allocated in a layout safe for front-buffer + * rendering. + */ +#define AFBC_FORMAT_MOD_DB (1ULL << 10) + +/* + * AFBC buffer content hints + * + * Indicates that the buffer includes per-superblock content hints. + */ +#define AFBC_FORMAT_MOD_BCH (1ULL << 11) + #if defined(__cplusplus) } #endif -- cgit v1.3-7-g2ca7 From 4b837c6d7ee771f68a30f362c9f68171a95be222 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 14 Jan 2019 09:01:54 -0500 Subject: media: v4l: uAPI: V4L2_BUF_TYPE_META_OUTPUT is an output buffer type V4L2_BUF_TYPE_META_OUTPUT was added by commit 72148d1a57e7 ("media: v4l: Add support for V4L2_BUF_TYPE_META_OUTPUT") but the patch missed adding the type to the macro telling whether a given type is an output type or not. Do that now. Getting this wrong leads to handling the buffer as a capture buffer in a lot of places. Fixes: 72148d1a57e7 ("media: v4l: Add support for V4L2_BUF_TYPE_META_OUTPUT") Signed-off-by: Sakari Ailus Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/videodev2.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index d6eed479c3a6..c0c36c165bf4 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -161,7 +161,8 @@ enum v4l2_buf_type { || (type) == V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY \ || (type) == V4L2_BUF_TYPE_VBI_OUTPUT \ || (type) == V4L2_BUF_TYPE_SLICED_VBI_OUTPUT \ - || (type) == V4L2_BUF_TYPE_SDR_OUTPUT) + || (type) == V4L2_BUF_TYPE_SDR_OUTPUT \ + || (type) == V4L2_BUF_TYPE_META_OUTPUT) enum v4l2_tuner_type { V4L2_TUNER_RADIO = 1, -- cgit v1.3-7-g2ca7 From 50656bad786d001b294764e9f047c5d5b3e4db75 Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Thu, 10 Jan 2019 11:56:09 -0500 Subject: media: v4l2-ctrl: Add control to enable h.264 constrained intra prediction Allow to enable h.264 constrained intra prediction (macroblocks using intra prediction modes are not allowed to use residual data and decoded samples of neighboring macroblocks coded using inter prediction modes). This control directly corresponds to the constrained_intra_pred_flag field in the h.264 picture parameter set. Signed-off-by: Philipp Zabel Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/uapi/v4l/extended-controls.rst | 4 ++++ drivers/media/v4l2-core/v4l2-ctrls.c | 2 ++ include/uapi/linux/v4l2-controls.h | 1 + 3 files changed, 7 insertions(+) (limited to 'include/uapi') diff --git a/Documentation/media/uapi/v4l/extended-controls.rst b/Documentation/media/uapi/v4l/extended-controls.rst index af4273aa5e85..235d0c293983 100644 --- a/Documentation/media/uapi/v4l/extended-controls.rst +++ b/Documentation/media/uapi/v4l/extended-controls.rst @@ -1154,6 +1154,10 @@ enum v4l2_mpeg_video_h264_entropy_mode - ``V4L2_CID_MPEG_VIDEO_H264_8X8_TRANSFORM (boolean)`` Enable 8X8 transform for H264. Applicable to the H264 encoder. +``V4L2_CID_MPEG_VIDEO_H264_CONSTRAINED_INTRA_PREDICTION (boolean)`` + Enable constrained intra prediction for H264. Applicable to the H264 + encoder. + ``V4L2_CID_MPEG_VIDEO_CYCLIC_INTRA_REFRESH_MB (integer)`` Cyclic intra macroblock refresh. This is the number of continuous macroblocks refreshed every frame. Each frame a successive set of diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c index 8c47d8f00429..50b56185c02e 100644 --- a/drivers/media/v4l2-core/v4l2-ctrls.c +++ b/drivers/media/v4l2-core/v4l2-ctrls.c @@ -825,6 +825,8 @@ const char *v4l2_ctrl_get_name(u32 id) case V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_LAYER:return "H264 Number of HC Layers"; case V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_LAYER_QP: return "H264 Set QP Value for HC Layers"; + case V4L2_CID_MPEG_VIDEO_H264_CONSTRAINED_INTRA_PREDICTION: + return "H264 Constrained Intra Pred"; case V4L2_CID_MPEG_VIDEO_MPEG4_I_FRAME_QP: return "MPEG4 I-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_MPEG4_P_FRAME_QP: return "MPEG4 P-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_MPEG4_B_FRAME_QP: return "MPEG4 B-Frame QP Value"; diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index 3dcfc6148f99..fd65c710b144 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -533,6 +533,7 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type { }; #define V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_LAYER (V4L2_CID_MPEG_BASE+381) #define V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_LAYER_QP (V4L2_CID_MPEG_BASE+382) +#define V4L2_CID_MPEG_VIDEO_H264_CONSTRAINED_INTRA_PREDICTION (V4L2_CID_MPEG_BASE+383) #define V4L2_CID_MPEG_VIDEO_MPEG4_I_FRAME_QP (V4L2_CID_MPEG_BASE+400) #define V4L2_CID_MPEG_VIDEO_MPEG4_P_FRAME_QP (V4L2_CID_MPEG_BASE+401) #define V4L2_CID_MPEG_VIDEO_MPEG4_B_FRAME_QP (V4L2_CID_MPEG_BASE+402) -- cgit v1.3-7-g2ca7 From d034696cbe5a6e00f76ca4b7869c6cdef66aebd5 Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Thu, 10 Jan 2019 11:56:10 -0500 Subject: media: v4l2-ctrl: Add control for h.264 chroma qp offset Allow to add fixed quantization parameter offset between luma and chroma quantization parameters. This control directly corresponds to the chroma_qp_index_offset field of the h.264 picture parameter set. Signed-off-by: Philipp Zabel Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/uapi/v4l/extended-controls.rst | 5 +++++ drivers/media/v4l2-core/v4l2-ctrls.c | 1 + include/uapi/linux/v4l2-controls.h | 1 + 3 files changed, 7 insertions(+) (limited to 'include/uapi') diff --git a/Documentation/media/uapi/v4l/extended-controls.rst b/Documentation/media/uapi/v4l/extended-controls.rst index 235d0c293983..00934efdc9e4 100644 --- a/Documentation/media/uapi/v4l/extended-controls.rst +++ b/Documentation/media/uapi/v4l/extended-controls.rst @@ -1158,6 +1158,11 @@ enum v4l2_mpeg_video_h264_entropy_mode - Enable constrained intra prediction for H264. Applicable to the H264 encoder. +``V4L2_CID_MPEG_VIDEO_H264_CHROMA_QP_INDEX_OFFSET (integer)`` + Specify the offset that should be added to the luma quantization + parameter to determine the chroma quantization parameter. Applicable + to the H264 encoder. + ``V4L2_CID_MPEG_VIDEO_CYCLIC_INTRA_REFRESH_MB (integer)`` Cyclic intra macroblock refresh. This is the number of continuous macroblocks refreshed every frame. Each frame a successive set of diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c index 50b56185c02e..99308dac2daa 100644 --- a/drivers/media/v4l2-core/v4l2-ctrls.c +++ b/drivers/media/v4l2-core/v4l2-ctrls.c @@ -827,6 +827,7 @@ const char *v4l2_ctrl_get_name(u32 id) return "H264 Set QP Value for HC Layers"; case V4L2_CID_MPEG_VIDEO_H264_CONSTRAINED_INTRA_PREDICTION: return "H264 Constrained Intra Pred"; + case V4L2_CID_MPEG_VIDEO_H264_CHROMA_QP_INDEX_OFFSET: return "H264 Chroma QP Index Offset"; case V4L2_CID_MPEG_VIDEO_MPEG4_I_FRAME_QP: return "MPEG4 I-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_MPEG4_P_FRAME_QP: return "MPEG4 P-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_MPEG4_B_FRAME_QP: return "MPEG4 B-Frame QP Value"; diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index fd65c710b144..06479f2fb3ae 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -534,6 +534,7 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type { #define V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_LAYER (V4L2_CID_MPEG_BASE+381) #define V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_LAYER_QP (V4L2_CID_MPEG_BASE+382) #define V4L2_CID_MPEG_VIDEO_H264_CONSTRAINED_INTRA_PREDICTION (V4L2_CID_MPEG_BASE+383) +#define V4L2_CID_MPEG_VIDEO_H264_CHROMA_QP_INDEX_OFFSET (V4L2_CID_MPEG_BASE+384) #define V4L2_CID_MPEG_VIDEO_MPEG4_I_FRAME_QP (V4L2_CID_MPEG_BASE+400) #define V4L2_CID_MPEG_VIDEO_MPEG4_P_FRAME_QP (V4L2_CID_MPEG_BASE+401) #define V4L2_CID_MPEG_VIDEO_MPEG4_B_FRAME_QP (V4L2_CID_MPEG_BASE+402) -- cgit v1.3-7-g2ca7 From 1c3721b1f22286033abeda30b7e12439b083ed0f Mon Sep 17 00:00:00 2001 From: Steve Longerbeam Date: Wed, 9 Jan 2019 13:30:04 -0500 Subject: media: videodev2.h: Add more field helper macros Adds two helper macros: V4L2_FIELD_IS_SEQUENTIAL: returns true if the given field type is 'sequential', that is a full frame is transmitted, or exists in memory, as all top field lines followed by all bottom field lines, or vice-versa. V4L2_FIELD_IS_INTERLACED: returns true if the given field type is 'interlaced', that is a full frame is transmitted, or exists in memory, as top field lines interlaced with bottom field lines. Signed-off-by: Steve Longerbeam Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/videodev2.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index c0c36c165bf4..9a920f071ff9 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -130,6 +130,13 @@ enum v4l2_field { ((field) == V4L2_FIELD_BOTTOM ||\ (field) == V4L2_FIELD_TOP ||\ (field) == V4L2_FIELD_ALTERNATE) +#define V4L2_FIELD_IS_INTERLACED(field) \ + ((field) == V4L2_FIELD_INTERLACED ||\ + (field) == V4L2_FIELD_INTERLACED_TB ||\ + (field) == V4L2_FIELD_INTERLACED_BT) +#define V4L2_FIELD_IS_SEQUENTIAL(field) \ + ((field) == V4L2_FIELD_SEQ_TB ||\ + (field) == V4L2_FIELD_SEQ_BT) enum v4l2_buf_type { V4L2_BUF_TYPE_VIDEO_CAPTURE = 1, -- cgit v1.3-7-g2ca7 From 08cba016cdbe4f6bf0200366194a768cfef58edd Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Fri, 18 Jan 2019 15:51:21 +0100 Subject: drm/fourcc: Add definitions for Allwinner vendor and VPU tiled format This introduces specific definitions for vendor Allwinner and its associated tiled format modifier. This modifier is used for the output format of the VPU, that can be imported directly with the display engine hardware supported by the sun4i-drm driver. Signed-off-by: Paul Kocialkowski Reviewed-by: Maxime Ripard Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20190118145133.21281-12-paul.kocialkowski@bootlin.com --- include/uapi/drm/drm_fourcc.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h index 41106c835747..91d08a23f125 100644 --- a/include/uapi/drm/drm_fourcc.h +++ b/include/uapi/drm/drm_fourcc.h @@ -238,6 +238,8 @@ extern "C" { #define DRM_FORMAT_MOD_VENDOR_VIVANTE 0x06 #define DRM_FORMAT_MOD_VENDOR_BROADCOM 0x07 #define DRM_FORMAT_MOD_VENDOR_ARM 0x08 +#define DRM_FORMAT_MOD_VENDOR_ALLWINNER 0x09 + /* add more to the end as needed */ #define DRM_FORMAT_RESERVED ((1ULL << 56) - 1) @@ -667,6 +669,20 @@ extern "C" { */ #define AFBC_FORMAT_MOD_BCH (1ULL << 11) +/* + * Allwinner tiled modifier + * + * This tiling mode is implemented by the VPU found on all Allwinner platforms, + * codenamed sunxi. It is associated with a YUV format that uses either 2 or 3 + * planes. + * + * With this tiling, the luminance samples are disposed in tiles representing + * 32x32 pixels and the chrominance samples in tiles representing 32x64 pixels. + * The pixel order in each tile is linear and the tiles are disposed linearly, + * both in row-major order. + */ +#define DRM_FORMAT_MOD_ALLWINNER_TILED fourcc_mod_code(ALLWINNER, 1) + #if defined(__cplusplus) } #endif -- cgit v1.3-7-g2ca7 From ad07c8ceb6631a83b62d405a61448bba92adac68 Mon Sep 17 00:00:00 2001 From: Andrew Murray Date: Thu, 10 Jan 2019 13:53:34 +0000 Subject: perf/core: Remove unused perf_flags Now that perf_flags is not used we remove it. Signed-off-by: Andrew Murray Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Ivan Kokshaysky Cc: Linus Torvalds Cc: Mark Rutland Cc: Matt Turner Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Richard Henderson Cc: Russell King Cc: Sascha Hauer Cc: Shawn Guo Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linuxppc-dev@lists.ozlabs.org Cc: robin.murphy@arm.com Cc: suzuki.poulose@arm.com Link: https://lkml.kernel.org/r/1547128414-50693-13-git-send-email-andrew.murray@arm.com Signed-off-by: Ingo Molnar --- include/uapi/linux/perf_event.h | 2 -- tools/include/uapi/linux/perf_event.h | 2 -- 2 files changed, 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 9de8780ac8d9..ea19b5d491bf 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -445,8 +445,6 @@ struct perf_event_query_bpf { __u32 ids[0]; }; -#define perf_flags(attr) (*(&(attr)->read_format + 1)) - /* * Ioctls that can be done on a perf event fd: */ diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 9de8780ac8d9..ea19b5d491bf 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -445,8 +445,6 @@ struct perf_event_query_bpf { __u32 ids[0]; }; -#define perf_flags(attr) (*(&(attr)->read_format + 1)) - /* * Ioctls that can be done on a perf event fd: */ -- cgit v1.3-7-g2ca7 From 76193a94522f1d4edf2447a536f3f796ce56343b Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 17 Jan 2019 08:15:13 -0800 Subject: perf, bpf: Introduce PERF_RECORD_KSYMBOL For better performance analysis of dynamically JITed and loaded kernel functions, such as BPF programs, this patch introduces PERF_RECORD_KSYMBOL, a new perf_event_type that exposes kernel symbol register/unregister information to user space. The following data structure is used for PERF_RECORD_KSYMBOL. /* * struct { * struct perf_event_header header; * u64 addr; * u32 len; * u16 ksym_type; * u16 flags; * char name[]; * struct sample_id sample_id; * }; */ Signed-off-by: Song Liu Reviewed-by: Arnaldo Carvalho de Melo Tested-by: Arnaldo Carvalho de Melo Acked-by: Peter Zijlstra Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Peter Zijlstra Cc: kernel-team@fb.com Cc: netdev@vger.kernel.org Link: http://lkml.kernel.org/r/20190117161521.1341602-2-songliubraving@fb.com Signed-off-by: Arnaldo Carvalho de Melo --- include/linux/perf_event.h | 8 ++++ include/uapi/linux/perf_event.h | 26 ++++++++++- kernel/events/core.c | 98 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 130 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 4eb88065a9b5..136fe0495374 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1122,6 +1122,10 @@ static inline void perf_event_task_sched_out(struct task_struct *prev, } extern void perf_event_mmap(struct vm_area_struct *vma); + +extern void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, + bool unregister, const char *sym); + extern struct perf_guest_info_callbacks *perf_guest_cbs; extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); @@ -1342,6 +1346,10 @@ static inline int perf_unregister_guest_info_callbacks (struct perf_guest_info_callbacks *callbacks) { return 0; } static inline void perf_event_mmap(struct vm_area_struct *vma) { } + +typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data); +static inline void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, + bool unregister, const char *sym) { } static inline void perf_event_exec(void) { } static inline void perf_event_comm(struct task_struct *tsk, bool exec) { } static inline void perf_event_namespaces(struct task_struct *tsk) { } diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index ea19b5d491bf..1dee5c8f166b 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -372,7 +372,8 @@ struct perf_event_attr { context_switch : 1, /* context switch data */ write_backward : 1, /* Write ring buffer from end to beginning */ namespaces : 1, /* include namespaces data */ - __reserved_1 : 35; + ksymbol : 1, /* include ksymbol events */ + __reserved_1 : 34; union { __u32 wakeup_events; /* wakeup every n events */ @@ -963,9 +964,32 @@ enum perf_event_type { */ PERF_RECORD_NAMESPACES = 16, + /* + * Record ksymbol register/unregister events: + * + * struct { + * struct perf_event_header header; + * u64 addr; + * u32 len; + * u16 ksym_type; + * u16 flags; + * char name[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_KSYMBOL = 17, + PERF_RECORD_MAX, /* non-ABI */ }; +enum perf_record_ksymbol_type { + PERF_RECORD_KSYMBOL_TYPE_UNKNOWN = 0, + PERF_RECORD_KSYMBOL_TYPE_BPF = 1, + PERF_RECORD_KSYMBOL_TYPE_MAX /* non-ABI */ +}; + +#define PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER (1 << 0) + #define PERF_MAX_STACK_DEPTH 127 #define PERF_MAX_CONTEXTS_PER_STACK 8 diff --git a/kernel/events/core.c b/kernel/events/core.c index bc525cd1615c..e04ab5f325cf 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -385,6 +385,7 @@ static atomic_t nr_namespaces_events __read_mostly; static atomic_t nr_task_events __read_mostly; static atomic_t nr_freq_events __read_mostly; static atomic_t nr_switch_events __read_mostly; +static atomic_t nr_ksymbol_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -4235,7 +4236,7 @@ static bool is_sb_event(struct perf_event *event) if (attr->mmap || attr->mmap_data || attr->mmap2 || attr->comm || attr->comm_exec || - attr->task || + attr->task || attr->ksymbol || attr->context_switch) return true; return false; @@ -4305,6 +4306,8 @@ static void unaccount_event(struct perf_event *event) dec = true; if (has_branch_stack(event)) dec = true; + if (event->attr.ksymbol) + atomic_dec(&nr_ksymbol_events); if (dec) { if (!atomic_add_unless(&perf_sched_count, -1, 1)) @@ -7653,6 +7656,97 @@ static void perf_log_throttle(struct perf_event *event, int enable) perf_output_end(&handle); } +/* + * ksymbol register/unregister tracking + */ + +struct perf_ksymbol_event { + const char *name; + int name_len; + struct { + struct perf_event_header header; + u64 addr; + u32 len; + u16 ksym_type; + u16 flags; + } event_id; +}; + +static int perf_event_ksymbol_match(struct perf_event *event) +{ + return event->attr.ksymbol; +} + +static void perf_event_ksymbol_output(struct perf_event *event, void *data) +{ + struct perf_ksymbol_event *ksymbol_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret; + + if (!perf_event_ksymbol_match(event)) + return; + + perf_event_header__init_id(&ksymbol_event->event_id.header, + &sample, event); + ret = perf_output_begin(&handle, event, + ksymbol_event->event_id.header.size); + if (ret) + return; + + perf_output_put(&handle, ksymbol_event->event_id); + __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len); + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, + const char *sym) +{ + struct perf_ksymbol_event ksymbol_event; + char name[KSYM_NAME_LEN]; + u16 flags = 0; + int name_len; + + if (!atomic_read(&nr_ksymbol_events)) + return; + + if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX || + ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN) + goto err; + + strlcpy(name, sym, KSYM_NAME_LEN); + name_len = strlen(name) + 1; + while (!IS_ALIGNED(name_len, sizeof(u64))) + name[name_len++] = '\0'; + BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64)); + + if (unregister) + flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER; + + ksymbol_event = (struct perf_ksymbol_event){ + .name = name, + .name_len = name_len, + .event_id = { + .header = { + .type = PERF_RECORD_KSYMBOL, + .size = sizeof(ksymbol_event.event_id) + + name_len, + }, + .addr = addr, + .len = len, + .ksym_type = ksym_type, + .flags = flags, + }, + }; + + perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL); + return; +err: + WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type); +} + void perf_event_itrace_started(struct perf_event *event) { event->attach_state |= PERF_ATTACH_ITRACE; @@ -9912,6 +10006,8 @@ static void account_event(struct perf_event *event) inc = true; if (is_cgroup_event(event)) inc = true; + if (event->attr.ksymbol) + atomic_inc(&nr_ksymbol_events); if (inc) { /* -- cgit v1.3-7-g2ca7 From 6ee52e2a3fe4ea35520720736e6791df1fb67106 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 17 Jan 2019 08:15:15 -0800 Subject: perf, bpf: Introduce PERF_RECORD_BPF_EVENT For better performance analysis of BPF programs, this patch introduces PERF_RECORD_BPF_EVENT, a new perf_event_type that exposes BPF program load/unload information to user space. Each BPF program may contain up to BPF_MAX_SUBPROGS (256) sub programs. The following example shows kernel symbols for a BPF program with 7 sub programs: ffffffffa0257cf9 t bpf_prog_b07ccb89267cf242_F ffffffffa02592e1 t bpf_prog_2dcecc18072623fc_F ffffffffa025b0e9 t bpf_prog_bb7a405ebaec5d5c_F ffffffffa025dd2c t bpf_prog_a7540d4a39ec1fc7_F ffffffffa025fcca t bpf_prog_05762d4ade0e3737_F ffffffffa026108f t bpf_prog_db4bd11e35df90d4_F ffffffffa0263f00 t bpf_prog_89d64e4abf0f0126_F ffffffffa0257cf9 t bpf_prog_ae31629322c4b018__dummy_tracepoi When a bpf program is loaded, PERF_RECORD_KSYMBOL is generated for each of these sub programs. Therefore, PERF_RECORD_BPF_EVENT is not needed for simple profiling. For annotation, user space need to listen to PERF_RECORD_BPF_EVENT and gather more information about these (sub) programs via sys_bpf. Signed-off-by: Song Liu Reviewed-by: Arnaldo Carvalho de Melo Acked-by: Alexei Starovoitov Acked-by: Peter Zijlstra (Intel) Tested-by: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: Peter Zijlstra Cc: kernel-team@fb.com Cc: netdev@vger.kernel.org Link: http://lkml.kernel.org/r/20190117161521.1341602-4-songliubraving@fb.com Signed-off-by: Arnaldo Carvalho de Melo --- include/linux/filter.h | 7 +++ include/linux/perf_event.h | 6 +++ include/uapi/linux/perf_event.h | 29 +++++++++- kernel/bpf/core.c | 2 +- kernel/bpf/syscall.c | 2 + kernel/events/core.c | 115 ++++++++++++++++++++++++++++++++++++++++ 6 files changed, 159 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/filter.h b/include/linux/filter.h index ad106d845b22..d531d4250bff 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -951,6 +951,7 @@ bpf_address_lookup(unsigned long addr, unsigned long *size, void bpf_prog_kallsyms_add(struct bpf_prog *fp); void bpf_prog_kallsyms_del(struct bpf_prog *fp); +void bpf_get_prog_name(const struct bpf_prog *prog, char *sym); #else /* CONFIG_BPF_JIT */ @@ -1006,6 +1007,12 @@ static inline void bpf_prog_kallsyms_add(struct bpf_prog *fp) static inline void bpf_prog_kallsyms_del(struct bpf_prog *fp) { } + +static inline void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) +{ + sym[0] = '\0'; +} + #endif /* CONFIG_BPF_JIT */ void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 136fe0495374..a79e59fc3b7d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1125,6 +1125,9 @@ extern void perf_event_mmap(struct vm_area_struct *vma); extern void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, const char *sym); +extern void perf_event_bpf_event(struct bpf_prog *prog, + enum perf_bpf_event_type type, + u16 flags); extern struct perf_guest_info_callbacks *perf_guest_cbs; extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); @@ -1350,6 +1353,9 @@ static inline void perf_event_mmap(struct vm_area_struct *vma) { } typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data); static inline void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, const char *sym) { } +static inline void perf_event_bpf_event(struct bpf_prog *prog, + enum perf_bpf_event_type type, + u16 flags) { } static inline void perf_event_exec(void) { } static inline void perf_event_comm(struct task_struct *tsk, bool exec) { } static inline void perf_event_namespaces(struct task_struct *tsk) { } diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 1dee5c8f166b..7198ddd0c6b1 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -373,7 +373,8 @@ struct perf_event_attr { write_backward : 1, /* Write ring buffer from end to beginning */ namespaces : 1, /* include namespaces data */ ksymbol : 1, /* include ksymbol events */ - __reserved_1 : 34; + bpf_event : 1, /* include bpf events */ + __reserved_1 : 33; union { __u32 wakeup_events; /* wakeup every n events */ @@ -979,6 +980,25 @@ enum perf_event_type { */ PERF_RECORD_KSYMBOL = 17, + /* + * Record bpf events: + * enum perf_bpf_event_type { + * PERF_BPF_EVENT_UNKNOWN = 0, + * PERF_BPF_EVENT_PROG_LOAD = 1, + * PERF_BPF_EVENT_PROG_UNLOAD = 2, + * }; + * + * struct { + * struct perf_event_header header; + * u16 type; + * u16 flags; + * u32 id; + * u8 tag[BPF_TAG_SIZE]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_BPF_EVENT = 18, + PERF_RECORD_MAX, /* non-ABI */ }; @@ -990,6 +1010,13 @@ enum perf_record_ksymbol_type { #define PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER (1 << 0) +enum perf_bpf_event_type { + PERF_BPF_EVENT_UNKNOWN = 0, + PERF_BPF_EVENT_PROG_LOAD = 1, + PERF_BPF_EVENT_PROG_UNLOAD = 2, + PERF_BPF_EVENT_MAX, /* non-ABI */ +}; + #define PERF_MAX_STACK_DEPTH 127 #define PERF_MAX_CONTEXTS_PER_STACK 8 diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f908b9356025..19c49313c709 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -495,7 +495,7 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog, *symbol_end = addr + hdr->pages * PAGE_SIZE; } -static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) +void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) { const char *end = sym + KSYM_NAME_LEN; const struct btf_type *type; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b155cd17c1bd..30ebd085790b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1211,6 +1211,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { if (atomic_dec_and_test(&prog->aux->refcnt)) { + perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); bpf_prog_kallsyms_del_all(prog); @@ -1554,6 +1555,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) } bpf_prog_kallsyms_add(prog); + perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); return err; free_used_maps: diff --git a/kernel/events/core.c b/kernel/events/core.c index e04ab5f325cf..236bb8ddb7bc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -386,6 +386,7 @@ static atomic_t nr_task_events __read_mostly; static atomic_t nr_freq_events __read_mostly; static atomic_t nr_switch_events __read_mostly; static atomic_t nr_ksymbol_events __read_mostly; +static atomic_t nr_bpf_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -4308,6 +4309,8 @@ static void unaccount_event(struct perf_event *event) dec = true; if (event->attr.ksymbol) atomic_dec(&nr_ksymbol_events); + if (event->attr.bpf_event) + atomic_dec(&nr_bpf_events); if (dec) { if (!atomic_add_unless(&perf_sched_count, -1, 1)) @@ -7747,6 +7750,116 @@ err: WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type); } +/* + * bpf program load/unload tracking + */ + +struct perf_bpf_event { + struct bpf_prog *prog; + struct { + struct perf_event_header header; + u16 type; + u16 flags; + u32 id; + u8 tag[BPF_TAG_SIZE]; + } event_id; +}; + +static int perf_event_bpf_match(struct perf_event *event) +{ + return event->attr.bpf_event; +} + +static void perf_event_bpf_output(struct perf_event *event, void *data) +{ + struct perf_bpf_event *bpf_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret; + + if (!perf_event_bpf_match(event)) + return; + + perf_event_header__init_id(&bpf_event->event_id.header, + &sample, event); + ret = perf_output_begin(&handle, event, + bpf_event->event_id.header.size); + if (ret) + return; + + perf_output_put(&handle, bpf_event->event_id); + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog, + enum perf_bpf_event_type type) +{ + bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD; + char sym[KSYM_NAME_LEN]; + int i; + + if (prog->aux->func_cnt == 0) { + bpf_get_prog_name(prog, sym); + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, + (u64)(unsigned long)prog->bpf_func, + prog->jited_len, unregister, sym); + } else { + for (i = 0; i < prog->aux->func_cnt; i++) { + struct bpf_prog *subprog = prog->aux->func[i]; + + bpf_get_prog_name(subprog, sym); + perf_event_ksymbol( + PERF_RECORD_KSYMBOL_TYPE_BPF, + (u64)(unsigned long)subprog->bpf_func, + subprog->jited_len, unregister, sym); + } + } +} + +void perf_event_bpf_event(struct bpf_prog *prog, + enum perf_bpf_event_type type, + u16 flags) +{ + struct perf_bpf_event bpf_event; + + if (type <= PERF_BPF_EVENT_UNKNOWN || + type >= PERF_BPF_EVENT_MAX) + return; + + switch (type) { + case PERF_BPF_EVENT_PROG_LOAD: + case PERF_BPF_EVENT_PROG_UNLOAD: + if (atomic_read(&nr_ksymbol_events)) + perf_event_bpf_emit_ksymbols(prog, type); + break; + default: + break; + } + + if (!atomic_read(&nr_bpf_events)) + return; + + bpf_event = (struct perf_bpf_event){ + .prog = prog, + .event_id = { + .header = { + .type = PERF_RECORD_BPF_EVENT, + .size = sizeof(bpf_event.event_id), + }, + .type = type, + .flags = flags, + .id = prog->aux->id, + }, + }; + + BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64)); + + memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE); + perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); +} + void perf_event_itrace_started(struct perf_event *event) { event->attach_state |= PERF_ATTACH_ITRACE; @@ -10008,6 +10121,8 @@ static void account_event(struct perf_event *event) inc = true; if (event->attr.ksymbol) atomic_inc(&nr_ksymbol_events); + if (event->attr.bpf_event) + atomic_inc(&nr_bpf_events); if (inc) { /* -- cgit v1.3-7-g2ca7 From aefcb7460e0b5f35f72601b7a98eec5ca1639cf2 Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Tue, 15 Jan 2019 15:18:56 +1100 Subject: m68k/mac: Fix PRAM accessors PMU-based m68k Macs pre-date PowerMac-style NVRAM. Use the appropriate PMU commands. Also implement the missing XPRAM accessors for VIA-based Macs. Acked-by: Geert Uytterhoeven Tested-by: Stan Johnson Signed-off-by: Finn Thain Signed-off-by: Greg Kroah-Hartman --- arch/m68k/mac/misc.c | 43 +++++++++++++++++++++++++++++++++---------- include/uapi/linux/pmu.h | 2 ++ 2 files changed, 35 insertions(+), 10 deletions(-) (limited to 'include/uapi') diff --git a/arch/m68k/mac/misc.c b/arch/m68k/mac/misc.c index af000a015f68..d016ca2e0d10 100644 --- a/arch/m68k/mac/misc.c +++ b/arch/m68k/mac/misc.c @@ -66,23 +66,22 @@ static unsigned char pmu_pram_read_byte(int offset) { struct adb_request req; - if (pmu_request(&req, NULL, 3, PMU_READ_NVRAM, - (offset >> 8) & 0xFF, offset & 0xFF) < 0) + if (pmu_request(&req, NULL, 3, PMU_READ_XPRAM, + offset & 0xFF, 1) < 0) return 0; - while (!req.complete) - pmu_poll(); - return req.reply[3]; + pmu_wait_complete(&req); + + return req.reply[0]; } static void pmu_pram_write_byte(unsigned char data, int offset) { struct adb_request req; - if (pmu_request(&req, NULL, 4, PMU_WRITE_NVRAM, - (offset >> 8) & 0xFF, offset & 0xFF, data) < 0) + if (pmu_request(&req, NULL, 4, PMU_WRITE_XPRAM, + offset & 0xFF, 1, data) < 0) return; - while (!req.complete) - pmu_poll(); + pmu_wait_complete(&req); } #endif /* CONFIG_ADB_PMU */ @@ -151,6 +150,16 @@ static void via_rtc_send(__u8 data) #define RTC_REG_SECONDS_3 3 #define RTC_REG_WRITE_PROTECT 13 +/* + * Inside Mac has no information about two-byte RTC commands but + * the MAME/MESS source code has the essentials. + */ + +#define RTC_REG_XPRAM 14 +#define RTC_CMD_XPRAM_READ (RTC_CMD_READ(RTC_REG_XPRAM) << 8) +#define RTC_CMD_XPRAM_WRITE (RTC_CMD_WRITE(RTC_REG_XPRAM) << 8) +#define RTC_CMD_XPRAM_ARG(a) (((a & 0xE0) << 3) | ((a & 0x1F) << 2)) + /* * Execute a VIA PRAM/RTC command. For read commands * data should point to a one-byte buffer for the @@ -198,11 +207,25 @@ static void via_rtc_command(int command, __u8 *data) static unsigned char via_pram_read_byte(int offset) { - return 0; + unsigned char temp; + + via_rtc_command(RTC_CMD_XPRAM_READ | RTC_CMD_XPRAM_ARG(offset), &temp); + + return temp; } static void via_pram_write_byte(unsigned char data, int offset) { + unsigned char temp; + + temp = 0x55; + via_rtc_command(RTC_CMD_WRITE(RTC_REG_WRITE_PROTECT), &temp); + + temp = data; + via_rtc_command(RTC_CMD_XPRAM_WRITE | RTC_CMD_XPRAM_ARG(offset), &temp); + + temp = 0x55 | RTC_FLG_WRITE_PROTECT; + via_rtc_command(RTC_CMD_WRITE(RTC_REG_WRITE_PROTECT), &temp); } /* diff --git a/include/uapi/linux/pmu.h b/include/uapi/linux/pmu.h index 97256f90e6df..f2fc1bd80017 100644 --- a/include/uapi/linux/pmu.h +++ b/include/uapi/linux/pmu.h @@ -19,7 +19,9 @@ #define PMU_POWER_CTRL 0x11 /* control power of some devices */ #define PMU_ADB_CMD 0x20 /* send ADB packet */ #define PMU_ADB_POLL_OFF 0x21 /* disable ADB auto-poll */ +#define PMU_WRITE_XPRAM 0x32 /* write eXtended Parameter RAM */ #define PMU_WRITE_NVRAM 0x33 /* write non-volatile RAM */ +#define PMU_READ_XPRAM 0x3a /* read eXtended Parameter RAM */ #define PMU_READ_NVRAM 0x3b /* read non-volatile RAM */ #define PMU_SET_RTC 0x30 /* set real-time clock */ #define PMU_READ_RTC 0x38 /* read real-time clock */ -- cgit v1.3-7-g2ca7 From ec74136ded792deed80780a2f8baf3521eeb72f9 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Mon, 14 Jan 2019 09:10:21 -0800 Subject: binder: create node flag to request sender's security context To allow servers to verify client identity, allow a node flag to be set that causes the sender's security context to be delivered with the transaction. The BR_TRANSACTION command is extended in BR_TRANSACTION_SEC_CTX to contain a pointer to the security context string. Signed-off-by: Todd Kjos Reviewed-by: Joel Fernandes (Google) Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 106 ++++++++++++++++++++++++++++-------- include/uapi/linux/android/binder.h | 19 +++++++ 2 files changed, 102 insertions(+), 23 deletions(-) (limited to 'include/uapi') diff --git a/drivers/android/binder.c b/drivers/android/binder.c index cdfc87629efb..5f6ef5e63b91 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -329,6 +329,8 @@ struct binder_error { * (invariant after initialized) * @min_priority: minimum scheduling priority * (invariant after initialized) + * @txn_security_ctx: require sender's security context + * (invariant after initialized) * @async_todo: list of async work items * (protected by @proc->inner_lock) * @@ -365,6 +367,7 @@ struct binder_node { * invariant after initialization */ u8 accept_fds:1; + u8 txn_security_ctx:1; u8 min_priority; }; bool has_async_transaction; @@ -615,6 +618,7 @@ struct binder_transaction { long saved_priority; kuid_t sender_euid; struct list_head fd_fixups; + binder_uintptr_t security_ctx; /** * @lock: protects @from, @to_proc, and @to_thread * @@ -1152,6 +1156,7 @@ static struct binder_node *binder_init_node_ilocked( node->work.type = BINDER_WORK_NODE; node->min_priority = flags & FLAT_BINDER_FLAG_PRIORITY_MASK; node->accept_fds = !!(flags & FLAT_BINDER_FLAG_ACCEPTS_FDS); + node->txn_security_ctx = !!(flags & FLAT_BINDER_FLAG_TXN_SECURITY_CTX); spin_lock_init(&node->lock); INIT_LIST_HEAD(&node->work.entry); INIT_LIST_HEAD(&node->async_todo); @@ -2778,6 +2783,8 @@ static void binder_transaction(struct binder_proc *proc, binder_size_t last_fixup_min_off = 0; struct binder_context *context = proc->context; int t_debug_id = atomic_inc_return(&binder_last_id); + char *secctx = NULL; + u32 secctx_sz = 0; e = binder_transaction_log_add(&binder_transaction_log); e->debug_id = t_debug_id; @@ -3020,6 +3027,20 @@ static void binder_transaction(struct binder_proc *proc, t->flags = tr->flags; t->priority = task_nice(current); + if (target_node && target_node->txn_security_ctx) { + u32 secid; + + security_task_getsecid(proc->tsk, &secid); + ret = security_secid_to_secctx(secid, &secctx, &secctx_sz); + if (ret) { + return_error = BR_FAILED_REPLY; + return_error_param = ret; + return_error_line = __LINE__; + goto err_get_secctx_failed; + } + extra_buffers_size += ALIGN(secctx_sz, sizeof(u64)); + } + trace_binder_transaction(reply, t, target_node); t->buffer = binder_alloc_new_buf(&target_proc->alloc, tr->data_size, @@ -3036,6 +3057,19 @@ static void binder_transaction(struct binder_proc *proc, t->buffer = NULL; goto err_binder_alloc_buf_failed; } + if (secctx) { + size_t buf_offset = ALIGN(tr->data_size, sizeof(void *)) + + ALIGN(tr->offsets_size, sizeof(void *)) + + ALIGN(extra_buffers_size, sizeof(void *)) - + ALIGN(secctx_sz, sizeof(u64)); + char *kptr = t->buffer->data + buf_offset; + + t->security_ctx = (uintptr_t)kptr + + binder_alloc_get_user_buffer_offset(&target_proc->alloc); + memcpy(kptr, secctx, secctx_sz); + security_release_secctx(secctx, secctx_sz); + secctx = NULL; + } t->buffer->debug_id = t->debug_id; t->buffer->transaction = t; t->buffer->target_node = target_node; @@ -3305,6 +3339,9 @@ err_copy_data_failed: t->buffer->transaction = NULL; binder_alloc_free_buf(&target_proc->alloc, t->buffer); err_binder_alloc_buf_failed: + if (secctx) + security_release_secctx(secctx, secctx_sz); +err_get_secctx_failed: kfree(tcomplete); binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE); err_alloc_tcomplete_failed: @@ -4036,11 +4073,13 @@ retry: while (1) { uint32_t cmd; - struct binder_transaction_data tr; + struct binder_transaction_data_secctx tr; + struct binder_transaction_data *trd = &tr.transaction_data; struct binder_work *w = NULL; struct list_head *list = NULL; struct binder_transaction *t = NULL; struct binder_thread *t_from; + size_t trsize = sizeof(*trd); binder_inner_proc_lock(proc); if (!binder_worklist_empty_ilocked(&thread->todo)) @@ -4240,8 +4279,8 @@ retry: if (t->buffer->target_node) { struct binder_node *target_node = t->buffer->target_node; - tr.target.ptr = target_node->ptr; - tr.cookie = target_node->cookie; + trd->target.ptr = target_node->ptr; + trd->cookie = target_node->cookie; t->saved_priority = task_nice(current); if (t->priority < target_node->min_priority && !(t->flags & TF_ONE_WAY)) @@ -4251,22 +4290,23 @@ retry: binder_set_nice(target_node->min_priority); cmd = BR_TRANSACTION; } else { - tr.target.ptr = 0; - tr.cookie = 0; + trd->target.ptr = 0; + trd->cookie = 0; cmd = BR_REPLY; } - tr.code = t->code; - tr.flags = t->flags; - tr.sender_euid = from_kuid(current_user_ns(), t->sender_euid); + trd->code = t->code; + trd->flags = t->flags; + trd->sender_euid = from_kuid(current_user_ns(), t->sender_euid); t_from = binder_get_txn_from(t); if (t_from) { struct task_struct *sender = t_from->proc->tsk; - tr.sender_pid = task_tgid_nr_ns(sender, - task_active_pid_ns(current)); + trd->sender_pid = + task_tgid_nr_ns(sender, + task_active_pid_ns(current)); } else { - tr.sender_pid = 0; + trd->sender_pid = 0; } ret = binder_apply_fd_fixups(t); @@ -4297,15 +4337,20 @@ retry: } continue; } - tr.data_size = t->buffer->data_size; - tr.offsets_size = t->buffer->offsets_size; - tr.data.ptr.buffer = (binder_uintptr_t) + trd->data_size = t->buffer->data_size; + trd->offsets_size = t->buffer->offsets_size; + trd->data.ptr.buffer = (binder_uintptr_t) ((uintptr_t)t->buffer->data + binder_alloc_get_user_buffer_offset(&proc->alloc)); - tr.data.ptr.offsets = tr.data.ptr.buffer + + trd->data.ptr.offsets = trd->data.ptr.buffer + ALIGN(t->buffer->data_size, sizeof(void *)); + tr.secctx = t->security_ctx; + if (t->security_ctx) { + cmd = BR_TRANSACTION_SEC_CTX; + trsize = sizeof(tr); + } if (put_user(cmd, (uint32_t __user *)ptr)) { if (t_from) binder_thread_dec_tmpref(t_from); @@ -4316,7 +4361,7 @@ retry: return -EFAULT; } ptr += sizeof(uint32_t); - if (copy_to_user(ptr, &tr, sizeof(tr))) { + if (copy_to_user(ptr, &tr, trsize)) { if (t_from) binder_thread_dec_tmpref(t_from); @@ -4325,7 +4370,7 @@ retry: return -EFAULT; } - ptr += sizeof(tr); + ptr += trsize; trace_binder_transaction_received(t); binder_stat_br(proc, thread, cmd); @@ -4333,16 +4378,18 @@ retry: "%d:%d %s %d %d:%d, cmd %d size %zd-%zd ptr %016llx-%016llx\n", proc->pid, thread->pid, (cmd == BR_TRANSACTION) ? "BR_TRANSACTION" : - "BR_REPLY", + (cmd == BR_TRANSACTION_SEC_CTX) ? + "BR_TRANSACTION_SEC_CTX" : "BR_REPLY", t->debug_id, t_from ? t_from->proc->pid : 0, t_from ? t_from->pid : 0, cmd, t->buffer->data_size, t->buffer->offsets_size, - (u64)tr.data.ptr.buffer, (u64)tr.data.ptr.offsets); + (u64)trd->data.ptr.buffer, + (u64)trd->data.ptr.offsets); if (t_from) binder_thread_dec_tmpref(t_from); t->buffer->allow_user_free = 1; - if (cmd == BR_TRANSACTION && !(t->flags & TF_ONE_WAY)) { + if (cmd != BR_REPLY && !(t->flags & TF_ONE_WAY)) { binder_inner_proc_lock(thread->proc); t->to_parent = thread->transaction_stack; t->to_thread = thread; @@ -4690,7 +4737,8 @@ out: return ret; } -static int binder_ioctl_set_ctx_mgr(struct file *filp) +static int binder_ioctl_set_ctx_mgr(struct file *filp, + struct flat_binder_object *fbo) { int ret = 0; struct binder_proc *proc = filp->private_data; @@ -4719,7 +4767,7 @@ static int binder_ioctl_set_ctx_mgr(struct file *filp) } else { context->binder_context_mgr_uid = curr_euid; } - new_node = binder_new_node(proc, NULL); + new_node = binder_new_node(proc, fbo); if (!new_node) { ret = -ENOMEM; goto out; @@ -4842,8 +4890,20 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) binder_inner_proc_unlock(proc); break; } + case BINDER_SET_CONTEXT_MGR_EXT: { + struct flat_binder_object fbo; + + if (copy_from_user(&fbo, ubuf, sizeof(fbo))) { + ret = -EINVAL; + goto err; + } + ret = binder_ioctl_set_ctx_mgr(filp, &fbo); + if (ret) + goto err; + break; + } case BINDER_SET_CONTEXT_MGR: - ret = binder_ioctl_set_ctx_mgr(filp); + ret = binder_ioctl_set_ctx_mgr(filp, NULL); if (ret) goto err; break; diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index b9ba520f7e4b..2832134e5397 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -41,6 +41,14 @@ enum { enum { FLAT_BINDER_FLAG_PRIORITY_MASK = 0xff, FLAT_BINDER_FLAG_ACCEPTS_FDS = 0x100, + + /** + * @FLAT_BINDER_FLAG_TXN_SECURITY_CTX: request security contexts + * + * Only when set, causes senders to include their security + * context + */ + FLAT_BINDER_FLAG_TXN_SECURITY_CTX = 0x1000, }; #ifdef BINDER_IPC_32BIT @@ -218,6 +226,7 @@ struct binder_node_info_for_ref { #define BINDER_VERSION _IOWR('b', 9, struct binder_version) #define BINDER_GET_NODE_DEBUG_INFO _IOWR('b', 11, struct binder_node_debug_info) #define BINDER_GET_NODE_INFO_FOR_REF _IOWR('b', 12, struct binder_node_info_for_ref) +#define BINDER_SET_CONTEXT_MGR_EXT _IOW('b', 13, struct flat_binder_object) /* * NOTE: Two special error codes you should check for when calling @@ -276,6 +285,11 @@ struct binder_transaction_data { } data; }; +struct binder_transaction_data_secctx { + struct binder_transaction_data transaction_data; + binder_uintptr_t secctx; +}; + struct binder_transaction_data_sg { struct binder_transaction_data transaction_data; binder_size_t buffers_size; @@ -311,6 +325,11 @@ enum binder_driver_return_protocol { BR_OK = _IO('r', 1), /* No parameters! */ + BR_TRANSACTION_SEC_CTX = _IOR('r', 2, + struct binder_transaction_data_secctx), + /* + * binder_transaction_data_secctx: the received command. + */ BR_TRANSACTION = _IOR('r', 2, struct binder_transaction_data), BR_REPLY = _IOR('r', 3, struct binder_transaction_data), /* -- cgit v1.3-7-g2ca7 From 0d6040d4681735dfc47565de288525de405a5c99 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 31 Dec 2018 14:38:26 +0100 Subject: arch: add split IPC system calls where needed The IPC system call handling is highly inconsistent across architectures, some use sys_ipc, some use separate calls, and some use both. We also have some architectures that require passing IPC_64 in the flags, and others that set it implicitly. For the addition of a y2038 safe semtimedop() system call, I chose to only support the separate entry points, but that requires first supporting the regular ones with their own syscall numbers. The IPC_64 is now implied by the new semctl/shmctl/msgctl system calls even on the architectures that require passing it with the ipc() multiplexer. I'm not adding the new semtimedop() or semop() on 32-bit architectures, those will get implemented using the new semtimedop_time64() version that gets added along with the other time64 calls. Three 64-bit architectures (powerpc, s390 and sparc) get semtimedop(). Signed-off-by: Arnd Bergmann Acked-by: Geert Uytterhoeven Acked-by: Heiko Carstens --- arch/m68k/kernel/syscalls/syscall.tbl | 11 +++++++++++ arch/mips/kernel/syscalls/syscall_o32.tbl | 11 +++++++++++ arch/powerpc/kernel/syscalls/syscall.tbl | 13 +++++++++++++ arch/s390/kernel/syscalls/syscall.tbl | 12 ++++++++++++ arch/sh/kernel/syscalls/syscall.tbl | 11 +++++++++++ arch/sparc/kernel/syscalls/syscall.tbl | 12 ++++++++++++ arch/x86/entry/syscalls/syscall_32.tbl | 11 +++++++++++ arch/x86/entry/syscalls/syscall_64.tbl | 2 ++ include/uapi/asm-generic/unistd.h | 1 + 9 files changed, 84 insertions(+) (limited to 'include/uapi') diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 85779d6ef935..5354ba02eed2 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -388,3 +388,14 @@ 378 common pwritev2 sys_pwritev2 379 common statx sys_statx 380 common seccomp sys_seccomp +# room for arch specific calls +393 common semget sys_semget +394 common semctl sys_semctl +395 common shmget sys_shmget +396 common shmctl sys_shmctl +397 common shmat sys_shmat +398 common shmdt sys_shmdt +399 common msgget sys_msgget +400 common msgsnd sys_msgsnd +401 common msgrcv sys_msgrcv +402 common msgctl sys_msgctl diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 3d5a47b80d2b..fa47ea8cc6ef 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -380,3 +380,14 @@ 366 o32 statx sys_statx 367 o32 rseq sys_rseq 368 o32 io_pgetevents sys_io_pgetevents compat_sys_io_pgetevents +# room for arch specific calls +393 o32 semget sys_semget +394 o32 semctl sys_semctl compat_sys_semctl +395 o32 shmget sys_shmget +396 o32 shmctl sys_shmctl compat_sys_shmctl +397 o32 shmat sys_shmat compat_sys_shmat +398 o32 shmdt sys_shmdt +399 o32 msgget sys_msgget +400 o32 msgsnd sys_msgsnd compat_sys_msgsnd +401 o32 msgrcv sys_msgrcv compat_sys_msgrcv +402 o32 msgctl sys_msgctl compat_sys_msgctl diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index db3bbb8744af..7555874ce39c 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -414,6 +414,7 @@ 363 spu switch_endian sys_ni_syscall 364 common userfaultfd sys_userfaultfd 365 common membarrier sys_membarrier +# 366-377 originally left for IPC, now unused 378 nospu mlock2 sys_mlock2 379 nospu copy_file_range sys_copy_file_range 380 common preadv2 sys_preadv2 compat_sys_preadv2 @@ -425,3 +426,15 @@ 386 nospu pkey_mprotect sys_pkey_mprotect 387 nospu rseq sys_rseq 388 nospu io_pgetevents sys_io_pgetevents compat_sys_io_pgetevents +# room for arch specific syscalls +392 64 semtimedop sys_semtimedop +393 common semget sys_semget +394 common semctl sys_semctl compat_sys_semctl +395 common shmget sys_shmget +396 common shmctl sys_shmctl compat_sys_shmctl +397 common shmat sys_shmat compat_sys_shmat +398 common shmdt sys_shmdt +399 common msgget sys_msgget +400 common msgsnd sys_msgsnd compat_sys_msgsnd +401 common msgrcv sys_msgrcv compat_sys_msgrcv +402 common msgctl sys_msgctl compat_sys_msgctl diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 7413fd318e2a..0bccb01c6202 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -391,3 +391,15 @@ 381 common kexec_file_load sys_kexec_file_load sys_kexec_file_load 382 common io_pgetevents sys_io_pgetevents compat_sys_io_pgetevents 383 common rseq sys_rseq sys_rseq +# room for arch specific syscalls +392 64 semtimedop sys_semtimedop - +393 common semget sys_semget sys_semget +394 common semctl sys_semctl compat_sys_semctl +395 common shmget sys_shmget sys_shmget +396 common shmctl sys_shmctl compat_sys_shmctl +397 common shmat sys_shmat compat_sys_shmat +398 common shmdt sys_shmdt sys_shmdt +399 common msgget sys_msgget sys_msgget +400 common msgsnd sys_msgsnd compat_sys_msgsnd +401 common msgrcv sys_msgrcv compat_sys_msgrcv +402 common msgctl sys_msgctl compat_sys_msgctl diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index a70db013dbc7..6d0b84e3ef2d 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -391,3 +391,14 @@ 381 common preadv2 sys_preadv2 382 common pwritev2 sys_pwritev2 383 common statx sys_statx +# room for arch specific syscalls +393 common semget sys_semget +394 common semctl sys_semctl +395 common shmget sys_shmget +396 common shmctl sys_shmctl +397 common shmat sys_shmat +398 common shmdt sys_shmdt +399 common msgget sys_msgget +400 common msgsnd sys_msgsnd +401 common msgrcv sys_msgrcv +402 common msgctl sys_msgctl diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index c8c77c05ea97..8c9580302422 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -407,3 +407,15 @@ 359 common pwritev2 sys_pwritev2 compat_sys_pwritev2 360 common statx sys_statx 361 common io_pgetevents sys_io_pgetevents compat_sys_io_pgetevents +# room for arch specific syscalls +392 64 semtimedop sys_semtimedop +393 common semget sys_semget +394 common semctl sys_semctl compat_sys_semctl +395 common shmget sys_shmget +396 common shmctl sys_shmctl compat_sys_shmctl +397 common shmat sys_shmat compat_sys_shmat +398 common shmdt sys_shmdt +399 common msgget sys_msgget +400 common msgsnd sys_msgsnd compat_sys_msgsnd +401 common msgrcv sys_msgrcv compat_sys_msgrcv +402 common msgctl sys_msgctl compat_sys_msgctl diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 3cf7b533b3d1..2be1d0eb7754 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -398,3 +398,14 @@ 384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl 385 i386 io_pgetevents sys_io_pgetevents __ia32_compat_sys_io_pgetevents 386 i386 rseq sys_rseq __ia32_sys_rseq +# don't use numbers 387 through 392, add new calls at the end +393 i386 semget sys_semget __ia32_sys_semget +394 i386 semctl sys_semctl __ia32_compat_sys_semctl +395 i386 shmget sys_shmget __ia32_sys_shmget +396 i386 shmctl sys_shmctl __ia32_compat_sys_shmctl +397 i386 shmat sys_shmat __ia32_compat_sys_shmat +398 i386 shmdt sys_shmdt __ia32_sys_shmdt +399 i386 msgget sys_msgget __ia32_sys_msgget +400 i386 msgsnd sys_msgsnd __ia32_compat_sys_msgsnd +401 i386 msgrcv sys_msgrcv __ia32_compat_sys_msgrcv +402 i386 msgctl sys_msgctl __ia32_compat_sys_msgctl diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index f0b1709a5ffb..9d8128ef50a9 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -343,6 +343,8 @@ 332 common statx __x64_sys_statx 333 common io_pgetevents __x64_sys_io_pgetevents 334 common rseq __x64_sys_rseq +# don't use numbers 387 through 423, add new calls after the last +# 'common' entry # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index d90127298f12..509484dbfd5d 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -740,6 +740,7 @@ __SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents) __SYSCALL(__NR_rseq, sys_rseq) #define __NR_kexec_file_load 294 __SYSCALL(__NR_kexec_file_load, sys_kexec_file_load) +/* 295 through 402 are unassigned to sync up with generic numbers, don't use */ #undef __NR_syscalls #define __NR_syscalls 295 -- cgit v1.3-7-g2ca7 From 6bf8f22aea0ddd93af822aed8afeeee4acdf7694 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 22 Jan 2019 08:29:56 +0200 Subject: IB/mlx5: Introduce MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD Introduce MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD and its initial implementation. This object is from type class FD and will be used to read DEVX async commands completion. The core layer should allow the driver to set object from type FD in a safe mode, this option was added with a matching comment in place. Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/rdma_core.c | 1 + drivers/infiniband/core/uverbs_uapi.c | 15 ++++-- drivers/infiniband/hw/mlx5/devx.c | 83 ++++++++++++++++++++++++++++++++ include/rdma/uverbs_types.h | 1 + include/uapi/rdma/mlx5_user_ioctl_cmds.h | 9 ++++ 5 files changed, 104 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 6c4747e61d2b..a260d2f8e0b7 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -801,6 +801,7 @@ void uverbs_close_fd(struct file *f) /* Pairs with filp->private_data in alloc_begin_fd_uobject */ uverbs_uobject_put(uobj); } +EXPORT_SYMBOL(uverbs_close_fd); /* * Drop the ucontext off the ufile and completely disconnect it from the diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c index 9ae08e4b78a3..7a987acf0c0b 100644 --- a/drivers/infiniband/core/uverbs_uapi.c +++ b/drivers/infiniband/core/uverbs_uapi.c @@ -188,13 +188,18 @@ static int uapi_merge_obj_tree(struct uverbs_api *uapi, obj_elm->type_attrs = obj->type_attrs; obj_elm->type_class = obj->type_attrs->type_class; /* - * Today drivers are only permitted to use idr_class - * types. They cannot use FD types because we currently have - * no way to revoke the fops pointer after device - * disassociation. + * Today drivers are only permitted to use idr_class and + * fd_class types. We can revoke the IDR types during + * disassociation, and the FD types require the driver to use + * struct file_operations.owner to prevent the driver module + * code from unloading while the file is open. This provides + * enough safety that uverbs_close_fd() will continue to work. + * Drivers using FD are responsible to handle disassociation of + * the device on their own. */ if (WARN_ON(is_driver && - obj->type_attrs->type_class != &uverbs_idr_class)) + obj->type_attrs->type_class != &uverbs_idr_class && + obj->type_attrs->type_class != &uverbs_fd_class)) return -EINVAL; } diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 5a588f3cfb1b..9933bcf83a6b 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -1168,6 +1168,38 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_QUERY)( cmd_out, cmd_out_len); } +struct devx_async_event_queue { + spinlock_t lock; + wait_queue_head_t poll_wait; + struct list_head event_list; +}; + +struct devx_async_cmd_event_file { + struct ib_uobject uobj; + struct devx_async_event_queue ev_queue; +}; + +static void devx_init_event_queue(struct devx_async_event_queue *ev_queue) +{ + spin_lock_init(&ev_queue->lock); + INIT_LIST_HEAD(&ev_queue->event_list); + init_waitqueue_head(&ev_queue->poll_wait); +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC)( + struct uverbs_attr_bundle *attrs) +{ + struct devx_async_cmd_event_file *ev_file; + + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, MLX5_IB_ATTR_DEVX_ASYNC_CMD_FD_ALLOC_HANDLE); + + ev_file = container_of(uobj, struct devx_async_cmd_event_file, + uobj); + devx_init_event_queue(&ev_file->ev_queue); + return 0; +} + static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext, struct uverbs_attr_bundle *attrs, struct devx_umem *obj) @@ -1313,6 +1345,38 @@ static int devx_umem_cleanup(struct ib_uobject *uobject, return 0; } +static ssize_t devx_async_cmd_event_read(struct file *filp, char __user *buf, + size_t count, loff_t *pos) +{ + return -EINVAL; +} + +static int devx_async_cmd_event_close(struct inode *inode, struct file *filp) +{ + uverbs_close_fd(filp); + return 0; +} + +static __poll_t devx_async_cmd_event_poll(struct file *filp, + struct poll_table_struct *wait) +{ + return 0; +} + +const struct file_operations devx_async_cmd_event_fops = { + .owner = THIS_MODULE, + .read = devx_async_cmd_event_read, + .poll = devx_async_cmd_event_poll, + .release = devx_async_cmd_event_close, + .llseek = no_llseek, +}; + +static int devx_hot_unplug_async_cmd_event_file(struct ib_uobject *uobj, + enum rdma_remove_reason why) +{ + return 0; +}; + DECLARE_UVERBS_NAMED_METHOD( MLX5_IB_METHOD_DEVX_UMEM_REG, UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_UMEM_REG_HANDLE, @@ -1440,6 +1504,22 @@ DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_DEVX_UMEM, &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_UMEM_REG), &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_UMEM_DEREG)); + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC, + UVERBS_ATTR_FD(MLX5_IB_ATTR_DEVX_ASYNC_CMD_FD_ALLOC_HANDLE, + MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD, + UVERBS_ACCESS_NEW, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT( + MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD, + UVERBS_TYPE_ALLOC_FD(sizeof(struct devx_async_cmd_event_file), + devx_hot_unplug_async_cmd_event_file, + &devx_async_cmd_event_fops, "[devx_async_cmd]", + O_RDONLY), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC)); + static bool devx_is_supported(struct ib_device *device) { struct mlx5_ib_dev *dev = to_mdev(device); @@ -1457,5 +1537,8 @@ const struct uapi_definition mlx5_ib_devx_defs[] = { UAPI_DEF_CHAIN_OBJ_TREE_NAMED( MLX5_IB_OBJECT_DEVX_UMEM, UAPI_DEF_IS_OBJ_SUPPORTED(devx_is_supported)), + UAPI_DEF_CHAIN_OBJ_TREE_NAMED( + MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD, + UAPI_DEF_IS_OBJ_SUPPORTED(devx_is_supported)), {}, }; diff --git a/include/rdma/uverbs_types.h b/include/rdma/uverbs_types.h index acb1bfa3cc99..175d761695e1 100644 --- a/include/rdma/uverbs_types.h +++ b/include/rdma/uverbs_types.h @@ -157,6 +157,7 @@ struct uverbs_obj_fd_type { extern const struct uverbs_obj_type_class uverbs_idr_class; extern const struct uverbs_obj_type_class uverbs_fd_class; +void uverbs_close_fd(struct file *f); #define UVERBS_BUILD_BUG_ON(cond) (sizeof(char[1 - 2 * !!(cond)]) - \ sizeof(char)) diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index b8d121d457f1..6ceae29d77cd 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -113,11 +113,20 @@ enum mlx5_ib_devx_umem_methods { MLX5_IB_METHOD_DEVX_UMEM_DEREG, }; +enum mlx5_ib_devx_async_cmd_fd_alloc_attrs { + MLX5_IB_ATTR_DEVX_ASYNC_CMD_FD_ALLOC_HANDLE = (1U << UVERBS_ID_NS_SHIFT), +}; + +enum mlx5_ib_devx_async_cmd_fd_methods { + MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC = (1U << UVERBS_ID_NS_SHIFT), +}; + enum mlx5_ib_objects { MLX5_IB_OBJECT_DEVX = (1U << UVERBS_ID_NS_SHIFT), MLX5_IB_OBJECT_DEVX_OBJ, MLX5_IB_OBJECT_DEVX_UMEM, MLX5_IB_OBJECT_FLOW_MATCHER, + MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD, }; enum mlx5_ib_flow_matcher_create_attrs { -- cgit v1.3-7-g2ca7 From a124edba26270697540f1058bfcd490c1c65b116 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 22 Jan 2019 08:29:57 +0200 Subject: IB/mlx5: Introduce async DEVX obj query API Introduce async DEVX obj query API to get the command response back to user space once it's ready without blocking when calling the firmware. The event's data includes a header with some meta data then the firmware output command data. The header includes: - The input 'wr_id' to let application recognizing the response. The input FD attribute is used to have the event data ready on. Downstream patches from this series will implement the file ops to let application read it. Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/devx.c | 164 +++++++++++++++++++++++++++++- include/uapi/rdma/mlx5_user_ioctl_cmds.h | 9 ++ include/uapi/rdma/mlx5_user_ioctl_verbs.h | 5 + 3 files changed, 177 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 9933bcf83a6b..9ca116155f9c 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -17,6 +18,16 @@ #define UVERBS_MODULE_NAME mlx5_ib #include +struct devx_async_data { + struct mlx5_ib_dev *mdev; + struct list_head list; + struct ib_uobject *fd_uobj; + struct mlx5_async_work cb_work; + u16 cmd_out_len; + /* must be last field in this structure */ + struct mlx5_ib_uapi_devx_async_cmd_hdr hdr; +}; + #define MLX5_MAX_DESTROY_INBOX_SIZE_DW MLX5_ST_SZ_DW(delete_fte_in) struct devx_obj { struct mlx5_core_dev *mdev; @@ -1172,11 +1183,13 @@ struct devx_async_event_queue { spinlock_t lock; wait_queue_head_t poll_wait; struct list_head event_list; + atomic_t bytes_in_use; }; struct devx_async_cmd_event_file { struct ib_uobject uobj; struct devx_async_event_queue ev_queue; + struct mlx5_async_ctx async_ctx; }; static void devx_init_event_queue(struct devx_async_event_queue *ev_queue) @@ -1184,6 +1197,7 @@ static void devx_init_event_queue(struct devx_async_event_queue *ev_queue) spin_lock_init(&ev_queue->lock); INIT_LIST_HEAD(&ev_queue->event_list); init_waitqueue_head(&ev_queue->poll_wait); + atomic_set(&ev_queue->bytes_in_use, 0); } static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC)( @@ -1193,13 +1207,123 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC)( struct ib_uobject *uobj = uverbs_attr_get_uobject( attrs, MLX5_IB_ATTR_DEVX_ASYNC_CMD_FD_ALLOC_HANDLE); + struct mlx5_ib_dev *mdev = to_mdev(uobj->context->device); ev_file = container_of(uobj, struct devx_async_cmd_event_file, uobj); devx_init_event_queue(&ev_file->ev_queue); + mlx5_cmd_init_async_ctx(mdev->mdev, &ev_file->async_ctx); return 0; } +static void devx_query_callback(int status, struct mlx5_async_work *context) +{ + struct devx_async_data *async_data = + container_of(context, struct devx_async_data, cb_work); + struct ib_uobject *fd_uobj = async_data->fd_uobj; + struct devx_async_cmd_event_file *ev_file; + struct devx_async_event_queue *ev_queue; + unsigned long flags; + + ev_file = container_of(fd_uobj, struct devx_async_cmd_event_file, + uobj); + ev_queue = &ev_file->ev_queue; + + spin_lock_irqsave(&ev_queue->lock, flags); + list_add_tail(&async_data->list, &ev_queue->event_list); + spin_unlock_irqrestore(&ev_queue->lock, flags); + + wake_up_interruptible(&ev_queue->poll_wait); + fput(fd_uobj->object); +} + +#define MAX_ASYNC_BYTES_IN_USE (1024 * 1024) /* 1MB */ + +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY)( + struct uverbs_attr_bundle *attrs) +{ + void *cmd_in = uverbs_attr_get_alloced_ptr(attrs, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_CMD_IN); + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_HANDLE); + u16 cmd_out_len; + struct mlx5_ib_ucontext *c = to_mucontext(uobj->context); + struct ib_uobject *fd_uobj; + int err; + int uid; + struct mlx5_ib_dev *mdev = to_mdev(uobj->context->device); + struct devx_async_cmd_event_file *ev_file; + struct devx_async_data *async_data; + + uid = devx_get_uid(c, cmd_in); + if (uid < 0) + return uid; + + if (!devx_is_obj_query_cmd(cmd_in)) + return -EINVAL; + + err = uverbs_get_const(&cmd_out_len, attrs, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_OUT_LEN); + if (err) + return err; + + if (!devx_is_valid_obj_id(uobj, cmd_in)) + return -EINVAL; + + fd_uobj = uverbs_attr_get_uobject(attrs, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_FD); + if (IS_ERR(fd_uobj)) + return PTR_ERR(fd_uobj); + + ev_file = container_of(fd_uobj, struct devx_async_cmd_event_file, + uobj); + + if (atomic_add_return(cmd_out_len, &ev_file->ev_queue.bytes_in_use) > + MAX_ASYNC_BYTES_IN_USE) { + atomic_sub(cmd_out_len, &ev_file->ev_queue.bytes_in_use); + return -EAGAIN; + } + + async_data = kvzalloc(struct_size(async_data, hdr.out_data, + cmd_out_len), GFP_KERNEL); + if (!async_data) { + err = -ENOMEM; + goto sub_bytes; + } + + err = uverbs_copy_from(&async_data->hdr.wr_id, attrs, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_WR_ID); + if (err) + goto free_async; + + async_data->cmd_out_len = cmd_out_len; + async_data->mdev = mdev; + async_data->fd_uobj = fd_uobj; + + get_file(fd_uobj->object); + MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, uid); + err = mlx5_cmd_exec_cb(&ev_file->async_ctx, cmd_in, + uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_CMD_IN), + async_data->hdr.out_data, + async_data->cmd_out_len, + devx_query_callback, &async_data->cb_work); + + if (err) + goto cb_err; + + return 0; + +cb_err: + fput(fd_uobj->object); +free_async: + kvfree(async_data); +sub_bytes: + atomic_sub(cmd_out_len, &ev_file->ev_queue.bytes_in_use); + return err; +} + static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext, struct uverbs_attr_bundle *attrs, struct devx_umem *obj) @@ -1353,6 +1477,17 @@ static ssize_t devx_async_cmd_event_read(struct file *filp, char __user *buf, static int devx_async_cmd_event_close(struct inode *inode, struct file *filp) { + struct ib_uobject *uobj = filp->private_data; + struct devx_async_cmd_event_file *comp_ev_file = container_of( + uobj, struct devx_async_cmd_event_file, uobj); + struct devx_async_data *entry, *tmp; + + spin_lock_irq(&comp_ev_file->ev_queue.lock); + list_for_each_entry_safe(entry, tmp, + &comp_ev_file->ev_queue.event_list, list) + kvfree(entry); + spin_unlock_irq(&comp_ev_file->ev_queue.lock); + uverbs_close_fd(filp); return 0; } @@ -1374,6 +1509,11 @@ const struct file_operations devx_async_cmd_event_fops = { static int devx_hot_unplug_async_cmd_event_file(struct ib_uobject *uobj, enum rdma_remove_reason why) { + struct devx_async_cmd_event_file *comp_ev_file = + container_of(uobj, struct devx_async_cmd_event_file, + uobj); + + mlx5_cmd_cleanup_async_ctx(&comp_ev_file->async_ctx); return 0; }; @@ -1487,6 +1627,27 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_out_cmd_hdr)), UA_MANDATORY)); +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_OBJ_QUERY_HANDLE, + UVERBS_IDR_ANY_OBJECT, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN( + MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_IN, + UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_in_cmd_hdr)), + UA_MANDATORY, + UA_ALLOC_AND_COPY), + UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_OUT_LEN, + u16, UA_MANDATORY), + UVERBS_ATTR_FD(MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_FD, + MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_WR_ID, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY)); + DECLARE_UVERBS_GLOBAL_METHODS(MLX5_IB_OBJECT_DEVX, &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OTHER), &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_QUERY_UAR), @@ -1497,7 +1658,8 @@ DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_DEVX_OBJ, &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_CREATE), &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_DESTROY), &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_MODIFY), - &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_QUERY)); + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_QUERY), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY)); DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_DEVX_UMEM, UVERBS_TYPE_ALLOC_IDR(devx_umem_cleanup), diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 6ceae29d77cd..8149d224030b 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -84,6 +84,14 @@ enum mlx5_ib_devx_obj_query_attrs { MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_OUT, }; +enum mlx5_ib_devx_obj_query_async_attrs { + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_CMD_IN, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_FD, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_WR_ID, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_OUT_LEN, +}; + enum mlx5_ib_devx_query_eqn_attrs { MLX5_IB_ATTR_DEVX_QUERY_EQN_USER_VEC = (1U << UVERBS_ID_NS_SHIFT), MLX5_IB_ATTR_DEVX_QUERY_EQN_DEV_EQN, @@ -94,6 +102,7 @@ enum mlx5_ib_devx_obj_methods { MLX5_IB_METHOD_DEVX_OBJ_DESTROY, MLX5_IB_METHOD_DEVX_OBJ_MODIFY, MLX5_IB_METHOD_DEVX_OBJ_QUERY, + MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY, }; enum mlx5_ib_devx_umem_reg_attrs { diff --git a/include/uapi/rdma/mlx5_user_ioctl_verbs.h b/include/uapi/rdma/mlx5_user_ioctl_verbs.h index 4ef62c0e8452..4a701033b93f 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_verbs.h +++ b/include/uapi/rdma/mlx5_user_ioctl_verbs.h @@ -51,5 +51,10 @@ enum mlx5_ib_uapi_flow_action_packet_reformat_type { MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL = 0x3, }; +struct mlx5_ib_uapi_devx_async_cmd_hdr { + __aligned_u64 wr_id; + __u8 out_data[]; +}; + #endif -- cgit v1.3-7-g2ca7 From 71368af9027f18fe5d1c6f372cfdff7e4bde8b48 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 16 Jan 2019 17:01:36 -0500 Subject: x86/speculation: Add PR_SPEC_DISABLE_NOEXEC With the default SPEC_STORE_BYPASS_SECCOMP/SPEC_STORE_BYPASS_PRCTL mode, the TIF_SSBD bit will be inherited when a new task is fork'ed or cloned. It will also remain when a new program is execve'ed. Only certain class of applications (like Java) that can run on behalf of multiple users on a single thread will require disabling speculative store bypass for security purposes. Those applications will call prctl(2) at startup time to disable SSB. They won't rely on the fact the SSB might have been disabled. Other applications that don't need SSBD will just move on without checking if SSBD has been turned on or not. The fact that the TIF_SSBD is inherited across execve(2) boundary will cause performance of applications that don't need SSBD but their predecessors have SSBD on to be unwittingly impacted especially if they write to memory a lot. To remedy this problem, a new PR_SPEC_DISABLE_NOEXEC argument for the PR_SET_SPECULATION_CTRL option of prctl(2) is added to allow applications to specify that the SSBD feature bit on the task structure should be cleared whenever a new program is being execve'ed. Suggested-by: Thomas Gleixner Signed-off-by: Waiman Long Signed-off-by: Thomas Gleixner Cc: Borislav Petkov Cc: Jonathan Corbet Cc: linux-doc@vger.kernel.org Cc: "H. Peter Anvin" Cc: Andi Kleen Cc: David Woodhouse Cc: Jiri Kosina Cc: Josh Poimboeuf Cc: Tim Chen Cc: KarimAllah Ahmed Cc: Peter Zijlstra Cc: Konrad Rzeszutek Wilk Link: https://lkml.kernel.org/r/1547676096-3281-1-git-send-email-longman@redhat.com --- Documentation/userspace-api/spec_ctrl.rst | 27 +++++++++++++++------------ arch/x86/kernel/cpu/bugs.c | 12 ++++++++++++ arch/x86/kernel/process.c | 12 ++++++++++++ include/linux/sched.h | 5 +++++ include/uapi/linux/prctl.h | 1 + tools/include/uapi/linux/prctl.h | 1 + 6 files changed, 46 insertions(+), 12 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/userspace-api/spec_ctrl.rst b/Documentation/userspace-api/spec_ctrl.rst index c4dbe6f7cdae..1129c7550a48 100644 --- a/Documentation/userspace-api/spec_ctrl.rst +++ b/Documentation/userspace-api/spec_ctrl.rst @@ -28,18 +28,20 @@ PR_GET_SPECULATION_CTRL returns the state of the speculation misfeature which is selected with arg2 of prctl(2). The return value uses bits 0-3 with the following meaning: -==== ===================== =================================================== -Bit Define Description -==== ===================== =================================================== -0 PR_SPEC_PRCTL Mitigation can be controlled per task by - PR_SET_SPECULATION_CTRL. -1 PR_SPEC_ENABLE The speculation feature is enabled, mitigation is - disabled. -2 PR_SPEC_DISABLE The speculation feature is disabled, mitigation is - enabled. -3 PR_SPEC_FORCE_DISABLE Same as PR_SPEC_DISABLE, but cannot be undone. A - subsequent prctl(..., PR_SPEC_ENABLE) will fail. -==== ===================== =================================================== +==== ====================== ================================================== +Bit Define Description +==== ====================== ================================================== +0 PR_SPEC_PRCTL Mitigation can be controlled per task by + PR_SET_SPECULATION_CTRL. +1 PR_SPEC_ENABLE The speculation feature is enabled, mitigation is + disabled. +2 PR_SPEC_DISABLE The speculation feature is disabled, mitigation is + enabled. +3 PR_SPEC_FORCE_DISABLE Same as PR_SPEC_DISABLE, but cannot be undone. A + subsequent prctl(..., PR_SPEC_ENABLE) will fail. +4 PR_SPEC_DISABLE_NOEXEC Same as PR_SPEC_DISABLE, but the state will be + cleared on :manpage:`execve(2)`. +==== ====================== ================================================== If all bits are 0 the CPU is not affected by the speculation misfeature. @@ -92,6 +94,7 @@ Speculation misfeature controls * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_ENABLE, 0, 0); * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_DISABLE, 0, 0); * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_FORCE_DISABLE, 0, 0); + * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_DISABLE_NOEXEC, 0, 0); - PR_SPEC_INDIR_BRANCH: Indirect Branch Speculation in User Processes (Mitigate Spectre V2 style attacks against user processes) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 1de0f4170178..2faeaf46347a 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -798,15 +798,25 @@ static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) if (task_spec_ssb_force_disable(task)) return -EPERM; task_clear_spec_ssb_disable(task); + task_clear_spec_ssb_noexec(task); task_update_spec_tif(task); break; case PR_SPEC_DISABLE: task_set_spec_ssb_disable(task); + task_clear_spec_ssb_noexec(task); task_update_spec_tif(task); break; case PR_SPEC_FORCE_DISABLE: task_set_spec_ssb_disable(task); task_set_spec_ssb_force_disable(task); + task_clear_spec_ssb_noexec(task); + task_update_spec_tif(task); + break; + case PR_SPEC_DISABLE_NOEXEC: + if (task_spec_ssb_force_disable(task)) + return -EPERM; + task_set_spec_ssb_disable(task); + task_set_spec_ssb_noexec(task); task_update_spec_tif(task); break; default: @@ -885,6 +895,8 @@ static int ssb_prctl_get(struct task_struct *task) case SPEC_STORE_BYPASS_PRCTL: if (task_spec_ssb_force_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; + if (task_spec_ssb_noexec(task)) + return PR_SPEC_PRCTL | PR_SPEC_DISABLE_NOEXEC; if (task_spec_ssb_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_DISABLE; return PR_SPEC_PRCTL | PR_SPEC_ENABLE; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 90ae0ca51083..58ac7be52c7a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -255,6 +255,18 @@ void arch_setup_new_exec(void) /* If cpuid was previously disabled for this task, re-enable it. */ if (test_thread_flag(TIF_NOCPUID)) enable_cpuid(); + + /* + * Don't inherit TIF_SSBD across exec boundary when + * PR_SPEC_DISABLE_NOEXEC is used. + */ + if (test_thread_flag(TIF_SSBD) && + task_spec_ssb_noexec(current)) { + clear_thread_flag(TIF_SSBD); + task_clear_spec_ssb_disable(current); + task_clear_spec_ssb_noexec(current); + speculation_ctrl_update(task_thread_info(current)->flags); + } } static inline void switch_to_bitmap(struct thread_struct *prev, diff --git a/include/linux/sched.h b/include/linux/sched.h index d2f90fa92468..fc836dc71bba 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1459,6 +1459,7 @@ static inline bool is_percpu_thread(void) #define PFA_SPEC_SSB_FORCE_DISABLE 4 /* Speculative Store Bypass force disabled*/ #define PFA_SPEC_IB_DISABLE 5 /* Indirect branch speculation restricted */ #define PFA_SPEC_IB_FORCE_DISABLE 6 /* Indirect branch speculation permanently restricted */ +#define PFA_SPEC_SSB_NOEXEC 7 /* Speculative Store Bypass clear on execve() */ #define TASK_PFA_TEST(name, func) \ static inline bool task_##func(struct task_struct *p) \ @@ -1487,6 +1488,10 @@ TASK_PFA_TEST(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_SET(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable) +TASK_PFA_TEST(SPEC_SSB_NOEXEC, spec_ssb_noexec) +TASK_PFA_SET(SPEC_SSB_NOEXEC, spec_ssb_noexec) +TASK_PFA_CLEAR(SPEC_SSB_NOEXEC, spec_ssb_noexec) + TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index b4875a93363a..094bb03b9cc2 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -219,6 +219,7 @@ struct prctl_mm_map { # define PR_SPEC_ENABLE (1UL << 1) # define PR_SPEC_DISABLE (1UL << 2) # define PR_SPEC_FORCE_DISABLE (1UL << 3) +# define PR_SPEC_DISABLE_NOEXEC (1UL << 4) /* Reset arm64 pointer authentication keys */ #define PR_PAC_RESET_KEYS 54 diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h index b4875a93363a..094bb03b9cc2 100644 --- a/tools/include/uapi/linux/prctl.h +++ b/tools/include/uapi/linux/prctl.h @@ -219,6 +219,7 @@ struct prctl_mm_map { # define PR_SPEC_ENABLE (1UL << 1) # define PR_SPEC_DISABLE (1UL << 2) # define PR_SPEC_FORCE_DISABLE (1UL << 3) +# define PR_SPEC_DISABLE_NOEXEC (1UL << 4) /* Reset arm64 pointer authentication keys */ #define PR_PAC_RESET_KEYS 54 -- cgit v1.3-7-g2ca7 From 1194c4133195dfcb6c5fc0935d54bbed872a5285 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Tue, 29 Jan 2019 00:56:17 +0000 Subject: nfit: Add Hyper-V NVDIMM DSM command set to white list Add the Hyper-V _DSM command set to the white list of NVDIMM command sets. This command set is documented at http://www.uefi.org/RFIC_LIST (see "Virtual NVDIMM 0x1901"). Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Signed-off-by: Dan Williams --- drivers/acpi/nfit/core.c | 17 ++++++++++++++--- drivers/acpi/nfit/nfit.h | 6 +++++- include/uapi/linux/ndctl.h | 1 + 3 files changed, 20 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index 1598e3a121a6..4a7e8b1fa43b 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -1846,9 +1846,17 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, dev_set_drvdata(&adev_dimm->dev, nfit_mem); /* - * Until standardization materializes we need to consider 4 - * different command sets. Note, that checking for function0 (bit0) - * tells us if any commands are reachable through this GUID. + * There are 4 "legacy" NVDIMM command sets + * (NVDIMM_FAMILY_{INTEL,MSFT,HPE1,HPE2}) that were created before + * an EFI working group was established to constrain this + * proliferation. The nfit driver probes for the supported command + * set by GUID. Note, if you're a platform developer looking to add + * a new command set to this probe, consider using an existing set, + * or otherwise seek approval to publish the command set at + * http://www.uefi.org/RFIC_LIST. + * + * Note, that checking for function0 (bit0) tells us if any commands + * are reachable through this GUID. */ for (i = 0; i <= NVDIMM_FAMILY_MAX; i++) if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1)) @@ -1871,6 +1879,8 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, dsm_mask &= ~(1 << 8); } else if (nfit_mem->family == NVDIMM_FAMILY_MSFT) { dsm_mask = 0xffffffff; + } else if (nfit_mem->family == NVDIMM_FAMILY_HYPERV) { + dsm_mask = 0x1f; } else { dev_dbg(dev, "unknown dimm command family\n"); nfit_mem->family = -1; @@ -3714,6 +3724,7 @@ static __init int nfit_init(void) guid_parse(UUID_NFIT_DIMM_N_HPE1, &nfit_uuid[NFIT_DEV_DIMM_N_HPE1]); guid_parse(UUID_NFIT_DIMM_N_HPE2, &nfit_uuid[NFIT_DEV_DIMM_N_HPE2]); guid_parse(UUID_NFIT_DIMM_N_MSFT, &nfit_uuid[NFIT_DEV_DIMM_N_MSFT]); + guid_parse(UUID_NFIT_DIMM_N_HYPERV, &nfit_uuid[NFIT_DEV_DIMM_N_HYPERV]); nfit_wq = create_singlethread_workqueue("nfit"); if (!nfit_wq) diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h index 33691aecfcee..4de167b4f76f 100644 --- a/drivers/acpi/nfit/nfit.h +++ b/drivers/acpi/nfit/nfit.h @@ -34,11 +34,14 @@ /* https://msdn.microsoft.com/library/windows/hardware/mt604741 */ #define UUID_NFIT_DIMM_N_MSFT "1ee68b36-d4bd-4a1a-9a16-4f8e53d46e05" +/* http://www.uefi.org/RFIC_LIST (see "Virtual NVDIMM 0x1901") */ +#define UUID_NFIT_DIMM_N_HYPERV "5746c5f2-a9a2-4264-ad0e-e4ddc9e09e80" + #define ACPI_NFIT_MEM_FAILED_MASK (ACPI_NFIT_MEM_SAVE_FAILED \ | ACPI_NFIT_MEM_RESTORE_FAILED | ACPI_NFIT_MEM_FLUSH_FAILED \ | ACPI_NFIT_MEM_NOT_ARMED | ACPI_NFIT_MEM_MAP_FAILED) -#define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_MSFT +#define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_HYPERV #define NVDIMM_STANDARD_CMDMASK \ (1 << ND_CMD_SMART | 1 << ND_CMD_SMART_THRESHOLD | 1 << ND_CMD_DIMM_FLAGS \ @@ -94,6 +97,7 @@ enum nfit_uuids { NFIT_DEV_DIMM_N_HPE1 = NVDIMM_FAMILY_HPE1, NFIT_DEV_DIMM_N_HPE2 = NVDIMM_FAMILY_HPE2, NFIT_DEV_DIMM_N_MSFT = NVDIMM_FAMILY_MSFT, + NFIT_DEV_DIMM_N_HYPERV = NVDIMM_FAMILY_HYPERV, NFIT_SPA_VOLATILE, NFIT_SPA_PM, NFIT_SPA_DCR, diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h index f57c9e434d2d..de5d90212409 100644 --- a/include/uapi/linux/ndctl.h +++ b/include/uapi/linux/ndctl.h @@ -243,6 +243,7 @@ struct nd_cmd_pkg { #define NVDIMM_FAMILY_HPE1 1 #define NVDIMM_FAMILY_HPE2 2 #define NVDIMM_FAMILY_MSFT 3 +#define NVDIMM_FAMILY_HYPERV 4 #define ND_IOCTL_CALL _IOWR(ND_IOCTL, ND_CMD_CALL,\ struct nd_cmd_pkg) -- cgit v1.3-7-g2ca7 From 2d908b38d40921a03225d42fd6e48eb51bffd606 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 23 Jan 2019 11:28:19 +0100 Subject: serial: Add Tegra Combined UART driver The Tegra Combined UART (TCU) is a mailbox-based mechanism that allows multiplexing multiple "virtual UARTs" into a single hardware serial port. The TCU is the primary serial port on Tegra194 devices. Add a TCU driver utilizing the mailbox framework, as the used mailboxes are part of Tegra HSP blocks that are already controlled by the Tegra HSP mailbox driver. Based on work by Mikko Perttunen . Signed-off-by: Thierry Reding Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/Kconfig | 22 +++ drivers/tty/serial/Makefile | 1 + drivers/tty/serial/tegra-tcu.c | 298 +++++++++++++++++++++++++++++++++++++++ include/uapi/linux/serial_core.h | 3 + 4 files changed, 324 insertions(+) create mode 100644 drivers/tty/serial/tegra-tcu.c (limited to 'include/uapi') diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig index 089a6f285d5e..72966bc0ac76 100644 --- a/drivers/tty/serial/Kconfig +++ b/drivers/tty/serial/Kconfig @@ -335,6 +335,28 @@ config SERIAL_TEGRA are enabled). This driver uses the APB DMA to achieve higher baudrate and better performance. +config SERIAL_TEGRA_TCU + tristate "NVIDIA Tegra Combined UART" + depends on ARCH_TEGRA && TEGRA_HSP_MBOX + select SERIAL_CORE + help + Support for the mailbox-based TCU (Tegra Combined UART) serial port. + TCU is a virtual serial port that allows multiplexing multiple data + streams into a single hardware serial port. + +config SERIAL_TEGRA_TCU_CONSOLE + bool "Support for console on a Tegra TCU serial port" + depends on SERIAL_TEGRA_TCU=y + select SERIAL_CORE_CONSOLE + default y + ---help--- + If you say Y here, it will be possible to use a the Tegra TCU as the + system console (the system console is the device which receives all + kernel messages and warnings and which allows logins in single user + mode). + + If unsure, say Y. + config SERIAL_MAX3100 tristate "MAX3100 support" depends on SPI diff --git a/drivers/tty/serial/Makefile b/drivers/tty/serial/Makefile index 1511e8a9f856..40b702aaa85e 100644 --- a/drivers/tty/serial/Makefile +++ b/drivers/tty/serial/Makefile @@ -77,6 +77,7 @@ obj-$(CONFIG_SERIAL_LANTIQ) += lantiq.o obj-$(CONFIG_SERIAL_XILINX_PS_UART) += xilinx_uartps.o obj-$(CONFIG_SERIAL_SIRFSOC) += sirfsoc_uart.o obj-$(CONFIG_SERIAL_TEGRA) += serial-tegra.o +obj-$(CONFIG_SERIAL_TEGRA_TCU) += tegra-tcu.o obj-$(CONFIG_SERIAL_AR933X) += ar933x_uart.o obj-$(CONFIG_SERIAL_EFM32_UART) += efm32-uart.o obj-$(CONFIG_SERIAL_ARC) += arc_uart.o diff --git a/drivers/tty/serial/tegra-tcu.c b/drivers/tty/serial/tegra-tcu.c new file mode 100644 index 000000000000..aaf8748a6147 --- /dev/null +++ b/drivers/tty/serial/tegra-tcu.c @@ -0,0 +1,298 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define TCU_MBOX_BYTE(i, x) ((x) << (i * 8)) +#define TCU_MBOX_BYTE_V(x, i) (((x) >> (i * 8)) & 0xff) +#define TCU_MBOX_NUM_BYTES(x) ((x) << 24) +#define TCU_MBOX_NUM_BYTES_V(x) (((x) >> 24) & 0x3) + +struct tegra_tcu { + struct uart_driver driver; +#if IS_ENABLED(CONFIG_SERIAL_TEGRA_TCU_CONSOLE) + struct console console; +#endif + struct uart_port port; + + struct mbox_client tx_client, rx_client; + struct mbox_chan *tx, *rx; +}; + +static unsigned int tegra_tcu_uart_tx_empty(struct uart_port *port) +{ + return TIOCSER_TEMT; +} + +static void tegra_tcu_uart_set_mctrl(struct uart_port *port, unsigned int mctrl) +{ +} + +static unsigned int tegra_tcu_uart_get_mctrl(struct uart_port *port) +{ + return 0; +} + +static void tegra_tcu_uart_stop_tx(struct uart_port *port) +{ +} + +static void tegra_tcu_write_one(struct tegra_tcu *tcu, u32 value, + unsigned int count) +{ + void *msg; + + value |= TCU_MBOX_NUM_BYTES(count); + msg = (void *)(unsigned long)value; + mbox_send_message(tcu->tx, msg); + mbox_flush(tcu->tx, 1000); +} + +static void tegra_tcu_write(struct tegra_tcu *tcu, const char *s, + unsigned int count) +{ + unsigned int written = 0, i = 0; + bool insert_nl = false; + u32 value = 0; + + while (i < count) { + if (insert_nl) { + value |= TCU_MBOX_BYTE(written++, '\n'); + insert_nl = false; + i++; + } else if (s[i] == '\n') { + value |= TCU_MBOX_BYTE(written++, '\r'); + insert_nl = true; + } else { + value |= TCU_MBOX_BYTE(written++, s[i++]); + } + + if (written == 3) { + tegra_tcu_write_one(tcu, value, 3); + value = written = 0; + } + } + + if (written) + tegra_tcu_write_one(tcu, value, written); +} + +static void tegra_tcu_uart_start_tx(struct uart_port *port) +{ + struct tegra_tcu *tcu = port->private_data; + struct circ_buf *xmit = &port->state->xmit; + unsigned long count; + + for (;;) { + count = CIRC_CNT_TO_END(xmit->head, xmit->tail, UART_XMIT_SIZE); + if (!count) + break; + + tegra_tcu_write(tcu, &xmit->buf[xmit->tail], count); + xmit->tail = (xmit->tail + count) & (UART_XMIT_SIZE - 1); + } + + uart_write_wakeup(port); +} + +static void tegra_tcu_uart_stop_rx(struct uart_port *port) +{ +} + +static void tegra_tcu_uart_break_ctl(struct uart_port *port, int ctl) +{ +} + +static int tegra_tcu_uart_startup(struct uart_port *port) +{ + return 0; +} + +static void tegra_tcu_uart_shutdown(struct uart_port *port) +{ +} + +static void tegra_tcu_uart_set_termios(struct uart_port *port, + struct ktermios *new, + struct ktermios *old) +{ +} + +static const struct uart_ops tegra_tcu_uart_ops = { + .tx_empty = tegra_tcu_uart_tx_empty, + .set_mctrl = tegra_tcu_uart_set_mctrl, + .get_mctrl = tegra_tcu_uart_get_mctrl, + .stop_tx = tegra_tcu_uart_stop_tx, + .start_tx = tegra_tcu_uart_start_tx, + .stop_rx = tegra_tcu_uart_stop_rx, + .break_ctl = tegra_tcu_uart_break_ctl, + .startup = tegra_tcu_uart_startup, + .shutdown = tegra_tcu_uart_shutdown, + .set_termios = tegra_tcu_uart_set_termios, +}; + +#if IS_ENABLED(CONFIG_SERIAL_TEGRA_TCU_CONSOLE) +static void tegra_tcu_console_write(struct console *cons, const char *s, + unsigned int count) +{ + struct tegra_tcu *tcu = container_of(cons, struct tegra_tcu, console); + + tegra_tcu_write(tcu, s, count); +} + +static int tegra_tcu_console_setup(struct console *cons, char *options) +{ + return 0; +} +#endif + +static void tegra_tcu_receive(struct mbox_client *cl, void *msg) +{ + struct tegra_tcu *tcu = container_of(cl, struct tegra_tcu, rx_client); + struct tty_port *port = &tcu->port.state->port; + u32 value = (u32)(unsigned long)msg; + unsigned int num_bytes, i; + + num_bytes = TCU_MBOX_NUM_BYTES_V(value); + + for (i = 0; i < num_bytes; i++) + tty_insert_flip_char(port, TCU_MBOX_BYTE_V(value, i), + TTY_NORMAL); + + tty_flip_buffer_push(port); +} + +static int tegra_tcu_probe(struct platform_device *pdev) +{ + struct uart_port *port; + struct tegra_tcu *tcu; + int err; + + tcu = devm_kzalloc(&pdev->dev, sizeof(*tcu), GFP_KERNEL); + if (!tcu) + return -ENOMEM; + + tcu->tx_client.dev = &pdev->dev; + tcu->rx_client.dev = &pdev->dev; + tcu->rx_client.rx_callback = tegra_tcu_receive; + + tcu->tx = mbox_request_channel_byname(&tcu->tx_client, "tx"); + if (IS_ERR(tcu->tx)) { + err = PTR_ERR(tcu->tx); + dev_err(&pdev->dev, "failed to get tx mailbox: %d\n", err); + return err; + } + + tcu->rx = mbox_request_channel_byname(&tcu->rx_client, "rx"); + if (IS_ERR(tcu->rx)) { + err = PTR_ERR(tcu->rx); + dev_err(&pdev->dev, "failed to get rx mailbox: %d\n", err); + goto free_tx; + } + +#if IS_ENABLED(CONFIG_SERIAL_TEGRA_TCU_CONSOLE) + /* setup the console */ + strcpy(tcu->console.name, "ttyTCU"); + tcu->console.device = uart_console_device; + tcu->console.flags = CON_PRINTBUFFER | CON_ANYTIME; + tcu->console.index = -1; + tcu->console.write = tegra_tcu_console_write; + tcu->console.setup = tegra_tcu_console_setup; + tcu->console.data = &tcu->driver; +#endif + + /* setup the driver */ + tcu->driver.owner = THIS_MODULE; + tcu->driver.driver_name = "tegra-tcu"; + tcu->driver.dev_name = "ttyTCU"; +#if IS_ENABLED(CONFIG_SERIAL_TEGRA_TCU_CONSOLE) + tcu->driver.cons = &tcu->console; +#endif + tcu->driver.nr = 1; + + err = uart_register_driver(&tcu->driver); + if (err) { + dev_err(&pdev->dev, "failed to register UART driver: %d\n", + err); + goto free_rx; + } + + /* setup the port */ + port = &tcu->port; + spin_lock_init(&port->lock); + port->dev = &pdev->dev; + port->type = PORT_TEGRA_TCU; + port->ops = &tegra_tcu_uart_ops; + port->fifosize = 1; + port->iotype = UPIO_MEM; + port->flags = UPF_BOOT_AUTOCONF; + port->private_data = tcu; + + err = uart_add_one_port(&tcu->driver, port); + if (err) { + dev_err(&pdev->dev, "failed to add UART port: %d\n", err); + goto unregister_uart; + } + + platform_set_drvdata(pdev, tcu); +#if IS_ENABLED(CONFIG_SERIAL_TEGRA_TCU_CONSOLE) + register_console(&tcu->console); +#endif + + return 0; + +unregister_uart: + uart_unregister_driver(&tcu->driver); +free_rx: + mbox_free_channel(tcu->rx); +free_tx: + mbox_free_channel(tcu->tx); + + return err; +} + +static int tegra_tcu_remove(struct platform_device *pdev) +{ + struct tegra_tcu *tcu = platform_get_drvdata(pdev); + +#if IS_ENABLED(CONFIG_SERIAL_TEGRA_TCU_CONSOLE) + unregister_console(&tcu->console); +#endif + uart_remove_one_port(&tcu->driver, &tcu->port); + uart_unregister_driver(&tcu->driver); + mbox_free_channel(tcu->rx); + mbox_free_channel(tcu->tx); + + return 0; +} + +static const struct of_device_id tegra_tcu_match[] = { + { .compatible = "nvidia,tegra194-tcu" }, + { } +}; + +static struct platform_driver tegra_tcu_driver = { + .driver = { + .name = "tegra-tcu", + .of_match_table = tegra_tcu_match, + }, + .probe = tegra_tcu_probe, + .remove = tegra_tcu_remove, +}; +module_platform_driver(tegra_tcu_driver); + +MODULE_AUTHOR("Mikko Perttunen "); +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("NVIDIA Tegra Combined UART driver"); diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h index df4a7534e239..6009ee2c2e99 100644 --- a/include/uapi/linux/serial_core.h +++ b/include/uapi/linux/serial_core.h @@ -79,6 +79,9 @@ /* Nuvoton UART */ #define PORT_NPCM 40 +/* NVIDIA Tegra Combined UART */ +#define PORT_TEGRA_TCU 41 + /* Intel EG20 */ #define PORT_PCH_8LINE 44 #define PORT_PCH_2LINE 45 -- cgit v1.3-7-g2ca7 From d0a060be573bfbf8753a15dca35497db5e968bb0 Mon Sep 17 00:00:00 2001 From: Kristina Martsenko Date: Wed, 30 Jan 2019 12:02:44 +0000 Subject: arm64: add ptrace regsets for ptrauth key management Add two new ptrace regsets, which can be used to request and change the pointer authentication keys of a thread. NT_ARM_PACA_KEYS gives access to the instruction/data address keys, and NT_ARM_PACG_KEYS to the generic authentication key. The keys are also part of the core dump file of the process. The regsets are only exposed if the kernel is compiled with CONFIG_CHECKPOINT_RESTORE=y, as the only intended use case is checkpointing and restoring processes that are using pointer authentication. (This can be changed later if there are other use cases.) Reviewed-by: Dave Martin Signed-off-by: Kristina Martsenko Signed-off-by: Catalin Marinas --- Documentation/arm64/pointer-authentication.txt | 5 + arch/arm64/include/uapi/asm/ptrace.h | 13 +++ arch/arm64/kernel/ptrace.c | 147 +++++++++++++++++++++++++ include/uapi/linux/elf.h | 2 + 4 files changed, 167 insertions(+) (limited to 'include/uapi') diff --git a/Documentation/arm64/pointer-authentication.txt b/Documentation/arm64/pointer-authentication.txt index a25cd21290e9..5baca42ba146 100644 --- a/Documentation/arm64/pointer-authentication.txt +++ b/Documentation/arm64/pointer-authentication.txt @@ -78,6 +78,11 @@ bits can vary between the two. Note that the masks apply to TTBR0 addresses, and are not valid to apply to TTBR1 addresses (e.g. kernel pointers). +Additionally, when CONFIG_CHECKPOINT_RESTORE is also set, the kernel +will expose the NT_ARM_PACA_KEYS and NT_ARM_PACG_KEYS regsets (struct +user_pac_address_keys and struct user_pac_generic_keys). These can be +used to get and set the keys for a thread. + Virtualization -------------- diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h index 28d77c9ed531..d78623acb649 100644 --- a/arch/arm64/include/uapi/asm/ptrace.h +++ b/arch/arm64/include/uapi/asm/ptrace.h @@ -233,6 +233,19 @@ struct user_pac_mask { __u64 insn_mask; }; +/* pointer authentication keys (NT_ARM_PACA_KEYS, NT_ARM_PACG_KEYS) */ + +struct user_pac_address_keys { + __uint128_t apiakey; + __uint128_t apibkey; + __uint128_t apdakey; + __uint128_t apdbkey; +}; + +struct user_pac_generic_keys { + __uint128_t apgakey; +}; + #endif /* __ASSEMBLY__ */ #endif /* _UAPI__ASM_PTRACE_H */ diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 9dce33b0e260..a86413be5a2d 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -979,6 +979,131 @@ static int pac_mask_get(struct task_struct *target, return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &uregs, 0, -1); } + +#ifdef CONFIG_CHECKPOINT_RESTORE +static __uint128_t pac_key_to_user(const struct ptrauth_key *key) +{ + return (__uint128_t)key->hi << 64 | key->lo; +} + +static struct ptrauth_key pac_key_from_user(__uint128_t ukey) +{ + struct ptrauth_key key = { + .lo = (unsigned long)ukey, + .hi = (unsigned long)(ukey >> 64), + }; + + return key; +} + +static void pac_address_keys_to_user(struct user_pac_address_keys *ukeys, + const struct ptrauth_keys *keys) +{ + ukeys->apiakey = pac_key_to_user(&keys->apia); + ukeys->apibkey = pac_key_to_user(&keys->apib); + ukeys->apdakey = pac_key_to_user(&keys->apda); + ukeys->apdbkey = pac_key_to_user(&keys->apdb); +} + +static void pac_address_keys_from_user(struct ptrauth_keys *keys, + const struct user_pac_address_keys *ukeys) +{ + keys->apia = pac_key_from_user(ukeys->apiakey); + keys->apib = pac_key_from_user(ukeys->apibkey); + keys->apda = pac_key_from_user(ukeys->apdakey); + keys->apdb = pac_key_from_user(ukeys->apdbkey); +} + +static int pac_address_keys_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + struct ptrauth_keys *keys = &target->thread.keys_user; + struct user_pac_address_keys user_keys; + + if (!system_supports_address_auth()) + return -EINVAL; + + pac_address_keys_to_user(&user_keys, keys); + + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &user_keys, 0, -1); +} + +static int pac_address_keys_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct ptrauth_keys *keys = &target->thread.keys_user; + struct user_pac_address_keys user_keys; + int ret; + + if (!system_supports_address_auth()) + return -EINVAL; + + pac_address_keys_to_user(&user_keys, keys); + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &user_keys, 0, -1); + if (ret) + return ret; + pac_address_keys_from_user(keys, &user_keys); + + return 0; +} + +static void pac_generic_keys_to_user(struct user_pac_generic_keys *ukeys, + const struct ptrauth_keys *keys) +{ + ukeys->apgakey = pac_key_to_user(&keys->apga); +} + +static void pac_generic_keys_from_user(struct ptrauth_keys *keys, + const struct user_pac_generic_keys *ukeys) +{ + keys->apga = pac_key_from_user(ukeys->apgakey); +} + +static int pac_generic_keys_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + struct ptrauth_keys *keys = &target->thread.keys_user; + struct user_pac_generic_keys user_keys; + + if (!system_supports_generic_auth()) + return -EINVAL; + + pac_generic_keys_to_user(&user_keys, keys); + + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &user_keys, 0, -1); +} + +static int pac_generic_keys_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct ptrauth_keys *keys = &target->thread.keys_user; + struct user_pac_generic_keys user_keys; + int ret; + + if (!system_supports_generic_auth()) + return -EINVAL; + + pac_generic_keys_to_user(&user_keys, keys); + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &user_keys, 0, -1); + if (ret) + return ret; + pac_generic_keys_from_user(keys, &user_keys); + + return 0; +} +#endif /* CONFIG_CHECKPOINT_RESTORE */ #endif /* CONFIG_ARM64_PTR_AUTH */ enum aarch64_regset { @@ -995,6 +1120,10 @@ enum aarch64_regset { #endif #ifdef CONFIG_ARM64_PTR_AUTH REGSET_PAC_MASK, +#ifdef CONFIG_CHECKPOINT_RESTORE + REGSET_PACA_KEYS, + REGSET_PACG_KEYS, +#endif #endif }; @@ -1074,6 +1203,24 @@ static const struct user_regset aarch64_regsets[] = { .get = pac_mask_get, /* this cannot be set dynamically */ }, +#ifdef CONFIG_CHECKPOINT_RESTORE + [REGSET_PACA_KEYS] = { + .core_note_type = NT_ARM_PACA_KEYS, + .n = sizeof(struct user_pac_address_keys) / sizeof(__uint128_t), + .size = sizeof(__uint128_t), + .align = sizeof(__uint128_t), + .get = pac_address_keys_get, + .set = pac_address_keys_set, + }, + [REGSET_PACG_KEYS] = { + .core_note_type = NT_ARM_PACG_KEYS, + .n = sizeof(struct user_pac_generic_keys) / sizeof(__uint128_t), + .size = sizeof(__uint128_t), + .align = sizeof(__uint128_t), + .get = pac_generic_keys_get, + .set = pac_generic_keys_set, + }, +#endif #endif }; diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index e4d6ddd93567..34c02e4290fe 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -421,6 +421,8 @@ typedef struct elf64_shdr { #define NT_ARM_SYSTEM_CALL 0x404 /* ARM system call number */ #define NT_ARM_SVE 0x405 /* ARM Scalable Vector Extension registers */ #define NT_ARM_PAC_MASK 0x406 /* ARM pointer authentication code masks */ +#define NT_ARM_PACA_KEYS 0x407 /* ARM pointer authentication address keys */ +#define NT_ARM_PACG_KEYS 0x408 /* ARM pointer authentication generic key */ #define NT_ARC_V2 0x600 /* ARCv2 accumulator/extra registers */ #define NT_VMCOREDD 0x700 /* Vmcore Device Dump Note */ #define NT_MIPS_DSP 0x800 /* MIPS DSP ASE registers */ -- cgit v1.3-7-g2ca7 From 52a72e2a395fa3c5ab5df41058a8511e87215730 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Tue, 22 Jan 2019 08:48:42 +0200 Subject: IB/uverbs: Expose XRC ODP device capabilities Expose XRC ODP capabilities as part of the extended device capabilities. Signed-off-by: Moni Shoua Reviewed-by: Majd Dibbiny Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_cmd.c | 1 + include/rdma/ib_verbs.h | 1 + include/uapi/rdma/ib_user_verbs.h | 2 ++ 3 files changed, 4 insertions(+) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index d4f1a2ef5015..68c4ea514faf 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -3613,6 +3613,7 @@ static int ib_uverbs_ex_query_device(struct uverbs_attr_bundle *attrs) attr.odp_caps.per_transport_caps.uc_odp_caps; resp.odp_caps.per_transport_caps.ud_odp_caps = attr.odp_caps.per_transport_caps.ud_odp_caps; + resp.xrc_odp_caps = attr.odp_caps.per_transport_caps.xrc_odp_caps; resp.timestamp_mask = attr.timestamp_mask; resp.hca_core_clock = attr.hca_core_clock; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 5eefdea62831..8219c07340a9 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -277,6 +277,7 @@ struct ib_odp_caps { uint32_t rc_odp_caps; uint32_t uc_odp_caps; uint32_t ud_odp_caps; + uint32_t xrc_odp_caps; } per_transport_caps; }; diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 480d9a60b68e..0474c7400268 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -270,6 +270,8 @@ struct ib_uverbs_ex_query_device_resp { struct ib_uverbs_tm_caps tm_caps; struct ib_uverbs_cq_moderation_caps cq_moderation_caps; __aligned_u64 max_dm_size; + __u32 xrc_odp_caps; + __u32 reserved; }; struct ib_uverbs_query_port { -- cgit v1.3-7-g2ca7 From 668aa15b5bf87f156ec805cb7348c785c56b82ab Mon Sep 17 00:00:00 2001 From: Kamal Heib Date: Tue, 29 Jan 2019 12:08:50 +0200 Subject: RDMA/rxe: Improve loopback marking Currently a packet is marked for loopback only if the source and destination addresses equals. This is not enough when multiple gids are present in rxe device's gid table and the traffic is from one gid to another. Fix it by marking the packet for loopback if the destination MAC address is equal to the source MAC address. Signed-off-by: Kamal Heib Reviewed-by: Yuval Shaia Tested-by: Yuval Shaia Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_av.c | 1 + drivers/infiniband/sw/rxe/rxe_net.c | 9 +++------ include/uapi/rdma/rdma_user_rxe.h | 3 +-- 3 files changed, 5 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/sw/rxe/rxe_av.c b/drivers/infiniband/sw/rxe/rxe_av.c index 27a7dec18874..81ee756c19b8 100644 --- a/drivers/infiniband/sw/rxe/rxe_av.c +++ b/drivers/infiniband/sw/rxe/rxe_av.c @@ -38,6 +38,7 @@ void rxe_init_av(struct rdma_ah_attr *attr, struct rxe_av *av) { rxe_av_from_attr(rdma_ah_get_port_num(attr), av, attr); rxe_av_fill_ip_info(av, attr); + memcpy(av->dmac, attr->roce.dmac, ETH_ALEN); } int rxe_av_chk_attr(struct rxe_dev *rxe, struct rdma_ah_attr *attr) diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index 19f3c69916b1..3b162e92e8e8 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -384,9 +384,6 @@ static int prepare4(struct rxe_pkt_info *pkt, struct sk_buff *skb, return -EHOSTUNREACH; } - if (!memcmp(saddr, daddr, sizeof(*daddr))) - pkt->mask |= RXE_LOOPBACK_MASK; - prepare_udp_hdr(skb, cpu_to_be16(qp->src_port), cpu_to_be16(ROCE_V2_UDP_DPORT)); @@ -411,9 +408,6 @@ static int prepare6(struct rxe_pkt_info *pkt, struct sk_buff *skb, return -EHOSTUNREACH; } - if (!memcmp(saddr, daddr, sizeof(*daddr))) - pkt->mask |= RXE_LOOPBACK_MASK; - prepare_udp_hdr(skb, cpu_to_be16(qp->src_port), cpu_to_be16(ROCE_V2_UDP_DPORT)); @@ -437,6 +431,9 @@ int rxe_prepare(struct rxe_pkt_info *pkt, struct sk_buff *skb, u32 *crc) *crc = rxe_icrc_hdr(pkt, skb); + if (ether_addr_equal(skb->dev->dev_addr, av->dmac)) + pkt->mask |= RXE_LOOPBACK_MASK; + return err; } diff --git a/include/uapi/rdma/rdma_user_rxe.h b/include/uapi/rdma/rdma_user_rxe.h index 44ef6a3b7afc..aae2e696bb38 100644 --- a/include/uapi/rdma/rdma_user_rxe.h +++ b/include/uapi/rdma/rdma_user_rxe.h @@ -58,8 +58,7 @@ struct rxe_global_route { struct rxe_av { __u8 port_num; __u8 network_type; - __u16 reserved1; - __u32 reserved2; + __u8 dmac[6]; struct rxe_global_route grh; union { struct sockaddr_in _sockaddr_in; -- cgit v1.3-7-g2ca7 From f76903d574b26bc596951a5c5e757eb02c67abbd Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Tue, 29 Jan 2019 13:33:11 -0800 Subject: RDMA/IWPM: refactor the IWPM message attribute names In order to add new IWPM_NL attributes, the enums for the IWPM commands attributes are refactored such that a new attribute can be added without breaking ABI version 3. Instead of sharing nl attribute enums for both request and response messages, we create separate enums for each IWPM message request and reply. This allows us to extend any given IWPM message by adding new attributes for just that message. These new enums are created, though, in a way to avoid breaking ABI version 3. Signed-off-by: Steve Wise Reviewed-by: Tatyana Nikolova Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/iwpm_msg.c | 40 ++++++++++++++++++++++---------------- include/uapi/rdma/rdma_netlink.h | 21 +++++++++++++++++--- 2 files changed, 41 insertions(+), 20 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c index 8861c052155a..3a8753595c8f 100644 --- a/drivers/infiniband/core/iwpm_msg.c +++ b/drivers/infiniband/core/iwpm_msg.c @@ -403,10 +403,12 @@ register_pid_response_exit: /* netlink attribute policy for the received response to add mapping request */ static const struct nla_policy resp_add_policy[IWPM_NLA_RMANAGE_MAPPING_MAX] = { - [IWPM_NLA_MANAGE_MAPPING_SEQ] = { .type = NLA_U32 }, - [IWPM_NLA_MANAGE_ADDR] = { .len = sizeof(struct sockaddr_storage) }, - [IWPM_NLA_MANAGE_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) }, - [IWPM_NLA_RMANAGE_MAPPING_ERR] = { .type = NLA_U16 } + [IWPM_NLA_RMANAGE_MAPPING_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_RMANAGE_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RMANAGE_MAPPING_ERR] = { .type = NLA_U16 } }; /* @@ -430,7 +432,7 @@ int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb) atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); - msg_seq = nla_get_u32(nltb[IWPM_NLA_MANAGE_MAPPING_SEQ]); + msg_seq = nla_get_u32(nltb[IWPM_NLA_RMANAGE_MAPPING_SEQ]); nlmsg_request = iwpm_find_nlmsg_request(msg_seq); if (!nlmsg_request) { pr_info("%s: Could not find a matching request (seq = %u)\n", @@ -439,9 +441,9 @@ int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb) } pm_msg = nlmsg_request->req_buffer; local_sockaddr = (struct sockaddr_storage *) - nla_data(nltb[IWPM_NLA_MANAGE_ADDR]); + nla_data(nltb[IWPM_NLA_RMANAGE_ADDR]); mapped_sockaddr = (struct sockaddr_storage *) - nla_data(nltb[IWPM_NLA_MANAGE_MAPPED_LOC_ADDR]); + nla_data(nltb[IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR]); if (iwpm_compare_sockaddr(local_sockaddr, &pm_msg->loc_addr)) { nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; @@ -472,11 +474,15 @@ add_mapping_response_exit: /* netlink attribute policy for the response to add and query mapping request * and response with remote address info */ static const struct nla_policy resp_query_policy[IWPM_NLA_RQUERY_MAPPING_MAX] = { - [IWPM_NLA_QUERY_MAPPING_SEQ] = { .type = NLA_U32 }, - [IWPM_NLA_QUERY_LOCAL_ADDR] = { .len = sizeof(struct sockaddr_storage) }, - [IWPM_NLA_QUERY_REMOTE_ADDR] = { .len = sizeof(struct sockaddr_storage) }, - [IWPM_NLA_RQUERY_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) }, - [IWPM_NLA_RQUERY_MAPPED_REM_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_MAPPING_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_RQUERY_LOCAL_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_REMOTE_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_MAPPED_LOC_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_MAPPED_REM_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, [IWPM_NLA_RQUERY_MAPPING_ERR] = { .type = NLA_U16 } }; @@ -502,7 +508,7 @@ int iwpm_add_and_query_mapping_cb(struct sk_buff *skb, return -EINVAL; atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); - msg_seq = nla_get_u32(nltb[IWPM_NLA_QUERY_MAPPING_SEQ]); + msg_seq = nla_get_u32(nltb[IWPM_NLA_RQUERY_MAPPING_SEQ]); nlmsg_request = iwpm_find_nlmsg_request(msg_seq); if (!nlmsg_request) { pr_info("%s: Could not find a matching request (seq = %u)\n", @@ -511,9 +517,9 @@ int iwpm_add_and_query_mapping_cb(struct sk_buff *skb, } pm_msg = nlmsg_request->req_buffer; local_sockaddr = (struct sockaddr_storage *) - nla_data(nltb[IWPM_NLA_QUERY_LOCAL_ADDR]); + nla_data(nltb[IWPM_NLA_RQUERY_LOCAL_ADDR]); remote_sockaddr = (struct sockaddr_storage *) - nla_data(nltb[IWPM_NLA_QUERY_REMOTE_ADDR]); + nla_data(nltb[IWPM_NLA_RQUERY_REMOTE_ADDR]); mapped_loc_sockaddr = (struct sockaddr_storage *) nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]); mapped_rem_sockaddr = (struct sockaddr_storage *) @@ -588,9 +594,9 @@ int iwpm_remote_info_cb(struct sk_buff *skb, struct netlink_callback *cb) atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); local_sockaddr = (struct sockaddr_storage *) - nla_data(nltb[IWPM_NLA_QUERY_LOCAL_ADDR]); + nla_data(nltb[IWPM_NLA_RQUERY_LOCAL_ADDR]); remote_sockaddr = (struct sockaddr_storage *) - nla_data(nltb[IWPM_NLA_QUERY_REMOTE_ADDR]); + nla_data(nltb[IWPM_NLA_RQUERY_REMOTE_ADDR]); mapped_loc_sockaddr = (struct sockaddr_storage *) nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]); mapped_rem_sockaddr = (struct sockaddr_storage *) diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 2e18b77a817f..42d53e182d5f 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -83,13 +83,20 @@ enum { IWPM_NLA_MANAGE_MAPPING_UNSPEC = 0, IWPM_NLA_MANAGE_MAPPING_SEQ, IWPM_NLA_MANAGE_ADDR, - IWPM_NLA_MANAGE_MAPPED_LOC_ADDR, + IWPM_NLA_MANAGE_MAPPING_MAX +}; + +enum { + IWPM_NLA_RMANAGE_MAPPING_UNSPEC = 0, + IWPM_NLA_RMANAGE_MAPPING_SEQ, + IWPM_NLA_RMANAGE_ADDR, + IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR, + /* The following maintains bisectability of rdma-core */ + IWPM_NLA_MANAGE_MAPPED_LOC_ADDR = IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR, IWPM_NLA_RMANAGE_MAPPING_ERR, IWPM_NLA_RMANAGE_MAPPING_MAX }; -#define IWPM_NLA_MANAGE_MAPPING_MAX 3 -#define IWPM_NLA_QUERY_MAPPING_MAX 4 #define IWPM_NLA_MAPINFO_SEND_MAX 3 enum { @@ -97,6 +104,14 @@ enum { IWPM_NLA_QUERY_MAPPING_SEQ, IWPM_NLA_QUERY_LOCAL_ADDR, IWPM_NLA_QUERY_REMOTE_ADDR, + IWPM_NLA_QUERY_MAPPING_MAX, +}; + +enum { + IWPM_NLA_RQUERY_MAPPING_UNSPEC = 0, + IWPM_NLA_RQUERY_MAPPING_SEQ, + IWPM_NLA_RQUERY_LOCAL_ADDR, + IWPM_NLA_RQUERY_REMOTE_ADDR, IWPM_NLA_RQUERY_MAPPED_LOC_ADDR, IWPM_NLA_RQUERY_MAPPED_REM_ADDR, IWPM_NLA_RQUERY_MAPPING_ERR, -- cgit v1.3-7-g2ca7 From b0bad9ad514fc1dd8890f1749f5d2425a73270e3 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Tue, 29 Jan 2019 13:33:16 -0800 Subject: RDMA/IWPM: Support no port mapping requirements A soft iwarp driver that uses the host TCP stack via a kernel mode socket does not need port mapping. In fact, if the port map daemon, iwpmd, is running, then iwpmd must not try and create/bind a socket to the actual port for a soft iwarp connection, since the driver already has that socket bound. Yet if the soft iwarp driver wants to interoperate with hard iwarp devices that -are- using port mapping, then the soft iwarp driver's mappings still need to be maintained and advertised by the iwpm protocol. This patch enhances the rdma driver<->iwcm interface to allow an iwarp driver to specify that it does not want port mapping. The iwpm kernel<->iwpmd interface is also enhanced to pass up this information on map requests. Care is taken to interoperate with the current iwpmd version (ABI version 3) and only use the new NL attributes if iwpmd supports ABI version 4. The ABI version define has also been created in rdma_netlink.h so both kernel and user code can share it. The iwcm and iwpmd negotiate the ABI version to use with a new HELLO netlink message. Signed-off-by: Steve Wise Reviewed-by: Tatyana Nikolova Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/iwcm.c | 7 +++- drivers/infiniband/core/iwpm_msg.c | 80 +++++++++++++++++++++++++++++++++++-- drivers/infiniband/core/iwpm_util.c | 48 +++++++++++++++++++++- drivers/infiniband/core/iwpm_util.h | 12 ++++++ include/rdma/iw_cm.h | 13 ++++++ include/rdma/iw_portmap.h | 15 ++++++- include/uapi/rdma/rdma_netlink.h | 24 +++++++++++ 7 files changed, 192 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 476abc74178e..350ea2bab78a 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -87,7 +87,8 @@ static struct rdma_nl_cbs iwcm_nl_cb_table[RDMA_NL_IWPM_NUM_OPS] = { [RDMA_NL_IWPM_REMOTE_INFO] = {.dump = iwpm_remote_info_cb}, [RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb}, [RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb}, - [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb} + [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb}, + [RDMA_NL_IWPM_HELLO] = {.dump = iwpm_hello_cb} }; static struct workqueue_struct *iwcm_wq; @@ -525,6 +526,8 @@ static int iw_cm_map(struct iw_cm_id *cm_id, bool active) cm_id->mapped = true; pm_msg.loc_addr = cm_id->local_addr; pm_msg.rem_addr = cm_id->remote_addr; + pm_msg.flags = (cm_id->device->iwcm->driver_flags & IW_F_NO_PORT_MAP) ? + IWPM_FLAGS_NO_PORT_MAP : 0; if (active) status = iwpm_add_and_query_mapping(&pm_msg, RDMA_NL_IWCM); @@ -543,7 +546,7 @@ static int iw_cm_map(struct iw_cm_id *cm_id, bool active) return iwpm_create_mapinfo(&cm_id->local_addr, &cm_id->m_local_addr, - RDMA_NL_IWCM); + RDMA_NL_IWCM, pm_msg.flags); } /* diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c index 3a8753595c8f..2e30e65b0816 100644 --- a/drivers/infiniband/core/iwpm_msg.c +++ b/drivers/infiniband/core/iwpm_msg.c @@ -34,7 +34,7 @@ #include "iwpm_util.h" static const char iwpm_ulib_name[IWPM_ULIBNAME_SIZE] = "iWarpPortMapperUser"; -static int iwpm_ulib_version = 3; +u16 iwpm_ulib_version = IWPM_UABI_VERSION_MIN; static int iwpm_user_pid = IWPM_PID_UNDEFINED; static atomic_t echo_nlmsg_seq; @@ -130,6 +130,7 @@ pid_query_error: * nlmsg attributes: * [IWPM_NLA_MANAGE_MAPPING_SEQ] * [IWPM_NLA_MANAGE_ADDR] + * [IWPM_NLA_MANAGE_FLAGS] */ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) { @@ -173,6 +174,18 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) if (ret) goto add_mapping_error; + /* If flags are required and we're not V4, then return a quiet error */ + if (pm_msg->flags && iwpm_ulib_version == IWPM_UABI_VERSION_MIN) { + ret = -EINVAL; + goto add_mapping_error_nowarn; + } + if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) { + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &pm_msg->flags, + IWPM_NLA_MANAGE_FLAGS); + if (ret) + goto add_mapping_error; + } + nlmsg_end(skb, nlh); nlmsg_request->req_buffer = pm_msg; @@ -187,6 +200,7 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) return ret; add_mapping_error: pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); +add_mapping_error_nowarn: if (skb) dev_kfree_skb(skb); if (nlmsg_request) @@ -201,6 +215,7 @@ add_mapping_error: * [IWPM_NLA_QUERY_MAPPING_SEQ] * [IWPM_NLA_QUERY_LOCAL_ADDR] * [IWPM_NLA_QUERY_REMOTE_ADDR] + * [IWPM_NLA_QUERY_FLAGS] */ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) { @@ -251,6 +266,18 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) if (ret) goto query_mapping_error; + /* If flags are required and we're not V4, then return a quite error */ + if (pm_msg->flags && iwpm_ulib_version == IWPM_UABI_VERSION_MIN) { + ret = -EINVAL; + goto query_mapping_error_nowarn; + } + if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) { + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &pm_msg->flags, + IWPM_NLA_QUERY_FLAGS); + if (ret) + goto query_mapping_error; + } + nlmsg_end(skb, nlh); nlmsg_request->req_buffer = pm_msg; @@ -264,6 +291,7 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) return ret; query_mapping_error: pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); +query_mapping_error_nowarn: if (skb) dev_kfree_skb(skb); if (nlmsg_request) @@ -379,7 +407,7 @@ int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb) /* check device name, ulib name and version */ if (strcmp(pm_msg->dev_name, dev_name) || strcmp(iwpm_ulib_name, iwpm_name) || - iwpm_version != iwpm_ulib_version) { + iwpm_version < IWPM_UABI_VERSION_MIN) { pr_info("%s: Incorrect info (dev = %s name = %s version = %d)\n", __func__, dev_name, iwpm_name, iwpm_version); @@ -387,6 +415,10 @@ int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb) goto register_pid_response_exit; } iwpm_user_pid = cb->nlh->nlmsg_pid; + iwpm_ulib_version = iwpm_version; + if (iwpm_ulib_version < IWPM_UABI_VERSION) + pr_warn_once("%s: Down level iwpmd/pid %u. Continuing...", + __func__, iwpm_user_pid); atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n", __func__, iwpm_user_pid); @@ -661,7 +693,7 @@ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) iwpm_name = (char *)nla_data(nltb[IWPM_NLA_MAPINFO_ULIB_NAME]); iwpm_version = nla_get_u16(nltb[IWPM_NLA_MAPINFO_ULIB_VER]); if (strcmp(iwpm_ulib_name, iwpm_name) || - iwpm_version != iwpm_ulib_version) { + iwpm_version < IWPM_UABI_VERSION_MIN) { pr_info("%s: Invalid port mapper name = %s version = %d\n", __func__, iwpm_name, iwpm_version); return ret; @@ -675,6 +707,11 @@ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) iwpm_set_registration(nl_client, IWPM_REG_INCOMPL); atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); iwpm_user_pid = cb->nlh->nlmsg_pid; + + if (iwpm_ulib_version < IWPM_UABI_VERSION) + pr_warn_once("%s: Down level iwpmd/pid %u. Continuing...", + __func__, iwpm_user_pid); + if (!iwpm_mapinfo_available()) return 0; pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n", @@ -754,3 +791,40 @@ int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb) up(&nlmsg_request->sem); return 0; } + +/* netlink attribute policy for the received hello request */ +static const struct nla_policy hello_policy[IWPM_NLA_HELLO_MAX] = { + [IWPM_NLA_HELLO_ABI_VERSION] = { .type = NLA_U16 } +}; + +/* + * iwpm_hello_cb - Process a port mapper hello request + */ +int iwpm_hello_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlattr *nltb[IWPM_NLA_HELLO_MAX]; + const char *msg_type = "Hello request"; + u8 nl_client; + u16 abi_version; + int ret = -EINVAL; + + if (iwpm_parse_nlmsg(cb, IWPM_NLA_HELLO_MAX, hello_policy, nltb, + msg_type)) { + pr_info("%s: Unable to parse nlmsg\n", __func__); + return ret; + } + abi_version = nla_get_u16(nltb[IWPM_NLA_HELLO_ABI_VERSION]); + nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type); + if (!iwpm_valid_client(nl_client)) { + pr_info("%s: Invalid port mapper client = %d\n", + __func__, nl_client); + return ret; + } + iwpm_set_registration(nl_client, IWPM_REG_INCOMPL); + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + iwpm_ulib_version = min_t(u16, IWPM_UABI_VERSION, abi_version); + pr_debug("Using ABI version %u\n", iwpm_ulib_version); + iwpm_user_pid = cb->nlh->nlmsg_pid; + ret = iwpm_send_hello(nl_client, iwpm_user_pid, iwpm_ulib_version); + return ret; +} diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c index cdb63f3f4de7..363938435476 100644 --- a/drivers/infiniband/core/iwpm_util.c +++ b/drivers/infiniband/core/iwpm_util.c @@ -114,7 +114,7 @@ static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage *, int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr, struct sockaddr_storage *mapped_sockaddr, - u8 nl_client) + u8 nl_client, u32 map_flags) { struct hlist_head *hash_bucket_head = NULL; struct iwpm_mapping_info *map_info; @@ -132,6 +132,7 @@ int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr, memcpy(&map_info->mapped_sockaddr, mapped_sockaddr, sizeof(struct sockaddr_storage)); map_info->nl_client = nl_client; + map_info->map_flags = map_flags; spin_lock_irqsave(&iwpm_mapinfo_lock, flags); if (iwpm_hash_bucket) { @@ -686,6 +687,14 @@ int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid) if (ret) goto send_mapping_info_unlock; + if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) { + ret = ibnl_put_attr(skb, nlh, sizeof(u32), + &map_info->map_flags, + IWPM_NLA_MAPINFO_FLAGS); + if (ret) + goto send_mapping_info_unlock; + } + nlmsg_end(skb, nlh); iwpm_print_sockaddr(&map_info->local_sockaddr, @@ -754,3 +763,40 @@ int iwpm_mapinfo_available(void) spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); return full_bucket; } + +int iwpm_send_hello(u8 nl_client, int iwpm_pid, u16 abi_version) +{ + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + u32 msg_seq; + const char *err_str = ""; + int ret = -EINVAL; + + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_HELLO, &nlh, nl_client); + if (!skb) { + err_str = "Unable to create a nlmsg"; + goto hello_num_error; + } + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + msg_seq = 0; + err_str = "Unable to put attribute of abi_version into nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u16), &abi_version, + IWPM_NLA_HELLO_ABI_VERSION); + if (ret) + goto hello_num_error; + nlmsg_end(skb, nlh); + + ret = rdma_nl_unicast(skb, iwpm_pid); + if (ret) { + skb = NULL; + err_str = "Unable to send a nlmsg"; + goto hello_num_error; + } + pr_debug("%s: Sent hello abi_version = %u\n", __func__, abi_version); + return 0; +hello_num_error: + pr_info("%s: %s\n", __func__, err_str); + if (skb) + dev_kfree_skb(skb); + return ret; +} diff --git a/drivers/infiniband/core/iwpm_util.h b/drivers/infiniband/core/iwpm_util.h index af1fc14a0d3d..7e2bcc72f66c 100644 --- a/drivers/infiniband/core/iwpm_util.h +++ b/drivers/infiniband/core/iwpm_util.h @@ -78,6 +78,7 @@ struct iwpm_mapping_info { struct sockaddr_storage local_sockaddr; struct sockaddr_storage mapped_sockaddr; u8 nl_client; + u32 map_flags; }; struct iwpm_remote_info { @@ -266,4 +267,15 @@ int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max, * @msg: Message to print */ void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg); + +/** + * iwpm_send_hello - Send hello response to iwpmd + * + * @nl_client: The index of the netlink client + * @abi_version: The kernel's abi_version + * + * Returns 0 on success or a negative error code + */ +int iwpm_send_hello(u8 nl_client, int iwpm_pid, u16 abi_version); +extern u16 iwpm_ulib_version; #endif diff --git a/include/rdma/iw_cm.h b/include/rdma/iw_cm.h index 5cd7701db148..48512abd3162 100644 --- a/include/rdma/iw_cm.h +++ b/include/rdma/iw_cm.h @@ -105,6 +105,18 @@ struct iw_cm_conn_param { u32 qpn; }; +enum iw_flags { + + /* + * This flag allows the iwcm and iwpmd to still advertise + * mappings but the real and mapped port numbers are the + * same. Further, iwpmd will not bind any user socket to + * reserve the port. This is required for soft iwarp + * to play in the port mapped iwarp space. + */ + IW_F_NO_PORT_MAP = (1 << 0), +}; + struct iw_cm_verbs { void (*add_ref)(struct ib_qp *qp); @@ -127,6 +139,7 @@ struct iw_cm_verbs { int (*destroy_listen)(struct iw_cm_id *cm_id); char ifname[IFNAMSIZ]; + enum iw_flags driver_flags; }; /** diff --git a/include/rdma/iw_portmap.h b/include/rdma/iw_portmap.h index fda31673a562..84fac196ef80 100644 --- a/include/rdma/iw_portmap.h +++ b/include/rdma/iw_portmap.h @@ -58,6 +58,7 @@ struct iwpm_sa_data { struct sockaddr_storage mapped_loc_addr; struct sockaddr_storage rem_addr; struct sockaddr_storage mapped_rem_addr; + u32 flags; }; /** @@ -205,9 +206,11 @@ int iwpm_get_remote_info(struct sockaddr_storage *mapped_loc_addr, * @local_addr: Local ip/tcp address * @mapped_addr: Mapped local ip/tcp address * @nl_client: The index of the netlink client + * @map_flags: IWPM mapping flags */ int iwpm_create_mapinfo(struct sockaddr_storage *local_addr, - struct sockaddr_storage *mapped_addr, u8 nl_client); + struct sockaddr_storage *mapped_addr, u8 nl_client, + u32 map_flags); /** * iwpm_remove_mapinfo - Remove local and mapped IPv4/IPv6 address @@ -221,4 +224,14 @@ int iwpm_create_mapinfo(struct sockaddr_storage *local_addr, int iwpm_remove_mapinfo(struct sockaddr_storage *local_addr, struct sockaddr_storage *mapped_addr); +/** + * iwpm_hello_cb - Process a hello message from iwpmd + * + * @skb: + * @cb: Contains the received message (payload and netlink header) + * + * Using the received port mapper pid, send the kernel's abi_version + * after adjusting it to support the iwpmd version. + */ +int iwpm_hello_cb(struct sk_buff *skb, struct netlink_callback *cb); #endif /* _IW_PORTMAP_H */ diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 42d53e182d5f..0f5263767fb4 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -35,6 +35,19 @@ enum { RDMA_NL_RDMA_CM_NUM_ATTR, }; +/* The minimum version that the iwpm kernel supports */ +#define IWPM_UABI_VERSION_MIN 3 + +/* The latest version that the iwpm kernel supports */ +#define IWPM_UABI_VERSION 4 + +/* iwarp port mapper message flags */ +enum { + + /* Do not map the port for this IWPM request */ + IWPM_FLAGS_NO_PORT_MAP = (1 << 0), +}; + /* iwarp port mapper op-codes */ enum { RDMA_NL_IWPM_REG_PID = 0, @@ -45,6 +58,7 @@ enum { RDMA_NL_IWPM_HANDLE_ERR, RDMA_NL_IWPM_MAPINFO, RDMA_NL_IWPM_MAPINFO_NUM, + RDMA_NL_IWPM_HELLO, RDMA_NL_IWPM_NUM_OPS }; @@ -83,6 +97,7 @@ enum { IWPM_NLA_MANAGE_MAPPING_UNSPEC = 0, IWPM_NLA_MANAGE_MAPPING_SEQ, IWPM_NLA_MANAGE_ADDR, + IWPM_NLA_MANAGE_FLAGS, IWPM_NLA_MANAGE_MAPPING_MAX }; @@ -98,12 +113,14 @@ enum { }; #define IWPM_NLA_MAPINFO_SEND_MAX 3 +#define IWPM_NLA_REMOVE_MAPPING_MAX 3 enum { IWPM_NLA_QUERY_MAPPING_UNSPEC = 0, IWPM_NLA_QUERY_MAPPING_SEQ, IWPM_NLA_QUERY_LOCAL_ADDR, IWPM_NLA_QUERY_REMOTE_ADDR, + IWPM_NLA_QUERY_FLAGS, IWPM_NLA_QUERY_MAPPING_MAX, }; @@ -129,6 +146,7 @@ enum { IWPM_NLA_MAPINFO_UNSPEC = 0, IWPM_NLA_MAPINFO_LOCAL_ADDR, IWPM_NLA_MAPINFO_MAPPED_ADDR, + IWPM_NLA_MAPINFO_FLAGS, IWPM_NLA_MAPINFO_MAX }; @@ -147,6 +165,12 @@ enum { IWPM_NLA_ERR_MAX }; +enum { + IWPM_NLA_HELLO_UNSPEC = 0, + IWPM_NLA_HELLO_ABI_VERSION, + IWPM_NLA_HELLO_MAX +}; + /* * Local service operations: * RESOLVE - The client requests the local service to resolve a path. -- cgit v1.3-7-g2ca7 From e46c2e99f60043bfdb63f18fe775db1f688906d9 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Tue, 5 Feb 2019 09:50:31 +0000 Subject: drm/i915: Expose RPCS (SSEU) configuration to userspace (Gen11 only) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We want to allow userspace to reconfigure the subslice configuration on a per context basis. This is required for the functional requirement of shutting down non-VME enabled sub-slices on Gen11 parts. To do so, we expose a context parameter to allow adjustment of the RPCS register stored within the context image (and currently not accessible via LRI). If the context is adjusted before first use or whilst idle, the adjustment is for "free"; otherwise if the context is active we queue a request to do so (using the kernel context), following all other activity by that context, which is also marked as barrier for all following submission against the same context. Since the overhead of device re-configuration during context switching can be significant, especially in multi-context workloads, we limit this new uAPI to only support the Gen11 VME use case. In this use case either the device is fully enabled, and exactly one slice and half of the subslices are enabled. Example usage: struct drm_i915_gem_context_param_sseu sseu = { }; struct drm_i915_gem_context_param arg = { .param = I915_CONTEXT_PARAM_SSEU, .ctx_id = gem_context_create(fd), .size = sizeof(sseu), .value = to_user_pointer(&sseu) }; /* Query device defaults. */ gem_context_get_param(fd, &arg); /* Set VME configuration on a 1x6x8 part. */ sseu.slice_mask = 0x1; sseu.subslice_mask = 0xe0; gem_context_set_param(fd, &arg); v2: Fix offset of CTX_R_PWR_CLK_STATE in intel_lr_context_set_sseu() (Lionel) v3: Add ability to program this per engine (Chris) v4: Move most get_sseu() into i915_gem_context.c (Lionel) v5: Validate sseu configuration against the device's capabilities (Lionel) v6: Change context powergating settings through MI_SDM on kernel context (Chris) v7: Synchronize the requests following a powergating setting change using a global dependency (Chris) Iterate timelines through dev_priv.gt.active_rings (Tvrtko) Disable RPCS configuration setting for non capable users (Lionel/Tvrtko) v8: s/union intel_sseu/struct intel_sseu/ (Lionel) s/dev_priv/i915/ (Tvrtko) Change uapi class/instance fields to u16 (Tvrtko) Bump mask fields to 64bits (Lionel) Don't return EPERM when dynamic sseu is disabled (Tvrtko) v9: Import context image into kernel context's ppgtt only when reconfiguring powergated slice/subslices (Chris) Use aliasing ppgtt when needed (Michel) Tvrtko Ursulin: v10: * Update for upstream changes. * Request submit needs a RPM reference. * Reject on !FULL_PPGTT for simplicity. * Pull out get/set param to helpers for readability and less indent. * Use i915_request_await_dma_fence in add_global_barrier to skip waits on the same timeline and avoid GEM_BUG_ON. * No need to explicitly assign a NULL pointer to engine in legacy mode. * No need to move gen8_make_rpcs up. * Factored out global barrier as prep patch. * Allow to only CAP_SYS_ADMIN if !Gen11. v11: * Remove engine vfunc in favour of local helper. (Chris Wilson) * Stop retiring requests before updates since it is not needed (Chris Wilson) * Implement direct CPU update path for idle contexts. (Chris Wilson) * Left side dependency needs only be on the same context timeline. (Chris Wilson) * It is sufficient to order the timeline. (Chris Wilson) * Reject !RCS configuration attempts with -ENODEV for now. v12: * Rebase for make_rpcs. v13: * Centralize SSEU normalization to make_rpcs. * Type width checking (uAPI <-> implementation). * Gen11 restrictions uAPI checks. * Gen11 subslice count differences handling. Chris Wilson: * args->size handling fixes. * Update context image from GGTT. * Postpone context image update to pinning. * Use i915_gem_active_raw instead of last_request_on_engine. v14: * Add activity tracker on intel_context to fix the lifetime issues and simplify the code. (Chris Wilson) v15: * Fix context pin leak if no space in ring by simplifying the context pinning sequence. v16: * Rebase for context get/set param locking changes. * Just -ENODEV on !Gen11. (Joonas) v17: * Fix one Gen11 subslice enablement rule. * Handle error from i915_sw_fence_await_sw_fence_gfp. (Chris Wilson) v18: * Update commit message. (Joonas) * Restrict uAPI to VME use case. (Joonas) v19: * Rebase. v20: * Rebase for ce->active_tracker. v21: * Rebase for IS_GEN changes. v22: * Reserve uAPI for flags straight away. (Chris Wilson) v23: * Rebase for RUNTIME_INFO. v24: * Added some headline docs for the uapi usage. (Joonas/Chris) v25: * Renamed class/instance to engine_class/engine_instance to avoid clash with C++ keyword. (Tony Ye) v26: * Rebased for runtime pm api changes. v27: * Rebased for intel_context_init. * Wrap commit msg to 75. v28: (Chris Wilson) * Use i915_gem_ggtt. * Use i915_request_await_dma_fence to show a better example. v29: * i915_timeline_set_barrier can now fail. (Chris Wilson) v30: * Capture some acks. v31: * Drop the WARN_ON from use controllable paths. (Chris Wilson) * Use overflows_type for all checks. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=100899 Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=107634 Issue: https://github.com/intel/media-driver/issues/267 Signed-off-by: Chris Wilson Signed-off-by: Lionel Landwerlin Cc: Dmitry Rogozhkin Cc: Tvrtko Ursulin Cc: Zhipeng Gong Cc: Joonas Lahtinen Cc: Tony Ye Signed-off-by: Tvrtko Ursulin Reviewed-by: Chris Wilson Reviewed-by: Joonas Lahtinen Acked-by: Timo Aaltonen Acked-by: Takashi Iwai Acked-by: Stéphane Marchesin Link: https://patchwork.freedesktop.org/patch/msgid/20190205095032.22673-4-tvrtko.ursulin@linux.intel.com --- drivers/gpu/drm/i915/i915_gem_context.c | 340 +++++++++++++++++++++++++++++++- drivers/gpu/drm/i915/i915_gem_context.h | 6 + drivers/gpu/drm/i915/intel_lrc.c | 4 +- include/uapi/drm/i915_drm.h | 64 ++++++ 4 files changed, 411 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index d3887c27c3ba..2d3e1ce9cc76 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -89,6 +89,7 @@ #include #include "i915_drv.h" #include "i915_trace.h" +#include "intel_lrc_reg.h" #include "intel_workarounds.h" #define ALL_L3_SLICES(dev) (1 << NUM_L3_SLICES(dev)) - 1 @@ -321,6 +322,15 @@ static u32 default_desc_template(const struct drm_i915_private *i915, return desc; } +static void intel_context_retire(struct i915_gem_active *active, + struct i915_request *rq) +{ + struct intel_context *ce = + container_of(active, typeof(*ce), active_tracker); + + intel_context_unpin(ce); +} + void intel_context_init(struct intel_context *ce, struct i915_gem_context *ctx, @@ -333,6 +343,8 @@ intel_context_init(struct intel_context *ce, /* Use the whole device by default */ ce->sseu = intel_device_default_sseu(ctx->i915); + + init_request_active(&ce->active_tracker, intel_context_retire); } static struct i915_gem_context * @@ -850,6 +862,56 @@ out: return 0; } +static int get_sseu(struct i915_gem_context *ctx, + struct drm_i915_gem_context_param *args) +{ + struct drm_i915_gem_context_param_sseu user_sseu; + struct intel_engine_cs *engine; + struct intel_context *ce; + int ret; + + if (args->size == 0) + goto out; + else if (args->size < sizeof(user_sseu)) + return -EINVAL; + + if (copy_from_user(&user_sseu, u64_to_user_ptr(args->value), + sizeof(user_sseu))) + return -EFAULT; + + if (user_sseu.flags || user_sseu.rsvd) + return -EINVAL; + + engine = intel_engine_lookup_user(ctx->i915, + user_sseu.engine_class, + user_sseu.engine_instance); + if (!engine) + return -EINVAL; + + /* Only use for mutex here is to serialize get_param and set_param. */ + ret = mutex_lock_interruptible(&ctx->i915->drm.struct_mutex); + if (ret) + return ret; + + ce = to_intel_context(ctx, engine); + + user_sseu.slice_mask = ce->sseu.slice_mask; + user_sseu.subslice_mask = ce->sseu.subslice_mask; + user_sseu.min_eus_per_subslice = ce->sseu.min_eus_per_subslice; + user_sseu.max_eus_per_subslice = ce->sseu.max_eus_per_subslice; + + mutex_unlock(&ctx->i915->drm.struct_mutex); + + if (copy_to_user(u64_to_user_ptr(args->value), &user_sseu, + sizeof(user_sseu))) + return -EFAULT; + +out: + args->size = sizeof(user_sseu); + + return 0; +} + int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { @@ -862,15 +924,17 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, if (!ctx) return -ENOENT; - args->size = 0; switch (args->param) { case I915_CONTEXT_PARAM_BAN_PERIOD: ret = -EINVAL; break; case I915_CONTEXT_PARAM_NO_ZEROMAP: + args->size = 0; args->value = test_bit(UCONTEXT_NO_ZEROMAP, &ctx->user_flags); break; case I915_CONTEXT_PARAM_GTT_SIZE: + args->size = 0; + if (ctx->ppgtt) args->value = ctx->ppgtt->vm.total; else if (to_i915(dev)->mm.aliasing_ppgtt) @@ -879,14 +943,20 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, args->value = to_i915(dev)->ggtt.vm.total; break; case I915_CONTEXT_PARAM_NO_ERROR_CAPTURE: + args->size = 0; args->value = i915_gem_context_no_error_capture(ctx); break; case I915_CONTEXT_PARAM_BANNABLE: + args->size = 0; args->value = i915_gem_context_is_bannable(ctx); break; case I915_CONTEXT_PARAM_PRIORITY: + args->size = 0; args->value = ctx->sched.priority >> I915_USER_PRIORITY_SHIFT; break; + case I915_CONTEXT_PARAM_SSEU: + ret = get_sseu(ctx, args); + break; default: ret = -EINVAL; break; @@ -896,6 +966,270 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, return ret; } +static int gen8_emit_rpcs_config(struct i915_request *rq, + struct intel_context *ce, + struct intel_sseu sseu) +{ + u64 offset; + u32 *cs; + + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + offset = i915_ggtt_offset(ce->state) + + LRC_STATE_PN * PAGE_SIZE + + (CTX_R_PWR_CLK_STATE + 1) * 4; + + *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; + *cs++ = lower_32_bits(offset); + *cs++ = upper_32_bits(offset); + *cs++ = gen8_make_rpcs(rq->i915, &sseu); + + intel_ring_advance(rq, cs); + + return 0; +} + +static int +gen8_modify_rpcs_gpu(struct intel_context *ce, + struct intel_engine_cs *engine, + struct intel_sseu sseu) +{ + struct drm_i915_private *i915 = engine->i915; + struct i915_request *rq, *prev; + intel_wakeref_t wakeref; + int ret; + + GEM_BUG_ON(!ce->pin_count); + + lockdep_assert_held(&i915->drm.struct_mutex); + + /* Submitting requests etc needs the hw awake. */ + wakeref = intel_runtime_pm_get(i915); + + rq = i915_request_alloc(engine, i915->kernel_context); + if (IS_ERR(rq)) { + ret = PTR_ERR(rq); + goto out_put; + } + + /* Queue this switch after all other activity by this context. */ + prev = i915_gem_active_raw(&ce->ring->timeline->last_request, + &i915->drm.struct_mutex); + if (prev && !i915_request_completed(prev)) { + ret = i915_request_await_dma_fence(rq, &prev->fence); + if (ret < 0) + goto out_add; + } + + /* Order all following requests to be after. */ + ret = i915_timeline_set_barrier(ce->ring->timeline, rq); + if (ret) + goto out_add; + + ret = gen8_emit_rpcs_config(rq, ce, sseu); + if (ret) + goto out_add; + + /* + * Guarantee context image and the timeline remains pinned until the + * modifying request is retired by setting the ce activity tracker. + * + * But we only need to take one pin on the account of it. Or in other + * words transfer the pinned ce object to tracked active request. + */ + if (!i915_gem_active_isset(&ce->active_tracker)) + __intel_context_pin(ce); + i915_gem_active_set(&ce->active_tracker, rq); + +out_add: + i915_request_add(rq); +out_put: + intel_runtime_pm_put(i915, wakeref); + + return ret; +} + +static int +i915_gem_context_reconfigure_sseu(struct i915_gem_context *ctx, + struct intel_engine_cs *engine, + struct intel_sseu sseu) +{ + struct intel_context *ce = to_intel_context(ctx, engine); + int ret; + + GEM_BUG_ON(INTEL_GEN(ctx->i915) < 8); + GEM_BUG_ON(engine->id != RCS); + + ret = mutex_lock_interruptible(&ctx->i915->drm.struct_mutex); + if (ret) + return ret; + + /* Nothing to do if unmodified. */ + if (!memcmp(&ce->sseu, &sseu, sizeof(sseu))) + goto out; + + /* + * If context is not idle we have to submit an ordered request to modify + * its context image via the kernel context. Pristine and idle contexts + * will be configured on pinning. + */ + if (ce->pin_count) + ret = gen8_modify_rpcs_gpu(ce, engine, sseu); + + if (!ret) + ce->sseu = sseu; + +out: + mutex_unlock(&ctx->i915->drm.struct_mutex); + + return ret; +} + +static int +user_to_context_sseu(struct drm_i915_private *i915, + const struct drm_i915_gem_context_param_sseu *user, + struct intel_sseu *context) +{ + const struct sseu_dev_info *device = &RUNTIME_INFO(i915)->sseu; + + /* No zeros in any field. */ + if (!user->slice_mask || !user->subslice_mask || + !user->min_eus_per_subslice || !user->max_eus_per_subslice) + return -EINVAL; + + /* Max > min. */ + if (user->max_eus_per_subslice < user->min_eus_per_subslice) + return -EINVAL; + + /* + * Some future proofing on the types since the uAPI is wider than the + * current internal implementation. + */ + if (overflows_type(user->slice_mask, context->slice_mask) || + overflows_type(user->subslice_mask, context->subslice_mask) || + overflows_type(user->min_eus_per_subslice, + context->min_eus_per_subslice) || + overflows_type(user->max_eus_per_subslice, + context->max_eus_per_subslice)) + return -EINVAL; + + /* Check validity against hardware. */ + if (user->slice_mask & ~device->slice_mask) + return -EINVAL; + + if (user->subslice_mask & ~device->subslice_mask[0]) + return -EINVAL; + + if (user->max_eus_per_subslice > device->max_eus_per_subslice) + return -EINVAL; + + context->slice_mask = user->slice_mask; + context->subslice_mask = user->subslice_mask; + context->min_eus_per_subslice = user->min_eus_per_subslice; + context->max_eus_per_subslice = user->max_eus_per_subslice; + + /* Part specific restrictions. */ + if (IS_GEN(i915, 11)) { + unsigned int hw_s = hweight8(device->slice_mask); + unsigned int hw_ss_per_s = hweight8(device->subslice_mask[0]); + unsigned int req_s = hweight8(context->slice_mask); + unsigned int req_ss = hweight8(context->subslice_mask); + + /* + * Only full subslice enablement is possible if more than one + * slice is turned on. + */ + if (req_s > 1 && req_ss != hw_ss_per_s) + return -EINVAL; + + /* + * If more than four (SScount bitfield limit) subslices are + * requested then the number has to be even. + */ + if (req_ss > 4 && (req_ss & 1)) + return -EINVAL; + + /* + * If only one slice is enabled and subslice count is below the + * device full enablement, it must be at most half of the all + * available subslices. + */ + if (req_s == 1 && req_ss < hw_ss_per_s && + req_ss > (hw_ss_per_s / 2)) + return -EINVAL; + + /* ABI restriction - VME use case only. */ + + /* All slices or one slice only. */ + if (req_s != 1 && req_s != hw_s) + return -EINVAL; + + /* + * Half subslices or full enablement only when one slice is + * enabled. + */ + if (req_s == 1 && + (req_ss != hw_ss_per_s && req_ss != (hw_ss_per_s / 2))) + return -EINVAL; + + /* No EU configuration changes. */ + if ((user->min_eus_per_subslice != + device->max_eus_per_subslice) || + (user->max_eus_per_subslice != + device->max_eus_per_subslice)) + return -EINVAL; + } + + return 0; +} + +static int set_sseu(struct i915_gem_context *ctx, + struct drm_i915_gem_context_param *args) +{ + struct drm_i915_private *i915 = ctx->i915; + struct drm_i915_gem_context_param_sseu user_sseu; + struct intel_engine_cs *engine; + struct intel_sseu sseu; + int ret; + + if (args->size < sizeof(user_sseu)) + return -EINVAL; + + if (!IS_GEN(i915, 11)) + return -ENODEV; + + if (copy_from_user(&user_sseu, u64_to_user_ptr(args->value), + sizeof(user_sseu))) + return -EFAULT; + + if (user_sseu.flags || user_sseu.rsvd) + return -EINVAL; + + engine = intel_engine_lookup_user(i915, + user_sseu.engine_class, + user_sseu.engine_instance); + if (!engine) + return -EINVAL; + + /* Only render engine supports RPCS configuration. */ + if (engine->class != RENDER_CLASS) + return -ENODEV; + + ret = user_to_context_sseu(i915, &user_sseu, &sseu); + if (ret) + return ret; + + ret = i915_gem_context_reconfigure_sseu(ctx, engine, sseu); + if (ret) + return ret; + + args->size = sizeof(user_sseu); + + return 0; +} + int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { @@ -958,7 +1292,9 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data, I915_USER_PRIORITY(priority); } break; - + case I915_CONTEXT_PARAM_SSEU: + ret = set_sseu(ctx, args); + break; default: ret = -EINVAL; break; diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h index 919f6f0a0f7a..92ad5272e57f 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.h +++ b/drivers/gpu/drm/i915/i915_gem_context.h @@ -183,6 +183,12 @@ struct i915_gem_context { u64 lrc_desc; int pin_count; + /** + * active_tracker: Active tracker for the external rq activity + * on this intel_context object. + */ + struct i915_gem_active active_tracker; + const struct intel_context_ops *ops; /** sseu: Control eu/slice partitioning */ diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index d99c462e2c09..5e98fd79bd9d 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -2498,7 +2498,9 @@ u32 gen8_make_rpcs(struct drm_i915_private *i915, struct intel_sseu *req_sseu) * subslices are enabled, or a count between one and four on the first * slice. */ - if (IS_GEN(i915, 11) && slices == 1 && subslices >= 4) { + if (IS_GEN(i915, 11) && + slices == 1 && + subslices > min_t(u8, 4, hweight8(sseu->subslice_mask[0]) / 2)) { GEM_BUG_ON(subslices & 1); subslice_pg = false; diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 298b2e197744..397810fa2d33 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -1486,9 +1486,73 @@ struct drm_i915_gem_context_param { #define I915_CONTEXT_MAX_USER_PRIORITY 1023 /* inclusive */ #define I915_CONTEXT_DEFAULT_PRIORITY 0 #define I915_CONTEXT_MIN_USER_PRIORITY -1023 /* inclusive */ + /* + * When using the following param, value should be a pointer to + * drm_i915_gem_context_param_sseu. + */ +#define I915_CONTEXT_PARAM_SSEU 0x7 __u64 value; }; +/** + * Context SSEU programming + * + * It may be necessary for either functional or performance reason to configure + * a context to run with a reduced number of SSEU (where SSEU stands for Slice/ + * Sub-slice/EU). + * + * This is done by configuring SSEU configuration using the below + * @struct drm_i915_gem_context_param_sseu for every supported engine which + * userspace intends to use. + * + * Not all GPUs or engines support this functionality in which case an error + * code -ENODEV will be returned. + * + * Also, flexibility of possible SSEU configuration permutations varies between + * GPU generations and software imposed limitations. Requesting such a + * combination will return an error code of -EINVAL. + * + * NOTE: When perf/OA is active the context's SSEU configuration is ignored in + * favour of a single global setting. + */ +struct drm_i915_gem_context_param_sseu { + /* + * Engine class & instance to be configured or queried. + */ + __u16 engine_class; + __u16 engine_instance; + + /* + * Unused for now. Must be cleared to zero. + */ + __u32 flags; + + /* + * Mask of slices to enable for the context. Valid values are a subset + * of the bitmask value returned for I915_PARAM_SLICE_MASK. + */ + __u64 slice_mask; + + /* + * Mask of subslices to enable for the context. Valid values are a + * subset of the bitmask value return by I915_PARAM_SUBSLICE_MASK. + */ + __u64 subslice_mask; + + /* + * Minimum/Maximum number of EUs to enable per subslice for the + * context. min_eus_per_subslice must be inferior or equal to + * max_eus_per_subslice. + */ + __u16 min_eus_per_subslice; + __u16 max_eus_per_subslice; + + /* + * Unused for now. Must be cleared to zero. + */ + __u32 rsvd; +}; + enum drm_i915_oa_format { I915_OA_FORMAT_A13 = 1, /* HSW only */ I915_OA_FORMAT_A29, /* HSW only */ -- cgit v1.3-7-g2ca7 From a78e8723a50530d15faa25cc0b6f009bcd251c20 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 16 Jan 2019 09:55:41 +0200 Subject: RDMA/cma: Remove CM_ID statistics provided by rdma-cm module Netlink statistics exported by rdma-cm never had any working user space component published to the mailing list or to any open source project. Canvassing various proprietary users, and the original requester, we find that there are no real users of this interface. This patch simply removes all occurrences of RDMA CM netlink in favour of modern nldev implementation, which provides the same information and accompanied by widely used user space component. Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 83 --------------------------------------- drivers/infiniband/core/netlink.c | 4 +- include/uapi/rdma/rdma_netlink.h | 17 +------- 3 files changed, 3 insertions(+), 101 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 81bded0d37d1..e15546ae4d0f 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -4616,85 +4616,6 @@ static void cma_remove_one(struct ib_device *device, void *client_data) kfree(cma_dev); } -static int cma_get_id_stats(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct nlmsghdr *nlh; - struct rdma_cm_id_stats *id_stats; - struct rdma_id_private *id_priv; - struct rdma_cm_id *id = NULL; - struct cma_device *cma_dev; - int i_dev = 0, i_id = 0; - - /* - * We export all of the IDs as a sequence of messages. Each - * ID gets its own netlink message. - */ - mutex_lock(&lock); - - list_for_each_entry(cma_dev, &dev_list, list) { - if (i_dev < cb->args[0]) { - i_dev++; - continue; - } - - i_id = 0; - list_for_each_entry(id_priv, &cma_dev->id_list, list) { - if (i_id < cb->args[1]) { - i_id++; - continue; - } - - id_stats = ibnl_put_msg(skb, &nlh, cb->nlh->nlmsg_seq, - sizeof *id_stats, RDMA_NL_RDMA_CM, - RDMA_NL_RDMA_CM_ID_STATS, - NLM_F_MULTI); - if (!id_stats) - goto out; - - memset(id_stats, 0, sizeof *id_stats); - id = &id_priv->id; - id_stats->node_type = id->route.addr.dev_addr.dev_type; - id_stats->port_num = id->port_num; - id_stats->bound_dev_if = - id->route.addr.dev_addr.bound_dev_if; - - if (ibnl_put_attr(skb, nlh, - rdma_addr_size(cma_src_addr(id_priv)), - cma_src_addr(id_priv), - RDMA_NL_RDMA_CM_ATTR_SRC_ADDR)) - goto out; - if (ibnl_put_attr(skb, nlh, - rdma_addr_size(cma_dst_addr(id_priv)), - cma_dst_addr(id_priv), - RDMA_NL_RDMA_CM_ATTR_DST_ADDR)) - goto out; - - id_stats->pid = task_pid_vnr(id_priv->res.task); - id_stats->port_space = id->ps; - id_stats->cm_state = id_priv->state; - id_stats->qp_num = id_priv->qp_num; - id_stats->qp_type = id->qp_type; - - i_id++; - nlmsg_end(skb, nlh); - } - - cb->args[1] = 0; - i_dev++; - } - -out: - mutex_unlock(&lock); - cb->args[0] = i_dev; - cb->args[1] = i_id; - - return skb->len; -} - -static const struct rdma_nl_cbs cma_cb_table[RDMA_NL_RDMA_CM_NUM_OPS] = { - [RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats}, -}; - static int cma_init_net(struct net *net) { struct cma_pernet *pernet = cma_pernet(net); @@ -4743,7 +4664,6 @@ static int __init cma_init(void) if (ret) goto err; - rdma_nl_register(RDMA_NL_RDMA_CM, cma_cb_table); cma_configfs_init(); return 0; @@ -4759,7 +4679,6 @@ err_wq: static void __exit cma_cleanup(void) { cma_configfs_exit(); - rdma_nl_unregister(RDMA_NL_RDMA_CM); ib_unregister_client(&cma_client); unregister_netdevice_notifier(&cma_nb); ib_sa_unregister_client(&sa_client); @@ -4767,7 +4686,5 @@ static void __exit cma_cleanup(void) destroy_workqueue(cma_wq); } -MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_RDMA_CM, 1); - module_init(cma_init); module_exit(cma_cleanup); diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 724f5a62e82f..eecfc0b377c9 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -56,7 +56,6 @@ EXPORT_SYMBOL(rdma_nl_chk_listeners); static bool is_nl_msg_valid(unsigned int type, unsigned int op) { static const unsigned int max_num_ops[RDMA_NL_NUM_CLIENTS] = { - [RDMA_NL_RDMA_CM] = RDMA_NL_RDMA_CM_NUM_OPS, [RDMA_NL_IWCM] = RDMA_NL_IWPM_NUM_OPS, [RDMA_NL_LS] = RDMA_NL_LS_NUM_OPS, [RDMA_NL_NLDEV] = RDMA_NLDEV_NUM_OPS, @@ -181,8 +180,7 @@ static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, return -EINVAL; } /* FIXME: Convert IWCM to properly handle doit callbacks */ - if ((nlh->nlmsg_flags & NLM_F_DUMP) || index == RDMA_NL_RDMA_CM || - index == RDMA_NL_IWCM) { + if ((nlh->nlmsg_flags & NLM_F_DUMP) || index == RDMA_NL_IWCM) { struct netlink_dump_control c = { .dump = cb_table[op].dump, }; diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 0f5263767fb4..3a9e681e4257 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -5,8 +5,7 @@ #include enum { - RDMA_NL_RDMA_CM = 1, - RDMA_NL_IWCM, + RDMA_NL_IWCM = 2, RDMA_NL_RSVD, RDMA_NL_LS, /* RDMA Local Services */ RDMA_NL_NLDEV, /* RDMA device interface */ @@ -14,8 +13,7 @@ enum { }; enum { - RDMA_NL_GROUP_CM = 1, - RDMA_NL_GROUP_IWPM, + RDMA_NL_GROUP_IWPM = 2, RDMA_NL_GROUP_LS, RDMA_NL_NUM_GROUPS }; @@ -24,17 +22,6 @@ enum { #define RDMA_NL_GET_OP(type) (type & ((1 << 10) - 1)) #define RDMA_NL_GET_TYPE(client, op) ((client << 10) + op) -enum { - RDMA_NL_RDMA_CM_ID_STATS = 0, - RDMA_NL_RDMA_CM_NUM_OPS -}; - -enum { - RDMA_NL_RDMA_CM_ATTR_SRC_ADDR = 1, - RDMA_NL_RDMA_CM_ATTR_DST_ADDR, - RDMA_NL_RDMA_CM_NUM_ATTR, -}; - /* The minimum version that the iwpm kernel supports */ #define IWPM_UABI_VERSION_MIN 3 -- cgit v1.3-7-g2ca7 From 67dd1a36334ffce82bebeb2d633e152aa436d370 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Thu, 31 Jan 2019 15:44:22 -0500 Subject: drm/amdgpu: Add AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New chunk for dependency on start of job's execution instead on the end. This is used for GPU deadlock prevention when userspace uses mid-IB fences to wait for mid-IB work on other rings. v2: Fix typo in AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES v3: Bump KMS version v4: put old fence AFTER acquiring the scheduled fence. Signed-off-by: Andrey Grodzovsky Suggested-by: Christian Koenig Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 13 ++++++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 ++- include/uapi/drm/amdgpu_drm.h | 1 + 3 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 1c49b8266d69..52a5e4fdc95b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -214,6 +214,7 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs case AMDGPU_CHUNK_ID_DEPENDENCIES: case AMDGPU_CHUNK_ID_SYNCOBJ_IN: case AMDGPU_CHUNK_ID_SYNCOBJ_OUT: + case AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES: break; default: @@ -1090,6 +1091,15 @@ static int amdgpu_cs_process_fence_dep(struct amdgpu_cs_parser *p, fence = amdgpu_ctx_get_fence(ctx, entity, deps[i].handle); + + if (chunk->chunk_id == AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES) { + struct drm_sched_fence *s_fence = to_drm_sched_fence(fence); + struct dma_fence *old = fence; + + fence = dma_fence_get(&s_fence->scheduled); + dma_fence_put(old); + } + if (IS_ERR(fence)) { r = PTR_ERR(fence); amdgpu_ctx_put(ctx); @@ -1177,7 +1187,8 @@ static int amdgpu_cs_dependencies(struct amdgpu_device *adev, chunk = &p->chunks[i]; - if (chunk->chunk_id == AMDGPU_CHUNK_ID_DEPENDENCIES) { + if (chunk->chunk_id == AMDGPU_CHUNK_ID_DEPENDENCIES || + chunk->chunk_id == AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES) { r = amdgpu_cs_process_fence_dep(p, chunk); if (r) return r; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index c806f984bcc5..1158a6f4eec6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -71,9 +71,10 @@ * - 3.25.0 - Add support for sensor query info (stable pstate sclk/mclk). * - 3.26.0 - GFX9: Process AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE. * - 3.27.0 - Add new chunk to to AMDGPU_CS to enable BO_LIST creation. + * - 3.28.0 - Add AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES */ #define KMS_DRIVER_MAJOR 3 -#define KMS_DRIVER_MINOR 27 +#define KMS_DRIVER_MINOR 28 #define KMS_DRIVER_PATCHLEVEL 0 int amdgpu_vram_limit = 0; diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index be84e43c1e19..47296f2ec3fa 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -523,6 +523,7 @@ struct drm_amdgpu_gem_va { #define AMDGPU_CHUNK_ID_SYNCOBJ_IN 0x04 #define AMDGPU_CHUNK_ID_SYNCOBJ_OUT 0x05 #define AMDGPU_CHUNK_ID_BO_HANDLES 0x06 +#define AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES 0x07 struct drm_amdgpu_cs_chunk { __u32 chunk_id; -- cgit v1.3-7-g2ca7 From 41cca166cc57e75e94d888595a428d23a3bf4e36 Mon Sep 17 00:00:00 2001 From: Marek Olšák Date: Mon, 21 Jan 2019 17:22:55 -0500 Subject: drm/amdgpu: add a workaround for GDS ordered append hangs with compute queues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I'm not increasing the DRM version because GDS isn't totally without bugs yet. v2: update emit_ib_size Signed-off-by: Marek Olšák Acked-by: Christian König Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h | 2 ++ drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 19 +++++++++++++++- drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 21 +++++++++++++++-- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 40 +++++++++++++++++++++++++++++++-- include/uapi/drm/amdgpu_drm.h | 5 +++++ 6 files changed, 84 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 1158a6f4eec6..2f0ea380c031 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -72,9 +72,10 @@ * - 3.26.0 - GFX9: Process AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE. * - 3.27.0 - Add new chunk to to AMDGPU_CS to enable BO_LIST creation. * - 3.28.0 - Add AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES + * - 3.29.0 - Add AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID */ #define KMS_DRIVER_MAJOR 3 -#define KMS_DRIVER_MINOR 28 +#define KMS_DRIVER_MINOR 29 #define KMS_DRIVER_PATCHLEVEL 0 int amdgpu_vram_limit = 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h index ecbcefe49a98..f89f5734d985 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h @@ -37,6 +37,8 @@ struct amdgpu_gds { struct amdgpu_gds_asic_info mem; struct amdgpu_gds_asic_info gws; struct amdgpu_gds_asic_info oa; + uint32_t gds_compute_max_wave_id; + /* At present, GDS, GWS and OA resources for gfx (graphics) * is always pre-allocated and available for graphics operation. * Such resource is shared between all gfx clients. diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c index 7984292f9282..a59e0fdf5a97 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c @@ -2264,6 +2264,22 @@ static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring, unsigned vmid = AMDGPU_JOB_GET_VMID(job); u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24); + /* Currently, there is a high possibility to get wave ID mismatch + * between ME and GDS, leading to a hw deadlock, because ME generates + * different wave IDs than the GDS expects. This situation happens + * randomly when at least 5 compute pipes use GDS ordered append. + * The wave IDs generated by ME are also wrong after suspend/resume. + * Those are probably bugs somewhere else in the kernel driver. + * + * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and + * GDS to 0 for this ring (me/pipe). + */ + if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) { + amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1)); + amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID - PACKET3_SET_CONFIG_REG_START); + amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id); + } + amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2)); amdgpu_ring_write(ring, #ifdef __BIG_ENDIAN @@ -5000,7 +5016,7 @@ static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute = { 7 + /* gfx_v7_0_ring_emit_pipeline_sync */ CIK_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v7_0_ring_emit_vm_flush */ 7 + 7 + 7, /* gfx_v7_0_ring_emit_fence_compute x3 for user fence, vm fence */ - .emit_ib_size = 4, /* gfx_v7_0_ring_emit_ib_compute */ + .emit_ib_size = 7, /* gfx_v7_0_ring_emit_ib_compute */ .emit_ib = gfx_v7_0_ring_emit_ib_compute, .emit_fence = gfx_v7_0_ring_emit_fence_compute, .emit_pipeline_sync = gfx_v7_0_ring_emit_pipeline_sync, @@ -5057,6 +5073,7 @@ static void gfx_v7_0_set_gds_init(struct amdgpu_device *adev) adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE); adev->gds.gws.total_size = 64; adev->gds.oa.total_size = 16; + adev->gds.gds_compute_max_wave_id = RREG32(mmGDS_COMPUTE_MAX_WAVE_ID); if (adev->gds.mem.total_size == 64 * 1024) { adev->gds.mem.gfx_partition_size = 4096; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c index a26747681ed6..b8e50a34bdb3 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c @@ -6084,6 +6084,22 @@ static void gfx_v8_0_ring_emit_ib_compute(struct amdgpu_ring *ring, unsigned vmid = AMDGPU_JOB_GET_VMID(job); u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24); + /* Currently, there is a high possibility to get wave ID mismatch + * between ME and GDS, leading to a hw deadlock, because ME generates + * different wave IDs than the GDS expects. This situation happens + * randomly when at least 5 compute pipes use GDS ordered append. + * The wave IDs generated by ME are also wrong after suspend/resume. + * Those are probably bugs somewhere else in the kernel driver. + * + * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and + * GDS to 0 for this ring (me/pipe). + */ + if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) { + amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1)); + amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID - PACKET3_SET_CONFIG_REG_START); + amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id); + } + amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2)); amdgpu_ring_write(ring, #ifdef __BIG_ENDIAN @@ -6890,7 +6906,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = { 7 + /* gfx_v8_0_ring_emit_pipeline_sync */ VI_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v8_0_ring_emit_vm_flush */ 7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_compute x3 for user fence, vm fence */ - .emit_ib_size = 4, /* gfx_v8_0_ring_emit_ib_compute */ + .emit_ib_size = 7, /* gfx_v8_0_ring_emit_ib_compute */ .emit_ib = gfx_v8_0_ring_emit_ib_compute, .emit_fence = gfx_v8_0_ring_emit_fence_compute, .emit_pipeline_sync = gfx_v8_0_ring_emit_pipeline_sync, @@ -6920,7 +6936,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_kiq = { 7 + /* gfx_v8_0_ring_emit_pipeline_sync */ 17 + /* gfx_v8_0_ring_emit_vm_flush */ 7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_kiq x3 for user fence, vm fence */ - .emit_ib_size = 4, /* gfx_v8_0_ring_emit_ib_compute */ + .emit_ib_size = 7, /* gfx_v8_0_ring_emit_ib_compute */ .emit_fence = gfx_v8_0_ring_emit_fence_kiq, .test_ring = gfx_v8_0_ring_test_ring, .insert_nop = amdgpu_ring_insert_nop, @@ -6996,6 +7012,7 @@ static void gfx_v8_0_set_gds_init(struct amdgpu_device *adev) adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE); adev->gds.gws.total_size = 64; adev->gds.oa.total_size = 16; + adev->gds.gds_compute_max_wave_id = RREG32(mmGDS_COMPUTE_MAX_WAVE_ID); if (adev->gds.mem.total_size == 64 * 1024) { adev->gds.mem.gfx_partition_size = 4096; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 262ee3cf6f1c..5533f6e4f4a4 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -4010,6 +4010,22 @@ static void gfx_v9_0_ring_emit_ib_compute(struct amdgpu_ring *ring, unsigned vmid = AMDGPU_JOB_GET_VMID(job); u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24); + /* Currently, there is a high possibility to get wave ID mismatch + * between ME and GDS, leading to a hw deadlock, because ME generates + * different wave IDs than the GDS expects. This situation happens + * randomly when at least 5 compute pipes use GDS ordered append. + * The wave IDs generated by ME are also wrong after suspend/resume. + * Those are probably bugs somewhere else in the kernel driver. + * + * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and + * GDS to 0 for this ring (me/pipe). + */ + if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) { + amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1)); + amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID); + amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id); + } + amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2)); BUG_ON(ib->gpu_addr & 0x3); /* Dword align */ amdgpu_ring_write(ring, @@ -4729,7 +4745,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = { SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 + 2 + /* gfx_v9_0_ring_emit_vm_flush */ 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */ - .emit_ib_size = 4, /* gfx_v9_0_ring_emit_ib_compute */ + .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */ .emit_ib = gfx_v9_0_ring_emit_ib_compute, .emit_fence = gfx_v9_0_ring_emit_fence, .emit_pipeline_sync = gfx_v9_0_ring_emit_pipeline_sync, @@ -4764,7 +4780,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = { SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 + 2 + /* gfx_v9_0_ring_emit_vm_flush */ 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence_kiq x3 for user fence, vm fence */ - .emit_ib_size = 4, /* gfx_v9_0_ring_emit_ib_compute */ + .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */ .emit_fence = gfx_v9_0_ring_emit_fence_kiq, .test_ring = gfx_v9_0_ring_test_ring, .insert_nop = amdgpu_ring_insert_nop, @@ -4846,6 +4862,26 @@ static void gfx_v9_0_set_gds_init(struct amdgpu_device *adev) break; } + switch (adev->asic_type) { + case CHIP_VEGA10: + case CHIP_VEGA20: + adev->gds.gds_compute_max_wave_id = 0x7ff; + break; + case CHIP_VEGA12: + adev->gds.gds_compute_max_wave_id = 0x27f; + break; + case CHIP_RAVEN: + if (adev->rev_id >= 0x8) + adev->gds.gds_compute_max_wave_id = 0x77; /* raven2 */ + else + adev->gds.gds_compute_max_wave_id = 0x15f; /* raven1 */ + break; + default: + /* this really depends on the chip */ + adev->gds.gds_compute_max_wave_id = 0x7ff; + break; + } + adev->gds.gws.total_size = 64; adev->gds.oa.total_size = 16; diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index 47296f2ec3fa..2acbc8bc465b 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -566,6 +566,11 @@ union drm_amdgpu_cs { * caches (L2/vL1/sL1/I$). */ #define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3) +/* Set GDS_COMPUTE_MAX_WAVE_ID = DEFAULT before PACKET3_INDIRECT_BUFFER. + * This will reset wave ID counters for the IB. + */ +#define AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID (1 << 4) + struct drm_amdgpu_cs_chunk_ib { __u32 _pad; /** AMDGPU_IB_FLAG_* */ -- cgit v1.3-7-g2ca7 From 2c620ff93d9fbd5d644760d4c21d389078ec1080 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Mon, 2 Jul 2018 22:44:20 -0700 Subject: time: Add struct __kernel_timex struct timex uses struct timeval internally. struct timeval is not y2038 safe. Introduce a new UAPI type struct __kernel_timex that is y2038 safe. struct __kernel_timex uses a timeval type that is similar to struct __kernel_timespec which preserves the same structure size across 32 bit and 64 bit ABIs. struct __kernel_timex also restructures other members of the structure to make the structure the same on 64 bit and 32 bit architectures. Note that struct __kernel_timex is the same as struct timex on a 64 bit architecture. The above solution is similar to other new y2038 syscalls that are being introduced: both 32 bit and 64 bit ABIs have a common entry, and the compat entry supports the old 32 bit syscall interface. Alternatives considered were: 1. Add new time type to struct timex that makes use of padded bits. This time type could be based on the struct __kernel_timespec. modes will use a flag to notify which time structure should be used internally. This needs some application level changes on both 64 bit and 32 bit architectures. Although 64 bit machines could continue to use the older timeval structure without any changes. 2. Add a new u8 type to struct timex that makes use of padded bits. This can be used to save higher order tv_sec bits. modes will use a flag to notify presence of such a type. This will need some application level changes on 32 bit architectures. 3. Add a new compat_timex structure that differs in only the size of the time type; keep rest of struct timex the same. This requires extra syscalls to manage all 3 cases on 64 bit architectures. This will not need any application level changes but will add more complexity from kernel side. Signed-off-by: Deepa Dinamani Signed-off-by: Arnd Bergmann --- include/linux/timex.h | 7 +++++++ include/uapi/linux/timex.h | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) (limited to 'include/uapi') diff --git a/include/linux/timex.h b/include/linux/timex.h index 39c25dbebfe8..7f40e9e42ecc 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -53,6 +53,13 @@ #ifndef _LINUX_TIMEX_H #define _LINUX_TIMEX_H +/* CONFIG_64BIT_TIME enables new 64 bit time_t syscalls in the compat path + * and 32-bit emulation. + */ +#ifndef CONFIG_64BIT_TIME +#define __kernel_timex timex +#endif + #include #define ADJ_ADJTIME 0x8000 /* switch between adjtime/adjtimex modes */ diff --git a/include/uapi/linux/timex.h b/include/uapi/linux/timex.h index 92685d826444..a1c6b73016a5 100644 --- a/include/uapi/linux/timex.h +++ b/include/uapi/linux/timex.h @@ -92,6 +92,47 @@ struct timex { int :32; int :32; int :32; }; +struct __kernel_timex_timeval { + __kernel_time64_t tv_sec; + long long tv_usec; +}; + +#ifndef __kernel_timex +struct __kernel_timex { + unsigned int modes; /* mode selector */ + int :32; /* pad */ + long long offset; /* time offset (usec) */ + long long freq; /* frequency offset (scaled ppm) */ + long long maxerror;/* maximum error (usec) */ + long long esterror;/* estimated error (usec) */ + int status; /* clock command/status */ + int :32; /* pad */ + long long constant;/* pll time constant */ + long long precision;/* clock precision (usec) (read only) */ + long long tolerance;/* clock frequency tolerance (ppm) + * (read only) + */ + struct __kernel_timex_timeval time; /* (read only, except for ADJ_SETOFFSET) */ + long long tick; /* (modified) usecs between clock ticks */ + + long long ppsfreq;/* pps frequency (scaled ppm) (ro) */ + long long jitter; /* pps jitter (us) (ro) */ + int shift; /* interval duration (s) (shift) (ro) */ + int :32; /* pad */ + long long stabil; /* pps stability (scaled ppm) (ro) */ + long long jitcnt; /* jitter limit exceeded (ro) */ + long long calcnt; /* calibration intervals (ro) */ + long long errcnt; /* calibration errors (ro) */ + long long stbcnt; /* stability limit exceeded (ro) */ + + int tai; /* TAI offset (ro) */ + + int :32; int :32; int :32; int :32; + int :32; int :32; int :32; int :32; + int :32; int :32; int :32; +}; +#endif + /* * Mode codes (timex.mode) */ -- cgit v1.3-7-g2ca7 From 8dabe7245bbc134f2cfcc12cde75c019dab924cc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 7 Jan 2019 00:33:08 +0100 Subject: y2038: syscalls: rename y2038 compat syscalls A lot of system calls that pass a time_t somewhere have an implementation using a COMPAT_SYSCALL_DEFINEx() on 64-bit architectures, and have been reworked so that this implementation can now be used on 32-bit architectures as well. The missing step is to redefine them using the regular SYSCALL_DEFINEx() to get them out of the compat namespace and make it possible to build them on 32-bit architectures. Any system call that ends in 'time' gets a '32' suffix on its name for that version, while the others get a '_time32' suffix, to distinguish them from the normal version, which takes a 64-bit time argument in the future. In this step, only 64-bit architectures are changed, doing this rename first lets us avoid touching the 32-bit architectures twice. Acked-by: Catalin Marinas Signed-off-by: Arnd Bergmann --- arch/arm64/include/asm/unistd32.h | 48 ++++++++++---------- arch/mips/kernel/syscalls/syscall_n32.tbl | 50 ++++++++++----------- arch/mips/kernel/syscalls/syscall_o32.tbl | 52 +++++++++++----------- arch/parisc/kernel/syscalls/syscall.tbl | 54 +++++++++++------------ arch/powerpc/kernel/syscalls/syscall.tbl | 52 +++++++++++----------- arch/s390/kernel/syscalls/syscall.tbl | 52 +++++++++++----------- arch/sparc/kernel/syscalls/syscall.tbl | 52 +++++++++++----------- arch/x86/entry/syscalls/syscall_32.tbl | 52 +++++++++++----------- fs/aio.c | 10 ++--- fs/select.c | 4 +- fs/timerfd.c | 4 +- fs/utimes.c | 10 ++--- include/linux/compat.h | 73 ++----------------------------- include/linux/syscalls.h | 57 ++++++++++++++++++++++++ include/uapi/asm-generic/unistd.h | 44 +++++++++---------- ipc/mqueue.c | 16 +++---- ipc/sem.c | 2 +- kernel/futex.c | 2 +- kernel/sched/core.c | 5 +-- kernel/signal.c | 2 +- kernel/sys_ni.c | 18 ++++---- kernel/time/hrtimer.c | 2 +- kernel/time/posix-stubs.c | 25 ++++++----- kernel/time/posix-timers.c | 32 +++++++------- kernel/time/time.c | 8 ++-- net/compat.c | 2 +- 26 files changed, 361 insertions(+), 367 deletions(-) (limited to 'include/uapi') diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index d10cce69a4b0..1ded82857161 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -270,7 +270,7 @@ __SYSCALL(__NR_uname, sys_newuname) /* 123 was sys_modify_ldt */ __SYSCALL(123, sys_ni_syscall) #define __NR_adjtimex 124 -__SYSCALL(__NR_adjtimex, compat_sys_adjtimex) +__SYSCALL(__NR_adjtimex, sys_adjtimex_time32) #define __NR_mprotect 125 __SYSCALL(__NR_mprotect, sys_mprotect) #define __NR_sigprocmask 126 @@ -344,9 +344,9 @@ __SYSCALL(__NR_sched_get_priority_max, sys_sched_get_priority_max) #define __NR_sched_get_priority_min 160 __SYSCALL(__NR_sched_get_priority_min, sys_sched_get_priority_min) #define __NR_sched_rr_get_interval 161 -__SYSCALL(__NR_sched_rr_get_interval, compat_sys_sched_rr_get_interval) +__SYSCALL(__NR_sched_rr_get_interval, sys_sched_rr_get_interval_time32) #define __NR_nanosleep 162 -__SYSCALL(__NR_nanosleep, compat_sys_nanosleep) +__SYSCALL(__NR_nanosleep, sys_nanosleep_time32) #define __NR_mremap 163 __SYSCALL(__NR_mremap, sys_mremap) #define __NR_setresuid 164 @@ -376,7 +376,7 @@ __SYSCALL(__NR_rt_sigprocmask, compat_sys_rt_sigprocmask) #define __NR_rt_sigpending 176 __SYSCALL(__NR_rt_sigpending, compat_sys_rt_sigpending) #define __NR_rt_sigtimedwait 177 -__SYSCALL(__NR_rt_sigtimedwait, compat_sys_rt_sigtimedwait) +__SYSCALL(__NR_rt_sigtimedwait, compat_sys_rt_sigtimedwait_time32) #define __NR_rt_sigqueueinfo 178 __SYSCALL(__NR_rt_sigqueueinfo, compat_sys_rt_sigqueueinfo) #define __NR_rt_sigsuspend 179 @@ -502,7 +502,7 @@ __SYSCALL(__NR_tkill, sys_tkill) #define __NR_sendfile64 239 __SYSCALL(__NR_sendfile64, sys_sendfile64) #define __NR_futex 240 -__SYSCALL(__NR_futex, compat_sys_futex) +__SYSCALL(__NR_futex, sys_futex_time32) #define __NR_sched_setaffinity 241 __SYSCALL(__NR_sched_setaffinity, compat_sys_sched_setaffinity) #define __NR_sched_getaffinity 242 @@ -512,7 +512,7 @@ __SYSCALL(__NR_io_setup, compat_sys_io_setup) #define __NR_io_destroy 244 __SYSCALL(__NR_io_destroy, sys_io_destroy) #define __NR_io_getevents 245 -__SYSCALL(__NR_io_getevents, compat_sys_io_getevents) +__SYSCALL(__NR_io_getevents, sys_io_getevents_time32) #define __NR_io_submit 246 __SYSCALL(__NR_io_submit, compat_sys_io_submit) #define __NR_io_cancel 247 @@ -538,21 +538,21 @@ __SYSCALL(__NR_set_tid_address, sys_set_tid_address) #define __NR_timer_create 257 __SYSCALL(__NR_timer_create, compat_sys_timer_create) #define __NR_timer_settime 258 -__SYSCALL(__NR_timer_settime, compat_sys_timer_settime) +__SYSCALL(__NR_timer_settime, sys_timer_settime32) #define __NR_timer_gettime 259 -__SYSCALL(__NR_timer_gettime, compat_sys_timer_gettime) +__SYSCALL(__NR_timer_gettime, sys_timer_gettime32) #define __NR_timer_getoverrun 260 __SYSCALL(__NR_timer_getoverrun, sys_timer_getoverrun) #define __NR_timer_delete 261 __SYSCALL(__NR_timer_delete, sys_timer_delete) #define __NR_clock_settime 262 -__SYSCALL(__NR_clock_settime, compat_sys_clock_settime) +__SYSCALL(__NR_clock_settime, sys_clock_settime32) #define __NR_clock_gettime 263 -__SYSCALL(__NR_clock_gettime, compat_sys_clock_gettime) +__SYSCALL(__NR_clock_gettime, sys_clock_gettime32) #define __NR_clock_getres 264 -__SYSCALL(__NR_clock_getres, compat_sys_clock_getres) +__SYSCALL(__NR_clock_getres, sys_clock_getres_time32) #define __NR_clock_nanosleep 265 -__SYSCALL(__NR_clock_nanosleep, compat_sys_clock_nanosleep) +__SYSCALL(__NR_clock_nanosleep, sys_clock_nanosleep_time32) #define __NR_statfs64 266 __SYSCALL(__NR_statfs64, compat_sys_aarch32_statfs64) #define __NR_fstatfs64 267 @@ -560,7 +560,7 @@ __SYSCALL(__NR_fstatfs64, compat_sys_aarch32_fstatfs64) #define __NR_tgkill 268 __SYSCALL(__NR_tgkill, sys_tgkill) #define __NR_utimes 269 -__SYSCALL(__NR_utimes, compat_sys_utimes) +__SYSCALL(__NR_utimes, sys_utimes_time32) #define __NR_arm_fadvise64_64 270 __SYSCALL(__NR_arm_fadvise64_64, compat_sys_aarch32_fadvise64_64) #define __NR_pciconfig_iobase 271 @@ -574,9 +574,9 @@ __SYSCALL(__NR_mq_open, compat_sys_mq_open) #define __NR_mq_unlink 275 __SYSCALL(__NR_mq_unlink, sys_mq_unlink) #define __NR_mq_timedsend 276 -__SYSCALL(__NR_mq_timedsend, compat_sys_mq_timedsend) +__SYSCALL(__NR_mq_timedsend, sys_mq_timedsend_time32) #define __NR_mq_timedreceive 277 -__SYSCALL(__NR_mq_timedreceive, compat_sys_mq_timedreceive) +__SYSCALL(__NR_mq_timedreceive, sys_mq_timedreceive_time32) #define __NR_mq_notify 278 __SYSCALL(__NR_mq_notify, compat_sys_mq_notify) #define __NR_mq_getsetattr 279 @@ -646,7 +646,7 @@ __SYSCALL(__NR_request_key, sys_request_key) #define __NR_keyctl 311 __SYSCALL(__NR_keyctl, compat_sys_keyctl) #define __NR_semtimedop 312 -__SYSCALL(__NR_semtimedop, compat_sys_semtimedop) +__SYSCALL(__NR_semtimedop, sys_semtimedop_time32) #define __NR_vserver 313 __SYSCALL(__NR_vserver, sys_ni_syscall) #define __NR_ioprio_set 314 @@ -674,7 +674,7 @@ __SYSCALL(__NR_mknodat, sys_mknodat) #define __NR_fchownat 325 __SYSCALL(__NR_fchownat, sys_fchownat) #define __NR_futimesat 326 -__SYSCALL(__NR_futimesat, compat_sys_futimesat) +__SYSCALL(__NR_futimesat, sys_futimesat_time32) #define __NR_fstatat64 327 __SYSCALL(__NR_fstatat64, sys_fstatat64) #define __NR_unlinkat 328 @@ -692,9 +692,9 @@ __SYSCALL(__NR_fchmodat, sys_fchmodat) #define __NR_faccessat 334 __SYSCALL(__NR_faccessat, sys_faccessat) #define __NR_pselect6 335 -__SYSCALL(__NR_pselect6, compat_sys_pselect6) +__SYSCALL(__NR_pselect6, compat_sys_pselect6_time32) #define __NR_ppoll 336 -__SYSCALL(__NR_ppoll, compat_sys_ppoll) +__SYSCALL(__NR_ppoll, compat_sys_ppoll_time32) #define __NR_unshare 337 __SYSCALL(__NR_unshare, sys_unshare) #define __NR_set_robust_list 338 @@ -718,7 +718,7 @@ __SYSCALL(__NR_epoll_pwait, compat_sys_epoll_pwait) #define __NR_kexec_load 347 __SYSCALL(__NR_kexec_load, compat_sys_kexec_load) #define __NR_utimensat 348 -__SYSCALL(__NR_utimensat, compat_sys_utimensat) +__SYSCALL(__NR_utimensat, sys_utimensat_time32) #define __NR_signalfd 349 __SYSCALL(__NR_signalfd, compat_sys_signalfd) #define __NR_timerfd_create 350 @@ -728,9 +728,9 @@ __SYSCALL(__NR_eventfd, sys_eventfd) #define __NR_fallocate 352 __SYSCALL(__NR_fallocate, compat_sys_aarch32_fallocate) #define __NR_timerfd_settime 353 -__SYSCALL(__NR_timerfd_settime, compat_sys_timerfd_settime) +__SYSCALL(__NR_timerfd_settime, sys_timerfd_settime32) #define __NR_timerfd_gettime 354 -__SYSCALL(__NR_timerfd_gettime, compat_sys_timerfd_gettime) +__SYSCALL(__NR_timerfd_gettime, sys_timerfd_gettime32) #define __NR_signalfd4 355 __SYSCALL(__NR_signalfd4, compat_sys_signalfd4) #define __NR_eventfd2 356 @@ -752,7 +752,7 @@ __SYSCALL(__NR_rt_tgsigqueueinfo, compat_sys_rt_tgsigqueueinfo) #define __NR_perf_event_open 364 __SYSCALL(__NR_perf_event_open, sys_perf_event_open) #define __NR_recvmmsg 365 -__SYSCALL(__NR_recvmmsg, compat_sys_recvmmsg) +__SYSCALL(__NR_recvmmsg, compat_sys_recvmmsg_time32) #define __NR_accept4 366 __SYSCALL(__NR_accept4, sys_accept4) #define __NR_fanotify_init 367 @@ -766,7 +766,7 @@ __SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at) #define __NR_open_by_handle_at 371 __SYSCALL(__NR_open_by_handle_at, compat_sys_open_by_handle_at) #define __NR_clock_adjtime 372 -__SYSCALL(__NR_clock_adjtime, compat_sys_clock_adjtime) +__SYSCALL(__NR_clock_adjtime, sys_clock_adjtime32) #define __NR_syncfs 373 __SYSCALL(__NR_syncfs, sys_syncfs) #define __NR_sendmmsg 374 diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index cc134b1211aa..6d1e019817c8 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -41,7 +41,7 @@ 31 n32 dup sys_dup 32 n32 dup2 sys_dup2 33 n32 pause sys_pause -34 n32 nanosleep compat_sys_nanosleep +34 n32 nanosleep sys_nanosleep_time32 35 n32 getitimer compat_sys_getitimer 36 n32 setitimer compat_sys_setitimer 37 n32 alarm sys_alarm @@ -133,11 +133,11 @@ 123 n32 capget sys_capget 124 n32 capset sys_capset 125 n32 rt_sigpending compat_sys_rt_sigpending -126 n32 rt_sigtimedwait compat_sys_rt_sigtimedwait +126 n32 rt_sigtimedwait compat_sys_rt_sigtimedwait_time32 127 n32 rt_sigqueueinfo compat_sys_rt_sigqueueinfo 128 n32 rt_sigsuspend compat_sys_rt_sigsuspend 129 n32 sigaltstack compat_sys_sigaltstack -130 n32 utime compat_sys_utime +130 n32 utime sys_utime32 131 n32 mknod sys_mknod 132 n32 personality sys_32_personality 133 n32 ustat compat_sys_ustat @@ -152,7 +152,7 @@ 142 n32 sched_getscheduler sys_sched_getscheduler 143 n32 sched_get_priority_max sys_sched_get_priority_max 144 n32 sched_get_priority_min sys_sched_get_priority_min -145 n32 sched_rr_get_interval compat_sys_sched_rr_get_interval +145 n32 sched_rr_get_interval sys_sched_rr_get_interval_time32 146 n32 mlock sys_mlock 147 n32 munlock sys_munlock 148 n32 mlockall sys_mlockall @@ -161,7 +161,7 @@ 151 n32 pivot_root sys_pivot_root 152 n32 _sysctl compat_sys_sysctl 153 n32 prctl sys_prctl -154 n32 adjtimex compat_sys_adjtimex +154 n32 adjtimex sys_adjtimex_time32 155 n32 setrlimit compat_sys_setrlimit 156 n32 chroot sys_chroot 157 n32 sync sys_sync @@ -202,7 +202,7 @@ 191 n32 fremovexattr sys_fremovexattr 192 n32 tkill sys_tkill 193 n32 reserved193 sys_ni_syscall -194 n32 futex compat_sys_futex +194 n32 futex sys_futex_time32 195 n32 sched_setaffinity compat_sys_sched_setaffinity 196 n32 sched_getaffinity compat_sys_sched_getaffinity 197 n32 cacheflush sys_cacheflush @@ -210,7 +210,7 @@ 199 n32 sysmips __sys_sysmips 200 n32 io_setup compat_sys_io_setup 201 n32 io_destroy sys_io_destroy -202 n32 io_getevents compat_sys_io_getevents +202 n32 io_getevents sys_io_getevents_time32 203 n32 io_submit compat_sys_io_submit 204 n32 io_cancel sys_io_cancel 205 n32 exit_group sys_exit_group @@ -223,29 +223,29 @@ 212 n32 fcntl64 compat_sys_fcntl64 213 n32 set_tid_address sys_set_tid_address 214 n32 restart_syscall sys_restart_syscall -215 n32 semtimedop compat_sys_semtimedop +215 n32 semtimedop sys_semtimedop_time32 216 n32 fadvise64 sys_fadvise64_64 217 n32 statfs64 compat_sys_statfs64 218 n32 fstatfs64 compat_sys_fstatfs64 219 n32 sendfile64 sys_sendfile64 220 n32 timer_create compat_sys_timer_create -221 n32 timer_settime compat_sys_timer_settime -222 n32 timer_gettime compat_sys_timer_gettime +221 n32 timer_settime sys_timer_settime32 +222 n32 timer_gettime sys_timer_gettime32 223 n32 timer_getoverrun sys_timer_getoverrun 224 n32 timer_delete sys_timer_delete -225 n32 clock_settime compat_sys_clock_settime -226 n32 clock_gettime compat_sys_clock_gettime -227 n32 clock_getres compat_sys_clock_getres -228 n32 clock_nanosleep compat_sys_clock_nanosleep +225 n32 clock_settime sys_clock_settime32 +226 n32 clock_gettime sys_clock_gettime32 +227 n32 clock_getres sys_clock_getres_time32 +228 n32 clock_nanosleep sys_clock_nanosleep_time32 229 n32 tgkill sys_tgkill -230 n32 utimes compat_sys_utimes +230 n32 utimes sys_utimes_time32 231 n32 mbind compat_sys_mbind 232 n32 get_mempolicy compat_sys_get_mempolicy 233 n32 set_mempolicy compat_sys_set_mempolicy 234 n32 mq_open compat_sys_mq_open 235 n32 mq_unlink sys_mq_unlink -236 n32 mq_timedsend compat_sys_mq_timedsend -237 n32 mq_timedreceive compat_sys_mq_timedreceive +236 n32 mq_timedsend sys_mq_timedsend_time32 +237 n32 mq_timedreceive sys_mq_timedreceive_time32 238 n32 mq_notify compat_sys_mq_notify 239 n32 mq_getsetattr compat_sys_mq_getsetattr 240 n32 vserver sys_ni_syscall @@ -263,7 +263,7 @@ 252 n32 mkdirat sys_mkdirat 253 n32 mknodat sys_mknodat 254 n32 fchownat sys_fchownat -255 n32 futimesat compat_sys_futimesat +255 n32 futimesat sys_futimesat_time32 256 n32 newfstatat sys_newfstatat 257 n32 unlinkat sys_unlinkat 258 n32 renameat sys_renameat @@ -272,8 +272,8 @@ 261 n32 readlinkat sys_readlinkat 262 n32 fchmodat sys_fchmodat 263 n32 faccessat sys_faccessat -264 n32 pselect6 compat_sys_pselect6 -265 n32 ppoll compat_sys_ppoll +264 n32 pselect6 compat_sys_pselect6_time32 +265 n32 ppoll compat_sys_ppoll_time32 266 n32 unshare sys_unshare 267 n32 splice sys_splice 268 n32 sync_file_range sys_sync_file_range @@ -287,14 +287,14 @@ 276 n32 epoll_pwait compat_sys_epoll_pwait 277 n32 ioprio_set sys_ioprio_set 278 n32 ioprio_get sys_ioprio_get -279 n32 utimensat compat_sys_utimensat +279 n32 utimensat sys_utimensat_time32 280 n32 signalfd compat_sys_signalfd 281 n32 timerfd sys_ni_syscall 282 n32 eventfd sys_eventfd 283 n32 fallocate sys_fallocate 284 n32 timerfd_create sys_timerfd_create -285 n32 timerfd_gettime compat_sys_timerfd_gettime -286 n32 timerfd_settime compat_sys_timerfd_settime +285 n32 timerfd_gettime sys_timerfd_gettime32 +286 n32 timerfd_settime sys_timerfd_settime32 287 n32 signalfd4 compat_sys_signalfd4 288 n32 eventfd2 sys_eventfd2 289 n32 epoll_create1 sys_epoll_create1 @@ -306,14 +306,14 @@ 295 n32 rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo 296 n32 perf_event_open sys_perf_event_open 297 n32 accept4 sys_accept4 -298 n32 recvmmsg compat_sys_recvmmsg +298 n32 recvmmsg compat_sys_recvmmsg_time32 299 n32 getdents64 sys_getdents64 300 n32 fanotify_init sys_fanotify_init 301 n32 fanotify_mark sys_fanotify_mark 302 n32 prlimit64 sys_prlimit64 303 n32 name_to_handle_at sys_name_to_handle_at 304 n32 open_by_handle_at sys_open_by_handle_at -305 n32 clock_adjtime compat_sys_clock_adjtime +305 n32 clock_adjtime sys_clock_adjtime32 306 n32 syncfs sys_syncfs 307 n32 sendmmsg compat_sys_sendmmsg 308 n32 setns sys_setns diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index fa47ea8cc6ef..e9fec7bac5a9 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -20,7 +20,7 @@ 10 o32 unlink sys_unlink 11 o32 execve sys_execve compat_sys_execve 12 o32 chdir sys_chdir -13 o32 time sys_time compat_sys_time +13 o32 time sys_time sys_time32 14 o32 mknod sys_mknod 15 o32 chmod sys_chmod 16 o32 lchown sys_lchown @@ -33,13 +33,13 @@ 22 o32 umount sys_oldumount 23 o32 setuid sys_setuid 24 o32 getuid sys_getuid -25 o32 stime sys_stime compat_sys_stime +25 o32 stime sys_stime sys_stime32 26 o32 ptrace sys_ptrace compat_sys_ptrace 27 o32 alarm sys_alarm # 28 was sys_fstat 28 o32 unused28 sys_ni_syscall 29 o32 pause sys_pause -30 o32 utime sys_utime compat_sys_utime +30 o32 utime sys_utime sys_utime32 31 o32 stty sys_ni_syscall 32 o32 gtty sys_ni_syscall 33 o32 access sys_access @@ -135,7 +135,7 @@ 121 o32 setdomainname sys_setdomainname 122 o32 uname sys_newuname 123 o32 modify_ldt sys_ni_syscall -124 o32 adjtimex sys_adjtimex compat_sys_adjtimex +124 o32 adjtimex sys_adjtimex sys_adjtimex_time32 125 o32 mprotect sys_mprotect 126 o32 sigprocmask sys_sigprocmask compat_sys_sigprocmask 127 o32 create_module sys_ni_syscall @@ -176,8 +176,8 @@ 162 o32 sched_yield sys_sched_yield 163 o32 sched_get_priority_max sys_sched_get_priority_max 164 o32 sched_get_priority_min sys_sched_get_priority_min -165 o32 sched_rr_get_interval sys_sched_rr_get_interval compat_sys_sched_rr_get_interval -166 o32 nanosleep sys_nanosleep compat_sys_nanosleep +165 o32 sched_rr_get_interval sys_sched_rr_get_interval sys_sched_rr_get_interval_time32 +166 o32 nanosleep sys_nanosleep sys_nanosleep_time32 167 o32 mremap sys_mremap 168 o32 accept sys_accept 169 o32 bind sys_bind @@ -208,7 +208,7 @@ 194 o32 rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction 195 o32 rt_sigprocmask sys_rt_sigprocmask compat_sys_rt_sigprocmask 196 o32 rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending -197 o32 rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait +197 o32 rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time32 198 o32 rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo 199 o32 rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend 200 o32 pread64 sys_pread64 sys_32_pread @@ -249,12 +249,12 @@ 235 o32 fremovexattr sys_fremovexattr 236 o32 tkill sys_tkill 237 o32 sendfile64 sys_sendfile64 -238 o32 futex sys_futex compat_sys_futex +238 o32 futex sys_futex sys_futex_time32 239 o32 sched_setaffinity sys_sched_setaffinity compat_sys_sched_setaffinity 240 o32 sched_getaffinity sys_sched_getaffinity compat_sys_sched_getaffinity 241 o32 io_setup sys_io_setup compat_sys_io_setup 242 o32 io_destroy sys_io_destroy -243 o32 io_getevents sys_io_getevents compat_sys_io_getevents +243 o32 io_getevents sys_io_getevents sys_io_getevents_time32 244 o32 io_submit sys_io_submit compat_sys_io_submit 245 o32 io_cancel sys_io_cancel 246 o32 exit_group sys_exit_group @@ -269,23 +269,23 @@ 255 o32 statfs64 sys_statfs64 compat_sys_statfs64 256 o32 fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 257 o32 timer_create sys_timer_create compat_sys_timer_create -258 o32 timer_settime sys_timer_settime compat_sys_timer_settime -259 o32 timer_gettime sys_timer_gettime compat_sys_timer_gettime +258 o32 timer_settime sys_timer_settime sys_timer_settime32 +259 o32 timer_gettime sys_timer_gettime sys_timer_gettime32 260 o32 timer_getoverrun sys_timer_getoverrun 261 o32 timer_delete sys_timer_delete -262 o32 clock_settime sys_clock_settime compat_sys_clock_settime -263 o32 clock_gettime sys_clock_gettime compat_sys_clock_gettime -264 o32 clock_getres sys_clock_getres compat_sys_clock_getres -265 o32 clock_nanosleep sys_clock_nanosleep compat_sys_clock_nanosleep +262 o32 clock_settime sys_clock_settime sys_clock_settime32 +263 o32 clock_gettime sys_clock_gettime sys_clock_gettime32 +264 o32 clock_getres sys_clock_getres sys_clock_getres_time32 +265 o32 clock_nanosleep sys_clock_nanosleep sys_clock_nanosleep_time32 266 o32 tgkill sys_tgkill -267 o32 utimes sys_utimes compat_sys_utimes +267 o32 utimes sys_utimes sys_utimes_time32 268 o32 mbind sys_mbind compat_sys_mbind 269 o32 get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy 270 o32 set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy 271 o32 mq_open sys_mq_open compat_sys_mq_open 272 o32 mq_unlink sys_mq_unlink -273 o32 mq_timedsend sys_mq_timedsend compat_sys_mq_timedsend -274 o32 mq_timedreceive sys_mq_timedreceive compat_sys_mq_timedreceive +273 o32 mq_timedsend sys_mq_timedsend sys_mq_timedsend_time32 +274 o32 mq_timedreceive sys_mq_timedreceive sys_mq_timedreceive_time32 275 o32 mq_notify sys_mq_notify compat_sys_mq_notify 276 o32 mq_getsetattr sys_mq_getsetattr compat_sys_mq_getsetattr 277 o32 vserver sys_ni_syscall @@ -303,7 +303,7 @@ 289 o32 mkdirat sys_mkdirat 290 o32 mknodat sys_mknodat 291 o32 fchownat sys_fchownat -292 o32 futimesat sys_futimesat compat_sys_futimesat +292 o32 futimesat sys_futimesat sys_futimesat_time32 293 o32 fstatat64 sys_fstatat64 sys_newfstatat 294 o32 unlinkat sys_unlinkat 295 o32 renameat sys_renameat @@ -312,8 +312,8 @@ 298 o32 readlinkat sys_readlinkat 299 o32 fchmodat sys_fchmodat 300 o32 faccessat sys_faccessat -301 o32 pselect6 sys_pselect6 compat_sys_pselect6 -302 o32 ppoll sys_ppoll compat_sys_ppoll +301 o32 pselect6 sys_pselect6 compat_sys_pselect6_time32 +302 o32 ppoll sys_ppoll compat_sys_ppoll_time32 303 o32 unshare sys_unshare 304 o32 splice sys_splice 305 o32 sync_file_range sys_sync_file_range sys32_sync_file_range @@ -327,14 +327,14 @@ 313 o32 epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait 314 o32 ioprio_set sys_ioprio_set 315 o32 ioprio_get sys_ioprio_get -316 o32 utimensat sys_utimensat compat_sys_utimensat +316 o32 utimensat sys_utimensat sys_utimensat_time32 317 o32 signalfd sys_signalfd compat_sys_signalfd 318 o32 timerfd sys_ni_syscall 319 o32 eventfd sys_eventfd 320 o32 fallocate sys_fallocate sys32_fallocate 321 o32 timerfd_create sys_timerfd_create -322 o32 timerfd_gettime sys_timerfd_gettime compat_sys_timerfd_gettime -323 o32 timerfd_settime sys_timerfd_settime compat_sys_timerfd_settime +322 o32 timerfd_gettime sys_timerfd_gettime sys_timerfd_gettime32 +323 o32 timerfd_settime sys_timerfd_settime sys_timerfd_settime32 324 o32 signalfd4 sys_signalfd4 compat_sys_signalfd4 325 o32 eventfd2 sys_eventfd2 326 o32 epoll_create1 sys_epoll_create1 @@ -346,13 +346,13 @@ 332 o32 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo 333 o32 perf_event_open sys_perf_event_open 334 o32 accept4 sys_accept4 -335 o32 recvmmsg sys_recvmmsg compat_sys_recvmmsg +335 o32 recvmmsg sys_recvmmsg compat_sys_recvmmsg_time32 336 o32 fanotify_init sys_fanotify_init 337 o32 fanotify_mark sys_fanotify_mark compat_sys_fanotify_mark 338 o32 prlimit64 sys_prlimit64 339 o32 name_to_handle_at sys_name_to_handle_at 340 o32 open_by_handle_at sys_open_by_handle_at compat_sys_open_by_handle_at -341 o32 clock_adjtime sys_clock_adjtime compat_sys_clock_adjtime +341 o32 clock_adjtime sys_clock_adjtime sys_clock_adjtime32 342 o32 syncfs sys_syncfs 343 o32 sendmmsg sys_sendmmsg compat_sys_sendmmsg 344 o32 setns sys_setns diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index 71873bb72782..f7440427d459 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -20,7 +20,7 @@ 10 common unlink sys_unlink 11 common execve sys_execve compat_sys_execve 12 common chdir sys_chdir -13 common time sys_time compat_sys_time +13 common time sys_time sys_time32 14 common mknod sys_mknod 15 common chmod sys_chmod 16 common lchown sys_lchown @@ -32,12 +32,12 @@ 22 common bind sys_bind 23 common setuid sys_setuid 24 common getuid sys_getuid -25 common stime sys_stime compat_sys_stime +25 common stime sys_stime sys_stime32 26 common ptrace sys_ptrace compat_sys_ptrace 27 common alarm sys_alarm 28 common fstat sys_newfstat compat_sys_newfstat 29 common pause sys_pause -30 common utime sys_utime compat_sys_utime +30 common utime sys_utime sys_utime32 31 common connect sys_connect 32 common listen sys_listen 33 common access sys_access @@ -133,7 +133,7 @@ 121 common setdomainname sys_setdomainname 122 common sendfile sys_sendfile compat_sys_sendfile 123 common recvfrom sys_recvfrom -124 common adjtimex sys_adjtimex compat_sys_adjtimex +124 common adjtimex sys_adjtimex sys_adjtimex_time32 125 common mprotect sys_mprotect 126 common sigprocmask sys_sigprocmask compat_sys_sigprocmask # 127 was create_module @@ -171,8 +171,8 @@ 158 common sched_yield sys_sched_yield 159 common sched_get_priority_max sys_sched_get_priority_max 160 common sched_get_priority_min sys_sched_get_priority_min -161 common sched_rr_get_interval sys_sched_rr_get_interval compat_sys_sched_rr_get_interval -162 common nanosleep sys_nanosleep compat_sys_nanosleep +161 common sched_rr_get_interval sys_sched_rr_get_interval sys_sched_rr_get_interval_time32 +162 common nanosleep sys_nanosleep sys_nanosleep_time32 163 common mremap sys_mremap 164 common setresuid sys_setresuid 165 common getresuid sys_getresuid @@ -187,7 +187,7 @@ 174 common rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction 175 common rt_sigprocmask sys_rt_sigprocmask compat_sys_rt_sigprocmask 176 common rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending -177 common rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait +177 common rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time32 178 common rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo 179 common rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend 180 common chown sys_chown @@ -223,14 +223,14 @@ 207 64 readahead sys_readahead 208 common tkill sys_tkill 209 common sendfile64 sys_sendfile64 compat_sys_sendfile64 -210 common futex sys_futex compat_sys_futex +210 common futex sys_futex sys_futex_time32 211 common sched_setaffinity sys_sched_setaffinity compat_sys_sched_setaffinity 212 common sched_getaffinity sys_sched_getaffinity compat_sys_sched_getaffinity # 213 was set_thread_area # 214 was get_thread_area 215 common io_setup sys_io_setup compat_sys_io_setup 216 common io_destroy sys_io_destroy -217 common io_getevents sys_io_getevents compat_sys_io_getevents +217 common io_getevents sys_io_getevents sys_io_getevents_time32 218 common io_submit sys_io_submit compat_sys_io_submit 219 common io_cancel sys_io_cancel # 220 was alloc_hugepages @@ -241,11 +241,11 @@ 225 common epoll_ctl sys_epoll_ctl 226 common epoll_wait sys_epoll_wait 227 common remap_file_pages sys_remap_file_pages -228 common semtimedop sys_semtimedop compat_sys_semtimedop +228 common semtimedop sys_semtimedop sys_semtimedop_time32 229 common mq_open sys_mq_open compat_sys_mq_open 230 common mq_unlink sys_mq_unlink -231 common mq_timedsend sys_mq_timedsend compat_sys_mq_timedsend -232 common mq_timedreceive sys_mq_timedreceive compat_sys_mq_timedreceive +231 common mq_timedsend sys_mq_timedsend sys_mq_timedsend_time32 +232 common mq_timedreceive sys_mq_timedreceive sys_mq_timedreceive_time32 233 common mq_notify sys_mq_notify compat_sys_mq_notify 234 common mq_getsetattr sys_mq_getsetattr compat_sys_mq_getsetattr 235 common waitid sys_waitid compat_sys_waitid @@ -265,14 +265,14 @@ 248 common lremovexattr sys_lremovexattr 249 common fremovexattr sys_fremovexattr 250 common timer_create sys_timer_create compat_sys_timer_create -251 common timer_settime sys_timer_settime compat_sys_timer_settime -252 common timer_gettime sys_timer_gettime compat_sys_timer_gettime +251 common timer_settime sys_timer_settime sys_timer_settime32 +252 common timer_gettime sys_timer_gettime sys_timer_gettime32 253 common timer_getoverrun sys_timer_getoverrun 254 common timer_delete sys_timer_delete -255 common clock_settime sys_clock_settime compat_sys_clock_settime -256 common clock_gettime sys_clock_gettime compat_sys_clock_gettime -257 common clock_getres sys_clock_getres compat_sys_clock_getres -258 common clock_nanosleep sys_clock_nanosleep compat_sys_clock_nanosleep +255 common clock_settime sys_clock_settime sys_clock_settime32 +256 common clock_gettime sys_clock_gettime sys_clock_gettime32 +257 common clock_getres sys_clock_getres sys_clock_getres_time32 +258 common clock_nanosleep sys_clock_nanosleep sys_clock_nanosleep_time32 259 common tgkill sys_tgkill 260 common mbind sys_mbind compat_sys_mbind 261 common get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy @@ -287,13 +287,13 @@ 270 common inotify_add_watch sys_inotify_add_watch 271 common inotify_rm_watch sys_inotify_rm_watch 272 common migrate_pages sys_migrate_pages -273 common pselect6 sys_pselect6 compat_sys_pselect6 -274 common ppoll sys_ppoll compat_sys_ppoll +273 common pselect6 sys_pselect6 compat_sys_pselect6_time32 +274 common ppoll sys_ppoll compat_sys_ppoll_time32 275 common openat sys_openat compat_sys_openat 276 common mkdirat sys_mkdirat 277 common mknodat sys_mknodat 278 common fchownat sys_fchownat -279 common futimesat sys_futimesat compat_sys_futimesat +279 common futimesat sys_futimesat sys_futimesat_time32 280 common fstatat64 sys_fstatat64 281 common unlinkat sys_unlinkat 282 common renameat sys_renameat @@ -316,15 +316,15 @@ 298 common statfs64 sys_statfs64 compat_sys_statfs64 299 common fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 300 common kexec_load sys_kexec_load compat_sys_kexec_load -301 common utimensat sys_utimensat compat_sys_utimensat +301 common utimensat sys_utimensat sys_utimensat_time32 302 common signalfd sys_signalfd compat_sys_signalfd # 303 was timerfd 304 common eventfd sys_eventfd 305 32 fallocate parisc_fallocate 305 64 fallocate sys_fallocate 306 common timerfd_create sys_timerfd_create -307 common timerfd_settime sys_timerfd_settime compat_sys_timerfd_settime -308 common timerfd_gettime sys_timerfd_gettime compat_sys_timerfd_gettime +307 common timerfd_settime sys_timerfd_settime sys_timerfd_settime32 +308 common timerfd_gettime sys_timerfd_gettime sys_timerfd_gettime32 309 common signalfd4 sys_signalfd4 compat_sys_signalfd4 310 common eventfd2 sys_eventfd2 311 common epoll_create1 sys_epoll_create1 @@ -335,12 +335,12 @@ 316 common pwritev sys_pwritev compat_sys_pwritev 317 common rt_tgsigqueueinfo sys_rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo 318 common perf_event_open sys_perf_event_open -319 common recvmmsg sys_recvmmsg compat_sys_recvmmsg +319 common recvmmsg sys_recvmmsg compat_sys_recvmmsg_time32 320 common accept4 sys_accept4 321 common prlimit64 sys_prlimit64 322 common fanotify_init sys_fanotify_init 323 common fanotify_mark sys_fanotify_mark sys32_fanotify_mark -324 common clock_adjtime sys_clock_adjtime compat_sys_clock_adjtime +324 common clock_adjtime sys_clock_adjtime sys_clock_adjtime32 325 common name_to_handle_at sys_name_to_handle_at 326 common open_by_handle_at sys_open_by_handle_at compat_sys_open_by_handle_at 327 common syncfs sys_syncfs @@ -352,7 +352,7 @@ 333 common finit_module sys_finit_module 334 common sched_setattr sys_sched_setattr 335 common sched_getattr sys_sched_getattr -336 common utimes sys_utimes compat_sys_utimes +336 common utimes sys_utimes sys_utimes_time32 337 common renameat2 sys_renameat2 338 common seccomp sys_seccomp 339 common getrandom sys_getrandom diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 7555874ce39c..86650dcd2185 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -20,7 +20,7 @@ 10 common unlink sys_unlink 11 nospu execve sys_execve compat_sys_execve 12 common chdir sys_chdir -13 common time sys_time compat_sys_time +13 common time sys_time sys_time32 14 common mknod sys_mknod 15 common chmod sys_chmod 16 common lchown sys_lchown @@ -36,14 +36,14 @@ 22 spu umount sys_ni_syscall 23 common setuid sys_setuid 24 common getuid sys_getuid -25 common stime sys_stime compat_sys_stime +25 common stime sys_stime sys_stime32 26 nospu ptrace sys_ptrace compat_sys_ptrace 27 common alarm sys_alarm 28 32 oldfstat sys_fstat sys_ni_syscall 28 64 oldfstat sys_ni_syscall 28 spu oldfstat sys_ni_syscall 29 nospu pause sys_pause -30 nospu utime sys_utime compat_sys_utime +30 nospu utime sys_utime sys_utime32 31 common stty sys_ni_syscall 32 common gtty sys_ni_syscall 33 common access sys_access @@ -157,7 +157,7 @@ 121 common setdomainname sys_setdomainname 122 common uname sys_newuname 123 common modify_ldt sys_ni_syscall -124 common adjtimex sys_adjtimex compat_sys_adjtimex +124 common adjtimex sys_adjtimex sys_adjtimex_time32 125 common mprotect sys_mprotect 126 32 sigprocmask sys_sigprocmask compat_sys_sigprocmask 126 64 sigprocmask sys_ni_syscall @@ -198,8 +198,8 @@ 158 common sched_yield sys_sched_yield 159 common sched_get_priority_max sys_sched_get_priority_max 160 common sched_get_priority_min sys_sched_get_priority_min -161 common sched_rr_get_interval sys_sched_rr_get_interval compat_sys_sched_rr_get_interval -162 common nanosleep sys_nanosleep compat_sys_nanosleep +161 common sched_rr_get_interval sys_sched_rr_get_interval sys_sched_rr_get_interval_time32 +162 common nanosleep sys_nanosleep sys_nanosleep_time32 163 common mremap sys_mremap 164 common setresuid sys_setresuid 165 common getresuid sys_getresuid @@ -213,7 +213,7 @@ 173 nospu rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction 174 nospu rt_sigprocmask sys_rt_sigprocmask compat_sys_rt_sigprocmask 175 nospu rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending -176 nospu rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait +176 nospu rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time32 177 nospu rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo 178 nospu rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend 179 common pread64 sys_pread64 compat_sys_pread64 @@ -260,7 +260,7 @@ 218 common removexattr sys_removexattr 219 common lremovexattr sys_lremovexattr 220 common fremovexattr sys_fremovexattr -221 common futex sys_futex compat_sys_futex +221 common futex sys_futex sys_futex_time32 222 common sched_setaffinity sys_sched_setaffinity compat_sys_sched_setaffinity 223 common sched_getaffinity sys_sched_getaffinity compat_sys_sched_getaffinity # 224 unused @@ -268,7 +268,7 @@ 226 32 sendfile64 sys_sendfile64 compat_sys_sendfile64 227 common io_setup sys_io_setup compat_sys_io_setup 228 common io_destroy sys_io_destroy -229 common io_getevents sys_io_getevents compat_sys_io_getevents +229 common io_getevents sys_io_getevents sys_io_getevents_time32 230 common io_submit sys_io_submit compat_sys_io_submit 231 common io_cancel sys_io_cancel 232 nospu set_tid_address sys_set_tid_address @@ -280,19 +280,19 @@ 238 common epoll_wait sys_epoll_wait 239 common remap_file_pages sys_remap_file_pages 240 common timer_create sys_timer_create compat_sys_timer_create -241 common timer_settime sys_timer_settime compat_sys_timer_settime -242 common timer_gettime sys_timer_gettime compat_sys_timer_gettime +241 common timer_settime sys_timer_settime sys_timer_settime32 +242 common timer_gettime sys_timer_gettime sys_timer_gettime32 243 common timer_getoverrun sys_timer_getoverrun 244 common timer_delete sys_timer_delete -245 common clock_settime sys_clock_settime compat_sys_clock_settime -246 common clock_gettime sys_clock_gettime compat_sys_clock_gettime -247 common clock_getres sys_clock_getres compat_sys_clock_getres -248 common clock_nanosleep sys_clock_nanosleep compat_sys_clock_nanosleep +245 common clock_settime sys_clock_settime sys_clock_settime32 +246 common clock_gettime sys_clock_gettime sys_clock_gettime32 +247 common clock_getres sys_clock_getres sys_clock_getres_time32 +248 common clock_nanosleep sys_clock_nanosleep sys_clock_nanosleep_time32 249 32 swapcontext ppc_swapcontext ppc32_swapcontext 249 64 swapcontext ppc64_swapcontext 249 spu swapcontext sys_ni_syscall 250 common tgkill sys_tgkill -251 common utimes sys_utimes compat_sys_utimes +251 common utimes sys_utimes sys_utimes_time32 252 common statfs64 sys_statfs64 compat_sys_statfs64 253 common fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 254 32 fadvise64_64 ppc_fadvise64_64 @@ -308,8 +308,8 @@ 261 nospu set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy 262 nospu mq_open sys_mq_open compat_sys_mq_open 263 nospu mq_unlink sys_mq_unlink -264 nospu mq_timedsend sys_mq_timedsend compat_sys_mq_timedsend -265 nospu mq_timedreceive sys_mq_timedreceive compat_sys_mq_timedreceive +264 nospu mq_timedsend sys_mq_timedsend sys_mq_timedsend_time32 +265 nospu mq_timedreceive sys_mq_timedreceive sys_mq_timedreceive_time32 266 nospu mq_notify sys_mq_notify compat_sys_mq_notify 267 nospu mq_getsetattr sys_mq_getsetattr compat_sys_mq_getsetattr 268 nospu kexec_load sys_kexec_load compat_sys_kexec_load @@ -324,8 +324,8 @@ 277 nospu inotify_rm_watch sys_inotify_rm_watch 278 nospu spu_run sys_spu_run 279 nospu spu_create sys_spu_create -280 nospu pselect6 sys_pselect6 compat_sys_pselect6 -281 nospu ppoll sys_ppoll compat_sys_ppoll +280 nospu pselect6 sys_pselect6 compat_sys_pselect6_time32 +281 nospu ppoll sys_ppoll compat_sys_ppoll_time32 282 common unshare sys_unshare 283 common splice sys_splice 284 common tee sys_tee @@ -334,7 +334,7 @@ 287 common mkdirat sys_mkdirat 288 common mknodat sys_mknodat 289 common fchownat sys_fchownat -290 common futimesat sys_futimesat compat_sys_futimesat +290 common futimesat sys_futimesat sys_futimesat_time32 291 32 fstatat64 sys_fstatat64 291 64 newfstatat sys_newfstatat 291 spu newfstatat sys_newfstatat @@ -350,15 +350,15 @@ 301 common move_pages sys_move_pages compat_sys_move_pages 302 common getcpu sys_getcpu 303 nospu epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait -304 common utimensat sys_utimensat compat_sys_utimensat +304 common utimensat sys_utimensat sys_utimensat_time32 305 common signalfd sys_signalfd compat_sys_signalfd 306 common timerfd_create sys_timerfd_create 307 common eventfd sys_eventfd 308 common sync_file_range2 sys_sync_file_range2 compat_sys_sync_file_range2 309 nospu fallocate sys_fallocate compat_sys_fallocate 310 nospu subpage_prot sys_subpage_prot -311 common timerfd_settime sys_timerfd_settime compat_sys_timerfd_settime -312 common timerfd_gettime sys_timerfd_gettime compat_sys_timerfd_gettime +311 common timerfd_settime sys_timerfd_settime sys_timerfd_settime32 +312 common timerfd_gettime sys_timerfd_gettime sys_timerfd_gettime32 313 common signalfd4 sys_signalfd4 compat_sys_signalfd4 314 common eventfd2 sys_eventfd2 315 common epoll_create1 sys_epoll_create1 @@ -389,11 +389,11 @@ 340 common getsockopt sys_getsockopt compat_sys_getsockopt 341 common sendmsg sys_sendmsg compat_sys_sendmsg 342 common recvmsg sys_recvmsg compat_sys_recvmsg -343 common recvmmsg sys_recvmmsg compat_sys_recvmmsg +343 common recvmmsg sys_recvmmsg compat_sys_recvmmsg_time32 344 common accept4 sys_accept4 345 common name_to_handle_at sys_name_to_handle_at 346 common open_by_handle_at sys_open_by_handle_at compat_sys_open_by_handle_at -347 common clock_adjtime sys_clock_adjtime compat_sys_clock_adjtime +347 common clock_adjtime sys_clock_adjtime sys_clock_adjtime32 348 common syncfs sys_syncfs 349 common sendmmsg sys_sendmmsg compat_sys_sendmmsg 350 common setns sys_setns diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 620e222003ca..285201cf1f83 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -20,7 +20,7 @@ 10 common unlink sys_unlink sys_unlink 11 common execve sys_execve compat_sys_execve 12 common chdir sys_chdir sys_chdir -13 32 time - compat_sys_time +13 32 time - sys_time32 14 common mknod sys_mknod sys_mknod 15 common chmod sys_chmod sys_chmod 16 32 lchown - sys_lchown16 @@ -30,11 +30,11 @@ 22 common umount sys_oldumount sys_oldumount 23 32 setuid - sys_setuid16 24 32 getuid - sys_getuid16 -25 32 stime - compat_sys_stime +25 32 stime - sys_stime32 26 common ptrace sys_ptrace compat_sys_ptrace 27 common alarm sys_alarm sys_alarm 29 common pause sys_pause sys_pause -30 common utime sys_utime compat_sys_utime +30 common utime sys_utime sys_utime32 33 common access sys_access sys_access 34 common nice sys_nice sys_nice 36 common sync sys_sync sys_sync @@ -112,7 +112,7 @@ 120 common clone sys_clone sys_clone 121 common setdomainname sys_setdomainname sys_setdomainname 122 common uname sys_newuname sys_newuname -124 common adjtimex sys_adjtimex compat_sys_adjtimex +124 common adjtimex sys_adjtimex sys_adjtimex_time32 125 common mprotect sys_mprotect sys_mprotect 126 common sigprocmask sys_sigprocmask compat_sys_sigprocmask 127 common create_module - - @@ -150,8 +150,8 @@ 158 common sched_yield sys_sched_yield sys_sched_yield 159 common sched_get_priority_max sys_sched_get_priority_max sys_sched_get_priority_max 160 common sched_get_priority_min sys_sched_get_priority_min sys_sched_get_priority_min -161 common sched_rr_get_interval sys_sched_rr_get_interval compat_sys_sched_rr_get_interval -162 common nanosleep sys_nanosleep compat_sys_nanosleep +161 common sched_rr_get_interval sys_sched_rr_get_interval sys_sched_rr_get_interval_time32 +162 common nanosleep sys_nanosleep sys_nanosleep_time32 163 common mremap sys_mremap sys_mremap 164 32 setresuid - sys_setresuid16 165 32 getresuid - sys_getresuid16 @@ -165,7 +165,7 @@ 174 common rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction 175 common rt_sigprocmask sys_rt_sigprocmask compat_sys_rt_sigprocmask 176 common rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending -177 common rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait +177 common rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time32 178 common rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo 179 common rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend 180 common pread64 sys_pread64 compat_sys_s390_pread64 @@ -246,13 +246,13 @@ 235 common fremovexattr sys_fremovexattr sys_fremovexattr 236 common gettid sys_gettid sys_gettid 237 common tkill sys_tkill sys_tkill -238 common futex sys_futex compat_sys_futex +238 common futex sys_futex sys_futex_time32 239 common sched_setaffinity sys_sched_setaffinity compat_sys_sched_setaffinity 240 common sched_getaffinity sys_sched_getaffinity compat_sys_sched_getaffinity 241 common tgkill sys_tgkill sys_tgkill 243 common io_setup sys_io_setup compat_sys_io_setup 244 common io_destroy sys_io_destroy sys_io_destroy -245 common io_getevents sys_io_getevents compat_sys_io_getevents +245 common io_getevents sys_io_getevents sys_io_getevents_time32 246 common io_submit sys_io_submit compat_sys_io_submit 247 common io_cancel sys_io_cancel sys_io_cancel 248 common exit_group sys_exit_group sys_exit_group @@ -262,14 +262,14 @@ 252 common set_tid_address sys_set_tid_address sys_set_tid_address 253 common fadvise64 sys_fadvise64_64 compat_sys_s390_fadvise64 254 common timer_create sys_timer_create compat_sys_timer_create -255 common timer_settime sys_timer_settime compat_sys_timer_settime -256 common timer_gettime sys_timer_gettime compat_sys_timer_gettime +255 common timer_settime sys_timer_settime sys_timer_settime32 +256 common timer_gettime sys_timer_gettime sys_timer_gettime32 257 common timer_getoverrun sys_timer_getoverrun sys_timer_getoverrun 258 common timer_delete sys_timer_delete sys_timer_delete -259 common clock_settime sys_clock_settime compat_sys_clock_settime -260 common clock_gettime sys_clock_gettime compat_sys_clock_gettime -261 common clock_getres sys_clock_getres compat_sys_clock_getres -262 common clock_nanosleep sys_clock_nanosleep compat_sys_clock_nanosleep +259 common clock_settime sys_clock_settime sys_clock_settime32 +260 common clock_gettime sys_clock_gettime sys_clock_gettime32 +261 common clock_getres sys_clock_getres sys_clock_getres_time32 +262 common clock_nanosleep sys_clock_nanosleep sys_clock_nanosleep_time32 264 32 fadvise64_64 - compat_sys_s390_fadvise64_64 265 common statfs64 sys_statfs64 compat_sys_statfs64 266 common fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 @@ -279,8 +279,8 @@ 270 common set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy 271 common mq_open sys_mq_open compat_sys_mq_open 272 common mq_unlink sys_mq_unlink sys_mq_unlink -273 common mq_timedsend sys_mq_timedsend compat_sys_mq_timedsend -274 common mq_timedreceive sys_mq_timedreceive compat_sys_mq_timedreceive +273 common mq_timedsend sys_mq_timedsend sys_mq_timedsend_time32 +274 common mq_timedreceive sys_mq_timedreceive sys_mq_timedreceive_time32 275 common mq_notify sys_mq_notify compat_sys_mq_notify 276 common mq_getsetattr sys_mq_getsetattr compat_sys_mq_getsetattr 277 common kexec_load sys_kexec_load compat_sys_kexec_load @@ -298,7 +298,7 @@ 289 common mkdirat sys_mkdirat sys_mkdirat 290 common mknodat sys_mknodat sys_mknodat 291 common fchownat sys_fchownat sys_fchownat -292 common futimesat sys_futimesat compat_sys_futimesat +292 common futimesat sys_futimesat sys_futimesat_time32 293 32 fstatat64 - compat_sys_s390_fstatat64 293 64 newfstatat sys_newfstatat - 294 common unlinkat sys_unlinkat sys_unlinkat @@ -308,8 +308,8 @@ 298 common readlinkat sys_readlinkat sys_readlinkat 299 common fchmodat sys_fchmodat sys_fchmodat 300 common faccessat sys_faccessat sys_faccessat -301 common pselect6 sys_pselect6 compat_sys_pselect6 -302 common ppoll sys_ppoll compat_sys_ppoll +301 common pselect6 sys_pselect6 compat_sys_pselect6_time32 +302 common ppoll sys_ppoll compat_sys_ppoll_time32 303 common unshare sys_unshare sys_unshare 304 common set_robust_list sys_set_robust_list compat_sys_set_robust_list 305 common get_robust_list sys_get_robust_list compat_sys_get_robust_list @@ -320,15 +320,15 @@ 310 common move_pages sys_move_pages compat_sys_move_pages 311 common getcpu sys_getcpu sys_getcpu 312 common epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait -313 common utimes sys_utimes compat_sys_utimes +313 common utimes sys_utimes sys_utimes_time32 314 common fallocate sys_fallocate compat_sys_s390_fallocate -315 common utimensat sys_utimensat compat_sys_utimensat +315 common utimensat sys_utimensat sys_utimensat_time32 316 common signalfd sys_signalfd compat_sys_signalfd 317 common timerfd - - 318 common eventfd sys_eventfd sys_eventfd 319 common timerfd_create sys_timerfd_create sys_timerfd_create -320 common timerfd_settime sys_timerfd_settime compat_sys_timerfd_settime -321 common timerfd_gettime sys_timerfd_gettime compat_sys_timerfd_gettime +320 common timerfd_settime sys_timerfd_settime sys_timerfd_settime32 +321 common timerfd_gettime sys_timerfd_gettime sys_timerfd_gettime32 322 common signalfd4 sys_signalfd4 compat_sys_signalfd4 323 common eventfd2 sys_eventfd2 sys_eventfd2 324 common inotify_init1 sys_inotify_init1 sys_inotify_init1 @@ -344,7 +344,7 @@ 334 common prlimit64 sys_prlimit64 sys_prlimit64 335 common name_to_handle_at sys_name_to_handle_at sys_name_to_handle_at 336 common open_by_handle_at sys_open_by_handle_at compat_sys_open_by_handle_at -337 common clock_adjtime sys_clock_adjtime compat_sys_clock_adjtime +337 common clock_adjtime sys_clock_adjtime sys_clock_adjtime32 338 common syncfs sys_syncfs sys_syncfs 339 common setns sys_setns sys_setns 340 common process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv @@ -364,7 +364,7 @@ 354 common execveat sys_execveat compat_sys_execveat 355 common userfaultfd sys_userfaultfd sys_userfaultfd 356 common membarrier sys_membarrier sys_membarrier -357 common recvmmsg sys_recvmmsg compat_sys_recvmmsg +357 common recvmmsg sys_recvmmsg compat_sys_recvmmsg_time32 358 common sendmmsg sys_sendmmsg compat_sys_sendmmsg 359 common socket sys_socket sys_socket 360 common socketpair sys_socketpair sys_socketpair diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index e63cd013cc77..7cb05b50aeaa 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -44,7 +44,7 @@ 28 common sigaltstack sys_sigaltstack compat_sys_sigaltstack 29 32 pause sys_pause 29 64 pause sys_nis_syscall -30 common utime sys_utime compat_sys_utime +30 common utime sys_utime sys_utime32 31 32 lchown32 sys_lchown 32 32 fchown32 sys_fchown 33 common access sys_access @@ -128,7 +128,7 @@ 102 common rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction 103 common rt_sigprocmask sys_rt_sigprocmask compat_sys_rt_sigprocmask 104 common rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending -105 common rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait +105 common rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time32 106 common rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo 107 common rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend 108 32 setresuid32 sys_setresuid @@ -168,11 +168,11 @@ 135 common socketpair sys_socketpair 136 common mkdir sys_mkdir 137 common rmdir sys_rmdir -138 common utimes sys_utimes compat_sys_utimes +138 common utimes sys_utimes sys_utimes_time32 139 common stat64 sys_stat64 compat_sys_stat64 140 common sendfile64 sys_sendfile64 141 common getpeername sys_getpeername -142 common futex sys_futex compat_sys_futex +142 common futex sys_futex sys_futex_time32 143 common gettid sys_gettid 144 common getrlimit sys_getrlimit compat_sys_getrlimit 145 common setrlimit sys_setrlimit compat_sys_setrlimit @@ -258,7 +258,7 @@ 216 64 sigreturn sys_nis_syscall 217 common clone sys_clone 218 common ioprio_get sys_ioprio_get -219 32 adjtimex sys_adjtimex compat_sys_adjtimex +219 32 adjtimex sys_adjtimex sys_adjtimex_time32 219 64 adjtimex sys_sparc_adjtimex 220 32 sigprocmask sys_sigprocmask compat_sys_sigprocmask 220 64 sigprocmask sys_nis_syscall @@ -272,9 +272,9 @@ 228 common setfsuid sys_setfsuid16 229 common setfsgid sys_setfsgid16 230 common _newselect sys_select compat_sys_select -231 32 time sys_time compat_sys_time +231 32 time sys_time sys_time32 232 common splice sys_splice -233 common stime sys_stime compat_sys_stime +233 common stime sys_stime sys_stime32 234 common statfs64 sys_statfs64 compat_sys_statfs64 235 common fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 236 common _llseek sys_llseek @@ -289,8 +289,8 @@ 245 common sched_yield sys_sched_yield 246 common sched_get_priority_max sys_sched_get_priority_max 247 common sched_get_priority_min sys_sched_get_priority_min -248 common sched_rr_get_interval sys_sched_rr_get_interval compat_sys_sched_rr_get_interval -249 common nanosleep sys_nanosleep compat_sys_nanosleep +248 common sched_rr_get_interval sys_sched_rr_get_interval sys_sched_rr_get_interval_time32 +249 common nanosleep sys_nanosleep sys_nanosleep_time32 250 32 mremap sys_mremap 250 64 mremap sys_64_mremap 251 common _sysctl sys_sysctl compat_sys_sysctl @@ -299,14 +299,14 @@ 254 32 nfsservctl sys_ni_syscall sys_nis_syscall 254 64 nfsservctl sys_nis_syscall 255 common sync_file_range sys_sync_file_range compat_sys_sync_file_range -256 common clock_settime sys_clock_settime compat_sys_clock_settime -257 common clock_gettime sys_clock_gettime compat_sys_clock_gettime -258 common clock_getres sys_clock_getres compat_sys_clock_getres -259 common clock_nanosleep sys_clock_nanosleep compat_sys_clock_nanosleep +256 common clock_settime sys_clock_settime sys_clock_settime32 +257 common clock_gettime sys_clock_gettime sys_clock_gettime32 +258 common clock_getres sys_clock_getres sys_clock_getres_time32 +259 common clock_nanosleep sys_clock_nanosleep sys_clock_nanosleep_time32 260 common sched_getaffinity sys_sched_getaffinity compat_sys_sched_getaffinity 261 common sched_setaffinity sys_sched_setaffinity compat_sys_sched_setaffinity -262 common timer_settime sys_timer_settime compat_sys_timer_settime -263 common timer_gettime sys_timer_gettime compat_sys_timer_gettime +262 common timer_settime sys_timer_settime sys_timer_settime32 +263 common timer_gettime sys_timer_gettime sys_timer_gettime32 264 common timer_getoverrun sys_timer_getoverrun 265 common timer_delete sys_timer_delete 266 common timer_create sys_timer_create compat_sys_timer_create @@ -316,11 +316,11 @@ 269 common io_destroy sys_io_destroy 270 common io_submit sys_io_submit compat_sys_io_submit 271 common io_cancel sys_io_cancel -272 common io_getevents sys_io_getevents compat_sys_io_getevents +272 common io_getevents sys_io_getevents sys_io_getevents_time32 273 common mq_open sys_mq_open compat_sys_mq_open 274 common mq_unlink sys_mq_unlink -275 common mq_timedsend sys_mq_timedsend compat_sys_mq_timedsend -276 common mq_timedreceive sys_mq_timedreceive compat_sys_mq_timedreceive +275 common mq_timedsend sys_mq_timedsend sys_mq_timedsend_time32 +276 common mq_timedreceive sys_mq_timedreceive sys_mq_timedreceive_time32 277 common mq_notify sys_mq_notify compat_sys_mq_notify 278 common mq_getsetattr sys_mq_getsetattr compat_sys_mq_getsetattr 279 common waitid sys_waitid compat_sys_waitid @@ -332,7 +332,7 @@ 285 common mkdirat sys_mkdirat 286 common mknodat sys_mknodat 287 common fchownat sys_fchownat -288 common futimesat sys_futimesat compat_sys_futimesat +288 common futimesat sys_futimesat sys_futimesat_time32 289 common fstatat64 sys_fstatat64 compat_sys_fstatat64 290 common unlinkat sys_unlinkat 291 common renameat sys_renameat @@ -341,8 +341,8 @@ 294 common readlinkat sys_readlinkat 295 common fchmodat sys_fchmodat 296 common faccessat sys_faccessat -297 common pselect6 sys_pselect6 compat_sys_pselect6 -298 common ppoll sys_ppoll compat_sys_ppoll +297 common pselect6 sys_pselect6 compat_sys_pselect6_time32 +298 common ppoll sys_ppoll compat_sys_ppoll_time32 299 common unshare sys_unshare 300 common set_robust_list sys_set_robust_list compat_sys_set_robust_list 301 common get_robust_list sys_get_robust_list compat_sys_get_robust_list @@ -354,13 +354,13 @@ 307 common move_pages sys_move_pages compat_sys_move_pages 308 common getcpu sys_getcpu 309 common epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait -310 common utimensat sys_utimensat compat_sys_utimensat +310 common utimensat sys_utimensat sys_utimensat_time32 311 common signalfd sys_signalfd compat_sys_signalfd 312 common timerfd_create sys_timerfd_create 313 common eventfd sys_eventfd 314 common fallocate sys_fallocate compat_sys_fallocate -315 common timerfd_settime sys_timerfd_settime compat_sys_timerfd_settime -316 common timerfd_gettime sys_timerfd_gettime compat_sys_timerfd_gettime +315 common timerfd_settime sys_timerfd_settime sys_timerfd_settime32 +316 common timerfd_gettime sys_timerfd_gettime sys_timerfd_gettime32 317 common signalfd4 sys_signalfd4 compat_sys_signalfd4 318 common eventfd2 sys_eventfd2 319 common epoll_create1 sys_epoll_create1 @@ -372,13 +372,13 @@ 325 common pwritev sys_pwritev compat_sys_pwritev 326 common rt_tgsigqueueinfo sys_rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo 327 common perf_event_open sys_perf_event_open -328 common recvmmsg sys_recvmmsg compat_sys_recvmmsg +328 common recvmmsg sys_recvmmsg compat_sys_recvmmsg_time32 329 common fanotify_init sys_fanotify_init 330 common fanotify_mark sys_fanotify_mark compat_sys_fanotify_mark 331 common prlimit64 sys_prlimit64 332 common name_to_handle_at sys_name_to_handle_at 333 common open_by_handle_at sys_open_by_handle_at compat_sys_open_by_handle_at -334 32 clock_adjtime sys_clock_adjtime compat_sys_clock_adjtime +334 32 clock_adjtime sys_clock_adjtime sys_clock_adjtime32 334 64 clock_adjtime sys_sparc_clock_adjtime 335 common syncfs sys_syncfs 336 common sendmmsg sys_sendmmsg compat_sys_sendmmsg diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 2be1d0eb7754..7705d5ecad25 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -24,7 +24,7 @@ 10 i386 unlink sys_unlink __ia32_sys_unlink 11 i386 execve sys_execve __ia32_compat_sys_execve 12 i386 chdir sys_chdir __ia32_sys_chdir -13 i386 time sys_time __ia32_compat_sys_time +13 i386 time sys_time __ia32_sys_time32 14 i386 mknod sys_mknod __ia32_sys_mknod 15 i386 chmod sys_chmod __ia32_sys_chmod 16 i386 lchown sys_lchown16 __ia32_sys_lchown16 @@ -36,12 +36,12 @@ 22 i386 umount sys_oldumount __ia32_sys_oldumount 23 i386 setuid sys_setuid16 __ia32_sys_setuid16 24 i386 getuid sys_getuid16 __ia32_sys_getuid16 -25 i386 stime sys_stime __ia32_compat_sys_stime +25 i386 stime sys_stime __ia32_sys_stime32 26 i386 ptrace sys_ptrace __ia32_compat_sys_ptrace 27 i386 alarm sys_alarm __ia32_sys_alarm 28 i386 oldfstat sys_fstat __ia32_sys_fstat 29 i386 pause sys_pause __ia32_sys_pause -30 i386 utime sys_utime __ia32_compat_sys_utime +30 i386 utime sys_utime __ia32_sys_utime32 31 i386 stty 32 i386 gtty 33 i386 access sys_access __ia32_sys_access @@ -135,7 +135,7 @@ 121 i386 setdomainname sys_setdomainname __ia32_sys_setdomainname 122 i386 uname sys_newuname __ia32_sys_newuname 123 i386 modify_ldt sys_modify_ldt __ia32_sys_modify_ldt -124 i386 adjtimex sys_adjtimex __ia32_compat_sys_adjtimex +124 i386 adjtimex sys_adjtimex __ia32_sys_adjtimex_time32 125 i386 mprotect sys_mprotect __ia32_sys_mprotect 126 i386 sigprocmask sys_sigprocmask __ia32_compat_sys_sigprocmask 127 i386 create_module @@ -172,8 +172,8 @@ 158 i386 sched_yield sys_sched_yield __ia32_sys_sched_yield 159 i386 sched_get_priority_max sys_sched_get_priority_max __ia32_sys_sched_get_priority_max 160 i386 sched_get_priority_min sys_sched_get_priority_min __ia32_sys_sched_get_priority_min -161 i386 sched_rr_get_interval sys_sched_rr_get_interval __ia32_compat_sys_sched_rr_get_interval -162 i386 nanosleep sys_nanosleep __ia32_compat_sys_nanosleep +161 i386 sched_rr_get_interval sys_sched_rr_get_interval __ia32_sys_sched_rr_get_interval_time32 +162 i386 nanosleep sys_nanosleep __ia32_sys_nanosleep_time32 163 i386 mremap sys_mremap __ia32_sys_mremap 164 i386 setresuid sys_setresuid16 __ia32_sys_setresuid16 165 i386 getresuid sys_getresuid16 __ia32_sys_getresuid16 @@ -188,7 +188,7 @@ 174 i386 rt_sigaction sys_rt_sigaction __ia32_compat_sys_rt_sigaction 175 i386 rt_sigprocmask sys_rt_sigprocmask __ia32_sys_rt_sigprocmask 176 i386 rt_sigpending sys_rt_sigpending __ia32_compat_sys_rt_sigpending -177 i386 rt_sigtimedwait sys_rt_sigtimedwait __ia32_compat_sys_rt_sigtimedwait +177 i386 rt_sigtimedwait sys_rt_sigtimedwait __ia32_compat_sys_rt_sigtimedwait_time32 178 i386 rt_sigqueueinfo sys_rt_sigqueueinfo __ia32_compat_sys_rt_sigqueueinfo 179 i386 rt_sigsuspend sys_rt_sigsuspend __ia32_sys_rt_sigsuspend 180 i386 pread64 sys_pread64 __ia32_compat_sys_x86_pread @@ -251,14 +251,14 @@ 237 i386 fremovexattr sys_fremovexattr __ia32_sys_fremovexattr 238 i386 tkill sys_tkill __ia32_sys_tkill 239 i386 sendfile64 sys_sendfile64 __ia32_sys_sendfile64 -240 i386 futex sys_futex __ia32_compat_sys_futex +240 i386 futex sys_futex __ia32_sys_futex_time32 241 i386 sched_setaffinity sys_sched_setaffinity __ia32_compat_sys_sched_setaffinity 242 i386 sched_getaffinity sys_sched_getaffinity __ia32_compat_sys_sched_getaffinity 243 i386 set_thread_area sys_set_thread_area __ia32_sys_set_thread_area 244 i386 get_thread_area sys_get_thread_area __ia32_sys_get_thread_area 245 i386 io_setup sys_io_setup __ia32_compat_sys_io_setup 246 i386 io_destroy sys_io_destroy __ia32_sys_io_destroy -247 i386 io_getevents sys_io_getevents __ia32_compat_sys_io_getevents +247 i386 io_getevents sys_io_getevents __ia32_sys_io_getevents_time32 248 i386 io_submit sys_io_submit __ia32_compat_sys_io_submit 249 i386 io_cancel sys_io_cancel __ia32_sys_io_cancel 250 i386 fadvise64 sys_fadvise64 __ia32_compat_sys_x86_fadvise64 @@ -271,18 +271,18 @@ 257 i386 remap_file_pages sys_remap_file_pages __ia32_sys_remap_file_pages 258 i386 set_tid_address sys_set_tid_address __ia32_sys_set_tid_address 259 i386 timer_create sys_timer_create __ia32_compat_sys_timer_create -260 i386 timer_settime sys_timer_settime __ia32_compat_sys_timer_settime -261 i386 timer_gettime sys_timer_gettime __ia32_compat_sys_timer_gettime +260 i386 timer_settime sys_timer_settime __ia32_sys_timer_settime32 +261 i386 timer_gettime sys_timer_gettime __ia32_sys_timer_gettime32 262 i386 timer_getoverrun sys_timer_getoverrun __ia32_sys_timer_getoverrun 263 i386 timer_delete sys_timer_delete __ia32_sys_timer_delete -264 i386 clock_settime sys_clock_settime __ia32_compat_sys_clock_settime -265 i386 clock_gettime sys_clock_gettime __ia32_compat_sys_clock_gettime -266 i386 clock_getres sys_clock_getres __ia32_compat_sys_clock_getres -267 i386 clock_nanosleep sys_clock_nanosleep __ia32_compat_sys_clock_nanosleep +264 i386 clock_settime sys_clock_settime __ia32_sys_clock_settime32 +265 i386 clock_gettime sys_clock_gettime __ia32_sys_clock_gettime32 +266 i386 clock_getres sys_clock_getres __ia32_sys_clock_getres_time32 +267 i386 clock_nanosleep sys_clock_nanosleep __ia32_sys_clock_nanosleep_time32 268 i386 statfs64 sys_statfs64 __ia32_compat_sys_statfs64 269 i386 fstatfs64 sys_fstatfs64 __ia32_compat_sys_fstatfs64 270 i386 tgkill sys_tgkill __ia32_sys_tgkill -271 i386 utimes sys_utimes __ia32_compat_sys_utimes +271 i386 utimes sys_utimes __ia32_sys_utimes_time32 272 i386 fadvise64_64 sys_fadvise64_64 __ia32_compat_sys_x86_fadvise64_64 273 i386 vserver 274 i386 mbind sys_mbind __ia32_sys_mbind @@ -290,8 +290,8 @@ 276 i386 set_mempolicy sys_set_mempolicy __ia32_sys_set_mempolicy 277 i386 mq_open sys_mq_open __ia32_compat_sys_mq_open 278 i386 mq_unlink sys_mq_unlink __ia32_sys_mq_unlink -279 i386 mq_timedsend sys_mq_timedsend __ia32_compat_sys_mq_timedsend -280 i386 mq_timedreceive sys_mq_timedreceive __ia32_compat_sys_mq_timedreceive +279 i386 mq_timedsend sys_mq_timedsend __ia32_sys_mq_timedsend_time32 +280 i386 mq_timedreceive sys_mq_timedreceive __ia32_sys_mq_timedreceive_time32 281 i386 mq_notify sys_mq_notify __ia32_compat_sys_mq_notify 282 i386 mq_getsetattr sys_mq_getsetattr __ia32_compat_sys_mq_getsetattr 283 i386 kexec_load sys_kexec_load __ia32_compat_sys_kexec_load @@ -310,7 +310,7 @@ 296 i386 mkdirat sys_mkdirat __ia32_sys_mkdirat 297 i386 mknodat sys_mknodat __ia32_sys_mknodat 298 i386 fchownat sys_fchownat __ia32_sys_fchownat -299 i386 futimesat sys_futimesat __ia32_compat_sys_futimesat +299 i386 futimesat sys_futimesat __ia32_sys_futimesat_time32 300 i386 fstatat64 sys_fstatat64 __ia32_compat_sys_x86_fstatat 301 i386 unlinkat sys_unlinkat __ia32_sys_unlinkat 302 i386 renameat sys_renameat __ia32_sys_renameat @@ -319,8 +319,8 @@ 305 i386 readlinkat sys_readlinkat __ia32_sys_readlinkat 306 i386 fchmodat sys_fchmodat __ia32_sys_fchmodat 307 i386 faccessat sys_faccessat __ia32_sys_faccessat -308 i386 pselect6 sys_pselect6 __ia32_compat_sys_pselect6 -309 i386 ppoll sys_ppoll __ia32_compat_sys_ppoll +308 i386 pselect6 sys_pselect6 __ia32_compat_sys_pselect6_time32 +309 i386 ppoll sys_ppoll __ia32_compat_sys_ppoll_time32 310 i386 unshare sys_unshare __ia32_sys_unshare 311 i386 set_robust_list sys_set_robust_list __ia32_compat_sys_set_robust_list 312 i386 get_robust_list sys_get_robust_list __ia32_compat_sys_get_robust_list @@ -331,13 +331,13 @@ 317 i386 move_pages sys_move_pages __ia32_compat_sys_move_pages 318 i386 getcpu sys_getcpu __ia32_sys_getcpu 319 i386 epoll_pwait sys_epoll_pwait __ia32_sys_epoll_pwait -320 i386 utimensat sys_utimensat __ia32_compat_sys_utimensat +320 i386 utimensat sys_utimensat __ia32_sys_utimensat_time32 321 i386 signalfd sys_signalfd __ia32_compat_sys_signalfd 322 i386 timerfd_create sys_timerfd_create __ia32_sys_timerfd_create 323 i386 eventfd sys_eventfd __ia32_sys_eventfd 324 i386 fallocate sys_fallocate __ia32_compat_sys_x86_fallocate -325 i386 timerfd_settime sys_timerfd_settime __ia32_compat_sys_timerfd_settime -326 i386 timerfd_gettime sys_timerfd_gettime __ia32_compat_sys_timerfd_gettime +325 i386 timerfd_settime sys_timerfd_settime __ia32_sys_timerfd_settime32 +326 i386 timerfd_gettime sys_timerfd_gettime __ia32_sys_timerfd_gettime32 327 i386 signalfd4 sys_signalfd4 __ia32_compat_sys_signalfd4 328 i386 eventfd2 sys_eventfd2 __ia32_sys_eventfd2 329 i386 epoll_create1 sys_epoll_create1 __ia32_sys_epoll_create1 @@ -348,13 +348,13 @@ 334 i386 pwritev sys_pwritev __ia32_compat_sys_pwritev 335 i386 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo __ia32_compat_sys_rt_tgsigqueueinfo 336 i386 perf_event_open sys_perf_event_open __ia32_sys_perf_event_open -337 i386 recvmmsg sys_recvmmsg __ia32_compat_sys_recvmmsg +337 i386 recvmmsg sys_recvmmsg __ia32_compat_sys_recvmmsg_time32 338 i386 fanotify_init sys_fanotify_init __ia32_sys_fanotify_init 339 i386 fanotify_mark sys_fanotify_mark __ia32_compat_sys_fanotify_mark 340 i386 prlimit64 sys_prlimit64 __ia32_sys_prlimit64 341 i386 name_to_handle_at sys_name_to_handle_at __ia32_sys_name_to_handle_at 342 i386 open_by_handle_at sys_open_by_handle_at __ia32_compat_sys_open_by_handle_at -343 i386 clock_adjtime sys_clock_adjtime __ia32_compat_sys_clock_adjtime +343 i386 clock_adjtime sys_clock_adjtime __ia32_sys_clock_adjtime32 344 i386 syncfs sys_syncfs __ia32_sys_syncfs 345 i386 sendmmsg sys_sendmmsg __ia32_compat_sys_sendmmsg 346 i386 setns sys_setns __ia32_sys_setns diff --git a/fs/aio.c b/fs/aio.c index b906ff70c90f..4394d3fe116a 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -2198,11 +2198,11 @@ SYSCALL_DEFINE6(io_pgetevents_time32, #if defined(CONFIG_COMPAT_32BIT_TIME) -COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id, - compat_long_t, min_nr, - compat_long_t, nr, - struct io_event __user *, events, - struct old_timespec32 __user *, timeout) +SYSCALL_DEFINE5(io_getevents_time32, __u32, ctx_id, + __s32, min_nr, + __s32, nr, + struct io_event __user *, events, + struct old_timespec32 __user *, timeout) { struct timespec64 t; int ret; diff --git a/fs/select.c b/fs/select.c index d0f35dbc0e8f..6cbc9ff56ba0 100644 --- a/fs/select.c +++ b/fs/select.c @@ -1379,7 +1379,7 @@ COMPAT_SYSCALL_DEFINE6(pselect6_time64, int, n, compat_ulong_t __user *, inp, #if defined(CONFIG_COMPAT_32BIT_TIME) -COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp, +COMPAT_SYSCALL_DEFINE6(pselect6_time32, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct old_timespec32 __user *, tsp, void __user *, sig) { @@ -1402,7 +1402,7 @@ COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp, #endif #if defined(CONFIG_COMPAT_32BIT_TIME) -COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, +COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds, struct old_timespec32 __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { diff --git a/fs/timerfd.c b/fs/timerfd.c index 803ca070d42e..6a6fc8aa1de7 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -560,7 +560,7 @@ SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct __kernel_itimerspec __user *, } #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, +SYSCALL_DEFINE4(timerfd_settime32, int, ufd, int, flags, const struct old_itimerspec32 __user *, utmr, struct old_itimerspec32 __user *, otmr) { @@ -577,7 +577,7 @@ COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, return ret; } -COMPAT_SYSCALL_DEFINE2(timerfd_gettime, int, ufd, +SYSCALL_DEFINE2(timerfd_gettime32, int, ufd, struct old_itimerspec32 __user *, otmr) { struct itimerspec64 kotmr; diff --git a/fs/utimes.c b/fs/utimes.c index bdcf2daf39c1..350c9c16ace1 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -224,8 +224,8 @@ SYSCALL_DEFINE2(utime, char __user *, filename, struct utimbuf __user *, times) * of sys_utimes. */ #ifdef __ARCH_WANT_SYS_UTIME32 -COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename, - struct old_utimbuf32 __user *, t) +SYSCALL_DEFINE2(utime32, const char __user *, filename, + struct old_utimbuf32 __user *, t) { struct timespec64 tv[2]; @@ -240,7 +240,7 @@ COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename, } #endif -COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct old_timespec32 __user *, t, int, flags) +SYSCALL_DEFINE4(utimensat_time32, unsigned int, dfd, const char __user *, filename, struct old_timespec32 __user *, t, int, flags) { struct timespec64 tv[2]; @@ -276,14 +276,14 @@ static long do_compat_futimesat(unsigned int dfd, const char __user *filename, return do_utimes(dfd, filename, t ? tv : NULL, 0); } -COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, +SYSCALL_DEFINE3(futimesat_time32, unsigned int, dfd, const char __user *, filename, struct old_timeval32 __user *, t) { return do_compat_futimesat(dfd, filename, t); } -COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct old_timeval32 __user *, t) +SYSCALL_DEFINE2(utimes_time32, const char __user *, filename, struct old_timeval32 __user *, t) { return do_compat_futimesat(AT_FDCWD, filename, t); } diff --git a/include/linux/compat.h b/include/linux/compat.h index 657ca6abd855..ebddcb6cfcf8 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -520,11 +520,6 @@ int __compat_save_altstack(compat_stack_t __user *, unsigned long); asmlinkage long compat_sys_io_setup(unsigned nr_reqs, u32 __user *ctx32p); asmlinkage long compat_sys_io_submit(compat_aio_context_t ctx_id, int nr, u32 __user *iocb); -asmlinkage long compat_sys_io_getevents(compat_aio_context_t ctx_id, - compat_long_t min_nr, - compat_long_t nr, - struct io_event __user *events, - struct old_timespec32 __user *timeout); asmlinkage long compat_sys_io_pgetevents(compat_aio_context_t ctx_id, compat_long_t min_nr, compat_long_t nr, @@ -617,7 +612,7 @@ asmlinkage long compat_sys_sendfile64(int out_fd, int in_fd, compat_loff_t __user *offset, compat_size_t count); /* fs/select.c */ -asmlinkage long compat_sys_pselect6(int n, compat_ulong_t __user *inp, +asmlinkage long compat_sys_pselect6_time32(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, struct old_timespec32 __user *tsp, @@ -627,7 +622,7 @@ asmlinkage long compat_sys_pselect6_time64(int n, compat_ulong_t __user *inp, compat_ulong_t __user *exp, struct __kernel_timespec __user *tsp, void __user *sig); -asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, +asmlinkage long compat_sys_ppoll_time32(struct pollfd __user *ufds, unsigned int nfds, struct old_timespec32 __user *tsp, const compat_sigset_t __user *sigmask, @@ -657,19 +652,6 @@ asmlinkage long compat_sys_newfstat(unsigned int fd, /* fs/sync.c: No generic prototype for sync_file_range and sync_file_range2 */ -/* fs/timerfd.c */ -asmlinkage long compat_sys_timerfd_gettime(int ufd, - struct old_itimerspec32 __user *otmr); -asmlinkage long compat_sys_timerfd_settime(int ufd, int flags, - const struct old_itimerspec32 __user *utmr, - struct old_itimerspec32 __user *otmr); - -/* fs/utimes.c */ -asmlinkage long compat_sys_utimensat(unsigned int dfd, - const char __user *filename, - struct old_timespec32 __user *t, - int flags); - /* kernel/exit.c */ asmlinkage long compat_sys_waitid(int, compat_pid_t, struct compat_siginfo __user *, int, @@ -678,9 +660,6 @@ asmlinkage long compat_sys_waitid(int, compat_pid_t, /* kernel/futex.c */ -asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, - struct old_timespec32 __user *utime, u32 __user *uaddr2, - u32 val3); asmlinkage long compat_sys_set_robust_list(struct compat_robust_list_head __user *head, compat_size_t len); @@ -688,10 +667,6 @@ asmlinkage long compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, compat_size_t __user *len_ptr); -/* kernel/hrtimer.c */ -asmlinkage long compat_sys_nanosleep(struct old_timespec32 __user *rqtp, - struct old_timespec32 __user *rmtp); - /* kernel/itimer.c */ asmlinkage long compat_sys_getitimer(int which, struct compat_itimerval __user *it); @@ -709,20 +684,6 @@ asmlinkage long compat_sys_kexec_load(compat_ulong_t entry, asmlinkage long compat_sys_timer_create(clockid_t which_clock, struct compat_sigevent __user *timer_event_spec, timer_t __user *created_timer_id); -asmlinkage long compat_sys_timer_gettime(timer_t timer_id, - struct old_itimerspec32 __user *setting); -asmlinkage long compat_sys_timer_settime(timer_t timer_id, int flags, - struct old_itimerspec32 __user *new, - struct old_itimerspec32 __user *old); -asmlinkage long compat_sys_clock_settime(clockid_t which_clock, - struct old_timespec32 __user *tp); -asmlinkage long compat_sys_clock_gettime(clockid_t which_clock, - struct old_timespec32 __user *tp); -asmlinkage long compat_sys_clock_getres(clockid_t which_clock, - struct old_timespec32 __user *tp); -asmlinkage long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, - struct old_timespec32 __user *rqtp, - struct old_timespec32 __user *rmtp); /* kernel/ptrace.c */ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, @@ -735,8 +696,6 @@ asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid, asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, compat_ulong_t __user *user_mask_ptr); -asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid, - struct old_timespec32 __user *interval); /* kernel/signal.c */ asmlinkage long compat_sys_sigaltstack(const compat_stack_t __user *uss_ptr, @@ -754,7 +713,7 @@ asmlinkage long compat_sys_rt_sigprocmask(int how, compat_sigset_t __user *set, compat_size_t sigsetsize); asmlinkage long compat_sys_rt_sigpending(compat_sigset_t __user *uset, compat_size_t sigsetsize); -asmlinkage long compat_sys_rt_sigtimedwait(compat_sigset_t __user *uthese, +asmlinkage long compat_sys_rt_sigtimedwait_time32(compat_sigset_t __user *uthese, struct compat_siginfo __user *uinfo, struct old_timespec32 __user *uts, compat_size_t sigsetsize); asmlinkage long compat_sys_rt_sigtimedwait_time64(compat_sigset_t __user *uthese, @@ -777,7 +736,6 @@ asmlinkage long compat_sys_gettimeofday(struct old_timeval32 __user *tv, struct timezone __user *tz); asmlinkage long compat_sys_settimeofday(struct old_timeval32 __user *tv, struct timezone __user *tz); -asmlinkage long compat_sys_adjtimex(struct old_timex32 __user *utp); /* kernel/timer.c */ asmlinkage long compat_sys_sysinfo(struct compat_sysinfo __user *info); @@ -786,14 +744,6 @@ asmlinkage long compat_sys_sysinfo(struct compat_sysinfo __user *info); asmlinkage long compat_sys_mq_open(const char __user *u_name, int oflag, compat_mode_t mode, struct compat_mq_attr __user *u_attr); -asmlinkage long compat_sys_mq_timedsend(mqd_t mqdes, - const char __user *u_msg_ptr, - compat_size_t msg_len, unsigned int msg_prio, - const struct old_timespec32 __user *u_abs_timeout); -asmlinkage ssize_t compat_sys_mq_timedreceive(mqd_t mqdes, - char __user *u_msg_ptr, - compat_size_t msg_len, unsigned int __user *u_msg_prio, - const struct old_timespec32 __user *u_abs_timeout); asmlinkage long compat_sys_mq_notify(mqd_t mqdes, const struct compat_sigevent __user *u_notification); asmlinkage long compat_sys_mq_getsetattr(mqd_t mqdes, @@ -809,8 +759,6 @@ asmlinkage long compat_sys_msgsnd(int msqid, compat_uptr_t msgp, /* ipc/sem.c */ asmlinkage long compat_sys_semctl(int semid, int semnum, int cmd, int arg); -asmlinkage long compat_sys_semtimedop(int semid, struct sembuf __user *tsems, - unsigned nsems, const struct old_timespec32 __user *timeout); /* ipc/shm.c */ asmlinkage long compat_sys_shmctl(int first, int second, void __user *uptr); @@ -868,7 +816,7 @@ asmlinkage long compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, asmlinkage long compat_sys_recvmmsg_time64(int fd, struct compat_mmsghdr __user *mmsg, unsigned vlen, unsigned int flags, struct __kernel_timespec __user *timeout); -asmlinkage long compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg, +asmlinkage long compat_sys_recvmmsg_time32(int fd, struct compat_mmsghdr __user *mmsg, unsigned vlen, unsigned int flags, struct old_timespec32 __user *timeout); asmlinkage long compat_sys_wait4(compat_pid_t pid, @@ -879,8 +827,6 @@ asmlinkage long compat_sys_fanotify_mark(int, unsigned int, __u32, __u32, asmlinkage long compat_sys_open_by_handle_at(int mountdirfd, struct file_handle __user *handle, int flags); -asmlinkage long compat_sys_clock_adjtime(clockid_t which_clock, - struct old_timex32 __user *tp); asmlinkage long compat_sys_sendmmsg(int fd, struct compat_mmsghdr __user *mmsg, unsigned vlen, unsigned int flags); asmlinkage ssize_t compat_sys_process_vm_readv(compat_pid_t pid, @@ -921,8 +867,6 @@ asmlinkage long compat_sys_pwritev64v2(unsigned long fd, /* __ARCH_WANT_SYSCALL_NO_AT */ asmlinkage long compat_sys_open(const char __user *filename, int flags, umode_t mode); -asmlinkage long compat_sys_utimes(const char __user *filename, - struct old_timeval32 __user *t); /* __ARCH_WANT_SYSCALL_NO_FLAGS */ asmlinkage long compat_sys_signalfd(int ufd, @@ -936,12 +880,6 @@ asmlinkage long compat_sys_newlstat(const char __user *filename, struct compat_stat __user *statbuf); /* __ARCH_WANT_SYSCALL_DEPRECATED */ -asmlinkage long compat_sys_time(old_time32_t __user *tloc); -asmlinkage long compat_sys_utime(const char __user *filename, - struct old_utimbuf32 __user *t); -asmlinkage long compat_sys_futimesat(unsigned int dfd, - const char __user *filename, - struct old_timeval32 __user *t); asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, struct old_timeval32 __user *tvp); @@ -976,9 +914,6 @@ asmlinkage long compat_sys_sigaction(int sig, struct compat_old_sigaction __user *oact); #endif -/* obsolete: kernel/time/time.c */ -asmlinkage long compat_sys_stime(old_time32_t __user *tptr); - /* obsolete: net/socket.c */ asmlinkage long compat_sys_socketcall(int call, u32 __user *args); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 09330d5bda0c..94369f5bd8e5 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -297,6 +297,11 @@ asmlinkage long sys_io_getevents(aio_context_t ctx_id, long nr, struct io_event __user *events, struct __kernel_timespec __user *timeout); +asmlinkage long sys_io_getevents_time32(__u32 ctx_id, + __s32 min_nr, + __s32 nr, + struct io_event __user *events, + struct old_timespec32 __user *timeout); asmlinkage long sys_io_pgetevents(aio_context_t ctx_id, long min_nr, long nr, @@ -522,11 +527,19 @@ asmlinkage long sys_timerfd_settime(int ufd, int flags, const struct __kernel_itimerspec __user *utmr, struct __kernel_itimerspec __user *otmr); asmlinkage long sys_timerfd_gettime(int ufd, struct __kernel_itimerspec __user *otmr); +asmlinkage long sys_timerfd_gettime32(int ufd, + struct old_itimerspec32 __user *otmr); +asmlinkage long sys_timerfd_settime32(int ufd, int flags, + const struct old_itimerspec32 __user *utmr, + struct old_itimerspec32 __user *otmr); /* fs/utimes.c */ asmlinkage long sys_utimensat(int dfd, const char __user *filename, struct __kernel_timespec __user *utimes, int flags); +asmlinkage long sys_utimensat_time32(unsigned int dfd, + const char __user *filename, + struct old_timespec32 __user *t, int flags); /* kernel/acct.c */ asmlinkage long sys_acct(const char __user *name); @@ -555,6 +568,9 @@ asmlinkage long sys_unshare(unsigned long unshare_flags); asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, struct __kernel_timespec __user *utime, u32 __user *uaddr2, u32 val3); +asmlinkage long sys_futex_time32(u32 __user *uaddr, int op, u32 val, + struct old_timespec32 __user *utime, u32 __user *uaddr2, + u32 val3); asmlinkage long sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr, size_t __user *len_ptr); @@ -564,6 +580,8 @@ asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, /* kernel/hrtimer.c */ asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp, struct __kernel_timespec __user *rmtp); +asmlinkage long sys_nanosleep_time32(struct old_timespec32 __user *rqtp, + struct old_timespec32 __user *rmtp); /* kernel/itimer.c */ asmlinkage long sys_getitimer(int which, struct itimerval __user *value); @@ -602,6 +620,20 @@ asmlinkage long sys_clock_getres(clockid_t which_clock, asmlinkage long sys_clock_nanosleep(clockid_t which_clock, int flags, const struct __kernel_timespec __user *rqtp, struct __kernel_timespec __user *rmtp); +asmlinkage long sys_timer_gettime32(timer_t timer_id, + struct old_itimerspec32 __user *setting); +asmlinkage long sys_timer_settime32(timer_t timer_id, int flags, + struct old_itimerspec32 __user *new, + struct old_itimerspec32 __user *old); +asmlinkage long sys_clock_settime32(clockid_t which_clock, + struct old_timespec32 __user *tp); +asmlinkage long sys_clock_gettime32(clockid_t which_clock, + struct old_timespec32 __user *tp); +asmlinkage long sys_clock_getres_time32(clockid_t which_clock, + struct old_timespec32 __user *tp); +asmlinkage long sys_clock_nanosleep_time32(clockid_t which_clock, int flags, + struct old_timespec32 __user *rqtp, + struct old_timespec32 __user *rmtp); /* kernel/printk.c */ asmlinkage long sys_syslog(int type, char __user *buf, int len); @@ -627,6 +659,8 @@ asmlinkage long sys_sched_get_priority_max(int policy); asmlinkage long sys_sched_get_priority_min(int policy); asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct __kernel_timespec __user *interval); +asmlinkage long sys_sched_rr_get_interval_time32(pid_t pid, + struct old_timespec32 __user *interval); /* kernel/signal.c */ asmlinkage long sys_restart_syscall(void); @@ -696,6 +730,7 @@ asmlinkage long sys_gettimeofday(struct timeval __user *tv, asmlinkage long sys_settimeofday(struct timeval __user *tv, struct timezone __user *tz); asmlinkage long sys_adjtimex(struct __kernel_timex __user *txc_p); +asmlinkage long sys_adjtimex_time32(struct old_timex32 __user *txc_p); /* kernel/timer.c */ asmlinkage long sys_getpid(void); @@ -714,6 +749,14 @@ asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *msg_ptr, size_t asmlinkage long sys_mq_timedreceive(mqd_t mqdes, char __user *msg_ptr, size_t msg_len, unsigned int __user *msg_prio, const struct __kernel_timespec __user *abs_timeout); asmlinkage long sys_mq_notify(mqd_t mqdes, const struct sigevent __user *notification); asmlinkage long sys_mq_getsetattr(mqd_t mqdes, const struct mq_attr __user *mqstat, struct mq_attr __user *omqstat); +asmlinkage long sys_mq_timedreceive_time32(mqd_t mqdes, + char __user *u_msg_ptr, + unsigned int msg_len, unsigned int __user *u_msg_prio, + const struct old_timespec32 __user *u_abs_timeout); +asmlinkage long sys_mq_timedsend_time32(mqd_t mqdes, + const char __user *u_msg_ptr, + unsigned int msg_len, unsigned int msg_prio, + const struct old_timespec32 __user *u_abs_timeout); /* ipc/msg.c */ asmlinkage long sys_msgget(key_t key, int msgflg); @@ -731,6 +774,9 @@ asmlinkage long sys_old_semctl(int semid, int semnum, int cmd, unsigned long arg asmlinkage long sys_semtimedop(int semid, struct sembuf __user *sops, unsigned nsops, const struct __kernel_timespec __user *timeout); +asmlinkage long sys_semtimedop_time32(int semid, struct sembuf __user *sops, + unsigned nsops, + const struct old_timespec32 __user *timeout); asmlinkage long sys_semop(int semid, struct sembuf __user *sops, unsigned nsops); @@ -871,6 +917,8 @@ asmlinkage long sys_open_by_handle_at(int mountdirfd, int flags); asmlinkage long sys_clock_adjtime(clockid_t which_clock, struct __kernel_timex __user *tx); +asmlinkage long sys_clock_adjtime32(clockid_t which_clock, + struct old_timex32 __user *tx); asmlinkage long sys_syncfs(int fd); asmlinkage long sys_setns(int fd, int nstype); asmlinkage long sys_sendmmsg(int fd, struct mmsghdr __user *msg, @@ -1006,6 +1054,7 @@ asmlinkage long sys_alarm(unsigned int seconds); asmlinkage long sys_getpgrp(void); asmlinkage long sys_pause(void); asmlinkage long sys_time(time_t __user *tloc); +asmlinkage long sys_time32(old_time32_t __user *tloc); #ifdef __ARCH_WANT_SYS_UTIME asmlinkage long sys_utime(char __user *filename, struct utimbuf __user *times); @@ -1014,6 +1063,13 @@ asmlinkage long sys_utimes(char __user *filename, asmlinkage long sys_futimesat(int dfd, const char __user *filename, struct timeval __user *utimes); #endif +asmlinkage long sys_futimesat_time32(unsigned int dfd, + const char __user *filename, + struct old_timeval32 __user *t); +asmlinkage long sys_utime32(const char __user *filename, + struct old_utimbuf32 __user *t); +asmlinkage long sys_utimes_time32(const char __user *filename, + struct old_timeval32 __user *t); asmlinkage long sys_creat(const char __user *pathname, umode_t mode); asmlinkage long sys_getdents(unsigned int fd, struct linux_dirent __user *dirent, @@ -1038,6 +1094,7 @@ asmlinkage long sys_fork(void); /* obsolete: kernel/time/time.c */ asmlinkage long sys_stime(time_t __user *tptr); +asmlinkage long sys_stime32(old_time32_t __user *tptr); /* obsolete: kernel/signal.c */ asmlinkage long sys_sigpending(old_sigset_t __user *uset); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 509484dbfd5d..153b55b94234 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -39,7 +39,7 @@ __SC_COMP(__NR_io_submit, sys_io_submit, compat_sys_io_submit) #define __NR_io_cancel 3 __SYSCALL(__NR_io_cancel, sys_io_cancel) #define __NR_io_getevents 4 -__SC_COMP(__NR_io_getevents, sys_io_getevents, compat_sys_io_getevents) +__SC_COMP(__NR_io_getevents, sys_io_getevents, sys_io_getevents_time32) /* fs/xattr.c */ #define __NR_setxattr 5 @@ -223,9 +223,9 @@ __SYSCALL(__NR3264_sendfile, sys_sendfile64) /* fs/select.c */ #define __NR_pselect6 72 -__SC_COMP(__NR_pselect6, sys_pselect6, compat_sys_pselect6) +__SC_COMP(__NR_pselect6, sys_pselect6, compat_sys_pselect6_time32) #define __NR_ppoll 73 -__SC_COMP(__NR_ppoll, sys_ppoll, compat_sys_ppoll) +__SC_COMP(__NR_ppoll, sys_ppoll, compat_sys_ppoll_time32) /* fs/signalfd.c */ #define __NR_signalfd4 74 @@ -271,14 +271,14 @@ __SC_COMP(__NR_sync_file_range, sys_sync_file_range, \ __SYSCALL(__NR_timerfd_create, sys_timerfd_create) #define __NR_timerfd_settime 86 __SC_COMP(__NR_timerfd_settime, sys_timerfd_settime, \ - compat_sys_timerfd_settime) + sys_timerfd_settime32) #define __NR_timerfd_gettime 87 __SC_COMP(__NR_timerfd_gettime, sys_timerfd_gettime, \ - compat_sys_timerfd_gettime) + sys_timerfd_gettime32) /* fs/utimes.c */ #define __NR_utimensat 88 -__SC_COMP(__NR_utimensat, sys_utimensat, compat_sys_utimensat) +__SC_COMP(__NR_utimensat, sys_utimensat, sys_utimensat_time32) /* kernel/acct.c */ #define __NR_acct 89 @@ -310,7 +310,7 @@ __SYSCALL(__NR_unshare, sys_unshare) /* kernel/futex.c */ #define __NR_futex 98 -__SC_COMP(__NR_futex, sys_futex, compat_sys_futex) +__SC_COMP(__NR_futex, sys_futex, sys_futex_time32) #define __NR_set_robust_list 99 __SC_COMP(__NR_set_robust_list, sys_set_robust_list, \ compat_sys_set_robust_list) @@ -320,7 +320,7 @@ __SC_COMP(__NR_get_robust_list, sys_get_robust_list, \ /* kernel/hrtimer.c */ #define __NR_nanosleep 101 -__SC_COMP(__NR_nanosleep, sys_nanosleep, compat_sys_nanosleep) +__SC_COMP(__NR_nanosleep, sys_nanosleep, sys_nanosleep_time32) /* kernel/itimer.c */ #define __NR_getitimer 102 @@ -342,22 +342,22 @@ __SYSCALL(__NR_delete_module, sys_delete_module) #define __NR_timer_create 107 __SC_COMP(__NR_timer_create, sys_timer_create, compat_sys_timer_create) #define __NR_timer_gettime 108 -__SC_COMP(__NR_timer_gettime, sys_timer_gettime, compat_sys_timer_gettime) +__SC_COMP(__NR_timer_gettime, sys_timer_gettime, sys_timer_gettime32) #define __NR_timer_getoverrun 109 __SYSCALL(__NR_timer_getoverrun, sys_timer_getoverrun) #define __NR_timer_settime 110 -__SC_COMP(__NR_timer_settime, sys_timer_settime, compat_sys_timer_settime) +__SC_COMP(__NR_timer_settime, sys_timer_settime, sys_timer_settime32) #define __NR_timer_delete 111 __SYSCALL(__NR_timer_delete, sys_timer_delete) #define __NR_clock_settime 112 -__SC_COMP(__NR_clock_settime, sys_clock_settime, compat_sys_clock_settime) +__SC_COMP(__NR_clock_settime, sys_clock_settime, sys_clock_settime32) #define __NR_clock_gettime 113 -__SC_COMP(__NR_clock_gettime, sys_clock_gettime, compat_sys_clock_gettime) +__SC_COMP(__NR_clock_gettime, sys_clock_gettime, sys_clock_gettime32) #define __NR_clock_getres 114 -__SC_COMP(__NR_clock_getres, sys_clock_getres, compat_sys_clock_getres) +__SC_COMP(__NR_clock_getres, sys_clock_getres, sys_clock_getres_time32) #define __NR_clock_nanosleep 115 __SC_COMP(__NR_clock_nanosleep, sys_clock_nanosleep, \ - compat_sys_clock_nanosleep) + sys_clock_nanosleep_time32) /* kernel/printk.c */ #define __NR_syslog 116 @@ -390,7 +390,7 @@ __SYSCALL(__NR_sched_get_priority_max, sys_sched_get_priority_max) __SYSCALL(__NR_sched_get_priority_min, sys_sched_get_priority_min) #define __NR_sched_rr_get_interval 127 __SC_COMP(__NR_sched_rr_get_interval, sys_sched_rr_get_interval, \ - compat_sys_sched_rr_get_interval) + sys_sched_rr_get_interval_time32) /* kernel/signal.c */ #define __NR_restart_syscall 128 @@ -413,7 +413,7 @@ __SC_COMP(__NR_rt_sigprocmask, sys_rt_sigprocmask, compat_sys_rt_sigprocmask) __SC_COMP(__NR_rt_sigpending, sys_rt_sigpending, compat_sys_rt_sigpending) #define __NR_rt_sigtimedwait 137 __SC_COMP(__NR_rt_sigtimedwait, sys_rt_sigtimedwait, \ - compat_sys_rt_sigtimedwait) + compat_sys_rt_sigtimedwait_time32) #define __NR_rt_sigqueueinfo 138 __SC_COMP(__NR_rt_sigqueueinfo, sys_rt_sigqueueinfo, \ compat_sys_rt_sigqueueinfo) @@ -486,7 +486,7 @@ __SC_COMP(__NR_gettimeofday, sys_gettimeofday, compat_sys_gettimeofday) #define __NR_settimeofday 170 __SC_COMP(__NR_settimeofday, sys_settimeofday, compat_sys_settimeofday) #define __NR_adjtimex 171 -__SC_COMP(__NR_adjtimex, sys_adjtimex, compat_sys_adjtimex) +__SC_COMP(__NR_adjtimex, sys_adjtimex, sys_adjtimex_time32) /* kernel/timer.c */ #define __NR_getpid 172 @@ -512,10 +512,10 @@ __SC_COMP(__NR_mq_open, sys_mq_open, compat_sys_mq_open) #define __NR_mq_unlink 181 __SYSCALL(__NR_mq_unlink, sys_mq_unlink) #define __NR_mq_timedsend 182 -__SC_COMP(__NR_mq_timedsend, sys_mq_timedsend, compat_sys_mq_timedsend) +__SC_COMP(__NR_mq_timedsend, sys_mq_timedsend, sys_mq_timedsend_time32) #define __NR_mq_timedreceive 183 __SC_COMP(__NR_mq_timedreceive, sys_mq_timedreceive, \ - compat_sys_mq_timedreceive) + sys_mq_timedreceive_time32) #define __NR_mq_notify 184 __SC_COMP(__NR_mq_notify, sys_mq_notify, compat_sys_mq_notify) #define __NR_mq_getsetattr 185 @@ -537,7 +537,7 @@ __SYSCALL(__NR_semget, sys_semget) #define __NR_semctl 191 __SC_COMP(__NR_semctl, sys_semctl, compat_sys_semctl) #define __NR_semtimedop 192 -__SC_COMP(__NR_semtimedop, sys_semtimedop, compat_sys_semtimedop) +__SC_COMP(__NR_semtimedop, sys_semtimedop, sys_semtimedop_time32) #define __NR_semop 193 __SYSCALL(__NR_semop, sys_semop) @@ -659,7 +659,7 @@ __SYSCALL(__NR_perf_event_open, sys_perf_event_open) #define __NR_accept4 242 __SYSCALL(__NR_accept4, sys_accept4) #define __NR_recvmmsg 243 -__SC_COMP(__NR_recvmmsg, sys_recvmmsg, compat_sys_recvmmsg) +__SC_COMP(__NR_recvmmsg, sys_recvmmsg, compat_sys_recvmmsg_time32) /* * Architectures may provide up to 16 syscalls of their own @@ -681,7 +681,7 @@ __SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at) __SC_COMP(__NR_open_by_handle_at, sys_open_by_handle_at, \ compat_sys_open_by_handle_at) #define __NR_clock_adjtime 266 -__SC_COMP(__NR_clock_adjtime, sys_clock_adjtime, compat_sys_clock_adjtime) +__SC_COMP(__NR_clock_adjtime, sys_clock_adjtime, sys_clock_adjtime32) #define __NR_syncfs 267 __SYSCALL(__NR_syncfs, sys_syncfs) #define __NR_setns 268 diff --git a/ipc/mqueue.c b/ipc/mqueue.c index c595bed7bfcb..c839bf83231d 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -1471,10 +1471,10 @@ static int compat_prepare_timeout(const struct old_timespec32 __user *p, return 0; } -COMPAT_SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, - const char __user *, u_msg_ptr, - compat_size_t, msg_len, unsigned int, msg_prio, - const struct old_timespec32 __user *, u_abs_timeout) +SYSCALL_DEFINE5(mq_timedsend_time32, mqd_t, mqdes, + const char __user *, u_msg_ptr, + unsigned int, msg_len, unsigned int, msg_prio, + const struct old_timespec32 __user *, u_abs_timeout) { struct timespec64 ts, *p = NULL; if (u_abs_timeout) { @@ -1486,10 +1486,10 @@ COMPAT_SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, return do_mq_timedsend(mqdes, u_msg_ptr, msg_len, msg_prio, p); } -COMPAT_SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, - char __user *, u_msg_ptr, - compat_size_t, msg_len, unsigned int __user *, u_msg_prio, - const struct old_timespec32 __user *, u_abs_timeout) +SYSCALL_DEFINE5(mq_timedreceive_time32, mqd_t, mqdes, + char __user *, u_msg_ptr, + unsigned int, msg_len, unsigned int __user *, u_msg_prio, + const struct old_timespec32 __user *, u_abs_timeout) { struct timespec64 ts, *p = NULL; if (u_abs_timeout) { diff --git a/ipc/sem.c b/ipc/sem.c index d1efff3a81bb..80909464acff 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -2250,7 +2250,7 @@ long compat_ksys_semtimedop(int semid, struct sembuf __user *tsems, return do_semtimedop(semid, tsems, nsops, NULL); } -COMPAT_SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsems, +SYSCALL_DEFINE4(semtimedop_time32, int, semid, struct sembuf __user *, tsems, unsigned int, nsops, const struct old_timespec32 __user *, timeout) { diff --git a/kernel/futex.c b/kernel/futex.c index be3bff2315ff..caead6c113d4 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -3812,7 +3812,7 @@ err_unlock: #endif /* CONFIG_COMPAT */ #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, +SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, struct old_timespec32 __user *, utime, u32 __user *, uaddr2, u32, val3) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a674c7db2f29..62862419cd05 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5252,9 +5252,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, } #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, - compat_pid_t, pid, - struct old_timespec32 __user *, interval) +SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, + struct old_timespec32 __user *, interval) { struct timespec64 t; int retval = sched_rr_get_interval(pid, &t); diff --git a/kernel/signal.c b/kernel/signal.c index e1d7ad8e6ab1..af27629918cf 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3397,7 +3397,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time64, compat_sigset_t __user *, uthese, } #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, +COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time32, compat_sigset_t __user *, uthese, struct compat_siginfo __user *, uinfo, struct old_timespec32 __user *, uts, compat_size_t, sigsetsize) { diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index ce04431a40d1..85e5ccec0955 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -42,9 +42,11 @@ COND_SYSCALL(io_destroy); COND_SYSCALL(io_submit); COND_SYSCALL_COMPAT(io_submit); COND_SYSCALL(io_cancel); +COND_SYSCALL(io_getevents_time32); COND_SYSCALL(io_getevents); +COND_SYSCALL(io_pgetevents_time32); COND_SYSCALL(io_pgetevents); -COND_SYSCALL_COMPAT(io_getevents); +COND_SYSCALL_COMPAT(io_pgetevents_time32); COND_SYSCALL_COMPAT(io_pgetevents); /* fs/xattr.c */ @@ -114,9 +116,9 @@ COND_SYSCALL_COMPAT(signalfd4); /* fs/timerfd.c */ COND_SYSCALL(timerfd_create); COND_SYSCALL(timerfd_settime); -COND_SYSCALL_COMPAT(timerfd_settime); +COND_SYSCALL(timerfd_settime32); COND_SYSCALL(timerfd_gettime); -COND_SYSCALL_COMPAT(timerfd_gettime); +COND_SYSCALL(timerfd_gettime32); /* fs/utimes.c */ @@ -135,7 +137,7 @@ COND_SYSCALL(capset); /* kernel/futex.c */ COND_SYSCALL(futex); -COND_SYSCALL_COMPAT(futex); +COND_SYSCALL(futex_time32); COND_SYSCALL(set_robust_list); COND_SYSCALL_COMPAT(set_robust_list); COND_SYSCALL(get_robust_list); @@ -187,9 +189,9 @@ COND_SYSCALL(mq_open); COND_SYSCALL_COMPAT(mq_open); COND_SYSCALL(mq_unlink); COND_SYSCALL(mq_timedsend); -COND_SYSCALL_COMPAT(mq_timedsend); +COND_SYSCALL(mq_timedsend_time32); COND_SYSCALL(mq_timedreceive); -COND_SYSCALL_COMPAT(mq_timedreceive); +COND_SYSCALL(mq_timedreceive_time32); COND_SYSCALL(mq_notify); COND_SYSCALL_COMPAT(mq_notify); COND_SYSCALL(mq_getsetattr); @@ -211,7 +213,7 @@ COND_SYSCALL(old_semctl); COND_SYSCALL(semctl); COND_SYSCALL_COMPAT(semctl); COND_SYSCALL(semtimedop); -COND_SYSCALL_COMPAT(semtimedop); +COND_SYSCALL(semtimedop_time32); COND_SYSCALL(semop); /* ipc/shm.c */ @@ -288,7 +290,7 @@ COND_SYSCALL(perf_event_open); COND_SYSCALL(accept4); COND_SYSCALL(recvmmsg); COND_SYSCALL(recvmmsg_time32); -COND_SYSCALL_COMPAT(recvmmsg); +COND_SYSCALL_COMPAT(recvmmsg_time32); COND_SYSCALL_COMPAT(recvmmsg_time64); /* diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index f5cfa1b73d6f..0f5f96075110 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1771,7 +1771,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE2(nanosleep, struct old_timespec32 __user *, rqtp, +SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, struct old_timespec32 __user *, rmtp) { struct timespec64 tu; diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index a51895486e5e..67df65f887ac 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -45,6 +45,7 @@ SYS_NI(timer_delete); SYS_NI(clock_adjtime); SYS_NI(getitimer); SYS_NI(setitimer); +SYS_NI(clock_adjtime32); #ifdef __ARCH_WANT_SYS_ALARM SYS_NI(alarm); #endif @@ -150,16 +151,16 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, #ifdef CONFIG_COMPAT COMPAT_SYS_NI(timer_create); -COMPAT_SYS_NI(clock_adjtime); -COMPAT_SYS_NI(timer_settime); -COMPAT_SYS_NI(timer_gettime); COMPAT_SYS_NI(getitimer); COMPAT_SYS_NI(setitimer); #endif #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, - struct old_timespec32 __user *, tp) +SYS_NI(timer_settime32); +SYS_NI(timer_gettime32); + +SYSCALL_DEFINE2(clock_settime32, const clockid_t, which_clock, + struct old_timespec32 __user *, tp) { struct timespec64 new_tp; @@ -171,8 +172,8 @@ COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, return do_sys_settimeofday64(&new_tp, NULL); } -COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, - struct old_timespec32 __user *, tp) +SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock, + struct old_timespec32 __user *, tp) { int ret; struct timespec64 kernel_tp; @@ -186,8 +187,8 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, return 0; } -COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, - struct old_timespec32 __user *, tp) +SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock, + struct old_timespec32 __user *, tp) { struct timespec64 rtn_tp = { .tv_sec = 0, @@ -206,9 +207,9 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, } } -COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, - struct old_timespec32 __user *, rqtp, - struct old_timespec32 __user *, rmtp) +SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, + struct old_timespec32 __user *, rqtp, + struct old_timespec32 __user *, rmtp) { struct timespec64 t; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index de79f85ae14f..29176635991f 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -730,8 +730,8 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, - struct old_itimerspec32 __user *, setting) +SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id, + struct old_itimerspec32 __user *, setting) { struct itimerspec64 cur_setting; @@ -903,9 +903,9 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, } #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, - struct old_itimerspec32 __user *, new, - struct old_itimerspec32 __user *, old) +SYSCALL_DEFINE4(timer_settime32, timer_t, timer_id, int, flags, + struct old_itimerspec32 __user *, new, + struct old_itimerspec32 __user *, old) { struct itimerspec64 new_spec, old_spec; struct itimerspec64 *rtn = old ? &old_spec : NULL; @@ -1096,8 +1096,8 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, - struct old_timespec32 __user *, tp) +SYSCALL_DEFINE2(clock_settime32, clockid_t, which_clock, + struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; @@ -1111,8 +1111,8 @@ COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, return kc->clock_set(which_clock, &ts); } -COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, - struct old_timespec32 __user *, tp) +SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock, + struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; @@ -1129,8 +1129,8 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, return err; } -COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, - struct old_timex32 __user *, utp) +SYSCALL_DEFINE2(clock_adjtime32, clockid_t, which_clock, + struct old_timex32 __user *, utp) { struct __kernel_timex ktx; int err; @@ -1147,8 +1147,8 @@ COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, return err; } -COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, - struct old_timespec32 __user *, tp) +SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock, + struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; @@ -1204,9 +1204,9 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, - struct old_timespec32 __user *, rqtp, - struct old_timespec32 __user *, rmtp) +SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, + struct old_timespec32 __user *, rqtp, + struct old_timespec32 __user *, rmtp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 t; diff --git a/kernel/time/time.c b/kernel/time/time.c index 78b5c8f1495a..6261f969dcb7 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -98,11 +98,11 @@ SYSCALL_DEFINE1(stime, time_t __user *, tptr) #endif /* __ARCH_WANT_SYS_TIME */ -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_32BIT_TIME #ifdef __ARCH_WANT_COMPAT_SYS_TIME /* old_time32_t is a 32 bit "long" and needs to get converted. */ -COMPAT_SYSCALL_DEFINE1(time, old_time32_t __user *, tloc) +SYSCALL_DEFINE1(time32, old_time32_t __user *, tloc) { old_time32_t i; @@ -116,7 +116,7 @@ COMPAT_SYSCALL_DEFINE1(time, old_time32_t __user *, tloc) return i; } -COMPAT_SYSCALL_DEFINE1(stime, old_time32_t __user *, tptr) +SYSCALL_DEFINE1(stime32, old_time32_t __user *, tptr) { struct timespec64 tv; int err; @@ -344,7 +344,7 @@ int put_old_timex32(struct old_timex32 __user *utp, const struct __kernel_timex return 0; } -COMPAT_SYSCALL_DEFINE1(adjtimex, struct old_timex32 __user *, utp) +SYSCALL_DEFINE1(adjtimex_time32, struct old_timex32 __user *, utp) { struct __kernel_timex txc; int err, ret; diff --git a/net/compat.c b/net/compat.c index 959d1c51826d..2fef7b9db434 100644 --- a/net/compat.c +++ b/net/compat.c @@ -822,7 +822,7 @@ COMPAT_SYSCALL_DEFINE5(recvmmsg_time64, int, fd, struct compat_mmsghdr __user *, } #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE5(recvmmsg, int, fd, struct compat_mmsghdr __user *, mmsg, +COMPAT_SYSCALL_DEFINE5(recvmmsg_time32, int, fd, struct compat_mmsghdr __user *, mmsg, unsigned int, vlen, unsigned int, flags, struct old_timespec32 __user *, timeout) { -- cgit v1.3-7-g2ca7 From 00bf25d693e7f69497cb7f61d46ef99fe295a8a5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 1 Jan 2019 01:13:32 +0100 Subject: y2038: use time32 syscall names on 32-bit This is the big flip, where all 32-bit architectures set COMPAT_32BIT_TIME and use the _time32 system calls from the former compat layer instead of the system calls that take __kernel_timespec and similar arguments. The temporary redirects for __kernel_timespec, __kernel_itimerspec and __kernel_timex can get removed with this. It would be easy to split this commit by architecture, but with the new generated system call tables, it's easy enough to do it all at once, which makes it a little easier to check that the changes are the same in each table. Acked-by: Geert Uytterhoeven Signed-off-by: Arnd Bergmann --- arch/Kconfig | 2 +- arch/arm/kernel/sys_oabi-compat.c | 8 +-- arch/arm/tools/syscall.tbl | 46 ++++++++-------- arch/m68k/kernel/syscalls/syscall.tbl | 42 +++++++-------- arch/microblaze/kernel/syscalls/syscall.tbl | 46 ++++++++-------- arch/mips/kernel/syscalls/syscall_o32.tbl | 44 ++++++++-------- arch/parisc/kernel/syscalls/syscall.tbl | 69 ++++++++++++++++-------- arch/powerpc/kernel/syscalls/syscall.tbl | 82 +++++++++++++++++++++-------- arch/sh/kernel/syscalls/syscall.tbl | 42 +++++++-------- arch/sparc/kernel/syscalls/syscall.tbl | 64 ++++++++++++++-------- arch/x86/entry/syscalls/syscall_32.tbl | 44 ++++++++-------- arch/xtensa/kernel/syscalls/syscall.tbl | 44 ++++++++-------- include/uapi/asm-generic/unistd.h | 56 ++++++++++---------- 13 files changed, 335 insertions(+), 254 deletions(-) (limited to 'include/uapi') diff --git a/arch/Kconfig b/arch/Kconfig index 4cfb6de48f79..46db715a7f42 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -759,7 +759,7 @@ config 64BIT_TIME handling. config COMPAT_32BIT_TIME - def_bool (!64BIT && 64BIT_TIME) || COMPAT + def_bool !64BIT || COMPAT help This enables 32 bit time_t support in addition to 64 bit time_t support. This is relevant on all 32-bit architectures, and 64-bit architectures diff --git a/arch/arm/kernel/sys_oabi-compat.c b/arch/arm/kernel/sys_oabi-compat.c index 92ab36f38795..acd054a42ba2 100644 --- a/arch/arm/kernel/sys_oabi-compat.c +++ b/arch/arm/kernel/sys_oabi-compat.c @@ -317,10 +317,10 @@ struct oabi_sembuf { asmlinkage long sys_oabi_semtimedop(int semid, struct oabi_sembuf __user *tsops, unsigned nsops, - const struct timespec __user *timeout) + const struct old_timespec32 __user *timeout) { struct sembuf *sops; - struct timespec local_timeout; + struct old_timespec32 local_timeout; long err; int i; @@ -350,7 +350,7 @@ asmlinkage long sys_oabi_semtimedop(int semid, } else { mm_segment_t fs = get_fs(); set_fs(KERNEL_DS); - err = sys_semtimedop(semid, sops, nsops, timeout); + err = sys_semtimedop_time32(semid, sops, nsops, timeout); set_fs(fs); } kfree(sops); @@ -375,7 +375,7 @@ asmlinkage int sys_oabi_ipc(uint call, int first, int second, int third, return sys_oabi_semtimedop(first, (struct oabi_sembuf __user *)ptr, second, - (const struct timespec __user *)fifth); + (const struct old_timespec32 __user *)fifth); default: return sys_ipc(call, first, second, third, ptr, fifth); } diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index b54b7f2bc24a..200f4b878a46 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -137,7 +137,7 @@ 121 common setdomainname sys_setdomainname 122 common uname sys_newuname # 123 was sys_modify_ldt -124 common adjtimex sys_adjtimex +124 common adjtimex sys_adjtimex_time32 125 common mprotect sys_mprotect 126 common sigprocmask sys_sigprocmask # 127 was sys_create_module @@ -174,8 +174,8 @@ 158 common sched_yield sys_sched_yield 159 common sched_get_priority_max sys_sched_get_priority_max 160 common sched_get_priority_min sys_sched_get_priority_min -161 common sched_rr_get_interval sys_sched_rr_get_interval -162 common nanosleep sys_nanosleep +161 common sched_rr_get_interval sys_sched_rr_get_interval_time32 +162 common nanosleep sys_nanosleep_time32 163 common mremap sys_mremap 164 common setresuid sys_setresuid16 165 common getresuid sys_getresuid16 @@ -190,7 +190,7 @@ 174 common rt_sigaction sys_rt_sigaction 175 common rt_sigprocmask sys_rt_sigprocmask 176 common rt_sigpending sys_rt_sigpending -177 common rt_sigtimedwait sys_rt_sigtimedwait +177 common rt_sigtimedwait sys_rt_sigtimedwait_time32 178 common rt_sigqueueinfo sys_rt_sigqueueinfo 179 common rt_sigsuspend sys_rt_sigsuspend 180 common pread64 sys_pread64 sys_oabi_pread64 @@ -254,12 +254,12 @@ 237 common fremovexattr sys_fremovexattr 238 common tkill sys_tkill 239 common sendfile64 sys_sendfile64 -240 common futex sys_futex +240 common futex sys_futex_time32 241 common sched_setaffinity sys_sched_setaffinity 242 common sched_getaffinity sys_sched_getaffinity 243 common io_setup sys_io_setup 244 common io_destroy sys_io_destroy -245 common io_getevents sys_io_getevents +245 common io_getevents sys_io_getevents_time32 246 common io_submit sys_io_submit 247 common io_cancel sys_io_cancel 248 common exit_group sys_exit_group @@ -272,14 +272,14 @@ # 255 for get_thread_area 256 common set_tid_address sys_set_tid_address 257 common timer_create sys_timer_create -258 common timer_settime sys_timer_settime -259 common timer_gettime sys_timer_gettime +258 common timer_settime sys_timer_settime32 +259 common timer_gettime sys_timer_gettime32 260 common timer_getoverrun sys_timer_getoverrun 261 common timer_delete sys_timer_delete -262 common clock_settime sys_clock_settime -263 common clock_gettime sys_clock_gettime -264 common clock_getres sys_clock_getres -265 common clock_nanosleep sys_clock_nanosleep +262 common clock_settime sys_clock_settime32 +263 common clock_gettime sys_clock_gettime32 +264 common clock_getres sys_clock_getres_time32 +265 common clock_nanosleep sys_clock_nanosleep_time32 266 common statfs64 sys_statfs64_wrapper 267 common fstatfs64 sys_fstatfs64_wrapper 268 common tgkill sys_tgkill @@ -290,8 +290,8 @@ 273 common pciconfig_write sys_pciconfig_write 274 common mq_open sys_mq_open 275 common mq_unlink sys_mq_unlink -276 common mq_timedsend sys_mq_timedsend -277 common mq_timedreceive sys_mq_timedreceive +276 common mq_timedsend sys_mq_timedsend_time32 +277 common mq_timedreceive sys_mq_timedreceive_time32 278 common mq_notify sys_mq_notify 279 common mq_getsetattr sys_mq_getsetattr 280 common waitid sys_waitid @@ -326,7 +326,7 @@ 309 common add_key sys_add_key 310 common request_key sys_request_key 311 common keyctl sys_keyctl -312 common semtimedop sys_semtimedop sys_oabi_semtimedop +312 common semtimedop sys_semtimedop_time32 sys_oabi_semtimedop 313 common vserver 314 common ioprio_set sys_ioprio_set 315 common ioprio_get sys_ioprio_get @@ -349,8 +349,8 @@ 332 common readlinkat sys_readlinkat 333 common fchmodat sys_fchmodat 334 common faccessat sys_faccessat -335 common pselect6 sys_pselect6 -336 common ppoll sys_ppoll +335 common pselect6 sys_pselect6_time32 +336 common ppoll sys_ppoll_time32 337 common unshare sys_unshare 338 common set_robust_list sys_set_robust_list 339 common get_robust_list sys_get_robust_list @@ -362,13 +362,13 @@ 345 common getcpu sys_getcpu 346 common epoll_pwait sys_epoll_pwait 347 common kexec_load sys_kexec_load -348 common utimensat sys_utimensat +348 common utimensat sys_utimensat_time32 349 common signalfd sys_signalfd 350 common timerfd_create sys_timerfd_create 351 common eventfd sys_eventfd 352 common fallocate sys_fallocate -353 common timerfd_settime sys_timerfd_settime -354 common timerfd_gettime sys_timerfd_gettime +353 common timerfd_settime sys_timerfd_settime32 +354 common timerfd_gettime sys_timerfd_gettime32 355 common signalfd4 sys_signalfd4 356 common eventfd2 sys_eventfd2 357 common epoll_create1 sys_epoll_create1 @@ -379,14 +379,14 @@ 362 common pwritev sys_pwritev 363 common rt_tgsigqueueinfo sys_rt_tgsigqueueinfo 364 common perf_event_open sys_perf_event_open -365 common recvmmsg sys_recvmmsg +365 common recvmmsg sys_recvmmsg_time32 366 common accept4 sys_accept4 367 common fanotify_init sys_fanotify_init 368 common fanotify_mark sys_fanotify_mark 369 common prlimit64 sys_prlimit64 370 common name_to_handle_at sys_name_to_handle_at 371 common open_by_handle_at sys_open_by_handle_at -372 common clock_adjtime sys_clock_adjtime +372 common clock_adjtime sys_clock_adjtime32 373 common syncfs sys_syncfs 374 common sendmmsg sys_sendmmsg 375 common setns sys_setns @@ -413,6 +413,6 @@ 396 common pkey_free sys_pkey_free 397 common statx sys_statx 398 common rseq sys_rseq -399 common io_pgetevents sys_io_pgetevents +399 common io_pgetevents sys_io_pgetevents_time32 400 common migrate_pages sys_migrate_pages 401 common kexec_file_load sys_kexec_file_load diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index bffffb202f8a..01529dce1d2e 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -131,7 +131,7 @@ 121 common setdomainname sys_setdomainname 122 common uname sys_newuname 123 common cacheflush sys_cacheflush -124 common adjtimex sys_adjtimex +124 common adjtimex sys_adjtimex_time32 125 common mprotect sys_mprotect 126 common sigprocmask sys_sigprocmask 127 common create_module sys_ni_syscall @@ -168,8 +168,8 @@ 158 common sched_yield sys_sched_yield 159 common sched_get_priority_max sys_sched_get_priority_max 160 common sched_get_priority_min sys_sched_get_priority_min -161 common sched_rr_get_interval sys_sched_rr_get_interval -162 common nanosleep sys_nanosleep +161 common sched_rr_get_interval sys_sched_rr_get_interval_time32 +162 common nanosleep sys_nanosleep_time32 163 common mremap sys_mremap 164 common setresuid sys_setresuid16 165 common getresuid sys_getresuid16 @@ -184,7 +184,7 @@ 174 common rt_sigaction sys_rt_sigaction 175 common rt_sigprocmask sys_rt_sigprocmask 176 common rt_sigpending sys_rt_sigpending -177 common rt_sigtimedwait sys_rt_sigtimedwait +177 common rt_sigtimedwait sys_rt_sigtimedwait_time32 178 common rt_sigqueueinfo sys_rt_sigqueueinfo 179 common rt_sigsuspend sys_rt_sigsuspend 180 common pread64 sys_pread64 @@ -242,7 +242,7 @@ 232 common removexattr sys_removexattr 233 common lremovexattr sys_lremovexattr 234 common fremovexattr sys_fremovexattr -235 common futex sys_futex +235 common futex sys_futex_time32 236 common sendfile64 sys_sendfile64 237 common mincore sys_mincore 238 common madvise sys_madvise @@ -250,7 +250,7 @@ 240 common readahead sys_readahead 241 common io_setup sys_io_setup 242 common io_destroy sys_io_destroy -243 common io_getevents sys_io_getevents +243 common io_getevents sys_io_getevents_time32 244 common io_submit sys_io_submit 245 common io_cancel sys_io_cancel 246 common fadvise64 sys_fadvise64 @@ -262,14 +262,14 @@ 252 common remap_file_pages sys_remap_file_pages 253 common set_tid_address sys_set_tid_address 254 common timer_create sys_timer_create -255 common timer_settime sys_timer_settime -256 common timer_gettime sys_timer_gettime +255 common timer_settime sys_timer_settime32 +256 common timer_gettime sys_timer_gettime32 257 common timer_getoverrun sys_timer_getoverrun 258 common timer_delete sys_timer_delete -259 common clock_settime sys_clock_settime -260 common clock_gettime sys_clock_gettime -261 common clock_getres sys_clock_getres -262 common clock_nanosleep sys_clock_nanosleep +259 common clock_settime sys_clock_settime32 +260 common clock_gettime sys_clock_gettime32 +261 common clock_getres sys_clock_getres_time32 +262 common clock_nanosleep sys_clock_nanosleep_time32 263 common statfs64 sys_statfs64 264 common fstatfs64 sys_fstatfs64 265 common tgkill sys_tgkill @@ -280,8 +280,8 @@ 270 common set_mempolicy sys_set_mempolicy 271 common mq_open sys_mq_open 272 common mq_unlink sys_mq_unlink -273 common mq_timedsend sys_mq_timedsend -274 common mq_timedreceive sys_mq_timedreceive +273 common mq_timedsend sys_mq_timedsend_time32 +274 common mq_timedreceive sys_mq_timedreceive_time32 275 common mq_notify sys_mq_notify 276 common mq_getsetattr sys_mq_getsetattr 277 common waitid sys_waitid @@ -308,8 +308,8 @@ 298 common readlinkat sys_readlinkat 299 common fchmodat sys_fchmodat 300 common faccessat sys_faccessat -301 common pselect6 sys_pselect6 -302 common ppoll sys_ppoll +301 common pselect6 sys_pselect6_time32 +302 common ppoll sys_ppoll_time32 303 common unshare sys_unshare 304 common set_robust_list sys_set_robust_list 305 common get_robust_list sys_get_robust_list @@ -323,13 +323,13 @@ 313 common kexec_load sys_kexec_load 314 common getcpu sys_getcpu 315 common epoll_pwait sys_epoll_pwait -316 common utimensat sys_utimensat +316 common utimensat sys_utimensat_time32 317 common signalfd sys_signalfd 318 common timerfd_create sys_timerfd_create 319 common eventfd sys_eventfd 320 common fallocate sys_fallocate -321 common timerfd_settime sys_timerfd_settime -322 common timerfd_gettime sys_timerfd_gettime +321 common timerfd_settime sys_timerfd_settime32 +322 common timerfd_gettime sys_timerfd_gettime32 323 common signalfd4 sys_signalfd4 324 common eventfd2 sys_eventfd2 325 common epoll_create1 sys_epoll_create1 @@ -349,7 +349,7 @@ 339 common prlimit64 sys_prlimit64 340 common name_to_handle_at sys_name_to_handle_at 341 common open_by_handle_at sys_open_by_handle_at -342 common clock_adjtime sys_clock_adjtime +342 common clock_adjtime sys_clock_adjtime32 343 common syncfs sys_syncfs 344 common setns sys_setns 345 common process_vm_readv sys_process_vm_readv @@ -378,7 +378,7 @@ 368 common recvfrom sys_recvfrom 369 common recvmsg sys_recvmsg 370 common shutdown sys_shutdown -371 common recvmmsg sys_recvmmsg +371 common recvmmsg sys_recvmmsg_time32 372 common sendmmsg sys_sendmmsg 373 common userfaultfd sys_userfaultfd 374 common membarrier sys_membarrier diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 7cc0f9554da3..492ff5c35b68 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -131,7 +131,7 @@ 121 common setdomainname sys_setdomainname 122 common uname sys_newuname 123 common modify_ldt sys_ni_syscall -124 common adjtimex sys_adjtimex +124 common adjtimex sys_adjtimex_time32 125 common mprotect sys_mprotect 126 common sigprocmask sys_sigprocmask 127 common create_module sys_ni_syscall @@ -168,8 +168,8 @@ 158 common sched_yield sys_sched_yield 159 common sched_get_priority_max sys_sched_get_priority_max 160 common sched_get_priority_min sys_sched_get_priority_min -161 common sched_rr_get_interval sys_sched_rr_get_interval -162 common nanosleep sys_nanosleep +161 common sched_rr_get_interval sys_sched_rr_get_interval_time32 +162 common nanosleep sys_nanosleep_time32 163 common mremap sys_mremap 164 common setresuid sys_setresuid 165 common getresuid sys_getresuid @@ -184,7 +184,7 @@ 174 common rt_sigaction sys_rt_sigaction 175 common rt_sigprocmask sys_rt_sigprocmask 176 common rt_sigpending sys_rt_sigpending -177 common rt_sigtimedwait sys_rt_sigtimedwait +177 common rt_sigtimedwait sys_rt_sigtimedwait_time32 178 common rt_sigqueueinfo sys_rt_sigqueueinfo 179 common rt_sigsuspend sys_rt_sigsuspend 180 common pread64 sys_pread64 @@ -247,14 +247,14 @@ 237 common fremovexattr sys_fremovexattr 238 common tkill sys_tkill 239 common sendfile64 sys_sendfile64 -240 common futex sys_futex +240 common futex sys_futex_time32 241 common sched_setaffinity sys_sched_setaffinity 242 common sched_getaffinity sys_sched_getaffinity 243 common set_thread_area sys_ni_syscall 244 common get_thread_area sys_ni_syscall 245 common io_setup sys_io_setup 246 common io_destroy sys_io_destroy -247 common io_getevents sys_io_getevents +247 common io_getevents sys_io_getevents_time32 248 common io_submit sys_io_submit 249 common io_cancel sys_io_cancel 250 common fadvise64 sys_fadvise64 @@ -267,14 +267,14 @@ 257 common remap_file_pages sys_remap_file_pages 258 common set_tid_address sys_set_tid_address 259 common timer_create sys_timer_create -260 common timer_settime sys_timer_settime -261 common timer_gettime sys_timer_gettime +260 common timer_settime sys_timer_settime32 +261 common timer_gettime sys_timer_gettime32 262 common timer_getoverrun sys_timer_getoverrun 263 common timer_delete sys_timer_delete -264 common clock_settime sys_clock_settime -265 common clock_gettime sys_clock_gettime -266 common clock_getres sys_clock_getres -267 common clock_nanosleep sys_clock_nanosleep +264 common clock_settime sys_clock_settime32 +265 common clock_gettime sys_clock_gettime32 +266 common clock_getres sys_clock_getres_time32 +267 common clock_nanosleep sys_clock_nanosleep_time32 268 common statfs64 sys_statfs64 269 common fstatfs64 sys_fstatfs64 270 common tgkill sys_tgkill @@ -286,8 +286,8 @@ 276 common set_mempolicy sys_set_mempolicy 277 common mq_open sys_mq_open 278 common mq_unlink sys_mq_unlink -279 common mq_timedsend sys_mq_timedsend -280 common mq_timedreceive sys_mq_timedreceive +279 common mq_timedsend sys_mq_timedsend_time32 +280 common mq_timedreceive sys_mq_timedreceive_time32 281 common mq_notify sys_mq_notify 282 common mq_getsetattr sys_mq_getsetattr 283 common kexec_load sys_kexec_load @@ -315,8 +315,8 @@ 305 common readlinkat sys_readlinkat 306 common fchmodat sys_fchmodat 307 common faccessat sys_faccessat -308 common pselect6 sys_pselect6 -309 common ppoll sys_ppoll +308 common pselect6 sys_pselect6_time32 +309 common ppoll sys_ppoll_time32 310 common unshare sys_unshare 311 common set_robust_list sys_set_robust_list 312 common get_robust_list sys_get_robust_list @@ -327,14 +327,14 @@ 317 common move_pages sys_move_pages 318 common getcpu sys_getcpu 319 common epoll_pwait sys_epoll_pwait -320 common utimensat sys_utimensat +320 common utimensat sys_utimensat_time32 321 common signalfd sys_signalfd 322 common timerfd_create sys_timerfd_create 323 common eventfd sys_eventfd 324 common fallocate sys_fallocate -325 common semtimedop sys_semtimedop -326 common timerfd_settime sys_timerfd_settime -327 common timerfd_gettime sys_timerfd_gettime +325 common semtimedop sys_semtimedop_time32 +326 common timerfd_settime sys_timerfd_settime32 +327 common timerfd_gettime sys_timerfd_gettime32 328 common semctl sys_old_semctl 329 common semget sys_semget 330 common semop sys_semop @@ -374,13 +374,13 @@ 364 common pwritev sys_pwritev 365 common rt_tgsigqueueinfo sys_rt_tgsigqueueinfo 366 common perf_event_open sys_perf_event_open -367 common recvmmsg sys_recvmmsg +367 common recvmmsg sys_recvmmsg_time32 368 common fanotify_init sys_fanotify_init 369 common fanotify_mark sys_fanotify_mark 370 common prlimit64 sys_prlimit64 371 common name_to_handle_at sys_name_to_handle_at 372 common open_by_handle_at sys_open_by_handle_at -373 common clock_adjtime sys_clock_adjtime +373 common clock_adjtime sys_clock_adjtime32 374 common syncfs sys_syncfs 375 common setns sys_setns 376 common sendmmsg sys_sendmmsg @@ -406,5 +406,5 @@ 396 common pkey_alloc sys_pkey_alloc 397 common pkey_free sys_pkey_free 398 common statx sys_statx -399 common io_pgetevents sys_io_pgetevents +399 common io_pgetevents sys_io_pgetevents_time32 400 common rseq sys_rseq diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index e9fec7bac5a9..5642d93b64c0 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -135,7 +135,7 @@ 121 o32 setdomainname sys_setdomainname 122 o32 uname sys_newuname 123 o32 modify_ldt sys_ni_syscall -124 o32 adjtimex sys_adjtimex sys_adjtimex_time32 +124 o32 adjtimex sys_adjtimex_time32 125 o32 mprotect sys_mprotect 126 o32 sigprocmask sys_sigprocmask compat_sys_sigprocmask 127 o32 create_module sys_ni_syscall @@ -176,8 +176,8 @@ 162 o32 sched_yield sys_sched_yield 163 o32 sched_get_priority_max sys_sched_get_priority_max 164 o32 sched_get_priority_min sys_sched_get_priority_min -165 o32 sched_rr_get_interval sys_sched_rr_get_interval sys_sched_rr_get_interval_time32 -166 o32 nanosleep sys_nanosleep sys_nanosleep_time32 +165 o32 sched_rr_get_interval sys_sched_rr_get_interval_time32 +166 o32 nanosleep sys_nanosleep_time32 167 o32 mremap sys_mremap 168 o32 accept sys_accept 169 o32 bind sys_bind @@ -208,7 +208,7 @@ 194 o32 rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction 195 o32 rt_sigprocmask sys_rt_sigprocmask compat_sys_rt_sigprocmask 196 o32 rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending -197 o32 rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time32 +197 o32 rt_sigtimedwait sys_rt_sigtimedwait_time32 compat_sys_rt_sigtimedwait_time32 198 o32 rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo 199 o32 rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend 200 o32 pread64 sys_pread64 sys_32_pread @@ -249,12 +249,12 @@ 235 o32 fremovexattr sys_fremovexattr 236 o32 tkill sys_tkill 237 o32 sendfile64 sys_sendfile64 -238 o32 futex sys_futex sys_futex_time32 +238 o32 futex sys_futex_time32 239 o32 sched_setaffinity sys_sched_setaffinity compat_sys_sched_setaffinity 240 o32 sched_getaffinity sys_sched_getaffinity compat_sys_sched_getaffinity 241 o32 io_setup sys_io_setup compat_sys_io_setup 242 o32 io_destroy sys_io_destroy -243 o32 io_getevents sys_io_getevents sys_io_getevents_time32 +243 o32 io_getevents sys_io_getevents_time32 244 o32 io_submit sys_io_submit compat_sys_io_submit 245 o32 io_cancel sys_io_cancel 246 o32 exit_group sys_exit_group @@ -269,14 +269,14 @@ 255 o32 statfs64 sys_statfs64 compat_sys_statfs64 256 o32 fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 257 o32 timer_create sys_timer_create compat_sys_timer_create -258 o32 timer_settime sys_timer_settime sys_timer_settime32 -259 o32 timer_gettime sys_timer_gettime sys_timer_gettime32 +258 o32 timer_settime sys_timer_settime32 +259 o32 timer_gettime sys_timer_gettime32 260 o32 timer_getoverrun sys_timer_getoverrun 261 o32 timer_delete sys_timer_delete -262 o32 clock_settime sys_clock_settime sys_clock_settime32 -263 o32 clock_gettime sys_clock_gettime sys_clock_gettime32 -264 o32 clock_getres sys_clock_getres sys_clock_getres_time32 -265 o32 clock_nanosleep sys_clock_nanosleep sys_clock_nanosleep_time32 +262 o32 clock_settime sys_clock_settime32 +263 o32 clock_gettime sys_clock_gettime32 +264 o32 clock_getres sys_clock_getres_time32 +265 o32 clock_nanosleep sys_clock_nanosleep_time32 266 o32 tgkill sys_tgkill 267 o32 utimes sys_utimes sys_utimes_time32 268 o32 mbind sys_mbind compat_sys_mbind @@ -284,8 +284,8 @@ 270 o32 set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy 271 o32 mq_open sys_mq_open compat_sys_mq_open 272 o32 mq_unlink sys_mq_unlink -273 o32 mq_timedsend sys_mq_timedsend sys_mq_timedsend_time32 -274 o32 mq_timedreceive sys_mq_timedreceive sys_mq_timedreceive_time32 +273 o32 mq_timedsend sys_mq_timedsend_time32 +274 o32 mq_timedreceive sys_mq_timedreceive_time32 275 o32 mq_notify sys_mq_notify compat_sys_mq_notify 276 o32 mq_getsetattr sys_mq_getsetattr compat_sys_mq_getsetattr 277 o32 vserver sys_ni_syscall @@ -312,8 +312,8 @@ 298 o32 readlinkat sys_readlinkat 299 o32 fchmodat sys_fchmodat 300 o32 faccessat sys_faccessat -301 o32 pselect6 sys_pselect6 compat_sys_pselect6_time32 -302 o32 ppoll sys_ppoll compat_sys_ppoll_time32 +301 o32 pselect6 sys_pselect6_time32 compat_sys_pselect6_time32 +302 o32 ppoll sys_ppoll_time32 compat_sys_ppoll_time32 303 o32 unshare sys_unshare 304 o32 splice sys_splice 305 o32 sync_file_range sys_sync_file_range sys32_sync_file_range @@ -327,14 +327,14 @@ 313 o32 epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait 314 o32 ioprio_set sys_ioprio_set 315 o32 ioprio_get sys_ioprio_get -316 o32 utimensat sys_utimensat sys_utimensat_time32 +316 o32 utimensat sys_utimensat_time32 317 o32 signalfd sys_signalfd compat_sys_signalfd 318 o32 timerfd sys_ni_syscall 319 o32 eventfd sys_eventfd 320 o32 fallocate sys_fallocate sys32_fallocate 321 o32 timerfd_create sys_timerfd_create -322 o32 timerfd_gettime sys_timerfd_gettime sys_timerfd_gettime32 -323 o32 timerfd_settime sys_timerfd_settime sys_timerfd_settime32 +322 o32 timerfd_gettime sys_timerfd_gettime32 +323 o32 timerfd_settime sys_timerfd_settime32 324 o32 signalfd4 sys_signalfd4 compat_sys_signalfd4 325 o32 eventfd2 sys_eventfd2 326 o32 epoll_create1 sys_epoll_create1 @@ -346,13 +346,13 @@ 332 o32 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo 333 o32 perf_event_open sys_perf_event_open 334 o32 accept4 sys_accept4 -335 o32 recvmmsg sys_recvmmsg compat_sys_recvmmsg_time32 +335 o32 recvmmsg sys_recvmmsg_time32 compat_sys_recvmmsg_time32 336 o32 fanotify_init sys_fanotify_init 337 o32 fanotify_mark sys_fanotify_mark compat_sys_fanotify_mark 338 o32 prlimit64 sys_prlimit64 339 o32 name_to_handle_at sys_name_to_handle_at 340 o32 open_by_handle_at sys_open_by_handle_at compat_sys_open_by_handle_at -341 o32 clock_adjtime sys_clock_adjtime sys_clock_adjtime32 +341 o32 clock_adjtime sys_clock_adjtime32 342 o32 syncfs sys_syncfs 343 o32 sendmmsg sys_sendmmsg compat_sys_sendmmsg 344 o32 setns sys_setns @@ -379,7 +379,7 @@ 365 o32 pkey_free sys_pkey_free 366 o32 statx sys_statx 367 o32 rseq sys_rseq -368 o32 io_pgetevents sys_io_pgetevents compat_sys_io_pgetevents +368 o32 io_pgetevents sys_io_pgetevents_time32 compat_sys_io_pgetevents # room for arch specific calls 393 o32 semget sys_semget 394 o32 semctl sys_semctl compat_sys_semctl diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index f7440427d459..b110c7371b08 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -133,7 +133,8 @@ 121 common setdomainname sys_setdomainname 122 common sendfile sys_sendfile compat_sys_sendfile 123 common recvfrom sys_recvfrom -124 common adjtimex sys_adjtimex sys_adjtimex_time32 +124 32 adjtimex sys_adjtimex_time32 +124 64 adjtimex sys_adjtimex 125 common mprotect sys_mprotect 126 common sigprocmask sys_sigprocmask compat_sys_sigprocmask # 127 was create_module @@ -171,8 +172,10 @@ 158 common sched_yield sys_sched_yield 159 common sched_get_priority_max sys_sched_get_priority_max 160 common sched_get_priority_min sys_sched_get_priority_min -161 common sched_rr_get_interval sys_sched_rr_get_interval sys_sched_rr_get_interval_time32 -162 common nanosleep sys_nanosleep sys_nanosleep_time32 +161 32 sched_rr_get_interval sys_sched_rr_get_interval_time32 +161 64 sched_rr_get_interval sys_sched_rr_get_interval +162 32 nanosleep sys_nanosleep_time32 +162 64 nanosleep sys_nanosleep 163 common mremap sys_mremap 164 common setresuid sys_setresuid 165 common getresuid sys_getresuid @@ -187,7 +190,8 @@ 174 common rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction 175 common rt_sigprocmask sys_rt_sigprocmask compat_sys_rt_sigprocmask 176 common rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending -177 common rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time32 +177 32 rt_sigtimedwait sys_rt_sigtimedwait_time32 compat_sys_rt_sigtimedwait_time32 +177 64 rt_sigtimedwait sys_rt_sigtimedwait 178 common rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo 179 common rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend 180 common chown sys_chown @@ -223,14 +227,16 @@ 207 64 readahead sys_readahead 208 common tkill sys_tkill 209 common sendfile64 sys_sendfile64 compat_sys_sendfile64 -210 common futex sys_futex sys_futex_time32 +210 32 futex sys_futex_time32 +210 64 futex sys_futex 211 common sched_setaffinity sys_sched_setaffinity compat_sys_sched_setaffinity 212 common sched_getaffinity sys_sched_getaffinity compat_sys_sched_getaffinity # 213 was set_thread_area # 214 was get_thread_area 215 common io_setup sys_io_setup compat_sys_io_setup 216 common io_destroy sys_io_destroy -217 common io_getevents sys_io_getevents sys_io_getevents_time32 +217 32 io_getevents sys_io_getevents_time32 +217 64 io_getevents sys_io_getevents 218 common io_submit sys_io_submit compat_sys_io_submit 219 common io_cancel sys_io_cancel # 220 was alloc_hugepages @@ -241,11 +247,14 @@ 225 common epoll_ctl sys_epoll_ctl 226 common epoll_wait sys_epoll_wait 227 common remap_file_pages sys_remap_file_pages -228 common semtimedop sys_semtimedop sys_semtimedop_time32 +228 32 semtimedop sys_semtimedop_time32 +228 64 semtimedop sys_semtimedop 229 common mq_open sys_mq_open compat_sys_mq_open 230 common mq_unlink sys_mq_unlink -231 common mq_timedsend sys_mq_timedsend sys_mq_timedsend_time32 -232 common mq_timedreceive sys_mq_timedreceive sys_mq_timedreceive_time32 +231 32 mq_timedsend sys_mq_timedsend_time32 +231 64 mq_timedsend sys_mq_timedsend +232 32 mq_timedreceive sys_mq_timedreceive_time32 +232 64 mq_timedreceive sys_mq_timedreceive 233 common mq_notify sys_mq_notify compat_sys_mq_notify 234 common mq_getsetattr sys_mq_getsetattr compat_sys_mq_getsetattr 235 common waitid sys_waitid compat_sys_waitid @@ -265,14 +274,20 @@ 248 common lremovexattr sys_lremovexattr 249 common fremovexattr sys_fremovexattr 250 common timer_create sys_timer_create compat_sys_timer_create -251 common timer_settime sys_timer_settime sys_timer_settime32 -252 common timer_gettime sys_timer_gettime sys_timer_gettime32 +251 32 timer_settime sys_timer_settime32 +251 64 timer_settime sys_timer_settime +252 32 timer_gettime sys_timer_gettime32 +252 64 timer_gettime sys_timer_gettime 253 common timer_getoverrun sys_timer_getoverrun 254 common timer_delete sys_timer_delete -255 common clock_settime sys_clock_settime sys_clock_settime32 -256 common clock_gettime sys_clock_gettime sys_clock_gettime32 -257 common clock_getres sys_clock_getres sys_clock_getres_time32 -258 common clock_nanosleep sys_clock_nanosleep sys_clock_nanosleep_time32 +255 32 clock_settime sys_clock_settime32 +255 64 clock_settime sys_clock_settime +256 32 clock_gettime sys_clock_gettime32 +256 64 clock_gettime sys_clock_gettime +257 32 clock_getres sys_clock_getres_time32 +257 64 clock_getres sys_clock_getres +258 32 clock_nanosleep sys_clock_nanosleep_time32 +258 64 clock_nanosleep sys_clock_nanosleep 259 common tgkill sys_tgkill 260 common mbind sys_mbind compat_sys_mbind 261 common get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy @@ -287,8 +302,10 @@ 270 common inotify_add_watch sys_inotify_add_watch 271 common inotify_rm_watch sys_inotify_rm_watch 272 common migrate_pages sys_migrate_pages -273 common pselect6 sys_pselect6 compat_sys_pselect6_time32 -274 common ppoll sys_ppoll compat_sys_ppoll_time32 +273 32 pselect6 sys_pselect6_time32 compat_sys_pselect6_time32 +273 64 pselect6 sys_pselect6 +274 32 ppoll sys_ppoll_time32 compat_sys_ppoll_time32 +274 64 ppoll sys_ppoll 275 common openat sys_openat compat_sys_openat 276 common mkdirat sys_mkdirat 277 common mknodat sys_mknodat @@ -316,15 +333,18 @@ 298 common statfs64 sys_statfs64 compat_sys_statfs64 299 common fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 300 common kexec_load sys_kexec_load compat_sys_kexec_load -301 common utimensat sys_utimensat sys_utimensat_time32 +301 32 utimensat sys_utimensat_time32 +301 64 utimensat sys_utimensat 302 common signalfd sys_signalfd compat_sys_signalfd # 303 was timerfd 304 common eventfd sys_eventfd 305 32 fallocate parisc_fallocate 305 64 fallocate sys_fallocate 306 common timerfd_create sys_timerfd_create -307 common timerfd_settime sys_timerfd_settime sys_timerfd_settime32 -308 common timerfd_gettime sys_timerfd_gettime sys_timerfd_gettime32 +307 32 timerfd_settime sys_timerfd_settime32 +307 64 timerfd_settime sys_timerfd_settime +308 32 timerfd_gettime sys_timerfd_gettime32 +308 64 timerfd_gettime sys_timerfd_gettime 309 common signalfd4 sys_signalfd4 compat_sys_signalfd4 310 common eventfd2 sys_eventfd2 311 common epoll_create1 sys_epoll_create1 @@ -335,12 +355,14 @@ 316 common pwritev sys_pwritev compat_sys_pwritev 317 common rt_tgsigqueueinfo sys_rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo 318 common perf_event_open sys_perf_event_open -319 common recvmmsg sys_recvmmsg compat_sys_recvmmsg_time32 +319 32 recvmmsg sys_recvmmsg_time32 compat_sys_recvmmsg_time32 +319 64 recvmmsg sys_recvmmsg 320 common accept4 sys_accept4 321 common prlimit64 sys_prlimit64 322 common fanotify_init sys_fanotify_init 323 common fanotify_mark sys_fanotify_mark sys32_fanotify_mark -324 common clock_adjtime sys_clock_adjtime sys_clock_adjtime32 +324 32 clock_adjtime sys_clock_adjtime32 +324 64 clock_adjtime sys_clock_adjtime 325 common name_to_handle_at sys_name_to_handle_at 326 common open_by_handle_at sys_open_by_handle_at compat_sys_open_by_handle_at 327 common syncfs sys_syncfs @@ -366,7 +388,8 @@ 347 common preadv2 sys_preadv2 compat_sys_preadv2 348 common pwritev2 sys_pwritev2 compat_sys_pwritev2 349 common statx sys_statx -350 common io_pgetevents sys_io_pgetevents compat_sys_io_pgetevents +350 32 io_pgetevents sys_io_pgetevents_time32 compat_sys_io_pgetevents +350 64 io_pgetevents sys_io_pgetevents 351 common pkey_mprotect sys_pkey_mprotect 352 common pkey_alloc sys_pkey_alloc 353 common pkey_free sys_pkey_free diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 86650dcd2185..2a8b060f73b3 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -157,7 +157,9 @@ 121 common setdomainname sys_setdomainname 122 common uname sys_newuname 123 common modify_ldt sys_ni_syscall -124 common adjtimex sys_adjtimex sys_adjtimex_time32 +124 32 adjtimex sys_adjtimex_time32 +124 64 adjtimex sys_adjtimex +124 spu adjtimex sys_adjtimex 125 common mprotect sys_mprotect 126 32 sigprocmask sys_sigprocmask compat_sys_sigprocmask 126 64 sigprocmask sys_ni_syscall @@ -198,8 +200,12 @@ 158 common sched_yield sys_sched_yield 159 common sched_get_priority_max sys_sched_get_priority_max 160 common sched_get_priority_min sys_sched_get_priority_min -161 common sched_rr_get_interval sys_sched_rr_get_interval sys_sched_rr_get_interval_time32 -162 common nanosleep sys_nanosleep sys_nanosleep_time32 +161 32 sched_rr_get_interval sys_sched_rr_get_interval_time32 +161 64 sched_rr_get_interval sys_sched_rr_get_interval +161 spu sched_rr_get_interval sys_sched_rr_get_interval +162 32 nanosleep sys_nanosleep_time32 +162 64 nanosleep sys_nanosleep +162 spu nanosleep sys_nanosleep 163 common mremap sys_mremap 164 common setresuid sys_setresuid 165 common getresuid sys_getresuid @@ -213,7 +219,8 @@ 173 nospu rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction 174 nospu rt_sigprocmask sys_rt_sigprocmask compat_sys_rt_sigprocmask 175 nospu rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending -176 nospu rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time32 +176 32 rt_sigtimedwait sys_rt_sigtimedwait_time32 compat_sys_rt_sigtimedwait_time32 +176 64 rt_sigtimedwait sys_rt_sigtimedwait 177 nospu rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo 178 nospu rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend 179 common pread64 sys_pread64 compat_sys_pread64 @@ -260,7 +267,9 @@ 218 common removexattr sys_removexattr 219 common lremovexattr sys_lremovexattr 220 common fremovexattr sys_fremovexattr -221 common futex sys_futex sys_futex_time32 +221 32 futex sys_futex_time32 +221 64 futex sys_futex +221 spu futex sys_futex 222 common sched_setaffinity sys_sched_setaffinity compat_sys_sched_setaffinity 223 common sched_getaffinity sys_sched_getaffinity compat_sys_sched_getaffinity # 224 unused @@ -268,7 +277,9 @@ 226 32 sendfile64 sys_sendfile64 compat_sys_sendfile64 227 common io_setup sys_io_setup compat_sys_io_setup 228 common io_destroy sys_io_destroy -229 common io_getevents sys_io_getevents sys_io_getevents_time32 +229 32 io_getevents sys_io_getevents_time32 +229 64 io_getevents sys_io_getevents +229 spu io_getevents sys_io_getevents 230 common io_submit sys_io_submit compat_sys_io_submit 231 common io_cancel sys_io_cancel 232 nospu set_tid_address sys_set_tid_address @@ -280,14 +291,26 @@ 238 common epoll_wait sys_epoll_wait 239 common remap_file_pages sys_remap_file_pages 240 common timer_create sys_timer_create compat_sys_timer_create -241 common timer_settime sys_timer_settime sys_timer_settime32 -242 common timer_gettime sys_timer_gettime sys_timer_gettime32 +241 32 timer_settime sys_timer_settime32 +241 64 timer_settime sys_timer_settime +241 spu timer_settime sys_timer_settime +242 32 timer_gettime sys_timer_gettime32 +242 64 timer_gettime sys_timer_gettime +242 spu timer_gettime sys_timer_gettime 243 common timer_getoverrun sys_timer_getoverrun 244 common timer_delete sys_timer_delete -245 common clock_settime sys_clock_settime sys_clock_settime32 -246 common clock_gettime sys_clock_gettime sys_clock_gettime32 -247 common clock_getres sys_clock_getres sys_clock_getres_time32 -248 common clock_nanosleep sys_clock_nanosleep sys_clock_nanosleep_time32 +245 32 clock_settime sys_clock_settime32 +245 64 clock_settime sys_clock_settime +245 spu clock_settime sys_clock_settime +246 32 clock_gettime sys_clock_gettime32 +246 64 clock_gettime sys_clock_gettime +246 spu clock_gettime sys_clock_gettime +247 32 clock_getres sys_clock_getres_time32 +247 64 clock_getres sys_clock_getres +247 spu clock_getres sys_clock_getres +248 32 clock_nanosleep sys_clock_nanosleep_time32 +248 64 clock_nanosleep sys_clock_nanosleep +248 spu clock_nanosleep sys_clock_nanosleep 249 32 swapcontext ppc_swapcontext ppc32_swapcontext 249 64 swapcontext ppc64_swapcontext 249 spu swapcontext sys_ni_syscall @@ -308,8 +331,10 @@ 261 nospu set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy 262 nospu mq_open sys_mq_open compat_sys_mq_open 263 nospu mq_unlink sys_mq_unlink -264 nospu mq_timedsend sys_mq_timedsend sys_mq_timedsend_time32 -265 nospu mq_timedreceive sys_mq_timedreceive sys_mq_timedreceive_time32 +264 32 mq_timedsend sys_mq_timedsend_time32 +264 64 mq_timedsend sys_mq_timedsend +265 32 mq_timedreceive sys_mq_timedreceive_time32 +265 64 mq_timedreceive sys_mq_timedreceive 266 nospu mq_notify sys_mq_notify compat_sys_mq_notify 267 nospu mq_getsetattr sys_mq_getsetattr compat_sys_mq_getsetattr 268 nospu kexec_load sys_kexec_load compat_sys_kexec_load @@ -324,8 +349,10 @@ 277 nospu inotify_rm_watch sys_inotify_rm_watch 278 nospu spu_run sys_spu_run 279 nospu spu_create sys_spu_create -280 nospu pselect6 sys_pselect6 compat_sys_pselect6_time32 -281 nospu ppoll sys_ppoll compat_sys_ppoll_time32 +280 32 pselect6 sys_pselect6_time32 compat_sys_pselect6_time32 +280 64 pselect6 sys_pselect6 +281 32 ppoll sys_ppoll_time32 compat_sys_ppoll_time32 +281 64 ppoll sys_ppoll 282 common unshare sys_unshare 283 common splice sys_splice 284 common tee sys_tee @@ -350,15 +377,21 @@ 301 common move_pages sys_move_pages compat_sys_move_pages 302 common getcpu sys_getcpu 303 nospu epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait -304 common utimensat sys_utimensat sys_utimensat_time32 +304 32 utimensat sys_utimensat_time32 +304 64 utimensat sys_utimensat +304 spu utimensat sys_utimensat 305 common signalfd sys_signalfd compat_sys_signalfd 306 common timerfd_create sys_timerfd_create 307 common eventfd sys_eventfd 308 common sync_file_range2 sys_sync_file_range2 compat_sys_sync_file_range2 309 nospu fallocate sys_fallocate compat_sys_fallocate 310 nospu subpage_prot sys_subpage_prot -311 common timerfd_settime sys_timerfd_settime sys_timerfd_settime32 -312 common timerfd_gettime sys_timerfd_gettime sys_timerfd_gettime32 +311 32 timerfd_settime sys_timerfd_settime32 +311 64 timerfd_settime sys_timerfd_settime +311 spu timerfd_settime sys_timerfd_settime +312 32 timerfd_gettime sys_timerfd_gettime32 +312 64 timerfd_gettime sys_timerfd_gettime +312 spu timerfd_gettime sys_timerfd_gettime 313 common signalfd4 sys_signalfd4 compat_sys_signalfd4 314 common eventfd2 sys_eventfd2 315 common epoll_create1 sys_epoll_create1 @@ -389,11 +422,15 @@ 340 common getsockopt sys_getsockopt compat_sys_getsockopt 341 common sendmsg sys_sendmsg compat_sys_sendmsg 342 common recvmsg sys_recvmsg compat_sys_recvmsg -343 common recvmmsg sys_recvmmsg compat_sys_recvmmsg_time32 +343 32 recvmmsg sys_recvmmsg_time32 compat_sys_recvmmsg_time32 +343 64 recvmmsg sys_recvmmsg +343 spu recvmmsg sys_recvmmsg 344 common accept4 sys_accept4 345 common name_to_handle_at sys_name_to_handle_at 346 common open_by_handle_at sys_open_by_handle_at compat_sys_open_by_handle_at -347 common clock_adjtime sys_clock_adjtime sys_clock_adjtime32 +347 32 clock_adjtime sys_clock_adjtime32 +347 64 clock_adjtime sys_clock_adjtime +347 spu clock_adjtime sys_clock_adjtime 348 common syncfs sys_syncfs 349 common sendmmsg sys_sendmmsg compat_sys_sendmmsg 350 common setns sys_setns @@ -425,7 +462,8 @@ 385 nospu pkey_free sys_pkey_free 386 nospu pkey_mprotect sys_pkey_mprotect 387 nospu rseq sys_rseq -388 nospu io_pgetevents sys_io_pgetevents compat_sys_io_pgetevents +388 32 io_pgetevents sys_io_pgetevents_time32 compat_sys_io_pgetevents +388 64 io_pgetevents sys_io_pgetevents # room for arch specific syscalls 392 64 semtimedop sys_semtimedop 393 common semget sys_semget diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index cafa63c6a932..a08a5312cd12 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -131,7 +131,7 @@ 121 common setdomainname sys_setdomainname 122 common uname sys_newuname 123 common cacheflush sys_cacheflush -124 common adjtimex sys_adjtimex +124 common adjtimex sys_adjtimex_time32 125 common mprotect sys_mprotect 126 common sigprocmask sys_sigprocmask # 127 was create_module @@ -168,8 +168,8 @@ 158 common sched_yield sys_sched_yield 159 common sched_get_priority_max sys_sched_get_priority_max 160 common sched_get_priority_min sys_sched_get_priority_min -161 common sched_rr_get_interval sys_sched_rr_get_interval -162 common nanosleep sys_nanosleep +161 common sched_rr_get_interval sys_sched_rr_get_interval_time32 +162 common nanosleep sys_nanosleep_time32 163 common mremap sys_mremap 164 common setresuid sys_setresuid16 165 common getresuid sys_getresuid16 @@ -184,7 +184,7 @@ 174 common rt_sigaction sys_rt_sigaction 175 common rt_sigprocmask sys_rt_sigprocmask 176 common rt_sigpending sys_rt_sigpending -177 common rt_sigtimedwait sys_rt_sigtimedwait +177 common rt_sigtimedwait sys_rt_sigtimedwait_time32 178 common rt_sigqueueinfo sys_rt_sigqueueinfo 179 common rt_sigsuspend sys_rt_sigsuspend 180 common pread64 sys_pread_wrapper @@ -247,14 +247,14 @@ 237 common fremovexattr sys_fremovexattr 238 common tkill sys_tkill 239 common sendfile64 sys_sendfile64 -240 common futex sys_futex +240 common futex sys_futex_time32 241 common sched_setaffinity sys_sched_setaffinity 242 common sched_getaffinity sys_sched_getaffinity # 243 is reserved for set_thread_area # 244 is reserved for get_thread_area 245 common io_setup sys_io_setup 246 common io_destroy sys_io_destroy -247 common io_getevents sys_io_getevents +247 common io_getevents sys_io_getevents_time32 248 common io_submit sys_io_submit 249 common io_cancel sys_io_cancel 250 common fadvise64 sys_fadvise64 @@ -267,14 +267,14 @@ 257 common remap_file_pages sys_remap_file_pages 258 common set_tid_address sys_set_tid_address 259 common timer_create sys_timer_create -260 common timer_settime sys_timer_settime -261 common timer_gettime sys_timer_gettime +260 common timer_settime sys_timer_settime32 +261 common timer_gettime sys_timer_gettime32 262 common timer_getoverrun sys_timer_getoverrun 263 common timer_delete sys_timer_delete -264 common clock_settime sys_clock_settime -265 common clock_gettime sys_clock_gettime -266 common clock_getres sys_clock_getres -267 common clock_nanosleep sys_clock_nanosleep +264 common clock_settime sys_clock_settime32 +265 common clock_gettime sys_clock_gettime32 +266 common clock_getres sys_clock_getres_time32 +267 common clock_nanosleep sys_clock_nanosleep_time32 268 common statfs64 sys_statfs64 269 common fstatfs64 sys_fstatfs64 270 common tgkill sys_tgkill @@ -286,8 +286,8 @@ 276 common set_mempolicy sys_set_mempolicy 277 common mq_open sys_mq_open 278 common mq_unlink sys_mq_unlink -279 common mq_timedsend sys_mq_timedsend -280 common mq_timedreceive sys_mq_timedreceive +279 common mq_timedsend sys_mq_timedsend_time32 +280 common mq_timedreceive sys_mq_timedreceive_time32 281 common mq_notify sys_mq_notify 282 common mq_getsetattr sys_mq_getsetattr 283 common kexec_load sys_kexec_load @@ -315,8 +315,8 @@ 305 common readlinkat sys_readlinkat 306 common fchmodat sys_fchmodat 307 common faccessat sys_faccessat -308 common pselect6 sys_pselect6 -309 common ppoll sys_ppoll +308 common pselect6 sys_pselect6_time32 +309 common ppoll sys_ppoll_time32 310 common unshare sys_unshare 311 common set_robust_list sys_set_robust_list 312 common get_robust_list sys_get_robust_list @@ -327,13 +327,13 @@ 317 common move_pages sys_move_pages 318 common getcpu sys_getcpu 319 common epoll_pwait sys_epoll_pwait -320 common utimensat sys_utimensat +320 common utimensat sys_utimensat_time32 321 common signalfd sys_signalfd 322 common timerfd_create sys_timerfd_create 323 common eventfd sys_eventfd 324 common fallocate sys_fallocate -325 common timerfd_settime sys_timerfd_settime -326 common timerfd_gettime sys_timerfd_gettime +325 common timerfd_settime sys_timerfd_settime32 +326 common timerfd_gettime sys_timerfd_gettime32 327 common signalfd4 sys_signalfd4 328 common eventfd2 sys_eventfd2 329 common epoll_create1 sys_epoll_create1 @@ -364,11 +364,11 @@ 354 common getsockopt sys_getsockopt 355 common sendmsg sys_sendmsg 356 common recvmsg sys_recvmsg -357 common recvmmsg sys_recvmmsg +357 common recvmmsg sys_recvmmsg_time32 358 common accept4 sys_accept4 359 common name_to_handle_at sys_name_to_handle_at 360 common open_by_handle_at sys_open_by_handle_at -361 common clock_adjtime sys_clock_adjtime +361 common clock_adjtime sys_clock_adjtime32 362 common syncfs sys_syncfs 363 common sendmmsg sys_sendmmsg 364 common setns sys_setns diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 7cb05b50aeaa..dc1e08040b39 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -128,7 +128,8 @@ 102 common rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction 103 common rt_sigprocmask sys_rt_sigprocmask compat_sys_rt_sigprocmask 104 common rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending -105 common rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time32 +105 32 rt_sigtimedwait sys_rt_sigtimedwait_time32 compat_sys_rt_sigtimedwait_time32 +105 64 rt_sigtimedwait sys_rt_sigtimedwait 106 common rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo 107 common rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend 108 32 setresuid32 sys_setresuid @@ -172,7 +173,8 @@ 139 common stat64 sys_stat64 compat_sys_stat64 140 common sendfile64 sys_sendfile64 141 common getpeername sys_getpeername -142 common futex sys_futex sys_futex_time32 +142 32 futex sys_futex_time32 +142 64 futex sys_futex 143 common gettid sys_gettid 144 common getrlimit sys_getrlimit compat_sys_getrlimit 145 common setrlimit sys_setrlimit compat_sys_setrlimit @@ -258,7 +260,7 @@ 216 64 sigreturn sys_nis_syscall 217 common clone sys_clone 218 common ioprio_get sys_ioprio_get -219 32 adjtimex sys_adjtimex sys_adjtimex_time32 +219 32 adjtimex sys_adjtimex_time32 219 64 adjtimex sys_sparc_adjtimex 220 32 sigprocmask sys_sigprocmask compat_sys_sigprocmask 220 64 sigprocmask sys_nis_syscall @@ -289,8 +291,10 @@ 245 common sched_yield sys_sched_yield 246 common sched_get_priority_max sys_sched_get_priority_max 247 common sched_get_priority_min sys_sched_get_priority_min -248 common sched_rr_get_interval sys_sched_rr_get_interval sys_sched_rr_get_interval_time32 -249 common nanosleep sys_nanosleep sys_nanosleep_time32 +248 32 sched_rr_get_interval sys_sched_rr_get_interval_time32 +248 64 sched_rr_get_interval sys_sched_rr_get_interval +249 32 nanosleep sys_nanosleep_time32 +249 64 nanosleep sys_nanosleep 250 32 mremap sys_mremap 250 64 mremap sys_64_mremap 251 common _sysctl sys_sysctl compat_sys_sysctl @@ -299,14 +303,20 @@ 254 32 nfsservctl sys_ni_syscall sys_nis_syscall 254 64 nfsservctl sys_nis_syscall 255 common sync_file_range sys_sync_file_range compat_sys_sync_file_range -256 common clock_settime sys_clock_settime sys_clock_settime32 -257 common clock_gettime sys_clock_gettime sys_clock_gettime32 -258 common clock_getres sys_clock_getres sys_clock_getres_time32 -259 common clock_nanosleep sys_clock_nanosleep sys_clock_nanosleep_time32 +256 32 clock_settime sys_clock_settime32 +256 64 clock_settime sys_clock_settime +257 32 clock_gettime sys_clock_gettime32 +257 64 clock_gettime sys_clock_gettime +258 32 clock_getres sys_clock_getres_time32 +258 64 clock_getres sys_clock_getres +259 32 clock_nanosleep sys_clock_nanosleep_time32 +259 64 clock_nanosleep sys_clock_nanosleep 260 common sched_getaffinity sys_sched_getaffinity compat_sys_sched_getaffinity 261 common sched_setaffinity sys_sched_setaffinity compat_sys_sched_setaffinity -262 common timer_settime sys_timer_settime sys_timer_settime32 -263 common timer_gettime sys_timer_gettime sys_timer_gettime32 +262 32 timer_settime sys_timer_settime32 +262 64 timer_settime sys_timer_settime +263 32 timer_gettime sys_timer_gettime32 +263 64 timer_gettime sys_timer_gettime 264 common timer_getoverrun sys_timer_getoverrun 265 common timer_delete sys_timer_delete 266 common timer_create sys_timer_create compat_sys_timer_create @@ -316,11 +326,14 @@ 269 common io_destroy sys_io_destroy 270 common io_submit sys_io_submit compat_sys_io_submit 271 common io_cancel sys_io_cancel -272 common io_getevents sys_io_getevents sys_io_getevents_time32 +272 32 io_getevents sys_io_getevents_time32 +272 64 io_getevents sys_io_getevents 273 common mq_open sys_mq_open compat_sys_mq_open 274 common mq_unlink sys_mq_unlink -275 common mq_timedsend sys_mq_timedsend sys_mq_timedsend_time32 -276 common mq_timedreceive sys_mq_timedreceive sys_mq_timedreceive_time32 +275 32 mq_timedsend sys_mq_timedsend_time32 +275 64 mq_timedsend sys_mq_timedsend +276 32 mq_timedreceive sys_mq_timedreceive_time32 +276 64 mq_timedreceive sys_mq_timedreceive 277 common mq_notify sys_mq_notify compat_sys_mq_notify 278 common mq_getsetattr sys_mq_getsetattr compat_sys_mq_getsetattr 279 common waitid sys_waitid compat_sys_waitid @@ -341,8 +354,10 @@ 294 common readlinkat sys_readlinkat 295 common fchmodat sys_fchmodat 296 common faccessat sys_faccessat -297 common pselect6 sys_pselect6 compat_sys_pselect6_time32 -298 common ppoll sys_ppoll compat_sys_ppoll_time32 +297 32 pselect6 sys_pselect6_time32 compat_sys_pselect6_time32 +297 64 pselect6 sys_pselect6 +298 32 ppoll sys_ppoll_time32 compat_sys_ppoll_time32 +298 64 ppoll sys_ppoll 299 common unshare sys_unshare 300 common set_robust_list sys_set_robust_list compat_sys_set_robust_list 301 common get_robust_list sys_get_robust_list compat_sys_get_robust_list @@ -354,13 +369,16 @@ 307 common move_pages sys_move_pages compat_sys_move_pages 308 common getcpu sys_getcpu 309 common epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait -310 common utimensat sys_utimensat sys_utimensat_time32 +310 32 utimensat sys_utimensat_time32 +310 64 utimensat sys_utimensat 311 common signalfd sys_signalfd compat_sys_signalfd 312 common timerfd_create sys_timerfd_create 313 common eventfd sys_eventfd 314 common fallocate sys_fallocate compat_sys_fallocate -315 common timerfd_settime sys_timerfd_settime sys_timerfd_settime32 -316 common timerfd_gettime sys_timerfd_gettime sys_timerfd_gettime32 +315 32 timerfd_settime sys_timerfd_settime32 +315 64 timerfd_settime sys_timerfd_settime +316 32 timerfd_gettime sys_timerfd_gettime32 +316 64 timerfd_gettime sys_timerfd_gettime 317 common signalfd4 sys_signalfd4 compat_sys_signalfd4 318 common eventfd2 sys_eventfd2 319 common epoll_create1 sys_epoll_create1 @@ -372,13 +390,14 @@ 325 common pwritev sys_pwritev compat_sys_pwritev 326 common rt_tgsigqueueinfo sys_rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo 327 common perf_event_open sys_perf_event_open -328 common recvmmsg sys_recvmmsg compat_sys_recvmmsg_time32 +328 32 recvmmsg sys_recvmmsg_time32 compat_sys_recvmmsg_time32 +328 64 recvmmsg sys_recvmmsg 329 common fanotify_init sys_fanotify_init 330 common fanotify_mark sys_fanotify_mark compat_sys_fanotify_mark 331 common prlimit64 sys_prlimit64 332 common name_to_handle_at sys_name_to_handle_at 333 common open_by_handle_at sys_open_by_handle_at compat_sys_open_by_handle_at -334 32 clock_adjtime sys_clock_adjtime sys_clock_adjtime32 +334 32 clock_adjtime sys_clock_adjtime32 334 64 clock_adjtime sys_sparc_clock_adjtime 335 common syncfs sys_syncfs 336 common sendmmsg sys_sendmmsg compat_sys_sendmmsg @@ -408,7 +427,8 @@ 358 common preadv2 sys_preadv2 compat_sys_preadv2 359 common pwritev2 sys_pwritev2 compat_sys_pwritev2 360 common statx sys_statx -361 common io_pgetevents sys_io_pgetevents compat_sys_io_pgetevents +361 32 io_pgetevents sys_io_pgetevents_time32 compat_sys_io_pgetevents +361 64 io_pgetevents sys_io_pgetevents 362 common pkey_mprotect sys_pkey_mprotect 363 common pkey_alloc sys_pkey_alloc 364 common pkey_free sys_pkey_free diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 7705d5ecad25..28888d6b7f61 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -135,7 +135,7 @@ 121 i386 setdomainname sys_setdomainname __ia32_sys_setdomainname 122 i386 uname sys_newuname __ia32_sys_newuname 123 i386 modify_ldt sys_modify_ldt __ia32_sys_modify_ldt -124 i386 adjtimex sys_adjtimex __ia32_sys_adjtimex_time32 +124 i386 adjtimex sys_adjtimex_time32 __ia32_sys_adjtimex_time32 125 i386 mprotect sys_mprotect __ia32_sys_mprotect 126 i386 sigprocmask sys_sigprocmask __ia32_compat_sys_sigprocmask 127 i386 create_module @@ -172,8 +172,8 @@ 158 i386 sched_yield sys_sched_yield __ia32_sys_sched_yield 159 i386 sched_get_priority_max sys_sched_get_priority_max __ia32_sys_sched_get_priority_max 160 i386 sched_get_priority_min sys_sched_get_priority_min __ia32_sys_sched_get_priority_min -161 i386 sched_rr_get_interval sys_sched_rr_get_interval __ia32_sys_sched_rr_get_interval_time32 -162 i386 nanosleep sys_nanosleep __ia32_sys_nanosleep_time32 +161 i386 sched_rr_get_interval sys_sched_rr_get_interval_time32 __ia32_sys_sched_rr_get_interval_time32 +162 i386 nanosleep sys_nanosleep_time32 __ia32_sys_nanosleep_time32 163 i386 mremap sys_mremap __ia32_sys_mremap 164 i386 setresuid sys_setresuid16 __ia32_sys_setresuid16 165 i386 getresuid sys_getresuid16 __ia32_sys_getresuid16 @@ -188,7 +188,7 @@ 174 i386 rt_sigaction sys_rt_sigaction __ia32_compat_sys_rt_sigaction 175 i386 rt_sigprocmask sys_rt_sigprocmask __ia32_sys_rt_sigprocmask 176 i386 rt_sigpending sys_rt_sigpending __ia32_compat_sys_rt_sigpending -177 i386 rt_sigtimedwait sys_rt_sigtimedwait __ia32_compat_sys_rt_sigtimedwait_time32 +177 i386 rt_sigtimedwait sys_rt_sigtimedwait_time32 __ia32_compat_sys_rt_sigtimedwait_time32 178 i386 rt_sigqueueinfo sys_rt_sigqueueinfo __ia32_compat_sys_rt_sigqueueinfo 179 i386 rt_sigsuspend sys_rt_sigsuspend __ia32_sys_rt_sigsuspend 180 i386 pread64 sys_pread64 __ia32_compat_sys_x86_pread @@ -251,14 +251,14 @@ 237 i386 fremovexattr sys_fremovexattr __ia32_sys_fremovexattr 238 i386 tkill sys_tkill __ia32_sys_tkill 239 i386 sendfile64 sys_sendfile64 __ia32_sys_sendfile64 -240 i386 futex sys_futex __ia32_sys_futex_time32 +240 i386 futex sys_futex_time32 __ia32_sys_futex_time32 241 i386 sched_setaffinity sys_sched_setaffinity __ia32_compat_sys_sched_setaffinity 242 i386 sched_getaffinity sys_sched_getaffinity __ia32_compat_sys_sched_getaffinity 243 i386 set_thread_area sys_set_thread_area __ia32_sys_set_thread_area 244 i386 get_thread_area sys_get_thread_area __ia32_sys_get_thread_area 245 i386 io_setup sys_io_setup __ia32_compat_sys_io_setup 246 i386 io_destroy sys_io_destroy __ia32_sys_io_destroy -247 i386 io_getevents sys_io_getevents __ia32_sys_io_getevents_time32 +247 i386 io_getevents sys_io_getevents_time32 __ia32_sys_io_getevents_time32 248 i386 io_submit sys_io_submit __ia32_compat_sys_io_submit 249 i386 io_cancel sys_io_cancel __ia32_sys_io_cancel 250 i386 fadvise64 sys_fadvise64 __ia32_compat_sys_x86_fadvise64 @@ -271,14 +271,14 @@ 257 i386 remap_file_pages sys_remap_file_pages __ia32_sys_remap_file_pages 258 i386 set_tid_address sys_set_tid_address __ia32_sys_set_tid_address 259 i386 timer_create sys_timer_create __ia32_compat_sys_timer_create -260 i386 timer_settime sys_timer_settime __ia32_sys_timer_settime32 -261 i386 timer_gettime sys_timer_gettime __ia32_sys_timer_gettime32 +260 i386 timer_settime sys_timer_settime32 __ia32_sys_timer_settime32 +261 i386 timer_gettime sys_timer_gettime32 __ia32_sys_timer_gettime32 262 i386 timer_getoverrun sys_timer_getoverrun __ia32_sys_timer_getoverrun 263 i386 timer_delete sys_timer_delete __ia32_sys_timer_delete -264 i386 clock_settime sys_clock_settime __ia32_sys_clock_settime32 -265 i386 clock_gettime sys_clock_gettime __ia32_sys_clock_gettime32 -266 i386 clock_getres sys_clock_getres __ia32_sys_clock_getres_time32 -267 i386 clock_nanosleep sys_clock_nanosleep __ia32_sys_clock_nanosleep_time32 +264 i386 clock_settime sys_clock_settime32 __ia32_sys_clock_settime32 +265 i386 clock_gettime sys_clock_gettime32 __ia32_sys_clock_gettime32 +266 i386 clock_getres sys_clock_getres_time32 __ia32_sys_clock_getres_time32 +267 i386 clock_nanosleep sys_clock_nanosleep_time32 __ia32_sys_clock_nanosleep_time32 268 i386 statfs64 sys_statfs64 __ia32_compat_sys_statfs64 269 i386 fstatfs64 sys_fstatfs64 __ia32_compat_sys_fstatfs64 270 i386 tgkill sys_tgkill __ia32_sys_tgkill @@ -290,8 +290,8 @@ 276 i386 set_mempolicy sys_set_mempolicy __ia32_sys_set_mempolicy 277 i386 mq_open sys_mq_open __ia32_compat_sys_mq_open 278 i386 mq_unlink sys_mq_unlink __ia32_sys_mq_unlink -279 i386 mq_timedsend sys_mq_timedsend __ia32_sys_mq_timedsend_time32 -280 i386 mq_timedreceive sys_mq_timedreceive __ia32_sys_mq_timedreceive_time32 +279 i386 mq_timedsend sys_mq_timedsend_time32 __ia32_sys_mq_timedsend_time32 +280 i386 mq_timedreceive sys_mq_timedreceive_time32 __ia32_sys_mq_timedreceive_time32 281 i386 mq_notify sys_mq_notify __ia32_compat_sys_mq_notify 282 i386 mq_getsetattr sys_mq_getsetattr __ia32_compat_sys_mq_getsetattr 283 i386 kexec_load sys_kexec_load __ia32_compat_sys_kexec_load @@ -319,8 +319,8 @@ 305 i386 readlinkat sys_readlinkat __ia32_sys_readlinkat 306 i386 fchmodat sys_fchmodat __ia32_sys_fchmodat 307 i386 faccessat sys_faccessat __ia32_sys_faccessat -308 i386 pselect6 sys_pselect6 __ia32_compat_sys_pselect6_time32 -309 i386 ppoll sys_ppoll __ia32_compat_sys_ppoll_time32 +308 i386 pselect6 sys_pselect6_time32 __ia32_compat_sys_pselect6_time32 +309 i386 ppoll sys_ppoll_time32 __ia32_compat_sys_ppoll_time32 310 i386 unshare sys_unshare __ia32_sys_unshare 311 i386 set_robust_list sys_set_robust_list __ia32_compat_sys_set_robust_list 312 i386 get_robust_list sys_get_robust_list __ia32_compat_sys_get_robust_list @@ -331,13 +331,13 @@ 317 i386 move_pages sys_move_pages __ia32_compat_sys_move_pages 318 i386 getcpu sys_getcpu __ia32_sys_getcpu 319 i386 epoll_pwait sys_epoll_pwait __ia32_sys_epoll_pwait -320 i386 utimensat sys_utimensat __ia32_sys_utimensat_time32 +320 i386 utimensat sys_utimensat_time32 __ia32_sys_utimensat_time32 321 i386 signalfd sys_signalfd __ia32_compat_sys_signalfd 322 i386 timerfd_create sys_timerfd_create __ia32_sys_timerfd_create 323 i386 eventfd sys_eventfd __ia32_sys_eventfd 324 i386 fallocate sys_fallocate __ia32_compat_sys_x86_fallocate -325 i386 timerfd_settime sys_timerfd_settime __ia32_sys_timerfd_settime32 -326 i386 timerfd_gettime sys_timerfd_gettime __ia32_sys_timerfd_gettime32 +325 i386 timerfd_settime sys_timerfd_settime32 __ia32_sys_timerfd_settime32 +326 i386 timerfd_gettime sys_timerfd_gettime32 __ia32_sys_timerfd_gettime32 327 i386 signalfd4 sys_signalfd4 __ia32_compat_sys_signalfd4 328 i386 eventfd2 sys_eventfd2 __ia32_sys_eventfd2 329 i386 epoll_create1 sys_epoll_create1 __ia32_sys_epoll_create1 @@ -348,13 +348,13 @@ 334 i386 pwritev sys_pwritev __ia32_compat_sys_pwritev 335 i386 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo __ia32_compat_sys_rt_tgsigqueueinfo 336 i386 perf_event_open sys_perf_event_open __ia32_sys_perf_event_open -337 i386 recvmmsg sys_recvmmsg __ia32_compat_sys_recvmmsg_time32 +337 i386 recvmmsg sys_recvmmsg_time32 __ia32_compat_sys_recvmmsg_time32 338 i386 fanotify_init sys_fanotify_init __ia32_sys_fanotify_init 339 i386 fanotify_mark sys_fanotify_mark __ia32_compat_sys_fanotify_mark 340 i386 prlimit64 sys_prlimit64 __ia32_sys_prlimit64 341 i386 name_to_handle_at sys_name_to_handle_at __ia32_sys_name_to_handle_at 342 i386 open_by_handle_at sys_open_by_handle_at __ia32_compat_sys_open_by_handle_at -343 i386 clock_adjtime sys_clock_adjtime __ia32_sys_clock_adjtime32 +343 i386 clock_adjtime sys_clock_adjtime32 __ia32_sys_clock_adjtime32 344 i386 syncfs sys_syncfs __ia32_sys_syncfs 345 i386 sendmmsg sys_sendmmsg __ia32_compat_sys_sendmmsg 346 i386 setns sys_setns __ia32_sys_setns @@ -396,7 +396,7 @@ 382 i386 pkey_free sys_pkey_free __ia32_sys_pkey_free 383 i386 statx sys_statx __ia32_sys_statx 384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl -385 i386 io_pgetevents sys_io_pgetevents __ia32_compat_sys_io_pgetevents +385 i386 io_pgetevents sys_io_pgetevents_time32 __ia32_compat_sys_io_pgetevents 386 i386 rseq sys_rseq __ia32_sys_rseq # don't use numbers 387 through 392, add new calls at the end 393 i386 semget sys_semget __ia32_sys_semget diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index c699e014e0dd..6f05bc8f015a 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -174,7 +174,7 @@ 158 common capget sys_capget 159 common capset sys_capset 160 common ptrace sys_ptrace -161 common semtimedop sys_semtimedop +161 common semtimedop sys_semtimedop_time32 162 common semget sys_semget 163 common semop sys_semop 164 common semctl sys_old_semctl @@ -206,11 +206,11 @@ 188 common setrlimit sys_setrlimit 189 common getrlimit sys_getrlimit 190 common getrusage sys_getrusage -191 common futex sys_futex +191 common futex sys_futex_time32 192 common gettimeofday sys_gettimeofday 193 common settimeofday sys_settimeofday -194 common adjtimex sys_adjtimex -195 common nanosleep sys_nanosleep +194 common adjtimex sys_adjtimex_time32 +195 common nanosleep sys_nanosleep_time32 196 common getgroups sys_getgroups 197 common setgroups sys_setgroups 198 common sethostname sys_sethostname @@ -234,7 +234,7 @@ 215 common sched_getscheduler sys_sched_getscheduler 216 common sched_get_priority_max sys_sched_get_priority_max 217 common sched_get_priority_min sys_sched_get_priority_min -218 common sched_rr_get_interval sys_sched_rr_get_interval +218 common sched_rr_get_interval sys_sched_rr_get_interval_time32 219 common sched_yield sys_sched_yield 222 common available222 sys_ni_syscall # Signal Handling @@ -244,14 +244,14 @@ 226 common rt_sigaction sys_rt_sigaction 227 common rt_sigprocmask sys_rt_sigprocmask 228 common rt_sigpending sys_rt_sigpending -229 common rt_sigtimedwait sys_rt_sigtimedwait +229 common rt_sigtimedwait sys_rt_sigtimedwait_time32 230 common rt_sigqueueinfo sys_rt_sigqueueinfo 231 common rt_sigsuspend sys_rt_sigsuspend # Message 232 common mq_open sys_mq_open 233 common mq_unlink sys_mq_unlink -234 common mq_timedsend sys_mq_timedsend -235 common mq_timedreceive sys_mq_timedreceive +234 common mq_timedsend sys_mq_timedsend_time32 +235 common mq_timedreceive sys_mq_timedreceive_time32 236 common mq_notify sys_mq_notify 237 common mq_getsetattr sys_mq_getsetattr 238 common available238 sys_ni_syscall @@ -259,17 +259,17 @@ # IO 240 common io_destroy sys_io_destroy 241 common io_submit sys_io_submit -242 common io_getevents sys_io_getevents +242 common io_getevents sys_io_getevents_time32 243 common io_cancel sys_io_cancel -244 common clock_settime sys_clock_settime -245 common clock_gettime sys_clock_gettime -246 common clock_getres sys_clock_getres -247 common clock_nanosleep sys_clock_nanosleep +244 common clock_settime sys_clock_settime32 +245 common clock_gettime sys_clock_gettime32 +246 common clock_getres sys_clock_getres_time32 +247 common clock_nanosleep sys_clock_nanosleep_time32 # Timer 248 common timer_create sys_timer_create 249 common timer_delete sys_timer_delete -250 common timer_settime sys_timer_settime -251 common timer_gettime sys_timer_gettime +250 common timer_settime sys_timer_settime32 +251 common timer_gettime sys_timer_gettime32 252 common timer_getoverrun sys_timer_getoverrun # System 253 common reserved253 sys_ni_syscall @@ -291,8 +291,8 @@ 269 common tee sys_tee 270 common vmsplice sys_vmsplice 271 common available271 sys_ni_syscall -272 common pselect6 sys_pselect6 -273 common ppoll sys_ppoll +272 common pselect6 sys_pselect6_time32 +273 common ppoll sys_ppoll_time32 274 common epoll_pwait sys_epoll_pwait 275 common epoll_create1 sys_epoll_create1 276 common inotify_init sys_inotify_init @@ -316,7 +316,7 @@ 293 common linkat sys_linkat 294 common symlinkat sys_symlinkat 295 common readlinkat sys_readlinkat -296 common utimensat sys_utimensat +296 common utimensat sys_utimensat_time32 297 common fchownat sys_fchownat 298 common futimesat sys_futimesat 299 common fstatat64 sys_fstatat64 @@ -327,14 +327,14 @@ 304 common signalfd sys_signalfd # 305 was timerfd 306 common eventfd sys_eventfd -307 common recvmmsg sys_recvmmsg +307 common recvmmsg sys_recvmmsg_time32 308 common setns sys_setns 309 common signalfd4 sys_signalfd4 310 common dup3 sys_dup3 311 common pipe2 sys_pipe2 312 common timerfd_create sys_timerfd_create -313 common timerfd_settime sys_timerfd_settime -314 common timerfd_gettime sys_timerfd_gettime +313 common timerfd_settime sys_timerfd_settime32 +314 common timerfd_gettime sys_timerfd_gettime32 315 common available315 sys_ni_syscall 316 common eventfd2 sys_eventfd2 317 common preadv sys_preadv @@ -349,7 +349,7 @@ 326 common sync_file_range2 sys_sync_file_range2 327 common perf_event_open sys_perf_event_open 328 common rt_tgsigqueueinfo sys_rt_tgsigqueueinfo -329 common clock_adjtime sys_clock_adjtime +329 common clock_adjtime sys_clock_adjtime32 330 common prlimit64 sys_prlimit64 331 common kcmp sys_kcmp 332 common finit_module sys_finit_module diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 153b55b94234..ab1831769030 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -39,7 +39,7 @@ __SC_COMP(__NR_io_submit, sys_io_submit, compat_sys_io_submit) #define __NR_io_cancel 3 __SYSCALL(__NR_io_cancel, sys_io_cancel) #define __NR_io_getevents 4 -__SC_COMP(__NR_io_getevents, sys_io_getevents, sys_io_getevents_time32) +__SC_3264(__NR_io_getevents, sys_io_getevents_time32, sys_io_getevents) /* fs/xattr.c */ #define __NR_setxattr 5 @@ -223,9 +223,9 @@ __SYSCALL(__NR3264_sendfile, sys_sendfile64) /* fs/select.c */ #define __NR_pselect6 72 -__SC_COMP(__NR_pselect6, sys_pselect6, compat_sys_pselect6_time32) +__SC_COMP_3264(__NR_pselect6, sys_pselect6_time32, sys_pselect6, compat_sys_pselect6_time32) #define __NR_ppoll 73 -__SC_COMP(__NR_ppoll, sys_ppoll, compat_sys_ppoll_time32) +__SC_COMP_3264(__NR_ppoll, sys_ppoll_time32, sys_ppoll, compat_sys_ppoll_time32) /* fs/signalfd.c */ #define __NR_signalfd4 74 @@ -270,15 +270,15 @@ __SC_COMP(__NR_sync_file_range, sys_sync_file_range, \ #define __NR_timerfd_create 85 __SYSCALL(__NR_timerfd_create, sys_timerfd_create) #define __NR_timerfd_settime 86 -__SC_COMP(__NR_timerfd_settime, sys_timerfd_settime, \ - sys_timerfd_settime32) +__SC_3264(__NR_timerfd_settime, sys_timerfd_settime32, \ + sys_timerfd_settime) #define __NR_timerfd_gettime 87 -__SC_COMP(__NR_timerfd_gettime, sys_timerfd_gettime, \ - sys_timerfd_gettime32) +__SC_3264(__NR_timerfd_gettime, sys_timerfd_gettime32, \ + sys_timerfd_gettime) /* fs/utimes.c */ #define __NR_utimensat 88 -__SC_COMP(__NR_utimensat, sys_utimensat, sys_utimensat_time32) +__SC_3264(__NR_utimensat, sys_utimensat_time32, sys_utimensat) /* kernel/acct.c */ #define __NR_acct 89 @@ -310,7 +310,7 @@ __SYSCALL(__NR_unshare, sys_unshare) /* kernel/futex.c */ #define __NR_futex 98 -__SC_COMP(__NR_futex, sys_futex, sys_futex_time32) +__SC_3264(__NR_futex, sys_futex_time32, sys_futex) #define __NR_set_robust_list 99 __SC_COMP(__NR_set_robust_list, sys_set_robust_list, \ compat_sys_set_robust_list) @@ -320,7 +320,7 @@ __SC_COMP(__NR_get_robust_list, sys_get_robust_list, \ /* kernel/hrtimer.c */ #define __NR_nanosleep 101 -__SC_COMP(__NR_nanosleep, sys_nanosleep, sys_nanosleep_time32) +__SC_3264(__NR_nanosleep, sys_nanosleep_time32, sys_nanosleep) /* kernel/itimer.c */ #define __NR_getitimer 102 @@ -342,22 +342,22 @@ __SYSCALL(__NR_delete_module, sys_delete_module) #define __NR_timer_create 107 __SC_COMP(__NR_timer_create, sys_timer_create, compat_sys_timer_create) #define __NR_timer_gettime 108 -__SC_COMP(__NR_timer_gettime, sys_timer_gettime, sys_timer_gettime32) +__SC_3264(__NR_timer_gettime, sys_timer_gettime32, sys_timer_gettime) #define __NR_timer_getoverrun 109 __SYSCALL(__NR_timer_getoverrun, sys_timer_getoverrun) #define __NR_timer_settime 110 -__SC_COMP(__NR_timer_settime, sys_timer_settime, sys_timer_settime32) +__SC_3264(__NR_timer_settime, sys_timer_settime32, sys_timer_settime) #define __NR_timer_delete 111 __SYSCALL(__NR_timer_delete, sys_timer_delete) #define __NR_clock_settime 112 -__SC_COMP(__NR_clock_settime, sys_clock_settime, sys_clock_settime32) +__SC_3264(__NR_clock_settime, sys_clock_settime32, sys_clock_settime) #define __NR_clock_gettime 113 -__SC_COMP(__NR_clock_gettime, sys_clock_gettime, sys_clock_gettime32) +__SC_3264(__NR_clock_gettime, sys_clock_gettime32, sys_clock_gettime) #define __NR_clock_getres 114 -__SC_COMP(__NR_clock_getres, sys_clock_getres, sys_clock_getres_time32) +__SC_3264(__NR_clock_getres, sys_clock_getres_time32, sys_clock_getres) #define __NR_clock_nanosleep 115 -__SC_COMP(__NR_clock_nanosleep, sys_clock_nanosleep, \ - sys_clock_nanosleep_time32) +__SC_3264(__NR_clock_nanosleep, sys_clock_nanosleep_time32, \ + sys_clock_nanosleep) /* kernel/printk.c */ #define __NR_syslog 116 @@ -389,8 +389,8 @@ __SYSCALL(__NR_sched_get_priority_max, sys_sched_get_priority_max) #define __NR_sched_get_priority_min 126 __SYSCALL(__NR_sched_get_priority_min, sys_sched_get_priority_min) #define __NR_sched_rr_get_interval 127 -__SC_COMP(__NR_sched_rr_get_interval, sys_sched_rr_get_interval, \ - sys_sched_rr_get_interval_time32) +__SC_3264(__NR_sched_rr_get_interval, sys_sched_rr_get_interval_time32, \ + sys_sched_rr_get_interval) /* kernel/signal.c */ #define __NR_restart_syscall 128 @@ -412,8 +412,8 @@ __SC_COMP(__NR_rt_sigprocmask, sys_rt_sigprocmask, compat_sys_rt_sigprocmask) #define __NR_rt_sigpending 136 __SC_COMP(__NR_rt_sigpending, sys_rt_sigpending, compat_sys_rt_sigpending) #define __NR_rt_sigtimedwait 137 -__SC_COMP(__NR_rt_sigtimedwait, sys_rt_sigtimedwait, \ - compat_sys_rt_sigtimedwait_time32) +__SC_COMP_3264(__NR_rt_sigtimedwait, sys_rt_sigtimedwait_time32, \ + sys_rt_sigtimedwait, compat_sys_rt_sigtimedwait_time32) #define __NR_rt_sigqueueinfo 138 __SC_COMP(__NR_rt_sigqueueinfo, sys_rt_sigqueueinfo, \ compat_sys_rt_sigqueueinfo) @@ -486,7 +486,7 @@ __SC_COMP(__NR_gettimeofday, sys_gettimeofday, compat_sys_gettimeofday) #define __NR_settimeofday 170 __SC_COMP(__NR_settimeofday, sys_settimeofday, compat_sys_settimeofday) #define __NR_adjtimex 171 -__SC_COMP(__NR_adjtimex, sys_adjtimex, sys_adjtimex_time32) +__SC_3264(__NR_adjtimex, sys_adjtimex_time32, sys_adjtimex) /* kernel/timer.c */ #define __NR_getpid 172 @@ -512,10 +512,10 @@ __SC_COMP(__NR_mq_open, sys_mq_open, compat_sys_mq_open) #define __NR_mq_unlink 181 __SYSCALL(__NR_mq_unlink, sys_mq_unlink) #define __NR_mq_timedsend 182 -__SC_COMP(__NR_mq_timedsend, sys_mq_timedsend, sys_mq_timedsend_time32) +__SC_3264(__NR_mq_timedsend, sys_mq_timedsend_time32, sys_mq_timedsend) #define __NR_mq_timedreceive 183 -__SC_COMP(__NR_mq_timedreceive, sys_mq_timedreceive, \ - sys_mq_timedreceive_time32) +__SC_3264(__NR_mq_timedreceive, sys_mq_timedreceive_time32, \ + sys_mq_timedreceive) #define __NR_mq_notify 184 __SC_COMP(__NR_mq_notify, sys_mq_notify, compat_sys_mq_notify) #define __NR_mq_getsetattr 185 @@ -659,7 +659,7 @@ __SYSCALL(__NR_perf_event_open, sys_perf_event_open) #define __NR_accept4 242 __SYSCALL(__NR_accept4, sys_accept4) #define __NR_recvmmsg 243 -__SC_COMP(__NR_recvmmsg, sys_recvmmsg, compat_sys_recvmmsg_time32) +__SC_COMP_3264(__NR_recvmmsg, sys_recvmmsg_time32, sys_recvmmsg, compat_sys_recvmmsg_time32) /* * Architectures may provide up to 16 syscalls of their own @@ -681,7 +681,7 @@ __SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at) __SC_COMP(__NR_open_by_handle_at, sys_open_by_handle_at, \ compat_sys_open_by_handle_at) #define __NR_clock_adjtime 266 -__SC_COMP(__NR_clock_adjtime, sys_clock_adjtime, sys_clock_adjtime32) +__SC_3264(__NR_clock_adjtime, sys_clock_adjtime32, sys_clock_adjtime) #define __NR_syncfs 267 __SYSCALL(__NR_syncfs, sys_syncfs) #define __NR_setns 268 @@ -735,7 +735,7 @@ __SYSCALL(__NR_pkey_free, sys_pkey_free) #define __NR_statx 291 __SYSCALL(__NR_statx, sys_statx) #define __NR_io_pgetevents 292 -__SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents) +__SC_COMP_3264(__NR_io_pgetevents, sys_io_pgetevents_time32, sys_io_pgetevents, compat_sys_io_pgetevents) #define __NR_rseq 293 __SYSCALL(__NR_rseq, sys_rseq) #define __NR_kexec_file_load 294 -- cgit v1.3-7-g2ca7 From c70a772fda11570ebddecbce1543a3fda008db4a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 7 Jan 2019 00:00:34 +0100 Subject: y2038: remove struct definition redirects We now use 64-bit time_t on all architectures, so the __kernel_timex, __kernel_timeval and __kernel_timespec redirects can be removed after having served their purpose. This makes it all much less confusing, as the __kernel_* types now always refer to the same layout based on 64-bit time_t across all 32-bit and 64-bit architectures. Signed-off-by: Arnd Bergmann --- include/linux/time64.h | 8 -------- include/linux/timex.h | 7 ------- include/uapi/linux/time.h | 4 ---- include/uapi/linux/timex.h | 2 -- 4 files changed, 21 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/time64.h b/include/linux/time64.h index 05634afba0db..f38d382ffec1 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -7,14 +7,6 @@ typedef __s64 time64_t; typedef __u64 timeu64_t; -/* CONFIG_64BIT_TIME enables new 64 bit time_t syscalls in the compat path - * and 32-bit emulation. - */ -#ifndef CONFIG_64BIT_TIME -#define __kernel_timespec timespec -#define __kernel_itimerspec itimerspec -#endif - #include struct timespec64 { diff --git a/include/linux/timex.h b/include/linux/timex.h index 4aff9f0d1367..ce0859763670 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -53,13 +53,6 @@ #ifndef _LINUX_TIMEX_H #define _LINUX_TIMEX_H -/* CONFIG_64BIT_TIME enables new 64 bit time_t syscalls in the compat path - * and 32-bit emulation. - */ -#ifndef CONFIG_64BIT_TIME -#define __kernel_timex timex -#endif - #include #define ADJ_ADJTIME 0x8000 /* switch between adjtime/adjtimex modes */ diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h index 6b56a2208be7..b03f8717c312 100644 --- a/include/uapi/linux/time.h +++ b/include/uapi/linux/time.h @@ -42,19 +42,15 @@ struct itimerval { struct timeval it_value; /* current value */ }; -#ifndef __kernel_timespec struct __kernel_timespec { __kernel_time64_t tv_sec; /* seconds */ long long tv_nsec; /* nanoseconds */ }; -#endif -#ifndef __kernel_itimerspec struct __kernel_itimerspec { struct __kernel_timespec it_interval; /* timer period */ struct __kernel_timespec it_value; /* timer expiration */ }; -#endif /* * legacy timeval structure, only embedded in structures that diff --git a/include/uapi/linux/timex.h b/include/uapi/linux/timex.h index a1c6b73016a5..9f517f9010bb 100644 --- a/include/uapi/linux/timex.h +++ b/include/uapi/linux/timex.h @@ -97,7 +97,6 @@ struct __kernel_timex_timeval { long long tv_usec; }; -#ifndef __kernel_timex struct __kernel_timex { unsigned int modes; /* mode selector */ int :32; /* pad */ @@ -131,7 +130,6 @@ struct __kernel_timex { int :32; int :32; int :32; int :32; int :32; int :32; int :32; }; -#endif /* * Mode codes (timex.mode) -- cgit v1.3-7-g2ca7 From 48166e6ea47d23984f0b481ca199250e1ce0730a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 10 Jan 2019 12:45:11 +0100 Subject: y2038: add 64-bit time_t syscalls to all 32-bit architectures This adds 21 new system calls on each ABI that has 32-bit time_t today. All of these have the exact same semantics as their existing counterparts, and the new ones all have macro names that end in 'time64' for clarification. This gets us to the point of being able to safely use a C library that has 64-bit time_t in user space. There are still a couple of loose ends to tie up in various areas of the code, but this is the big one, and should be entirely uncontroversial at this point. In particular, there are four system calls (getitimer, setitimer, waitid, and getrusage) that don't have a 64-bit counterpart yet, but these can all be safely implemented in the C library by wrapping around the existing system calls because the 32-bit time_t they pass only counts elapsed time, not time since the epoch. They will be dealt with later. Signed-off-by: Arnd Bergmann Acked-by: Heiko Carstens Acked-by: Geert Uytterhoeven Acked-by: Catalin Marinas --- arch/alpha/kernel/syscalls/syscall.tbl | 2 ++ arch/arm/tools/syscall.tbl | 21 ++++++++++++++ arch/arm64/include/asm/unistd.h | 2 +- arch/arm64/include/asm/unistd32.h | 41 +++++++++++++++++++++++++++ arch/ia64/kernel/syscalls/syscall.tbl | 1 + arch/m68k/kernel/syscalls/syscall.tbl | 20 +++++++++++++ arch/microblaze/kernel/syscalls/syscall.tbl | 21 ++++++++++++++ arch/mips/kernel/syscalls/syscall_n32.tbl | 21 ++++++++++++++ arch/mips/kernel/syscalls/syscall_n64.tbl | 1 + arch/mips/kernel/syscalls/syscall_o32.tbl | 20 +++++++++++++ arch/parisc/kernel/syscalls/syscall.tbl | 21 ++++++++++++++ arch/powerpc/kernel/syscalls/syscall.tbl | 20 +++++++++++++ arch/s390/kernel/syscalls/syscall.tbl | 20 +++++++++++++ arch/sh/kernel/syscalls/syscall.tbl | 20 +++++++++++++ arch/sparc/kernel/syscalls/syscall.tbl | 20 +++++++++++++ arch/x86/entry/syscalls/syscall_32.tbl | 20 +++++++++++++ arch/xtensa/kernel/syscalls/syscall.tbl | 21 ++++++++++++++ include/uapi/asm-generic/unistd.h | 44 ++++++++++++++++++++++++++++- scripts/checksyscalls.sh | 40 ++++++++++++++++++++++++++ 19 files changed, 374 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 340b88dd397e..63ed39cbd3bd 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -461,3 +461,5 @@ 530 common getegid sys_getegid 531 common geteuid sys_geteuid 532 common getppid sys_getppid +# all other architectures have common numbers for new syscall, alpha +# is the exception. diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index a96d9b5ee04e..9016f4081bb9 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -416,3 +416,24 @@ 399 common io_pgetevents sys_io_pgetevents_time32 400 common migrate_pages sys_migrate_pages 401 common kexec_file_load sys_kexec_file_load +# 402 is unused +403 common clock_gettime64 sys_clock_gettime +404 common clock_settime64 sys_clock_settime +405 common clock_adjtime64 sys_clock_adjtime +406 common clock_getres_time64 sys_clock_getres +407 common clock_nanosleep_time64 sys_clock_nanosleep +408 common timer_gettime64 sys_timer_gettime +409 common timer_settime64 sys_timer_settime +410 common timerfd_gettime64 sys_timerfd_gettime +411 common timerfd_settime64 sys_timerfd_settime +412 common utimensat_time64 sys_utimensat +413 common pselect6_time64 sys_pselect6 +414 common ppoll_time64 sys_ppoll +416 common io_pgetevents_time64 sys_io_pgetevents +417 common recvmmsg_time64 sys_recvmmsg +418 common mq_timedsend_time64 sys_mq_timedsend +419 common mq_timedreceive_time64 sys_mq_timedreceive +420 common semtimedop_time64 sys_semtimedop +421 common rt_sigtimedwait_time64 sys_rt_sigtimedwait +422 common futex_time64 sys_futex +423 common sched_rr_get_interval_time64 sys_sched_rr_get_interval diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 2c30e6f145ff..d1dd93436e1e 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -44,7 +44,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 402 +#define __NR_compat_syscalls 424 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 1ded82857161..5590f2623690 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -825,6 +825,47 @@ __SYSCALL(__NR_io_pgetevents, compat_sys_io_pgetevents) __SYSCALL(__NR_migrate_pages, compat_sys_migrate_pages) #define __NR_kexec_file_load 401 __SYSCALL(__NR_kexec_file_load, sys_kexec_file_load) +/* 402 is unused */ +#define __NR_clock_gettime64 403 +__SYSCALL(__NR_clock_gettime64, sys_clock_gettime) +#define __NR_clock_settime64 404 +__SYSCALL(__NR_clock_settime64, sys_clock_settime) +#define __NR_clock_adjtime64 405 +__SYSCALL(__NR_clock_adjtime64, sys_clock_adjtime) +#define __NR_clock_getres_time64 406 +__SYSCALL(__NR_clock_getres_time64, sys_clock_getres) +#define __NR_clock_nanosleep_time64 407 +__SYSCALL(__NR_clock_nanosleep_time64, sys_clock_nanosleep) +#define __NR_timer_gettime64 408 +__SYSCALL(__NR_timer_gettime64, sys_timer_gettime) +#define __NR_timer_settime64 409 +__SYSCALL(__NR_timer_settime64, sys_timer_settime) +#define __NR_timerfd_gettime64 410 +__SYSCALL(__NR_timerfd_gettime64, sys_timerfd_gettime) +#define __NR_timerfd_settime64 411 +__SYSCALL(__NR_timerfd_settime64, sys_timerfd_settime) +#define __NR_utimensat_time64 412 +__SYSCALL(__NR_utimensat_time64, sys_utimensat) +#define __NR_pselect6_time64 413 +__SYSCALL(__NR_pselect6_time64, compat_sys_pselect6_time64) +#define __NR_ppoll_time64 414 +__SYSCALL(__NR_ppoll_time64, compat_sys_ppoll_time64) +#define __NR_io_pgetevents_time64 416 +__SYSCALL(__NR_io_pgetevents_time64, sys_io_pgetevents) +#define __NR_recvmmsg_time64 417 +__SYSCALL(__NR_recvmmsg_time64, compat_sys_recvmmsg_time64) +#define __NR_mq_timedsend_time64 418 +__SYSCALL(__NR_mq_timedsend_time64, sys_mq_timedsend) +#define __NR_mq_timedreceive_time64 419 +__SYSCALL(__NR_mq_timedreceive_time64, sys_mq_timedreceive) +#define __NR_semtimedop_time64 420 +__SYSCALL(__NR_semtimedop_time64, sys_semtimedop) +#define __NR_rt_sigtimedwait_time64 421 +__SYSCALL(__NR_rt_sigtimedwait_time64, compat_sys_rt_sigtimedwait_time64) +#define __NR_futex_time64 422 +__SYSCALL(__NR_futex_time64, sys_futex) +#define __NR_sched_rr_get_interval_time64 423 +__SYSCALL(__NR_sched_rr_get_interval_time64, sys_sched_rr_get_interval) /* * Please add new compat syscalls above this comment and update diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index 52319006de0d..ab9cda5f6136 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -343,3 +343,4 @@ 331 common pkey_alloc sys_pkey_alloc 332 common pkey_free sys_pkey_free 333 common rseq sys_rseq +# 334 through 423 are reserved to sync up with other architectures diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 253bd2a069bd..125c14178979 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -403,3 +403,23 @@ 400 common msgsnd sys_msgsnd 401 common msgrcv sys_msgrcv 402 common msgctl sys_msgctl +403 common clock_gettime64 sys_clock_gettime +404 common clock_settime64 sys_clock_settime +405 common clock_adjtime64 sys_clock_adjtime +406 common clock_getres_time64 sys_clock_getres +407 common clock_nanosleep_time64 sys_clock_nanosleep +408 common timer_gettime64 sys_timer_gettime +409 common timer_settime64 sys_timer_settime +410 common timerfd_gettime64 sys_timerfd_gettime +411 common timerfd_settime64 sys_timerfd_settime +412 common utimensat_time64 sys_utimensat +413 common pselect6_time64 sys_pselect6 +414 common ppoll_time64 sys_ppoll +416 common io_pgetevents_time64 sys_io_pgetevents +417 common recvmmsg_time64 sys_recvmmsg +418 common mq_timedsend_time64 sys_mq_timedsend +419 common mq_timedreceive_time64 sys_mq_timedreceive +420 common semtimedop_time64 sys_semtimedop +421 common rt_sigtimedwait_time64 sys_rt_sigtimedwait +422 common futex_time64 sys_futex +423 common sched_rr_get_interval_time64 sys_sched_rr_get_interval diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 44a87649d681..8ee3a8c18498 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -408,3 +408,24 @@ 398 common statx sys_statx 399 common io_pgetevents sys_io_pgetevents_time32 400 common rseq sys_rseq +# 401 and 402 are unused +403 common clock_gettime64 sys_clock_gettime +404 common clock_settime64 sys_clock_settime +405 common clock_adjtime64 sys_clock_adjtime +406 common clock_getres_time64 sys_clock_getres +407 common clock_nanosleep_time64 sys_clock_nanosleep +408 common timer_gettime64 sys_timer_gettime +409 common timer_settime64 sys_timer_settime +410 common timerfd_gettime64 sys_timerfd_gettime +411 common timerfd_settime64 sys_timerfd_settime +412 common utimensat_time64 sys_utimensat +413 common pselect6_time64 sys_pselect6 +414 common ppoll_time64 sys_ppoll +416 common io_pgetevents_time64 sys_io_pgetevents +417 common recvmmsg_time64 sys_recvmmsg +418 common mq_timedsend_time64 sys_mq_timedsend +419 common mq_timedreceive_time64 sys_mq_timedreceive +420 common semtimedop_time64 sys_semtimedop +421 common rt_sigtimedwait_time64 sys_rt_sigtimedwait +422 common futex_time64 sys_futex +423 common sched_rr_get_interval_time64 sys_sched_rr_get_interval diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 6d1e019817c8..15f4117900ee 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -341,3 +341,24 @@ 330 n32 statx sys_statx 331 n32 rseq sys_rseq 332 n32 io_pgetevents compat_sys_io_pgetevents +# 333 through 402 are unassigned to sync up with generic numbers +403 n32 clock_gettime64 sys_clock_gettime +404 n32 clock_settime64 sys_clock_settime +405 n32 clock_adjtime64 sys_clock_adjtime +406 n32 clock_getres_time64 sys_clock_getres +407 n32 clock_nanosleep_time64 sys_clock_nanosleep +408 n32 timer_gettime64 sys_timer_gettime +409 n32 timer_settime64 sys_timer_settime +410 n32 timerfd_gettime64 sys_timerfd_gettime +411 n32 timerfd_settime64 sys_timerfd_settime +412 n32 utimensat_time64 sys_utimensat +413 n32 pselect6_time64 compat_sys_pselect6_time64 +414 n32 ppoll_time64 compat_sys_ppoll_time64 +416 n32 io_pgetevents_time64 sys_io_pgetevents +417 n32 recvmmsg_time64 compat_sys_recvmmsg_time64 +418 n32 mq_timedsend_time64 sys_mq_timedsend +419 n32 mq_timedreceive_time64 sys_mq_timedreceive +420 n32 semtimedop_time64 sys_semtimedop +421 n32 rt_sigtimedwait_time64 compat_sys_rt_sigtimedwait_time64 +422 n32 futex_time64 sys_futex +423 n32 sched_rr_get_interval_time64 sys_sched_rr_get_interval diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index af0da757a7b2..c85502e67b44 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -337,3 +337,4 @@ 326 n64 statx sys_statx 327 n64 rseq sys_rseq 328 n64 io_pgetevents sys_io_pgetevents +# 329 through 423 are reserved to sync up with other architectures diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 54312c5b5343..2e063d0f837e 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -391,3 +391,23 @@ 400 o32 msgsnd sys_msgsnd compat_sys_msgsnd 401 o32 msgrcv sys_msgrcv compat_sys_msgrcv 402 o32 msgctl sys_msgctl compat_sys_msgctl +403 o32 clock_gettime64 sys_clock_gettime sys_clock_gettime +404 o32 clock_settime64 sys_clock_settime sys_clock_settime +405 o32 clock_adjtime64 sys_clock_adjtime sys_clock_adjtime +406 o32 clock_getres_time64 sys_clock_getres sys_clock_getres +407 o32 clock_nanosleep_time64 sys_clock_nanosleep sys_clock_nanosleep +408 o32 timer_gettime64 sys_timer_gettime sys_timer_gettime +409 o32 timer_settime64 sys_timer_settime sys_timer_settime +410 o32 timerfd_gettime64 sys_timerfd_gettime sys_timerfd_gettime +411 o32 timerfd_settime64 sys_timerfd_settime sys_timerfd_settime +412 o32 utimensat_time64 sys_utimensat sys_utimensat +413 o32 pselect6_time64 sys_pselect6 compat_sys_pselect6_time64 +414 o32 ppoll_time64 sys_ppoll compat_sys_ppoll_time64 +416 o32 io_pgetevents_time64 sys_io_pgetevents sys_io_pgetevents +417 o32 recvmmsg_time64 sys_recvmmsg compat_sys_recvmmsg_time64 +418 o32 mq_timedsend_time64 sys_mq_timedsend sys_mq_timedsend +419 o32 mq_timedreceive_time64 sys_mq_timedreceive sys_mq_timedreceive +420 o32 semtimedop_time64 sys_semtimedop sys_semtimedop +421 o32 rt_sigtimedwait_time64 sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time64 +422 o32 futex_time64 sys_futex sys_futex +423 o32 sched_rr_get_interval_time64 sys_sched_rr_get_interval sys_sched_rr_get_interval diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index 7eff3dc3d613..b26766c6647d 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -399,3 +399,24 @@ 352 common pkey_alloc sys_pkey_alloc 353 common pkey_free sys_pkey_free 354 common rseq sys_rseq +# 355 through 402 are unassigned to sync up with generic numbers +403 32 clock_gettime64 sys_clock_gettime sys_clock_gettime +404 32 clock_settime64 sys_clock_settime sys_clock_settime +405 32 clock_adjtime64 sys_clock_adjtime sys_clock_adjtime +406 32 clock_getres_time64 sys_clock_getres sys_clock_getres +407 32 clock_nanosleep_time64 sys_clock_nanosleep sys_clock_nanosleep +408 32 timer_gettime64 sys_timer_gettime sys_timer_gettime +409 32 timer_settime64 sys_timer_settime sys_timer_settime +410 32 timerfd_gettime64 sys_timerfd_gettime sys_timerfd_gettime +411 32 timerfd_settime64 sys_timerfd_settime sys_timerfd_settime +412 32 utimensat_time64 sys_utimensat sys_utimensat +413 32 pselect6_time64 sys_pselect6 compat_sys_pselect6_time64 +414 32 ppoll_time64 sys_ppoll compat_sys_ppoll_time64 +416 32 io_pgetevents_time64 sys_io_pgetevents sys_io_pgetevents +417 32 recvmmsg_time64 sys_recvmmsg compat_sys_recvmmsg_time64 +418 32 mq_timedsend_time64 sys_mq_timedsend sys_mq_timedsend +419 32 mq_timedreceive_time64 sys_mq_timedreceive sys_mq_timedreceive +420 32 semtimedop_time64 sys_semtimedop sys_semtimedop +421 32 rt_sigtimedwait_time64 sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time64 +422 32 futex_time64 sys_futex sys_futex +423 32 sched_rr_get_interval_time64 sys_sched_rr_get_interval sys_sched_rr_get_interval diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 500edbf9e8a6..b18abb0c3dae 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -485,3 +485,23 @@ 400 common msgsnd sys_msgsnd compat_sys_msgsnd 401 common msgrcv sys_msgrcv compat_sys_msgrcv 402 common msgctl sys_msgctl compat_sys_msgctl +403 32 clock_gettime64 sys_clock_gettime sys_clock_gettime +404 32 clock_settime64 sys_clock_settime sys_clock_settime +405 32 clock_adjtime64 sys_clock_adjtime sys_clock_adjtime +406 32 clock_getres_time64 sys_clock_getres sys_clock_getres +407 32 clock_nanosleep_time64 sys_clock_nanosleep sys_clock_nanosleep +408 32 timer_gettime64 sys_timer_gettime sys_timer_gettime +409 32 timer_settime64 sys_timer_settime sys_timer_settime +410 32 timerfd_gettime64 sys_timerfd_gettime sys_timerfd_gettime +411 32 timerfd_settime64 sys_timerfd_settime sys_timerfd_settime +412 32 utimensat_time64 sys_utimensat sys_utimensat +413 32 pselect6_time64 sys_pselect6 compat_sys_pselect6_time64 +414 32 ppoll_time64 sys_ppoll compat_sys_ppoll_time64 +416 32 io_pgetevents_time64 sys_io_pgetevents sys_io_pgetevents +417 32 recvmmsg_time64 sys_recvmmsg compat_sys_recvmmsg_time64 +418 32 mq_timedsend_time64 sys_mq_timedsend sys_mq_timedsend +419 32 mq_timedreceive_time64 sys_mq_timedreceive sys_mq_timedreceive +420 32 semtimedop_time64 sys_semtimedop sys_semtimedop +421 32 rt_sigtimedwait_time64 sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time64 +422 32 futex_time64 sys_futex sys_futex +423 32 sched_rr_get_interval_time64 sys_sched_rr_get_interval sys_sched_rr_get_interval diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 285201cf1f83..02579f95f391 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -406,3 +406,23 @@ 400 common msgsnd sys_msgsnd compat_sys_msgsnd 401 common msgrcv sys_msgrcv compat_sys_msgrcv 402 common msgctl sys_msgctl compat_sys_msgctl +403 32 clock_gettime64 - sys_clock_gettime +404 32 clock_settime64 - sys_clock_settime +405 32 clock_adjtime64 - sys_clock_adjtime +406 32 clock_getres_time64 - sys_clock_getres +407 32 clock_nanosleep_time64 - sys_clock_nanosleep +408 32 timer_gettime64 - sys_timer_gettime +409 32 timer_settime64 - sys_timer_settime +410 32 timerfd_gettime64 - sys_timerfd_gettime +411 32 timerfd_settime64 - sys_timerfd_settime +412 32 utimensat_time64 - sys_utimensat +413 32 pselect6_time64 - compat_sys_pselect6_time64 +414 32 ppoll_time64 - compat_sys_ppoll_time64 +416 32 io_pgetevents_time64 - sys_io_pgetevents +417 32 recvmmsg_time64 - compat_sys_recvmmsg_time64 +418 32 mq_timedsend_time64 - sys_mq_timedsend +419 32 mq_timedreceive_time64 - sys_mq_timedreceive +420 32 semtimedop_time64 - sys_semtimedop +421 32 rt_sigtimedwait_time64 - compat_sys_rt_sigtimedwait_time64 +422 32 futex_time64 - sys_futex +423 32 sched_rr_get_interval_time64 - sys_sched_rr_get_interval diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index 06d768c3cc4c..bfda678576e4 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -406,3 +406,23 @@ 400 common msgsnd sys_msgsnd 401 common msgrcv sys_msgrcv 402 common msgctl sys_msgctl +403 common clock_gettime64 sys_clock_gettime +404 common clock_settime64 sys_clock_settime +405 common clock_adjtime64 sys_clock_adjtime +406 common clock_getres_time64 sys_clock_getres +407 common clock_nanosleep_time64 sys_clock_nanosleep +408 common timer_gettime64 sys_timer_gettime +409 common timer_settime64 sys_timer_settime +410 common timerfd_gettime64 sys_timerfd_gettime +411 common timerfd_settime64 sys_timerfd_settime +412 common utimensat_time64 sys_utimensat +413 common pselect6_time64 sys_pselect6 +414 common ppoll_time64 sys_ppoll +416 common io_pgetevents_time64 sys_io_pgetevents +417 common recvmmsg_time64 sys_recvmmsg +418 common mq_timedsend_time64 sys_mq_timedsend +419 common mq_timedreceive_time64 sys_mq_timedreceive +420 common semtimedop_time64 sys_semtimedop +421 common rt_sigtimedwait_time64 sys_rt_sigtimedwait +422 common futex_time64 sys_futex +423 common sched_rr_get_interval_time64 sys_sched_rr_get_interval diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 99c40abd8878..b9a5a04b2d2c 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -449,3 +449,23 @@ 400 common msgsnd sys_msgsnd compat_sys_msgsnd 401 common msgrcv sys_msgrcv compat_sys_msgrcv 402 common msgctl sys_msgctl compat_sys_msgctl +403 32 clock_gettime64 sys_clock_gettime sys_clock_gettime +404 32 clock_settime64 sys_clock_settime sys_clock_settime +405 32 clock_adjtime64 sys_clock_adjtime sys_clock_adjtime +406 32 clock_getres_time64 sys_clock_getres sys_clock_getres +407 32 clock_nanosleep_time64 sys_clock_nanosleep sys_clock_nanosleep +408 32 timer_gettime64 sys_timer_gettime sys_timer_gettime +409 32 timer_settime64 sys_timer_settime sys_timer_settime +410 32 timerfd_gettime64 sys_timerfd_gettime sys_timerfd_gettime +411 32 timerfd_settime64 sys_timerfd_settime sys_timerfd_settime +412 32 utimensat_time64 sys_utimensat sys_utimensat +413 32 pselect6_time64 sys_pselect6 compat_sys_pselect6_time64 +414 32 ppoll_time64 sys_ppoll compat_sys_ppoll_time64 +416 32 io_pgetevents_time64 sys_io_pgetevents sys_io_pgetevents +417 32 recvmmsg_time64 sys_recvmmsg compat_sys_recvmmsg_time64 +418 32 mq_timedsend_time64 sys_mq_timedsend sys_mq_timedsend +419 32 mq_timedreceive_time64 sys_mq_timedreceive sys_mq_timedreceive +420 32 semtimedop_time64 sys_semtimedop sys_semtimedop +421 32 rt_sigtimedwait_time64 sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time64 +422 32 futex_time64 sys_futex sys_futex +423 32 sched_rr_get_interval_time64 sys_sched_rr_get_interval sys_sched_rr_get_interval diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 8c47c1191a53..955ab6a3b61f 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -409,3 +409,23 @@ 400 i386 msgsnd sys_msgsnd __ia32_compat_sys_msgsnd 401 i386 msgrcv sys_msgrcv __ia32_compat_sys_msgrcv 402 i386 msgctl sys_msgctl __ia32_compat_sys_msgctl +403 i386 clock_gettime64 sys_clock_gettime __ia32_sys_clock_gettime +404 i386 clock_settime64 sys_clock_settime __ia32_sys_clock_settime +405 i386 clock_adjtime64 sys_clock_adjtime __ia32_sys_clock_adjtime +406 i386 clock_getres_time64 sys_clock_getres __ia32_sys_clock_getres +407 i386 clock_nanosleep_time64 sys_clock_nanosleep __ia32_sys_clock_nanosleep +408 i386 timer_gettime64 sys_timer_gettime __ia32_sys_timer_gettime +409 i386 timer_settime64 sys_timer_settime __ia32_sys_timer_settime +410 i386 timerfd_gettime64 sys_timerfd_gettime __ia32_sys_timerfd_gettime +411 i386 timerfd_settime64 sys_timerfd_settime __ia32_sys_timerfd_settime +412 i386 utimensat_time64 sys_utimensat __ia32_sys_utimensat +413 i386 pselect6_time64 sys_pselect6 __ia32_compat_sys_pselect6_time64 +414 i386 ppoll_time64 sys_ppoll __ia32_compat_sys_ppoll_time64 +416 i386 io_pgetevents_time64 sys_io_pgetevents __ia32_sys_io_pgetevents +417 i386 recvmmsg_time64 sys_recvmmsg __ia32_compat_sys_recvmmsg_time64 +418 i386 mq_timedsend_time64 sys_mq_timedsend __ia32_sys_mq_timedsend +419 i386 mq_timedreceive_time64 sys_mq_timedreceive __ia32_sys_mq_timedreceive +420 i386 semtimedop_time64 sys_semtimedop __ia32_sys_semtimedop +421 i386 rt_sigtimedwait_time64 sys_rt_sigtimedwait __ia32_compat_sys_rt_sigtimedwait_time64 +422 i386 futex_time64 sys_futex __ia32_sys_futex +423 i386 sched_rr_get_interval_time64 sys_sched_rr_get_interval __ia32_sys_sched_rr_get_interval diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 482673389e21..6af49929de85 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -373,3 +373,24 @@ 350 common pkey_free sys_pkey_free 351 common statx sys_statx 352 common rseq sys_rseq +# 353 through 402 are unassigned to sync up with generic numbers +403 common clock_gettime64 sys_clock_gettime +404 common clock_settime64 sys_clock_settime +405 common clock_adjtime64 sys_clock_adjtime +406 common clock_getres_time64 sys_clock_getres +407 common clock_nanosleep_time64 sys_clock_nanosleep +408 common timer_gettime64 sys_timer_gettime +409 common timer_settime64 sys_timer_settime +410 common timerfd_gettime64 sys_timerfd_gettime +411 common timerfd_settime64 sys_timerfd_settime +412 common utimensat_time64 sys_utimensat +413 common pselect6_time64 sys_pselect6 +414 common ppoll_time64 sys_ppoll +416 common io_pgetevents_time64 sys_io_pgetevents +417 common recvmmsg_time64 sys_recvmmsg +418 common mq_timedsend_time64 sys_mq_timedsend +419 common mq_timedreceive_time64 sys_mq_timedreceive +420 common semtimedop_time64 sys_semtimedop +421 common rt_sigtimedwait_time64 sys_rt_sigtimedwait +422 common futex_time64 sys_futex +423 common sched_rr_get_interval_time64 sys_sched_rr_get_interval diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index ab1831769030..acf9a07ab2ff 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -741,9 +741,51 @@ __SYSCALL(__NR_rseq, sys_rseq) #define __NR_kexec_file_load 294 __SYSCALL(__NR_kexec_file_load, sys_kexec_file_load) /* 295 through 402 are unassigned to sync up with generic numbers, don't use */ +#if __BITS_PER_LONG == 32 +#define __NR_clock_gettime64 403 +__SYSCALL(__NR_clock_gettime64, sys_clock_gettime) +#define __NR_clock_settime64 404 +__SYSCALL(__NR_clock_settime64, sys_clock_settime) +#define __NR_clock_adjtime64 405 +__SYSCALL(__NR_clock_adjtime64, sys_clock_adjtime) +#define __NR_clock_getres_time64 406 +__SYSCALL(__NR_clock_getres_time64, sys_clock_getres) +#define __NR_clock_nanosleep_time64 407 +__SYSCALL(__NR_clock_nanosleep_time64, sys_clock_nanosleep) +#define __NR_timer_gettime64 408 +__SYSCALL(__NR_timer_gettime64, sys_timer_gettime) +#define __NR_timer_settime64 409 +__SYSCALL(__NR_timer_settime64, sys_timer_settime) +#define __NR_timerfd_gettime64 410 +__SYSCALL(__NR_timerfd_gettime64, sys_timerfd_gettime) +#define __NR_timerfd_settime64 411 +__SYSCALL(__NR_timerfd_settime64, sys_timerfd_settime) +#define __NR_utimensat_time64 412 +__SYSCALL(__NR_utimensat_time64, sys_utimensat) +#define __NR_pselect6_time64 413 +__SC_COMP(__NR_pselect6_time64, sys_pselect6, compat_sys_pselect6_time64) +#define __NR_ppoll_time64 414 +__SC_COMP(__NR_ppoll_time64, sys_ppoll, compat_sys_ppoll_time64) +#define __NR_io_pgetevents_time64 416 +__SYSCALL(__NR_io_pgetevents_time64, sys_io_pgetevents) +#define __NR_recvmmsg_time64 417 +__SC_COMP(__NR_recvmmsg_time64, sys_recvmmsg, compat_sys_recvmmsg_time64) +#define __NR_mq_timedsend_time64 418 +__SYSCALL(__NR_mq_timedsend_time64, sys_mq_timedsend) +#define __NR_mq_timedreceive_time64 419 +__SYSCALL(__NR_mq_timedreceive_time64, sys_mq_timedreceive) +#define __NR_semtimedop_time64 420 +__SYSCALL(__NR_semtimedop_time64, sys_semtimedop) +#define __NR_rt_sigtimedwait_time64 421 +__SC_COMP(__NR_rt_sigtimedwait_time64, sys_rt_sigtimedwait, compat_sys_rt_sigtimedwait_time64) +#define __NR_futex_time64 422 +__SYSCALL(__NR_futex_time64, sys_futex) +#define __NR_sched_rr_get_interval_time64 423 +__SYSCALL(__NR_sched_rr_get_interval_time64, sys_sched_rr_get_interval) +#endif #undef __NR_syscalls -#define __NR_syscalls 295 +#define __NR_syscalls 424 /* * 32 bit systems traditionally used different diff --git a/scripts/checksyscalls.sh b/scripts/checksyscalls.sh index cf931003395f..cc70a64fa81f 100755 --- a/scripts/checksyscalls.sh +++ b/scripts/checksyscalls.sh @@ -84,6 +84,26 @@ cat << EOF #define __IGNORE_statfs64 #define __IGNORE_llseek #define __IGNORE_mmap2 +#define __IGNORE_clock_gettime64 +#define __IGNORE_clock_settime64 +#define __IGNORE_clock_adjtime64 +#define __IGNORE_clock_getres_time64 +#define __IGNORE_clock_nanosleep_time64 +#define __IGNORE_timer_gettime64 +#define __IGNORE_timer_settime64 +#define __IGNORE_timerfd_gettime64 +#define __IGNORE_timerfd_settime64 +#define __IGNORE_utimensat_time64 +#define __IGNORE_pselect6_time64 +#define __IGNORE_ppoll_time64 +#define __IGNORE_io_pgetevents_time64 +#define __IGNORE_recvmmsg_time64 +#define __IGNORE_mq_timedsend_time64 +#define __IGNORE_mq_timedreceive_time64 +#define __IGNORE_semtimedop_time64 +#define __IGNORE_rt_sigtimedwait_time64 +#define __IGNORE_futex_time64 +#define __IGNORE_sched_rr_get_interval_time64 #else #define __IGNORE_sendfile #define __IGNORE_ftruncate @@ -98,6 +118,26 @@ cat << EOF #define __IGNORE_statfs #define __IGNORE_lseek #define __IGNORE_mmap +#define __IGNORE_clock_gettime +#define __IGNORE_clock_settime +#define __IGNORE_clock_adjtime +#define __IGNORE_clock_getres +#define __IGNORE_clock_nanosleep +#define __IGNORE_timer_gettime +#define __IGNORE_timer_settime +#define __IGNORE_timerfd_gettime +#define __IGNORE_timerfd_settime +#define __IGNORE_utimensat +#define __IGNORE_pselect6 +#define __IGNORE_ppoll +#define __IGNORE_io_pgetevents +#define __IGNORE_recvmmsg +#define __IGNORE_mq_timedsend +#define __IGNORE_mq_timedreceiv +#define __IGNORE_semtimedop +#define __IGNORE_rt_sigtimedwait +#define __IGNORE_futex +#define __IGNORE_sched_rr_get_interval #endif /* i386-specific or historical system calls */ -- cgit v1.3-7-g2ca7 From e9e0c8903009477b630e37a8b6364b26a00720da Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 10 Jan 2019 19:04:34 +0200 Subject: fanotify: encode file identifier for FAN_REPORT_FID When user requests the flag FAN_REPORT_FID in fanotify_init(), a unique file identifier of the event target object will be reported with the event. The file identifier includes the filesystem's fsid (i.e. from statfs(2)) and an NFS file handle of the file (i.e. from name_to_handle_at(2)). The file identifier makes holding the path reference and passing a file descriptor to user redundant, so those are disabled in a group with FAN_REPORT_FID. Encode fid and store it in event for a group with FAN_REPORT_FID. Up to 12 bytes of file handle on 32bit arch (16 bytes on 64bit arch) are stored inline in fanotify_event struct. Larger file handles are stored in an external allocated buffer. On failure to encode fid, we print a warning and queue the event without the fid information. [JK: Fold part of later patched into this one to use exportfs_encode_inode_fh() right away] Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.c | 84 +++++++++++++++++++++++++++++++++++--- fs/notify/fanotify/fanotify.h | 78 +++++++++++++++++++++++++++++++++-- fs/notify/fanotify/fanotify_user.c | 6 +-- include/uapi/linux/fanotify.h | 1 + 4 files changed, 156 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index d8e3b6e50844..dd33227e518a 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "fanotify.h" @@ -25,10 +26,18 @@ static bool should_merge(struct fsnotify_event *old_fsn, old = FANOTIFY_E(old_fsn); new = FANOTIFY_E(new_fsn); - if (old_fsn->inode == new_fsn->inode && old->pid == new->pid && - old->path.mnt == new->path.mnt && - old->path.dentry == new->path.dentry) - return true; + if (old_fsn->inode != new_fsn->inode || old->pid != new->pid || + old->fh_type != new->fh_type || old->fh_len != new->fh_len) + return false; + + if (fanotify_event_has_path(old)) { + return old->path.mnt == new->path.mnt && + old->path.dentry == new->path.dentry; + } else if (fanotify_event_has_fid(old)) { + return fanotify_fid_equal(&old->fid, &new->fid, old->fh_len); + } + + /* Do not merge events if we failed to encode fid */ return false; } @@ -143,6 +152,60 @@ static u32 fanotify_group_event_mask(struct fsnotify_iter_info *iter_info, ~marks_ignored_mask; } +static int fanotify_encode_fid(struct fanotify_event *event, + const struct path *path, gfp_t gfp) +{ + struct fanotify_fid *fid = &event->fid; + int dwords, bytes = 0; + struct kstatfs stat; + int err, type; + + stat.f_fsid.val[0] = stat.f_fsid.val[1] = 0; + fid->ext_fh = NULL; + dwords = 0; + err = -ENOENT; + type = exportfs_encode_inode_fh(d_inode(path->dentry), NULL, &dwords, + NULL); + if (!dwords) + goto out_err; + + err = vfs_statfs(path, &stat); + if (err) + goto out_err; + + bytes = dwords << 2; + if (bytes > FANOTIFY_INLINE_FH_LEN) { + /* Treat failure to allocate fh as failure to allocate event */ + err = -ENOMEM; + fid->ext_fh = kmalloc(bytes, gfp); + if (!fid->ext_fh) + goto out_err; + } + + type = exportfs_encode_inode_fh(d_inode(path->dentry), + fanotify_fid_fh(fid, bytes), &dwords, + NULL); + err = -EINVAL; + if (!type || type == FILEID_INVALID || bytes != dwords << 2) + goto out_err; + + fid->fsid = stat.f_fsid; + event->fh_len = bytes; + + return type; + +out_err: + pr_warn_ratelimited("fanotify: failed to encode fid (fsid=%x.%x, " + "type=%d, bytes=%d, err=%i)\n", + stat.f_fsid.val[0], stat.f_fsid.val[1], + type, bytes, err); + kfree(fid->ext_fh); + fid->ext_fh = NULL; + event->fh_len = 0; + + return FILEID_INVALID; +} + struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, struct inode *inode, u32 mask, const struct path *path) @@ -181,10 +244,16 @@ init: __maybe_unused event->pid = get_pid(task_pid(current)); else event->pid = get_pid(task_tgid(current)); - if (path) { + event->fh_len = 0; + if (path && FAN_GROUP_FLAG(group, FAN_REPORT_FID)) { + /* Report the event without a file identifier on encode error */ + event->fh_type = fanotify_encode_fid(event, path, gfp); + } else if (path) { + event->fh_type = FILEID_ROOT; event->path = *path; path_get(&event->path); } else { + event->fh_type = FILEID_INVALID; event->path.mnt = NULL; event->path.dentry = NULL; } @@ -281,7 +350,10 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event) struct fanotify_event *event; event = FANOTIFY_E(fsn_event); - path_put(&event->path); + if (fanotify_event_has_path(event)) + path_put(&event->path); + else if (fanotify_event_has_ext_fh(event)) + kfree(event->fid.ext_fh); put_pid(event->pid); if (fanotify_is_perm_event(event->mask)) { kmem_cache_free(fanotify_perm_event_cachep, diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 898b5b2bc1c7..271482fb9611 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -2,11 +2,49 @@ #include #include #include +#include extern struct kmem_cache *fanotify_mark_cache; extern struct kmem_cache *fanotify_event_cachep; extern struct kmem_cache *fanotify_perm_event_cachep; +/* + * 3 dwords are sufficient for most local fs (64bit ino, 32bit generation). + * For 32bit arch, fid increases the size of fanotify_event by 12 bytes and + * fh_* fields increase the size of fanotify_event by another 4 bytes. + * For 64bit arch, fid increases the size of fanotify_fid by 8 bytes and + * fh_* fields are packed in a hole after mask. + */ +#if BITS_PER_LONG == 32 +#define FANOTIFY_INLINE_FH_LEN (3 << 2) +#else +#define FANOTIFY_INLINE_FH_LEN (4 << 2) +#endif + +struct fanotify_fid { + __kernel_fsid_t fsid; + union { + unsigned char fh[FANOTIFY_INLINE_FH_LEN]; + unsigned char *ext_fh; + }; +}; + +static inline void *fanotify_fid_fh(struct fanotify_fid *fid, + unsigned int fh_len) +{ + return fh_len <= FANOTIFY_INLINE_FH_LEN ? fid->fh : fid->ext_fh; +} + +static inline bool fanotify_fid_equal(struct fanotify_fid *fid1, + struct fanotify_fid *fid2, + unsigned int fh_len) +{ + return fid1->fsid.val[0] == fid2->fsid.val[0] && + fid1->fsid.val[1] == fid2->fsid.val[1] && + !memcmp(fanotify_fid_fh(fid1, fh_len), + fanotify_fid_fh(fid2, fh_len), fh_len); +} + /* * Structure for normal fanotify events. It gets allocated in * fanotify_handle_event() and freed when the information is retrieved by @@ -16,13 +54,47 @@ struct fanotify_event { struct fsnotify_event fse; u32 mask; /* - * We hold ref to this path so it may be dereferenced at any point - * during this object's lifetime + * Those fields are outside fanotify_fid to pack fanotify_event nicely + * on 64bit arch and to use fh_type as an indication of whether path + * or fid are used in the union: + * FILEID_ROOT (0) for path, > 0 for fid, FILEID_INVALID for neither. */ - struct path path; + u8 fh_type; + u8 fh_len; + u16 pad; + union { + /* + * We hold ref to this path so it may be dereferenced at any + * point during this object's lifetime + */ + struct path path; + /* + * With FAN_REPORT_FID, we do not hold any reference on the + * victim object. Instead we store its NFS file handle and its + * filesystem's fsid as a unique identifier. + */ + struct fanotify_fid fid; + }; struct pid *pid; }; +static inline bool fanotify_event_has_path(struct fanotify_event *event) +{ + return event->fh_type == FILEID_ROOT; +} + +static inline bool fanotify_event_has_fid(struct fanotify_event *event) +{ + return event->fh_type != FILEID_ROOT && + event->fh_type != FILEID_INVALID; +} + +static inline bool fanotify_event_has_ext_fh(struct fanotify_event *event) +{ + return fanotify_event_has_fid(event) && + event->fh_len > FANOTIFY_INLINE_FH_LEN; +} + /* * Structure for permission fanotify events. It gets allocated and freed in * fanotify_handle_event() since we wait there for user response. When the diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 096503bd0edb..c965fcf4979e 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -181,7 +181,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, struct fanotify_event_metadata metadata; struct fanotify_event *event; struct file *f = NULL; - int fd, ret; + int ret, fd = FAN_NOFD; pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event); @@ -193,9 +193,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS; metadata.pid = pid_vnr(event->pid); - if (unlikely(event->mask & FAN_Q_OVERFLOW)) { - fd = FAN_NOFD; - } else { + if (fanotify_event_has_path(event)) { fd = create_fd(group, event, &f); if (fd < 0) return fd; diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index 909c98fcace2..d07f3cbc2786 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -44,6 +44,7 @@ /* Flags to determine fanotify event format */ #define FAN_REPORT_TID 0x00000100 /* event->pid is thread id */ +#define FAN_REPORT_FID 0x00000200 /* Report unique file id */ /* Deprecated - do not use this in programs and do not add new flags here! */ #define FAN_ALL_INIT_FLAGS (FAN_CLOEXEC | FAN_NONBLOCK | \ -- cgit v1.3-7-g2ca7 From 5e469c830fdb5a1ebaa69b375b87f583326fd296 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 10 Jan 2019 19:04:35 +0200 Subject: fanotify: copy event fid info to user If group requested FAN_REPORT_FID and event has file identifier, copy that information to user reading the event after event metadata. fid information is formatted as struct fanotify_event_info_fid that includes a generic header struct fanotify_event_info_header, so that other info types could be defined in the future using the same header. metadata->event_len includes the length of the fid information. The fid information includes the filesystem's fsid (see statfs(2)) followed by an NFS file handle of the file that could be passed as an argument to open_by_handle_at(2). Cc: Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.h | 5 +++ fs/notify/fanotify/fanotify_user.c | 82 +++++++++++++++++++++++++++++++++++--- include/uapi/linux/fanotify.h | 20 ++++++++++ 3 files changed, 102 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 271482fb9611..4aafc7144c3d 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -95,6 +95,11 @@ static inline bool fanotify_event_has_ext_fh(struct fanotify_event *event) event->fh_len > FANOTIFY_INLINE_FH_LEN; } +static inline void *fanotify_event_fh(struct fanotify_event *event) +{ + return fanotify_fid_fh(&event->fid, event->fh_len); +} + /* * Structure for permission fanotify events. It gets allocated and freed in * fanotify_handle_event() since we wait there for user response. When the diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index c965fcf4979e..cd82dd713c91 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -47,6 +47,18 @@ struct kmem_cache *fanotify_mark_cache __read_mostly; struct kmem_cache *fanotify_event_cachep __read_mostly; struct kmem_cache *fanotify_perm_event_cachep __read_mostly; +#define FANOTIFY_EVENT_ALIGN 4 + +static int fanotify_event_info_len(struct fanotify_event *event) +{ + if (!fanotify_event_has_fid(event)) + return 0; + + return roundup(sizeof(struct fanotify_event_info_fid) + + sizeof(struct file_handle) + event->fh_len, + FANOTIFY_EVENT_ALIGN); +} + /* * Get an fsnotify notification event if one exists and is small * enough to fit in "count". Return an error pointer if the count @@ -57,6 +69,9 @@ struct kmem_cache *fanotify_perm_event_cachep __read_mostly; static struct fsnotify_event *get_one_event(struct fsnotify_group *group, size_t count) { + size_t event_size = FAN_EVENT_METADATA_LEN; + struct fanotify_event *event; + assert_spin_locked(&group->notification_lock); pr_debug("%s: group=%p count=%zd\n", __func__, group, count); @@ -64,11 +79,18 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, if (fsnotify_notify_queue_is_empty(group)) return NULL; - if (FAN_EVENT_METADATA_LEN > count) + if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) { + event = FANOTIFY_E(fsnotify_peek_first_event(group)); + event_size += fanotify_event_info_len(event); + } + + if (event_size > count) return ERR_PTR(-EINVAL); - /* held the notification_lock the whole time, so this is the - * same event we peeked above */ + /* + * Held the notification_lock the whole time, so this is the + * same event we peeked above + */ return fsnotify_remove_first_event(group); } @@ -174,6 +196,48 @@ static int process_access_response(struct fsnotify_group *group, return 0; } +static int copy_fid_to_user(struct fanotify_event *event, char __user *buf) +{ + struct fanotify_event_info_fid info = { }; + struct file_handle handle = { }; + size_t fh_len = event->fh_len; + size_t len = fanotify_event_info_len(event); + + if (!len) + return 0; + + if (WARN_ON_ONCE(len < sizeof(info) + sizeof(handle) + fh_len)) + return -EFAULT; + + /* Copy event info fid header followed by vaiable sized file handle */ + info.hdr.info_type = FAN_EVENT_INFO_TYPE_FID; + info.hdr.len = len; + info.fsid = event->fid.fsid; + if (copy_to_user(buf, &info, sizeof(info))) + return -EFAULT; + + buf += sizeof(info); + len -= sizeof(info); + handle.handle_type = event->fh_type; + handle.handle_bytes = fh_len; + if (copy_to_user(buf, &handle, sizeof(handle))) + return -EFAULT; + + buf += sizeof(handle); + len -= sizeof(handle); + if (copy_to_user(buf, fanotify_event_fh(event), fh_len)) + return -EFAULT; + + /* Pad with 0's */ + buf += fh_len; + len -= fh_len; + WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN); + if (len > 0 && clear_user(buf, len)) + return -EFAULT; + + return 0; +} + static ssize_t copy_event_to_user(struct fsnotify_group *group, struct fsnotify_event *fsn_event, char __user *buf, size_t count) @@ -197,6 +261,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, fd = create_fd(group, event, &f); if (fd < 0) return fd; + } else if (fanotify_event_has_fid(event)) { + metadata.event_len += fanotify_event_info_len(event); } metadata.fd = fd; @@ -208,14 +274,20 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, if (WARN_ON_ONCE(metadata.event_len > count)) goto out_close_fd; - if (copy_to_user(buf, &metadata, metadata.event_len)) + if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN)) goto out_close_fd; if (fanotify_is_perm_event(event->mask)) FANOTIFY_PE(fsn_event)->fd = fd; - if (fd != FAN_NOFD) + if (fanotify_event_has_path(event)) { fd_install(fd, f); + } else if (fanotify_event_has_fid(event)) { + ret = copy_fid_to_user(event, buf + FAN_EVENT_METADATA_LEN); + if (ret < 0) + return ret; + } + return metadata.event_len; out_close_fd: diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index d07f3cbc2786..959ae2bdc7ca 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -107,6 +107,26 @@ struct fanotify_event_metadata { __s32 pid; }; +#define FAN_EVENT_INFO_TYPE_FID 1 + +/* Variable length info record following event metadata */ +struct fanotify_event_info_header { + __u8 info_type; + __u8 pad; + __u16 len; +}; + +/* Unique file identifier info record */ +struct fanotify_event_info_fid { + struct fanotify_event_info_header hdr; + __kernel_fsid_t fsid; + /* + * Following is an opaque struct file_handle that can be passed as + * an argument to open_by_handle_at(2). + */ + unsigned char handle[0]; +}; + struct fanotify_response { __s32 fd; __u32 response; -- cgit v1.3-7-g2ca7 From 235328d1fa4251c6dcb32351219bb553a58838d2 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 10 Jan 2019 19:04:43 +0200 Subject: fanotify: add support for create/attrib/move/delete events Add support for events with data type FSNOTIFY_EVENT_INODE (e.g. create/attrib/move/delete) for inode and filesystem mark types. The "inode" events do not carry enough information (i.e. path) to report event->fd, so we do not allow setting a mask for those events unless group supports reporting fid. The "inode" events are not supported on a mount mark, because they do not carry enough information (i.e. path) to be filtered by mount point. The "dirent" events (create/move/delete) report the fid of the parent directory where events took place without specifying the filename of the child. In the future, fanotify may get support for reporting filename information for those events. Cc: Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.c | 9 ++++++++- fs/notify/fanotify/fanotify_user.c | 12 ++++++++++++ include/linux/fanotify.h | 22 ++++++++++++++++++++-- include/uapi/linux/fanotify.h | 8 ++++++++ 4 files changed, 48 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 974239b03442..158c69acb04d 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -313,9 +313,16 @@ static int fanotify_handle_event(struct fsnotify_group *group, BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS); BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY); + BUILD_BUG_ON(FAN_ATTRIB != FS_ATTRIB); BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE); BUILD_BUG_ON(FAN_OPEN != FS_OPEN); + BUILD_BUG_ON(FAN_MOVED_TO != FS_MOVED_TO); + BUILD_BUG_ON(FAN_MOVED_FROM != FS_MOVED_FROM); + BUILD_BUG_ON(FAN_CREATE != FS_CREATE); + BUILD_BUG_ON(FAN_DELETE != FS_DELETE); + BUILD_BUG_ON(FAN_DELETE_SELF != FS_DELETE_SELF); + BUILD_BUG_ON(FAN_MOVE_SELF != FS_MOVE_SELF); BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD); BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW); BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM); @@ -324,7 +331,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC); BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM); - BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 12); + BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 19); mask = fanotify_group_event_mask(group, iter_info, mask, data, data_type); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index bf06fd6ef761..6c61a06d0ef5 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -976,6 +976,18 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, group->priority == FS_PRIO_0) goto fput_and_out; + /* + * Events with data type inode do not carry enough information to report + * event->fd, so we do not allow setting a mask for inode events unless + * group supports reporting fid. + * inode events are not supported on a mount mark, because they do not + * carry enough information (i.e. path) to be filtered by mount point. + */ + if (mask & FANOTIFY_INODE_EVENTS && + (!FAN_GROUP_FLAG(group, FAN_REPORT_FID) || + mark_type == FAN_MARK_MOUNT)) + goto fput_and_out; + if (flags & FAN_MARK_FLUSH) { ret = 0; if (mark_type == FAN_MARK_MOUNT) diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index f59be967f72b..e9d45387089f 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -35,10 +35,28 @@ FAN_MARK_IGNORED_SURV_MODIFY | \ FAN_MARK_FLUSH) -/* Events that user can request to be notified on */ -#define FANOTIFY_EVENTS (FAN_ACCESS | FAN_MODIFY | \ +/* + * Events that can be reported with data type FSNOTIFY_EVENT_PATH. + * Note that FAN_MODIFY can also be reported with data type + * FSNOTIFY_EVENT_INODE. + */ +#define FANOTIFY_PATH_EVENTS (FAN_ACCESS | FAN_MODIFY | \ FAN_CLOSE | FAN_OPEN | FAN_OPEN_EXEC) +/* + * Directory entry modification events - reported only to directory + * where entry is modified and not to a watching parent. + */ +#define FANOTIFY_DIRENT_EVENTS (FAN_MOVE | FAN_CREATE | FAN_DELETE) + +/* Events that can only be reported with data type FSNOTIFY_EVENT_INODE */ +#define FANOTIFY_INODE_EVENTS (FANOTIFY_DIRENT_EVENTS | \ + FAN_ATTRIB | FAN_MOVE_SELF | FAN_DELETE_SELF) + +/* Events that user can request to be notified on */ +#define FANOTIFY_EVENTS (FANOTIFY_PATH_EVENTS | \ + FANOTIFY_INODE_EVENTS) + /* Events that require a permission response from user */ #define FANOTIFY_PERM_EVENTS (FAN_OPEN_PERM | FAN_ACCESS_PERM | \ FAN_OPEN_EXEC_PERM) diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index 959ae2bdc7ca..b9effa6f8503 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -7,9 +7,16 @@ /* the following events that user-space can register for */ #define FAN_ACCESS 0x00000001 /* File was accessed */ #define FAN_MODIFY 0x00000002 /* File was modified */ +#define FAN_ATTRIB 0x00000004 /* Metadata changed */ #define FAN_CLOSE_WRITE 0x00000008 /* Writtable file closed */ #define FAN_CLOSE_NOWRITE 0x00000010 /* Unwrittable file closed */ #define FAN_OPEN 0x00000020 /* File was opened */ +#define FAN_MOVED_FROM 0x00000040 /* File was moved from X */ +#define FAN_MOVED_TO 0x00000080 /* File was moved to Y */ +#define FAN_CREATE 0x00000100 /* Subfile was created */ +#define FAN_DELETE 0x00000200 /* Subfile was deleted */ +#define FAN_DELETE_SELF 0x00000400 /* Self was deleted */ +#define FAN_MOVE_SELF 0x00000800 /* Self was moved */ #define FAN_OPEN_EXEC 0x00001000 /* File was opened for exec */ #define FAN_Q_OVERFLOW 0x00004000 /* Event queued overflowed */ @@ -24,6 +31,7 @@ /* helper events */ #define FAN_CLOSE (FAN_CLOSE_WRITE | FAN_CLOSE_NOWRITE) /* close */ +#define FAN_MOVE (FAN_MOVED_FROM | FAN_MOVED_TO) /* moves */ /* flags used for fanotify_init() */ #define FAN_CLOEXEC 0x00000001 -- cgit v1.3-7-g2ca7 From 95b86d1c91ad3b19f882d9e70aa37c8e99e8dc17 Mon Sep 17 00:00:00 2001 From: Devesh Sharma Date: Thu, 7 Feb 2019 01:31:27 -0500 Subject: RDMA/bnxt_re: Update kernel user abi to pass chip context User space verbs provider library would need chip context. Changing the ABI to add chip version details in structure. Furthermore, changing the kernel driver ucontext allocation code to initialize the abi structure with appropriate values. As suggested by community, appended the new fields at the bottom of the ABI structure and retaining to older fields as those were in the older versions. Keeping the ABI version at 1 and adding a new field in the ucontext response structure to hold the component mask. The user space library should check pre-defined flags to figure out if a certain feature is supported on not. Signed-off-by: Devesh Sharma Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 17 ++++++++++++++--- include/uapi/rdma/bnxt_re-abi.h | 11 +++++++++++ 2 files changed, 25 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 08c1725f371a..1d7469e23cde 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -3692,9 +3692,10 @@ struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata) { struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); + struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; struct bnxt_re_uctx_resp resp; struct bnxt_re_ucontext *uctx; - struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + u32 chip_met_rev_num = 0; int rc; dev_dbg(rdev_to_dev(rdev), "ABI version requested %d", @@ -3719,14 +3720,24 @@ struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev, } spin_lock_init(&uctx->sh_lock); - resp.dev_id = rdev->en_dev->pdev->devfn; /*Temp, Use idr_alloc instead*/ + resp.comp_mask |= BNXT_RE_UCNTX_CMASK_HAVE_CCTX; + chip_met_rev_num = rdev->chip_ctx.chip_num; + chip_met_rev_num |= ((u32)rdev->chip_ctx.chip_rev & 0xFF) << + BNXT_RE_CHIP_ID0_CHIP_REV_SFT; + chip_met_rev_num |= ((u32)rdev->chip_ctx.chip_metal & 0xFF) << + BNXT_RE_CHIP_ID0_CHIP_MET_SFT; + resp.chip_id0 = chip_met_rev_num; + /* Future extension of chip info */ + resp.chip_id1 = 0; + /*Temp, Use idr_alloc instead */ + resp.dev_id = rdev->en_dev->pdev->devfn; resp.max_qp = rdev->qplib_ctx.qpc_count; resp.pg_size = PAGE_SIZE; resp.cqe_sz = sizeof(struct cq_base); resp.max_cqd = dev_attr->max_cq_wqes; resp.rsvd = 0; - rc = ib_copy_to_udata(udata, &resp, sizeof(resp)); + rc = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp))); if (rc) { dev_err(rdev_to_dev(rdev), "Failed to copy user context"); rc = -EFAULT; diff --git a/include/uapi/rdma/bnxt_re-abi.h b/include/uapi/rdma/bnxt_re-abi.h index a7a6111e50c7..dc52e3cf574c 100644 --- a/include/uapi/rdma/bnxt_re-abi.h +++ b/include/uapi/rdma/bnxt_re-abi.h @@ -44,6 +44,14 @@ #define BNXT_RE_ABI_VERSION 1 +#define BNXT_RE_CHIP_ID0_CHIP_NUM_SFT 0x00 +#define BNXT_RE_CHIP_ID0_CHIP_REV_SFT 0x10 +#define BNXT_RE_CHIP_ID0_CHIP_MET_SFT 0x18 + +enum { + BNXT_RE_UCNTX_CMASK_HAVE_CCTX = 0x1ULL +}; + struct bnxt_re_uctx_resp { __u32 dev_id; __u32 max_qp; @@ -51,6 +59,9 @@ struct bnxt_re_uctx_resp { __u32 cqe_sz; __u32 max_cqd; __u32 rsvd; + __aligned_u64 comp_mask; + __u32 chip_id0; + __u32 chip_id1; }; /* -- cgit v1.3-7-g2ca7 From 05f8bc82fc428dbaf41764e95167dd759769f33d Mon Sep 17 00:00:00 2001 From: Randy Li Date: Thu, 10 Jan 2019 03:57:09 +0800 Subject: drm/fourcc: Add new P010, P016 video format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P010 is a planar 4:2:0 YUV with interleaved UV plane, 10 bits per channel video format. P012 is a planar 4:2:0 YUV 12 bits per channel P016 is a planar 4:2:0 YUV with interleaved UV plane, 16 bits per channel video format. V3: Added P012 and fixed cpp for P010. V4: format definition refined per review. V5: Format comment block for each new pixel format. V6: reversed Cb/Cr order in comments. v7: reversed Cb/Cr order in comments of header files, remove the wrong part of commit message. V8: reversed V7 changes except commit message and rebased. v9: used the new properties to describe those format and rebased. Cc: Daniel Stone Cc: Ville Syrjälä Signed-off-by: Randy Li Signed-off-by: Clint Taylor Reviewed-by: Ayan Kumar Halder Reviewed-by: Neil Armstrong Reviewed-by: Daniel Vetter Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20190109195710.28501-2-ayaka@soulik.info --- drivers/gpu/drm/drm_fourcc.c | 9 +++++++++ include/uapi/drm/drm_fourcc.h | 21 +++++++++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/drm_fourcc.c b/drivers/gpu/drm/drm_fourcc.c index d90ee03a84c6..ba7e19d4336c 100644 --- a/drivers/gpu/drm/drm_fourcc.c +++ b/drivers/gpu/drm/drm_fourcc.c @@ -238,6 +238,15 @@ const struct drm_format_info *__drm_format_info(u32 format) { .format = DRM_FORMAT_X0L2, .depth = 0, .num_planes = 1, .char_per_block = { 8, 0, 0 }, .block_w = { 2, 0, 0 }, .block_h = { 2, 0, 0 }, .hsub = 2, .vsub = 2, .is_yuv = true }, + { .format = DRM_FORMAT_P010, .depth = 0, .num_planes = 2, + .char_per_block = { 2, 4, 0 }, .block_w = { 1, 0, 0 }, .block_h = { 1, 0, 0 }, + .hsub = 2, .vsub = 2, .is_yuv = true}, + { .format = DRM_FORMAT_P012, .depth = 0, .num_planes = 2, + .char_per_block = { 2, 4, 0 }, .block_w = { 1, 0, 0 }, .block_h = { 1, 0, 0 }, + .hsub = 2, .vsub = 2, .is_yuv = true}, + { .format = DRM_FORMAT_P016, .depth = 0, .num_planes = 2, + .char_per_block = { 2, 4, 0 }, .block_w = { 1, 0, 0 }, .block_h = { 1, 0, 0 }, + .hsub = 2, .vsub = 2, .is_yuv = true}, }; unsigned int i; diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h index 91d08a23f125..0fa360223255 100644 --- a/include/uapi/drm/drm_fourcc.h +++ b/include/uapi/drm/drm_fourcc.h @@ -195,6 +195,27 @@ extern "C" { #define DRM_FORMAT_NV24 fourcc_code('N', 'V', '2', '4') /* non-subsampled Cr:Cb plane */ #define DRM_FORMAT_NV42 fourcc_code('N', 'V', '4', '2') /* non-subsampled Cb:Cr plane */ +/* + * 2 plane YCbCr MSB aligned + * index 0 = Y plane, [15:0] Y:x [10:6] little endian + * index 1 = Cr:Cb plane, [31:0] Cr:x:Cb:x [10:6:10:6] little endian + */ +#define DRM_FORMAT_P010 fourcc_code('P', '0', '1', '0') /* 2x2 subsampled Cr:Cb plane 10 bits per channel */ + +/* + * 2 plane YCbCr MSB aligned + * index 0 = Y plane, [15:0] Y:x [12:4] little endian + * index 1 = Cr:Cb plane, [31:0] Cr:x:Cb:x [12:4:12:4] little endian + */ +#define DRM_FORMAT_P012 fourcc_code('P', '0', '1', '2') /* 2x2 subsampled Cr:Cb plane 12 bits per channel */ + +/* + * 2 plane YCbCr MSB aligned + * index 0 = Y plane, [15:0] Y little endian + * index 1 = Cr:Cb plane, [31:0] Cr:Cb [16:16] little endian + */ +#define DRM_FORMAT_P016 fourcc_code('P', '0', '1', '6') /* 2x2 subsampled Cr:Cb plane 16 bits per channel */ + /* * 3 plane YCbCr * index 0: Y plane, [7:0] Y -- cgit v1.3-7-g2ca7 From 2c1619edef61a03cb516efaa81750784c3071d10 Mon Sep 17 00:00:00 2001 From: Danit Goldberg Date: Thu, 24 Jan 2019 14:18:15 +0200 Subject: IB/cma: Define option to set ack timeout and pack tos_set Define new option in 'rdma_set_option' to override calculated QP timeout when requested to provide QP attributes to modify a QP. At the same time, pack tos_set to be bitfield. Signed-off-by: Danit Goldberg Reviewed-by: Moni Shoua Signed-off-by: Leon Romanovsky Reviewed-by: Parav Pandit Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 32 ++++++++++++++++++++++++++++++++ drivers/infiniband/core/cma_priv.h | 4 +++- drivers/infiniband/core/ucma.c | 7 +++++++ include/rdma/rdma_cm.h | 1 + include/uapi/rdma/rdma_user_cm.h | 4 ++++ 5 files changed, 47 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index e15546ae4d0f..83aa2ad0c27e 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -888,6 +888,7 @@ struct rdma_cm_id *__rdma_create_id(struct net *net, id_priv->id.ps = ps; id_priv->id.qp_type = qp_type; id_priv->tos_set = false; + id_priv->timeout_set = false; id_priv->gid_type = IB_GID_TYPE_IB; spin_lock_init(&id_priv->lock); mutex_init(&id_priv->qp_mutex); @@ -1130,6 +1131,9 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, } else ret = -ENOSYS; + if ((*qp_attr_mask & IB_QP_TIMEOUT) && id_priv->timeout_set) + qp_attr->timeout = id_priv->timeout; + return ret; } EXPORT_SYMBOL(rdma_init_qp_attr); @@ -2490,6 +2494,34 @@ void rdma_set_service_type(struct rdma_cm_id *id, int tos) } EXPORT_SYMBOL(rdma_set_service_type); +/** + * rdma_set_ack_timeout() - Set the ack timeout of QP associated + * with a connection identifier. + * @id: Communication identifier to associated with service type. + * @timeout: Ack timeout to set a QP, expressed as 4.096 * 2^(timeout) usec. + * + * This function should be called before rdma_connect() on active side, + * and on passive side before rdma_accept(). It is applicable to primary + * path only. The timeout will affect the local side of the QP, it is not + * negotiated with remote side and zero disables the timer. + * + * Return: 0 for success + */ +int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout) +{ + struct rdma_id_private *id_priv; + + if (id->qp_type != IB_QPT_RC) + return -EINVAL; + + id_priv = container_of(id, struct rdma_id_private, id); + id_priv->timeout = timeout; + id_priv->timeout_set = true; + + return 0; +} +EXPORT_SYMBOL(rdma_set_ack_timeout); + static void cma_query_handler(int status, struct sa_path_rec *path_rec, void *context) { diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h index cf47c69436a7..ca7307277518 100644 --- a/drivers/infiniband/core/cma_priv.h +++ b/drivers/infiniband/core/cma_priv.h @@ -84,9 +84,11 @@ struct rdma_id_private { u32 options; u8 srq; u8 tos; - bool tos_set; + u8 tos_set:1; + u8 timeout_set:1; u8 reuseaddr; u8 afonly; + u8 timeout; enum ib_gid_type gid_type; /* diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 01d68ed46c1b..7468b26b8a01 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -1236,6 +1236,13 @@ static int ucma_set_option_id(struct ucma_context *ctx, int optname, } ret = rdma_set_afonly(ctx->cm_id, *((int *) optval) ? 1 : 0); break; + case RDMA_OPTION_ID_ACK_TIMEOUT: + if (optlen != sizeof(u8)) { + ret = -EINVAL; + break; + } + ret = rdma_set_ack_timeout(ctx->cm_id, *((u8 *)optval)); + break; default: ret = -ENOSYS; } diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 60987a5903b7..71f48cfdc24c 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -374,6 +374,7 @@ int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse); */ int rdma_set_afonly(struct rdma_cm_id *id, int afonly); +int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout); /** * rdma_get_service_id - Return the IB service ID for a specified address. * @id: Communication identifier associated with the address. diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h index 0d1e78ebad05..e42940a215a3 100644 --- a/include/uapi/rdma/rdma_user_cm.h +++ b/include/uapi/rdma/rdma_user_cm.h @@ -300,6 +300,10 @@ enum { RDMA_OPTION_ID_TOS = 0, RDMA_OPTION_ID_REUSEADDR = 1, RDMA_OPTION_ID_AFONLY = 2, + RDMA_OPTION_ID_ACK_TIMEOUT = 3 +}; + +enum { RDMA_OPTION_IB_PATH = 1 }; -- cgit v1.3-7-g2ca7 From c68cfb718c8f97b7f7a50ed66be5feb42d0c8988 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Fri, 8 Feb 2019 17:11:25 +0000 Subject: misc: fastrpc: Add support for context Invoke method This patch adds support to compute context invoke method on the remote processor (DSP). This involves setting up the functions input and output arguments, input and output handles and mapping the dmabuf fd for the argument/handle buffers. The below diagram depicts invocation of a single method where the client and objects reside on different processors. An object could expose multiple methods which can be grouped together and referred to as an interface. ,--------, ,------, ,-----------, ,------, ,--------, | | method | | | | | | method | | | Client |------->| Stub |->| Transport |->| Skel |------->| Object | | | | | | | | | | | `--------` `------` `-----------` `------` `--------` Client: Linux user mode process that initiates the remote invocation Stub: Auto generated code linked in with the user mode process that takes care of marshaling parameters Transport: Involved in carrying an invocation from a client to an object. This involves two portions: 1) FastRPC Linux kernel driver that receives the remote invocation, queues them up and then waits for the response after signaling the remote side. 2) Service running on the remote side that dequeues the messages from the queue and dispatches them for processing. Skel: Auto generated code that takes care of un-marshaling parameters Object: Method implementation Most of the work is derived from various downstream Qualcomm kernels. Credits to various Qualcomm authors who have contributed to this code. Specially Tharun Kumar Merugu Co-developed-by: Thierry Escande Signed-off-by: Thierry Escande Signed-off-by: Srinivas Kandagatla Signed-off-by: Greg Kroah-Hartman --- drivers/misc/fastrpc.c | 730 ++++++++++++++++++++++++++++++++++++++++++++ include/uapi/misc/fastrpc.h | 23 ++ 2 files changed, 753 insertions(+) create mode 100644 include/uapi/misc/fastrpc.h (limited to 'include/uapi') diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c index 10b93fd5659a..cd69f8b308f6 100644 --- a/drivers/misc/fastrpc.c +++ b/drivers/misc/fastrpc.c @@ -2,7 +2,9 @@ // Copyright (c) 2011-2018, The Linux Foundation. All rights reserved. // Copyright (c) 2018, Linaro Limited +#include #include +#include #include #include #include @@ -14,6 +16,7 @@ #include #include #include +#include #define ADSP_DOMAIN_ID (0) #define MDSP_DOMAIN_ID (1) @@ -21,14 +24,118 @@ #define CDSP_DOMAIN_ID (3) #define FASTRPC_DEV_MAX 4 /* adsp, mdsp, slpi, cdsp*/ #define FASTRPC_MAX_SESSIONS 9 /*8 compute, 1 cpz*/ +#define FASTRPC_ALIGN 128 +#define FASTRPC_MAX_FDLIST 16 +#define FASTRPC_MAX_CRCLIST 64 +#define FASTRPC_PHYS(p) ((p) & 0xffffffff) #define FASTRPC_CTX_MAX (256) #define FASTRPC_CTXID_MASK (0xFF0) #define FASTRPC_DEVICE_NAME "fastrpc" +/* Retrives number of input buffers from the scalars parameter */ +#define REMOTE_SCALARS_INBUFS(sc) (((sc) >> 16) & 0x0ff) + +/* Retrives number of output buffers from the scalars parameter */ +#define REMOTE_SCALARS_OUTBUFS(sc) (((sc) >> 8) & 0x0ff) + +/* Retrives number of input handles from the scalars parameter */ +#define REMOTE_SCALARS_INHANDLES(sc) (((sc) >> 4) & 0x0f) + +/* Retrives number of output handles from the scalars parameter */ +#define REMOTE_SCALARS_OUTHANDLES(sc) ((sc) & 0x0f) + +#define REMOTE_SCALARS_LENGTH(sc) (REMOTE_SCALARS_INBUFS(sc) + \ + REMOTE_SCALARS_OUTBUFS(sc) + \ + REMOTE_SCALARS_INHANDLES(sc)+ \ + REMOTE_SCALARS_OUTHANDLES(sc)) +#define FASTRPC_BUILD_SCALARS(attr, method, in, out, oin, oout) \ + (((attr & 0x07) << 29) | \ + ((method & 0x1f) << 24) | \ + ((in & 0xff) << 16) | \ + ((out & 0xff) << 8) | \ + ((oin & 0x0f) << 4) | \ + (oout & 0x0f)) + +#define FASTRPC_SCALARS(method, in, out) \ + FASTRPC_BUILD_SCALARS(0, method, in, out, 0, 0) + #define miscdev_to_cctx(d) container_of(d, struct fastrpc_channel_ctx, miscdev) static const char *domains[FASTRPC_DEV_MAX] = { "adsp", "mdsp", "sdsp", "cdsp"}; +struct fastrpc_phy_page { + u64 addr; /* physical address */ + u64 size; /* size of contiguous region */ +}; + +struct fastrpc_invoke_buf { + u32 num; /* number of contiguous regions */ + u32 pgidx; /* index to start of contiguous region */ +}; + +struct fastrpc_remote_arg { + u64 pv; + u64 len; +}; + +struct fastrpc_msg { + int pid; /* process group id */ + int tid; /* thread id */ + u64 ctx; /* invoke caller context */ + u32 handle; /* handle to invoke */ + u32 sc; /* scalars structure describing the data */ + u64 addr; /* physical address */ + u64 size; /* size of contiguous region */ +}; + +struct fastrpc_invoke_rsp { + u64 ctx; /* invoke caller context */ + int retval; /* invoke return value */ +}; + +struct fastrpc_buf { + struct fastrpc_user *fl; + struct device *dev; + void *virt; + u64 phys; + u64 size; +}; + +struct fastrpc_map { + struct list_head node; + struct fastrpc_user *fl; + int fd; + struct dma_buf *buf; + struct sg_table *table; + struct dma_buf_attachment *attach; + u64 phys; + u64 size; + void *va; + u64 len; + struct kref refcount; +}; + +struct fastrpc_invoke_ctx { + int nscalars; + int nbufs; + int retval; + int pid; + int tgid; + u32 sc; + u32 *crc; + u64 ctxid; + u64 msg_sz; + struct kref refcount; + struct list_head node; /* list of ctxs */ + struct completion work; + struct fastrpc_msg msg; + struct fastrpc_user *fl; + struct fastrpc_remote_arg *rpra; + struct fastrpc_map **maps; + struct fastrpc_buf *buf; + struct fastrpc_invoke_args *args; + struct fastrpc_channel_ctx *cctx; +}; struct fastrpc_session_ctx { struct device *dev; @@ -55,6 +162,7 @@ struct fastrpc_user { struct fastrpc_channel_ctx *cctx; struct fastrpc_session_ctx *sctx; + struct fastrpc_buf *init_mem; int tgid; int pd; @@ -64,6 +172,522 @@ struct fastrpc_user { struct mutex mutex; }; +static void fastrpc_free_map(struct kref *ref) +{ + struct fastrpc_map *map; + + map = container_of(ref, struct fastrpc_map, refcount); + + if (map->table) { + dma_buf_unmap_attachment(map->attach, map->table, + DMA_BIDIRECTIONAL); + dma_buf_detach(map->buf, map->attach); + dma_buf_put(map->buf); + } + + kfree(map); +} + +static void fastrpc_map_put(struct fastrpc_map *map) +{ + if (map) + kref_put(&map->refcount, fastrpc_free_map); +} + +static void fastrpc_map_get(struct fastrpc_map *map) +{ + if (map) + kref_get(&map->refcount); +} + +static int fastrpc_map_find(struct fastrpc_user *fl, int fd, + struct fastrpc_map **ppmap) +{ + struct fastrpc_map *map = NULL; + + mutex_lock(&fl->mutex); + list_for_each_entry(map, &fl->maps, node) { + if (map->fd == fd) { + fastrpc_map_get(map); + *ppmap = map; + mutex_unlock(&fl->mutex); + return 0; + } + } + mutex_unlock(&fl->mutex); + + return -ENOENT; +} + +static void fastrpc_buf_free(struct fastrpc_buf *buf) +{ + dma_free_coherent(buf->dev, buf->size, buf->virt, + FASTRPC_PHYS(buf->phys)); + kfree(buf); +} + +static int fastrpc_buf_alloc(struct fastrpc_user *fl, struct device *dev, + u64 size, struct fastrpc_buf **obuf) +{ + struct fastrpc_buf *buf; + + buf = kzalloc(sizeof(*buf), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + buf->fl = fl; + buf->virt = NULL; + buf->phys = 0; + buf->size = size; + buf->dev = dev; + + buf->virt = dma_alloc_coherent(dev, buf->size, (dma_addr_t *)&buf->phys, + GFP_KERNEL); + if (!buf->virt) + return -ENOMEM; + + if (fl->sctx && fl->sctx->sid) + buf->phys += ((u64)fl->sctx->sid << 32); + + *obuf = buf; + + return 0; +} + +static void fastrpc_context_free(struct kref *ref) +{ + struct fastrpc_invoke_ctx *ctx; + struct fastrpc_channel_ctx *cctx; + int i; + + ctx = container_of(ref, struct fastrpc_invoke_ctx, refcount); + cctx = ctx->cctx; + + for (i = 0; i < ctx->nscalars; i++) + fastrpc_map_put(ctx->maps[i]); + + if (ctx->buf) + fastrpc_buf_free(ctx->buf); + + spin_lock(&cctx->lock); + idr_remove(&cctx->ctx_idr, ctx->ctxid >> 4); + spin_unlock(&cctx->lock); + + kfree(ctx->maps); + kfree(ctx); +} + +static void fastrpc_context_get(struct fastrpc_invoke_ctx *ctx) +{ + kref_get(&ctx->refcount); +} + +static void fastrpc_context_put(struct fastrpc_invoke_ctx *ctx) +{ + kref_put(&ctx->refcount, fastrpc_context_free); +} + +static struct fastrpc_invoke_ctx *fastrpc_context_alloc( + struct fastrpc_user *user, u32 kernel, u32 sc, + struct fastrpc_invoke_args *args) +{ + struct fastrpc_channel_ctx *cctx = user->cctx; + struct fastrpc_invoke_ctx *ctx = NULL; + int ret; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&ctx->node); + ctx->fl = user; + ctx->nscalars = REMOTE_SCALARS_LENGTH(sc); + ctx->nbufs = REMOTE_SCALARS_INBUFS(sc) + + REMOTE_SCALARS_OUTBUFS(sc); + + if (ctx->nscalars) { + ctx->maps = kcalloc(ctx->nscalars, + sizeof(*ctx->maps), GFP_KERNEL); + if (!ctx->maps) { + kfree(ctx); + return ERR_PTR(-ENOMEM); + } + ctx->args = args; + } + + ctx->sc = sc; + ctx->retval = -1; + ctx->pid = current->pid; + ctx->tgid = user->tgid; + ctx->cctx = cctx; + init_completion(&ctx->work); + + spin_lock(&user->lock); + list_add_tail(&ctx->node, &user->pending); + spin_unlock(&user->lock); + + spin_lock(&cctx->lock); + ret = idr_alloc_cyclic(&cctx->ctx_idr, ctx, 1, + FASTRPC_CTX_MAX, GFP_ATOMIC); + if (ret < 0) { + spin_unlock(&cctx->lock); + goto err_idr; + } + ctx->ctxid = ret << 4; + spin_unlock(&cctx->lock); + + kref_init(&ctx->refcount); + + return ctx; +err_idr: + spin_lock(&user->lock); + list_del(&ctx->node); + spin_unlock(&user->lock); + kfree(ctx->maps); + kfree(ctx); + + return ERR_PTR(ret); +} + +static int fastrpc_map_create(struct fastrpc_user *fl, int fd, + u64 len, struct fastrpc_map **ppmap) +{ + struct fastrpc_session_ctx *sess = fl->sctx; + struct fastrpc_map *map = NULL; + int err = 0; + + if (!fastrpc_map_find(fl, fd, ppmap)) + return 0; + + map = kzalloc(sizeof(*map), GFP_KERNEL); + if (!map) + return -ENOMEM; + + INIT_LIST_HEAD(&map->node); + map->fl = fl; + map->fd = fd; + map->buf = dma_buf_get(fd); + if (!map->buf) { + err = -EINVAL; + goto get_err; + } + + map->attach = dma_buf_attach(map->buf, sess->dev); + if (IS_ERR(map->attach)) { + dev_err(sess->dev, "Failed to attach dmabuf\n"); + err = PTR_ERR(map->attach); + goto attach_err; + } + + map->table = dma_buf_map_attachment(map->attach, DMA_BIDIRECTIONAL); + if (IS_ERR(map->table)) { + err = PTR_ERR(map->table); + goto map_err; + } + + map->phys = sg_dma_address(map->table->sgl); + map->phys += ((u64)fl->sctx->sid << 32); + map->size = len; + map->va = sg_virt(map->table->sgl); + map->len = len; + kref_init(&map->refcount); + + spin_lock(&fl->lock); + list_add_tail(&map->node, &fl->maps); + spin_unlock(&fl->lock); + *ppmap = map; + + return 0; + +map_err: + dma_buf_detach(map->buf, map->attach); +attach_err: + dma_buf_put(map->buf); +get_err: + kfree(map); + + return err; +} + +/* + * Fastrpc payload buffer with metadata looks like: + * + * >>>>>> START of METADATA <<<<<<<<< + * +---------------------------------+ + * | Arguments | + * | type:(struct fastrpc_remote_arg)| + * | (0 - N) | + * +---------------------------------+ + * | Invoke Buffer list | + * | type:(struct fastrpc_invoke_buf)| + * | (0 - N) | + * +---------------------------------+ + * | Page info list | + * | type:(struct fastrpc_phy_page) | + * | (0 - N) | + * +---------------------------------+ + * | Optional info | + * |(can be specific to SoC/Firmware)| + * +---------------------------------+ + * >>>>>>>> END of METADATA <<<<<<<<< + * +---------------------------------+ + * | Inline ARGS | + * | (0-N) | + * +---------------------------------+ + */ + +static int fastrpc_get_meta_size(struct fastrpc_invoke_ctx *ctx) +{ + int size = 0; + + size = (sizeof(struct fastrpc_remote_arg) + + sizeof(struct fastrpc_invoke_buf) + + sizeof(struct fastrpc_phy_page)) * ctx->nscalars + + sizeof(u64) * FASTRPC_MAX_FDLIST + + sizeof(u32) * FASTRPC_MAX_CRCLIST; + + return size; +} + +static u64 fastrpc_get_payload_size(struct fastrpc_invoke_ctx *ctx, int metalen) +{ + u64 size = 0; + int i; + + size = ALIGN(metalen, FASTRPC_ALIGN); + for (i = 0; i < ctx->nscalars; i++) { + if (ctx->args[i].fd == 0 || ctx->args[i].fd == -1) { + size = ALIGN(size, FASTRPC_ALIGN); + size += ctx->args[i].length; + } + } + + return size; +} + +static int fastrpc_create_maps(struct fastrpc_invoke_ctx *ctx) +{ + struct device *dev = ctx->fl->sctx->dev; + int i, err; + + for (i = 0; i < ctx->nscalars; ++i) { + /* Make sure reserved field is set to 0 */ + if (ctx->args[i].reserved) + return -EINVAL; + + if (ctx->args[i].fd == 0 || ctx->args[i].fd == -1 || + ctx->args[i].length == 0) + continue; + + err = fastrpc_map_create(ctx->fl, ctx->args[i].fd, + ctx->args[i].length, &ctx->maps[i]); + if (err) { + dev_err(dev, "Error Creating map %d\n", err); + return -EINVAL; + } + + } + return 0; +} + +static int fastrpc_get_args(u32 kernel, struct fastrpc_invoke_ctx *ctx) +{ + struct device *dev = ctx->fl->sctx->dev; + struct fastrpc_remote_arg *rpra; + struct fastrpc_invoke_buf *list; + struct fastrpc_phy_page *pages; + int inbufs, i, err = 0; + u64 rlen, pkt_size; + uintptr_t args; + int metalen; + + + inbufs = REMOTE_SCALARS_INBUFS(ctx->sc); + metalen = fastrpc_get_meta_size(ctx); + pkt_size = fastrpc_get_payload_size(ctx, metalen); + + err = fastrpc_create_maps(ctx); + if (err) + return err; + + ctx->msg_sz = pkt_size; + + err = fastrpc_buf_alloc(ctx->fl, dev, pkt_size, &ctx->buf); + if (err) + return err; + + rpra = ctx->buf->virt; + list = ctx->buf->virt + ctx->nscalars * sizeof(*rpra); + pages = ctx->buf->virt + ctx->nscalars * (sizeof(*list) + + sizeof(*rpra)); + args = (uintptr_t)ctx->buf->virt + metalen; + rlen = pkt_size - metalen; + ctx->rpra = rpra; + + for (i = 0; i < ctx->nbufs; ++i) { + u64 len = ctx->args[i].length; + + rpra[i].pv = 0; + rpra[i].len = len; + list[i].num = len ? 1 : 0; + list[i].pgidx = i; + + if (!len) + continue; + + pages[i].size = roundup(len, PAGE_SIZE); + + if (ctx->maps[i]) { + rpra[i].pv = (u64) ctx->args[i].ptr; + pages[i].addr = ctx->maps[i]->phys; + } else { + rlen -= ALIGN(args, FASTRPC_ALIGN) - args; + args = ALIGN(args, FASTRPC_ALIGN); + if (rlen < len) + goto bail; + + rpra[i].pv = args; + pages[i].addr = ctx->buf->phys + (pkt_size - rlen); + pages[i].addr = pages[i].addr & PAGE_MASK; + args = args + len; + rlen -= len; + } + + if (i < inbufs && !ctx->maps[i]) { + void *dst = (void *)(uintptr_t)rpra[i].pv; + void *src = (void *)(uintptr_t)ctx->args[i].ptr; + + if (!kernel) { + if (copy_from_user(dst, (void __user *)src, + len)) { + err = -EFAULT; + goto bail; + } + } else { + memcpy(dst, src, len); + } + } + } + + for (i = ctx->nbufs; i < ctx->nscalars; ++i) { + rpra[i].pv = (u64) ctx->args[i].ptr; + rpra[i].len = ctx->args[i].length; + list[i].num = ctx->args[i].length ? 1 : 0; + list[i].pgidx = i; + pages[i].addr = ctx->maps[i]->phys; + pages[i].size = ctx->maps[i]->size; + } + +bail: + if (err) + dev_err(dev, "Error: get invoke args failed:%d\n", err); + + return err; +} + +static int fastrpc_put_args(struct fastrpc_invoke_ctx *ctx, + u32 kernel) +{ + struct fastrpc_remote_arg *rpra = ctx->rpra; + int i, inbufs; + + inbufs = REMOTE_SCALARS_INBUFS(ctx->sc); + + for (i = inbufs; i < ctx->nbufs; ++i) { + void *src = (void *)(uintptr_t)rpra[i].pv; + void *dst = (void *)(uintptr_t)ctx->args[i].ptr; + u64 len = rpra[i].len; + + if (!kernel) { + if (copy_to_user((void __user *)dst, src, len)) + return -EFAULT; + } else { + memcpy(dst, src, len); + } + } + + return 0; +} + +static int fastrpc_invoke_send(struct fastrpc_session_ctx *sctx, + struct fastrpc_invoke_ctx *ctx, + u32 kernel, uint32_t handle) +{ + struct fastrpc_channel_ctx *cctx; + struct fastrpc_user *fl = ctx->fl; + struct fastrpc_msg *msg = &ctx->msg; + + cctx = fl->cctx; + msg->pid = fl->tgid; + msg->tid = current->pid; + + if (kernel) + msg->pid = 0; + + msg->ctx = ctx->ctxid | fl->pd; + msg->handle = handle; + msg->sc = ctx->sc; + msg->addr = ctx->buf ? ctx->buf->phys : 0; + msg->size = roundup(ctx->msg_sz, PAGE_SIZE); + fastrpc_context_get(ctx); + + return rpmsg_send(cctx->rpdev->ept, (void *)msg, sizeof(*msg)); +} + +static int fastrpc_internal_invoke(struct fastrpc_user *fl, u32 kernel, + u32 handle, u32 sc, + struct fastrpc_invoke_args *args) +{ + struct fastrpc_invoke_ctx *ctx = NULL; + int err = 0; + + if (!fl->sctx) + return -EINVAL; + + ctx = fastrpc_context_alloc(fl, kernel, sc, args); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + if (ctx->nscalars) { + err = fastrpc_get_args(kernel, ctx); + if (err) + goto bail; + } + /* Send invoke buffer to remote dsp */ + err = fastrpc_invoke_send(fl->sctx, ctx, kernel, handle); + if (err) + goto bail; + + /* Wait for remote dsp to respond or time out */ + err = wait_for_completion_interruptible(&ctx->work); + if (err) + goto bail; + + /* Check the response from remote dsp */ + err = ctx->retval; + if (err) + goto bail; + + if (ctx->nscalars) { + /* populate all the output buffers with results */ + err = fastrpc_put_args(ctx, kernel); + if (err) + goto bail; + } + +bail: + /* We are done with this compute context, remove it from pending list */ + spin_lock(&fl->lock); + list_del(&ctx->node); + spin_unlock(&fl->lock); + fastrpc_context_put(ctx); + + if (err) + dev_dbg(fl->sctx->dev, "Error: Invoke Failed %d\n", err); + + return err; +} + static struct fastrpc_session_ctx *fastrpc_session_alloc( struct fastrpc_channel_ctx *cctx) { @@ -95,11 +719,26 @@ static int fastrpc_device_release(struct inode *inode, struct file *file) { struct fastrpc_user *fl = (struct fastrpc_user *)file->private_data; struct fastrpc_channel_ctx *cctx = fl->cctx; + struct fastrpc_invoke_ctx *ctx, *n; + struct fastrpc_map *map, *m; spin_lock(&cctx->lock); list_del(&fl->user); spin_unlock(&cctx->lock); + if (fl->init_mem) + fastrpc_buf_free(fl->init_mem); + + list_for_each_entry_safe(ctx, n, &fl->pending, node) { + list_del(&ctx->node); + fastrpc_context_put(ctx); + } + + list_for_each_entry_safe(map, m, &fl->maps, node) { + list_del(&map->node); + fastrpc_map_put(map); + } + fastrpc_session_free(cctx, fl->sctx); mutex_destroy(&fl->mutex); @@ -134,9 +773,60 @@ static int fastrpc_device_open(struct inode *inode, struct file *filp) return 0; } +static int fastrpc_invoke(struct fastrpc_user *fl, char __user *argp) +{ + struct fastrpc_invoke_args *args = NULL; + struct fastrpc_invoke inv; + u32 nscalars; + int err; + + if (copy_from_user(&inv, argp, sizeof(inv))) + return -EFAULT; + + /* nscalars is truncated here to max supported value */ + nscalars = REMOTE_SCALARS_LENGTH(inv.sc); + if (nscalars) { + args = kcalloc(nscalars, sizeof(*args), GFP_KERNEL); + if (!args) + return -ENOMEM; + + if (copy_from_user(args, (void __user *)(uintptr_t)inv.args, + nscalars * sizeof(*args))) { + kfree(args); + return -EFAULT; + } + } + + err = fastrpc_internal_invoke(fl, false, inv.handle, inv.sc, args); + kfree(args); + + return err; +} + +static long fastrpc_device_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct fastrpc_user *fl = (struct fastrpc_user *)file->private_data; + char __user *argp = (char __user *)arg; + int err; + + switch (cmd) { + case FASTRPC_IOCTL_INVOKE: + err = fastrpc_invoke(fl, argp); + break; + default: + err = -ENOTTY; + break; + } + + return err; +} + static const struct file_operations fastrpc_fops = { .open = fastrpc_device_open, .release = fastrpc_device_release, + .unlocked_ioctl = fastrpc_device_ioctl, + .compat_ioctl = fastrpc_device_ioctl, }; static int fastrpc_cb_probe(struct platform_device *pdev) @@ -260,9 +950,25 @@ static int fastrpc_rpmsg_probe(struct rpmsg_device *rpdev) return of_platform_populate(rdev->of_node, NULL, NULL, rdev); } +static void fastrpc_notify_users(struct fastrpc_user *user) +{ + struct fastrpc_invoke_ctx *ctx; + + spin_lock(&user->lock); + list_for_each_entry(ctx, &user->pending, node) + complete(&ctx->work); + spin_unlock(&user->lock); +} + static void fastrpc_rpmsg_remove(struct rpmsg_device *rpdev) { struct fastrpc_channel_ctx *cctx = dev_get_drvdata(&rpdev->dev); + struct fastrpc_user *user; + + spin_lock(&cctx->lock); + list_for_each_entry(user, &cctx->users, user) + fastrpc_notify_users(user); + spin_unlock(&cctx->lock); misc_deregister(&cctx->miscdev); of_platform_depopulate(&rpdev->dev); @@ -272,6 +978,30 @@ static void fastrpc_rpmsg_remove(struct rpmsg_device *rpdev) static int fastrpc_rpmsg_callback(struct rpmsg_device *rpdev, void *data, int len, void *priv, u32 addr) { + struct fastrpc_channel_ctx *cctx = dev_get_drvdata(&rpdev->dev); + struct fastrpc_invoke_rsp *rsp = data; + struct fastrpc_invoke_ctx *ctx; + unsigned long flags; + unsigned long ctxid; + + if (len < sizeof(*rsp)) + return -EINVAL; + + ctxid = ((rsp->ctx & FASTRPC_CTXID_MASK) >> 4); + + spin_lock_irqsave(&cctx->lock, flags); + ctx = idr_find(&cctx->ctx_idr, ctxid); + spin_unlock_irqrestore(&cctx->lock, flags); + + if (!ctx) { + dev_err(&rpdev->dev, "No context ID matches response\n"); + return -ENOENT; + } + + ctx->retval = rsp->retval; + complete(&ctx->work); + fastrpc_context_put(ctx); + return 0; } diff --git a/include/uapi/misc/fastrpc.h b/include/uapi/misc/fastrpc.h new file mode 100644 index 000000000000..a69ef33dc37e --- /dev/null +++ b/include/uapi/misc/fastrpc.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __QCOM_FASTRPC_H__ +#define __QCOM_FASTRPC_H__ + +#include + +#define FASTRPC_IOCTL_INVOKE _IOWR('R', 3, struct fastrpc_invoke) + +struct fastrpc_invoke_args { + __u64 ptr; + __u64 length; + __s32 fd; + __u32 reserved; +}; + +struct fastrpc_invoke { + __u32 handle; + __u32 sc; + __u64 args; +}; + +#endif /* __QCOM_FASTRPC_H__ */ -- cgit v1.3-7-g2ca7 From d73f71c7c6ee1583c08c214c8f7b20d841490b36 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Fri, 8 Feb 2019 17:11:26 +0000 Subject: misc: fastrpc: Add support for create remote init process This patch adds support to create or attach remote shell process. The shell process called fastrpc_shell_0 is usually loaded on the DSP when a user process is spawned. Most of the work is derived from various downstream Qualcomm kernels. Credits to various Qualcomm authors who have contributed to this code. Specially Tharun Kumar Merugu Co-developed-by: Thierry Escande Signed-off-by: Thierry Escande Signed-off-by: Srinivas Kandagatla Signed-off-by: Greg Kroah-Hartman --- drivers/misc/fastrpc.c | 156 ++++++++++++++++++++++++++++++++++++++++++++ include/uapi/misc/fastrpc.h | 10 +++ 2 files changed, 166 insertions(+) (limited to 'include/uapi') diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c index cd69f8b308f6..ceb498487569 100644 --- a/drivers/misc/fastrpc.c +++ b/drivers/misc/fastrpc.c @@ -29,7 +29,10 @@ #define FASTRPC_MAX_CRCLIST 64 #define FASTRPC_PHYS(p) ((p) & 0xffffffff) #define FASTRPC_CTX_MAX (256) +#define FASTRPC_INIT_HANDLE 1 #define FASTRPC_CTXID_MASK (0xFF0) +#define INIT_FILELEN_MAX (2 * 1024 * 1024) +#define INIT_MEMLEN_MAX (8 * 1024 * 1024) #define FASTRPC_DEVICE_NAME "fastrpc" /* Retrives number of input buffers from the scalars parameter */ @@ -59,6 +62,14 @@ #define FASTRPC_SCALARS(method, in, out) \ FASTRPC_BUILD_SCALARS(0, method, in, out, 0, 0) +#define FASTRPC_CREATE_PROCESS_NARGS 6 +/* Remote Method id table */ +#define FASTRPC_RMID_INIT_ATTACH 0 +#define FASTRPC_RMID_INIT_RELEASE 1 +#define FASTRPC_RMID_INIT_CREATE 6 +#define FASTRPC_RMID_INIT_CREATE_ATTR 7 +#define FASTRPC_RMID_INIT_CREATE_STATIC 8 + #define miscdev_to_cctx(d) container_of(d, struct fastrpc_channel_ctx, miscdev) static const char *domains[FASTRPC_DEV_MAX] = { "adsp", "mdsp", @@ -688,6 +699,109 @@ bail: return err; } +static int fastrpc_init_create_process(struct fastrpc_user *fl, + char __user *argp) +{ + struct fastrpc_init_create init; + struct fastrpc_invoke_args *args; + struct fastrpc_phy_page pages[1]; + struct fastrpc_map *map = NULL; + struct fastrpc_buf *imem = NULL; + int memlen; + int err; + struct { + int pgid; + u32 namelen; + u32 filelen; + u32 pageslen; + u32 attrs; + u32 siglen; + } inbuf; + u32 sc; + + args = kcalloc(FASTRPC_CREATE_PROCESS_NARGS, sizeof(*args), GFP_KERNEL); + if (!args) + return -ENOMEM; + + if (copy_from_user(&init, argp, sizeof(init))) { + err = -EFAULT; + goto bail; + } + + if (init.filelen > INIT_FILELEN_MAX) { + err = -EINVAL; + goto bail; + } + + inbuf.pgid = fl->tgid; + inbuf.namelen = strlen(current->comm) + 1; + inbuf.filelen = init.filelen; + inbuf.pageslen = 1; + inbuf.attrs = init.attrs; + inbuf.siglen = init.siglen; + fl->pd = 1; + + if (init.filelen && init.filefd) { + err = fastrpc_map_create(fl, init.filefd, init.filelen, &map); + if (err) + goto bail; + } + + memlen = ALIGN(max(INIT_FILELEN_MAX, (int)init.filelen * 4), + 1024 * 1024); + err = fastrpc_buf_alloc(fl, fl->sctx->dev, memlen, + &imem); + if (err) { + fastrpc_map_put(map); + goto bail; + } + + fl->init_mem = imem; + args[0].ptr = (u64)(uintptr_t)&inbuf; + args[0].length = sizeof(inbuf); + args[0].fd = -1; + + args[1].ptr = (u64)(uintptr_t)current->comm; + args[1].length = inbuf.namelen; + args[1].fd = -1; + + args[2].ptr = (u64) init.file; + args[2].length = inbuf.filelen; + args[2].fd = init.filefd; + + pages[0].addr = imem->phys; + pages[0].size = imem->size; + + args[3].ptr = (u64)(uintptr_t) pages; + args[3].length = 1 * sizeof(*pages); + args[3].fd = -1; + + args[4].ptr = (u64)(uintptr_t)&inbuf.attrs; + args[4].length = sizeof(inbuf.attrs); + args[4].fd = -1; + + args[5].ptr = (u64)(uintptr_t) &inbuf.siglen; + args[5].length = sizeof(inbuf.siglen); + args[5].fd = -1; + + sc = FASTRPC_SCALARS(FASTRPC_RMID_INIT_CREATE, 4, 0); + if (init.attrs) + sc = FASTRPC_SCALARS(FASTRPC_RMID_INIT_CREATE_ATTR, 6, 0); + + err = fastrpc_internal_invoke(fl, true, FASTRPC_INIT_HANDLE, + sc, args); + + if (err) { + fastrpc_map_put(map); + fastrpc_buf_free(imem); + } + +bail: + kfree(args); + + return err; +} + static struct fastrpc_session_ctx *fastrpc_session_alloc( struct fastrpc_channel_ctx *cctx) { @@ -715,6 +829,23 @@ static void fastrpc_session_free(struct fastrpc_channel_ctx *cctx, spin_unlock(&cctx->lock); } +static int fastrpc_release_current_dsp_process(struct fastrpc_user *fl) +{ + struct fastrpc_invoke_args args[1]; + int tgid = 0; + u32 sc; + + tgid = fl->tgid; + args[0].ptr = (u64)(uintptr_t) &tgid; + args[0].length = sizeof(tgid); + args[0].fd = -1; + args[0].reserved = 0; + sc = FASTRPC_SCALARS(FASTRPC_RMID_INIT_RELEASE, 1, 0); + + return fastrpc_internal_invoke(fl, true, FASTRPC_INIT_HANDLE, + sc, &args[0]); +} + static int fastrpc_device_release(struct inode *inode, struct file *file) { struct fastrpc_user *fl = (struct fastrpc_user *)file->private_data; @@ -722,6 +853,8 @@ static int fastrpc_device_release(struct inode *inode, struct file *file) struct fastrpc_invoke_ctx *ctx, *n; struct fastrpc_map *map, *m; + fastrpc_release_current_dsp_process(fl); + spin_lock(&cctx->lock); list_del(&fl->user); spin_unlock(&cctx->lock); @@ -773,6 +906,23 @@ static int fastrpc_device_open(struct inode *inode, struct file *filp) return 0; } +static int fastrpc_init_attach(struct fastrpc_user *fl) +{ + struct fastrpc_invoke_args args[1]; + int tgid = fl->tgid; + u32 sc; + + args[0].ptr = (u64)(uintptr_t) &tgid; + args[0].length = sizeof(tgid); + args[0].fd = -1; + args[0].reserved = 0; + sc = FASTRPC_SCALARS(FASTRPC_RMID_INIT_ATTACH, 1, 0); + fl->pd = 0; + + return fastrpc_internal_invoke(fl, true, FASTRPC_INIT_HANDLE, + sc, &args[0]); +} + static int fastrpc_invoke(struct fastrpc_user *fl, char __user *argp) { struct fastrpc_invoke_args *args = NULL; @@ -814,6 +964,12 @@ static long fastrpc_device_ioctl(struct file *file, unsigned int cmd, case FASTRPC_IOCTL_INVOKE: err = fastrpc_invoke(fl, argp); break; + case FASTRPC_IOCTL_INIT_ATTACH: + err = fastrpc_init_attach(fl); + break; + case FASTRPC_IOCTL_INIT_CREATE: + err = fastrpc_init_create_process(fl, argp); + break; default: err = -ENOTTY; break; diff --git a/include/uapi/misc/fastrpc.h b/include/uapi/misc/fastrpc.h index a69ef33dc37e..32d191c3b7bc 100644 --- a/include/uapi/misc/fastrpc.h +++ b/include/uapi/misc/fastrpc.h @@ -6,6 +6,8 @@ #include #define FASTRPC_IOCTL_INVOKE _IOWR('R', 3, struct fastrpc_invoke) +#define FASTRPC_IOCTL_INIT_ATTACH _IO('R', 4) +#define FASTRPC_IOCTL_INIT_CREATE _IOWR('R', 5, struct fastrpc_init_create) struct fastrpc_invoke_args { __u64 ptr; @@ -20,4 +22,12 @@ struct fastrpc_invoke { __u64 args; }; +struct fastrpc_init_create { + __u32 filelen; /* elf file length */ + __s32 filefd; /* fd for the file */ + __u32 attrs; + __u32 siglen; + __u64 file; /* pointer to elf file */ +}; + #endif /* __QCOM_FASTRPC_H__ */ -- cgit v1.3-7-g2ca7 From 6cffd79504ce040f460831030d3069fa1c99bb71 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Fri, 8 Feb 2019 17:11:27 +0000 Subject: misc: fastrpc: Add support for dmabuf exporter User process can involve dealing with big buffer sizes, and also passing buffers from one compute context bank to other compute context bank for complex dsp algorithms. This patch adds support to fastrpc to make it a proper dmabuf exporter to avoid making copies of buffers. Co-developed-by: Thierry Escande Signed-off-by: Thierry Escande Signed-off-by: Srinivas Kandagatla Signed-off-by: Greg Kroah-Hartman --- drivers/misc/fastrpc.c | 184 ++++++++++++++++++++++++++++++++++++++++++++ include/uapi/misc/fastrpc.h | 8 ++ 2 files changed, 192 insertions(+) (limited to 'include/uapi') diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c index ceb498487569..4b0db33896df 100644 --- a/drivers/misc/fastrpc.c +++ b/drivers/misc/fastrpc.c @@ -106,10 +106,20 @@ struct fastrpc_invoke_rsp { struct fastrpc_buf { struct fastrpc_user *fl; + struct dma_buf *dmabuf; struct device *dev; void *virt; u64 phys; u64 size; + /* Lock for dma buf attachments */ + struct mutex lock; + struct list_head attachments; +}; + +struct fastrpc_dma_buf_attachment { + struct device *dev; + struct sg_table sgt; + struct list_head node; }; struct fastrpc_map { @@ -246,6 +256,9 @@ static int fastrpc_buf_alloc(struct fastrpc_user *fl, struct device *dev, if (!buf) return -ENOMEM; + INIT_LIST_HEAD(&buf->attachments); + mutex_init(&buf->lock); + buf->fl = fl; buf->virt = NULL; buf->phys = 0; @@ -360,6 +373,111 @@ err_idr: return ERR_PTR(ret); } +static struct sg_table * +fastrpc_map_dma_buf(struct dma_buf_attachment *attachment, + enum dma_data_direction dir) +{ + struct fastrpc_dma_buf_attachment *a = attachment->priv; + struct sg_table *table; + + table = &a->sgt; + + if (!dma_map_sg(attachment->dev, table->sgl, table->nents, dir)) + return ERR_PTR(-ENOMEM); + + return table; +} + +static void fastrpc_unmap_dma_buf(struct dma_buf_attachment *attach, + struct sg_table *table, + enum dma_data_direction dir) +{ + dma_unmap_sg(attach->dev, table->sgl, table->nents, dir); +} + +static void fastrpc_release(struct dma_buf *dmabuf) +{ + struct fastrpc_buf *buffer = dmabuf->priv; + + fastrpc_buf_free(buffer); +} + +static int fastrpc_dma_buf_attach(struct dma_buf *dmabuf, + struct dma_buf_attachment *attachment) +{ + struct fastrpc_dma_buf_attachment *a; + struct fastrpc_buf *buffer = dmabuf->priv; + int ret; + + a = kzalloc(sizeof(*a), GFP_KERNEL); + if (!a) + return -ENOMEM; + + ret = dma_get_sgtable(buffer->dev, &a->sgt, buffer->virt, + FASTRPC_PHYS(buffer->phys), buffer->size); + if (ret < 0) { + dev_err(buffer->dev, "failed to get scatterlist from DMA API\n"); + return -EINVAL; + } + + a->dev = attachment->dev; + INIT_LIST_HEAD(&a->node); + attachment->priv = a; + + mutex_lock(&buffer->lock); + list_add(&a->node, &buffer->attachments); + mutex_unlock(&buffer->lock); + + return 0; +} + +static void fastrpc_dma_buf_detatch(struct dma_buf *dmabuf, + struct dma_buf_attachment *attachment) +{ + struct fastrpc_dma_buf_attachment *a = attachment->priv; + struct fastrpc_buf *buffer = dmabuf->priv; + + mutex_lock(&buffer->lock); + list_del(&a->node); + mutex_unlock(&buffer->lock); + kfree(a); +} + +static void *fastrpc_kmap(struct dma_buf *dmabuf, unsigned long pgnum) +{ + struct fastrpc_buf *buf = dmabuf->priv; + + return buf->virt ? buf->virt + pgnum * PAGE_SIZE : NULL; +} + +static void *fastrpc_vmap(struct dma_buf *dmabuf) +{ + struct fastrpc_buf *buf = dmabuf->priv; + + return buf->virt; +} + +static int fastrpc_mmap(struct dma_buf *dmabuf, + struct vm_area_struct *vma) +{ + struct fastrpc_buf *buf = dmabuf->priv; + size_t size = vma->vm_end - vma->vm_start; + + return dma_mmap_coherent(buf->dev, vma, buf->virt, + FASTRPC_PHYS(buf->phys), size); +} + +static const struct dma_buf_ops fastrpc_dma_buf_ops = { + .attach = fastrpc_dma_buf_attach, + .detach = fastrpc_dma_buf_detatch, + .map_dma_buf = fastrpc_map_dma_buf, + .unmap_dma_buf = fastrpc_unmap_dma_buf, + .mmap = fastrpc_mmap, + .map = fastrpc_kmap, + .vmap = fastrpc_vmap, + .release = fastrpc_release, +}; + static int fastrpc_map_create(struct fastrpc_user *fl, int fd, u64 len, struct fastrpc_map **ppmap) { @@ -906,6 +1024,66 @@ static int fastrpc_device_open(struct inode *inode, struct file *filp) return 0; } +static int fastrpc_dmabuf_free(struct fastrpc_user *fl, char __user *argp) +{ + struct dma_buf *buf; + int info; + + if (copy_from_user(&info, argp, sizeof(info))) + return -EFAULT; + + buf = dma_buf_get(info); + if (IS_ERR_OR_NULL(buf)) + return -EINVAL; + /* + * one for the last get and other for the ALLOC_DMA_BUFF ioctl + */ + dma_buf_put(buf); + dma_buf_put(buf); + + return 0; +} + +static int fastrpc_dmabuf_alloc(struct fastrpc_user *fl, char __user *argp) +{ + struct fastrpc_alloc_dma_buf bp; + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + struct fastrpc_buf *buf = NULL; + int err; + + if (copy_from_user(&bp, argp, sizeof(bp))) + return -EFAULT; + + err = fastrpc_buf_alloc(fl, fl->sctx->dev, bp.size, &buf); + if (err) + return err; + exp_info.ops = &fastrpc_dma_buf_ops; + exp_info.size = bp.size; + exp_info.flags = O_RDWR; + exp_info.priv = buf; + buf->dmabuf = dma_buf_export(&exp_info); + if (IS_ERR(buf->dmabuf)) { + err = PTR_ERR(buf->dmabuf); + fastrpc_buf_free(buf); + return err; + } + + bp.fd = dma_buf_fd(buf->dmabuf, O_ACCMODE); + if (bp.fd < 0) { + dma_buf_put(buf->dmabuf); + return -EINVAL; + } + + if (copy_to_user(argp, &bp, sizeof(bp))) { + dma_buf_put(buf->dmabuf); + return -EFAULT; + } + + get_dma_buf(buf->dmabuf); + + return 0; +} + static int fastrpc_init_attach(struct fastrpc_user *fl) { struct fastrpc_invoke_args args[1]; @@ -970,6 +1148,12 @@ static long fastrpc_device_ioctl(struct file *file, unsigned int cmd, case FASTRPC_IOCTL_INIT_CREATE: err = fastrpc_init_create_process(fl, argp); break; + case FASTRPC_IOCTL_FREE_DMA_BUFF: + err = fastrpc_dmabuf_free(fl, argp); + break; + case FASTRPC_IOCTL_ALLOC_DMA_BUFF: + err = fastrpc_dmabuf_alloc(fl, argp); + break; default: err = -ENOTTY; break; diff --git a/include/uapi/misc/fastrpc.h b/include/uapi/misc/fastrpc.h index 32d191c3b7bc..6d701af9fc42 100644 --- a/include/uapi/misc/fastrpc.h +++ b/include/uapi/misc/fastrpc.h @@ -5,6 +5,8 @@ #include +#define FASTRPC_IOCTL_ALLOC_DMA_BUFF _IOWR('R', 1, struct fastrpc_alloc_dma_buf) +#define FASTRPC_IOCTL_FREE_DMA_BUFF _IOWR('R', 2, __u32) #define FASTRPC_IOCTL_INVOKE _IOWR('R', 3, struct fastrpc_invoke) #define FASTRPC_IOCTL_INIT_ATTACH _IO('R', 4) #define FASTRPC_IOCTL_INIT_CREATE _IOWR('R', 5, struct fastrpc_init_create) @@ -30,4 +32,10 @@ struct fastrpc_init_create { __u64 file; /* pointer to elf file */ }; +struct fastrpc_alloc_dma_buf { + __s32 fd; /* fd */ + __u32 flags; /* flags to map with */ + __u64 size; /* size */ +}; + #endif /* __QCOM_FASTRPC_H__ */ -- cgit v1.3-7-g2ca7 From d9a9ea94f748f47b1d75c6c5e33edcf74476c445 Mon Sep 17 00:00:00 2001 From: Chad Austin Date: Mon, 7 Jan 2019 16:53:17 -0800 Subject: fuse: support clients that don't implement 'opendir' Allow filesystems to return ENOSYS from opendir, preventing the kernel from sending opendir and releasedir messages in the future. This avoids userspace transitions when filesystems don't need to keep track of state per directory handle. A new capability flag, FUSE_NO_OPENDIR_SUPPORT, parallels FUSE_NO_OPEN_SUPPORT, indicating the new semantics for returning ENOSYS from opendir. Signed-off-by: Chad Austin Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 11 +++++++---- fs/fuse/fuse_i.h | 3 +++ fs/fuse/inode.c | 3 ++- include/uapi/linux/fuse.h | 7 ++++++- 4 files changed, 18 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 8ee0446a8322..cc6ffd23b80f 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -90,7 +90,7 @@ static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir) if (refcount_dec_and_test(&ff->count)) { struct fuse_req *req = ff->reserved_req; - if (ff->fc->no_open && !isdir) { + if (isdir ? ff->fc->no_opendir : ff->fc->no_open) { /* * Drop the release request when client does not * implement 'open' @@ -125,7 +125,7 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, ff->fh = 0; ff->open_flags = FOPEN_KEEP_CACHE; /* Default for no-open */ - if (!fc->no_open || isdir) { + if (isdir ? !fc->no_opendir : !fc->no_open) { struct fuse_open_out outarg; int err; @@ -134,11 +134,14 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, ff->fh = outarg.fh; ff->open_flags = outarg.open_flags; - } else if (err != -ENOSYS || isdir) { + } else if (err != -ENOSYS) { fuse_file_free(ff); return err; } else { - fc->no_open = 1; + if (isdir) + fc->no_opendir = 1; + else + fc->no_open = 1; } } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 033e30af519f..0920c0c032a0 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -630,6 +630,9 @@ struct fuse_conn { /** Is open/release not implemented by fs? */ unsigned no_open:1; + /** Is opendir/releasedir not implemented by fs? */ + unsigned no_opendir:1; + /** Is fsync not implemented by fs? */ unsigned no_fsync:1; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 2bbb7c59d6da..1b3f3b67d9f0 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -972,7 +972,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT | FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | - FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS; + FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | + FUSE_NO_OPENDIR_SUPPORT; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index b4967d48bfda..2ac598614a8f 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -122,6 +122,9 @@ * - add FOPEN_CACHE_DIR * - add FUSE_MAX_PAGES, add max_pages to init_out * - add FUSE_CACHE_SYMLINKS + * + * 7.29 + * - add FUSE_NO_OPENDIR_SUPPORT flag */ #ifndef _LINUX_FUSE_H @@ -157,7 +160,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 28 +#define FUSE_KERNEL_MINOR_VERSION 29 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -259,6 +262,7 @@ struct fuse_file_lock { * FUSE_ABORT_ERROR: reading the device after abort returns ECONNABORTED * FUSE_MAX_PAGES: init_out.max_pages contains the max number of req pages * FUSE_CACHE_SYMLINKS: cache READLINK responses + * FUSE_NO_OPENDIR_SUPPORT: kernel supports zero-message opendir */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -284,6 +288,7 @@ struct fuse_file_lock { #define FUSE_ABORT_ERROR (1 << 21) #define FUSE_MAX_PAGES (1 << 22) #define FUSE_CACHE_SYMLINKS (1 << 23) +#define FUSE_NO_OPENDIR_SUPPORT (1 << 24) /** * CUSE INIT request/reply flags -- cgit v1.3-7-g2ca7 From b5bb37eddb63b16b7ab959598d108b1c444be77d Mon Sep 17 00:00:00 2001 From: Bas Nieuwenhuizen Date: Wed, 30 Jan 2019 02:53:22 +0100 Subject: drm/amdgpu: Add command to override the context priority. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Given a master fd we can then override the priority of the context in another fd. Using these overrides was recommended by Christian instead of trying to submit from a master fd, and I am adding a way to override a single context instead of the entire process so we can only upgrade a single Vulkan queue and not effectively the entire process. Reused the flags field as it was checked to be 0 anyways, so nothing used it. This is source-incompatible (due to the name change), but ABI compatible. Signed-off-by: Bas Nieuwenhuizen Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c | 41 ++++++++++++++++++++++++++++++- include/uapi/drm/amdgpu_drm.h | 3 ++- 2 files changed, 42 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c index 0b70410488b6..0767a93e4d91 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c @@ -76,6 +76,39 @@ static int amdgpu_sched_process_priority_override(struct amdgpu_device *adev, return 0; } +static int amdgpu_sched_context_priority_override(struct amdgpu_device *adev, + int fd, + unsigned ctx_id, + enum drm_sched_priority priority) +{ + struct file *filp = fget(fd); + struct amdgpu_fpriv *fpriv; + struct amdgpu_ctx *ctx; + int r; + + if (!filp) + return -EINVAL; + + r = amdgpu_file_to_fpriv(filp, &fpriv); + if (r) { + fput(filp); + return r; + } + + ctx = amdgpu_ctx_get(fpriv, ctx_id); + + if (!ctx) { + fput(filp); + return -EINVAL; + } + + amdgpu_ctx_priority_override(ctx, priority); + amdgpu_ctx_put(ctx); + fput(filp); + + return 0; +} + int amdgpu_sched_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) { @@ -85,7 +118,7 @@ int amdgpu_sched_ioctl(struct drm_device *dev, void *data, int r; priority = amdgpu_to_sched_priority(args->in.priority); - if (args->in.flags || priority == DRM_SCHED_PRIORITY_INVALID) + if (priority == DRM_SCHED_PRIORITY_INVALID) return -EINVAL; switch (args->in.op) { @@ -94,6 +127,12 @@ int amdgpu_sched_ioctl(struct drm_device *dev, void *data, args->in.fd, priority); break; + case AMDGPU_SCHED_OP_CONTEXT_PRIORITY_OVERRIDE: + r = amdgpu_sched_context_priority_override(adev, + args->in.fd, + args->in.ctx_id, + priority); + break; default: DRM_ERROR("Invalid sched op specified: %d\n", args->in.op); r = -EINVAL; diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index 2acbc8bc465b..4a53f6cfa034 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -272,13 +272,14 @@ union drm_amdgpu_vm { /* sched ioctl */ #define AMDGPU_SCHED_OP_PROCESS_PRIORITY_OVERRIDE 1 +#define AMDGPU_SCHED_OP_CONTEXT_PRIORITY_OVERRIDE 2 struct drm_amdgpu_sched_in { /* AMDGPU_SCHED_OP_* */ __u32 op; __u32 fd; __s32 priority; - __u32 flags; + __u32 ctx_id; }; union drm_amdgpu_sched { -- cgit v1.3-7-g2ca7 From 99b9d7b4970cf131fd17a8f4ad4870049bd7a365 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Sat, 16 Feb 2019 00:39:13 +0200 Subject: habanalabs: add basic Goya support This patch adds a basic support for the Goya device. The code initializes the device's PCI controller and PCI bars. It also initializes various S/W structures and adds some basic helper functions. Reviewed-by: Mike Rapoport Signed-off-by: Oded Gabbay Signed-off-by: Greg Kroah-Hartman --- drivers/misc/habanalabs/Makefile | 3 + drivers/misc/habanalabs/device.c | 71 ++++ drivers/misc/habanalabs/goya/Makefile | 3 + drivers/misc/habanalabs/goya/goya.c | 632 ++++++++++++++++++++++++++++ drivers/misc/habanalabs/goya/goyaP.h | 152 +++++++ drivers/misc/habanalabs/habanalabs.h | 137 ++++++ drivers/misc/habanalabs/habanalabs_drv.c | 3 + drivers/misc/habanalabs/include/goya/goya.h | 45 ++ include/uapi/misc/habanalabs.h | 20 + 9 files changed, 1066 insertions(+) create mode 100644 drivers/misc/habanalabs/goya/Makefile create mode 100644 drivers/misc/habanalabs/goya/goya.c create mode 100644 drivers/misc/habanalabs/goya/goyaP.h create mode 100644 drivers/misc/habanalabs/include/goya/goya.h create mode 100644 include/uapi/misc/habanalabs.h (limited to 'include/uapi') diff --git a/drivers/misc/habanalabs/Makefile b/drivers/misc/habanalabs/Makefile index 0910f7fa34ec..6f1ead69bd77 100644 --- a/drivers/misc/habanalabs/Makefile +++ b/drivers/misc/habanalabs/Makefile @@ -5,3 +5,6 @@ obj-m := habanalabs.o habanalabs-y := habanalabs_drv.o device.o + +include $(src)/goya/Makefile +habanalabs-y += $(HL_GOYA_FILES) diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c index 6189fd0e2ccd..a4feaa784db3 100644 --- a/drivers/misc/habanalabs/device.c +++ b/drivers/misc/habanalabs/device.c @@ -120,8 +120,11 @@ err_cdev_add: */ static int device_early_init(struct hl_device *hdev) { + int rc; + switch (hdev->asic_type) { case ASIC_GOYA: + goya_set_asic_funcs(hdev); strlcpy(hdev->asic_name, "GOYA", sizeof(hdev->asic_name)); break; default: @@ -130,6 +133,10 @@ static int device_early_init(struct hl_device *hdev) return -EINVAL; } + rc = hdev->asic_funcs->early_init(hdev); + if (rc) + return rc; + return 0; } @@ -141,6 +148,10 @@ static int device_early_init(struct hl_device *hdev) */ static void device_early_fini(struct hl_device *hdev) { + + if (hdev->asic_funcs->early_fini) + hdev->asic_funcs->early_fini(hdev); + } /* @@ -154,8 +165,15 @@ static void device_early_fini(struct hl_device *hdev) */ int hl_device_suspend(struct hl_device *hdev) { + int rc; + pci_save_state(hdev->pdev); + rc = hdev->asic_funcs->suspend(hdev); + if (rc) + dev_err(hdev->dev, + "Failed to disable PCI access of device CPU\n"); + /* Shut down the device */ pci_disable_device(hdev->pdev); pci_set_power_state(hdev->pdev, PCI_D3hot); @@ -185,6 +203,13 @@ int hl_device_resume(struct hl_device *hdev) return rc; } + rc = hdev->asic_funcs->resume(hdev); + if (rc) { + dev_err(hdev->dev, + "Failed to enable PCI access from device CPU\n"); + return rc; + } + return 0; } @@ -212,11 +237,21 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass) if (rc) goto release_device; + /* + * Start calling ASIC initialization. First S/W then H/W and finally + * late init + */ + rc = hdev->asic_funcs->sw_init(hdev); + if (rc) + goto early_fini; + dev_notice(hdev->dev, "Successfully added device to habanalabs driver\n"); return 0; +early_fini: + device_early_fini(hdev); release_device: device_destroy(hclass, hdev->dev->devt); cdev_del(&hdev->cdev); @@ -247,6 +282,9 @@ void hl_device_fini(struct hl_device *hdev) /* Mark device as disabled */ hdev->disabled = true; + /* Call ASIC S/W finalize function */ + hdev->asic_funcs->sw_fini(hdev); + device_early_fini(hdev); /* Hide device from user */ @@ -338,3 +376,36 @@ int hl_poll_timeout_device_memory(struct hl_device *hdev, void __iomem *addr, return *val ? 0 : -ETIMEDOUT; } + +/* + * MMIO register access helper functions. + */ + +/* + * hl_rreg - Read an MMIO register + * + * @hdev: pointer to habanalabs device structure + * @reg: MMIO register offset (in bytes) + * + * Returns the value of the MMIO register we are asked to read + * + */ +inline u32 hl_rreg(struct hl_device *hdev, u32 reg) +{ + return readl(hdev->rmmio + reg); +} + +/* + * hl_wreg - Write to an MMIO register + * + * @hdev: pointer to habanalabs device structure + * @reg: MMIO register offset (in bytes) + * @val: 32-bit value + * + * Writes the 32-bit value into the MMIO register + * + */ +inline void hl_wreg(struct hl_device *hdev, u32 reg, u32 val) +{ + writel(val, hdev->rmmio + reg); +} diff --git a/drivers/misc/habanalabs/goya/Makefile b/drivers/misc/habanalabs/goya/Makefile new file mode 100644 index 000000000000..38d43006386d --- /dev/null +++ b/drivers/misc/habanalabs/goya/Makefile @@ -0,0 +1,3 @@ +subdir-ccflags-y += -I$(src) + +HL_GOYA_FILES := goya/goya.o diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c new file mode 100644 index 000000000000..ceaffa9afd83 --- /dev/null +++ b/drivers/misc/habanalabs/goya/goya.c @@ -0,0 +1,632 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2016-2019 HabanaLabs, Ltd. + * All Rights Reserved. + */ + +#include "goyaP.h" +#include "include/goya/asic_reg/goya_masks.h" + +#include +#include +#include + +/* + * GOYA security scheme: + * + * 1. Host is protected by: + * - Range registers (When MMU is enabled, DMA RR does NOT protect host) + * - MMU + * + * 2. DRAM is protected by: + * - Range registers (protect the first 512MB) + * - MMU (isolation between users) + * + * 3. Configuration is protected by: + * - Range registers + * - Protection bits + * + * When MMU is disabled: + * + * QMAN DMA: PQ, CQ, CP, DMA are secured. + * PQ, CB and the data are on the host. + * + * QMAN TPC/MME: + * PQ, CQ and CP are not secured. + * PQ, CB and the data are on the SRAM/DRAM. + * + * Since QMAN DMA is secured, KMD is parsing the DMA CB: + * - KMD checks DMA pointer + * - WREG, MSG_PROT are not allowed. + * - MSG_LONG/SHORT are allowed. + * + * A read/write transaction by the QMAN to a protected area will succeed if + * and only if the QMAN's CP is secured and MSG_PROT is used + * + * + * When MMU is enabled: + * + * QMAN DMA: PQ, CQ and CP are secured. + * MMU is set to bypass on the Secure props register of the QMAN. + * The reasons we don't enable MMU for PQ, CQ and CP are: + * - PQ entry is in kernel address space and KMD doesn't map it. + * - CP writes to MSIX register and to kernel address space (completion + * queue). + * + * DMA is not secured but because CP is secured, KMD still needs to parse the + * CB, but doesn't need to check the DMA addresses. + * + * For QMAN DMA 0, DMA is also secured because only KMD uses this DMA and KMD + * doesn't map memory in MMU. + * + * QMAN TPC/MME: PQ, CQ and CP aren't secured (no change from MMU disabled mode) + * + * DMA RR does NOT protect host because DMA is not secured + * + */ + +#define GOYA_MMU_REGS_NUM 61 + +#define GOYA_DMA_POOL_BLK_SIZE 0x100 /* 256 bytes */ + +#define GOYA_RESET_TIMEOUT_MSEC 500 /* 500ms */ +#define GOYA_PLDM_RESET_TIMEOUT_MSEC 20000 /* 20s */ +#define GOYA_RESET_WAIT_MSEC 1 /* 1ms */ +#define GOYA_CPU_RESET_WAIT_MSEC 100 /* 100ms */ +#define GOYA_PLDM_RESET_WAIT_MSEC 1000 /* 1s */ +#define GOYA_CPU_TIMEOUT_USEC 10000000 /* 10s */ +#define GOYA_TEST_QUEUE_WAIT_USEC 100000 /* 100ms */ + +#define GOYA_QMAN0_FENCE_VAL 0xD169B243 + +#define GOYA_MAX_INITIATORS 20 + +static void goya_get_fixed_properties(struct hl_device *hdev) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + + prop->completion_queues_count = NUMBER_OF_CMPLT_QUEUES; + + prop->dram_base_address = DRAM_PHYS_BASE; + prop->dram_size = DRAM_PHYS_DEFAULT_SIZE; + prop->dram_end_address = prop->dram_base_address + prop->dram_size; + prop->dram_user_base_address = DRAM_BASE_ADDR_USER; + + prop->sram_base_address = SRAM_BASE_ADDR; + prop->sram_size = SRAM_SIZE; + prop->sram_end_address = prop->sram_base_address + prop->sram_size; + prop->sram_user_base_address = prop->sram_base_address + + SRAM_USER_BASE_OFFSET; + + prop->host_phys_base_address = HOST_PHYS_BASE; + prop->va_space_host_start_address = VA_HOST_SPACE_START; + prop->va_space_host_end_address = VA_HOST_SPACE_END; + prop->va_space_dram_start_address = VA_DDR_SPACE_START; + prop->va_space_dram_end_address = VA_DDR_SPACE_END; + prop->cfg_size = CFG_SIZE; + prop->max_asid = MAX_ASID; + prop->tpc_enabled_mask = TPC_ENABLED_MASK; + + prop->high_pll = PLL_HIGH_DEFAULT; +} + +/* + * goya_pci_bars_map - Map PCI BARS of Goya device + * + * @hdev: pointer to hl_device structure + * + * Request PCI regions and map them to kernel virtual addresses. + * Returns 0 on success + * + */ +int goya_pci_bars_map(struct hl_device *hdev) +{ + struct pci_dev *pdev = hdev->pdev; + int rc; + + rc = pci_request_regions(pdev, HL_NAME); + if (rc) { + dev_err(hdev->dev, "Cannot obtain PCI resources\n"); + return rc; + } + + hdev->pcie_bar[SRAM_CFG_BAR_ID] = + pci_ioremap_bar(pdev, SRAM_CFG_BAR_ID); + if (!hdev->pcie_bar[SRAM_CFG_BAR_ID]) { + dev_err(hdev->dev, "pci_ioremap_bar failed for CFG\n"); + rc = -ENODEV; + goto err_release_regions; + } + + hdev->pcie_bar[MSIX_BAR_ID] = pci_ioremap_bar(pdev, MSIX_BAR_ID); + if (!hdev->pcie_bar[MSIX_BAR_ID]) { + dev_err(hdev->dev, "pci_ioremap_bar failed for MSIX\n"); + rc = -ENODEV; + goto err_unmap_sram_cfg; + } + + hdev->pcie_bar[DDR_BAR_ID] = pci_ioremap_wc_bar(pdev, DDR_BAR_ID); + if (!hdev->pcie_bar[DDR_BAR_ID]) { + dev_err(hdev->dev, "pci_ioremap_bar failed for DDR\n"); + rc = -ENODEV; + goto err_unmap_msix; + } + + hdev->rmmio = hdev->pcie_bar[SRAM_CFG_BAR_ID] + + (CFG_BASE - SRAM_BASE_ADDR); + + return 0; + +err_unmap_msix: + iounmap(hdev->pcie_bar[MSIX_BAR_ID]); +err_unmap_sram_cfg: + iounmap(hdev->pcie_bar[SRAM_CFG_BAR_ID]); +err_release_regions: + pci_release_regions(pdev); + + return rc; +} + +/* + * goya_pci_bars_unmap - Unmap PCI BARS of Goya device + * + * @hdev: pointer to hl_device structure + * + * Release all PCI BARS and unmap their virtual addresses + * + */ +static void goya_pci_bars_unmap(struct hl_device *hdev) +{ + struct pci_dev *pdev = hdev->pdev; + + iounmap(hdev->pcie_bar[DDR_BAR_ID]); + iounmap(hdev->pcie_bar[MSIX_BAR_ID]); + iounmap(hdev->pcie_bar[SRAM_CFG_BAR_ID]); + pci_release_regions(pdev); +} + +/* + * goya_elbi_write - Write through the ELBI interface + * + * @hdev: pointer to hl_device structure + * + * return 0 on success, -1 on failure + * + */ +static int goya_elbi_write(struct hl_device *hdev, u64 addr, u32 data) +{ + struct pci_dev *pdev = hdev->pdev; + ktime_t timeout; + u32 val; + + /* Clear previous status */ + pci_write_config_dword(pdev, mmPCI_CONFIG_ELBI_STS, 0); + + pci_write_config_dword(pdev, mmPCI_CONFIG_ELBI_ADDR, (u32) addr); + pci_write_config_dword(pdev, mmPCI_CONFIG_ELBI_DATA, data); + pci_write_config_dword(pdev, mmPCI_CONFIG_ELBI_CTRL, + PCI_CONFIG_ELBI_CTRL_WRITE); + + timeout = ktime_add_ms(ktime_get(), 10); + for (;;) { + pci_read_config_dword(pdev, mmPCI_CONFIG_ELBI_STS, &val); + if (val & PCI_CONFIG_ELBI_STS_MASK) + break; + if (ktime_compare(ktime_get(), timeout) > 0) { + pci_read_config_dword(pdev, mmPCI_CONFIG_ELBI_STS, + &val); + break; + } + usleep_range(300, 500); + } + + if ((val & PCI_CONFIG_ELBI_STS_MASK) == PCI_CONFIG_ELBI_STS_DONE) + return 0; + + if (val & PCI_CONFIG_ELBI_STS_ERR) { + dev_err(hdev->dev, "Error writing to ELBI\n"); + return -EIO; + } + + if (!(val & PCI_CONFIG_ELBI_STS_MASK)) { + dev_err(hdev->dev, "ELBI write didn't finish in time\n"); + return -EIO; + } + + dev_err(hdev->dev, "ELBI write has undefined bits in status\n"); + return -EIO; +} + +/* + * goya_iatu_write - iatu write routine + * + * @hdev: pointer to hl_device structure + * + */ +static int goya_iatu_write(struct hl_device *hdev, u32 addr, u32 data) +{ + u32 dbi_offset; + int rc; + + dbi_offset = addr & 0xFFF; + + rc = goya_elbi_write(hdev, CFG_BASE + mmPCIE_AUX_DBI, 0x00300000); + rc |= goya_elbi_write(hdev, mmPCIE_DBI_BASE + dbi_offset, data); + + if (rc) + return -EIO; + + return 0; +} + +void goya_reset_link_through_bridge(struct hl_device *hdev) +{ + struct pci_dev *pdev = hdev->pdev; + struct pci_dev *parent_port; + u16 val; + + parent_port = pdev->bus->self; + pci_read_config_word(parent_port, PCI_BRIDGE_CONTROL, &val); + val |= PCI_BRIDGE_CTL_BUS_RESET; + pci_write_config_word(parent_port, PCI_BRIDGE_CONTROL, val); + ssleep(1); + + val &= ~(PCI_BRIDGE_CTL_BUS_RESET); + pci_write_config_word(parent_port, PCI_BRIDGE_CONTROL, val); + ssleep(3); +} + +/* + * goya_set_ddr_bar_base - set DDR bar to map specific device address + * + * @hdev: pointer to hl_device structure + * @addr: address in DDR. Must be aligned to DDR bar size + * + * This function configures the iATU so that the DDR bar will start at the + * specified addr. + * + */ +static int goya_set_ddr_bar_base(struct hl_device *hdev, u64 addr) +{ + struct goya_device *goya = hdev->asic_specific; + int rc; + + if ((goya) && (goya->ddr_bar_cur_addr == addr)) + return 0; + + /* Inbound Region 1 - Bar 4 - Point to DDR */ + rc = goya_iatu_write(hdev, 0x314, lower_32_bits(addr)); + rc |= goya_iatu_write(hdev, 0x318, upper_32_bits(addr)); + rc |= goya_iatu_write(hdev, 0x300, 0); + /* Enable + Bar match + match enable + Bar 4 */ + rc |= goya_iatu_write(hdev, 0x304, 0xC0080400); + + /* Return the DBI window to the default location */ + rc |= goya_elbi_write(hdev, CFG_BASE + mmPCIE_AUX_DBI, 0); + rc |= goya_elbi_write(hdev, CFG_BASE + mmPCIE_AUX_DBI_32, 0); + + if (rc) { + dev_err(hdev->dev, "failed to map DDR bar to 0x%08llx\n", addr); + return -EIO; + } + + if (goya) + goya->ddr_bar_cur_addr = addr; + + return 0; +} + +/* + * goya_init_iatu - Initialize the iATU unit inside the PCI controller + * + * @hdev: pointer to hl_device structure + * + * This is needed in case the firmware doesn't initialize the iATU + * + */ +static int goya_init_iatu(struct hl_device *hdev) +{ + int rc; + + /* Inbound Region 0 - Bar 0 - Point to SRAM_BASE_ADDR */ + rc = goya_iatu_write(hdev, 0x114, lower_32_bits(SRAM_BASE_ADDR)); + rc |= goya_iatu_write(hdev, 0x118, upper_32_bits(SRAM_BASE_ADDR)); + rc |= goya_iatu_write(hdev, 0x100, 0); + /* Enable + Bar match + match enable */ + rc |= goya_iatu_write(hdev, 0x104, 0xC0080000); + + /* Inbound Region 1 - Bar 4 - Point to DDR */ + rc |= goya_set_ddr_bar_base(hdev, DRAM_PHYS_BASE); + + /* Outbound Region 0 - Point to Host */ + rc |= goya_iatu_write(hdev, 0x008, lower_32_bits(HOST_PHYS_BASE)); + rc |= goya_iatu_write(hdev, 0x00C, upper_32_bits(HOST_PHYS_BASE)); + rc |= goya_iatu_write(hdev, 0x010, + lower_32_bits(HOST_PHYS_BASE + HOST_PHYS_SIZE - 1)); + rc |= goya_iatu_write(hdev, 0x014, 0); + rc |= goya_iatu_write(hdev, 0x018, 0); + rc |= goya_iatu_write(hdev, 0x020, + upper_32_bits(HOST_PHYS_BASE + HOST_PHYS_SIZE - 1)); + /* Increase region size */ + rc |= goya_iatu_write(hdev, 0x000, 0x00002000); + /* Enable */ + rc |= goya_iatu_write(hdev, 0x004, 0x80000000); + + /* Return the DBI window to the default location */ + rc |= goya_elbi_write(hdev, CFG_BASE + mmPCIE_AUX_DBI, 0); + rc |= goya_elbi_write(hdev, CFG_BASE + mmPCIE_AUX_DBI_32, 0); + + if (rc) + return -EIO; + + return 0; +} + +/* + * goya_early_init - GOYA early initialization code + * + * @hdev: pointer to hl_device structure + * + * Verify PCI bars + * Set DMA masks + * PCI controller initialization + * Map PCI bars + * + */ +static int goya_early_init(struct hl_device *hdev) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct pci_dev *pdev = hdev->pdev; + u32 val; + int rc; + + goya_get_fixed_properties(hdev); + + /* Check BAR sizes */ + if (pci_resource_len(pdev, SRAM_CFG_BAR_ID) != CFG_BAR_SIZE) { + dev_err(hdev->dev, + "Not " HL_NAME "? BAR %d size %llu, expecting %llu\n", + SRAM_CFG_BAR_ID, + (unsigned long long) pci_resource_len(pdev, + SRAM_CFG_BAR_ID), + CFG_BAR_SIZE); + return -ENODEV; + } + + if (pci_resource_len(pdev, MSIX_BAR_ID) != MSIX_BAR_SIZE) { + dev_err(hdev->dev, + "Not " HL_NAME "? BAR %d size %llu, expecting %llu\n", + MSIX_BAR_ID, + (unsigned long long) pci_resource_len(pdev, + MSIX_BAR_ID), + MSIX_BAR_SIZE); + return -ENODEV; + } + + prop->dram_pci_bar_size = pci_resource_len(pdev, DDR_BAR_ID); + + /* set DMA mask for GOYA */ + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(39)); + if (rc) { + dev_warn(hdev->dev, "Unable to set pci dma mask to 39 bits\n"); + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (rc) { + dev_err(hdev->dev, + "Unable to set pci dma mask to 32 bits\n"); + return rc; + } + } + + rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(39)); + if (rc) { + dev_warn(hdev->dev, + "Unable to set pci consistent dma mask to 39 bits\n"); + rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + if (rc) { + dev_err(hdev->dev, + "Unable to set pci consistent dma mask to 32 bits\n"); + return rc; + } + } + + if (hdev->reset_pcilink) + goya_reset_link_through_bridge(hdev); + + rc = pci_enable_device_mem(pdev); + if (rc) { + dev_err(hdev->dev, "can't enable PCI device\n"); + return rc; + } + + pci_set_master(pdev); + + rc = goya_init_iatu(hdev); + if (rc) { + dev_err(hdev->dev, "Failed to initialize iATU\n"); + goto disable_device; + } + + rc = goya_pci_bars_map(hdev); + if (rc) { + dev_err(hdev->dev, "Failed to initialize PCI BARS\n"); + goto disable_device; + } + + val = RREG32(mmPSOC_GLOBAL_CONF_BOOT_STRAP_PINS); + if (val & PSOC_GLOBAL_CONF_BOOT_STRAP_PINS_SRIOV_EN_MASK) + dev_warn(hdev->dev, + "PCI strap is not configured correctly, PCI bus errors may occur\n"); + + return 0; + +disable_device: + pci_clear_master(pdev); + pci_disable_device(pdev); + + return rc; +} + +/* + * goya_early_fini - GOYA early finalization code + * + * @hdev: pointer to hl_device structure + * + * Unmap PCI bars + * + */ +int goya_early_fini(struct hl_device *hdev) +{ + goya_pci_bars_unmap(hdev); + + pci_clear_master(hdev->pdev); + pci_disable_device(hdev->pdev); + + return 0; +} + +/* + * goya_sw_init - Goya software initialization code + * + * @hdev: pointer to hl_device structure + * + */ +static int goya_sw_init(struct hl_device *hdev) +{ + struct goya_device *goya; + int rc; + + /* Allocate device structure */ + goya = kzalloc(sizeof(*goya), GFP_KERNEL); + if (!goya) + return -ENOMEM; + + /* according to goya_init_iatu */ + goya->ddr_bar_cur_addr = DRAM_PHYS_BASE; + hdev->asic_specific = goya; + + /* Create DMA pool for small allocations */ + hdev->dma_pool = dma_pool_create(dev_name(hdev->dev), + &hdev->pdev->dev, GOYA_DMA_POOL_BLK_SIZE, 8, 0); + if (!hdev->dma_pool) { + dev_err(hdev->dev, "failed to create DMA pool\n"); + rc = -ENOMEM; + goto free_goya_device; + } + + hdev->cpu_accessible_dma_mem = + hdev->asic_funcs->dma_alloc_coherent(hdev, + CPU_ACCESSIBLE_MEM_SIZE, + &hdev->cpu_accessible_dma_address, + GFP_KERNEL | __GFP_ZERO); + + if (!hdev->cpu_accessible_dma_mem) { + dev_err(hdev->dev, + "failed to allocate %d of dma memory for CPU accessible memory space\n", + CPU_ACCESSIBLE_MEM_SIZE); + rc = -ENOMEM; + goto free_dma_pool; + } + + hdev->cpu_accessible_dma_pool = gen_pool_create(CPU_PKT_SHIFT, -1); + if (!hdev->cpu_accessible_dma_pool) { + dev_err(hdev->dev, + "Failed to create CPU accessible DMA pool\n"); + rc = -ENOMEM; + goto free_cpu_pq_dma_mem; + } + + rc = gen_pool_add(hdev->cpu_accessible_dma_pool, + (uintptr_t) hdev->cpu_accessible_dma_mem, + CPU_ACCESSIBLE_MEM_SIZE, -1); + if (rc) { + dev_err(hdev->dev, + "Failed to add memory to CPU accessible DMA pool\n"); + rc = -EFAULT; + goto free_cpu_pq_pool; + } + + spin_lock_init(&goya->hw_queues_lock); + + return 0; + +free_cpu_pq_pool: + gen_pool_destroy(hdev->cpu_accessible_dma_pool); +free_cpu_pq_dma_mem: + hdev->asic_funcs->dma_free_coherent(hdev, CPU_ACCESSIBLE_MEM_SIZE, + hdev->cpu_accessible_dma_mem, + hdev->cpu_accessible_dma_address); +free_dma_pool: + dma_pool_destroy(hdev->dma_pool); +free_goya_device: + kfree(goya); + + return rc; +} + +/* + * goya_sw_fini - Goya software tear-down code + * + * @hdev: pointer to hl_device structure + * + */ +int goya_sw_fini(struct hl_device *hdev) +{ + struct goya_device *goya = hdev->asic_specific; + + gen_pool_destroy(hdev->cpu_accessible_dma_pool); + + hdev->asic_funcs->dma_free_coherent(hdev, CPU_ACCESSIBLE_MEM_SIZE, + hdev->cpu_accessible_dma_mem, + hdev->cpu_accessible_dma_address); + + dma_pool_destroy(hdev->dma_pool); + + kfree(goya); + + return 0; +} + +int goya_suspend(struct hl_device *hdev) +{ + return 0; +} + +int goya_resume(struct hl_device *hdev) +{ + return 0; +} + +void *goya_dma_alloc_coherent(struct hl_device *hdev, size_t size, + dma_addr_t *dma_handle, gfp_t flags) +{ + return dma_alloc_coherent(&hdev->pdev->dev, size, dma_handle, flags); +} + +void goya_dma_free_coherent(struct hl_device *hdev, size_t size, void *cpu_addr, + dma_addr_t dma_handle) +{ + dma_free_coherent(&hdev->pdev->dev, size, cpu_addr, dma_handle); +} + +static const struct hl_asic_funcs goya_funcs = { + .early_init = goya_early_init, + .early_fini = goya_early_fini, + .sw_init = goya_sw_init, + .sw_fini = goya_sw_fini, + .suspend = goya_suspend, + .resume = goya_resume, + .dma_alloc_coherent = goya_dma_alloc_coherent, + .dma_free_coherent = goya_dma_free_coherent, +}; + +/* + * goya_set_asic_funcs - set Goya function pointers + * + * @*hdev: pointer to hl_device structure + * + */ +void goya_set_asic_funcs(struct hl_device *hdev) +{ + hdev->asic_funcs = &goya_funcs; +} diff --git a/drivers/misc/habanalabs/goya/goyaP.h b/drivers/misc/habanalabs/goya/goyaP.h new file mode 100644 index 000000000000..6a78976e2098 --- /dev/null +++ b/drivers/misc/habanalabs/goya/goyaP.h @@ -0,0 +1,152 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright 2016-2019 HabanaLabs, Ltd. + * All Rights Reserved. + * + */ + +#ifndef GOYAP_H_ +#define GOYAP_H_ + +#include +#include "habanalabs.h" +#include "include/goya/goya.h" + +#define NUMBER_OF_CMPLT_QUEUES 5 +#define NUMBER_OF_EXT_HW_QUEUES 5 +#define NUMBER_OF_CPU_HW_QUEUES 1 +#define NUMBER_OF_INT_HW_QUEUES 9 +#define NUMBER_OF_HW_QUEUES (NUMBER_OF_EXT_HW_QUEUES + \ + NUMBER_OF_CPU_HW_QUEUES + \ + NUMBER_OF_INT_HW_QUEUES) + +/* + * Number of MSIX interrupts IDS: + * Each completion queue has 1 ID + * The event queue has 1 ID + */ +#define NUMBER_OF_INTERRUPTS (NUMBER_OF_CMPLT_QUEUES + 1) + +#if (NUMBER_OF_HW_QUEUES >= HL_MAX_QUEUES) +#error "Number of H/W queues must be smaller than HL_MAX_QUEUES" +#endif + +#if (NUMBER_OF_INTERRUPTS > GOYA_MSIX_ENTRIES) +#error "Number of MSIX interrupts must be smaller or equal to GOYA_MSIX_ENTRIES" +#endif + +#define QMAN_FENCE_TIMEOUT_USEC 10000 /* 10 ms */ + +#define QMAN_STOP_TIMEOUT_USEC 100000 /* 100 ms */ + +#define TPC_ENABLED_MASK 0xFF + +#define PLL_HIGH_DEFAULT 1575000000 /* 1.575 GHz */ + +#define GOYA_ARMCP_INFO_TIMEOUT 10000000 /* 10s */ + +#define DRAM_PHYS_DEFAULT_SIZE 0x100000000ull /* 4GB */ + +/* DRAM Memory Map */ + +#define CPU_FW_IMAGE_SIZE 0x10000000 /* 256MB */ +#define MMU_PAGE_TABLES_SIZE 0x0E000000 /* 224MB */ +#define MMU_CACHE_MNG_SIZE 0x00001000 /* 4KB */ +#define CPU_PQ_PKT_SIZE 0x00001000 /* 4KB */ +#define CPU_PQ_DATA_SIZE 0x01FFE000 /* 32MB - 8KB */ + +#define CPU_FW_IMAGE_ADDR DRAM_PHYS_BASE +#define MMU_PAGE_TABLES_ADDR (CPU_FW_IMAGE_ADDR + CPU_FW_IMAGE_SIZE) +#define MMU_CACHE_MNG_ADDR (MMU_PAGE_TABLES_ADDR + MMU_PAGE_TABLES_SIZE) +#define CPU_PQ_PKT_ADDR (MMU_CACHE_MNG_ADDR + MMU_CACHE_MNG_SIZE) +#define CPU_PQ_DATA_ADDR (CPU_PQ_PKT_ADDR + CPU_PQ_PKT_SIZE) +#define DRAM_BASE_ADDR_USER (CPU_PQ_DATA_ADDR + CPU_PQ_DATA_SIZE) + +#if (DRAM_BASE_ADDR_USER != 0x20000000) +#error "KMD must reserve 512MB" +#endif + +/* + * SRAM Memory Map for KMD + * + * KMD occupies KMD_SRAM_SIZE bytes from the start of SRAM. It is used for + * MME/TPC QMANs + * + */ + +#define MME_QMAN_BASE_OFFSET 0x000000 /* Must be 0 */ +#define MME_QMAN_LENGTH 64 +#define TPC_QMAN_LENGTH 64 + +#define TPC0_QMAN_BASE_OFFSET (MME_QMAN_BASE_OFFSET + \ + (MME_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE)) +#define TPC1_QMAN_BASE_OFFSET (TPC0_QMAN_BASE_OFFSET + \ + (TPC_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE)) +#define TPC2_QMAN_BASE_OFFSET (TPC1_QMAN_BASE_OFFSET + \ + (TPC_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE)) +#define TPC3_QMAN_BASE_OFFSET (TPC2_QMAN_BASE_OFFSET + \ + (TPC_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE)) +#define TPC4_QMAN_BASE_OFFSET (TPC3_QMAN_BASE_OFFSET + \ + (TPC_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE)) +#define TPC5_QMAN_BASE_OFFSET (TPC4_QMAN_BASE_OFFSET + \ + (TPC_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE)) +#define TPC6_QMAN_BASE_OFFSET (TPC5_QMAN_BASE_OFFSET + \ + (TPC_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE)) +#define TPC7_QMAN_BASE_OFFSET (TPC6_QMAN_BASE_OFFSET + \ + (TPC_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE)) + +#define SRAM_KMD_RES_OFFSET (TPC7_QMAN_BASE_OFFSET + \ + (TPC_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE)) + +#if (SRAM_KMD_RES_OFFSET >= GOYA_KMD_SRAM_RESERVED_SIZE_FROM_START) +#error "MME/TPC QMANs SRAM space exceeds limit" +#endif + +#define SRAM_USER_BASE_OFFSET GOYA_KMD_SRAM_RESERVED_SIZE_FROM_START + +/* Virtual address space */ +#define VA_HOST_SPACE_START 0x1000000000000ull /* 256TB */ +#define VA_HOST_SPACE_END 0x3FF8000000000ull /* 1PB - 1TB */ +#define VA_HOST_SPACE_SIZE (VA_HOST_SPACE_END - \ + VA_HOST_SPACE_START) /* 767TB */ + +#define VA_DDR_SPACE_START 0x800000000ull /* 32GB */ +#define VA_DDR_SPACE_END 0x2000000000ull /* 128GB */ +#define VA_DDR_SPACE_SIZE (VA_DDR_SPACE_END - \ + VA_DDR_SPACE_START) /* 128GB */ + +#define DMA_MAX_TRANSFER_SIZE 0xFFFFFFFF + +#define HW_CAP_PLL 0x00000001 +#define HW_CAP_DDR_0 0x00000002 +#define HW_CAP_DDR_1 0x00000004 +#define HW_CAP_MME 0x00000008 +#define HW_CAP_CPU 0x00000010 +#define HW_CAP_DMA 0x00000020 +#define HW_CAP_MSIX 0x00000040 +#define HW_CAP_CPU_Q 0x00000080 +#define HW_CAP_MMU 0x00000100 +#define HW_CAP_TPC_MBIST 0x00000200 +#define HW_CAP_GOLDEN 0x00000400 +#define HW_CAP_TPC 0x00000800 + +#define CPU_PKT_SHIFT 5 +#define CPU_PKT_SIZE (1 << CPU_PKT_SHIFT) +#define CPU_PKT_MASK (~((1 << CPU_PKT_SHIFT) - 1)) +#define CPU_MAX_PKTS_IN_CB 32 +#define CPU_CB_SIZE (CPU_PKT_SIZE * CPU_MAX_PKTS_IN_CB) +#define CPU_ACCESSIBLE_MEM_SIZE (HL_QUEUE_LENGTH * CPU_CB_SIZE) + +enum goya_fw_component { + FW_COMP_UBOOT, + FW_COMP_PREBOOT +}; + +struct goya_device { + /* TODO: remove hw_queues_lock after moving to scheduler code */ + spinlock_t hw_queues_lock; + u64 ddr_bar_cur_addr; + u32 hw_cap_initialized; +}; + +#endif /* GOYAP_H_ */ diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index fa628b05db13..5076e680a73c 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -14,9 +14,62 @@ #define HL_NAME "habanalabs" +#define HL_MAX_QUEUES 128 + struct hl_device; +/** + * struct asic_fixed_properties - ASIC specific immutable properties. + * @sram_base_address: SRAM physical start address. + * @sram_end_address: SRAM physical end address. + * @sram_user_base_address - SRAM physical start address for user access. + * @dram_base_address: DRAM physical start address. + * @dram_end_address: DRAM physical end address. + * @dram_user_base_address: DRAM physical start address for user access. + * @dram_size: DRAM total size. + * @dram_pci_bar_size: size of PCI bar towards DRAM. + * @host_phys_base_address: base physical address of host memory for + * transactions that the device generates. + * @va_space_host_start_address: base address of virtual memory range for + * mapping host memory. + * @va_space_host_end_address: end address of virtual memory range for + * mapping host memory. + * @va_space_dram_start_address: base address of virtual memory range for + * mapping DRAM memory. + * @va_space_dram_end_address: end address of virtual memory range for + * mapping DRAM memory. + * @cfg_size: configuration space size on SRAM. + * @sram_size: total size of SRAM. + * @max_asid: maximum number of open contexts (ASIDs). + * @completion_queues_count: number of completion queues. + * @high_pll: high PLL frequency used by the device. + * @tpc_enabled_mask: which TPCs are enabled. + */ +struct asic_fixed_properties { + u64 sram_base_address; + u64 sram_end_address; + u64 sram_user_base_address; + u64 dram_base_address; + u64 dram_end_address; + u64 dram_user_base_address; + u64 dram_size; + u64 dram_pci_bar_size; + u64 host_phys_base_address; + u64 va_space_host_start_address; + u64 va_space_host_end_address; + u64 va_space_dram_start_address; + u64 va_space_dram_end_address; + u32 cfg_size; + u32 sram_size; + u32 max_asid; + u32 high_pll; + u8 completion_queues_count; + u8 tpc_enabled_mask; +}; + + +#define HL_QUEUE_LENGTH 256 /* * ASICs */ @@ -33,6 +86,36 @@ enum hl_asic_type { ASIC_INVALID }; +/** + * struct hl_asic_funcs - ASIC specific functions that are can be called from + * common code. + * @early_init: sets up early driver state (pre sw_init), doesn't configure H/W. + * @early_fini: tears down what was done in early_init. + * @sw_init: sets up driver state, does not configure H/W. + * @sw_fini: tears down driver state, does not configure H/W. + * @suspend: handles IP specific H/W or SW changes for suspend. + * @resume: handles IP specific H/W or SW changes for resume. + * @dma_alloc_coherent: Allocate coherent DMA memory by calling + * dma_alloc_coherent(). This is ASIC function because its + * implementation is not trivial when the driver is loaded + * in simulation mode (not upstreamed). + * @dma_free_coherent: Free coherent DMA memory by calling dma_free_coherent(). + * This is ASIC function because its implementation is not + * trivial when the driver is loaded in simulation mode + * (not upstreamed). + */ +struct hl_asic_funcs { + int (*early_init)(struct hl_device *hdev); + int (*early_fini)(struct hl_device *hdev); + int (*sw_init)(struct hl_device *hdev); + int (*sw_fini)(struct hl_device *hdev); + int (*suspend)(struct hl_device *hdev); + int (*resume)(struct hl_device *hdev); + void* (*dma_alloc_coherent)(struct hl_device *hdev, size_t size, + dma_addr_t *dma_handle, gfp_t flag); + void (*dma_free_coherent)(struct hl_device *hdev, size_t size, + void *cpu_addr, dma_addr_t dma_handle); +}; /* * FILE PRIVATE STRUCTURE @@ -62,26 +145,78 @@ struct hl_fpriv { */ #define HL_MAX_MINORS 256 +/* + * Registers read & write functions. + */ + +u32 hl_rreg(struct hl_device *hdev, u32 reg); +void hl_wreg(struct hl_device *hdev, u32 reg, u32 val); + +#define hl_poll_timeout(hdev, addr, val, cond, sleep_us, timeout_us) \ + readl_poll_timeout(hdev->rmmio + addr, val, cond, sleep_us, timeout_us) + +#define RREG32(reg) hl_rreg(hdev, (reg)) +#define WREG32(reg, v) hl_wreg(hdev, (reg), (v)) +#define DREG32(reg) pr_info("REGISTER: " #reg " : 0x%08X\n", \ + hl_rreg(hdev, (reg))) + +#define WREG32_P(reg, val, mask) \ + do { \ + u32 tmp_ = RREG32(reg); \ + tmp_ &= (mask); \ + tmp_ |= ((val) & ~(mask)); \ + WREG32(reg, tmp_); \ + } while (0) +#define WREG32_AND(reg, and) WREG32_P(reg, 0, and) +#define WREG32_OR(reg, or) WREG32_P(reg, or, ~(or)) + +#define REG_FIELD_SHIFT(reg, field) reg##_##field##_SHIFT +#define REG_FIELD_MASK(reg, field) reg##_##field##_MASK +#define WREG32_FIELD(reg, field, val) \ + WREG32(mm##reg, (RREG32(mm##reg) & ~REG_FIELD_MASK(reg, field)) | \ + (val) << REG_FIELD_SHIFT(reg, field)) + /** * struct hl_device - habanalabs device structure. * @pdev: pointer to PCI device, can be NULL in case of simulator device. + * @pcie_bar: array of available PCIe bars. + * @rmmio: configuration area address on SRAM. * @cdev: related char device. * @dev: realted kernel basic device structure. * @asic_name: ASIC specific nmae. * @asic_type: ASIC specific type. + * @dma_pool: DMA pool for small allocations. + * @cpu_accessible_dma_mem: KMD <-> ArmCP shared memory CPU address. + * @cpu_accessible_dma_address: KMD <-> ArmCP shared memory DMA address. + * @cpu_accessible_dma_pool: KMD <-> ArmCP shared memory pool. + * @asic_prop: ASIC specific immutable properties. + * @asic_funcs: ASIC specific functions. + * @asic_specific: ASIC specific information to use only from ASIC files. * @major: habanalabs KMD major. * @id: device minor. * @disabled: is device disabled. */ struct hl_device { struct pci_dev *pdev; + void __iomem *pcie_bar[6]; + void __iomem *rmmio; struct cdev cdev; struct device *dev; char asic_name[16]; enum hl_asic_type asic_type; + struct dma_pool *dma_pool; + void *cpu_accessible_dma_mem; + dma_addr_t cpu_accessible_dma_address; + struct gen_pool *cpu_accessible_dma_pool; + struct asic_fixed_properties asic_prop; + const struct hl_asic_funcs *asic_funcs; + void *asic_specific; u32 major; u16 id; u8 disabled; + + /* Parameters for bring-up */ + u8 reset_pcilink; }; @@ -128,4 +263,6 @@ void hl_device_fini(struct hl_device *hdev); int hl_device_suspend(struct hl_device *hdev); int hl_device_resume(struct hl_device *hdev); +void goya_set_asic_funcs(struct hl_device *hdev); + #endif /* HABANALABSP_H_ */ diff --git a/drivers/misc/habanalabs/habanalabs_drv.c b/drivers/misc/habanalabs/habanalabs_drv.c index a5251ed277d1..edf626b2b7b1 100644 --- a/drivers/misc/habanalabs/habanalabs_drv.c +++ b/drivers/misc/habanalabs/habanalabs_drv.c @@ -122,6 +122,9 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev, hdev->major = hl_major; + /* Parameters for bring-up - set them to defaults */ + hdev->reset_pcilink = 0; + hdev->disabled = true; hdev->pdev = pdev; /* can be NULL in case of simulator device */ diff --git a/drivers/misc/habanalabs/include/goya/goya.h b/drivers/misc/habanalabs/include/goya/goya.h new file mode 100644 index 000000000000..614149efa412 --- /dev/null +++ b/drivers/misc/habanalabs/include/goya/goya.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright 2016-2019 HabanaLabs, Ltd. + * All Rights Reserved. + * + */ + +#ifndef GOYA_H +#define GOYA_H + +#include "asic_reg/goya_regs.h" + +#include + +#define SRAM_CFG_BAR_ID 0 +#define MSIX_BAR_ID 2 +#define DDR_BAR_ID 4 + +#define CFG_BAR_SIZE 0x10000000ull /* 256MB */ +#define MSIX_BAR_SIZE 0x1000ull /* 4KB */ + +#define CFG_BASE 0x7FFC000000ull +#define CFG_SIZE 0x4000000 /* 32MB CFG + 32MB DBG*/ + +#define SRAM_BASE_ADDR 0x7FF0000000ull +#define SRAM_SIZE 0x32A0000 /* 50.625MB */ + +#define DRAM_PHYS_BASE 0x0ull + +#define HOST_PHYS_BASE 0x8000000000ull /* 0.5TB */ +#define HOST_PHYS_SIZE 0x1000000000000ull /* 0.25PB (48 bits) */ + +#define GOYA_MSIX_ENTRIES 8 + +#define QMAN_PQ_ENTRY_SIZE 16 /* Bytes */ + +#define MAX_ASID 1024 + +#define PROT_BITS_OFFS 0xF80 + +#define DMA_MAX_NUM 5 + +#define TPC_MAX_NUM 8 + +#endif /* GOYA_H */ diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h new file mode 100644 index 000000000000..a0ec23adf8f5 --- /dev/null +++ b/include/uapi/misc/habanalabs.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note + * + * Copyright 2016-2018 HabanaLabs, Ltd. + * All Rights Reserved. + * + */ + +#ifndef HABANALABS_H_ +#define HABANALABS_H_ + +#include +#include + +/* + * Defines that are asic-specific but constitutes as ABI between kernel driver + * and userspace + */ +#define GOYA_KMD_SRAM_RESERVED_SIZE_FROM_START 0x8000 /* 32KB */ + +#endif /* HABANALABS_H_ */ -- cgit v1.3-7-g2ca7 From be5d926b5c10430671ae975b80efb7a5652e3f9a Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Sat, 16 Feb 2019 00:39:15 +0200 Subject: habanalabs: add command buffer module This patch adds the command buffer (CB) module, which allows the user to create and destroy CBs and to map them to the user's process address-space. A command buffer is a memory blocks that reside in DMA-able address-space and is physically contiguous so it can be accessed by the device without MMU translation. The command buffer memory is allocated using the coherent DMA API. When creating a new CB, the IOCTL returns a handle of it, and the user-space process needs to use that handle to mmap the buffer to get a VA in the user's address-space. Before destroying (freeing) a CB, the user must unmap the CB's VA using the CB handle. Each CB has a reference counter, which tracks its usage in command submissions and also its mmaps (only a single mmap is allowed). The driver maintains a pool of pre-allocated CBs in order to reduce latency during command submissions. In case the pool is empty, the driver will go to the slow-path of allocating a new CB, i.e. calling dma_alloc_coherent. Reviewed-by: Mike Rapoport Signed-off-by: Oded Gabbay Signed-off-by: Greg Kroah-Hartman --- drivers/misc/habanalabs/Makefile | 3 +- drivers/misc/habanalabs/command_buffer.c | 433 +++++++++++++++++++++++++++++ drivers/misc/habanalabs/device.c | 43 ++- drivers/misc/habanalabs/goya/goya.c | 28 ++ drivers/misc/habanalabs/habanalabs.h | 86 ++++++ drivers/misc/habanalabs/habanalabs_drv.c | 2 + drivers/misc/habanalabs/habanalabs_ioctl.c | 99 +++++++ include/uapi/misc/habanalabs.h | 46 +++ 8 files changed, 738 insertions(+), 2 deletions(-) create mode 100644 drivers/misc/habanalabs/command_buffer.c create mode 100644 drivers/misc/habanalabs/habanalabs_ioctl.c (limited to 'include/uapi') diff --git a/drivers/misc/habanalabs/Makefile b/drivers/misc/habanalabs/Makefile index 3ffbadc2ca01..2530c9b78ca4 100644 --- a/drivers/misc/habanalabs/Makefile +++ b/drivers/misc/habanalabs/Makefile @@ -4,7 +4,8 @@ obj-m := habanalabs.o -habanalabs-y := habanalabs_drv.o device.o context.o asid.o +habanalabs-y := habanalabs_drv.o device.o context.o asid.o habanalabs_ioctl.o \ + command_buffer.o include $(src)/goya/Makefile habanalabs-y += $(HL_GOYA_FILES) diff --git a/drivers/misc/habanalabs/command_buffer.c b/drivers/misc/habanalabs/command_buffer.c new file mode 100644 index 000000000000..e659ca3035e4 --- /dev/null +++ b/drivers/misc/habanalabs/command_buffer.c @@ -0,0 +1,433 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2016-2019 HabanaLabs, Ltd. + * All Rights Reserved. + */ + +#include +#include "habanalabs.h" + +#include +#include + +static void cb_fini(struct hl_device *hdev, struct hl_cb *cb) +{ + hdev->asic_funcs->dma_free_coherent(hdev, cb->size, + (void *) (uintptr_t) cb->kernel_address, + cb->bus_address); + kfree(cb); +} + +static void cb_do_release(struct hl_device *hdev, struct hl_cb *cb) +{ + if (cb->is_pool) { + spin_lock(&hdev->cb_pool_lock); + list_add(&cb->pool_list, &hdev->cb_pool); + spin_unlock(&hdev->cb_pool_lock); + } else { + cb_fini(hdev, cb); + } +} + +static void cb_release(struct kref *ref) +{ + struct hl_device *hdev; + struct hl_cb *cb; + + cb = container_of(ref, struct hl_cb, refcount); + hdev = cb->hdev; + + cb_do_release(hdev, cb); +} + +static struct hl_cb *hl_cb_alloc(struct hl_device *hdev, u32 cb_size, + int ctx_id) +{ + struct hl_cb *cb; + void *p; + + /* + * We use of GFP_ATOMIC here because this function can be called from + * the latency-sensitive code path for command submission. Due to H/W + * limitations in some of the ASICs, the kernel must copy the user CB + * that is designated for an external queue and actually enqueue + * the kernel's copy. Hence, we must never sleep in this code section + * and must use GFP_ATOMIC for all memory allocations. + */ + if (ctx_id == HL_KERNEL_ASID_ID) + cb = kzalloc(sizeof(*cb), GFP_ATOMIC); + else + cb = kzalloc(sizeof(*cb), GFP_KERNEL); + + if (!cb) + return NULL; + + if (ctx_id == HL_KERNEL_ASID_ID) + p = hdev->asic_funcs->dma_alloc_coherent(hdev, cb_size, + &cb->bus_address, GFP_ATOMIC); + else + p = hdev->asic_funcs->dma_alloc_coherent(hdev, cb_size, + &cb->bus_address, + GFP_USER | __GFP_ZERO); + if (!p) { + dev_err(hdev->dev, + "failed to allocate %d of dma memory for CB\n", + cb_size); + kfree(cb); + return NULL; + } + + cb->kernel_address = (u64) (uintptr_t) p; + cb->size = cb_size; + + return cb; +} + +int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr, + u32 cb_size, u64 *handle, int ctx_id) +{ + struct hl_cb *cb; + bool alloc_new_cb = true; + int rc; + + if (hdev->disabled) { + dev_warn_ratelimited(hdev->dev, + "Device is disabled. Can't create new CBs\n"); + rc = -EBUSY; + goto out_err; + } + + if (cb_size > HL_MAX_CB_SIZE) { + dev_err(hdev->dev, + "CB size %d must be less then %d\n", + cb_size, HL_MAX_CB_SIZE); + rc = -EINVAL; + goto out_err; + } + + /* Minimum allocation must be PAGE SIZE */ + if (cb_size < PAGE_SIZE) + cb_size = PAGE_SIZE; + + if (ctx_id == HL_KERNEL_ASID_ID && + cb_size <= hdev->asic_prop.cb_pool_cb_size) { + + spin_lock(&hdev->cb_pool_lock); + if (!list_empty(&hdev->cb_pool)) { + cb = list_first_entry(&hdev->cb_pool, typeof(*cb), + pool_list); + list_del(&cb->pool_list); + spin_unlock(&hdev->cb_pool_lock); + alloc_new_cb = false; + } else { + spin_unlock(&hdev->cb_pool_lock); + dev_dbg(hdev->dev, "CB pool is empty\n"); + } + } + + if (alloc_new_cb) { + cb = hl_cb_alloc(hdev, cb_size, ctx_id); + if (!cb) { + rc = -ENOMEM; + goto out_err; + } + } + + cb->hdev = hdev; + cb->ctx_id = ctx_id; + + spin_lock(&mgr->cb_lock); + rc = idr_alloc(&mgr->cb_handles, cb, 1, 0, GFP_ATOMIC); + spin_unlock(&mgr->cb_lock); + + if (rc < 0) { + dev_err(hdev->dev, "Failed to allocate IDR for a new CB\n"); + goto release_cb; + } + + cb->id = rc; + + kref_init(&cb->refcount); + spin_lock_init(&cb->lock); + + /* + * idr is 32-bit so we can safely OR it with a mask that is above + * 32 bit + */ + *handle = cb->id | HL_MMAP_CB_MASK; + *handle <<= PAGE_SHIFT; + + return 0; + +release_cb: + cb_do_release(hdev, cb); +out_err: + *handle = 0; + + return rc; +} + +int hl_cb_destroy(struct hl_device *hdev, struct hl_cb_mgr *mgr, u64 cb_handle) +{ + struct hl_cb *cb; + u32 handle; + int rc = 0; + + /* + * handle was given to user to do mmap, I need to shift it back to + * how the idr module gave it to me + */ + cb_handle >>= PAGE_SHIFT; + handle = (u32) cb_handle; + + spin_lock(&mgr->cb_lock); + + cb = idr_find(&mgr->cb_handles, handle); + if (cb) { + idr_remove(&mgr->cb_handles, handle); + spin_unlock(&mgr->cb_lock); + kref_put(&cb->refcount, cb_release); + } else { + spin_unlock(&mgr->cb_lock); + dev_err(hdev->dev, + "CB destroy failed, no match to handle 0x%x\n", handle); + rc = -EINVAL; + } + + return rc; +} + +int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data) +{ + union hl_cb_args *args = data; + struct hl_device *hdev = hpriv->hdev; + u64 handle; + int rc; + + switch (args->in.op) { + case HL_CB_OP_CREATE: + rc = hl_cb_create(hdev, &hpriv->cb_mgr, args->in.cb_size, + &handle, hpriv->ctx->asid); + memset(args, 0, sizeof(*args)); + args->out.cb_handle = handle; + break; + case HL_CB_OP_DESTROY: + rc = hl_cb_destroy(hdev, &hpriv->cb_mgr, + args->in.cb_handle); + break; + default: + rc = -ENOTTY; + break; + } + + return rc; +} + +static void cb_vm_close(struct vm_area_struct *vma) +{ + struct hl_cb *cb = (struct hl_cb *) vma->vm_private_data; + + cb->mmap_size -= vma->vm_end - vma->vm_start; + + if (cb->mmap_size) + return; + + spin_lock(&cb->lock); + cb->mmap = false; + spin_unlock(&cb->lock); + + hl_cb_put(cb); + vma->vm_private_data = NULL; +} + +static const struct vm_operations_struct cb_vm_ops = { + .close = cb_vm_close +}; + +int hl_cb_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma) +{ + struct hl_device *hdev = hpriv->hdev; + struct hl_cb *cb; + phys_addr_t address; + u32 handle; + int rc; + + handle = vma->vm_pgoff; + + /* reference was taken here */ + cb = hl_cb_get(hdev, &hpriv->cb_mgr, handle); + if (!cb) { + dev_err(hdev->dev, + "CB mmap failed, no match to handle %d\n", handle); + return -EINVAL; + } + + /* Validation check */ + if ((vma->vm_end - vma->vm_start) != cb->size) { + dev_err(hdev->dev, + "CB mmap failed, mmap size 0x%lx != 0x%x cb size\n", + vma->vm_end - vma->vm_start, cb->size); + rc = -EINVAL; + goto put_cb; + } + + spin_lock(&cb->lock); + + if (cb->mmap) { + dev_err(hdev->dev, + "CB mmap failed, CB already mmaped to user\n"); + rc = -EINVAL; + goto release_lock; + } + + cb->mmap = true; + + spin_unlock(&cb->lock); + + vma->vm_ops = &cb_vm_ops; + + /* + * Note: We're transferring the cb reference to + * vma->vm_private_data here. + */ + + vma->vm_private_data = cb; + + /* Calculate address for CB */ + address = virt_to_phys((void *) (uintptr_t) cb->kernel_address); + + rc = hdev->asic_funcs->cb_mmap(hdev, vma, cb->kernel_address, + address, cb->size); + + if (rc) { + spin_lock(&cb->lock); + cb->mmap = false; + goto release_lock; + } + + cb->mmap_size = cb->size; + + return 0; + +release_lock: + spin_unlock(&cb->lock); +put_cb: + hl_cb_put(cb); + return rc; +} + +struct hl_cb *hl_cb_get(struct hl_device *hdev, struct hl_cb_mgr *mgr, + u32 handle) +{ + struct hl_cb *cb; + + spin_lock(&mgr->cb_lock); + cb = idr_find(&mgr->cb_handles, handle); + + if (!cb) { + spin_unlock(&mgr->cb_lock); + dev_warn(hdev->dev, + "CB get failed, no match to handle %d\n", handle); + return NULL; + } + + kref_get(&cb->refcount); + + spin_unlock(&mgr->cb_lock); + + return cb; + +} + +void hl_cb_put(struct hl_cb *cb) +{ + kref_put(&cb->refcount, cb_release); +} + +void hl_cb_mgr_init(struct hl_cb_mgr *mgr) +{ + spin_lock_init(&mgr->cb_lock); + idr_init(&mgr->cb_handles); +} + +void hl_cb_mgr_fini(struct hl_device *hdev, struct hl_cb_mgr *mgr) +{ + struct hl_cb *cb; + struct idr *idp; + u32 id; + + idp = &mgr->cb_handles; + + idr_for_each_entry(idp, cb, id) { + if (kref_put(&cb->refcount, cb_release) != 1) + dev_err(hdev->dev, + "CB %d for CTX ID %d is still alive\n", + id, cb->ctx_id); + } + + idr_destroy(&mgr->cb_handles); +} + +struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size) +{ + u64 cb_handle; + struct hl_cb *cb; + int rc; + + rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, cb_size, &cb_handle, + HL_KERNEL_ASID_ID); + if (rc) { + dev_err(hdev->dev, "Failed to allocate CB for KMD %d\n", rc); + return NULL; + } + + cb_handle >>= PAGE_SHIFT; + cb = hl_cb_get(hdev, &hdev->kernel_cb_mgr, (u32) cb_handle); + /* hl_cb_get should never fail here so use kernel WARN */ + WARN(!cb, "Kernel CB handle invalid 0x%x\n", (u32) cb_handle); + if (!cb) + goto destroy_cb; + + return cb; + +destroy_cb: + hl_cb_destroy(hdev, &hdev->kernel_cb_mgr, cb_handle << PAGE_SHIFT); + + return NULL; +} + +int hl_cb_pool_init(struct hl_device *hdev) +{ + struct hl_cb *cb; + int i; + + INIT_LIST_HEAD(&hdev->cb_pool); + spin_lock_init(&hdev->cb_pool_lock); + + for (i = 0 ; i < hdev->asic_prop.cb_pool_cb_cnt ; i++) { + cb = hl_cb_alloc(hdev, hdev->asic_prop.cb_pool_cb_size, + HL_KERNEL_ASID_ID); + if (cb) { + cb->is_pool = true; + list_add(&cb->pool_list, &hdev->cb_pool); + } else { + hl_cb_pool_fini(hdev); + return -ENOMEM; + } + } + + return 0; +} + +int hl_cb_pool_fini(struct hl_device *hdev) +{ + struct hl_cb *cb, *tmp; + + list_for_each_entry_safe(cb, tmp, &hdev->cb_pool, pool_list) { + list_del(&cb->pool_list); + cb_fini(hdev, cb); + } + + return 0; +} diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c index 2423588ecf22..77f0018e7aa9 100644 --- a/drivers/misc/habanalabs/device.c +++ b/drivers/misc/habanalabs/device.c @@ -52,6 +52,7 @@ static int hl_device_release(struct inode *inode, struct file *filp) { struct hl_fpriv *hpriv = filp->private_data; + hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr); hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr); filp->private_data = NULL; @@ -61,10 +62,34 @@ static int hl_device_release(struct inode *inode, struct file *filp) return 0; } +/* + * hl_mmap - mmap function for habanalabs device + * + * @*filp: pointer to file structure + * @*vma: pointer to vm_area_struct of the process + * + * Called when process does an mmap on habanalabs device. Call the device's mmap + * function at the end of the common code. + */ +static int hl_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct hl_fpriv *hpriv = filp->private_data; + + if ((vma->vm_pgoff & HL_MMAP_CB_MASK) == HL_MMAP_CB_MASK) { + vma->vm_pgoff ^= HL_MMAP_CB_MASK; + return hl_cb_mmap(hpriv, vma); + } + + return hpriv->hdev->asic_funcs->mmap(hpriv, vma); +} + static const struct file_operations hl_ops = { .owner = THIS_MODULE, .open = hl_device_open, - .release = hl_device_release + .release = hl_device_release, + .mmap = hl_mmap, + .unlocked_ioctl = hl_ioctl, + .compat_ioctl = hl_ioctl }; /* @@ -149,6 +174,8 @@ static int device_early_init(struct hl_device *hdev) if (rc) goto early_fini; + hl_cb_mgr_init(&hdev->kernel_cb_mgr); + mutex_init(&hdev->fd_open_cnt_lock); atomic_set(&hdev->fd_open_cnt, 0); @@ -170,6 +197,8 @@ early_fini: static void device_early_fini(struct hl_device *hdev) { + hl_cb_mgr_fini(hdev, &hdev->kernel_cb_mgr); + hl_asid_fini(hdev); if (hdev->asic_funcs->early_fini) @@ -284,11 +313,21 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass) goto free_ctx; } + rc = hl_cb_pool_init(hdev); + if (rc) { + dev_err(hdev->dev, "failed to initialize CB pool\n"); + goto release_ctx; + } + dev_notice(hdev->dev, "Successfully added device to habanalabs driver\n"); return 0; +release_ctx: + if (hl_ctx_put(hdev->kernel_ctx) != 1) + dev_err(hdev->dev, + "kernel ctx is still alive on initialization failure\n"); free_ctx: kfree(hdev->kernel_ctx); sw_fini: @@ -325,6 +364,8 @@ void hl_device_fini(struct hl_device *hdev) /* Mark device as disabled */ hdev->disabled = true; + hl_cb_pool_fini(hdev); + /* Release kernel context */ if ((hdev->kernel_ctx) && (hl_ctx_put(hdev->kernel_ctx) != 1)) dev_err(hdev->dev, "kernel ctx is still alive\n"); diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index ceaffa9afd83..88b047f7d85d 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -82,6 +82,9 @@ #define GOYA_MAX_INITIATORS 20 +#define GOYA_CB_POOL_CB_CNT 512 +#define GOYA_CB_POOL_CB_SIZE 0x20000 /* 128KB */ + static void goya_get_fixed_properties(struct hl_device *hdev) { struct asic_fixed_properties *prop = &hdev->asic_prop; @@ -109,6 +112,8 @@ static void goya_get_fixed_properties(struct hl_device *hdev) prop->tpc_enabled_mask = TPC_ENABLED_MASK; prop->high_pll = PLL_HIGH_DEFAULT; + prop->cb_pool_cb_cnt = GOYA_CB_POOL_CB_CNT; + prop->cb_pool_cb_size = GOYA_CB_POOL_CB_SIZE; } /* @@ -597,6 +602,27 @@ int goya_resume(struct hl_device *hdev) return 0; } +int goya_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma) +{ + return -EINVAL; +} + +int goya_cb_mmap(struct hl_device *hdev, struct vm_area_struct *vma, + u64 kaddress, phys_addr_t paddress, u32 size) +{ + int rc; + + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | + VM_DONTCOPY | VM_NORESERVE; + + rc = remap_pfn_range(vma, vma->vm_start, paddress >> PAGE_SHIFT, + size, vma->vm_page_prot); + if (rc) + dev_err(hdev->dev, "remap_pfn_range error %d", rc); + + return rc; +} + void *goya_dma_alloc_coherent(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle, gfp_t flags) { @@ -616,6 +642,8 @@ static const struct hl_asic_funcs goya_funcs = { .sw_fini = goya_sw_fini, .suspend = goya_suspend, .resume = goya_resume, + .mmap = goya_mmap, + .cb_mmap = goya_cb_mmap, .dma_alloc_coherent = goya_dma_alloc_coherent, .dma_free_coherent = goya_dma_free_coherent, }; diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index ca8171ca3a04..63741c7224b6 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -14,9 +14,12 @@ #define HL_NAME "habanalabs" +#define HL_MMAP_CB_MASK (0x8000000000000000ull >> PAGE_SHIFT) + #define HL_MAX_QUEUES 128 struct hl_device; +struct hl_fpriv; /** @@ -44,6 +47,8 @@ struct hl_device; * @max_asid: maximum number of open contexts (ASIDs). * @completion_queues_count: number of completion queues. * @high_pll: high PLL frequency used by the device. + * @cb_pool_cb_cnt: number of CBs in the CB pool. + * @cb_pool_cb_size: size of each CB in the CB pool. * @tpc_enabled_mask: which TPCs are enabled. */ struct asic_fixed_properties { @@ -64,11 +69,60 @@ struct asic_fixed_properties { u32 sram_size; u32 max_asid; u32 high_pll; + u32 cb_pool_cb_cnt; + u32 cb_pool_cb_size; u8 completion_queues_count; u8 tpc_enabled_mask; }; +/* + * Command Buffers + */ + +#define HL_MAX_CB_SIZE 0x200000 /* 2MB */ + +/** + * struct hl_cb_mgr - describes a Command Buffer Manager. + * @cb_lock: protects cb_handles. + * @cb_handles: an idr to hold all command buffer handles. + */ +struct hl_cb_mgr { + spinlock_t cb_lock; + struct idr cb_handles; /* protected by cb_lock */ +}; + +/** + * struct hl_cb - describes a Command Buffer. + * @refcount: reference counter for usage of the CB. + * @hdev: pointer to device this CB belongs to. + * @lock: spinlock to protect mmap/cs flows. + * @pool_list: node in pool list of command buffers. + * @kernel_address: Holds the CB's kernel virtual address. + * @bus_address: Holds the CB's DMA address. + * @mmap_size: Holds the CB's size that was mmaped. + * @size: holds the CB's size. + * @id: the CB's ID. + * @ctx_id: holds the ID of the owner's context. + * @mmap: true if the CB is currently mmaped to user. + * @is_pool: true if CB was acquired from the pool, false otherwise. + */ +struct hl_cb { + struct kref refcount; + struct hl_device *hdev; + spinlock_t lock; + struct list_head pool_list; + u64 kernel_address; + dma_addr_t bus_address; + u32 mmap_size; + u32 size; + u32 id; + u32 ctx_id; + u8 mmap; + u8 is_pool; +}; + + #define HL_QUEUE_LENGTH 256 @@ -97,6 +151,8 @@ enum hl_asic_type { * @sw_fini: tears down driver state, does not configure H/W. * @suspend: handles IP specific H/W or SW changes for suspend. * @resume: handles IP specific H/W or SW changes for resume. + * @mmap: mmap function, does nothing. + * @cb_mmap: maps a CB. * @dma_alloc_coherent: Allocate coherent DMA memory by calling * dma_alloc_coherent(). This is ASIC function because its * implementation is not trivial when the driver is loaded @@ -113,6 +169,9 @@ struct hl_asic_funcs { int (*sw_fini)(struct hl_device *hdev); int (*suspend)(struct hl_device *hdev); int (*resume)(struct hl_device *hdev); + int (*mmap)(struct hl_fpriv *hpriv, struct vm_area_struct *vma); + int (*cb_mmap)(struct hl_device *hdev, struct vm_area_struct *vma, + u64 kaddress, phys_addr_t paddress, u32 size); void* (*dma_alloc_coherent)(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle, gfp_t flag); void (*dma_free_coherent)(struct hl_device *hdev, size_t size, @@ -163,6 +222,7 @@ struct hl_ctx_mgr { * @taskpid: current process ID. * @ctx: current executing context. * @ctx_mgr: context manager to handle multiple context for this FD. + * @cb_mgr: command buffer manager to handle multiple buffers for this FD. * @refcount: number of related contexts. */ struct hl_fpriv { @@ -171,6 +231,7 @@ struct hl_fpriv { struct pid *taskpid; struct hl_ctx *ctx; /* TODO: remove for multiple ctx */ struct hl_ctx_mgr ctx_mgr; + struct hl_cb_mgr cb_mgr; struct kref refcount; }; @@ -225,6 +286,7 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val); * @asic_name: ASIC specific nmae. * @asic_type: ASIC specific type. * @kernel_ctx: KMD context structure. + * @kernel_cb_mgr: command buffer manager for creating/destroying/handling CGs. * @dma_pool: DMA pool for small allocations. * @cpu_accessible_dma_mem: KMD <-> ArmCP shared memory CPU address. * @cpu_accessible_dma_address: KMD <-> ArmCP shared memory DMA address. @@ -240,6 +302,8 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val); * @asic_prop: ASIC specific immutable properties. * @asic_funcs: ASIC specific functions. * @asic_specific: ASIC specific information to use only from ASIC files. + * @cb_pool: list of preallocated CBs. + * @cb_pool_lock: protects the CB pool. * @user_ctx: current user context executing. * @fd_open_cnt: number of open user processes. * @major: habanalabs KMD major. @@ -255,6 +319,7 @@ struct hl_device { char asic_name[16]; enum hl_asic_type asic_type; struct hl_ctx *kernel_ctx; + struct hl_cb_mgr kernel_cb_mgr; struct dma_pool *dma_pool; void *cpu_accessible_dma_mem; dma_addr_t cpu_accessible_dma_address; @@ -266,6 +331,10 @@ struct hl_device { struct asic_fixed_properties asic_prop; const struct hl_asic_funcs *asic_funcs; void *asic_specific; + + struct list_head cb_pool; + spinlock_t cb_pool_lock; + /* TODO: remove user_ctx for multiple process support */ struct hl_ctx *user_ctx; atomic_t fd_open_cnt; @@ -334,6 +403,23 @@ int hl_device_resume(struct hl_device *hdev); void hl_hpriv_get(struct hl_fpriv *hpriv); void hl_hpriv_put(struct hl_fpriv *hpriv); +int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr, u32 cb_size, + u64 *handle, int ctx_id); +int hl_cb_destroy(struct hl_device *hdev, struct hl_cb_mgr *mgr, u64 cb_handle); +int hl_cb_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma); +struct hl_cb *hl_cb_get(struct hl_device *hdev, struct hl_cb_mgr *mgr, + u32 handle); +void hl_cb_put(struct hl_cb *cb); +void hl_cb_mgr_init(struct hl_cb_mgr *mgr); +void hl_cb_mgr_fini(struct hl_device *hdev, struct hl_cb_mgr *mgr); +struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size); +int hl_cb_pool_init(struct hl_device *hdev); +int hl_cb_pool_fini(struct hl_device *hdev); + void goya_set_asic_funcs(struct hl_device *hdev); +/* IOCTLs */ +long hl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); +int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data); + #endif /* HABANALABSP_H_ */ diff --git a/drivers/misc/habanalabs/habanalabs_drv.c b/drivers/misc/habanalabs/habanalabs_drv.c index 6fddd801aca3..8628d1d8f037 100644 --- a/drivers/misc/habanalabs/habanalabs_drv.c +++ b/drivers/misc/habanalabs/habanalabs_drv.c @@ -116,6 +116,7 @@ int hl_device_open(struct inode *inode, struct file *filp) kref_init(&hpriv->refcount); nonseekable_open(inode, filp); + hl_cb_mgr_init(&hpriv->cb_mgr); hl_ctx_mgr_init(&hpriv->ctx_mgr); rc = hl_ctx_create(hdev, hpriv); @@ -131,6 +132,7 @@ int hl_device_open(struct inode *inode, struct file *filp) out_err: filp->private_data = NULL; hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr); + hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr); kfree(hpriv); close_device: diff --git a/drivers/misc/habanalabs/habanalabs_ioctl.c b/drivers/misc/habanalabs/habanalabs_ioctl.c new file mode 100644 index 000000000000..e53265fe9543 --- /dev/null +++ b/drivers/misc/habanalabs/habanalabs_ioctl.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2016-2019 HabanaLabs, Ltd. + * All Rights Reserved. + */ + +#include +#include "habanalabs.h" + +#include +#include +#include + +#define HL_IOCTL_DEF(ioctl, _func) \ + [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func} + +static const struct hl_ioctl_desc hl_ioctls[] = { + HL_IOCTL_DEF(HL_IOCTL_CB, hl_cb_ioctl) +}; + +#define HL_CORE_IOCTL_COUNT ARRAY_SIZE(hl_ioctls) + +long hl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) +{ + struct hl_fpriv *hpriv = filep->private_data; + struct hl_device *hdev = hpriv->hdev; + hl_ioctl_t *func; + const struct hl_ioctl_desc *ioctl = NULL; + unsigned int nr = _IOC_NR(cmd); + char stack_kdata[128] = {0}; + char *kdata = NULL; + unsigned int usize, asize; + int retcode; + + if ((nr >= HL_COMMAND_START) && (nr < HL_COMMAND_END)) { + u32 hl_size; + + ioctl = &hl_ioctls[nr]; + + hl_size = _IOC_SIZE(ioctl->cmd); + usize = asize = _IOC_SIZE(cmd); + if (hl_size > asize) + asize = hl_size; + + cmd = ioctl->cmd; + } else { + dev_err(hdev->dev, "invalid ioctl: pid=%d, nr=0x%02x\n", + task_pid_nr(current), nr); + return -ENOTTY; + } + + /* Do not trust userspace, use our own definition */ + func = ioctl->func; + + if (unlikely(!func)) { + dev_dbg(hdev->dev, "no function\n"); + retcode = -ENOTTY; + goto out_err; + } + + if (cmd & (IOC_IN | IOC_OUT)) { + if (asize <= sizeof(stack_kdata)) { + kdata = stack_kdata; + } else { + kdata = kzalloc(asize, GFP_KERNEL); + if (!kdata) { + retcode = -ENOMEM; + goto out_err; + } + } + } + + if (cmd & IOC_IN) { + if (copy_from_user(kdata, (void __user *)arg, usize)) { + retcode = -EFAULT; + goto out_err; + } + } else if (cmd & IOC_OUT) { + memset(kdata, 0, usize); + } + + retcode = func(hpriv, kdata); + + if (cmd & IOC_OUT) + if (copy_to_user((void __user *)arg, kdata, usize)) + retcode = -EFAULT; + +out_err: + if (retcode) + dev_dbg(hdev->dev, + "error in ioctl: pid=%d, cmd=0x%02x, nr=0x%02x\n", + task_pid_nr(current), cmd, nr); + + if (kdata != stack_kdata) + kfree(kdata); + + return retcode; +} diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index a0ec23adf8f5..a8edfd3e9c95 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -17,4 +17,50 @@ */ #define GOYA_KMD_SRAM_RESERVED_SIZE_FROM_START 0x8000 /* 32KB */ +/* Opcode to create a new command buffer */ +#define HL_CB_OP_CREATE 0 +/* Opcode to destroy previously created command buffer */ +#define HL_CB_OP_DESTROY 1 + +struct hl_cb_in { + /* Handle of CB or 0 if we want to create one */ + __u64 cb_handle; + /* HL_CB_OP_* */ + __u32 op; + /* Size of CB. Minimum requested size must be PAGE_SIZE */ + __u32 cb_size; + /* Context ID - Currently not in use */ + __u32 ctx_id; + __u32 pad; +}; + +struct hl_cb_out { + /* Handle of CB */ + __u64 cb_handle; +}; + +union hl_cb_args { + struct hl_cb_in in; + struct hl_cb_out out; +}; + +/* + * Command Buffer + * - Request a Command Buffer + * - Destroy a Command Buffer + * + * The command buffers are memory blocks that reside in DMA-able address + * space and are physically contiguous so they can be accessed by the device + * directly. They are allocated using the coherent DMA API. + * + * When creating a new CB, the IOCTL returns a handle of it, and the user-space + * process needs to use that handle to mmap the buffer so it can access them. + * + */ +#define HL_IOCTL_CB \ + _IOWR('H', 0x02, union hl_cb_args) + +#define HL_COMMAND_START 0x02 +#define HL_COMMAND_END 0x03 + #endif /* HABANALABS_H_ */ -- cgit v1.3-7-g2ca7 From 9494a8dd8d22cbff8ce358aaa223fffe1b070cb0 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Sat, 16 Feb 2019 00:39:17 +0200 Subject: habanalabs: add h/w queues module This patch adds the H/W queues module and the code to initialize Goya's various compute and DMA engines and their queues. Goya has 5 DMA channels, 8 TPC engines and a single MME engine. For each channel/engine, there is a H/W queue logic which is used to pass commands from the user to the H/W. That logic is called QMAN. There are two types of QMANs: external and internal. The DMA QMANs are considered external while the TPC and MME QMANs are considered internal. For each external queue there is a completion queue, which is located on the Host memory. The differences between external and internal QMANs are: 1. The location of the queue's memory. External QMANs are located on the Host memory while internal QMANs are located on the on-chip memory. 2. The external QMAN write an entry to a completion queue and sends an MSI-X interrupt upon completion of a command buffer that was given to it. The internal QMAN doesn't do that. Reviewed-by: Mike Rapoport Signed-off-by: Oded Gabbay Signed-off-by: Greg Kroah-Hartman --- drivers/misc/habanalabs/Makefile | 2 +- drivers/misc/habanalabs/device.c | 75 +- drivers/misc/habanalabs/goya/goya.c | 1527 ++++++++++++++++++-- drivers/misc/habanalabs/goya/goyaP.h | 7 + drivers/misc/habanalabs/habanalabs.h | 174 ++- drivers/misc/habanalabs/habanalabs_drv.c | 5 + drivers/misc/habanalabs/hw_queue.c | 400 +++++ drivers/misc/habanalabs/include/armcp_if.h | 292 ++++ .../habanalabs/include/goya/goya_async_events.h | 186 +++ .../misc/habanalabs/include/goya/goya_packets.h | 129 ++ drivers/misc/habanalabs/include/qman_if.h | 56 + drivers/misc/habanalabs/irq.c | 149 ++ include/uapi/misc/habanalabs.h | 29 + 13 files changed, 2915 insertions(+), 116 deletions(-) create mode 100644 drivers/misc/habanalabs/hw_queue.c create mode 100644 drivers/misc/habanalabs/include/goya/goya_async_events.h create mode 100644 drivers/misc/habanalabs/include/goya/goya_packets.h create mode 100644 drivers/misc/habanalabs/include/qman_if.h create mode 100644 drivers/misc/habanalabs/irq.c (limited to 'include/uapi') diff --git a/drivers/misc/habanalabs/Makefile b/drivers/misc/habanalabs/Makefile index 2530c9b78ca4..c07f3ccb57dc 100644 --- a/drivers/misc/habanalabs/Makefile +++ b/drivers/misc/habanalabs/Makefile @@ -5,7 +5,7 @@ obj-m := habanalabs.o habanalabs-y := habanalabs_drv.o device.o context.o asid.o habanalabs_ioctl.o \ - command_buffer.o + command_buffer.o hw_queue.o irq.o include $(src)/goya/Makefile habanalabs-y += $(HL_GOYA_FILES) diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c index 5eefc2952be5..06e2b7f32499 100644 --- a/drivers/misc/habanalabs/device.c +++ b/drivers/misc/habanalabs/device.c @@ -174,13 +174,23 @@ static int device_early_init(struct hl_device *hdev) if (rc) goto early_fini; + hdev->cq_wq = alloc_workqueue("hl-free-jobs", WQ_UNBOUND, 0); + if (hdev->cq_wq == NULL) { + dev_err(hdev->dev, "Failed to allocate CQ workqueue\n"); + rc = -ENOMEM; + goto asid_fini; + } + hl_cb_mgr_init(&hdev->kernel_cb_mgr); mutex_init(&hdev->fd_open_cnt_lock); + mutex_init(&hdev->send_cpu_message_lock); atomic_set(&hdev->fd_open_cnt, 0); return 0; +asid_fini: + hl_asid_fini(hdev); early_fini: if (hdev->asic_funcs->early_fini) hdev->asic_funcs->early_fini(hdev); @@ -196,9 +206,12 @@ early_fini: */ static void device_early_fini(struct hl_device *hdev) { + mutex_destroy(&hdev->send_cpu_message_lock); hl_cb_mgr_fini(hdev, &hdev->kernel_cb_mgr); + destroy_workqueue(hdev->cq_wq); + hl_asid_fini(hdev); if (hdev->asic_funcs->early_fini) @@ -277,7 +290,7 @@ int hl_device_resume(struct hl_device *hdev) */ int hl_device_init(struct hl_device *hdev, struct class *hclass) { - int rc; + int i, rc, cq_ready_cnt; /* Create device */ rc = device_setup_cdev(hdev, hclass, hdev->id, &hl_ops); @@ -298,11 +311,48 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass) if (rc) goto early_fini; + /* + * Initialize the H/W queues. Must be done before hw_init, because + * there the addresses of the kernel queue are being written to the + * registers of the device + */ + rc = hl_hw_queues_create(hdev); + if (rc) { + dev_err(hdev->dev, "failed to initialize kernel queues\n"); + goto sw_fini; + } + + /* + * Initialize the completion queues. Must be done before hw_init, + * because there the addresses of the completion queues are being + * passed as arguments to request_irq + */ + hdev->completion_queue = + kcalloc(hdev->asic_prop.completion_queues_count, + sizeof(*hdev->completion_queue), GFP_KERNEL); + + if (!hdev->completion_queue) { + dev_err(hdev->dev, "failed to allocate completion queues\n"); + rc = -ENOMEM; + goto hw_queues_destroy; + } + + for (i = 0, cq_ready_cnt = 0; + i < hdev->asic_prop.completion_queues_count; + i++, cq_ready_cnt++) { + rc = hl_cq_init(hdev, &hdev->completion_queue[i], i); + if (rc) { + dev_err(hdev->dev, + "failed to initialize completion queue\n"); + goto cq_fini; + } + } + /* Allocate the kernel context */ hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx), GFP_KERNEL); if (!hdev->kernel_ctx) { rc = -ENOMEM; - goto sw_fini; + goto cq_fini; } hdev->user_ctx = NULL; @@ -328,6 +378,14 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass) hdev->disabled = false; + /* Check that the communication with the device is working */ + rc = hdev->asic_funcs->test_queues(hdev); + if (rc) { + dev_err(hdev->dev, "Failed to detect if device is alive\n"); + rc = 0; + goto out_disabled; + } + dev_notice(hdev->dev, "Successfully added device to habanalabs driver\n"); @@ -339,6 +397,12 @@ release_ctx: "kernel ctx is still alive on initialization failure\n"); free_ctx: kfree(hdev->kernel_ctx); +cq_fini: + for (i = 0 ; i < cq_ready_cnt ; i++) + hl_cq_fini(hdev, &hdev->completion_queue[i]); + kfree(hdev->completion_queue); +hw_queues_destroy: + hl_hw_queues_destroy(hdev); sw_fini: hdev->asic_funcs->sw_fini(hdev); early_fini: @@ -368,6 +432,7 @@ out_disabled: */ void hl_device_fini(struct hl_device *hdev) { + int i; dev_info(hdev->dev, "Removing device\n"); /* Mark device as disabled */ @@ -382,6 +447,12 @@ void hl_device_fini(struct hl_device *hdev) /* Reset the H/W. It will be in idle state after this returns */ hdev->asic_funcs->hw_fini(hdev, true); + for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) + hl_cq_fini(hdev, &hdev->completion_queue[i]); + kfree(hdev->completion_queue); + + hl_hw_queues_destroy(hdev); + /* Call ASIC S/W finalize function */ hdev->asic_funcs->sw_fini(hdev); diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 1fa5b91fd703..a45a183d4e5c 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -90,6 +90,26 @@ static void goya_get_fixed_properties(struct hl_device *hdev) { struct asic_fixed_properties *prop = &hdev->asic_prop; + int i; + + for (i = 0 ; i < NUMBER_OF_EXT_HW_QUEUES ; i++) { + prop->hw_queues_props[i].type = QUEUE_TYPE_EXT; + prop->hw_queues_props[i].kmd_only = 0; + } + + for (; i < NUMBER_OF_EXT_HW_QUEUES + NUMBER_OF_CPU_HW_QUEUES ; i++) { + prop->hw_queues_props[i].type = QUEUE_TYPE_CPU; + prop->hw_queues_props[i].kmd_only = 1; + } + + for (; i < NUMBER_OF_EXT_HW_QUEUES + NUMBER_OF_CPU_HW_QUEUES + + NUMBER_OF_INT_HW_QUEUES; i++) { + prop->hw_queues_props[i].type = QUEUE_TYPE_INT; + prop->hw_queues_props[i].kmd_only = 0; + } + + for (; i < HL_MAX_QUEUES; i++) + prop->hw_queues_props[i].type = QUEUE_TYPE_NA; prop->completion_queues_count = NUMBER_OF_CMPLT_QUEUES; @@ -118,6 +138,18 @@ static void goya_get_fixed_properties(struct hl_device *hdev) prop->high_pll = PLL_HIGH_DEFAULT; } +int goya_send_pci_access_msg(struct hl_device *hdev, u32 opcode) +{ + struct armcp_packet pkt; + + memset(&pkt, 0, sizeof(pkt)); + + pkt.ctl = opcode << ARMCP_PKT_CTL_OPCODE_SHIFT; + + return hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, + sizeof(pkt), HL_DEVICE_TIMEOUT_USEC, NULL); +} + /* * goya_pci_bars_map - Map PCI BARS of Goya device * @@ -510,6 +542,8 @@ static int goya_sw_init(struct hl_device *hdev) if (!goya) return -ENOMEM; + goya->test_cpu_queue = goya_test_cpu_queue; + /* according to goya_init_iatu */ goya->ddr_bar_cur_addr = DRAM_PHYS_BASE; hdev->asic_specific = goya; @@ -596,6 +630,311 @@ int goya_sw_fini(struct hl_device *hdev) return 0; } +static void goya_init_dma_qman(struct hl_device *hdev, int dma_id, + dma_addr_t bus_address) +{ + struct goya_device *goya = hdev->asic_specific; + u32 mtr_base_lo, mtr_base_hi; + u32 so_base_lo, so_base_hi; + u32 gic_base_lo, gic_base_hi; + u32 reg_off = dma_id * (mmDMA_QM_1_PQ_PI - mmDMA_QM_0_PQ_PI); + + mtr_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0); + mtr_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0); + so_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0); + so_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0); + + gic_base_lo = + lower_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR); + gic_base_hi = + upper_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR); + + WREG32(mmDMA_QM_0_PQ_BASE_LO + reg_off, lower_32_bits(bus_address)); + WREG32(mmDMA_QM_0_PQ_BASE_HI + reg_off, upper_32_bits(bus_address)); + + WREG32(mmDMA_QM_0_PQ_SIZE + reg_off, ilog2(HL_QUEUE_LENGTH)); + WREG32(mmDMA_QM_0_PQ_PI + reg_off, 0); + WREG32(mmDMA_QM_0_PQ_CI + reg_off, 0); + + WREG32(mmDMA_QM_0_CP_MSG_BASE0_ADDR_LO + reg_off, mtr_base_lo); + WREG32(mmDMA_QM_0_CP_MSG_BASE0_ADDR_HI + reg_off, mtr_base_hi); + WREG32(mmDMA_QM_0_CP_MSG_BASE1_ADDR_LO + reg_off, so_base_lo); + WREG32(mmDMA_QM_0_CP_MSG_BASE1_ADDR_HI + reg_off, so_base_hi); + WREG32(mmDMA_QM_0_GLBL_ERR_ADDR_LO + reg_off, gic_base_lo); + WREG32(mmDMA_QM_0_GLBL_ERR_ADDR_HI + reg_off, gic_base_hi); + WREG32(mmDMA_QM_0_GLBL_ERR_WDATA + reg_off, + GOYA_ASYNC_EVENT_ID_DMA0_QM + dma_id); + + /* PQ has buffer of 2 cache lines, while CQ has 8 lines */ + WREG32(mmDMA_QM_0_PQ_CFG1 + reg_off, 0x00020002); + WREG32(mmDMA_QM_0_CQ_CFG1 + reg_off, 0x00080008); + + if (dma_id == 0) + WREG32(mmDMA_QM_0_GLBL_PROT + reg_off, QMAN_DMA_FULLY_TRUSTED); + else + if (goya->hw_cap_initialized & HW_CAP_MMU) + WREG32(mmDMA_QM_0_GLBL_PROT + reg_off, + QMAN_DMA_PARTLY_TRUSTED); + else + WREG32(mmDMA_QM_0_GLBL_PROT + reg_off, + QMAN_DMA_FULLY_TRUSTED); + + WREG32(mmDMA_QM_0_GLBL_ERR_CFG + reg_off, QMAN_DMA_ERR_MSG_EN); + WREG32(mmDMA_QM_0_GLBL_CFG0 + reg_off, QMAN_DMA_ENABLE); +} + +static void goya_init_dma_ch(struct hl_device *hdev, int dma_id) +{ + u32 gic_base_lo, gic_base_hi; + u64 sob_addr; + u32 reg_off = dma_id * (mmDMA_CH_1_CFG1 - mmDMA_CH_0_CFG1); + + gic_base_lo = + lower_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR); + gic_base_hi = + upper_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR); + + WREG32(mmDMA_CH_0_ERRMSG_ADDR_LO + reg_off, gic_base_lo); + WREG32(mmDMA_CH_0_ERRMSG_ADDR_HI + reg_off, gic_base_hi); + WREG32(mmDMA_CH_0_ERRMSG_WDATA + reg_off, + GOYA_ASYNC_EVENT_ID_DMA0_CH + dma_id); + + if (dma_id) { + sob_addr = CFG_BASE + mmSYNC_MNGR_SOB_OBJ_1000 + + (dma_id - 1) * 4; + WREG32(mmDMA_CH_0_WR_COMP_ADDR_LO + reg_off, + lower_32_bits(sob_addr)); + WREG32(mmDMA_CH_0_WR_COMP_ADDR_HI + reg_off, + upper_32_bits(sob_addr)); + WREG32(mmDMA_CH_0_WR_COMP_WDATA + reg_off, 0x80000001); + } +} + +/* + * goya_init_dma_qmans - Initialize QMAN DMA registers + * + * @hdev: pointer to hl_device structure + * + * Initialize the H/W registers of the QMAN DMA channels + * + */ +static void goya_init_dma_qmans(struct hl_device *hdev) +{ + struct goya_device *goya = hdev->asic_specific; + struct hl_hw_queue *q; + dma_addr_t bus_address; + int i; + + if (goya->hw_cap_initialized & HW_CAP_DMA) + return; + + q = &hdev->kernel_queues[0]; + + for (i = 0 ; i < NUMBER_OF_EXT_HW_QUEUES ; i++, q++) { + bus_address = q->bus_address + + hdev->asic_prop.host_phys_base_address; + + goya_init_dma_qman(hdev, i, bus_address); + goya_init_dma_ch(hdev, i); + } + + goya->hw_cap_initialized |= HW_CAP_DMA; +} + +/* + * goya_disable_external_queues - Disable external queues + * + * @hdev: pointer to hl_device structure + * + */ +static void goya_disable_external_queues(struct hl_device *hdev) +{ + WREG32(mmDMA_QM_0_GLBL_CFG0, 0); + WREG32(mmDMA_QM_1_GLBL_CFG0, 0); + WREG32(mmDMA_QM_2_GLBL_CFG0, 0); + WREG32(mmDMA_QM_3_GLBL_CFG0, 0); + WREG32(mmDMA_QM_4_GLBL_CFG0, 0); +} + +static int goya_stop_queue(struct hl_device *hdev, u32 cfg_reg, + u32 cp_sts_reg, u32 glbl_sts0_reg) +{ + int rc; + u32 status; + + /* use the values of TPC0 as they are all the same*/ + + WREG32(cfg_reg, 1 << TPC0_QM_GLBL_CFG1_CP_STOP_SHIFT); + + status = RREG32(cp_sts_reg); + if (status & TPC0_QM_CP_STS_FENCE_IN_PROGRESS_MASK) { + rc = hl_poll_timeout( + hdev, + cp_sts_reg, + status, + !(status & TPC0_QM_CP_STS_FENCE_IN_PROGRESS_MASK), + 1000, + QMAN_FENCE_TIMEOUT_USEC); + + /* if QMAN is stuck in fence no need to check for stop */ + if (rc) + return 0; + } + + rc = hl_poll_timeout( + hdev, + glbl_sts0_reg, + status, + (status & TPC0_QM_GLBL_STS0_CP_IS_STOP_MASK), + 1000, + QMAN_STOP_TIMEOUT_USEC); + + if (rc) { + dev_err(hdev->dev, + "Timeout while waiting for QMAN to stop\n"); + return -EINVAL; + } + + return 0; +} + +/* + * goya_stop_external_queues - Stop external queues + * + * @hdev: pointer to hl_device structure + * + * Returns 0 on success + * + */ +static int goya_stop_external_queues(struct hl_device *hdev) +{ + int rc, retval = 0; + + rc = goya_stop_queue(hdev, + mmDMA_QM_0_GLBL_CFG1, + mmDMA_QM_0_CP_STS, + mmDMA_QM_0_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop DMA QMAN 0\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmDMA_QM_1_GLBL_CFG1, + mmDMA_QM_1_CP_STS, + mmDMA_QM_1_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop DMA QMAN 1\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmDMA_QM_2_GLBL_CFG1, + mmDMA_QM_2_CP_STS, + mmDMA_QM_2_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop DMA QMAN 2\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmDMA_QM_3_GLBL_CFG1, + mmDMA_QM_3_CP_STS, + mmDMA_QM_3_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop DMA QMAN 3\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmDMA_QM_4_GLBL_CFG1, + mmDMA_QM_4_CP_STS, + mmDMA_QM_4_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop DMA QMAN 4\n"); + retval = -EIO; + } + + return retval; +} + +static void goya_resume_external_queues(struct hl_device *hdev) +{ + WREG32(mmDMA_QM_0_GLBL_CFG1, 0); + WREG32(mmDMA_QM_1_GLBL_CFG1, 0); + WREG32(mmDMA_QM_2_GLBL_CFG1, 0); + WREG32(mmDMA_QM_3_GLBL_CFG1, 0); + WREG32(mmDMA_QM_4_GLBL_CFG1, 0); +} + +/* + * goya_init_cpu_queues - Initialize PQ/CQ/EQ of CPU + * + * @hdev: pointer to hl_device structure + * + * Returns 0 on success + * + */ +int goya_init_cpu_queues(struct hl_device *hdev) +{ + struct goya_device *goya = hdev->asic_specific; + dma_addr_t bus_address; + u32 status; + struct hl_hw_queue *cpu_pq = &hdev->kernel_queues[GOYA_QUEUE_ID_CPU_PQ]; + int err; + + if (!hdev->cpu_queues_enable) + return 0; + + if (goya->hw_cap_initialized & HW_CAP_CPU_Q) + return 0; + + bus_address = cpu_pq->bus_address + + hdev->asic_prop.host_phys_base_address; + WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_0, lower_32_bits(bus_address)); + WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_1, upper_32_bits(bus_address)); + + bus_address = hdev->cpu_accessible_dma_address + + hdev->asic_prop.host_phys_base_address; + WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_8, lower_32_bits(bus_address)); + WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_9, upper_32_bits(bus_address)); + + WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_5, HL_QUEUE_SIZE_IN_BYTES); + WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_10, CPU_ACCESSIBLE_MEM_SIZE); + + /* Used for EQ CI */ + WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_6, 0); + + WREG32(mmCPU_IF_PF_PQ_PI, 0); + + WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_7, PQ_INIT_STATUS_READY_FOR_CP); + + WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR, + GOYA_ASYNC_EVENT_ID_PI_UPDATE); + + err = hl_poll_timeout( + hdev, + mmPSOC_GLOBAL_CONF_SCRATCHPAD_7, + status, + (status == PQ_INIT_STATUS_READY_FOR_HOST), + 1000, + GOYA_CPU_TIMEOUT_USEC); + + if (err) { + dev_err(hdev->dev, + "Failed to communicate with ARM CPU (ArmCP timeout)\n"); + return -EIO; + } + + goya->hw_cap_initialized |= HW_CAP_CPU_Q; + return 0; +} + static void goya_set_pll_refclk(struct hl_device *hdev) { WREG32(mmCPU_PLL_DIV_SEL_0, 0x0); @@ -1023,144 +1362,644 @@ static void goya_init_golden_registers(struct hl_device *hdev) goya->hw_cap_initialized |= HW_CAP_GOLDEN; } - -/* - * goya_push_fw_to_device - Push FW code to device - * - * @hdev: pointer to hl_device structure - * - * Copy fw code from firmware file to device memory. - * Returns 0 on success - * - */ -static int goya_push_fw_to_device(struct hl_device *hdev, const char *fw_name, - void __iomem *dst) +static void goya_init_mme_qman(struct hl_device *hdev) { - const struct firmware *fw; - const u64 *fw_data; - size_t fw_size, i; - int rc; + u32 mtr_base_lo, mtr_base_hi; + u32 so_base_lo, so_base_hi; + u32 gic_base_lo, gic_base_hi; + u64 qman_base_addr; - rc = request_firmware(&fw, fw_name, hdev->dev); + mtr_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0); + mtr_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0); + so_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0); + so_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0); - if (rc) { - dev_err(hdev->dev, "Failed to request %s\n", fw_name); - goto out; - } + gic_base_lo = + lower_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR); + gic_base_hi = + upper_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR); - fw_size = fw->size; - if ((fw_size % 4) != 0) { - dev_err(hdev->dev, "illegal %s firmware size %zu\n", - fw_name, fw_size); - rc = -EINVAL; - goto out; - } + qman_base_addr = hdev->asic_prop.sram_base_address + + MME_QMAN_BASE_OFFSET; - dev_dbg(hdev->dev, "%s firmware size == %zu\n", fw_name, fw_size); + WREG32(mmMME_QM_PQ_BASE_LO, lower_32_bits(qman_base_addr)); + WREG32(mmMME_QM_PQ_BASE_HI, upper_32_bits(qman_base_addr)); + WREG32(mmMME_QM_PQ_SIZE, ilog2(MME_QMAN_LENGTH)); + WREG32(mmMME_QM_PQ_PI, 0); + WREG32(mmMME_QM_PQ_CI, 0); + WREG32(mmMME_QM_CP_LDMA_SRC_BASE_LO_OFFSET, 0x10C0); + WREG32(mmMME_QM_CP_LDMA_SRC_BASE_HI_OFFSET, 0x10C4); + WREG32(mmMME_QM_CP_LDMA_TSIZE_OFFSET, 0x10C8); + WREG32(mmMME_QM_CP_LDMA_COMMIT_OFFSET, 0x10CC); - fw_data = (const u64 *) fw->data; + WREG32(mmMME_QM_CP_MSG_BASE0_ADDR_LO, mtr_base_lo); + WREG32(mmMME_QM_CP_MSG_BASE0_ADDR_HI, mtr_base_hi); + WREG32(mmMME_QM_CP_MSG_BASE1_ADDR_LO, so_base_lo); + WREG32(mmMME_QM_CP_MSG_BASE1_ADDR_HI, so_base_hi); - if ((fw->size % 8) != 0) - fw_size -= 8; + /* QMAN CQ has 8 cache lines */ + WREG32(mmMME_QM_CQ_CFG1, 0x00080008); - for (i = 0 ; i < fw_size ; i += 8, fw_data++, dst += 8) { - if (!(i & (0x80000 - 1))) { - dev_dbg(hdev->dev, - "copied so far %zu out of %zu for %s firmware", - i, fw_size, fw_name); - usleep_range(20, 100); - } + WREG32(mmMME_QM_GLBL_ERR_ADDR_LO, gic_base_lo); + WREG32(mmMME_QM_GLBL_ERR_ADDR_HI, gic_base_hi); - writeq(*fw_data, dst); - } + WREG32(mmMME_QM_GLBL_ERR_WDATA, GOYA_ASYNC_EVENT_ID_MME_QM); - if ((fw->size % 8) != 0) - writel(*(const u32 *) fw_data, dst); + WREG32(mmMME_QM_GLBL_ERR_CFG, QMAN_MME_ERR_MSG_EN); -out: - release_firmware(fw); - return rc; + WREG32(mmMME_QM_GLBL_PROT, QMAN_MME_ERR_PROT); + + WREG32(mmMME_QM_GLBL_CFG0, QMAN_MME_ENABLE); } -static int goya_pldm_init_cpu(struct hl_device *hdev) +static void goya_init_mme_cmdq(struct hl_device *hdev) { - char fw_name[200]; - void __iomem *dst; - u32 val, unit_rst_val; - int rc; + u32 mtr_base_lo, mtr_base_hi; + u32 so_base_lo, so_base_hi; + u32 gic_base_lo, gic_base_hi; + u64 qman_base_addr; - /* Must initialize SRAM scrambler before pushing u-boot to SRAM */ - goya_init_golden_registers(hdev); + mtr_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0); + mtr_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0); + so_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0); + so_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0); - /* Put ARM cores into reset */ - WREG32(mmCPU_CA53_CFG_ARM_RST_CONTROL, CPU_RESET_ASSERT); - val = RREG32(mmCPU_CA53_CFG_ARM_RST_CONTROL); + gic_base_lo = + lower_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR); + gic_base_hi = + upper_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR); - /* Reset the CA53 MACRO */ - unit_rst_val = RREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N); - WREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N, CA53_RESET); - val = RREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N); - WREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N, unit_rst_val); - val = RREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N); + qman_base_addr = hdev->asic_prop.sram_base_address + + MME_QMAN_BASE_OFFSET; - snprintf(fw_name, sizeof(fw_name), "habanalabs/goya/goya-u-boot.bin"); - dst = hdev->pcie_bar[SRAM_CFG_BAR_ID] + UBOOT_FW_OFFSET; - rc = goya_push_fw_to_device(hdev, fw_name, dst); - if (rc) - return rc; + WREG32(mmMME_CMDQ_CP_MSG_BASE0_ADDR_LO, mtr_base_lo); + WREG32(mmMME_CMDQ_CP_MSG_BASE0_ADDR_HI, mtr_base_hi); + WREG32(mmMME_CMDQ_CP_MSG_BASE1_ADDR_LO, so_base_lo); + WREG32(mmMME_CMDQ_CP_MSG_BASE1_ADDR_HI, so_base_hi); - snprintf(fw_name, sizeof(fw_name), "habanalabs/goya/goya-fit.itb"); - dst = hdev->pcie_bar[DDR_BAR_ID] + LINUX_FW_OFFSET; - rc = goya_push_fw_to_device(hdev, fw_name, dst); - if (rc) - return rc; + /* CMDQ CQ has 20 cache lines */ + WREG32(mmMME_CMDQ_CQ_CFG1, 0x00140014); - WREG32(mmPSOC_GLOBAL_CONF_UBOOT_MAGIC, KMD_MSG_FIT_RDY); - WREG32(mmPSOC_GLOBAL_CONF_WARM_REBOOT, CPU_BOOT_STATUS_NA); + WREG32(mmMME_CMDQ_GLBL_ERR_ADDR_LO, gic_base_lo); + WREG32(mmMME_CMDQ_GLBL_ERR_ADDR_HI, gic_base_hi); - WREG32(mmCPU_CA53_CFG_RST_ADDR_LSB_0, - lower_32_bits(SRAM_BASE_ADDR + UBOOT_FW_OFFSET)); - WREG32(mmCPU_CA53_CFG_RST_ADDR_MSB_0, - upper_32_bits(SRAM_BASE_ADDR + UBOOT_FW_OFFSET)); + WREG32(mmMME_CMDQ_GLBL_ERR_WDATA, GOYA_ASYNC_EVENT_ID_MME_CMDQ); - /* Release ARM core 0 from reset */ - WREG32(mmCPU_CA53_CFG_ARM_RST_CONTROL, - CPU_RESET_CORE0_DEASSERT); - val = RREG32(mmCPU_CA53_CFG_ARM_RST_CONTROL); + WREG32(mmMME_CMDQ_GLBL_ERR_CFG, CMDQ_MME_ERR_MSG_EN); - return 0; + WREG32(mmMME_CMDQ_GLBL_PROT, CMDQ_MME_ERR_PROT); + + WREG32(mmMME_CMDQ_GLBL_CFG0, CMDQ_MME_ENABLE); } -/* - * FW component passes an offset from SRAM_BASE_ADDR in SCRATCHPAD_xx. - * The version string should be located by that offset. - */ -static void goya_read_device_fw_version(struct hl_device *hdev, - enum goya_fw_component fwc) +static void goya_init_mme_qmans(struct hl_device *hdev) { - const char *name; - u32 ver_off; - char *dest; + struct goya_device *goya = hdev->asic_specific; + u32 so_base_lo, so_base_hi; - switch (fwc) { - case FW_COMP_UBOOT: - ver_off = RREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_29); - dest = hdev->asic_prop.uboot_ver; - name = "U-Boot"; - break; - case FW_COMP_PREBOOT: - ver_off = RREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_28); - dest = hdev->asic_prop.preboot_ver; - name = "Preboot"; - break; - default: - dev_warn(hdev->dev, "Undefined FW component: %d\n", fwc); + if (goya->hw_cap_initialized & HW_CAP_MME) return; - } - ver_off &= ~((u32)SRAM_BASE_ADDR); + so_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0); + so_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0); - if (ver_off < SRAM_SIZE - VERSION_MAX_LEN) { + WREG32(mmMME_SM_BASE_ADDRESS_LOW, so_base_lo); + WREG32(mmMME_SM_BASE_ADDRESS_HIGH, so_base_hi); + + goya_init_mme_qman(hdev); + goya_init_mme_cmdq(hdev); + + goya->hw_cap_initialized |= HW_CAP_MME; +} + +static void goya_init_tpc_qman(struct hl_device *hdev, u32 base_off, int tpc_id) +{ + u32 mtr_base_lo, mtr_base_hi; + u32 so_base_lo, so_base_hi; + u32 gic_base_lo, gic_base_hi; + u64 qman_base_addr; + u32 reg_off = tpc_id * (mmTPC1_QM_PQ_PI - mmTPC0_QM_PQ_PI); + + mtr_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0); + mtr_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0); + so_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0); + so_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0); + + gic_base_lo = + lower_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR); + gic_base_hi = + upper_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR); + + qman_base_addr = hdev->asic_prop.sram_base_address + base_off; + + WREG32(mmTPC0_QM_PQ_BASE_LO + reg_off, lower_32_bits(qman_base_addr)); + WREG32(mmTPC0_QM_PQ_BASE_HI + reg_off, upper_32_bits(qman_base_addr)); + WREG32(mmTPC0_QM_PQ_SIZE + reg_off, ilog2(TPC_QMAN_LENGTH)); + WREG32(mmTPC0_QM_PQ_PI + reg_off, 0); + WREG32(mmTPC0_QM_PQ_CI + reg_off, 0); + WREG32(mmTPC0_QM_CP_LDMA_SRC_BASE_LO_OFFSET + reg_off, 0x10C0); + WREG32(mmTPC0_QM_CP_LDMA_SRC_BASE_HI_OFFSET + reg_off, 0x10C4); + WREG32(mmTPC0_QM_CP_LDMA_TSIZE_OFFSET + reg_off, 0x10C8); + WREG32(mmTPC0_QM_CP_LDMA_COMMIT_OFFSET + reg_off, 0x10CC); + + WREG32(mmTPC0_QM_CP_MSG_BASE0_ADDR_LO + reg_off, mtr_base_lo); + WREG32(mmTPC0_QM_CP_MSG_BASE0_ADDR_HI + reg_off, mtr_base_hi); + WREG32(mmTPC0_QM_CP_MSG_BASE1_ADDR_LO + reg_off, so_base_lo); + WREG32(mmTPC0_QM_CP_MSG_BASE1_ADDR_HI + reg_off, so_base_hi); + + WREG32(mmTPC0_QM_CQ_CFG1 + reg_off, 0x00080008); + + WREG32(mmTPC0_QM_GLBL_ERR_ADDR_LO + reg_off, gic_base_lo); + WREG32(mmTPC0_QM_GLBL_ERR_ADDR_HI + reg_off, gic_base_hi); + + WREG32(mmTPC0_QM_GLBL_ERR_WDATA + reg_off, + GOYA_ASYNC_EVENT_ID_TPC0_QM + tpc_id); + + WREG32(mmTPC0_QM_GLBL_ERR_CFG + reg_off, QMAN_TPC_ERR_MSG_EN); + + WREG32(mmTPC0_QM_GLBL_PROT + reg_off, QMAN_TPC_ERR_PROT); + + WREG32(mmTPC0_QM_GLBL_CFG0 + reg_off, QMAN_TPC_ENABLE); +} + +static void goya_init_tpc_cmdq(struct hl_device *hdev, int tpc_id) +{ + u32 mtr_base_lo, mtr_base_hi; + u32 so_base_lo, so_base_hi; + u32 gic_base_lo, gic_base_hi; + u32 reg_off = tpc_id * (mmTPC1_CMDQ_CQ_CFG1 - mmTPC0_CMDQ_CQ_CFG1); + + mtr_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0); + mtr_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0); + so_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0); + so_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0); + + gic_base_lo = + lower_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR); + gic_base_hi = + upper_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR); + + WREG32(mmTPC0_CMDQ_CP_MSG_BASE0_ADDR_LO + reg_off, mtr_base_lo); + WREG32(mmTPC0_CMDQ_CP_MSG_BASE0_ADDR_HI + reg_off, mtr_base_hi); + WREG32(mmTPC0_CMDQ_CP_MSG_BASE1_ADDR_LO + reg_off, so_base_lo); + WREG32(mmTPC0_CMDQ_CP_MSG_BASE1_ADDR_HI + reg_off, so_base_hi); + + WREG32(mmTPC0_CMDQ_CQ_CFG1 + reg_off, 0x00140014); + + WREG32(mmTPC0_CMDQ_GLBL_ERR_ADDR_LO + reg_off, gic_base_lo); + WREG32(mmTPC0_CMDQ_GLBL_ERR_ADDR_HI + reg_off, gic_base_hi); + + WREG32(mmTPC0_CMDQ_GLBL_ERR_WDATA + reg_off, + GOYA_ASYNC_EVENT_ID_TPC0_CMDQ + tpc_id); + + WREG32(mmTPC0_CMDQ_GLBL_ERR_CFG + reg_off, CMDQ_TPC_ERR_MSG_EN); + + WREG32(mmTPC0_CMDQ_GLBL_PROT + reg_off, CMDQ_TPC_ERR_PROT); + + WREG32(mmTPC0_CMDQ_GLBL_CFG0 + reg_off, CMDQ_TPC_ENABLE); +} + +static void goya_init_tpc_qmans(struct hl_device *hdev) +{ + struct goya_device *goya = hdev->asic_specific; + u32 so_base_lo, so_base_hi; + u32 cfg_off = mmTPC1_CFG_SM_BASE_ADDRESS_LOW - + mmTPC0_CFG_SM_BASE_ADDRESS_LOW; + int i; + + if (goya->hw_cap_initialized & HW_CAP_TPC) + return; + + so_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0); + so_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0); + + for (i = 0 ; i < TPC_MAX_NUM ; i++) { + WREG32(mmTPC0_CFG_SM_BASE_ADDRESS_LOW + i * cfg_off, + so_base_lo); + WREG32(mmTPC0_CFG_SM_BASE_ADDRESS_HIGH + i * cfg_off, + so_base_hi); + } + + goya_init_tpc_qman(hdev, TPC0_QMAN_BASE_OFFSET, 0); + goya_init_tpc_qman(hdev, TPC1_QMAN_BASE_OFFSET, 1); + goya_init_tpc_qman(hdev, TPC2_QMAN_BASE_OFFSET, 2); + goya_init_tpc_qman(hdev, TPC3_QMAN_BASE_OFFSET, 3); + goya_init_tpc_qman(hdev, TPC4_QMAN_BASE_OFFSET, 4); + goya_init_tpc_qman(hdev, TPC5_QMAN_BASE_OFFSET, 5); + goya_init_tpc_qman(hdev, TPC6_QMAN_BASE_OFFSET, 6); + goya_init_tpc_qman(hdev, TPC7_QMAN_BASE_OFFSET, 7); + + for (i = 0 ; i < TPC_MAX_NUM ; i++) + goya_init_tpc_cmdq(hdev, i); + + goya->hw_cap_initialized |= HW_CAP_TPC; +} + +/* + * goya_disable_internal_queues - Disable internal queues + * + * @hdev: pointer to hl_device structure + * + */ +static void goya_disable_internal_queues(struct hl_device *hdev) +{ + WREG32(mmMME_QM_GLBL_CFG0, 0); + WREG32(mmMME_CMDQ_GLBL_CFG0, 0); + + WREG32(mmTPC0_QM_GLBL_CFG0, 0); + WREG32(mmTPC0_CMDQ_GLBL_CFG0, 0); + + WREG32(mmTPC1_QM_GLBL_CFG0, 0); + WREG32(mmTPC1_CMDQ_GLBL_CFG0, 0); + + WREG32(mmTPC2_QM_GLBL_CFG0, 0); + WREG32(mmTPC2_CMDQ_GLBL_CFG0, 0); + + WREG32(mmTPC3_QM_GLBL_CFG0, 0); + WREG32(mmTPC3_CMDQ_GLBL_CFG0, 0); + + WREG32(mmTPC4_QM_GLBL_CFG0, 0); + WREG32(mmTPC4_CMDQ_GLBL_CFG0, 0); + + WREG32(mmTPC5_QM_GLBL_CFG0, 0); + WREG32(mmTPC5_CMDQ_GLBL_CFG0, 0); + + WREG32(mmTPC6_QM_GLBL_CFG0, 0); + WREG32(mmTPC6_CMDQ_GLBL_CFG0, 0); + + WREG32(mmTPC7_QM_GLBL_CFG0, 0); + WREG32(mmTPC7_CMDQ_GLBL_CFG0, 0); +} + +/* + * goya_stop_internal_queues - Stop internal queues + * + * @hdev: pointer to hl_device structure + * + * Returns 0 on success + * + */ +static int goya_stop_internal_queues(struct hl_device *hdev) +{ + int rc, retval = 0; + + /* + * Each queue (QMAN) is a separate H/W logic. That means that each + * QMAN can be stopped independently and failure to stop one does NOT + * mandate we should not try to stop other QMANs + */ + + rc = goya_stop_queue(hdev, + mmMME_QM_GLBL_CFG1, + mmMME_QM_CP_STS, + mmMME_QM_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop MME QMAN\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmMME_CMDQ_GLBL_CFG1, + mmMME_CMDQ_CP_STS, + mmMME_CMDQ_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop MME CMDQ\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmTPC0_QM_GLBL_CFG1, + mmTPC0_QM_CP_STS, + mmTPC0_QM_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop TPC 0 QMAN\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmTPC0_CMDQ_GLBL_CFG1, + mmTPC0_CMDQ_CP_STS, + mmTPC0_CMDQ_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop TPC 0 CMDQ\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmTPC1_QM_GLBL_CFG1, + mmTPC1_QM_CP_STS, + mmTPC1_QM_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop TPC 1 QMAN\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmTPC1_CMDQ_GLBL_CFG1, + mmTPC1_CMDQ_CP_STS, + mmTPC1_CMDQ_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop TPC 1 CMDQ\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmTPC2_QM_GLBL_CFG1, + mmTPC2_QM_CP_STS, + mmTPC2_QM_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop TPC 2 QMAN\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmTPC2_CMDQ_GLBL_CFG1, + mmTPC2_CMDQ_CP_STS, + mmTPC2_CMDQ_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop TPC 2 CMDQ\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmTPC3_QM_GLBL_CFG1, + mmTPC3_QM_CP_STS, + mmTPC3_QM_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop TPC 3 QMAN\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmTPC3_CMDQ_GLBL_CFG1, + mmTPC3_CMDQ_CP_STS, + mmTPC3_CMDQ_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop TPC 3 CMDQ\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmTPC4_QM_GLBL_CFG1, + mmTPC4_QM_CP_STS, + mmTPC4_QM_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop TPC 4 QMAN\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmTPC4_CMDQ_GLBL_CFG1, + mmTPC4_CMDQ_CP_STS, + mmTPC4_CMDQ_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop TPC 4 CMDQ\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmTPC5_QM_GLBL_CFG1, + mmTPC5_QM_CP_STS, + mmTPC5_QM_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop TPC 5 QMAN\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmTPC5_CMDQ_GLBL_CFG1, + mmTPC5_CMDQ_CP_STS, + mmTPC5_CMDQ_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop TPC 5 CMDQ\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmTPC6_QM_GLBL_CFG1, + mmTPC6_QM_CP_STS, + mmTPC6_QM_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop TPC 6 QMAN\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmTPC6_CMDQ_GLBL_CFG1, + mmTPC6_CMDQ_CP_STS, + mmTPC6_CMDQ_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop TPC 6 CMDQ\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmTPC7_QM_GLBL_CFG1, + mmTPC7_QM_CP_STS, + mmTPC7_QM_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop TPC 7 QMAN\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmTPC7_CMDQ_GLBL_CFG1, + mmTPC7_CMDQ_CP_STS, + mmTPC7_CMDQ_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop TPC 7 CMDQ\n"); + retval = -EIO; + } + + return retval; +} + +static void goya_resume_internal_queues(struct hl_device *hdev) +{ + WREG32(mmMME_QM_GLBL_CFG1, 0); + WREG32(mmMME_CMDQ_GLBL_CFG1, 0); + + WREG32(mmTPC0_QM_GLBL_CFG1, 0); + WREG32(mmTPC0_CMDQ_GLBL_CFG1, 0); + + WREG32(mmTPC1_QM_GLBL_CFG1, 0); + WREG32(mmTPC1_CMDQ_GLBL_CFG1, 0); + + WREG32(mmTPC2_QM_GLBL_CFG1, 0); + WREG32(mmTPC2_CMDQ_GLBL_CFG1, 0); + + WREG32(mmTPC3_QM_GLBL_CFG1, 0); + WREG32(mmTPC3_CMDQ_GLBL_CFG1, 0); + + WREG32(mmTPC4_QM_GLBL_CFG1, 0); + WREG32(mmTPC4_CMDQ_GLBL_CFG1, 0); + + WREG32(mmTPC5_QM_GLBL_CFG1, 0); + WREG32(mmTPC5_CMDQ_GLBL_CFG1, 0); + + WREG32(mmTPC6_QM_GLBL_CFG1, 0); + WREG32(mmTPC6_CMDQ_GLBL_CFG1, 0); + + WREG32(mmTPC7_QM_GLBL_CFG1, 0); + WREG32(mmTPC7_CMDQ_GLBL_CFG1, 0); +} + + +/* + * goya_push_fw_to_device - Push FW code to device + * + * @hdev: pointer to hl_device structure + * + * Copy fw code from firmware file to device memory. + * Returns 0 on success + * + */ +static int goya_push_fw_to_device(struct hl_device *hdev, const char *fw_name, + void __iomem *dst) +{ + const struct firmware *fw; + const u64 *fw_data; + size_t fw_size, i; + int rc; + + rc = request_firmware(&fw, fw_name, hdev->dev); + + if (rc) { + dev_err(hdev->dev, "Failed to request %s\n", fw_name); + goto out; + } + + fw_size = fw->size; + if ((fw_size % 4) != 0) { + dev_err(hdev->dev, "illegal %s firmware size %zu\n", + fw_name, fw_size); + rc = -EINVAL; + goto out; + } + + dev_dbg(hdev->dev, "%s firmware size == %zu\n", fw_name, fw_size); + + fw_data = (const u64 *) fw->data; + + if ((fw->size % 8) != 0) + fw_size -= 8; + + for (i = 0 ; i < fw_size ; i += 8, fw_data++, dst += 8) { + if (!(i & (0x80000 - 1))) { + dev_dbg(hdev->dev, + "copied so far %zu out of %zu for %s firmware", + i, fw_size, fw_name); + usleep_range(20, 100); + } + + writeq(*fw_data, dst); + } + + if ((fw->size % 8) != 0) + writel(*(const u32 *) fw_data, dst); + +out: + release_firmware(fw); + return rc; +} + +static int goya_pldm_init_cpu(struct hl_device *hdev) +{ + char fw_name[200]; + void __iomem *dst; + u32 val, unit_rst_val; + int rc; + + /* Must initialize SRAM scrambler before pushing u-boot to SRAM */ + goya_init_golden_registers(hdev); + + /* Put ARM cores into reset */ + WREG32(mmCPU_CA53_CFG_ARM_RST_CONTROL, CPU_RESET_ASSERT); + val = RREG32(mmCPU_CA53_CFG_ARM_RST_CONTROL); + + /* Reset the CA53 MACRO */ + unit_rst_val = RREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N); + WREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N, CA53_RESET); + val = RREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N); + WREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N, unit_rst_val); + val = RREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N); + + snprintf(fw_name, sizeof(fw_name), "habanalabs/goya/goya-u-boot.bin"); + dst = hdev->pcie_bar[SRAM_CFG_BAR_ID] + UBOOT_FW_OFFSET; + rc = goya_push_fw_to_device(hdev, fw_name, dst); + if (rc) + return rc; + + snprintf(fw_name, sizeof(fw_name), "habanalabs/goya/goya-fit.itb"); + dst = hdev->pcie_bar[DDR_BAR_ID] + LINUX_FW_OFFSET; + rc = goya_push_fw_to_device(hdev, fw_name, dst); + if (rc) + return rc; + + WREG32(mmPSOC_GLOBAL_CONF_UBOOT_MAGIC, KMD_MSG_FIT_RDY); + WREG32(mmPSOC_GLOBAL_CONF_WARM_REBOOT, CPU_BOOT_STATUS_NA); + + WREG32(mmCPU_CA53_CFG_RST_ADDR_LSB_0, + lower_32_bits(SRAM_BASE_ADDR + UBOOT_FW_OFFSET)); + WREG32(mmCPU_CA53_CFG_RST_ADDR_MSB_0, + upper_32_bits(SRAM_BASE_ADDR + UBOOT_FW_OFFSET)); + + /* Release ARM core 0 from reset */ + WREG32(mmCPU_CA53_CFG_ARM_RST_CONTROL, + CPU_RESET_CORE0_DEASSERT); + val = RREG32(mmCPU_CA53_CFG_ARM_RST_CONTROL); + + return 0; +} + +/* + * FW component passes an offset from SRAM_BASE_ADDR in SCRATCHPAD_xx. + * The version string should be located by that offset. + */ +static void goya_read_device_fw_version(struct hl_device *hdev, + enum goya_fw_component fwc) +{ + const char *name; + u32 ver_off; + char *dest; + + switch (fwc) { + case FW_COMP_UBOOT: + ver_off = RREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_29); + dest = hdev->asic_prop.uboot_ver; + name = "U-Boot"; + break; + case FW_COMP_PREBOOT: + ver_off = RREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_28); + dest = hdev->asic_prop.preboot_ver; + name = "Preboot"; + break; + default: + dev_warn(hdev->dev, "Undefined FW component: %d\n", fwc); + return; + } + + ver_off &= ~((u32)SRAM_BASE_ADDR); + + if (ver_off < SRAM_SIZE - VERSION_MAX_LEN) { memcpy_fromio(dest, hdev->pcie_bar[SRAM_CFG_BAR_ID] + ver_off, VERSION_MAX_LEN); } else { @@ -1344,6 +2183,19 @@ static int goya_hw_init(struct hl_device *hdev) goya_init_security(hdev); + goya_init_dma_qmans(hdev); + + goya_init_mme_qmans(hdev); + + goya_init_tpc_qmans(hdev); + + rc = goya_init_cpu_queues(hdev); + if (rc) { + dev_err(hdev->dev, "failed to initialize CPU H/W queues %d\n", + rc); + goto disable_queues; + } + /* CPU initialization is finished, we can now move to 48 bit DMA mask */ rc = pci_set_dma_mask(hdev->pdev, DMA_BIT_MASK(48)); if (rc) { @@ -1352,7 +2204,7 @@ static int goya_hw_init(struct hl_device *hdev) if (rc) { dev_err(hdev->dev, "Unable to set pci dma mask to 32 bits\n"); - return rc; + goto disable_pci_access; } } @@ -1364,7 +2216,7 @@ static int goya_hw_init(struct hl_device *hdev) if (rc) { dev_err(hdev->dev, "Unable to set pci consistent dma mask to 32 bits\n"); - return rc; + goto disable_pci_access; } } @@ -1372,6 +2224,14 @@ static int goya_hw_init(struct hl_device *hdev) val = RREG32(mmPCIE_DBI_DEVICE_ID_VENDOR_ID_REG); return 0; + +disable_pci_access: + goya_send_pci_access_msg(hdev, ARMCP_PACKET_DISABLE_PCI_ACCESS); +disable_queues: + goya_disable_internal_queues(hdev); + goya_disable_external_queues(hdev); + + return rc; } /* @@ -1460,12 +2320,40 @@ static void goya_hw_fini(struct hl_device *hdev, bool hard_reset) int goya_suspend(struct hl_device *hdev) { - return 0; + int rc; + + rc = goya_stop_internal_queues(hdev); + + if (rc) { + dev_err(hdev->dev, "failed to stop internal queues\n"); + return rc; + } + + rc = goya_stop_external_queues(hdev); + + if (rc) { + dev_err(hdev->dev, "failed to stop external queues\n"); + return rc; + } + + rc = goya_send_pci_access_msg(hdev, ARMCP_PACKET_DISABLE_PCI_ACCESS); + if (rc) + dev_err(hdev->dev, "Failed to disable PCI access from CPU\n"); + + return rc; } int goya_resume(struct hl_device *hdev) { - return 0; + int rc; + + goya_resume_external_queues(hdev); + goya_resume_internal_queues(hdev); + + rc = goya_send_pci_access_msg(hdev, ARMCP_PACKET_ENABLE_PCI_ACCESS); + if (rc) + dev_err(hdev->dev, "Failed to enable PCI access from CPU\n"); + return rc; } int goya_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma) @@ -1489,6 +2377,101 @@ int goya_cb_mmap(struct hl_device *hdev, struct vm_area_struct *vma, return rc; } +void goya_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi) +{ + u32 db_reg_offset, db_value; + bool invalid_queue = false; + + switch (hw_queue_id) { + case GOYA_QUEUE_ID_DMA_0: + db_reg_offset = mmDMA_QM_0_PQ_PI; + break; + + case GOYA_QUEUE_ID_DMA_1: + db_reg_offset = mmDMA_QM_1_PQ_PI; + break; + + case GOYA_QUEUE_ID_DMA_2: + db_reg_offset = mmDMA_QM_2_PQ_PI; + break; + + case GOYA_QUEUE_ID_DMA_3: + db_reg_offset = mmDMA_QM_3_PQ_PI; + break; + + case GOYA_QUEUE_ID_DMA_4: + db_reg_offset = mmDMA_QM_4_PQ_PI; + break; + + case GOYA_QUEUE_ID_CPU_PQ: + if (hdev->cpu_queues_enable) + db_reg_offset = mmCPU_IF_PF_PQ_PI; + else + invalid_queue = true; + break; + + case GOYA_QUEUE_ID_MME: + db_reg_offset = mmMME_QM_PQ_PI; + break; + + case GOYA_QUEUE_ID_TPC0: + db_reg_offset = mmTPC0_QM_PQ_PI; + break; + + case GOYA_QUEUE_ID_TPC1: + db_reg_offset = mmTPC1_QM_PQ_PI; + break; + + case GOYA_QUEUE_ID_TPC2: + db_reg_offset = mmTPC2_QM_PQ_PI; + break; + + case GOYA_QUEUE_ID_TPC3: + db_reg_offset = mmTPC3_QM_PQ_PI; + break; + + case GOYA_QUEUE_ID_TPC4: + db_reg_offset = mmTPC4_QM_PQ_PI; + break; + + case GOYA_QUEUE_ID_TPC5: + db_reg_offset = mmTPC5_QM_PQ_PI; + break; + + case GOYA_QUEUE_ID_TPC6: + db_reg_offset = mmTPC6_QM_PQ_PI; + break; + + case GOYA_QUEUE_ID_TPC7: + db_reg_offset = mmTPC7_QM_PQ_PI; + break; + + default: + invalid_queue = true; + } + + if (invalid_queue) { + /* Should never get here */ + dev_err(hdev->dev, "h/w queue %d is invalid. Can't set pi\n", + hw_queue_id); + return; + } + + db_value = pi; + + /* ring the doorbell */ + WREG32(db_reg_offset, db_value); + + if (hw_queue_id == GOYA_QUEUE_ID_CPU_PQ) + WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR, + GOYA_ASYNC_EVENT_ID_PI_UPDATE); +} + +void goya_flush_pq_write(struct hl_device *hdev, u64 *pq, u64 exp_val) +{ + /* Not needed in Goya */ +} + void *goya_dma_alloc_coherent(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle, gfp_t flags) { @@ -1501,6 +2484,315 @@ void goya_dma_free_coherent(struct hl_device *hdev, size_t size, void *cpu_addr, dma_free_coherent(&hdev->pdev->dev, size, cpu_addr, dma_handle); } +void *goya_get_int_queue_base(struct hl_device *hdev, u32 queue_id, + dma_addr_t *dma_handle, u16 *queue_len) +{ + void *base; + u32 offset; + + *dma_handle = hdev->asic_prop.sram_base_address; + + base = hdev->pcie_bar[SRAM_CFG_BAR_ID]; + + switch (queue_id) { + case GOYA_QUEUE_ID_MME: + offset = MME_QMAN_BASE_OFFSET; + *queue_len = MME_QMAN_LENGTH; + break; + case GOYA_QUEUE_ID_TPC0: + offset = TPC0_QMAN_BASE_OFFSET; + *queue_len = TPC_QMAN_LENGTH; + break; + case GOYA_QUEUE_ID_TPC1: + offset = TPC1_QMAN_BASE_OFFSET; + *queue_len = TPC_QMAN_LENGTH; + break; + case GOYA_QUEUE_ID_TPC2: + offset = TPC2_QMAN_BASE_OFFSET; + *queue_len = TPC_QMAN_LENGTH; + break; + case GOYA_QUEUE_ID_TPC3: + offset = TPC3_QMAN_BASE_OFFSET; + *queue_len = TPC_QMAN_LENGTH; + break; + case GOYA_QUEUE_ID_TPC4: + offset = TPC4_QMAN_BASE_OFFSET; + *queue_len = TPC_QMAN_LENGTH; + break; + case GOYA_QUEUE_ID_TPC5: + offset = TPC5_QMAN_BASE_OFFSET; + *queue_len = TPC_QMAN_LENGTH; + break; + case GOYA_QUEUE_ID_TPC6: + offset = TPC6_QMAN_BASE_OFFSET; + *queue_len = TPC_QMAN_LENGTH; + break; + case GOYA_QUEUE_ID_TPC7: + offset = TPC7_QMAN_BASE_OFFSET; + *queue_len = TPC_QMAN_LENGTH; + break; + default: + dev_err(hdev->dev, "Got invalid queue id %d\n", queue_id); + return NULL; + } + + base += offset; + *dma_handle += offset; + + return base; +} + +int goya_send_cpu_message(struct hl_device *hdev, u32 *msg, u16 len, + u32 timeout, long *result) +{ + struct goya_device *goya = hdev->asic_specific; + struct armcp_packet *pkt; + dma_addr_t pkt_dma_addr; + u32 tmp; + int rc = 0; + + if (!(goya->hw_cap_initialized & HW_CAP_CPU_Q)) { + if (result) + *result = 0; + return 0; + } + + if (len > CPU_CB_SIZE) { + dev_err(hdev->dev, "Invalid CPU message size of %d bytes\n", + len); + return -ENOMEM; + } + + pkt = hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, len, + &pkt_dma_addr); + if (!pkt) { + dev_err(hdev->dev, + "Failed to allocate DMA memory for packet to CPU\n"); + return -ENOMEM; + } + + memcpy(pkt, msg, len); + + mutex_lock(&hdev->send_cpu_message_lock); + + if (hdev->disabled) + goto out; + + rc = hl_hw_queue_send_cb_no_cmpl(hdev, GOYA_QUEUE_ID_CPU_PQ, len, + pkt_dma_addr); + if (rc) { + dev_err(hdev->dev, "Failed to send CB on CPU PQ (%d)\n", rc); + goto out; + } + + rc = hl_poll_timeout_memory(hdev, (u64) (uintptr_t) &pkt->fence, + timeout, &tmp); + + hl_hw_queue_inc_ci_kernel(hdev, GOYA_QUEUE_ID_CPU_PQ); + + if (rc == -ETIMEDOUT) { + dev_err(hdev->dev, + "Timeout while waiting for CPU packet fence\n"); + goto out; + } + + if (tmp == ARMCP_PACKET_FENCE_VAL) { + rc = (pkt->ctl & ARMCP_PKT_CTL_RC_MASK) >> + ARMCP_PKT_CTL_RC_SHIFT; + if (rc) { + dev_err(hdev->dev, + "F/W ERROR %d for CPU packet %d\n", + rc, (pkt->ctl & ARMCP_PKT_CTL_OPCODE_MASK) + >> ARMCP_PKT_CTL_OPCODE_SHIFT); + rc = -EINVAL; + } else if (result) { + *result = pkt->result; + } + } else { + dev_err(hdev->dev, "CPU packet wrong fence value\n"); + rc = -EINVAL; + } + +out: + mutex_unlock(&hdev->send_cpu_message_lock); + + hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev, len, pkt); + + return rc; +} + +int goya_test_queue(struct hl_device *hdev, u32 hw_queue_id) +{ + struct packet_msg_prot *fence_pkt; + dma_addr_t pkt_dma_addr; + u32 fence_val, tmp; + dma_addr_t fence_dma_addr; + u32 *fence_ptr; + int rc; + + fence_val = GOYA_QMAN0_FENCE_VAL; + + fence_ptr = hdev->asic_funcs->dma_pool_zalloc(hdev, 4, GFP_KERNEL, + &fence_dma_addr); + if (!fence_ptr) { + dev_err(hdev->dev, + "Failed to allocate memory for queue testing\n"); + return -ENOMEM; + } + + *fence_ptr = 0; + + fence_pkt = hdev->asic_funcs->dma_pool_zalloc(hdev, + sizeof(struct packet_msg_prot), + GFP_KERNEL, &pkt_dma_addr); + if (!fence_pkt) { + dev_err(hdev->dev, + "Failed to allocate packet for queue testing\n"); + rc = -ENOMEM; + goto free_fence_ptr; + } + + fence_pkt->ctl = (PACKET_MSG_PROT << GOYA_PKT_CTL_OPCODE_SHIFT) | + (1 << GOYA_PKT_CTL_EB_SHIFT) | + (1 << GOYA_PKT_CTL_MB_SHIFT); + fence_pkt->value = fence_val; + fence_pkt->addr = fence_dma_addr + + hdev->asic_prop.host_phys_base_address; + + rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, + sizeof(struct packet_msg_prot), + pkt_dma_addr); + if (rc) { + dev_err(hdev->dev, + "Failed to send fence packet\n"); + goto free_pkt; + } + + rc = hl_poll_timeout_memory(hdev, (u64) (uintptr_t) fence_ptr, + GOYA_TEST_QUEUE_WAIT_USEC, &tmp); + + hl_hw_queue_inc_ci_kernel(hdev, hw_queue_id); + + if ((!rc) && (tmp == fence_val)) { + dev_info(hdev->dev, + "queue test on H/W queue %d succeeded\n", + hw_queue_id); + } else { + dev_err(hdev->dev, + "H/W queue %d test failed (scratch(0x%08llX) == 0x%08X)\n", + hw_queue_id, (unsigned long long) fence_dma_addr, tmp); + rc = -EINVAL; + } + +free_pkt: + hdev->asic_funcs->dma_pool_free(hdev, (void *) fence_pkt, + pkt_dma_addr); +free_fence_ptr: + hdev->asic_funcs->dma_pool_free(hdev, (void *) fence_ptr, + fence_dma_addr); + return rc; +} + +int goya_test_cpu_queue(struct hl_device *hdev) +{ + struct armcp_packet test_pkt; + long result; + int rc; + + /* cpu_queues_enable flag is always checked in send cpu message */ + + memset(&test_pkt, 0, sizeof(test_pkt)); + + test_pkt.ctl = ARMCP_PACKET_TEST << ARMCP_PKT_CTL_OPCODE_SHIFT; + test_pkt.value = ARMCP_PACKET_FENCE_VAL; + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &test_pkt, + sizeof(test_pkt), HL_DEVICE_TIMEOUT_USEC, &result); + + if (!rc) + dev_info(hdev->dev, "queue test on CPU queue succeeded\n"); + else + dev_err(hdev->dev, "CPU queue test failed (0x%08lX)\n", result); + + return rc; +} + +static int goya_test_queues(struct hl_device *hdev) +{ + struct goya_device *goya = hdev->asic_specific; + int i, rc, ret_val = 0; + + for (i = 0 ; i < NUMBER_OF_EXT_HW_QUEUES ; i++) { + rc = goya_test_queue(hdev, i); + if (rc) + ret_val = -EINVAL; + } + + if (hdev->cpu_queues_enable) { + rc = goya->test_cpu_queue(hdev); + if (rc) + ret_val = -EINVAL; + } + + return ret_val; +} + +void *goya_dma_pool_zalloc(struct hl_device *hdev, size_t size, gfp_t mem_flags, + dma_addr_t *dma_handle) +{ + if (size > GOYA_DMA_POOL_BLK_SIZE) + return NULL; + + return dma_pool_zalloc(hdev->dma_pool, mem_flags, dma_handle); +} + +void goya_dma_pool_free(struct hl_device *hdev, void *vaddr, + dma_addr_t dma_addr) +{ + dma_pool_free(hdev->dma_pool, vaddr, dma_addr); +} + +void *goya_cpu_accessible_dma_pool_alloc(struct hl_device *hdev, size_t size, + dma_addr_t *dma_handle) +{ + u64 kernel_addr; + + /* roundup to CPU_PKT_SIZE */ + size = (size + (CPU_PKT_SIZE - 1)) & CPU_PKT_MASK; + + kernel_addr = gen_pool_alloc(hdev->cpu_accessible_dma_pool, size); + + *dma_handle = hdev->cpu_accessible_dma_address + + (kernel_addr - (u64) (uintptr_t) hdev->cpu_accessible_dma_mem); + + return (void *) (uintptr_t) kernel_addr; +} + +void goya_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size, + void *vaddr) +{ + /* roundup to CPU_PKT_SIZE */ + size = (size + (CPU_PKT_SIZE - 1)) & CPU_PKT_MASK; + + gen_pool_free(hdev->cpu_accessible_dma_pool, (u64) (uintptr_t) vaddr, + size); +} + + +static void goya_hw_queues_lock(struct hl_device *hdev) +{ + struct goya_device *goya = hdev->asic_specific; + + spin_lock(&goya->hw_queues_lock); +} + +static void goya_hw_queues_unlock(struct hl_device *hdev) +{ + struct goya_device *goya = hdev->asic_specific; + + spin_unlock(&goya->hw_queues_lock); +} + static const struct hl_asic_funcs goya_funcs = { .early_init = goya_early_init, .early_fini = goya_early_fini, @@ -1512,8 +2804,19 @@ static const struct hl_asic_funcs goya_funcs = { .resume = goya_resume, .mmap = goya_mmap, .cb_mmap = goya_cb_mmap, + .ring_doorbell = goya_ring_doorbell, + .flush_pq_write = goya_flush_pq_write, .dma_alloc_coherent = goya_dma_alloc_coherent, .dma_free_coherent = goya_dma_free_coherent, + .get_int_queue_base = goya_get_int_queue_base, + .test_queues = goya_test_queues, + .dma_pool_zalloc = goya_dma_pool_zalloc, + .dma_pool_free = goya_dma_pool_free, + .cpu_accessible_dma_pool_alloc = goya_cpu_accessible_dma_pool_alloc, + .cpu_accessible_dma_pool_free = goya_cpu_accessible_dma_pool_free, + .hw_queues_lock = goya_hw_queues_lock, + .hw_queues_unlock = goya_hw_queues_unlock, + .send_cpu_message = goya_send_cpu_message }; /* diff --git a/drivers/misc/habanalabs/goya/goyaP.h b/drivers/misc/habanalabs/goya/goyaP.h index 65cbb45d7083..791605cbecfe 100644 --- a/drivers/misc/habanalabs/goya/goyaP.h +++ b/drivers/misc/habanalabs/goya/goyaP.h @@ -11,7 +11,9 @@ #include #include "habanalabs.h" #include "include/hl_boot_if.h" +#include "include/goya/goya_packets.h" #include "include/goya/goya.h" +#include "include/goya/goya_async_events.h" #include "include/goya/goya_fw_if.h" #define NUMBER_OF_CMPLT_QUEUES 5 @@ -145,12 +147,17 @@ enum goya_fw_component { }; struct goya_device { + int (*test_cpu_queue)(struct hl_device *hdev); + /* TODO: remove hw_queues_lock after moving to scheduler code */ spinlock_t hw_queues_lock; u64 ddr_bar_cur_addr; u32 hw_cap_initialized; }; +int goya_test_cpu_queue(struct hl_device *hdev); +int goya_send_cpu_message(struct hl_device *hdev, u32 *msg, u16 len, + u32 timeout, long *result); void goya_init_security(struct hl_device *hdev); #endif /* GOYAP_H_ */ diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index e099f7a9dac2..2121babbebdc 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -9,6 +9,7 @@ #define HABANALABSP_H_ #include "include/armcp_if.h" +#include "include/qman_if.h" #define pr_fmt(fmt) "habanalabs: " fmt @@ -26,9 +27,36 @@ struct hl_device; struct hl_fpriv; +/** + * enum hl_queue_type - Supported QUEUE types. + * @QUEUE_TYPE_NA: queue is not available. + * @QUEUE_TYPE_EXT: external queue which is a DMA channel that may access the + * host. + * @QUEUE_TYPE_INT: internal queue that performs DMA inside the device's + * memories and/or operates the compute engines. + * @QUEUE_TYPE_CPU: S/W queue for communication with the device's CPU. + */ +enum hl_queue_type { + QUEUE_TYPE_NA, + QUEUE_TYPE_EXT, + QUEUE_TYPE_INT, + QUEUE_TYPE_CPU +}; + +/** + * struct hw_queue_properties - queue information. + * @type: queue type. + * @kmd_only: true if only KMD is allowed to send a job to this queue, false + * otherwise. + */ +struct hw_queue_properties { + enum hl_queue_type type; + u8 kmd_only; +}; /** * struct asic_fixed_properties - ASIC specific immutable properties. + * @hw_queues_props: H/W queues properties. * @uboot_ver: F/W U-boot version. * @preboot_ver: F/W Preboot version. * @sram_base_address: SRAM physical start address. @@ -59,6 +87,7 @@ struct hl_fpriv; * @tpc_enabled_mask: which TPCs are enabled. */ struct asic_fixed_properties { + struct hw_queue_properties hw_queues_props[HL_MAX_QUEUES]; char uboot_ver[VERSION_MAX_LEN]; char preboot_ver[VERSION_MAX_LEN]; u64 sram_base_address; @@ -132,7 +161,89 @@ struct hl_cb { }; +/* + * QUEUES + */ + +struct hl_cs_job; + +/* + * Currently, there are two limitations on the maximum length of a queue: + * + * 1. The memory footprint of the queue. The current allocated space for the + * queue is PAGE_SIZE. Because each entry in the queue is HL_BD_SIZE, + * the maximum length of the queue can be PAGE_SIZE / HL_BD_SIZE, + * which currently is 4096/16 = 256 entries. + * + * To increase that, we need either to decrease the size of the + * BD (difficult), or allocate more than a single page (easier). + * + * 2. Because the size of the JOB handle field in the BD CTL / completion queue + * is 10-bit, we can have up to 1024 open jobs per hardware queue. + * Therefore, each queue can hold up to 1024 entries. + * + * HL_QUEUE_LENGTH is in units of struct hl_bd. + * HL_QUEUE_LENGTH * sizeof(struct hl_bd) should be <= HL_PAGE_SIZE + */ + +#define HL_PAGE_SIZE 4096 /* minimum page size */ +/* Must be power of 2 (HL_PAGE_SIZE / HL_BD_SIZE) */ #define HL_QUEUE_LENGTH 256 +#define HL_QUEUE_SIZE_IN_BYTES (HL_QUEUE_LENGTH * HL_BD_SIZE) + +/* + * HL_CQ_LENGTH is in units of struct hl_cq_entry. + * HL_CQ_LENGTH should be <= HL_PAGE_SIZE + */ +#define HL_CQ_LENGTH HL_QUEUE_LENGTH +#define HL_CQ_SIZE_IN_BYTES (HL_CQ_LENGTH * HL_CQ_ENTRY_SIZE) + + + +/** + * struct hl_hw_queue - describes a H/W transport queue. + * @shadow_queue: pointer to a shadow queue that holds pointers to jobs. + * @queue_type: type of queue. + * @kernel_address: holds the queue's kernel virtual address. + * @bus_address: holds the queue's DMA address. + * @pi: holds the queue's pi value. + * @ci: holds the queue's ci value, AS CALCULATED BY THE DRIVER (not real ci). + * @hw_queue_id: the id of the H/W queue. + * @int_queue_len: length of internal queue (number of entries). + * @valid: is the queue valid (we have array of 32 queues, not all of them + * exists). + */ +struct hl_hw_queue { + struct hl_cs_job **shadow_queue; + enum hl_queue_type queue_type; + u64 kernel_address; + dma_addr_t bus_address; + u32 pi; + u32 ci; + u32 hw_queue_id; + u16 int_queue_len; + u8 valid; +}; + +/** + * struct hl_cq - describes a completion queue + * @hdev: pointer to the device structure + * @kernel_address: holds the queue's kernel virtual address + * @bus_address: holds the queue's DMA address + * @hw_queue_id: the id of the matching H/W queue + * @ci: ci inside the queue + * @pi: pi inside the queue + * @free_slots_cnt: counter of free slots in queue + */ +struct hl_cq { + struct hl_device *hdev; + u64 kernel_address; + dma_addr_t bus_address; + u32 hw_queue_id; + u32 ci; + u32 pi; + atomic_t free_slots_cnt; +}; /* @@ -164,6 +275,8 @@ enum hl_asic_type { * @resume: handles IP specific H/W or SW changes for resume. * @mmap: mmap function, does nothing. * @cb_mmap: maps a CB. + * @ring_doorbell: increment PI on a given QMAN. + * @flush_pq_write: flush PQ entry write if necessary, WARN if flushing failed. * @dma_alloc_coherent: Allocate coherent DMA memory by calling * dma_alloc_coherent(). This is ASIC function because its * implementation is not trivial when the driver is loaded @@ -172,6 +285,16 @@ enum hl_asic_type { * This is ASIC function because its implementation is not * trivial when the driver is loaded in simulation mode * (not upstreamed). + * @get_int_queue_base: get the internal queue base address. + * @test_queues: run simple test on all queues for sanity check. + * @dma_pool_zalloc: small DMA allocation of coherent memory from DMA pool. + * size of allocation is HL_DMA_POOL_BLK_SIZE. + * @dma_pool_free: free small DMA allocation from pool. + * @cpu_accessible_dma_pool_alloc: allocate CPU PQ packet from DMA pool. + * @cpu_accessible_dma_pool_free: free CPU PQ packet from DMA pool. + * @hw_queues_lock: acquire H/W queues lock. + * @hw_queues_unlock: release H/W queues lock. + * @send_cpu_message: send buffer to ArmCP. */ struct hl_asic_funcs { int (*early_init)(struct hl_device *hdev); @@ -185,10 +308,27 @@ struct hl_asic_funcs { int (*mmap)(struct hl_fpriv *hpriv, struct vm_area_struct *vma); int (*cb_mmap)(struct hl_device *hdev, struct vm_area_struct *vma, u64 kaddress, phys_addr_t paddress, u32 size); + void (*ring_doorbell)(struct hl_device *hdev, u32 hw_queue_id, u32 pi); + void (*flush_pq_write)(struct hl_device *hdev, u64 *pq, u64 exp_val); void* (*dma_alloc_coherent)(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle, gfp_t flag); void (*dma_free_coherent)(struct hl_device *hdev, size_t size, void *cpu_addr, dma_addr_t dma_handle); + void* (*get_int_queue_base)(struct hl_device *hdev, u32 queue_id, + dma_addr_t *dma_handle, u16 *queue_len); + int (*test_queues)(struct hl_device *hdev); + void* (*dma_pool_zalloc)(struct hl_device *hdev, size_t size, + gfp_t mem_flags, dma_addr_t *dma_handle); + void (*dma_pool_free)(struct hl_device *hdev, void *vaddr, + dma_addr_t dma_addr); + void* (*cpu_accessible_dma_pool_alloc)(struct hl_device *hdev, + size_t size, dma_addr_t *dma_handle); + void (*cpu_accessible_dma_pool_free)(struct hl_device *hdev, + size_t size, void *vaddr); + void (*hw_queues_lock)(struct hl_device *hdev); + void (*hw_queues_unlock)(struct hl_device *hdev); + int (*send_cpu_message)(struct hl_device *hdev, u32 *msg, + u16 len, u32 timeout, long *result); }; @@ -224,6 +364,17 @@ struct hl_ctx_mgr { }; + + +/** + * struct hl_cs_job - command submission job. + * @finish_work: workqueue object to run when job is completed. + * @id: the id of this job inside a CS. + */ +struct hl_cs_job { + struct work_struct finish_work; + u32 id; +}; /* * FILE PRIVATE STRUCTURE */ @@ -298,7 +449,11 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val); * @dev: realted kernel basic device structure. * @asic_name: ASIC specific nmae. * @asic_type: ASIC specific type. + * @completion_queue: array of hl_cq. + * @cq_wq: work queue of completion queues for executing work in process context + * @eq_wq: work queue of event queue for executing work in process context. * @kernel_ctx: KMD context structure. + * @kernel_queues: array of hl_hw_queue. * @kernel_cb_mgr: command buffer manager for creating/destroying/handling CGs. * @dma_pool: DMA pool for small allocations. * @cpu_accessible_dma_mem: KMD <-> ArmCP shared memory CPU address. @@ -312,6 +467,7 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val); * only a single process at a time. In addition, we need a * lock here so we can flush user processes which are opening * the device while we are trying to hard reset it + * @send_cpu_message_lock: enforces only one message in KMD <-> ArmCP queue. * @asic_prop: ASIC specific immutable properties. * @asic_funcs: ASIC specific functions. * @asic_specific: ASIC specific information to use only from ASIC files. @@ -331,7 +487,10 @@ struct hl_device { struct device *dev; char asic_name[16]; enum hl_asic_type asic_type; + struct hl_cq *completion_queue; + struct workqueue_struct *cq_wq; struct hl_ctx *kernel_ctx; + struct hl_hw_queue *kernel_queues; struct hl_cb_mgr kernel_cb_mgr; struct dma_pool *dma_pool; void *cpu_accessible_dma_mem; @@ -341,6 +500,7 @@ struct hl_device { struct mutex asid_mutex; /* TODO: remove fd_open_cnt_lock for multiple process support */ struct mutex fd_open_cnt_lock; + struct mutex send_cpu_message_lock; struct asic_fixed_properties asic_prop; const struct hl_asic_funcs *asic_funcs; void *asic_specific; @@ -358,6 +518,7 @@ struct hl_device { /* Parameters for bring-up */ u8 cpu_enable; u8 reset_pcilink; + u8 cpu_queues_enable; u8 fw_loading; u8 pldm; }; @@ -400,7 +561,18 @@ int hl_poll_timeout_memory(struct hl_device *hdev, u64 addr, u32 timeout_us, u32 *val); int hl_poll_timeout_device_memory(struct hl_device *hdev, void __iomem *addr, u32 timeout_us, u32 *val); - +int hl_hw_queues_create(struct hl_device *hdev); +void hl_hw_queues_destroy(struct hl_device *hdev); +int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id, + u32 cb_size, u64 cb_ptr); +u32 hl_hw_queue_add_ptr(u32 ptr, u16 val); +void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id); + +#define hl_queue_inc_ptr(p) hl_hw_queue_add_ptr(p, 1) +#define hl_pi_2_offset(pi) ((pi) & (HL_QUEUE_LENGTH - 1)) + +int hl_cq_init(struct hl_device *hdev, struct hl_cq *q, u32 hw_queue_id); +void hl_cq_fini(struct hl_device *hdev, struct hl_cq *q); int hl_asid_init(struct hl_device *hdev); void hl_asid_fini(struct hl_device *hdev); unsigned long hl_asid_alloc(struct hl_device *hdev); diff --git a/drivers/misc/habanalabs/habanalabs_drv.c b/drivers/misc/habanalabs/habanalabs_drv.c index 59c2fd196659..93576249307b 100644 --- a/drivers/misc/habanalabs/habanalabs_drv.c +++ b/drivers/misc/habanalabs/habanalabs_drv.c @@ -169,6 +169,7 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev, /* Parameters for bring-up - set them to defaults */ hdev->cpu_enable = 1; hdev->reset_pcilink = 0; + hdev->cpu_queues_enable = 1; hdev->fw_loading = 1; hdev->pldm = 0; @@ -176,6 +177,10 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev, if (!hdev->cpu_enable) hdev->fw_loading = 0; + /* If we don't load FW, no need to initialize CPU queues */ + if (!hdev->fw_loading) + hdev->cpu_queues_enable = 0; + hdev->disabled = true; hdev->pdev = pdev; /* can be NULL in case of simulator device */ diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c new file mode 100644 index 000000000000..2ec43f36cdb8 --- /dev/null +++ b/drivers/misc/habanalabs/hw_queue.c @@ -0,0 +1,400 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2016-2019 HabanaLabs, Ltd. + * All Rights Reserved. + */ + +#include "habanalabs.h" + +#include + +/* + * hl_queue_add_ptr - add to pi or ci and checks if it wraps around + * + * @ptr: the current pi/ci value + * @val: the amount to add + * + * Add val to ptr. It can go until twice the queue length. + */ +inline u32 hl_hw_queue_add_ptr(u32 ptr, u16 val) +{ + ptr += val; + ptr &= ((HL_QUEUE_LENGTH << 1) - 1); + return ptr; +} + +static inline int queue_free_slots(struct hl_hw_queue *q, u32 queue_len) +{ + int delta = (q->pi - q->ci); + + if (delta >= 0) + return (queue_len - delta); + else + return (abs(delta) - queue_len); +} + +/* + * ext_queue_submit_bd - Submit a buffer descriptor to an external queue + * + * @hdev: pointer to habanalabs device structure + * @q: pointer to habanalabs queue structure + * @ctl: BD's control word + * @len: BD's length + * @ptr: BD's pointer + * + * This function assumes there is enough space on the queue to submit a new + * BD to it. It initializes the next BD and calls the device specific + * function to set the pi (and doorbell) + * + * This function must be called when the scheduler mutex is taken + * + */ +static void ext_queue_submit_bd(struct hl_device *hdev, struct hl_hw_queue *q, + u32 ctl, u32 len, u64 ptr) +{ + struct hl_bd *bd; + + bd = (struct hl_bd *) (uintptr_t) q->kernel_address; + bd += hl_pi_2_offset(q->pi); + bd->ctl = ctl; + bd->len = len; + bd->ptr = ptr + hdev->asic_prop.host_phys_base_address; + + q->pi = hl_queue_inc_ptr(q->pi); + hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi); +} + +/* + * ext_queue_sanity_checks - perform some sanity checks on external queue + * + * @hdev : pointer to hl_device structure + * @q : pointer to hl_hw_queue structure + * @num_of_entries : how many entries to check for space + * @reserve_cq_entry : whether to reserve an entry in the cq + * + * H/W queues spinlock should be taken before calling this function + * + * Perform the following: + * - Make sure we have enough space in the h/w queue + * - Make sure we have enough space in the completion queue + * - Reserve space in the completion queue (needs to be reversed if there + * is a failure down the road before the actual submission of work). Only + * do this action if reserve_cq_entry is true + * + */ +static int ext_queue_sanity_checks(struct hl_device *hdev, + struct hl_hw_queue *q, int num_of_entries, + bool reserve_cq_entry) +{ + atomic_t *free_slots = + &hdev->completion_queue[q->hw_queue_id].free_slots_cnt; + int free_slots_cnt; + + /* Check we have enough space in the queue */ + free_slots_cnt = queue_free_slots(q, HL_QUEUE_LENGTH); + + if (free_slots_cnt < num_of_entries) { + dev_dbg(hdev->dev, "Queue %d doesn't have room for %d CBs\n", + q->hw_queue_id, num_of_entries); + return -EAGAIN; + } + + if (reserve_cq_entry) { + /* + * Check we have enough space in the completion queue + * Add -1 to counter (decrement) unless counter was already 0 + * In that case, CQ is full so we can't submit a new CB because + * we won't get ack on its completion + * atomic_add_unless will return 0 if counter was already 0 + */ + if (atomic_add_negative(num_of_entries * -1, free_slots)) { + dev_dbg(hdev->dev, "No space for %d on CQ %d\n", + num_of_entries, q->hw_queue_id); + atomic_add(num_of_entries, free_slots); + return -EAGAIN; + } + } + + return 0; +} + +/* + * hl_hw_queue_send_cb_no_cmpl - send a single CB (not a JOB) without completion + * + * @hdev: pointer to hl_device structure + * @hw_queue_id: Queue's type + * @cb_size: size of CB + * @cb_ptr: pointer to CB location + * + * This function sends a single CB, that must NOT generate a completion entry + * + */ +int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id, + u32 cb_size, u64 cb_ptr) +{ + struct hl_hw_queue *q = &hdev->kernel_queues[hw_queue_id]; + int rc; + + /* + * The CPU queue is a synchronous queue with an effective depth of + * a single entry (although it is allocated with room for multiple + * entries). Therefore, there is a different lock, called + * send_cpu_message_lock, that serializes accesses to the CPU queue. + * As a result, we don't need to lock the access to the entire H/W + * queues module when submitting a JOB to the CPU queue + */ + if (q->queue_type != QUEUE_TYPE_CPU) + hdev->asic_funcs->hw_queues_lock(hdev); + + if (hdev->disabled) { + rc = -EPERM; + goto out; + } + + rc = ext_queue_sanity_checks(hdev, q, 1, false); + if (rc) + goto out; + + ext_queue_submit_bd(hdev, q, 0, cb_size, cb_ptr); + +out: + if (q->queue_type != QUEUE_TYPE_CPU) + hdev->asic_funcs->hw_queues_unlock(hdev); + + return rc; +} + +/* + * hl_hw_queue_inc_ci_kernel - increment ci for kernel's queue + * + * @hdev: pointer to hl_device structure + * @hw_queue_id: which queue to increment its ci + */ +void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id) +{ + struct hl_hw_queue *q = &hdev->kernel_queues[hw_queue_id]; + + q->ci = hl_queue_inc_ptr(q->ci); +} + +static int ext_and_cpu_hw_queue_init(struct hl_device *hdev, + struct hl_hw_queue *q) +{ + void *p; + int rc; + + p = hdev->asic_funcs->dma_alloc_coherent(hdev, + HL_QUEUE_SIZE_IN_BYTES, + &q->bus_address, GFP_KERNEL | __GFP_ZERO); + if (!p) + return -ENOMEM; + + q->kernel_address = (u64) (uintptr_t) p; + + q->shadow_queue = kmalloc_array(HL_QUEUE_LENGTH, + sizeof(*q->shadow_queue), + GFP_KERNEL); + if (!q->shadow_queue) { + dev_err(hdev->dev, + "Failed to allocate shadow queue for H/W queue %d\n", + q->hw_queue_id); + rc = -ENOMEM; + goto free_queue; + } + + /* Make sure read/write pointers are initialized to start of queue */ + q->ci = 0; + q->pi = 0; + + return 0; + +free_queue: + hdev->asic_funcs->dma_free_coherent(hdev, HL_QUEUE_SIZE_IN_BYTES, + (void *) (uintptr_t) q->kernel_address, q->bus_address); + + return rc; +} + +static int int_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q) +{ + void *p; + + p = hdev->asic_funcs->get_int_queue_base(hdev, q->hw_queue_id, + &q->bus_address, &q->int_queue_len); + if (!p) { + dev_err(hdev->dev, + "Failed to get base address for internal queue %d\n", + q->hw_queue_id); + return -EFAULT; + } + + q->kernel_address = (u64) (uintptr_t) p; + q->pi = 0; + q->ci = 0; + + return 0; +} + +static int cpu_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q) +{ + return ext_and_cpu_hw_queue_init(hdev, q); +} + +static int ext_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q) +{ + return ext_and_cpu_hw_queue_init(hdev, q); +} + +/* + * hw_queue_init - main initialization function for H/W queue object + * + * @hdev: pointer to hl_device device structure + * @q: pointer to hl_hw_queue queue structure + * @hw_queue_id: The id of the H/W queue + * + * Allocate dma-able memory for the queue and initialize fields + * Returns 0 on success + */ +static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q, + u32 hw_queue_id) +{ + int rc; + + BUILD_BUG_ON(HL_QUEUE_SIZE_IN_BYTES > HL_PAGE_SIZE); + + q->hw_queue_id = hw_queue_id; + + switch (q->queue_type) { + case QUEUE_TYPE_EXT: + rc = ext_hw_queue_init(hdev, q); + break; + + case QUEUE_TYPE_INT: + rc = int_hw_queue_init(hdev, q); + break; + + case QUEUE_TYPE_CPU: + rc = cpu_hw_queue_init(hdev, q); + break; + + case QUEUE_TYPE_NA: + q->valid = 0; + return 0; + + default: + dev_crit(hdev->dev, "wrong queue type %d during init\n", + q->queue_type); + rc = -EINVAL; + break; + } + + if (rc) + return rc; + + q->valid = 1; + + return 0; +} + +/* + * hw_queue_fini - destroy queue + * + * @hdev: pointer to hl_device device structure + * @q: pointer to hl_hw_queue queue structure + * + * Free the queue memory + */ +static void hw_queue_fini(struct hl_device *hdev, struct hl_hw_queue *q) +{ + if (!q->valid) + return; + + /* + * If we arrived here, there are no jobs waiting on this queue + * so we can safely remove it. + * This is because this function can only called when: + * 1. Either a context is deleted, which only can occur if all its + * jobs were finished + * 2. A context wasn't able to be created due to failure or timeout, + * which means there are no jobs on the queue yet + * + * The only exception are the queues of the kernel context, but + * if they are being destroyed, it means that the entire module is + * being removed. If the module is removed, it means there is no open + * user context. It also means that if a job was submitted by + * the kernel driver (e.g. context creation), the job itself was + * released by the kernel driver when a timeout occurred on its + * Completion. Thus, we don't need to release it again. + */ + + if (q->queue_type == QUEUE_TYPE_INT) + return; + + kfree(q->shadow_queue); + + hdev->asic_funcs->dma_free_coherent(hdev, HL_QUEUE_SIZE_IN_BYTES, + (void *) (uintptr_t) q->kernel_address, q->bus_address); +} + +int hl_hw_queues_create(struct hl_device *hdev) +{ + struct asic_fixed_properties *asic = &hdev->asic_prop; + struct hl_hw_queue *q; + int i, rc, q_ready_cnt; + + hdev->kernel_queues = kcalloc(HL_MAX_QUEUES, + sizeof(*hdev->kernel_queues), GFP_KERNEL); + + if (!hdev->kernel_queues) { + dev_err(hdev->dev, "Not enough memory for H/W queues\n"); + return -ENOMEM; + } + + /* Initialize the H/W queues */ + for (i = 0, q_ready_cnt = 0, q = hdev->kernel_queues; + i < HL_MAX_QUEUES ; i++, q_ready_cnt++, q++) { + + q->queue_type = asic->hw_queues_props[i].type; + rc = hw_queue_init(hdev, q, i); + if (rc) { + dev_err(hdev->dev, + "failed to initialize queue %d\n", i); + goto release_queues; + } + } + + return 0; + +release_queues: + for (i = 0, q = hdev->kernel_queues ; i < q_ready_cnt ; i++, q++) + hw_queue_fini(hdev, q); + + kfree(hdev->kernel_queues); + + return rc; +} + +void hl_hw_queues_destroy(struct hl_device *hdev) +{ + struct hl_hw_queue *q; + int i; + + for (i = 0, q = hdev->kernel_queues ; i < HL_MAX_QUEUES ; i++, q++) + hw_queue_fini(hdev, q); + + kfree(hdev->kernel_queues); +} + +void hl_hw_queue_reset(struct hl_device *hdev, bool hard_reset) +{ + struct hl_hw_queue *q; + int i; + + for (i = 0, q = hdev->kernel_queues ; i < HL_MAX_QUEUES ; i++, q++) { + if ((!q->valid) || + ((!hard_reset) && (q->queue_type == QUEUE_TYPE_CPU))) + continue; + q->pi = q->ci = 0; + } +} diff --git a/drivers/misc/habanalabs/include/armcp_if.h b/drivers/misc/habanalabs/include/armcp_if.h index 85fc2efe144b..cc37003aa6b7 100644 --- a/drivers/misc/habanalabs/include/armcp_if.h +++ b/drivers/misc/habanalabs/include/armcp_if.h @@ -10,10 +10,302 @@ #include +enum pq_init_status { + PQ_INIT_STATUS_NA = 0, + PQ_INIT_STATUS_READY_FOR_CP, + PQ_INIT_STATUS_READY_FOR_HOST +}; + +/* + * ArmCP Primary Queue Packets + * + * During normal operation, KMD needs to send various messages to ArmCP, + * usually either to SET some value into a H/W periphery or to GET the current + * value of some H/W periphery. For example, SET the frequency of MME/TPC and + * GET the value of the thermal sensor. + * + * These messages can be initiated either by the User application or by KMD + * itself, e.g. power management code. In either case, the communication from + * KMD to ArmCP will *always* be in synchronous mode, meaning that KMD will + * send a single message and poll until the message was acknowledged and the + * results are ready (if results are needed). + * + * This means that only a single message can be sent at a time and KMD must + * wait for its result before sending the next message. Having said that, + * because these are control messages which are sent in a relatively low + * frequency, this limitation seems acceptable. It's important to note that + * in case of multiple devices, messages to different devices *can* be sent + * at the same time. + * + * The message, inputs/outputs (if relevant) and fence object will be located + * on the device DDR at an address that will be determined by KMD. During + * device initialization phase, KMD will pass to ArmCP that address. Most of + * the message types will contain inputs/outputs inside the message itself. + * The common part of each message will contain the opcode of the message (its + * type) and a field representing a fence object. + * + * When KMD wishes to send a message to ArmCP, it will write the message + * contents to the device DDR, clear the fence object and then write the + * value 484 to the mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR register to issue + * the 484 interrupt-id to the ARM core. + * + * Upon receiving the 484 interrupt-id, ArmCP will read the message from the + * DDR. In case the message is a SET operation, ArmCP will first perform the + * operation and then write to the fence object on the device DDR. In case the + * message is a GET operation, ArmCP will first fill the results section on the + * device DDR and then write to the fence object. If an error occurred, ArmCP + * will fill the rc field with the right error code. + * + * In the meantime, KMD will poll on the fence object. Once KMD sees that the + * fence object is signaled, it will read the results from the device DDR + * (if relevant) and resume the code execution in KMD. + * + * To use QMAN packets, the opcode must be the QMAN opcode, shifted by 8 + * so the value being put by the KMD matches the value read by ArmCP + * + * Non-QMAN packets should be limited to values 1 through (2^8 - 1) + * + * Detailed description: + * + * ARMCP_PACKET_DISABLE_PCI_ACCESS - + * After receiving this packet the embedded CPU must NOT issue PCI + * transactions (read/write) towards the Host CPU. This also include + * sending MSI-X interrupts. + * This packet is usually sent before the device is moved to D3Hot state. + * + * ARMCP_PACKET_ENABLE_PCI_ACCESS - + * After receiving this packet the embedded CPU is allowed to issue PCI + * transactions towards the Host CPU, including sending MSI-X interrupts. + * This packet is usually send after the device is moved to D0 state. + * + * ARMCP_PACKET_TEMPERATURE_GET - + * Fetch the current temperature / Max / Max Hyst / Critical / + * Critical Hyst of a specified thermal sensor. The packet's + * arguments specify the desired sensor and the field to get. + * + * ARMCP_PACKET_VOLTAGE_GET - + * Fetch the voltage / Max / Min of a specified sensor. The packet's + * arguments specify the sensor and type. + * + * ARMCP_PACKET_CURRENT_GET - + * Fetch the current / Max / Min of a specified sensor. The packet's + * arguments specify the sensor and type. + * + * ARMCP_PACKET_FAN_SPEED_GET - + * Fetch the speed / Max / Min of a specified fan. The packet's + * arguments specify the sensor and type. + * + * ARMCP_PACKET_PWM_GET - + * Fetch the pwm value / mode of a specified pwm. The packet's + * arguments specify the sensor and type. + * + * ARMCP_PACKET_PWM_SET - + * Set the pwm value / mode of a specified pwm. The packet's + * arguments specify the sensor, type and value. + * + * ARMCP_PACKET_FREQUENCY_SET - + * Set the frequency of a specified PLL. The packet's arguments specify + * the PLL and the desired frequency. The actual frequency in the device + * might differ from the requested frequency. + * + * ARMCP_PACKET_FREQUENCY_GET - + * Fetch the frequency of a specified PLL. The packet's arguments specify + * the PLL. + * + * ARMCP_PACKET_LED_SET - + * Set the state of a specified led. The packet's arguments + * specify the led and the desired state. + * + * ARMCP_PACKET_I2C_WR - + * Write 32-bit value to I2C device. The packet's arguments specify the + * I2C bus, address and value. + * + * ARMCP_PACKET_I2C_RD - + * Read 32-bit value from I2C device. The packet's arguments specify the + * I2C bus and address. + * + * ARMCP_PACKET_INFO_GET - + * Fetch information from the device as specified in the packet's + * structure. KMD passes the max size it allows the ArmCP to write to + * the structure, to prevent data corruption in case of mismatched + * KMD/FW versions. + * + * ARMCP_PACKET_FLASH_PROGRAM_REMOVED - this packet was removed + * + * ARMCP_PACKET_UNMASK_RAZWI_IRQ - + * Unmask the given IRQ. The IRQ number is specified in the value field. + * The packet is sent after receiving an interrupt and printing its + * relevant information. + * + * ARMCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY - + * Unmask the given IRQs. The IRQs numbers are specified in an array right + * after the armcp_packet structure, where its first element is the array + * length. The packet is sent after a soft reset was done in order to + * handle any interrupts that were sent during the reset process. + * + * ARMCP_PACKET_TEST - + * Test packet for ArmCP connectivity. The CPU will put the fence value + * in the result field. + * + * ARMCP_PACKET_FREQUENCY_CURR_GET - + * Fetch the current frequency of a specified PLL. The packet's arguments + * specify the PLL. + * + * ARMCP_PACKET_MAX_POWER_GET - + * Fetch the maximal power of the device. + * + * ARMCP_PACKET_MAX_POWER_SET - + * Set the maximal power of the device. The packet's arguments specify + * the power. + * + * ARMCP_PACKET_EEPROM_DATA_GET - + * Get EEPROM data from the ArmCP kernel. The buffer is specified in the + * addr field. The CPU will put the returned data size in the result + * field. In addition, KMD passes the max size it allows the ArmCP to + * write to the structure, to prevent data corruption in case of + * mismatched KMD/FW versions. + * + */ + +enum armcp_packet_id { + ARMCP_PACKET_DISABLE_PCI_ACCESS = 1, /* internal */ + ARMCP_PACKET_ENABLE_PCI_ACCESS, /* internal */ + ARMCP_PACKET_TEMPERATURE_GET, /* sysfs */ + ARMCP_PACKET_VOLTAGE_GET, /* sysfs */ + ARMCP_PACKET_CURRENT_GET, /* sysfs */ + ARMCP_PACKET_FAN_SPEED_GET, /* sysfs */ + ARMCP_PACKET_PWM_GET, /* sysfs */ + ARMCP_PACKET_PWM_SET, /* sysfs */ + ARMCP_PACKET_FREQUENCY_SET, /* sysfs */ + ARMCP_PACKET_FREQUENCY_GET, /* sysfs */ + ARMCP_PACKET_LED_SET, /* debugfs */ + ARMCP_PACKET_I2C_WR, /* debugfs */ + ARMCP_PACKET_I2C_RD, /* debugfs */ + ARMCP_PACKET_INFO_GET, /* IOCTL */ + ARMCP_PACKET_FLASH_PROGRAM_REMOVED, + ARMCP_PACKET_UNMASK_RAZWI_IRQ, /* internal */ + ARMCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY, /* internal */ + ARMCP_PACKET_TEST, /* internal */ + ARMCP_PACKET_FREQUENCY_CURR_GET, /* sysfs */ + ARMCP_PACKET_MAX_POWER_GET, /* sysfs */ + ARMCP_PACKET_MAX_POWER_SET, /* sysfs */ + ARMCP_PACKET_EEPROM_DATA_GET, /* sysfs */ +}; + +#define ARMCP_PACKET_FENCE_VAL 0xFE8CE7A5 + +#define ARMCP_PKT_CTL_RC_SHIFT 12 +#define ARMCP_PKT_CTL_RC_MASK 0x0000F000 + +#define ARMCP_PKT_CTL_OPCODE_SHIFT 16 +#define ARMCP_PKT_CTL_OPCODE_MASK 0x1FFF0000 + +struct armcp_packet { + union { + __le64 value; /* For SET packets */ + __le64 result; /* For GET packets */ + __le64 addr; /* For PQ */ + }; + + __le32 ctl; + + __le32 fence; /* Signal to KMD that message is completed */ + + union { + struct {/* For temperature/current/voltage/fan/pwm get/set */ + __le16 sensor_index; + __le16 type; + }; + + struct { /* For I2C read/write */ + __u8 i2c_bus; + __u8 i2c_addr; + __u8 i2c_reg; + __u8 pad; /* unused */ + }; + + /* For frequency get/set */ + __le32 pll_index; + + /* For led set */ + __le32 led_index; + + /* For get Armcp info/EEPROM data */ + __le32 data_max_size; + }; +}; + +struct armcp_unmask_irq_arr_packet { + struct armcp_packet armcp_pkt; + __le32 length; + __le32 irqs[0]; +}; + +enum armcp_packet_rc { + armcp_packet_success, + armcp_packet_invalid, + armcp_packet_fault +}; + +enum armcp_temp_type { + armcp_temp_input, + armcp_temp_max = 6, + armcp_temp_max_hyst, + armcp_temp_crit, + armcp_temp_crit_hyst +}; + +enum armcp_in_attributes { + armcp_in_input, + armcp_in_min, + armcp_in_max +}; + +enum armcp_curr_attributes { + armcp_curr_input, + armcp_curr_min, + armcp_curr_max +}; + +enum armcp_fan_attributes { + armcp_fan_input, + armcp_fan_min = 2, + armcp_fan_max +}; + +enum armcp_pwm_attributes { + armcp_pwm_input, + armcp_pwm_enable +}; + +/* Event Queue Packets */ + +struct eq_generic_event { + __le64 data[7]; +}; + /* * ArmCP info */ #define VERSION_MAX_LEN 128 +#define ARMCP_MAX_SENSORS 128 + +struct armcp_sensor { + __le32 type; + __le32 flags; +}; + +struct armcp_info { + struct armcp_sensor sensors[ARMCP_MAX_SENSORS]; + __u8 kernel_version[VERSION_MAX_LEN]; + __le32 reserved[3]; + __le32 cpld_version; + __le32 infineon_version; + __u8 fuse_version[VERSION_MAX_LEN]; + __u8 thermal_version[VERSION_MAX_LEN]; + __u8 armcp_version[VERSION_MAX_LEN]; + __le64 dram_size; +}; #endif /* ARMCP_IF_H */ diff --git a/drivers/misc/habanalabs/include/goya/goya_async_events.h b/drivers/misc/habanalabs/include/goya/goya_async_events.h new file mode 100644 index 000000000000..497937a17ee9 --- /dev/null +++ b/drivers/misc/habanalabs/include/goya/goya_async_events.h @@ -0,0 +1,186 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright 2018 HabanaLabs, Ltd. + * All Rights Reserved. + * + */ + +#ifndef __GOYA_ASYNC_EVENTS_H_ +#define __GOYA_ASYNC_EVENTS_H_ + +enum goya_async_event_id { + GOYA_ASYNC_EVENT_ID_PCIE_IF = 33, + GOYA_ASYNC_EVENT_ID_TPC0_ECC = 36, + GOYA_ASYNC_EVENT_ID_TPC1_ECC = 39, + GOYA_ASYNC_EVENT_ID_TPC2_ECC = 42, + GOYA_ASYNC_EVENT_ID_TPC3_ECC = 45, + GOYA_ASYNC_EVENT_ID_TPC4_ECC = 48, + GOYA_ASYNC_EVENT_ID_TPC5_ECC = 51, + GOYA_ASYNC_EVENT_ID_TPC6_ECC = 54, + GOYA_ASYNC_EVENT_ID_TPC7_ECC = 57, + GOYA_ASYNC_EVENT_ID_MME_ECC = 60, + GOYA_ASYNC_EVENT_ID_MME_ECC_EXT = 61, + GOYA_ASYNC_EVENT_ID_MMU_ECC = 63, + GOYA_ASYNC_EVENT_ID_DMA_MACRO = 64, + GOYA_ASYNC_EVENT_ID_DMA_ECC = 66, + GOYA_ASYNC_EVENT_ID_CPU_IF_ECC = 75, + GOYA_ASYNC_EVENT_ID_PSOC_MEM = 78, + GOYA_ASYNC_EVENT_ID_PSOC_CORESIGHT = 79, + GOYA_ASYNC_EVENT_ID_SRAM0 = 81, + GOYA_ASYNC_EVENT_ID_SRAM1 = 82, + GOYA_ASYNC_EVENT_ID_SRAM2 = 83, + GOYA_ASYNC_EVENT_ID_SRAM3 = 84, + GOYA_ASYNC_EVENT_ID_SRAM4 = 85, + GOYA_ASYNC_EVENT_ID_SRAM5 = 86, + GOYA_ASYNC_EVENT_ID_SRAM6 = 87, + GOYA_ASYNC_EVENT_ID_SRAM7 = 88, + GOYA_ASYNC_EVENT_ID_SRAM8 = 89, + GOYA_ASYNC_EVENT_ID_SRAM9 = 90, + GOYA_ASYNC_EVENT_ID_SRAM10 = 91, + GOYA_ASYNC_EVENT_ID_SRAM11 = 92, + GOYA_ASYNC_EVENT_ID_SRAM12 = 93, + GOYA_ASYNC_EVENT_ID_SRAM13 = 94, + GOYA_ASYNC_EVENT_ID_SRAM14 = 95, + GOYA_ASYNC_EVENT_ID_SRAM15 = 96, + GOYA_ASYNC_EVENT_ID_SRAM16 = 97, + GOYA_ASYNC_EVENT_ID_SRAM17 = 98, + GOYA_ASYNC_EVENT_ID_SRAM18 = 99, + GOYA_ASYNC_EVENT_ID_SRAM19 = 100, + GOYA_ASYNC_EVENT_ID_SRAM20 = 101, + GOYA_ASYNC_EVENT_ID_SRAM21 = 102, + GOYA_ASYNC_EVENT_ID_SRAM22 = 103, + GOYA_ASYNC_EVENT_ID_SRAM23 = 104, + GOYA_ASYNC_EVENT_ID_SRAM24 = 105, + GOYA_ASYNC_EVENT_ID_SRAM25 = 106, + GOYA_ASYNC_EVENT_ID_SRAM26 = 107, + GOYA_ASYNC_EVENT_ID_SRAM27 = 108, + GOYA_ASYNC_EVENT_ID_SRAM28 = 109, + GOYA_ASYNC_EVENT_ID_SRAM29 = 110, + GOYA_ASYNC_EVENT_ID_GIC500 = 112, + GOYA_ASYNC_EVENT_ID_PCIE_DEC = 115, + GOYA_ASYNC_EVENT_ID_TPC0_DEC = 117, + GOYA_ASYNC_EVENT_ID_TPC1_DEC = 120, + GOYA_ASYNC_EVENT_ID_TPC2_DEC = 123, + GOYA_ASYNC_EVENT_ID_TPC3_DEC = 126, + GOYA_ASYNC_EVENT_ID_TPC4_DEC = 129, + GOYA_ASYNC_EVENT_ID_TPC5_DEC = 132, + GOYA_ASYNC_EVENT_ID_TPC6_DEC = 135, + GOYA_ASYNC_EVENT_ID_TPC7_DEC = 138, + GOYA_ASYNC_EVENT_ID_AXI_ECC = 139, + GOYA_ASYNC_EVENT_ID_L2_RAM_ECC = 140, + GOYA_ASYNC_EVENT_ID_MME_WACS = 141, + GOYA_ASYNC_EVENT_ID_MME_WACSD = 142, + GOYA_ASYNC_EVENT_ID_PLL0 = 143, + GOYA_ASYNC_EVENT_ID_PLL1 = 144, + GOYA_ASYNC_EVENT_ID_PLL3 = 146, + GOYA_ASYNC_EVENT_ID_PLL4 = 147, + GOYA_ASYNC_EVENT_ID_PLL5 = 148, + GOYA_ASYNC_EVENT_ID_PLL6 = 149, + GOYA_ASYNC_EVENT_ID_CPU_AXI_SPLITTER = 155, + GOYA_ASYNC_EVENT_ID_PSOC_AXI_DEC = 159, + GOYA_ASYNC_EVENT_ID_PSOC = 160, + GOYA_ASYNC_EVENT_ID_PCIE_FLR = 171, + GOYA_ASYNC_EVENT_ID_PCIE_HOT_RESET = 172, + GOYA_ASYNC_EVENT_ID_PCIE_QID0_ENG0 = 174, + GOYA_ASYNC_EVENT_ID_PCIE_QID0_ENG1 = 175, + GOYA_ASYNC_EVENT_ID_PCIE_QID0_ENG2 = 176, + GOYA_ASYNC_EVENT_ID_PCIE_QID0_ENG3 = 177, + GOYA_ASYNC_EVENT_ID_PCIE_QID1_ENG0 = 178, + GOYA_ASYNC_EVENT_ID_PCIE_QID1_ENG1 = 179, + GOYA_ASYNC_EVENT_ID_PCIE_QID1_ENG2 = 180, + GOYA_ASYNC_EVENT_ID_PCIE_QID1_ENG3 = 181, + GOYA_ASYNC_EVENT_ID_PCIE_APB = 182, + GOYA_ASYNC_EVENT_ID_PCIE_QDB = 183, + GOYA_ASYNC_EVENT_ID_PCIE_BM_D_P_WR = 184, + GOYA_ASYNC_EVENT_ID_PCIE_BM_D_RD = 185, + GOYA_ASYNC_EVENT_ID_PCIE_BM_U_P_WR = 186, + GOYA_ASYNC_EVENT_ID_PCIE_BM_U_RD = 187, + GOYA_ASYNC_EVENT_ID_TPC0_BMON_SPMU = 190, + GOYA_ASYNC_EVENT_ID_TPC0_KRN_ERR = 191, + GOYA_ASYNC_EVENT_ID_TPC1_BMON_SPMU = 200, + GOYA_ASYNC_EVENT_ID_TPC1_KRN_ERR = 201, + GOYA_ASYNC_EVENT_ID_TPC2_BMON_SPMU = 210, + GOYA_ASYNC_EVENT_ID_TPC2_KRN_ERR = 211, + GOYA_ASYNC_EVENT_ID_TPC3_BMON_SPMU = 220, + GOYA_ASYNC_EVENT_ID_TPC3_KRN_ERR = 221, + GOYA_ASYNC_EVENT_ID_TPC4_BMON_SPMU = 230, + GOYA_ASYNC_EVENT_ID_TPC4_KRN_ERR = 231, + GOYA_ASYNC_EVENT_ID_TPC5_BMON_SPMU = 240, + GOYA_ASYNC_EVENT_ID_TPC5_KRN_ERR = 241, + GOYA_ASYNC_EVENT_ID_TPC6_BMON_SPMU = 250, + GOYA_ASYNC_EVENT_ID_TPC6_KRN_ERR = 251, + GOYA_ASYNC_EVENT_ID_TPC7_BMON_SPMU = 260, + GOYA_ASYNC_EVENT_ID_TPC7_KRN_ERR = 261, + GOYA_ASYNC_EVENT_ID_MMU_SBA_SPMU0 = 270, + GOYA_ASYNC_EVENT_ID_MMU_SBA_SPMU1 = 271, + GOYA_ASYNC_EVENT_ID_MME_WACS_UP = 272, + GOYA_ASYNC_EVENT_ID_MME_WACS_DOWN = 273, + GOYA_ASYNC_EVENT_ID_MMU_PAGE_FAULT = 280, + GOYA_ASYNC_EVENT_ID_MMU_WR_PERM = 281, + GOYA_ASYNC_EVENT_ID_MMU_DBG_BM = 282, + GOYA_ASYNC_EVENT_ID_DMA_BM_CH0 = 290, + GOYA_ASYNC_EVENT_ID_DMA_BM_CH1 = 291, + GOYA_ASYNC_EVENT_ID_DMA_BM_CH2 = 292, + GOYA_ASYNC_EVENT_ID_DMA_BM_CH3 = 293, + GOYA_ASYNC_EVENT_ID_DMA_BM_CH4 = 294, + GOYA_ASYNC_EVENT_ID_DDR0_PHY_DFI = 300, + GOYA_ASYNC_EVENT_ID_DDR0_ECC_SCRUB = 301, + GOYA_ASYNC_EVENT_ID_DDR0_DB_ECC = 302, + GOYA_ASYNC_EVENT_ID_DDR0_SB_ECC = 303, + GOYA_ASYNC_EVENT_ID_DDR0_SB_ECC_MC = 304, + GOYA_ASYNC_EVENT_ID_DDR0_AXI_RD = 305, + GOYA_ASYNC_EVENT_ID_DDR0_AXI_WR = 306, + GOYA_ASYNC_EVENT_ID_DDR1_PHY_DFI = 310, + GOYA_ASYNC_EVENT_ID_DDR1_ECC_SCRUB = 311, + GOYA_ASYNC_EVENT_ID_DDR1_DB_ECC = 312, + GOYA_ASYNC_EVENT_ID_DDR1_SB_ECC = 313, + GOYA_ASYNC_EVENT_ID_DDR1_SB_ECC_MC = 314, + GOYA_ASYNC_EVENT_ID_DDR1_AXI_RD = 315, + GOYA_ASYNC_EVENT_ID_DDR1_AXI_WR = 316, + GOYA_ASYNC_EVENT_ID_CPU_BMON = 320, + GOYA_ASYNC_EVENT_ID_TS_EAST = 322, + GOYA_ASYNC_EVENT_ID_TS_WEST = 323, + GOYA_ASYNC_EVENT_ID_TS_NORTH = 324, + GOYA_ASYNC_EVENT_ID_PSOC_GPIO_U16_0 = 330, + GOYA_ASYNC_EVENT_ID_PSOC_GPIO_U16_1 = 331, + GOYA_ASYNC_EVENT_ID_PSOC_GPIO_U16_2 = 332, + GOYA_ASYNC_EVENT_ID_PSOC_GPIO_05_SW_RESET = 356, + GOYA_ASYNC_EVENT_ID_PSOC_GPIO_10_VRHOT_ICRIT = 361, + GOYA_ASYNC_EVENT_ID_TPC0_CMDQ = 430, + GOYA_ASYNC_EVENT_ID_TPC1_CMDQ = 431, + GOYA_ASYNC_EVENT_ID_TPC2_CMDQ = 432, + GOYA_ASYNC_EVENT_ID_TPC3_CMDQ = 433, + GOYA_ASYNC_EVENT_ID_TPC4_CMDQ = 434, + GOYA_ASYNC_EVENT_ID_TPC5_CMDQ = 435, + GOYA_ASYNC_EVENT_ID_TPC6_CMDQ = 436, + GOYA_ASYNC_EVENT_ID_TPC7_CMDQ = 437, + GOYA_ASYNC_EVENT_ID_TPC0_QM = 438, + GOYA_ASYNC_EVENT_ID_TPC1_QM = 439, + GOYA_ASYNC_EVENT_ID_TPC2_QM = 440, + GOYA_ASYNC_EVENT_ID_TPC3_QM = 441, + GOYA_ASYNC_EVENT_ID_TPC4_QM = 442, + GOYA_ASYNC_EVENT_ID_TPC5_QM = 443, + GOYA_ASYNC_EVENT_ID_TPC6_QM = 444, + GOYA_ASYNC_EVENT_ID_TPC7_QM = 445, + GOYA_ASYNC_EVENT_ID_MME_QM = 447, + GOYA_ASYNC_EVENT_ID_MME_CMDQ = 448, + GOYA_ASYNC_EVENT_ID_DMA0_QM = 449, + GOYA_ASYNC_EVENT_ID_DMA1_QM = 450, + GOYA_ASYNC_EVENT_ID_DMA2_QM = 451, + GOYA_ASYNC_EVENT_ID_DMA3_QM = 452, + GOYA_ASYNC_EVENT_ID_DMA4_QM = 453, + GOYA_ASYNC_EVENT_ID_DMA_ON_HBW = 454, + GOYA_ASYNC_EVENT_ID_DMA0_CH = 455, + GOYA_ASYNC_EVENT_ID_DMA1_CH = 456, + GOYA_ASYNC_EVENT_ID_DMA2_CH = 457, + GOYA_ASYNC_EVENT_ID_DMA3_CH = 458, + GOYA_ASYNC_EVENT_ID_DMA4_CH = 459, + GOYA_ASYNC_EVENT_ID_PI_UPDATE = 484, + GOYA_ASYNC_EVENT_ID_HALT_MACHINE = 485, + GOYA_ASYNC_EVENT_ID_INTS_REGISTER = 486, + GOYA_ASYNC_EVENT_ID_SOFT_RESET = 487, + GOYA_ASYNC_EVENT_ID_LAST_VALID_ID = 1023, + GOYA_ASYNC_EVENT_ID_SIZE +}; + +#endif /* __GOYA_ASYNC_EVENTS_H_ */ diff --git a/drivers/misc/habanalabs/include/goya/goya_packets.h b/drivers/misc/habanalabs/include/goya/goya_packets.h new file mode 100644 index 000000000000..a14407b975e4 --- /dev/null +++ b/drivers/misc/habanalabs/include/goya/goya_packets.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright 2017-2018 HabanaLabs, Ltd. + * All Rights Reserved. + * + */ + +#ifndef GOYA_PACKETS_H +#define GOYA_PACKETS_H + +#include + +#define PACKET_HEADER_PACKET_ID_SHIFT 56 +#define PACKET_HEADER_PACKET_ID_MASK 0x1F00000000000000ull + +enum packet_id { + PACKET_WREG_32 = 0x1, + PACKET_WREG_BULK = 0x2, + PACKET_MSG_LONG = 0x3, + PACKET_MSG_SHORT = 0x4, + PACKET_CP_DMA = 0x5, + PACKET_MSG_PROT = 0x7, + PACKET_FENCE = 0x8, + PACKET_LIN_DMA = 0x9, + PACKET_NOP = 0xA, + PACKET_STOP = 0xB, + MAX_PACKET_ID = (PACKET_HEADER_PACKET_ID_MASK >> + PACKET_HEADER_PACKET_ID_SHIFT) + 1 +}; + +enum goya_dma_direction { + DMA_HOST_TO_DRAM, + DMA_HOST_TO_SRAM, + DMA_DRAM_TO_SRAM, + DMA_SRAM_TO_DRAM, + DMA_SRAM_TO_HOST, + DMA_DRAM_TO_HOST, + DMA_DRAM_TO_DRAM, + DMA_SRAM_TO_SRAM, + DMA_ENUM_MAX +}; + +#define GOYA_PKT_CTL_OPCODE_SHIFT 24 +#define GOYA_PKT_CTL_OPCODE_MASK 0x1F000000 + +#define GOYA_PKT_CTL_EB_SHIFT 29 +#define GOYA_PKT_CTL_EB_MASK 0x20000000 + +#define GOYA_PKT_CTL_RB_SHIFT 30 +#define GOYA_PKT_CTL_RB_MASK 0x40000000 + +#define GOYA_PKT_CTL_MB_SHIFT 31 +#define GOYA_PKT_CTL_MB_MASK 0x80000000 + +struct packet_nop { + __le32 reserved; + __le32 ctl; +}; + +struct packet_stop { + __le32 reserved; + __le32 ctl; +}; + +#define GOYA_PKT_WREG32_CTL_REG_OFFSET_SHIFT 0 +#define GOYA_PKT_WREG32_CTL_REG_OFFSET_MASK 0x0000FFFF + +struct packet_wreg32 { + __le32 value; + __le32 ctl; +}; + +struct packet_wreg_bulk { + __le32 size64; + __le32 ctl; + __le64 values[0]; /* data starts here */ +}; + +struct packet_msg_long { + __le32 value; + __le32 ctl; + __le64 addr; +}; + +struct packet_msg_short { + __le32 value; + __le32 ctl; +}; + +struct packet_msg_prot { + __le32 value; + __le32 ctl; + __le64 addr; +}; + +struct packet_fence { + __le32 cfg; + __le32 ctl; +}; + +#define GOYA_PKT_LIN_DMA_CTL_WO_SHIFT 0 +#define GOYA_PKT_LIN_DMA_CTL_WO_MASK 0x00000001 + +#define GOYA_PKT_LIN_DMA_CTL_RDCOMP_SHIFT 1 +#define GOYA_PKT_LIN_DMA_CTL_RDCOMP_MASK 0x00000002 + +#define GOYA_PKT_LIN_DMA_CTL_WRCOMP_SHIFT 2 +#define GOYA_PKT_LIN_DMA_CTL_WRCOMP_MASK 0x00000004 + +#define GOYA_PKT_LIN_DMA_CTL_MEMSET_SHIFT 6 +#define GOYA_PKT_LIN_DMA_CTL_MEMSET_MASK 0x00000040 + +#define GOYA_PKT_LIN_DMA_CTL_DMA_DIR_SHIFT 20 +#define GOYA_PKT_LIN_DMA_CTL_DMA_DIR_MASK 0x00700000 + +struct packet_lin_dma { + __le32 tsize; + __le32 ctl; + __le64 src_addr; + __le64 dst_addr; +}; + +struct packet_cp_dma { + __le32 tsize; + __le32 ctl; + __le64 src_addr; +}; + +#endif /* GOYA_PACKETS_H */ diff --git a/drivers/misc/habanalabs/include/qman_if.h b/drivers/misc/habanalabs/include/qman_if.h new file mode 100644 index 000000000000..bf59bbe27fdc --- /dev/null +++ b/drivers/misc/habanalabs/include/qman_if.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright 2016-2018 HabanaLabs, Ltd. + * All Rights Reserved. + * + */ + +#ifndef QMAN_IF_H +#define QMAN_IF_H + +#include + +/* + * PRIMARY QUEUE + */ + +struct hl_bd { + __le64 ptr; + __le32 len; + __le32 ctl; +}; + +#define HL_BD_SIZE sizeof(struct hl_bd) + +/* + * BD_CTL_REPEAT_VALID tells the CP whether the repeat field in the BD CTL is + * valid. 1 means the repeat field is valid, 0 means not-valid, + * i.e. repeat == 1 + */ +#define BD_CTL_REPEAT_VALID_SHIFT 24 +#define BD_CTL_REPEAT_VALID_MASK 0x01000000 + +#define BD_CTL_SHADOW_INDEX_SHIFT 0 +#define BD_CTL_SHADOW_INDEX_MASK 0x00000FFF + +/* + * COMPLETION QUEUE + */ + +struct hl_cq_entry { + __le32 data; +}; + +#define HL_CQ_ENTRY_SIZE sizeof(struct hl_cq_entry) + +#define CQ_ENTRY_READY_SHIFT 31 +#define CQ_ENTRY_READY_MASK 0x80000000 + +#define CQ_ENTRY_SHADOW_INDEX_VALID_SHIFT 30 +#define CQ_ENTRY_SHADOW_INDEX_VALID_MASK 0x40000000 + +#define CQ_ENTRY_SHADOW_INDEX_SHIFT BD_CTL_SHADOW_INDEX_SHIFT +#define CQ_ENTRY_SHADOW_INDEX_MASK BD_CTL_SHADOW_INDEX_MASK + + +#endif /* QMAN_IF_H */ diff --git a/drivers/misc/habanalabs/irq.c b/drivers/misc/habanalabs/irq.c new file mode 100644 index 000000000000..6b7d35f6af08 --- /dev/null +++ b/drivers/misc/habanalabs/irq.c @@ -0,0 +1,149 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2016-2019 HabanaLabs, Ltd. + * All Rights Reserved. + */ + +#include "habanalabs.h" + +#include + +/* + * hl_cq_inc_ptr - increment ci or pi of cq + * + * @ptr: the current ci or pi value of the completion queue + * + * Increment ptr by 1. If it reaches the number of completion queue + * entries, set it to 0 + */ +inline u32 hl_cq_inc_ptr(u32 ptr) +{ + ptr++; + if (unlikely(ptr == HL_CQ_LENGTH)) + ptr = 0; + return ptr; +} + +/* + * hl_irq_handler_cq - irq handler for completion queue + * + * @irq: irq number + * @arg: pointer to completion queue structure + * + */ +irqreturn_t hl_irq_handler_cq(int irq, void *arg) +{ + struct hl_cq *cq = arg; + struct hl_device *hdev = cq->hdev; + struct hl_hw_queue *queue; + struct hl_cs_job *job; + bool shadow_index_valid; + u16 shadow_index; + u32 *cq_entry; + u32 *cq_base; + + if (hdev->disabled) { + dev_dbg(hdev->dev, + "Device disabled but received IRQ %d for CQ %d\n", + irq, cq->hw_queue_id); + return IRQ_HANDLED; + } + + cq_base = (u32 *) (uintptr_t) cq->kernel_address; + + while (1) { + bool entry_ready = ((cq_base[cq->ci] & CQ_ENTRY_READY_MASK) + >> CQ_ENTRY_READY_SHIFT); + + if (!entry_ready) + break; + + cq_entry = (u32 *) &cq_base[cq->ci]; + + /* + * Make sure we read CQ entry contents after we've + * checked the ownership bit. + */ + dma_rmb(); + + shadow_index_valid = + ((*cq_entry & CQ_ENTRY_SHADOW_INDEX_VALID_MASK) + >> CQ_ENTRY_SHADOW_INDEX_VALID_SHIFT); + + shadow_index = (u16) + ((*cq_entry & CQ_ENTRY_SHADOW_INDEX_MASK) + >> CQ_ENTRY_SHADOW_INDEX_SHIFT); + + queue = &hdev->kernel_queues[cq->hw_queue_id]; + + if ((shadow_index_valid) && (!hdev->disabled)) { + job = queue->shadow_queue[hl_pi_2_offset(shadow_index)]; + queue_work(hdev->cq_wq, &job->finish_work); + } + + /* + * Update ci of the context's queue. There is no + * need to protect it with spinlock because this update is + * done only inside IRQ and there is a different IRQ per + * queue + */ + queue->ci = hl_queue_inc_ptr(queue->ci); + + /* Clear CQ entry ready bit */ + cq_base[cq->ci] &= ~CQ_ENTRY_READY_MASK; + + cq->ci = hl_cq_inc_ptr(cq->ci); + + /* Increment free slots */ + atomic_inc(&cq->free_slots_cnt); + } + + return IRQ_HANDLED; +} + +/* + * hl_cq_init - main initialization function for an cq object + * + * @hdev: pointer to device structure + * @q: pointer to cq structure + * @hw_queue_id: The H/W queue ID this completion queue belongs to + * + * Allocate dma-able memory for the completion queue and initialize fields + * Returns 0 on success + */ +int hl_cq_init(struct hl_device *hdev, struct hl_cq *q, u32 hw_queue_id) +{ + void *p; + + BUILD_BUG_ON(HL_CQ_SIZE_IN_BYTES > HL_PAGE_SIZE); + + p = hdev->asic_funcs->dma_alloc_coherent(hdev, HL_CQ_SIZE_IN_BYTES, + &q->bus_address, GFP_KERNEL | __GFP_ZERO); + if (!p) + return -ENOMEM; + + q->hdev = hdev; + q->kernel_address = (u64) (uintptr_t) p; + q->hw_queue_id = hw_queue_id; + q->ci = 0; + q->pi = 0; + + atomic_set(&q->free_slots_cnt, HL_CQ_LENGTH); + + return 0; +} + +/* + * hl_cq_fini - destroy completion queue + * + * @hdev: pointer to device structure + * @q: pointer to cq structure + * + * Free the completion queue memory + */ +void hl_cq_fini(struct hl_device *hdev, struct hl_cq *q) +{ + hdev->asic_funcs->dma_free_coherent(hdev, HL_CQ_SIZE_IN_BYTES, + (void *) (uintptr_t) q->kernel_address, q->bus_address); +} diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index a8edfd3e9c95..756266cf0416 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -17,6 +17,35 @@ */ #define GOYA_KMD_SRAM_RESERVED_SIZE_FROM_START 0x8000 /* 32KB */ +/* + * Queue Numbering + * + * The external queues (DMA channels + CPU) MUST be before the internal queues + * and each group (DMA channels + CPU and internal) must be contiguous inside + * itself but there can be a gap between the two groups (although not + * recommended) + */ + +enum goya_queue_id { + GOYA_QUEUE_ID_DMA_0 = 0, + GOYA_QUEUE_ID_DMA_1, + GOYA_QUEUE_ID_DMA_2, + GOYA_QUEUE_ID_DMA_3, + GOYA_QUEUE_ID_DMA_4, + GOYA_QUEUE_ID_CPU_PQ, + GOYA_QUEUE_ID_MME, + GOYA_QUEUE_ID_TPC0, + GOYA_QUEUE_ID_TPC1, + GOYA_QUEUE_ID_TPC2, + GOYA_QUEUE_ID_TPC3, + GOYA_QUEUE_ID_TPC4, + GOYA_QUEUE_ID_TPC5, + GOYA_QUEUE_ID_TPC6, + GOYA_QUEUE_ID_TPC7, + GOYA_QUEUE_ID_SIZE +}; + + /* Opcode to create a new command buffer */ #define HL_CB_OP_CREATE 0 /* Opcode to destroy previously created command buffer */ -- cgit v1.3-7-g2ca7 From eff6f4a0e70b7bcf4674f471a768860a74e638a6 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Sat, 16 Feb 2019 00:39:21 +0200 Subject: habanalabs: add command submission module This patch adds the main flow for the user to submit work to the device. Each work is described by a command submission object (CS). The CS contains 3 arrays of command buffers: One for execution, and two for context-switch (store and restore). For each CB, the user specifies on which queue to put that CB. In case of an internal queue, the entry doesn't contain a pointer to the CB but the address in the on-chip memory that the CB resides at. The driver parses some of the CBs to enforce security restrictions. The user receives a sequence number that represents the CS object. The user can then query the driver regarding the status of the CS, using that sequence number. In case the CS doesn't finish before the timeout expires, the driver will perform a soft-reset of the device. Reviewed-by: Mike Rapoport Signed-off-by: Oded Gabbay Signed-off-by: Greg Kroah-Hartman --- drivers/misc/habanalabs/Makefile | 3 +- drivers/misc/habanalabs/command_submission.c | 766 ++++++++++++++++++ drivers/misc/habanalabs/context.c | 52 +- drivers/misc/habanalabs/device.c | 16 + drivers/misc/habanalabs/goya/goya.c | 1126 ++++++++++++++++++++++++++ drivers/misc/habanalabs/habanalabs.h | 275 +++++++ drivers/misc/habanalabs/habanalabs_drv.c | 20 + drivers/misc/habanalabs/habanalabs_ioctl.c | 4 +- drivers/misc/habanalabs/hw_queue.c | 232 ++++++ drivers/misc/habanalabs/memory.c | 198 +++++ include/uapi/misc/habanalabs.h | 158 +++- 11 files changed, 2843 insertions(+), 7 deletions(-) create mode 100644 drivers/misc/habanalabs/command_submission.c create mode 100644 drivers/misc/habanalabs/memory.c (limited to 'include/uapi') diff --git a/drivers/misc/habanalabs/Makefile b/drivers/misc/habanalabs/Makefile index b5607233d216..d2fd0e18b1eb 100644 --- a/drivers/misc/habanalabs/Makefile +++ b/drivers/misc/habanalabs/Makefile @@ -5,7 +5,8 @@ obj-m := habanalabs.o habanalabs-y := habanalabs_drv.o device.o context.o asid.o habanalabs_ioctl.o \ - command_buffer.o hw_queue.o irq.o sysfs.o hwmon.o + command_buffer.o hw_queue.o irq.o sysfs.o hwmon.o memory.o \ + command_submission.o include $(src)/goya/Makefile habanalabs-y += $(HL_GOYA_FILES) diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c new file mode 100644 index 000000000000..ae68b97e428d --- /dev/null +++ b/drivers/misc/habanalabs/command_submission.c @@ -0,0 +1,766 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2016-2019 HabanaLabs, Ltd. + * All Rights Reserved. + */ + +#include +#include "habanalabs.h" + +#include +#include + +static void job_wq_completion(struct work_struct *work); +static long _hl_cs_wait_ioctl(struct hl_device *hdev, + struct hl_ctx *ctx, u64 timeout_us, u64 seq); +static void cs_do_release(struct kref *ref); + +static const char *hl_fence_get_driver_name(struct dma_fence *fence) +{ + return "HabanaLabs"; +} + +static const char *hl_fence_get_timeline_name(struct dma_fence *fence) +{ + struct hl_dma_fence *hl_fence = + container_of(fence, struct hl_dma_fence, base_fence); + + return dev_name(hl_fence->hdev->dev); +} + +static bool hl_fence_enable_signaling(struct dma_fence *fence) +{ + return true; +} + +static void hl_fence_release(struct dma_fence *fence) +{ + struct hl_dma_fence *hl_fence = + container_of(fence, struct hl_dma_fence, base_fence); + + kfree_rcu(hl_fence, base_fence.rcu); +} + +static const struct dma_fence_ops hl_fence_ops = { + .get_driver_name = hl_fence_get_driver_name, + .get_timeline_name = hl_fence_get_timeline_name, + .enable_signaling = hl_fence_enable_signaling, + .wait = dma_fence_default_wait, + .release = hl_fence_release +}; + +static void cs_get(struct hl_cs *cs) +{ + kref_get(&cs->refcount); +} + +static int cs_get_unless_zero(struct hl_cs *cs) +{ + return kref_get_unless_zero(&cs->refcount); +} + +static void cs_put(struct hl_cs *cs) +{ + kref_put(&cs->refcount, cs_do_release); +} + +/* + * cs_parser - parse the user command submission + * + * @hpriv : pointer to the private data of the fd + * @job : pointer to the job that holds the command submission info + * + * The function parses the command submission of the user. It calls the + * ASIC specific parser, which returns a list of memory blocks to send + * to the device as different command buffers + * + */ +static int cs_parser(struct hl_fpriv *hpriv, struct hl_cs_job *job) +{ + struct hl_device *hdev = hpriv->hdev; + struct hl_cs_parser parser; + int rc; + + parser.ctx_id = job->cs->ctx->asid; + parser.cs_sequence = job->cs->sequence; + parser.job_id = job->id; + + parser.hw_queue_id = job->hw_queue_id; + parser.job_userptr_list = &job->userptr_list; + parser.patched_cb = NULL; + parser.user_cb = job->user_cb; + parser.user_cb_size = job->user_cb_size; + parser.ext_queue = job->ext_queue; + job->patched_cb = NULL; + parser.use_virt_addr = hdev->mmu_enable; + + rc = hdev->asic_funcs->cs_parser(hdev, &parser); + if (job->ext_queue) { + if (!rc) { + job->patched_cb = parser.patched_cb; + job->job_cb_size = parser.patched_cb_size; + + spin_lock(&job->patched_cb->lock); + job->patched_cb->cs_cnt++; + spin_unlock(&job->patched_cb->lock); + } + + /* + * Whether the parsing worked or not, we don't need the + * original CB anymore because it was already parsed and + * won't be accessed again for this CS + */ + spin_lock(&job->user_cb->lock); + job->user_cb->cs_cnt--; + spin_unlock(&job->user_cb->lock); + hl_cb_put(job->user_cb); + job->user_cb = NULL; + } + + return rc; +} + +static void free_job(struct hl_device *hdev, struct hl_cs_job *job) +{ + struct hl_cs *cs = job->cs; + + if (job->ext_queue) { + hl_userptr_delete_list(hdev, &job->userptr_list); + + /* + * We might arrive here from rollback and patched CB wasn't + * created, so we need to check it's not NULL + */ + if (job->patched_cb) { + spin_lock(&job->patched_cb->lock); + job->patched_cb->cs_cnt--; + spin_unlock(&job->patched_cb->lock); + + hl_cb_put(job->patched_cb); + } + } + + /* + * This is the only place where there can be multiple threads + * modifying the list at the same time + */ + spin_lock(&cs->job_lock); + list_del(&job->cs_node); + spin_unlock(&cs->job_lock); + + if (job->ext_queue) + cs_put(cs); + + kfree(job); +} + +static void cs_do_release(struct kref *ref) +{ + struct hl_cs *cs = container_of(ref, struct hl_cs, + refcount); + struct hl_device *hdev = cs->ctx->hdev; + struct hl_cs_job *job, *tmp; + + cs->completed = true; + + /* + * Although if we reached here it means that all external jobs have + * finished, because each one of them took refcnt to CS, we still + * need to go over the internal jobs and free them. Otherwise, we + * will have leaked memory and what's worse, the CS object (and + * potentially the CTX object) could be released, while the JOB + * still holds a pointer to them (but no reference). + */ + list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node) + free_job(hdev, job); + + /* We also need to update CI for internal queues */ + if (cs->submitted) { + hl_int_hw_queue_update_ci(cs); + + spin_lock(&hdev->hw_queues_mirror_lock); + /* remove CS from hw_queues mirror list */ + list_del_init(&cs->mirror_node); + spin_unlock(&hdev->hw_queues_mirror_lock); + + /* + * Don't cancel TDR in case this CS was timedout because we + * might be running from the TDR context + */ + if ((!cs->timedout) && + (hdev->timeout_jiffies != MAX_SCHEDULE_TIMEOUT)) { + struct hl_cs *next; + + if (cs->tdr_active) + cancel_delayed_work_sync(&cs->work_tdr); + + spin_lock(&hdev->hw_queues_mirror_lock); + + /* queue TDR for next CS */ + next = list_first_entry_or_null( + &hdev->hw_queues_mirror_list, + struct hl_cs, mirror_node); + + if ((next) && (!next->tdr_active)) { + next->tdr_active = true; + schedule_delayed_work(&next->work_tdr, + hdev->timeout_jiffies); + } + + spin_unlock(&hdev->hw_queues_mirror_lock); + } + } + + hl_ctx_put(cs->ctx); + + if (cs->timedout) + dma_fence_set_error(cs->fence, -ETIMEDOUT); + else if (cs->aborted) + dma_fence_set_error(cs->fence, -EIO); + + dma_fence_signal(cs->fence); + dma_fence_put(cs->fence); + + kfree(cs); +} + +static void cs_timedout(struct work_struct *work) +{ + struct hl_device *hdev; + int ctx_asid, rc; + struct hl_cs *cs = container_of(work, struct hl_cs, + work_tdr.work); + rc = cs_get_unless_zero(cs); + if (!rc) + return; + + if ((!cs->submitted) || (cs->completed)) { + cs_put(cs); + return; + } + + /* Mark the CS is timed out so we won't try to cancel its TDR */ + cs->timedout = true; + + hdev = cs->ctx->hdev; + ctx_asid = cs->ctx->asid; + + /* TODO: add information about last signaled seq and last emitted seq */ + dev_err(hdev->dev, "CS %d.%llu got stuck!\n", ctx_asid, cs->sequence); + + cs_put(cs); + + if (hdev->reset_on_lockup) + hl_device_reset(hdev, false, false); +} + +static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx, + struct hl_cs **cs_new) +{ + struct hl_dma_fence *fence; + struct dma_fence *other = NULL; + struct hl_cs *cs; + int rc; + + cs = kzalloc(sizeof(*cs), GFP_ATOMIC); + if (!cs) + return -ENOMEM; + + cs->ctx = ctx; + cs->submitted = false; + cs->completed = false; + INIT_LIST_HEAD(&cs->job_list); + INIT_DELAYED_WORK(&cs->work_tdr, cs_timedout); + kref_init(&cs->refcount); + spin_lock_init(&cs->job_lock); + + fence = kmalloc(sizeof(*fence), GFP_ATOMIC); + if (!fence) { + rc = -ENOMEM; + goto free_cs; + } + + fence->hdev = hdev; + spin_lock_init(&fence->lock); + cs->fence = &fence->base_fence; + + spin_lock(&ctx->cs_lock); + + fence->cs_seq = ctx->cs_sequence; + other = ctx->cs_pending[fence->cs_seq & (HL_MAX_PENDING_CS - 1)]; + if ((other) && (!dma_fence_is_signaled(other))) { + spin_unlock(&ctx->cs_lock); + rc = -EAGAIN; + goto free_fence; + } + + dma_fence_init(&fence->base_fence, &hl_fence_ops, &fence->lock, + ctx->asid, ctx->cs_sequence); + + cs->sequence = fence->cs_seq; + + ctx->cs_pending[fence->cs_seq & (HL_MAX_PENDING_CS - 1)] = + &fence->base_fence; + ctx->cs_sequence++; + + dma_fence_get(&fence->base_fence); + + dma_fence_put(other); + + spin_unlock(&ctx->cs_lock); + + *cs_new = cs; + + return 0; + +free_fence: + kfree(fence); +free_cs: + kfree(cs); + return rc; +} + +static void cs_rollback(struct hl_device *hdev, struct hl_cs *cs) +{ + struct hl_cs_job *job, *tmp; + + list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node) + free_job(hdev, job); +} + +void hl_cs_rollback_all(struct hl_device *hdev) +{ + struct hl_cs *cs, *tmp; + + /* flush all completions */ + flush_workqueue(hdev->cq_wq); + + /* Make sure we don't have leftovers in the H/W queues mirror list */ + list_for_each_entry_safe(cs, tmp, &hdev->hw_queues_mirror_list, + mirror_node) { + cs_get(cs); + cs->aborted = true; + dev_warn_ratelimited(hdev->dev, "Killing CS %d.%llu\n", + cs->ctx->asid, cs->sequence); + cs_rollback(hdev, cs); + cs_put(cs); + } +} + +static void job_wq_completion(struct work_struct *work) +{ + struct hl_cs_job *job = container_of(work, struct hl_cs_job, + finish_work); + struct hl_cs *cs = job->cs; + struct hl_device *hdev = cs->ctx->hdev; + + /* job is no longer needed */ + free_job(hdev, job); +} + +static struct hl_cb *validate_queue_index(struct hl_device *hdev, + struct hl_cb_mgr *cb_mgr, + struct hl_cs_chunk *chunk, + bool *ext_queue) +{ + struct asic_fixed_properties *asic = &hdev->asic_prop; + struct hw_queue_properties *hw_queue_prop; + u32 cb_handle; + struct hl_cb *cb; + + /* Assume external queue */ + *ext_queue = true; + + hw_queue_prop = &asic->hw_queues_props[chunk->queue_index]; + + if ((chunk->queue_index >= HL_MAX_QUEUES) || + (hw_queue_prop->type == QUEUE_TYPE_NA)) { + dev_err(hdev->dev, "Queue index %d is invalid\n", + chunk->queue_index); + return NULL; + } + + if (hw_queue_prop->kmd_only) { + dev_err(hdev->dev, "Queue index %d is restricted for KMD\n", + chunk->queue_index); + return NULL; + } else if (hw_queue_prop->type == QUEUE_TYPE_INT) { + *ext_queue = false; + return (struct hl_cb *) (uintptr_t) chunk->cb_handle; + } + + /* Retrieve CB object */ + cb_handle = (u32) (chunk->cb_handle >> PAGE_SHIFT); + + cb = hl_cb_get(hdev, cb_mgr, cb_handle); + if (!cb) { + dev_err(hdev->dev, "CB handle 0x%x invalid\n", cb_handle); + return NULL; + } + + if ((chunk->cb_size < 8) || (chunk->cb_size > cb->size)) { + dev_err(hdev->dev, "CB size %u invalid\n", chunk->cb_size); + goto release_cb; + } + + spin_lock(&cb->lock); + cb->cs_cnt++; + spin_unlock(&cb->lock); + + return cb; + +release_cb: + hl_cb_put(cb); + return NULL; +} + +struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, bool ext_queue) +{ + struct hl_cs_job *job; + + job = kzalloc(sizeof(*job), GFP_ATOMIC); + if (!job) + return NULL; + + job->ext_queue = ext_queue; + + if (job->ext_queue) { + INIT_LIST_HEAD(&job->userptr_list); + INIT_WORK(&job->finish_work, job_wq_completion); + } + + return job; +} + +static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks, + u32 num_chunks, u64 *cs_seq) +{ + struct hl_device *hdev = hpriv->hdev; + struct hl_cs_chunk *cs_chunk_array; + struct hl_cs_job *job; + struct hl_cs *cs; + struct hl_cb *cb; + bool ext_queue_present = false; + u32 size_to_copy; + int rc, i, parse_cnt; + + *cs_seq = ULLONG_MAX; + + if (num_chunks > HL_MAX_JOBS_PER_CS) { + dev_err(hdev->dev, + "Number of chunks can NOT be larger than %d\n", + HL_MAX_JOBS_PER_CS); + rc = -EINVAL; + goto out; + } + + cs_chunk_array = kmalloc_array(num_chunks, sizeof(*cs_chunk_array), + GFP_ATOMIC); + if (!cs_chunk_array) { + rc = -ENOMEM; + goto out; + } + + size_to_copy = num_chunks * sizeof(struct hl_cs_chunk); + if (copy_from_user(cs_chunk_array, chunks, size_to_copy)) { + dev_err(hdev->dev, "Failed to copy cs chunk array from user\n"); + rc = -EFAULT; + goto free_cs_chunk_array; + } + + /* increment refcnt for context */ + hl_ctx_get(hdev, hpriv->ctx); + + rc = allocate_cs(hdev, hpriv->ctx, &cs); + if (rc) { + hl_ctx_put(hpriv->ctx); + goto free_cs_chunk_array; + } + + *cs_seq = cs->sequence; + + /* Validate ALL the CS chunks before submitting the CS */ + for (i = 0, parse_cnt = 0 ; i < num_chunks ; i++, parse_cnt++) { + struct hl_cs_chunk *chunk = &cs_chunk_array[i]; + bool ext_queue; + + cb = validate_queue_index(hdev, &hpriv->cb_mgr, chunk, + &ext_queue); + if (ext_queue) { + ext_queue_present = true; + if (!cb) { + rc = -EINVAL; + goto free_cs_object; + } + } + + job = hl_cs_allocate_job(hdev, ext_queue); + if (!job) { + dev_err(hdev->dev, "Failed to allocate a new job\n"); + rc = -ENOMEM; + if (ext_queue) + goto release_cb; + else + goto free_cs_object; + } + + job->id = i + 1; + job->cs = cs; + job->user_cb = cb; + job->user_cb_size = chunk->cb_size; + if (job->ext_queue) + job->job_cb_size = cb->size; + else + job->job_cb_size = chunk->cb_size; + job->hw_queue_id = chunk->queue_index; + + cs->jobs_in_queue_cnt[job->hw_queue_id]++; + + list_add_tail(&job->cs_node, &cs->job_list); + + /* + * Increment CS reference. When CS reference is 0, CS is + * done and can be signaled to user and free all its resources + * Only increment for JOB on external queues, because only + * for those JOBs we get completion + */ + if (job->ext_queue) + cs_get(cs); + + rc = cs_parser(hpriv, job); + if (rc) { + dev_err(hdev->dev, + "Failed to parse JOB %d.%llu.%d, err %d, rejecting the CS\n", + cs->ctx->asid, cs->sequence, job->id, rc); + goto free_cs_object; + } + } + + if (!ext_queue_present) { + dev_err(hdev->dev, + "Reject CS %d.%llu because no external queues jobs\n", + cs->ctx->asid, cs->sequence); + rc = -EINVAL; + goto free_cs_object; + } + + rc = hl_hw_queue_schedule_cs(cs); + if (rc) { + dev_err(hdev->dev, + "Failed to submit CS %d.%llu to H/W queues, error %d\n", + cs->ctx->asid, cs->sequence, rc); + goto free_cs_object; + } + + rc = HL_CS_STATUS_SUCCESS; + goto put_cs; + +release_cb: + spin_lock(&cb->lock); + cb->cs_cnt--; + spin_unlock(&cb->lock); + hl_cb_put(cb); +free_cs_object: + cs_rollback(hdev, cs); + *cs_seq = ULLONG_MAX; + /* The path below is both for good and erroneous exits */ +put_cs: + /* We finished with the CS in this function, so put the ref */ + cs_put(cs); +free_cs_chunk_array: + kfree(cs_chunk_array); +out: + return rc; +} + +int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data) +{ + struct hl_device *hdev = hpriv->hdev; + union hl_cs_args *args = data; + struct hl_ctx *ctx = hpriv->ctx; + void __user *chunks; + u32 num_chunks; + u64 cs_seq = ULONG_MAX; + int rc, do_restore; + bool need_soft_reset = false; + + if (hl_device_disabled_or_in_reset(hdev)) { + dev_warn(hdev->dev, + "Device is %s. Can't submit new CS\n", + atomic_read(&hdev->in_reset) ? "in_reset" : "disabled"); + rc = -EBUSY; + goto out; + } + + do_restore = atomic_cmpxchg(&ctx->thread_restore_token, 1, 0); + + if (do_restore || (args->in.cs_flags & HL_CS_FLAGS_FORCE_RESTORE)) { + long ret; + + chunks = (void __user *)(uintptr_t)args->in.chunks_restore; + num_chunks = args->in.num_chunks_restore; + + mutex_lock(&hpriv->restore_phase_mutex); + + if (do_restore) { + rc = hdev->asic_funcs->context_switch(hdev, ctx->asid); + if (rc) { + dev_err_ratelimited(hdev->dev, + "Failed to switch to context %d, rejecting CS! %d\n", + ctx->asid, rc); + /* + * If we timedout, we need to soft-reset because + * QMAN is probably stuck. However, we can't + * call to reset here directly because of + * deadlock, so need to do it at the very end + * of this function + */ + if (rc == -ETIMEDOUT) + need_soft_reset = true; + mutex_unlock(&hpriv->restore_phase_mutex); + goto out; + } + } + + hdev->asic_funcs->restore_phase_topology(hdev); + + if (num_chunks == 0) { + dev_dbg(hdev->dev, + "Need to run restore phase but restore CS is empty\n"); + rc = 0; + } else { + rc = _hl_cs_ioctl(hpriv, chunks, num_chunks, + &cs_seq); + } + + mutex_unlock(&hpriv->restore_phase_mutex); + + if (rc) { + dev_err(hdev->dev, + "Failed to submit restore CS for context %d (%d)\n", + ctx->asid, rc); + goto out; + } + + /* Need to wait for restore completion before execution phase */ + if (num_chunks > 0) { + ret = _hl_cs_wait_ioctl(hdev, ctx, + jiffies_to_usecs(hdev->timeout_jiffies), + cs_seq); + if (ret <= 0) { + dev_err(hdev->dev, + "Restore CS for context %d failed to complete %ld\n", + ctx->asid, ret); + rc = -ENOEXEC; + goto out; + } + } + + ctx->thread_restore_wait_token = 1; + } else if (!ctx->thread_restore_wait_token) { + u32 tmp; + + rc = hl_poll_timeout_memory(hdev, + (u64) (uintptr_t) &ctx->thread_restore_wait_token, + jiffies_to_usecs(hdev->timeout_jiffies), + &tmp); + + if (rc || !tmp) { + dev_err(hdev->dev, + "restore phase hasn't finished in time\n"); + rc = -ETIMEDOUT; + goto out; + } + } + + chunks = (void __user *)(uintptr_t)args->in.chunks_execute; + num_chunks = args->in.num_chunks_execute; + + if (num_chunks == 0) { + dev_err(hdev->dev, + "Got execute CS with 0 chunks, context %d\n", + ctx->asid); + rc = -EINVAL; + goto out; + } + + rc = _hl_cs_ioctl(hpriv, chunks, num_chunks, &cs_seq); + +out: + if (rc != -EAGAIN) { + memset(args, 0, sizeof(*args)); + args->out.status = rc; + args->out.seq = cs_seq; + } + + if ((rc == -ETIMEDOUT) && (need_soft_reset)) + hl_device_reset(hdev, false, false); + + return rc; +} + +static long _hl_cs_wait_ioctl(struct hl_device *hdev, + struct hl_ctx *ctx, u64 timeout_us, u64 seq) +{ + struct dma_fence *fence; + unsigned long timeout; + long rc; + + if (timeout_us == MAX_SCHEDULE_TIMEOUT) + timeout = timeout_us; + else + timeout = usecs_to_jiffies(timeout_us); + + hl_ctx_get(hdev, ctx); + + fence = hl_ctx_get_fence(ctx, seq); + if (IS_ERR(fence)) { + rc = PTR_ERR(fence); + } else if (fence) { + rc = dma_fence_wait_timeout(fence, true, timeout); + if (fence->error == -ETIMEDOUT) + rc = -ETIMEDOUT; + else if (fence->error == -EIO) + rc = -EIO; + dma_fence_put(fence); + } else + rc = 1; + + hl_ctx_put(ctx); + + return rc; +} + +int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data) +{ + struct hl_device *hdev = hpriv->hdev; + union hl_wait_cs_args *args = data; + u64 seq = args->in.seq; + long rc; + + rc = _hl_cs_wait_ioctl(hdev, hpriv->ctx, args->in.timeout_us, seq); + + memset(args, 0, sizeof(*args)); + + if (rc < 0) { + dev_err(hdev->dev, "Error %ld on waiting for CS handle %llu\n", + rc, seq); + if (rc == -ERESTARTSYS) { + args->out.status = HL_WAIT_CS_STATUS_INTERRUPTED; + rc = -EINTR; + } else if (rc == -ETIMEDOUT) { + args->out.status = HL_WAIT_CS_STATUS_TIMEDOUT; + } else if (rc == -EIO) { + args->out.status = HL_WAIT_CS_STATUS_ABORTED; + } + return rc; + } + + if (rc == 0) + args->out.status = HL_WAIT_CS_STATUS_BUSY; + else + args->out.status = HL_WAIT_CS_STATUS_COMPLETED; + + return 0; +} diff --git a/drivers/misc/habanalabs/context.c b/drivers/misc/habanalabs/context.c index de1258e7a6e6..c3854714b46c 100644 --- a/drivers/misc/habanalabs/context.c +++ b/drivers/misc/habanalabs/context.c @@ -12,6 +12,18 @@ static void hl_ctx_fini(struct hl_ctx *ctx) { struct hl_device *hdev = ctx->hdev; + int i; + + /* + * If we arrived here, there are no jobs waiting for this context + * on its queues so we can safely remove it. + * This is because for each CS, we increment the ref count and for + * every CS that was finished we decrement it and we won't arrive + * to this function unless the ref count is 0 + */ + + for (i = 0 ; i < HL_MAX_PENDING_CS ; i++) + dma_fence_put(ctx->cs_pending[i]); if (ctx->asid != HL_KERNEL_ASID_ID) hl_asid_free(hdev, ctx->asid); @@ -23,8 +35,6 @@ void hl_ctx_do_release(struct kref *ref) ctx = container_of(ref, struct hl_ctx, refcount); - dev_dbg(ctx->hdev->dev, "Now really releasing context %d\n", ctx->asid); - hl_ctx_fini(ctx); if (ctx->hpriv) @@ -90,6 +100,11 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx) kref_init(&ctx->refcount); + ctx->cs_sequence = 1; + spin_lock_init(&ctx->cs_lock); + atomic_set(&ctx->thread_restore_token, 1); + ctx->thread_restore_wait_token = 0; + if (is_kernel_ctx) { ctx->asid = HL_KERNEL_ASID_ID; /* KMD gets ASID 0 */ } else { @@ -100,8 +115,6 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx) } } - dev_dbg(hdev->dev, "Created context with ASID %u\n", ctx->asid); - return 0; } @@ -115,6 +128,37 @@ int hl_ctx_put(struct hl_ctx *ctx) return kref_put(&ctx->refcount, hl_ctx_do_release); } +struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq) +{ + struct hl_device *hdev = ctx->hdev; + struct dma_fence *fence; + + spin_lock(&ctx->cs_lock); + + if (seq >= ctx->cs_sequence) { + dev_notice(hdev->dev, + "Can't wait on seq %llu because current CS is at seq %llu\n", + seq, ctx->cs_sequence); + spin_unlock(&ctx->cs_lock); + return ERR_PTR(-EINVAL); + } + + + if (seq + HL_MAX_PENDING_CS < ctx->cs_sequence) { + dev_dbg(hdev->dev, + "Can't wait on seq %llu because current CS is at seq %llu (Fence is gone)\n", + seq, ctx->cs_sequence); + spin_unlock(&ctx->cs_lock); + return NULL; + } + + fence = dma_fence_get( + ctx->cs_pending[seq & (HL_MAX_PENDING_CS - 1)]); + spin_unlock(&ctx->cs_lock); + + return fence; +} + /* * hl_ctx_mgr_init - initialize the context manager * diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c index 2aa8a68cdf76..cc5f068df597 100644 --- a/drivers/misc/habanalabs/device.c +++ b/drivers/misc/habanalabs/device.c @@ -30,6 +30,8 @@ static void hpriv_release(struct kref *ref) put_pid(hpriv->taskpid); + mutex_destroy(&hpriv->restore_phase_mutex); + kfree(hpriv); /* Now the FD is really closed */ @@ -208,6 +210,8 @@ static int device_early_init(struct hl_device *hdev) mutex_init(&hdev->fd_open_cnt_lock); mutex_init(&hdev->send_cpu_message_lock); + INIT_LIST_HEAD(&hdev->hw_queues_mirror_list); + spin_lock_init(&hdev->hw_queues_mirror_lock); atomic_set(&hdev->in_reset, 0); atomic_set(&hdev->fd_open_cnt, 0); @@ -593,6 +597,9 @@ again: */ hdev->asic_funcs->halt_engines(hdev, hard_reset); + /* Go over all the queues, release all CS and their jobs */ + hl_cs_rollback_all(hdev); + if (hard_reset) { /* Release kernel context */ if (hl_ctx_put(hdev->kernel_ctx) != 1) { @@ -616,6 +623,12 @@ again: for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) hl_cq_reset(hdev, &hdev->completion_queue[i]); + /* Make sure the setup phase for the user context will run again */ + if (hdev->user_ctx) { + atomic_set(&hdev->user_ctx->thread_restore_token, 1); + hdev->user_ctx->thread_restore_wait_token = 0; + } + /* Finished tear-down, starting to re-initialize */ if (hard_reset) { @@ -952,6 +965,9 @@ void hl_device_fini(struct hl_device *hdev) */ hdev->asic_funcs->halt_engines(hdev, true); + /* Go over all the queues, release all CS and their jobs */ + hl_cs_rollback_all(hdev); + hl_cb_pool_fini(hdev); /* Release kernel context */ diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 1fe1d6a1ff9e..e3878fd7dc94 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -95,6 +95,19 @@ static const char goya_irq_name[GOYA_MSIX_ENTRIES][GOYA_MAX_STRING_LEN] = { "goya cq 4", "goya cpu eq" }; +static u16 goya_packet_sizes[MAX_PACKET_ID] = { + [PACKET_WREG_32] = sizeof(struct packet_wreg32), + [PACKET_WREG_BULK] = sizeof(struct packet_wreg_bulk), + [PACKET_MSG_LONG] = sizeof(struct packet_msg_long), + [PACKET_MSG_SHORT] = sizeof(struct packet_msg_short), + [PACKET_CP_DMA] = sizeof(struct packet_cp_dma), + [PACKET_MSG_PROT] = sizeof(struct packet_msg_prot), + [PACKET_FENCE] = sizeof(struct packet_fence), + [PACKET_LIN_DMA] = sizeof(struct packet_lin_dma), + [PACKET_NOP] = sizeof(struct packet_nop), + [PACKET_STOP] = sizeof(struct packet_stop) +}; + static const char *goya_axi_name[GOYA_MAX_INITIATORS] = { "MME0", "MME1", @@ -2978,6 +2991,84 @@ void *goya_get_int_queue_base(struct hl_device *hdev, u32 queue_id, return base; } +int goya_send_job_on_qman0(struct hl_device *hdev, struct hl_cs_job *job) +{ + struct goya_device *goya = hdev->asic_specific; + struct packet_msg_prot *fence_pkt; + u32 *fence_ptr; + dma_addr_t fence_dma_addr; + struct hl_cb *cb; + u32 tmp; + int rc; + + if (!hdev->asic_funcs->is_device_idle(hdev)) { + dev_err_ratelimited(hdev->dev, + "Can't send KMD job on QMAN0 if device is not idle\n"); + return -EFAULT; + } + + fence_ptr = hdev->asic_funcs->dma_pool_zalloc(hdev, 4, GFP_KERNEL, + &fence_dma_addr); + if (!fence_ptr) { + dev_err(hdev->dev, + "Failed to allocate fence memory for QMAN0\n"); + return -ENOMEM; + } + + *fence_ptr = 0; + + if (goya->hw_cap_initialized & HW_CAP_MMU) { + WREG32(mmDMA_QM_0_GLBL_PROT, QMAN_DMA_FULLY_TRUSTED); + RREG32(mmDMA_QM_0_GLBL_PROT); + } + + /* + * goya cs parser saves space for 2xpacket_msg_prot at end of CB. For + * synchronized kernel jobs we only need space for 1 packet_msg_prot + */ + job->job_cb_size -= sizeof(struct packet_msg_prot); + + cb = job->patched_cb; + + fence_pkt = (struct packet_msg_prot *) (uintptr_t) (cb->kernel_address + + job->job_cb_size - sizeof(struct packet_msg_prot)); + + fence_pkt->ctl = (PACKET_MSG_PROT << GOYA_PKT_CTL_OPCODE_SHIFT) | + (1 << GOYA_PKT_CTL_EB_SHIFT) | + (1 << GOYA_PKT_CTL_MB_SHIFT); + fence_pkt->value = GOYA_QMAN0_FENCE_VAL; + fence_pkt->addr = fence_dma_addr + + hdev->asic_prop.host_phys_base_address; + + rc = hl_hw_queue_send_cb_no_cmpl(hdev, GOYA_QUEUE_ID_DMA_0, + job->job_cb_size, cb->bus_address); + if (rc) { + dev_err(hdev->dev, "Failed to send CB on QMAN0, %d\n", rc); + goto free_fence_ptr; + } + + rc = hl_poll_timeout_memory(hdev, (u64) (uintptr_t) fence_ptr, + HL_DEVICE_TIMEOUT_USEC, &tmp); + + hl_hw_queue_inc_ci_kernel(hdev, GOYA_QUEUE_ID_DMA_0); + + if ((rc) || (tmp != GOYA_QMAN0_FENCE_VAL)) { + dev_err(hdev->dev, "QMAN0 Job hasn't finished in time\n"); + rc = -ETIMEDOUT; + } + +free_fence_ptr: + hdev->asic_funcs->dma_pool_free(hdev, (void *) fence_ptr, + fence_dma_addr); + + if (goya->hw_cap_initialized & HW_CAP_MMU) { + WREG32(mmDMA_QM_0_GLBL_PROT, QMAN_DMA_PARTLY_TRUSTED); + RREG32(mmDMA_QM_0_GLBL_PROT); + } + + return rc; +} + int goya_send_cpu_message(struct hl_device *hdev, u32 *msg, u16 len, u32 timeout, long *result) { @@ -3214,11 +3305,985 @@ void goya_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size, size); } +int goya_dma_map_sg(struct hl_device *hdev, struct scatterlist *sg, int nents, + enum dma_data_direction dir) +{ + if (!dma_map_sg(&hdev->pdev->dev, sg, nents, dir)) + return -ENOMEM; + + return 0; +} + +void goya_dma_unmap_sg(struct hl_device *hdev, struct scatterlist *sg, + int nents, enum dma_data_direction dir) +{ + dma_unmap_sg(&hdev->pdev->dev, sg, nents, dir); +} + +u32 goya_get_dma_desc_list_size(struct hl_device *hdev, + struct sg_table *sgt) +{ + struct scatterlist *sg, *sg_next_iter; + u32 count, len, dma_desc_cnt, len_next; + dma_addr_t addr, addr_next; + + dma_desc_cnt = 0; + + for_each_sg(sgt->sgl, sg, sgt->nents, count) { + + len = sg_dma_len(sg); + addr = sg_dma_address(sg); + + if (len == 0) + break; + + while ((count + 1) < sgt->nents) { + sg_next_iter = sg_next(sg); + len_next = sg_dma_len(sg_next_iter); + addr_next = sg_dma_address(sg_next_iter); + + if (len_next == 0) + break; + + if ((addr + len == addr_next) && + (len + len_next <= DMA_MAX_TRANSFER_SIZE)) { + len += len_next; + count++; + sg = sg_next_iter; + } else { + break; + } + } + + dma_desc_cnt++; + } + + return dma_desc_cnt * sizeof(struct packet_lin_dma); +} + +static int goya_pin_memory_before_cs(struct hl_device *hdev, + struct hl_cs_parser *parser, + struct packet_lin_dma *user_dma_pkt, + u64 addr, enum dma_data_direction dir) +{ + struct hl_userptr *userptr; + int rc; + + if (hl_userptr_is_pinned(hdev, addr, user_dma_pkt->tsize, + parser->job_userptr_list, &userptr)) + goto already_pinned; + + userptr = kzalloc(sizeof(*userptr), GFP_ATOMIC); + if (!userptr) + return -ENOMEM; + + rc = hl_pin_host_memory(hdev, addr, user_dma_pkt->tsize, userptr); + if (rc) + goto free_userptr; + + list_add_tail(&userptr->job_node, parser->job_userptr_list); + + rc = hdev->asic_funcs->asic_dma_map_sg(hdev, userptr->sgt->sgl, + userptr->sgt->nents, dir); + if (rc) { + dev_err(hdev->dev, "failed to map sgt with DMA region\n"); + goto unpin_memory; + } + + userptr->dma_mapped = true; + userptr->dir = dir; + +already_pinned: + parser->patched_cb_size += + goya_get_dma_desc_list_size(hdev, userptr->sgt); + + return 0; + +unpin_memory: + hl_unpin_host_memory(hdev, userptr); +free_userptr: + kfree(userptr); + return rc; +} + +static int goya_validate_dma_pkt_host(struct hl_device *hdev, + struct hl_cs_parser *parser, + struct packet_lin_dma *user_dma_pkt) +{ + u64 device_memory_addr, addr; + enum dma_data_direction dir; + enum goya_dma_direction user_dir; + bool sram_addr = true; + bool skip_host_mem_pin = false; + bool user_memset; + int rc = 0; + + user_dir = (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_DMA_DIR_MASK) >> + GOYA_PKT_LIN_DMA_CTL_DMA_DIR_SHIFT; + + user_memset = (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_MEMSET_MASK) >> + GOYA_PKT_LIN_DMA_CTL_MEMSET_SHIFT; + + switch (user_dir) { + case DMA_HOST_TO_DRAM: + dev_dbg(hdev->dev, "DMA direction is HOST --> DRAM\n"); + dir = DMA_TO_DEVICE; + sram_addr = false; + addr = user_dma_pkt->src_addr; + device_memory_addr = user_dma_pkt->dst_addr; + if (user_memset) + skip_host_mem_pin = true; + break; + + case DMA_DRAM_TO_HOST: + dev_dbg(hdev->dev, "DMA direction is DRAM --> HOST\n"); + dir = DMA_FROM_DEVICE; + sram_addr = false; + addr = user_dma_pkt->dst_addr; + device_memory_addr = user_dma_pkt->src_addr; + break; + + case DMA_HOST_TO_SRAM: + dev_dbg(hdev->dev, "DMA direction is HOST --> SRAM\n"); + dir = DMA_TO_DEVICE; + addr = user_dma_pkt->src_addr; + device_memory_addr = user_dma_pkt->dst_addr; + if (user_memset) + skip_host_mem_pin = true; + break; + + case DMA_SRAM_TO_HOST: + dev_dbg(hdev->dev, "DMA direction is SRAM --> HOST\n"); + dir = DMA_FROM_DEVICE; + addr = user_dma_pkt->dst_addr; + device_memory_addr = user_dma_pkt->src_addr; + break; + default: + dev_err(hdev->dev, "DMA direction is undefined\n"); + return -EFAULT; + } + + if (parser->ctx_id != HL_KERNEL_ASID_ID) { + if (sram_addr) { + if (!hl_mem_area_inside_range(device_memory_addr, + user_dma_pkt->tsize, + hdev->asic_prop.sram_user_base_address, + hdev->asic_prop.sram_end_address)) { + + dev_err(hdev->dev, + "SRAM address 0x%llx + 0x%x is invalid\n", + device_memory_addr, + user_dma_pkt->tsize); + return -EFAULT; + } + } else { + if (!hl_mem_area_inside_range(device_memory_addr, + user_dma_pkt->tsize, + hdev->asic_prop.dram_user_base_address, + hdev->asic_prop.dram_end_address)) { + + dev_err(hdev->dev, + "DRAM address 0x%llx + 0x%x is invalid\n", + device_memory_addr, + user_dma_pkt->tsize); + return -EFAULT; + } + } + } + + if (skip_host_mem_pin) + parser->patched_cb_size += sizeof(*user_dma_pkt); + else { + if ((dir == DMA_TO_DEVICE) && + (parser->hw_queue_id > GOYA_QUEUE_ID_DMA_1)) { + dev_err(hdev->dev, + "Can't DMA from host on queue other then 1\n"); + return -EFAULT; + } + + rc = goya_pin_memory_before_cs(hdev, parser, user_dma_pkt, + addr, dir); + } + + return rc; +} + +static int goya_validate_dma_pkt_no_host(struct hl_device *hdev, + struct hl_cs_parser *parser, + struct packet_lin_dma *user_dma_pkt) +{ + u64 sram_memory_addr, dram_memory_addr; + enum goya_dma_direction user_dir; + + user_dir = (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_DMA_DIR_MASK) >> + GOYA_PKT_LIN_DMA_CTL_DMA_DIR_SHIFT; + + if (user_dir == DMA_DRAM_TO_SRAM) { + dev_dbg(hdev->dev, "DMA direction is DRAM --> SRAM\n"); + dram_memory_addr = user_dma_pkt->src_addr; + sram_memory_addr = user_dma_pkt->dst_addr; + } else { + dev_dbg(hdev->dev, "DMA direction is SRAM --> DRAM\n"); + sram_memory_addr = user_dma_pkt->src_addr; + dram_memory_addr = user_dma_pkt->dst_addr; + } + + if (!hl_mem_area_inside_range(sram_memory_addr, user_dma_pkt->tsize, + hdev->asic_prop.sram_user_base_address, + hdev->asic_prop.sram_end_address)) { + dev_err(hdev->dev, "SRAM address 0x%llx + 0x%x is invalid\n", + sram_memory_addr, user_dma_pkt->tsize); + return -EFAULT; + } + + if (!hl_mem_area_inside_range(dram_memory_addr, user_dma_pkt->tsize, + hdev->asic_prop.dram_user_base_address, + hdev->asic_prop.dram_end_address)) { + dev_err(hdev->dev, "DRAM address 0x%llx + 0x%x is invalid\n", + dram_memory_addr, user_dma_pkt->tsize); + return -EFAULT; + } + + parser->patched_cb_size += sizeof(*user_dma_pkt); + + return 0; +} + +static int goya_validate_dma_pkt_no_mmu(struct hl_device *hdev, + struct hl_cs_parser *parser, + struct packet_lin_dma *user_dma_pkt) +{ + enum goya_dma_direction user_dir; + int rc; + + dev_dbg(hdev->dev, "DMA packet details:\n"); + dev_dbg(hdev->dev, "source == 0x%llx\n", user_dma_pkt->src_addr); + dev_dbg(hdev->dev, "destination == 0x%llx\n", user_dma_pkt->dst_addr); + dev_dbg(hdev->dev, "size == %u\n", user_dma_pkt->tsize); + + user_dir = (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_DMA_DIR_MASK) >> + GOYA_PKT_LIN_DMA_CTL_DMA_DIR_SHIFT; + + /* + * Special handling for DMA with size 0. The H/W has a bug where + * this can cause the QMAN DMA to get stuck, so block it here. + */ + if (user_dma_pkt->tsize == 0) { + dev_err(hdev->dev, + "Got DMA with size 0, might reset the device\n"); + return -EINVAL; + } + + if ((user_dir == DMA_DRAM_TO_SRAM) || (user_dir == DMA_SRAM_TO_DRAM)) + rc = goya_validate_dma_pkt_no_host(hdev, parser, user_dma_pkt); + else + rc = goya_validate_dma_pkt_host(hdev, parser, user_dma_pkt); + + return rc; +} + +static int goya_validate_dma_pkt_mmu(struct hl_device *hdev, + struct hl_cs_parser *parser, + struct packet_lin_dma *user_dma_pkt) +{ + dev_dbg(hdev->dev, "DMA packet details:\n"); + dev_dbg(hdev->dev, "source == 0x%llx\n", user_dma_pkt->src_addr); + dev_dbg(hdev->dev, "destination == 0x%llx\n", user_dma_pkt->dst_addr); + dev_dbg(hdev->dev, "size == %u\n", user_dma_pkt->tsize); + + /* + * WA for HW-23. + * We can't allow user to read from Host using QMANs other than 1. + */ + if (parser->hw_queue_id > GOYA_QUEUE_ID_DMA_1 && + hl_mem_area_inside_range(user_dma_pkt->src_addr, + user_dma_pkt->tsize, + hdev->asic_prop.va_space_host_start_address, + hdev->asic_prop.va_space_host_end_address)) { + dev_err(hdev->dev, + "Can't DMA from host on queue other then 1\n"); + return -EFAULT; + } + + if (user_dma_pkt->tsize == 0) { + dev_err(hdev->dev, + "Got DMA with size 0, might reset the device\n"); + return -EINVAL; + } + + parser->patched_cb_size += sizeof(*user_dma_pkt); + + return 0; +} + +static int goya_validate_wreg32(struct hl_device *hdev, + struct hl_cs_parser *parser, + struct packet_wreg32 *wreg_pkt) +{ + struct goya_device *goya = hdev->asic_specific; + u32 sob_start_addr, sob_end_addr; + u16 reg_offset; + + reg_offset = wreg_pkt->ctl & GOYA_PKT_WREG32_CTL_REG_OFFSET_MASK; + + dev_dbg(hdev->dev, "WREG32 packet details:\n"); + dev_dbg(hdev->dev, "reg_offset == 0x%x\n", reg_offset); + dev_dbg(hdev->dev, "value == 0x%x\n", wreg_pkt->value); + + if (reg_offset != (mmDMA_CH_1_WR_COMP_ADDR_LO & 0xFFFF)) { + dev_err(hdev->dev, "WREG32 packet with illegal address 0x%x\n", + reg_offset); + return -EPERM; + } + + /* + * With MMU, DMA channels are not secured, so it doesn't matter where + * the WR COMP will be written to because it will go out with + * non-secured property + */ + if (goya->hw_cap_initialized & HW_CAP_MMU) + return 0; + + sob_start_addr = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0); + sob_end_addr = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_1023); + + if ((wreg_pkt->value < sob_start_addr) || + (wreg_pkt->value > sob_end_addr)) { + + dev_err(hdev->dev, "WREG32 packet with illegal value 0x%x\n", + wreg_pkt->value); + return -EPERM; + } + + return 0; +} + +static int goya_validate_cb(struct hl_device *hdev, + struct hl_cs_parser *parser, bool is_mmu) +{ + u32 cb_parsed_length = 0; + int rc = 0; + + parser->patched_cb_size = 0; + + /* cb_user_size is more than 0 so loop will always be executed */ + while (cb_parsed_length < parser->user_cb_size) { + enum packet_id pkt_id; + u16 pkt_size; + void *user_pkt; + + user_pkt = (void *) (uintptr_t) + (parser->user_cb->kernel_address + cb_parsed_length); + + pkt_id = (enum packet_id) (((*(u64 *) user_pkt) & + PACKET_HEADER_PACKET_ID_MASK) >> + PACKET_HEADER_PACKET_ID_SHIFT); + + pkt_size = goya_packet_sizes[pkt_id]; + cb_parsed_length += pkt_size; + if (cb_parsed_length > parser->user_cb_size) { + dev_err(hdev->dev, + "packet 0x%x is out of CB boundary\n", pkt_id); + rc = -EINVAL; + break; + } + + switch (pkt_id) { + case PACKET_WREG_32: + /* + * Although it is validated after copy in patch_cb(), + * need to validate here as well because patch_cb() is + * not called in MMU path while this function is called + */ + rc = goya_validate_wreg32(hdev, parser, user_pkt); + break; + + case PACKET_WREG_BULK: + dev_err(hdev->dev, + "User not allowed to use WREG_BULK\n"); + rc = -EPERM; + break; + + case PACKET_MSG_PROT: + dev_err(hdev->dev, + "User not allowed to use MSG_PROT\n"); + rc = -EPERM; + break; + + case PACKET_CP_DMA: + dev_err(hdev->dev, "User not allowed to use CP_DMA\n"); + rc = -EPERM; + break; + + case PACKET_STOP: + dev_err(hdev->dev, "User not allowed to use STOP\n"); + rc = -EPERM; + break; + + case PACKET_LIN_DMA: + if (is_mmu) + rc = goya_validate_dma_pkt_mmu(hdev, parser, + user_pkt); + else + rc = goya_validate_dma_pkt_no_mmu(hdev, parser, + user_pkt); + break; + + case PACKET_MSG_LONG: + case PACKET_MSG_SHORT: + case PACKET_FENCE: + case PACKET_NOP: + parser->patched_cb_size += pkt_size; + break; + + default: + dev_err(hdev->dev, "Invalid packet header 0x%x\n", + pkt_id); + rc = -EINVAL; + break; + } + + if (rc) + break; + } + + /* + * The new CB should have space at the end for two MSG_PROT packets: + * 1. A packet that will act as a completion packet + * 2. A packet that will generate MSI-X interrupt + */ + parser->patched_cb_size += sizeof(struct packet_msg_prot) * 2; + + return rc; +} + +static int goya_patch_dma_packet(struct hl_device *hdev, + struct hl_cs_parser *parser, + struct packet_lin_dma *user_dma_pkt, + struct packet_lin_dma *new_dma_pkt, + u32 *new_dma_pkt_size) +{ + struct hl_userptr *userptr; + struct scatterlist *sg, *sg_next_iter; + u32 count, len, dma_desc_cnt, len_next; + dma_addr_t dma_addr, dma_addr_next; + enum goya_dma_direction user_dir; + u64 device_memory_addr, addr; + enum dma_data_direction dir; + struct sg_table *sgt; + bool skip_host_mem_pin = false; + bool user_memset; + u32 user_rdcomp_mask, user_wrcomp_mask; + + user_dir = (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_DMA_DIR_MASK) >> + GOYA_PKT_LIN_DMA_CTL_DMA_DIR_SHIFT; + + user_memset = (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_MEMSET_MASK) >> + GOYA_PKT_LIN_DMA_CTL_MEMSET_SHIFT; + + if ((user_dir == DMA_DRAM_TO_SRAM) || (user_dir == DMA_SRAM_TO_DRAM) || + (user_dma_pkt->tsize == 0)) { + memcpy(new_dma_pkt, user_dma_pkt, sizeof(*new_dma_pkt)); + *new_dma_pkt_size = sizeof(*new_dma_pkt); + return 0; + } + + if ((user_dir == DMA_HOST_TO_DRAM) || (user_dir == DMA_HOST_TO_SRAM)) { + addr = user_dma_pkt->src_addr; + device_memory_addr = user_dma_pkt->dst_addr; + dir = DMA_TO_DEVICE; + if (user_memset) + skip_host_mem_pin = true; + } else { + addr = user_dma_pkt->dst_addr; + device_memory_addr = user_dma_pkt->src_addr; + dir = DMA_FROM_DEVICE; + } + + if ((!skip_host_mem_pin) && + (hl_userptr_is_pinned(hdev, addr, user_dma_pkt->tsize, + parser->job_userptr_list, &userptr) == false)) { + dev_err(hdev->dev, "Userptr 0x%llx + 0x%x NOT mapped\n", + addr, user_dma_pkt->tsize); + return -EFAULT; + } + + if ((user_memset) && (dir == DMA_TO_DEVICE)) { + memcpy(new_dma_pkt, user_dma_pkt, sizeof(*user_dma_pkt)); + *new_dma_pkt_size = sizeof(*user_dma_pkt); + return 0; + } + + user_rdcomp_mask = + (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_RDCOMP_MASK); + + user_wrcomp_mask = + (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_WRCOMP_MASK); + + sgt = userptr->sgt; + dma_desc_cnt = 0; + + for_each_sg(sgt->sgl, sg, sgt->nents, count) { + len = sg_dma_len(sg); + dma_addr = sg_dma_address(sg); + + if (len == 0) + break; + + while ((count + 1) < sgt->nents) { + sg_next_iter = sg_next(sg); + len_next = sg_dma_len(sg_next_iter); + dma_addr_next = sg_dma_address(sg_next_iter); + + if (len_next == 0) + break; + + if ((dma_addr + len == dma_addr_next) && + (len + len_next <= DMA_MAX_TRANSFER_SIZE)) { + len += len_next; + count++; + sg = sg_next_iter; + } else { + break; + } + } + + new_dma_pkt->ctl = user_dma_pkt->ctl; + if (likely(dma_desc_cnt)) + new_dma_pkt->ctl &= ~GOYA_PKT_CTL_EB_MASK; + new_dma_pkt->ctl &= ~(GOYA_PKT_LIN_DMA_CTL_RDCOMP_MASK | + GOYA_PKT_LIN_DMA_CTL_WRCOMP_MASK); + new_dma_pkt->tsize = len; + + dma_addr += hdev->asic_prop.host_phys_base_address; + + if (dir == DMA_TO_DEVICE) { + new_dma_pkt->src_addr = dma_addr; + new_dma_pkt->dst_addr = device_memory_addr; + } else { + new_dma_pkt->src_addr = device_memory_addr; + new_dma_pkt->dst_addr = dma_addr; + } + + if (!user_memset) + device_memory_addr += len; + dma_desc_cnt++; + new_dma_pkt++; + } + + if (!dma_desc_cnt) { + dev_err(hdev->dev, + "Error of 0 SG entries when patching DMA packet\n"); + return -EFAULT; + } + + /* Fix the last dma packet - rdcomp/wrcomp must be as user set them */ + new_dma_pkt--; + new_dma_pkt->ctl |= (user_rdcomp_mask | user_wrcomp_mask); + + *new_dma_pkt_size = dma_desc_cnt * sizeof(struct packet_lin_dma); + + return 0; +} + +static int goya_patch_cb(struct hl_device *hdev, + struct hl_cs_parser *parser) +{ + u32 cb_parsed_length = 0; + u32 cb_patched_cur_length = 0; + int rc = 0; + + /* cb_user_size is more than 0 so loop will always be executed */ + while (cb_parsed_length < parser->user_cb_size) { + enum packet_id pkt_id; + u16 pkt_size; + u32 new_pkt_size = 0; + void *user_pkt, *kernel_pkt; + + user_pkt = (void *) (uintptr_t) + (parser->user_cb->kernel_address + cb_parsed_length); + kernel_pkt = (void *) (uintptr_t) + (parser->patched_cb->kernel_address + + cb_patched_cur_length); + + pkt_id = (enum packet_id) (((*(u64 *) user_pkt) & + PACKET_HEADER_PACKET_ID_MASK) >> + PACKET_HEADER_PACKET_ID_SHIFT); + + pkt_size = goya_packet_sizes[pkt_id]; + cb_parsed_length += pkt_size; + if (cb_parsed_length > parser->user_cb_size) { + dev_err(hdev->dev, + "packet 0x%x is out of CB boundary\n", pkt_id); + rc = -EINVAL; + break; + } + + switch (pkt_id) { + case PACKET_LIN_DMA: + rc = goya_patch_dma_packet(hdev, parser, user_pkt, + kernel_pkt, &new_pkt_size); + cb_patched_cur_length += new_pkt_size; + break; + + case PACKET_WREG_32: + memcpy(kernel_pkt, user_pkt, pkt_size); + cb_patched_cur_length += pkt_size; + rc = goya_validate_wreg32(hdev, parser, kernel_pkt); + break; + + case PACKET_WREG_BULK: + dev_err(hdev->dev, + "User not allowed to use WREG_BULK\n"); + rc = -EPERM; + break; + + case PACKET_MSG_PROT: + dev_err(hdev->dev, + "User not allowed to use MSG_PROT\n"); + rc = -EPERM; + break; + + case PACKET_CP_DMA: + dev_err(hdev->dev, "User not allowed to use CP_DMA\n"); + rc = -EPERM; + break; + + case PACKET_STOP: + dev_err(hdev->dev, "User not allowed to use STOP\n"); + rc = -EPERM; + break; + + case PACKET_MSG_LONG: + case PACKET_MSG_SHORT: + case PACKET_FENCE: + case PACKET_NOP: + memcpy(kernel_pkt, user_pkt, pkt_size); + cb_patched_cur_length += pkt_size; + break; + + default: + dev_err(hdev->dev, "Invalid packet header 0x%x\n", + pkt_id); + rc = -EINVAL; + break; + } + + if (rc) + break; + } + + return rc; +} + +static int goya_parse_cb_mmu(struct hl_device *hdev, + struct hl_cs_parser *parser) +{ + u64 patched_cb_handle; + u32 patched_cb_size; + struct hl_cb *user_cb; + int rc; + + /* + * The new CB should have space at the end for two MSG_PROT pkt: + * 1. A packet that will act as a completion packet + * 2. A packet that will generate MSI-X interrupt + */ + parser->patched_cb_size = parser->user_cb_size + + sizeof(struct packet_msg_prot) * 2; + + rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, + parser->patched_cb_size, + &patched_cb_handle, HL_KERNEL_ASID_ID); + + if (rc) { + dev_err(hdev->dev, + "Failed to allocate patched CB for DMA CS %d\n", + rc); + return rc; + } + + patched_cb_handle >>= PAGE_SHIFT; + parser->patched_cb = hl_cb_get(hdev, &hdev->kernel_cb_mgr, + (u32) patched_cb_handle); + /* hl_cb_get should never fail here so use kernel WARN */ + WARN(!parser->patched_cb, "DMA CB handle invalid 0x%x\n", + (u32) patched_cb_handle); + if (!parser->patched_cb) { + rc = -EFAULT; + goto out; + } + + /* + * The check that parser->user_cb_size <= parser->user_cb->size was done + * in validate_queue_index(). + */ + memcpy((void *) (uintptr_t) parser->patched_cb->kernel_address, + (void *) (uintptr_t) parser->user_cb->kernel_address, + parser->user_cb_size); + + patched_cb_size = parser->patched_cb_size; + + /* validate patched CB instead of user CB */ + user_cb = parser->user_cb; + parser->user_cb = parser->patched_cb; + rc = goya_validate_cb(hdev, parser, true); + parser->user_cb = user_cb; + + if (rc) { + hl_cb_put(parser->patched_cb); + goto out; + } + + if (patched_cb_size != parser->patched_cb_size) { + dev_err(hdev->dev, "user CB size mismatch\n"); + hl_cb_put(parser->patched_cb); + rc = -EINVAL; + goto out; + } + +out: + /* + * Always call cb destroy here because we still have 1 reference + * to it by calling cb_get earlier. After the job will be completed, + * cb_put will release it, but here we want to remove it from the + * idr + */ + hl_cb_destroy(hdev, &hdev->kernel_cb_mgr, + patched_cb_handle << PAGE_SHIFT); + + return rc; +} + +int goya_parse_cb_no_mmu(struct hl_device *hdev, struct hl_cs_parser *parser) +{ + u64 patched_cb_handle; + int rc; + + rc = goya_validate_cb(hdev, parser, false); + + if (rc) + goto free_userptr; + + rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, + parser->patched_cb_size, + &patched_cb_handle, HL_KERNEL_ASID_ID); + if (rc) { + dev_err(hdev->dev, + "Failed to allocate patched CB for DMA CS %d\n", rc); + goto free_userptr; + } + + patched_cb_handle >>= PAGE_SHIFT; + parser->patched_cb = hl_cb_get(hdev, &hdev->kernel_cb_mgr, + (u32) patched_cb_handle); + /* hl_cb_get should never fail here so use kernel WARN */ + WARN(!parser->patched_cb, "DMA CB handle invalid 0x%x\n", + (u32) patched_cb_handle); + if (!parser->patched_cb) { + rc = -EFAULT; + goto out; + } + + rc = goya_patch_cb(hdev, parser); + + if (rc) + hl_cb_put(parser->patched_cb); + +out: + /* + * Always call cb destroy here because we still have 1 reference + * to it by calling cb_get earlier. After the job will be completed, + * cb_put will release it, but here we want to remove it from the + * idr + */ + hl_cb_destroy(hdev, &hdev->kernel_cb_mgr, + patched_cb_handle << PAGE_SHIFT); + +free_userptr: + if (rc) + hl_userptr_delete_list(hdev, parser->job_userptr_list); + return rc; +} + +int goya_parse_cb_no_ext_quque(struct hl_device *hdev, + struct hl_cs_parser *parser) +{ + struct asic_fixed_properties *asic_prop = &hdev->asic_prop; + struct goya_device *goya = hdev->asic_specific; + + if (!(goya->hw_cap_initialized & HW_CAP_MMU)) { + /* For internal queue jobs, just check if cb address is valid */ + if (hl_mem_area_inside_range( + (u64) (uintptr_t) parser->user_cb, + parser->user_cb_size, + asic_prop->sram_user_base_address, + asic_prop->sram_end_address)) + return 0; + + if (hl_mem_area_inside_range( + (u64) (uintptr_t) parser->user_cb, + parser->user_cb_size, + asic_prop->dram_user_base_address, + asic_prop->dram_end_address)) + return 0; + + dev_err(hdev->dev, + "Internal CB address 0x%llx + 0x%x is not in SRAM nor in DRAM\n", + (u64) (uintptr_t) parser->user_cb, + parser->user_cb_size); + + return -EFAULT; + } + + return 0; +} + +int goya_cs_parser(struct hl_device *hdev, struct hl_cs_parser *parser) +{ + struct goya_device *goya = hdev->asic_specific; + + if (!parser->ext_queue) + return goya_parse_cb_no_ext_quque(hdev, parser); + + if ((goya->hw_cap_initialized & HW_CAP_MMU) && parser->use_virt_addr) + return goya_parse_cb_mmu(hdev, parser); + else + return goya_parse_cb_no_mmu(hdev, parser); +} + +void goya_add_end_of_cb_packets(u64 kernel_address, u32 len, u64 cq_addr, + u32 cq_val, u32 msix_vec) +{ + struct packet_msg_prot *cq_pkt; + + cq_pkt = (struct packet_msg_prot *) (uintptr_t) + (kernel_address + len - (sizeof(struct packet_msg_prot) * 2)); + + cq_pkt->ctl = (PACKET_MSG_PROT << GOYA_PKT_CTL_OPCODE_SHIFT) | + (1 << GOYA_PKT_CTL_EB_SHIFT) | + (1 << GOYA_PKT_CTL_MB_SHIFT); + cq_pkt->value = cq_val; + cq_pkt->addr = cq_addr; + + cq_pkt++; + + cq_pkt->ctl = (PACKET_MSG_PROT << GOYA_PKT_CTL_OPCODE_SHIFT) | + (1 << GOYA_PKT_CTL_MB_SHIFT); + cq_pkt->value = msix_vec & 0x7FF; + cq_pkt->addr = CFG_BASE + mmPCIE_DBI_MSIX_DOORBELL_OFF; +} + static void goya_update_eq_ci(struct hl_device *hdev, u32 val) { WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_6, val); } +int goya_context_switch(struct hl_device *hdev, u32 asid) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct packet_lin_dma *clear_sram_pkt; + struct hl_cs_parser parser; + struct hl_cs_job *job; + u32 cb_size; + struct hl_cb *cb; + int rc; + + cb = hl_cb_kernel_create(hdev, PAGE_SIZE); + if (!cb) + return -EFAULT; + + clear_sram_pkt = (struct packet_lin_dma *) + (uintptr_t) cb->kernel_address; + + memset(clear_sram_pkt, 0, sizeof(*clear_sram_pkt)); + cb_size = sizeof(*clear_sram_pkt); + + clear_sram_pkt->ctl = ((PACKET_LIN_DMA << GOYA_PKT_CTL_OPCODE_SHIFT) | + (DMA_HOST_TO_SRAM << GOYA_PKT_LIN_DMA_CTL_DMA_DIR_SHIFT) | + (1 << GOYA_PKT_LIN_DMA_CTL_MEMSET_SHIFT) | + (1 << GOYA_PKT_LIN_DMA_CTL_WO_SHIFT) | + (1 << GOYA_PKT_CTL_RB_SHIFT) | + (1 << GOYA_PKT_CTL_MB_SHIFT)); + + clear_sram_pkt->src_addr = 0x7777777777777777ull; + clear_sram_pkt->dst_addr = prop->sram_base_address; + if (hdev->pldm) + clear_sram_pkt->tsize = 0x10000; + else + clear_sram_pkt->tsize = prop->sram_size; + + job = hl_cs_allocate_job(hdev, true); + if (!job) { + dev_err(hdev->dev, "Failed to allocate a new job\n"); + rc = -ENOMEM; + goto release_cb; + } + + job->id = 0; + job->user_cb = cb; + job->user_cb->cs_cnt++; + job->user_cb_size = cb_size; + job->hw_queue_id = GOYA_QUEUE_ID_DMA_0; + + parser.ctx_id = HL_KERNEL_ASID_ID; + parser.cs_sequence = 0; + parser.job_id = job->id; + parser.hw_queue_id = job->hw_queue_id; + parser.job_userptr_list = &job->userptr_list; + parser.user_cb = job->user_cb; + parser.user_cb_size = job->user_cb_size; + parser.ext_queue = job->ext_queue; + parser.use_virt_addr = hdev->mmu_enable; + + rc = hdev->asic_funcs->cs_parser(hdev, &parser); + if (rc) { + dev_err(hdev->dev, + "Failed to parse kernel CB during context switch\n"); + goto free_job; + } + + job->patched_cb = parser.patched_cb; + job->job_cb_size = parser.patched_cb_size; + job->patched_cb->cs_cnt++; + + rc = goya_send_job_on_qman0(hdev, job); + + job->patched_cb->cs_cnt--; + hl_cb_put(job->patched_cb); + +free_job: + hl_userptr_delete_list(hdev, &job->userptr_list); + kfree(job); + cb->cs_cnt--; + +release_cb: + hl_cb_put(cb); + hl_cb_destroy(hdev, &hdev->kernel_cb_mgr, cb->id << PAGE_SHIFT); + + return rc; +} + +void goya_restore_phase_topology(struct hl_device *hdev) +{ + int i, num_of_sob_in_longs, num_of_mon_in_longs; + + num_of_sob_in_longs = + ((mmSYNC_MNGR_SOB_OBJ_1023 - mmSYNC_MNGR_SOB_OBJ_0) + 4); + + num_of_mon_in_longs = + ((mmSYNC_MNGR_MON_STATUS_255 - mmSYNC_MNGR_MON_STATUS_0) + 4); + + for (i = 0 ; i < num_of_sob_in_longs ; i += 4) + WREG32(mmSYNC_MNGR_SOB_OBJ_0 + i, 0); + + for (i = 0 ; i < num_of_mon_in_longs ; i += 4) + WREG32(mmSYNC_MNGR_MON_STATUS_0 + i, 0); + + /* Flush all WREG to prevent race */ + i = RREG32(mmSYNC_MNGR_SOB_OBJ_0); +} + static void goya_get_axi_name(struct hl_device *hdev, u32 agent_id, u16 event_type, char *axi_name, int len) { @@ -3608,6 +4673,59 @@ static void goya_disable_clock_gating(struct hl_device *hdev) } +static bool goya_is_device_idle(struct hl_device *hdev) +{ + u64 offset, dma_qm_reg, tpc_qm_reg, tpc_cmdq_reg, tpc_cfg_reg; + int i; + + offset = mmDMA_QM_1_GLBL_STS0 - mmDMA_QM_0_GLBL_STS0; + + for (i = 0 ; i < DMA_MAX_NUM ; i++) { + dma_qm_reg = mmDMA_QM_0_GLBL_STS0 + i * offset; + + if ((RREG32(dma_qm_reg) & DMA_QM_IDLE_MASK) != + DMA_QM_IDLE_MASK) + return false; + } + + offset = mmTPC1_QM_GLBL_STS0 - mmTPC0_QM_GLBL_STS0; + + for (i = 0 ; i < TPC_MAX_NUM ; i++) { + tpc_qm_reg = mmTPC0_QM_GLBL_STS0 + i * offset; + tpc_cmdq_reg = mmTPC0_CMDQ_GLBL_STS0 + i * offset; + tpc_cfg_reg = mmTPC0_CFG_STATUS + i * offset; + + if ((RREG32(tpc_qm_reg) & TPC_QM_IDLE_MASK) != + TPC_QM_IDLE_MASK) + return false; + + if ((RREG32(tpc_cmdq_reg) & TPC_CMDQ_IDLE_MASK) != + TPC_CMDQ_IDLE_MASK) + return false; + + if ((RREG32(tpc_cfg_reg) & TPC_CFG_IDLE_MASK) != + TPC_CFG_IDLE_MASK) + return false; + } + + if ((RREG32(mmMME_QM_GLBL_STS0) & MME_QM_IDLE_MASK) != + MME_QM_IDLE_MASK) + return false; + + if ((RREG32(mmMME_CMDQ_GLBL_STS0) & MME_CMDQ_IDLE_MASK) != + MME_CMDQ_IDLE_MASK) + return false; + + if ((RREG32(mmMME_ARCH_STATUS) & MME_ARCH_IDLE_MASK) != + MME_ARCH_IDLE_MASK) + return false; + + if (RREG32(mmMME_SHADOW_0_STATUS) & MME_SHADOW_IDLE_MASK) + return false; + + return true; +} + static void goya_hw_queues_lock(struct hl_device *hdev) { struct goya_device *goya = hdev->asic_specific; @@ -3700,7 +4818,14 @@ static const struct hl_asic_funcs goya_funcs = { .dma_pool_free = goya_dma_pool_free, .cpu_accessible_dma_pool_alloc = goya_cpu_accessible_dma_pool_alloc, .cpu_accessible_dma_pool_free = goya_cpu_accessible_dma_pool_free, + .hl_dma_unmap_sg = goya_dma_unmap_sg, + .cs_parser = goya_cs_parser, + .asic_dma_map_sg = goya_dma_map_sg, + .get_dma_desc_list_size = goya_get_dma_desc_list_size, + .add_end_of_cb_packets = goya_add_end_of_cb_packets, .update_eq_ci = goya_update_eq_ci, + .context_switch = goya_context_switch, + .restore_phase_topology = goya_restore_phase_topology, .add_device_attr = goya_add_device_attr, .handle_eqe = goya_handle_eqe, .set_pll_profile = goya_set_pll_profile, @@ -3708,6 +4833,7 @@ static const struct hl_asic_funcs goya_funcs = { .send_heartbeat = goya_send_heartbeat, .enable_clock_gating = goya_init_clock_gating, .disable_clock_gating = goya_disable_clock_gating, + .is_device_idle = goya_is_device_idle, .soft_reset_late_init = goya_soft_reset_late_init, .hw_queues_lock = goya_hw_queues_lock, .hw_queues_unlock = goya_hw_queues_unlock, diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index 744e37bbc2a6..9adc7c6ec08b 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -16,6 +16,9 @@ #include #include #include +#include +#include +#include #define HL_NAME "habanalabs" @@ -31,6 +34,11 @@ #define HL_MAX_QUEUES 128 +#define HL_MAX_JOBS_PER_CS 64 + +/* MUST BE POWER OF 2 and larger than 1 */ +#define HL_MAX_PENDING_CS 64 + struct hl_device; struct hl_fpriv; @@ -61,6 +69,16 @@ struct hw_queue_properties { u8 kmd_only; }; +/** + * enum vm_type_t - virtual memory mapping request information. + * @VM_TYPE_USERPTR: mapping of user memory to device virtual address. + * @VM_TYPE_PHYS_LIST: mapping of DRAM memory to device virtual address. + */ +enum vm_type_t { + VM_TYPE_USERPTR, + VM_TYPE_PHYS_LIST +}; + /** * enum hl_device_hw_state - H/W device state. use this to understand whether * to do reset before hw_init or not @@ -147,6 +165,19 @@ struct asic_fixed_properties { u8 tpc_enabled_mask; }; +/** + * struct hl_dma_fence - wrapper for fence object used by command submissions. + * @base_fence: kernel fence object. + * @lock: spinlock to protect fence. + * @hdev: habanalabs device structure. + * @cs_seq: command submission sequence number. + */ +struct hl_dma_fence { + struct dma_fence base_fence; + spinlock_t lock; + struct hl_device *hdev; + u64 cs_seq; +}; /* * Command Buffers @@ -175,6 +206,7 @@ struct hl_cb_mgr { * @mmap_size: Holds the CB's size that was mmaped. * @size: holds the CB's size. * @id: the CB's ID. + * @cs_cnt: holds number of CS that this CB participates in. * @ctx_id: holds the ID of the owner's context. * @mmap: true if the CB is currently mmaped to user. * @is_pool: true if CB was acquired from the pool, false otherwise. @@ -189,6 +221,7 @@ struct hl_cb { u32 mmap_size; u32 size; u32 id; + u32 cs_cnt; u32 ctx_id; u8 mmap; u8 is_pool; @@ -313,6 +346,8 @@ enum hl_asic_type { ASIC_INVALID }; +struct hl_cs_parser; + /** * enum hl_pm_mng_profile - power management profile. * @PM_AUTO: internal clock is set by KMD. @@ -372,7 +407,14 @@ enum hl_pll_frequency { * @dma_pool_free: free small DMA allocation from pool. * @cpu_accessible_dma_pool_alloc: allocate CPU PQ packet from DMA pool. * @cpu_accessible_dma_pool_free: free CPU PQ packet from DMA pool. + * @hl_dma_unmap_sg: DMA unmap scatter-gather list. + * @cs_parser: parse Command Submission. + * @asic_dma_map_sg: DMA map scatter-gather list. + * @get_dma_desc_list_size: get number of LIN_DMA packets required for CB. + * @add_end_of_cb_packets: Add packets to the end of CB, if device requires it. * @update_eq_ci: update event queue CI. + * @context_switch: called upon ASID context switch. + * @restore_phase_topology: clear all SOBs amd MONs. * @add_device_attr: add ASIC specific device attributes. * @handle_eqe: handle event queue entry (IRQ) from ArmCP. * @set_pll_profile: change PLL profile (manual/automatic). @@ -380,6 +422,7 @@ enum hl_pll_frequency { * @send_heartbeat: send is-alive packet to ArmCP and verify response. * @enable_clock_gating: enable clock gating for reducing power consumption. * @disable_clock_gating: disable clock for accessing registers on HBW. + * @is_device_idle: return true if device is idle, false otherwise. * @soft_reset_late_init: perform certain actions needed after soft reset. * @hw_queues_lock: acquire H/W queues lock. * @hw_queues_unlock: release H/W queues lock. @@ -419,7 +462,20 @@ struct hl_asic_funcs { size_t size, dma_addr_t *dma_handle); void (*cpu_accessible_dma_pool_free)(struct hl_device *hdev, size_t size, void *vaddr); + void (*hl_dma_unmap_sg)(struct hl_device *hdev, + struct scatterlist *sg, int nents, + enum dma_data_direction dir); + int (*cs_parser)(struct hl_device *hdev, struct hl_cs_parser *parser); + int (*asic_dma_map_sg)(struct hl_device *hdev, + struct scatterlist *sg, int nents, + enum dma_data_direction dir); + u32 (*get_dma_desc_list_size)(struct hl_device *hdev, + struct sg_table *sgt); + void (*add_end_of_cb_packets)(u64 kernel_address, u32 len, u64 cq_addr, + u32 cq_val, u32 msix_num); void (*update_eq_ci)(struct hl_device *hdev, u32 val); + int (*context_switch)(struct hl_device *hdev, u32 asid); + void (*restore_phase_topology)(struct hl_device *hdev); void (*add_device_attr)(struct hl_device *hdev, struct attribute_group *dev_attr_grp); void (*handle_eqe)(struct hl_device *hdev, @@ -430,6 +486,7 @@ struct hl_asic_funcs { int (*send_heartbeat)(struct hl_device *hdev); void (*enable_clock_gating)(struct hl_device *hdev); void (*disable_clock_gating)(struct hl_device *hdev); + bool (*is_device_idle)(struct hl_device *hdev); int (*soft_reset_late_init)(struct hl_device *hdev); void (*hw_queues_lock)(struct hl_device *hdev); void (*hw_queues_unlock)(struct hl_device *hdev); @@ -453,12 +510,28 @@ struct hl_asic_funcs { * @hdev: pointer to the device structure. * @refcount: reference counter for the context. Context is released only when * this hits 0l. It is incremented on CS and CS_WAIT. + * @cs_pending: array of DMA fence objects representing pending CS. + * @cs_sequence: sequence number for CS. Value is assigned to a CS and passed + * to user so user could inquire about CS. It is used as + * index to cs_pending array. + * @cs_lock: spinlock to protect cs_sequence. + * @thread_restore_token: token to prevent multiple threads of the same context + * from running the restore phase. Only one thread + * should run it. + * @thread_restore_wait_token: token to prevent the threads that didn't run + * the restore phase from moving to their execution + * phase before the restore phase has finished. * @asid: context's unique address space ID in the device's MMU. */ struct hl_ctx { struct hl_fpriv *hpriv; struct hl_device *hdev; struct kref refcount; + struct dma_fence *cs_pending[HL_MAX_PENDING_CS]; + u64 cs_sequence; + spinlock_t cs_lock; + atomic_t thread_restore_token; + u32 thread_restore_wait_token; u32 asid; }; @@ -473,14 +546,129 @@ struct hl_ctx_mgr { }; + +/* + * COMMAND SUBMISSIONS + */ + +/** + * struct hl_userptr - memory mapping chunk information + * @vm_type: type of the VM. + * @job_node: linked-list node for hanging the object on the Job's list. + * @vec: pointer to the frame vector. + * @sgt: pointer to the scatter-gather table that holds the pages. + * @dir: for DMA unmapping, the direction must be supplied, so save it. + * @debugfs_list: node in debugfs list of command submissions. + * @addr: user-space virtual pointer to the start of the memory area. + * @size: size of the memory area to pin & map. + * @dma_mapped: true if the SG was mapped to DMA addresses, false otherwise. + */ +struct hl_userptr { + enum vm_type_t vm_type; /* must be first */ + struct list_head job_node; + struct frame_vector *vec; + struct sg_table *sgt; + enum dma_data_direction dir; + struct list_head debugfs_list; + u64 addr; + u32 size; + u8 dma_mapped; +}; + +/** + * struct hl_cs - command submission. + * @jobs_in_queue_cnt: per each queue, maintain counter of submitted jobs. + * @ctx: the context this CS belongs to. + * @job_list: list of the CS's jobs in the various queues. + * @job_lock: spinlock for the CS's jobs list. Needed for free_job. + * @refcount: reference counter for usage of the CS. + * @fence: pointer to the fence object of this CS. + * @work_tdr: delayed work node for TDR. + * @mirror_node : node in device mirror list of command submissions. + * @sequence: the sequence number of this CS. + * @submitted: true if CS was submitted to H/W. + * @completed: true if CS was completed by device. + * @timedout : true if CS was timedout. + * @tdr_active: true if TDR was activated for this CS (to prevent + * double TDR activation). + * @aborted: true if CS was aborted due to some device error. + */ +struct hl_cs { + u8 jobs_in_queue_cnt[HL_MAX_QUEUES]; + struct hl_ctx *ctx; + struct list_head job_list; + spinlock_t job_lock; + struct kref refcount; + struct dma_fence *fence; + struct delayed_work work_tdr; + struct list_head mirror_node; + u64 sequence; + u8 submitted; + u8 completed; + u8 timedout; + u8 tdr_active; + u8 aborted; +}; + /** * struct hl_cs_job - command submission job. + * @cs_node: the node to hang on the CS jobs list. + * @cs: the CS this job belongs to. + * @user_cb: the CB we got from the user. + * @patched_cb: in case of patching, this is internal CB which is submitted on + * the queue instead of the CB we got from the IOCTL. * @finish_work: workqueue object to run when job is completed. + * @userptr_list: linked-list of userptr mappings that belong to this job and + * wait for completion. * @id: the id of this job inside a CS. + * @hw_queue_id: the id of the H/W queue this job is submitted to. + * @user_cb_size: the actual size of the CB we got from the user. + * @job_cb_size: the actual size of the CB that we put on the queue. + * @ext_queue: whether the job is for external queue or internal queue. */ struct hl_cs_job { + struct list_head cs_node; + struct hl_cs *cs; + struct hl_cb *user_cb; + struct hl_cb *patched_cb; struct work_struct finish_work; + struct list_head userptr_list; u32 id; + u32 hw_queue_id; + u32 user_cb_size; + u32 job_cb_size; + u8 ext_queue; +}; + +/** + * struct hl_cs_parser - command submission paerser properties. + * @user_cb: the CB we got from the user. + * @patched_cb: in case of patching, this is internal CB which is submitted on + * the queue instead of the CB we got from the IOCTL. + * @job_userptr_list: linked-list of userptr mappings that belong to the related + * job and wait for completion. + * @cs_sequence: the sequence number of the related CS. + * @ctx_id: the ID of the context the related CS belongs to. + * @hw_queue_id: the id of the H/W queue this job is submitted to. + * @user_cb_size: the actual size of the CB we got from the user. + * @patched_cb_size: the size of the CB after parsing. + * @ext_queue: whether the job is for external queue or internal queue. + * @job_id: the id of the related job inside the related CS. + * @use_virt_addr: whether to treat the addresses in the CB as virtual during + * parsing. + */ +struct hl_cs_parser { + struct hl_cb *user_cb; + struct hl_cb *patched_cb; + struct list_head *job_userptr_list; + u64 cs_sequence; + u32 ctx_id; + u32 hw_queue_id; + u32 user_cb_size; + u32 patched_cb_size; + u8 ext_queue; + u8 job_id; + u8 use_virt_addr; }; @@ -497,6 +685,7 @@ struct hl_cs_job { * @ctx_mgr: context manager to handle multiple context for this FD. * @cb_mgr: command buffer manager to handle multiple buffers for this FD. * @refcount: number of related contexts. + * @restore_phase_mutex: lock for context switch and restore phase. */ struct hl_fpriv { struct hl_device *hdev; @@ -506,6 +695,7 @@ struct hl_fpriv { struct hl_ctx_mgr ctx_mgr; struct hl_cb_mgr cb_mgr; struct kref refcount; + struct mutex restore_phase_mutex; }; @@ -577,6 +767,8 @@ struct hl_device_reset_work { * @eq_wq: work queue of event queue for executing work in process context. * @kernel_ctx: KMD context structure. * @kernel_queues: array of hl_hw_queue. + * @hw_queues_mirror_list: CS mirror list for TDR. + * @hw_queues_mirror_lock: protects hw_queues_mirror_list. * @kernel_cb_mgr: command buffer manager for creating/destroying/handling CGs. * @event_queue: event queue for IRQ from ArmCP. * @dma_pool: DMA pool for small allocations. @@ -604,6 +796,7 @@ struct hl_device_reset_work { * @in_reset: is device in reset flow. * @curr_pll_profile: current PLL profile. * @fd_open_cnt: number of open user processes. + * @timeout_jiffies: device CS timeout value. * @max_power: the max power of the device, as configured by the sysadmin. This * value is saved so in case of hard-reset, KMD will restore this * value and update the F/W after the re-initialization @@ -617,7 +810,10 @@ struct hl_device_reset_work { * @hwmon_initialized: is H/W monitor sensors was initialized. * @hard_reset_pending: is there a hard reset work pending. * @heartbeat: is heartbeat sanity check towards ArmCP enabled. + * @reset_on_lockup: true if a reset should be done in case of stuck CS, false + * otherwise. * @init_done: is the initialization of the device done. + * @mmu_enable: is MMU enabled. */ struct hl_device { struct pci_dev *pdev; @@ -634,6 +830,8 @@ struct hl_device { struct workqueue_struct *eq_wq; struct hl_ctx *kernel_ctx; struct hl_hw_queue *kernel_queues; + struct list_head hw_queues_mirror_list; + spinlock_t hw_queues_mirror_lock; struct hl_cb_mgr kernel_cb_mgr; struct hl_eq event_queue; struct dma_pool *dma_pool; @@ -661,6 +859,7 @@ struct hl_device { atomic_t in_reset; atomic_t curr_pll_profile; atomic_t fd_open_cnt; + u64 timeout_jiffies; u64 max_power; u32 major; u32 high_pll; @@ -672,9 +871,11 @@ struct hl_device { u8 hwmon_initialized; u8 hard_reset_pending; u8 heartbeat; + u8 reset_on_lockup; u8 init_done; /* Parameters for bring-up */ + u8 mmu_enable; u8 cpu_enable; u8 reset_pcilink; u8 cpu_queues_enable; @@ -712,6 +913,58 @@ struct hl_ioctl_desc { * Kernel module functions that can be accessed by entire module */ +/** + * hl_mem_area_inside_range() - Checks whether address+size are inside a range. + * @address: The start address of the area we want to validate. + * @size: The size in bytes of the area we want to validate. + * @range_start_address: The start address of the valid range. + * @range_end_address: The end address of the valid range. + * + * Return: true if the area is inside the valid range, false otherwise. + */ +static inline bool hl_mem_area_inside_range(u64 address, u32 size, + u64 range_start_address, u64 range_end_address) +{ + u64 end_address = address + size; + + if ((address >= range_start_address) && + (end_address <= range_end_address) && + (end_address > address)) + return true; + + return false; +} + +/** + * hl_mem_area_crosses_range() - Checks whether address+size crossing a range. + * @address: The start address of the area we want to validate. + * @size: The size in bytes of the area we want to validate. + * @range_start_address: The start address of the valid range. + * @range_end_address: The end address of the valid range. + * + * Return: true if the area overlaps part or all of the valid range, + * false otherwise. + */ +static inline bool hl_mem_area_crosses_range(u64 address, u32 size, + u64 range_start_address, u64 range_end_address) +{ + u64 end_address = address + size; + + if ((address >= range_start_address) && + (address < range_end_address)) + return true; + + if ((end_address >= range_start_address) && + (end_address < range_end_address)) + return true; + + if ((address < range_start_address) && + (end_address >= range_end_address)) + return true; + + return false; +} + int hl_device_open(struct inode *inode, struct file *filp); bool hl_device_disabled_or_in_reset(struct hl_device *hdev); int create_hdev(struct hl_device **dev, struct pci_dev *pdev, @@ -725,8 +978,10 @@ int hl_hw_queues_create(struct hl_device *hdev); void hl_hw_queues_destroy(struct hl_device *hdev); int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id, u32 cb_size, u64 cb_ptr); +int hl_hw_queue_schedule_cs(struct hl_cs *cs); u32 hl_hw_queue_add_ptr(u32 ptr, u16 val); void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id); +void hl_int_hw_queue_update_ci(struct hl_cs *cs); void hl_hw_queue_reset(struct hl_device *hdev, bool hard_reset); #define hl_queue_inc_ptr(p) hl_hw_queue_add_ptr(p, 1) @@ -740,6 +995,8 @@ void hl_cq_reset(struct hl_device *hdev, struct hl_cq *q); void hl_eq_reset(struct hl_device *hdev, struct hl_eq *q); irqreturn_t hl_irq_handler_cq(int irq, void *arg); irqreturn_t hl_irq_handler_eq(int irq, void *arg); +u32 hl_cq_inc_ptr(u32 ptr); + int hl_asid_init(struct hl_device *hdev); void hl_asid_fini(struct hl_device *hdev); unsigned long hl_asid_alloc(struct hl_device *hdev); @@ -748,9 +1005,13 @@ void hl_asid_free(struct hl_device *hdev, unsigned long asid); int hl_ctx_create(struct hl_device *hdev, struct hl_fpriv *hpriv); void hl_ctx_free(struct hl_device *hdev, struct hl_ctx *ctx); int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx); +void hl_ctx_do_release(struct kref *ref); +void hl_ctx_get(struct hl_device *hdev, struct hl_ctx *ctx); int hl_ctx_put(struct hl_ctx *ctx); +struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq); void hl_ctx_mgr_init(struct hl_ctx_mgr *mgr); void hl_ctx_mgr_fini(struct hl_device *hdev, struct hl_ctx_mgr *mgr); + int hl_device_init(struct hl_device *hdev, struct class *hclass); void hl_device_fini(struct hl_device *hdev); int hl_device_suspend(struct hl_device *hdev); @@ -782,8 +1043,20 @@ struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size); int hl_cb_pool_init(struct hl_device *hdev); int hl_cb_pool_fini(struct hl_device *hdev); +void hl_cs_rollback_all(struct hl_device *hdev); +struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, bool ext_queue); + void goya_set_asic_funcs(struct hl_device *hdev); +int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u32 size, + struct hl_userptr *userptr); +int hl_unpin_host_memory(struct hl_device *hdev, struct hl_userptr *userptr); +void hl_userptr_delete_list(struct hl_device *hdev, + struct list_head *userptr_list); +bool hl_userptr_is_pinned(struct hl_device *hdev, u64 addr, u32 size, + struct list_head *userptr_list, + struct hl_userptr **userptr); + long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr); void hl_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq); long hl_get_temperature(struct hl_device *hdev, int sensor_index, u32 attr); @@ -799,5 +1072,7 @@ void hl_set_max_power(struct hl_device *hdev, u64 value); /* IOCTLs */ long hl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data); +int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data); +int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data); #endif /* HABANALABSP_H_ */ diff --git a/drivers/misc/habanalabs/habanalabs_drv.c b/drivers/misc/habanalabs/habanalabs_drv.c index b0bf77af1e40..77a1cc85e530 100644 --- a/drivers/misc/habanalabs/habanalabs_drv.c +++ b/drivers/misc/habanalabs/habanalabs_drv.c @@ -24,6 +24,17 @@ static struct class *hl_class; DEFINE_IDR(hl_devs_idr); DEFINE_MUTEX(hl_devs_idr_lock); +static int timeout_locked = 5; +static int reset_on_lockup = 1; + +module_param(timeout_locked, int, 0444); +MODULE_PARM_DESC(timeout_locked, + "Device lockup timeout in seconds (0 = disabled, default 5s)"); + +module_param(reset_on_lockup, int, 0444); +MODULE_PARM_DESC(reset_on_lockup, + "Do device reset on lockup (0 = no, 1 = yes, default yes)"); + #define PCI_VENDOR_ID_HABANALABS 0x1da3 #define PCI_IDS_GOYA 0x0001 @@ -113,6 +124,7 @@ int hl_device_open(struct inode *inode, struct file *filp) hpriv->hdev = hdev; filp->private_data = hpriv; hpriv->filp = filp; + mutex_init(&hpriv->restore_phase_mutex); kref_init(&hpriv->refcount); nonseekable_open(inode, filp); @@ -140,6 +152,7 @@ out_err: filp->private_data = NULL; hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr); hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr); + mutex_destroy(&hpriv->restore_phase_mutex); kfree(hpriv); close_device: @@ -172,8 +185,10 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev, return -ENOMEM; hdev->major = hl_major; + hdev->reset_on_lockup = reset_on_lockup; /* Parameters for bring-up - set them to defaults */ + hdev->mmu_enable = 0; hdev->cpu_enable = 1; hdev->reset_pcilink = 0; hdev->cpu_queues_enable = 1; @@ -193,6 +208,11 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev, if (!hdev->cpu_queues_enable) hdev->heartbeat = 0; + if (timeout_locked) + hdev->timeout_jiffies = msecs_to_jiffies(timeout_locked * 1000); + else + hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT; + hdev->disabled = true; hdev->pdev = pdev; /* can be NULL in case of simulator device */ diff --git a/drivers/misc/habanalabs/habanalabs_ioctl.c b/drivers/misc/habanalabs/habanalabs_ioctl.c index e56a51f6bab6..481db1a5e97e 100644 --- a/drivers/misc/habanalabs/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/habanalabs_ioctl.c @@ -16,7 +16,9 @@ [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func} static const struct hl_ioctl_desc hl_ioctls[] = { - HL_IOCTL_DEF(HL_IOCTL_CB, hl_cb_ioctl) + HL_IOCTL_DEF(HL_IOCTL_CB, hl_cb_ioctl), + HL_IOCTL_DEF(HL_IOCTL_CS, hl_cs_ioctl), + HL_IOCTL_DEF(HL_IOCTL_WAIT_CS, hl_cs_wait_ioctl) }; #define HL_CORE_IOCTL_COUNT ARRAY_SIZE(hl_ioctls) diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c index 2ec43f36cdb8..68dfda59a875 100644 --- a/drivers/misc/habanalabs/hw_queue.c +++ b/drivers/misc/habanalabs/hw_queue.c @@ -34,6 +34,29 @@ static inline int queue_free_slots(struct hl_hw_queue *q, u32 queue_len) return (abs(delta) - queue_len); } +void hl_int_hw_queue_update_ci(struct hl_cs *cs) +{ + struct hl_device *hdev = cs->ctx->hdev; + struct hl_hw_queue *q; + int i; + + hdev->asic_funcs->hw_queues_lock(hdev); + + if (hdev->disabled) + goto out; + + q = &hdev->kernel_queues[0]; + for (i = 0 ; i < HL_MAX_QUEUES ; i++, q++) { + if (q->queue_type == QUEUE_TYPE_INT) { + q->ci += cs->jobs_in_queue_cnt[i]; + q->ci &= ((q->int_queue_len << 1) - 1); + } + } + +out: + hdev->asic_funcs->hw_queues_unlock(hdev); +} + /* * ext_queue_submit_bd - Submit a buffer descriptor to an external queue * @@ -119,6 +142,37 @@ static int ext_queue_sanity_checks(struct hl_device *hdev, return 0; } +/* + * int_queue_sanity_checks - perform some sanity checks on internal queue + * + * @hdev : pointer to hl_device structure + * @q : pointer to hl_hw_queue structure + * @num_of_entries : how many entries to check for space + * + * H/W queues spinlock should be taken before calling this function + * + * Perform the following: + * - Make sure we have enough space in the h/w queue + * + */ +static int int_queue_sanity_checks(struct hl_device *hdev, + struct hl_hw_queue *q, + int num_of_entries) +{ + int free_slots_cnt; + + /* Check we have enough space in the queue */ + free_slots_cnt = queue_free_slots(q, q->int_queue_len); + + if (free_slots_cnt < num_of_entries) { + dev_dbg(hdev->dev, "Queue %d doesn't have room for %d CBs\n", + q->hw_queue_id, num_of_entries); + return -EAGAIN; + } + + return 0; +} + /* * hl_hw_queue_send_cb_no_cmpl - send a single CB (not a JOB) without completion * @@ -165,6 +219,184 @@ out: return rc; } +/* + * ext_hw_queue_schedule_job - submit an JOB to an external queue + * + * @job: pointer to the job that needs to be submitted to the queue + * + * This function must be called when the scheduler mutex is taken + * + */ +static void ext_hw_queue_schedule_job(struct hl_cs_job *job) +{ + struct hl_device *hdev = job->cs->ctx->hdev; + struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id]; + struct hl_cq_entry cq_pkt; + struct hl_cq *cq; + u64 cq_addr; + struct hl_cb *cb; + u32 ctl; + u32 len; + u64 ptr; + + /* + * Update the JOB ID inside the BD CTL so the device would know what + * to write in the completion queue + */ + ctl = ((q->pi << BD_CTL_SHADOW_INDEX_SHIFT) & BD_CTL_SHADOW_INDEX_MASK); + + cb = job->patched_cb; + len = job->job_cb_size; + ptr = cb->bus_address; + + cq_pkt.data = (q->pi << CQ_ENTRY_SHADOW_INDEX_SHIFT) + & CQ_ENTRY_SHADOW_INDEX_MASK; + cq_pkt.data |= 1 << CQ_ENTRY_SHADOW_INDEX_VALID_SHIFT; + cq_pkt.data |= 1 << CQ_ENTRY_READY_SHIFT; + + /* + * No need to protect pi_offset because scheduling to the + * H/W queues is done under the scheduler mutex + * + * No need to check if CQ is full because it was already + * checked in hl_queue_sanity_checks + */ + cq = &hdev->completion_queue[q->hw_queue_id]; + cq_addr = cq->bus_address + + hdev->asic_prop.host_phys_base_address; + cq_addr += cq->pi * sizeof(struct hl_cq_entry); + + hdev->asic_funcs->add_end_of_cb_packets(cb->kernel_address, len, + cq_addr, cq_pkt.data, q->hw_queue_id); + + q->shadow_queue[hl_pi_2_offset(q->pi)] = job; + + cq->pi = hl_cq_inc_ptr(cq->pi); + + ext_queue_submit_bd(hdev, q, ctl, len, ptr); +} + +/* + * int_hw_queue_schedule_job - submit an JOB to an internal queue + * + * @job: pointer to the job that needs to be submitted to the queue + * + * This function must be called when the scheduler mutex is taken + * + */ +static void int_hw_queue_schedule_job(struct hl_cs_job *job) +{ + struct hl_device *hdev = job->cs->ctx->hdev; + struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id]; + struct hl_bd bd; + u64 *pi, *pbd = (u64 *) &bd; + + bd.ctl = 0; + bd.len = job->job_cb_size; + bd.ptr = (u64) (uintptr_t) job->user_cb; + + pi = (u64 *) (uintptr_t) (q->kernel_address + + ((q->pi & (q->int_queue_len - 1)) * sizeof(bd))); + + pi[0] = pbd[0]; + pi[1] = pbd[1]; + + q->pi++; + q->pi &= ((q->int_queue_len << 1) - 1); + + /* Flush PQ entry write. Relevant only for specific ASICs */ + hdev->asic_funcs->flush_pq_write(hdev, pi, pbd[0]); + + hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi); +} + +/* + * hl_hw_queue_schedule_cs - schedule a command submission + * + * @job : pointer to the CS + * + */ +int hl_hw_queue_schedule_cs(struct hl_cs *cs) +{ + struct hl_device *hdev = cs->ctx->hdev; + struct hl_cs_job *job, *tmp; + struct hl_hw_queue *q; + int rc = 0, i, cq_cnt; + + hdev->asic_funcs->hw_queues_lock(hdev); + + if (hl_device_disabled_or_in_reset(hdev)) { + dev_err(hdev->dev, + "device is disabled or in reset, CS rejected!\n"); + rc = -EPERM; + goto out; + } + + q = &hdev->kernel_queues[0]; + /* This loop assumes all external queues are consecutive */ + for (i = 0, cq_cnt = 0 ; i < HL_MAX_QUEUES ; i++, q++) { + if (q->queue_type == QUEUE_TYPE_EXT) { + if (cs->jobs_in_queue_cnt[i]) { + rc = ext_queue_sanity_checks(hdev, q, + cs->jobs_in_queue_cnt[i], true); + if (rc) + goto unroll_cq_resv; + cq_cnt++; + } + } else if (q->queue_type == QUEUE_TYPE_INT) { + if (cs->jobs_in_queue_cnt[i]) { + rc = int_queue_sanity_checks(hdev, q, + cs->jobs_in_queue_cnt[i]); + if (rc) + goto unroll_cq_resv; + } + } + } + + spin_lock(&hdev->hw_queues_mirror_lock); + list_add_tail(&cs->mirror_node, &hdev->hw_queues_mirror_list); + + /* Queue TDR if the CS is the first entry and if timeout is wanted */ + if ((hdev->timeout_jiffies != MAX_SCHEDULE_TIMEOUT) && + (list_first_entry(&hdev->hw_queues_mirror_list, + struct hl_cs, mirror_node) == cs)) { + cs->tdr_active = true; + schedule_delayed_work(&cs->work_tdr, hdev->timeout_jiffies); + spin_unlock(&hdev->hw_queues_mirror_lock); + } else { + spin_unlock(&hdev->hw_queues_mirror_lock); + } + + list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node) { + if (job->ext_queue) + ext_hw_queue_schedule_job(job); + else + int_hw_queue_schedule_job(job); + } + + cs->submitted = true; + + goto out; + +unroll_cq_resv: + /* This loop assumes all external queues are consecutive */ + q = &hdev->kernel_queues[0]; + for (i = 0 ; (i < HL_MAX_QUEUES) && (cq_cnt > 0) ; i++, q++) { + if ((q->queue_type == QUEUE_TYPE_EXT) && + (cs->jobs_in_queue_cnt[i])) { + atomic_t *free_slots = + &hdev->completion_queue[i].free_slots_cnt; + atomic_add(cs->jobs_in_queue_cnt[i], free_slots); + cq_cnt--; + } + } + +out: + hdev->asic_funcs->hw_queues_unlock(hdev); + + return rc; +} + /* * hl_hw_queue_inc_ci_kernel - increment ci for kernel's queue * diff --git a/drivers/misc/habanalabs/memory.c b/drivers/misc/habanalabs/memory.c new file mode 100644 index 000000000000..ad14376a1c25 --- /dev/null +++ b/drivers/misc/habanalabs/memory.c @@ -0,0 +1,198 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2016-2019 HabanaLabs, Ltd. + * All Rights Reserved. + */ + +#include "habanalabs.h" + +#include +#include + +/* + * hl_pin_host_memory - pins a chunk of host memory + * + * @hdev : pointer to the habanalabs device structure + * @addr : the user-space virtual address of the memory area + * @size : the size of the memory area + * @userptr : pointer to hl_userptr structure + * + * This function does the following: + * - Pins the physical pages + * - Create a SG list from those pages + */ +int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u32 size, + struct hl_userptr *userptr) +{ + u64 start, end; + u32 npages, offset; + int rc; + + if (!size) { + dev_err(hdev->dev, "size to pin is invalid - %d\n", + size); + return -EINVAL; + } + + if (!access_ok((void __user *) (uintptr_t) addr, size)) { + dev_err(hdev->dev, "user pointer is invalid - 0x%llx\n", + addr); + return -EFAULT; + } + + /* + * If the combination of the address and size requested for this memory + * region causes an integer overflow, return error. + */ + if (((addr + size) < addr) || + PAGE_ALIGN(addr + size) < (addr + size)) { + dev_err(hdev->dev, + "user pointer 0x%llx + %u causes integer overflow\n", + addr, size); + return -EINVAL; + } + + start = addr & PAGE_MASK; + offset = addr & ~PAGE_MASK; + end = PAGE_ALIGN(addr + size); + npages = (end - start) >> PAGE_SHIFT; + + userptr->size = size; + userptr->addr = addr; + userptr->dma_mapped = false; + INIT_LIST_HEAD(&userptr->job_node); + + userptr->vec = frame_vector_create(npages); + if (!userptr->vec) { + dev_err(hdev->dev, "Failed to create frame vector\n"); + return -ENOMEM; + } + + rc = get_vaddr_frames(start, npages, FOLL_FORCE | FOLL_WRITE, + userptr->vec); + + if (rc != npages) { + dev_err(hdev->dev, + "Failed to map host memory, user ptr probably wrong\n"); + if (rc < 0) + goto destroy_framevec; + rc = -EFAULT; + goto put_framevec; + } + + if (frame_vector_to_pages(userptr->vec) < 0) { + dev_err(hdev->dev, + "Failed to translate frame vector to pages\n"); + rc = -EFAULT; + goto put_framevec; + } + + userptr->sgt = kzalloc(sizeof(*userptr->sgt), GFP_ATOMIC); + if (!userptr->sgt) { + rc = -ENOMEM; + goto put_framevec; + } + + rc = sg_alloc_table_from_pages(userptr->sgt, + frame_vector_pages(userptr->vec), + npages, offset, size, GFP_ATOMIC); + if (rc < 0) { + dev_err(hdev->dev, "failed to create SG table from pages\n"); + goto free_sgt; + } + + return 0; + +free_sgt: + kfree(userptr->sgt); +put_framevec: + put_vaddr_frames(userptr->vec); +destroy_framevec: + frame_vector_destroy(userptr->vec); + return rc; +} + +/* + * hl_unpin_host_memory - unpins a chunk of host memory + * + * @hdev : pointer to the habanalabs device structure + * @userptr : pointer to hl_userptr structure + * + * This function does the following: + * - Unpins the physical pages related to the host memory + * - Free the SG list + */ +int hl_unpin_host_memory(struct hl_device *hdev, struct hl_userptr *userptr) +{ + struct page **pages; + + if (userptr->dma_mapped) + hdev->asic_funcs->hl_dma_unmap_sg(hdev, + userptr->sgt->sgl, + userptr->sgt->nents, + userptr->dir); + + pages = frame_vector_pages(userptr->vec); + if (!IS_ERR(pages)) { + int i; + + for (i = 0; i < frame_vector_count(userptr->vec); i++) + set_page_dirty_lock(pages[i]); + } + put_vaddr_frames(userptr->vec); + frame_vector_destroy(userptr->vec); + + list_del(&userptr->job_node); + + sg_free_table(userptr->sgt); + kfree(userptr->sgt); + + return 0; +} + +/* + * hl_userptr_delete_list - clear userptr list + * + * @hdev : pointer to the habanalabs device structure + * @userptr_list : pointer to the list to clear + * + * This function does the following: + * - Iterates over the list and unpins the host memory and frees the userptr + * structure. + */ +void hl_userptr_delete_list(struct hl_device *hdev, + struct list_head *userptr_list) +{ + struct hl_userptr *userptr, *tmp; + + list_for_each_entry_safe(userptr, tmp, userptr_list, job_node) { + hl_unpin_host_memory(hdev, userptr); + kfree(userptr); + } + + INIT_LIST_HEAD(userptr_list); +} + +/* + * hl_userptr_is_pinned - returns whether the given userptr is pinned + * + * @hdev : pointer to the habanalabs device structure + * @userptr_list : pointer to the list to clear + * @userptr : pointer to userptr to check + * + * This function does the following: + * - Iterates over the list and checks if the given userptr is in it, means is + * pinned. If so, returns true, otherwise returns false. + */ +bool hl_userptr_is_pinned(struct hl_device *hdev, u64 addr, + u32 size, struct list_head *userptr_list, + struct hl_userptr **userptr) +{ + list_for_each_entry((*userptr), userptr_list, job_node) { + if ((addr == (*userptr)->addr) && (size == (*userptr)->size)) + return true; + } + + return false; +} diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index 756266cf0416..fba49417f607 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -73,6 +73,95 @@ union hl_cb_args { struct hl_cb_out out; }; +/* + * This structure size must always be fixed to 64-bytes for backward + * compatibility + */ +struct hl_cs_chunk { + /* + * For external queue, this represents a Handle of CB on the Host + * For internal queue, this represents an SRAM or DRAM address of the + * internal CB + */ + __u64 cb_handle; + /* Index of queue to put the CB on */ + __u32 queue_index; + /* + * Size of command buffer with valid packets + * Can be smaller then actual CB size + */ + __u32 cb_size; + /* HL_CS_CHUNK_FLAGS_* */ + __u32 cs_chunk_flags; + /* Align structure to 64 bytes */ + __u32 pad[11]; +}; + +#define HL_CS_FLAGS_FORCE_RESTORE 0x1 + +#define HL_CS_STATUS_SUCCESS 0 + +struct hl_cs_in { + /* this holds address of array of hl_cs_chunk for restore phase */ + __u64 chunks_restore; + /* this holds address of array of hl_cs_chunk for execution phase */ + __u64 chunks_execute; + /* this holds address of array of hl_cs_chunk for store phase - + * Currently not in use + */ + __u64 chunks_store; + /* Number of chunks in restore phase array */ + __u32 num_chunks_restore; + /* Number of chunks in execution array */ + __u32 num_chunks_execute; + /* Number of chunks in restore phase array - Currently not in use */ + __u32 num_chunks_store; + /* HL_CS_FLAGS_* */ + __u32 cs_flags; + /* Context ID - Currently not in use */ + __u32 ctx_id; +}; + +struct hl_cs_out { + /* this holds the sequence number of the CS to pass to wait ioctl */ + __u64 seq; + /* HL_CS_STATUS_* */ + __u32 status; + __u32 pad; +}; + +union hl_cs_args { + struct hl_cs_in in; + struct hl_cs_out out; +}; + +struct hl_wait_cs_in { + /* Command submission sequence number */ + __u64 seq; + /* Absolute timeout to wait in microseconds */ + __u64 timeout_us; + /* Context ID - Currently not in use */ + __u32 ctx_id; + __u32 pad; +}; + +#define HL_WAIT_CS_STATUS_COMPLETED 0 +#define HL_WAIT_CS_STATUS_BUSY 1 +#define HL_WAIT_CS_STATUS_TIMEDOUT 2 +#define HL_WAIT_CS_STATUS_ABORTED 3 +#define HL_WAIT_CS_STATUS_INTERRUPTED 4 + +struct hl_wait_cs_out { + /* HL_WAIT_CS_STATUS_* */ + __u32 status; + __u32 pad; +}; + +union hl_wait_cs_args { + struct hl_wait_cs_in in; + struct hl_wait_cs_out out; +}; + /* * Command Buffer * - Request a Command Buffer @@ -89,7 +178,74 @@ union hl_cb_args { #define HL_IOCTL_CB \ _IOWR('H', 0x02, union hl_cb_args) +/* + * Command Submission + * + * To submit work to the device, the user need to call this IOCTL with a set + * of JOBS. That set of JOBS constitutes a CS object. + * Each JOB will be enqueued on a specific queue, according to the user's input. + * There can be more then one JOB per queue. + * + * There are two types of queues - external and internal. External queues + * are DMA queues which transfer data from/to the Host. All other queues are + * internal. The driver will get completion notifications from the device only + * on JOBS which are enqueued in the external queues. + * + * This IOCTL is asynchronous in regard to the actual execution of the CS. This + * means it returns immediately after ALL the JOBS were enqueued on their + * relevant queues. Therefore, the user mustn't assume the CS has been completed + * or has even started to execute. + * + * Upon successful enqueue, the IOCTL returns an opaque handle which the user + * can use with the "Wait for CS" IOCTL to check whether the handle's CS + * external JOBS have been completed. Note that if the CS has internal JOBS + * which can execute AFTER the external JOBS have finished, the driver might + * report that the CS has finished executing BEFORE the internal JOBS have + * actually finish executing. + * + * The CS IOCTL will receive three sets of JOBS. One set is for "restore" phase, + * a second set is for "execution" phase and a third set is for "store" phase. + * The JOBS on the "restore" phase are enqueued only after context-switch + * (or if its the first CS for this context). The user can also order the + * driver to run the "restore" phase explicitly + * + */ +#define HL_IOCTL_CS \ + _IOWR('H', 0x03, union hl_cs_args) + +/* + * Wait for Command Submission + * + * The user can call this IOCTL with a handle it received from the CS IOCTL + * to wait until the handle's CS has finished executing. The user will wait + * inside the kernel until the CS has finished or until the user-requeusted + * timeout has expired. + * + * The return value of the IOCTL is a standard Linux error code. The possible + * values are: + * + * EINTR - Kernel waiting has been interrupted, e.g. due to OS signal + * that the user process received + * ETIMEDOUT - The CS has caused a timeout on the device + * EIO - The CS was aborted (usually because the device was reset) + * ENODEV - The device wants to do hard-reset (so user need to close FD) + * + * The driver also returns a custom define inside the IOCTL which can be: + * + * HL_WAIT_CS_STATUS_COMPLETED - The CS has been completed successfully (0) + * HL_WAIT_CS_STATUS_BUSY - The CS is still executing (0) + * HL_WAIT_CS_STATUS_TIMEDOUT - The CS has caused a timeout on the device + * (ETIMEDOUT) + * HL_WAIT_CS_STATUS_ABORTED - The CS was aborted, usually because the + * device was reset (EIO) + * HL_WAIT_CS_STATUS_INTERRUPTED - Waiting for the CS was interrupted (EINTR) + * + */ + +#define HL_IOCTL_WAIT_CS \ + _IOWR('H', 0x04, union hl_wait_cs_args) + #define HL_COMMAND_START 0x02 -#define HL_COMMAND_END 0x03 +#define HL_COMMAND_END 0x05 #endif /* HABANALABS_H_ */ -- cgit v1.3-7-g2ca7 From 0feaf86d4e69507ab9b2af7dcc63a6886352d5db Mon Sep 17 00:00:00 2001 From: Omer Shpigelman Date: Sat, 16 Feb 2019 00:39:22 +0200 Subject: habanalabs: add virtual memory and MMU modules This patch adds the Virtual Memory and MMU modules. Goya has an internal MMU which provides process isolation on the internal DDR. The internal MMU also performs translations for transactions that go from Goya to the Host. The driver is responsible for allocating and freeing memory on the DDR upon user request. It also provides an interface to map and unmap DDR and Host memory to the device address space. The MMU in Goya supports 3-level and 4-level page tables. With 3-level, the size of each page is 2MB, while with 4-level the size of each page is 4KB. In the DDR, the physical pages are always 2MB. Reviewed-by: Mike Rapoport Signed-off-by: Omer Shpigelman Signed-off-by: Oded Gabbay Signed-off-by: Greg Kroah-Hartman --- drivers/misc/habanalabs/Makefile | 2 +- drivers/misc/habanalabs/context.c | 19 +- drivers/misc/habanalabs/device.c | 20 +- drivers/misc/habanalabs/goya/goya.c | 395 +++++ drivers/misc/habanalabs/habanalabs.h | 189 ++- drivers/misc/habanalabs/habanalabs_drv.c | 2 +- drivers/misc/habanalabs/habanalabs_ioctl.c | 3 +- .../habanalabs/include/hw_ip/mmu/mmu_general.h | 46 + .../misc/habanalabs/include/hw_ip/mmu/mmu_v1_0.h | 15 + drivers/misc/habanalabs/memory.c | 1517 ++++++++++++++++++++ drivers/misc/habanalabs/mmu.c | 691 +++++++++ include/uapi/misc/habanalabs.h | 122 +- 12 files changed, 3013 insertions(+), 8 deletions(-) create mode 100644 drivers/misc/habanalabs/include/hw_ip/mmu/mmu_general.h create mode 100644 drivers/misc/habanalabs/include/hw_ip/mmu/mmu_v1_0.h create mode 100644 drivers/misc/habanalabs/mmu.c (limited to 'include/uapi') diff --git a/drivers/misc/habanalabs/Makefile b/drivers/misc/habanalabs/Makefile index d2fd0e18b1eb..fd46f8b48bab 100644 --- a/drivers/misc/habanalabs/Makefile +++ b/drivers/misc/habanalabs/Makefile @@ -6,7 +6,7 @@ obj-m := habanalabs.o habanalabs-y := habanalabs_drv.o device.o context.o asid.o habanalabs_ioctl.o \ command_buffer.o hw_queue.o irq.o sysfs.o hwmon.o memory.o \ - command_submission.o + command_submission.o mmu.o include $(src)/goya/Makefile habanalabs-y += $(HL_GOYA_FILES) diff --git a/drivers/misc/habanalabs/context.c b/drivers/misc/habanalabs/context.c index c3854714b46c..619ace1c4ef7 100644 --- a/drivers/misc/habanalabs/context.c +++ b/drivers/misc/habanalabs/context.c @@ -25,8 +25,10 @@ static void hl_ctx_fini(struct hl_ctx *ctx) for (i = 0 ; i < HL_MAX_PENDING_CS ; i++) dma_fence_put(ctx->cs_pending[i]); - if (ctx->asid != HL_KERNEL_ASID_ID) + if (ctx->asid != HL_KERNEL_ASID_ID) { + hl_vm_ctx_fini(ctx); hl_asid_free(hdev, ctx->asid); + } } void hl_ctx_do_release(struct kref *ref) @@ -96,6 +98,8 @@ void hl_ctx_free(struct hl_device *hdev, struct hl_ctx *ctx) int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx) { + int rc = 0; + ctx->hdev = hdev; kref_init(&ctx->refcount); @@ -113,9 +117,22 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx) dev_err(hdev->dev, "No free ASID, failed to create context\n"); return -ENOMEM; } + + rc = hl_vm_ctx_init(ctx); + if (rc) { + dev_err(hdev->dev, "Failed to init mem ctx module\n"); + rc = -ENOMEM; + goto mem_ctx_err; + } } return 0; + +mem_ctx_err: + if (ctx->asid != HL_KERNEL_ASID_ID) + hl_asid_free(hdev, ctx->asid); + + return rc; } void hl_ctx_get(struct hl_device *hdev, struct hl_ctx *ctx) diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c index cc5f068df597..d0929022655b 100644 --- a/drivers/misc/habanalabs/device.c +++ b/drivers/misc/habanalabs/device.c @@ -615,8 +615,10 @@ again: /* Reset the H/W. It will be in idle state after this returns */ hdev->asic_funcs->hw_fini(hdev, hard_reset); - if (hard_reset) + if (hard_reset) { + hl_vm_fini(hdev); hl_eq_reset(hdev, &hdev->event_queue); + } /* Re-initialize PI,CI to 0 in all queues (hw queue, cq) */ hl_hw_queue_reset(hdev, hard_reset); @@ -677,6 +679,13 @@ again: goto out_err; } + rc = hl_vm_init(hdev); + if (rc) { + dev_err(hdev->dev, + "Failed to init memory module after hard reset\n"); + goto out_err; + } + hl_set_max_power(hdev, hdev->max_power); hdev->hard_reset_pending = false; @@ -861,6 +870,13 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass) hdev->asic_name, hdev->asic_prop.dram_size / 1024 / 1024 / 1024); + rc = hl_vm_init(hdev); + if (rc) { + dev_err(hdev->dev, "Failed to initialize memory module\n"); + rc = 0; + goto out_disabled; + } + /* * hl_hwmon_init must be called after device_late_init, because only * there we get the information from the device about which @@ -977,6 +993,8 @@ void hl_device_fini(struct hl_device *hdev) /* Reset the H/W. It will be in idle state after this returns */ hdev->asic_funcs->hw_fini(hdev, true); + hl_vm_fini(hdev); + hl_eq_fini(hdev, &hdev->event_queue); for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index e3878fd7dc94..89b82b989966 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -6,6 +6,8 @@ */ #include "goyaP.h" +#include "include/hw_ip/mmu/mmu_general.h" +#include "include/hw_ip/mmu/mmu_v1_0.h" #include "include/goya/asic_reg/goya_masks.h" #include @@ -80,6 +82,7 @@ #define GOYA_PLDM_RESET_WAIT_MSEC 1000 /* 1s */ #define GOYA_CPU_TIMEOUT_USEC 10000000 /* 10s */ #define GOYA_TEST_QUEUE_WAIT_USEC 100000 /* 100ms */ +#define GOYA_PLDM_MMU_TIMEOUT_USEC (MMU_CONFIG_TIMEOUT_USEC * 100) #define GOYA_QMAN0_FENCE_VAL 0xD169B243 @@ -131,6 +134,70 @@ static const char *goya_axi_name[GOYA_MAX_INITIATORS] = { "MMU" }; +static u64 goya_mmu_regs[GOYA_MMU_REGS_NUM] = { + mmDMA_QM_0_GLBL_NON_SECURE_PROPS, + mmDMA_QM_1_GLBL_NON_SECURE_PROPS, + mmDMA_QM_2_GLBL_NON_SECURE_PROPS, + mmDMA_QM_3_GLBL_NON_SECURE_PROPS, + mmDMA_QM_4_GLBL_NON_SECURE_PROPS, + mmTPC0_QM_GLBL_SECURE_PROPS, + mmTPC0_QM_GLBL_NON_SECURE_PROPS, + mmTPC0_CMDQ_GLBL_SECURE_PROPS, + mmTPC0_CMDQ_GLBL_NON_SECURE_PROPS, + mmTPC0_CFG_ARUSER, + mmTPC0_CFG_AWUSER, + mmTPC1_QM_GLBL_SECURE_PROPS, + mmTPC1_QM_GLBL_NON_SECURE_PROPS, + mmTPC1_CMDQ_GLBL_SECURE_PROPS, + mmTPC1_CMDQ_GLBL_NON_SECURE_PROPS, + mmTPC1_CFG_ARUSER, + mmTPC1_CFG_AWUSER, + mmTPC2_QM_GLBL_SECURE_PROPS, + mmTPC2_QM_GLBL_NON_SECURE_PROPS, + mmTPC2_CMDQ_GLBL_SECURE_PROPS, + mmTPC2_CMDQ_GLBL_NON_SECURE_PROPS, + mmTPC2_CFG_ARUSER, + mmTPC2_CFG_AWUSER, + mmTPC3_QM_GLBL_SECURE_PROPS, + mmTPC3_QM_GLBL_NON_SECURE_PROPS, + mmTPC3_CMDQ_GLBL_SECURE_PROPS, + mmTPC3_CMDQ_GLBL_NON_SECURE_PROPS, + mmTPC3_CFG_ARUSER, + mmTPC3_CFG_AWUSER, + mmTPC4_QM_GLBL_SECURE_PROPS, + mmTPC4_QM_GLBL_NON_SECURE_PROPS, + mmTPC4_CMDQ_GLBL_SECURE_PROPS, + mmTPC4_CMDQ_GLBL_NON_SECURE_PROPS, + mmTPC4_CFG_ARUSER, + mmTPC4_CFG_AWUSER, + mmTPC5_QM_GLBL_SECURE_PROPS, + mmTPC5_QM_GLBL_NON_SECURE_PROPS, + mmTPC5_CMDQ_GLBL_SECURE_PROPS, + mmTPC5_CMDQ_GLBL_NON_SECURE_PROPS, + mmTPC5_CFG_ARUSER, + mmTPC5_CFG_AWUSER, + mmTPC6_QM_GLBL_SECURE_PROPS, + mmTPC6_QM_GLBL_NON_SECURE_PROPS, + mmTPC6_CMDQ_GLBL_SECURE_PROPS, + mmTPC6_CMDQ_GLBL_NON_SECURE_PROPS, + mmTPC6_CFG_ARUSER, + mmTPC6_CFG_AWUSER, + mmTPC7_QM_GLBL_SECURE_PROPS, + mmTPC7_QM_GLBL_NON_SECURE_PROPS, + mmTPC7_CMDQ_GLBL_SECURE_PROPS, + mmTPC7_CMDQ_GLBL_NON_SECURE_PROPS, + mmTPC7_CFG_ARUSER, + mmTPC7_CFG_AWUSER, + mmMME_QM_GLBL_SECURE_PROPS, + mmMME_QM_GLBL_NON_SECURE_PROPS, + mmMME_CMDQ_GLBL_SECURE_PROPS, + mmMME_CMDQ_GLBL_NON_SECURE_PROPS, + mmMME_SBA_CONTROL_DATA, + mmMME_SBB_CONTROL_DATA, + mmMME_SBC_CONTROL_DATA, + mmMME_WBC_CONTROL_DATA +}; + #define GOYA_ASYC_EVENT_GROUP_NON_FATAL_SIZE 121 static u32 goya_non_fatal_events[GOYA_ASYC_EVENT_GROUP_NON_FATAL_SIZE] = { @@ -258,6 +325,10 @@ static u32 goya_non_fatal_events[GOYA_ASYC_EVENT_GROUP_NON_FATAL_SIZE] = { }; static int goya_armcp_info_get(struct hl_device *hdev); +static void goya_mmu_prepare(struct hl_device *hdev, u32 asid); +static int goya_mmu_clear_pgt_range(struct hl_device *hdev); +static int goya_mmu_update_asid_hop0_addr(struct hl_device *hdev, u32 asid, + u64 phys_addr); static void goya_get_fixed_properties(struct hl_device *hdev) { @@ -296,6 +367,16 @@ static void goya_get_fixed_properties(struct hl_device *hdev) prop->sram_user_base_address = prop->sram_base_address + SRAM_USER_BASE_OFFSET; + prop->mmu_pgt_addr = MMU_PAGE_TABLES_ADDR; + if (hdev->pldm) + prop->mmu_pgt_size = 0x800000; /* 8MB */ + else + prop->mmu_pgt_size = MMU_PAGE_TABLES_SIZE; + prop->mmu_pte_size = HL_PTE_SIZE; + prop->mmu_hop_table_size = HOP_TABLE_SIZE; + prop->mmu_hop0_tables_total_size = HOP0_TABLES_TOTAL_SIZE; + prop->dram_page_size = PAGE_SIZE_2MB; + prop->host_phys_base_address = HOST_PHYS_BASE; prop->va_space_host_start_address = VA_HOST_SPACE_START; prop->va_space_host_end_address = VA_HOST_SPACE_END; @@ -752,7 +833,18 @@ static int goya_late_init(struct hl_device *hdev) goya_fetch_psoc_frequency(hdev); + rc = goya_mmu_clear_pgt_range(hdev); + if (rc) { + dev_err(hdev->dev, "Failed to clear MMU page tables range\n"); + goto disable_pci_access; + } + return 0; + +disable_pci_access: + goya_send_pci_access_msg(hdev, ARMCP_PACKET_DISABLE_PCI_ACCESS); + + return rc; } /* @@ -2565,6 +2657,54 @@ out: return 0; } +static int goya_mmu_init(struct hl_device *hdev) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct goya_device *goya = hdev->asic_specific; + u64 hop0_addr; + int rc, i; + + if (!hdev->mmu_enable) + return 0; + + if (goya->hw_cap_initialized & HW_CAP_MMU) + return 0; + + hdev->dram_supports_virtual_memory = true; + + for (i = 0 ; i < prop->max_asid ; i++) { + hop0_addr = prop->mmu_pgt_addr + + (i * prop->mmu_hop_table_size); + + rc = goya_mmu_update_asid_hop0_addr(hdev, i, hop0_addr); + if (rc) { + dev_err(hdev->dev, + "failed to set hop0 addr for asid %d\n", i); + goto err; + } + } + + goya->hw_cap_initialized |= HW_CAP_MMU; + + /* init MMU cache manage page */ + WREG32(mmSTLB_CACHE_INV_BASE_39_8, MMU_CACHE_MNG_ADDR >> 8); + WREG32(mmSTLB_CACHE_INV_BASE_49_40, MMU_CACHE_MNG_ADDR << 40); + + /* Remove follower feature due to performance bug */ + WREG32_AND(mmSTLB_STLB_FEATURE_EN, + (~STLB_STLB_FEATURE_EN_FOLLOWER_EN_MASK)); + + hdev->asic_funcs->mmu_invalidate_cache(hdev, true); + + WREG32(mmMMU_MMU_ENABLE, 1); + WREG32(mmMMU_SPI_MASK, 0xF); + + return 0; + +err: + return rc; +} + /* * goya_hw_init - Goya hardware initialization code * @@ -2614,6 +2754,10 @@ static int goya_hw_init(struct hl_device *hdev) return rc; } + rc = goya_mmu_init(hdev); + if (rc) + return rc; + goya_init_security(hdev); goya_init_dma_qmans(hdev); @@ -4249,6 +4393,10 @@ int goya_context_switch(struct hl_device *hdev, u32 asid) rc = goya_send_job_on_qman0(hdev, job); + /* no point in setting the asid in case of failure */ + if (!rc) + goya_mmu_prepare(hdev, asid); + job->patched_cb->cs_cnt--; hl_cb_put(job->patched_cb); @@ -4284,6 +4432,22 @@ void goya_restore_phase_topology(struct hl_device *hdev) i = RREG32(mmSYNC_MNGR_SOB_OBJ_0); } +static u64 goya_read_pte(struct hl_device *hdev, u64 addr) +{ + struct goya_device *goya = hdev->asic_specific; + + return readq(hdev->pcie_bar[DDR_BAR_ID] + + (addr - goya->ddr_bar_cur_addr)); +} + +static void goya_write_pte(struct hl_device *hdev, u64 addr, u64 val) +{ + struct goya_device *goya = hdev->asic_specific; + + writeq(val, hdev->pcie_bar[DDR_BAR_ID] + + (addr - goya->ddr_bar_cur_addr)); +} + static void goya_get_axi_name(struct hl_device *hdev, u32 agent_id, u16 event_type, char *axi_name, int len) { @@ -4567,6 +4731,233 @@ void *goya_get_events_stat(struct hl_device *hdev, u32 *size) return goya->events_stat; } +static int goya_mmu_clear_pgt_range(struct hl_device *hdev) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct goya_device *goya = hdev->asic_specific; + struct packet_lin_dma *clear_pgt_range_pkt; + struct hl_cs_parser parser; + struct hl_cs_job *job; + u32 cb_size; + struct hl_cb *cb; + int rc; + + if (!(goya->hw_cap_initialized & HW_CAP_MMU)) + return 0; + + cb = hl_cb_kernel_create(hdev, PAGE_SIZE); + if (!cb) + return -EFAULT; + + clear_pgt_range_pkt = (struct packet_lin_dma *) + (uintptr_t) cb->kernel_address; + + memset(clear_pgt_range_pkt, 0, sizeof(*clear_pgt_range_pkt)); + cb_size = sizeof(*clear_pgt_range_pkt); + + clear_pgt_range_pkt->ctl = + ((PACKET_LIN_DMA << GOYA_PKT_CTL_OPCODE_SHIFT) | + (DMA_HOST_TO_DRAM << GOYA_PKT_LIN_DMA_CTL_DMA_DIR_SHIFT) | + (1 << GOYA_PKT_LIN_DMA_CTL_MEMSET_SHIFT) | + (1 << GOYA_PKT_LIN_DMA_CTL_WO_SHIFT) | + (1 << GOYA_PKT_CTL_RB_SHIFT) | + (1 << GOYA_PKT_CTL_MB_SHIFT)); + + clear_pgt_range_pkt->src_addr = 0; + clear_pgt_range_pkt->dst_addr = prop->mmu_pgt_addr; + clear_pgt_range_pkt->tsize = prop->mmu_pgt_size + MMU_CACHE_MNG_SIZE; + + job = hl_cs_allocate_job(hdev, true); + if (!job) { + dev_err(hdev->dev, "Failed to allocate a new job\n"); + rc = -ENOMEM; + goto release_cb; + } + + job->id = 0; + job->user_cb = cb; + job->user_cb->cs_cnt++; + job->user_cb_size = cb_size; + job->hw_queue_id = GOYA_QUEUE_ID_DMA_0; + + parser.ctx_id = HL_KERNEL_ASID_ID; + parser.cs_sequence = 0; + parser.job_id = job->id; + parser.hw_queue_id = job->hw_queue_id; + parser.job_userptr_list = &job->userptr_list; + parser.user_cb = job->user_cb; + parser.user_cb_size = job->user_cb_size; + parser.ext_queue = job->ext_queue; + parser.use_virt_addr = hdev->mmu_enable; + + rc = hdev->asic_funcs->cs_parser(hdev, &parser); + if (rc) { + dev_err(hdev->dev, + "Failed to parse kernel CB when clearing pgt\n"); + goto free_job; + } + + job->patched_cb = parser.patched_cb; + job->job_cb_size = parser.patched_cb_size; + job->patched_cb->cs_cnt++; + + rc = goya_send_job_on_qman0(hdev, job); + + job->patched_cb->cs_cnt--; + hl_cb_put(job->patched_cb); + +free_job: + hl_userptr_delete_list(hdev, &job->userptr_list); + kfree(job); + cb->cs_cnt--; + +release_cb: + hl_cb_put(cb); + hl_cb_destroy(hdev, &hdev->kernel_cb_mgr, cb->id << PAGE_SHIFT); + + return rc; +} + +static void goya_mmu_prepare(struct hl_device *hdev, u32 asid) +{ + struct goya_device *goya = hdev->asic_specific; + int i; + + if (!(goya->hw_cap_initialized & HW_CAP_MMU)) + return; + + if (asid & ~MME_QM_GLBL_SECURE_PROPS_ASID_MASK) { + WARN(1, "asid %u is too big\n", asid); + return; + } + + /* zero the MMBP and ASID bits and then set the ASID */ + for (i = 0 ; i < GOYA_MMU_REGS_NUM ; i++) { + WREG32_AND(goya_mmu_regs[i], ~0x7FF); + WREG32_OR(goya_mmu_regs[i], asid); + } +} + +static void goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard) +{ + struct goya_device *goya = hdev->asic_specific; + u32 status, timeout_usec; + int rc; + + if (!(goya->hw_cap_initialized & HW_CAP_MMU)) + return; + + /* no need in L1 only invalidation in Goya */ + if (!is_hard) + return; + + if (hdev->pldm) + timeout_usec = GOYA_PLDM_MMU_TIMEOUT_USEC; + else + timeout_usec = MMU_CONFIG_TIMEOUT_USEC; + + mutex_lock(&hdev->mmu_cache_lock); + + /* L0 & L1 invalidation */ + WREG32(mmSTLB_INV_ALL_START, 1); + + rc = hl_poll_timeout( + hdev, + mmSTLB_INV_ALL_START, + status, + !status, + 1000, + timeout_usec); + + mutex_unlock(&hdev->mmu_cache_lock); + + if (rc) + dev_notice_ratelimited(hdev->dev, + "Timeout when waiting for MMU cache invalidation\n"); +} + +static void goya_mmu_invalidate_cache_range(struct hl_device *hdev, + bool is_hard, u32 asid, u64 va, u64 size) +{ + struct goya_device *goya = hdev->asic_specific; + u32 status, timeout_usec, inv_data, pi; + int rc; + + if (!(goya->hw_cap_initialized & HW_CAP_MMU)) + return; + + /* no need in L1 only invalidation in Goya */ + if (!is_hard) + return; + + if (hdev->pldm) + timeout_usec = GOYA_PLDM_MMU_TIMEOUT_USEC; + else + timeout_usec = MMU_CONFIG_TIMEOUT_USEC; + + mutex_lock(&hdev->mmu_cache_lock); + + /* + * TODO: currently invalidate entire L0 & L1 as in regular hard + * invalidation. Need to apply invalidation of specific cache lines with + * mask of ASID & VA & size. + * Note that L1 with be flushed entirely in any case. + */ + + /* L0 & L1 invalidation */ + inv_data = RREG32(mmSTLB_CACHE_INV); + /* PI is 8 bit */ + pi = ((inv_data & STLB_CACHE_INV_PRODUCER_INDEX_MASK) + 1) & 0xFF; + WREG32(mmSTLB_CACHE_INV, + (inv_data & STLB_CACHE_INV_INDEX_MASK_MASK) | pi); + + rc = hl_poll_timeout( + hdev, + mmSTLB_INV_CONSUMER_INDEX, + status, + status == pi, + 1000, + timeout_usec); + + mutex_unlock(&hdev->mmu_cache_lock); + + if (rc) + dev_notice_ratelimited(hdev->dev, + "Timeout when waiting for MMU cache invalidation\n"); +} + +static int goya_mmu_update_asid_hop0_addr(struct hl_device *hdev, u32 asid, + u64 phys_addr) +{ + u32 status, timeout_usec; + int rc; + + if (hdev->pldm) + timeout_usec = GOYA_PLDM_MMU_TIMEOUT_USEC; + else + timeout_usec = MMU_CONFIG_TIMEOUT_USEC; + + WREG32(MMU_HOP0_PA43_12, phys_addr >> MMU_HOP0_PA43_12_SHIFT); + WREG32(MMU_HOP0_PA49_44, phys_addr >> MMU_HOP0_PA49_44_SHIFT); + WREG32(MMU_ASID_BUSY, 0x80000000 | asid); + + rc = hl_poll_timeout( + hdev, + MMU_ASID_BUSY, + status, + !(status & 0x80000000), + 1000, + timeout_usec); + + if (rc) { + dev_err(hdev->dev, + "Timeout during MMU hop0 config of asid %d\n", asid); + return rc; + } + + return 0; +} + int goya_send_heartbeat(struct hl_device *hdev) { struct goya_device *goya = hdev->asic_specific; @@ -4830,6 +5221,10 @@ static const struct hl_asic_funcs goya_funcs = { .handle_eqe = goya_handle_eqe, .set_pll_profile = goya_set_pll_profile, .get_events_stat = goya_get_events_stat, + .read_pte = goya_read_pte, + .write_pte = goya_write_pte, + .mmu_invalidate_cache = goya_mmu_invalidate_cache, + .mmu_invalidate_cache_range = goya_mmu_invalidate_cache_range, .send_heartbeat = goya_send_heartbeat, .enable_clock_gating = goya_init_clock_gating, .disable_clock_gating = goya_disable_clock_gating, diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index 9adc7c6ec08b..03085e7a12dd 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -19,6 +19,7 @@ #include #include #include +#include #define HL_NAME "habanalabs" @@ -39,6 +40,31 @@ /* MUST BE POWER OF 2 and larger than 1 */ #define HL_MAX_PENDING_CS 64 +/* Memory */ +#define MEM_HASH_TABLE_BITS 7 /* 1 << 7 buckets */ + +/* MMU */ +#define MMU_HASH_TABLE_BITS 7 /* 1 << 7 buckets */ + +/** + * struct pgt_info - MMU hop page info. + * @node: hash linked-list node for the pgts hash of pgts. + * @addr: physical address of the pgt. + * @ctx: pointer to the owner ctx. + * @num_of_ptes: indicates how many ptes are used in the pgt. + * + * The MMU page tables hierarchy is placed on the DRAM. When a new level (hop) + * is needed during mapping, a new page is allocated and this structure holds + * its essential information. During unmapping, if no valid PTEs remained in the + * page, it is freed with its pgt_info structure. + */ +struct pgt_info { + struct hlist_node node; + u64 addr; + struct hl_ctx *ctx; + int num_of_ptes; +}; + struct hl_device; struct hl_fpriv; @@ -72,11 +98,11 @@ struct hw_queue_properties { /** * enum vm_type_t - virtual memory mapping request information. * @VM_TYPE_USERPTR: mapping of user memory to device virtual address. - * @VM_TYPE_PHYS_LIST: mapping of DRAM memory to device virtual address. + * @VM_TYPE_PHYS_PACK: mapping of DRAM memory to device virtual address. */ enum vm_type_t { VM_TYPE_USERPTR, - VM_TYPE_PHYS_LIST + VM_TYPE_PHYS_PACK }; /** @@ -117,6 +143,12 @@ enum hl_device_hw_state { * mapping DRAM memory. * @va_space_dram_end_address: end address of virtual memory range for * mapping DRAM memory. + * @mmu_pgt_addr: base physical address in DRAM of MMU page tables. + * @mmu_pgt_size: MMU page tables total size. + * @mmu_pte_size: PTE size in MMU page tables. + * @mmu_hop_table_size: MMU hop table size. + * @mmu_hop0_tables_total_size: total size of MMU hop0 tables. + * @dram_page_size: page size for MMU DRAM allocation. * @cfg_size: configuration space size on SRAM. * @sram_size: total size of SRAM. * @max_asid: maximum number of open contexts (ASIDs). @@ -150,6 +182,12 @@ struct asic_fixed_properties { u64 va_space_host_end_address; u64 va_space_dram_start_address; u64 va_space_dram_end_address; + u64 mmu_pgt_addr; + u32 mmu_pgt_size; + u32 mmu_pte_size; + u32 mmu_hop_table_size; + u32 mmu_hop0_tables_total_size; + u32 dram_page_size; u32 cfg_size; u32 sram_size; u32 max_asid; @@ -419,6 +457,12 @@ enum hl_pll_frequency { * @handle_eqe: handle event queue entry (IRQ) from ArmCP. * @set_pll_profile: change PLL profile (manual/automatic). * @get_events_stat: retrieve event queue entries histogram. + * @read_pte: read MMU page table entry from DRAM. + * @write_pte: write MMU page table entry to DRAM. + * @mmu_invalidate_cache: flush MMU STLB cache, either with soft (L1 only) or + * hard (L0 & L1) flush. + * @mmu_invalidate_cache_range: flush specific MMU STLB cache lines with + * ASID-VA-size mask. * @send_heartbeat: send is-alive packet to ArmCP and verify response. * @enable_clock_gating: enable clock gating for reducing power consumption. * @disable_clock_gating: disable clock for accessing registers on HBW. @@ -483,6 +527,11 @@ struct hl_asic_funcs { void (*set_pll_profile)(struct hl_device *hdev, enum hl_pll_frequency freq); void* (*get_events_stat)(struct hl_device *hdev, u32 *size); + u64 (*read_pte)(struct hl_device *hdev, u64 addr); + void (*write_pte)(struct hl_device *hdev, u64 addr, u64 val); + void (*mmu_invalidate_cache)(struct hl_device *hdev, bool is_hard); + void (*mmu_invalidate_cache_range)(struct hl_device *hdev, bool is_hard, + u32 asid, u64 va, u64 size); int (*send_heartbeat)(struct hl_device *hdev); void (*enable_clock_gating)(struct hl_device *hdev); void (*disable_clock_gating)(struct hl_device *hdev); @@ -504,17 +553,40 @@ struct hl_asic_funcs { #define HL_KERNEL_ASID_ID 0 +/** + * struct hl_va_range - virtual addresses range. + * @lock: protects the virtual addresses list. + * @list: list of virtual addresses blocks available for mappings. + * @start_addr: range start address. + * @end_addr: range end address. + */ +struct hl_va_range { + struct mutex lock; + struct list_head list; + u64 start_addr; + u64 end_addr; +}; + /** * struct hl_ctx - user/kernel context. + * @mem_hash: holds mapping from virtual address to virtual memory area + * descriptor (hl_vm_phys_pg_list or hl_userptr). + * @mmu_hash: holds a mapping from virtual address to pgt_info structure. * @hpriv: pointer to the private (KMD) data of the process (fd). * @hdev: pointer to the device structure. * @refcount: reference counter for the context. Context is released only when * this hits 0l. It is incremented on CS and CS_WAIT. * @cs_pending: array of DMA fence objects representing pending CS. + * @host_va_range: holds available virtual addresses for host mappings. + * @dram_va_range: holds available virtual addresses for DRAM mappings. + * @mem_hash_lock: protects the mem_hash. + * @mmu_lock: protects the MMU page tables. Any change to the PGT, modifing the + * MMU hash or walking the PGT requires talking this lock * @cs_sequence: sequence number for CS. Value is assigned to a CS and passed * to user so user could inquire about CS. It is used as * index to cs_pending array. * @cs_lock: spinlock to protect cs_sequence. + * @dram_phys_mem: amount of used physical DRAM memory by this context. * @thread_restore_token: token to prevent multiple threads of the same context * from running the restore phase. Only one thread * should run it. @@ -524,12 +596,19 @@ struct hl_asic_funcs { * @asid: context's unique address space ID in the device's MMU. */ struct hl_ctx { + DECLARE_HASHTABLE(mem_hash, MEM_HASH_TABLE_BITS); + DECLARE_HASHTABLE(mmu_hash, MMU_HASH_TABLE_BITS); struct hl_fpriv *hpriv; struct hl_device *hdev; struct kref refcount; struct dma_fence *cs_pending[HL_MAX_PENDING_CS]; + struct hl_va_range host_va_range; + struct hl_va_range dram_va_range; + struct mutex mem_hash_lock; + struct mutex mmu_lock; u64 cs_sequence; spinlock_t cs_lock; + atomic64_t dram_phys_mem; atomic_t thread_restore_token; u32 thread_restore_wait_token; u32 asid; @@ -672,6 +751,85 @@ struct hl_cs_parser { }; +/* + * MEMORY STRUCTURE + */ + +/** + * struct hl_vm_hash_node - hash element from virtual address to virtual + * memory area descriptor (hl_vm_phys_pg_list or + * hl_userptr). + * @node: node to hang on the hash table in context object. + * @vaddr: key virtual address. + * @ptr: value pointer (hl_vm_phys_pg_list or hl_userptr). + */ +struct hl_vm_hash_node { + struct hlist_node node; + u64 vaddr; + void *ptr; +}; + +/** + * struct hl_vm_phys_pg_pack - physical page pack. + * @vm_type: describes the type of the virtual area descriptor. + * @pages: the physical page array. + * @mapping_cnt: number of shared mappings. + * @asid: the context related to this list. + * @npages: num physical pages in the pack. + * @page_size: size of each page in the pack. + * @total_size: total size of all the pages in this list. + * @flags: HL_MEM_* flags related to this list. + * @handle: the provided handle related to this list. + * @offset: offset from the first page. + * @contiguous: is contiguous physical memory. + * @created_from_userptr: is product of host virtual address. + */ +struct hl_vm_phys_pg_pack { + enum vm_type_t vm_type; /* must be first */ + u64 *pages; + atomic_t mapping_cnt; + u32 asid; + u32 npages; + u32 page_size; + u32 total_size; + u32 flags; + u32 handle; + u32 offset; + u8 contiguous; + u8 created_from_userptr; +}; + +/** + * struct hl_vm_va_block - virtual range block information. + * @node: node to hang on the virtual range list in context object. + * @start: virtual range start address. + * @end: virtual range end address. + * @size: virtual range size. + */ +struct hl_vm_va_block { + struct list_head node; + u64 start; + u64 end; + u64 size; +}; + +/** + * struct hl_vm - virtual memory manager for MMU. + * @dram_pg_pool: pool for DRAM physical pages of 2MB. + * @dram_pg_pool_refcount: reference counter for the pool usage. + * @idr_lock: protects the phys_pg_list_handles. + * @phys_pg_pack_handles: idr to hold all device allocations handles. + * @init_done: whether initialization was done. We need this because VM + * initialization might be skipped during device initialization. + */ +struct hl_vm { + struct gen_pool *dram_pg_pool; + struct kref dram_pg_pool_refcount; + spinlock_t idr_lock; + struct idr phys_pg_pack_handles; + u8 init_done; +}; + /* * FILE PRIVATE STRUCTURE */ @@ -787,12 +945,16 @@ struct hl_device_reset_work { * @asic_prop: ASIC specific immutable properties. * @asic_funcs: ASIC specific functions. * @asic_specific: ASIC specific information to use only from ASIC files. + * @mmu_pgt_pool: pool of available MMU hops. + * @vm: virtual memory manager for MMU. + * @mmu_cache_lock: protects MMU cache invalidation as it can serve one context * @hwmon_dev: H/W monitor device. * @pm_mng_profile: current power management profile. * @hl_chip_info: ASIC's sensors information. * @cb_pool: list of preallocated CBs. * @cb_pool_lock: protects the CB pool. * @user_ctx: current user context executing. + * @dram_used_mem: current DRAM memory consumption. * @in_reset: is device in reset flow. * @curr_pll_profile: current PLL profile. * @fd_open_cnt: number of open user processes. @@ -812,6 +974,7 @@ struct hl_device_reset_work { * @heartbeat: is heartbeat sanity check towards ArmCP enabled. * @reset_on_lockup: true if a reset should be done in case of stuck CS, false * otherwise. + * @dram_supports_virtual_memory: is MMU enabled towards DRAM. * @init_done: is the initialization of the device done. * @mmu_enable: is MMU enabled. */ @@ -846,6 +1009,9 @@ struct hl_device { struct asic_fixed_properties asic_prop; const struct hl_asic_funcs *asic_funcs; void *asic_specific; + struct gen_pool *mmu_pgt_pool; + struct hl_vm vm; + struct mutex mmu_cache_lock; struct device *hwmon_dev; enum hl_pm_mng_profile pm_mng_profile; struct hwmon_chip_info *hl_chip_info; @@ -856,6 +1022,7 @@ struct hl_device { /* TODO: remove user_ctx for multiple process support */ struct hl_ctx *user_ctx; + atomic64_t dram_used_mem; atomic_t in_reset; atomic_t curr_pll_profile; atomic_t fd_open_cnt; @@ -872,6 +1039,7 @@ struct hl_device { u8 hard_reset_pending; u8 heartbeat; u8 reset_on_lockup; + u8 dram_supports_virtual_memory; u8 init_done; /* Parameters for bring-up */ @@ -1021,6 +1189,7 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset, void hl_hpriv_get(struct hl_fpriv *hpriv); void hl_hpriv_put(struct hl_fpriv *hpriv); int hl_device_set_frequency(struct hl_device *hdev, enum hl_pll_frequency freq); + int hl_build_hwmon_channel_info(struct hl_device *hdev, struct armcp_sensor *sensors_arr); @@ -1048,6 +1217,12 @@ struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, bool ext_queue); void goya_set_asic_funcs(struct hl_device *hdev); +int hl_vm_ctx_init(struct hl_ctx *ctx); +void hl_vm_ctx_fini(struct hl_ctx *ctx); + +int hl_vm_init(struct hl_device *hdev); +void hl_vm_fini(struct hl_device *hdev); + int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u32 size, struct hl_userptr *userptr); int hl_unpin_host_memory(struct hl_device *hdev, struct hl_userptr *userptr); @@ -1057,6 +1232,15 @@ bool hl_userptr_is_pinned(struct hl_device *hdev, u64 addr, u32 size, struct list_head *userptr_list, struct hl_userptr **userptr); +int hl_mmu_init(struct hl_device *hdev); +void hl_mmu_fini(struct hl_device *hdev); +void hl_mmu_ctx_init(struct hl_ctx *ctx); +void hl_mmu_ctx_fini(struct hl_ctx *ctx); +int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size); +int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size); +void hl_mmu_swap_out(struct hl_ctx *ctx); +void hl_mmu_swap_in(struct hl_ctx *ctx); + long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr); void hl_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq); long hl_get_temperature(struct hl_device *hdev, int sensor_index, u32 attr); @@ -1074,5 +1258,6 @@ long hl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data); int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data); int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data); +int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data); #endif /* HABANALABSP_H_ */ diff --git a/drivers/misc/habanalabs/habanalabs_drv.c b/drivers/misc/habanalabs/habanalabs_drv.c index 77a1cc85e530..436ccae0989d 100644 --- a/drivers/misc/habanalabs/habanalabs_drv.c +++ b/drivers/misc/habanalabs/habanalabs_drv.c @@ -188,7 +188,7 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev, hdev->reset_on_lockup = reset_on_lockup; /* Parameters for bring-up - set them to defaults */ - hdev->mmu_enable = 0; + hdev->mmu_enable = 1; hdev->cpu_enable = 1; hdev->reset_pcilink = 0; hdev->cpu_queues_enable = 1; diff --git a/drivers/misc/habanalabs/habanalabs_ioctl.c b/drivers/misc/habanalabs/habanalabs_ioctl.c index 481db1a5e97e..6e4dc5b5e696 100644 --- a/drivers/misc/habanalabs/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/habanalabs_ioctl.c @@ -18,7 +18,8 @@ static const struct hl_ioctl_desc hl_ioctls[] = { HL_IOCTL_DEF(HL_IOCTL_CB, hl_cb_ioctl), HL_IOCTL_DEF(HL_IOCTL_CS, hl_cs_ioctl), - HL_IOCTL_DEF(HL_IOCTL_WAIT_CS, hl_cs_wait_ioctl) + HL_IOCTL_DEF(HL_IOCTL_WAIT_CS, hl_cs_wait_ioctl), + HL_IOCTL_DEF(HL_IOCTL_MEMORY, hl_mem_ioctl) }; #define HL_CORE_IOCTL_COUNT ARRAY_SIZE(hl_ioctls) diff --git a/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_general.h b/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_general.h new file mode 100644 index 000000000000..1bc36aba1426 --- /dev/null +++ b/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_general.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright 2016-2018 HabanaLabs, Ltd. + * All Rights Reserved. + * + */ + +#ifndef INCLUDE_MMU_GENERAL_H_ +#define INCLUDE_MMU_GENERAL_H_ + +#define PAGE_SHIFT_4KB 12 +#define PAGE_SHIFT_2MB 21 +#define PAGE_SIZE_2MB (_AC(1, UL) << PAGE_SHIFT_2MB) +#define PAGE_SIZE_4KB (_AC(1, UL) << PAGE_SHIFT_4KB) +#define PAGE_MASK_2MB (~(PAGE_SIZE_2MB - 1)) + +#define PAGE_PRESENT_MASK 0x0000000000001 +#define SWAP_OUT_MASK 0x0000000000004 +#define LAST_MASK 0x0000000000800 +#define PHYS_ADDR_MASK 0x3FFFFFFFFF000ull +#define HOP0_MASK 0x3000000000000ull +#define HOP1_MASK 0x0FF8000000000ull +#define HOP2_MASK 0x0007FC0000000ull +#define HOP3_MASK 0x000003FE00000 +#define HOP4_MASK 0x00000001FF000 +#define OFFSET_MASK 0x0000000000FFF + +#define HOP0_SHIFT 48 +#define HOP1_SHIFT 39 +#define HOP2_SHIFT 30 +#define HOP3_SHIFT 21 +#define HOP4_SHIFT 12 + +#define PTE_PHYS_ADDR_SHIFT 12 +#define PTE_PHYS_ADDR_MASK ~0xFFF + +#define HL_PTE_SIZE sizeof(u64) +#define HOP_TABLE_SIZE PAGE_SIZE_4KB +#define HOP0_TABLES_TOTAL_SIZE (HOP_TABLE_SIZE * MAX_ASID) + +#define MMU_HOP0_PA43_12_SHIFT 12 +#define MMU_HOP0_PA49_44_SHIFT (12 + 32) + +#define MMU_CONFIG_TIMEOUT_USEC 2000 /* 2 ms */ + +#endif /* INCLUDE_MMU_GENERAL_H_ */ diff --git a/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_v1_0.h b/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_v1_0.h new file mode 100644 index 000000000000..8539dd041f2c --- /dev/null +++ b/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_v1_0.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright 2016-2018 HabanaLabs, Ltd. + * All Rights Reserved. + * + */ + +#ifndef INCLUDE_MMU_V1_0_H_ +#define INCLUDE_MMU_V1_0_H_ + +#define MMU_HOP0_PA43_12 0x490004 +#define MMU_HOP0_PA49_44 0x490008 +#define MMU_ASID_BUSY 0x490000 + +#endif /* INCLUDE_MMU_V1_0_H_ */ diff --git a/drivers/misc/habanalabs/memory.c b/drivers/misc/habanalabs/memory.c index ad14376a1c25..6650c8085fc6 100644 --- a/drivers/misc/habanalabs/memory.c +++ b/drivers/misc/habanalabs/memory.c @@ -5,10 +5,1198 @@ * All Rights Reserved. */ +#include #include "habanalabs.h" +#include "include/hw_ip/mmu/mmu_general.h" #include #include +#include + +#define PGS_IN_2MB_PAGE (PAGE_SIZE_2MB >> PAGE_SHIFT) +#define HL_MMU_DEBUG 0 + +/* + * The va ranges in context object contain a list with the available chunks of + * device virtual memory. + * There is one range for host allocations and one for DRAM allocations. + * + * On initialization each range contains one chunk of all of its available + * virtual range which is a half of the total device virtual range. + * + * On each mapping of physical pages, a suitable virtual range chunk (with a + * minimum size) is selected from the list. If the chunk size equals the + * requested size, the chunk is returned. Otherwise, the chunk is split into + * two chunks - one to return as result and a remainder to stay in the list. + * + * On each Unmapping of a virtual address, the relevant virtual chunk is + * returned to the list. The chunk is added to the list and if its edges match + * the edges of the adjacent chunks (means a contiguous chunk can be created), + * the chunks are merged. + * + * On finish, the list is checked to have only one chunk of all the relevant + * virtual range (which is a half of the device total virtual range). + * If not (means not all mappings were unmapped), a warning is printed. + */ + +/* + * alloc_device_memory - allocate device memory + * + * @ctx : current context + * @args : host parameters containing the requested size + * @ret_handle : result handle + * + * This function does the following: + * - Allocate the requested size rounded up to 2MB pages + * - Return unique handle + */ +static int alloc_device_memory(struct hl_ctx *ctx, struct hl_mem_in *args, + u32 *ret_handle) +{ + struct hl_device *hdev = ctx->hdev; + struct hl_vm *vm = &hdev->vm; + struct hl_vm_phys_pg_pack *phys_pg_pack; + u64 paddr = 0; + u32 total_size, num_pgs, num_curr_pgs, page_size, page_shift; + int handle, rc, i; + bool contiguous; + + num_curr_pgs = 0; + page_size = hdev->asic_prop.dram_page_size; + page_shift = __ffs(page_size); + num_pgs = (args->alloc.mem_size + (page_size - 1)) >> page_shift; + total_size = num_pgs << page_shift; + + contiguous = args->flags & HL_MEM_CONTIGUOUS; + + if (contiguous) { + paddr = (u64) gen_pool_alloc(vm->dram_pg_pool, total_size); + if (!paddr) { + dev_err(hdev->dev, + "failed to allocate %u huge contiguous pages\n", + num_pgs); + return -ENOMEM; + } + } + + phys_pg_pack = kzalloc(sizeof(*phys_pg_pack), GFP_KERNEL); + if (!phys_pg_pack) { + rc = -ENOMEM; + goto pages_pack_err; + } + + phys_pg_pack->vm_type = VM_TYPE_PHYS_PACK; + phys_pg_pack->asid = ctx->asid; + phys_pg_pack->npages = num_pgs; + phys_pg_pack->page_size = page_size; + phys_pg_pack->total_size = total_size; + phys_pg_pack->flags = args->flags; + phys_pg_pack->contiguous = contiguous; + + phys_pg_pack->pages = kcalloc(num_pgs, sizeof(u64), GFP_KERNEL); + if (!phys_pg_pack->pages) { + rc = -ENOMEM; + goto pages_arr_err; + } + + if (phys_pg_pack->contiguous) { + for (i = 0 ; i < num_pgs ; i++) + phys_pg_pack->pages[i] = paddr + i * page_size; + } else { + for (i = 0 ; i < num_pgs ; i++) { + phys_pg_pack->pages[i] = (u64) gen_pool_alloc( + vm->dram_pg_pool, + page_size); + if (!phys_pg_pack->pages[i]) { + dev_err(hdev->dev, + "ioctl failed to allocate page\n"); + rc = -ENOMEM; + goto page_err; + } + + num_curr_pgs++; + } + } + + spin_lock(&vm->idr_lock); + handle = idr_alloc(&vm->phys_pg_pack_handles, phys_pg_pack, 1, 0, + GFP_KERNEL); + spin_unlock(&vm->idr_lock); + + if (handle < 0) { + dev_err(hdev->dev, "Failed to get handle for page\n"); + rc = -EFAULT; + goto idr_err; + } + + for (i = 0 ; i < num_pgs ; i++) + kref_get(&vm->dram_pg_pool_refcount); + + phys_pg_pack->handle = handle; + + atomic64_add(phys_pg_pack->total_size, &ctx->dram_phys_mem); + atomic64_add(phys_pg_pack->total_size, &hdev->dram_used_mem); + + *ret_handle = handle; + + return 0; + +idr_err: +page_err: + if (!phys_pg_pack->contiguous) + for (i = 0 ; i < num_curr_pgs ; i++) + gen_pool_free(vm->dram_pg_pool, phys_pg_pack->pages[i], + page_size); + + kfree(phys_pg_pack->pages); +pages_arr_err: + kfree(phys_pg_pack); +pages_pack_err: + if (contiguous) + gen_pool_free(vm->dram_pg_pool, paddr, total_size); + + return rc; +} + +/* + * get_userptr_from_host_va - initialize userptr structure from given host + * virtual address + * + * @hdev : habanalabs device structure + * @args : parameters containing the virtual address and size + * @p_userptr : pointer to result userptr structure + * + * This function does the following: + * - Allocate userptr structure + * - Pin the given host memory using the userptr structure + * - Perform DMA mapping to have the DMA addresses of the pages + */ +static int get_userptr_from_host_va(struct hl_device *hdev, + struct hl_mem_in *args, struct hl_userptr **p_userptr) +{ + struct hl_userptr *userptr; + int rc; + + userptr = kzalloc(sizeof(*userptr), GFP_KERNEL); + if (!userptr) { + rc = -ENOMEM; + goto userptr_err; + } + + rc = hl_pin_host_memory(hdev, args->map_host.host_virt_addr, + args->map_host.mem_size, userptr); + if (rc) { + dev_err(hdev->dev, "Failed to pin host memory\n"); + goto pin_err; + } + + rc = hdev->asic_funcs->asic_dma_map_sg(hdev, userptr->sgt->sgl, + userptr->sgt->nents, DMA_BIDIRECTIONAL); + if (rc) { + dev_err(hdev->dev, "failed to map sgt with DMA region\n"); + goto dma_map_err; + } + + userptr->dma_mapped = true; + userptr->dir = DMA_BIDIRECTIONAL; + userptr->vm_type = VM_TYPE_USERPTR; + + *p_userptr = userptr; + + return 0; + +dma_map_err: + hl_unpin_host_memory(hdev, userptr); +pin_err: + kfree(userptr); +userptr_err: + + return rc; +} + +/* + * free_userptr - free userptr structure + * + * @hdev : habanalabs device structure + * @userptr : userptr to free + * + * This function does the following: + * - Unpins the physical pages + * - Frees the userptr structure + */ +static void free_userptr(struct hl_device *hdev, struct hl_userptr *userptr) +{ + hl_unpin_host_memory(hdev, userptr); + kfree(userptr); +} + +/* + * dram_pg_pool_do_release - free DRAM pages pool + * + * @ref : pointer to reference object + * + * This function does the following: + * - Frees the idr structure of physical pages handles + * - Frees the generic pool of DRAM physical pages + */ +static void dram_pg_pool_do_release(struct kref *ref) +{ + struct hl_vm *vm = container_of(ref, struct hl_vm, + dram_pg_pool_refcount); + + /* + * free the idr here as only here we know for sure that there are no + * allocated physical pages and hence there are no handles in use + */ + idr_destroy(&vm->phys_pg_pack_handles); + gen_pool_destroy(vm->dram_pg_pool); +} + +/* + * free_phys_pg_pack - free physical page pack + * + * @hdev : habanalabs device structure + * @phys_pg_pack : physical page pack to free + * + * This function does the following: + * - For DRAM memory only, iterate over the pack and free each physical block + * structure by returning it to the general pool + * - Free the hl_vm_phys_pg_pack structure + */ +static void free_phys_pg_pack(struct hl_device *hdev, + struct hl_vm_phys_pg_pack *phys_pg_pack) +{ + struct hl_vm *vm = &hdev->vm; + int i; + + if (!phys_pg_pack->created_from_userptr) { + if (phys_pg_pack->contiguous) { + gen_pool_free(vm->dram_pg_pool, phys_pg_pack->pages[0], + phys_pg_pack->total_size); + + for (i = 0; i < phys_pg_pack->npages ; i++) + kref_put(&vm->dram_pg_pool_refcount, + dram_pg_pool_do_release); + } else { + for (i = 0 ; i < phys_pg_pack->npages ; i++) { + gen_pool_free(vm->dram_pg_pool, + phys_pg_pack->pages[i], + phys_pg_pack->page_size); + kref_put(&vm->dram_pg_pool_refcount, + dram_pg_pool_do_release); + } + } + } + + kfree(phys_pg_pack->pages); + kfree(phys_pg_pack); +} + +/* + * free_device_memory - free device memory + * + * @ctx : current context + * @handle : handle of the memory chunk to free + * + * This function does the following: + * - Free the device memory related to the given handle + */ +static int free_device_memory(struct hl_ctx *ctx, u32 handle) +{ + struct hl_device *hdev = ctx->hdev; + struct hl_vm *vm = &hdev->vm; + struct hl_vm_phys_pg_pack *phys_pg_pack; + + spin_lock(&vm->idr_lock); + phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, handle); + if (phys_pg_pack) { + if (atomic_read(&phys_pg_pack->mapping_cnt) > 0) { + dev_err(hdev->dev, "handle %u is mapped, cannot free\n", + handle); + spin_unlock(&vm->idr_lock); + return -EINVAL; + } + + /* + * must remove from idr before the freeing of the physical + * pages as the refcount of the pool is also the trigger of the + * idr destroy + */ + idr_remove(&vm->phys_pg_pack_handles, handle); + spin_unlock(&vm->idr_lock); + + atomic64_sub(phys_pg_pack->total_size, &ctx->dram_phys_mem); + atomic64_sub(phys_pg_pack->total_size, &hdev->dram_used_mem); + + free_phys_pg_pack(hdev, phys_pg_pack); + } else { + spin_unlock(&vm->idr_lock); + dev_err(hdev->dev, + "free device memory failed, no match for handle %u\n", + handle); + return -EINVAL; + } + + return 0; +} + +/* + * clear_va_list_locked - free virtual addresses list + * + * @hdev : habanalabs device structure + * @va_list : list of virtual addresses to free + * + * This function does the following: + * - Iterate over the list and free each virtual addresses block + * + * This function should be called only when va_list lock is taken + */ +static void clear_va_list_locked(struct hl_device *hdev, + struct list_head *va_list) +{ + struct hl_vm_va_block *va_block, *tmp; + + list_for_each_entry_safe(va_block, tmp, va_list, node) { + list_del(&va_block->node); + kfree(va_block); + } +} + +/* + * print_va_list_locked - print virtual addresses list + * + * @hdev : habanalabs device structure + * @va_list : list of virtual addresses to print + * + * This function does the following: + * - Iterate over the list and print each virtual addresses block + * + * This function should be called only when va_list lock is taken + */ +static void print_va_list_locked(struct hl_device *hdev, + struct list_head *va_list) +{ +#if HL_MMU_DEBUG + struct hl_vm_va_block *va_block; + + dev_dbg(hdev->dev, "print va list:\n"); + + list_for_each_entry(va_block, va_list, node) + dev_dbg(hdev->dev, + "va block, start: 0x%llx, end: 0x%llx, size: %llu\n", + va_block->start, va_block->end, va_block->size); +#endif +} + +/* + * merge_va_blocks_locked - merge a virtual block if possible + * + * @hdev : pointer to the habanalabs device structure + * @va_list : pointer to the virtual addresses block list + * @va_block : virtual block to merge with adjacent blocks + * + * This function does the following: + * - Merge the given blocks with the adjacent blocks if their virtual ranges + * create a contiguous virtual range + * + * This Function should be called only when va_list lock is taken + */ +static void merge_va_blocks_locked(struct hl_device *hdev, + struct list_head *va_list, struct hl_vm_va_block *va_block) +{ + struct hl_vm_va_block *prev, *next; + + prev = list_prev_entry(va_block, node); + if (&prev->node != va_list && prev->end + 1 == va_block->start) { + prev->end = va_block->end; + prev->size = prev->end - prev->start; + list_del(&va_block->node); + kfree(va_block); + va_block = prev; + } + + next = list_next_entry(va_block, node); + if (&next->node != va_list && va_block->end + 1 == next->start) { + next->start = va_block->start; + next->size = next->end - next->start; + list_del(&va_block->node); + kfree(va_block); + } +} + +/* + * add_va_block_locked - add a virtual block to the virtual addresses list + * + * @hdev : pointer to the habanalabs device structure + * @va_list : pointer to the virtual addresses block list + * @start : start virtual address + * @end : end virtual address + * + * This function does the following: + * - Add the given block to the virtual blocks list and merge with other + * blocks if a contiguous virtual block can be created + * + * This Function should be called only when va_list lock is taken + */ +static int add_va_block_locked(struct hl_device *hdev, + struct list_head *va_list, u64 start, u64 end) +{ + struct hl_vm_va_block *va_block, *res = NULL; + u64 size = end - start; + + print_va_list_locked(hdev, va_list); + + list_for_each_entry(va_block, va_list, node) { + /* TODO: remove upon matureness */ + if (hl_mem_area_crosses_range(start, size, va_block->start, + va_block->end)) { + dev_err(hdev->dev, + "block crossing ranges at start 0x%llx, end 0x%llx\n", + va_block->start, va_block->end); + return -EINVAL; + } + + if (va_block->end < start) + res = va_block; + } + + va_block = kmalloc(sizeof(*va_block), GFP_KERNEL); + if (!va_block) + return -ENOMEM; + + va_block->start = start; + va_block->end = end; + va_block->size = size; + + if (!res) + list_add(&va_block->node, va_list); + else + list_add(&va_block->node, &res->node); + + merge_va_blocks_locked(hdev, va_list, va_block); + + print_va_list_locked(hdev, va_list); + + return 0; +} + +/* + * add_va_block - wrapper for add_va_block_locked + * + * @hdev : pointer to the habanalabs device structure + * @va_list : pointer to the virtual addresses block list + * @start : start virtual address + * @end : end virtual address + * + * This function does the following: + * - Takes the list lock and calls add_va_block_locked + */ +static inline int add_va_block(struct hl_device *hdev, + struct hl_va_range *va_range, u64 start, u64 end) +{ + int rc; + + mutex_lock(&va_range->lock); + rc = add_va_block_locked(hdev, &va_range->list, start, end); + mutex_unlock(&va_range->lock); + + return rc; +} + +/* + * get_va_block - get a virtual block with the requested size + * + * @hdev : pointer to the habanalabs device structure + * @va_range : pointer to the virtual addresses range + * @size : requested block size + * @hint_addr : hint for request address by the user + * @is_userptr : is host or DRAM memory + * + * This function does the following: + * - Iterate on the virtual block list to find a suitable virtual block for the + * requested size + * - Reserve the requested block and update the list + * - Return the start address of the virtual block + */ +static u64 get_va_block(struct hl_device *hdev, + struct hl_va_range *va_range, u32 size, u64 hint_addr, + bool is_userptr) +{ + struct hl_vm_va_block *va_block, *new_va_block = NULL; + u64 valid_start, valid_size, prev_start, prev_end, page_mask, + res_valid_start = 0, res_valid_size = 0; + u32 page_size; + bool add_prev = false; + + if (is_userptr) { + /* + * We cannot know if the user allocated memory with huge pages + * or not, hence we continue with the biggest possible + * granularity. + */ + page_size = PAGE_SIZE_2MB; + page_mask = PAGE_MASK_2MB; + } else { + page_size = hdev->asic_prop.dram_page_size; + page_mask = ~((u64)page_size - 1); + } + + mutex_lock(&va_range->lock); + + print_va_list_locked(hdev, &va_range->list); + + list_for_each_entry(va_block, &va_range->list, node) { + /* calc the first possible aligned addr */ + valid_start = va_block->start; + + + if (valid_start & (page_size - 1)) { + valid_start &= page_mask; + valid_start += page_size; + if (valid_start > va_block->end) + continue; + } + + valid_size = va_block->end - valid_start; + + if (valid_size >= size && + (!new_va_block || valid_size < res_valid_size)) { + + new_va_block = va_block; + res_valid_start = valid_start; + res_valid_size = valid_size; + } + + if (hint_addr && hint_addr >= valid_start && + ((hint_addr + size) <= va_block->end)) { + new_va_block = va_block; + res_valid_start = hint_addr; + res_valid_size = valid_size; + break; + } + } + + if (!new_va_block) { + dev_err(hdev->dev, "no available va block for size %u\n", size); + goto out; + } + + if (res_valid_start > new_va_block->start) { + prev_start = new_va_block->start; + prev_end = res_valid_start - 1; + + new_va_block->start = res_valid_start; + new_va_block->size = res_valid_size; + + add_prev = true; + } + + if (new_va_block->size > size) { + new_va_block->start += size; + new_va_block->size = new_va_block->end - new_va_block->start; + } else { + list_del(&new_va_block->node); + kfree(new_va_block); + } + + if (add_prev) + add_va_block_locked(hdev, &va_range->list, prev_start, + prev_end); + + print_va_list_locked(hdev, &va_range->list); +out: + mutex_unlock(&va_range->lock); + + return res_valid_start; +} + +/* + * get_sg_info - get number of pages and the DMA address from SG list + * + * @sg : the SG list + * @dma_addr : pointer to DMA address to return + * + * Calculate the number of consecutive pages described by the SG list. Take the + * offset of the address in the first page, add to it the length and round it up + * to the number of needed pages. + */ +static u32 get_sg_info(struct scatterlist *sg, dma_addr_t *dma_addr) +{ + *dma_addr = sg_dma_address(sg); + + return ((((*dma_addr) & (PAGE_SIZE - 1)) + sg_dma_len(sg)) + + (PAGE_SIZE - 1)) >> PAGE_SHIFT; +} + +/* + * init_phys_pg_pack_from_userptr - initialize physical page pack from host + * memory + * + * @ctx : current context + * @userptr : userptr to initialize from + * @pphys_pg_pack : res pointer + * + * This function does the following: + * - Pin the physical pages related to the given virtual block + * - Create a physical page pack from the physical pages related to the given + * virtual block + */ +static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx, + struct hl_userptr *userptr, + struct hl_vm_phys_pg_pack **pphys_pg_pack) +{ + struct hl_vm_phys_pg_pack *phys_pg_pack; + struct scatterlist *sg; + dma_addr_t dma_addr; + u64 page_mask; + u32 npages, total_npages, page_size = PAGE_SIZE; + bool first = true, is_huge_page_opt = true; + int rc, i, j; + + phys_pg_pack = kzalloc(sizeof(*phys_pg_pack), GFP_KERNEL); + if (!phys_pg_pack) + return -ENOMEM; + + phys_pg_pack->vm_type = userptr->vm_type; + phys_pg_pack->created_from_userptr = true; + phys_pg_pack->asid = ctx->asid; + atomic_set(&phys_pg_pack->mapping_cnt, 1); + + /* Only if all dma_addrs are aligned to 2MB and their + * sizes is at least 2MB, we can use huge page mapping. + * We limit the 2MB optimization to this condition, + * since later on we acquire the related VA range as one + * consecutive block. + */ + total_npages = 0; + for_each_sg(userptr->sgt->sgl, sg, userptr->sgt->nents, i) { + npages = get_sg_info(sg, &dma_addr); + + total_npages += npages; + + if (first) { + first = false; + dma_addr &= PAGE_MASK_2MB; + } + + if ((npages % PGS_IN_2MB_PAGE) || + (dma_addr & (PAGE_SIZE_2MB - 1))) + is_huge_page_opt = false; + } + + if (is_huge_page_opt) { + page_size = PAGE_SIZE_2MB; + total_npages /= PGS_IN_2MB_PAGE; + } + + page_mask = ~(((u64) page_size) - 1); + + phys_pg_pack->pages = kcalloc(total_npages, sizeof(u64), GFP_KERNEL); + if (!phys_pg_pack->pages) { + rc = -ENOMEM; + goto page_pack_arr_mem_err; + } + + phys_pg_pack->npages = total_npages; + phys_pg_pack->page_size = page_size; + phys_pg_pack->total_size = total_npages * page_size; + + j = 0; + first = true; + for_each_sg(userptr->sgt->sgl, sg, userptr->sgt->nents, i) { + npages = get_sg_info(sg, &dma_addr); + + /* align down to physical page size and save the offset */ + if (first) { + first = false; + phys_pg_pack->offset = dma_addr & (page_size - 1); + dma_addr &= page_mask; + } + + while (npages) { + phys_pg_pack->pages[j++] = dma_addr; + dma_addr += page_size; + + if (is_huge_page_opt) + npages -= PGS_IN_2MB_PAGE; + else + npages--; + } + } + + *pphys_pg_pack = phys_pg_pack; + + return 0; + +page_pack_arr_mem_err: + kfree(phys_pg_pack); + + return rc; +} + +/* + * map_phys_page_pack - maps the physical page pack + * + * @ctx : current context + * @vaddr : start address of the virtual area to map from + * @phys_pg_pack : the pack of physical pages to map to + * + * This function does the following: + * - Maps each chunk of virtual memory to matching physical chunk + * - Stores number of successful mappings in the given argument + * - Returns 0 on success, error code otherwise. + */ +static int map_phys_page_pack(struct hl_ctx *ctx, u64 vaddr, + struct hl_vm_phys_pg_pack *phys_pg_pack) +{ + struct hl_device *hdev = ctx->hdev; + u64 next_vaddr = vaddr, paddr; + u32 page_size = phys_pg_pack->page_size; + int i, rc = 0, mapped_pg_cnt = 0; + + for (i = 0 ; i < phys_pg_pack->npages ; i++) { + paddr = phys_pg_pack->pages[i]; + + /* For accessing the host we need to turn on bit 39 */ + if (phys_pg_pack->created_from_userptr) + paddr += hdev->asic_prop.host_phys_base_address; + + rc = hl_mmu_map(ctx, next_vaddr, paddr, page_size); + if (rc) { + dev_err(hdev->dev, + "map failed for handle %u, npages: %d, mapped: %d", + phys_pg_pack->handle, phys_pg_pack->npages, + mapped_pg_cnt); + goto err; + } + + mapped_pg_cnt++; + next_vaddr += page_size; + } + + return 0; + +err: + next_vaddr = vaddr; + for (i = 0 ; i < mapped_pg_cnt ; i++) { + if (hl_mmu_unmap(ctx, next_vaddr, page_size)) + dev_warn_ratelimited(hdev->dev, + "failed to unmap handle %u, va: 0x%llx, pa: 0x%llx, page size: %u\n", + phys_pg_pack->handle, next_vaddr, + phys_pg_pack->pages[i], page_size); + + next_vaddr += page_size; + } + + return rc; +} + +static int get_paddr_from_handle(struct hl_ctx *ctx, struct hl_mem_in *args, + u64 *paddr) +{ + struct hl_device *hdev = ctx->hdev; + struct hl_vm *vm = &hdev->vm; + struct hl_vm_phys_pg_pack *phys_pg_pack; + u32 handle; + + handle = lower_32_bits(args->map_device.handle); + spin_lock(&vm->idr_lock); + phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, handle); + if (!phys_pg_pack) { + spin_unlock(&vm->idr_lock); + dev_err(hdev->dev, "no match for handle %u\n", handle); + return -EINVAL; + } + + *paddr = phys_pg_pack->pages[0]; + + spin_unlock(&vm->idr_lock); + + return 0; +} + +/* + * map_device_va - map the given memory + * + * @ctx : current context + * @args : host parameters with handle/host virtual address + * @device_addr : pointer to result device virtual address + * + * This function does the following: + * - If given a physical device memory handle, map to a device virtual block + * and return the start address of this block + * - If given a host virtual address and size, find the related physical pages, + * map a device virtual block to this pages and return the start address of + * this block + */ +static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args, + u64 *device_addr) +{ + struct hl_device *hdev = ctx->hdev; + struct hl_vm *vm = &hdev->vm; + struct hl_vm_phys_pg_pack *phys_pg_pack; + struct hl_userptr *userptr = NULL; + struct hl_vm_hash_node *hnode; + enum vm_type_t *vm_type; + u64 ret_vaddr, hint_addr; + u32 handle = 0; + int rc; + bool is_userptr = args->flags & HL_MEM_USERPTR; + + /* Assume failure */ + *device_addr = 0; + + if (is_userptr) { + rc = get_userptr_from_host_va(hdev, args, &userptr); + if (rc) { + dev_err(hdev->dev, "failed to get userptr from va\n"); + return rc; + } + + rc = init_phys_pg_pack_from_userptr(ctx, userptr, + &phys_pg_pack); + if (rc) { + dev_err(hdev->dev, + "unable to init page pack for vaddr 0x%llx\n", + args->map_host.host_virt_addr); + goto init_page_pack_err; + } + + vm_type = (enum vm_type_t *) userptr; + hint_addr = args->map_host.hint_addr; + } else { + handle = lower_32_bits(args->map_device.handle); + + spin_lock(&vm->idr_lock); + phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, handle); + if (!phys_pg_pack) { + spin_unlock(&vm->idr_lock); + dev_err(hdev->dev, + "no match for handle %u\n", handle); + return -EINVAL; + } + + /* increment now to avoid freeing device memory while mapping */ + atomic_inc(&phys_pg_pack->mapping_cnt); + + spin_unlock(&vm->idr_lock); + + vm_type = (enum vm_type_t *) phys_pg_pack; + + hint_addr = args->map_device.hint_addr; + } + + /* + * relevant for mapping device physical memory only, as host memory is + * implicitly shared + */ + if (!is_userptr && !(phys_pg_pack->flags & HL_MEM_SHARED) && + phys_pg_pack->asid != ctx->asid) { + dev_err(hdev->dev, + "Failed to map memory, handle %u is not shared\n", + handle); + rc = -EPERM; + goto shared_err; + } + + hnode = kzalloc(sizeof(*hnode), GFP_KERNEL); + if (!hnode) { + rc = -ENOMEM; + goto hnode_err; + } + + ret_vaddr = get_va_block(hdev, + is_userptr ? &ctx->host_va_range : &ctx->dram_va_range, + phys_pg_pack->total_size, hint_addr, is_userptr); + if (!ret_vaddr) { + dev_err(hdev->dev, "no available va block for handle %u\n", + handle); + rc = -ENOMEM; + goto va_block_err; + } + + mutex_lock(&ctx->mmu_lock); + + rc = map_phys_page_pack(ctx, ret_vaddr, phys_pg_pack); + if (rc) { + mutex_unlock(&ctx->mmu_lock); + dev_err(hdev->dev, "mapping page pack failed for handle %u\n", + handle); + goto map_err; + } + + hdev->asic_funcs->mmu_invalidate_cache_range(hdev, false, ctx->asid, + ret_vaddr, phys_pg_pack->total_size); + + mutex_unlock(&ctx->mmu_lock); + + ret_vaddr += phys_pg_pack->offset; + + hnode->ptr = vm_type; + hnode->vaddr = ret_vaddr; + + mutex_lock(&ctx->mem_hash_lock); + hash_add(ctx->mem_hash, &hnode->node, ret_vaddr); + mutex_unlock(&ctx->mem_hash_lock); + + *device_addr = ret_vaddr; + + if (is_userptr) + free_phys_pg_pack(hdev, phys_pg_pack); + + return 0; + +map_err: + if (add_va_block(hdev, + is_userptr ? &ctx->host_va_range : &ctx->dram_va_range, + ret_vaddr, + ret_vaddr + phys_pg_pack->total_size - 1)) + dev_warn(hdev->dev, + "release va block failed for handle 0x%x, vaddr: 0x%llx\n", + handle, ret_vaddr); + +va_block_err: + kfree(hnode); +hnode_err: +shared_err: + atomic_dec(&phys_pg_pack->mapping_cnt); + if (is_userptr) + free_phys_pg_pack(hdev, phys_pg_pack); +init_page_pack_err: + if (is_userptr) + free_userptr(hdev, userptr); + + return rc; +} + +/* + * unmap_device_va - unmap the given device virtual address + * + * @ctx : current context + * @vaddr : device virtual address to unmap + * + * This function does the following: + * - Unmap the physical pages related to the given virtual address + * - return the device virtual block to the virtual block list + */ +static int unmap_device_va(struct hl_ctx *ctx, u64 vaddr) +{ + struct hl_device *hdev = ctx->hdev; + struct hl_vm_phys_pg_pack *phys_pg_pack = NULL; + struct hl_vm_hash_node *hnode = NULL; + struct hl_userptr *userptr = NULL; + enum vm_type_t *vm_type; + u64 next_vaddr; + u32 page_size; + bool is_userptr; + int i, rc; + + /* protect from double entrance */ + mutex_lock(&ctx->mem_hash_lock); + hash_for_each_possible(ctx->mem_hash, hnode, node, (unsigned long)vaddr) + if (vaddr == hnode->vaddr) + break; + + if (!hnode) { + mutex_unlock(&ctx->mem_hash_lock); + dev_err(hdev->dev, + "unmap failed, no mem hnode for vaddr 0x%llx\n", + vaddr); + return -EINVAL; + } + + hash_del(&hnode->node); + mutex_unlock(&ctx->mem_hash_lock); + + vm_type = hnode->ptr; + + if (*vm_type == VM_TYPE_USERPTR) { + is_userptr = true; + userptr = hnode->ptr; + rc = init_phys_pg_pack_from_userptr(ctx, userptr, + &phys_pg_pack); + if (rc) { + dev_err(hdev->dev, + "unable to init page pack for vaddr 0x%llx\n", + vaddr); + goto vm_type_err; + } + } else if (*vm_type == VM_TYPE_PHYS_PACK) { + is_userptr = false; + phys_pg_pack = hnode->ptr; + } else { + dev_warn(hdev->dev, + "unmap failed, unknown vm desc for vaddr 0x%llx\n", + vaddr); + rc = -EFAULT; + goto vm_type_err; + } + + if (atomic_read(&phys_pg_pack->mapping_cnt) == 0) { + dev_err(hdev->dev, "vaddr 0x%llx is not mapped\n", vaddr); + rc = -EINVAL; + goto mapping_cnt_err; + } + + page_size = phys_pg_pack->page_size; + vaddr &= ~(((u64) page_size) - 1); + + next_vaddr = vaddr; + + mutex_lock(&ctx->mmu_lock); + + for (i = 0 ; i < phys_pg_pack->npages ; i++, next_vaddr += page_size) + if (hl_mmu_unmap(ctx, next_vaddr, page_size)) + dev_warn_ratelimited(hdev->dev, + "unmap failed for vaddr: 0x%llx\n", next_vaddr); + + hdev->asic_funcs->mmu_invalidate_cache_range(hdev, true, ctx->asid, + vaddr, phys_pg_pack->total_size); + + mutex_unlock(&ctx->mmu_lock); + + if (add_va_block(hdev, + is_userptr ? &ctx->host_va_range : &ctx->dram_va_range, + vaddr, + vaddr + phys_pg_pack->total_size - 1)) + dev_warn(hdev->dev, "add va block failed for vaddr: 0x%llx\n", + vaddr); + + atomic_dec(&phys_pg_pack->mapping_cnt); + kfree(hnode); + + if (is_userptr) { + free_phys_pg_pack(hdev, phys_pg_pack); + free_userptr(hdev, userptr); + } + + return 0; + +mapping_cnt_err: + if (is_userptr) + free_phys_pg_pack(hdev, phys_pg_pack); +vm_type_err: + mutex_lock(&ctx->mem_hash_lock); + hash_add(ctx->mem_hash, &hnode->node, vaddr); + mutex_unlock(&ctx->mem_hash_lock); + + return rc; +} + +int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data) +{ + union hl_mem_args *args = data; + struct hl_device *hdev = hpriv->hdev; + struct hl_ctx *ctx = hpriv->ctx; + u64 device_addr = 0; + u32 handle = 0; + int rc; + + if (hl_device_disabled_or_in_reset(hdev)) { + dev_warn_ratelimited(hdev->dev, + "Device is disabled or in reset. Can't execute memory IOCTL\n"); + return -EBUSY; + } + + if (hdev->mmu_enable) { + switch (args->in.op) { + case HL_MEM_OP_ALLOC: + if (!hdev->dram_supports_virtual_memory) { + dev_err(hdev->dev, + "DRAM alloc is not supported\n"); + rc = -EINVAL; + goto out; + } + if (args->in.alloc.mem_size == 0) { + dev_err(hdev->dev, + "alloc size must be larger than 0\n"); + rc = -EINVAL; + goto out; + } + rc = alloc_device_memory(ctx, &args->in, &handle); + + memset(args, 0, sizeof(*args)); + args->out.handle = (__u64) handle; + break; + + case HL_MEM_OP_FREE: + if (!hdev->dram_supports_virtual_memory) { + dev_err(hdev->dev, + "DRAM free is not supported\n"); + rc = -EINVAL; + goto out; + } + rc = free_device_memory(ctx, args->in.free.handle); + break; + + case HL_MEM_OP_MAP: + rc = map_device_va(ctx, &args->in, &device_addr); + + memset(args, 0, sizeof(*args)); + args->out.device_virt_addr = device_addr; + break; + + case HL_MEM_OP_UNMAP: + rc = unmap_device_va(ctx, + args->in.unmap.device_virt_addr); + break; + + default: + dev_err(hdev->dev, "Unknown opcode for memory IOCTL\n"); + rc = -ENOTTY; + break; + } + } else { + switch (args->in.op) { + case HL_MEM_OP_ALLOC: + if (args->in.alloc.mem_size == 0) { + dev_err(hdev->dev, + "alloc size must be larger than 0\n"); + rc = -EINVAL; + goto out; + } + + /* Force contiguous as there are no real MMU + * translations to overcome physical memory gaps + */ + args->in.flags |= HL_MEM_CONTIGUOUS; + rc = alloc_device_memory(ctx, &args->in, &handle); + + memset(args, 0, sizeof(*args)); + args->out.handle = (__u64) handle; + break; + + case HL_MEM_OP_FREE: + rc = free_device_memory(ctx, args->in.free.handle); + break; + + case HL_MEM_OP_MAP: + if (args->in.flags & HL_MEM_USERPTR) { + device_addr = args->in.map_host.host_virt_addr; + rc = 0; + } else { + rc = get_paddr_from_handle(ctx, &args->in, + &device_addr); + } + + memset(args, 0, sizeof(*args)); + args->out.device_virt_addr = device_addr; + break; + + case HL_MEM_OP_UNMAP: + rc = 0; + break; + + default: + dev_err(hdev->dev, "Unknown opcode for memory IOCTL\n"); + rc = -ENOTTY; + break; + } + } + +out: + return rc; +} /* * hl_pin_host_memory - pins a chunk of host memory @@ -196,3 +1384,332 @@ bool hl_userptr_is_pinned(struct hl_device *hdev, u64 addr, return false; } + +/* + * hl_va_range_init - initialize virtual addresses range + * + * @hdev : pointer to the habanalabs device structure + * @va_range : pointer to the range to initialize + * @start : range start address + * @end : range end address + * + * This function does the following: + * - Initializes the virtual addresses list of the given range with the given + * addresses. + */ +static int hl_va_range_init(struct hl_device *hdev, + struct hl_va_range *va_range, u64 start, u64 end) +{ + int rc; + + INIT_LIST_HEAD(&va_range->list); + + /* PAGE_SIZE alignment */ + + if (start & (PAGE_SIZE - 1)) { + start &= PAGE_MASK; + start += PAGE_SIZE; + } + + if (end & (PAGE_SIZE - 1)) + end &= PAGE_MASK; + + if (start >= end) { + dev_err(hdev->dev, "too small vm range for va list\n"); + return -EFAULT; + } + + rc = add_va_block(hdev, va_range, start, end); + + if (rc) { + dev_err(hdev->dev, "Failed to init host va list\n"); + return rc; + } + + va_range->start_addr = start; + va_range->end_addr = end; + + return 0; +} + +/* + * hl_vm_ctx_init_with_ranges - initialize virtual memory for context + * + * @ctx : pointer to the habanalabs context structure + * @host_range_start : host virtual addresses range start + * @host_range_end : host virtual addresses range end + * @dram_range_start : dram virtual addresses range start + * @dram_range_end : dram virtual addresses range end + * + * This function initializes the following: + * - MMU for context + * - Virtual address to area descriptor hashtable + * - Virtual block list of available virtual memory + */ +int hl_vm_ctx_init_with_ranges(struct hl_ctx *ctx, u64 host_range_start, + u64 host_range_end, u64 dram_range_start, + u64 dram_range_end) +{ + struct hl_device *hdev = ctx->hdev; + int rc; + + hl_mmu_ctx_init(ctx); + + mutex_init(&ctx->mem_hash_lock); + hash_init(ctx->mem_hash); + + mutex_init(&ctx->host_va_range.lock); + + rc = hl_va_range_init(hdev, &ctx->host_va_range, host_range_start, + host_range_end); + if (rc) { + dev_err(hdev->dev, "failed to init host vm range\n"); + goto host_vm_err; + } + + mutex_init(&ctx->dram_va_range.lock); + + rc = hl_va_range_init(hdev, &ctx->dram_va_range, dram_range_start, + dram_range_end); + if (rc) { + dev_err(hdev->dev, "failed to init dram vm range\n"); + goto dram_vm_err; + } + + return 0; + +dram_vm_err: + mutex_destroy(&ctx->dram_va_range.lock); + + mutex_lock(&ctx->host_va_range.lock); + clear_va_list_locked(hdev, &ctx->host_va_range.list); + mutex_unlock(&ctx->host_va_range.lock); +host_vm_err: + mutex_destroy(&ctx->host_va_range.lock); + mutex_destroy(&ctx->mem_hash_lock); + hl_mmu_ctx_fini(ctx); + + return rc; +} + +int hl_vm_ctx_init(struct hl_ctx *ctx) +{ + struct asic_fixed_properties *prop = &ctx->hdev->asic_prop; + u64 host_range_start, host_range_end, dram_range_start, + dram_range_end; + + atomic64_set(&ctx->dram_phys_mem, 0); + + /* + * - If MMU is enabled, init the ranges as usual. + * - If MMU is disabled, in case of host mapping, the returned address + * is the given one. + * In case of DRAM mapping, the returned address is the physical + * address of the memory related to the given handle. + */ + if (ctx->hdev->mmu_enable) { + dram_range_start = prop->va_space_dram_start_address; + dram_range_end = prop->va_space_dram_end_address; + host_range_start = prop->va_space_host_start_address; + host_range_end = prop->va_space_host_end_address; + } else { + dram_range_start = prop->dram_user_base_address; + dram_range_end = prop->dram_end_address; + host_range_start = prop->dram_user_base_address; + host_range_end = prop->dram_end_address; + } + + return hl_vm_ctx_init_with_ranges(ctx, host_range_start, host_range_end, + dram_range_start, dram_range_end); +} + +/* + * hl_va_range_fini - clear a virtual addresses range + * + * @hdev : pointer to the habanalabs structure + * va_range : pointer to virtual addresses range + * + * This function initializes the following: + * - Checks that the given range contains the whole initial range + * - Frees the virtual addresses block list and its lock + */ +static void hl_va_range_fini(struct hl_device *hdev, + struct hl_va_range *va_range) +{ + struct hl_vm_va_block *va_block; + + if (list_empty(&va_range->list)) { + dev_warn(hdev->dev, + "va list should not be empty on cleanup!\n"); + goto out; + } + + if (!list_is_singular(&va_range->list)) { + dev_warn(hdev->dev, + "va list should not contain multiple blocks on cleanup!\n"); + goto free_va_list; + } + + va_block = list_first_entry(&va_range->list, typeof(*va_block), node); + + if (va_block->start != va_range->start_addr || + va_block->end != va_range->end_addr) { + dev_warn(hdev->dev, + "wrong va block on cleanup, from 0x%llx to 0x%llx\n", + va_block->start, va_block->end); + goto free_va_list; + } + +free_va_list: + mutex_lock(&va_range->lock); + clear_va_list_locked(hdev, &va_range->list); + mutex_unlock(&va_range->lock); + +out: + mutex_destroy(&va_range->lock); +} + +/* + * hl_vm_ctx_fini - virtual memory teardown of context + * + * @ctx : pointer to the habanalabs context structure + * + * This function perform teardown the following: + * - Virtual block list of available virtual memory + * - Virtual address to area descriptor hashtable + * - MMU for context + * + * In addition this function does the following: + * - Unmaps the existing hashtable nodes if the hashtable is not empty. The + * hashtable should be empty as no valid mappings should exist at this + * point. + * - Frees any existing physical page list from the idr which relates to the + * current context asid. + * - This function checks the virtual block list for correctness. At this point + * the list should contain one element which describes the whole virtual + * memory range of the context. Otherwise, a warning is printed. + */ +void hl_vm_ctx_fini(struct hl_ctx *ctx) +{ + struct hl_device *hdev = ctx->hdev; + struct hl_vm *vm = &hdev->vm; + struct hl_vm_phys_pg_pack *phys_pg_list; + struct hl_vm_hash_node *hnode; + struct hlist_node *tmp_node; + int i; + + if (!hash_empty(ctx->mem_hash)) + dev_notice(hdev->dev, "ctx is freed while it has va in use\n"); + + hash_for_each_safe(ctx->mem_hash, i, tmp_node, hnode, node) { + dev_dbg(hdev->dev, + "hl_mem_hash_node of vaddr 0x%llx of asid %d is still alive\n", + hnode->vaddr, ctx->asid); + unmap_device_va(ctx, hnode->vaddr); + } + + spin_lock(&vm->idr_lock); + idr_for_each_entry(&vm->phys_pg_pack_handles, phys_pg_list, i) + if (phys_pg_list->asid == ctx->asid) { + dev_dbg(hdev->dev, + "page list 0x%p of asid %d is still alive\n", + phys_pg_list, ctx->asid); + free_phys_pg_pack(hdev, phys_pg_list); + idr_remove(&vm->phys_pg_pack_handles, i); + } + spin_unlock(&vm->idr_lock); + + hl_va_range_fini(hdev, &ctx->dram_va_range); + hl_va_range_fini(hdev, &ctx->host_va_range); + + mutex_destroy(&ctx->mem_hash_lock); + hl_mmu_ctx_fini(ctx); +} + +/* + * hl_vm_init - initialize virtual memory module + * + * @hdev : pointer to the habanalabs device structure + * + * This function initializes the following: + * - MMU module + * - DRAM physical pages pool of 2MB + * - Idr for device memory allocation handles + */ +int hl_vm_init(struct hl_device *hdev) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct hl_vm *vm = &hdev->vm; + int rc; + + rc = hl_mmu_init(hdev); + if (rc) { + dev_err(hdev->dev, "Failed to init MMU\n"); + return rc; + } + + vm->dram_pg_pool = gen_pool_create(__ffs(prop->dram_page_size), -1); + if (!vm->dram_pg_pool) { + dev_err(hdev->dev, "Failed to create dram page pool\n"); + rc = -ENOMEM; + goto pool_create_err; + } + + kref_init(&vm->dram_pg_pool_refcount); + + rc = gen_pool_add(vm->dram_pg_pool, prop->dram_user_base_address, + prop->dram_end_address - prop->dram_user_base_address, + -1); + + if (rc) { + dev_err(hdev->dev, + "Failed to add memory to dram page pool %d\n", rc); + goto pool_add_err; + } + + spin_lock_init(&vm->idr_lock); + idr_init(&vm->phys_pg_pack_handles); + + atomic64_set(&hdev->dram_used_mem, 0); + + vm->init_done = true; + + return 0; + +pool_add_err: + gen_pool_destroy(vm->dram_pg_pool); +pool_create_err: + hl_mmu_fini(hdev); + + return rc; +} + +/* + * hl_vm_fini - virtual memory module teardown + * + * @hdev : pointer to the habanalabs device structure + * + * This function perform teardown to the following: + * - Idr for device memory allocation handles + * - DRAM physical pages pool of 2MB + * - MMU module + */ +void hl_vm_fini(struct hl_device *hdev) +{ + struct hl_vm *vm = &hdev->vm; + + if (!vm->init_done) + return; + + /* + * At this point all the contexts should be freed and hence no DRAM + * memory should be in use. Hence the DRAM pool should be freed here. + */ + if (kref_put(&vm->dram_pg_pool_refcount, dram_pg_pool_do_release) != 1) + dev_warn(hdev->dev, "dram_pg_pool was not destroyed on %s\n", + __func__); + + hl_mmu_fini(hdev); + + vm->init_done = false; +} diff --git a/drivers/misc/habanalabs/mmu.c b/drivers/misc/habanalabs/mmu.c new file mode 100644 index 000000000000..79c70d92e74b --- /dev/null +++ b/drivers/misc/habanalabs/mmu.c @@ -0,0 +1,691 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2016-2019 HabanaLabs, Ltd. + * All Rights Reserved. + */ + +#include "habanalabs.h" +#include "include/hw_ip/mmu/mmu_general.h" + +#include +#include + +static struct pgt_info *get_pgt_info(struct hl_ctx *ctx, u64 addr) +{ + struct pgt_info *pgt_info = NULL; + + hash_for_each_possible(ctx->mmu_hash, pgt_info, node, + (unsigned long) addr) + if (addr == pgt_info->addr) + break; + + return pgt_info; +} + +static void free_hop(struct hl_ctx *ctx, u64 hop_addr) +{ + struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr); + + gen_pool_free(pgt_info->ctx->hdev->mmu_pgt_pool, pgt_info->addr, + ctx->hdev->asic_prop.mmu_hop_table_size); + hash_del(&pgt_info->node); + + kfree(pgt_info); +} + +static u64 alloc_hop(struct hl_ctx *ctx) +{ + struct hl_device *hdev = ctx->hdev; + struct pgt_info *pgt_info; + u64 addr; + + pgt_info = kmalloc(sizeof(*pgt_info), GFP_KERNEL); + if (!pgt_info) + return ULLONG_MAX; + + addr = (u64) gen_pool_alloc(hdev->mmu_pgt_pool, + hdev->asic_prop.mmu_hop_table_size); + if (!addr) { + dev_err(hdev->dev, "failed to allocate page\n"); + kfree(pgt_info); + return ULLONG_MAX; + } + + pgt_info->addr = addr; + pgt_info->ctx = ctx; + pgt_info->num_of_ptes = 0; + hash_add(ctx->mmu_hash, &pgt_info->node, addr); + + return addr; +} + +static inline void clear_pte(struct hl_device *hdev, u64 pte_addr) +{ + /* clear the last and present bits */ + hdev->asic_funcs->write_pte(hdev, pte_addr, 0); +} + +static inline void get_pte(struct hl_ctx *ctx, u64 hop_addr) +{ + get_pgt_info(ctx, hop_addr)->num_of_ptes++; +} + +/* + * put_pte - decrement the num of ptes and free the hop if possible + * + * @ctx: pointer to the context structure + * @hop_addr: addr of the hop + * + * This function returns the number of ptes left on this hop. If the number is + * 0, it means the pte was freed. + */ +static inline int put_pte(struct hl_ctx *ctx, u64 hop_addr) +{ + struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr); + int num_of_ptes_left; + + pgt_info->num_of_ptes--; + + /* + * Need to save the number of ptes left because free_hop might free + * the pgt_info + */ + num_of_ptes_left = pgt_info->num_of_ptes; + if (!num_of_ptes_left) + free_hop(ctx, hop_addr); + + return num_of_ptes_left; +} + +static inline u64 get_hop0_addr(struct hl_ctx *ctx) +{ + return ctx->hdev->asic_prop.mmu_pgt_addr + + (ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size); +} + +static inline u64 get_hopN_pte_addr(struct hl_ctx *ctx, u64 hop_addr, + u64 virt_addr, u64 mask, u64 shift) +{ + return hop_addr + ctx->hdev->asic_prop.mmu_pte_size * + ((virt_addr & mask) >> shift); +} + +static inline u64 get_hop0_pte_addr(struct hl_ctx *ctx, u64 hop_addr, u64 vaddr) +{ + return get_hopN_pte_addr(ctx, hop_addr, vaddr, HOP0_MASK, HOP0_SHIFT); +} + +static inline u64 get_hop1_pte_addr(struct hl_ctx *ctx, u64 hop_addr, u64 vaddr) +{ + return get_hopN_pte_addr(ctx, hop_addr, vaddr, HOP1_MASK, HOP1_SHIFT); +} + +static inline u64 get_hop2_pte_addr(struct hl_ctx *ctx, u64 hop_addr, u64 vaddr) +{ + return get_hopN_pte_addr(ctx, hop_addr, vaddr, HOP2_MASK, HOP2_SHIFT); +} + +static inline u64 get_hop3_pte_addr(struct hl_ctx *ctx, u64 hop_addr, u64 vaddr) +{ + return get_hopN_pte_addr(ctx, hop_addr, vaddr, HOP3_MASK, HOP3_SHIFT); +} + +static inline u64 get_hop4_pte_addr(struct hl_ctx *ctx, u64 hop_addr, u64 vaddr) +{ + return get_hopN_pte_addr(ctx, hop_addr, vaddr, HOP4_MASK, HOP4_SHIFT); +} + +static inline u64 get_next_hop_addr(u64 curr_pte) +{ + if (curr_pte & PAGE_PRESENT_MASK) + return curr_pte & PHYS_ADDR_MASK; + else + return ULLONG_MAX; +} + +static inline u64 get_alloc_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte, + bool *is_new_hop) +{ + u64 hop_addr = get_next_hop_addr(curr_pte); + + if (hop_addr == ULLONG_MAX) { + hop_addr = alloc_hop(ctx); + *is_new_hop = true; + } + + return hop_addr; +} + +/* + * hl_mmu_init - init the mmu module + * + * @hdev: pointer to the habanalabs device structure + * + * This function does the following: + * - Allocate max_asid zeroed hop0 pgts so no mapping is available + * - Enable mmu in hw + * - Invalidate the mmu cache + * - Create a pool of pages for pgts + * - Returns 0 on success + * + * This function depends on DMA QMAN to be working! + */ +int hl_mmu_init(struct hl_device *hdev) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + int rc; + + if (!hdev->mmu_enable) + return 0; + + /* MMU HW init was already done in device hw_init() */ + + mutex_init(&hdev->mmu_cache_lock); + + hdev->mmu_pgt_pool = + gen_pool_create(__ffs(prop->mmu_hop_table_size), -1); + + if (!hdev->mmu_pgt_pool) { + dev_err(hdev->dev, "Failed to create page gen pool\n"); + rc = -ENOMEM; + goto err_pool_create; + } + + rc = gen_pool_add(hdev->mmu_pgt_pool, prop->mmu_pgt_addr + + prop->mmu_hop0_tables_total_size, + prop->mmu_pgt_size - prop->mmu_hop0_tables_total_size, + -1); + if (rc) { + dev_err(hdev->dev, "Failed to add memory to page gen pool\n"); + goto err_pool_add; + } + + return 0; + +err_pool_add: + gen_pool_destroy(hdev->mmu_pgt_pool); +err_pool_create: + mutex_destroy(&hdev->mmu_cache_lock); + + return rc; +} + +/* + * hl_mmu_fini - release the mmu module. + * + * @hdev: pointer to the habanalabs device structure + * + * This function does the following: + * - Disable mmu in hw + * - free the pgts pool + * + * All ctxs should be freed before calling this func + */ +void hl_mmu_fini(struct hl_device *hdev) +{ + if (!hdev->mmu_enable) + return; + + gen_pool_destroy(hdev->mmu_pgt_pool); + + mutex_destroy(&hdev->mmu_cache_lock); + + /* MMU HW fini will be done in device hw_fini() */ +} + +/* + * hl_mmu_ctx_init - init a ctx for using the mmu module + * + * @ctx: pointer to the context structure + * + * This function does the following: + * - Init a mutex to protect the concurrent mapping flow + * - Init a hash to hold all pgts related to this ctx + */ +void hl_mmu_ctx_init(struct hl_ctx *ctx) +{ + if (!ctx->hdev->mmu_enable) + return; + + mutex_init(&ctx->mmu_lock); + hash_init(ctx->mmu_hash); +} + +/* + * hl_mmu_ctx_fini - disable a ctx from using the mmu module + * + * @ctx: pointer to the context structure + * + * This function does the following: + * - Free any pgts which were not freed yet + * - Free the mutex + */ +void hl_mmu_ctx_fini(struct hl_ctx *ctx) +{ + struct pgt_info *pgt_info; + struct hlist_node *tmp; + int i; + + if (!ctx->hdev->mmu_enable) + return; + + if (!hash_empty(ctx->mmu_hash)) + dev_err(ctx->hdev->dev, + "ctx is freed while it has pgts in use\n"); + + hash_for_each_safe(ctx->mmu_hash, i, tmp, pgt_info, node) { + dev_err(ctx->hdev->dev, + "pgt_info of addr 0x%llx of asid %d was not destroyed, num_ptes: %d\n", + pgt_info->addr, ctx->asid, pgt_info->num_of_ptes); + free_hop(ctx, pgt_info->addr); + } + + mutex_destroy(&ctx->mmu_lock); +} + +static int _hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr) +{ + struct hl_device *hdev = ctx->hdev; + u64 hop0_addr = 0, hop0_pte_addr = 0, + hop1_addr = 0, hop1_pte_addr = 0, + hop2_addr = 0, hop2_pte_addr = 0, + hop3_addr = 0, hop3_pte_addr = 0, + hop4_addr = 0, hop4_pte_addr = 0, + curr_pte; + int clear_hop3 = 1; + + hop0_addr = get_hop0_addr(ctx); + + hop0_pte_addr = get_hop0_pte_addr(ctx, hop0_addr, virt_addr); + + curr_pte = hdev->asic_funcs->read_pte(hdev, hop0_pte_addr); + + hop1_addr = get_next_hop_addr(curr_pte); + + if (hop1_addr == ULLONG_MAX) + goto not_mapped; + + hop1_pte_addr = get_hop1_pte_addr(ctx, hop1_addr, virt_addr); + + curr_pte = hdev->asic_funcs->read_pte(hdev, hop1_pte_addr); + + hop2_addr = get_next_hop_addr(curr_pte); + + if (hop2_addr == ULLONG_MAX) + goto not_mapped; + + hop2_pte_addr = get_hop2_pte_addr(ctx, hop2_addr, virt_addr); + + curr_pte = hdev->asic_funcs->read_pte(hdev, hop2_pte_addr); + + hop3_addr = get_next_hop_addr(curr_pte); + + if (hop3_addr == ULLONG_MAX) + goto not_mapped; + + hop3_pte_addr = get_hop3_pte_addr(ctx, hop3_addr, virt_addr); + + curr_pte = hdev->asic_funcs->read_pte(hdev, hop3_pte_addr); + + if (!(curr_pte & LAST_MASK)) { + hop4_addr = get_next_hop_addr(curr_pte); + + if (hop4_addr == ULLONG_MAX) + goto not_mapped; + + hop4_pte_addr = get_hop4_pte_addr(ctx, hop4_addr, virt_addr); + + curr_pte = hdev->asic_funcs->read_pte(hdev, hop4_pte_addr); + + clear_hop3 = 0; + } + + if (!(curr_pte & PAGE_PRESENT_MASK)) + goto not_mapped; + + clear_pte(hdev, hop4_addr ? hop4_pte_addr : hop3_pte_addr); + + if (hop4_addr && !put_pte(ctx, hop4_addr)) + clear_hop3 = 1; + + if (!clear_hop3) + goto flush; + clear_pte(hdev, hop3_pte_addr); + + if (put_pte(ctx, hop3_addr)) + goto flush; + clear_pte(hdev, hop2_pte_addr); + + if (put_pte(ctx, hop2_addr)) + goto flush; + clear_pte(hdev, hop1_pte_addr); + + if (put_pte(ctx, hop1_addr)) + goto flush; + clear_pte(hdev, hop0_pte_addr); + +flush: + /* flush all writes from all cores to reach PCI */ + mb(); + + hdev->asic_funcs->read_pte(hdev, + hop4_addr ? hop4_pte_addr : hop3_pte_addr); + + return 0; + +not_mapped: + dev_err(hdev->dev, "virt addr 0x%llx is not mapped to phys addr\n", + virt_addr); + + return -EINVAL; +} + +/* + * hl_mmu_unmap - unmaps a virtual addr + * + * @ctx: pointer to the context structure + * @virt_addr: virt addr to map from + * @page_size: size of the page to unmap + * + * This function does the following: + * - Check that the virt addr is mapped + * - Unmap the virt addr and frees pgts if possible + * - Returns 0 on success, -EINVAL if the given addr is not mapped + * + * Because this function changes the page tables in the device and because it + * changes the MMU hash, it must be protected by a lock. + * However, because it maps only a single page, the lock should be implemented + * in a higher level in order to protect the entire mapping of the memory area + */ +int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size) +{ + struct hl_device *hdev = ctx->hdev; + u64 real_virt_addr; + u32 real_page_size, npages; + int i, rc; + + if (!hdev->mmu_enable) + return 0; + + /* + * The H/W handles mapping of 4KB/2MB page. Hence if the host page size + * is bigger, we break it to sub-pages and unmap them separately. + */ + if ((page_size % PAGE_SIZE_2MB) == 0) { + real_page_size = PAGE_SIZE_2MB; + } else if ((page_size % PAGE_SIZE_4KB) == 0) { + real_page_size = PAGE_SIZE_4KB; + } else { + dev_err(hdev->dev, + "page size of %u is not 4KB nor 2MB aligned, can't unmap\n", + page_size); + + return -EFAULT; + } + + npages = page_size / real_page_size; + real_virt_addr = virt_addr; + + for (i = 0 ; i < npages ; i++) { + rc = _hl_mmu_unmap(ctx, real_virt_addr); + if (rc) + return rc; + + real_virt_addr += real_page_size; + } + + return 0; +} + +static int _hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, + u32 page_size) +{ + struct hl_device *hdev = ctx->hdev; + u64 hop0_addr = 0, hop0_pte_addr = 0, + hop1_addr = 0, hop1_pte_addr = 0, + hop2_addr = 0, hop2_pte_addr = 0, + hop3_addr = 0, hop3_pte_addr = 0, + hop4_addr = 0, hop4_pte_addr = 0, + curr_pte = 0; + bool hop1_new = false, hop2_new = false, hop3_new = false, + hop4_new = false, is_huge; + int rc = -ENOMEM; + + /* + * This mapping function can map a 4KB/2MB page. For 2MB page there are + * only 3 hops rather than 4. Currently the DRAM allocation uses 2MB + * pages only but user memory could have been allocated with one of the + * two page sizes. Since this is a common code for all the three cases, + * we need this hugs page check. + */ + is_huge = page_size == PAGE_SIZE_2MB; + + hop0_addr = get_hop0_addr(ctx); + + hop0_pte_addr = get_hop0_pte_addr(ctx, hop0_addr, virt_addr); + + curr_pte = hdev->asic_funcs->read_pte(hdev, hop0_pte_addr); + + hop1_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop1_new); + + if (hop1_addr == ULLONG_MAX) + goto err; + + hop1_pte_addr = get_hop1_pte_addr(ctx, hop1_addr, virt_addr); + + curr_pte = hdev->asic_funcs->read_pte(hdev, hop1_pte_addr); + + hop2_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop2_new); + + if (hop2_addr == ULLONG_MAX) + goto err; + + hop2_pte_addr = get_hop2_pte_addr(ctx, hop2_addr, virt_addr); + + curr_pte = hdev->asic_funcs->read_pte(hdev, hop2_pte_addr); + + hop3_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop3_new); + + if (hop3_addr == ULLONG_MAX) + goto err; + + hop3_pte_addr = get_hop3_pte_addr(ctx, hop3_addr, virt_addr); + + curr_pte = hdev->asic_funcs->read_pte(hdev, hop3_pte_addr); + + if (!is_huge) { + hop4_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop4_new); + + if (hop4_addr == ULLONG_MAX) + goto err; + + hop4_pte_addr = get_hop4_pte_addr(ctx, hop4_addr, virt_addr); + + curr_pte = hdev->asic_funcs->read_pte(hdev, hop4_pte_addr); + } + + if (curr_pte & PAGE_PRESENT_MASK) { + dev_err(hdev->dev, + "mapping already exists for virt_addr 0x%llx\n", + virt_addr); + + dev_dbg(hdev->dev, "hop0 pte: 0x%llx (0x%llx)\n", + hdev->asic_funcs->read_pte(hdev, hop0_pte_addr), + hop0_pte_addr); + dev_dbg(hdev->dev, "hop1 pte: 0x%llx (0x%llx)\n", + hdev->asic_funcs->read_pte(hdev, hop1_pte_addr), + hop1_pte_addr); + dev_dbg(hdev->dev, "hop2 pte: 0x%llx (0x%llx)\n", + hdev->asic_funcs->read_pte(hdev, hop2_pte_addr), + hop2_pte_addr); + dev_dbg(hdev->dev, "hop3 pte: 0x%llx (0x%llx)\n", + hdev->asic_funcs->read_pte(hdev, hop3_pte_addr), + hop3_pte_addr); + + if (!is_huge) + dev_dbg(hdev->dev, "hop4 pte: 0x%llx (0x%llx)\n", + hdev->asic_funcs->read_pte(hdev, + hop4_pte_addr), + hop4_pte_addr); + + rc = EINVAL; + goto err; + } + + curr_pte = (phys_addr & PTE_PHYS_ADDR_MASK) | LAST_MASK + | PAGE_PRESENT_MASK; + + hdev->asic_funcs->write_pte(hdev, + is_huge ? hop3_pte_addr : hop4_pte_addr, + curr_pte); + + if (hop1_new) { + curr_pte = (hop1_addr & PTE_PHYS_ADDR_MASK) | + PAGE_PRESENT_MASK; + ctx->hdev->asic_funcs->write_pte(ctx->hdev, hop0_pte_addr, + curr_pte); + } + if (hop2_new) { + curr_pte = (hop2_addr & PTE_PHYS_ADDR_MASK) | + PAGE_PRESENT_MASK; + ctx->hdev->asic_funcs->write_pte(ctx->hdev, hop1_pte_addr, + curr_pte); + get_pte(ctx, hop1_addr); + } + if (hop3_new) { + curr_pte = (hop3_addr & PTE_PHYS_ADDR_MASK) | + PAGE_PRESENT_MASK; + ctx->hdev->asic_funcs->write_pte(ctx->hdev, hop2_pte_addr, + curr_pte); + get_pte(ctx, hop2_addr); + } + + if (!is_huge) { + if (hop4_new) { + curr_pte = (hop4_addr & PTE_PHYS_ADDR_MASK) | + PAGE_PRESENT_MASK; + ctx->hdev->asic_funcs->write_pte(ctx->hdev, + hop3_pte_addr, curr_pte); + get_pte(ctx, hop3_addr); + } + + get_pte(ctx, hop4_addr); + } else { + get_pte(ctx, hop3_addr); + } + + /* flush all writes from all cores to reach PCI */ + mb(); + + hdev->asic_funcs->read_pte(hdev, + is_huge ? hop3_pte_addr : hop4_pte_addr); + + return 0; + +err: + if (hop4_new) + free_hop(ctx, hop4_addr); + if (hop3_new) + free_hop(ctx, hop3_addr); + if (hop2_new) + free_hop(ctx, hop2_addr); + if (hop1_new) + free_hop(ctx, hop1_addr); + + return rc; +} + +/* + * hl_mmu_map - maps a virtual addr to physical addr + * + * @ctx: pointer to the context structure + * @virt_addr: virt addr to map from + * @phys_addr: phys addr to map to + * @page_size: physical page size + * + * This function does the following: + * - Check that the virt addr is not mapped + * - Allocate pgts as necessary in order to map the virt addr to the phys + * - Returns 0 on success, -EINVAL if addr is already mapped, or -ENOMEM. + * + * Because this function changes the page tables in the device and because it + * changes the MMU hash, it must be protected by a lock. + * However, because it maps only a single page, the lock should be implemented + * in a higher level in order to protect the entire mapping of the memory area + */ +int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size) +{ + struct hl_device *hdev = ctx->hdev; + u64 real_virt_addr; + u32 real_page_size, npages; + int i, rc, mapped_cnt = 0; + + if (!hdev->mmu_enable) + return 0; + + /* + * The H/W handles mapping of 4KB/2MB page. Hence if the host page size + * is bigger, we break it to sub-pages and map them separately. + */ + if ((page_size % PAGE_SIZE_2MB) == 0) { + real_page_size = PAGE_SIZE_2MB; + } else if ((page_size % PAGE_SIZE_4KB) == 0) { + real_page_size = PAGE_SIZE_4KB; + } else { + dev_err(hdev->dev, + "page size of %u is not 4KB nor 2MB aligned, can't map\n", + page_size); + + return -EFAULT; + } + + npages = page_size / real_page_size; + real_virt_addr = virt_addr; + + for (i = 0 ; i < npages ; i++) { + rc = _hl_mmu_map(ctx, real_virt_addr, phys_addr, + real_page_size); + if (rc) + goto err; + + real_virt_addr += real_page_size; + mapped_cnt++; + } + + return 0; + +err: + real_virt_addr = virt_addr; + for (i = 0 ; i < mapped_cnt ; i++) { + if (_hl_mmu_unmap(ctx, real_virt_addr)) + dev_warn_ratelimited(hdev->dev, + "failed to unmap va: 0x%llx\n", real_virt_addr); + + real_virt_addr += real_page_size; + } + + return rc; +} + +/* + * hl_mmu_swap_out - marks all mapping of the given ctx as swapped out + * + * @ctx: pointer to the context structure + * + */ +void hl_mmu_swap_out(struct hl_ctx *ctx) +{ + +} + +/* + * hl_mmu_swap_in - marks all mapping of the given ctx as swapped in + * + * @ctx: pointer to the context structure + * + */ +void hl_mmu_swap_in(struct hl_ctx *ctx) +{ + +} diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index fba49417f607..9015043887d1 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -162,6 +162,108 @@ union hl_wait_cs_args { struct hl_wait_cs_out out; }; +/* Opcode to alloc device memory */ +#define HL_MEM_OP_ALLOC 0 +/* Opcode to free previously allocated device memory */ +#define HL_MEM_OP_FREE 1 +/* Opcode to map host memory */ +#define HL_MEM_OP_MAP 2 +/* Opcode to unmap previously mapped host memory */ +#define HL_MEM_OP_UNMAP 3 + +/* Memory flags */ +#define HL_MEM_CONTIGUOUS 0x1 +#define HL_MEM_SHARED 0x2 +#define HL_MEM_USERPTR 0x4 + +struct hl_mem_in { + union { + /* HL_MEM_OP_ALLOC- allocate device memory */ + struct { + /* Size to alloc */ + __u32 mem_size; + __u32 pad; + } alloc; + + /* HL_MEM_OP_FREE - free device memory */ + struct { + /* Handle returned from HL_MEM_OP_ALLOC */ + __u64 handle; + } free; + + /* HL_MEM_OP_MAP - map device memory */ + struct { + /* + * Requested virtual address of mapped memory. + * KMD will try to map the requested region to this + * hint address, as long as the address is valid and + * not already mapped. The user should check the + * returned address of the IOCTL to make sure he got + * the hint address. Passing 0 here means that KMD + * will choose the address itself. + */ + __u64 hint_addr; + /* Handle returned from HL_MEM_OP_ALLOC */ + __u64 handle; + } map_device; + + /* HL_MEM_OP_MAP - map host memory */ + struct { + /* Address of allocated host memory */ + __u64 host_virt_addr; + /* + * Requested virtual address of mapped memory. + * KMD will try to map the requested region to this + * hint address, as long as the address is valid and + * not already mapped. The user should check the + * returned address of the IOCTL to make sure he got + * the hint address. Passing 0 here means that KMD + * will choose the address itself. + */ + __u64 hint_addr; + /* Size of allocated host memory */ + __u32 mem_size; + __u32 pad; + } map_host; + + /* HL_MEM_OP_UNMAP - unmap host memory */ + struct { + /* Virtual address returned from HL_MEM_OP_MAP */ + __u64 device_virt_addr; + } unmap; + }; + + /* HL_MEM_OP_* */ + __u32 op; + /* HL_MEM_* flags */ + __u32 flags; + /* Context ID - Currently not in use */ + __u32 ctx_id; + __u32 pad; +}; + +struct hl_mem_out { + union { + /* + * Used for HL_MEM_OP_MAP as the virtual address that was + * assigned in the device VA space. + * A value of 0 means the requested operation failed. + */ + __u64 device_virt_addr; + + /* + * Used for HL_MEM_OP_ALLOC. This is the assigned + * handle for the allocated memory + */ + __u64 handle; + }; +}; + +union hl_mem_args { + struct hl_mem_in in; + struct hl_mem_out out; +}; + /* * Command Buffer * - Request a Command Buffer @@ -245,7 +347,25 @@ union hl_wait_cs_args { #define HL_IOCTL_WAIT_CS \ _IOWR('H', 0x04, union hl_wait_cs_args) +/* + * Memory + * - Map host memory to device MMU + * - Unmap host memory from device MMU + * + * This IOCTL allows the user to map host memory to the device MMU + * + * For host memory, the IOCTL doesn't allocate memory. The user is supposed + * to allocate the memory in user-space (malloc/new). The driver pins the + * physical pages (up to the allowed limit by the OS), assigns a virtual + * address in the device VA space and initializes the device MMU. + * + * There is an option for the user to specify the requested virtual address. + * + */ +#define HL_IOCTL_MEMORY \ + _IOWR('H', 0x05, union hl_mem_args) + #define HL_COMMAND_START 0x02 -#define HL_COMMAND_END 0x05 +#define HL_COMMAND_END 0x06 #endif /* HABANALABS_H_ */ -- cgit v1.3-7-g2ca7 From d8dd7b0a81cc192ef5d30ec76ed6f6d35a1a7cf5 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Sat, 16 Feb 2019 00:39:23 +0200 Subject: habanalabs: implement INFO IOCTL This patch implements the INFO IOCTL. That IOCTL is used by the user to query information that is relevant/needed by the user in order to submit deep learning jobs to Goya. The information is divided into several categories, such as H/W IP, Events that happened, DDR usage and more. Reviewed-by: Mike Rapoport Signed-off-by: Oded Gabbay Signed-off-by: Greg Kroah-Hartman --- drivers/misc/habanalabs/goya/goya.c | 6 ++ drivers/misc/habanalabs/habanalabs.h | 2 + drivers/misc/habanalabs/habanalabs_ioctl.c | 126 +++++++++++++++++++++++++++++ include/uapi/misc/habanalabs.h | 75 ++++++++++++++++- 4 files changed, 208 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 89b82b989966..bf3f76f1aeae 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -5131,6 +5131,11 @@ static void goya_hw_queues_unlock(struct hl_device *hdev) spin_unlock(&goya->hw_queues_lock); } +static u32 goya_get_pci_id(struct hl_device *hdev) +{ + return hdev->pdev->device; +} + int goya_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size) { struct goya_device *goya = hdev->asic_specific; @@ -5232,6 +5237,7 @@ static const struct hl_asic_funcs goya_funcs = { .soft_reset_late_init = goya_soft_reset_late_init, .hw_queues_lock = goya_hw_queues_lock, .hw_queues_unlock = goya_hw_queues_unlock, + .get_pci_id = goya_get_pci_id, .get_eeprom_data = goya_get_eeprom_data, .send_cpu_message = goya_send_cpu_message, .get_hw_state = goya_get_hw_state diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index 03085e7a12dd..02de4a2cab27 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -470,6 +470,7 @@ enum hl_pll_frequency { * @soft_reset_late_init: perform certain actions needed after soft reset. * @hw_queues_lock: acquire H/W queues lock. * @hw_queues_unlock: release H/W queues lock. + * @get_pci_id: retrieve PCI ID. * @get_eeprom_data: retrieve EEPROM data from F/W. * @send_cpu_message: send buffer to ArmCP. * @get_hw_state: retrieve the H/W state @@ -539,6 +540,7 @@ struct hl_asic_funcs { int (*soft_reset_late_init)(struct hl_device *hdev); void (*hw_queues_lock)(struct hl_device *hdev); void (*hw_queues_unlock)(struct hl_device *hdev); + u32 (*get_pci_id)(struct hl_device *hdev); int (*get_eeprom_data)(struct hl_device *hdev, void *data, size_t max_size); int (*send_cpu_message)(struct hl_device *hdev, u32 *msg, diff --git a/drivers/misc/habanalabs/habanalabs_ioctl.c b/drivers/misc/habanalabs/habanalabs_ioctl.c index 6e4dc5b5e696..12408d3302e9 100644 --- a/drivers/misc/habanalabs/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/habanalabs_ioctl.c @@ -12,10 +12,136 @@ #include #include +static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args) +{ + struct hl_info_hw_ip_info hw_ip = {0}; + u32 size = args->return_size; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + struct asic_fixed_properties *prop = &hdev->asic_prop; + u64 sram_kmd_size, dram_kmd_size; + + if ((!size) || (!out)) + return -EINVAL; + + sram_kmd_size = (prop->sram_user_base_address - + prop->sram_base_address); + dram_kmd_size = (prop->dram_user_base_address - + prop->dram_base_address); + + hw_ip.device_id = hdev->asic_funcs->get_pci_id(hdev); + hw_ip.sram_base_address = prop->sram_user_base_address; + hw_ip.dram_base_address = prop->dram_user_base_address; + hw_ip.tpc_enabled_mask = prop->tpc_enabled_mask; + hw_ip.sram_size = prop->sram_size - sram_kmd_size; + hw_ip.dram_size = prop->dram_size - dram_kmd_size; + if (hw_ip.dram_size > 0) + hw_ip.dram_enabled = 1; + hw_ip.num_of_events = prop->num_of_events; + memcpy(hw_ip.armcp_version, + prop->armcp_info.armcp_version, VERSION_MAX_LEN); + hw_ip.armcp_cpld_version = prop->armcp_info.cpld_version; + hw_ip.psoc_pci_pll_nr = prop->psoc_pci_pll_nr; + hw_ip.psoc_pci_pll_nf = prop->psoc_pci_pll_nf; + hw_ip.psoc_pci_pll_od = prop->psoc_pci_pll_od; + hw_ip.psoc_pci_pll_div_factor = prop->psoc_pci_pll_div_factor; + + return copy_to_user(out, &hw_ip, + min((size_t)size, sizeof(hw_ip))) ? -EFAULT : 0; +} + +static int hw_events_info(struct hl_device *hdev, struct hl_info_args *args) +{ + u32 size, max_size = args->return_size; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + void *arr; + + if ((!max_size) || (!out)) + return -EINVAL; + + arr = hdev->asic_funcs->get_events_stat(hdev, &size); + + return copy_to_user(out, arr, min(max_size, size)) ? -EFAULT : 0; +} + +static int dram_usage_info(struct hl_device *hdev, struct hl_info_args *args) +{ + struct hl_info_dram_usage dram_usage = {0}; + u32 max_size = args->return_size; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + struct asic_fixed_properties *prop = &hdev->asic_prop; + u64 dram_kmd_size; + + if ((!max_size) || (!out)) + return -EINVAL; + + dram_kmd_size = (prop->dram_user_base_address - + prop->dram_base_address); + dram_usage.dram_free_mem = (prop->dram_size - dram_kmd_size) - + atomic64_read(&hdev->dram_used_mem); + dram_usage.ctx_dram_mem = atomic64_read(&hdev->user_ctx->dram_phys_mem); + + return copy_to_user(out, &dram_usage, + min((size_t) max_size, sizeof(dram_usage))) ? -EFAULT : 0; +} + +static int hw_idle(struct hl_device *hdev, struct hl_info_args *args) +{ + struct hl_info_hw_idle hw_idle = {0}; + u32 max_size = args->return_size; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + + if ((!max_size) || (!out)) + return -EINVAL; + + hw_idle.is_idle = hdev->asic_funcs->is_device_idle(hdev); + + return copy_to_user(out, &hw_idle, + min((size_t) max_size, sizeof(hw_idle))) ? -EFAULT : 0; +} + +static int hl_info_ioctl(struct hl_fpriv *hpriv, void *data) +{ + struct hl_info_args *args = data; + struct hl_device *hdev = hpriv->hdev; + int rc; + + if (hl_device_disabled_or_in_reset(hdev)) { + dev_err(hdev->dev, + "Device is disabled or in reset. Can't execute INFO IOCTL\n"); + return -EBUSY; + } + + switch (args->op) { + case HL_INFO_HW_IP_INFO: + rc = hw_ip_info(hdev, args); + break; + + case HL_INFO_HW_EVENTS: + rc = hw_events_info(hdev, args); + break; + + case HL_INFO_DRAM_USAGE: + rc = dram_usage_info(hdev, args); + break; + + case HL_INFO_HW_IDLE: + rc = hw_idle(hdev, args); + break; + + default: + dev_err(hdev->dev, "Invalid request %d\n", args->op); + rc = -ENOTTY; + break; + } + + return rc; +} + #define HL_IOCTL_DEF(ioctl, _func) \ [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func} static const struct hl_ioctl_desc hl_ioctls[] = { + HL_IOCTL_DEF(HL_IOCTL_INFO, hl_info_ioctl), HL_IOCTL_DEF(HL_IOCTL_CB, hl_cb_ioctl), HL_IOCTL_DEF(HL_IOCTL_CS, hl_cs_ioctl), HL_IOCTL_DEF(HL_IOCTL_WAIT_CS, hl_cs_wait_ioctl), diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index 9015043887d1..4afc1891ece8 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -45,6 +45,62 @@ enum goya_queue_id { GOYA_QUEUE_ID_SIZE }; +/* Opcode for management ioctl */ +#define HL_INFO_HW_IP_INFO 0 +#define HL_INFO_HW_EVENTS 1 +#define HL_INFO_DRAM_USAGE 2 +#define HL_INFO_HW_IDLE 3 + +#define HL_INFO_VERSION_MAX_LEN 128 + +struct hl_info_hw_ip_info { + __u64 sram_base_address; + __u64 dram_base_address; + __u64 dram_size; + __u32 sram_size; + __u32 num_of_events; + __u32 device_id; /* PCI Device ID */ + __u32 reserved[3]; + __u32 armcp_cpld_version; + __u32 psoc_pci_pll_nr; + __u32 psoc_pci_pll_nf; + __u32 psoc_pci_pll_od; + __u32 psoc_pci_pll_div_factor; + __u8 tpc_enabled_mask; + __u8 dram_enabled; + __u8 pad[2]; + __u8 armcp_version[HL_INFO_VERSION_MAX_LEN]; +}; + +struct hl_info_dram_usage { + __u64 dram_free_mem; + __u64 ctx_dram_mem; +}; + +struct hl_info_hw_idle { + __u32 is_idle; + __u32 pad; +}; + +struct hl_info_args { + /* Location of relevant struct in userspace */ + __u64 return_pointer; + /* + * The size of the return value. Just like "size" in "snprintf", + * it limits how many bytes the kernel can write + * + * For hw_events array, the size should be + * hl_info_hw_ip_info.num_of_events * sizeof(__u32) + */ + __u32 return_size; + + /* HL_INFO_* */ + __u32 op; + + /* Context ID - Currently not in use */ + __u32 ctx_id; + __u32 pad; +}; /* Opcode to create a new command buffer */ #define HL_CB_OP_CREATE 0 @@ -264,6 +320,23 @@ union hl_mem_args { struct hl_mem_out out; }; +/* + * Various information operations such as: + * - H/W IP information + * - Current dram usage + * + * The user calls this IOCTL with an opcode that describes the required + * information. The user should supply a pointer to a user-allocated memory + * chunk, which will be filled by the driver with the requested information. + * + * The user supplies the maximum amount of size to copy into the user's memory, + * in order to prevent data corruption in case of differences between the + * definitions of structures in kernel and userspace, e.g. in case of old + * userspace and new kernel driver + */ +#define HL_IOCTL_INFO \ + _IOWR('H', 0x01, struct hl_info_args) + /* * Command Buffer * - Request a Command Buffer @@ -365,7 +438,7 @@ union hl_mem_args { #define HL_IOCTL_MEMORY \ _IOWR('H', 0x05, union hl_mem_args) -#define HL_COMMAND_START 0x02 +#define HL_COMMAND_START 0x01 #define HL_COMMAND_END 0x06 #endif /* HABANALABS_H_ */ -- cgit v1.3-7-g2ca7 From 0d0216c03a7a14e121abb2e3eb38e491767c36e8 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Wed, 16 May 2018 11:18:48 +0300 Subject: compat ABI: use non-compat openat and open_by_handle_at variants The only difference between native and compat openat and open_by_handle_at is that non-compat version forces O_LARGEFILE, and it should be the default behaviour for all architectures, as we are going to drop the support of 32-bit userspace off_t. Signed-off-by: Yury Norov Signed-off-by: Yury Norov Signed-off-by: Arnd Bergmann --- include/uapi/asm-generic/unistd.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index acf9a07ab2ff..b928eff3bf92 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -179,7 +179,7 @@ __SYSCALL(__NR_fchownat, sys_fchownat) #define __NR_fchown 55 __SYSCALL(__NR_fchown, sys_fchown) #define __NR_openat 56 -__SC_COMP(__NR_openat, sys_openat, compat_sys_openat) +__SYSCALL(__NR_openat, sys_openat) #define __NR_close 57 __SYSCALL(__NR_close, sys_close) #define __NR_vhangup 58 @@ -678,8 +678,7 @@ __SYSCALL(__NR_fanotify_mark, sys_fanotify_mark) #define __NR_name_to_handle_at 264 __SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at) #define __NR_open_by_handle_at 265 -__SC_COMP(__NR_open_by_handle_at, sys_open_by_handle_at, \ - compat_sys_open_by_handle_at) +__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at) #define __NR_clock_adjtime 266 __SC_3264(__NR_clock_adjtime, sys_clock_adjtime32, sys_clock_adjtime) #define __NR_syncfs 267 -- cgit v1.3-7-g2ca7 From 746c9398f5ac2b3f5730da4ed09e99ef4bb50b4a Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 8 Feb 2019 01:02:55 -0500 Subject: arch: move common mmap flags to linux/mman.h Now that we have 3 mmap flags shared by all architectures, let's move them into the common header. This will help discourage future architectures from duplicating code. Signed-off-by: Michael S. Tsirkin Signed-off-by: Arnd Bergmann --- arch/alpha/include/uapi/asm/mman.h | 4 +--- arch/mips/include/uapi/asm/mman.h | 4 +--- arch/parisc/include/uapi/asm/mman.h | 4 +--- arch/xtensa/include/uapi/asm/mman.h | 4 +--- include/uapi/asm-generic/mman-common.h | 4 +--- include/uapi/linux/mman.h | 4 ++++ 6 files changed, 9 insertions(+), 15 deletions(-) (limited to 'include/uapi') diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index f9d4e6b6d4bd..ac23379b7a87 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -10,9 +10,7 @@ #define PROT_GROWSDOWN 0x01000000 /* mprotect flag: extend change to start of growsdown vma */ #define PROT_GROWSUP 0x02000000 /* mprotect flag: extend change to end of growsup vma */ -#define MAP_SHARED 0x01 /* Share changes */ -#define MAP_PRIVATE 0x02 /* Changes are private */ -#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */ +/* 0x01 - 0x03 are defined in linux/mman.h */ #define MAP_TYPE 0x0f /* Mask for type of mapping (OSF/1 is _wrong_) */ #define MAP_FIXED 0x100 /* Interpret addr exactly */ #define MAP_ANONYMOUS 0x10 /* don't use a file */ diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index 3035ca499cd8..c2b40969eb1f 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -27,9 +27,7 @@ /* * Flags for mmap */ -#define MAP_SHARED 0x001 /* Share changes */ -#define MAP_PRIVATE 0x002 /* Changes are private */ -#define MAP_SHARED_VALIDATE 0x003 /* share + validate extension flags */ +/* 0x01 - 0x03 are defined in linux/mman.h */ #define MAP_TYPE 0x00f /* Mask for type of mapping */ #define MAP_FIXED 0x010 /* Interpret addr exactly */ diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index 870fbf8c7088..c98162f494db 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -10,9 +10,7 @@ #define PROT_GROWSDOWN 0x01000000 /* mprotect flag: extend change to start of growsdown vma */ #define PROT_GROWSUP 0x02000000 /* mprotect flag: extend change to end of growsup vma */ -#define MAP_SHARED 0x01 /* Share changes */ -#define MAP_PRIVATE 0x02 /* Changes are private */ -#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */ +/* 0x01 - 0x03 are defined in linux/mman.h */ #define MAP_TYPE 0x2b /* Mask for type of mapping, includes bits 0x08 and 0x20 */ #define MAP_FIXED 0x04 /* Interpret addr exactly */ #define MAP_ANONYMOUS 0x10 /* don't use a file */ diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index 58f29a9d895d..be726062412b 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -34,9 +34,7 @@ /* * Flags for mmap */ -#define MAP_SHARED 0x001 /* Share changes */ -#define MAP_PRIVATE 0x002 /* Changes are private */ -#define MAP_SHARED_VALIDATE 0x003 /* share + validate extension flags */ +/* 0x01 - 0x03 are defined in linux/mman.h */ #define MAP_TYPE 0x00f /* Mask for type of mapping */ #define MAP_FIXED 0x010 /* Interpret addr exactly */ diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index e7ee32861d51..abd238d0f7a4 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -15,9 +15,7 @@ #define PROT_GROWSDOWN 0x01000000 /* mprotect flag: extend change to start of growsdown vma */ #define PROT_GROWSUP 0x02000000 /* mprotect flag: extend change to end of growsup vma */ -#define MAP_SHARED 0x01 /* Share changes */ -#define MAP_PRIVATE 0x02 /* Changes are private */ -#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */ +/* 0x01 - 0x03 are defined in linux/mman.h */ #define MAP_TYPE 0x0f /* Mask for type of mapping */ #define MAP_FIXED 0x10 /* Interpret addr exactly */ #define MAP_ANONYMOUS 0x20 /* don't use a file */ diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h index d0f515d53299..fc1a64c3447b 100644 --- a/include/uapi/linux/mman.h +++ b/include/uapi/linux/mman.h @@ -12,6 +12,10 @@ #define OVERCOMMIT_ALWAYS 1 #define OVERCOMMIT_NEVER 2 +#define MAP_SHARED 0x01 /* Share changes */ +#define MAP_PRIVATE 0x02 /* Changes are private */ +#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */ + /* * Huge page size encoding when MAP_HUGETLB is specified, and a huge page * size other than the default is desired. See hugetlb_encode.h. -- cgit v1.3-7-g2ca7 From a7fe4ca72b1fe7877de5672640d0b4e023d0fdca Mon Sep 17 00:00:00 2001 From: Vivek Kasireddy Date: Thu, 7 Feb 2019 22:18:43 -0500 Subject: media: v4l: Add 32-bit packed YUV formats The formats added in this patch include: V4L2_PIX_FMT_AYUV32 V4L2_PIX_FMT_XYUV32 V4L2_PIX_FMT_VUYA32 V4L2_PIX_FMT_VUYX32 These formats enable the trasmission of alpha channel data to other drivers and userspace applications in addition to YUV data. For example, buffers generated by drivers in one of these formats can be used by the Weston compositor to display as a texture or flipped directly onto the overlay planes with the help of a DRM driver. Signed-off-by: Vivek Kasireddy Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/uapi/v4l/pixfmt-packed-yuv.rst | 170 ++++++++++++++++++++- drivers/media/v4l2-core/v4l2-ioctl.c | 4 + include/uapi/linux/videodev2.h | 4 + 3 files changed, 177 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/Documentation/media/uapi/v4l/pixfmt-packed-yuv.rst b/Documentation/media/uapi/v4l/pixfmt-packed-yuv.rst index f53e8f57a003..7fcee1c11ac4 100644 --- a/Documentation/media/uapi/v4l/pixfmt-packed-yuv.rst +++ b/Documentation/media/uapi/v4l/pixfmt-packed-yuv.rst @@ -190,6 +190,170 @@ component of each pixel in one 16 or 32 bit word. - Cr\ :sub:`2` - Cr\ :sub:`1` - Cr\ :sub:`0` + - + * .. _V4L2-PIX-FMT-AYUV32: + + - ``V4L2_PIX_FMT_AYUV32`` + - 'AYUV' + + - a\ :sub:`7` + - a\ :sub:`6` + - a\ :sub:`5` + - a\ :sub:`4` + - a\ :sub:`3` + - a\ :sub:`2` + - a\ :sub:`1` + - a\ :sub:`0` + + - Y'\ :sub:`7` + - Y'\ :sub:`6` + - Y'\ :sub:`5` + - Y'\ :sub:`4` + - Y'\ :sub:`3` + - Y'\ :sub:`2` + - Y'\ :sub:`1` + - Y'\ :sub:`0` + + - Cb\ :sub:`7` + - Cb\ :sub:`6` + - Cb\ :sub:`5` + - Cb\ :sub:`4` + - Cb\ :sub:`3` + - Cb\ :sub:`2` + - Cb\ :sub:`1` + - Cb\ :sub:`0` + + - Cr\ :sub:`7` + - Cr\ :sub:`6` + - Cr\ :sub:`5` + - Cr\ :sub:`4` + - Cr\ :sub:`3` + - Cr\ :sub:`2` + - Cr\ :sub:`1` + - Cr\ :sub:`0` + - + * .. _V4L2-PIX-FMT-XYUV32: + + - ``V4L2_PIX_FMT_XYUV32`` + - 'XYUV' + + - + - + - + - + - + - + - + - + + - Y'\ :sub:`7` + - Y'\ :sub:`6` + - Y'\ :sub:`5` + - Y'\ :sub:`4` + - Y'\ :sub:`3` + - Y'\ :sub:`2` + - Y'\ :sub:`1` + - Y'\ :sub:`0` + + - Cb\ :sub:`7` + - Cb\ :sub:`6` + - Cb\ :sub:`5` + - Cb\ :sub:`4` + - Cb\ :sub:`3` + - Cb\ :sub:`2` + - Cb\ :sub:`1` + - Cb\ :sub:`0` + + - Cr\ :sub:`7` + - Cr\ :sub:`6` + - Cr\ :sub:`5` + - Cr\ :sub:`4` + - Cr\ :sub:`3` + - Cr\ :sub:`2` + - Cr\ :sub:`1` + - Cr\ :sub:`0` + - + * .. _V4L2-PIX-FMT-VUYA32: + + - ``V4L2_PIX_FMT_VUYA32`` + - 'VUYA' + + - Cr\ :sub:`7` + - Cr\ :sub:`6` + - Cr\ :sub:`5` + - Cr\ :sub:`4` + - Cr\ :sub:`3` + - Cr\ :sub:`2` + - Cr\ :sub:`1` + - Cr\ :sub:`0` + + - Cb\ :sub:`7` + - Cb\ :sub:`6` + - Cb\ :sub:`5` + - Cb\ :sub:`4` + - Cb\ :sub:`3` + - Cb\ :sub:`2` + - Cb\ :sub:`1` + - Cb\ :sub:`0` + + - Y'\ :sub:`7` + - Y'\ :sub:`6` + - Y'\ :sub:`5` + - Y'\ :sub:`4` + - Y'\ :sub:`3` + - Y'\ :sub:`2` + - Y'\ :sub:`1` + - Y'\ :sub:`0` + + - a\ :sub:`7` + - a\ :sub:`6` + - a\ :sub:`5` + - a\ :sub:`4` + - a\ :sub:`3` + - a\ :sub:`2` + - a\ :sub:`1` + - a\ :sub:`0` + - + * .. _V4L2-PIX-FMT-VUYX32: + + - ``V4L2_PIX_FMT_VUYX32`` + - 'VUYX' + + - Cr\ :sub:`7` + - Cr\ :sub:`6` + - Cr\ :sub:`5` + - Cr\ :sub:`4` + - Cr\ :sub:`3` + - Cr\ :sub:`2` + - Cr\ :sub:`1` + - Cr\ :sub:`0` + + - Cb\ :sub:`7` + - Cb\ :sub:`6` + - Cb\ :sub:`5` + - Cb\ :sub:`4` + - Cb\ :sub:`3` + - Cb\ :sub:`2` + - Cb\ :sub:`1` + - Cb\ :sub:`0` + + - Y'\ :sub:`7` + - Y'\ :sub:`6` + - Y'\ :sub:`5` + - Y'\ :sub:`4` + - Y'\ :sub:`3` + - Y'\ :sub:`2` + - Y'\ :sub:`1` + - Y'\ :sub:`0` + + - + - + - + - + - + - + - + - .. raw:: latex @@ -202,4 +366,8 @@ component of each pixel in one 16 or 32 bit word. #) The value of a = alpha bits is undefined when reading from the driver, ignored when writing to the driver, except when alpha blending has been negotiated for a :ref:`Video Overlay ` or - :ref:`Video Output Overlay `. + :ref:`Video Output Overlay ` for the formats Y444, YUV555 and + YUV4. However, for formats AYUV32 and VUYA32, the alpha component is + expected to contain a meaningful value that can be used by drivers + and applications. And, the formats XYUV32 and VUYX32 contain undefined + alpha values that must be ignored by all applications and drivers. diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c index 206b7348797e..6f707466b5d2 100644 --- a/drivers/media/v4l2-core/v4l2-ioctl.c +++ b/drivers/media/v4l2-core/v4l2-ioctl.c @@ -1220,6 +1220,10 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt) case V4L2_PIX_FMT_YUV555: descr = "16-bit A/XYUV 1-5-5-5"; break; case V4L2_PIX_FMT_YUV565: descr = "16-bit YUV 5-6-5"; break; case V4L2_PIX_FMT_YUV32: descr = "32-bit A/XYUV 8-8-8-8"; break; + case V4L2_PIX_FMT_AYUV32: descr = "32-bit AYUV 8-8-8-8"; break; + case V4L2_PIX_FMT_XYUV32: descr = "32-bit XYUV 8-8-8-8"; break; + case V4L2_PIX_FMT_VUYA32: descr = "32-bit VUYA 8-8-8-8"; break; + case V4L2_PIX_FMT_VUYX32: descr = "32-bit VUYX 8-8-8-8"; break; case V4L2_PIX_FMT_YUV410: descr = "Planar YUV 4:1:0"; break; case V4L2_PIX_FMT_YUV420: descr = "Planar YUV 4:2:0"; break; case V4L2_PIX_FMT_HI240: descr = "8-bit Dithered RGB (BTTV)"; break; diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 9a920f071ff9..1db220da3bcc 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -562,6 +562,10 @@ struct v4l2_pix_format { #define V4L2_PIX_FMT_YUV555 v4l2_fourcc('Y', 'U', 'V', 'O') /* 16 YUV-5-5-5 */ #define V4L2_PIX_FMT_YUV565 v4l2_fourcc('Y', 'U', 'V', 'P') /* 16 YUV-5-6-5 */ #define V4L2_PIX_FMT_YUV32 v4l2_fourcc('Y', 'U', 'V', '4') /* 32 YUV-8-8-8-8 */ +#define V4L2_PIX_FMT_AYUV32 v4l2_fourcc('A', 'Y', 'U', 'V') /* 32 AYUV-8-8-8-8 */ +#define V4L2_PIX_FMT_XYUV32 v4l2_fourcc('X', 'Y', 'U', 'V') /* 32 XYUV-8-8-8-8 */ +#define V4L2_PIX_FMT_VUYA32 v4l2_fourcc('V', 'U', 'Y', 'A') /* 32 VUYA-8-8-8-8 */ +#define V4L2_PIX_FMT_VUYX32 v4l2_fourcc('V', 'U', 'Y', 'X') /* 32 VUYX-8-8-8-8 */ #define V4L2_PIX_FMT_HI240 v4l2_fourcc('H', 'I', '2', '4') /* 8 8-bit color */ #define V4L2_PIX_FMT_HM12 v4l2_fourcc('H', 'M', '1', '2') /* 8 YUV 4:2:0 16x16 macroblocks */ #define V4L2_PIX_FMT_M420 v4l2_fourcc('M', '4', '2', '0') /* 12 YUV 4:2:0 2 lines y, 1 line uv interleaved */ -- cgit v1.3-7-g2ca7 From 721074b03411327e7bf41555d4cc7c18f49313f7 Mon Sep 17 00:00:00 2001 From: Patrick Lerda Date: Thu, 17 Jan 2019 03:50:13 -0500 Subject: media: rc: rcmm decoder and encoder media: add support for RCMM infrared remote controls. Signed-off-by: Patrick Lerda Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/lirc.h.rst.exceptions | 3 + MAINTAINERS | 5 + drivers/media/rc/Kconfig | 13 ++ drivers/media/rc/Makefile | 1 + drivers/media/rc/ir-rcmm-decoder.c | 254 ++++++++++++++++++++++++++++++ drivers/media/rc/rc-core-priv.h | 5 + drivers/media/rc/rc-main.c | 9 ++ include/media/rc-map.h | 14 +- include/uapi/linux/lirc.h | 6 + tools/include/uapi/linux/lirc.h | 12 ++ tools/testing/selftests/ir/ir_loopback.c | 9 ++ 11 files changed, 328 insertions(+), 3 deletions(-) create mode 100644 drivers/media/rc/ir-rcmm-decoder.c (limited to 'include/uapi') diff --git a/Documentation/media/lirc.h.rst.exceptions b/Documentation/media/lirc.h.rst.exceptions index 379b9e7df5d0..7a8b8ff4f076 100644 --- a/Documentation/media/lirc.h.rst.exceptions +++ b/Documentation/media/lirc.h.rst.exceptions @@ -60,6 +60,9 @@ ignore symbol RC_PROTO_SHARP ignore symbol RC_PROTO_XMP ignore symbol RC_PROTO_CEC ignore symbol RC_PROTO_IMON +ignore symbol RC_PROTO_RCMM12 +ignore symbol RC_PROTO_RCMM24 +ignore symbol RC_PROTO_RCMM32 # Undocumented macros diff --git a/MAINTAINERS b/MAINTAINERS index 914d333ee73d..9662ad6c58e4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16536,6 +16536,11 @@ M: David Härdeman S: Maintained F: drivers/media/rc/winbond-cir.c +RCMM REMOTE CONTROLS DECODER +M: Patrick Lerda +S: Maintained +F: drivers/media/rc/ir-rcmm-decoder.c + WINSYSTEMS EBC-C384 WATCHDOG DRIVER M: William Breathitt Gray L: linux-watchdog@vger.kernel.org diff --git a/drivers/media/rc/Kconfig b/drivers/media/rc/Kconfig index 8a216068a35a..8e95f6eac89d 100644 --- a/drivers/media/rc/Kconfig +++ b/drivers/media/rc/Kconfig @@ -133,6 +133,19 @@ config IR_IMON_DECODER remote control and you would like to use it with a raw IR receiver, or if you wish to use an encoder to transmit this IR. +config IR_RCMM_DECODER + tristate "Enable IR raw decoder for the RC-MM protocol" + depends on RC_CORE + help + Enable this option when you have IR with RC-MM protocol, and + you need the software decoder. The driver supports 12, + 24 and 32 bits RC-MM variants. You can enable or disable the + different modes using the following RC protocol keywords: + 'rc-mm-12', 'rc-mm-24' and 'rc-mm-32'. + + To compile this driver as a module, choose M here: the module + will be called ir-rcmm-decoder. + endif #RC_DECODERS menuconfig RC_DEVICES diff --git a/drivers/media/rc/Makefile b/drivers/media/rc/Makefile index 92c163816849..48d23433b3c0 100644 --- a/drivers/media/rc/Makefile +++ b/drivers/media/rc/Makefile @@ -16,6 +16,7 @@ obj-$(CONFIG_IR_SHARP_DECODER) += ir-sharp-decoder.o obj-$(CONFIG_IR_MCE_KBD_DECODER) += ir-mce_kbd-decoder.o obj-$(CONFIG_IR_XMP_DECODER) += ir-xmp-decoder.o obj-$(CONFIG_IR_IMON_DECODER) += ir-imon-decoder.o +obj-$(CONFIG_IR_RCMM_DECODER) += ir-rcmm-decoder.o # stand-alone IR receivers/transmitters obj-$(CONFIG_RC_ATI_REMOTE) += ati_remote.o diff --git a/drivers/media/rc/ir-rcmm-decoder.c b/drivers/media/rc/ir-rcmm-decoder.c new file mode 100644 index 000000000000..f1096ac1e5c5 --- /dev/null +++ b/drivers/media/rc/ir-rcmm-decoder.c @@ -0,0 +1,254 @@ +// SPDX-License-Identifier: GPL-2.0+ +// ir-rcmm-decoder.c - A decoder for the RCMM IR protocol +// +// Copyright (C) 2018 by Patrick Lerda + +#include "rc-core-priv.h" +#include +#include + +#define RCMM_UNIT 166667 /* nanosecs */ +#define RCMM_PREFIX_PULSE 416666 /* 166666.666666666*2.5 */ +#define RCMM_PULSE_0 277777 /* 166666.666666666*(1+2/3) */ +#define RCMM_PULSE_1 444444 /* 166666.666666666*(2+2/3) */ +#define RCMM_PULSE_2 611111 /* 166666.666666666*(3+2/3) */ +#define RCMM_PULSE_3 777778 /* 166666.666666666*(4+2/3) */ + +enum rcmm_state { + STATE_INACTIVE, + STATE_LOW, + STATE_BUMP, + STATE_VALUE, + STATE_FINISHED, +}; + +static bool rcmm_mode(const struct rcmm_dec *data) +{ + return !((0x000c0000 & data->bits) == 0x000c0000); +} + +static int rcmm_miscmode(struct rc_dev *dev, struct rcmm_dec *data) +{ + switch (data->count) { + case 24: + if (dev->enabled_protocols & RC_PROTO_BIT_RCMM24) { + rc_keydown(dev, RC_PROTO_RCMM24, data->bits, 0); + data->state = STATE_INACTIVE; + return 0; + } + return -1; + + case 12: + if (dev->enabled_protocols & RC_PROTO_BIT_RCMM12) { + rc_keydown(dev, RC_PROTO_RCMM12, data->bits, 0); + data->state = STATE_INACTIVE; + return 0; + } + return -1; + } + + return -1; +} + +/** + * ir_rcmm_decode() - Decode one RCMM pulse or space + * @dev: the struct rc_dev descriptor of the device + * @ev: the struct ir_raw_event descriptor of the pulse/space + * + * This function returns -EINVAL if the pulse violates the state machine + */ +static int ir_rcmm_decode(struct rc_dev *dev, struct ir_raw_event ev) +{ + struct rcmm_dec *data = &dev->raw->rcmm; + u32 scancode; + u8 toggle; + int value; + + if (!(dev->enabled_protocols & (RC_PROTO_BIT_RCMM32 | + RC_PROTO_BIT_RCMM24 | + RC_PROTO_BIT_RCMM12))) + return 0; + + if (!is_timing_event(ev)) { + if (ev.reset) + data->state = STATE_INACTIVE; + return 0; + } + + switch (data->state) { + case STATE_INACTIVE: + if (!ev.pulse) + break; + + if (!eq_margin(ev.duration, RCMM_PREFIX_PULSE, RCMM_UNIT / 2)) + break; + + data->state = STATE_LOW; + data->count = 0; + data->bits = 0; + return 0; + + case STATE_LOW: + if (ev.pulse) + break; + + if (!eq_margin(ev.duration, RCMM_PULSE_0, RCMM_UNIT / 2)) + break; + + data->state = STATE_BUMP; + return 0; + + case STATE_BUMP: + if (!ev.pulse) + break; + + if (!eq_margin(ev.duration, RCMM_UNIT, RCMM_UNIT / 2)) + break; + + data->state = STATE_VALUE; + return 0; + + case STATE_VALUE: + if (ev.pulse) + break; + + if (eq_margin(ev.duration, RCMM_PULSE_0, RCMM_UNIT / 2)) + value = 0; + else if (eq_margin(ev.duration, RCMM_PULSE_1, RCMM_UNIT / 2)) + value = 1; + else if (eq_margin(ev.duration, RCMM_PULSE_2, RCMM_UNIT / 2)) + value = 2; + else if (eq_margin(ev.duration, RCMM_PULSE_3, RCMM_UNIT / 2)) + value = 3; + else + value = -1; + + if (value == -1) { + if (!rcmm_miscmode(dev, data)) + return 0; + break; + } + + data->bits <<= 2; + data->bits |= value; + + data->count += 2; + + if (data->count < 32) + data->state = STATE_BUMP; + else + data->state = STATE_FINISHED; + + return 0; + + case STATE_FINISHED: + if (!ev.pulse) + break; + + if (!eq_margin(ev.duration, RCMM_UNIT, RCMM_UNIT / 2)) + break; + + if (rcmm_mode(data)) { + toggle = !!(0x8000 & data->bits); + scancode = data->bits & ~0x8000; + } else { + toggle = 0; + scancode = data->bits; + } + + if (dev->enabled_protocols & RC_PROTO_BIT_RCMM32) { + rc_keydown(dev, RC_PROTO_RCMM32, scancode, toggle); + data->state = STATE_INACTIVE; + return 0; + } + + break; + } + + data->state = STATE_INACTIVE; + return -EINVAL; +} + +static const int rcmmspace[] = { + RCMM_PULSE_0, + RCMM_PULSE_1, + RCMM_PULSE_2, + RCMM_PULSE_3, +}; + +static int ir_rcmm_rawencoder(struct ir_raw_event **ev, unsigned int max, + unsigned int n, u32 data) +{ + int i; + int ret; + + ret = ir_raw_gen_pulse_space(ev, &max, RCMM_PREFIX_PULSE, RCMM_PULSE_0); + if (ret) + return ret; + + for (i = n - 2; i >= 0; i -= 2) { + const unsigned int space = rcmmspace[(data >> i) & 3]; + + ret = ir_raw_gen_pulse_space(ev, &max, RCMM_UNIT, space); + if (ret) + return ret; + } + + return ir_raw_gen_pulse_space(ev, &max, RCMM_UNIT, RCMM_PULSE_3 * 2); +} + +static int ir_rcmm_encode(enum rc_proto protocol, u32 scancode, + struct ir_raw_event *events, unsigned int max) +{ + struct ir_raw_event *e = events; + int ret; + + switch (protocol) { + case RC_PROTO_RCMM32: + ret = ir_rcmm_rawencoder(&e, max, 32, scancode); + break; + case RC_PROTO_RCMM24: + ret = ir_rcmm_rawencoder(&e, max, 24, scancode); + break; + case RC_PROTO_RCMM12: + ret = ir_rcmm_rawencoder(&e, max, 12, scancode); + break; + default: + ret = -EINVAL; + } + + if (ret < 0) + return ret; + + return e - events; +} + +static struct ir_raw_handler rcmm_handler = { + .protocols = RC_PROTO_BIT_RCMM32 | + RC_PROTO_BIT_RCMM24 | + RC_PROTO_BIT_RCMM12, + .decode = ir_rcmm_decode, + .encode = ir_rcmm_encode, + .carrier = 36000, + .min_timeout = RCMM_PULSE_3 + RCMM_UNIT, +}; + +static int __init ir_rcmm_decode_init(void) +{ + ir_raw_handler_register(&rcmm_handler); + + pr_info("IR RCMM protocol handler initialized\n"); + return 0; +} + +static void __exit ir_rcmm_decode_exit(void) +{ + ir_raw_handler_unregister(&rcmm_handler); +} + +module_init(ir_rcmm_decode_init); +module_exit(ir_rcmm_decode_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick Lerda"); +MODULE_DESCRIPTION("RCMM IR protocol decoder"); diff --git a/drivers/media/rc/rc-core-priv.h b/drivers/media/rc/rc-core-priv.h index c2cbe7f6266c..9f21b3e8b377 100644 --- a/drivers/media/rc/rc-core-priv.h +++ b/drivers/media/rc/rc-core-priv.h @@ -131,6 +131,11 @@ struct ir_raw_event_ctrl { unsigned int bits; bool stick_keyboard; } imon; + struct rcmm_dec { + int state; + unsigned int count; + u32 bits; + } rcmm; }; /* Mutex for locking raw IR processing and handler change */ diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index 66a174979b3c..dc38e9c0a2ff 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -70,6 +70,12 @@ static const struct { [RC_PROTO_CEC] = { .name = "cec", .repeat_period = 0 }, [RC_PROTO_IMON] = { .name = "imon", .scancode_bits = 0x7fffffff, .repeat_period = 114 }, + [RC_PROTO_RCMM12] = { .name = "rc-mm-12", + .scancode_bits = 0x00000fff, .repeat_period = 114 }, + [RC_PROTO_RCMM24] = { .name = "rc-mm-24", + .scancode_bits = 0x00ffffff, .repeat_period = 114 }, + [RC_PROTO_RCMM32] = { .name = "rc-mm-32", + .scancode_bits = 0xffffffff, .repeat_period = 114 }, }; /* Used to keep track of known keymaps */ @@ -1006,6 +1012,9 @@ static const struct { { RC_PROTO_BIT_XMP, "xmp", "ir-xmp-decoder" }, { RC_PROTO_BIT_CEC, "cec", NULL }, { RC_PROTO_BIT_IMON, "imon", "ir-imon-decoder" }, + { RC_PROTO_BIT_RCMM12 | + RC_PROTO_BIT_RCMM24 | + RC_PROTO_BIT_RCMM32, "rc-mm", "ir-rcmm-decoder" }, }; /** diff --git a/include/media/rc-map.h b/include/media/rc-map.h index d621acadfbf3..e5e86d595645 100644 --- a/include/media/rc-map.h +++ b/include/media/rc-map.h @@ -37,6 +37,9 @@ #define RC_PROTO_BIT_XMP BIT_ULL(RC_PROTO_XMP) #define RC_PROTO_BIT_CEC BIT_ULL(RC_PROTO_CEC) #define RC_PROTO_BIT_IMON BIT_ULL(RC_PROTO_IMON) +#define RC_PROTO_BIT_RCMM12 BIT_ULL(RC_PROTO_RCMM12) +#define RC_PROTO_BIT_RCMM24 BIT_ULL(RC_PROTO_RCMM24) +#define RC_PROTO_BIT_RCMM32 BIT_ULL(RC_PROTO_RCMM32) #define RC_PROTO_BIT_ALL \ (RC_PROTO_BIT_UNKNOWN | RC_PROTO_BIT_OTHER | \ @@ -51,7 +54,8 @@ RC_PROTO_BIT_RC6_6A_24 | RC_PROTO_BIT_RC6_6A_32 | \ RC_PROTO_BIT_RC6_MCE | RC_PROTO_BIT_SHARP | \ RC_PROTO_BIT_XMP | RC_PROTO_BIT_CEC | \ - RC_PROTO_BIT_IMON) + RC_PROTO_BIT_IMON | RC_PROTO_BIT_RCMM12 | \ + RC_PROTO_BIT_RCMM24 | RC_PROTO_BIT_RCMM32) /* All rc protocols for which we have decoders */ #define RC_PROTO_BIT_ALL_IR_DECODER \ (RC_PROTO_BIT_RC5 | RC_PROTO_BIT_RC5X_20 | \ @@ -64,7 +68,9 @@ RC_PROTO_BIT_RC6_0 | RC_PROTO_BIT_RC6_6A_20 | \ RC_PROTO_BIT_RC6_6A_24 | RC_PROTO_BIT_RC6_6A_32 | \ RC_PROTO_BIT_RC6_MCE | RC_PROTO_BIT_SHARP | \ - RC_PROTO_BIT_XMP | RC_PROTO_BIT_IMON) + RC_PROTO_BIT_XMP | RC_PROTO_BIT_IMON | \ + RC_PROTO_BIT_RCMM12 | RC_PROTO_BIT_RCMM24 | \ + RC_PROTO_BIT_RCMM32) #define RC_PROTO_BIT_ALL_IR_ENCODER \ (RC_PROTO_BIT_RC5 | RC_PROTO_BIT_RC5X_20 | \ @@ -77,7 +83,9 @@ RC_PROTO_BIT_RC6_0 | RC_PROTO_BIT_RC6_6A_20 | \ RC_PROTO_BIT_RC6_6A_24 | \ RC_PROTO_BIT_RC6_6A_32 | RC_PROTO_BIT_RC6_MCE | \ - RC_PROTO_BIT_SHARP | RC_PROTO_BIT_IMON) + RC_PROTO_BIT_SHARP | RC_PROTO_BIT_IMON | \ + RC_PROTO_BIT_RCMM12 | RC_PROTO_BIT_RCMM24 | \ + RC_PROTO_BIT_RCMM32) #define RC_SCANCODE_UNKNOWN(x) (x) #define RC_SCANCODE_OTHER(x) (x) diff --git a/include/uapi/linux/lirc.h b/include/uapi/linux/lirc.h index 6b319581882f..45fcbf99d72e 100644 --- a/include/uapi/linux/lirc.h +++ b/include/uapi/linux/lirc.h @@ -192,6 +192,9 @@ struct lirc_scancode { * @RC_PROTO_XMP: XMP protocol * @RC_PROTO_CEC: CEC protocol * @RC_PROTO_IMON: iMon Pad protocol + * @RC_PROTO_RCMM12: RC-MM protocol 12 bits + * @RC_PROTO_RCMM24: RC-MM protocol 24 bits + * @RC_PROTO_RCMM32: RC-MM protocol 32 bits */ enum rc_proto { RC_PROTO_UNKNOWN = 0, @@ -218,6 +221,9 @@ enum rc_proto { RC_PROTO_XMP = 21, RC_PROTO_CEC = 22, RC_PROTO_IMON = 23, + RC_PROTO_RCMM12 = 24, + RC_PROTO_RCMM24 = 25, + RC_PROTO_RCMM32 = 26, }; #endif diff --git a/tools/include/uapi/linux/lirc.h b/tools/include/uapi/linux/lirc.h index f189931042a7..45fcbf99d72e 100644 --- a/tools/include/uapi/linux/lirc.h +++ b/tools/include/uapi/linux/lirc.h @@ -133,6 +133,12 @@ #define LIRC_SET_WIDEBAND_RECEIVER _IOW('i', 0x00000023, __u32) +/* + * Return the recording timeout, which is either set by + * the ioctl LIRC_SET_REC_TIMEOUT or by the kernel after setting the protocols. + */ +#define LIRC_GET_REC_TIMEOUT _IOR('i', 0x00000024, __u32) + /* * struct lirc_scancode - decoded scancode with protocol for use with * LIRC_MODE_SCANCODE @@ -186,6 +192,9 @@ struct lirc_scancode { * @RC_PROTO_XMP: XMP protocol * @RC_PROTO_CEC: CEC protocol * @RC_PROTO_IMON: iMon Pad protocol + * @RC_PROTO_RCMM12: RC-MM protocol 12 bits + * @RC_PROTO_RCMM24: RC-MM protocol 24 bits + * @RC_PROTO_RCMM32: RC-MM protocol 32 bits */ enum rc_proto { RC_PROTO_UNKNOWN = 0, @@ -212,6 +221,9 @@ enum rc_proto { RC_PROTO_XMP = 21, RC_PROTO_CEC = 22, RC_PROTO_IMON = 23, + RC_PROTO_RCMM12 = 24, + RC_PROTO_RCMM24 = 25, + RC_PROTO_RCMM32 = 26, }; #endif diff --git a/tools/testing/selftests/ir/ir_loopback.c b/tools/testing/selftests/ir/ir_loopback.c index 858c19caf224..570a7358942c 100644 --- a/tools/testing/selftests/ir/ir_loopback.c +++ b/tools/testing/selftests/ir/ir_loopback.c @@ -51,6 +51,10 @@ static const struct { { RC_PROTO_RC6_6A_32, "rc-6-6a-32", 0xffffffff, "rc-6" }, { RC_PROTO_RC6_MCE, "rc-6-mce", 0x00007fff, "rc-6" }, { RC_PROTO_SHARP, "sharp", 0x1fff, "sharp" }, + { RC_PROTO_IMON, "imon", 0x7fffffff, "imon" }, + { RC_PROTO_RCMM12, "rcmm-12", 0x00000fff, "rcmm" }, + { RC_PROTO_RCMM24, "rcmm-24", 0x00ffffff, "rcmm" }, + { RC_PROTO_RCMM32, "rcmm-32", 0xffffffff, "rcmm" }, }; int lirc_open(const char *rc) @@ -139,6 +143,11 @@ int main(int argc, char **argv) (((scancode >> 8) ^ ~scancode) & 0xff) == 0) continue; + if (rc_proto == RC_PROTO_RCMM32 && + (scancode & 0x000c0000) != 0x000c0000 && + scancode & 0x00008000) + continue; + struct lirc_scancode lsc = { .rc_proto = rc_proto, .scancode = scancode -- cgit v1.3-7-g2ca7 From 80d7da1cac62f28b3df4880e8143b39cabb4b59a Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Wed, 16 May 2018 11:18:50 +0300 Subject: asm-generic: Drop getrlimit and setrlimit syscalls from default list The newer prlimit64 syscall provides all the functionality of getrlimit and setrlimit syscalls and adds the pid of target process, so future architectures won't need to include getrlimit and setrlimit. Therefore drop getrlimit and setrlimit syscalls from the generic syscall list unless __ARCH_WANT_SET_GET_RLIMIT is defined by the architecture's unistd.h prior to including asm-generic/unistd.h, and adjust all architectures using the generic syscall list to define it so that no in-tree architectures are affected. Cc: Arnd Bergmann Cc: linux-arch@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-hexagon@vger.kernel.org Cc: uclinux-h8-devel@lists.sourceforge.jp Signed-off-by: Yury Norov Acked-by: Arnd Bergmann Acked-by: Mark Salter [c6x] Acked-by: James Hogan [metag] Acked-by: Ley Foon Tan [nios2] Acked-by: Stafford Horne [openrisc] Acked-by: Will Deacon [arm64] Acked-by: Vineet Gupta #arch/arc bits Signed-off-by: Yury Norov Signed-off-by: Arnd Bergmann --- arch/arc/include/uapi/asm/unistd.h | 1 + arch/arm64/include/uapi/asm/unistd.h | 1 + arch/c6x/include/uapi/asm/unistd.h | 1 + arch/csky/include/uapi/asm/unistd.h | 1 + arch/h8300/include/uapi/asm/unistd.h | 1 + arch/hexagon/include/uapi/asm/unistd.h | 1 + arch/nds32/include/uapi/asm/unistd.h | 1 + arch/nios2/include/uapi/asm/unistd.h | 1 + arch/openrisc/include/uapi/asm/unistd.h | 1 + arch/riscv/include/uapi/asm/unistd.h | 1 + arch/unicore32/include/uapi/asm/unistd.h | 1 + include/uapi/asm-generic/unistd.h | 5 +++++ scripts/checksyscalls.sh | 5 +++++ 13 files changed, 21 insertions(+) (limited to 'include/uapi') diff --git a/arch/arc/include/uapi/asm/unistd.h b/arch/arc/include/uapi/asm/unistd.h index 3b3543fd151c..6a1a62a979dd 100644 --- a/arch/arc/include/uapi/asm/unistd.h +++ b/arch/arc/include/uapi/asm/unistd.h @@ -18,6 +18,7 @@ #define __ARCH_WANT_RENAMEAT #define __ARCH_WANT_STAT64 +#define __ARCH_WANT_SET_GET_RLIMIT #define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_CLONE #define __ARCH_WANT_SYS_VFORK diff --git a/arch/arm64/include/uapi/asm/unistd.h b/arch/arm64/include/uapi/asm/unistd.h index dae1584cf017..79937de2a0cc 100644 --- a/arch/arm64/include/uapi/asm/unistd.h +++ b/arch/arm64/include/uapi/asm/unistd.h @@ -17,5 +17,6 @@ #define __ARCH_WANT_RENAMEAT #define __ARCH_WANT_NEW_STAT +#define __ARCH_WANT_SET_GET_RLIMIT #include diff --git a/arch/c6x/include/uapi/asm/unistd.h b/arch/c6x/include/uapi/asm/unistd.h index 6b2fe792de9d..e3721b2cfd6a 100644 --- a/arch/c6x/include/uapi/asm/unistd.h +++ b/arch/c6x/include/uapi/asm/unistd.h @@ -17,6 +17,7 @@ #define __ARCH_WANT_RENAMEAT #define __ARCH_WANT_STAT64 +#define __ARCH_WANT_SET_GET_RLIMIT #define __ARCH_WANT_SYS_CLONE /* Use the standard ABI for syscalls. */ diff --git a/arch/csky/include/uapi/asm/unistd.h b/arch/csky/include/uapi/asm/unistd.h index 224c9a9ab45b..f5c83492136f 100644 --- a/arch/csky/include/uapi/asm/unistd.h +++ b/arch/csky/include/uapi/asm/unistd.h @@ -2,6 +2,7 @@ // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. #define __ARCH_WANT_SYS_CLONE +#define __ARCH_WANT_SET_GET_RLIMIT #include #define __NR_set_thread_area (__NR_arch_specific_syscall + 0) diff --git a/arch/h8300/include/uapi/asm/unistd.h b/arch/h8300/include/uapi/asm/unistd.h index 628195823816..b9e9352f2328 100644 --- a/arch/h8300/include/uapi/asm/unistd.h +++ b/arch/h8300/include/uapi/asm/unistd.h @@ -2,5 +2,6 @@ #define __ARCH_WANT_RENAMEAT #define __ARCH_WANT_STAT64 +#define __ARCH_WANT_SET_GET_RLIMIT #include diff --git a/arch/hexagon/include/uapi/asm/unistd.h b/arch/hexagon/include/uapi/asm/unistd.h index c91ca7d02461..6bb392a33c35 100644 --- a/arch/hexagon/include/uapi/asm/unistd.h +++ b/arch/hexagon/include/uapi/asm/unistd.h @@ -30,6 +30,7 @@ #define sys_mmap2 sys_mmap_pgoff #define __ARCH_WANT_RENAMEAT #define __ARCH_WANT_STAT64 +#define __ARCH_WANT_SET_GET_RLIMIT #define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_CLONE #define __ARCH_WANT_SYS_VFORK diff --git a/arch/nds32/include/uapi/asm/unistd.h b/arch/nds32/include/uapi/asm/unistd.h index c2c3a3e34083..eb98d24d3190 100644 --- a/arch/nds32/include/uapi/asm/unistd.h +++ b/arch/nds32/include/uapi/asm/unistd.h @@ -3,6 +3,7 @@ #define __ARCH_WANT_STAT64 #define __ARCH_WANT_SYNC_FILE_RANGE2 +#define __ARCH_WANT_SET_GET_RLIMIT /* Use the standard ABI for syscalls */ #include diff --git a/arch/nios2/include/uapi/asm/unistd.h b/arch/nios2/include/uapi/asm/unistd.h index d9948d88790b..fa68e68bc26d 100644 --- a/arch/nios2/include/uapi/asm/unistd.h +++ b/arch/nios2/include/uapi/asm/unistd.h @@ -20,6 +20,7 @@ #define __ARCH_WANT_RENAMEAT #define __ARCH_WANT_STAT64 +#define __ARCH_WANT_SET_GET_RLIMIT /* Use the standard ABI for syscalls */ #include diff --git a/arch/openrisc/include/uapi/asm/unistd.h b/arch/openrisc/include/uapi/asm/unistd.h index ec37df18d8ed..2e0bc0ff9f31 100644 --- a/arch/openrisc/include/uapi/asm/unistd.h +++ b/arch/openrisc/include/uapi/asm/unistd.h @@ -21,6 +21,7 @@ #define __ARCH_WANT_RENAMEAT #define __ARCH_WANT_STAT64 +#define __ARCH_WANT_SET_GET_RLIMIT #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/riscv/include/uapi/asm/unistd.h b/arch/riscv/include/uapi/asm/unistd.h index 1f3bd3ebbb0d..d9340c52e7ad 100644 --- a/arch/riscv/include/uapi/asm/unistd.h +++ b/arch/riscv/include/uapi/asm/unistd.h @@ -18,6 +18,7 @@ #ifdef __LP64__ #define __ARCH_WANT_NEW_STAT #endif /* __LP64__ */ +#define __ARCH_WANT_SET_GET_RLIMIT #include diff --git a/arch/unicore32/include/uapi/asm/unistd.h b/arch/unicore32/include/uapi/asm/unistd.h index 1e8fe5941b8a..2b575c0cf177 100644 --- a/arch/unicore32/include/uapi/asm/unistd.h +++ b/arch/unicore32/include/uapi/asm/unistd.h @@ -12,6 +12,7 @@ */ #define __ARCH_WANT_RENAMEAT +#define __ARCH_WANT_SET_GET_RLIMIT /* Use the standard ABI for syscalls. */ #include diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index b928eff3bf92..2cdf600b05fa 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -467,10 +467,15 @@ __SYSCALL(__NR_uname, sys_newuname) __SYSCALL(__NR_sethostname, sys_sethostname) #define __NR_setdomainname 162 __SYSCALL(__NR_setdomainname, sys_setdomainname) + +#ifdef __ARCH_WANT_SET_GET_RLIMIT +/* getrlimit and setrlimit are superseded with prlimit64 */ #define __NR_getrlimit 163 __SC_COMP(__NR_getrlimit, sys_getrlimit, compat_sys_getrlimit) #define __NR_setrlimit 164 __SC_COMP(__NR_setrlimit, sys_setrlimit, compat_sys_setrlimit) +#endif + #define __NR_getrusage 165 __SC_COMP(__NR_getrusage, sys_getrusage, compat_sys_getrusage) #define __NR_umask 166 diff --git a/scripts/checksyscalls.sh b/scripts/checksyscalls.sh index cc70a64fa81f..53c5677d7e82 100755 --- a/scripts/checksyscalls.sh +++ b/scripts/checksyscalls.sh @@ -38,6 +38,11 @@ cat << EOF #define __IGNORE_lstat64 /* fstatat64 */ #endif +#ifndef __ARCH_WANT_SET_GET_RLIMIT +#define __IGNORE_getrlimit /* getrlimit */ +#define __IGNORE_setrlimit /* setrlimit */ +#endif + /* Missing flags argument */ #define __IGNORE_renameat /* renameat2 */ -- cgit v1.3-7-g2ca7 From 517b773e0f612d608cbc62a08c55601bd56f73f6 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 18 Feb 2019 22:25:49 +0200 Subject: RDMA/nldev: Share with user-space object IDs Give to the user space tools unique identifier for PD, MR, CQ and CM_ID objects, so they can be able to query on them with .doit callbacks. QP .doit is not supported yet, till all drivers will be updated to provide their LQPN to be equal to their restrack ID. Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/nldev.c | 20 ++++++++++++++++++++ include/uapi/rdma/rdma_netlink.h | 9 +++++++++ 2 files changed, 29 insertions(+) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 9b4f891771c4..81d7ee3dcb20 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -108,6 +108,10 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_DRIVER_U32] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_DRIVER_S64] = { .type = NLA_S64 }, [RDMA_NLDEV_ATTR_DRIVER_U64] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_RES_PDN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_CQN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_MRN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_CM_IDN] = { .type = NLA_U32 }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, @@ -466,6 +470,9 @@ static int fill_res_cm_id_entry(struct sk_buff *msg, bool has_cap_net_admin, &cm_id->route.addr.dst_addr)) goto err; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CM_IDN, res->id)) + goto err; + if (fill_res_name_pid(msg, res)) goto err; @@ -494,6 +501,9 @@ static int fill_res_cq_entry(struct sk_buff *msg, bool has_cap_net_admin, nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_POLL_CTX, cq->poll_ctx)) goto err; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, res->id)) + goto err; + if (fill_res_name_pid(msg, res)) goto err; @@ -522,6 +532,9 @@ static int fill_res_mr_entry(struct sk_buff *msg, bool has_cap_net_admin, RDMA_NLDEV_ATTR_PAD)) goto err; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_MRN, res->id)) + goto err; + if (fill_res_name_pid(msg, res)) goto err; @@ -552,6 +565,9 @@ static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin, atomic_read(&pd->usecnt), RDMA_NLDEV_ATTR_PAD)) goto err; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, res->id)) + goto err; + if (fill_res_name_pid(msg, res)) goto err; @@ -893,6 +909,7 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = { .nldev_cmd = RDMA_NLDEV_CMD_RES_CM_ID_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_CM_ID, .entry = RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY, + .id = RDMA_NLDEV_ATTR_RES_CM_IDN, }, [RDMA_RESTRACK_CQ] = { .fill_res_func = fill_res_cq_entry, @@ -900,6 +917,7 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = { .nldev_attr = RDMA_NLDEV_ATTR_RES_CQ, .flags = NLDEV_PER_DEV, .entry = RDMA_NLDEV_ATTR_RES_CQ_ENTRY, + .id = RDMA_NLDEV_ATTR_RES_CQN, }, [RDMA_RESTRACK_MR] = { .fill_res_func = fill_res_mr_entry, @@ -907,6 +925,7 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = { .nldev_attr = RDMA_NLDEV_ATTR_RES_MR, .flags = NLDEV_PER_DEV, .entry = RDMA_NLDEV_ATTR_RES_MR_ENTRY, + .id = RDMA_NLDEV_ATTR_RES_MRN, }, [RDMA_RESTRACK_PD] = { .fill_res_func = fill_res_pd_entry, @@ -914,6 +933,7 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = { .nldev_attr = RDMA_NLDEV_ATTR_RES_PD, .flags = NLDEV_PER_DEV, .entry = RDMA_NLDEV_ATTR_RES_PD_ENTRY, + .id = RDMA_NLDEV_ATTR_RES_PDN, }, }; diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 3a9e681e4257..43362132e0d7 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -456,6 +456,15 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_DRIVER_S64, /* s64 */ RDMA_NLDEV_ATTR_DRIVER_U64, /* u64 */ + /* + * Indexes to get/set secific entry, + * for QP use RDMA_NLDEV_ATTR_RES_LQPN + */ + RDMA_NLDEV_ATTR_RES_PDN, /* u32 */ + RDMA_NLDEV_ATTR_RES_CQN, /* u32 */ + RDMA_NLDEV_ATTR_RES_MRN, /* u32 */ + RDMA_NLDEV_ATTR_RES_CM_IDN, /* u32 */ + /* * Always the end */ -- cgit v1.3-7-g2ca7 From c3d02788b45ab4a2d8f243b98c04b549c8193af6 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 18 Feb 2019 22:25:50 +0200 Subject: RDMA/nldev: Provide parent IDs for PD, MR and QP objects PD, MR and QP objects have parents objects: contexts and PDs. The exposed parent IDs allow to correlate various objects and simplify debug investigation. Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/nldev.c | 18 ++++++++++++++++++ include/uapi/rdma/rdma_netlink.h | 1 + 2 files changed, 19 insertions(+) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 81d7ee3dcb20..e6c7cc510556 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -112,6 +112,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_RES_CQN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_MRN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_CM_IDN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_CTXN] = { .type = NLA_U32 }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, @@ -420,6 +421,10 @@ static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin, if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, qp_attr.qp_state)) goto err; + if (!rdma_is_kernel_res(res) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, qp->pd->res.id)) + goto err; + if (fill_res_name_pid(msg, res)) goto err; @@ -503,6 +508,10 @@ static int fill_res_cq_entry(struct sk_buff *msg, bool has_cap_net_admin, if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, res->id)) goto err; + if (!rdma_is_kernel_res(res) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN, + cq->uobject->context->res.id)) + goto err; if (fill_res_name_pid(msg, res)) goto err; @@ -535,6 +544,10 @@ static int fill_res_mr_entry(struct sk_buff *msg, bool has_cap_net_admin, if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_MRN, res->id)) goto err; + if (!rdma_is_kernel_res(res) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, mr->pd->res.id)) + goto err; + if (fill_res_name_pid(msg, res)) goto err; @@ -568,6 +581,11 @@ static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin, if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, res->id)) goto err; + if (!rdma_is_kernel_res(res) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN, + pd->uobject->context->res.id)) + goto err; + if (fill_res_name_pid(msg, res)) goto err; diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 43362132e0d7..4ebbcfb2c6ef 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -464,6 +464,7 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_RES_CQN, /* u32 */ RDMA_NLDEV_ATTR_RES_MRN, /* u32 */ RDMA_NLDEV_ATTR_RES_CM_IDN, /* u32 */ + RDMA_NLDEV_ATTR_RES_CTXN, /* u32 */ /* * Always the end -- cgit v1.3-7-g2ca7 From c8ce48f06503eee20f189eed5b2aa736272b7344 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 18 Feb 2019 17:30:06 +0100 Subject: asm-generic: Make time32 syscall numbers optional We don't want new architectures to even provide the old 32-bit time_t based system calls any more, or define the syscall number macros. Add a new __ARCH_WANT_TIME32_SYSCALLS macro that gets enabled for all existing 32-bit architectures using the generic system call table, so we don't change any current behavior. Since this symbol is evaluated in user space as well, we cannot use a Kconfig CONFIG_* macro but have to define it in uapi/asm/unistd.h. On 64-bit architectures, the same system call numbers mostly refer to the system calls we want to keep, as they already pass 64-bit time_t. As new architectures no longer provide these, we need new exceptions in checksyscalls.sh. Signed-off-by: Arnd Bergmann --- arch/arc/include/uapi/asm/unistd.h | 1 + arch/arm64/include/uapi/asm/unistd.h | 1 + arch/c6x/include/uapi/asm/unistd.h | 1 + arch/csky/include/uapi/asm/unistd.h | 1 + arch/h8300/include/uapi/asm/unistd.h | 1 + arch/hexagon/include/uapi/asm/unistd.h | 1 + arch/nds32/include/uapi/asm/unistd.h | 1 + arch/nios2/include/uapi/asm/unistd.h | 1 + arch/openrisc/include/uapi/asm/unistd.h | 1 + arch/riscv/include/uapi/asm/unistd.h | 3 +++ arch/unicore32/include/uapi/asm/unistd.h | 1 + include/uapi/asm-generic/unistd.h | 36 ++++++++++++++++++++++++++++++++ scripts/checksyscalls.sh | 7 +++++++ 13 files changed, 56 insertions(+) (limited to 'include/uapi') diff --git a/arch/arc/include/uapi/asm/unistd.h b/arch/arc/include/uapi/asm/unistd.h index 6a1a62a979dd..5eafa1115162 100644 --- a/arch/arc/include/uapi/asm/unistd.h +++ b/arch/arc/include/uapi/asm/unistd.h @@ -23,6 +23,7 @@ #define __ARCH_WANT_SYS_CLONE #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_FORK +#define __ARCH_WANT_TIME32_SYSCALLS #define sys_mmap2 sys_mmap_pgoff diff --git a/arch/arm64/include/uapi/asm/unistd.h b/arch/arm64/include/uapi/asm/unistd.h index 79937de2a0cc..4703d218663a 100644 --- a/arch/arm64/include/uapi/asm/unistd.h +++ b/arch/arm64/include/uapi/asm/unistd.h @@ -18,5 +18,6 @@ #define __ARCH_WANT_RENAMEAT #define __ARCH_WANT_NEW_STAT #define __ARCH_WANT_SET_GET_RLIMIT +#define __ARCH_WANT_TIME32_SYSCALLS #include diff --git a/arch/c6x/include/uapi/asm/unistd.h b/arch/c6x/include/uapi/asm/unistd.h index e3721b2cfd6a..79b724c39d9b 100644 --- a/arch/c6x/include/uapi/asm/unistd.h +++ b/arch/c6x/include/uapi/asm/unistd.h @@ -19,6 +19,7 @@ #define __ARCH_WANT_STAT64 #define __ARCH_WANT_SET_GET_RLIMIT #define __ARCH_WANT_SYS_CLONE +#define __ARCH_WANT_TIME32_SYSCALLS /* Use the standard ABI for syscalls. */ #include diff --git a/arch/csky/include/uapi/asm/unistd.h b/arch/csky/include/uapi/asm/unistd.h index f5c83492136f..ec60e49cea66 100644 --- a/arch/csky/include/uapi/asm/unistd.h +++ b/arch/csky/include/uapi/asm/unistd.h @@ -3,6 +3,7 @@ #define __ARCH_WANT_SYS_CLONE #define __ARCH_WANT_SET_GET_RLIMIT +#define __ARCH_WANT_TIME32_SYSCALLS #include #define __NR_set_thread_area (__NR_arch_specific_syscall + 0) diff --git a/arch/h8300/include/uapi/asm/unistd.h b/arch/h8300/include/uapi/asm/unistd.h index b9e9352f2328..eb7bc0012af5 100644 --- a/arch/h8300/include/uapi/asm/unistd.h +++ b/arch/h8300/include/uapi/asm/unistd.h @@ -3,5 +3,6 @@ #define __ARCH_WANT_RENAMEAT #define __ARCH_WANT_STAT64 #define __ARCH_WANT_SET_GET_RLIMIT +#define __ARCH_WANT_TIME32_SYSCALLS #include diff --git a/arch/hexagon/include/uapi/asm/unistd.h b/arch/hexagon/include/uapi/asm/unistd.h index 6bb392a33c35..432c4db1b623 100644 --- a/arch/hexagon/include/uapi/asm/unistd.h +++ b/arch/hexagon/include/uapi/asm/unistd.h @@ -35,5 +35,6 @@ #define __ARCH_WANT_SYS_CLONE #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_FORK +#define __ARCH_WANT_TIME32_SYSCALLS #include diff --git a/arch/nds32/include/uapi/asm/unistd.h b/arch/nds32/include/uapi/asm/unistd.h index eb98d24d3190..4ec8f543103f 100644 --- a/arch/nds32/include/uapi/asm/unistd.h +++ b/arch/nds32/include/uapi/asm/unistd.h @@ -4,6 +4,7 @@ #define __ARCH_WANT_STAT64 #define __ARCH_WANT_SYNC_FILE_RANGE2 #define __ARCH_WANT_SET_GET_RLIMIT +#define __ARCH_WANT_TIME32_SYSCALLS /* Use the standard ABI for syscalls */ #include diff --git a/arch/nios2/include/uapi/asm/unistd.h b/arch/nios2/include/uapi/asm/unistd.h index fa68e68bc26d..0b4bb1d41b28 100644 --- a/arch/nios2/include/uapi/asm/unistd.h +++ b/arch/nios2/include/uapi/asm/unistd.h @@ -21,6 +21,7 @@ #define __ARCH_WANT_RENAMEAT #define __ARCH_WANT_STAT64 #define __ARCH_WANT_SET_GET_RLIMIT +#define __ARCH_WANT_TIME32_SYSCALLS /* Use the standard ABI for syscalls */ #include diff --git a/arch/openrisc/include/uapi/asm/unistd.h b/arch/openrisc/include/uapi/asm/unistd.h index 2e0bc0ff9f31..566f8c4f8047 100644 --- a/arch/openrisc/include/uapi/asm/unistd.h +++ b/arch/openrisc/include/uapi/asm/unistd.h @@ -24,6 +24,7 @@ #define __ARCH_WANT_SET_GET_RLIMIT #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_CLONE +#define __ARCH_WANT_TIME32_SYSCALLS #include diff --git a/arch/riscv/include/uapi/asm/unistd.h b/arch/riscv/include/uapi/asm/unistd.h index d9340c52e7ad..486a288b454c 100644 --- a/arch/riscv/include/uapi/asm/unistd.h +++ b/arch/riscv/include/uapi/asm/unistd.h @@ -19,6 +19,9 @@ #define __ARCH_WANT_NEW_STAT #endif /* __LP64__ */ #define __ARCH_WANT_SET_GET_RLIMIT +#ifndef __LP64__ +#define __ARCH_WANT_TIME32_SYSCALLS +#endif #include diff --git a/arch/unicore32/include/uapi/asm/unistd.h b/arch/unicore32/include/uapi/asm/unistd.h index 2b575c0cf177..4e5e624f5d7e 100644 --- a/arch/unicore32/include/uapi/asm/unistd.h +++ b/arch/unicore32/include/uapi/asm/unistd.h @@ -13,6 +13,7 @@ #define __ARCH_WANT_RENAMEAT #define __ARCH_WANT_SET_GET_RLIMIT +#define __ARCH_WANT_TIME32_SYSCALLS /* Use the standard ABI for syscalls. */ #include diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 2cdf600b05fa..12cdf611d217 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -38,8 +38,10 @@ __SYSCALL(__NR_io_destroy, sys_io_destroy) __SC_COMP(__NR_io_submit, sys_io_submit, compat_sys_io_submit) #define __NR_io_cancel 3 __SYSCALL(__NR_io_cancel, sys_io_cancel) +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_io_getevents 4 __SC_3264(__NR_io_getevents, sys_io_getevents_time32, sys_io_getevents) +#endif /* fs/xattr.c */ #define __NR_setxattr 5 @@ -222,10 +224,12 @@ __SC_COMP(__NR_pwritev, sys_pwritev, compat_sys_pwritev) __SYSCALL(__NR3264_sendfile, sys_sendfile64) /* fs/select.c */ +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_pselect6 72 __SC_COMP_3264(__NR_pselect6, sys_pselect6_time32, sys_pselect6, compat_sys_pselect6_time32) #define __NR_ppoll 73 __SC_COMP_3264(__NR_ppoll, sys_ppoll_time32, sys_ppoll, compat_sys_ppoll_time32) +#endif /* fs/signalfd.c */ #define __NR_signalfd4 74 @@ -269,16 +273,20 @@ __SC_COMP(__NR_sync_file_range, sys_sync_file_range, \ /* fs/timerfd.c */ #define __NR_timerfd_create 85 __SYSCALL(__NR_timerfd_create, sys_timerfd_create) +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_timerfd_settime 86 __SC_3264(__NR_timerfd_settime, sys_timerfd_settime32, \ sys_timerfd_settime) #define __NR_timerfd_gettime 87 __SC_3264(__NR_timerfd_gettime, sys_timerfd_gettime32, \ sys_timerfd_gettime) +#endif /* fs/utimes.c */ +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_utimensat 88 __SC_3264(__NR_utimensat, sys_utimensat_time32, sys_utimensat) +#endif /* kernel/acct.c */ #define __NR_acct 89 @@ -309,8 +317,10 @@ __SYSCALL(__NR_set_tid_address, sys_set_tid_address) __SYSCALL(__NR_unshare, sys_unshare) /* kernel/futex.c */ +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_futex 98 __SC_3264(__NR_futex, sys_futex_time32, sys_futex) +#endif #define __NR_set_robust_list 99 __SC_COMP(__NR_set_robust_list, sys_set_robust_list, \ compat_sys_set_robust_list) @@ -319,8 +329,10 @@ __SC_COMP(__NR_get_robust_list, sys_get_robust_list, \ compat_sys_get_robust_list) /* kernel/hrtimer.c */ +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_nanosleep 101 __SC_3264(__NR_nanosleep, sys_nanosleep_time32, sys_nanosleep) +#endif /* kernel/itimer.c */ #define __NR_getitimer 102 @@ -341,14 +353,19 @@ __SYSCALL(__NR_delete_module, sys_delete_module) /* kernel/posix-timers.c */ #define __NR_timer_create 107 __SC_COMP(__NR_timer_create, sys_timer_create, compat_sys_timer_create) +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_timer_gettime 108 __SC_3264(__NR_timer_gettime, sys_timer_gettime32, sys_timer_gettime) +#endif #define __NR_timer_getoverrun 109 __SYSCALL(__NR_timer_getoverrun, sys_timer_getoverrun) +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_timer_settime 110 __SC_3264(__NR_timer_settime, sys_timer_settime32, sys_timer_settime) +#endif #define __NR_timer_delete 111 __SYSCALL(__NR_timer_delete, sys_timer_delete) +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_clock_settime 112 __SC_3264(__NR_clock_settime, sys_clock_settime32, sys_clock_settime) #define __NR_clock_gettime 113 @@ -358,6 +375,7 @@ __SC_3264(__NR_clock_getres, sys_clock_getres_time32, sys_clock_getres) #define __NR_clock_nanosleep 115 __SC_3264(__NR_clock_nanosleep, sys_clock_nanosleep_time32, \ sys_clock_nanosleep) +#endif /* kernel/printk.c */ #define __NR_syslog 116 @@ -388,9 +406,11 @@ __SYSCALL(__NR_sched_yield, sys_sched_yield) __SYSCALL(__NR_sched_get_priority_max, sys_sched_get_priority_max) #define __NR_sched_get_priority_min 126 __SYSCALL(__NR_sched_get_priority_min, sys_sched_get_priority_min) +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_sched_rr_get_interval 127 __SC_3264(__NR_sched_rr_get_interval, sys_sched_rr_get_interval_time32, \ sys_sched_rr_get_interval) +#endif /* kernel/signal.c */ #define __NR_restart_syscall 128 @@ -411,9 +431,11 @@ __SC_COMP(__NR_rt_sigaction, sys_rt_sigaction, compat_sys_rt_sigaction) __SC_COMP(__NR_rt_sigprocmask, sys_rt_sigprocmask, compat_sys_rt_sigprocmask) #define __NR_rt_sigpending 136 __SC_COMP(__NR_rt_sigpending, sys_rt_sigpending, compat_sys_rt_sigpending) +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_rt_sigtimedwait 137 __SC_COMP_3264(__NR_rt_sigtimedwait, sys_rt_sigtimedwait_time32, \ sys_rt_sigtimedwait, compat_sys_rt_sigtimedwait_time32) +#endif #define __NR_rt_sigqueueinfo 138 __SC_COMP(__NR_rt_sigqueueinfo, sys_rt_sigqueueinfo, \ compat_sys_rt_sigqueueinfo) @@ -486,12 +508,14 @@ __SYSCALL(__NR_prctl, sys_prctl) __SYSCALL(__NR_getcpu, sys_getcpu) /* kernel/time.c */ +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_gettimeofday 169 __SC_COMP(__NR_gettimeofday, sys_gettimeofday, compat_sys_gettimeofday) #define __NR_settimeofday 170 __SC_COMP(__NR_settimeofday, sys_settimeofday, compat_sys_settimeofday) #define __NR_adjtimex 171 __SC_3264(__NR_adjtimex, sys_adjtimex_time32, sys_adjtimex) +#endif /* kernel/timer.c */ #define __NR_getpid 172 @@ -516,11 +540,13 @@ __SC_COMP(__NR_sysinfo, sys_sysinfo, compat_sys_sysinfo) __SC_COMP(__NR_mq_open, sys_mq_open, compat_sys_mq_open) #define __NR_mq_unlink 181 __SYSCALL(__NR_mq_unlink, sys_mq_unlink) +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_mq_timedsend 182 __SC_3264(__NR_mq_timedsend, sys_mq_timedsend_time32, sys_mq_timedsend) #define __NR_mq_timedreceive 183 __SC_3264(__NR_mq_timedreceive, sys_mq_timedreceive_time32, \ sys_mq_timedreceive) +#endif #define __NR_mq_notify 184 __SC_COMP(__NR_mq_notify, sys_mq_notify, compat_sys_mq_notify) #define __NR_mq_getsetattr 185 @@ -541,8 +567,10 @@ __SC_COMP(__NR_msgsnd, sys_msgsnd, compat_sys_msgsnd) __SYSCALL(__NR_semget, sys_semget) #define __NR_semctl 191 __SC_COMP(__NR_semctl, sys_semctl, compat_sys_semctl) +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_semtimedop 192 __SC_COMP(__NR_semtimedop, sys_semtimedop, sys_semtimedop_time32) +#endif #define __NR_semop 193 __SYSCALL(__NR_semop, sys_semop) @@ -663,8 +691,10 @@ __SC_COMP(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo, \ __SYSCALL(__NR_perf_event_open, sys_perf_event_open) #define __NR_accept4 242 __SYSCALL(__NR_accept4, sys_accept4) +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_recvmmsg 243 __SC_COMP_3264(__NR_recvmmsg, sys_recvmmsg_time32, sys_recvmmsg, compat_sys_recvmmsg_time32) +#endif /* * Architectures may provide up to 16 syscalls of their own @@ -672,8 +702,10 @@ __SC_COMP_3264(__NR_recvmmsg, sys_recvmmsg_time32, sys_recvmmsg, compat_sys_recv */ #define __NR_arch_specific_syscall 244 +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_wait4 260 __SC_COMP(__NR_wait4, sys_wait4, compat_sys_wait4) +#endif #define __NR_prlimit64 261 __SYSCALL(__NR_prlimit64, sys_prlimit64) #define __NR_fanotify_init 262 @@ -684,8 +716,10 @@ __SYSCALL(__NR_fanotify_mark, sys_fanotify_mark) __SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at) #define __NR_open_by_handle_at 265 __SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at) +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_clock_adjtime 266 __SC_3264(__NR_clock_adjtime, sys_clock_adjtime32, sys_clock_adjtime) +#endif #define __NR_syncfs 267 __SYSCALL(__NR_syncfs, sys_syncfs) #define __NR_setns 268 @@ -738,8 +772,10 @@ __SYSCALL(__NR_pkey_alloc, sys_pkey_alloc) __SYSCALL(__NR_pkey_free, sys_pkey_free) #define __NR_statx 291 __SYSCALL(__NR_statx, sys_statx) +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_io_pgetevents 292 __SC_COMP_3264(__NR_io_pgetevents, sys_io_pgetevents_time32, sys_io_pgetevents, compat_sys_io_pgetevents) +#endif #define __NR_rseq 293 __SYSCALL(__NR_rseq, sys_rseq) #define __NR_kexec_file_load 294 diff --git a/scripts/checksyscalls.sh b/scripts/checksyscalls.sh index 53c5677d7e82..ffd635efbdca 100755 --- a/scripts/checksyscalls.sh +++ b/scripts/checksyscalls.sh @@ -143,6 +143,13 @@ cat << EOF #define __IGNORE_rt_sigtimedwait #define __IGNORE_futex #define __IGNORE_sched_rr_get_interval +#define __IGNORE_gettimeofday +#define __IGNORE_settimeofday +#define __IGNORE_wait4 +#define __IGNORE_adjtimex +#define __IGNORE_nanosleep +#define __IGNORE_io_getevents +#define __IGNORE_recvmmsg #endif /* i386-specific or historical system calls */ -- cgit v1.3-7-g2ca7 From eeaf06ac1a5584e41cf289f8351e446bb131374b Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Thu, 5 Jul 2018 12:57:12 +1000 Subject: drm/nouveau/svm: initial support for shared virtual memory This uses HMM to mirror a process' CPU page tables into a channel's page tables, and keep them synchronised so that both the CPU and GPU are able to access the same memory at the same virtual address. While this code also supports Volta/Turing, it's only enabled for Pascal GPUs currently due to channel recovery being unreliable right now on the later GPUs. Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/Kbuild | 1 + drivers/gpu/drm/nouveau/Kconfig | 11 + drivers/gpu/drm/nouveau/nouveau_chan.c | 9 + drivers/gpu/drm/nouveau/nouveau_drm.c | 7 +- drivers/gpu/drm/nouveau/nouveau_drv.h | 2 + drivers/gpu/drm/nouveau/nouveau_svm.c | 737 +++++++++++++++++++++++++++++++++ drivers/gpu/drm/nouveau/nouveau_svm.h | 41 ++ drivers/gpu/drm/nouveau/nouveau_vmm.c | 2 + drivers/gpu/drm/nouveau/nouveau_vmm.h | 1 + include/uapi/drm/nouveau_drm.h | 8 + 10 files changed, 818 insertions(+), 1 deletion(-) create mode 100644 drivers/gpu/drm/nouveau/nouveau_svm.c create mode 100644 drivers/gpu/drm/nouveau/nouveau_svm.h (limited to 'include/uapi') diff --git a/drivers/gpu/drm/nouveau/Kbuild b/drivers/gpu/drm/nouveau/Kbuild index b17843dd050d..71bd48aafe64 100644 --- a/drivers/gpu/drm/nouveau/Kbuild +++ b/drivers/gpu/drm/nouveau/Kbuild @@ -30,6 +30,7 @@ nouveau-y += nouveau_vga.o # DRM - memory management nouveau-y += nouveau_bo.o nouveau-y += nouveau_gem.o +nouveau-$(CONFIG_DRM_NOUVEAU_SVM) += nouveau_svm.o nouveau-y += nouveau_mem.o nouveau-y += nouveau_prime.o nouveau-y += nouveau_sgdma.o diff --git a/drivers/gpu/drm/nouveau/Kconfig b/drivers/gpu/drm/nouveau/Kconfig index 432c440223bb..e7b26a6f386f 100644 --- a/drivers/gpu/drm/nouveau/Kconfig +++ b/drivers/gpu/drm/nouveau/Kconfig @@ -71,3 +71,14 @@ config DRM_NOUVEAU_BACKLIGHT help Say Y here if you want to control the backlight of your display (e.g. a laptop panel). + +config DRM_NOUVEAU_SVM + bool "(EXPERIMENTAL) Enable SVM (Shared Virtual Memory) support" + depends on ARCH_HAS_HMM + depends on DRM_NOUVEAU + depends on STAGING + select HMM_MIRROR + default n + help + Say Y here if you want to enable experimental support for + Shared Virtual Memory (SVM). diff --git a/drivers/gpu/drm/nouveau/nouveau_chan.c b/drivers/gpu/drm/nouveau/nouveau_chan.c index 5160a0a9c77d..282fd90b65e1 100644 --- a/drivers/gpu/drm/nouveau/nouveau_chan.c +++ b/drivers/gpu/drm/nouveau/nouveau_chan.c @@ -42,6 +42,7 @@ #include "nouveau_fence.h" #include "nouveau_abi16.h" #include "nouveau_vmm.h" +#include "nouveau_svm.h" MODULE_PARM_DESC(vram_pushbuf, "Create DMA push buffers in VRAM"); int nouveau_vram_pushbuf; @@ -95,6 +96,10 @@ nouveau_channel_del(struct nouveau_channel **pchan) if (chan->fence) nouveau_fence(chan->drm)->context_del(chan); + + if (cli) + nouveau_svmm_part(chan->vmm->svmm, chan->inst); + nvif_object_fini(&chan->nvsw); nvif_object_fini(&chan->gart); nvif_object_fini(&chan->vram); @@ -494,6 +499,10 @@ nouveau_channel_new(struct nouveau_drm *drm, struct nvif_device *device, nouveau_channel_del(pchan); } + ret = nouveau_svmm_join((*pchan)->vmm->svmm, (*pchan)->inst); + if (ret) + nouveau_channel_del(pchan); + done: cli->base.super = super; return ret; diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c index 72755d4c3983..53fdfa283ee2 100644 --- a/drivers/gpu/drm/nouveau/nouveau_drm.c +++ b/drivers/gpu/drm/nouveau/nouveau_drm.c @@ -62,6 +62,7 @@ #include "nouveau_usif.h" #include "nouveau_connector.h" #include "nouveau_platform.h" +#include "nouveau_svm.h" MODULE_PARM_DESC(config, "option string to pass to driver core"); static char *nouveau_config; @@ -549,6 +550,7 @@ nouveau_drm_device_init(struct drm_device *dev) nouveau_debugfs_init(drm); nouveau_hwmon_init(dev); + nouveau_svm_init(drm); nouveau_fbcon_init(dev); nouveau_led_init(dev); @@ -592,6 +594,7 @@ nouveau_drm_device_fini(struct drm_device *dev) nouveau_led_fini(dev); nouveau_fbcon_fini(dev); + nouveau_svm_fini(drm); nouveau_hwmon_fini(dev); nouveau_debugfs_fini(drm); @@ -737,6 +740,7 @@ nouveau_do_suspend(struct drm_device *dev, bool runtime) struct nouveau_drm *drm = nouveau_drm(dev); int ret; + nouveau_svm_suspend(drm); nouveau_led_suspend(dev); if (dev->mode_config.num_crtc) { @@ -813,7 +817,7 @@ nouveau_do_resume(struct drm_device *dev, bool runtime) } nouveau_led_resume(dev); - + nouveau_svm_resume(drm); return 0; } @@ -1033,6 +1037,7 @@ nouveau_ioctls[] = { DRM_IOCTL_DEF_DRV(NOUVEAU_GROBJ_ALLOC, nouveau_abi16_ioctl_grobj_alloc, DRM_AUTH|DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(NOUVEAU_NOTIFIEROBJ_ALLOC, nouveau_abi16_ioctl_notifierobj_alloc, DRM_AUTH|DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(NOUVEAU_GPUOBJ_FREE, nouveau_abi16_ioctl_gpuobj_free, DRM_AUTH|DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(NOUVEAU_SVM_INIT, nouveau_svmm_init, DRM_AUTH|DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_NEW, nouveau_gem_ioctl_new, DRM_AUTH|DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_PUSHBUF, nouveau_gem_ioctl_pushbuf, DRM_AUTH|DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_CPU_PREP, nouveau_gem_ioctl_cpu_prep, DRM_AUTH|DRM_RENDER_ALLOW), diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h index 8fcff0a4800e..1326b42f75e6 100644 --- a/drivers/gpu/drm/nouveau/nouveau_drv.h +++ b/drivers/gpu/drm/nouveau/nouveau_drv.h @@ -210,6 +210,8 @@ struct nouveau_drm { bool have_disp_power_ref; struct dev_pm_domain vga_pm_domain; + + struct nouveau_svm *svm; }; static inline struct nouveau_drm * diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c new file mode 100644 index 000000000000..6158e99b1dc3 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -0,0 +1,737 @@ +/* + * Copyright 2018 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "nouveau_svm.h" +#include "nouveau_drv.h" +#include "nouveau_chan.h" + +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +struct nouveau_svm { + struct nouveau_drm *drm; + struct mutex mutex; + struct list_head inst; + + struct nouveau_svm_fault_buffer { + int id; + struct nvif_object object; + u32 entries; + u32 getaddr; + u32 putaddr; + u32 get; + u32 put; + struct nvif_notify notify; + + struct nouveau_svm_fault { + u64 inst; + u64 addr; + u64 time; + u32 engine; + u8 gpc; + u8 hub; + u8 access; + u8 client; + u8 fault; + struct nouveau_svmm *svmm; + } **fault; + int fault_nr; + } buffer[1]; +}; + +#define SVM_DBG(s,f,a...) NV_DEBUG((s)->drm, "svm: "f"\n", ##a) +#define SVM_ERR(s,f,a...) NV_WARN((s)->drm, "svm: "f"\n", ##a) + +struct nouveau_ivmm { + struct nouveau_svmm *svmm; + u64 inst; + struct list_head head; +}; + +static struct nouveau_ivmm * +nouveau_ivmm_find(struct nouveau_svm *svm, u64 inst) +{ + struct nouveau_ivmm *ivmm; + list_for_each_entry(ivmm, &svm->inst, head) { + if (ivmm->inst == inst) + return ivmm; + } + return NULL; +} + +struct nouveau_svmm { + struct nouveau_vmm *vmm; + struct { + unsigned long start; + unsigned long limit; + } unmanaged; + + struct mutex mutex; + + struct mm_struct *mm; + struct hmm_mirror mirror; +}; + +#define SVMM_DBG(s,f,a...) \ + NV_DEBUG((s)->vmm->cli->drm, "svm-%p: "f"\n", (s), ##a) +#define SVMM_ERR(s,f,a...) \ + NV_WARN((s)->vmm->cli->drm, "svm-%p: "f"\n", (s), ##a) + +/* Unlink channel instance from SVMM. */ +void +nouveau_svmm_part(struct nouveau_svmm *svmm, u64 inst) +{ + struct nouveau_ivmm *ivmm; + if (svmm) { + mutex_lock(&svmm->vmm->cli->drm->svm->mutex); + ivmm = nouveau_ivmm_find(svmm->vmm->cli->drm->svm, inst); + if (ivmm) { + list_del(&ivmm->head); + kfree(ivmm); + } + mutex_unlock(&svmm->vmm->cli->drm->svm->mutex); + } +} + +/* Link channel instance to SVMM. */ +int +nouveau_svmm_join(struct nouveau_svmm *svmm, u64 inst) +{ + struct nouveau_ivmm *ivmm; + if (svmm) { + if (!(ivmm = kmalloc(sizeof(*ivmm), GFP_KERNEL))) + return -ENOMEM; + ivmm->svmm = svmm; + ivmm->inst = inst; + + mutex_lock(&svmm->vmm->cli->drm->svm->mutex); + list_add(&ivmm->head, &svmm->vmm->cli->drm->svm->inst); + mutex_unlock(&svmm->vmm->cli->drm->svm->mutex); + } + return 0; +} + +/* Invalidate SVMM address-range on GPU. */ +static void +nouveau_svmm_invalidate(struct nouveau_svmm *svmm, u64 start, u64 limit) +{ + if (limit > start) { + bool super = svmm->vmm->vmm.object.client->super; + svmm->vmm->vmm.object.client->super = true; + nvif_object_mthd(&svmm->vmm->vmm.object, NVIF_VMM_V0_PFNCLR, + &(struct nvif_vmm_pfnclr_v0) { + .addr = start, + .size = limit - start, + }, sizeof(struct nvif_vmm_pfnclr_v0)); + svmm->vmm->vmm.object.client->super = super; + } +} + +static int +nouveau_svmm_sync_cpu_device_pagetables(struct hmm_mirror *mirror, + const struct hmm_update *update) +{ + struct nouveau_svmm *svmm = container_of(mirror, typeof(*svmm), mirror); + unsigned long start = update->start; + unsigned long limit = update->end; + + if (!update->blockable) + return -EAGAIN; + + SVMM_DBG(svmm, "invalidate %016lx-%016lx", start, limit); + + mutex_lock(&svmm->mutex); + if (limit > svmm->unmanaged.start && start < svmm->unmanaged.limit) { + if (start < svmm->unmanaged.start) { + nouveau_svmm_invalidate(svmm, start, + svmm->unmanaged.limit); + } + start = svmm->unmanaged.limit; + } + + nouveau_svmm_invalidate(svmm, start, limit); + mutex_unlock(&svmm->mutex); + return 0; +} + +static void +nouveau_svmm_release(struct hmm_mirror *mirror) +{ +} + +static const struct hmm_mirror_ops +nouveau_svmm = { + .sync_cpu_device_pagetables = nouveau_svmm_sync_cpu_device_pagetables, + .release = nouveau_svmm_release, +}; + +void +nouveau_svmm_fini(struct nouveau_svmm **psvmm) +{ + struct nouveau_svmm *svmm = *psvmm; + if (svmm) { + hmm_mirror_unregister(&svmm->mirror); + kfree(*psvmm); + *psvmm = NULL; + } +} + +int +nouveau_svmm_init(struct drm_device *dev, void *data, + struct drm_file *file_priv) +{ + struct nouveau_cli *cli = nouveau_cli(file_priv); + struct nouveau_svmm *svmm; + struct drm_nouveau_svm_init *args = data; + int ret; + + /* Allocate tracking for SVM-enabled VMM. */ + if (!(svmm = kzalloc(sizeof(*svmm), GFP_KERNEL))) + return -ENOMEM; + svmm->vmm = &cli->svm; + svmm->unmanaged.start = args->unmanaged_addr; + svmm->unmanaged.limit = args->unmanaged_addr + args->unmanaged_size; + mutex_init(&svmm->mutex); + + /* Check that SVM isn't already enabled for the client. */ + mutex_lock(&cli->mutex); + if (cli->svm.cli) { + ret = -EBUSY; + goto done; + } + + /* Allocate a new GPU VMM that can support SVM (managed by the + * client, with replayable faults enabled). + * + * All future channel/memory allocations will make use of this + * VMM instead of the standard one. + */ + ret = nvif_vmm_init(&cli->mmu, cli->vmm.vmm.object.oclass, true, + args->unmanaged_addr, args->unmanaged_size, + &(struct gp100_vmm_v0) { + .fault_replay = true, + }, sizeof(struct gp100_vmm_v0), &cli->svm.vmm); + if (ret) + goto done; + + /* Enable HMM mirroring of CPU address-space to VMM. */ + svmm->mm = get_task_mm(current); + down_write(&svmm->mm->mmap_sem); + svmm->mirror.ops = &nouveau_svmm; + ret = hmm_mirror_register(&svmm->mirror, svmm->mm); + if (ret == 0) { + cli->svm.svmm = svmm; + cli->svm.cli = cli; + } + up_write(&svmm->mm->mmap_sem); + mmput(svmm->mm); + +done: + if (ret) + nouveau_svmm_fini(&svmm); + mutex_unlock(&cli->mutex); + return ret; +} + +static const u64 +nouveau_svm_pfn_flags[HMM_PFN_FLAG_MAX] = { + [HMM_PFN_VALID ] = NVIF_VMM_PFNMAP_V0_V, + [HMM_PFN_WRITE ] = NVIF_VMM_PFNMAP_V0_W, + [HMM_PFN_DEVICE_PRIVATE] = NVIF_VMM_PFNMAP_V0_VRAM, +}; + +static const u64 +nouveau_svm_pfn_values[HMM_PFN_VALUE_MAX] = { + [HMM_PFN_ERROR ] = ~NVIF_VMM_PFNMAP_V0_V, + [HMM_PFN_NONE ] = NVIF_VMM_PFNMAP_V0_NONE, + [HMM_PFN_SPECIAL] = ~NVIF_VMM_PFNMAP_V0_V, +}; + +/* Issue fault replay for GPU to retry accesses that faulted previously. */ +static void +nouveau_svm_fault_replay(struct nouveau_svm *svm) +{ + SVM_DBG(svm, "replay"); + WARN_ON(nvif_object_mthd(&svm->drm->client.vmm.vmm.object, + GP100_VMM_VN_FAULT_REPLAY, + &(struct gp100_vmm_fault_replay_vn) {}, + sizeof(struct gp100_vmm_fault_replay_vn))); +} + +/* Cancel a replayable fault that could not be handled. + * + * Cancelling the fault will trigger recovery to reset the engine + * and kill the offending channel (ie. GPU SIGSEGV). + */ +static void +nouveau_svm_fault_cancel(struct nouveau_svm *svm, + u64 inst, u8 hub, u8 gpc, u8 client) +{ + SVM_DBG(svm, "cancel %016llx %d %02x %02x", inst, hub, gpc, client); + WARN_ON(nvif_object_mthd(&svm->drm->client.vmm.vmm.object, + GP100_VMM_VN_FAULT_CANCEL, + &(struct gp100_vmm_fault_cancel_v0) { + .hub = hub, + .gpc = gpc, + .client = client, + .inst = inst, + }, sizeof(struct gp100_vmm_fault_cancel_v0))); +} + +static void +nouveau_svm_fault_cancel_fault(struct nouveau_svm *svm, + struct nouveau_svm_fault *fault) +{ + nouveau_svm_fault_cancel(svm, fault->inst, + fault->hub, + fault->gpc, + fault->client); +} + +static int +nouveau_svm_fault_cmp(const void *a, const void *b) +{ + const struct nouveau_svm_fault *fa = *(struct nouveau_svm_fault **)a; + const struct nouveau_svm_fault *fb = *(struct nouveau_svm_fault **)b; + int ret; + if ((ret = (s64)fa->inst - fb->inst)) + return ret; + if ((ret = (s64)fa->addr - fb->addr)) + return ret; + /*XXX: atomic? */ + return (fa->access == 0 || fa->access == 3) - + (fb->access == 0 || fb->access == 3); +} + +static void +nouveau_svm_fault_cache(struct nouveau_svm *svm, + struct nouveau_svm_fault_buffer *buffer, u32 offset) +{ + struct nvif_object *memory = &buffer->object; + const u32 instlo = nvif_rd32(memory, offset + 0x00); + const u32 insthi = nvif_rd32(memory, offset + 0x04); + const u32 addrlo = nvif_rd32(memory, offset + 0x08); + const u32 addrhi = nvif_rd32(memory, offset + 0x0c); + const u32 timelo = nvif_rd32(memory, offset + 0x10); + const u32 timehi = nvif_rd32(memory, offset + 0x14); + const u32 engine = nvif_rd32(memory, offset + 0x18); + const u32 info = nvif_rd32(memory, offset + 0x1c); + const u64 inst = (u64)insthi << 32 | instlo; + const u8 gpc = (info & 0x1f000000) >> 24; + const u8 hub = (info & 0x00100000) >> 20; + const u8 client = (info & 0x00007f00) >> 8; + struct nouveau_svm_fault *fault; + + //XXX: i think we're supposed to spin waiting */ + if (WARN_ON(!(info & 0x80000000))) + return; + + nvif_mask(memory, offset + 0x1c, 0x80000000, 0x00000000); + + if (!buffer->fault[buffer->fault_nr]) { + fault = kmalloc(sizeof(*fault), GFP_KERNEL); + if (WARN_ON(!fault)) { + nouveau_svm_fault_cancel(svm, inst, hub, gpc, client); + return; + } + buffer->fault[buffer->fault_nr] = fault; + } + + fault = buffer->fault[buffer->fault_nr++]; + fault->inst = inst; + fault->addr = (u64)addrhi << 32 | addrlo; + fault->time = (u64)timehi << 32 | timelo; + fault->engine = engine; + fault->gpc = gpc; + fault->hub = hub; + fault->access = (info & 0x000f0000) >> 16; + fault->client = client; + fault->fault = (info & 0x0000001f); + + SVM_DBG(svm, "fault %016llx %016llx %02x", + fault->inst, fault->addr, fault->access); +} + +static int +nouveau_svm_fault(struct nvif_notify *notify) +{ + struct nouveau_svm_fault_buffer *buffer = + container_of(notify, typeof(*buffer), notify); + struct nouveau_svm *svm = + container_of(buffer, typeof(*svm), buffer[buffer->id]); + struct nvif_object *device = &svm->drm->client.device.object; + struct nouveau_svmm *svmm; + struct { + struct { + struct nvif_ioctl_v0 i; + struct nvif_ioctl_mthd_v0 m; + struct nvif_vmm_pfnmap_v0 p; + } i; + u64 phys[16]; + } args; + struct hmm_range range; + struct vm_area_struct *vma; + u64 inst, start, limit; + int fi, fn, pi, fill; + int replay = 0, ret; + + /* Parse available fault buffer entries into a cache, and update + * the GET pointer so HW can reuse the entries. + */ + SVM_DBG(svm, "fault handler"); + if (buffer->get == buffer->put) { + buffer->put = nvif_rd32(device, buffer->putaddr); + buffer->get = nvif_rd32(device, buffer->getaddr); + if (buffer->get == buffer->put) + return NVIF_NOTIFY_KEEP; + } + buffer->fault_nr = 0; + + SVM_DBG(svm, "get %08x put %08x", buffer->get, buffer->put); + while (buffer->get != buffer->put) { + nouveau_svm_fault_cache(svm, buffer, buffer->get * 0x20); + if (++buffer->get == buffer->entries) + buffer->get = 0; + } + nvif_wr32(device, buffer->getaddr, buffer->get); + SVM_DBG(svm, "%d fault(s) pending", buffer->fault_nr); + + /* Sort parsed faults by instance pointer to prevent unnecessary + * instance to SVMM translations, followed by address and access + * type to reduce the amount of work when handling the faults. + */ + sort(buffer->fault, buffer->fault_nr, sizeof(*buffer->fault), + nouveau_svm_fault_cmp, NULL); + + /* Lookup SVMM structure for each unique instance pointer. */ + mutex_lock(&svm->mutex); + for (fi = 0, svmm = NULL; fi < buffer->fault_nr; fi++) { + if (!svmm || buffer->fault[fi]->inst != inst) { + struct nouveau_ivmm *ivmm = + nouveau_ivmm_find(svm, buffer->fault[fi]->inst); + svmm = ivmm ? ivmm->svmm : NULL; + inst = buffer->fault[fi]->inst; + SVM_DBG(svm, "inst %016llx -> svm-%p", inst, svmm); + } + buffer->fault[fi]->svmm = svmm; + } + mutex_unlock(&svm->mutex); + + /* Process list of faults. */ + args.i.i.version = 0; + args.i.i.type = NVIF_IOCTL_V0_MTHD; + args.i.m.version = 0; + args.i.m.method = NVIF_VMM_V0_PFNMAP; + args.i.p.version = 0; + + for (fi = 0; fn = fi + 1, fi < buffer->fault_nr; fi = fn) { + /* Cancel any faults from non-SVM channels. */ + if (!(svmm = buffer->fault[fi]->svmm)) { + nouveau_svm_fault_cancel_fault(svm, buffer->fault[fi]); + continue; + } + SVMM_DBG(svmm, "addr %016llx", buffer->fault[fi]->addr); + + /* We try and group handling of faults within a small + * window into a single update. + */ + start = buffer->fault[fi]->addr; + limit = start + (ARRAY_SIZE(args.phys) << PAGE_SHIFT); + if (start < svmm->unmanaged.limit) + limit = min_t(u64, limit, svmm->unmanaged.start); + else + if (limit > svmm->unmanaged.start) + start = max_t(u64, start, svmm->unmanaged.limit); + SVMM_DBG(svmm, "wndw %016llx-%016llx", start, limit); + + /* Intersect fault window with the CPU VMA, cancelling + * the fault if the address is invalid. + */ + down_read(&svmm->mm->mmap_sem); + vma = find_vma_intersection(svmm->mm, start, limit); + if (!vma) { + SVMM_ERR(svmm, "wndw %016llx-%016llx", start, limit); + up_read(&svmm->mm->mmap_sem); + nouveau_svm_fault_cancel_fault(svm, buffer->fault[fi]); + continue; + } + start = max_t(u64, start, vma->vm_start); + limit = min_t(u64, limit, vma->vm_end); + SVMM_DBG(svmm, "wndw %016llx-%016llx", start, limit); + + if (buffer->fault[fi]->addr != start) { + SVMM_ERR(svmm, "addr %016llx", buffer->fault[fi]->addr); + up_read(&svmm->mm->mmap_sem); + nouveau_svm_fault_cancel_fault(svm, buffer->fault[fi]); + continue; + } + + /* Prepare the GPU-side update of all pages within the + * fault window, determining required pages and access + * permissions based on pending faults. + */ + args.i.p.page = PAGE_SHIFT; + args.i.p.addr = start; + for (fn = fi, pi = 0;;) { + /* Determine required permissions based on GPU fault + * access flags. + *XXX: atomic? + */ + if (buffer->fault[fn]->access != 0 /* READ. */ && + buffer->fault[fn]->access != 3 /* PREFETCH. */) { + args.phys[pi++] = NVIF_VMM_PFNMAP_V0_V | + NVIF_VMM_PFNMAP_V0_W; + } else { + args.phys[pi++] = NVIF_VMM_PFNMAP_V0_V; + } + args.i.p.size = pi << PAGE_SHIFT; + + /* It's okay to skip over duplicate addresses from the + * same SVMM as faults are ordered by access type such + * that only the first one needs to be handled. + * + * ie. WRITE faults appear first, thus any handling of + * pending READ faults will already be satisfied. + */ + while (++fn < buffer->fault_nr && + buffer->fault[fn]->svmm == svmm && + buffer->fault[fn ]->addr == + buffer->fault[fn - 1]->addr); + + /* If the next fault is outside the window, or all GPU + * faults have been dealt with, we're done here. + */ + if (fn >= buffer->fault_nr || + buffer->fault[fn]->svmm != svmm || + buffer->fault[fn]->addr >= limit) + break; + + /* Fill in the gap between this fault and the next. */ + fill = (buffer->fault[fn ]->addr - + buffer->fault[fn - 1]->addr) >> PAGE_SHIFT; + while (--fill) + args.phys[pi++] = NVIF_VMM_PFNMAP_V0_NONE; + } + + SVMM_DBG(svmm, "wndw %016llx-%016llx covering %d fault(s)", + args.i.p.addr, + args.i.p.addr + args.i.p.size, fn - fi); + + /* Have HMM fault pages within the fault window to the GPU. */ + range.vma = vma; + range.start = args.i.p.addr; + range.end = args.i.p.addr + args.i.p.size; + range.pfns = args.phys; + range.flags = nouveau_svm_pfn_flags; + range.values = nouveau_svm_pfn_values; + range.pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT; +again: + ret = hmm_vma_fault(&range, true); + if (ret == 0) { + mutex_lock(&svmm->mutex); + if (!hmm_vma_range_done(&range)) { + mutex_unlock(&svmm->mutex); + goto again; + } + + svmm->vmm->vmm.object.client->super = true; + ret = nvif_object_ioctl(&svmm->vmm->vmm.object, + &args, sizeof(args.i) + + pi * sizeof(args.phys[0]), + NULL); + svmm->vmm->vmm.object.client->super = false; + mutex_unlock(&svmm->mutex); + } + up_read(&svmm->mm->mmap_sem); + + /* Cancel any faults in the window whose pages didn't manage + * to keep their valid bit, or stay writeable when required. + * + * If handling failed completely, cancel all faults. + */ + while (fi < fn) { + struct nouveau_svm_fault *fault = buffer->fault[fi++]; + pi = (fault->addr - range.start) >> PAGE_SHIFT; + if (ret || + !(range.pfns[pi] & NVIF_VMM_PFNMAP_V0_V) || + (!(range.pfns[pi] & NVIF_VMM_PFNMAP_V0_W) && + fault->access != 0 && fault->access != 3)) { + nouveau_svm_fault_cancel_fault(svm, fault); + continue; + } + replay++; + } + } + + /* Issue fault replay to the GPU. */ + if (replay) + nouveau_svm_fault_replay(svm); + return NVIF_NOTIFY_KEEP; +} + +static void +nouveau_svm_fault_buffer_fini(struct nouveau_svm *svm, int id) +{ + struct nouveau_svm_fault_buffer *buffer = &svm->buffer[id]; + nvif_notify_put(&buffer->notify); +} + +static int +nouveau_svm_fault_buffer_init(struct nouveau_svm *svm, int id) +{ + struct nouveau_svm_fault_buffer *buffer = &svm->buffer[id]; + struct nvif_object *device = &svm->drm->client.device.object; + buffer->get = nvif_rd32(device, buffer->getaddr); + buffer->put = nvif_rd32(device, buffer->putaddr); + SVM_DBG(svm, "get %08x put %08x (init)", buffer->get, buffer->put); + return nvif_notify_get(&buffer->notify); +} + +static void +nouveau_svm_fault_buffer_dtor(struct nouveau_svm *svm, int id) +{ + struct nouveau_svm_fault_buffer *buffer = &svm->buffer[id]; + int i; + + if (buffer->fault) { + for (i = 0; buffer->fault[i] && i < buffer->entries; i++) + kfree(buffer->fault[i]); + kvfree(buffer->fault); + } + + nouveau_svm_fault_buffer_fini(svm, id); + + nvif_notify_fini(&buffer->notify); + nvif_object_fini(&buffer->object); +} + +static int +nouveau_svm_fault_buffer_ctor(struct nouveau_svm *svm, s32 oclass, int id) +{ + struct nouveau_svm_fault_buffer *buffer = &svm->buffer[id]; + struct nouveau_drm *drm = svm->drm; + struct nvif_object *device = &drm->client.device.object; + struct nvif_clb069_v0 args = {}; + int ret; + + buffer->id = id; + + ret = nvif_object_init(device, 0, oclass, &args, sizeof(args), + &buffer->object); + if (ret < 0) { + SVM_ERR(svm, "Fault buffer allocation failed: %d", ret); + return ret; + } + + nvif_object_map(&buffer->object, NULL, 0); + buffer->entries = args.entries; + buffer->getaddr = args.get; + buffer->putaddr = args.put; + + ret = nvif_notify_init(&buffer->object, nouveau_svm_fault, true, + NVB069_V0_NTFY_FAULT, NULL, 0, 0, + &buffer->notify); + if (ret) + return ret; + + buffer->fault = kvzalloc(sizeof(*buffer->fault) * buffer->entries, GFP_KERNEL); + if (!buffer->fault) + return -ENOMEM; + + return nouveau_svm_fault_buffer_init(svm, id); +} + +void +nouveau_svm_resume(struct nouveau_drm *drm) +{ + struct nouveau_svm *svm = drm->svm; + if (svm) + nouveau_svm_fault_buffer_init(svm, 0); +} + +void +nouveau_svm_suspend(struct nouveau_drm *drm) +{ + struct nouveau_svm *svm = drm->svm; + if (svm) + nouveau_svm_fault_buffer_fini(svm, 0); +} + +void +nouveau_svm_fini(struct nouveau_drm *drm) +{ + struct nouveau_svm *svm = drm->svm; + if (svm) { + nouveau_svm_fault_buffer_dtor(svm, 0); + kfree(drm->svm); + drm->svm = NULL; + } +} + +void +nouveau_svm_init(struct nouveau_drm *drm) +{ + static const struct nvif_mclass buffers[] = { + { VOLTA_FAULT_BUFFER_A, 0 }, + { MAXWELL_FAULT_BUFFER_A, 0 }, + {} + }; + struct nouveau_svm *svm; + int ret; + + /* Disable on Volta and newer until channel recovery is fixed, + * otherwise clients will have a trivial way to trash the GPU + * for everyone. + */ + if (drm->client.device.info.family > NV_DEVICE_INFO_V0_PASCAL) + return; + + if (!(drm->svm = svm = kzalloc(sizeof(*drm->svm), GFP_KERNEL))) + return; + + drm->svm->drm = drm; + mutex_init(&drm->svm->mutex); + INIT_LIST_HEAD(&drm->svm->inst); + + ret = nvif_mclass(&drm->client.device.object, buffers); + if (ret < 0) { + SVM_DBG(svm, "No supported fault buffer class"); + nouveau_svm_fini(drm); + return; + } + + ret = nouveau_svm_fault_buffer_ctor(svm, buffers[ret].oclass, 0); + if (ret) { + nouveau_svm_fini(drm); + return; + } + + SVM_DBG(svm, "Initialised"); +} diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.h b/drivers/gpu/drm/nouveau/nouveau_svm.h new file mode 100644 index 000000000000..0379a1c316ca --- /dev/null +++ b/drivers/gpu/drm/nouveau/nouveau_svm.h @@ -0,0 +1,41 @@ +#ifndef __NOUVEAU_SVM_H__ +#define __NOUVEAU_SVM_H__ +#include +struct drm_device; +struct drm_file; +struct nouveau_drm; + +struct nouveau_svmm; + +#if IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM) +void nouveau_svm_init(struct nouveau_drm *); +void nouveau_svm_fini(struct nouveau_drm *); +void nouveau_svm_suspend(struct nouveau_drm *); +void nouveau_svm_resume(struct nouveau_drm *); + +int nouveau_svmm_init(struct drm_device *, void *, struct drm_file *); +void nouveau_svmm_fini(struct nouveau_svmm **); +int nouveau_svmm_join(struct nouveau_svmm *, u64 inst); +void nouveau_svmm_part(struct nouveau_svmm *, u64 inst); +#else /* IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM) */ +static inline void nouveau_svm_init(struct nouveau_drm *drm) {} +static inline void nouveau_svm_fini(struct nouveau_drm *drm) {} +static inline void nouveau_svm_suspend(struct nouveau_drm *drm) {} +static inline void nouveau_svm_resume(struct nouveau_drm *drm) {} + +static inline int nouveau_svmm_init(struct drm_device *device, void *p, + struct drm_file *file) +{ + return -ENOSYS; +} + +static inline void nouveau_svmm_fini(struct nouveau_svmm **svmmp) {} + +static inline int nouveau_svmm_join(struct nouveau_svmm *svmm, u64 inst) +{ + return 0; +} + +static inline void nouveau_svmm_part(struct nouveau_svmm *svmm, u64 inst) {} +#endif /* IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM) */ +#endif diff --git a/drivers/gpu/drm/nouveau/nouveau_vmm.c b/drivers/gpu/drm/nouveau/nouveau_vmm.c index 724d02d7c049..77061182a1cf 100644 --- a/drivers/gpu/drm/nouveau/nouveau_vmm.c +++ b/drivers/gpu/drm/nouveau/nouveau_vmm.c @@ -22,6 +22,7 @@ #include "nouveau_vmm.h" #include "nouveau_drv.h" #include "nouveau_bo.h" +#include "nouveau_svm.h" #include "nouveau_mem.h" void @@ -119,6 +120,7 @@ done: void nouveau_vmm_fini(struct nouveau_vmm *vmm) { + nouveau_svmm_fini(&vmm->svmm); nvif_vmm_fini(&vmm->vmm); vmm->cli = NULL; } diff --git a/drivers/gpu/drm/nouveau/nouveau_vmm.h b/drivers/gpu/drm/nouveau/nouveau_vmm.h index ede872f6f668..2b98d975f37e 100644 --- a/drivers/gpu/drm/nouveau/nouveau_vmm.h +++ b/drivers/gpu/drm/nouveau/nouveau_vmm.h @@ -25,6 +25,7 @@ void nouveau_vma_unmap(struct nouveau_vma *); struct nouveau_vmm { struct nouveau_cli *cli; struct nvif_vmm vmm; + struct nouveau_svmm *svmm; }; int nouveau_vmm_init(struct nouveau_cli *, s32 oclass, struct nouveau_vmm *); diff --git a/include/uapi/drm/nouveau_drm.h b/include/uapi/drm/nouveau_drm.h index 259588a4b61b..afac182c80d8 100644 --- a/include/uapi/drm/nouveau_drm.h +++ b/include/uapi/drm/nouveau_drm.h @@ -133,12 +133,20 @@ struct drm_nouveau_gem_cpu_fini { #define DRM_NOUVEAU_NOTIFIEROBJ_ALLOC 0x05 /* deprecated */ #define DRM_NOUVEAU_GPUOBJ_FREE 0x06 /* deprecated */ #define DRM_NOUVEAU_NVIF 0x07 +#define DRM_NOUVEAU_SVM_INIT 0x08 #define DRM_NOUVEAU_GEM_NEW 0x40 #define DRM_NOUVEAU_GEM_PUSHBUF 0x41 #define DRM_NOUVEAU_GEM_CPU_PREP 0x42 #define DRM_NOUVEAU_GEM_CPU_FINI 0x43 #define DRM_NOUVEAU_GEM_INFO 0x44 +struct drm_nouveau_svm_init { + __u64 unmanaged_addr; + __u64 unmanaged_size; +}; + +#define DRM_IOCTL_NOUVEAU_SVM_INIT DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_SVM_INIT, struct drm_nouveau_svm_init) + #define DRM_IOCTL_NOUVEAU_GEM_NEW DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_GEM_NEW, struct drm_nouveau_gem_new) #define DRM_IOCTL_NOUVEAU_GEM_PUSHBUF DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_GEM_PUSHBUF, struct drm_nouveau_gem_pushbuf) #define DRM_IOCTL_NOUVEAU_GEM_CPU_PREP DRM_IOW (DRM_COMMAND_BASE + DRM_NOUVEAU_GEM_CPU_PREP, struct drm_nouveau_gem_cpu_prep) -- cgit v1.3-7-g2ca7 From f180bf12ac061f093abb9247505f661817973cae Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Tue, 7 Aug 2018 16:13:16 -0400 Subject: drm/nouveau/svm: new ioctl to migrate process memory to GPU memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This add an ioctl to migrate a range of process address space to the device memory. On platform without cache coherent bus (x86, ARM, ...) this means that CPU can not access that range directly, instead CPU will fault which will migrate the memory back to system memory. This is behind a staging flag so that we can evolve the API. Signed-off-by: Jérôme Glisse --- drivers/gpu/drm/nouveau/nouveau_drm.c | 1 + drivers/gpu/drm/nouveau/nouveau_svm.c | 96 +++++++++++++++++++++++++++++++++++ drivers/gpu/drm/nouveau/nouveau_svm.h | 7 +++ include/uapi/drm/nouveau_drm.h | 43 ++++++++++++++++ 4 files changed, 147 insertions(+) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c index ee2c4d498d7d..5020265bfbd9 100644 --- a/drivers/gpu/drm/nouveau/nouveau_drm.c +++ b/drivers/gpu/drm/nouveau/nouveau_drm.c @@ -1043,6 +1043,7 @@ nouveau_ioctls[] = { DRM_IOCTL_DEF_DRV(NOUVEAU_NOTIFIEROBJ_ALLOC, nouveau_abi16_ioctl_notifierobj_alloc, DRM_AUTH|DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(NOUVEAU_GPUOBJ_FREE, nouveau_abi16_ioctl_gpuobj_free, DRM_AUTH|DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(NOUVEAU_SVM_INIT, nouveau_svmm_init, DRM_AUTH|DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(NOUVEAU_SVM_BIND, nouveau_svmm_bind, DRM_AUTH|DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_NEW, nouveau_gem_ioctl_new, DRM_AUTH|DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_PUSHBUF, nouveau_gem_ioctl_pushbuf, DRM_AUTH|DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_CPU_PREP, nouveau_gem_ioctl_cpu_prep, DRM_AUTH|DRM_RENDER_ALLOW), diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index 3ba980d80a1a..93ed43c413f0 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -22,6 +22,7 @@ #include "nouveau_svm.h" #include "nouveau_drv.h" #include "nouveau_chan.h" +#include "nouveau_dmem.h" #include #include @@ -104,6 +105,101 @@ struct nouveau_svmm { #define SVMM_ERR(s,f,a...) \ NV_WARN((s)->vmm->cli->drm, "svm-%p: "f"\n", (s), ##a) +int +nouveau_svmm_bind(struct drm_device *dev, void *data, + struct drm_file *file_priv) +{ + struct nouveau_cli *cli = nouveau_cli(file_priv); + struct drm_nouveau_svm_bind *args = data; + unsigned target, cmd, priority; + unsigned long addr, end, size; + struct mm_struct *mm; + + args->va_start &= PAGE_MASK; + args->va_end &= PAGE_MASK; + + /* Sanity check arguments */ + if (args->reserved0 || args->reserved1) + return -EINVAL; + if (args->header & (~NOUVEAU_SVM_BIND_VALID_MASK)) + return -EINVAL; + if (args->va_start >= args->va_end) + return -EINVAL; + if (!args->npages) + return -EINVAL; + + cmd = args->header >> NOUVEAU_SVM_BIND_COMMAND_SHIFT; + cmd &= NOUVEAU_SVM_BIND_COMMAND_MASK; + switch (cmd) { + case NOUVEAU_SVM_BIND_COMMAND__MIGRATE: + break; + default: + return -EINVAL; + } + + priority = args->header >> NOUVEAU_SVM_BIND_PRIORITY_SHIFT; + priority &= NOUVEAU_SVM_BIND_PRIORITY_MASK; + + /* FIXME support CPU target ie all target value < GPU_VRAM */ + target = args->header >> NOUVEAU_SVM_BIND_TARGET_SHIFT; + target &= NOUVEAU_SVM_BIND_TARGET_MASK; + switch (target) { + case NOUVEAU_SVM_BIND_TARGET__GPU_VRAM: + break; + default: + return -EINVAL; + } + + /* + * FIXME: For now refuse non 0 stride, we need to change the migrate + * kernel function to handle stride to avoid to create a mess within + * each device driver. + */ + if (args->stride) + return -EINVAL; + + size = ((unsigned long)args->npages) << PAGE_SHIFT; + if ((args->va_start + size) <= args->va_start) + return -EINVAL; + if ((args->va_start + size) > args->va_end) + return -EINVAL; + + /* + * Ok we are ask to do something sane, for now we only support migrate + * commands but we will add things like memory policy (what to do on + * page fault) and maybe some other commands. + */ + + mm = get_task_mm(current); + down_read(&mm->mmap_sem); + + for (addr = args->va_start, end = args->va_start + size; addr < end;) { + struct vm_area_struct *vma; + unsigned long next; + + vma = find_vma_intersection(mm, addr, end); + if (!vma) + break; + + next = min(vma->vm_end, end); + /* This is a best effort so we ignore errors */ + nouveau_dmem_migrate_vma(cli->drm, vma, addr, next); + addr = next; + } + + /* + * FIXME Return the number of page we have migrated, again we need to + * update the migrate API to return that information so that we can + * report it to user space. + */ + args->result = 0; + + up_read(&mm->mmap_sem); + mmput(mm); + + return 0; +} + /* Unlink channel instance from SVMM. */ void nouveau_svmm_part(struct nouveau_svmm *svmm, u64 inst) diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.h b/drivers/gpu/drm/nouveau/nouveau_svm.h index 0379a1c316ca..e839d8189461 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.h +++ b/drivers/gpu/drm/nouveau/nouveau_svm.h @@ -17,6 +17,7 @@ int nouveau_svmm_init(struct drm_device *, void *, struct drm_file *); void nouveau_svmm_fini(struct nouveau_svmm **); int nouveau_svmm_join(struct nouveau_svmm *, u64 inst); void nouveau_svmm_part(struct nouveau_svmm *, u64 inst); +int nouveau_svmm_bind(struct drm_device *, void *, struct drm_file *); #else /* IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM) */ static inline void nouveau_svm_init(struct nouveau_drm *drm) {} static inline void nouveau_svm_fini(struct nouveau_drm *drm) {} @@ -37,5 +38,11 @@ static inline int nouveau_svmm_join(struct nouveau_svmm *svmm, u64 inst) } static inline void nouveau_svmm_part(struct nouveau_svmm *svmm, u64 inst) {} + +static inline int nouveau_svmm_bind(struct drm_device *device, void *p, + struct drm_file *file) +{ + return -ENOSYS; +} #endif /* IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM) */ #endif diff --git a/include/uapi/drm/nouveau_drm.h b/include/uapi/drm/nouveau_drm.h index afac182c80d8..9459a6e3bc1f 100644 --- a/include/uapi/drm/nouveau_drm.h +++ b/include/uapi/drm/nouveau_drm.h @@ -134,6 +134,7 @@ struct drm_nouveau_gem_cpu_fini { #define DRM_NOUVEAU_GPUOBJ_FREE 0x06 /* deprecated */ #define DRM_NOUVEAU_NVIF 0x07 #define DRM_NOUVEAU_SVM_INIT 0x08 +#define DRM_NOUVEAU_SVM_BIND 0x09 #define DRM_NOUVEAU_GEM_NEW 0x40 #define DRM_NOUVEAU_GEM_PUSHBUF 0x41 #define DRM_NOUVEAU_GEM_CPU_PREP 0x42 @@ -145,7 +146,49 @@ struct drm_nouveau_svm_init { __u64 unmanaged_size; }; +struct drm_nouveau_svm_bind { + __u64 header; + __u64 va_start; + __u64 va_end; + __u64 npages; + __u64 stride; + __u64 result; + __u64 reserved0; + __u64 reserved1; +}; + +#define NOUVEAU_SVM_BIND_COMMAND_SHIFT 0 +#define NOUVEAU_SVM_BIND_COMMAND_BITS 8 +#define NOUVEAU_SVM_BIND_COMMAND_MASK ((1 << 8) - 1) +#define NOUVEAU_SVM_BIND_PRIORITY_SHIFT 8 +#define NOUVEAU_SVM_BIND_PRIORITY_BITS 8 +#define NOUVEAU_SVM_BIND_PRIORITY_MASK ((1 << 8) - 1) +#define NOUVEAU_SVM_BIND_TARGET_SHIFT 16 +#define NOUVEAU_SVM_BIND_TARGET_BITS 32 +#define NOUVEAU_SVM_BIND_TARGET_MASK 0xffffffff + +/* + * Below is use to validate ioctl argument, userspace can also use it to make + * sure that no bit are set beyond known fields for a given kernel version. + */ +#define NOUVEAU_SVM_BIND_VALID_BITS 48 +#define NOUVEAU_SVM_BIND_VALID_MASK ((1ULL << NOUVEAU_SVM_BIND_VALID_BITS) - 1) + + +/* + * NOUVEAU_BIND_COMMAND__MIGRATE: synchronous migrate to target memory. + * result: number of page successfuly migrate to the target memory. + */ +#define NOUVEAU_SVM_BIND_COMMAND__MIGRATE 0 + +/* + * NOUVEAU_SVM_BIND_HEADER_TARGET__GPU_VRAM: target the GPU VRAM memory. + */ +#define NOUVEAU_SVM_BIND_TARGET__GPU_VRAM (1UL << 31) + + #define DRM_IOCTL_NOUVEAU_SVM_INIT DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_SVM_INIT, struct drm_nouveau_svm_init) +#define DRM_IOCTL_NOUVEAU_SVM_BIND DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_SVM_BIND, struct drm_nouveau_svm_bind) #define DRM_IOCTL_NOUVEAU_GEM_NEW DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_GEM_NEW, struct drm_nouveau_gem_new) #define DRM_IOCTL_NOUVEAU_GEM_PUSHBUF DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_GEM_PUSHBUF, struct drm_nouveau_gem_pushbuf) -- cgit v1.3-7-g2ca7 From 3856ec4b93c9463d36ee39098dde1fbbd29ec6dd Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Fri, 15 Feb 2019 11:03:53 -0800 Subject: RDMA/core: Add RDMA_NLDEV_CMD_NEWLINK/DELLINK support Add support for new LINK messages to allow adding and deleting rdma interfaces. This will be used initially for soft rdma drivers which instantiate device instances dynamically by the admin specifying a netdev device to use. The rdma_rxe module will be the first user of these messages. The design is modeled after RTNL_NEWLINK/DELLINK: rdma drivers register with the rdma core if they provide link add/delete functions. Each driver registers with a unique "type" string, that is used to dispatch messages coming from user space. A new RDMA_NLDEV_ATTR is defined for the "type" string. User mode will pass 3 attributes in a NEWLINK message: RDMA_NLDEV_ATTR_DEV_NAME for the desired rdma device name to be created, RDMA_NLDEV_ATTR_LINK_TYPE for the "type" of link being added, and RDMA_NLDEV_ATTR_NDEV_NAME for the net_device interface to use for this link. The DELLINK message will contain the RDMA_NLDEV_ATTR_DEV_INDEX of the device to delete. Signed-off-by: Steve Wise Reviewed-by: Leon Romanovsky Reviewed-by: Michael J. Ruhl Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/nldev.c | 122 +++++++++++++++++++++++++++++++++++++++ include/rdma/ib_verbs.h | 3 + include/rdma/rdma_netlink.h | 11 ++++ include/uapi/rdma/rdma_netlink.h | 10 +++- 4 files changed, 144 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 1980ddc5f7bc..5e94dc87f04f 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -113,6 +114,8 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_RES_MRN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_CM_IDN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_CTXN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_LINK_TYPE] = { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, @@ -1200,6 +1203,117 @@ RES_GET_FUNCS(cq, RDMA_RESTRACK_CQ); RES_GET_FUNCS(pd, RDMA_RESTRACK_PD); RES_GET_FUNCS(mr, RDMA_RESTRACK_MR); +static LIST_HEAD(link_ops); +static DECLARE_RWSEM(link_ops_rwsem); + +static const struct rdma_link_ops *link_ops_get(const char *type) +{ + const struct rdma_link_ops *ops; + + list_for_each_entry(ops, &link_ops, list) { + if (!strcmp(ops->type, type)) + goto out; + } + ops = NULL; +out: + return ops; +} + +void rdma_link_register(struct rdma_link_ops *ops) +{ + down_write(&link_ops_rwsem); + if (link_ops_get(ops->type)) { + WARN_ONCE("Duplicate rdma_link_ops! %s\n", ops->type); + goto out; + } + list_add(&ops->list, &link_ops); +out: + up_write(&link_ops_rwsem); +} +EXPORT_SYMBOL(rdma_link_register); + +void rdma_link_unregister(struct rdma_link_ops *ops) +{ + down_write(&link_ops_rwsem); + list_del(&ops->list); + up_write(&link_ops_rwsem); +} +EXPORT_SYMBOL(rdma_link_unregister); + +static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + char ibdev_name[IB_DEVICE_NAME_MAX]; + const struct rdma_link_ops *ops; + char ndev_name[IFNAMSIZ]; + struct net_device *ndev; + char type[IFNAMSIZ]; + int err; + + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] || + !tb[RDMA_NLDEV_ATTR_LINK_TYPE] || !tb[RDMA_NLDEV_ATTR_NDEV_NAME]) + return -EINVAL; + + nla_strlcpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME], + sizeof(ibdev_name)); + if (strchr(ibdev_name, '%')) + return -EINVAL; + + nla_strlcpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type)); + nla_strlcpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME], + sizeof(ndev_name)); + + ndev = dev_get_by_name(&init_net, ndev_name); + if (!ndev) + return -ENODEV; + + down_read(&link_ops_rwsem); + ops = link_ops_get(type); +#ifdef CONFIG_MODULES + if (!ops) { + up_read(&link_ops_rwsem); + request_module("rdma-link-%s", type); + down_read(&link_ops_rwsem); + ops = link_ops_get(type); + } +#endif + err = ops ? ops->newlink(ibdev_name, ndev) : -EINVAL; + up_read(&link_ops_rwsem); + dev_put(ndev); + + return err; +} + +static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + u32 index; + int err; + + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(index); + if (!device) + return -EINVAL; + + if (!(device->attrs.device_cap_flags & IB_DEVICE_ALLOW_USER_UNREG)) { + ib_device_put(device); + return -EINVAL; + } + + ib_unregister_device_and_put(device); + return 0; +} + static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_GET] = { .doit = nldev_get_doit, @@ -1209,6 +1323,14 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { .doit = nldev_set_doit, .flags = RDMA_NL_ADMIN_PERM, }, + [RDMA_NLDEV_CMD_NEWLINK] = { + .doit = nldev_newlink, + .flags = RDMA_NL_ADMIN_PERM, + }, + [RDMA_NLDEV_CMD_DELLINK] = { + .doit = nldev_dellink, + .flags = RDMA_NL_ADMIN_PERM, + }, [RDMA_NLDEV_CMD_PORT_GET] = { .doit = nldev_port_get_doit, .dump = nldev_port_get_dumpit, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 640263289ab9..225cb76d469f 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -238,6 +238,7 @@ enum ib_device_cap_flags { IB_DEVICE_RDMA_NETDEV_OPA_VNIC = (1ULL << 35), /* The device supports padding incoming writes to cacheline. */ IB_DEVICE_PCI_WRITE_END_PADDING = (1ULL << 36), + IB_DEVICE_ALLOW_USER_UNREG = (1ULL << 37), }; enum ib_signature_prot_cap { @@ -2622,6 +2623,8 @@ struct ib_device { refcount_t refcount; struct completion unreg_completion; struct work_struct unregistration_work; + + const struct rdma_link_ops *link_ops; }; struct ib_client { diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index 70218e6b5187..10732ab31ba2 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -99,4 +99,15 @@ int rdma_nl_multicast(struct sk_buff *skb, unsigned int group, gfp_t flags); * Returns true on success or false if no listeners. */ bool rdma_nl_chk_listeners(unsigned int group); + +struct rdma_link_ops { + struct list_head list; + const char *type; + int (*newlink)(const char *ibdev_name, struct net_device *ndev); +}; + +void rdma_link_register(struct rdma_link_ops *ops); +void rdma_link_unregister(struct rdma_link_ops *ops); + +#define MODULE_ALIAS_RDMA_LINK(type) MODULE_ALIAS("rdma-link-" type) #endif /* _RDMA_NETLINK_H */ diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 4ebbcfb2c6ef..5cc592728071 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -255,9 +255,11 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_GET, /* can dump */ RDMA_NLDEV_CMD_SET, - /* 3 - 4 are free to use */ + RDMA_NLDEV_CMD_NEWLINK, - RDMA_NLDEV_CMD_PORT_GET = 5, /* can dump */ + RDMA_NLDEV_CMD_DELLINK, + + RDMA_NLDEV_CMD_PORT_GET, /* can dump */ /* 6 - 8 are free to use */ @@ -465,6 +467,10 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_RES_MRN, /* u32 */ RDMA_NLDEV_ATTR_RES_CM_IDN, /* u32 */ RDMA_NLDEV_ATTR_RES_CTXN, /* u32 */ + /* + * Identifies the rdma driver. eg: "rxe" or "siw" + */ + RDMA_NLDEV_ATTR_LINK_TYPE, /* string */ /* * Always the end -- cgit v1.3-7-g2ca7 From fadccd8fc2d06cf7fd222245d7e04b00fae946cf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Feb 2019 09:37:13 +0100 Subject: nvme_ioctl.h: remove duplicate GPL boilerplate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We already have a ЅPDX header, so no need to duplicate the information. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg --- include/linux/nvme.h | 10 +--------- include/uapi/linux/nvme_ioctl.h | 9 --------- 2 files changed, 1 insertion(+), 18 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index bbcc83886899..baa49e6a23cc 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1,15 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Definitions for the NVM Express interface * Copyright (c) 2011-2014, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #ifndef _LINUX_NVME_H diff --git a/include/uapi/linux/nvme_ioctl.h b/include/uapi/linux/nvme_ioctl.h index 6e74b1eaf541..1c215ea1798e 100644 --- a/include/uapi/linux/nvme_ioctl.h +++ b/include/uapi/linux/nvme_ioctl.h @@ -2,15 +2,6 @@ /* * Definitions for the NVM Express ioctl interface * Copyright (c) 2011-2014, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #ifndef _UAPI_LINUX_NVME_IOCTL_H -- cgit v1.3-7-g2ca7 From 61697a6abd24acba941359c6268a94f4afe4a53d Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 18 Jan 2019 14:19:26 -0500 Subject: dm: eliminate 'split_discard_bios' flag from DM target interface There is no need to have DM core split discards on behalf of a DM target now that blk_queue_split() handles splitting discards based on the queue_limits. A DM target just needs to set max_discard_sectors, discard_granularity, etc, in queue_limits. Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-target.c | 1 - drivers/md/dm-raid.c | 14 +++++++++----- drivers/md/dm-thin.c | 1 - drivers/md/dm-zoned-target.c | 1 - drivers/md/dm.c | 25 ++++++------------------- include/linux/device-mapper.h | 6 ------ include/uapi/linux/dm-ioctl.h | 4 ++-- 7 files changed, 17 insertions(+), 35 deletions(-) (limited to 'include/uapi') diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index b29a8327eed1..adc529f12b6b 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2496,7 +2496,6 @@ static int cache_create(struct cache_args *ca, struct cache **result) ti->num_discard_bios = 1; ti->discards_supported = true; - ti->split_discard_bios = false; ti->per_io_data_size = sizeof(struct per_bio_data); diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index adcfe8ae10aa..9fdef6897316 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -2986,11 +2986,6 @@ static void configure_discard_support(struct raid_set *rs) } } - /* - * RAID1 and RAID10 personalities require bio splitting, - * RAID0/4/5/6 don't and process large discard bios properly. - */ - ti->split_discard_bios = !!(rs_is_raid1(rs) || rs_is_raid10(rs)); ti->num_discard_bios = 1; } @@ -3747,6 +3742,15 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) blk_limits_io_min(limits, chunk_size); blk_limits_io_opt(limits, chunk_size * mddev_data_stripes(rs)); + + /* + * RAID1 and RAID10 personalities require bio splitting, + * RAID0/4/5/6 don't and process large discard bios properly. + */ + if (rs_is_raid1(rs) || rs_is_raid10(rs)) { + limits->discard_granularity = chunk_size; + limits->max_discard_sectors = chunk_size; + } } static void raid_postsuspend(struct dm_target *ti) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index e83b63608262..0d9ded0f5e50 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -4227,7 +4227,6 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) if (tc->pool->pf.discard_enabled) { ti->discards_supported = true; ti->num_discard_bios = 1; - ti->split_discard_bios = false; } mutex_unlock(&dm_thin_pool_table.mutex); diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 6af5babe6837..8865c1709e16 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -727,7 +727,6 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->per_io_data_size = sizeof(struct dmz_bioctx); ti->flush_supported = true; ti->discards_supported = true; - ti->split_discard_bios = true; /* The exposed capacity is the number of chunks that can be mapped */ ti->len = (sector_t)dmz_nr_chunks(dmz->metadata) << dev->zone_nr_sectors_shift; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7a774fcd0194..55f12df3589d 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1478,17 +1478,10 @@ static unsigned get_num_write_zeroes_bios(struct dm_target *ti) return ti->num_write_zeroes_bios; } -typedef bool (*is_split_required_fn)(struct dm_target *ti); - -static bool is_split_required_for_discard(struct dm_target *ti) -{ - return ti->split_discard_bios; -} - static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti, - unsigned num_bios, bool is_split_required) + unsigned num_bios) { - unsigned len; + unsigned len = ci->sector_count; /* * Even though the device advertised support for this type of @@ -1499,11 +1492,6 @@ static int __send_changing_extent_only(struct clone_info *ci, struct dm_target * if (!num_bios) return -EOPNOTSUPP; - if (!is_split_required) - len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); - else - len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti)); - __send_duplicate_bios(ci, ti, num_bios, &len); ci->sector += len; @@ -1514,23 +1502,22 @@ static int __send_changing_extent_only(struct clone_info *ci, struct dm_target * static int __send_discard(struct clone_info *ci, struct dm_target *ti) { - return __send_changing_extent_only(ci, ti, get_num_discard_bios(ti), - is_split_required_for_discard(ti)); + return __send_changing_extent_only(ci, ti, get_num_discard_bios(ti)); } static int __send_secure_erase(struct clone_info *ci, struct dm_target *ti) { - return __send_changing_extent_only(ci, ti, get_num_secure_erase_bios(ti), false); + return __send_changing_extent_only(ci, ti, get_num_secure_erase_bios(ti)); } static int __send_write_same(struct clone_info *ci, struct dm_target *ti) { - return __send_changing_extent_only(ci, ti, get_num_write_same_bios(ti), false); + return __send_changing_extent_only(ci, ti, get_num_write_same_bios(ti)); } static int __send_write_zeroes(struct clone_info *ci, struct dm_target *ti) { - return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios(ti), false); + return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios(ti)); } static bool is_abnormal_io(struct bio *bio) diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index e528baebad69..0f5b3d7c6cb3 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -315,12 +315,6 @@ struct dm_target { * whether or not its underlying devices have support. */ bool discards_supported:1; - - /* - * Set if the target required discard bios to be split - * on max_io_len boundary. - */ - bool split_discard_bios:1; }; /* Each target can link one of these into the table */ diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index d1e49514977b..f396a82dfd3e 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -270,9 +270,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 39 +#define DM_VERSION_MINOR 40 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2018-04-03)" +#define DM_VERSION_EXTRA "-ioctl (2019-01-18)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ -- cgit v1.3-7-g2ca7 From 663586c0a8929db81e617c775823efb9d65f2bc2 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Wed, 7 Nov 2018 23:16:19 +0100 Subject: ubi: Expose the bitrot interface Using UBI_IOCRPEB and UBI_IOCSPEB userspace can force reading and scrubbing of PEBs. In case of bitflips UBI will automatically take action and move data to a different PEB. This interface allows a daemon to foster your NAND. Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/cdev.c | 30 +++++++++ drivers/mtd/ubi/ubi.h | 1 + drivers/mtd/ubi/wl.c | 144 ++++++++++++++++++++++++++++++++++++++++++++ include/uapi/mtd/ubi-user.h | 5 ++ 4 files changed, 180 insertions(+) (limited to 'include/uapi') diff --git a/drivers/mtd/ubi/cdev.c b/drivers/mtd/ubi/cdev.c index 22547d7a84ea..947a8adbc799 100644 --- a/drivers/mtd/ubi/cdev.c +++ b/drivers/mtd/ubi/cdev.c @@ -974,6 +974,36 @@ static long ubi_cdev_ioctl(struct file *file, unsigned int cmd, break; } + /* Check a specific PEB for bitflips and scrub it if needed */ + case UBI_IOCRPEB: + { + int pnum; + + err = get_user(pnum, (__user int32_t *)argp); + if (err) { + err = -EFAULT; + break; + } + + err = ubi_bitflip_check(ubi, pnum, 0); + break; + } + + /* Force scrubbing for a specific PEB */ + case UBI_IOCSPEB: + { + int pnum; + + err = get_user(pnum, (__user int32_t *)argp); + if (err) { + err = -EFAULT; + break; + } + + err = ubi_bitflip_check(ubi, pnum, 1); + break; + } + default: err = -ENOTTY; break; diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h index d47b9e436e67..a1b9e764d489 100644 --- a/drivers/mtd/ubi/ubi.h +++ b/drivers/mtd/ubi/ubi.h @@ -929,6 +929,7 @@ int ubi_wl_put_fm_peb(struct ubi_device *ubi, struct ubi_wl_entry *used_e, int ubi_is_erase_work(struct ubi_work *wrk); void ubi_refill_pools(struct ubi_device *ubi); int ubi_ensure_anchor_pebs(struct ubi_device *ubi); +int ubi_bitflip_check(struct ubi_device *ubi, int pnum, int force_scrub); /* io.c */ int ubi_io_read(const struct ubi_device *ubi, void *buf, int pnum, int offset, diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c index ca1b31385eb5..40f838d54b0f 100644 --- a/drivers/mtd/ubi/wl.c +++ b/drivers/mtd/ubi/wl.c @@ -1440,6 +1440,150 @@ int ubi_wl_flush(struct ubi_device *ubi, int vol_id, int lnum) return err; } +static bool scrub_possible(struct ubi_device *ubi, struct ubi_wl_entry *e) +{ + if (in_wl_tree(e, &ubi->scrub)) + return false; + else if (in_wl_tree(e, &ubi->erroneous)) + return false; + else if (ubi->move_from == e) + return false; + else if (ubi->move_to == e) + return false; + + return true; +} + +/** + * ubi_bitflip_check - Check an eraseblock for bitflips and scrub it if needed. + * @ubi: UBI device description object + * @pnum: the physical eraseblock to schedule + * @force: dont't read the block, assume bitflips happened and take action. + * + * This function reads the given eraseblock and checks if bitflips occured. + * In case of bitflips, the eraseblock is scheduled for scrubbing. + * If scrubbing is forced with @force, the eraseblock is not read, + * but scheduled for scrubbing right away. + * + * Returns: + * %EINVAL, PEB is out of range + * %ENOENT, PEB is no longer used by UBI + * %EBUSY, PEB cannot be checked now or a check is currently running on it + * %EAGAIN, bit flips happened but scrubbing is currently not possible + * %EUCLEAN, bit flips happened and PEB is scheduled for scrubbing + * %0, no bit flips detected + */ +int ubi_bitflip_check(struct ubi_device *ubi, int pnum, int force) +{ + int err; + struct ubi_wl_entry *e; + + if (pnum < 0 || pnum >= ubi->peb_count) { + err = -EINVAL; + goto out; + } + + /* + * Pause all parallel work, otherwise it can happen that the + * erase worker frees a wl entry under us. + */ + down_write(&ubi->work_sem); + + /* + * Make sure that the wl entry does not change state while + * inspecting it. + */ + spin_lock(&ubi->wl_lock); + e = ubi->lookuptbl[pnum]; + if (!e) { + spin_unlock(&ubi->wl_lock); + err = -ENOENT; + goto out_resume; + } + + /* + * Does it make sense to check this PEB? + */ + if (!scrub_possible(ubi, e)) { + spin_unlock(&ubi->wl_lock); + err = -EBUSY; + goto out_resume; + } + spin_unlock(&ubi->wl_lock); + + if (!force) { + mutex_lock(&ubi->buf_mutex); + err = ubi_io_read(ubi, ubi->peb_buf, pnum, 0, ubi->peb_size); + mutex_unlock(&ubi->buf_mutex); + } + + if (err == UBI_IO_BITFLIPS || force) { + /* + * Okay, bit flip happened, let's figure out what we can do. + */ + spin_lock(&ubi->wl_lock); + + /* + * Recheck. We released wl_lock, UBI might have killed the + * wl entry under us. + */ + e = ubi->lookuptbl[pnum]; + if (!e) { + spin_unlock(&ubi->wl_lock); + err = -ENOENT; + goto out_resume; + } + + /* + * Need to re-check state + */ + if (!scrub_possible(ubi, e)) { + spin_unlock(&ubi->wl_lock); + err = -EBUSY; + goto out_resume; + } + + if (in_pq(ubi, e)) { + prot_queue_del(ubi, e->pnum); + wl_tree_add(e, &ubi->scrub); + spin_unlock(&ubi->wl_lock); + + err = ensure_wear_leveling(ubi, 1); + } else if (in_wl_tree(e, &ubi->used)) { + rb_erase(&e->u.rb, &ubi->used); + wl_tree_add(e, &ubi->scrub); + spin_unlock(&ubi->wl_lock); + + err = ensure_wear_leveling(ubi, 1); + } else if (in_wl_tree(e, &ubi->free)) { + rb_erase(&e->u.rb, &ubi->free); + ubi->free_count--; + spin_unlock(&ubi->wl_lock); + + /* + * This PEB is empty we can schedule it for + * erasure right away. No wear leveling needed. + */ + err = schedule_erase(ubi, e, UBI_UNKNOWN, UBI_UNKNOWN, + force ? 0 : 1, true); + } else { + spin_unlock(&ubi->wl_lock); + err = -EAGAIN; + } + + if (!err && !force) + err = -EUCLEAN; + } else { + err = 0; + } + +out_resume: + up_write(&ubi->work_sem); +out: + + return err; +} + /** * tree_destroy - destroy an RB-tree. * @ubi: UBI device description object diff --git a/include/uapi/mtd/ubi-user.h b/include/uapi/mtd/ubi-user.h index aad3b6201fc0..b69e9ba6742b 100644 --- a/include/uapi/mtd/ubi-user.h +++ b/include/uapi/mtd/ubi-user.h @@ -171,6 +171,11 @@ /* Re-name volumes */ #define UBI_IOCRNVOL _IOW(UBI_IOC_MAGIC, 3, struct ubi_rnvol_req) +/* Read the specified PEB and scrub it if there are bitflips */ +#define UBI_IOCRPEB _IOW(UBI_IOC_MAGIC, 4, __s32) +/* Force scrubbing on the specified PEB */ +#define UBI_IOCSPEB _IOW(UBI_IOC_MAGIC, 5, __s32) + /* ioctl commands of the UBI control character device */ #define UBI_CTRL_IOC_MAGIC 'o' -- cgit v1.3-7-g2ca7 From 228a73abde5c04428678e917b271f8526cfd90ed Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Fri, 4 Jan 2019 13:31:54 +0800 Subject: btrfs: introduce new ioctl to unregister a btrfs device Support for a new command that can be used eg. as a command $ btrfs device scan --forget [dev]' (the final name may change though) to undo the effects of 'btrfs device scan [dev]'. For this purpose this patch proposes to use ioctl #5 as it was empty and is next to the SCAN ioctl. The new ioctl BTRFS_IOC_FORGET_DEV works only on the control device (/dev/btrfs-control) to unregister one or all devices, devices that are not mounted. The argument is struct btrfs_ioctl_vol_args, ::name specifies the device path. To unregister all device, the path is an empty string. Again, the devices are removed only if they aren't part of a mounte filesystem. This new ioctl provides: - release of unwanted btrfs_fs_devices and btrfs_devices structures from memory if the device is not going to be mounted - ability to mount filesystem in degraded mode, when one devices is corrupted like in split brain raid1 - running test cases which would require reloading the kernel module but this is not possible eg. due to mounted filesystem or built-in Signed-off-by: Anand Jain Reviewed-by: David Sterba [ update changelog ] Signed-off-by: David Sterba --- fs/btrfs/super.c | 3 +++ fs/btrfs/volumes.c | 11 +++++++++++ fs/btrfs/volumes.h | 1 + include/uapi/linux/btrfs.h | 2 ++ 4 files changed, 17 insertions(+) (limited to 'include/uapi') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0a3f122dd61f..f9d13a30aa8a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2190,6 +2190,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, ret = PTR_ERR_OR_ZERO(device); mutex_unlock(&uuid_mutex); break; + case BTRFS_IOC_FORGET_DEV: + ret = btrfs_forget_devices(vol->name); + break; case BTRFS_IOC_DEVICES_READY: mutex_lock(&uuid_mutex); device = btrfs_scan_one_device(vol->name, FMODE_READ, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7ca31c83e730..fe122e6099ae 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1446,6 +1446,17 @@ static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, return 0; } +int btrfs_forget_devices(const char *path) +{ + int ret; + + mutex_lock(&uuid_mutex); + ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL); + mutex_unlock(&uuid_mutex); + + return ret; +} + /* * Look for a btrfs signature on a device. This may be called out of the mount path * and we are not allowed to call set_blocksize during the scan. The superblock diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 656ea8d85770..3ad9d58d1b66 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -416,6 +416,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, void *holder); +int btrfs_forget_devices(const char *path); int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step); void btrfs_assign_next_active_device(struct btrfs_device *device, diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index e0763bc4158e..c195896d478f 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -837,6 +837,8 @@ enum btrfs_err_code { struct btrfs_ioctl_vol_args) #define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \ struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_FORGET_DEV _IOW(BTRFS_IOCTL_MAGIC, 5, \ + struct btrfs_ioctl_vol_args) /* trans start and trans end are dangerous, and only for * use by applications that know how to avoid the * resulting deadlocks -- cgit v1.3-7-g2ca7 From e5567f5f67621877726f99be040af9fbedda37dc Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Tue, 19 Feb 2019 11:04:51 -0800 Subject: PCI/ATS: Add pci_prg_resp_pasid_required() interface. Return the PRG Response PASID Required bit in the Page Request Status Register. As per PCIe spec r4.0, sec 10.5.2.3, if this bit is Set, the device expects a PASID TLP Prefix on PRG Response Messages when the corresponding Page Requests had a PASID TLP Prefix. If Clear, the device does not expect PASID TLP Prefixes on any PRG Response Message, and the device behavior is undefined if the device receives a PRG Response Message with a PASID TLP Prefix. Also the device behavior is undefined if this bit is Set and the device receives a PRG Response Message with no PASID TLP Prefix when the corresponding Page Requests had a PASID TLP Prefix. This function will be used by drivers like IOMMU, if it is required to check the status of the PRG Response PASID Required bit before enabling the PASID support of the device. Cc: Ashok Raj Cc: Jacob Pan Cc: Keith Busch Suggested-by: Ashok Raj Signed-off-by: Kuppuswamy Sathyanarayanan Acked-by: Bjorn Helgaas Signed-off-by: Joerg Roedel --- drivers/pci/ats.c | 30 ++++++++++++++++++++++++++++++ include/linux/pci-ats.h | 5 +++++ include/uapi/linux/pci_regs.h | 1 + 3 files changed, 36 insertions(+) (limited to 'include/uapi') diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c index 5b78f3b1b918..420cd0a578d0 100644 --- a/drivers/pci/ats.c +++ b/drivers/pci/ats.c @@ -368,6 +368,36 @@ int pci_pasid_features(struct pci_dev *pdev) } EXPORT_SYMBOL_GPL(pci_pasid_features); +/** + * pci_prg_resp_pasid_required - Return PRG Response PASID Required bit + * status. + * @pdev: PCI device structure + * + * Returns 1 if PASID is required in PRG Response Message, 0 otherwise. + * + * Even though the PRG response PASID status is read from PRI Status + * Register, since this API will mainly be used by PASID users, this + * function is defined within #ifdef CONFIG_PCI_PASID instead of + * CONFIG_PCI_PRI. + */ +int pci_prg_resp_pasid_required(struct pci_dev *pdev) +{ + u16 status; + int pos; + + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); + if (!pos) + return 0; + + pci_read_config_word(pdev, pos + PCI_PRI_STATUS, &status); + + if (status & PCI_PRI_STATUS_PASID) + return 1; + + return 0; +} +EXPORT_SYMBOL_GPL(pci_prg_resp_pasid_required); + #define PASID_NUMBER_SHIFT 8 #define PASID_NUMBER_MASK (0x1f << PASID_NUMBER_SHIFT) /** diff --git a/include/linux/pci-ats.h b/include/linux/pci-ats.h index 7c4b8e27268c..facfd6a18fe1 100644 --- a/include/linux/pci-ats.h +++ b/include/linux/pci-ats.h @@ -40,6 +40,7 @@ void pci_disable_pasid(struct pci_dev *pdev); void pci_restore_pasid_state(struct pci_dev *pdev); int pci_pasid_features(struct pci_dev *pdev); int pci_max_pasids(struct pci_dev *pdev); +int pci_prg_resp_pasid_required(struct pci_dev *pdev); #else /* CONFIG_PCI_PASID */ @@ -66,6 +67,10 @@ static inline int pci_max_pasids(struct pci_dev *pdev) return -EINVAL; } +static int pci_prg_resp_pasid_required(struct pci_dev *pdev) +{ + return 0; +} #endif /* CONFIG_PCI_PASID */ diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index e1e9888c85e6..898be572b010 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -880,6 +880,7 @@ #define PCI_PRI_STATUS_RF 0x001 /* Response Failure */ #define PCI_PRI_STATUS_UPRGI 0x002 /* Unexpected PRG index */ #define PCI_PRI_STATUS_STOPPED 0x100 /* PRI Stopped */ +#define PCI_PRI_STATUS_PASID 0x8000 /* PRG Response PASID Required */ #define PCI_PRI_MAX_REQ 0x08 /* PRI max reqs supported */ #define PCI_PRI_ALLOC_REQ 0x0c /* PRI max reqs allowed */ #define PCI_EXT_CAP_PRI_SIZEOF 16 -- cgit v1.3-7-g2ca7 From 8c938ddc6df3bbe72809db1be6c9f3af83f5d7a9 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Tue, 19 Feb 2019 11:06:09 -0800 Subject: PCI/ATS: Add pci_ats_page_aligned() interface Return the Page Aligned Request bit in the ATS Capability Register. As per PCIe spec r4.0, sec 10.5.1.2, if the Page Aligned Request bit is set, it indicates the Untranslated Addresses generated by the device are always aligned to a 4096 byte boundary. An IOMMU that can only translate page-aligned addresses can only be used with devices that always produce aligned Untranslated Addresses. This interface will be used by drivers for such IOMMUs to determine whether devices can use the ATS service. Cc: Ashok Raj Cc: Jacob Pan Cc: Keith Busch Suggested-by: Ashok Raj Signed-off-by: Kuppuswamy Sathyanarayanan Acked-by: Bjorn Helgaas Signed-off-by: Joerg Roedel --- drivers/pci/ats.c | 27 +++++++++++++++++++++++++++ include/linux/pci.h | 2 ++ include/uapi/linux/pci_regs.h | 1 + 3 files changed, 30 insertions(+) (limited to 'include/uapi') diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c index 420cd0a578d0..97c08146534a 100644 --- a/drivers/pci/ats.c +++ b/drivers/pci/ats.c @@ -142,6 +142,33 @@ int pci_ats_queue_depth(struct pci_dev *dev) } EXPORT_SYMBOL_GPL(pci_ats_queue_depth); +/** + * pci_ats_page_aligned - Return Page Aligned Request bit status. + * @pdev: the PCI device + * + * Returns 1, if the Untranslated Addresses generated by the device + * are always aligned or 0 otherwise. + * + * Per PCIe spec r4.0, sec 10.5.1.2, if the Page Aligned Request bit + * is set, it indicates the Untranslated Addresses generated by the + * device are always aligned to a 4096 byte boundary. + */ +int pci_ats_page_aligned(struct pci_dev *pdev) +{ + u16 cap; + + if (!pdev->ats_cap) + return 0; + + pci_read_config_word(pdev, pdev->ats_cap + PCI_ATS_CAP, &cap); + + if (cap & PCI_ATS_CAP_PAGE_ALIGNED) + return 1; + + return 0; +} +EXPORT_SYMBOL_GPL(pci_ats_page_aligned); + #ifdef CONFIG_PCI_PRI /** * pci_enable_pri - Enable PRI capability diff --git a/include/linux/pci.h b/include/linux/pci.h index 65f1d8c2f082..9724a8c0496b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1524,11 +1524,13 @@ void pci_ats_init(struct pci_dev *dev); int pci_enable_ats(struct pci_dev *dev, int ps); void pci_disable_ats(struct pci_dev *dev); int pci_ats_queue_depth(struct pci_dev *dev); +int pci_ats_page_aligned(struct pci_dev *dev); #else static inline void pci_ats_init(struct pci_dev *d) { } static inline int pci_enable_ats(struct pci_dev *d, int ps) { return -ENODEV; } static inline void pci_disable_ats(struct pci_dev *d) { } static inline int pci_ats_queue_depth(struct pci_dev *d) { return -ENODEV; } +static inline int pci_ats_page_aligned(struct pci_dev *dev) { return 0; } #endif #ifdef CONFIG_PCIE_PTM diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 898be572b010..5c98133f2c94 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -866,6 +866,7 @@ #define PCI_ATS_CAP 0x04 /* ATS Capability Register */ #define PCI_ATS_CAP_QDEP(x) ((x) & 0x1f) /* Invalidate Queue Depth */ #define PCI_ATS_MAX_QDEP 32 /* Max Invalidate Queue Depth */ +#define PCI_ATS_CAP_PAGE_ALIGNED 0x0020 /* Page Aligned Request */ #define PCI_ATS_CTRL 0x06 /* ATS Control Register */ #define PCI_ATS_CTRL_ENABLE 0x8000 /* ATS Enable */ #define PCI_ATS_CTRL_STU(x) ((x) & 0x1f) /* Smallest Translation Unit */ -- cgit v1.3-7-g2ca7 From 230afe74d139f37ba5e344ad4e53d65911d12188 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Wed, 27 Feb 2019 00:19:18 +0200 Subject: habanalabs: allow memory allocations larger than 4GB This patch increase the size field in the uapi structure of the Memory IOCTL from 32-bit to 64-bit. This is to allow the user to allocate and/or map memory in chunks that are larger then 4GB. Goya's device memory (DRAM) can be up to 16GB, and for certain topologies, the user may want an allocation that is larger than 4GB. This change doesn't break current user-space because there was a "pad" field in the uapi structure right after the size field. Changing the size field to be 64-bit and removing the pad field maintains compatibility with current user-space. Signed-off-by: Oded Gabbay Signed-off-by: Greg Kroah-Hartman --- drivers/misc/habanalabs/habanalabs.h | 2 +- drivers/misc/habanalabs/memory.c | 10 ++++------ include/uapi/misc/habanalabs.h | 6 ++---- 3 files changed, 7 insertions(+), 11 deletions(-) (limited to 'include/uapi') diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index 901542d685e8..fdf517448599 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -1320,7 +1320,7 @@ void hl_vm_ctx_fini(struct hl_ctx *ctx); int hl_vm_init(struct hl_device *hdev); void hl_vm_fini(struct hl_device *hdev); -int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u32 size, +int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u64 size, struct hl_userptr *userptr); int hl_unpin_host_memory(struct hl_device *hdev, struct hl_userptr *userptr); void hl_userptr_delete_list(struct hl_device *hdev, diff --git a/drivers/misc/habanalabs/memory.c b/drivers/misc/habanalabs/memory.c index 9e3491dc3b55..4b57d7ce50dd 100644 --- a/drivers/misc/habanalabs/memory.c +++ b/drivers/misc/habanalabs/memory.c @@ -1210,7 +1210,7 @@ out: * - Pins the physical pages * - Create a SG list from those pages */ -int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u32 size, +int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u64 size, struct hl_userptr *userptr) { u64 start, end; @@ -1218,14 +1218,12 @@ int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u32 size, int rc; if (!size) { - dev_err(hdev->dev, "size to pin is invalid - %d\n", - size); + dev_err(hdev->dev, "size to pin is invalid - %llu\n", size); return -EINVAL; } if (!access_ok((void __user *) (uintptr_t) addr, size)) { - dev_err(hdev->dev, "user pointer is invalid - 0x%llx\n", - addr); + dev_err(hdev->dev, "user pointer is invalid - 0x%llx\n", addr); return -EFAULT; } @@ -1236,7 +1234,7 @@ int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u32 size, if (((addr + size) < addr) || PAGE_ALIGN(addr + size) < (addr + size)) { dev_err(hdev->dev, - "user pointer 0x%llx + %u causes integer overflow\n", + "user pointer 0x%llx + %llu causes integer overflow\n", addr, size); return -EINVAL; } diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index 4afc1891ece8..23d6ad3459cb 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -237,8 +237,7 @@ struct hl_mem_in { /* HL_MEM_OP_ALLOC- allocate device memory */ struct { /* Size to alloc */ - __u32 mem_size; - __u32 pad; + __u64 mem_size; } alloc; /* HL_MEM_OP_FREE - free device memory */ @@ -278,8 +277,7 @@ struct hl_mem_in { */ __u64 hint_addr; /* Size of allocated host memory */ - __u32 mem_size; - __u32 pad; + __u64 mem_size; } map_host; /* HL_MEM_OP_UNMAP - unmap host memory */ -- cgit v1.3-7-g2ca7 From 541664d360d1fdaa116473410265b2cb8a806b50 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Thu, 28 Feb 2019 11:55:44 +0200 Subject: habanalabs: add comments in uapi/misc/habanalabs.h Add comment about minimum and maximum size of command buffer. Add some text about the expected input of CS IOCTL. Signed-off-by: Oded Gabbay Signed-off-by: Greg Kroah-Hartman --- include/uapi/misc/habanalabs.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index 23d6ad3459cb..7fd6f633534c 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -112,7 +112,9 @@ struct hl_cb_in { __u64 cb_handle; /* HL_CB_OP_* */ __u32 op; - /* Size of CB. Minimum requested size must be PAGE_SIZE */ + /* Size of CB. Maximum size is 2MB. The minimum size that will be + * allocated, regardless of this parameter's value, is PAGE_SIZE + */ __u32 cb_size; /* Context ID - Currently not in use */ __u32 ctx_id; @@ -364,6 +366,12 @@ union hl_mem_args { * internal. The driver will get completion notifications from the device only * on JOBS which are enqueued in the external queues. * + * For jobs on external queues, the user needs to create command buffers + * through the CB ioctl and give the CB's handle to the CS ioctl. For jobs on + * internal queues, the user needs to prepare a "command buffer" with packets + * on either the SRAM or DRAM, and give the device address of that buffer to + * the CS ioctl. + * * This IOCTL is asynchronous in regard to the actual execution of the CS. This * means it returns immediately after ALL the JOBS were enqueued on their * relevant queues. Therefore, the user mustn't assume the CS has been completed -- cgit v1.3-7-g2ca7 From 2b188cc1bb857a9d4701ae59aa7768b5124e262e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 7 Jan 2019 10:46:33 -0700 Subject: Add io_uring IO interface The submission queue (SQ) and completion queue (CQ) rings are shared between the application and the kernel. This eliminates the need to copy data back and forth to submit and complete IO. IO submissions use the io_uring_sqe data structure, and completions are generated in the form of io_uring_cqe data structures. The SQ ring is an index into the io_uring_sqe array, which makes it possible to submit a batch of IOs without them being contiguous in the ring. The CQ ring is always contiguous, as completion events are inherently unordered, and hence any io_uring_cqe entry can point back to an arbitrary submission. Two new system calls are added for this: io_uring_setup(entries, params) Sets up an io_uring instance for doing async IO. On success, returns a file descriptor that the application can mmap to gain access to the SQ ring, CQ ring, and io_uring_sqes. io_uring_enter(fd, to_submit, min_complete, flags, sigset, sigsetsize) Initiates IO against the rings mapped to this fd, or waits for them to complete, or both. The behavior is controlled by the parameters passed in. If 'to_submit' is non-zero, then we'll try and submit new IO. If IORING_ENTER_GETEVENTS is set, the kernel will wait for 'min_complete' events, if they aren't already available. It's valid to set IORING_ENTER_GETEVENTS and 'min_complete' == 0 at the same time, this allows the kernel to return already completed events without waiting for them. This is useful only for polling, as for IRQ driven IO, the application can just check the CQ ring without entering the kernel. With this setup, it's possible to do async IO with a single system call. Future developments will enable polled IO with this interface, and polled submission as well. The latter will enable an application to do IO without doing ANY system calls at all. For IRQ driven IO, an application only needs to enter the kernel for completions if it wants to wait for them to occur. Each io_uring is backed by a workqueue, to support buffered async IO as well. We will only punt to an async context if the command would need to wait for IO on the device side. Any data that can be accessed directly in the page cache is done inline. This avoids the slowness issue of usual threadpools, since cached data is accessed as quickly as a sync interface. Sample application: http://git.kernel.dk/cgit/fio/plain/t/io_uring.c Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- arch/x86/entry/syscalls/syscall_32.tbl | 2 + arch/x86/entry/syscalls/syscall_64.tbl | 2 + fs/Makefile | 1 + fs/io_uring.c | 1255 ++++++++++++++++++++++++++++++++ include/linux/fs.h | 9 + include/linux/sched/user.h | 2 +- include/linux/syscalls.h | 6 + include/uapi/asm-generic/unistd.h | 6 +- include/uapi/linux/io_uring.h | 95 +++ init/Kconfig | 9 + kernel/sys_ni.c | 2 + net/unix/garbage.c | 3 + 12 files changed, 1390 insertions(+), 2 deletions(-) create mode 100644 fs/io_uring.c create mode 100644 include/uapi/linux/io_uring.h (limited to 'include/uapi') diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 3cf7b533b3d1..481c126259e9 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -398,3 +398,5 @@ 384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl 385 i386 io_pgetevents sys_io_pgetevents __ia32_compat_sys_io_pgetevents 386 i386 rseq sys_rseq __ia32_sys_rseq +425 i386 io_uring_setup sys_io_uring_setup __ia32_sys_io_uring_setup +426 i386 io_uring_enter sys_io_uring_enter __ia32_sys_io_uring_enter diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index f0b1709a5ffb..6a32a430c8e0 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -343,6 +343,8 @@ 332 common statx __x64_sys_statx 333 common io_pgetevents __x64_sys_io_pgetevents 334 common rseq __x64_sys_rseq +425 common io_uring_setup __x64_sys_io_uring_setup +426 common io_uring_enter __x64_sys_io_uring_enter # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/fs/Makefile b/fs/Makefile index 293733f61594..8e15d6fc4340 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -30,6 +30,7 @@ obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_AIO) += aio.o +obj-$(CONFIG_IO_URING) += io_uring.o obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FS_ENCRYPTION) += crypto/ obj-$(CONFIG_FILE_LOCKING) += locks.o diff --git a/fs/io_uring.c b/fs/io_uring.c new file mode 100644 index 000000000000..f68052290426 --- /dev/null +++ b/fs/io_uring.c @@ -0,0 +1,1255 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shared application/kernel submission and completion ring pairs, for + * supporting fast/efficient IO. + * + * A note on the read/write ordering memory barriers that are matched between + * the application and kernel side. When the application reads the CQ ring + * tail, it must use an appropriate smp_rmb() to order with the smp_wmb() + * the kernel uses after writing the tail. Failure to do so could cause a + * delay in when the application notices that completion events available. + * This isn't a fatal condition. Likewise, the application must use an + * appropriate smp_wmb() both before writing the SQ tail, and after writing + * the SQ tail. The first one orders the sqe writes with the tail write, and + * the latter is paired with the smp_rmb() the kernel will issue before + * reading the SQ tail on submission. + * + * Also see the examples in the liburing library: + * + * git://git.kernel.dk/liburing + * + * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens + * from data shared between the kernel and application. This is done both + * for ordering purposes, but also to ensure that once a value is loaded from + * data that the application could potentially modify, it remains stable. + * + * Copyright (C) 2018-2019 Jens Axboe + */ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "internal.h" + +#define IORING_MAX_ENTRIES 4096 + +struct io_uring { + u32 head ____cacheline_aligned_in_smp; + u32 tail ____cacheline_aligned_in_smp; +}; + +struct io_sq_ring { + struct io_uring r; + u32 ring_mask; + u32 ring_entries; + u32 dropped; + u32 flags; + u32 array[]; +}; + +struct io_cq_ring { + struct io_uring r; + u32 ring_mask; + u32 ring_entries; + u32 overflow; + struct io_uring_cqe cqes[]; +}; + +struct io_ring_ctx { + struct { + struct percpu_ref refs; + } ____cacheline_aligned_in_smp; + + struct { + unsigned int flags; + bool compat; + bool account_mem; + + /* SQ ring */ + struct io_sq_ring *sq_ring; + unsigned cached_sq_head; + unsigned sq_entries; + unsigned sq_mask; + struct io_uring_sqe *sq_sqes; + } ____cacheline_aligned_in_smp; + + /* IO offload */ + struct workqueue_struct *sqo_wq; + struct mm_struct *sqo_mm; + + struct { + /* CQ ring */ + struct io_cq_ring *cq_ring; + unsigned cached_cq_tail; + unsigned cq_entries; + unsigned cq_mask; + struct wait_queue_head cq_wait; + struct fasync_struct *cq_fasync; + } ____cacheline_aligned_in_smp; + + struct user_struct *user; + + struct completion ctx_done; + + struct { + struct mutex uring_lock; + wait_queue_head_t wait; + } ____cacheline_aligned_in_smp; + + struct { + spinlock_t completion_lock; + } ____cacheline_aligned_in_smp; + +#if defined(CONFIG_UNIX) + struct socket *ring_sock; +#endif +}; + +struct sqe_submit { + const struct io_uring_sqe *sqe; + unsigned short index; + bool has_user; +}; + +struct io_kiocb { + struct kiocb rw; + + struct sqe_submit submit; + + struct io_ring_ctx *ctx; + struct list_head list; + unsigned int flags; +#define REQ_F_FORCE_NONBLOCK 1 /* inline submission attempt */ + u64 user_data; + + struct work_struct work; +}; + +#define IO_PLUG_THRESHOLD 2 + +static struct kmem_cache *req_cachep; + +static const struct file_operations io_uring_fops; + +struct sock *io_uring_get_socket(struct file *file) +{ +#if defined(CONFIG_UNIX) + if (file->f_op == &io_uring_fops) { + struct io_ring_ctx *ctx = file->private_data; + + return ctx->ring_sock->sk; + } +#endif + return NULL; +} +EXPORT_SYMBOL(io_uring_get_socket); + +static void io_ring_ctx_ref_free(struct percpu_ref *ref) +{ + struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); + + complete(&ctx->ctx_done); +} + +static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) +{ + struct io_ring_ctx *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return NULL; + + if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) { + kfree(ctx); + return NULL; + } + + ctx->flags = p->flags; + init_waitqueue_head(&ctx->cq_wait); + init_completion(&ctx->ctx_done); + mutex_init(&ctx->uring_lock); + init_waitqueue_head(&ctx->wait); + spin_lock_init(&ctx->completion_lock); + return ctx; +} + +static void io_commit_cqring(struct io_ring_ctx *ctx) +{ + struct io_cq_ring *ring = ctx->cq_ring; + + if (ctx->cached_cq_tail != READ_ONCE(ring->r.tail)) { + /* order cqe stores with ring update */ + smp_store_release(&ring->r.tail, ctx->cached_cq_tail); + + /* + * Write sider barrier of tail update, app has read side. See + * comment at the top of this file. + */ + smp_wmb(); + + if (wq_has_sleeper(&ctx->cq_wait)) { + wake_up_interruptible(&ctx->cq_wait); + kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); + } + } +} + +static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) +{ + struct io_cq_ring *ring = ctx->cq_ring; + unsigned tail; + + tail = ctx->cached_cq_tail; + /* See comment at the top of the file */ + smp_rmb(); + if (tail + 1 == READ_ONCE(ring->r.head)) + return NULL; + + ctx->cached_cq_tail++; + return &ring->cqes[tail & ctx->cq_mask]; +} + +static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data, + long res, unsigned ev_flags) +{ + struct io_uring_cqe *cqe; + + /* + * If we can't get a cq entry, userspace overflowed the + * submission (by quite a lot). Increment the overflow count in + * the ring. + */ + cqe = io_get_cqring(ctx); + if (cqe) { + WRITE_ONCE(cqe->user_data, ki_user_data); + WRITE_ONCE(cqe->res, res); + WRITE_ONCE(cqe->flags, ev_flags); + } else { + unsigned overflow = READ_ONCE(ctx->cq_ring->overflow); + + WRITE_ONCE(ctx->cq_ring->overflow, overflow + 1); + } +} + +static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 ki_user_data, + long res, unsigned ev_flags) +{ + unsigned long flags; + + spin_lock_irqsave(&ctx->completion_lock, flags); + io_cqring_fill_event(ctx, ki_user_data, res, ev_flags); + io_commit_cqring(ctx); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + + if (waitqueue_active(&ctx->wait)) + wake_up(&ctx->wait); +} + +static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs) +{ + percpu_ref_put_many(&ctx->refs, refs); + + if (waitqueue_active(&ctx->wait)) + wake_up(&ctx->wait); +} + +static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx) +{ + struct io_kiocb *req; + + if (!percpu_ref_tryget(&ctx->refs)) + return NULL; + + req = kmem_cache_alloc(req_cachep, __GFP_NOWARN); + if (req) { + req->ctx = ctx; + req->flags = 0; + return req; + } + + io_ring_drop_ctx_refs(ctx, 1); + return NULL; +} + +static void io_free_req(struct io_kiocb *req) +{ + io_ring_drop_ctx_refs(req->ctx, 1); + kmem_cache_free(req_cachep, req); +} + +static void kiocb_end_write(struct kiocb *kiocb) +{ + if (kiocb->ki_flags & IOCB_WRITE) { + struct inode *inode = file_inode(kiocb->ki_filp); + + /* + * Tell lockdep we inherited freeze protection from submission + * thread. + */ + if (S_ISREG(inode->i_mode)) + __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); + file_end_write(kiocb->ki_filp); + } +} + +static void io_complete_rw(struct kiocb *kiocb, long res, long res2) +{ + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); + + kiocb_end_write(kiocb); + + fput(kiocb->ki_filp); + io_cqring_add_event(req->ctx, req->user_data, res, 0); + io_free_req(req); +} + +/* + * If we tracked the file through the SCM inflight mechanism, we could support + * any file. For now, just ensure that anything potentially problematic is done + * inline. + */ +static bool io_file_supports_async(struct file *file) +{ + umode_t mode = file_inode(file)->i_mode; + + if (S_ISBLK(mode) || S_ISCHR(mode)) + return true; + if (S_ISREG(mode) && file->f_op != &io_uring_fops) + return true; + + return false; +} + +static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, + bool force_nonblock) +{ + struct kiocb *kiocb = &req->rw; + unsigned ioprio; + int fd, ret; + + /* For -EAGAIN retry, everything is already prepped */ + if (kiocb->ki_filp) + return 0; + + fd = READ_ONCE(sqe->fd); + kiocb->ki_filp = fget(fd); + if (unlikely(!kiocb->ki_filp)) + return -EBADF; + if (force_nonblock && !io_file_supports_async(kiocb->ki_filp)) + force_nonblock = false; + kiocb->ki_pos = READ_ONCE(sqe->off); + kiocb->ki_flags = iocb_flags(kiocb->ki_filp); + kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); + + ioprio = READ_ONCE(sqe->ioprio); + if (ioprio) { + ret = ioprio_check_cap(ioprio); + if (ret) + goto out_fput; + + kiocb->ki_ioprio = ioprio; + } else + kiocb->ki_ioprio = get_current_ioprio(); + + ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); + if (unlikely(ret)) + goto out_fput; + if (force_nonblock) { + kiocb->ki_flags |= IOCB_NOWAIT; + req->flags |= REQ_F_FORCE_NONBLOCK; + } + if (kiocb->ki_flags & IOCB_HIPRI) { + ret = -EINVAL; + goto out_fput; + } + + kiocb->ki_complete = io_complete_rw; + return 0; +out_fput: + fput(kiocb->ki_filp); + return ret; +} + +static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) +{ + switch (ret) { + case -EIOCBQUEUED: + break; + case -ERESTARTSYS: + case -ERESTARTNOINTR: + case -ERESTARTNOHAND: + case -ERESTART_RESTARTBLOCK: + /* + * We can't just restart the syscall, since previously + * submitted sqes may already be in progress. Just fail this + * IO with EINTR. + */ + ret = -EINTR; + /* fall through */ + default: + kiocb->ki_complete(kiocb, ret, 0); + } +} + +static int io_import_iovec(struct io_ring_ctx *ctx, int rw, + const struct sqe_submit *s, struct iovec **iovec, + struct iov_iter *iter) +{ + const struct io_uring_sqe *sqe = s->sqe; + void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); + size_t sqe_len = READ_ONCE(sqe->len); + + if (!s->has_user) + return -EFAULT; + +#ifdef CONFIG_COMPAT + if (ctx->compat) + return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV, + iovec, iter); +#endif + + return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter); +} + +static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s, + bool force_nonblock) +{ + struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; + struct kiocb *kiocb = &req->rw; + struct iov_iter iter; + struct file *file; + ssize_t ret; + + ret = io_prep_rw(req, s->sqe, force_nonblock); + if (ret) + return ret; + file = kiocb->ki_filp; + + ret = -EBADF; + if (unlikely(!(file->f_mode & FMODE_READ))) + goto out_fput; + ret = -EINVAL; + if (unlikely(!file->f_op->read_iter)) + goto out_fput; + + ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter); + if (ret) + goto out_fput; + + ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_iter_count(&iter)); + if (!ret) { + ssize_t ret2; + + /* Catch -EAGAIN return for forced non-blocking submission */ + ret2 = call_read_iter(file, kiocb, &iter); + if (!force_nonblock || ret2 != -EAGAIN) + io_rw_done(kiocb, ret2); + else + ret = -EAGAIN; + } + kfree(iovec); +out_fput: + /* Hold on to the file for -EAGAIN */ + if (unlikely(ret && ret != -EAGAIN)) + fput(file); + return ret; +} + +static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s, + bool force_nonblock) +{ + struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; + struct kiocb *kiocb = &req->rw; + struct iov_iter iter; + struct file *file; + ssize_t ret; + + ret = io_prep_rw(req, s->sqe, force_nonblock); + if (ret) + return ret; + /* Hold on to the file for -EAGAIN */ + if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) + return -EAGAIN; + + ret = -EBADF; + file = kiocb->ki_filp; + if (unlikely(!(file->f_mode & FMODE_WRITE))) + goto out_fput; + ret = -EINVAL; + if (unlikely(!file->f_op->write_iter)) + goto out_fput; + + ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter); + if (ret) + goto out_fput; + + ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, + iov_iter_count(&iter)); + if (!ret) { + /* + * Open-code file_start_write here to grab freeze protection, + * which will be released by another thread in + * io_complete_rw(). Fool lockdep by telling it the lock got + * released so that it doesn't complain about the held lock when + * we return to userspace. + */ + if (S_ISREG(file_inode(file)->i_mode)) { + __sb_start_write(file_inode(file)->i_sb, + SB_FREEZE_WRITE, true); + __sb_writers_release(file_inode(file)->i_sb, + SB_FREEZE_WRITE); + } + kiocb->ki_flags |= IOCB_WRITE; + io_rw_done(kiocb, call_write_iter(file, kiocb, &iter)); + } + kfree(iovec); +out_fput: + if (unlikely(ret)) + fput(file); + return ret; +} + +/* + * IORING_OP_NOP just posts a completion event, nothing else. + */ +static int io_nop(struct io_kiocb *req, u64 user_data) +{ + struct io_ring_ctx *ctx = req->ctx; + long err = 0; + + /* + * Twilight zone - it's possible that someone issued an opcode that + * has a file attached, then got -EAGAIN on submission, and changed + * the sqe before we retried it from async context. Avoid dropping + * a file reference for this malicious case, and flag the error. + */ + if (req->rw.ki_filp) { + err = -EBADF; + fput(req->rw.ki_filp); + } + io_cqring_add_event(ctx, user_data, err, 0); + io_free_req(req); + return 0; +} + +static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, + const struct sqe_submit *s, bool force_nonblock) +{ + ssize_t ret; + int opcode; + + if (unlikely(s->index >= ctx->sq_entries)) + return -EINVAL; + req->user_data = READ_ONCE(s->sqe->user_data); + + opcode = READ_ONCE(s->sqe->opcode); + switch (opcode) { + case IORING_OP_NOP: + ret = io_nop(req, req->user_data); + break; + case IORING_OP_READV: + ret = io_read(req, s, force_nonblock); + break; + case IORING_OP_WRITEV: + ret = io_write(req, s, force_nonblock); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static void io_sq_wq_submit_work(struct work_struct *work) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct sqe_submit *s = &req->submit; + const struct io_uring_sqe *sqe = s->sqe; + struct io_ring_ctx *ctx = req->ctx; + mm_segment_t old_fs = get_fs(); + int ret; + + /* Ensure we clear previously set forced non-block flag */ + req->flags &= ~REQ_F_FORCE_NONBLOCK; + req->rw.ki_flags &= ~IOCB_NOWAIT; + + if (!mmget_not_zero(ctx->sqo_mm)) { + ret = -EFAULT; + goto err; + } + + use_mm(ctx->sqo_mm); + set_fs(USER_DS); + s->has_user = true; + + ret = __io_submit_sqe(ctx, req, s, false); + + set_fs(old_fs); + unuse_mm(ctx->sqo_mm); + mmput(ctx->sqo_mm); +err: + if (ret) { + io_cqring_add_event(ctx, sqe->user_data, ret, 0); + io_free_req(req); + } + + /* async context always use a copy of the sqe */ + kfree(sqe); +} + +static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s) +{ + struct io_kiocb *req; + ssize_t ret; + + /* enforce forwards compatibility on users */ + if (unlikely(s->sqe->flags)) + return -EINVAL; + + req = io_get_req(ctx); + if (unlikely(!req)) + return -EAGAIN; + + req->rw.ki_filp = NULL; + + ret = __io_submit_sqe(ctx, req, s, true); + if (ret == -EAGAIN) { + struct io_uring_sqe *sqe_copy; + + sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL); + if (sqe_copy) { + memcpy(sqe_copy, s->sqe, sizeof(*sqe_copy)); + s->sqe = sqe_copy; + + memcpy(&req->submit, s, sizeof(*s)); + INIT_WORK(&req->work, io_sq_wq_submit_work); + queue_work(ctx->sqo_wq, &req->work); + ret = 0; + } + } + if (ret) + io_free_req(req); + + return ret; +} + +static void io_commit_sqring(struct io_ring_ctx *ctx) +{ + struct io_sq_ring *ring = ctx->sq_ring; + + if (ctx->cached_sq_head != READ_ONCE(ring->r.head)) { + /* + * Ensure any loads from the SQEs are done at this point, + * since once we write the new head, the application could + * write new data to them. + */ + smp_store_release(&ring->r.head, ctx->cached_sq_head); + + /* + * write side barrier of head update, app has read side. See + * comment at the top of this file + */ + smp_wmb(); + } +} + +/* + * Undo last io_get_sqring() + */ +static void io_drop_sqring(struct io_ring_ctx *ctx) +{ + ctx->cached_sq_head--; +} + +/* + * Fetch an sqe, if one is available. Note that s->sqe will point to memory + * that is mapped by userspace. This means that care needs to be taken to + * ensure that reads are stable, as we cannot rely on userspace always + * being a good citizen. If members of the sqe are validated and then later + * used, it's important that those reads are done through READ_ONCE() to + * prevent a re-load down the line. + */ +static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) +{ + struct io_sq_ring *ring = ctx->sq_ring; + unsigned head; + + /* + * The cached sq head (or cq tail) serves two purposes: + * + * 1) allows us to batch the cost of updating the user visible + * head updates. + * 2) allows the kernel side to track the head on its own, even + * though the application is the one updating it. + */ + head = ctx->cached_sq_head; + /* See comment at the top of this file */ + smp_rmb(); + if (head == READ_ONCE(ring->r.tail)) + return false; + + head = READ_ONCE(ring->array[head & ctx->sq_mask]); + if (head < ctx->sq_entries) { + s->index = head; + s->sqe = &ctx->sq_sqes[head]; + ctx->cached_sq_head++; + return true; + } + + /* drop invalid entries */ + ctx->cached_sq_head++; + ring->dropped++; + /* See comment at the top of this file */ + smp_wmb(); + return false; +} + +static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) +{ + int i, ret = 0, submit = 0; + struct blk_plug plug; + + if (to_submit > IO_PLUG_THRESHOLD) + blk_start_plug(&plug); + + for (i = 0; i < to_submit; i++) { + struct sqe_submit s; + + if (!io_get_sqring(ctx, &s)) + break; + + s.has_user = true; + ret = io_submit_sqe(ctx, &s); + if (ret) { + io_drop_sqring(ctx); + break; + } + + submit++; + } + io_commit_sqring(ctx); + + if (to_submit > IO_PLUG_THRESHOLD) + blk_finish_plug(&plug); + + return submit ? submit : ret; +} + +static unsigned io_cqring_events(struct io_cq_ring *ring) +{ + return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head); +} + +/* + * Wait until events become available, if we don't already have some. The + * application must reap them itself, as they reside on the shared cq ring. + */ +static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, + const sigset_t __user *sig, size_t sigsz) +{ + struct io_cq_ring *ring = ctx->cq_ring; + sigset_t ksigmask, sigsaved; + DEFINE_WAIT(wait); + int ret; + + /* See comment at the top of this file */ + smp_rmb(); + if (io_cqring_events(ring) >= min_events) + return 0; + + if (sig) { + ret = set_user_sigmask(sig, &ksigmask, &sigsaved, sigsz); + if (ret) + return ret; + } + + do { + prepare_to_wait(&ctx->wait, &wait, TASK_INTERRUPTIBLE); + + ret = 0; + /* See comment at the top of this file */ + smp_rmb(); + if (io_cqring_events(ring) >= min_events) + break; + + schedule(); + + ret = -EINTR; + if (signal_pending(current)) + break; + } while (1); + + finish_wait(&ctx->wait, &wait); + + if (sig) + restore_user_sigmask(sig, &sigsaved); + + return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0; +} + +static int io_sq_offload_start(struct io_ring_ctx *ctx) +{ + int ret; + + mmgrab(current->mm); + ctx->sqo_mm = current->mm; + + /* Do QD, or 2 * CPUS, whatever is smallest */ + ctx->sqo_wq = alloc_workqueue("io_ring-wq", WQ_UNBOUND | WQ_FREEZABLE, + min(ctx->sq_entries - 1, 2 * num_online_cpus())); + if (!ctx->sqo_wq) { + ret = -ENOMEM; + goto err; + } + + return 0; +err: + mmdrop(ctx->sqo_mm); + ctx->sqo_mm = NULL; + return ret; +} + +static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages) +{ + atomic_long_sub(nr_pages, &user->locked_vm); +} + +static int io_account_mem(struct user_struct *user, unsigned long nr_pages) +{ + unsigned long page_limit, cur_pages, new_pages; + + /* Don't allow more pages than we can safely lock */ + page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + do { + cur_pages = atomic_long_read(&user->locked_vm); + new_pages = cur_pages + nr_pages; + if (new_pages > page_limit) + return -ENOMEM; + } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages, + new_pages) != cur_pages); + + return 0; +} + +static void io_mem_free(void *ptr) +{ + struct page *page = virt_to_head_page(ptr); + + if (put_page_testzero(page)) + free_compound_page(page); +} + +static void *io_mem_alloc(size_t size) +{ + gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP | + __GFP_NORETRY; + + return (void *) __get_free_pages(gfp_flags, get_order(size)); +} + +static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries) +{ + struct io_sq_ring *sq_ring; + struct io_cq_ring *cq_ring; + size_t bytes; + + bytes = struct_size(sq_ring, array, sq_entries); + bytes += array_size(sizeof(struct io_uring_sqe), sq_entries); + bytes += struct_size(cq_ring, cqes, cq_entries); + + return (bytes + PAGE_SIZE - 1) / PAGE_SIZE; +} + +static void io_ring_ctx_free(struct io_ring_ctx *ctx) +{ + if (ctx->sqo_wq) + destroy_workqueue(ctx->sqo_wq); + if (ctx->sqo_mm) + mmdrop(ctx->sqo_mm); +#if defined(CONFIG_UNIX) + if (ctx->ring_sock) + sock_release(ctx->ring_sock); +#endif + + io_mem_free(ctx->sq_ring); + io_mem_free(ctx->sq_sqes); + io_mem_free(ctx->cq_ring); + + percpu_ref_exit(&ctx->refs); + if (ctx->account_mem) + io_unaccount_mem(ctx->user, + ring_pages(ctx->sq_entries, ctx->cq_entries)); + free_uid(ctx->user); + kfree(ctx); +} + +static __poll_t io_uring_poll(struct file *file, poll_table *wait) +{ + struct io_ring_ctx *ctx = file->private_data; + __poll_t mask = 0; + + poll_wait(file, &ctx->cq_wait, wait); + /* See comment at the top of this file */ + smp_rmb(); + if (READ_ONCE(ctx->sq_ring->r.tail) + 1 != ctx->cached_sq_head) + mask |= EPOLLOUT | EPOLLWRNORM; + if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail) + mask |= EPOLLIN | EPOLLRDNORM; + + return mask; +} + +static int io_uring_fasync(int fd, struct file *file, int on) +{ + struct io_ring_ctx *ctx = file->private_data; + + return fasync_helper(fd, file, on, &ctx->cq_fasync); +} + +static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) +{ + mutex_lock(&ctx->uring_lock); + percpu_ref_kill(&ctx->refs); + mutex_unlock(&ctx->uring_lock); + + wait_for_completion(&ctx->ctx_done); + io_ring_ctx_free(ctx); +} + +static int io_uring_release(struct inode *inode, struct file *file) +{ + struct io_ring_ctx *ctx = file->private_data; + + file->private_data = NULL; + io_ring_ctx_wait_and_kill(ctx); + return 0; +} + +static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) +{ + loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT; + unsigned long sz = vma->vm_end - vma->vm_start; + struct io_ring_ctx *ctx = file->private_data; + unsigned long pfn; + struct page *page; + void *ptr; + + switch (offset) { + case IORING_OFF_SQ_RING: + ptr = ctx->sq_ring; + break; + case IORING_OFF_SQES: + ptr = ctx->sq_sqes; + break; + case IORING_OFF_CQ_RING: + ptr = ctx->cq_ring; + break; + default: + return -EINVAL; + } + + page = virt_to_head_page(ptr); + if (sz > (PAGE_SIZE << compound_order(page))) + return -EINVAL; + + pfn = virt_to_phys(ptr) >> PAGE_SHIFT; + return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); +} + +SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, + u32, min_complete, u32, flags, const sigset_t __user *, sig, + size_t, sigsz) +{ + struct io_ring_ctx *ctx; + long ret = -EBADF; + int submitted = 0; + struct fd f; + + if (flags & ~IORING_ENTER_GETEVENTS) + return -EINVAL; + + f = fdget(fd); + if (!f.file) + return -EBADF; + + ret = -EOPNOTSUPP; + if (f.file->f_op != &io_uring_fops) + goto out_fput; + + ret = -ENXIO; + ctx = f.file->private_data; + if (!percpu_ref_tryget(&ctx->refs)) + goto out_fput; + + ret = 0; + if (to_submit) { + to_submit = min(to_submit, ctx->sq_entries); + + mutex_lock(&ctx->uring_lock); + submitted = io_ring_submit(ctx, to_submit); + mutex_unlock(&ctx->uring_lock); + + if (submitted < 0) + goto out_ctx; + } + if (flags & IORING_ENTER_GETEVENTS) { + min_complete = min(min_complete, ctx->cq_entries); + + /* + * The application could have included the 'to_submit' count + * in how many events it wanted to wait for. If we failed to + * submit the desired count, we may need to adjust the number + * of events to poll/wait for. + */ + if (submitted < to_submit) + min_complete = min_t(unsigned, submitted, min_complete); + + ret = io_cqring_wait(ctx, min_complete, sig, sigsz); + } + +out_ctx: + io_ring_drop_ctx_refs(ctx, 1); +out_fput: + fdput(f); + return submitted ? submitted : ret; +} + +static const struct file_operations io_uring_fops = { + .release = io_uring_release, + .mmap = io_uring_mmap, + .poll = io_uring_poll, + .fasync = io_uring_fasync, +}; + +static int io_allocate_scq_urings(struct io_ring_ctx *ctx, + struct io_uring_params *p) +{ + struct io_sq_ring *sq_ring; + struct io_cq_ring *cq_ring; + size_t size; + + sq_ring = io_mem_alloc(struct_size(sq_ring, array, p->sq_entries)); + if (!sq_ring) + return -ENOMEM; + + ctx->sq_ring = sq_ring; + sq_ring->ring_mask = p->sq_entries - 1; + sq_ring->ring_entries = p->sq_entries; + ctx->sq_mask = sq_ring->ring_mask; + ctx->sq_entries = sq_ring->ring_entries; + + size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); + if (size == SIZE_MAX) + return -EOVERFLOW; + + ctx->sq_sqes = io_mem_alloc(size); + if (!ctx->sq_sqes) { + io_mem_free(ctx->sq_ring); + return -ENOMEM; + } + + cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries)); + if (!cq_ring) { + io_mem_free(ctx->sq_ring); + io_mem_free(ctx->sq_sqes); + return -ENOMEM; + } + + ctx->cq_ring = cq_ring; + cq_ring->ring_mask = p->cq_entries - 1; + cq_ring->ring_entries = p->cq_entries; + ctx->cq_mask = cq_ring->ring_mask; + ctx->cq_entries = cq_ring->ring_entries; + return 0; +} + +/* + * Allocate an anonymous fd, this is what constitutes the application + * visible backing of an io_uring instance. The application mmaps this + * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled, + * we have to tie this fd to a socket for file garbage collection purposes. + */ +static int io_uring_get_fd(struct io_ring_ctx *ctx) +{ + struct file *file; + int ret; + +#if defined(CONFIG_UNIX) + ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP, + &ctx->ring_sock); + if (ret) + return ret; +#endif + + ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC); + if (ret < 0) + goto err; + + file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx, + O_RDWR | O_CLOEXEC); + if (IS_ERR(file)) { + put_unused_fd(ret); + ret = PTR_ERR(file); + goto err; + } + +#if defined(CONFIG_UNIX) + ctx->ring_sock->file = file; +#endif + fd_install(ret, file); + return ret; +err: +#if defined(CONFIG_UNIX) + sock_release(ctx->ring_sock); + ctx->ring_sock = NULL; +#endif + return ret; +} + +static int io_uring_create(unsigned entries, struct io_uring_params *p) +{ + struct user_struct *user = NULL; + struct io_ring_ctx *ctx; + bool account_mem; + int ret; + + if (!entries || entries > IORING_MAX_ENTRIES) + return -EINVAL; + + /* + * Use twice as many entries for the CQ ring. It's possible for the + * application to drive a higher depth than the size of the SQ ring, + * since the sqes are only used at submission time. This allows for + * some flexibility in overcommitting a bit. + */ + p->sq_entries = roundup_pow_of_two(entries); + p->cq_entries = 2 * p->sq_entries; + + user = get_uid(current_user()); + account_mem = !capable(CAP_IPC_LOCK); + + if (account_mem) { + ret = io_account_mem(user, + ring_pages(p->sq_entries, p->cq_entries)); + if (ret) { + free_uid(user); + return ret; + } + } + + ctx = io_ring_ctx_alloc(p); + if (!ctx) { + if (account_mem) + io_unaccount_mem(user, ring_pages(p->sq_entries, + p->cq_entries)); + free_uid(user); + return -ENOMEM; + } + ctx->compat = in_compat_syscall(); + ctx->account_mem = account_mem; + ctx->user = user; + + ret = io_allocate_scq_urings(ctx, p); + if (ret) + goto err; + + ret = io_sq_offload_start(ctx); + if (ret) + goto err; + + ret = io_uring_get_fd(ctx); + if (ret < 0) + goto err; + + memset(&p->sq_off, 0, sizeof(p->sq_off)); + p->sq_off.head = offsetof(struct io_sq_ring, r.head); + p->sq_off.tail = offsetof(struct io_sq_ring, r.tail); + p->sq_off.ring_mask = offsetof(struct io_sq_ring, ring_mask); + p->sq_off.ring_entries = offsetof(struct io_sq_ring, ring_entries); + p->sq_off.flags = offsetof(struct io_sq_ring, flags); + p->sq_off.dropped = offsetof(struct io_sq_ring, dropped); + p->sq_off.array = offsetof(struct io_sq_ring, array); + + memset(&p->cq_off, 0, sizeof(p->cq_off)); + p->cq_off.head = offsetof(struct io_cq_ring, r.head); + p->cq_off.tail = offsetof(struct io_cq_ring, r.tail); + p->cq_off.ring_mask = offsetof(struct io_cq_ring, ring_mask); + p->cq_off.ring_entries = offsetof(struct io_cq_ring, ring_entries); + p->cq_off.overflow = offsetof(struct io_cq_ring, overflow); + p->cq_off.cqes = offsetof(struct io_cq_ring, cqes); + return ret; +err: + io_ring_ctx_wait_and_kill(ctx); + return ret; +} + +/* + * Sets up an aio uring context, and returns the fd. Applications asks for a + * ring size, we return the actual sq/cq ring sizes (among other things) in the + * params structure passed in. + */ +static long io_uring_setup(u32 entries, struct io_uring_params __user *params) +{ + struct io_uring_params p; + long ret; + int i; + + if (copy_from_user(&p, params, sizeof(p))) + return -EFAULT; + for (i = 0; i < ARRAY_SIZE(p.resv); i++) { + if (p.resv[i]) + return -EINVAL; + } + + if (p.flags) + return -EINVAL; + + ret = io_uring_create(entries, &p); + if (ret < 0) + return ret; + + if (copy_to_user(params, &p, sizeof(p))) + return -EFAULT; + + return ret; +} + +SYSCALL_DEFINE2(io_uring_setup, u32, entries, + struct io_uring_params __user *, params) +{ + return io_uring_setup(entries, params); +} + +static int __init io_uring_init(void) +{ + req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC); + return 0; +}; +__initcall(io_uring_init); diff --git a/include/linux/fs.h b/include/linux/fs.h index dedcc2e9265c..61aa210f0c2b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3517,4 +3517,13 @@ extern void inode_nohighmem(struct inode *inode); extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice); +#if defined(CONFIG_IO_URING) +extern struct sock *io_uring_get_socket(struct file *file); +#else +static inline struct sock *io_uring_get_socket(struct file *file) +{ + return NULL; +} +#endif + #endif /* _LINUX_FS_H */ diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h index 39ad98c09c58..c7b5f86b91a1 100644 --- a/include/linux/sched/user.h +++ b/include/linux/sched/user.h @@ -40,7 +40,7 @@ struct user_struct { kuid_t uid; #if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) || \ - defined(CONFIG_NET) + defined(CONFIG_NET) || defined(CONFIG_IO_URING) atomic_long_t locked_vm; #endif diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 257cccba3062..3072dbaa7869 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -69,6 +69,7 @@ struct file_handle; struct sigaltstack; struct rseq; union bpf_attr; +struct io_uring_params; #include #include @@ -309,6 +310,11 @@ asmlinkage long sys_io_pgetevents_time32(aio_context_t ctx_id, struct io_event __user *events, struct old_timespec32 __user *timeout, const struct __aio_sigset *sig); +asmlinkage long sys_io_uring_setup(u32 entries, + struct io_uring_params __user *p); +asmlinkage long sys_io_uring_enter(unsigned int fd, u32 to_submit, + u32 min_complete, u32 flags, + const sigset_t __user *sig, size_t sigsz); /* fs/xattr.c */ asmlinkage long sys_setxattr(const char __user *path, const char __user *name, diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index d90127298f12..87871e7b7ea7 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -740,9 +740,13 @@ __SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents) __SYSCALL(__NR_rseq, sys_rseq) #define __NR_kexec_file_load 294 __SYSCALL(__NR_kexec_file_load, sys_kexec_file_load) +#define __NR_io_uring_setup 425 +__SYSCALL(__NR_io_uring_setup, sys_io_uring_setup) +#define __NR_io_uring_enter 426 +__SYSCALL(__NR_io_uring_enter, sys_io_uring_enter) #undef __NR_syscalls -#define __NR_syscalls 295 +#define __NR_syscalls 427 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h new file mode 100644 index 000000000000..ac692823d6f4 --- /dev/null +++ b/include/uapi/linux/io_uring.h @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Header file for the io_uring interface. + * + * Copyright (C) 2019 Jens Axboe + * Copyright (C) 2019 Christoph Hellwig + */ +#ifndef LINUX_IO_URING_H +#define LINUX_IO_URING_H + +#include +#include + +/* + * IO submission data structure (Submission Queue Entry) + */ +struct io_uring_sqe { + __u8 opcode; /* type of operation for this sqe */ + __u8 flags; /* as of now unused */ + __u16 ioprio; /* ioprio for the request */ + __s32 fd; /* file descriptor to do IO on */ + __u64 off; /* offset into file */ + __u64 addr; /* pointer to buffer or iovecs */ + __u32 len; /* buffer size or number of iovecs */ + union { + __kernel_rwf_t rw_flags; + __u32 __resv; + }; + __u64 user_data; /* data to be passed back at completion time */ + __u64 __pad2[3]; +}; + +#define IORING_OP_NOP 0 +#define IORING_OP_READV 1 +#define IORING_OP_WRITEV 2 + +/* + * IO completion data structure (Completion Queue Entry) + */ +struct io_uring_cqe { + __u64 user_data; /* sqe->data submission passed back */ + __s32 res; /* result code for this event */ + __u32 flags; +}; + +/* + * Magic offsets for the application to mmap the data it needs + */ +#define IORING_OFF_SQ_RING 0ULL +#define IORING_OFF_CQ_RING 0x8000000ULL +#define IORING_OFF_SQES 0x10000000ULL + +/* + * Filled with the offset for mmap(2) + */ +struct io_sqring_offsets { + __u32 head; + __u32 tail; + __u32 ring_mask; + __u32 ring_entries; + __u32 flags; + __u32 dropped; + __u32 array; + __u32 resv1; + __u64 resv2; +}; + +struct io_cqring_offsets { + __u32 head; + __u32 tail; + __u32 ring_mask; + __u32 ring_entries; + __u32 overflow; + __u32 cqes; + __u64 resv[2]; +}; + +/* + * io_uring_enter(2) flags + */ +#define IORING_ENTER_GETEVENTS (1U << 0) + +/* + * Passed in for io_uring_setup(2). Copied back with updated info on success + */ +struct io_uring_params { + __u32 sq_entries; + __u32 cq_entries; + __u32 flags; + __u32 resv[7]; + struct io_sqring_offsets sq_off; + struct io_cqring_offsets cq_off; +}; + +#endif diff --git a/init/Kconfig b/init/Kconfig index c9386a365eea..53b54214a36e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1414,6 +1414,15 @@ config AIO by some high performance threaded applications. Disabling this option saves about 7k. +config IO_URING + bool "Enable IO uring support" if EXPERT + select ANON_INODES + default y + help + This option enables support for the io_uring interface, enabling + applications to submit and complete IO through submission and + completion rings that are shared between the kernel and application. + config ADVISE_SYSCALLS bool "Enable madvise/fadvise syscalls" if EXPERT default y diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index ab9d0e3c6d50..ee5e523564bb 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -46,6 +46,8 @@ COND_SYSCALL(io_getevents); COND_SYSCALL(io_pgetevents); COND_SYSCALL_COMPAT(io_getevents); COND_SYSCALL_COMPAT(io_pgetevents); +COND_SYSCALL(io_uring_setup); +COND_SYSCALL(io_uring_enter); /* fs/xattr.c */ diff --git a/net/unix/garbage.c b/net/unix/garbage.c index c36757e72844..f81854d74c7d 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -108,6 +108,9 @@ struct sock *unix_get_socket(struct file *filp) /* PF_UNIX ? */ if (s && sock->ops && sock->ops->family == PF_UNIX) u_sock = s; + } else { + /* Could be an io_uring instance */ + u_sock = io_uring_get_socket(filp); } return u_sock; } -- cgit v1.3-7-g2ca7 From c992fe2925d776be066d9f6cc13f9ea11d78b657 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 11 Jan 2019 09:43:02 -0700 Subject: io_uring: add fsync support Add a new fsync opcode, which either syncs a range if one is passed, or the whole file if the offset and length fields are both cleared to zero. A flag is provided to use fdatasync semantics, that is only force out metadata which is required to retrieve the file data, but not others like metadata. Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/io_uring.c | 54 +++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/io_uring.h | 8 ++++++- 2 files changed, 61 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/fs/io_uring.c b/fs/io_uring.c index f68052290426..745413d92712 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -24,6 +24,7 @@ * data that the application could potentially modify, it remains stable. * * Copyright (C) 2018-2019 Jens Axboe + * Copyright (c) 2018-2019 Christoph Hellwig */ #include #include @@ -557,6 +558,56 @@ static int io_nop(struct io_kiocb *req, u64 user_data) return 0; } +static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + int fd; + + /* Prep already done */ + if (req->rw.ki_filp) + return 0; + + if (unlikely(sqe->addr || sqe->ioprio)) + return -EINVAL; + + fd = READ_ONCE(sqe->fd); + req->rw.ki_filp = fget(fd); + if (unlikely(!req->rw.ki_filp)) + return -EBADF; + + return 0; +} + +static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe, + bool force_nonblock) +{ + loff_t sqe_off = READ_ONCE(sqe->off); + loff_t sqe_len = READ_ONCE(sqe->len); + loff_t end = sqe_off + sqe_len; + unsigned fsync_flags; + int ret; + + fsync_flags = READ_ONCE(sqe->fsync_flags); + if (unlikely(fsync_flags & ~IORING_FSYNC_DATASYNC)) + return -EINVAL; + + ret = io_prep_fsync(req, sqe); + if (ret) + return ret; + + /* fsync always requires a blocking context */ + if (force_nonblock) + return -EAGAIN; + + ret = vfs_fsync_range(req->rw.ki_filp, sqe_off, + end > 0 ? end : LLONG_MAX, + fsync_flags & IORING_FSYNC_DATASYNC); + + fput(req->rw.ki_filp); + io_cqring_add_event(req->ctx, sqe->user_data, ret, 0); + io_free_req(req); + return 0; +} + static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct sqe_submit *s, bool force_nonblock) { @@ -578,6 +629,9 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, case IORING_OP_WRITEV: ret = io_write(req, s, force_nonblock); break; + case IORING_OP_FSYNC: + ret = io_fsync(req, s->sqe, force_nonblock); + break; default: ret = -EINVAL; break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index ac692823d6f4..4589d56d0b68 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -24,7 +24,7 @@ struct io_uring_sqe { __u32 len; /* buffer size or number of iovecs */ union { __kernel_rwf_t rw_flags; - __u32 __resv; + __u32 fsync_flags; }; __u64 user_data; /* data to be passed back at completion time */ __u64 __pad2[3]; @@ -33,6 +33,12 @@ struct io_uring_sqe { #define IORING_OP_NOP 0 #define IORING_OP_READV 1 #define IORING_OP_WRITEV 2 +#define IORING_OP_FSYNC 3 + +/* + * sqe->fsync_flags + */ +#define IORING_FSYNC_DATASYNC (1U << 0) /* * IO completion data structure (Completion Queue Entry) -- cgit v1.3-7-g2ca7 From def596e9557c91d9846fc4d84d26f2c564644416 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 9 Jan 2019 08:59:42 -0700 Subject: io_uring: support for IO polling Add support for a polled io_uring instance. When a read or write is submitted to a polled io_uring, the application must poll for completions on the CQ ring through io_uring_enter(2). Polled IO may not generate IRQ completions, hence they need to be actively found by the application itself. To use polling, io_uring_setup() must be used with the IORING_SETUP_IOPOLL flag being set. It is illegal to mix and match polled and non-polled IO on an io_uring. Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- fs/io_uring.c | 275 ++++++++++++++++++++++++++++++++++++++++-- include/uapi/linux/io_uring.h | 5 + 2 files changed, 271 insertions(+), 9 deletions(-) (limited to 'include/uapi') diff --git a/fs/io_uring.c b/fs/io_uring.c index 745413d92712..578797a4f318 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -124,6 +124,14 @@ struct io_ring_ctx { struct { spinlock_t completion_lock; + bool poll_multi_file; + /* + * ->poll_list is protected by the ctx->uring_lock for + * io_uring instances that don't use IORING_SETUP_SQPOLL. + * For SQPOLL, only the single threaded io_sq_thread() will + * manipulate the list, hence no extra locking is needed there. + */ + struct list_head poll_list; } ____cacheline_aligned_in_smp; #if defined(CONFIG_UNIX) @@ -135,6 +143,7 @@ struct sqe_submit { const struct io_uring_sqe *sqe; unsigned short index; bool has_user; + bool needs_lock; }; struct io_kiocb { @@ -146,12 +155,15 @@ struct io_kiocb { struct list_head list; unsigned int flags; #define REQ_F_FORCE_NONBLOCK 1 /* inline submission attempt */ +#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */ u64 user_data; + u64 error; struct work_struct work; }; #define IO_PLUG_THRESHOLD 2 +#define IO_IOPOLL_BATCH 8 static struct kmem_cache *req_cachep; @@ -196,6 +208,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->wait); spin_lock_init(&ctx->completion_lock); + INIT_LIST_HEAD(&ctx->poll_list); return ctx; } @@ -297,12 +310,153 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx) return NULL; } +static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr) +{ + if (*nr) { + kmem_cache_free_bulk(req_cachep, *nr, reqs); + io_ring_drop_ctx_refs(ctx, *nr); + *nr = 0; + } +} + static void io_free_req(struct io_kiocb *req) { io_ring_drop_ctx_refs(req->ctx, 1); kmem_cache_free(req_cachep, req); } +/* + * Find and free completed poll iocbs + */ +static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, + struct list_head *done) +{ + void *reqs[IO_IOPOLL_BATCH]; + struct io_kiocb *req; + int to_free = 0; + + while (!list_empty(done)) { + req = list_first_entry(done, struct io_kiocb, list); + list_del(&req->list); + + io_cqring_fill_event(ctx, req->user_data, req->error, 0); + + reqs[to_free++] = req; + (*nr_events)++; + + fput(req->rw.ki_filp); + if (to_free == ARRAY_SIZE(reqs)) + io_free_req_many(ctx, reqs, &to_free); + } + io_commit_cqring(ctx); + + io_free_req_many(ctx, reqs, &to_free); +} + +static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, + long min) +{ + struct io_kiocb *req, *tmp; + LIST_HEAD(done); + bool spin; + int ret; + + /* + * Only spin for completions if we don't have multiple devices hanging + * off our complete list, and we're under the requested amount. + */ + spin = !ctx->poll_multi_file && *nr_events < min; + + ret = 0; + list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) { + struct kiocb *kiocb = &req->rw; + + /* + * Move completed entries to our local list. If we find a + * request that requires polling, break out and complete + * the done list first, if we have entries there. + */ + if (req->flags & REQ_F_IOPOLL_COMPLETED) { + list_move_tail(&req->list, &done); + continue; + } + if (!list_empty(&done)) + break; + + ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin); + if (ret < 0) + break; + + if (ret && spin) + spin = false; + ret = 0; + } + + if (!list_empty(&done)) + io_iopoll_complete(ctx, nr_events, &done); + + return ret; +} + +/* + * Poll for a mininum of 'min' events. Note that if min == 0 we consider that a + * non-spinning poll check - we'll still enter the driver poll loop, but only + * as a non-spinning completion check. + */ +static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events, + long min) +{ + while (!list_empty(&ctx->poll_list)) { + int ret; + + ret = io_do_iopoll(ctx, nr_events, min); + if (ret < 0) + return ret; + if (!min || *nr_events >= min) + return 0; + } + + return 1; +} + +/* + * We can't just wait for polled events to come to us, we have to actively + * find and complete them. + */ +static void io_iopoll_reap_events(struct io_ring_ctx *ctx) +{ + if (!(ctx->flags & IORING_SETUP_IOPOLL)) + return; + + mutex_lock(&ctx->uring_lock); + while (!list_empty(&ctx->poll_list)) { + unsigned int nr_events = 0; + + io_iopoll_getevents(ctx, &nr_events, 1); + } + mutex_unlock(&ctx->uring_lock); +} + +static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, + long min) +{ + int ret = 0; + + do { + int tmin = 0; + + if (*nr_events < min) + tmin = min - *nr_events; + + ret = io_iopoll_getevents(ctx, nr_events, tmin); + if (ret <= 0) + break; + ret = 0; + } while (min && !*nr_events && !need_resched()); + + return ret; +} + static void kiocb_end_write(struct kiocb *kiocb) { if (kiocb->ki_flags & IOCB_WRITE) { @@ -329,6 +483,53 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2) io_free_req(req); } +static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) +{ + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); + + kiocb_end_write(kiocb); + + req->error = res; + if (res != -EAGAIN) + req->flags |= REQ_F_IOPOLL_COMPLETED; +} + +/* + * After the iocb has been issued, it's safe to be found on the poll list. + * Adding the kiocb to the list AFTER submission ensures that we don't + * find it from a io_iopoll_getevents() thread before the issuer is done + * accessing the kiocb cookie. + */ +static void io_iopoll_req_issued(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + + /* + * Track whether we have multiple files in our lists. This will impact + * how we do polling eventually, not spinning if we're on potentially + * different devices. + */ + if (list_empty(&ctx->poll_list)) { + ctx->poll_multi_file = false; + } else if (!ctx->poll_multi_file) { + struct io_kiocb *list_req; + + list_req = list_first_entry(&ctx->poll_list, struct io_kiocb, + list); + if (list_req->rw.ki_filp != req->rw.ki_filp) + ctx->poll_multi_file = true; + } + + /* + * For fast devices, IO may have already completed. If it has, add + * it to the front so we find it first. + */ + if (req->flags & REQ_F_IOPOLL_COMPLETED) + list_add(&req->list, &ctx->poll_list); + else + list_add_tail(&req->list, &ctx->poll_list); +} + /* * If we tracked the file through the SCM inflight mechanism, we could support * any file. For now, just ensure that anything potentially problematic is done @@ -349,6 +550,7 @@ static bool io_file_supports_async(struct file *file) static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, bool force_nonblock) { + struct io_ring_ctx *ctx = req->ctx; struct kiocb *kiocb = &req->rw; unsigned ioprio; int fd, ret; @@ -384,12 +586,22 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, kiocb->ki_flags |= IOCB_NOWAIT; req->flags |= REQ_F_FORCE_NONBLOCK; } - if (kiocb->ki_flags & IOCB_HIPRI) { - ret = -EINVAL; - goto out_fput; - } + if (ctx->flags & IORING_SETUP_IOPOLL) { + ret = -EOPNOTSUPP; + if (!(kiocb->ki_flags & IOCB_DIRECT) || + !kiocb->ki_filp->f_op->iopoll) + goto out_fput; - kiocb->ki_complete = io_complete_rw; + req->error = 0; + kiocb->ki_flags |= IOCB_HIPRI; + kiocb->ki_complete = io_complete_rw_iopoll; + } else { + if (kiocb->ki_flags & IOCB_HIPRI) { + ret = -EINVAL; + goto out_fput; + } + kiocb->ki_complete = io_complete_rw; + } return 0; out_fput: fput(kiocb->ki_filp); @@ -543,6 +755,9 @@ static int io_nop(struct io_kiocb *req, u64 user_data) struct io_ring_ctx *ctx = req->ctx; long err = 0; + if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + /* * Twilight zone - it's possible that someone issued an opcode that * has a file attached, then got -EAGAIN on submission, and changed @@ -566,6 +781,8 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->rw.ki_filp) return 0; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; if (unlikely(sqe->addr || sqe->ioprio)) return -EINVAL; @@ -637,7 +854,22 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, break; } - return ret; + if (ret) + return ret; + + if (ctx->flags & IORING_SETUP_IOPOLL) { + if (req->error == -EAGAIN) + return -EAGAIN; + + /* workqueue context doesn't hold uring_lock, grab it now */ + if (s->needs_lock) + mutex_lock(&ctx->uring_lock); + io_iopoll_req_issued(req); + if (s->needs_lock) + mutex_unlock(&ctx->uring_lock); + } + + return 0; } static void io_sq_wq_submit_work(struct work_struct *work) @@ -661,8 +893,19 @@ static void io_sq_wq_submit_work(struct work_struct *work) use_mm(ctx->sqo_mm); set_fs(USER_DS); s->has_user = true; + s->needs_lock = true; - ret = __io_submit_sqe(ctx, req, s, false); + do { + ret = __io_submit_sqe(ctx, req, s, false); + /* + * We can get EAGAIN for polled IO even though we're forcing + * a sync submission from here, since we can't wait for + * request slots on the block side. + */ + if (ret != -EAGAIN) + break; + cond_resched(); + } while (1); set_fs(old_fs); unuse_mm(ctx->sqo_mm); @@ -799,6 +1042,8 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) break; s.has_user = true; + s.needs_lock = false; + ret = io_submit_sqe(ctx, &s); if (ret) { io_drop_sqring(ctx); @@ -947,6 +1192,9 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) destroy_workqueue(ctx->sqo_wq); if (ctx->sqo_mm) mmdrop(ctx->sqo_mm); + + io_iopoll_reap_events(ctx); + #if defined(CONFIG_UNIX) if (ctx->ring_sock) sock_release(ctx->ring_sock); @@ -993,6 +1241,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) percpu_ref_kill(&ctx->refs); mutex_unlock(&ctx->uring_lock); + io_iopoll_reap_events(ctx); wait_for_completion(&ctx->ctx_done); io_ring_ctx_free(ctx); } @@ -1074,6 +1323,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, goto out_ctx; } if (flags & IORING_ENTER_GETEVENTS) { + unsigned nr_events = 0; + min_complete = min(min_complete, ctx->cq_entries); /* @@ -1085,7 +1336,13 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (submitted < to_submit) min_complete = min_t(unsigned, submitted, min_complete); - ret = io_cqring_wait(ctx, min_complete, sig, sigsz); + if (ctx->flags & IORING_SETUP_IOPOLL) { + mutex_lock(&ctx->uring_lock); + ret = io_iopoll_check(ctx, &nr_events, min_complete); + mutex_unlock(&ctx->uring_lock); + } else { + ret = io_cqring_wait(ctx, min_complete, sig, sigsz); + } } out_ctx: @@ -1282,7 +1539,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) return -EINVAL; } - if (p.flags) + if (p.flags & ~IORING_SETUP_IOPOLL) return -EINVAL; ret = io_uring_create(entries, &p); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 4589d56d0b68..5c457ea396e6 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -30,6 +30,11 @@ struct io_uring_sqe { __u64 __pad2[3]; }; +/* + * io_uring_setup() flags + */ +#define IORING_SETUP_IOPOLL (1U << 0) /* io_context is polled */ + #define IORING_OP_NOP 0 #define IORING_OP_READV 1 #define IORING_OP_WRITEV 2 -- cgit v1.3-7-g2ca7 From edafccee56ff31678a091ddb7219aba9b28bc3cb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 9 Jan 2019 09:16:05 -0700 Subject: io_uring: add support for pre-mapped user IO buffers If we have fixed user buffers, we can map them into the kernel when we setup the io_uring. That avoids the need to do get_user_pages() for each and every IO. To utilize this feature, the application must call io_uring_register() after having setup an io_uring instance, passing in IORING_REGISTER_BUFFERS as the opcode. The argument must be a pointer to an iovec array, and the nr_args should contain how many iovecs the application wishes to map. If successful, these buffers are now mapped into the kernel, eligible for IO. To use these fixed buffers, the application must use the IORING_OP_READ_FIXED and IORING_OP_WRITE_FIXED opcodes, and then set sqe->index to the desired buffer index. sqe->addr..sqe->addr+seq->len must point to somewhere inside the indexed buffer. The application may register buffers throughout the lifetime of the io_uring instance. It can call io_uring_register() with IORING_UNREGISTER_BUFFERS as the opcode to unregister the current set of buffers, and then register a new set. The application need not unregister buffers explicitly before shutting down the io_uring instance. It's perfectly valid to setup a larger buffer, and then sometimes only use parts of it for an IO. As long as the range is within the originally mapped region, it will work just fine. For now, buffers must not be file backed. If file backed buffers are passed in, the registration will fail with -1/EOPNOTSUPP. This restriction may be relaxed in the future. RLIMIT_MEMLOCK is used to check how much memory we can pin. A somewhat arbitrary 1G per buffer size is also imposed. Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + fs/io_uring.c | 374 +++++++++++++++++++++++++++++++-- include/linux/syscalls.h | 2 + include/uapi/asm-generic/unistd.h | 4 +- include/uapi/linux/io_uring.h | 13 +- kernel/sys_ni.c | 1 + 7 files changed, 381 insertions(+), 15 deletions(-) (limited to 'include/uapi') diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 481c126259e9..2eefd2a7c1ce 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -400,3 +400,4 @@ 386 i386 rseq sys_rseq __ia32_sys_rseq 425 i386 io_uring_setup sys_io_uring_setup __ia32_sys_io_uring_setup 426 i386 io_uring_enter sys_io_uring_enter __ia32_sys_io_uring_enter +427 i386 io_uring_register sys_io_uring_register __ia32_sys_io_uring_register diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 6a32a430c8e0..65c026185e61 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -345,6 +345,7 @@ 334 common rseq __x64_sys_rseq 425 common io_uring_setup __x64_sys_io_uring_setup 426 common io_uring_enter __x64_sys_io_uring_enter +427 common io_uring_register __x64_sys_io_uring_register # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/fs/io_uring.c b/fs/io_uring.c index 31f43ed894ba..c0c0f68568b5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -52,6 +53,8 @@ #include #include #include +#include +#include #include @@ -81,6 +84,13 @@ struct io_cq_ring { struct io_uring_cqe cqes[]; }; +struct io_mapped_ubuf { + u64 ubuf; + size_t len; + struct bio_vec *bvec; + unsigned int nr_bvecs; +}; + struct io_ring_ctx { struct { struct percpu_ref refs; @@ -113,6 +123,10 @@ struct io_ring_ctx { struct fasync_struct *cq_fasync; } ____cacheline_aligned_in_smp; + /* if used, fixed mapped user buffers */ + unsigned nr_user_bufs; + struct io_mapped_ubuf *user_bufs; + struct user_struct *user; struct completion ctx_done; @@ -732,6 +746,46 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) } } +static int io_import_fixed(struct io_ring_ctx *ctx, int rw, + const struct io_uring_sqe *sqe, + struct iov_iter *iter) +{ + size_t len = READ_ONCE(sqe->len); + struct io_mapped_ubuf *imu; + unsigned index, buf_index; + size_t offset; + u64 buf_addr; + + /* attempt to use fixed buffers without having provided iovecs */ + if (unlikely(!ctx->user_bufs)) + return -EFAULT; + + buf_index = READ_ONCE(sqe->buf_index); + if (unlikely(buf_index >= ctx->nr_user_bufs)) + return -EFAULT; + + index = array_index_nospec(buf_index, ctx->nr_user_bufs); + imu = &ctx->user_bufs[index]; + buf_addr = READ_ONCE(sqe->addr); + + /* overflow */ + if (buf_addr + len < buf_addr) + return -EFAULT; + /* not inside the mapped region */ + if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len) + return -EFAULT; + + /* + * May not be a start of buffer, set size appropriately + * and advance us to the beginning. + */ + offset = buf_addr - imu->ubuf; + iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len); + if (offset) + iov_iter_advance(iter, offset); + return 0; +} + static int io_import_iovec(struct io_ring_ctx *ctx, int rw, const struct sqe_submit *s, struct iovec **iovec, struct iov_iter *iter) @@ -739,6 +793,23 @@ static int io_import_iovec(struct io_ring_ctx *ctx, int rw, const struct io_uring_sqe *sqe = s->sqe; void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); size_t sqe_len = READ_ONCE(sqe->len); + u8 opcode; + + /* + * We're reading ->opcode for the second time, but the first read + * doesn't care whether it's _FIXED or not, so it doesn't matter + * whether ->opcode changes concurrently. The first read does care + * about whether it is a READ or a WRITE, so we don't trust this read + * for that purpose and instead let the caller pass in the read/write + * flag. + */ + opcode = READ_ONCE(sqe->opcode); + if (opcode == IORING_OP_READ_FIXED || + opcode == IORING_OP_WRITE_FIXED) { + ssize_t ret = io_import_fixed(ctx, rw, sqe, iter); + *iovec = NULL; + return ret; + } if (!s->has_user) return -EFAULT; @@ -886,7 +957,7 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (unlikely(sqe->addr || sqe->ioprio)) + if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) return -EINVAL; fd = READ_ONCE(sqe->fd); @@ -945,9 +1016,19 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, ret = io_nop(req, req->user_data); break; case IORING_OP_READV: + if (unlikely(s->sqe->buf_index)) + return -EINVAL; ret = io_read(req, s, force_nonblock, state); break; case IORING_OP_WRITEV: + if (unlikely(s->sqe->buf_index)) + return -EINVAL; + ret = io_write(req, s, force_nonblock, state); + break; + case IORING_OP_READ_FIXED: + ret = io_read(req, s, force_nonblock, state); + break; + case IORING_OP_WRITE_FIXED: ret = io_write(req, s, force_nonblock, state); break; case IORING_OP_FSYNC: @@ -976,28 +1057,46 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, return 0; } +static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe) +{ + u8 opcode = READ_ONCE(sqe->opcode); + + return !(opcode == IORING_OP_READ_FIXED || + opcode == IORING_OP_WRITE_FIXED); +} + static void io_sq_wq_submit_work(struct work_struct *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct sqe_submit *s = &req->submit; const struct io_uring_sqe *sqe = s->sqe; struct io_ring_ctx *ctx = req->ctx; - mm_segment_t old_fs = get_fs(); + mm_segment_t old_fs; + bool needs_user; int ret; /* Ensure we clear previously set forced non-block flag */ req->flags &= ~REQ_F_FORCE_NONBLOCK; req->rw.ki_flags &= ~IOCB_NOWAIT; - if (!mmget_not_zero(ctx->sqo_mm)) { - ret = -EFAULT; - goto err; - } - - use_mm(ctx->sqo_mm); - set_fs(USER_DS); - s->has_user = true; s->needs_lock = true; + s->has_user = false; + + /* + * If we're doing IO to fixed buffers, we don't need to get/set + * user context + */ + needs_user = io_sqe_needs_user(s->sqe); + if (needs_user) { + if (!mmget_not_zero(ctx->sqo_mm)) { + ret = -EFAULT; + goto err; + } + use_mm(ctx->sqo_mm); + old_fs = get_fs(); + set_fs(USER_DS); + s->has_user = true; + } do { ret = __io_submit_sqe(ctx, req, s, false, NULL); @@ -1011,9 +1110,11 @@ static void io_sq_wq_submit_work(struct work_struct *work) cond_resched(); } while (1); - set_fs(old_fs); - unuse_mm(ctx->sqo_mm); - mmput(ctx->sqo_mm); + if (needs_user) { + set_fs(old_fs); + unuse_mm(ctx->sqo_mm); + mmput(ctx->sqo_mm); + } err: if (ret) { io_cqring_add_event(ctx, sqe->user_data, ret, 0); @@ -1317,6 +1418,198 @@ static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries) return (bytes + PAGE_SIZE - 1) / PAGE_SIZE; } +static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) +{ + int i, j; + + if (!ctx->user_bufs) + return -ENXIO; + + for (i = 0; i < ctx->nr_user_bufs; i++) { + struct io_mapped_ubuf *imu = &ctx->user_bufs[i]; + + for (j = 0; j < imu->nr_bvecs; j++) + put_page(imu->bvec[j].bv_page); + + if (ctx->account_mem) + io_unaccount_mem(ctx->user, imu->nr_bvecs); + kfree(imu->bvec); + imu->nr_bvecs = 0; + } + + kfree(ctx->user_bufs); + ctx->user_bufs = NULL; + ctx->nr_user_bufs = 0; + return 0; +} + +static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst, + void __user *arg, unsigned index) +{ + struct iovec __user *src; + +#ifdef CONFIG_COMPAT + if (ctx->compat) { + struct compat_iovec __user *ciovs; + struct compat_iovec ciov; + + ciovs = (struct compat_iovec __user *) arg; + if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov))) + return -EFAULT; + + dst->iov_base = (void __user *) (unsigned long) ciov.iov_base; + dst->iov_len = ciov.iov_len; + return 0; + } +#endif + src = (struct iovec __user *) arg; + if (copy_from_user(dst, &src[index], sizeof(*dst))) + return -EFAULT; + return 0; +} + +static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args) +{ + struct vm_area_struct **vmas = NULL; + struct page **pages = NULL; + int i, j, got_pages = 0; + int ret = -EINVAL; + + if (ctx->user_bufs) + return -EBUSY; + if (!nr_args || nr_args > UIO_MAXIOV) + return -EINVAL; + + ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf), + GFP_KERNEL); + if (!ctx->user_bufs) + return -ENOMEM; + + for (i = 0; i < nr_args; i++) { + struct io_mapped_ubuf *imu = &ctx->user_bufs[i]; + unsigned long off, start, end, ubuf; + int pret, nr_pages; + struct iovec iov; + size_t size; + + ret = io_copy_iov(ctx, &iov, arg, i); + if (ret) + break; + + /* + * Don't impose further limits on the size and buffer + * constraints here, we'll -EINVAL later when IO is + * submitted if they are wrong. + */ + ret = -EFAULT; + if (!iov.iov_base || !iov.iov_len) + goto err; + + /* arbitrary limit, but we need something */ + if (iov.iov_len > SZ_1G) + goto err; + + ubuf = (unsigned long) iov.iov_base; + end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = ubuf >> PAGE_SHIFT; + nr_pages = end - start; + + if (ctx->account_mem) { + ret = io_account_mem(ctx->user, nr_pages); + if (ret) + goto err; + } + + ret = 0; + if (!pages || nr_pages > got_pages) { + kfree(vmas); + kfree(pages); + pages = kmalloc_array(nr_pages, sizeof(struct page *), + GFP_KERNEL); + vmas = kmalloc_array(nr_pages, + sizeof(struct vm_area_struct *), + GFP_KERNEL); + if (!pages || !vmas) { + ret = -ENOMEM; + if (ctx->account_mem) + io_unaccount_mem(ctx->user, nr_pages); + goto err; + } + got_pages = nr_pages; + } + + imu->bvec = kmalloc_array(nr_pages, sizeof(struct bio_vec), + GFP_KERNEL); + ret = -ENOMEM; + if (!imu->bvec) { + if (ctx->account_mem) + io_unaccount_mem(ctx->user, nr_pages); + goto err; + } + + ret = 0; + down_read(¤t->mm->mmap_sem); + pret = get_user_pages_longterm(ubuf, nr_pages, FOLL_WRITE, + pages, vmas); + if (pret == nr_pages) { + /* don't support file backed memory */ + for (j = 0; j < nr_pages; j++) { + struct vm_area_struct *vma = vmas[j]; + + if (vma->vm_file && + !is_file_hugepages(vma->vm_file)) { + ret = -EOPNOTSUPP; + break; + } + } + } else { + ret = pret < 0 ? pret : -EFAULT; + } + up_read(¤t->mm->mmap_sem); + if (ret) { + /* + * if we did partial map, or found file backed vmas, + * release any pages we did get + */ + if (pret > 0) { + for (j = 0; j < pret; j++) + put_page(pages[j]); + } + if (ctx->account_mem) + io_unaccount_mem(ctx->user, nr_pages); + goto err; + } + + off = ubuf & ~PAGE_MASK; + size = iov.iov_len; + for (j = 0; j < nr_pages; j++) { + size_t vec_len; + + vec_len = min_t(size_t, size, PAGE_SIZE - off); + imu->bvec[j].bv_page = pages[j]; + imu->bvec[j].bv_len = vec_len; + imu->bvec[j].bv_offset = off; + off = 0; + size -= vec_len; + } + /* store original address for later verification */ + imu->ubuf = ubuf; + imu->len = iov.iov_len; + imu->nr_bvecs = nr_pages; + + ctx->nr_user_bufs++; + } + kfree(pages); + kfree(vmas); + return 0; +err: + kfree(pages); + kfree(vmas); + io_sqe_buffer_unregister(ctx); + return ret; +} + static void io_ring_ctx_free(struct io_ring_ctx *ctx) { if (ctx->sqo_wq) @@ -1325,6 +1618,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) mmdrop(ctx->sqo_mm); io_iopoll_reap_events(ctx); + io_sqe_buffer_unregister(ctx); #if defined(CONFIG_UNIX) if (ctx->ring_sock) @@ -1689,6 +1983,60 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries, return io_uring_setup(entries, params); } +static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, + void __user *arg, unsigned nr_args) +{ + int ret; + + percpu_ref_kill(&ctx->refs); + wait_for_completion(&ctx->ctx_done); + + switch (opcode) { + case IORING_REGISTER_BUFFERS: + ret = io_sqe_buffer_register(ctx, arg, nr_args); + break; + case IORING_UNREGISTER_BUFFERS: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_sqe_buffer_unregister(ctx); + break; + default: + ret = -EINVAL; + break; + } + + /* bring the ctx back to life */ + reinit_completion(&ctx->ctx_done); + percpu_ref_reinit(&ctx->refs); + return ret; +} + +SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, + void __user *, arg, unsigned int, nr_args) +{ + struct io_ring_ctx *ctx; + long ret = -EBADF; + struct fd f; + + f = fdget(fd); + if (!f.file) + return -EBADF; + + ret = -EOPNOTSUPP; + if (f.file->f_op != &io_uring_fops) + goto out_fput; + + ctx = f.file->private_data; + + mutex_lock(&ctx->uring_lock); + ret = __io_uring_register(ctx, opcode, arg, nr_args); + mutex_unlock(&ctx->uring_lock); +out_fput: + fdput(f); + return ret; +} + static int __init io_uring_init(void) { req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 3072dbaa7869..3681c05ac538 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -315,6 +315,8 @@ asmlinkage long sys_io_uring_setup(u32 entries, asmlinkage long sys_io_uring_enter(unsigned int fd, u32 to_submit, u32 min_complete, u32 flags, const sigset_t __user *sig, size_t sigsz); +asmlinkage long sys_io_uring_register(unsigned int fd, unsigned int op, + void __user *arg, unsigned int nr_args); /* fs/xattr.c */ asmlinkage long sys_setxattr(const char __user *path, const char __user *name, diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 87871e7b7ea7..d346229a1eb0 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -744,9 +744,11 @@ __SYSCALL(__NR_kexec_file_load, sys_kexec_file_load) __SYSCALL(__NR_io_uring_setup, sys_io_uring_setup) #define __NR_io_uring_enter 426 __SYSCALL(__NR_io_uring_enter, sys_io_uring_enter) +#define __NR_io_uring_register 427 +__SYSCALL(__NR_io_uring_register, sys_io_uring_register) #undef __NR_syscalls -#define __NR_syscalls 427 +#define __NR_syscalls 428 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 5c457ea396e6..cf28f7a11f12 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -27,7 +27,10 @@ struct io_uring_sqe { __u32 fsync_flags; }; __u64 user_data; /* data to be passed back at completion time */ - __u64 __pad2[3]; + union { + __u16 buf_index; /* index into fixed buffers, if used */ + __u64 __pad2[3]; + }; }; /* @@ -39,6 +42,8 @@ struct io_uring_sqe { #define IORING_OP_READV 1 #define IORING_OP_WRITEV 2 #define IORING_OP_FSYNC 3 +#define IORING_OP_READ_FIXED 4 +#define IORING_OP_WRITE_FIXED 5 /* * sqe->fsync_flags @@ -103,4 +108,10 @@ struct io_uring_params { struct io_cqring_offsets cq_off; }; +/* + * io_uring_register(2) opcodes and arguments + */ +#define IORING_REGISTER_BUFFERS 0 +#define IORING_UNREGISTER_BUFFERS 1 + #endif diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index ee5e523564bb..1bb6604dc19f 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -48,6 +48,7 @@ COND_SYSCALL_COMPAT(io_getevents); COND_SYSCALL_COMPAT(io_pgetevents); COND_SYSCALL(io_uring_setup); COND_SYSCALL(io_uring_enter); +COND_SYSCALL(io_uring_register); /* fs/xattr.c */ -- cgit v1.3-7-g2ca7 From 6b06314c47e141031be043539900d80d2c7ba10f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 10 Jan 2019 22:13:58 -0700 Subject: io_uring: add file set registration We normally have to fget/fput for each IO we do on a file. Even with the batching we do, the cost of the atomic inc/dec of the file usage count adds up. This adds IORING_REGISTER_FILES, and IORING_UNREGISTER_FILES opcodes for the io_uring_register(2) system call. The arguments passed in must be an array of __s32 holding file descriptors, and nr_args should hold the number of file descriptors the application wishes to pin for the duration of the io_uring instance (or until IORING_UNREGISTER_FILES is called). When used, the application must set IOSQE_FIXED_FILE in the sqe->flags member. Then, instead of setting sqe->fd to the real fd, it sets sqe->fd to the index in the array passed in to IORING_REGISTER_FILES. Files are automatically unregistered when the io_uring instance is torn down. An application need only unregister if it wishes to register a new set of fds. Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- fs/io_uring.c | 311 +++++++++++++++++++++++++++++++++++++----- include/uapi/linux/io_uring.h | 9 +- 2 files changed, 288 insertions(+), 32 deletions(-) (limited to 'include/uapi') diff --git a/fs/io_uring.c b/fs/io_uring.c index c0c0f68568b5..5eaa1c4a3f7c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -61,6 +62,7 @@ #include "internal.h" #define IORING_MAX_ENTRIES 4096 +#define IORING_MAX_FIXED_FILES 1024 struct io_uring { u32 head ____cacheline_aligned_in_smp; @@ -123,6 +125,14 @@ struct io_ring_ctx { struct fasync_struct *cq_fasync; } ____cacheline_aligned_in_smp; + /* + * If used, fixed file set. Writers must ensure that ->refs is dead, + * readers must ensure that ->refs is alive as long as the file* is + * used. Only updated through io_uring_register(2). + */ + struct file **user_files; + unsigned nr_user_files; + /* if used, fixed mapped user buffers */ unsigned nr_user_bufs; struct io_mapped_ubuf *user_bufs; @@ -170,6 +180,7 @@ struct io_kiocb { unsigned int flags; #define REQ_F_FORCE_NONBLOCK 1 /* inline submission attempt */ #define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */ +#define REQ_F_FIXED_FILE 4 /* ctx owns file */ u64 user_data; u64 error; @@ -404,15 +415,17 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, * Batched puts of the same file, to avoid dirtying the * file usage count multiple times, if avoidable. */ - if (!file) { - file = req->rw.ki_filp; - file_count = 1; - } else if (file == req->rw.ki_filp) { - file_count++; - } else { - fput_many(file, file_count); - file = req->rw.ki_filp; - file_count = 1; + if (!(req->flags & REQ_F_FIXED_FILE)) { + if (!file) { + file = req->rw.ki_filp; + file_count = 1; + } else if (file == req->rw.ki_filp) { + file_count++; + } else { + fput_many(file, file_count); + file = req->rw.ki_filp; + file_count = 1; + } } if (to_free == ARRAY_SIZE(reqs)) @@ -544,13 +557,19 @@ static void kiocb_end_write(struct kiocb *kiocb) } } +static void io_fput(struct io_kiocb *req) +{ + if (!(req->flags & REQ_F_FIXED_FILE)) + fput(req->rw.ki_filp); +} + static void io_complete_rw(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); kiocb_end_write(kiocb); - fput(kiocb->ki_filp); + io_fput(req); io_cqring_add_event(req->ctx, req->user_data, res, 0); io_free_req(req); } @@ -666,19 +685,29 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, { struct io_ring_ctx *ctx = req->ctx; struct kiocb *kiocb = &req->rw; - unsigned ioprio; + unsigned ioprio, flags; int fd, ret; /* For -EAGAIN retry, everything is already prepped */ if (kiocb->ki_filp) return 0; + flags = READ_ONCE(sqe->flags); fd = READ_ONCE(sqe->fd); - kiocb->ki_filp = io_file_get(state, fd); - if (unlikely(!kiocb->ki_filp)) - return -EBADF; - if (force_nonblock && !io_file_supports_async(kiocb->ki_filp)) - force_nonblock = false; + + if (flags & IOSQE_FIXED_FILE) { + if (unlikely(!ctx->user_files || + (unsigned) fd >= ctx->nr_user_files)) + return -EBADF; + kiocb->ki_filp = ctx->user_files[fd]; + req->flags |= REQ_F_FIXED_FILE; + } else { + kiocb->ki_filp = io_file_get(state, fd); + if (unlikely(!kiocb->ki_filp)) + return -EBADF; + if (force_nonblock && !io_file_supports_async(kiocb->ki_filp)) + force_nonblock = false; + } kiocb->ki_pos = READ_ONCE(sqe->off); kiocb->ki_flags = iocb_flags(kiocb->ki_filp); kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); @@ -718,10 +747,14 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, } return 0; out_fput: - /* in case of error, we didn't use this file reference. drop it. */ - if (state) - state->used_refs--; - io_file_put(state, kiocb->ki_filp); + if (!(flags & IOSQE_FIXED_FILE)) { + /* + * in case of error, we didn't use this file reference. drop it. + */ + if (state) + state->used_refs--; + io_file_put(state, kiocb->ki_filp); + } return ret; } @@ -863,7 +896,7 @@ static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s, out_fput: /* Hold on to the file for -EAGAIN */ if (unlikely(ret && ret != -EAGAIN)) - fput(file); + io_fput(req); return ret; } @@ -917,7 +950,7 @@ static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s, kfree(iovec); out_fput: if (unlikely(ret)) - fput(file); + io_fput(req); return ret; } @@ -940,7 +973,7 @@ static int io_nop(struct io_kiocb *req, u64 user_data) */ if (req->rw.ki_filp) { err = -EBADF; - fput(req->rw.ki_filp); + io_fput(req); } io_cqring_add_event(ctx, user_data, err, 0); io_free_req(req); @@ -949,21 +982,32 @@ static int io_nop(struct io_kiocb *req, u64 user_data) static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_ring_ctx *ctx = req->ctx; + unsigned flags; int fd; /* Prep already done */ if (req->rw.ki_filp) return 0; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) return -EINVAL; fd = READ_ONCE(sqe->fd); - req->rw.ki_filp = fget(fd); - if (unlikely(!req->rw.ki_filp)) - return -EBADF; + flags = READ_ONCE(sqe->flags); + + if (flags & IOSQE_FIXED_FILE) { + if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files)) + return -EBADF; + req->rw.ki_filp = ctx->user_files[fd]; + req->flags |= REQ_F_FIXED_FILE; + } else { + req->rw.ki_filp = fget(fd); + if (unlikely(!req->rw.ki_filp)) + return -EBADF; + } return 0; } @@ -993,7 +1037,7 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe, end > 0 ? end : LLONG_MAX, fsync_flags & IORING_FSYNC_DATASYNC); - fput(req->rw.ki_filp); + io_fput(req); io_cqring_add_event(req->ctx, sqe->user_data, ret, 0); io_free_req(req); return 0; @@ -1132,7 +1176,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, ssize_t ret; /* enforce forwards compatibility on users */ - if (unlikely(s->sqe->flags)) + if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE)) return -EINVAL; req = io_get_req(ctx, state); @@ -1344,6 +1388,201 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0; } +static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) +{ +#if defined(CONFIG_UNIX) + if (ctx->ring_sock) { + struct sock *sock = ctx->ring_sock->sk; + struct sk_buff *skb; + + while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL) + kfree_skb(skb); + } +#else + int i; + + for (i = 0; i < ctx->nr_user_files; i++) + fput(ctx->user_files[i]); +#endif +} + +static int io_sqe_files_unregister(struct io_ring_ctx *ctx) +{ + if (!ctx->user_files) + return -ENXIO; + + __io_sqe_files_unregister(ctx); + kfree(ctx->user_files); + ctx->user_files = NULL; + ctx->nr_user_files = 0; + return 0; +} + +static void io_finish_async(struct io_ring_ctx *ctx) +{ + if (ctx->sqo_wq) { + destroy_workqueue(ctx->sqo_wq); + ctx->sqo_wq = NULL; + } +} + +#if defined(CONFIG_UNIX) +static void io_destruct_skb(struct sk_buff *skb) +{ + struct io_ring_ctx *ctx = skb->sk->sk_user_data; + + io_finish_async(ctx); + unix_destruct_scm(skb); +} + +/* + * Ensure the UNIX gc is aware of our file set, so we are certain that + * the io_uring can be safely unregistered on process exit, even if we have + * loops in the file referencing. + */ +static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) +{ + struct sock *sk = ctx->ring_sock->sk; + struct scm_fp_list *fpl; + struct sk_buff *skb; + int i; + + if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) { + unsigned long inflight = ctx->user->unix_inflight + nr; + + if (inflight > task_rlimit(current, RLIMIT_NOFILE)) + return -EMFILE; + } + + fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); + if (!fpl) + return -ENOMEM; + + skb = alloc_skb(0, GFP_KERNEL); + if (!skb) { + kfree(fpl); + return -ENOMEM; + } + + skb->sk = sk; + skb->destructor = io_destruct_skb; + + fpl->user = get_uid(ctx->user); + for (i = 0; i < nr; i++) { + fpl->fp[i] = get_file(ctx->user_files[i + offset]); + unix_inflight(fpl->user, fpl->fp[i]); + } + + fpl->max = fpl->count = nr; + UNIXCB(skb).fp = fpl; + refcount_add(skb->truesize, &sk->sk_wmem_alloc); + skb_queue_head(&sk->sk_receive_queue, skb); + + for (i = 0; i < nr; i++) + fput(fpl->fp[i]); + + return 0; +} + +/* + * If UNIX sockets are enabled, fd passing can cause a reference cycle which + * causes regular reference counting to break down. We rely on the UNIX + * garbage collection to take care of this problem for us. + */ +static int io_sqe_files_scm(struct io_ring_ctx *ctx) +{ + unsigned left, total; + int ret = 0; + + total = 0; + left = ctx->nr_user_files; + while (left) { + unsigned this_files = min_t(unsigned, left, SCM_MAX_FD); + int ret; + + ret = __io_sqe_files_scm(ctx, this_files, total); + if (ret) + break; + left -= this_files; + total += this_files; + } + + if (!ret) + return 0; + + while (total < ctx->nr_user_files) { + fput(ctx->user_files[total]); + total++; + } + + return ret; +} +#else +static int io_sqe_files_scm(struct io_ring_ctx *ctx) +{ + return 0; +} +#endif + +static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args) +{ + __s32 __user *fds = (__s32 __user *) arg; + int fd, ret = 0; + unsigned i; + + if (ctx->user_files) + return -EBUSY; + if (!nr_args) + return -EINVAL; + if (nr_args > IORING_MAX_FIXED_FILES) + return -EMFILE; + + ctx->user_files = kcalloc(nr_args, sizeof(struct file *), GFP_KERNEL); + if (!ctx->user_files) + return -ENOMEM; + + for (i = 0; i < nr_args; i++) { + ret = -EFAULT; + if (copy_from_user(&fd, &fds[i], sizeof(fd))) + break; + + ctx->user_files[i] = fget(fd); + + ret = -EBADF; + if (!ctx->user_files[i]) + break; + /* + * Don't allow io_uring instances to be registered. If UNIX + * isn't enabled, then this causes a reference cycle and this + * instance can never get freed. If UNIX is enabled we'll + * handle it just fine, but there's still no point in allowing + * a ring fd as it doesn't support regular read/write anyway. + */ + if (ctx->user_files[i]->f_op == &io_uring_fops) { + fput(ctx->user_files[i]); + break; + } + ctx->nr_user_files++; + ret = 0; + } + + if (ret) { + for (i = 0; i < ctx->nr_user_files; i++) + fput(ctx->user_files[i]); + + kfree(ctx->user_files); + ctx->nr_user_files = 0; + return ret; + } + + ret = io_sqe_files_scm(ctx); + if (ret) + io_sqe_files_unregister(ctx); + + return ret; +} + static int io_sq_offload_start(struct io_ring_ctx *ctx) { int ret; @@ -1612,13 +1851,13 @@ err: static void io_ring_ctx_free(struct io_ring_ctx *ctx) { - if (ctx->sqo_wq) - destroy_workqueue(ctx->sqo_wq); + io_finish_async(ctx); if (ctx->sqo_mm) mmdrop(ctx->sqo_mm); io_iopoll_reap_events(ctx); io_sqe_buffer_unregister(ctx); + io_sqe_files_unregister(ctx); #if defined(CONFIG_UNIX) if (ctx->ring_sock) @@ -1858,6 +2097,7 @@ static int io_uring_get_fd(struct io_ring_ctx *ctx) #if defined(CONFIG_UNIX) ctx->ring_sock->file = file; + ctx->ring_sock->sk->sk_user_data = ctx; #endif fd_install(ret, file); return ret; @@ -2001,6 +2241,15 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_sqe_buffer_unregister(ctx); break; + case IORING_REGISTER_FILES: + ret = io_sqe_files_register(ctx, arg, nr_args); + break; + case IORING_UNREGISTER_FILES: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_sqe_files_unregister(ctx); + break; default: ret = -EINVAL; break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index cf28f7a11f12..6257478d55e9 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -16,7 +16,7 @@ */ struct io_uring_sqe { __u8 opcode; /* type of operation for this sqe */ - __u8 flags; /* as of now unused */ + __u8 flags; /* IOSQE_ flags */ __u16 ioprio; /* ioprio for the request */ __s32 fd; /* file descriptor to do IO on */ __u64 off; /* offset into file */ @@ -33,6 +33,11 @@ struct io_uring_sqe { }; }; +/* + * sqe->flags + */ +#define IOSQE_FIXED_FILE (1U << 0) /* use fixed fileset */ + /* * io_uring_setup() flags */ @@ -113,5 +118,7 @@ struct io_uring_params { */ #define IORING_REGISTER_BUFFERS 0 #define IORING_UNREGISTER_BUFFERS 1 +#define IORING_REGISTER_FILES 2 +#define IORING_UNREGISTER_FILES 3 #endif -- cgit v1.3-7-g2ca7 From 6c271ce2f1d572f7fa225700a13cfe7ced492434 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 10 Jan 2019 11:22:30 -0700 Subject: io_uring: add submission polling This enables an application to do IO, without ever entering the kernel. By using the SQ ring to fill in new sqes and watching for completions on the CQ ring, we can submit and reap IOs without doing a single system call. The kernel side thread will poll for new submissions, and in case of HIPRI/polled IO, it'll also poll for completions. By default, we allow 1 second of active spinning. This can by changed by passing in a different grace period at io_uring_register(2) time. If the thread exceeds this idle time without having any work to do, it will set: sq_ring->flags |= IORING_SQ_NEED_WAKEUP. The application will have to call io_uring_enter() to start things back up again. If IO is kept busy, that will never be needed. Basically an application that has this feature enabled will guard it's io_uring_enter(2) call with: read_barrier(); if (*sq_ring->flags & IORING_SQ_NEED_WAKEUP) io_uring_enter(fd, 0, 0, IORING_ENTER_SQ_WAKEUP); instead of calling it unconditionally. It's mandatory to use fixed files with this feature. Failure to do so will result in the application getting an -EBADF CQ entry when submitting IO. Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- fs/io_uring.c | 249 ++++++++++++++++++++++++++++++++++++++++-- include/uapi/linux/io_uring.h | 12 +- 2 files changed, 253 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/fs/io_uring.c b/fs/io_uring.c index 5eaa1c4a3f7c..0a4caedf82c1 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -108,12 +109,16 @@ struct io_ring_ctx { unsigned cached_sq_head; unsigned sq_entries; unsigned sq_mask; + unsigned sq_thread_idle; struct io_uring_sqe *sq_sqes; } ____cacheline_aligned_in_smp; /* IO offload */ struct workqueue_struct *sqo_wq; + struct task_struct *sqo_thread; /* if using sq thread polling */ struct mm_struct *sqo_mm; + wait_queue_head_t sqo_wait; + unsigned sqo_stop; struct { /* CQ ring */ @@ -168,6 +173,7 @@ struct sqe_submit { unsigned short index; bool has_user; bool needs_lock; + bool needs_fixed_file; }; struct io_kiocb { @@ -327,6 +333,8 @@ static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 ki_user_data, if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); + if (waitqueue_active(&ctx->sqo_wait)) + wake_up(&ctx->sqo_wait); } static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs) @@ -680,9 +688,10 @@ static bool io_file_supports_async(struct file *file) return false; } -static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, +static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, bool force_nonblock, struct io_submit_state *state) { + const struct io_uring_sqe *sqe = s->sqe; struct io_ring_ctx *ctx = req->ctx; struct kiocb *kiocb = &req->rw; unsigned ioprio, flags; @@ -702,6 +711,8 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, kiocb->ki_filp = ctx->user_files[fd]; req->flags |= REQ_F_FIXED_FILE; } else { + if (s->needs_fixed_file) + return -EBADF; kiocb->ki_filp = io_file_get(state, fd); if (unlikely(!kiocb->ki_filp)) return -EBADF; @@ -865,7 +876,7 @@ static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s, struct file *file; ssize_t ret; - ret = io_prep_rw(req, s->sqe, force_nonblock, state); + ret = io_prep_rw(req, s, force_nonblock, state); if (ret) return ret; file = kiocb->ki_filp; @@ -909,7 +920,7 @@ static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s, struct file *file; ssize_t ret; - ret = io_prep_rw(req, s->sqe, force_nonblock, state); + ret = io_prep_rw(req, s, force_nonblock, state); if (ret) return ret; /* Hold on to the file for -EAGAIN */ @@ -1301,6 +1312,169 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) return false; } +static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, + unsigned int nr, bool has_user, bool mm_fault) +{ + struct io_submit_state state, *statep = NULL; + int ret, i, submitted = 0; + + if (nr > IO_PLUG_THRESHOLD) { + io_submit_state_start(&state, ctx, nr); + statep = &state; + } + + for (i = 0; i < nr; i++) { + if (unlikely(mm_fault)) { + ret = -EFAULT; + } else { + sqes[i].has_user = has_user; + sqes[i].needs_lock = true; + sqes[i].needs_fixed_file = true; + ret = io_submit_sqe(ctx, &sqes[i], statep); + } + if (!ret) { + submitted++; + continue; + } + + io_cqring_add_event(ctx, sqes[i].sqe->user_data, ret, 0); + } + + if (statep) + io_submit_state_end(&state); + + return submitted; +} + +static int io_sq_thread(void *data) +{ + struct sqe_submit sqes[IO_IOPOLL_BATCH]; + struct io_ring_ctx *ctx = data; + struct mm_struct *cur_mm = NULL; + mm_segment_t old_fs; + DEFINE_WAIT(wait); + unsigned inflight; + unsigned long timeout; + + old_fs = get_fs(); + set_fs(USER_DS); + + timeout = inflight = 0; + while (!kthread_should_stop() && !ctx->sqo_stop) { + bool all_fixed, mm_fault = false; + int i; + + if (inflight) { + unsigned nr_events = 0; + + if (ctx->flags & IORING_SETUP_IOPOLL) { + /* + * We disallow the app entering submit/complete + * with polling, but we still need to lock the + * ring to prevent racing with polled issue + * that got punted to a workqueue. + */ + mutex_lock(&ctx->uring_lock); + io_iopoll_check(ctx, &nr_events, 0); + mutex_unlock(&ctx->uring_lock); + } else { + /* + * Normal IO, just pretend everything completed. + * We don't have to poll completions for that. + */ + nr_events = inflight; + } + + inflight -= nr_events; + if (!inflight) + timeout = jiffies + ctx->sq_thread_idle; + } + + if (!io_get_sqring(ctx, &sqes[0])) { + /* + * We're polling. If we're within the defined idle + * period, then let us spin without work before going + * to sleep. + */ + if (inflight || !time_after(jiffies, timeout)) { + cpu_relax(); + continue; + } + + /* + * Drop cur_mm before scheduling, we can't hold it for + * long periods (or over schedule()). Do this before + * adding ourselves to the waitqueue, as the unuse/drop + * may sleep. + */ + if (cur_mm) { + unuse_mm(cur_mm); + mmput(cur_mm); + cur_mm = NULL; + } + + prepare_to_wait(&ctx->sqo_wait, &wait, + TASK_INTERRUPTIBLE); + + /* Tell userspace we may need a wakeup call */ + ctx->sq_ring->flags |= IORING_SQ_NEED_WAKEUP; + smp_wmb(); + + if (!io_get_sqring(ctx, &sqes[0])) { + if (kthread_should_stop()) { + finish_wait(&ctx->sqo_wait, &wait); + break; + } + if (signal_pending(current)) + flush_signals(current); + schedule(); + finish_wait(&ctx->sqo_wait, &wait); + + ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP; + smp_wmb(); + continue; + } + finish_wait(&ctx->sqo_wait, &wait); + + ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP; + smp_wmb(); + } + + i = 0; + all_fixed = true; + do { + if (all_fixed && io_sqe_needs_user(sqes[i].sqe)) + all_fixed = false; + + i++; + if (i == ARRAY_SIZE(sqes)) + break; + } while (io_get_sqring(ctx, &sqes[i])); + + /* Unless all new commands are FIXED regions, grab mm */ + if (!all_fixed && !cur_mm) { + mm_fault = !mmget_not_zero(ctx->sqo_mm); + if (!mm_fault) { + use_mm(ctx->sqo_mm); + cur_mm = ctx->sqo_mm; + } + } + + inflight += io_submit_sqes(ctx, sqes, i, cur_mm != NULL, + mm_fault); + + /* Commit SQ ring head once we've consumed all SQEs */ + io_commit_sqring(ctx); + } + + set_fs(old_fs); + if (cur_mm) { + unuse_mm(cur_mm); + mmput(cur_mm); + } + return 0; +} + static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) { struct io_submit_state state, *statep = NULL; @@ -1319,6 +1493,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) s.has_user = true; s.needs_lock = false; + s.needs_fixed_file = false; ret = io_submit_sqe(ctx, &s, statep); if (ret) { @@ -1418,8 +1593,20 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) return 0; } +static void io_sq_thread_stop(struct io_ring_ctx *ctx) +{ + if (ctx->sqo_thread) { + ctx->sqo_stop = 1; + mb(); + kthread_stop(ctx->sqo_thread); + ctx->sqo_thread = NULL; + } +} + static void io_finish_async(struct io_ring_ctx *ctx) { + io_sq_thread_stop(ctx); + if (ctx->sqo_wq) { destroy_workqueue(ctx->sqo_wq); ctx->sqo_wq = NULL; @@ -1583,13 +1770,47 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return ret; } -static int io_sq_offload_start(struct io_ring_ctx *ctx) +static int io_sq_offload_start(struct io_ring_ctx *ctx, + struct io_uring_params *p) { int ret; + init_waitqueue_head(&ctx->sqo_wait); mmgrab(current->mm); ctx->sqo_mm = current->mm; + ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle); + if (!ctx->sq_thread_idle) + ctx->sq_thread_idle = HZ; + + ret = -EINVAL; + if (!cpu_possible(p->sq_thread_cpu)) + goto err; + + if (ctx->flags & IORING_SETUP_SQPOLL) { + if (p->flags & IORING_SETUP_SQ_AFF) { + int cpu; + + cpu = array_index_nospec(p->sq_thread_cpu, NR_CPUS); + ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread, + ctx, cpu, + "io_uring-sq"); + } else { + ctx->sqo_thread = kthread_create(io_sq_thread, ctx, + "io_uring-sq"); + } + if (IS_ERR(ctx->sqo_thread)) { + ret = PTR_ERR(ctx->sqo_thread); + ctx->sqo_thread = NULL; + goto err; + } + wake_up_process(ctx->sqo_thread); + } else if (p->flags & IORING_SETUP_SQ_AFF) { + /* Can't have SQ_AFF without SQPOLL */ + ret = -EINVAL; + goto err; + } + /* Do QD, or 2 * CPUS, whatever is smallest */ ctx->sqo_wq = alloc_workqueue("io_ring-wq", WQ_UNBOUND | WQ_FREEZABLE, min(ctx->sq_entries - 1, 2 * num_online_cpus())); @@ -1600,6 +1821,7 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx) return 0; err: + io_sq_thread_stop(ctx); mmdrop(ctx->sqo_mm); ctx->sqo_mm = NULL; return ret; @@ -1959,7 +2181,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, int submitted = 0; struct fd f; - if (flags & ~IORING_ENTER_GETEVENTS) + if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP)) return -EINVAL; f = fdget(fd); @@ -1975,6 +2197,18 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (!percpu_ref_tryget(&ctx->refs)) goto out_fput; + /* + * For SQ polling, the thread will do all submissions and completions. + * Just return the requested submit count, and wake the thread if + * we were asked to. + */ + if (ctx->flags & IORING_SETUP_SQPOLL) { + if (flags & IORING_ENTER_SQ_WAKEUP) + wake_up(&ctx->sqo_wait); + submitted = to_submit; + goto out_ctx; + } + ret = 0; if (to_submit) { to_submit = min(to_submit, ctx->sq_entries); @@ -2156,7 +2390,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) if (ret) goto err; - ret = io_sq_offload_start(ctx); + ret = io_sq_offload_start(ctx, p); if (ret) goto err; @@ -2204,7 +2438,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) return -EINVAL; } - if (p.flags & ~IORING_SETUP_IOPOLL) + if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | + IORING_SETUP_SQ_AFF)) return -EINVAL; ret = io_uring_create(entries, &p); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 6257478d55e9..0ec74bab8dbe 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -42,6 +42,8 @@ struct io_uring_sqe { * io_uring_setup() flags */ #define IORING_SETUP_IOPOLL (1U << 0) /* io_context is polled */ +#define IORING_SETUP_SQPOLL (1U << 1) /* SQ poll thread */ +#define IORING_SETUP_SQ_AFF (1U << 2) /* sq_thread_cpu is valid */ #define IORING_OP_NOP 0 #define IORING_OP_READV 1 @@ -86,6 +88,11 @@ struct io_sqring_offsets { __u64 resv2; }; +/* + * sq_ring->flags + */ +#define IORING_SQ_NEED_WAKEUP (1U << 0) /* needs io_uring_enter wakeup */ + struct io_cqring_offsets { __u32 head; __u32 tail; @@ -100,6 +107,7 @@ struct io_cqring_offsets { * io_uring_enter(2) flags */ #define IORING_ENTER_GETEVENTS (1U << 0) +#define IORING_ENTER_SQ_WAKEUP (1U << 1) /* * Passed in for io_uring_setup(2). Copied back with updated info on success @@ -108,7 +116,9 @@ struct io_uring_params { __u32 sq_entries; __u32 cq_entries; __u32 flags; - __u32 resv[7]; + __u32 sq_thread_cpu; + __u32 sq_thread_idle; + __u32 resv[5]; struct io_sqring_offsets sq_off; struct io_cqring_offsets cq_off; }; -- cgit v1.3-7-g2ca7 From ca215086b14b89a0e70fc211314944aa6ce50020 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 5 Mar 2019 15:42:23 -0800 Subject: mm: convert PG_balloon to PG_offline PG_balloon was introduced to implement page migration/compaction for pages inflated in virtio-balloon. Nowadays, it is only a marker that a page is part of virtio-balloon and therefore logically offline. We also want to make use of this flag in other balloon drivers - for inflated pages or when onlining a section but keeping some pages offline (e.g. used right now by XEN and Hyper-V via set_online_page_callback()). We are going to expose this flag to dump tools like makedumpfile. But instead of exposing PG_balloon, let's generalize the concept of marking pages as logically offline, so it can be reused for other purposes later on. Rename PG_balloon to PG_offline. This is an indicator that the page is logically offline, the content stale and that it should not be touched (e.g. a hypervisor would have to allocate backing storage in order for the guest to dump an unused page). We can then e.g. exclude such pages from dumps. We replace and reuse KPF_BALLOON (23), as this shouldn't really harm (and for now the semantics stay the same). In following patches, we will make use of this bit also in other balloon drivers. While at it, document PGTABLE. [akpm@linux-foundation.org: fix comment text, per David] Link: http://lkml.kernel.org/r/20181119101616.8901-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Konstantin Khlebnikov Acked-by: Michael S. Tsirkin Acked-by: Pankaj gupta Cc: Jonathan Corbet Cc: Alexey Dobriyan Cc: Mike Rapoport Cc: Christian Hansen Cc: Vlastimil Babka Cc: "Kirill A. Shutemov" Cc: Stephen Rothwell Cc: Matthew Wilcox Cc: Michal Hocko Cc: Pavel Tatashin Cc: Alexander Duyck Cc: Naoya Horiguchi Cc: Miles Chen Cc: David Rientjes Cc: Kazuhito Hagio Cc: Arnd Bergmann Cc: Baoquan He Cc: Borislav Petkov Cc: Boris Ostrovsky Cc: Dave Young Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Juergen Gross Cc: Julien Freche Cc: Kairui Song Cc: "K. Y. Srinivasan" Cc: Len Brown Cc: Lianbo Jiang Cc: Michal Hocko Cc: Nadav Amit Cc: Omar Sandoval Cc: Pavel Machek Cc: Rafael J. Wysocki Cc: "Rafael J. Wysocki" Cc: Stefano Stabellini Cc: Stephen Hemminger Cc: Vitaly Kuznetsov Cc: Xavier Deguillard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/pagemap.rst | 9 ++++++--- fs/proc/page.c | 4 ++-- include/linux/balloon_compaction.h | 8 ++++---- include/linux/page-flags.h | 11 +++++++---- include/uapi/linux/kernel-page-flags.h | 2 +- tools/vm/page-types.c | 2 +- 6 files changed, 21 insertions(+), 15 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst index 3f7bade2c231..340a5aee9b80 100644 --- a/Documentation/admin-guide/mm/pagemap.rst +++ b/Documentation/admin-guide/mm/pagemap.rst @@ -75,9 +75,10 @@ number of times a page is mapped. 20. NOPAGE 21. KSM 22. THP - 23. BALLOON + 23. OFFLINE 24. ZERO_PAGE 25. IDLE + 26. PGTABLE * ``/proc/kpagecgroup``. This file contains a 64-bit inode number of the memory cgroup each page is charged to, indexed by PFN. Only available when @@ -118,8 +119,8 @@ Short descriptions to the page flags identical memory pages dynamically shared between one or more processes 22 - THP contiguous pages which construct transparent hugepages -23 - BALLOON - balloon compaction page +23 - OFFLINE + page is logically offline 24 - ZERO_PAGE zero page for pfn_zero or huge_zero page 25 - IDLE @@ -128,6 +129,8 @@ Short descriptions to the page flags Note that this flag may be stale in case the page was accessed via a PTE. To make sure the flag is up-to-date one has to read ``/sys/kernel/mm/page_idle/bitmap`` first. +26 - PGTABLE + page is in use as a page table IO related page flags --------------------- diff --git a/fs/proc/page.c b/fs/proc/page.c index 40b05e0d4274..544d1ee15aee 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -152,8 +152,8 @@ u64 stable_page_flags(struct page *page) else if (page_count(page) == 0 && is_free_buddy_page(page)) u |= 1 << KPF_BUDDY; - if (PageBalloon(page)) - u |= 1 << KPF_BALLOON; + if (PageOffline(page)) + u |= 1 << KPF_OFFLINE; if (PageTable(page)) u |= 1 << KPF_PGTABLE; diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index cbe50da5a59d..f111c780ef1d 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -95,7 +95,7 @@ extern int balloon_page_migrate(struct address_space *mapping, static inline void balloon_page_insert(struct balloon_dev_info *balloon, struct page *page) { - __SetPageBalloon(page); + __SetPageOffline(page); __SetPageMovable(page, balloon->inode->i_mapping); set_page_private(page, (unsigned long)balloon); list_add(&page->lru, &balloon->pages); @@ -111,7 +111,7 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon, */ static inline void balloon_page_delete(struct page *page) { - __ClearPageBalloon(page); + __ClearPageOffline(page); __ClearPageMovable(page); set_page_private(page, 0); /* @@ -141,13 +141,13 @@ static inline gfp_t balloon_mapping_gfp_mask(void) static inline void balloon_page_insert(struct balloon_dev_info *balloon, struct page *page) { - __SetPageBalloon(page); + __SetPageOffline(page); list_add(&page->lru, &balloon->pages); } static inline void balloon_page_delete(struct page *page) { - __ClearPageBalloon(page); + __ClearPageOffline(page); list_del(&page->lru); } diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 39b4494e29f1..808b4183e30d 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -671,7 +671,7 @@ PAGEFLAG_FALSE(DoubleMap) /* Reserve 0x0000007f to catch underflows of page_mapcount */ #define PAGE_MAPCOUNT_RESERVE -128 #define PG_buddy 0x00000080 -#define PG_balloon 0x00000100 +#define PG_offline 0x00000100 #define PG_kmemcg 0x00000200 #define PG_table 0x00000400 @@ -706,10 +706,13 @@ static __always_inline void __ClearPage##uname(struct page *page) \ PAGE_TYPE_OPS(Buddy, buddy) /* - * PageBalloon() is true for pages that are on the balloon page list - * (see mm/balloon_compaction.c). + * PageOffline() indicates that the page is logically offline although the + * containing section is online. (e.g. inflated in a balloon driver or + * not onlined when onlining the section). + * The content of these pages is effectively stale. Such pages should not + * be touched (read/write/dump/save) except by their owner. */ -PAGE_TYPE_OPS(Balloon, balloon) +PAGE_TYPE_OPS(Offline, offline) /* * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on diff --git a/include/uapi/linux/kernel-page-flags.h b/include/uapi/linux/kernel-page-flags.h index 21b9113c69da..6f2f2720f3ac 100644 --- a/include/uapi/linux/kernel-page-flags.h +++ b/include/uapi/linux/kernel-page-flags.h @@ -32,7 +32,7 @@ #define KPF_KSM 21 #define KPF_THP 22 -#define KPF_BALLOON 23 +#define KPF_OFFLINE 23 #define KPF_ZERO_PAGE 24 #define KPF_IDLE 25 #define KPF_PGTABLE 26 diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c index 1ff3a6c0367b..6f64b2b93234 100644 --- a/tools/vm/page-types.c +++ b/tools/vm/page-types.c @@ -133,7 +133,7 @@ static const char * const page_flag_names[] = { [KPF_NOPAGE] = "n:nopage", [KPF_KSM] = "x:ksm", [KPF_THP] = "t:thp", - [KPF_BALLOON] = "o:balloon", + [KPF_OFFLINE] = "o:offline", [KPF_PGTABLE] = "g:pgtable", [KPF_ZERO_PAGE] = "z:zero_page", [KPF_IDLE] = "i:idle_page", -- cgit v1.3-7-g2ca7 From ab3948f58ff841e51feb845720624665ef5b7ef3 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 5 Mar 2019 15:47:54 -0800 Subject: mm/memfd: add an F_SEAL_FUTURE_WRITE seal to memfd Android uses ashmem for sharing memory regions. We are looking forward to migrating all usecases of ashmem to memfd so that we can possibly remove the ashmem driver in the future from staging while also benefiting from using memfd and contributing to it. Note staging drivers are also not ABI and generally can be removed at anytime. One of the main usecases Android has is the ability to create a region and mmap it as writeable, then add protection against making any "future" writes while keeping the existing already mmap'ed writeable-region active. This allows us to implement a usecase where receivers of the shared memory buffer can get a read-only view, while the sender continues to write to the buffer. See CursorWindow documentation in Android for more details: https://developer.android.com/reference/android/database/CursorWindow This usecase cannot be implemented with the existing F_SEAL_WRITE seal. To support the usecase, this patch adds a new F_SEAL_FUTURE_WRITE seal which prevents any future mmap and write syscalls from succeeding while keeping the existing mmap active. A better way to do F_SEAL_FUTURE_WRITE seal was discussed [1] last week where we don't need to modify core VFS structures to get the same behavior of the seal. This solves several side-effects pointed by Andy. self-tests are provided in later patch to verify the expected semantics. [1] https://lore.kernel.org/lkml/20181111173650.GA256781@google.com/ Thanks a lot to Andy for suggestions to improve code. Link: http://lkml.kernel.org/r/20190112203816.85534-2-joel@joelfernandes.org Signed-off-by: Joel Fernandes (Google) Acked-by: John Stultz Cc: Andy Lutomirski Cc: Minchan Kim Cc: Jann Horn Cc: Al Viro Cc: Andy Lutomirski Cc: Hugh Dickins Cc: J. Bruce Fields Cc: Jeff Layton Cc: Marc-Andr Lureau Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 2 +- include/uapi/linux/fcntl.h | 1 + mm/memfd.c | 3 ++- mm/shmem.c | 25 ++++++++++++++++++++++--- 4 files changed, 26 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a7fa037b876b..b0eef008de67 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -530,7 +530,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) inode_lock(inode); /* protected by i_mutex */ - if (info->seals & F_SEAL_WRITE) { + if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { inode_unlock(inode); return -EPERM; } diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 6448cdd9a350..a2f8658f1c55 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -41,6 +41,7 @@ #define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ #define F_SEAL_GROW 0x0004 /* prevent file from growing */ #define F_SEAL_WRITE 0x0008 /* prevent writes */ +#define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */ /* (1U << 31) is reserved for signed error codes */ /* diff --git a/mm/memfd.c b/mm/memfd.c index 97264c79d2cd..650e65a46b9c 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -131,7 +131,8 @@ static unsigned int *memfd_file_seals_ptr(struct file *file) #define F_ALL_SEALS (F_SEAL_SEAL | \ F_SEAL_SHRINK | \ F_SEAL_GROW | \ - F_SEAL_WRITE) + F_SEAL_WRITE | \ + F_SEAL_FUTURE_WRITE) static int memfd_add_seals(struct file *file, unsigned int seals) { diff --git a/mm/shmem.c b/mm/shmem.c index 283a1833dafc..b3db3779a30a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2190,6 +2190,24 @@ out_nomem: static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { + struct shmem_inode_info *info = SHMEM_I(file_inode(file)); + + if (info->seals & F_SEAL_FUTURE_WRITE) { + /* + * New PROT_WRITE and MAP_SHARED mmaps are not allowed when + * "future write" seal active. + */ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) + return -EPERM; + + /* + * Since the F_SEAL_FUTURE_WRITE seals allow for a MAP_SHARED + * read-only mapping, take care to not allow mprotect to revert + * protections. + */ + vma->vm_flags &= ~(VM_MAYWRITE); + } + file_accessed(file); vma->vm_ops = &shmem_vm_ops; if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && @@ -2440,8 +2458,9 @@ shmem_write_begin(struct file *file, struct address_space *mapping, pgoff_t index = pos >> PAGE_SHIFT; /* i_mutex is held by caller */ - if (unlikely(info->seals & (F_SEAL_WRITE | F_SEAL_GROW))) { - if (info->seals & F_SEAL_WRITE) + if (unlikely(info->seals & (F_SEAL_GROW | + F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) { + if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) return -EPERM; if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) return -EPERM; @@ -2704,7 +2723,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); /* protected by i_mutex */ - if (info->seals & F_SEAL_WRITE) { + if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { error = -EPERM; goto out; } -- cgit v1.3-7-g2ca7 From 221c5eb2338232f7340386de1c43decc32682e58 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 17 Jan 2019 09:41:58 -0700 Subject: io_uring: add support for IORING_OP_POLL This is basically a direct port of bfe4037e722e, which implements a one-shot poll command through aio. Description below is based on that commit as well. However, instead of adding a POLL command and relying on io_cancel(2) to remove it, we mimic the epoll(2) interface of having a command to add a poll notification, IORING_OP_POLL_ADD, and one to remove it again, IORING_OP_POLL_REMOVE. To poll for a file descriptor the application should submit an sqe of type IORING_OP_POLL. It will poll the fd for the events specified in the poll_events field. Unlike poll or epoll without EPOLLONESHOT this interface always works in one shot mode, that is once the sqe is completed, it will have to be resubmitted. Reviewed-by: Hannes Reinecke Based-on-code-from: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/io_uring.c | 263 +++++++++++++++++++++++++++++++++++++++++- include/uapi/linux/io_uring.h | 3 + 2 files changed, 265 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/fs/io_uring.c b/fs/io_uring.c index d2a3a1bc85cc..85b914bd823c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -161,6 +161,7 @@ struct io_ring_ctx { * manipulate the list, hence no extra locking is needed there. */ struct list_head poll_list; + struct list_head cancel_list; } ____cacheline_aligned_in_smp; #if defined(CONFIG_UNIX) @@ -176,8 +177,20 @@ struct sqe_submit { bool needs_fixed_file; }; +struct io_poll_iocb { + struct file *file; + struct wait_queue_head *head; + __poll_t events; + bool woken; + bool canceled; + struct wait_queue_entry wait; +}; + struct io_kiocb { - struct kiocb rw; + union { + struct kiocb rw; + struct io_poll_iocb poll; + }; struct sqe_submit submit; @@ -261,6 +274,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->wait); spin_lock_init(&ctx->completion_lock); INIT_LIST_HEAD(&ctx->poll_list); + INIT_LIST_HEAD(&ctx->cancel_list); return ctx; } @@ -1058,6 +1072,246 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } +static void io_poll_remove_one(struct io_kiocb *req) +{ + struct io_poll_iocb *poll = &req->poll; + + spin_lock(&poll->head->lock); + WRITE_ONCE(poll->canceled, true); + if (!list_empty(&poll->wait.entry)) { + list_del_init(&poll->wait.entry); + queue_work(req->ctx->sqo_wq, &req->work); + } + spin_unlock(&poll->head->lock); + + list_del_init(&req->list); +} + +static void io_poll_remove_all(struct io_ring_ctx *ctx) +{ + struct io_kiocb *req; + + spin_lock_irq(&ctx->completion_lock); + while (!list_empty(&ctx->cancel_list)) { + req = list_first_entry(&ctx->cancel_list, struct io_kiocb,list); + io_poll_remove_one(req); + } + spin_unlock_irq(&ctx->completion_lock); +} + +/* + * Find a running poll command that matches one specified in sqe->addr, + * and remove it if found. + */ +static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_kiocb *poll_req, *next; + int ret = -ENOENT; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || + sqe->poll_events) + return -EINVAL; + + spin_lock_irq(&ctx->completion_lock); + list_for_each_entry_safe(poll_req, next, &ctx->cancel_list, list) { + if (READ_ONCE(sqe->addr) == poll_req->user_data) { + io_poll_remove_one(poll_req); + ret = 0; + break; + } + } + spin_unlock_irq(&ctx->completion_lock); + + io_cqring_add_event(req->ctx, sqe->user_data, ret, 0); + io_free_req(req); + return 0; +} + +static void io_poll_complete(struct io_kiocb *req, __poll_t mask) +{ + io_cqring_add_event(req->ctx, req->user_data, mangle_poll(mask), 0); + io_fput(req); + io_free_req(req); +} + +static void io_poll_complete_work(struct work_struct *work) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct io_poll_iocb *poll = &req->poll; + struct poll_table_struct pt = { ._key = poll->events }; + struct io_ring_ctx *ctx = req->ctx; + __poll_t mask = 0; + + if (!READ_ONCE(poll->canceled)) + mask = vfs_poll(poll->file, &pt) & poll->events; + + /* + * Note that ->ki_cancel callers also delete iocb from active_reqs after + * calling ->ki_cancel. We need the ctx_lock roundtrip here to + * synchronize with them. In the cancellation case the list_del_init + * itself is not actually needed, but harmless so we keep it in to + * avoid further branches in the fast path. + */ + spin_lock_irq(&ctx->completion_lock); + if (!mask && !READ_ONCE(poll->canceled)) { + add_wait_queue(poll->head, &poll->wait); + spin_unlock_irq(&ctx->completion_lock); + return; + } + list_del_init(&req->list); + spin_unlock_irq(&ctx->completion_lock); + + io_poll_complete(req, mask); +} + +static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, + void *key) +{ + struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb, + wait); + struct io_kiocb *req = container_of(poll, struct io_kiocb, poll); + struct io_ring_ctx *ctx = req->ctx; + __poll_t mask = key_to_poll(key); + + poll->woken = true; + + /* for instances that support it check for an event match first: */ + if (mask) { + unsigned long flags; + + if (!(mask & poll->events)) + return 0; + + /* try to complete the iocb inline if we can: */ + if (spin_trylock_irqsave(&ctx->completion_lock, flags)) { + list_del(&req->list); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + + list_del_init(&poll->wait.entry); + io_poll_complete(req, mask); + return 1; + } + } + + list_del_init(&poll->wait.entry); + queue_work(ctx->sqo_wq, &req->work); + return 1; +} + +struct io_poll_table { + struct poll_table_struct pt; + struct io_kiocb *req; + int error; +}; + +static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, + struct poll_table_struct *p) +{ + struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); + + if (unlikely(pt->req->poll.head)) { + pt->error = -EINVAL; + return; + } + + pt->error = 0; + pt->req->poll.head = head; + add_wait_queue(head, &pt->req->poll.wait); +} + +static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_poll_iocb *poll = &req->poll; + struct io_ring_ctx *ctx = req->ctx; + struct io_poll_table ipt; + unsigned flags; + __poll_t mask; + u16 events; + int fd; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index) + return -EINVAL; + + INIT_WORK(&req->work, io_poll_complete_work); + events = READ_ONCE(sqe->poll_events); + poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; + + flags = READ_ONCE(sqe->flags); + fd = READ_ONCE(sqe->fd); + + if (flags & IOSQE_FIXED_FILE) { + if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files)) + return -EBADF; + poll->file = ctx->user_files[fd]; + req->flags |= REQ_F_FIXED_FILE; + } else { + poll->file = fget(fd); + } + if (unlikely(!poll->file)) + return -EBADF; + + poll->head = NULL; + poll->woken = false; + poll->canceled = false; + + ipt.pt._qproc = io_poll_queue_proc; + ipt.pt._key = poll->events; + ipt.req = req; + ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */ + + /* initialized the list so that we can do list_empty checks */ + INIT_LIST_HEAD(&poll->wait.entry); + init_waitqueue_func_entry(&poll->wait, io_poll_wake); + + /* one for removal from waitqueue, one for this function */ + refcount_set(&req->refs, 2); + + mask = vfs_poll(poll->file, &ipt.pt) & poll->events; + if (unlikely(!poll->head)) { + /* we did not manage to set up a waitqueue, done */ + goto out; + } + + spin_lock_irq(&ctx->completion_lock); + spin_lock(&poll->head->lock); + if (poll->woken) { + /* wake_up context handles the rest */ + mask = 0; + ipt.error = 0; + } else if (mask || ipt.error) { + /* if we get an error or a mask we are done */ + WARN_ON_ONCE(list_empty(&poll->wait.entry)); + list_del_init(&poll->wait.entry); + } else { + /* actually waiting for an event */ + list_add_tail(&req->list, &ctx->cancel_list); + } + spin_unlock(&poll->head->lock); + spin_unlock_irq(&ctx->completion_lock); + +out: + if (unlikely(ipt.error)) { + if (!(flags & IOSQE_FIXED_FILE)) + fput(poll->file); + /* + * Drop one of our refs to this req, __io_submit_sqe() will + * drop the other one since we're returning an error. + */ + io_free_req(req); + return ipt.error; + } + + if (mask) + io_poll_complete(req, mask); + io_free_req(req); + return 0; +} + static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct sqe_submit *s, bool force_nonblock, struct io_submit_state *state) @@ -1093,6 +1347,12 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, case IORING_OP_FSYNC: ret = io_fsync(req, s->sqe, force_nonblock); break; + case IORING_OP_POLL_ADD: + ret = io_poll_add(req, s->sqe); + break; + case IORING_OP_POLL_REMOVE: + ret = io_poll_remove(req, s->sqe); + break; default: ret = -EINVAL; break; @@ -2131,6 +2391,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) percpu_ref_kill(&ctx->refs); mutex_unlock(&ctx->uring_lock); + io_poll_remove_all(ctx); io_iopoll_reap_events(ctx); wait_for_completion(&ctx->ctx_done); io_ring_ctx_free(ctx); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 0ec74bab8dbe..e23408692118 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -25,6 +25,7 @@ struct io_uring_sqe { union { __kernel_rwf_t rw_flags; __u32 fsync_flags; + __u16 poll_events; }; __u64 user_data; /* data to be passed back at completion time */ union { @@ -51,6 +52,8 @@ struct io_uring_sqe { #define IORING_OP_FSYNC 3 #define IORING_OP_READ_FIXED 4 #define IORING_OP_WRITE_FIXED 5 +#define IORING_OP_POLL_ADD 6 +#define IORING_OP_POLL_REMOVE 7 /* * sqe->fsync_flags -- cgit v1.3-7-g2ca7 From 54d50897d544c874562253e2a8f70dfcad22afe8 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 7 Mar 2019 16:27:14 -0800 Subject: linux/kernel.h: split *_MAX and *_MIN macros into tends to be cluttered because we often put various sort of unrelated stuff in it. So, we have split out a sensible chunk of code into a separate header from time to time. This commit splits out the *_MAX and *_MIN defines. The standard header contains various MAX, MIN constants including numerial limits. [1] I think it makes sense to move in-kernel MAX, MIN constants into include/linux/limits.h. We already have include/uapi/linux/limits.h to contain some user-space constants. I changed its include guard to _UAPI_LINUX_LIMITS_H. This change has no impact to the user-space because scripts/headers_install.sh rips off the '_UAPI' prefix from the include guards of exported headers. [1] http://pubs.opengroup.org/onlinepubs/009604499/basedefs/limits.h.html Link: http://lkml.kernel.org/r/1549156242-20806-2-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Cc: Alex Elder Cc: Alexey Dobriyan Cc: Zhang Yanmin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 29 +---------------------------- include/linux/limits.h | 36 ++++++++++++++++++++++++++++++++++++ include/uapi/linux/limits.h | 4 ++-- 3 files changed, 39 insertions(+), 30 deletions(-) create mode 100644 include/linux/limits.h (limited to 'include/uapi') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index a9ff66977e10..34a5036debd3 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -4,6 +4,7 @@ #include +#include #include #include #include @@ -17,34 +18,6 @@ #include #include -#define USHRT_MAX ((unsigned short)~0U) -#define SHRT_MAX ((short)(USHRT_MAX>>1)) -#define SHRT_MIN ((short)(-SHRT_MAX - 1)) -#define INT_MAX ((int)(~0U>>1)) -#define INT_MIN (-INT_MAX - 1) -#define UINT_MAX (~0U) -#define LONG_MAX ((long)(~0UL>>1)) -#define LONG_MIN (-LONG_MAX - 1) -#define ULONG_MAX (~0UL) -#define LLONG_MAX ((long long)(~0ULL>>1)) -#define LLONG_MIN (-LLONG_MAX - 1) -#define ULLONG_MAX (~0ULL) -#define SIZE_MAX (~(size_t)0) -#define PHYS_ADDR_MAX (~(phys_addr_t)0) - -#define U8_MAX ((u8)~0U) -#define S8_MAX ((s8)(U8_MAX>>1)) -#define S8_MIN ((s8)(-S8_MAX - 1)) -#define U16_MAX ((u16)~0U) -#define S16_MAX ((s16)(U16_MAX>>1)) -#define S16_MIN ((s16)(-S16_MAX - 1)) -#define U32_MAX ((u32)~0U) -#define S32_MAX ((s32)(U32_MAX>>1)) -#define S32_MIN ((s32)(-S32_MAX - 1)) -#define U64_MAX ((u64)~0ULL) -#define S64_MAX ((s64)(U64_MAX>>1)) -#define S64_MIN ((s64)(-S64_MAX - 1)) - #define STACK_MAGIC 0xdeadbeef /** diff --git a/include/linux/limits.h b/include/linux/limits.h new file mode 100644 index 000000000000..76afcd24ff8c --- /dev/null +++ b/include/linux/limits.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_LIMITS_H +#define _LINUX_LIMITS_H + +#include +#include + +#define USHRT_MAX ((unsigned short)~0U) +#define SHRT_MAX ((short)(USHRT_MAX >> 1)) +#define SHRT_MIN ((short)(-SHRT_MAX - 1)) +#define INT_MAX ((int)(~0U >> 1)) +#define INT_MIN (-INT_MAX - 1) +#define UINT_MAX (~0U) +#define LONG_MAX ((long)(~0UL >> 1)) +#define LONG_MIN (-LONG_MAX - 1) +#define ULONG_MAX (~0UL) +#define LLONG_MAX ((long long)(~0ULL >> 1)) +#define LLONG_MIN (-LLONG_MAX - 1) +#define ULLONG_MAX (~0ULL) +#define SIZE_MAX (~(size_t)0) +#define PHYS_ADDR_MAX (~(phys_addr_t)0) + +#define U8_MAX ((u8)~0U) +#define S8_MAX ((s8)(U8_MAX >> 1)) +#define S8_MIN ((s8)(-S8_MAX - 1)) +#define U16_MAX ((u16)~0U) +#define S16_MAX ((s16)(U16_MAX >> 1)) +#define S16_MIN ((s16)(-S16_MAX - 1)) +#define U32_MAX ((u32)~0U) +#define S32_MAX ((s32)(U32_MAX >> 1)) +#define S32_MIN ((s32)(-S32_MAX - 1)) +#define U64_MAX ((u64)~0ULL) +#define S64_MAX ((s64)(U64_MAX >> 1)) +#define S64_MIN ((s64)(-S64_MAX - 1)) + +#endif /* _LINUX_LIMITS_H */ diff --git a/include/uapi/linux/limits.h b/include/uapi/linux/limits.h index c3547f07605c..6bcbe3068761 100644 --- a/include/uapi/linux/limits.h +++ b/include/uapi/linux/limits.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _LINUX_LIMITS_H -#define _LINUX_LIMITS_H +#ifndef _UAPI_LINUX_LIMITS_H +#define _UAPI_LINUX_LIMITS_H #define NR_OPEN 1024 -- cgit v1.3-7-g2ca7 From 60d6d04ca3abb34d5e89f030dbea440d9715a168 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Thu, 7 Mar 2019 16:29:09 -0800 Subject: autofs: add ignore mount option Add an autofs file system mount option that can be used to provide a generic indicator to applications that the mount entry should be ignored when displaying mount information. In other OSes that provide autofs and that provide a mount list to user space based on the kernel mount list a no-op mount option ("ignore" is the one use on the most common OS) is allowed so that autofs file system users can optionally use it. The idea is that it be used by user space programs to exclude autofs mounts from consideration when reading the mounts list. Prior to the change to link /etc/mtab to /proc/self/mounts all I needed to do to achieve this was to use mount(2) and not update the mtab but now that no longer works. I know the symlinking happened a long time ago and I considered doing this then but, at the time I couldn't remember the commonly used option name and thought persuading the various utility maintainers would be too hard. But now I have a RHEL request to do this for compatibility for a widely used product so I want to go ahead with it and try and enlist the help of some utility package maintainers. Clearly, without the option nothing can be done so it's at least a start. Link: http://lkml.kernel.org/r/154725123970.11260.6113771566924907275.stgit@pluto-themaw-net Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs/autofs_i.h | 1 + fs/autofs/inode.c | 9 ++++++++- include/uapi/linux/auto_fs.h | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 3e59f0ed777b..b735f2b1e462 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -105,6 +105,7 @@ struct autofs_wait_queue { #define AUTOFS_SBI_CATATONIC 0x0001 #define AUTOFS_SBI_STRICTEXPIRE 0x0002 +#define AUTOFS_SBI_IGNORE 0x0004 struct autofs_sb_info { u32 magic; diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index 078992eee299..8647ecaa89fc 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -89,6 +89,8 @@ static int autofs_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",indirect"); if (sbi->flags & AUTOFS_SBI_STRICTEXPIRE) seq_printf(m, ",strictexpire"); + if (sbi->flags & AUTOFS_SBI_IGNORE) + seq_printf(m, ",ignore"); #ifdef CONFIG_CHECKPOINT_RESTORE if (sbi->pipe) seq_printf(m, ",pipe_ino=%ld", file_inode(sbi->pipe)->i_ino); @@ -111,7 +113,8 @@ static const struct super_operations autofs_sops = { }; enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto, - Opt_indirect, Opt_direct, Opt_offset, Opt_strictexpire}; + Opt_indirect, Opt_direct, Opt_offset, Opt_strictexpire, + Opt_ignore}; static const match_table_t tokens = { {Opt_fd, "fd=%u"}, @@ -124,6 +127,7 @@ static const match_table_t tokens = { {Opt_direct, "direct"}, {Opt_offset, "offset"}, {Opt_strictexpire, "strictexpire"}, + {Opt_ignore, "ignore"}, {Opt_err, NULL} }; @@ -206,6 +210,9 @@ static int parse_options(char *options, case Opt_strictexpire: sbi->flags |= AUTOFS_SBI_STRICTEXPIRE; break; + case Opt_ignore: + sbi->flags |= AUTOFS_SBI_IGNORE; + break; default: return 1; } diff --git a/include/uapi/linux/auto_fs.h b/include/uapi/linux/auto_fs.h index 082119630b49..1f7925afad2d 100644 --- a/include/uapi/linux/auto_fs.h +++ b/include/uapi/linux/auto_fs.h @@ -23,7 +23,7 @@ #define AUTOFS_MIN_PROTO_VERSION 3 #define AUTOFS_MAX_PROTO_VERSION 5 -#define AUTOFS_PROTO_SUBVERSION 4 +#define AUTOFS_PROTO_SUBVERSION 5 /* * The wait_queue_token (autofs_wqt_t) is part of a structure which is passed -- cgit v1.3-7-g2ca7 From 6eb3c3d0a52dca337e327ae8868ca1f44a712e02 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 7 Mar 2019 16:29:26 -0800 Subject: exec: increase BINPRM_BUF_SIZE to 256 Large enterprise clients often run applications out of networked file systems where the IT mandated layout of project volumes can end up leading to paths that are longer than 128 characters. Bumping this up to the next order of two solves this problem in all but the most egregious case while still fitting into a 512b slab. [oleg@redhat.com: update comment, per Kees] Link: http://lkml.kernel.org/r/20181112160956.GA28472@redhat.com Signed-off-by: Oleg Nesterov Reported-by: Ben Woodard Reviewed-by: Andrew Morton Acked-by: Michal Hocko Acked-by: Kees Cook Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- include/uapi/linux/binfmts.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/fs/exec.c b/fs/exec.c index bf0ace3841ad..2e0033348d8e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1563,7 +1563,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm) /* * Fill the binprm structure from the inode. - * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes + * Check permissions, then read the first BINPRM_BUF_SIZE bytes * * This may be called multiple times for binary chains (scripts for example). */ diff --git a/include/uapi/linux/binfmts.h b/include/uapi/linux/binfmts.h index 4abad03a8853..689025d9c185 100644 --- a/include/uapi/linux/binfmts.h +++ b/include/uapi/linux/binfmts.h @@ -16,6 +16,6 @@ struct pt_regs; #define MAX_ARG_STRINGS 0x7FFFFFFF /* sizeof(linux_binprm->buf) */ -#define BINPRM_BUF_SIZE 128 +#define BINPRM_BUF_SIZE 256 #endif /* _UAPI_LINUX_BINFMTS_H */ -- cgit v1.3-7-g2ca7 From dbafd7ddd62369b2f3926ab847cbf8fc40e800b7 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 12 Mar 2019 10:23:04 -0700 Subject: bpf: Add bpf_get_listener_sock(struct bpf_sock *sk) helper Add a new helper "struct bpf_sock *bpf_get_listener_sock(struct bpf_sock *sk)" which returns a bpf_sock in TCP_LISTEN state. It will trace back to the listener sk from a request_sock if possible. It returns NULL for all other cases. No reference is taken because the helper ensures the sk is in SOCK_RCU_FREE (where the TCP_LISTEN sock should be in). Hence, bpf_sk_release() is unnecessary and the verifier does not allow bpf_sk_release(listen_sk) to be called either. The following is also allowed because the bpf_prog is run under rcu_read_lock(): sk = bpf_sk_lookup_tcp(); /* if (!sk) { ... } */ listen_sk = bpf_get_listener_sock(sk); /* if (!listen_sk) { ... } */ bpf_sk_release(sk); src_port = listen_sk->src_port; /* Allowed */ Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 11 ++++++++++- net/core/filter.c | 21 +++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3c38ac9a92a7..983b25cb608d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2366,6 +2366,14 @@ union bpf_attr { * current value is ect (ECN capable). Works with IPv6 and IPv4. * Return * 1 if set, 0 if not set. + * + * struct bpf_sock *bpf_get_listener_sock(struct bpf_sock *sk) + * Description + * Return a **struct bpf_sock** pointer in TCP_LISTEN state. + * bpf_sk_release() is unnecessary and not allowed. + * Return + * A **struct bpf_sock** pointer on success, or NULL in + * case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2465,7 +2473,8 @@ union bpf_attr { FN(spin_unlock), \ FN(sk_fullsock), \ FN(tcp_sock), \ - FN(skb_ecn_set_ce), + FN(skb_ecn_set_ce), \ + FN(get_listener_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index 36b6afacf83c..647c63a7b25b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5418,6 +5418,23 @@ static const struct bpf_func_proto bpf_tcp_sock_proto = { .arg1_type = ARG_PTR_TO_SOCK_COMMON, }; +BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk) +{ + sk = sk_to_full_sk(sk); + + if (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE)) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +static const struct bpf_func_proto bpf_get_listener_sock_proto = { + .func = bpf_get_listener_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_SOCK_COMMON, +}; + BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb) { unsigned int iphdr_len; @@ -5603,6 +5620,8 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #ifdef CONFIG_INET case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; + case BPF_FUNC_get_listener_sock: + return &bpf_get_listener_sock_proto; case BPF_FUNC_skb_ecn_set_ce: return &bpf_skb_ecn_set_ce_proto; #endif @@ -5698,6 +5717,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_release_proto; case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; + case BPF_FUNC_get_listener_sock: + return &bpf_get_listener_sock_proto; #endif default: return bpf_base_func_proto(func_id); -- cgit v1.3-7-g2ca7 From 62369db2df8d1edfa040878203b446e023a16802 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Thu, 14 Mar 2019 12:38:39 +0000 Subject: bpf: fix documentation for eBPF helpers Another round of minor fixes for the documentation of the BPF helpers located in the UAPI bpf.h header file. Changes include: - Moving around description of some helpers, to keep the descriptions in the same order as helpers are declared (bpf_map_push_elem(), leftover from commit 90b1023f68c7 ("bpf: fix documentation for eBPF helpers"), bpf_rc_keydown(), and bpf_skb_ancestor_cgroup_id()). - Fixing typos ("contex" -> "context"). - Harmonising return types ("void* " -> "void *", "uint64_t" -> "u64"). - Addition of the "bpf_" prefix to bpf_get_storage(). - Light additions of RST markup on some keywords. - Empty line deletion between description and return value for bpf_tcp_sock(). - Edit for the description for bpf_skb_ecn_set_ce() (capital letters, acronym expansion, no effect if ECT not set, more details on return value). Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 128 ++++++++++++++++++++++++----------------------- 1 file changed, 65 insertions(+), 63 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 983b25cb608d..4465d00d3493 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -502,16 +502,6 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) - * Description - * Push an element *value* in *map*. *flags* is one of: - * - * **BPF_EXIST** - * If the queue/stack is full, the oldest element is removed to - * make room for this. - * Return - * 0 on success, or a negative error in case of failure. - * * int bpf_probe_read(void *dst, u32 size, const void *src) * Description * For tracing programs, safely attempt to read *size* bytes from @@ -1435,14 +1425,14 @@ union bpf_attr { * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx) * Description * Equivalent to bpf_get_socket_cookie() helper that accepts - * *skb*, but gets socket from **struct bpf_sock_addr** contex. + * *skb*, but gets socket from **struct bpf_sock_addr** context. * Return * A 8-byte long non-decreasing number. * * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx) * Description * Equivalent to bpf_get_socket_cookie() helper that accepts - * *skb*, but gets socket from **struct bpf_sock_ops** contex. + * *skb*, but gets socket from **struct bpf_sock_ops** context. * Return * A 8-byte long non-decreasing number. * @@ -2098,52 +2088,52 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) + * int bpf_rc_repeat(void *ctx) * Description * This helper is used in programs implementing IR decoding, to - * report a successfully decoded key press with *scancode*, - * *toggle* value in the given *protocol*. The scancode will be - * translated to a keycode using the rc keymap, and reported as - * an input key down event. After a period a key up event is - * generated. This period can be extended by calling either - * **bpf_rc_keydown**\ () again with the same values, or calling - * **bpf_rc_repeat**\ (). + * report a successfully decoded repeat key message. This delays + * the generation of a key up event for previously generated + * key down event. * - * Some protocols include a toggle bit, in case the button was - * released and pressed again between consecutive scancodes. + * Some IR protocols like NEC have a special IR message for + * repeating last button, for when a button is held down. * * The *ctx* should point to the lirc sample as passed into * the program. * - * The *protocol* is the decoded protocol number (see - * **enum rc_proto** for some predefined values). - * * This helper is only available is the kernel was compiled with * the **CONFIG_BPF_LIRC_MODE2** configuration option set to * "**y**". * Return * 0 * - * int bpf_rc_repeat(void *ctx) + * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) * Description * This helper is used in programs implementing IR decoding, to - * report a successfully decoded repeat key message. This delays - * the generation of a key up event for previously generated - * key down event. + * report a successfully decoded key press with *scancode*, + * *toggle* value in the given *protocol*. The scancode will be + * translated to a keycode using the rc keymap, and reported as + * an input key down event. After a period a key up event is + * generated. This period can be extended by calling either + * **bpf_rc_keydown**\ () again with the same values, or calling + * **bpf_rc_repeat**\ (). * - * Some IR protocols like NEC have a special IR message for - * repeating last button, for when a button is held down. + * Some protocols include a toggle bit, in case the button was + * released and pressed again between consecutive scancodes. * * The *ctx* should point to the lirc sample as passed into * the program. * + * The *protocol* is the decoded protocol number (see + * **enum rc_proto** for some predefined values). + * * This helper is only available is the kernel was compiled with * the **CONFIG_BPF_LIRC_MODE2** configuration option set to * "**y**". * Return * 0 * - * uint64_t bpf_skb_cgroup_id(struct sk_buff *skb) + * u64 bpf_skb_cgroup_id(struct sk_buff *skb) * Description * Return the cgroup v2 id of the socket associated with the *skb*. * This is roughly similar to the **bpf_get_cgroup_classid**\ () @@ -2159,30 +2149,12 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level) - * Description - * Return id of cgroup v2 that is ancestor of cgroup associated - * with the *skb* at the *ancestor_level*. The root cgroup is at - * *ancestor_level* zero and each step down the hierarchy - * increments the level. If *ancestor_level* == level of cgroup - * associated with *skb*, then return value will be same as that - * of **bpf_skb_cgroup_id**\ (). - * - * The helper is useful to implement policies based on cgroups - * that are upper in hierarchy than immediate cgroup associated - * with *skb*. - * - * The format of returned id and helper limitations are same as in - * **bpf_skb_cgroup_id**\ (). - * Return - * The id is returned or 0 in case the id could not be retrieved. - * * u64 bpf_get_current_cgroup_id(void) * Return * A 64-bit integer containing the current cgroup id based * on the cgroup within which the current task is running. * - * void* get_local_storage(void *map, u64 flags) + * void *bpf_get_local_storage(void *map, u64 flags) * Description * Get the pointer to the local storage area. * The type and the size of the local storage is defined @@ -2209,6 +2181,24 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *skb* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *skb*, then return value will be same as that + * of **bpf_skb_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *skb*. + * + * The format of returned id and helper limitations are same as in + * **bpf_skb_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. + * * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) * Description * Look for TCP socket matching *tuple*, optionally in a child @@ -2289,6 +2279,16 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) + * Description + * Push an element *value* in *map*. *flags* is one of: + * + * **BPF_EXIST** + * If the queue/stack is full, the oldest element is + * removed to make room for this. + * Return + * 0 on success, or a negative error in case of failure. + * * int bpf_map_pop_elem(struct bpf_map *map, void *value) * Description * Pop an element from *map*. @@ -2346,33 +2346,35 @@ union bpf_attr { * struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk) * Description * This helper gets a **struct bpf_sock** pointer such - * that all the fields in bpf_sock can be accessed. + * that all the fields in this **bpf_sock** can be accessed. * Return - * A **struct bpf_sock** pointer on success, or NULL in + * A **struct bpf_sock** pointer on success, or **NULL** in * case of failure. * * struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk) * Description * This helper gets a **struct bpf_tcp_sock** pointer from a * **struct bpf_sock** pointer. - * * Return - * A **struct bpf_tcp_sock** pointer on success, or NULL in + * A **struct bpf_tcp_sock** pointer on success, or **NULL** in * case of failure. * * int bpf_skb_ecn_set_ce(struct sk_buf *skb) - * Description - * Sets ECN of IP header to ce (congestion encountered) if - * current value is ect (ECN capable). Works with IPv6 and IPv4. - * Return - * 1 if set, 0 if not set. + * Description + * Set ECN (Explicit Congestion Notification) field of IP header + * to **CE** (Congestion Encountered) if current value is **ECT** + * (ECN Capable Transport). Otherwise, do nothing. Works with IPv6 + * and IPv4. + * Return + * 1 if the **CE** flag is set (either by the current helper call + * or because it was already present), 0 if it is not set. * * struct bpf_sock *bpf_get_listener_sock(struct bpf_sock *sk) * Description - * Return a **struct bpf_sock** pointer in TCP_LISTEN state. - * bpf_sk_release() is unnecessary and not allowed. + * Return a **struct bpf_sock** pointer in **TCP_LISTEN** state. + * **bpf_sk_release**\ () is unnecessary and not allowed. * Return - * A **struct bpf_sock** pointer on success, or NULL in + * A **struct bpf_sock** pointer on success, or **NULL** in * case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ -- cgit v1.3-7-g2ca7 From 0eb0978528d47699edd091dc2c337952ad8da436 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Thu, 14 Mar 2019 12:38:40 +0000 Subject: bpf: add documentation for helpers bpf_spin_lock(), bpf_spin_unlock() Add documentation for the BPF spinlock-related helpers to the doc in bpf.h. I added the constraints and restrictions coming with the use of spinlocks for BPF: not all of it is directly related to the use of the helper, but I thought it would be nice for users to find them in the man page. This list of restrictions is nearly a verbatim copy of the list in Alexei's commit log for those helpers. Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4465d00d3493..929c8e537a14 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2343,6 +2343,61 @@ union bpf_attr { * Return * 0 * + * int bpf_spin_lock(struct bpf_spin_lock *lock) + * Description + * Acquire a spinlock represented by the pointer *lock*, which is + * stored as part of a value of a map. Taking the lock allows to + * safely update the rest of the fields in that value. The + * spinlock can (and must) later be released with a call to + * **bpf_spin_unlock**\ (\ *lock*\ ). + * + * Spinlocks in BPF programs come with a number of restrictions + * and constraints: + * + * * **bpf_spin_lock** objects are only allowed inside maps of + * types **BPF_MAP_TYPE_HASH** and **BPF_MAP_TYPE_ARRAY** (this + * list could be extended in the future). + * * BTF description of the map is mandatory. + * * The BPF program can take ONE lock at a time, since taking two + * or more could cause dead locks. + * * Only one **struct bpf_spin_lock** is allowed per map element. + * * When the lock is taken, calls (either BPF to BPF or helpers) + * are not allowed. + * * The **BPF_LD_ABS** and **BPF_LD_IND** instructions are not + * allowed inside a spinlock-ed region. + * * The BPF program MUST call **bpf_spin_unlock**\ () to release + * the lock, on all execution paths, before it returns. + * * The BPF program can access **struct bpf_spin_lock** only via + * the **bpf_spin_lock**\ () and **bpf_spin_unlock**\ () + * helpers. Loading or storing data into the **struct + * bpf_spin_lock** *lock*\ **;** field of a map is not allowed. + * * To use the **bpf_spin_lock**\ () helper, the BTF description + * of the map value must be a struct and have **struct + * bpf_spin_lock** *anyname*\ **;** field at the top level. + * Nested lock inside another struct is not allowed. + * * The **struct bpf_spin_lock** *lock* field in a map value must + * be aligned on a multiple of 4 bytes in that value. + * * Syscall with command **BPF_MAP_LOOKUP_ELEM** does not copy + * the **bpf_spin_lock** field to user space. + * * Syscall with command **BPF_MAP_UPDATE_ELEM**, or update from + * a BPF program, do not update the **bpf_spin_lock** field. + * * **bpf_spin_lock** cannot be on the stack or inside a + * networking packet (it can only be inside of a map values). + * * **bpf_spin_lock** is available to root only. + * * Tracing programs and socket filter programs cannot use + * **bpf_spin_lock**\ () due to insufficient preemption checks + * (but this may change in the future). + * * **bpf_spin_lock** is not allowed in inner maps of map-in-map. + * Return + * 0 + * + * int bpf_spin_unlock(struct bpf_spin_lock *lock) + * Description + * Release the *lock* previously locked by a call to + * **bpf_spin_lock**\ (\ *lock*\ ). + * Return + * 0 + * * struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk) * Description * This helper gets a **struct bpf_sock** pointer such -- cgit v1.3-7-g2ca7