Skip to content

Commit

Permalink
Use pin_user_pages API for Direct I/O requests
Browse files Browse the repository at this point in the history
As of kernel v5.8, the pin_user_pages* interfaces were introduced. These
interfaces use the FOLL_PIN flag. This is now the preferred interface
for Direct I/O requests in the kernel. The reasoning for using this new
interface for Direct I/O requests is explained in the kernel
documentation:
Documentation/core-api/pin_user_pages.rst

If pin_user_pages_unlocked() is available, then all Direct I/O requests
will use this new API to stay up to date with the kernel API requirements.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Brian Atkinson <batkinson@lanl.gov>
Closes #16856
  • Loading branch information
bwatkinson authored and behlendorf committed Dec 16, 2024
1 parent c6442bd commit 882a809
Show file tree
Hide file tree
Showing 4 changed files with 148 additions and 41 deletions.
33 changes: 33 additions & 0 deletions config/kernel-pin-user-pages.m4
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
dnl #
dnl # Check for pin_user_pages_unlocked().
dnl #
dnl # Registers the compile test source named pin_user_pages_unlocked.
dnl # The test includes <linux/mm.h> and calls
dnl # pin_user_pages_unlocked(start, nr_pages, pages, gup_flags), storing
dnl # the result in a long. The outcome is consumed by
dnl # ZFS_LINUX_TEST_RESULT([pin_user_pages_unlocked], ...).
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_PIN_USER_PAGES], [
ZFS_LINUX_TEST_SRC([pin_user_pages_unlocked], [
#include <linux/mm.h>
],[
unsigned long start = 0;
unsigned long nr_pages = 1;
struct page **pages = NULL;
unsigned int gup_flags = 0;
long ret __attribute__ ((unused));
ret = pin_user_pages_unlocked(start, nr_pages, pages,
gup_flags);
])
])

AC_DEFUN([ZFS_AC_KERNEL_PIN_USER_PAGES], [
dnl #
dnl # Kernel 5.8 introduced the pin_user_pages* interfaces which should
dnl # be used for Direct I/O requests.
dnl #
dnl # HAVE_PIN_USER_PAGES_UNLOCKED is defined when the
dnl # pin_user_pages_unlocked compile test succeeds; nothing is defined
dnl # otherwise.
dnl #
AC_MSG_CHECKING([whether pin_user_pages_unlocked() is available])
ZFS_LINUX_TEST_RESULT([pin_user_pages_unlocked], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_PIN_USER_PAGES_UNLOCKED, 1,
[pin_user_pages_unlocked() is available])
],[
AC_MSG_RESULT(no)
])
])
43 changes: 19 additions & 24 deletions config/kernel-vfs-iov_iter.m4
Original file line number Diff line number Diff line change
Expand Up @@ -13,26 +13,21 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_IOV_ITER], [
error = fault_in_iov_iter_readable(&iter, size);
])
ZFS_LINUX_TEST_SRC([iov_iter_get_pages2], [
ZFS_LINUX_TEST_SRC([iov_iter_type], [
#include <linux/fs.h>
#include <linux/uio.h>
],[
struct iov_iter iter = { 0 };
struct page **pages = NULL;
size_t maxsize = 4096;
unsigned maxpages = 1;
size_t start;
size_t ret __attribute__ ((unused));
ret = iov_iter_get_pages2(&iter, pages, maxsize, maxpages,
&start);
__attribute__((unused)) enum iter_type i = iov_iter_type(&iter);
])
ZFS_LINUX_TEST_SRC([iov_iter_type], [
#include <linux/fs.h>
ZFS_LINUX_TEST_SRC([iter_is_ubuf], [
#include <linux/uio.h>
],[
struct iov_iter iter = { 0 };
__attribute__((unused)) enum iter_type i = iov_iter_type(&iter);
bool ret __attribute__((unused));
ret = iter_is_ubuf(&iter);
])
ZFS_LINUX_TEST_SRC([iter_iov], [
Expand All @@ -55,18 +50,6 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_IOV_ITER], [
AC_MSG_RESULT(no)
])
dnl #
dnl # Kernel 6.0 changed iov_iter_get_pages() to iov_iter_get_pages2().
dnl #
AC_MSG_CHECKING([whether iov_iter_get_pages2() is available])
ZFS_LINUX_TEST_RESULT([iov_iter_get_pages2], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_IOV_ITER_GET_PAGES2, 1,
[iov_iter_get_pages2() is available])
],[
AC_MSG_RESULT(no)
])
dnl #
dnl # This checks for iov_iter_type() in linux/uio.h. It is not
dnl # required, however, and the module will compile without it
Expand All @@ -81,6 +64,18 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_IOV_ITER], [
AC_MSG_RESULT(no)
])
dnl #
dnl # Kernel 6.0 introduced the ITER_UBUF iov_iter type. iter_is_ubuf()
dnl # was also added to determine if the iov_iter is an ITER_UBUF.
dnl #
AC_MSG_CHECKING([whether iter_is_ubuf() is available])
ZFS_LINUX_TEST_RESULT([iter_is_ubuf], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_ITER_IS_UBUF, 1, [iter_is_ubuf() is available])
],[
AC_MSG_RESULT(no)
])
dnl #
dnl # Kernel 6.5 introduces the iter_iov() function that returns the
dnl # __iov member of an iov_iter*. The iov member was renamed to this
Expand Down
2 changes: 2 additions & 0 deletions config/kernel.m4
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE
ZFS_AC_KERNEL_SRC_MM_PAGE_MAPPING
ZFS_AC_KERNEL_SRC_FILE
ZFS_AC_KERNEL_SRC_PIN_USER_PAGES
case "$host_cpu" in
powerpc*)
ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE
Expand Down Expand Up @@ -238,6 +239,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
ZFS_AC_KERNEL_MM_PAGE_MAPPING
ZFS_AC_KERNEL_1ARG_ASSIGN_STR
ZFS_AC_KERNEL_FILE
ZFS_AC_KERNEL_PIN_USER_PAGES
case "$host_cpu" in
powerpc*)
ZFS_AC_KERNEL_CPU_HAS_FEATURE
Expand Down
111 changes: 94 additions & 17 deletions module/os/linux/zfs/zfs_uio.c
Original file line number Diff line number Diff line change
Expand Up @@ -441,6 +441,7 @@ zfs_unmark_page(struct page *page)
}
#endif /* HAVE_ZERO_PAGE_GPL_ONLY || !_LP64 */

#if !defined(HAVE_PIN_USER_PAGES_UNLOCKED)
static void
zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio)
{
Expand Down Expand Up @@ -472,6 +473,7 @@ zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio)
}
}
}
#endif

void
zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
Expand All @@ -480,6 +482,9 @@ zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
ASSERT(uio->uio_extflg & UIO_DIRECT);
ASSERT3P(uio->uio_dio.pages, !=, NULL);

#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
unpin_user_pages(uio->uio_dio.pages, uio->uio_dio.npages);
#else
for (long i = 0; i < uio->uio_dio.npages; i++) {
struct page *p = uio->uio_dio.pages[i];

Expand All @@ -491,51 +496,114 @@ zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)

put_page(p);
}

#endif
vmem_free(uio->uio_dio.pages,
uio->uio_dio.npages * sizeof (struct page *));
}

#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
/*
 * Pin the user pages described by uio for a Direct I/O request using
 * pin_user_pages_unlocked(). Page pointers are stored in
 * uio->uio_dio.pages starting at index uio->uio_dio.npages, and
 * uio->uio_dio.npages is advanced by the number of pages pinned.
 *
 * Returns 0 on success (including when there is nothing to pin), the
 * error reported by pin_user_pages_unlocked() as a positive value when
 * pinning fails, or EFAULT when fewer bytes were pinned than requested.
 * In the EFAULT case the partially pinned pages are still counted in
 * uio->uio_dio.npages.
 */
static int
zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw)
zfs_uio_pin_user_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
long res;
size_t skip = uio->uio_skip;
size_t len = uio->uio_resid - skip;
unsigned int gup_flags = 0;
unsigned long addr;
unsigned long nr_pages;

/*
* Kernel 6.2 introduced the FOLL_PCI_P2PDMA flag. This flag could
* possibly be used here in the future to allow for P2P operations with
* user pages.
*/
/*
 * UIO_READ requests pin with FOLL_WRITE; presumably because a read
 * stores data into the user buffer.
 */
if (rw == UIO_READ)
gup_flags = FOLL_WRITE;

/* Nothing left to transfer, so there are no pages to pin. */
if (len == 0)
return (0);

#if defined(HAVE_ITER_IS_UBUF)
/*
 * ITER_UBUF iterators expose one user buffer address, so a single
 * pin call covers the remaining len bytes.
 */
if (iter_is_ubuf(uio->uio_iter)) {
nr_pages = DIV_ROUND_UP(len, PAGE_SIZE);
addr = (unsigned long)uio->uio_iter->ubuf + skip;
res = pin_user_pages_unlocked(addr, nr_pages,
&uio->uio_dio.pages[uio->uio_dio.npages], gup_flags);
if (res < 0) {
return (SET_ERROR(-res));
} else if (len != (res * PAGE_SIZE)) {
/*
 * Short pin: account for what was pinned before failing.
 * NOTE(review): the comparison treats len as a multiple of
 * PAGE_SIZE - confirm callers only pass page-aligned requests.
 */
uio->uio_dio.npages += res;
return (SET_ERROR(EFAULT));
}
uio->uio_dio.npages += res;
return (0);
}
#endif
/*
 * Otherwise walk the iovec array. skip (uio_skip) is only applied to
 * the first iovec visited and is cleared for every later one.
 */
const struct iovec *iovp = zfs_uio_iter_iov(uio->uio_iter);
for (int i = 0; i < uio->uio_iovcnt; i++) {
size_t amt = iovp->iov_len - skip;
/* Nothing to pin in this iovec, move on to the next one. */
if (amt == 0) {
iovp++;
skip = 0;
continue;
}

addr = (unsigned long)iovp->iov_base + skip;
nr_pages = DIV_ROUND_UP(amt, PAGE_SIZE);
res = pin_user_pages_unlocked(addr, nr_pages,
&uio->uio_dio.pages[uio->uio_dio.npages], gup_flags);
if (res < 0) {
return (SET_ERROR(-res));
} else if (amt != (res * PAGE_SIZE)) {
/* Short pin: account for what was pinned before failing. */
uio->uio_dio.npages += res;
return (SET_ERROR(EFAULT));
}

len -= amt;
uio->uio_dio.npages += res;
skip = 0;
iovp++;
};

/* Every requested byte must have been covered by the iovecs. */
ASSERT0(len);

return (0);
}

#else
/*
 * Fallback used when HAVE_PIN_USER_PAGES_UNLOCKED is not defined: gather
 * the pages backing uio->uio_iter for a Direct I/O request, storing them
 * in uio->uio_dio.pages and counting them in uio->uio_dio.npages.
 *
 * The loop runs until uio_resid - uio_skip bytes have been covered. On a
 * negative return from the page-gathering call the iterator is rewound by
 * the bytes consumed so far and the error is returned as a positive value.
 * On success the iterator is rewound by the full amount and 0 is returned.
 *
 * NOTE(review): this view shows more than one variant of the
 * page-gathering call (iov_iter_get_pages2() and iov_iter_get_pages(),
 * with &skip and &start) - confirm which one is live in the tree.
 */
static int
zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
size_t start;
size_t wanted = uio->uio_resid - uio->uio_skip;
ssize_t rollback = 0;
ssize_t cnt;
unsigned maxpages = DIV_ROUND_UP(wanted, PAGE_SIZE);

while (wanted) {
#if defined(HAVE_IOV_ITER_GET_PAGES2)
cnt = iov_iter_get_pages2(uio->uio_iter,
&uio->uio_dio.pages[uio->uio_dio.npages],
wanted, maxpages, &skip);
#else
cnt = iov_iter_get_pages(uio->uio_iter,
&uio->uio_dio.pages[uio->uio_dio.npages],
wanted, maxpages, &skip);
#endif
wanted, maxpages, &start);
if (cnt < 0) {
/* Undo the iterator movement made by earlier iterations. */
iov_iter_revert(uio->uio_iter, rollback);
return (SET_ERROR(-cnt));
}
/*
* All Direct I/O operations must be page aligned.
*/
ASSERT(IS_P2ALIGNED(start, PAGE_SIZE));
uio->uio_dio.npages += DIV_ROUND_UP(cnt, PAGE_SIZE);
rollback += cnt;
wanted -= cnt;
skip = 0;
#if !defined(HAVE_IOV_ITER_GET_PAGES2)
/*
* iov_iter_get_pages2() advances the iov_iter on success.
*/
iov_iter_advance(uio->uio_iter, cnt);
#endif

}
/* All requested bytes were gathered; rewind the iterator by that amount. */
ASSERT3U(rollback, ==, uio->uio_resid - uio->uio_skip);
iov_iter_revert(uio->uio_iter, rollback);

return (0);
}
#endif /* HAVE_PIN_USER_PAGES_UNLOCKED */

/*
* This function pins user pages. In the event that the user pages were not
Expand All @@ -552,25 +620,34 @@ zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw)

if (uio->uio_segflg == UIO_ITER) {
uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP);
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
error = zfs_uio_pin_user_pages(uio, rw);
#else
error = zfs_uio_get_dio_pages_iov_iter(uio, rw);
#endif
} else {
return (SET_ERROR(EOPNOTSUPP));
}

ASSERT3S(uio->uio_dio.npages, >=, 0);

if (error) {
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
unpin_user_pages(uio->uio_dio.pages, uio->uio_dio.npages);
#else
for (long i = 0; i < uio->uio_dio.npages; i++)
put_page(uio->uio_dio.pages[i]);
#endif
vmem_free(uio->uio_dio.pages, size);
return (error);
} else {
ASSERT3S(uio->uio_dio.npages, ==, npages);
}

if (rw == UIO_WRITE) {
#if !defined(HAVE_PIN_USER_PAGES_UNLOCKED)
if (rw == UIO_WRITE)
zfs_uio_dio_check_for_zero_page(uio);
}
#endif

uio->uio_extflg |= UIO_DIRECT;

Expand Down

0 comments on commit 882a809

Please sign in to comment.