Diffstat (limited to 'drivers/staging/lustre/lustre/osc')
-rw-r--r--  drivers/staging/lustre/lustre/osc/Makefile          |    6
-rw-r--r--  drivers/staging/lustre/lustre/osc/lproc_osc.c       |  843
-rw-r--r--  drivers/staging/lustre/lustre/osc/osc_cache.c       | 3306
-rw-r--r--  drivers/staging/lustre/lustre/osc/osc_cl_internal.h |  683
-rw-r--r--  drivers/staging/lustre/lustre/osc/osc_dev.c         |  246
-rw-r--r--  drivers/staging/lustre/lustre/osc/osc_internal.h    |  236
-rw-r--r--  drivers/staging/lustre/lustre/osc/osc_io.c          |  918
-rw-r--r--  drivers/staging/lustre/lustre/osc/osc_lock.c        | 1231
-rw-r--r--  drivers/staging/lustre/lustre/osc/osc_object.c      |  474
-rw-r--r--  drivers/staging/lustre/lustre/osc/osc_page.c        | 1094
-rw-r--r--  drivers/staging/lustre/lustre/osc/osc_quota.c       |  284
-rw-r--r--  drivers/staging/lustre/lustre/osc/osc_request.c     | 2899
12 files changed, 0 insertions(+), 12220 deletions(-)
diff --git a/drivers/staging/lustre/lustre/osc/Makefile b/drivers/staging/lustre/lustre/osc/Makefile
deleted file mode 100644
index 30dec90e64e8..000000000000
--- a/drivers/staging/lustre/lustre/osc/Makefile
+++ /dev/null
@@ -1,6 +0,0 @@
-subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/include
-subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/lustre/include
-
-obj-$(CONFIG_LUSTRE_FS) += osc.o
-osc-y := osc_request.o osc_dev.o osc_object.o \
- osc_page.o osc_lock.o osc_io.o osc_quota.o osc_cache.o lproc_osc.o
diff --git a/drivers/staging/lustre/lustre/osc/lproc_osc.c b/drivers/staging/lustre/lustre/osc/lproc_osc.c
deleted file mode 100644
index dc76c35ae801..000000000000
--- a/drivers/staging/lustre/lustre/osc/lproc_osc.c
+++ /dev/null
@@ -1,843 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.gnu.org/licenses/gpl-2.0.html
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2011, 2015, Intel Corporation.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- */
-#define DEBUG_SUBSYSTEM S_CLASS
-
-#include <linux/statfs.h>
-#include <obd_cksum.h>
-#include <obd_class.h>
-#include <lprocfs_status.h>
-#include <linux/seq_file.h>
-#include "osc_internal.h"
-
-static ssize_t active_show(struct kobject *kobj, struct attribute *attr,
- char *buf)
-{
- struct obd_device *dev = container_of(kobj, struct obd_device,
- obd_kobj);
-
- return sprintf(buf, "%d\n", !dev->u.cli.cl_import->imp_deactive);
-}
-
-static ssize_t active_store(struct kobject *kobj, struct attribute *attr,
- const char *buffer,
- size_t count)
-{
- struct obd_device *dev = container_of(kobj, struct obd_device,
- obd_kobj);
- int rc;
- unsigned long val;
-
- rc = kstrtoul(buffer, 10, &val);
- if (rc)
- return rc;
- if (val > 1)
- return -ERANGE;
-
- /* opposite senses: imp_deactive and val have opposite polarity */
- if (dev->u.cli.cl_import->imp_deactive == val)
- rc = ptlrpc_set_import_active(dev->u.cli.cl_import, val);
- else
- CDEBUG(D_CONFIG, "activate %ld: ignoring repeat request\n",
- val);
-
- return count;
-}
-LUSTRE_RW_ATTR(active);
-
-static ssize_t max_rpcs_in_flight_show(struct kobject *kobj,
- struct attribute *attr,
- char *buf)
-{
- struct obd_device *dev = container_of(kobj, struct obd_device,
- obd_kobj);
- struct client_obd *cli = &dev->u.cli;
-
- return sprintf(buf, "%u\n", cli->cl_max_rpcs_in_flight);
-}
-
-static ssize_t max_rpcs_in_flight_store(struct kobject *kobj,
- struct attribute *attr,
- const char *buffer,
- size_t count)
-{
- struct obd_device *dev = container_of(kobj, struct obd_device,
- obd_kobj);
- struct client_obd *cli = &dev->u.cli;
- int rc;
- unsigned long val;
- int adding, added, req_count;
-
- rc = kstrtoul(buffer, 10, &val);
- if (rc)
- return rc;
-
- if (val < 1 || val > OSC_MAX_RIF_MAX)
- return -ERANGE;
-
- adding = val - cli->cl_max_rpcs_in_flight;
- req_count = atomic_read(&osc_pool_req_count);
- if (adding > 0 && req_count < osc_reqpool_maxreqcount) {
- /*
- * A race here may cause over-limit allocation,
- * but that is acceptable.
- */
- if (req_count + adding > osc_reqpool_maxreqcount)
- adding = osc_reqpool_maxreqcount - req_count;
-
- added = osc_rq_pool->prp_populate(osc_rq_pool, adding);
- atomic_add(added, &osc_pool_req_count);
- }
-
- spin_lock(&cli->cl_loi_list_lock);
- cli->cl_max_rpcs_in_flight = val;
- client_adjust_max_dirty(cli);
- spin_unlock(&cli->cl_loi_list_lock);
-
- return count;
-}
-LUSTRE_RW_ATTR(max_rpcs_in_flight);
-
-static ssize_t max_dirty_mb_show(struct kobject *kobj,
- struct attribute *attr,
- char *buf)
-{
- struct obd_device *dev = container_of(kobj, struct obd_device,
- obd_kobj);
- struct client_obd *cli = &dev->u.cli;
- long val;
- int mult;
-
- spin_lock(&cli->cl_loi_list_lock);
- val = cli->cl_dirty_max_pages;
- spin_unlock(&cli->cl_loi_list_lock);
-
- mult = 1 << (20 - PAGE_SHIFT);
- return lprocfs_read_frac_helper(buf, PAGE_SIZE, val, mult);
-}
-
-static ssize_t max_dirty_mb_store(struct kobject *kobj,
- struct attribute *attr,
- const char *buffer,
- size_t count)
-{
- struct obd_device *dev = container_of(kobj, struct obd_device,
- obd_kobj);
- struct client_obd *cli = &dev->u.cli;
- int rc;
- unsigned long pages_number;
-
- rc = kstrtoul(buffer, 10, &pages_number);
- if (rc)
- return rc;
-
- pages_number *= 1 << (20 - PAGE_SHIFT); /* MB -> pages */
-
- if (pages_number <= 0 ||
- pages_number >= OSC_MAX_DIRTY_MB_MAX << (20 - PAGE_SHIFT) ||
- pages_number > totalram_pages / 4) /* 1/4 of RAM */
- return -ERANGE;
-
- spin_lock(&cli->cl_loi_list_lock);
- cli->cl_dirty_max_pages = pages_number;
- osc_wake_cache_waiters(cli);
- spin_unlock(&cli->cl_loi_list_lock);
-
- return count;
-}
-LUSTRE_RW_ATTR(max_dirty_mb);
-
-static int osc_cached_mb_seq_show(struct seq_file *m, void *v)
-{
- struct obd_device *dev = m->private;
- struct client_obd *cli = &dev->u.cli;
- int shift = 20 - PAGE_SHIFT;
-
- seq_printf(m,
- "used_mb: %ld\n"
- "busy_cnt: %ld\n"
- "reclaim: %llu\n",
- (atomic_long_read(&cli->cl_lru_in_list) +
- atomic_long_read(&cli->cl_lru_busy)) >> shift,
- atomic_long_read(&cli->cl_lru_busy),
- cli->cl_lru_reclaim);
-
- return 0;
-}
-
-/* shrink the number of cached pages to a specific number */
-static ssize_t osc_cached_mb_seq_write(struct file *file,
- const char __user *buffer,
- size_t count, loff_t *off)
-{
- struct obd_device *dev = ((struct seq_file *)file->private_data)->private;
- struct client_obd *cli = &dev->u.cli;
- long pages_number, rc;
- char kernbuf[128];
- int mult;
- u64 val;
-
- if (count >= sizeof(kernbuf))
- return -EINVAL;
-
- if (copy_from_user(kernbuf, buffer, count))
- return -EFAULT;
- kernbuf[count] = 0;
-
- mult = 1 << (20 - PAGE_SHIFT);
- buffer += lprocfs_find_named_value(kernbuf, "used_mb:", &count) -
- kernbuf;
- rc = lprocfs_write_frac_u64_helper(buffer, count, &val, mult);
- if (rc)
- return rc;
-
- if (val > LONG_MAX)
- return -ERANGE;
- pages_number = (long)val;
-
- if (pages_number < 0)
- return -ERANGE;
-
- rc = atomic_long_read(&cli->cl_lru_in_list) - pages_number;
- if (rc > 0) {
- struct lu_env *env;
- u16 refcheck;
-
- env = cl_env_get(&refcheck);
- if (!IS_ERR(env)) {
- (void)osc_lru_shrink(env, cli, rc, true);
- cl_env_put(env, &refcheck);
- }
- }
-
- return count;
-}
-
-LPROC_SEQ_FOPS(osc_cached_mb);
-
-static ssize_t cur_dirty_bytes_show(struct kobject *kobj,
- struct attribute *attr,
- char *buf)
-{
- struct obd_device *dev = container_of(kobj, struct obd_device,
- obd_kobj);
- struct client_obd *cli = &dev->u.cli;
- int len;
-
- spin_lock(&cli->cl_loi_list_lock);
- len = sprintf(buf, "%lu\n", cli->cl_dirty_pages << PAGE_SHIFT);
- spin_unlock(&cli->cl_loi_list_lock);
-
- return len;
-}
-LUSTRE_RO_ATTR(cur_dirty_bytes);
-
-static ssize_t cur_grant_bytes_show(struct kobject *kobj,
- struct attribute *attr,
- char *buf)
-{
- struct obd_device *dev = container_of(kobj, struct obd_device,
- obd_kobj);
- struct client_obd *cli = &dev->u.cli;
- int len;
-
- spin_lock(&cli->cl_loi_list_lock);
- len = sprintf(buf, "%lu\n", cli->cl_avail_grant);
- spin_unlock(&cli->cl_loi_list_lock);
-
- return len;
-}
-
-static ssize_t cur_grant_bytes_store(struct kobject *kobj,
- struct attribute *attr,
- const char *buffer,
- size_t count)
-{
- struct obd_device *obd = container_of(kobj, struct obd_device,
- obd_kobj);
- struct client_obd *cli = &obd->u.cli;
- int rc;
- unsigned long long val;
-
- rc = kstrtoull(buffer, 10, &val);
- if (rc)
- return rc;
-
- /* this is only for shrinking grant */
- spin_lock(&cli->cl_loi_list_lock);
- if (val >= cli->cl_avail_grant) {
- spin_unlock(&cli->cl_loi_list_lock);
- return -EINVAL;
- }
- spin_unlock(&cli->cl_loi_list_lock);
-
- if (cli->cl_import->imp_state == LUSTRE_IMP_FULL)
- rc = osc_shrink_grant_to_target(cli, val);
- if (rc)
- return rc;
- return count;
-}
-LUSTRE_RW_ATTR(cur_grant_bytes);
-
-static ssize_t cur_lost_grant_bytes_show(struct kobject *kobj,
- struct attribute *attr,
- char *buf)
-{
- struct obd_device *dev = container_of(kobj, struct obd_device,
- obd_kobj);
- struct client_obd *cli = &dev->u.cli;
- int len;
-
- spin_lock(&cli->cl_loi_list_lock);
- len = sprintf(buf, "%lu\n", cli->cl_lost_grant);
- spin_unlock(&cli->cl_loi_list_lock);
-
- return len;
-}
-LUSTRE_RO_ATTR(cur_lost_grant_bytes);
-
-static ssize_t grant_shrink_interval_show(struct kobject *kobj,
- struct attribute *attr,
- char *buf)
-{
- struct obd_device *obd = container_of(kobj, struct obd_device,
- obd_kobj);
-
- return sprintf(buf, "%d\n", obd->u.cli.cl_grant_shrink_interval);
-}
-
-static ssize_t grant_shrink_interval_store(struct kobject *kobj,
- struct attribute *attr,
- const char *buffer,
- size_t count)
-{
- struct obd_device *obd = container_of(kobj, struct obd_device,
- obd_kobj);
- int rc;
- unsigned long val;
-
- rc = kstrtoul(buffer, 10, &val);
- if (rc)
- return rc;
-
- if (val <= 0)
- return -ERANGE;
-
- obd->u.cli.cl_grant_shrink_interval = val;
-
- return count;
-}
-LUSTRE_RW_ATTR(grant_shrink_interval);
-
-static ssize_t checksums_show(struct kobject *kobj,
- struct attribute *attr,
- char *buf)
-{
- struct obd_device *obd = container_of(kobj, struct obd_device,
- obd_kobj);
-
- return sprintf(buf, "%d\n", obd->u.cli.cl_checksum ? 1 : 0);
-}
-
-static ssize_t checksums_store(struct kobject *kobj,
- struct attribute *attr,
- const char *buffer,
- size_t count)
-{
- struct obd_device *obd = container_of(kobj, struct obd_device,
- obd_kobj);
- int rc;
- unsigned long val;
-
- rc = kstrtoul(buffer, 10, &val);
- if (rc)
- return rc;
-
- obd->u.cli.cl_checksum = (val ? 1 : 0);
-
- return count;
-}
-LUSTRE_RW_ATTR(checksums);
-
-static int osc_checksum_type_seq_show(struct seq_file *m, void *v)
-{
- struct obd_device *obd = m->private;
- int i;
-
- DECLARE_CKSUM_NAME;
-
- if (!obd)
- return 0;
-
- for (i = 0; i < ARRAY_SIZE(cksum_name); i++) {
- if (((1 << i) & obd->u.cli.cl_supp_cksum_types) == 0)
- continue;
- if (obd->u.cli.cl_cksum_type == (1 << i))
- seq_printf(m, "[%s] ", cksum_name[i]);
- else
- seq_printf(m, "%s ", cksum_name[i]);
- }
- seq_putc(m, '\n');
- return 0;
-}
-
-static ssize_t osc_checksum_type_seq_write(struct file *file,
- const char __user *buffer,
- size_t count, loff_t *off)
-{
- struct obd_device *obd = ((struct seq_file *)file->private_data)->private;
- int i;
-
- DECLARE_CKSUM_NAME;
- char kernbuf[10];
-
- if (!obd)
- return 0;
-
- if (count > sizeof(kernbuf) - 1)
- return -EINVAL;
- if (copy_from_user(kernbuf, buffer, count))
- return -EFAULT;
- if (count > 0 && kernbuf[count - 1] == '\n')
- kernbuf[count - 1] = '\0';
- else
- kernbuf[count] = '\0';
-
- for (i = 0; i < ARRAY_SIZE(cksum_name); i++) {
- if (((1 << i) & obd->u.cli.cl_supp_cksum_types) == 0)
- continue;
- if (!strcmp(kernbuf, cksum_name[i])) {
- obd->u.cli.cl_cksum_type = 1 << i;
- return count;
- }
- }
- return -EINVAL;
-}
-
-LPROC_SEQ_FOPS(osc_checksum_type);
-
-static ssize_t resend_count_show(struct kobject *kobj,
- struct attribute *attr,
- char *buf)
-{
- struct obd_device *obd = container_of(kobj, struct obd_device,
- obd_kobj);
-
- return sprintf(buf, "%u\n", atomic_read(&obd->u.cli.cl_resends));
-}
-
-static ssize_t resend_count_store(struct kobject *kobj,
- struct attribute *attr,
- const char *buffer,
- size_t count)
-{
- struct obd_device *obd = container_of(kobj, struct obd_device,
- obd_kobj);
- int rc;
- unsigned long val;
-
- rc = kstrtoul(buffer, 10, &val);
- if (rc)
- return rc;
-
- atomic_set(&obd->u.cli.cl_resends, val);
-
- return count;
-}
-LUSTRE_RW_ATTR(resend_count);
-
-static ssize_t contention_seconds_show(struct kobject *kobj,
- struct attribute *attr,
- char *buf)
-{
- struct obd_device *obd = container_of(kobj, struct obd_device,
- obd_kobj);
- struct osc_device *od = obd2osc_dev(obd);
-
- return sprintf(buf, "%u\n", od->od_contention_time);
-}
-
-static ssize_t contention_seconds_store(struct kobject *kobj,
- struct attribute *attr,
- const char *buffer,
- size_t count)
-{
- struct obd_device *obd = container_of(kobj, struct obd_device,
- obd_kobj);
- struct osc_device *od = obd2osc_dev(obd);
- int rc;
- int val;
-
- rc = kstrtoint(buffer, 10, &val);
- if (rc)
- return rc;
-
- if (val < 0)
- return -EINVAL;
-
- od->od_contention_time = val;
-
- return count;
-}
-LUSTRE_RW_ATTR(contention_seconds);
-
-static ssize_t lockless_truncate_show(struct kobject *kobj,
- struct attribute *attr,
- char *buf)
-{
- struct obd_device *obd = container_of(kobj, struct obd_device,
- obd_kobj);
- struct osc_device *od = obd2osc_dev(obd);
-
- return sprintf(buf, "%u\n", od->od_lockless_truncate);
-}
-
-static ssize_t lockless_truncate_store(struct kobject *kobj,
- struct attribute *attr,
- const char *buffer,
- size_t count)
-{
- struct obd_device *obd = container_of(kobj, struct obd_device,
- obd_kobj);
- struct osc_device *od = obd2osc_dev(obd);
- int rc;
- unsigned int val;
-
- rc = kstrtouint(buffer, 10, &val);
- if (rc)
- return rc;
-
- od->od_lockless_truncate = val;
-
- return count;
-}
-LUSTRE_RW_ATTR(lockless_truncate);
-
-static ssize_t destroys_in_flight_show(struct kobject *kobj,
- struct attribute *attr,
- char *buf)
-{
- struct obd_device *obd = container_of(kobj, struct obd_device,
- obd_kobj);
-
- return sprintf(buf, "%u\n",
- atomic_read(&obd->u.cli.cl_destroy_in_flight));
-}
-LUSTRE_RO_ATTR(destroys_in_flight);
-
-static ssize_t max_pages_per_rpc_show(struct kobject *kobj,
- struct attribute *attr,
- char *buf)
-{
- struct obd_device *dev = container_of(kobj, struct obd_device,
- obd_kobj);
- struct client_obd *cli = &dev->u.cli;
-
- return sprintf(buf, "%d\n", cli->cl_max_pages_per_rpc);
-}
-
-static ssize_t max_pages_per_rpc_store(struct kobject *kobj,
- struct attribute *attr,
- const char *buffer,
- size_t count)
-{
- struct obd_device *dev = container_of(kobj, struct obd_device,
- obd_kobj);
- struct client_obd *cli = &dev->u.cli;
- struct obd_connect_data *ocd = &cli->cl_import->imp_connect_data;
- int chunk_mask, rc;
- unsigned long long val;
-
- rc = kstrtoull(buffer, 10, &val);
- if (rc)
- return rc;
-
- /* if the max_pages is specified in bytes, convert to pages */
- if (val >= ONE_MB_BRW_SIZE)
- val >>= PAGE_SHIFT;
-
- chunk_mask = ~((1 << (cli->cl_chunkbits - PAGE_SHIFT)) - 1);
- /* max_pages_per_rpc must be chunk aligned */
- val = (val + ~chunk_mask) & chunk_mask;
- if (!val || (ocd->ocd_brw_size &&
- val > ocd->ocd_brw_size >> PAGE_SHIFT)) {
- return -ERANGE;
- }
- spin_lock(&cli->cl_loi_list_lock);
- cli->cl_max_pages_per_rpc = val;
- client_adjust_max_dirty(cli);
- spin_unlock(&cli->cl_loi_list_lock);
-
- return count;
-}
-LUSTRE_RW_ATTR(max_pages_per_rpc);
-
-static ssize_t unstable_stats_show(struct kobject *kobj,
- struct attribute *attr,
- char *buf)
-{
- struct obd_device *dev = container_of(kobj, struct obd_device,
- obd_kobj);
- struct client_obd *cli = &dev->u.cli;
- long pages;
- int mb;
-
- pages = atomic_long_read(&cli->cl_unstable_count);
- mb = (pages * PAGE_SIZE) >> 20;
-
- return sprintf(buf, "unstable_pages: %20ld\n"
- "unstable_mb: %10d\n", pages, mb);
-}
-LUSTRE_RO_ATTR(unstable_stats);
-
-LPROC_SEQ_FOPS_RO_TYPE(osc, connect_flags);
-LPROC_SEQ_FOPS_RO_TYPE(osc, server_uuid);
-LPROC_SEQ_FOPS_RO_TYPE(osc, conn_uuid);
-LPROC_SEQ_FOPS_RO_TYPE(osc, timeouts);
-LPROC_SEQ_FOPS_RO_TYPE(osc, state);
-
-LPROC_SEQ_FOPS_WR_ONLY(osc, ping);
-
-LPROC_SEQ_FOPS_RW_TYPE(osc, import);
-LPROC_SEQ_FOPS_RW_TYPE(osc, pinger_recov);
-
-static struct lprocfs_vars lprocfs_osc_obd_vars[] = {
- { "ping", &osc_ping_fops, NULL, 0222 },
- { "connect_flags", &osc_connect_flags_fops, NULL, 0 },
- /*{ "filegroups", lprocfs_rd_filegroups, NULL, 0 },*/
- { "ost_server_uuid", &osc_server_uuid_fops, NULL, 0 },
- { "ost_conn_uuid", &osc_conn_uuid_fops, NULL, 0 },
- { "osc_cached_mb", &osc_cached_mb_fops, NULL },
- { "checksum_type", &osc_checksum_type_fops, NULL },
- { "timeouts", &osc_timeouts_fops, NULL, 0 },
- { "import", &osc_import_fops, NULL },
- { "state", &osc_state_fops, NULL, 0 },
- { "pinger_recov", &osc_pinger_recov_fops, NULL },
- { NULL }
-};
-
-#define pct(a, b) (b ? a * 100 / b : 0)
-
-static int osc_rpc_stats_seq_show(struct seq_file *seq, void *v)
-{
- struct timespec64 now;
- struct obd_device *dev = seq->private;
- struct client_obd *cli = &dev->u.cli;
- unsigned long read_tot = 0, write_tot = 0, read_cum, write_cum;
- int i;
-
- ktime_get_real_ts64(&now);
-
- spin_lock(&cli->cl_loi_list_lock);
-
- seq_printf(seq, "snapshot_time: %llu.%9lu (secs.usecs)\n",
- (s64)now.tv_sec, (unsigned long)now.tv_nsec);
- seq_printf(seq, "read RPCs in flight: %d\n",
- cli->cl_r_in_flight);
- seq_printf(seq, "write RPCs in flight: %d\n",
- cli->cl_w_in_flight);
- seq_printf(seq, "pending write pages: %d\n",
- atomic_read(&cli->cl_pending_w_pages));
- seq_printf(seq, "pending read pages: %d\n",
- atomic_read(&cli->cl_pending_r_pages));
-
- seq_puts(seq, "\n\t\t\tread\t\t\twrite\n");
- seq_puts(seq, "pages per rpc rpcs % cum % |");
- seq_puts(seq, " rpcs % cum %\n");
-
- read_tot = lprocfs_oh_sum(&cli->cl_read_page_hist);
- write_tot = lprocfs_oh_sum(&cli->cl_write_page_hist);
-
- read_cum = 0;
- write_cum = 0;
- for (i = 0; i < OBD_HIST_MAX; i++) {
- unsigned long r = cli->cl_read_page_hist.oh_buckets[i];
- unsigned long w = cli->cl_write_page_hist.oh_buckets[i];
-
- read_cum += r;
- write_cum += w;
- seq_printf(seq, "%d:\t\t%10lu %3lu %3lu | %10lu %3lu %3lu\n",
- 1 << i, r, pct(r, read_tot),
- pct(read_cum, read_tot), w,
- pct(w, write_tot),
- pct(write_cum, write_tot));
- if (read_cum == read_tot && write_cum == write_tot)
- break;
- }
-
- seq_puts(seq, "\n\t\t\tread\t\t\twrite\n");
- seq_puts(seq, "rpcs in flight rpcs % cum % |");
- seq_puts(seq, " rpcs % cum %\n");
-
- read_tot = lprocfs_oh_sum(&cli->cl_read_rpc_hist);
- write_tot = lprocfs_oh_sum(&cli->cl_write_rpc_hist);
-
- read_cum = 0;
- write_cum = 0;
- for (i = 0; i < OBD_HIST_MAX; i++) {
- unsigned long r = cli->cl_read_rpc_hist.oh_buckets[i];
- unsigned long w = cli->cl_write_rpc_hist.oh_buckets[i];
-
- read_cum += r;
- write_cum += w;
- seq_printf(seq, "%d:\t\t%10lu %3lu %3lu | %10lu %3lu %3lu\n",
- i, r, pct(r, read_tot),
- pct(read_cum, read_tot), w,
- pct(w, write_tot),
- pct(write_cum, write_tot));
- if (read_cum == read_tot && write_cum == write_tot)
- break;
- }
-
- seq_puts(seq, "\n\t\t\tread\t\t\twrite\n");
- seq_puts(seq, "offset rpcs % cum % |");
- seq_puts(seq, " rpcs % cum %\n");
-
- read_tot = lprocfs_oh_sum(&cli->cl_read_offset_hist);
- write_tot = lprocfs_oh_sum(&cli->cl_write_offset_hist);
-
- read_cum = 0;
- write_cum = 0;
- for (i = 0; i < OBD_HIST_MAX; i++) {
- unsigned long r = cli->cl_read_offset_hist.oh_buckets[i];
- unsigned long w = cli->cl_write_offset_hist.oh_buckets[i];
-
- read_cum += r;
- write_cum += w;
- seq_printf(seq, "%d:\t\t%10lu %3lu %3lu | %10lu %3lu %3lu\n",
- (i == 0) ? 0 : 1 << (i - 1),
- r, pct(r, read_tot), pct(read_cum, read_tot),
- w, pct(w, write_tot), pct(write_cum, write_tot));
- if (read_cum == read_tot && write_cum == write_tot)
- break;
- }
-
- spin_unlock(&cli->cl_loi_list_lock);
-
- return 0;
-}
-
-#undef pct
-
-static ssize_t osc_rpc_stats_seq_write(struct file *file,
- const char __user *buf,
- size_t len, loff_t *off)
-{
- struct seq_file *seq = file->private_data;
- struct obd_device *dev = seq->private;
- struct client_obd *cli = &dev->u.cli;
-
- lprocfs_oh_clear(&cli->cl_read_rpc_hist);
- lprocfs_oh_clear(&cli->cl_write_rpc_hist);
- lprocfs_oh_clear(&cli->cl_read_page_hist);
- lprocfs_oh_clear(&cli->cl_write_page_hist);
- lprocfs_oh_clear(&cli->cl_read_offset_hist);
- lprocfs_oh_clear(&cli->cl_write_offset_hist);
-
- return len;
-}
-
-LPROC_SEQ_FOPS(osc_rpc_stats);
-
-static int osc_stats_seq_show(struct seq_file *seq, void *v)
-{
- struct timespec64 now;
- struct obd_device *dev = seq->private;
- struct osc_stats *stats = &obd2osc_dev(dev)->od_stats;
-
- ktime_get_real_ts64(&now);
-
- seq_printf(seq, "snapshot_time: %llu.%9lu (secs.usecs)\n",
- (s64)now.tv_sec, (unsigned long)now.tv_nsec);
- seq_printf(seq, "lockless_write_bytes\t\t%llu\n",
- stats->os_lockless_writes);
- seq_printf(seq, "lockless_read_bytes\t\t%llu\n",
- stats->os_lockless_reads);
- seq_printf(seq, "lockless_truncate\t\t%llu\n",
- stats->os_lockless_truncates);
- return 0;
-}
-
-static ssize_t osc_stats_seq_write(struct file *file,
- const char __user *buf,
- size_t len, loff_t *off)
-{
- struct seq_file *seq = file->private_data;
- struct obd_device *dev = seq->private;
- struct osc_stats *stats = &obd2osc_dev(dev)->od_stats;
-
- memset(stats, 0, sizeof(*stats));
- return len;
-}
-
-LPROC_SEQ_FOPS(osc_stats);
-
-int lproc_osc_attach_seqstat(struct obd_device *dev)
-{
- int rc;
-
- rc = ldebugfs_seq_create(dev->obd_debugfs_entry, "osc_stats", 0644,
- &osc_stats_fops, dev);
- if (rc == 0)
- rc = ldebugfs_obd_seq_create(dev, "rpc_stats", 0644,
- &osc_rpc_stats_fops, dev);
-
- return rc;
-}
-
-static struct attribute *osc_attrs[] = {
- &lustre_attr_active.attr,
- &lustre_attr_checksums.attr,
- &lustre_attr_contention_seconds.attr,
- &lustre_attr_cur_dirty_bytes.attr,
- &lustre_attr_cur_grant_bytes.attr,
- &lustre_attr_cur_lost_grant_bytes.attr,
- &lustre_attr_destroys_in_flight.attr,
- &lustre_attr_grant_shrink_interval.attr,
- &lustre_attr_lockless_truncate.attr,
- &lustre_attr_max_dirty_mb.attr,
- &lustre_attr_max_pages_per_rpc.attr,
- &lustre_attr_max_rpcs_in_flight.attr,
- &lustre_attr_resend_count.attr,
- &lustre_attr_unstable_stats.attr,
- NULL,
-};
-
-static const struct attribute_group osc_attr_group = {
- .attrs = osc_attrs,
-};
-
-void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars)
-{
- lvars->sysfs_vars = &osc_attr_group;
- lvars->obd_vars = lprocfs_osc_obd_vars;
-}
diff --git a/drivers/staging/lustre/lustre/osc/osc_cache.c b/drivers/staging/lustre/lustre/osc/osc_cache.c
deleted file mode 100644
index 459503727ce3..000000000000
--- a/drivers/staging/lustre/lustre/osc/osc_cache.c
+++ /dev/null
@@ -1,3306 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.gnu.org/licenses/gpl-2.0.html
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2012, 2015, Intel Corporation.
- *
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * osc cache management.
- *
- * Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
- */
-
-#define DEBUG_SUBSYSTEM S_OSC
-
-#include "osc_cl_internal.h"
-#include "osc_internal.h"
-
-static int extent_debug; /* set it to true for more debugging */
-
-static void osc_update_pending(struct osc_object *obj, int cmd, int delta);
-static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext,
- enum osc_extent_state state);
-static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli,
- struct osc_async_page *oap, int sent, int rc);
-static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap,
- int cmd);
-static int osc_refresh_count(const struct lu_env *env,
- struct osc_async_page *oap, int cmd);
-static int osc_io_unplug_async(const struct lu_env *env,
- struct client_obd *cli, struct osc_object *osc);
-static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages,
- unsigned int lost_grant);
-
-static void osc_extent_tree_dump0(int level, struct osc_object *obj,
- const char *func, int line);
-#define osc_extent_tree_dump(lvl, obj) \
- osc_extent_tree_dump0(lvl, obj, __func__, __LINE__)
-
-/** \addtogroup osc
- * @{
- */
-
-/* ------------------ osc extent ------------------ */
-static inline char *ext_flags(struct osc_extent *ext, char *flags)
-{
- char *buf = flags;
- *buf++ = ext->oe_rw ? 'r' : 'w';
- if (ext->oe_intree)
- *buf++ = 'i';
- if (ext->oe_sync)
- *buf++ = 'S';
- if (ext->oe_srvlock)
- *buf++ = 's';
- if (ext->oe_hp)
- *buf++ = 'h';
- if (ext->oe_urgent)
- *buf++ = 'u';
- if (ext->oe_memalloc)
- *buf++ = 'm';
- if (ext->oe_trunc_pending)
- *buf++ = 't';
- if (ext->oe_fsync_wait)
- *buf++ = 'Y';
- *buf = 0;
- return flags;
-}
-
-static inline char list_empty_marker(struct list_head *list)
-{
- return list_empty(list) ? '-' : '+';
-}
-
-#define EXTSTR "[%lu -> %lu/%lu]"
-#define EXTPARA(ext) (ext)->oe_start, (ext)->oe_end, (ext)->oe_max_end
-static const char *oes_strings[] = {
- "inv", "active", "cache", "locking", "lockdone", "rpc", "trunc", NULL };
-
-#define OSC_EXTENT_DUMP(lvl, extent, fmt, ...) do { \
- struct osc_extent *__ext = (extent); \
- char __buf[16]; \
- \
- CDEBUG(lvl, \
- "extent %p@{" EXTSTR ", " \
- "[%d|%d|%c|%s|%s|%p], [%d|%d|%c|%c|%p|%u|%p]} " fmt, \
- /* ----- extent part 0 ----- */ \
- __ext, EXTPARA(__ext), \
- /* ----- part 1 ----- */ \
- atomic_read(&__ext->oe_refc), \
- atomic_read(&__ext->oe_users), \
- list_empty_marker(&__ext->oe_link), \
- oes_strings[__ext->oe_state], ext_flags(__ext, __buf), \
- __ext->oe_obj, \
- /* ----- part 2 ----- */ \
- __ext->oe_grants, __ext->oe_nr_pages, \
- list_empty_marker(&__ext->oe_pages), \
- waitqueue_active(&__ext->oe_waitq) ? '+' : '-', \
- __ext->oe_dlmlock, __ext->oe_mppr, __ext->oe_owner, \
- /* ----- part 3 ----- */ \
- ## __VA_ARGS__); \
- if (lvl == D_ERROR && __ext->oe_dlmlock) \
- LDLM_ERROR(__ext->oe_dlmlock, "extent: %p", __ext); \
- else \
- LDLM_DEBUG(__ext->oe_dlmlock, "extent: %p", __ext); \
-} while (0)
-
-#undef EASSERTF
-#define EASSERTF(expr, ext, fmt, args...) do { \
- if (!(expr)) { \
- OSC_EXTENT_DUMP(D_ERROR, (ext), fmt, ##args); \
- osc_extent_tree_dump(D_ERROR, (ext)->oe_obj); \
- LASSERT(expr); \
- } \
-} while (0)
-
-#undef EASSERT
-#define EASSERT(expr, ext) EASSERTF(expr, ext, "\n")
-
-static inline struct osc_extent *rb_extent(struct rb_node *n)
-{
- return rb_entry_safe(n, struct osc_extent, oe_node);
-}
-
-static inline struct osc_extent *next_extent(struct osc_extent *ext)
-{
- if (!ext)
- return NULL;
-
- LASSERT(ext->oe_intree);
- return rb_extent(rb_next(&ext->oe_node));
-}
-
-static inline struct osc_extent *prev_extent(struct osc_extent *ext)
-{
- if (!ext)
- return NULL;
-
- LASSERT(ext->oe_intree);
- return rb_extent(rb_prev(&ext->oe_node));
-}
-
-static inline struct osc_extent *first_extent(struct osc_object *obj)
-{
- return rb_extent(rb_first(&obj->oo_root));
-}
-
-/* object must be locked by caller. */
-static int osc_extent_sanity_check0(struct osc_extent *ext,
- const char *func, const int line)
-{
- struct osc_object *obj = ext->oe_obj;
- struct osc_async_page *oap;
- size_t page_count;
- int rc = 0;
-
- if (!osc_object_is_locked(obj)) {
- rc = 9;
- goto out;
- }
-
- if (ext->oe_state >= OES_STATE_MAX) {
- rc = 10;
- goto out;
- }
-
- if (atomic_read(&ext->oe_refc) <= 0) {
- rc = 20;
- goto out;
- }
-
- if (atomic_read(&ext->oe_refc) < atomic_read(&ext->oe_users)) {
- rc = 30;
- goto out;
- }
-
- switch (ext->oe_state) {
- case OES_INV:
- if (ext->oe_nr_pages > 0 || !list_empty(&ext->oe_pages))
- rc = 35;
- else
- rc = 0;
- goto out;
- case OES_ACTIVE:
- if (atomic_read(&ext->oe_users) == 0) {
- rc = 40;
- goto out;
- }
- if (ext->oe_hp) {
- rc = 50;
- goto out;
- }
- if (ext->oe_fsync_wait && !ext->oe_urgent) {
- rc = 55;
- goto out;
- }
- break;
- case OES_CACHE:
- if (ext->oe_grants == 0) {
- rc = 60;
- goto out;
- }
- if (ext->oe_fsync_wait && !ext->oe_urgent && !ext->oe_hp) {
- rc = 65;
- goto out;
- }
- /* fall through */
- default:
- if (atomic_read(&ext->oe_users) > 0) {
- rc = 70;
- goto out;
- }
- }
-
- if (ext->oe_max_end < ext->oe_end || ext->oe_end < ext->oe_start) {
- rc = 80;
- goto out;
- }
-
- if (ext->oe_sync && ext->oe_grants > 0) {
- rc = 90;
- goto out;
- }
-
- if (ext->oe_dlmlock && !ldlm_is_failed(ext->oe_dlmlock)) {
- struct ldlm_extent *extent;
-
- extent = &ext->oe_dlmlock->l_policy_data.l_extent;
- if (!(extent->start <= cl_offset(osc2cl(obj), ext->oe_start) &&
- extent->end >= cl_offset(osc2cl(obj), ext->oe_max_end))) {
- rc = 100;
- goto out;
- }
-
- if (!(ext->oe_dlmlock->l_granted_mode & (LCK_PW | LCK_GROUP))) {
- rc = 102;
- goto out;
- }
- }
-
- if (ext->oe_nr_pages > ext->oe_mppr) {
- rc = 105;
- goto out;
- }
-
- /* Do not verify page list if extent is in RPC. This is because an
- * in-RPC extent is supposed to be exclusively accessible w/o lock.
- */
- if (ext->oe_state > OES_CACHE) {
- rc = 0;
- goto out;
- }
-
- if (!extent_debug) {
- rc = 0;
- goto out;
- }
-
- page_count = 0;
- list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
- pgoff_t index = osc_index(oap2osc(oap));
- ++page_count;
- if (index > ext->oe_end || index < ext->oe_start) {
- rc = 110;
- goto out;
- }
- }
- if (page_count != ext->oe_nr_pages) {
- rc = 120;
- goto out;
- }
-
-out:
- if (rc != 0)
- OSC_EXTENT_DUMP(D_ERROR, ext,
- "%s:%d sanity check %p failed with rc = %d\n",
- func, line, ext, rc);
- return rc;
-}
-
-#define sanity_check_nolock(ext) \
- osc_extent_sanity_check0(ext, __func__, __LINE__)
-
-#define sanity_check(ext) ({ \
- int __res; \
- osc_object_lock((ext)->oe_obj); \
- __res = sanity_check_nolock(ext); \
- osc_object_unlock((ext)->oe_obj); \
- __res; \
-})
-
-/**
- * Sanity check: make sure there are no overlapping extents in the tree.
- */
-static int osc_extent_is_overlapped(struct osc_object *obj,
- struct osc_extent *ext)
-{
- struct osc_extent *tmp;
-
- LASSERT(osc_object_is_locked(obj));
-
- if (!extent_debug)
- return 0;
-
- for (tmp = first_extent(obj); tmp; tmp = next_extent(tmp)) {
- if (tmp == ext)
- continue;
- if (tmp->oe_end >= ext->oe_start &&
- tmp->oe_start <= ext->oe_end)
- return 1;
- }
- return 0;
-}
-
-static void osc_extent_state_set(struct osc_extent *ext, int state)
-{
- LASSERT(osc_object_is_locked(ext->oe_obj));
- LASSERT(state >= OES_INV && state < OES_STATE_MAX);
-
- /* Never try to sanity check a state changing extent :-) */
- /* LASSERT(sanity_check_nolock(ext) == 0); */
-
- /* TODO: validate the state machine */
- ext->oe_state = state;
- wake_up_all(&ext->oe_waitq);
-}
-
-static struct osc_extent *osc_extent_alloc(struct osc_object *obj)
-{
- struct osc_extent *ext;
-
- ext = kmem_cache_zalloc(osc_extent_kmem, GFP_NOFS);
- if (!ext)
- return NULL;
-
- RB_CLEAR_NODE(&ext->oe_node);
- ext->oe_obj = obj;
- cl_object_get(osc2cl(obj));
- atomic_set(&ext->oe_refc, 1);
- atomic_set(&ext->oe_users, 0);
- INIT_LIST_HEAD(&ext->oe_link);
- ext->oe_state = OES_INV;
- INIT_LIST_HEAD(&ext->oe_pages);
- init_waitqueue_head(&ext->oe_waitq);
- ext->oe_dlmlock = NULL;
-
- return ext;
-}
-
-static void osc_extent_free(struct osc_extent *ext)
-{
- kmem_cache_free(osc_extent_kmem, ext);
-}
-
-static struct osc_extent *osc_extent_get(struct osc_extent *ext)
-{
- LASSERT(atomic_read(&ext->oe_refc) >= 0);
- atomic_inc(&ext->oe_refc);
- return ext;
-}
-
-static void osc_extent_put(const struct lu_env *env, struct osc_extent *ext)
-{
- LASSERT(atomic_read(&ext->oe_refc) > 0);
- if (atomic_dec_and_test(&ext->oe_refc)) {
- LASSERT(list_empty(&ext->oe_link));
- LASSERT(atomic_read(&ext->oe_users) == 0);
- LASSERT(ext->oe_state == OES_INV);
- LASSERT(!ext->oe_intree);
-
- if (ext->oe_dlmlock) {
- lu_ref_add(&ext->oe_dlmlock->l_reference,
- "osc_extent", ext);
- LDLM_LOCK_PUT(ext->oe_dlmlock);
- ext->oe_dlmlock = NULL;
- }
- cl_object_put(env, osc2cl(ext->oe_obj));
- osc_extent_free(ext);
- }
-}
-
-/**
- * osc_extent_put_trust() is a special version of osc_extent_put() for use
- * when it's known that the caller is not the last user. This addresses the
- * problem of not having a lu_env at hand ;-).
- */
-static void osc_extent_put_trust(struct osc_extent *ext)
-{
- LASSERT(atomic_read(&ext->oe_refc) > 1);
- LASSERT(osc_object_is_locked(ext->oe_obj));
- atomic_dec(&ext->oe_refc);
-}
-
-/**
- * Return the extent which includes pgoff @index, or return the greatest
- * previous extent in the tree.
- */
-static struct osc_extent *osc_extent_search(struct osc_object *obj,
- pgoff_t index)
-{
- struct rb_node *n = obj->oo_root.rb_node;
- struct osc_extent *tmp, *p = NULL;
-
- LASSERT(osc_object_is_locked(obj));
- while (n) {
- tmp = rb_extent(n);
- if (index < tmp->oe_start) {
- n = n->rb_left;
- } else if (index > tmp->oe_end) {
- p = rb_extent(n);
- n = n->rb_right;
- } else {
- return tmp;
- }
- }
- return p;
-}
-
-/*
- * Return the extent covering @index, otherwise return NULL.
- * caller must have held object lock.
- */
-static struct osc_extent *osc_extent_lookup(struct osc_object *obj,
- pgoff_t index)
-{
- struct osc_extent *ext;
-
- ext = osc_extent_search(obj, index);
- if (ext && ext->oe_start <= index && index <= ext->oe_end)
- return osc_extent_get(ext);
- return NULL;
-}
-
-/* caller must have held object lock. */
-static void osc_extent_insert(struct osc_object *obj, struct osc_extent *ext)
-{
- struct rb_node **n = &obj->oo_root.rb_node;
- struct rb_node *parent = NULL;
- struct osc_extent *tmp;
-
- LASSERT(ext->oe_intree == 0);
- LASSERT(ext->oe_obj == obj);
- LASSERT(osc_object_is_locked(obj));
- while (*n) {
- tmp = rb_extent(*n);
- parent = *n;
-
- if (ext->oe_end < tmp->oe_start)
- n = &(*n)->rb_left;
- else if (ext->oe_start > tmp->oe_end)
- n = &(*n)->rb_right;
- else
- EASSERTF(0, tmp, EXTSTR "\n", EXTPARA(ext));
- }
- rb_link_node(&ext->oe_node, parent, n);
- rb_insert_color(&ext->oe_node, &obj->oo_root);
- osc_extent_get(ext);
- ext->oe_intree = 1;
-}
-
-/* caller must have held object lock. */
-static void osc_extent_erase(struct osc_extent *ext)
-{
- struct osc_object *obj = ext->oe_obj;
-
- LASSERT(osc_object_is_locked(obj));
- if (ext->oe_intree) {
- rb_erase(&ext->oe_node, &obj->oo_root);
- ext->oe_intree = 0;
- /* rbtree held a refcount */
- osc_extent_put_trust(ext);
- }
-}
-
-static struct osc_extent *osc_extent_hold(struct osc_extent *ext)
-{
- struct osc_object *obj = ext->oe_obj;
-
- LASSERT(osc_object_is_locked(obj));
- LASSERT(ext->oe_state == OES_ACTIVE || ext->oe_state == OES_CACHE);
- if (ext->oe_state == OES_CACHE) {
- osc_extent_state_set(ext, OES_ACTIVE);
- osc_update_pending(obj, OBD_BRW_WRITE, -ext->oe_nr_pages);
- }
- atomic_inc(&ext->oe_users);
- list_del_init(&ext->oe_link);
- return osc_extent_get(ext);
-}
-
-static void __osc_extent_remove(struct osc_extent *ext)
-{
- LASSERT(osc_object_is_locked(ext->oe_obj));
- LASSERT(list_empty(&ext->oe_pages));
- osc_extent_erase(ext);
- list_del_init(&ext->oe_link);
- osc_extent_state_set(ext, OES_INV);
- OSC_EXTENT_DUMP(D_CACHE, ext, "destroyed.\n");
-}
-
-static void osc_extent_remove(struct osc_extent *ext)
-{
- struct osc_object *obj = ext->oe_obj;
-
- osc_object_lock(obj);
- __osc_extent_remove(ext);
- osc_object_unlock(obj);
-}
-
-/**
- * This function is used to merge extents to get better performance. It checks
- * if @cur and @victim are contiguous at chunk level.
- */
-static int osc_extent_merge(const struct lu_env *env, struct osc_extent *cur,
- struct osc_extent *victim)
-{
- struct osc_object *obj = cur->oe_obj;
- pgoff_t chunk_start;
- pgoff_t chunk_end;
- int ppc_bits;
-
- LASSERT(cur->oe_state == OES_CACHE);
- LASSERT(osc_object_is_locked(obj));
- if (!victim)
- return -EINVAL;
-
- if (victim->oe_state != OES_CACHE || victim->oe_fsync_wait)
- return -EBUSY;
-
- if (cur->oe_max_end != victim->oe_max_end)
- return -ERANGE;
-
- LASSERT(cur->oe_dlmlock == victim->oe_dlmlock);
- ppc_bits = osc_cli(obj)->cl_chunkbits - PAGE_SHIFT;
- chunk_start = cur->oe_start >> ppc_bits;
- chunk_end = cur->oe_end >> ppc_bits;
- if (chunk_start != (victim->oe_end >> ppc_bits) + 1 &&
- chunk_end + 1 != victim->oe_start >> ppc_bits)
- return -ERANGE;
-
- OSC_EXTENT_DUMP(D_CACHE, victim, "will be merged by %p.\n", cur);
-
- cur->oe_start = min(cur->oe_start, victim->oe_start);
- cur->oe_end = max(cur->oe_end, victim->oe_end);
- cur->oe_grants += victim->oe_grants;
- cur->oe_nr_pages += victim->oe_nr_pages;
- /* only the following bits are needed to merge */
- cur->oe_urgent |= victim->oe_urgent;
- cur->oe_memalloc |= victim->oe_memalloc;
- list_splice_init(&victim->oe_pages, &cur->oe_pages);
- list_del_init(&victim->oe_link);
- victim->oe_nr_pages = 0;
-
- osc_extent_get(victim);
- __osc_extent_remove(victim);
- osc_extent_put(env, victim);
-
- OSC_EXTENT_DUMP(D_CACHE, cur, "after merging %p.\n", victim);
- return 0;
-}
-
-/**
- * Drop user count of osc_extent, and unplug IO asynchronously.
- */
-void osc_extent_release(const struct lu_env *env, struct osc_extent *ext)
-{
- struct osc_object *obj = ext->oe_obj;
-
- LASSERT(atomic_read(&ext->oe_users) > 0);
- LASSERT(sanity_check(ext) == 0);
- LASSERT(ext->oe_grants > 0);
-
- if (atomic_dec_and_lock(&ext->oe_users, &obj->oo_lock)) {
- LASSERT(ext->oe_state == OES_ACTIVE);
- if (ext->oe_trunc_pending) {
- /* a truncate process is waiting for this extent.
- * This may happen due to a race, check
- * osc_cache_truncate_start().
- */
- osc_extent_state_set(ext, OES_TRUNC);
- ext->oe_trunc_pending = 0;
- } else {
- osc_extent_state_set(ext, OES_CACHE);
- osc_update_pending(obj, OBD_BRW_WRITE,
- ext->oe_nr_pages);
-
- /* try to merge the previous and next extent. */
- osc_extent_merge(env, ext, prev_extent(ext));
- osc_extent_merge(env, ext, next_extent(ext));
-
- if (ext->oe_urgent)
- list_move_tail(&ext->oe_link,
- &obj->oo_urgent_exts);
- }
- osc_object_unlock(obj);
-
- osc_io_unplug_async(env, osc_cli(obj), obj);
- }
- osc_extent_put(env, ext);
-}
-
-static inline int overlapped(struct osc_extent *ex1, struct osc_extent *ex2)
-{
- return !(ex1->oe_end < ex2->oe_start || ex2->oe_end < ex1->oe_start);
-}
-
-/**
- * Find or create an extent which includes @index. This is the core
- * function for managing the extent tree.
- */
-static struct osc_extent *osc_extent_find(const struct lu_env *env,
- struct osc_object *obj, pgoff_t index,
- unsigned int *grants)
-{
- struct client_obd *cli = osc_cli(obj);
- struct osc_lock *olck;
- struct cl_lock_descr *descr;
- struct osc_extent *cur;
- struct osc_extent *ext;
- struct osc_extent *conflict = NULL;
- struct osc_extent *found = NULL;
- pgoff_t chunk;
- pgoff_t max_end;
- unsigned int max_pages; /* max_pages_per_rpc */
- unsigned int chunksize;
- int ppc_bits; /* pages per chunk bits */
- pgoff_t chunk_mask;
- int rc;
-
- cur = osc_extent_alloc(obj);
- if (!cur)
- return ERR_PTR(-ENOMEM);
-
- olck = osc_env_io(env)->oi_write_osclock;
- LASSERTF(olck, "page %lu is not covered by lock\n", index);
- LASSERT(olck->ols_state == OLS_GRANTED);
-
- descr = &olck->ols_cl.cls_lock->cll_descr;
- LASSERT(descr->cld_mode >= CLM_WRITE);
-
- LASSERT(cli->cl_chunkbits >= PAGE_SHIFT);
- ppc_bits = cli->cl_chunkbits - PAGE_SHIFT;
- chunk_mask = ~((1 << ppc_bits) - 1);
- chunksize = 1 << cli->cl_chunkbits;
- chunk = index >> ppc_bits;
-
- /* align end to rpc edge; rpc size may not be a power-of-2 integer. */
- max_pages = cli->cl_max_pages_per_rpc;
- LASSERT((max_pages & ~chunk_mask) == 0);
- max_end = index - (index % max_pages) + max_pages - 1;
- max_end = min_t(pgoff_t, max_end, descr->cld_end);
-
- /* initialize new extent by parameters so far */
- cur->oe_max_end = max_end;
- cur->oe_start = index & chunk_mask;
- cur->oe_end = ((index + ~chunk_mask + 1) & chunk_mask) - 1;
- if (cur->oe_start < descr->cld_start)
- cur->oe_start = descr->cld_start;
- if (cur->oe_end > max_end)
- cur->oe_end = max_end;
- cur->oe_grants = 0;
- cur->oe_mppr = max_pages;
- if (olck->ols_dlmlock) {
- LASSERT(olck->ols_hold);
- cur->oe_dlmlock = LDLM_LOCK_GET(olck->ols_dlmlock);
- lu_ref_add(&olck->ols_dlmlock->l_reference, "osc_extent", cur);
- }
-
- /* grants have been allocated by the caller */
- LASSERTF(*grants >= chunksize + cli->cl_extent_tax,
- "%u/%u/%u.\n", *grants, chunksize, cli->cl_extent_tax);
- LASSERTF((max_end - cur->oe_start) < max_pages, EXTSTR "\n",
- EXTPARA(cur));
-
-restart:
- osc_object_lock(obj);
- ext = osc_extent_search(obj, cur->oe_start);
- if (!ext)
- ext = first_extent(obj);
- while (ext) {
- pgoff_t ext_chk_start = ext->oe_start >> ppc_bits;
- pgoff_t ext_chk_end = ext->oe_end >> ppc_bits;
-
- LASSERT(sanity_check_nolock(ext) == 0);
- if (chunk > ext_chk_end + 1)
- break;
-
- /* if covering by different locks, no chance to match */
- if (olck->ols_dlmlock != ext->oe_dlmlock) {
- EASSERTF(!overlapped(ext, cur), ext,
- EXTSTR "\n", EXTPARA(cur));
-
- ext = next_extent(ext);
- continue;
- }
-
- /* discontiguous chunks? */
- if (chunk + 1 < ext_chk_start) {
- ext = next_extent(ext);
- continue;
- }
-
- /* ok, from now on, ext and cur have these attrs:
- * 1. covered by the same lock
- * 2. contiguous at chunk level or overlapping.
- */
-
- if (overlapped(ext, cur)) {
- /* cur is the minimum unit, so overlapping means
- * full containment.
- */
- EASSERTF((ext->oe_start <= cur->oe_start &&
- ext->oe_end >= cur->oe_end),
- ext, EXTSTR "\n", EXTPARA(cur));
-
- if (ext->oe_state > OES_CACHE || ext->oe_fsync_wait) {
- /* for simplicity, we wait for this extent to
- * finish before going forward.
- */
- conflict = osc_extent_get(ext);
- break;
- }
-
- found = osc_extent_hold(ext);
- break;
- }
-
- /* non-overlapped extent */
- if (ext->oe_state != OES_CACHE || ext->oe_fsync_wait) {
- /* we can't do anything for a non OES_CACHE extent, or
- * if there is someone waiting for this extent to be
- * flushed, try next one.
- */
- ext = next_extent(ext);
- continue;
- }
-
- /* check if they belong to the same rpc slot before trying to
- * merge. To get here, the extents must be non-overlapping and
- * contiguous at the chunk level.
- */
- if (ext->oe_max_end != max_end) {
- /* if they don't belong to the same RPC slot or
- * max_pages_per_rpc has ever changed, do not merge.
- */
- ext = next_extent(ext);
- continue;
- }
-
- /* it's required that an extent must be contiguous at chunk
- * level so that we know the whole extent is covered by grant
- * (the pages in the extent are NOT required to be contiguous).
- * Otherwise, it would be too difficult to know which
- * chunks have grants allocated.
- */
-
- /* try to do front merge - extend ext's start */
- if (chunk + 1 == ext_chk_start) {
- /* ext must be chunk size aligned */
- EASSERT((ext->oe_start & ~chunk_mask) == 0, ext);
-
- /* pull ext's start back to cover cur */
- ext->oe_start = cur->oe_start;
- ext->oe_grants += chunksize;
- LASSERT(*grants >= chunksize);
- *grants -= chunksize;
-
- found = osc_extent_hold(ext);
- } else if (chunk == ext_chk_end + 1) {
- /* rear merge */
- ext->oe_end = cur->oe_end;
- ext->oe_grants += chunksize;
- LASSERT(*grants >= chunksize);
- *grants -= chunksize;
-
- /* try to merge with the next one because we just fill
- * in a gap
- */
- if (osc_extent_merge(env, ext, next_extent(ext)) == 0)
- /* we can save extent tax from next extent */
- *grants += cli->cl_extent_tax;
-
- found = osc_extent_hold(ext);
- }
- if (found)
- break;
-
- ext = next_extent(ext);
- }
-
- osc_extent_tree_dump(D_CACHE, obj);
- if (found) {
- LASSERT(!conflict);
- if (!IS_ERR(found)) {
- LASSERT(found->oe_dlmlock == cur->oe_dlmlock);
- OSC_EXTENT_DUMP(D_CACHE, found,
- "found caching ext for %lu.\n", index);
- }
- } else if (!conflict) {
- /* create a new extent */
- EASSERT(osc_extent_is_overlapped(obj, cur) == 0, cur);
- cur->oe_grants = chunksize + cli->cl_extent_tax;
- LASSERT(*grants >= cur->oe_grants);
- *grants -= cur->oe_grants;
-
- cur->oe_state = OES_CACHE;
- found = osc_extent_hold(cur);
- osc_extent_insert(obj, cur);
- OSC_EXTENT_DUMP(D_CACHE, cur, "add into tree %lu/%lu.\n",
- index, descr->cld_end);
- }
- osc_object_unlock(obj);
-
- if (conflict) {
- LASSERT(!found);
-
- /* wait for IO to finish. Note that it cannot be
- * an OES_TRUNC extent.
- */
- rc = osc_extent_wait(env, conflict, OES_INV);
- osc_extent_put(env, conflict);
- conflict = NULL;
- if (rc < 0) {
- found = ERR_PTR(rc);
- goto out;
- }
-
- goto restart;
- }
-
-out:
- osc_extent_put(env, cur);
- return found;
-}
-
-/**
- * Called when IO is finished to an extent.
- */
-int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext,
- int sent, int rc)
-{
- struct client_obd *cli = osc_cli(ext->oe_obj);
- struct osc_async_page *oap;
- struct osc_async_page *tmp;
- int nr_pages = ext->oe_nr_pages;
- int lost_grant = 0;
- int blocksize = cli->cl_import->imp_obd->obd_osfs.os_bsize ? : 4096;
- __u64 last_off = 0;
- int last_count = -1;
-
- OSC_EXTENT_DUMP(D_CACHE, ext, "extent finished.\n");
-
- ext->oe_rc = rc ?: ext->oe_nr_pages;
- EASSERT(ergo(rc == 0, ext->oe_state == OES_RPC), ext);
-
- osc_lru_add_batch(cli, &ext->oe_pages);
- list_for_each_entry_safe(oap, tmp, &ext->oe_pages, oap_pending_item) {
- list_del_init(&oap->oap_rpc_item);
- list_del_init(&oap->oap_pending_item);
- if (last_off <= oap->oap_obj_off) {
- last_off = oap->oap_obj_off;
- last_count = oap->oap_count;
- }
-
- --ext->oe_nr_pages;
- osc_ap_completion(env, cli, oap, sent, rc);
- }
- EASSERT(ext->oe_nr_pages == 0, ext);
-
- if (!sent) {
- lost_grant = ext->oe_grants;
- } else if (blocksize < PAGE_SIZE &&
- last_count != PAGE_SIZE) {
- /* For short writes we shouldn't count parts of pages that
- * span a whole chunk on the OST side, or our accounting goes
- * wrong. Should match the code in filter_grant_check.
- */
- int offset = last_off & ~PAGE_MASK;
- int count = last_count + (offset & (blocksize - 1));
- int end = (offset + last_count) & (blocksize - 1);
-
- if (end)
- count += blocksize - end;
-
- lost_grant = PAGE_SIZE - count;
- }
- if (ext->oe_grants > 0)
- osc_free_grant(cli, nr_pages, lost_grant);
-
- osc_extent_remove(ext);
- /* put the refcount for RPC */
- osc_extent_put(env, ext);
- return 0;
-}
-
-static int extent_wait_cb(struct osc_extent *ext, enum osc_extent_state state)
-{
- int ret;
-
- osc_object_lock(ext->oe_obj);
- ret = ext->oe_state == state;
- osc_object_unlock(ext->oe_obj);
-
- return ret;
-}
-
-/**
- * Wait for the extent's state to become @state.
- */
-static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext,
- enum osc_extent_state state)
-{
- struct osc_object *obj = ext->oe_obj;
- int rc = 0;
-
- osc_object_lock(obj);
- LASSERT(sanity_check_nolock(ext) == 0);
- /* `Kick' this extent only if the caller is waiting for it to be
- * written out.
- */
- if (state == OES_INV && !ext->oe_urgent && !ext->oe_hp &&
- !ext->oe_trunc_pending) {
- if (ext->oe_state == OES_ACTIVE) {
- ext->oe_urgent = 1;
- } else if (ext->oe_state == OES_CACHE) {
- ext->oe_urgent = 1;
- osc_extent_hold(ext);
- rc = 1;
- }
- }
- osc_object_unlock(obj);
- if (rc == 1)
- osc_extent_release(env, ext);
-
- /* wait for the extent until its state becomes @state */
- rc = wait_event_idle_timeout(ext->oe_waitq,
- extent_wait_cb(ext, state), 600 * HZ);
- if (rc == 0) {
- OSC_EXTENT_DUMP(D_ERROR, ext,
- "%s: wait ext to %u timedout, recovery in progress?\n",
- cli_name(osc_cli(obj)), state);
-
- wait_event_idle(ext->oe_waitq, extent_wait_cb(ext, state));
- }
- if (ext->oe_rc < 0)
- rc = ext->oe_rc;
- else
- rc = 0;
- return rc;
-}
-
-/**
- * Discard pages with index greater than @trunc_index. If @ext overlaps
- * @trunc_index, then a partial truncate happens.
- */
-static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index,
- bool partial)
-{
- struct lu_env *env;
- struct cl_io *io;
- struct osc_object *obj = ext->oe_obj;
- struct client_obd *cli = osc_cli(obj);
- struct osc_async_page *oap;
- struct osc_async_page *tmp;
- int pages_in_chunk = 0;
- int ppc_bits = cli->cl_chunkbits - PAGE_SHIFT;
- __u64 trunc_chunk = trunc_index >> ppc_bits;
- int grants = 0;
- int nr_pages = 0;
- int rc = 0;
- u16 refcheck;
-
- LASSERT(sanity_check(ext) == 0);
- EASSERT(ext->oe_state == OES_TRUNC, ext);
- EASSERT(!ext->oe_urgent, ext);
-
- /* Request new lu_env.
- * We can't use that env from osc_cache_truncate_start() because
- * it's from lov_io_sub and not fully initialized.
- */
- env = cl_env_get(&refcheck);
- io = &osc_env_info(env)->oti_io;
- io->ci_obj = cl_object_top(osc2cl(obj));
- io->ci_ignore_layout = 1;
- rc = cl_io_init(env, io, CIT_MISC, io->ci_obj);
- if (rc < 0)
- goto out;
-
- /* discard all pages with index greater than trunc_index */
- list_for_each_entry_safe(oap, tmp, &ext->oe_pages, oap_pending_item) {
- pgoff_t index = osc_index(oap2osc(oap));
- struct cl_page *page = oap2cl_page(oap);
-
- LASSERT(list_empty(&oap->oap_rpc_item));
-
- /* only discard the pages with their index greater than
- * trunc_index, and ...
- */
- if (index < trunc_index ||
- (index == trunc_index && partial)) {
- /* account how many pages remain in the chunk
- * so that we can calculate grants correctly.
- */
- if (index >> ppc_bits == trunc_chunk)
- ++pages_in_chunk;
- continue;
- }
-
- list_del_init(&oap->oap_pending_item);
-
- cl_page_get(page);
- lu_ref_add(&page->cp_reference, "truncate", current);
-
- if (cl_page_own(env, io, page) == 0) {
- cl_page_discard(env, io, page);
- cl_page_disown(env, io, page);
- } else {
- LASSERT(page->cp_state == CPS_FREEING);
- LASSERT(0);
- }
-
- lu_ref_del(&page->cp_reference, "truncate", current);
- cl_page_put(env, page);
-
- --ext->oe_nr_pages;
- ++nr_pages;
- }
- EASSERTF(ergo(ext->oe_start >= trunc_index + !!partial,
- ext->oe_nr_pages == 0),
- ext, "trunc_index %lu, partial %d\n", trunc_index, partial);
-
- osc_object_lock(obj);
- if (ext->oe_nr_pages == 0) {
- LASSERT(pages_in_chunk == 0);
- grants = ext->oe_grants;
- ext->oe_grants = 0;
- } else { /* calculate how many grants we can free */
- int chunks = (ext->oe_end >> ppc_bits) - trunc_chunk;
- pgoff_t last_index;
-
- /* if there are no pages in this chunk, we can also free grants
- * for the last chunk
- */
- if (pages_in_chunk == 0) {
- /* if this is the 1st chunk and no pages in this chunk,
- * ext->oe_nr_pages must be zero, so we should be in
- * the other if-clause.
- */
- LASSERT(trunc_chunk > 0);
- --trunc_chunk;
- ++chunks;
- }
-
- /* this is what we can free from this extent */
- grants = chunks << cli->cl_chunkbits;
- ext->oe_grants -= grants;
- last_index = ((trunc_chunk + 1) << ppc_bits) - 1;
- ext->oe_end = min(last_index, ext->oe_max_end);
- LASSERT(ext->oe_end >= ext->oe_start);
- LASSERT(ext->oe_grants > 0);
- }
- osc_object_unlock(obj);
-
- if (grants > 0 || nr_pages > 0)
- osc_free_grant(cli, nr_pages, grants);
-
-out:
- cl_io_fini(env, io);
- cl_env_put(env, &refcheck);
- return rc;
-}
-
-/**
- * This function is used to prepare the extent for transfer.
- * A race with page flushing via ll_writepage() has to be handled cautiously.
- */
-static int osc_extent_make_ready(const struct lu_env *env,
- struct osc_extent *ext)
-{
- struct osc_async_page *oap;
- struct osc_async_page *last = NULL;
- struct osc_object *obj = ext->oe_obj;
- unsigned int page_count = 0;
- int rc;
-
- /* we're going to grab page lock, so object lock must not be taken. */
- LASSERT(sanity_check(ext) == 0);
- /* in locking state, any process should not touch this extent. */
- EASSERT(ext->oe_state == OES_LOCKING, ext);
- EASSERT(ext->oe_owner, ext);
-
- OSC_EXTENT_DUMP(D_CACHE, ext, "make ready\n");
-
- list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
- ++page_count;
- if (!last || last->oap_obj_off < oap->oap_obj_off)
- last = oap;
-
- /* checking ASYNC_READY is race safe */
- if ((oap->oap_async_flags & ASYNC_READY) != 0)
- continue;
-
- rc = osc_make_ready(env, oap, OBD_BRW_WRITE);
- switch (rc) {
- case 0:
- spin_lock(&oap->oap_lock);
- oap->oap_async_flags |= ASYNC_READY;
- spin_unlock(&oap->oap_lock);
- break;
- case -EALREADY:
- LASSERT((oap->oap_async_flags & ASYNC_READY) != 0);
- break;
- default:
- LASSERTF(0, "unknown return code: %d\n", rc);
- }
- }
-
- LASSERT(page_count == ext->oe_nr_pages);
- LASSERT(last);
- /* the last page is the only one whose count may need to be
- * refreshed against the size of the file.
- */
- if (!(last->oap_async_flags & ASYNC_COUNT_STABLE)) {
- int last_oap_count = osc_refresh_count(env, last, OBD_BRW_WRITE);
-
- LASSERT(last_oap_count > 0);
- LASSERT(last->oap_page_off + last_oap_count <= PAGE_SIZE);
- last->oap_count = last_oap_count;
- spin_lock(&last->oap_lock);
- last->oap_async_flags |= ASYNC_COUNT_STABLE;
- spin_unlock(&last->oap_lock);
- }
-
- /* for the rest of the pages, we don't need to call
- * osc_refresh_count() because they are known not to be the last page
- */
- list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
- if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE)) {
- oap->oap_count = PAGE_SIZE - oap->oap_page_off;
- spin_lock(&oap->oap_lock);
- oap->oap_async_flags |= ASYNC_COUNT_STABLE;
- spin_unlock(&oap->oap_lock);
- }
- }
-
- osc_object_lock(obj);
- osc_extent_state_set(ext, OES_RPC);
- osc_object_unlock(obj);
- /* get a refcount for RPC. */
- osc_extent_get(ext);
-
- return 0;
-}
-
-/**
- * Quick and simple version of osc_extent_find(). This function is frequently
- * called to expand the extent for the same IO. To expand the extent, the
- * page index must be in the same or next chunk of ext->oe_end.
- */
-static int osc_extent_expand(struct osc_extent *ext, pgoff_t index,
- unsigned int *grants)
-{
- struct osc_object *obj = ext->oe_obj;
- struct client_obd *cli = osc_cli(obj);
- struct osc_extent *next;
- int ppc_bits = cli->cl_chunkbits - PAGE_SHIFT;
- pgoff_t chunk = index >> ppc_bits;
- pgoff_t end_chunk;
- pgoff_t end_index;
- unsigned int chunksize = 1 << cli->cl_chunkbits;
- int rc = 0;
-
- LASSERT(ext->oe_max_end >= index && ext->oe_start <= index);
- osc_object_lock(obj);
- LASSERT(sanity_check_nolock(ext) == 0);
- end_chunk = ext->oe_end >> ppc_bits;
- if (chunk > end_chunk + 1) {
- rc = -ERANGE;
- goto out;
- }
-
- if (end_chunk >= chunk) {
- rc = 0;
- goto out;
- }
-
- LASSERT(end_chunk + 1 == chunk);
- /* try to expand this extent to cover @index */
- end_index = min(ext->oe_max_end, ((chunk + 1) << ppc_bits) - 1);
-
- next = next_extent(ext);
- if (next && next->oe_start <= end_index) {
- /* complex mode - overlapped with the next extent,
- * this case will be handled by osc_extent_find()
- */
- rc = -EAGAIN;
- goto out;
- }
-
- ext->oe_end = end_index;
- ext->oe_grants += chunksize;
- LASSERT(*grants >= chunksize);
- *grants -= chunksize;
- EASSERTF(osc_extent_is_overlapped(obj, ext) == 0, ext,
- "overlapped after expanding for %lu.\n", index);
-
-out:
- osc_object_unlock(obj);
- return rc;
-}
-
-static void osc_extent_tree_dump0(int level, struct osc_object *obj,
- const char *func, int line)
-{
- struct osc_extent *ext;
- int cnt;
-
- CDEBUG(level, "Dump object %p extents at %s:%d, mppr: %u.\n",
- obj, func, line, osc_cli(obj)->cl_max_pages_per_rpc);
-
- /* osc_object_lock(obj); */
- cnt = 1;
- for (ext = first_extent(obj); ext; ext = next_extent(ext))
- OSC_EXTENT_DUMP(level, ext, "in tree %d.\n", cnt++);
-
- cnt = 1;
- list_for_each_entry(ext, &obj->oo_hp_exts, oe_link)
- OSC_EXTENT_DUMP(level, ext, "hp %d.\n", cnt++);
-
- cnt = 1;
- list_for_each_entry(ext, &obj->oo_urgent_exts, oe_link)
- OSC_EXTENT_DUMP(level, ext, "urgent %d.\n", cnt++);
-
- cnt = 1;
- list_for_each_entry(ext, &obj->oo_reading_exts, oe_link)
- OSC_EXTENT_DUMP(level, ext, "reading %d.\n", cnt++);
- /* osc_object_unlock(obj); */
-}
-
-/* ------------------ osc extent end ------------------ */
-
-static inline int osc_is_ready(struct osc_object *osc)
-{
- return !list_empty(&osc->oo_ready_item) ||
- !list_empty(&osc->oo_hp_ready_item);
-}
-
-#define OSC_IO_DEBUG(OSC, STR, args...) \
- CDEBUG(D_CACHE, "obj %p ready %d|%c|%c wr %d|%c|%c rd %d|%c " STR, \
- (OSC), osc_is_ready(OSC), \
- list_empty_marker(&(OSC)->oo_hp_ready_item), \
- list_empty_marker(&(OSC)->oo_ready_item), \
- atomic_read(&(OSC)->oo_nr_writes), \
- list_empty_marker(&(OSC)->oo_hp_exts), \
- list_empty_marker(&(OSC)->oo_urgent_exts), \
- atomic_read(&(OSC)->oo_nr_reads), \
- list_empty_marker(&(OSC)->oo_reading_exts), \
- ##args)
-
-static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap,
- int cmd)
-{
- struct osc_page *opg = oap2osc_page(oap);
- struct cl_page *page = oap2cl_page(oap);
- int result;
-
- LASSERT(cmd == OBD_BRW_WRITE); /* no cached reads */
-
- result = cl_page_make_ready(env, page, CRT_WRITE);
- if (result == 0)
- opg->ops_submit_time = cfs_time_current();
- return result;
-}
-
-static int osc_refresh_count(const struct lu_env *env,
- struct osc_async_page *oap, int cmd)
-{
- struct osc_page *opg = oap2osc_page(oap);
- pgoff_t index = osc_index(oap2osc(oap));
- struct cl_object *obj;
- struct cl_attr *attr = &osc_env_info(env)->oti_attr;
-
- int result;
- loff_t kms;
-
- /* readpage queues with _COUNT_STABLE, shouldn't get here. */
- LASSERT(!(cmd & OBD_BRW_READ));
- obj = opg->ops_cl.cpl_obj;
-
- cl_object_attr_lock(obj);
- result = cl_object_attr_get(env, obj, attr);
- cl_object_attr_unlock(obj);
- if (result < 0)
- return result;
- kms = attr->cat_kms;
- if (cl_offset(obj, index) >= kms)
- /* catch race with truncate */
- return 0;
- else if (cl_offset(obj, index + 1) > kms)
- /* catch sub-page write at end of file */
- return kms % PAGE_SIZE;
- else
- return PAGE_SIZE;
-}
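-
-/*
- * Editorial sketch, not part of the original file: the three cases in
- * osc_refresh_count() with concrete numbers, assuming 4 KiB pages and a
- * known-minimum-size (kms) of 10000 bytes.
- */
-#include <stdio.h>
-
-static long refresh_count(long page_size, long kms, long index)
-{
-	if (index * page_size >= kms)
-		return 0;			/* raced with truncate */
-	else if ((index + 1) * page_size > kms)
-		return kms % page_size;		/* sub-page write at EOF */
-	else
-		return page_size;		/* full page */
-}
-
-int main(void)
-{
-	long kms = 10000, page_size = 4096, index;
-
-	/* prints 4096, 4096, 1808, 0 */
-	for (index = 0; index < 4; index++)
-		printf("page %ld -> %ld bytes\n", index,
-		       refresh_count(page_size, kms, index));
-	return 0;
-}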
-
-static int osc_completion(const struct lu_env *env, struct osc_async_page *oap,
- int cmd, int rc)
-{
- struct osc_page *opg = oap2osc_page(oap);
- struct cl_page *page = oap2cl_page(oap);
- enum cl_req_type crt;
- int srvlock;
-
- cmd &= ~OBD_BRW_NOQUOTA;
- LASSERTF(equi(page->cp_state == CPS_PAGEIN, cmd == OBD_BRW_READ),
- "cp_state:%u, cmd:%d\n", page->cp_state, cmd);
- LASSERTF(equi(page->cp_state == CPS_PAGEOUT, cmd == OBD_BRW_WRITE),
- "cp_state:%u, cmd:%d\n", page->cp_state, cmd);
- LASSERT(opg->ops_transfer_pinned);
-
- crt = cmd == OBD_BRW_READ ? CRT_READ : CRT_WRITE;
- /* Clear opg->ops_transfer_pinned before VM lock is released. */
- opg->ops_transfer_pinned = 0;
-
- opg->ops_submit_time = 0;
- srvlock = oap->oap_brw_flags & OBD_BRW_SRVLOCK;
-
-	/* statistics */
- if (rc == 0 && srvlock) {
- struct lu_device *ld = opg->ops_cl.cpl_obj->co_lu.lo_dev;
- struct osc_stats *stats = &lu2osc_dev(ld)->od_stats;
- size_t bytes = oap->oap_count;
-
- if (crt == CRT_READ)
- stats->os_lockless_reads += bytes;
- else
- stats->os_lockless_writes += bytes;
- }
-
- /*
- * This has to be the last operation with the page, as locks are
- * released in cl_page_completion() and nothing except for the
- * reference counter protects page from concurrent reclaim.
- */
- lu_ref_del(&page->cp_reference, "transfer", page);
-
- cl_page_completion(env, page, crt, rc);
- cl_page_put(env, page);
-
- return 0;
-}
-
-#define OSC_DUMP_GRANT(lvl, cli, fmt, args...) do { \
- struct client_obd *__tmp = (cli); \
- CDEBUG(lvl, "%s: grant { dirty: %lu/%lu dirty_pages: %ld/%lu " \
- "dropped: %ld avail: %ld, reserved: %ld, flight: %d }" \
- "lru {in list: %ld, left: %ld, waiters: %d }" fmt "\n", \
- cli_name(__tmp), \
- __tmp->cl_dirty_pages, __tmp->cl_dirty_max_pages, \
- atomic_long_read(&obd_dirty_pages), obd_max_dirty_pages, \
- __tmp->cl_lost_grant, __tmp->cl_avail_grant, \
- __tmp->cl_reserved_grant, __tmp->cl_w_in_flight, \
- atomic_long_read(&__tmp->cl_lru_in_list), \
- atomic_long_read(&__tmp->cl_lru_busy), \
- atomic_read(&__tmp->cl_lru_shrinkers), ##args); \
-} while (0)
-
-/* caller must hold loi_list_lock */
-static void osc_consume_write_grant(struct client_obd *cli,
- struct brw_page *pga)
-{
- assert_spin_locked(&cli->cl_loi_list_lock);
- LASSERT(!(pga->flag & OBD_BRW_FROM_GRANT));
- atomic_long_inc(&obd_dirty_pages);
- cli->cl_dirty_pages++;
- pga->flag |= OBD_BRW_FROM_GRANT;
- CDEBUG(D_CACHE, "using %lu grant credits for brw %p page %p\n",
- PAGE_SIZE, pga, pga->pg);
- osc_update_next_shrink(cli);
-}
-
-/* the companion to osc_consume_write_grant, called when a brw has completed.
- * must be called with the loi lock held.
- */
-static void osc_release_write_grant(struct client_obd *cli,
- struct brw_page *pga)
-{
- assert_spin_locked(&cli->cl_loi_list_lock);
- if (!(pga->flag & OBD_BRW_FROM_GRANT))
- return;
-
- pga->flag &= ~OBD_BRW_FROM_GRANT;
- atomic_long_dec(&obd_dirty_pages);
- cli->cl_dirty_pages--;
- if (pga->flag & OBD_BRW_NOCACHE) {
- pga->flag &= ~OBD_BRW_NOCACHE;
- atomic_long_dec(&obd_dirty_transit_pages);
- cli->cl_dirty_transit--;
- }
-}
-
-/**
- * To avoid sleeping with the object lock held, it's best to allocate enough
- * grant before entering the critical section.
- *
- * cl_loi_list_lock must be held by the caller.
- */
-static int osc_reserve_grant(struct client_obd *cli, unsigned int bytes)
-{
- int rc = -EDQUOT;
-
- if (cli->cl_avail_grant >= bytes) {
- cli->cl_avail_grant -= bytes;
- cli->cl_reserved_grant += bytes;
- rc = 0;
- }
- return rc;
-}
-
-static void __osc_unreserve_grant(struct client_obd *cli,
- unsigned int reserved, unsigned int unused)
-{
-	/* it's quite normal for us to get back more grant than we reserved.
-	 * Consider the case where two extents are merged by adding a new
-	 * chunk: we save one extent tax. If the extent tax is greater than
-	 * one chunk, we save even more grant by adding that chunk.
-	 */
- cli->cl_reserved_grant -= reserved;
- if (unused > reserved) {
- cli->cl_avail_grant += reserved;
- cli->cl_lost_grant += unused - reserved;
- } else {
- cli->cl_avail_grant += unused;
- }
-}
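-
-/*
- * Editorial sketch, not part of the original file: how the
- * reserved/avail/lost grant counters move in __osc_unreserve_grant(), for
- * two assumed scenarios.
- */
-#include <stdio.h>
-
-struct grants { long reserved, avail, lost; };
-
-static void unreserve(struct grants *g, long reserved, long unused)
-{
-	g->reserved -= reserved;
-	if (unused > reserved) {
-		g->avail += reserved;
-		g->lost += unused - reserved;	/* returned to the OST later */
-	} else {
-		g->avail += unused;
-	}
-}
-
-int main(void)
-{
-	struct grants g = { .reserved = 200, .avail = 100, .lost = 0 };
-
-	unreserve(&g, 100, 60);		/* used 40 of the 100 reserved */
-	printf("reserved %ld avail %ld lost %ld\n", g.reserved, g.avail, g.lost);
-	unreserve(&g, 100, 130);	/* extent merge freed more than reserved */
-	printf("reserved %ld avail %ld lost %ld\n", g.reserved, g.avail, g.lost);
-	return 0;
-}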
-
-static void osc_unreserve_grant(struct client_obd *cli,
- unsigned int reserved, unsigned int unused)
-{
- spin_lock(&cli->cl_loi_list_lock);
- __osc_unreserve_grant(cli, reserved, unused);
- if (unused > 0)
- osc_wake_cache_waiters(cli);
- spin_unlock(&cli->cl_loi_list_lock);
-}
-
-/**
- * Free grant after IO is finished or canceled.
- *
- * @lost_grant remembers how much grant we have allocated but not used; we
- * should return it to the OST. There are two cases where grant can be lost:
- * 1. truncate;
- * 2. the blocksize at the OST is less than PAGE_SIZE and a partial page was
- *    written. In this case the OST may use fewer chunks to serve the partial
- *    write. OSTs don't actually know the page size on the client side, so
- *    clients have to calculate lost grant from the blocksize on the OST.
- *    See filter_grant_check() for details.
- */
-static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages,
- unsigned int lost_grant)
-{
- unsigned long grant = (1 << cli->cl_chunkbits) + cli->cl_extent_tax;
-
- spin_lock(&cli->cl_loi_list_lock);
- atomic_long_sub(nr_pages, &obd_dirty_pages);
- cli->cl_dirty_pages -= nr_pages;
- cli->cl_lost_grant += lost_grant;
- if (cli->cl_avail_grant < grant && cli->cl_lost_grant >= grant) {
- /* borrow some grant from truncate to avoid the case that
- * truncate uses up all avail grant
- */
- cli->cl_lost_grant -= grant;
- cli->cl_avail_grant += grant;
- }
- osc_wake_cache_waiters(cli);
- spin_unlock(&cli->cl_loi_list_lock);
- CDEBUG(D_CACHE, "lost %u grant: %lu avail: %lu dirty: %lu\n",
- lost_grant, cli->cl_lost_grant,
- cli->cl_avail_grant, cli->cl_dirty_pages << PAGE_SHIFT);
-}
-
-/**
- * The companion to osc_enter_cache(), called when @oap is no longer part of
- * the dirty accounting due to error.
- */
-static void osc_exit_cache(struct client_obd *cli, struct osc_async_page *oap)
-{
- spin_lock(&cli->cl_loi_list_lock);
- osc_release_write_grant(cli, &oap->oap_brw_page);
- spin_unlock(&cli->cl_loi_list_lock);
-}
-
-/**
- * Non-blocking version of osc_enter_cache() that consumes grant only when it
- * is available.
- */
-static int osc_enter_cache_try(struct client_obd *cli,
- struct osc_async_page *oap,
- int bytes, int transient)
-{
- int rc;
-
- OSC_DUMP_GRANT(D_CACHE, cli, "need:%d\n", bytes);
-
- rc = osc_reserve_grant(cli, bytes);
- if (rc < 0)
- return 0;
-
- if (cli->cl_dirty_pages < cli->cl_dirty_max_pages &&
- atomic_long_read(&obd_dirty_pages) + 1 <= obd_max_dirty_pages) {
- osc_consume_write_grant(cli, &oap->oap_brw_page);
- if (transient) {
- cli->cl_dirty_transit++;
- atomic_long_inc(&obd_dirty_transit_pages);
- oap->oap_brw_flags |= OBD_BRW_NOCACHE;
- }
- rc = 1;
- } else {
- __osc_unreserve_grant(cli, bytes, bytes);
- rc = 0;
- }
- return rc;
-}
-
-static int ocw_granted(struct client_obd *cli, struct osc_cache_waiter *ocw)
-{
- int rc;
-
- spin_lock(&cli->cl_loi_list_lock);
- rc = list_empty(&ocw->ocw_entry);
- spin_unlock(&cli->cl_loi_list_lock);
- return rc;
-}
-
-/**
- * The main entry point to reserve dirty page accounting. Usually the grant
- * reserved in this function will be freed in bulk by osc_free_grant(), unless
- * adding the page to the osc cache fails, in which case it will be freed by
- * osc_exit_cache().
- *
- * The process will be put to sleep if it has already run out of grant.
- */
-static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli,
- struct osc_async_page *oap, int bytes)
-{
- struct osc_object *osc = oap->oap_obj;
- struct lov_oinfo *loi = osc->oo_oinfo;
- struct osc_cache_waiter ocw;
- unsigned long timeout = (AT_OFF ? obd_timeout : at_max) * HZ;
- int rc = -EDQUOT;
-
- OSC_DUMP_GRANT(D_CACHE, cli, "need:%d\n", bytes);
-
- spin_lock(&cli->cl_loi_list_lock);
-
-	/* force the caller to try sync io. This can jump the list
- * of queued writes and create a discontiguous rpc stream
- */
- if (OBD_FAIL_CHECK(OBD_FAIL_OSC_NO_GRANT) ||
- !cli->cl_dirty_max_pages || cli->cl_ar.ar_force_sync ||
- loi->loi_ar.ar_force_sync) {
- OSC_DUMP_GRANT(D_CACHE, cli, "forced sync i/o\n");
- rc = -EDQUOT;
- goto out;
- }
-
- /* Hopefully normal case - cache space and write credits available */
- if (osc_enter_cache_try(cli, oap, bytes, 0)) {
- OSC_DUMP_GRANT(D_CACHE, cli, "granted from cache\n");
- rc = 0;
- goto out;
- }
-
-	/* We can get here for two reasons: too many dirty pages in the cache,
-	 * or we have run out of grant. In both cases we should write dirty
-	 * pages out. Adding a cache waiter will trigger urgent write-out no
-	 * matter what the RPC size will be.
-	 * The exit condition is no available grant and no dirty pages cached;
-	 * that really means there is no space on the OST.
-	 */
- init_waitqueue_head(&ocw.ocw_waitq);
- ocw.ocw_oap = oap;
- ocw.ocw_grant = bytes;
- while (cli->cl_dirty_pages > 0 || cli->cl_w_in_flight > 0) {
- list_add_tail(&ocw.ocw_entry, &cli->cl_cache_waiters);
- ocw.ocw_rc = 0;
- spin_unlock(&cli->cl_loi_list_lock);
-
- osc_io_unplug_async(env, cli, NULL);
-
- CDEBUG(D_CACHE, "%s: sleeping for cache space @ %p for %p\n",
- cli_name(cli), &ocw, oap);
-
- rc = wait_event_idle_timeout(ocw.ocw_waitq,
- ocw_granted(cli, &ocw), timeout);
-
- spin_lock(&cli->cl_loi_list_lock);
-
- if (rc == 0) {
-			/* the idle wait timed out (it is not interruptible) */
- list_del_init(&ocw.ocw_entry);
- rc = -ETIMEDOUT;
- break;
- }
- LASSERT(list_empty(&ocw.ocw_entry));
- rc = ocw.ocw_rc;
-
- if (rc != -EDQUOT)
- break;
- if (osc_enter_cache_try(cli, oap, bytes, 0)) {
- rc = 0;
- break;
- }
- }
-
- switch (rc) {
- case 0:
- OSC_DUMP_GRANT(D_CACHE, cli, "finally got grant space\n");
- break;
- case -ETIMEDOUT:
- OSC_DUMP_GRANT(D_CACHE, cli,
- "timeout, fall back to sync i/o\n");
- osc_extent_tree_dump(D_CACHE, osc);
- /* fall back to synchronous I/O */
- rc = -EDQUOT;
- break;
- case -EINTR:
- /* Ensures restartability - LU-3581 */
- OSC_DUMP_GRANT(D_CACHE, cli, "interrupted\n");
- rc = -ERESTARTSYS;
- break;
- case -EDQUOT:
- OSC_DUMP_GRANT(D_CACHE, cli,
- "no grant space, fall back to sync i/o\n");
- break;
- default:
- CDEBUG(D_CACHE, "%s: event for cache space @ %p never arrived due to %d, fall back to sync i/o\n",
- cli_name(cli), &ocw, rc);
- break;
- }
-out:
- spin_unlock(&cli->cl_loi_list_lock);
- return rc;
-}
-
-/* caller must hold loi_list_lock */
-void osc_wake_cache_waiters(struct client_obd *cli)
-{
- struct list_head *l, *tmp;
- struct osc_cache_waiter *ocw;
-
- list_for_each_safe(l, tmp, &cli->cl_cache_waiters) {
- ocw = list_entry(l, struct osc_cache_waiter, ocw_entry);
- list_del_init(&ocw->ocw_entry);
-
- ocw->ocw_rc = -EDQUOT;
- /* we can't dirty more */
- if ((cli->cl_dirty_pages > cli->cl_dirty_max_pages) ||
- (atomic_long_read(&obd_dirty_pages) + 1 >
- obd_max_dirty_pages)) {
- CDEBUG(D_CACHE, "no dirty room: dirty: %ld osc max %ld, sys max %ld\n",
- cli->cl_dirty_pages, cli->cl_dirty_max_pages,
- obd_max_dirty_pages);
- goto wakeup;
- }
-
- if (osc_enter_cache_try(cli, ocw->ocw_oap, ocw->ocw_grant, 0))
- ocw->ocw_rc = 0;
-wakeup:
- CDEBUG(D_CACHE, "wake up %p for oap %p, avail grant %ld, %d\n",
- ocw, ocw->ocw_oap, cli->cl_avail_grant, ocw->ocw_rc);
-
- wake_up(&ocw->ocw_waitq);
- }
-}
-
-static int osc_max_rpc_in_flight(struct client_obd *cli, struct osc_object *osc)
-{
- int hprpc = !!list_empty(&osc->oo_hp_exts);
-
- return rpcs_in_flight(cli) >= cli->cl_max_rpcs_in_flight + hprpc;
-}
-
-/* This maintains the lists of pending pages to read/write for a given
- * object. It is used by osc_check_rpcs->osc_next_obj() and osc_list_maint()
- * to quickly find objects that are ready to send an RPC.
- */
-static int osc_makes_rpc(struct client_obd *cli, struct osc_object *osc,
- int cmd)
-{
- int invalid_import = 0;
-
-	/* if we have an invalid import we want to drain the queued pages
-	 * by forcing them through rpcs that immediately fail and complete
-	 * the pages. Recovery relies on this to empty the queued pages
-	 * before canceling the locks and evicting the llite pages.
-	 */
- if (!cli->cl_import || cli->cl_import->imp_invalid)
- invalid_import = 1;
-
- if (cmd & OBD_BRW_WRITE) {
- if (atomic_read(&osc->oo_nr_writes) == 0)
- return 0;
- if (invalid_import) {
- CDEBUG(D_CACHE, "invalid import forcing RPC\n");
- return 1;
- }
- if (!list_empty(&osc->oo_hp_exts)) {
- CDEBUG(D_CACHE, "high prio request forcing RPC\n");
- return 1;
- }
- if (!list_empty(&osc->oo_urgent_exts)) {
- CDEBUG(D_CACHE, "urgent request forcing RPC\n");
- return 1;
- }
- /* trigger a write rpc stream as long as there are dirtiers
-		 * waiting for space. As they're waiting, they're not going to
-		 * create more pages to coalesce with what's already waiting.
- */
- if (!list_empty(&cli->cl_cache_waiters)) {
- CDEBUG(D_CACHE, "cache waiters forcing RPC\n");
- return 1;
- }
- if (atomic_read(&osc->oo_nr_writes) >=
- cli->cl_max_pages_per_rpc)
- return 1;
- } else {
- if (atomic_read(&osc->oo_nr_reads) == 0)
- return 0;
- if (invalid_import) {
- CDEBUG(D_CACHE, "invalid import forcing RPC\n");
- return 1;
- }
-		/* all reads are urgent. */
- if (!list_empty(&osc->oo_reading_exts))
- return 1;
- }
-
- return 0;
-}
-
-static void osc_update_pending(struct osc_object *obj, int cmd, int delta)
-{
- struct client_obd *cli = osc_cli(obj);
-
- if (cmd & OBD_BRW_WRITE) {
- atomic_add(delta, &obj->oo_nr_writes);
- atomic_add(delta, &cli->cl_pending_w_pages);
- LASSERT(atomic_read(&obj->oo_nr_writes) >= 0);
- } else {
- atomic_add(delta, &obj->oo_nr_reads);
- atomic_add(delta, &cli->cl_pending_r_pages);
- LASSERT(atomic_read(&obj->oo_nr_reads) >= 0);
- }
- OSC_IO_DEBUG(obj, "update pending cmd %d delta %d.\n", cmd, delta);
-}
-
-static int osc_makes_hprpc(struct osc_object *obj)
-{
- return !list_empty(&obj->oo_hp_exts);
-}
-
-static void on_list(struct list_head *item, struct list_head *list, int should_be_on)
-{
- if (list_empty(item) && should_be_on)
- list_add_tail(item, list);
- else if (!list_empty(item) && !should_be_on)
- list_del_init(item);
-}
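-
-/*
- * Editorial note, not part of the original file: on_list() makes list
- * membership match a boolean, so __osc_list_maint() below can simply
- * restate each invariant unconditionally, e.g.:
- *
- *	on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list,
- *		osc_makes_rpc(cli, osc, OBD_BRW_WRITE));
- *
- * is a no-op when membership already matches the predicate.
- */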
-
-/* maintain the osc's cli list membership invariants so that osc_send_oap_rpc
- * can find pages to build into rpcs quickly
- */
-static int __osc_list_maint(struct client_obd *cli, struct osc_object *osc)
-{
- if (osc_makes_hprpc(osc)) {
- /* HP rpc */
- on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list, 0);
- on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 1);
- } else {
- on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 0);
- on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list,
- osc_makes_rpc(cli, osc, OBD_BRW_WRITE) ||
- osc_makes_rpc(cli, osc, OBD_BRW_READ));
- }
-
- on_list(&osc->oo_write_item, &cli->cl_loi_write_list,
- atomic_read(&osc->oo_nr_writes) > 0);
-
- on_list(&osc->oo_read_item, &cli->cl_loi_read_list,
- atomic_read(&osc->oo_nr_reads) > 0);
-
- return osc_is_ready(osc);
-}
-
-static int osc_list_maint(struct client_obd *cli, struct osc_object *osc)
-{
- int is_ready;
-
- spin_lock(&cli->cl_loi_list_lock);
- is_ready = __osc_list_maint(cli, osc);
- spin_unlock(&cli->cl_loi_list_lock);
-
- return is_ready;
-}
-
-/* This propagates async writeback errors back up to the application. When an
- * async write fails we record the error code so it can be returned by a later
- * fsync. As long as errors persist we force future rpcs to be sync so that
- * the app can get a sync error and break the cycle of queueing pages whose
- * writeback will fail.
- */
-static void osc_process_ar(struct osc_async_rc *ar, __u64 xid,
- int rc)
-{
- if (rc) {
- if (!ar->ar_rc)
- ar->ar_rc = rc;
-
- ar->ar_force_sync = 1;
- ar->ar_min_xid = ptlrpc_sample_next_xid();
- return;
- }
-
- if (ar->ar_force_sync && (xid >= ar->ar_min_xid))
- ar->ar_force_sync = 0;
-}
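-
-/*
- * Editorial sketch, not part of the original file: the force-sync latch in
- * osc_process_ar().  A failed write records the next xid; a success reply
- * for a request with xid >= that value proves a newer write got through, so
- * the latch is released.  next_xid here is a stand-in for
- * ptlrpc_sample_next_xid().
- */
-#include <stdio.h>
-
-struct ar { int rc; int force_sync; unsigned long long min_xid; };
-
-static void process_ar(struct ar *ar, unsigned long long xid, int rc,
-		       unsigned long long next_xid)
-{
-	if (rc) {
-		if (!ar->rc)
-			ar->rc = rc;
-		ar->force_sync = 1;
-		ar->min_xid = next_xid;
-		return;
-	}
-	if (ar->force_sync && xid >= ar->min_xid)
-		ar->force_sync = 0;
-}
-
-int main(void)
-{
-	struct ar ar = { 0, 0, 0 };
-
-	process_ar(&ar, 10, -5, 11);	/* write xid 10 failed: force sync */
-	process_ar(&ar, 9, 0, 12);	/* old success: latch stays set */
-	printf("force_sync=%d\n", ar.force_sync);	/* 1 */
-	process_ar(&ar, 11, 0, 13);	/* success past min_xid: release */
-	printf("force_sync=%d\n", ar.force_sync);	/* 0 */
-	return 0;
-}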
-
-/* this must be called holding the loi list lock to give coverage to exit_cache,
- * async_flag maintenance, and oap_request
- */
-static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli,
- struct osc_async_page *oap, int sent, int rc)
-{
- struct osc_object *osc = oap->oap_obj;
- struct lov_oinfo *loi = osc->oo_oinfo;
- __u64 xid = 0;
-
- if (oap->oap_request) {
- xid = ptlrpc_req_xid(oap->oap_request);
- ptlrpc_req_finished(oap->oap_request);
- oap->oap_request = NULL;
- }
-
- /* As the transfer for this page is being done, clear the flags */
- spin_lock(&oap->oap_lock);
- oap->oap_async_flags = 0;
- spin_unlock(&oap->oap_lock);
- oap->oap_interrupted = 0;
-
- if (oap->oap_cmd & OBD_BRW_WRITE && xid > 0) {
- spin_lock(&cli->cl_loi_list_lock);
- osc_process_ar(&cli->cl_ar, xid, rc);
- osc_process_ar(&loi->loi_ar, xid, rc);
- spin_unlock(&cli->cl_loi_list_lock);
- }
-
- rc = osc_completion(env, oap, oap->oap_cmd, rc);
- if (rc)
- CERROR("completion on oap %p obj %p returns %d.\n",
- oap, osc, rc);
-}
-
-struct extent_rpc_data {
- struct list_head *erd_rpc_list;
- unsigned int erd_page_count;
- unsigned int erd_max_pages;
- unsigned int erd_max_chunks;
- unsigned int erd_max_extents;
-};
-
-static inline unsigned int osc_extent_chunks(const struct osc_extent *ext)
-{
- struct client_obd *cli = osc_cli(ext->oe_obj);
- unsigned int ppc_bits = cli->cl_chunkbits - PAGE_SHIFT;
-
- return (ext->oe_end >> ppc_bits) - (ext->oe_start >> ppc_bits) + 1;
-}
-
-/**
- * Try to add an extent to one RPC. Two constraints must hold:
- * - the number of pages must not exceed max_pages_per_rpc;
- * - the extent must be compatible with the ones already collected.
- */
-static int try_to_add_extent_for_io(struct client_obd *cli,
- struct osc_extent *ext,
- struct extent_rpc_data *data)
-{
- struct osc_extent *tmp;
- unsigned int chunk_count;
- struct osc_async_page *oap = list_first_entry(&ext->oe_pages,
- struct osc_async_page,
- oap_pending_item);
-
- EASSERT((ext->oe_state == OES_CACHE || ext->oe_state == OES_LOCK_DONE),
- ext);
-
- if (!data->erd_max_extents)
- return 0;
-
- chunk_count = osc_extent_chunks(ext);
- EASSERTF(data->erd_page_count != 0 ||
- chunk_count <= data->erd_max_chunks, ext,
-		 "The first extent to fit in an RPC contains %u chunks, which is over the limit %u.\n",
- chunk_count, data->erd_max_chunks);
-
- if (chunk_count > data->erd_max_chunks)
- return 0;
-
- data->erd_max_pages = max(ext->oe_mppr, data->erd_max_pages);
- EASSERTF(data->erd_page_count != 0 ||
- ext->oe_nr_pages <= data->erd_max_pages, ext,
-		 "The first extent to fit in an RPC contains %u pages, which is over the limit %u.\n",
- ext->oe_nr_pages, data->erd_max_pages);
- if (data->erd_page_count + ext->oe_nr_pages > data->erd_max_pages)
- return 0;
-
- list_for_each_entry(tmp, data->erd_rpc_list, oe_link) {
- struct osc_async_page *oap2;
-
- oap2 = list_first_entry(&tmp->oe_pages, struct osc_async_page,
- oap_pending_item);
- EASSERT(tmp->oe_owner == current, tmp);
- if (oap2cl_page(oap)->cp_type != oap2cl_page(oap2)->cp_type) {
- CDEBUG(D_CACHE, "Do not permit different type of IO in one RPC\n");
- return 0;
- }
-
- if (tmp->oe_srvlock != ext->oe_srvlock ||
- !tmp->oe_grants != !ext->oe_grants ||
- tmp->oe_no_merge || ext->oe_no_merge)
- return 0;
-
-		/* only the first extent is checked; remove this break for a
-		 * strict check of all extents in the list
-		 */
- break;
- }
-
- data->erd_max_extents--;
- data->erd_max_chunks -= chunk_count;
- data->erd_page_count += ext->oe_nr_pages;
- list_move_tail(&ext->oe_link, data->erd_rpc_list);
- ext->oe_owner = current;
- return 1;
-}
-
-static inline unsigned int osc_max_write_chunks(const struct client_obd *cli)
-{
- /*
- * LU-8135:
- *
- * The maximum size of a single transaction is about 64MB in ZFS.
- * #define DMU_MAX_ACCESS (64 * 1024 * 1024)
- *
- * Since ZFS is a copy-on-write file system, a single dirty page in
- * a chunk will result in the rewrite of the whole chunk, therefore
- * an RPC shouldn't be allowed to contain too many chunks otherwise
- * it will make transaction size much bigger than 64MB, especially
- * with big block size for ZFS.
- *
- * This piece of code is to make sure that OSC won't send write RPCs
- * with too many chunks. The maximum chunk size that an RPC can cover
- * is set to PTLRPC_MAX_BRW_SIZE, which is defined to 16MB. Ideally
- * OST should tell the client what the biggest transaction size is,
- * but it's good enough for now.
- *
- * This limitation doesn't apply to ldiskfs, which allows as many
- * chunks in one RPC as we want. However, it won't have any benefits
- * to have too many discontiguous pages in one RPC.
- *
-	 * An osc_extent won't cover more than one RPC's worth of data, so the
-	 * chunk count of an osc_extent won't exceed PTLRPC_MAX_BRW_SIZE >> chunkbits.
- */
- return PTLRPC_MAX_BRW_SIZE >> cli->cl_chunkbits;
-}
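-
-/*
- * Editorial sketch, not part of the original file: the chunk budget above
- * with concrete numbers.  PTLRPC_MAX_BRW_SIZE is taken as 16 MiB per the
- * comment; cl_chunkbits depends on the OST block size.
- */
-#include <stdio.h>
-
-int main(void)
-{
-	const unsigned long max_brw = 16UL << 20;	/* 16 MiB */
-	unsigned int chunkbits;
-
-	/* prints 4096, 256 and 16 chunks per RPC respectively */
-	for (chunkbits = 12; chunkbits <= 20; chunkbits += 4)
-		printf("chunkbits %u (%lu KiB chunks): %lu chunks per RPC\n",
-		       chunkbits, (1UL << chunkbits) >> 10,
-		       max_brw >> chunkbits);
-	return 0;
-}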
-
-/**
- * In order to prevent multiple ptlrpcd threads from breaking contiguous
- * extents, get_write_extents() takes all appropriate extents atomically.
- *
- * The following policy is used to collect extents for IO:
- * 1. Add as many HP extents as possible;
- * 2. Add the first urgent extent in urgent extent list and take it out of
- * urgent list;
- * 3. Add subsequent extents of this urgent extent;
- * 4. If urgent list is not empty, goto 2;
- * 5. Traverse the extent tree from the 1st extent;
- * 6. The steps above stop as soon as there is no space left in this RPC.
- */
-static unsigned int get_write_extents(struct osc_object *obj,
- struct list_head *rpclist)
-{
- struct client_obd *cli = osc_cli(obj);
- struct osc_extent *ext;
- struct osc_extent *temp;
- struct extent_rpc_data data = {
- .erd_rpc_list = rpclist,
- .erd_page_count = 0,
- .erd_max_pages = cli->cl_max_pages_per_rpc,
- .erd_max_chunks = osc_max_write_chunks(cli),
- .erd_max_extents = 256,
- };
-
- LASSERT(osc_object_is_locked(obj));
- list_for_each_entry_safe(ext, temp, &obj->oo_hp_exts, oe_link) {
- LASSERT(ext->oe_state == OES_CACHE);
- if (!try_to_add_extent_for_io(cli, ext, &data))
- return data.erd_page_count;
- EASSERT(ext->oe_nr_pages <= data.erd_max_pages, ext);
- }
- if (data.erd_page_count == data.erd_max_pages)
- return data.erd_page_count;
-
- while (!list_empty(&obj->oo_urgent_exts)) {
- ext = list_entry(obj->oo_urgent_exts.next,
- struct osc_extent, oe_link);
- if (!try_to_add_extent_for_io(cli, ext, &data))
- return data.erd_page_count;
-
- if (!ext->oe_intree)
- continue;
-
- while ((ext = next_extent(ext)) != NULL) {
- if ((ext->oe_state != OES_CACHE) ||
- (!list_empty(&ext->oe_link) &&
- ext->oe_owner))
- continue;
-
- if (!try_to_add_extent_for_io(cli, ext, &data))
- return data.erd_page_count;
- }
- }
- if (data.erd_page_count == data.erd_max_pages)
- return data.erd_page_count;
-
- ext = first_extent(obj);
- while (ext) {
- if ((ext->oe_state != OES_CACHE) ||
- /* this extent may be already in current rpclist */
- (!list_empty(&ext->oe_link) && ext->oe_owner)) {
- ext = next_extent(ext);
- continue;
- }
-
- if (!try_to_add_extent_for_io(cli, ext, &data))
- return data.erd_page_count;
-
- ext = next_extent(ext);
- }
- return data.erd_page_count;
-}
-
-static int
-osc_send_write_rpc(const struct lu_env *env, struct client_obd *cli,
- struct osc_object *osc)
- __must_hold(osc)
-{
- LIST_HEAD(rpclist);
- struct osc_extent *ext;
- struct osc_extent *tmp;
- struct osc_extent *first = NULL;
- u32 page_count = 0;
- int srvlock = 0;
- int rc = 0;
-
- LASSERT(osc_object_is_locked(osc));
-
- page_count = get_write_extents(osc, &rpclist);
- LASSERT(equi(page_count == 0, list_empty(&rpclist)));
-
- if (list_empty(&rpclist))
- return 0;
-
- osc_update_pending(osc, OBD_BRW_WRITE, -page_count);
-
- list_for_each_entry(ext, &rpclist, oe_link) {
- LASSERT(ext->oe_state == OES_CACHE ||
- ext->oe_state == OES_LOCK_DONE);
- if (ext->oe_state == OES_CACHE)
- osc_extent_state_set(ext, OES_LOCKING);
- else
- osc_extent_state_set(ext, OES_RPC);
- }
-
- /* we're going to grab page lock, so release object lock because
- * lock order is page lock -> object lock.
- */
- osc_object_unlock(osc);
-
- list_for_each_entry_safe(ext, tmp, &rpclist, oe_link) {
- if (ext->oe_state == OES_LOCKING) {
- rc = osc_extent_make_ready(env, ext);
- if (unlikely(rc < 0)) {
- list_del_init(&ext->oe_link);
- osc_extent_finish(env, ext, 0, rc);
- continue;
- }
- }
- if (!first) {
- first = ext;
- srvlock = ext->oe_srvlock;
- } else {
- LASSERT(srvlock == ext->oe_srvlock);
- }
- }
-
- if (!list_empty(&rpclist)) {
- LASSERT(page_count > 0);
- rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_WRITE);
- LASSERT(list_empty(&rpclist));
- }
-
- osc_object_lock(osc);
- return rc;
-}
-
-/**
- * Prepare pages for ASYNC I/O and put them in the send queue.
- *
- * \return zero if no page was added to the send queue.
- * \return 1 if pages were successfully added to the send queue.
- * \return negative on errors.
- */
-static int
-osc_send_read_rpc(const struct lu_env *env, struct client_obd *cli,
- struct osc_object *osc)
- __must_hold(osc)
-{
- struct osc_extent *ext;
- struct osc_extent *next;
- LIST_HEAD(rpclist);
- struct extent_rpc_data data = {
- .erd_rpc_list = &rpclist,
- .erd_page_count = 0,
- .erd_max_pages = cli->cl_max_pages_per_rpc,
- .erd_max_chunks = UINT_MAX,
- .erd_max_extents = UINT_MAX,
- };
- int rc = 0;
-
- LASSERT(osc_object_is_locked(osc));
- list_for_each_entry_safe(ext, next, &osc->oo_reading_exts, oe_link) {
- EASSERT(ext->oe_state == OES_LOCK_DONE, ext);
- if (!try_to_add_extent_for_io(cli, ext, &data))
- break;
- osc_extent_state_set(ext, OES_RPC);
- EASSERT(ext->oe_nr_pages <= data.erd_max_pages, ext);
- }
- LASSERT(data.erd_page_count <= data.erd_max_pages);
-
- osc_update_pending(osc, OBD_BRW_READ, -data.erd_page_count);
-
- if (!list_empty(&rpclist)) {
- osc_object_unlock(osc);
-
- rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_READ);
- LASSERT(list_empty(&rpclist));
-
- osc_object_lock(osc);
- }
- return rc;
-}
-
-#define list_to_obj(list, item) ({ \
- struct list_head *__tmp = (list)->next; \
- list_del_init(__tmp); \
- list_entry(__tmp, struct osc_object, oo_##item); \
-})
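-
-/*
- * Editorial note, not part of the original file: list_to_obj() pops the
- * first entry off the given list and maps it back to its owning osc_object
- * through the named member, e.g.:
- *
- *	struct osc_object *osc =
- *		list_to_obj(&cli->cl_loi_ready_list, ready_item);
- *
- * list_del_init() is used so that __osc_list_maint() can safely re-add the
- * object to a list later.
- */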
-
-/* This is called by osc_check_rpcs() to find which objects have pages that
- * we could be sending. These lists are maintained by osc_makes_rpc().
- */
-static struct osc_object *osc_next_obj(struct client_obd *cli)
-{
- /* First return objects that have blocked locks so that they
- * will be flushed quickly and other clients can get the lock,
- * then objects which have pages ready to be stuffed into RPCs
- */
- if (!list_empty(&cli->cl_loi_hp_ready_list))
- return list_to_obj(&cli->cl_loi_hp_ready_list, hp_ready_item);
- if (!list_empty(&cli->cl_loi_ready_list))
- return list_to_obj(&cli->cl_loi_ready_list, ready_item);
-
- /* then if we have cache waiters, return all objects with queued
- * writes. This is especially important when many small files
- * have filled up the cache and not been fired into rpcs because
- * they don't pass the nr_pending/object threshold
- */
- if (!list_empty(&cli->cl_cache_waiters) &&
- !list_empty(&cli->cl_loi_write_list))
- return list_to_obj(&cli->cl_loi_write_list, write_item);
-
- /* then return all queued objects when we have an invalid import
- * so that they get flushed
- */
- if (!cli->cl_import || cli->cl_import->imp_invalid) {
- if (!list_empty(&cli->cl_loi_write_list))
- return list_to_obj(&cli->cl_loi_write_list, write_item);
- if (!list_empty(&cli->cl_loi_read_list))
- return list_to_obj(&cli->cl_loi_read_list, read_item);
- }
- return NULL;
-}
-
-/* called with the loi list lock held */
-static void osc_check_rpcs(const struct lu_env *env, struct client_obd *cli)
- __must_hold(&cli->cl_loi_list_lock)
-{
- struct osc_object *osc;
- int rc = 0;
-
- while ((osc = osc_next_obj(cli)) != NULL) {
- struct cl_object *obj = osc2cl(osc);
- struct lu_ref_link link;
-
- OSC_IO_DEBUG(osc, "%lu in flight\n", rpcs_in_flight(cli));
-
- if (osc_max_rpc_in_flight(cli, osc)) {
- __osc_list_maint(cli, osc);
- break;
- }
-
- cl_object_get(obj);
- spin_unlock(&cli->cl_loi_list_lock);
- lu_object_ref_add_at(&obj->co_lu, &link, "check", current);
-
-		/* attempt some read/write balancing by alternating between
-		 * reads and writes on an object. The makes_rpc checks here
-		 * would be redundant if we were getting read/write work items
-		 * instead of objects. We don't want send_oap_rpc to drain a
-		 * partial read pending queue when we're given this object to
-		 * do io on behalf of writes while there are cache waiters.
-		 */
- osc_object_lock(osc);
- if (osc_makes_rpc(cli, osc, OBD_BRW_WRITE)) {
- rc = osc_send_write_rpc(env, cli, osc);
- if (rc < 0) {
- CERROR("Write request failed with %d\n", rc);
-
- /* osc_send_write_rpc failed, mostly because of
- * memory pressure.
- *
-				 * We can't break here, because if:
-				 *  - a page was submitted by osc_io_submit, so
-				 *    the page is locked;
-				 *  - no request is in flight;
-				 *  - there will be no subsequent request;
-				 * then the system would live-lock, because
-				 * there would be no further chance to call
-				 * osc_io_unplug() and osc_check_rpcs().
-				 * pdflush can't help in this case, because it
-				 * might be blocked grabbing the page lock as
-				 * mentioned above.
- *
- * Anyway, continue to drain pages.
- */
- /* break; */
- }
- }
- if (osc_makes_rpc(cli, osc, OBD_BRW_READ)) {
- rc = osc_send_read_rpc(env, cli, osc);
- if (rc < 0)
- CERROR("Read request failed with %d\n", rc);
- }
- osc_object_unlock(osc);
-
- osc_list_maint(cli, osc);
- lu_object_ref_del_at(&obj->co_lu, &link, "check", current);
- cl_object_put(env, obj);
-
- spin_lock(&cli->cl_loi_list_lock);
- }
-}
-
-static int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli,
- struct osc_object *osc, int async)
-{
- int rc = 0;
-
- if (osc && osc_list_maint(cli, osc) == 0)
- return 0;
-
- if (!async) {
- spin_lock(&cli->cl_loi_list_lock);
- osc_check_rpcs(env, cli);
- spin_unlock(&cli->cl_loi_list_lock);
- } else {
- CDEBUG(D_CACHE, "Queue writeback work for client %p.\n", cli);
- LASSERT(cli->cl_writeback_work);
- rc = ptlrpcd_queue_work(cli->cl_writeback_work);
- }
- return rc;
-}
-
-static int osc_io_unplug_async(const struct lu_env *env,
- struct client_obd *cli, struct osc_object *osc)
-{
- return osc_io_unplug0(env, cli, osc, 1);
-}
-
-void osc_io_unplug(const struct lu_env *env, struct client_obd *cli,
- struct osc_object *osc)
-{
- (void)osc_io_unplug0(env, cli, osc, 0);
-}
-
-int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops,
- struct page *page, loff_t offset)
-{
- struct obd_export *exp = osc_export(osc);
- struct osc_async_page *oap = &ops->ops_oap;
-
- if (!page)
- return cfs_size_round(sizeof(*oap));
-
- oap->oap_magic = OAP_MAGIC;
- oap->oap_cli = &exp->exp_obd->u.cli;
- oap->oap_obj = osc;
-
- oap->oap_page = page;
- oap->oap_obj_off = offset;
- LASSERT(!(offset & ~PAGE_MASK));
-
- if (capable(CAP_SYS_RESOURCE))
- oap->oap_brw_flags = OBD_BRW_NOQUOTA;
-
- INIT_LIST_HEAD(&oap->oap_pending_item);
- INIT_LIST_HEAD(&oap->oap_rpc_item);
-
- spin_lock_init(&oap->oap_lock);
- CDEBUG(D_INFO, "oap %p page %p obj off %llu\n",
- oap, page, oap->oap_obj_off);
- return 0;
-}
-
-int osc_queue_async_io(const struct lu_env *env, struct cl_io *io,
- struct osc_page *ops)
-{
- struct osc_io *oio = osc_env_io(env);
- struct osc_extent *ext = NULL;
- struct osc_async_page *oap = &ops->ops_oap;
- struct client_obd *cli = oap->oap_cli;
- struct osc_object *osc = oap->oap_obj;
- pgoff_t index;
- unsigned int grants = 0, tmp;
- int brw_flags = OBD_BRW_ASYNC;
- int cmd = OBD_BRW_WRITE;
- int need_release = 0;
- int rc = 0;
-
- if (oap->oap_magic != OAP_MAGIC)
- return -EINVAL;
-
- if (!cli->cl_import || cli->cl_import->imp_invalid)
- return -EIO;
-
- if (!list_empty(&oap->oap_pending_item) ||
- !list_empty(&oap->oap_rpc_item))
- return -EBUSY;
-
- /* Set the OBD_BRW_SRVLOCK before the page is queued. */
- brw_flags |= ops->ops_srvlock ? OBD_BRW_SRVLOCK : 0;
- if (capable(CAP_SYS_RESOURCE)) {
- brw_flags |= OBD_BRW_NOQUOTA;
- cmd |= OBD_BRW_NOQUOTA;
- }
-
- /* check if the file's owner/group is over quota */
- if (!(cmd & OBD_BRW_NOQUOTA)) {
- struct cl_object *obj;
- struct cl_attr *attr;
- unsigned int qid[MAXQUOTAS];
-
- obj = cl_object_top(&osc->oo_cl);
- attr = &osc_env_info(env)->oti_attr;
-
- cl_object_attr_lock(obj);
- rc = cl_object_attr_get(env, obj, attr);
- cl_object_attr_unlock(obj);
-
- qid[USRQUOTA] = attr->cat_uid;
- qid[GRPQUOTA] = attr->cat_gid;
- if (rc == 0 && osc_quota_chkdq(cli, qid) == NO_QUOTA)
- rc = -EDQUOT;
- if (rc)
- return rc;
- }
-
- oap->oap_cmd = cmd;
- oap->oap_page_off = ops->ops_from;
- oap->oap_count = ops->ops_to - ops->ops_from;
- /*
- * No need to hold a lock here,
- * since this page is not in any list yet.
- */
- oap->oap_async_flags = 0;
- oap->oap_brw_flags = brw_flags;
-
- OSC_IO_DEBUG(osc, "oap %p page %p added for cmd %d\n",
- oap, oap->oap_page, oap->oap_cmd & OBD_BRW_RWMASK);
-
- index = osc_index(oap2osc(oap));
-
-	/* Add this page into an extent by the following steps:
-	 * 1. if there exists an active extent for this IO, the page can
-	 *    usually be added to it, though sometimes we need to expand
-	 *    the extent to accommodate the page;
-	 * 2. otherwise, a new extent will be allocated.
-	 */
-
- ext = oio->oi_active;
- if (ext && ext->oe_start <= index && ext->oe_max_end >= index) {
- /* one chunk plus extent overhead must be enough to write this
- * page
- */
- grants = (1 << cli->cl_chunkbits) + cli->cl_extent_tax;
- if (ext->oe_end >= index)
- grants = 0;
-
- /* it doesn't need any grant to dirty this page */
- spin_lock(&cli->cl_loi_list_lock);
- rc = osc_enter_cache_try(cli, oap, grants, 0);
- spin_unlock(&cli->cl_loi_list_lock);
- if (rc == 0) { /* try failed */
- grants = 0;
- need_release = 1;
- } else if (ext->oe_end < index) {
- tmp = grants;
- /* try to expand this extent */
- rc = osc_extent_expand(ext, index, &tmp);
- if (rc < 0) {
- need_release = 1;
- /* don't free reserved grant */
- } else {
- OSC_EXTENT_DUMP(D_CACHE, ext,
- "expanded for %lu.\n", index);
- osc_unreserve_grant(cli, grants, tmp);
- grants = 0;
- }
- }
- rc = 0;
- } else if (ext) {
- /* index is located outside of active extent */
- need_release = 1;
- }
- if (need_release) {
- osc_extent_release(env, ext);
- oio->oi_active = NULL;
- ext = NULL;
- }
-
- if (!ext) {
- tmp = (1 << cli->cl_chunkbits) + cli->cl_extent_tax;
-
- /* try to find new extent to cover this page */
- LASSERT(!oio->oi_active);
- /* we may have allocated grant for this page if we failed
- * to expand the previous active extent.
- */
- LASSERT(ergo(grants > 0, grants >= tmp));
-
- rc = 0;
- if (grants == 0) {
- /* we haven't allocated grant for this page. */
- rc = osc_enter_cache(env, cli, oap, tmp);
- if (rc == 0)
- grants = tmp;
- }
-
- tmp = grants;
- if (rc == 0) {
- ext = osc_extent_find(env, osc, index, &tmp);
- if (IS_ERR(ext)) {
- LASSERT(tmp == grants);
- osc_exit_cache(cli, oap);
- rc = PTR_ERR(ext);
- ext = NULL;
- } else {
- oio->oi_active = ext;
- }
- }
- if (grants > 0)
- osc_unreserve_grant(cli, grants, tmp);
- }
-
- LASSERT(ergo(rc == 0, ext));
- if (ext) {
- EASSERTF(ext->oe_end >= index && ext->oe_start <= index,
- ext, "index = %lu.\n", index);
- LASSERT((oap->oap_brw_flags & OBD_BRW_FROM_GRANT) != 0);
-
- osc_object_lock(osc);
- if (ext->oe_nr_pages == 0)
- ext->oe_srvlock = ops->ops_srvlock;
- else
- LASSERT(ext->oe_srvlock == ops->ops_srvlock);
- ++ext->oe_nr_pages;
- list_add_tail(&oap->oap_pending_item, &ext->oe_pages);
- osc_object_unlock(osc);
- }
- return rc;
-}
-
-int osc_teardown_async_page(const struct lu_env *env,
- struct osc_object *obj, struct osc_page *ops)
-{
- struct osc_async_page *oap = &ops->ops_oap;
- int rc = 0;
-
- LASSERT(oap->oap_magic == OAP_MAGIC);
-
- CDEBUG(D_INFO, "teardown oap %p page %p at index %lu.\n",
- oap, ops, osc_index(oap2osc(oap)));
-
- if (!list_empty(&oap->oap_rpc_item)) {
- CDEBUG(D_CACHE, "oap %p is not in cache.\n", oap);
- rc = -EBUSY;
- } else if (!list_empty(&oap->oap_pending_item)) {
- struct osc_extent *ext = NULL;
-
- osc_object_lock(obj);
- ext = osc_extent_lookup(obj, osc_index(oap2osc(oap)));
- osc_object_unlock(obj);
- /* only truncated pages are allowed to be taken out.
- * See osc_extent_truncate() and osc_cache_truncate_start()
- * for details.
- */
- if (ext && ext->oe_state != OES_TRUNC) {
- OSC_EXTENT_DUMP(D_ERROR, ext, "trunc at %lu.\n",
- osc_index(oap2osc(oap)));
- rc = -EBUSY;
- }
- if (ext)
- osc_extent_put(env, ext);
- }
- return rc;
-}
-
-/**
- * This is called when a page is picked up by the kernel to be written out.
- *
- * We should find the corresponding extent and add the whole extent to the
- * urgent list. The extent may be being truncated or in use; handle it
- * carefully.
- */
-int osc_flush_async_page(const struct lu_env *env, struct cl_io *io,
- struct osc_page *ops)
-{
- struct osc_extent *ext = NULL;
- struct osc_object *obj = cl2osc(ops->ops_cl.cpl_obj);
- struct cl_page *cp = ops->ops_cl.cpl_page;
- pgoff_t index = osc_index(ops);
- struct osc_async_page *oap = &ops->ops_oap;
- bool unplug = false;
- int rc = 0;
-
- osc_object_lock(obj);
- ext = osc_extent_lookup(obj, index);
- if (!ext) {
- osc_extent_tree_dump(D_ERROR, obj);
- LASSERTF(0, "page index %lu is NOT covered.\n", index);
- }
-
- switch (ext->oe_state) {
- case OES_RPC:
- case OES_LOCK_DONE:
- CL_PAGE_DEBUG(D_ERROR, env, cp, "flush an in-rpc page?\n");
- LASSERT(0);
- break;
- case OES_LOCKING:
-		/* If we know this extent is being written out, we should abort
-		 * so that the writer can make this page ready. Otherwise there
-		 * is a deadlock problem, because another process can wait for
-		 * the page writeback bit while holding the page lock, while
-		 * vvp_page_make_ready() needs to grab the page lock before
-		 * really sending the RPC.
-		 */
- case OES_TRUNC:
- /* race with truncate, page will be redirtied */
- case OES_ACTIVE:
- /* The extent is active so we need to abort and let the caller
- * re-dirty the page. If we continued on here, and we were the
- * one making the extent active, we could deadlock waiting for
- * the page writeback to clear but it won't because the extent
- * is active and won't be written out.
- */
- rc = -EAGAIN;
- goto out;
- default:
- break;
- }
-
- rc = cl_page_prep(env, io, cp, CRT_WRITE);
- if (rc)
- goto out;
-
- spin_lock(&oap->oap_lock);
- oap->oap_async_flags |= ASYNC_READY | ASYNC_URGENT;
- spin_unlock(&oap->oap_lock);
-
- if (memory_pressure_get())
- ext->oe_memalloc = 1;
-
- ext->oe_urgent = 1;
- if (ext->oe_state == OES_CACHE) {
- OSC_EXTENT_DUMP(D_CACHE, ext,
- "flush page %p make it urgent.\n", oap);
- if (list_empty(&ext->oe_link))
- list_add_tail(&ext->oe_link, &obj->oo_urgent_exts);
- unplug = true;
- }
- rc = 0;
-
-out:
- osc_object_unlock(obj);
- osc_extent_put(env, ext);
- if (unplug)
- osc_io_unplug_async(env, osc_cli(obj), obj);
- return rc;
-}
-
-/**
- * This is called when a sync waiter receives an interruption. Its job is to
- * get the caller woken as soon as possible. If its page hasn't been put in an
- * rpc yet it can dequeue immediately. Otherwise it has to mark the rpc as
- * desiring interruption, which will forcefully complete the rpc once the rpc
- * has timed out.
- */
-int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops)
-{
- struct osc_async_page *oap = &ops->ops_oap;
- struct osc_object *obj = oap->oap_obj;
- struct client_obd *cli = osc_cli(obj);
- struct osc_extent *ext;
- struct osc_extent *found = NULL;
- struct list_head *plist;
- pgoff_t index = osc_index(ops);
- int rc = -EBUSY;
- int cmd;
-
- LASSERT(!oap->oap_interrupted);
- oap->oap_interrupted = 1;
-
- /* Find out the caching extent */
- osc_object_lock(obj);
- if (oap->oap_cmd & OBD_BRW_WRITE) {
- plist = &obj->oo_urgent_exts;
- cmd = OBD_BRW_WRITE;
- } else {
- plist = &obj->oo_reading_exts;
- cmd = OBD_BRW_READ;
- }
- list_for_each_entry(ext, plist, oe_link) {
- if (ext->oe_start <= index && ext->oe_end >= index) {
- LASSERT(ext->oe_state == OES_LOCK_DONE);
- /* For OES_LOCK_DONE state extent, it has already held
- * a refcount for RPC.
- */
- found = osc_extent_get(ext);
- break;
- }
- }
- if (found) {
- list_del_init(&found->oe_link);
- osc_update_pending(obj, cmd, -found->oe_nr_pages);
- osc_object_unlock(obj);
-
- osc_extent_finish(env, found, 0, -EINTR);
- osc_extent_put(env, found);
- rc = 0;
- } else {
- osc_object_unlock(obj);
- /* ok, it's been put in an rpc. only one oap gets a request
- * reference
- */
- if (oap->oap_request) {
- ptlrpc_mark_interrupted(oap->oap_request);
- ptlrpcd_wake(oap->oap_request);
- ptlrpc_req_finished(oap->oap_request);
- oap->oap_request = NULL;
- }
- }
-
- osc_list_maint(cli, obj);
- return rc;
-}
-
-int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj,
- struct list_head *list, int cmd, int brw_flags)
-{
- struct client_obd *cli = osc_cli(obj);
- struct osc_extent *ext;
- struct osc_async_page *oap, *tmp;
- int page_count = 0;
- int mppr = cli->cl_max_pages_per_rpc;
- bool can_merge = true;
- pgoff_t start = CL_PAGE_EOF;
- pgoff_t end = 0;
-
- list_for_each_entry(oap, list, oap_pending_item) {
- struct osc_page *opg = oap2osc_page(oap);
- pgoff_t index = osc_index(opg);
-
- if (index > end)
- end = index;
- if (index < start)
- start = index;
- ++page_count;
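-		/* double mppr whenever page_count overtakes it so that
-		 * oe_mppr ends up >= the number of pages in the extent
-		 */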
- mppr <<= (page_count > mppr);
-
- if (unlikely(opg->ops_from > 0 || opg->ops_to < PAGE_SIZE))
- can_merge = false;
- }
-
- ext = osc_extent_alloc(obj);
- if (!ext) {
- list_for_each_entry_safe(oap, tmp, list, oap_pending_item) {
- list_del_init(&oap->oap_pending_item);
- osc_ap_completion(env, cli, oap, 0, -ENOMEM);
- }
- return -ENOMEM;
- }
-
- ext->oe_rw = !!(cmd & OBD_BRW_READ);
- ext->oe_sync = 1;
- ext->oe_no_merge = !can_merge;
- ext->oe_urgent = 1;
- ext->oe_start = start;
- ext->oe_end = end;
- ext->oe_max_end = end;
- ext->oe_obj = obj;
- ext->oe_srvlock = !!(brw_flags & OBD_BRW_SRVLOCK);
- ext->oe_nr_pages = page_count;
- ext->oe_mppr = mppr;
- list_splice_init(list, &ext->oe_pages);
-
- osc_object_lock(obj);
- /* Reuse the initial refcount for RPC, don't drop it */
- osc_extent_state_set(ext, OES_LOCK_DONE);
- if (cmd & OBD_BRW_WRITE) {
- list_add_tail(&ext->oe_link, &obj->oo_urgent_exts);
- osc_update_pending(obj, OBD_BRW_WRITE, page_count);
- } else {
- list_add_tail(&ext->oe_link, &obj->oo_reading_exts);
- osc_update_pending(obj, OBD_BRW_READ, page_count);
- }
- osc_object_unlock(obj);
-
- osc_io_unplug_async(env, cli, obj);
- return 0;
-}
-
-/**
- * Called by osc_io_setattr_start() to freeze and destroy covering extents.
- */
-int osc_cache_truncate_start(const struct lu_env *env, struct osc_object *obj,
- u64 size, struct osc_extent **extp)
-{
- struct client_obd *cli = osc_cli(obj);
- struct osc_extent *ext;
- struct osc_extent *temp;
- struct osc_extent *waiting = NULL;
- pgoff_t index;
- LIST_HEAD(list);
- int result = 0;
- bool partial;
-
- /* pages with index greater or equal to index will be truncated. */
- index = cl_index(osc2cl(obj), size);
- partial = size > cl_offset(osc2cl(obj), index);
-
-again:
- osc_object_lock(obj);
- ext = osc_extent_search(obj, index);
- if (!ext)
- ext = first_extent(obj);
- else if (ext->oe_end < index)
- ext = next_extent(ext);
- while (ext) {
- EASSERT(ext->oe_state != OES_TRUNC, ext);
-
- if (ext->oe_state > OES_CACHE || ext->oe_urgent) {
-			/* if ext is in urgent state, there must exist a page
-			 * that has already been flushed by write_page().
-			 * We have to wait for this extent because we can't
-			 * truncate that page.
-			 */
- OSC_EXTENT_DUMP(D_CACHE, ext,
- "waiting for busy extent\n");
- waiting = osc_extent_get(ext);
- break;
- }
-
- OSC_EXTENT_DUMP(D_CACHE, ext, "try to trunc:%llu.\n", size);
-
- osc_extent_get(ext);
- if (ext->oe_state == OES_ACTIVE) {
-			/* we grab the inode mutex for the write path, but
-			 * release it before releasing the extent (in
-			 * osc_io_end()), so there is a race window in which
-			 * an extent is still OES_ACTIVE when truncate starts.
-			 */
- LASSERT(!ext->oe_trunc_pending);
- ext->oe_trunc_pending = 1;
- } else {
- EASSERT(ext->oe_state == OES_CACHE, ext);
- osc_extent_state_set(ext, OES_TRUNC);
- osc_update_pending(obj, OBD_BRW_WRITE,
- -ext->oe_nr_pages);
- }
- EASSERT(list_empty(&ext->oe_link), ext);
- list_add_tail(&ext->oe_link, &list);
-
- ext = next_extent(ext);
- }
- osc_object_unlock(obj);
-
- osc_list_maint(cli, obj);
-
- list_for_each_entry_safe(ext, temp, &list, oe_link) {
- int rc;
-
- list_del_init(&ext->oe_link);
-
- /* extent may be in OES_ACTIVE state because inode mutex
- * is released before osc_io_end() in file write case
- */
- if (ext->oe_state != OES_TRUNC)
- osc_extent_wait(env, ext, OES_TRUNC);
-
- rc = osc_extent_truncate(ext, index, partial);
- if (rc < 0) {
- if (result == 0)
- result = rc;
-
- OSC_EXTENT_DUMP(D_ERROR, ext,
- "truncate error %d\n", rc);
- } else if (ext->oe_nr_pages == 0) {
- osc_extent_remove(ext);
- } else {
-			/* this must be an overlapped extent, which means only
-			 * part of the pages in this extent have been truncated.
-			 */
- EASSERTF(ext->oe_start <= index, ext,
- "trunc index = %lu/%d.\n", index, partial);
- /* fix index to skip this partially truncated extent */
- index = ext->oe_end + 1;
- partial = false;
-
- /* we need to hold this extent in OES_TRUNC state so
- * that no writeback will happen. This is to avoid
- * BUG 17397.
-			 * Only a partial truncate can reach here; if @size is
-			 * not zero, the caller must provide a valid @extp.
- */
- LASSERT(!*extp);
- *extp = osc_extent_get(ext);
- OSC_EXTENT_DUMP(D_CACHE, ext,
- "trunc at %llu\n", size);
- }
- osc_extent_put(env, ext);
- }
- if (waiting) {
- int rc;
-
-		/* ignore the result of osc_extent_wait; the write initiator
-		 * should take care of it.
-		 */
- rc = osc_extent_wait(env, waiting, OES_INV);
- if (rc < 0)
- OSC_EXTENT_DUMP(D_CACHE, waiting, "error: %d.\n", rc);
-
- osc_extent_put(env, waiting);
- waiting = NULL;
- goto again;
- }
- return result;
-}
-
-/**
- * Called after osc_io_setattr_end to add oio->oi_trunc back to cache.
- */
-void osc_cache_truncate_end(const struct lu_env *env, struct osc_extent *ext)
-{
- if (ext) {
- struct osc_object *obj = ext->oe_obj;
- bool unplug = false;
-
- EASSERT(ext->oe_nr_pages > 0, ext);
- EASSERT(ext->oe_state == OES_TRUNC, ext);
- EASSERT(!ext->oe_urgent, ext);
-
- OSC_EXTENT_DUMP(D_CACHE, ext, "trunc -> cache.\n");
- osc_object_lock(obj);
- osc_extent_state_set(ext, OES_CACHE);
- if (ext->oe_fsync_wait && !ext->oe_urgent) {
- ext->oe_urgent = 1;
- list_move_tail(&ext->oe_link, &obj->oo_urgent_exts);
- unplug = true;
- }
- osc_update_pending(obj, OBD_BRW_WRITE, ext->oe_nr_pages);
- osc_object_unlock(obj);
- osc_extent_put(env, ext);
-
- if (unplug)
- osc_io_unplug_async(env, osc_cli(obj), obj);
- }
-}
-
-/**
- * Wait for extents in a specific range to be written out.
- * The caller must have called osc_cache_writeback_range() to issue the IO,
- * otherwise this function will take a long time to finish.
- *
- * The caller must hold the inode mutex, or cancel the exclusive dlm lock, so
- * that nobody else can dirty this range of the file while we're waiting for
- * extents to be written.
- */
-int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj,
- pgoff_t start, pgoff_t end)
-{
- struct osc_extent *ext;
- pgoff_t index = start;
- int result = 0;
-
-again:
- osc_object_lock(obj);
- ext = osc_extent_search(obj, index);
- if (!ext)
- ext = first_extent(obj);
- else if (ext->oe_end < index)
- ext = next_extent(ext);
- while (ext) {
- int rc;
-
- if (ext->oe_start > end)
- break;
-
- if (!ext->oe_fsync_wait) {
- ext = next_extent(ext);
- continue;
- }
-
- EASSERT(ergo(ext->oe_state == OES_CACHE,
- ext->oe_hp || ext->oe_urgent), ext);
- EASSERT(ergo(ext->oe_state == OES_ACTIVE,
- !ext->oe_hp && ext->oe_urgent), ext);
-
- index = ext->oe_end + 1;
- osc_extent_get(ext);
- osc_object_unlock(obj);
-
- rc = osc_extent_wait(env, ext, OES_INV);
- if (result == 0)
- result = rc;
- osc_extent_put(env, ext);
- goto again;
- }
- osc_object_unlock(obj);
-
- OSC_IO_DEBUG(obj, "sync file range.\n");
- return result;
-}
-
-/**
- * Called to write out a range of an osc object.
- *
- * @hp     : should be set if this is caused by lock cancellation;
- * @discard: is set if dirty pages should be dropped - the file will be
- *	deleted or truncated; this implies no partial discarding of extents.
- *
- * Returns how many pages will be issued, or an error code on failure.
- */
-int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj,
- pgoff_t start, pgoff_t end, int hp, int discard)
-{
- struct osc_extent *ext;
- LIST_HEAD(discard_list);
- bool unplug = false;
- int result = 0;
-
- osc_object_lock(obj);
- ext = osc_extent_search(obj, start);
- if (!ext)
- ext = first_extent(obj);
- else if (ext->oe_end < start)
- ext = next_extent(ext);
- while (ext) {
- if (ext->oe_start > end)
- break;
-
- ext->oe_fsync_wait = 1;
- switch (ext->oe_state) {
- case OES_CACHE:
- result += ext->oe_nr_pages;
- if (!discard) {
- struct list_head *list = NULL;
-
- if (hp) {
- EASSERT(!ext->oe_hp, ext);
- ext->oe_hp = 1;
- list = &obj->oo_hp_exts;
- } else if (!ext->oe_urgent) {
- ext->oe_urgent = 1;
- list = &obj->oo_urgent_exts;
- }
- if (list)
- list_move_tail(&ext->oe_link, list);
- unplug = true;
- } else {
- /* the only discarder is lock cancelling, so
- * [start, end] must contain this extent
- */
- EASSERT(ext->oe_start >= start &&
- ext->oe_max_end <= end, ext);
- osc_extent_state_set(ext, OES_LOCKING);
- ext->oe_owner = current;
- list_move_tail(&ext->oe_link, &discard_list);
- osc_update_pending(obj, OBD_BRW_WRITE,
- -ext->oe_nr_pages);
- }
- break;
- case OES_ACTIVE:
- /* It's pretty bad to wait for ACTIVE extents, because
-			 * we don't know how long we will wait for them to be
-			 * flushed since they may be blocked awaiting more
-			 * grant. We do this for the correctness of fsync.
- */
- LASSERT(hp == 0 && discard == 0);
- ext->oe_urgent = 1;
- break;
- case OES_TRUNC:
- /* this extent is being truncated, can't do anything
- * for it now. it will be set to urgent after truncate
- * is finished in osc_cache_truncate_end().
- */
- default:
- break;
- }
- ext = next_extent(ext);
- }
- osc_object_unlock(obj);
-
- LASSERT(ergo(!discard, list_empty(&discard_list)));
- if (!list_empty(&discard_list)) {
- struct osc_extent *tmp;
- int rc;
-
- osc_list_maint(osc_cli(obj), obj);
- list_for_each_entry_safe(ext, tmp, &discard_list, oe_link) {
- list_del_init(&ext->oe_link);
- EASSERT(ext->oe_state == OES_LOCKING, ext);
-
- /* Discard caching pages. We don't actually write this
- * extent out but we complete it as if we did.
- */
- rc = osc_extent_make_ready(env, ext);
- if (unlikely(rc < 0)) {
- OSC_EXTENT_DUMP(D_ERROR, ext,
- "make_ready returned %d\n", rc);
- if (result >= 0)
- result = rc;
- }
-
- /* finish the extent as if the pages were sent */
- osc_extent_finish(env, ext, 0, 0);
- }
- }
-
- if (unplug)
- osc_io_unplug(env, osc_cli(obj), obj);
-
- if (hp || discard) {
- int rc;
-
- rc = osc_cache_wait_range(env, obj, start, end);
- if (result >= 0 && rc < 0)
- result = rc;
- }
-
- OSC_IO_DEBUG(obj, "pageout [%lu, %lu], %d.\n", start, end, result);
- return result;
-}
-
-/**
- * Looks up the pages covered by the given [start, end] range of \a obj and
- * invokes \a cb on each of them.
- *
- * If the lookup would hog the CPU for too long, it gives up and returns
- * CLP_GANG_RESCHED; in that case the caller should implement retry logic.
- *
- * Gang tree lookup (radix_tree_gang_lookup()) optimization is absolutely
- * crucial in the face of [offset, EOF] locks.
- *
- * At least one page is passed to \a cb unless there is no covered page.
- */
-int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io,
- struct osc_object *osc, pgoff_t start, pgoff_t end,
- osc_page_gang_cbt cb, void *cbdata)
-{
- struct osc_page *ops;
- void **pvec;
- pgoff_t idx;
- unsigned int nr;
- unsigned int i;
- unsigned int j;
- int res = CLP_GANG_OKAY;
- bool tree_lock = true;
-
- idx = start;
- pvec = osc_env_info(env)->oti_pvec;
- spin_lock(&osc->oo_tree_lock);
- while ((nr = radix_tree_gang_lookup(&osc->oo_tree, pvec,
- idx, OTI_PVEC_SIZE)) > 0) {
- struct cl_page *page;
- bool end_of_region = false;
-
- for (i = 0, j = 0; i < nr; ++i) {
- ops = pvec[i];
- pvec[i] = NULL;
-
- idx = osc_index(ops);
- if (idx > end) {
- end_of_region = true;
- break;
- }
-
- page = ops->ops_cl.cpl_page;
- LASSERT(page->cp_type == CPT_CACHEABLE);
- if (page->cp_state == CPS_FREEING)
- continue;
-
- cl_page_get(page);
- lu_ref_add_atomic(&page->cp_reference,
- "gang_lookup", current);
- pvec[j++] = ops;
- }
- ++idx;
-
- /*
- * Here a delicate locking dance is performed. Current thread
- * holds a reference to a page, but has to own it before it
- * can be placed into queue. Owning implies waiting, so
- * radix-tree lock is to be released. After a wait one has to
- * check that pages weren't truncated (cl_page_own() returns
- * error in the latter case).
- */
- spin_unlock(&osc->oo_tree_lock);
- tree_lock = false;
-
- for (i = 0; i < j; ++i) {
- ops = pvec[i];
- if (res == CLP_GANG_OKAY)
- res = (*cb)(env, io, ops, cbdata);
-
- page = ops->ops_cl.cpl_page;
- lu_ref_del(&page->cp_reference, "gang_lookup", current);
- cl_page_put(env, page);
- }
- if (nr < OTI_PVEC_SIZE || end_of_region)
- break;
-
- if (res == CLP_GANG_OKAY && need_resched())
- res = CLP_GANG_RESCHED;
- if (res != CLP_GANG_OKAY)
- break;
-
- spin_lock(&osc->oo_tree_lock);
- tree_lock = true;
- }
- if (tree_lock)
- spin_unlock(&osc->oo_tree_lock);
- return res;
-}
-
-/**
- * Check whether the page is covered by an additional lock; if not, discard it.
- */
-static int check_and_discard_cb(const struct lu_env *env, struct cl_io *io,
- struct osc_page *ops, void *cbdata)
-{
- struct osc_thread_info *info = osc_env_info(env);
- struct osc_object *osc = cbdata;
- pgoff_t index;
-
- index = osc_index(ops);
- if (index >= info->oti_fn_index) {
- struct ldlm_lock *tmp;
- struct cl_page *page = ops->ops_cl.cpl_page;
-
- /* refresh non-overlapped index */
- tmp = osc_dlmlock_at_pgoff(env, osc, index,
- OSC_DAP_FL_TEST_LOCK);
- if (tmp) {
- __u64 end = tmp->l_policy_data.l_extent.end;
- /* Cache the first-non-overlapped index so as to skip
- * all pages within [index, oti_fn_index). This is safe
- * because if tmp lock is canceled, it will discard
- * these pages.
- */
- info->oti_fn_index = cl_index(osc2cl(osc), end + 1);
- if (end == OBD_OBJECT_EOF)
- info->oti_fn_index = CL_PAGE_EOF;
- LDLM_LOCK_PUT(tmp);
- } else if (cl_page_own(env, io, page) == 0) {
- /* discard the page */
- cl_page_discard(env, io, page);
- cl_page_disown(env, io, page);
- } else {
- LASSERT(page->cp_state == CPS_FREEING);
- }
- }
-
- info->oti_next_index = index + 1;
- return CLP_GANG_OKAY;
-}
-
-static int discard_cb(const struct lu_env *env, struct cl_io *io,
- struct osc_page *ops, void *cbdata)
-{
- struct osc_thread_info *info = osc_env_info(env);
- struct cl_page *page = ops->ops_cl.cpl_page;
-
- /* page is top page. */
- info->oti_next_index = osc_index(ops) + 1;
- if (cl_page_own(env, io, page) == 0) {
- if (page->cp_type == CPT_CACHEABLE &&
- PageDirty(cl_page_vmpage(page)))
- CL_PAGE_DEBUG(D_ERROR, env, page,
- "discard dirty page?\n");
-
- /* discard the page */
- cl_page_discard(env, io, page);
- cl_page_disown(env, io, page);
- } else {
- LASSERT(page->cp_state == CPS_FREEING);
- }
-
- return CLP_GANG_OKAY;
-}
-
-/**
- * Discard pages protected by the given lock. This function traverses the
- * radix tree to find all covered pages and discards them. If a page is
- * covered by another lock, it remains in cache.
- *
- * If an error happens at any step, the process continues anyway (the
- * reasoning behind this being that lock cancellation cannot be delayed
- * indefinitely).
- */
-int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc,
- pgoff_t start, pgoff_t end, enum cl_lock_mode mode)
-{
- struct osc_thread_info *info = osc_env_info(env);
- struct cl_io *io = &info->oti_io;
- osc_page_gang_cbt cb;
- int res;
- int result;
-
- io->ci_obj = cl_object_top(osc2cl(osc));
- io->ci_ignore_layout = 1;
- result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
- if (result != 0)
- goto out;
-
- cb = mode == CLM_READ ? check_and_discard_cb : discard_cb;
- info->oti_fn_index = start;
- info->oti_next_index = start;
- do {
- res = osc_page_gang_lookup(env, io, osc,
- info->oti_next_index, end, cb, osc);
- if (info->oti_next_index > end)
- break;
-
- if (res == CLP_GANG_RESCHED)
- cond_resched();
- } while (res != CLP_GANG_OKAY);
-out:
- cl_io_fini(env, io);
- return result;
-}
-
-/** @} osc */
diff --git a/drivers/staging/lustre/lustre/osc/osc_cl_internal.h b/drivers/staging/lustre/lustre/osc/osc_cl_internal.h
deleted file mode 100644
index 1449013722f6..000000000000
--- a/drivers/staging/lustre/lustre/osc/osc_cl_internal.h
+++ /dev/null
@@ -1,683 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.gnu.org/licenses/gpl-2.0.html
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2012, 2015, Intel Corporation.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * Internal interfaces of OSC layer.
- *
- * Author: Nikita Danilov <nikita.danilov@sun.com>
- * Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
- */
-
-#ifndef OSC_CL_INTERNAL_H
-#define OSC_CL_INTERNAL_H
-
-#include <linux/libcfs/libcfs.h>
-
-#include <obd.h>
-/* osc_build_res_name() */
-#include <cl_object.h>
-#include "osc_internal.h"
-
-/** \defgroup osc osc
- * @{
- */
-
-struct osc_extent;
-
-/**
- * State maintained by osc layer for each IO context.
- */
-struct osc_io {
- /** super class */
- struct cl_io_slice oi_cl;
- /** true if this io is lockless. */
- unsigned int oi_lockless:1,
- /** true if this io is counted as active IO */
- oi_is_active:1;
- /** how many LRU pages are reserved for this IO */
- unsigned long oi_lru_reserved;
-
- /** active extents, we know how many bytes are going to be written,
- * so having an active extent will prevent it from being fragmented
- */
- struct osc_extent *oi_active;
- /** partially truncated extent, we need to hold this extent to prevent
- * page writeback from happening.
- */
- struct osc_extent *oi_trunc;
-
- /** write osc_lock for this IO, used by osc_extent_find(). */
- struct osc_lock *oi_write_osclock;
- struct obdo oi_oa;
- struct osc_async_cbargs {
- bool opc_rpc_sent;
- int opc_rc;
- struct completion opc_sync;
- } oi_cbarg;
-};
-
-/**
- * State maintained by osc layer for the duration of a system call.
- */
-struct osc_session {
- struct osc_io os_io;
-};
-
-#define OTI_PVEC_SIZE 256
-struct osc_thread_info {
- struct ldlm_res_id oti_resname;
- union ldlm_policy_data oti_policy;
- struct cl_lock_descr oti_descr;
- struct cl_attr oti_attr;
- struct lustre_handle oti_handle;
- struct cl_page_list oti_plist;
- struct cl_io oti_io;
- void *oti_pvec[OTI_PVEC_SIZE];
- /**
- * Fields used by cl_lock_discard_pages().
- */
- pgoff_t oti_next_index;
- pgoff_t oti_fn_index; /* first non-overlapped index */
- struct cl_sync_io oti_anchor;
- struct cl_req_attr oti_req_attr;
-};
-
-struct osc_object {
- struct cl_object oo_cl;
- struct lov_oinfo *oo_oinfo;
- /**
- * True if locking against this stripe got -EUSERS.
- */
- int oo_contended;
- unsigned long oo_contention_time;
- /**
- * used by the osc to keep track of what objects to build into rpcs.
- * Protected by client_obd->cli_loi_list_lock.
- */
- struct list_head oo_ready_item;
- struct list_head oo_hp_ready_item;
- struct list_head oo_write_item;
- struct list_head oo_read_item;
-
- /**
- * extents are kept in a red-black tree to manage (async) dirty pages.
- */
- struct rb_root oo_root;
- /**
- * Manage write(dirty) extents.
- */
- struct list_head oo_hp_exts; /* list of hp extents */
- struct list_head oo_urgent_exts; /* list of writeback extents */
- struct list_head oo_rpc_exts;
-
- struct list_head oo_reading_exts;
-
- atomic_t oo_nr_reads;
- atomic_t oo_nr_writes;
-
- /** Protect extent tree. Will be used to protect
- * oo_{read|write}_pages soon.
- */
- spinlock_t oo_lock;
-
- /**
- * Radix tree for caching pages
- */
- struct radix_tree_root oo_tree;
- spinlock_t oo_tree_lock;
- unsigned long oo_npages;
-
- /* Protect osc_lock this osc_object has */
- spinlock_t oo_ol_spin;
- struct list_head oo_ol_list;
-
- /** number of active IOs of this object */
- atomic_t oo_nr_ios;
- wait_queue_head_t oo_io_waitq;
-};
-
-static inline void osc_object_lock(struct osc_object *obj)
-{
- spin_lock(&obj->oo_lock);
-}
-
-static inline int osc_object_trylock(struct osc_object *obj)
-{
- return spin_trylock(&obj->oo_lock);
-}
-
-static inline void osc_object_unlock(struct osc_object *obj)
-{
- spin_unlock(&obj->oo_lock);
-}
-
-static inline int osc_object_is_locked(struct osc_object *obj)
-{
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
- return spin_is_locked(&obj->oo_lock);
-#else
- /*
- * It is not perfect to return true all the time.
- * But since this function is only used for assertion
- * and checking, it seems OK.
- */
- return 1;
-#endif
-}
-
-/*
- * Lock "micro-states" for osc layer.
- */
-enum osc_lock_state {
- OLS_NEW,
- OLS_ENQUEUED,
- OLS_UPCALL_RECEIVED,
- OLS_GRANTED,
- OLS_CANCELLED
-};
-
-/**
- * osc-private state of cl_lock.
- *
- * Interaction with DLM.
- *
- * Once the receive upcall is invoked, osc_lock remembers a handle of the
- * DLM lock in osc_lock::ols_handle and a pointer to that lock in
- * osc_lock::ols_dlmlock.
- *
- * This pointer is protected through a reference, acquired by
- * osc_lock_upcall0(). Also, an additional reference is acquired by
- * ldlm_lock_addref() call protecting the lock from cancellation, until
- * osc_lock_unuse() releases it.
- *
- * Below is a description of how lock references are acquired and released
- * inside of DLM.
- *
- * - When new lock is created and enqueued to the server (ldlm_cli_enqueue())
- * - ldlm_lock_create()
- * - ldlm_lock_new(): initializes a lock with 2 references. One for
- * the caller (released when reply from the server is received, or on
- * error), and another for the hash table.
- * - ldlm_lock_addref_internal(): protects the lock from cancellation.
- *
- * - When reply is received from the server (osc_enqueue_interpret())
- * - ldlm_cli_enqueue_fini()
- * - LDLM_LOCK_PUT(): releases caller reference acquired by
- * ldlm_lock_new().
- * - if (rc != 0)
- * ldlm_lock_decref(): error case: matches ldlm_cli_enqueue().
- * - ldlm_lock_decref(): for async locks, matches ldlm_cli_enqueue().
- *
- * - When lock is being cancelled (ldlm_lock_cancel())
- * - ldlm_lock_destroy()
- * - LDLM_LOCK_PUT(): releases hash-table reference acquired by
- * ldlm_lock_new().
- *
- * osc_lock is detached from ldlm_lock by osc_lock_detach(), which is called
- * either when the lock is cancelled (osc_lock_blocking()), or when the lock
- * is deleted without cancellation (e.g., from cl_locks_prune()). In the
- * latter case the ldlm lock remains in memory, and can be re-attached to an
- * osc_lock in the future.
- */
-struct osc_lock {
- struct cl_lock_slice ols_cl;
- /** Internal lock to protect states, etc. */
- spinlock_t ols_lock;
- /** Owner sleeps on this channel for state change */
- struct cl_sync_io *ols_owner;
- /** waiting list for this lock to be cancelled */
- struct list_head ols_waiting_list;
- /** wait entry of ols_waiting_list */
- struct list_head ols_wait_entry;
- /** list entry for osc_object::oo_ol_list */
- struct list_head ols_nextlock_oscobj;
-
- /** underlying DLM lock */
- struct ldlm_lock *ols_dlmlock;
- /** DLM flags with which osc_lock::ols_lock was enqueued */
- __u64 ols_flags;
- /** osc_lock::ols_lock handle */
- struct lustre_handle ols_handle;
- struct ldlm_enqueue_info ols_einfo;
- enum osc_lock_state ols_state;
- /** lock value block */
- struct ost_lvb ols_lvb;
-
- /**
- * true, if ldlm_lock_addref() was called against
- * osc_lock::ols_lock. This is used for sanity checking.
- *
- * \see osc_lock::ols_has_ref
- */
- unsigned ols_hold :1,
- /**
- * this is much like osc_lock::ols_hold, except that this bit is
- * cleared _after_ the reference is released in osc_lock_unuse().
- * This fine distinction is needed because:
- *
- * - if ldlm lock still has a reference, osc_ast_data_get() needs
- * to return associated cl_lock (so that a flag is needed that is
- * cleared after ldlm_lock_decref() returned), and
- *
- * - ldlm_lock_decref() can invoke blocking ast (for a
- * LDLM_FL_CBPENDING lock), and osc_lock functions like
- * osc_lock_cancel() called from there need to know whether to
- * release lock reference (so that a flag is needed that is
- * cleared before ldlm_lock_decref() is called).
- */
- ols_has_ref:1,
- /**
- * inherit the lockless attribute from top level cl_io.
- * If true, osc_lock_enqueue is able to tolerate the -EUSERS error.
- */
- ols_locklessable:1,
- /**
- * if set, the osc_lock is a glimpse lock. For glimpse locks, we treat
- * the EVAVAIL error as tolerable; this lets the upper logic wait for
- * the glimpse locks to all OSTs to complete.
- * A glimpse lock converts to a normal lock if the server lock is
- * granted.
- * A glimpse lock should be destroyed immediately after use.
- */
- ols_glimpse:1,
- /**
- * For async glimpse lock.
- */
- ols_agl:1;
-};
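-
-/*
- * The reference flow described above, condensed into an informal sketch
- * (a summary of the comment preceding struct osc_lock, not new behaviour):
- *
- *	enqueue:  ldlm_lock_new()              two refs: caller + hash table
- *	          ldlm_lock_addref_internal()  protects lock from cancellation
- *	reply:    LDLM_LOCK_PUT()              drops the caller reference
- *	          ldlm_lock_decref()           error/async case, matches enqueue
- *	cancel:   ldlm_lock_destroy()
- *	          LDLM_LOCK_PUT()              drops the hash-table reference
- */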
-
-/**
- * Page state private for osc layer.
- */
-struct osc_page {
- struct cl_page_slice ops_cl;
- /**
- * Page queues used by osc to detect when RPC can be formed.
- */
- struct osc_async_page ops_oap;
- /**
- * An offset within page from which next transfer starts. This is used
- * by cl_page_clip() to submit partial page transfers.
- */
- int ops_from;
- /**
- * An offset within page at which next transfer ends.
- *
- * \see osc_page::ops_from.
- */
- int ops_to;
- /**
- * Boolean, true iff page is under transfer. Used for sanity checking.
- */
- unsigned ops_transfer_pinned:1,
- /**
- * in LRU?
- */
- ops_in_lru:1,
- /**
- * Set if the page must be transferred with OBD_BRW_SRVLOCK.
- */
- ops_srvlock:1;
- /**
- * lru page list. See osc_lru_{del|use}() in osc_page.c for usage.
- */
- struct list_head ops_lru;
- /**
- * Submit time - the time when the page is starting RPC. For debugging.
- */
- unsigned long ops_submit_time;
-};
-
-extern struct kmem_cache *osc_lock_kmem;
-extern struct kmem_cache *osc_object_kmem;
-extern struct kmem_cache *osc_thread_kmem;
-extern struct kmem_cache *osc_session_kmem;
-extern struct kmem_cache *osc_extent_kmem;
-
-extern struct lu_device_type osc_device_type;
-extern struct lu_context_key osc_key;
-extern struct lu_context_key osc_session_key;
-
-#define OSC_FLAGS (ASYNC_URGENT | ASYNC_READY)
-
-int osc_lock_init(const struct lu_env *env,
- struct cl_object *obj, struct cl_lock *lock,
- const struct cl_io *io);
-int osc_io_init(const struct lu_env *env,
- struct cl_object *obj, struct cl_io *io);
-struct lu_object *osc_object_alloc(const struct lu_env *env,
- const struct lu_object_header *hdr,
- struct lu_device *dev);
-int osc_page_init(const struct lu_env *env, struct cl_object *obj,
- struct cl_page *page, pgoff_t ind);
-
-void osc_index2policy(union ldlm_policy_data *policy,
- const struct cl_object *obj,
- pgoff_t start, pgoff_t end);
-int osc_lvb_print(const struct lu_env *env, void *cookie,
- lu_printer_t p, const struct ost_lvb *lvb);
-
-void osc_lru_add_batch(struct client_obd *cli, struct list_head *list);
-void osc_page_submit(const struct lu_env *env, struct osc_page *opg,
- enum cl_req_type crt, int brw_flags);
-int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops);
-int osc_set_async_flags(struct osc_object *obj, struct osc_page *opg,
- u32 async_flags);
-int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops,
- struct page *page, loff_t offset);
-int osc_queue_async_io(const struct lu_env *env, struct cl_io *io,
- struct osc_page *ops);
-int osc_page_cache_add(const struct lu_env *env,
- const struct cl_page_slice *slice, struct cl_io *io);
-int osc_teardown_async_page(const struct lu_env *env, struct osc_object *obj,
- struct osc_page *ops);
-int osc_flush_async_page(const struct lu_env *env, struct cl_io *io,
- struct osc_page *ops);
-int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj,
- struct list_head *list, int cmd, int brw_flags);
-int osc_cache_truncate_start(const struct lu_env *env, struct osc_object *obj,
- u64 size, struct osc_extent **extp);
-void osc_cache_truncate_end(const struct lu_env *env, struct osc_extent *ext);
-int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj,
- pgoff_t start, pgoff_t end, int hp, int discard);
-int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj,
- pgoff_t start, pgoff_t end);
-void osc_io_unplug(const struct lu_env *env, struct client_obd *cli,
- struct osc_object *osc);
-int lru_queue_work(const struct lu_env *env, void *data);
-
-void osc_object_set_contended(struct osc_object *obj);
-void osc_object_clear_contended(struct osc_object *obj);
-int osc_object_is_contended(struct osc_object *obj);
-
-int osc_lock_is_lockless(const struct osc_lock *olck);
-
-/*****************************************************************************
- *
- * Accessors.
- *
- */
-
-static inline struct osc_thread_info *osc_env_info(const struct lu_env *env)
-{
- struct osc_thread_info *info;
-
- info = lu_context_key_get(&env->le_ctx, &osc_key);
- LASSERT(info);
- return info;
-}
-
-static inline struct osc_session *osc_env_session(const struct lu_env *env)
-{
- struct osc_session *ses;
-
- ses = lu_context_key_get(env->le_ses, &osc_session_key);
- LASSERT(ses);
- return ses;
-}
-
-static inline struct osc_io *osc_env_io(const struct lu_env *env)
-{
- return &osc_env_session(env)->os_io;
-}
-
-static inline int osc_is_object(const struct lu_object *obj)
-{
- return obj->lo_dev->ld_type == &osc_device_type;
-}
-
-static inline struct osc_device *lu2osc_dev(const struct lu_device *d)
-{
- LINVRNT(d->ld_type == &osc_device_type);
- return container_of0(d, struct osc_device, od_cl.cd_lu_dev);
-}
-
-static inline struct obd_export *osc_export(const struct osc_object *obj)
-{
- return lu2osc_dev(obj->oo_cl.co_lu.lo_dev)->od_exp;
-}
-
-static inline struct client_obd *osc_cli(const struct osc_object *obj)
-{
- return &osc_export(obj)->exp_obd->u.cli;
-}
-
-static inline struct osc_object *cl2osc(const struct cl_object *obj)
-{
- LINVRNT(osc_is_object(&obj->co_lu));
- return container_of0(obj, struct osc_object, oo_cl);
-}
-
-static inline struct cl_object *osc2cl(const struct osc_object *obj)
-{
- return (struct cl_object *)&obj->oo_cl;
-}
-
-static inline enum ldlm_mode osc_cl_lock2ldlm(enum cl_lock_mode mode)
-{
- LASSERT(mode == CLM_READ || mode == CLM_WRITE || mode == CLM_GROUP);
- if (mode == CLM_READ)
- return LCK_PR;
- else if (mode == CLM_WRITE)
- return LCK_PW;
- else
- return LCK_GROUP;
-}
-
-static inline enum cl_lock_mode osc_ldlm2cl_lock(enum ldlm_mode mode)
-{
- LASSERT(mode == LCK_PR || mode == LCK_PW || mode == LCK_GROUP);
- if (mode == LCK_PR)
- return CLM_READ;
- else if (mode == LCK_PW)
- return CLM_WRITE;
- else
- return CLM_GROUP;
-}
-
-static inline struct osc_page *cl2osc_page(const struct cl_page_slice *slice)
-{
- LINVRNT(osc_is_object(&slice->cpl_obj->co_lu));
- return container_of0(slice, struct osc_page, ops_cl);
-}
-
-static inline struct osc_page *oap2osc(struct osc_async_page *oap)
-{
- return container_of0(oap, struct osc_page, ops_oap);
-}
-
-static inline pgoff_t osc_index(struct osc_page *opg)
-{
- return opg->ops_cl.cpl_index;
-}
-
-static inline struct cl_page *oap2cl_page(struct osc_async_page *oap)
-{
- return oap2osc(oap)->ops_cl.cpl_page;
-}
-
-static inline struct osc_page *oap2osc_page(struct osc_async_page *oap)
-{
- return (struct osc_page *)container_of(oap, struct osc_page, ops_oap);
-}
-
-static inline struct osc_page *
-osc_cl_page_osc(struct cl_page *page, struct osc_object *osc)
-{
- const struct cl_page_slice *slice;
-
- LASSERT(osc);
- slice = cl_object_page_slice(&osc->oo_cl, page);
- return cl2osc_page(slice);
-}
-
-static inline struct osc_lock *cl2osc_lock(const struct cl_lock_slice *slice)
-{
- LINVRNT(osc_is_object(&slice->cls_obj->co_lu));
- return container_of0(slice, struct osc_lock, ols_cl);
-}
-
-static inline struct osc_lock *osc_lock_at(const struct cl_lock *lock)
-{
- return cl2osc_lock(cl_lock_at(lock, &osc_device_type));
-}
-
-static inline int osc_io_srvlock(struct osc_io *oio)
-{
- return (oio->oi_lockless && !oio->oi_cl.cis_io->ci_no_srvlock);
-}
-
-enum osc_extent_state {
- OES_INV = 0, /** extent is just initialized or destroyed */
- OES_ACTIVE = 1, /** process is using this extent */
- OES_CACHE = 2, /** extent is ready for IO */
- OES_LOCKING = 3, /** locking page to prepare IO */
- OES_LOCK_DONE = 4, /** locking finished, ready to send */
- OES_RPC = 5, /** in RPC */
- OES_TRUNC = 6, /** being truncated */
- OES_STATE_MAX
-};
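-
-/*
- * Informal sketch of the usual state transitions, pieced together from the
- * comments in this header and in osc_cache.c (not an exhaustive diagram):
- *
- *	OES_INV -> OES_ACTIVE -> OES_CACHE -> OES_LOCKING -> OES_LOCK_DONE
- *	        -> OES_RPC -> OES_INV
- *
- * with OES_TRUNC entered from OES_CACHE (or from OES_ACTIVE via
- * oe_trunc_pending) while the extent is being truncated.
- */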
-
-/**
- * osc_extent data to manage dirty pages.
- * osc_extent has the following attributes:
- * 1. all pages in the same extent must be in one RPC in write back;
- * 2. # of pages must be less than max_pages_per_rpc - implied by 1;
- * 3. must be covered by only 1 osc_lock;
- * 4. exclusive. It's impossible to have overlapped osc_extent.
- *
- * The lifetime of an extent is from when the 1st page is dirtied to when
- * all pages inside it are written out.
- *
- * LOCKING ORDER
- * =============
- * page lock -> cl_loi_list_lock -> object lock(osc_object::oo_lock)
- */
-struct osc_extent {
- /** red-black tree node */
- struct rb_node oe_node;
- /** osc_object of this extent */
- struct osc_object *oe_obj;
- /** refcount; removed from the red-black tree when it reaches zero. */
- atomic_t oe_refc;
- /** busy if non-zero */
- atomic_t oe_users;
- /** link list of osc_object's oo_{hp|urgent|locking}_exts. */
- struct list_head oe_link;
- /** state of this extent */
- enum osc_extent_state oe_state;
- /** flags for this extent. */
- unsigned int oe_intree:1,
- /** 0 is write, 1 is read */
- oe_rw:1,
- /** sync extent, queued by osc_queue_sync_pages() */
- oe_sync:1,
- /** set if this extent has partial, sync pages.
- * Extents with partial pages can't be merged with others into one RPC.
- */
- oe_no_merge:1,
- oe_srvlock:1,
- oe_memalloc:1,
- /** an ACTIVE extent is going to be truncated, so when this extent
- * is released, it will turn into TRUNC state instead of CACHE.
- */
- oe_trunc_pending:1,
- /** this extent should be written asap and someone may wait for the
- * write to finish. This bit is usually set along with urgent if
- * the extent was in CACHE state.
- * An fsync_wait extent can't be merged because the new extent
- * region may exceed the fsync range.
- */
- oe_fsync_wait:1,
- /** covering lock is being canceled */
- oe_hp:1,
- /** this extent should be written back asap. Set if one of its pages
- * is picked up by the page writeback daemon, or by sync write or
- * read requests.
- */
- oe_urgent:1;
- /** how many grants are allocated for this extent.
- * No grant is allocated for reading extents or sync write extents.
- */
- unsigned int oe_grants;
- /** # of dirty pages in this extent */
- unsigned int oe_nr_pages;
- /** list of pending oap pages. Pages in this list are NOT sorted. */
- struct list_head oe_pages;
- /** Since an extent has to be written out atomically, this is used to
- * remember the next page that needs to be locked to write this extent out.
- * Not used right now.
- */
- struct osc_page *oe_next_page;
- /** start and end index of this extent, both inclusive.
- * Page offset here is the page index of osc_pages.
- * oe_start is used as the key for the red-black tree.
- */
- pgoff_t oe_start;
- pgoff_t oe_end;
- /** maximum ending index of this extent; this is limited by
- * max_pages_per_rpc, the lock extent, and the chunk size.
- */
- pgoff_t oe_max_end;
- /** waitqueue - for those who want to be notified if this extent's
- * state has changed.
- */
- wait_queue_head_t oe_waitq;
- /** lock covering this extent */
- struct ldlm_lock *oe_dlmlock;
- /** terminator of this extent. Must be non-NULL if this extent is in IO. */
- struct task_struct *oe_owner;
- /** return value of writeback. If somebody is waiting for this extent,
- * this is how the result becomes visible to the outside world.
- */
- int oe_rc;
- /** max pages per rpc when this extent was created */
- unsigned int oe_mppr;
-};
-
-int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext,
- int sent, int rc);
-void osc_extent_release(const struct lu_env *env, struct osc_extent *ext);
-
-int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc,
- pgoff_t start, pgoff_t end, enum cl_lock_mode mode);
-
-typedef int (*osc_page_gang_cbt)(const struct lu_env *, struct cl_io *,
- struct osc_page *, void *);
-int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io,
- struct osc_object *osc, pgoff_t start, pgoff_t end,
- osc_page_gang_cbt cb, void *cbdata);
-/** @} osc */
-
-#endif /* OSC_CL_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/osc/osc_dev.c b/drivers/staging/lustre/lustre/osc/osc_dev.c
deleted file mode 100644
index 2b5f324743e2..000000000000
--- a/drivers/staging/lustre/lustre/osc/osc_dev.c
+++ /dev/null
@@ -1,246 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.gnu.org/licenses/gpl-2.0.html
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2012, 2015, Intel Corporation.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * Implementation of cl_device, for OSC layer.
- *
- * Author: Nikita Danilov <nikita.danilov@sun.com>
- */
-
-#define DEBUG_SUBSYSTEM S_OSC
-
-/* class_name2obd() */
-#include <obd_class.h>
-
-#include "osc_cl_internal.h"
-
-/** \addtogroup osc
- * @{
- */
-
-struct kmem_cache *osc_lock_kmem;
-struct kmem_cache *osc_object_kmem;
-struct kmem_cache *osc_thread_kmem;
-struct kmem_cache *osc_session_kmem;
-struct kmem_cache *osc_extent_kmem;
-struct kmem_cache *osc_quota_kmem;
-
-struct lu_kmem_descr osc_caches[] = {
- {
- .ckd_cache = &osc_lock_kmem,
- .ckd_name = "osc_lock_kmem",
- .ckd_size = sizeof(struct osc_lock)
- },
- {
- .ckd_cache = &osc_object_kmem,
- .ckd_name = "osc_object_kmem",
- .ckd_size = sizeof(struct osc_object)
- },
- {
- .ckd_cache = &osc_thread_kmem,
- .ckd_name = "osc_thread_kmem",
- .ckd_size = sizeof(struct osc_thread_info)
- },
- {
- .ckd_cache = &osc_session_kmem,
- .ckd_name = "osc_session_kmem",
- .ckd_size = sizeof(struct osc_session)
- },
- {
- .ckd_cache = &osc_extent_kmem,
- .ckd_name = "osc_extent_kmem",
- .ckd_size = sizeof(struct osc_extent)
- },
- {
- .ckd_cache = &osc_quota_kmem,
- .ckd_name = "osc_quota_kmem",
- .ckd_size = sizeof(struct osc_quota_info)
- },
- {
- .ckd_cache = NULL
- }
-};
-
-/*****************************************************************************
- *
- * Type conversions.
- *
- */
-
-static struct lu_device *osc2lu_dev(struct osc_device *osc)
-{
- return &osc->od_cl.cd_lu_dev;
-}
-
-/*****************************************************************************
- *
- * Osc device and device type functions.
- *
- */
-
-static void *osc_key_init(const struct lu_context *ctx,
- struct lu_context_key *key)
-{
- struct osc_thread_info *info;
-
- info = kmem_cache_zalloc(osc_thread_kmem, GFP_NOFS);
- if (!info)
- info = ERR_PTR(-ENOMEM);
- return info;
-}
-
-static void osc_key_fini(const struct lu_context *ctx,
- struct lu_context_key *key, void *data)
-{
- struct osc_thread_info *info = data;
-
- kmem_cache_free(osc_thread_kmem, info);
-}
-
-struct lu_context_key osc_key = {
- .lct_tags = LCT_CL_THREAD,
- .lct_init = osc_key_init,
- .lct_fini = osc_key_fini
-};
-
-static void *osc_session_init(const struct lu_context *ctx,
- struct lu_context_key *key)
-{
- struct osc_session *info;
-
- info = kmem_cache_zalloc(osc_session_kmem, GFP_NOFS);
- if (!info)
- info = ERR_PTR(-ENOMEM);
- return info;
-}
-
-static void osc_session_fini(const struct lu_context *ctx,
- struct lu_context_key *key, void *data)
-{
- struct osc_session *info = data;
-
- kmem_cache_free(osc_session_kmem, info);
-}
-
-struct lu_context_key osc_session_key = {
- .lct_tags = LCT_SESSION,
- .lct_init = osc_session_init,
- .lct_fini = osc_session_fini
-};
-
-/* type constructor/destructor: osc_type_{init,fini,start,stop}(). */
-LU_TYPE_INIT_FINI(osc, &osc_key, &osc_session_key);
-
-static int osc_cl_process_config(const struct lu_env *env,
- struct lu_device *d, struct lustre_cfg *cfg)
-{
- return osc_process_config_base(d->ld_obd, cfg);
-}
-
-static const struct lu_device_operations osc_lu_ops = {
- .ldo_object_alloc = osc_object_alloc,
- .ldo_process_config = osc_cl_process_config,
- .ldo_recovery_complete = NULL
-};
-
-static int osc_device_init(const struct lu_env *env, struct lu_device *d,
- const char *name, struct lu_device *next)
-{
- return 0;
-}
-
-static struct lu_device *osc_device_fini(const struct lu_env *env,
- struct lu_device *d)
-{
- return NULL;
-}
-
-static struct lu_device *osc_device_free(const struct lu_env *env,
- struct lu_device *d)
-{
- struct osc_device *od = lu2osc_dev(d);
-
- cl_device_fini(lu2cl_dev(d));
- kfree(od);
- return NULL;
-}
-
-static struct lu_device *osc_device_alloc(const struct lu_env *env,
- struct lu_device_type *t,
- struct lustre_cfg *cfg)
-{
- struct lu_device *d;
- struct osc_device *od;
- struct obd_device *obd;
- int rc;
-
- od = kzalloc(sizeof(*od), GFP_NOFS);
- if (!od)
- return ERR_PTR(-ENOMEM);
-
- cl_device_init(&od->od_cl, t);
- d = osc2lu_dev(od);
- d->ld_ops = &osc_lu_ops;
-
- /* Setup OSC OBD */
- obd = class_name2obd(lustre_cfg_string(cfg, 0));
- LASSERT(obd);
- rc = osc_setup(obd, cfg);
- if (rc) {
- osc_device_free(env, d);
- return ERR_PTR(rc);
- }
- od->od_exp = obd->obd_self_export;
- return d;
-}
-
-static const struct lu_device_type_operations osc_device_type_ops = {
- .ldto_init = osc_type_init,
- .ldto_fini = osc_type_fini,
-
- .ldto_start = osc_type_start,
- .ldto_stop = osc_type_stop,
-
- .ldto_device_alloc = osc_device_alloc,
- .ldto_device_free = osc_device_free,
-
- .ldto_device_init = osc_device_init,
- .ldto_device_fini = osc_device_fini
-};
-
-struct lu_device_type osc_device_type = {
- .ldt_tags = LU_DEVICE_CL,
- .ldt_name = LUSTRE_OSC_NAME,
- .ldt_ops = &osc_device_type_ops,
- .ldt_ctx_tags = LCT_CL_THREAD
-};
-
-/** @} osc */
diff --git a/drivers/staging/lustre/lustre/osc/osc_internal.h b/drivers/staging/lustre/lustre/osc/osc_internal.h
deleted file mode 100644
index 32db150fd42e..000000000000
--- a/drivers/staging/lustre/lustre/osc/osc_internal.h
+++ /dev/null
@@ -1,236 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.gnu.org/licenses/gpl-2.0.html
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2011, 2015, Intel Corporation.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- */
-
-#ifndef OSC_INTERNAL_H
-#define OSC_INTERNAL_H
-
-#define OAP_MAGIC 8675309
-
-extern atomic_t osc_pool_req_count;
-extern unsigned int osc_reqpool_maxreqcount;
-extern struct ptlrpc_request_pool *osc_rq_pool;
-
-struct lu_env;
-
-enum async_flags {
- ASYNC_READY = 0x1, /* ap_make_ready will not be called before this
- * page is added to an rpc
- */
- ASYNC_URGENT = 0x2, /* page must be put into an RPC before return */
- ASYNC_COUNT_STABLE = 0x4, /* ap_refresh_count will not be called
- * to give the caller a chance to update
- * or cancel the size of the io
- */
- ASYNC_HP = 0x10,
-};
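-
-/*
- * How these flags are typically set for a page being submitted for sync
- * IO (pattern taken from osc_io_submit() in osc_io.c):
- *
- *	spin_lock(&oap->oap_lock);
- *	oap->oap_async_flags = ASYNC_URGENT | ASYNC_READY;
- *	oap->oap_async_flags |= ASYNC_COUNT_STABLE;
- *	spin_unlock(&oap->oap_lock);
- */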
-
-struct osc_async_page {
- int oap_magic;
- unsigned short oap_cmd;
- unsigned short oap_interrupted:1;
-
- struct list_head oap_pending_item;
- struct list_head oap_rpc_item;
-
- u64 oap_obj_off;
- unsigned int oap_page_off;
- enum async_flags oap_async_flags;
-
- struct brw_page oap_brw_page;
-
- struct ptlrpc_request *oap_request;
- struct client_obd *oap_cli;
- struct osc_object *oap_obj;
-
- spinlock_t oap_lock;
-};
-
-#define oap_page oap_brw_page.pg
-#define oap_count oap_brw_page.count
-#define oap_brw_flags oap_brw_page.flag
-
-static inline struct osc_async_page *brw_page2oap(struct brw_page *pga)
-{
- return (struct osc_async_page *)container_of(pga, struct osc_async_page,
- oap_brw_page);
-}
-
-struct osc_cache_waiter {
- struct list_head ocw_entry;
- wait_queue_head_t ocw_waitq;
- struct osc_async_page *ocw_oap;
- int ocw_grant;
- int ocw_rc;
-};
-
-void osc_wake_cache_waiters(struct client_obd *cli);
-int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes);
-void osc_update_next_shrink(struct client_obd *cli);
-
-/*
- * cl integration.
- */
-#include <cl_object.h>
-
-extern struct ptlrpc_request_set *PTLRPCD_SET;
-
-typedef int (*osc_enqueue_upcall_f)(void *cookie, struct lustre_handle *lockh,
- int rc);
-
-int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id,
- __u64 *flags, union ldlm_policy_data *policy,
- struct ost_lvb *lvb, int kms_valid,
- osc_enqueue_upcall_f upcall,
- void *cookie, struct ldlm_enqueue_info *einfo,
- struct ptlrpc_request_set *rqset, int async, int agl);
-
-int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id,
- enum ldlm_type type, union ldlm_policy_data *policy,
- enum ldlm_mode mode, __u64 *flags, void *data,
- struct lustre_handle *lockh, int unref);
-
-int osc_setattr_async(struct obd_export *exp, struct obdo *oa,
- obd_enqueue_update_f upcall, void *cookie,
- struct ptlrpc_request_set *rqset);
-int osc_punch_base(struct obd_export *exp, struct obdo *oa,
- obd_enqueue_update_f upcall, void *cookie,
- struct ptlrpc_request_set *rqset);
-int osc_sync_base(struct osc_object *exp, struct obdo *oa,
- obd_enqueue_update_f upcall, void *cookie,
- struct ptlrpc_request_set *rqset);
-
-int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *cfg);
-int osc_build_rpc(const struct lu_env *env, struct client_obd *cli,
- struct list_head *ext_list, int cmd);
-long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli,
- long target, bool force);
-unsigned long osc_lru_reserve(struct client_obd *cli, unsigned long npages);
-void osc_lru_unreserve(struct client_obd *cli, unsigned long npages);
-
-unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock);
-
-int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg);
-
-int lproc_osc_attach_seqstat(struct obd_device *dev);
-void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars);
-
-extern struct lu_device_type osc_device_type;
-
-static inline int osc_recoverable_error(int rc)
-{
- return (rc == -EIO || rc == -EROFS || rc == -ENOMEM ||
- rc == -EAGAIN || rc == -EINPROGRESS);
-}
-
-static inline unsigned long rpcs_in_flight(struct client_obd *cli)
-{
- return cli->cl_r_in_flight + cli->cl_w_in_flight;
-}
-
-static inline char *cli_name(struct client_obd *cli)
-{
- return cli->cl_import->imp_obd->obd_name;
-}
-
-struct osc_device {
- struct cl_device od_cl;
- struct obd_export *od_exp;
-
- /* Write stats are actually protected by the client_obd's lock. */
- struct osc_stats {
- u64 os_lockless_writes; /* by bytes */
- u64 os_lockless_reads; /* by bytes */
- u64 os_lockless_truncates; /* by times */
- } od_stats;
-
- /* configuration item(s) */
- int od_contention_time;
- int od_lockless_truncate;
-};
-
-static inline struct osc_device *obd2osc_dev(const struct obd_device *d)
-{
- return container_of0(d->obd_lu_dev, struct osc_device, od_cl.cd_lu_dev);
-}
-
-extern struct lu_kmem_descr osc_caches[];
-
-extern struct kmem_cache *osc_quota_kmem;
-struct osc_quota_info {
- /** linkage for quota hash table */
- struct hlist_node oqi_hash;
- u32 oqi_id;
-};
-
-int osc_quota_setup(struct obd_device *obd);
-int osc_quota_cleanup(struct obd_device *obd);
-int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[],
- u32 valid, u32 flags);
-int osc_quota_chkdq(struct client_obd *cli, const unsigned int qid[]);
-int osc_quotactl(struct obd_device *unused, struct obd_export *exp,
- struct obd_quotactl *oqctl);
-void osc_inc_unstable_pages(struct ptlrpc_request *req);
-void osc_dec_unstable_pages(struct ptlrpc_request *req);
-bool osc_over_unstable_soft_limit(struct client_obd *cli);
-
-/**
- * Bit flags for osc_dlm_lock_at_pageoff().
- */
-enum osc_dap_flags {
- /**
- * Just check whether the desired lock exists; it won't hold a
- * reference count on the lock.
- */
- OSC_DAP_FL_TEST_LOCK = BIT(0),
- /**
- * Return the lock even if it is being canceled.
- */
- OSC_DAP_FL_CANCELING = BIT(1),
-};
-
-struct ldlm_lock *osc_dlmlock_at_pgoff(const struct lu_env *env,
- struct osc_object *obj, pgoff_t index,
- enum osc_dap_flags flags);
-
-int osc_object_invalidate(const struct lu_env *env, struct osc_object *osc);
-
-/** osc shrink list to link all osc client obd */
-extern struct list_head osc_shrink_list;
-/** spin lock to protect osc_shrink_list */
-extern spinlock_t osc_shrink_lock;
-unsigned long osc_cache_shrink_count(struct shrinker *sk,
- struct shrink_control *sc);
-unsigned long osc_cache_shrink_scan(struct shrinker *sk,
- struct shrink_control *sc);
-
-#endif /* OSC_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/osc/osc_io.c b/drivers/staging/lustre/lustre/osc/osc_io.c
deleted file mode 100644
index 76743faf3e6d..000000000000
--- a/drivers/staging/lustre/lustre/osc/osc_io.c
+++ /dev/null
@@ -1,918 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.gnu.org/licenses/gpl-2.0.html
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2011, 2015, Intel Corporation.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * Implementation of cl_io for OSC layer.
- *
- * Author: Nikita Danilov <nikita.danilov@sun.com>
- * Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
- */
-
-#define DEBUG_SUBSYSTEM S_OSC
-
-#include <lustre_obdo.h>
-
-#include "osc_cl_internal.h"
-
-/** \addtogroup osc
- * @{
- */
-
-/*****************************************************************************
- *
- * Type conversions.
- *
- */
-
-static struct osc_io *cl2osc_io(const struct lu_env *env,
- const struct cl_io_slice *slice)
-{
- struct osc_io *oio = container_of0(slice, struct osc_io, oi_cl);
-
- LINVRNT(oio == osc_env_io(env));
- return oio;
-}
-
-/*****************************************************************************
- *
- * io operations.
- *
- */
-
-static void osc_io_fini(const struct lu_env *env, const struct cl_io_slice *io)
-{
-}
-
-static void osc_read_ahead_release(const struct lu_env *env, void *cbdata)
-{
- struct ldlm_lock *dlmlock = cbdata;
- struct lustre_handle lockh;
-
- ldlm_lock2handle(dlmlock, &lockh);
- ldlm_lock_decref(&lockh, LCK_PR);
- LDLM_LOCK_PUT(dlmlock);
-}
-
-static int osc_io_read_ahead(const struct lu_env *env,
- const struct cl_io_slice *ios,
- pgoff_t start, struct cl_read_ahead *ra)
-{
- struct osc_object *osc = cl2osc(ios->cis_obj);
- struct ldlm_lock *dlmlock;
- int result = -ENODATA;
-
- dlmlock = osc_dlmlock_at_pgoff(env, osc, start, 0);
- if (dlmlock) {
- LASSERT(dlmlock->l_ast_data == osc);
- if (dlmlock->l_req_mode != LCK_PR) {
- struct lustre_handle lockh;
-
- ldlm_lock2handle(dlmlock, &lockh);
- ldlm_lock_addref(&lockh, LCK_PR);
- ldlm_lock_decref(&lockh, dlmlock->l_req_mode);
- }
-
- ra->cra_rpc_size = osc_cli(osc)->cl_max_pages_per_rpc;
- ra->cra_end = cl_index(osc2cl(osc),
- dlmlock->l_policy_data.l_extent.end);
- ra->cra_release = osc_read_ahead_release;
- ra->cra_cbdata = dlmlock;
- result = 0;
- }
-
- return result;
-}
-
-/**
- * An implementation of the cl_io_operations::cio_io_submit() method for
- * the osc layer. Iterates over pages in the in-queue, prepares each for IO
- * by calling cl_page_prep(), submits it through osc_page_submit(), and
- * batches the prepared pages into sync-page RPCs via
- * osc_queue_sync_pages().
- */
-static int osc_io_submit(const struct lu_env *env,
- const struct cl_io_slice *ios,
- enum cl_req_type crt, struct cl_2queue *queue)
-{
- struct cl_page *page;
- struct cl_page *tmp;
- struct client_obd *cli = NULL;
- struct osc_object *osc = NULL; /* to keep gcc happy */
- struct osc_page *opg;
- struct cl_io *io;
- LIST_HEAD(list);
-
- struct cl_page_list *qin = &queue->c2_qin;
- struct cl_page_list *qout = &queue->c2_qout;
- unsigned int queued = 0;
- int result = 0;
- int cmd;
- int brw_flags;
- unsigned int max_pages;
-
- LASSERT(qin->pl_nr > 0);
-
- CDEBUG(D_CACHE | D_READA, "%d %d\n", qin->pl_nr, crt);
-
- osc = cl2osc(ios->cis_obj);
- cli = osc_cli(osc);
- max_pages = cli->cl_max_pages_per_rpc;
-
- cmd = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ;
- brw_flags = osc_io_srvlock(cl2osc_io(env, ios)) ? OBD_BRW_SRVLOCK : 0;
-
- /*
- * NOTE: here @page is a top-level page. This is done to avoid
- * creation of sub-page-list.
- */
- cl_page_list_for_each_safe(page, tmp, qin) {
- struct osc_async_page *oap;
-
- /* Top level IO. */
- io = page->cp_owner;
- LASSERT(io);
-
- opg = osc_cl_page_osc(page, osc);
- oap = &opg->ops_oap;
- LASSERT(osc == oap->oap_obj);
-
- if (!list_empty(&oap->oap_pending_item) ||
- !list_empty(&oap->oap_rpc_item)) {
- CDEBUG(D_CACHE, "Busy oap %p page %p for submit.\n",
- oap, opg);
- result = -EBUSY;
- break;
- }
-
- result = cl_page_prep(env, io, page, crt);
- if (result != 0) {
- LASSERT(result < 0);
- if (result != -EALREADY)
- break;
- /*
- * Handle -EALREADY error: for read case, the page is
- * already in UPTODATE state; for write, the page
- * is not dirty.
- */
- result = 0;
- continue;
- }
-
- spin_lock(&oap->oap_lock);
- oap->oap_async_flags = ASYNC_URGENT | ASYNC_READY;
- oap->oap_async_flags |= ASYNC_COUNT_STABLE;
- spin_unlock(&oap->oap_lock);
-
- osc_page_submit(env, opg, crt, brw_flags);
- list_add_tail(&oap->oap_pending_item, &list);
-
- if (page->cp_sync_io)
- cl_page_list_move(qout, qin, page);
- else /* async IO */
- cl_page_list_del(env, qin, page);
-
- if (++queued == max_pages) {
- queued = 0;
- result = osc_queue_sync_pages(env, osc, &list, cmd,
- brw_flags);
- if (result < 0)
- break;
- }
- }
-
- if (queued > 0)
- result = osc_queue_sync_pages(env, osc, &list, cmd, brw_flags);
-
- /* Update c/mtime for sync write. LU-7310 */
- if (qout->pl_nr > 0 && !result) {
- struct cl_attr *attr = &osc_env_info(env)->oti_attr;
- struct cl_object *obj = ios->cis_obj;
-
- cl_object_attr_lock(obj);
- attr->cat_mtime = ktime_get_real_seconds();
- attr->cat_ctime = attr->cat_mtime;
- cl_object_attr_update(env, obj, attr, CAT_MTIME | CAT_CTIME);
- cl_object_attr_unlock(obj);
- }
-
- CDEBUG(D_INFO, "%d/%d %d\n", qin->pl_nr, qout->pl_nr, result);
- return qout->pl_nr > 0 ? 0 : result;
-}
-
-/**
- * This is called when a page is accessed within a file in a way that
- * creates a new page, if one was missing (i.e., if there was a hole at
- * that place in the file, or the accessed page is beyond the current
- * file size).
- *
- * Expand the stripe KMS if necessary.
- */
-static void osc_page_touch_at(const struct lu_env *env,
- struct cl_object *obj, pgoff_t idx, size_t to)
-{
- struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo;
- struct cl_attr *attr = &osc_env_info(env)->oti_attr;
- int valid;
- __u64 kms;
-
- /* offset within stripe */
- kms = cl_offset(obj, idx) + to;
-
- cl_object_attr_lock(obj);
- /*
- * XXX old code used
- *
- * ll_inode_size_lock(inode, 0); lov_stripe_lock(lsm);
- *
- * here
- */
- CDEBUG(D_INODE, "stripe KMS %sincreasing %llu->%llu %llu\n",
- kms > loi->loi_kms ? "" : "not ", loi->loi_kms, kms,
- loi->loi_lvb.lvb_size);
-
- attr->cat_ctime = ktime_get_real_seconds();
- attr->cat_mtime = attr->cat_ctime;
- valid = CAT_MTIME | CAT_CTIME;
- if (kms > loi->loi_kms) {
- attr->cat_kms = kms;
- valid |= CAT_KMS;
- }
- if (kms > loi->loi_lvb.lvb_size) {
- attr->cat_size = kms;
- valid |= CAT_SIZE;
- }
- cl_object_attr_update(env, obj, attr, valid);
- cl_object_attr_unlock(obj);
-}
-
-static int osc_io_commit_async(const struct lu_env *env,
- const struct cl_io_slice *ios,
- struct cl_page_list *qin, int from, int to,
- cl_commit_cbt cb)
-{
- struct cl_io *io = ios->cis_io;
- struct osc_io *oio = cl2osc_io(env, ios);
- struct osc_object *osc = cl2osc(ios->cis_obj);
- struct cl_page *page;
- struct cl_page *last_page;
- struct osc_page *opg;
- int result = 0;
-
- LASSERT(qin->pl_nr > 0);
-
- /* Handle partial page cases */
- last_page = cl_page_list_last(qin);
- if (oio->oi_lockless) {
- page = cl_page_list_first(qin);
- if (page == last_page) {
- cl_page_clip(env, page, from, to);
- } else {
- if (from != 0)
- cl_page_clip(env, page, from, PAGE_SIZE);
- if (to != PAGE_SIZE)
- cl_page_clip(env, last_page, 0, to);
- }
- }
-
- while (qin->pl_nr > 0) {
- struct osc_async_page *oap;
-
- page = cl_page_list_first(qin);
- opg = osc_cl_page_osc(page, osc);
- oap = &opg->ops_oap;
-
- if (!list_empty(&oap->oap_rpc_item)) {
- CDEBUG(D_CACHE, "Busy oap %p page %p for submit.\n",
- oap, opg);
- result = -EBUSY;
- break;
- }
-
- /* The page may already be in the dirty cache. */
- if (list_empty(&oap->oap_pending_item)) {
- result = osc_page_cache_add(env, &opg->ops_cl, io);
- if (result != 0)
- break;
- }
-
- osc_page_touch_at(env, osc2cl(osc), osc_index(opg),
- page == last_page ? to : PAGE_SIZE);
-
- cl_page_list_del(env, qin, page);
-
- (*cb)(env, io, page);
- /* Can't access page any more. Page can be in transfer and
- * complete at any time.
- */
- }
-
- /* for sync write, the kernel will wait for this page to be flushed
- * before osc_io_end() is called, so release it earlier.
- * for mkwrite(), it's known there are no further pages.
- */
- if (cl_io_is_sync_write(io) && oio->oi_active) {
- osc_extent_release(env, oio->oi_active);
- oio->oi_active = NULL;
- }
-
- CDEBUG(D_INFO, "%d %d\n", qin->pl_nr, result);
- return result;
-}
-
-static int osc_io_iter_init(const struct lu_env *env,
- const struct cl_io_slice *ios)
-{
- struct osc_object *osc = cl2osc(ios->cis_obj);
- struct obd_import *imp = osc_cli(osc)->cl_import;
- int rc = -EIO;
-
- spin_lock(&imp->imp_lock);
- if (likely(!imp->imp_invalid)) {
- struct osc_io *oio = osc_env_io(env);
-
- atomic_inc(&osc->oo_nr_ios);
- oio->oi_is_active = 1;
- rc = 0;
- }
- spin_unlock(&imp->imp_lock);
-
- return rc;
-}
-
-static int osc_io_write_iter_init(const struct lu_env *env,
- const struct cl_io_slice *ios)
-{
- struct cl_io *io = ios->cis_io;
- struct osc_io *oio = osc_env_io(env);
- struct osc_object *osc = cl2osc(ios->cis_obj);
- unsigned long npages;
-
- if (cl_io_is_append(io))
- return osc_io_iter_init(env, ios);
-
- npages = io->u.ci_rw.crw_count >> PAGE_SHIFT;
- if (io->u.ci_rw.crw_pos & ~PAGE_MASK)
- ++npages;
-
- oio->oi_lru_reserved = osc_lru_reserve(osc_cli(osc), npages);
-
- return osc_io_iter_init(env, ios);
-}
-
-static void osc_io_iter_fini(const struct lu_env *env,
- const struct cl_io_slice *ios)
-{
- struct osc_io *oio = osc_env_io(env);
-
- if (oio->oi_is_active) {
- struct osc_object *osc = cl2osc(ios->cis_obj);
-
- oio->oi_is_active = 0;
- LASSERT(atomic_read(&osc->oo_nr_ios) > 0);
- if (atomic_dec_and_test(&osc->oo_nr_ios))
- wake_up_all(&osc->oo_io_waitq);
- }
-}
-
-static void osc_io_write_iter_fini(const struct lu_env *env,
- const struct cl_io_slice *ios)
-{
- struct osc_io *oio = osc_env_io(env);
- struct osc_object *osc = cl2osc(ios->cis_obj);
-
- if (oio->oi_lru_reserved > 0) {
- osc_lru_unreserve(osc_cli(osc), oio->oi_lru_reserved);
- oio->oi_lru_reserved = 0;
- }
- oio->oi_write_osclock = NULL;
-
- osc_io_iter_fini(env, ios);
-}
-
-static int osc_io_fault_start(const struct lu_env *env,
- const struct cl_io_slice *ios)
-{
- struct cl_io *io;
- struct cl_fault_io *fio;
-
- io = ios->cis_io;
- fio = &io->u.ci_fault;
- CDEBUG(D_INFO, "%lu %d %zu\n",
- fio->ft_index, fio->ft_writable, fio->ft_nob);
- /*
- * If mapping is writeable, adjust kms to cover this page,
- * but do not extend kms beyond actual file size.
- * See bug 10919.
- */
- if (fio->ft_writable)
- osc_page_touch_at(env, ios->cis_obj,
- fio->ft_index, fio->ft_nob);
- return 0;
-}
-
-static int osc_async_upcall(void *a, int rc)
-{
- struct osc_async_cbargs *args = a;
-
- args->opc_rc = rc;
- complete(&args->opc_sync);
- return 0;
-}
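-
-/*
- * The pairing for this upcall, as used by the setattr path below (a sketch
- * distilled from osc_io_setattr_start()/osc_io_setattr_end(), not new
- * code):
- *
- *	init_completion(&cbargs->opc_sync);
- *	result = osc_punch_base(exp, oa, osc_async_upcall, cbargs,
- *				PTLRPCD_SET);
- *	cbargs->opc_rpc_sent = result == 0;
- *	...
- *	if (cbargs->opc_rpc_sent) {
- *		wait_for_completion(&cbargs->opc_sync);
- *		result = cbargs->opc_rc;
- *	}
- */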
-
-/**
- * Checks that there are no pages being written in the extent being truncated.
- */
-static int trunc_check_cb(const struct lu_env *env, struct cl_io *io,
- struct osc_page *ops, void *cbdata)
-{
- struct cl_page *page = ops->ops_cl.cpl_page;
- struct osc_async_page *oap;
- __u64 start = *(__u64 *)cbdata;
-
- oap = &ops->ops_oap;
- if (oap->oap_cmd & OBD_BRW_WRITE &&
- !list_empty(&oap->oap_pending_item))
- CL_PAGE_DEBUG(D_ERROR, env, page, "exists %llu/%s.\n",
- start, current->comm);
-
- if (PageLocked(page->cp_vmpage))
- CDEBUG(D_CACHE, "page %p index %lu locked for %d.\n",
- ops, osc_index(ops), oap->oap_cmd & OBD_BRW_RWMASK);
-
- return CLP_GANG_OKAY;
-}
-
-static void osc_trunc_check(const struct lu_env *env, struct cl_io *io,
- struct osc_io *oio, __u64 size)
-{
- struct cl_object *clob;
- int partial;
- pgoff_t start;
-
- clob = oio->oi_cl.cis_obj;
- start = cl_index(clob, size);
- partial = cl_offset(clob, start) < size;
-
- /*
- * Complain if there are pages in the truncated region.
- */
- osc_page_gang_lookup(env, io, cl2osc(clob),
- start + partial, CL_PAGE_EOF,
- trunc_check_cb, (void *)&size);
-}
-
-static int osc_io_setattr_start(const struct lu_env *env,
- const struct cl_io_slice *slice)
-{
- struct cl_io *io = slice->cis_io;
- struct osc_io *oio = cl2osc_io(env, slice);
- struct cl_object *obj = slice->cis_obj;
- struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo;
- struct cl_attr *attr = &osc_env_info(env)->oti_attr;
- struct obdo *oa = &oio->oi_oa;
- struct osc_async_cbargs *cbargs = &oio->oi_cbarg;
- __u64 size = io->u.ci_setattr.sa_attr.lvb_size;
- unsigned int ia_valid = io->u.ci_setattr.sa_valid;
- int result = 0;
-
- /* truncate cache dirty pages first */
- if (cl_io_is_trunc(io))
- result = osc_cache_truncate_start(env, cl2osc(obj), size,
- &oio->oi_trunc);
-
- if (result == 0 && oio->oi_lockless == 0) {
- cl_object_attr_lock(obj);
- result = cl_object_attr_get(env, obj, attr);
- if (result == 0) {
- struct ost_lvb *lvb = &io->u.ci_setattr.sa_attr;
- unsigned int cl_valid = 0;
-
- if (ia_valid & ATTR_SIZE) {
- attr->cat_size = size;
- attr->cat_kms = size;
- cl_valid = CAT_SIZE | CAT_KMS;
- }
- if (ia_valid & ATTR_MTIME_SET) {
- attr->cat_mtime = lvb->lvb_mtime;
- cl_valid |= CAT_MTIME;
- }
- if (ia_valid & ATTR_ATIME_SET) {
- attr->cat_atime = lvb->lvb_atime;
- cl_valid |= CAT_ATIME;
- }
- if (ia_valid & ATTR_CTIME_SET) {
- attr->cat_ctime = lvb->lvb_ctime;
- cl_valid |= CAT_CTIME;
- }
- result = cl_object_attr_update(env, obj, attr,
- cl_valid);
- }
- cl_object_attr_unlock(obj);
- }
- memset(oa, 0, sizeof(*oa));
- if (result == 0) {
- oa->o_oi = loi->loi_oi;
- obdo_set_parent_fid(oa, io->u.ci_setattr.sa_parent_fid);
- oa->o_stripe_idx = io->u.ci_setattr.sa_stripe_index;
- oa->o_valid |= OBD_MD_FLID | OBD_MD_FLGROUP;
- if (ia_valid & ATTR_CTIME) {
- oa->o_valid |= OBD_MD_FLCTIME;
- oa->o_ctime = attr->cat_ctime;
- }
- if (ia_valid & ATTR_ATIME) {
- oa->o_valid |= OBD_MD_FLATIME;
- oa->o_atime = attr->cat_atime;
- }
- if (ia_valid & ATTR_MTIME) {
- oa->o_valid |= OBD_MD_FLMTIME;
- oa->o_mtime = attr->cat_mtime;
- }
- if (ia_valid & ATTR_SIZE) {
- oa->o_size = size;
- oa->o_blocks = OBD_OBJECT_EOF;
- oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
-
- if (oio->oi_lockless) {
- oa->o_flags = OBD_FL_SRVLOCK;
- oa->o_valid |= OBD_MD_FLFLAGS;
- }
- } else {
- LASSERT(oio->oi_lockless == 0);
- }
- if (ia_valid & ATTR_ATTR_FLAG) {
- oa->o_flags = io->u.ci_setattr.sa_attr_flags;
- oa->o_valid |= OBD_MD_FLFLAGS;
- }
-
- init_completion(&cbargs->opc_sync);
-
- if (ia_valid & ATTR_SIZE)
- result = osc_punch_base(osc_export(cl2osc(obj)),
- oa, osc_async_upcall,
- cbargs, PTLRPCD_SET);
- else
- result = osc_setattr_async(osc_export(cl2osc(obj)),
- oa, osc_async_upcall,
- cbargs, PTLRPCD_SET);
- cbargs->opc_rpc_sent = result == 0;
- }
- return result;
-}
-
-static void osc_io_setattr_end(const struct lu_env *env,
- const struct cl_io_slice *slice)
-{
- struct cl_io *io = slice->cis_io;
- struct osc_io *oio = cl2osc_io(env, slice);
- struct cl_object *obj = slice->cis_obj;
- struct osc_async_cbargs *cbargs = &oio->oi_cbarg;
- int result = 0;
-
- if (cbargs->opc_rpc_sent) {
- wait_for_completion(&cbargs->opc_sync);
- result = cbargs->opc_rc;
- io->ci_result = cbargs->opc_rc;
- }
- if (result == 0) {
- if (oio->oi_lockless) {
- /* lockless truncate */
- struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev);
-
- LASSERT(cl_io_is_trunc(io));
- /* XXX: Need a lock. */
- osd->od_stats.os_lockless_truncates++;
- }
- }
-
- if (cl_io_is_trunc(io)) {
- __u64 size = io->u.ci_setattr.sa_attr.lvb_size;
-
- osc_trunc_check(env, io, oio, size);
- osc_cache_truncate_end(env, oio->oi_trunc);
- oio->oi_trunc = NULL;
- }
-}
-
-struct osc_data_version_args {
- struct osc_io *dva_oio;
-};
-
-static int
-osc_data_version_interpret(const struct lu_env *env, struct ptlrpc_request *req,
- void *arg, int rc)
-{
- struct osc_data_version_args *dva = arg;
- struct osc_io *oio = dva->dva_oio;
- const struct ost_body *body;
-
- if (rc < 0)
- goto out;
-
- body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
- if (!body) {
- rc = -EPROTO;
- goto out;
- }
-
- lustre_get_wire_obdo(&req->rq_import->imp_connect_data, &oio->oi_oa,
- &body->oa);
-out:
- oio->oi_cbarg.opc_rc = rc;
- complete(&oio->oi_cbarg.opc_sync);
-
- return 0;
-}
-
-static int osc_io_data_version_start(const struct lu_env *env,
- const struct cl_io_slice *slice)
-{
- struct cl_data_version_io *dv = &slice->cis_io->u.ci_data_version;
- struct osc_io *oio = cl2osc_io(env, slice);
- struct osc_async_cbargs *cbargs = &oio->oi_cbarg;
- struct osc_object *obj = cl2osc(slice->cis_obj);
- struct obd_export *exp = osc_export(obj);
- struct lov_oinfo *loi = obj->oo_oinfo;
- struct osc_data_version_args *dva;
- struct obdo *oa = &oio->oi_oa;
- struct ptlrpc_request *req;
- struct ost_body *body;
- int rc;
-
- memset(oa, 0, sizeof(*oa));
- oa->o_oi = loi->loi_oi;
- oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
-
- if (dv->dv_flags & (LL_DV_RD_FLUSH | LL_DV_WR_FLUSH)) {
- oa->o_valid |= OBD_MD_FLFLAGS;
- oa->o_flags |= OBD_FL_SRVLOCK;
- if (dv->dv_flags & LL_DV_WR_FLUSH)
- oa->o_flags |= OBD_FL_FLUSH;
- }
-
- init_completion(&cbargs->opc_sync);
-
- req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GETATTR);
- if (!req)
- return -ENOMEM;
-
- rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GETATTR);
- if (rc < 0) {
- ptlrpc_request_free(req);
- return rc;
- }
-
- body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
- lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
-
- ptlrpc_request_set_replen(req);
- req->rq_interpret_reply = osc_data_version_interpret;
- BUILD_BUG_ON(sizeof(*dva) > sizeof(req->rq_async_args));
- dva = ptlrpc_req_async_args(req);
- dva->dva_oio = oio;
-
- ptlrpcd_add_req(req);
-
- return 0;
-}
-
-static void osc_io_data_version_end(const struct lu_env *env,
- const struct cl_io_slice *slice)
-{
- struct cl_data_version_io *dv = &slice->cis_io->u.ci_data_version;
- struct osc_io *oio = cl2osc_io(env, slice);
- struct osc_async_cbargs *cbargs = &oio->oi_cbarg;
-
- wait_for_completion(&cbargs->opc_sync);
-
- if (cbargs->opc_rc) {
- slice->cis_io->ci_result = cbargs->opc_rc;
- } else if (!(oio->oi_oa.o_valid & OBD_MD_FLDATAVERSION)) {
- slice->cis_io->ci_result = -EOPNOTSUPP;
- } else {
- dv->dv_data_version = oio->oi_oa.o_data_version;
- slice->cis_io->ci_result = 0;
- }
-}
-
-static int osc_io_read_start(const struct lu_env *env,
- const struct cl_io_slice *slice)
-{
- struct cl_object *obj = slice->cis_obj;
- struct cl_attr *attr = &osc_env_info(env)->oti_attr;
- int rc = 0;
-
- if (!slice->cis_io->ci_noatime) {
- cl_object_attr_lock(obj);
- attr->cat_atime = ktime_get_real_seconds();
- rc = cl_object_attr_update(env, obj, attr, CAT_ATIME);
- cl_object_attr_unlock(obj);
- }
- return rc;
-}
-
-static int osc_io_write_start(const struct lu_env *env,
- const struct cl_io_slice *slice)
-{
- struct cl_object *obj = slice->cis_obj;
- struct cl_attr *attr = &osc_env_info(env)->oti_attr;
- int rc = 0;
-
- OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_SETTIME, 1);
- cl_object_attr_lock(obj);
- attr->cat_ctime = ktime_get_real_seconds();
- attr->cat_mtime = attr->cat_ctime;
- rc = cl_object_attr_update(env, obj, attr, CAT_MTIME | CAT_CTIME);
- cl_object_attr_unlock(obj);
-
- return rc;
-}
-
-static int osc_fsync_ost(const struct lu_env *env, struct osc_object *obj,
- struct cl_fsync_io *fio)
-{
- struct osc_io *oio = osc_env_io(env);
- struct obdo *oa = &oio->oi_oa;
- struct lov_oinfo *loi = obj->oo_oinfo;
- struct osc_async_cbargs *cbargs = &oio->oi_cbarg;
- int rc = 0;
-
- memset(oa, 0, sizeof(*oa));
- oa->o_oi = loi->loi_oi;
- oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
-
-	/* reload size and blocks for start and end of sync range */
- oa->o_size = fio->fi_start;
- oa->o_blocks = fio->fi_end;
- oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
-
- obdo_set_parent_fid(oa, fio->fi_fid);
-
- init_completion(&cbargs->opc_sync);
-
- rc = osc_sync_base(obj, oa, osc_async_upcall, cbargs, PTLRPCD_SET);
- return rc;
-}
-
-static int osc_io_fsync_start(const struct lu_env *env,
- const struct cl_io_slice *slice)
-{
- struct cl_io *io = slice->cis_io;
- struct cl_fsync_io *fio = &io->u.ci_fsync;
- struct cl_object *obj = slice->cis_obj;
- struct osc_object *osc = cl2osc(obj);
- pgoff_t start = cl_index(obj, fio->fi_start);
- pgoff_t end = cl_index(obj, fio->fi_end);
- int result = 0;
-
- if (fio->fi_end == OBD_OBJECT_EOF)
- end = CL_PAGE_EOF;
-
- result = osc_cache_writeback_range(env, osc, start, end, 0,
- fio->fi_mode == CL_FSYNC_DISCARD);
- if (result > 0) {
- fio->fi_nr_written += result;
- result = 0;
- }
- if (fio->fi_mode == CL_FSYNC_ALL) {
- int rc;
-
-		/* we have to wait for writeback to finish before we can
-		 * send the OST_SYNC RPC. This is bad because it causes
-		 * extents to be written out one OSC at a time. However, we
-		 * usually start writeback before CL_FSYNC_ALL, so in
-		 * practice this is not a real problem.
-		 */
- rc = osc_cache_wait_range(env, osc, start, end);
- if (result == 0)
- result = rc;
- rc = osc_fsync_ost(env, osc, fio);
- if (result == 0)
- result = rc;
- }
-
- return result;
-}
-
-static void osc_io_fsync_end(const struct lu_env *env,
- const struct cl_io_slice *slice)
-{
- struct cl_fsync_io *fio = &slice->cis_io->u.ci_fsync;
- struct cl_object *obj = slice->cis_obj;
- pgoff_t start = cl_index(obj, fio->fi_start);
- pgoff_t end = cl_index(obj, fio->fi_end);
- int result = 0;
-
- if (fio->fi_mode == CL_FSYNC_LOCAL) {
- result = osc_cache_wait_range(env, cl2osc(obj), start, end);
- } else if (fio->fi_mode == CL_FSYNC_ALL) {
- struct osc_io *oio = cl2osc_io(env, slice);
- struct osc_async_cbargs *cbargs = &oio->oi_cbarg;
-
- wait_for_completion(&cbargs->opc_sync);
- if (result == 0)
- result = cbargs->opc_rc;
- }
- slice->cis_io->ci_result = result;
-}
-
-static void osc_io_end(const struct lu_env *env,
- const struct cl_io_slice *slice)
-{
- struct osc_io *oio = cl2osc_io(env, slice);
-
- if (oio->oi_active) {
- osc_extent_release(env, oio->oi_active);
- oio->oi_active = NULL;
- }
-}
-
-static const struct cl_io_operations osc_io_ops = {
- .op = {
- [CIT_READ] = {
- .cio_iter_init = osc_io_iter_init,
- .cio_iter_fini = osc_io_iter_fini,
- .cio_start = osc_io_read_start,
- .cio_fini = osc_io_fini
- },
- [CIT_WRITE] = {
- .cio_iter_init = osc_io_write_iter_init,
- .cio_iter_fini = osc_io_write_iter_fini,
- .cio_start = osc_io_write_start,
- .cio_end = osc_io_end,
- .cio_fini = osc_io_fini
- },
- [CIT_SETATTR] = {
- .cio_iter_init = osc_io_iter_init,
- .cio_iter_fini = osc_io_iter_fini,
- .cio_start = osc_io_setattr_start,
- .cio_end = osc_io_setattr_end
- },
- [CIT_DATA_VERSION] = {
- .cio_start = osc_io_data_version_start,
- .cio_end = osc_io_data_version_end,
- },
- [CIT_FAULT] = {
- .cio_iter_init = osc_io_iter_init,
- .cio_iter_fini = osc_io_iter_fini,
- .cio_start = osc_io_fault_start,
- .cio_end = osc_io_end,
- .cio_fini = osc_io_fini
- },
- [CIT_FSYNC] = {
- .cio_start = osc_io_fsync_start,
- .cio_end = osc_io_fsync_end,
- .cio_fini = osc_io_fini
- },
- [CIT_MISC] = {
- .cio_fini = osc_io_fini
- }
- },
- .cio_read_ahead = osc_io_read_ahead,
- .cio_submit = osc_io_submit,
- .cio_commit_async = osc_io_commit_async
-};
-
-/*****************************************************************************
- *
- * Transfer operations.
- *
- */
-
-int osc_io_init(const struct lu_env *env,
- struct cl_object *obj, struct cl_io *io)
-{
- struct osc_io *oio = osc_env_io(env);
-
- CL_IO_SLICE_CLEAN(oio, oi_cl);
- cl_io_slice_add(io, &oio->oi_cl, obj, &osc_io_ops);
- return 0;
-}
-
-/** @} osc */
diff --git a/drivers/staging/lustre/lustre/osc/osc_lock.c b/drivers/staging/lustre/lustre/osc/osc_lock.c
deleted file mode 100644
index fe8ed0d0497a..000000000000
--- a/drivers/staging/lustre/lustre/osc/osc_lock.c
+++ /dev/null
@@ -1,1231 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.gnu.org/licenses/gpl-2.0.html
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2011, 2015, Intel Corporation.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * Implementation of cl_lock for OSC layer.
- *
- * Author: Nikita Danilov <nikita.danilov@sun.com>
- * Author: Jinshan Xiong <jinshan.xiong@intel.com>
- */
-
-#define DEBUG_SUBSYSTEM S_OSC
-
-#include <linux/libcfs/libcfs.h>
-/* fid_build_reg_res_name() */
-#include <lustre_fid.h>
-
-#include "osc_cl_internal.h"
-
-/** \addtogroup osc
- * @{
- */
-
-/*****************************************************************************
- *
- * Type conversions.
- *
- */
-
-static const struct cl_lock_operations osc_lock_ops;
-static const struct cl_lock_operations osc_lock_lockless_ops;
-static void osc_lock_to_lockless(const struct lu_env *env,
- struct osc_lock *ols, int force);
-
-int osc_lock_is_lockless(const struct osc_lock *olck)
-{
- return (olck->ols_cl.cls_ops == &osc_lock_lockless_ops);
-}
-
-/**
- * Returns a weak pointer to the ldlm lock identified by a handle. Returned
- * pointer cannot be dereferenced, as lock is not protected from concurrent
- * reclaim. This function is a helper for osc_lock_invariant().
- */
-static struct ldlm_lock *osc_handle_ptr(struct lustre_handle *handle)
-{
- struct ldlm_lock *lock;
-
- lock = ldlm_handle2lock(handle);
- if (lock)
- LDLM_LOCK_PUT(lock);
- return lock;
-}
-
-/**
- * Invariant that has to be true all of the time.
- */
-static int osc_lock_invariant(struct osc_lock *ols)
-{
- struct ldlm_lock *lock = osc_handle_ptr(&ols->ols_handle);
- struct ldlm_lock *olock = ols->ols_dlmlock;
- int handle_used = lustre_handle_is_used(&ols->ols_handle);
-
- if (ergo(osc_lock_is_lockless(ols),
- ols->ols_locklessable && !ols->ols_dlmlock))
- return 1;
-
- /*
- * If all the following "ergo"s are true, return 1, otherwise 0
- */
- if (!ergo(olock, handle_used))
- return 0;
-
- if (!ergo(olock, olock->l_handle.h_cookie == ols->ols_handle.cookie))
- return 0;
-
- if (!ergo(handle_used,
- ergo(lock && olock, lock == olock) &&
- ergo(!lock, !olock)))
- return 0;
- /*
- * Check that ->ols_handle and ->ols_dlmlock are consistent, but
-	 * take into account that they are set at different times.
- */
- if (!ergo(ols->ols_state == OLS_CANCELLED,
- !olock && !handle_used))
- return 0;
- /*
- * DLM lock is destroyed only after we have seen cancellation
- * ast.
- */
- if (!ergo(olock && ols->ols_state < OLS_CANCELLED,
- !ldlm_is_destroyed(olock)))
- return 0;
-
- if (!ergo(ols->ols_state == OLS_GRANTED,
- olock && olock->l_req_mode == olock->l_granted_mode &&
- ols->ols_hold))
- return 0;
- return 1;
-}
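
The invariant above leans on the ergo()/equi() helpers from libcfs, which spell out logical implication and equivalence so that each consistency rule reads as one line. A minimal standalone sketch of how such an invariant check reads, assuming the usual definitions:

    #include <assert.h>
    #include <stdbool.h>

    #define ergo(a, b) (!(a) || (b))        /* "a implies b" */
    #define equi(a, b) (!!(a) == !!(b))     /* "a if and only if b" */

    int main(void)
    {
            bool olock = true;              /* ols_dlmlock is set */
            bool handle_used = true;        /* ols_handle is in use */

            /* "if we point at a dlm lock, the handle must be in use" */
            assert(ergo(olock, handle_used));
            /* "lock pointer and handle are both set or both clear" */
            assert(equi(olock, handle_used));
            return 0;
    }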
-
-/*****************************************************************************
- *
- * Lock operations.
- *
- */
-
-static void osc_lock_fini(const struct lu_env *env,
- struct cl_lock_slice *slice)
-{
- struct osc_lock *ols = cl2osc_lock(slice);
-
- LINVRNT(osc_lock_invariant(ols));
- LASSERT(!ols->ols_dlmlock);
-
- kmem_cache_free(osc_lock_kmem, ols);
-}
-
-static void osc_lock_build_policy(const struct lu_env *env,
- const struct cl_lock *lock,
- union ldlm_policy_data *policy)
-{
- const struct cl_lock_descr *d = &lock->cll_descr;
-
- osc_index2policy(policy, d->cld_obj, d->cld_start, d->cld_end);
- policy->l_extent.gid = d->cld_gid;
-}
-
-static __u64 osc_enq2ldlm_flags(__u32 enqflags)
-{
- __u64 result = 0;
-
- LASSERT((enqflags & ~CEF_MASK) == 0);
-
- if (enqflags & CEF_NONBLOCK)
- result |= LDLM_FL_BLOCK_NOWAIT;
- if (enqflags & CEF_ASYNC)
- result |= LDLM_FL_HAS_INTENT;
- if (enqflags & CEF_DISCARD_DATA)
- result |= LDLM_FL_AST_DISCARD_DATA;
- if (enqflags & CEF_PEEK)
- result |= LDLM_FL_TEST_LOCK;
- if (enqflags & CEF_LOCK_MATCH)
- result |= LDLM_FL_MATCH_LOCK;
- return result;
-}
-
-/**
- * Updates object attributes from a lock value block (lvb) received together
- * with the DLM lock reply from the server. Copy of osc_update_enqueue()
- * logic.
- *
- * This can be optimized to not update attributes when lock is a result of a
- * local match.
- *
- * Called under lock and resource spin-locks.
- */
-static void osc_lock_lvb_update(const struct lu_env *env,
- struct osc_object *osc,
- struct ldlm_lock *dlmlock,
- struct ost_lvb *lvb)
-{
- struct cl_object *obj = osc2cl(osc);
- struct lov_oinfo *oinfo = osc->oo_oinfo;
- struct cl_attr *attr = &osc_env_info(env)->oti_attr;
- unsigned int valid;
-
- valid = CAT_BLOCKS | CAT_ATIME | CAT_CTIME | CAT_MTIME | CAT_SIZE;
- if (!lvb)
- lvb = dlmlock->l_lvb_data;
-
- cl_lvb2attr(attr, lvb);
-
- cl_object_attr_lock(obj);
- if (dlmlock) {
- __u64 size;
-
- check_res_locked(dlmlock->l_resource);
- LASSERT(lvb == dlmlock->l_lvb_data);
- size = lvb->lvb_size;
-
-		/* Extend KMS up to the end of this lock and no further.
-		 * A lock on [x, y] means a KMS of up to y + 1 bytes!
- */
- if (size > dlmlock->l_policy_data.l_extent.end)
- size = dlmlock->l_policy_data.l_extent.end + 1;
- if (size >= oinfo->loi_kms) {
- LDLM_DEBUG(dlmlock, "lock acquired, setting rss=%llu, kms=%llu",
- lvb->lvb_size, size);
- valid |= CAT_KMS;
- attr->cat_kms = size;
- } else {
- LDLM_DEBUG(dlmlock, "lock acquired, setting rss=%llu; leaving kms=%llu, end=%llu",
- lvb->lvb_size, oinfo->loi_kms,
- dlmlock->l_policy_data.l_extent.end);
- }
- ldlm_lock_allow_match_locked(dlmlock);
- }
-
- cl_object_attr_update(env, obj, attr, valid);
- cl_object_attr_unlock(obj);
-}
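
The KMS clamp above is the subtle part: a granted lock on [start, end] lets the client trust at most end + 1 bytes of the server-reported size, and KMS only ever grows here. A minimal sketch of just that arithmetic (kms_from_lock() is a hypothetical helper, not part of the driver):

    #include <stdio.h>

    typedef unsigned long long u64;

    /* hypothetical helper mirroring the clamp performed above */
    static u64 kms_from_lock(u64 lvb_size, u64 lock_end, u64 old_kms)
    {
            u64 size = lvb_size;

            /* a lock on [start, end] covers only end + 1 bytes */
            if (size > lock_end)
                    size = lock_end + 1;
            /* only grow: a smaller estimate leaves the old kms alone */
            return size >= old_kms ? size : old_kms;
    }

    int main(void)
    {
            /* server says 10000 bytes, but our lock ends at offset 4095 */
            printf("%llu\n", kms_from_lock(10000, 4095, 0));   /* 4096 */
            /* a full-object lock lets kms follow the lvb size */
            printf("%llu\n", kms_from_lock(10000, ~0ULL, 0));  /* 10000 */
            return 0;
    }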
-
-static void osc_lock_granted(const struct lu_env *env, struct osc_lock *oscl,
- struct lustre_handle *lockh, bool lvb_update)
-{
- struct ldlm_lock *dlmlock;
-
- dlmlock = ldlm_handle2lock_long(lockh, 0);
- LASSERT(dlmlock);
-
- /* lock reference taken by ldlm_handle2lock_long() is
- * owned by osc_lock and released in osc_lock_detach()
- */
- lu_ref_add(&dlmlock->l_reference, "osc_lock", oscl);
- oscl->ols_has_ref = 1;
-
- LASSERT(!oscl->ols_dlmlock);
- oscl->ols_dlmlock = dlmlock;
-
-	/* This may be a matched lock for a glimpse request; do not hold
-	 * a lock reference in that case.
- */
- if (!oscl->ols_glimpse) {
-		/* hold a reference for a non-glimpse lock; it will
-		 * be released in osc_lock_cancel()
- */
- lustre_handle_copy(&oscl->ols_handle, lockh);
- ldlm_lock_addref(lockh, oscl->ols_einfo.ei_mode);
- oscl->ols_hold = 1;
- }
-
- /* Lock must have been granted. */
- lock_res_and_lock(dlmlock);
- if (dlmlock->l_granted_mode == dlmlock->l_req_mode) {
- struct ldlm_extent *ext = &dlmlock->l_policy_data.l_extent;
- struct cl_lock_descr *descr = &oscl->ols_cl.cls_lock->cll_descr;
-
-		/* extend the lock extent, otherwise there will be problems
-		 * when we decide whether to grant a lockless lock.
- */
- descr->cld_mode = osc_ldlm2cl_lock(dlmlock->l_granted_mode);
- descr->cld_start = cl_index(descr->cld_obj, ext->start);
- descr->cld_end = cl_index(descr->cld_obj, ext->end);
- descr->cld_gid = ext->gid;
-
- /* no lvb update for matched lock */
- if (lvb_update) {
- LASSERT(oscl->ols_flags & LDLM_FL_LVB_READY);
- osc_lock_lvb_update(env, cl2osc(oscl->ols_cl.cls_obj),
- dlmlock, NULL);
- }
- LINVRNT(osc_lock_invariant(oscl));
- }
- unlock_res_and_lock(dlmlock);
-
- LASSERT(oscl->ols_state != OLS_GRANTED);
- oscl->ols_state = OLS_GRANTED;
-}
-
-/**
- * Lock upcall function that is executed either when a reply to ENQUEUE rpc is
- * received from a server, or after osc_enqueue_base() matched a local DLM
- * lock.
- */
-static int osc_lock_upcall(void *cookie, struct lustre_handle *lockh,
- int errcode)
-{
- struct osc_lock *oscl = cookie;
- struct cl_lock_slice *slice = &oscl->ols_cl;
- struct lu_env *env;
- int rc;
- u16 refcheck;
-
- env = cl_env_get(&refcheck);
- /* should never happen, similar to osc_ldlm_blocking_ast(). */
- LASSERT(!IS_ERR(env));
-
- rc = ldlm_error2errno(errcode);
- if (oscl->ols_state == OLS_ENQUEUED) {
- oscl->ols_state = OLS_UPCALL_RECEIVED;
- } else if (oscl->ols_state == OLS_CANCELLED) {
- rc = -EIO;
- } else {
- CERROR("Impossible state: %d\n", oscl->ols_state);
- LBUG();
- }
-
- if (rc == 0)
- osc_lock_granted(env, oscl, lockh, errcode == ELDLM_OK);
-
- /* Error handling, some errors are tolerable. */
- if (oscl->ols_locklessable && rc == -EUSERS) {
-		/* This is a tolerable error; turn this lock into
-		 * a lockless lock.
- */
- osc_object_set_contended(cl2osc(slice->cls_obj));
- LASSERT(slice->cls_ops == &osc_lock_ops);
-
-		/* Change this lock to a lockless lock (no ldlm lock). */
- osc_lock_to_lockless(env, oscl, 1);
- oscl->ols_state = OLS_GRANTED;
- rc = 0;
- } else if (oscl->ols_glimpse && rc == -ENAVAIL) {
- LASSERT(oscl->ols_flags & LDLM_FL_LVB_READY);
- osc_lock_lvb_update(env, cl2osc(slice->cls_obj),
- NULL, &oscl->ols_lvb);
- /* Hide the error. */
- rc = 0;
- }
-
- if (oscl->ols_owner)
- cl_sync_io_note(env, oscl->ols_owner, rc);
- cl_env_put(env, &refcheck);
-
- return rc;
-}
-
-static int osc_lock_upcall_agl(void *cookie, struct lustre_handle *lockh,
- int errcode)
-{
- struct osc_object *osc = cookie;
- struct ldlm_lock *dlmlock;
- struct lu_env *env;
- u16 refcheck;
-
- env = cl_env_get(&refcheck);
- LASSERT(!IS_ERR(env));
-
- if (errcode == ELDLM_LOCK_MATCHED) {
- errcode = ELDLM_OK;
- goto out;
- }
-
- if (errcode != ELDLM_OK)
- goto out;
-
- dlmlock = ldlm_handle2lock(lockh);
- LASSERT(dlmlock);
-
- lock_res_and_lock(dlmlock);
- LASSERT(dlmlock->l_granted_mode == dlmlock->l_req_mode);
-
- /* there is no osc_lock associated with AGL lock */
- osc_lock_lvb_update(env, osc, dlmlock, NULL);
-
- unlock_res_and_lock(dlmlock);
- LDLM_LOCK_PUT(dlmlock);
-
-out:
- cl_object_put(env, osc2cl(osc));
- cl_env_put(env, &refcheck);
- return ldlm_error2errno(errcode);
-}
-
-static int osc_lock_flush(struct osc_object *obj, pgoff_t start, pgoff_t end,
- enum cl_lock_mode mode, int discard)
-{
- struct lu_env *env;
- u16 refcheck;
- int rc = 0;
- int rc2 = 0;
-
- env = cl_env_get(&refcheck);
- if (IS_ERR(env))
- return PTR_ERR(env);
-
- if (mode == CLM_WRITE) {
- rc = osc_cache_writeback_range(env, obj, start, end, 1,
- discard);
- CDEBUG(D_CACHE, "object %p: [%lu -> %lu] %d pages were %s.\n",
- obj, start, end, rc,
- discard ? "discarded" : "written back");
- if (rc > 0)
- rc = 0;
- }
-
- rc2 = osc_lock_discard_pages(env, obj, start, end, mode);
- if (rc == 0 && rc2 < 0)
- rc = rc2;
-
- cl_env_put(env, &refcheck);
- return rc;
-}
-
-/**
- * Helper for osc_dlm_blocking_ast() handling discrepancies between cl_lock
- * and ldlm_lock caches.
- */
-static int osc_dlm_blocking_ast0(const struct lu_env *env,
- struct ldlm_lock *dlmlock,
- void *data, int flag)
-{
- struct cl_object *obj = NULL;
- int result = 0;
- int discard;
- enum cl_lock_mode mode = CLM_READ;
-
- LASSERT(flag == LDLM_CB_CANCELING);
-
- lock_res_and_lock(dlmlock);
- if (dlmlock->l_granted_mode != dlmlock->l_req_mode) {
- dlmlock->l_ast_data = NULL;
- unlock_res_and_lock(dlmlock);
- return 0;
- }
-
- discard = ldlm_is_discard_data(dlmlock);
- if (dlmlock->l_granted_mode & (LCK_PW | LCK_GROUP))
- mode = CLM_WRITE;
-
- if (dlmlock->l_ast_data) {
- obj = osc2cl(dlmlock->l_ast_data);
- dlmlock->l_ast_data = NULL;
-
- cl_object_get(obj);
- }
-
- unlock_res_and_lock(dlmlock);
-
- /* if l_ast_data is NULL, the dlmlock was enqueued by AGL or
- * the object has been destroyed.
- */
- if (obj) {
- struct ldlm_extent *extent = &dlmlock->l_policy_data.l_extent;
- struct cl_attr *attr = &osc_env_info(env)->oti_attr;
- __u64 old_kms;
-
- /* Destroy pages covered by the extent of the DLM lock */
- result = osc_lock_flush(cl2osc(obj),
- cl_index(obj, extent->start),
- cl_index(obj, extent->end),
- mode, discard);
-
- /* losing a lock, update kms */
- lock_res_and_lock(dlmlock);
- cl_object_attr_lock(obj);
- /* Must get the value under the lock to avoid race. */
- old_kms = cl2osc(obj)->oo_oinfo->loi_kms;
-		/* Update the kms. This needs to loop over all granted locks,
-		 * which is not a problem on the client side.
- */
- attr->cat_kms = ldlm_extent_shift_kms(dlmlock, old_kms);
-
- cl_object_attr_update(env, obj, attr, CAT_KMS);
- cl_object_attr_unlock(obj);
- unlock_res_and_lock(dlmlock);
-
- cl_object_put(env, obj);
- }
- return result;
-}
-
-/**
- * Blocking ast invoked by ldlm when dlm lock is either blocking progress of
- * some other lock, or is canceled. This function is installed as a
- * ldlm_lock::l_blocking_ast() for client extent locks.
- *
- * Control flow is tricky, because ldlm uses the same call-back
- * (ldlm_lock::l_blocking_ast()) for both blocking and cancellation ast's.
- *
- * \param dlmlock lock for which ast occurred.
- *
- * \param new description of a conflicting lock in case of blocking ast.
- *
- * \param data value of dlmlock->l_ast_data
- *
- * \param flag LDLM_CB_BLOCKING or LDLM_CB_CANCELING. Used to distinguish
- * cancellation and blocking ast's.
- *
- * Possible use cases:
- *
- * - ldlm calls dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) to cancel
- * lock due to lock lru pressure, or explicit user request to purge
- * locks.
- *
- * - ldlm calls dlmlock->l_blocking_ast(..., LDLM_CB_BLOCKING) to notify
- * us that dlmlock conflicts with another lock that some client is
- *   enqueuing. The lock is canceled.
- *
- * - cl_lock_cancel() is called. osc_lock_cancel() calls
- * ldlm_cli_cancel() that calls
- *
- * dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING)
- *
- * recursively entering osc_ldlm_blocking_ast().
- *
- * - the client cancels a lock voluntarily (e.g., as part of early cancellation):
- *
- * cl_lock_cancel()->
- * osc_lock_cancel()->
- * ldlm_cli_cancel()->
- * dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING)
- *
- */
-static int osc_ldlm_blocking_ast(struct ldlm_lock *dlmlock,
- struct ldlm_lock_desc *new, void *data,
- int flag)
-{
- int result = 0;
-
- switch (flag) {
- case LDLM_CB_BLOCKING: {
- struct lustre_handle lockh;
-
- ldlm_lock2handle(dlmlock, &lockh);
- result = ldlm_cli_cancel(&lockh, LCF_ASYNC);
- if (result == -ENODATA)
- result = 0;
- break;
- }
- case LDLM_CB_CANCELING: {
- struct lu_env *env;
- u16 refcheck;
-
- /*
- * This can be called in the context of outer IO, e.g.,
- *
- * osc_enqueue_base()->...
- * ->ldlm_prep_elc_req()->...
- * ->ldlm_cancel_callback()->...
- * ->osc_ldlm_blocking_ast()
- *
-		 * a new environment has to be created to avoid corrupting
-		 * the outer context.
- */
- env = cl_env_get(&refcheck);
- if (IS_ERR(env)) {
- result = PTR_ERR(env);
- break;
- }
-
- result = osc_dlm_blocking_ast0(env, dlmlock, data, flag);
- cl_env_put(env, &refcheck);
- break;
- }
- default:
- LBUG();
- }
- return result;
-}
-
-static int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data)
-{
- struct ptlrpc_request *req = data;
- struct lu_env *env;
- struct ost_lvb *lvb;
- struct req_capsule *cap;
- struct cl_object *obj = NULL;
- int result;
- u16 refcheck;
-
- LASSERT(lustre_msg_get_opc(req->rq_reqmsg) == LDLM_GL_CALLBACK);
-
- env = cl_env_get(&refcheck);
- if (IS_ERR(env)) {
- result = PTR_ERR(env);
- goto out;
- }
-
- lock_res_and_lock(dlmlock);
- if (dlmlock->l_ast_data) {
- obj = osc2cl(dlmlock->l_ast_data);
- cl_object_get(obj);
- }
- unlock_res_and_lock(dlmlock);
-
- if (obj) {
- /* Do not grab the mutex of cl_lock for glimpse.
- * See LU-1274 for details.
- * BTW, it's okay for cl_lock to be cancelled during
-		 * this period because the server can handle this race.
- * See ldlm_server_glimpse_ast() for details.
- * cl_lock_mutex_get(env, lock);
- */
- cap = &req->rq_pill;
- req_capsule_extend(cap, &RQF_LDLM_GL_CALLBACK);
- req_capsule_set_size(cap, &RMF_DLM_LVB, RCL_SERVER,
- sizeof(*lvb));
- result = req_capsule_server_pack(cap);
- if (result == 0) {
- lvb = req_capsule_server_get(cap, &RMF_DLM_LVB);
- result = cl_object_glimpse(env, obj, lvb);
- }
- if (!exp_connect_lvb_type(req->rq_export)) {
- req_capsule_shrink(&req->rq_pill, &RMF_DLM_LVB,
- sizeof(struct ost_lvb_v1),
- RCL_SERVER);
- }
- cl_object_put(env, obj);
- } else {
- /*
- * These errors are normal races, so we don't want to
- * fill the console with messages by calling
- * ptlrpc_error()
- */
- lustre_pack_reply(req, 1, NULL, NULL);
- result = -ELDLM_NO_LOCK_DATA;
- }
- cl_env_put(env, &refcheck);
-
-out:
- req->rq_status = result;
- return result;
-}
-
-static int weigh_cb(const struct lu_env *env, struct cl_io *io,
- struct osc_page *ops, void *cbdata)
-{
- struct cl_page *page = ops->ops_cl.cpl_page;
-
-	if (cl_page_is_vmlocked(env, page) ||
-	    PageDirty(page->cp_vmpage) ||
-	    PageWriteback(page->cp_vmpage))
- return CLP_GANG_ABORT;
-
- *(pgoff_t *)cbdata = osc_index(ops) + 1;
- return CLP_GANG_OKAY;
-}
-
-static unsigned long osc_lock_weight(const struct lu_env *env,
- struct osc_object *oscobj,
- struct ldlm_extent *extent)
-{
- struct cl_io *io = &osc_env_info(env)->oti_io;
- struct cl_object *obj = cl_object_top(&oscobj->oo_cl);
- pgoff_t page_index;
- int result;
-
- io->ci_obj = obj;
- io->ci_ignore_layout = 1;
- result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
- if (result != 0)
- return result;
-
- page_index = cl_index(obj, extent->start);
- do {
- result = osc_page_gang_lookup(env, io, oscobj,
- page_index,
- cl_index(obj, extent->end),
- weigh_cb, (void *)&page_index);
- if (result == CLP_GANG_ABORT)
- break;
- if (result == CLP_GANG_RESCHED)
- cond_resched();
- } while (result != CLP_GANG_OKAY);
- cl_io_fini(env, io);
-
- return result == CLP_GANG_ABORT ? 1 : 0;
-}
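
osc_lock_weight() walks the pages under the lock in batches, resuming from the cursor that weigh_cb advances and yielding the CPU between batches. A standalone sketch of that resume-from-cursor loop, with a toy gang_lookup() standing in for osc_page_gang_lookup():

    #include <stdio.h>

    enum { GANG_OKAY, GANG_RESCHED, GANG_ABORT };

    /* toy stand-in: scan up to 'batch' pages from *cursor; a nonzero
     * page means "busy" and aborts; running out of batch asks for a
     * reschedule so the caller can yield before resuming */
    static int gang_lookup(const int *pages, int npages, int batch,
                           int *cursor)
    {
            int scanned = 0;

            while (*cursor < npages && scanned < batch) {
                    if (pages[*cursor])
                            return GANG_ABORT;
                    (*cursor)++;
                    scanned++;
            }
            return *cursor < npages ? GANG_RESCHED : GANG_OKAY;
    }

    int main(void)
    {
            int pages[10] = { 0 };  /* no page busy: lock is cancelable */
            int cursor = 0;
            int rc;

            do {
                    rc = gang_lookup(pages, 10, 4, &cursor);
                    /* cond_resched() would go here on GANG_RESCHED */
            } while (rc == GANG_RESCHED);

            printf("weight = %d\n", rc == GANG_ABORT ? 1 : 0);
            return 0;
    }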
-
-/**
- * Get the weight of dlm lock for early cancellation.
- */
-unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock)
-{
- struct lu_env *env;
- struct osc_object *obj;
- struct osc_lock *oscl;
- unsigned long weight;
- bool found = false;
- u16 refcheck;
-
- might_sleep();
- /*
- * osc_ldlm_weigh_ast has a complex context since it might be called
-	 * because of lock canceling, or from user input. We have to make
-	 * a new environment for it. It is probably safe to use the upper
-	 * context, because cl_lock_put() doesn't modify environment
-	 * variables. But just in case ..
- */
- env = cl_env_get(&refcheck);
- if (IS_ERR(env))
-		/* Mostly because of lack of memory; do not eliminate this lock */
- return 1;
-
- LASSERT(dlmlock->l_resource->lr_type == LDLM_EXTENT);
- obj = dlmlock->l_ast_data;
- if (!obj) {
- weight = 1;
- goto out;
- }
-
- spin_lock(&obj->oo_ol_spin);
- list_for_each_entry(oscl, &obj->oo_ol_list, ols_nextlock_oscobj) {
- if (oscl->ols_dlmlock && oscl->ols_dlmlock != dlmlock)
- continue;
- found = true;
- }
- spin_unlock(&obj->oo_ol_spin);
- if (found) {
- /*
-		 * If the lock is being used by an IO, definitely do not cancel it.
- */
- weight = 1;
- goto out;
- }
-
- weight = osc_lock_weight(env, obj, &dlmlock->l_policy_data.l_extent);
-
-out:
- cl_env_put(env, &refcheck);
- return weight;
-}
-
-static void osc_lock_build_einfo(const struct lu_env *env,
- const struct cl_lock *lock,
- struct osc_object *osc,
- struct ldlm_enqueue_info *einfo)
-{
- einfo->ei_type = LDLM_EXTENT;
- einfo->ei_mode = osc_cl_lock2ldlm(lock->cll_descr.cld_mode);
- einfo->ei_cb_bl = osc_ldlm_blocking_ast;
- einfo->ei_cb_cp = ldlm_completion_ast;
- einfo->ei_cb_gl = osc_ldlm_glimpse_ast;
- einfo->ei_cbdata = osc; /* value to be put into ->l_ast_data */
-}
-
-/**
- * Determine if the lock should be converted into a lockless lock.
- *
- * Steps to check:
- * - whether the lock has an explicit requirement for a non-lockless lock;
- * - the IO lock request type, ci_lockreq;
- * - send the enqueue RPC to the OST to make the further decision;
- * - special treatment for lockless truncate locks.
- *
- * Additional policy can be implemented here, e.g., never do lockless-io
- * for large extents.
- */
-static void osc_lock_to_lockless(const struct lu_env *env,
- struct osc_lock *ols, int force)
-{
- struct cl_lock_slice *slice = &ols->ols_cl;
-
- LASSERT(ols->ols_state == OLS_NEW ||
- ols->ols_state == OLS_UPCALL_RECEIVED);
-
- if (force) {
- ols->ols_locklessable = 1;
- slice->cls_ops = &osc_lock_lockless_ops;
- } else {
- struct osc_io *oio = osc_env_io(env);
- struct cl_io *io = oio->oi_cl.cis_io;
- struct cl_object *obj = slice->cls_obj;
- struct osc_object *oob = cl2osc(obj);
- const struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev);
- struct obd_connect_data *ocd;
-
- LASSERT(io->ci_lockreq == CILR_MANDATORY ||
- io->ci_lockreq == CILR_MAYBE ||
- io->ci_lockreq == CILR_NEVER);
-
- ocd = &class_exp2cliimp(osc_export(oob))->imp_connect_data;
- ols->ols_locklessable = (io->ci_type != CIT_SETATTR) &&
- (io->ci_lockreq == CILR_MAYBE) &&
- (ocd->ocd_connect_flags & OBD_CONNECT_SRVLOCK);
- if (io->ci_lockreq == CILR_NEVER ||
- /* lockless IO */
- (ols->ols_locklessable && osc_object_is_contended(oob)) ||
- /* lockless truncate */
- (cl_io_is_trunc(io) &&
- (ocd->ocd_connect_flags & OBD_CONNECT_TRUNCLOCK) &&
- osd->od_lockless_truncate)) {
- ols->ols_locklessable = 1;
- slice->cls_ops = &osc_lock_lockless_ops;
- }
- }
- LASSERT(ergo(ols->ols_glimpse, !osc_lock_is_lockless(ols)));
-}
-
-static bool osc_lock_compatible(const struct osc_lock *qing,
- const struct osc_lock *qed)
-{
- struct cl_lock_descr *qed_descr = &qed->ols_cl.cls_lock->cll_descr;
- struct cl_lock_descr *qing_descr = &qing->ols_cl.cls_lock->cll_descr;
-
- if (qed->ols_glimpse)
- return true;
-
- if (qing_descr->cld_mode == CLM_READ && qed_descr->cld_mode == CLM_READ)
- return true;
-
- if (qed->ols_state < OLS_GRANTED)
- return true;
-
- if (qed_descr->cld_mode >= qing_descr->cld_mode &&
- qed_descr->cld_start <= qing_descr->cld_start &&
- qed_descr->cld_end >= qing_descr->cld_end)
- return true;
-
- return false;
-}
-
-static void osc_lock_wake_waiters(const struct lu_env *env,
- struct osc_object *osc,
- struct osc_lock *oscl)
-{
- spin_lock(&osc->oo_ol_spin);
- list_del_init(&oscl->ols_nextlock_oscobj);
- spin_unlock(&osc->oo_ol_spin);
-
- spin_lock(&oscl->ols_lock);
- while (!list_empty(&oscl->ols_waiting_list)) {
- struct osc_lock *scan;
-
- scan = list_entry(oscl->ols_waiting_list.next, struct osc_lock,
- ols_wait_entry);
- list_del_init(&scan->ols_wait_entry);
-
- cl_sync_io_note(env, scan->ols_owner, 0);
- }
- spin_unlock(&oscl->ols_lock);
-}
-
-static int osc_lock_enqueue_wait(const struct lu_env *env,
- struct osc_object *obj,
- struct osc_lock *oscl)
-{
- struct osc_lock *tmp_oscl;
- struct cl_lock_descr *need = &oscl->ols_cl.cls_lock->cll_descr;
- struct cl_sync_io *waiter = &osc_env_info(env)->oti_anchor;
- int rc = 0;
-
- spin_lock(&obj->oo_ol_spin);
- list_add_tail(&oscl->ols_nextlock_oscobj, &obj->oo_ol_list);
-
-restart:
- list_for_each_entry(tmp_oscl, &obj->oo_ol_list,
- ols_nextlock_oscobj) {
- struct cl_lock_descr *descr;
-
- if (tmp_oscl == oscl)
- break;
-
- descr = &tmp_oscl->ols_cl.cls_lock->cll_descr;
- if (descr->cld_start > need->cld_end ||
- descr->cld_end < need->cld_start)
- continue;
-
- /* We're not supposed to give up group lock */
- if (descr->cld_mode == CLM_GROUP)
- break;
-
- if (!osc_lock_is_lockless(oscl) &&
- osc_lock_compatible(oscl, tmp_oscl))
- continue;
-
- /* wait for conflicting lock to be canceled */
- cl_sync_io_init(waiter, 1, cl_sync_io_end);
- oscl->ols_owner = waiter;
-
- spin_lock(&tmp_oscl->ols_lock);
- /* add oscl into tmp's ols_waiting list */
- list_add_tail(&oscl->ols_wait_entry,
- &tmp_oscl->ols_waiting_list);
- spin_unlock(&tmp_oscl->ols_lock);
-
- spin_unlock(&obj->oo_ol_spin);
- rc = cl_sync_io_wait(env, waiter, 0);
- spin_lock(&obj->oo_ol_spin);
- if (rc < 0)
- break;
-
- oscl->ols_owner = NULL;
- goto restart;
- }
- spin_unlock(&obj->oo_ol_spin);
-
- return rc;
-}
-
-/**
- * Implementation of cl_lock_operations::clo_enqueue() method for osc
- * layer. This initiates ldlm enqueue:
- *
- * - cancels conflicting locks early (osc_lock_enqueue_wait());
- *
- * - calls osc_enqueue_base() to do actual enqueue.
- *
- * osc_enqueue_base() is supplied with an upcall function that is executed
- * when lock is received either after a local cached ldlm lock is matched, or
- * when a reply from the server is received.
- *
- * This function does not wait for the network communication to complete.
- */
-static int osc_lock_enqueue(const struct lu_env *env,
- const struct cl_lock_slice *slice,
- struct cl_io *unused, struct cl_sync_io *anchor)
-{
- struct osc_thread_info *info = osc_env_info(env);
- struct osc_io *oio = osc_env_io(env);
- struct osc_object *osc = cl2osc(slice->cls_obj);
- struct osc_lock *oscl = cl2osc_lock(slice);
- struct cl_lock *lock = slice->cls_lock;
- struct ldlm_res_id *resname = &info->oti_resname;
- union ldlm_policy_data *policy = &info->oti_policy;
- osc_enqueue_upcall_f upcall = osc_lock_upcall;
- void *cookie = oscl;
- bool async = false;
- int result;
-
- LASSERTF(ergo(oscl->ols_glimpse, lock->cll_descr.cld_mode <= CLM_READ),
- "lock = %p, ols = %p\n", lock, oscl);
-
- if (oscl->ols_state == OLS_GRANTED)
- return 0;
-
- if (oscl->ols_flags & LDLM_FL_TEST_LOCK)
- goto enqueue_base;
-
- if (oscl->ols_glimpse) {
- LASSERT(equi(oscl->ols_agl, !anchor));
- async = true;
- goto enqueue_base;
- }
-
- result = osc_lock_enqueue_wait(env, osc, oscl);
- if (result < 0)
- goto out;
-
- /* we can grant lockless lock right after all conflicting locks
- * are canceled.
- */
- if (osc_lock_is_lockless(oscl)) {
- oscl->ols_state = OLS_GRANTED;
- oio->oi_lockless = 1;
- return 0;
- }
-
-enqueue_base:
- oscl->ols_state = OLS_ENQUEUED;
- if (anchor) {
- atomic_inc(&anchor->csi_sync_nr);
- oscl->ols_owner = anchor;
- }
-
-	/*
-	 * The DLM lock's ast data must be the osc_object; for a glimpse or
-	 * AGL lock, the async argument of osc_enqueue_base() must be true;
-	 * DLM's enqueue callback is set to osc_lock_upcall() with the
-	 * osc_lock as cookie.
-	 */
- ostid_build_res_name(&osc->oo_oinfo->loi_oi, resname);
- osc_lock_build_policy(env, lock, policy);
- if (oscl->ols_agl) {
- oscl->ols_einfo.ei_cbdata = NULL;
- /* hold a reference for callback */
- cl_object_get(osc2cl(osc));
- upcall = osc_lock_upcall_agl;
- cookie = osc;
- }
- result = osc_enqueue_base(osc_export(osc), resname, &oscl->ols_flags,
- policy, &oscl->ols_lvb,
- osc->oo_oinfo->loi_kms_valid,
- upcall, cookie,
- &oscl->ols_einfo, PTLRPCD_SET, async,
- oscl->ols_agl);
- if (!result) {
- if (osc_lock_is_lockless(oscl)) {
- oio->oi_lockless = 1;
- } else if (!async) {
- LASSERT(oscl->ols_state == OLS_GRANTED);
- LASSERT(oscl->ols_hold);
- LASSERT(oscl->ols_dlmlock);
- }
- } else if (oscl->ols_agl) {
- cl_object_put(env, osc2cl(osc));
- result = 0;
- }
-
-out:
- if (result < 0) {
- oscl->ols_state = OLS_CANCELLED;
- osc_lock_wake_waiters(env, osc, oscl);
-
- if (anchor)
- cl_sync_io_note(env, anchor, result);
- }
- return result;
-}
-
-/**
- * Breaks a link between osc_lock and dlm_lock.
- */
-static void osc_lock_detach(const struct lu_env *env, struct osc_lock *olck)
-{
- struct ldlm_lock *dlmlock;
-
- dlmlock = olck->ols_dlmlock;
- if (!dlmlock)
- return;
-
- if (olck->ols_hold) {
- olck->ols_hold = 0;
- ldlm_lock_decref(&olck->ols_handle, olck->ols_einfo.ei_mode);
- olck->ols_handle.cookie = 0ULL;
- }
-
- olck->ols_dlmlock = NULL;
-
- /* release a reference taken in osc_lock_upcall(). */
- LASSERT(olck->ols_has_ref);
- lu_ref_del(&dlmlock->l_reference, "osc_lock", olck);
- LDLM_LOCK_RELEASE(dlmlock);
- olck->ols_has_ref = 0;
-}
-
-/**
- * Implements cl_lock_operations::clo_cancel() method for osc layer. This is
- * called (as part of cl_lock_cancel()) when a lock is canceled either
- * voluntarily (LRU pressure, early cancellation, umount, etc.) or due to a
- * conflict with some other lock somewhere in the cluster. This function does the
- * following:
- *
- * - invalidates all pages protected by this lock (after sending dirty
- * ones to the server, as necessary);
- *
- * - decref's underlying ldlm lock;
- *
- * - cancels ldlm lock (ldlm_cli_cancel()).
- */
-static void osc_lock_cancel(const struct lu_env *env,
- const struct cl_lock_slice *slice)
-{
- struct osc_object *obj = cl2osc(slice->cls_obj);
- struct osc_lock *oscl = cl2osc_lock(slice);
-
- LINVRNT(osc_lock_invariant(oscl));
-
- osc_lock_detach(env, oscl);
- oscl->ols_state = OLS_CANCELLED;
- oscl->ols_flags &= ~LDLM_FL_LVB_READY;
-
- osc_lock_wake_waiters(env, obj, oscl);
-}
-
-static int osc_lock_print(const struct lu_env *env, void *cookie,
- lu_printer_t p, const struct cl_lock_slice *slice)
-{
- struct osc_lock *lock = cl2osc_lock(slice);
-
- (*p)(env, cookie, "%p %#16llx %#llx %d %p ",
- lock->ols_dlmlock, lock->ols_flags, lock->ols_handle.cookie,
- lock->ols_state, lock->ols_owner);
- osc_lvb_print(env, cookie, p, &lock->ols_lvb);
- return 0;
-}
-
-static const struct cl_lock_operations osc_lock_ops = {
- .clo_fini = osc_lock_fini,
- .clo_enqueue = osc_lock_enqueue,
- .clo_cancel = osc_lock_cancel,
- .clo_print = osc_lock_print,
-};
-
-static void osc_lock_lockless_cancel(const struct lu_env *env,
- const struct cl_lock_slice *slice)
-{
- struct osc_lock *ols = cl2osc_lock(slice);
- struct osc_object *osc = cl2osc(slice->cls_obj);
- struct cl_lock_descr *descr = &slice->cls_lock->cll_descr;
- int result;
-
- LASSERT(!ols->ols_dlmlock);
- result = osc_lock_flush(osc, descr->cld_start, descr->cld_end,
- descr->cld_mode, 0);
- if (result)
- CERROR("Pages for lockless lock %p were not purged(%d)\n",
- ols, result);
-
- osc_lock_wake_waiters(env, osc, ols);
-}
-
-static const struct cl_lock_operations osc_lock_lockless_ops = {
- .clo_fini = osc_lock_fini,
- .clo_enqueue = osc_lock_enqueue,
- .clo_cancel = osc_lock_lockless_cancel,
- .clo_print = osc_lock_print
-};
-
-static void osc_lock_set_writer(const struct lu_env *env,
- const struct cl_io *io,
- struct cl_object *obj, struct osc_lock *oscl)
-{
- struct cl_lock_descr *descr = &oscl->ols_cl.cls_lock->cll_descr;
- pgoff_t io_start;
- pgoff_t io_end;
-
- if (!cl_object_same(io->ci_obj, obj))
- return;
-
- if (likely(io->ci_type == CIT_WRITE)) {
- io_start = cl_index(obj, io->u.ci_rw.crw_pos);
- io_end = cl_index(obj, io->u.ci_rw.crw_pos +
- io->u.ci_rw.crw_count - 1);
- if (cl_io_is_append(io)) {
- io_start = 0;
- io_end = CL_PAGE_EOF;
- }
- } else {
- LASSERT(cl_io_is_mkwrite(io));
- io_start = io->u.ci_fault.ft_index;
- io_end = io->u.ci_fault.ft_index;
- }
-
- if (descr->cld_mode >= CLM_WRITE &&
- descr->cld_start <= io_start && descr->cld_end >= io_end) {
- struct osc_io *oio = osc_env_io(env);
-
- /* There must be only one lock to match the write region */
- LASSERT(!oio->oi_write_osclock);
- oio->oi_write_osclock = oscl;
- }
-}
-
-int osc_lock_init(const struct lu_env *env,
- struct cl_object *obj, struct cl_lock *lock,
- const struct cl_io *io)
-{
- struct osc_lock *oscl;
- __u32 enqflags = lock->cll_descr.cld_enq_flags;
-
- oscl = kmem_cache_zalloc(osc_lock_kmem, GFP_NOFS);
- if (!oscl)
- return -ENOMEM;
-
- oscl->ols_state = OLS_NEW;
- spin_lock_init(&oscl->ols_lock);
- INIT_LIST_HEAD(&oscl->ols_waiting_list);
- INIT_LIST_HEAD(&oscl->ols_wait_entry);
- INIT_LIST_HEAD(&oscl->ols_nextlock_oscobj);
-
- oscl->ols_flags = osc_enq2ldlm_flags(enqflags);
- oscl->ols_agl = !!(enqflags & CEF_AGL);
- if (oscl->ols_agl)
- oscl->ols_flags |= LDLM_FL_BLOCK_NOWAIT;
- if (oscl->ols_flags & LDLM_FL_HAS_INTENT) {
- oscl->ols_flags |= LDLM_FL_BLOCK_GRANTED;
- oscl->ols_glimpse = 1;
- }
- osc_lock_build_einfo(env, lock, cl2osc(obj), &oscl->ols_einfo);
-
- cl_lock_slice_add(lock, &oscl->ols_cl, obj, &osc_lock_ops);
-
- if (!(enqflags & CEF_MUST))
- /* try to convert this lock to a lockless lock */
- osc_lock_to_lockless(env, oscl, (enqflags & CEF_NEVER));
- if (oscl->ols_locklessable && !(enqflags & CEF_DISCARD_DATA))
- oscl->ols_flags |= LDLM_FL_DENY_ON_CONTENTION;
-
- if (io->ci_type == CIT_WRITE || cl_io_is_mkwrite(io))
- osc_lock_set_writer(env, io, obj, oscl);
-
- LDLM_DEBUG_NOLOCK("lock %p, osc lock %p, flags %llx",
- lock, oscl, oscl->ols_flags);
-
- return 0;
-}
-
-/**
- * Finds an existing DLM lock covering the given page index; the exact
- * matching behaviour is controlled by \a dap_flags.
- */
-struct ldlm_lock *osc_dlmlock_at_pgoff(const struct lu_env *env,
- struct osc_object *obj, pgoff_t index,
- enum osc_dap_flags dap_flags)
-{
- struct osc_thread_info *info = osc_env_info(env);
- struct ldlm_res_id *resname = &info->oti_resname;
- union ldlm_policy_data *policy = &info->oti_policy;
- struct lustre_handle lockh;
- struct ldlm_lock *lock = NULL;
- enum ldlm_mode mode;
- __u64 flags;
-
- ostid_build_res_name(&obj->oo_oinfo->loi_oi, resname);
- osc_index2policy(policy, osc2cl(obj), index, index);
- policy->l_extent.gid = LDLM_GID_ANY;
-
- flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING;
- if (dap_flags & OSC_DAP_FL_TEST_LOCK)
- flags |= LDLM_FL_TEST_LOCK;
-
- /*
- * It is fine to match any group lock since there could be only one
- * with a unique gid, and it conflicts with all other lock modes too
- */
-again:
- mode = osc_match_base(osc_export(obj), resname, LDLM_EXTENT, policy,
- LCK_PR | LCK_PW | LCK_GROUP, &flags, obj, &lockh,
- dap_flags & OSC_DAP_FL_CANCELING);
- if (mode != 0) {
- lock = ldlm_handle2lock(&lockh);
- /* RACE: the lock is cancelled so let's try again */
- if (unlikely(!lock))
- goto again;
- }
- return lock;
-}
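
The again: label above covers a narrow race: the lock can be canceled between osc_match_base() returning a handle and ldlm_handle2lock() pinning it, in which case the handle is stale and the match is simply retried. A toy model of that revalidate-and-retry shape (lookup()/pin() are stand-ins that simulate losing the race exactly once):

    #include <stdbool.h>
    #include <stdio.h>

    struct lock { int refs; };

    static struct lock the_lock;
    static int cancel_once = 1;     /* lose the race exactly once */

    /* stand-in for osc_match_base(): always finds a handle */
    static bool lookup(int *cookie)
    {
            *cookie = 42;
            return true;
    }

    /* stand-in for ldlm_handle2lock(): fails while the handle is stale */
    static struct lock *pin(int cookie)
    {
            (void)cookie;           /* a real pin would validate this */
            if (cancel_once--)
                    return NULL;    /* canceled between lookup and pin */
            the_lock.refs++;
            return &the_lock;
    }

    int main(void)
    {
            struct lock *lk = NULL;
            int cookie;

            while (lookup(&cookie)) {
                    lk = pin(cookie);
                    if (lk)
                            break;  /* reference taken, safe to use */
                    /* handle went stale: retry the match */
            }
            printf("pinned with %d ref(s) after retry\n", lk->refs);
            return 0;
    }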
-
-/** @} osc */
diff --git a/drivers/staging/lustre/lustre/osc/osc_object.c b/drivers/staging/lustre/lustre/osc/osc_object.c
deleted file mode 100644
index 6baa8e2e00c9..000000000000
--- a/drivers/staging/lustre/lustre/osc/osc_object.c
+++ /dev/null
@@ -1,474 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.gnu.org/licenses/gpl-2.0.html
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2011, 2015, Intel Corporation.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * Implementation of cl_object for OSC layer.
- *
- * Author: Nikita Danilov <nikita.danilov@sun.com>
- * Author: Jinshan Xiong <jinshan.xiong@intel.com>
- */
-
-#define DEBUG_SUBSYSTEM S_OSC
-
-#include "osc_cl_internal.h"
-
-/** \addtogroup osc
- * @{
- */
-
-/*****************************************************************************
- *
- * Type conversions.
- *
- */
-
-static struct lu_object *osc2lu(struct osc_object *osc)
-{
- return &osc->oo_cl.co_lu;
-}
-
-static struct osc_object *lu2osc(const struct lu_object *obj)
-{
- LINVRNT(osc_is_object(obj));
- return container_of0(obj, struct osc_object, oo_cl.co_lu);
-}
-
-/*****************************************************************************
- *
- * Object operations.
- *
- */
-
-static int osc_object_init(const struct lu_env *env, struct lu_object *obj,
- const struct lu_object_conf *conf)
-{
- struct osc_object *osc = lu2osc(obj);
- const struct cl_object_conf *cconf = lu2cl_conf(conf);
-
- osc->oo_oinfo = cconf->u.coc_oinfo;
- INIT_LIST_HEAD(&osc->oo_ready_item);
- INIT_LIST_HEAD(&osc->oo_hp_ready_item);
- INIT_LIST_HEAD(&osc->oo_write_item);
- INIT_LIST_HEAD(&osc->oo_read_item);
-
- atomic_set(&osc->oo_nr_ios, 0);
- init_waitqueue_head(&osc->oo_io_waitq);
-
- osc->oo_root.rb_node = NULL;
- INIT_LIST_HEAD(&osc->oo_hp_exts);
- INIT_LIST_HEAD(&osc->oo_urgent_exts);
- INIT_LIST_HEAD(&osc->oo_rpc_exts);
- INIT_LIST_HEAD(&osc->oo_reading_exts);
- atomic_set(&osc->oo_nr_reads, 0);
- atomic_set(&osc->oo_nr_writes, 0);
- spin_lock_init(&osc->oo_lock);
- spin_lock_init(&osc->oo_tree_lock);
- spin_lock_init(&osc->oo_ol_spin);
- INIT_LIST_HEAD(&osc->oo_ol_list);
-
- cl_object_page_init(lu2cl(obj), sizeof(struct osc_page));
-
- return 0;
-}
-
-static void osc_object_free(const struct lu_env *env, struct lu_object *obj)
-{
- struct osc_object *osc = lu2osc(obj);
-
- LASSERT(list_empty(&osc->oo_ready_item));
- LASSERT(list_empty(&osc->oo_hp_ready_item));
- LASSERT(list_empty(&osc->oo_write_item));
- LASSERT(list_empty(&osc->oo_read_item));
-
- LASSERT(!osc->oo_root.rb_node);
- LASSERT(list_empty(&osc->oo_hp_exts));
- LASSERT(list_empty(&osc->oo_urgent_exts));
- LASSERT(list_empty(&osc->oo_rpc_exts));
- LASSERT(list_empty(&osc->oo_reading_exts));
- LASSERT(atomic_read(&osc->oo_nr_reads) == 0);
- LASSERT(atomic_read(&osc->oo_nr_writes) == 0);
- LASSERT(list_empty(&osc->oo_ol_list));
- LASSERT(!atomic_read(&osc->oo_nr_ios));
-
- lu_object_fini(obj);
- kmem_cache_free(osc_object_kmem, osc);
-}
-
-int osc_lvb_print(const struct lu_env *env, void *cookie,
- lu_printer_t p, const struct ost_lvb *lvb)
-{
- return (*p)(env, cookie, "size: %llu mtime: %llu atime: %llu ctime: %llu blocks: %llu",
- lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime,
- lvb->lvb_ctime, lvb->lvb_blocks);
-}
-
-static int osc_object_print(const struct lu_env *env, void *cookie,
- lu_printer_t p, const struct lu_object *obj)
-{
- struct osc_object *osc = lu2osc(obj);
- struct lov_oinfo *oinfo = osc->oo_oinfo;
- struct osc_async_rc *ar = &oinfo->loi_ar;
-
- (*p)(env, cookie, "id: " DOSTID " idx: %d gen: %d kms_valid: %u kms %llu rc: %d force_sync: %d min_xid: %llu ",
- POSTID(&oinfo->loi_oi), oinfo->loi_ost_idx,
- oinfo->loi_ost_gen, oinfo->loi_kms_valid, oinfo->loi_kms,
- ar->ar_rc, ar->ar_force_sync, ar->ar_min_xid);
- osc_lvb_print(env, cookie, p, &oinfo->loi_lvb);
- return 0;
-}
-
-static int osc_attr_get(const struct lu_env *env, struct cl_object *obj,
- struct cl_attr *attr)
-{
- struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo;
-
- cl_lvb2attr(attr, &oinfo->loi_lvb);
- attr->cat_kms = oinfo->loi_kms_valid ? oinfo->loi_kms : 0;
- return 0;
-}
-
-static int osc_attr_update(const struct lu_env *env, struct cl_object *obj,
- const struct cl_attr *attr, unsigned int valid)
-{
- struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo;
- struct ost_lvb *lvb = &oinfo->loi_lvb;
-
- if (valid & CAT_SIZE)
- lvb->lvb_size = attr->cat_size;
- if (valid & CAT_MTIME)
- lvb->lvb_mtime = attr->cat_mtime;
- if (valid & CAT_ATIME)
- lvb->lvb_atime = attr->cat_atime;
- if (valid & CAT_CTIME)
- lvb->lvb_ctime = attr->cat_ctime;
- if (valid & CAT_BLOCKS)
- lvb->lvb_blocks = attr->cat_blocks;
- if (valid & CAT_KMS) {
- CDEBUG(D_CACHE, "set kms from %llu to %llu\n",
- oinfo->loi_kms, (__u64)attr->cat_kms);
- loi_kms_set(oinfo, attr->cat_kms);
- }
- return 0;
-}
-
-static int osc_object_glimpse(const struct lu_env *env,
- const struct cl_object *obj, struct ost_lvb *lvb)
-{
- struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo;
-
- lvb->lvb_size = oinfo->loi_kms;
- lvb->lvb_blocks = oinfo->loi_lvb.lvb_blocks;
- return 0;
-}
-
-static int osc_object_ast_clear(struct ldlm_lock *lock, void *data)
-{
- if (lock->l_ast_data == data)
- lock->l_ast_data = NULL;
- return LDLM_ITER_CONTINUE;
-}
-
-static int osc_object_prune(const struct lu_env *env, struct cl_object *obj)
-{
- struct osc_object *osc = cl2osc(obj);
- struct ldlm_res_id *resname = &osc_env_info(env)->oti_resname;
-
-	/* DLM locks don't hold a reference on the osc_object, so we have to
-	 * clear it before the object is destroyed.
- */
- ostid_build_res_name(&osc->oo_oinfo->loi_oi, resname);
- ldlm_resource_iterate(osc_export(osc)->exp_obd->obd_namespace, resname,
- osc_object_ast_clear, osc);
- return 0;
-}
-
-static int osc_object_fiemap(const struct lu_env *env, struct cl_object *obj,
- struct ll_fiemap_info_key *fmkey,
- struct fiemap *fiemap, size_t *buflen)
-{
- struct obd_export *exp = osc_export(cl2osc(obj));
- union ldlm_policy_data policy;
- struct ptlrpc_request *req;
- struct lustre_handle lockh;
- struct ldlm_res_id resid;
- enum ldlm_mode mode = 0;
- struct fiemap *reply;
- char *tmp;
- int rc;
-
- fmkey->lfik_oa.o_oi = cl2osc(obj)->oo_oinfo->loi_oi;
- if (!(fmkey->lfik_fiemap.fm_flags & FIEMAP_FLAG_SYNC))
- goto skip_locking;
-
- policy.l_extent.start = fmkey->lfik_fiemap.fm_start & PAGE_MASK;
-
- if (OBD_OBJECT_EOF - fmkey->lfik_fiemap.fm_length <=
- fmkey->lfik_fiemap.fm_start + PAGE_SIZE - 1)
- policy.l_extent.end = OBD_OBJECT_EOF;
- else
- policy.l_extent.end = (fmkey->lfik_fiemap.fm_start +
- fmkey->lfik_fiemap.fm_length +
- PAGE_SIZE - 1) & PAGE_MASK;
-
- ostid_build_res_name(&fmkey->lfik_oa.o_oi, &resid);
- mode = ldlm_lock_match(exp->exp_obd->obd_namespace,
- LDLM_FL_BLOCK_GRANTED | LDLM_FL_LVB_READY,
- &resid, LDLM_EXTENT, &policy,
- LCK_PR | LCK_PW, &lockh, 0);
- if (mode) { /* lock is cached on client */
- if (mode != LCK_PR) {
- ldlm_lock_addref(&lockh, LCK_PR);
- ldlm_lock_decref(&lockh, LCK_PW);
- }
-	} else { /* no cached lock, need to acquire the lock on the server side */
- fmkey->lfik_oa.o_valid |= OBD_MD_FLFLAGS;
- fmkey->lfik_oa.o_flags |= OBD_FL_SRVLOCK;
- }
-
-skip_locking:
- req = ptlrpc_request_alloc(class_exp2cliimp(exp),
- &RQF_OST_GET_INFO_FIEMAP);
- if (!req) {
- rc = -ENOMEM;
- goto drop_lock;
- }
-
- req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_KEY, RCL_CLIENT,
- sizeof(*fmkey));
- req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_VAL, RCL_CLIENT,
- *buflen);
- req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_VAL, RCL_SERVER,
- *buflen);
-
- rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GET_INFO);
- if (rc) {
- ptlrpc_request_free(req);
- goto drop_lock;
- }
- tmp = req_capsule_client_get(&req->rq_pill, &RMF_FIEMAP_KEY);
- memcpy(tmp, fmkey, sizeof(*fmkey));
- tmp = req_capsule_client_get(&req->rq_pill, &RMF_FIEMAP_VAL);
- memcpy(tmp, fiemap, *buflen);
- ptlrpc_request_set_replen(req);
-
- rc = ptlrpc_queue_wait(req);
- if (rc)
- goto fini_req;
-
- reply = req_capsule_server_get(&req->rq_pill, &RMF_FIEMAP_VAL);
- if (!reply) {
- rc = -EPROTO;
- goto fini_req;
- }
-
- memcpy(fiemap, reply, *buflen);
-fini_req:
- ptlrpc_req_finished(req);
-drop_lock:
- if (mode)
- ldlm_lock_decref(&lockh, LCK_PR);
- return rc;
-}
-
-void osc_object_set_contended(struct osc_object *obj)
-{
- obj->oo_contention_time = cfs_time_current();
- /* mb(); */
- obj->oo_contended = 1;
-}
-
-void osc_object_clear_contended(struct osc_object *obj)
-{
- obj->oo_contended = 0;
-}
-
-int osc_object_is_contended(struct osc_object *obj)
-{
- struct osc_device *dev = lu2osc_dev(obj->oo_cl.co_lu.lo_dev);
- int osc_contention_time = dev->od_contention_time;
- unsigned long cur_time = cfs_time_current();
- unsigned long retry_time;
-
- if (OBD_FAIL_CHECK(OBD_FAIL_OSC_OBJECT_CONTENTION))
- return 1;
-
- if (!obj->oo_contended)
- return 0;
-
- /*
-	 * I like copy-paste. The code is copied from
-	 * ll_file_is_contended().
- */
- retry_time = cfs_time_add(obj->oo_contention_time,
- osc_contention_time * HZ);
- if (cfs_time_after(cur_time, retry_time)) {
- osc_object_clear_contended(obj);
- return 0;
- }
- return 1;
-}
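
The contention window above is a jiffies-style timeout: once set, the contended flag is sticky for od_contention_time seconds and then self-clears on the next query. A minimal sketch of the same check, assuming a fake tick counter in place of cfs_time_current()/HZ and the kernel's wrap-safe time_after():

    #include <stdbool.h>
    #include <stdio.h>

    /* kernel-style wrap-safe "a is later than b" */
    #define time_after(a, b) ((long)((b) - (a)) < 0)

    static unsigned long now;               /* fake tick counter */
    static unsigned long contention_time;   /* when contention was seen */
    static bool contended;

    static bool is_contended(unsigned long window)
    {
            if (!contended)
                    return false;
            if (time_after(now, contention_time + window)) {
                    contended = false;      /* window expired: self-clear */
                    return false;
            }
            return true;
    }

    int main(void)
    {
            contended = true;
            contention_time = 100;

            now = 130;
            printf("%d\n", is_contended(50));  /* 1: inside the window */
            now = 200;
            printf("%d\n", is_contended(50));  /* 0: expired and cleared */
            return 0;
    }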
-
-/**
- * Implementation of struct cl_object_operations::coo_req_attr_set() for osc
- * layer. osc is responsible for struct obdo::o_id and struct obdo::o_seq
- * fields.
- */
-static void osc_req_attr_set(const struct lu_env *env, struct cl_object *obj,
- struct cl_req_attr *attr)
-{
- u64 flags = attr->cra_flags;
- struct lov_oinfo *oinfo;
- struct ost_lvb *lvb;
- struct obdo *oa;
-
- oinfo = cl2osc(obj)->oo_oinfo;
- lvb = &oinfo->loi_lvb;
- oa = attr->cra_oa;
-
- if (flags & OBD_MD_FLMTIME) {
- oa->o_mtime = lvb->lvb_mtime;
- oa->o_valid |= OBD_MD_FLMTIME;
- }
- if (flags & OBD_MD_FLATIME) {
- oa->o_atime = lvb->lvb_atime;
- oa->o_valid |= OBD_MD_FLATIME;
- }
- if (flags & OBD_MD_FLCTIME) {
- oa->o_ctime = lvb->lvb_ctime;
- oa->o_valid |= OBD_MD_FLCTIME;
- }
- if (flags & OBD_MD_FLGROUP) {
- ostid_set_seq(&oa->o_oi, ostid_seq(&oinfo->loi_oi));
- oa->o_valid |= OBD_MD_FLGROUP;
- }
- if (flags & OBD_MD_FLID) {
- int rc;
-
- rc = ostid_set_id(&oa->o_oi, ostid_id(&oinfo->loi_oi));
- if (rc) {
- CERROR("Bad %llu to set " DOSTID " : rc %d\n",
- (unsigned long long)ostid_id(&oinfo->loi_oi),
- POSTID(&oa->o_oi), rc);
- }
- oa->o_valid |= OBD_MD_FLID;
- }
- if (flags & OBD_MD_FLHANDLE) {
- struct ldlm_lock *lock;
- struct osc_page *opg;
-
- opg = osc_cl_page_osc(attr->cra_page, cl2osc(obj));
- lock = osc_dlmlock_at_pgoff(env, cl2osc(obj), osc_index(opg),
- OSC_DAP_FL_TEST_LOCK | OSC_DAP_FL_CANCELING);
- if (!lock && !opg->ops_srvlock) {
- struct ldlm_resource *res;
- struct ldlm_res_id *resname;
-
- CL_PAGE_DEBUG(D_ERROR, env, attr->cra_page,
- "uncovered page!\n");
-
- resname = &osc_env_info(env)->oti_resname;
- ostid_build_res_name(&oinfo->loi_oi, resname);
- res = ldlm_resource_get(
- osc_export(cl2osc(obj))->exp_obd->obd_namespace,
- NULL, resname, LDLM_EXTENT, 0);
- ldlm_resource_dump(D_ERROR, res);
-
- LBUG();
- }
-
- /* check for lockless io. */
- if (lock) {
- oa->o_handle = lock->l_remote_handle;
- oa->o_valid |= OBD_MD_FLHANDLE;
- LDLM_LOCK_PUT(lock);
- }
- }
-}
-
-static const struct cl_object_operations osc_ops = {
- .coo_page_init = osc_page_init,
- .coo_lock_init = osc_lock_init,
- .coo_io_init = osc_io_init,
- .coo_attr_get = osc_attr_get,
- .coo_attr_update = osc_attr_update,
- .coo_glimpse = osc_object_glimpse,
- .coo_prune = osc_object_prune,
- .coo_fiemap = osc_object_fiemap,
- .coo_req_attr_set = osc_req_attr_set
-};
-
-static const struct lu_object_operations osc_lu_obj_ops = {
- .loo_object_init = osc_object_init,
- .loo_object_release = NULL,
- .loo_object_free = osc_object_free,
- .loo_object_print = osc_object_print,
- .loo_object_invariant = NULL
-};
-
-struct lu_object *osc_object_alloc(const struct lu_env *env,
- const struct lu_object_header *unused,
- struct lu_device *dev)
-{
- struct osc_object *osc;
- struct lu_object *obj;
-
- osc = kmem_cache_zalloc(osc_object_kmem, GFP_NOFS);
- if (osc) {
- obj = osc2lu(osc);
- lu_object_init(obj, NULL, dev);
- osc->oo_cl.co_ops = &osc_ops;
- obj->lo_ops = &osc_lu_obj_ops;
- } else {
- obj = NULL;
- }
- return obj;
-}
-
-int osc_object_invalidate(const struct lu_env *env, struct osc_object *osc)
-{
- CDEBUG(D_INODE, "Invalidate osc object: %p, # of active IOs: %d\n",
- osc, atomic_read(&osc->oo_nr_ios));
-
- wait_event_idle(osc->oo_io_waitq, !atomic_read(&osc->oo_nr_ios));
-
- /* Discard all dirty pages of this object. */
- osc_cache_truncate_start(env, osc, 0, NULL);
-
- /* Discard all caching pages */
- osc_lock_discard_pages(env, osc, 0, CL_PAGE_EOF, CLM_WRITE);
-
- /* Clear ast data of dlm lock. Do this after discarding all pages */
- osc_object_prune(env, osc2cl(osc));
-
- return 0;
-}
-
-/** @} osc */
diff --git a/drivers/staging/lustre/lustre/osc/osc_page.c b/drivers/staging/lustre/lustre/osc/osc_page.c
deleted file mode 100644
index 01a930dbbf64..000000000000
--- a/drivers/staging/lustre/lustre/osc/osc_page.c
+++ /dev/null
@@ -1,1094 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.gnu.org/licenses/gpl-2.0.html
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2011, 2015, Intel Corporation.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * Implementation of cl_page for OSC layer.
- *
- * Author: Nikita Danilov <nikita.danilov@sun.com>
- * Author: Jinshan Xiong <jinshan.xiong@intel.com>
- */
-
-#define DEBUG_SUBSYSTEM S_OSC
-
-#include <linux/math64.h>
-#include "osc_cl_internal.h"
-
-static void osc_lru_del(struct client_obd *cli, struct osc_page *opg);
-static void osc_lru_use(struct client_obd *cli, struct osc_page *opg);
-static int osc_lru_alloc(const struct lu_env *env, struct client_obd *cli,
- struct osc_page *opg);
-
-/** \addtogroup osc
- * @{
- */
-
-/*****************************************************************************
- *
- * Page operations.
- *
- */
-static void osc_page_transfer_get(struct osc_page *opg, const char *label)
-{
- struct cl_page *page = opg->ops_cl.cpl_page;
-
- LASSERT(!opg->ops_transfer_pinned);
- cl_page_get(page);
- lu_ref_add_atomic(&page->cp_reference, label, page);
- opg->ops_transfer_pinned = 1;
-}
-
-static void osc_page_transfer_put(const struct lu_env *env,
- struct osc_page *opg)
-{
- struct cl_page *page = opg->ops_cl.cpl_page;
-
- if (opg->ops_transfer_pinned) {
- opg->ops_transfer_pinned = 0;
- lu_ref_del(&page->cp_reference, "transfer", page);
- cl_page_put(env, page);
- }
-}
-
-/**
- * This is called once for every page when it is submitted for a transfer,
- * either opportunistically (osc_page_cache_add()) or immediately
- * (osc_page_submit()).
- */
-static void osc_page_transfer_add(const struct lu_env *env,
- struct osc_page *opg, enum cl_req_type crt)
-{
- struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj);
-
- osc_lru_use(osc_cli(obj), opg);
-}
-
-int osc_page_cache_add(const struct lu_env *env,
- const struct cl_page_slice *slice, struct cl_io *io)
-{
- struct osc_page *opg = cl2osc_page(slice);
- int result;
-
- osc_page_transfer_get(opg, "transfer\0cache");
- result = osc_queue_async_io(env, io, opg);
- if (result != 0)
- osc_page_transfer_put(env, opg);
- else
- osc_page_transfer_add(env, opg, CRT_WRITE);
-
- return result;
-}
-
-void osc_index2policy(union ldlm_policy_data *policy,
- const struct cl_object *obj,
- pgoff_t start, pgoff_t end)
-{
- memset(policy, 0, sizeof(*policy));
- policy->l_extent.start = cl_offset(obj, start);
- policy->l_extent.end = cl_offset(obj, end + 1) - 1;
-}
-
-static const char *osc_list(struct list_head *head)
-{
- return list_empty(head) ? "-" : "+";
-}
-
-static inline unsigned long osc_submit_duration(struct osc_page *opg)
-{
- if (opg->ops_submit_time == 0)
- return 0;
-
- return (cfs_time_current() - opg->ops_submit_time);
-}
-
-static int osc_page_print(const struct lu_env *env,
- const struct cl_page_slice *slice,
- void *cookie, lu_printer_t printer)
-{
- struct osc_page *opg = cl2osc_page(slice);
- struct osc_async_page *oap = &opg->ops_oap;
- struct osc_object *obj = cl2osc(slice->cpl_obj);
- struct client_obd *cli = &osc_export(obj)->exp_obd->u.cli;
-
- return (*printer)(env, cookie, LUSTRE_OSC_NAME "-page@%p %lu: 1< %#x %d %u %s %s > 2< %llu %u %u %#x %#x | %p %p %p > 3< %d %lu %d > 4< %d %d %d %lu %s | %s %s %s %s > 5< %s %s %s %s | %d %s | %d %s %s>\n",
- opg, osc_index(opg),
- /* 1 */
- oap->oap_magic, oap->oap_cmd,
- oap->oap_interrupted,
- osc_list(&oap->oap_pending_item),
- osc_list(&oap->oap_rpc_item),
- /* 2 */
- oap->oap_obj_off, oap->oap_page_off, oap->oap_count,
- oap->oap_async_flags, oap->oap_brw_flags,
- oap->oap_request, oap->oap_cli, obj,
- /* 3 */
- opg->ops_transfer_pinned,
- osc_submit_duration(opg), opg->ops_srvlock,
- /* 4 */
- cli->cl_r_in_flight, cli->cl_w_in_flight,
- cli->cl_max_rpcs_in_flight,
- cli->cl_avail_grant,
- osc_list(&cli->cl_cache_waiters),
- osc_list(&cli->cl_loi_ready_list),
- osc_list(&cli->cl_loi_hp_ready_list),
- osc_list(&cli->cl_loi_write_list),
- osc_list(&cli->cl_loi_read_list),
- /* 5 */
- osc_list(&obj->oo_ready_item),
- osc_list(&obj->oo_hp_ready_item),
- osc_list(&obj->oo_write_item),
- osc_list(&obj->oo_read_item),
- atomic_read(&obj->oo_nr_reads),
- osc_list(&obj->oo_reading_exts),
- atomic_read(&obj->oo_nr_writes),
- osc_list(&obj->oo_hp_exts),
- osc_list(&obj->oo_urgent_exts));
-}
-
-static void osc_page_delete(const struct lu_env *env,
- const struct cl_page_slice *slice)
-{
- struct osc_page *opg = cl2osc_page(slice);
- struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj);
- int rc;
-
- CDEBUG(D_TRACE, "%p\n", opg);
- osc_page_transfer_put(env, opg);
- rc = osc_teardown_async_page(env, obj, opg);
- if (rc) {
- CL_PAGE_DEBUG(D_ERROR, env, slice->cpl_page,
- "Trying to teardown failed: %d\n", rc);
- LASSERT(0);
- }
-
- osc_lru_del(osc_cli(obj), opg);
-
- if (slice->cpl_page->cp_type == CPT_CACHEABLE) {
- void *value;
-
- spin_lock(&obj->oo_tree_lock);
- value = radix_tree_delete(&obj->oo_tree, osc_index(opg));
- if (value)
- --obj->oo_npages;
- spin_unlock(&obj->oo_tree_lock);
-
- LASSERT(ergo(value, value == opg));
- }
-}
-
-static void osc_page_clip(const struct lu_env *env,
- const struct cl_page_slice *slice, int from, int to)
-{
- struct osc_page *opg = cl2osc_page(slice);
- struct osc_async_page *oap = &opg->ops_oap;
-
- opg->ops_from = from;
- opg->ops_to = to;
- spin_lock(&oap->oap_lock);
- oap->oap_async_flags |= ASYNC_COUNT_STABLE;
- spin_unlock(&oap->oap_lock);
-}
-
-static int osc_page_cancel(const struct lu_env *env,
- const struct cl_page_slice *slice)
-{
- struct osc_page *opg = cl2osc_page(slice);
- int rc = 0;
-
- /* Check if the transfer of this page has completed,
- * or was never even queued.
- */
- if (opg->ops_transfer_pinned)
- /* FIXME: may not be interrupted.. */
- rc = osc_cancel_async_page(env, opg);
- LASSERT(ergo(rc == 0, opg->ops_transfer_pinned == 0));
- return rc;
-}
-
-static int osc_page_flush(const struct lu_env *env,
- const struct cl_page_slice *slice,
- struct cl_io *io)
-{
- struct osc_page *opg = cl2osc_page(slice);
- int rc;
-
- rc = osc_flush_async_page(env, io, opg);
- return rc;
-}
-
-static const struct cl_page_operations osc_page_ops = {
- .cpo_print = osc_page_print,
- .cpo_delete = osc_page_delete,
- .cpo_clip = osc_page_clip,
- .cpo_cancel = osc_page_cancel,
- .cpo_flush = osc_page_flush
-};
-
-int osc_page_init(const struct lu_env *env, struct cl_object *obj,
- struct cl_page *page, pgoff_t index)
-{
- struct osc_object *osc = cl2osc(obj);
- struct osc_page *opg = cl_object_page_slice(obj, page);
- int result;
-
- opg->ops_from = 0;
- opg->ops_to = PAGE_SIZE;
-
- result = osc_prep_async_page(osc, opg, page->cp_vmpage,
- cl_offset(obj, index));
- if (result == 0) {
- struct osc_io *oio = osc_env_io(env);
-
- opg->ops_srvlock = osc_io_srvlock(oio);
- cl_page_slice_add(page, &opg->ops_cl, obj, index,
- &osc_page_ops);
- }
- INIT_LIST_HEAD(&opg->ops_lru);
-
- /* reserve an LRU space for this page */
- if (page->cp_type == CPT_CACHEABLE && result == 0) {
- result = osc_lru_alloc(env, osc_cli(osc), opg);
- if (result == 0) {
- spin_lock(&osc->oo_tree_lock);
- result = radix_tree_insert(&osc->oo_tree, index, opg);
- if (result == 0)
- ++osc->oo_npages;
- spin_unlock(&osc->oo_tree_lock);
- LASSERT(result == 0);
- }
- }
-
- return result;
-}
-
-/**
- * Helper function called by osc_io_submit() for every page in an immediate
- * transfer (i.e., transferred synchronously).
- */
-void osc_page_submit(const struct lu_env *env, struct osc_page *opg,
- enum cl_req_type crt, int brw_flags)
-{
- struct osc_async_page *oap = &opg->ops_oap;
-
- LASSERTF(oap->oap_magic == OAP_MAGIC, "Bad oap magic: oap %p, magic 0x%x\n",
- oap, oap->oap_magic);
- LASSERT(oap->oap_async_flags & ASYNC_READY);
- LASSERT(oap->oap_async_flags & ASYNC_COUNT_STABLE);
-
- oap->oap_cmd = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ;
- oap->oap_page_off = opg->ops_from;
- oap->oap_count = opg->ops_to - opg->ops_from;
- oap->oap_brw_flags = brw_flags | OBD_BRW_SYNC;
-
- if (capable(CAP_SYS_RESOURCE)) {
- oap->oap_brw_flags |= OBD_BRW_NOQUOTA;
- oap->oap_cmd |= OBD_BRW_NOQUOTA;
- }
-
- opg->ops_submit_time = cfs_time_current();
- osc_page_transfer_get(opg, "transfer\0imm");
- osc_page_transfer_add(env, opg, crt);
-}
-
-/* --------------- LRU page management ------------------ */
-
-/* The OSC is a natural place to manage LRU pages, as applications tend to
- * write to one OSC at a time. Ideally, an OSC that is used more frequently
- * should occupy more LRU slots. On the other hand, we should avoid using up
- * all LRU slots (client_obd::cl_lru_left); otherwise a process has to sleep
- * waiting for a free slot, which would be very bad. The algorithm therefore
- * requires each OSC to free slots voluntarily, so that a reasonable number
- * of free slots is maintained at all times.
- */
-static DECLARE_WAIT_QUEUE_HEAD(osc_lru_waitq);
-
-/**
- * LRU pages are freed in batch mode. OSC should at least free this
- * number of pages to avoid running out of LRU slots.
- */
-static inline int lru_shrink_min(struct client_obd *cli)
-{
- return cli->cl_max_pages_per_rpc * 2;
-}
-
-/**
- * Free at most this number of pages; otherwise it will take too long to
- * finish.
- */
-static inline int lru_shrink_max(struct client_obd *cli)
-{
- return cli->cl_max_pages_per_rpc * cli->cl_max_rpcs_in_flight;
-}
-
-/**
- * Check if we can free LRU slots from this OSC. If LRU waiters exist,
- * we should free slots aggressively. In this way, slots are freed at a
- * steady pace to maintain fairness among OSCs.
- *
- * Return how many LRU pages should be freed.
- */
-static int osc_cache_too_much(struct client_obd *cli)
-{
- struct cl_client_cache *cache = cli->cl_cache;
- long pages = atomic_long_read(&cli->cl_lru_in_list);
- unsigned long budget;
-
- budget = cache->ccc_lru_max / (atomic_read(&cache->ccc_users) - 2);
-
- /* if we are about to run out of LRU slots, free some, but not
- * too many, to maintain fairness among OSCs.
- */
- if (atomic_long_read(cli->cl_lru_left) < cache->ccc_lru_max >> 2) {
- if (pages >= budget)
- return lru_shrink_max(cli);
- else if (pages >= budget / 2)
- return lru_shrink_min(cli);
- } else {
- time64_t duration = ktime_get_real_seconds();
- long timediff;
-
- /* knock out pages by duration of no IO activity */
- duration -= cli->cl_lru_last_used;
- /*
- * The difference shouldn't be more than 70 years,
- * so we can safely cast to a long. Round to
- * approximately 1 minute.
- */
- timediff = (long)(duration >> 6);
- if (timediff > 0 && pages >= budget / timediff)
- return lru_shrink_min(cli);
- }
- return 0;
-}
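
The budget arithmetic above splits the shared client cache among the active
OSCs (ccc_users holds two extra references, hence the "- 2"), and the shrink
thresholds fall out of that split. A sketch with hypothetical numbers, not
taken from any real configuration:

#include <stdio.h>

int main(void)
{
        long lru_max = 1048576;         /* assumption: 1M LRU slots */
        long users = 6;                 /* assumption: 4 OSCs + 2 refs */
        long budget = lru_max / (users - 2);   /* 262144 slots per OSC */
        long left = 200000;             /* free slots, below lru_max/4 */
        long pages = 300000;            /* this OSC's pages in the LRU */

        if (left < lru_max / 4) {       /* slots are getting scarce */
                if (pages >= budget)
                        printf("free lru_shrink_max pages\n");
                else if (pages >= budget / 2)
                        printf("free lru_shrink_min pages\n");
        }
        return 0;
}
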
-
-int lru_queue_work(const struct lu_env *env, void *data)
-{
- struct client_obd *cli = data;
- int count;
-
- CDEBUG(D_CACHE, "%s: run LRU work for client obd\n", cli_name(cli));
-
- count = osc_cache_too_much(cli);
- if (count > 0) {
- int rc = osc_lru_shrink(env, cli, count, false);
-
- CDEBUG(D_CACHE, "%s: shrank %d/%d pages from client obd\n",
- cli_name(cli), rc, count);
- if (rc >= count) {
- CDEBUG(D_CACHE, "%s: queue again\n", cli_name(cli));
- ptlrpcd_queue_work(cli->cl_lru_work);
- }
- }
-
- return 0;
-}
-
-void osc_lru_add_batch(struct client_obd *cli, struct list_head *plist)
-{
- LIST_HEAD(lru);
- struct osc_async_page *oap;
- long npages = 0;
-
- list_for_each_entry(oap, plist, oap_pending_item) {
- struct osc_page *opg = oap2osc_page(oap);
-
- if (!opg->ops_in_lru)
- continue;
-
- ++npages;
- LASSERT(list_empty(&opg->ops_lru));
- list_add(&opg->ops_lru, &lru);
- }
-
- if (npages > 0) {
- spin_lock(&cli->cl_lru_list_lock);
- list_splice_tail(&lru, &cli->cl_lru_list);
- atomic_long_sub(npages, &cli->cl_lru_busy);
- atomic_long_add(npages, &cli->cl_lru_in_list);
- cli->cl_lru_last_used = ktime_get_real_seconds();
- spin_unlock(&cli->cl_lru_list_lock);
-
- if (waitqueue_active(&osc_lru_waitq))
- (void)ptlrpcd_queue_work(cli->cl_lru_work);
- }
-}
-
-static void __osc_lru_del(struct client_obd *cli, struct osc_page *opg)
-{
- LASSERT(atomic_long_read(&cli->cl_lru_in_list) > 0);
- list_del_init(&opg->ops_lru);
- atomic_long_dec(&cli->cl_lru_in_list);
-}
-
-/**
- * Page is being destroyed. The page may not be on the LRU list if the
- * transfer never finished (an error occurred).
- */
-static void osc_lru_del(struct client_obd *cli, struct osc_page *opg)
-{
- if (opg->ops_in_lru) {
- spin_lock(&cli->cl_lru_list_lock);
- if (!list_empty(&opg->ops_lru)) {
- __osc_lru_del(cli, opg);
- } else {
- LASSERT(atomic_long_read(&cli->cl_lru_busy) > 0);
- atomic_long_dec(&cli->cl_lru_busy);
- }
- spin_unlock(&cli->cl_lru_list_lock);
-
- atomic_long_inc(cli->cl_lru_left);
- /* this is a great place to release more LRU pages if
- * this OSC occupies too many LRU pages and the kernel is
- * stealing one of them.
- */
- if (osc_cache_too_much(cli)) {
- CDEBUG(D_CACHE, "%s: queue LRU work\n", cli_name(cli));
- (void)ptlrpcd_queue_work(cli->cl_lru_work);
- }
- wake_up(&osc_lru_waitq);
- } else {
- LASSERT(list_empty(&opg->ops_lru));
- }
-}
-
-/**
- * Delete the page from the LRU list because it is being redirtied.
- */
-static void osc_lru_use(struct client_obd *cli, struct osc_page *opg)
-{
- /* If page is being transferred for the first time,
- * ops_lru should be empty
- */
- if (opg->ops_in_lru && !list_empty(&opg->ops_lru)) {
- spin_lock(&cli->cl_lru_list_lock);
- __osc_lru_del(cli, opg);
- spin_unlock(&cli->cl_lru_list_lock);
- atomic_long_inc(&cli->cl_lru_busy);
- }
-}
-
-static void discard_pagevec(const struct lu_env *env, struct cl_io *io,
- struct cl_page **pvec, int max_index)
-{
- int i;
-
- for (i = 0; i < max_index; i++) {
- struct cl_page *page = pvec[i];
-
- LASSERT(cl_page_is_owned(page, io));
- cl_page_delete(env, page);
- cl_page_discard(env, io, page);
- cl_page_disown(env, io, page);
- cl_page_put(env, page);
-
- pvec[i] = NULL;
- }
-}
-
-/**
- * Check if a cl_page can be released, i.e., it's not being used.
- *
- * If unstable accounting is turned on, a bulk transfer may hold one
- * refcount for recovery, so we need to check the vmpage refcount as well;
- * otherwise, even though we could destroy the cl_page, the corresponding
- * vmpage couldn't be reused.
- */
-static inline bool lru_page_busy(struct client_obd *cli, struct cl_page *page)
-{
- if (cl_page_in_use_noref(page))
- return true;
-
- if (cli->cl_cache->ccc_unstable_check) {
- struct page *vmpage = cl_page_vmpage(page);
-
- /* vmpage have two known users: cl_page and VM page cache */
- if (page_count(vmpage) - page_mapcount(vmpage) > 2)
- return true;
- }
- return false;
-}
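
The vmpage part of the test counts on the cl_page and the VM page cache
holding exactly two references between them; anything more, once mappings are
subtracted, means another user such as an in-flight bulk. A sketch of the
bare predicate, with illustrative counts:

/* sketch: refcount/mapcount model page_count()/page_mapcount() */
static int vmpage_busy(int refcount, int mapcount)
{
        return refcount - mapcount > 2; /* e.g. 4 refs, 1 mapping => busy */
}
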
-
-/**
- * Drop at most @target pages from the LRU.
- */
-long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli,
- long target, bool force)
-{
- struct cl_io *io;
- struct cl_object *clobj = NULL;
- struct cl_page **pvec;
- struct osc_page *opg;
- int maxscan = 0;
- long count = 0;
- int index = 0;
- int rc = 0;
-
- LASSERT(atomic_long_read(&cli->cl_lru_in_list) >= 0);
- if (atomic_long_read(&cli->cl_lru_in_list) == 0 || target <= 0)
- return 0;
-
- CDEBUG(D_CACHE, "%s: shrinkers: %d, force: %d\n",
- cli_name(cli), atomic_read(&cli->cl_lru_shrinkers), force);
- if (!force) {
- if (atomic_read(&cli->cl_lru_shrinkers) > 0)
- return -EBUSY;
-
- if (atomic_inc_return(&cli->cl_lru_shrinkers) > 1) {
- atomic_dec(&cli->cl_lru_shrinkers);
- return -EBUSY;
- }
- } else {
- atomic_inc(&cli->cl_lru_shrinkers);
- }
-
- pvec = (struct cl_page **)osc_env_info(env)->oti_pvec;
- io = &osc_env_info(env)->oti_io;
-
- spin_lock(&cli->cl_lru_list_lock);
- if (force)
- cli->cl_lru_reclaim++;
- maxscan = min(target << 1, atomic_long_read(&cli->cl_lru_in_list));
- while (!list_empty(&cli->cl_lru_list)) {
- struct cl_page *page;
- bool will_free = false;
-
- if (!force && atomic_read(&cli->cl_lru_shrinkers) > 1)
- break;
-
- if (--maxscan < 0)
- break;
-
- opg = list_entry(cli->cl_lru_list.next, struct osc_page,
- ops_lru);
- page = opg->ops_cl.cpl_page;
- if (lru_page_busy(cli, page)) {
- list_move_tail(&opg->ops_lru, &cli->cl_lru_list);
- continue;
- }
-
- LASSERT(page->cp_obj);
- if (clobj != page->cp_obj) {
- struct cl_object *tmp = page->cp_obj;
-
- cl_object_get(tmp);
- spin_unlock(&cli->cl_lru_list_lock);
-
- if (clobj) {
- discard_pagevec(env, io, pvec, index);
- index = 0;
-
- cl_io_fini(env, io);
- cl_object_put(env, clobj);
- clobj = NULL;
- }
-
- clobj = tmp;
- io->ci_obj = clobj;
- io->ci_ignore_layout = 1;
- rc = cl_io_init(env, io, CIT_MISC, clobj);
-
- spin_lock(&cli->cl_lru_list_lock);
-
- if (rc != 0)
- break;
-
- ++maxscan;
- continue;
- }
-
- if (cl_page_own_try(env, io, page) == 0) {
- if (!lru_page_busy(cli, page)) {
- /* remove it from lru list earlier to avoid
- * lock contention
- */
- __osc_lru_del(cli, opg);
- opg->ops_in_lru = 0; /* will be discarded */
-
- cl_page_get(page);
- will_free = true;
- } else {
- cl_page_disown(env, io, page);
- }
- }
-
- if (!will_free) {
- list_move_tail(&opg->ops_lru, &cli->cl_lru_list);
- continue;
- }
-
- /* Don't discard and free the page with cl_lru_list held */
- pvec[index++] = page;
- if (unlikely(index == OTI_PVEC_SIZE)) {
- spin_unlock(&cli->cl_lru_list_lock);
- discard_pagevec(env, io, pvec, index);
- index = 0;
-
- spin_lock(&cli->cl_lru_list_lock);
- }
-
- if (++count >= target)
- break;
- }
- spin_unlock(&cli->cl_lru_list_lock);
-
- if (clobj) {
- discard_pagevec(env, io, pvec, index);
-
- cl_io_fini(env, io);
- cl_object_put(env, clobj);
- }
-
- atomic_dec(&cli->cl_lru_shrinkers);
- if (count > 0) {
- atomic_long_add(count, cli->cl_lru_left);
- wake_up_all(&osc_lru_waitq);
- }
- return count > 0 ? count : rc;
-}
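
osc_lru_shrink() gathers victims into oti_pvec and drops cl_lru_list_lock
before discarding each full batch, so the expensive page teardown never runs
under the list lock. A runnable userspace sketch of that batch-drain pattern
(the names, batch size, and victim list are illustrative):

#include <pthread.h>
#include <stdio.h>

#define PVEC_SIZE 4     /* stand-in for OTI_PVEC_SIZE */

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void discard_batch(int *pvec, int n)     /* heavy work, lockless */
{
        while (n--)
                printf("discard page %d\n", pvec[n]);
}

int main(void)
{
        int pvec[PVEC_SIZE], index = 0, page;

        pthread_mutex_lock(&list_lock);
        for (page = 0; page < 10; page++) {     /* victims off the LRU */
                pvec[index++] = page;
                if (index == PVEC_SIZE) {
                        /* never discard with the list lock held */
                        pthread_mutex_unlock(&list_lock);
                        discard_batch(pvec, index);
                        index = 0;
                        pthread_mutex_lock(&list_lock);
                }
        }
        pthread_mutex_unlock(&list_lock);
        if (index)
                discard_batch(pvec, index);     /* final partial batch */
        return 0;
}
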
-
-/**
- * Reclaim LRU pages by an IO thread. The caller wants to reclaim at least
- * @npages of LRU slots. For performance reasons, it's better to drop LRU
- * pages in batches, so the actual number is adjusted up to at least
- * max_pages_per_rpc.
- */
-static long osc_lru_reclaim(struct client_obd *cli, unsigned long npages)
-{
- struct lu_env *env;
- struct cl_client_cache *cache = cli->cl_cache;
- int max_scans;
- u16 refcheck;
- long rc = 0;
-
- LASSERT(cache);
-
- env = cl_env_get(&refcheck);
- if (IS_ERR(env))
- return 0;
-
- npages = max_t(int, npages, cli->cl_max_pages_per_rpc);
- CDEBUG(D_CACHE, "%s: start to reclaim %ld pages from LRU\n",
- cli_name(cli), npages);
- rc = osc_lru_shrink(env, cli, npages, true);
- if (rc >= npages) {
- CDEBUG(D_CACHE, "%s: reclaimed %ld/%ld pages from LRU\n",
- cli_name(cli), rc, npages);
- if (osc_cache_too_much(cli) > 0)
- ptlrpcd_queue_work(cli->cl_lru_work);
- goto out;
- } else if (rc > 0) {
- npages -= rc;
- }
-
- CDEBUG(D_CACHE, "%s: cli %p no free slots, pages: %ld/%ld, want: %ld\n",
- cli_name(cli), cli, atomic_long_read(&cli->cl_lru_in_list),
- atomic_long_read(&cli->cl_lru_busy), npages);
-
- /* Reclaim LRU slots from other client_obd as it can't free enough
- * from its own. This should rarely happen.
- */
- spin_lock(&cache->ccc_lru_lock);
- LASSERT(!list_empty(&cache->ccc_lru));
-
- cache->ccc_lru_shrinkers++;
- list_move_tail(&cli->cl_lru_osc, &cache->ccc_lru);
-
- max_scans = atomic_read(&cache->ccc_users) - 2;
- while (--max_scans > 0 && !list_empty(&cache->ccc_lru)) {
- cli = list_entry(cache->ccc_lru.next, struct client_obd,
- cl_lru_osc);
-
- CDEBUG(D_CACHE, "%s: cli %p LRU pages: %ld, busy: %ld.\n",
- cli_name(cli), cli,
- atomic_long_read(&cli->cl_lru_in_list),
- atomic_long_read(&cli->cl_lru_busy));
-
- list_move_tail(&cli->cl_lru_osc, &cache->ccc_lru);
- if (osc_cache_too_much(cli) > 0) {
- spin_unlock(&cache->ccc_lru_lock);
-
- rc = osc_lru_shrink(env, cli, npages, true);
- spin_lock(&cache->ccc_lru_lock);
- if (rc >= npages)
- break;
- if (rc > 0)
- npages -= rc;
- }
- }
- spin_unlock(&cache->ccc_lru_lock);
-
-out:
- cl_env_put(env, &refcheck);
- CDEBUG(D_CACHE, "%s: cli %p freed %ld pages.\n",
- cli_name(cli), cli, rc);
- return rc;
-}
-
-/**
- * osc_lru_alloc() is called to reserve an LRU slot for a cl_page.
- *
- * Usually the LRU slots are reserved up front in osc_io_iter_rw_init(),
- * which should have reserved enough slots for the whole IO; only when
- * the LRU slots are in extreme shortage does this per-page path run.
- */
-static int osc_lru_alloc(const struct lu_env *env, struct client_obd *cli,
- struct osc_page *opg)
-{
- struct osc_io *oio = osc_env_io(env);
- int rc = 0;
-
- if (!cli->cl_cache) /* shall not be in LRU */
- return 0;
-
- if (oio->oi_lru_reserved > 0) {
- --oio->oi_lru_reserved;
- goto out;
- }
-
- LASSERT(atomic_long_read(cli->cl_lru_left) >= 0);
- while (!atomic_long_add_unless(cli->cl_lru_left, -1, 0)) {
- /* ran out of LRU slots; try to drop some ourselves */
- rc = osc_lru_reclaim(cli, 1);
- if (rc < 0)
- break;
- if (rc > 0)
- continue;
-
- cond_resched();
-
- rc = l_wait_event_abortable(osc_lru_waitq,
- atomic_long_read(cli->cl_lru_left) > 0);
-
- if (rc < 0)
- break;
- }
-
-out:
- if (rc >= 0) {
- atomic_long_inc(&cli->cl_lru_busy);
- opg->ops_in_lru = 1;
- rc = 0;
- }
-
- return rc;
-}
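
The reservation step hinges on atomic_long_add_unless(), which takes a slot
only while the counter is still positive. A rough C11 equivalent of the idiom
(a sketch, not the kernel primitive):

#include <stdatomic.h>
#include <stdbool.h>

static bool take_one_slot(atomic_long *left)    /* models cl_lru_left */
{
        long v = atomic_load(left);

        while (v > 0) {
                /* on failure, v is reloaded with the current value */
                if (atomic_compare_exchange_weak(left, &v, v - 1))
                        return true;    /* one slot reserved */
        }
        return false;   /* exhausted: caller must reclaim or sleep */
}
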
-
-/**
- * osc_lru_reserve() is called to reserve enough LRU slots for I/O.
- *
- * The benefit of doing this is to reduce contention on the atomic counter
- * cl_lru_left by changing it from per-page access to per-IO access.
- */
-unsigned long osc_lru_reserve(struct client_obd *cli, unsigned long npages)
-{
- unsigned long reserved = 0;
- unsigned long max_pages;
- unsigned long c;
-
- /*
- * reserve at most a full RPC window, so that a thread cannot
- * accidentally consume too many LRU slots
- */
- max_pages = cli->cl_max_pages_per_rpc * cli->cl_max_rpcs_in_flight;
- if (npages > max_pages)
- npages = max_pages;
-
- c = atomic_long_read(cli->cl_lru_left);
- if (c < npages && osc_lru_reclaim(cli, npages) > 0)
- c = atomic_long_read(cli->cl_lru_left);
- while (c >= npages) {
- if (c == atomic_long_cmpxchg(cli->cl_lru_left, c, c - npages)) {
- reserved = npages;
- break;
- }
- c = atomic_long_read(cli->cl_lru_left);
- }
- if (atomic_long_read(cli->cl_lru_left) < max_pages) {
- /*
- * If there aren't enough pages in the per-OSC LRU then
- * wake up the LRU thread to try and clear out space, so
- * we don't block if pages are being dirtied quickly.
- */
- CDEBUG(D_CACHE, "%s: queue LRU, left: %lu/%ld.\n",
- cli_name(cli), atomic_long_read(cli->cl_lru_left),
- max_pages);
- (void)ptlrpcd_queue_work(cli->cl_lru_work);
- }
-
- return reserved;
-}
-
-/**
- * osc_lru_unreserve() is called to unreserve LRU slots.
- *
- * LRU slots reserved by osc_lru_reserve() may have entries left due to several
- * reasons such as page already existing or I/O error. Those reserved slots
- * should be freed by calling this function.
- */
-void osc_lru_unreserve(struct client_obd *cli, unsigned long npages)
-{
- atomic_long_add(npages, cli->cl_lru_left);
- wake_up_all(&osc_lru_waitq);
-}
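
Taken together, osc_lru_reserve() and osc_lru_unreserve() bracket one I/O:
grab up to a window of slots in a single compare-and-swap, then hand back
whatever went unused. A compact userspace model of the pairing, with a C11
atomic standing in for cl_lru_left:

#include <stdatomic.h>

static unsigned long bulk_reserve(atomic_long *left, unsigned long npages)
{
        long c = atomic_load(left);

        while (c >= (long)npages) {
                if (atomic_compare_exchange_weak(left, &c, c - (long)npages))
                        return npages;  /* whole window reserved at once */
        }
        return 0;       /* not enough: fall back to per-page allocation */
}

static void bulk_unreserve(atomic_long *left, unsigned long unused)
{
        atomic_fetch_add(left, (long)unused);   /* return the leftovers */
}
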
-
-/**
- * Atomic operations are expensive. We accumulate the accounting for pages
- * in the same pgdat to get better performance.
- * In practice this works well because the pages in the same RPC are
- * likely from the same page zone.
- */
-static inline void unstable_page_accounting(struct ptlrpc_bulk_desc *desc,
- int factor)
-{
- int page_count = desc->bd_iov_count;
- pg_data_t *last = NULL;
- int count = 0;
- int i;
-
- LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type));
-
- for (i = 0; i < page_count; i++) {
- pg_data_t *pgdat = page_pgdat(BD_GET_KIOV(desc, i).bv_page);
-
- if (likely(pgdat == last)) {
- ++count;
- continue;
- }
-
- if (count > 0) {
- mod_node_page_state(pgdat, NR_UNSTABLE_NFS,
- factor * count);
- count = 0;
- }
- last = pgdat;
- ++count;
- }
- if (count > 0)
- mod_node_page_state(last, NR_UNSTABLE_NFS, factor * count);
-}
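
The loop above is run-length batching keyed on the page's pgdat: consecutive
pages on the same node collapse into a single counter update. The same shape
as a runnable userspace sketch (node ids and the print stub are illustrative):

#include <stdio.h>

static void mod_node_state(int node, int delta)
{
        printf("node %d: unstable += %d\n", node, delta);
}

static void account(const int *node_of, int n, int factor)
{
        int last = -1, count = 0, i;

        for (i = 0; i < n; i++) {
                if (node_of[i] == last) {
                        count++;
                        continue;
                }
                if (count)
                        mod_node_state(last, factor * count);
                last = node_of[i];
                count = 1;
        }
        if (count)
                mod_node_state(last, factor * count);
}

int main(void)
{
        int nodes[] = { 0, 0, 0, 1, 1, 0 };

        account(nodes, 6, 1);   /* three updates instead of six */
        return 0;
}
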
-
-static inline void add_unstable_page_accounting(struct ptlrpc_bulk_desc *desc)
-{
- unstable_page_accounting(desc, 1);
-}
-
-static inline void dec_unstable_page_accounting(struct ptlrpc_bulk_desc *desc)
-{
- unstable_page_accounting(desc, -1);
-}
-
-/**
- * Performs "unstable" page accounting. This function balances the
- * increment operations performed in osc_inc_unstable_pages. It is
- * registered as the RPC request callback, and is executed when the
- * bulk RPC is committed on the server. Thus at this point, the pages
- * involved in the bulk transfer are no longer considered unstable.
- *
- * If this function is called, the request should have been committed
- * or req::rq_unstable must have been set; this implies that the unstable
- * statistics have already been added.
- */
-void osc_dec_unstable_pages(struct ptlrpc_request *req)
-{
- struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
- struct ptlrpc_bulk_desc *desc = req->rq_bulk;
- int page_count = desc->bd_iov_count;
- long unstable_count;
-
- LASSERT(page_count >= 0);
- dec_unstable_page_accounting(desc);
-
- unstable_count = atomic_long_sub_return(page_count,
- &cli->cl_unstable_count);
- LASSERT(unstable_count >= 0);
-
- unstable_count = atomic_long_sub_return(page_count,
- &cli->cl_cache->ccc_unstable_nr);
- LASSERT(unstable_count >= 0);
- if (!unstable_count)
- wake_up_all(&cli->cl_cache->ccc_unstable_waitq);
-
- if (waitqueue_active(&osc_lru_waitq))
- (void)ptlrpcd_queue_work(cli->cl_lru_work);
-}
-
-/**
- * "unstable" page accounting. See: osc_dec_unstable_pages.
- */
-void osc_inc_unstable_pages(struct ptlrpc_request *req)
-{
- struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
- struct ptlrpc_bulk_desc *desc = req->rq_bulk;
- long page_count = desc->bd_iov_count;
-
- /* No unstable page tracking */
- if (!cli->cl_cache || !cli->cl_cache->ccc_unstable_check)
- return;
-
- add_unstable_page_accounting(desc);
- atomic_long_add(page_count, &cli->cl_unstable_count);
- atomic_long_add(page_count, &cli->cl_cache->ccc_unstable_nr);
-
- /*
- * If the request has already been committed (i.e. brw_commit
- * called via rq_commit_cb), we need to undo the unstable page
- * increments we just performed because rq_commit_cb wont be
- * called again.
- */
- spin_lock(&req->rq_lock);
- if (unlikely(req->rq_committed)) {
- spin_unlock(&req->rq_lock);
-
- osc_dec_unstable_pages(req);
- } else {
- req->rq_unstable = 1;
- spin_unlock(&req->rq_lock);
- }
-}
-
-/**
- * Check whether this OSC should piggyback a SOFT_SYNC flag to the OST.
- * This function is called for every BRW RPC, so it's critical
- * that it be fast.
- */
-bool osc_over_unstable_soft_limit(struct client_obd *cli)
-{
- long unstable_nr, osc_unstable_count;
-
- /* Can't check cli->cl_unstable_count, therefore, no soft limit */
- if (!cli->cl_cache || !cli->cl_cache->ccc_unstable_check)
- return false;
-
- osc_unstable_count = atomic_long_read(&cli->cl_unstable_count);
- unstable_nr = atomic_long_read(&cli->cl_cache->ccc_unstable_nr);
-
- CDEBUG(D_CACHE,
- "%s: cli: %p unstable pages: %lu, osc unstable pages: %lu\n",
- cli_name(cli), cli, unstable_nr, osc_unstable_count);
-
- /*
- * If the LRU slots are in shortage (less than 25% remaining) AND
- * this OSC holds one full RPC window of unstable pages, it's a good
- * opportunity to piggyback a SOFT_SYNC flag.
- * Note that the OST won't respond to the SOFT_SYNC request
- * immediately, so active OSCs will have more chances to carry the
- * flag; this is reasonable.
- */
- return unstable_nr > cli->cl_cache->ccc_lru_max >> 2 &&
- osc_unstable_count > cli->cl_max_pages_per_rpc *
- cli->cl_max_rpcs_in_flight;
-}
-
-/**
- * Return how many LRU pages are cached across all OSC devices
- *
- * Return: # of cached LRU pages times the reclamation tendency
- * SHRINK_STOP if it cannot do any scanning at this time
- */
-unsigned long osc_cache_shrink_count(struct shrinker *sk,
- struct shrink_control *sc)
-{
- struct client_obd *cli;
- unsigned long cached = 0;
-
- spin_lock(&osc_shrink_lock);
- list_for_each_entry(cli, &osc_shrink_list, cl_shrink_list)
- cached += atomic_long_read(&cli->cl_lru_in_list);
- spin_unlock(&osc_shrink_lock);
-
- return (cached * sysctl_vfs_cache_pressure) / 100;
-}
-
-/**
- * Scan and try to reclaim sc->nr_to_scan cached LRU pages
- *
- * Return: number of cached LRU pages reclaimed
- * SHRINK_STOP if it cannot do any scanning in this time
- *
- * The Linux kernel calls this shrinker scan routine in a loop with
- * sc->nr_to_scan = SHRINK_BATCH (128 for now) until it has reclaimed
- * enough memory.
- *
- * If sc->nr_to_scan is 0, the VM is querying the cache size; we don't
- * need to scan or try to reclaim LRU pages, just return 0 and
- * osc_cache_shrink_count() will report the LRU page count.
- */
-unsigned long osc_cache_shrink_scan(struct shrinker *sk,
- struct shrink_control *sc)
-{
- struct client_obd *stop_anchor = NULL;
- struct client_obd *cli;
- struct lu_env *env;
- long shrank = 0;
- u16 refcheck;
- int rc;
-
- if (!sc->nr_to_scan)
- return 0;
-
- if (!(sc->gfp_mask & __GFP_FS))
- return SHRINK_STOP;
-
- env = cl_env_get(&refcheck);
- if (IS_ERR(env))
- return SHRINK_STOP;
-
- spin_lock(&osc_shrink_lock);
- while (!list_empty(&osc_shrink_list)) {
- cli = list_entry(osc_shrink_list.next, struct client_obd,
- cl_shrink_list);
-
- if (!stop_anchor)
- stop_anchor = cli;
- else if (cli == stop_anchor)
- break;
-
- list_move_tail(&cli->cl_shrink_list, &osc_shrink_list);
- spin_unlock(&osc_shrink_lock);
-
- /* shrink no more than max_pages_per_rpc for an OSC */
- rc = osc_lru_shrink(env, cli, (sc->nr_to_scan - shrank) >
- cli->cl_max_pages_per_rpc ?
- cli->cl_max_pages_per_rpc :
- sc->nr_to_scan - shrank, true);
- if (rc > 0)
- shrank += rc;
-
- if (shrank >= sc->nr_to_scan)
- goto out;
-
- spin_lock(&osc_shrink_lock);
- }
- spin_unlock(&osc_shrink_lock);
-
-out:
- cl_env_put(env, &refcheck);
-
- return shrank;
-}
-
-/** @} osc */
diff --git a/drivers/staging/lustre/lustre/osc/osc_quota.c b/drivers/staging/lustre/lustre/osc/osc_quota.c
deleted file mode 100644
index ce1731dc604f..000000000000
--- a/drivers/staging/lustre/lustre/osc/osc_quota.c
+++ /dev/null
@@ -1,284 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
- *
- * Copyright (c) 2011, 2015, Intel Corporation.
- *
- * Code originally extracted from quota directory
- */
-
-#include <obd_class.h>
-#include "osc_internal.h"
-
-static inline struct osc_quota_info *osc_oqi_alloc(u32 id)
-{
- struct osc_quota_info *oqi;
-
- oqi = kmem_cache_zalloc(osc_quota_kmem, GFP_NOFS);
- if (oqi)
- oqi->oqi_id = id;
-
- return oqi;
-}
-
-int osc_quota_chkdq(struct client_obd *cli, const unsigned int qid[])
-{
- int type;
-
- for (type = 0; type < MAXQUOTAS; type++) {
- struct osc_quota_info *oqi;
-
- oqi = cfs_hash_lookup(cli->cl_quota_hash[type], &qid[type]);
- if (oqi) {
- /* do not try to access oqi here, it could have been
- * freed by osc_quota_setdq()
- */
-
- /* the slot is busy, the user is about to run out of
- * quota space on this OST
- */
- CDEBUG(D_QUOTA, "chkdq found noquota for %s %d\n",
- type == USRQUOTA ? "user" : "grout", qid[type]);
- return NO_QUOTA;
- }
- }
-
- return QUOTA_OK;
-}
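
The chkdq contract is plain set membership: an ID cached in the per-type hash
means some OST reported it close to its limit, so buffered I/O must give way
to sync I/O. A toy model with a linear-scan set in place of cfs_hash (the
NO_QUOTA/QUOTA_OK values here are illustrative, not the Lustre definitions):

#include <stdbool.h>

#define NO_QUOTA 1      /* assumption: illustrative values only */
#define QUOTA_OK 0

static bool id_listed(const unsigned int *set, int n, unsigned int id)
{
        while (n--)
                if (set[n] == id)
                        return true;
        return false;
}

static int chkdq(const unsigned int *uids, int nu, unsigned int uid,
                 const unsigned int *gids, int ng, unsigned int gid)
{
        if (id_listed(uids, nu, uid) || id_listed(gids, ng, gid))
                return NO_QUOTA;        /* switch to sync I/O */
        return QUOTA_OK;                /* cached writes are fine */
}
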
-
-#define MD_QUOTA_FLAG(type) ((type == USRQUOTA) ? OBD_MD_FLUSRQUOTA \
- : OBD_MD_FLGRPQUOTA)
-#define FL_QUOTA_FLAG(type) ((type == USRQUOTA) ? OBD_FL_NO_USRQUOTA \
- : OBD_FL_NO_GRPQUOTA)
-
-int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[],
- u32 valid, u32 flags)
-{
- int type;
- int rc = 0;
-
- if ((valid & (OBD_MD_FLUSRQUOTA | OBD_MD_FLGRPQUOTA)) == 0)
- return 0;
-
- for (type = 0; type < MAXQUOTAS; type++) {
- struct osc_quota_info *oqi;
-
- if ((valid & MD_QUOTA_FLAG(type)) == 0)
- continue;
-
- /* lookup the ID in the per-type hash table */
- oqi = cfs_hash_lookup(cli->cl_quota_hash[type], &qid[type]);
- if ((flags & FL_QUOTA_FLAG(type)) != 0) {
- /* This ID is getting close to its quota limit, let's
- * switch to sync I/O
- */
- if (oqi)
- continue;
-
- oqi = osc_oqi_alloc(qid[type]);
- if (!oqi) {
- rc = -ENOMEM;
- break;
- }
-
- rc = cfs_hash_add_unique(cli->cl_quota_hash[type],
- &qid[type], &oqi->oqi_hash);
- /* race with others? */
- if (rc == -EALREADY) {
- rc = 0;
- kmem_cache_free(osc_quota_kmem, oqi);
- }
-
- CDEBUG(D_QUOTA, "%s: setdq to insert for %s %d (%d)\n",
- cli_name(cli),
- type == USRQUOTA ? "user" : "group",
- qid[type], rc);
- } else {
- /* This ID is now off the hook, let's remove it from
- * the hash table
- */
- if (!oqi)
- continue;
-
- oqi = cfs_hash_del_key(cli->cl_quota_hash[type],
- &qid[type]);
- if (oqi)
- kmem_cache_free(osc_quota_kmem, oqi);
-
- CDEBUG(D_QUOTA, "%s: setdq to remove for %s %d (%p)\n",
- cli_name(cli),
- type == USRQUOTA ? "user" : "group",
- qid[type], oqi);
- }
- }
-
- return rc;
-}
-
-/*
- * Hash operations for uid/gid <-> osc_quota_info
- */
-static unsigned int
-oqi_hashfn(struct cfs_hash *hs, const void *key, unsigned int mask)
-{
- return cfs_hash_u32_hash(*((__u32 *)key), mask);
-}
-
-static int
-oqi_keycmp(const void *key, struct hlist_node *hnode)
-{
- struct osc_quota_info *oqi;
- u32 uid;
-
- LASSERT(key);
- uid = *((u32 *)key);
- oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash);
-
- return uid == oqi->oqi_id;
-}
-
-static void *
-oqi_key(struct hlist_node *hnode)
-{
- struct osc_quota_info *oqi;
-
- oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash);
- return &oqi->oqi_id;
-}
-
-static void *
-oqi_object(struct hlist_node *hnode)
-{
- return hlist_entry(hnode, struct osc_quota_info, oqi_hash);
-}
-
-static void
-oqi_get(struct cfs_hash *hs, struct hlist_node *hnode)
-{
-}
-
-static void
-oqi_put_locked(struct cfs_hash *hs, struct hlist_node *hnode)
-{
-}
-
-static void
-oqi_exit(struct cfs_hash *hs, struct hlist_node *hnode)
-{
- struct osc_quota_info *oqi;
-
- oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash);
-
- kmem_cache_free(osc_quota_kmem, oqi);
-}
-
-#define HASH_QUOTA_BKT_BITS 5
-#define HASH_QUOTA_CUR_BITS 5
-#define HASH_QUOTA_MAX_BITS 15
-
-static struct cfs_hash_ops quota_hash_ops = {
- .hs_hash = oqi_hashfn,
- .hs_keycmp = oqi_keycmp,
- .hs_key = oqi_key,
- .hs_object = oqi_object,
- .hs_get = oqi_get,
- .hs_put_locked = oqi_put_locked,
- .hs_exit = oqi_exit,
-};
-
-int osc_quota_setup(struct obd_device *obd)
-{
- struct client_obd *cli = &obd->u.cli;
- int i, type;
-
- for (type = 0; type < MAXQUOTAS; type++) {
- cli->cl_quota_hash[type] = cfs_hash_create("QUOTA_HASH",
- HASH_QUOTA_CUR_BITS,
- HASH_QUOTA_MAX_BITS,
- HASH_QUOTA_BKT_BITS,
- 0,
- CFS_HASH_MIN_THETA,
- CFS_HASH_MAX_THETA,
- &quota_hash_ops,
- CFS_HASH_DEFAULT);
- if (!cli->cl_quota_hash[type])
- break;
- }
-
- if (type == MAXQUOTAS)
- return 0;
-
- for (i = 0; i < type; i++)
- cfs_hash_putref(cli->cl_quota_hash[i]);
-
- return -ENOMEM;
-}
-
-int osc_quota_cleanup(struct obd_device *obd)
-{
- struct client_obd *cli = &obd->u.cli;
- int type;
-
- for (type = 0; type < MAXQUOTAS; type++)
- cfs_hash_putref(cli->cl_quota_hash[type]);
-
- return 0;
-}
-
-int osc_quotactl(struct obd_device *unused, struct obd_export *exp,
- struct obd_quotactl *oqctl)
-{
- struct ptlrpc_request *req;
- struct obd_quotactl *oqc;
- int rc;
-
- req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
- &RQF_OST_QUOTACTL, LUSTRE_OST_VERSION,
- OST_QUOTACTL);
- if (!req)
- return -ENOMEM;
-
- oqc = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL);
- *oqc = *oqctl;
-
- ptlrpc_request_set_replen(req);
- ptlrpc_at_set_req_timeout(req);
- req->rq_no_resend = 1;
-
- rc = ptlrpc_queue_wait(req);
- if (rc)
- CERROR("ptlrpc_queue_wait failed, rc: %d\n", rc);
-
- if (req->rq_repmsg) {
- oqc = req_capsule_server_get(&req->rq_pill, &RMF_OBD_QUOTACTL);
- if (oqc) {
- *oqctl = *oqc;
- } else if (!rc) {
- CERROR("Can't unpack obd_quotactl\n");
- rc = -EPROTO;
- }
- } else if (!rc) {
- CERROR("Can't unpack obd_quotactl\n");
- rc = -EPROTO;
- }
- ptlrpc_req_finished(req);
-
- return rc;
-}
diff --git a/drivers/staging/lustre/lustre/osc/osc_request.c b/drivers/staging/lustre/lustre/osc/osc_request.c
deleted file mode 100644
index 1c2bbbf5d864..000000000000
--- a/drivers/staging/lustre/lustre/osc/osc_request.c
+++ /dev/null
@@ -1,2899 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.gnu.org/licenses/gpl-2.0.html
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2011, 2015, Intel Corporation.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- */
-
-#define DEBUG_SUBSYSTEM S_OSC
-
-#include <linux/libcfs/libcfs.h>
-
-#include <lustre_dlm.h>
-#include <lustre_net.h>
-#include <uapi/linux/lustre/lustre_idl.h>
-#include <obd_cksum.h>
-
-#include <lustre_ha.h>
-#include <lprocfs_status.h>
-#include <uapi/linux/lustre/lustre_ioctl.h>
-#include <lustre_debug.h>
-#include <lustre_obdo.h>
-#include <uapi/linux/lustre/lustre_param.h>
-#include <lustre_fid.h>
-#include <obd_class.h>
-#include <obd.h>
-#include "osc_internal.h"
-#include "osc_cl_internal.h"
-
-atomic_t osc_pool_req_count;
-unsigned int osc_reqpool_maxreqcount;
-struct ptlrpc_request_pool *osc_rq_pool;
-
-/* max memory used for request pool, unit is MB */
-static unsigned int osc_reqpool_mem_max = 5;
-module_param(osc_reqpool_mem_max, uint, 0444);
-
-struct osc_brw_async_args {
- struct obdo *aa_oa;
- int aa_requested_nob;
- int aa_nio_count;
- u32 aa_page_count;
- int aa_resends;
- struct brw_page **aa_ppga;
- struct client_obd *aa_cli;
- struct list_head aa_oaps;
- struct list_head aa_exts;
-};
-
-struct osc_async_args {
- struct obd_info *aa_oi;
-};
-
-struct osc_setattr_args {
- struct obdo *sa_oa;
- obd_enqueue_update_f sa_upcall;
- void *sa_cookie;
-};
-
-struct osc_fsync_args {
- struct osc_object *fa_obj;
- struct obdo *fa_oa;
- obd_enqueue_update_f fa_upcall;
- void *fa_cookie;
-};
-
-struct osc_enqueue_args {
- struct obd_export *oa_exp;
- enum ldlm_type oa_type;
- enum ldlm_mode oa_mode;
- __u64 *oa_flags;
- osc_enqueue_upcall_f oa_upcall;
- void *oa_cookie;
- struct ost_lvb *oa_lvb;
- struct lustre_handle oa_lockh;
- unsigned int oa_agl:1;
-};
-
-static void osc_release_ppga(struct brw_page **ppga, u32 count);
-static int brw_interpret(const struct lu_env *env,
- struct ptlrpc_request *req, void *data, int rc);
-
-static inline void osc_pack_req_body(struct ptlrpc_request *req,
- struct obdo *oa)
-{
- struct ost_body *body;
-
- body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
- LASSERT(body);
-
- lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
-}
-
-static int osc_getattr(const struct lu_env *env, struct obd_export *exp,
- struct obdo *oa)
-{
- struct ptlrpc_request *req;
- struct ost_body *body;
- int rc;
-
- req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GETATTR);
- if (!req)
- return -ENOMEM;
-
- rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GETATTR);
- if (rc) {
- ptlrpc_request_free(req);
- return rc;
- }
-
- osc_pack_req_body(req, oa);
-
- ptlrpc_request_set_replen(req);
-
- rc = ptlrpc_queue_wait(req);
- if (rc)
- goto out;
-
- body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
- if (!body) {
- rc = -EPROTO;
- goto out;
- }
-
- CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode);
- lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa,
- &body->oa);
-
- oa->o_blksize = cli_brw_size(exp->exp_obd);
- oa->o_valid |= OBD_MD_FLBLKSZ;
-
- out:
- ptlrpc_req_finished(req);
- return rc;
-}
-
-static int osc_setattr(const struct lu_env *env, struct obd_export *exp,
- struct obdo *oa)
-{
- struct ptlrpc_request *req;
- struct ost_body *body;
- int rc;
-
- LASSERT(oa->o_valid & OBD_MD_FLGROUP);
-
- req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR);
- if (!req)
- return -ENOMEM;
-
- rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR);
- if (rc) {
- ptlrpc_request_free(req);
- return rc;
- }
-
- osc_pack_req_body(req, oa);
-
- ptlrpc_request_set_replen(req);
-
- rc = ptlrpc_queue_wait(req);
- if (rc)
- goto out;
-
- body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
- if (!body) {
- rc = -EPROTO;
- goto out;
- }
-
- lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa,
- &body->oa);
-
-out:
- ptlrpc_req_finished(req);
- return rc;
-}
-
-static int osc_setattr_interpret(const struct lu_env *env,
- struct ptlrpc_request *req,
- struct osc_setattr_args *sa, int rc)
-{
- struct ost_body *body;
-
- if (rc != 0)
- goto out;
-
- body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
- if (!body) {
- rc = -EPROTO;
- goto out;
- }
-
- lustre_get_wire_obdo(&req->rq_import->imp_connect_data, sa->sa_oa,
- &body->oa);
-out:
- rc = sa->sa_upcall(sa->sa_cookie, rc);
- return rc;
-}
-
-int osc_setattr_async(struct obd_export *exp, struct obdo *oa,
- obd_enqueue_update_f upcall, void *cookie,
- struct ptlrpc_request_set *rqset)
-{
- struct ptlrpc_request *req;
- struct osc_setattr_args *sa;
- int rc;
-
- req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR);
- if (!req)
- return -ENOMEM;
-
- rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR);
- if (rc) {
- ptlrpc_request_free(req);
- return rc;
- }
-
- osc_pack_req_body(req, oa);
-
- ptlrpc_request_set_replen(req);
-
- /* do mds to ost setattr asynchronously */
- if (!rqset) {
- /* Do not wait for response. */
- ptlrpcd_add_req(req);
- } else {
- req->rq_interpret_reply =
- (ptlrpc_interpterer_t)osc_setattr_interpret;
-
- BUILD_BUG_ON(sizeof(*sa) > sizeof(req->rq_async_args));
- sa = ptlrpc_req_async_args(req);
- sa->sa_oa = oa;
- sa->sa_upcall = upcall;
- sa->sa_cookie = cookie;
-
- if (rqset == PTLRPCD_SET)
- ptlrpcd_add_req(req);
- else
- ptlrpc_set_add_req(rqset, req);
- }
-
- return 0;
-}
-
-static int osc_create(const struct lu_env *env, struct obd_export *exp,
- struct obdo *oa)
-{
- struct ptlrpc_request *req;
- struct ost_body *body;
- int rc;
-
- LASSERT(oa);
- LASSERT(oa->o_valid & OBD_MD_FLGROUP);
- LASSERT(fid_seq_is_echo(ostid_seq(&oa->o_oi)));
-
- req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_CREATE);
- if (!req) {
- rc = -ENOMEM;
- goto out;
- }
-
- rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_CREATE);
- if (rc) {
- ptlrpc_request_free(req);
- goto out;
- }
-
- body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
- LASSERT(body);
-
- lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
-
- ptlrpc_request_set_replen(req);
-
- rc = ptlrpc_queue_wait(req);
- if (rc)
- goto out_req;
-
- body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
- if (!body) {
- rc = -EPROTO;
- goto out_req;
- }
-
- CDEBUG(D_INFO, "oa flags %x\n", oa->o_flags);
- lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, &body->oa);
-
- oa->o_blksize = cli_brw_size(exp->exp_obd);
- oa->o_valid |= OBD_MD_FLBLKSZ;
-
- CDEBUG(D_HA, "transno: %lld\n",
- lustre_msg_get_transno(req->rq_repmsg));
-out_req:
- ptlrpc_req_finished(req);
-out:
- return rc;
-}
-
-int osc_punch_base(struct obd_export *exp, struct obdo *oa,
- obd_enqueue_update_f upcall, void *cookie,
- struct ptlrpc_request_set *rqset)
-{
- struct ptlrpc_request *req;
- struct osc_setattr_args *sa;
- struct ost_body *body;
- int rc;
-
- req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_PUNCH);
- if (!req)
- return -ENOMEM;
-
- rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_PUNCH);
- if (rc) {
- ptlrpc_request_free(req);
- return rc;
- }
- req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */
- ptlrpc_at_set_req_timeout(req);
-
- body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
- LASSERT(body);
- lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa,
- oa);
-
- ptlrpc_request_set_replen(req);
-
- req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_setattr_interpret;
- BUILD_BUG_ON(sizeof(*sa) > sizeof(req->rq_async_args));
- sa = ptlrpc_req_async_args(req);
- sa->sa_oa = oa;
- sa->sa_upcall = upcall;
- sa->sa_cookie = cookie;
- if (rqset == PTLRPCD_SET)
- ptlrpcd_add_req(req);
- else
- ptlrpc_set_add_req(rqset, req);
-
- return 0;
-}
-
-static int osc_sync_interpret(const struct lu_env *env,
- struct ptlrpc_request *req,
- void *arg, int rc)
-{
- struct cl_attr *attr = &osc_env_info(env)->oti_attr;
- struct osc_fsync_args *fa = arg;
- unsigned long valid = 0;
- struct ost_body *body;
- struct cl_object *obj;
-
- if (rc)
- goto out;
-
- body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
- if (!body) {
- CERROR("can't unpack ost_body\n");
- rc = -EPROTO;
- goto out;
- }
-
- *fa->fa_oa = body->oa;
- obj = osc2cl(fa->fa_obj);
-
- /* Update osc object's blocks attribute */
- cl_object_attr_lock(obj);
- if (body->oa.o_valid & OBD_MD_FLBLOCKS) {
- attr->cat_blocks = body->oa.o_blocks;
- valid |= CAT_BLOCKS;
- }
-
- if (valid)
- cl_object_attr_update(env, obj, attr, valid);
- cl_object_attr_unlock(obj);
-
-out:
- rc = fa->fa_upcall(fa->fa_cookie, rc);
- return rc;
-}
-
-int osc_sync_base(struct osc_object *obj, struct obdo *oa,
- obd_enqueue_update_f upcall, void *cookie,
- struct ptlrpc_request_set *rqset)
-{
- struct obd_export *exp = osc_export(obj);
- struct ptlrpc_request *req;
- struct ost_body *body;
- struct osc_fsync_args *fa;
- int rc;
-
- req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SYNC);
- if (!req)
- return -ENOMEM;
-
- rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SYNC);
- if (rc) {
- ptlrpc_request_free(req);
- return rc;
- }
-
- /* overload the size and blocks fields in the oa with start/end */
- body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
- LASSERT(body);
- lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa,
- oa);
-
- ptlrpc_request_set_replen(req);
- req->rq_interpret_reply = osc_sync_interpret;
-
- BUILD_BUG_ON(sizeof(*fa) > sizeof(req->rq_async_args));
- fa = ptlrpc_req_async_args(req);
- fa->fa_obj = obj;
- fa->fa_oa = oa;
- fa->fa_upcall = upcall;
- fa->fa_cookie = cookie;
-
- if (rqset == PTLRPCD_SET)
- ptlrpcd_add_req(req);
- else
- ptlrpc_set_add_req(rqset, req);
-
- return 0;
-}
-
-/* Find and locally cancel the locks matched by @mode in the resource
- * identified by @oa. Found locks are added to the @cancels list. Returns
- * the number of locks added to the @cancels list.
- */
-static int osc_resource_get_unused(struct obd_export *exp, struct obdo *oa,
- struct list_head *cancels,
- enum ldlm_mode mode, __u64 lock_flags)
-{
- struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
- struct ldlm_res_id res_id;
- struct ldlm_resource *res;
- int count;
-
- /* Return, i.e. cancel nothing, only if ELC is supported (flag in
- * export) but disabled through procfs (flag in NS).
- *
- * This is distinct from the case where ELC is not supported at all,
- * in which we still want to cancel locks in advance and just cancel
- * them locally, without sending any RPC.
- */
- if (exp_connect_cancelset(exp) && !ns_connect_cancelset(ns))
- return 0;
-
- ostid_build_res_name(&oa->o_oi, &res_id);
- res = ldlm_resource_get(ns, NULL, &res_id, 0, 0);
- if (IS_ERR(res))
- return 0;
-
- LDLM_RESOURCE_ADDREF(res);
- count = ldlm_cancel_resource_local(res, cancels, NULL, mode,
- lock_flags, 0, NULL);
- LDLM_RESOURCE_DELREF(res);
- ldlm_resource_putref(res);
- return count;
-}
-
-static int osc_destroy_interpret(const struct lu_env *env,
- struct ptlrpc_request *req, void *data,
- int rc)
-{
- struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
-
- atomic_dec(&cli->cl_destroy_in_flight);
- wake_up(&cli->cl_destroy_waitq);
- return 0;
-}
-
-static int osc_can_send_destroy(struct client_obd *cli)
-{
- if (atomic_inc_return(&cli->cl_destroy_in_flight) <=
- cli->cl_max_rpcs_in_flight) {
- /* The destroy request can be sent */
- return 1;
- }
- if (atomic_dec_return(&cli->cl_destroy_in_flight) <
- cli->cl_max_rpcs_in_flight) {
- /*
- * The counter has been modified between the two atomic
- * operations.
- */
- wake_up(&cli->cl_destroy_waitq);
- }
- return 0;
-}
-
-static int osc_destroy(const struct lu_env *env, struct obd_export *exp,
- struct obdo *oa)
-{
- struct client_obd *cli = &exp->exp_obd->u.cli;
- struct ptlrpc_request *req;
- struct ost_body *body;
- LIST_HEAD(cancels);
- int rc, count;
-
- if (!oa) {
- CDEBUG(D_INFO, "oa NULL\n");
- return -EINVAL;
- }
-
- count = osc_resource_get_unused(exp, oa, &cancels, LCK_PW,
- LDLM_FL_DISCARD_DATA);
-
- req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_DESTROY);
- if (!req) {
- ldlm_lock_list_put(&cancels, l_bl_ast, count);
- return -ENOMEM;
- }
-
- rc = ldlm_prep_elc_req(exp, req, LUSTRE_OST_VERSION, OST_DESTROY,
- 0, &cancels, count);
- if (rc) {
- ptlrpc_request_free(req);
- return rc;
- }
-
- req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */
- ptlrpc_at_set_req_timeout(req);
-
- body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
- LASSERT(body);
- lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
-
- ptlrpc_request_set_replen(req);
-
- req->rq_interpret_reply = osc_destroy_interpret;
- if (!osc_can_send_destroy(cli)) {
- /*
- * Wait until the number of on-going destroy RPCs drops
- * below max_rpcs_in_flight
- */
- l_wait_event_abortable_exclusive(cli->cl_destroy_waitq,
- osc_can_send_destroy(cli));
- }
-
- /* Do not wait for response */
- ptlrpcd_add_req(req);
- return 0;
-}
-
-static void osc_announce_cached(struct client_obd *cli, struct obdo *oa,
- long writing_bytes)
-{
- u32 bits = OBD_MD_FLBLOCKS | OBD_MD_FLGRANT;
-
- LASSERT(!(oa->o_valid & bits));
-
- oa->o_valid |= bits;
- spin_lock(&cli->cl_loi_list_lock);
- oa->o_dirty = cli->cl_dirty_pages << PAGE_SHIFT;
- if (unlikely(cli->cl_dirty_pages - cli->cl_dirty_transit >
- cli->cl_dirty_max_pages)) {
- CERROR("dirty %lu - %lu > dirty_max %lu\n",
- cli->cl_dirty_pages, cli->cl_dirty_transit,
- cli->cl_dirty_max_pages);
- oa->o_undirty = 0;
- } else if (unlikely(atomic_long_read(&obd_dirty_pages) -
- atomic_long_read(&obd_dirty_transit_pages) >
- (long)(obd_max_dirty_pages + 1))) {
- /* The atomic_read() and the atomic_inc() are
- * not covered by a lock, thus they may safely race and trip
- * this CERROR() unless we add in a small fudge factor (+1).
- */
- CERROR("%s: dirty %ld + %ld > system dirty_max %ld\n",
- cli_name(cli), atomic_long_read(&obd_dirty_pages),
- atomic_long_read(&obd_dirty_transit_pages),
- obd_max_dirty_pages);
- oa->o_undirty = 0;
- } else if (unlikely(cli->cl_dirty_max_pages - cli->cl_dirty_pages >
- 0x7fffffff)) {
- CERROR("dirty %lu - dirty_max %lu too big???\n",
- cli->cl_dirty_pages, cli->cl_dirty_max_pages);
- oa->o_undirty = 0;
- } else {
- unsigned long max_in_flight;
-
- max_in_flight = (cli->cl_max_pages_per_rpc << PAGE_SHIFT) *
- (cli->cl_max_rpcs_in_flight + 1);
- oa->o_undirty = max(cli->cl_dirty_max_pages << PAGE_SHIFT,
- max_in_flight);
- }
- oa->o_grant = cli->cl_avail_grant + cli->cl_reserved_grant;
- oa->o_dropped = cli->cl_lost_grant;
- cli->cl_lost_grant = 0;
- spin_unlock(&cli->cl_loi_list_lock);
- CDEBUG(D_CACHE, "dirty: %llu undirty: %u dropped %u grant: %llu\n",
- oa->o_dirty, oa->o_undirty, oa->o_dropped, oa->o_grant);
-}
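
o_undirty advertises how much more dirty data this client is willing to
buffer: the larger of the per-OSC dirty limit and one extra full RPC window.
A worked sketch of that arithmetic with hypothetical tunables:

#include <stdio.h>

int main(void)
{
        unsigned long page_shift = 12;          /* assumption: 4 KiB pages */
        unsigned long max_ppr = 256;            /* pages per RPC (1 MiB) */
        unsigned long max_rif = 8;              /* RPCs in flight */
        unsigned long dirty_max_pages = 16384;  /* 64 MiB dirty limit */
        unsigned long max_in_flight, undirty;

        max_in_flight = (max_ppr << page_shift) * (max_rif + 1);
        undirty = dirty_max_pages << page_shift;
        if (max_in_flight > undirty)
                undirty = max_in_flight;

        printf("o_undirty = %lu bytes\n", undirty);     /* 67108864 */
        return 0;
}
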
-
-void osc_update_next_shrink(struct client_obd *cli)
-{
- cli->cl_next_shrink_grant =
- cfs_time_shift(cli->cl_grant_shrink_interval);
- CDEBUG(D_CACHE, "next time %ld to shrink grant\n",
- cli->cl_next_shrink_grant);
-}
-
-static void __osc_update_grant(struct client_obd *cli, u64 grant)
-{
- spin_lock(&cli->cl_loi_list_lock);
- cli->cl_avail_grant += grant;
- spin_unlock(&cli->cl_loi_list_lock);
-}
-
-static void osc_update_grant(struct client_obd *cli, struct ost_body *body)
-{
- if (body->oa.o_valid & OBD_MD_FLGRANT) {
- CDEBUG(D_CACHE, "got %llu extra grant\n", body->oa.o_grant);
- __osc_update_grant(cli, body->oa.o_grant);
- }
-}
-
-static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp,
- u32 keylen, void *key, u32 vallen,
- void *val, struct ptlrpc_request_set *set);
-
-static int osc_shrink_grant_interpret(const struct lu_env *env,
- struct ptlrpc_request *req,
- void *aa, int rc)
-{
- struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
- struct obdo *oa = ((struct osc_brw_async_args *)aa)->aa_oa;
- struct ost_body *body;
-
- if (rc != 0) {
- __osc_update_grant(cli, oa->o_grant);
- goto out;
- }
-
- body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
- LASSERT(body);
- osc_update_grant(cli, body);
-out:
- kmem_cache_free(obdo_cachep, oa);
- return rc;
-}
-
-static void osc_shrink_grant_local(struct client_obd *cli, struct obdo *oa)
-{
- spin_lock(&cli->cl_loi_list_lock);
- oa->o_grant = cli->cl_avail_grant / 4;
- cli->cl_avail_grant -= oa->o_grant;
- spin_unlock(&cli->cl_loi_list_lock);
- if (!(oa->o_valid & OBD_MD_FLFLAGS)) {
- oa->o_valid |= OBD_MD_FLFLAGS;
- oa->o_flags = 0;
- }
- oa->o_flags |= OBD_FL_SHRINK_GRANT;
- osc_update_next_shrink(cli);
-}
-
-/* Shrink the current grant, either from some large amount to enough for a
- * full set of in-flight RPCs, or if we have already shrunk to that limit
- * then to enough for a single RPC. This avoids keeping more grant than
- * needed, and avoids shrinking the grant piecemeal.
- */
-static int osc_shrink_grant(struct client_obd *cli)
-{
- __u64 target_bytes = (cli->cl_max_rpcs_in_flight + 1) *
- (cli->cl_max_pages_per_rpc << PAGE_SHIFT);
-
- spin_lock(&cli->cl_loi_list_lock);
- if (cli->cl_avail_grant <= target_bytes)
- target_bytes = cli->cl_max_pages_per_rpc << PAGE_SHIFT;
- spin_unlock(&cli->cl_loi_list_lock);
-
- return osc_shrink_grant_to_target(cli, target_bytes);
-}
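
The shrink target is chosen in two stages: first come down to a full
in-flight window, and only once already at or below that level come down to a
single RPC. A sketch of the target selection with hypothetical sizes:

#include <stdio.h>

int main(void)
{
        unsigned long long rpc_bytes = 1 << 20; /* assumption: 1 MiB RPCs */
        unsigned long long max_rif = 8;         /* RPCs in flight */
        unsigned long long avail = 6 << 20;     /* current avail grant */
        unsigned long long target = (max_rif + 1) * rpc_bytes;

        if (avail <= target)            /* already within the window */
                target = rpc_bytes;     /* so shrink to a single RPC */

        printf("shrink grant to %llu bytes\n", target); /* 1 MiB */
        return 0;
}
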
-
-int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes)
-{
- int rc = 0;
- struct ost_body *body;
-
- spin_lock(&cli->cl_loi_list_lock);
- /* Don't shrink if we are already above or below the desired limit
- * We don't want to shrink below a single RPC, as that will negatively
- * impact block allocation and long-term performance.
- */
- if (target_bytes < cli->cl_max_pages_per_rpc << PAGE_SHIFT)
- target_bytes = cli->cl_max_pages_per_rpc << PAGE_SHIFT;
-
- if (target_bytes >= cli->cl_avail_grant) {
- spin_unlock(&cli->cl_loi_list_lock);
- return 0;
- }
- spin_unlock(&cli->cl_loi_list_lock);
-
- body = kzalloc(sizeof(*body), GFP_NOFS);
- if (!body)
- return -ENOMEM;
-
- osc_announce_cached(cli, &body->oa, 0);
-
- spin_lock(&cli->cl_loi_list_lock);
- body->oa.o_grant = cli->cl_avail_grant - target_bytes;
- cli->cl_avail_grant = target_bytes;
- spin_unlock(&cli->cl_loi_list_lock);
- if (!(body->oa.o_valid & OBD_MD_FLFLAGS)) {
- body->oa.o_valid |= OBD_MD_FLFLAGS;
- body->oa.o_flags = 0;
- }
- body->oa.o_flags |= OBD_FL_SHRINK_GRANT;
- osc_update_next_shrink(cli);
-
- rc = osc_set_info_async(NULL, cli->cl_import->imp_obd->obd_self_export,
- sizeof(KEY_GRANT_SHRINK), KEY_GRANT_SHRINK,
- sizeof(*body), body, NULL);
- if (rc != 0)
- __osc_update_grant(cli, body->oa.o_grant);
- kfree(body);
- return rc;
-}
-
-static int osc_should_shrink_grant(struct client_obd *client)
-{
- unsigned long time = cfs_time_current();
- unsigned long next_shrink = client->cl_next_shrink_grant;
-
- if ((client->cl_import->imp_connect_data.ocd_connect_flags &
- OBD_CONNECT_GRANT_SHRINK) == 0)
- return 0;
-
- if (cfs_time_aftereq(time, next_shrink - 5 * CFS_TICK)) {
- /* Get the current RPC size directly, instead of going via:
- * cli_brw_size(obd->u.cli.cl_import->imp_obd->obd_self_export)
- * Keep comment here so that it can be found by searching.
- */
- int brw_size = client->cl_max_pages_per_rpc << PAGE_SHIFT;
-
- if (client->cl_import->imp_state == LUSTRE_IMP_FULL &&
- client->cl_avail_grant > brw_size)
- return 1;
-
- osc_update_next_shrink(client);
- }
- return 0;
-}
-
-static int osc_grant_shrink_grant_cb(struct timeout_item *item, void *data)
-{
- struct client_obd *client;
-
- list_for_each_entry(client, &item->ti_obd_list, cl_grant_shrink_list) {
- if (osc_should_shrink_grant(client))
- osc_shrink_grant(client);
- }
- return 0;
-}
-
-static int osc_add_shrink_grant(struct client_obd *client)
-{
- int rc;
-
- rc = ptlrpc_add_timeout_client(client->cl_grant_shrink_interval,
- TIMEOUT_GRANT,
- osc_grant_shrink_grant_cb, NULL,
- &client->cl_grant_shrink_list);
- if (rc) {
- CERROR("add grant client %s error %d\n", cli_name(client), rc);
- return rc;
- }
- CDEBUG(D_CACHE, "add grant client %s\n", cli_name(client));
- osc_update_next_shrink(client);
- return 0;
-}
-
-static int osc_del_shrink_grant(struct client_obd *client)
-{
- return ptlrpc_del_timeout_client(&client->cl_grant_shrink_list,
- TIMEOUT_GRANT);
-}
-
-static void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd)
-{
- /*
- * ocd_grant is the total grant amount we expect to hold: if we've
- * been evicted, it's the new avail_grant amount, and cl_dirty_pages
- * will drop to 0 as in-flight RPCs fail out; otherwise, it's
- * avail_grant + dirty.
- *
- * race is tolerable here: if we're evicted, but imp_state already
- * left EVICTED state, then cl_dirty_pages must be 0 already.
- */
- spin_lock(&cli->cl_loi_list_lock);
- if (cli->cl_import->imp_state == LUSTRE_IMP_EVICTED)
- cli->cl_avail_grant = ocd->ocd_grant;
- else
- cli->cl_avail_grant = ocd->ocd_grant -
- (cli->cl_dirty_pages << PAGE_SHIFT);
-
- /* determine the appropriate chunk size used by osc_extent. */
- cli->cl_chunkbits = max_t(int, PAGE_SHIFT, ocd->ocd_blocksize);
- spin_unlock(&cli->cl_loi_list_lock);
-
- CDEBUG(D_CACHE, "%s, setting cl_avail_grant: %ld cl_lost_grant: %ld chunk bits: %d\n",
- cli_name(cli), cli->cl_avail_grant, cli->cl_lost_grant,
- cli->cl_chunkbits);
-
- if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT_SHRINK &&
- list_empty(&cli->cl_grant_shrink_list))
- osc_add_shrink_grant(cli);
-}
-
-/* We assume that the reason this OSC got a short read is that it read
- * beyond the end of a stripe file; i.e. Lustre is reading a sparse file
- * via the LOV and it _knows_ it's reading inside the file. It's just that
- * this stripe never got written at or beyond this stripe offset yet.
- */
-static void handle_short_read(int nob_read, u32 page_count,
- struct brw_page **pga)
-{
- char *ptr;
- int i = 0;
-
- /* skip bytes read OK */
- while (nob_read > 0) {
- LASSERT(page_count > 0);
-
- if (pga[i]->count > nob_read) {
- /* EOF inside this page */
- ptr = kmap(pga[i]->pg) +
- (pga[i]->off & ~PAGE_MASK);
- memset(ptr + nob_read, 0, pga[i]->count - nob_read);
- kunmap(pga[i]->pg);
- page_count--;
- i++;
- break;
- }
-
- nob_read -= pga[i]->count;
- page_count--;
- i++;
- }
-
- /* zero remaining pages */
- while (page_count-- > 0) {
- ptr = kmap(pga[i]->pg) + (pga[i]->off & ~PAGE_MASK);
- memset(ptr, 0, pga[i]->count);
- kunmap(pga[i]->pg);
- i++;
- }
-}
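
A userspace model of the fixup, using flat buffers in place of kmap'd pages
(the function name is illustrative): bytes past nob_read are zeroed so a read
beyond the written end of a stripe comes back as a hole.

#include <string.h>

static void short_read_fixup(char **pages, const int *count,
                             int page_count, int nob_read)
{
        int i;

        for (i = 0; i < page_count; i++) {
                if (nob_read >= count[i]) {
                        nob_read -= count[i];   /* fully read */
                        continue;
                }
                /* EOF inside this page: zero its tail, then whole pages */
                memset(pages[i] + nob_read, 0, count[i] - nob_read);
                nob_read = 0;
        }
}
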
-
-static int check_write_rcs(struct ptlrpc_request *req,
- int requested_nob, int niocount,
- u32 page_count, struct brw_page **pga)
-{
- int i;
- __u32 *remote_rcs;
-
- remote_rcs = req_capsule_server_sized_get(&req->rq_pill, &RMF_RCS,
- sizeof(*remote_rcs) *
- niocount);
- if (!remote_rcs) {
- CDEBUG(D_INFO, "Missing/short RC vector on BRW_WRITE reply\n");
- return -EPROTO;
- }
-
- /* return error if any niobuf was in error */
- for (i = 0; i < niocount; i++) {
- if ((int)remote_rcs[i] < 0)
- return remote_rcs[i];
-
- if (remote_rcs[i] != 0) {
- CDEBUG(D_INFO, "rc[%d] invalid (%d) req %p\n",
- i, remote_rcs[i], req);
- return -EPROTO;
- }
- }
-
- if (req->rq_bulk->bd_nob_transferred != requested_nob) {
- CERROR("Unexpected # bytes transferred: %d (requested %d)\n",
- req->rq_bulk->bd_nob_transferred, requested_nob);
- return -EPROTO;
- }
-
- return 0;
-}
-
-static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2)
-{
- if (p1->flag != p2->flag) {
- unsigned int mask = ~(OBD_BRW_FROM_GRANT | OBD_BRW_NOCACHE |
- OBD_BRW_SYNC | OBD_BRW_ASYNC |
- OBD_BRW_NOQUOTA | OBD_BRW_SOFT_SYNC);
-
- /* warn if we try to combine flags that we don't know to be
- * safe to combine
- */
- if (unlikely((p1->flag & mask) != (p2->flag & mask))) {
- CWARN("Saw flags 0x%x and 0x%x in the same brw, please report this at http://bugs.whamcloud.com/\n",
- p1->flag, p2->flag);
- }
- return 0;
- }
-
- return (p1->off + p1->count == p2->off);
-}
-
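-/*
- * Editorial note, not from the original source: can_merge_pages() below
- * allows a merge only when the two pages carry identical flags and are
- * byte-contiguous. For example, pages at (off = 0, count = 4096) and
- * (off = 4096, count = 4096) with equal flags collapse into a single
- * niobuf with rnb_len = 8192 in osc_brw_prep_request().
- */
-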
-static u32 osc_checksum_bulk(int nob, u32 pg_count,
- struct brw_page **pga, int opc,
- enum cksum_type cksum_type)
-{
- __u32 cksum;
- int i = 0;
- struct ahash_request *hdesc;
- unsigned int bufsize;
- unsigned char cfs_alg = cksum_obd2cfs(cksum_type);
-
- LASSERT(pg_count > 0);
-
- hdesc = cfs_crypto_hash_init(cfs_alg, NULL, 0);
- if (IS_ERR(hdesc)) {
- CERROR("Unable to initialize checksum hash %s\n",
- cfs_crypto_hash_name(cfs_alg));
- return PTR_ERR(hdesc);
- }
-
- while (nob > 0 && pg_count > 0) {
- unsigned int count = pga[i]->count > nob ? nob : pga[i]->count;
-
- /* corrupt the data before we compute the checksum, to
- * simulate an OST->client data error
- */
- if (i == 0 && opc == OST_READ &&
- OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE)) {
- unsigned char *ptr = kmap(pga[i]->pg);
- int off = pga[i]->off & ~PAGE_MASK;
-
- memcpy(ptr + off, "bad1", min_t(typeof(nob), 4, nob));
- kunmap(pga[i]->pg);
- }
- cfs_crypto_hash_update_page(hdesc, pga[i]->pg,
- pga[i]->off & ~PAGE_MASK,
- count);
- CDEBUG(D_PAGE,
- "page %p map %p index %lu flags %lx count %u priv %0lx: off %d\n",
- pga[i]->pg, pga[i]->pg->mapping, pga[i]->pg->index,
- (long)pga[i]->pg->flags, page_count(pga[i]->pg),
- page_private(pga[i]->pg),
- (int)(pga[i]->off & ~PAGE_MASK));
-
- nob -= pga[i]->count;
- pg_count--;
- i++;
- }
-
- bufsize = sizeof(cksum);
- cfs_crypto_hash_final(hdesc, (unsigned char *)&cksum, &bufsize);
-
- /* For sends we only compute a wrong checksum instead of
- * corrupting the data, so it is still correct on a redo
- */
- if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND))
- cksum++;
-
- return cksum;
-}
-
-static int osc_brw_prep_request(int cmd, struct client_obd *cli,
- struct obdo *oa, u32 page_count,
- struct brw_page **pga,
- struct ptlrpc_request **reqp,
- int reserve,
- int resend)
-{
- struct ptlrpc_request *req;
- struct ptlrpc_bulk_desc *desc;
- struct ost_body *body;
- struct obd_ioobj *ioobj;
- struct niobuf_remote *niobuf;
- int niocount, i, requested_nob, opc, rc;
- struct osc_brw_async_args *aa;
- struct req_capsule *pill;
- struct brw_page *pg_prev;
-
- if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ))
- return -ENOMEM; /* Recoverable */
- if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ2))
- return -EINVAL; /* Fatal */
-
- if ((cmd & OBD_BRW_WRITE) != 0) {
- opc = OST_WRITE;
- req = ptlrpc_request_alloc_pool(cli->cl_import,
- osc_rq_pool,
- &RQF_OST_BRW_WRITE);
- } else {
- opc = OST_READ;
- req = ptlrpc_request_alloc(cli->cl_import, &RQF_OST_BRW_READ);
- }
- if (!req)
- return -ENOMEM;
-
- for (niocount = i = 1; i < page_count; i++) {
- if (!can_merge_pages(pga[i - 1], pga[i]))
- niocount++;
- }
-
- pill = &req->rq_pill;
- req_capsule_set_size(pill, &RMF_OBD_IOOBJ, RCL_CLIENT,
- sizeof(*ioobj));
- req_capsule_set_size(pill, &RMF_NIOBUF_REMOTE, RCL_CLIENT,
- niocount * sizeof(*niobuf));
-
- rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, opc);
- if (rc) {
- ptlrpc_request_free(req);
- return rc;
- }
- req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */
- ptlrpc_at_set_req_timeout(req);
- /* ask ptlrpc not to resend on EINPROGRESS since BRWs have their own
- * retry logic
- */
- req->rq_no_retry_einprogress = 1;
-
- desc = ptlrpc_prep_bulk_imp(req, page_count,
- cli->cl_import->imp_connect_data.ocd_brw_size >> LNET_MTU_BITS,
- (opc == OST_WRITE ? PTLRPC_BULK_GET_SOURCE :
- PTLRPC_BULK_PUT_SINK) | PTLRPC_BULK_BUF_KIOV, OST_BULK_PORTAL,
- &ptlrpc_bulk_kiov_pin_ops);
-
- if (!desc) {
- rc = -ENOMEM;
- goto out;
- }
- /* NB request now owns desc and will free it when it gets freed */
-
- body = req_capsule_client_get(pill, &RMF_OST_BODY);
- ioobj = req_capsule_client_get(pill, &RMF_OBD_IOOBJ);
- niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE);
- LASSERT(body && ioobj && niobuf);
-
- lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
-
- obdo_to_ioobj(oa, ioobj);
- ioobj->ioo_bufcnt = niocount;
- /* The high bits of ioo_max_brw tell the server the _maximum_ number of
- * bulks that might be sent for this request. The actual number is
- * decided when the RPC is finally sent in ptlrpc_register_bulk(). It
- * sends "max - 1" for compatibility with old clients that send "0", and
- * also so that the actual maximum is a power-of-two number, not one
- * less. LU-1431
- */
- ioobj_max_brw_set(ioobj, desc->bd_md_max_brw);
- LASSERT(page_count > 0);
- pg_prev = pga[0];
- for (requested_nob = i = 0; i < page_count; i++, niobuf++) {
- struct brw_page *pg = pga[i];
- int poff = pg->off & ~PAGE_MASK;
-
- LASSERT(pg->count > 0);
- /* make sure there is no gap in the middle of page array */
- LASSERTF(page_count == 1 ||
- (ergo(i == 0, poff + pg->count == PAGE_SIZE) &&
- ergo(i > 0 && i < page_count - 1,
- poff == 0 && pg->count == PAGE_SIZE) &&
- ergo(i == page_count - 1, poff == 0)),
- "i: %d/%d pg: %p off: %llu, count: %u\n",
- i, page_count, pg, pg->off, pg->count);
- LASSERTF(i == 0 || pg->off > pg_prev->off,
- "i %d p_c %u pg %p [pri %lu ind %lu] off %llu prev_pg %p [pri %lu ind %lu] off %llu\n",
- i, page_count,
- pg->pg, page_private(pg->pg), pg->pg->index, pg->off,
- pg_prev->pg, page_private(pg_prev->pg),
- pg_prev->pg->index, pg_prev->off);
- LASSERT((pga[0]->flag & OBD_BRW_SRVLOCK) ==
- (pg->flag & OBD_BRW_SRVLOCK));
-
- desc->bd_frag_ops->add_kiov_frag(desc, pg->pg, poff, pg->count);
- requested_nob += pg->count;
-
- if (i > 0 && can_merge_pages(pg_prev, pg)) {
- niobuf--;
- niobuf->rnb_len += pg->count;
- } else {
- niobuf->rnb_offset = pg->off;
- niobuf->rnb_len = pg->count;
- niobuf->rnb_flags = pg->flag;
- }
- pg_prev = pg;
- }
-
- LASSERTF((void *)(niobuf - niocount) ==
- req_capsule_client_get(&req->rq_pill, &RMF_NIOBUF_REMOTE),
- "want %p - real %p\n", req_capsule_client_get(&req->rq_pill,
- &RMF_NIOBUF_REMOTE), (void *)(niobuf - niocount));
-
- osc_announce_cached(cli, &body->oa, opc == OST_WRITE ? requested_nob:0);
- if (resend) {
- if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) {
- body->oa.o_valid |= OBD_MD_FLFLAGS;
- body->oa.o_flags = 0;
- }
- body->oa.o_flags |= OBD_FL_RECOV_RESEND;
- }
-
- if (osc_should_shrink_grant(cli))
- osc_shrink_grant_local(cli, &body->oa);
-
- /* size[REQ_REC_OFF] still sizeof (*body) */
- if (opc == OST_WRITE) {
- if (cli->cl_checksum &&
- !sptlrpc_flavor_has_bulk(&req->rq_flvr)) {
- /* store cl_cksum_type in a local variable since
- * it can be changed via lprocfs
- */
- enum cksum_type cksum_type = cli->cl_cksum_type;
-
- if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) {
- oa->o_flags &= OBD_FL_LOCAL_MASK;
- body->oa.o_flags = 0;
- }
- body->oa.o_flags |= cksum_type_pack(cksum_type);
- body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
- body->oa.o_cksum = osc_checksum_bulk(requested_nob,
- page_count, pga,
- OST_WRITE,
- cksum_type);
- CDEBUG(D_PAGE, "checksum at write origin: %x\n",
- body->oa.o_cksum);
- /* save this in 'oa', too, for later checking */
- oa->o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
- oa->o_flags |= cksum_type_pack(cksum_type);
- } else {
- /* clear out the checksum flag, in case this is a
- * resend but cl_checksum is no longer set. b=11238
- */
- oa->o_valid &= ~OBD_MD_FLCKSUM;
- }
- oa->o_cksum = body->oa.o_cksum;
- /* 1 RC per niobuf */
- req_capsule_set_size(pill, &RMF_RCS, RCL_SERVER,
- sizeof(__u32) * niocount);
- } else {
- if (cli->cl_checksum &&
- !sptlrpc_flavor_has_bulk(&req->rq_flvr)) {
- if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0)
- body->oa.o_flags = 0;
- body->oa.o_flags |= cksum_type_pack(cli->cl_cksum_type);
- body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
- }
- }
- ptlrpc_request_set_replen(req);
-
- BUILD_BUG_ON(sizeof(*aa) > sizeof(req->rq_async_args));
- aa = ptlrpc_req_async_args(req);
- aa->aa_oa = oa;
- aa->aa_requested_nob = requested_nob;
- aa->aa_nio_count = niocount;
- aa->aa_page_count = page_count;
- aa->aa_resends = 0;
- aa->aa_ppga = pga;
- aa->aa_cli = cli;
- INIT_LIST_HEAD(&aa->aa_oaps);
-
- *reqp = req;
- niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE);
- CDEBUG(D_RPCTRACE, "brw rpc %p - object " DOSTID " offset %lld<>%lld\n",
- req, POSTID(&oa->o_oi), niobuf[0].rnb_offset,
- niobuf[niocount - 1].rnb_offset + niobuf[niocount - 1].rnb_len);
-
- return 0;
-
- out:
- ptlrpc_req_finished(req);
- return rc;
-}
-
-static int check_write_checksum(struct obdo *oa,
- const struct lnet_process_id *peer,
- __u32 client_cksum, __u32 server_cksum, int nob,
- u32 page_count, struct brw_page **pga,
- enum cksum_type client_cksum_type)
-{
- __u32 new_cksum;
- char *msg;
- enum cksum_type cksum_type;
-
- if (server_cksum == client_cksum) {
- CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum);
- return 0;
- }
-
- cksum_type = cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ?
- oa->o_flags : 0);
- new_cksum = osc_checksum_bulk(nob, page_count, pga, OST_WRITE,
- cksum_type);
-
- if (cksum_type != client_cksum_type)
- msg = "the server did not use the checksum type specified in the original request - likely a protocol problem";
- else if (new_cksum == server_cksum)
- msg = "changed on the client after we checksummed it - likely false positive due to mmap IO (bug 11742)";
- else if (new_cksum == client_cksum)
- msg = "changed in transit before arrival at OST";
- else
- msg = "changed in transit AND doesn't match the original - likely false positive due to mmap IO (bug 11742)";
-
- LCONSOLE_ERROR_MSG(0x132, "BAD WRITE CHECKSUM: %s: from %s inode " DFID " object " DOSTID " extent [%llu-%llu]\n",
- msg, libcfs_nid2str(peer->nid),
- oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0,
- oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0,
- oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0,
- POSTID(&oa->o_oi), pga[0]->off,
- pga[page_count - 1]->off +
- pga[page_count - 1]->count - 1);
- CERROR("original client csum %x (type %x), server csum %x (type %x), client csum now %x\n",
- client_cksum, client_cksum_type,
- server_cksum, cksum_type, new_cksum);
- return 1;
-}
-
-/* Note rc enters this function as number of bytes transferred */
-static int osc_brw_fini_request(struct ptlrpc_request *req, int rc)
-{
- struct osc_brw_async_args *aa = (void *)&req->rq_async_args;
- const struct lnet_process_id *peer =
- &req->rq_import->imp_connection->c_peer;
- struct client_obd *cli = aa->aa_cli;
- struct ost_body *body;
- __u32 client_cksum = 0;
-
- if (rc < 0 && rc != -EDQUOT) {
- DEBUG_REQ(D_INFO, req, "Failed request with rc = %d\n", rc);
- return rc;
- }
-
- LASSERTF(req->rq_repmsg, "rc = %d\n", rc);
- body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
- if (!body) {
- DEBUG_REQ(D_INFO, req, "Can't unpack body\n");
- return -EPROTO;
- }
-
- /* set/clear over quota flag for a uid/gid */
- if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE &&
- body->oa.o_valid & (OBD_MD_FLUSRQUOTA | OBD_MD_FLGRPQUOTA)) {
- unsigned int qid[MAXQUOTAS] = { body->oa.o_uid, body->oa.o_gid };
-
- CDEBUG(D_QUOTA, "setdq for [%u %u] with valid %#llx, flags %x\n",
- body->oa.o_uid, body->oa.o_gid, body->oa.o_valid,
- body->oa.o_flags);
- osc_quota_setdq(cli, qid, body->oa.o_valid, body->oa.o_flags);
- }
-
- osc_update_grant(cli, body);
-
- if (rc < 0)
- return rc;
-
- if (aa->aa_oa->o_valid & OBD_MD_FLCKSUM)
- client_cksum = aa->aa_oa->o_cksum; /* save for later */
-
- if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) {
- if (rc > 0) {
- CERROR("Unexpected +ve rc %d\n", rc);
- return -EPROTO;
- }
- LASSERT(req->rq_bulk->bd_nob == aa->aa_requested_nob);
-
- if (sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk))
- return -EAGAIN;
-
- if ((aa->aa_oa->o_valid & OBD_MD_FLCKSUM) && client_cksum &&
- check_write_checksum(&body->oa, peer, client_cksum,
- body->oa.o_cksum, aa->aa_requested_nob,
- aa->aa_page_count, aa->aa_ppga,
- cksum_type_unpack(aa->aa_oa->o_flags)))
- return -EAGAIN;
-
- rc = check_write_rcs(req, aa->aa_requested_nob,
- aa->aa_nio_count,
- aa->aa_page_count, aa->aa_ppga);
- goto out;
- }
-
- /* The rest of this function executes only for OST_READs */
-
- /* if unwrap_bulk failed, return -EAGAIN to retry */
- rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, rc);
- if (rc < 0) {
- rc = -EAGAIN;
- goto out;
- }
-
- if (rc > aa->aa_requested_nob) {
- CERROR("Unexpected rc %d (%d requested)\n", rc,
- aa->aa_requested_nob);
- return -EPROTO;
- }
-
- if (rc != req->rq_bulk->bd_nob_transferred) {
- CERROR("Unexpected rc %d (%d transferred)\n",
- rc, req->rq_bulk->bd_nob_transferred);
- return -EPROTO;
- }
-
- if (rc < aa->aa_requested_nob)
- handle_short_read(rc, aa->aa_page_count, aa->aa_ppga);
-
- if (body->oa.o_valid & OBD_MD_FLCKSUM) {
- static int cksum_counter;
- __u32 server_cksum = body->oa.o_cksum;
- char *via = "";
- char *router = "";
- enum cksum_type cksum_type;
-
- cksum_type = cksum_type_unpack(body->oa.o_valid &
- OBD_MD_FLFLAGS ?
- body->oa.o_flags : 0);
- client_cksum = osc_checksum_bulk(rc, aa->aa_page_count,
- aa->aa_ppga, OST_READ,
- cksum_type);
-
- if (peer->nid != req->rq_bulk->bd_sender) {
- via = " via ";
- router = libcfs_nid2str(req->rq_bulk->bd_sender);
- }
-
- if (server_cksum != client_cksum) {
- LCONSOLE_ERROR_MSG(0x133, "%s: BAD READ CHECKSUM: from %s%s%s inode " DFID " object " DOSTID " extent [%llu-%llu]\n",
- req->rq_import->imp_obd->obd_name,
- libcfs_nid2str(peer->nid),
- via, router,
- body->oa.o_valid & OBD_MD_FLFID ?
- body->oa.o_parent_seq : (__u64)0,
- body->oa.o_valid & OBD_MD_FLFID ?
- body->oa.o_parent_oid : 0,
- body->oa.o_valid & OBD_MD_FLFID ?
- body->oa.o_parent_ver : 0,
- POSTID(&body->oa.o_oi),
- aa->aa_ppga[0]->off,
- aa->aa_ppga[aa->aa_page_count-1]->off +
- aa->aa_ppga[aa->aa_page_count-1]->count -
- 1);
- CERROR("client %x, server %x, cksum_type %x\n",
- client_cksum, server_cksum, cksum_type);
- cksum_counter = 0;
- aa->aa_oa->o_cksum = client_cksum;
- rc = -EAGAIN;
- } else {
- cksum_counter++;
- CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum);
- rc = 0;
- }
- } else if (unlikely(client_cksum)) {
- static int cksum_missed;
-
- cksum_missed++;
- if ((cksum_missed & (-cksum_missed)) == cksum_missed)
- CERROR("Checksum %u requested from %s but not sent\n",
- cksum_missed, libcfs_nid2str(peer->nid));
- } else {
- rc = 0;
- }
-out:
- if (rc >= 0)
- lustre_get_wire_obdo(&req->rq_import->imp_connect_data,
- aa->aa_oa, &body->oa);
-
- return rc;
-}
-
-static int osc_brw_redo_request(struct ptlrpc_request *request,
- struct osc_brw_async_args *aa, int rc)
-{
- struct ptlrpc_request *new_req;
- struct osc_brw_async_args *new_aa;
- struct osc_async_page *oap;
-
- DEBUG_REQ(rc == -EINPROGRESS ? D_RPCTRACE : D_ERROR, request,
- "redo for recoverable error %d", rc);
-
- rc = osc_brw_prep_request(lustre_msg_get_opc(request->rq_reqmsg) ==
- OST_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ,
- aa->aa_cli, aa->aa_oa,
- aa->aa_page_count, aa->aa_ppga,
- &new_req, 0, 1);
- if (rc)
- return rc;
-
- list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) {
- if (oap->oap_request) {
- LASSERTF(request == oap->oap_request,
- "request %p != oap_request %p\n",
- request, oap->oap_request);
- if (oap->oap_interrupted) {
- ptlrpc_req_finished(new_req);
- return -EINTR;
- }
- }
- }
- /* New request takes over pga and oaps from old request.
- * Note that copying a list_head doesn't work, need to move it...
- */
- aa->aa_resends++;
- new_req->rq_interpret_reply = request->rq_interpret_reply;
- new_req->rq_async_args = request->rq_async_args;
- new_req->rq_commit_cb = request->rq_commit_cb;
- /* cap resend delay to the current request timeout; this is similar to
- * what ptlrpc does (see after_reply())
- */
- if (aa->aa_resends > new_req->rq_timeout)
- new_req->rq_sent = ktime_get_real_seconds() + new_req->rq_timeout;
- else
- new_req->rq_sent = ktime_get_real_seconds() + aa->aa_resends;
- new_req->rq_generation_set = 1;
- new_req->rq_import_generation = request->rq_import_generation;
-
- new_aa = ptlrpc_req_async_args(new_req);
-
- INIT_LIST_HEAD(&new_aa->aa_oaps);
- list_splice_init(&aa->aa_oaps, &new_aa->aa_oaps);
- INIT_LIST_HEAD(&new_aa->aa_exts);
- list_splice_init(&aa->aa_exts, &new_aa->aa_exts);
- new_aa->aa_resends = aa->aa_resends;
-
- list_for_each_entry(oap, &new_aa->aa_oaps, oap_rpc_item) {
- if (oap->oap_request) {
- ptlrpc_req_finished(oap->oap_request);
- oap->oap_request = ptlrpc_request_addref(new_req);
- }
- }
-
- /* XXX: This code will run into problems if we ever want to add a
- * series of BRW RPCs into a self-defined ptlrpc_request_set and wait
- * for all of them to finish. We should inherit the request set from
- * the old request.
- */
- ptlrpcd_add_req(new_req);
-
- DEBUG_REQ(D_INFO, new_req, "new request");
- return 0;
-}
-
-/*
- * Ugh, we want disk allocation on the target to happen in offset order. We'll
- * follow Sedgewick's advice and stick to the dead-simple shellsort -- it'll do
- * fine for our small page arrays and doesn't require allocation. It's an
- * insertion sort that swaps elements that are strides apart, shrinking the
- * stride down until it's 1 and the array is sorted.
- */
-static void sort_brw_pages(struct brw_page **array, int num)
-{
- int stride, i, j;
- struct brw_page *tmp;
-
- if (num == 1)
- return;
- for (stride = 1; stride < num ; stride = (stride * 3) + 1)
- ;
-
- do {
- stride /= 3;
- for (i = stride ; i < num ; i++) {
- tmp = array[i];
- j = i;
- while (j >= stride && array[j - stride]->off > tmp->off) {
- array[j] = array[j - stride];
- j -= stride;
- }
- array[j] = tmp;
- }
- } while (stride > 1);
-}
-
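-/*
- * Editorial note, not from the original source: sort_brw_pages() above
- * uses Knuth's 3h+1 stride sequence (1, 4, 13, 40, 121, ...). For
- * num = 100 the initial loop overshoots to stride = 121, and the sort
- * then passes over the array with strides 40, 13, 4 and finally 1, at
- * which point it is a plain insertion sort on an almost-sorted array.
- */
-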
-static void osc_release_ppga(struct brw_page **ppga, u32 count)
-{
- LASSERT(ppga);
- kfree(ppga);
-}
-
-static int brw_interpret(const struct lu_env *env,
- struct ptlrpc_request *req, void *data, int rc)
-{
- struct osc_brw_async_args *aa = data;
- struct osc_extent *ext;
- struct osc_extent *tmp;
- struct client_obd *cli = aa->aa_cli;
-
- rc = osc_brw_fini_request(req, rc);
- CDEBUG(D_INODE, "request %p aa %p rc %d\n", req, aa, rc);
- /* When the server returns -EINPROGRESS, the client should always retry
- * regardless of the number of times the bulk was resent already.
- */
- if (osc_recoverable_error(rc)) {
- if (req->rq_import_generation !=
- req->rq_import->imp_generation) {
- CDEBUG(D_HA, "%s: resend cross eviction for object: " DOSTID ", rc = %d.\n",
- req->rq_import->imp_obd->obd_name,
- POSTID(&aa->aa_oa->o_oi), rc);
- } else if (rc == -EINPROGRESS ||
- client_should_resend(aa->aa_resends, aa->aa_cli)) {
- rc = osc_brw_redo_request(req, aa, rc);
- } else {
- CERROR("%s: too many resent retries for object: %llu:%llu, rc = %d.\n",
- req->rq_import->imp_obd->obd_name,
- POSTID(&aa->aa_oa->o_oi), rc);
- }
-
- if (rc == 0)
- return 0;
- else if (rc == -EAGAIN || rc == -EINPROGRESS)
- rc = -EIO;
- }
-
- if (rc == 0) {
- struct obdo *oa = aa->aa_oa;
- struct cl_attr *attr = &osc_env_info(env)->oti_attr;
- unsigned long valid = 0;
- struct cl_object *obj;
- struct osc_async_page *last;
-
- last = brw_page2oap(aa->aa_ppga[aa->aa_page_count - 1]);
- obj = osc2cl(last->oap_obj);
-
- cl_object_attr_lock(obj);
- if (oa->o_valid & OBD_MD_FLBLOCKS) {
- attr->cat_blocks = oa->o_blocks;
- valid |= CAT_BLOCKS;
- }
- if (oa->o_valid & OBD_MD_FLMTIME) {
- attr->cat_mtime = oa->o_mtime;
- valid |= CAT_MTIME;
- }
- if (oa->o_valid & OBD_MD_FLATIME) {
- attr->cat_atime = oa->o_atime;
- valid |= CAT_ATIME;
- }
- if (oa->o_valid & OBD_MD_FLCTIME) {
- attr->cat_ctime = oa->o_ctime;
- valid |= CAT_CTIME;
- }
-
- if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) {
- struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo;
- loff_t last_off = last->oap_count + last->oap_obj_off +
- last->oap_page_off;
-
- /* Change file size if this is an out of quota or
- * direct IO write and it extends the file size
- */
- if (loi->loi_lvb.lvb_size < last_off) {
- attr->cat_size = last_off;
- valid |= CAT_SIZE;
- }
- /* Extend KMS if it's not a lockless write */
- if (loi->loi_kms < last_off &&
- oap2osc_page(last)->ops_srvlock == 0) {
- attr->cat_kms = last_off;
- valid |= CAT_KMS;
- }
- }
-
- if (valid != 0)
- cl_object_attr_update(env, obj, attr, valid);
- cl_object_attr_unlock(obj);
- }
- kmem_cache_free(obdo_cachep, aa->aa_oa);
-
- if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE && rc == 0)
- osc_inc_unstable_pages(req);
-
- list_for_each_entry_safe(ext, tmp, &aa->aa_exts, oe_link) {
- list_del_init(&ext->oe_link);
- osc_extent_finish(env, ext, 1, rc);
- }
- LASSERT(list_empty(&aa->aa_exts));
- LASSERT(list_empty(&aa->aa_oaps));
-
- osc_release_ppga(aa->aa_ppga, aa->aa_page_count);
- ptlrpc_lprocfs_brw(req, req->rq_bulk->bd_nob_transferred);
-
- spin_lock(&cli->cl_loi_list_lock);
- /* We need to decrement before osc_ap_completion->osc_wake_cache_waiters
- * is called so we know whether to go to sync BRWs or wait for more
- * RPCs to complete
- */
- if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE)
- cli->cl_w_in_flight--;
- else
- cli->cl_r_in_flight--;
- osc_wake_cache_waiters(cli);
- spin_unlock(&cli->cl_loi_list_lock);
-
- osc_io_unplug(env, cli, NULL);
- return rc;
-}
-
-static void brw_commit(struct ptlrpc_request *req)
-{
- /*
- * If osc_inc_unstable_pages (via osc_extent_finish) races with
- * this function, called via rq_commit_cb, we need to ensure that
- * osc_dec_unstable_pages is still called. Otherwise unstable
- * pages may be leaked.
- */
- spin_lock(&req->rq_lock);
- if (unlikely(req->rq_unstable)) {
- req->rq_unstable = 0;
- spin_unlock(&req->rq_lock);
- osc_dec_unstable_pages(req);
- } else {
- req->rq_committed = 1;
- spin_unlock(&req->rq_lock);
- }
-}
-
-/**
- * Build an RPC from the list of extents @ext_list. The caller must ensure
- * that the total number of pages in this list is NOT over the max pages
- * per RPC.
- * Extents in the list must be in OES_RPC state.
- */
-int osc_build_rpc(const struct lu_env *env, struct client_obd *cli,
- struct list_head *ext_list, int cmd)
-{
- struct ptlrpc_request *req = NULL;
- struct osc_extent *ext;
- struct brw_page **pga = NULL;
- struct osc_brw_async_args *aa = NULL;
- struct obdo *oa = NULL;
- struct osc_async_page *oap;
- struct osc_object *obj = NULL;
- struct cl_req_attr *crattr = NULL;
- u64 starting_offset = OBD_OBJECT_EOF;
- u64 ending_offset = 0;
- int mpflag = 0;
- int mem_tight = 0;
- int page_count = 0;
- bool soft_sync = false;
- bool interrupted = false;
- int i;
- int rc;
- struct ost_body *body;
- LIST_HEAD(rpc_list);
-
- LASSERT(!list_empty(ext_list));
-
- /* add pages into rpc_list to build BRW rpc */
- list_for_each_entry(ext, ext_list, oe_link) {
- LASSERT(ext->oe_state == OES_RPC);
- mem_tight |= ext->oe_memalloc;
- page_count += ext->oe_nr_pages;
- if (!obj)
- obj = ext->oe_obj;
- }
-
- soft_sync = osc_over_unstable_soft_limit(cli);
- if (mem_tight)
- mpflag = cfs_memory_pressure_get_and_set();
-
- pga = kcalloc(page_count, sizeof(*pga), GFP_NOFS);
- if (!pga) {
- rc = -ENOMEM;
- goto out;
- }
-
- oa = kmem_cache_zalloc(obdo_cachep, GFP_NOFS);
- if (!oa) {
- rc = -ENOMEM;
- goto out;
- }
-
- i = 0;
- list_for_each_entry(ext, ext_list, oe_link) {
- list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
- if (mem_tight)
- oap->oap_brw_flags |= OBD_BRW_MEMALLOC;
- if (soft_sync)
- oap->oap_brw_flags |= OBD_BRW_SOFT_SYNC;
- pga[i] = &oap->oap_brw_page;
- pga[i]->off = oap->oap_obj_off + oap->oap_page_off;
- i++;
-
- list_add_tail(&oap->oap_rpc_item, &rpc_list);
- if (starting_offset == OBD_OBJECT_EOF ||
- starting_offset > oap->oap_obj_off)
- starting_offset = oap->oap_obj_off;
- else
- LASSERT(!oap->oap_page_off);
- if (ending_offset < oap->oap_obj_off + oap->oap_count)
- ending_offset = oap->oap_obj_off +
- oap->oap_count;
- else
- LASSERT(oap->oap_page_off + oap->oap_count ==
- PAGE_SIZE);
- if (oap->oap_interrupted)
- interrupted = true;
- }
- }
-
- /* first page in the list */
- oap = list_entry(rpc_list.next, typeof(*oap), oap_rpc_item);
-
- crattr = &osc_env_info(env)->oti_req_attr;
- memset(crattr, 0, sizeof(*crattr));
- crattr->cra_type = (cmd & OBD_BRW_WRITE) ? CRT_WRITE : CRT_READ;
- crattr->cra_flags = ~0ULL;
- crattr->cra_page = oap2cl_page(oap);
- crattr->cra_oa = oa;
- cl_req_attr_set(env, osc2cl(obj), crattr);
-
- sort_brw_pages(pga, page_count);
- rc = osc_brw_prep_request(cmd, cli, oa, page_count, pga, &req, 1, 0);
- if (rc != 0) {
- CERROR("prep_req failed: %d\n", rc);
- goto out;
- }
-
- req->rq_commit_cb = brw_commit;
- req->rq_interpret_reply = brw_interpret;
-
- req->rq_memalloc = mem_tight != 0;
- oap->oap_request = ptlrpc_request_addref(req);
- if (interrupted && !req->rq_intr)
- ptlrpc_mark_interrupted(req);
-
- /* Need to update the timestamps after the request is built in case
- * we race with setattr (locally or in queue at OST). If OST gets
- * later setattr before earlier BRW (as determined by the request xid),
- * the OST will not use BRW timestamps. Sadly, there is no obvious
- * way to do this in a single call. bug 10150
- */
- body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
- crattr->cra_oa = &body->oa;
- crattr->cra_flags = OBD_MD_FLMTIME | OBD_MD_FLCTIME | OBD_MD_FLATIME;
- cl_req_attr_set(env, osc2cl(obj), crattr);
- lustre_msg_set_jobid(req->rq_reqmsg, crattr->cra_jobid);
-
- BUILD_BUG_ON(sizeof(*aa) > sizeof(req->rq_async_args));
- aa = ptlrpc_req_async_args(req);
- INIT_LIST_HEAD(&aa->aa_oaps);
- list_splice_init(&rpc_list, &aa->aa_oaps);
- INIT_LIST_HEAD(&aa->aa_exts);
- list_splice_init(ext_list, &aa->aa_exts);
-
- spin_lock(&cli->cl_loi_list_lock);
- starting_offset >>= PAGE_SHIFT;
- if (cmd == OBD_BRW_READ) {
- cli->cl_r_in_flight++;
- lprocfs_oh_tally_log2(&cli->cl_read_page_hist, page_count);
- lprocfs_oh_tally(&cli->cl_read_rpc_hist, cli->cl_r_in_flight);
- lprocfs_oh_tally_log2(&cli->cl_read_offset_hist,
- starting_offset + 1);
- } else {
- cli->cl_w_in_flight++;
- lprocfs_oh_tally_log2(&cli->cl_write_page_hist, page_count);
- lprocfs_oh_tally(&cli->cl_write_rpc_hist, cli->cl_w_in_flight);
- lprocfs_oh_tally_log2(&cli->cl_write_offset_hist,
- starting_offset + 1);
- }
- spin_unlock(&cli->cl_loi_list_lock);
-
- DEBUG_REQ(D_INODE, req, "%d pages, aa %p. now %ur/%dw in flight",
- page_count, aa, cli->cl_r_in_flight,
- cli->cl_w_in_flight);
- OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_IO, cfs_fail_val);
-
- ptlrpcd_add_req(req);
- rc = 0;
-
-out:
- if (mem_tight != 0)
- cfs_memory_pressure_restore(mpflag);
-
- if (rc != 0) {
- LASSERT(!req);
-
- if (oa)
- kmem_cache_free(obdo_cachep, oa);
- kfree(pga);
- /* this should happen rarely and is pretty bad; it makes the
- * pending list not follow the dirty order
- */
- while (!list_empty(ext_list)) {
- ext = list_entry(ext_list->next, struct osc_extent,
- oe_link);
- list_del_init(&ext->oe_link);
- osc_extent_finish(env, ext, 0, rc);
- }
- }
- return rc;
-}
-
-static int osc_set_lock_data(struct ldlm_lock *lock, void *data)
-{
- int set = 0;
-
- LASSERT(lock);
-
- lock_res_and_lock(lock);
-
- if (!lock->l_ast_data)
- lock->l_ast_data = data;
- if (lock->l_ast_data == data)
- set = 1;
-
- unlock_res_and_lock(lock);
-
- return set;
-}
-
-static int osc_enqueue_fini(struct ptlrpc_request *req,
- osc_enqueue_upcall_f upcall, void *cookie,
- struct lustre_handle *lockh, enum ldlm_mode mode,
- __u64 *flags, int agl, int errcode)
-{
- bool intent = *flags & LDLM_FL_HAS_INTENT;
- int rc;
-
- /* The request was created before ldlm_cli_enqueue call. */
- if (intent && errcode == ELDLM_LOCK_ABORTED) {
- struct ldlm_reply *rep;
-
- rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
-
- rep->lock_policy_res1 =
- ptlrpc_status_ntoh(rep->lock_policy_res1);
- if (rep->lock_policy_res1)
- errcode = rep->lock_policy_res1;
- if (!agl)
- *flags |= LDLM_FL_LVB_READY;
- } else if (errcode == ELDLM_OK) {
- *flags |= LDLM_FL_LVB_READY;
- }
-
- /* Call the update callback. */
- rc = (*upcall)(cookie, lockh, errcode);
- /* release the reference taken in ldlm_cli_enqueue() */
- if (errcode == ELDLM_LOCK_MATCHED)
- errcode = ELDLM_OK;
- if (errcode == ELDLM_OK && lustre_handle_is_used(lockh))
- ldlm_lock_decref(lockh, mode);
-
- return rc;
-}
-
-static int osc_enqueue_interpret(const struct lu_env *env,
- struct ptlrpc_request *req,
- struct osc_enqueue_args *aa, int rc)
-{
- struct ldlm_lock *lock;
- struct lustre_handle *lockh = &aa->oa_lockh;
- enum ldlm_mode mode = aa->oa_mode;
- struct ost_lvb *lvb = aa->oa_lvb;
- __u32 lvb_len = sizeof(*lvb);
- __u64 flags = 0;
-
- /* ldlm_cli_enqueue is holding a reference on the lock, so it must
- * be valid.
- */
- lock = ldlm_handle2lock(lockh);
- LASSERTF(lock, "lockh %llx, req %p, aa %p - client evicted?\n",
- lockh->cookie, req, aa);
-
- /* Take an additional reference so that a blocking AST that
- * ldlm_cli_enqueue_fini() might post for a failed lock, is guaranteed
- * to arrive after an upcall has been executed by
- * osc_enqueue_fini().
- */
- ldlm_lock_addref(lockh, mode);
-
- /* Let cl_lock_state_wait fail with -ERESTARTSYS to unuse sublocks. */
- OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_ENQUEUE_HANG, 2);
-
- /* Let the CP AST grant the lock first. */
- OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 1);
-
- if (aa->oa_agl) {
- LASSERT(!aa->oa_lvb);
- LASSERT(!aa->oa_flags);
- aa->oa_flags = &flags;
- }
-
- /* Complete obtaining the lock procedure. */
- rc = ldlm_cli_enqueue_fini(aa->oa_exp, req, aa->oa_type, 1,
- aa->oa_mode, aa->oa_flags, lvb, lvb_len,
- lockh, rc);
- /* Complete osc stuff. */
- rc = osc_enqueue_fini(req, aa->oa_upcall, aa->oa_cookie, lockh, mode,
- aa->oa_flags, aa->oa_agl, rc);
-
- OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10);
-
- ldlm_lock_decref(lockh, mode);
- LDLM_LOCK_PUT(lock);
- return rc;
-}
-
-struct ptlrpc_request_set *PTLRPCD_SET = (void *)1;
-
-/* When enqueuing asynchronously, locks are not ordered; we can obtain a lock
- * from the 2nd OSC before a lock from the 1st one. This does not deadlock with
- * other synchronous requests; however, keeping some locks while trying to
- * obtain others may take a considerable amount of time in the case of an OST
- * failure, and when other sync requests do not get a lock released by a
- * client, the client is evicted from the cluster -- such scenarios make life
- * difficult, so release locks just after they are obtained.
- */
-int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id,
- __u64 *flags, union ldlm_policy_data *policy,
- struct ost_lvb *lvb, int kms_valid,
- osc_enqueue_upcall_f upcall, void *cookie,
- struct ldlm_enqueue_info *einfo,
- struct ptlrpc_request_set *rqset, int async, int agl)
-{
- struct obd_device *obd = exp->exp_obd;
- struct lustre_handle lockh = { 0 };
- struct ptlrpc_request *req = NULL;
- int intent = *flags & LDLM_FL_HAS_INTENT;
- __u64 match_flags = *flags;
- enum ldlm_mode mode;
- int rc;
-
- /* Filesystem lock extents are extended to page boundaries so that
- * dealing with the page cache is a little smoother.
- */
- policy->l_extent.start -= policy->l_extent.start & ~PAGE_MASK;
- policy->l_extent.end |= ~PAGE_MASK;
-
- /*
- * kms is not valid when either object is completely fresh (so that no
- * locks are cached), or object was evicted. In the latter case cached
- * lock cannot be used, because it would prime inode state with
- * potentially stale LVB.
- */
- if (!kms_valid)
- goto no_match;
-
- /* Next, search for already existing extent locks that will cover us */
- /* If we're trying to read, we also search for an existing PW lock. The
- * VFS and page cache already protect us locally, so lots of readers/
- * writers can share a single PW lock.
- *
- * There are problems with conversion deadlocks, so instead of
- * converting a read lock to a write lock, we'll just enqueue a new
- * one.
- *
- * At some point we should cancel the read lock instead of making them
- * send us a blocking callback, but there are problems with canceling
- * locks out from other users right now, too.
- */
- mode = einfo->ei_mode;
- if (einfo->ei_mode == LCK_PR)
- mode |= LCK_PW;
- if (agl == 0)
- match_flags |= LDLM_FL_LVB_READY;
- if (intent != 0)
- match_flags |= LDLM_FL_BLOCK_GRANTED;
- mode = ldlm_lock_match(obd->obd_namespace, match_flags, res_id,
- einfo->ei_type, policy, mode, &lockh, 0);
- if (mode) {
- struct ldlm_lock *matched;
-
- if (*flags & LDLM_FL_TEST_LOCK)
- return ELDLM_OK;
-
- matched = ldlm_handle2lock(&lockh);
- if (agl) {
- /* AGL enqueues DLM locks speculatively. Therefore, if
- * a DLM lock already exists, it will just inform the
- * caller to cancel the AGL process for this stripe.
- */
- ldlm_lock_decref(&lockh, mode);
- LDLM_LOCK_PUT(matched);
- return -ECANCELED;
- } else if (osc_set_lock_data(matched, einfo->ei_cbdata)) {
- *flags |= LDLM_FL_LVB_READY;
- /* We already have a lock, and it's referenced. */
- (*upcall)(cookie, &lockh, ELDLM_LOCK_MATCHED);
-
- ldlm_lock_decref(&lockh, mode);
- LDLM_LOCK_PUT(matched);
- return ELDLM_OK;
- } else {
- ldlm_lock_decref(&lockh, mode);
- LDLM_LOCK_PUT(matched);
- }
- }
-
-no_match:
- if (*flags & (LDLM_FL_TEST_LOCK | LDLM_FL_MATCH_LOCK))
- return -ENOLCK;
- if (intent) {
- req = ptlrpc_request_alloc(class_exp2cliimp(exp),
- &RQF_LDLM_ENQUEUE_LVB);
- if (!req)
- return -ENOMEM;
-
- rc = ldlm_prep_enqueue_req(exp, req, NULL, 0);
- if (rc) {
- ptlrpc_request_free(req);
- return rc;
- }
-
- req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
- sizeof(*lvb));
- ptlrpc_request_set_replen(req);
- }
-
- /* users of osc_enqueue() can pass this flag for ldlm_lock_match() */
- *flags &= ~LDLM_FL_BLOCK_GRANTED;
-
- rc = ldlm_cli_enqueue(exp, &req, einfo, res_id, policy, flags, lvb,
- sizeof(*lvb), LVB_T_OST, &lockh, async);
- if (async) {
- if (!rc) {
- struct osc_enqueue_args *aa;
-
- BUILD_BUG_ON(sizeof(*aa) > sizeof(req->rq_async_args));
- aa = ptlrpc_req_async_args(req);
- aa->oa_exp = exp;
- aa->oa_mode = einfo->ei_mode;
- aa->oa_type = einfo->ei_type;
- lustre_handle_copy(&aa->oa_lockh, &lockh);
- aa->oa_upcall = upcall;
- aa->oa_cookie = cookie;
- aa->oa_agl = !!agl;
- if (!agl) {
- aa->oa_flags = flags;
- aa->oa_lvb = lvb;
- } else {
- /* AGL essentially enqueues a DLM lock
- * in advance, so we don't care about the
- * result of the AGL enqueue.
- */
- aa->oa_lvb = NULL;
- aa->oa_flags = NULL;
- }
-
- req->rq_interpret_reply =
- (ptlrpc_interpterer_t)osc_enqueue_interpret;
- if (rqset == PTLRPCD_SET)
- ptlrpcd_add_req(req);
- else
- ptlrpc_set_add_req(rqset, req);
- } else if (intent) {
- ptlrpc_req_finished(req);
- }
- return rc;
- }
-
- rc = osc_enqueue_fini(req, upcall, cookie, &lockh, einfo->ei_mode,
- flags, agl, rc);
- if (intent)
- ptlrpc_req_finished(req);
-
- return rc;
-}
-
-int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id,
- enum ldlm_type type, union ldlm_policy_data *policy,
- enum ldlm_mode mode, __u64 *flags, void *data,
- struct lustre_handle *lockh, int unref)
-{
- struct obd_device *obd = exp->exp_obd;
- __u64 lflags = *flags;
- enum ldlm_mode rc;
-
- if (OBD_FAIL_CHECK(OBD_FAIL_OSC_MATCH))
- return -EIO;
-
- /* Filesystem lock extents are extended to page boundaries so that
- * dealing with the page cache is a little smoother
- */
- policy->l_extent.start -= policy->l_extent.start & ~PAGE_MASK;
- policy->l_extent.end |= ~PAGE_MASK;
-
- /* Next, search for already existing extent locks that will cover us */
- /* If we're trying to read, we also search for an existing PW lock. The
- * VFS and page cache already protect us locally, so lots of readers/
- * writers can share a single PW lock.
- */
- rc = mode;
- if (mode == LCK_PR)
- rc |= LCK_PW;
- rc = ldlm_lock_match(obd->obd_namespace, lflags,
- res_id, type, policy, rc, lockh, unref);
- if (!rc || lflags & LDLM_FL_TEST_LOCK)
- return rc;
-
- if (data) {
- struct ldlm_lock *lock = ldlm_handle2lock(lockh);
-
- LASSERT(lock);
- if (!osc_set_lock_data(lock, data)) {
- ldlm_lock_decref(lockh, rc);
- rc = 0;
- }
- LDLM_LOCK_PUT(lock);
- }
- return rc;
-}
-
-static int osc_statfs_interpret(const struct lu_env *env,
- struct ptlrpc_request *req,
- struct osc_async_args *aa, int rc)
-{
- struct obd_statfs *msfs;
-
- if (rc == -EBADR)
- /* The request has in fact never been sent
- * due to issues at a higher level (LOV).
- * Exit immediately since the caller is
- * aware of the problem and takes care
- * of the cleanup.
- */
- return rc;
-
- if ((rc == -ENOTCONN || rc == -EAGAIN) &&
- (aa->aa_oi->oi_flags & OBD_STATFS_NODELAY)) {
- rc = 0;
- goto out;
- }
-
- if (rc != 0)
- goto out;
-
- msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS);
- if (!msfs) {
- rc = -EPROTO;
- goto out;
- }
-
- *aa->aa_oi->oi_osfs = *msfs;
-out:
- rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc);
- return rc;
-}
-
-static int osc_statfs_async(struct obd_export *exp,
- struct obd_info *oinfo, __u64 max_age,
- struct ptlrpc_request_set *rqset)
-{
- struct obd_device *obd = class_exp2obd(exp);
- struct ptlrpc_request *req;
- struct osc_async_args *aa;
- int rc;
-
- /* We could possibly pass max_age in the request (as an absolute
- * timestamp or a "seconds.usec ago") so the target can avoid doing
- * extra calls into the filesystem if that isn't necessary (e.g.
- * during mount that would help a bit). Having relative timestamps
- * is not so great if request processing is slow, while absolute
- * timestamps are not ideal because they need time synchronization.
- */
- req = ptlrpc_request_alloc(obd->u.cli.cl_import, &RQF_OST_STATFS);
- if (!req)
- return -ENOMEM;
-
- rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS);
- if (rc) {
- ptlrpc_request_free(req);
- return rc;
- }
- ptlrpc_request_set_replen(req);
- req->rq_request_portal = OST_CREATE_PORTAL;
- ptlrpc_at_set_req_timeout(req);
-
- if (oinfo->oi_flags & OBD_STATFS_NODELAY) {
- /* procfs requests do not want to wait on statfs, to avoid a deadlock */
- req->rq_no_resend = 1;
- req->rq_no_delay = 1;
- }
-
- req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_statfs_interpret;
- BUILD_BUG_ON(sizeof(*aa) > sizeof(req->rq_async_args));
- aa = ptlrpc_req_async_args(req);
- aa->aa_oi = oinfo;
-
- ptlrpc_set_add_req(rqset, req);
- return 0;
-}
-
-static int osc_statfs(const struct lu_env *env, struct obd_export *exp,
- struct obd_statfs *osfs, __u64 max_age, __u32 flags)
-{
- struct obd_device *obd = class_exp2obd(exp);
- struct obd_statfs *msfs;
- struct ptlrpc_request *req;
- struct obd_import *imp = NULL;
- int rc;
-
- /* Since the request might also come from lprocfs, we need to
- * sync this with client_disconnect_export(). Bug 15684
- */
- down_read(&obd->u.cli.cl_sem);
- if (obd->u.cli.cl_import)
- imp = class_import_get(obd->u.cli.cl_import);
- up_read(&obd->u.cli.cl_sem);
- if (!imp)
- return -ENODEV;
-
- /* We could possibly pass max_age in the request (as an absolute
- * timestamp or a "seconds.usec ago") so the target can avoid doing
- * extra calls into the filesystem if that isn't necessary (e.g.
- * during mount that would help a bit). Having relative timestamps
- * is not so great if request processing is slow, while absolute
- * timestamps are not ideal because they need time synchronization.
- */
- req = ptlrpc_request_alloc(imp, &RQF_OST_STATFS);
-
- class_import_put(imp);
-
- if (!req)
- return -ENOMEM;
-
- rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS);
- if (rc) {
- ptlrpc_request_free(req);
- return rc;
- }
- ptlrpc_request_set_replen(req);
- req->rq_request_portal = OST_CREATE_PORTAL;
- ptlrpc_at_set_req_timeout(req);
-
- if (flags & OBD_STATFS_NODELAY) {
- /* procfs requests do not want to wait on statfs, to avoid a deadlock */
- req->rq_no_resend = 1;
- req->rq_no_delay = 1;
- }
-
- rc = ptlrpc_queue_wait(req);
- if (rc)
- goto out;
-
- msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS);
- if (!msfs) {
- rc = -EPROTO;
- goto out;
- }
-
- *osfs = *msfs;
-
- out:
- ptlrpc_req_finished(req);
- return rc;
-}
-
-static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
- void *karg, void __user *uarg)
-{
- struct obd_device *obd = exp->exp_obd;
- struct obd_ioctl_data *data = karg;
- int err = 0;
-
- if (!try_module_get(THIS_MODULE)) {
- CERROR("%s: cannot get module '%s'\n", obd->obd_name,
- module_name(THIS_MODULE));
- return -EINVAL;
- }
- switch (cmd) {
- case OBD_IOC_CLIENT_RECOVER:
- err = ptlrpc_recover_import(obd->u.cli.cl_import,
- data->ioc_inlbuf1, 0);
- if (err > 0)
- err = 0;
- goto out;
- case IOC_OSC_SET_ACTIVE:
- err = ptlrpc_set_import_active(obd->u.cli.cl_import,
- data->ioc_offset);
- goto out;
- case OBD_IOC_PING_TARGET:
- err = ptlrpc_obd_ping(obd);
- goto out;
- default:
- CDEBUG(D_INODE, "unrecognised ioctl %#x by %s\n",
- cmd, current_comm());
- err = -ENOTTY;
- goto out;
- }
-out:
- module_put(THIS_MODULE);
- return err;
-}
-
-static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp,
- u32 keylen, void *key, u32 vallen,
- void *val, struct ptlrpc_request_set *set)
-{
- struct ptlrpc_request *req;
- struct obd_device *obd = exp->exp_obd;
- struct obd_import *imp = class_exp2cliimp(exp);
- char *tmp;
- int rc;
-
- OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_SHUTDOWN, 10);
-
- if (KEY_IS(KEY_CHECKSUM)) {
- if (vallen != sizeof(int))
- return -EINVAL;
- exp->exp_obd->u.cli.cl_checksum = (*(int *)val) ? 1 : 0;
- return 0;
- }
-
- if (KEY_IS(KEY_SPTLRPC_CONF)) {
- sptlrpc_conf_client_adapt(obd);
- return 0;
- }
-
- if (KEY_IS(KEY_FLUSH_CTX)) {
- sptlrpc_import_flush_my_ctx(imp);
- return 0;
- }
-
- if (KEY_IS(KEY_CACHE_SET)) {
- struct client_obd *cli = &obd->u.cli;
-
- LASSERT(!cli->cl_cache); /* only once */
- cli->cl_cache = val;
- cl_cache_incref(cli->cl_cache);
- cli->cl_lru_left = &cli->cl_cache->ccc_lru_left;
-
- /* add this osc into entity list */
- LASSERT(list_empty(&cli->cl_lru_osc));
- spin_lock(&cli->cl_cache->ccc_lru_lock);
- list_add(&cli->cl_lru_osc, &cli->cl_cache->ccc_lru);
- spin_unlock(&cli->cl_cache->ccc_lru_lock);
-
- return 0;
- }
-
- if (KEY_IS(KEY_CACHE_LRU_SHRINK)) {
- struct client_obd *cli = &obd->u.cli;
- long nr = atomic_long_read(&cli->cl_lru_in_list) >> 1;
- long target = *(long *)val;
-
- nr = osc_lru_shrink(env, cli, min(nr, target), true);
- *(long *)val -= nr;
- return 0;
- }
-
- if (!set && !KEY_IS(KEY_GRANT_SHRINK))
- return -EINVAL;
-
- /* We pass all other commands directly to OST. Since nobody calls osc
- * methods directly and everybody is supposed to go through LOV, we
- * assume lov checked invalid values for us.
- * The only recognised values so far are evict_by_nid and mds_conn.
- * Even if something bad goes through, we'd get a -EINVAL from OST
- * anyway.
- */
-
- req = ptlrpc_request_alloc(imp, KEY_IS(KEY_GRANT_SHRINK) ?
- &RQF_OST_SET_GRANT_INFO :
- &RQF_OBD_SET_INFO);
- if (!req)
- return -ENOMEM;
-
- req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY,
- RCL_CLIENT, keylen);
- if (!KEY_IS(KEY_GRANT_SHRINK))
- req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL,
- RCL_CLIENT, vallen);
- rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SET_INFO);
- if (rc) {
- ptlrpc_request_free(req);
- return rc;
- }
-
- tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY);
- memcpy(tmp, key, keylen);
- tmp = req_capsule_client_get(&req->rq_pill, KEY_IS(KEY_GRANT_SHRINK) ?
- &RMF_OST_BODY :
- &RMF_SETINFO_VAL);
- memcpy(tmp, val, vallen);
-
- if (KEY_IS(KEY_GRANT_SHRINK)) {
- struct osc_brw_async_args *aa;
- struct obdo *oa;
-
- BUILD_BUG_ON(sizeof(*aa) > sizeof(req->rq_async_args));
- aa = ptlrpc_req_async_args(req);
- oa = kmem_cache_zalloc(obdo_cachep, GFP_NOFS);
- if (!oa) {
- ptlrpc_req_finished(req);
- return -ENOMEM;
- }
- *oa = ((struct ost_body *)val)->oa;
- aa->aa_oa = oa;
- req->rq_interpret_reply = osc_shrink_grant_interpret;
- }
-
- ptlrpc_request_set_replen(req);
- if (!KEY_IS(KEY_GRANT_SHRINK)) {
- LASSERT(set);
- ptlrpc_set_add_req(set, req);
- ptlrpc_check_set(NULL, set);
- } else {
- ptlrpcd_add_req(req);
- }
-
- return 0;
-}
-
-static int osc_reconnect(const struct lu_env *env,
- struct obd_export *exp, struct obd_device *obd,
- struct obd_uuid *cluuid,
- struct obd_connect_data *data,
- void *localdata)
-{
- struct client_obd *cli = &obd->u.cli;
-
- if (data && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) {
- long lost_grant;
-
- spin_lock(&cli->cl_loi_list_lock);
- data->ocd_grant = (cli->cl_avail_grant +
- (cli->cl_dirty_pages << PAGE_SHIFT)) ?:
- 2 * cli_brw_size(obd);
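- /*
- * Editorial note, not from the original source: the "?:" above is the
- * GNU C conditional with an omitted middle operand: "a ?: b" evaluates
- * to a if a is non-zero, else b. So a client holding no grant and no
- * dirty pages asks for 2 * cli_brw_size(obd) as its initial grant.
- */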
- lost_grant = cli->cl_lost_grant;
- cli->cl_lost_grant = 0;
- spin_unlock(&cli->cl_loi_list_lock);
-
- CDEBUG(D_RPCTRACE, "ocd_connect_flags: %#llx ocd_version: %d ocd_grant: %d, lost: %ld.\n",
- data->ocd_connect_flags,
- data->ocd_version, data->ocd_grant, lost_grant);
- }
-
- return 0;
-}
-
-static int osc_disconnect(struct obd_export *exp)
-{
- struct obd_device *obd = class_exp2obd(exp);
- int rc;
-
- rc = client_disconnect_export(exp);
- /**
- * Initially we put del_shrink_grant before disconnect_export, but it
- * causes the following problem if setup (connect) and cleanup
- * (disconnect) are tangled together.
- *    connect p1                     disconnect p2
- *  ptlrpc_connect_import
- *    ...............                class_manual_cleanup
- *                                     osc_disconnect
- *                                     del_shrink_grant
- *  ptlrpc_connect_interrupt
- *    init_grant_shrink
- *      add this client to shrink list
- *                                     cleanup_osc
- * Bang! The pinger triggers the shrink.
- * So the osc should be disconnected from the shrink list only after we
- * are sure the import has been destroyed. BUG18662
- */
- if (!obd->u.cli.cl_import)
- osc_del_shrink_grant(&obd->u.cli);
- return rc;
-}
-
-static int osc_ldlm_resource_invalidate(struct cfs_hash *hs,
- struct cfs_hash_bd *bd,
- struct hlist_node *hnode, void *arg)
-{
- struct ldlm_resource *res = cfs_hash_object(hs, hnode);
- struct osc_object *osc = NULL;
- struct lu_env *env = arg;
- struct ldlm_lock *lock;
-
- lock_res(res);
- list_for_each_entry(lock, &res->lr_granted, l_res_link) {
- if (lock->l_ast_data && !osc) {
- osc = lock->l_ast_data;
- cl_object_get(osc2cl(osc));
- }
-
- /*
- * clear the LDLM_FL_CLEANED flag to make sure the lock will be
- * canceled by the 2nd round of ldlm_namespace_cleanup() calls in
- * osc_import_event().
- */
- ldlm_clear_cleaned(lock);
- }
- unlock_res(res);
-
- if (osc) {
- osc_object_invalidate(env, osc);
- cl_object_put(env, osc2cl(osc));
- }
-
- return 0;
-}
-
-static int osc_import_event(struct obd_device *obd,
- struct obd_import *imp,
- enum obd_import_event event)
-{
- struct client_obd *cli;
- int rc = 0;
-
- LASSERT(imp->imp_obd == obd);
-
- switch (event) {
- case IMP_EVENT_DISCON: {
- cli = &obd->u.cli;
- spin_lock(&cli->cl_loi_list_lock);
- cli->cl_avail_grant = 0;
- cli->cl_lost_grant = 0;
- spin_unlock(&cli->cl_loi_list_lock);
- break;
- }
- case IMP_EVENT_INACTIVE: {
- rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE, NULL);
- break;
- }
- case IMP_EVENT_INVALIDATE: {
- struct ldlm_namespace *ns = obd->obd_namespace;
- struct lu_env *env;
- u16 refcheck;
-
- ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
-
- env = cl_env_get(&refcheck);
- if (!IS_ERR(env)) {
- osc_io_unplug(env, &obd->u.cli, NULL);
-
- cfs_hash_for_each_nolock(ns->ns_rs_hash,
- osc_ldlm_resource_invalidate,
- env, 0);
- cl_env_put(env, &refcheck);
-
- ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
- } else {
- rc = PTR_ERR(env);
- }
- break;
- }
- case IMP_EVENT_ACTIVE: {
- rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE, NULL);
- break;
- }
- case IMP_EVENT_OCD: {
- struct obd_connect_data *ocd = &imp->imp_connect_data;
-
- if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT)
- osc_init_grant(&obd->u.cli, ocd);
-
- /* See bug 7198 */
- if (ocd->ocd_connect_flags & OBD_CONNECT_REQPORTAL)
- imp->imp_client->cli_request_portal = OST_REQUEST_PORTAL;
-
- rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD, NULL);
- break;
- }
- case IMP_EVENT_DEACTIVATE: {
- rc = obd_notify_observer(obd, obd, OBD_NOTIFY_DEACTIVATE, NULL);
- break;
- }
- case IMP_EVENT_ACTIVATE: {
- rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVATE, NULL);
- break;
- }
- default:
- CERROR("Unknown import event %d\n", event);
- LBUG();
- }
- return rc;
-}
-
-/**
- * Determine whether the lock can be canceled before replaying it
- * during recovery; see bug 16774 for detailed information.
- *
- * \retval zero the lock can't be canceled
- * \retval other ok to cancel
- */
-static int osc_cancel_weight(struct ldlm_lock *lock)
-{
- /*
- * Cancel all unused and granted extent locks.
- */
- if (lock->l_resource->lr_type == LDLM_EXTENT &&
- lock->l_granted_mode == lock->l_req_mode &&
- osc_ldlm_weigh_ast(lock) == 0)
- return 1;
-
- return 0;
-}
-
-static int brw_queue_work(const struct lu_env *env, void *data)
-{
- struct client_obd *cli = data;
-
- CDEBUG(D_CACHE, "Run writeback work for client obd %p.\n", cli);
-
- osc_io_unplug(env, cli, NULL);
- return 0;
-}
-
-int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
-{
- struct lprocfs_static_vars lvars = { NULL };
- struct client_obd *cli = &obd->u.cli;
- void *handler;
- int rc;
- int adding;
- int added;
- int req_count;
-
- rc = ptlrpcd_addref();
- if (rc)
- return rc;
-
- rc = client_obd_setup(obd, lcfg);
- if (rc)
- goto out_ptlrpcd;
-
- handler = ptlrpcd_alloc_work(cli->cl_import, brw_queue_work, cli);
- if (IS_ERR(handler)) {
- rc = PTR_ERR(handler);
- goto out_client_setup;
- }
- cli->cl_writeback_work = handler;
-
- handler = ptlrpcd_alloc_work(cli->cl_import, lru_queue_work, cli);
- if (IS_ERR(handler)) {
- rc = PTR_ERR(handler);
- goto out_ptlrpcd_work;
- }
-
- cli->cl_lru_work = handler;
-
- rc = osc_quota_setup(obd);
- if (rc)
- goto out_ptlrpcd_work;
-
- cli->cl_grant_shrink_interval = GRANT_SHRINK_INTERVAL;
- lprocfs_osc_init_vars(&lvars);
- if (lprocfs_obd_setup(obd, lvars.obd_vars, lvars.sysfs_vars) == 0) {
- lproc_osc_attach_seqstat(obd);
- sptlrpc_lprocfs_cliobd_attach(obd);
- ptlrpc_lprocfs_register_obd(obd);
- }
-
- /*
- * We try to control the total number of requests with an upper limit,
- * osc_reqpool_maxreqcount. There might be some race which will cause
- * over-limit allocation, but it is fine.
- */
- req_count = atomic_read(&osc_pool_req_count);
- if (req_count < osc_reqpool_maxreqcount) {
- adding = cli->cl_max_rpcs_in_flight + 2;
- if (req_count + adding > osc_reqpool_maxreqcount)
- adding = osc_reqpool_maxreqcount - req_count;
-
- added = ptlrpc_add_rqs_to_pool(osc_rq_pool, adding);
- atomic_add(added, &osc_pool_req_count);
- }
-
- INIT_LIST_HEAD(&cli->cl_grant_shrink_list);
- ns_register_cancel(obd->obd_namespace, osc_cancel_weight);
-
- spin_lock(&osc_shrink_lock);
- list_add_tail(&cli->cl_shrink_list, &osc_shrink_list);
- spin_unlock(&osc_shrink_lock);
-
- return rc;
-
-out_ptlrpcd_work:
- if (cli->cl_writeback_work) {
- ptlrpcd_destroy_work(cli->cl_writeback_work);
- cli->cl_writeback_work = NULL;
- }
- if (cli->cl_lru_work) {
- ptlrpcd_destroy_work(cli->cl_lru_work);
- cli->cl_lru_work = NULL;
- }
-out_client_setup:
- client_obd_cleanup(obd);
-out_ptlrpcd:
- ptlrpcd_decref();
- return rc;
-}
-
-static int osc_precleanup(struct obd_device *obd)
-{
- struct client_obd *cli = &obd->u.cli;
-
- /* LU-464
- * for the echo client, the export may be on the zombie list; wait for
- * the zombie thread to cull it, because cli.cl_import will be
- * cleared in client_disconnect_export():
- * class_export_destroy() -> obd_cleanup() ->
- * echo_device_free() -> echo_client_cleanup() ->
- * obd_disconnect() -> osc_disconnect() ->
- * client_disconnect_export()
- */
- obd_zombie_barrier();
- if (cli->cl_writeback_work) {
- ptlrpcd_destroy_work(cli->cl_writeback_work);
- cli->cl_writeback_work = NULL;
- }
-
- if (cli->cl_lru_work) {
- ptlrpcd_destroy_work(cli->cl_lru_work);
- cli->cl_lru_work = NULL;
- }
-
- obd_cleanup_client_import(obd);
- ptlrpc_lprocfs_unregister_obd(obd);
- lprocfs_obd_cleanup(obd);
- return 0;
-}
-
-static int osc_cleanup(struct obd_device *obd)
-{
- struct client_obd *cli = &obd->u.cli;
- int rc;
-
- spin_lock(&osc_shrink_lock);
- list_del(&cli->cl_shrink_list);
- spin_unlock(&osc_shrink_lock);
-
- /* lru cleanup */
- if (cli->cl_cache) {
- LASSERT(atomic_read(&cli->cl_cache->ccc_users) > 0);
- spin_lock(&cli->cl_cache->ccc_lru_lock);
- list_del_init(&cli->cl_lru_osc);
- spin_unlock(&cli->cl_cache->ccc_lru_lock);
- cli->cl_lru_left = NULL;
- cl_cache_decref(cli->cl_cache);
- cli->cl_cache = NULL;
- }
-
- /* free memory of osc quota cache */
- osc_quota_cleanup(obd);
-
- rc = client_obd_cleanup(obd);
-
- ptlrpcd_decref();
- return rc;
-}
-
-int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg)
-{
- struct lprocfs_static_vars lvars = { NULL };
- int rc = 0;
-
- lprocfs_osc_init_vars(&lvars);
-
- switch (lcfg->lcfg_command) {
- default:
- rc = class_process_proc_param(PARAM_OSC, lvars.obd_vars,
- lcfg, obd);
- if (rc > 0)
- rc = 0;
- break;
- }
-
- return rc;
-}
-
-static int osc_process_config(struct obd_device *obd, u32 len, void *buf)
-{
- return osc_process_config_base(obd, buf);
-}
-
-static struct obd_ops osc_obd_ops = {
- .owner = THIS_MODULE,
- .setup = osc_setup,
- .precleanup = osc_precleanup,
- .cleanup = osc_cleanup,
- .add_conn = client_import_add_conn,
- .del_conn = client_import_del_conn,
- .connect = client_connect_import,
- .reconnect = osc_reconnect,
- .disconnect = osc_disconnect,
- .statfs = osc_statfs,
- .statfs_async = osc_statfs_async,
- .create = osc_create,
- .destroy = osc_destroy,
- .getattr = osc_getattr,
- .setattr = osc_setattr,
- .iocontrol = osc_iocontrol,
- .set_info_async = osc_set_info_async,
- .import_event = osc_import_event,
- .process_config = osc_process_config,
- .quotactl = osc_quotactl,
-};
-
-struct list_head osc_shrink_list = LIST_HEAD_INIT(osc_shrink_list);
-DEFINE_SPINLOCK(osc_shrink_lock);
-
-static struct shrinker osc_cache_shrinker = {
- .count_objects = osc_cache_shrink_count,
- .scan_objects = osc_cache_shrink_scan,
- .seeks = DEFAULT_SEEKS,
-};
-
-static int __init osc_init(void)
-{
- struct lprocfs_static_vars lvars = { NULL };
- unsigned int reqpool_size;
- unsigned int reqsize;
- int rc;
-
- /* print an address of _any_ initialized kernel symbol from this
- * module, to allow debugging with gdb that doesn't support data
- * symbols from modules.
- */
- CDEBUG(D_INFO, "Lustre OSC module (%p).\n", &osc_caches);
-
- rc = lu_kmem_init(osc_caches);
- if (rc)
- return rc;
-
- lprocfs_osc_init_vars(&lvars);
-
- rc = class_register_type(&osc_obd_ops, NULL,
- LUSTRE_OSC_NAME, &osc_device_type);
- if (rc)
- goto out_kmem;
-
- rc = register_shrinker(&osc_cache_shrinker);
- if (rc)
- goto out_type;
-
- /* This is obviously too much memory; we only prevent overflow here */
- if (osc_reqpool_mem_max >= 1 << 12 || osc_reqpool_mem_max == 0) {
- rc = -EINVAL;
- goto out_type;
- }
-
- reqpool_size = osc_reqpool_mem_max << 20;
-
- reqsize = 1;
- while (reqsize < OST_MAXREQSIZE)
- reqsize = reqsize << 1;
-
- /*
- * We don't enlarge the request count in the OSC pool according to
- * cl_max_rpcs_in_flight. Allocation from the pool will only be
- * tried after normal allocation has failed, so a small OSC pool won't
- * cause much performance degradation in most cases.
- */
- osc_reqpool_maxreqcount = reqpool_size / reqsize;
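- /*
- * Editorial note, not from the original source: a worked example with
- * hypothetical values. If osc_reqpool_mem_max were 5 (MiB), then
- * reqpool_size = 5 << 20 = 5242880 bytes; if OST_MAXREQSIZE rounded up
- * to reqsize = 32768, the pool would be capped at
- * 5242880 / 32768 = 160 requests.
- */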
-
- atomic_set(&osc_pool_req_count, 0);
- osc_rq_pool = ptlrpc_init_rq_pool(0, OST_MAXREQSIZE,
- ptlrpc_add_rqs_to_pool);
-
- if (osc_rq_pool)
- return 0;
-
- rc = -ENOMEM;
-
-out_type:
- class_unregister_type(LUSTRE_OSC_NAME);
-out_kmem:
- lu_kmem_fini(osc_caches);
- return rc;
-}
-
-static void /*__exit*/ osc_exit(void)
-{
- unregister_shrinker(&osc_cache_shrinker);
- class_unregister_type(LUSTRE_OSC_NAME);
- lu_kmem_fini(osc_caches);
- ptlrpc_free_rq_pool(osc_rq_pool);
-}
-
-MODULE_AUTHOR("OpenSFS, Inc. <http://www.lustre.org/>");
-MODULE_DESCRIPTION("Lustre Object Storage Client (OSC)");
-MODULE_LICENSE("GPL");
-MODULE_VERSION(LUSTRE_VERSION_STRING);
-
-module_init(osc_init);
-module_exit(osc_exit);