diff options
Diffstat (limited to 'drivers/staging/lustre/lustre/osc')
-rw-r--r-- | drivers/staging/lustre/lustre/osc/Makefile | 6 | ||||
-rw-r--r-- | drivers/staging/lustre/lustre/osc/lproc_osc.c | 843 | ||||
-rw-r--r-- | drivers/staging/lustre/lustre/osc/osc_cache.c | 3306 | ||||
-rw-r--r-- | drivers/staging/lustre/lustre/osc/osc_cl_internal.h | 683 | ||||
-rw-r--r-- | drivers/staging/lustre/lustre/osc/osc_dev.c | 246 | ||||
-rw-r--r-- | drivers/staging/lustre/lustre/osc/osc_internal.h | 236 | ||||
-rw-r--r-- | drivers/staging/lustre/lustre/osc/osc_io.c | 918 | ||||
-rw-r--r-- | drivers/staging/lustre/lustre/osc/osc_lock.c | 1231 | ||||
-rw-r--r-- | drivers/staging/lustre/lustre/osc/osc_object.c | 474 | ||||
-rw-r--r-- | drivers/staging/lustre/lustre/osc/osc_page.c | 1094 | ||||
-rw-r--r-- | drivers/staging/lustre/lustre/osc/osc_quota.c | 284 | ||||
-rw-r--r-- | drivers/staging/lustre/lustre/osc/osc_request.c | 2899 |
12 files changed, 0 insertions, 12220 deletions
diff --git a/drivers/staging/lustre/lustre/osc/Makefile b/drivers/staging/lustre/lustre/osc/Makefile deleted file mode 100644 index 30dec90e64e8..000000000000 --- a/drivers/staging/lustre/lustre/osc/Makefile +++ /dev/null @@ -1,6 +0,0 @@ -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/include -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/lustre/include - -obj-$(CONFIG_LUSTRE_FS) += osc.o -osc-y := osc_request.o osc_dev.o osc_object.o \ - osc_page.o osc_lock.o osc_io.o osc_quota.o osc_cache.o lproc_osc.o diff --git a/drivers/staging/lustre/lustre/osc/lproc_osc.c b/drivers/staging/lustre/lustre/osc/lproc_osc.c deleted file mode 100644 index dc76c35ae801..000000000000 --- a/drivers/staging/lustre/lustre/osc/lproc_osc.c +++ /dev/null @@ -1,843 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- */ -#define DEBUG_SUBSYSTEM S_CLASS - -#include <linux/statfs.h> -#include <obd_cksum.h> -#include <obd_class.h> -#include <lprocfs_status.h> -#include <linux/seq_file.h> -#include "osc_internal.h" - -static ssize_t active_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kobj); - - return sprintf(buf, "%d\n", !dev->u.cli.cl_import->imp_deactive); -} - -static ssize_t active_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, - size_t count) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kobj); - int rc; - unsigned long val; - - rc = kstrtoul(buffer, 10, &val); - if (rc) - return rc; - if (val > 1) - return -ERANGE; - - /* opposite senses */ - if (dev->u.cli.cl_import->imp_deactive == val) - rc = ptlrpc_set_import_active(dev->u.cli.cl_import, val); - else - CDEBUG(D_CONFIG, "activate %ld: ignoring repeat request\n", - val); - - return count; -} -LUSTRE_RW_ATTR(active); - -static ssize_t max_rpcs_in_flight_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kobj); - struct client_obd *cli = &dev->u.cli; - - return sprintf(buf, "%u\n", cli->cl_max_rpcs_in_flight); -} - -static ssize_t max_rpcs_in_flight_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kobj); - struct client_obd *cli = &dev->u.cli; - int rc; - unsigned long val; - int adding, added, req_count; - - rc = kstrtoul(buffer, 10, &val); - if (rc) - return rc; - - if (val < 1 || val > OSC_MAX_RIF_MAX) - return -ERANGE; - - adding = val - cli->cl_max_rpcs_in_flight; - req_count = atomic_read(&osc_pool_req_count); - if (adding > 0 && req_count < osc_reqpool_maxreqcount) { - /* - * There might be some race which will cause over-limit - * allocation, but it is fine. 
- */ - if (req_count + adding > osc_reqpool_maxreqcount) - adding = osc_reqpool_maxreqcount - req_count; - - added = osc_rq_pool->prp_populate(osc_rq_pool, adding); - atomic_add(added, &osc_pool_req_count); - } - - spin_lock(&cli->cl_loi_list_lock); - cli->cl_max_rpcs_in_flight = val; - client_adjust_max_dirty(cli); - spin_unlock(&cli->cl_loi_list_lock); - - return count; -} -LUSTRE_RW_ATTR(max_rpcs_in_flight); - -static ssize_t max_dirty_mb_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kobj); - struct client_obd *cli = &dev->u.cli; - long val; - int mult; - - spin_lock(&cli->cl_loi_list_lock); - val = cli->cl_dirty_max_pages; - spin_unlock(&cli->cl_loi_list_lock); - - mult = 1 << (20 - PAGE_SHIFT); - return lprocfs_read_frac_helper(buf, PAGE_SIZE, val, mult); -} - -static ssize_t max_dirty_mb_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kobj); - struct client_obd *cli = &dev->u.cli; - int rc; - unsigned long pages_number; - - rc = kstrtoul(buffer, 10, &pages_number); - if (rc) - return rc; - - pages_number *= 1 << (20 - PAGE_SHIFT); /* MB -> pages */ - - if (pages_number <= 0 || - pages_number >= OSC_MAX_DIRTY_MB_MAX << (20 - PAGE_SHIFT) || - pages_number > totalram_pages / 4) /* 1/4 of RAM */ - return -ERANGE; - - spin_lock(&cli->cl_loi_list_lock); - cli->cl_dirty_max_pages = pages_number; - osc_wake_cache_waiters(cli); - spin_unlock(&cli->cl_loi_list_lock); - - return count; -} -LUSTRE_RW_ATTR(max_dirty_mb); - -static int osc_cached_mb_seq_show(struct seq_file *m, void *v) -{ - struct obd_device *dev = m->private; - struct client_obd *cli = &dev->u.cli; - int shift = 20 - PAGE_SHIFT; - - seq_printf(m, - "used_mb: %ld\n" - "busy_cnt: %ld\n" - "reclaim: %llu\n", - (atomic_long_read(&cli->cl_lru_in_list) + - atomic_long_read(&cli->cl_lru_busy)) >> shift, 
- atomic_long_read(&cli->cl_lru_busy), - cli->cl_lru_reclaim); - - return 0; -} - -/* shrink the number of caching pages to a specific number */ -static ssize_t osc_cached_mb_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) -{ - struct obd_device *dev = ((struct seq_file *)file->private_data)->private; - struct client_obd *cli = &dev->u.cli; - long pages_number, rc; - char kernbuf[128]; - int mult; - u64 val; - - if (count >= sizeof(kernbuf)) - return -EINVAL; - - if (copy_from_user(kernbuf, buffer, count)) - return -EFAULT; - kernbuf[count] = 0; - - mult = 1 << (20 - PAGE_SHIFT); - buffer += lprocfs_find_named_value(kernbuf, "used_mb:", &count) - - kernbuf; - rc = lprocfs_write_frac_u64_helper(buffer, count, &val, mult); - if (rc) - return rc; - - if (val > LONG_MAX) - return -ERANGE; - pages_number = (long)val; - - if (pages_number < 0) - return -ERANGE; - - rc = atomic_long_read(&cli->cl_lru_in_list) - pages_number; - if (rc > 0) { - struct lu_env *env; - u16 refcheck; - - env = cl_env_get(&refcheck); - if (!IS_ERR(env)) { - (void)osc_lru_shrink(env, cli, rc, true); - cl_env_put(env, &refcheck); - } - } - - return count; -} - -LPROC_SEQ_FOPS(osc_cached_mb); - -static ssize_t cur_dirty_bytes_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kobj); - struct client_obd *cli = &dev->u.cli; - int len; - - spin_lock(&cli->cl_loi_list_lock); - len = sprintf(buf, "%lu\n", cli->cl_dirty_pages << PAGE_SHIFT); - spin_unlock(&cli->cl_loi_list_lock); - - return len; -} -LUSTRE_RO_ATTR(cur_dirty_bytes); - -static ssize_t cur_grant_bytes_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kobj); - struct client_obd *cli = &dev->u.cli; - int len; - - spin_lock(&cli->cl_loi_list_lock); - len = sprintf(buf, "%lu\n", cli->cl_avail_grant); - 
spin_unlock(&cli->cl_loi_list_lock); - - return len; -} - -static ssize_t cur_grant_bytes_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kobj); - struct client_obd *cli = &obd->u.cli; - int rc; - unsigned long long val; - - rc = kstrtoull(buffer, 10, &val); - if (rc) - return rc; - - /* this is only for shrinking grant */ - spin_lock(&cli->cl_loi_list_lock); - if (val >= cli->cl_avail_grant) { - spin_unlock(&cli->cl_loi_list_lock); - return -EINVAL; - } - spin_unlock(&cli->cl_loi_list_lock); - - if (cli->cl_import->imp_state == LUSTRE_IMP_FULL) - rc = osc_shrink_grant_to_target(cli, val); - if (rc) - return rc; - return count; -} -LUSTRE_RW_ATTR(cur_grant_bytes); - -static ssize_t cur_lost_grant_bytes_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kobj); - struct client_obd *cli = &dev->u.cli; - int len; - - spin_lock(&cli->cl_loi_list_lock); - len = sprintf(buf, "%lu\n", cli->cl_lost_grant); - spin_unlock(&cli->cl_loi_list_lock); - - return len; -} -LUSTRE_RO_ATTR(cur_lost_grant_bytes); - -static ssize_t grant_shrink_interval_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kobj); - - return sprintf(buf, "%d\n", obd->u.cli.cl_grant_shrink_interval); -} - -static ssize_t grant_shrink_interval_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kobj); - int rc; - unsigned long val; - - rc = kstrtoul(buffer, 10, &val); - if (rc) - return rc; - - if (val <= 0) - return -ERANGE; - - obd->u.cli.cl_grant_shrink_interval = val; - - return count; -} -LUSTRE_RW_ATTR(grant_shrink_interval); - -static ssize_t checksums_show(struct kobject *kobj, - struct attribute 
*attr, - char *buf) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kobj); - - return sprintf(buf, "%d\n", obd->u.cli.cl_checksum ? 1 : 0); -} - -static ssize_t checksums_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kobj); - int rc; - unsigned long val; - - rc = kstrtoul(buffer, 10, &val); - if (rc) - return rc; - - obd->u.cli.cl_checksum = (val ? 1 : 0); - - return count; -} -LUSTRE_RW_ATTR(checksums); - -static int osc_checksum_type_seq_show(struct seq_file *m, void *v) -{ - struct obd_device *obd = m->private; - int i; - - DECLARE_CKSUM_NAME; - - if (!obd) - return 0; - - for (i = 0; i < ARRAY_SIZE(cksum_name); i++) { - if (((1 << i) & obd->u.cli.cl_supp_cksum_types) == 0) - continue; - if (obd->u.cli.cl_cksum_type == (1 << i)) - seq_printf(m, "[%s] ", cksum_name[i]); - else - seq_printf(m, "%s ", cksum_name[i]); - } - seq_putc(m, '\n'); - return 0; -} - -static ssize_t osc_checksum_type_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) -{ - struct obd_device *obd = ((struct seq_file *)file->private_data)->private; - int i; - - DECLARE_CKSUM_NAME; - char kernbuf[10]; - - if (!obd) - return 0; - - if (count > sizeof(kernbuf) - 1) - return -EINVAL; - if (copy_from_user(kernbuf, buffer, count)) - return -EFAULT; - if (count > 0 && kernbuf[count - 1] == '\n') - kernbuf[count - 1] = '\0'; - else - kernbuf[count] = '\0'; - - for (i = 0; i < ARRAY_SIZE(cksum_name); i++) { - if (((1 << i) & obd->u.cli.cl_supp_cksum_types) == 0) - continue; - if (!strcmp(kernbuf, cksum_name[i])) { - obd->u.cli.cl_cksum_type = 1 << i; - return count; - } - } - return -EINVAL; -} - -LPROC_SEQ_FOPS(osc_checksum_type); - -static ssize_t resend_count_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kobj); - - return 
sprintf(buf, "%u\n", atomic_read(&obd->u.cli.cl_resends)); -} - -static ssize_t resend_count_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kobj); - int rc; - unsigned long val; - - rc = kstrtoul(buffer, 10, &val); - if (rc) - return rc; - - atomic_set(&obd->u.cli.cl_resends, val); - - return count; -} -LUSTRE_RW_ATTR(resend_count); - -static ssize_t contention_seconds_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kobj); - struct osc_device *od = obd2osc_dev(obd); - - return sprintf(buf, "%u\n", od->od_contention_time); -} - -static ssize_t contention_seconds_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kobj); - struct osc_device *od = obd2osc_dev(obd); - int rc; - int val; - - rc = kstrtoint(buffer, 10, &val); - if (rc) - return rc; - - if (val < 0) - return -EINVAL; - - od->od_contention_time = val; - - return count; -} -LUSTRE_RW_ATTR(contention_seconds); - -static ssize_t lockless_truncate_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kobj); - struct osc_device *od = obd2osc_dev(obd); - - return sprintf(buf, "%u\n", od->od_lockless_truncate); -} - -static ssize_t lockless_truncate_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kobj); - struct osc_device *od = obd2osc_dev(obd); - int rc; - unsigned int val; - - rc = kstrtouint(buffer, 10, &val); - if (rc) - return rc; - - od->od_lockless_truncate = val; - - return count; -} -LUSTRE_RW_ATTR(lockless_truncate); - -static ssize_t destroys_in_flight_show(struct kobject *kobj, - 
struct attribute *attr, - char *buf) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kobj); - - return sprintf(buf, "%u\n", - atomic_read(&obd->u.cli.cl_destroy_in_flight)); -} -LUSTRE_RO_ATTR(destroys_in_flight); - -static ssize_t max_pages_per_rpc_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kobj); - struct client_obd *cli = &dev->u.cli; - - return sprintf(buf, "%d\n", cli->cl_max_pages_per_rpc); -} - -static ssize_t max_pages_per_rpc_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kobj); - struct client_obd *cli = &dev->u.cli; - struct obd_connect_data *ocd = &cli->cl_import->imp_connect_data; - int chunk_mask, rc; - unsigned long long val; - - rc = kstrtoull(buffer, 10, &val); - if (rc) - return rc; - - /* if the max_pages is specified in bytes, convert to pages */ - if (val >= ONE_MB_BRW_SIZE) - val >>= PAGE_SHIFT; - - chunk_mask = ~((1 << (cli->cl_chunkbits - PAGE_SHIFT)) - 1); - /* max_pages_per_rpc must be chunk aligned */ - val = (val + ~chunk_mask) & chunk_mask; - if (!val || (ocd->ocd_brw_size && - val > ocd->ocd_brw_size >> PAGE_SHIFT)) { - return -ERANGE; - } - spin_lock(&cli->cl_loi_list_lock); - cli->cl_max_pages_per_rpc = val; - client_adjust_max_dirty(cli); - spin_unlock(&cli->cl_loi_list_lock); - - return count; -} -LUSTRE_RW_ATTR(max_pages_per_rpc); - -static ssize_t unstable_stats_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kobj); - struct client_obd *cli = &dev->u.cli; - long pages; - int mb; - - pages = atomic_long_read(&cli->cl_unstable_count); - mb = (pages * PAGE_SIZE) >> 20; - - return sprintf(buf, "unstable_pages: %20ld\n" - "unstable_mb: %10d\n", pages, mb); -} -LUSTRE_RO_ATTR(unstable_stats); - 
-LPROC_SEQ_FOPS_RO_TYPE(osc, connect_flags); -LPROC_SEQ_FOPS_RO_TYPE(osc, server_uuid); -LPROC_SEQ_FOPS_RO_TYPE(osc, conn_uuid); -LPROC_SEQ_FOPS_RO_TYPE(osc, timeouts); -LPROC_SEQ_FOPS_RO_TYPE(osc, state); - -LPROC_SEQ_FOPS_WR_ONLY(osc, ping); - -LPROC_SEQ_FOPS_RW_TYPE(osc, import); -LPROC_SEQ_FOPS_RW_TYPE(osc, pinger_recov); - -static struct lprocfs_vars lprocfs_osc_obd_vars[] = { - { "ping", &osc_ping_fops, NULL, 0222 }, - { "connect_flags", &osc_connect_flags_fops, NULL, 0 }, - /*{ "filegroups", lprocfs_rd_filegroups, NULL, 0 },*/ - { "ost_server_uuid", &osc_server_uuid_fops, NULL, 0 }, - { "ost_conn_uuid", &osc_conn_uuid_fops, NULL, 0 }, - { "osc_cached_mb", &osc_cached_mb_fops, NULL }, - { "checksum_type", &osc_checksum_type_fops, NULL }, - { "timeouts", &osc_timeouts_fops, NULL, 0 }, - { "import", &osc_import_fops, NULL }, - { "state", &osc_state_fops, NULL, 0 }, - { "pinger_recov", &osc_pinger_recov_fops, NULL }, - { NULL } -}; - -#define pct(a, b) (b ? a * 100 / b : 0) - -static int osc_rpc_stats_seq_show(struct seq_file *seq, void *v) -{ - struct timespec64 now; - struct obd_device *dev = seq->private; - struct client_obd *cli = &dev->u.cli; - unsigned long read_tot = 0, write_tot = 0, read_cum, write_cum; - int i; - - ktime_get_real_ts64(&now); - - spin_lock(&cli->cl_loi_list_lock); - - seq_printf(seq, "snapshot_time: %llu.%9lu (secs.usecs)\n", - (s64)now.tv_sec, (unsigned long)now.tv_nsec); - seq_printf(seq, "read RPCs in flight: %d\n", - cli->cl_r_in_flight); - seq_printf(seq, "write RPCs in flight: %d\n", - cli->cl_w_in_flight); - seq_printf(seq, "pending write pages: %d\n", - atomic_read(&cli->cl_pending_w_pages)); - seq_printf(seq, "pending read pages: %d\n", - atomic_read(&cli->cl_pending_r_pages)); - - seq_puts(seq, "\n\t\t\tread\t\t\twrite\n"); - seq_puts(seq, "pages per rpc rpcs % cum % |"); - seq_puts(seq, " rpcs % cum %\n"); - - read_tot = lprocfs_oh_sum(&cli->cl_read_page_hist); - write_tot = lprocfs_oh_sum(&cli->cl_write_page_hist); - - 
read_cum = 0; - write_cum = 0; - for (i = 0; i < OBD_HIST_MAX; i++) { - unsigned long r = cli->cl_read_page_hist.oh_buckets[i]; - unsigned long w = cli->cl_write_page_hist.oh_buckets[i]; - - read_cum += r; - write_cum += w; - seq_printf(seq, "%d:\t\t%10lu %3lu %3lu | %10lu %3lu %3lu\n", - 1 << i, r, pct(r, read_tot), - pct(read_cum, read_tot), w, - pct(w, write_tot), - pct(write_cum, write_tot)); - if (read_cum == read_tot && write_cum == write_tot) - break; - } - - seq_puts(seq, "\n\t\t\tread\t\t\twrite\n"); - seq_puts(seq, "rpcs in flight rpcs % cum % |"); - seq_puts(seq, " rpcs % cum %\n"); - - read_tot = lprocfs_oh_sum(&cli->cl_read_rpc_hist); - write_tot = lprocfs_oh_sum(&cli->cl_write_rpc_hist); - - read_cum = 0; - write_cum = 0; - for (i = 0; i < OBD_HIST_MAX; i++) { - unsigned long r = cli->cl_read_rpc_hist.oh_buckets[i]; - unsigned long w = cli->cl_write_rpc_hist.oh_buckets[i]; - - read_cum += r; - write_cum += w; - seq_printf(seq, "%d:\t\t%10lu %3lu %3lu | %10lu %3lu %3lu\n", - i, r, pct(r, read_tot), - pct(read_cum, read_tot), w, - pct(w, write_tot), - pct(write_cum, write_tot)); - if (read_cum == read_tot && write_cum == write_tot) - break; - } - - seq_puts(seq, "\n\t\t\tread\t\t\twrite\n"); - seq_puts(seq, "offset rpcs % cum % |"); - seq_puts(seq, " rpcs % cum %\n"); - - read_tot = lprocfs_oh_sum(&cli->cl_read_offset_hist); - write_tot = lprocfs_oh_sum(&cli->cl_write_offset_hist); - - read_cum = 0; - write_cum = 0; - for (i = 0; i < OBD_HIST_MAX; i++) { - unsigned long r = cli->cl_read_offset_hist.oh_buckets[i]; - unsigned long w = cli->cl_write_offset_hist.oh_buckets[i]; - - read_cum += r; - write_cum += w; - seq_printf(seq, "%d:\t\t%10lu %3lu %3lu | %10lu %3lu %3lu\n", - (i == 0) ? 
0 : 1 << (i - 1), - r, pct(r, read_tot), pct(read_cum, read_tot), - w, pct(w, write_tot), pct(write_cum, write_tot)); - if (read_cum == read_tot && write_cum == write_tot) - break; - } - - spin_unlock(&cli->cl_loi_list_lock); - - return 0; -} - -#undef pct - -static ssize_t osc_rpc_stats_seq_write(struct file *file, - const char __user *buf, - size_t len, loff_t *off) -{ - struct seq_file *seq = file->private_data; - struct obd_device *dev = seq->private; - struct client_obd *cli = &dev->u.cli; - - lprocfs_oh_clear(&cli->cl_read_rpc_hist); - lprocfs_oh_clear(&cli->cl_write_rpc_hist); - lprocfs_oh_clear(&cli->cl_read_page_hist); - lprocfs_oh_clear(&cli->cl_write_page_hist); - lprocfs_oh_clear(&cli->cl_read_offset_hist); - lprocfs_oh_clear(&cli->cl_write_offset_hist); - - return len; -} - -LPROC_SEQ_FOPS(osc_rpc_stats); - -static int osc_stats_seq_show(struct seq_file *seq, void *v) -{ - struct timespec64 now; - struct obd_device *dev = seq->private; - struct osc_stats *stats = &obd2osc_dev(dev)->od_stats; - - ktime_get_real_ts64(&now); - - seq_printf(seq, "snapshot_time: %llu.%9lu (secs.usecs)\n", - (s64)now.tv_sec, (unsigned long)now.tv_nsec); - seq_printf(seq, "lockless_write_bytes\t\t%llu\n", - stats->os_lockless_writes); - seq_printf(seq, "lockless_read_bytes\t\t%llu\n", - stats->os_lockless_reads); - seq_printf(seq, "lockless_truncate\t\t%llu\n", - stats->os_lockless_truncates); - return 0; -} - -static ssize_t osc_stats_seq_write(struct file *file, - const char __user *buf, - size_t len, loff_t *off) -{ - struct seq_file *seq = file->private_data; - struct obd_device *dev = seq->private; - struct osc_stats *stats = &obd2osc_dev(dev)->od_stats; - - memset(stats, 0, sizeof(*stats)); - return len; -} - -LPROC_SEQ_FOPS(osc_stats); - -int lproc_osc_attach_seqstat(struct obd_device *dev) -{ - int rc; - - rc = ldebugfs_seq_create(dev->obd_debugfs_entry, "osc_stats", 0644, - &osc_stats_fops, dev); - if (rc == 0) - rc = ldebugfs_obd_seq_create(dev, "rpc_stats", 0644, - 
&osc_rpc_stats_fops, dev); - - return rc; -} - -static struct attribute *osc_attrs[] = { - &lustre_attr_active.attr, - &lustre_attr_checksums.attr, - &lustre_attr_contention_seconds.attr, - &lustre_attr_cur_dirty_bytes.attr, - &lustre_attr_cur_grant_bytes.attr, - &lustre_attr_cur_lost_grant_bytes.attr, - &lustre_attr_destroys_in_flight.attr, - &lustre_attr_grant_shrink_interval.attr, - &lustre_attr_lockless_truncate.attr, - &lustre_attr_max_dirty_mb.attr, - &lustre_attr_max_pages_per_rpc.attr, - &lustre_attr_max_rpcs_in_flight.attr, - &lustre_attr_resend_count.attr, - &lustre_attr_unstable_stats.attr, - NULL, -}; - -static const struct attribute_group osc_attr_group = { - .attrs = osc_attrs, -}; - -void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars) -{ - lvars->sysfs_vars = &osc_attr_group; - lvars->obd_vars = lprocfs_osc_obd_vars; -} diff --git a/drivers/staging/lustre/lustre/osc/osc_cache.c b/drivers/staging/lustre/lustre/osc/osc_cache.c deleted file mode 100644 index 459503727ce3..000000000000 --- a/drivers/staging/lustre/lustre/osc/osc_cache.c +++ /dev/null @@ -1,3306 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). 
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015, Intel Corporation. - * - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * osc cache management. - * - * Author: Jinshan Xiong <jinshan.xiong@whamcloud.com> - */ - -#define DEBUG_SUBSYSTEM S_OSC - -#include "osc_cl_internal.h" -#include "osc_internal.h" - -static int extent_debug; /* set it to be true for more debug */ - -static void osc_update_pending(struct osc_object *obj, int cmd, int delta); -static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext, - enum osc_extent_state state); -static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli, - struct osc_async_page *oap, int sent, int rc); -static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap, - int cmd); -static int osc_refresh_count(const struct lu_env *env, - struct osc_async_page *oap, int cmd); -static int osc_io_unplug_async(const struct lu_env *env, - struct client_obd *cli, struct osc_object *osc); -static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages, - unsigned int lost_grant); - -static void osc_extent_tree_dump0(int level, struct osc_object *obj, - const char *func, int line); -#define osc_extent_tree_dump(lvl, obj) \ - osc_extent_tree_dump0(lvl, obj, __func__, __LINE__) - -/** \addtogroup osc - * @{ - */ - -/* ------------------ osc extent ------------------ */ -static inline char *ext_flags(struct osc_extent *ext, char *flags) -{ - char *buf = flags; - *buf++ = ext->oe_rw ? 
'r' : 'w'; - if (ext->oe_intree) - *buf++ = 'i'; - if (ext->oe_sync) - *buf++ = 'S'; - if (ext->oe_srvlock) - *buf++ = 's'; - if (ext->oe_hp) - *buf++ = 'h'; - if (ext->oe_urgent) - *buf++ = 'u'; - if (ext->oe_memalloc) - *buf++ = 'm'; - if (ext->oe_trunc_pending) - *buf++ = 't'; - if (ext->oe_fsync_wait) - *buf++ = 'Y'; - *buf = 0; - return flags; -} - -static inline char list_empty_marker(struct list_head *list) -{ - return list_empty(list) ? '-' : '+'; -} - -#define EXTSTR "[%lu -> %lu/%lu]" -#define EXTPARA(ext) (ext)->oe_start, (ext)->oe_end, (ext)->oe_max_end -static const char *oes_strings[] = { - "inv", "active", "cache", "locking", "lockdone", "rpc", "trunc", NULL }; - -#define OSC_EXTENT_DUMP(lvl, extent, fmt, ...) do { \ - struct osc_extent *__ext = (extent); \ - char __buf[16]; \ - \ - CDEBUG(lvl, \ - "extent %p@{" EXTSTR ", " \ - "[%d|%d|%c|%s|%s|%p], [%d|%d|%c|%c|%p|%u|%p]} " fmt, \ - /* ----- extent part 0 ----- */ \ - __ext, EXTPARA(__ext), \ - /* ----- part 1 ----- */ \ - atomic_read(&__ext->oe_refc), \ - atomic_read(&__ext->oe_users), \ - list_empty_marker(&__ext->oe_link), \ - oes_strings[__ext->oe_state], ext_flags(__ext, __buf), \ - __ext->oe_obj, \ - /* ----- part 2 ----- */ \ - __ext->oe_grants, __ext->oe_nr_pages, \ - list_empty_marker(&__ext->oe_pages), \ - waitqueue_active(&__ext->oe_waitq) ? '+' : '-', \ - __ext->oe_dlmlock, __ext->oe_mppr, __ext->oe_owner, \ - /* ----- part 4 ----- */ \ - ## __VA_ARGS__); \ - if (lvl == D_ERROR && __ext->oe_dlmlock) \ - LDLM_ERROR(__ext->oe_dlmlock, "extent: %p", __ext); \ - else \ - LDLM_DEBUG(__ext->oe_dlmlock, "extent: %p", __ext); \ -} while (0) - -#undef EASSERTF -#define EASSERTF(expr, ext, fmt, args...) 
do { \ - if (!(expr)) { \ - OSC_EXTENT_DUMP(D_ERROR, (ext), fmt, ##args); \ - osc_extent_tree_dump(D_ERROR, (ext)->oe_obj); \ - LASSERT(expr); \ - } \ -} while (0) - -#undef EASSERT -#define EASSERT(expr, ext) EASSERTF(expr, ext, "\n") - -static inline struct osc_extent *rb_extent(struct rb_node *n) -{ - return rb_entry_safe(n, struct osc_extent, oe_node); -} - -static inline struct osc_extent *next_extent(struct osc_extent *ext) -{ - if (!ext) - return NULL; - - LASSERT(ext->oe_intree); - return rb_extent(rb_next(&ext->oe_node)); -} - -static inline struct osc_extent *prev_extent(struct osc_extent *ext) -{ - if (!ext) - return NULL; - - LASSERT(ext->oe_intree); - return rb_extent(rb_prev(&ext->oe_node)); -} - -static inline struct osc_extent *first_extent(struct osc_object *obj) -{ - return rb_extent(rb_first(&obj->oo_root)); -} - -/* object must be locked by caller. */ -static int osc_extent_sanity_check0(struct osc_extent *ext, - const char *func, const int line) -{ - struct osc_object *obj = ext->oe_obj; - struct osc_async_page *oap; - size_t page_count; - int rc = 0; - - if (!osc_object_is_locked(obj)) { - rc = 9; - goto out; - } - - if (ext->oe_state >= OES_STATE_MAX) { - rc = 10; - goto out; - } - - if (atomic_read(&ext->oe_refc) <= 0) { - rc = 20; - goto out; - } - - if (atomic_read(&ext->oe_refc) < atomic_read(&ext->oe_users)) { - rc = 30; - goto out; - } - - switch (ext->oe_state) { - case OES_INV: - if (ext->oe_nr_pages > 0 || !list_empty(&ext->oe_pages)) - rc = 35; - else - rc = 0; - goto out; - case OES_ACTIVE: - if (atomic_read(&ext->oe_users) == 0) { - rc = 40; - goto out; - } - if (ext->oe_hp) { - rc = 50; - goto out; - } - if (ext->oe_fsync_wait && !ext->oe_urgent) { - rc = 55; - goto out; - } - break; - case OES_CACHE: - if (ext->oe_grants == 0) { - rc = 60; - goto out; - } - if (ext->oe_fsync_wait && !ext->oe_urgent && !ext->oe_hp) { - rc = 65; - goto out; - } - /* fall through */ - default: - if (atomic_read(&ext->oe_users) > 0) { - rc = 70; - 
goto out; - } - } - - if (ext->oe_max_end < ext->oe_end || ext->oe_end < ext->oe_start) { - rc = 80; - goto out; - } - - if (ext->oe_sync && ext->oe_grants > 0) { - rc = 90; - goto out; - } - - if (ext->oe_dlmlock && !ldlm_is_failed(ext->oe_dlmlock)) { - struct ldlm_extent *extent; - - extent = &ext->oe_dlmlock->l_policy_data.l_extent; - if (!(extent->start <= cl_offset(osc2cl(obj), ext->oe_start) && - extent->end >= cl_offset(osc2cl(obj), ext->oe_max_end))) { - rc = 100; - goto out; - } - - if (!(ext->oe_dlmlock->l_granted_mode & (LCK_PW | LCK_GROUP))) { - rc = 102; - goto out; - } - } - - if (ext->oe_nr_pages > ext->oe_mppr) { - rc = 105; - goto out; - } - - /* Do not verify page list if extent is in RPC. This is because an - * in-RPC extent is supposed to be exclusively accessible w/o lock. - */ - if (ext->oe_state > OES_CACHE) { - rc = 0; - goto out; - } - - if (!extent_debug) { - rc = 0; - goto out; - } - - page_count = 0; - list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { - pgoff_t index = osc_index(oap2osc(oap)); - ++page_count; - if (index > ext->oe_end || index < ext->oe_start) { - rc = 110; - goto out; - } - } - if (page_count != ext->oe_nr_pages) { - rc = 120; - goto out; - } - -out: - if (rc != 0) - OSC_EXTENT_DUMP(D_ERROR, ext, - "%s:%d sanity check %p failed with rc = %d\n", - func, line, ext, rc); - return rc; -} - -#define sanity_check_nolock(ext) \ - osc_extent_sanity_check0(ext, __func__, __LINE__) - -#define sanity_check(ext) ({ \ - int __res; \ - osc_object_lock((ext)->oe_obj); \ - __res = sanity_check_nolock(ext); \ - osc_object_unlock((ext)->oe_obj); \ - __res; \ -}) - -/** - * sanity check - to make sure there is no overlapped extent in the tree. 
- */ -static int osc_extent_is_overlapped(struct osc_object *obj, - struct osc_extent *ext) -{ - struct osc_extent *tmp; - - LASSERT(osc_object_is_locked(obj)); - - if (!extent_debug) - return 0; - - for (tmp = first_extent(obj); tmp; tmp = next_extent(tmp)) { - if (tmp == ext) - continue; - if (tmp->oe_end >= ext->oe_start && - tmp->oe_start <= ext->oe_end) - return 1; - } - return 0; -} - -static void osc_extent_state_set(struct osc_extent *ext, int state) -{ - LASSERT(osc_object_is_locked(ext->oe_obj)); - LASSERT(state >= OES_INV && state < OES_STATE_MAX); - - /* Never try to sanity check a state changing extent :-) */ - /* LASSERT(sanity_check_nolock(ext) == 0); */ - - /* TODO: validate the state machine */ - ext->oe_state = state; - wake_up_all(&ext->oe_waitq); -} - -static struct osc_extent *osc_extent_alloc(struct osc_object *obj) -{ - struct osc_extent *ext; - - ext = kmem_cache_zalloc(osc_extent_kmem, GFP_NOFS); - if (!ext) - return NULL; - - RB_CLEAR_NODE(&ext->oe_node); - ext->oe_obj = obj; - cl_object_get(osc2cl(obj)); - atomic_set(&ext->oe_refc, 1); - atomic_set(&ext->oe_users, 0); - INIT_LIST_HEAD(&ext->oe_link); - ext->oe_state = OES_INV; - INIT_LIST_HEAD(&ext->oe_pages); - init_waitqueue_head(&ext->oe_waitq); - ext->oe_dlmlock = NULL; - - return ext; -} - -static void osc_extent_free(struct osc_extent *ext) -{ - kmem_cache_free(osc_extent_kmem, ext); -} - -static struct osc_extent *osc_extent_get(struct osc_extent *ext) -{ - LASSERT(atomic_read(&ext->oe_refc) >= 0); - atomic_inc(&ext->oe_refc); - return ext; -} - -static void osc_extent_put(const struct lu_env *env, struct osc_extent *ext) -{ - LASSERT(atomic_read(&ext->oe_refc) > 0); - if (atomic_dec_and_test(&ext->oe_refc)) { - LASSERT(list_empty(&ext->oe_link)); - LASSERT(atomic_read(&ext->oe_users) == 0); - LASSERT(ext->oe_state == OES_INV); - LASSERT(!ext->oe_intree); - - if (ext->oe_dlmlock) { - lu_ref_add(&ext->oe_dlmlock->l_reference, - "osc_extent", ext); - LDLM_LOCK_PUT(ext->oe_dlmlock); - 
ext->oe_dlmlock = NULL; - } - cl_object_put(env, osc2cl(ext->oe_obj)); - osc_extent_free(ext); - } -} - -/** - * osc_extent_put_trust() is a special version of osc_extent_put() when - * it's known that the caller is not the last user. This is to address the - * problem of lacking of lu_env ;-). - */ -static void osc_extent_put_trust(struct osc_extent *ext) -{ - LASSERT(atomic_read(&ext->oe_refc) > 1); - LASSERT(osc_object_is_locked(ext->oe_obj)); - atomic_dec(&ext->oe_refc); -} - -/** - * Return the extent which includes pgoff @index, or return the greatest - * previous extent in the tree. - */ -static struct osc_extent *osc_extent_search(struct osc_object *obj, - pgoff_t index) -{ - struct rb_node *n = obj->oo_root.rb_node; - struct osc_extent *tmp, *p = NULL; - - LASSERT(osc_object_is_locked(obj)); - while (n) { - tmp = rb_extent(n); - if (index < tmp->oe_start) { - n = n->rb_left; - } else if (index > tmp->oe_end) { - p = rb_extent(n); - n = n->rb_right; - } else { - return tmp; - } - } - return p; -} - -/* - * Return the extent covering @index, otherwise return NULL. - * caller must have held object lock. - */ -static struct osc_extent *osc_extent_lookup(struct osc_object *obj, - pgoff_t index) -{ - struct osc_extent *ext; - - ext = osc_extent_search(obj, index); - if (ext && ext->oe_start <= index && index <= ext->oe_end) - return osc_extent_get(ext); - return NULL; -} - -/* caller must have held object lock. 
*/ -static void osc_extent_insert(struct osc_object *obj, struct osc_extent *ext) -{ - struct rb_node **n = &obj->oo_root.rb_node; - struct rb_node *parent = NULL; - struct osc_extent *tmp; - - LASSERT(ext->oe_intree == 0); - LASSERT(ext->oe_obj == obj); - LASSERT(osc_object_is_locked(obj)); - while (*n) { - tmp = rb_extent(*n); - parent = *n; - - if (ext->oe_end < tmp->oe_start) - n = &(*n)->rb_left; - else if (ext->oe_start > tmp->oe_end) - n = &(*n)->rb_right; - else - EASSERTF(0, tmp, EXTSTR "\n", EXTPARA(ext)); - } - rb_link_node(&ext->oe_node, parent, n); - rb_insert_color(&ext->oe_node, &obj->oo_root); - osc_extent_get(ext); - ext->oe_intree = 1; -} - -/* caller must have held object lock. */ -static void osc_extent_erase(struct osc_extent *ext) -{ - struct osc_object *obj = ext->oe_obj; - - LASSERT(osc_object_is_locked(obj)); - if (ext->oe_intree) { - rb_erase(&ext->oe_node, &obj->oo_root); - ext->oe_intree = 0; - /* rbtree held a refcount */ - osc_extent_put_trust(ext); - } -} - -static struct osc_extent *osc_extent_hold(struct osc_extent *ext) -{ - struct osc_object *obj = ext->oe_obj; - - LASSERT(osc_object_is_locked(obj)); - LASSERT(ext->oe_state == OES_ACTIVE || ext->oe_state == OES_CACHE); - if (ext->oe_state == OES_CACHE) { - osc_extent_state_set(ext, OES_ACTIVE); - osc_update_pending(obj, OBD_BRW_WRITE, -ext->oe_nr_pages); - } - atomic_inc(&ext->oe_users); - list_del_init(&ext->oe_link); - return osc_extent_get(ext); -} - -static void __osc_extent_remove(struct osc_extent *ext) -{ - LASSERT(osc_object_is_locked(ext->oe_obj)); - LASSERT(list_empty(&ext->oe_pages)); - osc_extent_erase(ext); - list_del_init(&ext->oe_link); - osc_extent_state_set(ext, OES_INV); - OSC_EXTENT_DUMP(D_CACHE, ext, "destroyed.\n"); -} - -static void osc_extent_remove(struct osc_extent *ext) -{ - struct osc_object *obj = ext->oe_obj; - - osc_object_lock(obj); - __osc_extent_remove(ext); - osc_object_unlock(obj); -} - -/** - * This function is used to merge extents to get 
better performance. It checks - * if @cur and @victim are contiguous at chunk level. - */ -static int osc_extent_merge(const struct lu_env *env, struct osc_extent *cur, - struct osc_extent *victim) -{ - struct osc_object *obj = cur->oe_obj; - pgoff_t chunk_start; - pgoff_t chunk_end; - int ppc_bits; - - LASSERT(cur->oe_state == OES_CACHE); - LASSERT(osc_object_is_locked(obj)); - if (!victim) - return -EINVAL; - - if (victim->oe_state != OES_CACHE || victim->oe_fsync_wait) - return -EBUSY; - - if (cur->oe_max_end != victim->oe_max_end) - return -ERANGE; - - LASSERT(cur->oe_dlmlock == victim->oe_dlmlock); - ppc_bits = osc_cli(obj)->cl_chunkbits - PAGE_SHIFT; - chunk_start = cur->oe_start >> ppc_bits; - chunk_end = cur->oe_end >> ppc_bits; - if (chunk_start != (victim->oe_end >> ppc_bits) + 1 && - chunk_end + 1 != victim->oe_start >> ppc_bits) - return -ERANGE; - - OSC_EXTENT_DUMP(D_CACHE, victim, "will be merged by %p.\n", cur); - - cur->oe_start = min(cur->oe_start, victim->oe_start); - cur->oe_end = max(cur->oe_end, victim->oe_end); - cur->oe_grants += victim->oe_grants; - cur->oe_nr_pages += victim->oe_nr_pages; - /* only the following bits are needed to merge */ - cur->oe_urgent |= victim->oe_urgent; - cur->oe_memalloc |= victim->oe_memalloc; - list_splice_init(&victim->oe_pages, &cur->oe_pages); - list_del_init(&victim->oe_link); - victim->oe_nr_pages = 0; - - osc_extent_get(victim); - __osc_extent_remove(victim); - osc_extent_put(env, victim); - - OSC_EXTENT_DUMP(D_CACHE, cur, "after merging %p.\n", victim); - return 0; -} - -/** - * Drop user count of osc_extent, and unplug IO asynchronously. 
- */ -void osc_extent_release(const struct lu_env *env, struct osc_extent *ext) -{ - struct osc_object *obj = ext->oe_obj; - - LASSERT(atomic_read(&ext->oe_users) > 0); - LASSERT(sanity_check(ext) == 0); - LASSERT(ext->oe_grants > 0); - - if (atomic_dec_and_lock(&ext->oe_users, &obj->oo_lock)) { - LASSERT(ext->oe_state == OES_ACTIVE); - if (ext->oe_trunc_pending) { - /* a truncate process is waiting for this extent. - * This may happen due to a race, check - * osc_cache_truncate_start(). - */ - osc_extent_state_set(ext, OES_TRUNC); - ext->oe_trunc_pending = 0; - } else { - osc_extent_state_set(ext, OES_CACHE); - osc_update_pending(obj, OBD_BRW_WRITE, - ext->oe_nr_pages); - - /* try to merge the previous and next extent. */ - osc_extent_merge(env, ext, prev_extent(ext)); - osc_extent_merge(env, ext, next_extent(ext)); - - if (ext->oe_urgent) - list_move_tail(&ext->oe_link, - &obj->oo_urgent_exts); - } - osc_object_unlock(obj); - - osc_io_unplug_async(env, osc_cli(obj), obj); - } - osc_extent_put(env, ext); -} - -static inline int overlapped(struct osc_extent *ex1, struct osc_extent *ex2) -{ - return !(ex1->oe_end < ex2->oe_start || ex2->oe_end < ex1->oe_start); -} - -/** - * Find or create an extent which includes @index, core function to manage - * extent tree. 
 *
 * A candidate extent (@cur) covering the chunk of @index is allocated first.
 * It is clipped to the range of the write lock found in the env's osc io and
 * to the end of the RPC slot @index falls into.  The tree is then scanned
 * under the object lock:
 *  - an extent overlapping @cur that is still cacheable is held and returned;
 *  - an overlapping extent that is past OES_CACHE or has an fsync waiter is
 *    waited on until it reaches OES_INV, then the scan restarts;
 *  - an OES_CACHE extent under the same lock and in the same RPC slot that
 *    is exactly one chunk before or after @cur is grown by one chunk;
 *  - otherwise @cur itself is inserted as a new extent.
 *
 * \param grants  in/out: bytes of grant reserved by the caller; reduced by
 *                what the returned extent consumed (one chunk for a merge,
 *                one chunk plus cl_extent_tax for a new extent).
 *
 * \retval held extent (user count and reference taken) on success
 * \retval ERR_PTR(-ENOMEM) if the candidate extent cannot be allocated
 * \retval ERR_PTR(rc) if waiting for a conflicting extent failed with rc
 */
static struct osc_extent *osc_extent_find(const struct lu_env *env,
					  struct osc_object *obj, pgoff_t index,
					  unsigned int *grants)
{
	struct client_obd *cli = osc_cli(obj);
	struct osc_lock *olck;
	struct cl_lock_descr *descr;
	struct osc_extent *cur;
	struct osc_extent *ext;
	struct osc_extent *conflict = NULL;
	struct osc_extent *found = NULL;
	pgoff_t chunk;
	pgoff_t max_end;
	unsigned int max_pages; /* max_pages_per_rpc */
	unsigned int chunksize;
	int ppc_bits; /* pages per chunk bits */
	pgoff_t chunk_mask;
	int rc;

	cur = osc_extent_alloc(obj);
	if (!cur)
		return ERR_PTR(-ENOMEM);

	olck = osc_env_io(env)->oi_write_osclock;
	LASSERTF(olck, "page %lu is not covered by lock\n", index);
	LASSERT(olck->ols_state == OLS_GRANTED);

	descr = &olck->ols_cl.cls_lock->cll_descr;
	LASSERT(descr->cld_mode >= CLM_WRITE);

	LASSERT(cli->cl_chunkbits >= PAGE_SHIFT);
	ppc_bits = cli->cl_chunkbits - PAGE_SHIFT;
	/* clears the page-within-chunk bits of a page index */
	chunk_mask = ~((1 << ppc_bits) - 1);
	chunksize = 1 << cli->cl_chunkbits;
	chunk = index >> ppc_bits;

	/* align end to rpc edge, rpc size may not be a power 2 integer. */
	max_pages = cli->cl_max_pages_per_rpc;
	LASSERT((max_pages & ~chunk_mask) == 0);
	max_end = index - (index % max_pages) + max_pages - 1;
	max_end = min_t(pgoff_t, max_end, descr->cld_end);

	/* initialize new extent by parameters so far */
	cur->oe_max_end = max_end;
	cur->oe_start = index & chunk_mask;
	/* last page index of the chunk containing @index */
	cur->oe_end = ((index + ~chunk_mask + 1) & chunk_mask) - 1;
	if (cur->oe_start < descr->cld_start)
		cur->oe_start = descr->cld_start;
	if (cur->oe_end > max_end)
		cur->oe_end = max_end;
	cur->oe_grants = 0;
	cur->oe_mppr = max_pages;
	if (olck->ols_dlmlock) {
		LASSERT(olck->ols_hold);
		/* lock reference owned by the extent until its last put */
		cur->oe_dlmlock = LDLM_LOCK_GET(olck->ols_dlmlock);
		lu_ref_add(&olck->ols_dlmlock->l_reference, "osc_extent", cur);
	}

	/* grants has been allocated by caller */
	LASSERTF(*grants >= chunksize + cli->cl_extent_tax,
		 "%u/%u/%u.\n", *grants, chunksize, cli->cl_extent_tax);
	LASSERTF((max_end - cur->oe_start) < max_pages, EXTSTR "\n",
		 EXTPARA(cur));

restart:
	osc_object_lock(obj);
	/* start from the extent covering cur->oe_start, or the closest one
	 * before it, or the first extent of the tree
	 */
	ext = osc_extent_search(obj, cur->oe_start);
	if (!ext)
		ext = first_extent(obj);
	while (ext) {
		pgoff_t ext_chk_start = ext->oe_start >> ppc_bits;
		pgoff_t ext_chk_end = ext->oe_end >> ppc_bits;

		LASSERT(sanity_check_nolock(ext) == 0);
		/* ext ends more than one chunk before @index: stop scanning */
		if (chunk > ext_chk_end + 1)
			break;

		/* if covering by different locks, no chance to match */
		if (olck->ols_dlmlock != ext->oe_dlmlock) {
			EASSERTF(!overlapped(ext, cur), ext,
				 EXTSTR "\n", EXTPARA(cur));

			ext = next_extent(ext);
			continue;
		}

		/* discontiguous chunks? */
		if (chunk + 1 < ext_chk_start) {
			ext = next_extent(ext);
			continue;
		}

		/* ok, from now on, ext and cur have these attrs:
		 * 1. covered by the same lock
		 * 2. contiguous at chunk level or overlapping.
		 */

		if (overlapped(ext, cur)) {
			/* cur is the minimum unit, so overlapping means
			 * full contain.
			 */
			EASSERTF((ext->oe_start <= cur->oe_start &&
				  ext->oe_end >= cur->oe_end),
				 ext, EXTSTR "\n", EXTPARA(cur));

			if (ext->oe_state > OES_CACHE || ext->oe_fsync_wait) {
				/* for simplicity, we wait for this extent to
				 * finish before going forward.
				 */
				conflict = osc_extent_get(ext);
				break;
			}

			found = osc_extent_hold(ext);
			break;
		}

		/* non-overlapped extent */
		if (ext->oe_state != OES_CACHE || ext->oe_fsync_wait) {
			/* we can't do anything for a non OES_CACHE extent, or
			 * if there is someone waiting for this extent to be
			 * flushed, try next one.
			 */
			ext = next_extent(ext);
			continue;
		}

		/* check if they belong to the same rpc slot before trying to
		 * merge. the extents are not overlapped and contiguous at
		 * chunk level to get here.
		 */
		if (ext->oe_max_end != max_end) {
			/* if they don't belong to the same RPC slot or
			 * max_pages_per_rpc has ever changed, do not merge.
			 */
			ext = next_extent(ext);
			continue;
		}

		/* it's required that an extent must be contiguous at chunk
		 * level so that we know the whole extent is covered by grant
		 * (the pages in the extent are NOT required to be contiguous).
		 * Otherwise, it would be too difficult to know which
		 * chunks have grants allocated.
		 */

		/* try to do front merge - extend ext's start */
		if (chunk + 1 == ext_chk_start) {
			/* ext must be chunk size aligned */
			EASSERT((ext->oe_start & ~chunk_mask) == 0, ext);

			/* pull ext's start back to cover cur */
			ext->oe_start = cur->oe_start;
			ext->oe_grants += chunksize;
			LASSERT(*grants >= chunksize);
			*grants -= chunksize;

			found = osc_extent_hold(ext);
		} else if (chunk == ext_chk_end + 1) {
			/* rear merge */
			ext->oe_end = cur->oe_end;
			ext->oe_grants += chunksize;
			LASSERT(*grants >= chunksize);
			*grants -= chunksize;

			/* try to merge with the next one because we just fill
			 * in a gap
			 */
			if (osc_extent_merge(env, ext, next_extent(ext)) == 0)
				/* we can save extent tax from next extent */
				*grants += cli->cl_extent_tax;

			found = osc_extent_hold(ext);
		}
		if (found)
			break;

		ext = next_extent(ext);
	}

	osc_extent_tree_dump(D_CACHE, obj);
	if (found) {
		LASSERT(!conflict);
		if (!IS_ERR(found)) {
			LASSERT(found->oe_dlmlock == cur->oe_dlmlock);
			OSC_EXTENT_DUMP(D_CACHE, found,
					"found caching ext for %lu.\n", index);
		}
	} else if (!conflict) {
		/* create a new extent */
		EASSERT(osc_extent_is_overlapped(obj, cur) == 0, cur);
		cur->oe_grants = chunksize + cli->cl_extent_tax;
		LASSERT(*grants >= cur->oe_grants);
		*grants -= cur->oe_grants;

		/* osc_extent_hold() only accepts OES_CACHE or OES_ACTIVE, so
		 * mark cur cached first; hold then moves it to OES_ACTIVE
		 */
		cur->oe_state = OES_CACHE;
		found = osc_extent_hold(cur);
		osc_extent_insert(obj, cur);
		OSC_EXTENT_DUMP(D_CACHE, cur, "add into tree %lu/%lu.\n",
				index, descr->cld_end);
	}
	osc_object_unlock(obj);

	if (conflict) {
		LASSERT(!found);

		/* waiting for IO to finish. Please notice that it's impossible
		 * to be an OES_TRUNC extent.
		 */
		rc = osc_extent_wait(env, conflict, OES_INV);
		osc_extent_put(env, conflict);
		conflict = NULL;
		if (rc < 0) {
			found = ERR_PTR(rc);
			goto out;
		}

		goto restart;
	}

out:
	/* drop the allocation reference; cur is freed here unless it was
	 * inserted into the tree above
	 */
	osc_extent_put(env, cur);
	return found;
}

/**
 * Called when IO is finished to an extent.
- */ -int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext, - int sent, int rc) -{ - struct client_obd *cli = osc_cli(ext->oe_obj); - struct osc_async_page *oap; - struct osc_async_page *tmp; - int nr_pages = ext->oe_nr_pages; - int lost_grant = 0; - int blocksize = cli->cl_import->imp_obd->obd_osfs.os_bsize ? : 4096; - __u64 last_off = 0; - int last_count = -1; - - OSC_EXTENT_DUMP(D_CACHE, ext, "extent finished.\n"); - - ext->oe_rc = rc ?: ext->oe_nr_pages; - EASSERT(ergo(rc == 0, ext->oe_state == OES_RPC), ext); - - osc_lru_add_batch(cli, &ext->oe_pages); - list_for_each_entry_safe(oap, tmp, &ext->oe_pages, oap_pending_item) { - list_del_init(&oap->oap_rpc_item); - list_del_init(&oap->oap_pending_item); - if (last_off <= oap->oap_obj_off) { - last_off = oap->oap_obj_off; - last_count = oap->oap_count; - } - - --ext->oe_nr_pages; - osc_ap_completion(env, cli, oap, sent, rc); - } - EASSERT(ext->oe_nr_pages == 0, ext); - - if (!sent) { - lost_grant = ext->oe_grants; - } else if (blocksize < PAGE_SIZE && - last_count != PAGE_SIZE) { - /* For short writes we shouldn't count parts of pages that - * span a whole chunk on the OST side, or our accounting goes - * wrong. Should match the code in filter_grant_check. - */ - int offset = last_off & ~PAGE_MASK; - int count = last_count + (offset & (blocksize - 1)); - int end = (offset + last_count) & (blocksize - 1); - - if (end) - count += blocksize - end; - - lost_grant = PAGE_SIZE - count; - } - if (ext->oe_grants > 0) - osc_free_grant(cli, nr_pages, lost_grant); - - osc_extent_remove(ext); - /* put the refcount for RPC */ - osc_extent_put(env, ext); - return 0; -} - -static int extent_wait_cb(struct osc_extent *ext, enum osc_extent_state state) -{ - int ret; - - osc_object_lock(ext->oe_obj); - ret = ext->oe_state == state; - osc_object_unlock(ext->oe_obj); - - return ret; -} - -/** - * Wait for the extent's state to become @state. 
- */ -static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext, - enum osc_extent_state state) -{ - struct osc_object *obj = ext->oe_obj; - int rc = 0; - - osc_object_lock(obj); - LASSERT(sanity_check_nolock(ext) == 0); - /* `Kick' this extent only if the caller is waiting for it to be - * written out. - */ - if (state == OES_INV && !ext->oe_urgent && !ext->oe_hp && - !ext->oe_trunc_pending) { - if (ext->oe_state == OES_ACTIVE) { - ext->oe_urgent = 1; - } else if (ext->oe_state == OES_CACHE) { - ext->oe_urgent = 1; - osc_extent_hold(ext); - rc = 1; - } - } - osc_object_unlock(obj); - if (rc == 1) - osc_extent_release(env, ext); - - /* wait for the extent until its state becomes @state */ - rc = wait_event_idle_timeout(ext->oe_waitq, - extent_wait_cb(ext, state), 600 * HZ); - if (rc == 0) { - OSC_EXTENT_DUMP(D_ERROR, ext, - "%s: wait ext to %u timedout, recovery in progress?\n", - cli_name(osc_cli(obj)), state); - - wait_event_idle(ext->oe_waitq, extent_wait_cb(ext, state)); - } - if (ext->oe_rc < 0) - rc = ext->oe_rc; - else - rc = 0; - return rc; -} - -/** - * Discard pages with index greater than @size. If @ext is overlapped with - * @size, then partial truncate happens. - */ -static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index, - bool partial) -{ - struct lu_env *env; - struct cl_io *io; - struct osc_object *obj = ext->oe_obj; - struct client_obd *cli = osc_cli(obj); - struct osc_async_page *oap; - struct osc_async_page *tmp; - int pages_in_chunk = 0; - int ppc_bits = cli->cl_chunkbits - PAGE_SHIFT; - __u64 trunc_chunk = trunc_index >> ppc_bits; - int grants = 0; - int nr_pages = 0; - int rc = 0; - u16 refcheck; - - LASSERT(sanity_check(ext) == 0); - EASSERT(ext->oe_state == OES_TRUNC, ext); - EASSERT(!ext->oe_urgent, ext); - - /* Request new lu_env. - * We can't use that env from osc_cache_truncate_start() because - * it's from lov_io_sub and not fully initialized. 
- */ - env = cl_env_get(&refcheck); - io = &osc_env_info(env)->oti_io; - io->ci_obj = cl_object_top(osc2cl(obj)); - io->ci_ignore_layout = 1; - rc = cl_io_init(env, io, CIT_MISC, io->ci_obj); - if (rc < 0) - goto out; - - /* discard all pages with index greater then trunc_index */ - list_for_each_entry_safe(oap, tmp, &ext->oe_pages, oap_pending_item) { - pgoff_t index = osc_index(oap2osc(oap)); - struct cl_page *page = oap2cl_page(oap); - - LASSERT(list_empty(&oap->oap_rpc_item)); - - /* only discard the pages with their index greater than - * trunc_index, and ... - */ - if (index < trunc_index || - (index == trunc_index && partial)) { - /* accounting how many pages remaining in the chunk - * so that we can calculate grants correctly. */ - if (index >> ppc_bits == trunc_chunk) - ++pages_in_chunk; - continue; - } - - list_del_init(&oap->oap_pending_item); - - cl_page_get(page); - lu_ref_add(&page->cp_reference, "truncate", current); - - if (cl_page_own(env, io, page) == 0) { - cl_page_discard(env, io, page); - cl_page_disown(env, io, page); - } else { - LASSERT(page->cp_state == CPS_FREEING); - LASSERT(0); - } - - lu_ref_del(&page->cp_reference, "truncate", current); - cl_page_put(env, page); - - --ext->oe_nr_pages; - ++nr_pages; - } - EASSERTF(ergo(ext->oe_start >= trunc_index + !!partial, - ext->oe_nr_pages == 0), - ext, "trunc_index %lu, partial %d\n", trunc_index, partial); - - osc_object_lock(obj); - if (ext->oe_nr_pages == 0) { - LASSERT(pages_in_chunk == 0); - grants = ext->oe_grants; - ext->oe_grants = 0; - } else { /* calculate how many grants we can free */ - int chunks = (ext->oe_end >> ppc_bits) - trunc_chunk; - pgoff_t last_index; - - /* if there is no pages in this chunk, we can also free grants - * for the last chunk - */ - if (pages_in_chunk == 0) { - /* if this is the 1st chunk and no pages in this chunk, - * ext->oe_nr_pages must be zero, so we should be in - * the other if-clause. 
- */ - LASSERT(trunc_chunk > 0); - --trunc_chunk; - ++chunks; - } - - /* this is what we can free from this extent */ - grants = chunks << cli->cl_chunkbits; - ext->oe_grants -= grants; - last_index = ((trunc_chunk + 1) << ppc_bits) - 1; - ext->oe_end = min(last_index, ext->oe_max_end); - LASSERT(ext->oe_end >= ext->oe_start); - LASSERT(ext->oe_grants > 0); - } - osc_object_unlock(obj); - - if (grants > 0 || nr_pages > 0) - osc_free_grant(cli, nr_pages, grants); - -out: - cl_io_fini(env, io); - cl_env_put(env, &refcheck); - return rc; -} - -/** - * This function is used to make the extent prepared for transfer. - * A race with flushing page - ll_writepage() has to be handled cautiously. - */ -static int osc_extent_make_ready(const struct lu_env *env, - struct osc_extent *ext) -{ - struct osc_async_page *oap; - struct osc_async_page *last = NULL; - struct osc_object *obj = ext->oe_obj; - unsigned int page_count = 0; - int rc; - - /* we're going to grab page lock, so object lock must not be taken. */ - LASSERT(sanity_check(ext) == 0); - /* in locking state, any process should not touch this extent. */ - EASSERT(ext->oe_state == OES_LOCKING, ext); - EASSERT(ext->oe_owner, ext); - - OSC_EXTENT_DUMP(D_CACHE, ext, "make ready\n"); - - list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { - ++page_count; - if (!last || last->oap_obj_off < oap->oap_obj_off) - last = oap; - - /* checking ASYNC_READY is race safe */ - if ((oap->oap_async_flags & ASYNC_READY) != 0) - continue; - - rc = osc_make_ready(env, oap, OBD_BRW_WRITE); - switch (rc) { - case 0: - spin_lock(&oap->oap_lock); - oap->oap_async_flags |= ASYNC_READY; - spin_unlock(&oap->oap_lock); - break; - case -EALREADY: - LASSERT((oap->oap_async_flags & ASYNC_READY) != 0); - break; - default: - LASSERTF(0, "unknown return code: %d\n", rc); - } - } - - LASSERT(page_count == ext->oe_nr_pages); - LASSERT(last); - /* the last page is the only one we need to refresh its count by - * the size of file. 
- */ - if (!(last->oap_async_flags & ASYNC_COUNT_STABLE)) { - int last_oap_count = osc_refresh_count(env, last, OBD_BRW_WRITE); - - LASSERT(last_oap_count > 0); - LASSERT(last->oap_page_off + last_oap_count <= PAGE_SIZE); - last->oap_count = last_oap_count; - spin_lock(&last->oap_lock); - last->oap_async_flags |= ASYNC_COUNT_STABLE; - spin_unlock(&last->oap_lock); - } - - /* for the rest of pages, we don't need to call osf_refresh_count() - * because it's known they are not the last page - */ - list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { - if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE)) { - oap->oap_count = PAGE_SIZE - oap->oap_page_off; - spin_lock(&last->oap_lock); - oap->oap_async_flags |= ASYNC_COUNT_STABLE; - spin_unlock(&last->oap_lock); - } - } - - osc_object_lock(obj); - osc_extent_state_set(ext, OES_RPC); - osc_object_unlock(obj); - /* get a refcount for RPC. */ - osc_extent_get(ext); - - return 0; -} - -/** - * Quick and simple version of osc_extent_find(). This function is frequently - * called to expand the extent for the same IO. To expand the extent, the - * page index must be in the same or next chunk of ext->oe_end. 
- */ -static int osc_extent_expand(struct osc_extent *ext, pgoff_t index, - unsigned int *grants) -{ - struct osc_object *obj = ext->oe_obj; - struct client_obd *cli = osc_cli(obj); - struct osc_extent *next; - int ppc_bits = cli->cl_chunkbits - PAGE_SHIFT; - pgoff_t chunk = index >> ppc_bits; - pgoff_t end_chunk; - pgoff_t end_index; - unsigned int chunksize = 1 << cli->cl_chunkbits; - int rc = 0; - - LASSERT(ext->oe_max_end >= index && ext->oe_start <= index); - osc_object_lock(obj); - LASSERT(sanity_check_nolock(ext) == 0); - end_chunk = ext->oe_end >> ppc_bits; - if (chunk > end_chunk + 1) { - rc = -ERANGE; - goto out; - } - - if (end_chunk >= chunk) { - rc = 0; - goto out; - } - - LASSERT(end_chunk + 1 == chunk); - /* try to expand this extent to cover @index */ - end_index = min(ext->oe_max_end, ((chunk + 1) << ppc_bits) - 1); - - next = next_extent(ext); - if (next && next->oe_start <= end_index) { - /* complex mode - overlapped with the next extent, - * this case will be handled by osc_extent_find() - */ - rc = -EAGAIN; - goto out; - } - - ext->oe_end = end_index; - ext->oe_grants += chunksize; - LASSERT(*grants >= chunksize); - *grants -= chunksize; - EASSERTF(osc_extent_is_overlapped(obj, ext) == 0, ext, - "overlapped after expanding for %lu.\n", index); - -out: - osc_object_unlock(obj); - return rc; -} - -static void osc_extent_tree_dump0(int level, struct osc_object *obj, - const char *func, int line) -{ - struct osc_extent *ext; - int cnt; - - CDEBUG(level, "Dump object %p extents at %s:%d, mppr: %u.\n", - obj, func, line, osc_cli(obj)->cl_max_pages_per_rpc); - - /* osc_object_lock(obj); */ - cnt = 1; - for (ext = first_extent(obj); ext; ext = next_extent(ext)) - OSC_EXTENT_DUMP(level, ext, "in tree %d.\n", cnt++); - - cnt = 1; - list_for_each_entry(ext, &obj->oo_hp_exts, oe_link) - OSC_EXTENT_DUMP(level, ext, "hp %d.\n", cnt++); - - cnt = 1; - list_for_each_entry(ext, &obj->oo_urgent_exts, oe_link) - OSC_EXTENT_DUMP(level, ext, "urgent %d.\n", cnt++); 
- - cnt = 1; - list_for_each_entry(ext, &obj->oo_reading_exts, oe_link) - OSC_EXTENT_DUMP(level, ext, "reading %d.\n", cnt++); - /* osc_object_unlock(obj); */ -} - -/* ------------------ osc extent end ------------------ */ - -static inline int osc_is_ready(struct osc_object *osc) -{ - return !list_empty(&osc->oo_ready_item) || - !list_empty(&osc->oo_hp_ready_item); -} - -#define OSC_IO_DEBUG(OSC, STR, args...) \ - CDEBUG(D_CACHE, "obj %p ready %d|%c|%c wr %d|%c|%c rd %d|%c " STR, \ - (OSC), osc_is_ready(OSC), \ - list_empty_marker(&(OSC)->oo_hp_ready_item), \ - list_empty_marker(&(OSC)->oo_ready_item), \ - atomic_read(&(OSC)->oo_nr_writes), \ - list_empty_marker(&(OSC)->oo_hp_exts), \ - list_empty_marker(&(OSC)->oo_urgent_exts), \ - atomic_read(&(OSC)->oo_nr_reads), \ - list_empty_marker(&(OSC)->oo_reading_exts), \ - ##args) - -static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap, - int cmd) -{ - struct osc_page *opg = oap2osc_page(oap); - struct cl_page *page = oap2cl_page(oap); - int result; - - LASSERT(cmd == OBD_BRW_WRITE); /* no cached reads */ - - result = cl_page_make_ready(env, page, CRT_WRITE); - if (result == 0) - opg->ops_submit_time = cfs_time_current(); - return result; -} - -static int osc_refresh_count(const struct lu_env *env, - struct osc_async_page *oap, int cmd) -{ - struct osc_page *opg = oap2osc_page(oap); - pgoff_t index = osc_index(oap2osc(oap)); - struct cl_object *obj; - struct cl_attr *attr = &osc_env_info(env)->oti_attr; - - int result; - loff_t kms; - - /* readpage queues with _COUNT_STABLE, shouldn't get here. 
*/ - LASSERT(!(cmd & OBD_BRW_READ)); - obj = opg->ops_cl.cpl_obj; - - cl_object_attr_lock(obj); - result = cl_object_attr_get(env, obj, attr); - cl_object_attr_unlock(obj); - if (result < 0) - return result; - kms = attr->cat_kms; - if (cl_offset(obj, index) >= kms) - /* catch race with truncate */ - return 0; - else if (cl_offset(obj, index + 1) > kms) - /* catch sub-page write at end of file */ - return kms % PAGE_SIZE; - else - return PAGE_SIZE; -} - -static int osc_completion(const struct lu_env *env, struct osc_async_page *oap, - int cmd, int rc) -{ - struct osc_page *opg = oap2osc_page(oap); - struct cl_page *page = oap2cl_page(oap); - enum cl_req_type crt; - int srvlock; - - cmd &= ~OBD_BRW_NOQUOTA; - LASSERTF(equi(page->cp_state == CPS_PAGEIN, cmd == OBD_BRW_READ), - "cp_state:%u, cmd:%d\n", page->cp_state, cmd); - LASSERTF(equi(page->cp_state == CPS_PAGEOUT, cmd == OBD_BRW_WRITE), - "cp_state:%u, cmd:%d\n", page->cp_state, cmd); - LASSERT(opg->ops_transfer_pinned); - - crt = cmd == OBD_BRW_READ ? CRT_READ : CRT_WRITE; - /* Clear opg->ops_transfer_pinned before VM lock is released. */ - opg->ops_transfer_pinned = 0; - - opg->ops_submit_time = 0; - srvlock = oap->oap_brw_flags & OBD_BRW_SRVLOCK; - - /* statistic */ - if (rc == 0 && srvlock) { - struct lu_device *ld = opg->ops_cl.cpl_obj->co_lu.lo_dev; - struct osc_stats *stats = &lu2osc_dev(ld)->od_stats; - size_t bytes = oap->oap_count; - - if (crt == CRT_READ) - stats->os_lockless_reads += bytes; - else - stats->os_lockless_writes += bytes; - } - - /* - * This has to be the last operation with the page, as locks are - * released in cl_page_completion() and nothing except for the - * reference counter protects page from concurrent reclaim. - */ - lu_ref_del(&page->cp_reference, "transfer", page); - - cl_page_completion(env, page, crt, rc); - cl_page_put(env, page); - - return 0; -} - -#define OSC_DUMP_GRANT(lvl, cli, fmt, args...) 
do { \ - struct client_obd *__tmp = (cli); \ - CDEBUG(lvl, "%s: grant { dirty: %lu/%lu dirty_pages: %ld/%lu " \ - "dropped: %ld avail: %ld, reserved: %ld, flight: %d }" \ - "lru {in list: %ld, left: %ld, waiters: %d }" fmt "\n", \ - cli_name(__tmp), \ - __tmp->cl_dirty_pages, __tmp->cl_dirty_max_pages, \ - atomic_long_read(&obd_dirty_pages), obd_max_dirty_pages, \ - __tmp->cl_lost_grant, __tmp->cl_avail_grant, \ - __tmp->cl_reserved_grant, __tmp->cl_w_in_flight, \ - atomic_long_read(&__tmp->cl_lru_in_list), \ - atomic_long_read(&__tmp->cl_lru_busy), \ - atomic_read(&__tmp->cl_lru_shrinkers), ##args); \ -} while (0) - -/* caller must hold loi_list_lock */ -static void osc_consume_write_grant(struct client_obd *cli, - struct brw_page *pga) -{ - assert_spin_locked(&cli->cl_loi_list_lock); - LASSERT(!(pga->flag & OBD_BRW_FROM_GRANT)); - atomic_long_inc(&obd_dirty_pages); - cli->cl_dirty_pages++; - pga->flag |= OBD_BRW_FROM_GRANT; - CDEBUG(D_CACHE, "using %lu grant credits for brw %p page %p\n", - PAGE_SIZE, pga, pga->pg); - osc_update_next_shrink(cli); -} - -/* the companion to osc_consume_write_grant, called when a brw has completed. - * must be called with the loi lock held. - */ -static void osc_release_write_grant(struct client_obd *cli, - struct brw_page *pga) -{ - assert_spin_locked(&cli->cl_loi_list_lock); - if (!(pga->flag & OBD_BRW_FROM_GRANT)) - return; - - pga->flag &= ~OBD_BRW_FROM_GRANT; - atomic_long_dec(&obd_dirty_pages); - cli->cl_dirty_pages--; - if (pga->flag & OBD_BRW_NOCACHE) { - pga->flag &= ~OBD_BRW_NOCACHE; - atomic_long_dec(&obd_dirty_transit_pages); - cli->cl_dirty_transit--; - } -} - -/** - * To avoid sleeping with object lock held, it's good for us allocate enough - * grants before entering into critical section. 
- * - * spin_lock held by caller - */ -static int osc_reserve_grant(struct client_obd *cli, unsigned int bytes) -{ - int rc = -EDQUOT; - - if (cli->cl_avail_grant >= bytes) { - cli->cl_avail_grant -= bytes; - cli->cl_reserved_grant += bytes; - rc = 0; - } - return rc; -} - -static void __osc_unreserve_grant(struct client_obd *cli, - unsigned int reserved, unsigned int unused) -{ - /* it's quite normal for us to get more grant than reserved. - * Thinking about a case that two extents merged by adding a new - * chunk, we can save one extent tax. If extent tax is greater than - * one chunk, we can save more grant by adding a new chunk - */ - cli->cl_reserved_grant -= reserved; - if (unused > reserved) { - cli->cl_avail_grant += reserved; - cli->cl_lost_grant += unused - reserved; - } else { - cli->cl_avail_grant += unused; - } -} - -static void osc_unreserve_grant(struct client_obd *cli, - unsigned int reserved, unsigned int unused) -{ - spin_lock(&cli->cl_loi_list_lock); - __osc_unreserve_grant(cli, reserved, unused); - if (unused > 0) - osc_wake_cache_waiters(cli); - spin_unlock(&cli->cl_loi_list_lock); -} - -/** - * Free grant after IO is finished or canceled. - * - * @lost_grant is used to remember how many grants we have allocated but not - * used, we should return these grants to OST. There're two cases where grants - * can be lost: - * 1. truncate; - * 2. blocksize at OST is less than PAGE_SIZE and a partial page was - * written. In this case OST may use less chunks to serve this partial - * write. OSTs don't actually know the page size on the client side. so - * clients have to calculate lost grant by the blocksize on the OST. - * See filter_grant_check() for details. 
- */ -static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages, - unsigned int lost_grant) -{ - unsigned long grant = (1 << cli->cl_chunkbits) + cli->cl_extent_tax; - - spin_lock(&cli->cl_loi_list_lock); - atomic_long_sub(nr_pages, &obd_dirty_pages); - cli->cl_dirty_pages -= nr_pages; - cli->cl_lost_grant += lost_grant; - if (cli->cl_avail_grant < grant && cli->cl_lost_grant >= grant) { - /* borrow some grant from truncate to avoid the case that - * truncate uses up all avail grant - */ - cli->cl_lost_grant -= grant; - cli->cl_avail_grant += grant; - } - osc_wake_cache_waiters(cli); - spin_unlock(&cli->cl_loi_list_lock); - CDEBUG(D_CACHE, "lost %u grant: %lu avail: %lu dirty: %lu\n", - lost_grant, cli->cl_lost_grant, - cli->cl_avail_grant, cli->cl_dirty_pages << PAGE_SHIFT); -} - -/** - * The companion to osc_enter_cache(), called when @oap is no longer part of - * the dirty accounting due to error. - */ -static void osc_exit_cache(struct client_obd *cli, struct osc_async_page *oap) -{ - spin_lock(&cli->cl_loi_list_lock); - osc_release_write_grant(cli, &oap->oap_brw_page); - spin_unlock(&cli->cl_loi_list_lock); -} - -/** - * Non-blocking version of osc_enter_cache() that consumes grant only when it - * is available. 
- */ -static int osc_enter_cache_try(struct client_obd *cli, - struct osc_async_page *oap, - int bytes, int transient) -{ - int rc; - - OSC_DUMP_GRANT(D_CACHE, cli, "need:%d\n", bytes); - - rc = osc_reserve_grant(cli, bytes); - if (rc < 0) - return 0; - - if (cli->cl_dirty_pages < cli->cl_dirty_max_pages && - atomic_long_read(&obd_dirty_pages) + 1 <= obd_max_dirty_pages) { - osc_consume_write_grant(cli, &oap->oap_brw_page); - if (transient) { - cli->cl_dirty_transit++; - atomic_long_inc(&obd_dirty_transit_pages); - oap->oap_brw_flags |= OBD_BRW_NOCACHE; - } - rc = 1; - } else { - __osc_unreserve_grant(cli, bytes, bytes); - rc = 0; - } - return rc; -} - -static int ocw_granted(struct client_obd *cli, struct osc_cache_waiter *ocw) -{ - int rc; - - spin_lock(&cli->cl_loi_list_lock); - rc = list_empty(&ocw->ocw_entry); - spin_unlock(&cli->cl_loi_list_lock); - return rc; -} - -/** - * The main entry to reserve dirty page accounting. Usually the grant reserved - * in this function will be freed in bulk in osc_free_grant() unless it fails - * to add osc cache, in that case, it will be freed in osc_exit_cache(). - * - * The process will be put into sleep if it's already run out of grant. - */ -static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, - struct osc_async_page *oap, int bytes) -{ - struct osc_object *osc = oap->oap_obj; - struct lov_oinfo *loi = osc->oo_oinfo; - struct osc_cache_waiter ocw; - unsigned long timeout = (AT_OFF ? obd_timeout : at_max) * HZ; - int rc = -EDQUOT; - - OSC_DUMP_GRANT(D_CACHE, cli, "need:%d\n", bytes); - - spin_lock(&cli->cl_loi_list_lock); - - /* force the caller to try sync io. 
this can jump the list - * of queued writes and create a discontiguous rpc stream - */ - if (OBD_FAIL_CHECK(OBD_FAIL_OSC_NO_GRANT) || - !cli->cl_dirty_max_pages || cli->cl_ar.ar_force_sync || - loi->loi_ar.ar_force_sync) { - OSC_DUMP_GRANT(D_CACHE, cli, "forced sync i/o\n"); - rc = -EDQUOT; - goto out; - } - - /* Hopefully normal case - cache space and write credits available */ - if (osc_enter_cache_try(cli, oap, bytes, 0)) { - OSC_DUMP_GRANT(D_CACHE, cli, "granted from cache\n"); - rc = 0; - goto out; - } - - /* We can get here for two reasons: too many dirty pages in cache, or - * run out of grants. In both cases we should write dirty pages out. - * Adding a cache waiter will trigger urgent write-out no matter what - * RPC size will be. - * The exiting condition is no avail grants and no dirty pages caching, - * that really means there is no space on the OST. - */ - init_waitqueue_head(&ocw.ocw_waitq); - ocw.ocw_oap = oap; - ocw.ocw_grant = bytes; - while (cli->cl_dirty_pages > 0 || cli->cl_w_in_flight > 0) { - list_add_tail(&ocw.ocw_entry, &cli->cl_cache_waiters); - ocw.ocw_rc = 0; - spin_unlock(&cli->cl_loi_list_lock); - - osc_io_unplug_async(env, cli, NULL); - - CDEBUG(D_CACHE, "%s: sleeping for cache space @ %p for %p\n", - cli_name(cli), &ocw, oap); - - rc = wait_event_idle_timeout(ocw.ocw_waitq, - ocw_granted(cli, &ocw), timeout); - - spin_lock(&cli->cl_loi_list_lock); - - if (rc == 0) { - /* wait_event is interrupted by signal, or timed out */ - list_del_init(&ocw.ocw_entry); - rc = -ETIMEDOUT; - break; - } - LASSERT(list_empty(&ocw.ocw_entry)); - rc = ocw.ocw_rc; - - if (rc != -EDQUOT) - break; - if (osc_enter_cache_try(cli, oap, bytes, 0)) { - rc = 0; - break; - } - } - - switch (rc) { - case 0: - OSC_DUMP_GRANT(D_CACHE, cli, "finally got grant space\n"); - break; - case -ETIMEDOUT: - OSC_DUMP_GRANT(D_CACHE, cli, - "timeout, fall back to sync i/o\n"); - osc_extent_tree_dump(D_CACHE, osc); - /* fall back to synchronous I/O */ - rc = -EDQUOT; - break; - 
case -EINTR: - /* Ensures restartability - LU-3581 */ - OSC_DUMP_GRANT(D_CACHE, cli, "interrupted\n"); - rc = -ERESTARTSYS; - break; - case -EDQUOT: - OSC_DUMP_GRANT(D_CACHE, cli, - "no grant space, fall back to sync i/o\n"); - break; - default: - CDEBUG(D_CACHE, "%s: event for cache space @ %p never arrived due to %d, fall back to sync i/o\n", - cli_name(cli), &ocw, rc); - break; - } -out: - spin_unlock(&cli->cl_loi_list_lock); - return rc; -} - -/* caller must hold loi_list_lock */ -void osc_wake_cache_waiters(struct client_obd *cli) -{ - struct list_head *l, *tmp; - struct osc_cache_waiter *ocw; - - list_for_each_safe(l, tmp, &cli->cl_cache_waiters) { - ocw = list_entry(l, struct osc_cache_waiter, ocw_entry); - list_del_init(&ocw->ocw_entry); - - ocw->ocw_rc = -EDQUOT; - /* we can't dirty more */ - if ((cli->cl_dirty_pages > cli->cl_dirty_max_pages) || - (atomic_long_read(&obd_dirty_pages) + 1 > - obd_max_dirty_pages)) { - CDEBUG(D_CACHE, "no dirty room: dirty: %ld osc max %ld, sys max %ld\n", - cli->cl_dirty_pages, cli->cl_dirty_max_pages, - obd_max_dirty_pages); - goto wakeup; - } - - if (osc_enter_cache_try(cli, ocw->ocw_oap, ocw->ocw_grant, 0)) - ocw->ocw_rc = 0; -wakeup: - CDEBUG(D_CACHE, "wake up %p for oap %p, avail grant %ld, %d\n", - ocw, ocw->ocw_oap, cli->cl_avail_grant, ocw->ocw_rc); - - wake_up(&ocw->ocw_waitq); - } -} - -static int osc_max_rpc_in_flight(struct client_obd *cli, struct osc_object *osc) -{ - int hprpc = !!list_empty(&osc->oo_hp_exts); - - return rpcs_in_flight(cli) >= cli->cl_max_rpcs_in_flight + hprpc; -} - -/* This maintains the lists of pending pages to read/write for a given object - * (lop). This is used by osc_check_rpcs->osc_next_obj() and osc_list_maint() - * to quickly find objects that are ready to send an RPC. 
- */ -static int osc_makes_rpc(struct client_obd *cli, struct osc_object *osc, - int cmd) -{ - int invalid_import = 0; - - /* if we have an invalid import we want to drain the queued pages - * by forcing them through rpcs that immediately fail and complete - * the pages. recovery relies on this to empty the queued pages - * before canceling the locks and evicting down the llite pages - */ - if (!cli->cl_import || cli->cl_import->imp_invalid) - invalid_import = 1; - - if (cmd & OBD_BRW_WRITE) { - if (atomic_read(&osc->oo_nr_writes) == 0) - return 0; - if (invalid_import) { - CDEBUG(D_CACHE, "invalid import forcing RPC\n"); - return 1; - } - if (!list_empty(&osc->oo_hp_exts)) { - CDEBUG(D_CACHE, "high prio request forcing RPC\n"); - return 1; - } - if (!list_empty(&osc->oo_urgent_exts)) { - CDEBUG(D_CACHE, "urgent request forcing RPC\n"); - return 1; - } - /* trigger a write rpc stream as long as there are dirtiers - * waiting for space. as they're waiting, they're not going to - * create more pages to coalesce with what's waiting.. - */ - if (!list_empty(&cli->cl_cache_waiters)) { - CDEBUG(D_CACHE, "cache waiters forcing RPC\n"); - return 1; - } - if (atomic_read(&osc->oo_nr_writes) >= - cli->cl_max_pages_per_rpc) - return 1; - } else { - if (atomic_read(&osc->oo_nr_reads) == 0) - return 0; - if (invalid_import) { - CDEBUG(D_CACHE, "invalid import forcing RPC\n"); - return 1; - } - /* all read are urgent. 
*/ - if (!list_empty(&osc->oo_reading_exts)) - return 1; - } - - return 0; -} - -static void osc_update_pending(struct osc_object *obj, int cmd, int delta) -{ - struct client_obd *cli = osc_cli(obj); - - if (cmd & OBD_BRW_WRITE) { - atomic_add(delta, &obj->oo_nr_writes); - atomic_add(delta, &cli->cl_pending_w_pages); - LASSERT(atomic_read(&obj->oo_nr_writes) >= 0); - } else { - atomic_add(delta, &obj->oo_nr_reads); - atomic_add(delta, &cli->cl_pending_r_pages); - LASSERT(atomic_read(&obj->oo_nr_reads) >= 0); - } - OSC_IO_DEBUG(obj, "update pending cmd %d delta %d.\n", cmd, delta); -} - -static int osc_makes_hprpc(struct osc_object *obj) -{ - return !list_empty(&obj->oo_hp_exts); -} - -static void on_list(struct list_head *item, struct list_head *list, int should_be_on) -{ - if (list_empty(item) && should_be_on) - list_add_tail(item, list); - else if (!list_empty(item) && !should_be_on) - list_del_init(item); -} - -/* maintain the osc's cli list membership invariants so that osc_send_oap_rpc - * can find pages to build into rpcs quickly - */ -static int __osc_list_maint(struct client_obd *cli, struct osc_object *osc) -{ - if (osc_makes_hprpc(osc)) { - /* HP rpc */ - on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list, 0); - on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 1); - } else { - on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 0); - on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list, - osc_makes_rpc(cli, osc, OBD_BRW_WRITE) || - osc_makes_rpc(cli, osc, OBD_BRW_READ)); - } - - on_list(&osc->oo_write_item, &cli->cl_loi_write_list, - atomic_read(&osc->oo_nr_writes) > 0); - - on_list(&osc->oo_read_item, &cli->cl_loi_read_list, - atomic_read(&osc->oo_nr_reads) > 0); - - return osc_is_ready(osc); -} - -static int osc_list_maint(struct client_obd *cli, struct osc_object *osc) -{ - int is_ready; - - spin_lock(&cli->cl_loi_list_lock); - is_ready = __osc_list_maint(cli, osc); - spin_unlock(&cli->cl_loi_list_lock); - - return is_ready; -} - 
-/* this is trying to propagate async writeback errors back up to the - * application. As an async write fails we record the error code for later if - * the app does an fsync. As long as errors persist we force future rpcs to be - * sync so that the app can get a sync error and break the cycle of queueing - * pages for which writeback will fail. - */ -static void osc_process_ar(struct osc_async_rc *ar, __u64 xid, - int rc) -{ - if (rc) { - if (!ar->ar_rc) - ar->ar_rc = rc; - - ar->ar_force_sync = 1; - ar->ar_min_xid = ptlrpc_sample_next_xid(); - return; - } - - if (ar->ar_force_sync && (xid >= ar->ar_min_xid)) - ar->ar_force_sync = 0; -} - -/* this must be called holding the loi list lock to give coverage to exit_cache, - * async_flag maintenance, and oap_request - */ -static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli, - struct osc_async_page *oap, int sent, int rc) -{ - struct osc_object *osc = oap->oap_obj; - struct lov_oinfo *loi = osc->oo_oinfo; - __u64 xid = 0; - - if (oap->oap_request) { - xid = ptlrpc_req_xid(oap->oap_request); - ptlrpc_req_finished(oap->oap_request); - oap->oap_request = NULL; - } - - /* As the transfer for this page is being done, clear the flags */ - spin_lock(&oap->oap_lock); - oap->oap_async_flags = 0; - spin_unlock(&oap->oap_lock); - oap->oap_interrupted = 0; - - if (oap->oap_cmd & OBD_BRW_WRITE && xid > 0) { - spin_lock(&cli->cl_loi_list_lock); - osc_process_ar(&cli->cl_ar, xid, rc); - osc_process_ar(&loi->loi_ar, xid, rc); - spin_unlock(&cli->cl_loi_list_lock); - } - - rc = osc_completion(env, oap, oap->oap_cmd, rc); - if (rc) - CERROR("completion on oap %p obj %p returns %d.\n", - oap, osc, rc); -} - -struct extent_rpc_data { - struct list_head *erd_rpc_list; - unsigned int erd_page_count; - unsigned int erd_max_pages; - unsigned int erd_max_chunks; - unsigned int erd_max_extents; -}; - -static inline unsigned int osc_extent_chunks(const struct osc_extent *ext) -{ - struct client_obd *cli = 
osc_cli(ext->oe_obj); - unsigned int ppc_bits = cli->cl_chunkbits - PAGE_SHIFT; - - return (ext->oe_end >> ppc_bits) - (ext->oe_start >> ppc_bits) + 1; -} - -/** - * Try to add extent to one RPC. We need to think about the following things: - * - # of pages must not be over max_pages_per_rpc - * - extent must be compatible with previous ones - */ -static int try_to_add_extent_for_io(struct client_obd *cli, - struct osc_extent *ext, - struct extent_rpc_data *data) -{ - struct osc_extent *tmp; - unsigned int chunk_count; - struct osc_async_page *oap = list_first_entry(&ext->oe_pages, - struct osc_async_page, - oap_pending_item); - - EASSERT((ext->oe_state == OES_CACHE || ext->oe_state == OES_LOCK_DONE), - ext); - - if (!data->erd_max_extents) - return 0; - - chunk_count = osc_extent_chunks(ext); - EASSERTF(data->erd_page_count != 0 || - chunk_count <= data->erd_max_chunks, ext, - "The first extent to be fit in a RPC contains %u chunks, which is over the limit %u.\n", - chunk_count, data->erd_max_chunks); - - if (chunk_count > data->erd_max_chunks) - return 0; - - data->erd_max_pages = max(ext->oe_mppr, data->erd_max_pages); - EASSERTF(data->erd_page_count != 0 || - ext->oe_nr_pages <= data->erd_max_pages, ext, - "The first extent to be fit in a RPC contains %u pages, which is over the limit %u.\n", - ext->oe_nr_pages, data->erd_max_pages); - if (data->erd_page_count + ext->oe_nr_pages > data->erd_max_pages) - return 0; - - list_for_each_entry(tmp, data->erd_rpc_list, oe_link) { - struct osc_async_page *oap2; - - oap2 = list_first_entry(&tmp->oe_pages, struct osc_async_page, - oap_pending_item); - EASSERT(tmp->oe_owner == current, tmp); - if (oap2cl_page(oap)->cp_type != oap2cl_page(oap2)->cp_type) { - CDEBUG(D_CACHE, "Do not permit different type of IO in one RPC\n"); - return 0; - } - - if (tmp->oe_srvlock != ext->oe_srvlock || - !tmp->oe_grants != !ext->oe_grants || - tmp->oe_no_merge || ext->oe_no_merge) - return 0; - - /* remove break for strict check */ - break; 
- } - - data->erd_max_extents--; - data->erd_max_chunks -= chunk_count; - data->erd_page_count += ext->oe_nr_pages; - list_move_tail(&ext->oe_link, data->erd_rpc_list); - ext->oe_owner = current; - return 1; -} - -static inline unsigned int osc_max_write_chunks(const struct client_obd *cli) -{ - /* - * LU-8135: - * - * The maximum size of a single transaction is about 64MB in ZFS. - * #define DMU_MAX_ACCESS (64 * 1024 * 1024) - * - * Since ZFS is a copy-on-write file system, a single dirty page in - * a chunk will result in the rewrite of the whole chunk, therefore - * an RPC shouldn't be allowed to contain too many chunks otherwise - * it will make transaction size much bigger than 64MB, especially - * with big block size for ZFS. - * - * This piece of code is to make sure that OSC won't send write RPCs - * with too many chunks. The maximum chunk size that an RPC can cover - * is set to PTLRPC_MAX_BRW_SIZE, which is defined to 16MB. Ideally - * OST should tell the client what the biggest transaction size is, - * but it's good enough for now. - * - * This limitation doesn't apply to ldiskfs, which allows as many - * chunks in one RPC as we want. However, it won't have any benefits - * to have too many discontiguous pages in one RPC. - * - * An osc_extent won't cover over a RPC size, so the chunks in an - * osc_extent won't bigger than PTLRPC_MAX_BRW_SIZE >> chunkbits. - */ - return PTLRPC_MAX_BRW_SIZE >> cli->cl_chunkbits; -} - -/** - * In order to prevent multiple ptlrpcd from breaking contiguous extents, - * get_write_extent() takes all appropriate extents in atomic. - * - * The following policy is used to collect extents for IO: - * 1. Add as many HP extents as possible; - * 2. Add the first urgent extent in urgent extent list and take it out of - * urgent list; - * 3. Add subsequent extents of this urgent extent; - * 4. If urgent list is not empty, goto 2; - * 5. Traverse the extent tree from the 1st extent; - * 6. 
Above steps exit if there is no space in this RPC. - */ -static unsigned int get_write_extents(struct osc_object *obj, - struct list_head *rpclist) -{ - struct client_obd *cli = osc_cli(obj); - struct osc_extent *ext; - struct osc_extent *temp; - struct extent_rpc_data data = { - .erd_rpc_list = rpclist, - .erd_page_count = 0, - .erd_max_pages = cli->cl_max_pages_per_rpc, - .erd_max_chunks = osc_max_write_chunks(cli), - .erd_max_extents = 256, - }; - - LASSERT(osc_object_is_locked(obj)); - list_for_each_entry_safe(ext, temp, &obj->oo_hp_exts, oe_link) { - LASSERT(ext->oe_state == OES_CACHE); - if (!try_to_add_extent_for_io(cli, ext, &data)) - return data.erd_page_count; - EASSERT(ext->oe_nr_pages <= data.erd_max_pages, ext); - } - if (data.erd_page_count == data.erd_max_pages) - return data.erd_page_count; - - while (!list_empty(&obj->oo_urgent_exts)) { - ext = list_entry(obj->oo_urgent_exts.next, - struct osc_extent, oe_link); - if (!try_to_add_extent_for_io(cli, ext, &data)) - return data.erd_page_count; - - if (!ext->oe_intree) - continue; - - while ((ext = next_extent(ext)) != NULL) { - if ((ext->oe_state != OES_CACHE) || - (!list_empty(&ext->oe_link) && - ext->oe_owner)) - continue; - - if (!try_to_add_extent_for_io(cli, ext, &data)) - return data.erd_page_count; - } - } - if (data.erd_page_count == data.erd_max_pages) - return data.erd_page_count; - - ext = first_extent(obj); - while (ext) { - if ((ext->oe_state != OES_CACHE) || - /* this extent may be already in current rpclist */ - (!list_empty(&ext->oe_link) && ext->oe_owner)) { - ext = next_extent(ext); - continue; - } - - if (!try_to_add_extent_for_io(cli, ext, &data)) - return data.erd_page_count; - - ext = next_extent(ext); - } - return data.erd_page_count; -} - -static int -osc_send_write_rpc(const struct lu_env *env, struct client_obd *cli, - struct osc_object *osc) - __must_hold(osc) -{ - LIST_HEAD(rpclist); - struct osc_extent *ext; - struct osc_extent *tmp; - struct osc_extent *first = NULL; - u32 
page_count = 0; - int srvlock = 0; - int rc = 0; - - LASSERT(osc_object_is_locked(osc)); - - page_count = get_write_extents(osc, &rpclist); - LASSERT(equi(page_count == 0, list_empty(&rpclist))); - - if (list_empty(&rpclist)) - return 0; - - osc_update_pending(osc, OBD_BRW_WRITE, -page_count); - - list_for_each_entry(ext, &rpclist, oe_link) { - LASSERT(ext->oe_state == OES_CACHE || - ext->oe_state == OES_LOCK_DONE); - if (ext->oe_state == OES_CACHE) - osc_extent_state_set(ext, OES_LOCKING); - else - osc_extent_state_set(ext, OES_RPC); - } - - /* we're going to grab page lock, so release object lock because - * lock order is page lock -> object lock. - */ - osc_object_unlock(osc); - - list_for_each_entry_safe(ext, tmp, &rpclist, oe_link) { - if (ext->oe_state == OES_LOCKING) { - rc = osc_extent_make_ready(env, ext); - if (unlikely(rc < 0)) { - list_del_init(&ext->oe_link); - osc_extent_finish(env, ext, 0, rc); - continue; - } - } - if (!first) { - first = ext; - srvlock = ext->oe_srvlock; - } else { - LASSERT(srvlock == ext->oe_srvlock); - } - } - - if (!list_empty(&rpclist)) { - LASSERT(page_count > 0); - rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_WRITE); - LASSERT(list_empty(&rpclist)); - } - - osc_object_lock(osc); - return rc; -} - -/** - * prepare pages for ASYNC io and put pages in send queue. - * - * \param cmd OBD_BRW_* macroses - * \param lop pending pages - * - * \return zero if no page added to send queue. - * \return 1 if pages successfully added to send queue. - * \return negative on errors. 
- */ -static int -osc_send_read_rpc(const struct lu_env *env, struct client_obd *cli, - struct osc_object *osc) - __must_hold(osc) -{ - struct osc_extent *ext; - struct osc_extent *next; - LIST_HEAD(rpclist); - struct extent_rpc_data data = { - .erd_rpc_list = &rpclist, - .erd_page_count = 0, - .erd_max_pages = cli->cl_max_pages_per_rpc, - .erd_max_chunks = UINT_MAX, - .erd_max_extents = UINT_MAX, - }; - int rc = 0; - - LASSERT(osc_object_is_locked(osc)); - list_for_each_entry_safe(ext, next, &osc->oo_reading_exts, oe_link) { - EASSERT(ext->oe_state == OES_LOCK_DONE, ext); - if (!try_to_add_extent_for_io(cli, ext, &data)) - break; - osc_extent_state_set(ext, OES_RPC); - EASSERT(ext->oe_nr_pages <= data.erd_max_pages, ext); - } - LASSERT(data.erd_page_count <= data.erd_max_pages); - - osc_update_pending(osc, OBD_BRW_READ, -data.erd_page_count); - - if (!list_empty(&rpclist)) { - osc_object_unlock(osc); - - rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_READ); - LASSERT(list_empty(&rpclist)); - - osc_object_lock(osc); - } - return rc; -} - -#define list_to_obj(list, item) ({ \ - struct list_head *__tmp = (list)->next; \ - list_del_init(__tmp); \ - list_entry(__tmp, struct osc_object, oo_##item); \ -}) - -/* This is called by osc_check_rpcs() to find which objects have pages that - * we could be sending. These lists are maintained by osc_makes_rpc(). - */ -static struct osc_object *osc_next_obj(struct client_obd *cli) -{ - /* First return objects that have blocked locks so that they - * will be flushed quickly and other clients can get the lock, - * then objects which have pages ready to be stuffed into RPCs - */ - if (!list_empty(&cli->cl_loi_hp_ready_list)) - return list_to_obj(&cli->cl_loi_hp_ready_list, hp_ready_item); - if (!list_empty(&cli->cl_loi_ready_list)) - return list_to_obj(&cli->cl_loi_ready_list, ready_item); - - /* then if we have cache waiters, return all objects with queued - * writes. 
This is especially important when many small files - * have filled up the cache and not been fired into rpcs because - * they don't pass the nr_pending/object threshold - */ - if (!list_empty(&cli->cl_cache_waiters) && - !list_empty(&cli->cl_loi_write_list)) - return list_to_obj(&cli->cl_loi_write_list, write_item); - - /* then return all queued objects when we have an invalid import - * so that they get flushed - */ - if (!cli->cl_import || cli->cl_import->imp_invalid) { - if (!list_empty(&cli->cl_loi_write_list)) - return list_to_obj(&cli->cl_loi_write_list, write_item); - if (!list_empty(&cli->cl_loi_read_list)) - return list_to_obj(&cli->cl_loi_read_list, read_item); - } - return NULL; -} - -/* called with the loi list lock held */ -static void osc_check_rpcs(const struct lu_env *env, struct client_obd *cli) - __must_hold(&cli->cl_loi_list_lock) -{ - struct osc_object *osc; - int rc = 0; - - while ((osc = osc_next_obj(cli)) != NULL) { - struct cl_object *obj = osc2cl(osc); - struct lu_ref_link link; - - OSC_IO_DEBUG(osc, "%lu in flight\n", rpcs_in_flight(cli)); - - if (osc_max_rpc_in_flight(cli, osc)) { - __osc_list_maint(cli, osc); - break; - } - - cl_object_get(obj); - spin_unlock(&cli->cl_loi_list_lock); - lu_object_ref_add_at(&obj->co_lu, &link, "check", current); - - /* attempt some read/write balancing by alternating between - * reads and writes in an object. The makes_rpc checks here - * would be redundant if we were getting read/write work items - * instead of objects. we don't want send_oap_rpc to drain a - * partial read pending queue when we're given this object to - * do io on writes while there are cache waiters - */ - osc_object_lock(osc); - if (osc_makes_rpc(cli, osc, OBD_BRW_WRITE)) { - rc = osc_send_write_rpc(env, cli, osc); - if (rc < 0) { - CERROR("Write request failed with %d\n", rc); - - /* osc_send_write_rpc failed, mostly because of - * memory pressure. 
- * - * It can't break here, because if: - * - a page was submitted by osc_io_submit, so - * page locked; - * - no request in flight - * - no subsequent request - * The system will be in live-lock state, - * because there is no chance to call - * osc_io_unplug() and osc_check_rpcs() any - * more. pdflush can't help in this case, - * because it might be blocked at grabbing - * the page lock as we mentioned. - * - * Anyway, continue to drain pages. - */ - /* break; */ - } - } - if (osc_makes_rpc(cli, osc, OBD_BRW_READ)) { - rc = osc_send_read_rpc(env, cli, osc); - if (rc < 0) - CERROR("Read request failed with %d\n", rc); - } - osc_object_unlock(osc); - - osc_list_maint(cli, osc); - lu_object_ref_del_at(&obj->co_lu, &link, "check", current); - cl_object_put(env, obj); - - spin_lock(&cli->cl_loi_list_lock); - } -} - -static int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli, - struct osc_object *osc, int async) -{ - int rc = 0; - - if (osc && osc_list_maint(cli, osc) == 0) - return 0; - - if (!async) { - spin_lock(&cli->cl_loi_list_lock); - osc_check_rpcs(env, cli); - spin_unlock(&cli->cl_loi_list_lock); - } else { - CDEBUG(D_CACHE, "Queue writeback work for client %p.\n", cli); - LASSERT(cli->cl_writeback_work); - rc = ptlrpcd_queue_work(cli->cl_writeback_work); - } - return rc; -} - -static int osc_io_unplug_async(const struct lu_env *env, - struct client_obd *cli, struct osc_object *osc) -{ - return osc_io_unplug0(env, cli, osc, 1); -} - -void osc_io_unplug(const struct lu_env *env, struct client_obd *cli, - struct osc_object *osc) -{ - (void)osc_io_unplug0(env, cli, osc, 0); -} - -int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, - struct page *page, loff_t offset) -{ - struct obd_export *exp = osc_export(osc); - struct osc_async_page *oap = &ops->ops_oap; - - if (!page) - return cfs_size_round(sizeof(*oap)); - - oap->oap_magic = OAP_MAGIC; - oap->oap_cli = &exp->exp_obd->u.cli; - oap->oap_obj = osc; - - oap->oap_page = page; - 
oap->oap_obj_off = offset; - LASSERT(!(offset & ~PAGE_MASK)); - - if (capable(CAP_SYS_RESOURCE)) - oap->oap_brw_flags = OBD_BRW_NOQUOTA; - - INIT_LIST_HEAD(&oap->oap_pending_item); - INIT_LIST_HEAD(&oap->oap_rpc_item); - - spin_lock_init(&oap->oap_lock); - CDEBUG(D_INFO, "oap %p page %p obj off %llu\n", - oap, page, oap->oap_obj_off); - return 0; -} - -int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, - struct osc_page *ops) -{ - struct osc_io *oio = osc_env_io(env); - struct osc_extent *ext = NULL; - struct osc_async_page *oap = &ops->ops_oap; - struct client_obd *cli = oap->oap_cli; - struct osc_object *osc = oap->oap_obj; - pgoff_t index; - unsigned int grants = 0, tmp; - int brw_flags = OBD_BRW_ASYNC; - int cmd = OBD_BRW_WRITE; - int need_release = 0; - int rc = 0; - - if (oap->oap_magic != OAP_MAGIC) - return -EINVAL; - - if (!cli->cl_import || cli->cl_import->imp_invalid) - return -EIO; - - if (!list_empty(&oap->oap_pending_item) || - !list_empty(&oap->oap_rpc_item)) - return -EBUSY; - - /* Set the OBD_BRW_SRVLOCK before the page is queued. */ - brw_flags |= ops->ops_srvlock ? OBD_BRW_SRVLOCK : 0; - if (capable(CAP_SYS_RESOURCE)) { - brw_flags |= OBD_BRW_NOQUOTA; - cmd |= OBD_BRW_NOQUOTA; - } - - /* check if the file's owner/group is over quota */ - if (!(cmd & OBD_BRW_NOQUOTA)) { - struct cl_object *obj; - struct cl_attr *attr; - unsigned int qid[MAXQUOTAS]; - - obj = cl_object_top(&osc->oo_cl); - attr = &osc_env_info(env)->oti_attr; - - cl_object_attr_lock(obj); - rc = cl_object_attr_get(env, obj, attr); - cl_object_attr_unlock(obj); - - qid[USRQUOTA] = attr->cat_uid; - qid[GRPQUOTA] = attr->cat_gid; - if (rc == 0 && osc_quota_chkdq(cli, qid) == NO_QUOTA) - rc = -EDQUOT; - if (rc) - return rc; - } - - oap->oap_cmd = cmd; - oap->oap_page_off = ops->ops_from; - oap->oap_count = ops->ops_to - ops->ops_from; - /* - * No need to hold a lock here, - * since this page is not in any list yet. 
- */ - oap->oap_async_flags = 0; - oap->oap_brw_flags = brw_flags; - - OSC_IO_DEBUG(osc, "oap %p page %p added for cmd %d\n", - oap, oap->oap_page, oap->oap_cmd & OBD_BRW_RWMASK); - - index = osc_index(oap2osc(oap)); - - /* Add this page into extent by the following steps: - * 1. if there exists an active extent for this IO, mostly this page - * can be added to the active extent and sometimes we need to - * expand extent to accommodate this page; - * 2. otherwise, a new extent will be allocated. - */ - - ext = oio->oi_active; - if (ext && ext->oe_start <= index && ext->oe_max_end >= index) { - /* one chunk plus extent overhead must be enough to write this - * page - */ - grants = (1 << cli->cl_chunkbits) + cli->cl_extent_tax; - if (ext->oe_end >= index) - grants = 0; - - /* it doesn't need any grant to dirty this page */ - spin_lock(&cli->cl_loi_list_lock); - rc = osc_enter_cache_try(cli, oap, grants, 0); - spin_unlock(&cli->cl_loi_list_lock); - if (rc == 0) { /* try failed */ - grants = 0; - need_release = 1; - } else if (ext->oe_end < index) { - tmp = grants; - /* try to expand this extent */ - rc = osc_extent_expand(ext, index, &tmp); - if (rc < 0) { - need_release = 1; - /* don't free reserved grant */ - } else { - OSC_EXTENT_DUMP(D_CACHE, ext, - "expanded for %lu.\n", index); - osc_unreserve_grant(cli, grants, tmp); - grants = 0; - } - } - rc = 0; - } else if (ext) { - /* index is located outside of active extent */ - need_release = 1; - } - if (need_release) { - osc_extent_release(env, ext); - oio->oi_active = NULL; - ext = NULL; - } - - if (!ext) { - tmp = (1 << cli->cl_chunkbits) + cli->cl_extent_tax; - - /* try to find new extent to cover this page */ - LASSERT(!oio->oi_active); - /* we may have allocated grant for this page if we failed - * to expand the previous active extent. - */ - LASSERT(ergo(grants > 0, grants >= tmp)); - - rc = 0; - if (grants == 0) { - /* we haven't allocated grant for this page. 
*/ - rc = osc_enter_cache(env, cli, oap, tmp); - if (rc == 0) - grants = tmp; - } - - tmp = grants; - if (rc == 0) { - ext = osc_extent_find(env, osc, index, &tmp); - if (IS_ERR(ext)) { - LASSERT(tmp == grants); - osc_exit_cache(cli, oap); - rc = PTR_ERR(ext); - ext = NULL; - } else { - oio->oi_active = ext; - } - } - if (grants > 0) - osc_unreserve_grant(cli, grants, tmp); - } - - LASSERT(ergo(rc == 0, ext)); - if (ext) { - EASSERTF(ext->oe_end >= index && ext->oe_start <= index, - ext, "index = %lu.\n", index); - LASSERT((oap->oap_brw_flags & OBD_BRW_FROM_GRANT) != 0); - - osc_object_lock(osc); - if (ext->oe_nr_pages == 0) - ext->oe_srvlock = ops->ops_srvlock; - else - LASSERT(ext->oe_srvlock == ops->ops_srvlock); - ++ext->oe_nr_pages; - list_add_tail(&oap->oap_pending_item, &ext->oe_pages); - osc_object_unlock(osc); - } - return rc; -} - -int osc_teardown_async_page(const struct lu_env *env, - struct osc_object *obj, struct osc_page *ops) -{ - struct osc_async_page *oap = &ops->ops_oap; - int rc = 0; - - LASSERT(oap->oap_magic == OAP_MAGIC); - - CDEBUG(D_INFO, "teardown oap %p page %p at index %lu.\n", - oap, ops, osc_index(oap2osc(oap))); - - if (!list_empty(&oap->oap_rpc_item)) { - CDEBUG(D_CACHE, "oap %p is not in cache.\n", oap); - rc = -EBUSY; - } else if (!list_empty(&oap->oap_pending_item)) { - struct osc_extent *ext = NULL; - - osc_object_lock(obj); - ext = osc_extent_lookup(obj, osc_index(oap2osc(oap))); - osc_object_unlock(obj); - /* only truncated pages are allowed to be taken out. - * See osc_extent_truncate() and osc_cache_truncate_start() - * for details. - */ - if (ext && ext->oe_state != OES_TRUNC) { - OSC_EXTENT_DUMP(D_ERROR, ext, "trunc at %lu.\n", - osc_index(oap2osc(oap))); - rc = -EBUSY; - } - if (ext) - osc_extent_put(env, ext); - } - return rc; -} - -/** - * This is called when a page is picked up by kernel to write out. - * - * We should find out the corresponding extent and add the whole extent - * into urgent list. 
The extent may be being truncated or used, handle it - * carefully. - */ -int osc_flush_async_page(const struct lu_env *env, struct cl_io *io, - struct osc_page *ops) -{ - struct osc_extent *ext = NULL; - struct osc_object *obj = cl2osc(ops->ops_cl.cpl_obj); - struct cl_page *cp = ops->ops_cl.cpl_page; - pgoff_t index = osc_index(ops); - struct osc_async_page *oap = &ops->ops_oap; - bool unplug = false; - int rc = 0; - - osc_object_lock(obj); - ext = osc_extent_lookup(obj, index); - if (!ext) { - osc_extent_tree_dump(D_ERROR, obj); - LASSERTF(0, "page index %lu is NOT covered.\n", index); - } - - switch (ext->oe_state) { - case OES_RPC: - case OES_LOCK_DONE: - CL_PAGE_DEBUG(D_ERROR, env, cp, "flush an in-rpc page?\n"); - LASSERT(0); - break; - case OES_LOCKING: - /* If we know this extent is being written out, we should abort - * so that the writer can make this page ready. Otherwise, there - * exists a deadlock problem because other process can wait for - * page writeback bit holding page lock; and meanwhile in - * vvp_page_make_ready(), we need to grab page lock before - * really sending the RPC. - */ - case OES_TRUNC: - /* race with truncate, page will be redirtied */ - case OES_ACTIVE: - /* The extent is active so we need to abort and let the caller - * re-dirty the page. If we continued on here, and we were the - * one making the extent active, we could deadlock waiting for - * the page writeback to clear but it won't because the extent - * is active and won't be written out. 
- */ - rc = -EAGAIN; - goto out; - default: - break; - } - - rc = cl_page_prep(env, io, cp, CRT_WRITE); - if (rc) - goto out; - - spin_lock(&oap->oap_lock); - oap->oap_async_flags |= ASYNC_READY | ASYNC_URGENT; - spin_unlock(&oap->oap_lock); - - if (memory_pressure_get()) - ext->oe_memalloc = 1; - - ext->oe_urgent = 1; - if (ext->oe_state == OES_CACHE) { - OSC_EXTENT_DUMP(D_CACHE, ext, - "flush page %p make it urgent.\n", oap); - if (list_empty(&ext->oe_link)) - list_add_tail(&ext->oe_link, &obj->oo_urgent_exts); - unplug = true; - } - rc = 0; - -out: - osc_object_unlock(obj); - osc_extent_put(env, ext); - if (unplug) - osc_io_unplug_async(env, osc_cli(obj), obj); - return rc; -} - -/** - * this is called when a sync waiter receives an interruption. Its job is to - * get the caller woken as soon as possible. If its page hasn't been put in an - * rpc yet it can dequeue immediately. Otherwise it has to mark the rpc as - * desiring interruption which will forcefully complete the rpc once the rpc - * has timed out. - */ -int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops) -{ - struct osc_async_page *oap = &ops->ops_oap; - struct osc_object *obj = oap->oap_obj; - struct client_obd *cli = osc_cli(obj); - struct osc_extent *ext; - struct osc_extent *found = NULL; - struct list_head *plist; - pgoff_t index = osc_index(ops); - int rc = -EBUSY; - int cmd; - - LASSERT(!oap->oap_interrupted); - oap->oap_interrupted = 1; - - /* Find out the caching extent */ - osc_object_lock(obj); - if (oap->oap_cmd & OBD_BRW_WRITE) { - plist = &obj->oo_urgent_exts; - cmd = OBD_BRW_WRITE; - } else { - plist = &obj->oo_reading_exts; - cmd = OBD_BRW_READ; - } - list_for_each_entry(ext, plist, oe_link) { - if (ext->oe_start <= index && ext->oe_end >= index) { - LASSERT(ext->oe_state == OES_LOCK_DONE); - /* For OES_LOCK_DONE state extent, it has already held - * a refcount for RPC. 
- */ - found = osc_extent_get(ext); - break; - } - } - if (found) { - list_del_init(&found->oe_link); - osc_update_pending(obj, cmd, -found->oe_nr_pages); - osc_object_unlock(obj); - - osc_extent_finish(env, found, 0, -EINTR); - osc_extent_put(env, found); - rc = 0; - } else { - osc_object_unlock(obj); - /* ok, it's been put in an rpc. only one oap gets a request - * reference - */ - if (oap->oap_request) { - ptlrpc_mark_interrupted(oap->oap_request); - ptlrpcd_wake(oap->oap_request); - ptlrpc_req_finished(oap->oap_request); - oap->oap_request = NULL; - } - } - - osc_list_maint(cli, obj); - return rc; -} - -int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj, - struct list_head *list, int cmd, int brw_flags) -{ - struct client_obd *cli = osc_cli(obj); - struct osc_extent *ext; - struct osc_async_page *oap, *tmp; - int page_count = 0; - int mppr = cli->cl_max_pages_per_rpc; - bool can_merge = true; - pgoff_t start = CL_PAGE_EOF; - pgoff_t end = 0; - - list_for_each_entry(oap, list, oap_pending_item) { - struct osc_page *opg = oap2osc_page(oap); - pgoff_t index = osc_index(opg); - - if (index > end) - end = index; - if (index < start) - start = index; - ++page_count; - mppr <<= (page_count > mppr); - - if (unlikely(opg->ops_from > 0 || opg->ops_to < PAGE_SIZE)) - can_merge = false; - } - - ext = osc_extent_alloc(obj); - if (!ext) { - list_for_each_entry_safe(oap, tmp, list, oap_pending_item) { - list_del_init(&oap->oap_pending_item); - osc_ap_completion(env, cli, oap, 0, -ENOMEM); - } - return -ENOMEM; - } - - ext->oe_rw = !!(cmd & OBD_BRW_READ); - ext->oe_sync = 1; - ext->oe_no_merge = !can_merge; - ext->oe_urgent = 1; - ext->oe_start = start; - ext->oe_end = end; - ext->oe_max_end = end; - ext->oe_obj = obj; - ext->oe_srvlock = !!(brw_flags & OBD_BRW_SRVLOCK); - ext->oe_nr_pages = page_count; - ext->oe_mppr = mppr; - list_splice_init(list, &ext->oe_pages); - - osc_object_lock(obj); - /* Reuse the initial refcount for RPC, don't drop it */ - 
osc_extent_state_set(ext, OES_LOCK_DONE); - if (cmd & OBD_BRW_WRITE) { - list_add_tail(&ext->oe_link, &obj->oo_urgent_exts); - osc_update_pending(obj, OBD_BRW_WRITE, page_count); - } else { - list_add_tail(&ext->oe_link, &obj->oo_reading_exts); - osc_update_pending(obj, OBD_BRW_READ, page_count); - } - osc_object_unlock(obj); - - osc_io_unplug_async(env, cli, obj); - return 0; -} - -/** - * Called by osc_io_setattr_start() to freeze and destroy covering extents. - */ -int osc_cache_truncate_start(const struct lu_env *env, struct osc_object *obj, - u64 size, struct osc_extent **extp) -{ - struct client_obd *cli = osc_cli(obj); - struct osc_extent *ext; - struct osc_extent *temp; - struct osc_extent *waiting = NULL; - pgoff_t index; - LIST_HEAD(list); - int result = 0; - bool partial; - - /* pages with index greater or equal to index will be truncated. */ - index = cl_index(osc2cl(obj), size); - partial = size > cl_offset(osc2cl(obj), index); - -again: - osc_object_lock(obj); - ext = osc_extent_search(obj, index); - if (!ext) - ext = first_extent(obj); - else if (ext->oe_end < index) - ext = next_extent(ext); - while (ext) { - EASSERT(ext->oe_state != OES_TRUNC, ext); - - if (ext->oe_state > OES_CACHE || ext->oe_urgent) { - /* if ext is in urgent state, it means there must exist - * a page already having been flushed by write_page(). - * We have to wait for this extent because we can't - * truncate that page. - */ - OSC_EXTENT_DUMP(D_CACHE, ext, - "waiting for busy extent\n"); - waiting = osc_extent_get(ext); - break; - } - - OSC_EXTENT_DUMP(D_CACHE, ext, "try to trunc:%llu.\n", size); - - osc_extent_get(ext); - if (ext->oe_state == OES_ACTIVE) { - /* though we grab inode mutex for write path, but we - * release it before releasing extent(in osc_io_end()), - * so there is a race window that an extent is still - * in OES_ACTIVE when truncate starts. 
- */ - LASSERT(!ext->oe_trunc_pending); - ext->oe_trunc_pending = 1; - } else { - EASSERT(ext->oe_state == OES_CACHE, ext); - osc_extent_state_set(ext, OES_TRUNC); - osc_update_pending(obj, OBD_BRW_WRITE, - -ext->oe_nr_pages); - } - EASSERT(list_empty(&ext->oe_link), ext); - list_add_tail(&ext->oe_link, &list); - - ext = next_extent(ext); - } - osc_object_unlock(obj); - - osc_list_maint(cli, obj); - - list_for_each_entry_safe(ext, temp, &list, oe_link) { - int rc; - - list_del_init(&ext->oe_link); - - /* extent may be in OES_ACTIVE state because inode mutex - * is released before osc_io_end() in file write case - */ - if (ext->oe_state != OES_TRUNC) - osc_extent_wait(env, ext, OES_TRUNC); - - rc = osc_extent_truncate(ext, index, partial); - if (rc < 0) { - if (result == 0) - result = rc; - - OSC_EXTENT_DUMP(D_ERROR, ext, - "truncate error %d\n", rc); - } else if (ext->oe_nr_pages == 0) { - osc_extent_remove(ext); - } else { - /* this must be an overlapped extent which means only - * part of pages in this extent have been truncated. - */ - EASSERTF(ext->oe_start <= index, ext, - "trunc index = %lu/%d.\n", index, partial); - /* fix index to skip this partially truncated extent */ - index = ext->oe_end + 1; - partial = false; - - /* we need to hold this extent in OES_TRUNC state so - * that no writeback will happen. This is to avoid - * BUG 17397. - * Only partial truncate can reach here, if @size is - * not zero, the caller should provide a valid @extp. - */ - LASSERT(!*extp); - *extp = osc_extent_get(ext); - OSC_EXTENT_DUMP(D_CACHE, ext, - "trunc at %llu\n", size); - } - osc_extent_put(env, ext); - } - if (waiting) { - int rc; - - /* ignore the result of osc_extent_wait the write initiator - * should take care of it. 
- */ - rc = osc_extent_wait(env, waiting, OES_INV); - if (rc < 0) - OSC_EXTENT_DUMP(D_CACHE, waiting, "error: %d.\n", rc); - - osc_extent_put(env, waiting); - waiting = NULL; - goto again; - } - return result; -} - -/** - * Called after osc_io_setattr_end to add oio->oi_trunc back to cache. - */ -void osc_cache_truncate_end(const struct lu_env *env, struct osc_extent *ext) -{ - if (ext) { - struct osc_object *obj = ext->oe_obj; - bool unplug = false; - - EASSERT(ext->oe_nr_pages > 0, ext); - EASSERT(ext->oe_state == OES_TRUNC, ext); - EASSERT(!ext->oe_urgent, ext); - - OSC_EXTENT_DUMP(D_CACHE, ext, "trunc -> cache.\n"); - osc_object_lock(obj); - osc_extent_state_set(ext, OES_CACHE); - if (ext->oe_fsync_wait && !ext->oe_urgent) { - ext->oe_urgent = 1; - list_move_tail(&ext->oe_link, &obj->oo_urgent_exts); - unplug = true; - } - osc_update_pending(obj, OBD_BRW_WRITE, ext->oe_nr_pages); - osc_object_unlock(obj); - osc_extent_put(env, ext); - - if (unplug) - osc_io_unplug_async(env, osc_cli(obj), obj); - } -} - -/** - * Wait for extents in a specific range to be written out. - * The caller must have called osc_cache_writeback_range() to issue IO - * otherwise it will take a long time for this function to finish. - * - * Caller must hold inode_mutex , or cancel exclusive dlm lock so that - * nobody else can dirty this range of file while we're waiting for - * extents to be written. 
- */ -int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj, - pgoff_t start, pgoff_t end) -{ - struct osc_extent *ext; - pgoff_t index = start; - int result = 0; - -again: - osc_object_lock(obj); - ext = osc_extent_search(obj, index); - if (!ext) - ext = first_extent(obj); - else if (ext->oe_end < index) - ext = next_extent(ext); - while (ext) { - int rc; - - if (ext->oe_start > end) - break; - - if (!ext->oe_fsync_wait) { - ext = next_extent(ext); - continue; - } - - EASSERT(ergo(ext->oe_state == OES_CACHE, - ext->oe_hp || ext->oe_urgent), ext); - EASSERT(ergo(ext->oe_state == OES_ACTIVE, - !ext->oe_hp && ext->oe_urgent), ext); - - index = ext->oe_end + 1; - osc_extent_get(ext); - osc_object_unlock(obj); - - rc = osc_extent_wait(env, ext, OES_INV); - if (result == 0) - result = rc; - osc_extent_put(env, ext); - goto again; - } - osc_object_unlock(obj); - - OSC_IO_DEBUG(obj, "sync file range.\n"); - return result; -} - -/** - * Called to write out a range of osc object. - * - * @hp : should be set this is caused by lock cancel; - * @discard: is set if dirty pages should be dropped - file will be deleted or - * truncated, this implies there is no partially discarding extents. - * - * Return how many pages will be issued, or error code if error occurred. 
- */ -int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, - pgoff_t start, pgoff_t end, int hp, int discard) -{ - struct osc_extent *ext; - LIST_HEAD(discard_list); - bool unplug = false; - int result = 0; - - osc_object_lock(obj); - ext = osc_extent_search(obj, start); - if (!ext) - ext = first_extent(obj); - else if (ext->oe_end < start) - ext = next_extent(ext); - while (ext) { - if (ext->oe_start > end) - break; - - ext->oe_fsync_wait = 1; - switch (ext->oe_state) { - case OES_CACHE: - result += ext->oe_nr_pages; - if (!discard) { - struct list_head *list = NULL; - - if (hp) { - EASSERT(!ext->oe_hp, ext); - ext->oe_hp = 1; - list = &obj->oo_hp_exts; - } else if (!ext->oe_urgent) { - ext->oe_urgent = 1; - list = &obj->oo_urgent_exts; - } - if (list) - list_move_tail(&ext->oe_link, list); - unplug = true; - } else { - /* the only discarder is lock cancelling, so - * [start, end] must contain this extent - */ - EASSERT(ext->oe_start >= start && - ext->oe_max_end <= end, ext); - osc_extent_state_set(ext, OES_LOCKING); - ext->oe_owner = current; - list_move_tail(&ext->oe_link, &discard_list); - osc_update_pending(obj, OBD_BRW_WRITE, - -ext->oe_nr_pages); - } - break; - case OES_ACTIVE: - /* It's pretty bad to wait for ACTIVE extents, because - * we don't know how long we will wait for it to be - * flushed since it may be blocked at awaiting more - * grants. We do this for the correctness of fsync. - */ - LASSERT(hp == 0 && discard == 0); - ext->oe_urgent = 1; - break; - case OES_TRUNC: - /* this extent is being truncated, can't do anything - * for it now. it will be set to urgent after truncate - * is finished in osc_cache_truncate_end(). 
- */ - default: - break; - } - ext = next_extent(ext); - } - osc_object_unlock(obj); - - LASSERT(ergo(!discard, list_empty(&discard_list))); - if (!list_empty(&discard_list)) { - struct osc_extent *tmp; - int rc; - - osc_list_maint(osc_cli(obj), obj); - list_for_each_entry_safe(ext, tmp, &discard_list, oe_link) { - list_del_init(&ext->oe_link); - EASSERT(ext->oe_state == OES_LOCKING, ext); - - /* Discard caching pages. We don't actually write this - * extent out but we complete it as if we did. - */ - rc = osc_extent_make_ready(env, ext); - if (unlikely(rc < 0)) { - OSC_EXTENT_DUMP(D_ERROR, ext, - "make_ready returned %d\n", rc); - if (result >= 0) - result = rc; - } - - /* finish the extent as if the pages were sent */ - osc_extent_finish(env, ext, 0, 0); - } - } - - if (unplug) - osc_io_unplug(env, osc_cli(obj), obj); - - if (hp || discard) { - int rc; - - rc = osc_cache_wait_range(env, obj, start, end); - if (result >= 0 && rc < 0) - result = rc; - } - - OSC_IO_DEBUG(obj, "pageout [%lu, %lu], %d.\n", start, end, result); - return result; -} - -/** - * Returns a list of pages by a given [start, end] of \a obj. - * - * \param resched If not NULL, then we give up before hogging CPU for too - * long and set *resched = 1, in that case caller should implement a retry - * logic. - * - * Gang tree lookup (radix_tree_gang_lookup()) optimization is absolutely - * crucial in the face of [offset, EOF] locks. - * - * Return at least one page in @queue unless there is no covered page. 
- */ -int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, - struct osc_object *osc, pgoff_t start, pgoff_t end, - osc_page_gang_cbt cb, void *cbdata) -{ - struct osc_page *ops; - void **pvec; - pgoff_t idx; - unsigned int nr; - unsigned int i; - unsigned int j; - int res = CLP_GANG_OKAY; - bool tree_lock = true; - - idx = start; - pvec = osc_env_info(env)->oti_pvec; - spin_lock(&osc->oo_tree_lock); - while ((nr = radix_tree_gang_lookup(&osc->oo_tree, pvec, - idx, OTI_PVEC_SIZE)) > 0) { - struct cl_page *page; - bool end_of_region = false; - - for (i = 0, j = 0; i < nr; ++i) { - ops = pvec[i]; - pvec[i] = NULL; - - idx = osc_index(ops); - if (idx > end) { - end_of_region = true; - break; - } - - page = ops->ops_cl.cpl_page; - LASSERT(page->cp_type == CPT_CACHEABLE); - if (page->cp_state == CPS_FREEING) - continue; - - cl_page_get(page); - lu_ref_add_atomic(&page->cp_reference, - "gang_lookup", current); - pvec[j++] = ops; - } - ++idx; - - /* - * Here a delicate locking dance is performed. Current thread - * holds a reference to a page, but has to own it before it - * can be placed into queue. Owning implies waiting, so - * radix-tree lock is to be released. After a wait one has to - * check that pages weren't truncated (cl_page_own() returns - * error in the latter case). - */ - spin_unlock(&osc->oo_tree_lock); - tree_lock = false; - - for (i = 0; i < j; ++i) { - ops = pvec[i]; - if (res == CLP_GANG_OKAY) - res = (*cb)(env, io, ops, cbdata); - - page = ops->ops_cl.cpl_page; - lu_ref_del(&page->cp_reference, "gang_lookup", current); - cl_page_put(env, page); - } - if (nr < OTI_PVEC_SIZE || end_of_region) - break; - - if (res == CLP_GANG_OKAY && need_resched()) - res = CLP_GANG_RESCHED; - if (res != CLP_GANG_OKAY) - break; - - spin_lock(&osc->oo_tree_lock); - tree_lock = true; - } - if (tree_lock) - spin_unlock(&osc->oo_tree_lock); - return res; -} - -/** - * Check if page @page is covered by an extra lock or discard it. 
- */ -static int check_and_discard_cb(const struct lu_env *env, struct cl_io *io, - struct osc_page *ops, void *cbdata) -{ - struct osc_thread_info *info = osc_env_info(env); - struct osc_object *osc = cbdata; - pgoff_t index; - - index = osc_index(ops); - if (index >= info->oti_fn_index) { - struct ldlm_lock *tmp; - struct cl_page *page = ops->ops_cl.cpl_page; - - /* refresh non-overlapped index */ - tmp = osc_dlmlock_at_pgoff(env, osc, index, - OSC_DAP_FL_TEST_LOCK); - if (tmp) { - __u64 end = tmp->l_policy_data.l_extent.end; - /* Cache the first-non-overlapped index so as to skip - * all pages within [index, oti_fn_index). This is safe - * because if tmp lock is canceled, it will discard - * these pages. - */ - info->oti_fn_index = cl_index(osc2cl(osc), end + 1); - if (end == OBD_OBJECT_EOF) - info->oti_fn_index = CL_PAGE_EOF; - LDLM_LOCK_PUT(tmp); - } else if (cl_page_own(env, io, page) == 0) { - /* discard the page */ - cl_page_discard(env, io, page); - cl_page_disown(env, io, page); - } else { - LASSERT(page->cp_state == CPS_FREEING); - } - } - - info->oti_next_index = index + 1; - return CLP_GANG_OKAY; -} - -static int discard_cb(const struct lu_env *env, struct cl_io *io, - struct osc_page *ops, void *cbdata) -{ - struct osc_thread_info *info = osc_env_info(env); - struct cl_page *page = ops->ops_cl.cpl_page; - - /* page is top page. */ - info->oti_next_index = osc_index(ops) + 1; - if (cl_page_own(env, io, page) == 0) { - if (page->cp_type == CPT_CACHEABLE && - PageDirty(cl_page_vmpage(page))) - CL_PAGE_DEBUG(D_ERROR, env, page, - "discard dirty page?\n"); - - /* discard the page */ - cl_page_discard(env, io, page); - cl_page_disown(env, io, page); - } else { - LASSERT(page->cp_state == CPS_FREEING); - } - - return CLP_GANG_OKAY; -} - -/** - * Discard pages protected by the given lock. This function traverses radix - * tree to find all covering pages and discard them. If a page is being covered - * by other locks, it should remain in cache. 
- * - * If error happens on any step, the process continues anyway (the reasoning - * behind this being that lock cancellation cannot be delayed indefinitely). - */ -int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc, - pgoff_t start, pgoff_t end, enum cl_lock_mode mode) -{ - struct osc_thread_info *info = osc_env_info(env); - struct cl_io *io = &info->oti_io; - osc_page_gang_cbt cb; - int res; - int result; - - io->ci_obj = cl_object_top(osc2cl(osc)); - io->ci_ignore_layout = 1; - result = cl_io_init(env, io, CIT_MISC, io->ci_obj); - if (result != 0) - goto out; - - cb = mode == CLM_READ ? check_and_discard_cb : discard_cb; - info->oti_fn_index = start; - info->oti_next_index = start; - do { - res = osc_page_gang_lookup(env, io, osc, - info->oti_next_index, end, cb, osc); - if (info->oti_next_index > end) - break; - - if (res == CLP_GANG_RESCHED) - cond_resched(); - } while (res != CLP_GANG_OKAY); -out: - cl_io_fini(env, io); - return result; -} - -/** @} osc */ diff --git a/drivers/staging/lustre/lustre/osc/osc_cl_internal.h b/drivers/staging/lustre/lustre/osc/osc_cl_internal.h deleted file mode 100644 index 1449013722f6..000000000000 --- a/drivers/staging/lustre/lustre/osc/osc_cl_internal.h +++ /dev/null @@ -1,683 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). 
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Internal interfaces of OSC layer. - * - * Author: Nikita Danilov <nikita.danilov@sun.com> - * Author: Jinshan Xiong <jinshan.xiong@whamcloud.com> - */ - -#ifndef OSC_CL_INTERNAL_H -#define OSC_CL_INTERNAL_H - -#include <linux/libcfs/libcfs.h> - -#include <obd.h> -/* osc_build_res_name() */ -#include <cl_object.h> -#include "osc_internal.h" - -/** \defgroup osc osc - * @{ - */ - -struct osc_extent; - -/** - * State maintained by osc layer for each IO context. - */ -struct osc_io { - /** super class */ - struct cl_io_slice oi_cl; - /** true if this io is lockless. */ - unsigned int oi_lockless:1, - /** true if this io is counted as active IO */ - oi_is_active:1; - /** how many LRU pages are reserved for this IO */ - unsigned long oi_lru_reserved; - - /** active extents, we know how many bytes is going to be written, - * so having an active extent will prevent it from being fragmented - */ - struct osc_extent *oi_active; - /** partially truncated extent, we need to hold this extent to prevent - * page writeback from happening. - */ - struct osc_extent *oi_trunc; - - /** write osc_lock for this IO, used by osc_extent_find(). 
*/ - struct osc_lock *oi_write_osclock; - struct obdo oi_oa; - struct osc_async_cbargs { - bool opc_rpc_sent; - int opc_rc; - struct completion opc_sync; - } oi_cbarg; -}; - -/** - * State maintained by osc layer for the duration of a system call. - */ -struct osc_session { - struct osc_io os_io; -}; - -#define OTI_PVEC_SIZE 256 -struct osc_thread_info { - struct ldlm_res_id oti_resname; - union ldlm_policy_data oti_policy; - struct cl_lock_descr oti_descr; - struct cl_attr oti_attr; - struct lustre_handle oti_handle; - struct cl_page_list oti_plist; - struct cl_io oti_io; - void *oti_pvec[OTI_PVEC_SIZE]; - /** - * Fields used by cl_lock_discard_pages(). - */ - pgoff_t oti_next_index; - pgoff_t oti_fn_index; /* first non-overlapped index */ - struct cl_sync_io oti_anchor; - struct cl_req_attr oti_req_attr; -}; - -struct osc_object { - struct cl_object oo_cl; - struct lov_oinfo *oo_oinfo; - /** - * True if locking against this stripe got -EUSERS. - */ - int oo_contended; - unsigned long oo_contention_time; - /** - * used by the osc to keep track of what objects to build into rpcs. - * Protected by client_obd->cli_loi_list_lock. - */ - struct list_head oo_ready_item; - struct list_head oo_hp_ready_item; - struct list_head oo_write_item; - struct list_head oo_read_item; - - /** - * extent is a red black tree to manage (async) dirty pages. - */ - struct rb_root oo_root; - /** - * Manage write(dirty) extents. - */ - struct list_head oo_hp_exts; /* list of hp extents */ - struct list_head oo_urgent_exts; /* list of writeback extents */ - struct list_head oo_rpc_exts; - - struct list_head oo_reading_exts; - - atomic_t oo_nr_reads; - atomic_t oo_nr_writes; - - /** Protect extent tree. Will be used to protect - * oo_{read|write}_pages soon. 
- */ - spinlock_t oo_lock; - - /** - * Radix tree for caching pages - */ - struct radix_tree_root oo_tree; - spinlock_t oo_tree_lock; - unsigned long oo_npages; - - /* Protect osc_lock this osc_object has */ - spinlock_t oo_ol_spin; - struct list_head oo_ol_list; - - /** number of active IOs of this object */ - atomic_t oo_nr_ios; - wait_queue_head_t oo_io_waitq; -}; - -static inline void osc_object_lock(struct osc_object *obj) -{ - spin_lock(&obj->oo_lock); -} - -static inline int osc_object_trylock(struct osc_object *obj) -{ - return spin_trylock(&obj->oo_lock); -} - -static inline void osc_object_unlock(struct osc_object *obj) -{ - spin_unlock(&obj->oo_lock); -} - -static inline int osc_object_is_locked(struct osc_object *obj) -{ -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) - return spin_is_locked(&obj->oo_lock); -#else - /* - * It is not perfect to return true all the time. - * But since this function is only used for assertion - * and checking, it seems OK. - */ - return 1; -#endif -} - -/* - * Lock "micro-states" for osc layer. - */ -enum osc_lock_state { - OLS_NEW, - OLS_ENQUEUED, - OLS_UPCALL_RECEIVED, - OLS_GRANTED, - OLS_CANCELLED -}; - -/** - * osc-private state of cl_lock. - * - * Interaction with DLM. - * - * Once receive upcall is invoked, osc_lock remembers a handle of DLM lock in - * osc_lock::ols_handle and a pointer to that lock in osc_lock::ols_dlmlock. - * - * This pointer is protected through a reference, acquired by - * osc_lock_upcall0(). Also, an additional reference is acquired by - * ldlm_lock_addref() call protecting the lock from cancellation, until - * osc_lock_unuse() releases it. - * - * Below is a description of how lock references are acquired and released - * inside of DLM. - * - * - When new lock is created and enqueued to the server (ldlm_cli_enqueue()) - * - ldlm_lock_create() - * - ldlm_lock_new(): initializes a lock with 2 references. 
One for - * the caller (released when reply from the server is received, or on - * error), and another for the hash table. - * - ldlm_lock_addref_internal(): protects the lock from cancellation. - * - * - When reply is received from the server (osc_enqueue_interpret()) - * - ldlm_cli_enqueue_fini() - * - LDLM_LOCK_PUT(): releases caller reference acquired by - * ldlm_lock_new(). - * - if (rc != 0) - * ldlm_lock_decref(): error case: matches ldlm_cli_enqueue(). - * - ldlm_lock_decref(): for async locks, matches ldlm_cli_enqueue(). - * - * - When lock is being cancelled (ldlm_lock_cancel()) - * - ldlm_lock_destroy() - * - LDLM_LOCK_PUT(): releases hash-table reference acquired by - * ldlm_lock_new(). - * - * osc_lock is detached from ldlm_lock by osc_lock_detach() that is called - * either when lock is cancelled (osc_lock_blocking()), or when locks is - * deleted without cancellation (e.g., from cl_locks_prune()). In the latter - * case ldlm lock remains in memory, and can be re-attached to osc_lock in the - * future. - */ -struct osc_lock { - struct cl_lock_slice ols_cl; - /** Internal lock to protect states, etc. */ - spinlock_t ols_lock; - /** Owner sleeps on this channel for state change */ - struct cl_sync_io *ols_owner; - /** waiting list for this lock to be cancelled */ - struct list_head ols_waiting_list; - /** wait entry of ols_waiting_list */ - struct list_head ols_wait_entry; - /** list entry for osc_object::oo_ol_list */ - struct list_head ols_nextlock_oscobj; - - /** underlying DLM lock */ - struct ldlm_lock *ols_dlmlock; - /** DLM flags with which osc_lock::ols_lock was enqueued */ - __u64 ols_flags; - /** osc_lock::ols_lock handle */ - struct lustre_handle ols_handle; - struct ldlm_enqueue_info ols_einfo; - enum osc_lock_state ols_state; - /** lock value block */ - struct ost_lvb ols_lvb; - - /** - * true, if ldlm_lock_addref() was called against - * osc_lock::ols_lock. This is used for sanity checking. 
- * - * \see osc_lock::ols_has_ref - */ - unsigned ols_hold :1, - /** - * this is much like osc_lock::ols_hold, except that this bit is - * cleared _after_ reference in released in osc_lock_unuse(). This - * fine distinction is needed because: - * - * - if ldlm lock still has a reference, osc_ast_data_get() needs - * to return associated cl_lock (so that a flag is needed that is - * cleared after ldlm_lock_decref() returned), and - * - * - ldlm_lock_decref() can invoke blocking ast (for a - * LDLM_FL_CBPENDING lock), and osc_lock functions like - * osc_lock_cancel() called from there need to know whether to - * release lock reference (so that a flag is needed that is - * cleared before ldlm_lock_decref() is called). - */ - ols_has_ref:1, - /** - * inherit the lockless attribute from top level cl_io. - * If true, osc_lock_enqueue is able to tolerate the -EUSERS error. - */ - ols_locklessable:1, - /** - * if set, the osc_lock is a glimpse lock. For glimpse locks, we treat - * the EVAVAIL error as tolerable, this will make upper logic happy - * to wait all glimpse locks to each OSTs to be completed. - * Glimpse lock converts to normal lock if the server lock is - * granted. - * Glimpse lock should be destroyed immediately after use. - */ - ols_glimpse:1, - /** - * For async glimpse lock. - */ - ols_agl:1; -}; - -/** - * Page state private for osc layer. - */ -struct osc_page { - struct cl_page_slice ops_cl; - /** - * Page queues used by osc to detect when RPC can be formed. - */ - struct osc_async_page ops_oap; - /** - * An offset within page from which next transfer starts. This is used - * by cl_page_clip() to submit partial page transfers. - */ - int ops_from; - /** - * An offset within page at which next transfer ends. - * - * \see osc_page::ops_from. - */ - int ops_to; - /** - * Boolean, true iff page is under transfer. Used for sanity checking. - */ - unsigned ops_transfer_pinned:1, - /** - * in LRU? 
- */ - ops_in_lru:1, - /** - * Set if the page must be transferred with OBD_BRW_SRVLOCK. - */ - ops_srvlock:1; - /** - * lru page list. See osc_lru_{del|use}() in osc_page.c for usage. - */ - struct list_head ops_lru; - /** - * Submit time - the time when the page is starting RPC. For debugging. - */ - unsigned long ops_submit_time; -}; - -extern struct kmem_cache *osc_lock_kmem; -extern struct kmem_cache *osc_object_kmem; -extern struct kmem_cache *osc_thread_kmem; -extern struct kmem_cache *osc_session_kmem; -extern struct kmem_cache *osc_extent_kmem; - -extern struct lu_device_type osc_device_type; -extern struct lu_context_key osc_key; -extern struct lu_context_key osc_session_key; - -#define OSC_FLAGS (ASYNC_URGENT | ASYNC_READY) - -int osc_lock_init(const struct lu_env *env, - struct cl_object *obj, struct cl_lock *lock, - const struct cl_io *io); -int osc_io_init(const struct lu_env *env, - struct cl_object *obj, struct cl_io *io); -struct lu_object *osc_object_alloc(const struct lu_env *env, - const struct lu_object_header *hdr, - struct lu_device *dev); -int osc_page_init(const struct lu_env *env, struct cl_object *obj, - struct cl_page *page, pgoff_t ind); - -void osc_index2policy(union ldlm_policy_data *policy, - const struct cl_object *obj, - pgoff_t start, pgoff_t end); -int osc_lvb_print(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct ost_lvb *lvb); - -void osc_lru_add_batch(struct client_obd *cli, struct list_head *list); -void osc_page_submit(const struct lu_env *env, struct osc_page *opg, - enum cl_req_type crt, int brw_flags); -int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops); -int osc_set_async_flags(struct osc_object *obj, struct osc_page *opg, - u32 async_flags); -int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, - struct page *page, loff_t offset); -int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, - struct osc_page *ops); -int osc_page_cache_add(const struct 
lu_env *env, - const struct cl_page_slice *slice, struct cl_io *io); -int osc_teardown_async_page(const struct lu_env *env, struct osc_object *obj, - struct osc_page *ops); -int osc_flush_async_page(const struct lu_env *env, struct cl_io *io, - struct osc_page *ops); -int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj, - struct list_head *list, int cmd, int brw_flags); -int osc_cache_truncate_start(const struct lu_env *env, struct osc_object *obj, - u64 size, struct osc_extent **extp); -void osc_cache_truncate_end(const struct lu_env *env, struct osc_extent *ext); -int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, - pgoff_t start, pgoff_t end, int hp, int discard); -int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj, - pgoff_t start, pgoff_t end); -void osc_io_unplug(const struct lu_env *env, struct client_obd *cli, - struct osc_object *osc); -int lru_queue_work(const struct lu_env *env, void *data); - -void osc_object_set_contended(struct osc_object *obj); -void osc_object_clear_contended(struct osc_object *obj); -int osc_object_is_contended(struct osc_object *obj); - -int osc_lock_is_lockless(const struct osc_lock *olck); - -/***************************************************************************** - * - * Accessors. 
- * - */ - -static inline struct osc_thread_info *osc_env_info(const struct lu_env *env) -{ - struct osc_thread_info *info; - - info = lu_context_key_get(&env->le_ctx, &osc_key); - LASSERT(info); - return info; -} - -static inline struct osc_session *osc_env_session(const struct lu_env *env) -{ - struct osc_session *ses; - - ses = lu_context_key_get(env->le_ses, &osc_session_key); - LASSERT(ses); - return ses; -} - -static inline struct osc_io *osc_env_io(const struct lu_env *env) -{ - return &osc_env_session(env)->os_io; -} - -static inline int osc_is_object(const struct lu_object *obj) -{ - return obj->lo_dev->ld_type == &osc_device_type; -} - -static inline struct osc_device *lu2osc_dev(const struct lu_device *d) -{ - LINVRNT(d->ld_type == &osc_device_type); - return container_of0(d, struct osc_device, od_cl.cd_lu_dev); -} - -static inline struct obd_export *osc_export(const struct osc_object *obj) -{ - return lu2osc_dev(obj->oo_cl.co_lu.lo_dev)->od_exp; -} - -static inline struct client_obd *osc_cli(const struct osc_object *obj) -{ - return &osc_export(obj)->exp_obd->u.cli; -} - -static inline struct osc_object *cl2osc(const struct cl_object *obj) -{ - LINVRNT(osc_is_object(&obj->co_lu)); - return container_of0(obj, struct osc_object, oo_cl); -} - -static inline struct cl_object *osc2cl(const struct osc_object *obj) -{ - return (struct cl_object *)&obj->oo_cl; -} - -static inline enum ldlm_mode osc_cl_lock2ldlm(enum cl_lock_mode mode) -{ - LASSERT(mode == CLM_READ || mode == CLM_WRITE || mode == CLM_GROUP); - if (mode == CLM_READ) - return LCK_PR; - else if (mode == CLM_WRITE) - return LCK_PW; - else - return LCK_GROUP; -} - -static inline enum cl_lock_mode osc_ldlm2cl_lock(enum ldlm_mode mode) -{ - LASSERT(mode == LCK_PR || mode == LCK_PW || mode == LCK_GROUP); - if (mode == LCK_PR) - return CLM_READ; - else if (mode == LCK_PW) - return CLM_WRITE; - else - return CLM_GROUP; -} - -static inline struct osc_page *cl2osc_page(const struct cl_page_slice *slice) -{ 
- LINVRNT(osc_is_object(&slice->cpl_obj->co_lu)); - return container_of0(slice, struct osc_page, ops_cl); -} - -static inline struct osc_page *oap2osc(struct osc_async_page *oap) -{ - return container_of0(oap, struct osc_page, ops_oap); -} - -static inline pgoff_t osc_index(struct osc_page *opg) -{ - return opg->ops_cl.cpl_index; -} - -static inline struct cl_page *oap2cl_page(struct osc_async_page *oap) -{ - return oap2osc(oap)->ops_cl.cpl_page; -} - -static inline struct osc_page *oap2osc_page(struct osc_async_page *oap) -{ - return (struct osc_page *)container_of(oap, struct osc_page, ops_oap); -} - -static inline struct osc_page * -osc_cl_page_osc(struct cl_page *page, struct osc_object *osc) -{ - const struct cl_page_slice *slice; - - LASSERT(osc); - slice = cl_object_page_slice(&osc->oo_cl, page); - return cl2osc_page(slice); -} - -static inline struct osc_lock *cl2osc_lock(const struct cl_lock_slice *slice) -{ - LINVRNT(osc_is_object(&slice->cls_obj->co_lu)); - return container_of0(slice, struct osc_lock, ols_cl); -} - -static inline struct osc_lock *osc_lock_at(const struct cl_lock *lock) -{ - return cl2osc_lock(cl_lock_at(lock, &osc_device_type)); -} - -static inline int osc_io_srvlock(struct osc_io *oio) -{ - return (oio->oi_lockless && !oio->oi_cl.cis_io->ci_no_srvlock); -} - -enum osc_extent_state { - OES_INV = 0, /** extent is just initialized or destroyed */ - OES_ACTIVE = 1, /** process is using this extent */ - OES_CACHE = 2, /** extent is ready for IO */ - OES_LOCKING = 3, /** locking page to prepare IO */ - OES_LOCK_DONE = 4, /** locking finished, ready to send */ - OES_RPC = 5, /** in RPC */ - OES_TRUNC = 6, /** being truncated */ - OES_STATE_MAX -}; - -/** - * osc_extent data to manage dirty pages. - * osc_extent has the following attributes: - * 1. all pages in the same must be in one RPC in write back; - * 2. # of pages must be less than max_pages_per_rpc - implied by 1; - * 3. must be covered by only 1 osc_lock; - * 4. exclusive. 
It's impossible to have overlapped osc_extent. - * - * The lifetime of an extent is from when the 1st page is dirtied to when - * all pages inside it are written out. - * - * LOCKING ORDER - * ============= - * page lock -> cl_loi_list_lock -> object lock(osc_object::oo_lock) - */ -struct osc_extent { - /** red-black tree node */ - struct rb_node oe_node; - /** osc_object of this extent */ - struct osc_object *oe_obj; - /** refcount, removed from red-black tree if reaches zero. */ - atomic_t oe_refc; - /** busy if non-zero */ - atomic_t oe_users; - /** link list of osc_object's oo_{hp|urgent|locking}_exts. */ - struct list_head oe_link; - /** state of this extent */ - enum osc_extent_state oe_state; - /** flags for this extent. */ - unsigned int oe_intree:1, - /** 0 is write, 1 is read */ - oe_rw:1, - /** sync extent, queued by osc_queue_sync_pages() */ - oe_sync:1, - /** set if this extent has partial, sync pages. - * Extents with partial page(s) can't merge with others in RPC - */ - oe_no_merge:1, - oe_srvlock:1, - oe_memalloc:1, - /** an ACTIVE extent is going to be truncated, so when this extent - * is released, it will turn into TRUNC state instead of CACHE. - */ - oe_trunc_pending:1, - /** this extent should be written asap and someone may wait for the - * write to finish. This bit is usually set along with urgent if - * the extent was CACHE state. - * fsync_wait extent can't be merged because new extent region may - * exceed fsync range. - */ - oe_fsync_wait:1, - /** covering lock is being canceled */ - oe_hp:1, - /** this extent should be written back asap. set if one of pages is - * called by page WB daemon, or sync write or reading requests. - */ - oe_urgent:1; - /** how many grants allocated for this extent. - * Grant allocated for this extent. There is no grant allocated - * for reading extents and sync write extents. - */ - unsigned int oe_grants; - /** # of dirty pages in this extent */ - unsigned int oe_nr_pages; - /** list of pending oap pages. 
Pages in this list are NOT sorted. */ - struct list_head oe_pages; - /** Since an extent has to be written out in atomic, this is used to - * remember the next page need to be locked to write this extent out. - * Not used right now. - */ - struct osc_page *oe_next_page; - /** start and end index of this extent, include start and end - * themselves. Page offset here is the page index of osc_pages. - * oe_start is used as keyword for red-black tree. - */ - pgoff_t oe_start; - pgoff_t oe_end; - /** maximum ending index of this extent, this is limited by - * max_pages_per_rpc, lock extent and chunk size. - */ - pgoff_t oe_max_end; - /** waitqueue - for those who want to be notified if this extent's - * state has changed. - */ - wait_queue_head_t oe_waitq; - /** lock covering this extent */ - struct ldlm_lock *oe_dlmlock; - /** terminator of this extent. Must be true if this extent is in IO. */ - struct task_struct *oe_owner; - /** return value of writeback. If somebody is waiting for this extent, - * this value can be known by outside world. 
- */ - int oe_rc; - /** max pages per rpc when this extent was created */ - unsigned int oe_mppr; -}; - -int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext, - int sent, int rc); -void osc_extent_release(const struct lu_env *env, struct osc_extent *ext); - -int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc, - pgoff_t start, pgoff_t end, enum cl_lock_mode mode); - -typedef int (*osc_page_gang_cbt)(const struct lu_env *, struct cl_io *, - struct osc_page *, void *); -int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, - struct osc_object *osc, pgoff_t start, pgoff_t end, - osc_page_gang_cbt cb, void *cbdata); -/** @} osc */ - -#endif /* OSC_CL_INTERNAL_H */ diff --git a/drivers/staging/lustre/lustre/osc/osc_dev.c b/drivers/staging/lustre/lustre/osc/osc_dev.c deleted file mode 100644 index 2b5f324743e2..000000000000 --- a/drivers/staging/lustre/lustre/osc/osc_dev.c +++ /dev/null @@ -1,246 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. 
- * - * Copyright (c) 2012, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Implementation of cl_device, for OSC layer. - * - * Author: Nikita Danilov <nikita.danilov@sun.com> - */ - -#define DEBUG_SUBSYSTEM S_OSC - -/* class_name2obd() */ -#include <obd_class.h> - -#include "osc_cl_internal.h" - -/** \addtogroup osc - * @{ - */ - -struct kmem_cache *osc_lock_kmem; -struct kmem_cache *osc_object_kmem; -struct kmem_cache *osc_thread_kmem; -struct kmem_cache *osc_session_kmem; -struct kmem_cache *osc_extent_kmem; -struct kmem_cache *osc_quota_kmem; - -struct lu_kmem_descr osc_caches[] = { - { - .ckd_cache = &osc_lock_kmem, - .ckd_name = "osc_lock_kmem", - .ckd_size = sizeof(struct osc_lock) - }, - { - .ckd_cache = &osc_object_kmem, - .ckd_name = "osc_object_kmem", - .ckd_size = sizeof(struct osc_object) - }, - { - .ckd_cache = &osc_thread_kmem, - .ckd_name = "osc_thread_kmem", - .ckd_size = sizeof(struct osc_thread_info) - }, - { - .ckd_cache = &osc_session_kmem, - .ckd_name = "osc_session_kmem", - .ckd_size = sizeof(struct osc_session) - }, - { - .ckd_cache = &osc_extent_kmem, - .ckd_name = "osc_extent_kmem", - .ckd_size = sizeof(struct osc_extent) - }, - { - .ckd_cache = &osc_quota_kmem, - .ckd_name = "osc_quota_kmem", - .ckd_size = sizeof(struct osc_quota_info) - }, - { - .ckd_cache = NULL - } -}; - -/***************************************************************************** - * - * Type conversions. - * - */ - -static struct lu_device *osc2lu_dev(struct osc_device *osc) -{ - return &osc->od_cl.cd_lu_dev; -} - -/***************************************************************************** - * - * Osc device and device type functions. 
- * - */ - -static void *osc_key_init(const struct lu_context *ctx, - struct lu_context_key *key) -{ - struct osc_thread_info *info; - - info = kmem_cache_zalloc(osc_thread_kmem, GFP_NOFS); - if (!info) - info = ERR_PTR(-ENOMEM); - return info; -} - -static void osc_key_fini(const struct lu_context *ctx, - struct lu_context_key *key, void *data) -{ - struct osc_thread_info *info = data; - - kmem_cache_free(osc_thread_kmem, info); -} - -struct lu_context_key osc_key = { - .lct_tags = LCT_CL_THREAD, - .lct_init = osc_key_init, - .lct_fini = osc_key_fini -}; - -static void *osc_session_init(const struct lu_context *ctx, - struct lu_context_key *key) -{ - struct osc_session *info; - - info = kmem_cache_zalloc(osc_session_kmem, GFP_NOFS); - if (!info) - info = ERR_PTR(-ENOMEM); - return info; -} - -static void osc_session_fini(const struct lu_context *ctx, - struct lu_context_key *key, void *data) -{ - struct osc_session *info = data; - - kmem_cache_free(osc_session_kmem, info); -} - -struct lu_context_key osc_session_key = { - .lct_tags = LCT_SESSION, - .lct_init = osc_session_init, - .lct_fini = osc_session_fini -}; - -/* type constructor/destructor: osc_type_{init,fini,start,stop}(). 
*/ -LU_TYPE_INIT_FINI(osc, &osc_key, &osc_session_key); - -static int osc_cl_process_config(const struct lu_env *env, - struct lu_device *d, struct lustre_cfg *cfg) -{ - return osc_process_config_base(d->ld_obd, cfg); -} - -static const struct lu_device_operations osc_lu_ops = { - .ldo_object_alloc = osc_object_alloc, - .ldo_process_config = osc_cl_process_config, - .ldo_recovery_complete = NULL -}; - -static int osc_device_init(const struct lu_env *env, struct lu_device *d, - const char *name, struct lu_device *next) -{ - return 0; -} - -static struct lu_device *osc_device_fini(const struct lu_env *env, - struct lu_device *d) -{ - return NULL; -} - -static struct lu_device *osc_device_free(const struct lu_env *env, - struct lu_device *d) -{ - struct osc_device *od = lu2osc_dev(d); - - cl_device_fini(lu2cl_dev(d)); - kfree(od); - return NULL; -} - -static struct lu_device *osc_device_alloc(const struct lu_env *env, - struct lu_device_type *t, - struct lustre_cfg *cfg) -{ - struct lu_device *d; - struct osc_device *od; - struct obd_device *obd; - int rc; - - od = kzalloc(sizeof(*od), GFP_NOFS); - if (!od) - return ERR_PTR(-ENOMEM); - - cl_device_init(&od->od_cl, t); - d = osc2lu_dev(od); - d->ld_ops = &osc_lu_ops; - - /* Setup OSC OBD */ - obd = class_name2obd(lustre_cfg_string(cfg, 0)); - LASSERT(obd); - rc = osc_setup(obd, cfg); - if (rc) { - osc_device_free(env, d); - return ERR_PTR(rc); - } - od->od_exp = obd->obd_self_export; - return d; -} - -static const struct lu_device_type_operations osc_device_type_ops = { - .ldto_init = osc_type_init, - .ldto_fini = osc_type_fini, - - .ldto_start = osc_type_start, - .ldto_stop = osc_type_stop, - - .ldto_device_alloc = osc_device_alloc, - .ldto_device_free = osc_device_free, - - .ldto_device_init = osc_device_init, - .ldto_device_fini = osc_device_fini -}; - -struct lu_device_type osc_device_type = { - .ldt_tags = LU_DEVICE_CL, - .ldt_name = LUSTRE_OSC_NAME, - .ldt_ops = &osc_device_type_ops, - .ldt_ctx_tags = 
LCT_CL_THREAD -}; - -/** @} osc */ diff --git a/drivers/staging/lustre/lustre/osc/osc_internal.h b/drivers/staging/lustre/lustre/osc/osc_internal.h deleted file mode 100644 index 32db150fd42e..000000000000 --- a/drivers/staging/lustre/lustre/osc/osc_internal.h +++ /dev/null @@ -1,236 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- */ - -#ifndef OSC_INTERNAL_H -#define OSC_INTERNAL_H - -#define OAP_MAGIC 8675309 - -extern atomic_t osc_pool_req_count; -extern unsigned int osc_reqpool_maxreqcount; -extern struct ptlrpc_request_pool *osc_rq_pool; - -struct lu_env; - -enum async_flags { - ASYNC_READY = 0x1, /* ap_make_ready will not be called before this - * page is added to an rpc - */ - ASYNC_URGENT = 0x2, /* page must be put into an RPC before return */ - ASYNC_COUNT_STABLE = 0x4, /* ap_refresh_count will not be called - * to give the caller a chance to update - * or cancel the size of the io - */ - ASYNC_HP = 0x10, -}; - -struct osc_async_page { - int oap_magic; - unsigned short oap_cmd; - unsigned short oap_interrupted:1; - - struct list_head oap_pending_item; - struct list_head oap_rpc_item; - - u64 oap_obj_off; - unsigned int oap_page_off; - enum async_flags oap_async_flags; - - struct brw_page oap_brw_page; - - struct ptlrpc_request *oap_request; - struct client_obd *oap_cli; - struct osc_object *oap_obj; - - spinlock_t oap_lock; -}; - -#define oap_page oap_brw_page.pg -#define oap_count oap_brw_page.count -#define oap_brw_flags oap_brw_page.flag - -static inline struct osc_async_page *brw_page2oap(struct brw_page *pga) -{ - return (struct osc_async_page *)container_of(pga, struct osc_async_page, - oap_brw_page); -} - -struct osc_cache_waiter { - struct list_head ocw_entry; - wait_queue_head_t ocw_waitq; - struct osc_async_page *ocw_oap; - int ocw_grant; - int ocw_rc; -}; - -void osc_wake_cache_waiters(struct client_obd *cli); -int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes); -void osc_update_next_shrink(struct client_obd *cli); - -/* - * cl integration. 
- */ -#include <cl_object.h> - -extern struct ptlrpc_request_set *PTLRPCD_SET; - -typedef int (*osc_enqueue_upcall_f)(void *cookie, struct lustre_handle *lockh, - int rc); - -int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, - __u64 *flags, union ldlm_policy_data *policy, - struct ost_lvb *lvb, int kms_valid, - osc_enqueue_upcall_f upcall, - void *cookie, struct ldlm_enqueue_info *einfo, - struct ptlrpc_request_set *rqset, int async, int agl); - -int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id, - enum ldlm_type type, union ldlm_policy_data *policy, - enum ldlm_mode mode, __u64 *flags, void *data, - struct lustre_handle *lockh, int unref); - -int osc_setattr_async(struct obd_export *exp, struct obdo *oa, - obd_enqueue_update_f upcall, void *cookie, - struct ptlrpc_request_set *rqset); -int osc_punch_base(struct obd_export *exp, struct obdo *oa, - obd_enqueue_update_f upcall, void *cookie, - struct ptlrpc_request_set *rqset); -int osc_sync_base(struct osc_object *exp, struct obdo *oa, - obd_enqueue_update_f upcall, void *cookie, - struct ptlrpc_request_set *rqset); - -int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *cfg); -int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, - struct list_head *ext_list, int cmd); -long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli, - long target, bool force); -unsigned long osc_lru_reserve(struct client_obd *cli, unsigned long npages); -void osc_lru_unreserve(struct client_obd *cli, unsigned long npages); - -unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock); - -int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg); - -int lproc_osc_attach_seqstat(struct obd_device *dev); -void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars); - -extern struct lu_device_type osc_device_type; - -static inline int osc_recoverable_error(int rc) -{ - return (rc == -EIO || rc == -EROFS || rc == -ENOMEM || - rc == -EAGAIN || rc == 
-EINPROGRESS); -} - -static inline unsigned long rpcs_in_flight(struct client_obd *cli) -{ - return cli->cl_r_in_flight + cli->cl_w_in_flight; -} - -static inline char *cli_name(struct client_obd *cli) -{ - return cli->cl_import->imp_obd->obd_name; -} - -struct osc_device { - struct cl_device od_cl; - struct obd_export *od_exp; - - /* Write stats is actually protected by client_obd's lock. */ - struct osc_stats { - u64 os_lockless_writes; /* by bytes */ - u64 os_lockless_reads; /* by bytes */ - u64 os_lockless_truncates; /* by times */ - } od_stats; - - /* configuration item(s) */ - int od_contention_time; - int od_lockless_truncate; -}; - -static inline struct osc_device *obd2osc_dev(const struct obd_device *d) -{ - return container_of0(d->obd_lu_dev, struct osc_device, od_cl.cd_lu_dev); -} - -extern struct lu_kmem_descr osc_caches[]; - -extern struct kmem_cache *osc_quota_kmem; -struct osc_quota_info { - /** linkage for quota hash table */ - struct hlist_node oqi_hash; - u32 oqi_id; -}; - -int osc_quota_setup(struct obd_device *obd); -int osc_quota_cleanup(struct obd_device *obd); -int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[], - u32 valid, u32 flags); -int osc_quota_chkdq(struct client_obd *cli, const unsigned int qid[]); -int osc_quotactl(struct obd_device *unused, struct obd_export *exp, - struct obd_quotactl *oqctl); -void osc_inc_unstable_pages(struct ptlrpc_request *req); -void osc_dec_unstable_pages(struct ptlrpc_request *req); -bool osc_over_unstable_soft_limit(struct client_obd *cli); - -/** - * Bit flags for osc_dlm_lock_at_pageoff(). - */ -enum osc_dap_flags { - /** - * Just check if the desired lock exists, it won't hold reference - * count on lock. - */ - OSC_DAP_FL_TEST_LOCK = BIT(0), - /** - * Return the lock even if it is being canceled. 
- */ - OSC_DAP_FL_CANCELING = BIT(1), -}; - -struct ldlm_lock *osc_dlmlock_at_pgoff(const struct lu_env *env, - struct osc_object *obj, pgoff_t index, - enum osc_dap_flags flags); - -int osc_object_invalidate(const struct lu_env *env, struct osc_object *osc); - -/** osc shrink list to link all osc client obd */ -extern struct list_head osc_shrink_list; -/** spin lock to protect osc_shrink_list */ -extern spinlock_t osc_shrink_lock; -unsigned long osc_cache_shrink_count(struct shrinker *sk, - struct shrink_control *sc); -unsigned long osc_cache_shrink_scan(struct shrinker *sk, - struct shrink_control *sc); - -#endif /* OSC_INTERNAL_H */ diff --git a/drivers/staging/lustre/lustre/osc/osc_io.c b/drivers/staging/lustre/lustre/osc/osc_io.c deleted file mode 100644 index 76743faf3e6d..000000000000 --- a/drivers/staging/lustre/lustre/osc/osc_io.c +++ /dev/null @@ -1,918 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. 
- */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Implementation of cl_io for OSC layer. - * - * Author: Nikita Danilov <nikita.danilov@sun.com> - * Author: Jinshan Xiong <jinshan.xiong@whamcloud.com> - */ - -#define DEBUG_SUBSYSTEM S_OSC - -#include <lustre_obdo.h> - -#include "osc_cl_internal.h" - -/** \addtogroup osc - * @{ - */ - -/***************************************************************************** - * - * Type conversions. - * - */ - -static struct osc_io *cl2osc_io(const struct lu_env *env, - const struct cl_io_slice *slice) -{ - struct osc_io *oio = container_of0(slice, struct osc_io, oi_cl); - - LINVRNT(oio == osc_env_io(env)); - return oio; -} - -/***************************************************************************** - * - * io operations. - * - */ - -static void osc_io_fini(const struct lu_env *env, const struct cl_io_slice *io) -{ -} - -static void osc_read_ahead_release(const struct lu_env *env, void *cbdata) -{ - struct ldlm_lock *dlmlock = cbdata; - struct lustre_handle lockh; - - ldlm_lock2handle(dlmlock, &lockh); - ldlm_lock_decref(&lockh, LCK_PR); - LDLM_LOCK_PUT(dlmlock); -} - -static int osc_io_read_ahead(const struct lu_env *env, - const struct cl_io_slice *ios, - pgoff_t start, struct cl_read_ahead *ra) -{ - struct osc_object *osc = cl2osc(ios->cis_obj); - struct ldlm_lock *dlmlock; - int result = -ENODATA; - - dlmlock = osc_dlmlock_at_pgoff(env, osc, start, 0); - if (dlmlock) { - LASSERT(dlmlock->l_ast_data == osc); - if (dlmlock->l_req_mode != LCK_PR) { - struct lustre_handle lockh; - - ldlm_lock2handle(dlmlock, &lockh); - ldlm_lock_addref(&lockh, LCK_PR); - ldlm_lock_decref(&lockh, dlmlock->l_req_mode); - } - - ra->cra_rpc_size = osc_cli(osc)->cl_max_pages_per_rpc; - ra->cra_end = cl_index(osc2cl(osc), - dlmlock->l_policy_data.l_extent.end); - ra->cra_release = osc_read_ahead_release; - ra->cra_cbdata = dlmlock; - result = 0; - } - - return result; -} 
- -/** - * An implementation of cl_io_operations::cio_io_submit() method for osc - * layer. Iterates over pages in the in-queue, prepares each for io by calling - * cl_page_prep() and then either submits them through osc_io_submit_page() - * or, if page is already submitted, changes osc flags through - * osc_set_async_flags(). - */ -static int osc_io_submit(const struct lu_env *env, - const struct cl_io_slice *ios, - enum cl_req_type crt, struct cl_2queue *queue) -{ - struct cl_page *page; - struct cl_page *tmp; - struct client_obd *cli = NULL; - struct osc_object *osc = NULL; /* to keep gcc happy */ - struct osc_page *opg; - struct cl_io *io; - LIST_HEAD(list); - - struct cl_page_list *qin = &queue->c2_qin; - struct cl_page_list *qout = &queue->c2_qout; - unsigned int queued = 0; - int result = 0; - int cmd; - int brw_flags; - unsigned int max_pages; - - LASSERT(qin->pl_nr > 0); - - CDEBUG(D_CACHE | D_READA, "%d %d\n", qin->pl_nr, crt); - - osc = cl2osc(ios->cis_obj); - cli = osc_cli(osc); - max_pages = cli->cl_max_pages_per_rpc; - - cmd = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ; - brw_flags = osc_io_srvlock(cl2osc_io(env, ios)) ? OBD_BRW_SRVLOCK : 0; - - /* - * NOTE: here @page is a top-level page. This is done to avoid - * creation of sub-page-list. - */ - cl_page_list_for_each_safe(page, tmp, qin) { - struct osc_async_page *oap; - - /* Top level IO. */ - io = page->cp_owner; - LASSERT(io); - - opg = osc_cl_page_osc(page, osc); - oap = &opg->ops_oap; - LASSERT(osc == oap->oap_obj); - - if (!list_empty(&oap->oap_pending_item) || - !list_empty(&oap->oap_rpc_item)) { - CDEBUG(D_CACHE, "Busy oap %p page %p for submit.\n", - oap, opg); - result = -EBUSY; - break; - } - - result = cl_page_prep(env, io, page, crt); - if (result != 0) { - LASSERT(result < 0); - if (result != -EALREADY) - break; - /* - * Handle -EALREADY error: for read case, the page is - * already in UPTODATE state; for write, the page - * is not dirty. 
- */ - result = 0; - continue; - } - - spin_lock(&oap->oap_lock); - oap->oap_async_flags = ASYNC_URGENT | ASYNC_READY; - oap->oap_async_flags |= ASYNC_COUNT_STABLE; - spin_unlock(&oap->oap_lock); - - osc_page_submit(env, opg, crt, brw_flags); - list_add_tail(&oap->oap_pending_item, &list); - - if (page->cp_sync_io) - cl_page_list_move(qout, qin, page); - else /* async IO */ - cl_page_list_del(env, qin, page); - - if (++queued == max_pages) { - queued = 0; - result = osc_queue_sync_pages(env, osc, &list, cmd, - brw_flags); - if (result < 0) - break; - } - } - - if (queued > 0) - result = osc_queue_sync_pages(env, osc, &list, cmd, brw_flags); - - /* Update c/mtime for sync write. LU-7310 */ - if (qout->pl_nr > 0 && !result) { - struct cl_attr *attr = &osc_env_info(env)->oti_attr; - struct cl_object *obj = ios->cis_obj; - - cl_object_attr_lock(obj); - attr->cat_mtime = ktime_get_real_seconds(); - attr->cat_ctime = attr->cat_mtime; - cl_object_attr_update(env, obj, attr, CAT_MTIME | CAT_CTIME); - cl_object_attr_unlock(obj); - } - - CDEBUG(D_INFO, "%d/%d %d\n", qin->pl_nr, qout->pl_nr, result); - return qout->pl_nr > 0 ? 0 : result; -} - -/** - * This is called when a page is accessed within file in a way that creates - * new page, if one were missing (i.e., if there were a hole at that place in - * the file, or accessed page is beyond the current file size). - * - * Expand stripe KMS if necessary. - */ -static void osc_page_touch_at(const struct lu_env *env, - struct cl_object *obj, pgoff_t idx, size_t to) -{ - struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; - struct cl_attr *attr = &osc_env_info(env)->oti_attr; - int valid; - __u64 kms; - - /* offset within stripe */ - kms = cl_offset(obj, idx) + to; - - cl_object_attr_lock(obj); - /* - * XXX old code used - * - * ll_inode_size_lock(inode, 0); lov_stripe_lock(lsm); - * - * here - */ - CDEBUG(D_INODE, "stripe KMS %sincreasing %llu->%llu %llu\n", - kms > loi->loi_kms ? 
"" : "not ", loi->loi_kms, kms, - loi->loi_lvb.lvb_size); - - attr->cat_ctime = ktime_get_real_seconds(); - attr->cat_mtime = attr->cat_ctime; - valid = CAT_MTIME | CAT_CTIME; - if (kms > loi->loi_kms) { - attr->cat_kms = kms; - valid |= CAT_KMS; - } - if (kms > loi->loi_lvb.lvb_size) { - attr->cat_size = kms; - valid |= CAT_SIZE; - } - cl_object_attr_update(env, obj, attr, valid); - cl_object_attr_unlock(obj); -} - -static int osc_io_commit_async(const struct lu_env *env, - const struct cl_io_slice *ios, - struct cl_page_list *qin, int from, int to, - cl_commit_cbt cb) -{ - struct cl_io *io = ios->cis_io; - struct osc_io *oio = cl2osc_io(env, ios); - struct osc_object *osc = cl2osc(ios->cis_obj); - struct cl_page *page; - struct cl_page *last_page; - struct osc_page *opg; - int result = 0; - - LASSERT(qin->pl_nr > 0); - - /* Handle partial page cases */ - last_page = cl_page_list_last(qin); - if (oio->oi_lockless) { - page = cl_page_list_first(qin); - if (page == last_page) { - cl_page_clip(env, page, from, to); - } else { - if (from != 0) - cl_page_clip(env, page, from, PAGE_SIZE); - if (to != PAGE_SIZE) - cl_page_clip(env, last_page, 0, to); - } - } - - while (qin->pl_nr > 0) { - struct osc_async_page *oap; - - page = cl_page_list_first(qin); - opg = osc_cl_page_osc(page, osc); - oap = &opg->ops_oap; - - if (!list_empty(&oap->oap_rpc_item)) { - CDEBUG(D_CACHE, "Busy oap %p page %p for submit.\n", - oap, opg); - result = -EBUSY; - break; - } - - /* The page may be already in dirty cache. */ - if (list_empty(&oap->oap_pending_item)) { - result = osc_page_cache_add(env, &opg->ops_cl, io); - if (result != 0) - break; - } - - osc_page_touch_at(env, osc2cl(osc), osc_index(opg), - page == last_page ? to : PAGE_SIZE); - - cl_page_list_del(env, qin, page); - - (*cb)(env, io, page); - /* Can't access page any more. Page can be in transfer and - * complete at any time. 
- */ - } - - /* for sync write, kernel will wait for this page to be flushed before - * osc_io_end() is called, so release it earlier. - * for mkwrite(), it's known there is no further pages. - */ - if (cl_io_is_sync_write(io) && oio->oi_active) { - osc_extent_release(env, oio->oi_active); - oio->oi_active = NULL; - } - - CDEBUG(D_INFO, "%d %d\n", qin->pl_nr, result); - return result; -} - -static int osc_io_iter_init(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct osc_object *osc = cl2osc(ios->cis_obj); - struct obd_import *imp = osc_cli(osc)->cl_import; - int rc = -EIO; - - spin_lock(&imp->imp_lock); - if (likely(!imp->imp_invalid)) { - struct osc_io *oio = osc_env_io(env); - - atomic_inc(&osc->oo_nr_ios); - oio->oi_is_active = 1; - rc = 0; - } - spin_unlock(&imp->imp_lock); - - return rc; -} - -static int osc_io_write_iter_init(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct cl_io *io = ios->cis_io; - struct osc_io *oio = osc_env_io(env); - struct osc_object *osc = cl2osc(ios->cis_obj); - unsigned long npages; - - if (cl_io_is_append(io)) - return osc_io_iter_init(env, ios); - - npages = io->u.ci_rw.crw_count >> PAGE_SHIFT; - if (io->u.ci_rw.crw_pos & ~PAGE_MASK) - ++npages; - - oio->oi_lru_reserved = osc_lru_reserve(osc_cli(osc), npages); - - return osc_io_iter_init(env, ios); -} - -static void osc_io_iter_fini(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct osc_io *oio = osc_env_io(env); - - if (oio->oi_is_active) { - struct osc_object *osc = cl2osc(ios->cis_obj); - - oio->oi_is_active = 0; - LASSERT(atomic_read(&osc->oo_nr_ios) > 0); - if (atomic_dec_and_test(&osc->oo_nr_ios)) - wake_up_all(&osc->oo_io_waitq); - } -} - -static void osc_io_write_iter_fini(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct osc_io *oio = osc_env_io(env); - struct osc_object *osc = cl2osc(ios->cis_obj); - - if (oio->oi_lru_reserved > 0) { - osc_lru_unreserve(osc_cli(osc), 
oio->oi_lru_reserved); - oio->oi_lru_reserved = 0; - } - oio->oi_write_osclock = NULL; - - osc_io_iter_fini(env, ios); -} - -static int osc_io_fault_start(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct cl_io *io; - struct cl_fault_io *fio; - - io = ios->cis_io; - fio = &io->u.ci_fault; - CDEBUG(D_INFO, "%lu %d %zu\n", - fio->ft_index, fio->ft_writable, fio->ft_nob); - /* - * If mapping is writeable, adjust kms to cover this page, - * but do not extend kms beyond actual file size. - * See bug 10919. - */ - if (fio->ft_writable) - osc_page_touch_at(env, ios->cis_obj, - fio->ft_index, fio->ft_nob); - return 0; -} - -static int osc_async_upcall(void *a, int rc) -{ - struct osc_async_cbargs *args = a; - - args->opc_rc = rc; - complete(&args->opc_sync); - return 0; -} - -/** - * Checks that there are no pages being written in the extent being truncated. - */ -static int trunc_check_cb(const struct lu_env *env, struct cl_io *io, - struct osc_page *ops, void *cbdata) -{ - struct cl_page *page = ops->ops_cl.cpl_page; - struct osc_async_page *oap; - __u64 start = *(__u64 *)cbdata; - - oap = &ops->ops_oap; - if (oap->oap_cmd & OBD_BRW_WRITE && - !list_empty(&oap->oap_pending_item)) - CL_PAGE_DEBUG(D_ERROR, env, page, "exists %llu/%s.\n", - start, current->comm); - - if (PageLocked(page->cp_vmpage)) - CDEBUG(D_CACHE, "page %p index %lu locked for %d.\n", - ops, osc_index(ops), oap->oap_cmd & OBD_BRW_RWMASK); - - return CLP_GANG_OKAY; -} - -static void osc_trunc_check(const struct lu_env *env, struct cl_io *io, - struct osc_io *oio, __u64 size) -{ - struct cl_object *clob; - int partial; - pgoff_t start; - - clob = oio->oi_cl.cis_obj; - start = cl_index(clob, size); - partial = cl_offset(clob, start) < size; - - /* - * Complain if there are pages in the truncated region. 
- */ - osc_page_gang_lookup(env, io, cl2osc(clob), - start + partial, CL_PAGE_EOF, - trunc_check_cb, (void *)&size); -} - -static int osc_io_setattr_start(const struct lu_env *env, - const struct cl_io_slice *slice) -{ - struct cl_io *io = slice->cis_io; - struct osc_io *oio = cl2osc_io(env, slice); - struct cl_object *obj = slice->cis_obj; - struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; - struct cl_attr *attr = &osc_env_info(env)->oti_attr; - struct obdo *oa = &oio->oi_oa; - struct osc_async_cbargs *cbargs = &oio->oi_cbarg; - __u64 size = io->u.ci_setattr.sa_attr.lvb_size; - unsigned int ia_valid = io->u.ci_setattr.sa_valid; - int result = 0; - - /* truncate cache dirty pages first */ - if (cl_io_is_trunc(io)) - result = osc_cache_truncate_start(env, cl2osc(obj), size, - &oio->oi_trunc); - - if (result == 0 && oio->oi_lockless == 0) { - cl_object_attr_lock(obj); - result = cl_object_attr_get(env, obj, attr); - if (result == 0) { - struct ost_lvb *lvb = &io->u.ci_setattr.sa_attr; - unsigned int cl_valid = 0; - - if (ia_valid & ATTR_SIZE) { - attr->cat_size = size; - attr->cat_kms = size; - cl_valid = CAT_SIZE | CAT_KMS; - } - if (ia_valid & ATTR_MTIME_SET) { - attr->cat_mtime = lvb->lvb_mtime; - cl_valid |= CAT_MTIME; - } - if (ia_valid & ATTR_ATIME_SET) { - attr->cat_atime = lvb->lvb_atime; - cl_valid |= CAT_ATIME; - } - if (ia_valid & ATTR_CTIME_SET) { - attr->cat_ctime = lvb->lvb_ctime; - cl_valid |= CAT_CTIME; - } - result = cl_object_attr_update(env, obj, attr, - cl_valid); - } - cl_object_attr_unlock(obj); - } - memset(oa, 0, sizeof(*oa)); - if (result == 0) { - oa->o_oi = loi->loi_oi; - obdo_set_parent_fid(oa, io->u.ci_setattr.sa_parent_fid); - oa->o_stripe_idx = io->u.ci_setattr.sa_stripe_index; - oa->o_valid |= OBD_MD_FLID | OBD_MD_FLGROUP; - if (ia_valid & ATTR_CTIME) { - oa->o_valid |= OBD_MD_FLCTIME; - oa->o_ctime = attr->cat_ctime; - } - if (ia_valid & ATTR_ATIME) { - oa->o_valid |= OBD_MD_FLATIME; - oa->o_atime = attr->cat_atime; - } - if (ia_valid & 
ATTR_MTIME) { - oa->o_valid |= OBD_MD_FLMTIME; - oa->o_mtime = attr->cat_mtime; - } - if (ia_valid & ATTR_SIZE) { - oa->o_size = size; - oa->o_blocks = OBD_OBJECT_EOF; - oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS; - - if (oio->oi_lockless) { - oa->o_flags = OBD_FL_SRVLOCK; - oa->o_valid |= OBD_MD_FLFLAGS; - } - } else { - LASSERT(oio->oi_lockless == 0); - } - if (ia_valid & ATTR_ATTR_FLAG) { - oa->o_flags = io->u.ci_setattr.sa_attr_flags; - oa->o_valid |= OBD_MD_FLFLAGS; - } - - init_completion(&cbargs->opc_sync); - - if (ia_valid & ATTR_SIZE) - result = osc_punch_base(osc_export(cl2osc(obj)), - oa, osc_async_upcall, - cbargs, PTLRPCD_SET); - else - result = osc_setattr_async(osc_export(cl2osc(obj)), - oa, osc_async_upcall, - cbargs, PTLRPCD_SET); - cbargs->opc_rpc_sent = result == 0; - } - return result; -} - -static void osc_io_setattr_end(const struct lu_env *env, - const struct cl_io_slice *slice) -{ - struct cl_io *io = slice->cis_io; - struct osc_io *oio = cl2osc_io(env, slice); - struct cl_object *obj = slice->cis_obj; - struct osc_async_cbargs *cbargs = &oio->oi_cbarg; - int result = 0; - - if (cbargs->opc_rpc_sent) { - wait_for_completion(&cbargs->opc_sync); - result = cbargs->opc_rc; - io->ci_result = cbargs->opc_rc; - } - if (result == 0) { - if (oio->oi_lockless) { - /* lockless truncate */ - struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev); - - LASSERT(cl_io_is_trunc(io)); - /* XXX: Need a lock. 
*/ - osd->od_stats.os_lockless_truncates++; - } - } - - if (cl_io_is_trunc(io)) { - __u64 size = io->u.ci_setattr.sa_attr.lvb_size; - - osc_trunc_check(env, io, oio, size); - osc_cache_truncate_end(env, oio->oi_trunc); - oio->oi_trunc = NULL; - } -} - -struct osc_data_version_args { - struct osc_io *dva_oio; -}; - -static int -osc_data_version_interpret(const struct lu_env *env, struct ptlrpc_request *req, - void *arg, int rc) -{ - struct osc_data_version_args *dva = arg; - struct osc_io *oio = dva->dva_oio; - const struct ost_body *body; - - if (rc < 0) - goto out; - - body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); - if (!body) { - rc = -EPROTO; - goto out; - } - - lustre_get_wire_obdo(&req->rq_import->imp_connect_data, &oio->oi_oa, - &body->oa); -out: - oio->oi_cbarg.opc_rc = rc; - complete(&oio->oi_cbarg.opc_sync); - - return 0; -} - -static int osc_io_data_version_start(const struct lu_env *env, - const struct cl_io_slice *slice) -{ - struct cl_data_version_io *dv = &slice->cis_io->u.ci_data_version; - struct osc_io *oio = cl2osc_io(env, slice); - struct osc_async_cbargs *cbargs = &oio->oi_cbarg; - struct osc_object *obj = cl2osc(slice->cis_obj); - struct obd_export *exp = osc_export(obj); - struct lov_oinfo *loi = obj->oo_oinfo; - struct osc_data_version_args *dva; - struct obdo *oa = &oio->oi_oa; - struct ptlrpc_request *req; - struct ost_body *body; - int rc; - - memset(oa, 0, sizeof(*oa)); - oa->o_oi = loi->loi_oi; - oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; - - if (dv->dv_flags & (LL_DV_RD_FLUSH | LL_DV_WR_FLUSH)) { - oa->o_valid |= OBD_MD_FLFLAGS; - oa->o_flags |= OBD_FL_SRVLOCK; - if (dv->dv_flags & LL_DV_WR_FLUSH) - oa->o_flags |= OBD_FL_FLUSH; - } - - init_completion(&cbargs->opc_sync); - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GETATTR); - if (!req) - return -ENOMEM; - - rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GETATTR); - if (rc < 0) { - ptlrpc_request_free(req); - return rc; - } - - body = 
req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); - lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); - - ptlrpc_request_set_replen(req); - req->rq_interpret_reply = osc_data_version_interpret; - BUILD_BUG_ON(sizeof(*dva) > sizeof(req->rq_async_args)); - dva = ptlrpc_req_async_args(req); - dva->dva_oio = oio; - - ptlrpcd_add_req(req); - - return 0; -} - -static void osc_io_data_version_end(const struct lu_env *env, - const struct cl_io_slice *slice) -{ - struct cl_data_version_io *dv = &slice->cis_io->u.ci_data_version; - struct osc_io *oio = cl2osc_io(env, slice); - struct osc_async_cbargs *cbargs = &oio->oi_cbarg; - - wait_for_completion(&cbargs->opc_sync); - - if (cbargs->opc_rc) { - slice->cis_io->ci_result = cbargs->opc_rc; - } else if (!(oio->oi_oa.o_valid & OBD_MD_FLDATAVERSION)) { - slice->cis_io->ci_result = -EOPNOTSUPP; - } else { - dv->dv_data_version = oio->oi_oa.o_data_version; - slice->cis_io->ci_result = 0; - } -} - -static int osc_io_read_start(const struct lu_env *env, - const struct cl_io_slice *slice) -{ - struct cl_object *obj = slice->cis_obj; - struct cl_attr *attr = &osc_env_info(env)->oti_attr; - int rc = 0; - - if (!slice->cis_io->ci_noatime) { - cl_object_attr_lock(obj); - attr->cat_atime = ktime_get_real_seconds(); - rc = cl_object_attr_update(env, obj, attr, CAT_ATIME); - cl_object_attr_unlock(obj); - } - return rc; -} - -static int osc_io_write_start(const struct lu_env *env, - const struct cl_io_slice *slice) -{ - struct cl_object *obj = slice->cis_obj; - struct cl_attr *attr = &osc_env_info(env)->oti_attr; - int rc = 0; - - OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_SETTIME, 1); - cl_object_attr_lock(obj); - attr->cat_ctime = ktime_get_real_seconds(); - attr->cat_mtime = attr->cat_ctime; - rc = cl_object_attr_update(env, obj, attr, CAT_MTIME | CAT_CTIME); - cl_object_attr_unlock(obj); - - return rc; -} - -static int osc_fsync_ost(const struct lu_env *env, struct osc_object *obj, - struct cl_fsync_io *fio) -{ - 
struct osc_io *oio = osc_env_io(env); - struct obdo *oa = &oio->oi_oa; - struct lov_oinfo *loi = obj->oo_oinfo; - struct osc_async_cbargs *cbargs = &oio->oi_cbarg; - int rc = 0; - - memset(oa, 0, sizeof(*oa)); - oa->o_oi = loi->loi_oi; - oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; - - /* reload size abd blocks for start and end of sync range */ - oa->o_size = fio->fi_start; - oa->o_blocks = fio->fi_end; - oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS; - - obdo_set_parent_fid(oa, fio->fi_fid); - - init_completion(&cbargs->opc_sync); - - rc = osc_sync_base(obj, oa, osc_async_upcall, cbargs, PTLRPCD_SET); - return rc; -} - -static int osc_io_fsync_start(const struct lu_env *env, - const struct cl_io_slice *slice) -{ - struct cl_io *io = slice->cis_io; - struct cl_fsync_io *fio = &io->u.ci_fsync; - struct cl_object *obj = slice->cis_obj; - struct osc_object *osc = cl2osc(obj); - pgoff_t start = cl_index(obj, fio->fi_start); - pgoff_t end = cl_index(obj, fio->fi_end); - int result = 0; - - if (fio->fi_end == OBD_OBJECT_EOF) - end = CL_PAGE_EOF; - - result = osc_cache_writeback_range(env, osc, start, end, 0, - fio->fi_mode == CL_FSYNC_DISCARD); - if (result > 0) { - fio->fi_nr_written += result; - result = 0; - } - if (fio->fi_mode == CL_FSYNC_ALL) { - int rc; - - /* we have to wait for writeback to finish before we can - * send OST_SYNC RPC. This is bad because it causes extents - * to be written osc by osc. However, we usually start - * writeback before CL_FSYNC_ALL so this won't have any real - * problem. 
- */ - rc = osc_cache_wait_range(env, osc, start, end); - if (result == 0) - result = rc; - rc = osc_fsync_ost(env, osc, fio); - if (result == 0) - result = rc; - } - - return result; -} - -static void osc_io_fsync_end(const struct lu_env *env, - const struct cl_io_slice *slice) -{ - struct cl_fsync_io *fio = &slice->cis_io->u.ci_fsync; - struct cl_object *obj = slice->cis_obj; - pgoff_t start = cl_index(obj, fio->fi_start); - pgoff_t end = cl_index(obj, fio->fi_end); - int result = 0; - - if (fio->fi_mode == CL_FSYNC_LOCAL) { - result = osc_cache_wait_range(env, cl2osc(obj), start, end); - } else if (fio->fi_mode == CL_FSYNC_ALL) { - struct osc_io *oio = cl2osc_io(env, slice); - struct osc_async_cbargs *cbargs = &oio->oi_cbarg; - - wait_for_completion(&cbargs->opc_sync); - if (result == 0) - result = cbargs->opc_rc; - } - slice->cis_io->ci_result = result; -} - -static void osc_io_end(const struct lu_env *env, - const struct cl_io_slice *slice) -{ - struct osc_io *oio = cl2osc_io(env, slice); - - if (oio->oi_active) { - osc_extent_release(env, oio->oi_active); - oio->oi_active = NULL; - } -} - -static const struct cl_io_operations osc_io_ops = { - .op = { - [CIT_READ] = { - .cio_iter_init = osc_io_iter_init, - .cio_iter_fini = osc_io_iter_fini, - .cio_start = osc_io_read_start, - .cio_fini = osc_io_fini - }, - [CIT_WRITE] = { - .cio_iter_init = osc_io_write_iter_init, - .cio_iter_fini = osc_io_write_iter_fini, - .cio_start = osc_io_write_start, - .cio_end = osc_io_end, - .cio_fini = osc_io_fini - }, - [CIT_SETATTR] = { - .cio_iter_init = osc_io_iter_init, - .cio_iter_fini = osc_io_iter_fini, - .cio_start = osc_io_setattr_start, - .cio_end = osc_io_setattr_end - }, - [CIT_DATA_VERSION] = { - .cio_start = osc_io_data_version_start, - .cio_end = osc_io_data_version_end, - }, - [CIT_FAULT] = { - .cio_iter_init = osc_io_iter_init, - .cio_iter_fini = osc_io_iter_fini, - .cio_start = osc_io_fault_start, - .cio_end = osc_io_end, - .cio_fini = osc_io_fini - }, - 
[CIT_FSYNC] = { - .cio_start = osc_io_fsync_start, - .cio_end = osc_io_fsync_end, - .cio_fini = osc_io_fini - }, - [CIT_MISC] = { - .cio_fini = osc_io_fini - } - }, - .cio_read_ahead = osc_io_read_ahead, - .cio_submit = osc_io_submit, - .cio_commit_async = osc_io_commit_async -}; - -/***************************************************************************** - * - * Transfer operations. - * - */ - -int osc_io_init(const struct lu_env *env, - struct cl_object *obj, struct cl_io *io) -{ - struct osc_io *oio = osc_env_io(env); - - CL_IO_SLICE_CLEAN(oio, oi_cl); - cl_io_slice_add(io, &oio->oi_cl, obj, &osc_io_ops); - return 0; -} - -/** @} osc */ diff --git a/drivers/staging/lustre/lustre/osc/osc_lock.c b/drivers/staging/lustre/lustre/osc/osc_lock.c deleted file mode 100644 index fe8ed0d0497a..000000000000 --- a/drivers/staging/lustre/lustre/osc/osc_lock.c +++ /dev/null @@ -1,1231 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. 
- */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Implementation of cl_lock for OSC layer. - * - * Author: Nikita Danilov <nikita.danilov@sun.com> - * Author: Jinshan Xiong <jinshan.xiong@intel.com> - */ - -#define DEBUG_SUBSYSTEM S_OSC - -#include <linux/libcfs/libcfs.h> -/* fid_build_reg_res_name() */ -#include <lustre_fid.h> - -#include "osc_cl_internal.h" - -/** \addtogroup osc - * @{ - */ - -/***************************************************************************** - * - * Type conversions. - * - */ - -static const struct cl_lock_operations osc_lock_ops; -static const struct cl_lock_operations osc_lock_lockless_ops; -static void osc_lock_to_lockless(const struct lu_env *env, - struct osc_lock *ols, int force); - -int osc_lock_is_lockless(const struct osc_lock *olck) -{ - return (olck->ols_cl.cls_ops == &osc_lock_lockless_ops); -} - -/** - * Returns a weak pointer to the ldlm lock identified by a handle. Returned - * pointer cannot be dereferenced, as lock is not protected from concurrent - * reclaim. This function is a helper for osc_lock_invariant(). - */ -static struct ldlm_lock *osc_handle_ptr(struct lustre_handle *handle) -{ - struct ldlm_lock *lock; - - lock = ldlm_handle2lock(handle); - if (lock) - LDLM_LOCK_PUT(lock); - return lock; -} - -/** - * Invariant that has to be true all of the time. 
- */ -static int osc_lock_invariant(struct osc_lock *ols) -{ - struct ldlm_lock *lock = osc_handle_ptr(&ols->ols_handle); - struct ldlm_lock *olock = ols->ols_dlmlock; - int handle_used = lustre_handle_is_used(&ols->ols_handle); - - if (ergo(osc_lock_is_lockless(ols), - ols->ols_locklessable && !ols->ols_dlmlock)) - return 1; - - /* - * If all the following "ergo"s are true, return 1, otherwise 0 - */ - if (!ergo(olock, handle_used)) - return 0; - - if (!ergo(olock, olock->l_handle.h_cookie == ols->ols_handle.cookie)) - return 0; - - if (!ergo(handle_used, - ergo(lock && olock, lock == olock) && - ergo(!lock, !olock))) - return 0; - /* - * Check that ->ols_handle and ->ols_dlmlock are consistent, but - * take into account that they are set at the different time. - */ - if (!ergo(ols->ols_state == OLS_CANCELLED, - !olock && !handle_used)) - return 0; - /* - * DLM lock is destroyed only after we have seen cancellation - * ast. - */ - if (!ergo(olock && ols->ols_state < OLS_CANCELLED, - !ldlm_is_destroyed(olock))) - return 0; - - if (!ergo(ols->ols_state == OLS_GRANTED, - olock && olock->l_req_mode == olock->l_granted_mode && - ols->ols_hold)) - return 0; - return 1; -} - -/***************************************************************************** - * - * Lock operations. 
- * - */ - -static void osc_lock_fini(const struct lu_env *env, - struct cl_lock_slice *slice) -{ - struct osc_lock *ols = cl2osc_lock(slice); - - LINVRNT(osc_lock_invariant(ols)); - LASSERT(!ols->ols_dlmlock); - - kmem_cache_free(osc_lock_kmem, ols); -} - -static void osc_lock_build_policy(const struct lu_env *env, - const struct cl_lock *lock, - union ldlm_policy_data *policy) -{ - const struct cl_lock_descr *d = &lock->cll_descr; - - osc_index2policy(policy, d->cld_obj, d->cld_start, d->cld_end); - policy->l_extent.gid = d->cld_gid; -} - -static __u64 osc_enq2ldlm_flags(__u32 enqflags) -{ - __u64 result = 0; - - LASSERT((enqflags & ~CEF_MASK) == 0); - - if (enqflags & CEF_NONBLOCK) - result |= LDLM_FL_BLOCK_NOWAIT; - if (enqflags & CEF_ASYNC) - result |= LDLM_FL_HAS_INTENT; - if (enqflags & CEF_DISCARD_DATA) - result |= LDLM_FL_AST_DISCARD_DATA; - if (enqflags & CEF_PEEK) - result |= LDLM_FL_TEST_LOCK; - if (enqflags & CEF_LOCK_MATCH) - result |= LDLM_FL_MATCH_LOCK; - return result; -} - -/** - * Updates object attributes from a lock value block (lvb) received together - * with the DLM lock reply from the server. Copy of osc_update_enqueue() - * logic. - * - * This can be optimized to not update attributes when lock is a result of a - * local match. - * - * Called under lock and resource spin-locks. 
- */ -static void osc_lock_lvb_update(const struct lu_env *env, - struct osc_object *osc, - struct ldlm_lock *dlmlock, - struct ost_lvb *lvb) -{ - struct cl_object *obj = osc2cl(osc); - struct lov_oinfo *oinfo = osc->oo_oinfo; - struct cl_attr *attr = &osc_env_info(env)->oti_attr; - unsigned int valid; - - valid = CAT_BLOCKS | CAT_ATIME | CAT_CTIME | CAT_MTIME | CAT_SIZE; - if (!lvb) - lvb = dlmlock->l_lvb_data; - - cl_lvb2attr(attr, lvb); - - cl_object_attr_lock(obj); - if (dlmlock) { - __u64 size; - - check_res_locked(dlmlock->l_resource); - LASSERT(lvb == dlmlock->l_lvb_data); - size = lvb->lvb_size; - - /* Extend KMS up to the end of this lock and no further - * A lock on [x,y] means a KMS of up to y + 1 bytes! - */ - if (size > dlmlock->l_policy_data.l_extent.end) - size = dlmlock->l_policy_data.l_extent.end + 1; - if (size >= oinfo->loi_kms) { - LDLM_DEBUG(dlmlock, "lock acquired, setting rss=%llu, kms=%llu", - lvb->lvb_size, size); - valid |= CAT_KMS; - attr->cat_kms = size; - } else { - LDLM_DEBUG(dlmlock, "lock acquired, setting rss=%llu; leaving kms=%llu, end=%llu", - lvb->lvb_size, oinfo->loi_kms, - dlmlock->l_policy_data.l_extent.end); - } - ldlm_lock_allow_match_locked(dlmlock); - } - - cl_object_attr_update(env, obj, attr, valid); - cl_object_attr_unlock(obj); -} - -static void osc_lock_granted(const struct lu_env *env, struct osc_lock *oscl, - struct lustre_handle *lockh, bool lvb_update) -{ - struct ldlm_lock *dlmlock; - - dlmlock = ldlm_handle2lock_long(lockh, 0); - LASSERT(dlmlock); - - /* lock reference taken by ldlm_handle2lock_long() is - * owned by osc_lock and released in osc_lock_detach() - */ - lu_ref_add(&dlmlock->l_reference, "osc_lock", oscl); - oscl->ols_has_ref = 1; - - LASSERT(!oscl->ols_dlmlock); - oscl->ols_dlmlock = dlmlock; - - /* This may be a matched lock for glimpse request, do not hold - * lock reference in that case. 
- */ - if (!oscl->ols_glimpse) { - /* hold a refc for non glimpse lock which will - * be released in osc_lock_cancel() - */ - lustre_handle_copy(&oscl->ols_handle, lockh); - ldlm_lock_addref(lockh, oscl->ols_einfo.ei_mode); - oscl->ols_hold = 1; - } - - /* Lock must have been granted. */ - lock_res_and_lock(dlmlock); - if (dlmlock->l_granted_mode == dlmlock->l_req_mode) { - struct ldlm_extent *ext = &dlmlock->l_policy_data.l_extent; - struct cl_lock_descr *descr = &oscl->ols_cl.cls_lock->cll_descr; - - /* extend the lock extent, otherwise it will have problem when - * we decide whether to grant a lockless lock. - */ - descr->cld_mode = osc_ldlm2cl_lock(dlmlock->l_granted_mode); - descr->cld_start = cl_index(descr->cld_obj, ext->start); - descr->cld_end = cl_index(descr->cld_obj, ext->end); - descr->cld_gid = ext->gid; - - /* no lvb update for matched lock */ - if (lvb_update) { - LASSERT(oscl->ols_flags & LDLM_FL_LVB_READY); - osc_lock_lvb_update(env, cl2osc(oscl->ols_cl.cls_obj), - dlmlock, NULL); - } - LINVRNT(osc_lock_invariant(oscl)); - } - unlock_res_and_lock(dlmlock); - - LASSERT(oscl->ols_state != OLS_GRANTED); - oscl->ols_state = OLS_GRANTED; -} - -/** - * Lock upcall function that is executed either when a reply to ENQUEUE rpc is - * received from a server, or after osc_enqueue_base() matched a local DLM - * lock. - */ -static int osc_lock_upcall(void *cookie, struct lustre_handle *lockh, - int errcode) -{ - struct osc_lock *oscl = cookie; - struct cl_lock_slice *slice = &oscl->ols_cl; - struct lu_env *env; - int rc; - u16 refcheck; - - env = cl_env_get(&refcheck); - /* should never happen, similar to osc_ldlm_blocking_ast(). 
*/ - LASSERT(!IS_ERR(env)); - - rc = ldlm_error2errno(errcode); - if (oscl->ols_state == OLS_ENQUEUED) { - oscl->ols_state = OLS_UPCALL_RECEIVED; - } else if (oscl->ols_state == OLS_CANCELLED) { - rc = -EIO; - } else { - CERROR("Impossible state: %d\n", oscl->ols_state); - LBUG(); - } - - if (rc == 0) - osc_lock_granted(env, oscl, lockh, errcode == ELDLM_OK); - - /* Error handling, some errors are tolerable. */ - if (oscl->ols_locklessable && rc == -EUSERS) { - /* This is a tolerable error, turn this lock into - * lockless lock. - */ - osc_object_set_contended(cl2osc(slice->cls_obj)); - LASSERT(slice->cls_ops == &osc_lock_ops); - - /* Change this lock to ldlmlock-less lock. */ - osc_lock_to_lockless(env, oscl, 1); - oscl->ols_state = OLS_GRANTED; - rc = 0; - } else if (oscl->ols_glimpse && rc == -ENAVAIL) { - LASSERT(oscl->ols_flags & LDLM_FL_LVB_READY); - osc_lock_lvb_update(env, cl2osc(slice->cls_obj), - NULL, &oscl->ols_lvb); - /* Hide the error. */ - rc = 0; - } - - if (oscl->ols_owner) - cl_sync_io_note(env, oscl->ols_owner, rc); - cl_env_put(env, &refcheck); - - return rc; -} - -static int osc_lock_upcall_agl(void *cookie, struct lustre_handle *lockh, - int errcode) -{ - struct osc_object *osc = cookie; - struct ldlm_lock *dlmlock; - struct lu_env *env; - u16 refcheck; - - env = cl_env_get(&refcheck); - LASSERT(!IS_ERR(env)); - - if (errcode == ELDLM_LOCK_MATCHED) { - errcode = ELDLM_OK; - goto out; - } - - if (errcode != ELDLM_OK) - goto out; - - dlmlock = ldlm_handle2lock(lockh); - LASSERT(dlmlock); - - lock_res_and_lock(dlmlock); - LASSERT(dlmlock->l_granted_mode == dlmlock->l_req_mode); - - /* there is no osc_lock associated with AGL lock */ - osc_lock_lvb_update(env, osc, dlmlock, NULL); - - unlock_res_and_lock(dlmlock); - LDLM_LOCK_PUT(dlmlock); - -out: - cl_object_put(env, osc2cl(osc)); - cl_env_put(env, &refcheck); - return ldlm_error2errno(errcode); -} - -static int osc_lock_flush(struct osc_object *obj, pgoff_t start, pgoff_t end, - enum 
cl_lock_mode mode, int discard) -{ - struct lu_env *env; - u16 refcheck; - int rc = 0; - int rc2 = 0; - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return PTR_ERR(env); - - if (mode == CLM_WRITE) { - rc = osc_cache_writeback_range(env, obj, start, end, 1, - discard); - CDEBUG(D_CACHE, "object %p: [%lu -> %lu] %d pages were %s.\n", - obj, start, end, rc, - discard ? "discarded" : "written back"); - if (rc > 0) - rc = 0; - } - - rc2 = osc_lock_discard_pages(env, obj, start, end, mode); - if (rc == 0 && rc2 < 0) - rc = rc2; - - cl_env_put(env, &refcheck); - return rc; -} - -/** - * Helper for osc_dlm_blocking_ast() handling discrepancies between cl_lock - * and ldlm_lock caches. - */ -static int osc_dlm_blocking_ast0(const struct lu_env *env, - struct ldlm_lock *dlmlock, - void *data, int flag) -{ - struct cl_object *obj = NULL; - int result = 0; - int discard; - enum cl_lock_mode mode = CLM_READ; - - LASSERT(flag == LDLM_CB_CANCELING); - - lock_res_and_lock(dlmlock); - if (dlmlock->l_granted_mode != dlmlock->l_req_mode) { - dlmlock->l_ast_data = NULL; - unlock_res_and_lock(dlmlock); - return 0; - } - - discard = ldlm_is_discard_data(dlmlock); - if (dlmlock->l_granted_mode & (LCK_PW | LCK_GROUP)) - mode = CLM_WRITE; - - if (dlmlock->l_ast_data) { - obj = osc2cl(dlmlock->l_ast_data); - dlmlock->l_ast_data = NULL; - - cl_object_get(obj); - } - - unlock_res_and_lock(dlmlock); - - /* if l_ast_data is NULL, the dlmlock was enqueued by AGL or - * the object has been destroyed. - */ - if (obj) { - struct ldlm_extent *extent = &dlmlock->l_policy_data.l_extent; - struct cl_attr *attr = &osc_env_info(env)->oti_attr; - __u64 old_kms; - - /* Destroy pages covered by the extent of the DLM lock */ - result = osc_lock_flush(cl2osc(obj), - cl_index(obj, extent->start), - cl_index(obj, extent->end), - mode, discard); - - /* losing a lock, update kms */ - lock_res_and_lock(dlmlock); - cl_object_attr_lock(obj); - /* Must get the value under the lock to avoid race. 
*/ - old_kms = cl2osc(obj)->oo_oinfo->loi_kms; - /* Update the kms. Need to loop all granted locks. - * Not a problem for the client - */ - attr->cat_kms = ldlm_extent_shift_kms(dlmlock, old_kms); - - cl_object_attr_update(env, obj, attr, CAT_KMS); - cl_object_attr_unlock(obj); - unlock_res_and_lock(dlmlock); - - cl_object_put(env, obj); - } - return result; -} - -/** - * Blocking ast invoked by ldlm when dlm lock is either blocking progress of - * some other lock, or is canceled. This function is installed as a - * ldlm_lock::l_blocking_ast() for client extent locks. - * - * Control flow is tricky, because ldlm uses the same call-back - * (ldlm_lock::l_blocking_ast()) for both blocking and cancellation ast's. - * - * \param dlmlock lock for which ast occurred. - * - * \param new description of a conflicting lock in case of blocking ast. - * - * \param data value of dlmlock->l_ast_data - * - * \param flag LDLM_CB_BLOCKING or LDLM_CB_CANCELING. Used to distinguish - * cancellation and blocking ast's. - * - * Possible use cases: - * - * - ldlm calls dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) to cancel - * lock due to lock lru pressure, or explicit user request to purge - * locks. - * - * - ldlm calls dlmlock->l_blocking_ast(..., LDLM_CB_BLOCKING) to notify - * us that dlmlock conflicts with another lock that some client is - * enqueing. Lock is canceled. - * - * - cl_lock_cancel() is called. osc_lock_cancel() calls - * ldlm_cli_cancel() that calls - * - * dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) - * - * recursively entering osc_ldlm_blocking_ast(). 
- * - * - client cancels lock voluntary (e.g., as a part of early cancellation): - * - * cl_lock_cancel()-> - * osc_lock_cancel()-> - * ldlm_cli_cancel()-> - * dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) - * - */ -static int osc_ldlm_blocking_ast(struct ldlm_lock *dlmlock, - struct ldlm_lock_desc *new, void *data, - int flag) -{ - int result = 0; - - switch (flag) { - case LDLM_CB_BLOCKING: { - struct lustre_handle lockh; - - ldlm_lock2handle(dlmlock, &lockh); - result = ldlm_cli_cancel(&lockh, LCF_ASYNC); - if (result == -ENODATA) - result = 0; - break; - } - case LDLM_CB_CANCELING: { - struct lu_env *env; - u16 refcheck; - - /* - * This can be called in the context of outer IO, e.g., - * - * osc_enqueue_base()->... - * ->ldlm_prep_elc_req()->... - * ->ldlm_cancel_callback()->... - * ->osc_ldlm_blocking_ast() - * - * new environment has to be created to not corrupt outer - * context. - */ - env = cl_env_get(&refcheck); - if (IS_ERR(env)) { - result = PTR_ERR(env); - break; - } - - result = osc_dlm_blocking_ast0(env, dlmlock, data, flag); - cl_env_put(env, &refcheck); - break; - } - default: - LBUG(); - } - return result; -} - -static int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data) -{ - struct ptlrpc_request *req = data; - struct lu_env *env; - struct ost_lvb *lvb; - struct req_capsule *cap; - struct cl_object *obj = NULL; - int result; - u16 refcheck; - - LASSERT(lustre_msg_get_opc(req->rq_reqmsg) == LDLM_GL_CALLBACK); - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) { - result = PTR_ERR(env); - goto out; - } - - lock_res_and_lock(dlmlock); - if (dlmlock->l_ast_data) { - obj = osc2cl(dlmlock->l_ast_data); - cl_object_get(obj); - } - unlock_res_and_lock(dlmlock); - - if (obj) { - /* Do not grab the mutex of cl_lock for glimpse. - * See LU-1274 for details. - * BTW, it's okay for cl_lock to be cancelled during - * this period because server can handle this race. - * See ldlm_server_glimpse_ast() for details. 
- * cl_lock_mutex_get(env, lock); - */ - cap = &req->rq_pill; - req_capsule_extend(cap, &RQF_LDLM_GL_CALLBACK); - req_capsule_set_size(cap, &RMF_DLM_LVB, RCL_SERVER, - sizeof(*lvb)); - result = req_capsule_server_pack(cap); - if (result == 0) { - lvb = req_capsule_server_get(cap, &RMF_DLM_LVB); - result = cl_object_glimpse(env, obj, lvb); - } - if (!exp_connect_lvb_type(req->rq_export)) { - req_capsule_shrink(&req->rq_pill, &RMF_DLM_LVB, - sizeof(struct ost_lvb_v1), - RCL_SERVER); - } - cl_object_put(env, obj); - } else { - /* - * These errors are normal races, so we don't want to - * fill the console with messages by calling - * ptlrpc_error() - */ - lustre_pack_reply(req, 1, NULL, NULL); - result = -ELDLM_NO_LOCK_DATA; - } - cl_env_put(env, &refcheck); - -out: - req->rq_status = result; - return result; -} - -static int weigh_cb(const struct lu_env *env, struct cl_io *io, - struct osc_page *ops, void *cbdata) -{ - struct cl_page *page = ops->ops_cl.cpl_page; - - if (cl_page_is_vmlocked(env, page) || - PageDirty(page->cp_vmpage) || PageWriteback(page->cp_vmpage) - ) - return CLP_GANG_ABORT; - - *(pgoff_t *)cbdata = osc_index(ops) + 1; - return CLP_GANG_OKAY; -} - -static unsigned long osc_lock_weight(const struct lu_env *env, - struct osc_object *oscobj, - struct ldlm_extent *extent) -{ - struct cl_io *io = &osc_env_info(env)->oti_io; - struct cl_object *obj = cl_object_top(&oscobj->oo_cl); - pgoff_t page_index; - int result; - - io->ci_obj = obj; - io->ci_ignore_layout = 1; - result = cl_io_init(env, io, CIT_MISC, io->ci_obj); - if (result != 0) - return result; - - page_index = cl_index(obj, extent->start); - do { - result = osc_page_gang_lookup(env, io, oscobj, - page_index, - cl_index(obj, extent->end), - weigh_cb, (void *)&page_index); - if (result == CLP_GANG_ABORT) - break; - if (result == CLP_GANG_RESCHED) - cond_resched(); - } while (result != CLP_GANG_OKAY); - cl_io_fini(env, io); - - return result == CLP_GANG_ABORT ? 
1 : 0; -} - -/** - * Get the weight of dlm lock for early cancellation. - */ -unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock) -{ - struct lu_env *env; - struct osc_object *obj; - struct osc_lock *oscl; - unsigned long weight; - bool found = false; - u16 refcheck; - - might_sleep(); - /* - * osc_ldlm_weigh_ast has a complex context since it might be called - * because of lock canceling, or from user's input. We have to make - * a new environment for it. Probably it is implementation safe to use - * the upper context because cl_lock_put don't modify environment - * variables. But just in case .. - */ - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - /* Mostly because lack of memory, do not eliminate this lock */ - return 1; - - LASSERT(dlmlock->l_resource->lr_type == LDLM_EXTENT); - obj = dlmlock->l_ast_data; - if (!obj) { - weight = 1; - goto out; - } - - spin_lock(&obj->oo_ol_spin); - list_for_each_entry(oscl, &obj->oo_ol_list, ols_nextlock_oscobj) { - if (oscl->ols_dlmlock && oscl->ols_dlmlock != dlmlock) - continue; - found = true; - } - spin_unlock(&obj->oo_ol_spin); - if (found) { - /* - * If the lock is being used by an IO, definitely not cancel it. - */ - weight = 1; - goto out; - } - - weight = osc_lock_weight(env, obj, &dlmlock->l_policy_data.l_extent); - -out: - cl_env_put(env, &refcheck); - return weight; -} - -static void osc_lock_build_einfo(const struct lu_env *env, - const struct cl_lock *lock, - struct osc_object *osc, - struct ldlm_enqueue_info *einfo) -{ - einfo->ei_type = LDLM_EXTENT; - einfo->ei_mode = osc_cl_lock2ldlm(lock->cll_descr.cld_mode); - einfo->ei_cb_bl = osc_ldlm_blocking_ast; - einfo->ei_cb_cp = ldlm_completion_ast; - einfo->ei_cb_gl = osc_ldlm_glimpse_ast; - einfo->ei_cbdata = osc; /* value to be put into ->l_ast_data */ -} - -/** - * Determine if the lock should be converted into a lockless lock. 
- * - * Steps to check: - * - if the lock has an explicit requirement for a non-lockless lock; - * - if the io lock request type ci_lockreq; - * - send the enqueue rpc to ost to make the further decision; - * - special treat to truncate lockless lock - * - * Additional policy can be implemented here, e.g., never do lockless-io - * for large extents. - */ -static void osc_lock_to_lockless(const struct lu_env *env, - struct osc_lock *ols, int force) -{ - struct cl_lock_slice *slice = &ols->ols_cl; - - LASSERT(ols->ols_state == OLS_NEW || - ols->ols_state == OLS_UPCALL_RECEIVED); - - if (force) { - ols->ols_locklessable = 1; - slice->cls_ops = &osc_lock_lockless_ops; - } else { - struct osc_io *oio = osc_env_io(env); - struct cl_io *io = oio->oi_cl.cis_io; - struct cl_object *obj = slice->cls_obj; - struct osc_object *oob = cl2osc(obj); - const struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev); - struct obd_connect_data *ocd; - - LASSERT(io->ci_lockreq == CILR_MANDATORY || - io->ci_lockreq == CILR_MAYBE || - io->ci_lockreq == CILR_NEVER); - - ocd = &class_exp2cliimp(osc_export(oob))->imp_connect_data; - ols->ols_locklessable = (io->ci_type != CIT_SETATTR) && - (io->ci_lockreq == CILR_MAYBE) && - (ocd->ocd_connect_flags & OBD_CONNECT_SRVLOCK); - if (io->ci_lockreq == CILR_NEVER || - /* lockless IO */ - (ols->ols_locklessable && osc_object_is_contended(oob)) || - /* lockless truncate */ - (cl_io_is_trunc(io) && - (ocd->ocd_connect_flags & OBD_CONNECT_TRUNCLOCK) && - osd->od_lockless_truncate)) { - ols->ols_locklessable = 1; - slice->cls_ops = &osc_lock_lockless_ops; - } - } - LASSERT(ergo(ols->ols_glimpse, !osc_lock_is_lockless(ols))); -} - -static bool osc_lock_compatible(const struct osc_lock *qing, - const struct osc_lock *qed) -{ - struct cl_lock_descr *qed_descr = &qed->ols_cl.cls_lock->cll_descr; - struct cl_lock_descr *qing_descr = &qing->ols_cl.cls_lock->cll_descr; - - if (qed->ols_glimpse) - return true; - - if (qing_descr->cld_mode == CLM_READ && 
qed_descr->cld_mode == CLM_READ) - return true; - - if (qed->ols_state < OLS_GRANTED) - return true; - - if (qed_descr->cld_mode >= qing_descr->cld_mode && - qed_descr->cld_start <= qing_descr->cld_start && - qed_descr->cld_end >= qing_descr->cld_end) - return true; - - return false; -} - -static void osc_lock_wake_waiters(const struct lu_env *env, - struct osc_object *osc, - struct osc_lock *oscl) -{ - spin_lock(&osc->oo_ol_spin); - list_del_init(&oscl->ols_nextlock_oscobj); - spin_unlock(&osc->oo_ol_spin); - - spin_lock(&oscl->ols_lock); - while (!list_empty(&oscl->ols_waiting_list)) { - struct osc_lock *scan; - - scan = list_entry(oscl->ols_waiting_list.next, struct osc_lock, - ols_wait_entry); - list_del_init(&scan->ols_wait_entry); - - cl_sync_io_note(env, scan->ols_owner, 0); - } - spin_unlock(&oscl->ols_lock); -} - -static int osc_lock_enqueue_wait(const struct lu_env *env, - struct osc_object *obj, - struct osc_lock *oscl) -{ - struct osc_lock *tmp_oscl; - struct cl_lock_descr *need = &oscl->ols_cl.cls_lock->cll_descr; - struct cl_sync_io *waiter = &osc_env_info(env)->oti_anchor; - int rc = 0; - - spin_lock(&obj->oo_ol_spin); - list_add_tail(&oscl->ols_nextlock_oscobj, &obj->oo_ol_list); - -restart: - list_for_each_entry(tmp_oscl, &obj->oo_ol_list, - ols_nextlock_oscobj) { - struct cl_lock_descr *descr; - - if (tmp_oscl == oscl) - break; - - descr = &tmp_oscl->ols_cl.cls_lock->cll_descr; - if (descr->cld_start > need->cld_end || - descr->cld_end < need->cld_start) - continue; - - /* We're not supposed to give up group lock */ - if (descr->cld_mode == CLM_GROUP) - break; - - if (!osc_lock_is_lockless(oscl) && - osc_lock_compatible(oscl, tmp_oscl)) - continue; - - /* wait for conflicting lock to be canceled */ - cl_sync_io_init(waiter, 1, cl_sync_io_end); - oscl->ols_owner = waiter; - - spin_lock(&tmp_oscl->ols_lock); - /* add oscl into tmp's ols_waiting list */ - list_add_tail(&oscl->ols_wait_entry, - &tmp_oscl->ols_waiting_list); - 
spin_unlock(&tmp_oscl->ols_lock); - - spin_unlock(&obj->oo_ol_spin); - rc = cl_sync_io_wait(env, waiter, 0); - spin_lock(&obj->oo_ol_spin); - if (rc < 0) - break; - - oscl->ols_owner = NULL; - goto restart; - } - spin_unlock(&obj->oo_ol_spin); - - return rc; -} - -/** - * Implementation of cl_lock_operations::clo_enqueue() method for osc - * layer. This initiates ldlm enqueue: - * - * - cancels conflicting locks early (osc_lock_enqueue_wait()); - * - * - calls osc_enqueue_base() to do actual enqueue. - * - * osc_enqueue_base() is supplied with an upcall function that is executed - * when lock is received either after a local cached ldlm lock is matched, or - * when a reply from the server is received. - * - * This function does not wait for the network communication to complete. - */ -static int osc_lock_enqueue(const struct lu_env *env, - const struct cl_lock_slice *slice, - struct cl_io *unused, struct cl_sync_io *anchor) -{ - struct osc_thread_info *info = osc_env_info(env); - struct osc_io *oio = osc_env_io(env); - struct osc_object *osc = cl2osc(slice->cls_obj); - struct osc_lock *oscl = cl2osc_lock(slice); - struct cl_lock *lock = slice->cls_lock; - struct ldlm_res_id *resname = &info->oti_resname; - union ldlm_policy_data *policy = &info->oti_policy; - osc_enqueue_upcall_f upcall = osc_lock_upcall; - void *cookie = oscl; - bool async = false; - int result; - - LASSERTF(ergo(oscl->ols_glimpse, lock->cll_descr.cld_mode <= CLM_READ), - "lock = %p, ols = %p\n", lock, oscl); - - if (oscl->ols_state == OLS_GRANTED) - return 0; - - if (oscl->ols_flags & LDLM_FL_TEST_LOCK) - goto enqueue_base; - - if (oscl->ols_glimpse) { - LASSERT(equi(oscl->ols_agl, !anchor)); - async = true; - goto enqueue_base; - } - - result = osc_lock_enqueue_wait(env, osc, oscl); - if (result < 0) - goto out; - - /* we can grant lockless lock right after all conflicting locks - * are canceled. 
- */ - if (osc_lock_is_lockless(oscl)) { - oscl->ols_state = OLS_GRANTED; - oio->oi_lockless = 1; - return 0; - } - -enqueue_base: - oscl->ols_state = OLS_ENQUEUED; - if (anchor) { - atomic_inc(&anchor->csi_sync_nr); - oscl->ols_owner = anchor; - } - - /** - * DLM lock's ast data must be osc_object; - * if glimpse or AGL lock, async of osc_enqueue_base() must be true, - * DLM's enqueue callback set to osc_lock_upcall() with cookie as - * osc_lock. - */ - ostid_build_res_name(&osc->oo_oinfo->loi_oi, resname); - osc_lock_build_policy(env, lock, policy); - if (oscl->ols_agl) { - oscl->ols_einfo.ei_cbdata = NULL; - /* hold a reference for callback */ - cl_object_get(osc2cl(osc)); - upcall = osc_lock_upcall_agl; - cookie = osc; - } - result = osc_enqueue_base(osc_export(osc), resname, &oscl->ols_flags, - policy, &oscl->ols_lvb, - osc->oo_oinfo->loi_kms_valid, - upcall, cookie, - &oscl->ols_einfo, PTLRPCD_SET, async, - oscl->ols_agl); - if (!result) { - if (osc_lock_is_lockless(oscl)) { - oio->oi_lockless = 1; - } else if (!async) { - LASSERT(oscl->ols_state == OLS_GRANTED); - LASSERT(oscl->ols_hold); - LASSERT(oscl->ols_dlmlock); - } - } else if (oscl->ols_agl) { - cl_object_put(env, osc2cl(osc)); - result = 0; - } - -out: - if (result < 0) { - oscl->ols_state = OLS_CANCELLED; - osc_lock_wake_waiters(env, osc, oscl); - - if (anchor) - cl_sync_io_note(env, anchor, result); - } - return result; -} - -/** - * Breaks a link between osc_lock and dlm_lock. - */ -static void osc_lock_detach(const struct lu_env *env, struct osc_lock *olck) -{ - struct ldlm_lock *dlmlock; - - dlmlock = olck->ols_dlmlock; - if (!dlmlock) - return; - - if (olck->ols_hold) { - olck->ols_hold = 0; - ldlm_lock_decref(&olck->ols_handle, olck->ols_einfo.ei_mode); - olck->ols_handle.cookie = 0ULL; - } - - olck->ols_dlmlock = NULL; - - /* release a reference taken in osc_lock_upcall(). 
*/ - LASSERT(olck->ols_has_ref); - lu_ref_del(&dlmlock->l_reference, "osc_lock", olck); - LDLM_LOCK_RELEASE(dlmlock); - olck->ols_has_ref = 0; -} - -/** - * Implements cl_lock_operations::clo_cancel() method for osc layer. This is - * called (as part of cl_lock_cancel()) when lock is canceled either voluntary - * (LRU pressure, early cancellation, umount, etc.) or due to the conflict - * with some other lock some where in the cluster. This function does the - * following: - * - * - invalidates all pages protected by this lock (after sending dirty - * ones to the server, as necessary); - * - * - decref's underlying ldlm lock; - * - * - cancels ldlm lock (ldlm_cli_cancel()). - */ -static void osc_lock_cancel(const struct lu_env *env, - const struct cl_lock_slice *slice) -{ - struct osc_object *obj = cl2osc(slice->cls_obj); - struct osc_lock *oscl = cl2osc_lock(slice); - - LINVRNT(osc_lock_invariant(oscl)); - - osc_lock_detach(env, oscl); - oscl->ols_state = OLS_CANCELLED; - oscl->ols_flags &= ~LDLM_FL_LVB_READY; - - osc_lock_wake_waiters(env, obj, oscl); -} - -static int osc_lock_print(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct cl_lock_slice *slice) -{ - struct osc_lock *lock = cl2osc_lock(slice); - - (*p)(env, cookie, "%p %#16llx %#llx %d %p ", - lock->ols_dlmlock, lock->ols_flags, lock->ols_handle.cookie, - lock->ols_state, lock->ols_owner); - osc_lvb_print(env, cookie, p, &lock->ols_lvb); - return 0; -} - -static const struct cl_lock_operations osc_lock_ops = { - .clo_fini = osc_lock_fini, - .clo_enqueue = osc_lock_enqueue, - .clo_cancel = osc_lock_cancel, - .clo_print = osc_lock_print, -}; - -static void osc_lock_lockless_cancel(const struct lu_env *env, - const struct cl_lock_slice *slice) -{ - struct osc_lock *ols = cl2osc_lock(slice); - struct osc_object *osc = cl2osc(slice->cls_obj); - struct cl_lock_descr *descr = &slice->cls_lock->cll_descr; - int result; - - LASSERT(!ols->ols_dlmlock); - result = osc_lock_flush(osc, 
descr->cld_start, descr->cld_end, - descr->cld_mode, 0); - if (result) - CERROR("Pages for lockless lock %p were not purged(%d)\n", - ols, result); - - osc_lock_wake_waiters(env, osc, ols); -} - -static const struct cl_lock_operations osc_lock_lockless_ops = { - .clo_fini = osc_lock_fini, - .clo_enqueue = osc_lock_enqueue, - .clo_cancel = osc_lock_lockless_cancel, - .clo_print = osc_lock_print -}; - -static void osc_lock_set_writer(const struct lu_env *env, - const struct cl_io *io, - struct cl_object *obj, struct osc_lock *oscl) -{ - struct cl_lock_descr *descr = &oscl->ols_cl.cls_lock->cll_descr; - pgoff_t io_start; - pgoff_t io_end; - - if (!cl_object_same(io->ci_obj, obj)) - return; - - if (likely(io->ci_type == CIT_WRITE)) { - io_start = cl_index(obj, io->u.ci_rw.crw_pos); - io_end = cl_index(obj, io->u.ci_rw.crw_pos + - io->u.ci_rw.crw_count - 1); - if (cl_io_is_append(io)) { - io_start = 0; - io_end = CL_PAGE_EOF; - } - } else { - LASSERT(cl_io_is_mkwrite(io)); - io_start = io->u.ci_fault.ft_index; - io_end = io->u.ci_fault.ft_index; - } - - if (descr->cld_mode >= CLM_WRITE && - descr->cld_start <= io_start && descr->cld_end >= io_end) { - struct osc_io *oio = osc_env_io(env); - - /* There must be only one lock to match the write region */ - LASSERT(!oio->oi_write_osclock); - oio->oi_write_osclock = oscl; - } -} - -int osc_lock_init(const struct lu_env *env, - struct cl_object *obj, struct cl_lock *lock, - const struct cl_io *io) -{ - struct osc_lock *oscl; - __u32 enqflags = lock->cll_descr.cld_enq_flags; - - oscl = kmem_cache_zalloc(osc_lock_kmem, GFP_NOFS); - if (!oscl) - return -ENOMEM; - - oscl->ols_state = OLS_NEW; - spin_lock_init(&oscl->ols_lock); - INIT_LIST_HEAD(&oscl->ols_waiting_list); - INIT_LIST_HEAD(&oscl->ols_wait_entry); - INIT_LIST_HEAD(&oscl->ols_nextlock_oscobj); - - oscl->ols_flags = osc_enq2ldlm_flags(enqflags); - oscl->ols_agl = !!(enqflags & CEF_AGL); - if (oscl->ols_agl) - oscl->ols_flags |= LDLM_FL_BLOCK_NOWAIT; - if 
(oscl->ols_flags & LDLM_FL_HAS_INTENT) { - oscl->ols_flags |= LDLM_FL_BLOCK_GRANTED; - oscl->ols_glimpse = 1; - } - osc_lock_build_einfo(env, lock, cl2osc(obj), &oscl->ols_einfo); - - cl_lock_slice_add(lock, &oscl->ols_cl, obj, &osc_lock_ops); - - if (!(enqflags & CEF_MUST)) - /* try to convert this lock to a lockless lock */ - osc_lock_to_lockless(env, oscl, (enqflags & CEF_NEVER)); - if (oscl->ols_locklessable && !(enqflags & CEF_DISCARD_DATA)) - oscl->ols_flags |= LDLM_FL_DENY_ON_CONTENTION; - - if (io->ci_type == CIT_WRITE || cl_io_is_mkwrite(io)) - osc_lock_set_writer(env, io, obj, oscl); - - - LDLM_DEBUG_NOLOCK("lock %p, osc lock %p, flags %llx", - lock, oscl, oscl->ols_flags); - - return 0; -} - -/** - * Finds an existing lock covering given index and optionally different from a - * given \a except lock. - */ -struct ldlm_lock *osc_dlmlock_at_pgoff(const struct lu_env *env, - struct osc_object *obj, pgoff_t index, - enum osc_dap_flags dap_flags) -{ - struct osc_thread_info *info = osc_env_info(env); - struct ldlm_res_id *resname = &info->oti_resname; - union ldlm_policy_data *policy = &info->oti_policy; - struct lustre_handle lockh; - struct ldlm_lock *lock = NULL; - enum ldlm_mode mode; - __u64 flags; - - ostid_build_res_name(&obj->oo_oinfo->loi_oi, resname); - osc_index2policy(policy, osc2cl(obj), index, index); - policy->l_extent.gid = LDLM_GID_ANY; - - flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING; - if (dap_flags & OSC_DAP_FL_TEST_LOCK) - flags |= LDLM_FL_TEST_LOCK; - - /* - * It is fine to match any group lock since there could be only one - * with a uniq gid and it conflicts with all other lock modes too - */ -again: - mode = osc_match_base(osc_export(obj), resname, LDLM_EXTENT, policy, - LCK_PR | LCK_PW | LCK_GROUP, &flags, obj, &lockh, - dap_flags & OSC_DAP_FL_CANCELING); - if (mode != 0) { - lock = ldlm_handle2lock(&lockh); - /* RACE: the lock is cancelled so let's try again */ - if (unlikely(!lock)) - goto again; - } - return lock; -} - -/** 
@} osc */ diff --git a/drivers/staging/lustre/lustre/osc/osc_object.c b/drivers/staging/lustre/lustre/osc/osc_object.c deleted file mode 100644 index 6baa8e2e00c9..000000000000 --- a/drivers/staging/lustre/lustre/osc/osc_object.c +++ /dev/null @@ -1,474 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Implementation of cl_object for OSC layer. - * - * Author: Nikita Danilov <nikita.danilov@sun.com> - * Author: Jinshan Xiong <jinshan.xiong@intel.com> - */ - -#define DEBUG_SUBSYSTEM S_OSC - -#include "osc_cl_internal.h" - -/** \addtogroup osc - * @{ - */ - -/***************************************************************************** - * - * Type conversions. 
- * - */ - -static struct lu_object *osc2lu(struct osc_object *osc) -{ - return &osc->oo_cl.co_lu; -} - -static struct osc_object *lu2osc(const struct lu_object *obj) -{ - LINVRNT(osc_is_object(obj)); - return container_of0(obj, struct osc_object, oo_cl.co_lu); -} - -/***************************************************************************** - * - * Object operations. - * - */ - -static int osc_object_init(const struct lu_env *env, struct lu_object *obj, - const struct lu_object_conf *conf) -{ - struct osc_object *osc = lu2osc(obj); - const struct cl_object_conf *cconf = lu2cl_conf(conf); - - osc->oo_oinfo = cconf->u.coc_oinfo; - INIT_LIST_HEAD(&osc->oo_ready_item); - INIT_LIST_HEAD(&osc->oo_hp_ready_item); - INIT_LIST_HEAD(&osc->oo_write_item); - INIT_LIST_HEAD(&osc->oo_read_item); - - atomic_set(&osc->oo_nr_ios, 0); - init_waitqueue_head(&osc->oo_io_waitq); - - osc->oo_root.rb_node = NULL; - INIT_LIST_HEAD(&osc->oo_hp_exts); - INIT_LIST_HEAD(&osc->oo_urgent_exts); - INIT_LIST_HEAD(&osc->oo_rpc_exts); - INIT_LIST_HEAD(&osc->oo_reading_exts); - atomic_set(&osc->oo_nr_reads, 0); - atomic_set(&osc->oo_nr_writes, 0); - spin_lock_init(&osc->oo_lock); - spin_lock_init(&osc->oo_tree_lock); - spin_lock_init(&osc->oo_ol_spin); - INIT_LIST_HEAD(&osc->oo_ol_list); - - cl_object_page_init(lu2cl(obj), sizeof(struct osc_page)); - - return 0; -} - -static void osc_object_free(const struct lu_env *env, struct lu_object *obj) -{ - struct osc_object *osc = lu2osc(obj); - - LASSERT(list_empty(&osc->oo_ready_item)); - LASSERT(list_empty(&osc->oo_hp_ready_item)); - LASSERT(list_empty(&osc->oo_write_item)); - LASSERT(list_empty(&osc->oo_read_item)); - - LASSERT(!osc->oo_root.rb_node); - LASSERT(list_empty(&osc->oo_hp_exts)); - LASSERT(list_empty(&osc->oo_urgent_exts)); - LASSERT(list_empty(&osc->oo_rpc_exts)); - LASSERT(list_empty(&osc->oo_reading_exts)); - LASSERT(atomic_read(&osc->oo_nr_reads) == 0); - LASSERT(atomic_read(&osc->oo_nr_writes) == 0); - 
LASSERT(list_empty(&osc->oo_ol_list)); - LASSERT(!atomic_read(&osc->oo_nr_ios)); - - lu_object_fini(obj); - kmem_cache_free(osc_object_kmem, osc); -} - -int osc_lvb_print(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct ost_lvb *lvb) -{ - return (*p)(env, cookie, "size: %llu mtime: %llu atime: %llu ctime: %llu blocks: %llu", - lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime, - lvb->lvb_ctime, lvb->lvb_blocks); -} - -static int osc_object_print(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct lu_object *obj) -{ - struct osc_object *osc = lu2osc(obj); - struct lov_oinfo *oinfo = osc->oo_oinfo; - struct osc_async_rc *ar = &oinfo->loi_ar; - - (*p)(env, cookie, "id: " DOSTID " idx: %d gen: %d kms_valid: %u kms %llu rc: %d force_sync: %d min_xid: %llu ", - POSTID(&oinfo->loi_oi), oinfo->loi_ost_idx, - oinfo->loi_ost_gen, oinfo->loi_kms_valid, oinfo->loi_kms, - ar->ar_rc, ar->ar_force_sync, ar->ar_min_xid); - osc_lvb_print(env, cookie, p, &oinfo->loi_lvb); - return 0; -} - -static int osc_attr_get(const struct lu_env *env, struct cl_object *obj, - struct cl_attr *attr) -{ - struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; - - cl_lvb2attr(attr, &oinfo->loi_lvb); - attr->cat_kms = oinfo->loi_kms_valid ? 
oinfo->loi_kms : 0; - return 0; -} - -static int osc_attr_update(const struct lu_env *env, struct cl_object *obj, - const struct cl_attr *attr, unsigned int valid) -{ - struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; - struct ost_lvb *lvb = &oinfo->loi_lvb; - - if (valid & CAT_SIZE) - lvb->lvb_size = attr->cat_size; - if (valid & CAT_MTIME) - lvb->lvb_mtime = attr->cat_mtime; - if (valid & CAT_ATIME) - lvb->lvb_atime = attr->cat_atime; - if (valid & CAT_CTIME) - lvb->lvb_ctime = attr->cat_ctime; - if (valid & CAT_BLOCKS) - lvb->lvb_blocks = attr->cat_blocks; - if (valid & CAT_KMS) { - CDEBUG(D_CACHE, "set kms from %llu to %llu\n", - oinfo->loi_kms, (__u64)attr->cat_kms); - loi_kms_set(oinfo, attr->cat_kms); - } - return 0; -} - -static int osc_object_glimpse(const struct lu_env *env, - const struct cl_object *obj, struct ost_lvb *lvb) -{ - struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; - - lvb->lvb_size = oinfo->loi_kms; - lvb->lvb_blocks = oinfo->loi_lvb.lvb_blocks; - return 0; -} - -static int osc_object_ast_clear(struct ldlm_lock *lock, void *data) -{ - if (lock->l_ast_data == data) - lock->l_ast_data = NULL; - return LDLM_ITER_CONTINUE; -} - -static int osc_object_prune(const struct lu_env *env, struct cl_object *obj) -{ - struct osc_object *osc = cl2osc(obj); - struct ldlm_res_id *resname = &osc_env_info(env)->oti_resname; - - /* DLM locks don't hold a reference of osc_object so we have to - * clear it before the object is being destroyed. 
- */ - ostid_build_res_name(&osc->oo_oinfo->loi_oi, resname); - ldlm_resource_iterate(osc_export(osc)->exp_obd->obd_namespace, resname, - osc_object_ast_clear, osc); - return 0; -} - -static int osc_object_fiemap(const struct lu_env *env, struct cl_object *obj, - struct ll_fiemap_info_key *fmkey, - struct fiemap *fiemap, size_t *buflen) -{ - struct obd_export *exp = osc_export(cl2osc(obj)); - union ldlm_policy_data policy; - struct ptlrpc_request *req; - struct lustre_handle lockh; - struct ldlm_res_id resid; - enum ldlm_mode mode = 0; - struct fiemap *reply; - char *tmp; - int rc; - - fmkey->lfik_oa.o_oi = cl2osc(obj)->oo_oinfo->loi_oi; - if (!(fmkey->lfik_fiemap.fm_flags & FIEMAP_FLAG_SYNC)) - goto skip_locking; - - policy.l_extent.start = fmkey->lfik_fiemap.fm_start & PAGE_MASK; - - if (OBD_OBJECT_EOF - fmkey->lfik_fiemap.fm_length <= - fmkey->lfik_fiemap.fm_start + PAGE_SIZE - 1) - policy.l_extent.end = OBD_OBJECT_EOF; - else - policy.l_extent.end = (fmkey->lfik_fiemap.fm_start + - fmkey->lfik_fiemap.fm_length + - PAGE_SIZE - 1) & PAGE_MASK; - - ostid_build_res_name(&fmkey->lfik_oa.o_oi, &resid); - mode = ldlm_lock_match(exp->exp_obd->obd_namespace, - LDLM_FL_BLOCK_GRANTED | LDLM_FL_LVB_READY, - &resid, LDLM_EXTENT, &policy, - LCK_PR | LCK_PW, &lockh, 0); - if (mode) { /* lock is cached on client */ - if (mode != LCK_PR) { - ldlm_lock_addref(&lockh, LCK_PR); - ldlm_lock_decref(&lockh, LCK_PW); - } - } else { /* no cached lock, needs acquire lock on server side */ - fmkey->lfik_oa.o_valid |= OBD_MD_FLFLAGS; - fmkey->lfik_oa.o_flags |= OBD_FL_SRVLOCK; - } - -skip_locking: - req = ptlrpc_request_alloc(class_exp2cliimp(exp), - &RQF_OST_GET_INFO_FIEMAP); - if (!req) { - rc = -ENOMEM; - goto drop_lock; - } - - req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_KEY, RCL_CLIENT, - sizeof(*fmkey)); - req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_VAL, RCL_CLIENT, - *buflen); - req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_VAL, RCL_SERVER, - *buflen); - - rc = 
ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GET_INFO); - if (rc) { - ptlrpc_request_free(req); - goto drop_lock; - } - tmp = req_capsule_client_get(&req->rq_pill, &RMF_FIEMAP_KEY); - memcpy(tmp, fmkey, sizeof(*fmkey)); - tmp = req_capsule_client_get(&req->rq_pill, &RMF_FIEMAP_VAL); - memcpy(tmp, fiemap, *buflen); - ptlrpc_request_set_replen(req); - - rc = ptlrpc_queue_wait(req); - if (rc) - goto fini_req; - - reply = req_capsule_server_get(&req->rq_pill, &RMF_FIEMAP_VAL); - if (!reply) { - rc = -EPROTO; - goto fini_req; - } - - memcpy(fiemap, reply, *buflen); -fini_req: - ptlrpc_req_finished(req); -drop_lock: - if (mode) - ldlm_lock_decref(&lockh, LCK_PR); - return rc; -} - -void osc_object_set_contended(struct osc_object *obj) -{ - obj->oo_contention_time = cfs_time_current(); - /* mb(); */ - obj->oo_contended = 1; -} - -void osc_object_clear_contended(struct osc_object *obj) -{ - obj->oo_contended = 0; -} - -int osc_object_is_contended(struct osc_object *obj) -{ - struct osc_device *dev = lu2osc_dev(obj->oo_cl.co_lu.lo_dev); - int osc_contention_time = dev->od_contention_time; - unsigned long cur_time = cfs_time_current(); - unsigned long retry_time; - - if (OBD_FAIL_CHECK(OBD_FAIL_OSC_OBJECT_CONTENTION)) - return 1; - - if (!obj->oo_contended) - return 0; - - /* - * I like copy-paste. the code is copied from - * ll_file_is_contended. - */ - retry_time = cfs_time_add(obj->oo_contention_time, - osc_contention_time * HZ); - if (cfs_time_after(cur_time, retry_time)) { - osc_object_clear_contended(obj); - return 0; - } - return 1; -} - -/** - * Implementation of struct cl_object_operations::coo_req_attr_set() for osc - * layer. osc is responsible for struct obdo::o_id and struct obdo::o_seq - * fields. 
- */ -static void osc_req_attr_set(const struct lu_env *env, struct cl_object *obj, - struct cl_req_attr *attr) -{ - u64 flags = attr->cra_flags; - struct lov_oinfo *oinfo; - struct ost_lvb *lvb; - struct obdo *oa; - - oinfo = cl2osc(obj)->oo_oinfo; - lvb = &oinfo->loi_lvb; - oa = attr->cra_oa; - - if (flags & OBD_MD_FLMTIME) { - oa->o_mtime = lvb->lvb_mtime; - oa->o_valid |= OBD_MD_FLMTIME; - } - if (flags & OBD_MD_FLATIME) { - oa->o_atime = lvb->lvb_atime; - oa->o_valid |= OBD_MD_FLATIME; - } - if (flags & OBD_MD_FLCTIME) { - oa->o_ctime = lvb->lvb_ctime; - oa->o_valid |= OBD_MD_FLCTIME; - } - if (flags & OBD_MD_FLGROUP) { - ostid_set_seq(&oa->o_oi, ostid_seq(&oinfo->loi_oi)); - oa->o_valid |= OBD_MD_FLGROUP; - } - if (flags & OBD_MD_FLID) { - int rc; - - rc = ostid_set_id(&oa->o_oi, ostid_id(&oinfo->loi_oi)); - if (rc) { - CERROR("Bad %llu to set " DOSTID " : rc %d\n", - (unsigned long long)ostid_id(&oinfo->loi_oi), - POSTID(&oa->o_oi), rc); - } - oa->o_valid |= OBD_MD_FLID; - } - if (flags & OBD_MD_FLHANDLE) { - struct ldlm_lock *lock; - struct osc_page *opg; - - opg = osc_cl_page_osc(attr->cra_page, cl2osc(obj)); - lock = osc_dlmlock_at_pgoff(env, cl2osc(obj), osc_index(opg), - OSC_DAP_FL_TEST_LOCK | OSC_DAP_FL_CANCELING); - if (!lock && !opg->ops_srvlock) { - struct ldlm_resource *res; - struct ldlm_res_id *resname; - - CL_PAGE_DEBUG(D_ERROR, env, attr->cra_page, - "uncovered page!\n"); - - resname = &osc_env_info(env)->oti_resname; - ostid_build_res_name(&oinfo->loi_oi, resname); - res = ldlm_resource_get( - osc_export(cl2osc(obj))->exp_obd->obd_namespace, - NULL, resname, LDLM_EXTENT, 0); - ldlm_resource_dump(D_ERROR, res); - - LBUG(); - } - - /* check for lockless io. 
*/ - if (lock) { - oa->o_handle = lock->l_remote_handle; - oa->o_valid |= OBD_MD_FLHANDLE; - LDLM_LOCK_PUT(lock); - } - } -} - -static const struct cl_object_operations osc_ops = { - .coo_page_init = osc_page_init, - .coo_lock_init = osc_lock_init, - .coo_io_init = osc_io_init, - .coo_attr_get = osc_attr_get, - .coo_attr_update = osc_attr_update, - .coo_glimpse = osc_object_glimpse, - .coo_prune = osc_object_prune, - .coo_fiemap = osc_object_fiemap, - .coo_req_attr_set = osc_req_attr_set -}; - -static const struct lu_object_operations osc_lu_obj_ops = { - .loo_object_init = osc_object_init, - .loo_object_release = NULL, - .loo_object_free = osc_object_free, - .loo_object_print = osc_object_print, - .loo_object_invariant = NULL -}; - -struct lu_object *osc_object_alloc(const struct lu_env *env, - const struct lu_object_header *unused, - struct lu_device *dev) -{ - struct osc_object *osc; - struct lu_object *obj; - - osc = kmem_cache_zalloc(osc_object_kmem, GFP_NOFS); - if (osc) { - obj = osc2lu(osc); - lu_object_init(obj, NULL, dev); - osc->oo_cl.co_ops = &osc_ops; - obj->lo_ops = &osc_lu_obj_ops; - } else { - obj = NULL; - } - return obj; -} - -int osc_object_invalidate(const struct lu_env *env, struct osc_object *osc) -{ - CDEBUG(D_INODE, "Invalidate osc object: %p, # of active IOs: %d\n", - osc, atomic_read(&osc->oo_nr_ios)); - - wait_event_idle(osc->oo_io_waitq, !atomic_read(&osc->oo_nr_ios)); - - /* Discard all dirty pages of this object. */ - osc_cache_truncate_start(env, osc, 0, NULL); - - /* Discard all caching pages */ - osc_lock_discard_pages(env, osc, 0, CL_PAGE_EOF, CLM_WRITE); - - /* Clear ast data of dlm lock. 
Do this after discarding all pages */ - osc_object_prune(env, osc2cl(osc)); - - return 0; -} - -/** @} osc */ diff --git a/drivers/staging/lustre/lustre/osc/osc_page.c b/drivers/staging/lustre/lustre/osc/osc_page.c deleted file mode 100644 index 01a930dbbf64..000000000000 --- a/drivers/staging/lustre/lustre/osc/osc_page.c +++ /dev/null @@ -1,1094 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Implementation of cl_page for OSC layer. 
- * - * Author: Nikita Danilov <nikita.danilov@sun.com> - * Author: Jinshan Xiong <jinshan.xiong@intel.com> - */ - -#define DEBUG_SUBSYSTEM S_OSC - -#include <linux/math64.h> -#include "osc_cl_internal.h" - -static void osc_lru_del(struct client_obd *cli, struct osc_page *opg); -static void osc_lru_use(struct client_obd *cli, struct osc_page *opg); -static int osc_lru_alloc(const struct lu_env *env, struct client_obd *cli, - struct osc_page *opg); - -/** \addtogroup osc - * @{ - */ - -/***************************************************************************** - * - * Page operations. - * - */ -static void osc_page_transfer_get(struct osc_page *opg, const char *label) -{ - struct cl_page *page = opg->ops_cl.cpl_page; - - LASSERT(!opg->ops_transfer_pinned); - cl_page_get(page); - lu_ref_add_atomic(&page->cp_reference, label, page); - opg->ops_transfer_pinned = 1; -} - -static void osc_page_transfer_put(const struct lu_env *env, - struct osc_page *opg) -{ - struct cl_page *page = opg->ops_cl.cpl_page; - - if (opg->ops_transfer_pinned) { - opg->ops_transfer_pinned = 0; - lu_ref_del(&page->cp_reference, "transfer", page); - cl_page_put(env, page); - } -} - -/** - * This is called once for every page when it is submitted for a transfer - * either opportunistic (osc_page_cache_add()), or immediate - * (osc_page_submit()). 
- */ -static void osc_page_transfer_add(const struct lu_env *env, - struct osc_page *opg, enum cl_req_type crt) -{ - struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); - - osc_lru_use(osc_cli(obj), opg); -} - -int osc_page_cache_add(const struct lu_env *env, - const struct cl_page_slice *slice, struct cl_io *io) -{ - struct osc_page *opg = cl2osc_page(slice); - int result; - - osc_page_transfer_get(opg, "transfer\0cache"); - result = osc_queue_async_io(env, io, opg); - if (result != 0) - osc_page_transfer_put(env, opg); - else - osc_page_transfer_add(env, opg, CRT_WRITE); - - return result; -} - -void osc_index2policy(union ldlm_policy_data *policy, - const struct cl_object *obj, - pgoff_t start, pgoff_t end) -{ - memset(policy, 0, sizeof(*policy)); - policy->l_extent.start = cl_offset(obj, start); - policy->l_extent.end = cl_offset(obj, end + 1) - 1; -} - -static const char *osc_list(struct list_head *head) -{ - return list_empty(head) ? "-" : "+"; -} - -static inline unsigned long osc_submit_duration(struct osc_page *opg) -{ - if (opg->ops_submit_time == 0) - return 0; - - return (cfs_time_current() - opg->ops_submit_time); -} - -static int osc_page_print(const struct lu_env *env, - const struct cl_page_slice *slice, - void *cookie, lu_printer_t printer) -{ - struct osc_page *opg = cl2osc_page(slice); - struct osc_async_page *oap = &opg->ops_oap; - struct osc_object *obj = cl2osc(slice->cpl_obj); - struct client_obd *cli = &osc_export(obj)->exp_obd->u.cli; - - return (*printer)(env, cookie, LUSTRE_OSC_NAME "-page@%p %lu: 1< %#x %d %u %s %s > 2< %llu %u %u %#x %#x | %p %p %p > 3< %d %lu %d > 4< %d %d %d %lu %s | %s %s %s %s > 5< %s %s %s %s | %d %s | %d %s %s>\n", - opg, osc_index(opg), - /* 1 */ - oap->oap_magic, oap->oap_cmd, - oap->oap_interrupted, - osc_list(&oap->oap_pending_item), - osc_list(&oap->oap_rpc_item), - /* 2 */ - oap->oap_obj_off, oap->oap_page_off, oap->oap_count, - oap->oap_async_flags, oap->oap_brw_flags, - oap->oap_request, oap->oap_cli, 
obj, - /* 3 */ - opg->ops_transfer_pinned, - osc_submit_duration(opg), opg->ops_srvlock, - /* 4 */ - cli->cl_r_in_flight, cli->cl_w_in_flight, - cli->cl_max_rpcs_in_flight, - cli->cl_avail_grant, - osc_list(&cli->cl_cache_waiters), - osc_list(&cli->cl_loi_ready_list), - osc_list(&cli->cl_loi_hp_ready_list), - osc_list(&cli->cl_loi_write_list), - osc_list(&cli->cl_loi_read_list), - /* 5 */ - osc_list(&obj->oo_ready_item), - osc_list(&obj->oo_hp_ready_item), - osc_list(&obj->oo_write_item), - osc_list(&obj->oo_read_item), - atomic_read(&obj->oo_nr_reads), - osc_list(&obj->oo_reading_exts), - atomic_read(&obj->oo_nr_writes), - osc_list(&obj->oo_hp_exts), - osc_list(&obj->oo_urgent_exts)); -} - -static void osc_page_delete(const struct lu_env *env, - const struct cl_page_slice *slice) -{ - struct osc_page *opg = cl2osc_page(slice); - struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); - int rc; - - CDEBUG(D_TRACE, "%p\n", opg); - osc_page_transfer_put(env, opg); - rc = osc_teardown_async_page(env, obj, opg); - if (rc) { - CL_PAGE_DEBUG(D_ERROR, env, slice->cpl_page, - "Trying to teardown failed: %d\n", rc); - LASSERT(0); - } - - osc_lru_del(osc_cli(obj), opg); - - if (slice->cpl_page->cp_type == CPT_CACHEABLE) { - void *value; - - spin_lock(&obj->oo_tree_lock); - value = radix_tree_delete(&obj->oo_tree, osc_index(opg)); - if (value) - --obj->oo_npages; - spin_unlock(&obj->oo_tree_lock); - - LASSERT(ergo(value, value == opg)); - } -} - -static void osc_page_clip(const struct lu_env *env, - const struct cl_page_slice *slice, int from, int to) -{ - struct osc_page *opg = cl2osc_page(slice); - struct osc_async_page *oap = &opg->ops_oap; - - opg->ops_from = from; - opg->ops_to = to; - spin_lock(&oap->oap_lock); - oap->oap_async_flags |= ASYNC_COUNT_STABLE; - spin_unlock(&oap->oap_lock); -} - -static int osc_page_cancel(const struct lu_env *env, - const struct cl_page_slice *slice) -{ - struct osc_page *opg = cl2osc_page(slice); - int rc = 0; - - /* Check if the 
transferring against this page - * is completed, or not even queued. - */ - if (opg->ops_transfer_pinned) - /* FIXME: may not be interrupted.. */ - rc = osc_cancel_async_page(env, opg); - LASSERT(ergo(rc == 0, opg->ops_transfer_pinned == 0)); - return rc; -} - -static int osc_page_flush(const struct lu_env *env, - const struct cl_page_slice *slice, - struct cl_io *io) -{ - struct osc_page *opg = cl2osc_page(slice); - int rc; - - rc = osc_flush_async_page(env, io, opg); - return rc; -} - -static const struct cl_page_operations osc_page_ops = { - .cpo_print = osc_page_print, - .cpo_delete = osc_page_delete, - .cpo_clip = osc_page_clip, - .cpo_cancel = osc_page_cancel, - .cpo_flush = osc_page_flush -}; - -int osc_page_init(const struct lu_env *env, struct cl_object *obj, - struct cl_page *page, pgoff_t index) -{ - struct osc_object *osc = cl2osc(obj); - struct osc_page *opg = cl_object_page_slice(obj, page); - int result; - - opg->ops_from = 0; - opg->ops_to = PAGE_SIZE; - - result = osc_prep_async_page(osc, opg, page->cp_vmpage, - cl_offset(obj, index)); - if (result == 0) { - struct osc_io *oio = osc_env_io(env); - - opg->ops_srvlock = osc_io_srvlock(oio); - cl_page_slice_add(page, &opg->ops_cl, obj, index, - &osc_page_ops); - } - INIT_LIST_HEAD(&opg->ops_lru); - - /* reserve an LRU space for this page */ - if (page->cp_type == CPT_CACHEABLE && result == 0) { - result = osc_lru_alloc(env, osc_cli(osc), opg); - if (result == 0) { - spin_lock(&osc->oo_tree_lock); - result = radix_tree_insert(&osc->oo_tree, index, opg); - if (result == 0) - ++osc->oo_npages; - spin_unlock(&osc->oo_tree_lock); - LASSERT(result == 0); - } - } - - return result; -} - -/** - * Helper function called by osc_io_submit() for every page in an immediate - * transfer (i.e., transferred synchronously). 
- */ -void osc_page_submit(const struct lu_env *env, struct osc_page *opg, - enum cl_req_type crt, int brw_flags) -{ - struct osc_async_page *oap = &opg->ops_oap; - - LASSERTF(oap->oap_magic == OAP_MAGIC, "Bad oap magic: oap %p, magic 0x%x\n", - oap, oap->oap_magic); - LASSERT(oap->oap_async_flags & ASYNC_READY); - LASSERT(oap->oap_async_flags & ASYNC_COUNT_STABLE); - - oap->oap_cmd = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ; - oap->oap_page_off = opg->ops_from; - oap->oap_count = opg->ops_to - opg->ops_from; - oap->oap_brw_flags = brw_flags | OBD_BRW_SYNC; - - if (capable(CAP_SYS_RESOURCE)) { - oap->oap_brw_flags |= OBD_BRW_NOQUOTA; - oap->oap_cmd |= OBD_BRW_NOQUOTA; - } - - opg->ops_submit_time = cfs_time_current(); - osc_page_transfer_get(opg, "transfer\0imm"); - osc_page_transfer_add(env, opg, crt); -} - -/* --------------- LRU page management ------------------ */ - -/* OSC is a natural place to manage LRU pages as applications are specialized - * to write OSC by OSC. Ideally, if one OSC is used more frequently it should - * occupy more LRU slots. On the other hand, we should avoid using up all LRU - * slots (client_obd::cl_lru_left) otherwise process has to be put into sleep - * for free LRU slots - this will be very bad so the algorithm requires each - * OSC to free slots voluntarily to maintain a reasonable number of free slots - * at any time. - */ -static DECLARE_WAIT_QUEUE_HEAD(osc_lru_waitq); - -/** - * LRU pages are freed in batch mode. OSC should at least free this - * number of pages to avoid running out of LRU slots. - */ -static inline int lru_shrink_min(struct client_obd *cli) -{ - return cli->cl_max_pages_per_rpc * 2; -} - -/** - * free this number at most otherwise it will take too long time to finish. - */ -static inline int lru_shrink_max(struct client_obd *cli) -{ - return cli->cl_max_pages_per_rpc * cli->cl_max_rpcs_in_flight; -} - -/** - * Check if we can free LRU slots from this OSC. 
If there exists LRU waiters, - * we should free slots aggressively. In this way, slots are freed in a steady - * step to maintain fairness among OSCs. - * - * Return how many LRU pages should be freed. - */ -static int osc_cache_too_much(struct client_obd *cli) -{ - struct cl_client_cache *cache = cli->cl_cache; - long pages = atomic_long_read(&cli->cl_lru_in_list); - unsigned long budget; - - budget = cache->ccc_lru_max / (atomic_read(&cache->ccc_users) - 2); - - /* if it's going to run out LRU slots, we should free some, but not - * too much to maintain fairness among OSCs. - */ - if (atomic_long_read(cli->cl_lru_left) < cache->ccc_lru_max >> 2) { - if (pages >= budget) - return lru_shrink_max(cli); - else if (pages >= budget / 2) - return lru_shrink_min(cli); - } else { - time64_t duration = ktime_get_real_seconds(); - long timediff; - - /* knock out pages by duration of no IO activity */ - duration -= cli->cl_lru_last_used; - /* - * The difference shouldn't be more than 70 years - * so we can safely case to a long. Round to - * approximately 1 minute. 
- */ - timediff = (long)(duration >> 6); - if (timediff > 0 && pages >= budget / timediff) - return lru_shrink_min(cli); - } - return 0; -} - -int lru_queue_work(const struct lu_env *env, void *data) -{ - struct client_obd *cli = data; - int count; - - CDEBUG(D_CACHE, "%s: run LRU work for client obd\n", cli_name(cli)); - - count = osc_cache_too_much(cli); - if (count > 0) { - int rc = osc_lru_shrink(env, cli, count, false); - - CDEBUG(D_CACHE, "%s: shrank %d/%d pages from client obd\n", - cli_name(cli), rc, count); - if (rc >= count) { - CDEBUG(D_CACHE, "%s: queue again\n", cli_name(cli)); - ptlrpcd_queue_work(cli->cl_lru_work); - } - } - - return 0; -} - -void osc_lru_add_batch(struct client_obd *cli, struct list_head *plist) -{ - LIST_HEAD(lru); - struct osc_async_page *oap; - long npages = 0; - - list_for_each_entry(oap, plist, oap_pending_item) { - struct osc_page *opg = oap2osc_page(oap); - - if (!opg->ops_in_lru) - continue; - - ++npages; - LASSERT(list_empty(&opg->ops_lru)); - list_add(&opg->ops_lru, &lru); - } - - if (npages > 0) { - spin_lock(&cli->cl_lru_list_lock); - list_splice_tail(&lru, &cli->cl_lru_list); - atomic_long_sub(npages, &cli->cl_lru_busy); - atomic_long_add(npages, &cli->cl_lru_in_list); - cli->cl_lru_last_used = ktime_get_real_seconds(); - spin_unlock(&cli->cl_lru_list_lock); - - if (waitqueue_active(&osc_lru_waitq)) - (void)ptlrpcd_queue_work(cli->cl_lru_work); - } -} - -static void __osc_lru_del(struct client_obd *cli, struct osc_page *opg) -{ - LASSERT(atomic_long_read(&cli->cl_lru_in_list) > 0); - list_del_init(&opg->ops_lru); - atomic_long_dec(&cli->cl_lru_in_list); -} - -/** - * Page is being destroyed. The page may be not in LRU list, if the transfer - * has never finished(error occurred). 
- */ -static void osc_lru_del(struct client_obd *cli, struct osc_page *opg) -{ - if (opg->ops_in_lru) { - spin_lock(&cli->cl_lru_list_lock); - if (!list_empty(&opg->ops_lru)) { - __osc_lru_del(cli, opg); - } else { - LASSERT(atomic_long_read(&cli->cl_lru_busy) > 0); - atomic_long_dec(&cli->cl_lru_busy); - } - spin_unlock(&cli->cl_lru_list_lock); - - atomic_long_inc(cli->cl_lru_left); - /* this is a great place to release more LRU pages if - * this osc occupies too many LRU pages and kernel is - * stealing one of them. - */ - if (osc_cache_too_much(cli)) { - CDEBUG(D_CACHE, "%s: queue LRU work\n", cli_name(cli)); - (void)ptlrpcd_queue_work(cli->cl_lru_work); - } - wake_up(&osc_lru_waitq); - } else { - LASSERT(list_empty(&opg->ops_lru)); - } -} - -/** - * Delete page from LRUlist for redirty. - */ -static void osc_lru_use(struct client_obd *cli, struct osc_page *opg) -{ - /* If page is being transferred for the first time, - * ops_lru should be empty - */ - if (opg->ops_in_lru && !list_empty(&opg->ops_lru)) { - spin_lock(&cli->cl_lru_list_lock); - __osc_lru_del(cli, opg); - spin_unlock(&cli->cl_lru_list_lock); - atomic_long_inc(&cli->cl_lru_busy); - } -} - -static void discard_pagevec(const struct lu_env *env, struct cl_io *io, - struct cl_page **pvec, int max_index) -{ - int i; - - for (i = 0; i < max_index; i++) { - struct cl_page *page = pvec[i]; - - LASSERT(cl_page_is_owned(page, io)); - cl_page_delete(env, page); - cl_page_discard(env, io, page); - cl_page_disown(env, io, page); - cl_page_put(env, page); - - pvec[i] = NULL; - } -} - -/** - * Check if a cl_page can be released, i.e, it's not being used. - * - * If unstable account is turned on, bulk transfer may hold one refcount - * for recovery so we need to check vmpage refcount as well; otherwise, - * even we can destroy cl_page but the corresponding vmpage can't be reused. 
- */ -static inline bool lru_page_busy(struct client_obd *cli, struct cl_page *page) -{ - if (cl_page_in_use_noref(page)) - return true; - - if (cli->cl_cache->ccc_unstable_check) { - struct page *vmpage = cl_page_vmpage(page); - - /* vmpage have two known users: cl_page and VM page cache */ - if (page_count(vmpage) - page_mapcount(vmpage) > 2) - return true; - } - return false; -} - -/** - * Drop @target of pages from LRU at most. - */ -long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli, - long target, bool force) -{ - struct cl_io *io; - struct cl_object *clobj = NULL; - struct cl_page **pvec; - struct osc_page *opg; - int maxscan = 0; - long count = 0; - int index = 0; - int rc = 0; - - LASSERT(atomic_long_read(&cli->cl_lru_in_list) >= 0); - if (atomic_long_read(&cli->cl_lru_in_list) == 0 || target <= 0) - return 0; - - CDEBUG(D_CACHE, "%s: shrinkers: %d, force: %d\n", - cli_name(cli), atomic_read(&cli->cl_lru_shrinkers), force); - if (!force) { - if (atomic_read(&cli->cl_lru_shrinkers) > 0) - return -EBUSY; - - if (atomic_inc_return(&cli->cl_lru_shrinkers) > 1) { - atomic_dec(&cli->cl_lru_shrinkers); - return -EBUSY; - } - } else { - atomic_inc(&cli->cl_lru_shrinkers); - } - - pvec = (struct cl_page **)osc_env_info(env)->oti_pvec; - io = &osc_env_info(env)->oti_io; - - spin_lock(&cli->cl_lru_list_lock); - if (force) - cli->cl_lru_reclaim++; - maxscan = min(target << 1, atomic_long_read(&cli->cl_lru_in_list)); - while (!list_empty(&cli->cl_lru_list)) { - struct cl_page *page; - bool will_free = false; - - if (!force && atomic_read(&cli->cl_lru_shrinkers) > 1) - break; - - if (--maxscan < 0) - break; - - opg = list_entry(cli->cl_lru_list.next, struct osc_page, - ops_lru); - page = opg->ops_cl.cpl_page; - if (lru_page_busy(cli, page)) { - list_move_tail(&opg->ops_lru, &cli->cl_lru_list); - continue; - } - - LASSERT(page->cp_obj); - if (clobj != page->cp_obj) { - struct cl_object *tmp = page->cp_obj; - - cl_object_get(tmp); - 
spin_unlock(&cli->cl_lru_list_lock); - - if (clobj) { - discard_pagevec(env, io, pvec, index); - index = 0; - - cl_io_fini(env, io); - cl_object_put(env, clobj); - clobj = NULL; - } - - clobj = tmp; - io->ci_obj = clobj; - io->ci_ignore_layout = 1; - rc = cl_io_init(env, io, CIT_MISC, clobj); - - spin_lock(&cli->cl_lru_list_lock); - - if (rc != 0) - break; - - ++maxscan; - continue; - } - - if (cl_page_own_try(env, io, page) == 0) { - if (!lru_page_busy(cli, page)) { - /* remove it from lru list earlier to avoid - * lock contention - */ - __osc_lru_del(cli, opg); - opg->ops_in_lru = 0; /* will be discarded */ - - cl_page_get(page); - will_free = true; - } else { - cl_page_disown(env, io, page); - } - } - - if (!will_free) { - list_move_tail(&opg->ops_lru, &cli->cl_lru_list); - continue; - } - - /* Don't discard and free the page with cl_lru_list held */ - pvec[index++] = page; - if (unlikely(index == OTI_PVEC_SIZE)) { - spin_unlock(&cli->cl_lru_list_lock); - discard_pagevec(env, io, pvec, index); - index = 0; - - spin_lock(&cli->cl_lru_list_lock); - } - - if (++count >= target) - break; - } - spin_unlock(&cli->cl_lru_list_lock); - - if (clobj) { - discard_pagevec(env, io, pvec, index); - - cl_io_fini(env, io); - cl_object_put(env, clobj); - } - - atomic_dec(&cli->cl_lru_shrinkers); - if (count > 0) { - atomic_long_add(count, cli->cl_lru_left); - wake_up_all(&osc_lru_waitq); - } - return count > 0 ? count : rc; -} - -/** - * Reclaim LRU pages by an IO thread. The caller wants to reclaim at least - * \@npages of LRU slots. For performance consideration, it's better to drop - * LRU pages in batch. Therefore, the actual number is adjusted at least - * max_pages_per_rpc. 
- */ -static long osc_lru_reclaim(struct client_obd *cli, unsigned long npages) -{ - struct lu_env *env; - struct cl_client_cache *cache = cli->cl_cache; - int max_scans; - u16 refcheck; - long rc = 0; - - LASSERT(cache); - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return 0; - - npages = max_t(int, npages, cli->cl_max_pages_per_rpc); - CDEBUG(D_CACHE, "%s: start to reclaim %ld pages from LRU\n", - cli_name(cli), npages); - rc = osc_lru_shrink(env, cli, npages, true); - if (rc >= npages) { - CDEBUG(D_CACHE, "%s: reclaimed %ld/%ld pages from LRU\n", - cli_name(cli), rc, npages); - if (osc_cache_too_much(cli) > 0) - ptlrpcd_queue_work(cli->cl_lru_work); - goto out; - } else if (rc > 0) { - npages -= rc; - } - - CDEBUG(D_CACHE, "%s: cli %p no free slots, pages: %ld/%ld, want: %ld\n", - cli_name(cli), cli, atomic_long_read(&cli->cl_lru_in_list), - atomic_long_read(&cli->cl_lru_busy), npages); - - /* Reclaim LRU slots from other client_obd as it can't free enough - * from its own. This should rarely happen. 
- */ - spin_lock(&cache->ccc_lru_lock); - LASSERT(!list_empty(&cache->ccc_lru)); - - cache->ccc_lru_shrinkers++; - list_move_tail(&cli->cl_lru_osc, &cache->ccc_lru); - - max_scans = atomic_read(&cache->ccc_users) - 2; - while (--max_scans > 0 && !list_empty(&cache->ccc_lru)) { - cli = list_entry(cache->ccc_lru.next, struct client_obd, - cl_lru_osc); - - CDEBUG(D_CACHE, "%s: cli %p LRU pages: %ld, busy: %ld.\n", - cli_name(cli), cli, - atomic_long_read(&cli->cl_lru_in_list), - atomic_long_read(&cli->cl_lru_busy)); - - list_move_tail(&cli->cl_lru_osc, &cache->ccc_lru); - if (osc_cache_too_much(cli) > 0) { - spin_unlock(&cache->ccc_lru_lock); - - rc = osc_lru_shrink(env, cli, npages, true); - spin_lock(&cache->ccc_lru_lock); - if (rc >= npages) - break; - if (rc > 0) - npages -= rc; - } - } - spin_unlock(&cache->ccc_lru_lock); - -out: - cl_env_put(env, &refcheck); - CDEBUG(D_CACHE, "%s: cli %p freed %ld pages.\n", - cli_name(cli), cli, rc); - return rc; -} - -/** - * osc_lru_alloc() is called to reserve an LRU slot for a cl_page. - * - * Usually the LRU slots are reserved in osc_io_iter_rw_init(). - * Only in the case that the LRU slots are in extreme shortage, it should - * have reserved enough slots for an IO. 
- */ -static int osc_lru_alloc(const struct lu_env *env, struct client_obd *cli, - struct osc_page *opg) -{ - struct osc_io *oio = osc_env_io(env); - int rc = 0; - - if (!cli->cl_cache) /* shall not be in LRU */ - return 0; - - if (oio->oi_lru_reserved > 0) { - --oio->oi_lru_reserved; - goto out; - } - - LASSERT(atomic_long_read(cli->cl_lru_left) >= 0); - while (!atomic_long_add_unless(cli->cl_lru_left, -1, 0)) { - /* run out of LRU spaces, try to drop some by itself */ - rc = osc_lru_reclaim(cli, 1); - if (rc < 0) - break; - if (rc > 0) - continue; - - cond_resched(); - - rc = l_wait_event_abortable(osc_lru_waitq, - atomic_long_read(cli->cl_lru_left) > 0); - - if (rc < 0) - break; - } - -out: - if (rc >= 0) { - atomic_long_inc(&cli->cl_lru_busy); - opg->ops_in_lru = 1; - rc = 0; - } - - return rc; -} - -/** - * osc_lru_reserve() is called to reserve enough LRU slots for I/O. - * - * The benefit of doing this is to reduce contention against atomic counter - * cl_lru_left by changing it from per-page access to per-IO access. - */ -unsigned long osc_lru_reserve(struct client_obd *cli, unsigned long npages) -{ - unsigned long reserved = 0; - unsigned long max_pages; - unsigned long c; - - /* - * reserve a full RPC window at most to avoid that a thread accidentally - * consumes too many LRU slots - */ - max_pages = cli->cl_max_pages_per_rpc * cli->cl_max_rpcs_in_flight; - if (npages > max_pages) - npages = max_pages; - - c = atomic_long_read(cli->cl_lru_left); - if (c < npages && osc_lru_reclaim(cli, npages) > 0) - c = atomic_long_read(cli->cl_lru_left); - while (c >= npages) { - if (c == atomic_long_cmpxchg(cli->cl_lru_left, c, c - npages)) { - reserved = npages; - break; - } - c = atomic_long_read(cli->cl_lru_left); - } - if (atomic_long_read(cli->cl_lru_left) < max_pages) { - /* - * If there aren't enough pages in the per-OSC LRU then - * wake up the LRU thread to try and clear out space, so - * we don't block if pages are being dirtied quickly. 
- */ - CDEBUG(D_CACHE, "%s: queue LRU, left: %lu/%ld.\n", - cli_name(cli), atomic_long_read(cli->cl_lru_left), - max_pages); - (void)ptlrpcd_queue_work(cli->cl_lru_work); - } - - return reserved; -} - -/** - * osc_lru_unreserve() is called to unreserve LRU slots. - * - * LRU slots reserved by osc_lru_reserve() may have entries left due to several - * reasons such as page already existing or I/O error. Those reserved slots - * should be freed by calling this function. - */ -void osc_lru_unreserve(struct client_obd *cli, unsigned long npages) -{ - atomic_long_add(npages, cli->cl_lru_left); - wake_up_all(&osc_lru_waitq); -} - -/** - * Atomic operations are expensive. We accumulate the accounting for the - * same page pgdat to get better performance. - * In practice this can work pretty good because the pages in the same RPC - * are likely from the same page zone. - */ -static inline void unstable_page_accounting(struct ptlrpc_bulk_desc *desc, - int factor) -{ - int page_count = desc->bd_iov_count; - pg_data_t *last = NULL; - int count = 0; - int i; - - LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); - - for (i = 0; i < page_count; i++) { - pg_data_t *pgdat = page_pgdat(BD_GET_KIOV(desc, i).bv_page); - - if (likely(pgdat == last)) { - ++count; - continue; - } - - if (count > 0) { - mod_node_page_state(pgdat, NR_UNSTABLE_NFS, - factor * count); - count = 0; - } - last = pgdat; - ++count; - } - if (count > 0) - mod_node_page_state(last, NR_UNSTABLE_NFS, factor * count); -} - -static inline void add_unstable_page_accounting(struct ptlrpc_bulk_desc *desc) -{ - unstable_page_accounting(desc, 1); -} - -static inline void dec_unstable_page_accounting(struct ptlrpc_bulk_desc *desc) -{ - unstable_page_accounting(desc, -1); -} - -/** - * Performs "unstable" page accounting. This function balances the - * increment operations performed in osc_inc_unstable_pages. It is - * registered as the RPC request callback, and is executed when the - * bulk RPC is committed on the server. 
Thus at this point, the pages - * involved in the bulk transfer are no longer considered unstable. - * - * If this function is called, the request should have been committed - * or req:rq_unstable must have been set; it implies that the unstable - * statistic have been added. - */ -void osc_dec_unstable_pages(struct ptlrpc_request *req) -{ - struct client_obd *cli = &req->rq_import->imp_obd->u.cli; - struct ptlrpc_bulk_desc *desc = req->rq_bulk; - int page_count = desc->bd_iov_count; - long unstable_count; - - LASSERT(page_count >= 0); - dec_unstable_page_accounting(desc); - - unstable_count = atomic_long_sub_return(page_count, - &cli->cl_unstable_count); - LASSERT(unstable_count >= 0); - - unstable_count = atomic_long_sub_return(page_count, - &cli->cl_cache->ccc_unstable_nr); - LASSERT(unstable_count >= 0); - if (!unstable_count) - wake_up_all(&cli->cl_cache->ccc_unstable_waitq); - - if (waitqueue_active(&osc_lru_waitq)) - (void)ptlrpcd_queue_work(cli->cl_lru_work); -} - -/** - * "unstable" page accounting. See: osc_dec_unstable_pages. - */ -void osc_inc_unstable_pages(struct ptlrpc_request *req) -{ - struct client_obd *cli = &req->rq_import->imp_obd->u.cli; - struct ptlrpc_bulk_desc *desc = req->rq_bulk; - long page_count = desc->bd_iov_count; - - /* No unstable page tracking */ - if (!cli->cl_cache || !cli->cl_cache->ccc_unstable_check) - return; - - add_unstable_page_accounting(desc); - atomic_long_add(page_count, &cli->cl_unstable_count); - atomic_long_add(page_count, &cli->cl_cache->ccc_unstable_nr); - - /* - * If the request has already been committed (i.e. brw_commit - * called via rq_commit_cb), we need to undo the unstable page - * increments we just performed because rq_commit_cb wont be - * called again. 
- */ - spin_lock(&req->rq_lock); - if (unlikely(req->rq_committed)) { - spin_unlock(&req->rq_lock); - - osc_dec_unstable_pages(req); - } else { - req->rq_unstable = 1; - spin_unlock(&req->rq_lock); - } -} - -/** - * Check if it piggybacks SOFT_SYNC flag to OST from this OSC. - * This function will be called by every BRW RPC so it's critical - * to make this function fast. - */ -bool osc_over_unstable_soft_limit(struct client_obd *cli) -{ - long unstable_nr, osc_unstable_count; - - /* Can't check cli->cl_unstable_count, therefore, no soft limit */ - if (!cli->cl_cache || !cli->cl_cache->ccc_unstable_check) - return false; - - osc_unstable_count = atomic_long_read(&cli->cl_unstable_count); - unstable_nr = atomic_long_read(&cli->cl_cache->ccc_unstable_nr); - - CDEBUG(D_CACHE, - "%s: cli: %p unstable pages: %lu, osc unstable pages: %lu\n", - cli_name(cli), cli, unstable_nr, osc_unstable_count); - - /* - * If the LRU slots are in shortage - 25% remaining AND this OSC - * has one full RPC window of unstable pages, it's a good chance - * to piggyback a SOFT_SYNC flag. - * Please notice that the OST won't take immediate response for the - * SOFT_SYNC request so active OSCs will have more chance to carry - * the flag, this is reasonable. 
- */ - return unstable_nr > cli->cl_cache->ccc_lru_max >> 2 && - osc_unstable_count > cli->cl_max_pages_per_rpc * - cli->cl_max_rpcs_in_flight; -} - -/** - * Return how many LRU pages in the cache of all OSC devices - * - * Return: return # of cached LRU pages times reclaimation tendency - * SHRINK_STOP if it cannot do any scanning in this time - */ -unsigned long osc_cache_shrink_count(struct shrinker *sk, - struct shrink_control *sc) -{ - struct client_obd *cli; - unsigned long cached = 0; - - spin_lock(&osc_shrink_lock); - list_for_each_entry(cli, &osc_shrink_list, cl_shrink_list) - cached += atomic_long_read(&cli->cl_lru_in_list); - spin_unlock(&osc_shrink_lock); - - return (cached * sysctl_vfs_cache_pressure) / 100; -} - -/** - * Scan and try to reclaim sc->nr_to_scan cached LRU pages - * - * Return: number of cached LRU pages reclaimed - * SHRINK_STOP if it cannot do any scanning in this time - * - * Linux kernel will loop calling this shrinker scan routine with - * sc->nr_to_scan = SHRINK_BATCH(128 for now) until kernel got enough memory. - * - * If sc->nr_to_scan is 0, the VM is querying the cache size, we don't need - * to scan and try to reclaim LRU pages, just return 0 and - * osc_cache_shrink_count() will report the LRU page number. 
- */ -unsigned long osc_cache_shrink_scan(struct shrinker *sk, - struct shrink_control *sc) -{ - struct client_obd *stop_anchor = NULL; - struct client_obd *cli; - struct lu_env *env; - long shrank = 0; - u16 refcheck; - int rc; - - if (!sc->nr_to_scan) - return 0; - - if (!(sc->gfp_mask & __GFP_FS)) - return SHRINK_STOP; - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return SHRINK_STOP; - - spin_lock(&osc_shrink_lock); - while (!list_empty(&osc_shrink_list)) { - cli = list_entry(osc_shrink_list.next, struct client_obd, - cl_shrink_list); - - if (!stop_anchor) - stop_anchor = cli; - else if (cli == stop_anchor) - break; - - list_move_tail(&cli->cl_shrink_list, &osc_shrink_list); - spin_unlock(&osc_shrink_lock); - - /* shrink no more than max_pages_per_rpc for an OSC */ - rc = osc_lru_shrink(env, cli, (sc->nr_to_scan - shrank) > - cli->cl_max_pages_per_rpc ? - cli->cl_max_pages_per_rpc : - sc->nr_to_scan - shrank, true); - if (rc > 0) - shrank += rc; - - if (shrank >= sc->nr_to_scan) - goto out; - - spin_lock(&osc_shrink_lock); - } - spin_unlock(&osc_shrink_lock); - -out: - cl_env_put(env, &refcheck); - - return shrank; -} - -/** @} osc */ diff --git a/drivers/staging/lustre/lustre/osc/osc_quota.c b/drivers/staging/lustre/lustre/osc/osc_quota.c deleted file mode 100644 index ce1731dc604f..000000000000 --- a/drivers/staging/lustre/lustre/osc/osc_quota.c +++ /dev/null @@ -1,284 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * GPL HEADER END - */ -/* - * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. - * - * Copyright (c) 2011, 2015, Intel Corporation. - * - * Code originally extracted from quota directory - */ - -#include <obd_class.h> -#include "osc_internal.h" - -static inline struct osc_quota_info *osc_oqi_alloc(u32 id) -{ - struct osc_quota_info *oqi; - - oqi = kmem_cache_zalloc(osc_quota_kmem, GFP_NOFS); - if (oqi) - oqi->oqi_id = id; - - return oqi; -} - -int osc_quota_chkdq(struct client_obd *cli, const unsigned int qid[]) -{ - int type; - - for (type = 0; type < MAXQUOTAS; type++) { - struct osc_quota_info *oqi; - - oqi = cfs_hash_lookup(cli->cl_quota_hash[type], &qid[type]); - if (oqi) { - /* do not try to access oqi here, it could have been - * freed by osc_quota_setdq() - */ - - /* the slot is busy, the user is about to run out of - * quota space on this OST - */ - CDEBUG(D_QUOTA, "chkdq found noquota for %s %d\n", - type == USRQUOTA ? "user" : "grout", qid[type]); - return NO_QUOTA; - } - } - - return QUOTA_OK; -} - -#define MD_QUOTA_FLAG(type) ((type == USRQUOTA) ? OBD_MD_FLUSRQUOTA \ - : OBD_MD_FLGRPQUOTA) -#define FL_QUOTA_FLAG(type) ((type == USRQUOTA) ? 
OBD_FL_NO_USRQUOTA \ - : OBD_FL_NO_GRPQUOTA) - -int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[], - u32 valid, u32 flags) -{ - int type; - int rc = 0; - - if ((valid & (OBD_MD_FLUSRQUOTA | OBD_MD_FLGRPQUOTA)) == 0) - return 0; - - for (type = 0; type < MAXQUOTAS; type++) { - struct osc_quota_info *oqi; - - if ((valid & MD_QUOTA_FLAG(type)) == 0) - continue; - - /* lookup the ID in the per-type hash table */ - oqi = cfs_hash_lookup(cli->cl_quota_hash[type], &qid[type]); - if ((flags & FL_QUOTA_FLAG(type)) != 0) { - /* This ID is getting close to its quota limit, let's - * switch to sync I/O - */ - if (oqi) - continue; - - oqi = osc_oqi_alloc(qid[type]); - if (!oqi) { - rc = -ENOMEM; - break; - } - - rc = cfs_hash_add_unique(cli->cl_quota_hash[type], - &qid[type], &oqi->oqi_hash); - /* race with others? */ - if (rc == -EALREADY) { - rc = 0; - kmem_cache_free(osc_quota_kmem, oqi); - } - - CDEBUG(D_QUOTA, "%s: setdq to insert for %s %d (%d)\n", - cli_name(cli), - type == USRQUOTA ? "user" : "group", - qid[type], rc); - } else { - /* This ID is now off the hook, let's remove it from - * the hash table - */ - if (!oqi) - continue; - - oqi = cfs_hash_del_key(cli->cl_quota_hash[type], - &qid[type]); - if (oqi) - kmem_cache_free(osc_quota_kmem, oqi); - - CDEBUG(D_QUOTA, "%s: setdq to remove for %s %d (%p)\n", - cli_name(cli), - type == USRQUOTA ? 
"user" : "group", - qid[type], oqi); - } - } - - return rc; -} - -/* - * Hash operations for uid/gid <-> osc_quota_info - */ -static unsigned int -oqi_hashfn(struct cfs_hash *hs, const void *key, unsigned int mask) -{ - return cfs_hash_u32_hash(*((__u32 *)key), mask); -} - -static int -oqi_keycmp(const void *key, struct hlist_node *hnode) -{ - struct osc_quota_info *oqi; - u32 uid; - - LASSERT(key); - uid = *((u32 *)key); - oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash); - - return uid == oqi->oqi_id; -} - -static void * -oqi_key(struct hlist_node *hnode) -{ - struct osc_quota_info *oqi; - - oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash); - return &oqi->oqi_id; -} - -static void * -oqi_object(struct hlist_node *hnode) -{ - return hlist_entry(hnode, struct osc_quota_info, oqi_hash); -} - -static void -oqi_get(struct cfs_hash *hs, struct hlist_node *hnode) -{ -} - -static void -oqi_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) -{ -} - -static void -oqi_exit(struct cfs_hash *hs, struct hlist_node *hnode) -{ - struct osc_quota_info *oqi; - - oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash); - - kmem_cache_free(osc_quota_kmem, oqi); -} - -#define HASH_QUOTA_BKT_BITS 5 -#define HASH_QUOTA_CUR_BITS 5 -#define HASH_QUOTA_MAX_BITS 15 - -static struct cfs_hash_ops quota_hash_ops = { - .hs_hash = oqi_hashfn, - .hs_keycmp = oqi_keycmp, - .hs_key = oqi_key, - .hs_object = oqi_object, - .hs_get = oqi_get, - .hs_put_locked = oqi_put_locked, - .hs_exit = oqi_exit, -}; - -int osc_quota_setup(struct obd_device *obd) -{ - struct client_obd *cli = &obd->u.cli; - int i, type; - - for (type = 0; type < MAXQUOTAS; type++) { - cli->cl_quota_hash[type] = cfs_hash_create("QUOTA_HASH", - HASH_QUOTA_CUR_BITS, - HASH_QUOTA_MAX_BITS, - HASH_QUOTA_BKT_BITS, - 0, - CFS_HASH_MIN_THETA, - CFS_HASH_MAX_THETA, - &quota_hash_ops, - CFS_HASH_DEFAULT); - if (!cli->cl_quota_hash[type]) - break; - } - - if (type == MAXQUOTAS) - return 0; - - for (i = 0; i < type; 
i++) - cfs_hash_putref(cli->cl_quota_hash[i]); - - return -ENOMEM; -} - -int osc_quota_cleanup(struct obd_device *obd) -{ - struct client_obd *cli = &obd->u.cli; - int type; - - for (type = 0; type < MAXQUOTAS; type++) - cfs_hash_putref(cli->cl_quota_hash[type]); - - return 0; -} - -int osc_quotactl(struct obd_device *unused, struct obd_export *exp, - struct obd_quotactl *oqctl) -{ - struct ptlrpc_request *req; - struct obd_quotactl *oqc; - int rc; - - req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), - &RQF_OST_QUOTACTL, LUSTRE_OST_VERSION, - OST_QUOTACTL); - if (!req) - return -ENOMEM; - - oqc = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL); - *oqc = *oqctl; - - ptlrpc_request_set_replen(req); - ptlrpc_at_set_req_timeout(req); - req->rq_no_resend = 1; - - rc = ptlrpc_queue_wait(req); - if (rc) - CERROR("ptlrpc_queue_wait failed, rc: %d\n", rc); - - if (req->rq_repmsg) { - oqc = req_capsule_server_get(&req->rq_pill, &RMF_OBD_QUOTACTL); - if (oqc) { - *oqctl = *oqc; - } else if (!rc) { - CERROR("Can't unpack obd_quotactl\n"); - rc = -EPROTO; - } - } else if (!rc) { - CERROR("Can't unpack obd_quotactl\n"); - rc = -EPROTO; - } - ptlrpc_req_finished(req); - - return rc; -} diff --git a/drivers/staging/lustre/lustre/osc/osc_request.c b/drivers/staging/lustre/lustre/osc/osc_request.c deleted file mode 100644 index 1c2bbbf5d864..000000000000 --- a/drivers/staging/lustre/lustre/osc/osc_request.c +++ /dev/null @@ -1,2899 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#define DEBUG_SUBSYSTEM S_OSC - -#include <linux/libcfs/libcfs.h> - -#include <lustre_dlm.h> -#include <lustre_net.h> -#include <uapi/linux/lustre/lustre_idl.h> -#include <obd_cksum.h> - -#include <lustre_ha.h> -#include <lprocfs_status.h> -#include <uapi/linux/lustre/lustre_ioctl.h> -#include <lustre_debug.h> -#include <lustre_obdo.h> -#include <uapi/linux/lustre/lustre_param.h> -#include <lustre_fid.h> -#include <obd_class.h> -#include <obd.h> -#include "osc_internal.h" -#include "osc_cl_internal.h" - -atomic_t osc_pool_req_count; -unsigned int osc_reqpool_maxreqcount; -struct ptlrpc_request_pool *osc_rq_pool; - -/* max memory used for request pool, unit is MB */ -static unsigned int osc_reqpool_mem_max = 5; -module_param(osc_reqpool_mem_max, uint, 0444); - -struct osc_brw_async_args { - struct obdo *aa_oa; - int aa_requested_nob; - int aa_nio_count; - u32 aa_page_count; - int aa_resends; - struct brw_page **aa_ppga; - struct client_obd *aa_cli; - struct list_head aa_oaps; - struct list_head aa_exts; -}; - -struct osc_async_args { - struct obd_info *aa_oi; -}; - -struct osc_setattr_args { - struct obdo *sa_oa; - obd_enqueue_update_f sa_upcall; - void *sa_cookie; -}; - -struct osc_fsync_args { - struct osc_object *fa_obj; - struct obdo *fa_oa; - obd_enqueue_update_f fa_upcall; - void *fa_cookie; -}; - -struct osc_enqueue_args { - struct 
obd_export *oa_exp; - enum ldlm_type oa_type; - enum ldlm_mode oa_mode; - __u64 *oa_flags; - osc_enqueue_upcall_f oa_upcall; - void *oa_cookie; - struct ost_lvb *oa_lvb; - struct lustre_handle oa_lockh; - unsigned int oa_agl:1; -}; - -static void osc_release_ppga(struct brw_page **ppga, u32 count); -static int brw_interpret(const struct lu_env *env, - struct ptlrpc_request *req, void *data, int rc); - -static inline void osc_pack_req_body(struct ptlrpc_request *req, - struct obdo *oa) -{ - struct ost_body *body; - - body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); - LASSERT(body); - - lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); -} - -static int osc_getattr(const struct lu_env *env, struct obd_export *exp, - struct obdo *oa) -{ - struct ptlrpc_request *req; - struct ost_body *body; - int rc; - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GETATTR); - if (!req) - return -ENOMEM; - - rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GETATTR); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - - osc_pack_req_body(req, oa); - - ptlrpc_request_set_replen(req); - - rc = ptlrpc_queue_wait(req); - if (rc) - goto out; - - body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); - if (!body) { - rc = -EPROTO; - goto out; - } - - CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode); - lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, - &body->oa); - - oa->o_blksize = cli_brw_size(exp->exp_obd); - oa->o_valid |= OBD_MD_FLBLKSZ; - - out: - ptlrpc_req_finished(req); - return rc; -} - -static int osc_setattr(const struct lu_env *env, struct obd_export *exp, - struct obdo *oa) -{ - struct ptlrpc_request *req; - struct ost_body *body; - int rc; - - LASSERT(oa->o_valid & OBD_MD_FLGROUP); - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR); - if (!req) - return -ENOMEM; - - rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR); - if (rc) { - ptlrpc_request_free(req); - return rc; - 
} - - osc_pack_req_body(req, oa); - - ptlrpc_request_set_replen(req); - - rc = ptlrpc_queue_wait(req); - if (rc) - goto out; - - body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); - if (!body) { - rc = -EPROTO; - goto out; - } - - lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, - &body->oa); - -out: - ptlrpc_req_finished(req); - return rc; -} - -static int osc_setattr_interpret(const struct lu_env *env, - struct ptlrpc_request *req, - struct osc_setattr_args *sa, int rc) -{ - struct ost_body *body; - - if (rc != 0) - goto out; - - body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); - if (!body) { - rc = -EPROTO; - goto out; - } - - lustre_get_wire_obdo(&req->rq_import->imp_connect_data, sa->sa_oa, - &body->oa); -out: - rc = sa->sa_upcall(sa->sa_cookie, rc); - return rc; -} - -int osc_setattr_async(struct obd_export *exp, struct obdo *oa, - obd_enqueue_update_f upcall, void *cookie, - struct ptlrpc_request_set *rqset) -{ - struct ptlrpc_request *req; - struct osc_setattr_args *sa; - int rc; - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR); - if (!req) - return -ENOMEM; - - rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - - osc_pack_req_body(req, oa); - - ptlrpc_request_set_replen(req); - - /* do mds to ost setattr asynchronously */ - if (!rqset) { - /* Do not wait for response. 
*/ - ptlrpcd_add_req(req); - } else { - req->rq_interpret_reply = - (ptlrpc_interpterer_t)osc_setattr_interpret; - - BUILD_BUG_ON(sizeof(*sa) > sizeof(req->rq_async_args)); - sa = ptlrpc_req_async_args(req); - sa->sa_oa = oa; - sa->sa_upcall = upcall; - sa->sa_cookie = cookie; - - if (rqset == PTLRPCD_SET) - ptlrpcd_add_req(req); - else - ptlrpc_set_add_req(rqset, req); - } - - return 0; -} - -static int osc_create(const struct lu_env *env, struct obd_export *exp, - struct obdo *oa) -{ - struct ptlrpc_request *req; - struct ost_body *body; - int rc; - - LASSERT(oa); - LASSERT(oa->o_valid & OBD_MD_FLGROUP); - LASSERT(fid_seq_is_echo(ostid_seq(&oa->o_oi))); - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_CREATE); - if (!req) { - rc = -ENOMEM; - goto out; - } - - rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_CREATE); - if (rc) { - ptlrpc_request_free(req); - goto out; - } - - body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); - LASSERT(body); - - lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); - - ptlrpc_request_set_replen(req); - - rc = ptlrpc_queue_wait(req); - if (rc) - goto out_req; - - body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); - if (!body) { - rc = -EPROTO; - goto out_req; - } - - CDEBUG(D_INFO, "oa flags %x\n", oa->o_flags); - lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, &body->oa); - - oa->o_blksize = cli_brw_size(exp->exp_obd); - oa->o_valid |= OBD_MD_FLBLKSZ; - - CDEBUG(D_HA, "transno: %lld\n", - lustre_msg_get_transno(req->rq_repmsg)); -out_req: - ptlrpc_req_finished(req); -out: - return rc; -} - -int osc_punch_base(struct obd_export *exp, struct obdo *oa, - obd_enqueue_update_f upcall, void *cookie, - struct ptlrpc_request_set *rqset) -{ - struct ptlrpc_request *req; - struct osc_setattr_args *sa; - struct ost_body *body; - int rc; - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_PUNCH); - if (!req) - return -ENOMEM; - - rc = ptlrpc_request_pack(req, 
LUSTRE_OST_VERSION, OST_PUNCH); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ - ptlrpc_at_set_req_timeout(req); - - body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); - LASSERT(body); - lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, - oa); - - ptlrpc_request_set_replen(req); - - req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_setattr_interpret; - BUILD_BUG_ON(sizeof(*sa) > sizeof(req->rq_async_args)); - sa = ptlrpc_req_async_args(req); - sa->sa_oa = oa; - sa->sa_upcall = upcall; - sa->sa_cookie = cookie; - if (rqset == PTLRPCD_SET) - ptlrpcd_add_req(req); - else - ptlrpc_set_add_req(rqset, req); - - return 0; -} - -static int osc_sync_interpret(const struct lu_env *env, - struct ptlrpc_request *req, - void *arg, int rc) -{ - struct cl_attr *attr = &osc_env_info(env)->oti_attr; - struct osc_fsync_args *fa = arg; - unsigned long valid = 0; - struct ost_body *body; - struct cl_object *obj; - - if (rc) - goto out; - - body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); - if (!body) { - CERROR("can't unpack ost_body\n"); - rc = -EPROTO; - goto out; - } - - *fa->fa_oa = body->oa; - obj = osc2cl(fa->fa_obj); - - /* Update osc object's blocks attribute */ - cl_object_attr_lock(obj); - if (body->oa.o_valid & OBD_MD_FLBLOCKS) { - attr->cat_blocks = body->oa.o_blocks; - valid |= CAT_BLOCKS; - } - - if (valid) - cl_object_attr_update(env, obj, attr, valid); - cl_object_attr_unlock(obj); - -out: - rc = fa->fa_upcall(fa->fa_cookie, rc); - return rc; -} - -int osc_sync_base(struct osc_object *obj, struct obdo *oa, - obd_enqueue_update_f upcall, void *cookie, - struct ptlrpc_request_set *rqset) -{ - struct obd_export *exp = osc_export(obj); - struct ptlrpc_request *req; - struct ost_body *body; - struct osc_fsync_args *fa; - int rc; - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SYNC); - if (!req) - return -ENOMEM; - - rc = ptlrpc_request_pack(req, 
LUSTRE_OST_VERSION, OST_SYNC); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - - /* overload the size and blocks fields in the oa with start/end */ - body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); - LASSERT(body); - lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, - oa); - - ptlrpc_request_set_replen(req); - req->rq_interpret_reply = osc_sync_interpret; - - BUILD_BUG_ON(sizeof(*fa) > sizeof(req->rq_async_args)); - fa = ptlrpc_req_async_args(req); - fa->fa_obj = obj; - fa->fa_oa = oa; - fa->fa_upcall = upcall; - fa->fa_cookie = cookie; - - if (rqset == PTLRPCD_SET) - ptlrpcd_add_req(req); - else - ptlrpc_set_add_req(rqset, req); - - return 0; -} - -/* Find and cancel locally locks matched by @mode in the resource found by - * @objid. Found locks are added into @cancel list. Returns the amount of - * locks added to @cancels list. - */ -static int osc_resource_get_unused(struct obd_export *exp, struct obdo *oa, - struct list_head *cancels, - enum ldlm_mode mode, __u64 lock_flags) -{ - struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; - struct ldlm_res_id res_id; - struct ldlm_resource *res; - int count; - - /* Return, i.e. cancel nothing, only if ELC is supported (flag in - * export) but disabled through procfs (flag in NS). - * - * This distinguishes from a case when ELC is not supported originally, - * when we still want to cancel locks in advance and just cancel them - * locally, without sending any RPC. 
- */ - if (exp_connect_cancelset(exp) && !ns_connect_cancelset(ns)) - return 0; - - ostid_build_res_name(&oa->o_oi, &res_id); - res = ldlm_resource_get(ns, NULL, &res_id, 0, 0); - if (IS_ERR(res)) - return 0; - - LDLM_RESOURCE_ADDREF(res); - count = ldlm_cancel_resource_local(res, cancels, NULL, mode, - lock_flags, 0, NULL); - LDLM_RESOURCE_DELREF(res); - ldlm_resource_putref(res); - return count; -} - -static int osc_destroy_interpret(const struct lu_env *env, - struct ptlrpc_request *req, void *data, - int rc) -{ - struct client_obd *cli = &req->rq_import->imp_obd->u.cli; - - atomic_dec(&cli->cl_destroy_in_flight); - wake_up(&cli->cl_destroy_waitq); - return 0; -} - -static int osc_can_send_destroy(struct client_obd *cli) -{ - if (atomic_inc_return(&cli->cl_destroy_in_flight) <= - cli->cl_max_rpcs_in_flight) { - /* The destroy request can be sent */ - return 1; - } - if (atomic_dec_return(&cli->cl_destroy_in_flight) < - cli->cl_max_rpcs_in_flight) { - /* - * The counter has been modified between the two atomic - * operations. 
- */ - wake_up(&cli->cl_destroy_waitq); - } - return 0; -} - -static int osc_destroy(const struct lu_env *env, struct obd_export *exp, - struct obdo *oa) -{ - struct client_obd *cli = &exp->exp_obd->u.cli; - struct ptlrpc_request *req; - struct ost_body *body; - LIST_HEAD(cancels); - int rc, count; - - if (!oa) { - CDEBUG(D_INFO, "oa NULL\n"); - return -EINVAL; - } - - count = osc_resource_get_unused(exp, oa, &cancels, LCK_PW, - LDLM_FL_DISCARD_DATA); - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_DESTROY); - if (!req) { - ldlm_lock_list_put(&cancels, l_bl_ast, count); - return -ENOMEM; - } - - rc = ldlm_prep_elc_req(exp, req, LUSTRE_OST_VERSION, OST_DESTROY, - 0, &cancels, count); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - - req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ - ptlrpc_at_set_req_timeout(req); - - body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); - LASSERT(body); - lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); - - ptlrpc_request_set_replen(req); - - req->rq_interpret_reply = osc_destroy_interpret; - if (!osc_can_send_destroy(cli)) { - /* - * Wait until the number of on-going destroy RPCs drops - * under max_rpc_in_flight - */ - l_wait_event_abortable_exclusive(cli->cl_destroy_waitq, - osc_can_send_destroy(cli)); - } - - /* Do not wait for response */ - ptlrpcd_add_req(req); - return 0; -} - -static void osc_announce_cached(struct client_obd *cli, struct obdo *oa, - long writing_bytes) -{ - u32 bits = OBD_MD_FLBLOCKS | OBD_MD_FLGRANT; - - LASSERT(!(oa->o_valid & bits)); - - oa->o_valid |= bits; - spin_lock(&cli->cl_loi_list_lock); - oa->o_dirty = cli->cl_dirty_pages << PAGE_SHIFT; - if (unlikely(cli->cl_dirty_pages - cli->cl_dirty_transit > - cli->cl_dirty_max_pages)) { - CERROR("dirty %lu - %lu > dirty_max %lu\n", - cli->cl_dirty_pages, cli->cl_dirty_transit, - cli->cl_dirty_max_pages); - oa->o_undirty = 0; - } else if (unlikely(atomic_long_read(&obd_dirty_pages) - - 
atomic_long_read(&obd_dirty_transit_pages) > - (long)(obd_max_dirty_pages + 1))) { - /* The atomic_read() allowing the atomic_inc() are - * not covered by a lock thus they may safely race and trip - * this CERROR() unless we add in a small fudge factor (+1). - */ - CERROR("%s: dirty %ld + %ld > system dirty_max %ld\n", - cli_name(cli), atomic_long_read(&obd_dirty_pages), - atomic_long_read(&obd_dirty_transit_pages), - obd_max_dirty_pages); - oa->o_undirty = 0; - } else if (unlikely(cli->cl_dirty_max_pages - cli->cl_dirty_pages > - 0x7fffffff)) { - CERROR("dirty %lu - dirty_max %lu too big???\n", - cli->cl_dirty_pages, cli->cl_dirty_max_pages); - oa->o_undirty = 0; - } else { - unsigned long max_in_flight; - - max_in_flight = (cli->cl_max_pages_per_rpc << PAGE_SHIFT) * - (cli->cl_max_rpcs_in_flight + 1); - oa->o_undirty = max(cli->cl_dirty_max_pages << PAGE_SHIFT, - max_in_flight); - } - oa->o_grant = cli->cl_avail_grant + cli->cl_reserved_grant; - oa->o_dropped = cli->cl_lost_grant; - cli->cl_lost_grant = 0; - spin_unlock(&cli->cl_loi_list_lock); - CDEBUG(D_CACHE, "dirty: %llu undirty: %u dropped %u grant: %llu\n", - oa->o_dirty, oa->o_undirty, oa->o_dropped, oa->o_grant); -} - -void osc_update_next_shrink(struct client_obd *cli) -{ - cli->cl_next_shrink_grant = - cfs_time_shift(cli->cl_grant_shrink_interval); - CDEBUG(D_CACHE, "next time %ld to shrink grant\n", - cli->cl_next_shrink_grant); -} - -static void __osc_update_grant(struct client_obd *cli, u64 grant) -{ - spin_lock(&cli->cl_loi_list_lock); - cli->cl_avail_grant += grant; - spin_unlock(&cli->cl_loi_list_lock); -} - -static void osc_update_grant(struct client_obd *cli, struct ost_body *body) -{ - if (body->oa.o_valid & OBD_MD_FLGRANT) { - CDEBUG(D_CACHE, "got %llu extra grant\n", body->oa.o_grant); - __osc_update_grant(cli, body->oa.o_grant); - } -} - -static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, - u32 keylen, void *key, u32 vallen, - void *val, struct ptlrpc_request_set 
*set); - -static int osc_shrink_grant_interpret(const struct lu_env *env, - struct ptlrpc_request *req, - void *aa, int rc) -{ - struct client_obd *cli = &req->rq_import->imp_obd->u.cli; - struct obdo *oa = ((struct osc_brw_async_args *)aa)->aa_oa; - struct ost_body *body; - - if (rc != 0) { - __osc_update_grant(cli, oa->o_grant); - goto out; - } - - body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); - LASSERT(body); - osc_update_grant(cli, body); -out: - kmem_cache_free(obdo_cachep, oa); - return rc; -} - -static void osc_shrink_grant_local(struct client_obd *cli, struct obdo *oa) -{ - spin_lock(&cli->cl_loi_list_lock); - oa->o_grant = cli->cl_avail_grant / 4; - cli->cl_avail_grant -= oa->o_grant; - spin_unlock(&cli->cl_loi_list_lock); - if (!(oa->o_valid & OBD_MD_FLFLAGS)) { - oa->o_valid |= OBD_MD_FLFLAGS; - oa->o_flags = 0; - } - oa->o_flags |= OBD_FL_SHRINK_GRANT; - osc_update_next_shrink(cli); -} - -/* Shrink the current grant, either from some large amount to enough for a - * full set of in-flight RPCs, or if we have already shrunk to that limit - * then to enough for a single RPC. This avoids keeping more grant than - * needed, and avoids shrinking the grant piecemeal. - */ -static int osc_shrink_grant(struct client_obd *cli) -{ - __u64 target_bytes = (cli->cl_max_rpcs_in_flight + 1) * - (cli->cl_max_pages_per_rpc << PAGE_SHIFT); - - spin_lock(&cli->cl_loi_list_lock); - if (cli->cl_avail_grant <= target_bytes) - target_bytes = cli->cl_max_pages_per_rpc << PAGE_SHIFT; - spin_unlock(&cli->cl_loi_list_lock); - - return osc_shrink_grant_to_target(cli, target_bytes); -} - -int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes) -{ - int rc = 0; - struct ost_body *body; - - spin_lock(&cli->cl_loi_list_lock); - /* Don't shrink if we are already above or below the desired limit - * We don't want to shrink below a single RPC, as that will negatively - * impact block allocation and long-term performance. 
- */ - if (target_bytes < cli->cl_max_pages_per_rpc << PAGE_SHIFT) - target_bytes = cli->cl_max_pages_per_rpc << PAGE_SHIFT; - - if (target_bytes >= cli->cl_avail_grant) { - spin_unlock(&cli->cl_loi_list_lock); - return 0; - } - spin_unlock(&cli->cl_loi_list_lock); - - body = kzalloc(sizeof(*body), GFP_NOFS); - if (!body) - return -ENOMEM; - - osc_announce_cached(cli, &body->oa, 0); - - spin_lock(&cli->cl_loi_list_lock); - body->oa.o_grant = cli->cl_avail_grant - target_bytes; - cli->cl_avail_grant = target_bytes; - spin_unlock(&cli->cl_loi_list_lock); - if (!(body->oa.o_valid & OBD_MD_FLFLAGS)) { - body->oa.o_valid |= OBD_MD_FLFLAGS; - body->oa.o_flags = 0; - } - body->oa.o_flags |= OBD_FL_SHRINK_GRANT; - osc_update_next_shrink(cli); - - rc = osc_set_info_async(NULL, cli->cl_import->imp_obd->obd_self_export, - sizeof(KEY_GRANT_SHRINK), KEY_GRANT_SHRINK, - sizeof(*body), body, NULL); - if (rc != 0) - __osc_update_grant(cli, body->oa.o_grant); - kfree(body); - return rc; -} - -static int osc_should_shrink_grant(struct client_obd *client) -{ - unsigned long time = cfs_time_current(); - unsigned long next_shrink = client->cl_next_shrink_grant; - - if ((client->cl_import->imp_connect_data.ocd_connect_flags & - OBD_CONNECT_GRANT_SHRINK) == 0) - return 0; - - if (cfs_time_aftereq(time, next_shrink - 5 * CFS_TICK)) { - /* Get the current RPC size directly, instead of going via: - * cli_brw_size(obd->u.cli.cl_import->imp_obd->obd_self_export) - * Keep comment here so that it can be found by searching. 
- */ - int brw_size = client->cl_max_pages_per_rpc << PAGE_SHIFT; - - if (client->cl_import->imp_state == LUSTRE_IMP_FULL && - client->cl_avail_grant > brw_size) - return 1; - - osc_update_next_shrink(client); - } - return 0; -} - -static int osc_grant_shrink_grant_cb(struct timeout_item *item, void *data) -{ - struct client_obd *client; - - list_for_each_entry(client, &item->ti_obd_list, cl_grant_shrink_list) { - if (osc_should_shrink_grant(client)) - osc_shrink_grant(client); - } - return 0; -} - -static int osc_add_shrink_grant(struct client_obd *client) -{ - int rc; - - rc = ptlrpc_add_timeout_client(client->cl_grant_shrink_interval, - TIMEOUT_GRANT, - osc_grant_shrink_grant_cb, NULL, - &client->cl_grant_shrink_list); - if (rc) { - CERROR("add grant client %s error %d\n", cli_name(client), rc); - return rc; - } - CDEBUG(D_CACHE, "add grant client %s\n", cli_name(client)); - osc_update_next_shrink(client); - return 0; -} - -static int osc_del_shrink_grant(struct client_obd *client) -{ - return ptlrpc_del_timeout_client(&client->cl_grant_shrink_list, - TIMEOUT_GRANT); -} - -static void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd) -{ - /* - * ocd_grant is the total grant amount we're expect to hold: if we've - * been evicted, it's the new avail_grant amount, cl_dirty_pages will - * drop to 0 as inflight RPCs fail out; otherwise, it's avail_grant + - * dirty. - * - * race is tolerable here: if we're evicted, but imp_state already - * left EVICTED state, then cl_dirty_pages must be 0 already. - */ - spin_lock(&cli->cl_loi_list_lock); - if (cli->cl_import->imp_state == LUSTRE_IMP_EVICTED) - cli->cl_avail_grant = ocd->ocd_grant; - else - cli->cl_avail_grant = ocd->ocd_grant - - (cli->cl_dirty_pages << PAGE_SHIFT); - - /* determine the appropriate chunk size used by osc_extent. 
*/ - cli->cl_chunkbits = max_t(int, PAGE_SHIFT, ocd->ocd_blocksize); - spin_unlock(&cli->cl_loi_list_lock); - - CDEBUG(D_CACHE, "%s, setting cl_avail_grant: %ld cl_lost_grant: %ld chunk bits: %d\n", - cli_name(cli), cli->cl_avail_grant, cli->cl_lost_grant, - cli->cl_chunkbits); - - if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT_SHRINK && - list_empty(&cli->cl_grant_shrink_list)) - osc_add_shrink_grant(cli); -} - -/* We assume that the reason this OSC got a short read is because it read - * beyond the end of a stripe file; i.e. lustre is reading a sparse file - * via the LOV, and it _knows_ it's reading inside the file, it's just that - * this stripe never got written at or beyond this stripe offset yet. - */ -static void handle_short_read(int nob_read, u32 page_count, - struct brw_page **pga) -{ - char *ptr; - int i = 0; - - /* skip bytes read OK */ - while (nob_read > 0) { - LASSERT(page_count > 0); - - if (pga[i]->count > nob_read) { - /* EOF inside this page */ - ptr = kmap(pga[i]->pg) + - (pga[i]->off & ~PAGE_MASK); - memset(ptr + nob_read, 0, pga[i]->count - nob_read); - kunmap(pga[i]->pg); - page_count--; - i++; - break; - } - - nob_read -= pga[i]->count; - page_count--; - i++; - } - - /* zero remaining pages */ - while (page_count-- > 0) { - ptr = kmap(pga[i]->pg) + (pga[i]->off & ~PAGE_MASK); - memset(ptr, 0, pga[i]->count); - kunmap(pga[i]->pg); - i++; - } -} - -static int check_write_rcs(struct ptlrpc_request *req, - int requested_nob, int niocount, - u32 page_count, struct brw_page **pga) -{ - int i; - __u32 *remote_rcs; - - remote_rcs = req_capsule_server_sized_get(&req->rq_pill, &RMF_RCS, - sizeof(*remote_rcs) * - niocount); - if (!remote_rcs) { - CDEBUG(D_INFO, "Missing/short RC vector on BRW_WRITE reply\n"); - return -EPROTO; - } - - /* return error if any niobuf was in error */ - for (i = 0; i < niocount; i++) { - if ((int)remote_rcs[i] < 0) - return remote_rcs[i]; - - if (remote_rcs[i] != 0) { - CDEBUG(D_INFO, "rc[%d] invalid (%d) req %p\n", - i, 
remote_rcs[i], req); - return -EPROTO; - } - } - - if (req->rq_bulk->bd_nob_transferred != requested_nob) { - CERROR("Unexpected # bytes transferred: %d (requested %d)\n", - req->rq_bulk->bd_nob_transferred, requested_nob); - return -EPROTO; - } - - return 0; -} - -static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2) -{ - if (p1->flag != p2->flag) { - unsigned int mask = ~(OBD_BRW_FROM_GRANT | OBD_BRW_NOCACHE | - OBD_BRW_SYNC | OBD_BRW_ASYNC | - OBD_BRW_NOQUOTA | OBD_BRW_SOFT_SYNC); - - /* warn if we try to combine flags that we don't know to be - * safe to combine - */ - if (unlikely((p1->flag & mask) != (p2->flag & mask))) { - CWARN("Saw flags 0x%x and 0x%x in the same brw, please report this at http://bugs.whamcloud.com/\n", - p1->flag, p2->flag); - } - return 0; - } - - return (p1->off + p1->count == p2->off); -} - -static u32 osc_checksum_bulk(int nob, u32 pg_count, - struct brw_page **pga, int opc, - enum cksum_type cksum_type) -{ - __u32 cksum; - int i = 0; - struct ahash_request *hdesc; - unsigned int bufsize; - unsigned char cfs_alg = cksum_obd2cfs(cksum_type); - - LASSERT(pg_count > 0); - - hdesc = cfs_crypto_hash_init(cfs_alg, NULL, 0); - if (IS_ERR(hdesc)) { - CERROR("Unable to initialize checksum hash %s\n", - cfs_crypto_hash_name(cfs_alg)); - return PTR_ERR(hdesc); - } - - while (nob > 0 && pg_count > 0) { - unsigned int count = pga[i]->count > nob ? 
nob : pga[i]->count; - - /* corrupt the data before we compute the checksum, to - * simulate an OST->client data error - */ - if (i == 0 && opc == OST_READ && - OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE)) { - unsigned char *ptr = kmap(pga[i]->pg); - int off = pga[i]->off & ~PAGE_MASK; - - memcpy(ptr + off, "bad1", min_t(typeof(nob), 4, nob)); - kunmap(pga[i]->pg); - } - cfs_crypto_hash_update_page(hdesc, pga[i]->pg, - pga[i]->off & ~PAGE_MASK, - count); - CDEBUG(D_PAGE, - "page %p map %p index %lu flags %lx count %u priv %0lx: off %d\n", - pga[i]->pg, pga[i]->pg->mapping, pga[i]->pg->index, - (long)pga[i]->pg->flags, page_count(pga[i]->pg), - page_private(pga[i]->pg), - (int)(pga[i]->off & ~PAGE_MASK)); - - nob -= pga[i]->count; - pg_count--; - i++; - } - - bufsize = sizeof(cksum); - cfs_crypto_hash_final(hdesc, (unsigned char *)&cksum, &bufsize); - - /* For sending we only compute the wrong checksum instead - * of corrupting the data so it is still correct on a redo - */ - if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND)) - cksum++; - - return cksum; -} - -static int osc_brw_prep_request(int cmd, struct client_obd *cli, - struct obdo *oa, u32 page_count, - struct brw_page **pga, - struct ptlrpc_request **reqp, - int reserve, - int resend) -{ - struct ptlrpc_request *req; - struct ptlrpc_bulk_desc *desc; - struct ost_body *body; - struct obd_ioobj *ioobj; - struct niobuf_remote *niobuf; - int niocount, i, requested_nob, opc, rc; - struct osc_brw_async_args *aa; - struct req_capsule *pill; - struct brw_page *pg_prev; - - if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ)) - return -ENOMEM; /* Recoverable */ - if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ2)) - return -EINVAL; /* Fatal */ - - if ((cmd & OBD_BRW_WRITE) != 0) { - opc = OST_WRITE; - req = ptlrpc_request_alloc_pool(cli->cl_import, - osc_rq_pool, - &RQF_OST_BRW_WRITE); - } else { - opc = OST_READ; - req = ptlrpc_request_alloc(cli->cl_import, &RQF_OST_BRW_READ); - } - if (!req) - return 
-ENOMEM; - - for (niocount = i = 1; i < page_count; i++) { - if (!can_merge_pages(pga[i - 1], pga[i])) - niocount++; - } - - pill = &req->rq_pill; - req_capsule_set_size(pill, &RMF_OBD_IOOBJ, RCL_CLIENT, - sizeof(*ioobj)); - req_capsule_set_size(pill, &RMF_NIOBUF_REMOTE, RCL_CLIENT, - niocount * sizeof(*niobuf)); - - rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, opc); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ - ptlrpc_at_set_req_timeout(req); - /* ask ptlrpc not to resend on EINPROGRESS since BRWs have their own - * retry logic - */ - req->rq_no_retry_einprogress = 1; - - desc = ptlrpc_prep_bulk_imp(req, page_count, - cli->cl_import->imp_connect_data.ocd_brw_size >> LNET_MTU_BITS, - (opc == OST_WRITE ? PTLRPC_BULK_GET_SOURCE : - PTLRPC_BULK_PUT_SINK) | PTLRPC_BULK_BUF_KIOV, OST_BULK_PORTAL, - &ptlrpc_bulk_kiov_pin_ops); - - if (!desc) { - rc = -ENOMEM; - goto out; - } - /* NB request now owns desc and will free it when it gets freed */ - - body = req_capsule_client_get(pill, &RMF_OST_BODY); - ioobj = req_capsule_client_get(pill, &RMF_OBD_IOOBJ); - niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE); - LASSERT(body && ioobj && niobuf); - - lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); - - obdo_to_ioobj(oa, ioobj); - ioobj->ioo_bufcnt = niocount; - /* The high bits of ioo_max_brw tells server _maximum_ number of bulks - * that might be send for this request. The actual number is decided - * when the RPC is finally sent in ptlrpc_register_bulk(). It sends - * "max - 1" for old client compatibility sending "0", and also so the - * the actual maximum is a power-of-two number, not one less. 
LU-1431 - */ - ioobj_max_brw_set(ioobj, desc->bd_md_max_brw); - LASSERT(page_count > 0); - pg_prev = pga[0]; - for (requested_nob = i = 0; i < page_count; i++, niobuf++) { - struct brw_page *pg = pga[i]; - int poff = pg->off & ~PAGE_MASK; - - LASSERT(pg->count > 0); - /* make sure there is no gap in the middle of page array */ - LASSERTF(page_count == 1 || - (ergo(i == 0, poff + pg->count == PAGE_SIZE) && - ergo(i > 0 && i < page_count - 1, - poff == 0 && pg->count == PAGE_SIZE) && - ergo(i == page_count - 1, poff == 0)), - "i: %d/%d pg: %p off: %llu, count: %u\n", - i, page_count, pg, pg->off, pg->count); - LASSERTF(i == 0 || pg->off > pg_prev->off, - "i %d p_c %u pg %p [pri %lu ind %lu] off %llu prev_pg %p [pri %lu ind %lu] off %llu\n", - i, page_count, - pg->pg, page_private(pg->pg), pg->pg->index, pg->off, - pg_prev->pg, page_private(pg_prev->pg), - pg_prev->pg->index, pg_prev->off); - LASSERT((pga[0]->flag & OBD_BRW_SRVLOCK) == - (pg->flag & OBD_BRW_SRVLOCK)); - - desc->bd_frag_ops->add_kiov_frag(desc, pg->pg, poff, pg->count); - requested_nob += pg->count; - - if (i > 0 && can_merge_pages(pg_prev, pg)) { - niobuf--; - niobuf->rnb_len += pg->count; - } else { - niobuf->rnb_offset = pg->off; - niobuf->rnb_len = pg->count; - niobuf->rnb_flags = pg->flag; - } - pg_prev = pg; - } - - LASSERTF((void *)(niobuf - niocount) == - req_capsule_client_get(&req->rq_pill, &RMF_NIOBUF_REMOTE), - "want %p - real %p\n", req_capsule_client_get(&req->rq_pill, - &RMF_NIOBUF_REMOTE), (void *)(niobuf - niocount)); - - osc_announce_cached(cli, &body->oa, opc == OST_WRITE ? 
requested_nob:0); - if (resend) { - if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) { - body->oa.o_valid |= OBD_MD_FLFLAGS; - body->oa.o_flags = 0; - } - body->oa.o_flags |= OBD_FL_RECOV_RESEND; - } - - if (osc_should_shrink_grant(cli)) - osc_shrink_grant_local(cli, &body->oa); - - /* size[REQ_REC_OFF] still sizeof (*body) */ - if (opc == OST_WRITE) { - if (cli->cl_checksum && - !sptlrpc_flavor_has_bulk(&req->rq_flvr)) { - /* store cl_cksum_type in a local variable since - * it can be changed via lprocfs - */ - enum cksum_type cksum_type = cli->cl_cksum_type; - - if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) { - oa->o_flags &= OBD_FL_LOCAL_MASK; - body->oa.o_flags = 0; - } - body->oa.o_flags |= cksum_type_pack(cksum_type); - body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; - body->oa.o_cksum = osc_checksum_bulk(requested_nob, - page_count, pga, - OST_WRITE, - cksum_type); - CDEBUG(D_PAGE, "checksum at write origin: %x\n", - body->oa.o_cksum); - /* save this in 'oa', too, for later checking */ - oa->o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; - oa->o_flags |= cksum_type_pack(cksum_type); - } else { - /* clear out the checksum flag, in case this is a - * resend but cl_checksum is no longer set. 
b=11238 - */ - oa->o_valid &= ~OBD_MD_FLCKSUM; - } - oa->o_cksum = body->oa.o_cksum; - /* 1 RC per niobuf */ - req_capsule_set_size(pill, &RMF_RCS, RCL_SERVER, - sizeof(__u32) * niocount); - } else { - if (cli->cl_checksum && - !sptlrpc_flavor_has_bulk(&req->rq_flvr)) { - if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) - body->oa.o_flags = 0; - body->oa.o_flags |= cksum_type_pack(cli->cl_cksum_type); - body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; - } - } - ptlrpc_request_set_replen(req); - - BUILD_BUG_ON(sizeof(*aa) > sizeof(req->rq_async_args)); - aa = ptlrpc_req_async_args(req); - aa->aa_oa = oa; - aa->aa_requested_nob = requested_nob; - aa->aa_nio_count = niocount; - aa->aa_page_count = page_count; - aa->aa_resends = 0; - aa->aa_ppga = pga; - aa->aa_cli = cli; - INIT_LIST_HEAD(&aa->aa_oaps); - - *reqp = req; - niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE); - CDEBUG(D_RPCTRACE, "brw rpc %p - object " DOSTID " offset %lld<>%lld\n", - req, POSTID(&oa->o_oi), niobuf[0].rnb_offset, - niobuf[niocount - 1].rnb_offset + niobuf[niocount - 1].rnb_len); - - return 0; - - out: - ptlrpc_req_finished(req); - return rc; -} - -static int check_write_checksum(struct obdo *oa, - const struct lnet_process_id *peer, - __u32 client_cksum, __u32 server_cksum, int nob, - u32 page_count, struct brw_page **pga, - enum cksum_type client_cksum_type) -{ - __u32 new_cksum; - char *msg; - enum cksum_type cksum_type; - - if (server_cksum == client_cksum) { - CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum); - return 0; - } - - cksum_type = cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ? 
- oa->o_flags : 0); - new_cksum = osc_checksum_bulk(nob, page_count, pga, OST_WRITE, - cksum_type); - - if (cksum_type != client_cksum_type) - msg = "the server did not use the checksum type specified in the original request - likely a protocol problem" - ; - else if (new_cksum == server_cksum) - msg = "changed on the client after we checksummed it - likely false positive due to mmap IO (bug 11742)" - ; - else if (new_cksum == client_cksum) - msg = "changed in transit before arrival at OST"; - else - msg = "changed in transit AND doesn't match the original - likely false positive due to mmap IO (bug 11742)" - ; - - LCONSOLE_ERROR_MSG(0x132, "BAD WRITE CHECKSUM: %s: from %s inode " DFID " object " DOSTID " extent [%llu-%llu]\n", - msg, libcfs_nid2str(peer->nid), - oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0, - oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, - oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0, - POSTID(&oa->o_oi), pga[0]->off, - pga[page_count - 1]->off + - pga[page_count - 1]->count - 1); - CERROR("original client csum %x (type %x), server csum %x (type %x), client csum now %x\n", - client_cksum, client_cksum_type, - server_cksum, cksum_type, new_cksum); - return 1; -} - -/* Note rc enters this function as number of bytes transferred */ -static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) -{ - struct osc_brw_async_args *aa = (void *)&req->rq_async_args; - const struct lnet_process_id *peer = - &req->rq_import->imp_connection->c_peer; - struct client_obd *cli = aa->aa_cli; - struct ost_body *body; - __u32 client_cksum = 0; - - if (rc < 0 && rc != -EDQUOT) { - DEBUG_REQ(D_INFO, req, "Failed request with rc = %d\n", rc); - return rc; - } - - LASSERTF(req->rq_repmsg, "rc = %d\n", rc); - body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); - if (!body) { - DEBUG_REQ(D_INFO, req, "Can't unpack body\n"); - return -EPROTO; - } - - /* set/clear over quota flag for a uid/gid */ - if (lustre_msg_get_opc(req->rq_reqmsg) 
== OST_WRITE && - body->oa.o_valid & (OBD_MD_FLUSRQUOTA | OBD_MD_FLGRPQUOTA)) { - unsigned int qid[MAXQUOTAS] = { body->oa.o_uid, body->oa.o_gid }; - - CDEBUG(D_QUOTA, "setdq for [%u %u] with valid %#llx, flags %x\n", - body->oa.o_uid, body->oa.o_gid, body->oa.o_valid, - body->oa.o_flags); - osc_quota_setdq(cli, qid, body->oa.o_valid, body->oa.o_flags); - } - - osc_update_grant(cli, body); - - if (rc < 0) - return rc; - - if (aa->aa_oa->o_valid & OBD_MD_FLCKSUM) - client_cksum = aa->aa_oa->o_cksum; /* save for later */ - - if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) { - if (rc > 0) { - CERROR("Unexpected +ve rc %d\n", rc); - return -EPROTO; - } - LASSERT(req->rq_bulk->bd_nob == aa->aa_requested_nob); - - if (sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk)) - return -EAGAIN; - - if ((aa->aa_oa->o_valid & OBD_MD_FLCKSUM) && client_cksum && - check_write_checksum(&body->oa, peer, client_cksum, - body->oa.o_cksum, aa->aa_requested_nob, - aa->aa_page_count, aa->aa_ppga, - cksum_type_unpack(aa->aa_oa->o_flags))) - return -EAGAIN; - - rc = check_write_rcs(req, aa->aa_requested_nob, - aa->aa_nio_count, - aa->aa_page_count, aa->aa_ppga); - goto out; - } - - /* The rest of this function executes only for OST_READs */ - - /* if unwrap_bulk failed, return -EAGAIN to retry */ - rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, rc); - if (rc < 0) { - rc = -EAGAIN; - goto out; - } - - if (rc > aa->aa_requested_nob) { - CERROR("Unexpected rc %d (%d requested)\n", rc, - aa->aa_requested_nob); - return -EPROTO; - } - - if (rc != req->rq_bulk->bd_nob_transferred) { - CERROR("Unexpected rc %d (%d transferred)\n", - rc, req->rq_bulk->bd_nob_transferred); - return -EPROTO; - } - - if (rc < aa->aa_requested_nob) - handle_short_read(rc, aa->aa_page_count, aa->aa_ppga); - - if (body->oa.o_valid & OBD_MD_FLCKSUM) { - static int cksum_counter; - __u32 server_cksum = body->oa.o_cksum; - char *via = ""; - char *router = ""; - enum cksum_type cksum_type; - - cksum_type = 
cksum_type_unpack(body->oa.o_valid & - OBD_MD_FLFLAGS ? - body->oa.o_flags : 0); - client_cksum = osc_checksum_bulk(rc, aa->aa_page_count, - aa->aa_ppga, OST_READ, - cksum_type); - - if (peer->nid != req->rq_bulk->bd_sender) { - via = " via "; - router = libcfs_nid2str(req->rq_bulk->bd_sender); - } - - if (server_cksum != client_cksum) { - LCONSOLE_ERROR_MSG(0x133, "%s: BAD READ CHECKSUM: from %s%s%s inode " DFID " object " DOSTID " extent [%llu-%llu]\n", - req->rq_import->imp_obd->obd_name, - libcfs_nid2str(peer->nid), - via, router, - body->oa.o_valid & OBD_MD_FLFID ? - body->oa.o_parent_seq : (__u64)0, - body->oa.o_valid & OBD_MD_FLFID ? - body->oa.o_parent_oid : 0, - body->oa.o_valid & OBD_MD_FLFID ? - body->oa.o_parent_ver : 0, - POSTID(&body->oa.o_oi), - aa->aa_ppga[0]->off, - aa->aa_ppga[aa->aa_page_count-1]->off + - aa->aa_ppga[aa->aa_page_count-1]->count - - 1); - CERROR("client %x, server %x, cksum_type %x\n", - client_cksum, server_cksum, cksum_type); - cksum_counter = 0; - aa->aa_oa->o_cksum = client_cksum; - rc = -EAGAIN; - } else { - cksum_counter++; - CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum); - rc = 0; - } - } else if (unlikely(client_cksum)) { - static int cksum_missed; - - cksum_missed++; - if ((cksum_missed & (-cksum_missed)) == cksum_missed) - CERROR("Checksum %u requested from %s but not sent\n", - cksum_missed, libcfs_nid2str(peer->nid)); - } else { - rc = 0; - } -out: - if (rc >= 0) - lustre_get_wire_obdo(&req->rq_import->imp_connect_data, - aa->aa_oa, &body->oa); - - return rc; -} - -static int osc_brw_redo_request(struct ptlrpc_request *request, - struct osc_brw_async_args *aa, int rc) -{ - struct ptlrpc_request *new_req; - struct osc_brw_async_args *new_aa; - struct osc_async_page *oap; - - DEBUG_REQ(rc == -EINPROGRESS ? D_RPCTRACE : D_ERROR, request, - "redo for recoverable error %d", rc); - - rc = osc_brw_prep_request(lustre_msg_get_opc(request->rq_reqmsg) == - OST_WRITE ? 
OBD_BRW_WRITE : OBD_BRW_READ, - aa->aa_cli, aa->aa_oa, - aa->aa_page_count, aa->aa_ppga, - &new_req, 0, 1); - if (rc) - return rc; - - list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) { - if (oap->oap_request) { - LASSERTF(request == oap->oap_request, - "request %p != oap_request %p\n", - request, oap->oap_request); - if (oap->oap_interrupted) { - ptlrpc_req_finished(new_req); - return -EINTR; - } - } - } - /* New request takes over pga and oaps from old request. - * Note that copying a list_head doesn't work, need to move it... - */ - aa->aa_resends++; - new_req->rq_interpret_reply = request->rq_interpret_reply; - new_req->rq_async_args = request->rq_async_args; - new_req->rq_commit_cb = request->rq_commit_cb; - /* cap resend delay to the current request timeout, this is similar to - * what ptlrpc does (see after_reply()) - */ - if (aa->aa_resends > new_req->rq_timeout) - new_req->rq_sent = ktime_get_real_seconds() + new_req->rq_timeout; - else - new_req->rq_sent = ktime_get_real_seconds() + aa->aa_resends; - new_req->rq_generation_set = 1; - new_req->rq_import_generation = request->rq_import_generation; - - new_aa = ptlrpc_req_async_args(new_req); - - INIT_LIST_HEAD(&new_aa->aa_oaps); - list_splice_init(&aa->aa_oaps, &new_aa->aa_oaps); - INIT_LIST_HEAD(&new_aa->aa_exts); - list_splice_init(&aa->aa_exts, &new_aa->aa_exts); - new_aa->aa_resends = aa->aa_resends; - - list_for_each_entry(oap, &new_aa->aa_oaps, oap_rpc_item) { - if (oap->oap_request) { - ptlrpc_req_finished(oap->oap_request); - oap->oap_request = ptlrpc_request_addref(new_req); - } - } - - /* XXX: This code will run into problem if we're going to support - * to add a series of BRW RPCs into a self-defined ptlrpc_request_set - * and wait for all of them to be finished. We should inherit request - * set from old request. - */ - ptlrpcd_add_req(new_req); - - DEBUG_REQ(D_INFO, new_req, "new request"); - return 0; -} - -/* - * ugh, we want disk allocation on the target to happen in offset order. 
we'll - * follow sedgewicks advice and stick to the dead simple shellsort -- it'll do - * fine for our small page arrays and doesn't require allocation. its an - * insertion sort that swaps elements that are strides apart, shrinking the - * stride down until its '1' and the array is sorted. - */ -static void sort_brw_pages(struct brw_page **array, int num) -{ - int stride, i, j; - struct brw_page *tmp; - - if (num == 1) - return; - for (stride = 1; stride < num ; stride = (stride * 3) + 1) - ; - - do { - stride /= 3; - for (i = stride ; i < num ; i++) { - tmp = array[i]; - j = i; - while (j >= stride && array[j - stride]->off > tmp->off) { - array[j] = array[j - stride]; - j -= stride; - } - array[j] = tmp; - } - } while (stride > 1); -} - -static void osc_release_ppga(struct brw_page **ppga, u32 count) -{ - LASSERT(ppga); - kfree(ppga); -} - -static int brw_interpret(const struct lu_env *env, - struct ptlrpc_request *req, void *data, int rc) -{ - struct osc_brw_async_args *aa = data; - struct osc_extent *ext; - struct osc_extent *tmp; - struct client_obd *cli = aa->aa_cli; - - rc = osc_brw_fini_request(req, rc); - CDEBUG(D_INODE, "request %p aa %p rc %d\n", req, aa, rc); - /* When server return -EINPROGRESS, client should always retry - * regardless of the number of times the bulk was resent already. 
- */ - if (osc_recoverable_error(rc)) { - if (req->rq_import_generation != - req->rq_import->imp_generation) { - CDEBUG(D_HA, "%s: resend cross eviction for object: " DOSTID ", rc = %d.\n", - req->rq_import->imp_obd->obd_name, - POSTID(&aa->aa_oa->o_oi), rc); - } else if (rc == -EINPROGRESS || - client_should_resend(aa->aa_resends, aa->aa_cli)) { - rc = osc_brw_redo_request(req, aa, rc); - } else { - CERROR("%s: too many resent retries for object: %llu:%llu, rc = %d.\n", - req->rq_import->imp_obd->obd_name, - POSTID(&aa->aa_oa->o_oi), rc); - } - - if (rc == 0) - return 0; - else if (rc == -EAGAIN || rc == -EINPROGRESS) - rc = -EIO; - } - - if (rc == 0) { - struct obdo *oa = aa->aa_oa; - struct cl_attr *attr = &osc_env_info(env)->oti_attr; - unsigned long valid = 0; - struct cl_object *obj; - struct osc_async_page *last; - - last = brw_page2oap(aa->aa_ppga[aa->aa_page_count - 1]); - obj = osc2cl(last->oap_obj); - - cl_object_attr_lock(obj); - if (oa->o_valid & OBD_MD_FLBLOCKS) { - attr->cat_blocks = oa->o_blocks; - valid |= CAT_BLOCKS; - } - if (oa->o_valid & OBD_MD_FLMTIME) { - attr->cat_mtime = oa->o_mtime; - valid |= CAT_MTIME; - } - if (oa->o_valid & OBD_MD_FLATIME) { - attr->cat_atime = oa->o_atime; - valid |= CAT_ATIME; - } - if (oa->o_valid & OBD_MD_FLCTIME) { - attr->cat_ctime = oa->o_ctime; - valid |= CAT_CTIME; - } - - if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) { - struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; - loff_t last_off = last->oap_count + last->oap_obj_off + - last->oap_page_off; - - /* Change file size if this is an out of quota or - * direct IO write and it extends the file size - */ - if (loi->loi_lvb.lvb_size < last_off) { - attr->cat_size = last_off; - valid |= CAT_SIZE; - } - /* Extend KMS if it's not a lockless write */ - if (loi->loi_kms < last_off && - oap2osc_page(last)->ops_srvlock == 0) { - attr->cat_kms = last_off; - valid |= CAT_KMS; - } - } - - if (valid != 0) - cl_object_attr_update(env, obj, attr, valid); - 
cl_object_attr_unlock(obj); - } - kmem_cache_free(obdo_cachep, aa->aa_oa); - - if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE && rc == 0) - osc_inc_unstable_pages(req); - - list_for_each_entry_safe(ext, tmp, &aa->aa_exts, oe_link) { - list_del_init(&ext->oe_link); - osc_extent_finish(env, ext, 1, rc); - } - LASSERT(list_empty(&aa->aa_exts)); - LASSERT(list_empty(&aa->aa_oaps)); - - osc_release_ppga(aa->aa_ppga, aa->aa_page_count); - ptlrpc_lprocfs_brw(req, req->rq_bulk->bd_nob_transferred); - - spin_lock(&cli->cl_loi_list_lock); - /* We need to decrement before osc_ap_completion->osc_wake_cache_waiters - * is called so we know whether to go to sync BRWs or wait for more - * RPCs to complete - */ - if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) - cli->cl_w_in_flight--; - else - cli->cl_r_in_flight--; - osc_wake_cache_waiters(cli); - spin_unlock(&cli->cl_loi_list_lock); - - osc_io_unplug(env, cli, NULL); - return rc; -} - -static void brw_commit(struct ptlrpc_request *req) -{ - /* - * If osc_inc_unstable_pages (via osc_extent_finish) races with - * this called via the rq_commit_cb, I need to ensure - * osc_dec_unstable_pages is still called. Otherwise unstable - * pages may be leaked. - */ - spin_lock(&req->rq_lock); - if (unlikely(req->rq_unstable)) { - req->rq_unstable = 0; - spin_unlock(&req->rq_lock); - osc_dec_unstable_pages(req); - } else { - req->rq_committed = 1; - spin_unlock(&req->rq_lock); - } -} - -/** - * Build an RPC by the list of extent @ext_list. The caller must ensure - * that the total pages in this list are NOT over max pages per RPC. - * Extents in the list must be in OES_RPC state. 
- */ -int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, - struct list_head *ext_list, int cmd) -{ - struct ptlrpc_request *req = NULL; - struct osc_extent *ext; - struct brw_page **pga = NULL; - struct osc_brw_async_args *aa = NULL; - struct obdo *oa = NULL; - struct osc_async_page *oap; - struct osc_object *obj = NULL; - struct cl_req_attr *crattr = NULL; - u64 starting_offset = OBD_OBJECT_EOF; - u64 ending_offset = 0; - int mpflag = 0; - int mem_tight = 0; - int page_count = 0; - bool soft_sync = false; - bool interrupted = false; - int i; - int rc; - struct ost_body *body; - LIST_HEAD(rpc_list); - - LASSERT(!list_empty(ext_list)); - - /* add pages into rpc_list to build BRW rpc */ - list_for_each_entry(ext, ext_list, oe_link) { - LASSERT(ext->oe_state == OES_RPC); - mem_tight |= ext->oe_memalloc; - page_count += ext->oe_nr_pages; - if (!obj) - obj = ext->oe_obj; - } - - soft_sync = osc_over_unstable_soft_limit(cli); - if (mem_tight) - mpflag = cfs_memory_pressure_get_and_set(); - - pga = kcalloc(page_count, sizeof(*pga), GFP_NOFS); - if (!pga) { - rc = -ENOMEM; - goto out; - } - - oa = kmem_cache_zalloc(obdo_cachep, GFP_NOFS); - if (!oa) { - rc = -ENOMEM; - goto out; - } - - i = 0; - list_for_each_entry(ext, ext_list, oe_link) { - list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { - if (mem_tight) - oap->oap_brw_flags |= OBD_BRW_MEMALLOC; - if (soft_sync) - oap->oap_brw_flags |= OBD_BRW_SOFT_SYNC; - pga[i] = &oap->oap_brw_page; - pga[i]->off = oap->oap_obj_off + oap->oap_page_off; - i++; - - list_add_tail(&oap->oap_rpc_item, &rpc_list); - if (starting_offset == OBD_OBJECT_EOF || - starting_offset > oap->oap_obj_off) - starting_offset = oap->oap_obj_off; - else - LASSERT(!oap->oap_page_off); - if (ending_offset < oap->oap_obj_off + oap->oap_count) - ending_offset = oap->oap_obj_off + - oap->oap_count; - else - LASSERT(oap->oap_page_off + oap->oap_count == - PAGE_SIZE); - if (oap->oap_interrupted) - interrupted = true; - } - } - - /* 
first page in the list */ - oap = list_entry(rpc_list.next, typeof(*oap), oap_rpc_item); - - crattr = &osc_env_info(env)->oti_req_attr; - memset(crattr, 0, sizeof(*crattr)); - crattr->cra_type = (cmd & OBD_BRW_WRITE) ? CRT_WRITE : CRT_READ; - crattr->cra_flags = ~0ULL; - crattr->cra_page = oap2cl_page(oap); - crattr->cra_oa = oa; - cl_req_attr_set(env, osc2cl(obj), crattr); - - sort_brw_pages(pga, page_count); - rc = osc_brw_prep_request(cmd, cli, oa, page_count, pga, &req, 1, 0); - if (rc != 0) { - CERROR("prep_req failed: %d\n", rc); - goto out; - } - - req->rq_commit_cb = brw_commit; - req->rq_interpret_reply = brw_interpret; - - req->rq_memalloc = mem_tight != 0; - oap->oap_request = ptlrpc_request_addref(req); - if (interrupted && !req->rq_intr) - ptlrpc_mark_interrupted(req); - - /* Need to update the timestamps after the request is built in case - * we race with setattr (locally or in queue at OST). If OST gets - * later setattr before earlier BRW (as determined by the request xid), - * the OST will not use BRW timestamps. Sadly, there is no obvious - * way to do this in a single call. 
bug 10150 - */ - body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); - crattr->cra_oa = &body->oa; - crattr->cra_flags = OBD_MD_FLMTIME | OBD_MD_FLCTIME | OBD_MD_FLATIME; - cl_req_attr_set(env, osc2cl(obj), crattr); - lustre_msg_set_jobid(req->rq_reqmsg, crattr->cra_jobid); - - BUILD_BUG_ON(sizeof(*aa) > sizeof(req->rq_async_args)); - aa = ptlrpc_req_async_args(req); - INIT_LIST_HEAD(&aa->aa_oaps); - list_splice_init(&rpc_list, &aa->aa_oaps); - INIT_LIST_HEAD(&aa->aa_exts); - list_splice_init(ext_list, &aa->aa_exts); - - spin_lock(&cli->cl_loi_list_lock); - starting_offset >>= PAGE_SHIFT; - if (cmd == OBD_BRW_READ) { - cli->cl_r_in_flight++; - lprocfs_oh_tally_log2(&cli->cl_read_page_hist, page_count); - lprocfs_oh_tally(&cli->cl_read_rpc_hist, cli->cl_r_in_flight); - lprocfs_oh_tally_log2(&cli->cl_read_offset_hist, - starting_offset + 1); - } else { - cli->cl_w_in_flight++; - lprocfs_oh_tally_log2(&cli->cl_write_page_hist, page_count); - lprocfs_oh_tally(&cli->cl_write_rpc_hist, cli->cl_w_in_flight); - lprocfs_oh_tally_log2(&cli->cl_write_offset_hist, - starting_offset + 1); - } - spin_unlock(&cli->cl_loi_list_lock); - - DEBUG_REQ(D_INODE, req, "%d pages, aa %p. 
now %ur/%dw in flight", - page_count, aa, cli->cl_r_in_flight, - cli->cl_w_in_flight); - OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_IO, cfs_fail_val); - - ptlrpcd_add_req(req); - rc = 0; - -out: - if (mem_tight != 0) - cfs_memory_pressure_restore(mpflag); - - if (rc != 0) { - LASSERT(!req); - - if (oa) - kmem_cache_free(obdo_cachep, oa); - kfree(pga); - /* this should happen rarely and is pretty bad, it makes the - * pending list not follow the dirty order - */ - while (!list_empty(ext_list)) { - ext = list_entry(ext_list->next, struct osc_extent, - oe_link); - list_del_init(&ext->oe_link); - osc_extent_finish(env, ext, 0, rc); - } - } - return rc; -} - -static int osc_set_lock_data(struct ldlm_lock *lock, void *data) -{ - int set = 0; - - LASSERT(lock); - - lock_res_and_lock(lock); - - if (!lock->l_ast_data) - lock->l_ast_data = data; - if (lock->l_ast_data == data) - set = 1; - - unlock_res_and_lock(lock); - - return set; -} - -static int osc_enqueue_fini(struct ptlrpc_request *req, - osc_enqueue_upcall_f upcall, void *cookie, - struct lustre_handle *lockh, enum ldlm_mode mode, - __u64 *flags, int agl, int errcode) -{ - bool intent = *flags & LDLM_FL_HAS_INTENT; - int rc; - - /* The request was created before ldlm_cli_enqueue call. */ - if (intent && errcode == ELDLM_LOCK_ABORTED) { - struct ldlm_reply *rep; - - rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); - - rep->lock_policy_res1 = - ptlrpc_status_ntoh(rep->lock_policy_res1); - if (rep->lock_policy_res1) - errcode = rep->lock_policy_res1; - if (!agl) - *flags |= LDLM_FL_LVB_READY; - } else if (errcode == ELDLM_OK) { - *flags |= LDLM_FL_LVB_READY; - } - - /* Call the update callback. 
*/ - rc = (*upcall)(cookie, lockh, errcode); - /* release the reference taken in ldlm_cli_enqueue() */ - if (errcode == ELDLM_LOCK_MATCHED) - errcode = ELDLM_OK; - if (errcode == ELDLM_OK && lustre_handle_is_used(lockh)) - ldlm_lock_decref(lockh, mode); - - return rc; -} - -static int osc_enqueue_interpret(const struct lu_env *env, - struct ptlrpc_request *req, - struct osc_enqueue_args *aa, int rc) -{ - struct ldlm_lock *lock; - struct lustre_handle *lockh = &aa->oa_lockh; - enum ldlm_mode mode = aa->oa_mode; - struct ost_lvb *lvb = aa->oa_lvb; - __u32 lvb_len = sizeof(*lvb); - __u64 flags = 0; - - - /* ldlm_cli_enqueue is holding a reference on the lock, so it must - * be valid. - */ - lock = ldlm_handle2lock(lockh); - LASSERTF(lock, "lockh %llx, req %p, aa %p - client evicted?\n", - lockh->cookie, req, aa); - - /* Take an additional reference so that a blocking AST that - * ldlm_cli_enqueue_fini() might post for a failed lock, is guaranteed - * to arrive after an upcall has been executed by - * osc_enqueue_fini(). - */ - ldlm_lock_addref(lockh, mode); - - /* Let cl_lock_state_wait fail with -ERESTARTSYS to unuse sublocks. */ - OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_ENQUEUE_HANG, 2); - - /* Let CP AST to grant the lock first. */ - OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 1); - - if (aa->oa_agl) { - LASSERT(!aa->oa_lvb); - LASSERT(!aa->oa_flags); - aa->oa_flags = &flags; - } - - /* Complete obtaining the lock procedure. */ - rc = ldlm_cli_enqueue_fini(aa->oa_exp, req, aa->oa_type, 1, - aa->oa_mode, aa->oa_flags, lvb, lvb_len, - lockh, rc); - /* Complete osc stuff. 
*/ - rc = osc_enqueue_fini(req, aa->oa_upcall, aa->oa_cookie, lockh, mode, - aa->oa_flags, aa->oa_agl, rc); - - OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10); - - ldlm_lock_decref(lockh, mode); - LDLM_LOCK_PUT(lock); - return rc; -} - -struct ptlrpc_request_set *PTLRPCD_SET = (void *)1; - -/* When enqueuing asynchronously, locks are not ordered, we can obtain a lock - * from the 2nd OSC before a lock from the 1st one. This does not deadlock with - * other synchronous requests, however keeping some locks and trying to obtain - * others may take a considerable amount of time in a case of ost failure; and - * when other sync requests do not get released lock from a client, the client - * is evicted from the cluster -- such scenaries make the life difficult, so - * release locks just after they are obtained. - */ -int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, - __u64 *flags, union ldlm_policy_data *policy, - struct ost_lvb *lvb, int kms_valid, - osc_enqueue_upcall_f upcall, void *cookie, - struct ldlm_enqueue_info *einfo, - struct ptlrpc_request_set *rqset, int async, int agl) -{ - struct obd_device *obd = exp->exp_obd; - struct lustre_handle lockh = { 0 }; - struct ptlrpc_request *req = NULL; - int intent = *flags & LDLM_FL_HAS_INTENT; - __u64 match_flags = *flags; - enum ldlm_mode mode; - int rc; - - /* Filesystem lock extents are extended to page boundaries so that - * dealing with the page cache is a little smoother. - */ - policy->l_extent.start -= policy->l_extent.start & ~PAGE_MASK; - policy->l_extent.end |= ~PAGE_MASK; - - /* - * kms is not valid when either object is completely fresh (so that no - * locks are cached), or object was evicted. In the latter case cached - * lock cannot be used, because it would prime inode state with - * potentially stale LVB. 
- */ - if (!kms_valid) - goto no_match; - - /* Next, search for already existing extent locks that will cover us */ - /* If we're trying to read, we also search for an existing PW lock. The - * VFS and page cache already protect us locally, so lots of readers/ - * writers can share a single PW lock. - * - * There are problems with conversion deadlocks, so instead of - * converting a read lock to a write lock, we'll just enqueue a new - * one. - * - * At some point we should cancel the read lock instead of making them - * send us a blocking callback, but there are problems with canceling - * locks out from other users right now, too. - */ - mode = einfo->ei_mode; - if (einfo->ei_mode == LCK_PR) - mode |= LCK_PW; - if (agl == 0) - match_flags |= LDLM_FL_LVB_READY; - if (intent != 0) - match_flags |= LDLM_FL_BLOCK_GRANTED; - mode = ldlm_lock_match(obd->obd_namespace, match_flags, res_id, - einfo->ei_type, policy, mode, &lockh, 0); - if (mode) { - struct ldlm_lock *matched; - - if (*flags & LDLM_FL_TEST_LOCK) - return ELDLM_OK; - - matched = ldlm_handle2lock(&lockh); - if (agl) { - /* AGL enqueues DLM locks speculatively. Therefore if - * it already exists a DLM lock, it wll just inform the - * caller to cancel the AGL process for this stripe. - */ - ldlm_lock_decref(&lockh, mode); - LDLM_LOCK_PUT(matched); - return -ECANCELED; - } else if (osc_set_lock_data(matched, einfo->ei_cbdata)) { - *flags |= LDLM_FL_LVB_READY; - /* We already have a lock, and it's referenced. 
*/ - (*upcall)(cookie, &lockh, ELDLM_LOCK_MATCHED); - - ldlm_lock_decref(&lockh, mode); - LDLM_LOCK_PUT(matched); - return ELDLM_OK; - } else { - ldlm_lock_decref(&lockh, mode); - LDLM_LOCK_PUT(matched); - } - } - -no_match: - if (*flags & (LDLM_FL_TEST_LOCK | LDLM_FL_MATCH_LOCK)) - return -ENOLCK; - if (intent) { - req = ptlrpc_request_alloc(class_exp2cliimp(exp), - &RQF_LDLM_ENQUEUE_LVB); - if (!req) - return -ENOMEM; - - rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - - req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, - sizeof(*lvb)); - ptlrpc_request_set_replen(req); - } - - /* users of osc_enqueue() can pass this flag for ldlm_lock_match() */ - *flags &= ~LDLM_FL_BLOCK_GRANTED; - - rc = ldlm_cli_enqueue(exp, &req, einfo, res_id, policy, flags, lvb, - sizeof(*lvb), LVB_T_OST, &lockh, async); - if (async) { - if (!rc) { - struct osc_enqueue_args *aa; - - BUILD_BUG_ON(sizeof(*aa) > sizeof(req->rq_async_args)); - aa = ptlrpc_req_async_args(req); - aa->oa_exp = exp; - aa->oa_mode = einfo->ei_mode; - aa->oa_type = einfo->ei_type; - lustre_handle_copy(&aa->oa_lockh, &lockh); - aa->oa_upcall = upcall; - aa->oa_cookie = cookie; - aa->oa_agl = !!agl; - if (!agl) { - aa->oa_flags = flags; - aa->oa_lvb = lvb; - } else { - /* AGL is essentially to enqueue an DLM lock - * in advance, so we don't care about the - * result of AGL enqueue. 
- */ - aa->oa_lvb = NULL; - aa->oa_flags = NULL; - } - - req->rq_interpret_reply = - (ptlrpc_interpterer_t)osc_enqueue_interpret; - if (rqset == PTLRPCD_SET) - ptlrpcd_add_req(req); - else - ptlrpc_set_add_req(rqset, req); - } else if (intent) { - ptlrpc_req_finished(req); - } - return rc; - } - - rc = osc_enqueue_fini(req, upcall, cookie, &lockh, einfo->ei_mode, - flags, agl, rc); - if (intent) - ptlrpc_req_finished(req); - - return rc; -} - -int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id, - enum ldlm_type type, union ldlm_policy_data *policy, - enum ldlm_mode mode, __u64 *flags, void *data, - struct lustre_handle *lockh, int unref) -{ - struct obd_device *obd = exp->exp_obd; - __u64 lflags = *flags; - enum ldlm_mode rc; - - if (OBD_FAIL_CHECK(OBD_FAIL_OSC_MATCH)) - return -EIO; - - /* Filesystem lock extents are extended to page boundaries so that - * dealing with the page cache is a little smoother - */ - policy->l_extent.start -= policy->l_extent.start & ~PAGE_MASK; - policy->l_extent.end |= ~PAGE_MASK; - - /* Next, search for already existing extent locks that will cover us */ - /* If we're trying to read, we also search for an existing PW lock. The - * VFS and page cache already protect us locally, so lots of readers/ - * writers can share a single PW lock. - */ - rc = mode; - if (mode == LCK_PR) - rc |= LCK_PW; - rc = ldlm_lock_match(obd->obd_namespace, lflags, - res_id, type, policy, rc, lockh, unref); - if (!rc || lflags & LDLM_FL_TEST_LOCK) - return rc; - - if (data) { - struct ldlm_lock *lock = ldlm_handle2lock(lockh); - - LASSERT(lock); - if (!osc_set_lock_data(lock, data)) { - ldlm_lock_decref(lockh, rc); - rc = 0; - } - LDLM_LOCK_PUT(lock); - } - return rc; -} - -static int osc_statfs_interpret(const struct lu_env *env, - struct ptlrpc_request *req, - struct osc_async_args *aa, int rc) -{ - struct obd_statfs *msfs; - - if (rc == -EBADR) - /* The request has in fact never been sent - * due to issues at a higher level (LOV). 
- * Exit immediately since the caller is - * aware of the problem and takes care - * of the clean up - */ - return rc; - - if ((rc == -ENOTCONN || rc == -EAGAIN) && - (aa->aa_oi->oi_flags & OBD_STATFS_NODELAY)) { - rc = 0; - goto out; - } - - if (rc != 0) - goto out; - - msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS); - if (!msfs) { - rc = -EPROTO; - goto out; - } - - *aa->aa_oi->oi_osfs = *msfs; -out: - rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc); - return rc; -} - -static int osc_statfs_async(struct obd_export *exp, - struct obd_info *oinfo, __u64 max_age, - struct ptlrpc_request_set *rqset) -{ - struct obd_device *obd = class_exp2obd(exp); - struct ptlrpc_request *req; - struct osc_async_args *aa; - int rc; - - /* We could possibly pass max_age in the request (as an absolute - * timestamp or a "seconds.usec ago") so the target can avoid doing - * extra calls into the filesystem if that isn't necessary (e.g. - * during mount that would help a bit). Having relative timestamps - * is not so great if request processing is slow, while absolute - * timestamps are not ideal because they need time synchronization. 
- */ - req = ptlrpc_request_alloc(obd->u.cli.cl_import, &RQF_OST_STATFS); - if (!req) - return -ENOMEM; - - rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - ptlrpc_request_set_replen(req); - req->rq_request_portal = OST_CREATE_PORTAL; - ptlrpc_at_set_req_timeout(req); - - if (oinfo->oi_flags & OBD_STATFS_NODELAY) { - /* procfs requests not want stat in wait for avoid deadlock */ - req->rq_no_resend = 1; - req->rq_no_delay = 1; - } - - req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_statfs_interpret; - BUILD_BUG_ON(sizeof(*aa) > sizeof(req->rq_async_args)); - aa = ptlrpc_req_async_args(req); - aa->aa_oi = oinfo; - - ptlrpc_set_add_req(rqset, req); - return 0; -} - -static int osc_statfs(const struct lu_env *env, struct obd_export *exp, - struct obd_statfs *osfs, __u64 max_age, __u32 flags) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_statfs *msfs; - struct ptlrpc_request *req; - struct obd_import *imp = NULL; - int rc; - - /* Since the request might also come from lprocfs, so we need - * sync this with client_disconnect_export Bug15684 - */ - down_read(&obd->u.cli.cl_sem); - if (obd->u.cli.cl_import) - imp = class_import_get(obd->u.cli.cl_import); - up_read(&obd->u.cli.cl_sem); - if (!imp) - return -ENODEV; - - /* We could possibly pass max_age in the request (as an absolute - * timestamp or a "seconds.usec ago") so the target can avoid doing - * extra calls into the filesystem if that isn't necessary (e.g. - * during mount that would help a bit). Having relative timestamps - * is not so great if request processing is slow, while absolute - * timestamps are not ideal because they need time synchronization. 
- */ - req = ptlrpc_request_alloc(imp, &RQF_OST_STATFS); - - class_import_put(imp); - - if (!req) - return -ENOMEM; - - rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - ptlrpc_request_set_replen(req); - req->rq_request_portal = OST_CREATE_PORTAL; - ptlrpc_at_set_req_timeout(req); - - if (flags & OBD_STATFS_NODELAY) { - /* procfs requests not want stat in wait for avoid deadlock */ - req->rq_no_resend = 1; - req->rq_no_delay = 1; - } - - rc = ptlrpc_queue_wait(req); - if (rc) - goto out; - - msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS); - if (!msfs) { - rc = -EPROTO; - goto out; - } - - *osfs = *msfs; - - out: - ptlrpc_req_finished(req); - return rc; -} - -static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len, - void *karg, void __user *uarg) -{ - struct obd_device *obd = exp->exp_obd; - struct obd_ioctl_data *data = karg; - int err = 0; - - if (!try_module_get(THIS_MODULE)) { - CERROR("%s: cannot get module '%s'\n", obd->obd_name, - module_name(THIS_MODULE)); - return -EINVAL; - } - switch (cmd) { - case OBD_IOC_CLIENT_RECOVER: - err = ptlrpc_recover_import(obd->u.cli.cl_import, - data->ioc_inlbuf1, 0); - if (err > 0) - err = 0; - goto out; - case IOC_OSC_SET_ACTIVE: - err = ptlrpc_set_import_active(obd->u.cli.cl_import, - data->ioc_offset); - goto out; - case OBD_IOC_PING_TARGET: - err = ptlrpc_obd_ping(obd); - goto out; - default: - CDEBUG(D_INODE, "unrecognised ioctl %#x by %s\n", - cmd, current_comm()); - err = -ENOTTY; - goto out; - } -out: - module_put(THIS_MODULE); - return err; -} - -static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, - u32 keylen, void *key, u32 vallen, - void *val, struct ptlrpc_request_set *set) -{ - struct ptlrpc_request *req; - struct obd_device *obd = exp->exp_obd; - struct obd_import *imp = class_exp2cliimp(exp); - char *tmp; - int rc; - - OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_SHUTDOWN, 10); - - if 
(KEY_IS(KEY_CHECKSUM)) { - if (vallen != sizeof(int)) - return -EINVAL; - exp->exp_obd->u.cli.cl_checksum = (*(int *)val) ? 1 : 0; - return 0; - } - - if (KEY_IS(KEY_SPTLRPC_CONF)) { - sptlrpc_conf_client_adapt(obd); - return 0; - } - - if (KEY_IS(KEY_FLUSH_CTX)) { - sptlrpc_import_flush_my_ctx(imp); - return 0; - } - - if (KEY_IS(KEY_CACHE_SET)) { - struct client_obd *cli = &obd->u.cli; - - LASSERT(!cli->cl_cache); /* only once */ - cli->cl_cache = val; - cl_cache_incref(cli->cl_cache); - cli->cl_lru_left = &cli->cl_cache->ccc_lru_left; - - /* add this osc into entity list */ - LASSERT(list_empty(&cli->cl_lru_osc)); - spin_lock(&cli->cl_cache->ccc_lru_lock); - list_add(&cli->cl_lru_osc, &cli->cl_cache->ccc_lru); - spin_unlock(&cli->cl_cache->ccc_lru_lock); - - return 0; - } - - if (KEY_IS(KEY_CACHE_LRU_SHRINK)) { - struct client_obd *cli = &obd->u.cli; - long nr = atomic_long_read(&cli->cl_lru_in_list) >> 1; - long target = *(long *)val; - - nr = osc_lru_shrink(env, cli, min(nr, target), true); - *(long *)val -= nr; - return 0; - } - - if (!set && !KEY_IS(KEY_GRANT_SHRINK)) - return -EINVAL; - - /* We pass all other commands directly to OST. Since nobody calls osc - * methods directly and everybody is supposed to go through LOV, we - * assume lov checked invalid values for us. - * The only recognised values so far are evict_by_nid and mds_conn. - * Even if something bad goes through, we'd get a -EINVAL from OST - * anyway. - */ - - req = ptlrpc_request_alloc(imp, KEY_IS(KEY_GRANT_SHRINK) ? 
- &RQF_OST_SET_GRANT_INFO : - &RQF_OBD_SET_INFO); - if (!req) - return -ENOMEM; - - req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY, - RCL_CLIENT, keylen); - if (!KEY_IS(KEY_GRANT_SHRINK)) - req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL, - RCL_CLIENT, vallen); - rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SET_INFO); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - - tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY); - memcpy(tmp, key, keylen); - tmp = req_capsule_client_get(&req->rq_pill, KEY_IS(KEY_GRANT_SHRINK) ? - &RMF_OST_BODY : - &RMF_SETINFO_VAL); - memcpy(tmp, val, vallen); - - if (KEY_IS(KEY_GRANT_SHRINK)) { - struct osc_brw_async_args *aa; - struct obdo *oa; - - BUILD_BUG_ON(sizeof(*aa) > sizeof(req->rq_async_args)); - aa = ptlrpc_req_async_args(req); - oa = kmem_cache_zalloc(obdo_cachep, GFP_NOFS); - if (!oa) { - ptlrpc_req_finished(req); - return -ENOMEM; - } - *oa = ((struct ost_body *)val)->oa; - aa->aa_oa = oa; - req->rq_interpret_reply = osc_shrink_grant_interpret; - } - - ptlrpc_request_set_replen(req); - if (!KEY_IS(KEY_GRANT_SHRINK)) { - LASSERT(set); - ptlrpc_set_add_req(set, req); - ptlrpc_check_set(NULL, set); - } else { - ptlrpcd_add_req(req); - } - - return 0; -} - -static int osc_reconnect(const struct lu_env *env, - struct obd_export *exp, struct obd_device *obd, - struct obd_uuid *cluuid, - struct obd_connect_data *data, - void *localdata) -{ - struct client_obd *cli = &obd->u.cli; - - if (data && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) { - long lost_grant; - - spin_lock(&cli->cl_loi_list_lock); - data->ocd_grant = (cli->cl_avail_grant + - (cli->cl_dirty_pages << PAGE_SHIFT)) ?: - 2 * cli_brw_size(obd); - lost_grant = cli->cl_lost_grant; - cli->cl_lost_grant = 0; - spin_unlock(&cli->cl_loi_list_lock); - - CDEBUG(D_RPCTRACE, "ocd_connect_flags: %#llx ocd_version: %d ocd_grant: %d, lost: %ld.\n", - data->ocd_connect_flags, - data->ocd_version, data->ocd_grant, lost_grant); - } - - return 0; 
-} - -static int osc_disconnect(struct obd_export *exp) -{ - struct obd_device *obd = class_exp2obd(exp); - int rc; - - rc = client_disconnect_export(exp); - /** - * Initially we put del_shrink_grant before disconnect_export, but it - * causes the following problem if setup (connect) and cleanup - * (disconnect) are tangled together. - * connect p1 disconnect p2 - * ptlrpc_connect_import - * ............... class_manual_cleanup - * osc_disconnect - * del_shrink_grant - * ptlrpc_connect_interrupt - * init_grant_shrink - * add this client to shrink list - * cleanup_osc - * Bang! pinger trigger the shrink. - * So the osc should be disconnected from the shrink list, after we - * are sure the import has been destroyed. BUG18662 - */ - if (!obd->u.cli.cl_import) - osc_del_shrink_grant(&obd->u.cli); - return rc; -} - -static int osc_ldlm_resource_invalidate(struct cfs_hash *hs, - struct cfs_hash_bd *bd, - struct hlist_node *hnode, void *arg) -{ - struct ldlm_resource *res = cfs_hash_object(hs, hnode); - struct osc_object *osc = NULL; - struct lu_env *env = arg; - struct ldlm_lock *lock; - - lock_res(res); - list_for_each_entry(lock, &res->lr_granted, l_res_link) { - if (lock->l_ast_data && !osc) { - osc = lock->l_ast_data; - cl_object_get(osc2cl(osc)); - } - - /* - * clear LDLM_FL_CLEANED flag to make sure it will be canceled - * by the 2nd round of ldlm_namespace_clean() call in - * osc_import_event(). 
- */ - ldlm_clear_cleaned(lock); - } - unlock_res(res); - - if (osc) { - osc_object_invalidate(env, osc); - cl_object_put(env, osc2cl(osc)); - } - - return 0; -} - -static int osc_import_event(struct obd_device *obd, - struct obd_import *imp, - enum obd_import_event event) -{ - struct client_obd *cli; - int rc = 0; - - LASSERT(imp->imp_obd == obd); - - switch (event) { - case IMP_EVENT_DISCON: { - cli = &obd->u.cli; - spin_lock(&cli->cl_loi_list_lock); - cli->cl_avail_grant = 0; - cli->cl_lost_grant = 0; - spin_unlock(&cli->cl_loi_list_lock); - break; - } - case IMP_EVENT_INACTIVE: { - rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE, NULL); - break; - } - case IMP_EVENT_INVALIDATE: { - struct ldlm_namespace *ns = obd->obd_namespace; - struct lu_env *env; - u16 refcheck; - - ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); - - env = cl_env_get(&refcheck); - if (!IS_ERR(env)) { - osc_io_unplug(env, &obd->u.cli, NULL); - - cfs_hash_for_each_nolock(ns->ns_rs_hash, - osc_ldlm_resource_invalidate, - env, 0); - cl_env_put(env, &refcheck); - - ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); - } else { - rc = PTR_ERR(env); - } - break; - } - case IMP_EVENT_ACTIVE: { - rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE, NULL); - break; - } - case IMP_EVENT_OCD: { - struct obd_connect_data *ocd = &imp->imp_connect_data; - - if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT) - osc_init_grant(&obd->u.cli, ocd); - - /* See bug 7198 */ - if (ocd->ocd_connect_flags & OBD_CONNECT_REQPORTAL) - imp->imp_client->cli_request_portal = OST_REQUEST_PORTAL; - - rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD, NULL); - break; - } - case IMP_EVENT_DEACTIVATE: { - rc = obd_notify_observer(obd, obd, OBD_NOTIFY_DEACTIVATE, NULL); - break; - } - case IMP_EVENT_ACTIVATE: { - rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVATE, NULL); - break; - } - default: - CERROR("Unknown import event %d\n", event); - LBUG(); - } - return rc; -} - -/** - * Determine whether the lock can be canceled 
before replaying the lock - * during recovery, see bug16774 for detailed information. - * - * \retval zero the lock can't be canceled - * \retval other ok to cancel - */ -static int osc_cancel_weight(struct ldlm_lock *lock) -{ - /* - * Cancel all unused and granted extent lock. - */ - if (lock->l_resource->lr_type == LDLM_EXTENT && - lock->l_granted_mode == lock->l_req_mode && - osc_ldlm_weigh_ast(lock) == 0) - return 1; - - return 0; -} - -static int brw_queue_work(const struct lu_env *env, void *data) -{ - struct client_obd *cli = data; - - CDEBUG(D_CACHE, "Run writeback work for client obd %p.\n", cli); - - osc_io_unplug(env, cli, NULL); - return 0; -} - -int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) -{ - struct lprocfs_static_vars lvars = { NULL }; - struct client_obd *cli = &obd->u.cli; - void *handler; - int rc; - int adding; - int added; - int req_count; - - rc = ptlrpcd_addref(); - if (rc) - return rc; - - rc = client_obd_setup(obd, lcfg); - if (rc) - goto out_ptlrpcd; - - handler = ptlrpcd_alloc_work(cli->cl_import, brw_queue_work, cli); - if (IS_ERR(handler)) { - rc = PTR_ERR(handler); - goto out_client_setup; - } - cli->cl_writeback_work = handler; - - handler = ptlrpcd_alloc_work(cli->cl_import, lru_queue_work, cli); - if (IS_ERR(handler)) { - rc = PTR_ERR(handler); - goto out_ptlrpcd_work; - } - - cli->cl_lru_work = handler; - - rc = osc_quota_setup(obd); - if (rc) - goto out_ptlrpcd_work; - - cli->cl_grant_shrink_interval = GRANT_SHRINK_INTERVAL; - lprocfs_osc_init_vars(&lvars); - if (lprocfs_obd_setup(obd, lvars.obd_vars, lvars.sysfs_vars) == 0) { - lproc_osc_attach_seqstat(obd); - sptlrpc_lprocfs_cliobd_attach(obd); - ptlrpc_lprocfs_register_obd(obd); - } - - /* - * We try to control the total number of requests with a upper limit - * osc_reqpool_maxreqcount. There might be some race which will cause - * over-limit allocation, but it is fine. 
- */ - req_count = atomic_read(&osc_pool_req_count); - if (req_count < osc_reqpool_maxreqcount) { - adding = cli->cl_max_rpcs_in_flight + 2; - if (req_count + adding > osc_reqpool_maxreqcount) - adding = osc_reqpool_maxreqcount - req_count; - - added = ptlrpc_add_rqs_to_pool(osc_rq_pool, adding); - atomic_add(added, &osc_pool_req_count); - } - - INIT_LIST_HEAD(&cli->cl_grant_shrink_list); - ns_register_cancel(obd->obd_namespace, osc_cancel_weight); - - spin_lock(&osc_shrink_lock); - list_add_tail(&cli->cl_shrink_list, &osc_shrink_list); - spin_unlock(&osc_shrink_lock); - - return rc; - -out_ptlrpcd_work: - if (cli->cl_writeback_work) { - ptlrpcd_destroy_work(cli->cl_writeback_work); - cli->cl_writeback_work = NULL; - } - if (cli->cl_lru_work) { - ptlrpcd_destroy_work(cli->cl_lru_work); - cli->cl_lru_work = NULL; - } -out_client_setup: - client_obd_cleanup(obd); -out_ptlrpcd: - ptlrpcd_decref(); - return rc; -} - -static int osc_precleanup(struct obd_device *obd) -{ - struct client_obd *cli = &obd->u.cli; - - /* LU-464 - * for echo client, export may be on zombie list, wait for - * zombie thread to cull it, because cli.cl_import will be - * cleared in client_disconnect_export(): - * class_export_destroy() -> obd_cleanup() -> - * echo_device_free() -> echo_client_cleanup() -> - * obd_disconnect() -> osc_disconnect() -> - * client_disconnect_export() - */ - obd_zombie_barrier(); - if (cli->cl_writeback_work) { - ptlrpcd_destroy_work(cli->cl_writeback_work); - cli->cl_writeback_work = NULL; - } - - if (cli->cl_lru_work) { - ptlrpcd_destroy_work(cli->cl_lru_work); - cli->cl_lru_work = NULL; - } - - obd_cleanup_client_import(obd); - ptlrpc_lprocfs_unregister_obd(obd); - lprocfs_obd_cleanup(obd); - return 0; -} - -static int osc_cleanup(struct obd_device *obd) -{ - struct client_obd *cli = &obd->u.cli; - int rc; - - spin_lock(&osc_shrink_lock); - list_del(&cli->cl_shrink_list); - spin_unlock(&osc_shrink_lock); - - /* lru cleanup */ - if (cli->cl_cache) { - 
LASSERT(atomic_read(&cli->cl_cache->ccc_users) > 0); - spin_lock(&cli->cl_cache->ccc_lru_lock); - list_del_init(&cli->cl_lru_osc); - spin_unlock(&cli->cl_cache->ccc_lru_lock); - cli->cl_lru_left = NULL; - cl_cache_decref(cli->cl_cache); - cli->cl_cache = NULL; - } - - /* free memory of osc quota cache */ - osc_quota_cleanup(obd); - - rc = client_obd_cleanup(obd); - - ptlrpcd_decref(); - return rc; -} - -int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg) -{ - struct lprocfs_static_vars lvars = { NULL }; - int rc = 0; - - lprocfs_osc_init_vars(&lvars); - - switch (lcfg->lcfg_command) { - default: - rc = class_process_proc_param(PARAM_OSC, lvars.obd_vars, - lcfg, obd); - if (rc > 0) - rc = 0; - break; - } - - return rc; -} - -static int osc_process_config(struct obd_device *obd, u32 len, void *buf) -{ - return osc_process_config_base(obd, buf); -} - -static struct obd_ops osc_obd_ops = { - .owner = THIS_MODULE, - .setup = osc_setup, - .precleanup = osc_precleanup, - .cleanup = osc_cleanup, - .add_conn = client_import_add_conn, - .del_conn = client_import_del_conn, - .connect = client_connect_import, - .reconnect = osc_reconnect, - .disconnect = osc_disconnect, - .statfs = osc_statfs, - .statfs_async = osc_statfs_async, - .create = osc_create, - .destroy = osc_destroy, - .getattr = osc_getattr, - .setattr = osc_setattr, - .iocontrol = osc_iocontrol, - .set_info_async = osc_set_info_async, - .import_event = osc_import_event, - .process_config = osc_process_config, - .quotactl = osc_quotactl, -}; - -struct list_head osc_shrink_list = LIST_HEAD_INIT(osc_shrink_list); -DEFINE_SPINLOCK(osc_shrink_lock); - -static struct shrinker osc_cache_shrinker = { - .count_objects = osc_cache_shrink_count, - .scan_objects = osc_cache_shrink_scan, - .seeks = DEFAULT_SEEKS, -}; - -static int __init osc_init(void) -{ - struct lprocfs_static_vars lvars = { NULL }; - unsigned int reqpool_size; - unsigned int reqsize; - int rc; - - /* print an address of _any_ 
initialized kernel symbol from this - * module, to allow debugging with gdb that doesn't support data - * symbols from modules. - */ - CDEBUG(D_INFO, "Lustre OSC module (%p).\n", &osc_caches); - - rc = lu_kmem_init(osc_caches); - if (rc) - return rc; - - lprocfs_osc_init_vars(&lvars); - - rc = class_register_type(&osc_obd_ops, NULL, - LUSTRE_OSC_NAME, &osc_device_type); - if (rc) - goto out_kmem; - - rc = register_shrinker(&osc_cache_shrinker); - if (rc) - goto out_type; - - /* This is obviously too much memory, only prevent overflow here */ - if (osc_reqpool_mem_max >= 1 << 12 || osc_reqpool_mem_max == 0) { - rc = -EINVAL; - goto out_type; - } - - reqpool_size = osc_reqpool_mem_max << 20; - - reqsize = 1; - while (reqsize < OST_MAXREQSIZE) - reqsize = reqsize << 1; - - /* - * We don't enlarge the request count in OSC pool according to - * cl_max_rpcs_in_flight. The allocation from the pool will only be - * tried after normal allocation failed. So a small OSC pool won't - * cause much performance degression in most of cases. - */ - osc_reqpool_maxreqcount = reqpool_size / reqsize; - - atomic_set(&osc_pool_req_count, 0); - osc_rq_pool = ptlrpc_init_rq_pool(0, OST_MAXREQSIZE, - ptlrpc_add_rqs_to_pool); - - if (osc_rq_pool) - return 0; - - rc = -ENOMEM; - -out_type: - class_unregister_type(LUSTRE_OSC_NAME); -out_kmem: - lu_kmem_fini(osc_caches); - return rc; -} - -static void /*__exit*/ osc_exit(void) -{ - unregister_shrinker(&osc_cache_shrinker); - class_unregister_type(LUSTRE_OSC_NAME); - lu_kmem_fini(osc_caches); - ptlrpc_free_rq_pool(osc_rq_pool); -} - -MODULE_AUTHOR("OpenSFS, Inc. <http://www.lustre.org/>"); -MODULE_DESCRIPTION("Lustre Object Storage Client (OSC)"); -MODULE_LICENSE("GPL"); -MODULE_VERSION(LUSTRE_VERSION_STRING); - -module_init(osc_init); -module_exit(osc_exit); |