Diffstat (limited to 'drivers')
100 files changed, 2465 insertions, 1080 deletions
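Note (not part of the patch): the bulk of the rbd.c changes below introduce an in-memory object map that packs the state of every RADOS object into 2 bits, four objects per byte, most significant pair first (see __rbd_object_map_index(), __rbd_object_map_get() and __rbd_object_map_set() in the diff). The following is a minimal standalone user-space sketch of that bit packing, written to mirror the patch's helpers; the main() driver and the plain-C types are illustrative only and do not appear in the kernel code.

#include <stdint.h>
#include <stdio.h>

#define BITS_PER_OBJ	2
#define OBJS_PER_BYTE	(8 / BITS_PER_OBJ)
#define OBJ_MASK	((1 << BITS_PER_OBJ) - 1)

/* state values used by the patch: 0 nonexistent, 1 exists, 2 pending, 3 exists-clean */

static uint8_t object_map_get(const uint8_t *map, uint64_t objno)
{
	uint64_t index = objno / OBJS_PER_BYTE;
	uint8_t shift = (OBJS_PER_BYTE - objno % OBJS_PER_BYTE - 1) * BITS_PER_OBJ;

	return (map[index] >> shift) & OBJ_MASK;
}

static void object_map_set(uint8_t *map, uint64_t objno, uint8_t val)
{
	uint64_t index = objno / OBJS_PER_BYTE;
	uint8_t shift = (OBJS_PER_BYTE - objno % OBJS_PER_BYTE - 1) * BITS_PER_OBJ;

	map[index] = (map[index] & ~(OBJ_MASK << shift)) | (val << shift);
}

int main(void)
{
	uint8_t map[2] = { 0 };	/* covers 8 objects */

	object_map_set(map, 0, 1);	/* OBJECT_EXISTS */
	object_map_set(map, 5, 2);	/* OBJECT_PENDING */
	printf("objno 0 -> %u, objno 5 -> %u\n",
	       object_map_get(map, 0), object_map_get(map, 5));
	return 0;
}

In the patch itself the same arithmetic is done with div_u64_rem() under object_map_lock, and updates are pushed to the on-disk object map via the "rbd" class method object_map_update before the in-memory copy is changed.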
diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index 23022cf20d26..c02fa27dd3f3 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -2426,7 +2426,7 @@ static void write_blk_ctl(struct nfit_blk *nfit_blk, unsigned int bw, offset = to_interleave_offset(offset, mmio); writeq(cmd, mmio->addr.base + offset); - nvdimm_flush(nfit_blk->nd_region); + nvdimm_flush(nfit_blk->nd_region, NULL); if (nfit_blk->dimm_flags & NFIT_BLK_DCR_LATCH) readq(mmio->addr.base + offset); @@ -2475,7 +2475,7 @@ static int acpi_nfit_blk_single_io(struct nfit_blk *nfit_blk, } if (rw) - nvdimm_flush(nfit_blk->nd_region); + nvdimm_flush(nfit_blk->nd_region, NULL); rc = read_blk_stat(nfit_blk, lane) ? -EIO : 0; return rc; diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index e5009a34f9c2..3327192bb71f 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -115,6 +115,8 @@ static int atomic_dec_return_safe(atomic_t *v) #define RBD_FEATURE_LAYERING (1ULL<<0) #define RBD_FEATURE_STRIPINGV2 (1ULL<<1) #define RBD_FEATURE_EXCLUSIVE_LOCK (1ULL<<2) +#define RBD_FEATURE_OBJECT_MAP (1ULL<<3) +#define RBD_FEATURE_FAST_DIFF (1ULL<<4) #define RBD_FEATURE_DEEP_FLATTEN (1ULL<<5) #define RBD_FEATURE_DATA_POOL (1ULL<<7) #define RBD_FEATURE_OPERATIONS (1ULL<<8) @@ -122,6 +124,8 @@ static int atomic_dec_return_safe(atomic_t *v) #define RBD_FEATURES_ALL (RBD_FEATURE_LAYERING | \ RBD_FEATURE_STRIPINGV2 | \ RBD_FEATURE_EXCLUSIVE_LOCK | \ + RBD_FEATURE_OBJECT_MAP | \ + RBD_FEATURE_FAST_DIFF | \ RBD_FEATURE_DEEP_FLATTEN | \ RBD_FEATURE_DATA_POOL | \ RBD_FEATURE_OPERATIONS) @@ -203,6 +207,11 @@ struct rbd_client { struct list_head node; }; +struct pending_result { + int result; /* first nonzero result */ + int num_pending; +}; + struct rbd_img_request; enum obj_request_type { @@ -219,6 +228,18 @@ enum obj_operation_type { OBJ_OP_ZEROOUT, }; +#define RBD_OBJ_FLAG_DELETION (1U << 0) +#define RBD_OBJ_FLAG_COPYUP_ENABLED (1U << 1) +#define RBD_OBJ_FLAG_COPYUP_ZEROS (1U << 2) +#define RBD_OBJ_FLAG_MAY_EXIST (1U << 3) +#define RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT (1U << 4) + +enum rbd_obj_read_state { + RBD_OBJ_READ_START = 1, + RBD_OBJ_READ_OBJECT, + RBD_OBJ_READ_PARENT, +}; + /* * Writes go through the following state machine to deal with * layering: @@ -245,17 +266,28 @@ enum obj_operation_type { * even if there is a parent). 
*/ enum rbd_obj_write_state { - RBD_OBJ_WRITE_FLAT = 1, - RBD_OBJ_WRITE_GUARD, - RBD_OBJ_WRITE_READ_FROM_PARENT, - RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC, - RBD_OBJ_WRITE_COPYUP_OPS, + RBD_OBJ_WRITE_START = 1, + RBD_OBJ_WRITE_PRE_OBJECT_MAP, + RBD_OBJ_WRITE_OBJECT, + __RBD_OBJ_WRITE_COPYUP, + RBD_OBJ_WRITE_COPYUP, + RBD_OBJ_WRITE_POST_OBJECT_MAP, +}; + +enum rbd_obj_copyup_state { + RBD_OBJ_COPYUP_START = 1, + RBD_OBJ_COPYUP_READ_PARENT, + __RBD_OBJ_COPYUP_OBJECT_MAPS, + RBD_OBJ_COPYUP_OBJECT_MAPS, + __RBD_OBJ_COPYUP_WRITE_OBJECT, + RBD_OBJ_COPYUP_WRITE_OBJECT, }; struct rbd_obj_request { struct ceph_object_extent ex; + unsigned int flags; /* RBD_OBJ_FLAG_* */ union { - bool tried_parent; /* for reads */ + enum rbd_obj_read_state read_state; /* for reads */ enum rbd_obj_write_state write_state; /* for writes */ }; @@ -271,14 +303,15 @@ struct rbd_obj_request { u32 bvec_idx; }; }; + + enum rbd_obj_copyup_state copyup_state; struct bio_vec *copyup_bvecs; u32 copyup_bvec_count; - struct ceph_osd_request *osd_req; - - u64 xferred; /* bytes transferred */ - int result; + struct list_head osd_reqs; /* w/ r_private_item */ + struct mutex state_mutex; + struct pending_result pending; struct kref kref; }; @@ -287,11 +320,19 @@ enum img_req_flags { IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */ }; +enum rbd_img_state { + RBD_IMG_START = 1, + RBD_IMG_EXCLUSIVE_LOCK, + __RBD_IMG_OBJECT_REQUESTS, + RBD_IMG_OBJECT_REQUESTS, +}; + struct rbd_img_request { struct rbd_device *rbd_dev; enum obj_operation_type op_type; enum obj_request_type data_type; unsigned long flags; + enum rbd_img_state state; union { u64 snap_id; /* for reads */ struct ceph_snap_context *snapc; /* for writes */ @@ -300,13 +341,14 @@ struct rbd_img_request { struct request *rq; /* block request */ struct rbd_obj_request *obj_request; /* obj req initiator */ }; - spinlock_t completion_lock; - u64 xferred;/* aggregate bytes transferred */ - int result; /* first nonzero obj_request result */ + struct list_head lock_item; struct list_head object_extents; /* obj_req.ex structs */ - u32 pending_count; + struct mutex state_mutex; + struct pending_result pending; + struct work_struct work; + int work_result; struct kref kref; }; @@ -380,7 +422,17 @@ struct rbd_device { struct work_struct released_lock_work; struct delayed_work lock_dwork; struct work_struct unlock_work; - wait_queue_head_t lock_waitq; + spinlock_t lock_lists_lock; + struct list_head acquiring_list; + struct list_head running_list; + struct completion acquire_wait; + int acquire_err; + struct completion releasing_wait; + + spinlock_t object_map_lock; + u8 *object_map; + u64 object_map_size; /* in objects */ + u64 object_map_flags; struct workqueue_struct *task_wq; @@ -408,12 +460,10 @@ struct rbd_device { * Flag bits for rbd_dev->flags: * - REMOVING (which is coupled with rbd_dev->open_count) is protected * by rbd_dev->lock - * - BLACKLISTED is protected by rbd_dev->lock_rwsem */ enum rbd_dev_flags { RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */ RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */ - RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */ }; static DEFINE_MUTEX(client_mutex); /* Serialize client creation */ @@ -466,6 +516,8 @@ static int minor_to_rbd_dev_id(int minor) static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev) { + lockdep_assert_held(&rbd_dev->lock_rwsem); + return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED || rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING; } @@ -583,6 +635,26 @@ static int 
_rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, u8 *order, u64 *snap_size); static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, u64 *snap_features); +static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev); + +static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result); +static void rbd_img_handle_request(struct rbd_img_request *img_req, int result); + +/* + * Return true if nothing else is pending. + */ +static bool pending_result_dec(struct pending_result *pending, int *result) +{ + rbd_assert(pending->num_pending > 0); + + if (*result && !pending->result) + pending->result = *result; + if (--pending->num_pending) + return false; + + *result = pending->result; + return true; +} static int rbd_open(struct block_device *bdev, fmode_t mode) { @@ -1317,6 +1389,8 @@ static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes) static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off, u32 bytes) { + dout("%s %p data buf %u~%u\n", __func__, obj_req, off, bytes); + switch (obj_req->img_request->data_type) { case OBJ_REQUEST_BIO: zero_bios(&obj_req->bio_pos, off, bytes); @@ -1339,13 +1413,6 @@ static void rbd_obj_request_put(struct rbd_obj_request *obj_request) kref_put(&obj_request->kref, rbd_obj_request_destroy); } -static void rbd_img_request_get(struct rbd_img_request *img_request) -{ - dout("%s: img %p (was %d)\n", __func__, img_request, - kref_read(&img_request->kref)); - kref_get(&img_request->kref); -} - static void rbd_img_request_destroy(struct kref *kref); static void rbd_img_request_put(struct rbd_img_request *img_request) { @@ -1362,7 +1429,6 @@ static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, /* Image request now owns object's original reference */ obj_request->img_request = img_request; - img_request->pending_count++; dout("%s: img %p obj %p\n", __func__, img_request, obj_request); } @@ -1375,13 +1441,13 @@ static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, rbd_obj_request_put(obj_request); } -static void rbd_obj_request_submit(struct rbd_obj_request *obj_request) +static void rbd_osd_submit(struct ceph_osd_request *osd_req) { - struct ceph_osd_request *osd_req = obj_request->osd_req; + struct rbd_obj_request *obj_req = osd_req->r_priv; - dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__, - obj_request, obj_request->ex.oe_objno, obj_request->ex.oe_off, - obj_request->ex.oe_len, osd_req); + dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n", + __func__, osd_req, obj_req, obj_req->ex.oe_objno, + obj_req->ex.oe_off, obj_req->ex.oe_len); ceph_osdc_start_request(osd_req->r_osdc, osd_req, false); } @@ -1457,41 +1523,38 @@ static bool rbd_img_is_write(struct rbd_img_request *img_req) } } -static void rbd_obj_handle_request(struct rbd_obj_request *obj_req); - static void rbd_osd_req_callback(struct ceph_osd_request *osd_req) { struct rbd_obj_request *obj_req = osd_req->r_priv; + int result; dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req, osd_req->r_result, obj_req); - rbd_assert(osd_req == obj_req->osd_req); - obj_req->result = osd_req->r_result < 0 ? osd_req->r_result : 0; - if (!obj_req->result && !rbd_img_is_write(obj_req->img_request)) - obj_req->xferred = osd_req->r_result; + /* + * Writes aren't allowed to return a data payload. In some + * guarded write cases (e.g. stat + zero on an empty object) + * a stat response makes it through, but we don't care. 
+ */ + if (osd_req->r_result > 0 && rbd_img_is_write(obj_req->img_request)) + result = 0; else - /* - * Writes aren't allowed to return a data payload. In some - * guarded write cases (e.g. stat + zero on an empty object) - * a stat response makes it through, but we don't care. - */ - obj_req->xferred = 0; + result = osd_req->r_result; - rbd_obj_handle_request(obj_req); + rbd_obj_handle_request(obj_req, result); } -static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request) +static void rbd_osd_format_read(struct ceph_osd_request *osd_req) { - struct ceph_osd_request *osd_req = obj_request->osd_req; + struct rbd_obj_request *obj_request = osd_req->r_priv; osd_req->r_flags = CEPH_OSD_FLAG_READ; osd_req->r_snapid = obj_request->img_request->snap_id; } -static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) +static void rbd_osd_format_write(struct ceph_osd_request *osd_req) { - struct ceph_osd_request *osd_req = obj_request->osd_req; + struct rbd_obj_request *obj_request = osd_req->r_priv; osd_req->r_flags = CEPH_OSD_FLAG_WRITE; ktime_get_real_ts64(&osd_req->r_mtime); @@ -1499,19 +1562,21 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) } static struct ceph_osd_request * -__rbd_osd_req_create(struct rbd_obj_request *obj_req, - struct ceph_snap_context *snapc, unsigned int num_ops) +__rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, + struct ceph_snap_context *snapc, int num_ops) { struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct ceph_osd_request *req; const char *name_format = rbd_dev->image_format == 1 ? RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT; + int ret; req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO); if (!req) - return NULL; + return ERR_PTR(-ENOMEM); + list_add_tail(&req->r_private_item, &obj_req->osd_reqs); req->r_callback = rbd_osd_req_callback; req->r_priv = obj_req; @@ -1522,27 +1587,20 @@ __rbd_osd_req_create(struct rbd_obj_request *obj_req, ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc); req->r_base_oloc.pool = rbd_dev->layout.pool_id; - if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format, - rbd_dev->header.object_prefix, obj_req->ex.oe_objno)) - goto err_req; + ret = ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format, + rbd_dev->header.object_prefix, + obj_req->ex.oe_objno); + if (ret) + return ERR_PTR(ret); return req; - -err_req: - ceph_osdc_put_request(req); - return NULL; } static struct ceph_osd_request * -rbd_osd_req_create(struct rbd_obj_request *obj_req, unsigned int num_ops) +rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, int num_ops) { - return __rbd_osd_req_create(obj_req, obj_req->img_request->snapc, - num_ops); -} - -static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) -{ - ceph_osdc_put_request(osd_req); + return __rbd_obj_add_osd_request(obj_req, obj_req->img_request->snapc, + num_ops); } static struct rbd_obj_request *rbd_obj_request_create(void) @@ -1554,6 +1612,8 @@ static struct rbd_obj_request *rbd_obj_request_create(void) return NULL; ceph_object_extent_init(&obj_request->ex); + INIT_LIST_HEAD(&obj_request->osd_reqs); + mutex_init(&obj_request->state_mutex); kref_init(&obj_request->kref); dout("%s %p\n", __func__, obj_request); @@ -1563,14 +1623,19 @@ static struct rbd_obj_request *rbd_obj_request_create(void) static void rbd_obj_request_destroy(struct kref *kref) { struct rbd_obj_request *obj_request; + struct ceph_osd_request *osd_req; u32 
i; obj_request = container_of(kref, struct rbd_obj_request, kref); dout("%s: obj %p\n", __func__, obj_request); - if (obj_request->osd_req) - rbd_osd_req_destroy(obj_request->osd_req); + while (!list_empty(&obj_request->osd_reqs)) { + osd_req = list_first_entry(&obj_request->osd_reqs, + struct ceph_osd_request, r_private_item); + list_del_init(&osd_req->r_private_item); + ceph_osdc_put_request(osd_req); + } switch (obj_request->img_request->data_type) { case OBJ_REQUEST_NODATA: @@ -1684,8 +1749,9 @@ static struct rbd_img_request *rbd_img_request_create( if (rbd_dev_parent_get(rbd_dev)) img_request_layered_set(img_request); - spin_lock_init(&img_request->completion_lock); + INIT_LIST_HEAD(&img_request->lock_item); INIT_LIST_HEAD(&img_request->object_extents); + mutex_init(&img_request->state_mutex); kref_init(&img_request->kref); dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev, @@ -1703,6 +1769,7 @@ static void rbd_img_request_destroy(struct kref *kref) dout("%s: img %p\n", __func__, img_request); + WARN_ON(!list_empty(&img_request->lock_item)); for_each_obj_request_safe(img_request, obj_request, next_obj_request) rbd_img_obj_request_del(img_request, obj_request); @@ -1717,6 +1784,466 @@ static void rbd_img_request_destroy(struct kref *kref) kmem_cache_free(rbd_img_request_cache, img_request); } +#define BITS_PER_OBJ 2 +#define OBJS_PER_BYTE (BITS_PER_BYTE / BITS_PER_OBJ) +#define OBJ_MASK ((1 << BITS_PER_OBJ) - 1) + +static void __rbd_object_map_index(struct rbd_device *rbd_dev, u64 objno, + u64 *index, u8 *shift) +{ + u32 off; + + rbd_assert(objno < rbd_dev->object_map_size); + *index = div_u64_rem(objno, OBJS_PER_BYTE, &off); + *shift = (OBJS_PER_BYTE - off - 1) * BITS_PER_OBJ; +} + +static u8 __rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno) +{ + u64 index; + u8 shift; + + lockdep_assert_held(&rbd_dev->object_map_lock); + __rbd_object_map_index(rbd_dev, objno, &index, &shift); + return (rbd_dev->object_map[index] >> shift) & OBJ_MASK; +} + +static void __rbd_object_map_set(struct rbd_device *rbd_dev, u64 objno, u8 val) +{ + u64 index; + u8 shift; + u8 *p; + + lockdep_assert_held(&rbd_dev->object_map_lock); + rbd_assert(!(val & ~OBJ_MASK)); + + __rbd_object_map_index(rbd_dev, objno, &index, &shift); + p = &rbd_dev->object_map[index]; + *p = (*p & ~(OBJ_MASK << shift)) | (val << shift); +} + +static u8 rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno) +{ + u8 state; + + spin_lock(&rbd_dev->object_map_lock); + state = __rbd_object_map_get(rbd_dev, objno); + spin_unlock(&rbd_dev->object_map_lock); + return state; +} + +static bool use_object_map(struct rbd_device *rbd_dev) +{ + return ((rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) && + !(rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID)); +} + +static bool rbd_object_map_may_exist(struct rbd_device *rbd_dev, u64 objno) +{ + u8 state; + + /* fall back to default logic if object map is disabled or invalid */ + if (!use_object_map(rbd_dev)) + return true; + + state = rbd_object_map_get(rbd_dev, objno); + return state != OBJECT_NONEXISTENT; +} + +static void rbd_object_map_name(struct rbd_device *rbd_dev, u64 snap_id, + struct ceph_object_id *oid) +{ + if (snap_id == CEPH_NOSNAP) + ceph_oid_printf(oid, "%s%s", RBD_OBJECT_MAP_PREFIX, + rbd_dev->spec->image_id); + else + ceph_oid_printf(oid, "%s%s.%016llx", RBD_OBJECT_MAP_PREFIX, + rbd_dev->spec->image_id, snap_id); +} + +static int rbd_object_map_lock(struct rbd_device *rbd_dev) +{ + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + 
CEPH_DEFINE_OID_ONSTACK(oid); + u8 lock_type; + char *lock_tag; + struct ceph_locker *lockers; + u32 num_lockers; + bool broke_lock = false; + int ret; + + rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid); + +again: + ret = ceph_cls_lock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME, + CEPH_CLS_LOCK_EXCLUSIVE, "", "", "", 0); + if (ret != -EBUSY || broke_lock) { + if (ret == -EEXIST) + ret = 0; /* already locked by myself */ + if (ret) + rbd_warn(rbd_dev, "failed to lock object map: %d", ret); + return ret; + } + + ret = ceph_cls_lock_info(osdc, &oid, &rbd_dev->header_oloc, + RBD_LOCK_NAME, &lock_type, &lock_tag, + &lockers, &num_lockers); + if (ret) { + if (ret == -ENOENT) + goto again; + + rbd_warn(rbd_dev, "failed to get object map lockers: %d", ret); + return ret; + } + + kfree(lock_tag); + if (num_lockers == 0) + goto again; + + rbd_warn(rbd_dev, "breaking object map lock owned by %s%llu", + ENTITY_NAME(lockers[0].id.name)); + + ret = ceph_cls_break_lock(osdc, &oid, &rbd_dev->header_oloc, + RBD_LOCK_NAME, lockers[0].id.cookie, + &lockers[0].id.name); + ceph_free_lockers(lockers, num_lockers); + if (ret) { + if (ret == -ENOENT) + goto again; + + rbd_warn(rbd_dev, "failed to break object map lock: %d", ret); + return ret; + } + + broke_lock = true; + goto again; +} + +static void rbd_object_map_unlock(struct rbd_device *rbd_dev) +{ + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + CEPH_DEFINE_OID_ONSTACK(oid); + int ret; + + rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid); + + ret = ceph_cls_unlock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME, + ""); + if (ret && ret != -ENOENT) + rbd_warn(rbd_dev, "failed to unlock object map: %d", ret); +} + +static int decode_object_map_header(void **p, void *end, u64 *object_map_size) +{ + u8 struct_v; + u32 struct_len; + u32 header_len; + void *header_end; + int ret; + + ceph_decode_32_safe(p, end, header_len, e_inval); + header_end = *p + header_len; + + ret = ceph_start_decoding(p, end, 1, "BitVector header", &struct_v, + &struct_len); + if (ret) + return ret; + + ceph_decode_64_safe(p, end, *object_map_size, e_inval); + + *p = header_end; + return 0; + +e_inval: + return -EINVAL; +} + +static int __rbd_object_map_load(struct rbd_device *rbd_dev) +{ + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + CEPH_DEFINE_OID_ONSTACK(oid); + struct page **pages; + void *p, *end; + size_t reply_len; + u64 num_objects; + u64 object_map_bytes; + u64 object_map_size; + int num_pages; + int ret; + + rbd_assert(!rbd_dev->object_map && !rbd_dev->object_map_size); + + num_objects = ceph_get_num_objects(&rbd_dev->layout, + rbd_dev->mapping.size); + object_map_bytes = DIV_ROUND_UP_ULL(num_objects * BITS_PER_OBJ, + BITS_PER_BYTE); + num_pages = calc_pages_for(0, object_map_bytes) + 1; + pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + reply_len = num_pages * PAGE_SIZE; + rbd_object_map_name(rbd_dev, rbd_dev->spec->snap_id, &oid); + ret = ceph_osdc_call(osdc, &oid, &rbd_dev->header_oloc, + "rbd", "object_map_load", CEPH_OSD_FLAG_READ, + NULL, 0, pages, &reply_len); + if (ret) + goto out; + + p = page_address(pages[0]); + end = p + min(reply_len, (size_t)PAGE_SIZE); + ret = decode_object_map_header(&p, end, &object_map_size); + if (ret) + goto out; + + if (object_map_size != num_objects) { + rbd_warn(rbd_dev, "object map size mismatch: %llu vs %llu", + object_map_size, num_objects); + ret = -EINVAL; + goto out; + } + + if (offset_in_page(p) + object_map_bytes > 
reply_len) { + ret = -EINVAL; + goto out; + } + + rbd_dev->object_map = kvmalloc(object_map_bytes, GFP_KERNEL); + if (!rbd_dev->object_map) { + ret = -ENOMEM; + goto out; + } + + rbd_dev->object_map_size = object_map_size; + ceph_copy_from_page_vector(pages, rbd_dev->object_map, + offset_in_page(p), object_map_bytes); + +out: + ceph_release_page_vector(pages, num_pages); + return ret; +} + +static void rbd_object_map_free(struct rbd_device *rbd_dev) +{ + kvfree(rbd_dev->object_map); + rbd_dev->object_map = NULL; + rbd_dev->object_map_size = 0; +} + +static int rbd_object_map_load(struct rbd_device *rbd_dev) +{ + int ret; + + ret = __rbd_object_map_load(rbd_dev); + if (ret) + return ret; + + ret = rbd_dev_v2_get_flags(rbd_dev); + if (ret) { + rbd_object_map_free(rbd_dev); + return ret; + } + + if (rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID) + rbd_warn(rbd_dev, "object map is invalid"); + + return 0; +} + +static int rbd_object_map_open(struct rbd_device *rbd_dev) +{ + int ret; + + ret = rbd_object_map_lock(rbd_dev); + if (ret) + return ret; + + ret = rbd_object_map_load(rbd_dev); + if (ret) { + rbd_object_map_unlock(rbd_dev); + return ret; + } + + return 0; +} + +static void rbd_object_map_close(struct rbd_device *rbd_dev) +{ + rbd_object_map_free(rbd_dev); + rbd_object_map_unlock(rbd_dev); +} + +/* + * This function needs snap_id (or more precisely just something to + * distinguish between HEAD and snapshot object maps), new_state and + * current_state that were passed to rbd_object_map_update(). + * + * To avoid allocating and stashing a context we piggyback on the OSD + * request. A HEAD update has two ops (assert_locked). For new_state + * and current_state we decode our own object_map_update op, encoded in + * rbd_cls_object_map_update(). + */ +static int rbd_object_map_update_finish(struct rbd_obj_request *obj_req, + struct ceph_osd_request *osd_req) +{ + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; + struct ceph_osd_data *osd_data; + u64 objno; + u8 state, new_state, current_state; + bool has_current_state; + void *p; + + if (osd_req->r_result) + return osd_req->r_result; + + /* + * Nothing to do for a snapshot object map. + */ + if (osd_req->r_num_ops == 1) + return 0; + + /* + * Update in-memory HEAD object map. 
+ */ + rbd_assert(osd_req->r_num_ops == 2); + osd_data = osd_req_op_data(osd_req, 1, cls, request_data); + rbd_assert(osd_data->type == CEPH_OSD_DATA_TYPE_PAGES); + + p = page_address(osd_data->pages[0]); + objno = ceph_decode_64(&p); + rbd_assert(objno == obj_req->ex.oe_objno); + rbd_assert(ceph_decode_64(&p) == objno + 1); + new_state = ceph_decode_8(&p); + has_current_state = ceph_decode_8(&p); + if (has_current_state) + current_state = ceph_decode_8(&p); + + spin_lock(&rbd_dev->object_map_lock); + state = __rbd_object_map_get(rbd_dev, objno); + if (!has_current_state || current_state == state || + (current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN)) + __rbd_object_map_set(rbd_dev, objno, new_state); + spin_unlock(&rbd_dev->object_map_lock); + + return 0; +} + +static void rbd_object_map_callback(struct ceph_osd_request *osd_req) +{ + struct rbd_obj_request *obj_req = osd_req->r_priv; + int result; + + dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req, + osd_req->r_result, obj_req); + + result = rbd_object_map_update_finish(obj_req, osd_req); + rbd_obj_handle_request(obj_req, result); +} + +static bool update_needed(struct rbd_device *rbd_dev, u64 objno, u8 new_state) +{ + u8 state = rbd_object_map_get(rbd_dev, objno); + + if (state == new_state || + (new_state == OBJECT_PENDING && state == OBJECT_NONEXISTENT) || + (new_state == OBJECT_NONEXISTENT && state != OBJECT_PENDING)) + return false; + + return true; +} + +static int rbd_cls_object_map_update(struct ceph_osd_request *req, + int which, u64 objno, u8 new_state, + const u8 *current_state) +{ + struct page **pages; + void *p, *start; + int ret; + + ret = osd_req_op_cls_init(req, which, "rbd", "object_map_update"); + if (ret) + return ret; + + pages = ceph_alloc_page_vector(1, GFP_NOIO); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + p = start = page_address(pages[0]); + ceph_encode_64(&p, objno); + ceph_encode_64(&p, objno + 1); + ceph_encode_8(&p, new_state); + if (current_state) { + ceph_encode_8(&p, 1); + ceph_encode_8(&p, *current_state); + } else { + ceph_encode_8(&p, 0); + } + + osd_req_op_cls_request_data_pages(req, which, pages, p - start, 0, + false, true); + return 0; +} + +/* + * Return: + * 0 - object map update sent + * 1 - object map update isn't needed + * <0 - error + */ +static int rbd_object_map_update(struct rbd_obj_request *obj_req, u64 snap_id, + u8 new_state, const u8 *current_state) +{ + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + struct ceph_osd_request *req; + int num_ops = 1; + int which = 0; + int ret; + + if (snap_id == CEPH_NOSNAP) { + if (!update_needed(rbd_dev, obj_req->ex.oe_objno, new_state)) + return 1; + + num_ops++; /* assert_locked */ + } + + req = ceph_osdc_alloc_request(osdc, NULL, num_ops, false, GFP_NOIO); + if (!req) + return -ENOMEM; + + list_add_tail(&req->r_private_item, &obj_req->osd_reqs); + req->r_callback = rbd_object_map_callback; + req->r_priv = obj_req; + + rbd_object_map_name(rbd_dev, snap_id, &req->r_base_oid); + ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc); + req->r_flags = CEPH_OSD_FLAG_WRITE; + ktime_get_real_ts64(&req->r_mtime); + + if (snap_id == CEPH_NOSNAP) { + /* + * Protect against possible race conditions during lock + * ownership transitions. 
+ */ + ret = ceph_cls_assert_locked(req, which++, RBD_LOCK_NAME, + CEPH_CLS_LOCK_EXCLUSIVE, "", ""); + if (ret) + return ret; + } + + ret = rbd_cls_object_map_update(req, which, obj_req->ex.oe_objno, + new_state, current_state); + if (ret) + return ret; + + ret = ceph_osdc_alloc_messages(req, GFP_NOIO); + if (ret) + return ret; + + ceph_osdc_start_request(osdc, req, false); + return 0; +} + static void prune_extents(struct ceph_file_extent *img_extents, u32 *num_img_extents, u64 overlap) { @@ -1764,11 +2291,13 @@ static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req, return 0; } -static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which) +static void rbd_osd_setup_data(struct ceph_osd_request *osd_req, int which) { + struct rbd_obj_request *obj_req = osd_req->r_priv; + switch (obj_req->img_request->data_type) { case OBJ_REQUEST_BIO: - osd_req_op_extent_osd_data_bio(obj_req->osd_req, which, + osd_req_op_extent_osd_data_bio(osd_req, which, &obj_req->bio_pos, obj_req->ex.oe_len); break; @@ -1777,7 +2306,7 @@ static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which) rbd_assert(obj_req->bvec_pos.iter.bi_size == obj_req->ex.oe_len); rbd_assert(obj_req->bvec_idx == obj_req->bvec_count); - osd_req_op_extent_osd_data_bvec_pos(obj_req->osd_req, which, + osd_req_op_extent_osd_data_bvec_pos(osd_req, which, &obj_req->bvec_pos); break; default: @@ -1785,22 +2314,7 @@ static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which) } } -static int rbd_obj_setup_read(struct rbd_obj_request *obj_req) -{ - obj_req->osd_req = __rbd_osd_req_create(obj_req, NULL, 1); - if (!obj_req->osd_req) - return -ENOMEM; - - osd_req_op_extent_init(obj_req->osd_req, 0, CEPH_OSD_OP_READ, - obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0); - rbd_osd_req_setup_data(obj_req, 0); - - rbd_osd_req_format_read(obj_req); - return 0; -} - -static int __rbd_obj_setup_stat(struct rbd_obj_request *obj_req, - unsigned int which) +static int rbd_osd_setup_stat(struct ceph_osd_request *osd_req, int which) { struct page **pages; @@ -1816,45 +2330,60 @@ static int __rbd_obj_setup_stat(struct rbd_obj_request *obj_req, if (IS_ERR(pages)) return PTR_ERR(pages); - osd_req_op_init(obj_req->osd_req, which, CEPH_OSD_OP_STAT, 0); - osd_req_op_raw_data_in_pages(obj_req->osd_req, which, pages, + osd_req_op_init(osd_req, which, CEPH_OSD_OP_STAT, 0); + osd_req_op_raw_data_in_pages(osd_req, which, pages, 8 + sizeof(struct ceph_timespec), 0, false, true); return 0; } -static int count_write_ops(struct rbd_obj_request *obj_req) +static int rbd_osd_setup_copyup(struct ceph_osd_request *osd_req, int which, + u32 bytes) +{ + struct rbd_obj_request *obj_req = osd_req->r_priv; + int ret; + + ret = osd_req_op_cls_init(osd_req, which, "rbd", "copyup"); + if (ret) + return ret; + + osd_req_op_cls_request_data_bvecs(osd_req, which, obj_req->copyup_bvecs, + obj_req->copyup_bvec_count, bytes); + return 0; +} + +static int rbd_obj_init_read(struct rbd_obj_request *obj_req) { - return 2; /* setallochint + write/writefull */ + obj_req->read_state = RBD_OBJ_READ_START; + return 0; } -static void __rbd_obj_setup_write(struct rbd_obj_request *obj_req, - unsigned int which) +static void __rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req, + int which) { + struct rbd_obj_request *obj_req = osd_req->r_priv; struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; u16 opcode; - osd_req_op_alloc_hint_init(obj_req->osd_req, which++, - rbd_dev->layout.object_size, - rbd_dev->layout.object_size); + if 
(!use_object_map(rbd_dev) || + !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST)) { + osd_req_op_alloc_hint_init(osd_req, which++, + rbd_dev->layout.object_size, + rbd_dev->layout.object_size); + } if (rbd_obj_is_entire(obj_req)) opcode = CEPH_OSD_OP_WRITEFULL; else opcode = CEPH_OSD_OP_WRITE; - osd_req_op_extent_init(obj_req->osd_req, which, opcode, + osd_req_op_extent_init(osd_req, which, opcode, obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0); - rbd_osd_req_setup_data(obj_req, which++); - - rbd_assert(which == obj_req->osd_req->r_num_ops); - rbd_osd_req_format_write(obj_req); + rbd_osd_setup_data(osd_req, which); } -static int rbd_obj_setup_write(struct rbd_obj_request *obj_req) +static int rbd_obj_init_write(struct rbd_obj_request *obj_req) { - unsigned int num_osd_ops, which = 0; - bool need_guard; int ret; /* reverse map the entire object onto the parent */ @@ -1862,24 +2391,10 @@ static int rbd_obj_setup_write(struct rbd_obj_request *obj_req) if (ret) return ret; - need_guard = rbd_obj_copyup_enabled(obj_req); - num_osd_ops = need_guard + count_write_ops(obj_req); - - obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops); - if (!obj_req->osd_req) - return -ENOMEM; - - if (need_guard) { - ret = __rbd_obj_setup_stat(obj_req, which++); - if (ret) - return ret; + if (rbd_obj_copyup_enabled(obj_req)) + obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED; - obj_req->write_state = RBD_OBJ_WRITE_GUARD; - } else { - obj_req->write_state = RBD_OBJ_WRITE_FLAT; - } - - __rbd_obj_setup_write(obj_req, which); + obj_req->write_state = RBD_OBJ_WRITE_START; return 0; } @@ -1889,11 +2404,26 @@ static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req) CEPH_OSD_OP_ZERO; } -static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req) +static void __rbd_osd_setup_discard_ops(struct ceph_osd_request *osd_req, + int which) +{ + struct rbd_obj_request *obj_req = osd_req->r_priv; + + if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) { + rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION); + osd_req_op_init(osd_req, which, CEPH_OSD_OP_DELETE, 0); + } else { + osd_req_op_extent_init(osd_req, which, + truncate_or_zero_opcode(obj_req), + obj_req->ex.oe_off, obj_req->ex.oe_len, + 0, 0); + } +} + +static int rbd_obj_init_discard(struct rbd_obj_request *obj_req) { struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; - u64 off = obj_req->ex.oe_off; - u64 next_off = obj_req->ex.oe_off + obj_req->ex.oe_len; + u64 off, next_off; int ret; /* @@ -1906,10 +2436,17 @@ static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req) */ if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size || !rbd_obj_is_tail(obj_req)) { - off = round_up(off, rbd_dev->opts->alloc_size); - next_off = round_down(next_off, rbd_dev->opts->alloc_size); + off = round_up(obj_req->ex.oe_off, rbd_dev->opts->alloc_size); + next_off = round_down(obj_req->ex.oe_off + obj_req->ex.oe_len, + rbd_dev->opts->alloc_size); if (off >= next_off) return 1; + + dout("%s %p %llu~%llu -> %llu~%llu\n", __func__, + obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len, + off, next_off - off); + obj_req->ex.oe_off = off; + obj_req->ex.oe_len = next_off - off; } /* reverse map the entire object onto the parent */ @@ -1917,52 +2454,29 @@ static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req) if (ret) return ret; - obj_req->osd_req = rbd_osd_req_create(obj_req, 1); - if (!obj_req->osd_req) - return -ENOMEM; - - if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) { - osd_req_op_init(obj_req->osd_req, 0, CEPH_OSD_OP_DELETE, 0); - 
} else { - dout("%s %p %llu~%llu -> %llu~%llu\n", __func__, - obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len, - off, next_off - off); - osd_req_op_extent_init(obj_req->osd_req, 0, - truncate_or_zero_opcode(obj_req), - off, next_off - off, 0, 0); - } + obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT; + if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) + obj_req->flags |= RBD_OBJ_FLAG_DELETION; - obj_req->write_state = RBD_OBJ_WRITE_FLAT; - rbd_osd_req_format_write(obj_req); + obj_req->write_state = RBD_OBJ_WRITE_START; return 0; } -static int count_zeroout_ops(struct rbd_obj_request *obj_req) -{ - int num_osd_ops; - - if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents && - !rbd_obj_copyup_enabled(obj_req)) - num_osd_ops = 2; /* create + truncate */ - else - num_osd_ops = 1; /* delete/truncate/zero */ - - return num_osd_ops; -} - -static void __rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req, - unsigned int which) +static void __rbd_osd_setup_zeroout_ops(struct ceph_osd_request *osd_req, + int which) { + struct rbd_obj_request *obj_req = osd_req->r_priv; u16 opcode; if (rbd_obj_is_entire(obj_req)) { if (obj_req->num_img_extents) { - if (!rbd_obj_copyup_enabled(obj_req)) - osd_req_op_init(obj_req->osd_req, which++, + if (!(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)) + osd_req_op_init(osd_req, which++, CEPH_OSD_OP_CREATE, 0); opcode = CEPH_OSD_OP_TRUNCATE; } else { - osd_req_op_init(obj_req->osd_req, which++, + rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION); + osd_req_op_init(osd_req, which++, CEPH_OSD_OP_DELETE, 0); opcode = 0; } @@ -1971,18 +2485,13 @@ static void __rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req, } if (opcode) - osd_req_op_extent_init(obj_req->osd_req, which++, opcode, + osd_req_op_extent_init(osd_req, which, opcode, obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0); - - rbd_assert(which == obj_req->osd_req->r_num_ops); - rbd_osd_req_format_write(obj_req); } -static int rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req) +static int rbd_obj_init_zeroout(struct rbd_obj_request *obj_req) { - unsigned int num_osd_ops, which = 0; - bool need_guard; int ret; /* reverse map the entire object onto the parent */ @@ -1990,31 +2499,66 @@ static int rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req) if (ret) return ret; - need_guard = rbd_obj_copyup_enabled(obj_req); - num_osd_ops = need_guard + count_zeroout_ops(obj_req); + if (rbd_obj_copyup_enabled(obj_req)) + obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED; + if (!obj_req->num_img_extents) { + obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT; + if (rbd_obj_is_entire(obj_req)) + obj_req->flags |= RBD_OBJ_FLAG_DELETION; + } - obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops); - if (!obj_req->osd_req) - return -ENOMEM; + obj_req->write_state = RBD_OBJ_WRITE_START; + return 0; +} - if (need_guard) { - ret = __rbd_obj_setup_stat(obj_req, which++); - if (ret) - return ret; +static int count_write_ops(struct rbd_obj_request *obj_req) +{ + struct rbd_img_request *img_req = obj_req->img_request; - obj_req->write_state = RBD_OBJ_WRITE_GUARD; - } else { - obj_req->write_state = RBD_OBJ_WRITE_FLAT; + switch (img_req->op_type) { + case OBJ_OP_WRITE: + if (!use_object_map(img_req->rbd_dev) || + !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST)) + return 2; /* setallochint + write/writefull */ + + return 1; /* write/writefull */ + case OBJ_OP_DISCARD: + return 1; /* delete/truncate/zero */ + case OBJ_OP_ZEROOUT: + if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents && + !(obj_req->flags & 
RBD_OBJ_FLAG_COPYUP_ENABLED)) + return 2; /* create + truncate */ + + return 1; /* delete/truncate/zero */ + default: + BUG(); } +} - __rbd_obj_setup_zeroout(obj_req, which); - return 0; +static void rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req, + int which) +{ + struct rbd_obj_request *obj_req = osd_req->r_priv; + + switch (obj_req->img_request->op_type) { + case OBJ_OP_WRITE: + __rbd_osd_setup_write_ops(osd_req, which); + break; + case OBJ_OP_DISCARD: + __rbd_osd_setup_discard_ops(osd_req, which); + break; + case OBJ_OP_ZEROOUT: + __rbd_osd_setup_zeroout_ops(osd_req, which); + break; + default: + BUG(); + } } /* - * For each object request in @img_req, allocate an OSD request, add - * individual OSD ops and prepare them for submission. The number of - * OSD ops depends on op_type and the overlap point (if any). + * Prune the list of object requests (adjust offset and/or length, drop + * redundant requests). Prepare object request state machines and image + * request state machine for execution. */ static int __rbd_img_fill_request(struct rbd_img_request *img_req) { @@ -2024,16 +2568,16 @@ static int __rbd_img_fill_request(struct rbd_img_request *img_req) for_each_obj_request_safe(img_req, obj_req, next_obj_req) { switch (img_req->op_type) { case OBJ_OP_READ: - ret = rbd_obj_setup_read(obj_req); + ret = rbd_obj_init_read(obj_req); break; case OBJ_OP_WRITE: - ret = rbd_obj_setup_write(obj_req); + ret = rbd_obj_init_write(obj_req); break; case OBJ_OP_DISCARD: - ret = rbd_obj_setup_discard(obj_req); + ret = rbd_obj_init_discard(obj_req); break; case OBJ_OP_ZEROOUT: - ret = rbd_obj_setup_zeroout(obj_req); + ret = rbd_obj_init_zeroout(obj_req); break; default: BUG(); @@ -2041,17 +2585,12 @@ static int __rbd_img_fill_request(struct rbd_img_request *img_req) if (ret < 0) return ret; if (ret > 0) { - img_req->xferred += obj_req->ex.oe_len; - img_req->pending_count--; rbd_img_obj_request_del(img_req, obj_req); continue; } - - ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO); - if (ret) - return ret; } + img_req->state = RBD_IMG_START; return 0; } @@ -2340,17 +2879,55 @@ static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req, &it); } -static void rbd_img_request_submit(struct rbd_img_request *img_request) +static void rbd_img_handle_request_work(struct work_struct *work) { - struct rbd_obj_request *obj_request; + struct rbd_img_request *img_req = + container_of(work, struct rbd_img_request, work); - dout("%s: img %p\n", __func__, img_request); + rbd_img_handle_request(img_req, img_req->work_result); +} - rbd_img_request_get(img_request); - for_each_obj_request(img_request, obj_request) - rbd_obj_request_submit(obj_request); +static void rbd_img_schedule(struct rbd_img_request *img_req, int result) +{ + INIT_WORK(&img_req->work, rbd_img_handle_request_work); + img_req->work_result = result; + queue_work(rbd_wq, &img_req->work); +} - rbd_img_request_put(img_request); +static bool rbd_obj_may_exist(struct rbd_obj_request *obj_req) +{ + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; + + if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno)) { + obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST; + return true; + } + + dout("%s %p objno %llu assuming dne\n", __func__, obj_req, + obj_req->ex.oe_objno); + return false; +} + +static int rbd_obj_read_object(struct rbd_obj_request *obj_req) +{ + struct ceph_osd_request *osd_req; + int ret; + + osd_req = __rbd_obj_add_osd_request(obj_req, NULL, 1); + if (IS_ERR(osd_req)) + return PTR_ERR(osd_req); + + 
osd_req_op_extent_init(osd_req, 0, CEPH_OSD_OP_READ, + obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0); + rbd_osd_setup_data(osd_req, 0); + rbd_osd_format_read(osd_req); + + ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO); + if (ret) + return ret; + + rbd_osd_submit(osd_req); + return 0; } static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req) @@ -2396,51 +2973,144 @@ static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req) return ret; } - rbd_img_request_submit(child_img_req); + /* avoid parent chain recursion */ + rbd_img_schedule(child_img_req, 0); return 0; } -static bool rbd_obj_handle_read(struct rbd_obj_request *obj_req) +static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result) { struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; int ret; - if (obj_req->result == -ENOENT && - rbd_dev->parent_overlap && !obj_req->tried_parent) { - /* reverse map this object extent onto the parent */ - ret = rbd_obj_calc_img_extents(obj_req, false); +again: + switch (obj_req->read_state) { + case RBD_OBJ_READ_START: + rbd_assert(!*result); + + if (!rbd_obj_may_exist(obj_req)) { + *result = -ENOENT; + obj_req->read_state = RBD_OBJ_READ_OBJECT; + goto again; + } + + ret = rbd_obj_read_object(obj_req); if (ret) { - obj_req->result = ret; + *result = ret; return true; } - - if (obj_req->num_img_extents) { - obj_req->tried_parent = true; - ret = rbd_obj_read_from_parent(obj_req); + obj_req->read_state = RBD_OBJ_READ_OBJECT; + return false; + case RBD_OBJ_READ_OBJECT: + if (*result == -ENOENT && rbd_dev->parent_overlap) { + /* reverse map this object extent onto the parent */ + ret = rbd_obj_calc_img_extents(obj_req, false); if (ret) { - obj_req->result = ret; + *result = ret; return true; } - return false; + if (obj_req->num_img_extents) { + ret = rbd_obj_read_from_parent(obj_req); + if (ret) { + *result = ret; + return true; + } + obj_req->read_state = RBD_OBJ_READ_PARENT; + return false; + } + } + + /* + * -ENOENT means a hole in the image -- zero-fill the entire + * length of the request. A short read also implies zero-fill + * to the end of the request. + */ + if (*result == -ENOENT) { + rbd_obj_zero_range(obj_req, 0, obj_req->ex.oe_len); + *result = 0; + } else if (*result >= 0) { + if (*result < obj_req->ex.oe_len) + rbd_obj_zero_range(obj_req, *result, + obj_req->ex.oe_len - *result); + else + rbd_assert(*result == obj_req->ex.oe_len); + *result = 0; } + return true; + case RBD_OBJ_READ_PARENT: + return true; + default: + BUG(); } +} - /* - * -ENOENT means a hole in the image -- zero-fill the entire - * length of the request. A short read also implies zero-fill - * to the end of the request. In both cases we update xferred - * count to indicate the whole request was satisfied. 
- */ - if (obj_req->result == -ENOENT || - (!obj_req->result && obj_req->xferred < obj_req->ex.oe_len)) { - rbd_assert(!obj_req->xferred || !obj_req->result); - rbd_obj_zero_range(obj_req, obj_req->xferred, - obj_req->ex.oe_len - obj_req->xferred); - obj_req->result = 0; - obj_req->xferred = obj_req->ex.oe_len; +static bool rbd_obj_write_is_noop(struct rbd_obj_request *obj_req) +{ + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; + + if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno)) + obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST; + + if (!(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST) && + (obj_req->flags & RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT)) { + dout("%s %p noop for nonexistent\n", __func__, obj_req); + return true; } - return true; + return false; +} + +/* + * Return: + * 0 - object map update sent + * 1 - object map update isn't needed + * <0 - error + */ +static int rbd_obj_write_pre_object_map(struct rbd_obj_request *obj_req) +{ + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; + u8 new_state; + + if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) + return 1; + + if (obj_req->flags & RBD_OBJ_FLAG_DELETION) + new_state = OBJECT_PENDING; + else + new_state = OBJECT_EXISTS; + + return rbd_object_map_update(obj_req, CEPH_NOSNAP, new_state, NULL); +} + +static int rbd_obj_write_object(struct rbd_obj_request *obj_req) +{ + struct ceph_osd_request *osd_req; + int num_ops = count_write_ops(obj_req); + int which = 0; + int ret; + + if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) + num_ops++; /* stat */ + + osd_req = rbd_obj_add_osd_request(obj_req, num_ops); + if (IS_ERR(osd_req)) + return PTR_ERR(osd_req); + + if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) { + ret = rbd_osd_setup_stat(osd_req, which++); + if (ret) + return ret; + } + + rbd_osd_setup_write_ops(osd_req, which); + rbd_osd_format_write(osd_req); + + ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO); + if (ret) + return ret; + + rbd_osd_submit(osd_req); + return 0; } /* @@ -2463,123 +3133,67 @@ static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes) #define MODS_ONLY U32_MAX -static int rbd_obj_issue_copyup_empty_snapc(struct rbd_obj_request *obj_req, - u32 bytes) +static int rbd_obj_copyup_empty_snapc(struct rbd_obj_request *obj_req, + u32 bytes) { + struct ceph_osd_request *osd_req; int ret; dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes); - rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT); rbd_assert(bytes > 0 && bytes != MODS_ONLY); - rbd_osd_req_destroy(obj_req->osd_req); - obj_req->osd_req = __rbd_osd_req_create(obj_req, &rbd_empty_snapc, 1); - if (!obj_req->osd_req) - return -ENOMEM; + osd_req = __rbd_obj_add_osd_request(obj_req, &rbd_empty_snapc, 1); + if (IS_ERR(osd_req)) + return PTR_ERR(osd_req); - ret = osd_req_op_cls_init(obj_req->osd_req, 0, "rbd", "copyup"); + ret = rbd_osd_setup_copyup(osd_req, 0, bytes); if (ret) return ret; - osd_req_op_cls_request_data_bvecs(obj_req->osd_req, 0, - obj_req->copyup_bvecs, - obj_req->copyup_bvec_count, - bytes); - rbd_osd_req_format_write(obj_req); + rbd_osd_format_write(osd_req); - ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO); + ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO); if (ret) return ret; - rbd_obj_request_submit(obj_req); + rbd_osd_submit(osd_req); return 0; } -static int rbd_obj_issue_copyup_ops(struct rbd_obj_request *obj_req, u32 bytes) +static int rbd_obj_copyup_current_snapc(struct rbd_obj_request *obj_req, + u32 bytes) { - struct rbd_img_request *img_req = obj_req->img_request; - 
unsigned int num_osd_ops = (bytes != MODS_ONLY); - unsigned int which = 0; + struct ceph_osd_request *osd_req; + int num_ops = count_write_ops(obj_req); + int which = 0; int ret; dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes); - rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT || - obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_CALL); - rbd_osd_req_destroy(obj_req->osd_req); - switch (img_req->op_type) { - case OBJ_OP_WRITE: - num_osd_ops += count_write_ops(obj_req); - break; - case OBJ_OP_ZEROOUT: - num_osd_ops += count_zeroout_ops(obj_req); - break; - default: - BUG(); - } + if (bytes != MODS_ONLY) + num_ops++; /* copyup */ - obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops); - if (!obj_req->osd_req) - return -ENOMEM; + osd_req = rbd_obj_add_osd_request(obj_req, num_ops); + if (IS_ERR(osd_req)) + return PTR_ERR(osd_req); if (bytes != MODS_ONLY) { - ret = osd_req_op_cls_init(obj_req->osd_req, which, "rbd", - "copyup"); + ret = rbd_osd_setup_copyup(osd_req, which++, bytes); if (ret) return ret; - - osd_req_op_cls_request_data_bvecs(obj_req->osd_req, which++, - obj_req->copyup_bvecs, - obj_req->copyup_bvec_count, - bytes); } - switch (img_req->op_type) { - case OBJ_OP_WRITE: - __rbd_obj_setup_write(obj_req, which); - break; - case OBJ_OP_ZEROOUT: - __rbd_obj_setup_zeroout(obj_req, which); - break; - default: - BUG(); - } + rbd_osd_setup_write_ops(osd_req, which); + rbd_osd_format_write(osd_req); - ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO); + ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO); if (ret) return ret; - rbd_obj_request_submit(obj_req); + rbd_osd_submit(osd_req); return 0; } -static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes) -{ - /* - * Only send non-zero copyup data to save some I/O and network - * bandwidth -- zero copyup data is equivalent to the object not - * existing. - */ - if (is_zero_bvecs(obj_req->copyup_bvecs, bytes)) { - dout("%s obj_req %p detected zeroes\n", __func__, obj_req); - bytes = 0; - } - - if (obj_req->img_request->snapc->num_snaps && bytes > 0) { - /* - * Send a copyup request with an empty snapshot context to - * deep-copyup the object through all existing snapshots. - * A second request with the current snapshot context will be - * sent for the actual modification. - */ - obj_req->write_state = RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC; - return rbd_obj_issue_copyup_empty_snapc(obj_req, bytes); - } - - obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS; - return rbd_obj_issue_copyup_ops(obj_req, bytes); -} - static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap) { u32 i; @@ -2608,7 +3222,12 @@ static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap) return 0; } -static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req) +/* + * The target object doesn't exist. Read the data for the entire + * target object up to the overlap point (if any) from the parent, + * so we can use it for a copyup. + */ +static int rbd_obj_copyup_read_parent(struct rbd_obj_request *obj_req) { struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; int ret; @@ -2623,178 +3242,492 @@ static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req) * request -- pass MODS_ONLY since the copyup isn't needed * anymore. 
*/ - obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS; - return rbd_obj_issue_copyup_ops(obj_req, MODS_ONLY); + return rbd_obj_copyup_current_snapc(obj_req, MODS_ONLY); } ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req)); if (ret) return ret; - obj_req->write_state = RBD_OBJ_WRITE_READ_FROM_PARENT; return rbd_obj_read_from_parent(obj_req); } -static bool rbd_obj_handle_write(struct rbd_obj_request *obj_req) +static void rbd_obj_copyup_object_maps(struct rbd_obj_request *obj_req) { + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; + struct ceph_snap_context *snapc = obj_req->img_request->snapc; + u8 new_state; + u32 i; int ret; - switch (obj_req->write_state) { - case RBD_OBJ_WRITE_GUARD: - rbd_assert(!obj_req->xferred); - if (obj_req->result == -ENOENT) { - /* - * The target object doesn't exist. Read the data for - * the entire target object up to the overlap point (if - * any) from the parent, so we can use it for a copyup. - */ - ret = rbd_obj_handle_write_guard(obj_req); - if (ret) { - obj_req->result = ret; - return true; - } - return false; + rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending); + + if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) + return; + + if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS) + return; + + for (i = 0; i < snapc->num_snaps; i++) { + if ((rbd_dev->header.features & RBD_FEATURE_FAST_DIFF) && + i + 1 < snapc->num_snaps) + new_state = OBJECT_EXISTS_CLEAN; + else + new_state = OBJECT_EXISTS; + + ret = rbd_object_map_update(obj_req, snapc->snaps[i], + new_state, NULL); + if (ret < 0) { + obj_req->pending.result = ret; + return; } - /* fall through */ - case RBD_OBJ_WRITE_FLAT: - case RBD_OBJ_WRITE_COPYUP_OPS: - if (!obj_req->result) - /* - * There is no such thing as a successful short - * write -- indicate the whole request was satisfied. - */ - obj_req->xferred = obj_req->ex.oe_len; - return true; - case RBD_OBJ_WRITE_READ_FROM_PARENT: - if (obj_req->result) - return true; - rbd_assert(obj_req->xferred); - ret = rbd_obj_issue_copyup(obj_req, obj_req->xferred); + rbd_assert(!ret); + obj_req->pending.num_pending++; + } +} + +static void rbd_obj_copyup_write_object(struct rbd_obj_request *obj_req) +{ + u32 bytes = rbd_obj_img_extents_bytes(obj_req); + int ret; + + rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending); + + /* + * Only send non-zero copyup data to save some I/O and network + * bandwidth -- zero copyup data is equivalent to the object not + * existing. + */ + if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS) + bytes = 0; + + if (obj_req->img_request->snapc->num_snaps && bytes > 0) { + /* + * Send a copyup request with an empty snapshot context to + * deep-copyup the object through all existing snapshots. + * A second request with the current snapshot context will be + * sent for the actual modification. 
+ */ + ret = rbd_obj_copyup_empty_snapc(obj_req, bytes); + if (ret) { + obj_req->pending.result = ret; + return; + } + + obj_req->pending.num_pending++; + bytes = MODS_ONLY; + } + + ret = rbd_obj_copyup_current_snapc(obj_req, bytes); + if (ret) { + obj_req->pending.result = ret; + return; + } + + obj_req->pending.num_pending++; +} + +static bool rbd_obj_advance_copyup(struct rbd_obj_request *obj_req, int *result) +{ + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; + int ret; + +again: + switch (obj_req->copyup_state) { + case RBD_OBJ_COPYUP_START: + rbd_assert(!*result); + + ret = rbd_obj_copyup_read_parent(obj_req); if (ret) { - obj_req->result = ret; - obj_req->xferred = 0; + *result = ret; return true; } + if (obj_req->num_img_extents) + obj_req->copyup_state = RBD_OBJ_COPYUP_READ_PARENT; + else + obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT; return false; - case RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC: - if (obj_req->result) + case RBD_OBJ_COPYUP_READ_PARENT: + if (*result) return true; - obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS; - ret = rbd_obj_issue_copyup_ops(obj_req, MODS_ONLY); - if (ret) { - obj_req->result = ret; + if (is_zero_bvecs(obj_req->copyup_bvecs, + rbd_obj_img_extents_bytes(obj_req))) { + dout("%s %p detected zeros\n", __func__, obj_req); + obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ZEROS; + } + + rbd_obj_copyup_object_maps(obj_req); + if (!obj_req->pending.num_pending) { + *result = obj_req->pending.result; + obj_req->copyup_state = RBD_OBJ_COPYUP_OBJECT_MAPS; + goto again; + } + obj_req->copyup_state = __RBD_OBJ_COPYUP_OBJECT_MAPS; + return false; + case __RBD_OBJ_COPYUP_OBJECT_MAPS: + if (!pending_result_dec(&obj_req->pending, result)) + return false; + /* fall through */ + case RBD_OBJ_COPYUP_OBJECT_MAPS: + if (*result) { + rbd_warn(rbd_dev, "snap object map update failed: %d", + *result); return true; } + + rbd_obj_copyup_write_object(obj_req); + if (!obj_req->pending.num_pending) { + *result = obj_req->pending.result; + obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT; + goto again; + } + obj_req->copyup_state = __RBD_OBJ_COPYUP_WRITE_OBJECT; return false; + case __RBD_OBJ_COPYUP_WRITE_OBJECT: + if (!pending_result_dec(&obj_req->pending, result)) + return false; + /* fall through */ + case RBD_OBJ_COPYUP_WRITE_OBJECT: + return true; default: BUG(); } } /* - * Returns true if @obj_req is completed, or false otherwise. 
+ * Return: + * 0 - object map update sent + * 1 - object map update isn't needed + * <0 - error */ -static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req) +static int rbd_obj_write_post_object_map(struct rbd_obj_request *obj_req) { - switch (obj_req->img_request->op_type) { - case OBJ_OP_READ: - return rbd_obj_handle_read(obj_req); - case OBJ_OP_WRITE: - return rbd_obj_handle_write(obj_req); - case OBJ_OP_DISCARD: - case OBJ_OP_ZEROOUT: - if (rbd_obj_handle_write(obj_req)) { + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; + u8 current_state = OBJECT_PENDING; + + if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) + return 1; + + if (!(obj_req->flags & RBD_OBJ_FLAG_DELETION)) + return 1; + + return rbd_object_map_update(obj_req, CEPH_NOSNAP, OBJECT_NONEXISTENT, + ¤t_state); +} + +static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result) +{ + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; + int ret; + +again: + switch (obj_req->write_state) { + case RBD_OBJ_WRITE_START: + rbd_assert(!*result); + + if (rbd_obj_write_is_noop(obj_req)) + return true; + + ret = rbd_obj_write_pre_object_map(obj_req); + if (ret < 0) { + *result = ret; + return true; + } + obj_req->write_state = RBD_OBJ_WRITE_PRE_OBJECT_MAP; + if (ret > 0) + goto again; + return false; + case RBD_OBJ_WRITE_PRE_OBJECT_MAP: + if (*result) { + rbd_warn(rbd_dev, "pre object map update failed: %d", + *result); + return true; + } + ret = rbd_obj_write_object(obj_req); + if (ret) { + *result = ret; + return true; + } + obj_req->write_state = RBD_OBJ_WRITE_OBJECT; + return false; + case RBD_OBJ_WRITE_OBJECT: + if (*result == -ENOENT) { + if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) { + *result = 0; + obj_req->copyup_state = RBD_OBJ_COPYUP_START; + obj_req->write_state = __RBD_OBJ_WRITE_COPYUP; + goto again; + } /* - * Hide -ENOENT from delete/truncate/zero -- discarding - * a non-existent object is not a problem. + * On a non-existent object: + * delete - -ENOENT, truncate/zero - 0 */ - if (obj_req->result == -ENOENT) { - obj_req->result = 0; - obj_req->xferred = obj_req->ex.oe_len; - } + if (obj_req->flags & RBD_OBJ_FLAG_DELETION) + *result = 0; + } + if (*result) + return true; + + obj_req->write_state = RBD_OBJ_WRITE_COPYUP; + goto again; + case __RBD_OBJ_WRITE_COPYUP: + if (!rbd_obj_advance_copyup(obj_req, result)) + return false; + /* fall through */ + case RBD_OBJ_WRITE_COPYUP: + if (*result) { + rbd_warn(rbd_dev, "copyup failed: %d", *result); + return true; + } + ret = rbd_obj_write_post_object_map(obj_req); + if (ret < 0) { + *result = ret; return true; } + obj_req->write_state = RBD_OBJ_WRITE_POST_OBJECT_MAP; + if (ret > 0) + goto again; return false; + case RBD_OBJ_WRITE_POST_OBJECT_MAP: + if (*result) + rbd_warn(rbd_dev, "post object map update failed: %d", + *result); + return true; default: BUG(); } } -static void rbd_obj_end_request(struct rbd_obj_request *obj_req) +/* + * Return true if @obj_req is completed. 
+ */ +static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req, + int *result) { struct rbd_img_request *img_req = obj_req->img_request; + struct rbd_device *rbd_dev = img_req->rbd_dev; + bool done; - rbd_assert((!obj_req->result && - obj_req->xferred == obj_req->ex.oe_len) || - (obj_req->result < 0 && !obj_req->xferred)); - if (!obj_req->result) { - img_req->xferred += obj_req->xferred; - return; - } + mutex_lock(&obj_req->state_mutex); + if (!rbd_img_is_write(img_req)) + done = rbd_obj_advance_read(obj_req, result); + else + done = rbd_obj_advance_write(obj_req, result); + mutex_unlock(&obj_req->state_mutex); - rbd_warn(img_req->rbd_dev, - "%s at objno %llu %llu~%llu result %d xferred %llu", - obj_op_name(img_req->op_type), obj_req->ex.oe_objno, - obj_req->ex.oe_off, obj_req->ex.oe_len, obj_req->result, - obj_req->xferred); - if (!img_req->result) { - img_req->result = obj_req->result; - img_req->xferred = 0; + if (done && *result) { + rbd_assert(*result < 0); + rbd_warn(rbd_dev, "%s at objno %llu %llu~%llu result %d", + obj_op_name(img_req->op_type), obj_req->ex.oe_objno, + obj_req->ex.oe_off, obj_req->ex.oe_len, *result); } + return done; } -static void rbd_img_end_child_request(struct rbd_img_request *img_req) +/* + * This is open-coded in rbd_img_handle_request() to avoid parent chain + * recursion. + */ +static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result) { - struct rbd_obj_request *obj_req = img_req->obj_request; + if (__rbd_obj_handle_request(obj_req, &result)) + rbd_img_handle_request(obj_req->img_request, result); +} - rbd_assert(test_bit(IMG_REQ_CHILD, &img_req->flags)); - rbd_assert((!img_req->result && - img_req->xferred == rbd_obj_img_extents_bytes(obj_req)) || - (img_req->result < 0 && !img_req->xferred)); +static bool need_exclusive_lock(struct rbd_img_request *img_req) +{ + struct rbd_device *rbd_dev = img_req->rbd_dev; - obj_req->result = img_req->result; - obj_req->xferred = img_req->xferred; - rbd_img_request_put(img_req); + if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) + return false; + + if (rbd_dev->spec->snap_id != CEPH_NOSNAP) + return false; + + rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags)); + if (rbd_dev->opts->lock_on_read || + (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) + return true; + + return rbd_img_is_write(img_req); } -static void rbd_img_end_request(struct rbd_img_request *img_req) +static bool rbd_lock_add_request(struct rbd_img_request *img_req) { - rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags)); - rbd_assert((!img_req->result && - img_req->xferred == blk_rq_bytes(img_req->rq)) || - (img_req->result < 0 && !img_req->xferred)); + struct rbd_device *rbd_dev = img_req->rbd_dev; + bool locked; + + lockdep_assert_held(&rbd_dev->lock_rwsem); + locked = rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED; + spin_lock(&rbd_dev->lock_lists_lock); + rbd_assert(list_empty(&img_req->lock_item)); + if (!locked) + list_add_tail(&img_req->lock_item, &rbd_dev->acquiring_list); + else + list_add_tail(&img_req->lock_item, &rbd_dev->running_list); + spin_unlock(&rbd_dev->lock_lists_lock); + return locked; +} + +static void rbd_lock_del_request(struct rbd_img_request *img_req) +{ + struct rbd_device *rbd_dev = img_req->rbd_dev; + bool need_wakeup; - blk_mq_end_request(img_req->rq, - errno_to_blk_status(img_req->result)); - rbd_img_request_put(img_req); + lockdep_assert_held(&rbd_dev->lock_rwsem); + spin_lock(&rbd_dev->lock_lists_lock); + rbd_assert(!list_empty(&img_req->lock_item)); + 
list_del_init(&img_req->lock_item); + need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING && + list_empty(&rbd_dev->running_list)); + spin_unlock(&rbd_dev->lock_lists_lock); + if (need_wakeup) + complete(&rbd_dev->releasing_wait); } -static void rbd_obj_handle_request(struct rbd_obj_request *obj_req) +static int rbd_img_exclusive_lock(struct rbd_img_request *img_req) { - struct rbd_img_request *img_req; + struct rbd_device *rbd_dev = img_req->rbd_dev; + + if (!need_exclusive_lock(img_req)) + return 1; + + if (rbd_lock_add_request(img_req)) + return 1; + + if (rbd_dev->opts->exclusive) { + WARN_ON(1); /* lock got released? */ + return -EROFS; + } + + /* + * Note the use of mod_delayed_work() in rbd_acquire_lock() + * and cancel_delayed_work() in wake_lock_waiters(). + */ + dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev); + queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0); + return 0; +} + +static void rbd_img_object_requests(struct rbd_img_request *img_req) +{ + struct rbd_obj_request *obj_req; + + rbd_assert(!img_req->pending.result && !img_req->pending.num_pending); + + for_each_obj_request(img_req, obj_req) { + int result = 0; + + if (__rbd_obj_handle_request(obj_req, &result)) { + if (result) { + img_req->pending.result = result; + return; + } + } else { + img_req->pending.num_pending++; + } + } +} + +static bool rbd_img_advance(struct rbd_img_request *img_req, int *result) +{ + struct rbd_device *rbd_dev = img_req->rbd_dev; + int ret; again: - if (!__rbd_obj_handle_request(obj_req)) - return; + switch (img_req->state) { + case RBD_IMG_START: + rbd_assert(!*result); - img_req = obj_req->img_request; - spin_lock(&img_req->completion_lock); - rbd_obj_end_request(obj_req); - rbd_assert(img_req->pending_count); - if (--img_req->pending_count) { - spin_unlock(&img_req->completion_lock); - return; + ret = rbd_img_exclusive_lock(img_req); + if (ret < 0) { + *result = ret; + return true; + } + img_req->state = RBD_IMG_EXCLUSIVE_LOCK; + if (ret > 0) + goto again; + return false; + case RBD_IMG_EXCLUSIVE_LOCK: + if (*result) + return true; + + rbd_assert(!need_exclusive_lock(img_req) || + __rbd_is_lock_owner(rbd_dev)); + + rbd_img_object_requests(img_req); + if (!img_req->pending.num_pending) { + *result = img_req->pending.result; + img_req->state = RBD_IMG_OBJECT_REQUESTS; + goto again; + } + img_req->state = __RBD_IMG_OBJECT_REQUESTS; + return false; + case __RBD_IMG_OBJECT_REQUESTS: + if (!pending_result_dec(&img_req->pending, result)) + return false; + /* fall through */ + case RBD_IMG_OBJECT_REQUESTS: + return true; + default: + BUG(); + } +} + +/* + * Return true if @img_req is completed. + */ +static bool __rbd_img_handle_request(struct rbd_img_request *img_req, + int *result) +{ + struct rbd_device *rbd_dev = img_req->rbd_dev; + bool done; + + if (need_exclusive_lock(img_req)) { + down_read(&rbd_dev->lock_rwsem); + mutex_lock(&img_req->state_mutex); + done = rbd_img_advance(img_req, result); + if (done) + rbd_lock_del_request(img_req); + mutex_unlock(&img_req->state_mutex); + up_read(&rbd_dev->lock_rwsem); + } else { + mutex_lock(&img_req->state_mutex); + done = rbd_img_advance(img_req, result); + mutex_unlock(&img_req->state_mutex); + } + + if (done && *result) { + rbd_assert(*result < 0); + rbd_warn(rbd_dev, "%s%s result %d", + test_bit(IMG_REQ_CHILD, &img_req->flags) ? 
"child " : "", + obj_op_name(img_req->op_type), *result); } + return done; +} + +static void rbd_img_handle_request(struct rbd_img_request *img_req, int result) +{ +again: + if (!__rbd_img_handle_request(img_req, &result)) + return; - spin_unlock(&img_req->completion_lock); if (test_bit(IMG_REQ_CHILD, &img_req->flags)) { - obj_req = img_req->obj_request; - rbd_img_end_child_request(img_req); - goto again; + struct rbd_obj_request *obj_req = img_req->obj_request; + + rbd_img_request_put(img_req); + if (__rbd_obj_handle_request(obj_req, &result)) { + img_req = obj_req->img_request; + goto again; + } + } else { + struct request *rq = img_req->rq; + + rbd_img_request_put(img_req); + blk_mq_end_request(rq, errno_to_blk_status(result)); } - rbd_img_end_request(img_req); } static const struct rbd_client_id rbd_empty_cid; @@ -2839,6 +3772,7 @@ static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie) { struct rbd_client_id cid = rbd_get_cid(rbd_dev); + rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED; strcpy(rbd_dev->lock_cookie, cookie); rbd_set_owner_cid(rbd_dev, &cid); queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work); @@ -2863,7 +3797,6 @@ static int rbd_lock(struct rbd_device *rbd_dev) if (ret) return ret; - rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED; __rbd_lock(rbd_dev, cookie); return 0; } @@ -2882,7 +3815,7 @@ static void rbd_unlock(struct rbd_device *rbd_dev) ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, RBD_LOCK_NAME, rbd_dev->lock_cookie); if (ret && ret != -ENOENT) - rbd_warn(rbd_dev, "failed to unlock: %d", ret); + rbd_warn(rbd_dev, "failed to unlock header: %d", ret); /* treat errors as the image is unlocked */ rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED; @@ -3009,15 +3942,34 @@ e_inval: goto out; } -static void wake_requests(struct rbd_device *rbd_dev, bool wake_all) +/* + * Either image request state machine(s) or rbd_add_acquire_lock() + * (i.e. "rbd map"). 
+ */ +static void wake_lock_waiters(struct rbd_device *rbd_dev, int result) { - dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all); + struct rbd_img_request *img_req; + + dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result); + lockdep_assert_held_write(&rbd_dev->lock_rwsem); cancel_delayed_work(&rbd_dev->lock_dwork); - if (wake_all) - wake_up_all(&rbd_dev->lock_waitq); - else - wake_up(&rbd_dev->lock_waitq); + if (!completion_done(&rbd_dev->acquire_wait)) { + rbd_assert(list_empty(&rbd_dev->acquiring_list) && + list_empty(&rbd_dev->running_list)); + rbd_dev->acquire_err = result; + complete_all(&rbd_dev->acquire_wait); + return; + } + + list_for_each_entry(img_req, &rbd_dev->acquiring_list, lock_item) { + mutex_lock(&img_req->state_mutex); + rbd_assert(img_req->state == RBD_IMG_EXCLUSIVE_LOCK); + rbd_img_schedule(img_req, result); + mutex_unlock(&img_req->state_mutex); + } + + list_splice_tail_init(&rbd_dev->acquiring_list, &rbd_dev->running_list); } static int get_lock_owner_info(struct rbd_device *rbd_dev, @@ -3132,13 +4084,10 @@ static int rbd_try_lock(struct rbd_device *rbd_dev) goto again; ret = find_watcher(rbd_dev, lockers); - if (ret) { - if (ret > 0) - ret = 0; /* have to request lock */ - goto out; - } + if (ret) + goto out; /* request lock or error */ - rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock", + rbd_warn(rbd_dev, "breaking header lock owned by %s%llu", ENTITY_NAME(lockers[0].id.name)); ret = ceph_monc_blacklist_add(&client->monc, @@ -3165,53 +4114,90 @@ out: return ret; } +static int rbd_post_acquire_action(struct rbd_device *rbd_dev) +{ + int ret; + + if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) { + ret = rbd_object_map_open(rbd_dev); + if (ret) + return ret; + } + + return 0; +} + /* - * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED + * Return: + * 0 - lock acquired + * 1 - caller should call rbd_request_lock() + * <0 - error */ -static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev, - int *pret) +static int rbd_try_acquire_lock(struct rbd_device *rbd_dev) { - enum rbd_lock_state lock_state; + int ret; down_read(&rbd_dev->lock_rwsem); dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev, rbd_dev->lock_state); if (__rbd_is_lock_owner(rbd_dev)) { - lock_state = rbd_dev->lock_state; up_read(&rbd_dev->lock_rwsem); - return lock_state; + return 0; } up_read(&rbd_dev->lock_rwsem); down_write(&rbd_dev->lock_rwsem); dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev, rbd_dev->lock_state); - if (!__rbd_is_lock_owner(rbd_dev)) { - *pret = rbd_try_lock(rbd_dev); - if (*pret) - rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret); + if (__rbd_is_lock_owner(rbd_dev)) { + up_write(&rbd_dev->lock_rwsem); + return 0; + } + + ret = rbd_try_lock(rbd_dev); + if (ret < 0) { + rbd_warn(rbd_dev, "failed to lock header: %d", ret); + if (ret == -EBLACKLISTED) + goto out; + + ret = 1; /* request lock anyway */ + } + if (ret > 0) { + up_write(&rbd_dev->lock_rwsem); + return ret; + } + + rbd_assert(rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED); + rbd_assert(list_empty(&rbd_dev->running_list)); + + ret = rbd_post_acquire_action(rbd_dev); + if (ret) { + rbd_warn(rbd_dev, "post-acquire action failed: %d", ret); + /* + * Can't stay in RBD_LOCK_STATE_LOCKED because + * rbd_lock_add_request() would let the request through, + * assuming that e.g. object map is locked and loaded. 
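The hand-off at the end of wake_lock_waiters() leans on list_splice_tail_init(): it appends the whole acquiring_list to the tail of running_list and reinitialises acquiring_list to empty in one operation, so no request is ever on both lists and a later rbd_lock_add_request() starts from a clean list. A kernel-style sketch of the before/after states, not the committed code:

/* Sketch: state of the two lists around the hand-off in wake_lock_waiters().
 *
 *   before:  acquiring_list -> A -> B -> C     running_list -> X
 *   after:   acquiring_list (empty)            running_list -> X -> A -> B -> C
 */
static void promote_waiters(struct rbd_device *rbd_dev)
{
	lockdep_assert_held_write(&rbd_dev->lock_rwsem);

	/* tail splice keeps the original submission order of A, B, C */
	list_splice_tail_init(&rbd_dev->acquiring_list,
			      &rbd_dev->running_list);
}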
+ */ + rbd_unlock(rbd_dev); } - lock_state = rbd_dev->lock_state; +out: + wake_lock_waiters(rbd_dev, ret); up_write(&rbd_dev->lock_rwsem); - return lock_state; + return ret; } static void rbd_acquire_lock(struct work_struct *work) { struct rbd_device *rbd_dev = container_of(to_delayed_work(work), struct rbd_device, lock_dwork); - enum rbd_lock_state lock_state; - int ret = 0; + int ret; dout("%s rbd_dev %p\n", __func__, rbd_dev); again: - lock_state = rbd_try_acquire_lock(rbd_dev, &ret); - if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) { - if (lock_state == RBD_LOCK_STATE_LOCKED) - wake_requests(rbd_dev, true); - dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__, - rbd_dev, lock_state, ret); + ret = rbd_try_acquire_lock(rbd_dev); + if (ret <= 0) { + dout("%s rbd_dev %p ret %d - done\n", __func__, rbd_dev, ret); return; } @@ -3220,16 +4206,9 @@ again: goto again; /* treat this as a dead client */ } else if (ret == -EROFS) { rbd_warn(rbd_dev, "peer will not release lock"); - /* - * If this is rbd_add_acquire_lock(), we want to fail - * immediately -- reuse BLACKLISTED flag. Otherwise we - * want to block. - */ - if (!(rbd_dev->disk->flags & GENHD_FL_UP)) { - set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags); - /* wake "rbd map --exclusive" process */ - wake_requests(rbd_dev, false); - } + down_write(&rbd_dev->lock_rwsem); + wake_lock_waiters(rbd_dev, ret); + up_write(&rbd_dev->lock_rwsem); } else if (ret < 0) { rbd_warn(rbd_dev, "error requesting lock: %d", ret); mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, @@ -3246,43 +4225,67 @@ again: } } -/* - * lock_rwsem must be held for write - */ -static bool rbd_release_lock(struct rbd_device *rbd_dev) +static bool rbd_quiesce_lock(struct rbd_device *rbd_dev) { - dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev, - rbd_dev->lock_state); + bool need_wait; + + dout("%s rbd_dev %p\n", __func__, rbd_dev); + lockdep_assert_held_write(&rbd_dev->lock_rwsem); + if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED) return false; - rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING; - downgrade_write(&rbd_dev->lock_rwsem); /* * Ensure that all in-flight IO is flushed. - * - * FIXME: ceph_osdc_sync() flushes the entire OSD client, which - * may be shared with other devices. 
*/ - ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc); + rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING; + rbd_assert(!completion_done(&rbd_dev->releasing_wait)); + need_wait = !list_empty(&rbd_dev->running_list); + downgrade_write(&rbd_dev->lock_rwsem); + if (need_wait) + wait_for_completion(&rbd_dev->releasing_wait); up_read(&rbd_dev->lock_rwsem); down_write(&rbd_dev->lock_rwsem); - dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev, - rbd_dev->lock_state); if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING) return false; + rbd_assert(list_empty(&rbd_dev->running_list)); + return true; +} + +static void rbd_pre_release_action(struct rbd_device *rbd_dev) +{ + if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) + rbd_object_map_close(rbd_dev); +} + +static void __rbd_release_lock(struct rbd_device *rbd_dev) +{ + rbd_assert(list_empty(&rbd_dev->running_list)); + + rbd_pre_release_action(rbd_dev); rbd_unlock(rbd_dev); +} + +/* + * lock_rwsem must be held for write + */ +static void rbd_release_lock(struct rbd_device *rbd_dev) +{ + if (!rbd_quiesce_lock(rbd_dev)) + return; + + __rbd_release_lock(rbd_dev); + /* * Give others a chance to grab the lock - we would re-acquire - * almost immediately if we got new IO during ceph_osdc_sync() - * otherwise. We need to ack our own notifications, so this - * lock_dwork will be requeued from rbd_wait_state_locked() - * after wake_requests() in rbd_handle_released_lock(). + * almost immediately if we got new IO while draining the running + * list otherwise. We need to ack our own notifications, so this + * lock_dwork will be requeued from rbd_handle_released_lock() by + * way of maybe_kick_acquire(). */ cancel_delayed_work(&rbd_dev->lock_dwork); - return true; } static void rbd_release_lock_work(struct work_struct *work) @@ -3295,6 +4298,23 @@ static void rbd_release_lock_work(struct work_struct *work) up_write(&rbd_dev->lock_rwsem); } +static void maybe_kick_acquire(struct rbd_device *rbd_dev) +{ + bool have_requests; + + dout("%s rbd_dev %p\n", __func__, rbd_dev); + if (__rbd_is_lock_owner(rbd_dev)) + return; + + spin_lock(&rbd_dev->lock_lists_lock); + have_requests = !list_empty(&rbd_dev->acquiring_list); + spin_unlock(&rbd_dev->lock_lists_lock); + if (have_requests || delayed_work_pending(&rbd_dev->lock_dwork)) { + dout("%s rbd_dev %p kicking lock_dwork\n", __func__, rbd_dev); + mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0); + } +} + static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v, void **p) { @@ -3324,8 +4344,7 @@ static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v, down_read(&rbd_dev->lock_rwsem); } - if (!__rbd_is_lock_owner(rbd_dev)) - wake_requests(rbd_dev, false); + maybe_kick_acquire(rbd_dev); up_read(&rbd_dev->lock_rwsem); } @@ -3357,8 +4376,7 @@ static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v, down_read(&rbd_dev->lock_rwsem); } - if (!__rbd_is_lock_owner(rbd_dev)) - wake_requests(rbd_dev, false); + maybe_kick_acquire(rbd_dev); up_read(&rbd_dev->lock_rwsem); } @@ -3608,7 +4626,6 @@ static void cancel_tasks_sync(struct rbd_device *rbd_dev) static void rbd_unregister_watch(struct rbd_device *rbd_dev) { - WARN_ON(waitqueue_active(&rbd_dev->lock_waitq)); cancel_tasks_sync(rbd_dev); mutex_lock(&rbd_dev->watch_mutex); @@ -3630,7 +4647,8 @@ static void rbd_reacquire_lock(struct rbd_device *rbd_dev) char cookie[32]; int ret; - WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED); + if (!rbd_quiesce_lock(rbd_dev)) + return; 
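maybe_kick_acquire() and rbd_acquire_lock() rely on the different semantics of the two delayed-work helpers: queue_delayed_work() is a no-op if the work is already pending, while mod_delayed_work() re-arms the timer (here to 0, i.e. run as soon as possible) whether or not it was pending, and cancel_delayed_work() in wake_lock_waiters() drops a pending-but-not-yet-running instance. A minimal kernel-style sketch of the distinction, not part of the patch:

static void kick_lock_worker(struct rbd_device *rbd_dev, bool force_now)
{
	if (force_now)
		/* re-arm even if lock_dwork is already queued with a delay */
		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
	else
		/* harmless if lock_dwork is already pending */
		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
}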
format_lock_cookie(rbd_dev, cookie); ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid, @@ -3646,11 +4664,11 @@ static void rbd_reacquire_lock(struct rbd_device *rbd_dev) * Lock cookie cannot be updated on older OSDs, so do * a manual release and queue an acquire. */ - if (rbd_release_lock(rbd_dev)) - queue_delayed_work(rbd_dev->task_wq, - &rbd_dev->lock_dwork, 0); + __rbd_release_lock(rbd_dev); + queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0); } else { __rbd_lock(rbd_dev, cookie); + wake_lock_waiters(rbd_dev, 0); } } @@ -3671,15 +4689,18 @@ static void rbd_reregister_watch(struct work_struct *work) ret = __rbd_register_watch(rbd_dev); if (ret) { rbd_warn(rbd_dev, "failed to reregister watch: %d", ret); - if (ret == -EBLACKLISTED || ret == -ENOENT) { - set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags); - wake_requests(rbd_dev, true); - } else { + if (ret != -EBLACKLISTED && ret != -ENOENT) { queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, RBD_RETRY_DELAY); + mutex_unlock(&rbd_dev->watch_mutex); + return; } + mutex_unlock(&rbd_dev->watch_mutex); + down_write(&rbd_dev->lock_rwsem); + wake_lock_waiters(rbd_dev, ret); + up_write(&rbd_dev->lock_rwsem); return; } @@ -3742,7 +4763,7 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name, CEPH_OSD_FLAG_READ, req_page, outbound_size, - reply_page, &inbound_size); + &reply_page, &inbound_size); if (!ret) { memcpy(inbound, page_address(reply_page), inbound_size); ret = inbound_size; @@ -3754,54 +4775,6 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, return ret; } -/* - * lock_rwsem must be held for read - */ -static int rbd_wait_state_locked(struct rbd_device *rbd_dev, bool may_acquire) -{ - DEFINE_WAIT(wait); - unsigned long timeout; - int ret = 0; - - if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) - return -EBLACKLISTED; - - if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) - return 0; - - if (!may_acquire) { - rbd_warn(rbd_dev, "exclusive lock required"); - return -EROFS; - } - - do { - /* - * Note the use of mod_delayed_work() in rbd_acquire_lock() - * and cancel_delayed_work() in wake_requests(). 
- */ - dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev); - queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0); - prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait, - TASK_UNINTERRUPTIBLE); - up_read(&rbd_dev->lock_rwsem); - timeout = schedule_timeout(ceph_timeout_jiffies( - rbd_dev->opts->lock_timeout)); - down_read(&rbd_dev->lock_rwsem); - if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) { - ret = -EBLACKLISTED; - break; - } - if (!timeout) { - rbd_warn(rbd_dev, "timed out waiting for lock"); - ret = -ETIMEDOUT; - break; - } - } while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED); - - finish_wait(&rbd_dev->lock_waitq, &wait); - return ret; -} - static void rbd_queue_workfn(struct work_struct *work) { struct request *rq = blk_mq_rq_from_pdu(work); @@ -3812,7 +4785,6 @@ static void rbd_queue_workfn(struct work_struct *work) u64 length = blk_rq_bytes(rq); enum obj_operation_type op_type; u64 mapping_size; - bool must_be_locked; int result; switch (req_op(rq)) { @@ -3886,21 +4858,10 @@ static void rbd_queue_workfn(struct work_struct *work) goto err_rq; } - must_be_locked = - (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) && - (op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read); - if (must_be_locked) { - down_read(&rbd_dev->lock_rwsem); - result = rbd_wait_state_locked(rbd_dev, - !rbd_dev->opts->exclusive); - if (result) - goto err_unlock; - } - img_request = rbd_img_request_create(rbd_dev, op_type, snapc); if (!img_request) { result = -ENOMEM; - goto err_unlock; + goto err_rq; } img_request->rq = rq; snapc = NULL; /* img_request consumes a ref */ @@ -3910,19 +4871,14 @@ static void rbd_queue_workfn(struct work_struct *work) else result = rbd_img_fill_from_bio(img_request, offset, length, rq->bio); - if (result || !img_request->pending_count) + if (result) goto err_img_request; - rbd_img_request_submit(img_request); - if (must_be_locked) - up_read(&rbd_dev->lock_rwsem); + rbd_img_handle_request(img_request, 0); return; err_img_request: rbd_img_request_put(img_request); -err_unlock: - if (must_be_locked) - up_read(&rbd_dev->lock_rwsem); err_rq: if (result) rbd_warn(rbd_dev, "%s %llx at %llx result %d", @@ -4589,7 +5545,13 @@ static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc, INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock); INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock); INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work); - init_waitqueue_head(&rbd_dev->lock_waitq); + spin_lock_init(&rbd_dev->lock_lists_lock); + INIT_LIST_HEAD(&rbd_dev->acquiring_list); + INIT_LIST_HEAD(&rbd_dev->running_list); + init_completion(&rbd_dev->acquire_wait); + init_completion(&rbd_dev->releasing_wait); + + spin_lock_init(&rbd_dev->object_map_lock); rbd_dev->dev.bus = &rbd_bus_type; rbd_dev->dev.type = &rbd_device_type; @@ -4772,6 +5734,32 @@ static int rbd_dev_v2_features(struct rbd_device *rbd_dev) &rbd_dev->header.features); } +/* + * These are generic image flags, but since they are used only for + * object map, store them in rbd_dev->object_map_flags. + * + * For the same reason, this function is called only on object map + * (re)load and not on header refresh. 
+ */ +static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev) +{ + __le64 snapid = cpu_to_le64(rbd_dev->spec->snap_id); + __le64 flags; + int ret; + + ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, + &rbd_dev->header_oloc, "get_flags", + &snapid, sizeof(snapid), + &flags, sizeof(flags)); + if (ret < 0) + return ret; + if (ret < sizeof(flags)) + return -EBADMSG; + + rbd_dev->object_map_flags = le64_to_cpu(flags); + return 0; +} + struct parent_image_info { u64 pool_id; const char *pool_ns; @@ -4829,7 +5817,7 @@ static int __get_parent_info(struct rbd_device *rbd_dev, ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, "rbd", "parent_get", CEPH_OSD_FLAG_READ, - req_page, sizeof(u64), reply_page, &reply_len); + req_page, sizeof(u64), &reply_page, &reply_len); if (ret) return ret == -EOPNOTSUPP ? 1 : ret; @@ -4841,7 +5829,7 @@ static int __get_parent_info(struct rbd_device *rbd_dev, ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ, - req_page, sizeof(u64), reply_page, &reply_len); + req_page, sizeof(u64), &reply_page, &reply_len); if (ret) return ret; @@ -4872,7 +5860,7 @@ static int __get_parent_info_legacy(struct rbd_device *rbd_dev, ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, "rbd", "get_parent", CEPH_OSD_FLAG_READ, - req_page, sizeof(u64), reply_page, &reply_len); + req_page, sizeof(u64), &reply_page, &reply_len); if (ret) return ret; @@ -5605,28 +6593,49 @@ static void rbd_dev_image_unlock(struct rbd_device *rbd_dev) { down_write(&rbd_dev->lock_rwsem); if (__rbd_is_lock_owner(rbd_dev)) - rbd_unlock(rbd_dev); + __rbd_release_lock(rbd_dev); up_write(&rbd_dev->lock_rwsem); } +/* + * If the wait is interrupted, an error is returned even if the lock + * was successfully acquired. rbd_dev_image_unlock() will release it + * if needed. + */ static int rbd_add_acquire_lock(struct rbd_device *rbd_dev) { - int ret; + long ret; if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) { + if (!rbd_dev->opts->exclusive && !rbd_dev->opts->lock_on_read) + return 0; + rbd_warn(rbd_dev, "exclusive-lock feature is not enabled"); return -EINVAL; } - /* FIXME: "rbd map --exclusive" should be in interruptible */ - down_read(&rbd_dev->lock_rwsem); - ret = rbd_wait_state_locked(rbd_dev, true); - up_read(&rbd_dev->lock_rwsem); + if (rbd_dev->spec->snap_id != CEPH_NOSNAP) + return 0; + + rbd_assert(!rbd_is_lock_owner(rbd_dev)); + queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0); + ret = wait_for_completion_killable_timeout(&rbd_dev->acquire_wait, + ceph_timeout_jiffies(rbd_dev->opts->lock_timeout)); + if (ret > 0) + ret = rbd_dev->acquire_err; + else if (!ret) + ret = -ETIMEDOUT; + if (ret) { - rbd_warn(rbd_dev, "failed to acquire exclusive lock"); - return -EROFS; + rbd_warn(rbd_dev, "failed to acquire exclusive lock: %ld", ret); + return ret; } + /* + * The lock may have been released by now, unless automatic lock + * transitions are disabled. 
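rbd_add_acquire_lock() above has to untangle the three outcomes of wait_for_completion_killable_timeout(): a positive return is the remaining time (the completion fired, so acquire_err holds the real status), 0 means lock_timeout expired, and a negative value (-ERESTARTSYS) means the process received a fatal signal. The same decoding in isolation, as a sketch:

/* Sketch of decoding wait_for_completion_killable_timeout()'s return value. */
static int wait_for_acquire(struct rbd_device *rbd_dev, unsigned long timeout)
{
	long ret = wait_for_completion_killable_timeout(&rbd_dev->acquire_wait,
							timeout);
	if (ret > 0)
		return rbd_dev->acquire_err;	/* completed: real result */
	if (ret == 0)
		return -ETIMEDOUT;		/* timer expired first */
	return ret;				/* -ERESTARTSYS: fatal signal */
}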
+ */ + rbd_assert(!rbd_dev->opts->exclusive || rbd_is_lock_owner(rbd_dev)); return 0; } @@ -5724,6 +6733,8 @@ static void rbd_dev_unprobe(struct rbd_device *rbd_dev) struct rbd_image_header *header; rbd_dev_parent_put(rbd_dev); + rbd_object_map_free(rbd_dev); + rbd_dev_mapping_clear(rbd_dev); /* Free dynamic fields from the header, then zero it out */ @@ -5824,7 +6835,6 @@ out_err: static void rbd_dev_device_release(struct rbd_device *rbd_dev) { clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); - rbd_dev_mapping_clear(rbd_dev); rbd_free_disk(rbd_dev); if (!single_major) unregister_blkdev(rbd_dev->major, rbd_dev->name); @@ -5858,23 +6868,17 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev) if (ret) goto err_out_blkdev; - ret = rbd_dev_mapping_set(rbd_dev); - if (ret) - goto err_out_disk; - set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only); ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id); if (ret) - goto err_out_mapping; + goto err_out_disk; set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); up_write(&rbd_dev->header_rwsem); return 0; -err_out_mapping: - rbd_dev_mapping_clear(rbd_dev); err_out_disk: rbd_free_disk(rbd_dev); err_out_blkdev: @@ -5975,6 +6979,17 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) goto err_out_probe; } + ret = rbd_dev_mapping_set(rbd_dev); + if (ret) + goto err_out_probe; + + if (rbd_dev->spec->snap_id != CEPH_NOSNAP && + (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) { + ret = rbd_object_map_load(rbd_dev); + if (ret) + goto err_out_probe; + } + if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { ret = rbd_dev_v2_parent_info(rbd_dev); if (ret) @@ -6071,11 +7086,9 @@ static ssize_t do_rbd_add(struct bus_type *bus, if (rc) goto err_out_image_probe; - if (rbd_dev->opts->exclusive) { - rc = rbd_add_acquire_lock(rbd_dev); - if (rc) - goto err_out_device_setup; - } + rc = rbd_add_acquire_lock(rbd_dev); + if (rc) + goto err_out_image_lock; /* Everything's ready. Announce the disk to the world. */ @@ -6101,7 +7114,6 @@ out: err_out_image_lock: rbd_dev_image_unlock(rbd_dev); -err_out_device_setup: rbd_dev_device_release(rbd_dev); err_out_image_probe: rbd_dev_image_release(rbd_dev); diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h index 62ff50d3e7a6..ac98ab6ccd3b 100644 --- a/drivers/block/rbd_types.h +++ b/drivers/block/rbd_types.h @@ -18,6 +18,7 @@ /* For format version 2, rbd image 'foo' consists of objects * rbd_id.foo - id of image * rbd_header.<id> - image metadata + * rbd_object_map.<id> - optional image object map * rbd_data.<id>.0000000000000000 * rbd_data.<id>.0000000000000001 * ... - data @@ -25,6 +26,7 @@ */ #define RBD_HEADER_PREFIX "rbd_header." +#define RBD_OBJECT_MAP_PREFIX "rbd_object_map." #define RBD_ID_PREFIX "rbd_id." 
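rbd_object_map_open()/_load(), referenced in the probe and lock paths above but not shown in this diff, operate on the per-image object introduced in rbd_types.h below, "rbd_object_map.<id>", plus a snapshot suffix for mapped snapshots. A userspace-style sketch of constructing that object name from the image id; the ".%016llx" snapshot suffix mirrors the data-object format and is an assumption here, not something this hunk spells out:

#include <stdio.h>

#define RBD_OBJECT_MAP_PREFIX "rbd_object_map."

/* Sketch: object map object name, optionally qualified by a snapshot id. */
static void object_map_name(char *buf, size_t len, const char *image_id,
			    unsigned long long snap_id, int is_snap)
{
	if (is_snap)
		snprintf(buf, len, "%s%s.%016llx", RBD_OBJECT_MAP_PREFIX,
			 image_id, snap_id);
	else
		snprintf(buf, len, "%s%s", RBD_OBJECT_MAP_PREFIX, image_id);
}

int main(void)
{
	char name[128];

	object_map_name(name, sizeof(name), "1234567890ab", 0, 0);
	printf("%s\n", name);	/* rbd_object_map.1234567890ab */
	return 0;
}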
#define RBD_V2_DATA_FORMAT "%s.%016llx" @@ -39,6 +41,14 @@ enum rbd_notify_op { RBD_NOTIFY_OP_HEADER_UPDATE = 3, }; +#define OBJECT_NONEXISTENT 0 +#define OBJECT_EXISTS 1 +#define OBJECT_PENDING 2 +#define OBJECT_EXISTS_CLEAN 3 + +#define RBD_FLAG_OBJECT_MAP_INVALID (1ULL << 0) +#define RBD_FLAG_FAST_DIFF_INVALID (1ULL << 1) + /* * For format version 1, rbd image 'foo' consists of objects * foo.rbd - image metadata diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index 2109cfe80219..8fafbeab510a 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -295,6 +295,22 @@ static ssize_t target_node_show(struct device *dev, } static DEVICE_ATTR_RO(target_node); +static unsigned long long dev_dax_resource(struct dev_dax *dev_dax) +{ + struct dax_region *dax_region = dev_dax->region; + + return dax_region->res.start; +} + +static ssize_t resource_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + + return sprintf(buf, "%#llx\n", dev_dax_resource(dev_dax)); +} +static DEVICE_ATTR_RO(resource); + static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -313,6 +329,8 @@ static umode_t dev_dax_visible(struct kobject *kobj, struct attribute *a, int n) if (a == &dev_attr_target_node.attr && dev_dax_target_node(dev_dax) < 0) return 0; + if (a == &dev_attr_resource.attr) + return 0400; return a->mode; } @@ -320,6 +338,7 @@ static struct attribute *dev_dax_attributes[] = { &dev_attr_modalias.attr, &dev_attr_size.attr, &dev_attr_target_node.attr, + &dev_attr_resource.attr, NULL, }; @@ -388,7 +407,7 @@ struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id, * No 'host' or dax_operations since there is no access to this * device outside of mmap of the resulting character device. 
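The new 'resource' attribute on dax devices is deliberately exposed with mode 0400 (see dev_dax_visible() above), so only root can read the region's physical start address. A small userspace sketch of consuming it; the dax0.0 device name and the sysfs path are illustrative assumptions:

#include <stdio.h>

int main(void)
{
	/* hypothetical device instance; adjust to an actual daxX.Y */
	const char *path = "/sys/bus/dax/devices/dax0.0/resource";
	unsigned long long start;
	FILE *f = fopen(path, "r");	/* requires root: the attribute is 0400 */

	if (!f || fscanf(f, "%llx", &start) != 1) {
		perror(path);
		return 1;
	}
	printf("dax region starts at %#llx\n", start);
	fclose(f);
	return 0;
}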
*/ - dax_dev = alloc_dax(dev_dax, NULL, NULL); + dax_dev = alloc_dax(dev_dax, NULL, NULL, DAXDEV_F_SYNC); if (!dax_dev) goto err; diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 4e5ae7e8b557..8ab12068eea3 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -195,6 +195,8 @@ enum dax_device_flags { DAXDEV_ALIVE, /* gate whether dax_flush() calls the low level flush routine */ DAXDEV_WRITE_CACHE, + /* flag to check if device supports synchronous flush */ + DAXDEV_SYNC, }; /** @@ -372,6 +374,18 @@ bool dax_write_cache_enabled(struct dax_device *dax_dev) } EXPORT_SYMBOL_GPL(dax_write_cache_enabled); +bool __dax_synchronous(struct dax_device *dax_dev) +{ + return test_bit(DAXDEV_SYNC, &dax_dev->flags); +} +EXPORT_SYMBOL_GPL(__dax_synchronous); + +void __set_dax_synchronous(struct dax_device *dax_dev) +{ + set_bit(DAXDEV_SYNC, &dax_dev->flags); +} +EXPORT_SYMBOL_GPL(__set_dax_synchronous); + bool dax_alive(struct dax_device *dax_dev) { lockdep_assert_held(&dax_srcu); @@ -526,7 +540,7 @@ static void dax_add_host(struct dax_device *dax_dev, const char *host) } struct dax_device *alloc_dax(void *private, const char *__host, - const struct dax_operations *ops) + const struct dax_operations *ops, unsigned long flags) { struct dax_device *dax_dev; const char *host; @@ -549,6 +563,9 @@ struct dax_device *alloc_dax(void *private, const char *__host, dax_add_host(dax_dev, host); dax_dev->ops = ops; dax_dev->private = private; + if (flags & DAXDEV_F_SYNC) + set_dax_synchronous(dax_dev); + return dax_dev; err_dev: diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 671c24332802..df2011de7be2 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -28,10 +28,27 @@ #include "dm-core.h" -#define SUB_JOB_SIZE 128 #define SPLIT_COUNT 8 #define MIN_JOBS 8 -#define RESERVE_PAGES (DIV_ROUND_UP(SUB_JOB_SIZE << SECTOR_SHIFT, PAGE_SIZE)) + +#define DEFAULT_SUB_JOB_SIZE_KB 512 +#define MAX_SUB_JOB_SIZE_KB 1024 + +static unsigned kcopyd_subjob_size_kb = DEFAULT_SUB_JOB_SIZE_KB; + +module_param(kcopyd_subjob_size_kb, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(kcopyd_subjob_size_kb, "Sub-job size for dm-kcopyd clients"); + +static unsigned dm_get_kcopyd_subjob_size(void) +{ + unsigned sub_job_size_kb; + + sub_job_size_kb = __dm_get_module_param(&kcopyd_subjob_size_kb, + DEFAULT_SUB_JOB_SIZE_KB, + MAX_SUB_JOB_SIZE_KB); + + return sub_job_size_kb << 1; +} /*----------------------------------------------------------------- * Each kcopyd client has its own little pool of preallocated @@ -41,6 +58,7 @@ struct dm_kcopyd_client { struct page_list *pages; unsigned nr_reserved_pages; unsigned nr_free_pages; + unsigned sub_job_size; struct dm_io_client *io_client; @@ -693,8 +711,8 @@ static void segment_complete(int read_err, unsigned long write_err, progress = job->progress; count = job->source.count - progress; if (count) { - if (count > SUB_JOB_SIZE) - count = SUB_JOB_SIZE; + if (count > kc->sub_job_size) + count = kc->sub_job_size; job->progress += count; } @@ -821,7 +839,7 @@ void dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, job->master_job = job; job->write_offset = 0; - if (job->source.count <= SUB_JOB_SIZE) + if (job->source.count <= kc->sub_job_size) dispatch_job(job); else { job->progress = 0; @@ -888,6 +906,7 @@ int kcopyd_cancel(struct kcopyd_job *job, int block) struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *throttle) { int r; + unsigned reserve_pages; struct dm_kcopyd_client *kc; kc = kzalloc(sizeof(*kc), 
GFP_KERNEL); @@ -912,9 +931,12 @@ struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *thro goto bad_workqueue; } + kc->sub_job_size = dm_get_kcopyd_subjob_size(); + reserve_pages = DIV_ROUND_UP(kc->sub_job_size << SECTOR_SHIFT, PAGE_SIZE); + kc->pages = NULL; kc->nr_reserved_pages = kc->nr_free_pages = 0; - r = client_reserve_pages(kc, RESERVE_PAGES); + r = client_reserve_pages(kc, reserve_pages); if (r) goto bad_client_pages; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 63916e1dc569..f150f5c5492b 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -2072,6 +2072,12 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_REMAPPED; } + if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) { + /* Once merging, discards no longer effect change */ + bio_endio(bio); + return DM_MAPIO_SUBMITTED; + } + chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector); down_write(&s->lock); @@ -2331,6 +2337,8 @@ static void snapshot_io_hints(struct dm_target *ti, struct queue_limits *limits) if (snap->discard_zeroes_cow) { struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; + down_read(&_origins_lock); + (void) __find_snapshots_sharing_cow(snap, &snap_src, &snap_dest, NULL); if (snap_src && snap_dest) snap = snap_src; @@ -2338,6 +2346,8 @@ static void snapshot_io_hints(struct dm_target *ti, struct queue_limits *limits) /* All discards are split on chunk_size boundary */ limits->discard_granularity = snap->store->chunk_size; limits->max_discard_sectors = snap->store->chunk_size; + + up_read(&_origins_lock); } } diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index ec8b27e20de3..caaee8032afe 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -881,7 +881,7 @@ void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type) EXPORT_SYMBOL_GPL(dm_table_set_type); /* validate the dax capability of the target device span */ -static int device_supports_dax(struct dm_target *ti, struct dm_dev *dev, +int device_supports_dax(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { int blocksize = *(int *) data; @@ -890,7 +890,15 @@ static int device_supports_dax(struct dm_target *ti, struct dm_dev *dev, start, len); } -bool dm_table_supports_dax(struct dm_table *t, int blocksize) +/* Check devices support synchronous DAX */ +static int device_synchronous(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + return dax_synchronous(dev->dax_dev); +} + +bool dm_table_supports_dax(struct dm_table *t, + iterate_devices_callout_fn iterate_fn, int *blocksize) { struct dm_target *ti; unsigned i; @@ -903,8 +911,7 @@ bool dm_table_supports_dax(struct dm_table *t, int blocksize) return false; if (!ti->type->iterate_devices || - !ti->type->iterate_devices(ti, device_supports_dax, - &blocksize)) + !ti->type->iterate_devices(ti, iterate_fn, blocksize)) return false; } @@ -940,6 +947,7 @@ static int dm_table_determine_type(struct dm_table *t) struct dm_target *tgt; struct list_head *devices = dm_table_get_devices(t); enum dm_queue_mode live_md_type = dm_get_md_type(t->md); + int page_size = PAGE_SIZE; if (t->type != DM_TYPE_NONE) { /* target already set the table's type */ @@ -984,7 +992,7 @@ static int dm_table_determine_type(struct dm_table *t) verify_bio_based: /* We must use this table as bio-based */ t->type = DM_TYPE_BIO_BASED; - if (dm_table_supports_dax(t, PAGE_SIZE) || + if (dm_table_supports_dax(t, device_supports_dax, &page_size) || 
(list_empty(devices) && live_md_type == DM_TYPE_DAX_BIO_BASED)) { t->type = DM_TYPE_DAX_BIO_BASED; } else { @@ -1883,6 +1891,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *limits) { bool wc = false, fua = false; + int page_size = PAGE_SIZE; /* * Copy table's limits to the DM device's request_queue @@ -1910,8 +1919,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, } blk_queue_write_cache(q, wc, fua); - if (dm_table_supports_dax(t, PAGE_SIZE)) + if (dm_table_supports_dax(t, device_supports_dax, &page_size)) { blk_queue_flag_set(QUEUE_FLAG_DAX, q); + if (dm_table_supports_dax(t, device_synchronous, NULL)) + set_dax_synchronous(t->md->dax_dev); + } else blk_queue_flag_clear(QUEUE_FLAG_DAX, q); diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 9faf3e49c7af..8545dcee9fd0 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1602,30 +1602,6 @@ struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd) } /* - * Activate a zone (increment its reference count). - */ -void dmz_activate_zone(struct dm_zone *zone) -{ - set_bit(DMZ_ACTIVE, &zone->flags); - atomic_inc(&zone->refcount); -} - -/* - * Deactivate a zone. This decrement the zone reference counter - * and clears the active state of the zone once the count reaches 0, - * indicating that all BIOs to the zone have completed. Returns - * true if the zone was deactivated. - */ -void dmz_deactivate_zone(struct dm_zone *zone) -{ - if (atomic_dec_and_test(&zone->refcount)) { - WARN_ON(!test_bit(DMZ_ACTIVE, &zone->flags)); - clear_bit_unlock(DMZ_ACTIVE, &zone->flags); - smp_mb__after_atomic(); - } -} - -/* * Get the zone mapping a chunk, if the chunk is mapped already. * If no mapping exist and the operation is WRITE, a zone is * allocated and used to map the chunk. diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index 12419f0bfe78..ed8de49c9a08 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -115,7 +115,6 @@ enum { DMZ_BUF, /* Zone internal state */ - DMZ_ACTIVE, DMZ_RECLAIM, DMZ_SEQ_WRITE_ERR, }; @@ -128,7 +127,6 @@ enum { #define dmz_is_empty(z) ((z)->wp_block == 0) #define dmz_is_offline(z) test_bit(DMZ_OFFLINE, &(z)->flags) #define dmz_is_readonly(z) test_bit(DMZ_READ_ONLY, &(z)->flags) -#define dmz_is_active(z) test_bit(DMZ_ACTIVE, &(z)->flags) #define dmz_in_reclaim(z) test_bit(DMZ_RECLAIM, &(z)->flags) #define dmz_seq_write_err(z) test_bit(DMZ_SEQ_WRITE_ERR, &(z)->flags) @@ -188,8 +186,30 @@ void dmz_unmap_zone(struct dmz_metadata *zmd, struct dm_zone *zone); unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd); unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd); -void dmz_activate_zone(struct dm_zone *zone); -void dmz_deactivate_zone(struct dm_zone *zone); +/* + * Activate a zone (increment its reference count). + */ +static inline void dmz_activate_zone(struct dm_zone *zone) +{ + atomic_inc(&zone->refcount); +} + +/* + * Deactivate a zone. This decrement the zone reference counter + * indicating that all BIOs to the zone have completed when the count is 0. + */ +static inline void dmz_deactivate_zone(struct dm_zone *zone) +{ + atomic_dec(&zone->refcount); +} + +/* + * Test if a zone is active, that is, has a refcount > 0. 
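The dm-zoned change above drops the DMZ_ACTIVE bit that was set and cleared around a separate reference count: a zone is now active exactly while its refcount is non-zero, which removes any window where the flag and the counter could disagree. A standalone illustration of the same idea using C11 atomics:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct zone {
	atomic_int refcount;	/* number of BIOs currently targeting the zone */
};

static void zone_activate(struct zone *z)   { atomic_fetch_add(&z->refcount, 1); }
static void zone_deactivate(struct zone *z) { atomic_fetch_sub(&z->refcount, 1); }

static bool zone_is_active(const struct zone *z)
{
	/* no separate "active" flag to keep in sync with the counter */
	return atomic_load(&z->refcount) > 0;
}

int main(void)
{
	struct zone z = { .refcount = 0 };

	zone_activate(&z);
	printf("active: %d\n", zone_is_active(&z));	/* 1 */
	zone_deactivate(&z);
	printf("active: %d\n", zone_is_active(&z));	/* 0 */
	return 0;
}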
+ */ +static inline bool dmz_is_active(struct dm_zone *zone) +{ + return atomic_read(&zone->refcount); +} int dmz_lock_zone_reclaim(struct dm_zone *zone); void dmz_unlock_zone_reclaim(struct dm_zone *zone); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 61f1152b74e9..d0beef033e2f 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1117,7 +1117,7 @@ static bool dm_dax_supported(struct dax_device *dax_dev, struct block_device *bd if (!map) return false; - ret = dm_table_supports_dax(map, blocksize); + ret = dm_table_supports_dax(map, device_supports_dax, &blocksize); dm_put_live_table(md, srcu_idx); @@ -1989,7 +1989,8 @@ static struct mapped_device *alloc_dev(int minor) sprintf(md->disk->disk_name, "dm-%d", minor); if (IS_ENABLED(CONFIG_DAX_DRIVER)) { - md->dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops); + md->dax_dev = alloc_dax(md, md->disk->disk_name, + &dm_dax_ops, 0); if (!md->dax_dev) goto bad; } diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 17e3db54404c..0475673337f3 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -72,7 +72,10 @@ bool dm_table_bio_based(struct dm_table *t); bool dm_table_request_based(struct dm_table *t); void dm_table_free_md_mempools(struct dm_table *t); struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); -bool dm_table_supports_dax(struct dm_table *t, int blocksize); +bool dm_table_supports_dax(struct dm_table *t, iterate_devices_callout_fn fn, + int *blocksize); +int device_supports_dax(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data); void dm_lock_md_type(struct mapped_device *md); void dm_unlock_md_type(struct mapped_device *md); diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile index 6f2a088afad6..cefe233e0b52 100644 --- a/drivers/nvdimm/Makefile +++ b/drivers/nvdimm/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_ND_BTT) += nd_btt.o obj-$(CONFIG_ND_BLK) += nd_blk.o obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o obj-$(CONFIG_OF_PMEM) += of_pmem.o +obj-$(CONFIG_VIRTIO_PMEM) += virtio_pmem.o nd_virtio.o nd_pmem-y := pmem.o diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c index 26c1c7618891..2985ca949912 100644 --- a/drivers/nvdimm/claim.c +++ b/drivers/nvdimm/claim.c @@ -255,7 +255,7 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns, struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512); sector_t sector = offset >> 9; - int rc = 0; + int rc = 0, ret = 0; if (unlikely(!size)) return 0; @@ -293,7 +293,9 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns, } memcpy_flushcache(nsio->addr + offset, buf, size); - nvdimm_flush(to_nd_region(ndns->dev.parent)); + ret = nvdimm_flush(to_nd_region(ndns->dev.parent), NULL); + if (ret) + rc = ret; return rc; } diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index a434a5964cb9..2d8d7e554877 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -1822,8 +1822,8 @@ static bool has_uuid_at_pos(struct nd_region *nd_region, u8 *uuid, && !guid_equal(&nd_set->type_guid, &nd_label->type_guid)) { dev_dbg(ndd->dev, "expect type_guid %pUb got %pUb\n", - nd_set->type_guid.b, - nd_label->type_guid.b); + &nd_set->type_guid, + &nd_label->type_guid); continue; } @@ -2227,8 +2227,8 @@ static struct device *create_namespace_blk(struct nd_region *nd_region, if (namespace_label_has(ndd, type_guid)) { if (!guid_equal(&nd_set->type_guid, &nd_label->type_guid)) { dev_dbg(ndd->dev, "expect type_guid 
%pUb got %pUb\n", - nd_set->type_guid.b, - nd_label->type_guid.b); + &nd_set->type_guid, + &nd_label->type_guid); return ERR_PTR(-EAGAIN); } diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index d24304c0e6d7..1b9955651379 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -155,6 +155,7 @@ struct nd_region { struct badblocks bb; struct nd_interleave_set *nd_set; struct nd_percpu_lane __percpu *lane; + int (*flush)(struct nd_region *nd_region, struct bio *bio); struct nd_mapping mapping[0]; }; diff --git a/drivers/nvdimm/nd_virtio.c b/drivers/nvdimm/nd_virtio.c new file mode 100644 index 000000000000..10351d5b49fa --- /dev/null +++ b/drivers/nvdimm/nd_virtio.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * virtio_pmem.c: Virtio pmem Driver + * + * Discovers persistent memory range information + * from host and provides a virtio based flushing + * interface. + */ +#include "virtio_pmem.h" +#include "nd.h" + + /* The interrupt handler */ +void virtio_pmem_host_ack(struct virtqueue *vq) +{ + struct virtio_pmem *vpmem = vq->vdev->priv; + struct virtio_pmem_request *req_data, *req_buf; + unsigned long flags; + unsigned int len; + + spin_lock_irqsave(&vpmem->pmem_lock, flags); + while ((req_data = virtqueue_get_buf(vq, &len)) != NULL) { + req_data->done = true; + wake_up(&req_data->host_acked); + + if (!list_empty(&vpmem->req_list)) { + req_buf = list_first_entry(&vpmem->req_list, + struct virtio_pmem_request, list); + req_buf->wq_buf_avail = true; + wake_up(&req_buf->wq_buf); + list_del(&req_buf->list); + } + } + spin_unlock_irqrestore(&vpmem->pmem_lock, flags); +} +EXPORT_SYMBOL_GPL(virtio_pmem_host_ack); + + /* The request submission function */ +static int virtio_pmem_flush(struct nd_region *nd_region) +{ + struct virtio_device *vdev = nd_region->provider_data; + struct virtio_pmem *vpmem = vdev->priv; + struct virtio_pmem_request *req_data; + struct scatterlist *sgs[2], sg, ret; + unsigned long flags; + int err, err1; + + might_sleep(); + req_data = kmalloc(sizeof(*req_data), GFP_KERNEL); + if (!req_data) + return -ENOMEM; + + req_data->done = false; + init_waitqueue_head(&req_data->host_acked); + init_waitqueue_head(&req_data->wq_buf); + INIT_LIST_HEAD(&req_data->list); + req_data->req.type = cpu_to_le32(VIRTIO_PMEM_REQ_TYPE_FLUSH); + sg_init_one(&sg, &req_data->req, sizeof(req_data->req)); + sgs[0] = &sg; + sg_init_one(&ret, &req_data->resp.ret, sizeof(req_data->resp)); + sgs[1] = &ret; + + spin_lock_irqsave(&vpmem->pmem_lock, flags); + /* + * If virtqueue_add_sgs returns -ENOSPC then req_vq virtual + * queue does not have free descriptor. We add the request + * to req_list and wait for host_ack to wake us up when free + * slots are available. + */ + while ((err = virtqueue_add_sgs(vpmem->req_vq, sgs, 1, 1, req_data, + GFP_ATOMIC)) == -ENOSPC) { + + dev_info(&vdev->dev, "failed to send command to virtio pmem device, no free slots in the virtqueue\n"); + req_data->wq_buf_avail = false; + list_add_tail(&req_data->list, &vpmem->req_list); + spin_unlock_irqrestore(&vpmem->pmem_lock, flags); + + /* A host response results in "host_ack" getting called */ + wait_event(req_data->wq_buf, req_data->wq_buf_avail); + spin_lock_irqsave(&vpmem->pmem_lock, flags); + } + err1 = virtqueue_kick(vpmem->req_vq); + spin_unlock_irqrestore(&vpmem->pmem_lock, flags); + /* + * virtqueue_add_sgs failed with error different than -ENOSPC, we can't + * do anything about that. 
+ */ + if (err || !err1) { + dev_info(&vdev->dev, "failed to send command to virtio pmem device\n"); + err = -EIO; + } else { + /* A host repsonse results in "host_ack" getting called */ + wait_event(req_data->host_acked, req_data->done); + err = le32_to_cpu(req_data->resp.ret); + } + + kfree(req_data); + return err; +}; + +/* The asynchronous flush callback function */ +int async_pmem_flush(struct nd_region *nd_region, struct bio *bio) +{ + /* + * Create child bio for asynchronous flush and chain with + * parent bio. Otherwise directly call nd_region flush. + */ + if (bio && bio->bi_iter.bi_sector != -1) { + struct bio *child = bio_alloc(GFP_ATOMIC, 0); + + if (!child) + return -ENOMEM; + bio_copy_dev(child, bio); + child->bi_opf = REQ_PREFLUSH; + child->bi_iter.bi_sector = -1; + bio_chain(child, bio); + submit_bio(child); + return 0; + } + if (virtio_pmem_flush(nd_region)) + return -EIO; + + return 0; +}; +EXPORT_SYMBOL_GPL(async_pmem_flush); +MODULE_LICENSE("GPL"); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index e7d8cc9f41e8..2bf3acd69613 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -184,6 +184,7 @@ static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page, static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) { + int ret = 0; blk_status_t rc = 0; bool do_acct; unsigned long start; @@ -193,7 +194,7 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) struct nd_region *nd_region = to_region(pmem); if (bio->bi_opf & REQ_PREFLUSH) - nvdimm_flush(nd_region); + ret = nvdimm_flush(nd_region, bio); do_acct = nd_iostat_start(bio, &start); bio_for_each_segment(bvec, bio, iter) { @@ -208,7 +209,10 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) nd_iostat_end(bio, start); if (bio->bi_opf & REQ_FUA) - nvdimm_flush(nd_region); + ret = nvdimm_flush(nd_region, bio); + + if (ret) + bio->bi_status = errno_to_blk_status(ret); bio_endio(bio); return BLK_QC_T_NONE; @@ -362,6 +366,7 @@ static int pmem_attach_disk(struct device *dev, struct gendisk *disk; void *addr; int rc; + unsigned long flags = 0UL; pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL); if (!pmem) @@ -457,14 +462,15 @@ static int pmem_attach_disk(struct device *dev, nvdimm_badblocks_populate(nd_region, &pmem->bb, &bb_res); disk->bb = &pmem->bb; - dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops); + if (is_nvdimm_sync(nd_region)) + flags = DAXDEV_F_SYNC; + dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops, flags); if (!dax_dev) { put_disk(disk); return -ENOMEM; } dax_write_cache(dax_dev, nvdimm_has_cache(nd_region)); pmem->dax_dev = dax_dev; - gendev = disk_to_dev(disk); gendev->groups = pmem_attribute_groups; @@ -522,14 +528,14 @@ static int nd_pmem_remove(struct device *dev) sysfs_put(pmem->bb_state); pmem->bb_state = NULL; } - nvdimm_flush(to_nd_region(dev->parent)); + nvdimm_flush(to_nd_region(dev->parent), NULL); return 0; } static void nd_pmem_shutdown(struct device *dev) { - nvdimm_flush(to_nd_region(dev->parent)); + nvdimm_flush(to_nd_region(dev->parent), NULL); } static void nd_pmem_notify(struct device *dev, enum nvdimm_event event) diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 4fed9ce9c2fe..56f2227f192a 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -287,7 +287,9 @@ static ssize_t deep_flush_store(struct device *dev, struct device_attribute *att return rc; if (!flush) return -EINVAL; - nvdimm_flush(nd_region); + rc = 
nvdimm_flush(nd_region, NULL); + if (rc) + return rc; return len; } @@ -1077,6 +1079,11 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, dev->of_node = ndr_desc->of_node; nd_region->ndr_size = resource_size(ndr_desc->res); nd_region->ndr_start = ndr_desc->res->start; + if (ndr_desc->flush) + nd_region->flush = ndr_desc->flush; + else + nd_region->flush = NULL; + nd_device_register(dev); return nd_region; @@ -1117,11 +1124,24 @@ struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus, } EXPORT_SYMBOL_GPL(nvdimm_volatile_region_create); +int nvdimm_flush(struct nd_region *nd_region, struct bio *bio) +{ + int rc = 0; + + if (!nd_region->flush) + rc = generic_nvdimm_flush(nd_region); + else { + if (nd_region->flush(nd_region, bio)) + rc = -EIO; + } + + return rc; +} /** * nvdimm_flush - flush any posted write queues between the cpu and pmem media * @nd_region: blk or interleaved pmem region */ -void nvdimm_flush(struct nd_region *nd_region) +int generic_nvdimm_flush(struct nd_region *nd_region) { struct nd_region_data *ndrd = dev_get_drvdata(&nd_region->dev); int i, idx; @@ -1145,6 +1165,8 @@ void nvdimm_flush(struct nd_region *nd_region) if (ndrd_get_flush_wpq(ndrd, i, 0)) writeq(1, ndrd_get_flush_wpq(ndrd, i, idx)); wmb(); + + return 0; } EXPORT_SYMBOL_GPL(nvdimm_flush); @@ -1189,6 +1211,13 @@ int nvdimm_has_cache(struct nd_region *nd_region) } EXPORT_SYMBOL_GPL(nvdimm_has_cache); +bool is_nvdimm_sync(struct nd_region *nd_region) +{ + return is_nd_pmem(&nd_region->dev) && + !test_bit(ND_REGION_ASYNC, &nd_region->flags); +} +EXPORT_SYMBOL_GPL(is_nvdimm_sync); + struct conflict_context { struct nd_region *nd_region; resource_size_t start, size; diff --git a/drivers/nvdimm/virtio_pmem.c b/drivers/nvdimm/virtio_pmem.c new file mode 100644 index 000000000000..5e3d07b47e0c --- /dev/null +++ b/drivers/nvdimm/virtio_pmem.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * virtio_pmem.c: Virtio pmem Driver + * + * Discovers persistent memory range information + * from host and registers the virtual pmem device + * with libnvdimm core. 
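The reason the series threads DAXDEV_F_SYNC and dax_synchronous() through the alloc_dax() callers (pmem, device-mapper) is user-visible: virtio-pmem below is not synchronous, since a host-side flush is required, so MAP_SYNC mappings should be refused on it rather than silently losing their persistence guarantee. A userspace sketch of probing for that; the file path is hypothetical and the exact errno returned for non-synchronous DAX is an assumption, not spelled out in this diff:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MAP_SHARED_VALIDATE
#define MAP_SHARED_VALIDATE 0x03	/* fallback for older libc headers */
#endif
#ifndef MAP_SYNC
#define MAP_SYNC 0x080000
#endif

int main(void)
{
	int fd = open("/mnt/pmem/testfile", O_RDWR);	/* hypothetical DAX file */
	if (fd < 0)
		return 1;

	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		       MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0);
	if (p == MAP_FAILED)
		perror("mmap(MAP_SYNC)");	/* expected on non-synchronous DAX */
	else
		puts("synchronous DAX mapping established");

	close(fd);
	return 0;
}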
+ */ +#include "virtio_pmem.h" +#include "nd.h" + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_PMEM, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + + /* Initialize virt queue */ +static int init_vq(struct virtio_pmem *vpmem) +{ + /* single vq */ + vpmem->req_vq = virtio_find_single_vq(vpmem->vdev, + virtio_pmem_host_ack, "flush_queue"); + if (IS_ERR(vpmem->req_vq)) + return PTR_ERR(vpmem->req_vq); + + spin_lock_init(&vpmem->pmem_lock); + INIT_LIST_HEAD(&vpmem->req_list); + + return 0; +}; + +static int virtio_pmem_probe(struct virtio_device *vdev) +{ + struct nd_region_desc ndr_desc = {}; + int nid = dev_to_node(&vdev->dev); + struct nd_region *nd_region; + struct virtio_pmem *vpmem; + struct resource res; + int err = 0; + + if (!vdev->config->get) { + dev_err(&vdev->dev, "%s failure: config access disabled\n", + __func__); + return -EINVAL; + } + + vpmem = devm_kzalloc(&vdev->dev, sizeof(*vpmem), GFP_KERNEL); + if (!vpmem) { + err = -ENOMEM; + goto out_err; + } + + vpmem->vdev = vdev; + vdev->priv = vpmem; + err = init_vq(vpmem); + if (err) { + dev_err(&vdev->dev, "failed to initialize virtio pmem vq's\n"); + goto out_err; + } + + virtio_cread(vpmem->vdev, struct virtio_pmem_config, + start, &vpmem->start); + virtio_cread(vpmem->vdev, struct virtio_pmem_config, + size, &vpmem->size); + + res.start = vpmem->start; + res.end = vpmem->start + vpmem->size - 1; + vpmem->nd_desc.provider_name = "virtio-pmem"; + vpmem->nd_desc.module = THIS_MODULE; + + vpmem->nvdimm_bus = nvdimm_bus_register(&vdev->dev, + &vpmem->nd_desc); + if (!vpmem->nvdimm_bus) { + dev_err(&vdev->dev, "failed to register device with nvdimm_bus\n"); + err = -ENXIO; + goto out_vq; + } + + dev_set_drvdata(&vdev->dev, vpmem->nvdimm_bus); + + ndr_desc.res = &res; + ndr_desc.numa_node = nid; + ndr_desc.flush = async_pmem_flush; + set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags); + set_bit(ND_REGION_ASYNC, &ndr_desc.flags); + nd_region = nvdimm_pmem_region_create(vpmem->nvdimm_bus, &ndr_desc); + if (!nd_region) { + dev_err(&vdev->dev, "failed to create nvdimm region\n"); + err = -ENXIO; + goto out_nd; + } + nd_region->provider_data = dev_to_virtio(nd_region->dev.parent->parent); + return 0; +out_nd: + nvdimm_bus_unregister(vpmem->nvdimm_bus); +out_vq: + vdev->config->del_vqs(vdev); +out_err: + return err; +} + +static void virtio_pmem_remove(struct virtio_device *vdev) +{ + struct nvdimm_bus *nvdimm_bus = dev_get_drvdata(&vdev->dev); + + nvdimm_bus_unregister(nvdimm_bus); + vdev->config->del_vqs(vdev); + vdev->config->reset(vdev); +} + +static struct virtio_driver virtio_pmem_driver = { + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = id_table, + .probe = virtio_pmem_probe, + .remove = virtio_pmem_remove, +}; + +module_virtio_driver(virtio_pmem_driver); +MODULE_DEVICE_TABLE(virtio, id_table); +MODULE_DESCRIPTION("Virtio pmem driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/nvdimm/virtio_pmem.h b/drivers/nvdimm/virtio_pmem.h new file mode 100644 index 000000000000..0dddefe594c4 --- /dev/null +++ b/drivers/nvdimm/virtio_pmem.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * virtio_pmem.h: virtio pmem Driver + * + * Discovers persistent memory range information + * from host and provides a virtio based flushing + * interface. 
+ **/ + +#ifndef _LINUX_VIRTIO_PMEM_H +#define _LINUX_VIRTIO_PMEM_H + +#include <linux/module.h> +#include <uapi/linux/virtio_pmem.h> +#include <linux/libnvdimm.h> +#include <linux/spinlock.h> + +struct virtio_pmem_request { + struct virtio_pmem_req req; + struct virtio_pmem_resp resp; + + /* Wait queue to process deferred work after ack from host */ + wait_queue_head_t host_acked; + bool done; + + /* Wait queue to process deferred work after virt queue buffer avail */ + wait_queue_head_t wq_buf; + bool wq_buf_avail; + struct list_head list; +}; + +struct virtio_pmem { + struct virtio_device *vdev; + + /* Virtio pmem request queue */ + struct virtqueue *req_vq; + + /* nvdimm bus registers virtio pmem device */ + struct nvdimm_bus *nvdimm_bus; + struct nvdimm_bus_descriptor nd_desc; + + /* List to store deferred work if virtqueue is full */ + struct list_head req_list; + + /* Synchronize virtqueue data */ + spinlock_t pmem_lock; + + /* Memory region information */ + __u64 start; + __u64 size; +}; + +void virtio_pmem_host_ack(struct virtqueue *vq); +int async_pmem_flush(struct nd_region *nd_region, struct bio *bio); +#endif diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index d04d4378ca50..63502ca537eb 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -679,7 +679,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char goto put_dev; dev_info->dax_dev = alloc_dax(dev_info, dev_info->gd->disk_name, - &dcssblk_dax_ops); + &dcssblk_dax_ops, DAXDEV_F_SYNC); if (!dev_info->dax_dev) { rc = -ENOMEM; goto put_dev; diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index 023fc3bc01c6..078615cf2afc 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -43,6 +43,17 @@ config VIRTIO_PCI_LEGACY If unsure, say Y. +config VIRTIO_PMEM + tristate "Support for virtio pmem driver" + depends on VIRTIO + depends on LIBNVDIMM + help + This driver provides access to virtio-pmem devices, storage devices + that are mapped into the physical address space - similar to NVDIMMs + - with a virtio-based flushing interface. + + If unsure, say Y. + config VIRTIO_BALLOON tristate "Virtio balloon driver" depends on VIRTIO diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index 6cad0b33d7ad..8188963a405b 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -58,6 +58,15 @@ config WATCHDOG_HANDLE_BOOT_ENABLED the watchdog on its own. Thus if your userspace does not start fast enough your device will reboot. +config WATCHDOG_OPEN_TIMEOUT + int "Timeout value for opening watchdog device" + default 0 + help + The maximum time, in seconds, for which the watchdog framework takes + care of pinging a hardware watchdog. A value of 0 means infinite. The + value set here can be overridden by the commandline parameter + "watchdog.open_timeout". 
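As a usage note on the new WATCHDOG_OPEN_TIMEOUT option (the value below is illustrative, not part of the patch): a board whose watchdog daemon needs up to a minute to come up could set CONFIG_WATCHDOG_OPEN_TIMEOUT=60 at build time, or boot an unmodified kernel with watchdog.open_timeout=60; the watchdog core then keeps pinging an already-running hardware watchdog for at most 60 seconds, and once /dev/watchdog0 is opened the pinging duty passes to userspace.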
+ config WATCHDOG_SYSFS bool "Read different watchdog information through sysfs" help @@ -717,6 +726,7 @@ config IMX2_WDT config IMX_SC_WDT tristate "IMX SC Watchdog" depends on HAVE_ARM_SMCCC + depends on IMX_SCU select WATCHDOG_CORE help This is the driver for the system controller watchdog diff --git a/drivers/watchdog/acquirewdt.c b/drivers/watchdog/acquirewdt.c index 957d1255d4ca..848db958411e 100644 --- a/drivers/watchdog/acquirewdt.c +++ b/drivers/watchdog/acquirewdt.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Acquire Single Board Computer Watchdog Timer driver * @@ -6,11 +7,6 @@ * (c) Copyright 1996 Alan Cox <alan@lxorguk.ukuu.org.uk>, * All Rights Reserved. * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Neither Alan Cox nor CymruNet Ltd. admit liability nor provide * warranty for any of this software. This material is provided * "AS-IS" and at no charge. diff --git a/drivers/watchdog/advantechwdt.c b/drivers/watchdog/advantechwdt.c index 2766af292a71..0d02bb275b3d 100644 --- a/drivers/watchdog/advantechwdt.c +++ b/drivers/watchdog/advantechwdt.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Advantech Single Board Computer WDT driver * @@ -9,11 +10,6 @@ * (c) Copyright 1996 Alan Cox <alan@lxorguk.ukuu.org.uk>, * All Rights Reserved. * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Neither Alan Cox nor CymruNet Ltd. admit liability nor provide * warranty for any of this software. This material is provided * "AS-IS" and at no charge. 
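The libnvdimm and virtio_pmem changes earlier in this diff replace the old void nvdimm_flush() with a dispatcher: a region may register a flush callback through nd_region_desc (virtio-pmem passes async_pmem_flush, which forwards the flush request to the host over the virtqueue), while regions without one fall back to generic_nvdimm_flush() and its per-DIMM write-pending-queue flush hints. is_nvdimm_sync() reports whether a pmem region is synchronous, i.e. does not carry ND_REGION_ASYNC, which virtio-pmem sets in probe. Below is a minimal sketch of how a block driver's flush path might consume the new return value; the function name pmem_submit_flush() and the error mapping are hypothetical and not taken from this patch.

#include <linux/libnvdimm.h>
#include <linux/blk_types.h>

/* Hypothetical caller of the reworked flush API (illustration only). */
static blk_status_t pmem_submit_flush(struct nd_region *nd_region,
				      struct bio *bio)
{
	/*
	 * nvdimm_flush() now takes the bio so that a region with a
	 * custom ->flush() callback (virtio-pmem's async_pmem_flush())
	 * can use it while forwarding the flush to the host; it returns
	 * -EIO if that callback fails and 0 otherwise.
	 */
	if (nvdimm_flush(nd_region, bio) < 0)
		return BLK_STS_IOERR;
	return BLK_STS_OK;
}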
diff --git a/drivers/watchdog/aspeed_wdt.c b/drivers/watchdog/aspeed_wdt.c index f0148637e5dd..cc71861e033a 100644 --- a/drivers/watchdog/aspeed_wdt.c +++ b/drivers/watchdog/aspeed_wdt.c @@ -309,13 +309,7 @@ static int aspeed_wdt_probe(struct platform_device *pdev) if (status & WDT_TIMEOUT_STATUS_BOOT_SECONDARY) wdt->wdd.bootstatus = WDIOF_CARDRESET; - ret = devm_watchdog_register_device(dev, &wdt->wdd); - if (ret) { - dev_err(dev, "failed to register\n"); - return ret; - } - - return 0; + return devm_watchdog_register_device(dev, &wdt->wdd); } static struct platform_driver aspeed_watchdog_driver = { diff --git a/drivers/watchdog/bcm2835_wdt.c b/drivers/watchdog/bcm2835_wdt.c index 560c1c54c177..dec6ca019bea 100644 --- a/drivers/watchdog/bcm2835_wdt.c +++ b/drivers/watchdog/bcm2835_wdt.c @@ -202,10 +202,8 @@ static int bcm2835_wdt_probe(struct platform_device *pdev) watchdog_stop_on_reboot(&bcm2835_wdt_wdd); err = devm_watchdog_register_device(dev, &bcm2835_wdt_wdd); - if (err) { - dev_err(dev, "Failed to register watchdog device"); + if (err) return err; - } if (pm_power_off == NULL) { pm_power_off = bcm2835_power_off; @@ -240,6 +238,7 @@ module_param(nowayout, bool, 0); MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=" __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); +MODULE_ALIAS("platform:bcm2835-wdt"); MODULE_AUTHOR("Lubomir Rintel <lkundrak@v3.sk>"); MODULE_DESCRIPTION("Driver for Broadcom BCM2835 watchdog timer"); MODULE_LICENSE("GPL"); diff --git a/drivers/watchdog/bcm7038_wdt.c b/drivers/watchdog/bcm7038_wdt.c index d3d88f6703d7..979caa18d3c8 100644 --- a/drivers/watchdog/bcm7038_wdt.c +++ b/drivers/watchdog/bcm7038_wdt.c @@ -159,10 +159,8 @@ static int bcm7038_wdt_probe(struct platform_device *pdev) watchdog_stop_on_reboot(&wdt->wdd); watchdog_stop_on_unregister(&wdt->wdd); err = devm_watchdog_register_device(dev, &wdt->wdd); - if (err) { - dev_err(dev, "Failed to register watchdog device\n"); + if (err) return err; - } dev_info(dev, "Registered BCM7038 Watchdog\n"); diff --git a/drivers/watchdog/bcm_kona_wdt.c b/drivers/watchdog/bcm_kona_wdt.c index 921291025680..eb850a8d19df 100644 --- a/drivers/watchdog/bcm_kona_wdt.c +++ b/drivers/watchdog/bcm_kona_wdt.c @@ -301,10 +301,8 @@ static int bcm_kona_wdt_probe(struct platform_device *pdev) watchdog_stop_on_reboot(&bcm_kona_wdt_wdd); watchdog_stop_on_unregister(&bcm_kona_wdt_wdd); ret = devm_watchdog_register_device(dev, &bcm_kona_wdt_wdd); - if (ret) { - dev_err(dev, "Failed to register watchdog device"); + if (ret) return ret; - } bcm_kona_wdt_debug_init(pdev); dev_dbg(dev, "Broadcom Kona Watchdog Timer"); diff --git a/drivers/watchdog/cadence_wdt.c b/drivers/watchdog/cadence_wdt.c index a22f2d431a35..f8d4e91d0383 100644 --- a/drivers/watchdog/cadence_wdt.c +++ b/drivers/watchdog/cadence_wdt.c @@ -363,10 +363,8 @@ static int cdns_wdt_probe(struct platform_device *pdev) watchdog_stop_on_reboot(cdns_wdt_device); watchdog_stop_on_unregister(cdns_wdt_device); ret = devm_watchdog_register_device(dev, cdns_wdt_device); - if (ret) { - dev_err(dev, "Failed to register wdt device\n"); + if (ret) return ret; - } platform_set_drvdata(pdev, wdt); dev_info(dev, "Xilinx Watchdog Timer at %p with timeout %ds%s\n", diff --git a/drivers/watchdog/da9052_wdt.c b/drivers/watchdog/da9052_wdt.c index a2feef1ff307..d708c091bf1b 100644 --- a/drivers/watchdog/da9052_wdt.c +++ b/drivers/watchdog/da9052_wdt.c @@ -176,14 +176,7 @@ static int da9052_wdt_probe(struct platform_device *pdev) return ret; } - ret = 
devm_watchdog_register_device(dev, &driver_data->wdt); - if (ret != 0) { - dev_err(da9052->dev, "watchdog_register_device() failed: %d\n", - ret); - return ret; - } - - return ret; + return devm_watchdog_register_device(dev, &driver_data->wdt); } static struct platform_driver da9052_wdt_driver = { diff --git a/drivers/watchdog/da9062_wdt.c b/drivers/watchdog/da9062_wdt.c index aac749cfaccb..e149e66a6ea9 100644 --- a/drivers/watchdog/da9062_wdt.c +++ b/drivers/watchdog/da9062_wdt.c @@ -214,11 +214,8 @@ static int da9062_wdt_probe(struct platform_device *pdev) watchdog_set_drvdata(&wdt->wdtdev, wdt); ret = devm_watchdog_register_device(dev, &wdt->wdtdev); - if (ret < 0) { - dev_err(wdt->hw->dev, - "watchdog registration failed (%d)\n", ret); + if (ret < 0) return ret; - } return da9062_wdt_ping(&wdt->wdtdev); } diff --git a/drivers/watchdog/davinci_wdt.c b/drivers/watchdog/davinci_wdt.c index 7b2ee35b5ffd..2b3f3cd382ef 100644 --- a/drivers/watchdog/davinci_wdt.c +++ b/drivers/watchdog/davinci_wdt.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * drivers/char/watchdog/davinci_wdt.c * @@ -5,10 +6,7 @@ * * Copyright (C) 2006-2013 Texas Instruments. * - * 2007 (c) MontaVista Software, Inc. This file is licensed under - * the terms of the GNU General Public License version 2. This program - * is licensed "as is" without any warranty of any kind, whether express - * or implied. + * 2007 (c) MontaVista Software, Inc. */ #include <linux/module.h> @@ -247,13 +245,7 @@ static int davinci_wdt_probe(struct platform_device *pdev) if (IS_ERR(davinci_wdt->base)) return PTR_ERR(davinci_wdt->base); - ret = devm_watchdog_register_device(dev, wdd); - if (ret) { - dev_err(dev, "cannot register watchdog device\n"); - return ret; - } - - return 0; + return devm_watchdog_register_device(dev, wdd); } static const struct of_device_id davinci_wdt_of_match[] = { diff --git a/drivers/watchdog/digicolor_wdt.c b/drivers/watchdog/digicolor_wdt.c index 8af6e9a67d0d..073d37867f47 100644 --- a/drivers/watchdog/digicolor_wdt.c +++ b/drivers/watchdog/digicolor_wdt.c @@ -118,7 +118,6 @@ static int dc_wdt_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct dc_wdt *wdt; - int ret; wdt = devm_kzalloc(dev, sizeof(struct dc_wdt), GFP_KERNEL); if (!wdt) @@ -141,13 +140,7 @@ static int dc_wdt_probe(struct platform_device *pdev) watchdog_set_restart_priority(&dc_wdt_wdd, 128); watchdog_init_timeout(&dc_wdt_wdd, timeout, dev); watchdog_stop_on_reboot(&dc_wdt_wdd); - ret = devm_watchdog_register_device(dev, &dc_wdt_wdd); - if (ret) { - dev_err(dev, "Failed to register watchdog device"); - return ret; - } - - return 0; + return devm_watchdog_register_device(dev, &dc_wdt_wdd); } static const struct of_device_id dc_wdt_of_match[] = { diff --git a/drivers/watchdog/ebc-c384_wdt.c b/drivers/watchdog/ebc-c384_wdt.c index c176f59fea28..8ef4b0df3855 100644 --- a/drivers/watchdog/ebc-c384_wdt.c +++ b/drivers/watchdog/ebc-c384_wdt.c @@ -2,15 +2,6 @@ /* * Watchdog timer driver for the WinSystems EBC-C384 * Copyright (C) 2016 William Breathitt Gray - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License, version 2, as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
*/ #include <linux/device.h> #include <linux/dmi.h> diff --git a/drivers/watchdog/eurotechwdt.c b/drivers/watchdog/eurotechwdt.c index 89129e6fa9b6..3a83a48abcae 100644 --- a/drivers/watchdog/eurotechwdt.c +++ b/drivers/watchdog/eurotechwdt.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Eurotech CPU-1220/1410/1420 on board WDT driver * @@ -11,11 +12,6 @@ * (c) Copyright 1996-1997 Alan Cox <alan@lxorguk.ukuu.org.uk>, * All Rights Reserved. * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Neither Alan Cox nor CymruNet Ltd. admit liability nor provide * warranty for any of this software. This material is provided * "AS-IS" and at no charge. diff --git a/drivers/watchdog/ftwdt010_wdt.c b/drivers/watchdog/ftwdt010_wdt.c index d9626ef9b9ae..21dcc7765688 100644 --- a/drivers/watchdog/ftwdt010_wdt.c +++ b/drivers/watchdog/ftwdt010_wdt.c @@ -165,10 +165,8 @@ static int ftwdt010_wdt_probe(struct platform_device *pdev) } ret = devm_watchdog_register_device(dev, &gwdt->wdd); - if (ret) { - dev_err(dev, "failed to register watchdog\n"); + if (ret) return ret; - } /* Set up platform driver data */ platform_set_drvdata(pdev, gwdt); diff --git a/drivers/watchdog/gpio_wdt.c b/drivers/watchdog/gpio_wdt.c index 777de10f2a78..0923201ce874 100644 --- a/drivers/watchdog/gpio_wdt.c +++ b/drivers/watchdog/gpio_wdt.c @@ -13,6 +13,12 @@ #include <linux/platform_device.h> #include <linux/watchdog.h> +static bool nowayout = WATCHDOG_NOWAYOUT; +module_param(nowayout, bool, 0); +MODULE_PARM_DESC(nowayout, + "Watchdog cannot be stopped once started (default=" + __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); + #define SOFT_TIMEOUT_MIN 1 #define SOFT_TIMEOUT_DEF 60 @@ -151,6 +157,7 @@ static int gpio_wdt_probe(struct platform_device *pdev) priv->wdd.timeout = SOFT_TIMEOUT_DEF; watchdog_init_timeout(&priv->wdd, 0, dev); + watchdog_set_nowayout(&priv->wdd, nowayout); watchdog_stop_on_reboot(&priv->wdd); diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c index 8a90f159ffb1..7d34bcf1c45b 100644 --- a/drivers/watchdog/hpwdt.c +++ b/drivers/watchdog/hpwdt.c @@ -22,10 +22,11 @@ #include <linux/watchdog.h> #include <asm/nmi.h> -#define HPWDT_VERSION "2.0.2" +#define HPWDT_VERSION "2.0.3" #define SECS_TO_TICKS(secs) ((secs) * 1000 / 128) #define TICKS_TO_SECS(ticks) ((ticks) * 128 / 1000) -#define HPWDT_MAX_TIMER TICKS_TO_SECS(65535) +#define HPWDT_MAX_TICKS 65535 +#define HPWDT_MAX_TIMER TICKS_TO_SECS(HPWDT_MAX_TICKS) #define DEFAULT_MARGIN 30 #define PRETIMEOUT_SEC 9 @@ -33,6 +34,7 @@ static bool ilo5; static unsigned int soft_margin = DEFAULT_MARGIN; /* in seconds */ static bool nowayout = WATCHDOG_NOWAYOUT; static bool pretimeout = IS_ENABLED(CONFIG_HPWDT_NMI_DECODING); +static int kdumptimeout = -1; static void __iomem *pci_mem_addr; /* the PCI-memory address */ static unsigned long __iomem *hpwdt_nmistat; @@ -52,15 +54,21 @@ static const struct pci_device_id hpwdt_blacklist[] = { {0}, /* terminate list */ }; +static struct watchdog_device hpwdt_dev; /* * Watchdog operations */ +static int hpwdt_hw_is_running(void) +{ + return ioread8(hpwdt_timer_con) & 0x01; +} + static int hpwdt_start(struct watchdog_device *wdd) { int control = 0x81 | (pretimeout ? 
0x4 : 0); - int reload = SECS_TO_TICKS(wdd->timeout); + int reload = SECS_TO_TICKS(min(wdd->timeout, wdd->max_hw_heartbeat_ms/1000)); - dev_dbg(wdd->parent, "start watchdog 0x%08x:0x%02x\n", reload, control); + dev_dbg(wdd->parent, "start watchdog 0x%08x:0x%08x:0x%02x\n", wdd->timeout, reload, control); iowrite16(reload, hpwdt_timer_reg); iowrite8(control, hpwdt_timer_con); @@ -85,12 +93,18 @@ static int hpwdt_stop_core(struct watchdog_device *wdd) return 0; } +static void hpwdt_ping_ticks(int val) +{ + val = min(val, HPWDT_MAX_TICKS); + iowrite16(val, hpwdt_timer_reg); +} + static int hpwdt_ping(struct watchdog_device *wdd) { - int reload = SECS_TO_TICKS(wdd->timeout); + int reload = SECS_TO_TICKS(min(wdd->timeout, wdd->max_hw_heartbeat_ms/1000)); - dev_dbg(wdd->parent, "ping watchdog 0x%08x\n", reload); - iowrite16(reload, hpwdt_timer_reg); + dev_dbg(wdd->parent, "ping watchdog 0x%08x:0x%08x\n", wdd->timeout, reload); + hpwdt_ping_ticks(reload); return 0; } @@ -166,7 +180,14 @@ static int hpwdt_pretimeout(unsigned int ulReason, struct pt_regs *regs) if (ilo5 && !pretimeout && !mynmi) return NMI_DONE; - hpwdt_stop(); + if (kdumptimeout < 0) + hpwdt_stop(); + else if (kdumptimeout == 0) + ; + else { + unsigned int val = max((unsigned int)kdumptimeout, hpwdt_dev.timeout); + hpwdt_ping_ticks(SECS_TO_TICKS(val)); + } hex_byte_pack(panic_msg, mynmi); nmi_panic(regs, panic_msg); @@ -204,9 +225,9 @@ static struct watchdog_device hpwdt_dev = { .info = &ident, .ops = &hpwdt_ops, .min_timeout = 1, - .max_timeout = HPWDT_MAX_TIMER, .timeout = DEFAULT_MARGIN, .pretimeout = PRETIMEOUT_SEC, + .max_hw_heartbeat_ms = HPWDT_MAX_TIMER * 1000, }; @@ -298,14 +319,18 @@ static int hpwdt_init_one(struct pci_dev *dev, hpwdt_timer_reg = pci_mem_addr + 0x70; hpwdt_timer_con = pci_mem_addr + 0x72; - /* Make sure that timer is disabled until /dev/watchdog is opened */ - hpwdt_stop(); + /* Have the core update running timer until user space is ready */ + if (hpwdt_hw_is_running()) { + dev_info(&dev->dev, "timer is running\n"); + set_bit(WDOG_HW_RUNNING, &hpwdt_dev.status); + } /* Initialize NMI Decoding functionality */ retval = hpwdt_init_nmi_decoding(dev); if (retval != 0) goto error_init_nmi_decoding; + watchdog_stop_on_unregister(&hpwdt_dev); watchdog_set_nowayout(&hpwdt_dev, nowayout); watchdog_init_timeout(&hpwdt_dev, soft_margin, NULL); @@ -314,13 +339,12 @@ static int hpwdt_init_one(struct pci_dev *dev, pretimeout = 0; } hpwdt_dev.pretimeout = pretimeout ? PRETIMEOUT_SEC : 0; + kdumptimeout = min(kdumptimeout, HPWDT_MAX_TIMER); hpwdt_dev.parent = &dev->dev; retval = watchdog_register_device(&hpwdt_dev); - if (retval < 0) { - dev_err(&dev->dev, "watchdog register failed: %d.\n", retval); + if (retval < 0) goto error_wd_register; - } dev_info(&dev->dev, "HPE Watchdog Timer Driver: Version: %s\n", HPWDT_VERSION); @@ -328,6 +352,7 @@ static int hpwdt_init_one(struct pci_dev *dev, hpwdt_dev.timeout, nowayout); dev_info(&dev->dev, "pretimeout: %s.\n", pretimeout ? 
"on" : "off"); + dev_info(&dev->dev, "kdumptimeout: %d.\n", kdumptimeout); if (dev->subsystem_vendor == PCI_VENDOR_ID_HP_3PAR) ilo5 = true; @@ -345,9 +370,6 @@ error_pci_iomap: static void hpwdt_exit(struct pci_dev *dev) { - if (!nowayout) - hpwdt_stop(); - watchdog_unregister_device(&hpwdt_dev); hpwdt_exit_nmi_decoding(); pci_iounmap(dev, pci_mem_addr); @@ -376,6 +398,9 @@ module_param(nowayout, bool, 0); MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=" __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); +module_param(kdumptimeout, int, 0444); +MODULE_PARM_DESC(kdumptimeout, "Timeout applied for crash kernel transition in seconds"); + #ifdef CONFIG_HPWDT_NMI_DECODING module_param(pretimeout, bool, 0); MODULE_PARM_DESC(pretimeout, "Watchdog pretimeout enabled"); diff --git a/drivers/watchdog/i6300esb.c b/drivers/watchdog/i6300esb.c index f98f35a05896..a30835f547b3 100644 --- a/drivers/watchdog/i6300esb.c +++ b/drivers/watchdog/i6300esb.c @@ -315,11 +315,8 @@ static int esb_probe(struct pci_dev *pdev, /* Register the watchdog so that userspace has access to it */ ret = watchdog_register_device(&edev->wdd); - if (ret != 0) { - dev_err(&pdev->dev, - "cannot register watchdog device (err=%d)\n", ret); + if (ret != 0) goto err_unmap; - } dev_info(&pdev->dev, "initialized. heartbeat=%d sec (nowayout=%d)\n", edev->wdd.timeout, nowayout); diff --git a/drivers/watchdog/iTCO_vendor_support.c b/drivers/watchdog/iTCO_vendor_support.c index 68a9d9cc2eb8..4f1b96f59349 100644 --- a/drivers/watchdog/iTCO_vendor_support.c +++ b/drivers/watchdog/iTCO_vendor_support.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * intel TCO vendor specific watchdog driver support * * (c) Copyright 2006-2009 Wim Van Sebroeck <wim@iguana.be>. * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Neither Wim Van Sebroeck nor Iguana vzw. admit liability nor * provide warranty for any of this software. This material is * provided "AS-IS" and at no charge. @@ -216,4 +212,3 @@ MODULE_AUTHOR("Wim Van Sebroeck <wim@iguana.be>, " MODULE_DESCRIPTION("Intel TCO Vendor Specific WatchDog Timer Driver Support"); MODULE_VERSION(DRV_VERSION); MODULE_LICENSE("GPL"); - diff --git a/drivers/watchdog/iTCO_wdt.c b/drivers/watchdog/iTCO_wdt.c index 89cea6ce9a08..c559f706ae7e 100644 --- a/drivers/watchdog/iTCO_wdt.c +++ b/drivers/watchdog/iTCO_wdt.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * intel TCO Watchdog Driver * * (c) Copyright 2006-2011 Wim Van Sebroeck <wim@iguana.be>. * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Neither Wim Van Sebroeck nor Iguana vzw. admit liability nor * provide warranty for any of this software. This material is * provided "AS-IS" and at no charge. diff --git a/drivers/watchdog/ib700wdt.c b/drivers/watchdog/ib700wdt.c index 30d6cec582af..92fd7f33bc4d 100644 --- a/drivers/watchdog/ib700wdt.c +++ b/drivers/watchdog/ib700wdt.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * IB700 Single Board Computer WDT driver * @@ -14,11 +15,6 @@ * (c) Copyright 1996 Alan Cox <alan@lxorguk.ukuu.org.uk>, * All Rights Reserved. 
* - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Neither Alan Cox nor CymruNet Ltd. admit liability nor provide * warranty for any of this software. This material is provided * "AS-IS" and at no charge. diff --git a/drivers/watchdog/ie6xx_wdt.c b/drivers/watchdog/ie6xx_wdt.c index 508fbefce9f6..8f28993fab8b 100644 --- a/drivers/watchdog/ie6xx_wdt.c +++ b/drivers/watchdog/ie6xx_wdt.c @@ -66,7 +66,7 @@ MODULE_PARM_DESC(resetmode, static struct { unsigned short sch_wdtba; - struct spinlock unlock_sequence; + spinlock_t unlock_sequence; #ifdef CONFIG_DEBUG_FS struct dentry *debugfs; #endif @@ -254,12 +254,8 @@ static int ie6xx_wdt_probe(struct platform_device *pdev) ie6xx_wdt_debugfs_init(); ret = watchdog_register_device(&ie6xx_wdt_dev); - if (ret) { - dev_err(&pdev->dev, - "Watchdog timer: cannot register device (err =%d)\n", - ret); + if (ret) goto misc_register_error; - } return 0; diff --git a/drivers/watchdog/imx2_wdt.c b/drivers/watchdog/imx2_wdt.c index a606005dd65f..32af3974e6bb 100644 --- a/drivers/watchdog/imx2_wdt.c +++ b/drivers/watchdog/imx2_wdt.c @@ -316,10 +316,8 @@ static int __init imx2_wdt_probe(struct platform_device *pdev) regmap_write(wdev->regmap, IMX2_WDT_WMCR, 0); ret = watchdog_register_device(wdog); - if (ret) { - dev_err(&pdev->dev, "cannot register watchdog device\n"); + if (ret) goto disable_clk; - } dev_info(&pdev->dev, "timeout %d sec (nowayout=%d)\n", wdog->timeout, nowayout); diff --git a/drivers/watchdog/imx_sc_wdt.c b/drivers/watchdog/imx_sc_wdt.c index 49848b66186c..78eaaf75a263 100644 --- a/drivers/watchdog/imx_sc_wdt.c +++ b/drivers/watchdog/imx_sc_wdt.c @@ -4,6 +4,7 @@ */ #include <linux/arm-smccc.h> +#include <linux/firmware/imx/sci.h> #include <linux/io.h> #include <linux/init.h> #include <linux/kernel.h> @@ -33,11 +34,19 @@ #define SC_TIMER_WDOG_ACTION_PARTITION 0 +#define SC_IRQ_WDOG 1 +#define SC_IRQ_GROUP_WDOG 1 + static bool nowayout = WATCHDOG_NOWAYOUT; module_param(nowayout, bool, 0000); MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=" __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); +struct imx_sc_wdt_device { + struct watchdog_device wdd; + struct notifier_block wdt_notifier; +}; + static int imx_sc_wdt_ping(struct watchdog_device *wdog) { struct arm_smccc_res res; @@ -85,24 +94,66 @@ static int imx_sc_wdt_set_timeout(struct watchdog_device *wdog, return res.a0 ? 
-EACCES : 0; } +static int imx_sc_wdt_set_pretimeout(struct watchdog_device *wdog, + unsigned int pretimeout) +{ + struct arm_smccc_res res; + + arm_smccc_smc(IMX_SIP_TIMER, IMX_SIP_TIMER_SET_PRETIME_WDOG, + pretimeout * 1000, 0, 0, 0, 0, 0, &res); + if (res.a0) + return -EACCES; + + wdog->pretimeout = pretimeout; + + return 0; +} + +static int imx_sc_wdt_notify(struct notifier_block *nb, + unsigned long event, void *group) +{ + struct imx_sc_wdt_device *imx_sc_wdd = + container_of(nb, + struct imx_sc_wdt_device, + wdt_notifier); + + if (event & SC_IRQ_WDOG && + *(u8 *)group == SC_IRQ_GROUP_WDOG) + watchdog_notify_pretimeout(&imx_sc_wdd->wdd); + + return 0; +} + +static void imx_sc_wdt_action(void *data) +{ + struct notifier_block *wdt_notifier = data; + + imx_scu_irq_unregister_notifier(wdt_notifier); + imx_scu_irq_group_enable(SC_IRQ_GROUP_WDOG, + SC_IRQ_WDOG, + false); +} + static const struct watchdog_ops imx_sc_wdt_ops = { .owner = THIS_MODULE, .start = imx_sc_wdt_start, .stop = imx_sc_wdt_stop, .ping = imx_sc_wdt_ping, .set_timeout = imx_sc_wdt_set_timeout, + .set_pretimeout = imx_sc_wdt_set_pretimeout, }; -static const struct watchdog_info imx_sc_wdt_info = { +static struct watchdog_info imx_sc_wdt_info = { .identity = "i.MX SC watchdog timer", .options = WDIOF_SETTIMEOUT | WDIOF_KEEPALIVEPING | - WDIOF_MAGICCLOSE | WDIOF_PRETIMEOUT, + WDIOF_MAGICCLOSE, }; static int imx_sc_wdt_probe(struct platform_device *pdev) { + struct imx_sc_wdt_device *imx_sc_wdd; + struct watchdog_device *wdog; struct device *dev = &pdev->dev; - struct watchdog_device *imx_sc_wdd; int ret; imx_sc_wdd = devm_kzalloc(dev, sizeof(*imx_sc_wdd), GFP_KERNEL); @@ -111,42 +162,70 @@ static int imx_sc_wdt_probe(struct platform_device *pdev) platform_set_drvdata(pdev, imx_sc_wdd); - imx_sc_wdd->info = &imx_sc_wdt_info; - imx_sc_wdd->ops = &imx_sc_wdt_ops; - imx_sc_wdd->min_timeout = 1; - imx_sc_wdd->max_timeout = MAX_TIMEOUT; - imx_sc_wdd->parent = dev; - imx_sc_wdd->timeout = DEFAULT_TIMEOUT; - - watchdog_init_timeout(imx_sc_wdd, 0, dev); - watchdog_stop_on_reboot(imx_sc_wdd); - watchdog_stop_on_unregister(imx_sc_wdd); + wdog = &imx_sc_wdd->wdd; + wdog->info = &imx_sc_wdt_info; + wdog->ops = &imx_sc_wdt_ops; + wdog->min_timeout = 1; + wdog->max_timeout = MAX_TIMEOUT; + wdog->parent = dev; + wdog->timeout = DEFAULT_TIMEOUT; + + watchdog_init_timeout(wdog, 0, dev); + watchdog_stop_on_reboot(wdog); + watchdog_stop_on_unregister(wdog); + + ret = devm_watchdog_register_device(dev, wdog); + + if (ret) { + dev_err(dev, "Failed to register watchdog device\n"); + return ret; + } + + ret = imx_scu_irq_group_enable(SC_IRQ_GROUP_WDOG, + SC_IRQ_WDOG, + true); + if (ret) { + dev_warn(dev, "Enable irq failed, pretimeout NOT supported\n"); + return 0; + } - ret = devm_watchdog_register_device(dev, imx_sc_wdd); + imx_sc_wdd->wdt_notifier.notifier_call = imx_sc_wdt_notify; + ret = imx_scu_irq_register_notifier(&imx_sc_wdd->wdt_notifier); if (ret) { - dev_err(dev, "Failed to register watchdog device\n"); - return ret; + imx_scu_irq_group_enable(SC_IRQ_GROUP_WDOG, + SC_IRQ_WDOG, + false); + dev_warn(dev, + "Register irq notifier failed, pretimeout NOT supported\n"); + return 0; } + ret = devm_add_action_or_reset(dev, imx_sc_wdt_action, + &imx_sc_wdd->wdt_notifier); + if (!ret) + imx_sc_wdt_info.options |= WDIOF_PRETIMEOUT; + else + dev_warn(dev, "Add action failed, pretimeout NOT supported\n"); + return 0; } static int __maybe_unused imx_sc_wdt_suspend(struct device *dev) { - struct watchdog_device *imx_sc_wdd = dev_get_drvdata(dev); + 
struct imx_sc_wdt_device *imx_sc_wdd = dev_get_drvdata(dev); - if (watchdog_active(imx_sc_wdd)) - imx_sc_wdt_stop(imx_sc_wdd); + if (watchdog_active(&imx_sc_wdd->wdd)) + imx_sc_wdt_stop(&imx_sc_wdd->wdd); return 0; } static int __maybe_unused imx_sc_wdt_resume(struct device *dev) { - struct watchdog_device *imx_sc_wdd = dev_get_drvdata(dev); + struct imx_sc_wdt_device *imx_sc_wdd = dev_get_drvdata(dev); - if (watchdog_active(imx_sc_wdd)) - imx_sc_wdt_start(imx_sc_wdd); + if (watchdog_active(&imx_sc_wdd->wdd)) + imx_sc_wdt_start(&imx_sc_wdd->wdd); return 0; } diff --git a/drivers/watchdog/intel-mid_wdt.c b/drivers/watchdog/intel-mid_wdt.c index b2463f8276e6..2cdbd37c700c 100644 --- a/drivers/watchdog/intel-mid_wdt.c +++ b/drivers/watchdog/intel-mid_wdt.c @@ -161,10 +161,8 @@ static int mid_wdt_probe(struct platform_device *pdev) set_bit(WDOG_HW_RUNNING, &wdt_dev->status); ret = devm_watchdog_register_device(dev, wdt_dev); - if (ret) { - dev_err(dev, "error registering watchdog device\n"); + if (ret) return ret; - } dev_info(dev, "Intel MID watchdog device probed\n"); diff --git a/drivers/watchdog/jz4740_wdt.c b/drivers/watchdog/jz4740_wdt.c index 313358b2e0b1..d4a90916dd38 100644 --- a/drivers/watchdog/jz4740_wdt.c +++ b/drivers/watchdog/jz4740_wdt.c @@ -4,6 +4,7 @@ * JZ4740 Watchdog driver */ +#include <linux/mfd/ingenic-tcu.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/types.h> @@ -19,23 +20,16 @@ #include <asm/mach-jz4740/timer.h> -#define JZ_REG_WDT_TIMER_DATA 0x0 -#define JZ_REG_WDT_COUNTER_ENABLE 0x4 -#define JZ_REG_WDT_TIMER_COUNTER 0x8 -#define JZ_REG_WDT_TIMER_CONTROL 0xC - #define JZ_WDT_CLOCK_PCLK 0x1 #define JZ_WDT_CLOCK_RTC 0x2 #define JZ_WDT_CLOCK_EXT 0x4 -#define JZ_WDT_CLOCK_DIV_SHIFT 3 - -#define JZ_WDT_CLOCK_DIV_1 (0 << JZ_WDT_CLOCK_DIV_SHIFT) -#define JZ_WDT_CLOCK_DIV_4 (1 << JZ_WDT_CLOCK_DIV_SHIFT) -#define JZ_WDT_CLOCK_DIV_16 (2 << JZ_WDT_CLOCK_DIV_SHIFT) -#define JZ_WDT_CLOCK_DIV_64 (3 << JZ_WDT_CLOCK_DIV_SHIFT) -#define JZ_WDT_CLOCK_DIV_256 (4 << JZ_WDT_CLOCK_DIV_SHIFT) -#define JZ_WDT_CLOCK_DIV_1024 (5 << JZ_WDT_CLOCK_DIV_SHIFT) +#define JZ_WDT_CLOCK_DIV_1 (0 << TCU_TCSR_PRESCALE_LSB) +#define JZ_WDT_CLOCK_DIV_4 (1 << TCU_TCSR_PRESCALE_LSB) +#define JZ_WDT_CLOCK_DIV_16 (2 << TCU_TCSR_PRESCALE_LSB) +#define JZ_WDT_CLOCK_DIV_64 (3 << TCU_TCSR_PRESCALE_LSB) +#define JZ_WDT_CLOCK_DIV_256 (4 << TCU_TCSR_PRESCALE_LSB) +#define JZ_WDT_CLOCK_DIV_1024 (5 << TCU_TCSR_PRESCALE_LSB) #define DEFAULT_HEARTBEAT 5 #define MAX_HEARTBEAT 2048 @@ -63,7 +57,7 @@ static int jz4740_wdt_ping(struct watchdog_device *wdt_dev) { struct jz4740_wdt_drvdata *drvdata = watchdog_get_drvdata(wdt_dev); - writew(0x0, drvdata->base + JZ_REG_WDT_TIMER_COUNTER); + writew(0x0, drvdata->base + TCU_REG_WDT_TCNT); return 0; } @@ -74,6 +68,7 @@ static int jz4740_wdt_set_timeout(struct watchdog_device *wdt_dev, unsigned int rtc_clk_rate; unsigned int timeout_value; unsigned short clock_div = JZ_WDT_CLOCK_DIV_1; + u8 tcer; rtc_clk_rate = clk_get_rate(drvdata->rtc_clk); @@ -86,18 +81,19 @@ static int jz4740_wdt_set_timeout(struct watchdog_device *wdt_dev, break; } timeout_value >>= 2; - clock_div += (1 << JZ_WDT_CLOCK_DIV_SHIFT); + clock_div += (1 << TCU_TCSR_PRESCALE_LSB); } - writeb(0x0, drvdata->base + JZ_REG_WDT_COUNTER_ENABLE); - writew(clock_div, drvdata->base + JZ_REG_WDT_TIMER_CONTROL); + tcer = readb(drvdata->base + TCU_REG_WDT_TCER); + writeb(0x0, drvdata->base + TCU_REG_WDT_TCER); + writew(clock_div, drvdata->base + TCU_REG_WDT_TCSR); - writew((u16)timeout_value, 
drvdata->base + JZ_REG_WDT_TIMER_DATA); - writew(0x0, drvdata->base + JZ_REG_WDT_TIMER_COUNTER); - writew(clock_div | JZ_WDT_CLOCK_RTC, - drvdata->base + JZ_REG_WDT_TIMER_CONTROL); + writew((u16)timeout_value, drvdata->base + TCU_REG_WDT_TDR); + writew(0x0, drvdata->base + TCU_REG_WDT_TCNT); + writew(clock_div | JZ_WDT_CLOCK_RTC, drvdata->base + TCU_REG_WDT_TCSR); - writeb(0x1, drvdata->base + JZ_REG_WDT_COUNTER_ENABLE); + if (tcer & TCU_WDT_TCER_TCEN) + writeb(TCU_WDT_TCER_TCEN, drvdata->base + TCU_REG_WDT_TCER); wdt_dev->timeout = new_timeout; return 0; @@ -105,9 +101,18 @@ static int jz4740_wdt_set_timeout(struct watchdog_device *wdt_dev, static int jz4740_wdt_start(struct watchdog_device *wdt_dev) { + struct jz4740_wdt_drvdata *drvdata = watchdog_get_drvdata(wdt_dev); + u8 tcer; + + tcer = readb(drvdata->base + TCU_REG_WDT_TCER); + jz4740_timer_enable_watchdog(); jz4740_wdt_set_timeout(wdt_dev, wdt_dev->timeout); + /* Start watchdog if it wasn't started already */ + if (!(tcer & TCU_WDT_TCER_TCEN)) + writeb(TCU_WDT_TCER_TCEN, drvdata->base + TCU_REG_WDT_TCER); + return 0; } @@ -115,7 +120,7 @@ static int jz4740_wdt_stop(struct watchdog_device *wdt_dev) { struct jz4740_wdt_drvdata *drvdata = watchdog_get_drvdata(wdt_dev); - writeb(0x0, drvdata->base + JZ_REG_WDT_COUNTER_ENABLE); + writeb(0x0, drvdata->base + TCU_REG_WDT_TCER); jz4740_timer_disable_watchdog(); return 0; @@ -187,11 +192,7 @@ static int jz4740_wdt_probe(struct platform_device *pdev) return PTR_ERR(drvdata->rtc_clk); } - ret = devm_watchdog_register_device(dev, &drvdata->wdt); - if (ret < 0) - return ret; - - return 0; + return devm_watchdog_register_device(dev, &drvdata->wdt); } static struct platform_driver jz4740_wdt_driver = { diff --git a/drivers/watchdog/loongson1_wdt.c b/drivers/watchdog/loongson1_wdt.c index c8c2b8a88fc2..bb3d075c0633 100644 --- a/drivers/watchdog/loongson1_wdt.c +++ b/drivers/watchdog/loongson1_wdt.c @@ -132,10 +132,8 @@ static int ls1x_wdt_probe(struct platform_device *pdev) watchdog_set_drvdata(ls1x_wdt, drvdata); err = devm_watchdog_register_device(dev, &drvdata->wdt); - if (err) { - dev_err(dev, "failed to register watchdog device\n"); + if (err) return err; - } platform_set_drvdata(pdev, drvdata); diff --git a/drivers/watchdog/max77620_wdt.c b/drivers/watchdog/max77620_wdt.c index 9937f9fccd2e..be6a53c30002 100644 --- a/drivers/watchdog/max77620_wdt.c +++ b/drivers/watchdog/max77620_wdt.c @@ -182,13 +182,7 @@ static int max77620_wdt_probe(struct platform_device *pdev) watchdog_set_drvdata(wdt_dev, wdt); watchdog_stop_on_unregister(wdt_dev); - ret = devm_watchdog_register_device(dev, wdt_dev); - if (ret < 0) { - dev_err(dev, "watchdog registration failed: %d\n", ret); - return ret; - } - - return 0; + return devm_watchdog_register_device(dev, wdt_dev); } static const struct platform_device_id max77620_wdt_devtype[] = { diff --git a/drivers/watchdog/mei_wdt.c b/drivers/watchdog/mei_wdt.c index 96a770938ff0..5391bf3e6b11 100644 --- a/drivers/watchdog/mei_wdt.c +++ b/drivers/watchdog/mei_wdt.c @@ -384,10 +384,8 @@ static int mei_wdt_register(struct mei_wdt *wdt) watchdog_stop_on_reboot(&wdt->wdd); ret = watchdog_register_device(&wdt->wdd); - if (ret) { - dev_err(dev, "unable to register watchdog device = %d.\n", ret); + if (ret) watchdog_set_drvdata(&wdt->wdd, NULL); - } wdt->state = MEI_WDT_IDLE; diff --git a/drivers/watchdog/mena21_wdt.c b/drivers/watchdog/mena21_wdt.c index e9ca4e0e25dc..99d2359d5a8a 100644 --- a/drivers/watchdog/mena21_wdt.c +++ b/drivers/watchdog/mena21_wdt.c @@ -190,10 
+190,8 @@ static int a21_wdt_probe(struct platform_device *pdev) dev_set_drvdata(dev, drv); ret = devm_watchdog_register_device(dev, &a21_wdt); - if (ret) { - dev_err(dev, "Cannot register watchdog device\n"); + if (ret) return ret; - } dev_info(dev, "MEN A21 watchdog timer driver enabled\n"); diff --git a/drivers/watchdog/menf21bmc_wdt.c b/drivers/watchdog/menf21bmc_wdt.c index 7766d7361d3b..81ebdfc371f4 100644 --- a/drivers/watchdog/menf21bmc_wdt.c +++ b/drivers/watchdog/menf21bmc_wdt.c @@ -152,10 +152,8 @@ static int menf21bmc_wdt_probe(struct platform_device *pdev) } ret = devm_watchdog_register_device(dev, &drv_data->wdt); - if (ret) { - dev_err(dev, "failed to register Watchdog device\n"); + if (ret) return ret; - } dev_info(dev, "MEN 14F021P00 BMC Watchdog device enabled\n"); diff --git a/drivers/watchdog/mpc8xxx_wdt.c b/drivers/watchdog/mpc8xxx_wdt.c index b6ffad421bd0..3fc457bc16db 100644 --- a/drivers/watchdog/mpc8xxx_wdt.c +++ b/drivers/watchdog/mpc8xxx_wdt.c @@ -201,11 +201,8 @@ static int mpc8xxx_wdt_probe(struct platform_device *ofdev) ddata->wdd.timeout = ddata->wdd.min_timeout; ret = devm_watchdog_register_device(dev, &ddata->wdd); - if (ret) { - dev_err(dev, "cannot register watchdog device (err=%d)\n", - ret); + if (ret) return ret; - } dev_info(dev, "WDT driver for MPC8xxx initialized. mode:%s timeout=%d sec\n", diff --git a/drivers/watchdog/mv64x60_wdt.c b/drivers/watchdog/mv64x60_wdt.c index c785f4f0a196..74bf7144a970 100644 --- a/drivers/watchdog/mv64x60_wdt.c +++ b/drivers/watchdog/mv64x60_wdt.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * mv64x60_wdt.c - MV64X60 (Marvell Discovery) watchdog userspace interface * @@ -9,10 +10,7 @@ * * Derived from mpc8xx_wdt.c, with the following copyright. * - * 2002 (c) Florian Schirmer <jolt@tuxbox.org> This file is licensed under - * the terms of the GNU General Public License version 2. This program - * is licensed "as is" without any warranty of any kind, whether express - * or implied. 
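For the i.MX SC watchdog changes above: pretimeout support now hinges on the system controller firmware, which is also why the driver gained its IMX_SCU dependency in Kconfig. Probe enables SC_IRQ_WDOG in SC_IRQ_GROUP_WDOG and registers an SCU IRQ notifier; when the firmware raises that interrupt, imx_sc_wdt_notify() calls watchdog_notify_pretimeout(), which hands the event to the kernel's configured pretimeout governor. WDIOF_PRETIMEOUT is advertised only once both steps succeed, so on firmware without the interrupt the device still registers, just without pretimeout. Userspace would typically program the value through the standard WDIOC_SETPRETIMEOUT ioctl, which ends up in the new imx_sc_wdt_set_pretimeout() SMC call.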
+ * 2002 (c) Florian Schirmer <jolt@tuxbox.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/drivers/watchdog/ni903x_wdt.c b/drivers/watchdog/ni903x_wdt.c index 60f5608af2a8..4cebad324b20 100644 --- a/drivers/watchdog/ni903x_wdt.c +++ b/drivers/watchdog/ni903x_wdt.c @@ -211,10 +211,8 @@ static int ni903x_acpi_add(struct acpi_device *device) watchdog_init_timeout(wdd, timeout, dev); ret = watchdog_register_device(wdd); - if (ret) { - dev_err(dev, "failed to register watchdog\n"); + if (ret) return ret; - } /* Switch from boot mode to user mode */ outb(NIWD_CONTROL_RESET | NIWD_CONTROL_MODE, diff --git a/drivers/watchdog/nic7018_wdt.c b/drivers/watchdog/nic7018_wdt.c index 2e1a2a3d4ec9..2a46cc662943 100644 --- a/drivers/watchdog/nic7018_wdt.c +++ b/drivers/watchdog/nic7018_wdt.c @@ -210,7 +210,6 @@ static int nic7018_probe(struct platform_device *pdev) ret = watchdog_register_device(wdd); if (ret) { outb(LOCK, wdt->io_base + WDT_REG_LOCK); - dev_err(dev, "failed to register watchdog\n"); return ret; } diff --git a/drivers/watchdog/npcm_wdt.c b/drivers/watchdog/npcm_wdt.c index 9d6c1689b12c..9c773c3d6d5d 100644 --- a/drivers/watchdog/npcm_wdt.c +++ b/drivers/watchdog/npcm_wdt.c @@ -220,10 +220,8 @@ static int npcm_wdt_probe(struct platform_device *pdev) return ret; ret = devm_watchdog_register_device(dev, &wdt->wdd); - if (ret) { - dev_err(dev, "failed to register watchdog\n"); + if (ret) return ret; - } dev_info(dev, "NPCM watchdog driver enabled\n"); diff --git a/drivers/watchdog/nv_tco.h b/drivers/watchdog/nv_tco.h index c2d1d04e055b..d325e528010f 100644 --- a/drivers/watchdog/nv_tco.h +++ b/drivers/watchdog/nv_tco.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * nv_tco: TCO timer driver for nVidia chipsets. * @@ -10,11 +11,6 @@ * Reserved. * http://www.kernelconcepts.de * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Neither kernel concepts nor Nils Faerber admit liability nor provide * warranty for any of this software. This material is provided * "AS-IS" and at no charge. diff --git a/drivers/watchdog/octeon-wdt-main.c b/drivers/watchdog/octeon-wdt-main.c index 0ec419a3f7ed..fde9e739b436 100644 --- a/drivers/watchdog/octeon-wdt-main.c +++ b/drivers/watchdog/octeon-wdt-main.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Octeon Watchdog driver * @@ -10,22 +11,12 @@ * (c) Copyright 1996-1997 Alan Cox <alan@lxorguk.ukuu.org.uk>, * All Rights Reserved. * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Neither Alan Cox nor CymruNet Ltd. admit liability nor provide * warranty for any of this software. This material is provided * "AS-IS" and at no charge. * * (c) Copyright 1995 Alan Cox <alan@lxorguk.ukuu.org.uk> * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * * The OCTEON watchdog has a maximum timeout of 2^32 * io_clock. 
* For most systems this is less than 10 seconds, so to allow for * software to request longer watchdog heartbeats, we maintain software diff --git a/drivers/watchdog/of_xilinx_wdt.c b/drivers/watchdog/of_xilinx_wdt.c index 03786992b701..7fe4f7c3f7ce 100644 --- a/drivers/watchdog/of_xilinx_wdt.c +++ b/drivers/watchdog/of_xilinx_wdt.c @@ -238,10 +238,8 @@ static int xwdt_probe(struct platform_device *pdev) } rc = devm_watchdog_register_device(dev, xilinx_wdt_wdd); - if (rc) { - dev_err(dev, "Cannot register watchdog (err=%d)\n", rc); + if (rc) return rc; - } clk_disable(xdev->clk); diff --git a/drivers/watchdog/omap_wdt.c b/drivers/watchdog/omap_wdt.c index d49688d93f6a..9b91882fe3c4 100644 --- a/drivers/watchdog/omap_wdt.c +++ b/drivers/watchdog/omap_wdt.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * omap_wdt.c * @@ -6,10 +7,7 @@ * Author: MontaVista Software, Inc. * <gdavis@mvista.com> or <source@mvista.com> * - * 2003 (c) MontaVista Software, Inc. This file is licensed under the - * terms of the GNU General Public License version 2. This program is - * licensed "as is" without any warranty of any kind, whether express - * or implied. + * 2003 (c) MontaVista Software, Inc. * * History: * diff --git a/drivers/watchdog/omap_wdt.h b/drivers/watchdog/omap_wdt.h index 42f31ec5e90d..950b4643f3e7 100644 --- a/drivers/watchdog/omap_wdt.h +++ b/drivers/watchdog/omap_wdt.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * linux/drivers/char/watchdog/omap_wdt.h * @@ -5,26 +6,6 @@ * OMAP Watchdog timer register definitions * * Copyright (C) 2004 Texas Instruments. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN - * NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF - * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON - * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef _OMAP_WATCHDOG_H diff --git a/drivers/watchdog/pc87413_wdt.c b/drivers/watchdog/pc87413_wdt.c index ca21d6c240a3..2af1a8b3f973 100644 --- a/drivers/watchdog/pc87413_wdt.c +++ b/drivers/watchdog/pc87413_wdt.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * NS pc87413-wdt Watchdog Timer driver for Linux 2.6.x.x * @@ -6,11 +7,6 @@ * (C) Copyright 2006 Sven Anders, <anders@anduras.de> * and Marcus Junker, <junker@anduras.de> * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- * * Neither Sven Anders, Marcus Junker nor ANDURAS AG * admit liability nor provide warranty for any of this software. * This material is provided "AS-IS" and at no charge. diff --git a/drivers/watchdog/pcwd_pci.c b/drivers/watchdog/pcwd_pci.c index 5773d2591d3f..e30c1f762045 100644 --- a/drivers/watchdog/pcwd_pci.c +++ b/drivers/watchdog/pcwd_pci.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Berkshire PCI-PC Watchdog Card Driver * @@ -10,11 +11,6 @@ * Matt Domsch <Matt_Domsch@dell.com>, * Rob Radez <rob@osinvestor.com> * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Neither Wim Van Sebroeck nor Iguana vzw. admit liability nor * provide warranty for any of this software. This material is * provided "AS-IS" and at no charge. diff --git a/drivers/watchdog/pcwd_usb.c b/drivers/watchdog/pcwd_usb.c index 5de6182dae33..6727f8ab2d18 100644 --- a/drivers/watchdog/pcwd_usb.c +++ b/drivers/watchdog/pcwd_usb.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Berkshire USB-PC Watchdog Card Driver * @@ -10,11 +11,6 @@ * Rob Radez <rob@osinvestor.com>, * Greg Kroah-Hartman <greg@kroah.com> * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Neither Wim Van Sebroeck nor Iguana vzw. admit liability nor * provide warranty for any of this software. This material is * provided "AS-IS" and at no charge. diff --git a/drivers/watchdog/pic32-dmt.c b/drivers/watchdog/pic32-dmt.c index 4f2aca78f13a..f43062b3c4c8 100644 --- a/drivers/watchdog/pic32-dmt.c +++ b/drivers/watchdog/pic32-dmt.c @@ -212,10 +212,8 @@ static int pic32_dmt_probe(struct platform_device *pdev) watchdog_set_drvdata(wdd, dmt); ret = devm_watchdog_register_device(dev, wdd); - if (ret) { - dev_err(dev, "watchdog register failed, err %d\n", ret); + if (ret) return ret; - } platform_set_drvdata(pdev, wdd); return 0; diff --git a/drivers/watchdog/pic32-wdt.c b/drivers/watchdog/pic32-wdt.c index 5ecdd880f0b7..41715d68d9e9 100644 --- a/drivers/watchdog/pic32-wdt.c +++ b/drivers/watchdog/pic32-wdt.c @@ -221,10 +221,8 @@ static int pic32_wdt_drv_probe(struct platform_device *pdev) watchdog_set_drvdata(wdd, wdt); ret = devm_watchdog_register_device(dev, wdd); - if (ret) { - dev_err(dev, "watchdog register failed, err %d\n", ret); + if (ret) return ret; - } platform_set_drvdata(pdev, wdd); diff --git a/drivers/watchdog/pnx4008_wdt.c b/drivers/watchdog/pnx4008_wdt.c index d9e03544aeae..7b446b696f2b 100644 --- a/drivers/watchdog/pnx4008_wdt.c +++ b/drivers/watchdog/pnx4008_wdt.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * drivers/char/watchdog/pnx4008_wdt.c * @@ -11,10 +12,6 @@ * 2005-2006 (c) MontaVista Software, Inc. * * (C) 2012 Wolfram Sang, Pengutronix - * - * This file is licensed under the terms of the GNU General Public License - * version 2. This program is licensed "as is" without any warranty of any - * kind, whether express or implied. 
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -221,10 +218,8 @@ static int pnx4008_wdt_probe(struct platform_device *pdev) set_bit(WDOG_HW_RUNNING, &pnx4008_wdd.status); ret = devm_watchdog_register_device(dev, &pnx4008_wdd); - if (ret < 0) { - dev_err(dev, "cannot register watchdog device\n"); + if (ret < 0) return ret; - } dev_info(dev, "heartbeat %d sec\n", pnx4008_wdd.timeout); diff --git a/drivers/watchdog/qcom-wdt.c b/drivers/watchdog/qcom-wdt.c index fc0f7e5de38d..7be7f87be28f 100644 --- a/drivers/watchdog/qcom-wdt.c +++ b/drivers/watchdog/qcom-wdt.c @@ -223,10 +223,8 @@ static int qcom_wdt_probe(struct platform_device *pdev) watchdog_init_timeout(&wdt->wdd, 0, dev); ret = devm_watchdog_register_device(dev, &wdt->wdd); - if (ret) { - dev_err(dev, "failed to register watchdog\n"); + if (ret) return ret; - } platform_set_drvdata(pdev, wdt); return 0; diff --git a/drivers/watchdog/rave-sp-wdt.c b/drivers/watchdog/rave-sp-wdt.c index 35db173252f9..2c95615b6354 100644 --- a/drivers/watchdog/rave-sp-wdt.c +++ b/drivers/watchdog/rave-sp-wdt.c @@ -310,7 +310,6 @@ static int rave_sp_wdt_probe(struct platform_device *pdev) ret = devm_watchdog_register_device(dev, wdd); if (ret) { - dev_err(dev, "Failed to register watchdog device\n"); rave_sp_wdt_stop(wdd); return ret; } diff --git a/drivers/watchdog/renesas_wdt.c b/drivers/watchdog/renesas_wdt.c index 565dbc1ec638..00662a8e039c 100644 --- a/drivers/watchdog/renesas_wdt.c +++ b/drivers/watchdog/renesas_wdt.c @@ -7,6 +7,7 @@ */ #include <linux/bitops.h> #include <linux/clk.h> +#include <linux/delay.h> #include <linux/io.h> #include <linux/kernel.h> #include <linux/module.h> @@ -70,6 +71,15 @@ static int rwdt_init_timeout(struct watchdog_device *wdev) return 0; } +static void rwdt_wait_cycles(struct rwdt_priv *priv, unsigned int cycles) +{ + unsigned int delay; + + delay = DIV_ROUND_UP(cycles * 1000000, priv->clk_rate); + + usleep_range(delay, 2 * delay); +} + static int rwdt_start(struct watchdog_device *wdev) { struct rwdt_priv *priv = watchdog_get_drvdata(wdev); @@ -80,6 +90,8 @@ static int rwdt_start(struct watchdog_device *wdev) /* Stop the timer before we modify any register */ val = readb_relaxed(priv->base + RWTCSRA) & ~RWTCSRA_TME; rwdt_write(priv, val, RWTCSRA); + /* Delay 2 cycles before setting watchdog counter */ + rwdt_wait_cycles(priv, 2); rwdt_init_timeout(wdev); rwdt_write(priv, priv->cks, RWTCSRA); @@ -98,6 +110,8 @@ static int rwdt_stop(struct watchdog_device *wdev) struct rwdt_priv *priv = watchdog_get_drvdata(wdev); rwdt_write(priv, priv->cks, RWTCSRA); + /* Delay 3 cycles before disabling module clock */ + rwdt_wait_cycles(priv, 3); pm_runtime_put(wdev->parent); return 0; @@ -175,15 +189,16 @@ static inline bool rwdt_blacklisted(struct device *dev) { return false; } static int rwdt_probe(struct platform_device *pdev) { + struct device *dev = &pdev->dev; struct rwdt_priv *priv; struct clk *clk; unsigned long clks_per_sec; int ret, i; - if (rwdt_blacklisted(&pdev->dev)) + if (rwdt_blacklisted(dev)) return -ENODEV; - priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL); + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; @@ -191,16 +206,16 @@ static int rwdt_probe(struct platform_device *pdev) if (IS_ERR(priv->base)) return PTR_ERR(priv->base); - clk = devm_clk_get(&pdev->dev, NULL); + clk = devm_clk_get(dev, NULL); if (IS_ERR(clk)) return PTR_ERR(clk); - pm_runtime_enable(&pdev->dev); - pm_runtime_get_sync(&pdev->dev); + pm_runtime_enable(dev); + pm_runtime_get_sync(dev); 
priv->clk_rate = clk_get_rate(clk); priv->wdev.bootstatus = (readb_relaxed(priv->base + RWTCSRA) & RWTCSRA_WOVF) ? WDIOF_CARDRESET : 0; - pm_runtime_put(&pdev->dev); + pm_runtime_put(dev); if (!priv->clk_rate) { ret = -ENOENT; @@ -216,14 +231,14 @@ static int rwdt_probe(struct platform_device *pdev) } if (i < 0) { - dev_err(&pdev->dev, "Can't find suitable clock divider\n"); + dev_err(dev, "Can't find suitable clock divider\n"); ret = -ERANGE; goto out_pm_disable; } priv->wdev.info = &rwdt_ident; priv->wdev.ops = &rwdt_ops; - priv->wdev.parent = &pdev->dev; + priv->wdev.parent = dev; priv->wdev.min_timeout = 1; priv->wdev.max_timeout = DIV_BY_CLKS_PER_SEC(priv, 65536); priv->wdev.timeout = min(priv->wdev.max_timeout, RWDT_DEFAULT_TIMEOUT); @@ -235,7 +250,7 @@ static int rwdt_probe(struct platform_device *pdev) watchdog_stop_on_unregister(&priv->wdev); /* This overrides the default timeout only if DT configuration was found */ - watchdog_init_timeout(&priv->wdev, 0, &pdev->dev); + watchdog_init_timeout(&priv->wdev, 0, dev); ret = watchdog_register_device(&priv->wdev); if (ret < 0) @@ -244,7 +259,7 @@ static int rwdt_probe(struct platform_device *pdev) return 0; out_pm_disable: - pm_runtime_disable(&pdev->dev); + pm_runtime_disable(dev); return ret; } diff --git a/drivers/watchdog/retu_wdt.c b/drivers/watchdog/retu_wdt.c index 39cd51df2ffc..258dfcf9cbda 100644 --- a/drivers/watchdog/retu_wdt.c +++ b/drivers/watchdog/retu_wdt.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Retu watchdog driver * @@ -5,15 +6,6 @@ * * Based on code written by Amit Kucheria and Michael Buesch. * Rewritten by Aaro Koskinen. - * - * This file is subject to the terms and conditions of the GNU General - * Public License. See the file "COPYING" in the main directory of this - * archive for more details. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. */ #include <linux/slab.h> diff --git a/drivers/watchdog/s3c2410_wdt.c b/drivers/watchdog/s3c2410_wdt.c index daf3bf0d86b8..2395f353e52d 100644 --- a/drivers/watchdog/s3c2410_wdt.c +++ b/drivers/watchdog/s3c2410_wdt.c @@ -606,10 +606,8 @@ static int s3c2410wdt_probe(struct platform_device *pdev) wdt->wdt_device.parent = dev; ret = watchdog_register_device(&wdt->wdt_device); - if (ret) { - dev_err(dev, "cannot register watchdog (%d)\n", ret); + if (ret) goto err_cpufreq; - } ret = s3c2410wdt_mask_and_disable_reset(wdt, false); if (ret < 0) diff --git a/drivers/watchdog/sa1100_wdt.c b/drivers/watchdog/sa1100_wdt.c index bfa035e1a75e..cbd8c957182f 100644 --- a/drivers/watchdog/sa1100_wdt.c +++ b/drivers/watchdog/sa1100_wdt.c @@ -1,14 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Watchdog driver for the SA11x0/PXA2xx * * (c) Copyright 2000 Oleg Drokin <green@crimea.edu> * Based on SoftDog driver by Alan Cox <alan@lxorguk.ukuu.org.uk> * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Neither Oleg Drokin nor iXcelerator.com admit liability nor provide * warranty for any of this software. This material is provided * "AS-IS" and at no charge. 
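To put numbers on the new renesas_wdt delays above (assuming the RWDT module clock is the usual 32.768 kHz RCLK, which the patch itself does not state): the three-cycle wait in rwdt_stop() computes DIV_ROUND_UP(3 * 1000000, 32768) = 92 and therefore sleeps in a usleep_range(92, 184) window, while the two-cycle wait in rwdt_start() works out to 62 us; both exist to leave the hardware a couple of counter cycles before the counter is rewritten or the module clock is dropped.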
diff --git a/drivers/watchdog/sama5d4_wdt.c b/drivers/watchdog/sama5d4_wdt.c index b8da1bf21e12..d193a60430b2 100644 --- a/drivers/watchdog/sama5d4_wdt.c +++ b/drivers/watchdog/sama5d4_wdt.c @@ -110,9 +110,7 @@ static int sama5d4_wdt_set_timeout(struct watchdog_device *wdd, u32 value = WDT_SEC2TICKS(timeout); wdt->mr &= ~AT91_WDT_WDV; - wdt->mr &= ~AT91_WDT_WDD; wdt->mr |= AT91_WDT_SET_WDV(value); - wdt->mr |= AT91_WDT_SET_WDD(value); /* * WDDIS has to be 0 when updating WDD/WDV. The datasheet states: When @@ -248,7 +246,7 @@ static int sama5d4_wdt_probe(struct platform_device *pdev) timeout = WDT_SEC2TICKS(wdd->timeout); - wdt->mr |= AT91_WDT_SET_WDD(timeout); + wdt->mr |= AT91_WDT_SET_WDD(WDT_SEC2TICKS(MAX_WDT_TIMEOUT)); wdt->mr |= AT91_WDT_SET_WDV(timeout); ret = sama5d4_wdt_init(wdt); @@ -259,10 +257,8 @@ static int sama5d4_wdt_probe(struct platform_device *pdev) watchdog_stop_on_unregister(wdd); ret = devm_watchdog_register_device(dev, wdd); - if (ret) { - dev_err(dev, "failed to register watchdog device\n"); + if (ret) return ret; - } platform_set_drvdata(pdev, wdt); @@ -279,7 +275,17 @@ static const struct of_device_id sama5d4_wdt_of_match[] = { MODULE_DEVICE_TABLE(of, sama5d4_wdt_of_match); #ifdef CONFIG_PM_SLEEP -static int sama5d4_wdt_resume(struct device *dev) +static int sama5d4_wdt_suspend_late(struct device *dev) +{ + struct sama5d4_wdt *wdt = dev_get_drvdata(dev); + + if (watchdog_active(&wdt->wdd)) + sama5d4_wdt_stop(&wdt->wdd); + + return 0; +} + +static int sama5d4_wdt_resume_early(struct device *dev) { struct sama5d4_wdt *wdt = dev_get_drvdata(dev); @@ -290,12 +296,17 @@ static int sama5d4_wdt_resume(struct device *dev) */ sama5d4_wdt_init(wdt); + if (watchdog_active(&wdt->wdd)) + sama5d4_wdt_start(&wdt->wdd); + return 0; } #endif -static SIMPLE_DEV_PM_OPS(sama5d4_wdt_pm_ops, NULL, - sama5d4_wdt_resume); +static const struct dev_pm_ops sama5d4_wdt_pm_ops = { + SET_LATE_SYSTEM_SLEEP_PM_OPS(sama5d4_wdt_suspend_late, + sama5d4_wdt_resume_early) +}; static struct platform_driver sama5d4_wdt_driver = { .probe = sama5d4_wdt_probe, diff --git a/drivers/watchdog/sbc7240_wdt.c b/drivers/watchdog/sbc7240_wdt.c index efc81b318939..12cdee7d5069 100644 --- a/drivers/watchdog/sbc7240_wdt.c +++ b/drivers/watchdog/sbc7240_wdt.c @@ -1,19 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 /* * NANO7240 SBC Watchdog device driver * * Based on w83877f.c by Scott Jennings, * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation; - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or - * implied. See the License for the specific language governing - * rights and limitations under the License. - * * (c) Copyright 2007 Gilles GIGAN <gilles.gigan@jcu.edu.au> - * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/drivers/watchdog/sbc8360.c b/drivers/watchdog/sbc8360.c index 3396024e7b76..4f8b9912fc51 100644 --- a/drivers/watchdog/sbc8360.c +++ b/drivers/watchdog/sbc8360.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * SBC8360 Watchdog driver * @@ -19,11 +20,6 @@ * (c) Copyright 1996 Alan Cox <alan@lxorguk.ukuu.org.uk>, * All Rights Reserved. 
* - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Neither Alan Cox nor CymruNet Ltd. admit liability nor provide * warranty for any of this software. This material is provided * "AS-IS" and at no charge. diff --git a/drivers/watchdog/sch311x_wdt.c b/drivers/watchdog/sch311x_wdt.c index ed6e9fac5d74..3612f1df381b 100644 --- a/drivers/watchdog/sch311x_wdt.c +++ b/drivers/watchdog/sch311x_wdt.c @@ -1,14 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * sch311x_wdt.c - Driver for the SCH311x Super-I/O chips * integrated watchdog. * * (c) Copyright 2008 Wim Van Sebroeck <wim@iguana.be>. * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Neither Wim Van Sebroeck nor Iguana vzw. admit liability nor * provide warranty for any of this software. This material is * provided "AS-IS" and at no charge. diff --git a/drivers/watchdog/softdog.c b/drivers/watchdog/softdog.c index 060740625485..3e4885c1545e 100644 --- a/drivers/watchdog/softdog.c +++ b/drivers/watchdog/softdog.c @@ -1,14 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * SoftDog: A Software Watchdog Device * * (c) Copyright 1996 Alan Cox <alan@lxorguk.ukuu.org.uk>, * All Rights Reserved. * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Neither Alan Cox nor CymruNet Ltd. admit liability nor provide * warranty for any of this software. This material is provided * "AS-IS" and at no charge. diff --git a/drivers/watchdog/sp5100_tco.c b/drivers/watchdog/sp5100_tco.c index cd4430ff9b1c..93bd302ae7c5 100644 --- a/drivers/watchdog/sp5100_tco.c +++ b/drivers/watchdog/sp5100_tco.c @@ -402,10 +402,8 @@ static int sp5100_tco_probe(struct platform_device *pdev) return ret; ret = devm_watchdog_register_device(dev, wdd); - if (ret) { - dev_err(dev, "cannot register watchdog device (err=%d)\n", ret); + if (ret) return ret; - } /* Show module parameters */ dev_info(dev, "initialized. 
heartbeat=%d sec (nowayout=%d)\n", diff --git a/drivers/watchdog/sp805_wdt.c b/drivers/watchdog/sp805_wdt.c index 072986d461b7..53e04926a7b2 100644 --- a/drivers/watchdog/sp805_wdt.c +++ b/drivers/watchdog/sp805_wdt.c @@ -288,11 +288,8 @@ sp805_wdt_probe(struct amba_device *adev, const struct amba_id *id) } ret = watchdog_register_device(&wdt->wdd); - if (ret) { - dev_err(&adev->dev, "watchdog_register_device() failed: %d\n", - ret); + if (ret) goto err; - } amba_set_drvdata(adev, wdt); dev_info(&adev->dev, "registration successful\n"); diff --git a/drivers/watchdog/sprd_wdt.c b/drivers/watchdog/sprd_wdt.c index 916fb3f96bdc..edba4e278685 100644 --- a/drivers/watchdog/sprd_wdt.c +++ b/drivers/watchdog/sprd_wdt.c @@ -320,7 +320,6 @@ static int sprd_wdt_probe(struct platform_device *pdev) ret = devm_watchdog_register_device(dev, &wdt->wdd); if (ret) { sprd_wdt_disable(wdt); - dev_err(dev, "failed to register watchdog\n"); return ret; } platform_set_drvdata(pdev, wdt); diff --git a/drivers/watchdog/st_lpc_wdt.c b/drivers/watchdog/st_lpc_wdt.c index 7a90184eb950..14ab6559c748 100644 --- a/drivers/watchdog/st_lpc_wdt.c +++ b/drivers/watchdog/st_lpc_wdt.c @@ -228,10 +228,8 @@ static int st_wdog_probe(struct platform_device *pdev) return ret; ret = devm_watchdog_register_device(dev, &st_wdog_dev); - if (ret) { - dev_err(dev, "Unable to register watchdog\n"); + if (ret) return ret; - } st_wdog_setup(st_wdog, true); diff --git a/drivers/watchdog/stm32_iwdg.c b/drivers/watchdog/stm32_iwdg.c index d569a3634d9b..a3a329011a06 100644 --- a/drivers/watchdog/stm32_iwdg.c +++ b/drivers/watchdog/stm32_iwdg.c @@ -263,10 +263,8 @@ static int stm32_iwdg_probe(struct platform_device *pdev) watchdog_init_timeout(wdd, 0, dev); ret = devm_watchdog_register_device(dev, wdd); - if (ret) { - dev_err(dev, "failed to register watchdog device\n"); + if (ret) return ret; - } platform_set_drvdata(pdev, wdt); diff --git a/drivers/watchdog/stmp3xxx_rtc_wdt.c b/drivers/watchdog/stmp3xxx_rtc_wdt.c index 671f4ba7b4ed..7caf3aa71c6a 100644 --- a/drivers/watchdog/stmp3xxx_rtc_wdt.c +++ b/drivers/watchdog/stmp3xxx_rtc_wdt.c @@ -98,10 +98,8 @@ static int stmp3xxx_wdt_probe(struct platform_device *pdev) stmp3xxx_wdd.parent = dev; ret = devm_watchdog_register_device(dev, &stmp3xxx_wdd); - if (ret < 0) { - dev_err(dev, "cannot register watchdog device\n"); + if (ret < 0) return ret; - } if (register_reboot_notifier(&wdt_notifier)) dev_warn(dev, "cannot register reboot notifier\n"); diff --git a/drivers/watchdog/tegra_wdt.c b/drivers/watchdog/tegra_wdt.c index a58b000acc4f..dfe06e506cad 100644 --- a/drivers/watchdog/tegra_wdt.c +++ b/drivers/watchdog/tegra_wdt.c @@ -219,10 +219,8 @@ static int tegra_wdt_probe(struct platform_device *pdev) watchdog_stop_on_unregister(wdd); ret = devm_watchdog_register_device(dev, wdd); - if (ret) { - dev_err(dev, "failed to register watchdog device\n"); + if (ret) return ret; - } platform_set_drvdata(pdev, wdt); diff --git a/drivers/watchdog/ts4800_wdt.c b/drivers/watchdog/ts4800_wdt.c index 9dc6d7f45806..c137ad2bd5c3 100644 --- a/drivers/watchdog/ts4800_wdt.c +++ b/drivers/watchdog/ts4800_wdt.c @@ -171,10 +171,8 @@ static int ts4800_wdt_probe(struct platform_device *pdev) ts4800_wdt_stop(wdd); ret = devm_watchdog_register_device(dev, wdd); - if (ret) { - dev_err(dev, "failed to register watchdog device\n"); + if (ret) return ret; - } platform_set_drvdata(pdev, wdt); diff --git a/drivers/watchdog/w83627hf_wdt.c b/drivers/watchdog/w83627hf_wdt.c index 3a49ba9ea608..38b31e9947aa 100644 --- 
a/drivers/watchdog/w83627hf_wdt.c +++ b/drivers/watchdog/w83627hf_wdt.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * w83627hf/thf WDT driver * @@ -17,11 +18,6 @@ * (c) Copyright 1996 Alan Cox <alan@lxorguk.ukuu.org.uk>, * All Rights Reserved. * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Neither Alan Cox nor CymruNet Ltd. admit liability nor provide * warranty for any of this software. This material is provided * "AS-IS" and at no charge. diff --git a/drivers/watchdog/wafer5823wdt.c b/drivers/watchdog/wafer5823wdt.c index 0a8073b419f8..6d2071a0590d 100644 --- a/drivers/watchdog/wafer5823wdt.c +++ b/drivers/watchdog/wafer5823wdt.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * ICP Wafer 5823 Single Board Computer WDT driver * http://www.icpamerica.com/wafer_5823.php @@ -13,11 +14,6 @@ * (c) Copyright 1996-1997 Alan Cox <alan@lxorguk.ukuu.org.uk>, * All Rights Reserved. * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Neither Alan Cox nor CymruNet Ltd. admit liability nor provide * warranty for any of this software. This material is provided * "AS-IS" and at no charge. diff --git a/drivers/watchdog/watchdog_core.c b/drivers/watchdog/watchdog_core.c index 62be9e52a4de..21e8085b848b 100644 --- a/drivers/watchdog/watchdog_core.c +++ b/drivers/watchdog/watchdog_core.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * watchdog_core.c * @@ -16,11 +17,6 @@ * Satyam Sharma <satyam@infradead.org> * Randy Dunlap <randy.dunlap@oracle.com> * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Neither Alan Cox, CymruNet Ltd., Wim Van Sebroeck nor Iguana vzw. * admit liability nor provide warranty for any of this software. * This material is provided "AS-IS" and at no charge. @@ -60,11 +56,10 @@ static DEFINE_MUTEX(wtd_deferred_reg_mutex); static LIST_HEAD(wtd_deferred_reg_list); static bool wtd_deferred_reg_done; -static int watchdog_deferred_registration_add(struct watchdog_device *wdd) +static void watchdog_deferred_registration_add(struct watchdog_device *wdd) { list_add_tail(&wdd->deferred, &wtd_deferred_reg_list); - return 0; } static void watchdog_deferred_registration_del(struct watchdog_device *wdd) @@ -265,14 +260,23 @@ static int __watchdog_register_device(struct watchdog_device *wdd) int watchdog_register_device(struct watchdog_device *wdd) { - int ret; + const char *dev_str; + int ret = 0; mutex_lock(&wtd_deferred_reg_mutex); if (wtd_deferred_reg_done) ret = __watchdog_register_device(wdd); else - ret = watchdog_deferred_registration_add(wdd); + watchdog_deferred_registration_add(wdd); mutex_unlock(&wtd_deferred_reg_mutex); + + if (ret) { + dev_str = wdd->parent ? 
dev_name(wdd->parent) : + (const char *)wdd->info->identity; + pr_err("%s: failed to register watchdog device (err = %d)\n", + dev_str, ret); + } + return ret; } EXPORT_SYMBOL_GPL(watchdog_register_device); diff --git a/drivers/watchdog/watchdog_core.h b/drivers/watchdog/watchdog_core.h index 86ff962d1e15..a5062e8e0d13 100644 --- a/drivers/watchdog/watchdog_core.h +++ b/drivers/watchdog/watchdog_core.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * watchdog_core.h * @@ -16,11 +17,6 @@ * Satyam Sharma <satyam@infradead.org> * Randy Dunlap <randy.dunlap@oracle.com> * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Neither Alan Cox, CymruNet Ltd., Wim Van Sebroeck nor Iguana vzw. * admit liability nor provide warranty for any of this software. * This material is provided "AS-IS" and at no charge. diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c index 252a7c7b6592..dbd2ad4c9294 100644 --- a/drivers/watchdog/watchdog_dev.c +++ b/drivers/watchdog/watchdog_dev.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * watchdog_dev.c * @@ -20,11 +21,6 @@ * Satyam Sharma <satyam@infradead.org> * Randy Dunlap <randy.dunlap@oracle.com> * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Neither Alan Cox, CymruNet Ltd., Wim Van Sebroeck nor Iguana vzw. * admit liability nor provide warranty for any of this software. * This material is provided "AS-IS" and at no charge. @@ -69,6 +65,7 @@ struct watchdog_core_data { struct mutex lock; ktime_t last_keepalive; ktime_t last_hw_keepalive; + ktime_t open_deadline; struct hrtimer timer; struct kthread_work work; unsigned long status; /* Internal status bits */ @@ -87,6 +84,19 @@ static struct kthread_worker *watchdog_kworker; static bool handle_boot_enabled = IS_ENABLED(CONFIG_WATCHDOG_HANDLE_BOOT_ENABLED); +static unsigned open_timeout = CONFIG_WATCHDOG_OPEN_TIMEOUT; + +static bool watchdog_past_open_deadline(struct watchdog_core_data *data) +{ + return ktime_after(ktime_get(), data->open_deadline); +} + +static void watchdog_set_open_deadline(struct watchdog_core_data *data) +{ + data->open_deadline = open_timeout ? 
+ ktime_get() + ktime_set(open_timeout, 0) : KTIME_MAX; +} + static inline bool watchdog_need_worker(struct watchdog_device *wdd) { /* All variables in milli-seconds */ @@ -119,14 +129,15 @@ static ktime_t watchdog_next_keepalive(struct watchdog_device *wdd) ktime_t virt_timeout; unsigned int hw_heartbeat_ms; - virt_timeout = ktime_add(wd_data->last_keepalive, - ms_to_ktime(timeout_ms)); + if (watchdog_active(wdd)) + virt_timeout = ktime_add(wd_data->last_keepalive, + ms_to_ktime(timeout_ms)); + else + virt_timeout = wd_data->open_deadline; + hw_heartbeat_ms = min_not_zero(timeout_ms, wdd->max_hw_heartbeat_ms); keepalive_interval = ms_to_ktime(hw_heartbeat_ms / 2); - if (!watchdog_active(wdd)) - return keepalive_interval; - /* * To ensure that the watchdog times out wdd->timeout seconds * after the most recent ping from userspace, the last @@ -211,7 +222,13 @@ static bool watchdog_worker_should_ping(struct watchdog_core_data *wd_data) { struct watchdog_device *wdd = wd_data->wdd; - return wdd && (watchdog_active(wdd) || watchdog_hw_running(wdd)); + if (!wdd) + return false; + + if (watchdog_active(wdd)) + return true; + + return watchdog_hw_running(wdd) && !watchdog_past_open_deadline(wd_data); } static void watchdog_ping_work(struct kthread_work *work) @@ -824,6 +841,15 @@ static int watchdog_open(struct inode *inode, struct file *file) if (!hw_running) kref_get(&wd_data->kref); + /* + * open_timeout only applies for the first open from + * userspace. Set open_deadline to infinity so that the kernel + * will take care of an always-running hardware watchdog in + * case the device gets magic-closed or WDIOS_DISABLECARD is + * applied. + */ + wd_data->open_deadline = KTIME_MAX; + /* dev/watchdog is a virtual (and thus non-seekable) filesystem */ return stream_open(inode, file); @@ -983,6 +1009,7 @@ static int watchdog_cdev_register(struct watchdog_device *wdd, dev_t devno) /* Record time of most recent heartbeat as 'just before now'. */ wd_data->last_hw_keepalive = ktime_sub(ktime_get(), 1); + watchdog_set_open_deadline(wd_data); /* * If the watchdog is running, prevent its driver from being unloaded, @@ -1181,3 +1208,8 @@ module_param(handle_boot_enabled, bool, 0444); MODULE_PARM_DESC(handle_boot_enabled, "Watchdog core auto-updates boot enabled watchdogs before userspace takes over (default=" __MODULE_STRING(IS_ENABLED(CONFIG_WATCHDOG_HANDLE_BOOT_ENABLED)) ")"); + +module_param(open_timeout, uint, 0644); +MODULE_PARM_DESC(open_timeout, + "Maximum time (in seconds, 0 means infinity) for userspace to take over a running watchdog (default=" + __MODULE_STRING(CONFIG_WATCHDOG_OPEN_TIMEOUT) ")"); diff --git a/drivers/watchdog/wd501p.h b/drivers/watchdog/wd501p.h index 0e3a497d5626..43a4d88fd363 100644 --- a/drivers/watchdog/wd501p.h +++ b/drivers/watchdog/wd501p.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-1.0+ */ /* * Industrial Computer Source WDT500/501 driver * @@ -11,12 +12,7 @@ * * http://www.cymru.net * - * This driver is provided under the GNU General Public License, - * incorporated herein by reference. The driver is provided without - * warranty or support. - * * Release 0.04. - * */ diff --git a/drivers/watchdog/wdt.c b/drivers/watchdog/wdt.c index 3d2f5ed60e88..0650100fad00 100644 --- a/drivers/watchdog/wdt.c +++ b/drivers/watchdog/wdt.c @@ -1,14 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Industrial Computer Source WDT501 driver * * (c) Copyright 1996-1997 Alan Cox <alan@lxorguk.ukuu.org.uk>, * All Rights Reserved. 
* - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Neither Alan Cox nor CymruNet Ltd. admit liability nor provide * warranty for any of this software. This material is provided * "AS-IS" and at no charge. diff --git a/drivers/watchdog/wdt_pci.c b/drivers/watchdog/wdt_pci.c index ff3a41f47127..66303ab95685 100644 --- a/drivers/watchdog/wdt_pci.c +++ b/drivers/watchdog/wdt_pci.c @@ -1,14 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Industrial Computer Source PCI-WDT500/501 driver * * (c) Copyright 1996-1997 Alan Cox <alan@lxorguk.ukuu.org.uk>, * All Rights Reserved. * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Neither Alan Cox nor CymruNet Ltd. admit liability nor provide * warranty for any of this software. This material is provided * "AS-IS" and at no charge. diff --git a/drivers/watchdog/wm831x_wdt.c b/drivers/watchdog/wm831x_wdt.c index 9b6565a3fab4..030ce240620d 100644 --- a/drivers/watchdog/wm831x_wdt.c +++ b/drivers/watchdog/wm831x_wdt.c @@ -267,14 +267,7 @@ static int wm831x_wdt_probe(struct platform_device *pdev) } } - ret = devm_watchdog_register_device(dev, &driver_data->wdt); - if (ret != 0) { - dev_err(wm831x->dev, "watchdog_register_device() failed: %d\n", - ret); - return ret; - } - - return 0; + return devm_watchdog_register_device(dev, &driver_data->wdt); } static struct platform_driver wm831x_wdt_driver = { diff --git a/drivers/watchdog/xen_wdt.c b/drivers/watchdog/xen_wdt.c index 2ba0a3c4523c..b343f421dc72 100644 --- a/drivers/watchdog/xen_wdt.c +++ b/drivers/watchdog/xen_wdt.c @@ -138,10 +138,8 @@ static int xen_wdt_probe(struct platform_device *pdev) watchdog_stop_on_unregister(&xen_wdt_dev); ret = devm_watchdog_register_device(dev, &xen_wdt_dev); - if (ret) { - dev_err(dev, "cannot register watchdog device (%d)\n", ret); + if (ret) return ret; - } dev_info(dev, "initialized (timeout=%ds, nowayout=%d)\n", xen_wdt_dev.timeout, nowayout); diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index d53f3493a6b9..cfbe46785a3b 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -402,7 +402,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir, attrs); - if (map == DMA_MAPPING_ERROR) + if (map == (phys_addr_t)DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; dev_addr = xen_phys_to_bus(map);
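The closing swiotlb-xen hunk is about integer width rather than logic: swiotlb_tbl_map_single() returns a phys_addr_t, while DMA_MAPPING_ERROR is defined as an all-ones dma_addr_t, so on a configuration where phys_addr_t is narrower than dma_addr_t the uncast comparison can never be true and a mapping failure would be missed. A small standalone sketch of that mismatch, using illustrative typedef widths rather than the kernel's own definitions:

#include <stdint.h>
#include <stdio.h>

/* Illustrative widths: a 32-bit phys_addr_t alongside a 64-bit dma_addr_t. */
typedef uint32_t phys_addr_t;
typedef uint64_t dma_addr_t;
#define DMA_MAPPING_ERROR (~(dma_addr_t)0)

int main(void)
{
	/* A failed mapping hands the sentinel back truncated to phys_addr_t. */
	phys_addr_t map = (phys_addr_t)DMA_MAPPING_ERROR;	/* 0xffffffff */

	/* Uncast: map promotes to 64 bits, and 0xffffffff never equals ~0ULL
	 * (some compilers even warn that this comparison is always false). */
	printf("uncast:    %s\n",
	       map == DMA_MAPPING_ERROR ? "error caught" : "error missed");

	/* The patched form truncates the sentinel the same way, so it matches. */
	printf("with cast: %s\n",
	       map == (phys_addr_t)DMA_MAPPING_ERROR ? "error caught" : "error missed");
	return 0;
}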
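Returning to the watchdog_dev.c additions earlier in this section: open_timeout only bounds the window before userspace first opens /dev/watchdog; watchdog_open() then raises open_deadline to KTIME_MAX, so a later magic close or WDIOS_DISABLECARD still leaves the kernel worker responsible for an always-running hardware watchdog. What the core is waiting for during that window is an ordinary watchdog client; a minimal, hypothetical userspace example (not part of this patch) that takes over, pings, and magic-closes:

/* Hypothetical userspace client; not part of this patch. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/watchdog.h>

int main(void)
{
	int fd = open("/dev/watchdog", O_WRONLY);

	if (fd < 0) {
		perror("open /dev/watchdog");
		return 1;
	}

	for (int i = 0; i < 10; i++) {
		ioctl(fd, WDIOC_KEEPALIVE, 0);	/* keep the watchdog happy */
		sleep(1);
	}

	/* Magic close: writing 'V' marks the close as intentional so a
	 * driver built without nowayout may stop the watchdog. */
	if (write(fd, "V", 1) != 1)
		perror("magic close");
	close(fd);
	return 0;
}

If no such open happens before the deadline, watchdog_worker_should_ping() stops pinging a device that is merely hw-running, and the hardware watchdog is allowed to expire.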