Diffstat (limited to 'fs/xfs/xfs_log.c')
-rw-r--r--   fs/xfs/xfs_log.c   2705
1 file changed, 1299 insertions, 1406 deletions
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index f6006d94a581..f02a0dd522b3 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -21,16 +21,9 @@ #include "xfs_sb.h" #include "xfs_health.h" -kmem_zone_t *xfs_log_ticket_zone; +struct kmem_cache *xfs_log_ticket_cache; /* Local miscellaneous function prototypes */ -STATIC int -xlog_commit_record( - struct xlog *log, - struct xlog_ticket *ticket, - struct xlog_in_core **iclog, - xfs_lsn_t *commitlsnp); - STATIC struct xlog * xlog_alloc_log( struct xfs_mount *mp, @@ -47,48 +40,27 @@ xlog_dealloc_log( /* local state machine functions */ STATIC void xlog_state_done_syncing( - struct xlog_in_core *iclog, - bool aborted); + struct xlog_in_core *iclog); +STATIC void xlog_state_do_callback( + struct xlog *log); STATIC int xlog_state_get_iclog_space( struct xlog *log, int len, struct xlog_in_core **iclog, struct xlog_ticket *ticket, - int *continued_write, int *logoffsetp); STATIC void -xlog_state_switch_iclogs( - struct xlog *log, - struct xlog_in_core *iclog, - int eventual_size); -STATIC void -xlog_state_want_sync( - struct xlog *log, - struct xlog_in_core *iclog); - -STATIC void xlog_grant_push_ail( struct xlog *log, int need_bytes); STATIC void -xlog_regrant_reserve_log_space( - struct xlog *log, - struct xlog_ticket *ticket); -STATIC void -xlog_ungrant_log_space( - struct xlog *log, - struct xlog_ticket *ticket); -STATIC void xlog_sync( struct xlog *log, - struct xlog_in_core *iclog); + struct xlog_in_core *iclog, + struct xlog_ticket *ticket); #if defined(DEBUG) STATIC void -xlog_verify_dest_ptr( - struct xlog *log, - void *ptr); -STATIC void xlog_verify_grant_tail( struct xlog *log); STATIC void @@ -99,19 +71,76 @@ xlog_verify_iclog( STATIC void xlog_verify_tail_lsn( struct xlog *log, - struct xlog_in_core *iclog, - xfs_lsn_t tail_lsn); + struct xlog_in_core *iclog); #else -#define xlog_verify_dest_ptr(a,b) #define xlog_verify_grant_tail(a) #define xlog_verify_iclog(a,b,c) -#define xlog_verify_tail_lsn(a,b,c) +#define xlog_verify_tail_lsn(a,b) #endif STATIC int xlog_iclogs_empty( struct xlog *log); +static int +xfs_log_cover(struct xfs_mount *); + +/* + * We need to make sure the buffer pointer returned is naturally aligned for the + * biggest basic data type we put into it. We have already accounted for this + * padding when sizing the buffer. + * + * However, this padding does not get written into the log, and hence we have to + * track the space used by the log vectors separately to prevent log space hangs + * due to inaccurate accounting (i.e. a leak) of the used log space through the + * CIL context ticket. + * + * We also add space for the xlog_op_header that describes this region in the + * log. This prepends the data region we return to the caller to copy their data + * into, so do all the static initialisation of the ophdr now. Because the ophdr + * is not 8 byte aligned, we have to be careful to ensure that we align the + * start of the buffer such that the region we return to the call is 8 byte + * aligned and packed against the tail of the ophdr. 
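A worked example of the alignment rule described above, as a minimal userspace sketch. The only assumption is the 12-byte size of the on-disk struct xlog_op_header; the variables mirror the names used by xlog_prepare_iovec():

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t ophdr_size = 12;   /* sizeof(struct xlog_op_header) */
        uint32_t buf_len = 0;       /* lv->lv_buf_len before this iovec */
        uint32_t len = buf_len + ophdr_size;

        /* same round-up that xlog_prepare_iovec() performs */
        if (len % 8)
            buf_len = ((len + 7) & ~7u) - ophdr_size;   /* 16 - 12 = 4 */

        /* ophdr lands at offset 4, the returned data buffer at offset 16 */
        assert((buf_len + ophdr_size) % 8 == 0);
        return 0;
    }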
+ */ +void * +xlog_prepare_iovec( + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp, + uint type) +{ + struct xfs_log_iovec *vec = *vecp; + struct xlog_op_header *oph; + uint32_t len; + void *buf; + + if (vec) { + ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs); + vec++; + } else { + vec = &lv->lv_iovecp[0]; + } + + len = lv->lv_buf_len + sizeof(struct xlog_op_header); + if (!IS_ALIGNED(len, sizeof(uint64_t))) { + lv->lv_buf_len = round_up(len, sizeof(uint64_t)) - + sizeof(struct xlog_op_header); + } + + vec->i_type = type; + vec->i_addr = lv->lv_buf + lv->lv_buf_len; + + oph = vec->i_addr; + oph->oh_clientid = XFS_TRANSACTION; + oph->oh_res2 = 0; + oph->oh_flags = 0; + + buf = vec->i_addr + sizeof(struct xlog_op_header); + ASSERT(IS_ALIGNED((unsigned long)buf, sizeof(uint64_t))); + + *vecp = vec; + return buf; +} + static void xlog_grant_sub_space( struct xlog *log, @@ -197,12 +226,12 @@ xlog_ticket_reservation( if (head == &log->l_write_head) { ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); return tic->t_unit_res; - } else { - if (tic->t_flags & XLOG_TIC_PERM_RESERV) - return tic->t_unit_res * tic->t_cnt; - else - return tic->t_unit_res; } + + if (tic->t_flags & XLOG_TIC_PERM_RESERV) + return tic->t_unit_res * tic->t_cnt; + + return tic->t_unit_res; } STATIC bool @@ -265,7 +294,7 @@ xlog_grant_head_wait( list_add_tail(&tic->t_queue, &head->waiters); do { - if (XLOG_FORCED_SHUTDOWN(log)) + if (xlog_is_shutdown(log)) goto shutdown; xlog_grant_push_ail(log, need_bytes); @@ -279,7 +308,7 @@ xlog_grant_head_wait( trace_xfs_log_grant_wake(log, tic); spin_lock(&head->lock); - if (XLOG_FORCED_SHUTDOWN(log)) + if (xlog_is_shutdown(log)) goto shutdown; } while (xlog_space_left(log, &head->grant) < need_bytes); @@ -317,7 +346,7 @@ xlog_grant_head_check( int free_bytes; int error = 0; - ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); + ASSERT(!xlog_in_recovery(log)); /* * If there are other waiters on the queue then give them a chance at @@ -344,28 +373,25 @@ xlog_grant_head_check( return error; } -static void -xlog_tic_reset_res(xlog_ticket_t *tic) -{ - tic->t_res_num = 0; - tic->t_res_arr_sum = 0; - tic->t_res_num_ophdrs = 0; -} - -static void -xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type) +bool +xfs_log_writable( + struct xfs_mount *mp) { - if (tic->t_res_num == XLOG_TIC_LEN_MAX) { - /* add to overflow and start again */ - tic->t_res_o_flow += tic->t_res_arr_sum; - tic->t_res_num = 0; - tic->t_res_arr_sum = 0; - } - - tic->t_res_arr[tic->t_res_num].r_len = len; - tic->t_res_arr[tic->t_res_num].r_type = type; - tic->t_res_arr_sum += len; - tic->t_res_num++; + /* + * Do not write to the log on norecovery mounts, if the data or log + * devices are read-only, or if the filesystem is shutdown. Read-only + * mounts allow internal writes for log recovery and unmount purposes, + * so don't restrict that case. 
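To make the ticket accounting rewritten in xlog_ticket_reservation() earlier in this hunk concrete, a sketch with hypothetical numbers: a permanent ticket covers up to t_cnt transaction commits of t_unit_res bytes each, so the reserve head must hold space for all of them while the write head only ever needs a single unit:

    /* hypothetical permanent ticket: 4096 bytes per use, 8 uses */
    int t_unit_res = 4096, t_cnt = 8;

    int reserve_head_need = t_unit_res * t_cnt;  /* 32768 bytes */
    int write_head_need   = t_unit_res;          /* 4096 bytes  */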
+ */ + if (xfs_has_norecovery(mp)) + return false; + if (xfs_readonly_buftarg(mp->m_ddev_targp)) + return false; + if (xfs_readonly_buftarg(mp->m_log->l_targ)) + return false; + if (xlog_is_shutdown(mp->m_log)) + return false; + return true; } /* @@ -380,7 +406,7 @@ xfs_log_regrant( int need_bytes; int error = 0; - if (XLOG_FORCED_SHUTDOWN(log)) + if (xlog_is_shutdown(log)) return -EIO; XFS_STATS_INC(mp, xs_try_logspace); @@ -396,8 +422,6 @@ xfs_log_regrant( xlog_grant_push_ail(log, tic->t_unit_res); tic->t_curr_res = tic->t_unit_res; - xlog_tic_reset_res(tic); - if (tic->t_cnt > 0) return 0; @@ -435,10 +459,9 @@ out_error: int xfs_log_reserve( struct xfs_mount *mp, - int unit_bytes, - int cnt, + int unit_bytes, + int cnt, struct xlog_ticket **ticp, - uint8_t client, bool permanent) { struct xlog *log = mp->m_log; @@ -446,15 +469,13 @@ xfs_log_reserve( int need_bytes; int error = 0; - ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); - - if (XLOG_FORCED_SHUTDOWN(log)) + if (xlog_is_shutdown(log)) return -EIO; XFS_STATS_INC(mp, xs_try_logspace); ASSERT(*ticp == NULL); - tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, 0); + tic = xlog_ticket_alloc(log, unit_bytes, cnt, permanent); *ticp = tic; xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt @@ -484,138 +505,119 @@ out_error: return error; } - /* - * NOTES: + * Run all the pending iclog callbacks and wake log force waiters and iclog + * space waiters so they can process the newly set shutdown state. We really + * don't care what order we process callbacks here because the log is shut down + * and so state cannot change on disk anymore. However, we cannot wake waiters + * until the callbacks have been processed because we may be in unmount and + * we must ensure that all AIL operations the callbacks perform have completed + * before we tear down the AIL. * - * 1. currblock field gets updated at startup and after in-core logs - * marked as with WANT_SYNC. + * We avoid processing actively referenced iclogs so that we don't run callbacks + * while the iclog owner might still be preparing the iclog for IO submssion. + * These will be caught by xlog_state_iclog_release() and call this function + * again to process any callbacks that may have been added to that iclog. */ - -/* - * This routine is called when a user of a log manager ticket is done with - * the reservation. If the ticket was ever used, then a commit record for - * the associated transaction is written out as a log operation header with - * no data. The flag XLOG_TIC_INITED is set when the first write occurs with - * a given ticket. If the ticket was one with a permanent reservation, then - * a few operations are done differently. Permanent reservation tickets by - * default don't release the reservation. They just commit the current - * transaction with the belief that the reservation is still needed. A flag - * must be passed in before permanent reservations are actually released. - * When these type of tickets are not released, they need to be set into - * the inited state again. By doing this, a start record will be written - * out when the next write occurs. - */ -xfs_lsn_t -xfs_log_done( - struct xfs_mount *mp, - struct xlog_ticket *ticket, - struct xlog_in_core **iclog, - bool regrant) +static void +xlog_state_shutdown_callbacks( + struct xlog *log) { - struct xlog *log = mp->m_log; - xfs_lsn_t lsn = 0; - - if (XLOG_FORCED_SHUTDOWN(log) || - /* - * If nothing was ever written, don't write out commit record. 
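The xfs_log_reserve() change above drops the client id argument; tickets are now implicitly XFS_TRANSACTION, and the unmount path stamps XFS_LOG into its own op header instead. A call site migrates like this (both lines taken from this patch):

    /* before: client id passed explicitly */
    error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0);

    /* after: no client id argument */
    error = xfs_log_reserve(mp, 600, 1, &tic, 0);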
- * If we get an error, just continue and give back the log ticket. - */ - (((ticket->t_flags & XLOG_TIC_INITED) == 0) && - (xlog_commit_record(log, ticket, iclog, &lsn)))) { - lsn = (xfs_lsn_t) -1; - regrant = false; - } - - - if (!regrant) { - trace_xfs_log_done_nonperm(log, ticket); - - /* - * Release ticket if not permanent reservation or a specific - * request has been made to release a permanent reservation. - */ - xlog_ungrant_log_space(log, ticket); - } else { - trace_xfs_log_done_perm(log, ticket); - - xlog_regrant_reserve_log_space(log, ticket); - /* If this ticket was a permanent reservation and we aren't - * trying to release it, reset the inited flags; so next time - * we write, a start record will be written out. - */ - ticket->t_flags |= XLOG_TIC_INITED; - } - - xfs_log_ticket_put(ticket); - return lsn; -} + struct xlog_in_core *iclog; + LIST_HEAD(cb_list); -static bool -__xlog_state_release_iclog( - struct xlog *log, - struct xlog_in_core *iclog) -{ - lockdep_assert_held(&log->l_icloglock); + iclog = log->l_iclog; + do { + if (atomic_read(&iclog->ic_refcnt)) { + /* Reference holder will re-run iclog callbacks. */ + continue; + } + list_splice_init(&iclog->ic_callbacks, &cb_list); + spin_unlock(&log->l_icloglock); - if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { - /* update tail before writing to iclog */ - xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp); + xlog_cil_process_committed(&cb_list); - iclog->ic_state = XLOG_STATE_SYNCING; - iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); - xlog_verify_tail_lsn(log, iclog, tail_lsn); - /* cycle incremented when incrementing curr_block */ - return true; - } + spin_lock(&log->l_icloglock); + wake_up_all(&iclog->ic_write_wait); + wake_up_all(&iclog->ic_force_wait); + } while ((iclog = iclog->ic_next) != log->l_iclog); - ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); - return false; + wake_up_all(&log->l_flush_wait); } /* * Flush iclog to disk if this is the last reference to the given iclog and the * it is in the WANT_SYNC state. + * + * If XLOG_ICL_NEED_FUA is already set on the iclog, we need to ensure that the + * log tail is updated correctly. NEED_FUA indicates that the iclog will be + * written to stable storage, and implies that a commit record is contained + * within the iclog. We need to ensure that the log tail does not move beyond + * the tail that the first commit record in the iclog ordered against, otherwise + * correct recovery of that checkpoint becomes dependent on future operations + * performed on this iclog. + * + * Hence if NEED_FUA is set and the current iclog tail lsn is empty, write the + * current tail into iclog. Once the iclog tail is set, future operations must + * not modify it, otherwise they potentially violate ordering constraints for + * the checkpoint commit that wrote the initial tail lsn value. The tail lsn in + * the iclog will get zeroed on activation of the iclog after sync, so we + * always capture the tail lsn on the iclog on the first NEED_FUA release + * regardless of the number of active reference counts on this iclog. 
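The splice-then-drop-lock pattern used by xlog_state_shutdown_callbacks() above is worth distilling: callbacks are moved onto a private list while l_icloglock is held, then run with the lock released, so callback code can itself take log locks without deadlocking. A sketch (the real function loops over every iclog and re-takes the lock to wake waiters):

    LIST_HEAD(cb_list);

    spin_lock(&log->l_icloglock);
    list_splice_init(&iclog->ic_callbacks, &cb_list);
    spin_unlock(&log->l_icloglock);

    /* cb_list is now private to this thread */
    xlog_cil_process_committed(&cb_list);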
*/ -static int +int xlog_state_release_iclog( struct xlog *log, - struct xlog_in_core *iclog) + struct xlog_in_core *iclog, + struct xlog_ticket *ticket) { - lockdep_assert_held(&log->l_icloglock); + xfs_lsn_t tail_lsn; + bool last_ref; - if (iclog->ic_state == XLOG_STATE_IOERROR) - return -EIO; + lockdep_assert_held(&log->l_icloglock); - if (atomic_dec_and_test(&iclog->ic_refcnt) && - __xlog_state_release_iclog(log, iclog)) { - spin_unlock(&log->l_icloglock); - xlog_sync(log, iclog); - spin_lock(&log->l_icloglock); + trace_xlog_iclog_release(iclog, _RET_IP_); + /* + * Grabbing the current log tail needs to be atomic w.r.t. the writing + * of the tail LSN into the iclog so we guarantee that the log tail does + * not move between the first time we know that the iclog needs to be + * made stable and when we eventually submit it. + */ + if ((iclog->ic_state == XLOG_STATE_WANT_SYNC || + (iclog->ic_flags & XLOG_ICL_NEED_FUA)) && + !iclog->ic_header.h_tail_lsn) { + tail_lsn = xlog_assign_tail_lsn(log->l_mp); + iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); } - return 0; -} + last_ref = atomic_dec_and_test(&iclog->ic_refcnt); -int -xfs_log_release_iclog( - struct xfs_mount *mp, - struct xlog_in_core *iclog) -{ - struct xlog *log = mp->m_log; - bool sync; - - if (iclog->ic_state == XLOG_STATE_IOERROR) { - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); + if (xlog_is_shutdown(log)) { + /* + * If there are no more references to this iclog, process the + * pending iclog callbacks that were waiting on the release of + * this iclog. + */ + if (last_ref) + xlog_state_shutdown_callbacks(log); return -EIO; } - if (atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) { - sync = __xlog_state_release_iclog(log, iclog); - spin_unlock(&log->l_icloglock); - if (sync) - xlog_sync(log, iclog); + if (!last_ref) + return 0; + + if (iclog->ic_state != XLOG_STATE_WANT_SYNC) { + ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); + return 0; } + + iclog->ic_state = XLOG_STATE_SYNCING; + xlog_verify_tail_lsn(log, iclog); + trace_xlog_iclog_syncing(iclog, _RET_IP_); + + spin_unlock(&log->l_icloglock); + xlog_sync(log, iclog, ticket); + spin_lock(&log->l_icloglock); return 0; } @@ -636,25 +638,27 @@ xfs_log_mount( xfs_daddr_t blk_offset, int num_bblks) { - bool fatal = xfs_sb_version_hascrc(&mp->m_sb); + struct xlog *log; + bool fatal = xfs_has_crc(mp); int error = 0; int min_logfsbs; - if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { + if (!xfs_has_norecovery(mp)) { xfs_notice(mp, "Mounting V%d Filesystem", XFS_SB_VERSION_NUM(&mp->m_sb)); } else { xfs_notice(mp, "Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.", XFS_SB_VERSION_NUM(&mp->m_sb)); - ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); + ASSERT(xfs_is_readonly(mp)); } - mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks); - if (IS_ERR(mp->m_log)) { - error = PTR_ERR(mp->m_log); + log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks); + if (IS_ERR(log)) { + error = PTR_ERR(log); goto out; } + mp->m_log = log; /* * Validate the given log space and drop a critical message via syslog @@ -719,51 +723,51 @@ xfs_log_mount( xfs_warn(mp, "AIL initialisation failed: error %d", error); goto out_free_log; } - mp->m_log->l_ailp = mp->m_ail; + log->l_ailp = mp->m_ail; /* * skip log recovery on a norecovery mount. pretend it all * just worked. 
*/ - if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { - int readonly = (mp->m_flags & XFS_MOUNT_RDONLY); - - if (readonly) - mp->m_flags &= ~XFS_MOUNT_RDONLY; - - error = xlog_recover(mp->m_log); - + if (!xfs_has_norecovery(mp)) { + /* + * log recovery ignores readonly state and so we need to clear + * mount-based read only state so it can write to disk. + */ + bool readonly = test_and_clear_bit(XFS_OPSTATE_READONLY, + &mp->m_opstate); + error = xlog_recover(log); if (readonly) - mp->m_flags |= XFS_MOUNT_RDONLY; + set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); if (error) { xfs_warn(mp, "log mount/recovery failed: error %d", error); - xlog_recover_cancel(mp->m_log); + xlog_recover_cancel(log); goto out_destroy_ail; } } - error = xfs_sysfs_init(&mp->m_log->l_kobj, &xfs_log_ktype, &mp->m_kobj, + error = xfs_sysfs_init(&log->l_kobj, &xfs_log_ktype, &mp->m_kobj, "log"); if (error) goto out_destroy_ail; /* Normal transactions can now occur */ - mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; + clear_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate); /* * Now the log has been fully initialised and we know were our * space grant counters are, we can initialise the permanent ticket * needed for delayed logging to work. */ - xlog_cil_init_post_recovery(mp->m_log); + xlog_cil_init_post_recovery(log); return 0; out_destroy_ail: xfs_trans_ail_destroy(mp); out_free_log: - xlog_dealloc_log(mp->m_log); + xlog_dealloc_log(log); out: return error; } @@ -782,19 +786,22 @@ int xfs_log_mount_finish( struct xfs_mount *mp) { - int error = 0; - bool readonly = (mp->m_flags & XFS_MOUNT_RDONLY); - bool recovered = mp->m_log->l_flags & XLOG_RECOVERY_NEEDED; + struct xlog *log = mp->m_log; + bool readonly; + int error = 0; - if (mp->m_flags & XFS_MOUNT_NORECOVERY) { - ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); + if (xfs_has_norecovery(mp)) { + ASSERT(xfs_is_readonly(mp)); return 0; - } else if (readonly) { - /* Allow unlinked processing to proceed */ - mp->m_flags &= ~XFS_MOUNT_RDONLY; } /* + * log recovery ignores readonly state and so we need to clear + * mount-based read only state so it can write to disk. + */ + readonly = test_and_clear_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); + + /* * During the second phase of log recovery, we need iget and * iput to behave like they do for an active filesystem. * xfs_fs_drop_inode needs to be able to prevent the deletion @@ -815,9 +822,9 @@ xfs_log_mount_finish( * mount failure occurs. */ mp->m_super->s_flags |= SB_ACTIVE; - error = xlog_recover_finish(mp->m_log); - if (!error) - xfs_log_work_queue(mp); + xfs_log_work_queue(mp); + if (xlog_recovery_needed(log)) + error = xlog_recover_finish(log); mp->m_super->s_flags &= ~SB_ACTIVE; evict_inodes(mp->m_super); @@ -830,14 +837,24 @@ xfs_log_mount_finish( * Don't push in the error case because the AIL may have pending intents * that aren't removed until recovery is cancelled. */ - if (!error && recovered) { - xfs_log_force(mp, XFS_LOG_SYNC); - xfs_ail_push_all_sync(mp->m_ail); + if (xlog_recovery_needed(log)) { + if (!error) { + xfs_log_force(mp, XFS_LOG_SYNC); + xfs_ail_push_all_sync(mp->m_ail); + } + xfs_notice(mp, "Ending recovery (logdev: %s)", + mp->m_logname ? mp->m_logname : "internal"); + } else { + xfs_info(mp, "Ending clean mount"); } - xfs_wait_buftarg(mp->m_ddev_targp); + xfs_buftarg_drain(mp->m_ddev_targp); + clear_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate); if (readonly) - mp->m_flags |= XFS_MOUNT_RDONLY; + set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); + + /* Make sure the log is dead if we're returning failure. 
*/ + ASSERT(!error || xlog_is_shutdown(log)); return error; } @@ -855,62 +872,114 @@ xfs_log_mount_cancel( } /* - * Final log writes as part of unmount. - * - * Mark the filesystem clean as unmount happens. Note that during relocation - * this routine needs to be executed as part of source-bag while the - * deallocation must not be done until source-end. + * Flush out the iclog to disk ensuring that device caches are flushed and + * the iclog hits stable storage before any completion waiters are woken. */ +static inline int +xlog_force_iclog( + struct xlog_in_core *iclog) +{ + atomic_inc(&iclog->ic_refcnt); + iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA; + if (iclog->ic_state == XLOG_STATE_ACTIVE) + xlog_state_switch_iclogs(iclog->ic_log, iclog, 0); + return xlog_state_release_iclog(iclog->ic_log, iclog, NULL); +} -/* Actually write the unmount record to disk. */ -static void -xfs_log_write_unmount_record( - struct xfs_mount *mp) +/* + * Wait for the iclog and all prior iclogs to be written disk as required by the + * log force state machine. Waiting on ic_force_wait ensures iclog completions + * have been ordered and callbacks run before we are woken here, hence + * guaranteeing that all the iclogs up to this one are on stable storage. + */ +int +xlog_wait_on_iclog( + struct xlog_in_core *iclog) + __releases(iclog->ic_log->l_icloglock) { - /* the data section must be 32 bit size aligned */ - struct xfs_unmount_log_format magic = { - .magic = XLOG_UNMOUNT_TYPE, + struct xlog *log = iclog->ic_log; + + trace_xlog_iclog_wait_on(iclog, _RET_IP_); + if (!xlog_is_shutdown(log) && + iclog->ic_state != XLOG_STATE_ACTIVE && + iclog->ic_state != XLOG_STATE_DIRTY) { + XFS_STATS_INC(log->l_mp, xs_log_force_sleep); + xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); + } else { + spin_unlock(&log->l_icloglock); + } + + if (xlog_is_shutdown(log)) + return -EIO; + return 0; +} + +/* + * Write out an unmount record using the ticket provided. We have to account for + * the data space used in the unmount ticket as this write is not done from a + * transaction context that has already done the accounting for us. + */ +static int +xlog_write_unmount_record( + struct xlog *log, + struct xlog_ticket *ticket) +{ + struct { + struct xlog_op_header ophdr; + struct xfs_unmount_log_format ulf; + } unmount_rec = { + .ophdr = { + .oh_clientid = XFS_LOG, + .oh_tid = cpu_to_be32(ticket->t_tid), + .oh_flags = XLOG_UNMOUNT_TRANS, + }, + .ulf = { + .magic = XLOG_UNMOUNT_TYPE, + }, }; struct xfs_log_iovec reg = { - .i_addr = &magic, - .i_len = sizeof(magic), + .i_addr = &unmount_rec, + .i_len = sizeof(unmount_rec), .i_type = XLOG_REG_TYPE_UNMOUNT, }; struct xfs_log_vec vec = { .lv_niovecs = 1, .lv_iovecp = ®, }; - struct xlog *log = mp->m_log; + LIST_HEAD(lv_chain); + list_add(&vec.lv_list, &lv_chain); + + BUILD_BUG_ON((sizeof(struct xlog_op_header) + + sizeof(struct xfs_unmount_log_format)) != + sizeof(unmount_rec)); + + /* account for space used by record data */ + ticket->t_curr_res -= sizeof(unmount_rec); + + return xlog_write(log, NULL, &lv_chain, ticket, reg.i_len); +} + +/* + * Mark the filesystem clean by writing an unmount record to the head of the + * log. 
+ */ +static void +xlog_unmount_write( + struct xlog *log) +{ + struct xfs_mount *mp = log->l_mp; struct xlog_in_core *iclog; struct xlog_ticket *tic = NULL; - xfs_lsn_t lsn; - uint flags = XLOG_UNMOUNT_TRANS; int error; - error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0); + error = xfs_log_reserve(mp, 600, 1, &tic, 0); if (error) goto out_err; - /* - * If we think the summary counters are bad, clear the unmount header - * flag in the unmount record so that the summary counters will be - * recalculated during log recovery at next mount. Refer to - * xlog_check_unmount_rec for more details. - */ - if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp, - XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { - xfs_alert(mp, "%s: will fix summary counters at next mount", - __func__); - flags &= ~XLOG_UNMOUNT_TRANS; - } - - /* remove inited flag, and account for space used */ - tic->t_flags = 0; - tic->t_curr_res -= sizeof(magic); - error = xlog_write(log, &vec, tic, &lsn, NULL, flags); + error = xlog_write_unmount_record(log, tic); /* * At this point, we're umounting anyway, so there's no point in - * transitioning log state to IOERROR. Just continue... + * transitioning log state to shutdown. Just continue... */ out_err: if (error) @@ -918,29 +987,27 @@ out_err: spin_lock(&log->l_icloglock); iclog = log->l_iclog; - atomic_inc(&iclog->ic_refcnt); - xlog_state_want_sync(log, iclog); - error = xlog_state_release_iclog(log, iclog); - switch (iclog->ic_state) { - default: - if (!XLOG_FORCED_SHUTDOWN(log)) { - xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); - break; - } - /* fall through */ - case XLOG_STATE_ACTIVE: - case XLOG_STATE_DIRTY: - spin_unlock(&log->l_icloglock); - break; - } + error = xlog_force_iclog(iclog); + xlog_wait_on_iclog(iclog); if (tic) { trace_xfs_log_umount_write(log, tic); - xlog_ungrant_log_space(log, tic); - xfs_log_ticket_put(tic); + xfs_log_ticket_ungrant(log, tic); } } +static void +xfs_log_unmount_verify_iclog( + struct xlog *log) +{ + struct xlog_in_core *iclog = log->l_iclog; + + do { + ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); + ASSERT(iclog->ic_offset == 0); + } while ((iclog = iclog->ic_next) != log->l_iclog); +} + /* * Unmount record used to have a string "Unmount filesystem--" in the * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE). @@ -948,75 +1015,36 @@ out_err: * currently architecture converted and "Unmount" is a bit foo. * As far as I know, there weren't any dependencies on the old behaviour. */ - -static int -xfs_log_unmount_write(xfs_mount_t *mp) +static void +xfs_log_unmount_write( + struct xfs_mount *mp) { - struct xlog *log = mp->m_log; - xlog_in_core_t *iclog; -#ifdef DEBUG - xlog_in_core_t *first_iclog; -#endif - int error; + struct xlog *log = mp->m_log; - /* - * Don't write out unmount record on norecovery mounts or ro devices. - * Or, if we are doing a forced umount (typically because of IO errors). - */ - if (mp->m_flags & XFS_MOUNT_NORECOVERY || - xfs_readonly_buftarg(log->l_targ)) { - ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); - return 0; - } + if (!xfs_log_writable(mp)) + return; - error = xfs_log_force(mp, XFS_LOG_SYNC); - ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); + xfs_log_force(mp, XFS_LOG_SYNC); -#ifdef DEBUG - first_iclog = iclog = log->l_iclog; - do { - if (iclog->ic_state != XLOG_STATE_IOERROR) { - ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); - ASSERT(iclog->ic_offset == 0); - } - iclog = iclog->ic_next; - } while (iclog != first_iclog); -#endif - if (! 
(XLOG_FORCED_SHUTDOWN(log))) { - xfs_log_write_unmount_record(mp); - } else { - /* - * We're already in forced_shutdown mode, couldn't - * even attempt to write out the unmount transaction. - * - * Go through the motions of sync'ing and releasing - * the iclog, even though no I/O will actually happen, - * we need to wait for other log I/Os that may already - * be in progress. Do this as a separate section of - * code so we'll know if we ever get stuck here that - * we're in this odd situation of trying to unmount - * a file system that went into forced_shutdown as - * the result of an unmount.. - */ - spin_lock(&log->l_icloglock); - iclog = log->l_iclog; - atomic_inc(&iclog->ic_refcnt); - xlog_state_want_sync(log, iclog); - error = xlog_state_release_iclog(log, iclog); - switch (iclog->ic_state) { - case XLOG_STATE_ACTIVE: - case XLOG_STATE_DIRTY: - case XLOG_STATE_IOERROR: - spin_unlock(&log->l_icloglock); - break; - default: - xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); - break; - } + if (xlog_is_shutdown(log)) + return; + + /* + * If we think the summary counters are bad, avoid writing the unmount + * record to force log recovery at next mount, after which the summary + * counters will be recalculated. Refer to xlog_check_unmount_rec for + * more details. + */ + if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp, + XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { + xfs_alert(mp, "%s: will fix summary counters at next mount", + __func__); + return; } - return error; -} /* xfs_log_unmount_write */ + xfs_log_unmount_verify_iclog(log); + xlog_unmount_write(log); +} /* * Empty the log for unmount/freeze. @@ -1024,28 +1052,49 @@ xfs_log_unmount_write(xfs_mount_t *mp) * To do this, we first need to shut down the background log work so it is not * trying to cover the log as we clean up. We then need to unpin all objects in * the log so we can then flush them out. Once they have completed their IO and - * run the callbacks removing themselves from the AIL, we can write the unmount - * record. + * run the callbacks removing themselves from the AIL, we can cover the log. */ -void +int xfs_log_quiesce( struct xfs_mount *mp) { + /* + * Clear log incompat features since we're quiescing the log. Report + * failures, though it's not fatal to have a higher log feature + * protection level than the log contents actually require. + */ + if (xfs_clear_incompat_log_features(mp)) { + int error; + + error = xfs_sync_sb(mp, false); + if (error) + xfs_warn(mp, + "Failed to clear log incompat features on quiesce"); + } + cancel_delayed_work_sync(&mp->m_log->l_work); xfs_log_force(mp, XFS_LOG_SYNC); /* * The superblock buffer is uncached and while xfs_ail_push_all_sync() - * will push it, xfs_wait_buftarg() will not wait for it. Further, + * will push it, xfs_buftarg_wait() will not wait for it. Further, * xfs_buf_iowait() cannot be used because it was pushed with the * XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for * the IO to complete. 
*/ xfs_ail_push_all_sync(mp->m_ail); - xfs_wait_buftarg(mp->m_ddev_targp); + xfs_buftarg_wait(mp->m_ddev_targp); xfs_buf_lock(mp->m_sb_bp); xfs_buf_unlock(mp->m_sb_bp); + return xfs_log_cover(mp); +} + +void +xfs_log_clean( + struct xfs_mount *mp) +{ + xfs_log_quiesce(mp); xfs_log_unmount_write(mp); } @@ -1060,7 +1109,9 @@ void xfs_log_unmount( struct xfs_mount *mp) { - xfs_log_quiesce(mp); + xfs_log_clean(mp); + + xfs_buftarg_drain(mp->m_ddev_targp); xfs_trans_ail_destroy(mp); @@ -1076,7 +1127,7 @@ xfs_log_item_init( int type, const struct xfs_item_ops *ops) { - item->li_mountp = mp; + item->li_log = mp->m_log; item->li_ailp = mp->m_ail; item->li_type = type; item->li_ops = ops; @@ -1098,11 +1149,11 @@ xfs_log_space_wake( struct xlog *log = mp->m_log; int free_bytes; - if (XLOG_FORCED_SHUTDOWN(log)) + if (xlog_is_shutdown(log)) return; if (!list_empty_careful(&log->l_write_head.waiters)) { - ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); + ASSERT(!xlog_in_recovery(log)); spin_lock(&log->l_write_head.lock); free_bytes = xlog_space_left(log, &log->l_write_head.grant); @@ -1111,7 +1162,7 @@ xfs_log_space_wake( } if (!list_empty_careful(&log->l_reserve_head.waiters)) { - ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); + ASSERT(!xlog_in_recovery(log)); spin_lock(&log->l_reserve_head.lock); free_bytes = xlog_space_left(log, &log->l_reserve_head.grant); @@ -1137,17 +1188,15 @@ xfs_log_space_wake( * there's no point in running a dummy transaction at this point because we * can't start trying to idle the log until both the CIL and AIL are empty. */ -static int -xfs_log_need_covered(xfs_mount_t *mp) +static bool +xfs_log_need_covered( + struct xfs_mount *mp) { - struct xlog *log = mp->m_log; - int needed = 0; - - if (!xfs_fs_writable(mp, SB_FREEZE_WRITE)) - return 0; + struct xlog *log = mp->m_log; + bool needed = false; if (!xlog_cil_empty(log)) - return 0; + return false; spin_lock(&log->l_icloglock); switch (log->l_covered_state) { @@ -1162,14 +1211,14 @@ xfs_log_need_covered(xfs_mount_t *mp) if (!xlog_iclogs_empty(log)) break; - needed = 1; + needed = true; if (log->l_covered_state == XLOG_STATE_COVER_NEED) log->l_covered_state = XLOG_STATE_COVER_DONE; else log->l_covered_state = XLOG_STATE_COVER_DONE2; break; default: - needed = 1; + needed = true; break; } spin_unlock(&log->l_icloglock); @@ -1177,6 +1226,60 @@ xfs_log_need_covered(xfs_mount_t *mp) } /* + * Explicitly cover the log. This is similar to background log covering but + * intended for usage in quiesce codepaths. The caller is responsible to ensure + * the log is idle and suitable for covering. The CIL, iclog buffers and AIL + * must all be empty. + */ +static int +xfs_log_cover( + struct xfs_mount *mp) +{ + int error = 0; + bool need_covered; + + ASSERT((xlog_cil_empty(mp->m_log) && xlog_iclogs_empty(mp->m_log) && + !xfs_ail_min_lsn(mp->m_log->l_ailp)) || + xlog_is_shutdown(mp->m_log)); + + if (!xfs_log_writable(mp)) + return 0; + + /* + * xfs_log_need_covered() is not idempotent because it progresses the + * state machine if the log requires covering. Therefore, we must call + * this function once and use the result until we've issued an sb sync. + * Do so first to make that abundantly clear. + * + * Fall into the covering sequence if the log needs covering or the + * mount has lazy superblock accounting to sync to disk. The sb sync + * used for covering accumulates the in-core counters, so covering + * handles this for us. 
+ */ + need_covered = xfs_log_need_covered(mp); + if (!need_covered && !xfs_has_lazysbcount(mp)) + return 0; + + /* + * To cover the log, commit the superblock twice (at most) in + * independent checkpoints. The first serves as a reference for the + * tail pointer. The sync transaction and AIL push empties the AIL and + * updates the in-core tail to the LSN of the first checkpoint. The + * second commit updates the on-disk tail with the in-core LSN, + * covering the log. Push the AIL one more time to leave it empty, as + * we found it. + */ + do { + error = xfs_sync_sb(mp, true); + if (error) + break; + xfs_ail_push_all_sync(mp->m_ail); + } while (xfs_log_need_covered(mp)); + + return error; +} + +/* * We may be holding the log iclog lock upon entering this routine. */ xfs_lsn_t @@ -1227,16 +1330,18 @@ xlog_assign_tail_lsn( * wrap the tail, we should blow up. Rather than catch this case here, * we depend on other ASSERTions in other parts of the code. XXXmiken * - * This code also handles the case where the reservation head is behind - * the tail. The details of this case are described below, but the end - * result is that we return the size of the log as the amount of space left. + * If reservation head is behind the tail, we have a problem. Warn about it, + * but then treat it as if the log is empty. + * + * If the log is shut down, the head and tail may be invalid or out of whack, so + * shortcut invalidity asserts in this case so that we don't trigger them + * falsely. */ STATIC int xlog_space_left( struct xlog *log, atomic64_t *head) { - int free_bytes; int tail_bytes; int tail_cycle; int head_cycle; @@ -1246,29 +1351,30 @@ xlog_space_left( xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes); tail_bytes = BBTOB(tail_bytes); if (tail_cycle == head_cycle && head_bytes >= tail_bytes) - free_bytes = log->l_logsize - (head_bytes - tail_bytes); - else if (tail_cycle + 1 < head_cycle) + return log->l_logsize - (head_bytes - tail_bytes); + if (tail_cycle + 1 < head_cycle) return 0; - else if (tail_cycle < head_cycle) { + + /* Ignore potential inconsistency when shutdown. */ + if (xlog_is_shutdown(log)) + return log->l_logsize; + + if (tail_cycle < head_cycle) { ASSERT(tail_cycle == (head_cycle - 1)); - free_bytes = tail_bytes - head_bytes; - } else { - /* - * The reservation head is behind the tail. - * In this case we just want to return the size of the - * log as the amount of space left. - */ - xfs_alert(log->l_mp, "xlog_space_left: head behind tail"); - xfs_alert(log->l_mp, - " tail_cycle = %d, tail_bytes = %d", - tail_cycle, tail_bytes); - xfs_alert(log->l_mp, - " GH cycle = %d, GH bytes = %d", - head_cycle, head_bytes); - ASSERT(0); - free_bytes = log->l_logsize; + return tail_bytes - head_bytes; } - return free_bytes; + + /* + * The reservation head is behind the tail. In this case we just want to + * return the size of the log as the amount of space left. 
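Worked example for the head/tail arithmetic in xlog_space_left() above, with hypothetical values (l_logsize in bytes; each position cracked into a cycle number plus a byte offset):

    int logsize = 64 * 1024 * 1024;   /* hypothetical 64 MiB log */

    /* same cycle: used space is head minus tail */
    int tail_cycle = 7, tail_bytes = 2 * 1024 * 1024;
    int head_cycle = 7, head_bytes = 6 * 1024 * 1024;
    /* free = logsize - (head_bytes - tail_bytes) = 60 MiB */

    /* head wrapped exactly one cycle past the tail */
    head_cycle = 8; head_bytes = 1 * 1024 * 1024;
    /* tail_cycle == head_cycle - 1, so free = tail_bytes - head_bytes = 1 MiB */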
+ */ + xfs_alert(log->l_mp, "xlog_space_left: head behind tail"); + xfs_alert(log->l_mp, " tail_cycle = %d, tail_bytes = %d", + tail_cycle, tail_bytes); + xfs_alert(log->l_mp, " GH cycle = %d, GH bytes = %d", + head_cycle, head_bytes); + ASSERT(0); + return log->l_logsize; } @@ -1279,7 +1385,6 @@ xlog_ioend_work( struct xlog_in_core *iclog = container_of(work, struct xlog_in_core, ic_end_io_work); struct xlog *log = iclog->ic_log; - bool aborted = false; int error; error = blk_status_to_errno(iclog->ic_bio.bi_status); @@ -1294,18 +1399,10 @@ xlog_ioend_work( */ if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) { xfs_alert(log->l_mp, "log I/O error %d", error); - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); - /* - * This flag will be propagated to the trans-committed - * callback routines to let them know that the log-commit - * didn't succeed. - */ - aborted = true; - } else if (iclog->ic_state == XLOG_STATE_IOERROR) { - aborted = true; + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); } - xlog_state_done_syncing(iclog, aborted); + xlog_state_done_syncing(iclog); bio_uninit(&iclog->ic_bio); /* @@ -1355,6 +1452,32 @@ xfs_log_work_queue( } /* + * Clear the log incompat flags if we have the opportunity. + * + * This only happens if we're about to log the second dummy transaction as part + * of covering the log and we can get the log incompat feature usage lock. + */ +static inline void +xlog_clear_incompat( + struct xlog *log) +{ + struct xfs_mount *mp = log->l_mp; + + if (!xfs_sb_has_incompat_log_feature(&mp->m_sb, + XFS_SB_FEAT_INCOMPAT_LOG_ALL)) + return; + + if (log->l_covered_state != XLOG_STATE_COVER_DONE2) + return; + + if (!down_write_trylock(&log->l_incompat_users)) + return; + + xfs_clear_incompat_log_features(mp); + up_write(&log->l_incompat_users); +} + +/* * Every sync period we need to unpin all items in the AIL and push them to * disk. If there is nothing dirty, then we might need to cover the log to * indicate that the filesystem is idle. @@ -1368,7 +1491,7 @@ xfs_log_worker( struct xfs_mount *mp = log->l_mp; /* dgc: errors ignored - not fatal and nowhere to report them */ - if (xfs_log_need_covered(mp)) { + if (xfs_fs_writable(mp, SB_FREEZE_WRITE) && xfs_log_need_covered(mp)) { /* * Dump a transaction into the log that contains no real change. * This is needed to stamp the current tail LSN into the log @@ -1380,6 +1503,7 @@ xfs_log_worker( * synchronously log the superblock instead to ensure the * superblock is immediately unpinned and can be written back. 
*/ + xlog_clear_incompat(log); xfs_sync_sb(mp, true); } else xfs_log_force(mp, 0); @@ -1423,7 +1547,7 @@ xlog_alloc_log( log->l_logBBstart = blk_offset; log->l_logBBsize = num_bblks; log->l_covered_state = XLOG_STATE_COVER_IDLE; - log->l_flags |= XLOG_ACTIVE_RECOVERY; + set_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate); INIT_DELAYED_WORK(&log->l_work, xfs_log_worker); log->l_prev_block = -1; @@ -1432,11 +1556,16 @@ xlog_alloc_log( xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0); log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ + if (xfs_has_logv2(mp) && mp->m_sb.sb_logsunit > 1) + log->l_iclog_roundoff = mp->m_sb.sb_logsunit; + else + log->l_iclog_roundoff = BBSIZE; + xlog_grant_head_init(&log->l_reserve_head); xlog_grant_head_init(&log->l_write_head); error = -EFSCORRUPTED; - if (xfs_sb_version_hassector(&mp->m_sb)) { + if (xfs_has_sector(mp)) { log2_size = mp->m_sb.sb_logsectlog; if (log2_size < BBSHIFT) { xfs_warn(mp, "Log sector size too small (0x%x < 0x%x)", @@ -1453,7 +1582,7 @@ xlog_alloc_log( /* for larger sector sizes, must have v2 or external log */ if (log2_size && log->l_logBBstart > 0 && - !xfs_sb_version_haslogv2(&mp->m_sb)) { + !xfs_has_logv2(mp)) { xfs_warn(mp, "log sector size (0x%x) invalid for configuration.", log2_size); @@ -1462,6 +1591,8 @@ xlog_alloc_log( } log->l_sectBBsize = 1 << log2_size; + init_rwsem(&log->l_incompat_users); + xlog_get_iclog_buffer_size(mp, log); spin_lock_init(&log->l_icloglock); @@ -1477,7 +1608,6 @@ xlog_alloc_log( */ ASSERT(log->l_iclog_size >= 4096); for (i = 0; i < log->l_iclog_bufs; i++) { - int align_mask = xfs_buftarg_dma_alignment(mp->m_logdev_targp); size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) * sizeof(struct bio_vec); @@ -1489,18 +1619,15 @@ xlog_alloc_log( iclog->ic_prev = prev_iclog; prev_iclog = iclog; - iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask, - KM_MAYFAIL | KM_ZERO); + iclog->ic_data = kvzalloc(log->l_iclog_size, + GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!iclog->ic_data) goto out_free_iclog; -#ifdef DEBUG - log->l_iclog_bak[i] = &iclog->ic_header; -#endif head = &iclog->ic_header; memset(head, 0, sizeof(xlog_rec_header_t)); head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); head->h_version = cpu_to_be32( - xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1); + xfs_has_logv2(log->l_mp) ? 2 : 1); head->h_size = cpu_to_be32(log->l_iclog_size); /* new fields */ head->h_fmt = cpu_to_be32(XLOG_FMT); @@ -1510,9 +1637,8 @@ xlog_alloc_log( iclog->ic_state = XLOG_STATE_ACTIVE; iclog->ic_log = log; atomic_set(&iclog->ic_refcnt, 0); - spin_lock_init(&iclog->ic_callback_lock); INIT_LIST_HEAD(&iclog->ic_callbacks); - iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; + iclog->ic_datap = (void *)iclog->ic_data + log->l_iclog_hsize; init_waitqueue_head(&iclog->ic_force_wait); init_waitqueue_head(&iclog->ic_write_wait); @@ -1525,8 +1651,9 @@ xlog_alloc_log( log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ log->l_ioend_workqueue = alloc_workqueue("xfs-log/%s", - WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_HIGHPRI, 0, - mp->m_super->s_id); + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | + WQ_HIGHPRI), + 0, mp->m_super->s_id); if (!log->l_ioend_workqueue) goto out_free_iclog; @@ -1551,47 +1678,15 @@ out: return ERR_PTR(error); } /* xlog_alloc_log */ - -/* - * Write out the commit record of a transaction associated with the given - * ticket. Return the lsn of the commit record. 
- */ -STATIC int -xlog_commit_record( - struct xlog *log, - struct xlog_ticket *ticket, - struct xlog_in_core **iclog, - xfs_lsn_t *commitlsnp) -{ - struct xfs_mount *mp = log->l_mp; - int error; - struct xfs_log_iovec reg = { - .i_addr = NULL, - .i_len = 0, - .i_type = XLOG_REG_TYPE_COMMIT, - }; - struct xfs_log_vec vec = { - .lv_niovecs = 1, - .lv_iovecp = ®, - }; - - ASSERT_ALWAYS(iclog); - error = xlog_write(log, &vec, ticket, commitlsnp, iclog, - XLOG_COMMIT_TRANS); - if (error) - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); - return error; -} - /* - * Push on the buffer cache code if we ever use more than 75% of the on-disk - * log space. This code pushes on the lsn which would supposedly free up - * the 25% which we want to leave free. We may need to adopt a policy which - * pushes on an lsn which is further along in the log once we reach the high - * water mark. In this manner, we would be creating a low water mark. + * Compute the LSN that we'd need to push the log tail towards in order to have + * (a) enough on-disk log space to log the number of bytes specified, (b) at + * least 25% of the log space free, and (c) at least 256 blocks free. If the + * log free space already meets all three thresholds, this function returns + * NULLCOMMITLSN. */ -STATIC void -xlog_grant_push_ail( +xfs_lsn_t +xlog_grant_push_threshold( struct xlog *log, int need_bytes) { @@ -1617,7 +1712,7 @@ xlog_grant_push_ail( free_threshold = max(free_threshold, (log->l_logBBsize >> 2)); free_threshold = max(free_threshold, 256); if (free_blocks >= free_threshold) - return; + return NULLCOMMITLSN; xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle, &threshold_block); @@ -1637,13 +1732,33 @@ xlog_grant_push_ail( if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0) threshold_lsn = last_sync_lsn; + return threshold_lsn; +} + +/* + * Push the tail of the log if we need to do so to maintain the free log space + * thresholds set out by xlog_grant_push_threshold. We may need to adopt a + * policy which pushes on an lsn which is further along in the log once we + * reach the high water mark. In this manner, we would be creating a low water + * mark. + */ +STATIC void +xlog_grant_push_ail( + struct xlog *log, + int need_bytes) +{ + xfs_lsn_t threshold_lsn; + + threshold_lsn = xlog_grant_push_threshold(log, need_bytes); + if (threshold_lsn == NULLCOMMITLSN || xlog_is_shutdown(log)) + return; + /* * Get the transaction layer to kick the dirty buffers out to * disk asynchronously. No point in trying to do this if * the filesystem is shutting down. */ - if (!XLOG_FORCED_SHUTDOWN(log)) - xfs_ail_push(log->l_ailp, threshold_lsn); + xfs_ail_push(log->l_ailp, threshold_lsn); } /* @@ -1671,7 +1786,7 @@ xlog_pack_data( dp += BBSIZE; } - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + if (xfs_has_logv2(log->l_mp)) { xlog_in_core_2_t *xhdr = iclog->ic_data; for ( ; i < BTOBB(size); i++) { @@ -1708,14 +1823,12 @@ xlog_cksum( offsetof(struct xlog_rec_header, h_crc)); /* ... then for additional cycle data for v2 logs ... 
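Worked example for the thresholds in xlog_grant_push_threshold() above; units are 512-byte basic blocks and the numbers are hypothetical:

    int logBBsize = 8192;    /* 4 MiB log */
    int need_blocks = 100;   /* BTOBB(need_bytes) */

    int threshold = need_blocks;
    if (threshold < (logBBsize >> 2))
        threshold = logBBsize >> 2;   /* 25% of the log = 2048 */
    if (threshold < 256)
        threshold = 256;

    /* push the AIL only if fewer than 2048 blocks are free */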
*/ - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + if (xfs_has_logv2(log->l_mp)) { union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead; int i; int xheads; - xheads = size / XLOG_HEADER_CYCLE_SIZE; - if (size % XLOG_HEADER_CYCLE_SIZE) - xheads++; + xheads = DIV_ROUND_UP(size, XLOG_HEADER_CYCLE_SIZE); for (i = 1; i < xheads; i++) { crc = crc32c(crc, &xhdr[i].hic_xheader, @@ -1739,7 +1852,7 @@ xlog_bio_end_io( &iclog->ic_end_io_work); } -static void +static int xlog_map_iclog_data( struct bio *bio, void *data, @@ -1750,11 +1863,14 @@ xlog_map_iclog_data( unsigned int off = offset_in_page(data); size_t len = min_t(size_t, count, PAGE_SIZE - off); - WARN_ON_ONCE(bio_add_page(bio, page, len, off) != len); + if (bio_add_page(bio, page, len, off) != len) + return -EIO; data += len; count -= len; } while (count); + + return 0; } STATIC void @@ -1762,10 +1878,10 @@ xlog_write_iclog( struct xlog *log, struct xlog_in_core *iclog, uint64_t bno, - unsigned int count, - bool need_flush) + unsigned int count) { ASSERT(bno < log->l_logBBsize); + trace_xlog_iclog_write(iclog, _RET_IP_); /* * We lock the iclogbufs here so that we can serialise against I/O @@ -1776,7 +1892,7 @@ xlog_write_iclog( * across the log IO to archieve that. */ down(&iclog->ic_sema); - if (unlikely(iclog->ic_state == XLOG_STATE_IOERROR)) { + if (xlog_is_shutdown(log)) { /* * It would seem logical to return EIO here, but we rely on * the log state machine to propagate I/O errors instead of @@ -1784,21 +1900,52 @@ xlog_write_iclog( * the buffer manually, the code needs to be kept in sync * with the I/O completion path. */ - xlog_state_done_syncing(iclog, true); + xlog_state_done_syncing(iclog); up(&iclog->ic_sema); return; } - bio_init(&iclog->ic_bio, iclog->ic_bvec, howmany(count, PAGE_SIZE)); - bio_set_dev(&iclog->ic_bio, log->l_targ->bt_bdev); + /* + * We use REQ_SYNC | REQ_IDLE here to tell the block layer the are more + * IOs coming immediately after this one. This prevents the block layer + * writeback throttle from throttling log writes behind background + * metadata writeback and causing priority inversions. + */ + bio_init(&iclog->ic_bio, log->l_targ->bt_bdev, iclog->ic_bvec, + howmany(count, PAGE_SIZE), + REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_IDLE); iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno; iclog->ic_bio.bi_end_io = xlog_bio_end_io; iclog->ic_bio.bi_private = iclog; - iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_FUA; - if (need_flush) + + if (iclog->ic_flags & XLOG_ICL_NEED_FLUSH) { iclog->ic_bio.bi_opf |= REQ_PREFLUSH; + /* + * For external log devices, we also need to flush the data + * device cache first to ensure all metadata writeback covered + * by the LSN in this iclog is on stable storage. This is slow, + * but it *must* complete before we issue the external log IO. + * + * If the flush fails, we cannot conclude that past metadata + * writeback from the log succeeded. Repeating the flush is + * not possible, hence we must shut down with log IO error to + * avoid shutdown re-entering this path and erroring out again. 
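A small note on the DIV_ROUND_UP() conversion in the xlog_cksum() hunk above: it is behaviour preserving. Assuming XLOG_HEADER_CYCLE_SIZE is 32k, as in mainline:

    /* old form */
    xheads = size / (32 * 1024);
    if (size % (32 * 1024))
        xheads++;

    /* new form, identical result: size = 40960 gives xheads = 2 either way */
    xheads = DIV_ROUND_UP(size, 32 * 1024);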
+ */ + if (log->l_targ != log->l_mp->m_ddev_targp && + blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev)) { + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); + return; + } + } + if (iclog->ic_flags & XLOG_ICL_NEED_FUA) + iclog->ic_bio.bi_opf |= REQ_FUA; + + iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA); - xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count); + if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) { + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); + return; + } if (is_vmalloc_addr(iclog->ic_data)) flush_kernel_vmap_range(iclog->ic_data, count); @@ -1852,34 +1999,20 @@ xlog_calc_iclog_size( uint32_t *roundoff) { uint32_t count_init, count; - bool use_lsunit; - - use_lsunit = xfs_sb_version_haslogv2(&log->l_mp->m_sb) && - log->l_mp->m_sb.sb_logsunit > 1; /* Add for LR header */ count_init = log->l_iclog_hsize + iclog->ic_offset; + count = roundup(count_init, log->l_iclog_roundoff); - /* Round out the log write size */ - if (use_lsunit) { - /* we have a v2 stripe unit to use */ - count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init)); - } else { - count = BBTOB(BTOBB(count_init)); - } - - ASSERT(count >= count_init); *roundoff = count - count_init; - if (use_lsunit) - ASSERT(*roundoff < log->l_mp->m_sb.sb_logsunit); - else - ASSERT(*roundoff < BBTOB(1)); + ASSERT(count >= count_init); + ASSERT(*roundoff < log->l_iclog_roundoff); return count; } /* - * Flush out the in-core log (iclog) to the on-disk log in an asynchronous + * Flush out the in-core log (iclog) to the on-disk log in an asynchronous * fashion. Previously, we should have moved the current iclog * ptr in the log to point to the next available iclog. This allows further * write to continue while this code syncs out an iclog ready to go. @@ -1904,28 +2037,37 @@ xlog_calc_iclog_size( STATIC void xlog_sync( struct xlog *log, - struct xlog_in_core *iclog) + struct xlog_in_core *iclog, + struct xlog_ticket *ticket) { unsigned int count; /* byte count of bwrite */ unsigned int roundoff; /* roundoff to BB or stripe */ uint64_t bno; unsigned int size; - bool need_flush = true, split = false; ASSERT(atomic_read(&iclog->ic_refcnt) == 0); + trace_xlog_iclog_sync(iclog, _RET_IP_); count = xlog_calc_iclog_size(log, iclog, &roundoff); - /* move grant heads by roundoff in sync */ - xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff); - xlog_grant_add_space(log, &log->l_write_head.grant, roundoff); + /* + * If we have a ticket, account for the roundoff via the ticket + * reservation to avoid touching the hot grant heads needlessly. + * Otherwise, we have to move grant heads directly. + */ + if (ticket) { + ticket->t_curr_res -= roundoff; + } else { + xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff); + xlog_grant_add_space(log, &log->l_write_head.grant, roundoff); + } /* put cycle number in every block */ - xlog_pack_data(log, iclog, roundoff); + xlog_pack_data(log, iclog, roundoff); /* real byte length */ size = iclog->ic_offset; - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) + if (xfs_has_logv2(log->l_mp)) size += roundoff; iclog->ic_header.h_len = cpu_to_be32(size); @@ -1935,10 +2077,8 @@ xlog_sync( bno = BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)); /* Do we need to split this write into 2 parts? 
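A hypothetical wrap case for the split check that follows:

    /* 8192-block log; a 512-block write starting at block 8000 wraps */
    uint64_t     bno = 8000;
    unsigned int count_bb = 512;     /* BTOBB(count) */
    uint64_t     logBBsize = 8192;

    if (bno + count_bb > logBBsize) {
        /* xlog_split_iclog(): 192 blocks at the physical end of the
         * log, the remaining 320 starting again at block 0 */
    }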
*/ - if (bno + BTOBB(count) > log->l_logBBsize) { + if (bno + BTOBB(count) > log->l_logBBsize) xlog_split_iclog(log, &iclog->ic_header, bno, count); - split = true; - } /* calculcate the checksum */ iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header, @@ -1959,22 +2099,8 @@ xlog_sync( be64_to_cpu(iclog->ic_header.h_lsn)); } #endif - - /* - * Flush the data device before flushing the log to make sure all meta - * data written back from the AIL actually made it to disk before - * stamping the new log tail LSN into the log buffer. For an external - * log we need to issue the flush explicitly, and unfortunately - * synchronously here; for an internal log we can simply use the block - * layer state machine for preflushes. - */ - if (log->l_targ != log->l_mp->m_ddev_targp || split) { - xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp); - need_flush = false; - } - xlog_verify_iclog(log, iclog, count); - xlog_write_iclog(log, iclog, bno, count, need_flush); + xlog_write_iclog(log, iclog, bno, count); } /* @@ -1987,8 +2113,6 @@ xlog_dealloc_log( xlog_in_core_t *iclog, *next_iclog; int i; - xlog_cil_destroy(log); - /* * Cycle all the iclogbuf locks to make sure all log IO completion * is done before we tear down these buffers. @@ -2000,6 +2124,13 @@ xlog_dealloc_log( iclog = iclog->ic_next; } + /* + * Destroy the CIL after waiting for iclog IO completion because an + * iclog EIO error will try to shut down the log, which accesses the + * CIL to wake up the waiters. + */ + xlog_cil_destroy(log); + iclog = log->l_iclog; for (i = 0; i < log->l_iclog_bufs; i++) { next_iclog = iclog->ic_next; @@ -2011,7 +2142,7 @@ xlog_dealloc_log( log->l_mp->m_log = NULL; destroy_workqueue(log->l_ioend_workqueue); kmem_free(log); -} /* xlog_dealloc_log */ +} /* * Update counters atomically now that memcpy is done. 
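Worked example for the l_iclog_roundoff sizing in xlog_calc_iclog_size() above, assuming a hypothetical v2 log with a 16k stripe unit:

    uint32_t iclog_roundoff = 16 * 1024;   /* l_iclog_roundoff = sb_logsunit */
    uint32_t count_init = 9000;            /* LR header plus iclog payload */

    uint32_t count = roundup(count_init, iclog_roundoff);   /* 16384 */
    uint32_t roundoff = count - count_init;                  /* 7384 */

    /* with a ticket, the roundoff is charged to ticket->t_curr_res
     * instead of being added to the hot grant heads */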
@@ -2038,63 +2169,11 @@ xlog_print_tic_res( struct xfs_mount *mp, struct xlog_ticket *ticket) { - uint i; - uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t); - - /* match with XLOG_REG_TYPE_* in xfs_log.h */ -#define REG_TYPE_STR(type, str) [XLOG_REG_TYPE_##type] = str - static char *res_type_str[] = { - REG_TYPE_STR(BFORMAT, "bformat"), - REG_TYPE_STR(BCHUNK, "bchunk"), - REG_TYPE_STR(EFI_FORMAT, "efi_format"), - REG_TYPE_STR(EFD_FORMAT, "efd_format"), - REG_TYPE_STR(IFORMAT, "iformat"), - REG_TYPE_STR(ICORE, "icore"), - REG_TYPE_STR(IEXT, "iext"), - REG_TYPE_STR(IBROOT, "ibroot"), - REG_TYPE_STR(ILOCAL, "ilocal"), - REG_TYPE_STR(IATTR_EXT, "iattr_ext"), - REG_TYPE_STR(IATTR_BROOT, "iattr_broot"), - REG_TYPE_STR(IATTR_LOCAL, "iattr_local"), - REG_TYPE_STR(QFORMAT, "qformat"), - REG_TYPE_STR(DQUOT, "dquot"), - REG_TYPE_STR(QUOTAOFF, "quotaoff"), - REG_TYPE_STR(LRHEADER, "LR header"), - REG_TYPE_STR(UNMOUNT, "unmount"), - REG_TYPE_STR(COMMIT, "commit"), - REG_TYPE_STR(TRANSHDR, "trans header"), - REG_TYPE_STR(ICREATE, "inode create"), - REG_TYPE_STR(RUI_FORMAT, "rui_format"), - REG_TYPE_STR(RUD_FORMAT, "rud_format"), - REG_TYPE_STR(CUI_FORMAT, "cui_format"), - REG_TYPE_STR(CUD_FORMAT, "cud_format"), - REG_TYPE_STR(BUI_FORMAT, "bui_format"), - REG_TYPE_STR(BUD_FORMAT, "bud_format"), - }; - BUILD_BUG_ON(ARRAY_SIZE(res_type_str) != XLOG_REG_TYPE_MAX + 1); -#undef REG_TYPE_STR - xfs_warn(mp, "ticket reservation summary:"); - xfs_warn(mp, " unit res = %d bytes", - ticket->t_unit_res); - xfs_warn(mp, " current res = %d bytes", - ticket->t_curr_res); - xfs_warn(mp, " total reg = %u bytes (o/flow = %u bytes)", - ticket->t_res_arr_sum, ticket->t_res_o_flow); - xfs_warn(mp, " ophdrs = %u (ophdr space = %u bytes)", - ticket->t_res_num_ophdrs, ophdr_spc); - xfs_warn(mp, " ophdr + reg = %u bytes", - ticket->t_res_arr_sum + ticket->t_res_o_flow + ophdr_spc); - xfs_warn(mp, " num regions = %u", - ticket->t_res_num); - - for (i = 0; i < ticket->t_res_num; i++) { - uint r_type = ticket->t_res_arr[i].r_type; - xfs_warn(mp, "region[%u]: %s - %u bytes", i, - ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ? - "bad-rtype" : res_type_str[r_type]), - ticket->t_res_arr[i].r_len); - } + xfs_warn(mp, " unit res = %d bytes", ticket->t_unit_res); + xfs_warn(mp, " current res = %d bytes", ticket->t_curr_res); + xfs_warn(mp, " original count = %d", ticket->t_ocnt); + xfs_warn(mp, " remaining count = %d", ticket->t_cnt); } /* @@ -2147,201 +2226,226 @@ xlog_print_trans( } } -/* - * Calculate the potential space needed by the log vector. Each region gets - * its own xlog_op_header_t and may need to be double word aligned. 
- */ -static int -xlog_write_calc_vec_length( - struct xlog_ticket *ticket, - struct xfs_log_vec *log_vector) +static inline void +xlog_write_iovec( + struct xlog_in_core *iclog, + uint32_t *log_offset, + void *data, + uint32_t write_len, + int *bytes_left, + uint32_t *record_cnt, + uint32_t *data_cnt) { - struct xfs_log_vec *lv; - int headers = 0; - int len = 0; - int i; - - /* acct for start rec of xact */ - if (ticket->t_flags & XLOG_TIC_INITED) - headers++; - - for (lv = log_vector; lv; lv = lv->lv_next) { - /* we don't write ordered log vectors */ - if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) - continue; - - headers += lv->lv_niovecs; - - for (i = 0; i < lv->lv_niovecs; i++) { - struct xfs_log_iovec *vecp = &lv->lv_iovecp[i]; + ASSERT(*log_offset < iclog->ic_log->l_iclog_size); + ASSERT(*log_offset % sizeof(int32_t) == 0); + ASSERT(write_len % sizeof(int32_t) == 0); - len += vecp->i_len; - xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type); - } - } - - ticket->t_res_num_ophdrs += headers; - len += headers * sizeof(struct xlog_op_header); - - return len; + memcpy(iclog->ic_datap + *log_offset, data, write_len); + *log_offset += write_len; + *bytes_left -= write_len; + (*record_cnt)++; + *data_cnt += write_len; } /* - * If first write for transaction, insert start record We can't be trying to - * commit if we are inited. We can't have any "partial_copy" if we are inited. + * Write log vectors into a single iclog which is guaranteed by the caller + * to have enough space to write the entire log vector into. */ -static int -xlog_write_start_rec( - struct xlog_op_header *ophdr, - struct xlog_ticket *ticket) +static void +xlog_write_full( + struct xfs_log_vec *lv, + struct xlog_ticket *ticket, + struct xlog_in_core *iclog, + uint32_t *log_offset, + uint32_t *len, + uint32_t *record_cnt, + uint32_t *data_cnt) { - if (!(ticket->t_flags & XLOG_TIC_INITED)) - return 0; + int index; - ophdr->oh_tid = cpu_to_be32(ticket->t_tid); - ophdr->oh_clientid = ticket->t_clientid; - ophdr->oh_len = 0; - ophdr->oh_flags = XLOG_START_TRANS; - ophdr->oh_res2 = 0; + ASSERT(*log_offset + *len <= iclog->ic_size || + iclog->ic_state == XLOG_STATE_WANT_SYNC); - ticket->t_flags &= ~XLOG_TIC_INITED; + /* + * Ordered log vectors have no regions to write so this + * loop will naturally skip them. + */ + for (index = 0; index < lv->lv_niovecs; index++) { + struct xfs_log_iovec *reg = &lv->lv_iovecp[index]; + struct xlog_op_header *ophdr = reg->i_addr; - return sizeof(struct xlog_op_header); + ophdr->oh_tid = cpu_to_be32(ticket->t_tid); + xlog_write_iovec(iclog, log_offset, reg->i_addr, + reg->i_len, len, record_cnt, data_cnt); + } } -static xlog_op_header_t * -xlog_write_setup_ophdr( - struct xlog *log, - struct xlog_op_header *ophdr, +static int +xlog_write_get_more_iclog_space( struct xlog_ticket *ticket, - uint flags) + struct xlog_in_core **iclogp, + uint32_t *log_offset, + uint32_t len, + uint32_t *record_cnt, + uint32_t *data_cnt) { - ophdr->oh_tid = cpu_to_be32(ticket->t_tid); - ophdr->oh_clientid = ticket->t_clientid; - ophdr->oh_res2 = 0; - - /* are we copying a commit or unmount record? */ - ophdr->oh_flags = flags; + struct xlog_in_core *iclog = *iclogp; + struct xlog *log = iclog->ic_log; + int error; - /* - * We've seen logs corrupted with bad transaction client ids. This - * makes sure that XFS doesn't generate them on. Turn this into an EIO - * and shut down the filesystem. 
- */ - switch (ophdr->oh_clientid) { - case XFS_TRANSACTION: - case XFS_VOLUME: - case XFS_LOG: - break; - default: - xfs_warn(log->l_mp, - "Bad XFS transaction clientid 0x%x in ticket "PTR_FMT, - ophdr->oh_clientid, ticket); - return NULL; - } + spin_lock(&log->l_icloglock); + ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC); + xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); + error = xlog_state_release_iclog(log, iclog, ticket); + spin_unlock(&log->l_icloglock); + if (error) + return error; - return ophdr; + error = xlog_state_get_iclog_space(log, len, &iclog, ticket, + log_offset); + if (error) + return error; + *record_cnt = 0; + *data_cnt = 0; + *iclogp = iclog; + return 0; } /* - * Set up the parameters of the region copy into the log. This has - * to handle region write split across multiple log buffers - this - * state is kept external to this function so that this code can - * be written in an obvious, self documenting manner. + * Write log vectors into a single iclog which is smaller than the current chain + * length. We write until we cannot fit a full record into the remaining space + * and then stop. We return the log vector that is to be written that cannot + * wholly fit in the iclog. */ static int -xlog_write_setup_copy( +xlog_write_partial( + struct xfs_log_vec *lv, struct xlog_ticket *ticket, - struct xlog_op_header *ophdr, - int space_available, - int space_required, - int *copy_off, - int *copy_len, - int *last_was_partial_copy, - int *bytes_consumed) -{ - int still_to_copy; - - still_to_copy = space_required - *bytes_consumed; - *copy_off = *bytes_consumed; - - if (still_to_copy <= space_available) { - /* write of region completes here */ - *copy_len = still_to_copy; - ophdr->oh_len = cpu_to_be32(*copy_len); - if (*last_was_partial_copy) - ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS); - *last_was_partial_copy = 0; - *bytes_consumed = 0; - return 0; - } + struct xlog_in_core **iclogp, + uint32_t *log_offset, + uint32_t *len, + uint32_t *record_cnt, + uint32_t *data_cnt) +{ + struct xlog_in_core *iclog = *iclogp; + struct xlog_op_header *ophdr; + int index = 0; + uint32_t rlen; + int error; - /* partial write of region, needs extra log op header reservation */ - *copy_len = space_available; - ophdr->oh_len = cpu_to_be32(*copy_len); - ophdr->oh_flags |= XLOG_CONTINUE_TRANS; - if (*last_was_partial_copy) - ophdr->oh_flags |= XLOG_WAS_CONT_TRANS; - *bytes_consumed += *copy_len; - (*last_was_partial_copy)++; + /* walk the logvec, copying until we run out of space in the iclog */ + for (index = 0; index < lv->lv_niovecs; index++) { + struct xfs_log_iovec *reg = &lv->lv_iovecp[index]; + uint32_t reg_offset = 0; - /* account for new log op header */ - ticket->t_curr_res -= sizeof(struct xlog_op_header); - ticket->t_res_num_ophdrs++; + /* + * The first region of a continuation must have a non-zero + * length otherwise log recovery will just skip over it and + * start recovering from the next opheader it finds. Because we + * mark the next opheader as a continuation, recovery will then + * incorrectly add the continuation to the previous region and + * that breaks stuff. + * + * Hence if there isn't space for region data after the + * opheader, then we need to start afresh with a new iclog. 
+		 */
+		if (iclog->ic_size - *log_offset <=
+		    sizeof(struct xlog_op_header)) {
+			error = xlog_write_get_more_iclog_space(ticket,
+					&iclog, log_offset, *len, record_cnt,
+					data_cnt);
+			if (error)
+				return error;
+		}

-	return sizeof(struct xlog_op_header);
-}
+		ophdr = reg->i_addr;
+		rlen = min_t(uint32_t, reg->i_len, iclog->ic_size - *log_offset);

-static int
-xlog_write_copy_finish(
-	struct xlog *log,
-	struct xlog_in_core *iclog,
-	uint flags,
-	int *record_cnt,
-	int *data_cnt,
-	int *partial_copy,
-	int *partial_copy_len,
-	int log_offset,
-	struct xlog_in_core **commit_iclog)
-{
-	int error;
+		ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
+		ophdr->oh_len = cpu_to_be32(rlen - sizeof(struct xlog_op_header));
+		if (rlen != reg->i_len)
+			ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
+
+		xlog_write_iovec(iclog, log_offset, reg->i_addr,
+				rlen, len, record_cnt, data_cnt);
+
+		/* If we wrote the whole region, move to the next. */
+		if (rlen == reg->i_len)
+			continue;

-	if (*partial_copy) {
		/*
-		 * This iclog has already been marked WANT_SYNC by
-		 * xlog_state_get_iclog_space.
+		 * We now have a partially written iovec, but it can span
+		 * multiple iclogs so we loop here. First we release the iclog
+		 * we currently have, then we get a new iclog and add a new
+		 * opheader. Then we continue copying from where we were until
+		 * we either complete the iovec or fill the iclog. If we
+		 * complete the iovec, then we increment the index and go right
+		 * back to the top of the outer loop. If we fill the iclog, we
+		 * run the inner loop again.
+		 *
+		 * This is complicated by the tail of a region using all the
+		 * space in an iclog and hence requiring us to release the iclog
+		 * and get a new one before returning to the outer loop. We must
+		 * always guarantee that we exit this inner loop with at least
+		 * space for log transaction opheaders left in the current
+		 * iclog, hence we cannot just terminate the loop at the end
+		 * of the continuation. So we loop while there is no space left
+		 * in the current iclog, and check for the end of the
+		 * continuation after getting a new iclog.
		 */
-		spin_lock(&log->l_icloglock);
-		xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
-		*record_cnt = 0;
-		*data_cnt = 0;
-		goto release_iclog;
-	}
+		do {
+			/*
+			 * Ensure we include the continuation opheader in the
+			 * space we need in the new iclog by adding that size
+			 * to the length we require. This continuation opheader
+			 * needs to be accounted to the ticket as the space it
+			 * consumes hasn't been accounted to the lv we are
+			 * writing.
+			 */
+			error = xlog_write_get_more_iclog_space(ticket,
+					&iclog, log_offset,
+					*len + sizeof(struct xlog_op_header),
+					record_cnt, data_cnt);
+			if (error)
+				return error;

-	*partial_copy = 0;
-	*partial_copy_len = 0;
+			ophdr = iclog->ic_datap + *log_offset;
+			ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
+			ophdr->oh_clientid = XFS_TRANSACTION;
+			ophdr->oh_res2 = 0;
+			ophdr->oh_flags = XLOG_WAS_CONT_TRANS;

-	if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
-		/* no more space in this iclog - push it.
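[Editorial aside: to see what the continuation machinery above produces, here is a self-contained simulation of splitting one region across fixed-size iclogs. The printed CONTINUE/WAS_CONT/END labels mirror the opheader flags set by xlog_write_partial(); the tiny sizes are assumptions, and locking, tickets and the single-segment case are deliberately left out.]

/*
 * Sketch of the region split: a region bigger than the space left in an
 * iclog goes out in pieces, each piece described by an opheader with
 * continuation flags. Not the kernel algorithm verbatim.
 */
#include <stdint.h>
#include <stdio.h>

#define ICLOG_SIZE      64u     /* assumed tiny iclog for demonstration */
#define OPHDR_SIZE      8u      /* stand-in for sizeof(struct xlog_op_header) */

int main(void)
{
        uint32_t reg_len = 150;         /* region bytes to write */
        uint32_t reg_offset = 0;
        uint32_t log_offset = 0;        /* offset into the current iclog */
        int segment = 0;

        while (reg_offset < reg_len) {
                uint32_t rlen;

                /* no room for data after an opheader? start a new iclog */
                if (ICLOG_SIZE - log_offset <= OPHDR_SIZE)
                        log_offset = 0;

                log_offset += OPHDR_SIZE;       /* space for the opheader */
                rlen = reg_len - reg_offset;
                if (rlen > ICLOG_SIZE - log_offset)
                        rlen = ICLOG_SIZE - log_offset;

                printf("segment %d: %u bytes%s%s\n", segment, rlen,
                       segment ? " WAS_CONT" : "",
                       reg_offset + rlen < reg_len ? " CONTINUE" : " END");

                reg_offset += rlen;
                log_offset += rlen;
                segment++;
        }
        return 0;
}

[With a 150-byte region and 64-byte iclogs this prints three segments: a CONTINUE head, a WAS_CONT CONTINUE middle, and a WAS_CONT END tail, which is the shape log recovery expects to reassemble.]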
*/ - spin_lock(&log->l_icloglock); - xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); - *record_cnt = 0; - *data_cnt = 0; + ticket->t_curr_res -= sizeof(struct xlog_op_header); + *log_offset += sizeof(struct xlog_op_header); + *data_cnt += sizeof(struct xlog_op_header); - xlog_state_want_sync(log, iclog); - if (!commit_iclog) - goto release_iclog; - spin_unlock(&log->l_icloglock); - ASSERT(flags & XLOG_COMMIT_TRANS); - *commit_iclog = iclog; + /* + * If rlen fits in the iclog, then end the region + * continuation. Otherwise we're going around again. + */ + reg_offset += rlen; + rlen = reg->i_len - reg_offset; + if (rlen <= iclog->ic_size - *log_offset) + ophdr->oh_flags |= XLOG_END_TRANS; + else + ophdr->oh_flags |= XLOG_CONTINUE_TRANS; + + rlen = min_t(uint32_t, rlen, iclog->ic_size - *log_offset); + ophdr->oh_len = cpu_to_be32(rlen); + + xlog_write_iovec(iclog, log_offset, + reg->i_addr + reg_offset, + rlen, len, record_cnt, data_cnt); + + } while (ophdr->oh_flags & XLOG_CONTINUE_TRANS); } + /* + * No more iovecs remain in this logvec so return the next log vec to + * the caller so it can go back to fast path copying. + */ + *iclogp = iclog; return 0; - -release_iclog: - error = xlog_state_release_iclog(log, iclog); - spin_unlock(&log->l_icloglock); - return error; } /* @@ -2387,299 +2491,187 @@ release_iclog: int xlog_write( struct xlog *log, - struct xfs_log_vec *log_vector, + struct xfs_cil_ctx *ctx, + struct list_head *lv_chain, struct xlog_ticket *ticket, - xfs_lsn_t *start_lsn, - struct xlog_in_core **commit_iclog, - uint flags) + uint32_t len) + { struct xlog_in_core *iclog = NULL; - struct xfs_log_iovec *vecp; struct xfs_log_vec *lv; - int len; - int index; - int partial_copy = 0; - int partial_copy_len = 0; - int contwr = 0; - int record_cnt = 0; - int data_cnt = 0; + uint32_t record_cnt = 0; + uint32_t data_cnt = 0; int error = 0; - - *start_lsn = 0; - - len = xlog_write_calc_vec_length(ticket, log_vector); - - /* - * Region headers and bytes are already accounted for. - * We only need to take into account start records and - * split regions in this function. - */ - if (ticket->t_flags & XLOG_TIC_INITED) - ticket->t_curr_res -= sizeof(xlog_op_header_t); - - /* - * Commit record headers need to be accounted for. These - * come in as separate writes so are easy to detect. - */ - if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) - ticket->t_curr_res -= sizeof(xlog_op_header_t); + int log_offset; if (ticket->t_curr_res < 0) { xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, "ctx ticket reservation ran out. Need to up reservation"); xlog_print_tic_res(log->l_mp, ticket); - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); } - index = 0; - lv = log_vector; - vecp = lv->lv_iovecp; - while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { - void *ptr; - int log_offset; - - error = xlog_state_get_iclog_space(log, len, &iclog, ticket, - &contwr, &log_offset); - if (error) - return error; + error = xlog_state_get_iclog_space(log, len, &iclog, ticket, + &log_offset); + if (error) + return error; - ASSERT(log_offset <= iclog->ic_size - 1); - ptr = iclog->ic_datap + log_offset; + ASSERT(log_offset <= iclog->ic_size - 1); - /* start_lsn is the first lsn written to. That's all we need. */ - if (!*start_lsn) - *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn); + /* + * If we have a context pointer, pass it the first iclog we are + * writing to so it can record state needed for iclog write + * ordering. 
+ */ + if (ctx) + xlog_cil_set_ctx_write_state(ctx, iclog); + list_for_each_entry(lv, lv_chain, lv_list) { /* - * This loop writes out as many regions as can fit in the amount - * of space which was allocated by xlog_state_get_iclog_space(). + * If the entire log vec does not fit in the iclog, punt it to + * the partial copy loop which can handle this case. */ - while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { - struct xfs_log_iovec *reg; - struct xlog_op_header *ophdr; - int start_rec_copy; - int copy_len; - int copy_off; - bool ordered = false; - - /* ordered log vectors have no regions to write */ - if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) { - ASSERT(lv->lv_niovecs == 0); - ordered = true; - goto next_lv; - } - - reg = &vecp[index]; - ASSERT(reg->i_len % sizeof(int32_t) == 0); - ASSERT((unsigned long)ptr % sizeof(int32_t) == 0); - - start_rec_copy = xlog_write_start_rec(ptr, ticket); - if (start_rec_copy) { - record_cnt++; - xlog_write_adv_cnt(&ptr, &len, &log_offset, - start_rec_copy); - } - - ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags); - if (!ophdr) - return -EIO; - - xlog_write_adv_cnt(&ptr, &len, &log_offset, - sizeof(struct xlog_op_header)); - - len += xlog_write_setup_copy(ticket, ophdr, - iclog->ic_size-log_offset, - reg->i_len, - ©_off, ©_len, - &partial_copy, - &partial_copy_len); - xlog_verify_dest_ptr(log, ptr); - - /* - * Copy region. - * - * Unmount records just log an opheader, so can have - * empty payloads with no data region to copy. Hence we - * only copy the payload if the vector says it has data - * to copy. - */ - ASSERT(copy_len >= 0); - if (copy_len > 0) { - memcpy(ptr, reg->i_addr + copy_off, copy_len); - xlog_write_adv_cnt(&ptr, &len, &log_offset, - copy_len); - } - copy_len += start_rec_copy + sizeof(xlog_op_header_t); - record_cnt++; - data_cnt += contwr ? copy_len : 0; - - error = xlog_write_copy_finish(log, iclog, flags, - &record_cnt, &data_cnt, - &partial_copy, - &partial_copy_len, - log_offset, - commit_iclog); - if (error) + if (lv->lv_niovecs && + lv->lv_bytes > iclog->ic_size - log_offset) { + error = xlog_write_partial(lv, ticket, &iclog, + &log_offset, &len, &record_cnt, + &data_cnt); + if (error) { + /* + * We have no iclog to release, so just return + * the error immediately. + */ return error; - - /* - * if we had a partial copy, we need to get more iclog - * space but we don't want to increment the region - * index because there is still more is this region to - * write. - * - * If we completed writing this region, and we flushed - * the iclog (indicated by resetting of the record - * count), then we also need to get more log space. If - * this was the last record, though, we are done and - * can just return. - */ - if (partial_copy) - break; - - if (++index == lv->lv_niovecs) { -next_lv: - lv = lv->lv_next; - index = 0; - if (lv) - vecp = lv->lv_iovecp; - } - if (record_cnt == 0 && !ordered) { - if (!lv) - return 0; - break; } + } else { + xlog_write_full(lv, ticket, iclog, &log_offset, + &len, &record_cnt, &data_cnt); } } - ASSERT(len == 0); + /* + * We've already been guaranteed that the last writes will fit inside + * the current iclog, and hence it will already have the space used by + * those writes accounted to it. Hence we do not need to update the + * iclog with the number of bytes written here. 
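[Editorial aside: the dispatch between the full and partial paths in the rewritten xlog_write() above comes down to one comparison of the vector size against the space left in the current iclog, plus the rule that ordered vectors with no iovecs always take the full path. A trivial sketch, with made-up sizes:]

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* models the lv_bytes vs. remaining-space test in xlog_write() */
static bool needs_partial(uint32_t lv_niovecs, uint32_t lv_bytes,
                          uint32_t iclog_size, uint32_t log_offset)
{
        /* ordered vectors (no iovecs) never take the partial path */
        return lv_niovecs && lv_bytes > iclog_size - log_offset;
}

int main(void)
{
        printf("%d\n", needs_partial(2, 3000, 32768, 30000)); /* 1: partial */
        printf("%d\n", needs_partial(2, 3000, 32768, 2000));  /* 0: full */
        return 0;
}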
+ */ spin_lock(&log->l_icloglock); - xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); - if (commit_iclog) { - ASSERT(flags & XLOG_COMMIT_TRANS); - *commit_iclog = iclog; - } else { - error = xlog_state_release_iclog(log, iclog); - } + xlog_state_finish_copy(log, iclog, record_cnt, 0); + error = xlog_state_release_iclog(log, iclog, ticket); spin_unlock(&log->l_icloglock); return error; } +static void +xlog_state_activate_iclog( + struct xlog_in_core *iclog, + int *iclogs_changed) +{ + ASSERT(list_empty_careful(&iclog->ic_callbacks)); + trace_xlog_iclog_activate(iclog, _RET_IP_); -/***************************************************************************** - * - * State Machine functions - * - ***************************************************************************** - */ + /* + * If the number of ops in this iclog indicate it just contains the + * dummy transaction, we can change state into IDLE (the second time + * around). Otherwise we should change the state into NEED a dummy. + * We don't need to cover the dummy. + */ + if (*iclogs_changed == 0 && + iclog->ic_header.h_num_logops == cpu_to_be32(XLOG_COVER_OPS)) { + *iclogs_changed = 1; + } else { + /* + * We have two dirty iclogs so start over. This could also be + * num of ops indicating this is not the dummy going out. + */ + *iclogs_changed = 2; + } + + iclog->ic_state = XLOG_STATE_ACTIVE; + iclog->ic_offset = 0; + iclog->ic_header.h_num_logops = 0; + memset(iclog->ic_header.h_cycle_data, 0, + sizeof(iclog->ic_header.h_cycle_data)); + iclog->ic_header.h_lsn = 0; + iclog->ic_header.h_tail_lsn = 0; +} /* - * An iclog has just finished IO completion processing, so we need to update - * the iclog state and propagate that up into the overall log state. Hence we - * prepare the iclog for cleaning, and then clean all the pending dirty iclogs - * starting from the head, and then wake up any threads that are waiting for the - * iclog to be marked clean. - * - * The ordering of marking iclogs ACTIVE must be maintained, so an iclog - * doesn't become ACTIVE beyond one that is SYNCING. This is also required to - * maintain the notion that we use a ordered wait queue to hold off would be - * writers to the log when every iclog is trying to sync to disk. - * - * Caller must hold the icloglock before calling us. - * - * State Change: !IOERROR -> DIRTY -> ACTIVE + * Loop through all iclogs and mark all iclogs currently marked DIRTY as + * ACTIVE after iclog I/O has completed. */ -STATIC void -xlog_state_clean_iclog( +static void +xlog_state_activate_iclogs( struct xlog *log, - struct xlog_in_core *dirty_iclog) + int *iclogs_changed) { - struct xlog_in_core *iclog; - int changed = 0; - - /* Prepare the completed iclog. */ - if (dirty_iclog->ic_state != XLOG_STATE_IOERROR) - dirty_iclog->ic_state = XLOG_STATE_DIRTY; + struct xlog_in_core *iclog = log->l_iclog; - /* Walk all the iclogs to update the ordered active state. */ - iclog = log->l_iclog; do { - if (iclog->ic_state == XLOG_STATE_DIRTY) { - iclog->ic_state = XLOG_STATE_ACTIVE; - iclog->ic_offset = 0; - ASSERT(list_empty_careful(&iclog->ic_callbacks)); - /* - * If the number of ops in this iclog indicate it just - * contains the dummy transaction, we can - * change state into IDLE (the second time around). - * Otherwise we should change the state into - * NEED a dummy. - * We don't need to cover the dummy. 
- */ - if (!changed && - (be32_to_cpu(iclog->ic_header.h_num_logops) == - XLOG_COVER_OPS)) { - changed = 1; - } else { - /* - * We have two dirty iclogs so start over - * This could also be num of ops indicates - * this is not the dummy going out. - */ - changed = 2; - } - iclog->ic_header.h_num_logops = 0; - memset(iclog->ic_header.h_cycle_data, 0, - sizeof(iclog->ic_header.h_cycle_data)); - iclog->ic_header.h_lsn = 0; - } else if (iclog->ic_state == XLOG_STATE_ACTIVE) - /* do nothing */; - else - break; /* stop cleaning */ - iclog = iclog->ic_next; - } while (iclog != log->l_iclog); - + if (iclog->ic_state == XLOG_STATE_DIRTY) + xlog_state_activate_iclog(iclog, iclogs_changed); + /* + * The ordering of marking iclogs ACTIVE must be maintained, so + * an iclog doesn't become ACTIVE beyond one that is SYNCING. + */ + else if (iclog->ic_state != XLOG_STATE_ACTIVE) + break; + } while ((iclog = iclog->ic_next) != log->l_iclog); +} +static int +xlog_covered_state( + int prev_state, + int iclogs_changed) +{ /* - * Wake up threads waiting in xfs_log_force() for the dirty iclog - * to be cleaned. + * We go to NEED for any non-covering writes. We go to NEED2 if we just + * wrote the first covering record (DONE). We go to IDLE if we just + * wrote the second covering record (DONE2) and remain in IDLE until a + * non-covering write occurs. */ - wake_up_all(&dirty_iclog->ic_force_wait); + switch (prev_state) { + case XLOG_STATE_COVER_IDLE: + if (iclogs_changed == 1) + return XLOG_STATE_COVER_IDLE; + fallthrough; + case XLOG_STATE_COVER_NEED: + case XLOG_STATE_COVER_NEED2: + break; + case XLOG_STATE_COVER_DONE: + if (iclogs_changed == 1) + return XLOG_STATE_COVER_NEED2; + break; + case XLOG_STATE_COVER_DONE2: + if (iclogs_changed == 1) + return XLOG_STATE_COVER_IDLE; + break; + default: + ASSERT(0); + } - /* - * Change state for the dummy log recording. - * We usually go to NEED. But we go to NEED2 if the changed indicates - * we are done writing the dummy record. - * If we are done with the second dummy recored (DONE2), then - * we go to IDLE. 
- */ - if (changed) { - switch (log->l_covered_state) { - case XLOG_STATE_COVER_IDLE: - case XLOG_STATE_COVER_NEED: - case XLOG_STATE_COVER_NEED2: - log->l_covered_state = XLOG_STATE_COVER_NEED; - break; + return XLOG_STATE_COVER_NEED; +} - case XLOG_STATE_COVER_DONE: - if (changed == 1) - log->l_covered_state = XLOG_STATE_COVER_NEED2; - else - log->l_covered_state = XLOG_STATE_COVER_NEED; - break; +STATIC void +xlog_state_clean_iclog( + struct xlog *log, + struct xlog_in_core *dirty_iclog) +{ + int iclogs_changed = 0; - case XLOG_STATE_COVER_DONE2: - if (changed == 1) - log->l_covered_state = XLOG_STATE_COVER_IDLE; - else - log->l_covered_state = XLOG_STATE_COVER_NEED; - break; + trace_xlog_iclog_clean(dirty_iclog, _RET_IP_); - default: - ASSERT(0); - } + dirty_iclog->ic_state = XLOG_STATE_DIRTY; + + xlog_state_activate_iclogs(log, &iclogs_changed); + wake_up_all(&dirty_iclog->ic_force_wait); + + if (iclogs_changed) { + log->l_covered_state = xlog_covered_state(log->l_covered_state, + iclogs_changed); } } @@ -2731,6 +2723,7 @@ xlog_state_set_callback( struct xlog_in_core *iclog, xfs_lsn_t header_lsn) { + trace_xlog_iclog_callback(iclog, _RET_IP_); iclog->ic_state = XLOG_STATE_CALLBACK; ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), @@ -2751,8 +2744,7 @@ xlog_state_set_callback( static bool xlog_state_iodone_process_iclog( struct xlog *log, - struct xlog_in_core *iclog, - bool *ioerror) + struct xlog_in_core *iclog) { xfs_lsn_t lowest_lsn; xfs_lsn_t header_lsn; @@ -2764,15 +2756,6 @@ xlog_state_iodone_process_iclog( * Skip all iclogs in the ACTIVE & DIRTY states: */ return false; - case XLOG_STATE_IOERROR: - /* - * Between marking a filesystem SHUTDOWN and stopping the log, - * we do flush all iclogs to disk (if there wasn't a log I/O - * error). So, we do want things to go smoothly in case of just - * a SHUTDOWN w/o a LOG_IO_ERROR. - */ - *ioerror = true; - return false; case XLOG_STATE_DONE_SYNC: /* * Now that we have an iclog that is in the DONE_SYNC state, do @@ -2797,104 +2780,74 @@ xlog_state_iodone_process_iclog( } /* - * Keep processing entries in the iclog callback list until we come around and - * it is empty. We need to atomically see that the list is empty and change the - * state to DIRTY so that we don't miss any more callbacks being added. - * - * This function is called with the icloglock held and returns with it held. We - * drop it while running callbacks, however, as holding it over thousands of - * callbacks is unnecessary and causes excessive contention if we do. + * Loop over all the iclogs, running attached callbacks on them. Return true if + * we ran any callbacks, indicating that we dropped the icloglock. We don't need + * to handle transient shutdown state here at all because + * xlog_state_shutdown_callbacks() will be run to do the necessary shutdown + * cleanup of the callbacks. 
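[Editorial aside: the covering state machine introduced by xlog_covered_state() above is small enough to lift into a standalone test. In this sketch the enum values stand in for the XLOG_STATE_COVER_* constants and the transition table is copied from the patch; everything else is a userspace assumption.]

#include <assert.h>
#include <stdio.h>

enum covered {
        COVER_IDLE, COVER_NEED, COVER_DONE, COVER_NEED2, COVER_DONE2,
};

/* transition table copied from xlog_covered_state() */
static enum covered covered_state(enum covered prev, int iclogs_changed)
{
        switch (prev) {
        case COVER_IDLE:
                if (iclogs_changed == 1)
                        return COVER_IDLE;
                /* fall through */
        case COVER_NEED:
        case COVER_NEED2:
                break;
        case COVER_DONE:
                if (iclogs_changed == 1)
                        return COVER_NEED2;
                break;
        case COVER_DONE2:
                if (iclogs_changed == 1)
                        return COVER_IDLE;
                break;
        }
        return COVER_NEED;
}

int main(void)
{
        /* the two covering writes walk DONE -> NEED2 and DONE2 -> IDLE */
        assert(covered_state(COVER_DONE, 1) == COVER_NEED2);
        assert(covered_state(COVER_DONE2, 1) == COVER_IDLE);
        /* any non-covering write (iclogs_changed == 2) restarts covering */
        assert(covered_state(COVER_IDLE, 2) == COVER_NEED);
        puts("ok");
        return 0;
}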
*/ -static void +static bool xlog_state_do_iclog_callbacks( - struct xlog *log, - struct xlog_in_core *iclog, - bool aborted) + struct xlog *log) __releases(&log->l_icloglock) __acquires(&log->l_icloglock) { - spin_unlock(&log->l_icloglock); - spin_lock(&iclog->ic_callback_lock); - while (!list_empty(&iclog->ic_callbacks)) { - LIST_HEAD(tmp); + struct xlog_in_core *first_iclog = log->l_iclog; + struct xlog_in_core *iclog = first_iclog; + bool ran_callback = false; - list_splice_init(&iclog->ic_callbacks, &tmp); + do { + LIST_HEAD(cb_list); - spin_unlock(&iclog->ic_callback_lock); - xlog_cil_process_committed(&tmp, aborted); - spin_lock(&iclog->ic_callback_lock); - } + if (xlog_state_iodone_process_iclog(log, iclog)) + break; + if (iclog->ic_state != XLOG_STATE_CALLBACK) { + iclog = iclog->ic_next; + continue; + } + list_splice_init(&iclog->ic_callbacks, &cb_list); + spin_unlock(&log->l_icloglock); - /* - * Pick up the icloglock while still holding the callback lock so we - * serialise against anyone trying to add more callbacks to this iclog - * now we've finished processing. - */ - spin_lock(&log->l_icloglock); - spin_unlock(&iclog->ic_callback_lock); + trace_xlog_iclog_callbacks_start(iclog, _RET_IP_); + xlog_cil_process_committed(&cb_list); + trace_xlog_iclog_callbacks_done(iclog, _RET_IP_); + ran_callback = true; + + spin_lock(&log->l_icloglock); + xlog_state_clean_iclog(log, iclog); + iclog = iclog->ic_next; + } while (iclog != first_iclog); + + return ran_callback; } + +/* + * Loop running iclog completion callbacks until there are no more iclogs in a + * state that can run callbacks. + */ STATIC void xlog_state_do_callback( - struct xlog *log, - bool aborted) + struct xlog *log) { - struct xlog_in_core *iclog; - struct xlog_in_core *first_iclog; - bool cycled_icloglock; - bool ioerror; int flushcnt = 0; int repeats = 0; spin_lock(&log->l_icloglock); - do { - /* - * Scan all iclogs starting with the one pointed to by the - * log. Reset this starting point each time the log is - * unlocked (during callbacks). - * - * Keep looping through iclogs until one full pass is made - * without running any callbacks. - */ - first_iclog = log->l_iclog; - iclog = log->l_iclog; - cycled_icloglock = false; - ioerror = false; - repeats++; - - do { - if (xlog_state_iodone_process_iclog(log, iclog, - &ioerror)) - break; - - if (iclog->ic_state != XLOG_STATE_CALLBACK && - iclog->ic_state != XLOG_STATE_IOERROR) { - iclog = iclog->ic_next; - continue; - } - - /* - * Running callbacks will drop the icloglock which means - * we'll have to run at least one more complete loop. - */ - cycled_icloglock = true; - xlog_state_do_iclog_callbacks(log, iclog, aborted); - - xlog_state_clean_iclog(log, iclog); - iclog = iclog->ic_next; - } while (first_iclog != iclog); + while (xlog_state_do_iclog_callbacks(log)) { + if (xlog_is_shutdown(log)) + break; - if (repeats > 5000) { + if (++repeats > 5000) { flushcnt += repeats; repeats = 0; xfs_warn(log->l_mp, "%s: possible infinite loop (%d iterations)", __func__, flushcnt); } - } while (!ioerror && cycled_icloglock); + } - if (log->l_iclog->ic_state == XLOG_STATE_ACTIVE || - log->l_iclog->ic_state == XLOG_STATE_IOERROR) + if (log->l_iclog->ic_state == XLOG_STATE_ACTIVE) wake_up_all(&log->l_flush_wait); spin_unlock(&log->l_icloglock); @@ -2904,37 +2857,28 @@ xlog_state_do_callback( /* * Finish transitioning this iclog to the dirty state. * - * Make sure that we completely execute this routine only when this is - * the last call to the iclog. 
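[Editorial aside: the outer loop shape used by xlog_state_do_callback(), keep making full passes until a pass runs no callbacks, can be modeled in a few lines. The fixed array stands in for the circular iclog ring and plain counters stand in for the callback lists; no locking is modeled.]

#include <stdbool.h>
#include <stdio.h>

#define NR_ICLOGS 8

static int pending[NR_ICLOGS] = { 3, 0, 1, 0, 0, 2, 0, 0 };

/* one pass over the "ring"; returns true if any callback ran */
static bool run_ring_callbacks(void)
{
        bool ran = false;
        int i;

        for (i = 0; i < NR_ICLOGS; i++) {
                while (pending[i]) {    /* run and retire callbacks */
                        pending[i]--;
                        ran = true;
                }
        }
        return ran;
}

int main(void)
{
        int passes = 0;

        while (run_ring_callbacks())
                passes++;
        printf("quiesced after %d pass(es)\n", passes);
        return 0;
}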
There is a good chance that iclog flushes, - * when we reach the end of the physical log, get turned into 2 separate - * calls to bwrite. Hence, one iclog flush could generate two calls to this - * routine. By using the reference count bwritecnt, we guarantee that only - * the second completion goes through. - * * Callbacks could take time, so they are done outside the scope of the * global state machine log lock. */ STATIC void xlog_state_done_syncing( - struct xlog_in_core *iclog, - bool aborted) + struct xlog_in_core *iclog) { struct xlog *log = iclog->ic_log; spin_lock(&log->l_icloglock); - ASSERT(atomic_read(&iclog->ic_refcnt) == 0); + trace_xlog_iclog_sync_done(iclog, _RET_IP_); /* * If we got an error, either on the first buffer, or in the case of - * split log writes, on the second, we mark ALL iclogs STATE_IOERROR, - * and none should ever be attempted to be written to disk - * again. + * split log writes, on the second, we shut down the file system and + * no iclogs should ever be attempted to be written to disk again. */ - if (iclog->ic_state == XLOG_STATE_SYNCING) + if (!xlog_is_shutdown(log)) { + ASSERT(iclog->ic_state == XLOG_STATE_SYNCING); iclog->ic_state = XLOG_STATE_DONE_SYNC; - else - ASSERT(iclog->ic_state == XLOG_STATE_IOERROR); + } /* * Someone could be sleeping prior to writing out the next @@ -2943,9 +2887,8 @@ xlog_state_done_syncing( */ wake_up_all(&iclog->ic_write_wait); spin_unlock(&log->l_icloglock); - xlog_state_do_callback(log, aborted); /* also cleans log */ -} /* xlog_state_done_syncing */ - + xlog_state_do_callback(log); +} /* * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must @@ -2971,7 +2914,6 @@ xlog_state_get_iclog_space( int len, struct xlog_in_core **iclogp, struct xlog_ticket *ticket, - int *continued_write, int *logoffsetp) { int log_offset; @@ -2980,7 +2922,7 @@ xlog_state_get_iclog_space( restart: spin_lock(&log->l_icloglock); - if (XLOG_FORCED_SHUTDOWN(log)) { + if (xlog_is_shutdown(log)) { spin_unlock(&log->l_icloglock); return -EIO; } @@ -2999,6 +2941,8 @@ restart: atomic_inc(&iclog->ic_refcnt); /* prevents sync */ log_offset = iclog->ic_offset; + trace_xlog_iclog_get_space(iclog, _RET_IP_); + /* On the 1st write to an iclog, figure out lsn. This works * if iclogs marked XLOG_STATE_WANT_SYNC always write out what they are * committing to. If the offset is set, that's how many blocks @@ -3006,9 +2950,6 @@ restart: */ if (log_offset == 0) { ticket->t_curr_res -= log->l_iclog_hsize; - xlog_tic_add_region(ticket, - log->l_iclog_hsize, - XLOG_REG_TYPE_LRHEADER); head->h_cycle = cpu_to_be32(log->l_curr_cycle); head->h_lsn = cpu_to_be64( xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block)); @@ -3037,7 +2978,7 @@ restart: * reference to the iclog. */ if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) - error = xlog_state_release_iclog(log, iclog); + error = xlog_state_release_iclog(log, iclog, ticket); spin_unlock(&log->l_icloglock); if (error) return error; @@ -3050,13 +2991,10 @@ restart: * iclogs (to mark it taken), this particular iclog will release/sync * to disk in xlog_write(). 
 */
-	if (len <= iclog->ic_size - iclog->ic_offset) {
-		*continued_write = 0;
+	if (len <= iclog->ic_size - iclog->ic_offset)
 		iclog->ic_offset += len;
-	} else {
-		*continued_write = 1;
+	else
 		xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
-	}
 	*iclogp = iclog;

 	ASSERT(iclog->ic_offset <= iclog->ic_size);
@@ -3064,21 +3002,21 @@ restart:

 	*logoffsetp = log_offset;
 	return 0;
-}	/* xlog_state_get_iclog_space */
-
-/* The first cnt-1 times through here we don't need to
- * move the grant write head because the permanent
- * reservation has reserved cnt times the unit amount.
- * Release part of current permanent unit reservation and
- * reset current reservation to be one units worth. Also
- * move grant reservation head forward.
+}
+
+/*
+ * The first cnt-1 times a ticket goes through here, we don't need to move the
+ * grant write head because the permanent reservation has reserved cnt times the
+ * unit amount. Release part of current permanent unit reservation and reset
+ * current reservation to be one unit's worth. Also move grant reservation head
+ * forward.
 */
-STATIC void
-xlog_regrant_reserve_log_space(
+void
+xfs_log_ticket_regrant(
 	struct xlog *log,
 	struct xlog_ticket *ticket)
 {
-	trace_xfs_log_regrant_reserve_enter(log, ticket);
+	trace_xfs_log_ticket_regrant(log, ticket);

 	if (ticket->t_cnt > 0)
 		ticket->t_cnt--;
@@ -3088,23 +3026,20 @@ xlog_regrant_reserve_log_space(
 	xlog_grant_sub_space(log, &log->l_write_head.grant,
 					ticket->t_curr_res);
 	ticket->t_curr_res = ticket->t_unit_res;
-	xlog_tic_reset_res(ticket);
-	trace_xfs_log_regrant_reserve_sub(log, ticket);
+	trace_xfs_log_ticket_regrant_sub(log, ticket);

 	/* just return if we still have some of the pre-reserved space */
-	if (ticket->t_cnt > 0)
-		return;
-
-	xlog_grant_add_space(log, &log->l_reserve_head.grant,
-			     ticket->t_unit_res);
+	if (!ticket->t_cnt) {
+		xlog_grant_add_space(log, &log->l_reserve_head.grant,
+				     ticket->t_unit_res);
+		trace_xfs_log_ticket_regrant_exit(log, ticket);

-	trace_xfs_log_regrant_reserve_exit(log, ticket);
-
-	ticket->t_curr_res = ticket->t_unit_res;
-	xlog_tic_reset_res(ticket);
-}	/* xlog_regrant_reserve_log_space */
+		ticket->t_curr_res = ticket->t_unit_res;
+	}
+
+	xfs_log_ticket_put(ticket);
+}

 /*
  * Give back the space left from a reservation.
@@ -3120,18 +3055,19 @@ xlog_regrant_reserve_log_space(
  * space, the count will stay at zero and the only space remaining will be
  * in the current reservation field.
  */
-STATIC void
-xlog_ungrant_log_space(
+void
+xfs_log_ticket_ungrant(
 	struct xlog *log,
 	struct xlog_ticket *ticket)
 {
-	int	bytes;
+	int	bytes;
+
+	trace_xfs_log_ticket_ungrant(log, ticket);

 	if (ticket->t_cnt > 0)
 		ticket->t_cnt--;

-	trace_xfs_log_ungrant_enter(log, ticket);
-	trace_xfs_log_ungrant_sub(log, ticket);
+	trace_xfs_log_ticket_ungrant_sub(log, ticket);

 	/*
 	 * If this is a permanent reservation ticket, we may be able to free
@@ -3146,25 +3082,26 @@ xlog_ungrant_log_space(

 	xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes);
 	xlog_grant_sub_space(log, &log->l_write_head.grant, bytes);

-	trace_xfs_log_ungrant_exit(log, ticket);
+	trace_xfs_log_ticket_ungrant_exit(log, ticket);

 	xfs_log_space_wake(log->l_mp);
+	xfs_log_ticket_put(ticket);
 }

 /*
- * This routine will mark the current iclog in the ring as WANT_SYNC
- * and move the current iclog pointer to the next iclog in the ring.
- * When this routine is called from xlog_state_get_iclog_space(), the
- * exact size of the iclog has not yet been determined. All we know is
- * that every data block.
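[Editorial aside: the byte count handed back by xfs_log_ticket_ungrant() is worth a worked example. This sketch mirrors the t_cnt/t_curr_res/t_unit_res arithmetic above; the field names are copied from the patch but the struct is a simplified stand-in, not the kernel's xlog_ticket.]

#include <stdbool.h>
#include <stdio.h>

struct ticket {                 /* simplified stand-in for xlog_ticket */
        int     t_curr_res;     /* unused current reservation, bytes */
        int     t_unit_res;     /* bytes per reservation unit */
        int     t_cnt;          /* reservation units still held */
        bool    permanent;      /* XLOG_TIC_PERM_RESERV stand-in */
};

static int ungrant_bytes(struct ticket *tic)
{
        int bytes;

        if (tic->t_cnt > 0)
                tic->t_cnt--;

        bytes = tic->t_curr_res;
        if (tic->permanent && tic->t_cnt > 0)
                bytes += tic->t_unit_res * tic->t_cnt;

        return bytes;
}

int main(void)
{
        struct ticket tic = {
                .t_curr_res = 1000,
                .t_unit_res = 4096,
                .t_cnt = 3,
                .permanent = true,
        };

        /* 1000 residual bytes + 2 remaining units * 4096 = 9192 bytes */
        printf("%d bytes returned to the grant heads\n", ungrant_bytes(&tic));
        return 0;
}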
We have run out of space in this log record. + * This routine will mark the current iclog in the ring as WANT_SYNC and move + * the current iclog pointer to the next iclog in the ring. */ -STATIC void +void xlog_state_switch_iclogs( struct xlog *log, struct xlog_in_core *iclog, int eventual_size) { ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); + assert_spin_locked(&log->l_icloglock); + trace_xlog_iclog_switch(iclog, _RET_IP_); + if (!eventual_size) eventual_size = iclog->ic_offset; iclog->ic_state = XLOG_STATE_WANT_SYNC; @@ -3176,9 +3113,8 @@ xlog_state_switch_iclogs( log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize); /* Round up to next log-sunit */ - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && - log->l_mp->m_sb.sb_logsunit > 1) { - uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit); + if (log->l_iclog_roundoff > BBSIZE) { + uint32_t sunit_bb = BTOBB(log->l_iclog_roundoff); log->l_curr_block = roundup(log->l_curr_block, sunit_bb); } @@ -3199,7 +3135,36 @@ xlog_state_switch_iclogs( } ASSERT(iclog == log->l_iclog); log->l_iclog = iclog->ic_next; -} /* xlog_state_switch_iclogs */ +} + +/* + * Force the iclog to disk and check if the iclog has been completed before + * xlog_force_iclog() returns. This can happen on synchronous (e.g. + * pmem) or fast async storage because we drop the icloglock to issue the IO. + * If completion has already occurred, tell the caller so that it can avoid an + * unnecessary wait on the iclog. + */ +static int +xlog_force_and_check_iclog( + struct xlog_in_core *iclog, + bool *completed) +{ + xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header.h_lsn); + int error; + + *completed = false; + error = xlog_force_iclog(iclog); + if (error) + return error; + + /* + * If the iclog has already been completed and reused the header LSN + * will have been rewritten by completion + */ + if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) + *completed = true; + return 0; +} /* * Write out all data in the in-core log as of this exact moment in time. @@ -3235,7 +3200,6 @@ xfs_log_force( { struct xlog *log = mp->m_log; struct xlog_in_core *iclog; - xfs_lsn_t lsn; XFS_STATS_INC(mp, xs_log_force); trace_xfs_log_force(mp, 0, _RET_IP_); @@ -3243,10 +3207,12 @@ xfs_log_force( xlog_cil_force(log); spin_lock(&log->l_icloglock); - iclog = log->l_iclog; - if (iclog->ic_state == XLOG_STATE_IOERROR) + if (xlog_is_shutdown(log)) goto out_error; + iclog = log->l_iclog; + trace_xlog_iclog_force(iclog, _RET_IP_); + if (iclog->ic_state == XLOG_STATE_DIRTY || (iclog->ic_state == XLOG_STATE_ACTIVE && atomic_read(&iclog->ic_refcnt) == 0 && iclog->ic_offset == 0)) { @@ -3259,56 +3225,37 @@ xfs_log_force( * previous iclog and go to sleep. */ iclog = iclog->ic_prev; - if (iclog->ic_state == XLOG_STATE_ACTIVE || - iclog->ic_state == XLOG_STATE_DIRTY) - goto out_unlock; } else if (iclog->ic_state == XLOG_STATE_ACTIVE) { if (atomic_read(&iclog->ic_refcnt) == 0) { - /* - * We are the only one with access to this iclog. - * - * Flush it out now. There should be a roundoff of zero - * to show that someone has already taken care of the - * roundoff from the previous sync. - */ - atomic_inc(&iclog->ic_refcnt); - lsn = be64_to_cpu(iclog->ic_header.h_lsn); - xlog_state_switch_iclogs(log, iclog, 0); - if (xlog_state_release_iclog(log, iclog)) + /* We have exclusive access to this iclog. 
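[Editorial aside: a worked example of the stripe-unit rounding in xlog_state_switch_iclogs() above. BTOBB() converts bytes to 512-byte basic blocks and the current block is rounded up to the next stripe-unit boundary; the macros are re-derived for userspace and the values are illustrative.]

#include <stdint.h>
#include <stdio.h>

#define BBSHIFT         9
#define BBSIZE          (1 << BBSHIFT)
#define BTOBB(bytes)    (((bytes) + BBSIZE - 1) >> BBSHIFT)

static uint32_t round_up_to(uint32_t x, uint32_t step)
{
        return ((x + step - 1) / step) * step;
}

int main(void)
{
        uint32_t iclog_roundoff = 32768;        /* assumed 32k log sunit */
        uint32_t curr_block = 1000;

        if (iclog_roundoff > BBSIZE) {
                uint32_t sunit_bb = BTOBB(iclog_roundoff);      /* 64 BBs */

                curr_block = round_up_to(curr_block, sunit_bb); /* -> 1024 */
        }
        printf("curr_block rounded to %u\n", curr_block);
        return 0;
}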
*/ + bool completed; + + if (xlog_force_and_check_iclog(iclog, &completed)) goto out_error; - if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn || - iclog->ic_state == XLOG_STATE_DIRTY) + if (completed) goto out_unlock; } else { /* - * Someone else is writing to this iclog. - * - * Use its call to flush out the data. However, the - * other thread may not force out this LR, so we mark - * it WANT_SYNC. + * Someone else is still writing to this iclog, so we + * need to ensure that when they release the iclog it + * gets synced immediately as we may be waiting on it. */ xlog_state_switch_iclogs(log, iclog, 0); } - } else { - /* - * If the head iclog is not active nor dirty, we just attach - * ourselves to the head and go to sleep if necessary. - */ - ; } - if (!(flags & XFS_LOG_SYNC)) - goto out_unlock; - - if (iclog->ic_state == XLOG_STATE_IOERROR) - goto out_error; - XFS_STATS_INC(mp, xs_log_force_sleep); - xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); - if (iclog->ic_state == XLOG_STATE_IOERROR) - return -EIO; - return 0; + /* + * The iclog we are about to wait on may contain the checkpoint pushed + * by the above xlog_cil_force() call, but it may not have been pushed + * to disk yet. Like the ACTIVE case above, we need to make sure caches + * are flushed when this iclog is written. + */ + if (iclog->ic_state == XLOG_STATE_WANT_SYNC) + iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA; + if (flags & XFS_LOG_SYNC) + return xlog_wait_on_iclog(iclog); out_unlock: spin_unlock(&log->l_icloglock); return 0; @@ -3317,32 +3264,45 @@ out_error: return -EIO; } +/* + * Force the log to a specific LSN. + * + * If an iclog with that lsn can be found: + * If it is in the DIRTY state, just return. + * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC + * state and go to sleep or return. + * If it is in any other state, go to sleep or return. + * + * Synchronous forces are implemented with a wait queue. All callers trying + * to force a given lsn to disk must wait on the queue attached to the + * specific in-core log. When given in-core log finally completes its write + * to disk, that thread will wake up all threads waiting on the queue. + */ static int -__xfs_log_force_lsn( - struct xfs_mount *mp, +xlog_force_lsn( + struct xlog *log, xfs_lsn_t lsn, uint flags, int *log_flushed, bool already_slept) { - struct xlog *log = mp->m_log; struct xlog_in_core *iclog; + bool completed; spin_lock(&log->l_icloglock); - iclog = log->l_iclog; - if (iclog->ic_state == XLOG_STATE_IOERROR) + if (xlog_is_shutdown(log)) goto out_error; + iclog = log->l_iclog; while (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) { + trace_xlog_iclog_force_lsn(iclog, _RET_IP_); iclog = iclog->ic_next; if (iclog == log->l_iclog) goto out_unlock; } - if (iclog->ic_state == XLOG_STATE_DIRTY) - goto out_unlock; - - if (iclog->ic_state == XLOG_STATE_ACTIVE) { + switch (iclog->ic_state) { + case XLOG_STATE_ACTIVE: /* * We sleep here if we haven't already slept (e.g. 
this is the * first time we've looked at the correct iclog buf) and the @@ -3361,34 +3321,39 @@ __xfs_log_force_lsn( if (!already_slept && (iclog->ic_prev->ic_state == XLOG_STATE_WANT_SYNC || iclog->ic_prev->ic_state == XLOG_STATE_SYNCING)) { - XFS_STATS_INC(mp, xs_log_force_sleep); - xlog_wait(&iclog->ic_prev->ic_write_wait, &log->l_icloglock); return -EAGAIN; } - atomic_inc(&iclog->ic_refcnt); - xlog_state_switch_iclogs(log, iclog, 0); - if (xlog_state_release_iclog(log, iclog)) + if (xlog_force_and_check_iclog(iclog, &completed)) goto out_error; if (log_flushed) *log_flushed = 1; + if (completed) + goto out_unlock; + break; + case XLOG_STATE_WANT_SYNC: + /* + * This iclog may contain the checkpoint pushed by the + * xlog_cil_force_seq() call, but there are other writers still + * accessing it so it hasn't been pushed to disk yet. Like the + * ACTIVE case above, we need to make sure caches are flushed + * when this iclog is written. + */ + iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA; + break; + default: + /* + * The entire checkpoint was written by the CIL force and is on + * its way to disk already. It will be stable when it + * completes, so we don't need to manipulate caches here at all. + * We just need to wait for completion if necessary. + */ + break; } - if (!(flags & XFS_LOG_SYNC) || - (iclog->ic_state == XLOG_STATE_ACTIVE || - iclog->ic_state == XLOG_STATE_DIRTY)) - goto out_unlock; - - if (iclog->ic_state == XLOG_STATE_IOERROR) - goto out_error; - - XFS_STATS_INC(mp, xs_log_force_sleep); - xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); - if (iclog->ic_state == XLOG_STATE_IOERROR) - return -EIO; - return 0; - + if (flags & XFS_LOG_SYNC) + return xlog_wait_on_iclog(iclog); out_unlock: spin_unlock(&log->l_icloglock); return 0; @@ -3398,70 +3363,42 @@ out_error: } /* - * Force the in-core log to disk for a specific LSN. - * - * Find in-core log with lsn. - * If it is in the DIRTY state, just return. - * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC - * state and go to sleep or return. - * If it is in any other state, go to sleep or return. + * Force the log to a specific checkpoint sequence. * - * Synchronous forces are implemented with a wait queue. All callers trying - * to force a given lsn to disk must wait on the queue attached to the - * specific in-core log. When given in-core log finally completes its write - * to disk, that thread will wake up all threads waiting on the queue. + * First force the CIL so that all the required changes have been flushed to the + * iclogs. If the CIL force completed it will return a commit LSN that indicates + * the iclog that needs to be flushed to stable storage. If the caller needs + * a synchronous log force, we will wait on the iclog with the LSN returned by + * xlog_cil_force_seq() to be completed. 
*/ int -xfs_log_force_lsn( +xfs_log_force_seq( struct xfs_mount *mp, - xfs_lsn_t lsn, + xfs_csn_t seq, uint flags, int *log_flushed) { + struct xlog *log = mp->m_log; + xfs_lsn_t lsn; int ret; - ASSERT(lsn != 0); + ASSERT(seq != 0); XFS_STATS_INC(mp, xs_log_force); - trace_xfs_log_force(mp, lsn, _RET_IP_); + trace_xfs_log_force(mp, seq, _RET_IP_); - lsn = xlog_cil_force_lsn(mp->m_log, lsn); + lsn = xlog_cil_force_seq(log, seq); if (lsn == NULLCOMMITLSN) return 0; - ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, false); - if (ret == -EAGAIN) - ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, true); + ret = xlog_force_lsn(log, lsn, flags, log_flushed, false); + if (ret == -EAGAIN) { + XFS_STATS_INC(mp, xs_log_force_sleep); + ret = xlog_force_lsn(log, lsn, flags, log_flushed, true); + } return ret; } /* - * Called when we want to mark the current iclog as being ready to sync to - * disk. - */ -STATIC void -xlog_state_want_sync( - struct xlog *log, - struct xlog_in_core *iclog) -{ - assert_spin_locked(&log->l_icloglock); - - if (iclog->ic_state == XLOG_STATE_ACTIVE) { - xlog_state_switch_iclogs(log, iclog, 0); - } else { - ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || - iclog->ic_state == XLOG_STATE_IOERROR); - } -} - - -/***************************************************************************** - * - * TICKET functions - * - ***************************************************************************** - */ - -/* * Free a used ticket when its refcount falls to zero. */ void @@ -3470,7 +3407,7 @@ xfs_log_ticket_put( { ASSERT(atomic_read(&ticket->t_ref) > 0); if (atomic_dec_and_test(&ticket->t_ref)) - kmem_cache_free(xfs_log_ticket_zone, ticket); + kmem_cache_free(xfs_log_ticket_cache, ticket); } xlog_ticket_t * @@ -3486,12 +3423,12 @@ xfs_log_ticket_get( * Figure out the total log space unit (in bytes) that would be * required for a log ticket. */ -int -xfs_log_calc_unit_res( - struct xfs_mount *mp, - int unit_bytes) +static int +xlog_calc_unit_res( + struct xlog *log, + int unit_bytes, + int *niclogs) { - struct xlog *log = mp->m_log; int iclog_space; uint num_headers; @@ -3567,18 +3504,22 @@ xfs_log_calc_unit_res( /* for commit-rec LR header - note: padding will subsume the ophdr */ unit_bytes += log->l_iclog_hsize; - /* for roundoff padding for transaction data and one for commit record */ - if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) { - /* log su roundoff */ - unit_bytes += 2 * mp->m_sb.sb_logsunit; - } else { - /* BB roundoff */ - unit_bytes += 2 * BBSIZE; - } + /* roundoff padding for transaction data and one for commit record */ + unit_bytes += 2 * log->l_iclog_roundoff; + if (niclogs) + *niclogs = num_headers; return unit_bytes; } +int +xfs_log_calc_unit_res( + struct xfs_mount *mp, + int unit_bytes) +{ + return xlog_calc_unit_res(mp->m_log, unit_bytes, NULL); +} + /* * Allocate and initialise a new log ticket. 
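[Editorial aside: the -EAGAIN retry pattern used by xfs_log_force_seq() above deserves a sketch: the first attempt may sleep waiting on the previous iclog and asks to be retried with already_slept set, so the second attempt never sleeps there again. The stub below only models the control flow; its names and behaviour are illustrative assumptions.]

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* stub standing in for xlog_force_lsn(); does no real work */
static int force_lsn(unsigned long long lsn, bool already_slept)
{
        if (!already_slept)
                return -EAGAIN; /* pretend we slept on the previous iclog */
        printf("forced lsn 0x%llx\n", lsn);
        return 0;
}

int main(void)
{
        unsigned long long lsn = 0x100000010ULL;
        int ret;

        ret = force_lsn(lsn, false);
        if (ret == -EAGAIN)
                ret = force_lsn(lsn, true);     /* retry, won't sleep again */
        return ret;
}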
*/ @@ -3587,18 +3528,14 @@ xlog_ticket_alloc( struct xlog *log, int unit_bytes, int cnt, - char client, - bool permanent, - xfs_km_flags_t alloc_flags) + bool permanent) { struct xlog_ticket *tic; int unit_res; - tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags); - if (!tic) - return NULL; + tic = kmem_cache_zalloc(xfs_log_ticket_cache, GFP_NOFS | __GFP_NOFAIL); - unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes); + unit_res = xlog_calc_unit_res(log, unit_bytes, &tic->t_iclog_hdrs); atomic_set(&tic->t_ref, 1); tic->t_task = current; @@ -3607,49 +3544,15 @@ xlog_ticket_alloc( tic->t_curr_res = unit_res; tic->t_cnt = cnt; tic->t_ocnt = cnt; - tic->t_tid = prandom_u32(); - tic->t_clientid = client; - tic->t_flags = XLOG_TIC_INITED; + tic->t_tid = get_random_u32(); if (permanent) tic->t_flags |= XLOG_TIC_PERM_RESERV; - xlog_tic_reset_res(tic); - return tic; } - -/****************************************************************************** - * - * Log debug routines - * - ****************************************************************************** - */ #if defined(DEBUG) /* - * Make sure that the destination ptr is within the valid data region of - * one of the iclogs. This uses backup pointers stored in a different - * part of the log in case we trash the log structure. - */ -STATIC void -xlog_verify_dest_ptr( - struct xlog *log, - void *ptr) -{ - int i; - int good_ptr = 0; - - for (i = 0; i < log->l_iclog_bufs; i++) { - if (ptr >= log->l_iclog_bak[i] && - ptr <= log->l_iclog_bak[i] + log->l_iclog_size) - good_ptr++; - } - - if (!good_ptr) - xfs_emerg(log->l_mp, "%s: invalid ptr", __func__); -} - -/* * Check to make sure the grant write head didn't just over lap the tail. If * the cycles are the same, we can't be overlapping. Otherwise, make sure that * the cycles differ by exactly one and check the byte count. @@ -3671,17 +3574,15 @@ xlog_verify_grant_tail( xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks); if (tail_cycle != cycle) { if (cycle - 1 != tail_cycle && - !(log->l_flags & XLOG_TAIL_WARN)) { + !test_and_set_bit(XLOG_TAIL_WARN, &log->l_opstate)) { xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, "%s: cycle - 1 != tail_cycle", __func__); - log->l_flags |= XLOG_TAIL_WARN; } if (space > BBTOB(tail_blocks) && - !(log->l_flags & XLOG_TAIL_WARN)) { + !test_and_set_bit(XLOG_TAIL_WARN, &log->l_opstate)) { xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, "%s: space > BBTOB(tail_blocks)", __func__); - log->l_flags |= XLOG_TAIL_WARN; } } } @@ -3690,10 +3591,10 @@ xlog_verify_grant_tail( STATIC void xlog_verify_tail_lsn( struct xlog *log, - struct xlog_in_core *iclog, - xfs_lsn_t tail_lsn) + struct xlog_in_core *iclog) { - int blocks; + xfs_lsn_t tail_lsn = be64_to_cpu(iclog->ic_header.h_tail_lsn); + int blocks; if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) { blocks = @@ -3710,7 +3611,7 @@ xlog_verify_tail_lsn( if (blocks < BTOBB(iclog->ic_offset) + 1) xfs_emerg(log->l_mp, "%s: ran out of log space", __func__); } -} /* xlog_verify_tail_lsn */ +} /* * Perform a number of checks on the iclog before writing to disk. 
@@ -3778,7 +3679,7 @@ xlog_verify_iclog( if (field_offset & 0x1ff) { clientid = ophead->oh_clientid; } else { - idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap); + idx = BTOBBT((void *)&ophead->oh_clientid - iclog->ic_datap); if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); @@ -3789,11 +3690,12 @@ xlog_verify_iclog( iclog->ic_header.h_cycle_data[idx]); } } - if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) + if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) { xfs_warn(log->l_mp, - "%s: invalid clientid %d op "PTR_FMT" offset 0x%lx", - __func__, clientid, ophead, + "%s: op %d invalid clientid %d op "PTR_FMT" offset 0x%lx", + __func__, i, clientid, ophead, (unsigned long)field_offset); + } /* check length */ p = &ophead->oh_len; @@ -3801,8 +3703,7 @@ xlog_verify_iclog( if (field_offset & 0x1ff) { op_len = be32_to_cpu(ophead->oh_len); } else { - idx = BTOBBT((uintptr_t)&ophead->oh_len - - (uintptr_t)iclog->ic_datap); + idx = BTOBBT((void *)&ophead->oh_len - iclog->ic_datap); if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); @@ -3813,110 +3714,87 @@ xlog_verify_iclog( } ptr += sizeof(xlog_op_header_t) + op_len; } -} /* xlog_verify_iclog */ -#endif - -/* - * Mark all iclogs IOERROR. l_icloglock is held by the caller. - */ -STATIC int -xlog_state_ioerror( - struct xlog *log) -{ - xlog_in_core_t *iclog, *ic; - - iclog = log->l_iclog; - if (iclog->ic_state != XLOG_STATE_IOERROR) { - /* - * Mark all the incore logs IOERROR. - * From now on, no log flushes will result. - */ - ic = iclog; - do { - ic->ic_state = XLOG_STATE_IOERROR; - ic = ic->ic_next; - } while (ic != iclog); - return 0; - } - /* - * Return non-zero, if state transition has already happened. - */ - return 1; } +#endif /* - * This is called from xfs_force_shutdown, when we're forcibly - * shutting down the filesystem, typically because of an IO error. + * Perform a forced shutdown on the log. + * + * This can be called from low level log code to trigger a shutdown, or from the + * high level mount shutdown code when the mount shuts down. + * * Our main objectives here are to make sure that: - * a. if !logerror, flush the logs to disk. Anything modified - * after this is ignored. - * b. the filesystem gets marked 'SHUTDOWN' for all interested - * parties to find out, 'atomically'. - * c. those who're sleeping on log reservations, pinned objects and - * other resources get woken up, and be told the bad news. - * d. nothing new gets queued up after (b) and (c) are done. + * a. if the shutdown was not due to a log IO error, flush the logs to + * disk. Anything modified after this is ignored. + * b. the log gets atomically marked 'XLOG_IO_ERROR' for all interested + * parties to find out. Nothing new gets queued after this is done. + * c. Tasks sleeping on log reservations, pinned objects and + * other resources get woken up. + * d. The mount is also marked as shut down so that log triggered shutdowns + * still behave the same as if they called xfs_forced_shutdown(). * - * Note: for the !logerror case we need to flush the regions held in memory out - * to disk first. This needs to be done before the log is marked as shutdown, - * otherwise the iclog writes will fail. + * Return true if the shutdown cause was a log IO error and we actually shut the + * log down. 
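[Editorial aside: the shutdown path that follows relies on test_and_set_bit() making the operation idempotent: only the first caller sees the bit clear and performs the shutdown work, later callers bail out. A userspace model of that latch using C11 atomics; the flag name and return convention are assumptions, not the kernel API.]

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_flag log_shutdown = ATOMIC_FLAG_INIT;

/* returns true only for the caller that actually performs the shutdown */
static bool force_shutdown(unsigned int flags)
{
        if (atomic_flag_test_and_set(&log_shutdown)) {
                /* someone else already shut the log down */
                return false;
        }
        printf("shutting down, flags 0x%x\n", flags);
        return true;
}

int main(void)
{
        printf("first:  %d\n", force_shutdown(0x2));    /* 1: we did it */
        printf("second: %d\n", force_shutdown(0x2));    /* 0: already done */
        return 0;
}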
 */
-int
-xfs_log_force_umount(
-	struct xfs_mount *mp,
-	int logerror)
+bool
+xlog_force_shutdown(
+	struct xlog *log,
+	uint32_t shutdown_flags)
 {
-	struct xlog	*log;
-	int		retval;
-
-	log = mp->m_log;
-
-	/*
-	 * If this happens during log recovery, don't worry about
-	 * locking; the log isn't open for business yet.
-	 */
-	if (!log ||
-	    log->l_flags & XLOG_ACTIVE_RECOVERY) {
-		mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
-		if (mp->m_sb_bp)
-			mp->m_sb_bp->b_flags |= XBF_DONE;
-		return 0;
-	}
+	bool		log_error = (shutdown_flags & SHUTDOWN_LOG_IO_ERROR);

-	/*
-	 * Somebody could've already done the hard work for us.
-	 * No need to get locks for this.
-	 */
-	if (logerror && log->l_iclog->ic_state == XLOG_STATE_IOERROR) {
-		ASSERT(XLOG_FORCED_SHUTDOWN(log));
-		return 1;
-	}
+	if (!log)
+		return false;

 	/*
 	 * Flush all the completed transactions to disk before marking the log
-	 * being shut down. We need to do it in this order to ensure that
-	 * completed operations are safely on disk before we shut down, and that
-	 * we don't have to issue any buffer IO after the shutdown flags are set
-	 * to guarantee this.
+	 * being shut down. We need to do this first as shutting down the log
+	 * before the force will prevent the log force from flushing the iclogs
+	 * to disk.
+	 *
+	 * When we are in recovery, there are no transactions to flush, and
+	 * we don't want to touch the log because we don't want to perturb the
+	 * current head/tail for future recovery attempts. Hence we need to
+	 * avoid a log force in this case.
+	 *
+	 * If we are shutting down due to a log IO error, then we must avoid
+	 * trying to write the log as that may just result in more IO errors and
+	 * an endless shutdown/force loop.
 	 */
-	if (!logerror)
-		xfs_log_force(mp, XFS_LOG_SYNC);
+	if (!log_error && !xlog_in_recovery(log))
+		xfs_log_force(log->l_mp, XFS_LOG_SYNC);

 	/*
-	 * mark the filesystem and the as in a shutdown state and wake
-	 * everybody up to tell them the bad news.
+	 * Atomically set the shutdown state. If the shutdown state is already
+	 * set, then someone else is performing the shutdown and so we are done
+	 * here. This should never happen because we should only ever get called
+	 * once by the first shutdown caller.
+	 *
+	 * Many of the log state machine transitions assume that shutdown state
+	 * cannot change once they hold the log->l_icloglock. Hence we need to
+	 * hold that lock here, even though we use the atomic test_and_set_bit()
+	 * operation to set the shutdown state.
 	 */
 	spin_lock(&log->l_icloglock);
-	mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
-	if (mp->m_sb_bp)
-		mp->m_sb_bp->b_flags |= XBF_DONE;
+	if (test_and_set_bit(XLOG_IO_ERROR, &log->l_opstate)) {
+		spin_unlock(&log->l_icloglock);
+		return false;
+	}
+	spin_unlock(&log->l_icloglock);

 	/*
-	 * Mark the log and the iclogs with IO error flags to prevent any
-	 * further log IO from being issued or completed.
+	 * If this log shutdown also sets the mount shutdown state, issue a
+	 * shutdown warning message.
 	 */
-	log->l_flags |= XLOG_IO_ERROR;
-	retval = xlog_state_ioerror(log);
-	spin_unlock(&log->l_icloglock);
+	if (!test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &log->l_mp->m_opstate)) {
+		xfs_alert_tag(log->l_mp, XFS_PTAG_SHUTDOWN_LOGERROR,
+"Filesystem has been shut down due to log error (0x%x).",
+				shutdown_flags);
+		xfs_alert(log->l_mp,
+"Please unmount the filesystem and rectify the problem(s).");
+		if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
+			xfs_stack_trace();
+	}

 	/*
 	 * We don't want anybody waiting for log reservations after this.
That @@ -3935,12 +3813,16 @@ xfs_log_force_umount( * avoid races. */ spin_lock(&log->l_cilp->xc_push_lock); + wake_up_all(&log->l_cilp->xc_start_wait); wake_up_all(&log->l_cilp->xc_commit_wait); spin_unlock(&log->l_cilp->xc_push_lock); - xlog_state_do_callback(log, true); - /* return non-zero if log IOERROR transition had already happened */ - return retval; + spin_lock(&log->l_icloglock); + xlog_state_shutdown_callbacks(log); + spin_unlock(&log->l_icloglock); + + wake_up_var(&log->l_opstate); + return log_error; } STATIC int @@ -3978,7 +3860,7 @@ xfs_log_check_lsn( * resets the in-core LSN. We can't validate in this mode, but * modifications are not allowed anyways so just return true. */ - if (mp->m_flags & XFS_MOUNT_NORECOVERY) + if (xfs_has_norecovery(mp)) return true; /* @@ -4004,11 +3886,22 @@ xfs_log_check_lsn( return valid; } -bool -xfs_log_in_recovery( - struct xfs_mount *mp) +/* + * Notify the log that we're about to start using a feature that is protected + * by a log incompat feature flag. This will prevent log covering from + * clearing those flags. + */ +void +xlog_use_incompat_feat( + struct xlog *log) { - struct xlog *log = mp->m_log; + down_read(&log->l_incompat_users); +} - return log->l_flags & XLOG_ACTIVE_RECOVERY; +/* Notify the log that we've finished using log incompat features. */ +void +xlog_drop_incompat_feat( + struct xlog *log) +{ + up_read(&log->l_incompat_users); } |
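
[Editorial aside: the xlog_use_incompat_feat()/xlog_drop_incompat_feat() pair above uses a read-write semaphore so that feature users take the lock shared while the code that clears the incompat flags (log covering) would take it exclusive, guaranteeing the flags cannot be cleared while any user is active. A userspace model of that design with a pthread rwlock; the function names here are stand-ins, not the kernel API.]

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t incompat_users = PTHREAD_RWLOCK_INITIALIZER;

/* feature users hold the lock shared, like down_read()/up_read() */
static void use_incompat_feat(void)  { pthread_rwlock_rdlock(&incompat_users); }
static void drop_incompat_feat(void) { pthread_rwlock_unlock(&incompat_users); }

/* the flag-clearing side takes the lock exclusive, like down_write() */
static void clear_incompat_flags(void)
{
        pthread_rwlock_wrlock(&incompat_users);
        puts("no feature users active: safe to clear incompat flags");
        pthread_rwlock_unlock(&incompat_users);
}

int main(void)
{
        use_incompat_feat();
        /* ... operate on structures guarded by the incompat flag ... */
        drop_incompat_feat();

        clear_incompat_flags();
        return 0;
}

[The shared/exclusive split is the design point: many concurrent users proceed without contention, and the clearing path simply cannot run until the last user drops its read hold.]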