diff options
Diffstat (limited to 'drivers/staging/lustre/lustre/osc/osc_cache.c')
-rw-r--r-- | drivers/staging/lustre/lustre/osc/osc_cache.c | 575 |
1 files changed, 453 insertions, 122 deletions
diff --git a/drivers/staging/lustre/lustre/osc/osc_cache.c b/drivers/staging/lustre/lustre/osc/osc_cache.c index 63363111380c..5a14bea961b4 100644 --- a/drivers/staging/lustre/lustre/osc/osc_cache.c +++ b/drivers/staging/lustre/lustre/osc/osc_cache.c @@ -76,6 +76,8 @@ static inline char *ext_flags(struct osc_extent *ext, char *flags) *buf++ = ext->oe_rw ? 'r' : 'w'; if (ext->oe_intree) *buf++ = 'i'; + if (ext->oe_sync) + *buf++ = 'S'; if (ext->oe_srvlock) *buf++ = 's'; if (ext->oe_hp) @@ -121,9 +123,13 @@ static const char *oes_strings[] = { __ext->oe_grants, __ext->oe_nr_pages, \ list_empty_marker(&__ext->oe_pages), \ waitqueue_active(&__ext->oe_waitq) ? '+' : '-', \ - __ext->oe_osclock, __ext->oe_mppr, __ext->oe_owner, \ + __ext->oe_dlmlock, __ext->oe_mppr, __ext->oe_owner, \ /* ----- part 4 ----- */ \ ## __VA_ARGS__); \ + if (lvl == D_ERROR && __ext->oe_dlmlock) \ + LDLM_ERROR(__ext->oe_dlmlock, "extent: %p\n", __ext); \ + else \ + LDLM_DEBUG(__ext->oe_dlmlock, "extent: %p\n", __ext); \ } while (0) #undef EASSERTF @@ -240,20 +246,25 @@ static int osc_extent_sanity_check0(struct osc_extent *ext, goto out; } - if (!ext->oe_osclock && ext->oe_grants > 0) { + if (ext->oe_sync && ext->oe_grants > 0) { rc = 90; goto out; } - if (ext->oe_osclock) { - struct cl_lock_descr *descr; + if (ext->oe_dlmlock) { + struct ldlm_extent *extent; - descr = &ext->oe_osclock->cll_descr; - if (!(descr->cld_start <= ext->oe_start && - descr->cld_end >= ext->oe_max_end)) { + extent = &ext->oe_dlmlock->l_policy_data.l_extent; + if (!(extent->start <= cl_offset(osc2cl(obj), ext->oe_start) && + extent->end >= cl_offset(osc2cl(obj), ext->oe_max_end))) { rc = 100; goto out; } + + if (!(ext->oe_dlmlock->l_granted_mode & (LCK_PW | LCK_GROUP))) { + rc = 102; + goto out; + } } if (ext->oe_nr_pages > ext->oe_mppr) { @@ -276,7 +287,7 @@ static int osc_extent_sanity_check0(struct osc_extent *ext, page_count = 0; list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { - pgoff_t index = oap2cl_page(oap)->cp_index; + pgoff_t index = osc_index(oap2osc(oap)); ++page_count; if (index > ext->oe_end || index < ext->oe_start) { rc = 110; @@ -359,7 +370,7 @@ static struct osc_extent *osc_extent_alloc(struct osc_object *obj) ext->oe_state = OES_INV; INIT_LIST_HEAD(&ext->oe_pages); init_waitqueue_head(&ext->oe_waitq); - ext->oe_osclock = NULL; + ext->oe_dlmlock = NULL; return ext; } @@ -385,9 +396,11 @@ static void osc_extent_put(const struct lu_env *env, struct osc_extent *ext) LASSERT(ext->oe_state == OES_INV); LASSERT(!ext->oe_intree); - if (ext->oe_osclock) { - cl_lock_put(env, ext->oe_osclock); - ext->oe_osclock = NULL; + if (ext->oe_dlmlock) { + lu_ref_add(&ext->oe_dlmlock->l_reference, + "osc_extent", ext); + LDLM_LOCK_PUT(ext->oe_dlmlock); + ext->oe_dlmlock = NULL; } osc_extent_free(ext); } @@ -543,8 +556,8 @@ static int osc_extent_merge(const struct lu_env *env, struct osc_extent *cur, if (cur->oe_max_end != victim->oe_max_end) return -ERANGE; - LASSERT(cur->oe_osclock == victim->oe_osclock); - ppc_bits = osc_cli(obj)->cl_chunkbits - PAGE_CACHE_SHIFT; + LASSERT(cur->oe_dlmlock == victim->oe_dlmlock); + ppc_bits = osc_cli(obj)->cl_chunkbits - PAGE_SHIFT; chunk_start = cur->oe_start >> ppc_bits; chunk_end = cur->oe_end >> ppc_bits; if (chunk_start != (victim->oe_end >> ppc_bits) + 1 && @@ -624,10 +637,10 @@ static inline int overlapped(struct osc_extent *ex1, struct osc_extent *ex2) static struct osc_extent *osc_extent_find(const struct lu_env *env, struct osc_object *obj, pgoff_t index, int *grants) - { struct client_obd *cli = osc_cli(obj); - struct cl_lock *lock; + struct osc_lock *olck; + struct cl_lock_descr *descr; struct osc_extent *cur; struct osc_extent *ext; struct osc_extent *conflict = NULL; @@ -644,11 +657,15 @@ static struct osc_extent *osc_extent_find(const struct lu_env *env, if (!cur) return ERR_PTR(-ENOMEM); - lock = cl_lock_at_pgoff(env, osc2cl(obj), index, NULL, 1, 0); - LASSERT(lock->cll_descr.cld_mode >= CLM_WRITE); + olck = osc_env_io(env)->oi_write_osclock; + LASSERTF(olck, "page %lu is not covered by lock\n", index); + LASSERT(olck->ols_state == OLS_GRANTED); - LASSERT(cli->cl_chunkbits >= PAGE_CACHE_SHIFT); - ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT; + descr = &olck->ols_cl.cls_lock->cll_descr; + LASSERT(descr->cld_mode >= CLM_WRITE); + + LASSERT(cli->cl_chunkbits >= PAGE_SHIFT); + ppc_bits = cli->cl_chunkbits - PAGE_SHIFT; chunk_mask = ~((1 << ppc_bits) - 1); chunksize = 1 << cli->cl_chunkbits; chunk = index >> ppc_bits; @@ -657,19 +674,23 @@ static struct osc_extent *osc_extent_find(const struct lu_env *env, max_pages = cli->cl_max_pages_per_rpc; LASSERT((max_pages & ~chunk_mask) == 0); max_end = index - (index % max_pages) + max_pages - 1; - max_end = min_t(pgoff_t, max_end, lock->cll_descr.cld_end); + max_end = min_t(pgoff_t, max_end, descr->cld_end); /* initialize new extent by parameters so far */ cur->oe_max_end = max_end; cur->oe_start = index & chunk_mask; cur->oe_end = ((index + ~chunk_mask + 1) & chunk_mask) - 1; - if (cur->oe_start < lock->cll_descr.cld_start) - cur->oe_start = lock->cll_descr.cld_start; + if (cur->oe_start < descr->cld_start) + cur->oe_start = descr->cld_start; if (cur->oe_end > max_end) cur->oe_end = max_end; - cur->oe_osclock = lock; cur->oe_grants = 0; cur->oe_mppr = max_pages; + if (olck->ols_dlmlock) { + LASSERT(olck->ols_hold); + cur->oe_dlmlock = LDLM_LOCK_GET(olck->ols_dlmlock); + lu_ref_add(&olck->ols_dlmlock->l_reference, "osc_extent", cur); + } /* grants has been allocated by caller */ LASSERTF(*grants >= chunksize + cli->cl_extent_tax, @@ -691,7 +712,7 @@ restart: break; /* if covering by different locks, no chance to match */ - if (lock != ext->oe_osclock) { + if (olck->ols_dlmlock != ext->oe_dlmlock) { EASSERTF(!overlapped(ext, cur), ext, EXTSTR"\n", EXTPARA(cur)); @@ -795,7 +816,7 @@ restart: if (found) { LASSERT(!conflict); if (!IS_ERR(found)) { - LASSERT(found->oe_osclock == cur->oe_osclock); + LASSERT(found->oe_dlmlock == cur->oe_dlmlock); OSC_EXTENT_DUMP(D_CACHE, found, "found caching ext for %lu.\n", index); } @@ -810,7 +831,7 @@ restart: found = osc_extent_hold(cur); osc_extent_insert(obj, cur); OSC_EXTENT_DUMP(D_CACHE, cur, "add into tree %lu/%lu.\n", - index, lock->cll_descr.cld_end); + index, descr->cld_end); } osc_object_unlock(obj); @@ -856,6 +877,8 @@ int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext, ext->oe_rc = rc ?: ext->oe_nr_pages; EASSERT(ergo(rc == 0, ext->oe_state == OES_RPC), ext); + + osc_lru_add_batch(cli, &ext->oe_pages); list_for_each_entry_safe(oap, tmp, &ext->oe_pages, oap_pending_item) { list_del_init(&oap->oap_rpc_item); list_del_init(&oap->oap_pending_item); @@ -871,20 +894,19 @@ int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext, if (!sent) { lost_grant = ext->oe_grants; - } else if (blocksize < PAGE_CACHE_SIZE && - last_count != PAGE_CACHE_SIZE) { + } else if (blocksize < PAGE_SIZE && + last_count != PAGE_SIZE) { /* For short writes we shouldn't count parts of pages that * span a whole chunk on the OST side, or our accounting goes * wrong. Should match the code in filter_grant_check. */ - int offset = oap->oap_page_off & ~CFS_PAGE_MASK; - int count = oap->oap_count + (offset & (blocksize - 1)); - int end = (offset + oap->oap_count) & (blocksize - 1); - + int offset = last_off & ~PAGE_MASK; + int count = last_count + (offset & (blocksize - 1)); + int end = (offset + last_count) & (blocksize - 1); if (end) count += blocksize - end; - lost_grant = PAGE_CACHE_SIZE - count; + lost_grant = PAGE_SIZE - count; } if (ext->oe_grants > 0) osc_free_grant(cli, nr_pages, lost_grant); @@ -943,7 +965,7 @@ static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext, "%s: wait ext to %d timedout, recovery in progress?\n", osc_export(obj)->exp_obd->obd_name, state); - lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + lwi = LWI_INTR(NULL, NULL); rc = l_wait_event(ext->oe_waitq, extent_wait_cb(ext, state), &lwi); } @@ -967,7 +989,7 @@ static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index, struct osc_async_page *oap; struct osc_async_page *tmp; int pages_in_chunk = 0; - int ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT; + int ppc_bits = cli->cl_chunkbits - PAGE_SHIFT; __u64 trunc_chunk = trunc_index >> ppc_bits; int grants = 0; int nr_pages = 0; @@ -990,19 +1012,19 @@ static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index, /* discard all pages with index greater then trunc_index */ list_for_each_entry_safe(oap, tmp, &ext->oe_pages, oap_pending_item) { - struct cl_page *sub = oap2cl_page(oap); - struct cl_page *page = cl_page_top(sub); + pgoff_t index = osc_index(oap2osc(oap)); + struct cl_page *page = oap2cl_page(oap); LASSERT(list_empty(&oap->oap_rpc_item)); /* only discard the pages with their index greater than * trunc_index, and ... */ - if (sub->cp_index < trunc_index || - (sub->cp_index == trunc_index && partial)) { + if (index < trunc_index || + (index == trunc_index && partial)) { /* accounting how many pages remaining in the chunk * so that we can calculate grants correctly. */ - if (sub->cp_index >> ppc_bits == trunc_chunk) + if (index >> ppc_bits == trunc_chunk) ++pages_in_chunk; continue; } @@ -1013,7 +1035,6 @@ static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index, lu_ref_add(&page->cp_reference, "truncate", current); if (cl_page_own(env, io, page) == 0) { - cl_page_unmap(env, io, page); cl_page_discard(env, io, page); cl_page_disown(env, io, page); } else { @@ -1125,8 +1146,10 @@ static int osc_extent_make_ready(const struct lu_env *env, if (!(last->oap_async_flags & ASYNC_COUNT_STABLE)) { last->oap_count = osc_refresh_count(env, last, OBD_BRW_WRITE); LASSERT(last->oap_count > 0); - LASSERT(last->oap_page_off + last->oap_count <= PAGE_CACHE_SIZE); + LASSERT(last->oap_page_off + last->oap_count <= PAGE_SIZE); + spin_lock(&last->oap_lock); last->oap_async_flags |= ASYNC_COUNT_STABLE; + spin_unlock(&last->oap_lock); } /* for the rest of pages, we don't need to call osf_refresh_count() @@ -1134,8 +1157,10 @@ static int osc_extent_make_ready(const struct lu_env *env, */ list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE)) { - oap->oap_count = PAGE_CACHE_SIZE - oap->oap_page_off; + oap->oap_count = PAGE_SIZE - oap->oap_page_off; + spin_lock(&last->oap_lock); oap->oap_async_flags |= ASYNC_COUNT_STABLE; + spin_unlock(&last->oap_lock); } } @@ -1158,7 +1183,7 @@ static int osc_extent_expand(struct osc_extent *ext, pgoff_t index, int *grants) struct osc_object *obj = ext->oe_obj; struct client_obd *cli = osc_cli(obj); struct osc_extent *next; - int ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT; + int ppc_bits = cli->cl_chunkbits - PAGE_SHIFT; pgoff_t chunk = index >> ppc_bits; pgoff_t end_chunk; pgoff_t end_index; @@ -1256,7 +1281,7 @@ static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap, int cmd) { struct osc_page *opg = oap2osc_page(oap); - struct cl_page *page = cl_page_top(oap2cl_page(oap)); + struct cl_page *page = oap2cl_page(oap); int result; LASSERT(cmd == OBD_BRW_WRITE); /* no cached reads */ @@ -1271,7 +1296,7 @@ static int osc_refresh_count(const struct lu_env *env, struct osc_async_page *oap, int cmd) { struct osc_page *opg = oap2osc_page(oap); - struct cl_page *page = oap2cl_page(oap); + pgoff_t index = osc_index(oap2osc(oap)); struct cl_object *obj; struct cl_attr *attr = &osc_env_info(env)->oti_attr; @@ -1288,28 +1313,30 @@ static int osc_refresh_count(const struct lu_env *env, if (result < 0) return result; kms = attr->cat_kms; - if (cl_offset(obj, page->cp_index) >= kms) + if (cl_offset(obj, index) >= kms) /* catch race with truncate */ return 0; - else if (cl_offset(obj, page->cp_index + 1) > kms) + else if (cl_offset(obj, index + 1) > kms) /* catch sub-page write at end of file */ - return kms % PAGE_CACHE_SIZE; + return kms % PAGE_SIZE; else - return PAGE_CACHE_SIZE; + return PAGE_SIZE; } static int osc_completion(const struct lu_env *env, struct osc_async_page *oap, int cmd, int rc) { struct osc_page *opg = oap2osc_page(oap); - struct cl_page *page = cl_page_top(oap2cl_page(oap)); + struct cl_page *page = oap2cl_page(oap); struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); enum cl_req_type crt; int srvlock; cmd &= ~OBD_BRW_NOQUOTA; - LASSERT(equi(page->cp_state == CPS_PAGEIN, cmd == OBD_BRW_READ)); - LASSERT(equi(page->cp_state == CPS_PAGEOUT, cmd == OBD_BRW_WRITE)); + LASSERTF(equi(page->cp_state == CPS_PAGEIN, cmd == OBD_BRW_READ), + "cp_state:%u, cmd:%d\n", page->cp_state, cmd); + LASSERTF(equi(page->cp_state == CPS_PAGEOUT, cmd == OBD_BRW_WRITE), + "cp_state:%u, cmd:%d\n", page->cp_state, cmd); LASSERT(opg->ops_transfer_pinned); /* @@ -1358,28 +1385,34 @@ static int osc_completion(const struct lu_env *env, struct osc_async_page *oap, return 0; } -#define OSC_DUMP_GRANT(cli, fmt, args...) do { \ +#define OSC_DUMP_GRANT(lvl, cli, fmt, args...) do { \ struct client_obd *__tmp = (cli); \ - CDEBUG(D_CACHE, "%s: { dirty: %ld/%ld dirty_pages: %d/%d " \ - "dropped: %ld avail: %ld, reserved: %ld, flight: %d } " fmt, \ + CDEBUG(lvl, "%s: grant { dirty: %ld/%ld dirty_pages: %d/%d " \ + "unstable_pages: %d/%d dropped: %ld avail: %ld, " \ + "reserved: %ld, flight: %d } lru {in list: %d, " \ + "left: %d, waiters: %d }" fmt, \ __tmp->cl_import->imp_obd->obd_name, \ __tmp->cl_dirty, __tmp->cl_dirty_max, \ atomic_read(&obd_dirty_pages), obd_max_dirty_pages, \ + atomic_read(&obd_unstable_pages), obd_max_dirty_pages, \ __tmp->cl_lost_grant, __tmp->cl_avail_grant, \ - __tmp->cl_reserved_grant, __tmp->cl_w_in_flight, ##args); \ + __tmp->cl_reserved_grant, __tmp->cl_w_in_flight, \ + atomic_read(&__tmp->cl_lru_in_list), \ + atomic_read(&__tmp->cl_lru_busy), \ + atomic_read(&__tmp->cl_lru_shrinkers), ##args); \ } while (0) /* caller must hold loi_list_lock */ static void osc_consume_write_grant(struct client_obd *cli, struct brw_page *pga) { - assert_spin_locked(&cli->cl_loi_list_lock.lock); + assert_spin_locked(&cli->cl_loi_list_lock); LASSERT(!(pga->flag & OBD_BRW_FROM_GRANT)); atomic_inc(&obd_dirty_pages); - cli->cl_dirty += PAGE_CACHE_SIZE; + cli->cl_dirty += PAGE_SIZE; pga->flag |= OBD_BRW_FROM_GRANT; CDEBUG(D_CACHE, "using %lu grant credits for brw %p page %p\n", - PAGE_CACHE_SIZE, pga, pga->pg); + PAGE_SIZE, pga, pga->pg); osc_update_next_shrink(cli); } @@ -1389,18 +1422,18 @@ static void osc_consume_write_grant(struct client_obd *cli, static void osc_release_write_grant(struct client_obd *cli, struct brw_page *pga) { - assert_spin_locked(&cli->cl_loi_list_lock.lock); + assert_spin_locked(&cli->cl_loi_list_lock); if (!(pga->flag & OBD_BRW_FROM_GRANT)) { return; } pga->flag &= ~OBD_BRW_FROM_GRANT; atomic_dec(&obd_dirty_pages); - cli->cl_dirty -= PAGE_CACHE_SIZE; + cli->cl_dirty -= PAGE_SIZE; if (pga->flag & OBD_BRW_NOCACHE) { pga->flag &= ~OBD_BRW_NOCACHE; atomic_dec(&obd_dirty_transit_pages); - cli->cl_dirty_transit -= PAGE_CACHE_SIZE; + cli->cl_dirty_transit -= PAGE_SIZE; } } @@ -1408,7 +1441,7 @@ static void osc_release_write_grant(struct client_obd *cli, * To avoid sleeping with object lock held, it's good for us allocate enough * grants before entering into critical section. * - * client_obd_list_lock held by caller + * spin_lock held by caller */ static int osc_reserve_grant(struct client_obd *cli, unsigned int bytes) { @@ -1442,11 +1475,11 @@ static void __osc_unreserve_grant(struct client_obd *cli, static void osc_unreserve_grant(struct client_obd *cli, unsigned int reserved, unsigned int unused) { - client_obd_list_lock(&cli->cl_loi_list_lock); + spin_lock(&cli->cl_loi_list_lock); __osc_unreserve_grant(cli, reserved, unused); if (unused > 0) osc_wake_cache_waiters(cli); - client_obd_list_unlock(&cli->cl_loi_list_lock); + spin_unlock(&cli->cl_loi_list_lock); } /** @@ -1456,7 +1489,7 @@ static void osc_unreserve_grant(struct client_obd *cli, * used, we should return these grants to OST. There're two cases where grants * can be lost: * 1. truncate; - * 2. blocksize at OST is less than PAGE_CACHE_SIZE and a partial page was + * 2. blocksize at OST is less than PAGE_SIZE and a partial page was * written. In this case OST may use less chunks to serve this partial * write. OSTs don't actually know the page size on the client side. so * clients have to calculate lost grant by the blocksize on the OST. @@ -1467,9 +1500,9 @@ static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages, { int grant = (1 << cli->cl_chunkbits) + cli->cl_extent_tax; - client_obd_list_lock(&cli->cl_loi_list_lock); + spin_lock(&cli->cl_loi_list_lock); atomic_sub(nr_pages, &obd_dirty_pages); - cli->cl_dirty -= nr_pages << PAGE_CACHE_SHIFT; + cli->cl_dirty -= nr_pages << PAGE_SHIFT; cli->cl_lost_grant += lost_grant; if (cli->cl_avail_grant < grant && cli->cl_lost_grant >= grant) { /* borrow some grant from truncate to avoid the case that @@ -1479,7 +1512,7 @@ static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages, cli->cl_avail_grant += grant; } osc_wake_cache_waiters(cli); - client_obd_list_unlock(&cli->cl_loi_list_lock); + spin_unlock(&cli->cl_loi_list_lock); CDEBUG(D_CACHE, "lost %u grant: %lu avail: %lu dirty: %lu\n", lost_grant, cli->cl_lost_grant, cli->cl_avail_grant, cli->cl_dirty); @@ -1491,9 +1524,9 @@ static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages, */ static void osc_exit_cache(struct client_obd *cli, struct osc_async_page *oap) { - client_obd_list_lock(&cli->cl_loi_list_lock); + spin_lock(&cli->cl_loi_list_lock); osc_release_write_grant(cli, &oap->oap_brw_page); - client_obd_list_unlock(&cli->cl_loi_list_lock); + spin_unlock(&cli->cl_loi_list_lock); } /** @@ -1506,17 +1539,18 @@ static int osc_enter_cache_try(struct client_obd *cli, { int rc; - OSC_DUMP_GRANT(cli, "need:%d.\n", bytes); + OSC_DUMP_GRANT(D_CACHE, cli, "need:%d.\n", bytes); rc = osc_reserve_grant(cli, bytes); if (rc < 0) return 0; - if (cli->cl_dirty + PAGE_CACHE_SIZE <= cli->cl_dirty_max && - atomic_read(&obd_dirty_pages) + 1 <= obd_max_dirty_pages) { + if (cli->cl_dirty + PAGE_SIZE <= cli->cl_dirty_max && + atomic_read(&obd_unstable_pages) + 1 + + atomic_read(&obd_dirty_pages) <= obd_max_dirty_pages) { osc_consume_write_grant(cli, &oap->oap_brw_page); if (transient) { - cli->cl_dirty_transit += PAGE_CACHE_SIZE; + cli->cl_dirty_transit += PAGE_SIZE; atomic_inc(&obd_dirty_transit_pages); oap->oap_brw_flags |= OBD_BRW_NOCACHE; } @@ -1532,9 +1566,9 @@ static int ocw_granted(struct client_obd *cli, struct osc_cache_waiter *ocw) { int rc; - client_obd_list_lock(&cli->cl_loi_list_lock); + spin_lock(&cli->cl_loi_list_lock); rc = list_empty(&ocw->ocw_entry); - client_obd_list_unlock(&cli->cl_loi_list_lock); + spin_unlock(&cli->cl_loi_list_lock); return rc; } @@ -1551,18 +1585,19 @@ static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, struct osc_object *osc = oap->oap_obj; struct lov_oinfo *loi = osc->oo_oinfo; struct osc_cache_waiter ocw; - struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(600), NULL, + LWI_ON_SIGNAL_NOOP, NULL); int rc = -EDQUOT; - OSC_DUMP_GRANT(cli, "need:%d.\n", bytes); + OSC_DUMP_GRANT(D_CACHE, cli, "need:%d.\n", bytes); - client_obd_list_lock(&cli->cl_loi_list_lock); + spin_lock(&cli->cl_loi_list_lock); /* force the caller to try sync io. this can jump the list * of queued writes and create a discontiguous rpc stream */ if (OBD_FAIL_CHECK(OBD_FAIL_OSC_NO_GRANT) || - cli->cl_dirty_max < PAGE_CACHE_SIZE || + cli->cl_dirty_max < PAGE_SIZE || cli->cl_ar.ar_force_sync || loi->loi_ar.ar_force_sync) { rc = -EDQUOT; goto out; @@ -1587,7 +1622,7 @@ static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, while (cli->cl_dirty > 0 || cli->cl_w_in_flight > 0) { list_add_tail(&ocw.ocw_entry, &cli->cl_cache_waiters); ocw.ocw_rc = 0; - client_obd_list_unlock(&cli->cl_loi_list_lock); + spin_unlock(&cli->cl_loi_list_lock); osc_io_unplug_async(env, cli, NULL); @@ -1596,10 +1631,17 @@ static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, rc = l_wait_event(ocw.ocw_waitq, ocw_granted(cli, &ocw), &lwi); - client_obd_list_lock(&cli->cl_loi_list_lock); + spin_lock(&cli->cl_loi_list_lock); - /* l_wait_event is interrupted by signal */ + /* l_wait_event is interrupted by signal, or timed out */ if (rc < 0) { + if (rc == -ETIMEDOUT) { + OSC_DUMP_GRANT(D_ERROR, cli, + "try to reserve %d.\n", bytes); + osc_extent_tree_dump(D_ERROR, osc); + rc = -EDQUOT; + } + list_del_init(&ocw.ocw_entry); goto out; } @@ -1615,8 +1657,8 @@ static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, } } out: - client_obd_list_unlock(&cli->cl_loi_list_lock); - OSC_DUMP_GRANT(cli, "returned %d.\n", rc); + spin_unlock(&cli->cl_loi_list_lock); + OSC_DUMP_GRANT(D_CACHE, cli, "returned %d.\n", rc); return rc; } @@ -1632,9 +1674,9 @@ void osc_wake_cache_waiters(struct client_obd *cli) ocw->ocw_rc = -EDQUOT; /* we can't dirty more */ - if ((cli->cl_dirty + PAGE_CACHE_SIZE > cli->cl_dirty_max) || - (atomic_read(&obd_dirty_pages) + 1 > - obd_max_dirty_pages)) { + if ((cli->cl_dirty + PAGE_SIZE > cli->cl_dirty_max) || + (atomic_read(&obd_unstable_pages) + 1 + + atomic_read(&obd_dirty_pages) > obd_max_dirty_pages)) { CDEBUG(D_CACHE, "no dirty room: dirty: %ld osc max %ld, sys max %d\n", cli->cl_dirty, cli->cl_dirty_max, obd_max_dirty_pages); @@ -1776,9 +1818,9 @@ static int osc_list_maint(struct client_obd *cli, struct osc_object *osc) { int is_ready; - client_obd_list_lock(&cli->cl_loi_list_lock); + spin_lock(&cli->cl_loi_list_lock); is_ready = __osc_list_maint(cli, osc); - client_obd_list_unlock(&cli->cl_loi_list_lock); + spin_unlock(&cli->cl_loi_list_lock); return is_ready; } @@ -1799,13 +1841,101 @@ static void osc_process_ar(struct osc_async_rc *ar, __u64 xid, ar->ar_force_sync = 1; ar->ar_min_xid = ptlrpc_sample_next_xid(); return; - } if (ar->ar_force_sync && (xid >= ar->ar_min_xid)) ar->ar_force_sync = 0; } +/** + * Performs "unstable" page accounting. This function balances the + * increment operations performed in osc_inc_unstable_pages. It is + * registered as the RPC request callback, and is executed when the + * bulk RPC is committed on the server. Thus at this point, the pages + * involved in the bulk transfer are no longer considered unstable. + */ +void osc_dec_unstable_pages(struct ptlrpc_request *req) +{ + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + struct ptlrpc_bulk_desc *desc = req->rq_bulk; + int page_count = desc->bd_iov_count; + int i; + + /* No unstable page tracking */ + if (!cli->cl_cache) + return; + + LASSERT(page_count >= 0); + + for (i = 0; i < page_count; i++) + dec_zone_page_state(desc->bd_iov[i].kiov_page, NR_UNSTABLE_NFS); + + atomic_sub(page_count, &cli->cl_cache->ccc_unstable_nr); + LASSERT(atomic_read(&cli->cl_cache->ccc_unstable_nr) >= 0); + + atomic_sub(page_count, &cli->cl_unstable_count); + LASSERT(atomic_read(&cli->cl_unstable_count) >= 0); + + atomic_sub(page_count, &obd_unstable_pages); + LASSERT(atomic_read(&obd_unstable_pages) >= 0); + + spin_lock(&req->rq_lock); + req->rq_committed = 1; + req->rq_unstable = 0; + spin_unlock(&req->rq_lock); + + wake_up_all(&cli->cl_cache->ccc_unstable_waitq); +} + +/* "unstable" page accounting. See: osc_dec_unstable_pages. */ +void osc_inc_unstable_pages(struct ptlrpc_request *req) +{ + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + struct ptlrpc_bulk_desc *desc = req->rq_bulk; + long page_count = desc->bd_iov_count; + int i; + + /* No unstable page tracking */ + if (!cli->cl_cache) + return; + + LASSERT(page_count >= 0); + + for (i = 0; i < page_count; i++) + inc_zone_page_state(desc->bd_iov[i].kiov_page, NR_UNSTABLE_NFS); + + LASSERT(atomic_read(&cli->cl_cache->ccc_unstable_nr) >= 0); + atomic_add(page_count, &cli->cl_cache->ccc_unstable_nr); + + LASSERT(atomic_read(&cli->cl_unstable_count) >= 0); + atomic_add(page_count, &cli->cl_unstable_count); + + LASSERT(atomic_read(&obd_unstable_pages) >= 0); + atomic_add(page_count, &obd_unstable_pages); + + spin_lock(&req->rq_lock); + + /* + * If the request has already been committed (i.e. brw_commit + * called via rq_commit_cb), we need to undo the unstable page + * increments we just performed because rq_commit_cb wont be + * called again. Otherwise, just set the commit callback so the + * unstable page accounting is properly updated when the request + * is committed + */ + if (req->rq_committed) { + /* Drop lock before calling osc_dec_unstable_pages */ + spin_unlock(&req->rq_lock); + osc_dec_unstable_pages(req); + spin_lock(&req->rq_lock); + } else { + req->rq_unstable = 1; + req->rq_commit_cb = osc_dec_unstable_pages; + } + + spin_unlock(&req->rq_lock); +} + /* this must be called holding the loi list lock to give coverage to exit_cache, * async_flag maintenance, and oap_request */ @@ -1817,6 +1947,9 @@ static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli, __u64 xid = 0; if (oap->oap_request) { + if (!rc) + osc_inc_unstable_pages(oap->oap_request); + xid = ptlrpc_req_xid(oap->oap_request); ptlrpc_req_finished(oap->oap_request); oap->oap_request = NULL; @@ -1829,10 +1962,10 @@ static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli, oap->oap_interrupted = 0; if (oap->oap_cmd & OBD_BRW_WRITE && xid > 0) { - client_obd_list_lock(&cli->cl_loi_list_lock); + spin_lock(&cli->cl_loi_list_lock); osc_process_ar(&cli->cl_ar, xid, rc); osc_process_ar(&loi->loi_ar, xid, rc); - client_obd_list_unlock(&cli->cl_loi_list_lock); + spin_unlock(&cli->cl_loi_list_lock); } rc = osc_completion(env, oap, oap->oap_cmd, rc); @@ -2133,9 +2266,8 @@ static void osc_check_rpcs(const struct lu_env *env, struct client_obd *cli) } cl_object_get(obj); - client_obd_list_unlock(&cli->cl_loi_list_lock); - lu_object_ref_add_at(&obj->co_lu, &link, "check", - current); + spin_unlock(&cli->cl_loi_list_lock); + lu_object_ref_add_at(&obj->co_lu, &link, "check", current); /* attempt some read/write balancing by alternating between * reads and writes in an object. The makes_rpc checks here @@ -2178,11 +2310,10 @@ static void osc_check_rpcs(const struct lu_env *env, struct client_obd *cli) osc_object_unlock(osc); osc_list_maint(cli, osc); - lu_object_ref_del_at(&obj->co_lu, &link, "check", - current); + lu_object_ref_del_at(&obj->co_lu, &link, "check", current); cl_object_put(env, obj); - client_obd_list_lock(&cli->cl_loi_list_lock); + spin_lock(&cli->cl_loi_list_lock); } } @@ -2199,9 +2330,9 @@ static int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli, * potential stack overrun problem. LU-2859 */ atomic_inc(&cli->cl_lru_shrinkers); - client_obd_list_lock(&cli->cl_loi_list_lock); + spin_lock(&cli->cl_loi_list_lock); osc_check_rpcs(env, cli); - client_obd_list_unlock(&cli->cl_loi_list_lock); + spin_unlock(&cli->cl_loi_list_lock); atomic_dec(&cli->cl_lru_shrinkers); } else { CDEBUG(D_CACHE, "Queue writeback work for client %p.\n", cli); @@ -2238,7 +2369,7 @@ int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, oap->oap_page = page; oap->oap_obj_off = offset; - LASSERT(!(offset & ~CFS_PAGE_MASK)); + LASSERT(!(offset & ~PAGE_MASK)); if (!client_is_remote(exp) && capable(CFS_CAP_SYS_RESOURCE)) oap->oap_brw_flags = OBD_BRW_NOQUOTA; @@ -2306,16 +2437,23 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, return rc; } + if (osc_over_unstable_soft_limit(cli)) + brw_flags |= OBD_BRW_SOFT_SYNC; + oap->oap_cmd = cmd; oap->oap_page_off = ops->ops_from; oap->oap_count = ops->ops_to - ops->ops_from; + /* + * No need to hold a lock here, + * since this page is not in any list yet. + */ oap->oap_async_flags = 0; oap->oap_brw_flags = brw_flags; OSC_IO_DEBUG(osc, "oap %p page %p added for cmd %d\n", oap, oap->oap_page, oap->oap_cmd & OBD_BRW_RWMASK); - index = oap2cl_page(oap)->cp_index; + index = osc_index(oap2osc(oap)); /* Add this page into extent by the following steps: * 1. if there exists an active extent for this IO, mostly this page @@ -2334,9 +2472,9 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, grants = 0; /* it doesn't need any grant to dirty this page */ - client_obd_list_lock(&cli->cl_loi_list_lock); + spin_lock(&cli->cl_loi_list_lock); rc = osc_enter_cache_try(cli, oap, grants, 0); - client_obd_list_unlock(&cli->cl_loi_list_lock); + spin_unlock(&cli->cl_loi_list_lock); if (rc == 0) { /* try failed */ grants = 0; need_release = 1; @@ -2427,21 +2565,21 @@ int osc_teardown_async_page(const struct lu_env *env, LASSERT(oap->oap_magic == OAP_MAGIC); CDEBUG(D_INFO, "teardown oap %p page %p at index %lu.\n", - oap, ops, oap2cl_page(oap)->cp_index); + oap, ops, osc_index(oap2osc(oap))); osc_object_lock(obj); if (!list_empty(&oap->oap_rpc_item)) { CDEBUG(D_CACHE, "oap %p is not in cache.\n", oap); rc = -EBUSY; } else if (!list_empty(&oap->oap_pending_item)) { - ext = osc_extent_lookup(obj, oap2cl_page(oap)->cp_index); + ext = osc_extent_lookup(obj, osc_index(oap2osc(oap))); /* only truncated pages are allowed to be taken out. * See osc_extent_truncate() and osc_cache_truncate_start() * for details. */ if (ext && ext->oe_state != OES_TRUNC) { OSC_EXTENT_DUMP(D_ERROR, ext, "trunc at %lu.\n", - oap2cl_page(oap)->cp_index); + osc_index(oap2osc(oap))); rc = -EBUSY; } } @@ -2464,7 +2602,7 @@ int osc_flush_async_page(const struct lu_env *env, struct cl_io *io, struct osc_extent *ext = NULL; struct osc_object *obj = cl2osc(ops->ops_cl.cpl_obj); struct cl_page *cp = ops->ops_cl.cpl_page; - pgoff_t index = cp->cp_index; + pgoff_t index = osc_index(ops); struct osc_async_page *oap = &ops->ops_oap; bool unplug = false; int rc = 0; @@ -2479,8 +2617,7 @@ int osc_flush_async_page(const struct lu_env *env, struct cl_io *io, switch (ext->oe_state) { case OES_RPC: case OES_LOCK_DONE: - CL_PAGE_DEBUG(D_ERROR, env, cl_page_top(cp), - "flush an in-rpc page?\n"); + CL_PAGE_DEBUG(D_ERROR, env, cp, "flush an in-rpc page?\n"); LASSERT(0); break; case OES_LOCKING: @@ -2506,7 +2643,7 @@ int osc_flush_async_page(const struct lu_env *env, struct cl_io *io, break; } - rc = cl_page_prep(env, io, cl_page_top(cp), CRT_WRITE); + rc = cl_page_prep(env, io, cp, CRT_WRITE); if (rc) goto out; @@ -2550,7 +2687,7 @@ int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops) struct osc_extent *ext; struct osc_extent *found = NULL; struct list_head *plist; - pgoff_t index = oap2cl_page(oap)->cp_index; + pgoff_t index = osc_index(ops); int rc = -EBUSY; int cmd; @@ -2613,12 +2750,12 @@ int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj, pgoff_t end = 0; list_for_each_entry(oap, list, oap_pending_item) { - struct cl_page *cp = oap2cl_page(oap); + pgoff_t index = osc_index(oap2osc(oap)); - if (cp->cp_index > end) - end = cp->cp_index; - if (cp->cp_index < start) - start = cp->cp_index; + if (index > end) + end = index; + if (index < start) + start = index; ++page_count; mppr <<= (page_count > mppr); } @@ -2633,6 +2770,7 @@ int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj, } ext->oe_rw = !!(cmd & OBD_BRW_READ); + ext->oe_sync = 1; ext->oe_urgent = 1; ext->oe_start = start; ext->oe_end = ext->oe_max_end = end; @@ -2988,7 +3126,200 @@ int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, result = rc; } - OSC_IO_DEBUG(obj, "cache page out.\n"); + OSC_IO_DEBUG(obj, "pageout [%lu, %lu], %d.\n", start, end, result); + return result; +} + +/** + * Returns a list of pages by a given [start, end] of \a obj. + * + * \param resched If not NULL, then we give up before hogging CPU for too + * long and set *resched = 1, in that case caller should implement a retry + * logic. + * + * Gang tree lookup (radix_tree_gang_lookup()) optimization is absolutely + * crucial in the face of [offset, EOF] locks. + * + * Return at least one page in @queue unless there is no covered page. + */ +int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, + struct osc_object *osc, pgoff_t start, pgoff_t end, + osc_page_gang_cbt cb, void *cbdata) +{ + struct osc_page *ops; + void **pvec; + pgoff_t idx; + unsigned int nr; + unsigned int i; + unsigned int j; + int res = CLP_GANG_OKAY; + bool tree_lock = true; + + idx = start; + pvec = osc_env_info(env)->oti_pvec; + spin_lock(&osc->oo_tree_lock); + while ((nr = radix_tree_gang_lookup(&osc->oo_tree, pvec, + idx, OTI_PVEC_SIZE)) > 0) { + struct cl_page *page; + bool end_of_region = false; + + for (i = 0, j = 0; i < nr; ++i) { + ops = pvec[i]; + pvec[i] = NULL; + + idx = osc_index(ops); + if (idx > end) { + end_of_region = true; + break; + } + + page = ops->ops_cl.cpl_page; + LASSERT(page->cp_type == CPT_CACHEABLE); + if (page->cp_state == CPS_FREEING) + continue; + + cl_page_get(page); + lu_ref_add_atomic(&page->cp_reference, + "gang_lookup", current); + pvec[j++] = ops; + } + ++idx; + + /* + * Here a delicate locking dance is performed. Current thread + * holds a reference to a page, but has to own it before it + * can be placed into queue. Owning implies waiting, so + * radix-tree lock is to be released. After a wait one has to + * check that pages weren't truncated (cl_page_own() returns + * error in the latter case). + */ + spin_unlock(&osc->oo_tree_lock); + tree_lock = false; + + for (i = 0; i < j; ++i) { + ops = pvec[i]; + if (res == CLP_GANG_OKAY) + res = (*cb)(env, io, ops, cbdata); + + page = ops->ops_cl.cpl_page; + lu_ref_del(&page->cp_reference, "gang_lookup", current); + cl_page_put(env, page); + } + if (nr < OTI_PVEC_SIZE || end_of_region) + break; + + if (res == CLP_GANG_OKAY && need_resched()) + res = CLP_GANG_RESCHED; + if (res != CLP_GANG_OKAY) + break; + + spin_lock(&osc->oo_tree_lock); + tree_lock = true; + } + if (tree_lock) + spin_unlock(&osc->oo_tree_lock); + return res; +} + +/** + * Check if page @page is covered by an extra lock or discard it. + */ +static int check_and_discard_cb(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops, void *cbdata) +{ + struct osc_thread_info *info = osc_env_info(env); + struct osc_object *osc = cbdata; + pgoff_t index; + + index = osc_index(ops); + if (index >= info->oti_fn_index) { + struct ldlm_lock *tmp; + struct cl_page *page = ops->ops_cl.cpl_page; + + /* refresh non-overlapped index */ + tmp = osc_dlmlock_at_pgoff(env, osc, index, 0, 0); + if (tmp) { + __u64 end = tmp->l_policy_data.l_extent.end; + /* Cache the first-non-overlapped index so as to skip + * all pages within [index, oti_fn_index). This is safe + * because if tmp lock is canceled, it will discard + * these pages. + */ + info->oti_fn_index = cl_index(osc2cl(osc), end + 1); + if (end == OBD_OBJECT_EOF) + info->oti_fn_index = CL_PAGE_EOF; + LDLM_LOCK_PUT(tmp); + } else if (cl_page_own(env, io, page) == 0) { + /* discard the page */ + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + } else { + LASSERT(page->cp_state == CPS_FREEING); + } + } + + info->oti_next_index = index + 1; + return CLP_GANG_OKAY; +} + +static int discard_cb(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops, void *cbdata) +{ + struct osc_thread_info *info = osc_env_info(env); + struct cl_page *page = ops->ops_cl.cpl_page; + + /* page is top page. */ + info->oti_next_index = osc_index(ops) + 1; + if (cl_page_own(env, io, page) == 0) { + KLASSERT(ergo(page->cp_type == CPT_CACHEABLE, + !PageDirty(cl_page_vmpage(page)))); + + /* discard the page */ + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + } else { + LASSERT(page->cp_state == CPS_FREEING); + } + + return CLP_GANG_OKAY; +} + +/** + * Discard pages protected by the given lock. This function traverses radix + * tree to find all covering pages and discard them. If a page is being covered + * by other locks, it should remain in cache. + * + * If error happens on any step, the process continues anyway (the reasoning + * behind this being that lock cancellation cannot be delayed indefinitely). + */ +int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc, + pgoff_t start, pgoff_t end, enum cl_lock_mode mode) +{ + struct osc_thread_info *info = osc_env_info(env); + struct cl_io *io = &info->oti_io; + osc_page_gang_cbt cb; + int res; + int result; + + io->ci_obj = cl_object_top(osc2cl(osc)); + io->ci_ignore_layout = 1; + result = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (result != 0) + goto out; + + cb = mode == CLM_READ ? check_and_discard_cb : discard_cb; + info->oti_fn_index = info->oti_next_index = start; + do { + res = osc_page_gang_lookup(env, io, osc, + info->oti_next_index, end, cb, osc); + if (info->oti_next_index > end) + break; + + if (res == CLP_GANG_RESCHED) + cond_resched(); + } while (res != CLP_GANG_OKAY); +out: + cl_io_fini(env, io); return result; } |