if (this_bw < tot_bw) {
if (min) {
min *= this_bw;
- do_div(min, tot_bw);
+ min = div64_ul(min, tot_bw);
}
if (max < 100) {
max *= this_bw;
- do_div(max, tot_bw);
+ max = div64_ul(max, tot_bw);
}
}
struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc);
unsigned long bytes = vm_dirty_bytes;
unsigned long bg_bytes = dirty_background_bytes;
- unsigned long ratio = vm_dirty_ratio;
- unsigned long bg_ratio = dirty_background_ratio;
+ /* convert ratios to per-PAGE_SIZE for higher precision */
+ unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100;
+ unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100;
unsigned long thresh;
unsigned long bg_thresh;
struct task_struct *tsk;
/*
* The byte settings can't be applied directly to memcg
* domains. Convert them to ratios by scaling against
- * globally available memory.
+ * globally available memory. As the ratios are in
+ * per-PAGE_SIZE, they can be obtained by dividing bytes by
+ * number of pages.
*/
if (bytes)
- ratio = min(DIV_ROUND_UP(bytes, PAGE_SIZE) * 100 /
- global_avail, 100UL);
+ ratio = min(DIV_ROUND_UP(bytes, global_avail),
+ PAGE_SIZE);
if (bg_bytes)
- bg_ratio = min(DIV_ROUND_UP(bg_bytes, PAGE_SIZE) * 100 /
- global_avail, 100UL);
+ bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail),
+ PAGE_SIZE);
bytes = bg_bytes = 0;
}
if (bytes)
thresh = DIV_ROUND_UP(bytes, PAGE_SIZE);
else
- thresh = (ratio * available_memory) / 100;
+ thresh = (ratio * available_memory) / PAGE_SIZE;
if (bg_bytes)
bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE);
else
- bg_thresh = (bg_ratio * available_memory) / 100;
+ bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE;
if (bg_thresh >= thresh)
bg_thresh = thresh / 2;
unsigned long balanced_dirty_ratelimit;
unsigned long step;
unsigned long x;
+ unsigned long shift;
/*
* The dirty rate will match the writeout rate in long term, except
* rate itself is constantly fluctuating. So decrease the track speed
* when it gets close to the target. Helps eliminate pointless tremors.
*/
- step >>= dirty_ratelimit / (2 * step + 1);
- /*
- * Limit the tracking speed to avoid overshooting.
- */
- step = (step + 7) / 8;
+ shift = dirty_ratelimit / (2 * step + 1);
+ if (shift < BITS_PER_LONG)
+ step = DIV_ROUND_UP(step >> shift, 8);
+ else
+ step = 0;
if (dirty_ratelimit < balanced_dirty_ratelimit)
dirty_ratelimit += step;
if (gdtc->dirty > gdtc->bg_thresh)
return true;
- if (wb_stat(wb, WB_RECLAIMABLE) > __wb_calc_thresh(gdtc))
+ if (wb_stat(wb, WB_RECLAIMABLE) >
+ wb_calc_thresh(gdtc->wb, gdtc->bg_thresh))
return true;
if (mdtc) {
if (mdtc->dirty > mdtc->bg_thresh)
return true;
- if (wb_stat(wb, WB_RECLAIMABLE) > __wb_calc_thresh(mdtc))
+ if (wb_stat(wb, WB_RECLAIMABLE) >
+ wb_calc_thresh(mdtc->wb, mdtc->bg_thresh))
return true;
}
* not miss some pages (e.g., because some other process has cleared TOWRITE
* tag we set). The rule we follow is that TOWRITE tag can be cleared only
* by the process clearing the DIRTY tag (and submitting the page for IO).
+ *
+ * To avoid deadlocks between range_cyclic writeback and callers that hold
+ * pages in PageWriteback to aggregate IO until write_cache_pages() returns,
+ * we do not loop back to the start of the file. Doing so causes a page
+ * lock/page writeback access order inversion - we should only ever lock
+ * multiple pages in ascending page->index order, and looping back to the start
+ * of the file violates that rule and causes deadlocks.
*/
int write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc, writepage_t writepage,
{
int ret = 0;
int done = 0;
+ int error;
struct pagevec pvec;
int nr_pages;
pgoff_t uninitialized_var(writeback_index);
pgoff_t index;
pgoff_t end; /* Inclusive */
pgoff_t done_index;
- int cycled;
int range_whole = 0;
int tag;
if (wbc->range_cyclic) {
writeback_index = mapping->writeback_index; /* prev offset */
index = writeback_index;
- if (index == 0)
- cycled = 1;
- else
- cycled = 0;
end = -1;
} else {
index = wbc->range_start >> PAGE_CACHE_SHIFT;
end = wbc->range_end >> PAGE_CACHE_SHIFT;
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
- cycled = 1; /* ignore range_cyclic tests */
}
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE;
else
tag = PAGECACHE_TAG_DIRTY;
-retry:
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag_pages_for_writeback(mapping, index, end);
done_index = index;
goto continue_unlock;
trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
- ret = (*writepage)(page, wbc, data);
- if (unlikely(ret)) {
- if (ret == AOP_WRITEPAGE_ACTIVATE) {
+ error = (*writepage)(page, wbc, data);
+ if (unlikely(error)) {
+ /*
+ * Handle errors according to the type of
+ * writeback. There's no need to continue for
+ * background writeback. Just push done_index
+ * past this page so media errors won't choke
+ * writeout for the entire file. For integrity
+ * writeback, we must process the entire dirty
+ * set regardless of errors because the fs may
+ * still have state to clear for each page. In
+ * that case we continue processing and return
+ * the first error.
+ */
+ if (error == AOP_WRITEPAGE_ACTIVATE) {
unlock_page(page);
- ret = 0;
- } else {
- /*
- * done_index is set past this page,
- * so media errors will not choke
- * background writeout for the entire
- * file. This has consequences for
- * range_cyclic semantics (ie. it may
- * not be suitable for data integrity
- * writeout).
- */
+ error = 0;
+ } else if (wbc->sync_mode != WB_SYNC_ALL) {
+ ret = error;
done_index = page->index + 1;
done = 1;
break;
}
+ if (!ret)
+ ret = error;
}
/*
pagevec_release(&pvec);
cond_resched();
}
- if (!cycled && !done) {
- /*
- * range_cyclic:
- * We hit the last page and there is more work to be done: wrap
- * back to the start of the file
- */
- cycled = 1;
- index = 0;
- end = writeback_index - 1;
- goto retry;
- }
+
+ /*
+ * If we hit the last page and there is more work to be done: wrap
+ * back the index back to the start of the file for the next
+ * time we are called.
+ */
+ if (wbc->range_cyclic && !done)
+ done_index = 0;
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
mapping->writeback_index = done_index;
if (mapping && mapping_cap_account_dirty(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
- bool locked;
+ struct wb_lock_cookie cookie = {};
- wb = unlocked_inode_to_wb_begin(inode, &locked);
+ wb = unlocked_inode_to_wb_begin(inode, &cookie);
current->nr_dirtied--;
dec_zone_page_state(page, NR_DIRTIED);
dec_wb_stat(wb, WB_DIRTIED);
- unlocked_inode_to_wb_end(inode, locked);
+ unlocked_inode_to_wb_end(inode, &cookie);
}
}
EXPORT_SYMBOL(account_page_redirty);
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
struct mem_cgroup *memcg;
- bool locked;
+ struct wb_lock_cookie cookie = {};
memcg = mem_cgroup_begin_page_stat(page);
- wb = unlocked_inode_to_wb_begin(inode, &locked);
+ wb = unlocked_inode_to_wb_begin(inode, &cookie);
if (TestClearPageDirty(page))
account_page_cleaned(page, mapping, memcg, wb);
- unlocked_inode_to_wb_end(inode, locked);
+ unlocked_inode_to_wb_end(inode, &cookie);
mem_cgroup_end_page_stat(memcg);
} else {
ClearPageDirty(page);
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
struct mem_cgroup *memcg;
- bool locked;
+ struct wb_lock_cookie cookie = {};
/*
* Yes, Virginia, this is indeed insane.
* exclusion.
*/
memcg = mem_cgroup_begin_page_stat(page);
- wb = unlocked_inode_to_wb_begin(inode, &locked);
+ wb = unlocked_inode_to_wb_begin(inode, &cookie);
if (TestClearPageDirty(page)) {
mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
dec_zone_page_state(page, NR_FILE_DIRTY);
dec_wb_stat(wb, WB_RECLAIMABLE);
ret = 1;
}
- unlocked_inode_to_wb_end(inode, locked);
+ unlocked_inode_to_wb_end(inode, &cookie);
mem_cgroup_end_page_stat(memcg);
return ret;
}