/*
 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
 *
 * Removed a lot of unnecessary code and simplified things now that
 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
 *
 * Speed up hash, lru, and free list operations.  Use gfp() for allocating
 * hash table, use SLAB cache for buffer heads.  SMP threading.  -DaveM
 *
 * Added 32k buffer block sizes - these are required for older ARM
 * systems. - RMK
 *
 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
 */
/*
 * Report whether a locked folio has dirty or writeback buffers.
 *
 * @folio:     folio to inspect; it must be locked (BUG otherwise).
 * @dirty:     set to true if any attached buffer is dirty.
 * @writeback: set to true if the folio is under writeback or any attached
 *             buffer is locked (a locked buffer is assumed to be locked
 *             for IO).
 *
 * Both outputs are cleared first, so a folio without buffers reports
 * false/false.  If all the buffers are unlocked and clean then the
 * folio_test_dirty information is stale.
 */
void buffer_check_dirty_writeback(struct folio *folio,
				  bool *dirty, bool *writeback)
{
	struct buffer_head *first, *cur;

	*dirty = false;
	*writeback = false;

	BUG_ON(!folio_test_locked(folio));

	first = folio_buffers(folio);
	if (!first)
		return;

	if (folio_test_writeback(folio))
		*writeback = true;

	/* Walk the circular b_this_page ring exactly once. */
	for (cur = first;;) {
		if (buffer_locked(cur))
			*writeback = true;

		if (buffer_dirty(cur))
			*dirty = true;

		cur = cur->b_this_page;
		if (cur == first)
			break;
	}
}
/*
 * Sleep, uninterruptibly, until the BH_Lock bit of @bh's state word is
 * clear.
 *
 * Nothing stops the buffer from being locked again once this returns:
 * a caller that needs the state to stay put has to take the lock itself.
 */
void __wait_on_buffer(struct buffer_head * bh)
{
	wait_on_bit_io(&bh->b_state,
		       BH_Lock,
		       TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__wait_on_buffer);
staticvoid buffer_io_error(struct buffer_head *bh, char *msg)
{ if (!test_bit(BH_Quiet, &bh->b_state))
printk_ratelimited(KERN_ERR "Buffer I/O error on dev %pg, logical block %llu%s\n",
bh->b_bdev, (unsignedlonglong)bh->b_blocknr, msg);
}
/*
 * End-of-IO handler helper function which does not touch the bh after
 * unlocking it.
 *
 * @bh:       buffer whose read has completed.
 * @uptodate: non-zero if the read succeeded; the uptodate bit is set or
 *            cleared accordingly before the buffer is unlocked.
 *
 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
 * a race there is benign: unlock_buffer() only uses the bh's address for
 * hashing after unlocking the buffer, so it doesn't actually touch the bh
 * itself.
 */
static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
{
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		/* This happens, due to failed read-ahead attempts. */
		clear_buffer_uptodate(bh);
	}
	unlock_buffer(bh);
}
/*
 * Default synchronous end-of-IO handler for reads.
 *
 * Drops one reference on @bh, then marks it up-to-date (or not, depending
 * on @uptodate) and unlocks it.
 *
 * The reference is released *before* the unlock on purpose: the helper
 * called last is the one that promises not to touch the bh once it has
 * been unlocked, so nothing may follow it.  Do not reorder.
 */
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
	/* Must come first - see above. */
	put_bh(bh);

	__end_buffer_read_notouch(bh, uptodate);
}
EXPORT_SYMBOL(end_buffer_read_sync);
/*
 * Synchronous end-of-IO handler for writes.
 *
 * On success the buffer is marked up-to-date.  On failure the error is
 * logged, recorded via mark_buffer_write_io_error() and the up-to-date
 * bit is cleared.  In both cases the buffer is then unlocked and one
 * reference is dropped.
 */
void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
{
	if (!uptodate) {
		buffer_io_error(bh, ", lost sync page write");
		mark_buffer_write_io_error(bh);
		clear_buffer_uptodate(bh);
	} else {
		set_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
	put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_write_sync);
index = ((loff_t)block << blkbits) / PAGE_SIZE;
folio = __filemap_get_folio(bd_mapping, index, FGP_ACCESSED, 0); if (IS_ERR(folio)) goto out;
/* * Folio lock protects the buffers. Callers that cannot block * will fallback to serializing vs try_to_free_buffers() via * the i_private_lock.
*/ if (atomic)
spin_lock(&bd_mapping->i_private_lock); else
folio_lock(folio);
head = folio_buffers(folio); if (!head) goto out_unlock; /* * Upon a noref migration, the folio lock serializes here; * otherwise bail.
*/ if (test_bit_acquire(BH_Migrate, &head->b_state)) {
WARN_ON(!atomic); goto out_unlock;
}
bh = head; do { if (!buffer_mapped(bh))
all_mapped = 0; elseif (bh->b_blocknr == block) {
ret = bh;
get_bh(bh); goto out_unlock;
}
bh = bh->b_this_page;
} while (bh != head);
/* we might be here because some of the buffers on this page are * not mapped. This is due to various races between * file io on the block device and getblk. It gets dealt with * elsewhere, don't buffer_error if we had some unmapped buffers
*/
ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE); if (all_mapped && __ratelimit(&last_warned)) {
printk("__find_get_block_slow() failed. block=%llu, " "b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, " "device %pg blocksize: %d\n",
(unsignedlonglong)block,
(unsignedlonglong)bh->b_blocknr,
bh->b_state, bh->b_size, bdev,
1 << blkbits);
}
out_unlock: if (atomic)
spin_unlock(&bd_mapping->i_private_lock); else
folio_unlock(folio);
folio_put(folio);
out: return ret;
}
/* * Be _very_ careful from here on. Bad things can happen if * two buffer heads end IO at almost the same time and both * decide that the page is now completely done.
*/
first = folio_buffers(folio);
spin_lock_irqsave(&first->b_uptodate_lock, flags);
clear_buffer_async_read(bh);
unlock_buffer(bh);
tmp = bh; do { if (!buffer_uptodate(tmp))
folio_uptodate = 0; if (buffer_async_read(tmp)) {
BUG_ON(!buffer_locked(tmp)); goto still_busy;
}
tmp = tmp->b_this_page;
} while (tmp != bh);
spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
err = fscrypt_decrypt_pagecache_blocks(bh->b_folio, bh->b_size,
bh_offset(bh)); if (err == 0 && need_fsverity(bh)) { /* * We use different work queues for decryption and for verity * because verity may require reading metadata pages that need * decryption, and we shouldn't recurse to the same workqueue.
*/
INIT_WORK(&ctx->work, verify_bh);
fsverity_enqueue_verify_work(&ctx->work); return;
}
end_buffer_async_read(bh, err == 0);
kfree(ctx);
}
/* * I/O completion handler for block_read_full_folio() - pages * which come unlocked at the end of I/O.
*/ staticvoid end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
{ struct inode *inode = bh->b_folio->mapping->host; bool decrypt = fscrypt_inode_uses_fs_layer_crypto(inode); bool verify = need_fsverity(bh);
/* * Completion handler for block_write_full_folio() - folios which are unlocked * during I/O, and which have the writeback flag cleared upon I/O completion.
*/ staticvoid end_buffer_async_write(struct buffer_head *bh, int uptodate)
{ unsignedlong flags; struct buffer_head *first; struct buffer_head *tmp; struct folio *folio;
BUG_ON(!buffer_async_write(bh));
folio = bh->b_folio; if (uptodate) {
set_buffer_uptodate(bh);
} else {
buffer_io_error(bh, ", lost async page write");
mark_buffer_write_io_error(bh);
clear_buffer_uptodate(bh);
}
first = folio_buffers(folio);
spin_lock_irqsave(&first->b_uptodate_lock, flags);
/*
 * If a page's buffers are under async readin (end_buffer_async_read
 * completion) then there is a possibility that another thread of
 * control could lock one of the buffers after it has completed
 * but while some of the other buffers have not completed.  This
 * locked buffer would confuse end_buffer_async_read() into not unlocking
 * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
 * that this buffer is not under async I/O.
 *
 * The page comes unlocked when it has no locked buffer_async buffers
 * left.
 *
 * PageLocked prevents anyone starting new async I/O reads on any of
 * the buffers.
 *
 * PageWriteback is used to prevent simultaneous writeout of the same
 * page.
 *
 * PageLocked prevents anyone from starting writeback of a page which is
 * under read I/O (PageWriteback is only ever set against a locked page).
 *
 * mark_buffer_async_read() installs end_buffer_async_read_io() as the
 * completion handler of @bh and flags the buffer as being under async
 * read.
 */
static void mark_buffer_async_read(struct buffer_head *bh)
{
	bh->b_end_io = end_buffer_async_read_io;
	set_buffer_async_read(bh);
}
/* * fs/buffer.c contains helper functions for buffer-backed address space's * fsync functions. A common requirement for buffer-based filesystems is * that certain data from the backing blockdev needs to be written out for * a successful fsync(). For example, ext2 indirect blocks need to be * written back and waited upon before fsync() returns. * * The functions mark_buffer_dirty_inode(), fsync_inode_buffers(), * inode_has_buffers() and invalidate_inode_buffers() are provided for the * management of a list of dependent buffers at ->i_mapping->i_private_list. * * Locking is a little subtle: try_to_free_buffers() will remove buffers * from their controlling inode's queue when they are being freed. But * try_to_free_buffers() will be operating against the *blockdev* mapping * at the time, not against the S_ISREG file which depends on those buffers. * So the locking for i_private_list is via the i_private_lock in the address_space * which backs the buffers. Which is different from the address_space * against which the buffers are listed. So for a particular address_space, * mapping->i_private_lock does *not* protect mapping->i_private_list! In fact, * mapping->i_private_list will always be protected by the backing blockdev's * ->i_private_lock. * * Which introduces a requirement: all buffers on an address_space's * ->i_private_list must be from the same address_space: the blockdev's. * * address_spaces which do not place buffers at ->i_private_list via these * utility functions are free to use i_private_lock and i_private_list for * whatever they want. The only requirement is that list_empty(i_private_list) * be true at clear_inode() time. * * FIXME: clear_inode should not call invalidate_inode_buffers(). The * filesystems should do that. invalidate_inode_buffers() should just go * BUG_ON(!list_empty). * * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should * take an address_space, not an inode. 
And it should be called * mark_buffer_dirty_fsync() to clearly define why those buffers are being * queued up. * * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the * list if it is already on a list. Because if the buffer is on a list, * it *must* already be on the right one. If not, the filesystem is being * silly. This will save a ton of locking. But first we have to ensure * that buffers are taken *off* the old inode's list when they are freed * (presumably in truncate). That requires careful auditing of all * filesystems (do it inside bforget()). It could also be done by bringing * b_inode back.
*/
/*
 * Unlink @bh from the associated-buffers list it is queued on and clear
 * its b_assoc_map back-pointer.  Warns if the buffer had no associated
 * mapping recorded.
 *
 * The buffer's backing address_space's i_private_lock must be held
 */
static void __remove_assoc_queue(struct buffer_head *bh)
{
	list_del_init(&bh->b_assoc_buffers);
	WARN_ON(!bh->b_assoc_map);
	bh->b_assoc_map = NULL;
}
int inode_has_buffers(struct inode *inode)
{ return !list_empty(&inode->i_data.i_private_list);
}
/* * osync is designed to support O_SYNC io. It waits synchronously for * all already-submitted IO to complete, but does not queue any new * writes to the disk. * * To do O_SYNC writes, just queue the buffer writes with write_dirty_buffer * as you dirty the buffers, and then use osync_inode_buffers to wait for * completion. Any other dirty buffers which are not yet queued for * write will not be flushed to disk by the osync.
*/ staticint osync_buffers_list(spinlock_t *lock, struct list_head *list)
{ struct buffer_head *bh; struct list_head *p; int err = 0;
/** * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers * @mapping: the mapping which wants those buffers written * * Starts I/O against the buffers at mapping->i_private_list, and waits upon * that I/O. * * Basically, this is a convenience function for fsync(). * @mapping is a file or directory which needs those buffers to be written for * a successful fsync().
*/ int sync_mapping_buffers(struct address_space *mapping)
{ struct address_space *buffer_mapping = mapping->i_private_data;
if (buffer_mapping == NULL || list_empty(&mapping->i_private_list)) return 0;
/**
 * generic_buffers_fsync_noflush - generic buffer fsync implementation
 * for simple filesystems with no inode lock
 *
 * @file:	file to synchronize
 * @start:	start offset in bytes
 * @end:	end offset in bytes (inclusive)
 * @datasync:	only synchronize essential metadata if true
 *
 * This is a generic implementation of the fsync method for simple
 * filesystems which track all non-inode metadata in the buffers list
 * hanging off the address_space structure.
 *
 * Return: the error from writing and waiting on the data range if that
 * fails; otherwise the first error seen while syncing the associated
 * buffers, the inode metadata, or the file's writeback error state.
 */
int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end,
				  bool datasync)
{
	struct inode *inode = file->f_mapping->host;
	int ret;
	int err;

	err = file_write_and_wait_range(file, start, end);
	if (err)
		return err;

	ret = sync_mapping_buffers(inode->i_mapping);

	/*
	 * Inode metadata is written only if the inode is dirty at all and,
	 * for datasync, only if the dirtiness matters for data integrity.
	 */
	if ((inode->i_state & I_DIRTY_ALL) &&
	    (!datasync || (inode->i_state & I_DIRTY_DATASYNC))) {
		err = sync_inode_metadata(inode, 1);
		if (ret == 0)
			ret = err;
	}

	/* check and advance again to catch errors after syncing out buffers */
	err = file_check_and_advance_wb_err(file);
	if (ret == 0)
		ret = err;

	return ret;
}
EXPORT_SYMBOL(generic_buffers_fsync_noflush);
/**
 * generic_buffers_fsync - generic buffer fsync implementation
 * for simple filesystems with no inode lock
 *
 * @file:	file to synchronize
 * @start:	start offset in bytes
 * @end:	end offset in bytes (inclusive)
 * @datasync:	only synchronize essential metadata if true
 *
 * This is a generic implementation of the fsync method for simple
 * filesystems which track all non-inode metadata in the buffers list
 * hanging off the address_space structure. This also makes sure that
 * a device cache flush operation is called at the end.
 *
 * Return: the error from generic_buffers_fsync_noflush() if it fails
 * (no flush is issued in that case), otherwise the result of flushing
 * the superblock's block device.
 */
int generic_buffers_fsync(struct file *file, loff_t start, loff_t end,
			  bool datasync)
{
	struct inode *inode = file->f_mapping->host;
	int ret;

	ret = generic_buffers_fsync_noflush(file, start, end, datasync);
	if (ret)
		return ret;

	return blkdev_issue_flush(inode->i_sb->s_bdev);
}
EXPORT_SYMBOL(generic_buffers_fsync);
/*
 * Called when we've recently written block `bblock', and it is known that
 * `bblock' was for a buffer_boundary() buffer.  This means that the block at
 * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
 * dirty, schedule it for IO.  So that indirects merge nicely with their data.
 *
 * Nothing happens if no buffer for `bblock + 1' is found.
 */
void write_boundary_block(struct block_device *bdev,
			  sector_t bblock, unsigned blocksize)
{
	struct buffer_head *next_bh;

	next_bh = __find_get_block_nonatomic(bdev, bblock + 1, blocksize);
	if (!next_bh)
		return;

	if (buffer_dirty(next_bh))
		write_dirty_buffer(next_bh, 0);

	/* Drop the reference taken by the lookup. */
	put_bh(next_bh);
}
/** * block_dirty_folio - Mark a folio as dirty. * @mapping: The address space containing this folio. * @folio: The folio to mark dirty. * * Filesystems which use buffer_heads can use this function as their * ->dirty_folio implementation. Some filesystems need to do a little * work before calling this function. Filesystems which do not use * buffer_heads should call filemap_dirty_folio() instead. * * If the folio has buffers, the uptodate buffers are set dirty, to * preserve dirty-state coherency between the folio and the buffers. * Buffers added to a dirty folio are created dirty. * * The buffers are dirtied before the folio is dirtied. There's a small * race window in which writeback may see the folio cleanness but not the * buffer dirtiness. That's fine. If this code were to set the folio * dirty before the buffers, writeback could clear the folio dirty flag, * see a bunch of clean buffers and we'd end up with dirty buffers/clean * folio on the dirty folio list. * * We use i_private_lock to lock against try_to_free_buffers() while * using the folio's buffer list. This also prevents clean buffers * being added to the folio after it was set dirty. * * Context: May only be called from process context. Does not sleep. * Caller must ensure that @folio cannot be truncated during this call, * typically by holding the folio lock or having a page in the folio * mapped and holding the page table lock. * * Return: True if the folio was dirtied; false if it was already dirtied.
*/ bool block_dirty_folio(struct address_space *mapping, struct folio *folio)
{ struct buffer_head *head; bool newly_dirty;
spin_lock(&mapping->i_private_lock);
head = folio_buffers(folio); if (head) { struct buffer_head *bh = head;
do {
set_buffer_dirty(bh);
bh = bh->b_this_page;
} while (bh != head);
} /* * Lock out page's memcg migration to keep PageDirty * synchronized with per-memcg dirty page counters.
*/
newly_dirty = !folio_test_set_dirty(folio);
spin_unlock(&mapping->i_private_lock);
if (newly_dirty)
__folio_mark_dirty(folio, mapping, 1);
if (newly_dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
/* * Write out and wait upon a list of buffers. * * We have conflicting pressures: we want to make sure that all * initially dirty buffers get waited on, but that any subsequently * dirtied buffers don't. After all, we don't want fsync to last * forever if somebody is actively writing to the file. * * Do this in two main stages: first we copy dirty buffers to a * temporary inode list, queueing the writes as we go. Then we clean * up, waiting for those writes to complete. * * During this second stage, any subsequent updates to the file may end * up refiling the buffer on the original inode's dirty list again, so * there is a chance we will end up with a buffer queued for write but * not yet completed on that list. So, as a final cleanup we go through * the osync code to catch these locked, dirty buffers without requeuing * any newly dirty buffers for write.
*/ staticint fsync_buffers_list(spinlock_t *lock, struct list_head *list)
{ struct buffer_head *bh; struct address_space *mapping; int err = 0, err2; struct blk_plug plug;
LIST_HEAD(tmp);
blk_start_plug(&plug);
spin_lock(lock); while (!list_empty(list)) {
bh = BH_ENTRY(list->next);
mapping = bh->b_assoc_map;
__remove_assoc_queue(bh); /* Avoid race with mark_buffer_dirty_inode() which does
* a lockless check and we rely on seeing the dirty bit */
smp_mb(); if (buffer_dirty(bh) || buffer_locked(bh)) {
list_add(&bh->b_assoc_buffers, &tmp);
bh->b_assoc_map = mapping; if (buffer_dirty(bh)) {
get_bh(bh);
spin_unlock(lock); /* * Ensure any pending I/O completes so that * write_dirty_buffer() actually writes the * current contents - it is a noop if I/O is * still in flight on potentially older * contents.
*/
write_dirty_buffer(bh, REQ_SYNC);
/* * Kick off IO for the previous mapping. Note * that we will not run the very last mapping, * wait_on_buffer() will do that for us * through sync_buffer().
*/
brelse(bh);
spin_lock(lock);
}
}
}
while (!list_empty(&tmp)) {
bh = BH_ENTRY(tmp.prev);
get_bh(bh);
mapping = bh->b_assoc_map;
__remove_assoc_queue(bh); /* Avoid race with mark_buffer_dirty_inode() which does
* a lockless check and we rely on seeing the dirty bit */
smp_mb(); if (buffer_dirty(bh)) {
list_add(&bh->b_assoc_buffers,
&mapping->i_private_list);
bh->b_assoc_map = mapping;
}
spin_unlock(lock);
wait_on_buffer(bh); if (!buffer_uptodate(bh))
err = -EIO;
brelse(bh);
spin_lock(lock);
}
/*
 * Invalidate any and all dirty buffers on a given inode.  We are
 * probably unmounting the fs, but that doesn't mean we have already
 * done a sync().  Just drop the buffers from the inode list.
 *
 * NOTE: we take the inode's blockdev's mapping's i_private_lock.  Which
 * assumes that all the buffers are against the blockdev.
 */
void invalidate_inode_buffers(struct inode *inode)
{
	struct address_space *mapping;
	struct address_space *buffer_mapping;
	struct list_head *assoc_list;

	if (!inode_has_buffers(inode))
		return;

	mapping = &inode->i_data;
	assoc_list = &mapping->i_private_list;
	buffer_mapping = mapping->i_private_data;

	spin_lock(&buffer_mapping->i_private_lock);
	/* Pop entries off the front until the list is drained. */
	while (!list_empty(assoc_list))
		__remove_assoc_queue(BH_ENTRY(assoc_list->next));
	spin_unlock(&buffer_mapping->i_private_lock);
}
EXPORT_SYMBOL(invalidate_inode_buffers);
/* * Remove any clean buffers from the inode's buffer list. This is called * when we're trying to free the inode itself. Those buffers can pin it. * * Returns true if all buffers were removed.
*/ int remove_inode_buffers(struct inode *inode)
{ int ret = 1;
spin_lock(&buffer_mapping->i_private_lock); while (!list_empty(list)) { struct buffer_head *bh = BH_ENTRY(list->next); if (buffer_dirty(bh)) {
ret = 0; break;
}
__remove_assoc_queue(bh);
}
spin_unlock(&buffer_mapping->i_private_lock);
} return ret;
}
/* * Create the appropriate buffers when given a folio for data area and * the size of each buffer.. Use the bh->b_this_page linked list to * follow the buffers created. Return NULL if unable to create more * buffers. * * The retry flag is used to differentiate async IO (paging, swapping) * which may not fail from ordinary buffer allocations.
*/ struct buffer_head *folio_alloc_buffers(struct folio *folio, unsignedlong size,
gfp_t gfp)
{ struct buffer_head *bh, *head; long offset; struct mem_cgroup *memcg, *old_memcg;
/* The folio lock pins the memcg */
memcg = folio_memcg(folio);
old_memcg = set_active_memcg(memcg);
head = NULL;
offset = folio_size(folio); while ((offset -= size) >= 0) {
bh = alloc_buffer_head(gfp); if (!bh) goto no_grow;
bh->b_this_page = head;
bh->b_blocknr = -1;
head = bh;
bh->b_size = size;
/* Link the buffer to its folio */
folio_set_bh(bh, folio, offset);
}
out:
set_active_memcg(old_memcg); return head; /* * In case anything failed, we just free everything we got.
*/
no_grow: if (head) { do {
bh = head;
head = head->b_this_page;
free_buffer_head(bh);
} while (head);
}
do { if (!buffer_mapped(bh)) {
bh->b_end_io = NULL;
bh->b_private = NULL;
bh->b_bdev = bdev;
bh->b_blocknr = block; if (uptodate)
set_buffer_uptodate(bh); if (block < end_block)
set_buffer_mapped(bh);
}
block++;
bh = bh->b_this_page;
} while (bh != head);
/* * Caller needs to validate requested block against end of device.
*/ return end_block;
}
/* * Create the page-cache folio that contains the requested block. * * This is used purely for blockdev mappings. * * Returns false if we have a failure which cannot be cured by retrying * without sleeping. Returns true if we succeeded, or the caller should retry.
*/ staticbool grow_dev_folio(struct block_device *bdev, sector_t block,
pgoff_t index, unsigned size, gfp_t gfp)
{ struct address_space *mapping = bdev->bd_mapping; struct folio *folio; struct buffer_head *bh;
sector_t end_block = 0;
bh = folio_buffers(folio); if (bh) { if (bh->b_size == size) {
end_block = folio_init_buffers(folio, bdev, size); goto unlock;
}
/* * Retrying may succeed; for example the folio may finish * writeback, or buffers may be cleaned. This should not * happen very often; maybe we have old buffers attached to * this blockdev's page cache and we're trying to change * the block size?
*/ if (!try_to_free_buffers(folio)) {
end_block = ~0ULL; goto unlock;
}
}
/* * Link the folio to the buffers and initialise them. Take the * lock to be atomic wrt __find_get_block(), which does not * run under the folio lock.
*/
spin_lock(&mapping->i_private_lock);
link_dev_buffers(folio, bh);
end_block = folio_init_buffers(folio, bdev, size);
spin_unlock(&mapping->i_private_lock);
unlock:
folio_unlock(folio);
folio_put(folio); return block < end_block;
}
/* * Create buffers for the specified block device block's folio. If * that folio was dirty, the buffers are set dirty also. Returns false * if we've hit a permanent error.
*/ staticbool grow_buffers(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp)
{
loff_t pos;
/* * Check for a block which lies outside our maximum possible * pagecache index.
*/ if (check_mul_overflow(block, (sector_t)size, &pos) || pos > MAX_LFS_FILESIZE) {
printk(KERN_ERR "%s: requested out-of-range block %llu for device %pg\n",
__func__, (unsignedlonglong)block,
bdev); returnfalse;
}
/* Create a folio with the proper size buffers */ return grow_dev_folio(bdev, block, pos / PAGE_SIZE, size, gfp);
}
if (WARN_ON_ONCE(!IS_ALIGNED(size, bdev_logical_block_size(bdev)))) {
printk(KERN_ERR "getblk(): block size %d not aligned to logical block size %d\n",
size, bdev_logical_block_size(bdev)); return NULL;
}
for (;;) { struct buffer_head *bh;
if (!grow_buffers(bdev, block, size, gfp)) return NULL;
if (blocking)
bh = __find_get_block_nonatomic(bdev, block, size); else
bh = __find_get_block(bdev, block, size); if (bh) return bh;
}
}
/* * The relationship between dirty buffers and dirty pages: * * Whenever a page has any dirty buffers, the page's dirty bit is set, and * the page is tagged dirty in the page cache. * * At all times, the dirtiness of the buffers represents the dirtiness of * subsections of the page. If the page has buffers, the page dirty bit is * merely a hint about the true dirty state. * * When a page is set dirty in its entirety, all its buffers are marked dirty * (if the page has buffers). * * When a buffer is marked dirty, its page is dirtied, but the page's other * buffers are not. * * Also. When blockdev buffers are explicitly read with bread(), they * individually become uptodate. But their backing page remains not * uptodate - even if all of its buffers are uptodate. A subsequent * block_read_full_folio() against that folio will discover all the uptodate * buffers, will set the folio uptodate and will perform no I/O.
*/
/** * mark_buffer_dirty - mark a buffer_head as needing writeout * @bh: the buffer_head to mark dirty * * mark_buffer_dirty() will set the dirty bit against the buffer, then set * its backing page dirty, then tag the page as dirty in the page cache * and then attach the address_space's inode to its superblock's dirty * inode list. * * mark_buffer_dirty() is atomic. It takes bh->b_folio->mapping->i_private_lock, * i_pages lock and mapping->host->i_lock.
*/ void mark_buffer_dirty(struct buffer_head *bh)
{
WARN_ON_ONCE(!buffer_uptodate(bh));
trace_block_dirty_buffer(bh);
/* * Very *carefully* optimize the it-is-already-dirty case. * * Don't let the final "is it dirty" escape to before we * perhaps modified the buffer.
*/ if (buffer_dirty(bh)) {
smp_mb(); if (buffer_dirty(bh)) return;
}
if (!folio_test_set_dirty(folio)) {
mapping = folio->mapping; if (mapping)
__folio_mark_dirty(folio, mapping, 0);
} if (mapping)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
}
EXPORT_SYMBOL(mark_buffer_dirty);
void mark_buffer_write_io_error(struct buffer_head *bh)
{
set_buffer_write_io_error(bh); /* FIXME: do we need to set this in both places? */ if (bh->b_folio && bh->b_folio->mapping)
mapping_set_error(bh->b_folio->mapping, -EIO); if (bh->b_assoc_map)
mapping_set_error(bh->b_assoc_map, -EIO);
}
EXPORT_SYMBOL(mark_buffer_write_io_error);
/**
 * __brelse - Release a buffer.
 * @bh: The buffer to release.
 *
 * This variant of brelse() can be called if @bh is guaranteed to not be NULL.
 *
 * Drops one reference on @bh.  If the reference count is already zero no
 * reference is dropped and a warning is emitted instead.
 */
void __brelse(struct buffer_head *bh)
{
	if (!atomic_read(&bh->b_count)) {
		WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
		return;
	}

	put_bh(bh);
}
EXPORT_SYMBOL(__brelse);
/** * __bforget - Discard any dirty data in a buffer. * @bh: The buffer to forget. * * This variant of bforget() can be called if @bh is guaranteed to not * be NULL.
*/ void __bforget(struct buffer_head *bh)
{
clear_buffer_dirty(bh); if (bh->b_assoc_map) { struct address_space *buffer_mapping = bh->b_folio->mapping;
/* * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block(). * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their * refcount elevated by one when they're in an LRU. A buffer can only appear * once in a particular CPU's LRU. A single buffer can be present in multiple * CPU's LRUs at the same time. * * This is a transparent caching front-end to sb_bread(), sb_getblk() and * sb_find_get_block(). * * The LRUs themselves only need locking against invalidate_bh_lrus. We use * a local interrupt disable for that.
*/
/*
 * Install a buffer_head into this cpu's LRU.  If not already in the LRU, it is
 * inserted at the front, and the buffer_head at the back if any is evicted.
 * Or, if already in the LRU it is moved to the front.
 *
 * Nothing is installed while the LRU cache is disabled or when the
 * current CPU is isolated.
 */
static void bh_lru_install(struct buffer_head *bh)
{
	struct buffer_head *evictee = bh;
	struct bh_lru *b;
	int i;

	check_irqs_on();
	bh_lru_lock();

	/*
	 * the refcount of buffer_head in bh_lru prevents dropping the
	 * attached page(i.e., try_to_free_buffers) so it could cause
	 * failing page migration.
	 * Skip putting upcoming bh into bh_lru until migration is done.
	 */
	if (lru_cache_disabled() || cpu_is_isolated(smp_processor_id())) {
		bh_lru_unlock();
		return;
	}

	b = this_cpu_ptr(&bh_lrus);
	for (i = 0; i < BH_LRU_SIZE; i++) {
		/*
		 * Each swap pushes the previous occupant one slot further
		 * back.  If the entry just displaced is @bh itself, it was
		 * already in the LRU and has now been moved to the front:
		 * no reference change is needed.
		 */
		swap(evictee, b->bhs[i]);
		if (evictee == bh) {
			bh_lru_unlock();
			return;
		}
	}

	/* @bh is newly installed: the LRU holds its own reference. */
	get_bh(bh);
	bh_lru_unlock();
	/*
	 * Drop the reference of whatever fell off the back.
	 * NOTE(review): evictee is presumably NULL when the LRU was not yet
	 * full; brelse() is expected to tolerate that - confirm.
	 */
	brelse(evictee);
}
/* * Look up the bh in this cpu's LRU. If it's there, move it to the head.
*/ staticstruct buffer_head *
lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
{ struct buffer_head *ret = NULL; unsignedint i;
check_irqs_on();
bh_lru_lock(); if (cpu_is_isolated(smp_processor_id())) {
bh_lru_unlock(); return NULL;
} for (i = 0; i < BH_LRU_SIZE; i++) { struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
/* * Perform a pagecache lookup for the matching buffer. If it's there, refresh * it in the LRU and mark it as accessed. If it is not present then return * NULL. Atomic context callers may also return NULL if the buffer is being * migrated; similarly the page is not marked accessed either.
*/ staticstruct buffer_head *
find_get_block_common(struct block_device *bdev, sector_t block, unsigned size, bool atomic)
{ struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
if (bh == NULL) { /* __find_get_block_slow will mark the page accessed */
bh = __find_get_block_slow(bdev, block, atomic); if (bh)
bh_lru_install(bh);
} else
touch_buffer(bh);
/*
 * Same as __find_get_block() but allows sleeping contexts: the lookup is
 * delegated to find_get_block_common() with its atomic argument false.
 */
struct buffer_head *
__find_get_block_nonatomic(struct block_device *bdev, sector_t block,
			   unsigned size)
{
	const bool atomic = false;

	return find_get_block_common(bdev, block, size, atomic);
}
EXPORT_SYMBOL(__find_get_block_nonatomic);
/** * bdev_getblk - Get a buffer_head in a block device's buffer cache. * @bdev: The block device. * @block: The block number. * @size: The size of buffer_heads for this @bdev. * @gfp: The memory allocation flags to use. * * The returned buffer head has its reference count incremented, but is * not locked. The caller should call brelse() when it has finished * with the buffer. The buffer may not be uptodate. If needed, the * caller can bring it uptodate either by reading it or overwriting it. * * Return: The buffer head, or NULL if memory could not be allocated.
*/ struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp)
{ struct buffer_head *bh;
/*
 * Do async read-ahead on a buffer.
 *
 * The buffer is looked up with GFP_NOWAIT | __GFP_MOVABLE; if none is
 * obtained the read-ahead is silently skipped.  Otherwise a REQ_RAHEAD
 * read is started and the lookup reference is released again.
 */
void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
{
	struct buffer_head *bh;

	bh = bdev_getblk(bdev, block, size, GFP_NOWAIT | __GFP_MOVABLE);
	if (unlikely(!bh))
		return;

	bh_readahead(bh, REQ_RAHEAD);
	brelse(bh);
}
EXPORT_SYMBOL(__breadahead);
/** * __bread_gfp() - Read a block. * @bdev: The block device to read from. * @block: Block number in units of block size. * @size: The block size of this device in bytes. * @gfp: Not page allocation flags; see below. * * You are not expected to call this function. You should use one of * sb_bread(), sb_bread_unmovable() or __bread(). * * Read a specified block, and return the buffer head that refers to it. * If @gfp is 0, the memory will be allocated using the block device's * default GFP flags. If @gfp is __GFP_MOVABLE, the memory may be * allocated from a movable area. Do not pass in a complete set of * GFP flags. * * The returned buffer head has its refcount increased. The caller should * call brelse() when it has finished with the buffer. * * Context: May sleep waiting for I/O. * Return: NULL if the block was unreadable.
*/ struct buffer_head *__bread_gfp(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp)
{ struct buffer_head *bh;
/*
 * Release the reference held on every buffer_head stored in the LRU @b
 * and reset each slot to NULL.
 */
static void __invalidate_bh_lrus(struct bh_lru *b)
{
	int i;

	for (i = 0; i < BH_LRU_SIZE; i++) {
		brelse(b->bhs[i]);
		b->bhs[i] = NULL;
	}
}
/*
 * invalidate_bh_lrus() is called rarely - but not only at unmount.
 * This doesn't race because it runs in each cpu either in irq
 * or with preempt disabled.
*/ staticvoid invalidate_bh_lru(void *arg)
{ struct bh_lru *b = &get_cpu_var(bh_lrus);
/* * It's called from workqueue context so we need a bh_lru_lock to close * the race with preemption/irq.
*/ void invalidate_bh_lrus_cpu(void)
{ struct bh_lru *b;
bh_lru_lock();
b = this_cpu_ptr(&bh_lrus);
__invalidate_bh_lrus(b);
bh_lru_unlock();
}
lock_buffer(bh);
clear_buffer_dirty(bh);
bh->b_bdev = NULL;
b_state = READ_ONCE(bh->b_state); do {
} while (!try_cmpxchg_relaxed(&bh->b_state, &b_state,
b_state & ~BUFFER_FLAGS_DISCARD));
unlock_buffer(bh);
}
/**
 * block_invalidate_folio - Invalidate part or all of a buffer-backed folio.
 * @folio: The folio which is affected.
 * @offset: start of the range to invalidate
 * @length: length of the range to invalidate
 *
 * block_invalidate_folio() is called when all or part of the folio has been
 * invalidated by a truncate operation.
 *
 * block_invalidate_folio() does not have to release all buffers, but it must
 * ensure that no dirty buffer is left outside @offset and that no I/O
 * is underway against any of the blocks which are outside the truncation
 * point.  Because the caller is about to free (and possibly reuse) those
 * blocks on-disk.
 *
 * A folio without buffers is left untouched.
 */
void block_invalidate_folio(struct folio *folio, size_t offset, size_t length)
{
	struct buffer_head *head, *bh, *next;
	size_t curr_off = 0;
	size_t stop = length + offset;

	/* Nothing to discard if no buffers are attached. */
	head = folio_buffers(folio);
	if (!head)
		return;

	bh = head;
	do {
		size_t next_off = curr_off + bh->b_size;

		/* Fetch the successor before bh may be discarded. */
		next = bh->b_this_page;

		/*
		 * Are we still fully in range ?
		 */
		if (next_off > stop)
			goto out;

		/*
		 * is this block fully invalidated?
		 */
		if (offset <= curr_off)
			discard_buffer(bh);
		curr_off = next_off;
		bh = next;
	} while (bh != head);

	/*
	 * We release buffers only if the entire folio is being invalidated.
	 * The get_block cached value has been unconditionally invalidated,
	 * so real IO is not possible anymore.
	 */
	if (length == folio_size(folio))
		filemap_release_folio(folio, 0);
out:
	folio_clear_mappedtodisk(folio);
}
EXPORT_SYMBOL(block_invalidate_folio);
/* * We attach and possibly dirty the buffers atomically wrt * block_dirty_folio() via i_private_lock. try_to_free_buffers * is already excluded via the folio lock.
*/ struct buffer_head *create_empty_buffers(struct folio *folio, unsignedlong blocksize, unsignedlong b_state)
{ struct buffer_head *bh, *head, *tail;
gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT | __GFP_NOFAIL;
head = folio_alloc_buffers(folio, blocksize, gfp);
bh = head; do {
bh->b_state |= b_state;
tail = bh;
bh = bh->b_this_page;
} while (bh);
tail->b_this_page = head;
spin_lock(&folio->mapping->i_private_lock); if (folio_test_uptodate(folio) || folio_test_dirty(folio)) {
bh = head; do { if (folio_test_dirty(folio))
set_buffer_dirty(bh); if (folio_test_uptodate(folio))
set_buffer_uptodate(bh);
bh = bh->b_this_page;
} while (bh != head);
}
folio_attach_private(folio, head);
spin_unlock(&folio->mapping->i_private_lock);
/**
 * clean_bdev_aliases: clean a range of buffers in block device
 * @bdev: Block device to clean buffers in
 * @block: Start of a range of blocks to clean
 * @len: Number of blocks to clean
 *
 * We are taking a range of blocks for data and we don't want writeback of any
 * buffer-cache aliases starting from return from this function and until the
 * moment when something will explicitly mark the buffer dirty (hopefully that
 * will not happen until we will free that block ;-) We don't even need to mark
 * it not-uptodate - nobody can expect anything from a newly allocated buffer
 * anyway. We used to use unmap_buffer() for such invalidation, but that was
 * wrong. We definitely don't want to mark the alias unmapped, for example - it
 * would confuse anyone who might pick it with bread() afterwards...
 *
 * Also..  Note that bforget() doesn't lock the buffer.  So there can be
 * writeout I/O going on against recently-freed buffers.  We don't wait on that
 * I/O in bforget() - it's more efficient to wait on the I/O only if we really
 * need to.  That happens here.
 */
void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
{
	struct address_space *bd_mapping = bdev->bd_mapping;
	const int blkbits = bd_mapping->host->i_blkbits;
	struct folio_batch fbatch;
	/* Page-cache indices of the first and last block in the range. */
	pgoff_t index = ((loff_t)block << blkbits) / PAGE_SIZE;
	pgoff_t end;
	int i, count;
	struct buffer_head *bh;
	struct buffer_head *head;

	end = ((loff_t)(block + len - 1) << blkbits) / PAGE_SIZE;
	folio_batch_init(&fbatch);
	while (filemap_get_folios(bd_mapping, &index, end, &fbatch)) {
		count = folio_batch_count(&fbatch);
		for (i = 0; i < count; i++) {
			struct folio *folio = fbatch.folios[i];

			if (!folio_buffers(folio))
				continue;
			/*
			 * We use folio lock instead of bd_mapping->i_private_lock
			 * to pin buffers here since we can afford to sleep and
			 * it scales better than a global spinlock lock.
			 */
			folio_lock(folio);
			/* Recheck when the folio is locked which pins bhs */
			head = folio_buffers(folio);
			if (!head)
				goto unlock_page;
			bh = head;
			do {
				/* Skip unmapped buffers and those below the range. */
				if (!buffer_mapped(bh) || (bh->b_blocknr < block))
					goto next;
				/* Past the end of the range: done with this folio. */
				if (bh->b_blocknr >= block + len)
					break;
				clear_buffer_dirty(bh);
				wait_on_buffer(bh);
				clear_buffer_req(bh);
next:
				bh = bh->b_this_page;
			} while (bh != head);
unlock_page:
			folio_unlock(folio);
		}
		folio_batch_release(&fbatch);
		cond_resched();
		/* End of range already reached? */
		if (index > end || !index)
			break;
	}
}
EXPORT_SYMBOL(clean_bdev_aliases);
/*
 * NOTE! All mapped/uptodate combinations are valid:
 *
 *	Mapped	Uptodate	Meaning
 *
 *	No	No		"unknown" - must do get_block()
 *	No	Yes		"hole" - zero-filled
 *	Yes	No		"allocated" - allocated on disk, not read in
 *	Yes	Yes		"valid" - allocated and up-to-date in memory.
 *
 * "Dirty" is valid only with the last case (mapped+uptodate).
 */
/*
 * Write out the dirty buffers of a folio.
 *
 * While block_write_full_folio is writing back the dirty buffers under
 * the page lock, whoever dirtied the buffers may decide to clean them
 * again at any time.  We handle that by only looking at the buffer
 * state inside lock_buffer().
 *
 * If block_write_full_folio() is called for regular writeback
 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
 * locked buffer.   This only can happen if someone has written the buffer
 * directly, with submit_bh().  At the address_space level PageWriteback
 * prevents this contention from occurring.
 *
 * If block_write_full_folio() is called with wbc->sync_mode ==
 * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this
 * causes the writes to be flagged as synchronous writes.
 *
 * @inode:     passed to get_block(); its i_write_hint is handed to
 *             submit_bh_wbc().
 * @folio:     folio whose buffers are written; it is unlocked on both the
 *             normal and the recovery path before returning.
 * @get_block: mapping callback, invoked as get_block(inode, block, bh, 1)
 *             for dirty buffers that are unmapped or delayed.
 * @wbc:       writeback control; selects blocking vs. trylock behaviour and
 *             supplies the write flags.
 *
 * Return: 0 on the normal path, otherwise the non-zero value returned by
 * get_block() (after the recovery path has run).
 *
 * NOTE(review): in this copy head, bh, block, last_block and blocksize are
 * read without ever being assigned - the setup code that should precede the
 * first loop appears to be missing.  The two "elseif" tokens are also not
 * valid C (presumably a collapsed "else if").  Confirm against the pristine
 * source before relying on this function.
 */
int __block_write_full_folio(struct inode *inode, struct folio *folio,
			get_block_t *get_block, struct writeback_control *wbc)
{
	int err;
	sector_t block;
	sector_t last_block;
	struct buffer_head *bh, *head;
	size_t blocksize;
	int nr_underway = 0;	/* number of buffers submitted for write */
	blk_opf_t write_flags = wbc_to_write_flags(wbc);

	/*
	 * Be very careful.  We have no exclusion from block_dirty_folio
	 * here, and the (potentially unmapped) buffers may become dirty at
	 * any time.  If a buffer becomes dirty here after we've inspected it
	 * then we just miss that fact, and the folio stays dirty.
	 *
	 * Buffers outside i_size may be dirtied by block_dirty_folio;
	 * handle that here by just cleaning them.
	 */

	/*
	 * Get all the dirty buffers mapped to disk addresses and
	 * handle any aliases from the underlying blockdev's mapping.
	 */
	do {
		if (block > last_block) {
			/*
			 * mapped buffers outside i_size will occur, because
			 * this folio can be outside i_size when there is a
			 * truncate in progress.
			 */
			/*
			 * The buffer was zeroed by block_write_full_folio()
			 */
			clear_buffer_dirty(bh);
			set_buffer_uptodate(bh);
		} elseif ((!buffer_mapped(bh) || buffer_delay(bh)) &&
			   buffer_dirty(bh)) {
			WARN_ON(bh->b_size != blocksize);
			err = get_block(inode, block, bh, 1);
			if (err)
				goto recover;
			clear_buffer_delay(bh);
			if (buffer_new(bh)) {
				/* blockdev mappings never come here */
				clear_buffer_new(bh);
				clean_bdev_bh_alias(bh);
			}
		}
		bh = bh->b_this_page;
		block++;
	} while (bh != head);

	/*
	 * Second pass: lock each mapped buffer and mark the dirty ones for
	 * async write.  The loop above only exits with bh == head, so this
	 * walk starts at the head of the ring again.
	 */
	do {
		if (!buffer_mapped(bh))
			continue;
		/*
		 * If it's a fully non-blocking write attempt and we cannot
		 * lock the buffer then redirty the folio.  Note that this can
		 * potentially cause a busy-wait loop from writeback threads
		 * and kswapd activity, but those code paths have their own
		 * higher-level throttling.
		 */
		if (wbc->sync_mode != WB_SYNC_NONE) {
			lock_buffer(bh);
		} elseif (!trylock_buffer(bh)) {
			folio_redirty_for_writepage(wbc, folio);
			continue;
		}
		if (test_clear_buffer_dirty(bh)) {
			mark_buffer_async_write_endio(bh,
				end_buffer_async_write);
		} else {
			/* Buffer turned out clean: nothing to write. */
			unlock_buffer(bh);
		}
	} while ((bh = bh->b_this_page) != head);

	/*
	 * The folio and its buffers are protected by the writeback flag,
	 * so we can drop the bh refcounts early.
	 */
	BUG_ON(folio_test_writeback(folio));
	folio_start_writeback(folio);

	/* Third pass: submit every buffer marked async-write above. */
	do {
		/* Fetch the successor before the buffer is submitted. */
		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {
			submit_bh_wbc(REQ_OP_WRITE | write_flags, bh,
				      inode->i_write_hint, wbc);
			nr_underway++;
		}
		bh = next;
	} while (bh != head);
	folio_unlock(folio);

	err = 0;
done:
	if (nr_underway == 0) {
		/*
		 * The folio was marked dirty, but the buffers were
		 * clean.  Someone wrote them back by hand with
		 * write_dirty_buffer/submit_bh.  A rare case.
		 */
		folio_end_writeback(folio);

		/*
		 * The folio and buffer_heads can be released at any time from
		 * here on.
		 */
	}
	return err;

recover:
	/*
	 * ENOSPC, or some other error.  We may already have added some
	 * blocks to the file, so we need to write these out to avoid
	 * exposing stale data.
	 * The folio is currently locked and not marked for writeback
	 */
	bh = head;
	/* Recovery: lock and submit the mapped buffers */
	do {
		if (buffer_mapped(bh) && buffer_dirty(bh) &&
		    !buffer_delay(bh)) {
			lock_buffer(bh);
			mark_buffer_async_write_endio(bh,
				end_buffer_async_write);
		} else {
			/*
			 * The buffer may have been set dirty during
			 * attachment to a dirty folio.
			 */
			clear_buffer_dirty(bh);
		}
	} while ((bh = bh->b_this_page) != head);
	BUG_ON(folio_test_writeback(folio));
	/* Record the get_block() failure on the mapping. */
	mapping_set_error(folio->mapping, err);
	folio_start_writeback(folio);
	do {
		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {
			clear_buffer_dirty(bh);
			submit_bh_wbc(REQ_OP_WRITE | write_flags, bh,
				      inode->i_write_hint, wbc);
			nr_underway++;
		}
		bh = next;
	} while (bh != head);
	folio_unlock(folio);
	/* Share the "nothing submitted" handling and the return of err. */
	goto done;
}
EXPORT_SYMBOL(__block_write_full_folio);
/* * If a folio has any new buffers, zero them out here, and mark them uptodate * and dirty so they'll be written out (in order to prevent uninitialised * block data from leaking). And clear the new bit.
*/ void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to)
{
size_t block_start, block_end; struct buffer_head *head, *bh;
BUG_ON(!folio_test_locked(folio));
head = folio_buffers(folio); if (!head) return;
/* * Block points to offset in file we need to map, iomap contains * the offset at which the map starts. If the map ends before the * current block, then do not map the buffer and let the caller * handle it.
*/ if (offset >= iomap->offset + iomap->length) return -EIO;
switch (iomap->type) { case IOMAP_HOLE: /* * If the buffer is not up to date or beyond the current EOF, * we need to mark it as new to ensure sub-block zeroing is * executed if necessary.
*/ if (!buffer_uptodate(bh) ||
(offset >= i_size_read(inode)))
set_buffer_new(bh); return 0; case IOMAP_DELALLOC: if (!buffer_uptodate(bh) ||
(offset >= i_size_read(inode)))
set_buffer_new(bh);
set_buffer_uptodate(bh);
set_buffer_mapped(bh);
set_buffer_delay(bh); return 0; case IOMAP_UNWRITTEN: /* * For unwritten regions, we always need to ensure that regions * in the block we are not writing to are zeroed. Mark the * buffer as new to ensure this.
*/
set_buffer_new(bh);
set_buffer_unwritten(bh);
fallthrough; case IOMAP_MAPPED: if ((iomap->flags & IOMAP_F_NEW) ||
offset >= i_size_read(inode)) { /* * This can happen if truncating the block device races * with the check in the caller as i_size updates on * block devices aren't synchronized by i_rwsem for * block devices.
*/ if (S_ISBLK(inode->i_mode)) return -EIO;
set_buffer_new(bh);
}
bh->b_blocknr = (iomap->addr + offset - iomap->offset) >>
inode->i_blkbits;
set_buffer_mapped(bh); return 0; default:
WARN_ON_ONCE(1); return -EIO;
}
}
int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
get_block_t *get_block, conststruct iomap *iomap)
{
size_t from = offset_in_folio(folio, pos);
size_t to = from + len; struct inode *inode = folio->mapping->host;
size_t block_start, block_end;
sector_t block; int err = 0;
size_t blocksize; struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
/* * If this is a partial write which happened to make all buffers * uptodate then we can optimize away a bogus read_folio() for * the next read(). Here we 'discover' whether the folio went * uptodate as a result of this (potentially partial) write.
*/ if (!partial)
folio_mark_uptodate(folio);
}
EXPORT_SYMBOL(block_commit_write);
/*
 * block_write_begin takes care of the basic task of block allocation and
 * bringing partial write blocks uptodate first.
 *
 * The filesystem needs to handle block truncation upon failure.
 *
 * @mapping:   address space the folio is looked up in.
 * @pos:       file position of the write; selects the page-cache index.
 * @len:       length of the write in bytes.
 * @foliop:    out parameter; receives the folio on success, or NULL when
 *             preparing the buffers failed.  It is not written when the
 *             folio lookup itself fails.
 * @get_block: block-mapping callback forwarded to __block_write_begin_int().
 *
 * Return: 0 on success, the PTR_ERR() of the folio lookup if that failed,
 * or the status returned by __block_write_begin_int().
 */
int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
		struct folio **foliop, get_block_t *get_block)
{
	pgoff_t index = pos >> PAGE_SHIFT;
	struct folio *folio;
	int status;

	folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
			mapping_gfp_mask(mapping));
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	status = __block_write_begin_int(folio, pos, len, get_block, NULL);
	if (unlikely(status)) {
		/* Preparation failed: unlock and drop the folio again. */
		folio_unlock(folio);
		folio_put(folio);
		folio = NULL;
	}

	/* Hand the folio (or NULL on failure) back to the caller. */
	*foliop = folio;
	return status;
}
if (unlikely(copied < len)) { /* * The buffers that were written will now be uptodate, so * we don't have to worry about a read_folio reading them * and overwriting a partial write. However if we have * encountered a short write and only partially written * into a buffer, it will not be marked uptodate, so a * read_folio might come in and destroy our partial write. * * Do the simplest thing, and just treat any short write to a * non uptodate folio as a zero-length write, and force the * caller to redo the whole thing.
*/ if (!folio_test_uptodate(folio))
copied = 0;
/* * No need to use i_size_read() here, the i_size cannot change under us * because we hold i_rwsem. * * But it's important to update i_size while still holding folio lock: * page writeout could otherwise come in and zero beyond i_size.
*/ if (pos + copied > inode->i_size) {
i_size_write(inode, pos + copied);
i_size_changed = true;
}
folio_unlock(folio);
folio_put(folio);
if (old_size < pos)
pagecache_isize_extended(inode, old_size, pos); /* * Don't mark the inode dirty under page lock. First, it unnecessarily * makes the holding time of page lock longer. Second, it forces lock * ordering of page lock and transaction start for journaling * filesystems.
*/ if (i_size_changed)
mark_inode_dirty(inode); return copied;
}
EXPORT_SYMBOL(generic_write_end);
/*
 * block_is_partially_uptodate checks whether buffers within a folio are
 * uptodate or not.
 *
 * @folio: folio whose buffers are inspected.
 * @from:  byte offset in the folio where the range starts.
 * @count: length of the range; it is clamped to the end of the folio.
 *
 * Returns true if all buffers which correspond to the specified part
 * of the folio are uptodate.  Returns false when the folio has no buffers,
 * and also - without looking at any buffer - when the range reaches into
 * both the first and the last block of the folio.
 */
bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
{
	unsigned block_start, block_end, blocksize;
	unsigned to;
	struct buffer_head *bh, *head;
	bool ret = true;

	head = folio_buffers(folio);
	if (!head)
		return false;
	blocksize = head->b_size;
	/* Clamp the range so it does not run past the end of the folio. */
	to = min_t(unsigned, folio_size(folio) - from, count);
	to = from + to;
	if (from < blocksize && to > folio_size(folio) - blocksize)
		return false;

	bh = head;
	block_start = 0;
	do {
		block_end = block_start + blocksize;
		/* Only buffers overlapping [from, to) are of interest. */
		if (block_end > from && block_start < to) {
			if (!buffer_uptodate(bh)) {
				ret = false;
				break;
			}
			/* This buffer covers the end of the range: done. */
			if (block_end >= to)
				break;
		}
		block_start = block_end;
		bh = bh->b_this_page;
	} while (bh != head);

	return ret;
}
/*
 * Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig
 * zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
 * noch Qualität der bereitgestellten Informationen zugesichert.
 *
 * Bemerkung: Die farbliche Syntaxdarstellung ist noch experimentell.
 */