source: src/linux/universal/linux-3.18/fs/xfs/xfs_aops.c @ 31869

Last change on this file since 31869 was 31869, checked in by brainslayer, 5 weeks ago

update

File size: 49.4 KB
Line 
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17 */
18#include "xfs.h"
19#include "xfs_shared.h"
20#include "xfs_format.h"
21#include "xfs_log_format.h"
22#include "xfs_trans_resv.h"
23#include "xfs_sb.h"
24#include "xfs_ag.h"
25#include "xfs_mount.h"
26#include "xfs_inode.h"
27#include "xfs_trans.h"
28#include "xfs_inode_item.h"
29#include "xfs_alloc.h"
30#include "xfs_error.h"
31#include "xfs_iomap.h"
32#include "xfs_trace.h"
33#include "xfs_bmap.h"
34#include "xfs_bmap_util.h"
35#include "xfs_bmap_btree.h"
36#include "xfs_dinode.h"
37#include <linux/aio.h>
38#include <linux/gfp.h>
39#include <linux/mpage.h>
40#include <linux/pagevec.h>
41#include <linux/writeback.h>
42
43void
44xfs_count_page_state(
45        struct page             *page,
46        int                     *delalloc,
47        int                     *unwritten)
48{
49        struct buffer_head      *bh, *head;
50
51        *delalloc = *unwritten = 0;
52
53        bh = head = page_buffers(page);
54        do {
55                if (buffer_unwritten(bh))
56                        (*unwritten) = 1;
57                else if (buffer_delay(bh))
58                        (*delalloc) = 1;
59        } while ((bh = bh->b_this_page) != head);
60}
61
62STATIC struct block_device *
63xfs_find_bdev_for_inode(
64        struct inode            *inode)
65{
66        struct xfs_inode        *ip = XFS_I(inode);
67        struct xfs_mount        *mp = ip->i_mount;
68
69        if (XFS_IS_REALTIME_INODE(ip))
70                return mp->m_rtdev_targp->bt_bdev;
71        else
72                return mp->m_ddev_targp->bt_bdev;
73}
74
75/*
76 * We're now finished for good with this ioend structure.
77 * Update the page state via the associated buffer_heads,
78 * release holds on the inode and bio, and finally free
79 * up memory.  Do not use the ioend after this.
80 */
81STATIC void
82xfs_destroy_ioend(
83        xfs_ioend_t             *ioend)
84{
85        struct buffer_head      *bh, *next;
86
87        for (bh = ioend->io_buffer_head; bh; bh = next) {
88                next = bh->b_private;
89                bh->b_end_io(bh, !ioend->io_error);
90        }
91
92        mempool_free(ioend, xfs_ioend_pool);
93}
94
95/*
96 * Fast and loose check if this write could update the on-disk inode size.
97 */
98static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
99{
100        return ioend->io_offset + ioend->io_size >
101                XFS_I(ioend->io_inode)->i_d.di_size;
102}
103
104STATIC int
105xfs_setfilesize_trans_alloc(
106        struct xfs_ioend        *ioend)
107{
108        struct xfs_mount        *mp = XFS_I(ioend->io_inode)->i_mount;
109        struct xfs_trans        *tp;
110        int                     error;
111
112        tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
113
114        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
115        if (error) {
116                xfs_trans_cancel(tp, 0);
117                return error;
118        }
119
120        ioend->io_append_trans = tp;
121
122        /*
123         * We may pass freeze protection with a transaction.  So tell lockdep
124         * we released it.
125         */
126        rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
127                      1, _THIS_IP_);
128        /*
129         * We hand off the transaction to the completion thread now, so
130         * clear the flag here.
131         */
132        current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
133        return 0;
134}
135
136/*
137 * Update on-disk file size now that data has been written to disk.
138 */
139STATIC int
140xfs_setfilesize(
141        struct xfs_ioend        *ioend)
142{
143        struct xfs_inode        *ip = XFS_I(ioend->io_inode);
144        struct xfs_trans        *tp = ioend->io_append_trans;
145        xfs_fsize_t             isize;
146
147        /*
148         * The transaction may have been allocated in the I/O submission thread,
149         * thus we need to mark ourselves as beeing in a transaction manually.
150         * Similarly for freeze protection.
151         */
152        current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
153        rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
154                           0, 1, _THIS_IP_);
155
156        /* we abort the update if there was an IO error */
157        if (ioend->io_error) {
158                xfs_trans_cancel(tp, 0);
159                return ioend->io_error;
160        }
161
162        xfs_ilock(ip, XFS_ILOCK_EXCL);
163        isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size);
164        if (!isize) {
165                xfs_iunlock(ip, XFS_ILOCK_EXCL);
166                xfs_trans_cancel(tp, 0);
167                return 0;
168        }
169
170        trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
171
172        ip->i_d.di_size = isize;
173        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
174        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
175
176        return xfs_trans_commit(tp, 0);
177}
178
179/*
180 * Schedule IO completion handling on the final put of an ioend.
181 *
182 * If there is no work to do we might as well call it a day and free the
183 * ioend right now.
184 */
185STATIC void
186xfs_finish_ioend(
187        struct xfs_ioend        *ioend)
188{
189        if (atomic_dec_and_test(&ioend->io_remaining)) {
190                struct xfs_mount        *mp = XFS_I(ioend->io_inode)->i_mount;
191
192                if (ioend->io_type == XFS_IO_UNWRITTEN)
193                        queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
194                else if (ioend->io_append_trans ||
195                         (ioend->io_isdirect && xfs_ioend_is_append(ioend)))
196                        queue_work(mp->m_data_workqueue, &ioend->io_work);
197                else
198                        xfs_destroy_ioend(ioend);
199        }
200}
201
202/*
203 * IO write completion.
204 */
205STATIC void
206xfs_end_io(
207        struct work_struct *work)
208{
209        xfs_ioend_t     *ioend = container_of(work, xfs_ioend_t, io_work);
210        struct xfs_inode *ip = XFS_I(ioend->io_inode);
211        int             error = 0;
212
213        if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
214                ioend->io_error = -EIO;
215                goto done;
216        }
217
218        /*
219         * For unwritten extents we need to issue transactions to convert a
220         * range to normal written extens after the data I/O has finished.
221         * Detecting and handling completion IO errors is done individually
222         * for each case as different cleanup operations need to be performed
223         * on error.
224         */
225        if (ioend->io_type == XFS_IO_UNWRITTEN) {
226                if (ioend->io_error)
227                        goto done;
228                error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
229                                                  ioend->io_size);
230        } else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) {
231                /*
232                 * For direct I/O we do not know if we need to allocate blocks
233                 * or not so we can't preallocate an append transaction as that
234                 * results in nested reservations and log space deadlocks. Hence
235                 * allocate the transaction here. While this is sub-optimal and
236                 * can block IO completion for some time, we're stuck with doing
237                 * it this way until we can pass the ioend to the direct IO
238                 * allocation callbacks and avoid nesting that way.
239                 */
240                error = xfs_setfilesize_trans_alloc(ioend);
241                if (error)
242                        goto done;
243                error = xfs_setfilesize(ioend);
244        } else if (ioend->io_append_trans) {
245                error = xfs_setfilesize(ioend);
246        } else {
247                ASSERT(!xfs_ioend_is_append(ioend));
248        }
249
250done:
251        if (error)
252                ioend->io_error = error;
253        xfs_destroy_ioend(ioend);
254}
255
256/*
257 * Call IO completion handling in caller context on the final put of an ioend.
258 */
259STATIC void
260xfs_finish_ioend_sync(
261        struct xfs_ioend        *ioend)
262{
263        if (atomic_dec_and_test(&ioend->io_remaining))
264                xfs_end_io(&ioend->io_work);
265}
266
267/*
268 * Allocate and initialise an IO completion structure.
269 * We need to track unwritten extent write completion here initially.
270 * We'll need to extend this for updating the ondisk inode size later
271 * (vs. incore size).
272 */
273STATIC xfs_ioend_t *
274xfs_alloc_ioend(
275        struct inode            *inode,
276        unsigned int            type)
277{
278        xfs_ioend_t             *ioend;
279
280        ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);
281
282        /*
283         * Set the count to 1 initially, which will prevent an I/O
284         * completion callback from happening before we have started
285         * all the I/O from calling the completion routine too early.
286         */
287        atomic_set(&ioend->io_remaining, 1);
288        ioend->io_isdirect = 0;
289        ioend->io_error = 0;
290        ioend->io_list = NULL;
291        ioend->io_type = type;
292        ioend->io_inode = inode;
293        ioend->io_buffer_head = NULL;
294        ioend->io_buffer_tail = NULL;
295        ioend->io_offset = 0;
296        ioend->io_size = 0;
297        ioend->io_append_trans = NULL;
298
299        INIT_WORK(&ioend->io_work, xfs_end_io);
300        return ioend;
301}
302
303STATIC int
304xfs_map_blocks(
305        struct inode            *inode,
306        loff_t                  offset,
307        struct xfs_bmbt_irec    *imap,
308        int                     type,
309        int                     nonblocking)
310{
311        struct xfs_inode        *ip = XFS_I(inode);
312        struct xfs_mount        *mp = ip->i_mount;
313        ssize_t                 count = 1 << inode->i_blkbits;
314        xfs_fileoff_t           offset_fsb, end_fsb;
315        int                     error = 0;
316        int                     bmapi_flags = XFS_BMAPI_ENTIRE;
317        int                     nimaps = 1;
318
319        if (XFS_FORCED_SHUTDOWN(mp))
320                return -EIO;
321
322        if (type == XFS_IO_UNWRITTEN)
323                bmapi_flags |= XFS_BMAPI_IGSTATE;
324
325        if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
326                if (nonblocking)
327                        return -EAGAIN;
328                xfs_ilock(ip, XFS_ILOCK_SHARED);
329        }
330
331        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
332               (ip->i_df.if_flags & XFS_IFEXTENTS));
333        ASSERT(offset <= mp->m_super->s_maxbytes);
334
335        if (offset + count > mp->m_super->s_maxbytes)
336                count = mp->m_super->s_maxbytes - offset;
337        end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
338        offset_fsb = XFS_B_TO_FSBT(mp, offset);
339        error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
340                                imap, &nimaps, bmapi_flags);
341        xfs_iunlock(ip, XFS_ILOCK_SHARED);
342
343        if (error)
344                return error;
345
346        if (type == XFS_IO_DELALLOC &&
347            (!nimaps || isnullstartblock(imap->br_startblock))) {
348                error = xfs_iomap_write_allocate(ip, offset, imap);
349                if (!error)
350                        trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
351                return error;
352        }
353
354#ifdef DEBUG
355        if (type == XFS_IO_UNWRITTEN) {
356                ASSERT(nimaps);
357                ASSERT(imap->br_startblock != HOLESTARTBLOCK);
358                ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
359        }
360#endif
361        if (nimaps)
362                trace_xfs_map_blocks_found(ip, offset, count, type, imap);
363        return 0;
364}
365
366STATIC int
367xfs_imap_valid(
368        struct inode            *inode,
369        struct xfs_bmbt_irec    *imap,
370        xfs_off_t               offset)
371{
372        offset >>= inode->i_blkbits;
373
374        return offset >= imap->br_startoff &&
375                offset < imap->br_startoff + imap->br_blockcount;
376}
377
378/*
379 * BIO completion handler for buffered IO.
380 */
381STATIC void
382xfs_end_bio(
383        struct bio              *bio,
384        int                     error)
385{
386        xfs_ioend_t             *ioend = bio->bi_private;
387
388        ASSERT(atomic_read(&bio->bi_cnt) >= 1);
389        ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error;
390
391        /* Toss bio and pass work off to an xfsdatad thread */
392        bio->bi_private = NULL;
393        bio->bi_end_io = NULL;
394        bio_put(bio);
395
396        xfs_finish_ioend(ioend);
397}
398
399STATIC void
400xfs_submit_ioend_bio(
401        struct writeback_control *wbc,
402        xfs_ioend_t             *ioend,
403        struct bio              *bio)
404{
405        atomic_inc(&ioend->io_remaining);
406        bio->bi_private = ioend;
407        bio->bi_end_io = xfs_end_bio;
408        submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
409}
410
411STATIC struct bio *
412xfs_alloc_ioend_bio(
413        struct buffer_head      *bh)
414{
415        int                     nvecs = bio_get_nr_vecs(bh->b_bdev);
416        struct bio              *bio = bio_alloc(GFP_NOIO, nvecs);
417
418        ASSERT(bio->bi_private == NULL);
419        bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
420        bio->bi_bdev = bh->b_bdev;
421        return bio;
422}
423
424STATIC void
425xfs_start_buffer_writeback(
426        struct buffer_head      *bh)
427{
428        ASSERT(buffer_mapped(bh));
429        ASSERT(buffer_locked(bh));
430        ASSERT(!buffer_delay(bh));
431        ASSERT(!buffer_unwritten(bh));
432
433        mark_buffer_async_write(bh);
434        set_buffer_uptodate(bh);
435        clear_buffer_dirty(bh);
436}
437
438STATIC void
439xfs_start_page_writeback(
440        struct page             *page,
441        int                     clear_dirty,
442        int                     buffers)
443{
444        ASSERT(PageLocked(page));
445        ASSERT(!PageWriteback(page));
446
447        /*
448         * if the page was not fully cleaned, we need to ensure that the higher
449         * layers come back to it correctly. That means we need to keep the page
450         * dirty, and for WB_SYNC_ALL writeback we need to ensure the
451         * PAGECACHE_TAG_TOWRITE index mark is not removed so another attempt to
452         * write this page in this writeback sweep will be made.
453         */
454        if (clear_dirty) {
455                clear_page_dirty_for_io(page);
456                set_page_writeback(page);
457        } else
458                set_page_writeback_keepwrite(page);
459
460        unlock_page(page);
461
462        /* If no buffers on the page are to be written, finish it here */
463        if (!buffers)
464                end_page_writeback(page);
465}
466
467static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
468{
469        return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
470}
471
472/*
473 * Submit all of the bios for all of the ioends we have saved up, covering the
474 * initial writepage page and also any probed pages.
475 *
476 * Because we may have multiple ioends spanning a page, we need to start
477 * writeback on all the buffers before we submit them for I/O. If we mark the
478 * buffers as we got, then we can end up with a page that only has buffers
479 * marked async write and I/O complete on can occur before we mark the other
480 * buffers async write.
481 *
482 * The end result of this is that we trip a bug in end_page_writeback() because
483 * we call it twice for the one page as the code in end_buffer_async_write()
484 * assumes that all buffers on the page are started at the same time.
485 *
486 * The fix is two passes across the ioend list - one to start writeback on the
487 * buffer_heads, and then submit them for I/O on the second pass.
488 *
489 * If @fail is non-zero, it means that we have a situation where some part of
490 * the submission process has failed after we have marked paged for writeback
491 * and unlocked them. In this situation, we need to fail the ioend chain rather
492 * than submit it to IO. This typically only happens on a filesystem shutdown.
493 */
494STATIC void
495xfs_submit_ioend(
496        struct writeback_control *wbc,
497        xfs_ioend_t             *ioend,
498        int                     fail)
499{
500        xfs_ioend_t             *head = ioend;
501        xfs_ioend_t             *next;
502        struct buffer_head      *bh;
503        struct bio              *bio;
504        sector_t                lastblock = 0;
505
506        /* Pass 1 - start writeback */
507        do {
508                next = ioend->io_list;
509                for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
510                        xfs_start_buffer_writeback(bh);
511        } while ((ioend = next) != NULL);
512
513        /* Pass 2 - submit I/O */
514        ioend = head;
515        do {
516                next = ioend->io_list;
517                bio = NULL;
518
519                /*
520                 * If we are failing the IO now, just mark the ioend with an
521                 * error and finish it. This will run IO completion immediately
522                 * as there is only one reference to the ioend at this point in
523                 * time.
524                 */
525                if (fail) {
526                        ioend->io_error = fail;
527                        xfs_finish_ioend(ioend);
528                        continue;
529                }
530
531                for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
532
533                        if (!bio) {
534 retry:
535                                bio = xfs_alloc_ioend_bio(bh);
536                        } else if (bh->b_blocknr != lastblock + 1) {
537                                xfs_submit_ioend_bio(wbc, ioend, bio);
538                                goto retry;
539                        }
540
541                        if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
542                                xfs_submit_ioend_bio(wbc, ioend, bio);
543                                goto retry;
544                        }
545
546                        lastblock = bh->b_blocknr;
547                }
548                if (bio)
549                        xfs_submit_ioend_bio(wbc, ioend, bio);
550                xfs_finish_ioend(ioend);
551        } while ((ioend = next) != NULL);
552}
553
554/*
555 * Cancel submission of all buffer_heads so far in this endio.
556 * Toss the endio too.  Only ever called for the initial page
557 * in a writepage request, so only ever one page.
558 */
559STATIC void
560xfs_cancel_ioend(
561        xfs_ioend_t             *ioend)
562{
563        xfs_ioend_t             *next;
564        struct buffer_head      *bh, *next_bh;
565
566        do {
567                next = ioend->io_list;
568                bh = ioend->io_buffer_head;
569                do {
570                        next_bh = bh->b_private;
571                        clear_buffer_async_write(bh);
572                        /*
573                         * The unwritten flag is cleared when added to the
574                         * ioend. We're not submitting for I/O so mark the
575                         * buffer unwritten again for next time around.
576                         */
577                        if (ioend->io_type == XFS_IO_UNWRITTEN)
578                                set_buffer_unwritten(bh);
579                        unlock_buffer(bh);
580                } while ((bh = next_bh) != NULL);
581
582                mempool_free(ioend, xfs_ioend_pool);
583        } while ((ioend = next) != NULL);
584}
585
586/*
587 * Test to see if we've been building up a completion structure for
588 * earlier buffers -- if so, we try to append to this ioend if we
589 * can, otherwise we finish off any current ioend and start another.
590 * Return true if we've finished the given ioend.
591 */
592STATIC void
593xfs_add_to_ioend(
594        struct inode            *inode,
595        struct buffer_head      *bh,
596        xfs_off_t               offset,
597        unsigned int            type,
598        xfs_ioend_t             **result,
599        int                     need_ioend)
600{
601        xfs_ioend_t             *ioend = *result;
602
603        if (!ioend || need_ioend || type != ioend->io_type) {
604                xfs_ioend_t     *previous = *result;
605
606                ioend = xfs_alloc_ioend(inode, type);
607                ioend->io_offset = offset;
608                ioend->io_buffer_head = bh;
609                ioend->io_buffer_tail = bh;
610                if (previous)
611                        previous->io_list = ioend;
612                *result = ioend;
613        } else {
614                ioend->io_buffer_tail->b_private = bh;
615                ioend->io_buffer_tail = bh;
616        }
617
618        bh->b_private = NULL;
619        ioend->io_size += bh->b_size;
620}
621
622STATIC void
623xfs_map_buffer(
624        struct inode            *inode,
625        struct buffer_head      *bh,
626        struct xfs_bmbt_irec    *imap,
627        xfs_off_t               offset)
628{
629        sector_t                bn;
630        struct xfs_mount        *m = XFS_I(inode)->i_mount;
631        xfs_off_t               iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
632        xfs_daddr_t             iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);
633
634        ASSERT(imap->br_startblock != HOLESTARTBLOCK);
635        ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
636
637        bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
638              ((offset - iomap_offset) >> inode->i_blkbits);
639
640        ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));
641
642        bh->b_blocknr = bn;
643        set_buffer_mapped(bh);
644}
645
646STATIC void
647xfs_map_at_offset(
648        struct inode            *inode,
649        struct buffer_head      *bh,
650        struct xfs_bmbt_irec    *imap,
651        xfs_off_t               offset)
652{
653        ASSERT(imap->br_startblock != HOLESTARTBLOCK);
654        ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
655
656        xfs_map_buffer(inode, bh, imap, offset);
657        set_buffer_mapped(bh);
658        clear_buffer_delay(bh);
659        clear_buffer_unwritten(bh);
660}
661
662/*
663 * Test if a given page contains at least one buffer of a given @type.
664 * If @check_all_buffers is true, then we walk all the buffers in the page to
665 * try to find one of the type passed in. If it is not set, then the caller only
666 * needs to check the first buffer on the page for a match.
667 */
668STATIC bool
669xfs_check_page_type(
670        struct page             *page,
671        unsigned int            type,
672        bool                    check_all_buffers)
673{
674        struct buffer_head      *bh;
675        struct buffer_head      *head;
676
677        if (PageWriteback(page))
678                return false;
679        if (!page->mapping)
680                return false;
681        if (!page_has_buffers(page))
682                return false;
683
684        bh = head = page_buffers(page);
685        do {
686                if (buffer_unwritten(bh)) {
687                        if (type == XFS_IO_UNWRITTEN)
688                                return true;
689                } else if (buffer_delay(bh)) {
690                        if (type == XFS_IO_DELALLOC)
691                                return true;
692                } else if (buffer_dirty(bh) && buffer_mapped(bh)) {
693                        if (type == XFS_IO_OVERWRITE)
694                                return true;
695                }
696
697                /* If we are only checking the first buffer, we are done now. */
698                if (!check_all_buffers)
699                        break;
700        } while ((bh = bh->b_this_page) != head);
701
702        return false;
703}
704
705/*
706 * Allocate & map buffers for page given the extent map. Write it out.
707 * except for the original page of a writepage, this is called on
708 * delalloc/unwritten pages only, for the original page it is possible
709 * that the page has no mapping at all.
710 */
711STATIC int
712xfs_convert_page(
713        struct inode            *inode,
714        struct page             *page,
715        loff_t                  tindex,
716        struct xfs_bmbt_irec    *imap,
717        xfs_ioend_t             **ioendp,
718        struct writeback_control *wbc)
719{
720        struct buffer_head      *bh, *head;
721        xfs_off_t               end_offset;
722        unsigned long           p_offset;
723        unsigned int            type;
724        int                     len, page_dirty;
725        int                     count = 0, done = 0, uptodate = 1;
726        xfs_off_t               offset = page_offset(page);
727
728        if (page->index != tindex)
729                goto fail;
730        if (!trylock_page(page))
731                goto fail;
732        if (PageWriteback(page))
733                goto fail_unlock_page;
734        if (page->mapping != inode->i_mapping)
735                goto fail_unlock_page;
736        if (!xfs_check_page_type(page, (*ioendp)->io_type, false))
737                goto fail_unlock_page;
738
739        /*
740         * page_dirty is initially a count of buffers on the page before
741         * EOF and is decremented as we move each into a cleanable state.
742         *
743         * Derivation:
744         *
745         * End offset is the highest offset that this page should represent.
746         * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
747         * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
748         * hence give us the correct page_dirty count. On any other page,
749         * it will be zero and in that case we need page_dirty to be the
750         * count of buffers on the page.
751         */
752        end_offset = min_t(unsigned long long,
753                        (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
754                        i_size_read(inode));
755
756        /*
757         * If the current map does not span the entire page we are about to try
758         * to write, then give up. The only way we can write a page that spans
759         * multiple mappings in a single writeback iteration is via the
760         * xfs_vm_writepage() function. Data integrity writeback requires the
761         * entire page to be written in a single attempt, otherwise the part of
762         * the page we don't write here doesn't get written as part of the data
763         * integrity sync.
764         *
765         * For normal writeback, we also don't attempt to write partial pages
766         * here as it simply means that write_cache_pages() will see it under
767         * writeback and ignore the page until some point in the future, at
768         * which time this will be the only page in the file that needs
769         * writeback.  Hence for more optimal IO patterns, we should always
770         * avoid partial page writeback due to multiple mappings on a page here.
771         */
772        if (!xfs_imap_valid(inode, imap, end_offset))
773                goto fail_unlock_page;
774
775        len = 1 << inode->i_blkbits;
776        p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
777                                        PAGE_CACHE_SIZE);
778        p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
779        page_dirty = p_offset / len;
780
781        /*
782         * The moment we find a buffer that doesn't match our current type
783         * specification or can't be written, abort the loop and start
784         * writeback. As per the above xfs_imap_valid() check, only
785         * xfs_vm_writepage() can handle partial page writeback fully - we are
786         * limited here to the buffers that are contiguous with the current
787         * ioend, and hence a buffer we can't write breaks that contiguity and
788         * we have to defer the rest of the IO to xfs_vm_writepage().
789         */
790        bh = head = page_buffers(page);
791        do {
792                if (offset >= end_offset)
793                        break;
794                if (!buffer_uptodate(bh))
795                        uptodate = 0;
796                if (!(PageUptodate(page) || buffer_uptodate(bh))) {
797                        done = 1;
798                        break;
799                }
800
801                if (buffer_unwritten(bh) || buffer_delay(bh) ||
802                    buffer_mapped(bh)) {
803                        if (buffer_unwritten(bh))
804                                type = XFS_IO_UNWRITTEN;
805                        else if (buffer_delay(bh))
806                                type = XFS_IO_DELALLOC;
807                        else
808                                type = XFS_IO_OVERWRITE;
809
810                        /*
811                         * imap should always be valid because of the above
812                         * partial page end_offset check on the imap.
813                         */
814                        ASSERT(xfs_imap_valid(inode, imap, offset));
815
816                        lock_buffer(bh);
817                        if (type != XFS_IO_OVERWRITE)
818                                xfs_map_at_offset(inode, bh, imap, offset);
819                        xfs_add_to_ioend(inode, bh, offset, type,
820                                         ioendp, done);
821
822                        page_dirty--;
823                        count++;
824                } else {
825                        done = 1;
826                        break;
827                }
828        } while (offset += len, (bh = bh->b_this_page) != head);
829
830        if (uptodate && bh == head)
831                SetPageUptodate(page);
832
833        if (count) {
834                if (--wbc->nr_to_write <= 0 &&
835                    wbc->sync_mode == WB_SYNC_NONE)
836                        done = 1;
837        }
838        xfs_start_page_writeback(page, !page_dirty, count);
839
840        return done;
841 fail_unlock_page:
842        unlock_page(page);
843 fail:
844        return 1;
845}
846
847/*
848 * Convert & write out a cluster of pages in the same extent as defined
849 * by mp and following the start page.
850 */
851STATIC void
852xfs_cluster_write(
853        struct inode            *inode,
854        pgoff_t                 tindex,
855        struct xfs_bmbt_irec    *imap,
856        xfs_ioend_t             **ioendp,
857        struct writeback_control *wbc,
858        pgoff_t                 tlast)
859{
860        struct pagevec          pvec;
861        int                     done = 0, i;
862
863        pagevec_init(&pvec, 0);
864        while (!done && tindex <= tlast) {
865                unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
866
867                if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
868                        break;
869
870                for (i = 0; i < pagevec_count(&pvec); i++) {
871                        done = xfs_convert_page(inode, pvec.pages[i], tindex++,
872                                        imap, ioendp, wbc);
873                        if (done)
874                                break;
875                }
876
877                pagevec_release(&pvec);
878                cond_resched();
879        }
880}
881
882STATIC void
883xfs_vm_invalidatepage(
884        struct page             *page,
885        unsigned int            offset,
886        unsigned int            length)
887{
888        trace_xfs_invalidatepage(page->mapping->host, page, offset,
889                                 length);
890        block_invalidatepage(page, offset, length);
891}
892
893/*
894 * If the page has delalloc buffers on it, we need to punch them out before we
895 * invalidate the page. If we don't, we leave a stale delalloc mapping on the
896 * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
897 * is done on that same region - the delalloc extent is returned when none is
898 * supposed to be there.
899 *
900 * We prevent this by truncating away the delalloc regions on the page before
901 * invalidating it. Because they are delalloc, we can do this without needing a
902 * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
903 * truncation without a transaction as there is no space left for block
904 * reservation (typically why we see a ENOSPC in writeback).
905 *
906 * This is not a performance critical path, so for now just do the punching a
907 * buffer head at a time.
908 */
909STATIC void
910xfs_aops_discard_page(
911        struct page             *page)
912{
913        struct inode            *inode = page->mapping->host;
914        struct xfs_inode        *ip = XFS_I(inode);
915        struct buffer_head      *bh, *head;
916        loff_t                  offset = page_offset(page);
917
918        if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true))
919                goto out_invalidate;
920
921        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
922                goto out_invalidate;
923
924        xfs_alert(ip->i_mount,
925                "page discard on page %p, inode 0x%llx, offset %llu.",
926                        page, ip->i_ino, offset);
927
928        xfs_ilock(ip, XFS_ILOCK_EXCL);
929        bh = head = page_buffers(page);
930        do {
931                int             error;
932                xfs_fileoff_t   start_fsb;
933
934                if (!buffer_delay(bh))
935                        goto next_buffer;
936
937                start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
938                error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
939                if (error) {
940                        /* something screwed, just bail */
941                        if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
942                                xfs_alert(ip->i_mount,
943                        "page discard unable to remove delalloc mapping.");
944                        }
945                        break;
946                }
947next_buffer:
948                offset += 1 << inode->i_blkbits;
949
950        } while ((bh = bh->b_this_page) != head);
951
952        xfs_iunlock(ip, XFS_ILOCK_EXCL);
953out_invalidate:
954        xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE);
955        return;
956}
957
958/*
959 * Write out a dirty page.
960 *
961 * For delalloc space on the page we need to allocate space and flush it.
962 * For unwritten space on the page we need to start the conversion to
963 * regular allocated space.
964 * For any other dirty buffer heads on the page we should flush them.
965 */
966STATIC int
967xfs_vm_writepage(
968        struct page             *page,
969        struct writeback_control *wbc)
970{
971        struct inode            *inode = page->mapping->host;
972        struct buffer_head      *bh, *head;
973        struct xfs_bmbt_irec    imap;
974        xfs_ioend_t             *ioend = NULL, *iohead = NULL;
975        loff_t                  offset;
976        unsigned int            type;
977        __uint64_t              end_offset;
978        pgoff_t                 end_index, last_index;
979        ssize_t                 len;
980        int                     err, imap_valid = 0, uptodate = 1;
981        int                     count = 0;
982        int                     nonblocking = 0;
983
984        trace_xfs_writepage(inode, page, 0, 0);
985
986        ASSERT(page_has_buffers(page));
987
988        /*
989         * Refuse to write the page out if we are called from reclaim context.
990         *
991         * This avoids stack overflows when called from deeply used stacks in
992         * random callers for direct reclaim or memcg reclaim.  We explicitly
993         * allow reclaim from kswapd as the stack usage there is relatively low.
994         *
995         * This should never happen except in the case of a VM regression so
996         * warn about it.
997         */
998        if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
999                        PF_MEMALLOC))
1000                goto redirty;
1001
1002        /*
1003         * Given that we do not allow direct reclaim to call us, we should
1004         * never be called while in a filesystem transaction.
1005         */
1006        if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
1007                goto redirty;
1008
1009        /* Is this page beyond the end of the file? */
1010        offset = i_size_read(inode);
1011        end_index = offset >> PAGE_CACHE_SHIFT;
1012        last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
1013
1014        /*
1015         * The page index is less than the end_index, adjust the end_offset
1016         * to the highest offset that this page should represent.
1017         * -----------------------------------------------------
1018         * |                    file mapping           | <EOF> |
1019         * -----------------------------------------------------
1020         * | Page ... | Page N-2 | Page N-1 |  Page N  |       |
1021         * ^--------------------------------^----------|--------
1022         * |     desired writeback range    |      see else    |
1023         * ---------------------------------^------------------|
1024         */
1025        if (page->index < end_index)
1026                end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT;
1027        else {
1028                /*
1029                 * Check whether the page to write out is beyond or straddles
1030                 * i_size or not.
1031                 * -------------------------------------------------------
1032                 * |            file mapping                    | <EOF>  |
1033                 * -------------------------------------------------------
1034                 * | Page ... | Page N-2 | Page N-1 |  Page N   | Beyond |
1035                 * ^--------------------------------^-----------|---------
1036                 * |                                |      Straddles     |
1037                 * ---------------------------------^-----------|--------|
1038                 */
1039                unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1);
1040
1041                /*
1042                 * Skip the page if it is fully outside i_size, e.g. due to a
1043                 * truncate operation that is in progress. We must redirty the
1044                 * page so that reclaim stops reclaiming it. Otherwise
1045                 * xfs_vm_releasepage() is called on it and gets confused.
1046                 *
1047                 * Note that the end_index is unsigned long, it would overflow
1048                 * if the given offset is greater than 16TB on 32-bit system
1049                 * and if we do check the page is fully outside i_size or not
1050                 * via "if (page->index >= end_index + 1)" as "end_index + 1"
1051                 * will be evaluated to 0.  Hence this page will be redirtied
1052                 * and be written out repeatedly which would result in an
1053                 * infinite loop, the user program that perform this operation
1054                 * will hang.  Instead, we can verify this situation by checking
1055                 * if the page to write is totally beyond the i_size or if it's
1056                 * offset is just equal to the EOF.
1057                 */
1058                if (page->index > end_index ||
1059                    (page->index == end_index && offset_into_page == 0))
1060                        goto redirty;
1061
1062                /*
1063                 * The page straddles i_size.  It must be zeroed out on each
1064                 * and every writepage invocation because it may be mmapped.
1065                 * "A file is mapped in multiples of the page size.  For a file
1066                 * that is not a multiple of the page size, the remaining
1067                 * memory is zeroed when mapped, and writes to that region are
1068                 * not written out to the file."
1069                 */
1070                zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE);
1071
1072                /* Adjust the end_offset to the end of file */
1073                end_offset = offset;
1074        }
1075
1076        len = 1 << inode->i_blkbits;
1077
1078        bh = head = page_buffers(page);
1079        offset = page_offset(page);
1080        type = XFS_IO_OVERWRITE;
1081
1082        if (wbc->sync_mode == WB_SYNC_NONE)
1083                nonblocking = 1;
1084
1085        do {
1086                int new_ioend = 0;
1087
1088                if (offset >= end_offset)
1089                        break;
1090                if (!buffer_uptodate(bh))
1091                        uptodate = 0;
1092
1093                /*
1094                 * set_page_dirty dirties all buffers in a page, independent
1095                 * of their state.  The dirty state however is entirely
1096                 * meaningless for holes (!mapped && uptodate), so skip
1097                 * buffers covering holes here.
1098                 */
1099                if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
1100                        imap_valid = 0;
1101                        continue;
1102                }
1103
1104                if (buffer_unwritten(bh)) {
1105                        if (type != XFS_IO_UNWRITTEN) {
1106                                type = XFS_IO_UNWRITTEN;
1107                                imap_valid = 0;
1108                        }
1109                } else if (buffer_delay(bh)) {
1110                        if (type != XFS_IO_DELALLOC) {
1111                                type = XFS_IO_DELALLOC;
1112                                imap_valid = 0;
1113                        }
1114                } else if (buffer_uptodate(bh)) {
1115                        if (type != XFS_IO_OVERWRITE) {
1116                                type = XFS_IO_OVERWRITE;
1117                                imap_valid = 0;
1118                        }
1119                } else {
1120                        if (PageUptodate(page))
1121                                ASSERT(buffer_mapped(bh));
1122                        /*
1123                         * This buffer is not uptodate and will not be
1124                         * written to disk.  Ensure that we will put any
1125                         * subsequent writeable buffers into a new
1126                         * ioend.
1127                         */
1128                        imap_valid = 0;
1129                        continue;
1130                }
1131
1132                if (imap_valid)
1133                        imap_valid = xfs_imap_valid(inode, &imap, offset);
1134                if (!imap_valid) {
1135                        /*
1136                         * If we didn't have a valid mapping then we need to
1137                         * put the new mapping into a separate ioend structure.
1138                         * This ensures non-contiguous extents always have
1139                         * separate ioends, which is particularly important
1140                         * for unwritten extent conversion at I/O completion
1141                         * time.
1142                         */
1143                        new_ioend = 1;
1144                        err = xfs_map_blocks(inode, offset, &imap, type,
1145                                             nonblocking);
1146                        if (err)
1147                                goto error;
1148                        imap_valid = xfs_imap_valid(inode, &imap, offset);
1149                }
1150                if (imap_valid) {
1151                        lock_buffer(bh);
1152                        if (type != XFS_IO_OVERWRITE)
1153                                xfs_map_at_offset(inode, bh, &imap, offset);
1154                        xfs_add_to_ioend(inode, bh, offset, type, &ioend,
1155                                         new_ioend);
1156                        count++;
1157                }
1158
1159                if (!iohead)
1160                        iohead = ioend;
1161
1162        } while (offset += len, ((bh = bh->b_this_page) != head));
1163
1164        if (uptodate && bh == head)
1165                SetPageUptodate(page);
1166
1167        xfs_start_page_writeback(page, 1, count);
1168
1169        /* if there is no IO to be submitted for this page, we are done */
1170        if (!ioend)
1171                return 0;
1172
1173        ASSERT(iohead);
1174
1175        /*
1176         * Any errors from this point onwards need tobe reported through the IO
1177         * completion path as we have marked the initial page as under writeback
1178         * and unlocked it.
1179         */
1180        if (imap_valid) {
1181                xfs_off_t               end_index;
1182
1183                end_index = imap.br_startoff + imap.br_blockcount;
1184
1185                /* to bytes */
1186                end_index <<= inode->i_blkbits;
1187
1188                /* to pages */
1189                end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;
1190
1191                /* check against file size */
1192                if (end_index > last_index)
1193                        end_index = last_index;
1194
1195                xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
1196                                  wbc, end_index);
1197        }
1198
1199
1200        /*
1201         * Reserve log space if we might write beyond the on-disk inode size.
1202         */
1203        err = 0;
1204        if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
1205                err = xfs_setfilesize_trans_alloc(ioend);
1206
1207        xfs_submit_ioend(wbc, iohead, err);
1208
1209        return 0;
1210
1211error:
1212        if (iohead)
1213                xfs_cancel_ioend(iohead);
1214
1215        if (err == -EAGAIN)
1216                goto redirty;
1217
1218        xfs_aops_discard_page(page);
1219        ClearPageUptodate(page);
1220        unlock_page(page);
1221        return err;
1222
1223redirty:
1224        redirty_page_for_writepage(wbc, page);
1225        unlock_page(page);
1226        return 0;
1227}
1228
1229STATIC int
1230xfs_vm_writepages(
1231        struct address_space    *mapping,
1232        struct writeback_control *wbc)
1233{
1234        xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
1235        return generic_writepages(mapping, wbc);
1236}
1237
1238/*
1239 * Called to move a page into cleanable state - and from there
1240 * to be released. The page should already be clean. We always
1241 * have buffer heads in this call.
1242 *
1243 * Returns 1 if the page is ok to release, 0 otherwise.
1244 */
1245STATIC int
1246xfs_vm_releasepage(
1247        struct page             *page,
1248        gfp_t                   gfp_mask)
1249{
1250        int                     delalloc, unwritten;
1251
1252        trace_xfs_releasepage(page->mapping->host, page, 0, 0);
1253
1254        xfs_count_page_state(page, &delalloc, &unwritten);
1255
1256        if (WARN_ON_ONCE(delalloc))
1257                return 0;
1258        if (WARN_ON_ONCE(unwritten))
1259                return 0;
1260
1261        return try_to_free_buffers(page);
1262}
1263
1264STATIC int
1265__xfs_get_blocks(
1266        struct inode            *inode,
1267        sector_t                iblock,
1268        struct buffer_head      *bh_result,
1269        int                     create,
1270        int                     direct)
1271{
1272        struct xfs_inode        *ip = XFS_I(inode);
1273        struct xfs_mount        *mp = ip->i_mount;
1274        xfs_fileoff_t           offset_fsb, end_fsb;
1275        int                     error = 0;
1276        int                     lockmode = 0;
1277        struct xfs_bmbt_irec    imap;
1278        int                     nimaps = 1;
1279        xfs_off_t               offset;
1280        ssize_t                 size;
1281        int                     new = 0;
1282
1283        if (XFS_FORCED_SHUTDOWN(mp))
1284                return -EIO;
1285
1286        offset = (xfs_off_t)iblock << inode->i_blkbits;
1287        ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
1288        size = bh_result->b_size;
1289
1290        if (!create && direct && offset >= i_size_read(inode))
1291                return 0;
1292
1293        /*
1294         * Direct I/O is usually done on preallocated files, so try getting
1295         * a block mapping without an exclusive lock first.  For buffered
1296         * writes we already have the exclusive iolock anyway, so avoiding
1297         * a lock roundtrip here by taking the ilock exclusive from the
1298         * beginning is a useful micro optimization.
1299         */
1300        if (create && !direct) {
1301                lockmode = XFS_ILOCK_EXCL;
1302                xfs_ilock(ip, lockmode);
1303        } else {
1304                lockmode = xfs_ilock_data_map_shared(ip);
1305        }
1306
1307        ASSERT(offset <= mp->m_super->s_maxbytes);
1308        if (offset + size > mp->m_super->s_maxbytes)
1309                size = mp->m_super->s_maxbytes - offset;
1310        end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
1311        offset_fsb = XFS_B_TO_FSBT(mp, offset);
1312
1313        error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
1314                                &imap, &nimaps, XFS_BMAPI_ENTIRE);
1315        if (error)
1316                goto out_unlock;
1317
1318        if (create &&
1319            (!nimaps ||
1320             (imap.br_startblock == HOLESTARTBLOCK ||
1321              imap.br_startblock == DELAYSTARTBLOCK))) {
1322                if (direct || xfs_get_extsz_hint(ip)) {
1323                        /*
1324                         * Drop the ilock in preparation for starting the block
1325                         * allocation transaction.  It will be retaken
1326                         * exclusively inside xfs_iomap_write_direct for the
1327                         * actual allocation.
1328                         */
1329                        xfs_iunlock(ip, lockmode);
1330                        error = xfs_iomap_write_direct(ip, offset, size,
1331                                                       &imap, nimaps);
1332                        if (error)
1333                                return error;
1334                        new = 1;
1335                } else {
1336                        /*
1337                         * Delalloc reservations do not require a transaction,
1338                         * we can go on without dropping the lock here. If we
1339                         * are allocating a new delalloc block, make sure that
1340                         * we set the new flag so that we mark the buffer new so
1341                         * that we know that it is newly allocated if the write
1342                         * fails.
1343                         */
1344                        if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
1345                                new = 1;
1346                        error = xfs_iomap_write_delay(ip, offset, size, &imap);
1347                        if (error)
1348                                goto out_unlock;
1349
1350                        xfs_iunlock(ip, lockmode);
1351                }
1352
1353                trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
1354        } else if (nimaps) {
1355                trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
1356                xfs_iunlock(ip, lockmode);
1357        } else {
1358                trace_xfs_get_blocks_notfound(ip, offset, size);
1359                goto out_unlock;
1360        }
1361
1362        if (imap.br_startblock != HOLESTARTBLOCK &&
1363            imap.br_startblock != DELAYSTARTBLOCK) {
1364                /*
1365                 * For unwritten extents do not report a disk address on
1366                 * the read case (treat as if we're reading into a hole).
1367                 */
1368                if (create || !ISUNWRITTEN(&imap))
1369                        xfs_map_buffer(inode, bh_result, &imap, offset);
1370                if (create && ISUNWRITTEN(&imap)) {
1371                        if (direct) {
1372                                bh_result->b_private = inode;
1373                                set_buffer_defer_completion(bh_result);
1374                        }
1375                        set_buffer_unwritten(bh_result);
1376                }
1377        }
1378
1379        /*
1380         * If this is a realtime file, data may be on a different device.
1381         * to that pointed to from the buffer_head b_bdev currently.
1382         */
1383        bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
1384
1385        /*
1386         * If we previously allocated a block out beyond eof and we are now
1387         * coming back to use it then we will need to flag it as new even if it
1388         * has a disk address.
1389         *
1390         * With sub-block writes into unwritten extents we also need to mark
1391         * the buffer as new so that the unwritten parts of the buffer gets
1392         * correctly zeroed.
1393         */
1394        if (create &&
1395            ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
1396             (offset >= i_size_read(inode)) ||
1397             (new || ISUNWRITTEN(&imap))))
1398                set_buffer_new(bh_result);
1399
1400        if (imap.br_startblock == DELAYSTARTBLOCK) {
1401                BUG_ON(direct);
1402                if (create) {
1403                        set_buffer_uptodate(bh_result);
1404                        set_buffer_mapped(bh_result);
1405                        set_buffer_delay(bh_result);
1406                }
1407        }
1408
1409        /*
1410         * If this is O_DIRECT or the mpage code calling tell them how large
1411         * the mapping is, so that we can avoid repeated get_blocks calls.
1412         *
1413         * If the mapping spans EOF, then we have to break the mapping up as the
1414         * mapping for blocks beyond EOF must be marked new so that sub block
1415         * regions can be correctly zeroed. We can't do this for mappings within
1416         * EOF unless the mapping was just allocated or is unwritten, otherwise
1417         * the callers would overwrite existing data with zeros. Hence we have
1418         * to split the mapping into a range up to and including EOF, and a
1419         * second mapping for beyond EOF.
1420         */
1421        if (direct || size > (1 << inode->i_blkbits)) {
1422                xfs_off_t               mapping_size;
1423
1424                mapping_size = imap.br_startoff + imap.br_blockcount - iblock;
1425                mapping_size <<= inode->i_blkbits;
1426
1427                ASSERT(mapping_size > 0);
1428                if (mapping_size > size)
1429                        mapping_size = size;
1430                if (offset < i_size_read(inode) &&
1431                    offset + mapping_size >= i_size_read(inode)) {
1432                        /* limit mapping to block that spans EOF */
1433                        mapping_size = roundup_64(i_size_read(inode) - offset,
1434                                                  1 << inode->i_blkbits);
1435                }
1436                if (mapping_size > LONG_MAX)
1437                        mapping_size = LONG_MAX;
1438
1439                bh_result->b_size = mapping_size;
1440        }
1441
1442        return 0;
1443
1444out_unlock:
1445        xfs_iunlock(ip, lockmode);
1446        return error;
1447}
1448
1449int
1450xfs_get_blocks(
1451        struct inode            *inode,
1452        sector_t                iblock,
1453        struct buffer_head      *bh_result,
1454        int                     create)
1455{
1456        return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
1457}
1458
1459STATIC int
1460xfs_get_blocks_direct(
1461        struct inode            *inode,
1462        sector_t                iblock,
1463        struct buffer_head      *bh_result,
1464        int                     create)
1465{
1466        return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
1467}
1468
1469/*
1470 * Complete a direct I/O write request.
1471 *
1472 * If the private argument is non-NULL __xfs_get_blocks signals us that we
1473 * need to issue a transaction to convert the range from unwritten to written
1474 * extents.  In case this is regular synchronous I/O we just call xfs_end_io
1475 * to do this and we are done.  But in case this was a successful AIO
1476 * request this handler is called from interrupt context, from which we
1477 * can't start transactions.  In that case offload the I/O completion to
1478 * the workqueues we also use for buffered I/O completion.
1479 */
1480STATIC void
1481xfs_end_io_direct_write(
1482        struct kiocb            *iocb,
1483        loff_t                  offset,
1484        ssize_t                 size,
1485        void                    *private)
1486{
1487        struct xfs_ioend        *ioend = iocb->private;
1488
1489        /*
1490         * While the generic direct I/O code updates the inode size, it does
1491         * so only after the end_io handler is called, which means our
1492         * end_io handler thinks the on-disk size is outside the in-core
1493         * size.  To prevent this just update it a little bit earlier here.
1494         */
1495        if (offset + size > i_size_read(ioend->io_inode))
1496                i_size_write(ioend->io_inode, offset + size);
1497
1498        /*
1499         * blockdev_direct_IO can return an error even after the I/O
1500         * completion handler was called.  Thus we need to protect
1501         * against double-freeing.
1502         */
1503        iocb->private = NULL;
1504
1505        ioend->io_offset = offset;
1506        ioend->io_size = size;
1507        if (private && size > 0)
1508                ioend->io_type = XFS_IO_UNWRITTEN;
1509
1510        xfs_finish_ioend_sync(ioend);
1511}
1512
1513STATIC ssize_t
1514xfs_vm_direct_IO(
1515        int                     rw,
1516        struct kiocb            *iocb,
1517        struct iov_iter         *iter,
1518        loff_t                  offset)
1519{
1520        struct inode            *inode = iocb->ki_filp->f_mapping->host;
1521        struct block_device     *bdev = xfs_find_bdev_for_inode(inode);
1522        struct xfs_ioend        *ioend = NULL;
1523        ssize_t                 ret;
1524
1525        if (rw & WRITE) {
1526                size_t size = iov_iter_count(iter);
1527
1528                /*
1529                 * We cannot preallocate a size update transaction here as we
1530                 * don't know whether allocation is necessary or not. Hence we
1531                 * can only tell IO completion that one is necessary if we are
1532                 * not doing unwritten extent conversion.
1533                 */
1534                iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT);
1535                if (offset + size > XFS_I(inode)->i_d.di_size)
1536                        ioend->io_isdirect = 1;
1537
1538                ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
1539                                            offset, xfs_get_blocks_direct,
1540                                            xfs_end_io_direct_write, NULL,
1541                                            DIO_ASYNC_EXTEND);
1542                if (ret != -EIOCBQUEUED && iocb->private)
1543                        goto out_destroy_ioend;
1544        } else {
1545                ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
1546                                            offset, xfs_get_blocks_direct,
1547                                            NULL, NULL, 0);
1548        }
1549
1550        return ret;
1551
1552out_destroy_ioend:
1553        xfs_destroy_ioend(ioend);
1554        return ret;
1555}
1556
1557/*
1558 * Punch out the delalloc blocks we have already allocated.
1559 *
1560 * Don't bother with xfs_setattr given that nothing can have made it to disk yet
1561 * as the page is still locked at this point.
1562 */
1563STATIC void
1564xfs_vm_kill_delalloc_range(
1565        struct inode            *inode,
1566        loff_t                  start,
1567        loff_t                  end)
1568{
1569        struct xfs_inode        *ip = XFS_I(inode);
1570        xfs_fileoff_t           start_fsb;
1571        xfs_fileoff_t           end_fsb;
1572        int                     error;
1573
1574        start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
1575        end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
1576        if (end_fsb <= start_fsb)
1577                return;
1578
1579        xfs_ilock(ip, XFS_ILOCK_EXCL);
1580        error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
1581                                                end_fsb - start_fsb);
1582        if (error) {
1583                /* something screwed, just bail */
1584                if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1585                        xfs_alert(ip->i_mount,
1586                "xfs_vm_write_failed: unable to clean up ino %lld",
1587                                        ip->i_ino);
1588                }
1589        }
1590        xfs_iunlock(ip, XFS_ILOCK_EXCL);
1591}
1592
1593STATIC void
1594xfs_vm_write_failed(
1595        struct inode            *inode,
1596        struct page             *page,
1597        loff_t                  pos,
1598        unsigned                len)
1599{
1600        loff_t                  block_offset;
1601        loff_t                  block_start;
1602        loff_t                  block_end;
1603        loff_t                  from = pos & (PAGE_CACHE_SIZE - 1);
1604        loff_t                  to = from + len;
1605        struct buffer_head      *bh, *head;
1606
1607        /*
1608         * The request pos offset might be 32 or 64 bit, this is all fine
1609         * on 64-bit platform.  However, for 64-bit pos request on 32-bit
1610         * platform, the high 32-bit will be masked off if we evaluate the
1611         * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is
1612         * 0xfffff000 as an unsigned long, hence the result is incorrect
1613         * which could cause the following ASSERT failed in most cases.
1614         * In order to avoid this, we can evaluate the block_offset of the
1615         * start of the page by using shifts rather than masks the mismatch
1616         * problem.
1617         */
1618        block_offset = (pos >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT;
1619
1620        ASSERT(block_offset + from == pos);
1621
1622        head = page_buffers(page);
1623        block_start = 0;
1624        for (bh = head; bh != head || !block_start;
1625             bh = bh->b_this_page, block_start = block_end,
1626                                   block_offset += bh->b_size) {
1627                block_end = block_start + bh->b_size;
1628
1629                /* skip buffers before the write */
1630                if (block_end <= from)
1631                        continue;
1632
1633                /* if the buffer is after the write, we're done */
1634                if (block_start >= to)
1635                        break;
1636
1637                if (!buffer_delay(bh))
1638                        continue;
1639
1640                if (!buffer_new(bh) && block_offset < i_size_read(inode))
1641                        continue;
1642
1643                xfs_vm_kill_delalloc_range(inode, block_offset,
1644                                           block_offset + bh->b_size);
1645
1646                /*
1647                 * This buffer does not contain data anymore. make sure anyone
1648                 * who finds it knows that for certain.
1649                 */
1650                clear_buffer_delay(bh);
1651                clear_buffer_uptodate(bh);
1652                clear_buffer_mapped(bh);
1653                clear_buffer_new(bh);
1654                clear_buffer_dirty(bh);
1655        }
1656
1657}
1658
1659/*
1660 * This used to call block_write_begin(), but it unlocks and releases the page
1661 * on error, and we need that page to be able to punch stale delalloc blocks out
1662 * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at
1663 * the appropriate point.
1664 */
1665STATIC int
1666xfs_vm_write_begin(
1667        struct file             *file,
1668        struct address_space    *mapping,
1669        loff_t                  pos,
1670        unsigned                len,
1671        unsigned                flags,
1672        struct page             **pagep,
1673        void                    **fsdata)
1674{
1675        pgoff_t                 index = pos >> PAGE_CACHE_SHIFT;
1676        struct page             *page;
1677        int                     status;
1678
1679        ASSERT(len <= PAGE_CACHE_SIZE);
1680
1681        page = grab_cache_page_write_begin(mapping, index, flags);
1682        if (!page)
1683                return -ENOMEM;
1684
1685        status = __block_write_begin(page, pos, len, xfs_get_blocks);
1686        if (unlikely(status)) {
1687                struct inode    *inode = mapping->host;
1688                size_t          isize = i_size_read(inode);
1689
1690                xfs_vm_write_failed(inode, page, pos, len);
1691                unlock_page(page);
1692
1693                /*
1694                 * If the write is beyond EOF, we only want to kill blocks
1695                 * allocated in this write, not blocks that were previously
1696                 * written successfully.
1697                 */
1698                if (pos + len > isize) {
1699                        ssize_t start = max_t(ssize_t, pos, isize);
1700
1701                        truncate_pagecache_range(inode, start, pos + len);
1702                }
1703
1704                page_cache_release(page);
1705                page = NULL;
1706        }
1707
1708        *pagep = page;
1709        return status;
1710}
1711
1712/*
1713 * On failure, we only need to kill delalloc blocks beyond EOF in the range of
1714 * this specific write because they will never be written. Previous writes
1715 * beyond EOF where block allocation succeeded do not need to be trashed, so
1716 * only new blocks from this write should be trashed. For blocks within
1717 * EOF, generic_write_end() zeros them so they are safe to leave alone and be
1718 * written with all the other valid data.
1719 */
1720STATIC int
1721xfs_vm_write_end(
1722        struct file             *file,
1723        struct address_space    *mapping,
1724        loff_t                  pos,
1725        unsigned                len,
1726        unsigned                copied,
1727        struct page             *page,
1728        void                    *fsdata)
1729{
1730        int                     ret;
1731
1732        ASSERT(len <= PAGE_CACHE_SIZE);
1733
1734        ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
1735        if (unlikely(ret < len)) {
1736                struct inode    *inode = mapping->host;
1737                size_t          isize = i_size_read(inode);
1738                loff_t          to = pos + len;
1739
1740                if (to > isize) {
1741                        /* only kill blocks in this write beyond EOF */
1742                        if (pos > isize)
1743                                isize = pos;
1744                        xfs_vm_kill_delalloc_range(inode, isize, to);
1745                        truncate_pagecache_range(inode, isize, to);
1746                }
1747        }
1748        return ret;
1749}
1750
1751STATIC sector_t
1752xfs_vm_bmap(
1753        struct address_space    *mapping,
1754        sector_t                block)
1755{
1756        struct inode            *inode = (struct inode *)mapping->host;
1757        struct xfs_inode        *ip = XFS_I(inode);
1758
1759        trace_xfs_vm_bmap(XFS_I(inode));
1760        xfs_ilock(ip, XFS_IOLOCK_SHARED);
1761        filemap_write_and_wait(mapping);
1762        xfs_iunlock(ip, XFS_IOLOCK_SHARED);
1763        return generic_block_bmap(mapping, block, xfs_get_blocks);
1764}
1765
1766STATIC int
1767xfs_vm_readpage(
1768        struct file             *unused,
1769        struct page             *page)
1770{
1771        return mpage_readpage(page, xfs_get_blocks);
1772}
1773
1774STATIC int
1775xfs_vm_readpages(
1776        struct file             *unused,
1777        struct address_space    *mapping,
1778        struct list_head        *pages,
1779        unsigned                nr_pages)
1780{
1781        return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
1782}
1783
1784/*
1785 * This is basically a copy of __set_page_dirty_buffers() with one
1786 * small tweak: buffers beyond EOF do not get marked dirty. If we mark them
1787 * dirty, we'll never be able to clean them because we don't write buffers
1788 * beyond EOF, and that means we can't invalidate pages that span EOF
1789 * that have been marked dirty. Further, the dirty state can leak into
1790 * the file interior if the file is extended, resulting in all sorts of
1791 * bad things happening as the state does not match the underlying data.
1792 *
1793 * XXX: this really indicates that bufferheads in XFS need to die. Warts like
1794 * this only exist because of bufferheads and how the generic code manages them.
1795 */
1796STATIC int
1797xfs_vm_set_page_dirty(
1798        struct page             *page)
1799{
1800        struct address_space    *mapping = page->mapping;
1801        struct inode            *inode = mapping->host;
1802        loff_t                  end_offset;
1803        loff_t                  offset;
1804        int                     newly_dirty;
1805
1806        if (unlikely(!mapping))
1807                return !TestSetPageDirty(page);
1808
1809        end_offset = i_size_read(inode);
1810        offset = page_offset(page);
1811
1812        spin_lock(&mapping->private_lock);
1813        if (page_has_buffers(page)) {
1814                struct buffer_head *head = page_buffers(page);
1815                struct buffer_head *bh = head;
1816
1817                do {
1818                        if (offset < end_offset)
1819                                set_buffer_dirty(bh);
1820                        bh = bh->b_this_page;
1821                        offset += 1 << inode->i_blkbits;
1822                } while (bh != head);
1823        }
1824        newly_dirty = !TestSetPageDirty(page);
1825        spin_unlock(&mapping->private_lock);
1826
1827        if (newly_dirty) {
1828                /* sigh - __set_page_dirty() is static, so copy it here, too */
1829                unsigned long flags;
1830
1831                spin_lock_irqsave(&mapping->tree_lock, flags);
1832                if (page->mapping) {    /* Race with truncate? */
1833                        WARN_ON_ONCE(!PageUptodate(page));
1834                        account_page_dirtied(page, mapping);
1835                        radix_tree_tag_set(&mapping->page_tree,
1836                                        page_index(page), PAGECACHE_TAG_DIRTY);
1837                }
1838                spin_unlock_irqrestore(&mapping->tree_lock, flags);
1839                __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1840        }
1841        return newly_dirty;
1842}
1843
1844const struct address_space_operations xfs_address_space_operations = {
1845        .readpage               = xfs_vm_readpage,
1846        .readpages              = xfs_vm_readpages,
1847        .writepage              = xfs_vm_writepage,
1848        .writepages             = xfs_vm_writepages,
1849        .set_page_dirty         = xfs_vm_set_page_dirty,
1850        .releasepage            = xfs_vm_releasepage,
1851        .invalidatepage         = xfs_vm_invalidatepage,
1852        .write_begin            = xfs_vm_write_begin,
1853        .write_end              = xfs_vm_write_end,
1854        .bmap                   = xfs_vm_bmap,
1855        .direct_IO              = xfs_vm_direct_IO,
1856        .migratepage            = buffer_migrate_page,
1857        .is_partially_uptodate  = block_is_partially_uptodate,
1858        .error_remove_page      = generic_error_remove_page,
1859};
Note: See TracBrowser for help on using the repository browser.