source: src/linux/universal/linux-3.18/drivers/md/dm.c @ 31869

1/*
2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
4 *
5 * This file is released under the GPL.
6 */
7
8#include "dm.h"
9#include "dm-uevent.h"
10
11#include <linux/init.h>
12#include <linux/module.h>
13#include <linux/mutex.h>
14#include <linux/moduleparam.h>
15#include <linux/blkpg.h>
16#include <linux/bio.h>
17#include <linux/mempool.h>
18#include <linux/slab.h>
19#include <linux/idr.h>
20#include <linux/hdreg.h>
21#include <linux/delay.h>
22
23#include <trace/events/block.h>
24
25#define DM_MSG_PREFIX "core"
26
27#ifdef CONFIG_PRINTK
28/*
29 * ratelimit state to be used in DMXXX_LIMIT().
30 */
31DEFINE_RATELIMIT_STATE(dm_ratelimit_state,
32                       DEFAULT_RATELIMIT_INTERVAL,
33                       DEFAULT_RATELIMIT_BURST);
34EXPORT_SYMBOL(dm_ratelimit_state);
35#endif
36
37/*
38 * Cookies are numeric values sent with CHANGE and REMOVE
39 * uevents while resuming, removing or renaming the device.
40 */
41#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
42#define DM_COOKIE_LENGTH 24
43
44static const char *_name = DM_NAME;
45
46static unsigned int major = 0;
47static unsigned int _major = 0;
48
49static DEFINE_IDR(_minor_idr);
50
51static DEFINE_SPINLOCK(_minor_lock);
52
53static void do_deferred_remove(struct work_struct *w);
54
55static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
56
57static struct workqueue_struct *deferred_remove_workqueue;
58
59/*
60 * For bio-based dm.
61 * One of these is allocated per bio.
62 */
63struct dm_io {
64        struct mapped_device *md;
65        int error;
66        atomic_t io_count;
67        struct bio *bio;
68        unsigned long start_time;
69        spinlock_t endio_lock;
70        struct dm_stats_aux stats_aux;
71};
72
73/*
74 * For request-based dm.
75 * One of these is allocated per request.
76 */
77struct dm_rq_target_io {
78        struct mapped_device *md;
79        struct dm_target *ti;
80        struct request *orig, clone;
81        int error;
82        union map_info info;
83};
84
85/*
86 * For request-based dm - the bio clones we allocate are embedded in these
87 * structs.
88 *
89 * We allocate these with bio_alloc_bioset, using the front_pad parameter when
90 * the bioset is created - this means the bio has to come at the end of the
91 * struct.
92 */
93struct dm_rq_clone_bio_info {
94        struct bio *orig;
95        struct dm_rq_target_io *tio;
96        struct bio clone;
97};
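/*
 * Editor's sketch (not part of the original file): the front_pad trick the
 * comment above describes, in isolation.  The bioset is created with
 * front_pad = offsetof(struct dm_rq_clone_bio_info, clone), so every bio
 * returned by bio_alloc_bioset() sits at the tail of a private
 * dm_rq_clone_bio_info and container_of() recovers the wrapper:
 *
 *      struct bio_set *bs = bioset_create(pool_size,
 *                      offsetof(struct dm_rq_clone_bio_info, clone));
 *      struct bio *clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, bs);
 *      struct dm_rq_clone_bio_info *info =
 *              container_of(clone, struct dm_rq_clone_bio_info, clone);
 *
 * pool_size and nr_iovecs are illustrative names; the real front_pad setup
 * happens when the md's bioset is allocated (dm_alloc_md_mempools(), later
 * in this file), not here.
 */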
98
99union map_info *dm_get_rq_mapinfo(struct request *rq)
100{
101        if (rq && rq->end_io_data)
102                return &((struct dm_rq_target_io *)rq->end_io_data)->info;
103        return NULL;
104}
105EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
106
107#define MINOR_ALLOCED ((void *)-1)
108
109/*
110 * Bits for the md->flags field.
111 */
112#define DMF_BLOCK_IO_FOR_SUSPEND 0
113#define DMF_SUSPENDED 1
114#define DMF_FROZEN 2
115#define DMF_FREEING 3
116#define DMF_DELETING 4
117#define DMF_NOFLUSH_SUSPENDING 5
118#define DMF_MERGE_IS_OPTIONAL 6
119#define DMF_DEFERRED_REMOVE 7
120
121/*
122 * A dummy definition to make RCU happy.
123 * struct dm_table should never be dereferenced in this file.
124 */
125struct dm_table {
126        int undefined__;
127};
128
129/*
130 * Work processed by per-device workqueue.
131 */
132struct mapped_device {
133        struct srcu_struct io_barrier;
134        struct mutex suspend_lock;
135        atomic_t holders;
136        atomic_t open_count;
137
138        /*
139         * The current mapping.
140         * Use dm_get_live_table{_fast} or take suspend_lock for
141         * dereference.
142         */
143        struct dm_table *map;
144
145        struct list_head table_devices;
146        struct mutex table_devices_lock;
147
148        unsigned long flags;
149
150        struct request_queue *queue;
151        unsigned type;
152        /* Protect queue and type against concurrent access. */
153        struct mutex type_lock;
154
155        struct target_type *immutable_target_type;
156
157        struct gendisk *disk;
158        char name[16];
159
160        void *interface_ptr;
161
162        /*
163         * A list of ios that arrived while we were suspended.
164         */
165        atomic_t pending[2];
166        wait_queue_head_t wait;
167        struct work_struct work;
168        struct bio_list deferred;
169        spinlock_t deferred_lock;
170
171        /*
172         * Processing queue (flush)
173         */
174        struct workqueue_struct *wq;
175
176        /*
177         * io objects are allocated from here.
178         */
179        mempool_t *io_pool;
180
181        struct bio_set *bs;
182
183        /*
184         * Event handling.
185         */
186        atomic_t event_nr;
187        wait_queue_head_t eventq;
188        atomic_t uevent_seq;
189        struct list_head uevent_list;
190        spinlock_t uevent_lock; /* Protect access to uevent_list */
191
192        /*
193         * freeze/thaw support requires holding onto a super block
194         */
195        struct super_block *frozen_sb;
196        struct block_device *bdev;
197
198        /* forced geometry settings */
199        struct hd_geometry geometry;
200
201        /* kobject and completion */
202        struct dm_kobject_holder kobj_holder;
203
204        /* zero-length flush that will be cloned and submitted to targets */
205        struct bio flush_bio;
206
207        struct dm_stats stats;
208};
209
210/*
211 * For mempools pre-allocation at the table loading time.
212 */
213struct dm_md_mempools {
214        mempool_t *io_pool;
215        struct bio_set *bs;
216};
217
218struct table_device {
219        struct list_head list;
220        atomic_t count;
221        struct dm_dev dm_dev;
222};
223
224#define RESERVED_BIO_BASED_IOS          16
225#define RESERVED_REQUEST_BASED_IOS      256
226#define RESERVED_MAX_IOS                1024
227static struct kmem_cache *_io_cache;
228static struct kmem_cache *_rq_tio_cache;
229
230/*
231 * Bio-based DM's mempools' reserved IOs set by the user.
232 */
233static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
234
235/*
236 * Request-based DM's mempools' reserved IOs set by the user.
237 */
238static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;
239
240static unsigned __dm_get_reserved_ios(unsigned *reserved_ios,
241                                      unsigned def, unsigned max)
242{
243        unsigned ios = ACCESS_ONCE(*reserved_ios);
244        unsigned modified_ios = 0;
245
246        if (!ios)
247                modified_ios = def;
248        else if (ios > max)
249                modified_ios = max;
250
251        if (modified_ios) {
252                (void)cmpxchg(reserved_ios, ios, modified_ios);
253                ios = modified_ios;
254        }
255
256        return ios;
257}
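/*
 * Editor's note (illustrative, not part of the original file): for the
 * bio-based pool (def = 16, max = 1024) the clamp above behaves as:
 *
 *      *reserved_ios == 0     -> returns 16   (unset, fall back to default)
 *      *reserved_ios == 64    -> returns 64   (in range, used as-is)
 *      *reserved_ios == 5000  -> returns 1024 (capped; the clamped value is
 *                                also written back with cmpxchg so module
 *                                parameter readers see it)
 */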
258
259unsigned dm_get_reserved_bio_based_ios(void)
260{
261        return __dm_get_reserved_ios(&reserved_bio_based_ios,
262                                     RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS);
263}
264EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
265
266unsigned dm_get_reserved_rq_based_ios(void)
267{
268        return __dm_get_reserved_ios(&reserved_rq_based_ios,
269                                     RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS);
270}
271EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);
272
273static int __init local_init(void)
274{
275        int r = -ENOMEM;
276
277        /* allocate a slab for the dm_ios */
278        _io_cache = KMEM_CACHE(dm_io, 0);
279        if (!_io_cache)
280                return r;
281
282        _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
283        if (!_rq_tio_cache)
284                goto out_free_io_cache;
285
286        r = dm_uevent_init();
287        if (r)
288                goto out_free_rq_tio_cache;
289
290        deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
291        if (!deferred_remove_workqueue) {
292                r = -ENOMEM;
293                goto out_uevent_exit;
294        }
295
296        _major = major;
297        r = register_blkdev(_major, _name);
298        if (r < 0)
299                goto out_free_workqueue;
300
301        if (!_major)
302                _major = r;
303
304        return 0;
305
306out_free_workqueue:
307        destroy_workqueue(deferred_remove_workqueue);
308out_uevent_exit:
309        dm_uevent_exit();
310out_free_rq_tio_cache:
311        kmem_cache_destroy(_rq_tio_cache);
312out_free_io_cache:
313        kmem_cache_destroy(_io_cache);
314
315        return r;
316}
317
318static void local_exit(void)
319{
320        flush_scheduled_work();
321        destroy_workqueue(deferred_remove_workqueue);
322
323        kmem_cache_destroy(_rq_tio_cache);
324        kmem_cache_destroy(_io_cache);
325        unregister_blkdev(_major, _name);
326        dm_uevent_exit();
327
328        _major = 0;
329
330        DMINFO("cleaned up");
331}
332
333static int (*_inits[])(void) __initdata = {
334        local_init,
335        dm_target_init,
336        dm_linear_init,
337        dm_stripe_init,
338        dm_io_init,
339        dm_kcopyd_init,
340        dm_interface_init,
341        dm_statistics_init,
342};
343
344static void (*_exits[])(void) = {
345        local_exit,
346        dm_target_exit,
347        dm_linear_exit,
348        dm_stripe_exit,
349        dm_io_exit,
350        dm_kcopyd_exit,
351        dm_interface_exit,
352        dm_statistics_exit,
353};
354
355static int __init dm_init(void)
356{
357        const int count = ARRAY_SIZE(_inits);
358
359        int r, i;
360
361        for (i = 0; i < count; i++) {
362                r = _inits[i]();
363                if (r)
364                        goto bad;
365        }
366
367        return 0;
368
369      bad:
370        while (i--)
371                _exits[i]();
372
373        return r;
374}
375
376static void __exit dm_exit(void)
377{
378        int i = ARRAY_SIZE(_exits);
379
380        while (i--)
381                _exits[i]();
382
383        /*
384         * Should be empty by this point.
385         */
386        idr_destroy(&_minor_idr);
387}
388
389/*
390 * Block device functions
391 */
392int dm_deleting_md(struct mapped_device *md)
393{
394        return test_bit(DMF_DELETING, &md->flags);
395}
396
397static int dm_blk_open(struct block_device *bdev, fmode_t mode)
398{
399        struct mapped_device *md;
400
401        spin_lock(&_minor_lock);
402
403        md = bdev->bd_disk->private_data;
404        if (!md)
405                goto out;
406
407        if (test_bit(DMF_FREEING, &md->flags) ||
408            dm_deleting_md(md)) {
409                md = NULL;
410                goto out;
411        }
412
413        dm_get(md);
414        atomic_inc(&md->open_count);
415
416out:
417        spin_unlock(&_minor_lock);
418
419        return md ? 0 : -ENXIO;
420}
421
422static void dm_blk_close(struct gendisk *disk, fmode_t mode)
423{
424        struct mapped_device *md = disk->private_data;
425
426        spin_lock(&_minor_lock);
427
428        if (atomic_dec_and_test(&md->open_count) &&
429            (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
430                queue_work(deferred_remove_workqueue, &deferred_remove_work);
431
432        dm_put(md);
433
434        spin_unlock(&_minor_lock);
435}
436
437int dm_open_count(struct mapped_device *md)
438{
439        return atomic_read(&md->open_count);
440}
441
442/*
443 * Guarantees nothing is using the device before it's deleted.
444 */
445int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
446{
447        int r = 0;
448
449        spin_lock(&_minor_lock);
450
451        if (dm_open_count(md)) {
452                r = -EBUSY;
453                if (mark_deferred)
454                        set_bit(DMF_DEFERRED_REMOVE, &md->flags);
455        } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
456                r = -EEXIST;
457        else
458                set_bit(DMF_DELETING, &md->flags);
459
460        spin_unlock(&_minor_lock);
461
462        return r;
463}
464
465int dm_cancel_deferred_remove(struct mapped_device *md)
466{
467        int r = 0;
468
469        spin_lock(&_minor_lock);
470
471        if (test_bit(DMF_DELETING, &md->flags))
472                r = -EBUSY;
473        else
474                clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
475
476        spin_unlock(&_minor_lock);
477
478        return r;
479}
480
481static void do_deferred_remove(struct work_struct *w)
482{
483        dm_deferred_remove();
484}
485
486sector_t dm_get_size(struct mapped_device *md)
487{
488        return get_capacity(md->disk);
489}
490
491struct request_queue *dm_get_md_queue(struct mapped_device *md)
492{
493        return md->queue;
494}
495
496struct dm_stats *dm_get_stats(struct mapped_device *md)
497{
498        return &md->stats;
499}
500
501static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
502{
503        struct mapped_device *md = bdev->bd_disk->private_data;
504
505        return dm_get_geometry(md, geo);
506}
507
508static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
509                        unsigned int cmd, unsigned long arg)
510{
511        struct mapped_device *md = bdev->bd_disk->private_data;
512        int srcu_idx;
513        struct dm_table *map;
514        struct dm_target *tgt;
515        int r = -ENOTTY;
516
517retry:
518        map = dm_get_live_table(md, &srcu_idx);
519
520        if (!map || !dm_table_get_size(map))
521                goto out;
522
523        /* We only support devices that have a single target */
524        if (dm_table_get_num_targets(map) != 1)
525                goto out;
526
527        tgt = dm_table_get_target(map, 0);
528
529        if (dm_suspended_md(md)) {
530                r = -EAGAIN;
531                goto out;
532        }
533
534        if (tgt->type->ioctl)
535                r = tgt->type->ioctl(tgt, cmd, arg);
536
537out:
538        dm_put_live_table(md, srcu_idx);
539
540        if (r == -ENOTCONN) {
541                msleep(10);
542                goto retry;
543        }
544
545        return r;
546}
547
548static struct dm_io *alloc_io(struct mapped_device *md)
549{
550        return mempool_alloc(md->io_pool, GFP_NOIO);
551}
552
553static void free_io(struct mapped_device *md, struct dm_io *io)
554{
555        mempool_free(io, md->io_pool);
556}
557
558static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
559{
560        bio_put(&tio->clone);
561}
562
563static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
564                                            gfp_t gfp_mask)
565{
566        return mempool_alloc(md->io_pool, gfp_mask);
567}
568
569static void free_rq_tio(struct dm_rq_target_io *tio)
570{
571        mempool_free(tio, tio->md->io_pool);
572}
573
574static int md_in_flight(struct mapped_device *md)
575{
576        return atomic_read(&md->pending[READ]) +
577               atomic_read(&md->pending[WRITE]);
578}
579
580static void start_io_acct(struct dm_io *io)
581{
582        struct mapped_device *md = io->md;
583        struct bio *bio = io->bio;
584        int cpu;
585        int rw = bio_data_dir(bio);
586
587        io->start_time = jiffies;
588
589        cpu = part_stat_lock();
590        part_round_stats(cpu, &dm_disk(md)->part0);
591        part_stat_unlock();
592        atomic_set(&dm_disk(md)->part0.in_flight[rw],
593                atomic_inc_return(&md->pending[rw]));
594
595        if (unlikely(dm_stats_used(&md->stats)))
596                dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector,
597                                    bio_sectors(bio), false, 0, &io->stats_aux);
598}
599
600static void end_io_acct(struct dm_io *io)
601{
602        struct mapped_device *md = io->md;
603        struct bio *bio = io->bio;
604        unsigned long duration = jiffies - io->start_time;
605        int pending, cpu;
606        int rw = bio_data_dir(bio);
607
608        cpu = part_stat_lock();
609        part_round_stats(cpu, &dm_disk(md)->part0);
610        part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
611        part_stat_unlock();
612
613        if (unlikely(dm_stats_used(&md->stats)))
614                dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector,
615                                    bio_sectors(bio), true, duration, &io->stats_aux);
616
617        /*
618         * After this is decremented the bio must not be touched if it is
619         * a flush.
620         */
621        pending = atomic_dec_return(&md->pending[rw]);
622        atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
623        pending += atomic_read(&md->pending[rw^0x1]);
624
625        /* nudge anyone waiting on suspend queue */
626        if (!pending)
627                wake_up(&md->wait);
628}
629
630/*
631 * Add the bio to the list of deferred io.
632 */
633static void queue_io(struct mapped_device *md, struct bio *bio)
634{
635        unsigned long flags;
636
637        spin_lock_irqsave(&md->deferred_lock, flags);
638        bio_list_add(&md->deferred, bio);
639        spin_unlock_irqrestore(&md->deferred_lock, flags);
640        queue_work(md->wq, &md->work);
641}
642
643/*
644 * Everyone (including functions in this file) should use this
645 * function to access the md->map field, and make sure they call
646 * dm_put_live_table() when finished.
647 */
648struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
649{
650        *srcu_idx = srcu_read_lock(&md->io_barrier);
651
652        return srcu_dereference(md->map, &md->io_barrier);
653}
654
655void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
656{
657        srcu_read_unlock(&md->io_barrier, srcu_idx);
658}
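/*
 * Editor's sketch (not part of the original file), mirroring dm_blk_ioctl()
 * above: every access to md->map goes through this pair, with the SRCU
 * index threaded from get to put:
 *
 *      int srcu_idx;
 *      struct dm_table *map = dm_get_live_table(md, &srcu_idx);
 *
 *      if (map)
 *              ... use map, e.g. dm_table_get_size(map) ...
 *
 *      dm_put_live_table(md, srcu_idx);
 *
 * The caller may sleep between the two calls; for the non-sleeping fast
 * path see dm_get_live_table_fast()/dm_put_live_table_fast() below.
 */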
659
660void dm_sync_table(struct mapped_device *md)
661{
662        synchronize_srcu(&md->io_barrier);
663        synchronize_rcu_expedited();
664}
665
666/*
667 * A fast alternative to dm_get_live_table/dm_put_live_table.
668 * The caller must not block between these two functions.
669 */
670static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
671{
672        rcu_read_lock();
673        return rcu_dereference(md->map);
674}
675
676static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
677{
678        rcu_read_unlock();
679}
680
681/*
682 * Open a table device so we can use it as a map destination.
683 */
684static int open_table_device(struct table_device *td, dev_t dev,
685                             struct mapped_device *md)
686{
687        static char *_claim_ptr = "I belong to device-mapper";
688        struct block_device *bdev;
689
690        int r;
691
692        BUG_ON(td->dm_dev.bdev);
693
694        bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _claim_ptr);
695        if (IS_ERR(bdev))
696                return PTR_ERR(bdev);
697
698        r = bd_link_disk_holder(bdev, dm_disk(md));
699        if (r) {
700                blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
701                return r;
702        }
703
704        td->dm_dev.bdev = bdev;
705        return 0;
706}
707
708/*
709 * Close a table device that we've been using.
710 */
711static void close_table_device(struct table_device *td, struct mapped_device *md)
712{
713        if (!td->dm_dev.bdev)
714                return;
715
716        bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
717        blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
718        td->dm_dev.bdev = NULL;
719}
720
721static struct table_device *find_table_device(struct list_head *l, dev_t dev,
722                                              fmode_t mode) {
723        struct table_device *td;
724
725        list_for_each_entry(td, l, list)
726                if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
727                        return td;
728
729        return NULL;
730}
731
732int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
733                        struct dm_dev **result) {
734        int r;
735        struct table_device *td;
736
737        mutex_lock(&md->table_devices_lock);
738        td = find_table_device(&md->table_devices, dev, mode);
739        if (!td) {
740                td = kmalloc(sizeof(*td), GFP_KERNEL);
741                if (!td) {
742                        mutex_unlock(&md->table_devices_lock);
743                        return -ENOMEM;
744                }
745
746                td->dm_dev.mode = mode;
747                td->dm_dev.bdev = NULL;
748
749                if ((r = open_table_device(td, dev, md))) {
750                        mutex_unlock(&md->table_devices_lock);
751                        kfree(td);
752                        return r;
753                }
754
755                format_dev_t(td->dm_dev.name, dev);
756
757                atomic_set(&td->count, 0);
758                list_add(&td->list, &md->table_devices);
759        }
760        atomic_inc(&td->count);
761        mutex_unlock(&md->table_devices_lock);
762
763        *result = &td->dm_dev;
764        return 0;
765}
766EXPORT_SYMBOL_GPL(dm_get_table_device);
767
768void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
769{
770        struct table_device *td = container_of(d, struct table_device, dm_dev);
771
772        mutex_lock(&md->table_devices_lock);
773        if (atomic_dec_and_test(&td->count)) {
774                close_table_device(td, md);
775                list_del(&td->list);
776                kfree(td);
777        }
778        mutex_unlock(&md->table_devices_lock);
779}
780EXPORT_SYMBOL(dm_put_table_device);
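/*
 * Editor's sketch (not part of the original file): code that needs a block
 * device by dev_t, outside the usual dm_get_device() path, would pair the
 * two exports above roughly like this (dev and td are illustrative names):
 *
 *      struct dm_dev *td;
 *      int r = dm_get_table_device(md, dev, FMODE_READ | FMODE_WRITE, &td);
 *      if (r)
 *              return r;
 *      ... use td->bdev ...
 *      dm_put_table_device(md, td);
 *
 * Repeated gets of the same (dev, mode) pair share one table_device and are
 * reference counted; the underlying bdev is only closed when the count
 * drops to zero.
 */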
781
782static void free_table_devices(struct list_head *devices)
783{
784        struct list_head *tmp, *next;
785
786        list_for_each_safe(tmp, next, devices) {
787                struct table_device *td = list_entry(tmp, struct table_device, list);
788
789                DMWARN("dm_destroy: %s still exists with %d references",
790                       td->dm_dev.name, atomic_read(&td->count));
791                kfree(td);
792        }
793}
794
795/*
796 * Get the geometry associated with a dm device
797 */
798int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
799{
800        *geo = md->geometry;
801
802        return 0;
803}
804
805/*
806 * Set the geometry of a device.
807 */
808int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
809{
810        sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
811
812        if (geo->start > sz) {
813                DMWARN("Start sector is beyond the geometry limits.");
814                return -EINVAL;
815        }
816
817        md->geometry = *geo;
818
819        return 0;
820}
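/*
 * Editor's note (worked example, not part of the original file): with
 * cylinders = 1024, heads = 255, sectors = 63 the check above computes
 * sz = 1024 * 255 * 63 = 16,450,560 sectors (~7.8 GiB at 512 bytes per
 * sector), so any geo->start beyond that is rejected with -EINVAL.
 */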
821
822/*-----------------------------------------------------------------
823 * CRUD START:
824 *   A more elegant solution is in the works that uses the queue
825 *   merge fn, unfortunately there are a couple of changes to
826 *   the block layer that I want to make for this.  So in the
827 *   interests of getting something for people to use I give
828 *   you this clearly demarcated crap.
829 *---------------------------------------------------------------*/
830
831static int __noflush_suspending(struct mapped_device *md)
832{
833        return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
834}
835
836/*
837 * Decrements the number of outstanding ios that a bio has been
838 * cloned into, completing the original io if necessary.
839 */
840static void dec_pending(struct dm_io *io, int error)
841{
842        unsigned long flags;
843        int io_error;
844        struct bio *bio;
845        struct mapped_device *md = io->md;
846
847        /* Push-back supersedes any I/O errors */
848        if (unlikely(error)) {
849                spin_lock_irqsave(&io->endio_lock, flags);
850                if (!(io->error > 0 && __noflush_suspending(md)))
851                        io->error = error;
852                spin_unlock_irqrestore(&io->endio_lock, flags);
853        }
854
855        if (atomic_dec_and_test(&io->io_count)) {
856                if (io->error == DM_ENDIO_REQUEUE) {
857                        /*
858                         * Target requested pushing back the I/O.
859                         */
860                        spin_lock_irqsave(&md->deferred_lock, flags);
861                        if (__noflush_suspending(md))
862                                bio_list_add_head(&md->deferred, io->bio);
863                        else
864                                /* noflush suspend was interrupted. */
865                                io->error = -EIO;
866                        spin_unlock_irqrestore(&md->deferred_lock, flags);
867                }
868
869                io_error = io->error;
870                bio = io->bio;
871                end_io_acct(io);
872                free_io(md, io);
873
874                if (io_error == DM_ENDIO_REQUEUE)
875                        return;
876
877                if ((bio->bi_rw & REQ_FLUSH) && bio->bi_iter.bi_size) {
878                        /*
879                         * Preflush done for flush with data, reissue
880                         * without REQ_FLUSH.
881                         */
882                        bio->bi_rw &= ~REQ_FLUSH;
883                        queue_io(md, bio);
884                } else {
885                        /* done with normal IO or empty flush */
886                        trace_block_bio_complete(md->queue, bio, io_error);
887                        bio_endio(bio, io_error);
888                }
889        }
890}
891
892static void disable_write_same(struct mapped_device *md)
893{
894        struct queue_limits *limits = dm_get_queue_limits(md);
895
896        /* device doesn't really support WRITE SAME, disable it */
897        limits->max_write_same_sectors = 0;
898}
899
900static void clone_endio(struct bio *bio, int error)
901{
902        int r = error;
903        struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
904        struct dm_io *io = tio->io;
905        struct mapped_device *md = tio->io->md;
906        dm_endio_fn endio = tio->ti->type->end_io;
907
908        if (!bio_flagged(bio, BIO_UPTODATE) && !error)
909                error = -EIO;
910
911        if (endio) {
912                r = endio(tio->ti, bio, error);
913                if (r < 0 || r == DM_ENDIO_REQUEUE)
914                        /*
915                         * error and requeue request are handled
916                         * in dec_pending().
917                         */
918                        error = r;
919                else if (r == DM_ENDIO_INCOMPLETE)
920                        /* The target will handle the io */
921                        return;
922                else if (r) {
923                        DMWARN("unimplemented target endio return value: %d", r);
924                        BUG();
925                }
926        }
927
928        if (unlikely(r == -EREMOTEIO && (bio->bi_rw & REQ_WRITE_SAME) &&
929                     !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors))
930                disable_write_same(md);
931
932        free_tio(md, tio);
933        dec_pending(io, error);
934}
935
936/*
937 * Partial completion handling for request-based dm
938 */
939static void end_clone_bio(struct bio *clone, int error)
940{
941        struct dm_rq_clone_bio_info *info =
942                container_of(clone, struct dm_rq_clone_bio_info, clone);
943        struct dm_rq_target_io *tio = info->tio;
944        struct bio *bio = info->orig;
945        unsigned int nr_bytes = info->orig->bi_iter.bi_size;
946
947        bio_put(clone);
948
949        if (tio->error)
950                /*
951                 * An error has already been detected on the request.
952                 * Once an error has occurred, just let clone->end_io() handle
953                 * the remainder.
954                 */
955                return;
956        else if (error) {
957                /*
958                 * Don't report the error to the upper layer yet.
959                 * The error handling decision is made by the target driver,
960                 * when the request is completed.
961                 */
962                tio->error = error;
963                return;
964        }
965
966        /*
967         * I/O for the bio successfully completed.
968         * Report the data completion to the upper layer.
969         */
970
971        /*
972         * bios are processed from the head of the list.
973         * So the completing bio should always be rq->bio.
974         * If it's not, something wrong is happening.
975         * If it's not, something is wrong.
976        if (tio->orig->bio != bio)
977                DMERR("bio completion is going in the middle of the request");
978
979        /*
980         * Update the original request.
981         * Do not use blk_end_request() here, because it may complete
982         * the original request before the clone, and break the ordering.
983         */
984        blk_update_request(tio->orig, 0, nr_bytes);
985}
986
987/*
988 * Don't touch any member of the md after calling this function because
989 * the md may be freed in dm_put() at the end of this function.
990 * Or do dm_get() before calling this function and dm_put() later.
991 */
992static void rq_completed(struct mapped_device *md, int rw, int run_queue)
993{
994        atomic_dec(&md->pending[rw]);
995
996        /* nudge anyone waiting on suspend queue */
997        if (!md_in_flight(md))
998                wake_up(&md->wait);
999
1000        /*
1001         * Run this off this callpath, as drivers could invoke end_io while
1002         * inside their request_fn (and holding the queue lock). Calling
1003         * back into ->request_fn() could deadlock attempting to grab the
1004         * queue lock again.
1005         */
1006        if (run_queue)
1007                blk_run_queue_async(md->queue);
1008
1009        /*
1010         * dm_put() must be at the end of this function. See the comment above
1011         */
1012        dm_put(md);
1013}
1014
1015static void free_rq_clone(struct request *clone)
1016{
1017        struct dm_rq_target_io *tio = clone->end_io_data;
1018
1019        blk_rq_unprep_clone(clone);
1020        free_rq_tio(tio);
1021}
1022
1023/*
1024 * Complete the clone and the original request.
1025 * Must be called without queue lock.
1026 */
1027static void dm_end_request(struct request *clone, int error)
1028{
1029        int rw = rq_data_dir(clone);
1030        struct dm_rq_target_io *tio = clone->end_io_data;
1031        struct mapped_device *md = tio->md;
1032        struct request *rq = tio->orig;
1033
1034        if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
1035                rq->errors = clone->errors;
1036                rq->resid_len = clone->resid_len;
1037
1038                if (rq->sense)
1039                        /*
1040                         * We are using the sense buffer of the original
1041                         * request.
1042                         * So setting the length of the sense data is enough.
1043                         */
1044                        rq->sense_len = clone->sense_len;
1045        }
1046
1047        free_rq_clone(clone);
1048        blk_end_request_all(rq, error);
1049        rq_completed(md, rw, true);
1050}
1051
1052static void dm_unprep_request(struct request *rq)
1053{
1054        struct request *clone = rq->special;
1055
1056        rq->special = NULL;
1057        rq->cmd_flags &= ~REQ_DONTPREP;
1058
1059        free_rq_clone(clone);
1060}
1061
1062/*
1063 * Requeue the original request of a clone.
1064 */
1065void dm_requeue_unmapped_request(struct request *clone)
1066{
1067        int rw = rq_data_dir(clone);
1068        struct dm_rq_target_io *tio = clone->end_io_data;
1069        struct mapped_device *md = tio->md;
1070        struct request *rq = tio->orig;
1071        struct request_queue *q = rq->q;
1072        unsigned long flags;
1073
1074        dm_unprep_request(rq);
1075
1076        spin_lock_irqsave(q->queue_lock, flags);
1077        blk_requeue_request(q, rq);
1078        spin_unlock_irqrestore(q->queue_lock, flags);
1079
1080        rq_completed(md, rw, 0);
1081}
1082EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request);
1083
1084static void __stop_queue(struct request_queue *q)
1085{
1086        blk_stop_queue(q);
1087}
1088
1089static void stop_queue(struct request_queue *q)
1090{
1091        unsigned long flags;
1092
1093        spin_lock_irqsave(q->queue_lock, flags);
1094        __stop_queue(q);
1095        spin_unlock_irqrestore(q->queue_lock, flags);
1096}
1097
1098static void __start_queue(struct request_queue *q)
1099{
1100        if (blk_queue_stopped(q))
1101                blk_start_queue(q);
1102}
1103
1104static void start_queue(struct request_queue *q)
1105{
1106        unsigned long flags;
1107
1108        spin_lock_irqsave(q->queue_lock, flags);
1109        __start_queue(q);
1110        spin_unlock_irqrestore(q->queue_lock, flags);
1111}
1112
1113static void dm_done(struct request *clone, int error, bool mapped)
1114{
1115        int r = error;
1116        struct dm_rq_target_io *tio = clone->end_io_data;
1117        dm_request_endio_fn rq_end_io = NULL;
1118
1119        if (tio->ti) {
1120                rq_end_io = tio->ti->type->rq_end_io;
1121
1122                if (mapped && rq_end_io)
1123                        r = rq_end_io(tio->ti, clone, error, &tio->info);
1124        }
1125
1126        if (unlikely(r == -EREMOTEIO && (clone->cmd_flags & REQ_WRITE_SAME) &&
1127                     !clone->q->limits.max_write_same_sectors))
1128                disable_write_same(tio->md);
1129
1130        if (r <= 0)
1131                /* The target wants to complete the I/O */
1132                dm_end_request(clone, r);
1133        else if (r == DM_ENDIO_INCOMPLETE)
1134                /* The target will handle the I/O */
1135                return;
1136        else if (r == DM_ENDIO_REQUEUE)
1137                /* The target wants to requeue the I/O */
1138                dm_requeue_unmapped_request(clone);
1139        else {
1140                DMWARN("unimplemented target endio return value: %d", r);
1141                BUG();
1142        }
1143}
1144
1145/*
1146 * Request completion handler for request-based dm
1147 */
1148static void dm_softirq_done(struct request *rq)
1149{
1150        bool mapped = true;
1151        struct request *clone = rq->completion_data;
1152        struct dm_rq_target_io *tio = clone->end_io_data;
1153
1154        if (rq->cmd_flags & REQ_FAILED)
1155                mapped = false;
1156
1157        dm_done(clone, tio->error, mapped);
1158}
1159
1160/*
1161 * Complete the clone and the original request with the error status
1162 * through softirq context.
1163 */
1164static void dm_complete_request(struct request *clone, int error)
1165{
1166        struct dm_rq_target_io *tio = clone->end_io_data;
1167        struct request *rq = tio->orig;
1168
1169        tio->error = error;
1170        rq->completion_data = clone;
1171        blk_complete_request(rq);
1172}
1173
1174/*
1175 * Complete the unmapped clone and the original request with the error status
1176 * through softirq context.
1177 * Target's rq_end_io() function isn't called.
1178 * This may be used when the target's map_rq() function fails.
1179 */
1180void dm_kill_unmapped_request(struct request *clone, int error)
1181{
1182        struct dm_rq_target_io *tio = clone->end_io_data;
1183        struct request *rq = tio->orig;
1184
1185        rq->cmd_flags |= REQ_FAILED;
1186        dm_complete_request(clone, error);
1187}
1188EXPORT_SYMBOL_GPL(dm_kill_unmapped_request);
1189
1190/*
1191 * Called with the queue lock held
1192 */
1193static void end_clone_request(struct request *clone, int error)
1194{
1195        /*
1196         * This just cleans up the bookkeeping of the queue in which
1197         * the clone was dispatched.
1198         * The clone is *NOT* actually freed here because it is allocated from
1199         * dm's own mempool and REQ_ALLOCED isn't set in clone->cmd_flags.
1200         */
1201        __blk_put_request(clone->q, clone);
1202
1203        /*
1204         * Actual request completion is done in a softirq context which doesn't
1205         * hold the queue lock.  Otherwise, deadlock could occur because:
1206         *     - another request may be submitted by the upper level driver
1207         *       of the stacking during the completion
1208         *     - the submission which requires queue lock may be done
1209         *       against this queue
1210         */
1211        dm_complete_request(clone, error);
1212}
1213
1214/*
1215 * Return maximum size of I/O possible at the supplied sector up to the current
1216 * target boundary.
1217 */
1218static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
1219{
1220        sector_t target_offset = dm_target_offset(ti, sector);
1221
1222        return ti->len - target_offset;
1223}
1224
1225static sector_t max_io_len(sector_t sector, struct dm_target *ti)
1226{
1227        sector_t len = max_io_len_target_boundary(sector, ti);
1228        sector_t offset, max_len;
1229
1230        /*
1231         * Does the target need to split even further?
1232         */
1233        if (ti->max_io_len) {
1234                offset = dm_target_offset(ti, sector);
1235                if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
1236                        max_len = sector_div(offset, ti->max_io_len);
1237                else
1238                        max_len = offset & (ti->max_io_len - 1);
1239                max_len = ti->max_io_len - max_len;
1240
1241                if (len > max_len)
1242                        len = max_len;
1243        }
1244
1245        return len;
1246}
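/*
 * Editor's note (worked example, not part of the original file): suppose
 * max_io_len_target_boundary() returns 1000 (the bio starts 1000 sectors
 * before the end of the target) and ti->max_io_len is 256, a power of two.
 * For an offset of 300 sectors into the target the code above computes
 * max_len = 256 - (300 & 255) = 256 - 44 = 212, so the I/O is clipped to
 * min(1000, 212) = 212 sectors and the next piece starts on a 256-sector
 * boundary within the target.
 */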
1247
1248int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
1249{
1250        if (len > UINT_MAX) {
1251                DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
1252                      (unsigned long long)len, UINT_MAX);
1253                ti->error = "Maximum size of target IO is too large";
1254                return -EINVAL;
1255        }
1256
1257        ti->max_io_len = (uint32_t) len;
1258
1259        return 0;
1260}
1261EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
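/*
 * Editor's sketch (not part of the original file): a target whose layout
 * works in fixed chunks would typically call this from its constructor,
 * e.g. (chunk_sectors is a hypothetical per-target value):
 *
 *      r = dm_set_target_max_io_len(ti, chunk_sectors);
 *      if (r)
 *              return r;
 *
 * ti->error has already been set by the helper on failure; afterwards
 * max_io_len() never hands the target a bio crossing a chunk boundary.
 */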
1262
1263/*
1264 * A target may call dm_accept_partial_bio only from the map routine.  It is
1265 * allowed for all bio types except REQ_FLUSH.
1266 *
1267 * dm_accept_partial_bio informs the dm that the target only wants to process
1268 * additional n_sectors sectors of the bio and the rest of the data should be
1269 * sent in a next bio.
1270 *
1271 * A diagram that explains the arithmetic:
1272 * +--------------------+---------------+-------+
1273 * |         1          |       2       |   3   |
1274 * +--------------------+---------------+-------+
1275 *
1276 * <-------------- *tio->len_ptr --------------->
1277 *                      <------- bi_size ------->
1278 *                      <-- n_sectors -->
1279 *
1280 * Region 1 was already iterated over with bio_advance or similar function.
1281 *      (it may be empty if the target doesn't use bio_advance)
1282 * Region 2 is the remaining bio size that the target wants to process.
1283 *      (it may be empty if region 1 is non-empty, although there is no reason
1284 *       to make it empty)
1285 * The target requires that region 3 is to be sent in the next bio.
1286 *
1287 * If the target wants to receive multiple copies of the bio (via num_*bios, etc),
1288 * the partially processed part (the sum of regions 1+2) must be the same for all
1289 * copies of the bio.
1290 */
1291void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
1292{
1293        struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
1294        unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
1295        BUG_ON(bio->bi_rw & REQ_FLUSH);
1296        BUG_ON(bi_size > *tio->len_ptr);
1297        BUG_ON(n_sectors > bi_size);
1298        *tio->len_ptr -= bi_size - n_sectors;
1299        bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
1300}
1301EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
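/*
 * Editor's sketch (not part of the original file): inside a target's .map
 * method, accepting only part of a bio looks roughly like this
 * (my_target_map and max_sectors are hypothetical names):
 *
 *      static int my_target_map(struct dm_target *ti, struct bio *bio)
 *      {
 *              if (bio_sectors(bio) > max_sectors)
 *                      dm_accept_partial_bio(bio, max_sectors);
 *              ... remap bio->bi_bdev / bi_sector as usual ...
 *              return DM_MAPIO_REMAPPED;
 *      }
 *
 * The core then resubmits the trimmed-off remainder (region 3 in the
 * diagram above) as a fresh bio; the snapshot origin target uses this, for
 * example, to keep a write within a single chunk.
 */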
1302
1303/*
1304 * Flush current->bio_list when the target map method blocks.
1305 * This fixes deadlocks in snapshot and possibly in other targets.
1306 */
1307struct dm_offload {
1308        struct blk_plug plug;
1309        struct blk_plug_cb cb;
1310};
1311
1312static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule)
1313{
1314        struct dm_offload *o = container_of(cb, struct dm_offload, cb);
1315        struct bio_list list;
1316        struct bio *bio;
1317
1318        INIT_LIST_HEAD(&o->cb.list);
1319
1320        if (unlikely(!current->bio_list))
1321                return;
1322
1323        list = *current->bio_list;
1324        bio_list_init(current->bio_list);
1325
1326        while ((bio = bio_list_pop(&list))) {
1327                struct bio_set *bs = bio->bi_pool;
1328                if (unlikely(!bs) || bs == fs_bio_set) {
1329                        bio_list_add(current->bio_list, bio);
1330                        continue;
1331                }
1332
1333                spin_lock(&bs->rescue_lock);
1334                bio_list_add(&bs->rescue_list, bio);
1335                queue_work(bs->rescue_workqueue, &bs->rescue_work);
1336                spin_unlock(&bs->rescue_lock);
1337        }
1338}
1339
1340static void dm_offload_start(struct dm_offload *o)
1341{
1342        blk_start_plug(&o->plug);
1343        o->cb.callback = flush_current_bio_list;
1344        list_add(&o->cb.list, &current->plug->cb_list);
1345}
1346
1347static void dm_offload_end(struct dm_offload *o)
1348{
1349        list_del(&o->cb.list);
1350        blk_finish_plug(&o->plug);
1351}
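/*
 * Editor's note (not part of the original file): the two helpers above are
 * meant to bracket a potentially blocking ->map() call, exactly as
 * __map_bio() does below:
 *
 *      struct dm_offload o;
 *
 *      dm_offload_start(&o);
 *      r = ti->type->map(ti, clone);
 *      dm_offload_end(&o);
 *
 * If the map method blocks, the plug callback drains current->bio_list into
 * each bio_set's rescue workqueue instead of leaving those bios stuck
 * behind the sleeping task.
 */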
1352
1353static void __map_bio(struct dm_target_io *tio)
1354{
1355        int r;
1356        sector_t sector;
1357        struct mapped_device *md;
1358        struct dm_offload o;
1359        struct bio *clone = &tio->clone;
1360        struct dm_target *ti = tio->ti;
1361
1362        clone->bi_end_io = clone_endio;
1363
1364        /*
1365         * Map the clone.  If r == 0 we don't need to do
1366         * anything, the target has assumed ownership of
1367         * this io.
1368         */
1369        atomic_inc(&tio->io->io_count);
1370        sector = clone->bi_iter.bi_sector;
1371
1372        dm_offload_start(&o);
1373        r = ti->type->map(ti, clone);
1374        dm_offload_end(&o);
1375
1376        if (r == DM_MAPIO_REMAPPED) {
1377                /* the bio has been remapped so dispatch it */
1378
1379                trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
1380                                      tio->io->bio->bi_bdev->bd_dev, sector);
1381
1382                generic_make_request(clone);
1383        } else if (r < 0 || r == DM_MAPIO_REQUEUE) {
1384                /* error the io and bail out, or requeue it if needed */
1385                md = tio->io->md;
1386                dec_pending(tio->io, r);
1387                free_tio(md, tio);
1388        } else if (r) {
1389                DMWARN("unimplemented target map return value: %d", r);
1390                BUG();
1391        }
1392}
1393
1394struct clone_info {
1395        struct mapped_device *md;
1396        struct dm_table *map;
1397        struct bio *bio;
1398        struct dm_io *io;
1399        sector_t sector;
1400        unsigned sector_count;
1401};
1402
1403static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
1404{
1405        bio->bi_iter.bi_sector = sector;
1406        bio->bi_iter.bi_size = to_bytes(len);
1407}
1408
1409/*
1410 * Creates a bio that consists of range of complete bvecs.
1411 */
1412static void clone_bio(struct dm_target_io *tio, struct bio *bio,
1413                      sector_t sector, unsigned len)
1414{
1415        struct bio *clone = &tio->clone;
1416
1417        __bio_clone_fast(clone, bio);
1418
1419        if (bio_integrity(bio))
1420                bio_integrity_clone(clone, bio, GFP_NOIO);
1421
1422        bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
1423        clone->bi_iter.bi_size = to_bytes(len);
1424
1425        if (bio_integrity(bio))
1426                bio_integrity_trim(clone, 0, len);
1427}
1428
1429static struct dm_target_io *alloc_tio(struct clone_info *ci,
1430                                      struct dm_target *ti,
1431                                      unsigned target_bio_nr)
1432{
1433        struct dm_target_io *tio;
1434        struct bio *clone;
1435
1436        clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs);
1437        tio = container_of(clone, struct dm_target_io, clone);
1438
1439        tio->io = ci->io;
1440        tio->ti = ti;
1441        tio->target_bio_nr = target_bio_nr;
1442
1443        return tio;
1444}
1445
1446static void __clone_and_map_simple_bio(struct clone_info *ci,
1447                                       struct dm_target *ti,
1448                                       unsigned target_bio_nr, unsigned *len)
1449{
1450        struct dm_target_io *tio = alloc_tio(ci, ti, target_bio_nr);
1451        struct bio *clone = &tio->clone;
1452
1453        tio->len_ptr = len;
1454
1455        __bio_clone_fast(clone, ci->bio);
1456        if (len)
1457                bio_setup_sector(clone, ci->sector, *len);
1458
1459        __map_bio(tio);
1460}
1461
1462static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
1463                                  unsigned num_bios, unsigned *len)
1464{
1465        unsigned target_bio_nr;
1466
1467        for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++)
1468                __clone_and_map_simple_bio(ci, ti, target_bio_nr, len);
1469}
1470
1471static int __send_empty_flush(struct clone_info *ci)
1472{
1473        unsigned target_nr = 0;
1474        struct dm_target *ti;
1475
1476        BUG_ON(bio_has_data(ci->bio));
1477        while ((ti = dm_table_get_target(ci->map, target_nr++)))
1478                __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
1479
1480        return 0;
1481}
1482
1483static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
1484                                     sector_t sector, unsigned *len)
1485{
1486        struct bio *bio = ci->bio;
1487        struct dm_target_io *tio;
1488        unsigned target_bio_nr;
1489        unsigned num_target_bios = 1;
1490
1491        /*
1492         * Does the target want to receive duplicate copies of the bio?
1493         */
1494        if (bio_data_dir(bio) == WRITE && ti->num_write_bios)
1495                num_target_bios = ti->num_write_bios(ti, bio);
1496
1497        for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
1498                tio = alloc_tio(ci, ti, target_bio_nr);
1499                tio->len_ptr = len;
1500                clone_bio(tio, bio, sector, *len);
1501                __map_bio(tio);
1502        }
1503}
1504
1505typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
1506
1507static unsigned get_num_discard_bios(struct dm_target *ti)
1508{
1509        return ti->num_discard_bios;
1510}
1511
1512static unsigned get_num_write_same_bios(struct dm_target *ti)
1513{
1514        return ti->num_write_same_bios;
1515}
1516
1517typedef bool (*is_split_required_fn)(struct dm_target *ti);
1518
1519static bool is_split_required_for_discard(struct dm_target *ti)
1520{
1521        return ti->split_discard_bios;
1522}
1523
1524static int __send_changing_extent_only(struct clone_info *ci,
1525                                       get_num_bios_fn get_num_bios,
1526                                       is_split_required_fn is_split_required)
1527{
1528        struct dm_target *ti;
1529        unsigned len;
1530        unsigned num_bios;
1531
1532        do {
1533                ti = dm_table_find_target(ci->map, ci->sector);
1534                if (!dm_target_is_valid(ti))
1535                        return -EIO;
1536
1537                /*
1538                 * Even though the device advertised support for this type of
1539                 * request, that does not mean every target supports it, and
1540                 * reconfiguration might also have changed that since the
1541                 * check was performed.
1542                 */
1543                num_bios = get_num_bios ? get_num_bios(ti) : 0;
1544                if (!num_bios)
1545                        return -EOPNOTSUPP;
1546
1547                if (is_split_required && !is_split_required(ti))
1548                        len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1549                else
1550                        len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));
1551
1552                __send_duplicate_bios(ci, ti, num_bios, &len);
1553
1554                ci->sector += len;
1555        } while (ci->sector_count -= len);
1556
1557        return 0;
1558}
1559
1560static int __send_discard(struct clone_info *ci)
1561{
1562        return __send_changing_extent_only(ci, get_num_discard_bios,
1563                                           is_split_required_for_discard);
1564}
1565
1566static int __send_write_same(struct clone_info *ci)
1567{
1568        return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
1569}
1570
1571/*
1572 * Select the correct strategy for processing a non-flush bio.
1573 */
1574static int __split_and_process_non_flush(struct clone_info *ci)
1575{
1576        struct bio *bio = ci->bio;
1577        struct dm_target *ti;
1578        unsigned len;
1579
1580        if (unlikely(bio->bi_rw & REQ_DISCARD))
1581                return __send_discard(ci);
1582        else if (unlikely(bio->bi_rw & REQ_WRITE_SAME))
1583                return __send_write_same(ci);
1584
1585        ti = dm_table_find_target(ci->map, ci->sector);
1586        if (!dm_target_is_valid(ti))
1587                return -EIO;
1588
1589        len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count);
1590
1591        __clone_and_map_data_bio(ci, ti, ci->sector, &len);
1592
1593        ci->sector += len;
1594        ci->sector_count -= len;
1595
1596        return 0;
1597}
1598
1599/*
1600 * Entry point to split a bio into clones and submit them to the targets.
1601 */
1602static void __split_and_process_bio(struct mapped_device *md,
1603                                    struct dm_table *map, struct bio *bio)
1604{
1605        struct clone_info ci;
1606        int error = 0;
1607
1608        if (unlikely(!map)) {
1609                bio_io_error(bio);
1610                return;
1611        }
1612
1613        ci.map = map;
1614        ci.md = md;
1615        ci.io = alloc_io(md);
1616        ci.io->error = 0;
1617        atomic_set(&ci.io->io_count, 1);
1618        ci.io->bio = bio;
1619        ci.io->md = md;
1620        spin_lock_init(&ci.io->endio_lock);
1621        ci.sector = bio->bi_iter.bi_sector;
1622
1623        start_io_acct(ci.io);
1624
1625        if (bio->bi_rw & REQ_FLUSH) {
1626                ci.bio = &ci.md->flush_bio;
1627                ci.sector_count = 0;
1628                error = __send_empty_flush(&ci);
1629                /* dec_pending submits any data associated with flush */
1630        } else {
1631                ci.bio = bio;
1632                ci.sector_count = bio_sectors(bio);
1633                while (ci.sector_count && !error)
1634                        error = __split_and_process_non_flush(&ci);
1635        }
1636
1637        /* drop the extra reference count */
1638        dec_pending(ci.io, error);
1639}
1640/*-----------------------------------------------------------------
1641 * CRUD END
1642 *---------------------------------------------------------------*/
1643
1644static int dm_merge_bvec(struct request_queue *q,
1645                         struct bvec_merge_data *bvm,
1646                         struct bio_vec *biovec)
1647{
1648        struct mapped_device *md = q->queuedata;
1649        struct dm_table *map = dm_get_live_table_fast(md);
1650        struct dm_target *ti;
1651        sector_t max_sectors;
1652        int max_size = 0;
1653
1654        if (unlikely(!map))
1655                goto out;
1656
1657        ti = dm_table_find_target(map, bvm->bi_sector);
1658        if (!dm_target_is_valid(ti))
1659                goto out;
1660
1661        /*
1662         * Find maximum amount of I/O that won't need splitting
1663         */
1664        max_sectors = min(max_io_len(bvm->bi_sector, ti),
1665                          (sector_t) BIO_MAX_SECTORS);
1666        max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
1667        if (max_size < 0)
1668                max_size = 0;
1669
1670        /*
1671         * merge_bvec_fn() returns the number of bytes
1672         * it can accept at this offset;
1673         * max_size is the precomputed maximal I/O size
1674         */
1675        if (max_size && ti->type->merge)
1676                max_size = ti->type->merge(ti, bvm, biovec, max_size);
1677        /*
1678         * If the target doesn't support merge method and some of the devices
1679         * provided their merge_bvec method (we know this by looking at
1680         * queue_max_hw_sectors), then we can't allow bios with multiple vector
1681         * entries.  So always set max_size to 0, and the code below allows
1682         * just one page.
1683         */
1684        else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
1685                max_size = 0;
1686
1687out:
1688        dm_put_live_table_fast(md);
1689        /*
1690         * Always allow an entire first page
1691         */
1692        if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
1693                max_size = biovec->bv_len;
1694
1695        return max_size;
1696}
1697
1698/*
1699 * The request function that just remaps the bio built up by
1700 * dm_merge_bvec.
1701 */
1702static void _dm_request(struct request_queue *q, struct bio *bio)
1703{
1704        int rw = bio_data_dir(bio);
1705        struct mapped_device *md = q->queuedata;
1706        int cpu;
1707        int srcu_idx;
1708        struct dm_table *map;
1709
1710        map = dm_get_live_table(md, &srcu_idx);
1711
1712        cpu = part_stat_lock();
1713        part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
1714        part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
1715        part_stat_unlock();
1716
1717        /* if we're suspended, we have to queue this io for later */
1718        if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1719                dm_put_live_table(md, srcu_idx);
1720
1721                if (bio_rw(bio) != READA)
1722                        queue_io(md, bio);
1723                else
1724                        bio_io_error(bio);
1725                return;
1726        }
1727
1728        __split_and_process_bio(md, map, bio);
1729        dm_put_live_table(md, srcu_idx);
1730        return;
1731}
1732
1733int dm_request_based(struct mapped_device *md)
1734{
1735        return blk_queue_stackable(md->queue);
1736}
1737
1738static void dm_request(struct request_queue *q, struct bio *bio)
1739{
1740        struct mapped_device *md = q->queuedata;
1741
1742        if (dm_request_based(md))
1743                blk_queue_bio(q, bio);
1744        else
1745                _dm_request(q, bio);
1746}
1747
1748void dm_dispatch_request(struct request *rq)
1749{
1750        int r;
1751
1752        if (blk_queue_io_stat(rq->q))
1753                rq->cmd_flags |= REQ_IO_STAT;
1754
1755        rq->start_time = jiffies;
1756        r = blk_insert_cloned_request(rq->q, rq);
1757        if (r)
1758                dm_complete_request(rq, r);
1759}
1760EXPORT_SYMBOL_GPL(dm_dispatch_request);
1761
1762static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
1763                                 void *data)
1764{
1765        struct dm_rq_target_io *tio = data;
1766        struct dm_rq_clone_bio_info *info =
1767                container_of(bio, struct dm_rq_clone_bio_info, clone);
1768
1769        info->orig = bio_orig;
1770        info->tio = tio;
1771        bio->bi_end_io = end_clone_bio;
1772
1773        return 0;
1774}
1775
1776static int setup_clone(struct request *clone, struct request *rq,
1777                       struct dm_rq_target_io *tio)
1778{
1779        int r;
1780
1781        r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
1782                              dm_rq_bio_constructor, tio);
1783        if (r)
1784                return r;
1785
1786        clone->cmd = rq->cmd;
1787        clone->cmd_len = rq->cmd_len;
1788        clone->sense = rq->sense;
1789        clone->end_io = end_clone_request;
1790        clone->end_io_data = tio;
1791
1792        return 0;
1793}
1794
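/*
 * Allocate a dm_rq_target_io for rq and set up the request clone embedded
 * in it.  Returns NULL if the allocation or clone setup fails.
 */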
1795static struct request *clone_rq(struct request *rq, struct mapped_device *md,
1796                                gfp_t gfp_mask)
1797{
1798        struct request *clone;
1799        struct dm_rq_target_io *tio;
1800
1801        tio = alloc_rq_tio(md, gfp_mask);
1802        if (!tio)
1803                return NULL;
1804
1805        tio->md = md;
1806        tio->ti = NULL;
1807        tio->orig = rq;
1808        tio->error = 0;
1809        memset(&tio->info, 0, sizeof(tio->info));
1810
1811        clone = &tio->clone;
1812        if (setup_clone(clone, rq, tio)) {
1813                /* -ENOMEM */
1814                free_rq_tio(tio);
1815                return NULL;
1816        }
1817
1818        return clone;
1819}
1820
1821/*
1822 * Called with the queue lock held.
1823 */
1824static int dm_prep_fn(struct request_queue *q, struct request *rq)
1825{
1826        struct mapped_device *md = q->queuedata;
1827        struct request *clone;
1828
1829        if (unlikely(rq->special)) {
1830                DMWARN("Already has something in rq->special.");
1831                return BLKPREP_KILL;
1832        }
1833
1834        clone = clone_rq(rq, md, GFP_ATOMIC);
1835        if (!clone)
1836                return BLKPREP_DEFER;
1837
1838        rq->special = clone;
1839        rq->cmd_flags |= REQ_DONTPREP;
1840
1841        return BLKPREP_OK;
1842}
1843
1844/*
1845 * Returns:
1846 * 0  : the request has been processed (not requeued)
1847 * !0 : the request has been requeued
1848 */
1849static int map_request(struct dm_target *ti, struct request *clone,
1850                       struct mapped_device *md)
1851{
1852        int r, requeued = 0;
1853        struct dm_rq_target_io *tio = clone->end_io_data;
1854
1855        tio->ti = ti;
1856        r = ti->type->map_rq(ti, clone, &tio->info);
1857        switch (r) {
1858        case DM_MAPIO_SUBMITTED:
1859                /* The target has taken the I/O to submit by itself later */
1860                break;
1861        case DM_MAPIO_REMAPPED:
1862                /* The target has remapped the I/O so dispatch it */
1863                trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
1864                                     blk_rq_pos(tio->orig));
1865                dm_dispatch_request(clone);
1866                break;
1867        case DM_MAPIO_REQUEUE:
1868                /* The target wants to requeue the I/O */
1869                dm_requeue_unmapped_request(clone);
1870                requeued = 1;
1871                break;
1872        default:
1873                if (r > 0) {
1874                        DMWARN("unimplemented target map return value: %d", r);
1875                        BUG();
1876                }
1877
1878                /* The target wants to complete the I/O */
1879                dm_kill_unmapped_request(clone, r);
1880                break;
1881        }
1882
1883        return requeued;
1884}
1885
1886static struct request *dm_start_request(struct mapped_device *md, struct request *orig)
1887{
1888        struct request *clone;
1889
1890        blk_start_request(orig);
1891        clone = orig->special;
1892        atomic_inc(&md->pending[rq_data_dir(clone)]);
1893
1894        /*
1895         * Hold the md reference here for the in-flight I/O.
1896         * We can't rely on the reference count taken by the device opener,
1897         * because the device may be closed while the request completes,
1898         * once all of its bios have finished.
1899         * See the comment in rq_completed() too.
1900         */
1901        dm_get(md);
1902
1903        return clone;
1904}
1905
1906/*
1907 * q->request_fn for request-based dm.
1908 * Called with the queue lock held.
1909 */
1910static void dm_request_fn(struct request_queue *q)
1911{
1912        struct mapped_device *md = q->queuedata;
1913        int srcu_idx;
1914        struct dm_table *map = dm_get_live_table(md, &srcu_idx);
1915        struct dm_target *ti;
1916        struct request *rq, *clone;
1917        sector_t pos;
1918
1919        /*
1920         * For suspend, check blk_queue_stopped() and increment
1921         * ->pending under a single queue_lock so that the number of
1922         * in-flight I/Os is not incremented after the queue has been
1923         * stopped in dm_suspend().
1924         */
1925        while (!blk_queue_stopped(q)) {
1926                rq = blk_peek_request(q);
1927                if (!rq)
1928                        goto delay_and_out;
1929
1930                /* always use block 0 to find the target for flushes for now */
1931                pos = 0;
1932                if (!(rq->cmd_flags & REQ_FLUSH))
1933                        pos = blk_rq_pos(rq);
1934
1935                ti = dm_table_find_target(map, pos);
1936                if (!dm_target_is_valid(ti)) {
1937                        /*
1938                         * Must perform the setup that dm_done() requires
1939                         * before calling dm_kill_unmapped_request().
1940                         */
1941                        DMERR_LIMIT("request attempted access beyond the end of device");
1942                        clone = dm_start_request(md, rq);
1943                        dm_kill_unmapped_request(clone, -EIO);
1944                        continue;
1945                }
1946
1947                if (ti->type->busy && ti->type->busy(ti))
1948                        goto delay_and_out;
1949
1950                clone = dm_start_request(md, rq);
1951
1952                spin_unlock(q->queue_lock);
1953                if (map_request(ti, clone, md))
1954                        goto requeued;
1955
1956                BUG_ON(!irqs_disabled());
1957                spin_lock(q->queue_lock);
1958        }
1959
1960        goto out;
1961
1962requeued:
1963        BUG_ON(!irqs_disabled());
1964        spin_lock(q->queue_lock);
1965
1966delay_and_out:
1967        blk_delay_queue(q, HZ / 10);
1968out:
1969        dm_put_live_table(md, srcu_idx);
1970}
1971
1972int dm_underlying_device_busy(struct request_queue *q)
1973{
1974        return blk_lld_busy(q);
1975}
1976EXPORT_SYMBOL_GPL(dm_underlying_device_busy);
1977
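/*
 * lld_busy callback: report the device as busy if there is no live table,
 * if I/O is blocked for suspend, or if any of its targets reports busy.
 */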
1978static int dm_lld_busy(struct request_queue *q)
1979{
1980        int r;
1981        struct mapped_device *md = q->queuedata;
1982        struct dm_table *map = dm_get_live_table_fast(md);
1983
1984        if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
1985                r = 1;
1986        else
1987                r = dm_table_any_busy_target(map);
1988
1989        dm_put_live_table_fast(md);
1990
1991        return r;
1992}
1993
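/*
 * backing_dev_info congested callback, installed by dm_init_md_queue().
 */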
1994static int dm_any_congested(void *congested_data, int bdi_bits)
1995{
1996        int r = bdi_bits;
1997        struct mapped_device *md = congested_data;
1998        struct dm_table *map;
1999
2000        if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2001                map = dm_get_live_table_fast(md);
2002                if (map) {
2003                        /*
2004                         * Request-based dm cares only about its own queue when
2005                         * queried for the congestion status of the request_queue.
2006                         */
2007                        if (dm_request_based(md))
2008                                r = md->queue->backing_dev_info.state &
2009                                    bdi_bits;
2010                        else
2011                                r = dm_table_any_congested(map, bdi_bits);
2012                }
2013                dm_put_live_table_fast(md);
2014        }
2015
2016        return r;
2017}
2018
2019/*-----------------------------------------------------------------
2020 * An IDR is used to keep track of allocated minor numbers.
2021 *---------------------------------------------------------------*/
2022static void free_minor(int minor)
2023{
2024        spin_lock(&_minor_lock);
2025        idr_remove(&_minor_idr, minor);
2026        spin_unlock(&_minor_lock);
2027}
2028
2029/*
2030 * See if the device with a specific minor # is free.
2031 */
2032static int specific_minor(int minor)
2033{
2034        int r;
2035
2036        if (minor >= (1 << MINORBITS))
2037                return -EINVAL;
2038
2039        idr_preload(GFP_KERNEL);
2040        spin_lock(&_minor_lock);
2041
2042        r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
2043
2044        spin_unlock(&_minor_lock);
2045        idr_preload_end();
2046        if (r < 0)
2047                return r == -ENOSPC ? -EBUSY : r;
2048        return 0;
2049}
2050
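/*
 * Allocate the lowest unused minor number.
 */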
2051static int next_free_minor(int *minor)
2052{
2053        int r;
2054
2055        idr_preload(GFP_KERNEL);
2056        spin_lock(&_minor_lock);
2057
2058        r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
2059
2060        spin_unlock(&_minor_lock);
2061        idr_preload_end();
2062        if (r < 0)
2063                return r;
2064        *minor = r;
2065        return 0;
2066}
2067
2068static const struct block_device_operations dm_blk_dops;
2069
2070static void dm_wq_work(struct work_struct *work);
2071
2072static void dm_init_md_queue(struct mapped_device *md)
2073{
2074        /*
2075         * Request-based dm devices cannot be stacked on top of bio-based dm
2076         * devices.  The type of this dm device has not been decided yet.
2077         * The type is decided when the first table is loaded.
2078         * To prevent problematic device stacking, clear the queue flag
2079         * for request stacking support until then.
2080         *
2081         * This queue is new, so no concurrency on the queue_flags.
2082         */
2083        queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
2084
2085        md->queue->queuedata = md;
2086        md->queue->backing_dev_info.congested_fn = dm_any_congested;
2087        md->queue->backing_dev_info.congested_data = md;
2088        blk_queue_make_request(md->queue, dm_request);
2089        blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
2090        blk_queue_merge_bvec(md->queue, dm_merge_bvec);
2091}
2092
2093/*
2094 * Allocate and initialise a blank device with a given minor.
2095 */
2096static struct mapped_device *alloc_dev(int minor)
2097{
2098        int r;
2099        struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL);
2100        void *old_md;
2101
2102        if (!md) {
2103                DMWARN("unable to allocate device, out of memory.");
2104                return NULL;
2105        }
2106
2107        if (!try_module_get(THIS_MODULE))
2108                goto bad_module_get;
2109
2110        /* get a minor number for the dev */
2111        if (minor == DM_ANY_MINOR)
2112                r = next_free_minor(&minor);
2113        else
2114                r = specific_minor(minor);
2115        if (r < 0)
2116                goto bad_minor;
2117
2118        r = init_srcu_struct(&md->io_barrier);
2119        if (r < 0)
2120                goto bad_io_barrier;
2121
2122        md->type = DM_TYPE_NONE;
2123        mutex_init(&md->suspend_lock);
2124        mutex_init(&md->type_lock);
2125        mutex_init(&md->table_devices_lock);
2126        spin_lock_init(&md->deferred_lock);
2127        atomic_set(&md->holders, 1);
2128        atomic_set(&md->open_count, 0);
2129        atomic_set(&md->event_nr, 0);
2130        atomic_set(&md->uevent_seq, 0);
2131        INIT_LIST_HEAD(&md->uevent_list);
2132        INIT_LIST_HEAD(&md->table_devices);
2133        spin_lock_init(&md->uevent_lock);
2134
2135        md->queue = blk_alloc_queue(GFP_KERNEL);
2136        if (!md->queue)
2137                goto bad_queue;
2138
2139        dm_init_md_queue(md);
2140
2141        md->disk = alloc_disk(1);
2142        if (!md->disk)
2143                goto bad_disk;
2144
2145        atomic_set(&md->pending[0], 0);
2146        atomic_set(&md->pending[1], 0);
2147        init_waitqueue_head(&md->wait);
2148        INIT_WORK(&md->work, dm_wq_work);
2149        init_waitqueue_head(&md->eventq);
2150        init_completion(&md->kobj_holder.completion);
2151
2152        md->disk->major = _major;
2153        md->disk->first_minor = minor;
2154        md->disk->fops = &dm_blk_dops;
2155        md->disk->queue = md->queue;
2156        md->disk->private_data = md;
2157        sprintf(md->disk->disk_name, "dm-%d", minor);
2158        add_disk(md->disk);
2159        format_dev_t(md->name, MKDEV(_major, minor));
2160
2161        md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
2162        if (!md->wq)
2163                goto bad_thread;
2164
2165        md->bdev = bdget_disk(md->disk, 0);
2166        if (!md->bdev)
2167                goto bad_bdev;
2168
2169        bio_init(&md->flush_bio);
2170        md->flush_bio.bi_bdev = md->bdev;
2171        md->flush_bio.bi_rw = WRITE_FLUSH;
2172
2173        dm_stats_init(&md->stats);
2174
2175        /* Populate the mapping, nobody knows we exist yet */
2176        spin_lock(&_minor_lock);
2177        old_md = idr_replace(&_minor_idr, md, minor);
2178        spin_unlock(&_minor_lock);
2179
2180        BUG_ON(old_md != MINOR_ALLOCED);
2181
2182        return md;
2183
2184bad_bdev:
2185        destroy_workqueue(md->wq);
2186bad_thread:
2187        del_gendisk(md->disk);
2188        put_disk(md->disk);
2189bad_disk:
2190        blk_cleanup_queue(md->queue);
2191bad_queue:
2192        cleanup_srcu_struct(&md->io_barrier);
2193bad_io_barrier:
2194        free_minor(minor);
2195bad_minor:
2196        module_put(THIS_MODULE);
2197bad_module_get:
2198        kfree(md);
2199        return NULL;
2200}
2201
2202static void unlock_fs(struct mapped_device *md);
2203
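/*
 * Release everything attached to a mapped_device and free it.
 */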
2204static void free_dev(struct mapped_device *md)
2205{
2206        int minor = MINOR(disk_devt(md->disk));
2207
2208        unlock_fs(md);
2209        bdput(md->bdev);
2210        destroy_workqueue(md->wq);
2211        if (md->io_pool)
2212                mempool_destroy(md->io_pool);
2213        if (md->bs)
2214                bioset_free(md->bs);
2215        blk_integrity_unregister(md->disk);
2216        del_gendisk(md->disk);
2217        cleanup_srcu_struct(&md->io_barrier);
2218        free_table_devices(&md->table_devices);
2219        free_minor(minor);
2220
2221        spin_lock(&_minor_lock);
2222        md->disk->private_data = NULL;
2223        spin_unlock(&_minor_lock);
2224
2225        put_disk(md->disk);
2226        blk_cleanup_queue(md->queue);
2227        dm_stats_cleanup(&md->stats);
2228        module_put(THIS_MODULE);
2229        kfree(md);
2230}
2231
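/*
 * Take over the mempools supplied with the table (or keep the ones the md
 * already has) and release whatever is left attached to the table.
 */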
2232static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
2233{
2234        struct dm_md_mempools *p = dm_table_get_md_mempools(t);
2235
2236        if (md->io_pool && md->bs) {
2237                /* The md already has the necessary mempools. */
2238                if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) {
2239                        /*
2240                         * Reload the bioset because front_pad may have changed
2241                         * when a different table was loaded.
2242                         */
2243                        bioset_free(md->bs);
2244                        md->bs = p->bs;
2245                        p->bs = NULL;
2246                } else if (dm_table_get_type(t) == DM_TYPE_REQUEST_BASED) {
2247                        /*
2248                         * There's no need to reload with request-based dm
2249                         * because the size of front_pad doesn't change.
2250                         * Note for the future: if you ever reload the bioset,
2251                         * prep-ed requests in the queue may still refer
2252                         * to bios from the old bioset, so you must walk
2253                         * through the queue and unprep them first.
2254                         */
2255                }
2256                goto out;
2257        }
2258
2259        BUG_ON(!p || md->io_pool || md->bs);
2260
2261        md->io_pool = p->io_pool;
2262        p->io_pool = NULL;
2263        md->bs = p->bs;
2264        p->bs = NULL;
2265
2266out:
2267        /* mempool bind completed; the table no longer needs any mempools */
2268        dm_table_free_md_mempools(t);
2269}
2270
2271/*
2272 * Table event callback: deliver queued uevents and wake md->eventq waiters.
2273 */
2274static void event_callback(void *context)
2275{
2276        unsigned long flags;
2277        LIST_HEAD(uevents);
2278        struct mapped_device *md = (struct mapped_device *) context;
2279
2280        spin_lock_irqsave(&md->uevent_lock, flags);
2281        list_splice_init(&md->uevent_list, &uevents);
2282        spin_unlock_irqrestore(&md->uevent_lock, flags);
2283
2284        dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
2285
2286        atomic_inc(&md->event_nr);
2287        wake_up(&md->eventq);
2288}
2289
2290/*
2291 * Protected by md->suspend_lock obtained by dm_swap_table().
2292 */
2293static void __set_size(struct mapped_device *md, sector_t size)
2294{
2295        set_capacity(md->disk, size);
2296
2297        i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
2298}
2299
2300/*
2301 * Return 1 if the queue has a compulsory merge_bvec_fn.
2302 *
2303 * If this function returns 0, then the device is either a non-dm
2304 * device without a merge_bvec_fn, or it is a dm device that is
2305 * able to split any bios it receives that are too big.
2306 */
2307int dm_queue_merge_is_compulsory(struct request_queue *q)
2308{
2309        struct mapped_device *dev_md;
2310
2311        if (!q->merge_bvec_fn)
2312                return 0;
2313
2314        if (q->make_request_fn == dm_request) {
2315                dev_md = q->queuedata;
2316                if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags))
2317                        return 0;
2318        }
2319
2320        return 1;
2321}
2322
2323static int dm_device_merge_is_compulsory(struct dm_target *ti,
2324                                         struct dm_dev *dev, sector_t start,
2325                                         sector_t len, void *data)
2326{
2327        struct block_device *bdev = dev->bdev;
2328        struct request_queue *q = bdev_get_queue(bdev);
2329
2330        return dm_queue_merge_is_compulsory(q);
2331}
2332
2333/*
2334 * Return 1 if it is acceptable to ignore merge_bvec_fn based
2335 * on the properties of the underlying devices.
2336 */
2337static int dm_table_merge_is_optional(struct dm_table *table)
2338{
2339        unsigned i = 0;
2340        struct dm_target *ti;
2341
2342        while (i < dm_table_get_num_targets(table)) {
2343                ti = dm_table_get_target(table, i++);
2344
2345                if (ti->type->iterate_devices &&
2346                    ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL))
2347                        return 0;
2348        }
2349
2350        return 1;
2351}
2352
2353/*
2354 * Returns old map, which caller must destroy.
2355 */
2356static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2357                               struct queue_limits *limits)
2358{
2359        struct dm_table *old_map;
2360        struct request_queue *q = md->queue;
2361        sector_t size;
2362        int merge_is_optional;
2363
2364        size = dm_table_get_size(t);
2365
2366        /*
2367         * Wipe any geometry if the size of the table changed.
2368         */
2369        if (size != dm_get_size(md))
2370                memset(&md->geometry, 0, sizeof(md->geometry));
2371
2372        __set_size(md, size);
2373
2374        dm_table_event_callback(t, event_callback, md);
2375
2376        /*
2377         * If the old table type wasn't request-based, the queue hasn't been
2378         * stopped during suspension yet, so stop it now to prevent I/O from
2379         * being mapped before resume.
2380         * This must be done before setting the queue restrictions,
2381         * because request-based dm may start running as soon as they are set.
2382         */
2383        if (dm_table_request_based(t) && !blk_queue_stopped(q))
2384                stop_queue(q);
2385
2386        __bind_mempools(md, t);
2387
2388        merge_is_optional = dm_table_merge_is_optional(t);
2389
2390        old_map = md->map;
2391        rcu_assign_pointer(md->map, t);
2392        md->immutable_target_type = dm_table_get_immutable_target_type(t);
2393
2394        dm_table_set_restrictions(t, q, limits);
2395        if (merge_is_optional)
2396                set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
2397        else
2398                clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
2399        dm_sync_table(md);
2400
2401        return old_map;
2402}
2403
2404/*
2405 * Returns unbound table for the caller to free.
2406 */
2407static struct dm_table *__unbind(struct mapped_device *md)
2408{
2409        struct dm_table *map = md->map;
2410
2411        if (!map)
2412                return NULL;
2413
2414        dm_table_event_callback(map, NULL, NULL);
2415        RCU_INIT_POINTER(md->map, NULL);
2416        dm_sync_table(md);
2417
2418        return map;
2419}
2420
2421/*
2422 * Constructor for a new device.
2423 */
2424int dm_create(int minor, struct mapped_device **result)
2425{
2426        struct mapped_device *md;
2427
2428        md = alloc_dev(minor);
2429        if (!md)
2430                return -ENXIO;
2431
2432        dm_sysfs_init(md);
2433
2434        *result = md;
2435        return 0;
2436}
2437
2438/*
2439 * Functions to manage md->type.
2440 * All are required to hold md->type_lock.
2441 */
2442void dm_lock_md_type(struct mapped_device *md)
2443{
2444        mutex_lock(&md->type_lock);
2445}
2446
2447void dm_unlock_md_type(struct mapped_device *md)
2448{
2449        mutex_unlock(&md->type_lock);
2450}
2451
2452void dm_set_md_type(struct mapped_device *md, unsigned type)
2453{
2454        BUG_ON(!mutex_is_locked(&md->type_lock));
2455        md->type = type;
2456}
2457
2458unsigned dm_get_md_type(struct mapped_device *md)
2459{
2460        BUG_ON(!mutex_is_locked(&md->type_lock));
2461        return md->type;
2462}
2463
2464struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
2465{
2466        return md->immutable_target_type;
2467}
2468
2469/*
2470 * The queue_limits are only valid as long as you have a reference
2471 * count on 'md'.
2472 */
2473struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
2474{
2475        BUG_ON(!atomic_read(&md->holders));
2476        return &md->queue->limits;
2477}
2478EXPORT_SYMBOL_GPL(dm_get_queue_limits);
2479
2480/*
2481 * Fully initialize a request-based queue (->elevator, ->request_fn, etc).
2482 */
2483static int dm_init_request_based_queue(struct mapped_device *md)
2484{
2485        struct request_queue *q = NULL;
2486
2487        if (md->queue->elevator)
2488                return 1;
2489
2490        /* Fully initialize the queue */
2491        q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL);
2492        if (!q)
2493                return 0;
2494
2495        md->queue = q;
2496        dm_init_md_queue(md);
2497        blk_queue_softirq_done(md->queue, dm_softirq_done);
2498        blk_queue_prep_rq(md->queue, dm_prep_fn);
2499        blk_queue_lld_busy(md->queue, dm_lld_busy);
2500
2501        elv_register_queue(md->queue);
2502
2503        return 1;
2504}
2505
2506/*
2507 * Set up the DM device's queue based on md's type
2508 */
2509int dm_setup_md_queue(struct mapped_device *md)
2510{
2511        if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) &&
2512            !dm_init_request_based_queue(md)) {
2513                DMWARN("Cannot initialize queue for request-based mapped device");
2514                return -EINVAL;
2515        }
2516
2517        return 0;
2518}
2519
2520struct mapped_device *dm_get_md(dev_t dev)
2521{
2522        struct mapped_device *md;
2523        unsigned minor = MINOR(dev);
2524
2525        if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2526                return NULL;
2527
2528        spin_lock(&_minor_lock);
2529
2530        md = idr_find(&_minor_idr, minor);
2531        if (md) {
2532                if ((md == MINOR_ALLOCED ||
2533                     (MINOR(disk_devt(dm_disk(md))) != minor) ||
2534                     dm_deleting_md(md) ||
2535                     test_bit(DMF_FREEING, &md->flags))) {
2536                        md = NULL;
2537                        goto out;
2538                }
2539                dm_get(md);
2540        }
2541
2542out:
2543        spin_unlock(&_minor_lock);
2544
2545        return md;
2546}
2547EXPORT_SYMBOL_GPL(dm_get_md);
2548
2549void *dm_get_mdptr(struct mapped_device *md)
2550{
2551        return md->interface_ptr;
2552}
2553
2554void dm_set_mdptr(struct mapped_device *md, void *ptr)
2555{
2556        md->interface_ptr = ptr;
2557}
2558
2559void dm_get(struct mapped_device *md)
2560{
2561        atomic_inc(&md->holders);
2562        BUG_ON(test_bit(DMF_FREEING, &md->flags));
2563}
2564
2565int dm_hold(struct mapped_device *md)
2566{
2567        spin_lock(&_minor_lock);
2568        if (test_bit(DMF_FREEING, &md->flags)) {
2569                spin_unlock(&_minor_lock);
2570                return -EBUSY;
2571        }
2572        dm_get(md);
2573        spin_unlock(&_minor_lock);
2574        return 0;
2575}
2576EXPORT_SYMBOL_GPL(dm_hold);
2577
2578const char *dm_device_name(struct mapped_device *md)
2579{
2580        return md->name;
2581}
2582EXPORT_SYMBOL_GPL(dm_device_name);
2583
2584static void __dm_destroy(struct mapped_device *md, bool wait)
2585{
2586        struct dm_table *map;
2587        int srcu_idx;
2588
2589        might_sleep();
2590
2591        spin_lock(&_minor_lock);
2592        map = dm_get_live_table(md, &srcu_idx);
2593        idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2594        set_bit(DMF_FREEING, &md->flags);
2595        spin_unlock(&_minor_lock);
2596
2597        /*
2598         * Take suspend_lock so that presuspend and postsuspend methods
2599         * do not race with internal suspend.
2600         */
2601        mutex_lock(&md->suspend_lock);
2602        if (!dm_suspended_md(md)) {
2603                dm_table_presuspend_targets(map);
2604                dm_table_postsuspend_targets(map);
2605        }
2606        mutex_unlock(&md->suspend_lock);
2607
2608        /* dm_put_live_table must be before msleep, otherwise deadlock is possible */
2609        dm_put_live_table(md, srcu_idx);
2610
2611        /*
2612         * Rarely, there may still be I/O requests left to complete,
2613         * so wait for all references to disappear.
2614         * No one may increment the reference count of the mapped_device
2615         * once its state becomes DMF_FREEING.
2616         */
2617        if (wait)
2618                while (atomic_read(&md->holders))
2619                        msleep(1);
2620        else if (atomic_read(&md->holders))
2621                DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2622                       dm_device_name(md), atomic_read(&md->holders));
2623
2624        dm_sysfs_exit(md);
2625        dm_table_destroy(__unbind(md));
2626        free_dev(md);
2627}
2628
2629void dm_destroy(struct mapped_device *md)
2630{
2631        __dm_destroy(md, true);
2632}
2633
2634void dm_destroy_immediate(struct mapped_device *md)
2635{
2636        __dm_destroy(md, false);
2637}
2638
2639void dm_put(struct mapped_device *md)
2640{
2641        atomic_dec(&md->holders);
2642}
2643EXPORT_SYMBOL_GPL(dm_put);
2644
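/*
 * Wait until no more I/O is in flight on the device.  Returns -EINTR if
 * interruptible is TASK_INTERRUPTIBLE and a signal is pending.
 */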
2645static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
2646{
2647        int r = 0;
2648        DECLARE_WAITQUEUE(wait, current);
2649
2650        add_wait_queue(&md->wait, &wait);
2651
2652        while (1) {
2653                set_current_state(interruptible);
2654
2655                if (!md_in_flight(md))
2656                        break;
2657
2658                if (interruptible == TASK_INTERRUPTIBLE &&
2659                    signal_pending(current)) {
2660                        r = -EINTR;
2661                        break;
2662                }
2663
2664                io_schedule();
2665        }
2666        set_current_state(TASK_RUNNING);
2667
2668        remove_wait_queue(&md->wait, &wait);
2669
2670        return r;
2671}
2672
2673/*
2674 * Process the deferred bios
2675 */
2676static void dm_wq_work(struct work_struct *work)
2677{
2678        struct mapped_device *md = container_of(work, struct mapped_device,
2679                                                work);
2680        struct bio *c;
2681        int srcu_idx;
2682        struct dm_table *map;
2683
2684        map = dm_get_live_table(md, &srcu_idx);
2685
2686        while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2687                spin_lock_irq(&md->deferred_lock);
2688                c = bio_list_pop(&md->deferred);
2689                spin_unlock_irq(&md->deferred_lock);
2690
2691                if (!c)
2692                        break;
2693
2694                if (dm_request_based(md))
2695                        generic_make_request(c);
2696                else
2697                        __split_and_process_bio(md, map, c);
2698        }
2699
2700        dm_put_live_table(md, srcu_idx);
2701}
2702
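/*
 * Allow I/O again and kick the per-device workqueue so that deferred bios
 * are reissued by dm_wq_work().
 */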
2703static void dm_queue_flush(struct mapped_device *md)
2704{
2705        clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2706        smp_mb__after_atomic();
2707        queue_work(md->wq, &md->work);
2708}
2709
2710/*
2711 * Swap in a new table, returning the old one for the caller to destroy.
2712 */
2713struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2714{
2715        struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
2716        struct queue_limits limits;
2717        int r;
2718
2719        mutex_lock(&md->suspend_lock);
2720
2721        /* device must be suspended */
2722        if (!dm_suspended_md(md))
2723                goto out;
2724
2725        /*
2726         * If the new table has no data devices, retain the existing limits.
2727         * This helps multipath with queue_if_no_path when all paths disappear:
2728         * new I/O is queued based on these limits until some paths
2729         * reappear.
2730         */
2731        if (dm_table_has_no_data_devices(table)) {
2732                live_map = dm_get_live_table_fast(md);
2733                if (live_map)
2734                        limits = md->queue->limits;
2735                dm_put_live_table_fast(md);
2736        }
2737
2738        if (!live_map) {
2739                r = dm_calculate_queue_limits(table, &limits);
2740                if (r) {
2741                        map = ERR_PTR(r);
2742                        goto out;
2743                }
2744        }
2745
2746        map = __bind(md, table, &limits);
2747
2748out:
2749        mutex_unlock(&md->suspend_lock);
2750        return map;
2751}
2752
2753/*
2754 * Functions to lock and unlock any filesystem running on the
2755 * device.
2756 */
2757static int lock_fs(struct mapped_device *md)
2758{
2759        int r;
2760
2761        WARN_ON(md->frozen_sb);
2762
2763        md->frozen_sb = freeze_bdev(md->bdev);
2764        if (IS_ERR(md->frozen_sb)) {
2765                r = PTR_ERR(md->frozen_sb);
2766                md->frozen_sb = NULL;
2767                return r;
2768        }
2769
2770        set_bit(DMF_FROZEN, &md->flags);
2771
2772        return 0;
2773}
2774
2775static void unlock_fs(struct mapped_device *md)
2776{
2777        if (!test_bit(DMF_FROZEN, &md->flags))
2778                return;
2779
2780        thaw_bdev(md->bdev, md->frozen_sb);
2781        md->frozen_sb = NULL;
2782        clear_bit(DMF_FROZEN, &md->flags);
2783}
2784
2785/*
2786 * We need to be able to change a mapping table under a mounted
2787 * filesystem.  For example we might want to move some data in
2788 * the background.  Before the table can be swapped with
2789 * dm_bind_table, dm_suspend must be called to flush any in-flight
2790 * bios and ensure that any further I/O gets deferred.
2791 */
2792/*
2793 * Suspend mechanism in request-based dm.
2794 *
2795 * 1. Flush all I/Os by lock_fs() if needed.
2796 * 2. Stop dispatching any I/O by stopping the request_queue.
2797 * 3. Wait for all in-flight I/Os to be completed or requeued.
2798 *
2799 * To abort suspend, start the request_queue.
2800 */
2801int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2802{
2803        struct dm_table *map = NULL;
2804        int r = 0;
2805        int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
2806        int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;
2807
2808        mutex_lock(&md->suspend_lock);
2809
2810        if (dm_suspended_md(md)) {
2811                r = -EINVAL;
2812                goto out_unlock;
2813        }
2814
2815        map = md->map;
2816
2817        /*
2818         * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
2819         * This flag is cleared before dm_suspend returns.
2820         */
2821        if (noflush)
2822                set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2823
2824        /* This does not get reverted if there's an error later. */
2825        dm_table_presuspend_targets(map);
2826
2827        /*
2828         * Flush I/O to the device.
2829         * Any I/O submitted after lock_fs() may not be flushed.
2830         * noflush takes precedence over do_lockfs.
2831         * (lock_fs() flushes I/Os and waits for them to complete.)
2832         */
2833        if (!noflush && do_lockfs) {
2834                r = lock_fs(md);
2835                if (r)
2836                        goto out_unlock;
2837        }
2838
2839        /*
2840         * Here we must make sure that no processes are submitting requests
2841         * to target drivers, i.e. no one may be executing
2842         * __split_and_process_bio.  This is called from dm_request and
2843         * dm_wq_work.
2844         *
2845         * To get all processes out of __split_and_process_bio in dm_request,
2846         * we set DMF_BLOCK_IO_FOR_SUSPEND (so new bios are deferred instead)
2847         * and wait for the SRCU grace period on md->io_barrier.  To quiesce
2848         * the worker thread (dm_wq_work), which checks the same flag, we
2849         * call flush_workqueue(md->wq).
2850         */
2851        set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2852        synchronize_srcu(&md->io_barrier);
2853
2854        /*
2855         * Stop md->queue before flushing md->wq in case request-based
2856         * dm defers requests to md->wq from md->queue.
2857         */
2858        if (dm_request_based(md))
2859                stop_queue(md->queue);
2860
2861        flush_workqueue(md->wq);
2862
2863        /*
2864         * At this point no more requests are entering target request routines.
2865         * We call dm_wait_for_completion to wait for all existing requests
2866         * to finish.
2867         */
2868        r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);
2869
2870        if (noflush)
2871                clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2872        synchronize_srcu(&md->io_barrier);
2873
2874        /* were we interrupted? */
2875        if (r < 0) {
2876                dm_queue_flush(md);
2877
2878                if (dm_request_based(md))
2879                        start_queue(md->queue);
2880
2881                unlock_fs(md);
2882                goto out_unlock; /* pushback list is already flushed, so skip flush */
2883        }
2884
2885        /*
2886         * If dm_wait_for_completion returned 0, the device is completely
2887         * quiescent now. There is no request-processing activity. All new
2888         * requests are being added to md->deferred list.
2889         */
2890
2891        set_bit(DMF_SUSPENDED, &md->flags);
2892
2893        dm_table_postsuspend_targets(map);
2894
2895out_unlock:
2896        mutex_unlock(&md->suspend_lock);
2897        return r;
2898}
2899
2900int dm_resume(struct mapped_device *md)
2901{
2902        int r = -EINVAL;
2903        struct dm_table *map = NULL;
2904
2905        mutex_lock(&md->suspend_lock);
2906        if (!dm_suspended_md(md))
2907                goto out;
2908
2909        map = md->map;
2910        if (!map || !dm_table_get_size(map))
2911                goto out;
2912
2913        r = dm_table_resume_targets(map);
2914        if (r)
2915                goto out;
2916
2917        dm_queue_flush(md);
2918
2919        /*
2920         * Flushing deferred I/Os must be done after targets are resumed
2921         * so that the targets can map them correctly.
2922         * Request-based dm queues the deferred I/Os in its request_queue.
2923         */
2924        if (dm_request_based(md))
2925                start_queue(md->queue);
2926
2927        unlock_fs(md);
2928
2929        clear_bit(DMF_SUSPENDED, &md->flags);
2930
2931        r = 0;
2932out:
2933        mutex_unlock(&md->suspend_lock);
2934
2935        return r;
2936}
2937
2938/*
2939 * Internal suspend/resume works like userspace-driven suspend. It waits
2940 * until all bios finish and prevents issuing new bios to the target drivers.
2941 * It may be used only from the kernel.
2942 *
2943 * Internal suspend holds md->suspend_lock, which prevents interaction with
2944 * userspace-driven suspend.
2945 */
2946
2947void dm_internal_suspend(struct mapped_device *md)
2948{
2949        mutex_lock(&md->suspend_lock);
2950        if (dm_suspended_md(md))
2951                return;
2952
2953        set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2954        synchronize_srcu(&md->io_barrier);
2955        flush_workqueue(md->wq);
2956        dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2957}
2958EXPORT_SYMBOL_GPL(dm_internal_suspend);
2959
2960void dm_internal_resume(struct mapped_device *md)
2961{
2962        if (dm_suspended_md(md))
2963                goto done;
2964
2965        dm_queue_flush(md);
2966
2967done:
2968        mutex_unlock(&md->suspend_lock);
2969}
2970EXPORT_SYMBOL_GPL(dm_internal_resume);
2971
2972/*-----------------------------------------------------------------
2973 * Event notification.
2974 *---------------------------------------------------------------*/
2975int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
2976                       unsigned cookie)
2977{
2978        char udev_cookie[DM_COOKIE_LENGTH];
2979        char *envp[] = { udev_cookie, NULL };
2980
2981        if (!cookie)
2982                return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
2983        else {
2984                snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
2985                         DM_COOKIE_ENV_VAR_NAME, cookie);
2986                return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
2987                                          action, envp);
2988        }
2989}
2990
2991uint32_t dm_next_uevent_seq(struct mapped_device *md)
2992{
2993        return atomic_add_return(1, &md->uevent_seq);
2994}
2995
2996uint32_t dm_get_event_nr(struct mapped_device *md)
2997{
2998        return atomic_read(&md->event_nr);
2999}
3000
3001int dm_wait_event(struct mapped_device *md, int event_nr)
3002{
3003        return wait_event_interruptible(md->eventq,
3004                        (event_nr != atomic_read(&md->event_nr)));
3005}
3006
3007void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
3008{
3009        unsigned long flags;
3010
3011        spin_lock_irqsave(&md->uevent_lock, flags);
3012        list_add(elist, &md->uevent_list);
3013        spin_unlock_irqrestore(&md->uevent_lock, flags);
3014}
3015
3016/*
3017 * The gendisk is only valid as long as you have a reference
3018 * count on 'md'.
3019 */
3020struct gendisk *dm_disk(struct mapped_device *md)
3021{
3022        return md->disk;
3023}
3024
3025struct kobject *dm_kobject(struct mapped_device *md)
3026{
3027        return &md->kobj_holder.kobj;
3028}
3029
3030struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
3031{
3032        struct mapped_device *md;
3033
3034        md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
3035
3036        if (test_bit(DMF_FREEING, &md->flags) ||
3037            dm_deleting_md(md))
3038                return NULL;
3039
3040        dm_get(md);
3041        return md;
3042}
3043
3044int dm_suspended_md(struct mapped_device *md)
3045{
3046        return test_bit(DMF_SUSPENDED, &md->flags);
3047}
3048
3049int dm_test_deferred_remove_flag(struct mapped_device *md)
3050{
3051        return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
3052}
3053
3054int dm_suspended(struct dm_target *ti)
3055{
3056        return dm_suspended_md(dm_table_get_md(ti->table));
3057}
3058EXPORT_SYMBOL_GPL(dm_suspended);
3059
3060int dm_noflush_suspending(struct dm_target *ti)
3061{
3062        return __noflush_suspending(dm_table_get_md(ti->table));
3063}
3064EXPORT_SYMBOL_GPL(dm_noflush_suspending);
3065
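/*
 * Allocate the io mempool and bioset needed by a table of the given type.
 * front_pad reserves per-bio space for dm's bookkeeping structures.
 */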
3066struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size)
3067{
3068        struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
3069        struct kmem_cache *cachep;
3070        unsigned int pool_size;
3071        unsigned int front_pad;
3072
3073        if (!pools)
3074                return NULL;
3075
3076        if (type == DM_TYPE_BIO_BASED) {
3077                cachep = _io_cache;
3078                pool_size = dm_get_reserved_bio_based_ios();
3079                front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
3080        } else if (type == DM_TYPE_REQUEST_BASED) {
3081                cachep = _rq_tio_cache;
3082                pool_size = dm_get_reserved_rq_based_ios();
3083                front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
3084                /* per_bio_data_size is not used. See __bind_mempools(). */
3085                WARN_ON(per_bio_data_size != 0);
3086        } else
3087                goto out;
3088
3089        pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
3090        if (!pools->io_pool)
3091                goto out;
3092
3093        pools->bs = bioset_create_nobvec(pool_size, front_pad);
3094        if (!pools->bs)
3095                goto out;
3096
3097        if (integrity && bioset_integrity_create(pools->bs, pool_size))
3098                goto out;
3099
3100        return pools;
3101
3102out:
3103        dm_free_md_mempools(pools);
3104
3105        return NULL;
3106}
3107
3108void dm_free_md_mempools(struct dm_md_mempools *pools)
3109{
3110        if (!pools)
3111                return;
3112
3113        if (pools->io_pool)
3114                mempool_destroy(pools->io_pool);
3115
3116        if (pools->bs)
3117                bioset_free(pools->bs);
3118
3119        kfree(pools);
3120}
3121
3122static const struct block_device_operations dm_blk_dops = {
3123        .open = dm_blk_open,
3124        .release = dm_blk_close,
3125        .ioctl = dm_blk_ioctl,
3126        .getgeo = dm_blk_getgeo,
3127        .owner = THIS_MODULE
3128};
3129
3130/*
3131 * module hooks
3132 */
3133module_init(dm_init);
3134module_exit(dm_exit);
3135
3136module_param(major, uint, 0);
3137MODULE_PARM_DESC(major, "The major number of the device mapper");
3138
3139module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
3140MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
3141
3142module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
3143MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");
3144
3145MODULE_DESCRIPTION(DM_NAME " driver");
3146MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
3147MODULE_LICENSE("GPL");