source: src/linux/universal/linux-3.18/arch/arm/kvm/mmu.c @ 31885

1/*
2 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
3 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License, version 2, as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
17 */
18
19#include <linux/mman.h>
20#include <linux/kvm_host.h>
21#include <linux/io.h>
22#include <linux/hugetlb.h>
23#include <trace/events/kvm.h>
24#include <asm/pgalloc.h>
25#include <asm/cacheflush.h>
26#include <asm/kvm_arm.h>
27#include <asm/kvm_mmu.h>
28#include <asm/kvm_mmio.h>
29#include <asm/kvm_asm.h>
30#include <asm/kvm_emulate.h>
31
32#include "trace.h"
33
34extern char  __hyp_idmap_text_start[], __hyp_idmap_text_end[];
35
36static pgd_t *boot_hyp_pgd;
37static pgd_t *hyp_pgd;
38static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
39
40static void *init_bounce_page;
41static unsigned long hyp_idmap_start;
42static unsigned long hyp_idmap_end;
43static phys_addr_t hyp_idmap_vector;
44
45#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
46
47#define kvm_pmd_huge(_x)        (pmd_huge(_x) || pmd_trans_huge(_x))
48
49static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
50{
51        /*
52         * This function also gets called when dealing with HYP page
53         * tables. As HYP doesn't have an associated struct kvm (and
54         * the HYP page tables are fairly static), we don't do
55         * anything there.
56         */
57        if (kvm)
58                kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
59}
60
61/*
62 * D-Cache management functions. They take the page table entries by
63 * value, as they are flushing the cache using the kernel mapping (or
64 * kmap on 32bit).
65 */
66static void kvm_flush_dcache_pte(pte_t pte)
67{
68        __kvm_flush_dcache_pte(pte);
69}
70
71static void kvm_flush_dcache_pmd(pmd_t pmd)
72{
73        __kvm_flush_dcache_pmd(pmd);
74}
75
76static void kvm_flush_dcache_pud(pud_t pud)
77{
78        __kvm_flush_dcache_pud(pud);
79}
80
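/*
 * Top up the page-table object cache with single pages so that the
 * mapping code, which runs under the mmu_lock spinlock, can take
 * table pages from the cache instead of allocating while atomic.
 */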
81static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
82                                  int min, int max)
83{
84        void *page;
85
86        BUG_ON(max > KVM_NR_MEM_OBJS);
87        if (cache->nobjs >= min)
88                return 0;
89        while (cache->nobjs < max) {
90                page = (void *)__get_free_page(PGALLOC_GFP);
91                if (!page)
92                        return -ENOMEM;
93                cache->objects[cache->nobjs++] = page;
94        }
95        return 0;
96}
97
98static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
99{
100        while (mc->nobjs)
101                free_page((unsigned long)mc->objects[--mc->nobjs]);
102}
103
104static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
105{
106        void *p;
107
108        BUG_ON(!mc || !mc->nobjs);
109        p = mc->objects[--mc->nobjs];
110        return p;
111}
112
113static void clear_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
114{
115        pud_t *pud_table __maybe_unused = pud_offset(pgd, 0);
116        pgd_clear(pgd);
117        kvm_tlb_flush_vmid_ipa(kvm, addr);
118        pud_free(NULL, pud_table);
119        put_page(virt_to_page(pgd));
120}
121
122static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
123{
124        pmd_t *pmd_table = pmd_offset(pud, 0);
125        VM_BUG_ON(pud_huge(*pud));
126        pud_clear(pud);
127        kvm_tlb_flush_vmid_ipa(kvm, addr);
128        pmd_free(NULL, pmd_table);
129        put_page(virt_to_page(pud));
130}
131
132static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
133{
134        pte_t *pte_table = pte_offset_kernel(pmd, 0);
135        VM_BUG_ON(kvm_pmd_huge(*pmd));
136        pmd_clear(pmd);
137        kvm_tlb_flush_vmid_ipa(kvm, addr);
138        pte_free_kernel(NULL, pte_table);
139        put_page(virt_to_page(pmd));
140}
141
142/*
143 * Unmapping vs dcache management:
144 *
145 * If a guest maps certain memory pages as uncached, all writes will
146 * bypass the data cache and go directly to RAM.  However, the CPUs
147 * can still speculate reads (not writes) and fill cache lines with
148 * data.
149 *
150 * Those cache lines will be *clean* cache lines though, so a
151 * clean+invalidate operation is equivalent to an invalidate
152 * operation, because no cache lines are marked dirty.
153 *
154 * Those clean cache lines could be filled prior to an uncached write
155 * by the guest, and the cache coherent IO subsystem would therefore
156 * end up writing old data to disk.
157 *
158 * This is why right after unmapping a page/section and invalidating
159 * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
160 * the IO subsystem will never hit in the cache.
161 */
162static void unmap_ptes(struct kvm *kvm, pmd_t *pmd,
163                       phys_addr_t addr, phys_addr_t end)
164{
165        phys_addr_t start_addr = addr;
166        pte_t *pte, *start_pte;
167
168        start_pte = pte = pte_offset_kernel(pmd, addr);
169        do {
170                if (!pte_none(*pte)) {
171                        pte_t old_pte = *pte;
172
173                        kvm_set_pte(pte, __pte(0));
174                        kvm_tlb_flush_vmid_ipa(kvm, addr);
175
176                        /* No need to invalidate the cache for device mappings */
177                        if ((pte_val(old_pte) & PAGE_S2_DEVICE) != PAGE_S2_DEVICE)
178                                kvm_flush_dcache_pte(old_pte);
179
180                        put_page(virt_to_page(pte));
181                }
182        } while (pte++, addr += PAGE_SIZE, addr != end);
183
184        if (kvm_pte_table_empty(kvm, start_pte))
185                clear_pmd_entry(kvm, pmd, start_addr);
186}
187
188static void unmap_pmds(struct kvm *kvm, pud_t *pud,
189                       phys_addr_t addr, phys_addr_t end)
190{
191        phys_addr_t next, start_addr = addr;
192        pmd_t *pmd, *start_pmd;
193
194        start_pmd = pmd = pmd_offset(pud, addr);
195        do {
196                next = kvm_pmd_addr_end(addr, end);
197                if (!pmd_none(*pmd)) {
198                        if (kvm_pmd_huge(*pmd)) {
199                                pmd_t old_pmd = *pmd;
200
201                                pmd_clear(pmd);
202                                kvm_tlb_flush_vmid_ipa(kvm, addr);
203
204                                kvm_flush_dcache_pmd(old_pmd);
205
206                                put_page(virt_to_page(pmd));
207                        } else {
208                                unmap_ptes(kvm, pmd, addr, next);
209                        }
210                }
211        } while (pmd++, addr = next, addr != end);
212
213        if (kvm_pmd_table_empty(kvm, start_pmd))
214                clear_pud_entry(kvm, pud, start_addr);
215}
216
217static void unmap_puds(struct kvm *kvm, pgd_t *pgd,
218                       phys_addr_t addr, phys_addr_t end)
219{
220        phys_addr_t next, start_addr = addr;
221        pud_t *pud, *start_pud;
222
223        start_pud = pud = pud_offset(pgd, addr);
224        do {
225                next = kvm_pud_addr_end(addr, end);
226                if (!pud_none(*pud)) {
227                        if (pud_huge(*pud)) {
228                                pud_t old_pud = *pud;
229
230                                pud_clear(pud);
231                                kvm_tlb_flush_vmid_ipa(kvm, addr);
232
233                                kvm_flush_dcache_pud(old_pud);
234
235                                put_page(virt_to_page(pud));
236                        } else {
237                                unmap_pmds(kvm, pud, addr, next);
238                        }
239                }
240        } while (pud++, addr = next, addr != end);
241
242        if (kvm_pud_table_empty(kvm, start_pud))
243                clear_pgd_entry(kvm, pgd, start_addr);
244}
245
246
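/*
 * Walk the page tables rooted at @pgdp and unmap [start, start + size),
 * freeing any intermediate table pages that become empty along the way.
 * Used for both the Hyp tables (kvm == NULL) and stage-2 tables.
 */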
247static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
248                        phys_addr_t start, u64 size)
249{
250        pgd_t *pgd;
251        phys_addr_t addr = start, end = start + size;
252        phys_addr_t next;
253
254        pgd = pgdp + kvm_pgd_index(addr);
255        do {
256                next = kvm_pgd_addr_end(addr, end);
257                if (!pgd_none(*pgd))
258                        unmap_puds(kvm, pgd, addr, next);
259        } while (pgd++, addr = next, addr != end);
260}
261
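/*
 * The stage2_flush_* helpers below walk the stage-2 tables and
 * clean+invalidate the data cache for every normal-memory mapping;
 * device mappings are skipped at the PTE level.
 */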
262static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd,
263                              phys_addr_t addr, phys_addr_t end)
264{
265        pte_t *pte;
266
267        pte = pte_offset_kernel(pmd, addr);
268        do {
269                if (!pte_none(*pte) &&
270                    (pte_val(*pte) & PAGE_S2_DEVICE) != PAGE_S2_DEVICE)
271                        kvm_flush_dcache_pte(*pte);
272        } while (pte++, addr += PAGE_SIZE, addr != end);
273}
274
275static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud,
276                              phys_addr_t addr, phys_addr_t end)
277{
278        pmd_t *pmd;
279        phys_addr_t next;
280
281        pmd = pmd_offset(pud, addr);
282        do {
283                next = kvm_pmd_addr_end(addr, end);
284                if (!pmd_none(*pmd)) {
285                        if (kvm_pmd_huge(*pmd))
286                                kvm_flush_dcache_pmd(*pmd);
287                        else
288                                stage2_flush_ptes(kvm, pmd, addr, next);
289                }
290        } while (pmd++, addr = next, addr != end);
291}
292
293static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd,
294                              phys_addr_t addr, phys_addr_t end)
295{
296        pud_t *pud;
297        phys_addr_t next;
298
299        pud = pud_offset(pgd, addr);
300        do {
301                next = kvm_pud_addr_end(addr, end);
302                if (!pud_none(*pud)) {
303                        if (pud_huge(*pud))
304                                kvm_flush_dcache_pud(*pud);
305                        else
306                                stage2_flush_pmds(kvm, pud, addr, next);
307                }
308        } while (pud++, addr = next, addr != end);
309}
310
311static void stage2_flush_memslot(struct kvm *kvm,
312                                 struct kvm_memory_slot *memslot)
313{
314        phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
315        phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
316        phys_addr_t next;
317        pgd_t *pgd;
318
319        pgd = kvm->arch.pgd + kvm_pgd_index(addr);
320        do {
321                next = kvm_pgd_addr_end(addr, end);
322                stage2_flush_puds(kvm, pgd, addr, next);
323        } while (pgd++, addr = next, addr != end);
324}
325
326/**
327 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
328 * @kvm: The struct kvm pointer
329 *
330 * Go through the stage 2 page tables and invalidate any cache lines
331 * backing memory already mapped to the VM.
332 */
333void stage2_flush_vm(struct kvm *kvm)
334{
335        struct kvm_memslots *slots;
336        struct kvm_memory_slot *memslot;
337        int idx;
338
339        idx = srcu_read_lock(&kvm->srcu);
340        spin_lock(&kvm->mmu_lock);
341
342        slots = kvm_memslots(kvm);
343        kvm_for_each_memslot(memslot, slots)
344                stage2_flush_memslot(kvm, memslot);
345
346        spin_unlock(&kvm->mmu_lock);
347        srcu_read_unlock(&kvm->srcu, idx);
348}
349
350/**
351 * free_boot_hyp_pgd - free HYP boot page tables
352 *
353 * Free the HYP boot page tables. The bounce page is also freed.
354 */
355void free_boot_hyp_pgd(void)
356{
357        mutex_lock(&kvm_hyp_pgd_mutex);
358
359        if (boot_hyp_pgd) {
360                unmap_range(NULL, boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
361                unmap_range(NULL, boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
362                free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
363                boot_hyp_pgd = NULL;
364        }
365
366        if (hyp_pgd)
367                unmap_range(NULL, hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
368
369        free_page((unsigned long)init_bounce_page);
370        init_bounce_page = NULL;
371
372        mutex_unlock(&kvm_hyp_pgd_mutex);
373}
374
375/**
376 * free_hyp_pgds - free Hyp-mode page tables
377 *
378 * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
379 * therefore contains either mappings in the kernel memory area (above
380 * PAGE_OFFSET), or device mappings in the vmalloc range (from
381 * VMALLOC_START to VMALLOC_END).
382 *
383 * boot_hyp_pgd should only map two pages for the init code.
384 */
385void free_hyp_pgds(void)
386{
387        unsigned long addr;
388
389        free_boot_hyp_pgd();
390
391        mutex_lock(&kvm_hyp_pgd_mutex);
392
393        if (hyp_pgd) {
394                for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE)
395                        unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
396                for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE)
397                        unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
398
399                free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
400                hyp_pgd = NULL;
401        }
402
403        mutex_unlock(&kvm_hyp_pgd_mutex);
404}
405
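/*
 * Install last-level Hyp mappings for [start, end), one PTE per page
 * starting at @pfn, with the protection bits given in @prot.
 */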
406static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
407                                    unsigned long end, unsigned long pfn,
408                                    pgprot_t prot)
409{
410        pte_t *pte;
411        unsigned long addr;
412
413        addr = start;
414        do {
415                pte = pte_offset_kernel(pmd, addr);
416                kvm_set_pte(pte, pfn_pte(pfn, prot));
417                get_page(virt_to_page(pte));
418                kvm_flush_dcache_to_poc(pte, sizeof(*pte));
419                pfn++;
420        } while (addr += PAGE_SIZE, addr != end);
421}
422
423static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
424                                   unsigned long end, unsigned long pfn,
425                                   pgprot_t prot)
426{
427        pmd_t *pmd;
428        pte_t *pte;
429        unsigned long addr, next;
430
431        addr = start;
432        do {
433                pmd = pmd_offset(pud, addr);
434
435                BUG_ON(pmd_sect(*pmd));
436
437                if (pmd_none(*pmd)) {
438                        pte = pte_alloc_one_kernel(NULL, addr);
439                        if (!pte) {
440                                kvm_err("Cannot allocate Hyp pte\n");
441                                return -ENOMEM;
442                        }
443                        pmd_populate_kernel(NULL, pmd, pte);
444                        get_page(virt_to_page(pmd));
445                        kvm_flush_dcache_to_poc(pmd, sizeof(*pmd));
446                }
447
448                next = pmd_addr_end(addr, end);
449
450                create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
451                pfn += (next - addr) >> PAGE_SHIFT;
452        } while (addr = next, addr != end);
453
454        return 0;
455}
456
457static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start,
458                                   unsigned long end, unsigned long pfn,
459                                   pgprot_t prot)
460{
461        pud_t *pud;
462        pmd_t *pmd;
463        unsigned long addr, next;
464        int ret;
465
466        addr = start;
467        do {
468                pud = pud_offset(pgd, addr);
469
470                if (pud_none_or_clear_bad(pud)) {
471                        pmd = pmd_alloc_one(NULL, addr);
472                        if (!pmd) {
473                                kvm_err("Cannot allocate Hyp pmd\n");
474                                return -ENOMEM;
475                        }
476                        pud_populate(NULL, pud, pmd);
477                        get_page(virt_to_page(pud));
478                        kvm_flush_dcache_to_poc(pud, sizeof(*pud));
479                }
480
481                next = pud_addr_end(addr, end);
482                ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
483                if (ret)
484                        return ret;
485                pfn += (next - addr) >> PAGE_SHIFT;
486        } while (addr = next, addr != end);
487
488        return 0;
489}
490
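/*
 * Map [start, end) to physical frames starting at @pfn in the page
 * tables rooted at @pgdp, allocating intermediate tables as needed.
 * Updates are serialised by kvm_hyp_pgd_mutex.
 */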
491static int __create_hyp_mappings(pgd_t *pgdp,
492                                 unsigned long start, unsigned long end,
493                                 unsigned long pfn, pgprot_t prot)
494{
495        pgd_t *pgd;
496        pud_t *pud;
497        unsigned long addr, next;
498        int err = 0;
499
500        mutex_lock(&kvm_hyp_pgd_mutex);
501        addr = start & PAGE_MASK;
502        end = PAGE_ALIGN(end);
503        do {
504                pgd = pgdp + pgd_index(addr);
505
506                if (pgd_none(*pgd)) {
507                        pud = pud_alloc_one(NULL, addr);
508                        if (!pud) {
509                                kvm_err("Cannot allocate Hyp pud\n");
510                                err = -ENOMEM;
511                                goto out;
512                        }
513                        pgd_populate(NULL, pgd, pud);
514                        get_page(virt_to_page(pgd));
515                        kvm_flush_dcache_to_poc(pgd, sizeof(*pgd));
516                }
517
518                next = pgd_addr_end(addr, end);
519                err = create_hyp_pud_mappings(pgd, addr, next, pfn, prot);
520                if (err)
521                        goto out;
522                pfn += (next - addr) >> PAGE_SHIFT;
523        } while (addr = next, addr != end);
524out:
525        mutex_unlock(&kvm_hyp_pgd_mutex);
526        return err;
527}
528
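/*
 * Translate a kernel virtual address into a physical address, handling
 * both linear-map and vmalloc addresses.
 */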
529static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
530{
531        if (!is_vmalloc_addr(kaddr)) {
532                BUG_ON(!virt_addr_valid(kaddr));
533                return __pa(kaddr);
534        } else {
535                return page_to_phys(vmalloc_to_page(kaddr)) +
536                       offset_in_page(kaddr);
537        }
538}
539
540/**
541 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
542 * @from:       The virtual kernel start address of the range
543 * @to:         The virtual kernel end address of the range (exclusive)
544 *
545 * The same virtual address as the kernel virtual address is also used
546 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
547 * physical pages.
548 */
549int create_hyp_mappings(void *from, void *to)
550{
551        phys_addr_t phys_addr;
552        unsigned long virt_addr;
553        unsigned long start = KERN_TO_HYP((unsigned long)from);
554        unsigned long end = KERN_TO_HYP((unsigned long)to);
555
556        start = start & PAGE_MASK;
557        end = PAGE_ALIGN(end);
558
559        for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
560                int err;
561
562                phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
563                err = __create_hyp_mappings(hyp_pgd, virt_addr,
564                                            virt_addr + PAGE_SIZE,
565                                            __phys_to_pfn(phys_addr),
566                                            PAGE_HYP);
567                if (err)
568                        return err;
569        }
570
571        return 0;
572}
573
574/**
575 * create_hyp_io_mappings - duplicate a kernel IO mapping into Hyp mode
576 * @from:       The kernel start VA of the range
577 * @to:         The kernel end VA of the range (exclusive)
578 * @phys_addr:  The physical start address which gets mapped
579 *
580 * The resulting HYP VA is the same as the kernel VA, modulo
581 * HYP_PAGE_OFFSET.
582 */
583int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr)
584{
585        unsigned long start = KERN_TO_HYP((unsigned long)from);
586        unsigned long end = KERN_TO_HYP((unsigned long)to);
587
588        /* Check for a valid kernel IO mapping */
589        if (!is_vmalloc_addr(from) || !is_vmalloc_addr(to - 1))
590                return -EINVAL;
591
592        return __create_hyp_mappings(hyp_pgd, start, end,
593                                     __phys_to_pfn(phys_addr), PAGE_HYP_DEVICE);
594}
595
596/* Free the HW pgd, one page at a time */
597static void kvm_free_hwpgd(void *hwpgd)
598{
599        free_pages_exact(hwpgd, kvm_get_hwpgd_size());
600}
601
602/* Allocate the HW PGD, making sure that each page gets its own refcount */
603static void *kvm_alloc_hwpgd(void)
604{
605        unsigned int size = kvm_get_hwpgd_size();
606
607        return alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
608}
609
610/**
611 * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
612 * @kvm:        The KVM struct pointer for the VM.
613 *
614 * Allocates the 1st level table only of size defined by S2_PGD_ORDER (can
615 * support either full 40-bit input addresses or limited to 32-bit input
616 * addresses). Clears the allocated pages.
617 *
618 * Note we don't need locking here as this is only called when the VM is
619 * created, which can only be done once.
620 */
621int kvm_alloc_stage2_pgd(struct kvm *kvm)
622{
623        pgd_t *pgd;
624        void *hwpgd;
625
626        if (kvm->arch.pgd != NULL) {
627                kvm_err("kvm_arch already initialized?\n");
628                return -EINVAL;
629        }
630
631        hwpgd = kvm_alloc_hwpgd();
632        if (!hwpgd)
633                return -ENOMEM;
634
635        /* When the kernel uses more levels of page tables than the
636         * guest, we allocate a fake PGD and pre-populate it to point
637         * to the next-level page table, which will be the real
638         * initial page table pointed to by the VTTBR.
639         *
640         * When KVM_PREALLOC_LEVEL==2, we allocate a single page for
641         * the PMD and the kernel will use folded pud.
642         * When KVM_PREALLOC_LEVEL==1, we allocate 2 consecutive PUD
643         * pages.
644         */
645        if (KVM_PREALLOC_LEVEL > 0) {
646                int i;
647
648                /*
649                 * Allocate fake pgd for the page table manipulation macros to
650                 * work.  This is not used by the hardware and we have no
651                 * alignment requirement for this allocation.
652                 */
653                pgd = (pgd_t *)kmalloc(PTRS_PER_S2_PGD * sizeof(pgd_t),
654                                       GFP_KERNEL | __GFP_ZERO);
655
656                if (!pgd) {
657                        kvm_free_hwpgd(hwpgd);
658                        return -ENOMEM;
659                }
660
661                /* Plug the HW PGD into the fake one. */
662                for (i = 0; i < PTRS_PER_S2_PGD; i++) {
663                        if (KVM_PREALLOC_LEVEL == 1)
664                                pgd_populate(NULL, pgd + i,
665                                             (pud_t *)hwpgd + i * PTRS_PER_PUD);
666                        else if (KVM_PREALLOC_LEVEL == 2)
667                                pud_populate(NULL, pud_offset(pgd, 0) + i,
668                                             (pmd_t *)hwpgd + i * PTRS_PER_PMD);
669                }
670        } else {
671                /*
672                 * Allocate actual first-level Stage-2 page table used by the
673                 * hardware for Stage-2 page table walks.
674                 */
675                pgd = (pgd_t *)hwpgd;
676        }
677
678        kvm_clean_pgd(pgd);
679        kvm->arch.pgd = pgd;
680        return 0;
681}
682
683/**
684 * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
685 * @kvm:   The VM pointer
686 * @start: The intermediate physical base address of the range to unmap
687 * @size:  The size of the area to unmap
688 *
689 * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
690 * be called while holding mmu_lock (unless for freeing the stage2 pgd before
691 * destroying the VM), otherwise another faulting VCPU may come in and mess
692 * with things behind our backs.
693 */
694static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
695{
696        unmap_range(kvm, kvm->arch.pgd, start, size);
697}
698
699static void stage2_unmap_memslot(struct kvm *kvm,
700                                 struct kvm_memory_slot *memslot)
701{
702        hva_t hva = memslot->userspace_addr;
703        phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
704        phys_addr_t size = PAGE_SIZE * memslot->npages;
705        hva_t reg_end = hva + size;
706
707        /*
708         * A memory region could potentially cover multiple VMAs, and any holes
709         * between them, so iterate over all of them to find out if we should
710         * unmap any of them.
711         *
712         *     +--------------------------------------------+
713         * +---------------+----------------+   +----------------+
714         * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
715         * +---------------+----------------+   +----------------+
716         *     |               memory region                |
717         *     +--------------------------------------------+
718         */
719        do {
720                struct vm_area_struct *vma = find_vma(current->mm, hva);
721                hva_t vm_start, vm_end;
722
723                if (!vma || vma->vm_start >= reg_end)
724                        break;
725
726                /*
727                 * Take the intersection of this VMA with the memory region
728                 */
729                vm_start = max(hva, vma->vm_start);
730                vm_end = min(reg_end, vma->vm_end);
731
732                if (!(vma->vm_flags & VM_PFNMAP)) {
733                        gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
734                        unmap_stage2_range(kvm, gpa, vm_end - vm_start);
735                }
736                hva = vm_end;
737        } while (hva < reg_end);
738}
739
740/**
741 * stage2_unmap_vm - Unmap Stage-2 RAM mappings
742 * @kvm: The struct kvm pointer
743 *
744 * Go through the memory regions and unmap any regular RAM
745 * backing memory already mapped to the VM.
746 */
747void stage2_unmap_vm(struct kvm *kvm)
748{
749        struct kvm_memslots *slots;
750        struct kvm_memory_slot *memslot;
751        int idx;
752
753        idx = srcu_read_lock(&kvm->srcu);
754        spin_lock(&kvm->mmu_lock);
755
756        slots = kvm_memslots(kvm);
757        kvm_for_each_memslot(memslot, slots)
758                stage2_unmap_memslot(kvm, memslot);
759
760        spin_unlock(&kvm->mmu_lock);
761        srcu_read_unlock(&kvm->srcu, idx);
762}
763
764/**
765 * kvm_free_stage2_pgd - free all stage-2 tables
766 * @kvm:        The KVM struct pointer for the VM.
767 *
768 * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all
769 * underlying level-2 and level-3 tables before freeing the actual level-1 table
770 * and setting the struct pointer to NULL.
771 *
772 * Note we don't need locking here as this is only called when the VM is
773 * destroyed, which can only be done once.
774 */
775void kvm_free_stage2_pgd(struct kvm *kvm)
776{
777        if (kvm->arch.pgd == NULL)
778                return;
779
780        unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE);
781        kvm_free_hwpgd(kvm_get_hwpgd(kvm));
782        if (KVM_PREALLOC_LEVEL > 0)
783                kfree(kvm->arch.pgd);
784
785        kvm->arch.pgd = NULL;
786}
787
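/*
 * stage2_get_pud/stage2_get_pmd walk the stage-2 tables for @addr,
 * allocating missing intermediate tables from @cache (and returning
 * NULL when a table is missing and no cache was supplied).
 */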
788static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
789                             phys_addr_t addr)
790{
791        pgd_t *pgd;
792        pud_t *pud;
793
794        pgd = kvm->arch.pgd + kvm_pgd_index(addr);
795        if (WARN_ON(pgd_none(*pgd))) {
796                if (!cache)
797                        return NULL;
798                pud = mmu_memory_cache_alloc(cache);
799                pgd_populate(NULL, pgd, pud);
800                get_page(virt_to_page(pgd));
801        }
802
803        return pud_offset(pgd, addr);
804}
805
806static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
807                             phys_addr_t addr)
808{
809        pud_t *pud;
810        pmd_t *pmd;
811
812        pud = stage2_get_pud(kvm, cache, addr);
813        if (pud_none(*pud)) {
814                if (!cache)
815                        return NULL;
816                pmd = mmu_memory_cache_alloc(cache);
817                pud_populate(NULL, pud, pmd);
818                get_page(virt_to_page(pud));
819        }
820
821        return pmd_offset(pud, addr);
822}
823
824static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
825                               *cache, phys_addr_t addr, const pmd_t *new_pmd)
826{
827        pmd_t *pmd, old_pmd;
828
829        pmd = stage2_get_pmd(kvm, cache, addr);
830        VM_BUG_ON(!pmd);
831
832        /*
833         * Mapping in huge pages should only happen through a fault.  If a
834         * page is merged into a transparent huge page, the individual
835         * subpages of that huge page should be unmapped through MMU
836         * notifiers before we get here.
837         *
838         * Merging of CompoundPages is not supported; they should be
839         * split first, unmapped, merged, and mapped back in on demand.
840         */
841        VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd));
842
843        old_pmd = *pmd;
844        if (pmd_present(old_pmd)) {
845                pmd_clear(pmd);
846                kvm_tlb_flush_vmid_ipa(kvm, addr);
847        } else {
848                get_page(virt_to_page(pmd));
849        }
850
851        kvm_set_pmd(pmd, *new_pmd);
852        return 0;
853}
854
855static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
856                          phys_addr_t addr, const pte_t *new_pte, bool iomap)
857{
858        pmd_t *pmd;
859        pte_t *pte, old_pte;
860
861        /* Create stage-2 page table mapping - Levels 0 and 1 */
862        pmd = stage2_get_pmd(kvm, cache, addr);
863        if (!pmd) {
864                /*
865                 * Ignore calls from kvm_set_spte_hva for unallocated
866                 * address ranges.
867                 */
868                return 0;
869        }
870
871        /* Create stage-2 page mappings - Level 2 */
872        if (pmd_none(*pmd)) {
873                if (!cache)
874                        return 0; /* ignore calls from kvm_set_spte_hva */
875                pte = mmu_memory_cache_alloc(cache);
876                kvm_clean_pte(pte);
877                pmd_populate_kernel(NULL, pmd, pte);
878                get_page(virt_to_page(pmd));
879        }
880
881        pte = pte_offset_kernel(pmd, addr);
882
883        if (iomap && pte_present(*pte))
884                return -EFAULT;
885
886        /* Create 2nd stage page table mapping - Level 3 */
887        old_pte = *pte;
888        if (pte_present(old_pte)) {
889                kvm_set_pte(pte, __pte(0));
890                kvm_tlb_flush_vmid_ipa(kvm, addr);
891        } else {
892                get_page(virt_to_page(pte));
893        }
894
895        kvm_set_pte(pte, *new_pte);
896        return 0;
897}
898
899/**
900 * kvm_phys_addr_ioremap - map a device range to guest IPA
901 *
902 * @kvm:        The KVM pointer
903 * @guest_ipa:  The IPA at which to insert the mapping
904 * @pa:         The physical address of the device
905 * @size:       The size of the mapping
906 */
907int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
908                          phys_addr_t pa, unsigned long size, bool writable)
909{
910        phys_addr_t addr, end;
911        int ret = 0;
912        unsigned long pfn;
913        struct kvm_mmu_memory_cache cache = { 0, };
914
915        end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
916        pfn = __phys_to_pfn(pa);
917
918        for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
919                pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE);
920
921                if (writable)
922                        kvm_set_s2pte_writable(&pte);
923
924                ret = mmu_topup_memory_cache(&cache, KVM_MMU_CACHE_MIN_PAGES,
925                                                KVM_NR_MEM_OBJS);
926                if (ret)
927                        goto out;
928                spin_lock(&kvm->mmu_lock);
929                ret = stage2_set_pte(kvm, &cache, addr, &pte, true);
930                spin_unlock(&kvm->mmu_lock);
931                if (ret)
932                        goto out;
933
934                pfn++;
935        }
936
937out:
938        mmu_free_memory_cache(&cache);
939        return ret;
940}
941
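/*
 * If the faulting page is part of a transparent huge page, adjust
 * *pfnp and *ipap to the head page so that the fault can be mapped
 * with a PMD-sized stage-2 entry. Returns true if that is possible.
 */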
942static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap)
943{
944        pfn_t pfn = *pfnp;
945        gfn_t gfn = *ipap >> PAGE_SHIFT;
946
947        if (PageTransCompound(pfn_to_page(pfn))) {
948                unsigned long mask;
949                /*
950                 * The address we faulted on is backed by a transparent huge
951                 * page.  However, because we map the compound huge page and
952                 * not the individual tail page, we need to transfer the
953                 * refcount to the head page.  We have to be careful that the
954                 * THP doesn't start to split while we are adjusting the
955                 * refcounts.
956                 *
957                 * We are sure this doesn't happen, because mmu_notifier_retry
958                 * was successful and we are holding the mmu_lock, so if this
959                 * THP is trying to split, it will be blocked in the mmu
960                 * notifier before touching any of the pages, specifically
961                 * before being able to call __split_huge_page_refcount().
962                 *
963                 * We can therefore safely transfer the refcount from PG_tail
964                 * to PG_head and switch the pfn from a tail page to the head
965                 * page accordingly.
966                 */
967                mask = PTRS_PER_PMD - 1;
968                VM_BUG_ON((gfn & mask) != (pfn & mask));
969                if (pfn & mask) {
970                        *ipap &= PMD_MASK;
971                        kvm_release_pfn_clean(pfn);
972                        pfn &= ~mask;
973                        kvm_get_pfn(pfn);
974                        *pfnp = pfn;
975                }
976
977                return true;
978        }
979
980        return false;
981}
982
983static bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
984{
985        if (kvm_vcpu_trap_is_iabt(vcpu))
986                return false;
987
988        return kvm_vcpu_dabt_iswrite(vcpu);
989}
990
991static bool kvm_is_device_pfn(unsigned long pfn)
992{
993        return !pfn_valid(pfn);
994}
995
996static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, pfn_t pfn,
997                                      unsigned long size, bool uncached)
998{
999        __coherent_cache_guest_page(vcpu, pfn, size, uncached);
1000}
1001
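/*
 * Handle a stage-2 fault on memslot-backed memory: pin the host page,
 * choose between a PTE and a huge PMD mapping, do the required cache
 * maintenance and install the new stage-2 entry.
 */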
1002static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
1003                          struct kvm_memory_slot *memslot, unsigned long hva,
1004                          unsigned long fault_status)
1005{
1006        int ret;
1007        bool write_fault, writable, hugetlb = false, force_pte = false;
1008        unsigned long mmu_seq;
1009        gfn_t gfn = fault_ipa >> PAGE_SHIFT;
1010        struct kvm *kvm = vcpu->kvm;
1011        struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
1012        struct vm_area_struct *vma;
1013        pfn_t pfn;
1014        pgprot_t mem_type = PAGE_S2;
1015        bool fault_ipa_uncached;
1016
1017        write_fault = kvm_is_write_fault(vcpu);
1018        if (fault_status == FSC_PERM && !write_fault) {
1019                kvm_err("Unexpected L2 read permission error\n");
1020                return -EFAULT;
1021        }
1022
1023        /* Let's check if we will get back a huge page backed by hugetlbfs */
1024        down_read(&current->mm->mmap_sem);
1025        vma = find_vma_intersection(current->mm, hva, hva + 1);
1026        if (unlikely(!vma)) {
1027                kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
1028                up_read(&current->mm->mmap_sem);
1029                return -EFAULT;
1030        }
1031
1032        if (is_vm_hugetlb_page(vma)) {
1033                hugetlb = true;
1034                gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
1035        } else {
1036                /*
1037                 * Pages belonging to memslots that don't have the same
1038                 * alignment for userspace and IPA cannot be mapped using
1039                 * block descriptors even if the pages belong to a THP for
1040                 * the process, because the stage-2 block descriptor will
1041                 * cover more than a single THP and we lose atomicity for
1042                 * unmapping, updates, and splits of the THP or other pages
1043                 * in the stage-2 block range.
1044                 */
1045                if ((memslot->userspace_addr & ~PMD_MASK) !=
1046                    ((memslot->base_gfn << PAGE_SHIFT) & ~PMD_MASK))
1047                        force_pte = true;
1048        }
1049        up_read(&current->mm->mmap_sem);
1050
1051        /* We need minimum second+third level pages */
1052        ret = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES,
1053                                     KVM_NR_MEM_OBJS);
1054        if (ret)
1055                return ret;
1056
1057        mmu_seq = vcpu->kvm->mmu_notifier_seq;
1058        /*
1059         * Ensure the read of mmu_notifier_seq happens before we call
1060         * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
1061         * the page we just got a reference to gets unmapped before we have a
1062         * chance to grab the mmu_lock, which ensures that if the page gets
1063         * unmapped afterwards, the call to kvm_unmap_hva will take it away
1064         * from us again properly. This smp_rmb() interacts with the smp_wmb()
1065         * in kvm_mmu_notifier_invalidate_<page|range_end>.
1066         */
1067        smp_rmb();
1068
1069        pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
1070        if (is_error_pfn(pfn))
1071                return -EFAULT;
1072
1073        if (kvm_is_device_pfn(pfn))
1074                mem_type = PAGE_S2_DEVICE;
1075
1076        spin_lock(&kvm->mmu_lock);
1077        if (mmu_notifier_retry(kvm, mmu_seq))
1078                goto out_unlock;
1079        if (!hugetlb && !force_pte)
1080                hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
1081
1082        fault_ipa_uncached = memslot->flags & KVM_MEMSLOT_INCOHERENT;
1083
1084        if (hugetlb) {
1085                pmd_t new_pmd = pfn_pmd(pfn, mem_type);
1086                new_pmd = pmd_mkhuge(new_pmd);
1087                if (writable) {
1088                        kvm_set_s2pmd_writable(&new_pmd);
1089                        kvm_set_pfn_dirty(pfn);
1090                }
1091                coherent_cache_guest_page(vcpu, pfn, PMD_SIZE, fault_ipa_uncached);
1092                ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
1093        } else {
1094                pte_t new_pte = pfn_pte(pfn, mem_type);
1095                if (writable) {
1096                        kvm_set_s2pte_writable(&new_pte);
1097                        kvm_set_pfn_dirty(pfn);
1098                }
1099                coherent_cache_guest_page(vcpu, pfn, PAGE_SIZE, fault_ipa_uncached);
1100                ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
1101                        pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
1102        }
1103
1104
1105out_unlock:
1106        spin_unlock(&kvm->mmu_lock);
1107        kvm_release_pfn_clean(pfn);
1108        return ret;
1109}
1110
1111/**
1112 * kvm_handle_guest_abort - handles all 2nd stage aborts
1113 * @vcpu:       the VCPU pointer
1114 * @run:        the kvm_run structure
1115 *
1116 * Any abort that gets to the host is almost guaranteed to be caused by a
1117 * missing second stage translation table entry, which can mean that either the
1118 * guest simply needs more memory and we must allocate an appropriate page or it
1119 * can mean that the guest tried to access I/O memory, which is emulated by user
1120 * space. The distinction is based on the IPA causing the fault and whether this
1121 * memory region has been registered as standard RAM by user space.
1122 */
1123int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
1124{
1125        unsigned long fault_status;
1126        phys_addr_t fault_ipa;
1127        struct kvm_memory_slot *memslot;
1128        unsigned long hva;
1129        bool is_iabt, write_fault, writable;
1130        gfn_t gfn;
1131        int ret, idx;
1132
1133        is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
1134        fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
1135
1136        trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu),
1137                              kvm_vcpu_get_hfar(vcpu), fault_ipa);
1138
1139        /* Check that the stage-2 fault is a translation or permission fault */
1140        fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
1141        if (fault_status != FSC_FAULT && fault_status != FSC_PERM) {
1142                kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
1143                        kvm_vcpu_trap_get_class(vcpu),
1144                        (unsigned long)kvm_vcpu_trap_get_fault(vcpu),
1145                        (unsigned long)kvm_vcpu_get_hsr(vcpu));
1146                return -EFAULT;
1147        }
1148
1149        idx = srcu_read_lock(&vcpu->kvm->srcu);
1150
1151        gfn = fault_ipa >> PAGE_SHIFT;
1152        memslot = gfn_to_memslot(vcpu->kvm, gfn);
1153        hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
1154        write_fault = kvm_is_write_fault(vcpu);
1155        if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
1156                if (is_iabt) {
1157                        /* Prefetch Abort on I/O address */
1158                        kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
1159                        ret = 1;
1160                        goto out_unlock;
1161                }
1162
1163                /*
1164                 * The IPA is reported as [MAX:12], so we need to
1165                 * complement it with the bottom 12 bits from the
1166                 * faulting VA. This is always 12 bits, irrespective
1167                 * of the page size.
1168                 */
1169                fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
1170                ret = io_mem_abort(vcpu, run, fault_ipa);
1171                goto out_unlock;
1172        }
1173
1174        /* Userspace should not be able to register out-of-bounds IPAs */
1175        VM_BUG_ON(fault_ipa >= KVM_PHYS_SIZE);
1176
1177        ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
1178        if (ret == 0)
1179                ret = 1;
1180out_unlock:
1181        srcu_read_unlock(&vcpu->kvm->srcu, idx);
1182        return ret;
1183}
1184
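/*
 * Apply @handler to every guest physical page whose userspace address
 * intersects [start, end), iterating over all memslots.
 */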
1185static void handle_hva_to_gpa(struct kvm *kvm,
1186                              unsigned long start,
1187                              unsigned long end,
1188                              void (*handler)(struct kvm *kvm,
1189                                              gpa_t gpa, void *data),
1190                              void *data)
1191{
1192        struct kvm_memslots *slots;
1193        struct kvm_memory_slot *memslot;
1194
1195        slots = kvm_memslots(kvm);
1196
1197        /* we only care about the pages that the guest sees */
1198        kvm_for_each_memslot(memslot, slots) {
1199                unsigned long hva_start, hva_end;
1200                gfn_t gfn, gfn_end;
1201
1202                hva_start = max(start, memslot->userspace_addr);
1203                hva_end = min(end, memslot->userspace_addr +
1204                                        (memslot->npages << PAGE_SHIFT));
1205                if (hva_start >= hva_end)
1206                        continue;
1207
1208                /*
1209                 * {gfn(page) | page intersects with [hva_start, hva_end)} =
1210                 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
1211                 */
1212                gfn = hva_to_gfn_memslot(hva_start, memslot);
1213                gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
1214
1215                for (; gfn < gfn_end; ++gfn) {
1216                        gpa_t gpa = gfn << PAGE_SHIFT;
1217                        handler(kvm, gpa, data);
1218                }
1219        }
1220}
1221
1222static void kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
1223{
1224        unmap_stage2_range(kvm, gpa, PAGE_SIZE);
1225}
1226
1227int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
1228{
1229        unsigned long end = hva + PAGE_SIZE;
1230
1231        if (!kvm->arch.pgd)
1232                return 0;
1233
1234        trace_kvm_unmap_hva(hva);
1235        handle_hva_to_gpa(kvm, hva, end, &kvm_unmap_hva_handler, NULL);
1236        return 0;
1237}
1238
1239int kvm_unmap_hva_range(struct kvm *kvm,
1240                        unsigned long start, unsigned long end)
1241{
1242        if (!kvm->arch.pgd)
1243                return 0;
1244
1245        trace_kvm_unmap_hva_range(start, end);
1246        handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
1247        return 0;
1248}
1249
1250static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
1251{
1252        pte_t *pte = (pte_t *)data;
1253
1254        stage2_set_pte(kvm, NULL, gpa, pte, false);
1255}
1256
1257
1258void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
1259{
1260        unsigned long end = hva + PAGE_SIZE;
1261        pte_t stage2_pte;
1262
1263        if (!kvm->arch.pgd)
1264                return;
1265
1266        trace_kvm_set_spte_hva(hva);
1267        stage2_pte = pfn_pte(pte_pfn(pte), PAGE_S2);
1268        handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
1269}
1270
1271void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
1272{
1273        mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
1274}
1275
1276phys_addr_t kvm_mmu_get_httbr(void)
1277{
1278        return virt_to_phys(hyp_pgd);
1279}
1280
1281phys_addr_t kvm_mmu_get_boot_httbr(void)
1282{
1283        return virt_to_phys(boot_hyp_pgd);
1284}
1285
1286phys_addr_t kvm_get_idmap_vector(void)
1287{
1288        return hyp_idmap_vector;
1289}
1290
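/*
 * Set up the Hyp page tables: identity-map the Hyp init code (via a
 * bounce page if it crosses a page boundary) and map the trampoline
 * page into both the boot and the runtime Hyp page tables.
 */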
1291int kvm_mmu_init(void)
1292{
1293        int err;
1294
1295        hyp_idmap_start = kvm_virt_to_phys(__hyp_idmap_text_start);
1296        hyp_idmap_end = kvm_virt_to_phys(__hyp_idmap_text_end);
1297        hyp_idmap_vector = kvm_virt_to_phys(__kvm_hyp_init);
1298
1299        if ((hyp_idmap_start ^ hyp_idmap_end) & PAGE_MASK) {
1300                /*
1301                 * Our init code is crossing a page boundary. Allocate
1302                 * a bounce page, copy the code over and use that.
1303                 */
1304                size_t len = __hyp_idmap_text_end - __hyp_idmap_text_start;
1305                phys_addr_t phys_base;
1306
1307                init_bounce_page = (void *)__get_free_page(GFP_KERNEL);
1308                if (!init_bounce_page) {
1309                        kvm_err("Couldn't allocate HYP init bounce page\n");
1310                        err = -ENOMEM;
1311                        goto out;
1312                }
1313
1314                memcpy(init_bounce_page, __hyp_idmap_text_start, len);
1315                /*
1316                 * Warning: the code we just copied to the bounce page
1317                 * must be flushed to the point of coherency.
1318                 * Otherwise, the data may be sitting in L2, and HYP
1319                 * mode won't be able to observe it as it runs with
1320                 * caches off at that point.
1321                 */
1322                kvm_flush_dcache_to_poc(init_bounce_page, len);
1323
1324                phys_base = kvm_virt_to_phys(init_bounce_page);
1325                hyp_idmap_vector += phys_base - hyp_idmap_start;
1326                hyp_idmap_start = phys_base;
1327                hyp_idmap_end = phys_base + len;
1328
1329                kvm_info("Using HYP init bounce page @%lx\n",
1330                         (unsigned long)phys_base);
1331        }
1332
1333        hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
1334        boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
1335
1336        if (!hyp_pgd || !boot_hyp_pgd) {
1337                kvm_err("Hyp mode PGD not allocated\n");
1338                err = -ENOMEM;
1339                goto out;
1340        }
1341
1342        /* Create the idmap in the boot page tables */
1343        err =   __create_hyp_mappings(boot_hyp_pgd,
1344                                      hyp_idmap_start, hyp_idmap_end,
1345                                      __phys_to_pfn(hyp_idmap_start),
1346                                      PAGE_HYP);
1347
1348        if (err) {
1349                kvm_err("Failed to idmap %lx-%lx\n",
1350                        hyp_idmap_start, hyp_idmap_end);
1351                goto out;
1352        }
1353
1354        /* Map the very same page at the trampoline VA */
1355        err =   __create_hyp_mappings(boot_hyp_pgd,
1356                                      TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
1357                                      __phys_to_pfn(hyp_idmap_start),
1358                                      PAGE_HYP);
1359        if (err) {
1360                kvm_err("Failed to map trampoline @%lx into boot HYP pgd\n",
1361                        TRAMPOLINE_VA);
1362                goto out;
1363        }
1364
1365        /* Map the same page again into the runtime page tables */
1366        err =   __create_hyp_mappings(hyp_pgd,
1367                                      TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
1368                                      __phys_to_pfn(hyp_idmap_start),
1369                                      PAGE_HYP);
1370        if (err) {
1371                kvm_err("Failed to map trampoline @%lx into runtime HYP pgd\n",
1372                        TRAMPOLINE_VA);
1373                goto out;
1374        }
1375
1376        return 0;
1377out:
1378        free_hyp_pgds();
1379        return err;
1380}
1381
1382void kvm_arch_commit_memory_region(struct kvm *kvm,
1383                                   struct kvm_userspace_memory_region *mem,
1384                                   const struct kvm_memory_slot *old,
1385                                   enum kvm_mr_change change)
1386{
1387}
1388
1389int kvm_arch_prepare_memory_region(struct kvm *kvm,
1390                                   struct kvm_memory_slot *memslot,
1391                                   struct kvm_userspace_memory_region *mem,
1392                                   enum kvm_mr_change change)
1393{
1394        hva_t hva = mem->userspace_addr;
1395        hva_t reg_end = hva + mem->memory_size;
1396        bool writable = !(mem->flags & KVM_MEM_READONLY);
1397        int ret = 0;
1398
1399        if (change != KVM_MR_CREATE && change != KVM_MR_MOVE)
1400                return 0;
1401
1402        /*
1403         * Prevent userspace from creating a memory region outside of the IPA
1404         * space addressable by the KVM guest.
1405         */
1406        if (memslot->base_gfn + memslot->npages >=
1407            (KVM_PHYS_SIZE >> PAGE_SHIFT))
1408                return -EFAULT;
1409
1410        down_read(&current->mm->mmap_sem);
1411        /*
1412         * A memory region could potentially cover multiple VMAs, and any holes
1413         * between them, so iterate over all of them to find out if we can map
1414         * any of them right now.
1415         *
1416         *     +--------------------------------------------+
1417         * +---------------+----------------+   +----------------+
1418         * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
1419         * +---------------+----------------+   +----------------+
1420         *     |               memory region                |
1421         *     +--------------------------------------------+
1422         */
1423        do {
1424                struct vm_area_struct *vma = find_vma(current->mm, hva);
1425                hva_t vm_start, vm_end;
1426
1427                if (!vma || vma->vm_start >= reg_end)
1428                        break;
1429
1430                /*
1431                 * Mapping a read-only VMA is only allowed if the
1432                 * memory region is configured as read-only.
1433                 */
1434                if (writable && !(vma->vm_flags & VM_WRITE)) {
1435                        ret = -EPERM;
1436                        break;
1437                }
1438
1439                /*
1440                 * Take the intersection of this VMA with the memory region
1441                 */
1442                vm_start = max(hva, vma->vm_start);
1443                vm_end = min(reg_end, vma->vm_end);
1444
1445                if (vma->vm_flags & VM_PFNMAP) {
1446                        gpa_t gpa = mem->guest_phys_addr +
1447                                    (vm_start - mem->userspace_addr);
1448                        phys_addr_t pa;
1449
1450                        pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
1451                        pa += vm_start - vma->vm_start;
1452
1453                        ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
1454                                                    vm_end - vm_start,
1455                                                    writable);
1456                        if (ret)
1457                                break;
1458                }
1459                hva = vm_end;
1460        } while (hva < reg_end);
1461
1462        spin_lock(&kvm->mmu_lock);
1463        if (ret)
1464                unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
1465        else
1466                stage2_flush_memslot(kvm, memslot);
1467        spin_unlock(&kvm->mmu_lock);
1468
1469        up_read(&current->mm->mmap_sem);
1470        return ret;
1471}
1472
1473void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
1474                           struct kvm_memory_slot *dont)
1475{
1476}
1477
1478int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
1479                            unsigned long npages)
1480{
1481        /*
1482         * Readonly memslots are not incoherent with the caches by definition,
1483         * but in practice, they are used mostly to emulate ROMs or NOR flashes
1484         * that the guest may consider devices and hence map as uncached.
1485         * To prevent incoherency issues in these cases, tag all readonly
1486         * regions as incoherent.
1487         */
1488        if (slot->flags & KVM_MEM_READONLY)
1489                slot->flags |= KVM_MEMSLOT_INCOHERENT;
1490        return 0;
1491}
1492
1493void kvm_arch_memslots_updated(struct kvm *kvm)
1494{
1495}
1496
1497void kvm_arch_flush_shadow_all(struct kvm *kvm)
1498{
1499        kvm_free_stage2_pgd(kvm);
1500}
1501
1502void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
1503                                   struct kvm_memory_slot *slot)
1504{
1505        gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
1506        phys_addr_t size = slot->npages << PAGE_SHIFT;
1507
1508        spin_lock(&kvm->mmu_lock);
1509        unmap_stage2_range(kvm, gpa, size);
1510        spin_unlock(&kvm->mmu_lock);
1511}