[BACK]Return to uvm_swap.c CVS log [TXT][DIR] Up to [local] / sys / uvm

Annotation of sys/uvm/uvm_swap.c, Revision 1.1.1.1

1.1       nbrk        1: /*     $OpenBSD: uvm_swap.c,v 1.72 2007/06/18 21:51:15 pedro Exp $     */
                      2: /*     $NetBSD: uvm_swap.c,v 1.40 2000/11/17 11:39:39 mrg Exp $        */
                      3:
                      4: /*
                      5:  * Copyright (c) 1995, 1996, 1997 Matthew R. Green
                      6:  * All rights reserved.
                      7:  *
                      8:  * Redistribution and use in source and binary forms, with or without
                      9:  * modification, are permitted provided that the following conditions
                     10:  * are met:
                     11:  * 1. Redistributions of source code must retain the above copyright
                     12:  *    notice, this list of conditions and the following disclaimer.
                     13:  * 2. Redistributions in binary form must reproduce the above copyright
                     14:  *    notice, this list of conditions and the following disclaimer in the
                     15:  *    documentation and/or other materials provided with the distribution.
                     16:  * 3. The name of the author may not be used to endorse or promote products
                     17:  *    derived from this software without specific prior written permission.
                     18:  *
                     19:  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
                     20:  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
                     21:  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
                     22:  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
                     23:  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
                     24:  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
                     25:  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
                     26:  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
                     27:  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
                     28:  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
                     29:  * SUCH DAMAGE.
                     30:  *
                     31:  * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
                     32:  * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
                     33:  */
                     34:
                     35: #include <sys/param.h>
                     36: #include <sys/systm.h>
                     37: #include <sys/buf.h>
                     38: #include <sys/conf.h>
                     39: #include <sys/proc.h>
                     40: #include <sys/namei.h>
                     41: #include <sys/disklabel.h>
                     42: #include <sys/errno.h>
                     43: #include <sys/kernel.h>
                     44: #include <sys/malloc.h>
                     45: #include <sys/vnode.h>
                     46: #include <sys/file.h>
                     47: #include <sys/extent.h>
                     48: #include <sys/mount.h>
                     49: #include <sys/pool.h>
                     50: #include <sys/syscallargs.h>
                     51: #include <sys/swap.h>
                     52:
                     53: #include <uvm/uvm.h>
                     54: #ifdef UVM_SWAP_ENCRYPT
                     55: #include <sys/syslog.h>
                     56: #endif
                     57:
                     58: #include <miscfs/specfs/specdev.h>
                     59:
                     60: /*
                     61:  * uvm_swap.c: manage configuration and i/o to swap space.
                     62:  */
                     63:
                     64: /*
                     65:  * swap space is managed in the following way:
                     66:  *
                     67:  * each swap partition or file is described by a "swapdev" structure.
                     68:  * each "swapdev" structure contains a "swapent" structure which contains
                     69:  * information that is passed up to the user (via system calls).
                     70:  *
                     71:  * each swap partition is assigned a "priority" (int) which controls
                     72:  * swap partition usage.
                     73:  *
                     74:  * the system maintains a global data structure describing all swap
                     75:  * partitions/files.   there is a sorted LIST of "swappri" structures
                     76:  * which describe "swapdev"'s at that priority.   this LIST is headed
                     77:  * by the "swap_priority" global var.    each "swappri" contains a
                     78:  * CIRCLEQ of "swapdev" structures at that priority.
                     79:  *
                     80:  * locking:
                     81:  *  - swap_syscall_lock (sleep lock): this lock serializes the swapctl
                     82:  *    system call and prevents the swap priority list from changing
                     83:  *    while we are in the middle of a system call (e.g. SWAP_STATS).
                     84:  *  - uvm.swap_data_lock (simple_lock): this lock protects all swap data
                     85:  *    structures including the priority list, the swapdev structures,
                     86:  *    and the swapmap extent.
                     87:  *
                     88:  * each swap device has the following info:
                     89:  *  - swap device in use (could be disabled, preventing future use)
                     90:  *  - swap enabled (allows new allocations on swap)
                     91:  *  - map info in /dev/drum
                     92:  *  - vnode pointer
                     93:  * for swap files only:
                     94:  *  - block size
                     95:  *  - max byte count in buffer
                     96:  *  - buffer
                     97:  *  - credentials to use when doing i/o to file
                     98:  *
                     99:  * userland controls and configures swap with the swapctl(2) system call.
                    100:  * the sys_swapctl performs the following operations:
                    101:  *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
                    102:  *  [2] SWAP_STATS: given a pointer to an array of swapent structures
                    103:  *     (passed in via "arg") of a size passed in via "misc" ... we load
                    104:  *     the current swap config into the array.
                    105:  *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
                    106:  *     priority in "misc", start swapping on it.
                    107:  *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
                    108:  *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
                    109:  *     "misc")
                    110:  */
                    111:
                    112: /*
                    113:  * swapdev: describes a single swap partition/file
                    114:  *
                    115:  * note the following should be true:
                    116:  * swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
                    117:  * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
                    118:  */
                    119: struct swapdev {
                    120:        struct swapent  swd_se;
                    121: #define        swd_dev         swd_se.se_dev           /* device id */
                    122: #define        swd_flags       swd_se.se_flags         /* flags:inuse/enable/fake */
                    123: #define        swd_priority    swd_se.se_priority      /* our priority */
                    124: #define        swd_inuse       swd_se.se_inuse         /* our priority */
                    125: #define        swd_nblks       swd_se.se_nblks         /* our priority */
                    126:        char                    *swd_path;      /* saved pathname of device */
                    127:        int                     swd_pathlen;    /* length of pathname */
                    128:        int                     swd_npages;     /* #pages we can use */
                    129:        int                     swd_npginuse;   /* #pages in use */
                    130:        int                     swd_npgbad;     /* #pages bad */
                    131:        int                     swd_drumoffset; /* page0 offset in drum */
                    132:        int                     swd_drumsize;   /* #pages in drum */
                    133:        struct extent           *swd_ex;        /* extent for this swapdev */
                    134:        char                    swd_exname[12]; /* name of extent above */
                    135:        struct vnode            *swd_vp;        /* backing vnode */
                    136:        CIRCLEQ_ENTRY(swapdev)  swd_next;       /* priority circleq */
                    137:
                    138:        int                     swd_bsize;      /* blocksize (bytes) */
                    139:        int                     swd_maxactive;  /* max active i/o reqs */
                    140:        struct buf              swd_tab;        /* buffer list */
                    141:        struct ucred            *swd_cred;      /* cred for file access */
                    142: #ifdef UVM_SWAP_ENCRYPT
                    143: #define SWD_KEY_SHIFT          7               /* One key per 0.5 MByte */
                    144: #define SWD_KEY(x,y)           &((x)->swd_keys[((y) - (x)->swd_drumoffset) >> SWD_KEY_SHIFT])
                    145:
                    146: #define SWD_DCRYPT_SHIFT       5
                    147: #define SWD_DCRYPT_BITS                32
                    148: #define SWD_DCRYPT_MASK                (SWD_DCRYPT_BITS - 1)
                    149: #define SWD_DCRYPT_OFF(x)      ((x) >> SWD_DCRYPT_SHIFT)
                    150: #define SWD_DCRYPT_BIT(x)      ((x) & SWD_DCRYPT_MASK)
                    151: #define SWD_DCRYPT_SIZE(x)     (SWD_DCRYPT_OFF((x) + SWD_DCRYPT_MASK) * sizeof(u_int32_t))
                    152:        u_int32_t               *swd_decrypt;   /* bitmap for decryption */
                    153:        struct swap_key         *swd_keys;      /* keys for different parts */
                    154:        int                     swd_nkeys;      /* active keys */
                    155: #endif
                    156: };
                    157:
                    158: /*
                    159:  * swap device priority entry; the list is kept sorted on `spi_priority'.
                    160:  */
                    161: struct swappri {
                    162:        int                     spi_priority;     /* priority */
                    163:        CIRCLEQ_HEAD(spi_swapdev, swapdev)      spi_swapdev;
                    164:        /* circleq of swapdevs at this priority */
                    165:        LIST_ENTRY(swappri)     spi_swappri;      /* global list of pri's */
                    166: };
                    167:
                    168: /*
                    169:  * The following two structures are used to keep track of data transfers
                    170:  * on swap devices associated with regular files.
                    171:  * NOTE: this code is more or less a copy of vnd.c; we use the same
                    172:  * structure names here to ease porting..
                    173:  */
                    174: struct vndxfer {
                    175:        struct buf      *vx_bp;         /* Pointer to parent buffer */
                    176:        struct swapdev  *vx_sdp;
                    177:        int             vx_error;
                    178:        int             vx_pending;     /* # of pending aux buffers */
                    179:        int             vx_flags;
                    180: #define VX_BUSY                1
                    181: #define VX_DEAD                2
                    182: };
                    183:
                    184: struct vndbuf {
                    185:        struct buf      vb_buf;
                    186:        struct vndxfer  *vb_xfer;
                    187: };
                    188:
                    189:
                    190: /*
                    191:  * We keep a of pool vndbuf's and vndxfer structures.
                    192:  */
                    193: struct pool vndxfer_pool;
                    194: struct pool vndbuf_pool;
                    195:
                    196: #define        getvndxfer(vnx) do {                                            \
                    197:        int s = splbio();                                               \
                    198:        vnx = pool_get(&vndxfer_pool, PR_WAITOK);                       \
                    199:        splx(s);                                                        \
                    200: } while (0)
                    201:
                    202: #define putvndxfer(vnx) {                                              \
                    203:        pool_put(&vndxfer_pool, (void *)(vnx));                         \
                    204: }
                    205:
                    206: #define        getvndbuf(vbp)  do {                                            \
                    207:        int s = splbio();                                               \
                    208:        vbp = pool_get(&vndbuf_pool, PR_WAITOK);                        \
                    209:        splx(s);                                                        \
                    210: } while (0)
                    211:
                    212: #define putvndbuf(vbp) {                                               \
                    213:        pool_put(&vndbuf_pool, (void *)(vbp));                          \
                    214: }
                    215:
                    216: /* /dev/drum */
                    217: bdev_decl(sw);
                    218: cdev_decl(sw);
                    219:
                    220: /*
                    221:  * local variables
                    222:  */
                    223: static struct extent *swapmap;         /* controls the mapping of /dev/drum */
                    224:
                    225: /* list of all active swap devices [by priority] */
                    226: LIST_HEAD(swap_priority, swappri);
                    227: static struct swap_priority swap_priority;
                    228:
                    229: /* locks */
                    230: struct rwlock swap_syscall_lock = RWLOCK_INITIALIZER("swplk");
                    231:
                    232: /*
                    233:  * prototypes
                    234:  */
                    235: static void             swapdrum_add(struct swapdev *, int);
                    236: static struct swapdev  *swapdrum_getsdp(int);
                    237:
                    238: static struct swapdev  *swaplist_find(struct vnode *, int);
                    239: static void             swaplist_insert(struct swapdev *,
                    240:                                             struct swappri *, int);
                    241: static void             swaplist_trim(void);
                    242:
                    243: static int swap_on(struct proc *, struct swapdev *);
                    244: static int swap_off(struct proc *, struct swapdev *);
                    245:
                    246: static void sw_reg_strategy(struct swapdev *, struct buf *, int);
                    247: static void sw_reg_iodone(struct buf *);
                    248: static void sw_reg_start(struct swapdev *);
                    249:
                    250: static int uvm_swap_io(struct vm_page **, int, int, int);
                    251:
                    252: static void swapmount(void);
                    253:
                    254: #ifdef UVM_SWAP_ENCRYPT
                    255: /* for swap encrypt */
                    256: boolean_t uvm_swap_allocpages(struct vm_page **, int);
                    257: void uvm_swap_markdecrypt(struct swapdev *, int, int, int);
                    258: boolean_t uvm_swap_needdecrypt(struct swapdev *, int);
                    259: void uvm_swap_initcrypt(struct swapdev *, int);
                    260: #endif
                    261:
                    262: /*
                    263:  * uvm_swap_init: init the swap system data structures and locks
                    264:  *
                    265:  * => called at boot time from init_main.c after the filesystems
                    266:  *     are brought up (which happens after uvm_init())
                    267:  */
                    268: void
                    269: uvm_swap_init()
                    270: {
                    271:        UVMHIST_FUNC("uvm_swap_init");
                    272:
                    273:        UVMHIST_CALLED(pdhist);
                    274:        /*
                    275:         * first, init the swap list, its counter, and its lock.
                    276:         * then get a handle on the vnode for /dev/drum by using
                    277:         * the its dev_t number ("swapdev", from MD conf.c).
                    278:         */
                    279:
                    280:        LIST_INIT(&swap_priority);
                    281:        uvmexp.nswapdev = 0;
                    282:        simple_lock_init(&uvm.swap_data_lock);
                    283:
                    284:        if (!swapdev_vp && bdevvp(swapdev, &swapdev_vp))
                    285:                panic("uvm_swap_init: can't get vnode for swap device");
                    286:
                    287:        /*
                    288:         * create swap block resource map to map /dev/drum.   the range
                    289:         * from 1 to INT_MAX allows 2 gigablocks of swap space.  note
                    290:         * that block 0 is reserved (used to indicate an allocation
                    291:         * failure, or no allocation).
                    292:         */
                    293:        swapmap = extent_create("swapmap", 1, INT_MAX,
                    294:                                M_VMSWAP, 0, 0, EX_NOWAIT);
                    295:        if (swapmap == 0)
                    296:                panic("uvm_swap_init: extent_create failed");
                    297:
                    298:        /*
                    299:         * allocate pools for structures used for swapping to files.
                    300:         */
                    301:
                    302:
                    303:        pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx",
                    304:            NULL);
                    305:
                    306:        pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd",
                    307:            NULL);
                    308:
                    309:        /*
                    310:         * Setup the initial swap partition
                    311:         */
                    312:        swapmount();
                    313:
                    314:        /*
                    315:         * done!
                    316:         */
                    317:        UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
                    318: }
                    319:
                    320: #ifdef UVM_SWAP_ENCRYPT
                    321: void
                    322: uvm_swap_initcrypt_all(void)
                    323: {
                    324:        struct swapdev *sdp;
                    325:        struct swappri *spp;
                    326:
                    327:        simple_lock(&uvm.swap_data_lock);
                    328:
                    329:        LIST_FOREACH(spp, &swap_priority, spi_swappri) {
                    330:                CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next)
                    331:                        if (sdp->swd_decrypt == NULL)
                    332:                                uvm_swap_initcrypt(sdp, sdp->swd_npages);
                    333:        }
                    334:        simple_unlock(&uvm.swap_data_lock);
                    335: }
                    336:
                    337: void
                    338: uvm_swap_initcrypt(struct swapdev *sdp, int npages)
                    339: {
                    340:        /*
                    341:         * keep information if a page needs to be decrypted when we get it
                    342:         * from the swap device.
                    343:         * We cannot chance a malloc later, if we are doing ASYNC puts,
                    344:         * we may not call malloc with M_WAITOK.  This consumes only
                    345:         * 8KB memory for a 256MB swap partition.
                    346:         */
                    347:        sdp->swd_decrypt = malloc(SWD_DCRYPT_SIZE(npages), M_VMSWAP, M_WAITOK);
                    348:        memset(sdp->swd_decrypt, 0, SWD_DCRYPT_SIZE(npages));
                    349:        sdp->swd_keys = malloc((npages >> SWD_KEY_SHIFT) * sizeof(struct swap_key),
                    350:                               M_VMSWAP, M_WAITOK);
                    351:        memset(sdp->swd_keys, 0, (npages >> SWD_KEY_SHIFT) * sizeof(struct swap_key));
                    352:        sdp->swd_nkeys = 0;
                    353: }
                    354:
                    355: boolean_t
                    356: uvm_swap_allocpages(struct vm_page **pps, int npages)
                    357: {
                    358:        int i, s;
                    359:        int minus, reserve;
                    360:        boolean_t fail;
                    361:
                    362:        /* Estimate if we will succeed */
                    363:        s = uvm_lock_fpageq();
                    364:
                    365:        minus = uvmexp.free - npages;
                    366:        reserve = uvmexp.reserve_kernel;
                    367:        fail = uvmexp.free - npages < uvmexp.reserve_kernel;
                    368:
                    369:        uvm_unlock_fpageq(s);
                    370:
                    371:        if (fail)
                    372:                return FALSE;
                    373:
                    374:        /* Get new pages */
                    375:        for (i = 0; i < npages; i++) {
                    376:                pps[i] = uvm_pagealloc(NULL, 0, NULL, 0);
                    377:                if (pps[i] == NULL)
                    378:                        break;
                    379:        }
                    380:
                    381:        /* On failure free and return */
                    382:        if (i < npages) {
                    383:                uvm_swap_freepages(pps, i);
                    384:                return FALSE;
                    385:        }
                    386:
                    387:        return TRUE;
                    388: }
                    389:
                    390: void
                    391: uvm_swap_freepages(struct vm_page **pps, int npages)
                    392: {
                    393:        int i;
                    394:
                    395:        uvm_lock_pageq();
                    396:        for (i = 0; i < npages; i++)
                    397:                uvm_pagefree(pps[i]);
                    398:        uvm_unlock_pageq();
                    399: }
                    400:
                    401: /*
                    402:  * Mark pages on the swap device for later decryption
                    403:  */
                    404:
                    405: void
                    406: uvm_swap_markdecrypt(struct swapdev *sdp, int startslot, int npages,
                    407:                     int decrypt)
                    408: {
                    409:        int pagestart, i;
                    410:        int off, bit;
                    411:
                    412:        if (!sdp)
                    413:                return;
                    414:
                    415:        pagestart = startslot - sdp->swd_drumoffset;
                    416:        for (i = 0; i < npages; i++, pagestart++) {
                    417:                off = SWD_DCRYPT_OFF(pagestart);
                    418:                bit = SWD_DCRYPT_BIT(pagestart);
                    419:                if (decrypt)
                    420:                        /* pages read need decryption */
                    421:                        sdp->swd_decrypt[off] |= 1 << bit;
                    422:                else
                    423:                        /* pages read do not need decryption */
                    424:                        sdp->swd_decrypt[off] &= ~(1 << bit);
                    425:        }
                    426: }
                    427:
                    428: /*
                    429:  * Check if the page that we got from disk needs to be decrypted
                    430:  */
                    431:
                    432: boolean_t
                    433: uvm_swap_needdecrypt(struct swapdev *sdp, int off)
                    434: {
                    435:        if (!sdp)
                    436:                return FALSE;
                    437:
                    438:        off -= sdp->swd_drumoffset;
                    439:        return sdp->swd_decrypt[SWD_DCRYPT_OFF(off)] & (1 << SWD_DCRYPT_BIT(off)) ?
                    440:                TRUE : FALSE;
                    441: }
                    442: #endif /* UVM_SWAP_ENCRYPT */
                    443: /*
                    444:  * swaplist functions: functions that operate on the list of swap
                    445:  * devices on the system.
                    446:  */
                    447:
                    448: /*
                    449:  * swaplist_insert: insert swap device "sdp" into the global list
                    450:  *
                    451:  * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
                    452:  * => caller must provide a newly malloc'd swappri structure (we will
                    453:  *     FREE it if we don't need it... this it to prevent malloc blocking
                    454:  *     here while adding swap)
                    455:  */
                    456: static void
                    457: swaplist_insert(sdp, newspp, priority)
                    458:        struct swapdev *sdp;
                    459:        struct swappri *newspp;
                    460:        int priority;
                    461: {
                    462:        struct swappri *spp, *pspp;
                    463:        UVMHIST_FUNC("swaplist_insert"); UVMHIST_CALLED(pdhist);
                    464:
                    465:        /*
                    466:         * find entry at or after which to insert the new device.
                    467:         */
                    468:        for (pspp = NULL, spp = LIST_FIRST(&swap_priority); spp != NULL;
                    469:             spp = LIST_NEXT(spp, spi_swappri)) {
                    470:                if (priority <= spp->spi_priority)
                    471:                        break;
                    472:                pspp = spp;
                    473:        }
                    474:
                    475:        /*
                    476:         * new priority?
                    477:         */
                    478:        if (spp == NULL || spp->spi_priority != priority) {
                    479:                spp = newspp;  /* use newspp! */
                    480:                UVMHIST_LOG(pdhist, "created new swappri = %ld",
                    481:                            priority, 0, 0, 0);
                    482:
                    483:                spp->spi_priority = priority;
                    484:                CIRCLEQ_INIT(&spp->spi_swapdev);
                    485:
                    486:                if (pspp)
                    487:                        LIST_INSERT_AFTER(pspp, spp, spi_swappri);
                    488:                else
                    489:                        LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
                    490:        } else {
                    491:                /* we don't need a new priority structure, free it */
                    492:                FREE(newspp, M_VMSWAP);
                    493:        }
                    494:
                    495:        /*
                    496:         * priority found (or created).   now insert on the priority's
                    497:         * circleq list and bump the total number of swapdevs.
                    498:         */
                    499:        sdp->swd_priority = priority;
                    500:        CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
                    501:        uvmexp.nswapdev++;
                    502: }
                    503:
                    504: /*
                    505:  * swaplist_find: find and optionally remove a swap device from the
                    506:  *     global list.
                    507:  *
                    508:  * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
                    509:  * => we return the swapdev we found (and removed)
                    510:  */
                    511: static struct swapdev *
                    512: swaplist_find(vp, remove)
                    513:        struct vnode *vp;
                    514:        boolean_t remove;
                    515: {
                    516:        struct swapdev *sdp;
                    517:        struct swappri *spp;
                    518:
                    519:        /*
                    520:         * search the lists for the requested vp
                    521:         */
                    522:        for (spp = LIST_FIRST(&swap_priority); spp != NULL;
                    523:             spp = LIST_NEXT(spp, spi_swappri)) {
                    524:                for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
                    525:                     sdp != (void *)&spp->spi_swapdev;
                    526:                     sdp = CIRCLEQ_NEXT(sdp, swd_next))
                    527:                        if (sdp->swd_vp == vp) {
                    528:                                if (remove) {
                    529:                                        CIRCLEQ_REMOVE(&spp->spi_swapdev,
                    530:                                            sdp, swd_next);
                    531:                                        uvmexp.nswapdev--;
                    532:                                }
                    533:                                return(sdp);
                    534:                        }
                    535:        }
                    536:        return (NULL);
                    537: }
                    538:
                    539:
                    540: /*
                    541:  * swaplist_trim: scan priority list for empty priority entries and kill
                    542:  *     them.
                    543:  *
                    544:  * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
                    545:  */
                    546: static void
                    547: swaplist_trim()
                    548: {
                    549:        struct swappri *spp, *nextspp;
                    550:
                    551:        for (spp = LIST_FIRST(&swap_priority); spp != NULL; spp = nextspp) {
                    552:                nextspp = LIST_NEXT(spp, spi_swappri);
                    553:                if (CIRCLEQ_FIRST(&spp->spi_swapdev) !=
                    554:                    (void *)&spp->spi_swapdev)
                    555:                        continue;
                    556:                LIST_REMOVE(spp, spi_swappri);
                    557:                free(spp, M_VMSWAP);
                    558:        }
                    559: }
                    560:
                    561: /*
                    562:  * swapdrum_add: add a "swapdev"'s blocks into /dev/drum's area.
                    563:  *
                    564:  * => caller must hold swap_syscall_lock
                    565:  * => uvm.swap_data_lock should be unlocked (we may sleep)
                    566:  */
                    567: static void
                    568: swapdrum_add(sdp, npages)
                    569:        struct swapdev *sdp;
                    570:        int     npages;
                    571: {
                    572:        u_long result;
                    573:
                    574:        if (extent_alloc(swapmap, npages, EX_NOALIGN, 0, EX_NOBOUNDARY,
                    575:            EX_WAITOK, &result))
                    576:                panic("swapdrum_add");
                    577:
                    578:        sdp->swd_drumoffset = result;
                    579:        sdp->swd_drumsize = npages;
                    580: }
                    581:
                    582: /*
                    583:  * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
                    584:  *     to the "swapdev" that maps that section of the drum.
                    585:  *
                    586:  * => each swapdev takes one big contig chunk of the drum
                    587:  * => caller must hold uvm.swap_data_lock
                    588:  */
                    589: static struct swapdev *
                    590: swapdrum_getsdp(pgno)
                    591:        int pgno;
                    592: {
                    593:        struct swapdev *sdp;
                    594:        struct swappri *spp;
                    595:
                    596:        for (spp = LIST_FIRST(&swap_priority); spp != NULL;
                    597:             spp = LIST_NEXT(spp, spi_swappri))
                    598:                for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
                    599:                     sdp != (void *)&spp->spi_swapdev;
                    600:                     sdp = CIRCLEQ_NEXT(sdp, swd_next))
                    601:                        if (pgno >= sdp->swd_drumoffset &&
                    602:                            pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
                    603:                                return sdp;
                    604:                        }
                    605:        return NULL;
                    606: }
                    607:
                    608:
                    609: /*
                    610:  * sys_swapctl: main entry point for swapctl(2) system call
                    611:  *     [with two helper functions: swap_on and swap_off]
                    612:  */
                    613: int
                    614: sys_swapctl(p, v, retval)
                    615:        struct proc *p;
                    616:        void *v;
                    617:        register_t *retval;
                    618: {
                    619:        struct sys_swapctl_args /* {
                    620:                syscallarg(int) cmd;
                    621:                syscallarg(void *) arg;
                    622:                syscallarg(int) misc;
                    623:        } */ *uap = (struct sys_swapctl_args *)v;
                    624:        struct vnode *vp;
                    625:        struct nameidata nd;
                    626:        struct swappri *spp;
                    627:        struct swapdev *sdp;
                    628:        struct swapent *sep;
                    629:        char    userpath[MAXPATHLEN];
                    630:        size_t  len;
                    631:        int     count, error, misc;
                    632:        int     priority;
                    633:        UVMHIST_FUNC("sys_swapctl"); UVMHIST_CALLED(pdhist);
                    634:
                    635:        misc = SCARG(uap, misc);
                    636:
                    637:        /*
                    638:         * ensure serialized syscall access by grabbing the swap_syscall_lock
                    639:         */
                    640:        rw_enter_write(&swap_syscall_lock);
                    641:
                    642:        /*
                    643:         * we handle the non-priv NSWAP and STATS request first.
                    644:         *
                    645:         * SWAP_NSWAP: return number of config'd swap devices
                    646:         * [can also be obtained with uvmexp sysctl]
                    647:         */
                    648:        if (SCARG(uap, cmd) == SWAP_NSWAP) {
                    649:                UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%ld", uvmexp.nswapdev,
                    650:                    0, 0, 0);
                    651:                *retval = uvmexp.nswapdev;
                    652:                error = 0;
                    653:                goto out;
                    654:        }
                    655:
                    656:        /*
                    657:         * SWAP_STATS: get stats on current # of configured swap devs
                    658:         *
                    659:         * note that the swap_priority list can't change as long
                    660:         * as we are holding the swap_syscall_lock.  we don't want
                    661:         * to grab the uvm.swap_data_lock because we may fault&sleep during
                    662:         * copyout() and we don't want to be holding that lock then!
                    663:         */
                    664:        if (SCARG(uap, cmd) == SWAP_STATS
                    665: #if defined(COMPAT_13)
                    666:            || SCARG(uap, cmd) == SWAP_OSTATS
                    667: #endif
                    668:            ) {
                    669:                sep = (struct swapent *)SCARG(uap, arg);
                    670:                count = 0;
                    671:
                    672:                for (spp = LIST_FIRST(&swap_priority); spp != NULL;
                    673:                    spp = LIST_NEXT(spp, spi_swappri)) {
                    674:                        for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
                    675:                             sdp != (void *)&spp->spi_swapdev && misc-- > 0;
                    676:                             sdp = CIRCLEQ_NEXT(sdp, swd_next)) {
                    677:                                sdp->swd_inuse =
                    678:                                    btodb((u_int64_t)sdp->swd_npginuse <<
                    679:                                    PAGE_SHIFT);
                    680:                                error = copyout(&sdp->swd_se, sep,
                    681:                                    sizeof(struct swapent));
                    682:
                    683:                                /* now copy out the path if necessary */
                    684: #if defined(COMPAT_13)
                    685:                                if (error == 0 && SCARG(uap, cmd) == SWAP_STATS)
                    686: #else
                    687:                                if (error == 0)
                    688: #endif
                    689:                                        error = copyout(sdp->swd_path,
                    690:                                            &sep->se_path, sdp->swd_pathlen);
                    691:
                    692:                                if (error)
                    693:                                        goto out;
                    694:                                count++;
                    695: #if defined(COMPAT_13)
                    696:                                if (SCARG(uap, cmd) == SWAP_OSTATS)
                    697:                                        ((struct oswapent *)sep)++;
                    698:                                else
                    699: #endif
                    700:                                        sep++;
                    701:                        }
                    702:                }
                    703:
                    704:                UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);
                    705:
                    706:                *retval = count;
                    707:                error = 0;
                    708:                goto out;
                    709:        }
                    710:
                    711:        /*
                    712:         * all other requests require superuser privs.   verify.
                    713:         */
                    714:        if ((error = suser(p, 0)))
                    715:                goto out;
                    716:
                    717:        /*
                    718:         * at this point we expect a path name in arg.   we will
                    719:         * use namei() to gain a vnode reference (vref), and lock
                    720:         * the vnode (VOP_LOCK).
                    721:         *
                    722:         * XXX: a NULL arg means use the root vnode pointer (e.g. for
                    723:         * miniroot)
                    724:         */
                    725:        if (SCARG(uap, arg) == NULL) {
                    726:                vp = rootvp;            /* miniroot */
                    727:                if (vget(vp, LK_EXCLUSIVE, p)) {
                    728:                        error = EBUSY;
                    729:                        goto out;
                    730:                }
                    731:                if (SCARG(uap, cmd) == SWAP_ON &&
                    732:                    copystr("miniroot", userpath, sizeof userpath, &len))
                    733:                        panic("swapctl: miniroot copy failed");
                    734:        } else {
                    735:                int     space;
                    736:                char    *where;
                    737:
                    738:                if (SCARG(uap, cmd) == SWAP_ON) {
                    739:                        if ((error = copyinstr(SCARG(uap, arg), userpath,
                    740:                            sizeof userpath, &len)))
                    741:                                goto out;
                    742:                        space = UIO_SYSSPACE;
                    743:                        where = userpath;
                    744:                } else {
                    745:                        space = UIO_USERSPACE;
                    746:                        where = (char *)SCARG(uap, arg);
                    747:                }
                    748:                NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, space, where, p);
                    749:                if ((error = namei(&nd)))
                    750:                        goto out;
                    751:                vp = nd.ni_vp;
                    752:        }
                    753:        /* note: "vp" is referenced and locked */
                    754:
                    755:        error = 0;              /* assume no error */
                    756:        switch(SCARG(uap, cmd)) {
                    757:
                    758:        case SWAP_DUMPDEV:
                    759:                if (vp->v_type != VBLK) {
                    760:                        error = ENOTBLK;
                    761:                        break;
                    762:                }
                    763:                dumpdev = vp->v_rdev;
                    764:                break;
                    765:
                    766:        case SWAP_CTL:
                    767:                /*
                    768:                 * get new priority, remove old entry (if any) and then
                    769:                 * reinsert it in the correct place.  finally, prune out
                    770:                 * any empty priority structures.
                    771:                 */
                    772:                priority = SCARG(uap, misc);
                    773:                spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
                    774:                simple_lock(&uvm.swap_data_lock);
                    775:                if ((sdp = swaplist_find(vp, 1)) == NULL) {
                    776:                        error = ENOENT;
                    777:                } else {
                    778:                        swaplist_insert(sdp, spp, priority);
                    779:                        swaplist_trim();
                    780:                }
                    781:                simple_unlock(&uvm.swap_data_lock);
                    782:                if (error)
                    783:                        free(spp, M_VMSWAP);
                    784:                break;
                    785:
                    786:        case SWAP_ON:
                    787:
                    788:                /*
                    789:                 * check for duplicates.   if none found, then insert a
                    790:                 * dummy entry on the list to prevent someone else from
                    791:                 * trying to enable this device while we are working on
                    792:                 * it.
                    793:                 */
                    794:
                    795:                priority = SCARG(uap, misc);
                    796:                simple_lock(&uvm.swap_data_lock);
                    797:                if ((sdp = swaplist_find(vp, 0)) != NULL) {
                    798:                        error = EBUSY;
                    799:                        simple_unlock(&uvm.swap_data_lock);
                    800:                        break;
                    801:                }
                    802:                sdp = malloc(sizeof *sdp, M_VMSWAP, M_WAITOK);
                    803:                spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
                    804:                memset(sdp, 0, sizeof(*sdp));
                    805:                sdp->swd_flags = SWF_FAKE;      /* placeholder only */
                    806:                sdp->swd_vp = vp;
                    807:                sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
                    808:
                    809:                /*
                    810:                 * XXX Is NFS elaboration necessary?
                    811:                 */
                    812:                if (vp->v_type == VREG) {
                    813:                        sdp->swd_cred = crdup(p->p_ucred);
                    814:                }
                    815:
                    816:                swaplist_insert(sdp, spp, priority);
                    817:                simple_unlock(&uvm.swap_data_lock);
                    818:
                    819:                sdp->swd_pathlen = len;
                    820:                sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK);
                    821:                if (copystr(userpath, sdp->swd_path, sdp->swd_pathlen, 0) != 0)
                    822:                        panic("swapctl: copystr");
                    823:
                    824:                /*
                    825:                 * we've now got a FAKE placeholder in the swap list.
                    826:                 * now attempt to enable swap on it.  if we fail, undo
                    827:                 * what we've done and kill the fake entry we just inserted.
                    828:                 * if swap_on is a success, it will clear the SWF_FAKE flag
                    829:                 */
                    830:
                    831:                if ((error = swap_on(p, sdp)) != 0) {
                    832:                        simple_lock(&uvm.swap_data_lock);
                    833:                        (void) swaplist_find(vp, 1);  /* kill fake entry */
                    834:                        swaplist_trim();
                    835:                        simple_unlock(&uvm.swap_data_lock);
                    836:                        if (vp->v_type == VREG) {
                    837:                                crfree(sdp->swd_cred);
                    838:                        }
                    839:                        free(sdp->swd_path, M_VMSWAP);
                    840:                        free(sdp, M_VMSWAP);
                    841:                        break;
                    842:                }
                    843:                break;
                    844:
                    845:        case SWAP_OFF:
                    846:                simple_lock(&uvm.swap_data_lock);
                    847:                if ((sdp = swaplist_find(vp, 0)) == NULL) {
                    848:                        simple_unlock(&uvm.swap_data_lock);
                    849:                        error = ENXIO;
                    850:                        break;
                    851:                }
                    852:
                    853:                /*
                    854:                 * If a device isn't in use or enabled, we
                    855:                 * can't stop swapping from it (again).
                    856:                 */
                    857:                if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
                    858:                        simple_unlock(&uvm.swap_data_lock);
                    859:                        error = EBUSY;
                    860:                        break;
                    861:                }
                    862:
                    863:                /*
                    864:                 * do the real work.
                    865:                 */
                    866:                error = swap_off(p, sdp);
                    867:                break;
                    868:
                    869:        default:
                    870:                error = EINVAL;
                    871:        }
                    872:
                    873:        /*
                    874:         * done!  release the ref gained by namei() and unlock.
                    875:         */
                    876:        vput(vp);
                    877:
                    878: out:
                    879:        rw_exit_write(&swap_syscall_lock);
                    880:
                    881:        UVMHIST_LOG(pdhist, "<- done!  error=%ld", error, 0, 0, 0);
                    882:        return (error);
                    883: }
                    884:
                    885: /*
                    886:  * swap_on: attempt to enable a swapdev for swapping.   note that the
                    887:  *     swapdev is already on the global list, but disabled (marked
                    888:  *     SWF_FAKE).
                    889:  *
                    890:  * => we avoid the start of the disk (to protect disk labels)
                    891:  * => we also avoid the miniroot, if we are swapping to root.
                    892:  * => caller should leave uvm.swap_data_lock unlocked, we may lock it
                    893:  *     if needed.
                    894:  */
                    895: static int
                    896: swap_on(p, sdp)
                    897:        struct proc *p;
                    898:        struct swapdev *sdp;
                    899: {
                    900:        static int count = 0;   /* static */
                    901:        struct vnode *vp;
                    902:        int error, npages, nblocks, size;
                    903:        long addr;
                    904:        struct vattr va;
                    905: #if defined(NFSCLIENT)
                    906:        extern int (**nfsv2_vnodeop_p)(void *);
                    907: #endif /* defined(NFSCLIENT) */
                    908:        dev_t dev;
                    909:        UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist);
                    910:
                    911:        /*
                    912:         * we want to enable swapping on sdp.   the swd_vp contains
                    913:         * the vnode we want (locked and ref'd), and the swd_dev
                    914:         * contains the dev_t of the file, if it a block device.
                    915:         */
                    916:
                    917:        vp = sdp->swd_vp;
                    918:        dev = sdp->swd_dev;
                    919:
                    920:        /*
                    921:         * open the swap file (mostly useful for block device files to
                    922:         * let device driver know what is up).
                    923:         *
                    924:         * we skip the open/close for root on swap because the root
                    925:         * has already been opened when root was mounted (mountroot).
                    926:         */
                    927:        if (vp != rootvp) {
                    928:                if ((error = VOP_OPEN(vp, FREAD|FWRITE, p->p_ucred, p)))
                    929:                        return (error);
                    930:        }
                    931:
                    932:        /* XXX this only works for block devices */
                    933:        UVMHIST_LOG(pdhist, "  dev=%ld, major(dev)=%ld", dev, major(dev), 0,0);
                    934:
                    935:        /*
                    936:         * we now need to determine the size of the swap area.   for
                    937:         * block specials we can call the d_psize function.
                    938:         * for normal files, we must stat [get attrs].
                    939:         *
                    940:         * we put the result in nblks.
                    941:         * for normal files, we also want the filesystem block size
                    942:         * (which we get with statfs).
                    943:         */
                    944:        switch (vp->v_type) {
                    945:        case VBLK:
                    946:                if (bdevsw[major(dev)].d_psize == 0 ||
                    947:                    (nblocks = (*bdevsw[major(dev)].d_psize)(dev)) == -1) {
                    948:                        error = ENXIO;
                    949:                        goto bad;
                    950:                }
                    951:                break;
                    952:
                    953:        case VREG:
                    954:                if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)))
                    955:                        goto bad;
                    956:                nblocks = (int)btodb(va.va_size);
                    957:                if ((error =
                    958:                     VFS_STATFS(vp->v_mount, &vp->v_mount->mnt_stat, p)) != 0)
                    959:                        goto bad;
                    960:
                    961:                sdp->swd_bsize = vp->v_mount->mnt_stat.f_iosize;
                    962:                /*
                    963:                 * limit the max # of outstanding I/O requests we issue
                    964:                 * at any one time.   take it easy on NFS servers.
                    965:                 */
                    966: #if defined(NFSCLIENT)
                    967:                if (vp->v_op == nfsv2_vnodeop_p)
                    968:                        sdp->swd_maxactive = 2; /* XXX */
                    969:                else
                    970: #endif /* defined(NFSCLIENT) */
                    971:                        sdp->swd_maxactive = 8; /* XXX */
                    972:                break;
                    973:
                    974:        default:
                    975:                error = ENXIO;
                    976:                goto bad;
                    977:        }
                    978:
                    979:        /*
                    980:         * save nblocks in a safe place and convert to pages.
                    981:         */
                    982:
                    983:        sdp->swd_nblks = nblocks;
                    984:        npages = dbtob((u_int64_t)nblocks) >> PAGE_SHIFT;
                    985:
                    986:        /*
                    987:         * for block special files, we want to make sure that leave
                    988:         * the disklabel and bootblocks alone, so we arrange to skip
                    989:         * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
                    990:         * note that because of this the "size" can be less than the
                    991:         * actual number of blocks on the device.
                    992:         */
                    993:        if (vp->v_type == VBLK) {
                    994:                /* we use pages 1 to (size - 1) [inclusive] */
                    995:                size = npages - 1;
                    996:                addr = 1;
                    997:        } else {
                    998:                /* we use pages 0 to (size - 1) [inclusive] */
                    999:                size = npages;
                   1000:                addr = 0;
                   1001:        }
                   1002:
                   1003:        /*
                   1004:         * make sure we have enough blocks for a reasonable sized swap
                   1005:         * area.   we want at least one page.
                   1006:         */
                   1007:
                   1008:        if (size < 1) {
                   1009:                UVMHIST_LOG(pdhist, "  size <= 1!!", 0, 0, 0, 0);
                   1010:                error = EINVAL;
                   1011:                goto bad;
                   1012:        }
                   1013:
                   1014:        UVMHIST_LOG(pdhist, "  dev=%lx: size=%ld addr=0x%lx\n",
                   1015:            dev, size, addr, 0);
                   1016:
                   1017:        /*
                   1018:         * now we need to allocate an extent to manage this swap device
                   1019:         */
                   1020:        snprintf(sdp->swd_exname, sizeof(sdp->swd_exname), "swap0x%04x",
                   1021:            count++);
                   1022:
                   1023:        /* note that extent_create's 3rd arg is inclusive, thus "- 1" */
                   1024:        sdp->swd_ex = extent_create(sdp->swd_exname, 0, npages - 1, M_VMSWAP,
                   1025:                                    0, 0, EX_WAITOK);
                   1026:        /* allocate the `saved' region from the extent so it won't be used */
                   1027:        if (addr) {
                   1028:                if (extent_alloc_region(sdp->swd_ex, 0, addr, EX_WAITOK))
                   1029:                        panic("disklabel region");
                   1030:        }
                   1031:
                   1032:        /*
                   1033:         * if the vnode we are swapping to is the root vnode
                   1034:         * (i.e. we are swapping to the miniroot) then we want
                   1035:         * to make sure we don't overwrite it.   do a statfs to
                   1036:         * find its size and skip over it.
                   1037:         */
                   1038:        if (vp == rootvp) {
                   1039:                struct mount *mp;
                   1040:                struct statfs *sp;
                   1041:                int rootblocks, rootpages;
                   1042:
                   1043:                mp = rootvnode->v_mount;
                   1044:                sp = &mp->mnt_stat;
                   1045:                rootblocks = sp->f_blocks * btodb(sp->f_bsize);
                   1046:                rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
                   1047:                if (rootpages > size)
                   1048:                        panic("swap_on: miniroot larger than swap?");
                   1049:
                   1050:                if (extent_alloc_region(sdp->swd_ex, addr,
                   1051:                                        rootpages, EX_WAITOK))
                   1052:                        panic("swap_on: unable to preserve miniroot");
                   1053:
                   1054:                size -= rootpages;
                   1055:                printf("Preserved %d pages of miniroot ", rootpages);
                   1056:                printf("leaving %d pages of swap\n", size);
                   1057:        }
                   1058:
                   1059:        /*
                   1060:         * add a ref to vp to reflect usage as a swap device.
                   1061:         */
                   1062:        vref(vp);
                   1063:
                   1064: #ifdef UVM_SWAP_ENCRYPT
                   1065:        if (uvm_doswapencrypt)
                   1066:                uvm_swap_initcrypt(sdp, npages);
                   1067: #endif
                   1068:        /*
                   1069:         * now add the new swapdev to the drum and enable.
                   1070:         */
                   1071:        simple_lock(&uvm.swap_data_lock);
                   1072:        swapdrum_add(sdp, npages);
                   1073:        sdp->swd_npages = size;
                   1074:        sdp->swd_flags &= ~SWF_FAKE;    /* going live */
                   1075:        sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
                   1076:        uvmexp.swpages += size;
                   1077:        simple_unlock(&uvm.swap_data_lock);
                   1078:        return (0);
                   1079:
                   1080: bad:
                   1081:        /*
                   1082:         * failure: close device if necessary and return error.
                   1083:         */
                   1084:        if (vp != rootvp)
                   1085:                (void)VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p);
                   1086:        return (error);
                   1087: }
                   1088:
                   1089: /*
                   1090:  * swap_off: stop swapping on swapdev
                   1091:  *
                   1092:  * => swap data should be locked, we will unlock.
                   1093:  */
                   1094: static int
                   1095: swap_off(p, sdp)
                   1096:        struct proc *p;
                   1097:        struct swapdev *sdp;
                   1098: {
                   1099:        int error;
                   1100:        UVMHIST_FUNC("swap_off"); UVMHIST_CALLED(pdhist);
                   1101:        UVMHIST_LOG(pdhist, "  dev=%lx", sdp->swd_dev,0,0,0);
                   1102:
                   1103:        /* disable the swap area being removed */
                   1104:        sdp->swd_flags &= ~SWF_ENABLE;
                   1105:        simple_unlock(&uvm.swap_data_lock);
                   1106:
                   1107:        /*
                   1108:         * the idea is to find all the pages that are paged out to this
                   1109:         * device, and page them all in.  in uvm, swap-backed pageable
                   1110:         * memory can take two forms: aobjs and anons.  call the
                   1111:         * swapoff hook for each subsystem to bring in pages.
                   1112:         */
                   1113:
                   1114:        if (uao_swap_off(sdp->swd_drumoffset,
                   1115:                         sdp->swd_drumoffset + sdp->swd_drumsize) ||
                   1116:            amap_swap_off(sdp->swd_drumoffset,
                   1117:                          sdp->swd_drumoffset + sdp->swd_drumsize)) {
                   1118:
                   1119:                error = ENOMEM;
                   1120:        } else if (sdp->swd_npginuse > sdp->swd_npgbad) {
                   1121:                error = EBUSY;
                   1122:        }
                   1123:
                   1124:        if (error) {
                   1125:                simple_lock(&uvm.swap_data_lock);
                   1126:                sdp->swd_flags |= SWF_ENABLE;
                   1127:                simple_unlock(&uvm.swap_data_lock);
                   1128:                return (error);
                   1129:        }
                   1130:
                   1131:        /*
                   1132:         * done with the vnode and saved creds.
                   1133:         * drop our ref on the vnode before calling VOP_CLOSE()
                   1134:         * so that spec_close() can tell if this is the last close.
                   1135:         */
                   1136:        if (sdp->swd_vp->v_type == VREG) {
                   1137:                crfree(sdp->swd_cred);
                   1138:        }
                   1139:        vrele(sdp->swd_vp);
                   1140:        if (sdp->swd_vp != rootvp) {
                   1141:                (void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, p->p_ucred, p);
                   1142:        }
                   1143:
                   1144:        simple_lock(&uvm.swap_data_lock);
                   1145:        uvmexp.swpages -= sdp->swd_npages;
                   1146:
                   1147:        if (swaplist_find(sdp->swd_vp, 1) == NULL)
                   1148:                panic("swap_off: swapdev not in list");
                   1149:        swaplist_trim();
                   1150:
                   1151:        /*
                   1152:         * free all resources!
                   1153:         */
                   1154:        extent_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize,
                   1155:                    EX_WAITOK);
                   1156:        extent_destroy(sdp->swd_ex);
                   1157:        free(sdp, M_VMSWAP);
                   1158:        simple_unlock(&uvm.swap_data_lock);
                   1159:        return (0);
                   1160: }
                   1161:
                   1162: /*
                   1163:  * /dev/drum interface and i/o functions
                   1164:  */
                   1165:
                   1166: /*
                   1167:  * swread: the read function for the drum (just a call to physio)
                   1168:  */
                   1169: /*ARGSUSED*/
                   1170: int
                   1171: swread(dev, uio, ioflag)
                   1172:        dev_t dev;
                   1173:        struct uio *uio;
                   1174:        int ioflag;
                   1175: {
                   1176:        UVMHIST_FUNC("swread"); UVMHIST_CALLED(pdhist);
                   1177:
                   1178:        UVMHIST_LOG(pdhist, "  dev=%lx offset=%lx",
                   1179:            dev, (u_long)uio->uio_offset, 0, 0);
                   1180:        return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
                   1181: }
                   1182:
                   1183: /*
                   1184:  * swwrite: the write function for the drum (just a call to physio)
                   1185:  */
                   1186: /*ARGSUSED*/
                   1187: int
                   1188: swwrite(dev, uio, ioflag)
                   1189:        dev_t dev;
                   1190:        struct uio *uio;
                   1191:        int ioflag;
                   1192: {
                   1193:        UVMHIST_FUNC("swwrite"); UVMHIST_CALLED(pdhist);
                   1194:
                   1195:        UVMHIST_LOG(pdhist, "  dev=%lx offset=%lx",
                   1196:            dev, (u_long)uio->uio_offset, 0, 0);
                   1197:        return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
                   1198: }
                   1199:
                   1200: /*
                   1201:  * swstrategy: perform I/O on the drum
                   1202:  *
                   1203:  * => we must map the i/o request from the drum to the correct swapdev.
                   1204:  */
                   1205: void
                   1206: swstrategy(bp)
                   1207:        struct buf *bp;
                   1208: {
                   1209:        struct swapdev *sdp;
                   1210:        int s, pageno, bn;
                   1211:        UVMHIST_FUNC("swstrategy"); UVMHIST_CALLED(pdhist);
                   1212:
                   1213:        /*
                   1214:         * convert block number to swapdev.   note that swapdev can't
                   1215:         * be yanked out from under us because we are holding resources
                   1216:         * in it (i.e. the blocks we are doing I/O on).
                   1217:         */
                   1218:        pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT;
                   1219:        simple_lock(&uvm.swap_data_lock);
                   1220:        sdp = swapdrum_getsdp(pageno);
                   1221:        simple_unlock(&uvm.swap_data_lock);
                   1222:        if (sdp == NULL) {
                   1223:                bp->b_error = EINVAL;
                   1224:                bp->b_flags |= B_ERROR;
                   1225:                s = splbio();
                   1226:                biodone(bp);
                   1227:                splx(s);
                   1228:                UVMHIST_LOG(pdhist, "  failed to get swap device", 0, 0, 0, 0);
                   1229:                return;
                   1230:        }
                   1231:
                   1232:        /*
                   1233:         * convert drum page number to block number on this swapdev.
                   1234:         */
                   1235:
                   1236:        pageno -= sdp->swd_drumoffset;  /* page # on swapdev */
                   1237:        bn = btodb((u_int64_t)pageno << PAGE_SHIFT); /* convert to diskblock */
                   1238:
                   1239:        UVMHIST_LOG(pdhist, "  %s: mapoff=%lx bn=0x%lx bcount=%ld",
                   1240:                ((bp->b_flags & B_READ) == 0) ? "write" : "read",
                   1241:                sdp->swd_drumoffset, bn, bp->b_bcount);
                   1242:
                   1243:        /*
                   1244:         * for block devices we finish up here.
                   1245:         * for regular files we have to do more work which we delegate
                   1246:         * to sw_reg_strategy().
                   1247:         */
                   1248:
                   1249:        switch (sdp->swd_vp->v_type) {
                   1250:        default:
                   1251:                panic("swstrategy: vnode type 0x%x", sdp->swd_vp->v_type);
                   1252:
                   1253:        case VBLK:
                   1254:
                   1255:                /*
                   1256:                 * must convert "bp" from an I/O on /dev/drum to an I/O
                   1257:                 * on the swapdev (sdp).
                   1258:                 */
                   1259:                s = splbio();
                   1260:                buf_replacevnode(bp, sdp->swd_vp);
                   1261:
                   1262:                bp->b_blkno = bn;
                   1263:                splx(s);
                   1264:                VOP_STRATEGY(bp);
                   1265:                return;
                   1266:
                   1267:        case VREG:
                   1268:                /*
                   1269:                 * delegate to sw_reg_strategy function.
                   1270:                 */
                   1271:                sw_reg_strategy(sdp, bp, bn);
                   1272:                return;
                   1273:        }
                   1274:        /* NOTREACHED */
                   1275: }
                   1276:
                   1277: /*
                   1278:  * sw_reg_strategy: handle swap i/o to regular files
                   1279:  */
                   1280: static void
                   1281: sw_reg_strategy(sdp, bp, bn)
                   1282:        struct swapdev  *sdp;
                   1283:        struct buf      *bp;
                   1284:        int             bn;
                   1285: {
                   1286:        struct vnode    *vp;
                   1287:        struct vndxfer  *vnx;
                   1288:        daddr64_t       nbn;
                   1289:        caddr_t         addr;
                   1290:        off_t           byteoff;
                   1291:        int             s, off, nra, error, sz, resid;
                   1292:        UVMHIST_FUNC("sw_reg_strategy"); UVMHIST_CALLED(pdhist);
                   1293:
                   1294:        /*
                   1295:         * allocate a vndxfer head for this transfer and point it to
                   1296:         * our buffer.
                   1297:         */
                   1298:        getvndxfer(vnx);
                   1299:        vnx->vx_flags = VX_BUSY;
                   1300:        vnx->vx_error = 0;
                   1301:        vnx->vx_pending = 0;
                   1302:        vnx->vx_bp = bp;
                   1303:        vnx->vx_sdp = sdp;
                   1304:
                   1305:        /*
                   1306:         * setup for main loop where we read filesystem blocks into
                   1307:         * our buffer.
                   1308:         */
                   1309:        error = 0;
                   1310:        bp->b_resid = bp->b_bcount;     /* nothing transferred yet! */
                   1311:        addr = bp->b_data;              /* current position in buffer */
                   1312:        byteoff = dbtob((u_int64_t)bn);
                   1313:
                   1314:        for (resid = bp->b_resid; resid; resid -= sz) {
                   1315:                struct vndbuf   *nbp;
                   1316:
                   1317:                /*
                   1318:                 * translate byteoffset into block number.  return values:
                   1319:                 *   vp = vnode of underlying device
                   1320:                 *  nbn = new block number (on underlying vnode dev)
                   1321:                 *  nra = num blocks we can read-ahead (excludes requested
                   1322:                 *      block)
                   1323:                 */
                   1324:                nra = 0;
                   1325:                error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
                   1326:                                        &vp, &nbn, &nra);
                   1327:
                   1328:                if (error == 0 && nbn == (daddr64_t)-1) {
                   1329:                        /*
                   1330:                         * this used to just set error, but that doesn't
                   1331:                         * do the right thing.  Instead, it causes random
                   1332:                         * memory errors.  The panic() should remain until
                   1333:                         * this condition doesn't destabilize the system.
                   1334:                         */
                   1335: #if 1
                   1336:                        panic("sw_reg_strategy: swap to sparse file");
                   1337: #else
                   1338:                        error = EIO;    /* failure */
                   1339: #endif
                   1340:                }
                   1341:
                   1342:                /*
                   1343:                 * punt if there was an error or a hole in the file.
                   1344:                 * we must wait for any i/o ops we have already started
                   1345:                 * to finish before returning.
                   1346:                 *
                   1347:                 * XXX we could deal with holes here but it would be
                   1348:                 * a hassle (in the write case).
                   1349:                 */
                   1350:                if (error) {
                   1351:                        s = splbio();
                   1352:                        vnx->vx_error = error;  /* pass error up */
                   1353:                        goto out;
                   1354:                }
                   1355:
                   1356:                /*
                   1357:                 * compute the size ("sz") of this transfer (in bytes).
                   1358:                 */
                   1359:                off = byteoff % sdp->swd_bsize;
                   1360:                sz = (1 + nra) * sdp->swd_bsize - off;
                   1361:                if (sz > resid)
                   1362:                        sz = resid;
                   1363:
                   1364:                UVMHIST_LOG(pdhist, "sw_reg_strategy: "
                   1365:                            "vp %p/%p offset 0x%lx/0x%llx",
                   1366:                            sdp->swd_vp, vp, (u_long)byteoff, nbn);
                   1367:
                   1368:                /*
                   1369:                 * now get a buf structure.   note that the vb_buf is
                   1370:                 * at the front of the nbp structure so that you can
                   1371:                 * cast pointers between the two structure easily.
                   1372:                 */
                   1373:                getvndbuf(nbp);
                   1374:                nbp->vb_buf.b_flags    = bp->b_flags | B_CALL;
                   1375:                nbp->vb_buf.b_bcount   = sz;
                   1376:                nbp->vb_buf.b_bufsize  = sz;
                   1377:                nbp->vb_buf.b_error    = 0;
                   1378:                nbp->vb_buf.b_data     = addr;
                   1379:                nbp->vb_buf.b_blkno    = nbn + btodb(off);
                   1380:                nbp->vb_buf.b_proc     = bp->b_proc;
                   1381:                nbp->vb_buf.b_iodone   = sw_reg_iodone;
                   1382:                nbp->vb_buf.b_vp       = NULLVP;
                   1383:                nbp->vb_buf.b_vnbufs.le_next = NOLIST;
                   1384:                LIST_INIT(&nbp->vb_buf.b_dep);
                   1385:
                   1386:                /*
                   1387:                 * set b_dirtyoff/end and b_validoff/end.   this is
                   1388:                 * required by the NFS client code (otherwise it will
                   1389:                 * just discard our I/O request).
                   1390:                 */
                   1391:                if (bp->b_dirtyend == 0) {
                   1392:                        nbp->vb_buf.b_dirtyoff = 0;
                   1393:                        nbp->vb_buf.b_dirtyend = sz;
                   1394:                } else {
                   1395:                        nbp->vb_buf.b_dirtyoff =
                   1396:                            max(0, bp->b_dirtyoff - (bp->b_bcount-resid));
                   1397:                        nbp->vb_buf.b_dirtyend =
                   1398:                            min(sz,
                   1399:                                max(0, bp->b_dirtyend - (bp->b_bcount-resid)));
                   1400:                }
                   1401:                if (bp->b_validend == 0) {
                   1402:                        nbp->vb_buf.b_validoff = 0;
                   1403:                        nbp->vb_buf.b_validend = sz;
                   1404:                } else {
                   1405:                        nbp->vb_buf.b_validoff =
                   1406:                            max(0, bp->b_validoff - (bp->b_bcount-resid));
                   1407:                        nbp->vb_buf.b_validend =
                   1408:                            min(sz,
                   1409:                                max(0, bp->b_validend - (bp->b_bcount-resid)));
                   1410:                }
                   1411:
                   1412:                nbp->vb_xfer = vnx;     /* patch it back in to vnx */
                   1413:
                   1414:                /*
                   1415:                 * Just sort by block number
                   1416:                 */
                   1417:                nbp->vb_buf.b_cylinder = nbp->vb_buf.b_blkno;
                   1418:                s = splbio();
                   1419:                if (vnx->vx_error != 0) {
                   1420:                        putvndbuf(nbp);
                   1421:                        goto out;
                   1422:                }
                   1423:                vnx->vx_pending++;
                   1424:
                   1425:                /* assoc new buffer with underlying vnode */
                   1426:                bgetvp(vp, &nbp->vb_buf);
                   1427:
                   1428:                /* sort it in and start I/O if we are not over our limit */
                   1429:                disksort(&sdp->swd_tab, &nbp->vb_buf);
                   1430:                sw_reg_start(sdp);
                   1431:                splx(s);
                   1432:
                   1433:                /*
                   1434:                 * advance to the next I/O
                   1435:                 */
                   1436:                byteoff += sz;
                   1437:                addr += sz;
                   1438:        }
                   1439:
                   1440:        s = splbio();
                   1441:
                   1442: out: /* Arrive here at splbio */
                   1443:        vnx->vx_flags &= ~VX_BUSY;
                   1444:        if (vnx->vx_pending == 0) {
                   1445:                if (vnx->vx_error != 0) {
                   1446:                        bp->b_error = vnx->vx_error;
                   1447:                        bp->b_flags |= B_ERROR;
                   1448:                }
                   1449:                putvndxfer(vnx);
                   1450:                biodone(bp);
                   1451:        }
                   1452:        splx(s);
                   1453: }
                   1454:
                   1455: /*
                   1456:  * sw_reg_start: start an I/O request on the requested swapdev
                   1457:  *
                   1458:  * => reqs are sorted by disksort (above)
                   1459:  */
                   1460: static void
                   1461: sw_reg_start(sdp)
                   1462:        struct swapdev  *sdp;
                   1463: {
                   1464:        struct buf      *bp;
                   1465:        UVMHIST_FUNC("sw_reg_start"); UVMHIST_CALLED(pdhist);
                   1466:
                   1467:        /* recursion control */
                   1468:        if ((sdp->swd_flags & SWF_BUSY) != 0)
                   1469:                return;
                   1470:
                   1471:        sdp->swd_flags |= SWF_BUSY;
                   1472:
                   1473:        while (sdp->swd_tab.b_active < sdp->swd_maxactive) {
                   1474:                bp = sdp->swd_tab.b_actf;
                   1475:                if (bp == NULL)
                   1476:                        break;
                   1477:                sdp->swd_tab.b_actf = bp->b_actf;
                   1478:                sdp->swd_tab.b_active++;
                   1479:
                   1480:                UVMHIST_LOG(pdhist,
                   1481:                    "sw_reg_start:  bp %p vp %p blkno 0x%lx cnt 0x%lx",
                   1482:                    bp, bp->b_vp, bp->b_blkno, bp->b_bcount);
                   1483:                if ((bp->b_flags & B_READ) == 0)
                   1484:                        bp->b_vp->v_numoutput++;
                   1485:
                   1486:                VOP_STRATEGY(bp);
                   1487:        }
                   1488:        sdp->swd_flags &= ~SWF_BUSY;
                   1489: }
                   1490:
                   1491: /*
                   1492:  * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
                   1493:  *
                   1494:  * => note that we can recover the vndbuf struct by casting the buf ptr
                   1495:  */
                   1496: static void
                   1497: sw_reg_iodone(bp)
                   1498:        struct buf *bp;
                   1499: {
                   1500:        struct vndbuf *vbp = (struct vndbuf *) bp;
                   1501:        struct vndxfer *vnx = vbp->vb_xfer;
                   1502:        struct buf *pbp = vnx->vx_bp;           /* parent buffer */
                   1503:        struct swapdev  *sdp = vnx->vx_sdp;
                   1504:        int resid;
                   1505:        UVMHIST_FUNC("sw_reg_iodone"); UVMHIST_CALLED(pdhist);
                   1506:
                   1507:        UVMHIST_LOG(pdhist, "  vbp=%p vp=%p blkno=0x%lx addr=%p",
                   1508:            vbp, vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, vbp->vb_buf.b_data);
                   1509:        UVMHIST_LOG(pdhist, "  cnt=%lx resid=%lx",
                   1510:            vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);
                   1511:
                   1512:        splassert(IPL_BIO);
                   1513:
                   1514:        resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
                   1515:        pbp->b_resid -= resid;
                   1516:        vnx->vx_pending--;
                   1517:
                   1518:        if (vbp->vb_buf.b_error) {
                   1519:                UVMHIST_LOG(pdhist, "  got error=%ld !",
                   1520:                    vbp->vb_buf.b_error, 0, 0, 0);
                   1521:
                   1522:                /* pass error upward */
                   1523:                vnx->vx_error = vbp->vb_buf.b_error;
                   1524:        }
                   1525:
                   1526:        /*
                   1527:         * disassociate this buffer from the vnode (if any).
                   1528:         */
                   1529:        if (vbp->vb_buf.b_vp != NULL) {
                   1530:                brelvp(&vbp->vb_buf);
                   1531:        }
                   1532:
                   1533:        /*
                   1534:         * kill vbp structure
                   1535:         */
                   1536:        putvndbuf(vbp);
                   1537:
                   1538:        /*
                   1539:         * wrap up this transaction if it has run to completion or, in
                   1540:         * case of an error, when all auxiliary buffers have returned.
                   1541:         */
                   1542:        if (vnx->vx_error != 0) {
                   1543:                /* pass error upward */
                   1544:                pbp->b_flags |= B_ERROR;
                   1545:                pbp->b_error = vnx->vx_error;
                   1546:                if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
                   1547:                        putvndxfer(vnx);
                   1548:                        biodone(pbp);
                   1549:                }
                   1550:        } else if (pbp->b_resid == 0) {
                   1551:                KASSERT(vnx->vx_pending == 0);
                   1552:                if ((vnx->vx_flags & VX_BUSY) == 0) {
                   1553:                        UVMHIST_LOG(pdhist, "  iodone error=%ld !",
                   1554:                            pbp, vnx->vx_error, 0, 0);
                   1555:                        putvndxfer(vnx);
                   1556:                        biodone(pbp);
                   1557:                }
                   1558:        }
                   1559:
                   1560:        /*
                   1561:         * done!   start next swapdev I/O if one is pending
                   1562:         */
                   1563:        sdp->swd_tab.b_active--;
                   1564:        sw_reg_start(sdp);
                   1565: }
                   1566:
                   1567:
                   1568: /*
                   1569:  * uvm_swap_alloc: allocate space on swap
                   1570:  *
                   1571:  * => allocation is done "round robin" down the priority list, as we
                   1572:  *     allocate in a priority we "rotate" the circle queue.
                   1573:  * => space can be freed with uvm_swap_free
                   1574:  * => we return the page slot number in /dev/drum (0 == invalid slot)
                   1575:  * => we lock uvm.swap_data_lock
                   1576:  * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
                   1577:  */
                   1578: int
                   1579: uvm_swap_alloc(nslots, lessok)
                   1580:        int *nslots;    /* IN/OUT */
                   1581:        boolean_t lessok;
                   1582: {
                   1583:        struct swapdev *sdp;
                   1584:        struct swappri *spp;
                   1585:        u_long  result;
                   1586:        UVMHIST_FUNC("uvm_swap_alloc"); UVMHIST_CALLED(pdhist);
                   1587:
                   1588:        /*
                   1589:         * no swap devices configured yet?   definite failure.
                   1590:         */
                   1591:        if (uvmexp.nswapdev < 1)
                   1592:                return 0;
                   1593:
                   1594:        /*
                   1595:         * lock data lock, convert slots into blocks, and enter loop
                   1596:         */
                   1597:        simple_lock(&uvm.swap_data_lock);
                   1598:
                   1599: ReTry: /* XXXMRG */
                   1600:        for (spp = LIST_FIRST(&swap_priority); spp != NULL;
                   1601:             spp = LIST_NEXT(spp, spi_swappri)) {
                   1602:                for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
                   1603:                     sdp != (void *)&spp->spi_swapdev;
                   1604:                     sdp = CIRCLEQ_NEXT(sdp,swd_next)) {
                   1605:                        /* if it's not enabled, then we can't swap from it */
                   1606:                        if ((sdp->swd_flags & SWF_ENABLE) == 0)
                   1607:                                continue;
                   1608:                        if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
                   1609:                                continue;
                   1610:                        if (extent_alloc(sdp->swd_ex, *nslots, EX_NOALIGN, 0,
                   1611:                                         EX_NOBOUNDARY, EX_MALLOCOK|EX_NOWAIT,
                   1612:                                         &result) != 0) {
                   1613:                                continue;
                   1614:                        }
                   1615:
                   1616:                        /*
                   1617:                         * successful allocation!  now rotate the circleq.
                   1618:                         */
                   1619:                        CIRCLEQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
                   1620:                        CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
                   1621:                        sdp->swd_npginuse += *nslots;
                   1622:                        uvmexp.swpginuse += *nslots;
                   1623:                        simple_unlock(&uvm.swap_data_lock);
                   1624:                        /* done!  return drum slot number */
                   1625:                        UVMHIST_LOG(pdhist,
                   1626:                            "success!  returning %ld slots starting at %ld",
                   1627:                            *nslots, result + sdp->swd_drumoffset, 0, 0);
                   1628:                        return(result + sdp->swd_drumoffset);
                   1629:                }
                   1630:        }
                   1631:
                   1632:        /* XXXMRG: BEGIN HACK */
                   1633:        if (*nslots > 1 && lessok) {
                   1634:                *nslots = 1;
                   1635:                goto ReTry;     /* XXXMRG: ugh!  extent should support this for us */
                   1636:        }
                   1637:        /* XXXMRG: END HACK */
                   1638:
                   1639:        simple_unlock(&uvm.swap_data_lock);
                   1640:        return 0;               /* failed */
                   1641: }
                   1642:
                   1643: /*
                   1644:  * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
                   1645:  *
                   1646:  * => we lock uvm.swap_data_lock
                   1647:  */
                   1648: void
                   1649: uvm_swap_markbad(startslot, nslots)
                   1650:        int startslot;
                   1651:        int nslots;
                   1652: {
                   1653:        struct swapdev *sdp;
                   1654:        UVMHIST_FUNC("uvm_swap_markbad"); UVMHIST_CALLED(pdhist);
                   1655:
                   1656:        simple_lock(&uvm.swap_data_lock);
                   1657:        sdp = swapdrum_getsdp(startslot);
                   1658:        if (sdp != NULL) {
                   1659:                /*
                   1660:                 * we just keep track of how many pages have been marked bad
                   1661:                 * in this device, to make everything add up in swap_off().
                   1662:                 * we assume here that the range of slots will all be within
                   1663:                 * one swap device.
                   1664:                 */
                   1665:                sdp->swd_npgbad += nslots;
                   1666:                UVMHIST_LOG(pdhist, "now %ld bad", sdp->swd_npgbad, 0,0,0);
                   1667:        }
                   1668:        simple_unlock(&uvm.swap_data_lock);
                   1669: }
                   1670:
                   1671: /*
                   1672:  * uvm_swap_free: free swap slots
                   1673:  *
                   1674:  * => this can be all or part of an allocation made by uvm_swap_alloc
                   1675:  * => we lock uvm.swap_data_lock
                   1676:  */
                   1677: void
                   1678: uvm_swap_free(startslot, nslots)
                   1679:        int startslot;
                   1680:        int nslots;
                   1681: {
                   1682:        struct swapdev *sdp;
                   1683:        UVMHIST_FUNC("uvm_swap_free"); UVMHIST_CALLED(pdhist);
                   1684:
                   1685:        UVMHIST_LOG(pdhist, "freeing %ld slots starting at %ld", nslots,
                   1686:            startslot, 0, 0);
                   1687:
                   1688:        /*
                   1689:         * ignore attempts to free the "bad" slot.
                   1690:         */
                   1691:
                   1692:        if (startslot == SWSLOT_BAD) {
                   1693:                return;
                   1694:        }
                   1695:
                   1696:        /*
                   1697:         * convert drum slot offset back to sdp, free the blocks
                   1698:         * in the extent, and return.   must hold pri lock to do
                   1699:         * lookup and access the extent.
                   1700:         */
                   1701:
                   1702:        simple_lock(&uvm.swap_data_lock);
                   1703:        sdp = swapdrum_getsdp(startslot);
                   1704:        KASSERT(uvmexp.nswapdev >= 1);
                   1705:        KASSERT(sdp != NULL);
                   1706:        KASSERT(sdp->swd_npginuse >= nslots);
                   1707:        if (extent_free(sdp->swd_ex, startslot - sdp->swd_drumoffset, nslots,
                   1708:                        EX_MALLOCOK|EX_NOWAIT) != 0) {
                   1709:                printf("warning: resource shortage: %d pages of swap lost\n",
                   1710:                        nslots);
                   1711:        }
                   1712:
                   1713:        sdp->swd_npginuse -= nslots;
                   1714:        uvmexp.swpginuse -= nslots;
                   1715: #ifdef UVM_SWAP_ENCRYPT
                   1716:        {
                   1717:                int i;
                   1718:                if (swap_encrypt_initialized) {
                   1719:                        /* Dereference keys */
                   1720:                        for (i = 0; i < nslots; i++)
                   1721:                                if (uvm_swap_needdecrypt(sdp, startslot + i))
                   1722:                                        SWAP_KEY_PUT(sdp, SWD_KEY(sdp, startslot + i));
                   1723:
                   1724:                        /* Mark range as not decrypt */
                   1725:                        uvm_swap_markdecrypt(sdp, startslot, nslots, 0);
                   1726:                }
                   1727:        }
                   1728: #endif /* UVM_SWAP_ENCRYPT */
                   1729:        simple_unlock(&uvm.swap_data_lock);
                   1730: }
                   1731:
                   1732: /*
                   1733:  * uvm_swap_put: put any number of pages into a contig place on swap
                   1734:  *
                   1735:  * => can be sync or async
                   1736:  * => XXXMRG: consider making it an inline or macro
                   1737:  */
                   1738: int
                   1739: uvm_swap_put(swslot, ppsp, npages, flags)
                   1740:        int swslot;
                   1741:        struct vm_page **ppsp;
                   1742:        int     npages;
                   1743:        int     flags;
                   1744: {
                   1745:        int     result;
                   1746:
                   1747:        result = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
                   1748:            ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
                   1749:
                   1750:        return (result);
                   1751: }
                   1752:
                   1753: /*
                   1754:  * uvm_swap_get: get a single page from swap
                   1755:  *
                   1756:  * => usually a sync op (from fault)
                   1757:  * => XXXMRG: consider making it an inline or macro
                   1758:  */
                   1759: int
                   1760: uvm_swap_get(page, swslot, flags)
                   1761:        struct vm_page *page;
                   1762:        int swslot, flags;
                   1763: {
                   1764:        int     result;
                   1765:
                   1766:        uvmexp.nswget++;
                   1767:        KASSERT(flags & PGO_SYNCIO);
                   1768:        if (swslot == SWSLOT_BAD) {
                   1769:                return VM_PAGER_ERROR;
                   1770:        }
                   1771:
                   1772:        /*
                   1773:         * this page is (about to be) no longer only in swap.
                   1774:         */
                   1775:        simple_lock(&uvm.swap_data_lock);
                   1776:        uvmexp.swpgonly--;
                   1777:        simple_unlock(&uvm.swap_data_lock);
                   1778:
                   1779:        result = uvm_swap_io(&page, swslot, 1, B_READ |
                   1780:            ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
                   1781:
                   1782:        if (result != VM_PAGER_OK && result != VM_PAGER_PEND) {
                   1783:                /*
                   1784:                 * oops, the read failed so it really is still only in swap.
                   1785:                 */
                   1786:                simple_lock(&uvm.swap_data_lock);
                   1787:                uvmexp.swpgonly++;
                   1788:                simple_unlock(&uvm.swap_data_lock);
                   1789:        }
                   1790:
                   1791:        return (result);
                   1792: }
                   1793:
                   1794: /*
                   1795:  * uvm_swap_io: do an i/o operation to swap
                   1796:  */
                   1797:
                   1798: static int
                   1799: uvm_swap_io(pps, startslot, npages, flags)
                   1800:        struct vm_page **pps;
                   1801:        int startslot, npages, flags;
                   1802: {
                   1803:        daddr64_t startblk;
                   1804:        struct  buf *bp;
                   1805:        vaddr_t kva;
                   1806:        int     result, s, mapinflags, pflag;
                   1807:        boolean_t write, async;
                   1808: #ifdef UVM_SWAP_ENCRYPT
                   1809:        vaddr_t dstkva;
                   1810:        struct vm_page *tpps[MAXBSIZE >> PAGE_SHIFT];
                   1811:        struct swapdev *sdp;
                   1812:        int     encrypt = 0;
                   1813: #endif
                   1814:        UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist);
                   1815:
                   1816:        UVMHIST_LOG(pdhist, "<- called, startslot=%ld, npages=%ld, flags=%ld",
                   1817:            startslot, npages, flags, 0);
                   1818:
                   1819:        write = (flags & B_READ) == 0;
                   1820:        async = (flags & B_ASYNC) != 0;
                   1821:
                   1822:        /*
                   1823:         * convert starting drum slot to block number
                   1824:         */
                   1825:        startblk = btodb((u_int64_t)startslot << PAGE_SHIFT);
                   1826:
                   1827:        /*
                   1828:         * first, map the pages into the kernel (XXX: currently required
                   1829:         * by buffer system).
                   1830:         */
                   1831:        mapinflags = !write ? UVMPAGER_MAPIN_READ : UVMPAGER_MAPIN_WRITE;
                   1832:        if (!async)
                   1833:                mapinflags |= UVMPAGER_MAPIN_WAITOK;
                   1834:        kva = uvm_pagermapin(pps, npages, mapinflags);
                   1835:        if (kva == 0)
                   1836:                return (VM_PAGER_AGAIN);
                   1837:
                   1838: #ifdef UVM_SWAP_ENCRYPT
                   1839:        if (write) {
                   1840:                /*
                   1841:                 * Check if we need to do swap encryption on old pages.
                   1842:                 * Later we need a different scheme, that swap encrypts
                   1843:                 * all pages of a process that had at least one page swap
                   1844:                 * encrypted.  Then we might not need to copy all pages
                   1845:                 * in the cluster, and avoid the memory overheard in
                   1846:                 * swapping.
                   1847:                 */
                   1848:                if (uvm_doswapencrypt)
                   1849:                        encrypt = 1;
                   1850:        }
                   1851:
                   1852:        if (swap_encrypt_initialized  || encrypt) {
                   1853:                /*
                   1854:                 * we need to know the swap device that we are swapping to/from
                   1855:                 * to see if the pages need to be marked for decryption or
                   1856:                 * actually need to be decrypted.
                   1857:                 * XXX - does this information stay the same over the whole
                   1858:                 * execution of this function?
                   1859:                 */
                   1860:                simple_lock(&uvm.swap_data_lock);
                   1861:                sdp = swapdrum_getsdp(startslot);
                   1862:                simple_unlock(&uvm.swap_data_lock);
                   1863:        }
                   1864:
                   1865:        /*
                   1866:         * encrypt to swap
                   1867:         */
                   1868:        if (write && encrypt) {
                   1869:                int i, opages;
                   1870:                caddr_t src, dst;
                   1871:                struct swap_key *key;
                   1872:                u_int64_t block;
                   1873:                int swmapflags;
                   1874:
                   1875:                /* We always need write access. */
                   1876:                swmapflags = UVMPAGER_MAPIN_READ;
                   1877:                if (!async)
                   1878:                        swmapflags |= UVMPAGER_MAPIN_WAITOK;
                   1879:
                   1880:                if (!uvm_swap_allocpages(tpps, npages)) {
                   1881:                        uvm_pagermapout(kva, npages);
                   1882:                        return (VM_PAGER_AGAIN);
                   1883:                }
                   1884:
                   1885:                dstkva = uvm_pagermapin(tpps, npages, swmapflags);
                   1886:                if (dstkva == 0) {
                   1887:                        uvm_pagermapout(kva, npages);
                   1888:                        uvm_swap_freepages(tpps, npages);
                   1889:                        return (VM_PAGER_AGAIN);
                   1890:                }
                   1891:
                   1892:                src = (caddr_t) kva;
                   1893:                dst = (caddr_t) dstkva;
                   1894:                block = startblk;
                   1895:                for (i = 0; i < npages; i++) {
                   1896:                        key = SWD_KEY(sdp, startslot + i);
                   1897:                        SWAP_KEY_GET(sdp, key); /* add reference */
                   1898:
                   1899:                        /* mark for async writes */
                   1900:                        atomic_setbits_int(&tpps[i]->pg_flags, PQ_ENCRYPT);
                   1901:                        swap_encrypt(key, src, dst, block, 1 << PAGE_SHIFT);
                   1902:                        src += 1 << PAGE_SHIFT;
                   1903:                        dst += 1 << PAGE_SHIFT;
                   1904:                        block += btodb(1 << PAGE_SHIFT);
                   1905:                }
                   1906:
                   1907:                uvm_pagermapout(kva, npages);
                   1908:
                   1909:                /* dispose of pages we dont use anymore */
                   1910:                opages = npages;
                   1911:                uvm_pager_dropcluster(NULL, NULL, pps, &opages,
                   1912:                                      PGO_PDFREECLUST);
                   1913:
                   1914:                kva = dstkva;
                   1915:        }
                   1916: #endif /* UVM_SWAP_ENCRYPT */
                   1917:
                   1918:        /*
                   1919:         * now allocate a buf for the i/o.
                   1920:         * [make sure we don't put the pagedaemon to sleep...]
                   1921:         */
                   1922:        s = splbio();
                   1923:        pflag = (async || curproc == uvm.pagedaemon_proc) ? 0 : PR_WAITOK;
                   1924:        bp = pool_get(&bufpool, pflag);
                   1925:        splx(s);
                   1926:
                   1927:        /*
                   1928:         * if we failed to get a swapbuf, return "try again"
                   1929:         */
                   1930:        if (bp == NULL) {
                   1931: #ifdef UVM_SWAP_ENCRYPT
                   1932:                if (write && encrypt) {
                   1933:                        int i;
                   1934:
                   1935:                        /* swap encrypt needs cleanup */
                   1936:                        for (i = 0; i < npages; i++)
                   1937:                                SWAP_KEY_PUT(sdp, SWD_KEY(sdp, startslot + i));
                   1938:
                   1939:                        uvm_pagermapout(kva, npages);
                   1940:                        uvm_swap_freepages(tpps, npages);
                   1941:                }
                   1942: #endif
                   1943:                return (VM_PAGER_AGAIN);
                   1944:        }
                   1945:
                   1946: #ifdef UVM_SWAP_ENCRYPT
                   1947:        /*
                   1948:         * prevent ASYNC reads.
                   1949:         * uvm_swap_io is only called from uvm_swap_get, uvm_swap_get
                   1950:         * assumes that all gets are SYNCIO.  Just make sure here.
                   1951:         * XXXARTUBC - might not be true anymore.
                   1952:         */
                   1953:        if (!write) {
                   1954:                flags &= ~B_ASYNC;
                   1955:                async = 0;
                   1956:        }
                   1957: #endif
                   1958:        /*
                   1959:         * fill in the bp.   we currently route our i/o through
                   1960:         * /dev/drum's vnode [swapdev_vp].
                   1961:         */
                   1962:        bp->b_flags = B_BUSY | B_NOCACHE | (flags & (B_READ|B_ASYNC));
                   1963:        bp->b_proc = &proc0;    /* XXX */
                   1964:        bp->b_vnbufs.le_next = NOLIST;
                   1965:        bp->b_data = (caddr_t)kva;
                   1966:        bp->b_blkno = startblk;
                   1967:        LIST_INIT(&bp->b_dep);
                   1968:        s = splbio();
                   1969:        bp->b_vp = NULL;
                   1970:        buf_replacevnode(bp, swapdev_vp);
                   1971:        splx(s);
                   1972:        bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT;
                   1973:
                   1974:        /*
                   1975:         * for pageouts we must set "dirtyoff" [NFS client code needs it].
                   1976:         * and we bump v_numoutput (counter of number of active outputs).
                   1977:         */
                   1978:        if (write) {
                   1979:                bp->b_dirtyoff = 0;
                   1980:                bp->b_dirtyend = npages << PAGE_SHIFT;
                   1981: #ifdef UVM_SWAP_ENCRYPT
                   1982:                /* mark the pages in the drum for decryption */
                   1983:                if (swap_encrypt_initialized)
                   1984:                        uvm_swap_markdecrypt(sdp, startslot, npages, encrypt);
                   1985: #endif
                   1986:                s = splbio();
                   1987:                swapdev_vp->v_numoutput++;
                   1988:                splx(s);
                   1989:        }
                   1990:
                   1991:        /*
                   1992:         * for async ops we must set up the iodone handler.
                   1993:         */
                   1994:        if (async) {
                   1995:                bp->b_flags |= B_CALL | (curproc == uvm.pagedaemon_proc ?
                   1996:                                         B_PDAEMON : 0);
                   1997:                bp->b_iodone = uvm_aio_biodone;
                   1998:                UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
                   1999:        }
                   2000:        UVMHIST_LOG(pdhist,
                   2001:            "about to start io: data = %p blkno = 0x%lx, bcount = %ld",
                   2002:            bp->b_data, bp->b_blkno, bp->b_bcount, 0);
                   2003:
                   2004:        /*
                   2005:         * now we start the I/O, and if async, return.
                   2006:         */
                   2007:        VOP_STRATEGY(bp);
                   2008:        if (async)
                   2009:                return (VM_PAGER_PEND);
                   2010:
                   2011:        /*
                   2012:         * must be sync i/o.   wait for it to finish
                   2013:         */
                   2014:        (void) biowait(bp);
                   2015:        result = (bp->b_flags & B_ERROR) ? VM_PAGER_ERROR : VM_PAGER_OK;
                   2016:
                   2017: #ifdef UVM_SWAP_ENCRYPT
                   2018:        /*
                   2019:         * decrypt swap
                   2020:         */
                   2021:        if (swap_encrypt_initialized &&
                   2022:            (bp->b_flags & B_READ) && !(bp->b_flags & B_ERROR)) {
                   2023:                int i;
                   2024:                caddr_t data = bp->b_data;
                   2025:                u_int64_t block = startblk;
                   2026:                struct swap_key *key = NULL;
                   2027:
                   2028:                for (i = 0; i < npages; i++) {
                   2029:                        /* Check if we need to decrypt */
                   2030:                        if (uvm_swap_needdecrypt(sdp, startslot + i)) {
                   2031:                                key = SWD_KEY(sdp, startslot + i);
                   2032:                                swap_decrypt(key, data, data, block,
                   2033:                                             1 << PAGE_SHIFT);
                   2034:                        }
                   2035:                        data += 1 << PAGE_SHIFT;
                   2036:                        block += btodb(1 << PAGE_SHIFT);
                   2037:                }
                   2038:        }
                   2039: #endif
                   2040:        /*
                   2041:         * kill the pager mapping
                   2042:         */
                   2043:        uvm_pagermapout(kva, npages);
                   2044:
                   2045: #ifdef UVM_SWAP_ENCRYPT
                   2046:        /*
                   2047:         *  Not anymore needed, free after encryption
                   2048:         */
                   2049:        if ((bp->b_flags & B_READ) == 0 && encrypt)
                   2050:                uvm_swap_freepages(tpps, npages);
                   2051: #endif
                   2052:        /*
                   2053:         * now dispose of the buf
                   2054:         */
                   2055:        s = splbio();
                   2056:        if (bp->b_vp)
                   2057:                brelvp(bp);
                   2058:
                   2059:        if (write && bp->b_vp)
                   2060:                vwakeup(bp->b_vp);
                   2061:        pool_put(&bufpool, bp);
                   2062:        splx(s);
                   2063:
                   2064:        /*
                   2065:         * finally return.
                   2066:         */
                   2067:        UVMHIST_LOG(pdhist, "<- done (sync)  result=%ld", result, 0, 0, 0);
                   2068:        return (result);
                   2069: }
                   2070:
                   2071: static void
                   2072: swapmount()
                   2073: {
                   2074:        struct swapdev *sdp;
                   2075:        struct swappri *spp;
                   2076:        struct vnode *vp;
                   2077:        dev_t swap_dev = swdevt[0].sw_dev;
                   2078:
                   2079:        /*
                   2080:         * No locking here since we happen to know that we will just be called
                   2081:         * once before any other process has forked.
                   2082:         */
                   2083:
                   2084:        if (swap_dev == NODEV) {
                   2085:                printf("swapmount: no device\n");
                   2086:                return;
                   2087:        }
                   2088:
                   2089:        if (bdevvp(swap_dev, &vp)) {
                   2090:                printf("swapmount: no device 2\n");
                   2091:                return;
                   2092:        }
                   2093:
                   2094:        sdp = malloc(sizeof(*sdp), M_VMSWAP, M_WAITOK);
                   2095:        spp = malloc(sizeof(*spp), M_VMSWAP, M_WAITOK);
                   2096:        memset(sdp, 0, sizeof(*sdp));
                   2097:
                   2098:        sdp->swd_flags = SWF_FAKE;
                   2099:        sdp->swd_dev = swap_dev;
                   2100:        sdp->swd_vp = vp;
                   2101:        swaplist_insert(sdp, spp, 0);
                   2102:        sdp->swd_pathlen = strlen("swap_device") + 1;
                   2103:        sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK);
                   2104:        if (copystr("swap_device", sdp->swd_path, sdp->swd_pathlen, 0))
                   2105:                panic("swapmount: copystr");
                   2106:
                   2107:        if (swap_on(curproc, sdp)) {
                   2108:                swaplist_find(vp, 1);
                   2109:                swaplist_trim();
                   2110:                vput(sdp->swd_vp);
                   2111:                free(sdp->swd_path, M_VMSWAP);
                   2112:                free(sdp, M_VMSWAP);
                   2113:                return;
                   2114:        }
                   2115:
                   2116:        VOP_UNLOCK(vp, 0, curproc);
                   2117: }

CVSweb