From 992379c016dedb50ba29655f60c2468c876dec17 Mon Sep 17 00:00:00 2001 From: sugam45 Date: Tue, 13 Jul 2021 06:06:27 +0530 Subject: [PATCH 1/3] Scripts Required for reading bootloader support for ZFS --- .../osnet/dist/uts/common/fs/zfs/sys/lz4.h | 55 + .../dist/uts/common/fs/zfs/sys/zfs_bootenv.h | 53 + .../uts/common/fs/zfs/sys/zfs_bootenv_os.h | 29 + .../osnet/dist/uts/common/fs/zfs/zfssubr.c | 1848 +++++++++++++++++ 4 files changed, 1985 insertions(+) create mode 100644 external/cddl/osnet/dist/uts/common/fs/zfs/sys/lz4.h create mode 100644 external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_bootenv.h create mode 100644 external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_bootenv_os.h create mode 100644 external/cddl/osnet/dist/uts/common/fs/zfs/zfssubr.c diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/lz4.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/lz4.h new file mode 100644 index 0000000000..153efe9161 --- /dev/null +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/lz4.h @@ -0,0 +1,55 @@ +/* + * LZ4 - Fast LZ compression algorithm + * Header File + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at : + * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html + * - LZ4 source repository : http://code.google.com/p/lz4/ + */ + +#ifndef _LZ4_H +#define _LZ4_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +extern size_t lz4_compress(void *, void *, size_t, size_t, int); +extern int lz4_decompress(void *, void *, size_t, size_t, int); + +#if defined(_KERNEL) || defined(_FAKE_KERNEL) +extern void lz4_init(void); +extern void lz4_fini(void); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _LZ4_H */ diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_bootenv.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_bootenv.h new file mode 100644 index 0000000000..cb06324c13 --- /dev/null +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_bootenv.h @@ -0,0 +1,53 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2020 Toomas Soome + */ + +#ifndef _ZFS_BOOTENV_H +#define _ZFS_BOOTENV_H + +/* + * Define macros for label bootenv nvlist pair keys. 
+ */ + +#ifdef __cplusplus +extern "C" { +#endif + +#define BOOTENV_VERSION "version" + +#define BE_ILLUMOS_VENDOR "illumos" +#define BE_NETBSD_VENDOR "netbsd" +#define BE_GRUB_VENDOR "grub" +#define BE_LINUX_VENDOR "linux" + +#include "zfs_bootenv_os.h" + +#define GRUB_ENVMAP BE_GRUB_VENDOR ":" "envmap" + +#define NETBSD_BOOTONCE BE_NETBSD_VENDOR ":" "bootonce" +#define NETBSD_BOOTONCE_USED BE_NETBSD_VENDOR ":" "bootonce-used" +#define NETBSD_NVSTORE BE_NETBSD_VENDOR ":" "nvstore" +#define ILLUMOS_BOOTONCE BE_ILLUMOS_VENDOR ":" "bootonce" +#define ILLUMOS_BOOTONCE_USED BE_ILLUMOS_VENDOR ":" "bootonce-used" +#define ILLUMOS_NVSTORE BE_ILLUMOS_VENDOR ":" "nvstore" + +#define OS_BOOTONCE BOOTENV_OS ":" "bootonce" +#define OS_BOOTONCE_USED BOOTENV_OS ":" "bootonce-used" +#define OS_NVSTORE BOOTENV_OS ":" "nvstore" + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFS_BOOTENV_H */ diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_bootenv_os.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_bootenv_os.h new file mode 100644 index 0000000000..e9a88bdee6 --- /dev/null +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_bootenv_os.h @@ -0,0 +1,29 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. 
Copyright 2020 Toomas Soome <tsoome@me.com>
__RCSID("$NetBSD: zfssubr.c,v 1.1 2021/07/13 19:34:06 sugam Exp $");
+
+#include <sys/cdefs.h>	/* XXX: include target lost in extraction -- verify; needed for __RCSID() */
+#include <lz4.h>	/* XXX: include target lost in extraction -- verify; lz4_decompress() is referenced below */
+ */ +typedef struct zio_checksum_info { + /* checksum function for each byteorder */ + zio_checksum_t *ci_func[2]; + zio_checksum_tmpl_init_t *ci_tmpl_init; + zio_checksum_tmpl_free_t *ci_tmpl_free; + zio_checksum_flags_t ci_flags; + const char *ci_name; /* descriptive name */ +} zio_checksum_info_t; + + + +static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { + {{NULL, NULL}, NULL, NULL, 0, "inherit"}, + {{NULL, NULL}, NULL, NULL, 0, "on"}, + {{zio_checksum_off, zio_checksum_off}, + NULL, NULL, 0, "off"}, + {{zio_checksum_SHA256, zio_checksum_SHA256}, + NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, + "label"}, + {{zio_checksum_SHA256, zio_checksum_SHA256}, + NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, + "gang_header"}, + {{fletcher_2_native, fletcher_2_byteswap}, + NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"}, + {{fletcher_2_native, fletcher_2_byteswap}, + NULL, NULL, 0, "fletcher2"}, + {{fletcher_4_native, fletcher_4_byteswap}, + NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"}, + {{zio_checksum_SHA256, zio_checksum_SHA256}, + NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | + ZCHECKSUM_FLAG_NOPWRITE, "SHA256"}, + {{fletcher_4_native, fletcher_4_byteswap}, + NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zillog2"}, + {{zio_checksum_off, zio_checksum_off}, + NULL, NULL, 0, "noparity"}, +#ifndef __NetBSD__ + {{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap}, + NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | + ZCHECKSUM_FLAG_NOPWRITE, "SHA512"}, + {{zio_checksum_skein_native, zio_checksum_skein_byteswap}, + zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free, + ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | + ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"}, +#endif + /* no edonr for now */ + {{NULL, NULL}, + NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED | + ZCHECKSUM_FLAG_NOPWRITE, "edonr"} +}; + +/* + * Common signature for all zio 
compress/decompress functions. + */ +typedef size_t zio_compress_func_t(void *src, void *dst, + size_t s_len, size_t d_len, int); +typedef int zio_decompress_func_t(void *src, void *dst, + size_t s_len, size_t d_len, int); + +/* + * Information about each compression function. + */ +typedef struct zio_compress_info { + zio_compress_func_t *ci_compress; /* compression function */ + zio_decompress_func_t *ci_decompress; /* decompression function */ + int ci_level; /* level parameter */ + const char *ci_name; /* algorithm name */ +} zio_compress_info_t; + +/* + * Compression vectors. + */ +static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { + {NULL, NULL, 0, "inherit"}, + {NULL, NULL, 0, "on"}, + {NULL, NULL, 0, "uncompressed"}, + {NULL, lzjb_decompress, 0, "lzjb"}, + {NULL, NULL, 0, "empty"}, + {NULL, NULL, 1, "gzip-1"}, + {NULL, NULL, 2, "gzip-2"}, + {NULL, NULL, 3, "gzip-3"}, + {NULL, NULL, 4, "gzip-4"}, + {NULL, NULL, 5, "gzip-5"}, + {NULL, NULL, 6, "gzip-6"}, + {NULL, NULL, 7, "gzip-7"}, + {NULL, NULL, 8, "gzip-8"}, + {NULL, NULL, 9, "gzip-9"}, + {NULL, zle_decompress, 64, "zle"}, + {NULL, lz4_decompress, 0, "lz4"}, +}; + +static void +byteswap_uint64_array(void *vbuf, size_t size) +{ + uint64_t *buf = vbuf; + size_t count = size >> 3; + int i; + + ASSERT((size & 7) == 0); + + for (i = 0; i < count; i++) + buf[i] = BSWAP_64(buf[i]); +} + +/* + * Set the external verifier for a gang block based on , + * a tuple which is guaranteed to be unique for the life of the pool. + */ +static void +zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp) +{ + dva_t *dva = BP_IDENTITY(bp); + uint64_t txg = BP_PHYSICAL_BIRTH(bp); + + ASSERT(BP_IS_GANG(bp)); + + ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0); +} + +/* + * Set the external verifier for a label block based on its offset. 
+ * The vdev is implicit, and the txg is unknowable at pool open time -- + * hence the logic in vdev_uberblock_load() to find the most recent copy. + */ +static void +zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset) +{ + ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0); +} + +/* + * Calls the template init function of a checksum which supports context + * templates and installs the template into the spa_t. + */ +static void +zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa) +{ + zio_checksum_info_t *ci = &zio_checksum_table[checksum]; + + if (ci->ci_tmpl_init == NULL) + return; + + if (spa->spa_cksum_tmpls[checksum] != NULL) + return; + + if (spa->spa_cksum_tmpls[checksum] == NULL) { + spa->spa_cksum_tmpls[checksum] = + ci->ci_tmpl_init(&spa->spa_cksum_salt); + } +} + +/* + * Called by a spa_t that's about to be deallocated. This steps through + * all of the checksum context templates and deallocates any that were + * initialized using the algorithm-specific template init function. 
+ */ +static void __unused +zio_checksum_templates_free(spa_t *spa) +{ + for (enum zio_checksum checksum = 0; + checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) { + if (spa->spa_cksum_tmpls[checksum] != NULL) { + zio_checksum_info_t *ci = &zio_checksum_table[checksum]; + + ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]); + spa->spa_cksum_tmpls[checksum] = NULL; + } + } +} + +static int +zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data) +{ + uint64_t size; + unsigned int checksum; + zio_checksum_info_t *ci; + void *ctx = NULL; + zio_cksum_t actual_cksum, expected_cksum, verifier; + int byteswap; + + checksum = BP_GET_CHECKSUM(bp); + uint64_t size = BP_GET_PSIZE(bp); + + if (checksum >= ZIO_CHECKSUM_FUNCTIONS) + return (EINVAL); + ci = &zio_checksum_table[checksum]; + if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL) + return (EINVAL); + + if (spa != NULL) { + zio_checksum_template_init(checksum, __DECONST(spa_t *,spa)); + ctx = spa->spa_cksum_tmpls[checksum]; + } + + if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { + zio_eck_t *eck; + + ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER || + checksum == ZIO_CHECKSUM_LABEL); + + eck = (zio_eck_t *)((char *)data + size) - 1; + + if (checksum == ZIO_CHECKSUM_GANG_HEADER) + zio_checksum_gang_verifier(&verifier, bp); + else if (checksum == ZIO_CHECKSUM_LABEL) + zio_checksum_label_verifier(&verifier, + DVA_GET_OFFSET(BP_IDENTITY(bp))); + else + verifier = bp->blk_cksum; + + byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC)); + + if (byteswap) + byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); + + expected_cksum = eck->zec_cksum; + eck->zec_cksum = verifier; + ci->ci_func[byteswap](data, size, ctx, &actual_cksum); + eck->zec_cksum = expected_cksum; + + if (byteswap) + byteswap_uint64_array(&expected_cksum, + sizeof (zio_cksum_t)); + } else { + byteswap = BP_SHOULD_BYTESWAP(bp); + expected_cksum = bp->blk_cksum; + ci->ci_func[byteswap](data, size, ctx, &actual_cksum); + } + + if 
(!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) { + /*printf("ZFS: read checksum %s failed\n", ci->ci_name);*/ + return (EIO); + } + + return (0); +} + +static int +zio_decompress_data(int cpfunc, void *src, uint64_t srcsize, + void *dest, size_t destsize) +{ + zio_compress_info_t *ci; + + if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) { + printf("ZFS: unsupported compression algorithm %u\n", cpfunc); + return (EIO); + } + + ci = &zio_compress_table[cpfunc]; + if (!ci->ci_decompress) { + printf("ZFS: unsupported compression algorithm %s\n", + ci->ci_name); + return (EIO); + } + + return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level)); +} + +static uint64_t +zap_hash(uint64_t salt, const char *name) +{ + const uint8_t *cp; + uint8_t c; + uint64_t crc = salt; + + ASSERT(crc != 0); + ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); + for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++) + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF]; + + /* + * Only use 28 bits, since we need 4 bits in the cookie for the + * collision differentiator. We MUST use the high bits, since + * those are the onces that we first pay attention to when + * chosing the bucket. + */ + crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1); + + return (crc); +} + +typedef struct raidz_col { + uint64_t rc_devidx; /* child device index for I/O */ + uint64_t rc_offset; /* device offset */ + uint64_t rc_size; /* I/O size */ + void *rc_data; /* I/O data */ + int rc_error; /* I/O error for this device */ + uint8_t rc_tried; /* Did we attempt this I/O column? */ + uint8_t rc_skipped; /* Did we skip this I/O column? 
*/ +} raidz_col_t; + +typedef struct raidz_map { + uint64_t rm_cols; /* Regular column count */ + uint64_t rm_scols; /* Count including skipped columns */ + uint64_t rm_bigcols; /* Number of oversized columns */ + uint64_t rm_asize; /* Actual total I/O size */ + uint64_t rm_missingdata; /* Count of missing data devices */ + uint64_t rm_missingparity; /* Count of missing parity devices */ + uint64_t rm_firstdatacol; /* First data column/parity count */ + uint64_t rm_nskip; /* Skipped sectors for padding */ + uint64_t rm_skipstart; /* Column index of padding start */ + uintptr_t rm_reports; /* # of referencing checksum reports */ + uint8_t rm_freed; /* map no longer has referencing ZIO */ + uint8_t rm_ecksuminjected; /* checksum error was injected */ + raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ +} raidz_map_t; + +#define VDEV_RAIDZ_P 0 +#define VDEV_RAIDZ_Q 1 +#define VDEV_RAIDZ_R 2 + +#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) +#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) + +/* + * We provide a mechanism to perform the field multiplication operation on a + * 64-bit value all at once rather than a byte at a time. This works by + * creating a mask from the top bit in each byte and using that to + * conditionally apply the XOR of 0x1d. + */ +#define VDEV_RAIDZ_64MUL_2(x, mask) \ +{ \ + (mask) = (x) & 0x8080808080808080ULL; \ + (mask) = ((mask) << 1) - ((mask) >> 7); \ + (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ + ((mask) & 0x1d1d1d1d1d1d1d1dULL); \ +} + +#define VDEV_RAIDZ_64MUL_4(x, mask) \ +{ \ + VDEV_RAIDZ_64MUL_2((x), mask); \ + VDEV_RAIDZ_64MUL_2((x), mask); \ +} + +#define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE) + +/* + * These two tables represent powers and logs of 2 in the Galois field defined + * above. These values were computed by repeatedly multiplying by 2 as above. 
+ */ +static const uint8_t vdev_raidz_pow2[256] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, + 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, + 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, + 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, + 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, + 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, + 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, + 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, + 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, + 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, + 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, + 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, + 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, + 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, + 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, + 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, + 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, + 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, + 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, + 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, + 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, + 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, + 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, + 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, + 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, + 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, + 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, + 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, + 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, + 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, + 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 +}; +static const uint8_t vdev_raidz_log2[256] = { + 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, + 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, + 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, + 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, + 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, + 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 
0x45, + 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, + 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, + 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, + 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, + 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, + 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, + 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, + 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, + 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, + 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, + 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, + 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, + 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, + 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, + 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, + 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, + 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, + 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, + 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, + 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, + 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, + 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, + 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, + 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, + 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, + 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, +}; + +/* + * Multiply a given number by 2 raised to the given power. 
+ */ +static uint8_t +vdev_raidz_exp2(uint8_t a, int exp) +{ + if (a == 0) + return (0); + + ASSERT(exp >= 0); + ASSERT(vdev_raidz_log2[a] > 0 || a == 1); + + exp += vdev_raidz_log2[a]; + if (exp > 255) + exp -= 255; + + return (vdev_raidz_pow2[exp]); +} + +static void +vdev_raidz_generate_parity_p(raidz_map_t *rm) +{ + uint64_t *p, *src, pcount, ccount, i; + int c; + + pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + src = rm->rm_col[c].rc_data; + p = rm->rm_col[VDEV_RAIDZ_P].rc_data; + ccount = rm->rm_col[c].rc_size / sizeof (src[0]); + + if (c == rm->rm_firstdatacol) { + ASSERT(ccount == pcount); + for (i = 0; i < ccount; i++, src++, p++) { + *p = *src; + } + } else { + ASSERT(ccount <= pcount); + for (i = 0; i < ccount; i++, src++, p++) { + *p ^= *src; + } + } + } +} + +static void +vdev_raidz_generate_parity_pq(raidz_map_t *rm) +{ + uint64_t *p, *q, *src, pcnt, ccnt, mask, i; + int c; + + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == + rm->rm_col[VDEV_RAIDZ_Q].rc_size); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + src = rm->rm_col[c].rc_data; + p = rm->rm_col[VDEV_RAIDZ_P].rc_data; + q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + + ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); + + if (c == rm->rm_firstdatacol) { + ASSERT(ccnt == pcnt || ccnt == 0); + for (i = 0; i < ccnt; i++, src++, p++, q++) { + *p = *src; + *q = *src; + } + for (; i < pcnt; i++, src++, p++, q++) { + *p = 0; + *q = 0; + } + } else { + ASSERT(ccnt <= pcnt); + + /* + * Apply the algorithm described above by multiplying + * the previous result and adding in the new value. + */ + for (i = 0; i < ccnt; i++, src++, p++, q++) { + *p ^= *src; + + VDEV_RAIDZ_64MUL_2(*q, mask); + *q ^= *src; + } + + /* + * Treat short columns as though they are full of 0s. + * Note that there's therefore nothing needed for P. 
+ */ + for (; i < pcnt; i++, q++) { + VDEV_RAIDZ_64MUL_2(*q, mask); + } + } + } +} + +static void +vdev_raidz_generate_parity_pqr(raidz_map_t *rm) +{ + uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; + int c; + + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == + rm->rm_col[VDEV_RAIDZ_Q].rc_size); + ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == + rm->rm_col[VDEV_RAIDZ_R].rc_size); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + src = rm->rm_col[c].rc_data; + p = rm->rm_col[VDEV_RAIDZ_P].rc_data; + q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + r = rm->rm_col[VDEV_RAIDZ_R].rc_data; + + ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); + + if (c == rm->rm_firstdatacol) { + ASSERT(ccnt == pcnt || ccnt == 0); + for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { + *p = *src; + *q = *src; + *r = *src; + } + for (; i < pcnt; i++, src++, p++, q++, r++) { + *p = 0; + *q = 0; + *r = 0; + } + } else { + ASSERT(ccnt <= pcnt); + + /* + * Apply the algorithm described above by multiplying + * the previous result and adding in the new value. + */ + for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { + *p ^= *src; + + VDEV_RAIDZ_64MUL_2(*q, mask); + *q ^= *src; + + VDEV_RAIDZ_64MUL_4(*r, mask); + *r ^= *src; + } + + /* + * Treat short columns as though they are full of 0s. + * Note that there's therefore nothing needed for P. + */ + for (; i < pcnt; i++, q++, r++) { + VDEV_RAIDZ_64MUL_2(*q, mask); + VDEV_RAIDZ_64MUL_4(*r, mask); + } + } + } +} + +/* + * Generate RAID parity in the first virtual columns according to the number of + * parity columns available. 
+ */ +static void +vdev_raidz_generate_parity(raidz_map_t *rm) +{ + switch (rm->rm_firstdatacol) { + case 1: + vdev_raidz_generate_parity_p(rm); + break; + case 2: + vdev_raidz_generate_parity_pq(rm); + break; + case 3: + vdev_raidz_generate_parity_pqr(rm); + break; + default: + panic("invalid RAID-Z configuration"); + } +} + +/* BEGIN CSTYLED */ +/* + * In the general case of reconstruction, we must solve the system of linear + * equations defined by the coeffecients used to generate parity as well as + * the contents of the data and parity disks. This can be expressed with + * vectors for the original data (D) and the actual data (d) and parity (p) + * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): + * + * __ __ __ __ + * | | __ __ | p_0 | + * | V | | D_0 | | p_m-1 | + * | | x | : | = | d_0 | + * | I | | D_n-1 | | : | + * | | ~~ ~~ | d_n-1 | + * ~~ ~~ ~~ ~~ + * + * I is simply a square identity matrix of size n, and V is a vandermonde + * matrix defined by the coeffecients we chose for the various parity columns + * (1, 2, 4). Note that these values were chosen both for simplicity, speedy + * computation as well as linear separability. + * + * __ __ __ __ + * | 1 .. 1 1 1 | | p_0 | + * | 2^n-1 .. 4 2 1 | __ __ | : | + * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | + * | 1 .. 0 0 0 | | D_1 | | d_0 | + * | 0 .. 0 0 0 | x | D_2 | = | d_1 | + * | : : : : | | : | | d_2 | + * | 0 .. 1 0 0 | | D_n-1 | | : | + * | 0 .. 0 1 0 | ~~ ~~ | : | + * | 0 .. 0 0 1 | | d_n-1 | + * ~~ ~~ ~~ ~~ + * + * Note that I, V, d, and p are known. To compute D, we must invert the + * matrix and use the known data and parity values to reconstruct the unknown + * data values. We begin by removing the rows in V|I and d|p that correspond + * to failed or missing columns; we then make V|I square (n x n) and d|p + * sized n by removing rows corresponding to unused parity from the bottom up + * to generate (V|I)' and (d|p)'. 
We can then generate the inverse of (V|I)' + * using Gauss-Jordan elimination. In the example below we use m=3 parity + * columns, n=8 data columns, with errors in d_1, d_2, and p_1: + * __ __ + * | 1 1 1 1 1 1 1 1 | + * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks + * | 19 205 116 29 64 16 4 1 | / / + * | 1 0 0 0 0 0 0 0 | / / + * | 0 1 0 0 0 0 0 0 | <--' / + * (V|I) = | 0 0 1 0 0 0 0 0 | <---' + * | 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 1 1 1 1 1 1 1 | + * | 128 64 32 16 8 4 2 1 | + * | 19 205 116 29 64 16 4 1 | + * | 1 0 0 0 0 0 0 0 | + * | 0 1 0 0 0 0 0 0 | + * (V|I)' = | 0 0 1 0 0 0 0 0 | + * | 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * + * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We + * have carefully chosen the seed values 1, 2, and 4 to ensure that this + * matrix is not singular. 
+ * __ __ + * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | + * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | + * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | + * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | + * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | + * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | + * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 
0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 0 0 1 0 0 0 0 0 | + * | 167 100 5 41 159 169 217 208 | + * | 166 100 4 40 158 168 216 209 | + * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * + * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values + * of the missing data. + * + * As is apparent from the example above, the only non-trivial rows in the + * inverse matrix correspond to the data disks that we're trying to + * reconstruct. Indeed, those are the only rows we need as the others would + * only be useful for reconstructing data known or assumed to be valid. For + * that reason, we only build the coefficients in the rows that correspond to + * targeted columns. + */ +/* END CSTYLED */ + +static void +vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, + uint8_t **rows) +{ + int i, j; + int pow; + + ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); + + /* + * Fill in the missing rows of interest. + */ + for (i = 0; i < nmap; i++) { + ASSERT3S(0, <=, map[i]); + ASSERT3S(map[i], <=, 2); + + pow = map[i] * n; + if (pow > 255) + pow -= 255; + ASSERT(pow <= 255); + + for (j = 0; j < n; j++) { + pow -= map[i]; + if (pow < 0) + pow += 255; + rows[i][j] = vdev_raidz_pow2[pow]; + } + } +} + +static void +vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, + uint8_t **rows, uint8_t **invrows, const uint8_t *used) +{ + int i, j, ii, jj; + uint8_t log; + + /* + * Assert that the first nmissing entries from the array of used + * columns correspond to parity columns and that subsequent entries + * correspond to data columns. 
+ */ + for (i = 0; i < nmissing; i++) { + ASSERT3S(used[i], <, rm->rm_firstdatacol); + } + for (; i < n; i++) { + ASSERT3S(used[i], >=, rm->rm_firstdatacol); + } + + /* + * First initialize the storage where we'll compute the inverse rows. + */ + for (i = 0; i < nmissing; i++) { + for (j = 0; j < n; j++) { + invrows[i][j] = (i == j) ? 1 : 0; + } + } + + /* + * Subtract all trivial rows from the rows of consequence. + */ + for (i = 0; i < nmissing; i++) { + for (j = nmissing; j < n; j++) { + ASSERT3U(used[j], >=, rm->rm_firstdatacol); + jj = used[j] - rm->rm_firstdatacol; + ASSERT3S(jj, <, n); + invrows[i][j] = rows[i][jj]; + rows[i][jj] = 0; + } + } + + /* + * For each of the rows of interest, we must normalize it and subtract + * a multiple of it from the other rows. + */ + for (i = 0; i < nmissing; i++) { + for (j = 0; j < missing[i]; j++) { + ASSERT3U(rows[i][j], ==, 0); + } + ASSERT3U(rows[i][missing[i]], !=, 0); + + /* + * Compute the inverse of the first element and multiply each + * element in the row by that value. + */ + log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; + + for (j = 0; j < n; j++) { + rows[i][j] = vdev_raidz_exp2(rows[i][j], log); + invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); + } + + for (ii = 0; ii < nmissing; ii++) { + if (i == ii) + continue; + + ASSERT3U(rows[ii][missing[i]], !=, 0); + + log = vdev_raidz_log2[rows[ii][missing[i]]]; + + for (j = 0; j < n; j++) { + rows[ii][j] ^= + vdev_raidz_exp2(rows[i][j], log); + invrows[ii][j] ^= + vdev_raidz_exp2(invrows[i][j], log); + } + } + } + + /* + * Verify that the data that is left in the rows are properly part of + * an identity matrix. 
+ */ + for (i = 0; i < nmissing; i++) { + for (j = 0; j < n; j++) { + if (j == missing[i]) { + ASSERT3U(rows[i][j], ==, 1); + } else { + ASSERT3U(rows[i][j], ==, 0); + } + } + } +} + +static void +vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, + int *missing, uint8_t **invrows, const uint8_t *used) +{ + int i, j, x, cc, c; + uint8_t *src; + uint64_t ccount; + uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; + uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; + uint8_t log, val; + int ll; + uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; + uint8_t *p, *pp; + size_t psize; + + log = 0; /* gcc */ + psize = sizeof (invlog[0][0]) * n * nmissing; + p = malloc(psize); + if (p == NULL) { + printf("Out of memory\n"); + return; + } + + for (pp = p, i = 0; i < nmissing; i++) { + invlog[i] = pp; + pp += n; + } + + for (i = 0; i < nmissing; i++) { + for (j = 0; j < n; j++) { + ASSERT3U(invrows[i][j], !=, 0); + invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; + } + } + + for (i = 0; i < n; i++) { + c = used[i]; + ASSERT3U(c, <, rm->rm_cols); + + src = rm->rm_col[c].rc_data; + ccount = rm->rm_col[c].rc_size; + for (j = 0; j < nmissing; j++) { + cc = missing[j] + rm->rm_firstdatacol; + ASSERT3U(cc, >=, rm->rm_firstdatacol); + ASSERT3U(cc, <, rm->rm_cols); + ASSERT3U(cc, !=, c); + + dst[j] = rm->rm_col[cc].rc_data; + dcount[j] = rm->rm_col[cc].rc_size; + } + + ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); + + for (x = 0; x < ccount; x++, src++) { + if (*src != 0) + log = vdev_raidz_log2[*src]; + + for (cc = 0; cc < nmissing; cc++) { + if (x >= dcount[cc]) + continue; + + if (*src == 0) { + val = 0; + } else { + if ((ll = log + invlog[cc][i]) >= 255) + ll -= 255; + val = vdev_raidz_pow2[ll]; + } + + if (i == 0) + dst[cc][x] = val; + else + dst[cc][x] ^= val; + } + } + } + + free(p); +} + +static int +vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) +{ + int n, i, c, t, tt; + int nmissing_rows; + int missing_rows[VDEV_RAIDZ_MAXPARITY]; + int 
parity_map[VDEV_RAIDZ_MAXPARITY]; + + uint8_t *p, *pp; + size_t psize; + + uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; + uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; + uint8_t *used; + + int code = 0; + + + n = rm->rm_cols - rm->rm_firstdatacol; + + /* + * Figure out which data columns are missing. + */ + nmissing_rows = 0; + for (t = 0; t < ntgts; t++) { + if (tgts[t] >= rm->rm_firstdatacol) { + missing_rows[nmissing_rows++] = + tgts[t] - rm->rm_firstdatacol; + } + } + + /* + * Figure out which parity columns to use to help generate the missing + * data columns. + */ + for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { + ASSERT(tt < ntgts); + ASSERT(c < rm->rm_firstdatacol); + + /* + * Skip any targeted parity columns. + */ + if (c == tgts[tt]) { + tt++; + continue; + } + + code |= 1 << c; + + parity_map[i] = c; + i++; + } + + ASSERT(code != 0); + ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); + + psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * + nmissing_rows * n + sizeof (used[0]) * n; + p = malloc(psize); + if (p == NULL) { + printf("Out of memory\n"); + return (code); + } + + for (pp = p, i = 0; i < nmissing_rows; i++) { + rows[i] = pp; + pp += n; + invrows[i] = pp; + pp += n; + } + used = pp; + + for (i = 0; i < nmissing_rows; i++) { + used[i] = parity_map[i]; + } + + for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + if (tt < nmissing_rows && + c == missing_rows[tt] + rm->rm_firstdatacol) { + tt++; + continue; + } + + ASSERT3S(i, <, n); + used[i] = c; + i++; + } + + /* + * Initialize the interesting rows of the matrix. + */ + vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); + + /* + * Invert the matrix. + */ + vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, + invrows, used); + + /* + * Reconstruct the missing data using the generated matrix. 
+ */ + vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, + invrows, used); + + free(p); + + return (code); +} + +static int +vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) +{ + int tgts[VDEV_RAIDZ_MAXPARITY]; + int ntgts; + int i, c; + int code; + int nbadparity, nbaddata; + + /* + * The tgts list must already be sorted. + */ + for (i = 1; i < nt; i++) { + ASSERT(t[i] > t[i - 1]); + } + + nbadparity = rm->rm_firstdatacol; + nbaddata = rm->rm_cols - nbadparity; + ntgts = 0; + for (i = 0, c = 0; c < rm->rm_cols; c++) { + if (i < nt && c == t[i]) { + tgts[ntgts++] = c; + i++; + } else if (rm->rm_col[c].rc_error != 0) { + tgts[ntgts++] = c; + } else if (c >= rm->rm_firstdatacol) { + nbaddata--; + } else { + nbadparity--; + } + } + + ASSERT(ntgts >= nt); + ASSERT(nbaddata >= 0); + ASSERT(nbaddata + nbadparity == ntgts); + + code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); + ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); + ASSERT(code > 0); + return (code); +} + +static raidz_map_t * +vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, boolean_t dofree, + uint64_t unit_shift, uint64_t dcols, uint64_t nparity) +{ + raidz_map_t *rm; + /* The starting RAIDZ (parent) vdev sector of the block. */ + uint64_t b = offset >> unit_shift; + /* The zio's size in units of the vdev's minimum sector size. */ + uint64_t s = size >> unit_shift; + /* The first column for this stripe. */ + uint64_t f = b % dcols; + /* The starting byte offset on each child vdev. */ + uint64_t o = (b / dcols) << unit_shift; + uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; + + /* + * "Quotient": The number of data sectors for this stripe on all but + * the "big column" child vdevs that also contain "remainder" data. + */ + q = s / (dcols - nparity); + + /* + * "Remainder": The number of partial stripe data sectors in this I/O. + * This will add a sector to some, but not all, child vdevs. 
+ */ + r = s - q * (dcols - nparity); + + /* The number of "big columns" - those which contain remainder data. */ + bc = (r == 0 ? 0 : r + nparity); + + /* + * The total number of data and parity sectors associated with + * this I/O. + */ + tot = s + nparity * (q + (r == 0 ? 0 : 1)); + + /* acols: The columns that will be accessed. */ + /* scols: The columns that will be accessed or skipped. */ + if (q == 0) { + /* Our I/O request doesn't span all child vdevs. */ + acols = bc; + scols = MIN(dcols, roundup(bc, nparity + 1)); + } else { + acols = dcols; + scols = dcols; + } + + ASSERT3U(acols, <=, scols); + + rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); + + rm->rm_cols = acols; + rm->rm_scols = scols; + rm->rm_bigcols = bc; + rm->rm_skipstart = bc; + rm->rm_missingdata = 0; + rm->rm_missingparity = 0; + rm->rm_firstdatacol = nparity; + rm->rm_datacopy = NULL; + rm->rm_reports = 0; + rm->rm_freed = 0; + rm->rm_ecksuminjected = 0; + + asize = 0; + + for (c = 0; c < scols; c++) { + col = f + c; + coff = o; + if (col >= dcols) { + col -= dcols; + coff += 1ULL << unit_shift; + } + rm->rm_col[c].rc_devidx = col; + rm->rm_col[c].rc_offset = coff; + rm->rm_col[c].rc_data = NULL; + rm->rm_col[c].rc_gdata = NULL; + rm->rm_col[c].rc_error = 0; + rm->rm_col[c].rc_tried = 0; + rm->rm_col[c].rc_skipped = 0; + + if (c >= acols) + rm->rm_col[c].rc_size = 0; + else if (c < bc) + rm->rm_col[c].rc_size = (q + 1) << unit_shift; + else + rm->rm_col[c].rc_size = q << unit_shift; + + asize += rm->rm_col[c].rc_size; + } + + ASSERT3U(asize, ==, tot << unit_shift); + rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); + rm->rm_nskip = roundup(tot, nparity + 1) - tot; + ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); + ASSERT3U(rm->rm_nskip, <=, nparity); + + if (!dofree) { + for (c = 0; c < rm->rm_firstdatacol; c++) { + rm->rm_col[c].rc_data = + zio_buf_alloc(rm->rm_col[c].rc_size); + } + + rm->rm_col[c].rc_data = data; + + for (c = c + 1; c < 
acols; c++) { + rm->rm_col[c].rc_data = + (char *)rm->rm_col[c - 1].rc_data + + rm->rm_col[c - 1].rc_size; + } + } + + /* + * If all data stored spans all columns, there's a danger that parity + * will always be on the same device and, since parity isn't read + * during normal operation, that that device's I/O bandwidth won't be + * used effectively. We therefore switch the parity every 1MB. + * + * ... at least that was, ostensibly, the theory. As a practical + * matter unless we juggle the parity between all devices evenly, we + * won't see any benefit. Further, occasional writes that aren't a + * multiple of the LCM of the number of children and the minimum + * stripe width are sufficient to avoid pessimal behavior. + * Unfortunately, this decision created an implicit on-disk format + * requirement that we need to support for all eternity, but only + * for single-parity RAID-Z. + * + * If we intend to skip a sector in the zeroth column for padding + * we must make sure to note this swap. We will never intend to + * skip the first column since at least one data and one parity + * column must appear in each row. 
+ */ + ASSERT(rm->rm_cols >= 2); + ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); + + if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) { + devidx = rm->rm_col[0].rc_devidx; + o = rm->rm_col[0].rc_offset; + rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; + rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; + rm->rm_col[1].rc_devidx = devidx; + rm->rm_col[1].rc_offset = o; + + if (rm->rm_skipstart == 0) + rm->rm_skipstart = 1; + } + + return (rm); +} + +static void +vdev_raidz_map_free(raidz_map_t *rm) +{ + int c; + + for (c = rm->rm_firstdatacol - 1; c >= 0; c--) + free(rm->rm_col[c].rc_data); + + free(rm); +} + +static vdev_t * +vdev_child(vdev_t *pvd, uint64_t devidx) +{ + vdev_t *cvd; + + STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) { + if (cvd->v_id == devidx) + break; + } + + return (cvd); +} + +/* + * We keep track of whether or not there were any injected errors, so that + * any ereports we generate can note it. + */ +static int +raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data, + uint64_t size) +{ + return (zio_checksum_verify(spa, bp, data)); +} + +/* + * Generate the parity from the data columns. If we tried and were able to + * read the parity without error, verify that the generated parity matches the + * data we read. If it doesn't, we fire off a checksum error. Return the + * number such failures. 
+ */ +static int +raidz_parity_verify(raidz_map_t *rm) +{ + void *orig[VDEV_RAIDZ_MAXPARITY]; + int c, ret = 0; + raidz_col_t *rc; + + for (c = 0; c < rm->rm_firstdatacol; c++) { + rc = &rm->rm_col[c]; + if (!rc->rc_tried || rc->rc_error != 0) + continue; + orig[c] = malloc(rc->rc_size); + if (orig[c] != NULL) { + bcopy(rc->rc_data, orig[c], rc->rc_size); + } else { + printf("Out of memory\n"); + } + } + + vdev_raidz_generate_parity(rm); + + for (c = rm->rm_firstdatacol - 1; c >= 0; c--) { + rc = &rm->rm_col[c]; + if (!rc->rc_tried || rc->rc_error != 0) + continue; + if (orig[c] == NULL || + bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { + rc->rc_error = ECKSUM; + ret++; + } + free(orig[c]); + } + + return (ret); +} + +/* + * Iterate over all combinations of bad data and attempt a reconstruction. + * Note that the algorithm below is non-optimal because it doesn't take into + * account how reconstruction is actually performed. For example, with + * triple-parity RAID-Z the reconstruction procedure is the same if column 4 + * is targeted as invalid as if columns 1 and 4 are targeted since in both + * cases we'd only use parity information in column 0. + */ +static int +vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp, + void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors) +{ + raidz_col_t *rc; + void *orig[VDEV_RAIDZ_MAXPARITY]; + int tstore[VDEV_RAIDZ_MAXPARITY + 2]; + int *tgts = &tstore[1]; + int current, next, i, c, n; + int code, ret = 0; + + ASSERT(total_errors < rm->rm_firstdatacol); + + /* + * This simplifies one edge condition. + */ + tgts[-1] = -1; + + for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { + /* + * Initialize the targets array by finding the first n columns + * that contain no error. + * + * If there were no data errors, we need to ensure that we're + * always explicitly attempting to reconstruct at least one + * data column. 
To do this, we simply push the highest target + * up into the data columns. + */ + for (c = 0, i = 0; i < n; i++) { + if (i == n - 1 && data_errors == 0 && + c < rm->rm_firstdatacol) { + c = rm->rm_firstdatacol; + } + + while (rm->rm_col[c].rc_error != 0) { + c++; + ASSERT3S(c, <, rm->rm_cols); + } + + tgts[i] = c++; + } + + /* + * Setting tgts[n] simplifies the other edge condition. + */ + tgts[n] = rm->rm_cols; + + /* + * These buffers were allocated in previous iterations. + */ + for (i = 0; i < n - 1; i++) { + ASSERT(orig[i] != NULL); + } + + orig[n - 1] = malloc(rm->rm_col[0].rc_size); + if (orig[n - 1] == NULL) { + ret = ENOMEM; + goto done; + } + + current = 0; + next = tgts[current]; + + while (current != n) { + tgts[current] = next; + current = 0; + + /* + * Save off the original data that we're going to + * attempt to reconstruct. + */ + for (i = 0; i < n; i++) { + ASSERT(orig[i] != NULL); + c = tgts[i]; + ASSERT3S(c, >=, 0); + ASSERT3S(c, <, rm->rm_cols); + rc = &rm->rm_col[c]; + bcopy(rc->rc_data, orig[i], rc->rc_size); + } + + /* + * Attempt a reconstruction and exit the outer loop on + * success. + */ + code = vdev_raidz_reconstruct(rm, tgts, n); + if (raidz_checksum_verify(spa, bp, data, bytes) == 0) { + for (i = 0; i < n; i++) { + c = tgts[i]; + rc = &rm->rm_col[c]; + ASSERT(rc->rc_error == 0); + rc->rc_error = ECKSUM; + } + + ret = code; + goto done; + } + + /* + * Restore the original data. + */ + for (i = 0; i < n; i++) { + c = tgts[i]; + rc = &rm->rm_col[c]; + bcopy(orig[i], rc->rc_data, rc->rc_size); + } + + do { + /* + * Find the next valid column after the current + * position.. + */ + for (next = tgts[current] + 1; + next < rm->rm_cols && + rm->rm_col[next].rc_error != 0; next++) + continue; + + ASSERT(next <= tgts[current + 1]); + + /* + * If that spot is available, we're done here. + */ + if (next != tgts[current + 1]) + break; + + /* + * Otherwise, find the next valid column after + * the previous position. 
+			 */
+			for (c = tgts[current - 1] + 1;
+			    rm->rm_col[c].rc_error != 0; c++)
+				continue;
+
+			tgts[current] = c;
+			current++;
+
+		} while (current != n);
+	}
+	}
+	n--;
+done:
+	for (i = n - 1; i >= 0; i--) {
+		free(orig[i]);
+	}
+
+	return (ret);
+}
+
+static int
+vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
+    off_t offset, size_t bytes)
+{
+	vdev_t *tvd = vd->v_top;
+	vdev_t *cvd;
+	raidz_map_t *rm;
+	raidz_col_t *rc;
+	int c, error;
+	int unexpected_errors;
+	int parity_errors;
+	int parity_untried;
+	int data_errors;
+	int total_errors;
+	int n;
+	int tgts[VDEV_RAIDZ_MAXPARITY];
+	int code;
+
+	rc = NULL;	/* gcc */
+	error = 0;
+
+	rm = vdev_raidz_map_alloc(data, bytes, offset, B_FALSE,
+	    tvd->v_ashift, vd->v_nchildren, vd->v_nparity);
+	if (rm == NULL)
+		return (ENOMEM);
+
+	/*
+	 * Iterate over the columns in reverse order so that we hit the parity
+	 * last -- any errors along the way will force us to read the parity.
+	 */
+	for (c = rm->rm_cols - 1; c >= 0; c--) {
+		rc = &rm->rm_col[c];
+		cvd = vdev_child(vd, rc->rc_devidx);
+		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
+			if (c >= rm->rm_firstdatacol)
+				rm->rm_missingdata++;
+			else
+				rm->rm_missingparity++;
+			rc->rc_error = ENXIO;
+			rc->rc_tried = 1;	/* don't even try */
+			rc->rc_skipped = 1;
+			continue;
+		}
+#if 0	/* XXX: Too hard for the boot code. 
*/ + if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { + if (c >= rm->rm_firstdatacol) + rm->rm_missingdata++; + else + rm->rm_missingparity++; + rc->rc_error = ESTALE; + rc->rc_skipped = 1; + continue; + } +#endif + if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) { + rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data, + rc->rc_offset, rc->rc_size); + rc->rc_tried = 1; + rc->rc_skipped = 0; + } + } + +reconstruct: + unexpected_errors = 0; + parity_errors = 0; + parity_untried = 0; + data_errors = 0; + total_errors = 0; + + ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); + ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); + + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + + if (rc->rc_error) { + ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ + + if (c < rm->rm_firstdatacol) + parity_errors++; + else + data_errors++; + + if (!rc->rc_skipped) + unexpected_errors++; + + total_errors++; + } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { + parity_untried++; + } + } + + /* + * There are three potential phases for a read: + * 1. produce valid data from the columns read + * 2. read all disks and try again + * 3. perform combinatorial reconstruction + * + * Each phase is progressively both more expensive and less likely to + * occur. If we encounter more errors than we can repair or all phases + * fail, we have no choice but to return an error. + */ + + /* + * If the number of errors we saw was correctable -- less than or equal + * to the number of parity disks read -- attempt to produce data that + * has a valid checksum. Naturally, this case applies in the absence of + * any errors. + */ + if (total_errors <= rm->rm_firstdatacol - parity_untried) { + int rv; + + if (data_errors == 0) { + rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes); + if (rv == 0) { + /* + * If we read parity information (unnecessarily + * as it happens since no reconstruction was + * needed) regenerate and verify the parity. 
+ * We also regenerate parity when resilvering + * so we can write it out to the failed device + * later. + */ + if (parity_errors + parity_untried < + rm->rm_firstdatacol) { + n = raidz_parity_verify(rm); + unexpected_errors += n; + ASSERT(parity_errors + n <= + rm->rm_firstdatacol); + } + goto done; + } + } else { + /* + * We either attempt to read all the parity columns or + * none of them. If we didn't try to read parity, we + * wouldn't be here in the correctable case. There must + * also have been fewer parity errors than parity + * columns or, again, we wouldn't be in this code path. + */ + ASSERT(parity_untried == 0); + ASSERT(parity_errors < rm->rm_firstdatacol); + + /* + * Identify the data columns that reported an error. + */ + n = 0; + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + if (rc->rc_error != 0) { + ASSERT(n < VDEV_RAIDZ_MAXPARITY); + tgts[n++] = c; + } + } + + ASSERT(rm->rm_firstdatacol >= n); + + code = vdev_raidz_reconstruct(rm, tgts, n); + + rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes); + if (rv == 0) { + /* + * If we read more parity disks than were used + * for reconstruction, confirm that the other + * parity disks produced correct data. This + * routine is suboptimal in that it regenerates + * the parity that we already used in addition + * to the parity that we're attempting to + * verify, but this should be a relatively + * uncommon case, and can be optimized if it + * becomes a problem. Note that we regenerate + * parity when resilvering so we can write it + * out to failed devices later. + */ + if (parity_errors < rm->rm_firstdatacol - n) { + n = raidz_parity_verify(rm); + unexpected_errors += n; + ASSERT(parity_errors + n <= + rm->rm_firstdatacol); + } + + goto done; + } + } + } + + /* + * This isn't a typical situation -- either we got a read + * error or a child silently returned bad data. Read every + * block so we can try again with as much data and parity as + * we can track down. 
If we've already been through once + * before, all children will be marked as tried so we'll + * proceed to combinatorial reconstruction. + */ + unexpected_errors = 1; + rm->rm_missingdata = 0; + rm->rm_missingparity = 0; + + n = 0; + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + + if (rc->rc_tried) + continue; + + cvd = vdev_child(vd, rc->rc_devidx); + ASSERT(cvd != NULL); + rc->rc_error = cvd->v_read(cvd, NULL, + rc->rc_data, rc->rc_offset, rc->rc_size); + if (rc->rc_error == 0) + n++; + rc->rc_tried = 1; + rc->rc_skipped = 0; + } + /* + * If we managed to read anything more, retry the + * reconstruction. + */ + if (n > 0) + goto reconstruct; + + /* + * At this point we've attempted to reconstruct the data given the + * errors we detected, and we've attempted to read all columns. There + * must, therefore, be one or more additional problems -- silent errors + * resulting in invalid data rather than explicit I/O errors resulting + * in absent data. We check if there is enough additional data to + * possibly reconstruct the data and then perform combinatorial + * reconstruction over all possible combinations. If that fails, + * we're cooked. + */ + if (total_errors > rm->rm_firstdatacol) { + error = EIO; + } else if (total_errors < rm->rm_firstdatacol && + (code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes, + total_errors, data_errors)) != 0) { + /* + * If we didn't use all the available parity for the + * combinatorial reconstruction, verify that the remaining + * parity is correct. + */ + if (code != (1 << rm->rm_firstdatacol) - 1) + (void) raidz_parity_verify(rm); + } else { + /* + * We're here because either: + * + * total_errors == rm_first_datacol, or + * vdev_raidz_combrec() failed + * + * In either case, there is enough bad data to prevent + * reconstruction. + * + * Start checksum ereports for all children which haven't + * failed, and the IO wasn't speculative. 
+ */ + error = ECKSUM; + } + +done: + vdev_raidz_map_free(rm); + + return (error); +} From 96b416bf27fda61858bb27c710f5cd28f1208f40 Mon Sep 17 00:00:00 2001 From: sugam45 Date: Fri, 16 Jul 2021 03:20:58 +0530 Subject: [PATCH 2/3] Edits to the Original Commit --- .../osnet/dist/uts/common/fs/zfs/skein_zfs.c | 2 +- .../osnet/dist/uts/common/fs/zfs/sys/lz4.h | 2 +- .../osnet/dist/uts/common/fs/zfs/sys/skein.h | 183 ++++++++ .../dist/uts/common/fs/zfs/sys/zfs_bootenv.h | 36 +- .../uts/common/fs/zfs/sys/zfs_bootenv_os.h | 8 +- .../osnet/dist/uts/common/fs/zfs/zfssubr.c | 417 ++++-------------- sys/modules/zfs/Makefile.zfsmod | 3 +- 7 files changed, 289 insertions(+), 362 deletions(-) create mode 100644 external/cddl/osnet/dist/uts/common/fs/zfs/sys/skein.h diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/skein_zfs.c b/external/cddl/osnet/dist/uts/common/fs/zfs/skein_zfs.c index f8d1cf9dd1..71462dfe50 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/skein_zfs.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/skein_zfs.c @@ -24,7 +24,7 @@ #include #include #ifdef _KERNEL -#include +#include #else #include #endif diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/lz4.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/lz4.h index 153efe9161..b29a12d462 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/lz4.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/lz4.h @@ -32,7 +32,7 @@ */ #ifndef _LZ4_H -#define _LZ4_H +#define _LZ4_H #include diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/skein.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/skein.h new file mode 100644 index 0000000000..e5f2fb5ae3 --- /dev/null +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/skein.h @@ -0,0 +1,183 @@ +/* + * Interface declarations for Skein hashing. + * Source code author: Doug Whiting, 2008. + * This algorithm and source code is released to the public domain. 
+ * + * The following compile-time switches may be defined to control some + * tradeoffs between speed, code size, error checking, and security. + * + * The "default" note explains what happens when the switch is not defined. + * + * SKEIN_DEBUG -- make callouts from inside Skein code + * to examine/display intermediate values. + * [default: no callouts (no overhead)] + * + * SKEIN_ERR_CHECK -- how error checking is handled inside Skein + * code. If not defined, most error checking + * is disabled (for performance). Otherwise, + * the switch value is interpreted as: + * 0: use assert() to flag errors + * 1: return SKEIN_FAIL to flag errors + */ +/* Copyright 2013 Doug Whiting. This code is released to the public domain. */ +#ifndef _SYS_SKEIN_H_ +#define _SYS_SKEIN_H_ + +#ifdef _KERNEL +#include /* get size_t definition */ +#else +#include +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +enum { + SKEIN_SUCCESS = 0, /* return codes from Skein calls */ + SKEIN_FAIL = 1, + SKEIN_BAD_HASHLEN = 2 +}; + +#define SKEIN_MODIFIER_WORDS (2) /* number of modifier (tweak) words */ + +#define SKEIN_256_STATE_WORDS (4) +#define SKEIN_512_STATE_WORDS (8) +#define SKEIN1024_STATE_WORDS (16) +#define SKEIN_MAX_STATE_WORDS (16) + +#define SKEIN_256_STATE_BYTES (8 * SKEIN_256_STATE_WORDS) +#define SKEIN_512_STATE_BYTES (8 * SKEIN_512_STATE_WORDS) +#define SKEIN1024_STATE_BYTES (8 * SKEIN1024_STATE_WORDS) + +#define SKEIN_256_STATE_BITS (64 * SKEIN_256_STATE_WORDS) +#define SKEIN_512_STATE_BITS (64 * SKEIN_512_STATE_WORDS) +#define SKEIN1024_STATE_BITS (64 * SKEIN1024_STATE_WORDS) + +#define SKEIN_256_BLOCK_BYTES (8 * SKEIN_256_STATE_WORDS) +#define SKEIN_512_BLOCK_BYTES (8 * SKEIN_512_STATE_WORDS) +#define SKEIN1024_BLOCK_BYTES (8 * SKEIN1024_STATE_WORDS) + +typedef struct { + size_t hashBitLen; /* size of hash result, in bits */ + size_t bCnt; /* current byte count in buffer b[] */ + /* tweak words: T[0]=byte cnt, T[1]=flags */ + uint64_t T[SKEIN_MODIFIER_WORDS]; +} 
Skein_Ctxt_Hdr_t; + +typedef struct { /* 256-bit Skein hash context structure */ + Skein_Ctxt_Hdr_t h; /* common header context variables */ + uint64_t X[SKEIN_256_STATE_WORDS]; /* chaining variables */ + /* partial block buffer (8-byte aligned) */ + uint8_t b[SKEIN_256_BLOCK_BYTES]; +} Skein_256_Ctxt_t; + +typedef struct { /* 512-bit Skein hash context structure */ + Skein_Ctxt_Hdr_t h; /* common header context variables */ + uint64_t X[SKEIN_512_STATE_WORDS]; /* chaining variables */ + /* partial block buffer (8-byte aligned) */ + uint8_t b[SKEIN_512_BLOCK_BYTES]; +} Skein_512_Ctxt_t; + +typedef struct { /* 1024-bit Skein hash context structure */ + Skein_Ctxt_Hdr_t h; /* common header context variables */ + uint64_t X[SKEIN1024_STATE_WORDS]; /* chaining variables */ + /* partial block buffer (8-byte aligned) */ + uint8_t b[SKEIN1024_BLOCK_BYTES]; +} Skein1024_Ctxt_t; + +/* Skein APIs for (incremental) "straight hashing" */ +int Skein_256_Init(Skein_256_Ctxt_t *ctx, size_t hashBitLen); +int Skein_512_Init(Skein_512_Ctxt_t *ctx, size_t hashBitLen); +int Skein1024_Init(Skein1024_Ctxt_t *ctx, size_t hashBitLen); + +int Skein_256_Update(Skein_256_Ctxt_t *ctx, const uint8_t *msg, + size_t msgByteCnt); +int Skein_512_Update(Skein_512_Ctxt_t *ctx, const uint8_t *msg, + size_t msgByteCnt); +int Skein1024_Update(Skein1024_Ctxt_t *ctx, const uint8_t *msg, + size_t msgByteCnt); + +int Skein_256_Final(Skein_256_Ctxt_t *ctx, uint8_t *hashVal); +int Skein_512_Final(Skein_512_Ctxt_t *ctx, uint8_t *hashVal); +int Skein1024_Final(Skein1024_Ctxt_t *ctx, uint8_t *hashVal); + +/* + * Skein APIs for "extended" initialization: MAC keys, tree hashing. + * After an InitExt() call, just use Update/Final calls as with Init(). + * + * Notes: Same parameters as _Init() calls, plus treeInfo/key/keyBytes. + * When keyBytes == 0 and treeInfo == SKEIN_SEQUENTIAL, + * the results of InitExt() are identical to calling Init(). 
+ * The function Init() may be called once to "precompute" the IV for + * a given hashBitLen value, then by saving a copy of the context + * the IV computation may be avoided in later calls. + * Similarly, the function InitExt() may be called once per MAC key + * to precompute the MAC IV, then a copy of the context saved and + * reused for each new MAC computation. + */ +int Skein_256_InitExt(Skein_256_Ctxt_t *ctx, size_t hashBitLen, + uint64_t treeInfo, const uint8_t *key, size_t keyBytes); +int Skein_512_InitExt(Skein_512_Ctxt_t *ctx, size_t hashBitLen, + uint64_t treeInfo, const uint8_t *key, size_t keyBytes); +int Skein1024_InitExt(Skein1024_Ctxt_t *ctx, size_t hashBitLen, + uint64_t treeInfo, const uint8_t *key, size_t keyBytes); + +/* + * Skein APIs for MAC and tree hash: + * Final_Pad: pad, do final block, but no OUTPUT type + * Output: do just the output stage + */ +int Skein_256_Final_Pad(Skein_256_Ctxt_t *ctx, uint8_t *hashVal); +int Skein_512_Final_Pad(Skein_512_Ctxt_t *ctx, uint8_t *hashVal); +int Skein1024_Final_Pad(Skein1024_Ctxt_t *ctx, uint8_t *hashVal); + +#ifndef SKEIN_TREE_HASH +#define SKEIN_TREE_HASH (1) +#endif +#if SKEIN_TREE_HASH +int Skein_256_Output(Skein_256_Ctxt_t *ctx, uint8_t *hashVal); +int Skein_512_Output(Skein_512_Ctxt_t *ctx, uint8_t *hashVal); +int Skein1024_Output(Skein1024_Ctxt_t *ctx, uint8_t *hashVal); +#endif + +/* + * When you initialize a Skein KCF hashing method you can pass this param + * structure in cm_param to fine-tune the algorithm's defaults. 
+ */ +typedef struct skein_param { + size_t sp_digest_bitlen; /* length of digest in bits */ +} skein_param_t; + +/* Module definitions */ +#ifdef SKEIN_MODULE_IMPL +#define CKM_SKEIN_256 "CKM_SKEIN_256" +#define CKM_SKEIN_512 "CKM_SKEIN_512" +#define CKM_SKEIN1024 "CKM_SKEIN1024" +#define CKM_SKEIN_256_MAC "CKM_SKEIN_256_MAC" +#define CKM_SKEIN_512_MAC "CKM_SKEIN_512_MAC" +#define CKM_SKEIN1024_MAC "CKM_SKEIN1024_MAC" + +typedef enum skein_mech_type { + SKEIN_256_MECH_INFO_TYPE, + SKEIN_512_MECH_INFO_TYPE, + SKEIN1024_MECH_INFO_TYPE, + SKEIN_256_MAC_MECH_INFO_TYPE, + SKEIN_512_MAC_MECH_INFO_TYPE, + SKEIN1024_MAC_MECH_INFO_TYPE +} skein_mech_type_t; + +#define VALID_SKEIN_DIGEST_MECH(__mech) \ + ((int)(__mech) >= SKEIN_256_MECH_INFO_TYPE && \ + (__mech) <= SKEIN1024_MECH_INFO_TYPE) +#define VALID_SKEIN_MAC_MECH(__mech) \ + ((int)(__mech) >= SKEIN_256_MAC_MECH_INFO_TYPE && \ + (__mech) <= SKEIN1024_MAC_MECH_INFO_TYPE) +#endif /* SKEIN_MODULE_IMPL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SKEIN_H_ */ diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_bootenv.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_bootenv.h index cb06324c13..36e08b37f1 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_bootenv.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_bootenv.h @@ -14,7 +14,7 @@ */ #ifndef _ZFS_BOOTENV_H -#define _ZFS_BOOTENV_H +#define _ZFS_BOOTENV_H /* * Define macros for label bootenv nvlist pair keys. 
@@ -24,27 +24,31 @@ extern "C" { #endif -#define BOOTENV_VERSION "version" +#define BOOTENV_VERSION "version" -#define BE_ILLUMOS_VENDOR "illumos" -#define BE_NETBSD_VENDOR "netbsd" -#define BE_GRUB_VENDOR "grub" -#define BE_LINUX_VENDOR "linux" + +#define BE_ILLUMOS_VENDOR "illumos" +#ifdef __NetBSD__ +#define BE_NETBSD_VENDOR "netbsd" +#ifdef __FreeBSD__ +#define BE_FREEBSD_VENDOR "freebsd" +#define BE_GRUB_VENDOR "grub" +#define BE_LINUX_VENDOR "linux" #include "zfs_bootenv_os.h" -#define GRUB_ENVMAP BE_GRUB_VENDOR ":" "envmap" +#define GRUB_ENVMAP BE_GRUB_VENDOR ":" "envmap" -#define NETBSD_BOOTONCE BE_NETBSD_VENDOR ":" "bootonce" -#define NETBSD_BOOTONCE_USED BE_NETBSD_VENDOR ":" "bootonce-used" -#define NETBSD_NVSTORE BE_NETBSD_VENDOR ":" "nvstore" -#define ILLUMOS_BOOTONCE BE_ILLUMOS_VENDOR ":" "bootonce" -#define ILLUMOS_BOOTONCE_USED BE_ILLUMOS_VENDOR ":" "bootonce-used" -#define ILLUMOS_NVSTORE BE_ILLUMOS_VENDOR ":" "nvstore" +#define NETBSD_BOOTONCE BE_NETBSD_VENDOR ":" "bootonce" +#define NETBSD_BOOTONCE_USED BE_NETBSD_VENDOR ":" "bootonce-used" +#define NETBSD_NVSTORE BE_NETBSD_VENDOR ":" "nvstore" +#define ILLUMOS_BOOTONCE BE_ILLUMOS_VENDOR ":" "bootonce" +#define ILLUMOS_BOOTONCE_USED BE_ILLUMOS_VENDOR ":" "bootonce-used" +#define ILLUMOS_NVSTORE BE_ILLUMOS_VENDOR ":" "nvstore" -#define OS_BOOTONCE BOOTENV_OS ":" "bootonce" -#define OS_BOOTONCE_USED BOOTENV_OS ":" "bootonce-used" -#define OS_NVSTORE BOOTENV_OS ":" "nvstore" +#define OS_BOOTONCE BOOTENV_OS ":" "bootonce" +#define OS_BOOTONCE_USED BOOTENV_OS ":" "bootonce-used" +#define OS_NVSTORE BOOTENV_OS ":" "nvstore" #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_bootenv_os.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_bootenv_os.h index e9a88bdee6..3248e931f2 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_bootenv_os.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_bootenv_os.h @@ -14,13 +14,17 @@ */ #ifndef 
_ZFS_BOOTENV_OS_H -#define _ZFS_BOOTENV_OS_H +#define _ZFS_BOOTENV_OS_H #ifdef __cplusplus extern "C" { #endif -#define BOOTENV_OS BE_NETBSD_VENDOR +#ifdef __FreeBSD__ +#define BOOTENV_OS BE_FREEBSD_VENDOR + +#ifdef __NetBSD__ +#define BOOTENV_OS BE_NETBSD_VENDOR #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zfssubr.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zfssubr.c index a262e40f79..a306cce5bb 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zfssubr.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zfssubr.c @@ -22,13 +22,20 @@ * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ - -__RCSID("$NETBSD: zfssubr.c, v 1.1 2021/07/13 19:34:06 sugam Exp $"); +#include +#ifdef __FBSDID +__FBSDID("$FreeBSD$"); +#else +__RCSID("$NetBSD: zfssubr.c,v 1.1 2021/07/13 15:21:40 sugam Exp $"); +#endif #include -#include - -#include "blkptr.c" +#include +#include +#include +#include +#include +#include #include "zfs_fletcher.c" #include "sha256.c" @@ -36,19 +43,20 @@ __RCSID("$NETBSD: zfssubr.c, v 1.1 2021/07/13 19:34:06 sugam Exp $"); #include "lzjb.c" #include "zle.c" -static uint64_t zfs_crc64_table[256]; +uint64_t zfs_crc64_table[256]; #ifndef __DECONST #define __DECONST(type, var) ((type)(uintptr_t)(const void *)(var)) #endif -#define ASSERT3S(x, y, z) ((void)0) -#define ASSERT3U(x, y, z) ((void)0) -#define ASSERT3P(x, y, z) ((void)0) -#define ASSERT0(x) ((void)0) -#define ASSERT(x) ((void)0) +#define ASSERT3S(x, y, z) ((void)0) +#define ASSERT3U(x, y, z) ((void)0) +#define ASSERT3P(x, y, z) ((void)0) +#define ASSERT0(x) ((void)0) +#define ASSERT(x) ((void)0) +#define ZAP_HASHBITS 28 -#define panic(...) do { \ printf(__VA_ARGS__); \ for (;;) ; \ } while (0) +#define panic(...) 
do { \ printf(__VA_ARGS__); \ for (;;) ; \ } while (0) @@ -111,45 +119,45 @@ typedef struct zio_checksum_info { const char *ci_name; /* descriptive name */ } zio_checksum_info_t; - - -static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { +zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { {{NULL, NULL}, NULL, NULL, 0, "inherit"}, {{NULL, NULL}, NULL, NULL, 0, "on"}, - {{zio_checksum_off, zio_checksum_off}, - NULL, NULL, 0, "off"}, - {{zio_checksum_SHA256, zio_checksum_SHA256}, - NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, - "label"}, - {{zio_checksum_SHA256, zio_checksum_SHA256}, - NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, - "gang_header"}, - {{fletcher_2_native, fletcher_2_byteswap}, - NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"}, - {{fletcher_2_native, fletcher_2_byteswap}, - NULL, NULL, 0, "fletcher2"}, + {{zio_checksum_off, zio_checksum_off}, + NULL, NULL, 0, "off"}, + {{zio_checksum_SHA256, zio_checksum_SHA256}, + NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, + "label"}, + {{zio_checksum_SHA256, zio_checksum_SHA256}, + NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, + "gang_header"}, + {{fletcher_2_native, fletcher_2_byteswap}, + NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"}, + {{fletcher_2_native, fletcher_2_byteswap}, + NULL, NULL, 0, "fletcher2"}, {{fletcher_4_native, fletcher_4_byteswap}, - NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"}, + NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"}, {{zio_checksum_SHA256, zio_checksum_SHA256}, - NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | - ZCHECKSUM_FLAG_NOPWRITE, "SHA256"}, + NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | + ZCHECKSUM_FLAG_NOPWRITE, "sha256"}, {{fletcher_4_native, fletcher_4_byteswap}, - NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zillog2"}, + NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"}, {{zio_checksum_off, zio_checksum_off}, - NULL, NULL, 0, "noparity"}, + 
NULL, NULL, 0, "noparity"}, #ifndef __NetBSD__ {{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | - ZCHECKSUM_FLAG_NOPWRITE, "SHA512"}, - {{zio_checksum_skein_native, zio_checksum_skein_byteswap}, + ZCHECKSUM_FLAG_NOPWRITE, "sha512"}, + {{zio_checksum_skein_native, zio_checksum_skein_byteswap}, zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"}, #endif - /* no edonr for now */ - {{NULL, NULL}, - NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED | - ZCHECKSUM_FLAG_NOPWRITE, "edonr"} +#ifdef illumos + {{zio_checksum_edonr_native, zio_checksum_edonr_byteswap}, + zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free, + ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED | + ZCHECKSUM_FLAG_NOPWRITE, "edonr"}, +#endif }; /* @@ -192,7 +200,7 @@ static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { {NULL, lz4_decompress, 0, "lz4"}, }; -static void +void byteswap_uint64_array(void *vbuf, size_t size) { uint64_t *buf = vbuf; @@ -210,7 +218,7 @@ byteswap_uint64_array(void *vbuf, size_t size) * a tuple which is guaranteed to be unique for the life of the pool. 
*/ static void -zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp) +zio_checksum_gang_verifier(zio_cksum_t *zcp, blkptr_t *bp) { dva_t *dva = BP_IDENTITY(bp); uint64_t txg = BP_PHYSICAL_BIRTH(bp); @@ -272,9 +280,8 @@ zio_checksum_templates_free(spa_t *spa) } static int -zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data) +zio_checksum_verify(const spa_t *spa, blkptr_t *bp, void *data) { - uint64_t size; unsigned int checksum; zio_checksum_info_t *ci; void *ctx = NULL; @@ -387,6 +394,7 @@ typedef struct raidz_col { uint64_t rc_offset; /* device offset */ uint64_t rc_size; /* I/O size */ void *rc_data; /* I/O data */ + void *rc_gdata; /* used to store the "good" version */ int rc_error; /* I/O error for this device */ uint8_t rc_tried; /* Did we attempt this I/O column? */ uint8_t rc_skipped; /* Did we skip this I/O column? */ @@ -402,18 +410,19 @@ typedef struct raidz_map { uint64_t rm_firstdatacol; /* First data column/parity count */ uint64_t rm_nskip; /* Skipped sectors for padding */ uint64_t rm_skipstart; /* Column index of padding start */ + void *rm_datacopy; /* rm_asize-buffer of copied data */ uintptr_t rm_reports; /* # of referencing checksum reports */ uint8_t rm_freed; /* map no longer has referencing ZIO */ uint8_t rm_ecksuminjected; /* checksum error was injected */ raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ } raidz_map_t; -#define VDEV_RAIDZ_P 0 -#define VDEV_RAIDZ_Q 1 -#define VDEV_RAIDZ_R 2 +#define VDEV_RAIDZ_P 0 +#define VDEV_RAIDZ_Q 1 +#define VDEV_RAIDZ_R 2 -#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) -#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) +#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 
0x1d : 0)) +#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) /* * We provide a mechanism to perform the field multiplication operation on a @@ -421,7 +430,7 @@ typedef struct raidz_map { * creating a mask from the top bit in each byte and using that to * conditionally apply the XOR of 0x1d. */ -#define VDEV_RAIDZ_64MUL_2(x, mask) \ +#define VDEV_RAIDZ_64MUL_2(x, mask) \ { \ (mask) = (x) & 0x8080808080808080ULL; \ (mask) = ((mask) << 1) - ((mask) >> 7); \ @@ -429,13 +438,13 @@ typedef struct raidz_map { ((mask) & 0x1d1d1d1d1d1d1d1dULL); \ } -#define VDEV_RAIDZ_64MUL_4(x, mask) \ +#define VDEV_RAIDZ_64MUL_4(x, mask) \ { \ VDEV_RAIDZ_64MUL_2((x), mask); \ VDEV_RAIDZ_64MUL_2((x), mask); \ } -#define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE) +#define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE) /* * These two tables represent powers and logs of 2 in the Galois field defined @@ -986,7 +995,7 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, log = 0; /* gcc */ psize = sizeof (invlog[0][0]) * n * nmissing; - p = malloc(psize); + p = kmem_alloc(psize, KM_SLEEP); if (p == NULL) { printf("Out of memory\n"); return; @@ -1046,7 +1055,7 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, } } - free(p); + kmem_free(p, psize); } static int @@ -1107,7 +1116,7 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * nmissing_rows * n + sizeof (used[0]) * n; - p = malloc(psize); + p = kmem_alloc(psize, KM_SLEEP); if (p == NULL) { printf("Out of memory\n"); return (code); @@ -1154,7 +1163,7 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, invrows, used); - free(p); + kmem_free(p, psize); return (code); } @@ -1355,23 +1364,14 @@ vdev_raidz_map_free(raidz_map_t *rm) { int c; - for (c = rm->rm_firstdatacol - 1; c >= 0; c--) - free(rm->rm_col[c].rc_data); - - 
free(rm); -} - -static vdev_t * -vdev_child(vdev_t *pvd, uint64_t devidx) -{ - vdev_t *cvd; + size_t size = 0; + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) + size += rm->rm_col[c].rc_size; - STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) { - if (cvd->v_id == devidx) - break; - } + if (rm->rm_datacopy != NULL) + zio_buf_free(rm->rm_datacopy, size); - return (cvd); + kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); } /* @@ -1379,7 +1379,7 @@ vdev_child(vdev_t *pvd, uint64_t devidx) * any ereports we generate can note it. */ static int -raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data, +raidz_checksum_verify(const spa_t *spa, blkptr_t *bp, void *data, uint64_t size) { return (zio_checksum_verify(spa, bp, data)); @@ -1402,7 +1402,7 @@ raidz_parity_verify(raidz_map_t *rm) rc = &rm->rm_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; - orig[c] = malloc(rc->rc_size); + orig[c] = zio_buf_alloc(rc->rc_size); if (orig[c] != NULL) { bcopy(rc->rc_data, orig[c], rc->rc_size); } else { @@ -1421,7 +1421,7 @@ raidz_parity_verify(raidz_map_t *rm) rc->rc_error = ECKSUM; ret++; } - free(orig[c]); + zio_buf_free(orig[c], rc->rc_size); } return (ret); @@ -1436,7 +1436,7 @@ raidz_parity_verify(raidz_map_t *rm) * cases we'd only use parity information in column 0. 
*/ static int -vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp, +vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, blkptr_t *bp, void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors) { raidz_col_t *rc; @@ -1489,7 +1489,7 @@ vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp, ASSERT(orig[i] != NULL); } - orig[n - 1] = malloc(rm->rm_col[0].rc_size); + orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size); if (orig[n - 1] == NULL) { ret = ENOMEM; goto done; @@ -1576,273 +1576,8 @@ vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp, n--; done: for (i = n - 1; i >= 0; i--) { - free(orig[i]); + zio_buf_free(orig[i], rm->rm_col[0].rc_size); } return (ret); -} - -static int -vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data, - off_t offset, size_t bytes) -{ - vdev_t *tvd = vd->v_top; - vdev_t *cvd; - raidz_map_t *rm; - raidz_col_t *rc; - int c, error; - int unexpected_errors; - int parity_errors; - int parity_untried; - int data_errors; - int total_errors; - int n; - int tgts[VDEV_RAIDZ_MAXPARITY]; - int code; - - rc = NULL; /* gcc */ - error = 0; - - rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift, - vd->v_nchildren, vd->v_nparity); - if (rm == NULL) - return (ENOMEM); - - /* - * Iterate over the columns in reverse order so that we hit the parity - * last -- any errors along the way will force us to read the parity. - */ - for (c = rm->rm_cols - 1; c >= 0; c--) { - rc = &rm->rm_col[c]; - cvd = vdev_child(vd, rc->rc_devidx); - if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) { - if (c >= rm->rm_firstdatacol) - rm->rm_missingdata++; - else - rm->rm_missingparity++; - rc->rc_error = ENXIO; - rc->rc_tried = 1; /* don't even try */ - rc->rc_skipped = 1; - continue; - } -#if 0 /* XXX: Too hard for the boot code. 
*/ - if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { - if (c >= rm->rm_firstdatacol) - rm->rm_missingdata++; - else - rm->rm_missingparity++; - rc->rc_error = ESTALE; - rc->rc_skipped = 1; - continue; - } -#endif - if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) { - rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data, - rc->rc_offset, rc->rc_size); - rc->rc_tried = 1; - rc->rc_skipped = 0; - } - } - -reconstruct: - unexpected_errors = 0; - parity_errors = 0; - parity_untried = 0; - data_errors = 0; - total_errors = 0; - - ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); - ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); - - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - - if (rc->rc_error) { - ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ - - if (c < rm->rm_firstdatacol) - parity_errors++; - else - data_errors++; - - if (!rc->rc_skipped) - unexpected_errors++; - - total_errors++; - } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { - parity_untried++; - } - } - - /* - * There are three potential phases for a read: - * 1. produce valid data from the columns read - * 2. read all disks and try again - * 3. perform combinatorial reconstruction - * - * Each phase is progressively both more expensive and less likely to - * occur. If we encounter more errors than we can repair or all phases - * fail, we have no choice but to return an error. - */ - - /* - * If the number of errors we saw was correctable -- less than or equal - * to the number of parity disks read -- attempt to produce data that - * has a valid checksum. Naturally, this case applies in the absence of - * any errors. - */ - if (total_errors <= rm->rm_firstdatacol - parity_untried) { - int rv; - - if (data_errors == 0) { - rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes); - if (rv == 0) { - /* - * If we read parity information (unnecessarily - * as it happens since no reconstruction was - * needed) regenerate and verify the parity. 
- * We also regenerate parity when resilvering - * so we can write it out to the failed device - * later. - */ - if (parity_errors + parity_untried < - rm->rm_firstdatacol) { - n = raidz_parity_verify(rm); - unexpected_errors += n; - ASSERT(parity_errors + n <= - rm->rm_firstdatacol); - } - goto done; - } - } else { - /* - * We either attempt to read all the parity columns or - * none of them. If we didn't try to read parity, we - * wouldn't be here in the correctable case. There must - * also have been fewer parity errors than parity - * columns or, again, we wouldn't be in this code path. - */ - ASSERT(parity_untried == 0); - ASSERT(parity_errors < rm->rm_firstdatacol); - - /* - * Identify the data columns that reported an error. - */ - n = 0; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - if (rc->rc_error != 0) { - ASSERT(n < VDEV_RAIDZ_MAXPARITY); - tgts[n++] = c; - } - } - - ASSERT(rm->rm_firstdatacol >= n); - - code = vdev_raidz_reconstruct(rm, tgts, n); - - rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes); - if (rv == 0) { - /* - * If we read more parity disks than were used - * for reconstruction, confirm that the other - * parity disks produced correct data. This - * routine is suboptimal in that it regenerates - * the parity that we already used in addition - * to the parity that we're attempting to - * verify, but this should be a relatively - * uncommon case, and can be optimized if it - * becomes a problem. Note that we regenerate - * parity when resilvering so we can write it - * out to failed devices later. - */ - if (parity_errors < rm->rm_firstdatacol - n) { - n = raidz_parity_verify(rm); - unexpected_errors += n; - ASSERT(parity_errors + n <= - rm->rm_firstdatacol); - } - - goto done; - } - } - } - - /* - * This isn't a typical situation -- either we got a read - * error or a child silently returned bad data. Read every - * block so we can try again with as much data and parity as - * we can track down. 
If we've already been through once - * before, all children will be marked as tried so we'll - * proceed to combinatorial reconstruction. - */ - unexpected_errors = 1; - rm->rm_missingdata = 0; - rm->rm_missingparity = 0; - - n = 0; - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - - if (rc->rc_tried) - continue; - - cvd = vdev_child(vd, rc->rc_devidx); - ASSERT(cvd != NULL); - rc->rc_error = cvd->v_read(cvd, NULL, - rc->rc_data, rc->rc_offset, rc->rc_size); - if (rc->rc_error == 0) - n++; - rc->rc_tried = 1; - rc->rc_skipped = 0; - } - /* - * If we managed to read anything more, retry the - * reconstruction. - */ - if (n > 0) - goto reconstruct; - - /* - * At this point we've attempted to reconstruct the data given the - * errors we detected, and we've attempted to read all columns. There - * must, therefore, be one or more additional problems -- silent errors - * resulting in invalid data rather than explicit I/O errors resulting - * in absent data. We check if there is enough additional data to - * possibly reconstruct the data and then perform combinatorial - * reconstruction over all possible combinations. If that fails, - * we're cooked. - */ - if (total_errors > rm->rm_firstdatacol) { - error = EIO; - } else if (total_errors < rm->rm_firstdatacol && - (code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes, - total_errors, data_errors)) != 0) { - /* - * If we didn't use all the available parity for the - * combinatorial reconstruction, verify that the remaining - * parity is correct. - */ - if (code != (1 << rm->rm_firstdatacol) - 1) - (void) raidz_parity_verify(rm); - } else { - /* - * We're here because either: - * - * total_errors == rm_first_datacol, or - * vdev_raidz_combrec() failed - * - * In either case, there is enough bad data to prevent - * reconstruction. - * - * Start checksum ereports for all children which haven't - * failed, and the IO wasn't speculative. 
- */ - error = ECKSUM; - } - -done: - vdev_raidz_map_free(rm); - - return (error); -} +} \ No newline at end of file diff --git a/sys/modules/zfs/Makefile.zfsmod b/sys/modules/zfs/Makefile.zfsmod index 6ecaacf3c2..01eeeb5fb2 100644 --- a/sys/modules/zfs/Makefile.zfsmod +++ b/sys/modules/zfs/Makefile.zfsmod @@ -59,7 +59,7 @@ SRCS+= refcount.c SRCS+= rrwlock.c SRCS+= sa.c SRCS+= sha256.c -#SRCS+= skein_zfs.c +SRCS+= skein_zfs.c SRCS+= spa.c SRCS+= spa_config.c SRCS+= spa_errlog.c @@ -102,6 +102,7 @@ SRCS+= zfs_sa.c SRCS+= zfs_vfsops.c SRCS+= zfs_vnops.c SRCS+= zfs_znode.c +SRCS+= zfssubr.c SRCS+= zil.c SRCS+= zio.c SRCS+= zio_checksum.c From 6ca8d6c33ad15832957f9d4e840f335dc869d487 Mon Sep 17 00:00:00 2001 From: sugam45 Date: Fri, 16 Jul 2021 03:31:43 +0530 Subject: [PATCH 3/3] Tabspace Indentation --- .../cddl/osnet/dist/uts/common/fs/zfs/sys/skein.h | 12 ++++++------ .../osnet/dist/uts/common/fs/zfs/sys/zfs_bootenv.h | 12 +++++++----- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/skein.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/skein.h index e5f2fb5ae3..9a4514ae93 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/skein.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/skein.h @@ -152,12 +152,12 @@ typedef struct skein_param { /* Module definitions */ #ifdef SKEIN_MODULE_IMPL -#define CKM_SKEIN_256 "CKM_SKEIN_256" -#define CKM_SKEIN_512 "CKM_SKEIN_512" -#define CKM_SKEIN1024 "CKM_SKEIN1024" -#define CKM_SKEIN_256_MAC "CKM_SKEIN_256_MAC" -#define CKM_SKEIN_512_MAC "CKM_SKEIN_512_MAC" -#define CKM_SKEIN1024_MAC "CKM_SKEIN1024_MAC" +#define CKM_SKEIN_256 "CKM_SKEIN_256" +#define CKM_SKEIN_512 "CKM_SKEIN_512" +#define CKM_SKEIN1024 "CKM_SKEIN1024" +#define CKM_SKEIN_256_MAC "CKM_SKEIN_256_MAC" +#define CKM_SKEIN_512_MAC "CKM_SKEIN_512_MAC" +#define CKM_SKEIN1024_MAC "CKM_SKEIN1024_MAC" typedef enum skein_mech_type { SKEIN_256_MECH_INFO_TYPE, diff --git 
a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_bootenv.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_bootenv.h index 36e08b37f1..4bea816c71 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_bootenv.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_bootenv.h @@ -24,16 +24,18 @@ extern "C" { #endif -#define BOOTENV_VERSION "version" - +#define BOOTENV_VERSION "version" #define BE_ILLUMOS_VENDOR "illumos" + #ifdef __NetBSD__ -#define BE_NETBSD_VENDOR "netbsd" +#define BE_NETBSD_VENDOR "netbsd" + #ifdef __FreeBSD__ #define BE_FREEBSD_VENDOR "freebsd" -#define BE_GRUB_VENDOR "grub" -#define BE_LINUX_VENDOR "linux" + +#define BE_GRUB_VENDOR "grub" +#define BE_LINUX_VENDOR "linux" #include "zfs_bootenv_os.h"