/*
 * Synchronous sector read with an optional global read hook.
 *
 * When __hook_bdrv_read is installed, the hook is repeatedly offered the
 * remaining request; read_count is the number of sectors the hook handled
 * (inferred from the buf/sector_num arithmetic below — TODO confirm hook
 * contract).  Whenever the hook stops short, one sector is read directly
 * with raw_pread() and the remainder is offered to the hook again.
 *
 * NOTE(review): a negative read_count from the hook would corrupt the
 * buf/sector accounting — assumed non-negative; confirm with the hook's
 * definition.
 *
 * Returns 0 on full success; otherwise the short or negative byte count
 * from raw_pread().
 */
static int raw_read(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                    int nb_sectors)
{
    int ret;

    if (__hook_bdrv_read) {
        int read_count = 0;

        while (nb_sectors) {
            /* let the hook consume as much of the request as it wants */
            read_count = __hook_bdrv_read(bs, sector_num, buf, nb_sectors);
            buf += 512 * read_count;
            sector_num += read_count;
            nb_sectors -= read_count;

            if (nb_sectors > 0) {
                /* hook stopped short: read exactly one sector directly,
                 * then loop and offer the rest to the hook again */
                ret = raw_pread(bs, (sector_num) * 512, buf, 1 * 512);
                if (ret != 512) {
                    /* short read or error from raw_pread */
                    return ret;
                }
                buf += 512;
                ++sector_num;
                --nb_sectors;
            }
        }
        return 0;
    }

    /* no hook installed: plain synchronous read of the whole range */
    ret = raw_pread(bs, sector_num * 512, buf, nb_sectors * 512);
    if (ret == (nb_sectors * 512))
        ret = 0;  /* callers expect 0 on a complete read */
    return ret;
}
/*
 * Asynchronous sector read.
 *
 * If the file was opened with O_DIRECT (signalled by s->aligned_buf being
 * non-NULL) and the caller's buffer is not 512-byte aligned, the read is
 * performed synchronously with raw_pread() and the completion is delivered
 * through a bottom half, so the caller still observes async semantics.
 *
 * Returns the AIO control block, or NULL if the request could not be set
 * up or submitted.
 */
static BlockDriverAIOCB *raw_aio_read(BlockDriverState *bs, int64_t sector_num,
                                      uint8_t *buf, int nb_sectors,
                                      BlockDriverCompletionFunc *cb,
                                      void *opaque)
{
    RawAIOCB *acb;

    /*
     * If O_DIRECT is used and the buffer is not aligned fall back
     * to synchronous IO.
     */
    BDRVRawState *s = bs->opaque;

    if (unlikely(s->aligned_buf != NULL && ((uintptr_t) buf % 512))) {
        QEMUBH *bh;
        acb = qemu_aio_get(bs, cb, opaque);
        /* do the read now; report the result from a bottom half so the
         * completion callback still runs asynchronously */
        acb->ret = raw_pread(bs, 512 * sector_num, buf, 512 * nb_sectors);
        bh = qemu_bh_new(raw_aio_em_cb, acb);
        qemu_bh_schedule(bh);
        return &acb->common;
    }

    acb = raw_aio_setup(bs, sector_num, buf, nb_sectors, cb, opaque);
    if (!acb)
        return NULL;
    if (qemu_paio_read(&acb->aiocb) < 0) {
        /* submission failed: release the control block */
        raw_aio_remove(acb);
        return NULL;
    }
    return &acb->common;
}
/*
 * Synchronous read of nb_sectors 512-byte sectors starting at sector_num
 * into buf.  Returns 0 when the full byte count was read, otherwise the
 * short or negative result from raw_pread().
 */
static int raw_read(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                    int nb_sectors)
{
    int expected = nb_sectors * 512;
    int ret = raw_pread(bs, sector_num * 512, buf, expected);

    return (ret == expected) ? 0 : ret;
}
/*
 * Synchronous read of nb_sectors sectors of BDRV_SECTOR_SIZE bytes each,
 * starting at sector_num, into buf.  Returns 0 on a complete read,
 * otherwise the short or negative result from raw_pread().
 */
static int raw_read(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                    int nb_sectors)
{
    int ret = raw_pread(bs, sector_num * BDRV_SECTOR_SIZE, buf,
                        nb_sectors * BDRV_SECTOR_SIZE);

    if (ret == (nb_sectors * BDRV_SECTOR_SIZE)) {
        return 0;
    }

    return ret;
}
/* reads up to `count' bytes starting from `offset' using error correction and integrity validation, if available */ ssize_t fec_pread(struct fec_handle *f, void *buf, size_t count, uint64_t offset) { check(f); check(buf); if (unlikely(offset > UINT64_MAX - count)) { errno = EOVERFLOW; return -1; } if (f->verity.hash) { return process(f, (uint8_t *)buf, get_max_count(offset, count, f->data_size), offset, verity_read); } else if (f->ecc.start) { check(f->ecc.start < f->size); count = get_max_count(offset, count, f->data_size); ssize_t rc = process(f, (uint8_t *)buf, count, offset, ecc_read); if (rc >= 0) { return rc; } /* return raw data if pure ecc read fails; due to interleaving the specific blocks the caller wants may still be fine */ } else { count = get_max_count(offset, count, f->size); } if (raw_pread(f, buf, count, offset)) { return count; } return -1; }
/* attempts to read verity metadata from `f->fd' position `offset'; if in
   r/w mode, rewrites the metadata if it had errors */
int verity_parse_header(fec_handle *f, uint64_t offset)
{
    check(f);
    check(f->data_size > VERITY_METADATA_SIZE);

    /* the whole metadata region must fit inside the data area */
    if (offset > f->data_size - VERITY_METADATA_SIZE) {
        debug("failed to read verity header: offset %" PRIu64 " is too far",
            offset);
        return -1;
    }

    verity_info *v = &f->verity;

    /* snapshot the error counter so we can tell later whether parsing
       corrected anything */
    uint64_t errors = f->errors;

    /* first read the header without error correction */
    if (!raw_pread(f, &v->header, sizeof(v->header), offset)) {
        error("failed to read verity header: %s", strerror(errno));
        return -1;
    }

    /* use raw data to check for the alternative magic, because it will be
       error corrected to VERITY_MAGIC otherwise */
    if (v->header.magic == VERITY_MAGIC_DISABLE) {
        /* this value is not used by us, but can be used by a caller to
           decide whether dm-verity should be enabled */
        v->disabled = true;
    }

    /* read the same header again, this time through error correction */
    if (fec_pread(f, &v->ecc_header, sizeof(v->ecc_header), offset) !=
            sizeof(v->ecc_header)) {
        warn("failed to read verity header: %s", strerror(errno));
        return -1;
    }

    if (validate_header(f, &v->header, offset)) {
        /* raw verity header is invalid; this could be due to corruption,
           or due to missing verity metadata */

        if (validate_header(f, &v->ecc_header, offset)) {
            return -1; /* either way, we cannot recover */
        }

        /* report mismatching fields, taking the corrected values from the
           ecc copy of the header */
        if (!v->disabled && v->header.magic != v->ecc_header.magic) {
            warn("corrected verity header magic");
            v->header.magic = v->ecc_header.magic;
        }

        if (v->header.version != v->ecc_header.version) {
            warn("corrected verity header version");
            v->header.version = v->ecc_header.version;
        }

        if (v->header.length != v->ecc_header.length) {
            warn("corrected verity header length");
            v->header.length = v->ecc_header.length;
        }

        if (memcmp(v->header.signature, v->ecc_header.signature,
                sizeof(v->header.signature))) {
            warn("corrected verity header signature");
            /* we have no way of knowing which signature is correct, if
               either of them is */
        }
    }

    v->metadata_start = offset;

    /* parse the table without ecc first; retry with ecc if that fails */
    if (parse_table(f, offset + sizeof(v->header), v->header.length,
            false) == -1 &&
        parse_table(f, offset + sizeof(v->header), v->header.length,
            true) == -1) {
        return -1;
    }

    /* if we corrected something while parsing metadata and we are in r/w
       mode, rewrite the corrected metadata */
    if (f->mode & O_RDWR && f->errors > errors &&
            rewrite_metadata(f, offset) < 0) {
        warn("failed to rewrite verity metadata: %s", strerror(errno));
    }

    /* data ends where the metadata or the hash tree begins, whichever
       comes first */
    if (v->metadata_start < v->hash_start) {
        f->data_size = v->metadata_start;
    } else {
        f->data_size = v->hash_start;
    }

    return 0;
}
/* reads, corrects and parses the verity table, validates parameters, and
   if `f->flags' does not have `FEC_VERITY_DISABLE' set, calls `verify_tree'
   to load and validate the hash tree */
static int parse_table(fec_handle *f, uint64_t offset, uint32_t size,
        bool useecc)
{
    check(f);
    check(size >= VERITY_MIN_TABLE_SIZE);
    check(size <= VERITY_MAX_TABLE_SIZE);

    debug("offset = %" PRIu64 ", size = %u", offset, size);

    verity_info *v = &f->verity;

    /* +1 for the NUL terminator stored after the raw table bytes */
    std::unique_ptr<char[]> table(new (std::nothrow) char[size + 1]);

    if (!table) {
        errno = ENOMEM;
        return -1;
    }

    /* read the table either raw or through error correction, as requested
       by the caller */
    if (!useecc) {
        if (!raw_pread(f, table.get(), size, offset)) {
            error("failed to read verity table: %s", strerror(errno));
            return -1;
        }
    } else if (fec_pread(f, table.get(), size, offset) != (ssize_t)size) {
        error("failed to ecc read verity table: %s", strerror(errno));
        return -1;
    }

    table[size] = '\0';
    debug("verity table: '%s'", table.get());

    int i = 0;                           /* positional argument index */
    std::unique_ptr<uint8_t[]> salt;
    uint8_t root[SHA256_DIGEST_LENGTH];  /* expected root hash */

    /* the table is a space-separated list of positional arguments */
    auto tokens = android::base::Split(table.get(), " ");
    for (const auto& token : tokens) {
        switch (i++) {
        case 0: /* version */
            if (token != stringify(VERITY_TABLE_VERSION)) {
                error("unsupported verity table version: %s", token.c_str());
                return -1;
            }
            break;
        case 3: /* data_block_size */
        case 4: /* hash_block_size */
            /* assume 4 KiB block sizes for everything */
            if (token != stringify(FEC_BLOCKSIZE)) {
                error("unsupported verity block size: %s", token.c_str());
                return -1;
            }
            break;
        case 5: /* num_data_blocks */
            if (parse_uint64(token.c_str(), f->data_size / FEC_BLOCKSIZE,
                    &v->data_blocks) == -1) {
                error("invalid number of verity data blocks: %s",
                    token.c_str());
                return -1;
            }
            break;
        case 6: /* hash_start_block */
            if (parse_uint64(token.c_str(), f->data_size / FEC_BLOCKSIZE,
                    &v->hash_start) == -1) {
                error("invalid verity hash start block: %s", token.c_str());
                return -1;
            }
            /* convert block number to a byte offset */
            v->hash_start *= FEC_BLOCKSIZE;
            break;
        case 7: /* algorithm */
            if (token != "sha256") {
                error("unsupported verity hash algorithm: %s",
                    token.c_str());
                return -1;
            }
            break;
        case 8: /* digest */
            if (parse_hex(root, sizeof(root), token.c_str()) == -1) {
                error("invalid verity root hash: %s", token.c_str());
                return -1;
            }
            break;
        case 9: /* salt */
            /* the salt is hex encoded: two characters per byte */
            v->salt_size = token.size();
            check(v->salt_size % 2 == 0);
            v->salt_size /= 2;

            salt.reset(new (std::nothrow) uint8_t[v->salt_size]);

            if (!salt) {
                errno = ENOMEM;
                return -1;
            }

            if (parse_hex(salt.get(), v->salt_size, token.c_str()) == -1) {
                error("invalid verity salt: %s", token.c_str());
                return -1;
            }
            break;
        default:
            break;
        }
    }

    if (i < VERITY_TABLE_ARGS) {
        error("not enough arguments in verity table: %d; expected at least "
            stringify(VERITY_TABLE_ARGS), i);
        return -1;
    }

    check(v->hash_start < f->data_size);

    /* the data blocks must end exactly where the metadata or hash tree
       starts, whichever comes first */
    if (v->metadata_start < v->hash_start) {
        check(v->data_blocks == v->metadata_start / FEC_BLOCKSIZE);
    } else {
        check(v->data_blocks == v->hash_start / FEC_BLOCKSIZE);
    }

    /* replace any previously parsed salt/table; ownership transfers from
       the local smart pointers to `v' */
    if (v->salt) {
        delete[] v->salt;
        v->salt = NULL;
    }
    v->salt = salt.release();

    if (v->table) {
        delete[] v->table;
        v->table = NULL;
    }
    v->table = table.release();

    if (!(f->flags & FEC_VERITY_DISABLE)) {
        if (verify_tree(f, root) == -1) {
            return -1;
        }

        check(v->hash);

        /* compute and store the hash of an all-zero block */
        uint8_t zero_block[FEC_BLOCKSIZE];
        memset(zero_block, 0, FEC_BLOCKSIZE);

        if (verity_hash(f, zero_block, v->zero_hash) == -1) {
            error("failed to hash");
            return -1;
        }
    }

    return 0;
}
/* reads the verity hash tree, validates it against the root hash in `root',
   corrects errors if necessary, and copies valid data blocks for later use
   to `f->verity.hash' */
static int verify_tree(fec_handle *f, const uint8_t *root)
{
    uint8_t data[FEC_BLOCKSIZE];
    uint8_t hash[SHA256_DIGEST_LENGTH];

    check(f);
    check(root);

    verity_info *v = &f->verity;
    uint32_t levels = 0;

    /* calculate the size and the number of levels in the hash tree */
    v->hash_size =
        verity_get_size(v->data_blocks * FEC_BLOCKSIZE, &levels, NULL);

    check(v->hash_start < UINT64_MAX - v->hash_size);
    check(v->hash_start + v->hash_size <= f->data_size);

    uint64_t hash_offset = v->hash_start;
    uint64_t data_offset = hash_offset + FEC_BLOCKSIZE;

    v->hash_data_offset = data_offset;

    /* validate the root hash (the first block of the tree) */
    if (!raw_pread(f, data, FEC_BLOCKSIZE, hash_offset) ||
            !verity_check_block(f, root, data)) {
        /* try to correct */
        if (!ecc_read_hashes(f, 0, NULL, hash_offset, data) ||
                !verity_check_block(f, root, data)) {
            error("root hash invalid");
            return -1;
        } else if (f->mode & O_RDWR &&
                !raw_pwrite(f, data, FEC_BLOCKSIZE, hash_offset)) {
            error("failed to rewrite the root block: %s", strerror(errno));
            return -1;
        }
    }

    debug("root hash valid");

    /* calculate the number of hashes on each level */
    uint32_t hashes[levels];

    verity_get_size(v->data_blocks * FEC_BLOCKSIZE, NULL, hashes);

    /* calculate the size and offset for the data hashes; after the loop
       hash_data_offset/hash_data_blocks describe the deepest level */
    for (uint32_t i = 1; i < levels; ++i) {
        uint32_t blocks = hashes[levels - i];
        debug("%u hash blocks on level %u", blocks, levels - i);

        v->hash_data_offset = data_offset;
        v->hash_data_blocks = blocks;

        data_offset += blocks * FEC_BLOCKSIZE;
    }

    /* sanity-check the computed layout before trusting it */
    check(v->hash_data_blocks);
    check(v->hash_data_blocks <= v->hash_size / FEC_BLOCKSIZE);

    check(v->hash_data_offset);
    check(v->hash_data_offset <=
        UINT64_MAX - (v->hash_data_blocks * FEC_BLOCKSIZE));
    check(v->hash_data_offset < f->data_size);
    check(v->hash_data_offset + v->hash_data_blocks * FEC_BLOCKSIZE <=
        f->data_size);

    /* copy data hashes to memory in case they are corrupted, so we don't
       have to correct them every time they are needed */
    std::unique_ptr<uint8_t[]> data_hashes(
        new (std::nothrow) uint8_t[f->verity.hash_data_blocks *
                                   FEC_BLOCKSIZE]);

    if (!data_hashes) {
        errno = ENOMEM;
        return -1;
    }

    /* validate the rest of the hash tree, level by level: each block on a
       level is checked against its hash on the level above */
    data_offset = hash_offset + FEC_BLOCKSIZE;

    for (uint32_t i = 1; i < levels; ++i) {
        uint32_t blocks = hashes[levels - i];

        for (uint32_t j = 0; j < blocks; ++j) {
            /* ecc reads are very I/O intensive, so read raw hash tree and
               do error correcting only if it doesn't validate */
            if (!raw_pread(f, hash, SHA256_DIGEST_LENGTH,
                    hash_offset + j * SHA256_DIGEST_LENGTH) ||
                !raw_pread(f, data, FEC_BLOCKSIZE,
                    data_offset + j * FEC_BLOCKSIZE)) {
                error("failed to read hashes: %s", strerror(errno));
                return -1;
            }

            if (!verity_check_block(f, hash, data)) {
                /* try to correct */
                if (!ecc_read_hashes(f,
                        hash_offset + j * SHA256_DIGEST_LENGTH, hash,
                        data_offset + j * FEC_BLOCKSIZE, data) ||
                    !verity_check_block(f, hash, data)) {
                    error("invalid hash tree: hash_offset %" PRIu64 ", "
                        "data_offset %" PRIu64 ", block %u",
                        hash_offset, data_offset, j);
                    return -1;
                }

                /* update the corrected blocks to the file if we are in
                   r/w mode */
                if (f->mode & O_RDWR) {
                    if (!raw_pwrite(f, hash, SHA256_DIGEST_LENGTH,
                            hash_offset + j * SHA256_DIGEST_LENGTH) ||
                        !raw_pwrite(f, data, FEC_BLOCKSIZE,
                            data_offset + j * FEC_BLOCKSIZE)) {
                        error("failed to write hashes: %s",
                            strerror(errno));
                        return -1;
                    }
                }
            }

            /* keep a validated in-memory copy of the deepest level (the
               hashes of the actual data blocks) */
            if (blocks == v->hash_data_blocks) {
                memcpy(data_hashes.get() + j * FEC_BLOCKSIZE, data,
                    FEC_BLOCKSIZE);
            }
        }

        /* descend: this level's data becomes the next level's hashes */
        hash_offset = data_offset;
        data_offset += blocks * FEC_BLOCKSIZE;
    }

    debug("valid");

    /* replace any previously cached hashes */
    if (v->hash) {
        delete[] v->hash;
        v->hash = NULL;
    }
    v->hash = data_hashes.release();

    return 0;
}
/* reads `count' bytes from `offset', corrects possible errors with erasure detection, and verifies the integrity of read data using verity hash tree; returns the number of corrections in `errors' */ static ssize_t verity_read(fec_handle *f, uint8_t *dest, size_t count, uint64_t offset, size_t *errors) { check(f); check(dest); check(offset < f->data_size); check(offset + count <= f->data_size); check(f->verity.hash); check(errors); debug("[%" PRIu64 ", %" PRIu64 ")", offset, offset + count); rs_unique_ptr rs(NULL, free_rs_char); std::unique_ptr<uint8_t[]> ecc_data; if (f->ecc.start && ecc_init(f, rs, ecc_data) == -1) { return -1; } uint64_t curr = offset / FEC_BLOCKSIZE; size_t coff = (size_t)(offset - curr * FEC_BLOCKSIZE); size_t left = count; uint8_t data[FEC_BLOCKSIZE]; uint64_t max_hash_block = (f->verity.hash_data_blocks * FEC_BLOCKSIZE - SHA256_DIGEST_LENGTH) / SHA256_DIGEST_LENGTH; while (left > 0) { check(curr <= max_hash_block); uint8_t *hash = &f->verity.hash[curr * SHA256_DIGEST_LENGTH]; uint64_t curr_offset = curr * FEC_BLOCKSIZE; bool expect_zeros = is_zero(f, curr_offset); /* if we are in read-only mode and expect to read a zero block, skip reading and just return zeros */ if (f->mode & O_RDONLY && expect_zeros) { memset(data, 0, FEC_BLOCKSIZE); goto valid; } /* copy raw data without error correction */ if (!raw_pread(f, data, FEC_BLOCKSIZE, curr_offset)) { error("failed to read: %s", strerror(errno)); return -1; } if (likely(verity_check_block(f, hash, data))) { goto valid; } /* we know the block is supposed to contain zeros, so return zeros instead of trying to correct it */ if (expect_zeros) { memset(data, 0, FEC_BLOCKSIZE); goto corrected; } if (!f->ecc.start) { /* fatal error without ecc */ error("[%" PRIu64 ", %" PRIu64 "): corrupted block %" PRIu64, offset, offset + count, curr); return -1; } else { debug("[%" PRIu64 ", %" PRIu64 "): corrupted block %" PRIu64, offset, offset + count, curr); } /* try to correct without erasures first, because 
checking for erasure locations is slower */ if (__ecc_read(f, rs.get(), data, curr_offset, false, ecc_data.get(), errors) == FEC_BLOCKSIZE && verity_check_block(f, hash, data)) { goto corrected; } /* try to correct with erasures */ if (__ecc_read(f, rs.get(), data, curr_offset, true, ecc_data.get(), errors) == FEC_BLOCKSIZE && verity_check_block(f, hash, data)) { goto corrected; } error("[%" PRIu64 ", %" PRIu64 "): corrupted block %" PRIu64 " (offset %" PRIu64 ") cannot be recovered", offset, offset + count, curr, curr_offset); dump("decoded block", curr, data, FEC_BLOCKSIZE); errno = EIO; return -1; corrected: /* update the corrected block to the file if we are in r/w mode */ if (f->mode & O_RDWR && !raw_pwrite(f, data, FEC_BLOCKSIZE, curr_offset)) { error("failed to write: %s", strerror(errno)); return -1; } valid: size_t copy = FEC_BLOCKSIZE - coff; if (copy > left) { copy = left; } memcpy(dest, &data[coff], copy); dest += copy; left -= copy; coff = 0; ++curr; } return count; }
/* reads and decodes a single block starting from `offset', returns the
   number of bytes corrected in `errors'; returns FEC_BLOCKSIZE on success
   or -1 (with errno set) on failure.

   NOTE(review): the name `__ecc_read' uses a reserved identifier prefix;
   renaming would touch callers, so it is only flagged here. */
static int __ecc_read(fec_handle *f, void *rs, uint8_t *dest, uint64_t offset,
        bool use_erasures, uint8_t *ecc_data, size_t *errors)
{
    check(offset % FEC_BLOCKSIZE == 0);
    ecc_info *e = &f->ecc;

    /* reverse interleaving: calculate the RS block that includes the
       requested offset */
    uint64_t rsb = offset - (offset / (e->rounds * FEC_BLOCKSIZE)) *
                        e->rounds * FEC_BLOCKSIZE;
    int data_index = -1;          /* column of `offset' inside the RS block */
    int erasures[e->rsn];
    int neras = 0;

    /* verity is required to check for erasures */
    check(!use_erasures || f->verity.hash);

    /* gather the e->rsn interleaved data blocks that make up this RS block */
    for (int i = 0; i < e->rsn; ++i) {
        uint64_t interleaved = fec_ecc_interleave(rsb * e->rsn + i, e->rsn,
                                    e->rounds);

        if (interleaved == offset) {
            data_index = i;
        }

        /* copy raw data to reconstruct the RS block */
        uint8_t bbuf[FEC_BLOCKSIZE];

        /* blocks past the data area are treated as zeros, as are blocks
           known to be zero from the hash tree */
        if (unlikely(interleaved >= e->start) || is_zero(f, interleaved)) {
            memset(bbuf, 0, FEC_BLOCKSIZE);
        } else {
            if (!raw_pread(f, bbuf, FEC_BLOCKSIZE, interleaved)) {
                error("failed to read: %s", strerror(errno));
                return -1;
            }

            if (use_erasures && neras <= e->roots &&
                    is_erasure(f, interleaved, bbuf)) {
                erasures[neras++] = i;
            }
        }

        /* transpose: byte j of input block i becomes symbol i of RS
           codeword j */
        for (int j = 0; j < FEC_BLOCKSIZE; ++j) {
            ecc_data[j * FEC_RSM + i] = bbuf[j];
        }
    }

    check(data_index >= 0);

    size_t nerrs = 0;
    uint8_t copy[FEC_RSM];

    /* decode each of the FEC_BLOCKSIZE codewords in this RS block */
    for (int i = 0; i < FEC_BLOCKSIZE; ++i) {
        /* copy parity data */
        if (!raw_pread(f, &ecc_data[i * FEC_RSM + e->rsn], e->roots,
                e->start + (i + rsb) * e->roots)) {
            error("failed to read ecc data: %s", strerror(errno));
            return -1;
        }

        /* for debugging decoding failures, because decode_rs_char can
           mangle ecc_data */
        if (unlikely(use_erasures)) {
            memcpy(copy, &ecc_data[i * FEC_RSM], FEC_RSM);
        }

        /* decode */
        int rc = decode_rs_char(rs, &ecc_data[i * FEC_RSM], erasures, neras);

        if (unlikely(rc < 0)) {
            if (use_erasures) {
                error("RS block %" PRIu64 ": decoding failed (%d erasures)",
                    rsb, neras);
                dump("raw RS block", rsb, copy, FEC_RSM);
            } else if (!f->verity.hash) {
                warn("RS block %" PRIu64 ": decoding failed", rsb);
            } else {
                debug("RS block %" PRIu64 ": decoding failed", rsb);
            }

            errno = EIO;
            return -1;
        } else if (unlikely(rc > 0)) {
            /* sanity-bound the number of corrections the decoder claims */
            check(rc <= (use_erasures ? e->roots : e->roots / 2));
            nerrs += rc;
        }

        /* extract the one byte of the requested block from this codeword */
        dest[i] = ecc_data[i * FEC_RSM + data_index];
    }

    if (nerrs) {
        warn("RS block %" PRIu64 ": corrected %zu errors", rsb, nerrs);
        *errors += nerrs;
    }

    return FEC_BLOCKSIZE;
}
/* attempts to read and validate an ecc header from file position `offset';
   returns 0 if a valid-looking header was parsed (f->ecc.valid records
   whether the ecc payload's checksum also matched), -1 otherwise */
static int parse_ecc_header(fec_handle *f, uint64_t offset)
{
    check(f);
    check(f->ecc.rsn > 0 && f->ecc.rsn < FEC_RSM);
    check(f->size > sizeof(fec_header));

    debug("offset = %" PRIu64, offset);

    /* the header must fit entirely inside the file */
    if (offset > f->size - sizeof(fec_header)) {
        return -1;
    }

    fec_header header;

    /* there's obviously no ecc data at this point, so there is no need to
       call fec_pread to access this data */
    if (!raw_pread(f, &header, sizeof(fec_header), offset)) {
        error("failed to read: %s", strerror(errno));
        return -1;
    }

    /* move offset back to the beginning of the block for validating
       header */
    offset -= offset % FEC_BLOCKSIZE;

    if (header.magic != FEC_MAGIC) {
        return -1;
    }
    if (header.version != FEC_VERSION) {
        error("unsupported ecc version: %u", header.version);
        return -1;
    }
    if (header.size != sizeof(fec_header)) {
        error("unexpected ecc header size: %u", header.size);
        return -1;
    }
    if (header.roots == 0 || header.roots >= FEC_RSM) {
        error("invalid ecc roots: %u", header.roots);
        return -1;
    }
    if (f->ecc.roots != (int)header.roots) {
        error("unexpected number of roots: %d vs %u", f->ecc.roots,
            header.roots);
        return -1;
    }
    if (header.fec_size % header.roots ||
            header.fec_size % FEC_BLOCKSIZE) {
        error("inconsistent ecc size %u", header.fec_size);
        return -1;
    }
    /* structure: data | ecc | header */
    if (offset < header.fec_size ||
            offset - header.fec_size != header.inp_size) {
        error("unexpected input size: %" PRIu64 " vs %" PRIu64, offset,
            header.inp_size);
        return -1;
    }

    f->data_size = header.inp_size;
    f->ecc.blocks = fec_div_round_up(f->data_size, FEC_BLOCKSIZE);
    f->ecc.rounds = fec_div_round_up(f->ecc.blocks, f->ecc.rsn);

    /* cross-check the declared ecc size against the derived geometry */
    if (header.fec_size !=
            (uint32_t)f->ecc.rounds * f->ecc.roots * FEC_BLOCKSIZE) {
        error("inconsistent ecc size %u", header.fec_size);
        return -1;
    }

    f->ecc.size = header.fec_size;
    f->ecc.start = header.inp_size;

    /* validate encoding data; caller may opt not to use it if invalid */
    SHA256_CTX ctx;
    SHA256_Init(&ctx);

    /* hash the whole ecc region one block at a time */
    uint8_t buf[FEC_BLOCKSIZE];
    uint32_t n = 0;
    uint32_t len = FEC_BLOCKSIZE;

    while (n < f->ecc.size) {
        if (len > f->ecc.size - n) {
            len = f->ecc.size - n;   /* final partial block */
        }

        if (!raw_pread(f, buf, len, f->ecc.start + n)) {
            error("failed to read ecc: %s", strerror(errno));
            return -1;
        }

        SHA256_Update(&ctx, buf, len);
        n += len;
    }

    uint8_t hash[SHA256_DIGEST_LENGTH];
    SHA256_Final(hash, &ctx);

    /* a checksum mismatch only marks the ecc data as unusable; header
       parsing itself still succeeds */
    f->ecc.valid = !memcmp(hash, header.hash, SHA256_DIGEST_LENGTH);

    if (!f->ecc.valid) {
        warn("ecc data not valid");
    }

    return 0;
}